1727 files changed, 236274 insertions, 46765 deletions
diff --git a/libavcodec/012v.c b/libavcodec/012v.c
new file mode 100644
index 0000000000..b5a4066656
--- /dev/null
+++ b/libavcodec/012v.c
@@ -0,0 +1,155 @@
+/*
+ * 012v decoder
+ *
+ * Copyright (C) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+
+static av_cold int zero12v_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt             = AV_PIX_FMT_YUV422P16;
+    avctx->bits_per_raw_sample = 10;
+
+    if (avctx->codec_tag == MKTAG('a', '1', '2', 'v'))
+        avpriv_request_sample(avctx, "transparency");
+
+    return 0;
+}
+
+static int zero12v_decode_frame(AVCodecContext *avctx, void *data,
+                                int *got_frame, AVPacket *avpkt)
+{
+    int line, ret;
+    const int width = avctx->width;
+    AVFrame *pic = data;
+    uint16_t *y, *u, *v;
+    const uint8_t *line_end, *src = avpkt->data;
+    int stride = avctx->width * 8 / 3;
+
+    if (width <= 1 || avctx->height <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions %dx%d not supported.\n", width, avctx->height);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (   avctx->codec_tag == MKTAG('0', '1', '2', 'v')
+        && avpkt->size % avctx->height == 0
+        && avpkt->size / avctx->height * 3 >= width * 8)
+        stride = avpkt->size / avctx->height;
+
+    if (avpkt->size < avctx->height * stride) {
+        av_log(avctx, AV_LOG_ERROR, "Packet too small: %d instead of %d\n",
+               avpkt->size, avctx->height * stride);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->pict_type = AV_PICTURE_TYPE_I;
+    pic->key_frame = 1;
+
+    line_end = avpkt->data + stride;
+    for (line = 0; line < avctx->height; line++) {
+        uint16_t y_temp[6] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
+        uint16_t u_temp[3] = {0x8000, 0x8000, 0x8000};
+        uint16_t v_temp[3] = {0x8000, 0x8000, 0x8000};
+        int x;
+        y = (uint16_t *)(pic->data[0] + line * pic->linesize[0]);
+        u = (uint16_t *)(pic->data[1] + line * pic->linesize[1]);
+        v = (uint16_t *)(pic->data[2] + line * pic->linesize[2]);
+
+        for (x = 0; x < width; x += 6) {
+            uint32_t t;
+
+            if (width - x < 6 || line_end - src < 16) {
+                y = y_temp;
+                u = u_temp;
+                v = v_temp;
+            }
+
+            if (line_end - src < 4)
+                break;
+
+            t = AV_RL32(src);
+            src += 4;
+            *u++ = t <<  6 & 0xFFC0;
+            *y++ = t >>  4 & 0xFFC0;
+            *v++ = t >> 14 & 0xFFC0;
+
+            if (line_end - src < 4)
+                break;
+
+            t = AV_RL32(src);
+            src += 4;
+            *y++ = t <<  6 & 0xFFC0;
+            *u++ = t >>  4 & 0xFFC0;
+            *y++ = t >> 14 & 0xFFC0;
+
+            if (line_end - src < 4)
+                break;
+
+            t = AV_RL32(src);
+            src += 4;
+            *v++ = t <<  6 & 0xFFC0;
+            *y++ = t >>  4 & 0xFFC0;
+            *u++ = t >> 14 & 0xFFC0;
+
+            if (line_end - src < 4)
+                break;
+
+            t = AV_RL32(src);
+            src += 4;
+            *y++ = t <<  6 & 0xFFC0;
+            *v++ = t >>  4 & 0xFFC0;
+            *y++ = t >> 14 & 0xFFC0;
+
+            if (width - x < 6)
+                break;
+        }
+
+        if (x < width) {
+            y = x   + (uint16_t *)(pic->data[0] + line * pic->linesize[0]);
+            u = x/2 + (uint16_t *)(pic->data[1] + line * pic->linesize[1]);
+            v = x/2 + (uint16_t *)(pic->data[2] + line * pic->linesize[2]);
+            memcpy(y, y_temp, sizeof(*y) * (width - x));
+            memcpy(u, u_temp, sizeof(*u) * (width - x + 1) / 2);
+            memcpy(v, v_temp, sizeof(*v) * (width - x + 1) / 2);
+        }
+
+        line_end += stride;
+        src = line_end - stride;
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_zero12v_decoder = {
+    .name           = "012v",
+    .long_name      = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_012V,
+    .init           = zero12v_decode_init,
+    .decode         = zero12v_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/4xm.c b/libavcodec/4xm.c
index b2d4db26b0..a7a757a078 100644
--- a/libavcodec/4xm.c
+++ b/libavcodec/4xm.c
@@ -2,20 +2,20 @@
  * 4XM codec
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 
 #include <inttypes.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/frame.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
@@ -36,6 +37,7 @@
 #include "get_bits.h"
 #include "internal.h"
 
+
 #define BLOCK_TYPE_VLC_BITS 5
 #define ACDC_VLC_BITS 9
 
@@ -289,7 +291,7 @@ static void init_mv(FourXContext *f, int linesize)
     }
 #endif
 
-static inline void mcdc(uint16_t *dst, uint16_t *src, int log2w,
+static inline void mcdc(uint16_t *dst, const uint16_t *src, int log2w,
                         int h, int stride, int scale, unsigned dc)
 {
     int i;
@@ -333,36 +335,32 @@ static inline void mcdc(uint16_t *dst, uint16_t *src, int log2w,
         }
         break;
     default:
-        break;
+        av_assert0(0);
     }
 }
 
-static int decode_p_block(FourXContext *f, uint16_t *dst, uint16_t *src,
+static int decode_p_block(FourXContext *f, uint16_t *dst, const uint16_t *src,
                           int log2w, int log2h, int stride)
 {
     int index, h, code, ret, scale = 1;
     uint16_t *start, *end;
     unsigned dc = 0;
 
-    if (log2h < 0 || log2w < 0)
-        return AVERROR_INVALIDDATA;
+    av_assert0(log2w >= 0 && log2h >= 0);
 
     index = size2index[log2h][log2w];
-    if (index < 0)
-        return AVERROR_INVALIDDATA;
+    av_assert0(index >= 0);
 
     h     = 1 << log2h;
     code  = get_vlc2(&f->gb, block_type_vlc[1 - (f->version > 1)][index].table,
                      BLOCK_TYPE_VLC_BITS, 1);
-    if (code < 0 || code > 6)
-        return AVERROR_INVALIDDATA;
+    av_assert0(code >= 0 && code <= 6);
 
     start = f->last_frame_buffer;
     end   = start + stride * (f->avctx->height - h + 1) - (1 << log2w);
 
     if (code == 1) {
-        if (--log2h < 0)
-            return AVERROR_INVALIDDATA;
+        log2h--;
         if ((ret = decode_p_block(f, dst, src, log2w, log2h, stride)) < 0)
             return ret;
         return decode_p_block(f, dst + (stride << log2h),
@@ -376,24 +374,42 @@ static int decode_p_block(FourXContext *f, uint16_t *dst, uint16_t *src,
                               src + (1 << log2w),
                               log2w, log2h, stride);
     } else if (code == 6) {
+        if (bytestream2_get_bytes_left(&f->g2) < 4) {
+            av_log(f->avctx, AV_LOG_ERROR, "wordstream overread\n");
+            return AVERROR_INVALIDDATA;
+        }
         if (log2w) {
-            dst[0]      = bytestream2_get_le16(&f->g2);
-            dst[1]      = bytestream2_get_le16(&f->g2);
+            dst[0]      = bytestream2_get_le16u(&f->g2);
+            dst[1]      = bytestream2_get_le16u(&f->g2);
         } else {
-            dst[0]      = bytestream2_get_le16(&f->g2);
-            dst[stride] = bytestream2_get_le16(&f->g2);
+            dst[0]      = bytestream2_get_le16u(&f->g2);
+            dst[stride] = bytestream2_get_le16u(&f->g2);
         }
         return 0;
     }
 
+    if ((code&3)==0 && bytestream2_get_bytes_left(&f->g) < 1) {
+        av_log(f->avctx, AV_LOG_ERROR, "bytestream overread\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     if (code == 0) {
         src  += f->mv[bytestream2_get_byte(&f->g)];
     } else if (code == 3 && f->version >= 2) {
         return 0;
     } else if (code == 4) {
         src  += f->mv[bytestream2_get_byte(&f->g)];
+        if (bytestream2_get_bytes_left(&f->g2) < 2){
+            av_log(f->avctx, AV_LOG_ERROR, "wordstream overread\n");
+            return AVERROR_INVALIDDATA;
+        }
         dc    = bytestream2_get_le16(&f->g2);
     } else if (code == 5) {
+        if (bytestream2_get_bytes_left(&f->g2) < 2){
+            av_log(f->avctx, AV_LOG_ERROR, "wordstream overread\n");
+            return AVERROR_INVALIDDATA;
+        }
+        av_assert0(start <= src && src <= end);
         scale = 0;
         dc    = bytestream2_get_le16(&f->g2);
     }
@@ -422,9 +438,9 @@ static int decode_p_frame(FourXContext *f, const uint8_t *buf, int length)
     src = f->last_frame_buffer;
 
     if (f->version > 1) {
-        if (length < 20)
-            return AVERROR_INVALIDDATA;
         extra           = 20;
+        if (length < extra)
+            return AVERROR_INVALIDDATA;
         bitstream_size  = AV_RL32(buf + 8);
         wordstream_size = AV_RL32(buf + 12);
         bytestream_size = AV_RL32(buf + 16);
@@ -435,24 +451,21 @@ static int decode_p_frame(FourXContext *f, const uint8_t *buf, int length)
         bytestream_size = FFMAX(length - bitstream_size - wordstream_size, 0);
     }
 
-    if (bitstream_size + bytestream_size + wordstream_size + extra != length
-        || bitstream_size  > (1 << 26)
-        || bytestream_size > (1 << 26)
-        || wordstream_size > (1 << 26)) {
-        av_log(f->avctx, AV_LOG_ERROR, "lengths %d %d %d %d\n",
-               bitstream_size, bytestream_size, wordstream_size,
-               bitstream_size + bytestream_size + wordstream_size - length);
+    if (bitstream_size > length || bitstream_size >= INT_MAX/8 ||
+        bytestream_size > length - bitstream_size ||
+        wordstream_size > length - bytestream_size - bitstream_size ||
+        extra > length - bytestream_size - bitstream_size - wordstream_size) {
+        av_log(f->avctx, AV_LOG_ERROR, "lengths %d %d %d %d\n", bitstream_size, bytestream_size, wordstream_size,
+        bitstream_size+ bytestream_size+ wordstream_size - length);
         return AVERROR_INVALIDDATA;
     }
 
-    av_fast_malloc(&f->bitstream_buffer, &f->bitstream_buffer_size,
-                   bitstream_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&f->bitstream_buffer, &f->bitstream_buffer_size,
+                          bitstream_size);
     if (!f->bitstream_buffer)
         return AVERROR(ENOMEM);
     f->bbdsp.bswap_buf(f->bitstream_buffer, (const uint32_t *) (buf + extra),
                        bitstream_size / 4);
-    memset((uint8_t*)f->bitstream_buffer + bitstream_size,
-           0, AV_INPUT_BUFFER_PADDING_SIZE);
     init_get_bits(&f->gb, f->bitstream_buffer, 8 * bitstream_size);
 
     wordstream_offset = extra + bitstream_size;
@@ -483,10 +496,17 @@ static int decode_i_block(FourXContext *f, int16_t *block)
 {
     int code, i, j, level, val;
 
+    if (get_bits_left(&f->gb) < 2){
+        av_log(f->avctx, AV_LOG_ERROR, "%d bits left before decode_i_block()\n", get_bits_left(&f->gb));
+        return -1;
+    }
+
     /* DC coef */
     val = get_vlc2(&f->pre_gb, f->pre_vlc.table, ACDC_VLC_BITS, 3);
-    if (val >> 4)
+    if (val >> 4) {
         av_log(f->avctx, AV_LOG_ERROR, "error dc run != 0\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     if (val)
         val = get_xbits(&f->gb, val);
@@ -504,7 +524,12 @@ static int decode_i_block(FourXContext *f, int16_t *block)
         if (code == 0xf0) {
             i += 16;
         } else {
-            level = get_xbits(&f->gb, code & 0xf);
+            if (code & 0xf) {
+                level = get_xbits(&f->gb, code & 0xf);
+            } else {
+                av_log(f->avctx, AV_LOG_ERROR, "0 coeff\n");
+                return AVERROR_INVALIDDATA;
+            }
             i    += code >> 4;
             if (i >= 64) {
                 av_log(f->avctx, AV_LOG_ERROR, "run %d oveflow\n", i);
@@ -584,7 +609,7 @@ static int decode_i_mb(FourXContext *f)
 
 static const uint8_t *read_huffman_tables(FourXContext *f,
                                           const uint8_t * const buf,
-                                          int len)
+                                          int buf_size)
 {
     int frequency[512] = { 0 };
     uint8_t flag[512];
@@ -593,6 +618,7 @@ static const uint8_t *read_huffman_tables(FourXContext *f,
     int bits_tab[257];
     int start, end;
     const uint8_t *ptr = buf;
+    const uint8_t *ptr_end = buf + buf_size;
     int j;
 
     memset(up, -1, sizeof(up));
@@ -602,10 +628,10 @@ static const uint8_t *read_huffman_tables(FourXContext *f,
     for (;;) {
         int i;
 
-        len -= end - start + 1;
-
-        if (end < start || len < 0)
+        if (ptr_end - ptr < FFMAX(end - start + 1, 0) + 1) {
+            av_log(f->avctx, AV_LOG_ERROR, "invalid data in read_huffman_tables\n");
             return NULL;
+        }
 
         for (i = start; i <= end; i++)
             frequency[i] = *ptr++;
@@ -613,9 +639,6 @@ static const uint8_t *read_huffman_tables(FourXContext *f,
         if (start == 0)
             break;
 
-        if (--len < 0)
-            return NULL;
-
         end = *ptr++;
     }
     frequency[256] = 1;
@@ -623,6 +646,11 @@ static const uint8_t *read_huffman_tables(FourXContext *f,
     while ((ptr - buf) & 3)
         ptr++; // 4byte align
 
+    if (ptr > ptr_end) {
+        av_log(f->avctx, AV_LOG_ERROR, "ptr overflow in read_huffman_tables\n");
+        return NULL;
+    }
+
     for (j = 257; j < 512; j++) {
         int min_freq[2] = { 256 * 256, 256 * 256 };
         int smallest[2] = { 0, 0 };
@@ -691,6 +719,7 @@ static int decode_i2_frame(FourXContext *f, const uint8_t *buf, int length)
     const int height = f->avctx->height;
     const int mbs    = (FFALIGN(width, 16) >> 4) * (FFALIGN(height, 16) >> 4);
     uint16_t *dst    = f->frame_buffer;
+    const uint8_t *buf_end = buf + length;
     GetByteContext g3;
 
     if (length < mbs * 8) {
@@ -702,6 +731,8 @@ static int decode_i2_frame(FourXContext *f, const uint8_t *buf, int length)
     for (y = 0; y < height; y += 16) {
         for (x = 0; x < width; x += 16) {
             unsigned int color[4] = { 0 }, bits;
+            if (buf_end - buf < 8)
+                return -1;
             // warning following is purely guessed ...
             color[0] = bytestream2_get_le16u(&g3);
             color[1] = bytestream2_get_le16u(&g3);
@@ -735,7 +766,6 @@ static int decode_i_frame(FourXContext *f, const uint8_t *buf, int length)
     const int width  = f->avctx->width;
     const int height = f->avctx->height;
     const unsigned int bitstream_size = AV_RL32(buf);
-    int token_count av_unused;
     unsigned int prestream_size;
     const uint8_t *prestream;
 
@@ -747,7 +777,6 @@ static int decode_i_frame(FourXContext *f, const uint8_t *buf, int length)
         return AVERROR_INVALIDDATA;
     }
 
-    token_count    =     AV_RL32(buf + bitstream_size + 8);
     prestream_size = 4 * AV_RL32(buf + bitstream_size + 4);
     prestream      =             buf + bitstream_size + 12;
 
@@ -764,18 +793,18 @@ static int decode_i_frame(FourXContext *f, const uint8_t *buf, int length)
         return AVERROR_INVALIDDATA;
     }
 
+    av_assert0(prestream <= buf + length);
+
     init_get_bits(&f->gb, buf + 4, 8 * bitstream_size);
 
     prestream_size = length + buf - prestream;
 
-    av_fast_malloc(&f->bitstream_buffer, &f->bitstream_buffer_size,
-                   prestream_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&f->bitstream_buffer, &f->bitstream_buffer_size,
+                          prestream_size);
     if (!f->bitstream_buffer)
         return AVERROR(ENOMEM);
     f->bbdsp.bswap_buf(f->bitstream_buffer, (const uint32_t *) prestream,
                        prestream_size / 4);
-    memset((uint8_t*)f->bitstream_buffer + prestream_size,
-           0, AV_INPUT_BUFFER_PADDING_SIZE);
     init_get_bits(&f->pre_gb, f->bitstream_buffer, 8 * prestream_size);
 
     f->last_dc = 0 * 128 * 8 * 8;
@@ -807,11 +836,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     if (buf_size < 20)
         return AVERROR_INVALIDDATA;
 
-    if (avctx->width % 16 || avctx->height % 16) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Dimensions non-multiple of 16 are invalid.\n");
-        return AVERROR_INVALIDDATA;
-    }
+    av_assert0(avctx->width % 16 == 0 && avctx->height % 16 == 0);
 
     if (buf_size < AV_RL32(buf + 4) + 8) {
         av_log(f->avctx, AV_LOG_ERROR, "size mismatch %d %"PRIu32"\n",
@@ -827,9 +852,19 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         const int data_size  = buf_size - 20;
         CFrameBuffer *cfrm;
 
+        if (f->version <= 1) {
+            av_log(f->avctx, AV_LOG_ERROR, "cfrm in version %d\n", f->version);
+            return AVERROR_INVALIDDATA;
+        }
+
         id         = AV_RL32(buf + 12);
         whole_size = AV_RL32(buf + 16);
 
+        if (data_size < 0 || whole_size < 0) {
+            av_log(f->avctx, AV_LOG_ERROR, "sizes invalid\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         for (i = 0; i < CFRAME_BUFFER_COUNT; i++)
             if (f->cfrm[i].id && f->cfrm[i].id < avctx->frame_number)
                 av_log(f->avctx, AV_LOG_ERROR, "lost c frame %d\n",
@@ -848,11 +883,14 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         }
         cfrm = &f->cfrm[i];
 
+        if (data_size > UINT_MAX -  cfrm->size - AV_INPUT_BUFFER_PADDING_SIZE)
+            return AVERROR_INVALIDDATA;
+
         cfrm->data = av_fast_realloc(cfrm->data, &cfrm->allocated_size,
                                      cfrm->size + data_size + AV_INPUT_BUFFER_PADDING_SIZE);
         // explicit check needed as memcpy below might not catch a NULL
         if (!cfrm->data) {
-            av_log(f->avctx, AV_LOG_ERROR, "realloc failure");
+            av_log(f->avctx, AV_LOG_ERROR, "realloc failure\n");
             return AVERROR(ENOMEM);
         }
 
@@ -879,24 +917,27 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         frame_size = buf_size - 12;
     }
 
-
-    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0)
         return ret;
-    }
 
     if (frame_4cc == AV_RL32("ifr2")) {
         picture->pict_type = AV_PICTURE_TYPE_I;
-        if ((ret = decode_i2_frame(f, buf - 4, frame_size + 4)) < 0)
+        if ((ret = decode_i2_frame(f, buf - 4, frame_size + 4)) < 0) {
+            av_log(f->avctx, AV_LOG_ERROR, "decode i2 frame failed\n");
             return ret;
+        }
     } else if (frame_4cc == AV_RL32("ifrm")) {
         picture->pict_type = AV_PICTURE_TYPE_I;
-        if ((ret = decode_i_frame(f, buf, frame_size)) < 0)
+        if ((ret = decode_i_frame(f, buf, frame_size)) < 0) {
+            av_log(f->avctx, AV_LOG_ERROR, "decode i frame failed\n");
             return ret;
+        }
     } else if (frame_4cc == AV_RL32("pfrm") || frame_4cc == AV_RL32("pfr2")) {
         picture->pict_type = AV_PICTURE_TYPE_P;
-        if ((ret = decode_p_frame(f, buf, frame_size)) < 0)
+        if ((ret = decode_p_frame(f, buf, frame_size)) < 0) {
+            av_log(f->avctx, AV_LOG_ERROR, "decode p frame failed\n");
             return ret;
+        }
     } else if (frame_4cc == AV_RL32("snd_")) {
         av_log(avctx, AV_LOG_ERROR, "ignoring snd_ chunk length:%d\n",
                buf_size);
@@ -946,6 +987,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "extradata wrong or missing\n");
         return AVERROR_INVALIDDATA;
     }
+    if((avctx->width % 16) || (avctx->height % 16)) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported width/height\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     ret = av_image_check_size(avctx->width, avctx->height, 0, avctx);
     if (ret < 0)
diff --git a/libavcodec/8bps.c b/libavcodec/8bps.c
index 78b616fdf4..2e4464dbb4 100644
--- a/libavcodec/8bps.c
+++ b/libavcodec/8bps.c
@@ -2,20 +2,20 @@
  * Quicktime Planar RGB (8BPS) Video Decoder
  * Copyright (C) 2003 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
  *
  * Supports: PAL8 (RGB 8bpp, paletted)
  *         : BGR24 (RGB 24bpp) (can also output it as RGB32)
- *         : RGB32 (RGB 32bpp, 4th plane is probably alpha and it's ignored)
+ *         : RGB32 (RGB 32bpp, 4th plane is alpha)
  *
  */
 
@@ -66,27 +66,18 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     unsigned int dlen, p, row;
     const unsigned char *lp, *dp, *ep;
     unsigned char count;
-    unsigned int px_inc;
     unsigned int planes     = c->planes;
     unsigned char *planemap = c->planemap;
     int ret;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     ep = encoded + buf_size;
 
     /* Set data pointer after line lengths */
     dp = encoded + planes * (height << 1);
 
-    /* Ignore alpha plane, don't know what to do with it */
-    if (planes == 4)
-        planes--;
-
-    px_inc = planes + (avctx->pix_fmt == AV_PIX_FMT_RGB32);
-
     for (p = 0; p < planes; p++) {
         /* Lines length pointer for this plane */
         lp = encoded + p * (height << 1);
@@ -105,21 +96,21 @@ static int decode_frame(AVCodecContext *avctx, void *data,
                 if ((count = *dp++) <= 127) {
                     count++;
                     dlen -= count + 1;
-                    if (pixptr_end - pixptr < count * px_inc)
+                    if (pixptr_end - pixptr < count * planes)
                         break;
                     if (ep - dp < count)
                         return AVERROR_INVALIDDATA;
                     while (count--) {
                         *pixptr = *dp++;
-                        pixptr += px_inc;
+                        pixptr += planes;
                     }
                 } else {
                     count = 257 - count;
-                    if (pixptr_end - pixptr < count * px_inc)
+                    if (pixptr_end - pixptr < count * planes)
                         break;
                     while (count--) {
                         *pixptr = *dp;
-                        pixptr += px_inc;
+                        pixptr += planes;
                     }
                     dp++;
                     dlen -= 2;
@@ -180,7 +171,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         c->planemap[0] = HAVE_BIGENDIAN ? 1 : 2; // 1st plane is red
         c->planemap[1] = HAVE_BIGENDIAN ? 2 : 1; // 2nd plane is green
         c->planemap[2] = HAVE_BIGENDIAN ? 3 : 0; // 3rd plane is blue
-        c->planemap[3] = HAVE_BIGENDIAN ? 0 : 3; // 4th plane is alpha???
+        c->planemap[3] = HAVE_BIGENDIAN ? 0 : 3; // 4th plane is alpha
     }
     return 0;
 }
diff --git a/libavcodec/8svx.c b/libavcodec/8svx.c
index fe90b168e3..edc945c697 100644
--- a/libavcodec/8svx.c
+++ b/libavcodec/8svx.c
@@ -1,21 +1,21 @@
 /*
- * 8SVX audio decoder
  * Copyright (C) 2008 Jaikrishnan Menon
+ * Copyright (C) 2011 Stefano Sabatini
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,8 +26,18 @@
  *
  * supports: fibonacci delta encoding
  *         : exponential encoding
+ *
+ * For more information about the 8SVX format:
+ * http://netghost.narod.ru/gff/vendspec/iff/iff.txt
+ * http://sox.sourceforge.net/AudioFormats-11.html
+ * http://aminet.net/package/mus/misc/wavepak
+ * http://amigan.1emu.net/reg/8SVX.txt
+ *
+ * Samples can be found here:
+ * http://aminet.net/mods/smpl/
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "libavutil/common.h"
@@ -44,18 +54,17 @@ typedef struct EightSvxContext {
     int data_idx;
 } EightSvxContext;
 
-static const int8_t fibonacci[16]   = { -34, -21, -13,  -8, -5, -3, -2, -1,
-                                          0,   1,   2,   3,  5,  8, 13, 21 };
-static const int8_t exponential[16] = { -128, -64, -32, -16, -8, -4, -2, -1,
-                                           0,   1,   2,   4,  8, 16, 32, 64 };
+static const int8_t fibonacci[16]   = { -34,  -21, -13,  -8, -5, -3, -2, -1, 0, 1, 2, 3, 5, 8,  13, 21 };
+static const int8_t exponential[16] = { -128, -64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64 };
 
-#define MAX_FRAME_SIZE 32768
+#define MAX_FRAME_SIZE 2048
 
 /**
  * Delta decode the compressed values in src, and put the resulting
  * decoded samples in dst.
  *
  * @param[in,out] state starting value. it is saved for use in the next call.
+ * @param table delta sequence table
  */
 static void delta_decode(uint8_t *dst, const uint8_t *src, int src_size,
                          uint8_t *state, const int8_t *table)
@@ -73,12 +82,6 @@ static void delta_decode(uint8_t *dst, const uint8_t *src, int src_size,
     *state = val;
 }
 
-static void raw_decode(uint8_t *dst, const int8_t *src, int src_size)
-{
-    while (src_size--)
-        *dst++ = *src++ + 128;
-}
-
 /** decode a frame */
 static int eightsvx_decode_frame(AVCodecContext *avctx, void *data,
                                  int *got_frame_ptr, AVPacket *avpkt)
@@ -87,27 +90,23 @@ static int eightsvx_decode_frame(AVCodecContext *avctx, void *data,
     AVFrame *frame       = data;
     int buf_size;
     int ch, ret;
-    int is_compr = (avctx->codec_id != AV_CODEC_ID_PCM_S8_PLANAR);
+    int hdr_size = 2;
 
-    /* for the first packet, copy data to buffer */
-    if (avpkt->data) {
-        int hdr_size  = is_compr ? 2 : 0;
-        int chan_size = (avpkt->size - hdr_size * avctx->channels) / avctx->channels;
+    /* decode and interleave the first packet */
+    if (!esc->data[0] && avpkt) {
+        int chan_size = avpkt->size / avctx->channels - hdr_size;
 
-        if (avpkt->size < hdr_size * avctx->channels) {
-            av_log(avctx, AV_LOG_ERROR, "packet size is too small\n");
-            return AVERROR_INVALIDDATA;
+        if (avpkt->size % avctx->channels) {
+            av_log(avctx, AV_LOG_WARNING, "Packet with odd size, ignoring last byte\n");
         }
-        if (esc->data[0]) {
-            av_log(avctx, AV_LOG_ERROR, "unexpected data after first packet\n");
+        if (avpkt->size < (hdr_size + 1) * avctx->channels) {
+            av_log(avctx, AV_LOG_ERROR, "packet size is too small\n");
             return AVERROR_INVALIDDATA;
         }
 
-        if (is_compr) {
         esc->fib_acc[0] = avpkt->data[1] + 128;
         if (avctx->channels == 2)
             esc->fib_acc[1] = avpkt->data[2+chan_size+1] + 128;
-        }
 
         esc->data_idx  = 0;
         esc->data_size = chan_size;
@@ -136,30 +135,22 @@ static int eightsvx_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     /* get output buffer */
-    frame->nb_samples = buf_size * (is_compr + 1);
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    frame->nb_samples = buf_size * 2;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     for (ch = 0; ch < avctx->channels; ch++) {
-        if (is_compr) {
-            delta_decode(frame->data[ch], &esc->data[ch][esc->data_idx],
-                         buf_size, &esc->fib_acc[ch], esc->table);
-        } else {
-            raw_decode(frame->data[ch], &esc->data[ch][esc->data_idx],
-                       buf_size);
-        }
+        delta_decode(frame->data[ch], &esc->data[ch][esc->data_idx],
+                     buf_size, &esc->fib_acc[ch], esc->table);
     }
 
     esc->data_idx += buf_size;
 
     *got_frame_ptr = 1;
 
-    return avpkt->size;
+    return ((avctx->frame_number == 0)*hdr_size + buf_size)*avctx->channels;
 }
 
-/** initialize 8svx decoder */
 static av_cold int eightsvx_decode_init(AVCodecContext *avctx)
 {
     EightSvxContext *esc = avctx->priv_data;
@@ -169,17 +160,12 @@ static av_cold int eightsvx_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    switch(avctx->codec->id) {
-        case AV_CODEC_ID_8SVX_FIB:
-          esc->table = fibonacci;
-          break;
-        case AV_CODEC_ID_8SVX_EXP:
-          esc->table = exponential;
-          break;
-        case AV_CODEC_ID_PCM_S8_PLANAR:
-            break;
-        default:
-          return AVERROR_INVALIDDATA;
+    switch (avctx->codec->id) {
+    case AV_CODEC_ID_8SVX_FIB: esc->table = fibonacci;    break;
+    case AV_CODEC_ID_8SVX_EXP: esc->table = exponential;  break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Invalid codec id %d.\n", avctx->codec->id);
+        return AVERROR_INVALIDDATA;
     }
     avctx->sample_fmt = AV_SAMPLE_FMT_U8P;
 
@@ -192,10 +178,13 @@ static av_cold int eightsvx_decode_close(AVCodecContext *avctx)
 
     av_freep(&esc->data[0]);
     av_freep(&esc->data[1]);
+    esc->data_size = 0;
+    esc->data_idx = 0;
 
     return 0;
 }
 
+#if CONFIG_EIGHTSVX_FIB_DECODER
 AVCodec ff_eightsvx_fib_decoder = {
   .name           = "8svx_fib",
   .long_name      = NULL_IF_CONFIG_SMALL("8SVX fibonacci"),
@@ -203,13 +192,14 @@ AVCodec ff_eightsvx_fib_decoder = {
   .id             = AV_CODEC_ID_8SVX_FIB,
   .priv_data_size = sizeof (EightSvxContext),
   .init           = eightsvx_decode_init,
-  .close          = eightsvx_decode_close,
   .decode         = eightsvx_decode_frame,
-  .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+  .close          = eightsvx_decode_close,
+  .capabilities   = AV_CODEC_CAP_DR1,
   .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_U8P,
                                                     AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_EIGHTSVX_EXP_DECODER
 AVCodec ff_eightsvx_exp_decoder = {
   .name           = "8svx_exp",
   .long_name      = NULL_IF_CONFIG_SMALL("8SVX exponential"),
@@ -217,23 +207,10 @@ AVCodec ff_eightsvx_exp_decoder = {
   .id             = AV_CODEC_ID_8SVX_EXP,
   .priv_data_size = sizeof (EightSvxContext),
   .init           = eightsvx_decode_init,
-  .close          = eightsvx_decode_close,
   .decode         = eightsvx_decode_frame,
-  .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+  .close          = eightsvx_decode_close,
+  .capabilities   = AV_CODEC_CAP_DR1,
   .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_U8P,
                                                     AV_SAMPLE_FMT_NONE },
 };
-
-AVCodec ff_pcm_s8_planar_decoder = {
-    .name           = "pcm_s8_planar",
-    .long_name      = NULL_IF_CONFIG_SMALL("PCM signed 8-bit planar"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_PCM_S8_PLANAR,
-    .priv_data_size = sizeof(EightSvxContext),
-    .init           = eightsvx_decode_init,
-    .close          = eightsvx_decode_close,
-    .decode         = eightsvx_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
-    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_U8P,
-                                                      AV_SAMPLE_FMT_NONE },
-};
+#endif
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index ba711ae07d..9075077265 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -1,3 +1,5 @@
+include $(SUBDIR)../config.mak
+
 NAME = avcodec
 
 HEADERS = avcodec.h                                                     \
@@ -10,10 +12,13 @@ HEADERS = avcodec.h                                                     \
           vda.h                                                         \
           vdpau.h                                                       \
           version.h                                                     \
+          videotoolbox.h                                                \
           vorbis_parser.h                                               \
           xvmc.h                                                        \
 
 OBJS = allcodecs.o                                                      \
+       audioconvert.o                                                   \
+       avdct.o                                                          \
        avpacket.o                                                       \
        avpicture.o                                                      \
        bitstream.o                                                      \
@@ -22,12 +27,13 @@ OBJS = allcodecs.o                                                      \
        d3d11va.o                                                        \
        dv_profile.o                                                     \
        imgconvert.o                                                     \
-       log2_tab.o                                                       \
        mathtables.o                                                     \
        options.o                                                        \
        parser.o                                                         \
        qsv_api.o                                                        \
        raw.o                                                            \
+       resample.o                                                       \
+       resample2.o                                                      \
        utils.o                                                          \
        vorbis_parser.o                                                  \
        xiph.o                                                           \
@@ -40,13 +46,16 @@ OBJS-$(CONFIG_AUDIODSP)                += audiodsp.o
 OBJS-$(CONFIG_BLOCKDSP)                += blockdsp.o
 OBJS-$(CONFIG_BSWAPDSP)                += bswapdsp.o
 OBJS-$(CONFIG_CABAC)                   += cabac.o
+OBJS-$(CONFIG_CRYSTALHD)               += crystalhd.o
 OBJS-$(CONFIG_DCT)                     += dct.o dct32_fixed.o dct32_float.o
 OBJS-$(CONFIG_ERROR_RESILIENCE)        += error_resilience.o
+OBJS-$(CONFIG_EXIF)                    += exif.o tiff_common.o
 OBJS-$(CONFIG_FAANDCT)                 += faandct.o
 OBJS-$(CONFIG_FAANIDCT)                += faanidct.o
 OBJS-$(CONFIG_FDCTDSP)                 += fdctdsp.o jfdctfst.o jfdctint.o
 FFT-OBJS-$(CONFIG_HARDCODED_TABLES)    += cos_tables.o cos_fixed_tables.o
 OBJS-$(CONFIG_FFT)                     += avfft.o fft_fixed.o fft_float.o \
+                                          fft_fixed_32.o fft_init_table.o \
                                           $(FFT-OBJS-yes)
 OBJS-$(CONFIG_FLACDSP)                 += flacdsp.o
 OBJS-$(CONFIG_FMTCONVERT)              += fmtconvert.o
@@ -67,10 +76,12 @@ OBJS-$(CONFIG_INTRAX8)                 += intrax8.o intrax8dsp.o
 OBJS-$(CONFIG_IVIDSP)                  += ivi_dsp.o
 OBJS-$(CONFIG_JPEGTABLES)              += jpegtables.o
 OBJS-$(CONFIG_LIBXVID)                 += libxvid_rc.o
+OBJS-$(CONFIG_LLAUDDSP)                += lossless_audiodsp.o
+OBJS-$(CONFIG_LLVIDDSP)                += lossless_videodsp.o
 OBJS-$(CONFIG_LPC)                     += lpc.o
 OBJS-$(CONFIG_LSP)                     += lsp.o
 OBJS-$(CONFIG_LZF)                     += lzf.o
-OBJS-$(CONFIG_MDCT)                    += mdct_fixed.o mdct_float.o
+OBJS-$(CONFIG_MDCT)                    += mdct_fixed.o mdct_float.o mdct_fixed_32.o
 OBJS-$(CONFIG_ME_CMP)                  += me_cmp.o
 OBJS-$(CONFIG_MPEG_ER)                 += mpeg_er.o
 OBJS-$(CONFIG_MPEGAUDIO)               += mpegaudio.o mpegaudiodata.o   \
@@ -86,8 +97,8 @@ OBJS-$(CONFIG_MPEGVIDEOENC)            += mpegvideo_enc.o mpeg12data.o  \
                                           motion_est.o ratecontrol.o    \
                                           mpegvideoencdsp.o
 OBJS-$(CONFIG_MSS34DSP)                += mss34dsp.o
+OBJS-$(CONFIG_NVENC)                   += nvenc.o
 OBJS-$(CONFIG_PIXBLOCKDSP)             += pixblockdsp.o
-OBJS-${CONFIG_NVENC}                   += nvenc.o
 OBJS-$(CONFIG_QPELDSP)                 += qpeldsp.o
 OBJS-$(CONFIG_QSV)                     += qsv.o
 OBJS-$(CONFIG_QSVDEC)                  += qsvdec.o
@@ -96,7 +107,8 @@ OBJS-$(CONFIG_RANGECODER)              += rangecoder.o
 RDFT-OBJS-$(CONFIG_HARDCODED_TABLES)   += sin_tables.o
 OBJS-$(CONFIG_RDFT)                    += rdft.o $(RDFT-OBJS-yes)
 OBJS-$(CONFIG_RV34DSP)                 += rv34dsp.o
-OBJS-$(CONFIG_SINEWIN)                 += sinewin.o
+OBJS-$(CONFIG_SHARED)                  += log2_tab.o reverse.o
+OBJS-$(CONFIG_SINEWIN)                 += sinewin.o sinewin_fixed.o
 OBJS-$(CONFIG_SNAPPY)                  += snappy.o
 OBJS-$(CONFIG_STARTCODE)               += startcode.o
 OBJS-$(CONFIG_TEXTUREDSP)              += texturedsp.o
@@ -110,21 +122,29 @@ OBJS-$(CONFIG_WMA_FREQS)               += wma_freqs.o
 OBJS-$(CONFIG_WMV2DSP)                 += wmv2dsp.o
 
 # decoders/encoders
+OBJS-$(CONFIG_ZERO12V_DECODER)         += 012v.o
 OBJS-$(CONFIG_A64MULTI_ENCODER)        += a64multienc.o elbg.o
 OBJS-$(CONFIG_A64MULTI5_ENCODER)       += a64multienc.o elbg.o
-OBJS-$(CONFIG_AAC_DECODER)             += aacdec.o aactab.o aacsbr.o aacps.o \
+OBJS-$(CONFIG_AAC_DECODER)             += aacdec.o aactab.o aacsbr.o aacps_float.o \
+                                          aacadtsdec.o mpeg4audio.o kbdwin.o \
+                                          sbrdsp.o aacpsdsp_float.o
+OBJS-$(CONFIG_AAC_FIXED_DECODER)       += aacdec_fixed.o aactab.o aacsbr_fixed.o aacps_fixed.o \
                                           aacadtsdec.o mpeg4audio.o kbdwin.o \
-                                          sbrdsp.o aacpsdsp.o
-OBJS-$(CONFIG_AAC_ENCODER)             += aacenc.o aaccoder.o    \
+                                          sbrdsp_fixed.o aacpsdsp_fixed.o
+OBJS-$(CONFIG_AAC_ENCODER)             += aacenc.o aaccoder.o aacenctab.o    \
                                           aacpsy.o aactab.o      \
+                                          aacenc_is.o \
+                                          aacenc_tns.o \
+                                          aacenc_pred.o \
                                           psymodel.o mpeg4audio.o kbdwin.o
 OBJS-$(CONFIG_AASC_DECODER)            += aasc.o msrledec.o
-OBJS-$(CONFIG_AC3_DECODER)             += ac3dec.o ac3dec_data.o ac3.o kbdwin.o
+OBJS-$(CONFIG_AC3_DECODER)             += ac3dec_float.o ac3dec_data.o ac3.o kbdwin.o
+OBJS-$(CONFIG_AC3_FIXED_DECODER)       += ac3dec_fixed.o ac3dec_data.o ac3.o kbdwin.o
 OBJS-$(CONFIG_AC3_ENCODER)             += ac3enc_float.o ac3enc.o ac3tab.o \
                                           ac3.o kbdwin.o
 OBJS-$(CONFIG_AC3_FIXED_ENCODER)       += ac3enc_fixed.o ac3enc.o ac3tab.o ac3.o
 OBJS-$(CONFIG_AIC_DECODER)             += aic.o
-OBJS-$(CONFIG_ALAC_DECODER)            += alac.o alac_data.o
+OBJS-$(CONFIG_ALAC_DECODER)            += alac.o alac_data.o alacdsp.o
 OBJS-$(CONFIG_ALAC_ENCODER)            += alacenc.o alac_data.o
 OBJS-$(CONFIG_ALIAS_PIX_DECODER)       += aliaspixdec.o
 OBJS-$(CONFIG_ALIAS_PIX_ENCODER)       += aliaspixenc.o
@@ -137,10 +157,18 @@ OBJS-$(CONFIG_AMRWB_DECODER)           += amrwbdec.o celp_filters.o   \
                                           celp_math.o acelp_filters.o \
                                           acelp_vectors.o             \
                                           acelp_pitch_delay.o
+OBJS-$(CONFIG_AMV_ENCODER)             += mjpegenc.o mjpegenc_common.o \
+                                          mpegvideo_enc.o motion_est.o \
+                                          ratecontrol.o mpeg12data.o   \
+                                          mpegvideo.o
 OBJS-$(CONFIG_ANM_DECODER)             += anm.o
 OBJS-$(CONFIG_ANSI_DECODER)            += ansi.o cga_data.o
 OBJS-$(CONFIG_APE_DECODER)             += apedec.o
-OBJS-$(CONFIG_ASS_DECODER)             += assdec.o ass.o
+OBJS-$(CONFIG_APNG_DECODER)            += png.o pngdec.o pngdsp.o
+OBJS-$(CONFIG_APNG_ENCODER)            += png.o pngenc.o
+OBJS-$(CONFIG_SSA_DECODER)             += assdec.o ass.o ass_split.o
+OBJS-$(CONFIG_SSA_ENCODER)             += assenc.o ass.o
+OBJS-$(CONFIG_ASS_DECODER)             += assdec.o ass.o ass_split.o
 OBJS-$(CONFIG_ASS_ENCODER)             += assenc.o ass.o
 OBJS-$(CONFIG_ASV1_DECODER)            += asvdec.o asv.o mpeg12data.o
 OBJS-$(CONFIG_ASV1_ENCODER)            += asvenc.o asv.o mpeg12data.o
@@ -152,12 +180,20 @@ OBJS-$(CONFIG_ATRAC3P_DECODER)         += atrac3plusdec.o atrac3plus.o \
                                           atrac3plusdsp.o atrac.o
 OBJS-$(CONFIG_AURA_DECODER)            += cyuv.o
 OBJS-$(CONFIG_AURA2_DECODER)           += aura.o
+OBJS-$(CONFIG_AVRN_DECODER)            += avrndec.o mjpegdec.o
+OBJS-$(CONFIG_AVRP_DECODER)            += r210dec.o
+OBJS-$(CONFIG_AVRP_ENCODER)            += r210enc.o
 OBJS-$(CONFIG_AVS_DECODER)             += avs.o
+OBJS-$(CONFIG_AVUI_DECODER)            += avuidec.o
+OBJS-$(CONFIG_AVUI_ENCODER)            += avuienc.o
+OBJS-$(CONFIG_AYUV_DECODER)            += v408dec.o
+OBJS-$(CONFIG_AYUV_ENCODER)            += v408enc.o
 OBJS-$(CONFIG_BETHSOFTVID_DECODER)     += bethsoftvideo.o
 OBJS-$(CONFIG_BFI_DECODER)             += bfi.o
 OBJS-$(CONFIG_BINK_DECODER)            += bink.o binkdsp.o
 OBJS-$(CONFIG_BINKAUDIO_DCT_DECODER)   += binkaudio.o
 OBJS-$(CONFIG_BINKAUDIO_RDFT_DECODER)  += binkaudio.o
+OBJS-$(CONFIG_BINTEXT_DECODER)         += bintext.o cga_data.o
 OBJS-$(CONFIG_BMP_DECODER)             += bmp.o msrledec.o
 OBJS-$(CONFIG_BMP_ENCODER)             += bmpenc.o
 OBJS-$(CONFIG_BMV_AUDIO_DECODER)       += bmvaudio.o
@@ -166,26 +202,36 @@ OBJS-$(CONFIG_BRENDER_PIX_DECODER)     += brenderpix.o
 OBJS-$(CONFIG_C93_DECODER)             += c93.o
 OBJS-$(CONFIG_CAVS_DECODER)            += cavs.o cavsdec.o cavsdsp.o \
                                           cavsdata.o mpeg12data.o
+OBJS-$(CONFIG_CCAPTION_DECODER)        += ccaption_dec.o
 OBJS-$(CONFIG_CDGRAPHICS_DECODER)      += cdgraphics.o
 OBJS-$(CONFIG_CDXL_DECODER)            += cdxl.o
 OBJS-$(CONFIG_CINEPAK_DECODER)         += cinepak.o
+OBJS-$(CONFIG_CINEPAK_ENCODER)         += cinepakenc.o elbg.o
 OBJS-$(CONFIG_CLJR_DECODER)            += cljrdec.o
 OBJS-$(CONFIG_CLJR_ENCODER)            += cljrenc.o
 OBJS-$(CONFIG_CLLC_DECODER)            += cllc.o canopus.o
 OBJS-$(CONFIG_COOK_DECODER)            += cook.o
 OBJS-$(CONFIG_COMFORTNOISE_DECODER)    += cngdec.o celp_filters.o
 OBJS-$(CONFIG_COMFORTNOISE_ENCODER)    += cngenc.o
+OBJS-$(CONFIG_CPIA_DECODER)            += cpia.o
 OBJS-$(CONFIG_CSCD_DECODER)            += cscd.o
 OBJS-$(CONFIG_CYUV_DECODER)            += cyuv.o
 OBJS-$(CONFIG_DCA_DECODER)             += dcadec.o dca.o dcadsp.o      \
                                           dcadata.o dca_exss.o         \
                                           dca_xll.o synth_filter.o
+OBJS-$(CONFIG_DCA_ENCODER)             += dcaenc.o dca.o dcadata.o
 OBJS-$(CONFIG_DDS_DECODER)             += dds.o
+OBJS-$(CONFIG_DIRAC_DECODER)           += diracdec.o dirac.o diracdsp.o \
+                                          dirac_arith.o mpeg12data.o dirac_dwt.o
 OBJS-$(CONFIG_DFA_DECODER)             += dfa.o
 OBJS-$(CONFIG_DNXHD_DECODER)           += dnxhddec.o dnxhddata.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += dnxhdenc.o dnxhddata.o
 OBJS-$(CONFIG_DPX_DECODER)             += dpx.o
 OBJS-$(CONFIG_DPX_ENCODER)             += dpxenc.o
+OBJS-$(CONFIG_DSD_LSBF_DECODER)        += dsddec.o
+OBJS-$(CONFIG_DSD_MSBF_DECODER)        += dsddec.o
+OBJS-$(CONFIG_DSD_LSBF_PLANAR_DECODER) += dsddec.o
+OBJS-$(CONFIG_DSD_MSBF_PLANAR_DECODER) += dsddec.o
 OBJS-$(CONFIG_DSICINAUDIO_DECODER)     += dsicinaudio.o
 OBJS-$(CONFIG_DSICINVIDEO_DECODER)     += dsicinvideo.o
 OBJS-$(CONFIG_DSS_SP_DECODER)          += dss_sp.o
@@ -198,7 +244,7 @@ OBJS-$(CONFIG_DVVIDEO_ENCODER)         += dvenc.o dv.o dvdata.o
 OBJS-$(CONFIG_DXA_DECODER)             += dxa.o
 OBJS-$(CONFIG_DXTORY_DECODER)          += dxtory.o
 OBJS-$(CONFIG_DXV_DECODER)             += dxv.o
-OBJS-$(CONFIG_EAC3_DECODER)            += eac3dec.o eac3_data.o
+OBJS-$(CONFIG_EAC3_DECODER)            += eac3_data.o
 OBJS-$(CONFIG_EAC3_ENCODER)            += eac3enc.o eac3_data.o
 OBJS-$(CONFIG_EACMV_DECODER)           += eacmv.o
 OBJS-$(CONFIG_EAMAD_DECODER)           += eamad.o eaidct.o mpeg12.o \
@@ -211,14 +257,17 @@ OBJS-$(CONFIG_EIGHTSVX_EXP_DECODER)    += 8svx.o
 OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
 OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
 OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
+OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
 OBJS-$(CONFIG_EXR_DECODER)             += exr.o
 OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
 OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
+OBJS-$(CONFIG_FFWAVESYNTH_DECODER)     += ffwavesynth.o
 OBJS-$(CONFIG_FIC_DECODER)             += fic.o
 OBJS-$(CONFIG_FLAC_DECODER)            += flacdec.o flacdata.o flac.o
-OBJS-$(CONFIG_FLAC_ENCODER)            += flacenc.o flacdata.o flac.o
+OBJS-$(CONFIG_FLAC_ENCODER)            += flacenc.o flacdata.o flac.o vorbis_data.o
 OBJS-$(CONFIG_FLASHSV_DECODER)         += flashsv.o
 OBJS-$(CONFIG_FLASHSV_ENCODER)         += flashsvenc.o
+OBJS-$(CONFIG_FLASHSV2_ENCODER)        += flashsv2enc.o
 OBJS-$(CONFIG_FLASHSV2_DECODER)        += flashsv.o
 OBJS-$(CONFIG_FLIC_DECODER)            += flicvideo.o
 OBJS-$(CONFIG_FOURXM_DECODER)          += 4xm.o
@@ -226,7 +275,9 @@ OBJS-$(CONFIG_FRAPS_DECODER)           += fraps.o
 OBJS-$(CONFIG_FRWU_DECODER)            += frwu.o
 OBJS-$(CONFIG_G2M_DECODER)             += g2meet.o elsdec.o
 OBJS-$(CONFIG_G723_1_DECODER)          += g723_1.o acelp_vectors.o \
-                                          celp_filters.o
+                                          celp_filters.o celp_math.o
+OBJS-$(CONFIG_G723_1_ENCODER)          += g723_1.o acelp_vectors.o celp_math.o
+OBJS-$(CONFIG_G729_DECODER)            += g729dec.o lsp.o celp_math.o acelp_filters.o acelp_pitch_delay.o acelp_vectors.o g729postfilter.o
 OBJS-$(CONFIG_GIF_DECODER)             += gifdec.o lzw.o
 OBJS-$(CONFIG_GIF_ENCODER)             += gif.o lzwenc.o
 OBJS-$(CONFIG_GSM_DECODER)             += gsmdec.o gsmdec_data.o msgsmdec.o
@@ -237,13 +288,13 @@ OBJS-$(CONFIG_H263_DECODER)            += h263dec.o h263.o ituh263dec.o        \
                                           mpeg4video.o mpeg4videodec.o flvdec.o\
                                           intelh263dec.o h263data.o
 OBJS-$(CONFIG_H263_ENCODER)            += mpeg4videoenc.o mpeg4video.o  \
-                                          h263.o ituh263enc.o flvenc.o
+                                          h263.o h263data.o ituh263enc.o flvenc.o
 OBJS-$(CONFIG_H264_DECODER)            += h264.o h264_cabac.o h264_cavlc.o \
                                           h264_direct.o h264_loopfilter.o  \
                                           h264_mb.o h264_picture.o h264_ps.o \
                                           h264_refs.o h264_sei.o h264_slice.o
 OBJS-$(CONFIG_H264_MMAL_DECODER)       += mmaldec.o
-OBJS-$(CONFIG_H264_NVENC_ENCODER)      += nvenc_h264.o
+OBJS-$(CONFIG_H264_VDA_DECODER)        += vda_h264_dec.o
 OBJS-$(CONFIG_H264_QSV_DECODER)        += qsvdec_h2645.o
 OBJS-$(CONFIG_H264_QSV_ENCODER)        += qsvenc_h264.o
 OBJS-$(CONFIG_HAP_DECODER)             += hapdec.o hap.o
@@ -251,7 +302,6 @@ OBJS-$(CONFIG_HAP_ENCODER)             += hapenc.o hap.o
 OBJS-$(CONFIG_HEVC_DECODER)            += hevc.o hevc_mvs.o hevc_ps.o hevc_sei.o \
                                           hevc_cabac.o hevc_refs.o hevcpred.o    \
                                           hevcdsp.o hevc_filter.o hevc_parse.o hevc_data.o
-OBJS-$(CONFIG_HEVC_NVENC_ENCODER)      += nvenc_hevc.o
 OBJS-$(CONFIG_HEVC_QSV_DECODER)        += qsvdec_h2645.o
 OBJS-$(CONFIG_HEVC_QSV_ENCODER)        += qsvenc_hevc.o hevc_ps_enc.o hevc_parse.o
 OBJS-$(CONFIG_HNM4_VIDEO_DECODER)      += hnm4video.o
@@ -261,6 +311,7 @@ OBJS-$(CONFIG_HQX_DECODER)             += hqx.o hqxvlc.o hqxdsp.o canopus.o
 OBJS-$(CONFIG_HUFFYUV_DECODER)         += huffyuv.o huffyuvdec.o
 OBJS-$(CONFIG_HUFFYUV_ENCODER)         += huffyuv.o huffyuvenc.o
 OBJS-$(CONFIG_IDCIN_DECODER)           += idcinvideo.o
+OBJS-$(CONFIG_IDF_DECODER)             += bintext.o cga_data.o
 OBJS-$(CONFIG_IFF_BYTERUN1_DECODER)    += iff.o
 OBJS-$(CONFIG_IFF_ILBM_DECODER)        += iff.o
 OBJS-$(CONFIG_IMC_DECODER)             += imc.o
@@ -270,6 +321,9 @@ OBJS-$(CONFIG_INDEO4_DECODER)          += indeo4.o ivi.o
 OBJS-$(CONFIG_INDEO5_DECODER)          += indeo5.o ivi.o
 OBJS-$(CONFIG_INTERPLAY_DPCM_DECODER)  += dpcm.o
 OBJS-$(CONFIG_INTERPLAY_VIDEO_DECODER) += interplayvideo.o
+OBJS-$(CONFIG_JACOSUB_DECODER)         += jacosubdec.o ass.o
+OBJS-$(CONFIG_JPEG2000_ENCODER)        += j2kenc.o mqcenc.o mqc.o jpeg2000.o \
+                                          jpeg2000dwt.o
 OBJS-$(CONFIG_JPEG2000_DECODER)        += jpeg2000dec.o jpeg2000.o jpeg2000dsp.o \
                                           jpeg2000dwt.o mqcdec.o mqc.o
 OBJS-$(CONFIG_JPEGLS_DECODER)          += jpeglsdec.o jpegls.o
@@ -285,6 +339,7 @@ OBJS-$(CONFIG_MACE6_DECODER)           += mace.o
 OBJS-$(CONFIG_MDEC_DECODER)            += mdec.o mpeg12.o mpeg12data.o
 OBJS-$(CONFIG_METASOUND_DECODER)       += metasound.o metasound_data.o \
                                           twinvq.o
+OBJS-$(CONFIG_MICRODVD_DECODER)        += microdvddec.o ass.o
 OBJS-$(CONFIG_MIMIC_DECODER)           += mimic.o
 OBJS-$(CONFIG_MJPEG_DECODER)           += mjpegdec.o
 OBJS-$(CONFIG_MJPEG_ENCODER)           += mjpegenc.o mjpegenc_common.o
@@ -292,10 +347,14 @@ OBJS-$(CONFIG_MJPEGB_DECODER)          += mjpegbdec.o
 OBJS-$(CONFIG_MLP_DECODER)             += mlpdec.o mlpdsp.o
 OBJS-$(CONFIG_MMVIDEO_DECODER)         += mmvideo.o
 OBJS-$(CONFIG_MOTIONPIXELS_DECODER)    += motionpixels.o
+OBJS-$(CONFIG_MOVTEXT_DECODER)         += movtextdec.o ass.o
+OBJS-$(CONFIG_MOVTEXT_ENCODER)         += movtextenc.o ass_split.o
 OBJS-$(CONFIG_MP1_DECODER)             += mpegaudiodec_fixed.o
 OBJS-$(CONFIG_MP1FLOAT_DECODER)        += mpegaudiodec_float.o
 OBJS-$(CONFIG_MP2_DECODER)             += mpegaudiodec_fixed.o
-OBJS-$(CONFIG_MP2_ENCODER)             += mpegaudioenc.o mpegaudio.o \
+OBJS-$(CONFIG_MP2_ENCODER)             += mpegaudioenc_float.o mpegaudio.o \
+                                          mpegaudiodata.o mpegaudiodsp_data.o
+OBJS-$(CONFIG_MP2FIXED_ENCODER)        += mpegaudioenc_fixed.o mpegaudio.o \
                                           mpegaudiodata.o mpegaudiodsp_data.o
 OBJS-$(CONFIG_MP2FLOAT_DECODER)        += mpegaudiodec_float.o
 OBJS-$(CONFIG_MP3_DECODER)             += mpegaudiodec_fixed.o
@@ -306,7 +365,7 @@ OBJS-$(CONFIG_MP3ON4_DECODER)          += mpegaudiodec_fixed.o mpeg4audio.o
 OBJS-$(CONFIG_MP3ON4FLOAT_DECODER)     += mpegaudiodec_float.o mpeg4audio.o
 OBJS-$(CONFIG_MPC7_DECODER)            += mpc7.o mpc.o
 OBJS-$(CONFIG_MPC8_DECODER)            += mpc8.o mpc.o
-OBJS-$(CONFIG_MPEG_XVMC_DECODER)       += mpegvideo_xvmc.o
+OBJS-$(CONFIG_MPEGVIDEO_DECODER)       += mpeg12dec.o mpeg12.o mpeg12data.o
 OBJS-$(CONFIG_MPEG1VIDEO_DECODER)      += mpeg12dec.o mpeg12.o mpeg12data.o
 OBJS-$(CONFIG_MPEG1VIDEO_ENCODER)      += mpeg12enc.o mpeg12.o
 OBJS-$(CONFIG_MPEG2VIDEO_DECODER)      += mpeg12dec.o mpeg12.o mpeg12data.o
@@ -314,6 +373,7 @@ OBJS-$(CONFIG_MPEG2VIDEO_ENCODER)      += mpeg12enc.o mpeg12.o
 OBJS-$(CONFIG_MPEG2_QSV_DECODER)       += qsvdec_mpeg2.o
 OBJS-$(CONFIG_MPEG2_QSV_ENCODER)       += qsvenc_mpeg2.o
 OBJS-$(CONFIG_MPEG4_DECODER)           += xvididct.o
+OBJS-$(CONFIG_MPL2_DECODER)            += mpl2dec.o ass.o
 OBJS-$(CONFIG_MSMPEG4V1_DECODER)       += msmpeg4dec.o msmpeg4.o msmpeg4data.o
 OBJS-$(CONFIG_MSMPEG4V2_DECODER)       += msmpeg4dec.o msmpeg4.o msmpeg4data.o
 OBJS-$(CONFIG_MSMPEG4V2_ENCODER)       += msmpeg4enc.o msmpeg4.o msmpeg4data.o
@@ -324,6 +384,7 @@ OBJS-$(CONFIG_MSA1_DECODER)            += mss3.o
 OBJS-$(CONFIG_MSS1_DECODER)            += mss1.o mss12.o
 OBJS-$(CONFIG_MSS2_DECODER)            += mss2.o mss12.o mss2dsp.o
 OBJS-$(CONFIG_MSVIDEO1_DECODER)        += msvideo1.o
+OBJS-$(CONFIG_MSVIDEO1_ENCODER)        += msvideo1enc.o elbg.o
 OBJS-$(CONFIG_MSZH_DECODER)            += lcldec.o
 OBJS-$(CONFIG_MTS2_DECODER)            += mss4.o
 OBJS-$(CONFIG_MVC1_DECODER)            += mvcdec.o
@@ -349,12 +410,16 @@ OBJS-$(CONFIG_PGMYUV_DECODER)          += pnmdec.o pnm.o
 OBJS-$(CONFIG_PGMYUV_ENCODER)          += pnmenc.o
 OBJS-$(CONFIG_PGSSUB_DECODER)          += pgssubdec.o
 OBJS-$(CONFIG_PICTOR_DECODER)          += pictordec.o cga_data.o
+OBJS-$(CONFIG_PJS_DECODER)             += textdec.o ass.o
 OBJS-$(CONFIG_PNG_DECODER)             += png.o pngdec.o pngdsp.o
 OBJS-$(CONFIG_PNG_ENCODER)             += png.o pngenc.o
 OBJS-$(CONFIG_PPM_DECODER)             += pnmdec.o pnm.o
 OBJS-$(CONFIG_PPM_ENCODER)             += pnmenc.o
-OBJS-$(CONFIG_PRORES_DECODER)          += proresdec.o proresdata.o proresdsp.o
-OBJS-$(CONFIG_PRORES_ENCODER)          += proresenc.o proresdata.o
+OBJS-$(CONFIG_PRORES_DECODER)          += proresdec2.o proresdsp.o proresdata.o
+OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += proresdec_lgpl.o proresdsp.o proresdata.o
+OBJS-$(CONFIG_PRORES_ENCODER)          += proresenc_anatoliy.o
+OBJS-$(CONFIG_PRORES_AW_ENCODER)       += proresenc_anatoliy.o
+OBJS-$(CONFIG_PRORES_KS_ENCODER)       += proresenc_kostya.o proresdata.o
 OBJS-$(CONFIG_PTX_DECODER)             += ptx.o
 OBJS-$(CONFIG_QCELP_DECODER)           += qcelpdec.o                     \
                                           celp_filters.o acelp_vectors.o \
@@ -365,13 +430,16 @@ OBJS-$(CONFIG_QPEG_DECODER)            += qpeg.o
 OBJS-$(CONFIG_QTRLE_DECODER)           += qtrle.o
 OBJS-$(CONFIG_QTRLE_ENCODER)           += qtrleenc.o
 OBJS-$(CONFIG_R10K_DECODER)            += r210dec.o
+OBJS-$(CONFIG_R10K_ENCODER)            += r210enc.o
 OBJS-$(CONFIG_R210_DECODER)            += r210dec.o
+OBJS-$(CONFIG_R210_ENCODER)            += r210enc.o
 OBJS-$(CONFIG_RA_144_DECODER)          += ra144dec.o ra144.o celp_filters.o
 OBJS-$(CONFIG_RA_144_ENCODER)          += ra144enc.o ra144.o celp_filters.o
 OBJS-$(CONFIG_RA_288_DECODER)          += ra288.o celp_filters.o
 OBJS-$(CONFIG_RALF_DECODER)            += ralf.o
 OBJS-$(CONFIG_RAWVIDEO_DECODER)        += rawdec.o
 OBJS-$(CONFIG_RAWVIDEO_ENCODER)        += rawenc.o
+OBJS-$(CONFIG_REALTEXT_DECODER)        += realtextdec.o ass.o
 OBJS-$(CONFIG_RL2_DECODER)             += rl2.o
 OBJS-$(CONFIG_ROQ_DECODER)             += roqvideodec.o roqvideo.o
 OBJS-$(CONFIG_ROQ_ENCODER)             += roqvideoenc.o roqvideo.o elbg.o
@@ -384,7 +452,9 @@ OBJS-$(CONFIG_RV20_DECODER)            += rv10.o
 OBJS-$(CONFIG_RV20_ENCODER)            += rv20enc.o
 OBJS-$(CONFIG_RV30_DECODER)            += rv30.o rv34.o rv30dsp.o
 OBJS-$(CONFIG_RV40_DECODER)            += rv40.o rv34.o rv40dsp.o
+OBJS-$(CONFIG_SAMI_DECODER)            += samidec.o ass.o htmlsubtitles.o
 OBJS-$(CONFIG_S302M_DECODER)           += s302m.o
+OBJS-$(CONFIG_S302M_ENCODER)           += s302menc.o
 OBJS-$(CONFIG_SANM_DECODER)            += sanm.o
 OBJS-$(CONFIG_SCREENPRESSO_DECODER)    += screenpresso.o
 OBJS-$(CONFIG_SGI_DECODER)             += sgidec.o
@@ -398,29 +468,46 @@ OBJS-$(CONFIG_SIPR_DECODER)            += sipr.o acelp_pitch_delay.o \
 OBJS-$(CONFIG_SMACKAUD_DECODER)        += smacker.o
 OBJS-$(CONFIG_SMACKER_DECODER)         += smacker.o
 OBJS-$(CONFIG_SMC_DECODER)             += smc.o
+OBJS-$(CONFIG_SMVJPEG_DECODER)         += smvjpegdec.o
+OBJS-$(CONFIG_SNOW_DECODER)            += snowdec.o snow.o snow_dwt.o
+OBJS-$(CONFIG_SNOW_ENCODER)            += snowenc.o snow.o snow_dwt.o             \
+                                          h263.o ituh263enc.o
 OBJS-$(CONFIG_SOL_DPCM_DECODER)        += dpcm.o
+OBJS-$(CONFIG_SONIC_DECODER)           += sonic.o
+OBJS-$(CONFIG_SONIC_ENCODER)           += sonic.o
+OBJS-$(CONFIG_SONIC_LS_ENCODER)        += sonic.o
 OBJS-$(CONFIG_SP5X_DECODER)            += sp5xdec.o
-OBJS-$(CONFIG_SRT_DECODER)             += srtdec.o ass.o
+OBJS-$(CONFIG_SRT_DECODER)             += srtdec.o ass.o htmlsubtitles.o
+OBJS-$(CONFIG_SRT_ENCODER)             += srtenc.o ass_split.o
+OBJS-$(CONFIG_STL_DECODER)             += textdec.o ass.o
+OBJS-$(CONFIG_SUBRIP_DECODER)          += srtdec.o ass.o htmlsubtitles.o
+OBJS-$(CONFIG_SUBRIP_ENCODER)          += srtenc.o ass_split.o
+OBJS-$(CONFIG_SUBVIEWER1_DECODER)      += textdec.o ass.o
+OBJS-$(CONFIG_SUBVIEWER_DECODER)       += subviewerdec.o ass.o
 OBJS-$(CONFIG_SUNRAST_DECODER)         += sunrast.o
 OBJS-$(CONFIG_SUNRAST_ENCODER)         += sunrastenc.o
 OBJS-$(CONFIG_SVQ1_DECODER)            += svq1dec.o svq1.o svq13.o h263.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += svq1enc.o svq1.o    \
                                           h263.o ituh263enc.o
 OBJS-$(CONFIG_SVQ3_DECODER)            += svq3.o svq13.o mpegutils.o
+OBJS-$(CONFIG_TEXT_DECODER)            += textdec.o ass.o
 OBJS-$(CONFIG_TAK_DECODER)             += takdec.o tak.o
 OBJS-$(CONFIG_TARGA_DECODER)           += targa.o
 OBJS-$(CONFIG_TARGA_ENCODER)           += targaenc.o rle.o
+OBJS-$(CONFIG_TARGA_Y216_DECODER)      += targa_y216dec.o
 OBJS-$(CONFIG_TDSC_DECODER)            += tdsc.o
 OBJS-$(CONFIG_TIERTEXSEQVIDEO_DECODER) += tiertexseqv.o
-OBJS-$(CONFIG_TIFF_DECODER)            += tiff.o lzw.o faxcompr.o
-OBJS-$(CONFIG_TIFF_ENCODER)            += tiffenc.o rle.o lzwenc.o
+OBJS-$(CONFIG_TIFF_DECODER)            += tiff.o lzw.o faxcompr.o tiff_data.o tiff_common.o
+OBJS-$(CONFIG_TIFF_ENCODER)            += tiffenc.o rle.o lzwenc.o tiff_data.o
 OBJS-$(CONFIG_TMV_DECODER)             += tmv.o cga_data.o
+OBJS-$(CONFIG_TRUEHD_DECODER)          += mlpdec.o mlpdsp.o
 OBJS-$(CONFIG_TRUEMOTION1_DECODER)     += truemotion1.o
 OBJS-$(CONFIG_TRUEMOTION2_DECODER)     += truemotion2.o
 OBJS-$(CONFIG_TRUESPEECH_DECODER)      += truespeech.o
 OBJS-$(CONFIG_TSCC_DECODER)            += tscc.o msrledec.o
 OBJS-$(CONFIG_TSCC2_DECODER)           += tscc2.o
-OBJS-$(CONFIG_TTA_DECODER)             += tta.o
+OBJS-$(CONFIG_TTA_DECODER)             += tta.o ttadata.o ttadsp.o
+OBJS-$(CONFIG_TTA_ENCODER)             += ttaenc.o ttadata.o
 OBJS-$(CONFIG_TWINVQ_DECODER)          += twinvqdec.o twinvq.o
 OBJS-$(CONFIG_TXD_DECODER)             += txd.o
 OBJS-$(CONFIG_ULTI_DECODER)            += ulti.o
@@ -428,6 +515,10 @@ OBJS-$(CONFIG_UTVIDEO_DECODER)         += utvideodec.o utvideo.o
 OBJS-$(CONFIG_UTVIDEO_ENCODER)         += utvideoenc.o utvideo.o
 OBJS-$(CONFIG_V210_DECODER)            += v210dec.o
 OBJS-$(CONFIG_V210_ENCODER)            += v210enc.o
+OBJS-$(CONFIG_V308_DECODER)            += v308dec.o
+OBJS-$(CONFIG_V308_ENCODER)            += v308enc.o
+OBJS-$(CONFIG_V408_DECODER)            += v408dec.o
+OBJS-$(CONFIG_V408_ENCODER)            += v408enc.o
 OBJS-$(CONFIG_V410_DECODER)            += v410dec.o
 OBJS-$(CONFIG_V410_ENCODER)            += v410enc.o
 OBJS-$(CONFIG_V210X_DECODER)           += v210x.o
@@ -436,7 +527,9 @@ OBJS-$(CONFIG_VBLE_DECODER)            += vble.o
 OBJS-$(CONFIG_VC1_DECODER)             += vc1dec.o vc1_block.o vc1_loopfilter.o \
                                           vc1_mc.o vc1_pred.o vc1.o vc1data.o \
                                           vc1dsp.o \
-                                          msmpeg4dec.o msmpeg4.o msmpeg4data.o
+                                          msmpeg4dec.o msmpeg4.o msmpeg4data.o \
+                                          wmv2dsp.o
+OBJS-$(CONFIG_VC1_QSV_DECODER)         += qsvdec_vc1.o
 OBJS-$(CONFIG_VCR1_DECODER)            += vcr1.o
 OBJS-$(CONFIG_VMDAUDIO_DECODER)        += vmdaudio.o
 OBJS-$(CONFIG_VMDVIDEO_DECODER)        += vmdvideo.o
@@ -451,11 +544,16 @@ OBJS-$(CONFIG_VP6_DECODER)             += vp6.o vp56.o vp56data.o \
                                           vp6dsp.o vp56rac.o
 OBJS-$(CONFIG_VP7_DECODER)             += vp8.o vp56rac.o
 OBJS-$(CONFIG_VP8_DECODER)             += vp8.o vp56rac.o
-OBJS-$(CONFIG_VP9_DECODER)             += vp9.o vp9data.o vp9dsp.o \
-                                          vp9block.o vp9prob.o vp9mvs.o vp56rac.o
+OBJS-$(CONFIG_VP9_DECODER)             += vp9.o vp9dsp.o vp56rac.o vp9dsp_8bpp.o \
+                                          vp9dsp_10bpp.o vp9dsp_12bpp.o
+OBJS-$(CONFIG_VPLAYER_DECODER)         += textdec.o ass.o
 OBJS-$(CONFIG_VQA_DECODER)             += vqavideo.o
 OBJS-$(CONFIG_WAVPACK_DECODER)         += wavpack.o
-OBJS-$(CONFIG_WEBP_DECODER)            += webp.o
+OBJS-$(CONFIG_WAVPACK_ENCODER)         += wavpackenc.o
+OBJS-$(CONFIG_WEBP_DECODER)            += vp8.o vp8dsp.o vp56rac.o
+OBJS-$(CONFIG_WEBP_DECODER)            += webp.o exif.o tiff_common.o
+OBJS-$(CONFIG_WEBVTT_DECODER)          += webvttdec.o ass.o
+OBJS-$(CONFIG_WEBVTT_ENCODER)          += webvttenc.o ass_split.o
 OBJS-$(CONFIG_WMALOSSLESS_DECODER)     += wmalosslessdec.o wma_common.o
 OBJS-$(CONFIG_WMAPRO_DECODER)          += wmaprodec.o wma.o wma_common.o
 OBJS-$(CONFIG_WMAV1_DECODER)           += wmadec.o wma.o wma_common.o aactab.o
@@ -466,6 +564,7 @@ OBJS-$(CONFIG_WMAVOICE_DECODER)        += wmavoice.o \
                                           celp_filters.o \
                                           acelp_vectors.o acelp_filters.o
 OBJS-$(CONFIG_WMV1_DECODER)            += msmpeg4dec.o msmpeg4.o msmpeg4data.o
+OBJS-$(CONFIG_WMV1_ENCODER)            += msmpeg4enc.o
 OBJS-$(CONFIG_WMV2_DECODER)            += wmv2dec.o wmv2.o \
                                           msmpeg4dec.o msmpeg4.o msmpeg4data.o
 OBJS-$(CONFIG_WMV2_ENCODER)            += wmv2enc.o wmv2.o \
@@ -475,14 +574,21 @@ OBJS-$(CONFIG_WS_SND1_DECODER)         += ws-snd1.o
 OBJS-$(CONFIG_XAN_DPCM_DECODER)        += dpcm.o
 OBJS-$(CONFIG_XAN_WC3_DECODER)         += xan.o
 OBJS-$(CONFIG_XAN_WC4_DECODER)         += xxan.o
+OBJS-$(CONFIG_XBIN_DECODER)            += bintext.o cga_data.o
 OBJS-$(CONFIG_XBM_DECODER)             += xbmdec.o
 OBJS-$(CONFIG_XBM_ENCODER)             += xbmenc.o
+OBJS-$(CONFIG_XFACE_DECODER)           += xfacedec.o xface.o
+OBJS-$(CONFIG_XFACE_ENCODER)           += xfaceenc.o xface.o
 OBJS-$(CONFIG_XL_DECODER)              += xl.o
 OBJS-$(CONFIG_XSUB_DECODER)            += xsubdec.o
 OBJS-$(CONFIG_XSUB_ENCODER)            += xsubenc.o
 OBJS-$(CONFIG_XWD_DECODER)             += xwddec.o
 OBJS-$(CONFIG_XWD_ENCODER)             += xwdenc.o
+OBJS-$(CONFIG_Y41P_DECODER)            += y41pdec.o
+OBJS-$(CONFIG_Y41P_ENCODER)            += y41penc.o
 OBJS-$(CONFIG_YOP_DECODER)             += yop.o
+OBJS-$(CONFIG_YUV4_DECODER)            += yuv4dec.o
+OBJS-$(CONFIG_YUV4_ENCODER)            += yuv4enc.o
 OBJS-$(CONFIG_ZEROCODEC_DECODER)       += zerocodec.o
 OBJS-$(CONFIG_ZLIB_DECODER)            += lcldec.o
 OBJS-$(CONFIG_ZLIB_ENCODER)            += lclenc.o
@@ -507,13 +613,16 @@ OBJS-$(CONFIG_PCM_MULAW_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_MULAW_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S8_DECODER)             += pcm.o
 OBJS-$(CONFIG_PCM_S8_ENCODER)             += pcm.o
-OBJS-$(CONFIG_PCM_S8_PLANAR_DECODER)      += 8svx.o
+OBJS-$(CONFIG_PCM_S8_PLANAR_DECODER)      += pcm.o
+OBJS-$(CONFIG_PCM_S8_PLANAR_ENCODER)      += pcm.o
 OBJS-$(CONFIG_PCM_S16BE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S16BE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S16BE_PLANAR_DECODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S16BE_PLANAR_ENCODER)   += pcm.o
 OBJS-$(CONFIG_PCM_S16LE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S16LE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S16LE_PLANAR_DECODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S16LE_PLANAR_ENCODER)   += pcm.o
 OBJS-$(CONFIG_PCM_S24BE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S24BE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S24DAUD_DECODER)        += pcm.o
@@ -521,11 +630,13 @@ OBJS-$(CONFIG_PCM_S24DAUD_ENCODER)        += pcm.o
 OBJS-$(CONFIG_PCM_S24LE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S24LE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S24LE_PLANAR_DECODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S24LE_PLANAR_ENCODER)   += pcm.o
 OBJS-$(CONFIG_PCM_S32BE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S32BE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S32LE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S32LE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S32LE_PLANAR_DECODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S32LE_PLANAR_ENCODER)   += pcm.o
 OBJS-$(CONFIG_PCM_U8_DECODER)             += pcm.o
 OBJS-$(CONFIG_PCM_U8_ENCODER)             += pcm.o
 OBJS-$(CONFIG_PCM_U16BE_DECODER)          += pcm.o
@@ -545,7 +656,9 @@ OBJS-$(CONFIG_PCM_ZORK_DECODER)           += pcm.o
 OBJS-$(CONFIG_ADPCM_4XM_DECODER)          += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_ADX_DECODER)          += adxdec.o adx.o
 OBJS-$(CONFIG_ADPCM_ADX_ENCODER)          += adxenc.o adx.o
+OBJS-$(CONFIG_ADPCM_AFC_DECODER)          += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_CT_DECODER)           += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_DTK_DECODER)          += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_EA_DECODER)           += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_EA_MAXIS_XA_DECODER)  += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_EA_R1_DECODER)        += adpcm.o adpcm_data.o
@@ -556,6 +669,7 @@ OBJS-$(CONFIG_ADPCM_G722_DECODER)         += g722.o g722dsp.o g722dec.o
 OBJS-$(CONFIG_ADPCM_G722_ENCODER)         += g722.o g722dsp.o g722enc.o
 OBJS-$(CONFIG_ADPCM_G726_DECODER)         += g726.o
 OBJS-$(CONFIG_ADPCM_G726_ENCODER)         += g726.o
+OBJS-$(CONFIG_ADPCM_G726LE_DECODER)       += g726.o
 OBJS-$(CONFIG_ADPCM_IMA_AMV_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_APC_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_DK3_DECODER)      += adpcm.o adpcm_data.o
@@ -563,8 +677,10 @@ OBJS-$(CONFIG_ADPCM_IMA_DK4_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_EA_EACS_DECODER)  += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_EA_SEAD_DECODER)  += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_ISS_DECODER)      += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_IMA_OKI_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_QT_DECODER)       += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_QT_ENCODER)       += adpcmenc.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_IMA_RAD_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_SMJPEG_DECODER)   += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_WAV_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_WAV_ENCODER)      += adpcmenc.o adpcm_data.o
@@ -586,25 +702,35 @@ OBJS-$(CONFIG_ADPCM_YAMAHA_ENCODER)       += adpcmenc.o adpcm_data.o
 OBJS-$(CONFIG_D3D11VA)                    += dxva2.o
 OBJS-$(CONFIG_DXVA2)                      += dxva2.o
 OBJS-$(CONFIG_VAAPI)                      += vaapi.o
-OBJS-$(CONFIG_VDA)                        += vda.o
+OBJS-$(CONFIG_VDA)                        += vda.o videotoolbox.o
+OBJS-$(CONFIG_VIDEOTOOLBOX)               += videotoolbox.o
 OBJS-$(CONFIG_VDPAU)                      += vdpau.o
 
 OBJS-$(CONFIG_H263_VAAPI_HWACCEL)         += vaapi_mpeg4.o
 OBJS-$(CONFIG_H263_VDPAU_HWACCEL)         += vdpau_mpeg4.o
+OBJS-$(CONFIG_H263_VIDEOTOOLBOX_HWACCEL)  += videotoolbox.o
 OBJS-$(CONFIG_H264_D3D11VA_HWACCEL)       += dxva2_h264.o
 OBJS-$(CONFIG_H264_DXVA2_HWACCEL)         += dxva2_h264.o
 OBJS-$(CONFIG_H264_VAAPI_HWACCEL)         += vaapi_h264.o
 OBJS-$(CONFIG_H264_VDA_HWACCEL)           += vda_h264.o
 OBJS-$(CONFIG_H264_VDPAU_HWACCEL)         += vdpau_h264.o
+OBJS-$(CONFIG_H264_VIDEOTOOLBOX_HWACCEL)  += videotoolbox.o
 OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL)       += dxva2_hevc.o
 OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL)         += dxva2_hevc.o
+OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL)         += vaapi_hevc.o
+OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL)         += vdpau_hevc.o
 OBJS-$(CONFIG_MPEG1_VDPAU_HWACCEL)        += vdpau_mpeg12.o
+OBJS-$(CONFIG_MPEG1_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o
+OBJS-$(CONFIG_MPEG1_XVMC_HWACCEL)         += mpegvideo_xvmc.o
 OBJS-$(CONFIG_MPEG2_D3D11VA_HWACCEL)      += dxva2_mpeg2.o
 OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL)        += dxva2_mpeg2.o
 OBJS-$(CONFIG_MPEG2_VAAPI_HWACCEL)        += vaapi_mpeg2.o
 OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL)        += vdpau_mpeg12.o
+OBJS-$(CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o
+OBJS-$(CONFIG_MPEG2_XVMC_HWACCEL)         += mpegvideo_xvmc.o
 OBJS-$(CONFIG_MPEG4_VAAPI_HWACCEL)        += vaapi_mpeg4.o
 OBJS-$(CONFIG_MPEG4_VDPAU_HWACCEL)        += vdpau_mpeg4.o
+OBJS-$(CONFIG_MPEG4_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o
 OBJS-$(CONFIG_VC1_D3D11VA_HWACCEL)        += dxva2_vc1.o
 OBJS-$(CONFIG_VC1_DXVA2_HWACCEL)          += dxva2_vc1.o
 OBJS-$(CONFIG_VC1_VAAPI_HWACCEL)          += vaapi_vc1.o
@@ -612,38 +738,50 @@ OBJS-$(CONFIG_VC1_VDPAU_HWACCEL)          += vdpau_vc1.o
 
 # libavformat dependencies
 OBJS-$(CONFIG_ADTS_MUXER)              += mpeg4audio.o
+OBJS-$(CONFIG_AVI_DEMUXER)             += mpeg4audio.o mpegaudiodata.o
 OBJS-$(CONFIG_CAF_DEMUXER)             += mpeg4audio.o mpegaudiodata.o  \
                                           ac3tab.o
-OBJS-$(CONFIG_FLAC_MUXER)              += flac.o flacdata.o
+OBJS-$(CONFIG_FLAC_DEMUXER)            += flac.o flacdata.o vorbis_data.o
+OBJS-$(CONFIG_FLAC_MUXER)              += flac.o flacdata.o vorbis_data.o
 OBJS-$(CONFIG_FLV_DEMUXER)             += mpeg4audio.o
 OBJS-$(CONFIG_GXF_DEMUXER)             += mpeg12data.o
 OBJS-$(CONFIG_IFF_DEMUXER)             += iff.o
 OBJS-$(CONFIG_ISMV_MUXER)              += mpeg4audio.o mpegaudiodata.o
 OBJS-$(CONFIG_LATM_MUXER)              += mpeg4audio.o
-OBJS-$(CONFIG_MATROSKA_AUDIO_MUXER)    += mpeg4audio.o                  \
+OBJS-$(CONFIG_MATROSKA_AUDIO_MUXER)    += mpeg4audio.o vorbis_data.o    \
                                           flac.o flacdata.o
 OBJS-$(CONFIG_MATROSKA_DEMUXER)        += mpeg4audio.o mpegaudiodata.o
 OBJS-$(CONFIG_MATROSKA_MUXER)          += mpeg4audio.o mpegaudiodata.o  \
-                                          flac.o flacdata.o
+                                          flac.o flacdata.o vorbis_data.o
 OBJS-$(CONFIG_MP2_MUXER)               += mpegaudiodata.o mpegaudiodecheader.o
 OBJS-$(CONFIG_MP3_MUXER)               += mpegaudiodata.o mpegaudiodecheader.o
 OBJS-$(CONFIG_MOV_DEMUXER)             += mpeg4audio.o mpegaudiodata.o ac3tab.o
 OBJS-$(CONFIG_MOV_MUXER)               += mpeg4audio.o mpegaudiodata.o
 OBJS-$(CONFIG_MPEGTS_MUXER)            += mpeg4audio.o
 OBJS-$(CONFIG_MPEGTS_DEMUXER)          += mpeg4audio.o mpegaudiodata.o
+OBJS-$(CONFIG_MXF_MUXER)               += dnxhddata.o
 OBJS-$(CONFIG_NUT_MUXER)               += mpegaudiodata.o
+OBJS-$(CONFIG_NUT_DEMUXER)             += mpegaudiodata.o mpeg4audio.o
+OBJS-$(CONFIG_OGA_MUXER)               += flac.o flacdata.o
 OBJS-$(CONFIG_OGG_DEMUXER)             += mpeg12data.o \
-                                          dirac.o
-OBJS-$(CONFIG_OGG_MUXER)               += flac.o flacdata.o
+                                          dirac.o vorbis_data.o
+OBJS-$(CONFIG_OGG_MUXER)               += flac.o flacdata.o \
+                                          vorbis_data.o
 OBJS-$(CONFIG_RTP_MUXER)               += mpeg4audio.o
 OBJS-$(CONFIG_SPDIF_DEMUXER)           += aacadtsdec.o mpeg4audio.o
 OBJS-$(CONFIG_SPDIF_MUXER)             += dca.o
 OBJS-$(CONFIG_TAK_DEMUXER)             += tak.o
 OBJS-$(CONFIG_WEBM_MUXER)              += mpeg4audio.o mpegaudiodata.o  \
-                                          flac.o flacdata.o
+                                          flac.o flacdata.o \
+                                          vorbis_data.o
 OBJS-$(CONFIG_WTV_DEMUXER)             += mpeg4audio.o mpegaudiodata.o
 
+# libavfilter dependencies
+OBJS-$(CONFIG_ELBG_FILTER)             += elbg.o
+
 # external codec libraries
+OBJS-$(CONFIG_LIBAACPLUS_ENCODER)         += libaacplus.o
+OBJS-$(CONFIG_LIBCELT_DECODER)            += libcelt_dec.o
 OBJS-$(CONFIG_LIBDCADEC_DECODER)          += libdcadec.o dca.o
 OBJS-$(CONFIG_LIBFAAC_ENCODER)            += libfaac.o
 OBJS-$(CONFIG_LIBFDK_AAC_DECODER)         += libfdk-aacdec.o
@@ -654,6 +792,7 @@ OBJS-$(CONFIG_LIBGSM_MS_DECODER)          += libgsmdec.o
 OBJS-$(CONFIG_LIBGSM_MS_ENCODER)          += libgsmenc.o
 OBJS-$(CONFIG_LIBILBC_DECODER)            += libilbc.o
 OBJS-$(CONFIG_LIBILBC_ENCODER)            += libilbc.o
+OBJS-$(CONFIG_LIBKVAZAAR_ENCODER)         += libkvazaar.o
 OBJS-$(CONFIG_LIBMP3LAME_ENCODER)         += libmp3lame.o mpegaudiodecheader.o
 OBJS-$(CONFIG_LIBOPENCORE_AMRNB_DECODER)  += libopencore-amr.o
 OBJS-$(CONFIG_LIBOPENCORE_AMRNB_ENCODER)  += libopencore-amr.o
@@ -669,25 +808,32 @@ OBJS-$(CONFIG_LIBSCHROEDINGER_DECODER)    += libschroedingerdec.o \
                                              libschroedinger.o
 OBJS-$(CONFIG_LIBSCHROEDINGER_ENCODER)    += libschroedingerenc.o \
                                              libschroedinger.o
+OBJS-$(CONFIG_LIBSHINE_ENCODER)           += libshine.o
 OBJS-$(CONFIG_LIBSPEEX_DECODER)           += libspeexdec.o
 OBJS-$(CONFIG_LIBSPEEX_ENCODER)           += libspeexenc.o
+OBJS-$(CONFIG_LIBSTAGEFRIGHT_H264_DECODER)+= libstagefright.o
 OBJS-$(CONFIG_LIBTHEORA_ENCODER)          += libtheoraenc.o
 OBJS-$(CONFIG_LIBTWOLAME_ENCODER)         += libtwolame.o
+OBJS-$(CONFIG_LIBUTVIDEO_DECODER)         += libutvideodec.o
+OBJS-$(CONFIG_LIBUTVIDEO_ENCODER)         += libutvideoenc.o
 OBJS-$(CONFIG_LIBVO_AACENC_ENCODER)       += libvo-aacenc.o mpeg4audio.o
 OBJS-$(CONFIG_LIBVO_AMRWBENC_ENCODER)     += libvo-amrwbenc.o
-OBJS-$(CONFIG_LIBVORBIS_ENCODER)          += libvorbis.o \
+OBJS-$(CONFIG_LIBVORBIS_DECODER)          += libvorbisdec.o
+OBJS-$(CONFIG_LIBVORBIS_ENCODER)          += libvorbisenc.o \
                                              vorbis_data.o
-OBJS-$(CONFIG_LIBVPX_VP8_DECODER)         += libvpxdec.o libvpx.o
-OBJS-$(CONFIG_LIBVPX_VP8_ENCODER)         += libvpxenc.o libvpx.o
+OBJS-$(CONFIG_LIBVPX_VP8_DECODER)         += libvpxdec.o
+OBJS-$(CONFIG_LIBVPX_VP8_ENCODER)         += libvpxenc.o
 OBJS-$(CONFIG_LIBVPX_VP9_DECODER)         += libvpxdec.o libvpx.o
 OBJS-$(CONFIG_LIBVPX_VP9_ENCODER)         += libvpxenc.o libvpx.o
 OBJS-$(CONFIG_LIBWAVPACK_ENCODER)         += libwavpackenc.o
-OBJS-$(CONFIG_LIBWEBP_ENCODER)            += libwebpenc.o
+OBJS-$(CONFIG_LIBWEBP_ENCODER)            += libwebpenc_common.o libwebpenc.o
+OBJS-$(CONFIG_LIBWEBP_ANIM_ENCODER)       += libwebpenc_common.o libwebpenc_animencoder.o
 OBJS-$(CONFIG_LIBX262_ENCODER)            += libx264.o
 OBJS-$(CONFIG_LIBX264_ENCODER)            += libx264.o
 OBJS-$(CONFIG_LIBX265_ENCODER)            += libx265.o
 OBJS-$(CONFIG_LIBXAVS_ENCODER)            += libxavs.o
 OBJS-$(CONFIG_LIBXVID_ENCODER)            += libxvid.o
+OBJS-$(CONFIG_LIBZVBI_TELETEXT_DECODER)   += libzvbi-teletextdec.o
 
 # parsers
 OBJS-$(CONFIG_AAC_PARSER)              += aac_parser.o aac_ac3_parser.o \
@@ -704,8 +850,11 @@ OBJS-$(CONFIG_DIRAC_PARSER)            += dirac_parser.o
 OBJS-$(CONFIG_DNXHD_PARSER)            += dnxhd_parser.o
 OBJS-$(CONFIG_DPX_PARSER)              += dpx_parser.o
 OBJS-$(CONFIG_DVBSUB_PARSER)           += dvbsub_parser.o
+OBJS-$(CONFIG_DVD_NAV_PARSER)          += dvd_nav_parser.o
 OBJS-$(CONFIG_DVDSUB_PARSER)           += dvdsub_parser.o
-OBJS-$(CONFIG_FLAC_PARSER)             += flac_parser.o flacdata.o flac.o
+OBJS-$(CONFIG_FLAC_PARSER)             += flac_parser.o flacdata.o flac.o \
+                                          vorbis_data.o
+OBJS-$(CONFIG_G729_PARSER)             += g729_parser.o
 OBJS-$(CONFIG_GSM_PARSER)              += gsm_parser.o
 OBJS-$(CONFIG_H261_PARSER)             += h261_parser.o
 OBJS-$(CONFIG_H263_PARSER)             += h263_parser.o
@@ -716,6 +865,7 @@ OBJS-$(CONFIG_MLP_PARSER)              += mlp_parser.o mlp.o
 OBJS-$(CONFIG_MPEG4VIDEO_PARSER)       += mpeg4video_parser.o h263.o \
                                           mpeg4videodec.o mpeg4video.o \
                                           ituh263dec.o h263dec.o h263data.o
+OBJS-$(CONFIG_PNG_PARSER)              += png_parser.o
 OBJS-$(CONFIG_MPEGAUDIO_PARSER)        += mpegaudio_parser.o \
                                           mpegaudiodecheader.o mpegaudiodata.o
 OBJS-$(CONFIG_MPEGVIDEO_PARSER)        += mpegvideo_parser.o    \
@@ -729,6 +879,7 @@ OBJS-$(CONFIG_TAK_PARSER)              += tak_parser.o tak.o
 OBJS-$(CONFIG_VC1_PARSER)              += vc1_parser.o
 OBJS-$(CONFIG_VP3_PARSER)              += vp3_parser.o
 OBJS-$(CONFIG_VP8_PARSER)              += vp8_parser.o
+OBJS-$(CONFIG_VP9_PARSER)              += vp9_parser.o
 
 # bitstream filters
 OBJS-$(CONFIG_AAC_ADTSTOASC_BSF)          += aac_adtstoasc_bsf.o aacadtsdec.o \
@@ -740,7 +891,10 @@ OBJS-$(CONFIG_HEVC_MP4TOANNEXB_BSF)       += hevc_mp4toannexb_bsf.o
 OBJS-$(CONFIG_IMX_DUMP_HEADER_BSF)        += imx_dump_header_bsf.o
 OBJS-$(CONFIG_MJPEG2JPEG_BSF)             += mjpeg2jpeg_bsf.o
 OBJS-$(CONFIG_MJPEGA_DUMP_HEADER_BSF)     += mjpega_dump_header_bsf.o
+OBJS-$(CONFIG_MPEG4_UNPACK_BFRAMES_BSF)   += mpeg4_unpack_bframes_bsf.o
 OBJS-$(CONFIG_MOV2TEXTSUB_BSF)            += movsub_bsf.o
+OBJS-$(CONFIG_MP3_HEADER_DECOMPRESS_BSF)  += mp3_header_decompress_bsf.o \
+                                             mpegaudiodata.o
 OBJS-$(CONFIG_NOISE_BSF)                  += noise_bsf.o
 OBJS-$(CONFIG_REMOVE_EXTRADATA_BSF)       += remove_extradata_bsf.o
 OBJS-$(CONFIG_TEXT2MOVSUB_BSF)            += movsub_bsf.o
@@ -749,44 +903,72 @@ OBJS-$(CONFIG_TEXT2MOVSUB_BSF)            += movsub_bsf.o
 OBJS-$(HAVE_LIBC_MSVCRT)               += file_open.o
 OBJS-$(HAVE_THREADS)                   += pthread.o pthread_slice.o pthread_frame.o
 
+OBJS-$(CONFIG_FRAME_THREAD_ENCODER)    += frame_thread_encoder.o
+
+# Windows resource file
+SLIBOBJS-$(HAVE_GNU_WINDRES)           += avcodecres.o
+
 SKIPHEADERS                            += %_tablegen.h                  \
                                           %_tables.h                    \
                                           aac_tablegen_decl.h           \
                                           fft-internal.h                \
                                           tableprint.h                  \
+                                          tableprint_vlc.h              \
+                                          aaccoder_twoloop.h            \
+                                          aaccoder_trellis.h            \
                                           $(ARCH)/vp56_arith.h          \
 
 SKIPHEADERS-$(CONFIG_D3D11VA)          += d3d11va.h dxva2_internal.h
 SKIPHEADERS-$(CONFIG_DXVA2)            += dxva2.h dxva2_internal.h
 SKIPHEADERS-$(CONFIG_LIBSCHROEDINGER)  += libschroedinger.h
+SKIPHEADERS-$(CONFIG_LIBUTVIDEO)       += libutvideo.h
 SKIPHEADERS-$(CONFIG_LIBVPX)           += libvpx.h
-SKIPHEADERS-$(CONFIG_MPEG_XVMC_DECODER) += xvmc.h
-SKIPHEADERS-$(CONFIG_NVENC)            += nvenc.h
+SKIPHEADERS-$(CONFIG_LIBWEBP_ENCODER)  += libwebpenc_common.h
 SKIPHEADERS-$(CONFIG_QSV)              += qsv.h qsv_internal.h
 SKIPHEADERS-$(CONFIG_QSVDEC)           += qsvdec.h
 SKIPHEADERS-$(CONFIG_QSVENC)           += qsvenc.h
+SKIPHEADERS-$(CONFIG_XVMC)             += xvmc.h
 SKIPHEADERS-$(CONFIG_VAAPI)            += vaapi_internal.h
-SKIPHEADERS-$(CONFIG_VDA)              += vda.h vda_internal.h
+SKIPHEADERS-$(CONFIG_VDA)              += vda.h vda_vt_internal.h
 SKIPHEADERS-$(CONFIG_VDPAU)            += vdpau.h vdpau_internal.h
+SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX)     += videotoolbox.h vda_vt_internal.h
 
-TESTPROGS-$(CONFIG_FFT)                   += fft fft-fixed
+TESTPROGS = imgconvert                                                  \
+            jpeg2000dwt                                                 \
+            mathops                                                    \
+            options                                                     \
+            avfft                                                       \
+
+TESTPROGS-$(CONFIG_CABAC)                 += cabac
+TESTPROGS-$(CONFIG_FFT)                   += fft fft-fixed fft-fixed32
 TESTPROGS-$(CONFIG_IDCTDSP)               += dct
 TESTPROGS-$(CONFIG_IIRFILTER)             += iirfilter
+TESTPROGS-$(HAVE_MMX)                     += motion
 TESTPROGS-$(CONFIG_GOLOMB)                += golomb
 TESTPROGS-$(CONFIG_RANGECODER)            += rangecoder
+TESTPROGS-$(CONFIG_SNOW_ENCODER)          += snowenc
 
 TESTOBJS = dctref.o
 
+TOOLS = fourcc2pixfmt
+
 HOSTPROGS = aac_tablegen                                                \
             aacps_tablegen                                              \
+            aacps_fixed_tablegen                                        \
+            aacsbr_tablegen                                             \
+            aacsbr_fixed_tablegen                                       \
+            cabac_tablegen                                              \
             cbrt_tablegen                                               \
+            cbrt_fixed_tablegen                                         \
             cos_tablegen                                                \
+            dsd_tablegen                                                \
             dv_tablegen                                                 \
             motionpixels_tablegen                                       \
             mpegaudio_tablegen                                          \
             pcm_tablegen                                                \
             qdm2_tablegen                                               \
             sinewin_tablegen                                            \
+            sinewin_fixed_tablegen                                      \
 
 CLEANFILES = *_tables.c *_tables.h *_tablegen$(HOSTEXESUF)
 
@@ -805,8 +987,9 @@ else
 $(SUBDIR)%_tablegen$(HOSTEXESUF): HOSTCFLAGS += -DCONFIG_SMALL=0
 endif
 
-GEN_HEADERS = cbrt_tables.h aacps_tables.h aac_tables.h dv_tables.h     \
-              sinewin_tables.h mpegaudio_tables.h motionpixels_tables.h \
+GEN_HEADERS = cabac_tables.h cbrt_tables.h cbrt_fixed_tables.h aacps_tables.h aacps_fixed_tables.h aacsbr_tables.h \
+              aacsbr_fixed_tables.h aac_tables.h dsd_tables.h dv_tables.h     \
+              sinewin_tables.h sinewin_fixed_tables.h mpegaudio_tables.h motionpixels_tables.h \
               pcm_tables.h qdm2_tables.h
 GEN_HEADERS := $(addprefix $(SUBDIR), $(GEN_HEADERS))
 
@@ -815,10 +998,18 @@ $(GEN_HEADERS): $(SUBDIR)%_tables.h: $(SUBDIR)%_tablegen$(HOSTEXESUF)
 
 ifdef CONFIG_HARDCODED_TABLES
 $(SUBDIR)aacdec.o: $(SUBDIR)cbrt_tables.h
-$(SUBDIR)aacps.o: $(SUBDIR)aacps_tables.h
+$(SUBDIR)aacdec_fixed.o: $(SUBDIR)cbrt_fixed_tables.h
+$(SUBDIR)aacps_float.o: $(SUBDIR)aacps_tables.h
+$(SUBDIR)aacps_fixed.o: $(SUBDIR)aacps_fixed_tables.h
+$(SUBDIR)aacsbr.o: $(SUBDIR)aacsbr_tables.h
+$(SUBDIR)aacsbr_fixed.o: $(SUBDIR)aacsbr_fixed_tables.h
 $(SUBDIR)aactab.o: $(SUBDIR)aac_tables.h
+$(SUBDIR)aactab_fixed.o: $(SUBDIR)aac_fixed_tables.h
+$(SUBDIR)cabac.o: $(SUBDIR)cabac_tables.h
+$(SUBDIR)dsddec.o: $(SUBDIR)dsd_tables.h
 $(SUBDIR)dvenc.o: $(SUBDIR)dv_tables.h
 $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
+$(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
 $(SUBDIR)mpegaudiodec_fixed.o: $(SUBDIR)mpegaudio_tables.h
 $(SUBDIR)mpegaudiodec_float.o: $(SUBDIR)mpegaudio_tables.h
 $(SUBDIR)motionpixels.o: $(SUBDIR)motionpixels_tables.h
diff --git a/libavcodec/a64colors.h b/libavcodec/a64colors.h
index d977426fc0..a9cdb6fa76 100644
--- a/libavcodec/a64colors.h
+++ b/libavcodec/a64colors.h
@@ -2,20 +2,20 @@
  * a64 video encoder - c64 colors in rgb (Pepto)
  * Copyright (c) 2009 Tobias Bindhammer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/a64enc.h b/libavcodec/a64enc.h
deleted file mode 100644
index 65c1d30951..0000000000
--- a/libavcodec/a64enc.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * a64 video encoder - basic headers
- * Copyright (c) 2009 Tobias Bindhammer
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * a64 video encoder - basic headers
- */
-
-#ifndef AVCODEC_A64ENC_H
-#define AVCODEC_A64ENC_H
-
-#include "libavutil/lfg.h"
-#include "avcodec.h"
-
-#define C64XRES 320
-#define C64YRES 200
-
-typedef struct A64Context {
-    /* variables for multicolor modes */
-    AVLFG randctx;
-    int mc_lifetime;
-    int mc_use_5col;
-    unsigned mc_frame_counter;
-    int *mc_meta_charset;
-    int *mc_charmap;
-    int *mc_best_cb;
-    int mc_luma_vals[5];
-    uint8_t *mc_charset;
-    uint8_t *mc_colram;
-    uint8_t *mc_palette;
-    int mc_pal_size;
-
-    /* pts of the next packet that will be output */
-    int64_t next_pts;
-} A64Context;
-
-#endif /* AVCODEC_A64ENC_H */
diff --git a/libavcodec/a64multienc.c b/libavcodec/a64multienc.c
index 5d8d162348..91aac0933f 100644
--- a/libavcodec/a64multienc.c
+++ b/libavcodec/a64multienc.c
@@ -2,20 +2,20 @@
  * a64 video encoder - multicolor modes
  * Copyright (c) 2009 Tobias Bindhammer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,11 +24,11 @@
  * a64 video encoder - multicolor modes
  */
 
-#include "a64enc.h"
 #include "a64colors.h"
 #include "a64tables.h"
 #include "elbg.h"
 #include "internal.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 
@@ -37,6 +37,28 @@
 #define INTERLACED    1
 #define CROP_SCREENS  1
 
+#define C64XRES 320
+#define C64YRES 200
+
+typedef struct A64Context {
+    /* variables for multicolor modes */
+    AVLFG randctx;
+    int mc_lifetime;
+    int mc_use_5col;
+    unsigned mc_frame_counter;
+    int *mc_meta_charset;
+    int *mc_charmap;
+    int *mc_best_cb;
+    int mc_luma_vals[5];
+    uint8_t *mc_charset;
+    uint8_t *mc_colram;
+    uint8_t *mc_palette;
+    int mc_pal_size;
+
+    /* pts of the next packet that will be output */
+    int64_t next_pts;
+} A64Context;
+
 /* gray gradient */
 static const int mc_colors[5]={0x0,0xb,0xc,0xf,0x1};
 
@@ -58,9 +80,13 @@ static void to_meta_with_crop(AVCodecContext *avctx,
             for (y = blocky; y < blocky + 8 && y < C64YRES; y++) {
                 for (x = blockx; x < blockx + 8 && x < C64XRES; x += 2) {
                     if(x < width && y < height) {
-                        /* build average over 2 pixels */
-                        luma = (src[(x + 0 + y * p->linesize[0])] +
-                                src[(x + 1 + y * p->linesize[0])]) / 2;
+                        if (x + 1 < width) {
+                            /* build average over 2 pixels */
+                            luma = (src[(x + 0 + y * p->linesize[0])] +
+                                    src[(x + 1 + y * p->linesize[0])]) / 2;
+                        } else {
+                            luma = src[(x + y * p->linesize[0])];
+                        }
                         /* write blocks as linear data now so they are suitable for elbg */
                         dest[0] = luma;
                     }
@@ -166,11 +192,11 @@ static void render_charset(AVCodecContext *avctx, uint8_t *charset,
 static av_cold int a64multi_close_encoder(AVCodecContext *avctx)
 {
     A64Context *c = avctx->priv_data;
-    av_free(c->mc_meta_charset);
-    av_free(c->mc_best_cb);
-    av_free(c->mc_charset);
-    av_free(c->mc_charmap);
-    av_free(c->mc_colram);
+    av_freep(&c->mc_meta_charset);
+    av_freep(&c->mc_best_cb);
+    av_freep(&c->mc_charset);
+    av_freep(&c->mc_charmap);
+    av_freep(&c->mc_colram);
     return 0;
 }
 
@@ -199,9 +225,9 @@ static av_cold int a64multi_encode_init(AVCodecContext *avctx)
                            a64_palette[mc_colors[a]][2] * 0.11;
     }
 
-    if (!(c->mc_meta_charset = av_malloc(32000 * c->mc_lifetime * sizeof(int))) ||
+    if (!(c->mc_meta_charset = av_mallocz_array(c->mc_lifetime, 32000 * sizeof(int))) ||
        !(c->mc_best_cb       = av_malloc(CHARSET_CHARS * 32 * sizeof(int)))     ||
-       !(c->mc_charmap       = av_mallocz(1000 * c->mc_lifetime * sizeof(int))) ||
+       !(c->mc_charmap       = av_mallocz_array(c->mc_lifetime, 1000 * sizeof(int))) ||
        !(c->mc_colram        = av_mallocz(CHARSET_CHARS * sizeof(uint8_t)))     ||
        !(c->mc_charset       = av_malloc(0x800 * (INTERLACED+1) * sizeof(uint8_t)))) {
         av_log(avctx, AV_LOG_ERROR, "Failed to allocate buffer memory.\n");
@@ -217,12 +243,6 @@ static av_cold int a64multi_encode_init(AVCodecContext *avctx)
     AV_WB32(avctx->extradata, c->mc_lifetime);
     AV_WB32(avctx->extradata + 16, INTERLACED);
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
     if (!avctx->codec_tag)
          avctx->codec_tag = AV_RL32("a64m");
 
@@ -247,7 +267,7 @@ static void a64_compress_colram(unsigned char *buf, int *charmap, uint8_t *colra
 }
 
 static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                                 const AVFrame *pict, int *got_packet)
+                                 const AVFrame *p, int *got_packet)
 {
     A64Context *c = avctx->priv_data;
 
@@ -257,7 +277,7 @@ static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int b_width;
 
     int req_size, ret;
-    uint8_t *buf;
+    uint8_t *buf = NULL;
 
     int *charmap     = c->mc_charmap;
     uint8_t *colram  = c->mc_colram;
@@ -280,7 +300,7 @@ static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
 
     /* no data, means end encoding asap */
-    if (!pict) {
+    if (!p) {
         /* all done, end encoding */
         if (!c->mc_lifetime) return 0;
         /* no more frames in queue, prepare to flush remaining frames */
@@ -293,16 +313,10 @@ static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     } else {
         /* fill up mc_meta_charset with data until lifetime exceeds */
         if (c->mc_frame_counter < c->mc_lifetime) {
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-            avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-            to_meta_with_crop(avctx, pict, meta + 32000 * c->mc_frame_counter);
+            to_meta_with_crop(avctx, p, meta + 32000 * c->mc_frame_counter);
             c->mc_frame_counter++;
             if (c->next_pts == AV_NOPTS_VALUE)
-                c->next_pts = pict->pts;
+                c->next_pts = p->pts;
             /* lifetime is not reached so wait for next frame first */
             return 0;
         }
@@ -313,19 +327,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
         req_size = 0;
         /* any frames to encode? */
         if (c->mc_lifetime) {
-            req_size = charset_size + c->mc_lifetime*(screen_size + colram_size);
-            if ((ret = ff_alloc_packet(pkt, req_size)) < 0) {
-                av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", req_size);
+            int alloc_size = charset_size + c->mc_lifetime*(screen_size + colram_size);
+            if ((ret = ff_alloc_packet2(avctx, pkt, alloc_size, 0)) < 0)
                 return ret;
-            }
             buf = pkt->data;
 
             /* calc optimal new charset + charmaps */
-            ret = ff_init_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb,
+            ret = avpriv_init_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb,
                                CHARSET_CHARS, 50, charmap, &c->randctx);
             if (ret < 0)
                 return ret;
-            ret = ff_do_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb,
+            ret = avpriv_do_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb,
                              CHARSET_CHARS, 50, charmap, &c->randctx);
             if (ret < 0)
                 return ret;
@@ -338,7 +350,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             /* advance pointers */
             buf      += charset_size;
-            charset  += charset_size;
+            req_size += charset_size;
         }
 
         /* write x frames to buf */
@@ -375,6 +387,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         pkt->pts = pkt->dts = c->next_pts;
         c->next_pts         = AV_NOPTS_VALUE;
 
+        av_assert0(pkt->size >= req_size);
         pkt->size   = req_size;
         pkt->flags |= AV_PKT_FLAG_KEY;
         *got_packet = !!req_size;
@@ -382,6 +395,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
+#if CONFIG_A64MULTI_ENCODER
 AVCodec ff_a64multi_encoder = {
     .name           = "a64multi",
     .long_name      = NULL_IF_CONFIG_SMALL("Multicolor charset for Commodore 64"),
@@ -394,7 +408,8 @@ AVCodec ff_a64multi_encoder = {
     .pix_fmts       = (const enum AVPixelFormat[]) {AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE},
     .capabilities   = AV_CODEC_CAP_DELAY,
 };
-
+#endif
+#if CONFIG_A64MULTI5_ENCODER
 AVCodec ff_a64multi5_encoder = {
     .name           = "a64multi5",
     .long_name      = NULL_IF_CONFIG_SMALL("Multicolor charset for Commodore 64, extended with 5th color (colram)"),
@@ -407,3 +422,4 @@ AVCodec ff_a64multi5_encoder = {
     .pix_fmts       = (const enum AVPixelFormat[]) {AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE},
     .capabilities   = AV_CODEC_CAP_DELAY,
 };
+#endif
diff --git a/libavcodec/a64tables.h b/libavcodec/a64tables.h
index b95c5ce754..a955ef4caa 100644
--- a/libavcodec/a64tables.h
+++ b/libavcodec/a64tables.h
@@ -2,20 +2,20 @@
  * a64 video encoder - tables used by a64 encoders
  * Copyright (c) 2009 Tobias Bindhammer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aac.h b/libavcodec/aac.h
index fed6bf4214..17af49c766 100644
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
  * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,9 +30,14 @@
 #ifndef AVCODEC_AAC_H
 #define AVCODEC_AAC_H
 
+
+#include "aac_defines.h"
 #include "libavutil/float_dsp.h"
+#include "libavutil/fixed_dsp.h"
 #include "avcodec.h"
+#if !USE_FIXED
 #include "imdct15.h"
+#endif
 #include "fft.h"
 #include "mpeg4audio.h"
 #include "sbr.h"
@@ -45,6 +50,8 @@
 #define TNS_MAX_ORDER 20
 #define MAX_LTP_LONG_SFB 40
 
+#define CLIP_AVOIDANCE_FACTOR 0.95f
+
 enum RawDataBlockType {
     TYPE_SCE,
     TYPE_CPE,
@@ -76,12 +83,13 @@ enum BandType {
     ZERO_BT        = 0,     ///< Scalefactors and spectral data are all zero.
     FIRST_PAIR_BT  = 5,     ///< This and later band types encode two values (rather than four) with one code word.
     ESC_BT         = 11,    ///< Spectral data are coded with an escape sequence.
+    RESERVED_BT    = 12,    ///< Band types following are encoded differently from others.
     NOISE_BT       = 13,    ///< Spectral data are scaled white noise not coded in the bitstream.
-    INTENSITY_BT2  = 14,    ///< Scalefactor data are intensity stereo positions.
-    INTENSITY_BT   = 15,    ///< Scalefactor data are intensity stereo positions.
+    INTENSITY_BT2  = 14,    ///< Scalefactor data are intensity stereo positions (out of phase).
+    INTENSITY_BT   = 15,    ///< Scalefactor data are intensity stereo positions (in phase).
 };
 
-#define IS_CODEBOOK_UNSIGNED(x) ((x - 1) & 10)
+#define IS_CODEBOOK_UNSIGNED(x) (((x) - 1) & 10)
 
 enum ChannelPosition {
     AAC_CHANNEL_OFF   = 0,
@@ -125,12 +133,14 @@ typedef struct OutputConfiguration {
  * Predictor State
  */
 typedef struct PredictorState {
-    float cor0;
-    float cor1;
-    float var0;
-    float var1;
-    float r0;
-    float r1;
+    AAC_FLOAT cor0;
+    AAC_FLOAT cor1;
+    AAC_FLOAT var0;
+    AAC_FLOAT var1;
+    AAC_FLOAT r0;
+    AAC_FLOAT r1;
+    AAC_FLOAT k1;
+    AAC_FLOAT x_est;
 } PredictorState;
 
 #define MAX_PREDICTORS 672
@@ -141,13 +151,17 @@ typedef struct PredictorState {
 #define SCALE_MAX_DIFF   60    ///< maximum scalefactor difference allowed by standard
 #define SCALE_DIFF_ZERO  60    ///< codebook index corresponding to zero scalefactor indices difference
 
+#define NOISE_PRE       256    ///< preamble for NOISE_BT, put in bitstream with the first noise band
+#define NOISE_PRE_BITS    9    ///< length of preamble
+#define NOISE_OFFSET     90    ///< subtracted from global gain, used as offset for the preamble
+
 /**
  * Long Term Prediction
  */
 typedef struct LongTermPrediction {
     int8_t present;
     int16_t lag;
-    float coef;
+    INTFLOAT coef;
     int8_t used[MAX_LTP_LONG_SFB];
 } LongTermPrediction;
 
@@ -169,7 +183,10 @@ typedef struct IndividualChannelStream {
     int predictor_present;
     int predictor_initialized;
     int predictor_reset_group;
+    int predictor_reset_count[31];  ///< used by encoder to count prediction resets
     uint8_t prediction_used[41];
+    uint8_t window_clipping[8]; ///< set if a certain window is near clipping
+    float clip_avoidance_factor; ///< set if any window is near clipping to the necessary atennuation factor to avoid it
 } IndividualChannelStream;
 
 /**
@@ -181,7 +198,8 @@ typedef struct TemporalNoiseShaping {
     int length[8][4];
     int direction[8][4];
     int order[8][4];
-    float coef[8][4][TNS_MAX_ORDER];
+    int coef_idx[8][4][TNS_MAX_ORDER];
+    INTFLOAT coef[8][4][TNS_MAX_ORDER];
 } TemporalNoiseShaping;
 
 /**
@@ -218,7 +236,7 @@ typedef struct ChannelCoupling {
     int ch_select[8];      /**< [0] shared list of gains; [1] list of gains for right channel;
                             *   [2] list of gains for left channel; [3] lists of gains for both channels
                             */
-    float gain[16][120];
+    INTFLOAT gain[16][120];
 } ChannelCoupling;
 
 /**
@@ -229,26 +247,34 @@ typedef struct SingleChannelElement {
     TemporalNoiseShaping tns;
     Pulse pulse;
     enum BandType band_type[128];                   ///< band types
+    enum BandType band_alt[128];                    ///< alternative band type (used by encoder)
     int band_type_run_end[120];                     ///< band type run end points
-    float sf[120];                                  ///< scalefactors
+    INTFLOAT sf[120];                               ///< scalefactors
     int sf_idx[128];                                ///< scalefactor indices (used by encoder)
     uint8_t zeroes[128];                            ///< band is not coded (used by encoder)
-    DECLARE_ALIGNED(32, float,   coeffs)[1024];     ///< coefficients for IMDCT
-    DECLARE_ALIGNED(32, float,   saved)[1536];      ///< overlap
-    DECLARE_ALIGNED(32, float,   ret_buf)[2048];    ///< PCM output buffer
-    DECLARE_ALIGNED(16, float,   ltp_state)[3072];  ///< time signal for LTP
+    float  is_ener[128];                            ///< Intensity stereo pos (used by encoder)
+    float pns_ener[128];                            ///< Noise energy values (used by encoder)
+    DECLARE_ALIGNED(32, INTFLOAT, pcoeffs)[1024];   ///< coefficients for IMDCT, pristine
+    DECLARE_ALIGNED(32, INTFLOAT, coeffs)[1024];    ///< coefficients for IMDCT, maybe processed
+    DECLARE_ALIGNED(32, INTFLOAT, saved)[1536];     ///< overlap
+    DECLARE_ALIGNED(32, INTFLOAT, ret_buf)[2048];   ///< PCM output buffer
+    DECLARE_ALIGNED(16, INTFLOAT, ltp_state)[3072]; ///< time signal for LTP
+    DECLARE_ALIGNED(32, AAC_FLOAT, prcoeffs)[1024]; ///< Main prediction coefs (used by encoder)
     PredictorState predictor_state[MAX_PREDICTORS];
-    float *ret;                                     ///< PCM output
+    INTFLOAT *ret;                                  ///< PCM output
 } SingleChannelElement;
 
 /**
  * channel element - generic struct for SCE/CPE/CCE/LFE
  */
 typedef struct ChannelElement {
+    int present;
     // CPE specific
     int common_window;        ///< Set if channels share a common 'IndividualChannelStream' in bitstream.
     int     ms_mode;          ///< Signals mid/side stereo flags coding mode (used by encoder)
+    uint8_t is_mode;          ///< Set if any bands have been encoded using intensity stereo (used by encoder)
     uint8_t ms_mask[128];     ///< Set if mid/side stereo is used for each scalefactor window band
+    uint8_t is_mask[128];     ///< Set if intensity stereo is used (used by encoder)
     // shared
     SingleChannelElement ch[2];
     // CCE specific
@@ -259,7 +285,8 @@ typedef struct ChannelElement {
 /**
  * main AAC context
  */
-typedef struct AACContext {
+struct AACContext {
+    AVClass        *class;
     AVCodecContext *avctx;
     AVFrame *frame;
 
@@ -273,6 +300,7 @@ typedef struct AACContext {
     ChannelElement          *che[4][MAX_ELEM_ID];
     ChannelElement  *tag_che_map[4][MAX_ELEM_ID];
     int tags_mapped;
+    int warned_remapping_once;
     /** @} */
 
     /**
@@ -280,7 +308,7 @@ typedef struct AACContext {
      * (We do not want to have these on the stack.)
      * @{
      */
-    DECLARE_ALIGNED(32, float, buf_mdct)[1024];
+    DECLARE_ALIGNED(32, INTFLOAT, buf_mdct)[1024];
     /** @} */
 
     /**
@@ -291,8 +319,12 @@ typedef struct AACContext {
     FFTContext mdct_small;
     FFTContext mdct_ld;
     FFTContext mdct_ltp;
+#if USE_FIXED
+    AVFixedDSPContext *fdsp;
+#else
     IMDCT15Context *mdct480;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
+#endif /* USE_FIXED */
     int random_state;
     /** @} */
 
@@ -303,9 +335,33 @@ typedef struct AACContext {
     SingleChannelElement *output_element[MAX_CHANNELS]; ///< Points to each SingleChannelElement
     /** @} */
 
-    DECLARE_ALIGNED(32, float, temp)[128];
+
+    /**
+     * @name Japanese DTV specific extension
+     * @{
+     */
+    int force_dmono_mode;///< 0->not dmono, 1->use first channel, 2->use second channel
+    int dmono_mode;      ///< 0->not dmono, 1->use first channel, 2->use second channel
+    /** @} */
+
+    DECLARE_ALIGNED(32, INTFLOAT, temp)[128];
 
     OutputConfiguration oc[2];
-} AACContext;
+    int warned_num_aac_frames;
+
+    /* aacdec functions pointers */
+    void (*imdct_and_windowing)(AACContext *ac, SingleChannelElement *sce);
+    void (*apply_ltp)(AACContext *ac, SingleChannelElement *sce);
+    void (*apply_tns)(INTFLOAT coef[1024], TemporalNoiseShaping *tns,
+                      IndividualChannelStream *ics, int decode);
+    void (*windowing_and_mdct_ltp)(AACContext *ac, INTFLOAT *out,
+                                   INTFLOAT *in, IndividualChannelStream *ics);
+    void (*update_ltp)(AACContext *ac, SingleChannelElement *sce);
+    void (*vector_pow43)(int *coefs, int len);
+    void (*subband_scale)(int *dst, int *src, int scale, int offset, int len);
+
+};
+
+void ff_aacdec_init_mips(AACContext *c);
 
 #endif /* AVCODEC_AAC_H */
diff --git a/libavcodec/aac_ac3_parser.c b/libavcodec/aac_ac3_parser.c
index 806a826ea0..2f7d56807c 100644
--- a/libavcodec/aac_ac3_parser.c
+++ b/libavcodec/aac_ac3_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aac_ac3_parser.h b/libavcodec/aac_ac3_parser.h
index 99286f0711..c2506a5bfd 100644
--- a/libavcodec/aac_ac3_parser.h
+++ b/libavcodec/aac_ac3_parser.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aac_adtstoasc_bsf.c b/libavcodec/aac_adtstoasc_bsf.c
index d3cbeae9d0..9c117c6005 100644
--- a/libavcodec/aac_adtstoasc_bsf.c
+++ b/libavcodec/aac_adtstoasc_bsf.c
@@ -2,20 +2,20 @@
  * MPEG-2/4 AAC ADTS to MPEG-4 Audio Specific Configuration bitstream filter
  * Copyright (c) 2009 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -87,10 +87,13 @@ static int aac_adtstoasc_filter(AVBitStreamFilterContext *bsfc,
             buf_size -= get_bits_count(&gb)/8;
             buf      += get_bits_count(&gb)/8;
         }
+        av_free(avctx->extradata);
         avctx->extradata_size = 2 + pce_size;
         avctx->extradata = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
-        if (!avctx->extradata)
+        if (!avctx->extradata) {
+            avctx->extradata_size = 0;
             return AVERROR(ENOMEM);
+        }
 
         init_put_bits(&pb, avctx->extradata, avctx->extradata_size);
         put_bits(&pb, 5, hdr.object_type);
@@ -114,7 +117,7 @@ static int aac_adtstoasc_filter(AVBitStreamFilterContext *bsfc,
 }
 
 AVBitStreamFilter ff_aac_adtstoasc_bsf = {
-    "aac_adtstoasc",
-    sizeof(AACBSFContext),
-    aac_adtstoasc_filter,
+    .name           = "aac_adtstoasc",
+    .priv_data_size = sizeof(AACBSFContext),
+    .filter         = aac_adtstoasc_filter,
 };
diff --git a/libavcodec/aac_defines.h b/libavcodec/aac_defines.h
new file mode 100644
index 0000000000..3c45742ee1
--- /dev/null
+++ b/libavcodec/aac_defines.h
@@ -0,0 +1,114 @@
+/*
+ * AAC defines
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AAC_DEFINES_H
+#define AVCODEC_AAC_DEFINES_H
+
+#ifndef USE_FIXED
+#define USE_FIXED 0
+#endif
+
+#if USE_FIXED
+
+#include "libavutil/softfloat.h"
+
+#define FFT_FLOAT    0
+#define FFT_FIXED_32 1
+
+#define AAC_RENAME(x)       x ## _fixed
+#define AAC_RENAME_32(x)    x ## _fixed_32
+#define INTFLOAT int
+#define INT64FLOAT          int64_t
+#define SHORTFLOAT int16_t
+#define AAC_FLOAT SoftFloat
+#define AAC_SIGNE           int
+#define FIXR(a)             ((int)((a) * 1 + 0.5))
+#define FIXR10(a)           ((int)((a) * 1024.0 + 0.5))
+#define Q23(a)              (int)((a) * 8388608.0 + 0.5)
+#define Q30(x)              (int)((x)*1073741824.0 + 0.5)
+#define Q31(x)              (int)((x)*2147483648.0 + 0.5)
+#define RANGE15(x)          x
+#define GET_GAIN(x, y)      (-(y) << (x)) + 1024
+#define AAC_MUL16(x, y)     (int)(((int64_t)(x) * (y) + 0x8000) >> 16)
+#define AAC_MUL26(x, y)     (int)(((int64_t)(x) * (y) + 0x2000000) >> 26)
+#define AAC_MUL30(x, y)     (int)(((int64_t)(x) * (y) + 0x20000000) >> 30)
+#define AAC_MUL31(x, y)     (int)(((int64_t)(x) * (y) + 0x40000000) >> 31)
+#define AAC_MADD28(x, y, a, b) (int)((((int64_t)(x) * (y)) + \
+                                      ((int64_t)(a) * (b)) + \
+                                        0x8000000) >> 28)
+#define AAC_MADD30(x, y, a, b) (int)((((int64_t)(x) * (y)) + \
+                                      ((int64_t)(a) * (b)) + \
+                                        0x20000000) >> 30)
+#define AAC_MADD30_V8(x, y, a, b, c, d, e, f) (int)((((int64_t)(x) * (y)) + \
+                                                     ((int64_t)(a) * (b)) + \
+                                                     ((int64_t)(c) * (d)) + \
+                                                     ((int64_t)(e) * (f)) + \
+                                                       0x20000000) >> 30)
+#define AAC_MSUB30(x, y, a, b) (int)((((int64_t)(x) * (y)) - \
+                                      ((int64_t)(a) * (b)) + \
+                                        0x20000000) >> 30)
+#define AAC_MSUB30_V8(x, y, a, b, c, d, e, f) (int)((((int64_t)(x) * (y)) + \
+                                                     ((int64_t)(a) * (b)) - \
+                                                     ((int64_t)(c) * (d)) - \
+                                                     ((int64_t)(e) * (f)) + \
+                                                       0x20000000) >> 30)
+#define AAC_MSUB31_V3(x, y, z)    (int)((((int64_t)(x) * (z)) - \
+                                      ((int64_t)(y) * (z)) + \
+                                        0x40000000) >> 31)
+#define AAC_HALF_SUM(x, y)  (x) >> 1 + (y) >> 1
+#define AAC_SRA_R(x, y)     (int)(((x) + (1 << ((y) - 1))) >> (y))
+
+#else
+
+#define FFT_FLOAT    1
+#define FFT_FIXED_32 0
+
+#define AAC_RENAME(x)       x
+#define AAC_RENAME_32(x)    x
+#define INTFLOAT float
+#define INT64FLOAT          float
+#define SHORTFLOAT float
+#define AAC_FLOAT float
+#define AAC_SIGNE           unsigned
+#define FIXR(x)             ((float)(x))
+#define FIXR10(x)           ((float)(x))
+#define Q23(x)              x
+#define Q30(x)              x
+#define Q31(x)              x
+#define RANGE15(x)          (32768.0 * (x))
+#define GET_GAIN(x, y)      powf((x), -(y))
+#define AAC_MUL16(x, y)     ((x) * (y))
+#define AAC_MUL26(x, y)     ((x) * (y))
+#define AAC_MUL30(x, y)     ((x) * (y))
+#define AAC_MUL31(x, y)     ((x) * (y))
+#define AAC_MADD28(x, y, a, b) ((x) * (y) + (a) * (b))
+#define AAC_MADD30(x, y, a, b) ((x) * (y) + (a) * (b))
+#define AAC_MADD30_V8(x, y, a, b, c, d, e, f) ((x) * (y) + (a) * (b) + \
+                                               (c) * (d) + (e) * (f))
+#define AAC_MSUB30(x, y, a, b) ((x) * (y) - (a) * (b))
+#define AAC_MSUB30_V8(x, y, a, b, c, d, e, f) ((x) * (y) + (a) * (b) - \
+                                               (c) * (d) - (e) * (f))
+#define AAC_MSUB31_V3(x, y, z)    ((x) - (y)) * (z)
+#define AAC_HALF_SUM(x, y)  ((x) + (y)) * 0.5f
+#define AAC_SRA_R(x, y)     (x)
+
+#endif /* USE_FIXED */
+
+#endif /* AVCODEC_AAC_DEFINES_H */
diff --git a/libavcodec/aac_parser.c b/libavcodec/aac_parser.c
index eae120a249..0b868edcb2 100644
--- a/libavcodec/aac_parser.c
+++ b/libavcodec/aac_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aac_tablegen.c b/libavcodec/aac_tablegen.c
index b2c6c954e0..2d13211d45 100644
--- a/libavcodec/aac_tablegen.c
+++ b/libavcodec/aac_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,5 +33,7 @@ int main(void)
 
     WRITE_ARRAY("const", float, ff_aac_pow2sf_tab);
 
+    WRITE_ARRAY("const", float, ff_aac_pow34sf_tab);
+
     return 0;
 }
diff --git a/libavcodec/aac_tablegen.h b/libavcodec/aac_tablegen.h
index 8a05ec5973..8b223f972e 100644
--- a/libavcodec/aac_tablegen.h
+++ b/libavcodec/aac_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,12 +30,15 @@
 #else
 #include "libavutil/mathematics.h"
 float ff_aac_pow2sf_tab[428];
+float ff_aac_pow34sf_tab[428];
 
-void ff_aac_tableinit(void)
+av_cold void ff_aac_tableinit(void)
 {
     int i;
-    for (i = 0; i < 428; i++)
+    for (i = 0; i < 428; i++) {
         ff_aac_pow2sf_tab[i] = pow(2, (i - POW_SF2_ZERO) / 4.0);
+        ff_aac_pow34sf_tab[i] = pow(ff_aac_pow2sf_tab[i], 3.0/4.0);
+    }
 }
 #endif /* CONFIG_HARDCODED_TABLES */
 
diff --git a/libavcodec/aac_tablegen_decl.h b/libavcodec/aac_tablegen_decl.h
index a5fd1cf345..ef86f85e1f 100644
--- a/libavcodec/aac_tablegen_decl.h
+++ b/libavcodec/aac_tablegen_decl.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,11 @@
 #if CONFIG_HARDCODED_TABLES
 #define ff_aac_tableinit()
 extern const float ff_aac_pow2sf_tab[428];
+extern const float ff_aac_pow34sf_tab[428];
 #else
 void ff_aac_tableinit(void);
 extern       float ff_aac_pow2sf_tab[428];
+extern       float ff_aac_pow34sf_tab[428];
 #endif /* CONFIG_HARDCODED_TABLES */
 
 #endif /* AVCODEC_AAC_TABLEGEN_DECL_H */
diff --git a/libavcodec/aacadtsdec.c b/libavcodec/aacadtsdec.c
index 2994bce243..d0814ac27e 100644
--- a/libavcodec/aacadtsdec.c
+++ b/libavcodec/aacadtsdec.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2009 Alex Converse
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aacadtsdec.h b/libavcodec/aacadtsdec.h
index 6319efcea3..d0584ef36a 100644
--- a/libavcodec/aacadtsdec.h
+++ b/libavcodec/aacadtsdec.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c
index ee89148ef4..10ea14b141 100644
--- a/libavcodec/aaccoder.c
+++ b/libavcodec/aaccoder.c
@@ -2,20 +2,20 @@
  * AAC coefficients encoder
  * Copyright (C) 2008-2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,263 +39,29 @@
 #include "aac.h"
 #include "aacenc.h"
 #include "aactab.h"
+#include "aacenctab.h"
+#include "aacenc_utils.h"
+#include "aacenc_quantization.h"
+#include "aac_tablegen_decl.h"
 
-/** bits needed to code codebook run value for long windows */
-static const uint8_t run_value_bits_long[64] = {
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
-    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
-};
-
-/** bits needed to code codebook run value for short windows */
-static const uint8_t run_value_bits_short[16] = {
-    3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
-};
-
-static const uint8_t *run_value_bits[2] = {
-    run_value_bits_long, run_value_bits_short
-};
-
-
-/**
- * Quantize one coefficient.
- * @return absolute value of the quantized coefficient
- * @see 3GPP TS26.403 5.6.2 "Scalefactor determination"
- */
-static av_always_inline int quant(float coef, const float Q)
-{
-    float a = coef * Q;
-    return sqrtf(a * sqrtf(a)) + 0.4054;
-}
-
-static void quantize_bands(int *out, const float *in, const float *scaled,
-                           int size, float Q34, int is_signed, int maxval)
-{
-    int i;
-    double qc;
-    for (i = 0; i < size; i++) {
-        qc = scaled[i] * Q34;
-        out[i] = (int)FFMIN(qc + 0.4054, (double)maxval);
-        if (is_signed && in[i] < 0.0f) {
-            out[i] = -out[i];
-        }
-    }
-}
-
-static void abs_pow34_v(float *out, const float *in, const int size)
-{
-#ifndef USE_REALLY_FULL_SEARCH
-    int i;
-    for (i = 0; i < size; i++) {
-        float a = fabsf(in[i]);
-        out[i] = sqrtf(a * sqrtf(a));
-    }
-#endif /* USE_REALLY_FULL_SEARCH */
-}
-
-static const uint8_t aac_cb_range [12] = {0, 3, 3, 3, 3, 9, 9, 8, 8, 13, 13, 17};
-static const uint8_t aac_cb_maxval[12] = {0, 1, 1, 2, 2, 4, 4, 7, 7, 12, 12, 16};
-
-/**
- * Calculate rate distortion cost for quantizing with given codebook
- *
- * @return quantization distortion
- */
-static av_always_inline float quantize_and_encode_band_cost_template(
-                                struct AACEncContext *s,
-                                PutBitContext *pb, const float *in,
-                                const float *scaled, int size, int scale_idx,
-                                int cb, const float lambda, const float uplim,
-                                int *bits, int BT_ZERO, int BT_UNSIGNED,
-                                int BT_PAIR, int BT_ESC)
-{
-    const int q_idx = POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512;
-    const float Q   = ff_aac_pow2sf_tab [q_idx];
-    const float Q34 = ff_aac_pow34sf_tab[q_idx];
-    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
-    const float CLIPPED_ESCAPE = 165140.0f*IQ;
-    int i, j;
-    float cost = 0;
-    const int dim = BT_PAIR ? 2 : 4;
-    int resbits = 0;
-    const int range  = aac_cb_range[cb];
-    const int maxval = aac_cb_maxval[cb];
-    int off;
-
-    if (BT_ZERO) {
-        for (i = 0; i < size; i++)
-            cost += in[i]*in[i];
-        if (bits)
-            *bits = 0;
-        return cost * lambda;
-    }
-    if (!scaled) {
-        abs_pow34_v(s->scoefs, in, size);
-        scaled = s->scoefs;
-    }
-    quantize_bands(s->qcoefs, in, scaled, size, Q34, !BT_UNSIGNED, maxval);
-    if (BT_UNSIGNED) {
-        off = 0;
-    } else {
-        off = maxval;
-    }
-    for (i = 0; i < size; i += dim) {
-        const float *vec;
-        int *quants = s->qcoefs + i;
-        int curidx = 0;
-        int curbits;
-        float rd = 0.0f;
-        for (j = 0; j < dim; j++) {
-            curidx *= range;
-            curidx += quants[j] + off;
-        }
-        curbits =  ff_aac_spectral_bits[cb-1][curidx];
-        vec     = &ff_aac_codebook_vectors[cb-1][curidx*dim];
-        if (BT_UNSIGNED) {
-            for (j = 0; j < dim; j++) {
-                float t = fabsf(in[i+j]);
-                float di;
-                if (BT_ESC && vec[j] == 64.0f) { //FIXME: slow
-                    if (t >= CLIPPED_ESCAPE) {
-                        di = t - CLIPPED_ESCAPE;
-                        curbits += 21;
-                    } else {
-                        int c = av_clip_uintp2(quant(t, Q), 13);
-                        di = t - c*cbrtf(c)*IQ;
-                        curbits += av_log2(c)*2 - 4 + 1;
-                    }
-                } else {
-                    di = t - vec[j]*IQ;
-                }
-                if (vec[j] != 0.0f)
-                    curbits++;
-                rd += di*di;
-            }
-        } else {
-            for (j = 0; j < dim; j++) {
-                float di = in[i+j] - vec[j]*IQ;
-                rd += di*di;
-            }
-        }
-        cost    += rd * lambda + curbits;
-        resbits += curbits;
-        if (cost >= uplim)
-            return uplim;
-        if (pb) {
-            put_bits(pb, ff_aac_spectral_bits[cb-1][curidx], ff_aac_spectral_codes[cb-1][curidx]);
-            if (BT_UNSIGNED)
-                for (j = 0; j < dim; j++)
-                    if (ff_aac_codebook_vectors[cb-1][curidx*dim+j] != 0.0f)
-                        put_bits(pb, 1, in[i+j] < 0.0f);
-            if (BT_ESC) {
-                for (j = 0; j < 2; j++) {
-                    if (ff_aac_codebook_vectors[cb-1][curidx*2+j] == 64.0f) {
-                        int coef = av_clip_uintp2(quant(fabsf(in[i+j]), Q), 13);
-                        int len = av_log2(coef);
+#include "aacenc_is.h"
+#include "aacenc_tns.h"
+#include "aacenc_pred.h"
 
-                        put_bits(pb, len - 4 + 1, (1 << (len - 4 + 1)) - 2);
-                        put_bits(pb, len, coef & ((1 << len) - 1));
-                    }
-                }
-            }
-        }
-    }
-
-    if (bits)
-        *bits = resbits;
-    return cost;
-}
-
-#define QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NAME, BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC) \
-static float quantize_and_encode_band_cost_ ## NAME(                                        \
-                                struct AACEncContext *s,                                \
-                                PutBitContext *pb, const float *in,                     \
-                                const float *scaled, int size, int scale_idx,           \
-                                int cb, const float lambda, const float uplim,          \
-                                int *bits) {                                            \
-    return quantize_and_encode_band_cost_template(                                      \
-                                s, pb, in, scaled, size, scale_idx,                     \
-                                BT_ESC ? ESC_BT : cb, lambda, uplim, bits,              \
-                                BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC);                 \
-}
-
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ZERO,  1, 0, 0, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SQUAD, 0, 0, 0, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UQUAD, 0, 1, 0, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SPAIR, 0, 0, 1, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UPAIR, 0, 1, 1, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC,   0, 1, 1, 1)
-
-static float (*const quantize_and_encode_band_cost_arr[])(
-                                struct AACEncContext *s,
-                                PutBitContext *pb, const float *in,
-                                const float *scaled, int size, int scale_idx,
-                                int cb, const float lambda, const float uplim,
-                                int *bits) = {
-    quantize_and_encode_band_cost_ZERO,
-    quantize_and_encode_band_cost_SQUAD,
-    quantize_and_encode_band_cost_SQUAD,
-    quantize_and_encode_band_cost_UQUAD,
-    quantize_and_encode_band_cost_UQUAD,
-    quantize_and_encode_band_cost_SPAIR,
-    quantize_and_encode_band_cost_SPAIR,
-    quantize_and_encode_band_cost_UPAIR,
-    quantize_and_encode_band_cost_UPAIR,
-    quantize_and_encode_band_cost_UPAIR,
-    quantize_and_encode_band_cost_UPAIR,
-    quantize_and_encode_band_cost_ESC,
-};
+#include "libavcodec/aaccoder_twoloop.h"
 
-#define quantize_and_encode_band_cost(                                  \
-                                s, pb, in, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)                    \
-    quantize_and_encode_band_cost_arr[cb](                              \
-                                s, pb, in, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)
+/** Frequency in Hz for lower limit of noise substitution **/
+#define NOISE_LOW_LIMIT 4000
 
-static float quantize_band_cost(struct AACEncContext *s, const float *in,
-                                const float *scaled, int size, int scale_idx,
-                                int cb, const float lambda, const float uplim,
-                                int *bits)
-{
-    return quantize_and_encode_band_cost(s, NULL, in, scaled, size, scale_idx,
-                                         cb, lambda, uplim, bits);
-}
+/* Parameter of f(x) = a*(lambda/100), defines the maximum fourier spread
+ * beyond which no PNS is used (since the SFBs contain tone rather than noise) */
+#define NOISE_SPREAD_THRESHOLD 0.5073f
 
-static void quantize_and_encode_band(struct AACEncContext *s, PutBitContext *pb,
-                                     const float *in, int size, int scale_idx,
-                                     int cb, const float lambda)
-{
-    quantize_and_encode_band_cost(s, pb, in, NULL, size, scale_idx, cb, lambda,
-                                  INFINITY, NULL);
-}
-
-static float find_max_val(int group_len, int swb_size, const float *scaled) {
-    float maxval = 0.0f;
-    int w2, i;
-    for (w2 = 0; w2 < group_len; w2++) {
-        for (i = 0; i < swb_size; i++) {
-            maxval = FFMAX(maxval, scaled[w2*128+i]);
-        }
-    }
-    return maxval;
-}
+/* Parameter of f(x) = a*(100/lambda), defines how much PNS is allowed to
+ * replace low energy non zero bands */
+#define NOISE_LAMBDA_REPLACE 1.948f
 
-static int find_min_book(float maxval, int sf) {
-    float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
-    float Q34 = sqrtf(Q * sqrtf(Q));
-    int qmaxval, cb;
-    qmaxval = maxval * Q34 + 0.4054f;
-    if      (qmaxval ==  0) cb = 0;
-    else if (qmaxval ==  1) cb = 1;
-    else if (qmaxval ==  2) cb = 3;
-    else if (qmaxval <=  4) cb = 5;
-    else if (qmaxval <=  7) cb = 7;
-    else if (qmaxval <= 12) cb = 9;
-    else                    cb = 11;
-    return cb;
-}
+#include "libavcodec/aaccoder_trellis.h"
 
 /**
  * structure used in optimal codebook search
@@ -312,7 +78,7 @@ typedef struct BandCodingPath {
 static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce,
                                      int win, int group_len, const float lambda)
 {
-    BandCodingPath path[120][12];
+    BandCodingPath path[120][CB_TOT_ALL];
     int w, swb, cb, start, size;
     int i, j;
     const int max_sfb  = sce->ics.max_sfb;
@@ -325,7 +91,7 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
 
     abs_pow34_v(s->scoefs, sce->coeffs, 1024);
     start = win*128;
-    for (cb = 0; cb < 12; cb++) {
+    for (cb = 0; cb < CB_TOT_ALL; cb++) {
         path[0][cb].cost     = 0.0f;
         path[0][cb].prev_idx = -1;
         path[0][cb].run      = 0;
@@ -333,7 +99,7 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     for (swb = 0; swb < max_sfb; swb++) {
         size = sce->ics.swb_sizes[swb];
         if (sce->zeroes[win*16 + swb]) {
-            for (cb = 0; cb < 12; cb++) {
+            for (cb = 0; cb < CB_TOT_ALL; cb++) {
                 path[swb+1][cb].prev_idx = cb;
                 path[swb+1][cb].cost     = path[swb][cb].cost;
                 path[swb+1][cb].run      = path[swb][cb].run + 1;
@@ -343,15 +109,22 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
             int mincb = next_mincb;
             next_minrd = INFINITY;
             next_mincb = 0;
-            for (cb = 0; cb < 12; cb++) {
+            for (cb = 0; cb < CB_TOT_ALL; cb++) {
                 float cost_stay_here, cost_get_here;
                 float rd = 0.0f;
+                if (cb >= 12 && sce->band_type[win*16+swb] < aac_cb_out_map[cb] ||
+                    cb  < aac_cb_in_map[sce->band_type[win*16+swb]] && sce->band_type[win*16+swb] > aac_cb_out_map[cb]) {
+                    path[swb+1][cb].prev_idx = -1;
+                    path[swb+1][cb].cost     = INFINITY;
+                    path[swb+1][cb].run      = path[swb][cb].run + 1;
+                    continue;
+                }
                 for (w = 0; w < group_len; w++) {
                     FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(win+w)*16+swb];
-                    rd += quantize_band_cost(s, sce->coeffs + start + w*128,
-                                             s->scoefs + start + w*128, size,
-                                             sce->sf_idx[(win+w)*16+swb], cb,
-                                             lambda / band->threshold, INFINITY, NULL);
+                    rd += quantize_band_cost(s, &sce->coeffs[start + w*128],
+                                             &s->scoefs[start + w*128], size,
+                                             sce->sf_idx[(win+w)*16+swb], aac_cb_out_map[cb],
+                                             lambda / band->threshold, INFINITY, NULL, 0);
                 }
                 cost_stay_here = path[swb][cb].cost + rd;
                 cost_get_here  = minrd              + rd + run_bits + 4;
@@ -379,11 +152,12 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     //convert resulting path from backward-linked list
     stack_len = 0;
     idx       = 0;
-    for (cb = 1; cb < 12; cb++)
+    for (cb = 1; cb < CB_TOT_ALL; cb++)
         if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
             idx = cb;
     ppos = max_sfb;
     while (ppos > 0) {
+        av_assert1(idx >= 0);
         cb = idx;
         stackrun[stack_len] = path[ppos][cb].run;
         stackcb [stack_len] = cb;
@@ -394,12 +168,13 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     //perform actual band info encoding
     start = 0;
     for (i = stack_len - 1; i >= 0; i--) {
-        put_bits(&s->pb, 4, stackcb[i]);
+        cb = aac_cb_out_map[stackcb[i]];
+        put_bits(&s->pb, 4, cb);
         count = stackrun[i];
-        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
+        memset(sce->zeroes + win*16 + start, !cb, count);
         //XXX: memset when band_type is also uint8_t
         for (j = 0; j < count; j++) {
-            sce->band_type[win*16 + start] =  stackcb[i];
+            sce->band_type[win*16 + start] = cb;
             start++;
         }
         while (count >= run_esc) {
@@ -410,147 +185,52 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     }
 }
 
-static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
-                                  int win, int group_len, const float lambda)
+
+typedef struct TrellisPath {
+    float cost;
+    int prev;
+} TrellisPath;
+
+#define TRELLIS_STAGES 121
+#define TRELLIS_STATES (SCALE_MAX_DIFF+1)
+
+static void set_special_band_scalefactors(AACEncContext *s, SingleChannelElement *sce)
 {
-    BandCodingPath path[120][12];
-    int w, swb, cb, start, size;
-    int i, j;
-    const int max_sfb  = sce->ics.max_sfb;
-    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
-    const int run_esc  = (1 << run_bits) - 1;
-    int idx, ppos, count;
-    int stackrun[120], stackcb[120], stack_len;
-    float next_minbits = INFINITY;
-    int next_mincb = 0;
+    int w, g, start = 0;
+    int minscaler_n = sce->sf_idx[0], minscaler_i = sce->sf_idx[0];
+    int bands = 0;
 
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-    start = win*128;
-    for (cb = 0; cb < 12; cb++) {
-        path[0][cb].cost     = run_bits+4;
-        path[0][cb].prev_idx = -1;
-        path[0][cb].run      = 0;
-    }
-    for (swb = 0; swb < max_sfb; swb++) {
-        size = sce->ics.swb_sizes[swb];
-        if (sce->zeroes[win*16 + swb]) {
-            float cost_stay_here = path[swb][0].cost;
-            float cost_get_here  = next_minbits + run_bits + 4;
-            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
-                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
-                cost_stay_here += run_bits;
-            if (cost_get_here < cost_stay_here) {
-                path[swb+1][0].prev_idx = next_mincb;
-                path[swb+1][0].cost     = cost_get_here;
-                path[swb+1][0].run      = 1;
-            } else {
-                path[swb+1][0].prev_idx = 0;
-                path[swb+1][0].cost     = cost_stay_here;
-                path[swb+1][0].run      = path[swb][0].run + 1;
-            }
-            next_minbits = path[swb+1][0].cost;
-            next_mincb = 0;
-            for (cb = 1; cb < 12; cb++) {
-                path[swb+1][cb].cost = 61450;
-                path[swb+1][cb].prev_idx = -1;
-                path[swb+1][cb].run = 0;
-            }
-        } else {
-            float minbits = next_minbits;
-            int mincb = next_mincb;
-            int startcb = sce->band_type[win*16+swb];
-            next_minbits = INFINITY;
-            next_mincb = 0;
-            for (cb = 0; cb < startcb; cb++) {
-                path[swb+1][cb].cost = 61450;
-                path[swb+1][cb].prev_idx = -1;
-                path[swb+1][cb].run = 0;
-            }
-            for (cb = startcb; cb < 12; cb++) {
-                float cost_stay_here, cost_get_here;
-                float bits = 0.0f;
-                for (w = 0; w < group_len; w++) {
-                    bits += quantize_band_cost(s, sce->coeffs + start + w*128,
-                                               s->scoefs + start + w*128, size,
-                                               sce->sf_idx[(win+w)*16+swb], cb,
-                                               0, INFINITY, NULL);
-                }
-                cost_stay_here = path[swb][cb].cost + bits;
-                cost_get_here  = minbits            + bits + run_bits + 4;
-                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
-                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
-                    cost_stay_here += run_bits;
-                if (cost_get_here < cost_stay_here) {
-                    path[swb+1][cb].prev_idx = mincb;
-                    path[swb+1][cb].cost     = cost_get_here;
-                    path[swb+1][cb].run      = 1;
-                } else {
-                    path[swb+1][cb].prev_idx = cb;
-                    path[swb+1][cb].cost     = cost_stay_here;
-                    path[swb+1][cb].run      = path[swb][cb].run + 1;
-                }
-                if (path[swb+1][cb].cost < next_minbits) {
-                    next_minbits = path[swb+1][cb].cost;
-                    next_mincb = cb;
-                }
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        start = 0;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (sce->band_type[w*16+g] == INTENSITY_BT || sce->band_type[w*16+g] == INTENSITY_BT2) {
+                sce->sf_idx[w*16+g] = av_clip(roundf(log2f(sce->is_ener[w*16+g])*2), -155, 100);
+                minscaler_i = FFMIN(minscaler_i, sce->sf_idx[w*16+g]);
+                bands++;
+            } else if (sce->band_type[w*16+g] == NOISE_BT) {
+                sce->sf_idx[w*16+g] = av_clip(3+ceilf(log2f(sce->pns_ener[w*16+g])*2), -100, 155);
+                minscaler_n = FFMIN(minscaler_n, sce->sf_idx[w*16+g]);
+                bands++;
             }
+            start += sce->ics.swb_sizes[g];
         }
-        start += sce->ics.swb_sizes[swb];
     }
 
-    //convert resulting path from backward-linked list
-    stack_len = 0;
-    idx       = 0;
-    for (cb = 1; cb < 12; cb++)
-        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
-            idx = cb;
-    ppos = max_sfb;
-    while (ppos > 0) {
-        assert(idx >= 0);
-        cb = idx;
-        stackrun[stack_len] = path[ppos][cb].run;
-        stackcb [stack_len] = cb;
-        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
-        ppos -= path[ppos][cb].run;
-        stack_len++;
-    }
-    //perform actual band info encoding
-    start = 0;
-    for (i = stack_len - 1; i >= 0; i--) {
-        put_bits(&s->pb, 4, stackcb[i]);
-        count = stackrun[i];
-        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
-        //XXX: memset when band_type is also uint8_t
-        for (j = 0; j < count; j++) {
-            sce->band_type[win*16 + start] =  stackcb[i];
-            start++;
-        }
-        while (count >= run_esc) {
-            put_bits(&s->pb, run_bits, run_esc);
-            count -= run_esc;
+    if (!bands)
+        return;
+
+    /* Clip the scalefactor indices */
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (sce->band_type[w*16+g] == INTENSITY_BT || sce->band_type[w*16+g] == INTENSITY_BT2) {
+                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler_i, minscaler_i + SCALE_MAX_DIFF);
+            } else if (sce->band_type[w*16+g] == NOISE_BT) {
+                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler_n, minscaler_n + SCALE_MAX_DIFF);
+            }
         }
-        put_bits(&s->pb, run_bits, count);
     }
 }
 
-/** Return the minimum scalefactor where the quantized coef does not clip. */
-static av_always_inline uint8_t coef2minsf(float coef) {
-    return av_clip_uint8(log2f(coef)*4 - 69 + SCALE_ONE_POS - SCALE_DIV_512);
-}
-
-/** Return the maximum scalefactor where the quantized coef is not zero. */
-static av_always_inline uint8_t coef2maxsf(float coef) {
-    return av_clip_uint8(log2f(coef)*4 +  6 + SCALE_ONE_POS - SCALE_DIV_512);
-}
-
-typedef struct TrellisPath {
-    float cost;
-    int prev;
-} TrellisPath;
-
-#define TRELLIS_STAGES 121
-#define TRELLIS_STATES (SCALE_MAX_DIFF+1)
-
 static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                                        SingleChannelElement *sce,
                                        const float lambda)
@@ -616,7 +296,7 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
         start = w*128;
         for (g = 0; g < sce->ics.num_swb; g++) {
-            const float *coefs = sce->coeffs + start;
+            const float *coefs = &sce->coeffs[start];
             float qmin, qmax;
             int nz = 0;
 
@@ -655,7 +335,7 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                     for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                         FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
                         dist += quantize_band_cost(s, coefs + w2*128, s->scoefs + start + w2*128, sce->ics.swb_sizes[g],
-                                                   q + q0, cb, lambda / band->threshold, INFINITY, NULL);
+                                                   q + q0, cb, lambda / band->threshold, INFINITY, NULL, 0);
                     }
                     minrd = FFMIN(minrd, dist);
 
@@ -701,153 +381,6 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                 sce->sf_idx[(w+w2)*16+g] = sce->sf_idx[w*16+g];
 }
 
-/**
- * two-loop quantizers search taken from ISO 13818-7 Appendix C
- */
-static void search_for_quantizers_twoloop(AVCodecContext *avctx,
-                                          AACEncContext *s,
-                                          SingleChannelElement *sce,
-                                          const float lambda)
-{
-    int start = 0, i, w, w2, g;
-    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels * (lambda / 120.f);
-    float dists[128] = { 0 }, uplims[128];
-    float maxvals[128];
-    int fflag, minscaler;
-    int its  = 0;
-    int allz = 0;
-    float minthr = INFINITY;
-
-    // for values above this the decoder might end up in an endless loop
-    // due to always having more bits than what can be encoded.
-    destbits = FFMIN(destbits, 5800);
-    //XXX: some heuristic to determine initial quantizers will reduce search time
-    //determine zero bands and upper limits
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            int nz = 0;
-            float uplim = 0.0f;
-            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
-                uplim += band->threshold;
-                if (band->energy <= band->threshold || band->threshold == 0.0f) {
-                    sce->zeroes[(w+w2)*16+g] = 1;
-                    continue;
-                }
-                nz = 1;
-            }
-            uplims[w*16+g] = uplim *512;
-            sce->zeroes[w*16+g] = !nz;
-            if (nz)
-                minthr = FFMIN(minthr, uplim);
-            allz |= nz;
-        }
-    }
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            if (sce->zeroes[w*16+g]) {
-                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
-                continue;
-            }
-            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
-        }
-    }
-
-    if (!allz)
-        return;
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        start = w*128;
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            const float *scaled = s->scoefs + start;
-            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
-            start += sce->ics.swb_sizes[g];
-        }
-    }
-
-    //perform two-loop search
-    //outer loop - improve quality
-    do {
-        int tbits, qstep;
-        minscaler = sce->sf_idx[0];
-        //inner loop - quantize spectrum to fit into given number of bits
-        qstep = its ? 1 : 32;
-        do {
-            int prev = -1;
-            tbits = 0;
-            fflag = 0;
-            for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-                start = w*128;
-                for (g = 0;  g < sce->ics.num_swb; g++) {
-                    const float *coefs = sce->coeffs + start;
-                    const float *scaled = s->scoefs + start;
-                    int bits = 0;
-                    int cb;
-                    float dist = 0.0f;
-
-                    if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
-                        start += sce->ics.swb_sizes[g];
-                        continue;
-                    }
-                    minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
-                    cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-                    for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                        int b;
-                        dist += quantize_band_cost(s, coefs + w2*128,
-                                                   scaled + w2*128,
-                                                   sce->ics.swb_sizes[g],
-                                                   sce->sf_idx[w*16+g],
-                                                   cb,
-                                                   1.0f,
-                                                   INFINITY,
-                                                   &b);
-                        bits += b;
-                    }
-                    dists[w*16+g] = dist - bits;
-                    if (prev != -1) {
-                        bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
-                    }
-                    tbits += bits;
-                    start += sce->ics.swb_sizes[g];
-                    prev = sce->sf_idx[w*16+g];
-                }
-            }
-            if (tbits > destbits) {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] < 218 - qstep)
-                        sce->sf_idx[i] += qstep;
-            } else {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] > 60 - qstep)
-                        sce->sf_idx[i] -= qstep;
-            }
-            qstep >>= 1;
-            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
-                qstep = 1;
-        } while (qstep);
-
-        fflag = 0;
-        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
-        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-            for (g = 0; g < sce->ics.num_swb; g++) {
-                int prevsc = sce->sf_idx[w*16+g];
-                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
-                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
-                        sce->sf_idx[w*16+g]--;
-                    else //Try to make sure there is some energy in every band
-                        sce->sf_idx[w*16+g]-=2;
-                }
-                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
-                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
-                if (sce->sf_idx[w*16+g] != prevsc)
-                    fflag = 1;
-                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-            }
-        }
-        its++;
-    } while (fflag && its < 10);
-}
 
 static void search_for_quantizers_faac(AVCodecContext *avctx, AACEncContext *s,
                                        SingleChannelElement *sce,
@@ -874,8 +407,8 @@ static void search_for_quantizers_faac(AVCodecContext *avctx, AACEncContext *s,
         }
     } else {
         for (w = 0; w < 8; w++) {
-            const float *coeffs = sce->coeffs + w*128;
-            start = 0;
+            const float *coeffs = &sce->coeffs[w*128];
+            curband = start = 0;
             for (i = 0; i < 128; i++) {
                 if (i - start >= sce->ics.swb_sizes[curband]) {
                     start += sce->ics.swb_sizes[curband];
@@ -899,7 +432,7 @@ static void search_for_quantizers_faac(AVCodecContext *avctx, AACEncContext *s,
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
         start = w*128;
         for (g = 0; g < sce->ics.num_swb; g++) {
-            float *coefs   = sce->coeffs + start;
+            float *coefs   = &sce->coeffs[start];
             const int size = sce->ics.swb_sizes[g];
             int start2 = start, end2 = start + size, peakpos = start;
             float maxval = -1, thr = 0.0f, t;
@@ -940,8 +473,8 @@ static void search_for_quantizers_faac(AVCodecContext *avctx, AACEncContext *s,
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
         start = w*128;
         for (g = 0;  g < sce->ics.num_swb; g++) {
-            const float *coefs  = sce->coeffs + start;
-            const float *scaled = s->scoefs   + start;
+            const float *coefs  = &sce->coeffs[start];
+            const float *scaled = &s->scoefs[start];
             const int size      = sce->ics.swb_sizes[g];
             int scf, prev_scf, step;
             int min_scf = -1, max_scf = 256;
@@ -953,7 +486,6 @@ static void search_for_quantizers_faac(AVCodecContext *avctx, AACEncContext *s,
             }
             sce->zeroes[w*16+g] = 0;
             scf  = prev_scf = av_clip(SCALE_ONE_POS - SCALE_DIV_512 - log2f(1/maxq[w*16+g])*16/3, 60, 218);
-            step = 16;
             for (;;) {
                 float dist = 0.0f;
                 int quant_max;
@@ -967,11 +499,12 @@ static void search_for_quantizers_faac(AVCodecContext *avctx, AACEncContext *s,
                                                ESC_BT,
                                                lambda,
                                                INFINITY,
-                                               &b);
+                                               &b,
+                                               0);
                     dist -= b;
                 }
                 dist *= 1.0f / 512.0f / lambda;
-                quant_max = quant(maxq[w*16+g], ff_aac_pow2sf_tab[POW_SF2_ZERO - scf + SCALE_ONE_POS - SCALE_DIV_512]);
+                quant_max = quant(maxq[w*16+g], ff_aac_pow2sf_tab[POW_SF2_ZERO - scf + SCALE_ONE_POS - SCALE_DIV_512], ROUND_STANDARD);
                 if (quant_max >= 8191) { // too much, return to the previous quantizer
                     sce->sf_idx[w*16+g] = prev_scf;
                     break;
@@ -1051,17 +584,96 @@ static void search_for_quantizers_fast(AVCodecContext *avctx, AACEncContext *s,
                 sce->sf_idx[(w+w2)*16+g] = sce->sf_idx[w*16+g];
 }
 
-static void search_for_ms(AACEncContext *s, ChannelElement *cpe,
-                          const float lambda)
+static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce)
+{
+    FFPsyBand *band;
+    int w, g, w2, i;
+    float *PNS = &s->scoefs[0*128], *PNS34 = &s->scoefs[1*128];
+    float *NOR34 = &s->scoefs[3*128];
+    const float lambda = s->lambda;
+    const float freq_mult = avctx->sample_rate/(1024.0f/sce->ics.num_windows)/2.0f;
+    const float thr_mult = NOISE_LAMBDA_REPLACE*(100.0f/lambda);
+    const float spread_threshold = NOISE_SPREAD_THRESHOLD*FFMAX(0.5f, lambda/100.f);
+
+    memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type));
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        int wstart = w*128;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            int noise_sfi;
+            float dist1 = 0.0f, dist2 = 0.0f, noise_amp;
+            float pns_energy = 0.0f, pns_tgt_energy, energy_ratio, dist_thresh;
+            float sfb_energy = 0.0f, threshold = 0.0f, spread = 0.0f;
+            const int start = wstart+sce->ics.swb_offset[g];
+            const float freq = (start-wstart)*freq_mult;
+            const float freq_boost = FFMAX(0.88f*freq/NOISE_LOW_LIMIT, 1.0f);
+            if (freq < NOISE_LOW_LIMIT || avctx->cutoff && freq >= avctx->cutoff)
+                continue;
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                sfb_energy += band->energy;
+                spread     += band->spread;
+                threshold  += band->threshold;
+            }
+
+            /* Ramps down at ~8000Hz and loosens the dist threshold */
+            dist_thresh = FFMIN(2.5f*NOISE_LOW_LIMIT/freq, 2.5f);
+
+            /* zero and energy close to threshold usually means hole avoidance,
+             * we do want to remain avoiding holes with PNS
+             */
+            if (((sce->zeroes[w*16+g] || !sce->band_alt[w*16+g]) && sfb_energy < threshold*sqrtf(1.5f/freq_boost)) || spread < spread_threshold ||
+                (sce->band_alt[w*16+g] && sfb_energy > threshold*thr_mult*freq_boost)) {
+                sce->pns_ener[w*16+g] = sfb_energy;
+                continue;
+            }
+
+            pns_tgt_energy = sfb_energy*spread*spread/sce->ics.group_len[w];
+            noise_sfi = av_clip(roundf(log2f(pns_tgt_energy)*2), -100, 155); /* Quantize */
+            noise_amp = -ff_aac_pow2sf_tab[noise_sfi + POW_SF2_ZERO];    /* Dequantize */
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                float band_energy, scale, pns_senergy;
+                const int start_c = (w+w2)*128+sce->ics.swb_offset[g];
+                band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                for (i = 0; i < sce->ics.swb_sizes[g]; i++)
+                    PNS[i] = s->random_state = lcg_random(s->random_state);
+                band_energy = s->fdsp->scalarproduct_float(PNS, PNS, sce->ics.swb_sizes[g]);
+                scale = noise_amp/sqrtf(band_energy);
+                s->fdsp->vector_fmul_scalar(PNS, PNS, scale, sce->ics.swb_sizes[g]);
+                pns_senergy = s->fdsp->scalarproduct_float(PNS, PNS, sce->ics.swb_sizes[g]);
+                pns_energy += pns_senergy;
+                abs_pow34_v(NOR34, &sce->coeffs[start_c], sce->ics.swb_sizes[g]);
+                abs_pow34_v(PNS34, PNS, sce->ics.swb_sizes[g]);
+                dist1 += quantize_band_cost(s, &sce->coeffs[start_c],
+                                            NOR34,
+                                            sce->ics.swb_sizes[g],
+                                            sce->sf_idx[(w+w2)*16+g],
+                                            sce->band_alt[(w+w2)*16+g],
+                                            lambda/band->threshold, INFINITY, NULL, 0);
+                /* Estimate rd on average as 9 bits for CB and sf + spread energy * lambda/thr */
+                dist2 += 9+band->energy/(band->spread*band->spread)*lambda/band->threshold;
+            }
+            energy_ratio = pns_tgt_energy/pns_energy; /* Compensates for quantization error */
+            sce->pns_ener[w*16+g] = energy_ratio*pns_tgt_energy;
+            if (energy_ratio > 0.85f && energy_ratio < 1.25f && (sce->zeroes[w*16+g] || !sce->band_alt[w*16+g] || dist2*dist_thresh < dist1)) {
+                sce->band_type[w*16+g] = NOISE_BT;
+                sce->zeroes[w*16+g] = 0;
+            }
+        }
+    }
+}
+
+static void search_for_ms(AACEncContext *s, ChannelElement *cpe)
 {
     int start = 0, i, w, w2, g;
     float M[128], S[128];
     float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
+    const float lambda = s->lambda;
     SingleChannelElement *sce0 = &cpe->ch[0];
     SingleChannelElement *sce1 = &cpe->ch[1];
     if (!cpe->common_window)
         return;
     for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        start = 0;
         for (g = 0;  g < sce0->ics.num_swb; g++) {
             if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
                 float dist1 = 0.0f, dist2 = 0.0f;
@@ -1071,39 +683,39 @@ static void search_for_ms(AACEncContext *s, ChannelElement *cpe,
                     float minthr = FFMIN(band0->threshold, band1->threshold);
                     float maxthr = FFMAX(band0->threshold, band1->threshold);
                     for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
-                        M[i] = (sce0->coeffs[start+w2*128+i]
-                              + sce1->coeffs[start+w2*128+i]) * 0.5;
+                        M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                              + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
                         S[i] =  M[i]
-                              - sce1->coeffs[start+w2*128+i];
+                              - sce1->coeffs[start+(w+w2)*128+i];
                     }
-                    abs_pow34_v(L34, sce0->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(R34, sce1->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
                     abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
                     abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
-                    dist1 += quantize_band_cost(s, sce0->coeffs + start + w2*128,
+                    dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
                                                 L34,
                                                 sce0->ics.swb_sizes[g],
                                                 sce0->sf_idx[(w+w2)*16+g],
                                                 sce0->band_type[(w+w2)*16+g],
-                                                lambda / band0->threshold, INFINITY, NULL);
-                    dist1 += quantize_band_cost(s, sce1->coeffs + start + w2*128,
+                                                lambda / band0->threshold, INFINITY, NULL, 0);
+                    dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
                                                 R34,
                                                 sce1->ics.swb_sizes[g],
                                                 sce1->sf_idx[(w+w2)*16+g],
                                                 sce1->band_type[(w+w2)*16+g],
-                                                lambda / band1->threshold, INFINITY, NULL);
+                                                lambda / band1->threshold, INFINITY, NULL, 0);
                     dist2 += quantize_band_cost(s, M,
                                                 M34,
                                                 sce0->ics.swb_sizes[g],
                                                 sce0->sf_idx[(w+w2)*16+g],
                                                 sce0->band_type[(w+w2)*16+g],
-                                                lambda / maxthr, INFINITY, NULL);
+                                                lambda / maxthr, INFINITY, NULL, 0);
                     dist2 += quantize_band_cost(s, S,
                                                 S34,
                                                 sce1->ics.swb_sizes[g],
                                                 sce1->sf_idx[(w+w2)*16+g],
                                                 sce1->band_type[(w+w2)*16+g],
-                                                lambda / minthr, INFINITY, NULL);
+                                                lambda / minthr, INFINITY, NULL, 0);
                 }
                 cpe->ms_mask[w*16+g] = dist2 < dist1;
             }
@@ -1112,29 +724,69 @@ static void search_for_ms(AACEncContext *s, ChannelElement *cpe,
     }
 }
 
-AACCoefficientsEncoder ff_aac_coders[] = {
-    {
+AACCoefficientsEncoder ff_aac_coders[AAC_CODER_NB] = {
+    [AAC_CODER_FAAC] = {
         search_for_quantizers_faac,
         encode_window_bands_info,
         quantize_and_encode_band,
+        ff_aac_encode_tns_info,
+        ff_aac_encode_main_pred,
+        ff_aac_adjust_common_prediction,
+        ff_aac_apply_main_pred,
+        ff_aac_apply_tns,
+        set_special_band_scalefactors,
+        search_for_pns,
+        ff_aac_search_for_tns,
         search_for_ms,
+        ff_aac_search_for_is,
+        ff_aac_search_for_pred,
     },
-    {
+    [AAC_CODER_ANMR] = {
         search_for_quantizers_anmr,
         encode_window_bands_info,
         quantize_and_encode_band,
+        ff_aac_encode_tns_info,
+        ff_aac_encode_main_pred,
+        ff_aac_adjust_common_prediction,
+        ff_aac_apply_main_pred,
+        ff_aac_apply_tns,
+        set_special_band_scalefactors,
+        search_for_pns,
+        ff_aac_search_for_tns,
         search_for_ms,
+        ff_aac_search_for_is,
+        ff_aac_search_for_pred,
     },
-    {
+    [AAC_CODER_TWOLOOP] = {
         search_for_quantizers_twoloop,
         codebook_trellis_rate,
         quantize_and_encode_band,
+        ff_aac_encode_tns_info,
+        ff_aac_encode_main_pred,
+        ff_aac_adjust_common_prediction,
+        ff_aac_apply_main_pred,
+        ff_aac_apply_tns,
+        set_special_band_scalefactors,
+        search_for_pns,
+        ff_aac_search_for_tns,
         search_for_ms,
+        ff_aac_search_for_is,
+        ff_aac_search_for_pred,
     },
-    {
+    [AAC_CODER_FAST] = {
         search_for_quantizers_fast,
         encode_window_bands_info,
         quantize_and_encode_band,
+        ff_aac_encode_tns_info,
+        ff_aac_encode_main_pred,
+        ff_aac_adjust_common_prediction,
+        ff_aac_apply_main_pred,
+        ff_aac_apply_tns,
+        set_special_band_scalefactors,
+        search_for_pns,
+        ff_aac_search_for_tns,
         search_for_ms,
+        ff_aac_search_for_is,
+        ff_aac_search_for_pred,
     },
 };
diff --git a/libavcodec/aaccoder_trellis.h b/libavcodec/aaccoder_trellis.h
new file mode 100644
index 0000000000..7d685ebe8c
--- /dev/null
+++ b/libavcodec/aaccoder_trellis.h
@@ -0,0 +1,194 @@
+/*
+ * AAC encoder trellis codebook selector
+ * Copyright (C) 2008-2009 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder trellis codebook selector
+ * @author Konstantin Shishkov
+ */
+
+/**
+ * This file contains a template for the codebook_trellis_rate selector function.
+ * It needs to be provided, externally, as an already included declaration,
+ * the following functions from aacenc_quantization/util.h. They're not included
+ * explicitly here to make it possible to provide alternative implementations:
+ *  - quantize_band_cost_bits
+ *  - abs_pow34_v
+ */
+
+#ifndef AVCODEC_AACCODER_TRELLIS_H
+#define AVCODEC_AACCODER_TRELLIS_H
+
+#include <float.h>
+#include "libavutil/mathematics.h"
+#include "avcodec.h"
+#include "put_bits.h"
+#include "aac.h"
+#include "aacenc.h"
+#include "aactab.h"
+#include "aacenctab.h"
+#include "aac_tablegen_decl.h"
+
+
+/**
+ * structure used in optimal codebook search
+ */
+typedef struct TrellisBandCodingPath {
+    int prev_idx; ///< pointer to the previous path point
+    float cost;   ///< path cost
+    int run;
+} TrellisBandCodingPath;
+
+
+static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
+                                  int win, int group_len, const float lambda)
+{
+    TrellisBandCodingPath path[120][CB_TOT_ALL];
+    int w, swb, cb, start, size;
+    int i, j;
+    const int max_sfb  = sce->ics.max_sfb;
+    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
+    const int run_esc  = (1 << run_bits) - 1;
+    int idx, ppos, count;
+    int stackrun[120], stackcb[120], stack_len;
+    float next_minbits = INFINITY;
+    int next_mincb = 0;
+
+    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
+    start = win*128;
+    for (cb = 0; cb < CB_TOT_ALL; cb++) {
+        path[0][cb].cost     = run_bits+4;
+        path[0][cb].prev_idx = -1;
+        path[0][cb].run      = 0;
+    }
+    for (swb = 0; swb < max_sfb; swb++) {
+        size = sce->ics.swb_sizes[swb];
+        if (sce->zeroes[win*16 + swb]) {
+            float cost_stay_here = path[swb][0].cost;
+            float cost_get_here  = next_minbits + run_bits + 4;
+            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
+                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
+                cost_stay_here += run_bits;
+            if (cost_get_here < cost_stay_here) {
+                path[swb+1][0].prev_idx = next_mincb;
+                path[swb+1][0].cost     = cost_get_here;
+                path[swb+1][0].run      = 1;
+            } else {
+                path[swb+1][0].prev_idx = 0;
+                path[swb+1][0].cost     = cost_stay_here;
+                path[swb+1][0].run      = path[swb][0].run + 1;
+            }
+            next_minbits = path[swb+1][0].cost;
+            next_mincb = 0;
+            for (cb = 1; cb < CB_TOT_ALL; cb++) {
+                path[swb+1][cb].cost = 61450;
+                path[swb+1][cb].prev_idx = -1;
+                path[swb+1][cb].run = 0;
+            }
+        } else {
+            float minbits = next_minbits;
+            int mincb = next_mincb;
+            int startcb = sce->band_type[win*16+swb];
+            startcb = aac_cb_in_map[startcb];
+            next_minbits = INFINITY;
+            next_mincb = 0;
+            for (cb = 0; cb < startcb; cb++) {
+                path[swb+1][cb].cost = 61450;
+                path[swb+1][cb].prev_idx = -1;
+                path[swb+1][cb].run = 0;
+            }
+            for (cb = startcb; cb < CB_TOT_ALL; cb++) {
+                float cost_stay_here, cost_get_here;
+                float bits = 0.0f;
+                if (cb >= 12 && sce->band_type[win*16+swb] != aac_cb_out_map[cb]) {
+                    path[swb+1][cb].cost = 61450;
+                    path[swb+1][cb].prev_idx = -1;
+                    path[swb+1][cb].run = 0;
+                    continue;
+                }
+                for (w = 0; w < group_len; w++) {
+                    bits += quantize_band_cost_bits(s, &sce->coeffs[start + w*128],
+                                               &s->scoefs[start + w*128], size,
+                                               sce->sf_idx[win*16+swb],
+                                               aac_cb_out_map[cb],
+                                               0, INFINITY, NULL, 0);
+                }
+                cost_stay_here = path[swb][cb].cost + bits;
+                cost_get_here  = minbits            + bits + run_bits + 4;
+                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
+                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
+                    cost_stay_here += run_bits;
+                if (cost_get_here < cost_stay_here) {
+                    path[swb+1][cb].prev_idx = mincb;
+                    path[swb+1][cb].cost     = cost_get_here;
+                    path[swb+1][cb].run      = 1;
+                } else {
+                    path[swb+1][cb].prev_idx = cb;
+                    path[swb+1][cb].cost     = cost_stay_here;
+                    path[swb+1][cb].run      = path[swb][cb].run + 1;
+                }
+                if (path[swb+1][cb].cost < next_minbits) {
+                    next_minbits = path[swb+1][cb].cost;
+                    next_mincb = cb;
+                }
+            }
+        }
+        start += sce->ics.swb_sizes[swb];
+    }
+
+    //convert resulting path from backward-linked list
+    stack_len = 0;
+    idx       = 0;
+    for (cb = 1; cb < CB_TOT_ALL; cb++)
+        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
+            idx = cb;
+    ppos = max_sfb;
+    while (ppos > 0) {
+        av_assert1(idx >= 0);
+        cb = idx;
+        stackrun[stack_len] = path[ppos][cb].run;
+        stackcb [stack_len] = cb;
+        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
+        ppos -= path[ppos][cb].run;
+        stack_len++;
+    }
+    //perform actual band info encoding
+    start = 0;
+    for (i = stack_len - 1; i >= 0; i--) {
+        cb = aac_cb_out_map[stackcb[i]];
+        put_bits(&s->pb, 4, cb);
+        count = stackrun[i];
+        memset(sce->zeroes + win*16 + start, !cb, count);
+        //XXX: memset when band_type is also uint8_t
+        for (j = 0; j < count; j++) {
+            sce->band_type[win*16 + start] = cb;
+            start++;
+        }
+        while (count >= run_esc) {
+            put_bits(&s->pb, run_bits, run_esc);
+            count -= run_esc;
+        }
+        put_bits(&s->pb, run_bits, count);
+    }
+}
+
+
+#endif /* AVCODEC_AACCODER_TRELLIS_H */
diff --git a/libavcodec/aaccoder_twoloop.h b/libavcodec/aaccoder_twoloop.h
new file mode 100644
index 0000000000..5ac09dc9cc
--- /dev/null
+++ b/libavcodec/aaccoder_twoloop.h
@@ -0,0 +1,203 @@
+/*
+ * AAC encoder twoloop coder
+ * Copyright (C) 2008-2009 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder twoloop coder
+ * @author Konstantin Shishkov
+ */
+
+/**
+ * This file contains a template for the twoloop coder function.
+ * It needs to be provided, externally, as an already included declaration,
+ * the following functions from aacenc_quantization/util.h. They're not included
+ * explicitly here to make it possible to provide alternative implementations:
+ *  - quantize_band_cost
+ *  - abs_pow34_v
+ *  - find_max_val
+ *  - find_min_book
+ */
+
+#ifndef AVCODEC_AACCODER_TWOLOOP_H
+#define AVCODEC_AACCODER_TWOLOOP_H
+
+#include <float.h>
+#include "libavutil/mathematics.h"
+#include "avcodec.h"
+#include "put_bits.h"
+#include "aac.h"
+#include "aacenc.h"
+#include "aactab.h"
+#include "aacenctab.h"
+#include "aac_tablegen_decl.h"
+
+
+/**
+ * two-loop quantizers search taken from ISO 13818-7 Appendix C
+ */
+static void search_for_quantizers_twoloop(AVCodecContext *avctx,
+                                          AACEncContext *s,
+                                          SingleChannelElement *sce,
+                                          const float lambda)
+{
+    int start = 0, i, w, w2, g;
+    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels * (lambda / 120.f);
+    float dists[128] = { 0 }, uplims[128] = { 0 };
+    float maxvals[128];
+    int fflag, minscaler;
+    int its  = 0;
+    int allz = 0;
+    float minthr = INFINITY;
+
+    // for values above this the decoder might end up in an endless loop
+    // due to always having more bits than what can be encoded.
+    destbits = FFMIN(destbits, 5800);
+    //XXX: some heuristic to determine initial quantizers will reduce search time
+    //determine zero bands and upper limits
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            int nz = 0;
+            float uplim = 0.0f, energy = 0.0f;
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                uplim  += band->threshold;
+                energy += band->energy;
+                if (band->energy <= band->threshold || band->threshold == 0.0f) {
+                    sce->zeroes[(w+w2)*16+g] = 1;
+                    continue;
+                }
+                nz = 1;
+            }
+            uplims[w*16+g] = uplim *512;
+            sce->zeroes[w*16+g] = !nz;
+            if (nz)
+                minthr = FFMIN(minthr, uplim);
+            allz |= nz;
+        }
+    }
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (sce->zeroes[w*16+g]) {
+                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
+                continue;
+            }
+            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
+        }
+    }
+
+    if (!allz)
+        return;
+    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
+
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        start = w*128;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            const float *scaled = s->scoefs + start;
+            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
+            start += sce->ics.swb_sizes[g];
+        }
+    }
+
+    //perform two-loop search
+    //outer loop - improve quality
+    do {
+        int tbits, qstep;
+        minscaler = sce->sf_idx[0];
+        //inner loop - quantize spectrum to fit into given number of bits
+        qstep = its ? 1 : 32;
+        do {
+            int prev = -1;
+            tbits = 0;
+            for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                start = w*128;
+                for (g = 0;  g < sce->ics.num_swb; g++) {
+                    const float *coefs = &sce->coeffs[start];
+                    const float *scaled = &s->scoefs[start];
+                    int bits = 0;
+                    int cb;
+                    float dist = 0.0f;
+
+                    if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
+                        start += sce->ics.swb_sizes[g];
+                        continue;
+                    }
+                    minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
+                    cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                    for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                        int b;
+                        dist += quantize_band_cost(s, coefs + w2*128,
+                                                   scaled + w2*128,
+                                                   sce->ics.swb_sizes[g],
+                                                   sce->sf_idx[w*16+g],
+                                                   cb,
+                                                   1.0f,
+                                                   INFINITY,
+                                                   &b,
+                                                   0);
+                        bits += b;
+                    }
+                    dists[w*16+g] = dist - bits;
+                    if (prev != -1) {
+                        bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
+                    }
+                    tbits += bits;
+                    start += sce->ics.swb_sizes[g];
+                    prev = sce->sf_idx[w*16+g];
+                }
+            }
+            if (tbits > destbits) {
+                for (i = 0; i < 128; i++)
+                    if (sce->sf_idx[i] < 218 - qstep)
+                        sce->sf_idx[i] += qstep;
+            } else {
+                for (i = 0; i < 128; i++)
+                    if (sce->sf_idx[i] > 60 - qstep)
+                        sce->sf_idx[i] -= qstep;
+            }
+            qstep >>= 1;
+            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
+                qstep = 1;
+        } while (qstep);
+
+        fflag = 0;
+        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
+
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            for (g = 0; g < sce->ics.num_swb; g++) {
+                int prevsc = sce->sf_idx[w*16+g];
+                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
+                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
+                        sce->sf_idx[w*16+g]--;
+                    else //Try to make sure there is some energy in every band
+                        sce->sf_idx[w*16+g]-=2;
+                }
+                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
+                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
+                if (sce->sf_idx[w*16+g] != prevsc)
+                    fflag = 1;
+                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+            }
+        }
+        its++;
+    } while (fflag && its < 10);
+}
+
+#endif /* AVCODEC_AACCODER_TWOLOOP_H */
diff --git a/libavcodec/aacdec.c b/libavcodec/aacdec.c
index f0c7960d28..837102f40a 100644
--- a/libavcodec/aacdec.c
+++ b/libavcodec/aacdec.c
@@ -8,20 +8,20 @@
  * Copyright (c) 2008-2010 Paul Kendall <paul@kcbbs.gen.nz>
  * Copyright (c) 2010      Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,55 +32,12 @@
  * @author Maxim Gavrilov ( maxim.gavrilov gmail com )
  */
 
-/*
- * supported tools
- *
- * Support?             Name
- * N (code in SoC repo) gain control
- * Y                    block switching
- * Y                    window shapes - standard
- * N                    window shapes - Low Delay
- * Y                    filterbank - standard
- * N (code in SoC repo) filterbank - Scalable Sample Rate
- * Y                    Temporal Noise Shaping
- * Y                    Long Term Prediction
- * Y                    intensity stereo
- * Y                    channel coupling
- * Y                    frequency domain prediction
- * Y                    Perceptual Noise Substitution
- * Y                    Mid/Side stereo
- * N                    Scalable Inverse AAC Quantization
- * N                    Frequency Selective Switch
- * N                    upsampling filter
- * Y                    quantization & coding - AAC
- * N                    quantization & coding - TwinVQ
- * N                    quantization & coding - BSAC
- * N                    AAC Error Resilience tools
- * N                    Error Resilience payload syntax
- * N                    Error Protection tool
- * N                    CELP
- * N                    Silence Compression
- * N                    HVXC
- * N                    HVXC 4kbits/s VR
- * N                    Structured Audio tools
- * N                    Structured Audio Sample Bank Format
- * N                    MIDI
- * N                    Harmonic and Individual Lines plus Noise
- * N                    Text-To-Speech Interface
- * Y                    Spectral Band Replication
- * Y (not in this code) Layer-1
- * Y (not in this code) Layer-2
- * Y (not in this code) Layer-3
- * N                    SinuSoidal Coding (Transient, Sinusoid, Noise)
- * Y                    Parametric Stereo
- * N                    Direct Stream Transfer
- *
- * Note: - HE AAC v1 comprises LC AAC with Spectral Band Replication.
- *       - HE AAC v2 comprises LC AAC with Spectral Band Replication and
-           Parametric Stereo.
- */
+#define FFT_FLOAT 1
+#define FFT_FIXED_32 0
+#define USE_FIXED 0
 
 #include "libavutil/float_dsp.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "get_bits.h"
@@ -100,7 +57,6 @@
 #include "aacadtsdec.h"
 #include "libavutil/intfloat.h"
 
-#include <assert.h>
 #include <errno.h>
 #include <math.h>
 #include <stdint.h>
@@ -108,853 +64,10 @@
 
 #if ARCH_ARM
 #   include "arm/aac.h"
+#elif ARCH_MIPS
+#   include "mips/aacdec_mips.h"
 #endif
 
-static VLC vlc_scalefactors;
-static VLC vlc_spectral[11];
-
-static const char overread_err[] = "Input buffer exhausted before END element found\n";
-
-static int count_channels(uint8_t (*layout)[3], int tags)
-{
-    int i, sum = 0;
-    for (i = 0; i < tags; i++) {
-        int syn_ele = layout[i][0];
-        int pos     = layout[i][2];
-        sum += (1 + (syn_ele == TYPE_CPE)) *
-               (pos != AAC_CHANNEL_OFF && pos != AAC_CHANNEL_CC);
-    }
-    return sum;
-}
-
-/**
- * Check for the channel element in the current channel position configuration.
- * If it exists, make sure the appropriate element is allocated and map the
- * channel order to match the internal Libav channel layout.
- *
- * @param   che_pos current channel position configuration
- * @param   type channel element type
- * @param   id channel element id
- * @param   channels count of the number of channels in the configuration
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static av_cold int che_configure(AACContext *ac,
-                                 enum ChannelPosition che_pos,
-                                 int type, int id, int *channels)
-{
-    if (che_pos) {
-        if (!ac->che[type][id]) {
-            if (!(ac->che[type][id] = av_mallocz(sizeof(ChannelElement))))
-                return AVERROR(ENOMEM);
-            ff_aac_sbr_ctx_init(ac, &ac->che[type][id]->sbr);
-        }
-        if (type != TYPE_CCE) {
-            if (*channels >= MAX_CHANNELS - 2)
-                return AVERROR_INVALIDDATA;
-            ac->output_element[(*channels)++] = &ac->che[type][id]->ch[0];
-            if (type == TYPE_CPE ||
-                (type == TYPE_SCE && ac->oc[1].m4ac.ps == 1)) {
-                ac->output_element[(*channels)++] = &ac->che[type][id]->ch[1];
-            }
-        }
-    } else {
-        if (ac->che[type][id])
-            ff_aac_sbr_ctx_close(&ac->che[type][id]->sbr);
-        av_freep(&ac->che[type][id]);
-    }
-    return 0;
-}
-
-static int frame_configure_elements(AVCodecContext *avctx)
-{
-    AACContext *ac = avctx->priv_data;
-    int type, id, ch, ret;
-
-    /* set channel pointers to internal buffers by default */
-    for (type = 0; type < 4; type++) {
-        for (id = 0; id < MAX_ELEM_ID; id++) {
-            ChannelElement *che = ac->che[type][id];
-            if (che) {
-                che->ch[0].ret = che->ch[0].ret_buf;
-                che->ch[1].ret = che->ch[1].ret_buf;
-            }
-        }
-    }
-
-    /* get output buffer */
-    av_frame_unref(ac->frame);
-    ac->frame->nb_samples = 2048;
-    if ((ret = ff_get_buffer(avctx, ac->frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
-    }
-
-    /* map output channel pointers to AVFrame data */
-    for (ch = 0; ch < avctx->channels; ch++) {
-        if (ac->output_element[ch])
-            ac->output_element[ch]->ret = (float *)ac->frame->extended_data[ch];
-    }
-
-    return 0;
-}
-
-struct elem_to_channel {
-    uint64_t av_position;
-    uint8_t syn_ele;
-    uint8_t elem_id;
-    uint8_t aac_position;
-};
-
-static int assign_pair(struct elem_to_channel e2c_vec[MAX_ELEM_ID],
-                       uint8_t (*layout_map)[3], int offset, uint64_t left,
-                       uint64_t right, int pos)
-{
-    if (layout_map[offset][0] == TYPE_CPE) {
-        e2c_vec[offset] = (struct elem_to_channel) {
-            .av_position  = left | right,
-            .syn_ele      = TYPE_CPE,
-            .elem_id      = layout_map[offset][1],
-            .aac_position = pos
-        };
-        return 1;
-    } else {
-        e2c_vec[offset] = (struct elem_to_channel) {
-            .av_position  = left,
-            .syn_ele      = TYPE_SCE,
-            .elem_id      = layout_map[offset][1],
-            .aac_position = pos
-        };
-        e2c_vec[offset + 1] = (struct elem_to_channel) {
-            .av_position  = right,
-            .syn_ele      = TYPE_SCE,
-            .elem_id      = layout_map[offset + 1][1],
-            .aac_position = pos
-        };
-        return 2;
-    }
-}
-
-static int count_paired_channels(uint8_t (*layout_map)[3], int tags, int pos,
-                                 int *current)
-{
-    int num_pos_channels = 0;
-    int first_cpe        = 0;
-    int sce_parity       = 0;
-    int i;
-    for (i = *current; i < tags; i++) {
-        if (layout_map[i][2] != pos)
-            break;
-        if (layout_map[i][0] == TYPE_CPE) {
-            if (sce_parity) {
-                if (pos == AAC_CHANNEL_FRONT && !first_cpe) {
-                    sce_parity = 0;
-                } else {
-                    return -1;
-                }
-            }
-            num_pos_channels += 2;
-            first_cpe         = 1;
-        } else {
-            num_pos_channels++;
-            sce_parity ^= 1;
-        }
-    }
-    if (sce_parity &&
-        ((pos == AAC_CHANNEL_FRONT && first_cpe) || pos == AAC_CHANNEL_SIDE))
-        return -1;
-    *current = i;
-    return num_pos_channels;
-}
-
-static uint64_t sniff_channel_order(uint8_t (*layout_map)[3], int tags)
-{
-    int i, n, total_non_cc_elements;
-    struct elem_to_channel e2c_vec[4 * MAX_ELEM_ID] = { { 0 } };
-    int num_front_channels, num_side_channels, num_back_channels;
-    uint64_t layout;
-
-    if (FF_ARRAY_ELEMS(e2c_vec) < tags)
-        return 0;
-
-    i = 0;
-    num_front_channels =
-        count_paired_channels(layout_map, tags, AAC_CHANNEL_FRONT, &i);
-    if (num_front_channels < 0)
-        return 0;
-    num_side_channels =
-        count_paired_channels(layout_map, tags, AAC_CHANNEL_SIDE, &i);
-    if (num_side_channels < 0)
-        return 0;
-    num_back_channels =
-        count_paired_channels(layout_map, tags, AAC_CHANNEL_BACK, &i);
-    if (num_back_channels < 0)
-        return 0;
-
-    if (num_side_channels == 0 && num_back_channels >= 4) {
-        num_side_channels = 2;
-        num_back_channels -= 2;
-    }
-
-    i = 0;
-    if (num_front_channels & 1) {
-        e2c_vec[i] = (struct elem_to_channel) {
-            .av_position  = AV_CH_FRONT_CENTER,
-            .syn_ele      = TYPE_SCE,
-            .elem_id      = layout_map[i][1],
-            .aac_position = AAC_CHANNEL_FRONT
-        };
-        i++;
-        num_front_channels--;
-    }
-    if (num_front_channels >= 4) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         AV_CH_FRONT_LEFT_OF_CENTER,
-                         AV_CH_FRONT_RIGHT_OF_CENTER,
-                         AAC_CHANNEL_FRONT);
-        num_front_channels -= 2;
-    }
-    if (num_front_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         AV_CH_FRONT_LEFT,
-                         AV_CH_FRONT_RIGHT,
-                         AAC_CHANNEL_FRONT);
-        num_front_channels -= 2;
-    }
-    while (num_front_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         UINT64_MAX,
-                         UINT64_MAX,
-                         AAC_CHANNEL_FRONT);
-        num_front_channels -= 2;
-    }
-
-    if (num_side_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         AV_CH_SIDE_LEFT,
-                         AV_CH_SIDE_RIGHT,
-                         AAC_CHANNEL_FRONT);
-        num_side_channels -= 2;
-    }
-    while (num_side_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         UINT64_MAX,
-                         UINT64_MAX,
-                         AAC_CHANNEL_SIDE);
-        num_side_channels -= 2;
-    }
-
-    while (num_back_channels >= 4) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         UINT64_MAX,
-                         UINT64_MAX,
-                         AAC_CHANNEL_BACK);
-        num_back_channels -= 2;
-    }
-    if (num_back_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         AV_CH_BACK_LEFT,
-                         AV_CH_BACK_RIGHT,
-                         AAC_CHANNEL_BACK);
-        num_back_channels -= 2;
-    }
-    if (num_back_channels) {
-        e2c_vec[i] = (struct elem_to_channel) {
-            .av_position  = AV_CH_BACK_CENTER,
-            .syn_ele      = TYPE_SCE,
-            .elem_id      = layout_map[i][1],
-            .aac_position = AAC_CHANNEL_BACK
-        };
-        i++;
-        num_back_channels--;
-    }
-
-    if (i < tags && layout_map[i][2] == AAC_CHANNEL_LFE) {
-        e2c_vec[i] = (struct elem_to_channel) {
-            .av_position  = AV_CH_LOW_FREQUENCY,
-            .syn_ele      = TYPE_LFE,
-            .elem_id      = layout_map[i][1],
-            .aac_position = AAC_CHANNEL_LFE
-        };
-        i++;
-    }
-    while (i < tags && layout_map[i][2] == AAC_CHANNEL_LFE) {
-        e2c_vec[i] = (struct elem_to_channel) {
-            .av_position  = UINT64_MAX,
-            .syn_ele      = TYPE_LFE,
-            .elem_id      = layout_map[i][1],
-            .aac_position = AAC_CHANNEL_LFE
-        };
-        i++;
-    }
-
-    // Must choose a stable sort
-    total_non_cc_elements = n = i;
-    do {
-        int next_n = 0;
-        for (i = 1; i < n; i++)
-            if (e2c_vec[i - 1].av_position > e2c_vec[i].av_position) {
-                FFSWAP(struct elem_to_channel, e2c_vec[i - 1], e2c_vec[i]);
-                next_n = i;
-            }
-        n = next_n;
-    } while (n > 0);
-
-    layout = 0;
-    for (i = 0; i < total_non_cc_elements; i++) {
-        layout_map[i][0] = e2c_vec[i].syn_ele;
-        layout_map[i][1] = e2c_vec[i].elem_id;
-        layout_map[i][2] = e2c_vec[i].aac_position;
-        if (e2c_vec[i].av_position != UINT64_MAX) {
-            layout |= e2c_vec[i].av_position;
-        }
-    }
-
-    return layout;
-}
-
-/**
- * Save current output configuration if and only if it has been locked.
- */
-static void push_output_configuration(AACContext *ac) {
-    if (ac->oc[1].status == OC_LOCKED) {
-        ac->oc[0] = ac->oc[1];
-    }
-    ac->oc[1].status = OC_NONE;
-}
-
-/**
- * Restore the previous output configuration if and only if the current
- * configuration is unlocked.
- */
-static void pop_output_configuration(AACContext *ac) {
-    if (ac->oc[1].status != OC_LOCKED && ac->oc[0].status != OC_NONE) {
-        ac->oc[1] = ac->oc[0];
-        ac->avctx->channels = ac->oc[1].channels;
-        ac->avctx->channel_layout = ac->oc[1].channel_layout;
-    }
-}
-
-/**
- * Configure output channel order based on the current program
- * configuration element.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int output_configure(AACContext *ac,
-                            uint8_t layout_map[MAX_ELEM_ID * 4][3], int tags,
-                            enum OCStatus oc_type, int get_new_frame)
-{
-    AVCodecContext *avctx = ac->avctx;
-    int i, channels = 0, ret;
-    uint64_t layout = 0;
-    uint8_t id_map[TYPE_END][MAX_ELEM_ID] = {{ 0 }};
-    uint8_t type_counts[TYPE_END] = { 0 };
-
-    if (ac->oc[1].layout_map != layout_map) {
-        memcpy(ac->oc[1].layout_map, layout_map, tags * sizeof(layout_map[0]));
-        ac->oc[1].layout_map_tags = tags;
-    }
-    for (i = 0; i < tags; i++) {
-        int type =         layout_map[i][0];
-        int id =           layout_map[i][1];
-        id_map[type][id] = type_counts[type]++;
-    }
-    // Try to sniff a reasonable channel order, otherwise output the
-    // channels in the order the PCE declared them.
-    if (avctx->request_channel_layout != AV_CH_LAYOUT_NATIVE)
-        layout = sniff_channel_order(layout_map, tags);
-    for (i = 0; i < tags; i++) {
-        int type =     layout_map[i][0];
-        int id =       layout_map[i][1];
-        int iid =      id_map[type][id];
-        int position = layout_map[i][2];
-        // Allocate or free elements depending on if they are in the
-        // current program configuration.
-        ret = che_configure(ac, position, type, iid, &channels);
-        if (ret < 0)
-            return ret;
-        ac->tag_che_map[type][id] = ac->che[type][iid];
-    }
-    if (ac->oc[1].m4ac.ps == 1 && channels == 2) {
-        if (layout == AV_CH_FRONT_CENTER) {
-            layout = AV_CH_FRONT_LEFT|AV_CH_FRONT_RIGHT;
-        } else {
-            layout = 0;
-        }
-    }
-
-    avctx->channel_layout = ac->oc[1].channel_layout = layout;
-    avctx->channels       = ac->oc[1].channels       = channels;
-    ac->oc[1].status = oc_type;
-
-    if (get_new_frame) {
-        if ((ret = frame_configure_elements(ac->avctx)) < 0)
-            return ret;
-    }
-
-    return 0;
-}
-
-/**
- * Set up channel positions based on a default channel configuration
- * as specified in table 1.17.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int set_default_channel_config(AVCodecContext *avctx,
-                                      uint8_t (*layout_map)[3],
-                                      int *tags,
-                                      int channel_config)
-{
-    if (channel_config < 1 || (channel_config > 7 && channel_config < 11) ||
-        channel_config > 12) {
-        av_log(avctx, AV_LOG_ERROR,
-               "invalid default channel configuration (%d)\n",
-               channel_config);
-        return AVERROR_INVALIDDATA;
-    }
-    *tags = tags_per_config[channel_config];
-    memcpy(layout_map, aac_channel_layout_map[channel_config - 1],
-           *tags * sizeof(*layout_map));
-    return 0;
-}
-
-static ChannelElement *get_che(AACContext *ac, int type, int elem_id)
-{
-    /* For PCE based channel configurations map the channels solely based
-     * on tags. */
-    if (!ac->oc[1].m4ac.chan_config) {
-        return ac->tag_che_map[type][elem_id];
-    }
-    // Allow single CPE stereo files to be signalled with mono configuration.
-    if (!ac->tags_mapped && type == TYPE_CPE &&
-        ac->oc[1].m4ac.chan_config == 1) {
-        uint8_t layout_map[MAX_ELEM_ID*4][3];
-        int layout_map_tags;
-        push_output_configuration(ac);
-
-        if (set_default_channel_config(ac->avctx, layout_map,
-                                       &layout_map_tags, 2) < 0)
-            return NULL;
-        if (output_configure(ac, layout_map, layout_map_tags,
-                             OC_TRIAL_FRAME, 1) < 0)
-            return NULL;
-
-        ac->oc[1].m4ac.chan_config = 2;
-        ac->oc[1].m4ac.ps = 0;
-    }
-    // And vice-versa
-    if (!ac->tags_mapped && type == TYPE_SCE &&
-        ac->oc[1].m4ac.chan_config == 2) {
-        uint8_t layout_map[MAX_ELEM_ID * 4][3];
-        int layout_map_tags;
-        push_output_configuration(ac);
-
-        if (set_default_channel_config(ac->avctx, layout_map,
-                                       &layout_map_tags, 1) < 0)
-            return NULL;
-        if (output_configure(ac, layout_map, layout_map_tags,
-                             OC_TRIAL_FRAME, 1) < 0)
-            return NULL;
-
-        ac->oc[1].m4ac.chan_config = 1;
-        if (ac->oc[1].m4ac.sbr)
-            ac->oc[1].m4ac.ps = -1;
-    }
-    /* For indexed channel configurations map the channels solely based
-     * on position. */
-    switch (ac->oc[1].m4ac.chan_config) {
-    case 12:
-    case 7:
-        if (ac->tags_mapped == 3 && type == TYPE_CPE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][2];
-        }
-    case 11:
-        if (ac->tags_mapped == 2 &&
-            ac->oc[1].m4ac.chan_config == 11 &&
-            type == TYPE_SCE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
-        }
-    case 6:
-        /* Some streams incorrectly code 5.1 audio as
-         * SCE[0] CPE[0] CPE[1] SCE[1]
-         * instead of
-         * SCE[0] CPE[0] CPE[1] LFE[0].
-         * If we seem to have encountered such a stream, transfer
-         * the LFE[0] element to the SCE[1]'s mapping */
-        if (ac->tags_mapped == tags_per_config[ac->oc[1].m4ac.chan_config] - 1 && (type == TYPE_LFE || type == TYPE_SCE)) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[type][elem_id] = ac->che[TYPE_LFE][0];
-        }
-    case 5:
-        if (ac->tags_mapped == 2 && type == TYPE_CPE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][1];
-        }
-    case 4:
-        if (ac->tags_mapped == 2 &&
-            ac->oc[1].m4ac.chan_config == 4 &&
-            type == TYPE_SCE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
-        }
-    case 3:
-    case 2:
-        if (ac->tags_mapped == (ac->oc[1].m4ac.chan_config != 2) &&
-            type == TYPE_CPE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][0];
-        } else if (ac->oc[1].m4ac.chan_config == 2) {
-            return NULL;
-        }
-    case 1:
-        if (!ac->tags_mapped && type == TYPE_SCE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][0];
-        }
-    default:
-        return NULL;
-    }
-}
-
-/**
- * Decode an array of 4 bit element IDs, optionally interleaved with a
- * stereo/mono switching bit.
- *
- * @param type speaker type/position for these channels
- */
-static void decode_channel_map(uint8_t layout_map[][3],
-                               enum ChannelPosition type,
-                               GetBitContext *gb, int n)
-{
-    while (n--) {
-        enum RawDataBlockType syn_ele;
-        switch (type) {
-        case AAC_CHANNEL_FRONT:
-        case AAC_CHANNEL_BACK:
-        case AAC_CHANNEL_SIDE:
-            syn_ele = get_bits1(gb);
-            break;
-        case AAC_CHANNEL_CC:
-            skip_bits1(gb);
-            syn_ele = TYPE_CCE;
-            break;
-        case AAC_CHANNEL_LFE:
-            syn_ele = TYPE_LFE;
-            break;
-        default:
-            // AAC_CHANNEL_OFF has no channel map
-            return;
-        }
-        layout_map[0][0] = syn_ele;
-        layout_map[0][1] = get_bits(gb, 4);
-        layout_map[0][2] = type;
-        layout_map++;
-    }
-}
-
-/**
- * Decode program configuration element; reference: table 4.2.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_pce(AVCodecContext *avctx, MPEG4AudioConfig *m4ac,
-                      uint8_t (*layout_map)[3],
-                      GetBitContext *gb)
-{
-    int num_front, num_side, num_back, num_lfe, num_assoc_data, num_cc;
-    int sampling_index;
-    int comment_len;
-    int tags;
-
-    skip_bits(gb, 2);  // object_type
-
-    sampling_index = get_bits(gb, 4);
-    if (m4ac->sampling_index != sampling_index)
-        av_log(avctx, AV_LOG_WARNING,
-               "Sample rate index in program config element does not "
-               "match the sample rate index configured by the container.\n");
-
-    num_front       = get_bits(gb, 4);
-    num_side        = get_bits(gb, 4);
-    num_back        = get_bits(gb, 4);
-    num_lfe         = get_bits(gb, 2);
-    num_assoc_data  = get_bits(gb, 3);
-    num_cc          = get_bits(gb, 4);
-
-    if (get_bits1(gb))
-        skip_bits(gb, 4); // mono_mixdown_tag
-    if (get_bits1(gb))
-        skip_bits(gb, 4); // stereo_mixdown_tag
-
-    if (get_bits1(gb))
-        skip_bits(gb, 3); // mixdown_coeff_index and pseudo_surround
-
-    decode_channel_map(layout_map       , AAC_CHANNEL_FRONT, gb, num_front);
-    tags = num_front;
-    decode_channel_map(layout_map + tags, AAC_CHANNEL_SIDE,  gb, num_side);
-    tags += num_side;
-    decode_channel_map(layout_map + tags, AAC_CHANNEL_BACK,  gb, num_back);
-    tags += num_back;
-    decode_channel_map(layout_map + tags, AAC_CHANNEL_LFE,   gb, num_lfe);
-    tags += num_lfe;
-
-    skip_bits_long(gb, 4 * num_assoc_data);
-
-    decode_channel_map(layout_map + tags, AAC_CHANNEL_CC,    gb, num_cc);
-    tags += num_cc;
-
-    align_get_bits(gb);
-
-    /* comment field, first byte is length */
-    comment_len = get_bits(gb, 8) * 8;
-    if (get_bits_left(gb) < comment_len) {
-        av_log(avctx, AV_LOG_ERROR, overread_err);
-        return AVERROR_INVALIDDATA;
-    }
-    skip_bits_long(gb, comment_len);
-    return tags;
-}
-
-/**
- * Decode GA "General Audio" specific configuration; reference: table 4.1.
- *
- * @param   ac          pointer to AACContext, may be null
- * @param   avctx       pointer to AVCCodecContext, used for logging
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_ga_specific_config(AACContext *ac, AVCodecContext *avctx,
-                                     GetBitContext *gb,
-                                     MPEG4AudioConfig *m4ac,
-                                     int channel_config)
-{
-    int extension_flag, ret, ep_config, res_flags;
-    uint8_t layout_map[MAX_ELEM_ID*4][3];
-    int tags = 0;
-
-    if (get_bits1(gb)) { // frameLengthFlag
-        avpriv_request_sample(avctx, "960/120 MDCT window");
-        return AVERROR_PATCHWELCOME;
-    }
-    m4ac->frame_length_short = 0;
-
-    if (get_bits1(gb))       // dependsOnCoreCoder
-        skip_bits(gb, 14);   // coreCoderDelay
-    extension_flag = get_bits1(gb);
-
-    if (m4ac->object_type == AOT_AAC_SCALABLE ||
-        m4ac->object_type == AOT_ER_AAC_SCALABLE)
-        skip_bits(gb, 3);     // layerNr
-
-    if (channel_config == 0) {
-        skip_bits(gb, 4);  // element_instance_tag
-        tags = decode_pce(avctx, m4ac, layout_map, gb);
-        if (tags < 0)
-            return tags;
-    } else {
-        if ((ret = set_default_channel_config(avctx, layout_map,
-                                              &tags, channel_config)))
-            return ret;
-    }
-
-    if (count_channels(layout_map, tags) > 1) {
-        m4ac->ps = 0;
-    } else if (m4ac->sbr == 1 && m4ac->ps == -1)
-        m4ac->ps = 1;
-
-    if (ac && (ret = output_configure(ac, layout_map, tags, OC_GLOBAL_HDR, 0)))
-        return ret;
-
-    if (extension_flag) {
-        switch (m4ac->object_type) {
-        case AOT_ER_BSAC:
-            skip_bits(gb, 5);    // numOfSubFrame
-            skip_bits(gb, 11);   // layer_length
-            break;
-        case AOT_ER_AAC_LC:
-        case AOT_ER_AAC_LTP:
-        case AOT_ER_AAC_SCALABLE:
-        case AOT_ER_AAC_LD:
-            res_flags = get_bits(gb, 3);
-            if (res_flags) {
-                avpriv_report_missing_feature(avctx,
-                                              "AAC data resilience (flags %x)",
-                                              res_flags);
-                return AVERROR_PATCHWELCOME;
-            }
-            break;
-        }
-        skip_bits1(gb);    // extensionFlag3 (TBD in version 3)
-    }
-    switch (m4ac->object_type) {
-    case AOT_ER_AAC_LC:
-    case AOT_ER_AAC_LTP:
-    case AOT_ER_AAC_SCALABLE:
-    case AOT_ER_AAC_LD:
-        ep_config = get_bits(gb, 2);
-        if (ep_config) {
-            avpriv_report_missing_feature(avctx,
-                                          "epConfig %d", ep_config);
-            return AVERROR_PATCHWELCOME;
-        }
-    }
-    return 0;
-}
-
-static int decode_eld_specific_config(AACContext *ac, AVCodecContext *avctx,
-                                     GetBitContext *gb,
-                                     MPEG4AudioConfig *m4ac,
-                                     int channel_config)
-{
-    int ret, ep_config, res_flags;
-    uint8_t layout_map[MAX_ELEM_ID*4][3];
-    int tags = 0;
-    const int ELDEXT_TERM = 0;
-
-    m4ac->ps  = 0;
-    m4ac->sbr = 0;
-
-    m4ac->frame_length_short = get_bits1(gb);
-    res_flags = get_bits(gb, 3);
-    if (res_flags) {
-        avpriv_report_missing_feature(avctx,
-                                      "AAC data resilience (flags %x)",
-                                      res_flags);
-        return AVERROR_PATCHWELCOME;
-    }
-
-    if (get_bits1(gb)) { // ldSbrPresentFlag
-        avpriv_report_missing_feature(avctx,
-                                      "Low Delay SBR");
-        return AVERROR_PATCHWELCOME;
-    }
-
-    while (get_bits(gb, 4) != ELDEXT_TERM) {
-        int len = get_bits(gb, 4);
-        if (len == 15)
-            len += get_bits(gb, 8);
-        if (len == 15 + 255)
-            len += get_bits(gb, 16);
-        if (get_bits_left(gb) < len * 8 + 4) {
-            av_log(avctx, AV_LOG_ERROR, overread_err);
-            return AVERROR_INVALIDDATA;
-        }
-        skip_bits_long(gb, 8 * len);
-    }
-
-    if ((ret = set_default_channel_config(avctx, layout_map,
-                                          &tags, channel_config)))
-        return ret;
-
-    if (ac && (ret = output_configure(ac, layout_map, tags, OC_GLOBAL_HDR, 0)))
-        return ret;
-
-    ep_config = get_bits(gb, 2);
-    if (ep_config) {
-        avpriv_report_missing_feature(avctx,
-                                      "epConfig %d", ep_config);
-        return AVERROR_PATCHWELCOME;
-    }
-    return 0;
-}
-
-/**
- * Decode audio specific configuration; reference: table 1.13.
- *
- * @param   ac          pointer to AACContext, may be null
- * @param   avctx       pointer to AVCCodecContext, used for logging
- * @param   m4ac        pointer to MPEG4AudioConfig, used for parsing
- * @param   data        pointer to buffer holding an audio specific config
- * @param   bit_size    size of audio specific config or data in bits
- * @param   sync_extension look for an appended sync extension
- *
- * @return  Returns error status or number of consumed bits. <0 - error
- */
-static int decode_audio_specific_config(AACContext *ac,
-                                        AVCodecContext *avctx,
-                                        MPEG4AudioConfig *m4ac,
-                                        const uint8_t *data, int bit_size,
-                                        int sync_extension)
-{
-    GetBitContext gb;
-    int i, ret;
-
-    ff_dlog(avctx, "extradata size %d\n", avctx->extradata_size);
-    for (i = 0; i < avctx->extradata_size; i++)
-        ff_dlog(avctx, "%02x ", avctx->extradata[i]);
-    ff_dlog(avctx, "\n");
-
-    if ((ret = init_get_bits(&gb, data, bit_size)) < 0)
-        return ret;
-
-    if ((i = avpriv_mpeg4audio_get_config(m4ac, data, bit_size,
-                                          sync_extension)) < 0)
-        return AVERROR_INVALIDDATA;
-    if (m4ac->sampling_index > 12) {
-        av_log(avctx, AV_LOG_ERROR,
-               "invalid sampling rate index %d\n",
-               m4ac->sampling_index);
-        return AVERROR_INVALIDDATA;
-    }
-    if (m4ac->object_type == AOT_ER_AAC_LD &&
-        (m4ac->sampling_index < 3 || m4ac->sampling_index > 7)) {
-        av_log(avctx, AV_LOG_ERROR,
-               "invalid low delay sampling rate index %d\n",
-               m4ac->sampling_index);
-        return AVERROR_INVALIDDATA;
-    }
-
-    skip_bits_long(&gb, i);
-
-    switch (m4ac->object_type) {
-    case AOT_AAC_MAIN:
-    case AOT_AAC_LC:
-    case AOT_AAC_LTP:
-    case AOT_ER_AAC_LC:
-    case AOT_ER_AAC_LD:
-        if ((ret = decode_ga_specific_config(ac, avctx, &gb,
-                                            m4ac, m4ac->chan_config)) < 0)
-            return ret;
-        break;
-    case AOT_ER_AAC_ELD:
-        if ((ret = decode_eld_specific_config(ac, avctx, &gb,
-                                              m4ac, m4ac->chan_config)) < 0)
-            return ret;
-        break;
-    default:
-        avpriv_report_missing_feature(avctx,
-                                      "Audio object type %s%d",
-                                      m4ac->sbr == 1 ? "SBR+" : "",
-                                      m4ac->object_type);
-        return AVERROR(ENOSYS);
-    }
-
-    ff_dlog(avctx,
-            "AOT %d chan config %d sampling index %d (%d) SBR %d PS %d\n",
-            m4ac->object_type, m4ac->chan_config, m4ac->sampling_index,
-            m4ac->sample_rate, m4ac->sbr,
-            m4ac->ps);
-
-    return get_bits_count(&gb);
-}
-
-/**
- * linear congruential pseudorandom number generator
- *
- * @param   previous_val    pointer to the current state of the generator
- *
- * @return  Returns a 32-bit pseudorandom integer
- */
-static av_always_inline int lcg_random(int previous_val)
-{
-    union { unsigned u; int s; } v = { previous_val * 1664525u + 1013904223 };
-    return v.s;
-}
-
 static av_always_inline void reset_predict_state(PredictorState *ps)
 {
     ps->r0   = 0.0f;
@@ -965,499 +78,6 @@ static av_always_inline void reset_predict_state(PredictorState *ps)
     ps->var1 = 1.0f;
 }
 
-static void reset_all_predictors(PredictorState *ps)
-{
-    int i;
-    for (i = 0; i < MAX_PREDICTORS; i++)
-        reset_predict_state(&ps[i]);
-}
-
-static int sample_rate_idx (int rate)
-{
-         if (92017 <= rate) return 0;
-    else if (75132 <= rate) return 1;
-    else if (55426 <= rate) return 2;
-    else if (46009 <= rate) return 3;
-    else if (37566 <= rate) return 4;
-    else if (27713 <= rate) return 5;
-    else if (23004 <= rate) return 6;
-    else if (18783 <= rate) return 7;
-    else if (13856 <= rate) return 8;
-    else if (11502 <= rate) return 9;
-    else if (9391  <= rate) return 10;
-    else                    return 11;
-}
-
-static void reset_predictor_group(PredictorState *ps, int group_num)
-{
-    int i;
-    for (i = group_num - 1; i < MAX_PREDICTORS; i += 30)
-        reset_predict_state(&ps[i]);
-}
-
-#define AAC_INIT_VLC_STATIC(num, size)                                     \
-    INIT_VLC_STATIC(&vlc_spectral[num], 8, ff_aac_spectral_sizes[num],     \
-         ff_aac_spectral_bits[num], sizeof(ff_aac_spectral_bits[num][0]),  \
-                                    sizeof(ff_aac_spectral_bits[num][0]),  \
-        ff_aac_spectral_codes[num], sizeof(ff_aac_spectral_codes[num][0]), \
-                                    sizeof(ff_aac_spectral_codes[num][0]), \
-        size);
-
-static av_cold int aac_decode_init(AVCodecContext *avctx)
-{
-    AACContext *ac = avctx->priv_data;
-    int ret;
-
-    ac->avctx = avctx;
-    ac->oc[1].m4ac.sample_rate = avctx->sample_rate;
-
-    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
-
-    if (avctx->extradata_size > 0) {
-        if ((ret = decode_audio_specific_config(ac, ac->avctx, &ac->oc[1].m4ac,
-                                                avctx->extradata,
-                                                avctx->extradata_size * 8,
-                                                1)) < 0)
-            return ret;
-    } else {
-        int sr, i;
-        uint8_t layout_map[MAX_ELEM_ID*4][3];
-        int layout_map_tags;
-
-        sr = sample_rate_idx(avctx->sample_rate);
-        ac->oc[1].m4ac.sampling_index = sr;
-        ac->oc[1].m4ac.channels = avctx->channels;
-        ac->oc[1].m4ac.sbr = -1;
-        ac->oc[1].m4ac.ps = -1;
-
-        for (i = 0; i < FF_ARRAY_ELEMS(ff_mpeg4audio_channels); i++)
-            if (ff_mpeg4audio_channels[i] == avctx->channels)
-                break;
-        if (i == FF_ARRAY_ELEMS(ff_mpeg4audio_channels)) {
-            i = 0;
-        }
-        ac->oc[1].m4ac.chan_config = i;
-
-        if (ac->oc[1].m4ac.chan_config) {
-            int ret = set_default_channel_config(avctx, layout_map,
-                &layout_map_tags, ac->oc[1].m4ac.chan_config);
-            if (!ret)
-                output_configure(ac, layout_map, layout_map_tags,
-                                 OC_GLOBAL_HDR, 0);
-            else if (avctx->err_recognition & AV_EF_EXPLODE)
-                return AVERROR_INVALIDDATA;
-        }
-    }
-
-    AAC_INIT_VLC_STATIC( 0, 304);
-    AAC_INIT_VLC_STATIC( 1, 270);
-    AAC_INIT_VLC_STATIC( 2, 550);
-    AAC_INIT_VLC_STATIC( 3, 300);
-    AAC_INIT_VLC_STATIC( 4, 328);
-    AAC_INIT_VLC_STATIC( 5, 294);
-    AAC_INIT_VLC_STATIC( 6, 306);
-    AAC_INIT_VLC_STATIC( 7, 268);
-    AAC_INIT_VLC_STATIC( 8, 510);
-    AAC_INIT_VLC_STATIC( 9, 366);
-    AAC_INIT_VLC_STATIC(10, 462);
-
-    ff_aac_sbr_init();
-
-    avpriv_float_dsp_init(&ac->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
-
-    ac->random_state = 0x1f2e3d4c;
-
-    ff_aac_tableinit();
-
-    INIT_VLC_STATIC(&vlc_scalefactors, 7,
-                    FF_ARRAY_ELEMS(ff_aac_scalefactor_code),
-                    ff_aac_scalefactor_bits,
-                    sizeof(ff_aac_scalefactor_bits[0]),
-                    sizeof(ff_aac_scalefactor_bits[0]),
-                    ff_aac_scalefactor_code,
-                    sizeof(ff_aac_scalefactor_code[0]),
-                    sizeof(ff_aac_scalefactor_code[0]),
-                    352);
-
-    ff_mdct_init(&ac->mdct,       11, 1, 1.0 / (32768.0 * 1024.0));
-    ff_mdct_init(&ac->mdct_ld,    10, 1, 1.0 / (32768.0 * 512.0));
-    ff_mdct_init(&ac->mdct_small,  8, 1, 1.0 / (32768.0 * 128.0));
-    ff_mdct_init(&ac->mdct_ltp,   11, 0, -2.0 * 32768.0);
-    ret = ff_imdct15_init(&ac->mdct480, 5);
-    if (ret < 0)
-        return ret;
-
-    // window initialization
-    ff_kbd_window_init(ff_aac_kbd_long_1024, 4.0, 1024);
-    ff_kbd_window_init(ff_aac_kbd_short_128, 6.0, 128);
-    ff_init_ff_sine_windows(10);
-    ff_init_ff_sine_windows( 9);
-    ff_init_ff_sine_windows( 7);
-
-    cbrt_tableinit();
-
-    return 0;
-}
-
-/**
- * Skip data_stream_element; reference: table 4.10.
- */
-static int skip_data_stream_element(AACContext *ac, GetBitContext *gb)
-{
-    int byte_align = get_bits1(gb);
-    int count = get_bits(gb, 8);
-    if (count == 255)
-        count += get_bits(gb, 8);
-    if (byte_align)
-        align_get_bits(gb);
-
-    if (get_bits_left(gb) < 8 * count) {
-        av_log(ac->avctx, AV_LOG_ERROR, overread_err);
-        return AVERROR_INVALIDDATA;
-    }
-    skip_bits_long(gb, 8 * count);
-    return 0;
-}
-
-static int decode_prediction(AACContext *ac, IndividualChannelStream *ics,
-                             GetBitContext *gb)
-{
-    int sfb;
-    if (get_bits1(gb)) {
-        ics->predictor_reset_group = get_bits(gb, 5);
-        if (ics->predictor_reset_group == 0 ||
-            ics->predictor_reset_group > 30) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "Invalid Predictor Reset Group.\n");
-            return AVERROR_INVALIDDATA;
-        }
-    }
-    for (sfb = 0; sfb < FFMIN(ics->max_sfb, ff_aac_pred_sfb_max[ac->oc[1].m4ac.sampling_index]); sfb++) {
-        ics->prediction_used[sfb] = get_bits1(gb);
-    }
-    return 0;
-}
-
-/**
- * Decode Long Term Prediction data; reference: table 4.xx.
- */
-static void decode_ltp(LongTermPrediction *ltp,
-                       GetBitContext *gb, uint8_t max_sfb)
-{
-    int sfb;
-
-    ltp->lag  = get_bits(gb, 11);
-    ltp->coef = ltp_coef[get_bits(gb, 3)];
-    for (sfb = 0; sfb < FFMIN(max_sfb, MAX_LTP_LONG_SFB); sfb++)
-        ltp->used[sfb] = get_bits1(gb);
-}
-
-/**
- * Decode Individual Channel Stream info; reference: table 4.6.
- */
-static int decode_ics_info(AACContext *ac, IndividualChannelStream *ics,
-                           GetBitContext *gb)
-{
-    const MPEG4AudioConfig *const m4ac = &ac->oc[1].m4ac;
-    const int aot = m4ac->object_type;
-    const int sampling_index = m4ac->sampling_index;
-    if (aot != AOT_ER_AAC_ELD) {
-        if (get_bits1(gb)) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Reserved bit set.\n");
-            if (ac->avctx->err_recognition & AV_EF_BITSTREAM)
-                return AVERROR_INVALIDDATA;
-        }
-        ics->window_sequence[1] = ics->window_sequence[0];
-        ics->window_sequence[0] = get_bits(gb, 2);
-        if (aot == AOT_ER_AAC_LD &&
-            ics->window_sequence[0] != ONLY_LONG_SEQUENCE) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "AAC LD is only defined for ONLY_LONG_SEQUENCE but "
-                   "window sequence %d found.\n", ics->window_sequence[0]);
-            ics->window_sequence[0] = ONLY_LONG_SEQUENCE;
-            return AVERROR_INVALIDDATA;
-        }
-        ics->use_kb_window[1]   = ics->use_kb_window[0];
-        ics->use_kb_window[0]   = get_bits1(gb);
-    }
-    ics->num_window_groups  = 1;
-    ics->group_len[0]       = 1;
-    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        int i;
-        ics->max_sfb = get_bits(gb, 4);
-        for (i = 0; i < 7; i++) {
-            if (get_bits1(gb)) {
-                ics->group_len[ics->num_window_groups - 1]++;
-            } else {
-                ics->num_window_groups++;
-                ics->group_len[ics->num_window_groups - 1] = 1;
-            }
-        }
-        ics->num_windows       = 8;
-        ics->swb_offset        =    ff_swb_offset_128[sampling_index];
-        ics->num_swb           =   ff_aac_num_swb_128[sampling_index];
-        ics->tns_max_bands     = ff_tns_max_bands_128[sampling_index];
-        ics->predictor_present = 0;
-    } else {
-        ics->max_sfb           = get_bits(gb, 6);
-        ics->num_windows       = 1;
-        if (aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD) {
-            if (m4ac->frame_length_short) {
-                ics->swb_offset    =     ff_swb_offset_480[sampling_index];
-                ics->num_swb       =    ff_aac_num_swb_480[sampling_index];
-                ics->tns_max_bands =  ff_tns_max_bands_480[sampling_index];
-            } else {
-                ics->swb_offset    =     ff_swb_offset_512[sampling_index];
-                ics->num_swb       =    ff_aac_num_swb_512[sampling_index];
-                ics->tns_max_bands =  ff_tns_max_bands_512[sampling_index];
-            }
-            if (!ics->num_swb || !ics->swb_offset)
-                return AVERROR_BUG;
-        } else {
-            ics->swb_offset    =    ff_swb_offset_1024[sampling_index];
-            ics->num_swb       =   ff_aac_num_swb_1024[sampling_index];
-            ics->tns_max_bands = ff_tns_max_bands_1024[sampling_index];
-        }
-        if (aot != AOT_ER_AAC_ELD) {
-            ics->predictor_present     = get_bits1(gb);
-            ics->predictor_reset_group = 0;
-        }
-        if (ics->predictor_present) {
-            if (aot == AOT_AAC_MAIN) {
-                if (decode_prediction(ac, ics, gb)) {
-                    return AVERROR_INVALIDDATA;
-                }
-            } else if (aot == AOT_AAC_LC ||
-                       aot == AOT_ER_AAC_LC) {
-                av_log(ac->avctx, AV_LOG_ERROR,
-                       "Prediction is not allowed in AAC-LC.\n");
-                return AVERROR_INVALIDDATA;
-            } else {
-                if (aot == AOT_ER_AAC_LD) {
-                    av_log(ac->avctx, AV_LOG_ERROR,
-                           "LTP in ER AAC LD not yet implemented.\n");
-                    return AVERROR_PATCHWELCOME;
-                }
-                if ((ics->ltp.present = get_bits(gb, 1)))
-                    decode_ltp(&ics->ltp, gb, ics->max_sfb);
-            }
-        }
-    }
-
-    if (ics->max_sfb > ics->num_swb) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Number of scalefactor bands in group (%d) "
-               "exceeds limit (%d).\n",
-               ics->max_sfb, ics->num_swb);
-        return AVERROR_INVALIDDATA;
-    }
-
-    return 0;
-}
-
-/**
- * Decode band types (section_data payload); reference: table 4.46.
- *
- * @param   band_type           array of the used band type
- * @param   band_type_run_end   array of the last scalefactor band of a band type run
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_band_types(AACContext *ac, enum BandType band_type[120],
-                             int band_type_run_end[120], GetBitContext *gb,
-                             IndividualChannelStream *ics)
-{
-    int g, idx = 0;
-    const int bits = (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) ? 3 : 5;
-    for (g = 0; g < ics->num_window_groups; g++) {
-        int k = 0;
-        while (k < ics->max_sfb) {
-            uint8_t sect_end = k;
-            int sect_len_incr;
-            int sect_band_type = get_bits(gb, 4);
-            if (sect_band_type == 12) {
-                av_log(ac->avctx, AV_LOG_ERROR, "invalid band type\n");
-                return AVERROR_INVALIDDATA;
-            }
-            do {
-                sect_len_incr = get_bits(gb, bits);
-                sect_end += sect_len_incr;
-                if (get_bits_left(gb) < 0) {
-                    av_log(ac->avctx, AV_LOG_ERROR, overread_err);
-                    return AVERROR_INVALIDDATA;
-                }
-                if (sect_end > ics->max_sfb) {
-                    av_log(ac->avctx, AV_LOG_ERROR,
-                           "Number of bands (%d) exceeds limit (%d).\n",
-                           sect_end, ics->max_sfb);
-                    return AVERROR_INVALIDDATA;
-                }
-            } while (sect_len_incr == (1 << bits) - 1);
-            for (; k < sect_end; k++) {
-                band_type        [idx]   = sect_band_type;
-                band_type_run_end[idx++] = sect_end;
-            }
-        }
-    }
-    return 0;
-}
-
-/**
- * Decode scalefactors; reference: table 4.47.
- *
- * @param   global_gain         first scalefactor value as scalefactors are differentially coded
- * @param   band_type           array of the used band type
- * @param   band_type_run_end   array of the last scalefactor band of a band type run
- * @param   sf                  array of scalefactors or intensity stereo positions
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_scalefactors(AACContext *ac, float sf[120], GetBitContext *gb,
-                               unsigned int global_gain,
-                               IndividualChannelStream *ics,
-                               enum BandType band_type[120],
-                               int band_type_run_end[120])
-{
-    int g, i, idx = 0;
-    int offset[3] = { global_gain, global_gain - 90, 0 };
-    int clipped_offset;
-    int noise_flag = 1;
-    for (g = 0; g < ics->num_window_groups; g++) {
-        for (i = 0; i < ics->max_sfb;) {
-            int run_end = band_type_run_end[idx];
-            if (band_type[idx] == ZERO_BT) {
-                for (; i < run_end; i++, idx++)
-                    sf[idx] = 0.0;
-            } else if ((band_type[idx] == INTENSITY_BT) ||
-                       (band_type[idx] == INTENSITY_BT2)) {
-                for (; i < run_end; i++, idx++) {
-                    offset[2] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
-                    clipped_offset = av_clip(offset[2], -155, 100);
-                    if (offset[2] != clipped_offset) {
-                        avpriv_request_sample(ac->avctx,
-                                              "If you heard an audible artifact, there may be a bug in the decoder. "
-                                              "Clipped intensity stereo position (%d -> %d)",
-                                              offset[2], clipped_offset);
-                    }
-                    sf[idx] = ff_aac_pow2sf_tab[-clipped_offset + POW_SF2_ZERO];
-                }
-            } else if (band_type[idx] == NOISE_BT) {
-                for (; i < run_end; i++, idx++) {
-                    if (noise_flag-- > 0)
-                        offset[1] += get_bits(gb, 9) - 256;
-                    else
-                        offset[1] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
-                    clipped_offset = av_clip(offset[1], -100, 155);
-                    if (offset[1] != clipped_offset) {
-                        avpriv_request_sample(ac->avctx,
-                                              "If you heard an audible artifact, there may be a bug in the decoder. "
-                                              "Clipped noise gain (%d -> %d)",
-                                              offset[1], clipped_offset);
-                    }
-                    sf[idx] = -ff_aac_pow2sf_tab[clipped_offset + POW_SF2_ZERO];
-                }
-            } else {
-                for (; i < run_end; i++, idx++) {
-                    offset[0] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
-                    if (offset[0] > 255U) {
-                        av_log(ac->avctx, AV_LOG_ERROR,
-                               "Scalefactor (%d) out of range.\n", offset[0]);
-                        return AVERROR_INVALIDDATA;
-                    }
-                    sf[idx] = -ff_aac_pow2sf_tab[offset[0] - 100 + POW_SF2_ZERO];
-                }
-            }
-        }
-    }
-    return 0;
-}
-
-/**
- * Decode pulse data; reference: table 4.7.
- */
-static int decode_pulses(Pulse *pulse, GetBitContext *gb,
-                         const uint16_t *swb_offset, int num_swb)
-{
-    int i, pulse_swb;
-    pulse->num_pulse = get_bits(gb, 2) + 1;
-    pulse_swb        = get_bits(gb, 6);
-    if (pulse_swb >= num_swb)
-        return -1;
-    pulse->pos[0]    = swb_offset[pulse_swb];
-    pulse->pos[0]   += get_bits(gb, 5);
-    if (pulse->pos[0] > 1023)
-        return -1;
-    pulse->amp[0]    = get_bits(gb, 4);
-    for (i = 1; i < pulse->num_pulse; i++) {
-        pulse->pos[i] = get_bits(gb, 5) + pulse->pos[i - 1];
-        if (pulse->pos[i] > 1023)
-            return -1;
-        pulse->amp[i] = get_bits(gb, 4);
-    }
-    return 0;
-}
-
-/**
- * Decode Temporal Noise Shaping data; reference: table 4.48.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_tns(AACContext *ac, TemporalNoiseShaping *tns,
-                      GetBitContext *gb, const IndividualChannelStream *ics)
-{
-    int w, filt, i, coef_len, coef_res, coef_compress;
-    const int is8 = ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE;
-    const int tns_max_order = is8 ? 7 : ac->oc[1].m4ac.object_type == AOT_AAC_MAIN ? 20 : 12;
-    for (w = 0; w < ics->num_windows; w++) {
-        if ((tns->n_filt[w] = get_bits(gb, 2 - is8))) {
-            coef_res = get_bits1(gb);
-
-            for (filt = 0; filt < tns->n_filt[w]; filt++) {
-                int tmp2_idx;
-                tns->length[w][filt] = get_bits(gb, 6 - 2 * is8);
-
-                if ((tns->order[w][filt] = get_bits(gb, 5 - 2 * is8)) > tns_max_order) {
-                    av_log(ac->avctx, AV_LOG_ERROR,
-                           "TNS filter order %d is greater than maximum %d.\n",
-                           tns->order[w][filt], tns_max_order);
-                    tns->order[w][filt] = 0;
-                    return AVERROR_INVALIDDATA;
-                }
-                if (tns->order[w][filt]) {
-                    tns->direction[w][filt] = get_bits1(gb);
-                    coef_compress = get_bits1(gb);
-                    coef_len = coef_res + 3 - coef_compress;
-                    tmp2_idx = 2 * coef_compress + coef_res;
-
-                    for (i = 0; i < tns->order[w][filt]; i++)
-                        tns->coef[w][filt][i] = tns_tmp2_map[tmp2_idx][get_bits(gb, coef_len)];
-                }
-            }
-        }
-    }
-    return 0;
-}
-
-/**
- * Decode Mid/Side data; reference: table 4.54.
- *
- * @param   ms_present  Indicates mid/side stereo presence. [0] mask is all 0s;
- *                      [1] mask is decoded from bitstream; [2] mask is all 1s;
- *                      [3] reserved for scalable AAC
- */
-static void decode_mid_side_stereo(ChannelElement *cpe, GetBitContext *gb,
-                                   int ms_present)
-{
-    int idx;
-    int max_idx = cpe->ch[0].ics.num_window_groups * cpe->ch[0].ics.max_sfb;
-    if (ms_present == 1) {
-        for (idx = 0; idx < max_idx; idx++)
-            cpe->ms_mask[idx] = get_bits1(gb);
-    } else if (ms_present == 2) {
-        memset(cpe->ms_mask, 1, max_idx * sizeof(cpe->ms_mask[0]));
-    }
-}
-
 #ifndef VMUL2
 static inline float *VMUL2(float *dst, const float *v, unsigned idx,
                            const float *scale)
@@ -1526,233 +146,6 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
 }
 #endif
 
-/**
- * Decode spectral data; reference: table 4.50.
- * Dequantize and scale spectral data; reference: 4.6.3.3.
- *
- * @param   coef            array of dequantized, scaled spectral data
- * @param   sf              array of scalefactors or intensity stereo positions
- * @param   pulse_present   set if pulses are present
- * @param   pulse           pointer to pulse data struct
- * @param   band_type       array of the used band type
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_spectrum_and_dequant(AACContext *ac, float coef[1024],
-                                       GetBitContext *gb, const float sf[120],
-                                       int pulse_present, const Pulse *pulse,
-                                       const IndividualChannelStream *ics,
-                                       enum BandType band_type[120])
-{
-    int i, k, g, idx = 0;
-    const int c = 1024 / ics->num_windows;
-    const uint16_t *offsets = ics->swb_offset;
-    float *coef_base = coef;
-
-    for (g = 0; g < ics->num_windows; g++)
-        memset(coef + g * 128 + offsets[ics->max_sfb], 0,
-               sizeof(float) * (c - offsets[ics->max_sfb]));
-
-    for (g = 0; g < ics->num_window_groups; g++) {
-        unsigned g_len = ics->group_len[g];
-
-        for (i = 0; i < ics->max_sfb; i++, idx++) {
-            const unsigned cbt_m1 = band_type[idx] - 1;
-            float *cfo = coef + offsets[i];
-            int off_len = offsets[i + 1] - offsets[i];
-            int group;
-
-            if (cbt_m1 >= INTENSITY_BT2 - 1) {
-                for (group = 0; group < g_len; group++, cfo+=128) {
-                    memset(cfo, 0, off_len * sizeof(float));
-                }
-            } else if (cbt_m1 == NOISE_BT - 1) {
-                for (group = 0; group < g_len; group++, cfo+=128) {
-                    float scale;
-                    float band_energy;
-
-                    for (k = 0; k < off_len; k++) {
-                        ac->random_state  = lcg_random(ac->random_state);
-                        cfo[k] = ac->random_state;
-                    }
-
-                    band_energy = ac->fdsp.scalarproduct_float(cfo, cfo, off_len);
-                    scale = sf[idx] / sqrtf(band_energy);
-                    ac->fdsp.vector_fmul_scalar(cfo, cfo, scale, off_len);
-                }
-            } else {
-                const float *vq = ff_aac_codebook_vector_vals[cbt_m1];
-                const uint16_t *cb_vector_idx = ff_aac_codebook_vector_idx[cbt_m1];
-                VLC_TYPE (*vlc_tab)[2] = vlc_spectral[cbt_m1].table;
-                OPEN_READER(re, gb);
-
-                switch (cbt_m1 >> 1) {
-                case 0:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned cb_idx;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-                            cb_idx = cb_vector_idx[code];
-                            cf = VMUL4(cf, vq, cb_idx, sf + idx);
-                        } while (len -= 4);
-                    }
-                    break;
-
-                case 1:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned nnz;
-                            unsigned cb_idx;
-                            uint32_t bits;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-                            cb_idx = cb_vector_idx[code];
-                            nnz = cb_idx >> 8 & 15;
-                            bits = nnz ? GET_CACHE(re, gb) : 0;
-                            LAST_SKIP_BITS(re, gb, nnz);
-                            cf = VMUL4S(cf, vq, cb_idx, bits, sf + idx);
-                        } while (len -= 4);
-                    }
-                    break;
-
-                case 2:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned cb_idx;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-                            cb_idx = cb_vector_idx[code];
-                            cf = VMUL2(cf, vq, cb_idx, sf + idx);
-                        } while (len -= 2);
-                    }
-                    break;
-
-                case 3:
-                case 4:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned nnz;
-                            unsigned cb_idx;
-                            unsigned sign;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-                            cb_idx = cb_vector_idx[code];
-                            nnz = cb_idx >> 8 & 15;
-                            sign = nnz ? SHOW_UBITS(re, gb, nnz) << (cb_idx >> 12) : 0;
-                            LAST_SKIP_BITS(re, gb, nnz);
-                            cf = VMUL2S(cf, vq, cb_idx, sign, sf + idx);
-                        } while (len -= 2);
-                    }
-                    break;
-
-                default:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        uint32_t *icf = (uint32_t *) cf;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned nzt, nnz;
-                            unsigned cb_idx;
-                            uint32_t bits;
-                            int j;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-
-                            if (!code) {
-                                *icf++ = 0;
-                                *icf++ = 0;
-                                continue;
-                            }
-
-                            cb_idx = cb_vector_idx[code];
-                            nnz = cb_idx >> 12;
-                            nzt = cb_idx >> 8;
-                            bits = SHOW_UBITS(re, gb, nnz) << (32-nnz);
-                            LAST_SKIP_BITS(re, gb, nnz);
-
-                            for (j = 0; j < 2; j++) {
-                                if (nzt & 1<<j) {
-                                    uint32_t b;
-                                    int n;
-                                    /* The total length of escape_sequence must be < 22 bits according
-                                       to the specification (i.e. max is 111111110xxxxxxxxxxxx). */
-                                    UPDATE_CACHE(re, gb);
-                                    b = GET_CACHE(re, gb);
-                                    b = 31 - av_log2(~b);
-
-                                    if (b > 8) {
-                                        av_log(ac->avctx, AV_LOG_ERROR, "error in spectral data, ESC overflow\n");
-                                        return AVERROR_INVALIDDATA;
-                                    }
-
-                                    SKIP_BITS(re, gb, b + 1);
-                                    b += 4;
-                                    n = (1 << b) + SHOW_UBITS(re, gb, b);
-                                    LAST_SKIP_BITS(re, gb, b);
-                                    *icf++ = cbrt_tab[n] | (bits & 1U<<31);
-                                    bits <<= 1;
-                                } else {
-                                    unsigned v = ((const uint32_t*)vq)[cb_idx & 15];
-                                    *icf++ = (bits & 1U<<31) | v;
-                                    bits <<= !!v;
-                                }
-                                cb_idx >>= 4;
-                            }
-                        } while (len -= 2);
-
-                        ac->fdsp.vector_fmul_scalar(cfo, cfo, sf[idx], off_len);
-                    }
-                }
-
-                CLOSE_READER(re, gb);
-            }
-        }
-        coef += g_len << 7;
-    }
-
-    if (pulse_present) {
-        idx = 0;
-        for (i = 0; i < pulse->num_pulse; i++) {
-            float co = coef_base[ pulse->pos[i] ];
-            while (offsets[idx + 1] <= pulse->pos[i])
-                idx++;
-            if (band_type[idx] != NOISE_BT && sf[idx]) {
-                float ico = -pulse->amp[i];
-                if (co) {
-                    co /= sf[idx];
-                    ico = co / sqrtf(sqrtf(fabsf(co))) + (co > 0 ? -ico : ico);
-                }
-                coef_base[ pulse->pos[i] ] = cbrtf(fabsf(ico)) * ico * sf[idx];
-            }
-        }
-    }
-    return 0;
-}
-
 static av_always_inline float flt16_round(float pf)
 {
     union av_intfloat32 tmp;
@@ -1809,738 +202,6 @@ static av_always_inline void predict(PredictorState *ps, float *coef,
 }
 
 /**
- * Apply AAC-Main style frequency domain prediction.
- */
-static void apply_prediction(AACContext *ac, SingleChannelElement *sce)
-{
-    int sfb, k;
-
-    if (!sce->ics.predictor_initialized) {
-        reset_all_predictors(sce->predictor_state);
-        sce->ics.predictor_initialized = 1;
-    }
-
-    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
-        for (sfb = 0;
-             sfb < ff_aac_pred_sfb_max[ac->oc[1].m4ac.sampling_index];
-             sfb++) {
-            for (k = sce->ics.swb_offset[sfb];
-                 k < sce->ics.swb_offset[sfb + 1];
-                 k++) {
-                predict(&sce->predictor_state[k], &sce->coeffs[k],
-                        sce->ics.predictor_present &&
-                        sce->ics.prediction_used[sfb]);
-            }
-        }
-        if (sce->ics.predictor_reset_group)
-            reset_predictor_group(sce->predictor_state,
-                                  sce->ics.predictor_reset_group);
-    } else
-        reset_all_predictors(sce->predictor_state);
-}
-
-/**
- * Decode an individual_channel_stream payload; reference: table 4.44.
- *
- * @param   common_window   Channels have independent [0], or shared [1], Individual Channel Stream information.
- * @param   scale_flag      scalable [1] or non-scalable [0] AAC (Unused until scalable AAC is implemented.)
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_ics(AACContext *ac, SingleChannelElement *sce,
-                      GetBitContext *gb, int common_window, int scale_flag)
-{
-    Pulse pulse;
-    TemporalNoiseShaping    *tns = &sce->tns;
-    IndividualChannelStream *ics = &sce->ics;
-    float *out = sce->coeffs;
-    int global_gain, eld_syntax, er_syntax, pulse_present = 0;
-    int ret;
-
-    eld_syntax = ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
-    er_syntax  = ac->oc[1].m4ac.object_type == AOT_ER_AAC_LC ||
-                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_LTP ||
-                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_LD ||
-                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
-
-    /* This assignment is to silence a GCC warning about the variable being used
-     * uninitialized when in fact it always is.
-     */
-    pulse.num_pulse = 0;
-
-    global_gain = get_bits(gb, 8);
-
-    if (!common_window && !scale_flag) {
-        if (decode_ics_info(ac, ics, gb) < 0)
-            return AVERROR_INVALIDDATA;
-    }
-
-    if ((ret = decode_band_types(ac, sce->band_type,
-                                 sce->band_type_run_end, gb, ics)) < 0)
-        return ret;
-    if ((ret = decode_scalefactors(ac, sce->sf, gb, global_gain, ics,
-                                  sce->band_type, sce->band_type_run_end)) < 0)
-        return ret;
-
-    pulse_present = 0;
-    if (!scale_flag) {
-        if (!eld_syntax && (pulse_present = get_bits1(gb))) {
-            if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-                av_log(ac->avctx, AV_LOG_ERROR,
-                       "Pulse tool not allowed in eight short sequence.\n");
-                return AVERROR_INVALIDDATA;
-            }
-            if (decode_pulses(&pulse, gb, ics->swb_offset, ics->num_swb)) {
-                av_log(ac->avctx, AV_LOG_ERROR,
-                       "Pulse data corrupt or invalid.\n");
-                return AVERROR_INVALIDDATA;
-            }
-        }
-        tns->present = get_bits1(gb);
-        if (tns->present && !er_syntax)
-            if (decode_tns(ac, tns, gb, ics) < 0)
-                return AVERROR_INVALIDDATA;
-        if (!eld_syntax && get_bits1(gb)) {
-            avpriv_request_sample(ac->avctx, "SSR");
-            return AVERROR_PATCHWELCOME;
-        }
-        // I see no textual basis in the spec for this occuring after SSR gain
-        // control, but this is what both reference and real implmentations do
-        if (tns->present && er_syntax)
-            if (decode_tns(ac, tns, gb, ics) < 0)
-                return AVERROR_INVALIDDATA;
-    }
-
-    if (decode_spectrum_and_dequant(ac, out, gb, sce->sf, pulse_present,
-                                    &pulse, ics, sce->band_type) < 0)
-        return AVERROR_INVALIDDATA;
-
-    if (ac->oc[1].m4ac.object_type == AOT_AAC_MAIN && !common_window)
-        apply_prediction(ac, sce);
-
-    return 0;
-}
-
-/**
- * Mid/Side stereo decoding; reference: 4.6.8.1.3.
- */
-static void apply_mid_side_stereo(AACContext *ac, ChannelElement *cpe)
-{
-    const IndividualChannelStream *ics = &cpe->ch[0].ics;
-    float *ch0 = cpe->ch[0].coeffs;
-    float *ch1 = cpe->ch[1].coeffs;
-    int g, i, group, idx = 0;
-    const uint16_t *offsets = ics->swb_offset;
-    for (g = 0; g < ics->num_window_groups; g++) {
-        for (i = 0; i < ics->max_sfb; i++, idx++) {
-            if (cpe->ms_mask[idx] &&
-                cpe->ch[0].band_type[idx] < NOISE_BT &&
-                cpe->ch[1].band_type[idx] < NOISE_BT) {
-                for (group = 0; group < ics->group_len[g]; group++) {
-                    ac->fdsp.butterflies_float(ch0 + group * 128 + offsets[i],
-                                               ch1 + group * 128 + offsets[i],
-                                               offsets[i+1] - offsets[i]);
-                }
-            }
-        }
-        ch0 += ics->group_len[g] * 128;
-        ch1 += ics->group_len[g] * 128;
-    }
-}
-
-/**
- * intensity stereo decoding; reference: 4.6.8.2.3
- *
- * @param   ms_present  Indicates mid/side stereo presence. [0] mask is all 0s;
- *                      [1] mask is decoded from bitstream; [2] mask is all 1s;
- *                      [3] reserved for scalable AAC
- */
-static void apply_intensity_stereo(AACContext *ac,
-                                   ChannelElement *cpe, int ms_present)
-{
-    const IndividualChannelStream *ics = &cpe->ch[1].ics;
-    SingleChannelElement         *sce1 = &cpe->ch[1];
-    float *coef0 = cpe->ch[0].coeffs, *coef1 = cpe->ch[1].coeffs;
-    const uint16_t *offsets = ics->swb_offset;
-    int g, group, i, idx = 0;
-    int c;
-    float scale;
-    for (g = 0; g < ics->num_window_groups; g++) {
-        for (i = 0; i < ics->max_sfb;) {
-            if (sce1->band_type[idx] == INTENSITY_BT ||
-                sce1->band_type[idx] == INTENSITY_BT2) {
-                const int bt_run_end = sce1->band_type_run_end[idx];
-                for (; i < bt_run_end; i++, idx++) {
-                    c = -1 + 2 * (sce1->band_type[idx] - 14);
-                    if (ms_present)
-                        c *= 1 - 2 * cpe->ms_mask[idx];
-                    scale = c * sce1->sf[idx];
-                    for (group = 0; group < ics->group_len[g]; group++)
-                        ac->fdsp.vector_fmul_scalar(coef1 + group * 128 + offsets[i],
-                                                    coef0 + group * 128 + offsets[i],
-                                                    scale,
-                                                    offsets[i + 1] - offsets[i]);
-                }
-            } else {
-                int bt_run_end = sce1->band_type_run_end[idx];
-                idx += bt_run_end - i;
-                i    = bt_run_end;
-            }
-        }
-        coef0 += ics->group_len[g] * 128;
-        coef1 += ics->group_len[g] * 128;
-    }
-}
-
-/**
- * Decode a channel_pair_element; reference: table 4.4.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_cpe(AACContext *ac, GetBitContext *gb, ChannelElement *cpe)
-{
-    int i, ret, common_window, ms_present = 0;
-    int eld_syntax = ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
-
-    common_window = eld_syntax || get_bits1(gb);
-    if (common_window) {
-        if (decode_ics_info(ac, &cpe->ch[0].ics, gb))
-            return AVERROR_INVALIDDATA;
-        i = cpe->ch[1].ics.use_kb_window[0];
-        cpe->ch[1].ics = cpe->ch[0].ics;
-        cpe->ch[1].ics.use_kb_window[1] = i;
-        if (cpe->ch[1].ics.predictor_present &&
-            (ac->oc[1].m4ac.object_type != AOT_AAC_MAIN))
-            if ((cpe->ch[1].ics.ltp.present = get_bits(gb, 1)))
-                decode_ltp(&cpe->ch[1].ics.ltp, gb, cpe->ch[1].ics.max_sfb);
-        ms_present = get_bits(gb, 2);
-        if (ms_present == 3) {
-            av_log(ac->avctx, AV_LOG_ERROR, "ms_present = 3 is reserved.\n");
-            return AVERROR_INVALIDDATA;
-        } else if (ms_present)
-            decode_mid_side_stereo(cpe, gb, ms_present);
-    }
-    if ((ret = decode_ics(ac, &cpe->ch[0], gb, common_window, 0)))
-        return ret;
-    if ((ret = decode_ics(ac, &cpe->ch[1], gb, common_window, 0)))
-        return ret;
-
-    if (common_window) {
-        if (ms_present)
-            apply_mid_side_stereo(ac, cpe);
-        if (ac->oc[1].m4ac.object_type == AOT_AAC_MAIN) {
-            apply_prediction(ac, &cpe->ch[0]);
-            apply_prediction(ac, &cpe->ch[1]);
-        }
-    }
-
-    apply_intensity_stereo(ac, cpe, ms_present);
-    return 0;
-}
-
-static const float cce_scale[] = {
-    1.09050773266525765921, //2^(1/8)
-    1.18920711500272106672, //2^(1/4)
-    M_SQRT2,
-    2,
-};
-
-/**
- * Decode coupling_channel_element; reference: table 4.8.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_cce(AACContext *ac, GetBitContext *gb, ChannelElement *che)
-{
-    int num_gain = 0;
-    int c, g, sfb, ret;
-    int sign;
-    float scale;
-    SingleChannelElement *sce = &che->ch[0];
-    ChannelCoupling     *coup = &che->coup;
-
-    coup->coupling_point = 2 * get_bits1(gb);
-    coup->num_coupled = get_bits(gb, 3);
-    for (c = 0; c <= coup->num_coupled; c++) {
-        num_gain++;
-        coup->type[c] = get_bits1(gb) ? TYPE_CPE : TYPE_SCE;
-        coup->id_select[c] = get_bits(gb, 4);
-        if (coup->type[c] == TYPE_CPE) {
-            coup->ch_select[c] = get_bits(gb, 2);
-            if (coup->ch_select[c] == 3)
-                num_gain++;
-        } else
-            coup->ch_select[c] = 2;
-    }
-    coup->coupling_point += get_bits1(gb) || (coup->coupling_point >> 1);
-
-    sign  = get_bits(gb, 1);
-    scale = cce_scale[get_bits(gb, 2)];
-
-    if ((ret = decode_ics(ac, sce, gb, 0, 0)))
-        return ret;
-
-    for (c = 0; c < num_gain; c++) {
-        int idx  = 0;
-        int cge  = 1;
-        int gain = 0;
-        float gain_cache = 1.0;
-        if (c) {
-            cge = coup->coupling_point == AFTER_IMDCT ? 1 : get_bits1(gb);
-            gain = cge ? get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60: 0;
-            gain_cache = powf(scale, -gain);
-        }
-        if (coup->coupling_point == AFTER_IMDCT) {
-            coup->gain[c][0] = gain_cache;
-        } else {
-            for (g = 0; g < sce->ics.num_window_groups; g++) {
-                for (sfb = 0; sfb < sce->ics.max_sfb; sfb++, idx++) {
-                    if (sce->band_type[idx] != ZERO_BT) {
-                        if (!cge) {
-                            int t = get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
-                            if (t) {
-                                int s = 1;
-                                t = gain += t;
-                                if (sign) {
-                                    s  -= 2 * (t & 0x1);
-                                    t >>= 1;
-                                }
-                                gain_cache = powf(scale, -t) * s;
-                            }
-                        }
-                        coup->gain[c][idx] = gain_cache;
-                    }
-                }
-            }
-        }
-    }
-    return 0;
-}
-
-/**
- * Parse whether channels are to be excluded from Dynamic Range Compression; reference: table 4.53.
- *
- * @return  Returns number of bytes consumed.
- */
-static int decode_drc_channel_exclusions(DynamicRangeControl *che_drc,
-                                         GetBitContext *gb)
-{
-    int i;
-    int num_excl_chan = 0;
-
-    do {
-        for (i = 0; i < 7; i++)
-            che_drc->exclude_mask[num_excl_chan++] = get_bits1(gb);
-    } while (num_excl_chan < MAX_CHANNELS - 7 && get_bits1(gb));
-
-    return num_excl_chan / 7;
-}
-
-/**
- * Decode dynamic range information; reference: table 4.52.
- *
- * @return  Returns number of bytes consumed.
- */
-static int decode_dynamic_range(DynamicRangeControl *che_drc,
-                                GetBitContext *gb)
-{
-    int n             = 1;
-    int drc_num_bands = 1;
-    int i;
-
-    /* pce_tag_present? */
-    if (get_bits1(gb)) {
-        che_drc->pce_instance_tag  = get_bits(gb, 4);
-        skip_bits(gb, 4); // tag_reserved_bits
-        n++;
-    }
-
-    /* excluded_chns_present? */
-    if (get_bits1(gb)) {
-        n += decode_drc_channel_exclusions(che_drc, gb);
-    }
-
-    /* drc_bands_present? */
-    if (get_bits1(gb)) {
-        che_drc->band_incr            = get_bits(gb, 4);
-        che_drc->interpolation_scheme = get_bits(gb, 4);
-        n++;
-        drc_num_bands += che_drc->band_incr;
-        for (i = 0; i < drc_num_bands; i++) {
-            che_drc->band_top[i] = get_bits(gb, 8);
-            n++;
-        }
-    }
-
-    /* prog_ref_level_present? */
-    if (get_bits1(gb)) {
-        che_drc->prog_ref_level = get_bits(gb, 7);
-        skip_bits1(gb); // prog_ref_level_reserved_bits
-        n++;
-    }
-
-    for (i = 0; i < drc_num_bands; i++) {
-        che_drc->dyn_rng_sgn[i] = get_bits1(gb);
-        che_drc->dyn_rng_ctl[i] = get_bits(gb, 7);
-        n++;
-    }
-
-    return n;
-}
-
-/**
- * Decode extension data (incomplete); reference: table 4.51.
- *
- * @param   cnt length of TYPE_FIL syntactic element in bytes
- *
- * @return Returns number of bytes consumed
- */
-static int decode_extension_payload(AACContext *ac, GetBitContext *gb, int cnt,
-                                    ChannelElement *che, enum RawDataBlockType elem_type)
-{
-    int crc_flag = 0;
-    int res = cnt;
-    switch (get_bits(gb, 4)) { // extension type
-    case EXT_SBR_DATA_CRC:
-        crc_flag++;
-    case EXT_SBR_DATA:
-        if (!che) {
-            av_log(ac->avctx, AV_LOG_ERROR, "SBR was found before the first channel element.\n");
-            return res;
-        } else if (!ac->oc[1].m4ac.sbr) {
-            av_log(ac->avctx, AV_LOG_ERROR, "SBR signaled to be not-present but was found in the bitstream.\n");
-            skip_bits_long(gb, 8 * cnt - 4);
-            return res;
-        } else if (ac->oc[1].m4ac.sbr == -1 && ac->oc[1].status == OC_LOCKED) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Implicit SBR was found with a first occurrence after the first frame.\n");
-            skip_bits_long(gb, 8 * cnt - 4);
-            return res;
-        } else if (ac->oc[1].m4ac.ps == -1 && ac->oc[1].status < OC_LOCKED && ac->avctx->channels == 1) {
-            ac->oc[1].m4ac.sbr = 1;
-            ac->oc[1].m4ac.ps = 1;
-            ac->avctx->profile = FF_PROFILE_AAC_HE_V2;
-            output_configure(ac, ac->oc[1].layout_map, ac->oc[1].layout_map_tags,
-                             ac->oc[1].status, 1);
-        } else {
-            ac->oc[1].m4ac.sbr = 1;
-            ac->avctx->profile = FF_PROFILE_AAC_HE;
-        }
-        res = ff_decode_sbr_extension(ac, &che->sbr, gb, crc_flag, cnt, elem_type);
-        break;
-    case EXT_DYNAMIC_RANGE:
-        res = decode_dynamic_range(&ac->che_drc, gb);
-        break;
-    case EXT_FILL:
-    case EXT_FILL_DATA:
-    case EXT_DATA_ELEMENT:
-    default:
-        skip_bits_long(gb, 8 * cnt - 4);
-        break;
-    };
-    return res;
-}
-
-/**
- * Decode Temporal Noise Shaping filter coefficients and apply all-pole filters; reference: 4.6.9.3.
- *
- * @param   decode  1 if tool is used normally, 0 if tool is used in LTP.
- * @param   coef    spectral coefficients
- */
-static void apply_tns(float coef[1024], TemporalNoiseShaping *tns,
-                      IndividualChannelStream *ics, int decode)
-{
-    const int mmm = FFMIN(ics->tns_max_bands, ics->max_sfb);
-    int w, filt, m, i;
-    int bottom, top, order, start, end, size, inc;
-    float lpc[TNS_MAX_ORDER];
-    float tmp[TNS_MAX_ORDER + 1];
-
-    for (w = 0; w < ics->num_windows; w++) {
-        bottom = ics->num_swb;
-        for (filt = 0; filt < tns->n_filt[w]; filt++) {
-            top    = bottom;
-            bottom = FFMAX(0, top - tns->length[w][filt]);
-            order  = tns->order[w][filt];
-            if (order == 0)
-                continue;
-
-            // tns_decode_coef
-            compute_lpc_coefs(tns->coef[w][filt], order, lpc, 0, 0, 0);
-
-            start = ics->swb_offset[FFMIN(bottom, mmm)];
-            end   = ics->swb_offset[FFMIN(   top, mmm)];
-            if ((size = end - start) <= 0)
-                continue;
-            if (tns->direction[w][filt]) {
-                inc = -1;
-                start = end - 1;
-            } else {
-                inc = 1;
-            }
-            start += w * 128;
-
-            if (decode) {
-                // ar filter
-                for (m = 0; m < size; m++, start += inc)
-                    for (i = 1; i <= FFMIN(m, order); i++)
-                        coef[start] -= coef[start - i * inc] * lpc[i - 1];
-            } else {
-                // ma filter
-                for (m = 0; m < size; m++, start += inc) {
-                    tmp[0] = coef[start];
-                    for (i = 1; i <= FFMIN(m, order); i++)
-                        coef[start] += tmp[i] * lpc[i - 1];
-                    for (i = order; i > 0; i--)
-                        tmp[i] = tmp[i - 1];
-                }
-            }
-        }
-    }
-}
-
-/**
- *  Apply windowing and MDCT to obtain the spectral
- *  coefficient from the predicted sample by LTP.
- */
-static void windowing_and_mdct_ltp(AACContext *ac, float *out,
-                                   float *in, IndividualChannelStream *ics)
-{
-    const float *lwindow      = ics->use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_sine_1024;
-    const float *swindow      = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
-    const float *lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024;
-    const float *swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128;
-
-    if (ics->window_sequence[0] != LONG_STOP_SEQUENCE) {
-        ac->fdsp.vector_fmul(in, in, lwindow_prev, 1024);
-    } else {
-        memset(in, 0, 448 * sizeof(float));
-        ac->fdsp.vector_fmul(in + 448, in + 448, swindow_prev, 128);
-    }
-    if (ics->window_sequence[0] != LONG_START_SEQUENCE) {
-        ac->fdsp.vector_fmul_reverse(in + 1024, in + 1024, lwindow, 1024);
-    } else {
-        ac->fdsp.vector_fmul_reverse(in + 1024 + 448, in + 1024 + 448, swindow, 128);
-        memset(in + 1024 + 576, 0, 448 * sizeof(float));
-    }
-    ac->mdct_ltp.mdct_calc(&ac->mdct_ltp, out, in);
-}
-
-/**
- * Apply the long term prediction
- */
-static void apply_ltp(AACContext *ac, SingleChannelElement *sce)
-{
-    const LongTermPrediction *ltp = &sce->ics.ltp;
-    const uint16_t *offsets = sce->ics.swb_offset;
-    int i, sfb;
-
-    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
-        float *predTime = sce->ret;
-        float *predFreq = ac->buf_mdct;
-        int16_t num_samples = 2048;
-
-        if (ltp->lag < 1024)
-            num_samples = ltp->lag + 1024;
-        for (i = 0; i < num_samples; i++)
-            predTime[i] = sce->ltp_state[i + 2048 - ltp->lag] * ltp->coef;
-        memset(&predTime[i], 0, (2048 - i) * sizeof(float));
-
-        windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
-
-        if (sce->tns.present)
-            apply_tns(predFreq, &sce->tns, &sce->ics, 0);
-
-        for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++)
-            if (ltp->used[sfb])
-                for (i = offsets[sfb]; i < offsets[sfb + 1]; i++)
-                    sce->coeffs[i] += predFreq[i];
-    }
-}
-
-/**
- * Update the LTP buffer for next frame
- */
-static void update_ltp(AACContext *ac, SingleChannelElement *sce)
-{
-    IndividualChannelStream *ics = &sce->ics;
-    float *saved     = sce->saved;
-    float *saved_ltp = sce->coeffs;
-    const float *lwindow = ics->use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_sine_1024;
-    const float *swindow = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
-    int i;
-
-    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        memcpy(saved_ltp,       saved, 512 * sizeof(float));
-        memset(saved_ltp + 576, 0,     448 * sizeof(float));
-        ac->fdsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
-        for (i = 0; i < 64; i++)
-            saved_ltp[i + 512] = ac->buf_mdct[1023 - i] * swindow[63 - i];
-    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
-        memcpy(saved_ltp,       ac->buf_mdct + 512, 448 * sizeof(float));
-        memset(saved_ltp + 576, 0,                  448 * sizeof(float));
-        ac->fdsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
-        for (i = 0; i < 64; i++)
-            saved_ltp[i + 512] = ac->buf_mdct[1023 - i] * swindow[63 - i];
-    } else { // LONG_STOP or ONLY_LONG
-        ac->fdsp.vector_fmul_reverse(saved_ltp,       ac->buf_mdct + 512,     &lwindow[512],     512);
-        for (i = 0; i < 512; i++)
-            saved_ltp[i + 512] = ac->buf_mdct[1023 - i] * lwindow[511 - i];
-    }
-
-    memcpy(sce->ltp_state,      sce->ltp_state+1024, 1024 * sizeof(*sce->ltp_state));
-    memcpy(sce->ltp_state+1024, sce->ret,            1024 * sizeof(*sce->ltp_state));
-    memcpy(sce->ltp_state+2048, saved_ltp,           1024 * sizeof(*sce->ltp_state));
-}
-
-/**
- * Conduct IMDCT and windowing.
- */
-static void imdct_and_windowing(AACContext *ac, SingleChannelElement *sce)
-{
-    IndividualChannelStream *ics = &sce->ics;
-    float *in    = sce->coeffs;
-    float *out   = sce->ret;
-    float *saved = sce->saved;
-    const float *swindow      = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
-    const float *lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024;
-    const float *swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128;
-    float *buf  = ac->buf_mdct;
-    float *temp = ac->temp;
-    int i;
-
-    // imdct
-    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        for (i = 0; i < 1024; i += 128)
-            ac->mdct_small.imdct_half(&ac->mdct_small, buf + i, in + i);
-    } else
-        ac->mdct.imdct_half(&ac->mdct, buf, in);
-
-    /* window overlapping
-     * NOTE: To simplify the overlapping code, all 'meaningless' short to long
-     * and long to short transitions are considered to be short to short
-     * transitions. This leaves just two cases (long to long and short to short)
-     * with a little special sauce for EIGHT_SHORT_SEQUENCE.
-     */
-    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
-            (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
-        ac->fdsp.vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
-    } else {
-        memcpy(                         out,               saved,            448 * sizeof(float));
-
-        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-            ac->fdsp.vector_fmul_window(out + 448 + 0*128, saved + 448,      buf + 0*128, swindow_prev, 64);
-            ac->fdsp.vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow,      64);
-            ac->fdsp.vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow,      64);
-            ac->fdsp.vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow,      64);
-            ac->fdsp.vector_fmul_window(temp,              buf + 3*128 + 64, buf + 4*128, swindow,      64);
-            memcpy(                     out + 448 + 4*128, temp, 64 * sizeof(float));
-        } else {
-            ac->fdsp.vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
-            memcpy(                     out + 576,         buf + 64,         448 * sizeof(float));
-        }
-    }
-
-    // buffer update
-    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        memcpy(                     saved,       temp + 64,         64 * sizeof(float));
-        ac->fdsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
-        ac->fdsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
-        ac->fdsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
-        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
-    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
-        memcpy(                     saved,       buf + 512,        448 * sizeof(float));
-        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
-    } else { // LONG_STOP or ONLY_LONG
-        memcpy(                     saved,       buf + 512,        512 * sizeof(float));
-    }
-}
-
-static void imdct_and_windowing_ld(AACContext *ac, SingleChannelElement *sce)
-{
-    IndividualChannelStream *ics = &sce->ics;
-    float *in    = sce->coeffs;
-    float *out   = sce->ret;
-    float *saved = sce->saved;
-    float *buf  = ac->buf_mdct;
-
-    // imdct
-    ac->mdct.imdct_half(&ac->mdct_ld, buf, in);
-
-    // window overlapping
-    if (ics->use_kb_window[1]) {
-        // AAC LD uses a low overlap sine window instead of a KBD window
-        memcpy(out, saved, 192 * sizeof(float));
-        ac->fdsp.vector_fmul_window(out + 192, saved + 192, buf, ff_sine_128, 64);
-        memcpy(                     out + 320, buf + 64, 192 * sizeof(float));
-    } else {
-        ac->fdsp.vector_fmul_window(out, saved, buf, ff_sine_512, 256);
-    }
-
-    // buffer update
-    memcpy(saved, buf + 256, 256 * sizeof(float));
-}
-
-static void imdct_and_windowing_eld(AACContext *ac, SingleChannelElement *sce)
-{
-    float *in    = sce->coeffs;
-    float *out   = sce->ret;
-    float *saved = sce->saved;
-    float *buf  = ac->buf_mdct;
-    int i;
-    const int n  = ac->oc[1].m4ac.frame_length_short ? 480 : 512;
-    const int n2 = n >> 1;
-    const int n4 = n >> 2;
-    const float *const window = n == 480 ? ff_aac_eld_window_480 :
-                                           ff_aac_eld_window_512;
-
-    // Inverse transform, mapped to the conventional IMDCT by
-    // Chivukula, R.K.; Reznik, Y.A.; Devarajan, V.,
-    // "Efficient algorithms for MPEG-4 AAC-ELD, AAC-LD and AAC-LC filterbanks,"
-    // Audio, Language and Image Processing, 2008. ICALIP 2008. International Conference on
-    // URL: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4590245&isnumber=4589950
-    for (i = 0; i < n2; i+=2) {
-        float temp;
-        temp =  in[i    ]; in[i    ] = -in[n - 1 - i]; in[n - 1 - i] = temp;
-        temp = -in[i + 1]; in[i + 1] =  in[n - 2 - i]; in[n - 2 - i] = temp;
-    }
-    if (n == 480)
-        ac->mdct480->imdct_half(ac->mdct480, buf, in, 1, -1.f/(16*1024*960));
-    else
-        ac->mdct.imdct_half(&ac->mdct_ld, buf, in);
-    for (i = 0; i < n; i+=2) {
-        buf[i] = -buf[i];
-    }
-    // Like with the regular IMDCT at this point we still have the middle half
-    // of a transform but with even symmetry on the left and odd symmetry on
-    // the right
-
-    // window overlapping
-    // The spec says to use samples [0..511] but the reference decoder uses
-    // samples [128..639].
-    for (i = n4; i < n2; i ++) {
-        out[i - n4] =    buf[n2 - 1 - i]       * window[i       - n4] +
-                       saved[      i + n2]     * window[i +   n - n4] +
-                      -saved[  n + n2 - 1 - i] * window[i + 2*n - n4] +
-                      -saved[2*n + n2 + i]     * window[i + 3*n - n4];
-    }
-    for (i = 0; i < n2; i ++) {
-        out[n4 + i] =    buf[i]               * window[i + n2       - n4] +
-                      -saved[      n - 1 - i] * window[i + n2 +   n - n4] +
-                      -saved[  n + i]         * window[i + n2 + 2*n - n4] +
-                       saved[2*n + n - 1 - i] * window[i + n2 + 3*n - n4];
-    }
-    for (i = 0; i < n4; i ++) {
-        out[n2 + n4 + i] =    buf[      i + n2]     * window[i +   n - n4] +
-                           -saved[      n2 - 1 - i] * window[i + 2*n - n4] +
-                           -saved[  n + n2 + i]     * window[i + 3*n - n4];
-    }
-
-    // buffer update
-    memmove(saved + n, saved, 2 * n * sizeof(float));
-    memcpy( saved,       buf,     n * sizeof(float));
-}
-
-/**
  * Apply dependent channel coupling (applied before IMDCT).
  *
  * @param   index   index into coupling gain array
@@ -2595,447 +256,13 @@ static void apply_independent_coupling(AACContext *ac,
         dest[i] += gain * src[i];
 }
 
-/**
- * channel coupling transformation interface
- *
- * @param   apply_coupling_method   pointer to (in)dependent coupling function
- */
-static void apply_channel_coupling(AACContext *ac, ChannelElement *cc,
-                                   enum RawDataBlockType type, int elem_id,
-                                   enum CouplingPoint coupling_point,
-                                   void (*apply_coupling_method)(AACContext *ac, SingleChannelElement *target, ChannelElement *cce, int index))
-{
-    int i, c;
-
-    for (i = 0; i < MAX_ELEM_ID; i++) {
-        ChannelElement *cce = ac->che[TYPE_CCE][i];
-        int index = 0;
-
-        if (cce && cce->coup.coupling_point == coupling_point) {
-            ChannelCoupling *coup = &cce->coup;
-
-            for (c = 0; c <= coup->num_coupled; c++) {
-                if (coup->type[c] == type && coup->id_select[c] == elem_id) {
-                    if (coup->ch_select[c] != 1) {
-                        apply_coupling_method(ac, &cc->ch[0], cce, index);
-                        if (coup->ch_select[c] != 0)
-                            index++;
-                    }
-                    if (coup->ch_select[c] != 2)
-                        apply_coupling_method(ac, &cc->ch[1], cce, index++);
-                } else
-                    index += 1 + (coup->ch_select[c] == 3);
-            }
-        }
-    }
-}
-
-/**
- * Convert spectral data to float samples, applying all supported tools as appropriate.
- */
-static void spectral_to_sample(AACContext *ac)
-{
-    int i, type;
-    void (*imdct_and_window)(AACContext *ac, SingleChannelElement *sce);
-    switch (ac->oc[1].m4ac.object_type) {
-    case AOT_ER_AAC_LD:
-        imdct_and_window = imdct_and_windowing_ld;
-        break;
-    case AOT_ER_AAC_ELD:
-        imdct_and_window = imdct_and_windowing_eld;
-        break;
-    default:
-        imdct_and_window = imdct_and_windowing;
-    }
-    for (type = 3; type >= 0; type--) {
-        for (i = 0; i < MAX_ELEM_ID; i++) {
-            ChannelElement *che = ac->che[type][i];
-            if (che) {
-                if (type <= TYPE_CPE)
-                    apply_channel_coupling(ac, che, type, i, BEFORE_TNS, apply_dependent_coupling);
-                if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP) {
-                    if (che->ch[0].ics.predictor_present) {
-                        if (che->ch[0].ics.ltp.present)
-                            apply_ltp(ac, &che->ch[0]);
-                        if (che->ch[1].ics.ltp.present && type == TYPE_CPE)
-                            apply_ltp(ac, &che->ch[1]);
-                    }
-                }
-                if (che->ch[0].tns.present)
-                    apply_tns(che->ch[0].coeffs, &che->ch[0].tns, &che->ch[0].ics, 1);
-                if (che->ch[1].tns.present)
-                    apply_tns(che->ch[1].coeffs, &che->ch[1].tns, &che->ch[1].ics, 1);
-                if (type <= TYPE_CPE)
-                    apply_channel_coupling(ac, che, type, i, BETWEEN_TNS_AND_IMDCT, apply_dependent_coupling);
-                if (type != TYPE_CCE || che->coup.coupling_point == AFTER_IMDCT) {
-                    imdct_and_window(ac, &che->ch[0]);
-                    if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
-                        update_ltp(ac, &che->ch[0]);
-                    if (type == TYPE_CPE) {
-                        imdct_and_window(ac, &che->ch[1]);
-                        if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
-                            update_ltp(ac, &che->ch[1]);
-                    }
-                    if (ac->oc[1].m4ac.sbr > 0) {
-                        ff_sbr_apply(ac, &che->sbr, type, che->ch[0].ret, che->ch[1].ret);
-                    }
-                }
-                if (type <= TYPE_CCE)
-                    apply_channel_coupling(ac, che, type, i, AFTER_IMDCT, apply_independent_coupling);
-            }
-        }
-    }
-}
-
-static int parse_adts_frame_header(AACContext *ac, GetBitContext *gb)
-{
-    int size;
-    AACADTSHeaderInfo hdr_info;
-    uint8_t layout_map[MAX_ELEM_ID*4][3];
-    int layout_map_tags, ret;
-
-    size = avpriv_aac_parse_header(gb, &hdr_info);
-    if (size > 0) {
-        if (hdr_info.num_aac_frames != 1) {
-            avpriv_report_missing_feature(ac->avctx,
-                                          "More than one AAC RDB per ADTS frame");
-            return AVERROR_PATCHWELCOME;
-        }
-        push_output_configuration(ac);
-        if (hdr_info.chan_config) {
-            ac->oc[1].m4ac.chan_config = hdr_info.chan_config;
-            if ((ret = set_default_channel_config(ac->avctx,
-                                                  layout_map,
-                                                  &layout_map_tags,
-                                                  hdr_info.chan_config)) < 0)
-                return ret;
-            if ((ret = output_configure(ac, layout_map, layout_map_tags,
-                                        FFMAX(ac->oc[1].status,
-                                              OC_TRIAL_FRAME), 0)) < 0)
-                return ret;
-        } else {
-            ac->oc[1].m4ac.chan_config = 0;
-        }
-        ac->oc[1].m4ac.sample_rate     = hdr_info.sample_rate;
-        ac->oc[1].m4ac.sampling_index  = hdr_info.sampling_index;
-        ac->oc[1].m4ac.object_type     = hdr_info.object_type;
-        ac->oc[1].m4ac.frame_length_short = 0;
-        if (ac->oc[0].status != OC_LOCKED ||
-            ac->oc[0].m4ac.chan_config != hdr_info.chan_config ||
-            ac->oc[0].m4ac.sample_rate != hdr_info.sample_rate) {
-            ac->oc[1].m4ac.sbr = -1;
-            ac->oc[1].m4ac.ps  = -1;
-        }
-        if (!hdr_info.crc_absent)
-            skip_bits(gb, 16);
-    }
-    return size;
-}
-
-static int aac_decode_er_frame(AVCodecContext *avctx, void *data,
-                               int *got_frame_ptr, GetBitContext *gb)
-{
-    AACContext *ac = avctx->priv_data;
-    const MPEG4AudioConfig *const m4ac = &ac->oc[1].m4ac;
-    ChannelElement *che;
-    int err, i;
-    int samples = m4ac->frame_length_short ? 960 : 1024;
-    int chan_config = m4ac->chan_config;
-    int aot = m4ac->object_type;
-
-    if (aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD)
-        samples >>= 1;
-
-    ac->frame = data;
-
-    if ((err = frame_configure_elements(avctx)) < 0)
-        return err;
-
-    // The FF_PROFILE_AAC_* defines are all object_type - 1
-    // This may lead to an undefined profile being signaled
-    ac->avctx->profile = aot - 1;
-
-    ac->tags_mapped = 0;
-
-    if (chan_config < 0 || (chan_config >= 8 && chan_config < 11) || chan_config >= 13) {
-        avpriv_request_sample(avctx, "Unknown ER channel configuration %d",
-                              chan_config);
-        return AVERROR_INVALIDDATA;
-    }
-    for (i = 0; i < tags_per_config[chan_config]; i++) {
-        const int elem_type = aac_channel_layout_map[chan_config-1][i][0];
-        const int elem_id   = aac_channel_layout_map[chan_config-1][i][1];
-        if (!(che=get_che(ac, elem_type, elem_id))) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "channel element %d.%d is not allocated\n",
-                   elem_type, elem_id);
-            return AVERROR_INVALIDDATA;
-        }
-        if (aot != AOT_ER_AAC_ELD)
-            skip_bits(gb, 4);
-        switch (elem_type) {
-        case TYPE_SCE:
-            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
-            break;
-        case TYPE_CPE:
-            err = decode_cpe(ac, gb, che);
-            break;
-        case TYPE_LFE:
-            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
-            break;
-        }
-        if (err < 0)
-            return err;
-    }
-
-    spectral_to_sample(ac);
-
-    ac->frame->nb_samples = samples;
-    ac->frame->sample_rate = avctx->sample_rate;
-    *got_frame_ptr = 1;
-
-    skip_bits_long(gb, get_bits_left(gb));
-    return 0;
-}
-
-static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
-                                int *got_frame_ptr, GetBitContext *gb)
-{
-    AACContext *ac = avctx->priv_data;
-    ChannelElement *che = NULL, *che_prev = NULL;
-    enum RawDataBlockType elem_type, elem_type_prev = TYPE_END;
-    int err, elem_id;
-    int samples = 0, multiplier, audio_found = 0, pce_found = 0;
-
-    ac->frame = data;
-
-    if (show_bits(gb, 12) == 0xfff) {
-        if ((err = parse_adts_frame_header(ac, gb)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Error decoding AAC frame header.\n");
-            goto fail;
-        }
-        if (ac->oc[1].m4ac.sampling_index > 12) {
-            av_log(ac->avctx, AV_LOG_ERROR, "invalid sampling rate index %d\n", ac->oc[1].m4ac.sampling_index);
-            err = AVERROR_INVALIDDATA;
-            goto fail;
-        }
-    }
-
-    if (avctx->channels)
-        if ((err = frame_configure_elements(avctx)) < 0)
-            goto fail;
-
-    // The FF_PROFILE_AAC_* defines are all object_type - 1
-    // This may lead to an undefined profile being signaled
-    ac->avctx->profile = ac->oc[1].m4ac.object_type - 1;
-
-    ac->tags_mapped = 0;
-    // parse
-    while ((elem_type = get_bits(gb, 3)) != TYPE_END) {
-        elem_id = get_bits(gb, 4);
-
-        if (!avctx->channels && elem_type != TYPE_PCE) {
-            err = AVERROR_INVALIDDATA;
-            goto fail;
-        }
-
-        if (elem_type < TYPE_DSE) {
-            if (!(che=get_che(ac, elem_type, elem_id))) {
-                av_log(ac->avctx, AV_LOG_ERROR, "channel element %d.%d is not allocated\n",
-                       elem_type, elem_id);
-                err = AVERROR_INVALIDDATA;
-                goto fail;
-            }
-            samples = 1024;
-        }
-
-        switch (elem_type) {
-
-        case TYPE_SCE:
-            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
-            audio_found = 1;
-            break;
-
-        case TYPE_CPE:
-            err = decode_cpe(ac, gb, che);
-            audio_found = 1;
-            break;
-
-        case TYPE_CCE:
-            err = decode_cce(ac, gb, che);
-            break;
-
-        case TYPE_LFE:
-            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
-            audio_found = 1;
-            break;
-
-        case TYPE_DSE:
-            err = skip_data_stream_element(ac, gb);
-            break;
-
-        case TYPE_PCE: {
-            uint8_t layout_map[MAX_ELEM_ID*4][3];
-            int tags;
-            push_output_configuration(ac);
-            tags = decode_pce(avctx, &ac->oc[1].m4ac, layout_map, gb);
-            if (tags < 0) {
-                err = tags;
-                break;
-            }
-            if (pce_found) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "Not evaluating a further program_config_element as this construct is dubious at best.\n");
-                pop_output_configuration(ac);
-            } else {
-                err = output_configure(ac, layout_map, tags, OC_TRIAL_PCE, 1);
-                pce_found = 1;
-            }
-            break;
-        }
-
-        case TYPE_FIL:
-            if (elem_id == 15)
-                elem_id += get_bits(gb, 8) - 1;
-            if (get_bits_left(gb) < 8 * elem_id) {
-                    av_log(avctx, AV_LOG_ERROR, overread_err);
-                    err = AVERROR_INVALIDDATA;
-                    goto fail;
-            }
-            while (elem_id > 0)
-                elem_id -= decode_extension_payload(ac, gb, elem_id, che_prev, elem_type_prev);
-            err = 0; /* FIXME */
-            break;
-
-        default:
-            err = AVERROR_BUG; /* should not happen, but keeps compiler happy */
-            break;
-        }
-
-        che_prev       = che;
-        elem_type_prev = elem_type;
-
-        if (err)
-            goto fail;
-
-        if (get_bits_left(gb) < 3) {
-            av_log(avctx, AV_LOG_ERROR, overread_err);
-            err = AVERROR_INVALIDDATA;
-            goto fail;
-        }
-    }
-
-    if (!avctx->channels) {
-        *got_frame_ptr = 0;
-        return 0;
-    }
-
-    spectral_to_sample(ac);
-
-    multiplier = (ac->oc[1].m4ac.sbr == 1) ? ac->oc[1].m4ac.ext_sample_rate > ac->oc[1].m4ac.sample_rate : 0;
-    samples <<= multiplier;
-
-    if (ac->oc[1].status && audio_found) {
-        avctx->sample_rate = ac->oc[1].m4ac.sample_rate << multiplier;
-        avctx->frame_size = samples;
-        ac->oc[1].status = OC_LOCKED;
-    }
-
-    if (samples) {
-        ac->frame->nb_samples = samples;
-        ac->frame->sample_rate = avctx->sample_rate;
-    }
-    *got_frame_ptr = !!samples;
-
-    return 0;
-fail:
-    pop_output_configuration(ac);
-    return err;
-}
-
-static int aac_decode_frame(AVCodecContext *avctx, void *data,
-                            int *got_frame_ptr, AVPacket *avpkt)
-{
-    AACContext *ac = avctx->priv_data;
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
-    GetBitContext gb;
-    int buf_consumed;
-    int buf_offset;
-    int err;
-    int new_extradata_size;
-    const uint8_t *new_extradata = av_packet_get_side_data(avpkt,
-                                       AV_PKT_DATA_NEW_EXTRADATA,
-                                       &new_extradata_size);
-
-    if (new_extradata) {
-        av_free(avctx->extradata);
-        avctx->extradata = av_mallocz(new_extradata_size +
-                                      AV_INPUT_BUFFER_PADDING_SIZE);
-        if (!avctx->extradata)
-            return AVERROR(ENOMEM);
-        avctx->extradata_size = new_extradata_size;
-        memcpy(avctx->extradata, new_extradata, new_extradata_size);
-        push_output_configuration(ac);
-        if (decode_audio_specific_config(ac, ac->avctx, &ac->oc[1].m4ac,
-                                         avctx->extradata,
-                                         avctx->extradata_size*8, 1) < 0) {
-            pop_output_configuration(ac);
-            return AVERROR_INVALIDDATA;
-        }
-    }
-
-    if ((err = init_get_bits(&gb, buf, buf_size * 8)) < 0)
-        return err;
-
-    switch (ac->oc[1].m4ac.object_type) {
-    case AOT_ER_AAC_LC:
-    case AOT_ER_AAC_LTP:
-    case AOT_ER_AAC_LD:
-    case AOT_ER_AAC_ELD:
-        err = aac_decode_er_frame(avctx, data, got_frame_ptr, &gb);
-        break;
-    default:
-        err = aac_decode_frame_int(avctx, data, got_frame_ptr, &gb);
-    }
-    if (err < 0)
-        return err;
-
-    buf_consumed = (get_bits_count(&gb) + 7) >> 3;
-    for (buf_offset = buf_consumed; buf_offset < buf_size; buf_offset++)
-        if (buf[buf_offset])
-            break;
-
-    return buf_size > buf_offset ? buf_consumed : buf_size;
-}
-
-static av_cold int aac_decode_close(AVCodecContext *avctx)
-{
-    AACContext *ac = avctx->priv_data;
-    int i, type;
-
-    for (i = 0; i < MAX_ELEM_ID; i++) {
-        for (type = 0; type < 4; type++) {
-            if (ac->che[type][i])
-                ff_aac_sbr_ctx_close(&ac->che[type][i]->sbr);
-            av_freep(&ac->che[type][i]);
-        }
-    }
-
-    ff_mdct_end(&ac->mdct);
-    ff_mdct_end(&ac->mdct_small);
-    ff_mdct_end(&ac->mdct_ld);
-    ff_mdct_end(&ac->mdct_ltp);
-    ff_imdct15_uninit(&ac->mdct480);
-    return 0;
-}
-
+#include "aacdec_template.c"
 
 #define LOAS_SYNC_WORD   0x2b7       ///< 11 bits LOAS sync word
 
 struct LATMContext {
     AACContext aac_ctx;     ///< containing AACContext
-    int initialized;        ///< initilized after a valid extradata was seen
+    int initialized;        ///< initialized after a valid extradata was seen
 
     // parser data
     int audio_mux_version_A; ///< LATM syntax version
@@ -3084,7 +311,11 @@ static int latm_decode_audio_specific_config(struct LATMContext *latmctx,
         ac->oc[1].m4ac.sample_rate != m4ac.sample_rate ||
         ac->oc[1].m4ac.chan_config != m4ac.chan_config) {
 
-        av_log(avctx, AV_LOG_INFO, "audio config changed\n");
+        if(latmctx->initialized) {
+            av_log(avctx, AV_LOG_INFO, "audio config changed\n");
+        } else {
+            av_log(avctx, AV_LOG_DEBUG, "initializing latmctx\n");
+        }
         latmctx->initialized = 0;
 
         esize = (bits_consumed+7) / 8;
@@ -3127,9 +358,9 @@ static int read_stream_mux_config(struct LATMContext *latmctx,
             return AVERROR_PATCHWELCOME;
         }
 
-        // for each program (which there is only on in DVB)
+        // for each program (which there is only one in DVB)
 
-        // for each layer (which there is only on in DVB)
+        // for each layer (which there is only one in DVB)
         if (get_bits(gb, 3)) {                   // numLayer
             avpriv_request_sample(latmctx->aac_ctx.avctx, "Multiple layers");
             return AVERROR_PATCHWELCOME;
@@ -3242,7 +473,7 @@ static int latm_decode_frame(AVCodecContext *avctx, void *out,
     int                 muxlength, err;
     GetBitContext       gb;
 
-    if ((err = init_get_bits(&gb, avpkt->data, avpkt->size * 8)) < 0)
+    if ((err = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
         return err;
 
     // check for LOAS sync word
@@ -3250,7 +481,7 @@ static int latm_decode_frame(AVCodecContext *avctx, void *out,
         return AVERROR_INVALIDDATA;
 
     muxlength = get_bits(&gb, 13) + 3;
-    // not enough data, the parser should have sorted this
+    // not enough data, the parser should have sorted this out
     if (muxlength > avpkt->size)
         return AVERROR_INVALIDDATA;
 
@@ -3265,7 +496,7 @@ static int latm_decode_frame(AVCodecContext *avctx, void *out,
             push_output_configuration(&latmctx->aac_ctx);
             if ((err = decode_audio_specific_config(
                     &latmctx->aac_ctx, avctx, &latmctx->aac_ctx.oc[1].m4ac,
-                    avctx->extradata, avctx->extradata_size*8, 1)) < 0) {
+                    avctx->extradata, avctx->extradata_size*8LL, 1)) < 0) {
                 pop_output_configuration(&latmctx->aac_ctx);
                 return err;
             }
@@ -3288,7 +519,7 @@ static int latm_decode_frame(AVCodecContext *avctx, void *out,
         err = aac_decode_er_frame(avctx, out, got_frame_ptr, &gb);
         break;
     default:
-        err = aac_decode_frame_int(avctx, out, got_frame_ptr, &gb);
+        err = aac_decode_frame_int(avctx, out, got_frame_ptr, &gb, avpkt);
     }
     if (err < 0)
         return err;
@@ -3307,7 +538,6 @@ static av_cold int latm_decode_init(AVCodecContext *avctx)
     return ret;
 }
 
-
 AVCodec ff_aac_decoder = {
     .name            = "aac",
     .long_name       = NULL_IF_CONFIG_SMALL("AAC (Advanced Audio Coding)"),
@@ -3322,6 +552,9 @@ AVCodec ff_aac_decoder = {
     },
     .capabilities    = AV_CODEC_CAP_CHANNEL_CONF | AV_CODEC_CAP_DR1,
     .channel_layouts = aac_channel_layout,
+    .flush = flush,
+    .priv_class      = &aac_decoder_class,
+    .profiles        = profiles,
 };
 
 /*
@@ -3343,4 +576,6 @@ AVCodec ff_aac_latm_decoder = {
     },
     .capabilities    = AV_CODEC_CAP_CHANNEL_CONF | AV_CODEC_CAP_DR1,
     .channel_layouts = aac_channel_layout,
+    .flush = flush,
+    .profiles        = profiles,
 };
diff --git a/libavcodec/aacdec_fixed.c b/libavcodec/aacdec_fixed.c
new file mode 100644
index 0000000000..875ef582f4
--- /dev/null
+++ b/libavcodec/aacdec_fixed.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2013
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * AAC decoder fixed-point implementation
+ *
+ * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
+ * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC decoder
+ * @author Oded Shimon  ( ods15 ods15 dyndns org )
+ * @author Maxim Gavrilov ( maxim.gavrilov gmail com )
+ *
+ * Fixed point implementation
+ * @author Stanislav Ocovaj ( stanislav.ocovaj imgtec com )
+ */
+
+#define FFT_FLOAT 0
+#define FFT_FIXED_32 1
+#define USE_FIXED 1
+
+#include "libavutil/fixed_dsp.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "fft.h"
+#include "lpc.h"
+#include "kbdwin.h"
+#include "sinewin.h"
+
+#include "aac.h"
+#include "aactab.h"
+#include "aacdectab.h"
+#include "cbrt_tablegen.h"
+#include "sbr.h"
+#include "aacsbr.h"
+#include "mpeg4audio.h"
+#include "aacadtsdec.h"
+#include "libavutil/intfloat.h"
+
+#include <math.h>
+#include <string.h>
+
+static av_always_inline void reset_predict_state(PredictorState *ps)
+{
+    ps->r0.mant   = 0;
+    ps->r0.exp   = 0;
+    ps->r1.mant   = 0;
+    ps->r1.exp   = 0;
+    ps->cor0.mant = 0;
+    ps->cor0.exp = 0;
+    ps->cor1.mant = 0;
+    ps->cor1.exp = 0;
+    ps->var0.mant = 0x20000000;
+    ps->var0.exp = 1;
+    ps->var1.mant = 0x20000000;
+    ps->var1.exp = 1;
+}
+
+static const int exp2tab[4] = { Q31(1.0000000000/2), Q31(1.1892071150/2), Q31(1.4142135624/2), Q31(1.6817928305/2) };  // 2^0, 2^0.25, 2^0.5, 2^0.75
+
+static inline int *DEC_SPAIR(int *dst, unsigned idx)
+{
+    dst[0] = (idx & 15) - 4;
+    dst[1] = (idx >> 4 & 15) - 4;
+
+    return dst + 2;
+}
+
+static inline int *DEC_SQUAD(int *dst, unsigned idx)
+{
+    dst[0] = (idx & 3) - 1;
+    dst[1] = (idx >> 2 & 3) - 1;
+    dst[2] = (idx >> 4 & 3) - 1;
+    dst[3] = (idx >> 6 & 3) - 1;
+
+    return dst + 4;
+}
+
+static inline int *DEC_UPAIR(int *dst, unsigned idx, unsigned sign)
+{
+    dst[0] = (idx & 15) * (1 - (sign & 0xFFFFFFFE));
+    dst[1] = (idx >> 4 & 15) * (1 - ((sign & 1) << 1));
+
+    return dst + 2;
+}
+
+static inline int *DEC_UQUAD(int *dst, unsigned idx, unsigned sign)
+{
+    unsigned nz = idx >> 12;
+
+    dst[0] = (idx & 3) * (1 + (((int)sign >> 31) << 1));
+    sign <<= nz & 1;
+    nz >>= 1;
+    dst[1] = (idx >> 2 & 3) * (1 + (((int)sign >> 31) << 1));
+    sign <<= nz & 1;
+    nz >>= 1;
+    dst[2] = (idx >> 4 & 3) * (1 + (((int)sign >> 31) << 1));
+    sign <<= nz & 1;
+    nz >>= 1;
+    dst[3] = (idx >> 6 & 3) * (1 + (((int)sign >> 31) << 1));
+
+    return dst + 4;
+}
+
+static void vector_pow43(int *coefs, int len)
+{
+    int i, coef;
+
+    for (i=0; i<len; i++) {
+        coef = coefs[i];
+        if (coef < 0)
+            coef = -(int)cbrt_tab[-coef];
+        else
+            coef = (int)cbrt_tab[coef];
+        coefs[i] = coef;
+    }
+}
+
+static void subband_scale(int *dst, int *src, int scale, int offset, int len)
+{
+    int ssign = scale < 0 ? -1 : 1;
+    int s = FFABS(scale);
+    unsigned int round;
+    int i, out, c = exp2tab[s & 3];
+
+    s = offset - (s >> 2);
+
+    if (s > 0) {
+        round = 1 << (s-1);
+        for (i=0; i<len; i++) {
+            out = (int)(((int64_t)src[i] * c) >> 32);
+            dst[i] = ((int)(out+round) >> s) * ssign;
+        }
+    }
+    else {
+        s = s + 32;
+        round = 1 << (s-1);
+        for (i=0; i<len; i++) {
+            out = (int)((int64_t)((int64_t)src[i] * c + round) >> s);
+            dst[i] = out * ssign;
+        }
+    }
+}
+
+static void noise_scale(int *coefs, int scale, int band_energy, int len)
+{
+    int ssign = scale < 0 ? -1 : 1;
+    int s = FFABS(scale);
+    unsigned int round;
+    int i, out, c = exp2tab[s & 3];
+    int nlz = 0;
+
+    while (band_energy > 0x7fff) {
+        band_energy >>= 1;
+        nlz++;
+    }
+    c /= band_energy;
+    s = 21 + nlz - (s >> 2);
+
+    if (s > 0) {
+        round = 1 << (s-1);
+        for (i=0; i<len; i++) {
+            out = (int)(((int64_t)coefs[i] * c) >> 32);
+            coefs[i] = ((int)(out+round) >> s) * ssign;
+        }
+    }
+    else {
+        s = s + 32;
+        round = 1 << (s-1);
+        for (i=0; i<len; i++) {
+            out = (int)((int64_t)((int64_t)coefs[i] * c + round) >> s);
+            coefs[i] = out * ssign;
+        }
+    }
+}
+
+static av_always_inline SoftFloat flt16_round(SoftFloat pf)
+{
+    SoftFloat tmp;
+    int s;
+
+    tmp.exp = pf.exp;
+    s = pf.mant >> 31;
+    tmp.mant = (pf.mant ^ s) - s;
+    tmp.mant = (tmp.mant + 0x00200000U) & 0xFFC00000U;
+    tmp.mant = (tmp.mant ^ s) - s;
+
+    return tmp;
+}
+
+static av_always_inline SoftFloat flt16_even(SoftFloat pf)
+{
+    SoftFloat tmp;
+    int s;
+
+    tmp.exp = pf.exp;
+    s = pf.mant >> 31;
+    tmp.mant = (pf.mant ^ s) - s;
+    tmp.mant = (tmp.mant + 0x001FFFFFU + (tmp.mant & 0x00400000U >> 16)) & 0xFFC00000U;
+    tmp.mant = (tmp.mant ^ s) - s;
+
+    return tmp;
+}
+
+static av_always_inline SoftFloat flt16_trunc(SoftFloat pf)
+{
+    SoftFloat pun;
+    int s;
+
+    pun.exp = pf.exp;
+    s = pf.mant >> 31;
+    pun.mant = (pf.mant ^ s) - s;
+    pun.mant = pun.mant & 0xFFC00000U;
+    pun.mant = (pun.mant ^ s) - s;
+
+    return pun;
+}
+
+static av_always_inline void predict(PredictorState *ps, int *coef,
+                                     int output_enable)
+{
+    const SoftFloat a     = { 1023410176, 0 };  // 61.0 / 64
+    const SoftFloat alpha = {  973078528, 0 };  // 29.0 / 32
+    SoftFloat e0, e1;
+    SoftFloat pv;
+    SoftFloat k1, k2;
+    SoftFloat   r0 = ps->r0,     r1 = ps->r1;
+    SoftFloat cor0 = ps->cor0, cor1 = ps->cor1;
+    SoftFloat var0 = ps->var0, var1 = ps->var1;
+    SoftFloat tmp;
+
+    if (var0.exp > 1 || (var0.exp == 1 && var0.mant > 0x20000000)) {
+        k1 = av_mul_sf(cor0, flt16_even(av_div_sf(a, var0)));
+    }
+    else {
+        k1.mant = 0;
+        k1.exp = 0;
+    }
+
+    if (var1.exp > 1 || (var1.exp == 1 && var1.mant > 0x20000000)) {
+        k2 = av_mul_sf(cor1, flt16_even(av_div_sf(a, var1)));
+    }
+    else {
+        k2.mant = 0;
+        k2.exp = 0;
+    }
+
+    tmp = av_mul_sf(k1, r0);
+    pv = flt16_round(av_add_sf(tmp, av_mul_sf(k2, r1)));
+    if (output_enable) {
+        int shift = 28 - pv.exp;
+
+        if (shift < 31)
+            *coef += (pv.mant + (1 << (shift - 1))) >> shift;
+    }
+
+    e0 = av_int2sf(*coef, 2);
+    e1 = av_sub_sf(e0, tmp);
+
+    ps->cor1 = flt16_trunc(av_add_sf(av_mul_sf(alpha, cor1), av_mul_sf(r1, e1)));
+    tmp = av_add_sf(av_mul_sf(r1, r1), av_mul_sf(e1, e1));
+    tmp.exp--;
+    ps->var1 = flt16_trunc(av_add_sf(av_mul_sf(alpha, var1), tmp));
+    ps->cor0 = flt16_trunc(av_add_sf(av_mul_sf(alpha, cor0), av_mul_sf(r0, e0)));
+    tmp = av_add_sf(av_mul_sf(r0, r0), av_mul_sf(e0, e0));
+    tmp.exp--;
+    ps->var0 = flt16_trunc(av_add_sf(av_mul_sf(alpha, var0), tmp));
+
+    ps->r1 = flt16_trunc(av_mul_sf(a, av_sub_sf(r0, av_mul_sf(k1, e0))));
+    ps->r0 = flt16_trunc(av_mul_sf(a, e0));
+}
+
+
+static const int cce_scale_fixed[8] = {
+    Q30(1.0),          //2^(0/8)
+    Q30(1.0905077327), //2^(1/8)
+    Q30(1.1892071150), //2^(2/8)
+    Q30(1.2968395547), //2^(3/8)
+    Q30(1.4142135624), //2^(4/8)
+    Q30(1.5422108254), //2^(5/8)
+    Q30(1.6817928305), //2^(6/8)
+    Q30(1.8340080864), //2^(7/8)
+};
+
+/**
+ * Apply dependent channel coupling (applied before IMDCT).
+ *
+ * @param   index   index into coupling gain array
+ */
+static void apply_dependent_coupling_fixed(AACContext *ac,
+                                     SingleChannelElement *target,
+                                     ChannelElement *cce, int index)
+{
+    IndividualChannelStream *ics = &cce->ch[0].ics;
+    const uint16_t *offsets = ics->swb_offset;
+    int *dest = target->coeffs;
+    const int *src = cce->ch[0].coeffs;
+    int g, i, group, k, idx = 0;
+    if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Dependent coupling is not supported together with LTP\n");
+        return;
+    }
+    for (g = 0; g < ics->num_window_groups; g++) {
+        for (i = 0; i < ics->max_sfb; i++, idx++) {
+            if (cce->ch[0].band_type[idx] != ZERO_BT) {
+                const int gain = cce->coup.gain[index][idx];
+                int shift, round, c, tmp;
+
+                if (gain < 0) {
+                    c = -cce_scale_fixed[-gain & 7];
+                    shift = (-gain-1024) >> 3;
+                }
+                else {
+                    c = cce_scale_fixed[gain & 7];
+                    shift = (gain-1024) >> 3;
+                }
+
+                if (shift < 0) {
+                    shift = -shift;
+                    round = 1 << (shift - 1);
+
+                    for (group = 0; group < ics->group_len[g]; group++) {
+                        for (k = offsets[i]; k < offsets[i + 1]; k++) {
+                            tmp = (int)(((int64_t)src[group * 128 + k] * c + \
+                                       (int64_t)0x1000000000) >> 37);
+                            dest[group * 128 + k] += (tmp + round) >> shift;
+                        }
+                    }
+                }
+                else {
+                    for (group = 0; group < ics->group_len[g]; group++) {
+                        for (k = offsets[i]; k < offsets[i + 1]; k++) {
+                            tmp = (int)(((int64_t)src[group * 128 + k] * c + \
+                                        (int64_t)0x1000000000) >> 37);
+                            dest[group * 128 + k] += tmp << shift;
+                        }
+                    }
+                }
+            }
+        }
+        dest += ics->group_len[g] * 128;
+        src  += ics->group_len[g] * 128;
+    }
+}
+
+/**
+ * Apply independent channel coupling (applied after IMDCT).
+ *
+ * @param   index   index into coupling gain array
+ */
+static void apply_independent_coupling_fixed(AACContext *ac,
+                                       SingleChannelElement *target,
+                                       ChannelElement *cce, int index)
+{
+    int i, c, shift, round, tmp;
+    const int gain = cce->coup.gain[index][0];
+    const int *src = cce->ch[0].ret;
+    int *dest = target->ret;
+    const int len = 1024 << (ac->oc[1].m4ac.sbr == 1);
+
+    c = cce_scale_fixed[gain & 7];
+    shift = (gain-1024) >> 3;
+    if (shift < 0) {
+        shift = -shift;
+        round = 1 << (shift - 1);
+
+        for (i = 0; i < len; i++) {
+            tmp = (int)(((int64_t)src[i] * c + (int64_t)0x1000000000) >> 37);
+            dest[i] += (tmp + round) >> shift;
+        }
+    }
+    else {
+      for (i = 0; i < len; i++) {
+          tmp = (int)(((int64_t)src[i] * c + (int64_t)0x1000000000) >> 37);
+          dest[i] += tmp << shift;
+      }
+    }
+}
+
+#include "aacdec_template.c"
+
+AVCodec ff_aac_fixed_decoder = {
+    .name            = "aac_fixed",
+    .long_name       = NULL_IF_CONFIG_SMALL("AAC (Advanced Audio Coding)"),
+    .type            = AVMEDIA_TYPE_AUDIO,
+    .id              = AV_CODEC_ID_AAC,
+    .priv_data_size  = sizeof(AACContext),
+    .init            = aac_decode_init,
+    .close           = aac_decode_close,
+    .decode          = aac_decode_frame,
+    .sample_fmts     = (const enum AVSampleFormat[]) {
+        AV_SAMPLE_FMT_S32P, AV_SAMPLE_FMT_NONE
+    },
+    .capabilities    = AV_CODEC_CAP_CHANNEL_CONF | AV_CODEC_CAP_DR1,
+    .channel_layouts = aac_channel_layout,
+    .flush = flush,
+};
diff --git a/libavcodec/aacdec_template.c b/libavcodec/aacdec_template.c
new file mode 100644
index 0000000000..c2d7d05650
--- /dev/null
+++ b/libavcodec/aacdec_template.c
@@ -0,0 +1,3229 @@
+/*
+ * AAC decoder
+ * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
+ * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
+ * Copyright (c) 2008-2013 Alex Converse <alex.converse@gmail.com>
+ *
+ * AAC LATM decoder
+ * Copyright (c) 2008-2010 Paul Kendall <paul@kcbbs.gen.nz>
+ * Copyright (c) 2010      Janne Grunau <janne-libav@jannau.net>
+ *
+ * AAC decoder fixed-point implementation
+ * Copyright (c) 2013
+ *      MIPS Technologies, Inc., California.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC decoder
+ * @author Oded Shimon  ( ods15 ods15 dyndns org )
+ * @author Maxim Gavrilov ( maxim.gavrilov gmail com )
+ *
+ * AAC decoder fixed-point implementation
+ * @author Stanislav Ocovaj ( stanislav.ocovaj imgtec com )
+ * @author Nedeljko Babic ( nedeljko.babic imgtec com )
+ */
+
+/*
+ * supported tools
+ *
+ * Support?                     Name
+ * N (code in SoC repo)         gain control
+ * Y                            block switching
+ * Y                            window shapes - standard
+ * N                            window shapes - Low Delay
+ * Y                            filterbank - standard
+ * N (code in SoC repo)         filterbank - Scalable Sample Rate
+ * Y                            Temporal Noise Shaping
+ * Y                            Long Term Prediction
+ * Y                            intensity stereo
+ * Y                            channel coupling
+ * Y                            frequency domain prediction
+ * Y                            Perceptual Noise Substitution
+ * Y                            Mid/Side stereo
+ * N                            Scalable Inverse AAC Quantization
+ * N                            Frequency Selective Switch
+ * N                            upsampling filter
+ * Y                            quantization & coding - AAC
+ * N                            quantization & coding - TwinVQ
+ * N                            quantization & coding - BSAC
+ * N                            AAC Error Resilience tools
+ * N                            Error Resilience payload syntax
+ * N                            Error Protection tool
+ * N                            CELP
+ * N                            Silence Compression
+ * N                            HVXC
+ * N                            HVXC 4kbits/s VR
+ * N                            Structured Audio tools
+ * N                            Structured Audio Sample Bank Format
+ * N                            MIDI
+ * N                            Harmonic and Individual Lines plus Noise
+ * N                            Text-To-Speech Interface
+ * Y                            Spectral Band Replication
+ * Y (not in this code)         Layer-1
+ * Y (not in this code)         Layer-2
+ * Y (not in this code)         Layer-3
+ * N                            SinuSoidal Coding (Transient, Sinusoid, Noise)
+ * Y                            Parametric Stereo
+ * N                            Direct Stream Transfer
+ * Y  (not in fixed point code) Enhanced AAC Low Delay (ER AAC ELD)
+ *
+ * Note: - HE AAC v1 comprises LC AAC with Spectral Band Replication.
+ *       - HE AAC v2 comprises LC AAC with Spectral Band Replication and
+           Parametric Stereo.
+ */
+
+static VLC vlc_scalefactors;
+static VLC vlc_spectral[11];
+
+static int output_configure(AACContext *ac,
+                            uint8_t layout_map[MAX_ELEM_ID*4][3], int tags,
+                            enum OCStatus oc_type, int get_new_frame);
+
+#define overread_err "Input buffer exhausted before END element found\n"
+
+static int count_channels(uint8_t (*layout)[3], int tags)
+{
+    int i, sum = 0;
+    for (i = 0; i < tags; i++) {
+        int syn_ele = layout[i][0];
+        int pos     = layout[i][2];
+        sum += (1 + (syn_ele == TYPE_CPE)) *
+               (pos != AAC_CHANNEL_OFF && pos != AAC_CHANNEL_CC);
+    }
+    return sum;
+}
+
+/**
+ * Check for the channel element in the current channel position configuration.
+ * If it exists, make sure the appropriate element is allocated and map the
+ * channel order to match the internal FFmpeg channel layout.
+ *
+ * @param   che_pos current channel position configuration
+ * @param   type channel element type
+ * @param   id channel element id
+ * @param   channels count of the number of channels in the configuration
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static av_cold int che_configure(AACContext *ac,
+                                 enum ChannelPosition che_pos,
+                                 int type, int id, int *channels)
+{
+    if (*channels >= MAX_CHANNELS)
+        return AVERROR_INVALIDDATA;
+    if (che_pos) {
+        if (!ac->che[type][id]) {
+            if (!(ac->che[type][id] = av_mallocz(sizeof(ChannelElement))))
+                return AVERROR(ENOMEM);
+            AAC_RENAME(ff_aac_sbr_ctx_init)(ac, &ac->che[type][id]->sbr);
+        }
+        if (type != TYPE_CCE) {
+            if (*channels >= MAX_CHANNELS - (type == TYPE_CPE || (type == TYPE_SCE && ac->oc[1].m4ac.ps == 1))) {
+                av_log(ac->avctx, AV_LOG_ERROR, "Too many channels\n");
+                return AVERROR_INVALIDDATA;
+            }
+            ac->output_element[(*channels)++] = &ac->che[type][id]->ch[0];
+            if (type == TYPE_CPE ||
+                (type == TYPE_SCE && ac->oc[1].m4ac.ps == 1)) {
+                ac->output_element[(*channels)++] = &ac->che[type][id]->ch[1];
+            }
+        }
+    } else {
+        if (ac->che[type][id])
+            AAC_RENAME(ff_aac_sbr_ctx_close)(&ac->che[type][id]->sbr);
+        av_freep(&ac->che[type][id]);
+    }
+    return 0;
+}
+
+static int frame_configure_elements(AVCodecContext *avctx)
+{
+    AACContext *ac = avctx->priv_data;
+    int type, id, ch, ret;
+
+    /* set channel pointers to internal buffers by default */
+    for (type = 0; type < 4; type++) {
+        for (id = 0; id < MAX_ELEM_ID; id++) {
+            ChannelElement *che = ac->che[type][id];
+            if (che) {
+                che->ch[0].ret = che->ch[0].ret_buf;
+                che->ch[1].ret = che->ch[1].ret_buf;
+            }
+        }
+    }
+
+    /* get output buffer */
+    av_frame_unref(ac->frame);
+    if (!avctx->channels)
+        return 1;
+
+    ac->frame->nb_samples = 2048;
+    if ((ret = ff_get_buffer(avctx, ac->frame, 0)) < 0)
+        return ret;
+
+    /* map output channel pointers to AVFrame data */
+    for (ch = 0; ch < avctx->channels; ch++) {
+        if (ac->output_element[ch])
+            ac->output_element[ch]->ret = (INTFLOAT *)ac->frame->extended_data[ch];
+    }
+
+    return 0;
+}
+
+struct elem_to_channel {
+    uint64_t av_position;
+    uint8_t syn_ele;
+    uint8_t elem_id;
+    uint8_t aac_position;
+};
+
+static int assign_pair(struct elem_to_channel e2c_vec[MAX_ELEM_ID],
+                       uint8_t (*layout_map)[3], int offset, uint64_t left,
+                       uint64_t right, int pos)
+{
+    if (layout_map[offset][0] == TYPE_CPE) {
+        e2c_vec[offset] = (struct elem_to_channel) {
+            .av_position  = left | right,
+            .syn_ele      = TYPE_CPE,
+            .elem_id      = layout_map[offset][1],
+            .aac_position = pos
+        };
+        return 1;
+    } else {
+        e2c_vec[offset] = (struct elem_to_channel) {
+            .av_position  = left,
+            .syn_ele      = TYPE_SCE,
+            .elem_id      = layout_map[offset][1],
+            .aac_position = pos
+        };
+        e2c_vec[offset + 1] = (struct elem_to_channel) {
+            .av_position  = right,
+            .syn_ele      = TYPE_SCE,
+            .elem_id      = layout_map[offset + 1][1],
+            .aac_position = pos
+        };
+        return 2;
+    }
+}
+
+static int count_paired_channels(uint8_t (*layout_map)[3], int tags, int pos,
+                                 int *current)
+{
+    int num_pos_channels = 0;
+    int first_cpe        = 0;
+    int sce_parity       = 0;
+    int i;
+    for (i = *current; i < tags; i++) {
+        if (layout_map[i][2] != pos)
+            break;
+        if (layout_map[i][0] == TYPE_CPE) {
+            if (sce_parity) {
+                if (pos == AAC_CHANNEL_FRONT && !first_cpe) {
+                    sce_parity = 0;
+                } else {
+                    return -1;
+                }
+            }
+            num_pos_channels += 2;
+            first_cpe         = 1;
+        } else {
+            num_pos_channels++;
+            sce_parity ^= 1;
+        }
+    }
+    if (sce_parity &&
+        ((pos == AAC_CHANNEL_FRONT && first_cpe) || pos == AAC_CHANNEL_SIDE))
+        return -1;
+    *current = i;
+    return num_pos_channels;
+}
+
+static uint64_t sniff_channel_order(uint8_t (*layout_map)[3], int tags)
+{
+    int i, n, total_non_cc_elements;
+    struct elem_to_channel e2c_vec[4 * MAX_ELEM_ID] = { { 0 } };
+    int num_front_channels, num_side_channels, num_back_channels;
+    uint64_t layout;
+
+    if (FF_ARRAY_ELEMS(e2c_vec) < tags)
+        return 0;
+
+    i = 0;
+    num_front_channels =
+        count_paired_channels(layout_map, tags, AAC_CHANNEL_FRONT, &i);
+    if (num_front_channels < 0)
+        return 0;
+    num_side_channels =
+        count_paired_channels(layout_map, tags, AAC_CHANNEL_SIDE, &i);
+    if (num_side_channels < 0)
+        return 0;
+    num_back_channels =
+        count_paired_channels(layout_map, tags, AAC_CHANNEL_BACK, &i);
+    if (num_back_channels < 0)
+        return 0;
+
+    if (num_side_channels == 0 && num_back_channels >= 4) {
+        num_side_channels = 2;
+        num_back_channels -= 2;
+    }
+
+    i = 0;
+    if (num_front_channels & 1) {
+        e2c_vec[i] = (struct elem_to_channel) {
+            .av_position  = AV_CH_FRONT_CENTER,
+            .syn_ele      = TYPE_SCE,
+            .elem_id      = layout_map[i][1],
+            .aac_position = AAC_CHANNEL_FRONT
+        };
+        i++;
+        num_front_channels--;
+    }
+    if (num_front_channels >= 4) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         AV_CH_FRONT_LEFT_OF_CENTER,
+                         AV_CH_FRONT_RIGHT_OF_CENTER,
+                         AAC_CHANNEL_FRONT);
+        num_front_channels -= 2;
+    }
+    if (num_front_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         AV_CH_FRONT_LEFT,
+                         AV_CH_FRONT_RIGHT,
+                         AAC_CHANNEL_FRONT);
+        num_front_channels -= 2;
+    }
+    while (num_front_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         UINT64_MAX,
+                         UINT64_MAX,
+                         AAC_CHANNEL_FRONT);
+        num_front_channels -= 2;
+    }
+
+    if (num_side_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         AV_CH_SIDE_LEFT,
+                         AV_CH_SIDE_RIGHT,
+                         AAC_CHANNEL_FRONT);
+        num_side_channels -= 2;
+    }
+    while (num_side_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         UINT64_MAX,
+                         UINT64_MAX,
+                         AAC_CHANNEL_SIDE);
+        num_side_channels -= 2;
+    }
+
+    while (num_back_channels >= 4) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         UINT64_MAX,
+                         UINT64_MAX,
+                         AAC_CHANNEL_BACK);
+        num_back_channels -= 2;
+    }
+    if (num_back_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         AV_CH_BACK_LEFT,
+                         AV_CH_BACK_RIGHT,
+                         AAC_CHANNEL_BACK);
+        num_back_channels -= 2;
+    }
+    if (num_back_channels) {
+        e2c_vec[i] = (struct elem_to_channel) {
+            .av_position  = AV_CH_BACK_CENTER,
+            .syn_ele      = TYPE_SCE,
+            .elem_id      = layout_map[i][1],
+            .aac_position = AAC_CHANNEL_BACK
+        };
+        i++;
+        num_back_channels--;
+    }
+
+    if (i < tags && layout_map[i][2] == AAC_CHANNEL_LFE) {
+        e2c_vec[i] = (struct elem_to_channel) {
+            .av_position  = AV_CH_LOW_FREQUENCY,
+            .syn_ele      = TYPE_LFE,
+            .elem_id      = layout_map[i][1],
+            .aac_position = AAC_CHANNEL_LFE
+        };
+        i++;
+    }
+    while (i < tags && layout_map[i][2] == AAC_CHANNEL_LFE) {
+        e2c_vec[i] = (struct elem_to_channel) {
+            .av_position  = UINT64_MAX,
+            .syn_ele      = TYPE_LFE,
+            .elem_id      = layout_map[i][1],
+            .aac_position = AAC_CHANNEL_LFE
+        };
+        i++;
+    }
+
+    // Must choose a stable sort
+    total_non_cc_elements = n = i;
+    do {
+        int next_n = 0;
+        for (i = 1; i < n; i++)
+            if (e2c_vec[i - 1].av_position > e2c_vec[i].av_position) {
+                FFSWAP(struct elem_to_channel, e2c_vec[i - 1], e2c_vec[i]);
+                next_n = i;
+            }
+        n = next_n;
+    } while (n > 0);
+
+    layout = 0;
+    for (i = 0; i < total_non_cc_elements; i++) {
+        layout_map[i][0] = e2c_vec[i].syn_ele;
+        layout_map[i][1] = e2c_vec[i].elem_id;
+        layout_map[i][2] = e2c_vec[i].aac_position;
+        if (e2c_vec[i].av_position != UINT64_MAX) {
+            layout |= e2c_vec[i].av_position;
+        }
+    }
+
+    return layout;
+}
+
+/**
+ * Save current output configuration if and only if it has been locked.
+ */
+static void push_output_configuration(AACContext *ac) {
+    if (ac->oc[1].status == OC_LOCKED || ac->oc[0].status == OC_NONE) {
+        ac->oc[0] = ac->oc[1];
+    }
+    ac->oc[1].status = OC_NONE;
+}
+
+/**
+ * Restore the previous output configuration if and only if the current
+ * configuration is unlocked.
+ */
+static void pop_output_configuration(AACContext *ac) {
+    if (ac->oc[1].status != OC_LOCKED && ac->oc[0].status != OC_NONE) {
+        ac->oc[1] = ac->oc[0];
+        ac->avctx->channels = ac->oc[1].channels;
+        ac->avctx->channel_layout = ac->oc[1].channel_layout;
+        output_configure(ac, ac->oc[1].layout_map, ac->oc[1].layout_map_tags,
+                         ac->oc[1].status, 0);
+    }
+}
+
+/**
+ * Configure output channel order based on the current program
+ * configuration element.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int output_configure(AACContext *ac,
+                            uint8_t layout_map[MAX_ELEM_ID * 4][3], int tags,
+                            enum OCStatus oc_type, int get_new_frame)
+{
+    AVCodecContext *avctx = ac->avctx;
+    int i, channels = 0, ret;
+    uint64_t layout = 0;
+    uint8_t id_map[TYPE_END][MAX_ELEM_ID] = {{ 0 }};
+    uint8_t type_counts[TYPE_END] = { 0 };
+
+    if (ac->oc[1].layout_map != layout_map) {
+        memcpy(ac->oc[1].layout_map, layout_map, tags * sizeof(layout_map[0]));
+        ac->oc[1].layout_map_tags = tags;
+    }
+    for (i = 0; i < tags; i++) {
+        int type =         layout_map[i][0];
+        int id =           layout_map[i][1];
+        id_map[type][id] = type_counts[type]++;
+    }
+    // Try to sniff a reasonable channel order, otherwise output the
+    // channels in the order the PCE declared them.
+    if (avctx->request_channel_layout != AV_CH_LAYOUT_NATIVE)
+        layout = sniff_channel_order(layout_map, tags);
+    for (i = 0; i < tags; i++) {
+        int type =     layout_map[i][0];
+        int id =       layout_map[i][1];
+        int iid =      id_map[type][id];
+        int position = layout_map[i][2];
+        // Allocate or free elements depending on if they are in the
+        // current program configuration.
+        ret = che_configure(ac, position, type, iid, &channels);
+        if (ret < 0)
+            return ret;
+        ac->tag_che_map[type][id] = ac->che[type][iid];
+    }
+    if (ac->oc[1].m4ac.ps == 1 && channels == 2) {
+        if (layout == AV_CH_FRONT_CENTER) {
+            layout = AV_CH_FRONT_LEFT|AV_CH_FRONT_RIGHT;
+        } else {
+            layout = 0;
+        }
+    }
+
+    if (layout) avctx->channel_layout = layout;
+                            ac->oc[1].channel_layout = layout;
+    avctx->channels       = ac->oc[1].channels       = channels;
+    ac->oc[1].status = oc_type;
+
+    if (get_new_frame) {
+        if ((ret = frame_configure_elements(ac->avctx)) < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+static void flush(AVCodecContext *avctx)
+{
+    AACContext *ac= avctx->priv_data;
+    int type, i, j;
+
+    for (type = 3; type >= 0; type--) {
+        for (i = 0; i < MAX_ELEM_ID; i++) {
+            ChannelElement *che = ac->che[type][i];
+            if (che) {
+                for (j = 0; j <= 1; j++) {
+                    memset(che->ch[j].saved, 0, sizeof(che->ch[j].saved));
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Set up channel positions based on a default channel configuration
+ * as specified in table 1.17.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int set_default_channel_config(AVCodecContext *avctx,
+                                      uint8_t (*layout_map)[3],
+                                      int *tags,
+                                      int channel_config)
+{
+    if (channel_config < 1 || (channel_config > 7 && channel_config < 11) ||
+        channel_config > 12) {
+        av_log(avctx, AV_LOG_ERROR,
+               "invalid default channel configuration (%d)\n",
+               channel_config);
+        return AVERROR_INVALIDDATA;
+    }
+    *tags = tags_per_config[channel_config];
+    memcpy(layout_map, aac_channel_layout_map[channel_config - 1],
+           *tags * sizeof(*layout_map));
+
+    /*
+     * AAC specification has 7.1(wide) as a default layout for 8-channel streams.
+     * However, at least Nero AAC encoder encodes 7.1 streams using the default
+     * channel config 7, mapping the side channels of the original audio stream
+     * to the second AAC_CHANNEL_FRONT pair in the AAC stream. Similarly, e.g. FAAD
+     * decodes the second AAC_CHANNEL_FRONT pair as side channels, therefore decoding
+     * the incorrect streams as if they were correct (and as the encoder intended).
+     *
+     * As actual intended 7.1(wide) streams are very rare, default to assuming a
+     * 7.1 layout was intended.
+     */
+    if (channel_config == 7 && avctx->strict_std_compliance < FF_COMPLIANCE_STRICT) {
+        av_log(avctx, AV_LOG_INFO, "Assuming an incorrectly encoded 7.1 channel layout"
+               " instead of a spec-compliant 7.1(wide) layout, use -strict %d to decode"
+               " according to the specification instead.\n", FF_COMPLIANCE_STRICT);
+        layout_map[2][2] = AAC_CHANNEL_SIDE;
+    }
+
+    return 0;
+}
+
+static ChannelElement *get_che(AACContext *ac, int type, int elem_id)
+{
+    /* For PCE based channel configurations map the channels solely based
+     * on tags. */
+    if (!ac->oc[1].m4ac.chan_config) {
+        return ac->tag_che_map[type][elem_id];
+    }
+    // Allow single CPE stereo files to be signalled with mono configuration.
+    if (!ac->tags_mapped && type == TYPE_CPE &&
+        ac->oc[1].m4ac.chan_config == 1) {
+        uint8_t layout_map[MAX_ELEM_ID*4][3];
+        int layout_map_tags;
+        push_output_configuration(ac);
+
+        av_log(ac->avctx, AV_LOG_DEBUG, "mono with CPE\n");
+
+        if (set_default_channel_config(ac->avctx, layout_map,
+                                       &layout_map_tags, 2) < 0)
+            return NULL;
+        if (output_configure(ac, layout_map, layout_map_tags,
+                             OC_TRIAL_FRAME, 1) < 0)
+            return NULL;
+
+        ac->oc[1].m4ac.chan_config = 2;
+        ac->oc[1].m4ac.ps = 0;
+    }
+    // And vice-versa
+    if (!ac->tags_mapped && type == TYPE_SCE &&
+        ac->oc[1].m4ac.chan_config == 2) {
+        uint8_t layout_map[MAX_ELEM_ID * 4][3];
+        int layout_map_tags;
+        push_output_configuration(ac);
+
+        av_log(ac->avctx, AV_LOG_DEBUG, "stereo with SCE\n");
+
+        if (set_default_channel_config(ac->avctx, layout_map,
+                                       &layout_map_tags, 1) < 0)
+            return NULL;
+        if (output_configure(ac, layout_map, layout_map_tags,
+                             OC_TRIAL_FRAME, 1) < 0)
+            return NULL;
+
+        ac->oc[1].m4ac.chan_config = 1;
+        if (ac->oc[1].m4ac.sbr)
+            ac->oc[1].m4ac.ps = -1;
+    }
+    /* For indexed channel configurations map the channels solely based
+     * on position. */
+    switch (ac->oc[1].m4ac.chan_config) {
+    case 12:
+    case 7:
+        if (ac->tags_mapped == 3 && type == TYPE_CPE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][2];
+        }
+    case 11:
+        if (ac->tags_mapped == 2 &&
+            ac->oc[1].m4ac.chan_config == 11 &&
+            type == TYPE_SCE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
+        }
+    case 6:
+        /* Some streams incorrectly code 5.1 audio as
+         * SCE[0] CPE[0] CPE[1] SCE[1]
+         * instead of
+         * SCE[0] CPE[0] CPE[1] LFE[0].
+         * If we seem to have encountered such a stream, transfer
+         * the LFE[0] element to the SCE[1]'s mapping */
+        if (ac->tags_mapped == tags_per_config[ac->oc[1].m4ac.chan_config] - 1 && (type == TYPE_LFE || type == TYPE_SCE)) {
+            if (!ac->warned_remapping_once && (type != TYPE_LFE || elem_id != 0)) {
+                av_log(ac->avctx, AV_LOG_WARNING,
+                   "This stream seems to incorrectly report its last channel as %s[%d], mapping to LFE[0]\n",
+                   type == TYPE_SCE ? "SCE" : "LFE", elem_id);
+                ac->warned_remapping_once++;
+            }
+            ac->tags_mapped++;
+            return ac->tag_che_map[type][elem_id] = ac->che[TYPE_LFE][0];
+        }
+    case 5:
+        if (ac->tags_mapped == 2 && type == TYPE_CPE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][1];
+        }
+    case 4:
+        /* Some streams incorrectly code 4.0 audio as
+         * SCE[0] CPE[0] LFE[0]
+         * instead of
+         * SCE[0] CPE[0] SCE[1].
+         * If we seem to have encountered such a stream, transfer
+         * the SCE[1] element to the LFE[0]'s mapping */
+        if (ac->tags_mapped == tags_per_config[ac->oc[1].m4ac.chan_config] - 1 && (type == TYPE_LFE || type == TYPE_SCE)) {
+            if (!ac->warned_remapping_once && (type != TYPE_SCE || elem_id != 1)) {
+                av_log(ac->avctx, AV_LOG_WARNING,
+                   "This stream seems to incorrectly report its last channel as %s[%d], mapping to SCE[1]\n",
+                   type == TYPE_SCE ? "SCE" : "LFE", elem_id);
+                ac->warned_remapping_once++;
+            }
+            ac->tags_mapped++;
+            return ac->tag_che_map[type][elem_id] = ac->che[TYPE_SCE][1];
+        }
+        if (ac->tags_mapped == 2 &&
+            ac->oc[1].m4ac.chan_config == 4 &&
+            type == TYPE_SCE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
+        }
+    case 3:
+    case 2:
+        if (ac->tags_mapped == (ac->oc[1].m4ac.chan_config != 2) &&
+            type == TYPE_CPE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][0];
+        } else if (ac->oc[1].m4ac.chan_config == 2) {
+            return NULL;
+        }
+    case 1:
+        if (!ac->tags_mapped && type == TYPE_SCE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][0];
+        }
+    default:
+        return NULL;
+    }
+}
+
+/**
+ * Decode an array of 4 bit element IDs, optionally interleaved with a
+ * stereo/mono switching bit.
+ *
+ * @param type speaker type/position for these channels
+ */
+static void decode_channel_map(uint8_t layout_map[][3],
+                               enum ChannelPosition type,
+                               GetBitContext *gb, int n)
+{
+    while (n--) {
+        enum RawDataBlockType syn_ele;
+        switch (type) {
+        case AAC_CHANNEL_FRONT:
+        case AAC_CHANNEL_BACK:
+        case AAC_CHANNEL_SIDE:
+            syn_ele = get_bits1(gb);
+            break;
+        case AAC_CHANNEL_CC:
+            skip_bits1(gb);
+            syn_ele = TYPE_CCE;
+            break;
+        case AAC_CHANNEL_LFE:
+            syn_ele = TYPE_LFE;
+            break;
+        default:
+            // AAC_CHANNEL_OFF has no channel map
+            av_assert0(0);
+        }
+        layout_map[0][0] = syn_ele;
+        layout_map[0][1] = get_bits(gb, 4);
+        layout_map[0][2] = type;
+        layout_map++;
+    }
+}
+
+/**
+ * Decode program configuration element; reference: table 4.2.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_pce(AVCodecContext *avctx, MPEG4AudioConfig *m4ac,
+                      uint8_t (*layout_map)[3],
+                      GetBitContext *gb)
+{
+    int num_front, num_side, num_back, num_lfe, num_assoc_data, num_cc;
+    int sampling_index;
+    int comment_len;
+    int tags;
+
+    skip_bits(gb, 2);  // object_type
+
+    sampling_index = get_bits(gb, 4);
+    if (m4ac->sampling_index != sampling_index)
+        av_log(avctx, AV_LOG_WARNING,
+               "Sample rate index in program config element does not "
+               "match the sample rate index configured by the container.\n");
+
+    num_front       = get_bits(gb, 4);
+    num_side        = get_bits(gb, 4);
+    num_back        = get_bits(gb, 4);
+    num_lfe         = get_bits(gb, 2);
+    num_assoc_data  = get_bits(gb, 3);
+    num_cc          = get_bits(gb, 4);
+
+    if (get_bits1(gb))
+        skip_bits(gb, 4); // mono_mixdown_tag
+    if (get_bits1(gb))
+        skip_bits(gb, 4); // stereo_mixdown_tag
+
+    if (get_bits1(gb))
+        skip_bits(gb, 3); // mixdown_coeff_index and pseudo_surround
+
+    if (get_bits_left(gb) < 4 * (num_front + num_side + num_back + num_lfe + num_assoc_data + num_cc)) {
+        av_log(avctx, AV_LOG_ERROR, "decode_pce: " overread_err);
+        return -1;
+    }
+    decode_channel_map(layout_map       , AAC_CHANNEL_FRONT, gb, num_front);
+    tags = num_front;
+    decode_channel_map(layout_map + tags, AAC_CHANNEL_SIDE,  gb, num_side);
+    tags += num_side;
+    decode_channel_map(layout_map + tags, AAC_CHANNEL_BACK,  gb, num_back);
+    tags += num_back;
+    decode_channel_map(layout_map + tags, AAC_CHANNEL_LFE,   gb, num_lfe);
+    tags += num_lfe;
+
+    skip_bits_long(gb, 4 * num_assoc_data);
+
+    decode_channel_map(layout_map + tags, AAC_CHANNEL_CC,    gb, num_cc);
+    tags += num_cc;
+
+    align_get_bits(gb);
+
+    /* comment field, first byte is length */
+    comment_len = get_bits(gb, 8) * 8;
+    if (get_bits_left(gb) < comment_len) {
+        av_log(avctx, AV_LOG_ERROR, "decode_pce: " overread_err);
+        return AVERROR_INVALIDDATA;
+    }
+    skip_bits_long(gb, comment_len);
+    return tags;
+}
+
+/**
+ * Decode GA "General Audio" specific configuration; reference: table 4.1.
+ *
+ * @param   ac          pointer to AACContext, may be null
+ * @param   avctx       pointer to AVCCodecContext, used for logging
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_ga_specific_config(AACContext *ac, AVCodecContext *avctx,
+                                     GetBitContext *gb,
+                                     MPEG4AudioConfig *m4ac,
+                                     int channel_config)
+{
+    int extension_flag, ret, ep_config, res_flags;
+    uint8_t layout_map[MAX_ELEM_ID*4][3];
+    int tags = 0;
+
+    if (get_bits1(gb)) { // frameLengthFlag
+        avpriv_request_sample(avctx, "960/120 MDCT window");
+        return AVERROR_PATCHWELCOME;
+    }
+    m4ac->frame_length_short = 0;
+
+    if (get_bits1(gb))       // dependsOnCoreCoder
+        skip_bits(gb, 14);   // coreCoderDelay
+    extension_flag = get_bits1(gb);
+
+    if (m4ac->object_type == AOT_AAC_SCALABLE ||
+        m4ac->object_type == AOT_ER_AAC_SCALABLE)
+        skip_bits(gb, 3);     // layerNr
+
+    if (channel_config == 0) {
+        skip_bits(gb, 4);  // element_instance_tag
+        tags = decode_pce(avctx, m4ac, layout_map, gb);
+        if (tags < 0)
+            return tags;
+    } else {
+        if ((ret = set_default_channel_config(avctx, layout_map,
+                                              &tags, channel_config)))
+            return ret;
+    }
+
+    if (count_channels(layout_map, tags) > 1) {
+        m4ac->ps = 0;
+    } else if (m4ac->sbr == 1 && m4ac->ps == -1)
+        m4ac->ps = 1;
+
+    if (ac && (ret = output_configure(ac, layout_map, tags, OC_GLOBAL_HDR, 0)))
+        return ret;
+
+    if (extension_flag) {
+        switch (m4ac->object_type) {
+        case AOT_ER_BSAC:
+            skip_bits(gb, 5);    // numOfSubFrame
+            skip_bits(gb, 11);   // layer_length
+            break;
+        case AOT_ER_AAC_LC:
+        case AOT_ER_AAC_LTP:
+        case AOT_ER_AAC_SCALABLE:
+        case AOT_ER_AAC_LD:
+            res_flags = get_bits(gb, 3);
+            if (res_flags) {
+                avpriv_report_missing_feature(avctx,
+                                              "AAC data resilience (flags %x)",
+                                              res_flags);
+                return AVERROR_PATCHWELCOME;
+            }
+            break;
+        }
+        skip_bits1(gb);    // extensionFlag3 (TBD in version 3)
+    }
+    switch (m4ac->object_type) {
+    case AOT_ER_AAC_LC:
+    case AOT_ER_AAC_LTP:
+    case AOT_ER_AAC_SCALABLE:
+    case AOT_ER_AAC_LD:
+        ep_config = get_bits(gb, 2);
+        if (ep_config) {
+            avpriv_report_missing_feature(avctx,
+                                          "epConfig %d", ep_config);
+            return AVERROR_PATCHWELCOME;
+        }
+    }
+    return 0;
+}
+
+static int decode_eld_specific_config(AACContext *ac, AVCodecContext *avctx,
+                                     GetBitContext *gb,
+                                     MPEG4AudioConfig *m4ac,
+                                     int channel_config)
+{
+    int ret, ep_config, res_flags;
+    uint8_t layout_map[MAX_ELEM_ID*4][3];
+    int tags = 0;
+    const int ELDEXT_TERM = 0;
+
+    m4ac->ps  = 0;
+    m4ac->sbr = 0;
+#if USE_FIXED
+    if (get_bits1(gb)) { // frameLengthFlag
+        avpriv_request_sample(avctx, "960/120 MDCT window");
+        return AVERROR_PATCHWELCOME;
+    }
+#else
+    m4ac->frame_length_short = get_bits1(gb);
+#endif
+    res_flags = get_bits(gb, 3);
+    if (res_flags) {
+        avpriv_report_missing_feature(avctx,
+                                      "AAC data resilience (flags %x)",
+                                      res_flags);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (get_bits1(gb)) { // ldSbrPresentFlag
+        avpriv_report_missing_feature(avctx,
+                                      "Low Delay SBR");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    while (get_bits(gb, 4) != ELDEXT_TERM) {
+        int len = get_bits(gb, 4);
+        if (len == 15)
+            len += get_bits(gb, 8);
+        if (len == 15 + 255)
+            len += get_bits(gb, 16);
+        if (get_bits_left(gb) < len * 8 + 4) {
+            av_log(avctx, AV_LOG_ERROR, overread_err);
+            return AVERROR_INVALIDDATA;
+        }
+        skip_bits_long(gb, 8 * len);
+    }
+
+    if ((ret = set_default_channel_config(avctx, layout_map,
+                                          &tags, channel_config)))
+        return ret;
+
+    if (ac && (ret = output_configure(ac, layout_map, tags, OC_GLOBAL_HDR, 0)))
+        return ret;
+
+    ep_config = get_bits(gb, 2);
+    if (ep_config) {
+        avpriv_report_missing_feature(avctx,
+                                      "epConfig %d", ep_config);
+        return AVERROR_PATCHWELCOME;
+    }
+    return 0;
+}
+
+/**
+ * Decode audio specific configuration; reference: table 1.13.
+ *
+ * @param   ac          pointer to AACContext, may be null
+ * @param   avctx       pointer to AVCCodecContext, used for logging
+ * @param   m4ac        pointer to MPEG4AudioConfig, used for parsing
+ * @param   data        pointer to buffer holding an audio specific config
+ * @param   bit_size    size of audio specific config or data in bits
+ * @param   sync_extension look for an appended sync extension
+ *
+ * @return  Returns error status or number of consumed bits. <0 - error
+ */
+static int decode_audio_specific_config(AACContext *ac,
+                                        AVCodecContext *avctx,
+                                        MPEG4AudioConfig *m4ac,
+                                        const uint8_t *data, int64_t bit_size,
+                                        int sync_extension)
+{
+    GetBitContext gb;
+    int i, ret;
+
+    if (bit_size < 0 || bit_size > INT_MAX) {
+        av_log(avctx, AV_LOG_ERROR, "Audio specific config size is invalid\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ff_dlog(avctx, "audio specific config size %d\n", (int)bit_size >> 3);
+    for (i = 0; i < bit_size >> 3; i++)
+        ff_dlog(avctx, "%02x ", data[i]);
+    ff_dlog(avctx, "\n");
+
+    if ((ret = init_get_bits(&gb, data, bit_size)) < 0)
+        return ret;
+
+    if ((i = avpriv_mpeg4audio_get_config(m4ac, data, bit_size,
+                                          sync_extension)) < 0)
+        return AVERROR_INVALIDDATA;
+    if (m4ac->sampling_index > 12) {
+        av_log(avctx, AV_LOG_ERROR,
+               "invalid sampling rate index %d\n",
+               m4ac->sampling_index);
+        return AVERROR_INVALIDDATA;
+    }
+    if (m4ac->object_type == AOT_ER_AAC_LD &&
+        (m4ac->sampling_index < 3 || m4ac->sampling_index > 7)) {
+        av_log(avctx, AV_LOG_ERROR,
+               "invalid low delay sampling rate index %d\n",
+               m4ac->sampling_index);
+        return AVERROR_INVALIDDATA;
+    }
+
+    skip_bits_long(&gb, i);
+
+    switch (m4ac->object_type) {
+    case AOT_AAC_MAIN:
+    case AOT_AAC_LC:
+    case AOT_AAC_LTP:
+    case AOT_ER_AAC_LC:
+    case AOT_ER_AAC_LD:
+        if ((ret = decode_ga_specific_config(ac, avctx, &gb,
+                                            m4ac, m4ac->chan_config)) < 0)
+            return ret;
+        break;
+    case AOT_ER_AAC_ELD:
+        if ((ret = decode_eld_specific_config(ac, avctx, &gb,
+                                              m4ac, m4ac->chan_config)) < 0)
+            return ret;
+        break;
+    default:
+        avpriv_report_missing_feature(avctx,
+                                      "Audio object type %s%d",
+                                      m4ac->sbr == 1 ? "SBR+" : "",
+                                      m4ac->object_type);
+        return AVERROR(ENOSYS);
+    }
+
+    ff_dlog(avctx,
+            "AOT %d chan config %d sampling index %d (%d) SBR %d PS %d\n",
+            m4ac->object_type, m4ac->chan_config, m4ac->sampling_index,
+            m4ac->sample_rate, m4ac->sbr,
+            m4ac->ps);
+
+    return get_bits_count(&gb);
+}
+
+/**
+ * linear congruential pseudorandom number generator
+ *
+ * @param   previous_val    pointer to the current state of the generator
+ *
+ * @return  Returns a 32-bit pseudorandom integer
+ */
+static av_always_inline int lcg_random(unsigned previous_val)
+{
+    union { unsigned u; int s; } v = { previous_val * 1664525u + 1013904223 };
+    return v.s;
+}
+
+static void reset_all_predictors(PredictorState *ps)
+{
+    int i;
+    for (i = 0; i < MAX_PREDICTORS; i++)
+        reset_predict_state(&ps[i]);
+}
+
+static int sample_rate_idx (int rate)
+{
+         if (92017 <= rate) return 0;
+    else if (75132 <= rate) return 1;
+    else if (55426 <= rate) return 2;
+    else if (46009 <= rate) return 3;
+    else if (37566 <= rate) return 4;
+    else if (27713 <= rate) return 5;
+    else if (23004 <= rate) return 6;
+    else if (18783 <= rate) return 7;
+    else if (13856 <= rate) return 8;
+    else if (11502 <= rate) return 9;
+    else if (9391  <= rate) return 10;
+    else                    return 11;
+}
+
+static void reset_predictor_group(PredictorState *ps, int group_num)
+{
+    int i;
+    for (i = group_num - 1; i < MAX_PREDICTORS; i += 30)
+        reset_predict_state(&ps[i]);
+}
+
+#define AAC_INIT_VLC_STATIC(num, size)                                     \
+    INIT_VLC_STATIC(&vlc_spectral[num], 8, ff_aac_spectral_sizes[num],     \
+         ff_aac_spectral_bits[num], sizeof(ff_aac_spectral_bits[num][0]),  \
+                                    sizeof(ff_aac_spectral_bits[num][0]),  \
+        ff_aac_spectral_codes[num], sizeof(ff_aac_spectral_codes[num][0]), \
+                                    sizeof(ff_aac_spectral_codes[num][0]), \
+        size);
+
+static void aacdec_init(AACContext *ac);
+
+static av_cold int aac_decode_init(AVCodecContext *avctx)
+{
+    AACContext *ac = avctx->priv_data;
+    int ret;
+
+    ac->avctx = avctx;
+    ac->oc[1].m4ac.sample_rate = avctx->sample_rate;
+
+    aacdec_init(ac);
+#if USE_FIXED
+    avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
+#else
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+#endif /* USE_FIXED */
+
+    if (avctx->extradata_size > 0) {
+        if ((ret = decode_audio_specific_config(ac, ac->avctx, &ac->oc[1].m4ac,
+                                                avctx->extradata,
+                                                avctx->extradata_size * 8LL,
+                                                1)) < 0)
+            return ret;
+    } else {
+        int sr, i;
+        uint8_t layout_map[MAX_ELEM_ID*4][3];
+        int layout_map_tags;
+
+        sr = sample_rate_idx(avctx->sample_rate);
+        ac->oc[1].m4ac.sampling_index = sr;
+        ac->oc[1].m4ac.channels = avctx->channels;
+        ac->oc[1].m4ac.sbr = -1;
+        ac->oc[1].m4ac.ps = -1;
+
+        for (i = 0; i < FF_ARRAY_ELEMS(ff_mpeg4audio_channels); i++)
+            if (ff_mpeg4audio_channels[i] == avctx->channels)
+                break;
+        if (i == FF_ARRAY_ELEMS(ff_mpeg4audio_channels)) {
+            i = 0;
+        }
+        ac->oc[1].m4ac.chan_config = i;
+
+        if (ac->oc[1].m4ac.chan_config) {
+            int ret = set_default_channel_config(avctx, layout_map,
+                &layout_map_tags, ac->oc[1].m4ac.chan_config);
+            if (!ret)
+                output_configure(ac, layout_map, layout_map_tags,
+                                 OC_GLOBAL_HDR, 0);
+            else if (avctx->err_recognition & AV_EF_EXPLODE)
+                return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (avctx->channels > MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "Too many channels\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    AAC_INIT_VLC_STATIC( 0, 304);
+    AAC_INIT_VLC_STATIC( 1, 270);
+    AAC_INIT_VLC_STATIC( 2, 550);
+    AAC_INIT_VLC_STATIC( 3, 300);
+    AAC_INIT_VLC_STATIC( 4, 328);
+    AAC_INIT_VLC_STATIC( 5, 294);
+    AAC_INIT_VLC_STATIC( 6, 306);
+    AAC_INIT_VLC_STATIC( 7, 268);
+    AAC_INIT_VLC_STATIC( 8, 510);
+    AAC_INIT_VLC_STATIC( 9, 366);
+    AAC_INIT_VLC_STATIC(10, 462);
+
+    AAC_RENAME(ff_aac_sbr_init)();
+
+#if USE_FIXED
+    ac->fdsp = avpriv_alloc_fixed_dsp(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+#else
+    ac->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+#endif /* USE_FIXED */
+    if (!ac->fdsp) {
+        return AVERROR(ENOMEM);
+    }
+
+    ac->random_state = 0x1f2e3d4c;
+
+    ff_aac_tableinit();
+
+    INIT_VLC_STATIC(&vlc_scalefactors, 7,
+                    FF_ARRAY_ELEMS(ff_aac_scalefactor_code),
+                    ff_aac_scalefactor_bits,
+                    sizeof(ff_aac_scalefactor_bits[0]),
+                    sizeof(ff_aac_scalefactor_bits[0]),
+                    ff_aac_scalefactor_code,
+                    sizeof(ff_aac_scalefactor_code[0]),
+                    sizeof(ff_aac_scalefactor_code[0]),
+                    352);
+
+    AAC_RENAME_32(ff_mdct_init)(&ac->mdct,       11, 1, 1.0 / RANGE15(1024.0));
+    AAC_RENAME_32(ff_mdct_init)(&ac->mdct_ld,    10, 1, 1.0 / RANGE15(512.0));
+    AAC_RENAME_32(ff_mdct_init)(&ac->mdct_small,  8, 1, 1.0 / RANGE15(128.0));
+    AAC_RENAME_32(ff_mdct_init)(&ac->mdct_ltp,   11, 0, RANGE15(-2.0));
+#if !USE_FIXED
+    ret = ff_imdct15_init(&ac->mdct480, 5);
+    if (ret < 0)
+        return ret;
+#endif
+    // window initialization
+    AAC_RENAME(ff_kbd_window_init)(AAC_RENAME(ff_aac_kbd_long_1024), 4.0, 1024);
+    AAC_RENAME(ff_kbd_window_init)(AAC_RENAME(ff_aac_kbd_short_128), 6.0, 128);
+    AAC_RENAME(ff_init_ff_sine_windows)(10);
+    AAC_RENAME(ff_init_ff_sine_windows)( 9);
+    AAC_RENAME(ff_init_ff_sine_windows)( 7);
+
+    AAC_RENAME(cbrt_tableinit)();
+
+    return 0;
+}
+
+/**
+ * Skip data_stream_element; reference: table 4.10.
+ */
+static int skip_data_stream_element(AACContext *ac, GetBitContext *gb)
+{
+    int byte_align = get_bits1(gb);
+    int count = get_bits(gb, 8);
+    if (count == 255)
+        count += get_bits(gb, 8);
+    if (byte_align)
+        align_get_bits(gb);
+
+    if (get_bits_left(gb) < 8 * count) {
+        av_log(ac->avctx, AV_LOG_ERROR, "skip_data_stream_element: "overread_err);
+        return AVERROR_INVALIDDATA;
+    }
+    skip_bits_long(gb, 8 * count);
+    return 0;
+}
+
+static int decode_prediction(AACContext *ac, IndividualChannelStream *ics,
+                             GetBitContext *gb)
+{
+    int sfb;
+    if (get_bits1(gb)) {
+        ics->predictor_reset_group = get_bits(gb, 5);
+        if (ics->predictor_reset_group == 0 ||
+            ics->predictor_reset_group > 30) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "Invalid Predictor Reset Group.\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+    for (sfb = 0; sfb < FFMIN(ics->max_sfb, ff_aac_pred_sfb_max[ac->oc[1].m4ac.sampling_index]); sfb++) {
+        ics->prediction_used[sfb] = get_bits1(gb);
+    }
+    return 0;
+}
+
+/**
+ * Decode Long Term Prediction data; reference: table 4.xx.
+ */
+static void decode_ltp(LongTermPrediction *ltp,
+                       GetBitContext *gb, uint8_t max_sfb)
+{
+    int sfb;
+
+    ltp->lag  = get_bits(gb, 11);
+    ltp->coef = ltp_coef[get_bits(gb, 3)];
+    for (sfb = 0; sfb < FFMIN(max_sfb, MAX_LTP_LONG_SFB); sfb++)
+        ltp->used[sfb] = get_bits1(gb);
+}
+
+/**
+ * Decode Individual Channel Stream info; reference: table 4.6.
+ */
+static int decode_ics_info(AACContext *ac, IndividualChannelStream *ics,
+                           GetBitContext *gb)
+{
+    const MPEG4AudioConfig *const m4ac = &ac->oc[1].m4ac;
+    const int aot = m4ac->object_type;
+    const int sampling_index = m4ac->sampling_index;
+    if (aot != AOT_ER_AAC_ELD) {
+        if (get_bits1(gb)) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Reserved bit set.\n");
+            if (ac->avctx->err_recognition & AV_EF_BITSTREAM)
+                return AVERROR_INVALIDDATA;
+        }
+        ics->window_sequence[1] = ics->window_sequence[0];
+        ics->window_sequence[0] = get_bits(gb, 2);
+        if (aot == AOT_ER_AAC_LD &&
+            ics->window_sequence[0] != ONLY_LONG_SEQUENCE) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "AAC LD is only defined for ONLY_LONG_SEQUENCE but "
+                   "window sequence %d found.\n", ics->window_sequence[0]);
+            ics->window_sequence[0] = ONLY_LONG_SEQUENCE;
+            return AVERROR_INVALIDDATA;
+        }
+        ics->use_kb_window[1]   = ics->use_kb_window[0];
+        ics->use_kb_window[0]   = get_bits1(gb);
+    }
+    ics->num_window_groups  = 1;
+    ics->group_len[0]       = 1;
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        int i;
+        ics->max_sfb = get_bits(gb, 4);
+        for (i = 0; i < 7; i++) {
+            if (get_bits1(gb)) {
+                ics->group_len[ics->num_window_groups - 1]++;
+            } else {
+                ics->num_window_groups++;
+                ics->group_len[ics->num_window_groups - 1] = 1;
+            }
+        }
+        ics->num_windows       = 8;
+        ics->swb_offset        =    ff_swb_offset_128[sampling_index];
+        ics->num_swb           =   ff_aac_num_swb_128[sampling_index];
+        ics->tns_max_bands     = ff_tns_max_bands_128[sampling_index];
+        ics->predictor_present = 0;
+    } else {
+        ics->max_sfb           = get_bits(gb, 6);
+        ics->num_windows       = 1;
+        if (aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD) {
+            if (m4ac->frame_length_short) {
+                ics->swb_offset    =     ff_swb_offset_480[sampling_index];
+                ics->num_swb       =    ff_aac_num_swb_480[sampling_index];
+                ics->tns_max_bands =  ff_tns_max_bands_480[sampling_index];
+            } else {
+                ics->swb_offset    =     ff_swb_offset_512[sampling_index];
+                ics->num_swb       =    ff_aac_num_swb_512[sampling_index];
+                ics->tns_max_bands =  ff_tns_max_bands_512[sampling_index];
+            }
+            if (!ics->num_swb || !ics->swb_offset)
+                return AVERROR_BUG;
+        } else {
+            ics->swb_offset    =    ff_swb_offset_1024[sampling_index];
+            ics->num_swb       =   ff_aac_num_swb_1024[sampling_index];
+            ics->tns_max_bands = ff_tns_max_bands_1024[sampling_index];
+        }
+        if (aot != AOT_ER_AAC_ELD) {
+            ics->predictor_present     = get_bits1(gb);
+            ics->predictor_reset_group = 0;
+        }
+        if (ics->predictor_present) {
+            if (aot == AOT_AAC_MAIN) {
+                if (decode_prediction(ac, ics, gb)) {
+                    goto fail;
+                }
+            } else if (aot == AOT_AAC_LC ||
+                       aot == AOT_ER_AAC_LC) {
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "Prediction is not allowed in AAC-LC.\n");
+                goto fail;
+            } else {
+                if (aot == AOT_ER_AAC_LD) {
+                    av_log(ac->avctx, AV_LOG_ERROR,
+                           "LTP in ER AAC LD not yet implemented.\n");
+                    return AVERROR_PATCHWELCOME;
+                }
+                if ((ics->ltp.present = get_bits(gb, 1)))
+                    decode_ltp(&ics->ltp, gb, ics->max_sfb);
+            }
+        }
+    }
+
+    if (ics->max_sfb > ics->num_swb) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Number of scalefactor bands in group (%d) "
+               "exceeds limit (%d).\n",
+               ics->max_sfb, ics->num_swb);
+        goto fail;
+    }
+
+    return 0;
+fail:
+    ics->max_sfb = 0;
+    return AVERROR_INVALIDDATA;
+}
+
+/**
+ * Decode band types (section_data payload); reference: table 4.46.
+ *
+ * @param   band_type           array of the used band type
+ * @param   band_type_run_end   array of the last scalefactor band of a band type run
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_band_types(AACContext *ac, enum BandType band_type[120],
+                             int band_type_run_end[120], GetBitContext *gb,
+                             IndividualChannelStream *ics)
+{
+    int g, idx = 0;
+    const int bits = (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) ? 3 : 5;
+    for (g = 0; g < ics->num_window_groups; g++) {
+        int k = 0;
+        while (k < ics->max_sfb) {
+            uint8_t sect_end = k;
+            int sect_len_incr;
+            int sect_band_type = get_bits(gb, 4);
+            if (sect_band_type == 12) {
+                av_log(ac->avctx, AV_LOG_ERROR, "invalid band type\n");
+                return AVERROR_INVALIDDATA;
+            }
+            do {
+                sect_len_incr = get_bits(gb, bits);
+                sect_end += sect_len_incr;
+                if (get_bits_left(gb) < 0) {
+                    av_log(ac->avctx, AV_LOG_ERROR, "decode_band_types: "overread_err);
+                    return AVERROR_INVALIDDATA;
+                }
+                if (sect_end > ics->max_sfb) {
+                    av_log(ac->avctx, AV_LOG_ERROR,
+                           "Number of bands (%d) exceeds limit (%d).\n",
+                           sect_end, ics->max_sfb);
+                    return AVERROR_INVALIDDATA;
+                }
+            } while (sect_len_incr == (1 << bits) - 1);
+            for (; k < sect_end; k++) {
+                band_type        [idx]   = sect_band_type;
+                band_type_run_end[idx++] = sect_end;
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Decode scalefactors; reference: table 4.47.
+ *
+ * @param   global_gain         first scalefactor value as scalefactors are differentially coded
+ * @param   band_type           array of the used band type
+ * @param   band_type_run_end   array of the last scalefactor band of a band type run
+ * @param   sf                  array of scalefactors or intensity stereo positions
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_scalefactors(AACContext *ac, INTFLOAT sf[120], GetBitContext *gb,
+                               unsigned int global_gain,
+                               IndividualChannelStream *ics,
+                               enum BandType band_type[120],
+                               int band_type_run_end[120])
+{
+    int g, i, idx = 0;
+    int offset[3] = { global_gain, global_gain - NOISE_OFFSET, 0 };
+    int clipped_offset;
+    int noise_flag = 1;
+    for (g = 0; g < ics->num_window_groups; g++) {
+        for (i = 0; i < ics->max_sfb;) {
+            int run_end = band_type_run_end[idx];
+            if (band_type[idx] == ZERO_BT) {
+                for (; i < run_end; i++, idx++)
+                    sf[idx] = FIXR(0.);
+            } else if ((band_type[idx] == INTENSITY_BT) ||
+                       (band_type[idx] == INTENSITY_BT2)) {
+                for (; i < run_end; i++, idx++) {
+                    offset[2] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - SCALE_DIFF_ZERO;
+                    clipped_offset = av_clip(offset[2], -155, 100);
+                    if (offset[2] != clipped_offset) {
+                        avpriv_request_sample(ac->avctx,
+                                              "If you heard an audible artifact, there may be a bug in the decoder. "
+                                              "Clipped intensity stereo position (%d -> %d)",
+                                              offset[2], clipped_offset);
+                    }
+#if USE_FIXED
+                    sf[idx] = 100 - clipped_offset;
+#else
+                    sf[idx] = ff_aac_pow2sf_tab[-clipped_offset + POW_SF2_ZERO];
+#endif /* USE_FIXED */
+                }
+            } else if (band_type[idx] == NOISE_BT) {
+                for (; i < run_end; i++, idx++) {
+                    if (noise_flag-- > 0)
+                        offset[1] += get_bits(gb, NOISE_PRE_BITS) - NOISE_PRE;
+                    else
+                        offset[1] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - SCALE_DIFF_ZERO;
+                    clipped_offset = av_clip(offset[1], -100, 155);
+                    if (offset[1] != clipped_offset) {
+                        avpriv_request_sample(ac->avctx,
+                                              "If you heard an audible artifact, there may be a bug in the decoder. "
+                                              "Clipped noise gain (%d -> %d)",
+                                              offset[1], clipped_offset);
+                    }
+#if USE_FIXED
+                    sf[idx] = -(100 + clipped_offset);
+#else
+                    sf[idx] = -ff_aac_pow2sf_tab[clipped_offset + POW_SF2_ZERO];
+#endif /* USE_FIXED */
+                }
+            } else {
+                for (; i < run_end; i++, idx++) {
+                    offset[0] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - SCALE_DIFF_ZERO;
+                    if (offset[0] > 255U) {
+                        av_log(ac->avctx, AV_LOG_ERROR,
+                               "Scalefactor (%d) out of range.\n", offset[0]);
+                        return AVERROR_INVALIDDATA;
+                    }
+#if USE_FIXED
+                    sf[idx] = -offset[0];
+#else
+                    sf[idx] = -ff_aac_pow2sf_tab[offset[0] - 100 + POW_SF2_ZERO];
+#endif /* USE_FIXED */
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Decode pulse data; reference: table 4.7.
+ */
+static int decode_pulses(Pulse *pulse, GetBitContext *gb,
+                         const uint16_t *swb_offset, int num_swb)
+{
+    int i, pulse_swb;
+    pulse->num_pulse = get_bits(gb, 2) + 1;
+    pulse_swb        = get_bits(gb, 6);
+    if (pulse_swb >= num_swb)
+        return -1;
+    pulse->pos[0]    = swb_offset[pulse_swb];
+    pulse->pos[0]   += get_bits(gb, 5);
+    if (pulse->pos[0] >= swb_offset[num_swb])
+        return -1;
+    pulse->amp[0]    = get_bits(gb, 4);
+    for (i = 1; i < pulse->num_pulse; i++) {
+        pulse->pos[i] = get_bits(gb, 5) + pulse->pos[i - 1];
+        if (pulse->pos[i] >= swb_offset[num_swb])
+            return -1;
+        pulse->amp[i] = get_bits(gb, 4);
+    }
+    return 0;
+}
+
+/**
+ * Decode Temporal Noise Shaping data; reference: table 4.48.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_tns(AACContext *ac, TemporalNoiseShaping *tns,
+                      GetBitContext *gb, const IndividualChannelStream *ics)
+{
+    int w, filt, i, coef_len, coef_res, coef_compress;
+    const int is8 = ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE;
+    const int tns_max_order = is8 ? 7 : ac->oc[1].m4ac.object_type == AOT_AAC_MAIN ? 20 : 12;
+    for (w = 0; w < ics->num_windows; w++) {
+        if ((tns->n_filt[w] = get_bits(gb, 2 - is8))) {
+            coef_res = get_bits1(gb);
+
+            for (filt = 0; filt < tns->n_filt[w]; filt++) {
+                int tmp2_idx;
+                tns->length[w][filt] = get_bits(gb, 6 - 2 * is8);
+
+                if ((tns->order[w][filt] = get_bits(gb, 5 - 2 * is8)) > tns_max_order) {
+                    av_log(ac->avctx, AV_LOG_ERROR,
+                           "TNS filter order %d is greater than maximum %d.\n",
+                           tns->order[w][filt], tns_max_order);
+                    tns->order[w][filt] = 0;
+                    return AVERROR_INVALIDDATA;
+                }
+                if (tns->order[w][filt]) {
+                    tns->direction[w][filt] = get_bits1(gb);
+                    coef_compress = get_bits1(gb);
+                    coef_len = coef_res + 3 - coef_compress;
+                    tmp2_idx = 2 * coef_compress + coef_res;
+
+                    for (i = 0; i < tns->order[w][filt]; i++)
+                        tns->coef[w][filt][i] = tns_tmp2_map[tmp2_idx][get_bits(gb, coef_len)];
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Decode Mid/Side data; reference: table 4.54.
+ *
+ * @param   ms_present  Indicates mid/side stereo presence. [0] mask is all 0s;
+ *                      [1] mask is decoded from bitstream; [2] mask is all 1s;
+ *                      [3] reserved for scalable AAC
+ */
+static void decode_mid_side_stereo(ChannelElement *cpe, GetBitContext *gb,
+                                   int ms_present)
+{
+    int idx;
+    int max_idx = cpe->ch[0].ics.num_window_groups * cpe->ch[0].ics.max_sfb;
+    if (ms_present == 1) {
+        for (idx = 0; idx < max_idx; idx++)
+            cpe->ms_mask[idx] = get_bits1(gb);
+    } else if (ms_present == 2) {
+        memset(cpe->ms_mask, 1, max_idx * sizeof(cpe->ms_mask[0]));
+    }
+}
+
+/**
+ * Decode spectral data; reference: table 4.50.
+ * Dequantize and scale spectral data; reference: 4.6.3.3.
+ *
+ * @param   coef            array of dequantized, scaled spectral data
+ * @param   sf              array of scalefactors or intensity stereo positions
+ * @param   pulse_present   set if pulses are present
+ * @param   pulse           pointer to pulse data struct
+ * @param   band_type       array of the used band type
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_spectrum_and_dequant(AACContext *ac, INTFLOAT coef[1024],
+                                       GetBitContext *gb, const INTFLOAT sf[120],
+                                       int pulse_present, const Pulse *pulse,
+                                       const IndividualChannelStream *ics,
+                                       enum BandType band_type[120])
+{
+    int i, k, g, idx = 0;
+    const int c = 1024 / ics->num_windows;
+    const uint16_t *offsets = ics->swb_offset;
+    INTFLOAT *coef_base = coef;
+
+    for (g = 0; g < ics->num_windows; g++)
+        memset(coef + g * 128 + offsets[ics->max_sfb], 0,
+               sizeof(INTFLOAT) * (c - offsets[ics->max_sfb]));
+
+    for (g = 0; g < ics->num_window_groups; g++) {
+        unsigned g_len = ics->group_len[g];
+
+        for (i = 0; i < ics->max_sfb; i++, idx++) {
+            const unsigned cbt_m1 = band_type[idx] - 1;
+            INTFLOAT *cfo = coef + offsets[i];
+            int off_len = offsets[i + 1] - offsets[i];
+            int group;
+
+            if (cbt_m1 >= INTENSITY_BT2 - 1) {
+                for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                    memset(cfo, 0, off_len * sizeof(*cfo));
+                }
+            } else if (cbt_m1 == NOISE_BT - 1) {
+                for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+#if !USE_FIXED
+                    float scale;
+#endif /* !USE_FIXED */
+                    INTFLOAT band_energy;
+
+                    for (k = 0; k < off_len; k++) {
+                        ac->random_state  = lcg_random(ac->random_state);
+#if USE_FIXED
+                        cfo[k] = ac->random_state >> 3;
+#else
+                        cfo[k] = ac->random_state;
+#endif /* USE_FIXED */
+                    }
+
+#if USE_FIXED
+                    band_energy = ac->fdsp->scalarproduct_fixed(cfo, cfo, off_len);
+                    band_energy = fixed_sqrt(band_energy, 31);
+                    noise_scale(cfo, sf[idx], band_energy, off_len);
+#else
+                    band_energy = ac->fdsp->scalarproduct_float(cfo, cfo, off_len);
+                    scale = sf[idx] / sqrtf(band_energy);
+                    ac->fdsp->vector_fmul_scalar(cfo, cfo, scale, off_len);
+#endif /* USE_FIXED */
+                }
+            } else {
+#if !USE_FIXED
+                const float *vq = ff_aac_codebook_vector_vals[cbt_m1];
+#endif /* !USE_FIXED */
+                const uint16_t *cb_vector_idx = ff_aac_codebook_vector_idx[cbt_m1];
+                VLC_TYPE (*vlc_tab)[2] = vlc_spectral[cbt_m1].table;
+                OPEN_READER(re, gb);
+
+                switch (cbt_m1 >> 1) {
+                case 0:
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                        INTFLOAT *cf = cfo;
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned cb_idx;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+                            cb_idx = cb_vector_idx[code];
+#if USE_FIXED
+                            cf = DEC_SQUAD(cf, cb_idx);
+#else
+                            cf = VMUL4(cf, vq, cb_idx, sf + idx);
+#endif /* USE_FIXED */
+                        } while (len -= 4);
+                    }
+                    break;
+
+                case 1:
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                        INTFLOAT *cf = cfo;
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned nnz;
+                            unsigned cb_idx;
+                            uint32_t bits;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+                            cb_idx = cb_vector_idx[code];
+                            nnz = cb_idx >> 8 & 15;
+                            bits = nnz ? GET_CACHE(re, gb) : 0;
+                            LAST_SKIP_BITS(re, gb, nnz);
+#if USE_FIXED
+                            cf = DEC_UQUAD(cf, cb_idx, bits);
+#else
+                            cf = VMUL4S(cf, vq, cb_idx, bits, sf + idx);
+#endif /* USE_FIXED */
+                        } while (len -= 4);
+                    }
+                    break;
+
+                case 2:
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                        INTFLOAT *cf = cfo;
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned cb_idx;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+                            cb_idx = cb_vector_idx[code];
+#if USE_FIXED
+                            cf = DEC_SPAIR(cf, cb_idx);
+#else
+                            cf = VMUL2(cf, vq, cb_idx, sf + idx);
+#endif /* USE_FIXED */
+                        } while (len -= 2);
+                    }
+                    break;
+
+                case 3:
+                case 4:
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                        INTFLOAT *cf = cfo;
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned nnz;
+                            unsigned cb_idx;
+                            unsigned sign;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+                            cb_idx = cb_vector_idx[code];
+                            nnz = cb_idx >> 8 & 15;
+                            sign = nnz ? SHOW_UBITS(re, gb, nnz) << (cb_idx >> 12) : 0;
+                            LAST_SKIP_BITS(re, gb, nnz);
+#if USE_FIXED
+                            cf = DEC_UPAIR(cf, cb_idx, sign);
+#else
+                            cf = VMUL2S(cf, vq, cb_idx, sign, sf + idx);
+#endif /* USE_FIXED */
+                        } while (len -= 2);
+                    }
+                    break;
+
+                default:
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+#if USE_FIXED
+                        int *icf = cfo;
+                        int v;
+#else
+                        float *cf = cfo;
+                        uint32_t *icf = (uint32_t *) cf;
+#endif /* USE_FIXED */
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned nzt, nnz;
+                            unsigned cb_idx;
+                            uint32_t bits;
+                            int j;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+
+                            if (!code) {
+                                *icf++ = 0;
+                                *icf++ = 0;
+                                continue;
+                            }
+
+                            cb_idx = cb_vector_idx[code];
+                            nnz = cb_idx >> 12;
+                            nzt = cb_idx >> 8;
+                            bits = SHOW_UBITS(re, gb, nnz) << (32-nnz);
+                            LAST_SKIP_BITS(re, gb, nnz);
+
+                            for (j = 0; j < 2; j++) {
+                                if (nzt & 1<<j) {
+                                    uint32_t b;
+                                    int n;
+                                    /* The total length of escape_sequence must be < 22 bits according
+                                       to the specification (i.e. max is 111111110xxxxxxxxxxxx). */
+                                    UPDATE_CACHE(re, gb);
+                                    b = GET_CACHE(re, gb);
+                                    b = 31 - av_log2(~b);
+
+                                    if (b > 8) {
+                                        av_log(ac->avctx, AV_LOG_ERROR, "error in spectral data, ESC overflow\n");
+                                        return AVERROR_INVALIDDATA;
+                                    }
+
+                                    SKIP_BITS(re, gb, b + 1);
+                                    b += 4;
+                                    n = (1 << b) + SHOW_UBITS(re, gb, b);
+                                    LAST_SKIP_BITS(re, gb, b);
+#if USE_FIXED
+                                    v = n;
+                                    if (bits & 1U<<31)
+                                        v = -v;
+                                    *icf++ = v;
+#else
+                                    *icf++ = cbrt_tab[n] | (bits & 1U<<31);
+#endif /* USE_FIXED */
+                                    bits <<= 1;
+                                } else {
+#if USE_FIXED
+                                    v = cb_idx & 15;
+                                    if (bits & 1U<<31)
+                                        v = -v;
+                                    *icf++ = v;
+#else
+                                    unsigned v = ((const uint32_t*)vq)[cb_idx & 15];
+                                    *icf++ = (bits & 1U<<31) | v;
+#endif /* USE_FIXED */
+                                    bits <<= !!v;
+                                }
+                                cb_idx >>= 4;
+                            }
+                        } while (len -= 2);
+#if !USE_FIXED
+                        ac->fdsp->vector_fmul_scalar(cfo, cfo, sf[idx], off_len);
+#endif /* !USE_FIXED */
+                    }
+                }
+
+                CLOSE_READER(re, gb);
+            }
+        }
+        coef += g_len << 7;
+    }
+
+    if (pulse_present) {
+        idx = 0;
+        for (i = 0; i < pulse->num_pulse; i++) {
+            INTFLOAT co = coef_base[ pulse->pos[i] ];
+            while (offsets[idx + 1] <= pulse->pos[i])
+                idx++;
+            if (band_type[idx] != NOISE_BT && sf[idx]) {
+                INTFLOAT ico = -pulse->amp[i];
+#if USE_FIXED
+                if (co) {
+                    ico = co + (co > 0 ? -ico : ico);
+                }
+                coef_base[ pulse->pos[i] ] = ico;
+#else
+                if (co) {
+                    co /= sf[idx];
+                    ico = co / sqrtf(sqrtf(fabsf(co))) + (co > 0 ? -ico : ico);
+                }
+                coef_base[ pulse->pos[i] ] = cbrtf(fabsf(ico)) * ico * sf[idx];
+#endif /* USE_FIXED */
+            }
+        }
+    }
+#if USE_FIXED
+    coef = coef_base;
+    idx = 0;
+    for (g = 0; g < ics->num_window_groups; g++) {
+        unsigned g_len = ics->group_len[g];
+
+        for (i = 0; i < ics->max_sfb; i++, idx++) {
+            const unsigned cbt_m1 = band_type[idx] - 1;
+            int *cfo = coef + offsets[i];
+            int off_len = offsets[i + 1] - offsets[i];
+            int group;
+
+            if (cbt_m1 < NOISE_BT - 1) {
+                for (group = 0; group < (int)g_len; group++, cfo+=128) {
+                    ac->vector_pow43(cfo, off_len);
+                    ac->subband_scale(cfo, cfo, sf[idx], 34, off_len);
+                }
+            }
+        }
+        coef += g_len << 7;
+    }
+#endif /* USE_FIXED */
+    return 0;
+}
+
+/**
+ * Apply AAC-Main style frequency domain prediction.
+ */
+static void apply_prediction(AACContext *ac, SingleChannelElement *sce)
+{
+    int sfb, k;
+
+    if (!sce->ics.predictor_initialized) {
+        reset_all_predictors(sce->predictor_state);
+        sce->ics.predictor_initialized = 1;
+    }
+
+    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
+        for (sfb = 0;
+             sfb < ff_aac_pred_sfb_max[ac->oc[1].m4ac.sampling_index];
+             sfb++) {
+            for (k = sce->ics.swb_offset[sfb];
+                 k < sce->ics.swb_offset[sfb + 1];
+                 k++) {
+                predict(&sce->predictor_state[k], &sce->coeffs[k],
+                        sce->ics.predictor_present &&
+                        sce->ics.prediction_used[sfb]);
+            }
+        }
+        if (sce->ics.predictor_reset_group)
+            reset_predictor_group(sce->predictor_state,
+                                  sce->ics.predictor_reset_group);
+    } else
+        reset_all_predictors(sce->predictor_state);
+}
+
+/**
+ * Decode an individual_channel_stream payload; reference: table 4.44.
+ *
+ * @param   common_window   Channels have independent [0], or shared [1], Individual Channel Stream information.
+ * @param   scale_flag      scalable [1] or non-scalable [0] AAC (Unused until scalable AAC is implemented.)
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_ics(AACContext *ac, SingleChannelElement *sce,
+                      GetBitContext *gb, int common_window, int scale_flag)
+{
+    Pulse pulse;
+    TemporalNoiseShaping    *tns = &sce->tns;
+    IndividualChannelStream *ics = &sce->ics;
+    INTFLOAT *out = sce->coeffs;
+    int global_gain, eld_syntax, er_syntax, pulse_present = 0;
+    int ret;
+
+    eld_syntax = ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
+    er_syntax  = ac->oc[1].m4ac.object_type == AOT_ER_AAC_LC ||
+                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_LTP ||
+                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_LD ||
+                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
+
+    /* This assignment is to silence a GCC warning about the variable being used
+     * uninitialized when in fact it always is.
+     */
+    pulse.num_pulse = 0;
+
+    global_gain = get_bits(gb, 8);
+
+    if (!common_window && !scale_flag) {
+        if (decode_ics_info(ac, ics, gb) < 0)
+            return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = decode_band_types(ac, sce->band_type,
+                                 sce->band_type_run_end, gb, ics)) < 0)
+        return ret;
+    if ((ret = decode_scalefactors(ac, sce->sf, gb, global_gain, ics,
+                                  sce->band_type, sce->band_type_run_end)) < 0)
+        return ret;
+
+    pulse_present = 0;
+    if (!scale_flag) {
+        if (!eld_syntax && (pulse_present = get_bits1(gb))) {
+            if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "Pulse tool not allowed in eight short sequence.\n");
+                return AVERROR_INVALIDDATA;
+            }
+            if (decode_pulses(&pulse, gb, ics->swb_offset, ics->num_swb)) {
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "Pulse data corrupt or invalid.\n");
+                return AVERROR_INVALIDDATA;
+            }
+        }
+        tns->present = get_bits1(gb);
+        if (tns->present && !er_syntax)
+            if (decode_tns(ac, tns, gb, ics) < 0)
+                return AVERROR_INVALIDDATA;
+        if (!eld_syntax && get_bits1(gb)) {
+            avpriv_request_sample(ac->avctx, "SSR");
+            return AVERROR_PATCHWELCOME;
+        }
+        // I see no textual basis in the spec for this occurring after SSR gain
+        // control, but this is what both reference and real implmentations do
+        if (tns->present && er_syntax)
+            if (decode_tns(ac, tns, gb, ics) < 0)
+                return AVERROR_INVALIDDATA;
+    }
+
+    if (decode_spectrum_and_dequant(ac, out, gb, sce->sf, pulse_present,
+                                    &pulse, ics, sce->band_type) < 0)
+        return AVERROR_INVALIDDATA;
+
+    if (ac->oc[1].m4ac.object_type == AOT_AAC_MAIN && !common_window)
+        apply_prediction(ac, sce);
+
+    return 0;
+}
+
+/**
+ * Mid/Side stereo decoding; reference: 4.6.8.1.3.
+ */
+static void apply_mid_side_stereo(AACContext *ac, ChannelElement *cpe)
+{
+    const IndividualChannelStream *ics = &cpe->ch[0].ics;
+    INTFLOAT *ch0 = cpe->ch[0].coeffs;
+    INTFLOAT *ch1 = cpe->ch[1].coeffs;
+    int g, i, group, idx = 0;
+    const uint16_t *offsets = ics->swb_offset;
+    for (g = 0; g < ics->num_window_groups; g++) {
+        for (i = 0; i < ics->max_sfb; i++, idx++) {
+            if (cpe->ms_mask[idx] &&
+                cpe->ch[0].band_type[idx] < NOISE_BT &&
+                cpe->ch[1].band_type[idx] < NOISE_BT) {
+#if USE_FIXED
+                for (group = 0; group < ics->group_len[g]; group++) {
+                    ac->fdsp->butterflies_fixed(ch0 + group * 128 + offsets[i],
+                                                ch1 + group * 128 + offsets[i],
+                                                offsets[i+1] - offsets[i]);
+#else
+                for (group = 0; group < ics->group_len[g]; group++) {
+                    ac->fdsp->butterflies_float(ch0 + group * 128 + offsets[i],
+                                               ch1 + group * 128 + offsets[i],
+                                               offsets[i+1] - offsets[i]);
+#endif /* USE_FIXED */
+                }
+            }
+        }
+        ch0 += ics->group_len[g] * 128;
+        ch1 += ics->group_len[g] * 128;
+    }
+}
+
+/**
+ * intensity stereo decoding; reference: 4.6.8.2.3
+ *
+ * @param   ms_present  Indicates mid/side stereo presence. [0] mask is all 0s;
+ *                      [1] mask is decoded from bitstream; [2] mask is all 1s;
+ *                      [3] reserved for scalable AAC
+ */
+static void apply_intensity_stereo(AACContext *ac,
+                                   ChannelElement *cpe, int ms_present)
+{
+    const IndividualChannelStream *ics = &cpe->ch[1].ics;
+    SingleChannelElement         *sce1 = &cpe->ch[1];
+    INTFLOAT *coef0 = cpe->ch[0].coeffs, *coef1 = cpe->ch[1].coeffs;
+    const uint16_t *offsets = ics->swb_offset;
+    int g, group, i, idx = 0;
+    int c;
+    INTFLOAT scale;
+    for (g = 0; g < ics->num_window_groups; g++) {
+        for (i = 0; i < ics->max_sfb;) {
+            if (sce1->band_type[idx] == INTENSITY_BT ||
+                sce1->band_type[idx] == INTENSITY_BT2) {
+                const int bt_run_end = sce1->band_type_run_end[idx];
+                for (; i < bt_run_end; i++, idx++) {
+                    c = -1 + 2 * (sce1->band_type[idx] - 14);
+                    if (ms_present)
+                        c *= 1 - 2 * cpe->ms_mask[idx];
+                    scale = c * sce1->sf[idx];
+                    for (group = 0; group < ics->group_len[g]; group++)
+#if USE_FIXED
+                        ac->subband_scale(coef1 + group * 128 + offsets[i],
+                                      coef0 + group * 128 + offsets[i],
+                                      scale,
+                                      23,
+                                      offsets[i + 1] - offsets[i]);
+#else
+                        ac->fdsp->vector_fmul_scalar(coef1 + group * 128 + offsets[i],
+                                                    coef0 + group * 128 + offsets[i],
+                                                    scale,
+                                                    offsets[i + 1] - offsets[i]);
+#endif /* USE_FIXED */
+                }
+            } else {
+                int bt_run_end = sce1->band_type_run_end[idx];
+                idx += bt_run_end - i;
+                i    = bt_run_end;
+            }
+        }
+        coef0 += ics->group_len[g] * 128;
+        coef1 += ics->group_len[g] * 128;
+    }
+}
+
+/**
+ * Decode a channel_pair_element; reference: table 4.4.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_cpe(AACContext *ac, GetBitContext *gb, ChannelElement *cpe)
+{
+    int i, ret, common_window, ms_present = 0;
+    int eld_syntax = ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
+
+    common_window = eld_syntax || get_bits1(gb);
+    if (common_window) {
+        if (decode_ics_info(ac, &cpe->ch[0].ics, gb))
+            return AVERROR_INVALIDDATA;
+        i = cpe->ch[1].ics.use_kb_window[0];
+        cpe->ch[1].ics = cpe->ch[0].ics;
+        cpe->ch[1].ics.use_kb_window[1] = i;
+        if (cpe->ch[1].ics.predictor_present &&
+            (ac->oc[1].m4ac.object_type != AOT_AAC_MAIN))
+            if ((cpe->ch[1].ics.ltp.present = get_bits(gb, 1)))
+                decode_ltp(&cpe->ch[1].ics.ltp, gb, cpe->ch[1].ics.max_sfb);
+        ms_present = get_bits(gb, 2);
+        if (ms_present == 3) {
+            av_log(ac->avctx, AV_LOG_ERROR, "ms_present = 3 is reserved.\n");
+            return AVERROR_INVALIDDATA;
+        } else if (ms_present)
+            decode_mid_side_stereo(cpe, gb, ms_present);
+    }
+    if ((ret = decode_ics(ac, &cpe->ch[0], gb, common_window, 0)))
+        return ret;
+    if ((ret = decode_ics(ac, &cpe->ch[1], gb, common_window, 0)))
+        return ret;
+
+    if (common_window) {
+        if (ms_present)
+            apply_mid_side_stereo(ac, cpe);
+        if (ac->oc[1].m4ac.object_type == AOT_AAC_MAIN) {
+            apply_prediction(ac, &cpe->ch[0]);
+            apply_prediction(ac, &cpe->ch[1]);
+        }
+    }
+
+    apply_intensity_stereo(ac, cpe, ms_present);
+    return 0;
+}
+
+static const float cce_scale[] = {
+    1.09050773266525765921, //2^(1/8)
+    1.18920711500272106672, //2^(1/4)
+    M_SQRT2,
+    2,
+};
+
+/**
+ * Decode coupling_channel_element; reference: table 4.8.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_cce(AACContext *ac, GetBitContext *gb, ChannelElement *che)
+{
+    int num_gain = 0;
+    int c, g, sfb, ret;
+    int sign;
+    INTFLOAT scale;
+    SingleChannelElement *sce = &che->ch[0];
+    ChannelCoupling     *coup = &che->coup;
+
+    coup->coupling_point = 2 * get_bits1(gb);
+    coup->num_coupled = get_bits(gb, 3);
+    for (c = 0; c <= coup->num_coupled; c++) {
+        num_gain++;
+        coup->type[c] = get_bits1(gb) ? TYPE_CPE : TYPE_SCE;
+        coup->id_select[c] = get_bits(gb, 4);
+        if (coup->type[c] == TYPE_CPE) {
+            coup->ch_select[c] = get_bits(gb, 2);
+            if (coup->ch_select[c] == 3)
+                num_gain++;
+        } else
+            coup->ch_select[c] = 2;
+    }
+    coup->coupling_point += get_bits1(gb) || (coup->coupling_point >> 1);
+
+    sign  = get_bits(gb, 1);
+    scale = AAC_RENAME(cce_scale)[get_bits(gb, 2)];
+
+    if ((ret = decode_ics(ac, sce, gb, 0, 0)))
+        return ret;
+
+    for (c = 0; c < num_gain; c++) {
+        int idx  = 0;
+        int cge  = 1;
+        int gain = 0;
+        INTFLOAT gain_cache = FIXR10(1.);
+        if (c) {
+            cge = coup->coupling_point == AFTER_IMDCT ? 1 : get_bits1(gb);
+            gain = cge ? get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60: 0;
+            gain_cache = GET_GAIN(scale, gain);
+        }
+        if (coup->coupling_point == AFTER_IMDCT) {
+            coup->gain[c][0] = gain_cache;
+        } else {
+            for (g = 0; g < sce->ics.num_window_groups; g++) {
+                for (sfb = 0; sfb < sce->ics.max_sfb; sfb++, idx++) {
+                    if (sce->band_type[idx] != ZERO_BT) {
+                        if (!cge) {
+                            int t = get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
+                            if (t) {
+                                int s = 1;
+                                t = gain += t;
+                                if (sign) {
+                                    s  -= 2 * (t & 0x1);
+                                    t >>= 1;
+                                }
+                                gain_cache = GET_GAIN(scale, t) * s;
+                            }
+                        }
+                        coup->gain[c][idx] = gain_cache;
+                    }
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Parse whether channels are to be excluded from Dynamic Range Compression; reference: table 4.53.
+ *
+ * @return  Returns number of bytes consumed.
+ */
+static int decode_drc_channel_exclusions(DynamicRangeControl *che_drc,
+                                         GetBitContext *gb)
+{
+    int i;
+    int num_excl_chan = 0;
+
+    do {
+        for (i = 0; i < 7; i++)
+            che_drc->exclude_mask[num_excl_chan++] = get_bits1(gb);
+    } while (num_excl_chan < MAX_CHANNELS - 7 && get_bits1(gb));
+
+    return num_excl_chan / 7;
+}
+
+/**
+ * Decode dynamic range information; reference: table 4.52.
+ *
+ * @return  Returns number of bytes consumed.
+ */
+static int decode_dynamic_range(DynamicRangeControl *che_drc,
+                                GetBitContext *gb)
+{
+    int n             = 1;
+    int drc_num_bands = 1;
+    int i;
+
+    /* pce_tag_present? */
+    if (get_bits1(gb)) {
+        che_drc->pce_instance_tag  = get_bits(gb, 4);
+        skip_bits(gb, 4); // tag_reserved_bits
+        n++;
+    }
+
+    /* excluded_chns_present? */
+    if (get_bits1(gb)) {
+        n += decode_drc_channel_exclusions(che_drc, gb);
+    }
+
+    /* drc_bands_present? */
+    if (get_bits1(gb)) {
+        che_drc->band_incr            = get_bits(gb, 4);
+        che_drc->interpolation_scheme = get_bits(gb, 4);
+        n++;
+        drc_num_bands += che_drc->band_incr;
+        for (i = 0; i < drc_num_bands; i++) {
+            che_drc->band_top[i] = get_bits(gb, 8);
+            n++;
+        }
+    }
+
+    /* prog_ref_level_present? */
+    if (get_bits1(gb)) {
+        che_drc->prog_ref_level = get_bits(gb, 7);
+        skip_bits1(gb); // prog_ref_level_reserved_bits
+        n++;
+    }
+
+    for (i = 0; i < drc_num_bands; i++) {
+        che_drc->dyn_rng_sgn[i] = get_bits1(gb);
+        che_drc->dyn_rng_ctl[i] = get_bits(gb, 7);
+        n++;
+    }
+
+    return n;
+}
+
+static int decode_fill(AACContext *ac, GetBitContext *gb, int len) {
+    uint8_t buf[256];
+    int i, major, minor;
+
+    if (len < 13+7*8)
+        goto unknown;
+
+    get_bits(gb, 13); len -= 13;
+
+    for(i=0; i+1<sizeof(buf) && len>=8; i++, len-=8)
+        buf[i] = get_bits(gb, 8);
+
+    buf[i] = 0;
+    if (ac->avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(ac->avctx, AV_LOG_DEBUG, "FILL:%s\n", buf);
+
+    if (sscanf(buf, "libfaac %d.%d", &major, &minor) == 2){
+        ac->avctx->internal->skip_samples = 1024;
+    }
+
+unknown:
+    skip_bits_long(gb, len);
+
+    return 0;
+}
+
+/**
+ * Decode extension data (incomplete); reference: table 4.51.
+ *
+ * @param   cnt length of TYPE_FIL syntactic element in bytes
+ *
+ * @return Returns number of bytes consumed
+ */
+static int decode_extension_payload(AACContext *ac, GetBitContext *gb, int cnt,
+                                    ChannelElement *che, enum RawDataBlockType elem_type)
+{
+    int crc_flag = 0;
+    int res = cnt;
+    int type = get_bits(gb, 4);
+
+    if (ac->avctx->debug & FF_DEBUG_STARTCODE)
+        av_log(ac->avctx, AV_LOG_DEBUG, "extension type: %d len:%d\n", type, cnt);
+
+    switch (type) { // extension type
+    case EXT_SBR_DATA_CRC:
+        crc_flag++;
+    case EXT_SBR_DATA:
+        if (!che) {
+            av_log(ac->avctx, AV_LOG_ERROR, "SBR was found before the first channel element.\n");
+            return res;
+        } else if (!ac->oc[1].m4ac.sbr) {
+            av_log(ac->avctx, AV_LOG_ERROR, "SBR signaled to be not-present but was found in the bitstream.\n");
+            skip_bits_long(gb, 8 * cnt - 4);
+            return res;
+        } else if (ac->oc[1].m4ac.sbr == -1 && ac->oc[1].status == OC_LOCKED) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Implicit SBR was found with a first occurrence after the first frame.\n");
+            skip_bits_long(gb, 8 * cnt - 4);
+            return res;
+        } else if (ac->oc[1].m4ac.ps == -1 && ac->oc[1].status < OC_LOCKED && ac->avctx->channels == 1) {
+            ac->oc[1].m4ac.sbr = 1;
+            ac->oc[1].m4ac.ps = 1;
+            ac->avctx->profile = FF_PROFILE_AAC_HE_V2;
+            output_configure(ac, ac->oc[1].layout_map, ac->oc[1].layout_map_tags,
+                             ac->oc[1].status, 1);
+        } else {
+            ac->oc[1].m4ac.sbr = 1;
+            ac->avctx->profile = FF_PROFILE_AAC_HE;
+        }
+        res = AAC_RENAME(ff_decode_sbr_extension)(ac, &che->sbr, gb, crc_flag, cnt, elem_type);
+        break;
+    case EXT_DYNAMIC_RANGE:
+        res = decode_dynamic_range(&ac->che_drc, gb);
+        break;
+    case EXT_FILL:
+        decode_fill(ac, gb, 8 * cnt - 4);
+        break;
+    case EXT_FILL_DATA:
+    case EXT_DATA_ELEMENT:
+    default:
+        skip_bits_long(gb, 8 * cnt - 4);
+        break;
+    };
+    return res;
+}
+
+/**
+ * Decode Temporal Noise Shaping filter coefficients and apply all-pole filters; reference: 4.6.9.3.
+ *
+ * @param   decode  1 if tool is used normally, 0 if tool is used in LTP.
+ * @param   coef    spectral coefficients
+ */
+static void apply_tns(INTFLOAT coef[1024], TemporalNoiseShaping *tns,
+                      IndividualChannelStream *ics, int decode)
+{
+    const int mmm = FFMIN(ics->tns_max_bands, ics->max_sfb);
+    int w, filt, m, i;
+    int bottom, top, order, start, end, size, inc;
+    INTFLOAT lpc[TNS_MAX_ORDER];
+    INTFLOAT tmp[TNS_MAX_ORDER+1];
+
+    for (w = 0; w < ics->num_windows; w++) {
+        bottom = ics->num_swb;
+        for (filt = 0; filt < tns->n_filt[w]; filt++) {
+            top    = bottom;
+            bottom = FFMAX(0, top - tns->length[w][filt]);
+            order  = tns->order[w][filt];
+            if (order == 0)
+                continue;
+
+            // tns_decode_coef
+            AAC_RENAME(compute_lpc_coefs)(tns->coef[w][filt], order, lpc, 0, 0, 0);
+
+            start = ics->swb_offset[FFMIN(bottom, mmm)];
+            end   = ics->swb_offset[FFMIN(   top, mmm)];
+            if ((size = end - start) <= 0)
+                continue;
+            if (tns->direction[w][filt]) {
+                inc = -1;
+                start = end - 1;
+            } else {
+                inc = 1;
+            }
+            start += w * 128;
+
+            if (decode) {
+                // ar filter
+                for (m = 0; m < size; m++, start += inc)
+                    for (i = 1; i <= FFMIN(m, order); i++)
+                        coef[start] -= AAC_MUL26(coef[start - i * inc], lpc[i - 1]);
+            } else {
+                // ma filter
+                for (m = 0; m < size; m++, start += inc) {
+                    tmp[0] = coef[start];
+                    for (i = 1; i <= FFMIN(m, order); i++)
+                        coef[start] += AAC_MUL26(tmp[i], lpc[i - 1]);
+                    for (i = order; i > 0; i--)
+                        tmp[i] = tmp[i - 1];
+                }
+            }
+        }
+    }
+}
+
+/**
+ *  Apply windowing and MDCT to obtain the spectral
+ *  coefficient from the predicted sample by LTP.
+ */
+static void windowing_and_mdct_ltp(AACContext *ac, INTFLOAT *out,
+                                   INTFLOAT *in, IndividualChannelStream *ics)
+{
+    const INTFLOAT *lwindow      = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_long_1024) : AAC_RENAME(ff_sine_1024);
+    const INTFLOAT *swindow      = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+    const INTFLOAT *lwindow_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_long_1024) : AAC_RENAME(ff_sine_1024);
+    const INTFLOAT *swindow_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+
+    if (ics->window_sequence[0] != LONG_STOP_SEQUENCE) {
+        ac->fdsp->vector_fmul(in, in, lwindow_prev, 1024);
+    } else {
+        memset(in, 0, 448 * sizeof(*in));
+        ac->fdsp->vector_fmul(in + 448, in + 448, swindow_prev, 128);
+    }
+    if (ics->window_sequence[0] != LONG_START_SEQUENCE) {
+        ac->fdsp->vector_fmul_reverse(in + 1024, in + 1024, lwindow, 1024);
+    } else {
+        ac->fdsp->vector_fmul_reverse(in + 1024 + 448, in + 1024 + 448, swindow, 128);
+        memset(in + 1024 + 576, 0, 448 * sizeof(*in));
+    }
+    ac->mdct_ltp.mdct_calc(&ac->mdct_ltp, out, in);
+}
+
+/**
+ * Apply the long term prediction
+ */
+static void apply_ltp(AACContext *ac, SingleChannelElement *sce)
+{
+    const LongTermPrediction *ltp = &sce->ics.ltp;
+    const uint16_t *offsets = sce->ics.swb_offset;
+    int i, sfb;
+
+    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
+        INTFLOAT *predTime = sce->ret;
+        INTFLOAT *predFreq = ac->buf_mdct;
+        int16_t num_samples = 2048;
+
+        if (ltp->lag < 1024)
+            num_samples = ltp->lag + 1024;
+        for (i = 0; i < num_samples; i++)
+            predTime[i] = AAC_MUL30(sce->ltp_state[i + 2048 - ltp->lag], ltp->coef);
+        memset(&predTime[i], 0, (2048 - i) * sizeof(*predTime));
+
+        ac->windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
+
+        if (sce->tns.present)
+            ac->apply_tns(predFreq, &sce->tns, &sce->ics, 0);
+
+        for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++)
+            if (ltp->used[sfb])
+                for (i = offsets[sfb]; i < offsets[sfb + 1]; i++)
+                    sce->coeffs[i] += predFreq[i];
+    }
+}
+
+/**
+ * Update the LTP buffer for next frame
+ */
+static void update_ltp(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    INTFLOAT *saved     = sce->saved;
+    INTFLOAT *saved_ltp = sce->coeffs;
+    const INTFLOAT *lwindow = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_long_1024) : AAC_RENAME(ff_sine_1024);
+    const INTFLOAT *swindow = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+    int i;
+
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        memcpy(saved_ltp,       saved, 512 * sizeof(*saved_ltp));
+        memset(saved_ltp + 576, 0,     448 * sizeof(*saved_ltp));
+        ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+
+        for (i = 0; i < 64; i++)
+            saved_ltp[i + 512] = AAC_MUL31(ac->buf_mdct[1023 - i], swindow[63 - i]);
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        memcpy(saved_ltp,       ac->buf_mdct + 512, 448 * sizeof(*saved_ltp));
+        memset(saved_ltp + 576, 0,                  448 * sizeof(*saved_ltp));
+        ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+
+        for (i = 0; i < 64; i++)
+            saved_ltp[i + 512] = AAC_MUL31(ac->buf_mdct[1023 - i], swindow[63 - i]);
+    } else { // LONG_STOP or ONLY_LONG
+        ac->fdsp->vector_fmul_reverse(saved_ltp,       ac->buf_mdct + 512,     &lwindow[512],     512);
+
+        for (i = 0; i < 512; i++)
+            saved_ltp[i + 512] = AAC_MUL31(ac->buf_mdct[1023 - i], lwindow[511 - i]);
+    }
+
+    memcpy(sce->ltp_state,      sce->ltp_state+1024, 1024 * sizeof(*sce->ltp_state));
+    memcpy(sce->ltp_state+1024, sce->ret,            1024 * sizeof(*sce->ltp_state));
+    memcpy(sce->ltp_state+2048, saved_ltp,           1024 * sizeof(*sce->ltp_state));
+}
+
+/**
+ * Conduct IMDCT and windowing.
+ */
+static void imdct_and_windowing(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    INTFLOAT *in    = sce->coeffs;
+    INTFLOAT *out   = sce->ret;
+    INTFLOAT *saved = sce->saved;
+    const INTFLOAT *swindow      = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+    const INTFLOAT *lwindow_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_long_1024) : AAC_RENAME(ff_sine_1024);
+    const INTFLOAT *swindow_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+    INTFLOAT *buf  = ac->buf_mdct;
+    INTFLOAT *temp = ac->temp;
+    int i;
+
+    // imdct
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        for (i = 0; i < 1024; i += 128)
+            ac->mdct_small.imdct_half(&ac->mdct_small, buf + i, in + i);
+    } else {
+        ac->mdct.imdct_half(&ac->mdct, buf, in);
+#if USE_FIXED
+        for (i=0; i<1024; i++)
+          buf[i] = (buf[i] + 4) >> 3;
+#endif /* USE_FIXED */
+    }
+
+    /* window overlapping
+     * NOTE: To simplify the overlapping code, all 'meaningless' short to long
+     * and long to short transitions are considered to be short to short
+     * transitions. This leaves just two cases (long to long and short to short)
+     * with a little special sauce for EIGHT_SHORT_SEQUENCE.
+     */
+    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
+            (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
+        ac->fdsp->vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
+    } else {
+        memcpy(                         out,               saved,            448 * sizeof(*out));
+
+        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+            ac->fdsp->vector_fmul_window(out + 448 + 0*128, saved + 448,      buf + 0*128, swindow_prev, 64);
+            ac->fdsp->vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow,      64);
+            ac->fdsp->vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow,      64);
+            ac->fdsp->vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow,      64);
+            ac->fdsp->vector_fmul_window(temp,              buf + 3*128 + 64, buf + 4*128, swindow,      64);
+            memcpy(                     out + 448 + 4*128, temp, 64 * sizeof(*out));
+        } else {
+            ac->fdsp->vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
+            memcpy(                     out + 576,         buf + 64,         448 * sizeof(*out));
+        }
+    }
+
+    // buffer update
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        memcpy(                     saved,       temp + 64,         64 * sizeof(*saved));
+        ac->fdsp->vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
+        ac->fdsp->vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
+        ac->fdsp->vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
+        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(*saved));
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        memcpy(                     saved,       buf + 512,        448 * sizeof(*saved));
+        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(*saved));
+    } else { // LONG_STOP or ONLY_LONG
+        memcpy(                     saved,       buf + 512,        512 * sizeof(*saved));
+    }
+}
+
+static void imdct_and_windowing_ld(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    INTFLOAT *in    = sce->coeffs;
+    INTFLOAT *out   = sce->ret;
+    INTFLOAT *saved = sce->saved;
+    INTFLOAT *buf  = ac->buf_mdct;
+#if USE_FIXED
+    int i;
+#endif /* USE_FIXED */
+
+    // imdct
+    ac->mdct.imdct_half(&ac->mdct_ld, buf, in);
+
+#if USE_FIXED
+    for (i = 0; i < 1024; i++)
+        buf[i] = (buf[i] + 2) >> 2;
+#endif /* USE_FIXED */
+
+    // window overlapping
+    if (ics->use_kb_window[1]) {
+        // AAC LD uses a low overlap sine window instead of a KBD window
+        memcpy(out, saved, 192 * sizeof(*out));
+        ac->fdsp->vector_fmul_window(out + 192, saved + 192, buf, AAC_RENAME(ff_sine_128), 64);
+        memcpy(                     out + 320, buf + 64, 192 * sizeof(*out));
+    } else {
+        ac->fdsp->vector_fmul_window(out, saved, buf, AAC_RENAME(ff_sine_512), 256);
+    }
+
+    // buffer update
+    memcpy(saved, buf + 256, 256 * sizeof(*saved));
+}
+
+static void imdct_and_windowing_eld(AACContext *ac, SingleChannelElement *sce)
+{
+    INTFLOAT *in    = sce->coeffs;
+    INTFLOAT *out   = sce->ret;
+    INTFLOAT *saved = sce->saved;
+    INTFLOAT *buf  = ac->buf_mdct;
+    int i;
+    const int n  = ac->oc[1].m4ac.frame_length_short ? 480 : 512;
+    const int n2 = n >> 1;
+    const int n4 = n >> 2;
+    const INTFLOAT *const window = n == 480 ? AAC_RENAME(ff_aac_eld_window_480) :
+                                           AAC_RENAME(ff_aac_eld_window_512);
+
+    // Inverse transform, mapped to the conventional IMDCT by
+    // Chivukula, R.K.; Reznik, Y.A.; Devarajan, V.,
+    // "Efficient algorithms for MPEG-4 AAC-ELD, AAC-LD and AAC-LC filterbanks,"
+    // International Conference on Audio, Language and Image Processing, ICALIP 2008.
+    // URL: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4590245&isnumber=4589950
+    for (i = 0; i < n2; i+=2) {
+        INTFLOAT temp;
+        temp =  in[i    ]; in[i    ] = -in[n - 1 - i]; in[n - 1 - i] = temp;
+        temp = -in[i + 1]; in[i + 1] =  in[n - 2 - i]; in[n - 2 - i] = temp;
+    }
+#if !USE_FIXED
+    if (n == 480)
+        ac->mdct480->imdct_half(ac->mdct480, buf, in, 1, -1.f/(16*1024*960));
+    else
+#endif
+        ac->mdct.imdct_half(&ac->mdct_ld, buf, in);
+
+#if USE_FIXED
+    for (i = 0; i < 1024; i++)
+      buf[i] = (buf[i] + 1) >> 1;
+#endif /* USE_FIXED */
+
+    for (i = 0; i < n; i+=2) {
+        buf[i] = -buf[i];
+    }
+    // Like with the regular IMDCT at this point we still have the middle half
+    // of a transform but with even symmetry on the left and odd symmetry on
+    // the right
+
+    // window overlapping
+    // The spec says to use samples [0..511] but the reference decoder uses
+    // samples [128..639].
+    for (i = n4; i < n2; i ++) {
+        out[i - n4] = AAC_MUL31(   buf[    n2 - 1 - i] , window[i       - n4]) +
+                      AAC_MUL31( saved[        i + n2] , window[i +   n - n4]) +
+                      AAC_MUL31(-saved[n + n2 - 1 - i] , window[i + 2*n - n4]) +
+                      AAC_MUL31(-saved[  2*n + n2 + i] , window[i + 3*n - n4]);
+    }
+    for (i = 0; i < n2; i ++) {
+        out[n4 + i] = AAC_MUL31(   buf[              i] , window[i + n2       - n4]) +
+                      AAC_MUL31(-saved[      n - 1 - i] , window[i + n2 +   n - n4]) +
+                      AAC_MUL31(-saved[          n + i] , window[i + n2 + 2*n - n4]) +
+                      AAC_MUL31( saved[2*n + n - 1 - i] , window[i + n2 + 3*n - n4]);
+    }
+    for (i = 0; i < n4; i ++) {
+        out[n2 + n4 + i] = AAC_MUL31(   buf[    i + n2] , window[i +   n - n4]) +
+                           AAC_MUL31(-saved[n2 - 1 - i] , window[i + 2*n - n4]) +
+                           AAC_MUL31(-saved[n + n2 + i] , window[i + 3*n - n4]);
+    }
+
+    // buffer update
+    memmove(saved + n, saved, 2 * n * sizeof(*saved));
+    memcpy( saved,       buf,     n * sizeof(*saved));
+}
+
+/**
+ * channel coupling transformation interface
+ *
+ * @param   apply_coupling_method   pointer to (in)dependent coupling function
+ */
+static void apply_channel_coupling(AACContext *ac, ChannelElement *cc,
+                                   enum RawDataBlockType type, int elem_id,
+                                   enum CouplingPoint coupling_point,
+                                   void (*apply_coupling_method)(AACContext *ac, SingleChannelElement *target, ChannelElement *cce, int index))
+{
+    int i, c;
+
+    for (i = 0; i < MAX_ELEM_ID; i++) {
+        ChannelElement *cce = ac->che[TYPE_CCE][i];
+        int index = 0;
+
+        if (cce && cce->coup.coupling_point == coupling_point) {
+            ChannelCoupling *coup = &cce->coup;
+
+            for (c = 0; c <= coup->num_coupled; c++) {
+                if (coup->type[c] == type && coup->id_select[c] == elem_id) {
+                    if (coup->ch_select[c] != 1) {
+                        apply_coupling_method(ac, &cc->ch[0], cce, index);
+                        if (coup->ch_select[c] != 0)
+                            index++;
+                    }
+                    if (coup->ch_select[c] != 2)
+                        apply_coupling_method(ac, &cc->ch[1], cce, index++);
+                } else
+                    index += 1 + (coup->ch_select[c] == 3);
+            }
+        }
+    }
+}
+
+/**
+ * Convert spectral data to samples, applying all supported tools as appropriate.
+ */
+static void spectral_to_sample(AACContext *ac, int samples)
+{
+    int i, type;
+    void (*imdct_and_window)(AACContext *ac, SingleChannelElement *sce);
+    switch (ac->oc[1].m4ac.object_type) {
+    case AOT_ER_AAC_LD:
+        imdct_and_window = imdct_and_windowing_ld;
+        break;
+    case AOT_ER_AAC_ELD:
+        imdct_and_window = imdct_and_windowing_eld;
+        break;
+    default:
+        imdct_and_window = ac->imdct_and_windowing;
+    }
+    for (type = 3; type >= 0; type--) {
+        for (i = 0; i < MAX_ELEM_ID; i++) {
+            ChannelElement *che = ac->che[type][i];
+            if (che && che->present) {
+                if (type <= TYPE_CPE)
+                    apply_channel_coupling(ac, che, type, i, BEFORE_TNS, AAC_RENAME(apply_dependent_coupling));
+                if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP) {
+                    if (che->ch[0].ics.predictor_present) {
+                        if (che->ch[0].ics.ltp.present)
+                            ac->apply_ltp(ac, &che->ch[0]);
+                        if (che->ch[1].ics.ltp.present && type == TYPE_CPE)
+                            ac->apply_ltp(ac, &che->ch[1]);
+                    }
+                }
+                if (che->ch[0].tns.present)
+                    ac->apply_tns(che->ch[0].coeffs, &che->ch[0].tns, &che->ch[0].ics, 1);
+                if (che->ch[1].tns.present)
+                    ac->apply_tns(che->ch[1].coeffs, &che->ch[1].tns, &che->ch[1].ics, 1);
+                if (type <= TYPE_CPE)
+                    apply_channel_coupling(ac, che, type, i, BETWEEN_TNS_AND_IMDCT, AAC_RENAME(apply_dependent_coupling));
+                if (type != TYPE_CCE || che->coup.coupling_point == AFTER_IMDCT) {
+                    imdct_and_window(ac, &che->ch[0]);
+                    if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
+                        ac->update_ltp(ac, &che->ch[0]);
+                    if (type == TYPE_CPE) {
+                        imdct_and_window(ac, &che->ch[1]);
+                        if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
+                            ac->update_ltp(ac, &che->ch[1]);
+                    }
+                    if (ac->oc[1].m4ac.sbr > 0) {
+                        AAC_RENAME(ff_sbr_apply)(ac, &che->sbr, type, che->ch[0].ret, che->ch[1].ret);
+                    }
+                }
+                if (type <= TYPE_CCE)
+                    apply_channel_coupling(ac, che, type, i, AFTER_IMDCT, AAC_RENAME(apply_independent_coupling));
+
+#if USE_FIXED
+                {
+                    int j;
+                    /* preparation for resampler */
+                    for(j = 0; j<samples; j++){
+                        che->ch[0].ret[j] = (int32_t)av_clipl_int32((int64_t)che->ch[0].ret[j]<<7)+0x8000;
+                        if(type == TYPE_CPE)
+                            che->ch[1].ret[j] = (int32_t)av_clipl_int32((int64_t)che->ch[1].ret[j]<<7)+0x8000;
+                    }
+                }
+#endif /* USE_FIXED */
+                che->present = 0;
+            } else if (che) {
+                av_log(ac->avctx, AV_LOG_VERBOSE, "ChannelElement %d.%d missing \n", type, i);
+            }
+        }
+    }
+}
+
+static int parse_adts_frame_header(AACContext *ac, GetBitContext *gb)
+{
+    int size;
+    AACADTSHeaderInfo hdr_info;
+    uint8_t layout_map[MAX_ELEM_ID*4][3];
+    int layout_map_tags, ret;
+
+    size = avpriv_aac_parse_header(gb, &hdr_info);
+    if (size > 0) {
+        if (!ac->warned_num_aac_frames && hdr_info.num_aac_frames != 1) {
+            // This is 2 for "VLB " audio in NSV files.
+            // See samples/nsv/vlb_audio.
+            avpriv_report_missing_feature(ac->avctx,
+                                          "More than one AAC RDB per ADTS frame");
+            ac->warned_num_aac_frames = 1;
+        }
+        push_output_configuration(ac);
+        if (hdr_info.chan_config) {
+            ac->oc[1].m4ac.chan_config = hdr_info.chan_config;
+            if ((ret = set_default_channel_config(ac->avctx,
+                                                  layout_map,
+                                                  &layout_map_tags,
+                                                  hdr_info.chan_config)) < 0)
+                return ret;
+            if ((ret = output_configure(ac, layout_map, layout_map_tags,
+                                        FFMAX(ac->oc[1].status,
+                                              OC_TRIAL_FRAME), 0)) < 0)
+                return ret;
+        } else {
+            ac->oc[1].m4ac.chan_config = 0;
+            /**
+             * dual mono frames in Japanese DTV can have chan_config 0
+             * WITHOUT specifying PCE.
+             *  thus, set dual mono as default.
+             */
+            if (ac->dmono_mode && ac->oc[0].status == OC_NONE) {
+                layout_map_tags = 2;
+                layout_map[0][0] = layout_map[1][0] = TYPE_SCE;
+                layout_map[0][2] = layout_map[1][2] = AAC_CHANNEL_FRONT;
+                layout_map[0][1] = 0;
+                layout_map[1][1] = 1;
+                if (output_configure(ac, layout_map, layout_map_tags,
+                                     OC_TRIAL_FRAME, 0))
+                    return -7;
+            }
+        }
+        ac->oc[1].m4ac.sample_rate     = hdr_info.sample_rate;
+        ac->oc[1].m4ac.sampling_index  = hdr_info.sampling_index;
+        ac->oc[1].m4ac.object_type     = hdr_info.object_type;
+        ac->oc[1].m4ac.frame_length_short = 0;
+        if (ac->oc[0].status != OC_LOCKED ||
+            ac->oc[0].m4ac.chan_config != hdr_info.chan_config ||
+            ac->oc[0].m4ac.sample_rate != hdr_info.sample_rate) {
+            ac->oc[1].m4ac.sbr = -1;
+            ac->oc[1].m4ac.ps  = -1;
+        }
+        if (!hdr_info.crc_absent)
+            skip_bits(gb, 16);
+    }
+    return size;
+}
+
+static int aac_decode_er_frame(AVCodecContext *avctx, void *data,
+                               int *got_frame_ptr, GetBitContext *gb)
+{
+    AACContext *ac = avctx->priv_data;
+    const MPEG4AudioConfig *const m4ac = &ac->oc[1].m4ac;
+    ChannelElement *che;
+    int err, i;
+    int samples = m4ac->frame_length_short ? 960 : 1024;
+    int chan_config = m4ac->chan_config;
+    int aot = m4ac->object_type;
+
+    if (aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD)
+        samples >>= 1;
+
+    ac->frame = data;
+
+    if ((err = frame_configure_elements(avctx)) < 0)
+        return err;
+
+    // The FF_PROFILE_AAC_* defines are all object_type - 1
+    // This may lead to an undefined profile being signaled
+    ac->avctx->profile = aot - 1;
+
+    ac->tags_mapped = 0;
+
+    if (chan_config < 0 || (chan_config >= 8 && chan_config < 11) || chan_config >= 13) {
+        avpriv_request_sample(avctx, "Unknown ER channel configuration %d",
+                              chan_config);
+        return AVERROR_INVALIDDATA;
+    }
+    for (i = 0; i < tags_per_config[chan_config]; i++) {
+        const int elem_type = aac_channel_layout_map[chan_config-1][i][0];
+        const int elem_id   = aac_channel_layout_map[chan_config-1][i][1];
+        if (!(che=get_che(ac, elem_type, elem_id))) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "channel element %d.%d is not allocated\n",
+                   elem_type, elem_id);
+            return AVERROR_INVALIDDATA;
+        }
+        che->present = 1;
+        if (aot != AOT_ER_AAC_ELD)
+            skip_bits(gb, 4);
+        switch (elem_type) {
+        case TYPE_SCE:
+            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
+            break;
+        case TYPE_CPE:
+            err = decode_cpe(ac, gb, che);
+            break;
+        case TYPE_LFE:
+            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
+            break;
+        }
+        if (err < 0)
+            return err;
+    }
+
+    spectral_to_sample(ac, samples);
+
+    ac->frame->nb_samples = samples;
+    ac->frame->sample_rate = avctx->sample_rate;
+    *got_frame_ptr = 1;
+
+    skip_bits_long(gb, get_bits_left(gb));
+    return 0;
+}
+
+static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
+                                int *got_frame_ptr, GetBitContext *gb, AVPacket *avpkt)
+{
+    AACContext *ac = avctx->priv_data;
+    ChannelElement *che = NULL, *che_prev = NULL;
+    enum RawDataBlockType elem_type, elem_type_prev = TYPE_END;
+    int err, elem_id;
+    int samples = 0, multiplier, audio_found = 0, pce_found = 0;
+    int is_dmono, sce_count = 0;
+
+    ac->frame = data;
+
+    if (show_bits(gb, 12) == 0xfff) {
+        if ((err = parse_adts_frame_header(ac, gb)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Error decoding AAC frame header.\n");
+            goto fail;
+        }
+        if (ac->oc[1].m4ac.sampling_index > 12) {
+            av_log(ac->avctx, AV_LOG_ERROR, "invalid sampling rate index %d\n", ac->oc[1].m4ac.sampling_index);
+            err = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+    }
+
+    if ((err = frame_configure_elements(avctx)) < 0)
+        goto fail;
+
+    // The FF_PROFILE_AAC_* defines are all object_type - 1
+    // This may lead to an undefined profile being signaled
+    ac->avctx->profile = ac->oc[1].m4ac.object_type - 1;
+
+    ac->tags_mapped = 0;
+    // parse
+    while ((elem_type = get_bits(gb, 3)) != TYPE_END) {
+        elem_id = get_bits(gb, 4);
+
+        if (avctx->debug & FF_DEBUG_STARTCODE)
+            av_log(avctx, AV_LOG_DEBUG, "Elem type:%x id:%x\n", elem_type, elem_id);
+
+        if (!avctx->channels && elem_type != TYPE_PCE) {
+            err = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+
+        if (elem_type < TYPE_DSE) {
+            if (!(che=get_che(ac, elem_type, elem_id))) {
+                av_log(ac->avctx, AV_LOG_ERROR, "channel element %d.%d is not allocated\n",
+                       elem_type, elem_id);
+                err = AVERROR_INVALIDDATA;
+                goto fail;
+            }
+            samples = 1024;
+            che->present = 1;
+        }
+
+        switch (elem_type) {
+
+        case TYPE_SCE:
+            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
+            audio_found = 1;
+            sce_count++;
+            break;
+
+        case TYPE_CPE:
+            err = decode_cpe(ac, gb, che);
+            audio_found = 1;
+            break;
+
+        case TYPE_CCE:
+            err = decode_cce(ac, gb, che);
+            break;
+
+        case TYPE_LFE:
+            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
+            audio_found = 1;
+            break;
+
+        case TYPE_DSE:
+            err = skip_data_stream_element(ac, gb);
+            break;
+
+        case TYPE_PCE: {
+            uint8_t layout_map[MAX_ELEM_ID*4][3];
+            int tags;
+            push_output_configuration(ac);
+            tags = decode_pce(avctx, &ac->oc[1].m4ac, layout_map, gb);
+            if (tags < 0) {
+                err = tags;
+                break;
+            }
+            if (pce_found) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Not evaluating a further program_config_element as this construct is dubious at best.\n");
+            } else {
+                err = output_configure(ac, layout_map, tags, OC_TRIAL_PCE, 1);
+                if (!err)
+                    ac->oc[1].m4ac.chan_config = 0;
+                pce_found = 1;
+            }
+            break;
+        }
+
+        case TYPE_FIL:
+            if (elem_id == 15)
+                elem_id += get_bits(gb, 8) - 1;
+            if (get_bits_left(gb) < 8 * elem_id) {
+                    av_log(avctx, AV_LOG_ERROR, "TYPE_FIL: "overread_err);
+                    err = AVERROR_INVALIDDATA;
+                    goto fail;
+            }
+            while (elem_id > 0)
+                elem_id -= decode_extension_payload(ac, gb, elem_id, che_prev, elem_type_prev);
+            err = 0; /* FIXME */
+            break;
+
+        default:
+            err = AVERROR_BUG; /* should not happen, but keeps compiler happy */
+            break;
+        }
+
+        che_prev       = che;
+        elem_type_prev = elem_type;
+
+        if (err)
+            goto fail;
+
+        if (get_bits_left(gb) < 3) {
+            av_log(avctx, AV_LOG_ERROR, overread_err);
+            err = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+    }
+
+    if (!avctx->channels) {
+        *got_frame_ptr = 0;
+        return 0;
+    }
+
+    multiplier = (ac->oc[1].m4ac.sbr == 1) ? ac->oc[1].m4ac.ext_sample_rate > ac->oc[1].m4ac.sample_rate : 0;
+    samples <<= multiplier;
+
+    spectral_to_sample(ac, samples);
+
+    if (ac->oc[1].status && audio_found) {
+        avctx->sample_rate = ac->oc[1].m4ac.sample_rate << multiplier;
+        avctx->frame_size = samples;
+        ac->oc[1].status = OC_LOCKED;
+    }
+
+    if (multiplier) {
+        int side_size;
+        const uint8_t *side = av_packet_get_side_data(avpkt, AV_PKT_DATA_SKIP_SAMPLES, &side_size);
+        if (side && side_size>=4)
+            AV_WL32(side, 2*AV_RL32(side));
+    }
+
+    if (!ac->frame->data[0] && samples) {
+        av_log(avctx, AV_LOG_ERROR, "no frame data found\n");
+        err = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    if (samples) {
+        ac->frame->nb_samples = samples;
+        ac->frame->sample_rate = avctx->sample_rate;
+    } else
+        av_frame_unref(ac->frame);
+    *got_frame_ptr = !!samples;
+
+    /* for dual-mono audio (SCE + SCE) */
+    is_dmono = ac->dmono_mode && sce_count == 2 &&
+               ac->oc[1].channel_layout == (AV_CH_FRONT_LEFT | AV_CH_FRONT_RIGHT);
+    if (is_dmono) {
+        if (ac->dmono_mode == 1)
+            ((AVFrame *)data)->data[1] =((AVFrame *)data)->data[0];
+        else if (ac->dmono_mode == 2)
+            ((AVFrame *)data)->data[0] =((AVFrame *)data)->data[1];
+    }
+
+    return 0;
+fail:
+    pop_output_configuration(ac);
+    return err;
+}
+
+static int aac_decode_frame(AVCodecContext *avctx, void *data,
+                            int *got_frame_ptr, AVPacket *avpkt)
+{
+    AACContext *ac = avctx->priv_data;
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    GetBitContext gb;
+    int buf_consumed;
+    int buf_offset;
+    int err;
+    int new_extradata_size;
+    const uint8_t *new_extradata = av_packet_get_side_data(avpkt,
+                                       AV_PKT_DATA_NEW_EXTRADATA,
+                                       &new_extradata_size);
+    int jp_dualmono_size;
+    const uint8_t *jp_dualmono   = av_packet_get_side_data(avpkt,
+                                       AV_PKT_DATA_JP_DUALMONO,
+                                       &jp_dualmono_size);
+
+    if (new_extradata && 0) {
+        av_free(avctx->extradata);
+        avctx->extradata = av_mallocz(new_extradata_size +
+                                      AV_INPUT_BUFFER_PADDING_SIZE);
+        if (!avctx->extradata)
+            return AVERROR(ENOMEM);
+        avctx->extradata_size = new_extradata_size;
+        memcpy(avctx->extradata, new_extradata, new_extradata_size);
+        push_output_configuration(ac);
+        if (decode_audio_specific_config(ac, ac->avctx, &ac->oc[1].m4ac,
+                                         avctx->extradata,
+                                         avctx->extradata_size*8LL, 1) < 0) {
+            pop_output_configuration(ac);
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    ac->dmono_mode = 0;
+    if (jp_dualmono && jp_dualmono_size > 0)
+        ac->dmono_mode =  1 + *jp_dualmono;
+    if (ac->force_dmono_mode >= 0)
+        ac->dmono_mode = ac->force_dmono_mode;
+
+    if (INT_MAX / 8 <= buf_size)
+        return AVERROR_INVALIDDATA;
+
+    if ((err = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return err;
+
+    switch (ac->oc[1].m4ac.object_type) {
+    case AOT_ER_AAC_LC:
+    case AOT_ER_AAC_LTP:
+    case AOT_ER_AAC_LD:
+    case AOT_ER_AAC_ELD:
+        err = aac_decode_er_frame(avctx, data, got_frame_ptr, &gb);
+        break;
+    default:
+        err = aac_decode_frame_int(avctx, data, got_frame_ptr, &gb, avpkt);
+    }
+    if (err < 0)
+        return err;
+
+    buf_consumed = (get_bits_count(&gb) + 7) >> 3;
+    for (buf_offset = buf_consumed; buf_offset < buf_size; buf_offset++)
+        if (buf[buf_offset])
+            break;
+
+    return buf_size > buf_offset ? buf_consumed : buf_size;
+}
+
+static av_cold int aac_decode_close(AVCodecContext *avctx)
+{
+    AACContext *ac = avctx->priv_data;
+    int i, type;
+
+    for (i = 0; i < MAX_ELEM_ID; i++) {
+        for (type = 0; type < 4; type++) {
+            if (ac->che[type][i])
+                AAC_RENAME(ff_aac_sbr_ctx_close)(&ac->che[type][i]->sbr);
+            av_freep(&ac->che[type][i]);
+        }
+    }
+
+    ff_mdct_end(&ac->mdct);
+    ff_mdct_end(&ac->mdct_small);
+    ff_mdct_end(&ac->mdct_ld);
+    ff_mdct_end(&ac->mdct_ltp);
+#if !USE_FIXED
+    ff_imdct15_uninit(&ac->mdct480);
+#endif
+    av_freep(&ac->fdsp);
+    return 0;
+}
+
+static void aacdec_init(AACContext *c)
+{
+    c->imdct_and_windowing                      = imdct_and_windowing;
+    c->apply_ltp                                = apply_ltp;
+    c->apply_tns                                = apply_tns;
+    c->windowing_and_mdct_ltp                   = windowing_and_mdct_ltp;
+    c->update_ltp                               = update_ltp;
+#if USE_FIXED
+    c->vector_pow43                             = vector_pow43;
+    c->subband_scale                            = subband_scale;
+#endif
+
+#if !USE_FIXED
+    if(ARCH_MIPS)
+        ff_aacdec_init_mips(c);
+#endif /* !USE_FIXED */
+}
+/**
+ * AVOptions for Japanese DTV specific extensions (ADTS only)
+ */
+#define AACDEC_FLAGS AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM
+static const AVOption options[] = {
+    {"dual_mono_mode", "Select the channel to decode for dual mono",
+     offsetof(AACContext, force_dmono_mode), AV_OPT_TYPE_INT, {.i64=-1}, -1, 2,
+     AACDEC_FLAGS, "dual_mono_mode"},
+
+    {"auto", "autoselection",            0, AV_OPT_TYPE_CONST, {.i64=-1}, INT_MIN, INT_MAX, AACDEC_FLAGS, "dual_mono_mode"},
+    {"main", "Select Main/Left channel", 0, AV_OPT_TYPE_CONST, {.i64= 1}, INT_MIN, INT_MAX, AACDEC_FLAGS, "dual_mono_mode"},
+    {"sub" , "Select Sub/Right channel", 0, AV_OPT_TYPE_CONST, {.i64= 2}, INT_MIN, INT_MAX, AACDEC_FLAGS, "dual_mono_mode"},
+    {"both", "Select both channels",     0, AV_OPT_TYPE_CONST, {.i64= 0}, INT_MIN, INT_MAX, AACDEC_FLAGS, "dual_mono_mode"},
+
+    {NULL},
+};
+
+static const AVClass aac_decoder_class = {
+    .class_name = "AAC decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVProfile profiles[] = {
+    { FF_PROFILE_AAC_MAIN,  "Main"     },
+    { FF_PROFILE_AAC_LOW,   "LC"       },
+    { FF_PROFILE_AAC_SSR,   "SSR"      },
+    { FF_PROFILE_AAC_LTP,   "LTP"      },
+    { FF_PROFILE_AAC_HE,    "HE-AAC"   },
+    { FF_PROFILE_AAC_HE_V2, "HE-AACv2" },
+    { FF_PROFILE_AAC_LD,    "LD"       },
+    { FF_PROFILE_AAC_ELD,   "ELD"      },
+    { FF_PROFILE_UNKNOWN },
+};
diff --git a/libavcodec/aacdectab.h b/libavcodec/aacdectab.h
index b7c5f7e719..5c89a93d5d 100644
--- a/libavcodec/aacdectab.h
+++ b/libavcodec/aacdectab.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
  * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,46 +38,11 @@
 /* @name ltp_coef
  * Table of the LTP coefficients
  */
-static const float ltp_coef[8] = {
-    0.570829, 0.696616, 0.813004, 0.911304,
-    0.984900, 1.067894, 1.194601, 1.369533,
+static const INTFLOAT ltp_coef[8] = {
+    Q30(0.570829f), Q30(0.696616f), Q30(0.813004f), Q30(0.911304f),
+    Q30(0.984900f), Q30(1.067894f), Q30(1.194601f), Q30(1.369533f),
 };
 
-/* @name tns_tmp2_map
- * Tables of the tmp2[] arrays of LPC coefficients used for TNS.
- * The suffix _M_N[] indicate the values of coef_compress and coef_res
- * respectively.
- * @{
- */
-static const float tns_tmp2_map_1_3[4] = {
-     0.00000000, -0.43388373,  0.64278758,  0.34202015,
-};
-
-static const float tns_tmp2_map_0_3[8] = {
-     0.00000000, -0.43388373, -0.78183150, -0.97492790,
-     0.98480773,  0.86602539,  0.64278758,  0.34202015,
-};
-
-static const float tns_tmp2_map_1_4[8] = {
-     0.00000000, -0.20791170, -0.40673664, -0.58778524,
-     0.67369562,  0.52643216,  0.36124167,  0.18374951,
-};
-
-static const float tns_tmp2_map_0_4[16] = {
-     0.00000000, -0.20791170, -0.40673664, -0.58778524,
-    -0.74314481, -0.86602539, -0.95105654, -0.99452192,
-     0.99573416,  0.96182561,  0.89516330,  0.79801720,
-     0.67369562,  0.52643216,  0.36124167,  0.18374951,
-};
-
-static const float * const tns_tmp2_map[4] = {
-    tns_tmp2_map_0_3,
-    tns_tmp2_map_0_4,
-    tns_tmp2_map_1_3,
-    tns_tmp2_map_1_4
-};
-// @}
-
 static const int8_t tags_per_config[16] = { 0, 1, 1, 2, 3, 3, 4, 5, 0, 0, 0, 4, 5, 0, 5, 0 };
 
 static const uint8_t aac_channel_layout_map[16][5][3] = {
diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
index d2df65bb69..1b95ebd755 100644
--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@@ -2,20 +2,20 @@
  * AAC encoder
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,6 @@
 /***********************************
  *              TODOs:
  * add sane pulse detection
- * add temporal noise shaping
  ***********************************/
 
 #include "libavutil/float_dsp.h"
@@ -42,120 +41,11 @@
 #include "aac.h"
 #include "aactab.h"
 #include "aacenc.h"
+#include "aacenctab.h"
+#include "aacenc_utils.h"
 
 #include "psymodel.h"
 
-#define AAC_MAX_CHANNELS 6
-
-#define ERROR_IF(cond, ...) \
-    if (cond) { \
-        av_log(avctx, AV_LOG_ERROR, __VA_ARGS__); \
-        return AVERROR(EINVAL); \
-    }
-
-float ff_aac_pow34sf_tab[428];
-
-static const uint8_t swb_size_1024_96[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 12, 16, 16, 24, 28, 36, 44,
-    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-};
-
-static const uint8_t swb_size_1024_64[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8,
-    12, 12, 12, 16, 16, 16, 20, 24, 24, 28, 36,
-    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40
-};
-
-static const uint8_t swb_size_1024_48[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
-    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-    96
-};
-
-static const uint8_t swb_size_1024_32[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
-    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-};
-
-static const uint8_t swb_size_1024_24[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 16, 16, 16, 20, 20, 24, 24, 28, 28,
-    32, 36, 36, 40, 44, 48, 52, 52, 64, 64, 64, 64, 64
-};
-
-static const uint8_t swb_size_1024_16[] = {
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 16, 16, 16, 20, 20, 20, 24, 24, 28, 28,
-    32, 36, 40, 40, 44, 48, 52, 56, 60, 64, 64, 64
-};
-
-static const uint8_t swb_size_1024_8[] = {
-    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
-    16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20, 24, 24, 24, 28, 28,
-    32, 36, 36, 40, 44, 48, 52, 56, 60, 64, 80
-};
-
-static const uint8_t *swb_size_1024[] = {
-    swb_size_1024_96, swb_size_1024_96, swb_size_1024_64,
-    swb_size_1024_48, swb_size_1024_48, swb_size_1024_32,
-    swb_size_1024_24, swb_size_1024_24, swb_size_1024_16,
-    swb_size_1024_16, swb_size_1024_16, swb_size_1024_8
-};
-
-static const uint8_t swb_size_128_96[] = {
-    4, 4, 4, 4, 4, 4, 8, 8, 8, 16, 28, 36
-};
-
-static const uint8_t swb_size_128_48[] = {
-    4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16
-};
-
-static const uint8_t swb_size_128_24[] = {
-    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 16, 16, 20
-};
-
-static const uint8_t swb_size_128_16[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 12, 12, 16, 20, 20
-};
-
-static const uint8_t swb_size_128_8[] = {
-    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 12, 16, 20, 20
-};
-
-static const uint8_t *swb_size_128[] = {
-    /* the last entry on the following row is swb_size_128_64 but is a
-       duplicate of swb_size_128_96 */
-    swb_size_128_96, swb_size_128_96, swb_size_128_96,
-    swb_size_128_48, swb_size_128_48, swb_size_128_48,
-    swb_size_128_24, swb_size_128_24, swb_size_128_16,
-    swb_size_128_16, swb_size_128_16, swb_size_128_8
-};
-
-/** default channel configurations */
-static const uint8_t aac_chan_configs[6][5] = {
- {1, TYPE_SCE},                               // 1 channel  - single channel element
- {1, TYPE_CPE},                               // 2 channels - channel pair
- {2, TYPE_SCE, TYPE_CPE},                     // 3 channels - center + stereo
- {3, TYPE_SCE, TYPE_CPE, TYPE_SCE},           // 4 channels - front center + stereo + back center
- {3, TYPE_SCE, TYPE_CPE, TYPE_CPE},           // 5 channels - front center + stereo + back stereo
- {4, TYPE_SCE, TYPE_CPE, TYPE_CPE, TYPE_LFE}, // 6 channels - front center + stereo + back stereo + LFE
-};
-
-/**
- * Table to remap channels from Libav's default order to AAC order.
- */
-static const uint8_t aac_chan_maps[AAC_MAX_CHANNELS][AAC_MAX_CHANNELS] = {
-    { 0 },
-    { 0, 1 },
-    { 2, 0, 1 },
-    { 2, 0, 1, 3 },
-    { 2, 0, 1, 3, 4 },
-    { 2, 0, 1, 4, 5, 3 },
-};
-
 /**
  * Make AAC audio config object.
  * @see 1.6.2.1 "Syntax - AudioSpecificConfig"
@@ -165,8 +55,8 @@ static void put_audio_specific_config(AVCodecContext *avctx)
     PutBitContext pb;
     AACEncContext *s = avctx->priv_data;
 
-    init_put_bits(&pb, avctx->extradata, avctx->extradata_size*8);
-    put_bits(&pb, 5, 2); //object type - AAC-LC
+    init_put_bits(&pb, avctx->extradata, avctx->extradata_size);
+    put_bits(&pb, 5, s->profile+1); //profile
     put_bits(&pb, 4, s->samplerate_index); //sample rate index
     put_bits(&pb, 4, s->channels);
     //GASpecificConfig
@@ -252,14 +142,15 @@ static void apply_window_and_mdct(AACEncContext *s, SingleChannelElement *sce,
     int i;
     float *output = sce->ret_buf;
 
-    apply_window[sce->ics.window_sequence[0]](&s->fdsp, sce, audio);
+    apply_window[sce->ics.window_sequence[0]](s->fdsp, sce, audio);
 
     if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE)
         s->mdct1024.mdct_calc(&s->mdct1024, sce->coeffs, output);
     else
         for (i = 0; i < 1024; i += 128)
-            s->mdct128.mdct_calc(&s->mdct128, sce->coeffs + i, output + i*2);
+            s->mdct128.mdct_calc(&s->mdct128, &sce->coeffs[i], output + i*2);
     memcpy(audio, audio + 1024, sizeof(audio[0]) * 1024);
+    memcpy(sce->pcoeffs, sce->coeffs, sizeof(sce->pcoeffs));
 }
 
 /**
@@ -275,7 +166,7 @@ static void put_ics_info(AACEncContext *s, IndividualChannelStream *info)
     put_bits(&s->pb, 1, info->use_kb_window[0]);
     if (info->window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
         put_bits(&s->pb, 6, info->max_sfb);
-        put_bits(&s->pb, 1, 0);            // no prediction
+        put_bits(&s->pb, 1, !!info->predictor_present);
     } else {
         put_bits(&s->pb, 4, info->max_sfb);
         for (w = 1; w < 8; w++)
@@ -304,27 +195,18 @@ static void encode_ms_info(PutBitContext *pb, ChannelElement *cpe)
 static void adjust_frame_information(ChannelElement *cpe, int chans)
 {
     int i, w, w2, g, ch;
-    int start, maxsfb, cmaxsfb;
+    int maxsfb, cmaxsfb;
 
     for (ch = 0; ch < chans; ch++) {
         IndividualChannelStream *ics = &cpe->ch[ch].ics;
-        start = 0;
         maxsfb = 0;
         cpe->ch[ch].pulse.num_pulse = 0;
-        for (w = 0; w < ics->num_windows*16; w += 16) {
-            for (g = 0; g < ics->num_swb; g++) {
-                //apply M/S
-                if (cpe->common_window && !ch && cpe->ms_mask[w + g]) {
-                    for (i = 0; i < ics->swb_sizes[g]; i++) {
-                        cpe->ch[0].coeffs[start+i] = (cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i]) / 2.0;
-                        cpe->ch[1].coeffs[start+i] =  cpe->ch[0].coeffs[start+i] - cpe->ch[1].coeffs[start+i];
-                    }
-                }
-                start += ics->swb_sizes[g];
+        for (w = 0; w < ics->num_windows; w += ics->group_len[w]) {
+            for (w2 =  0; w2 < ics->group_len[w]; w2++) {
+                for (cmaxsfb = ics->num_swb; cmaxsfb > 0 && cpe->ch[ch].zeroes[w*16+cmaxsfb-1]; cmaxsfb--)
+                    ;
+                maxsfb = FFMAX(maxsfb, cmaxsfb);
             }
-            for (cmaxsfb = ics->num_swb; cmaxsfb > 0 && cpe->ch[ch].zeroes[w+cmaxsfb-1]; cmaxsfb--)
-                ;
-            maxsfb = FFMAX(maxsfb, cmaxsfb);
         }
         ics->max_sfb = maxsfb;
 
@@ -360,6 +242,59 @@ static void adjust_frame_information(ChannelElement *cpe, int chans)
     }
 }
 
+static void apply_intensity_stereo(ChannelElement *cpe)
+{
+    int w, w2, g, i;
+    IndividualChannelStream *ics = &cpe->ch[0].ics;
+    if (!cpe->common_window)
+        return;
+    for (w = 0; w < ics->num_windows; w += ics->group_len[w]) {
+        for (w2 =  0; w2 < ics->group_len[w]; w2++) {
+            int start = (w+w2) * 128;
+            for (g = 0; g < ics->num_swb; g++) {
+                int p  = -1 + 2 * (cpe->ch[1].band_type[w*16+g] - 14);
+                float scale = cpe->ch[0].is_ener[w*16+g];
+                if (!cpe->is_mask[w*16 + g]) {
+                    start += ics->swb_sizes[g];
+                    continue;
+                }
+                for (i = 0; i < ics->swb_sizes[g]; i++) {
+                    float sum = (cpe->ch[0].coeffs[start+i] + p*cpe->ch[1].coeffs[start+i])*scale;
+                    cpe->ch[0].coeffs[start+i] = sum;
+                    cpe->ch[1].coeffs[start+i] = 0.0f;
+                }
+                start += ics->swb_sizes[g];
+            }
+        }
+    }
+}
+
+static void apply_mid_side_stereo(ChannelElement *cpe)
+{
+    int w, w2, g, i;
+    IndividualChannelStream *ics = &cpe->ch[0].ics;
+    if (!cpe->common_window)
+        return;
+    for (w = 0; w < ics->num_windows; w += ics->group_len[w]) {
+        for (w2 =  0; w2 < ics->group_len[w]; w2++) {
+            int start = (w+w2) * 128;
+            for (g = 0; g < ics->num_swb; g++) {
+                if (!cpe->ms_mask[w*16 + g]) {
+                    start += ics->swb_sizes[g];
+                    continue;
+                }
+                for (i = 0; i < ics->swb_sizes[g]; i++) {
+                    float L = (cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i]) * 0.5f;
+                    float R = L - cpe->ch[1].coeffs[start+i];
+                    cpe->ch[0].coeffs[start+i] = L;
+                    cpe->ch[1].coeffs[start+i] = R;
+                }
+                start += ics->swb_sizes[g];
+            }
+        }
+    }
+}
+
 /**
  * Encode scalefactor band coding type.
  */
@@ -367,6 +302,9 @@ static void encode_band_info(AACEncContext *s, SingleChannelElement *sce)
 {
     int w;
 
+    if (s->coder->set_special_band_scalefactors)
+        s->coder->set_special_band_scalefactors(s, sce);
+
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w])
         s->coder->encode_window_bands_info(s, sce, w, sce->ics.group_len[w], s->lambda);
 }
@@ -377,16 +315,30 @@ static void encode_band_info(AACEncContext *s, SingleChannelElement *sce)
 static void encode_scale_factors(AVCodecContext *avctx, AACEncContext *s,
                                  SingleChannelElement *sce)
 {
-    int off = sce->sf_idx[0], diff;
+    int diff, off_sf = sce->sf_idx[0], off_pns = sce->sf_idx[0] - NOISE_OFFSET;
+    int off_is = 0, noise_flag = 1;
     int i, w;
 
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
         for (i = 0; i < sce->ics.max_sfb; i++) {
             if (!sce->zeroes[w*16 + i]) {
-                diff = sce->sf_idx[w*16 + i] - off + SCALE_DIFF_ZERO;
-                if (diff < 0 || diff > 120)
-                    av_log(avctx, AV_LOG_ERROR, "Scalefactor difference is too big to be coded\n");
-                off = sce->sf_idx[w*16 + i];
+                if (sce->band_type[w*16 + i] == NOISE_BT) {
+                    diff = sce->sf_idx[w*16 + i] - off_pns;
+                    off_pns = sce->sf_idx[w*16 + i];
+                    if (noise_flag-- > 0) {
+                        put_bits(&s->pb, NOISE_PRE_BITS, diff + NOISE_PRE);
+                        continue;
+                    }
+                } else if (sce->band_type[w*16 + i] == INTENSITY_BT  ||
+                           sce->band_type[w*16 + i] == INTENSITY_BT2) {
+                    diff = sce->sf_idx[w*16 + i] - off_is;
+                    off_is = sce->sf_idx[w*16 + i];
+                } else {
+                    diff = sce->sf_idx[w*16 + i] - off_sf;
+                    off_sf = sce->sf_idx[w*16 + i];
+                }
+                diff += SCALE_DIFF_ZERO;
+                av_assert0(diff >= 0 && diff <= 120);
                 put_bits(&s->pb, ff_aac_scalefactor_bits[diff], ff_aac_scalefactor_code[diff]);
             }
         }
@@ -426,18 +378,41 @@ static void encode_spectral_coeffs(AACEncContext *s, SingleChannelElement *sce)
                 start += sce->ics.swb_sizes[i];
                 continue;
             }
-            for (w2 = w; w2 < w + sce->ics.group_len[w]; w2++)
-                s->coder->quantize_and_encode_band(s, &s->pb, sce->coeffs + start + w2*128,
-                                                   sce->ics.swb_sizes[i],
+            for (w2 = w; w2 < w + sce->ics.group_len[w]; w2++) {
+                s->coder->quantize_and_encode_band(s, &s->pb,
+                                                   &sce->coeffs[start + w2*128],
+                                                   NULL, sce->ics.swb_sizes[i],
                                                    sce->sf_idx[w*16 + i],
                                                    sce->band_type[w*16 + i],
-                                                   s->lambda);
+                                                   s->lambda,
+                                                   sce->ics.window_clipping[w]);
+            }
             start += sce->ics.swb_sizes[i];
         }
     }
 }
 
 /**
+ * Downscale spectral coefficients for near-clipping windows to avoid artifacts
+ */
+static void avoid_clipping(AACEncContext *s, SingleChannelElement *sce)
+{
+    int start, i, j, w;
+
+    if (sce->ics.clip_avoidance_factor < 1.0f) {
+        for (w = 0; w < sce->ics.num_windows; w++) {
+            start = 0;
+            for (i = 0; i < sce->ics.max_sfb; i++) {
+                float *swb_coeffs = &sce->coeffs[start + w*128];
+                for (j = 0; j < sce->ics.swb_sizes[i]; j++)
+                    swb_coeffs[j] *= sce->ics.clip_avoidance_factor;
+                start += sce->ics.swb_sizes[i];
+            }
+        }
+    }
+}
+
+/**
  * Encode one channel of audio data.
  */
 static int encode_individual_channel(AVCodecContext *avctx, AACEncContext *s,
@@ -445,12 +420,17 @@ static int encode_individual_channel(AVCodecContext *avctx, AACEncContext *s,
                                      int common_window)
 {
     put_bits(&s->pb, 8, sce->sf_idx[0]);
-    if (!common_window)
+    if (!common_window) {
         put_ics_info(s, &sce->ics);
+        if (s->coder->encode_main_pred)
+            s->coder->encode_main_pred(s, sce);
+    }
     encode_band_info(s, sce);
     encode_scale_factors(avctx, s, sce);
     encode_pulses(s, &sce->pulse);
-    put_bits(&s->pb, 1, 0); //tns
+    put_bits(&s->pb, 1, !!sce->tns.present);
+    if (s->coder->encode_tns_info)
+        s->coder->encode_tns_info(s, sce);
     put_bits(&s->pb, 1, 0); //ssr
     encode_spectral_coeffs(s, sce);
     return 0;
@@ -478,7 +458,7 @@ static void put_bitstream_info(AACEncContext *s, const char *name)
 
 /*
  * Copy input samples.
- * Channels are reordered from Libav's default order to AAC order.
+ * Channels are reordered from libavcodec's default order to AAC order.
  */
 static void copy_input_samples(AACEncContext *s, const AVFrame *frame)
 {
@@ -508,7 +488,9 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     AACEncContext *s = avctx->priv_data;
     float **samples = s->planar_samples, *samples2, *la, *overlap;
     ChannelElement *cpe;
-    int i, ch, w, g, chans, tag, start_ch, ret;
+    SingleChannelElement *sce;
+    int i, its, ch, w, chans, tag, start_ch, ret, frame_bits;
+    int ms_mode = 0, is_mode = 0, tns_mode = 0, pred_mode = 0;
     int chan_el_counter[4];
     FFPsyWindowInfo windows[AAC_MAX_CHANNELS];
 
@@ -537,6 +519,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         for (ch = 0; ch < chans; ch++) {
             IndividualChannelStream *ics = &cpe->ch[ch].ics;
             int cur_channel = start_ch + ch;
+            float clip_avoidance_factor;
             overlap  = &samples[cur_channel][0];
             samples2 = overlap + 1024;
             la       = samples2 + (448+64);
@@ -564,26 +547,50 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             ics->num_windows        = wi[ch].num_windows;
             ics->swb_sizes          = s->psy.bands    [ics->num_windows == 8];
             ics->num_swb            = tag == TYPE_LFE ? ics->num_swb : s->psy.num_bands[ics->num_windows == 8];
+            ics->swb_offset         = wi[ch].window_type[0] == EIGHT_SHORT_SEQUENCE ?
+                                        ff_swb_offset_128 [s->samplerate_index]:
+                                        ff_swb_offset_1024[s->samplerate_index];
+            ics->tns_max_bands      = wi[ch].window_type[0] == EIGHT_SHORT_SEQUENCE ?
+                                        ff_tns_max_bands_128 [s->samplerate_index]:
+                                        ff_tns_max_bands_1024[s->samplerate_index];
+            clip_avoidance_factor = 0.0f;
             for (w = 0; w < ics->num_windows; w++)
                 ics->group_len[w] = wi[ch].grouping[w];
+            for (w = 0; w < ics->num_windows; w++) {
+                if (wi[ch].clipping[w] > CLIP_AVOIDANCE_FACTOR) {
+                    ics->window_clipping[w] = 1;
+                    clip_avoidance_factor = FFMAX(clip_avoidance_factor, wi[ch].clipping[w]);
+                } else {
+                    ics->window_clipping[w] = 0;
+                }
+            }
+            if (clip_avoidance_factor > CLIP_AVOIDANCE_FACTOR) {
+                ics->clip_avoidance_factor = CLIP_AVOIDANCE_FACTOR / clip_avoidance_factor;
+            } else {
+                ics->clip_avoidance_factor = 1.0f;
+            }
 
             apply_window_and_mdct(s, &cpe->ch[ch], overlap);
+            if (isnan(cpe->ch->coeffs[0])) {
+                av_log(avctx, AV_LOG_ERROR, "Input contains NaN\n");
+                return AVERROR(EINVAL);
+            }
+            avoid_clipping(s, &cpe->ch[ch]);
         }
         start_ch += chans;
     }
-    if ((ret = ff_alloc_packet(avpkt, 768 * s->channels))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 8192 * s->channels, 0)) < 0)
         return ret;
-    }
-
+    frame_bits = its = 0;
     do {
-        int frame_bits;
+        int target_bits, too_many_bits, too_few_bits;
 
         init_put_bits(&s->pb, avpkt->data, avpkt->size);
 
         if ((avctx->frame_number & 0xFF)==1 && !(avctx->flags & AV_CODEC_FLAG_BITEXACT))
             put_bitstream_info(s, LIBAVCODEC_IDENT);
         start_ch = 0;
+        target_bits = 0;
         memset(chan_el_counter, 0, sizeof(chan_el_counter));
         for (i = 0; i < s->chan_map[0]; i++) {
             FFPsyWindowInfo* wi = windows + start_ch;
@@ -591,16 +598,34 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             tag      = s->chan_map[i+1];
             chans    = tag == TYPE_CPE ? 2 : 1;
             cpe      = &s->cpe[i];
+            cpe->common_window = 0;
+            memset(cpe->is_mask, 0, sizeof(cpe->is_mask));
+            memset(cpe->ms_mask, 0, sizeof(cpe->ms_mask));
             put_bits(&s->pb, 3, tag);
             put_bits(&s->pb, 4, chan_el_counter[tag]++);
-            for (ch = 0; ch < chans; ch++)
-                coeffs[ch] = cpe->ch[ch].coeffs;
+            for (ch = 0; ch < chans; ch++) {
+                sce = &cpe->ch[ch];
+                coeffs[ch] = sce->coeffs;
+                sce->ics.predictor_present = 0;
+                memset(&sce->ics.prediction_used, 0, sizeof(sce->ics.prediction_used));
+                memset(&sce->tns, 0, sizeof(TemporalNoiseShaping));
+                for (w = 0; w < 128; w++)
+                    if (sce->band_type[w] > RESERVED_BT)
+                        sce->band_type[w] = 0;
+            }
+            s->psy.bitres.alloc = -1;
+            s->psy.bitres.bits = avctx->frame_bits / s->channels;
             s->psy.model->analyze(&s->psy, start_ch, coeffs, wi);
+            if (s->psy.bitres.alloc > 0) {
+                /* Lambda unused here on purpose, we need to take psy's unscaled allocation */
+                target_bits += s->psy.bitres.alloc;
+                s->psy.bitres.alloc /= chans;
+            }
+            s->cur_type = tag;
             for (ch = 0; ch < chans; ch++) {
                 s->cur_channel = start_ch + ch;
                 s->coder->search_for_quantizers(avctx, s, &cpe->ch[ch], s->lambda);
             }
-            cpe->common_window = 0;
             if (chans > 1
                 && wi[0].window_type[0] == wi[1].window_type[0]
                 && wi[0].window_shape   == wi[1].window_shape) {
@@ -613,23 +638,61 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                     }
                 }
             }
+            for (ch = 0; ch < chans; ch++) { /* TNS and PNS */
+                sce = &cpe->ch[ch];
+                s->cur_channel = start_ch + ch;
+                if (s->options.pns && s->coder->search_for_pns)
+                    s->coder->search_for_pns(s, avctx, sce);
+                if (s->options.tns && s->coder->search_for_tns)
+                    s->coder->search_for_tns(s, sce);
+                if (s->options.tns && s->coder->apply_tns_filt)
+                    s->coder->apply_tns_filt(s, sce);
+                if (sce->tns.present)
+                    tns_mode = 1;
+            }
             s->cur_channel = start_ch;
-            if (s->options.stereo_mode && cpe->common_window) {
-                if (s->options.stereo_mode > 0) {
-                    IndividualChannelStream *ics = &cpe->ch[0].ics;
-                    for (w = 0; w < ics->num_windows; w += ics->group_len[w])
-                        for (g = 0;  g < ics->num_swb; g++)
-                            cpe->ms_mask[w*16+g] = 1;
-                } else if (s->coder->search_for_ms) {
-                    s->coder->search_for_ms(s, cpe, s->lambda);
+            if (s->options.intensity_stereo) { /* Intensity Stereo */
+                if (s->coder->search_for_is)
+                    s->coder->search_for_is(s, avctx, cpe);
+                if (cpe->is_mode) is_mode = 1;
+                apply_intensity_stereo(cpe);
+            }
+            if (s->options.pred) { /* Prediction */
+                for (ch = 0; ch < chans; ch++) {
+                    sce = &cpe->ch[ch];
+                    s->cur_channel = start_ch + ch;
+                    if (s->options.pred && s->coder->search_for_pred)
+                        s->coder->search_for_pred(s, sce);
+                    if (cpe->ch[ch].ics.predictor_present) pred_mode = 1;
                 }
+                if (s->coder->adjust_common_prediction)
+                    s->coder->adjust_common_prediction(s, cpe);
+                for (ch = 0; ch < chans; ch++) {
+                    sce = &cpe->ch[ch];
+                    s->cur_channel = start_ch + ch;
+                    if (s->options.pred && s->coder->apply_main_pred)
+                        s->coder->apply_main_pred(s, sce);
+                }
+                s->cur_channel = start_ch;
+            }
+            if (s->options.stereo_mode) { /* Mid/Side stereo */
+                if (s->options.stereo_mode == -1 && s->coder->search_for_ms)
+                    s->coder->search_for_ms(s, cpe);
+                else if (cpe->common_window)
+                    memset(cpe->ms_mask, 1, sizeof(cpe->ms_mask));
+                for (w = 0; w < 128; w++)
+                    cpe->ms_mask[w] = cpe->is_mask[w] ? 0 : cpe->ms_mask[w];
+                apply_mid_side_stereo(cpe);
             }
             adjust_frame_information(cpe, chans);
             if (chans == 2) {
                 put_bits(&s->pb, 1, cpe->common_window);
                 if (cpe->common_window) {
                     put_ics_info(s, &cpe->ch[0].ics);
+                    if (s->coder->encode_main_pred)
+                        s->coder->encode_main_pred(s, &cpe->ch[0]);
                     encode_ms_info(&s->pb, cpe);
+                    if (cpe->ms_mode) ms_mode = 1;
                 }
             }
             for (ch = 0; ch < chans; ch++) {
@@ -639,27 +702,69 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             start_ch += chans;
         }
 
-        frame_bits = put_bits_count(&s->pb);
-        if (frame_bits <= 6144 * s->channels - 3) {
-            s->psy.bitres.bits = frame_bits / s->channels;
+        if (avctx->flags & CODEC_FLAG_QSCALE) {
+            /* When using a constant Q-scale, don't mess with lambda */
             break;
         }
 
-        s->lambda *= avctx->bit_rate * 1024.0f / avctx->sample_rate / frame_bits;
+        /* rate control stuff
+         * target either the nominal bitrate, or what psy's bit reservoir says to target
+         * whichever is greatest
+         */
 
+        frame_bits = put_bits_count(&s->pb);
+        target_bits = FFMAX(target_bits, avctx->bit_rate * 1024 / avctx->sample_rate);
+        target_bits = FFMIN(target_bits, 6144 * s->channels - 3);
+
+        /* When using ABR, be strict (but only for increasing) */
+        too_many_bits = target_bits + target_bits/2;
+        too_few_bits = target_bits - target_bits/8;
+
+        if (   its == 0 /* for steady-state Q-scale tracking */
+            || (its < 5 && (frame_bits < too_few_bits || frame_bits > too_many_bits))
+            || frame_bits >= 6144 * s->channels - 3  )
+        {
+            float ratio = ((float)target_bits) / frame_bits;
+
+            if (frame_bits >= too_few_bits && frame_bits <= too_many_bits) {
+                /*
+                 * This path is for steady-state Q-scale tracking
+                 * When frame bits fall within the stable range, we still need to adjust
+                 * lambda to maintain it like so in a stable fashion (large jumps in lambda
+                 * create artifacts and should be avoided), but slowly
+                 */
+                ratio = sqrtf(sqrtf(ratio));
+                ratio = av_clipf(ratio, 0.9f, 1.1f);
+            } else {
+                /* Not so fast though */
+                ratio = sqrtf(ratio);
+            }
+            s->lambda = FFMIN(s->lambda * ratio, 65536.f);
+
+            /* Keep iterating if we must reduce and lambda is in the sky */
+            if (s->lambda < 300.f || ratio > 0.9f) {
+                break;
+            } else {
+                if (is_mode || ms_mode || tns_mode || pred_mode) {
+                    for (i = 0; i < s->chan_map[0]; i++) {
+                        // Must restore coeffs
+                        chans = tag == TYPE_CPE ? 2 : 1;
+                        cpe = &s->cpe[i];
+                        for (ch = 0; ch < chans; ch++)
+                            memcpy(cpe->ch[ch].coeffs, cpe->ch[ch].pcoeffs, sizeof(cpe->ch[ch].coeffs));
+                    }
+                }
+                its++;
+            }
+        } else {
+            break;
+        }
     } while (1);
 
     put_bits(&s->pb, 3, TYPE_END);
     flush_put_bits(&s->pb);
     avctx->frame_bits = put_bits_count(&s->pb);
 
-    // rate control stuff
-    if (!(avctx->flags & AV_CODEC_FLAG_QSCALE)) {
-        float ratio = avctx->bit_rate * 1024.0f / avctx->sample_rate / avctx->frame_bits;
-        s->lambda *= ratio;
-        s->lambda = FFMIN(s->lambda, 65536.f);
-    }
-
     if (!frame)
         s->last_frame++;
 
@@ -678,10 +783,12 @@ static av_cold int aac_encode_end(AVCodecContext *avctx)
     ff_mdct_end(&s->mdct1024);
     ff_mdct_end(&s->mdct128);
     ff_psy_end(&s->psy);
+    ff_lpc_end(&s->lpc);
     if (s->psypp)
         ff_psy_preprocess_end(s->psypp);
     av_freep(&s->buffer.samples);
     av_freep(&s->cpe);
+    av_freep(&s->fdsp);
     ff_af_queue_close(&s->afq);
     return 0;
 }
@@ -690,7 +797,9 @@ static av_cold int dsp_init(AVCodecContext *avctx, AACEncContext *s)
 {
     int ret = 0;
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
 
     // window init
     ff_kbd_window_init(ff_aac_kbd_long_1024, 4.0, 1024);
@@ -698,9 +807,9 @@ static av_cold int dsp_init(AVCodecContext *avctx, AACEncContext *s)
     ff_init_ff_sine_windows(10);
     ff_init_ff_sine_windows(7);
 
-    if (ret = ff_mdct_init(&s->mdct1024, 11, 0, 32768.0))
+    if ((ret = ff_mdct_init(&s->mdct1024, 11, 0, 32768.0)) < 0)
         return ret;
-    if (ret = ff_mdct_init(&s->mdct128,   8, 0, 32768.0))
+    if ((ret = ff_mdct_init(&s->mdct128,   8, 0, 32768.0)) < 0)
         return ret;
 
     return 0;
@@ -709,8 +818,8 @@ static av_cold int dsp_init(AVCodecContext *avctx, AACEncContext *s)
 static av_cold int alloc_buffers(AVCodecContext *avctx, AACEncContext *s)
 {
     int ch;
-    FF_ALLOCZ_OR_GOTO(avctx, s->buffer.samples, 3 * 1024 * s->channels * sizeof(s->buffer.samples[0]), alloc_fail);
-    FF_ALLOCZ_OR_GOTO(avctx, s->cpe, sizeof(ChannelElement) * s->chan_map[0], alloc_fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->buffer.samples, s->channels, 3 * 1024 * sizeof(s->buffer.samples[0]), alloc_fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->cpe, s->chan_map[0], sizeof(ChannelElement), alloc_fail);
     FF_ALLOCZ_OR_GOTO(avctx, avctx->extradata, 5 + AV_INPUT_BUFFER_PADDING_SIZE, alloc_fail);
 
     for(ch = 0; ch < s->channels; ch++)
@@ -737,14 +846,33 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
 
     s->channels = avctx->channels;
 
-    ERROR_IF(i == 16,
+    ERROR_IF(i == 16 || i >= ff_aac_swb_size_1024_len || i >= ff_aac_swb_size_128_len,
              "Unsupported sample rate %d\n", avctx->sample_rate);
     ERROR_IF(s->channels > AAC_MAX_CHANNELS,
              "Unsupported number of channels: %d\n", s->channels);
-    ERROR_IF(avctx->profile != FF_PROFILE_UNKNOWN && avctx->profile != FF_PROFILE_AAC_LOW,
-             "Unsupported profile %d\n", avctx->profile);
-    ERROR_IF(1024.0 * avctx->bit_rate / avctx->sample_rate > 6144 * s->channels,
-             "Too many bits per frame requested\n");
+    WARN_IF(1024.0 * avctx->bit_rate / avctx->sample_rate > 6144 * s->channels,
+             "Too many bits per frame requested, clamping to max\n");
+    if (avctx->profile == FF_PROFILE_AAC_MAIN) {
+        s->options.pred = 1;
+    } else if ((avctx->profile == FF_PROFILE_AAC_LOW ||
+                avctx->profile == FF_PROFILE_UNKNOWN) && s->options.pred) {
+        s->profile = 0; /* Main */
+        WARN_IF(1, "Prediction requested, changing profile to AAC-Main\n");
+    } else if (avctx->profile == FF_PROFILE_AAC_LOW ||
+               avctx->profile == FF_PROFILE_UNKNOWN) {
+        s->profile = 1; /* Low */
+    } else {
+        ERROR_IF(1, "Unsupported profile %d\n", avctx->profile);
+    }
+
+    if (s->options.aac_coder != AAC_CODER_TWOLOOP) {
+        s->options.intensity_stereo = 0;
+        s->options.pns = 0;
+    }
+
+    avctx->bit_rate = (int)FFMIN(
+        6144 * s->channels / 1024.0 * avctx->sample_rate,
+        avctx->bit_rate);
 
     s->samplerate_index = i;
 
@@ -759,8 +887,8 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
     avctx->extradata_size = 5;
     put_audio_specific_config(avctx);
 
-    sizes[0]   = swb_size_1024[i];
-    sizes[1]   = swb_size_128[i];
+    sizes[0]   = ff_aac_swb_size_1024[i];
+    sizes[1]   = ff_aac_swb_size_128[i];
     lengths[0] = ff_aac_num_swb_1024[i];
     lengths[1] = ff_aac_num_swb_128[i];
     for (i = 0; i < s->chan_map[0]; i++)
@@ -769,14 +897,16 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
                            s->chan_map[0], grouping)) < 0)
         goto fail;
     s->psypp = ff_psy_preprocess_init(avctx);
-    s->coder = &ff_aac_coders[2];
+    s->coder = &ff_aac_coders[s->options.aac_coder];
+    ff_lpc_init(&s->lpc, 2*avctx->frame_size, TNS_MAX_ORDER, FF_LPC_TYPE_LEVINSON);
 
-    s->lambda = avctx->global_quality ? avctx->global_quality : 120;
+    if (HAVE_MIPSDSPR1)
+        ff_aac_coder_init_mips(s);
 
-    ff_aac_tableinit();
+    s->lambda = avctx->global_quality > 0 ? avctx->global_quality : 120;
+    s->random_state = 0x1f2e3d4c;
 
-    for (i = 0; i < 428; i++)
-        ff_aac_pow34sf_tab[i] = sqrt(ff_aac_pow2sf_tab[i] * sqrt(ff_aac_pow2sf_tab[i]));
+    ff_aac_tableinit();
 
     avctx->initial_padding = 1024;
     ff_af_queue_init(avctx, &s->afq);
@@ -793,6 +923,15 @@ static const AVOption aacenc_options[] = {
         {"auto",     "Selected by the Encoder", 0, AV_OPT_TYPE_CONST, {.i64 = -1 }, INT_MIN, INT_MAX, AACENC_FLAGS, "stereo_mode"},
         {"ms_off",   "Disable Mid/Side coding", 0, AV_OPT_TYPE_CONST, {.i64 =  0 }, INT_MIN, INT_MAX, AACENC_FLAGS, "stereo_mode"},
         {"ms_force", "Force Mid/Side for the whole frame if possible", 0, AV_OPT_TYPE_CONST, {.i64 =  1 }, INT_MIN, INT_MAX, AACENC_FLAGS, "stereo_mode"},
+    {"aac_coder", "Coding algorithm", offsetof(AACEncContext, options.aac_coder), AV_OPT_TYPE_INT, {.i64 = AAC_CODER_TWOLOOP}, 0, AAC_CODER_NB-1, AACENC_FLAGS, "aac_coder"},
+        {"faac",     "FAAC-inspired method",      0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_FAAC},    INT_MIN, INT_MAX, AACENC_FLAGS, "aac_coder"},
+        {"anmr",     "ANMR method",               0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_ANMR},    INT_MIN, INT_MAX, AACENC_FLAGS, "aac_coder"},
+        {"twoloop",  "Two loop searching method", 0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_TWOLOOP}, INT_MIN, INT_MAX, AACENC_FLAGS, "aac_coder"},
+        {"fast",     "Constant quantizer",        0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_FAST},    INT_MIN, INT_MAX, AACENC_FLAGS, "aac_coder"},
+    {"aac_pns", "Perceptual Noise Substitution", offsetof(AACEncContext, options.pns), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, AACENC_FLAGS},
+    {"aac_is", "Intensity stereo coding", offsetof(AACEncContext, options.intensity_stereo), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, AACENC_FLAGS},
+    {"aac_tns", "Temporal noise shaping", offsetof(AACEncContext, options.tns), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, AACENC_FLAGS},
+    {"aac_pred", "AAC-Main prediction", offsetof(AACEncContext, options.pred), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, AACENC_FLAGS},
     {NULL}
 };
 
@@ -812,6 +951,7 @@ AVCodec ff_aac_encoder = {
     .init           = aac_encode_init,
     .encode2        = aac_encode_frame,
     .close          = aac_encode_end,
+    .supported_samplerates = mpeg4audio_sample_rates,
     .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY |
                       AV_CODEC_CAP_EXPERIMENTAL,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_FLTP,
diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
index dec445ce34..7e7609b1a8 100644
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -2,20 +2,20 @@
  * AAC encoder
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,8 +30,24 @@
 #include "audio_frame_queue.h"
 #include "psymodel.h"
 
+#include "lpc.h"
+
+typedef enum AACCoder {
+    AAC_CODER_FAAC = 0,
+    AAC_CODER_ANMR,
+    AAC_CODER_TWOLOOP,
+    AAC_CODER_FAST,
+
+    AAC_CODER_NB,
+}AACCoder;
+
 typedef struct AACEncOptions {
     int stereo_mode;
+    int aac_coder;
+    int pns;
+    int tns;
+    int pred;
+    int intensity_stereo;
 } AACEncOptions;
 
 struct AACEncContext;
@@ -41,9 +57,19 @@ typedef struct AACCoefficientsEncoder {
                                   SingleChannelElement *sce, const float lambda);
     void (*encode_window_bands_info)(struct AACEncContext *s, SingleChannelElement *sce,
                                      int win, int group_len, const float lambda);
-    void (*quantize_and_encode_band)(struct AACEncContext *s, PutBitContext *pb, const float *in, int size,
-                                     int scale_idx, int cb, const float lambda);
-    void (*search_for_ms)(struct AACEncContext *s, ChannelElement *cpe, const float lambda);
+    void (*quantize_and_encode_band)(struct AACEncContext *s, PutBitContext *pb, const float *in, float *out, int size,
+                                     int scale_idx, int cb, const float lambda, int rtz);
+    void (*encode_tns_info)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*encode_main_pred)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*adjust_common_prediction)(struct AACEncContext *s, ChannelElement *cpe);
+    void (*apply_main_pred)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*apply_tns_filt)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*set_special_band_scalefactors)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*search_for_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce);
+    void (*search_for_tns)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*search_for_ms)(struct AACEncContext *s, ChannelElement *cpe);
+    void (*search_for_is)(struct AACEncContext *s, AVCodecContext *avctx, ChannelElement *cpe);
+    void (*search_for_pred)(struct AACEncContext *s, SingleChannelElement *sce);
 } AACCoefficientsEncoder;
 
 extern AACCoefficientsEncoder ff_aac_coders[];
@@ -57,9 +83,11 @@ typedef struct AACEncContext {
     PutBitContext pb;
     FFTContext mdct1024;                         ///< long (1024 samples) frame transform context
     FFTContext mdct128;                          ///< short (128 samples) frame transform context
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     float *planar_samples[6];                    ///< saved preprocessed input
 
+    int profile;                                 ///< copied from avctx
+    LPCContext lpc;                              ///< used by TNS
     int samplerate_index;                        ///< MPEG-4 samplerate index
     int channels;                                ///< channel count
     const uint8_t *chan_map;                     ///< channel configuration map
@@ -68,9 +96,12 @@ typedef struct AACEncContext {
     FFPsyContext psy;
     struct FFPsyPreprocessContext* psypp;
     AACCoefficientsEncoder *coder;
-    int cur_channel;
+    int cur_channel;                             ///< current channel for coder context
     int last_frame;
+    int random_state;
     float lambda;
+    enum RawDataBlockType cur_type;              ///< channel group type cur_channel belongs to
+
     AudioFrameQueue afq;
     DECLARE_ALIGNED(16, int,   qcoefs)[96];      ///< quantized coefficients
     DECLARE_ALIGNED(32, float, scoefs)[1024];    ///< scaled coefficients
@@ -80,6 +111,6 @@ typedef struct AACEncContext {
     } buffer;
 } AACEncContext;
 
-extern float ff_aac_pow34sf_tab[428];
+void ff_aac_coder_init_mips(AACEncContext *c);
 
 #endif /* AVCODEC_AACENC_H */
diff --git a/libavcodec/aacenc_is.c b/libavcodec/aacenc_is.c
new file mode 100644
index 0000000000..e983b7548f
--- /dev/null
+++ b/libavcodec/aacenc_is.c
@@ -0,0 +1,141 @@
+/*
+ * AAC encoder intensity stereo
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder Intensity Stereo
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#include "aacenc.h"
+#include "aacenc_utils.h"
+#include "aacenc_is.h"
+#include "aacenc_quantization.h"
+
+struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
+                                         int start, int w, int g, float ener0,
+                                         float ener1, float ener01,
+                                         int use_pcoeffs, int phase)
+{
+    int i, w2;
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    float *L = use_pcoeffs ? sce0->pcoeffs : sce0->coeffs;
+    float *R = use_pcoeffs ? sce1->pcoeffs : sce1->coeffs;
+    float *L34 = &s->scoefs[256*0], *R34 = &s->scoefs[256*1];
+    float *IS  = &s->scoefs[256*2], *I34 = &s->scoefs[256*3];
+    float dist1 = 0.0f, dist2 = 0.0f;
+    struct AACISError is_error = {0};
+
+    for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+        FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
+        FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
+        int is_band_type, is_sf_idx = FFMAX(1, sce0->sf_idx[(w+w2)*16+g]-4);
+        float e01_34 = phase*pow(ener1/ener0, 3.0/4.0);
+        float maxval, dist_spec_err = 0.0f;
+        float minthr = FFMIN(band0->threshold, band1->threshold);
+        for (i = 0; i < sce0->ics.swb_sizes[g]; i++)
+            IS[i] = (L[start+(w+w2)*128+i] + phase*R[start+(w+w2)*128+i])*sqrt(ener0/ener01);
+        abs_pow34_v(L34, &L[start+(w+w2)*128], sce0->ics.swb_sizes[g]);
+        abs_pow34_v(R34, &R[start+(w+w2)*128], sce0->ics.swb_sizes[g]);
+        abs_pow34_v(I34, IS,                   sce0->ics.swb_sizes[g]);
+        maxval = find_max_val(1, sce0->ics.swb_sizes[g], I34);
+        is_band_type = find_min_book(maxval, is_sf_idx);
+        dist1 += quantize_band_cost(s, &L[start + (w+w2)*128], L34,
+                                    sce0->ics.swb_sizes[g],
+                                    sce0->sf_idx[(w+w2)*16+g],
+                                    sce0->band_type[(w+w2)*16+g],
+                                    s->lambda / band0->threshold, INFINITY, NULL, 0);
+        dist1 += quantize_band_cost(s, &R[start + (w+w2)*128], R34,
+                                    sce1->ics.swb_sizes[g],
+                                    sce1->sf_idx[(w+w2)*16+g],
+                                    sce1->band_type[(w+w2)*16+g],
+                                    s->lambda / band1->threshold, INFINITY, NULL, 0);
+        dist2 += quantize_band_cost(s, IS, I34, sce0->ics.swb_sizes[g],
+                                    is_sf_idx, is_band_type,
+                                    s->lambda / minthr, INFINITY, NULL, 0);
+        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+            dist_spec_err += (L34[i] - I34[i])*(L34[i] - I34[i]);
+            dist_spec_err += (R34[i] - I34[i]*e01_34)*(R34[i] - I34[i]*e01_34);
+        }
+        dist_spec_err *= s->lambda / minthr;
+        dist2 += dist_spec_err;
+    }
+
+    is_error.pass = dist2 <= dist1;
+    is_error.phase = phase;
+    is_error.error = fabsf(dist1 - dist2);
+    is_error.dist1 = dist1;
+    is_error.dist2 = dist2;
+
+    return is_error;
+}
+
+void ff_aac_search_for_is(AACEncContext *s, AVCodecContext *avctx, ChannelElement *cpe)
+{
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    int start = 0, count = 0, w, w2, g, i;
+    const float freq_mult = avctx->sample_rate/(1024.0f/sce0->ics.num_windows)/2.0f;
+
+    if (!cpe->common_window)
+        return;
+
+    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        start = 0;
+        for (g = 0;  g < sce0->ics.num_swb; g++) {
+            if (start*freq_mult > INT_STEREO_LOW_LIMIT*(s->lambda/170.0f) &&
+                cpe->ch[0].band_type[w*16+g] != NOISE_BT && !cpe->ch[0].zeroes[w*16+g] &&
+                cpe->ch[1].band_type[w*16+g] != NOISE_BT && !cpe->ch[1].zeroes[w*16+g]) {
+                float ener0 = 0.0f, ener1 = 0.0f, ener01 = 0.0f;
+                struct AACISError ph_err1, ph_err2, *erf;
+                if (sce0->band_type[w*16+g] == NOISE_BT ||
+                    sce1->band_type[w*16+g] == NOISE_BT) {
+                    start += sce0->ics.swb_sizes[g];
+                    continue;
+                }
+                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                        float coef0 = fabsf(sce0->pcoeffs[start+(w+w2)*128+i]);
+                        float coef1 = fabsf(sce1->pcoeffs[start+(w+w2)*128+i]);
+                        ener0  += coef0*coef0;
+                        ener1  += coef1*coef1;
+                        ener01 += (coef0 + coef1)*(coef0 + coef1);
+                    }
+                }
+                ph_err1 = ff_aac_is_encoding_err(s, cpe, start, w, g,
+                                                 ener0, ener1, ener01, 0, -1);
+                ph_err2 = ff_aac_is_encoding_err(s, cpe, start, w, g,
+                                                 ener0, ener1, ener01, 0, +1);
+                erf = ph_err1.error < ph_err2.error ? &ph_err1 : &ph_err2;
+                if (erf->pass) {
+                    cpe->is_mask[w*16+g] = 1;
+                    cpe->ch[0].is_ener[w*16+g] = sqrt(ener0/ener01);
+                    cpe->ch[1].is_ener[w*16+g] = ener0/ener1;
+                    cpe->ch[1].band_type[w*16+g] = erf->phase ? INTENSITY_BT : INTENSITY_BT2;
+                    count++;
+                }
+            }
+            start += sce0->ics.swb_sizes[g];
+        }
+    }
+    cpe->is_mode = !!count;
+}
diff --git a/libavcodec/aacenc_is.h b/libavcodec/aacenc_is.h
new file mode 100644
index 0000000000..31bbacac58
--- /dev/null
+++ b/libavcodec/aacenc_is.h
@@ -0,0 +1,50 @@
+/*
+ * AAC encoder intensity stereo
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder Intensity Stereo
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_IS_H
+#define AVCODEC_AACENC_IS_H
+
+#include "aacenc.h"
+
+/** Frequency in Hz for lower limit of intensity stereo **/
+#define INT_STEREO_LOW_LIMIT 6100
+
+struct AACISError {
+    int pass;    /* 1 if dist2 <= dist1  */
+    int phase;   /* -1 or +1             */
+    float error; /* fabs(dist1 - dist2)  */
+    float dist1; /* From original coeffs */
+    float dist2; /* From IS'd coeffs     */
+};
+
+struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
+                                         int start, int w, int g, float ener0,
+                                         float ener1, float ener01,
+                                         int use_pcoeffs, int phase);
+void ff_aac_search_for_is(AACEncContext *s, AVCodecContext *avctx, ChannelElement *cpe);
+
+#endif /* AVCODEC_AACENC_IS_H */
diff --git a/libavcodec/aacenc_pred.c b/libavcodec/aacenc_pred.c
new file mode 100644
index 0000000000..c0e5e6e3b6
--- /dev/null
+++ b/libavcodec/aacenc_pred.c
@@ -0,0 +1,344 @@
+/*
+ * AAC encoder main-type prediction
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder Intensity Stereo
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#include "aactab.h"
+#include "aacenc_pred.h"
+#include "aacenc_utils.h"
+#include "aacenc_is.h"            /* <- Needed for common window distortions */
+#include "aacenc_quantization.h"
+
+#define RESTORE_PRED(sce, sfb) \
+        if (sce->ics.prediction_used[sfb]) {\
+            sce->ics.prediction_used[sfb] = 0;\
+            sce->band_type[sfb] = sce->band_alt[sfb];\
+        }
+
+static inline float flt16_round(float pf)
+{
+    union av_intfloat32 tmp;
+    tmp.f = pf;
+    tmp.i = (tmp.i + 0x00008000U) & 0xFFFF0000U;
+    return tmp.f;
+}
+
+static inline float flt16_even(float pf)
+{
+    union av_intfloat32 tmp;
+    tmp.f = pf;
+    tmp.i = (tmp.i + 0x00007FFFU + (tmp.i & 0x00010000U >> 16)) & 0xFFFF0000U;
+    return tmp.f;
+}
+
+static inline float flt16_trunc(float pf)
+{
+    union av_intfloat32 pun;
+    pun.f = pf;
+    pun.i &= 0xFFFF0000U;
+    return pun.f;
+}
+
+static inline void predict(PredictorState *ps, float *coef, float *rcoef, int set)
+{
+    float k2;
+    const float a     = 0.953125; // 61.0 / 64
+    const float alpha = 0.90625;  // 29.0 / 32
+    const float   k1 = ps->k1;
+    const float   r0 = ps->r0,     r1 = ps->r1;
+    const float cor0 = ps->cor0, cor1 = ps->cor1;
+    const float var0 = ps->var0, var1 = ps->var1;
+    const float e0 = *coef - ps->x_est;
+    const float e1 = e0 - k1 * r0;
+
+    if (set)
+        *coef = e0;
+
+    ps->cor1 = flt16_trunc(alpha * cor1 + r1 * e1);
+    ps->var1 = flt16_trunc(alpha * var1 + 0.5f * (r1 * r1 + e1 * e1));
+    ps->cor0 = flt16_trunc(alpha * cor0 + r0 * e0);
+    ps->var0 = flt16_trunc(alpha * var0 + 0.5f * (r0 * r0 + e0 * e0));
+    ps->r1   = flt16_trunc(a * (r0 - k1 * e0));
+    ps->r0   = flt16_trunc(a * e0);
+
+    /* Prediction for next frame */
+    ps->k1   = ps->var0 > 1 ? ps->cor0 * flt16_even(a / ps->var0) : 0;
+    k2       = ps->var1 > 1 ? ps->cor1 * flt16_even(a / ps->var1) : 0;
+    *rcoef   = ps->x_est = flt16_round(ps->k1*ps->r0 + k2*ps->r1);
+}
+
+static inline void reset_predict_state(PredictorState *ps)
+{
+    ps->r0    = 0.0f;
+    ps->r1    = 0.0f;
+    ps->k1    = 0.0f;
+    ps->cor0  = 0.0f;
+    ps->cor1  = 0.0f;
+    ps->var0  = 1.0f;
+    ps->var1  = 1.0f;
+    ps->x_est = 0.0f;
+}
+
+static inline void reset_all_predictors(PredictorState *ps)
+{
+    int i;
+    for (i = 0; i < MAX_PREDICTORS; i++)
+        reset_predict_state(&ps[i]);
+}
+
+static inline void reset_predictor_group(SingleChannelElement *sce, int group_num)
+{
+    int i;
+    PredictorState *ps = sce->predictor_state;
+    for (i = group_num - 1; i < MAX_PREDICTORS; i += 30)
+        reset_predict_state(&ps[i]);
+}
+
+void ff_aac_apply_main_pred(AACEncContext *s, SingleChannelElement *sce)
+{
+    int sfb, k;
+    const int pmax = FFMIN(sce->ics.max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+
+    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
+        for (sfb = 0; sfb < pmax; sfb++) {
+            for (k = sce->ics.swb_offset[sfb]; k < sce->ics.swb_offset[sfb + 1]; k++) {
+                predict(&sce->predictor_state[k], &sce->coeffs[k], &sce->prcoeffs[k],
+                        sce->ics.predictor_present && sce->ics.prediction_used[sfb]);
+            }
+        }
+        if (sce->ics.predictor_reset_group) {
+            reset_predictor_group(sce, sce->ics.predictor_reset_group);
+        }
+    } else {
+        reset_all_predictors(sce->predictor_state);
+    }
+}
+
+/* If inc = 0 you can check if this returns 0 to see if you can reset freely */
+static inline int update_counters(IndividualChannelStream *ics, int inc)
+{
+    int i;
+    for (i = 1; i < 31; i++) {
+        ics->predictor_reset_count[i] += inc;
+        if (ics->predictor_reset_count[i] > PRED_RESET_FRAME_MIN)
+            return i; /* Reset this immediately */
+    }
+    return 0;
+}
+
+void ff_aac_adjust_common_prediction(AACEncContext *s, ChannelElement *cpe)
+{
+    int start, w, w2, g, i, count = 0;
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    const int pmax0 = FFMIN(sce0->ics.max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+    const int pmax1 = FFMIN(sce1->ics.max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+    const int pmax  = FFMIN(pmax0, pmax1);
+
+    if (!cpe->common_window ||
+        sce0->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE ||
+        sce1->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE)
+        return;
+
+    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        start = 0;
+        for (g = 0; g < sce0->ics.num_swb; g++) {
+            int sfb = w*16+g;
+            int sum = sce0->ics.prediction_used[sfb] + sce1->ics.prediction_used[sfb];
+            float ener0 = 0.0f, ener1 = 0.0f, ener01 = 0.0f;
+            struct AACISError ph_err1, ph_err2, *erf;
+            if (sfb < PRED_SFB_START || sfb > pmax || sum != 2) {
+                RESTORE_PRED(sce0, sfb);
+                RESTORE_PRED(sce1, sfb);
+                start += sce0->ics.swb_sizes[g];
+                continue;
+            }
+            for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                    float coef0 = sce0->pcoeffs[start+(w+w2)*128+i];
+                    float coef1 = sce1->pcoeffs[start+(w+w2)*128+i];
+                    ener0  += coef0*coef0;
+                    ener1  += coef1*coef1;
+                    ener01 += (coef0 + coef1)*(coef0 + coef1);
+                }
+            }
+            ph_err1 = ff_aac_is_encoding_err(s, cpe, start, w, g,
+                                             ener0, ener1, ener01, 1, -1);
+            ph_err2 = ff_aac_is_encoding_err(s, cpe, start, w, g,
+                                             ener0, ener1, ener01, 1, +1);
+            erf = ph_err1.error < ph_err2.error ? &ph_err1 : &ph_err2;
+            if (erf->pass) {
+                sce0->ics.prediction_used[sfb] = 1;
+                sce1->ics.prediction_used[sfb] = 1;
+                count++;
+            } else {
+                RESTORE_PRED(sce0, sfb);
+                RESTORE_PRED(sce1, sfb);
+            }
+            start += sce0->ics.swb_sizes[g];
+        }
+    }
+
+    sce1->ics.predictor_present = sce0->ics.predictor_present = !!count;
+}
+
+static void update_pred_resets(SingleChannelElement *sce)
+{
+    int i, max_group_id_c, max_frame = 0;
+    float avg_frame = 0.0f;
+    IndividualChannelStream *ics = &sce->ics;
+
+    /* Update the counters and immediately update any frame behind schedule */
+    if ((ics->predictor_reset_group = update_counters(&sce->ics, 1)))
+        return;
+
+    for (i = 1; i < 31; i++) {
+        /* Count-based */
+        if (ics->predictor_reset_count[i] > max_frame) {
+            max_group_id_c = i;
+            max_frame = ics->predictor_reset_count[i];
+        }
+        avg_frame = (ics->predictor_reset_count[i] + avg_frame)/2;
+    }
+
+    if (max_frame > PRED_RESET_MIN) {
+        ics->predictor_reset_group = max_group_id_c;
+    } else {
+        ics->predictor_reset_group = 0;
+    }
+}
+
+void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce)
+{
+    int sfb, i, count = 0, cost_coeffs = 0, cost_pred = 0;
+    const int pmax = FFMIN(sce->ics.max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+    float *O34  = &s->scoefs[128*0], *P34 = &s->scoefs[128*1];
+    float *SENT = &s->scoefs[128*2], *S34 = &s->scoefs[128*3];
+    float *QERR = &s->scoefs[128*4];
+
+    if (sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        sce->ics.predictor_present = 0;
+        return;
+    }
+
+    if (!sce->ics.predictor_initialized) {
+        reset_all_predictors(sce->predictor_state);
+        sce->ics.predictor_initialized = 1;
+        memcpy(sce->prcoeffs, sce->coeffs, 1024*sizeof(float));
+        for (i = 1; i < 31; i++)
+            sce->ics.predictor_reset_count[i] = i;
+    }
+
+    update_pred_resets(sce);
+    memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type));
+
+    for (sfb = PRED_SFB_START; sfb < pmax; sfb++) {
+        int cost1, cost2, cb_p;
+        float dist1, dist2, dist_spec_err = 0.0f;
+        const int cb_n = sce->band_type[sfb];
+        const int start_coef = sce->ics.swb_offset[sfb];
+        const int num_coeffs = sce->ics.swb_offset[sfb + 1] - start_coef;
+        const FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[sfb];
+
+        if (start_coef + num_coeffs > MAX_PREDICTORS ||
+            (s->cur_channel && sce->band_type[sfb] >= INTENSITY_BT2) ||
+            sce->band_type[sfb] == NOISE_BT)
+            continue;
+
+        /* Normal coefficients */
+        abs_pow34_v(O34, &sce->coeffs[start_coef], num_coeffs);
+        dist1 = quantize_and_encode_band_cost(s, NULL, &sce->coeffs[start_coef], NULL,
+                                              O34, num_coeffs, sce->sf_idx[sfb],
+                                              cb_n, s->lambda / band->threshold, INFINITY, &cost1, 0);
+        cost_coeffs += cost1;
+
+        /* Encoded coefficients - needed for #bits, band type and quant. error */
+        for (i = 0; i < num_coeffs; i++)
+            SENT[i] = sce->coeffs[start_coef + i] - sce->prcoeffs[start_coef + i];
+        abs_pow34_v(S34, SENT, num_coeffs);
+        if (cb_n < RESERVED_BT)
+            cb_p = find_min_book(find_max_val(1, num_coeffs, S34), sce->sf_idx[sfb]);
+        else
+            cb_p = cb_n;
+        quantize_and_encode_band_cost(s, NULL, SENT, QERR, S34, num_coeffs,
+                                      sce->sf_idx[sfb], cb_p, s->lambda / band->threshold, INFINITY,
+                                      &cost2, 0);
+
+        /* Reconstructed coefficients - needed for distortion measurements */
+        for (i = 0; i < num_coeffs; i++)
+            sce->prcoeffs[start_coef + i] += QERR[i] != 0.0f ? (sce->prcoeffs[start_coef + i] - QERR[i]) : 0.0f;
+        abs_pow34_v(P34, &sce->prcoeffs[start_coef], num_coeffs);
+        if (cb_n < RESERVED_BT)
+            cb_p = find_min_book(find_max_val(1, num_coeffs, P34), sce->sf_idx[sfb]);
+        else
+            cb_p = cb_n;
+        dist2 = quantize_and_encode_band_cost(s, NULL, &sce->prcoeffs[start_coef], NULL,
+                                              P34, num_coeffs, sce->sf_idx[sfb],
+                                              cb_p, s->lambda / band->threshold, INFINITY, NULL, 0);
+        for (i = 0; i < num_coeffs; i++)
+            dist_spec_err += (O34[i] - P34[i])*(O34[i] - P34[i]);
+        dist_spec_err *= s->lambda / band->threshold;
+        dist2 += dist_spec_err;
+
+        if (dist2 <= dist1 && cb_p <= cb_n) {
+            cost_pred += cost2;
+            sce->ics.prediction_used[sfb] = 1;
+            sce->band_alt[sfb]  = cb_n;
+            sce->band_type[sfb] = cb_p;
+            count++;
+        } else {
+            cost_pred += cost1;
+            sce->band_alt[sfb] = cb_p;
+        }
+    }
+
+    if (count && cost_coeffs < cost_pred) {
+        count = 0;
+        for (sfb = PRED_SFB_START; sfb < pmax; sfb++)
+            RESTORE_PRED(sce, sfb);
+        memset(&sce->ics.prediction_used, 0, sizeof(sce->ics.prediction_used));
+    }
+
+    sce->ics.predictor_present = !!count;
+}
+
+/**
+ * Encoder predictors data.
+ */
+void ff_aac_encode_main_pred(AACEncContext *s, SingleChannelElement *sce)
+{
+    int sfb;
+    IndividualChannelStream *ics = &sce->ics;
+    const int pmax = FFMIN(ics->max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+
+    if (!ics->predictor_present)
+        return;
+
+    put_bits(&s->pb, 1, !!ics->predictor_reset_group);
+    if (ics->predictor_reset_group)
+        put_bits(&s->pb, 5, ics->predictor_reset_group);
+    for (sfb = 0; sfb < pmax; sfb++)
+        put_bits(&s->pb, 1, ics->prediction_used[sfb]);
+}
diff --git a/libavcodec/aacenc_pred.h b/libavcodec/aacenc_pred.h
new file mode 100644
index 0000000000..999af869f5
--- /dev/null
+++ b/libavcodec/aacenc_pred.h
@@ -0,0 +1,47 @@
+/*
+ * AAC encoder main-type prediction
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder main prediction
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_PRED_H
+#define AVCODEC_AACENC_PRED_H
+
+#include "aacenc.h"
+
+/* Every predictor group needs to get reset at least once in this many frames */
+#define PRED_RESET_FRAME_MIN 240
+
+/* Any frame with less than this amount of frames since last reset is ok */
+#define PRED_RESET_MIN 64
+
+/* Raise to filter any low frequency artifacts due to prediction */
+#define PRED_SFB_START 10
+
+void ff_aac_apply_main_pred(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_adjust_common_prediction(AACEncContext *s, ChannelElement *cpe);
+void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_encode_main_pred(AACEncContext *s, SingleChannelElement *sce);
+
+#endif /* AVCODEC_AACENC_PRED_H */
diff --git a/libavcodec/aacenc_quantization.h b/libavcodec/aacenc_quantization.h
new file mode 100644
index 0000000000..6776dc37f7
--- /dev/null
+++ b/libavcodec/aacenc_quantization.h
@@ -0,0 +1,274 @@
+/*
+ * AAC encoder intensity stereo
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder quantizer
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_QUANTIZATION_H
+#define AVCODEC_AACENC_QUANTIZATION_H
+
+#include "aactab.h"
+#include "aacenc.h"
+#include "aacenctab.h"
+#include "aacenc_utils.h"
+
+/**
+ * Calculate rate distortion cost for quantizing with given codebook
+ *
+ * @return quantization distortion
+ */
+static av_always_inline float quantize_and_encode_band_cost_template(
+                                struct AACEncContext *s,
+                                PutBitContext *pb, const float *in, float *out,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, int BT_ZERO, int BT_UNSIGNED,
+                                int BT_PAIR, int BT_ESC, int BT_NOISE, int BT_STEREO,
+                                const float ROUNDING)
+{
+    const int q_idx = POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512;
+    const float Q   = ff_aac_pow2sf_tab [q_idx];
+    const float Q34 = ff_aac_pow34sf_tab[q_idx];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    const float CLIPPED_ESCAPE = 165140.0f*IQ;
+    int i, j;
+    float cost = 0;
+    const int dim = BT_PAIR ? 2 : 4;
+    int resbits = 0;
+    int off;
+
+    if (BT_ZERO || BT_NOISE || BT_STEREO) {
+        for (i = 0; i < size; i++)
+            cost += in[i]*in[i];
+        if (bits)
+            *bits = 0;
+        if (out) {
+            for (i = 0; i < size; i += dim)
+                for (j = 0; j < dim; j++)
+                    out[i+j] = 0.0f;
+        }
+        return cost * lambda;
+    }
+    if (!scaled) {
+        abs_pow34_v(s->scoefs, in, size);
+        scaled = s->scoefs;
+    }
+    quantize_bands(s->qcoefs, in, scaled, size, Q34, !BT_UNSIGNED, aac_cb_maxval[cb], ROUNDING);
+    if (BT_UNSIGNED) {
+        off = 0;
+    } else {
+        off = aac_cb_maxval[cb];
+    }
+    for (i = 0; i < size; i += dim) {
+        const float *vec;
+        int *quants = s->qcoefs + i;
+        int curidx = 0;
+        int curbits;
+        float quantized, rd = 0.0f;
+        for (j = 0; j < dim; j++) {
+            curidx *= aac_cb_range[cb];
+            curidx += quants[j] + off;
+        }
+        curbits =  ff_aac_spectral_bits[cb-1][curidx];
+        vec     = &ff_aac_codebook_vectors[cb-1][curidx*dim];
+        if (BT_UNSIGNED) {
+            for (j = 0; j < dim; j++) {
+                float t = fabsf(in[i+j]);
+                float di;
+                if (BT_ESC && vec[j] == 64.0f) { //FIXME: slow
+                    if (t >= CLIPPED_ESCAPE) {
+                        quantized = CLIPPED_ESCAPE;
+                        curbits += 21;
+                    } else {
+                        int c = av_clip_uintp2(quant(t, Q, ROUNDING), 13);
+                        quantized = c*cbrtf(c)*IQ;
+                        curbits += av_log2(c)*2 - 4 + 1;
+                    }
+                } else {
+                    quantized = vec[j]*IQ;
+                }
+                di = t - quantized;
+                if (out)
+                    out[i+j] = in[i+j] >= 0 ? quantized : -quantized;
+                if (vec[j] != 0.0f)
+                    curbits++;
+                rd += di*di;
+            }
+        } else {
+            for (j = 0; j < dim; j++) {
+                quantized = vec[j]*IQ;
+                if (out)
+                    out[i+j] = quantized;
+                rd += (in[i+j] - quantized)*(in[i+j] - quantized);
+            }
+        }
+        cost    += rd * lambda + curbits;
+        resbits += curbits;
+        if (cost >= uplim)
+            return uplim;
+        if (pb) {
+            put_bits(pb, ff_aac_spectral_bits[cb-1][curidx], ff_aac_spectral_codes[cb-1][curidx]);
+            if (BT_UNSIGNED)
+                for (j = 0; j < dim; j++)
+                    if (ff_aac_codebook_vectors[cb-1][curidx*dim+j] != 0.0f)
+                        put_bits(pb, 1, in[i+j] < 0.0f);
+            if (BT_ESC) {
+                for (j = 0; j < 2; j++) {
+                    if (ff_aac_codebook_vectors[cb-1][curidx*2+j] == 64.0f) {
+                        int coef = av_clip_uintp2(quant(fabsf(in[i+j]), Q, ROUNDING), 13);
+                        int len = av_log2(coef);
+
+                        put_bits(pb, len - 4 + 1, (1 << (len - 4 + 1)) - 2);
+                        put_sbits(pb, len, coef);
+                    }
+                }
+            }
+        }
+    }
+
+    if (bits)
+        *bits = resbits;
+    return cost;
+}
+
+static inline float quantize_and_encode_band_cost_NONE(struct AACEncContext *s, PutBitContext *pb,
+                                                const float *in, float *quant, const float *scaled,
+                                                int size, int scale_idx, int cb,
+                                                const float lambda, const float uplim,
+                                                int *bits) {
+    av_assert0(0);
+    return 0.0f;
+}
+
+#define QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NAME, BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC, BT_NOISE, BT_STEREO, ROUNDING) \
+static float quantize_and_encode_band_cost_ ## NAME(                                         \
+                                struct AACEncContext *s,                                     \
+                                PutBitContext *pb, const float *in, float *quant,            \
+                                const float *scaled, int size, int scale_idx,                \
+                                int cb, const float lambda, const float uplim,               \
+                                int *bits) {                                                 \
+    return quantize_and_encode_band_cost_template(                                           \
+                                s, pb, in, quant, scaled, size, scale_idx,                   \
+                                BT_ESC ? ESC_BT : cb, lambda, uplim, bits,                   \
+                                BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC, BT_NOISE, BT_STEREO,  \
+                                ROUNDING);                                                   \
+}
+
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ZERO,  1, 0, 0, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SQUAD, 0, 0, 0, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UQUAD, 0, 1, 0, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SPAIR, 0, 0, 1, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UPAIR, 0, 1, 1, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC,   0, 1, 1, 1, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC_RTZ, 0, 1, 1, 1, 0, 0, ROUND_TO_ZERO)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NOISE, 0, 0, 0, 0, 1, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(STEREO,0, 0, 0, 0, 0, 1, ROUND_STANDARD)
+
+static float (*const quantize_and_encode_band_cost_arr[])(
+                                struct AACEncContext *s,
+                                PutBitContext *pb, const float *in, float *quant,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits) = {
+    quantize_and_encode_band_cost_ZERO,
+    quantize_and_encode_band_cost_SQUAD,
+    quantize_and_encode_band_cost_SQUAD,
+    quantize_and_encode_band_cost_UQUAD,
+    quantize_and_encode_band_cost_UQUAD,
+    quantize_and_encode_band_cost_SPAIR,
+    quantize_and_encode_band_cost_SPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_ESC,
+    quantize_and_encode_band_cost_NONE,     /* CB 12 doesn't exist */
+    quantize_and_encode_band_cost_NOISE,
+    quantize_and_encode_band_cost_STEREO,
+    quantize_and_encode_band_cost_STEREO,
+};
+
+static float (*const quantize_and_encode_band_cost_rtz_arr[])(
+                                struct AACEncContext *s,
+                                PutBitContext *pb, const float *in, float *quant,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits) = {
+    quantize_and_encode_band_cost_ZERO,
+    quantize_and_encode_band_cost_SQUAD,
+    quantize_and_encode_band_cost_SQUAD,
+    quantize_and_encode_band_cost_UQUAD,
+    quantize_and_encode_band_cost_UQUAD,
+    quantize_and_encode_band_cost_SPAIR,
+    quantize_and_encode_band_cost_SPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_UPAIR,
+    quantize_and_encode_band_cost_ESC_RTZ,
+    quantize_and_encode_band_cost_NONE,     /* CB 12 doesn't exist */
+    quantize_and_encode_band_cost_NOISE,
+    quantize_and_encode_band_cost_STEREO,
+    quantize_and_encode_band_cost_STEREO,
+};
+
+#define quantize_and_encode_band_cost(                                  \
+                                s, pb, in, quant, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, rtz)               \
+    ((rtz) ? quantize_and_encode_band_cost_rtz_arr : quantize_and_encode_band_cost_arr)[cb]( \
+                                s, pb, in, quant, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits)
+
+static inline float quantize_band_cost(struct AACEncContext *s, const float *in,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, int rtz)
+{
+    return quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
+                                         cb, lambda, uplim, bits, rtz);
+}
+
+static inline int quantize_band_cost_bits(struct AACEncContext *s, const float *in,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, int rtz)
+{
+    int _bits;
+    quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
+                                         cb, 0.0f, uplim, &_bits, rtz);
+    if (bits) {
+        *bits = _bits;
+    }
+    return _bits;
+}
+
+static inline void quantize_and_encode_band(struct AACEncContext *s, PutBitContext *pb,
+                                            const float *in, float *out, int size, int scale_idx,
+                                            int cb, const float lambda, int rtz)
+{
+    quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
+                                  INFINITY, NULL, rtz);
+}
+
+#endif /* AVCODEC_AACENC_QUANTIZATION_H */
diff --git a/libavcodec/aacenc_tns.c b/libavcodec/aacenc_tns.c
new file mode 100644
index 0000000000..815b4e3ab9
--- /dev/null
+++ b/libavcodec/aacenc_tns.c
@@ -0,0 +1,219 @@
+/*
+ * AAC encoder TNS
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder temporal noise shaping
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#include "aacenc.h"
+#include "aacenc_tns.h"
+#include "aactab.h"
+#include "aacenc_utils.h"
+#include "aacenc_quantization.h"
+
+/*
+ * Shifts the values as well if compression is possible.
+ */
+static inline int compress_coeffs(int *coef, int order, int c_bits)
+{
+    int i, res = 0;
+    const int low_idx   = c_bits ?  4 : 2;
+    const int shift_val = c_bits ?  8 : 4;
+    const int high_idx  = c_bits ? 11 : 5;
+    for (i = 0; i < order; i++)
+        if (coef[i] < low_idx || coef[i] > high_idx)
+            res++;
+    if (res == order)
+        for (i = 0; i < order; i++)
+            coef[i] -= (coef[i] > high_idx) ? shift_val : 0;
+    return res == order;
+}
+
+/**
+ * Encode TNS data.
+ * Coefficient compression saves a single bit per coefficient.
+ */
+void ff_aac_encode_tns_info(AACEncContext *s, SingleChannelElement *sce)
+{
+    int i, w, filt, coef_len, coef_compress = 0;
+    const int is8 = sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE;
+    TemporalNoiseShaping *tns = &sce->tns;
+    const int c_bits = is8 ? TNS_Q_BITS_SHORT == 4 : TNS_Q_BITS == 4;
+
+    if (!sce->tns.present)
+        return;
+
+    for (i = 0; i < sce->ics.num_windows; i++) {
+        put_bits(&s->pb, 2 - is8, sce->tns.n_filt[i]);
+        if (tns->n_filt[i]) {
+            put_bits(&s->pb, 1, c_bits);
+            for (filt = 0; filt < tns->n_filt[i]; filt++) {
+                put_bits(&s->pb, 6 - 2 * is8, tns->length[i][filt]);
+                put_bits(&s->pb, 5 - 2 * is8, tns->order[i][filt]);
+                if (tns->order[i][filt]) {
+                    coef_compress = compress_coeffs(tns->coef_idx[i][filt],
+                                                    tns->order[i][filt], c_bits);
+                    put_bits(&s->pb, 1, !!tns->direction[i][filt]);
+                    put_bits(&s->pb, 1, !!coef_compress);
+                    coef_len = c_bits + 3 - coef_compress;
+                    for (w = 0; w < tns->order[i][filt]; w++)
+                        put_bits(&s->pb, coef_len, tns->coef_idx[i][filt][w]);
+                }
+            }
+        }
+    }
+}
+
+/* Apply TNS filter */
+void ff_aac_apply_tns(AACEncContext *s, SingleChannelElement *sce)
+{
+    TemporalNoiseShaping *tns = &sce->tns;
+    IndividualChannelStream *ics = &sce->ics;
+    int w, filt, m, i, top, order, bottom, start, end, size, inc;
+    const int mmm = FFMIN(ics->tns_max_bands, ics->max_sfb);
+    float lpc[TNS_MAX_ORDER];
+
+    for (w = 0; w < ics->num_windows; w++) {
+        bottom = ics->num_swb;
+        for (filt = 0; filt < tns->n_filt[w]; filt++) {
+            top    = bottom;
+            bottom = FFMAX(0, top - tns->length[w][filt]);
+            order  = tns->order[w][filt];
+            if (order == 0)
+                continue;
+
+            // tns_decode_coef
+            compute_lpc_coefs(tns->coef[w][filt], order, lpc, 0, 0, 0);
+
+            start = ics->swb_offset[FFMIN(bottom, mmm)];
+            end   = ics->swb_offset[FFMIN(   top, mmm)];
+            if ((size = end - start) <= 0)
+                continue;
+            if (tns->direction[w][filt]) {
+                inc = -1;
+                start = end - 1;
+            } else {
+                inc = 1;
+            }
+            start += w * 128;
+
+            // ar filter
+            for (m = 0; m < size; m++, start += inc)
+                for (i = 1; i <= FFMIN(m, order); i++)
+                    sce->coeffs[start] += lpc[i-1]*sce->pcoeffs[start - i*inc];
+        }
+    }
+}
+
+/*
+ * c_bits - 1 if 4 bit coefficients, 0 if 3 bit coefficients
+ */
+static inline void quantize_coefs(double *coef, int *idx, float *lpc, int order,
+                                  int c_bits)
+{
+    int i;
+    const float *quant_arr = tns_tmp2_map[c_bits];
+    for (i = 0; i < order; i++) {
+        idx[i] = quant_array_idx((float)coef[i], quant_arr, c_bits ? 16 : 8);
+        lpc[i] = quant_arr[idx[i]];
+    }
+}
+
+/*
+ * 3 bits per coefficient with 8 short windows
+ */
+void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
+{
+    TemporalNoiseShaping *tns = &sce->tns;
+    int w, w2, g, count = 0;
+    const int mmm = FFMIN(sce->ics.tns_max_bands, sce->ics.max_sfb);
+    const int is8 = sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE;
+    const int c_bits = is8 ? TNS_Q_BITS_SHORT == 4 : TNS_Q_BITS == 4;
+
+    int sfb_start = av_clip(tns_min_sfb[is8][s->samplerate_index], 0, mmm);
+    int sfb_end   = av_clip(sce->ics.num_swb, 0, mmm);
+
+    for (w = 0; w < sce->ics.num_windows; w++) {
+        int use_tns;
+        int order = is8 ? 5 : s->profile == FF_PROFILE_AAC_LOW ? 12 : TNS_MAX_ORDER;
+        int coef_start = w*sce->ics.num_swb + sce->ics.swb_offset[sfb_start];
+        int coef_len = sce->ics.swb_offset[sfb_end] - sce->ics.swb_offset[sfb_start];
+        float e_ratio = 0.0f, threshold = 0.0f, spread = 0.0f, en[2] = {0.0, 0.0f};
+        double gain = 0.0f, coefs[MAX_LPC_ORDER] = {0};
+
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (w*16+g < sfb_start || w*16+g > sfb_end)
+                continue;
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                if ((w+w2)*16+g > sfb_start + ((sfb_end - sfb_start)/2))
+                    en[1] += band->energy;
+                else
+                    en[0] += band->energy;
+                threshold += band->threshold;
+                spread    += band->spread;
+            }
+        }
+
+        if (coef_len <= 0 || (sfb_end - sfb_start) <= 0)
+            continue;
+
+        /* LPC */
+        gain = ff_lpc_calc_ref_coefs_f(&s->lpc, &sce->coeffs[coef_start],
+                                       coef_len, order, coefs);
+
+        if (!order || gain < TNS_GAIN_THRESHOLD_LOW || gain > TNS_GAIN_THRESHOLD_HIGH)
+            use_tns = 0;
+        else if ((en[0]+en[1]) < TNS_GAIN_THRESHOLD_LOW*threshold || spread < TNS_SPREAD_THRESHOLD)
+            use_tns = 0;
+        else
+            use_tns = 1;
+
+        if (use_tns) {
+            e_ratio = en[0]/en[1];
+            if (is8 || order < 2 || (e_ratio > TNS_E_RATIO_LOW && e_ratio < TNS_E_RATIO_HIGH)) {
+                tns->n_filt[w] = 1;
+                for (g = 0; g < tns->n_filt[w]; g++) {
+                    tns->length[w][g] = sfb_end - sfb_start;
+                    tns->direction[w][g] = en[0] < en[1];
+                    tns->order[w][g] = order;
+                    quantize_coefs(coefs, tns->coef_idx[w][g], tns->coef[w][g],
+                                   order, c_bits);
+                }
+            } else {  /* 2 filters due to energy disbalance */
+                tns->n_filt[w] = 2;
+                for (g = 0; g < tns->n_filt[w]; g++) {
+                    tns->direction[w][g] = en[g] < en[!g];
+                    tns->order[w][g] = !g ? order/2 : order - tns->order[w][g-1];
+                    tns->length[w][g] = !g ? (sfb_end - sfb_start)/2 : \
+                                    (sfb_end - sfb_start) - tns->length[w][g-1];
+                    quantize_coefs(&coefs[!g ? 0 : order - tns->order[w][g-1]],
+                                   tns->coef_idx[w][g], tns->coef[w][g],
+                                   tns->order[w][g], c_bits);
+                }
+            }
+            count++;
+        }
+    }
+    sce->tns.present = !!count;
+}
diff --git a/libavcodec/aacenc_tns.h b/libavcodec/aacenc_tns.h
new file mode 100644
index 0000000000..a7c119f368
--- /dev/null
+++ b/libavcodec/aacenc_tns.h
@@ -0,0 +1,55 @@
+/*
+ * AAC encoder TNS
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder temporal noise shaping
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_TNS_H
+#define AVCODEC_AACENC_TNS_H
+
+#include "aacenc.h"
+
+/* Could be set to 3 to save an additional bit at the cost of little quality */
+#define TNS_Q_BITS 4
+
+/* Coefficient resolution in short windows */
+#define TNS_Q_BITS_SHORT 3
+
+/* TNS will only be used if the LPC gain is within these margins */
+#define TNS_GAIN_THRESHOLD_LOW  1.437f
+#define TNS_GAIN_THRESHOLD_HIGH 21.19f
+
+/* If the energy ratio between the low SFBs vs the high SFBs is not between
+ * those two values, use 2 filters instead */
+#define TNS_E_RATIO_LOW  0.77
+#define TNS_E_RATIO_HIGH 1.23
+
+/* Do not use TNS if the psy band spread is below this value */
+#define TNS_SPREAD_THRESHOLD 0.5f
+
+void ff_aac_encode_tns_info(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_apply_tns(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce);
+
+#endif /* AVCODEC_AACENC_TNS_H */
diff --git a/libavcodec/aacenc_utils.h b/libavcodec/aacenc_utils.h
new file mode 100644
index 0000000000..dbc9554379
--- /dev/null
+++ b/libavcodec/aacenc_utils.h
@@ -0,0 +1,149 @@
+/*
+ * AAC encoder utilities
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder utilities
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_UTILS_H
+#define AVCODEC_AACENC_UTILS_H
+
+#include "aac.h"
+#include "aac_tablegen_decl.h"
+#include "aacenctab.h"
+
+#define ROUND_STANDARD 0.4054f
+#define ROUND_TO_ZERO 0.1054f
+#define C_QUANT 0.4054f
+
+static inline void abs_pow34_v(float *out, const float *in, const int size)
+{
+    int i;
+    for (i = 0; i < size; i++) {
+        float a = fabsf(in[i]);
+        out[i] = sqrtf(a * sqrtf(a));
+    }
+}
+
+/**
+ * Quantize one coefficient.
+ * @return absolute value of the quantized coefficient
+ * @see 3GPP TS26.403 5.6.2 "Scalefactor determination"
+ */
+static inline int quant(float coef, const float Q, const float rounding)
+{
+    float a = coef * Q;
+    return sqrtf(a * sqrtf(a)) + rounding;
+}
+
+static inline void quantize_bands(int *out, const float *in, const float *scaled,
+                                  int size, float Q34, int is_signed, int maxval,
+                                  const float rounding)
+{
+    int i;
+    double qc;
+    for (i = 0; i < size; i++) {
+        qc = scaled[i] * Q34;
+        out[i] = (int)FFMIN(qc + rounding, (double)maxval);
+        if (is_signed && in[i] < 0.0f) {
+            out[i] = -out[i];
+        }
+    }
+}
+
+static inline float find_max_val(int group_len, int swb_size, const float *scaled)
+{
+    float maxval = 0.0f;
+    int w2, i;
+    for (w2 = 0; w2 < group_len; w2++) {
+        for (i = 0; i < swb_size; i++) {
+            maxval = FFMAX(maxval, scaled[w2*128+i]);
+        }
+    }
+    return maxval;
+}
+
+static inline int find_min_book(float maxval, int sf)
+{
+    float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
+    float Q34 = sqrtf(Q * sqrtf(Q));
+    int qmaxval, cb;
+    qmaxval = maxval * Q34 + C_QUANT;
+    if (qmaxval >= (FF_ARRAY_ELEMS(aac_maxval_cb)))
+        cb = 11;
+    else
+        cb = aac_maxval_cb[qmaxval];
+    return cb;
+}
+
+/** Return the minimum scalefactor where the quantized coef does not clip. */
+static inline uint8_t coef2minsf(float coef)
+{
+    return av_clip_uint8(log2f(coef)*4 - 69 + SCALE_ONE_POS - SCALE_DIV_512);
+}
+
+/** Return the maximum scalefactor where the quantized coef is not zero. */
+static inline uint8_t coef2maxsf(float coef)
+{
+    return av_clip_uint8(log2f(coef)*4 +  6 + SCALE_ONE_POS - SCALE_DIV_512);
+}
+
+/*
+ * Returns the closest possible index to an array of float values, given a value.
+ */
+static inline int quant_array_idx(const float val, const float *arr, const int num)
+{
+    int i, index = 0;
+    float quant_min_err = INFINITY;
+    for (i = 0; i < num; i++) {
+        float error = (val - arr[i])*(val - arr[i]);
+        if (error < quant_min_err) {
+            quant_min_err = error;
+            index = i;
+        }
+    }
+    return index;
+}
+
+/*
+ * linear congruential pseudorandom number generator, copied from the decoder
+ */
+static inline int lcg_random(unsigned previous_val)
+{
+    union { unsigned u; int s; } v = { previous_val * 1664525u + 1013904223 };
+    return v.s;
+}
+
+#define ERROR_IF(cond, ...) \
+    if (cond) { \
+        av_log(avctx, AV_LOG_ERROR, __VA_ARGS__); \
+        return AVERROR(EINVAL); \
+    }
+
+#define WARN_IF(cond, ...) \
+    if (cond) { \
+        av_log(avctx, AV_LOG_WARNING, __VA_ARGS__); \
+    }
+
+
+#endif /* AVCODEC_AACENC_UTILS_H */
diff --git a/libavcodec/aacenctab.c b/libavcodec/aacenctab.c
new file mode 100644
index 0000000000..f3d70fbe31
--- /dev/null
+++ b/libavcodec/aacenctab.c
@@ -0,0 +1,108 @@
+/*
+ * AAC encoder data
+ * Copyright (c) 2015 Rostislav Pehlivanov ( atomnuker gmail com )
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "aacenctab.h"
+
+static const uint8_t swb_size_128_96[] = {
+    4, 4, 4, 4, 4, 4, 8, 8, 8, 16, 28, 36
+};
+
+static const uint8_t swb_size_128_64[] = {
+    4, 4, 4, 4, 4, 4, 8, 8, 8, 16, 28, 36
+};
+
+static const uint8_t swb_size_128_48[] = {
+    4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16
+};
+
+static const uint8_t swb_size_128_24[] = {
+    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 16, 16, 20
+};
+
+static const uint8_t swb_size_128_16[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 12, 12, 16, 20, 20
+};
+
+static const uint8_t swb_size_128_8[] = {
+    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 12, 16, 20, 20
+};
+
+static const uint8_t swb_size_1024_96[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 12, 16, 16, 24, 28, 36, 44,
+    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static const uint8_t swb_size_1024_64[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8,
+    12, 12, 12, 16, 16, 16, 20, 24, 24, 28, 36,
+    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40
+};
+
+static const uint8_t swb_size_1024_48[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    96
+};
+
+static const uint8_t swb_size_1024_32[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+};
+
+static const uint8_t swb_size_1024_24[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 16, 16, 16, 20, 20, 24, 24, 28, 28,
+    32, 36, 36, 40, 44, 48, 52, 52, 64, 64, 64, 64, 64
+};
+
+static const uint8_t swb_size_1024_16[] = {
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 16, 16, 16, 20, 20, 20, 24, 24, 28, 28,
+    32, 36, 40, 40, 44, 48, 52, 56, 60, 64, 64, 64
+};
+
+static const uint8_t swb_size_1024_8[] = {
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20, 24, 24, 24, 28, 28,
+    32, 36, 36, 40, 44, 48, 52, 56, 60, 64, 80
+};
+
+const uint8_t *ff_aac_swb_size_128[] = {
+    swb_size_128_96, swb_size_128_96, swb_size_128_64,
+    swb_size_128_48, swb_size_128_48, swb_size_128_48,
+    swb_size_128_24, swb_size_128_24, swb_size_128_16,
+    swb_size_128_16, swb_size_128_16, swb_size_128_8,
+    swb_size_128_8
+};
+
+const uint8_t *ff_aac_swb_size_1024[] = {
+    swb_size_1024_96, swb_size_1024_96, swb_size_1024_64,
+    swb_size_1024_48, swb_size_1024_48, swb_size_1024_32,
+    swb_size_1024_24, swb_size_1024_24, swb_size_1024_16,
+    swb_size_1024_16, swb_size_1024_16, swb_size_1024_8,
+    swb_size_1024_8
+};
+
+const int ff_aac_swb_size_128_len  = FF_ARRAY_ELEMS(ff_aac_swb_size_128);
+const int ff_aac_swb_size_1024_len = FF_ARRAY_ELEMS(ff_aac_swb_size_1024);
diff --git a/libavcodec/aacenctab.h b/libavcodec/aacenctab.h
new file mode 100644
index 0000000000..9157cff810
--- /dev/null
+++ b/libavcodec/aacenctab.h
@@ -0,0 +1,117 @@
+/*
+ * AAC encoder data
+ * Copyright (c) 2015 Rostislav Pehlivanov ( atomnuker gmail com )
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder data
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENCTAB_H
+#define AVCODEC_AACENCTAB_H
+
+#include "aac.h"
+
+/** Total number of usable codebooks **/
+#define CB_TOT 12
+
+/** Total number of codebooks, including special ones **/
+#define CB_TOT_ALL 15
+
+#define AAC_MAX_CHANNELS 6
+
+extern const uint8_t *ff_aac_swb_size_1024[];
+extern const int      ff_aac_swb_size_1024_len;
+extern const uint8_t *ff_aac_swb_size_128[];
+extern const int      ff_aac_swb_size_128_len;
+
+/** default channel configurations */
+static const uint8_t aac_chan_configs[6][5] = {
+    {1, TYPE_SCE},                               // 1 channel  - single channel element
+    {1, TYPE_CPE},                               // 2 channels - channel pair
+    {2, TYPE_SCE, TYPE_CPE},                     // 3 channels - center + stereo
+    {3, TYPE_SCE, TYPE_CPE, TYPE_SCE},           // 4 channels - front center + stereo + back center
+    {3, TYPE_SCE, TYPE_CPE, TYPE_CPE},           // 5 channels - front center + stereo + back stereo
+    {4, TYPE_SCE, TYPE_CPE, TYPE_CPE, TYPE_LFE}, // 6 channels - front center + stereo + back stereo + LFE
+};
+
+/**
+ * Table to remap channels from libavcodec's default order to AAC order.
+ */
+static const uint8_t aac_chan_maps[AAC_MAX_CHANNELS][AAC_MAX_CHANNELS] = {
+    { 0 },
+    { 0, 1 },
+    { 2, 0, 1 },
+    { 2, 0, 1, 3 },
+    { 2, 0, 1, 3, 4 },
+    { 2, 0, 1, 4, 5, 3 },
+};
+
+/* duplicated from avpriv_mpeg4audio_sample_rates to avoid shared build
+ * failures */
+static const int mpeg4audio_sample_rates[16] = {
+    96000, 88200, 64000, 48000, 44100, 32000,
+    24000, 22050, 16000, 12000, 11025, 8000, 7350
+};
+
+/** bits needed to code codebook run value for long windows */
+static const uint8_t run_value_bits_long[64] = {
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
+};
+
+/** bits needed to code codebook run value for short windows */
+static const uint8_t run_value_bits_short[16] = {
+    3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
+};
+
+/* TNS starting SFBs for long and short windows */
+static const uint8_t tns_min_sfb_short[16] = {
+    2, 2, 2, 3, 3, 4, 6, 6, 8, 10, 10, 12, 12, 12, 12, 12
+};
+
+static const uint8_t tns_min_sfb_long[16] = {
+    12, 13, 15, 16, 17, 20, 25, 26, 24, 28, 30, 31, 31, 31, 31, 31
+};
+
+static const uint8_t * const tns_min_sfb[2] = {
+    tns_min_sfb_long, tns_min_sfb_short
+};
+
+static const uint8_t * const run_value_bits[2] = {
+    run_value_bits_long, run_value_bits_short
+};
+
+/** Map to convert values from BandCodingPath index to a codebook index **/
+static const uint8_t aac_cb_out_map[CB_TOT_ALL]  = {0,1,2,3,4,5,6,7,8,9,10,11,13,14,15};
+/** Inverse map to convert from codebooks to BandCodingPath indices **/
+static const uint8_t aac_cb_in_map[CB_TOT_ALL+1] = {0,1,2,3,4,5,6,7,8,9,10,11,0,12,13,14};
+
+static const uint8_t aac_cb_range [12] = {0, 3, 3, 3, 3, 9, 9, 8, 8, 13, 13, 17};
+static const uint8_t aac_cb_maxval[12] = {0, 1, 1, 2, 2, 4, 4, 7, 7, 12, 12, 16};
+
+static const unsigned char aac_maxval_cb[] = {
+    0, 1, 3, 5, 5, 7, 7, 7, 9, 9, 9, 9, 9, 11
+};
+
+#endif /* AVCODEC_AACENCTAB_H */
diff --git a/libavcodec/aacps.c b/libavcodec/aacps.c
index df069c3e6d..1165d9be3f 100644
--- a/libavcodec/aacps.c
+++ b/libavcodec/aacps.c
@@ -2,31 +2,38 @@
  * MPEG-4 Parametric Stereo decoding functions
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Note: Rounding-to-nearest used unless otherwise stated
+ *
  */
 
 #include <stdint.h>
 #include "libavutil/common.h"
-#include "libavutil/internal.h"
 #include "libavutil/mathematics.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "aacps.h"
+#if USE_FIXED
+#include "aacps_fixed_tablegen.h"
+#else
+#include "libavutil/internal.h"
 #include "aacps_tablegen.h"
+#endif /* USE_FIXED */
 #include "aacpsdata.c"
 
 #define PS_BASELINE 0  ///< Operate in Baseline PS mode
@@ -148,7 +155,7 @@ static void ipdopd_reset(int8_t *ipd_hist, int8_t *opd_hist)
     }
 }
 
-int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps, int bits_left)
+int AAC_RENAME(ff_ps_read_data)(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps, int bits_left)
 {
     int e;
     int bit_count_start = get_bits_count(gb_host);
@@ -236,6 +243,7 @@ int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps
     if (!ps->num_env || ps->border_position[ps->num_env] < numQMFSlots - 1) {
         //Create a fake envelope
         int source = ps->num_env ? ps->num_env - 1 : ps->num_env_old - 1;
+        int b;
         if (source >= 0 && source != ps->num_env) {
             if (ps->enable_iid) {
                 memcpy(ps->iid_par+ps->num_env, ps->iid_par+source, sizeof(ps->iid_par[0]));
@@ -248,6 +256,22 @@ int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps
                 memcpy(ps->opd_par+ps->num_env, ps->opd_par+source, sizeof(ps->opd_par[0]));
             }
         }
+        if (ps->enable_iid){
+            for (b = 0; b < ps->nr_iid_par; b++) {
+                if (FFABS(ps->iid_par[ps->num_env][b]) > 7 + 8 * ps->iid_quant) {
+                    av_log(avctx, AV_LOG_ERROR, "iid_par invalid\n");
+                    goto err;
+                }
+            }
+        }
+        if (ps->enable_icc){
+            for (b = 0; b < ps->nr_iid_par; b++) {
+                if (ps->icc_par[ps->num_env][b] > 7U) {
+                    av_log(avctx, AV_LOG_ERROR, "icc_par invalid\n");
+                    goto err;
+                }
+            }
+        }
         ps->num_env++;
         ps->border_position[ps->num_env] = numQMFSlots - 1;
     }
@@ -285,35 +309,41 @@ err:
 
 /** Split one subband into 2 subsubbands with a symmetric real filter.
  * The filter must have its non-center even coefficients equal to zero. */
-static void hybrid2_re(float (*in)[2], float (*out)[32][2], const float filter[8], int len, int reverse)
+static void hybrid2_re(INTFLOAT (*in)[2], INTFLOAT (*out)[32][2], const INTFLOAT filter[8], int len, int reverse)
 {
     int i, j;
     for (i = 0; i < len; i++, in++) {
-        float re_in = filter[6] * in[6][0];          //real inphase
-        float re_op = 0.0f;                          //real out of phase
-        float im_in = filter[6] * in[6][1];          //imag inphase
-        float im_op = 0.0f;                          //imag out of phase
+        INT64FLOAT re_in = AAC_MUL31(filter[6], in[6][0]); //real inphase
+        INT64FLOAT re_op = 0.0f;                          //real out of phase
+        INT64FLOAT im_in = AAC_MUL31(filter[6], in[6][1]); //imag inphase
+        INT64FLOAT im_op = 0.0f;                          //imag out of phase
         for (j = 0; j < 6; j += 2) {
-            re_op += filter[j+1] * (in[j+1][0] + in[12-j-1][0]);
-            im_op += filter[j+1] * (in[j+1][1] + in[12-j-1][1]);
+            re_op += (INT64FLOAT)filter[j+1] * (in[j+1][0] + in[12-j-1][0]);
+            im_op += (INT64FLOAT)filter[j+1] * (in[j+1][1] + in[12-j-1][1]);
         }
-        out[ reverse][i][0] = re_in + re_op;
-        out[ reverse][i][1] = im_in + im_op;
-        out[!reverse][i][0] = re_in - re_op;
-        out[!reverse][i][1] = im_in - im_op;
+
+#if USE_FIXED
+        re_op = (re_op + 0x40000000) >> 31;
+        im_op = (im_op + 0x40000000) >> 31;
+#endif /* USE_FIXED */
+
+        out[ reverse][i][0] = (INTFLOAT)(re_in + re_op);
+        out[ reverse][i][1] = (INTFLOAT)(im_in + im_op);
+        out[!reverse][i][0] = (INTFLOAT)(re_in - re_op);
+        out[!reverse][i][1] = (INTFLOAT)(im_in - im_op);
     }
 }
 
 /** Split one subband into 6 subsubbands with a complex filter */
-static void hybrid6_cx(PSDSPContext *dsp, float (*in)[2], float (*out)[32][2],
-                       TABLE_CONST float (*filter)[8][2], int len)
+static void hybrid6_cx(PSDSPContext *dsp, INTFLOAT (*in)[2], INTFLOAT (*out)[32][2],
+                       TABLE_CONST INTFLOAT (*filter)[8][2], int len)
 {
     int i;
     int N = 8;
-    LOCAL_ALIGNED_16(float, temp, [8], [2]);
+    LOCAL_ALIGNED_16(INTFLOAT, temp, [8], [2]);
 
     for (i = 0; i < len; i++, in++) {
-        dsp->hybrid_analysis(temp, in, (const float (*)[8][2]) filter, 1, N);
+        dsp->hybrid_analysis(temp, in, (const INTFLOAT (*)[8][2]) filter, 1, N);
         out[0][i][0] = temp[6][0];
         out[0][i][1] = temp[6][1];
         out[1][i][0] = temp[7][0];
@@ -330,18 +360,18 @@ static void hybrid6_cx(PSDSPContext *dsp, float (*in)[2], float (*out)[32][2],
 }
 
 static void hybrid4_8_12_cx(PSDSPContext *dsp,
-                            float (*in)[2], float (*out)[32][2],
-                            TABLE_CONST float (*filter)[8][2], int N, int len)
+                            INTFLOAT (*in)[2], INTFLOAT (*out)[32][2],
+                            TABLE_CONST INTFLOAT (*filter)[8][2], int N, int len)
 {
     int i;
 
     for (i = 0; i < len; i++, in++) {
-        dsp->hybrid_analysis(out[0] + i, in, (const float (*)[8][2]) filter, 32, N);
+        dsp->hybrid_analysis(out[0] + i, in, (const INTFLOAT (*)[8][2]) filter, 32, N);
     }
 }
 
-static void hybrid_analysis(PSDSPContext *dsp, float out[91][32][2],
-                            float in[5][44][2], float L[2][38][64],
+static void hybrid_analysis(PSDSPContext *dsp, INTFLOAT out[91][32][2],
+                            INTFLOAT in[5][44][2], INTFLOAT L[2][38][64],
                             int is34, int len)
 {
     int i, j;
@@ -370,8 +400,8 @@ static void hybrid_analysis(PSDSPContext *dsp, float out[91][32][2],
     }
 }
 
-static void hybrid_synthesis(PSDSPContext *dsp, float out[2][38][64],
-                             float in[91][32][2], int is34, int len)
+static void hybrid_synthesis(PSDSPContext *dsp, INTFLOAT out[2][38][64],
+                             INTFLOAT in[91][32][2], int is34, int len)
 {
     int i, n;
     if (is34) {
@@ -412,9 +442,10 @@ static void hybrid_synthesis(PSDSPContext *dsp, float out[2][38][64],
 }
 
 /// All-pass filter decay slope
-#define DECAY_SLOPE      0.05f
+#define DECAY_SLOPE      Q30(0.05f)
 /// Number of frequency bands that can be addressed by the parameter index, b(k)
 static const int   NR_PAR_BANDS[]      = { 20, 34 };
+static const int   NR_IPDOPD_BANDS[]   = { 11, 17 };
 /// Number of frequency bands that can be addressed by the sub subband index, k
 static const int   NR_BANDS[]          = { 71, 91 };
 /// Start frequency band for the all-pass filter decay slope
@@ -465,28 +496,43 @@ static void map_idx_34_to_20(int8_t *par_mapped, const int8_t *par, int full)
     }
 }
 
-static void map_val_34_to_20(float par[PS_MAX_NR_IIDICC])
+static void map_val_34_to_20(INTFLOAT par[PS_MAX_NR_IIDICC])
 {
+#if USE_FIXED
+    par[ 0] = (int)(((int64_t)(par[ 0] + (par[ 1]>>1)) * 1431655765 + \
+                      0x40000000) >> 31);
+    par[ 1] = (int)(((int64_t)((par[ 1]>>1) + par[ 2]) * 1431655765 + \
+                      0x40000000) >> 31);
+    par[ 2] = (int)(((int64_t)(par[ 3] + (par[ 4]>>1)) * 1431655765 + \
+                      0x40000000) >> 31);
+    par[ 3] = (int)(((int64_t)((par[ 4]>>1) + par[ 5]) * 1431655765 + \
+                      0x40000000) >> 31);
+#else
     par[ 0] = (2*par[ 0] +   par[ 1]) * 0.33333333f;
     par[ 1] = (  par[ 1] + 2*par[ 2]) * 0.33333333f;
     par[ 2] = (2*par[ 3] +   par[ 4]) * 0.33333333f;
     par[ 3] = (  par[ 4] + 2*par[ 5]) * 0.33333333f;
-    par[ 4] = (  par[ 6] +   par[ 7]) * 0.5f;
-    par[ 5] = (  par[ 8] +   par[ 9]) * 0.5f;
+#endif /* USE_FIXED */
+    par[ 4] = AAC_HALF_SUM(par[ 6], par[ 7]);
+    par[ 5] = AAC_HALF_SUM(par[ 8], par[ 9]);
     par[ 6] =    par[10];
     par[ 7] =    par[11];
-    par[ 8] = (  par[12] +   par[13]) * 0.5f;
-    par[ 9] = (  par[14] +   par[15]) * 0.5f;
+    par[ 8] = AAC_HALF_SUM(par[12], par[13]);
+    par[ 9] = AAC_HALF_SUM(par[14], par[15]);
     par[10] =    par[16];
     par[11] =    par[17];
     par[12] =    par[18];
     par[13] =    par[19];
-    par[14] = (  par[20] +   par[21]) * 0.5f;
-    par[15] = (  par[22] +   par[23]) * 0.5f;
-    par[16] = (  par[24] +   par[25]) * 0.5f;
-    par[17] = (  par[26] +   par[27]) * 0.5f;
+    par[14] = AAC_HALF_SUM(par[20], par[21]);
+    par[15] = AAC_HALF_SUM(par[22], par[23]);
+    par[16] = AAC_HALF_SUM(par[24], par[25]);
+    par[17] = AAC_HALF_SUM(par[26], par[27]);
+#if USE_FIXED
+    par[18] = (((par[28]+2)>>2) + ((par[29]+2)>>2) + ((par[30]+2)>>2) + ((par[31]+2)>>2));
+#else
     par[18] = (  par[28] +   par[29] +   par[30] +   par[31]) * 0.25f;
-    par[19] = (  par[32] +   par[33]) * 0.5f;
+#endif /* USE_FIXED */
+    par[19] = AAC_HALF_SUM(par[32], par[33]);
 }
 
 static void map_idx_10_to_34(int8_t *par_mapped, const int8_t *par, int full)
@@ -571,7 +617,7 @@ static void map_idx_20_to_34(int8_t *par_mapped, const int8_t *par, int full)
     par_mapped[ 0] =  par[ 0];
 }
 
-static void map_val_20_to_34(float par[PS_MAX_NR_IIDICC])
+static void map_val_20_to_34(INTFLOAT par[PS_MAX_NR_IIDICC])
 {
     par[33] =  par[19];
     par[32] =  par[19];
@@ -602,28 +648,29 @@ static void map_val_20_to_34(float par[PS_MAX_NR_IIDICC])
     par[ 7] =  par[ 4];
     par[ 6] =  par[ 4];
     par[ 5] =  par[ 3];
-    par[ 4] = (par[ 2] + par[ 3]) * 0.5f;
+    par[ 4] = AAC_HALF_SUM(par[ 2], par[ 3]);
     par[ 3] =  par[ 2];
     par[ 2] =  par[ 1];
-    par[ 1] = (par[ 0] + par[ 1]) * 0.5f;
-    par[ 0] =  par[ 0];
+    par[ 1] = AAC_HALF_SUM(par[ 0], par[ 1]);
 }
 
-static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[32][2], int is34)
+static void decorrelation(PSContext *ps, INTFLOAT (*out)[32][2], const INTFLOAT (*s)[32][2], int is34)
 {
-    LOCAL_ALIGNED_16(float, power, [34], [PS_QMF_TIME_SLOTS]);
-    LOCAL_ALIGNED_16(float, transient_gain, [34], [PS_QMF_TIME_SLOTS]);
-    float *peak_decay_nrg = ps->peak_decay_nrg;
-    float *power_smooth = ps->power_smooth;
-    float *peak_decay_diff_smooth = ps->peak_decay_diff_smooth;
-    float (*delay)[PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2] = ps->delay;
-    float (*ap_delay)[PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2] = ps->ap_delay;
-    const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
-    const float peak_decay_factor = 0.76592833836465f;
+    LOCAL_ALIGNED_16(INTFLOAT, power, [34], [PS_QMF_TIME_SLOTS]);
+    LOCAL_ALIGNED_16(INTFLOAT, transient_gain, [34], [PS_QMF_TIME_SLOTS]);
+    INTFLOAT *peak_decay_nrg = ps->peak_decay_nrg;
+    INTFLOAT *power_smooth = ps->power_smooth;
+    INTFLOAT *peak_decay_diff_smooth = ps->peak_decay_diff_smooth;
+    INTFLOAT (*delay)[PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2] = ps->delay;
+    INTFLOAT (*ap_delay)[PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2] = ps->ap_delay;
+#if !USE_FIXED
     const float transient_impact  = 1.5f;
     const float a_smooth          = 0.25f; ///< Smoothing coefficient
+#endif /* USE_FIXED */
+    const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
     int i, k, m, n;
     int n0 = 0, nL = 32;
+    const INTFLOAT peak_decay_factor = Q31(0.76592833836465f);
 
     memset(power, 0, 34 * sizeof(*power));
 
@@ -641,6 +688,33 @@ static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[3
     }
 
     //Transient detection
+#if USE_FIXED
+    for (i = 0; i < NR_PAR_BANDS[is34]; i++) {
+        for (n = n0; n < nL; n++) {
+            int decayed_peak;
+            int denom;
+
+            decayed_peak = (int)(((int64_t)peak_decay_factor * \
+                                           peak_decay_nrg[i] + 0x40000000) >> 31);
+            peak_decay_nrg[i] = FFMAX(decayed_peak, power[i][n]);
+            power_smooth[i] += (power[i][n] - power_smooth[i] + 2) >> 2;
+            peak_decay_diff_smooth[i] += (peak_decay_nrg[i] - power[i][n] - \
+                                          peak_decay_diff_smooth[i] + 2) >> 2;
+            denom = peak_decay_diff_smooth[i] + (peak_decay_diff_smooth[i] >> 1);
+            if (denom > power_smooth[i]) {
+              int p = power_smooth[i];
+              while (denom < 0x40000000) {
+                denom <<= 1;
+                p <<= 1;
+              }
+              transient_gain[i][n] = p / (denom >> 16);
+            }
+            else {
+              transient_gain[i][n] = 1 << 16;
+            }
+        }
+    }
+#else
     for (i = 0; i < NR_PAR_BANDS[is34]; i++) {
         for (n = n0; n < nL; n++) {
             float decayed_peak = peak_decay_factor * peak_decay_nrg[i];
@@ -654,6 +728,7 @@ static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[3
         }
     }
 
+#endif /* USE_FIXED */
     //Decorrelation and transient reduction
     //                         PS_AP_LINKS - 1
     //                               -----
@@ -664,8 +739,22 @@ static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[3
     //d[k][z] (out) = transient_gain_mapped[k][z] * H[k][z] * s[k][z]
     for (k = 0; k < NR_ALLPASS_BANDS[is34]; k++) {
         int b = k_to_i[k];
+#if USE_FIXED
+        int g_decay_slope;
+
+        if (k - DECAY_CUTOFF[is34] <= 0) {
+          g_decay_slope = 1 << 30;
+        }
+        else if (k - DECAY_CUTOFF[is34] >= 20) {
+          g_decay_slope = 0;
+        }
+        else {
+          g_decay_slope = (1 << 30) - DECAY_SLOPE * (k - DECAY_CUTOFF[is34]);
+        }
+#else
         float g_decay_slope = 1.f - DECAY_SLOPE * (k - DECAY_CUTOFF[is34]);
         g_decay_slope = av_clipf(g_decay_slope, 0.f, 1.f);
+#endif /* USE_FIXED */
         memcpy(delay[k], delay[k]+nL, PS_MAX_DELAY*sizeof(delay[k][0]));
         memcpy(delay[k]+PS_MAX_DELAY, s[k], numQMFSlots*sizeof(delay[k][0]));
         for (m = 0; m < PS_AP_LINKS; m++) {
@@ -673,7 +762,7 @@ static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[3
         }
         ps->dsp.decorrelate(out[k], delay[k] + PS_MAX_DELAY - 2, ap_delay[k],
                             phi_fract[is34][k],
-                            (const float (*)[2]) Q_fract_allpass[is34][k],
+                            (const INTFLOAT (*)[2]) Q_fract_allpass[is34][k],
                             transient_gain[b], g_decay_slope, nL - n0);
     }
     for (; k < SHORT_DELAY_BAND[is34]; k++) {
@@ -732,14 +821,14 @@ static void remap20(int8_t (**p_par_mapped)[PS_MAX_NR_IIDICC],
     }
 }
 
-static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2], int is34)
+static void stereo_processing(PSContext *ps, INTFLOAT (*l)[32][2], INTFLOAT (*r)[32][2], int is34)
 {
     int e, b, k;
 
-    float (*H11)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H11;
-    float (*H12)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H12;
-    float (*H21)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H21;
-    float (*H22)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H22;
+    INTFLOAT (*H11)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H11;
+    INTFLOAT (*H12)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H12;
+    INTFLOAT (*H21)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H21;
+    INTFLOAT (*H22)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H22;
     int8_t *opd_hist = ps->opd_hist;
     int8_t *ipd_hist = ps->ipd_hist;
     int8_t iid_mapped_buf[PS_MAX_NUM_ENV][PS_MAX_NR_IIDICC];
@@ -751,7 +840,7 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
     int8_t (*ipd_mapped)[PS_MAX_NR_IIDICC] = ipd_mapped_buf;
     int8_t (*opd_mapped)[PS_MAX_NR_IIDICC] = opd_mapped_buf;
     const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
-    TABLE_CONST float (*H_LUT)[8][4] = (PS_BASELINE || ps->icc_mode < 3) ? HA : HB;
+    TABLE_CONST INTFLOAT (*H_LUT)[8][4] = (PS_BASELINE || ps->icc_mode < 3) ? HA : HB;
 
     //Remapping
     if (ps->num_env_old) {
@@ -806,35 +895,36 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
     //Mixing
     for (e = 0; e < ps->num_env; e++) {
         for (b = 0; b < NR_PAR_BANDS[is34]; b++) {
-            float h11, h12, h21, h22;
+            INTFLOAT h11, h12, h21, h22;
             h11 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][0];
             h12 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][1];
             h21 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][2];
             h22 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][3];
-            if (!PS_BASELINE && ps->enable_ipdopd && b < ps->nr_ipdopd_par) {
+
+            if (!PS_BASELINE && ps->enable_ipdopd && b < NR_IPDOPD_BANDS[is34]) {
                 //The spec say says to only run this smoother when enable_ipdopd
                 //is set but the reference decoder appears to run it constantly
-                float h11i, h12i, h21i, h22i;
-                float ipd_adj_re, ipd_adj_im;
+                INTFLOAT h11i, h12i, h21i, h22i;
+                INTFLOAT ipd_adj_re, ipd_adj_im;
                 int opd_idx = opd_hist[b] * 8 + opd_mapped[e][b];
                 int ipd_idx = ipd_hist[b] * 8 + ipd_mapped[e][b];
-                float opd_re = pd_re_smooth[opd_idx];
-                float opd_im = pd_im_smooth[opd_idx];
-                float ipd_re = pd_re_smooth[ipd_idx];
-                float ipd_im = pd_im_smooth[ipd_idx];
+                INTFLOAT opd_re = pd_re_smooth[opd_idx];
+                INTFLOAT opd_im = pd_im_smooth[opd_idx];
+                INTFLOAT ipd_re = pd_re_smooth[ipd_idx];
+                INTFLOAT ipd_im = pd_im_smooth[ipd_idx];
                 opd_hist[b] = opd_idx & 0x3F;
                 ipd_hist[b] = ipd_idx & 0x3F;
 
-                ipd_adj_re = opd_re*ipd_re + opd_im*ipd_im;
-                ipd_adj_im = opd_im*ipd_re - opd_re*ipd_im;
-                h11i = h11 * opd_im;
-                h11  = h11 * opd_re;
-                h12i = h12 * ipd_adj_im;
-                h12  = h12 * ipd_adj_re;
-                h21i = h21 * opd_im;
-                h21  = h21 * opd_re;
-                h22i = h22 * ipd_adj_im;
-                h22  = h22 * ipd_adj_re;
+                ipd_adj_re = AAC_MADD30(opd_re, ipd_re, opd_im, ipd_im);
+                ipd_adj_im = AAC_MSUB30(opd_im, ipd_re, opd_re, ipd_im);
+                h11i = AAC_MUL30(h11,  opd_im);
+                h11  = AAC_MUL30(h11,  opd_re);
+                h12i = AAC_MUL30(h12,  ipd_adj_im);
+                h12  = AAC_MUL30(h12,  ipd_adj_re);
+                h21i = AAC_MUL30(h21,  opd_im);
+                h21  = AAC_MUL30(h21,  opd_re);
+                h22i = AAC_MUL30(h22,  ipd_adj_im);
+                h22  = AAC_MUL30(h22,  ipd_adj_re);
                 H11[1][e+1][b] = h11i;
                 H12[1][e+1][b] = h12i;
                 H21[1][e+1][b] = h21i;
@@ -846,11 +936,14 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
             H22[0][e+1][b] = h22;
         }
         for (k = 0; k < NR_BANDS[is34]; k++) {
-            float h[2][4];
-            float h_step[2][4];
+            LOCAL_ALIGNED_16(INTFLOAT, h, [2], [4]);
+            LOCAL_ALIGNED_16(INTFLOAT, h_step, [2], [4]);
             int start = ps->border_position[e];
             int stop  = ps->border_position[e+1];
-            float width = 1.f / (stop - start);
+            INTFLOAT width = Q30(1.f) / (stop - start);
+#if USE_FIXED
+            width <<= 1;
+#endif
             b = k_to_i[k];
             h[0][0] = H11[0][e][b];
             h[0][1] = H12[0][e][b];
@@ -871,15 +964,15 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
             }
             }
             //Interpolation
-            h_step[0][0] = (H11[0][e+1][b] - h[0][0]) * width;
-            h_step[0][1] = (H12[0][e+1][b] - h[0][1]) * width;
-            h_step[0][2] = (H21[0][e+1][b] - h[0][2]) * width;
-            h_step[0][3] = (H22[0][e+1][b] - h[0][3]) * width;
+            h_step[0][0] = AAC_MSUB31_V3(H11[0][e+1][b], h[0][0], width);
+            h_step[0][1] = AAC_MSUB31_V3(H12[0][e+1][b], h[0][1], width);
+            h_step[0][2] = AAC_MSUB31_V3(H21[0][e+1][b], h[0][2], width);
+            h_step[0][3] = AAC_MSUB31_V3(H22[0][e+1][b], h[0][3], width);
             if (!PS_BASELINE && ps->enable_ipdopd) {
-                h_step[1][0] = (H11[1][e+1][b] - h[1][0]) * width;
-                h_step[1][1] = (H12[1][e+1][b] - h[1][1]) * width;
-                h_step[1][2] = (H21[1][e+1][b] - h[1][2]) * width;
-                h_step[1][3] = (H22[1][e+1][b] - h[1][3]) * width;
+                h_step[1][0] = AAC_MSUB31_V3(H11[1][e+1][b], h[1][0], width);
+                h_step[1][1] = AAC_MSUB31_V3(H12[1][e+1][b], h[1][1], width);
+                h_step[1][2] = AAC_MSUB31_V3(H21[1][e+1][b], h[1][2], width);
+                h_step[1][3] = AAC_MSUB31_V3(H22[1][e+1][b], h[1][3], width);
             }
             ps->dsp.stereo_interpolate[!PS_BASELINE && ps->enable_ipdopd](
                 l[k] + start + 1, r[k] + start + 1,
@@ -888,10 +981,10 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
     }
 }
 
-int ff_ps_apply(AVCodecContext *avctx, PSContext *ps, float L[2][38][64], float R[2][38][64], int top)
+int AAC_RENAME(ff_ps_apply)(AVCodecContext *avctx, PSContext *ps, INTFLOAT L[2][38][64], INTFLOAT R[2][38][64], int top)
 {
-    LOCAL_ALIGNED_16(float, Lbuf, [91], [32][2]);
-    LOCAL_ALIGNED_16(float, Rbuf, [91], [32][2]);
+    INTFLOAT (*Lbuf)[32][2] = ps->Lbuf;
+    INTFLOAT (*Rbuf)[32][2] = ps->Rbuf;
     const int len = 32;
     int is34 = ps->is34bands;
 
@@ -901,7 +994,7 @@ int ff_ps_apply(AVCodecContext *avctx, PSContext *ps, float L[2][38][64], float
         memset(ps->ap_delay + top, 0, (NR_ALLPASS_BANDS[is34] - top)*sizeof(ps->ap_delay[0]));
 
     hybrid_analysis(&ps->dsp, Lbuf, ps->in_buf, L, is34, len);
-    decorrelation(ps, Rbuf, (const float (*)[32][2]) Lbuf, is34);
+    decorrelation(ps, Rbuf, (const INTFLOAT (*)[32][2]) Lbuf, is34);
     stereo_processing(ps, Lbuf, Rbuf, is34);
     hybrid_synthesis(&ps->dsp, L, Lbuf, is34, len);
     hybrid_synthesis(&ps->dsp, R, Rbuf, is34, len);
@@ -918,7 +1011,7 @@ int ff_ps_apply(AVCodecContext *avctx, PSContext *ps, float L[2][38][64], float
 #define PS_VLC_ROW(name) \
     { name ## _codes, name ## _bits, sizeof(name ## _codes), sizeof(name ## _codes[0]) }
 
-av_cold void ff_ps_init(void) {
+av_cold void AAC_RENAME(ff_ps_init)(void) {
     // Syntax initialization
     static const struct {
         const void *ps_codes, *ps_bits;
@@ -950,7 +1043,7 @@ av_cold void ff_ps_init(void) {
     ps_tableinit();
 }
 
-av_cold void ff_ps_ctx_init(PSContext *ps)
+av_cold void AAC_RENAME(ff_ps_ctx_init)(PSContext *ps)
 {
-    ff_psdsp_init(&ps->dsp);
+    AAC_RENAME(ff_psdsp_init)(&ps->dsp);
 }
diff --git a/libavcodec/aacps.h b/libavcodec/aacps.h
index e8a195ab2b..54f9d99177 100644
--- a/libavcodec/aacps.h
+++ b/libavcodec/aacps.h
@@ -2,20 +2,20 @@
  * MPEG-4 Parametric Stereo definitions and declarations
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -61,24 +61,26 @@ typedef struct PSContext {
     int    is34bands;
     int    is34bands_old;
 
-    DECLARE_ALIGNED(16, float, in_buf)[5][44][2];
-    DECLARE_ALIGNED(16, float, delay)[PS_MAX_SSB][PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2];
-    DECLARE_ALIGNED(16, float, ap_delay)[PS_MAX_AP_BANDS][PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2];
-    DECLARE_ALIGNED(16, float, peak_decay_nrg)[34];
-    DECLARE_ALIGNED(16, float, power_smooth)[34];
-    DECLARE_ALIGNED(16, float, peak_decay_diff_smooth)[34];
-    DECLARE_ALIGNED(16, float, H11)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
-    DECLARE_ALIGNED(16, float, H12)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
-    DECLARE_ALIGNED(16, float, H21)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
-    DECLARE_ALIGNED(16, float, H22)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, in_buf)[5][44][2];
+    DECLARE_ALIGNED(16, INTFLOAT, delay)[PS_MAX_SSB][PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2];
+    DECLARE_ALIGNED(16, INTFLOAT, ap_delay)[PS_MAX_AP_BANDS][PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2];
+    DECLARE_ALIGNED(16, INTFLOAT, peak_decay_nrg)[34];
+    DECLARE_ALIGNED(16, INTFLOAT, power_smooth)[34];
+    DECLARE_ALIGNED(16, INTFLOAT, peak_decay_diff_smooth)[34];
+    DECLARE_ALIGNED(16, INTFLOAT, H11)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, H12)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, H21)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, H22)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, Lbuf)[91][32][2];
+    DECLARE_ALIGNED(16, INTFLOAT, Rbuf)[91][32][2];
     int8_t opd_hist[PS_MAX_NR_IIDICC];
     int8_t ipd_hist[PS_MAX_NR_IIDICC];
     PSDSPContext dsp;
 } PSContext;
 
-void ff_ps_init(void);
-void ff_ps_ctx_init(PSContext *ps);
-int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb, PSContext *ps, int bits_left);
-int ff_ps_apply(AVCodecContext *avctx, PSContext *ps, float L[2][38][64], float R[2][38][64], int top);
+void AAC_RENAME(ff_ps_init)(void);
+void AAC_RENAME(ff_ps_ctx_init)(PSContext *ps);
+int AAC_RENAME(ff_ps_read_data)(AVCodecContext *avctx, GetBitContext *gb, PSContext *ps, int bits_left);
+int AAC_RENAME(ff_ps_apply)(AVCodecContext *avctx, PSContext *ps, INTFLOAT L[2][38][64], INTFLOAT R[2][38][64], int top);
 
 #endif /* AVCODEC_PS_H */
diff --git a/libavcodec/x86/audiodsp.h b/libavcodec/aacps_fixed.c
index 321056b8b7..46af21339a 100644
--- a/libavcodec/x86/audiodsp.h
+++ b/libavcodec/aacps_fixed.c
@@ -1,25 +1,24 @@
 /*
- * This file is part of Libav.
+ * MPEG-4 Parametric Stereo decoding functions
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_X86_AUDIODSP_H
-#define AVCODEC_X86_AUDIODSP_H
-
-void ff_vector_clipf_sse(float *dst, const float *src,
-                         float min, float max, int len);
+#define USE_FIXED 1
 
-#endif /* AVCODEC_X86_AUDIODSP_H */
+#include "aacps.c"
diff --git a/libavcodec/aacps_fixed_tablegen.c b/libavcodec/aacps_fixed_tablegen.c
new file mode 100644
index 0000000000..9e306991f0
--- /dev/null
+++ b/libavcodec/aacps_fixed_tablegen.c
@@ -0,0 +1,24 @@
+/*
+ * Generate a header file for hardcoded Parametric Stereo tables
+ *
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+#include "aacps_tablegen_template.c"
diff --git a/libavcodec/aacps_fixed_tablegen.h b/libavcodec/aacps_fixed_tablegen.h
new file mode 100644
index 0000000000..01f2eced64
--- /dev/null
+++ b/libavcodec/aacps_fixed_tablegen.h
@@ -0,0 +1,403 @@
+/*
+ * Header file for hardcoded Parametric Stereo tables
+ *
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Note: Rounding-to-nearest used unless otherwise stated
+ *
+ */
+
+#ifndef AACPS_FIXED_TABLEGEN_H
+#define AACPS_FIXED_TABLEGEN_H
+
+#include <math.h>
+#include <stdint.h>
+
+#if CONFIG_HARDCODED_TABLES
+#define ps_tableinit()
+#define TABLE_CONST const
+#include "libavcodec/aacps_fixed_tables.h"
+#else
+#include "libavutil/common.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/mem.h"
+
+#include "aac_defines.h"
+#include "libavutil/softfloat.h"
+#define NR_ALLPASS_BANDS20 30
+#define NR_ALLPASS_BANDS34 50
+#define PS_AP_LINKS 3
+#define TABLE_CONST
+static int pd_re_smooth[8*8*8];
+static int pd_im_smooth[8*8*8];
+static int HA[46][8][4];
+static int HB[46][8][4];
+static DECLARE_ALIGNED(16, int, f20_0_8) [ 8][8][2];
+static DECLARE_ALIGNED(16, int, f34_0_12)[12][8][2];
+static DECLARE_ALIGNED(16, int, f34_1_8) [ 8][8][2];
+static DECLARE_ALIGNED(16, int, f34_2_4) [ 4][8][2];
+static TABLE_CONST DECLARE_ALIGNED(16, int, Q_fract_allpass)[2][50][3][2];
+static DECLARE_ALIGNED(16, int, phi_fract)[2][50][2];
+
+static const int g0_Q8[] = {
+    Q31(0.00746082949812f), Q31(0.02270420949825f), Q31(0.04546865930473f), Q31(0.07266113929591f),
+    Q31(0.09885108575264f), Q31(0.11793710567217f), Q31(0.125f)
+};
+
+static const int g0_Q12[] = {
+    Q31(0.04081179924692f), Q31(0.03812810994926f), Q31(0.05144908135699f), Q31(0.06399831151592f),
+    Q31(0.07428313801106f), Q31(0.08100347892914f), Q31(0.08333333333333f)
+};
+
+static const int g1_Q8[] = {
+    Q31(0.01565675600122f), Q31(0.03752716391991f), Q31(0.05417891378782f), Q31(0.08417044116767f),
+    Q31(0.10307344158036f), Q31(0.12222452249753f), Q31(0.125f)
+};
+
+static const int g2_Q4[] = {
+    Q31(-0.05908211155639f), Q31(-0.04871498374946f), Q31(0.0f),   Q31(0.07778723915851f),
+    Q31( 0.16486303567403f), Q31( 0.23279856662996f), Q31(0.25f)
+};
+
+static const int sintbl_4[4]   = {           0,  1073741824,           0, -1073741824 };
+static const int costbl_4[4]   = {  1073741824,           0, -1073741824,           0 };
+static const int sintbl_8[8]   = {           0,   759250125,  1073741824,   759250125,
+                                             0,  -759250125, -1073741824,  -759250125 };
+static const int costbl_8[8]   = {  1073741824,   759250125,           0,  -759250125,
+                                   -1073741824,  -759250125,           0,   759250125 };
+static const int sintbl_12[12] = {           0,   536870912,   929887697,  1073741824,
+                                     929887697,   536870912,           0,  -536870912,
+                                    -929887697, -1073741824,  -929887697,  -536870912 };
+static const int costbl_12[12] = {  1073741824,   929887697,   536870912,           0,
+                                    -536870912,  -929887697, -1073741824,  -929887697,
+                                    -536870912,           0,   536870912,   929887697 };
+
+static void make_filters_from_proto(int (*filter)[8][2], const int *proto, int bands)
+{
+
+    const int *sinptr, *cosptr;
+    int s, c, sinhalf, coshalf;
+    int q, n;
+
+    if (bands == 4) {
+        sinptr = sintbl_4;
+        cosptr = costbl_4;
+        sinhalf = 759250125;
+        coshalf = 759250125;
+    } else if (bands == 8) {
+        sinptr = sintbl_8;
+        cosptr = costbl_8;
+        sinhalf = 410903207;
+        coshalf = 992008094;
+    } else {
+        sinptr = sintbl_12;
+        cosptr = costbl_12;
+        sinhalf = 277904834;
+        coshalf = 1037154959;
+    }
+
+    for (q = 0; q < bands; q++) {
+        for (n = 0; n < 7; n++) {
+            int theta = (q*(n-6) + (n>>1) - 3) % bands;
+
+            if (theta < 0)
+                theta += bands;
+            s = sinptr[theta];
+            c = cosptr[theta];
+
+            if (n & 1) {
+                theta = (int)(((int64_t)c * coshalf - (int64_t)s * sinhalf + 0x20000000) >> 30);
+                s = (int)(((int64_t)s * coshalf + (int64_t)c * sinhalf + 0x20000000) >> 30);
+                c = theta;
+            }
+            filter[q][n][0] = (int)(((int64_t)proto[n] * c + 0x20000000) >> 30);
+            filter[q][n][1] = -(int)(((int64_t)proto[n] * s + 0x20000000) >> 30);
+        }
+    }
+}
+
+static void ps_tableinit(void)
+{
+    static const int ipdopd_sin[] = { Q30(0), Q30(M_SQRT1_2), Q30(1), Q30( M_SQRT1_2), Q30( 0), Q30(-M_SQRT1_2), Q30(-1), Q30(-M_SQRT1_2) };
+    static const int ipdopd_cos[] = { Q30(1), Q30(M_SQRT1_2), Q30(0), Q30(-M_SQRT1_2), Q30(-1), Q30(-M_SQRT1_2), Q30( 0), Q30( M_SQRT1_2) };
+    int pd0, pd1, pd2;
+    int idx;
+
+    static const int alpha_tab[] =
+    {
+      Q30(1.5146213770f/M_PI), Q30(1.5181334019f/M_PI), Q30(1.5234849453f/M_PI), Q30(1.5369486809f/M_PI), Q30(1.5500687361f/M_PI), Q30(1.5679757595f/M_PI),
+      Q30(1.4455626011f/M_PI), Q30(1.4531552792f/M_PI), Q30(1.4648091793f/M_PI), Q30(1.4945238829f/M_PI), Q30(1.5239057541f/M_PI), Q30(1.5644006729f/M_PI),
+      Q30(1.3738563061f/M_PI), Q30(1.3851221800f/M_PI), Q30(1.4026404619f/M_PI), Q30(1.4484288692f/M_PI), Q30(1.4949874878f/M_PI), Q30(1.5604078770f/M_PI),
+      Q30(1.2645189762f/M_PI), Q30(1.2796478271f/M_PI), Q30(1.3038636446f/M_PI), Q30(1.3710125685f/M_PI), Q30(1.4443849325f/M_PI), Q30(1.5532352924f/M_PI),
+      Q30(1.1507037878f/M_PI), Q30(1.1669205427f/M_PI), Q30(1.1938756704f/M_PI), Q30(1.2754167318f/M_PI), Q30(1.3761177063f/M_PI), Q30(1.5429240465f/M_PI),
+      Q30(1.0079245567f/M_PI), Q30(1.0208238363f/M_PI), Q30(1.0433073044f/M_PI), Q30(1.1208510399f/M_PI), Q30(1.2424604893f/M_PI), Q30(1.5185726881f/M_PI),
+      Q30(0.8995233774f/M_PI), Q30(0.9069069624f/M_PI), Q30(0.9201194048f/M_PI), Q30(0.9698365927f/M_PI), Q30(1.0671583414f/M_PI), Q30(1.4647934437f/M_PI),
+      Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI),
+      Q30(0.6712729335f/M_PI), Q30(0.6638893485f/M_PI), Q30(0.6506769061f/M_PI), Q30(0.6009597182f/M_PI), Q30(0.5036380291f/M_PI), Q30(0.1060028747f/M_PI),
+      Q30(0.5628717542f/M_PI), Q30(0.5499725342f/M_PI), Q30(0.5274890065f/M_PI), Q30(0.4499453008f/M_PI), Q30(0.3283358216f/M_PI), Q30(0.0522236861f/M_PI),
+      Q30(0.4200925827f/M_PI), Q30(0.4038758278f/M_PI), Q30(0.3769206405f/M_PI), Q30(0.2953795493f/M_PI), Q30(0.1946786791f/M_PI), Q30(0.0278722942f/M_PI),
+      Q30(0.3062773645f/M_PI), Q30(0.2911485136f/M_PI), Q30(0.2669326365f/M_PI), Q30(0.1997837722f/M_PI), Q30(0.1264114529f/M_PI), Q30(0.0175609849f/M_PI),
+      Q30(0.1969399750f/M_PI), Q30(0.1856741160f/M_PI), Q30(0.1681558639f/M_PI), Q30(0.1223674342f/M_PI), Q30(0.0758088827f/M_PI), Q30(0.0103884479f/M_PI),
+      Q30(0.1252337098f/M_PI), Q30(0.1176410317f/M_PI), Q30(0.1059871912f/M_PI), Q30(0.0762724727f/M_PI), Q30(0.0468905345f/M_PI), Q30(0.0063956482f/M_PI),
+      Q30(0.0561749674f/M_PI), Q30(0.0526629239f/M_PI), Q30(0.0473113805f/M_PI), Q30(0.0338476151f/M_PI), Q30(0.0207276177f/M_PI), Q30(0.0028205961f/M_PI),
+      Q30(1.5676341057f/M_PI), Q30(1.5678333044f/M_PI), Q30(1.5681363344f/M_PI), Q30(1.5688960552f/M_PI), Q30(1.5696337223f/M_PI), Q30(1.5706381798f/M_PI),
+      Q30(1.5651730299f/M_PI), Q30(1.5655272007f/M_PI), Q30(1.5660660267f/M_PI), Q30(1.5674170256f/M_PI), Q30(1.5687289238f/M_PI), Q30(1.5705151558f/M_PI),
+      Q30(1.5607966185f/M_PI), Q30(1.5614265203f/M_PI), Q30(1.5623844862f/M_PI), Q30(1.5647867918f/M_PI), Q30(1.5671195984f/M_PI), Q30(1.5702962875f/M_PI),
+      Q30(1.5530153513f/M_PI), Q30(1.5541347265f/M_PI), Q30(1.5558375120f/M_PI), Q30(1.5601085424f/M_PI), Q30(1.5642569065f/M_PI), Q30(1.5699069500f/M_PI),
+      Q30(1.5391840935f/M_PI), Q30(1.5411708355f/M_PI), Q30(1.5441943407f/M_PI), Q30(1.5517836809f/M_PI), Q30(1.5591609478f/M_PI), Q30(1.5692136288f/M_PI),
+      Q30(1.5146213770f/M_PI), Q30(1.5181334019f/M_PI), Q30(1.5234849453f/M_PI), Q30(1.5369486809f/M_PI), Q30(1.5500687361f/M_PI), Q30(1.5679757595f/M_PI),
+      Q30(1.4915299416f/M_PI), Q30(1.4964480400f/M_PI), Q30(1.5039558411f/M_PI), Q30(1.5229074955f/M_PI), Q30(1.5414420366f/M_PI), Q30(1.5667995214f/M_PI),
+      Q30(1.4590617418f/M_PI), Q30(1.4658898115f/M_PI), Q30(1.4763505459f/M_PI), Q30(1.5029321909f/M_PI), Q30(1.5291173458f/M_PI), Q30(1.5651149750f/M_PI),
+      Q30(1.4136143923f/M_PI), Q30(1.4229322672f/M_PI), Q30(1.4373078346f/M_PI), Q30(1.4743183851f/M_PI), Q30(1.5113102198f/M_PI), Q30(1.5626684427f/M_PI),
+      Q30(1.3505556583f/M_PI), Q30(1.3628427982f/M_PI), Q30(1.3820509911f/M_PI), Q30(1.4327841997f/M_PI), Q30(1.4850014448f/M_PI), Q30(1.5590143204f/M_PI),
+      Q30(1.2645189762f/M_PI), Q30(1.2796478271f/M_PI), Q30(1.3038636446f/M_PI), Q30(1.3710125685f/M_PI), Q30(1.4443849325f/M_PI), Q30(1.5532352924f/M_PI),
+      Q30(1.1919227839f/M_PI), Q30(1.2081253529f/M_PI), Q30(1.2346779108f/M_PI), Q30(1.3123005629f/M_PI), Q30(1.4034168720f/M_PI), Q30(1.5471596718f/M_PI),
+      Q30(1.1061993837f/M_PI), Q30(1.1219338179f/M_PI), Q30(1.1484941244f/M_PI), Q30(1.2320860624f/M_PI), Q30(1.3421301842f/M_PI), Q30(1.5373806953f/M_PI),
+      Q30(1.0079245567f/M_PI), Q30(1.0208238363f/M_PI), Q30(1.0433073044f/M_PI), Q30(1.1208510399f/M_PI), Q30(1.2424604893f/M_PI), Q30(1.5185726881f/M_PI),
+      Q30(0.8995233774f/M_PI), Q30(0.9069069624f/M_PI), Q30(0.9201194048f/M_PI), Q30(0.9698365927f/M_PI), Q30(1.0671583414f/M_PI), Q30(1.4647934437f/M_PI),
+      Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI),
+      Q30(0.6712729335f/M_PI), Q30(0.6638893485f/M_PI), Q30(0.6506769061f/M_PI), Q30(0.6009597182f/M_PI), Q30(0.5036380291f/M_PI), Q30(0.1060028747f/M_PI),
+      Q30(0.5628717542f/M_PI), Q30(0.5499725342f/M_PI), Q30(0.5274890065f/M_PI), Q30(0.4499453008f/M_PI), Q30(0.3283358216f/M_PI), Q30(0.0522236861f/M_PI),
+      Q30(0.4645969570f/M_PI), Q30(0.4488625824f/M_PI), Q30(0.4223022461f/M_PI), Q30(0.3387103081f/M_PI), Q30(0.2286661267f/M_PI), Q30(0.0334156826f/M_PI),
+      Q30(0.3788735867f/M_PI), Q30(0.3626709878f/M_PI), Q30(0.3361184299f/M_PI), Q30(0.2584958076f/M_PI), Q30(0.1673794836f/M_PI), Q30(0.0236366931f/M_PI),
+      Q30(0.3062773645f/M_PI), Q30(0.2911485136f/M_PI), Q30(0.2669326365f/M_PI), Q30(0.1997837722f/M_PI), Q30(0.1264114529f/M_PI), Q30(0.0175609849f/M_PI),
+      Q30(0.2202406377f/M_PI), Q30(0.2079535723f/M_PI), Q30(0.1887452900f/M_PI), Q30(0.1380121708f/M_PI), Q30(0.0857949182f/M_PI), Q30(0.0117820343f/M_PI),
+      Q30(0.1571819335f/M_PI), Q30(0.1478640437f/M_PI), Q30(0.1334884763f/M_PI), Q30(0.0964778885f/M_PI), Q30(0.0594860613f/M_PI), Q30(0.0081279324f/M_PI),
+      Q30(0.1117345318f/M_PI), Q30(0.1049065739f/M_PI), Q30(0.0944457650f/M_PI), Q30(0.0678641573f/M_PI), Q30(0.0416790098f/M_PI), Q30(0.0056813755f/M_PI),
+      Q30(0.0792663917f/M_PI), Q30(0.0743482932f/M_PI), Q30(0.0668405443f/M_PI), Q30(0.0478888862f/M_PI), Q30(0.0293543357f/M_PI), Q30(0.0039967746f/M_PI),
+      Q30(0.0561749674f/M_PI), Q30(0.0526629239f/M_PI), Q30(0.0473113805f/M_PI), Q30(0.0338476151f/M_PI), Q30(0.0207276177f/M_PI), Q30(0.0028205961f/M_PI),
+      Q30(0.0316122435f/M_PI), Q30(0.0296254847f/M_PI), Q30(0.0266019460f/M_PI), Q30(0.0190126132f/M_PI), Q30(0.0116353342f/M_PI), Q30(0.0015827164f/M_PI),
+      Q30(0.0177809205f/M_PI), Q30(0.0166615788f/M_PI), Q30(0.0149587989f/M_PI), Q30(0.0106877899f/M_PI), Q30(0.0065393616f/M_PI), Q30(0.0008894200f/M_PI),
+      Q30(0.0099996664f/M_PI), Q30(0.0093698399f/M_PI), Q30(0.0084118480f/M_PI), Q30(0.0060095116f/M_PI), Q30(0.0036767013f/M_PI), Q30(0.0005000498f/M_PI),
+      Q30(0.0056233541f/M_PI), Q30(0.0052691097f/M_PI), Q30(0.0047303112f/M_PI), Q30(0.0033792770f/M_PI), Q30(0.0020674451f/M_PI), Q30(0.0002811795f/M_PI),
+      Q30(0.0031622672f/M_PI), Q30(0.0029630491f/M_PI), Q30(0.0026600463f/M_PI), Q30(0.0019002859f/M_PI), Q30(0.0011625893f/M_PI), Q30(0.0001581155f/M_PI)
+    };
+
+    static const int gamma_tab[] =
+    {
+      Q30(0.0000000000f/M_PI), Q30(0.0195873566f/M_PI), Q30(0.0303316917f/M_PI), Q30(0.0448668823f/M_PI), Q30(0.0522258915f/M_PI), Q30(0.0561044961f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0433459543f/M_PI), Q30(0.0672172382f/M_PI), Q30(0.0997167900f/M_PI), Q30(0.1162951663f/M_PI), Q30(0.1250736862f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0672341362f/M_PI), Q30(0.1045235619f/M_PI), Q30(0.1558904350f/M_PI), Q30(0.1824723780f/M_PI), Q30(0.1966800541f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1011129096f/M_PI), Q30(0.1580764502f/M_PI), Q30(0.2387557179f/M_PI), Q30(0.2820728719f/M_PI), Q30(0.3058380187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1315985769f/M_PI), Q30(0.2072522491f/M_PI), Q30(0.3188187480f/M_PI), Q30(0.3825501204f/M_PI), Q30(0.4193951190f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1603866369f/M_PI), Q30(0.2549437582f/M_PI), Q30(0.4029446840f/M_PI), Q30(0.4980689585f/M_PI), Q30(0.5615641475f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1736015975f/M_PI), Q30(0.2773745656f/M_PI), Q30(0.4461984038f/M_PI), Q30(0.5666890144f/M_PI), Q30(0.6686112881f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1784276664f/M_PI), Q30(0.2856673002f/M_PI), Q30(0.4630723596f/M_PI), Q30(0.5971632004f/M_PI), Q30(0.7603877187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1736015975f/M_PI), Q30(0.2773745656f/M_PI), Q30(0.4461984038f/M_PI), Q30(0.5666890144f/M_PI), Q30(0.6686112881f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1603866369f/M_PI), Q30(0.2549437582f/M_PI), Q30(0.4029446840f/M_PI), Q30(0.4980689585f/M_PI), Q30(0.5615641475f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1315985769f/M_PI), Q30(0.2072522491f/M_PI), Q30(0.3188187480f/M_PI), Q30(0.3825501204f/M_PI), Q30(0.4193951190f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1011129096f/M_PI), Q30(0.1580764502f/M_PI), Q30(0.2387557179f/M_PI), Q30(0.2820728719f/M_PI), Q30(0.3058380187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0672341362f/M_PI), Q30(0.1045235619f/M_PI), Q30(0.1558904350f/M_PI), Q30(0.1824723780f/M_PI), Q30(0.1966800541f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0433459543f/M_PI), Q30(0.0672172382f/M_PI), Q30(0.0997167900f/M_PI), Q30(0.1162951663f/M_PI), Q30(0.1250736862f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0195873566f/M_PI), Q30(0.0303316917f/M_PI), Q30(0.0448668823f/M_PI), Q30(0.0522258915f/M_PI), Q30(0.0561044961f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0011053939f/M_PI), Q30(0.0017089852f/M_PI), Q30(0.0025254129f/M_PI), Q30(0.0029398468f/M_PI), Q30(0.0031597170f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0019607407f/M_PI), Q30(0.0030395309f/M_PI), Q30(0.0044951206f/M_PI), Q30(0.0052305623f/M_PI), Q30(0.0056152637f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0034913034f/M_PI), Q30(0.0054070661f/M_PI), Q30(0.0079917293f/M_PI), Q30(0.0092999367f/M_PI), Q30(0.0099875759f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0062100487f/M_PI), Q30(0.0096135242f/M_PI), Q30(0.0142110568f/M_PI), Q30(0.0165348612f/M_PI), Q30(0.0177587029f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0110366223f/M_PI), Q30(0.0170863140f/M_PI), Q30(0.0252620988f/M_PI), Q30(0.0293955617f/M_PI), Q30(0.0315726399f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0195873566f/M_PI), Q30(0.0303316917f/M_PI), Q30(0.0448668823f/M_PI), Q30(0.0522258915f/M_PI), Q30(0.0561044961f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0275881495f/M_PI), Q30(0.0427365713f/M_PI), Q30(0.0632618815f/M_PI), Q30(0.0736731067f/M_PI), Q30(0.0791663304f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0387469754f/M_PI), Q30(0.0600636788f/M_PI), Q30(0.0890387669f/M_PI), Q30(0.1037906483f/M_PI), Q30(0.1115923747f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0541138873f/M_PI), Q30(0.0839984417f/M_PI), Q30(0.1248718798f/M_PI), Q30(0.1458375156f/M_PI), Q30(0.1569785923f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0747506917f/M_PI), Q30(0.1163287833f/M_PI), Q30(0.1738867164f/M_PI), Q30(0.2038587779f/M_PI), Q30(0.2199459076f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1011129096f/M_PI), Q30(0.1580764502f/M_PI), Q30(0.2387557179f/M_PI), Q30(0.2820728719f/M_PI), Q30(0.3058380187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1212290376f/M_PI), Q30(0.1903949380f/M_PI), Q30(0.2907958031f/M_PI), Q30(0.3466993868f/M_PI), Q30(0.3782821596f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1418247074f/M_PI), Q30(0.2240308374f/M_PI), Q30(0.3474813402f/M_PI), Q30(0.4202919006f/M_PI), Q30(0.4637607038f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1603866369f/M_PI), Q30(0.2549437582f/M_PI), Q30(0.4029446840f/M_PI), Q30(0.4980689585f/M_PI), Q30(0.5615641475f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1736015975f/M_PI), Q30(0.2773745656f/M_PI), Q30(0.4461984038f/M_PI), Q30(0.5666890144f/M_PI), Q30(0.6686112881f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1784276664f/M_PI), Q30(0.2856673002f/M_PI), Q30(0.4630723596f/M_PI), Q30(0.5971632004f/M_PI), Q30(0.7603877187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1736015975f/M_PI), Q30(0.2773745656f/M_PI), Q30(0.4461984038f/M_PI), Q30(0.5666890144f/M_PI), Q30(0.6686112881f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1603866369f/M_PI), Q30(0.2549437582f/M_PI), Q30(0.4029446840f/M_PI), Q30(0.4980689585f/M_PI), Q30(0.5615641475f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1418247074f/M_PI), Q30(0.2240308374f/M_PI), Q30(0.3474813402f/M_PI), Q30(0.4202919006f/M_PI), Q30(0.4637607038f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1212290376f/M_PI), Q30(0.1903949380f/M_PI), Q30(0.2907958031f/M_PI), Q30(0.3466993868f/M_PI), Q30(0.3782821596f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1011129096f/M_PI), Q30(0.1580764502f/M_PI), Q30(0.2387557179f/M_PI), Q30(0.2820728719f/M_PI), Q30(0.3058380187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0747506917f/M_PI), Q30(0.1163287833f/M_PI), Q30(0.1738867164f/M_PI), Q30(0.2038587779f/M_PI), Q30(0.2199459076f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0541138873f/M_PI), Q30(0.0839984417f/M_PI), Q30(0.1248718798f/M_PI), Q30(0.1458375156f/M_PI), Q30(0.1569785923f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0387469754f/M_PI), Q30(0.0600636788f/M_PI), Q30(0.0890387669f/M_PI), Q30(0.1037906483f/M_PI), Q30(0.1115923747f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0275881495f/M_PI), Q30(0.0427365713f/M_PI), Q30(0.0632618815f/M_PI), Q30(0.0736731067f/M_PI), Q30(0.0791663304f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0195873566f/M_PI), Q30(0.0303316917f/M_PI), Q30(0.0448668823f/M_PI), Q30(0.0522258915f/M_PI), Q30(0.0561044961f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0110366223f/M_PI), Q30(0.0170863140f/M_PI), Q30(0.0252620988f/M_PI), Q30(0.0293955617f/M_PI), Q30(0.0315726399f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0062100487f/M_PI), Q30(0.0096135242f/M_PI), Q30(0.0142110568f/M_PI), Q30(0.0165348612f/M_PI), Q30(0.0177587029f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0034913034f/M_PI), Q30(0.0054070661f/M_PI), Q30(0.0079917293f/M_PI), Q30(0.0092999367f/M_PI), Q30(0.0099875759f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0019607407f/M_PI), Q30(0.0030395309f/M_PI), Q30(0.0044951206f/M_PI), Q30(0.0052305623f/M_PI), Q30(0.0056152637f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0011053939f/M_PI), Q30(0.0017089852f/M_PI), Q30(0.0025254129f/M_PI), Q30(0.0029398468f/M_PI), Q30(0.0031597170f/M_PI)
+    };
+
+    static const int iid_par_dequant_c1[] = {
+        //iid_par_dequant_default
+        Q30(1.41198278375959f), Q30(1.40313815268360f), Q30(1.38687670404960f), Q30(1.34839972492648f),
+        Q30(1.29124937110028f), Q30(1.19603741667993f), Q30(1.10737240362323f), Q30(1),
+        Q30(0.87961716655242f), Q30(0.75464859232732f), Q30(0.57677990744575f), Q30(0.42640143271122f),
+        Q30(0.27671828230984f), Q30(0.17664462766713f), Q30(0.07940162697653f),
+        //iid_par_dequant_fine
+        Q30(1.41420649135832f), Q30(1.41419120222364f), Q30(1.41414285699784f), Q30(1.41399000859438f),
+        Q30(1.41350698548044f), Q30(1.41198278375959f), Q30(1.40977302262355f), Q30(1.40539479488545f),
+        Q30(1.39677960498402f), Q30(1.38005309967827f), Q30(1.34839972492648f), Q30(1.31392017367631f),
+        Q30(1.26431008149654f), Q30(1.19603741667993f), Q30(1.10737240362323f), Q30(1),
+        Q30(0.87961716655242f), Q30(0.75464859232732f), Q30(0.63365607219232f), Q30(0.52308104267543f),
+        Q30(0.42640143271122f), Q30(0.30895540465965f), Q30(0.22137464873077f), Q30(0.15768788954414f),
+        Q30(0.11198225164225f), Q30(0.07940162697653f), Q30(0.04469901562677f), Q30(0.02514469318284f),
+        Q30(0.01414142856998f), Q30(0.00795258154731f), Q30(0.00447211359449f),
+    };
+
+    static const int acos_icc_invq[] = {
+        Q31(0), Q31(0.178427635f/M_PI), Q31(0.28566733f/M_PI), Q31(0.46307236f/M_PI), Q31(0.59716315f/M_PI), Q31(0.78539816f/M_PI), Q31(1.10030855f/M_PI), Q31(1.57079633f/M_PI)
+    };
+    int iid, icc;
+
+    int k, m;
+    static const int8_t f_center_20[] = {
+        -3, -1, 1, 3, 5, 7, 10, 14, 18, 22,
+    };
+    static const int32_t f_center_34[] = {
+      Q31(  2/768.0),Q31(  6/768.0),Q31(10/768.0),Q31(14/768.0),Q31( 18/768.0),Q31( 22/768.0),Q31( 26/768.0),Q31(30/768.0),
+      Q31( 34/768.0),Q31(-10/768.0),Q31(-6/768.0),Q31(-2/768.0),Q31( 51/768.0),Q31( 57/768.0),Q31( 15/768.0),Q31(21/768.0),
+      Q31( 27/768.0),Q31( 33/768.0),Q31(39/768.0),Q31(45/768.0),Q31( 54/768.0),Q31( 66/768.0),Q31( 78/768.0),Q31(42/768.0),
+      Q31(102/768.0),Q31( 66/768.0),Q31(78/768.0),Q31(90/768.0),Q31(102/768.0),Q31(114/768.0),Q31(126/768.0),Q31(90/768.0)
+    };
+    static const int fractional_delay_links[] = { Q31(0.43f), Q31(0.75f), Q31(0.347f) };
+    const int fractional_delay_gain = Q31(0.39f);
+
+    for (pd0 = 0; pd0 < 8; pd0++) {
+        int pd0_re = (ipdopd_cos[pd0]+2)>>2;
+        int pd0_im = (ipdopd_sin[pd0]+2)>>2;
+        for (pd1 = 0; pd1 < 8; pd1++) {
+            int pd1_re = ipdopd_cos[pd1] >> 1;
+            int pd1_im = ipdopd_sin[pd1] >> 1;
+            for (pd2 = 0; pd2 < 8; pd2++) {
+                int shift, round;
+                int pd2_re = ipdopd_cos[pd2];
+                int pd2_im = ipdopd_sin[pd2];
+                int re_smooth = pd0_re + pd1_re + pd2_re;
+                int im_smooth = pd0_im + pd1_im + pd2_im;
+
+                SoftFloat pd_mag = av_int2sf(((ipdopd_cos[(pd0-pd1)&7]+8)>>4) + ((ipdopd_cos[(pd0-pd2)&7]+4)>>3) +
+                                               ((ipdopd_cos[(pd1-pd2)&7]+2)>>2) + 0x15000000, 28);
+                pd_mag = av_div_sf(FLOAT_1, av_sqrt_sf(pd_mag));
+                shift = 30 - pd_mag.exp;
+                round = 1 << (shift-1);
+                pd_re_smooth[pd0*64+pd1*8+pd2] = (int)(((int64_t)re_smooth * pd_mag.mant + round) >> shift);
+                pd_im_smooth[pd0*64+pd1*8+pd2] = (int)(((int64_t)im_smooth * pd_mag.mant + round) >> shift);
+            }
+        }
+    }
+
+    idx = 0;
+    for (iid = 0; iid < 46; iid++) {
+        int c1, c2;
+
+        c1 = iid_par_dequant_c1[iid];
+        if (iid < 15)
+          c2 = iid_par_dequant_c1[14-iid];
+        else
+          c2 = iid_par_dequant_c1[60-iid];
+
+        for (icc = 0; icc < 8; icc++) {
+            /*if (PS_BASELINE || ps->icc_mode < 3)*/{
+                int alpha, beta;
+                int ca, sa, cb, sb;
+
+                alpha = acos_icc_invq[icc];
+                beta = (int)(((int64_t)alpha * 1518500250 + 0x40000000) >> 31);
+                alpha >>= 1;
+                beta = (int)(((int64_t)beta * (c1 - c2) + 0x40000000) >> 31);
+                av_sincos_sf(beta + alpha, &sa, &ca);
+                av_sincos_sf(beta - alpha, &sb, &cb);
+
+                HA[iid][icc][0] = (int)(((int64_t)c2 * ca + 0x20000000) >> 30);
+                HA[iid][icc][1] = (int)(((int64_t)c1 * cb + 0x20000000) >> 30);
+                HA[iid][icc][2] = (int)(((int64_t)c2 * sa + 0x20000000) >> 30);
+                HA[iid][icc][3] = (int)(((int64_t)c1 * sb + 0x20000000) >> 30);
+            } /* else */ {
+                int alpha_int, gamma_int;
+                int alpha_c_int, alpha_s_int, gamma_c_int, gamma_s_int;
+
+                alpha_int = alpha_tab[idx];
+                gamma_int = gamma_tab[idx];
+
+                av_sincos_sf(alpha_int, &alpha_s_int, &alpha_c_int);
+                av_sincos_sf(gamma_int, &gamma_s_int, &gamma_c_int);
+
+                alpha_c_int = (int)(((int64_t)alpha_c_int * 1518500250 + 0x20000000) >> 30);
+                alpha_s_int = (int)(((int64_t)alpha_s_int * 1518500250 + 0x20000000) >> 30);
+
+                HB[iid][icc][0] = (int)(((int64_t)alpha_c_int * gamma_c_int + 0x20000000) >> 30);
+                HB[iid][icc][1] = (int)(((int64_t)alpha_s_int * gamma_c_int + 0x20000000) >> 30);
+                HB[iid][icc][2] = -(int)(((int64_t)alpha_s_int * gamma_s_int + 0x20000000) >> 30);
+                HB[iid][icc][3] = (int)(((int64_t)alpha_c_int * gamma_s_int + 0x20000000) >> 30);
+            }
+
+            if (icc < 5 || icc > 6)
+              idx++;
+        }
+    }
+
+    for (k = 0; k < NR_ALLPASS_BANDS20; k++) {
+        int theta;
+        int64_t f_center;
+        int c, s;
+
+        if (k < FF_ARRAY_ELEMS(f_center_20))
+          f_center = f_center_20[k];
+        else
+          f_center = (k << 3) - 52;
+
+        for (m = 0; m < PS_AP_LINKS; m++) {
+            theta = (int)(((int64_t)fractional_delay_links[m] * f_center + 8) >> 4);
+            av_sincos_sf(-theta, &s, &c);
+            Q_fract_allpass[0][k][m][0] = c;
+            Q_fract_allpass[0][k][m][1] = s;
+        }
+
+        theta = (int)(((int64_t)fractional_delay_gain * f_center + 8) >> 4);
+        av_sincos_sf(-theta, &s, &c);
+        phi_fract[0][k][0] = c;
+        phi_fract[0][k][1] = s;
+    }
+
+    for (k = 0; k < NR_ALLPASS_BANDS34; k++) {
+        int theta, f_center;
+        int c, s;
+
+        if (k < FF_ARRAY_ELEMS(f_center_34))
+            f_center = f_center_34[k];
+        else
+            f_center = ((int64_t)k << 26) - (53 << 25);
+
+        for (m = 0; m < PS_AP_LINKS; m++) {
+            theta = (int)(((int64_t)fractional_delay_links[m] * f_center + 0x10000000) >> 27);
+            av_sincos_sf(-theta, &s, &c);
+            Q_fract_allpass[1][k][m][0] = c;
+            Q_fract_allpass[1][k][m][1] = s;
+        }
+
+        theta = (int)(((int64_t)fractional_delay_gain * f_center + 0x10000000) >> 27);
+        av_sincos_sf(-theta, &s, &c);
+        phi_fract[1][k][0] = c;
+        phi_fract[1][k][1] = s;
+    }
+
+    make_filters_from_proto(f20_0_8,  g0_Q8,   8);
+    make_filters_from_proto(f34_0_12, g0_Q12, 12);
+    make_filters_from_proto(f34_1_8,  g1_Q8,   8);
+    make_filters_from_proto(f34_2_4,  g2_Q4,   4);
+}
+#endif /* CONFIG_HARDCODED_TABLES */
+
+#endif /* AACPS_FIXED_TABLEGEN_H */
diff --git a/libavcodec/aacps_float.c b/libavcodec/aacps_float.c
new file mode 100644
index 0000000000..73259c10fb
--- /dev/null
+++ b/libavcodec/aacps_float.c
@@ -0,0 +1,24 @@
+/*
+ * MPEG-4 Parametric Stereo decoding functions
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 0
+
+#include "aacps.c"
diff --git a/libavcodec/aacps_tablegen.c b/libavcodec/aacps_tablegen.c
index 537b6ba651..26a6752faa 100644
--- a/libavcodec/aacps_tablegen.c
+++ b/libavcodec/aacps_tablegen.c
@@ -3,91 +3,22 @@
  *
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <stdlib.h>
-#define CONFIG_HARDCODED_TABLES 0
-#include "aacps_tablegen.h"
-#include "tableprint.h"
-
-void write_float_3d_array (const void *p, int b, int c, int d)
-{
-    int i;
-    const float *f = p;
-    for (i = 0; i < b; i++) {
-        printf("{\n");
-        write_float_2d_array(f, c, d);
-        printf("},\n");
-        f += c * d;
-    }
-}
-
-void write_float_4d_array (const void *p, int a, int b, int c, int d)
-{
-    int i;
-    const float *f = p;
-    for (i = 0; i < a; i++) {
-        printf("{\n");
-        write_float_3d_array(f, b, c, d);
-        printf("},\n");
-        f += b * c * d;
-    }
-}
-
-int main(void)
-{
-    ps_tableinit();
-
-    write_fileheader();
-
-    printf("static const float pd_re_smooth[8*8*8] = {\n");
-    write_float_array(pd_re_smooth, 8*8*8);
-    printf("};\n");
-    printf("static const float pd_im_smooth[8*8*8] = {\n");
-    write_float_array(pd_im_smooth, 8*8*8);
-    printf("};\n");
-
-    printf("static const float HA[46][8][4] = {\n");
-    write_float_3d_array(HA, 46, 8, 4);
-    printf("};\n");
-    printf("static const float HB[46][8][4] = {\n");
-    write_float_3d_array(HB, 46, 8, 4);
-    printf("};\n");
-
-    printf("static const DECLARE_ALIGNED(16, float, f20_0_8)[8][8][2] = {\n");
-    write_float_3d_array(f20_0_8, 8, 8, 2);
-    printf("};\n");
-    printf("static const DECLARE_ALIGNED(16, float, f34_0_12)[12][8][2] = {\n");
-    write_float_3d_array(f34_0_12, 12, 8, 2);
-    printf("};\n");
-    printf("static const DECLARE_ALIGNED(16, float, f34_1_8)[8][8][2] = {\n");
-    write_float_3d_array(f34_1_8, 8, 8, 2);
-    printf("};\n");
-    printf("static const DECLARE_ALIGNED(16, float, f34_2_4)[4][8][2] = {\n");
-    write_float_3d_array(f34_2_4, 4, 8, 2);
-    printf("};\n");
-
-    printf("static TABLE_CONST DECLARE_ALIGNED(16, float, Q_fract_allpass)[2][50][3][2] = {\n");
-    write_float_4d_array(Q_fract_allpass, 2, 50, 3, 2);
-    printf("};\n");
-    printf("static const DECLARE_ALIGNED(16, float, phi_fract)[2][50][2] = {\n");
-    write_float_3d_array(phi_fract, 2, 50, 2);
-    printf("};\n");
-
-    return 0;
-}
+#define USE_FIXED 0
+#include "aacps_tablegen_template.c"
diff --git a/libavcodec/aacps_tablegen.h b/libavcodec/aacps_tablegen.h
index a53f9fac1f..ca1112ddd0 100644
--- a/libavcodec/aacps_tablegen.h
+++ b/libavcodec/aacps_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,7 +70,7 @@ static const float g2_Q4[] = {
      0.16486303567403f,  0.23279856662996f, 0.25f
 };
 
-static void make_filters_from_proto(float (*filter)[8][2], const float *proto, int bands)
+static av_cold void make_filters_from_proto(float (*filter)[8][2], const float *proto, int bands)
 {
     int q, n;
     for (q = 0; q < bands; q++) {
@@ -82,7 +82,7 @@ static void make_filters_from_proto(float (*filter)[8][2], const float *proto, i
     }
 }
 
-static void ps_tableinit(void)
+static av_cold void ps_tableinit(void)
 {
     static const float ipdopd_sin[] = { 0, M_SQRT1_2, 1,  M_SQRT1_2,  0, -M_SQRT1_2, -1, -M_SQRT1_2 };
     static const float ipdopd_cos[] = { 1, M_SQRT1_2, 0, -M_SQRT1_2, -1, -M_SQRT1_2,  0,  M_SQRT1_2 };
diff --git a/libavcodec/aacps_tablegen_template.c b/libavcodec/aacps_tablegen_template.c
new file mode 100644
index 0000000000..e06ec914f2
--- /dev/null
+++ b/libavcodec/aacps_tablegen_template.c
@@ -0,0 +1,107 @@
+/*
+ * Generate a header file for hardcoded Parametric Stereo tables
+ *
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#define CONFIG_HARDCODED_TABLES 0
+#include "aac_defines.h"
+
+#if USE_FIXED
+#define TYPE_NAME "int32_t"
+#define INT32FLOAT int32_t
+#define ARRAY_RENAME(x) write_int32_t_ ## x
+#define ARRAY_URENAME(x) write_uint32_t_ ## x
+#include "aacps_fixed_tablegen.h"
+#else
+#define TYPE_NAME "float"
+#define INT32FLOAT float
+#define ARRAY_RENAME(x) write_float_ ## x
+#define ARRAY_URENAME(x) write_float_ ## x
+#include "aacps_tablegen.h"
+#endif /* USE_FIXED */
+#include "tableprint.h"
+
+void ARRAY_RENAME(3d_array) (const void *p, int b, int c, int d)
+{
+    int i;
+    const INT32FLOAT *f = p;
+    for (i = 0; i < b; i++) {
+        printf("{\n");
+        ARRAY_URENAME(2d_array)(f, c, d);
+        printf("},\n");
+        f += c * d;
+    }
+}
+
+void ARRAY_RENAME(4d_array) (const void *p, int a, int b, int c, int d)
+{
+    int i;
+    const INT32FLOAT *f = p;
+    for (i = 0; i < a; i++) {
+        printf("{\n");
+        ARRAY_RENAME(3d_array)(f, b, c, d);
+        printf("},\n");
+        f += b * c * d;
+    }
+}
+
+int main(void)
+{
+    ps_tableinit();
+
+    write_fileheader();
+
+    printf("static const %s pd_re_smooth[8*8*8] = {\n", TYPE_NAME);
+    ARRAY_RENAME(array)(pd_re_smooth, 8*8*8);
+    printf("};\n");
+    printf("static const %s pd_im_smooth[8*8*8] = {\n", TYPE_NAME);
+    ARRAY_RENAME(array)(pd_im_smooth, 8*8*8);
+    printf("};\n");
+
+    printf("static const %s HA[46][8][4] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(HA, 46, 8, 4);
+    printf("};\n");
+    printf("static const %s HB[46][8][4] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(HB, 46, 8, 4);
+    printf("};\n");
+
+    printf("static const DECLARE_ALIGNED(16, %s, f20_0_8)[8][8][2] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(f20_0_8, 8, 8, 2);
+    printf("};\n");
+    printf("static const DECLARE_ALIGNED(16, %s, f34_0_12)[12][8][2] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(f34_0_12, 12, 8, 2);
+    printf("};\n");
+    printf("static const DECLARE_ALIGNED(16, %s, f34_1_8)[8][8][2] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(f34_1_8, 8, 8, 2);
+    printf("};\n");
+    printf("static const DECLARE_ALIGNED(16, %s, f34_2_4)[4][8][2] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(f34_2_4, 4, 8, 2);
+    printf("};\n");
+
+    printf("static const DECLARE_ALIGNED(16, %s, Q_fract_allpass)[2][50][3][2] = {\n", TYPE_NAME);
+    ARRAY_RENAME(4d_array)(Q_fract_allpass, 2, 50, 3, 2);
+    printf("};\n");
+    printf("static const DECLARE_ALIGNED(16, %s, phi_fract)[2][50][2] = {\n", TYPE_NAME);
+    ARRAY_RENAME(3d_array)(phi_fract, 2, 50, 2);
+    printf("};\n");
+
+    return 0;
+}
diff --git a/libavcodec/aacpsdata.c b/libavcodec/aacpsdata.c
index 675bd8e2b3..5c1a1b0f88 100644
--- a/libavcodec/aacpsdata.c
+++ b/libavcodec/aacpsdata.c
@@ -2,20 +2,20 @@
  * MPEG-4 Parametric Stereo data tables
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -157,7 +157,7 @@ static const int8_t k_to_i_34[] = {
     33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33
 };
 
-static const float g1_Q2[] = {
-    0.0f,  0.01899487526049f, 0.0f, -0.07293139167538f,
-    0.0f,  0.30596630545168f, 0.5f
+static const INTFLOAT g1_Q2[] = {
+    Q31(0.0f),  Q31(0.01899487526049f), Q31(0.0f), Q31(-0.07293139167538f),
+    Q31(0.0f),  Q31(0.30596630545168f), Q31(0.5f)
 };
diff --git a/libavcodec/aacpsdsp.c b/libavcodec/aacpsdsp.c
deleted file mode 100644
index 88e731f925..0000000000
--- a/libavcodec/aacpsdsp.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "aacpsdsp.h"
-
-static void ps_add_squares_c(float *dst, const float (*src)[2], int n)
-{
-    int i;
-    for (i = 0; i < n; i++)
-        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
-}
-
-static void ps_mul_pair_single_c(float (*dst)[2], float (*src0)[2], float *src1,
-                                 int n)
-{
-    int i;
-    for (i = 0; i < n; i++) {
-        dst[i][0] = src0[i][0] * src1[i];
-        dst[i][1] = src0[i][1] * src1[i];
-    }
-}
-
-static void ps_hybrid_analysis_c(float (*out)[2], float (*in)[2],
-                                 const float (*filter)[8][2],
-                                 int stride, int n)
-{
-    int i, j;
-
-    for (i = 0; i < n; i++) {
-        float sum_re = filter[i][6][0] * in[6][0];
-        float sum_im = filter[i][6][0] * in[6][1];
-
-        for (j = 0; j < 6; j++) {
-            float in0_re = in[j][0];
-            float in0_im = in[j][1];
-            float in1_re = in[12-j][0];
-            float in1_im = in[12-j][1];
-            sum_re += filter[i][j][0] * (in0_re + in1_re) -
-                      filter[i][j][1] * (in0_im - in1_im);
-            sum_im += filter[i][j][0] * (in0_im + in1_im) +
-                      filter[i][j][1] * (in0_re - in1_re);
-        }
-        out[i * stride][0] = sum_re;
-        out[i * stride][1] = sum_im;
-    }
-}
-
-static void ps_hybrid_analysis_ileave_c(float (*out)[32][2], float L[2][38][64],
-                                        int i, int len)
-{
-    int j;
-
-    for (; i < 64; i++) {
-        for (j = 0; j < len; j++) {
-            out[i][j][0] = L[0][j][i];
-            out[i][j][1] = L[1][j][i];
-        }
-    }
-}
-
-static void ps_hybrid_synthesis_deint_c(float out[2][38][64],
-                                        float (*in)[32][2],
-                                        int i, int len)
-{
-    int n;
-
-    for (; i < 64; i++) {
-        for (n = 0; n < len; n++) {
-            out[0][n][i] = in[i][n][0];
-            out[1][n][i] = in[i][n][1];
-        }
-    }
-}
-
-static void ps_decorrelate_c(float (*out)[2], float (*delay)[2],
-                             float (*ap_delay)[PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2],
-                             const float phi_fract[2], const float (*Q_fract)[2],
-                             const float *transient_gain,
-                             float g_decay_slope,
-                             int len)
-{
-    static const float a[] = { 0.65143905753106f,
-                               0.56471812200776f,
-                               0.48954165955695f };
-    float ag[PS_AP_LINKS];
-    int m, n;
-
-    for (m = 0; m < PS_AP_LINKS; m++)
-        ag[m] = a[m] * g_decay_slope;
-
-    for (n = 0; n < len; n++) {
-        float in_re = delay[n][0] * phi_fract[0] - delay[n][1] * phi_fract[1];
-        float in_im = delay[n][0] * phi_fract[1] + delay[n][1] * phi_fract[0];
-        for (m = 0; m < PS_AP_LINKS; m++) {
-            float a_re                = ag[m] * in_re;
-            float a_im                = ag[m] * in_im;
-            float link_delay_re       = ap_delay[m][n+2-m][0];
-            float link_delay_im       = ap_delay[m][n+2-m][1];
-            float fractional_delay_re = Q_fract[m][0];
-            float fractional_delay_im = Q_fract[m][1];
-            float apd_re = in_re;
-            float apd_im = in_im;
-            in_re = link_delay_re * fractional_delay_re -
-                    link_delay_im * fractional_delay_im - a_re;
-            in_im = link_delay_re * fractional_delay_im +
-                    link_delay_im * fractional_delay_re - a_im;
-            ap_delay[m][n+5][0] = apd_re + ag[m] * in_re;
-            ap_delay[m][n+5][1] = apd_im + ag[m] * in_im;
-        }
-        out[n][0] = transient_gain[n] * in_re;
-        out[n][1] = transient_gain[n] * in_im;
-    }
-}
-
-static void ps_stereo_interpolate_c(float (*l)[2], float (*r)[2],
-                                    float h[2][4], float h_step[2][4],
-                                    int len)
-{
-    float h0 = h[0][0];
-    float h1 = h[0][1];
-    float h2 = h[0][2];
-    float h3 = h[0][3];
-    float hs0 = h_step[0][0];
-    float hs1 = h_step[0][1];
-    float hs2 = h_step[0][2];
-    float hs3 = h_step[0][3];
-    int n;
-
-    for (n = 0; n < len; n++) {
-        //l is s, r is d
-        float l_re = l[n][0];
-        float l_im = l[n][1];
-        float r_re = r[n][0];
-        float r_im = r[n][1];
-        h0 += hs0;
-        h1 += hs1;
-        h2 += hs2;
-        h3 += hs3;
-        l[n][0] = h0 * l_re + h2 * r_re;
-        l[n][1] = h0 * l_im + h2 * r_im;
-        r[n][0] = h1 * l_re + h3 * r_re;
-        r[n][1] = h1 * l_im + h3 * r_im;
-    }
-}
-
-static void ps_stereo_interpolate_ipdopd_c(float (*l)[2], float (*r)[2],
-                                           float h[2][4], float h_step[2][4],
-                                           int len)
-{
-    float h00  = h[0][0],      h10  = h[1][0];
-    float h01  = h[0][1],      h11  = h[1][1];
-    float h02  = h[0][2],      h12  = h[1][2];
-    float h03  = h[0][3],      h13  = h[1][3];
-    float hs00 = h_step[0][0], hs10 = h_step[1][0];
-    float hs01 = h_step[0][1], hs11 = h_step[1][1];
-    float hs02 = h_step[0][2], hs12 = h_step[1][2];
-    float hs03 = h_step[0][3], hs13 = h_step[1][3];
-    int n;
-
-    for (n = 0; n < len; n++) {
-        //l is s, r is d
-        float l_re = l[n][0];
-        float l_im = l[n][1];
-        float r_re = r[n][0];
-        float r_im = r[n][1];
-        h00 += hs00;
-        h01 += hs01;
-        h02 += hs02;
-        h03 += hs03;
-        h10 += hs10;
-        h11 += hs11;
-        h12 += hs12;
-        h13 += hs13;
-
-        l[n][0] = h00 * l_re + h02 * r_re - h10 * l_im - h12 * r_im;
-        l[n][1] = h00 * l_im + h02 * r_im + h10 * l_re + h12 * r_re;
-        r[n][0] = h01 * l_re + h03 * r_re - h11 * l_im - h13 * r_im;
-        r[n][1] = h01 * l_im + h03 * r_im + h11 * l_re + h13 * r_re;
-    }
-}
-
-av_cold void ff_psdsp_init(PSDSPContext *s)
-{
-    s->add_squares            = ps_add_squares_c;
-    s->mul_pair_single        = ps_mul_pair_single_c;
-    s->hybrid_analysis        = ps_hybrid_analysis_c;
-    s->hybrid_analysis_ileave = ps_hybrid_analysis_ileave_c;
-    s->hybrid_synthesis_deint = ps_hybrid_synthesis_deint_c;
-    s->decorrelate            = ps_decorrelate_c;
-    s->stereo_interpolate[0]  = ps_stereo_interpolate_c;
-    s->stereo_interpolate[1]  = ps_stereo_interpolate_ipdopd_c;
-
-    if (ARCH_ARM)
-        ff_psdsp_init_arm(s);
-}
diff --git a/libavcodec/aacpsdsp.h b/libavcodec/aacpsdsp.h
index dc380b10d0..c194bbe3ae 100644
--- a/libavcodec/aacpsdsp.h
+++ b/libavcodec/aacpsdsp.h
@@ -1,53 +1,57 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef LIBAVCODEC_AACPSDSP_H
 #define LIBAVCODEC_AACPSDSP_H
 
+#include "aac_defines.h"
+
 #define PS_QMF_TIME_SLOTS 32
 #define PS_AP_LINKS 3
 #define PS_MAX_AP_DELAY 5
 
 typedef struct PSDSPContext {
-    void (*add_squares)(float *dst, const float (*src)[2], int n);
-    void (*mul_pair_single)(float (*dst)[2], float (*src0)[2], float *src1,
+    void (*add_squares)(INTFLOAT *dst, const INTFLOAT (*src)[2], int n);
+    void (*mul_pair_single)(INTFLOAT (*dst)[2], INTFLOAT (*src0)[2], INTFLOAT *src1,
                             int n);
-    void (*hybrid_analysis)(float (*out)[2], float (*in)[2],
-                            const float (*filter)[8][2],
+    void (*hybrid_analysis)(INTFLOAT (*out)[2], INTFLOAT (*in)[2],
+                            const INTFLOAT (*filter)[8][2],
                             int stride, int n);
-    void (*hybrid_analysis_ileave)(float (*out)[32][2], float L[2][38][64],
+    void (*hybrid_analysis_ileave)(INTFLOAT (*out)[32][2], INTFLOAT L[2][38][64],
                                    int i, int len);
-    void (*hybrid_synthesis_deint)(float out[2][38][64], float (*in)[32][2],
+    void (*hybrid_synthesis_deint)(INTFLOAT out[2][38][64], INTFLOAT (*in)[32][2],
                                    int i, int len);
-    void (*decorrelate)(float (*out)[2], float (*delay)[2],
-                        float (*ap_delay)[PS_QMF_TIME_SLOTS+PS_MAX_AP_DELAY][2],
-                        const float phi_fract[2], const float (*Q_fract)[2],
-                        const float *transient_gain,
-                        float g_decay_slope,
+    void (*decorrelate)(INTFLOAT (*out)[2], INTFLOAT (*delay)[2],
+                        INTFLOAT (*ap_delay)[PS_QMF_TIME_SLOTS+PS_MAX_AP_DELAY][2],
+                        const INTFLOAT phi_fract[2], const INTFLOAT (*Q_fract)[2],
+                        const INTFLOAT *transient_gain,
+                        INTFLOAT g_decay_slope,
                         int len);
-    void (*stereo_interpolate[2])(float (*l)[2], float (*r)[2],
-                                  float h[2][4], float h_step[2][4],
+    void (*stereo_interpolate[2])(INTFLOAT (*l)[2], INTFLOAT (*r)[2],
+                                  INTFLOAT h[2][4], INTFLOAT h_step[2][4],
                                   int len);
 } PSDSPContext;
 
-void ff_psdsp_init(PSDSPContext *s);
+void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s);
 void ff_psdsp_init_arm(PSDSPContext *s);
+void ff_psdsp_init_mips(PSDSPContext *s);
+void ff_psdsp_init_x86(PSDSPContext *s);
 
 #endif /* LIBAVCODEC_AACPSDSP_H */
diff --git a/libavcodec/aacpsdsp_fixed.c b/libavcodec/aacpsdsp_fixed.c
new file mode 100644
index 0000000000..2413295113
--- /dev/null
+++ b/libavcodec/aacpsdsp_fixed.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+
+#include "aacpsdsp_template.c"
diff --git a/libavcodec/aacpsdsp_float.c b/libavcodec/aacpsdsp_float.c
new file mode 100644
index 0000000000..99aa650acf
--- /dev/null
+++ b/libavcodec/aacpsdsp_float.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 0
+
+#include "aacpsdsp_template.c"
diff --git a/libavcodec/aacpsdsp_template.c b/libavcodec/aacpsdsp_template.c
new file mode 100644
index 0000000000..3049ce8b79
--- /dev/null
+++ b/libavcodec/aacpsdsp_template.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Note: Rounding-to-nearest used unless otherwise stated
+ *
+ */
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "aacpsdsp.h"
+
+static void ps_add_squares_c(INTFLOAT *dst, const INTFLOAT (*src)[2], int n)
+{
+    int i;
+    for (i = 0; i < n; i++)
+        dst[i] += AAC_MADD28(src[i][0], src[i][0], src[i][1], src[i][1]);
+}
+
+static void ps_mul_pair_single_c(INTFLOAT (*dst)[2], INTFLOAT (*src0)[2], INTFLOAT *src1,
+                                 int n)
+{
+    int i;
+    for (i = 0; i < n; i++) {
+        dst[i][0] = AAC_MUL16(src0[i][0], src1[i]);
+        dst[i][1] = AAC_MUL16(src0[i][1], src1[i]);
+    }
+}
+
+static void ps_hybrid_analysis_c(INTFLOAT (*out)[2], INTFLOAT (*in)[2],
+                                 const INTFLOAT (*filter)[8][2],
+                                 int stride, int n)
+{
+    int i, j;
+
+    for (i = 0; i < n; i++) {
+        INT64FLOAT sum_re = (INT64FLOAT)filter[i][6][0] * in[6][0];
+        INT64FLOAT sum_im = (INT64FLOAT)filter[i][6][0] * in[6][1];
+
+        for (j = 0; j < 6; j++) {
+            INTFLOAT in0_re = in[j][0];
+            INTFLOAT in0_im = in[j][1];
+            INTFLOAT in1_re = in[12-j][0];
+            INTFLOAT in1_im = in[12-j][1];
+            sum_re += (INT64FLOAT)filter[i][j][0] * (in0_re + in1_re) -
+                      (INT64FLOAT)filter[i][j][1] * (in0_im - in1_im);
+            sum_im += (INT64FLOAT)filter[i][j][0] * (in0_im + in1_im) +
+                      (INT64FLOAT)filter[i][j][1] * (in0_re - in1_re);
+        }
+#if USE_FIXED
+        out[i * stride][0] = (int)((sum_re + 0x40000000) >> 31);
+        out[i * stride][1] = (int)((sum_im + 0x40000000) >> 31);
+#else
+        out[i * stride][0] = sum_re;
+        out[i * stride][1] = sum_im;
+#endif /* USE_FIXED */
+    }
+}
+static void ps_hybrid_analysis_ileave_c(INTFLOAT (*out)[32][2], INTFLOAT L[2][38][64],
+                                      int i, int len)
+{
+    int j;
+
+    for (; i < 64; i++) {
+        for (j = 0; j < len; j++) {
+            out[i][j][0] = L[0][j][i];
+            out[i][j][1] = L[1][j][i];
+        }
+    }
+}
+
+static void ps_hybrid_synthesis_deint_c(INTFLOAT out[2][38][64],
+                                      INTFLOAT (*in)[32][2],
+                                      int i, int len)
+{
+    int n;
+
+    for (; i < 64; i++) {
+        for (n = 0; n < len; n++) {
+            out[0][n][i] = in[i][n][0];
+            out[1][n][i] = in[i][n][1];
+        }
+    }
+}
+
+static void ps_decorrelate_c(INTFLOAT (*out)[2], INTFLOAT (*delay)[2],
+                             INTFLOAT (*ap_delay)[PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2],
+                             const INTFLOAT phi_fract[2], const INTFLOAT (*Q_fract)[2],
+                             const INTFLOAT *transient_gain,
+                             INTFLOAT g_decay_slope,
+                             int len)
+{
+    static const INTFLOAT a[] = { Q31(0.65143905753106f),
+                               Q31(0.56471812200776f),
+                               Q31(0.48954165955695f) };
+    INTFLOAT ag[PS_AP_LINKS];
+    int m, n;
+
+    for (m = 0; m < PS_AP_LINKS; m++)
+        ag[m] = AAC_MUL30(a[m], g_decay_slope);
+
+    for (n = 0; n < len; n++) {
+        INTFLOAT in_re = AAC_MSUB30(delay[n][0], phi_fract[0], delay[n][1], phi_fract[1]);
+        INTFLOAT in_im = AAC_MADD30(delay[n][0], phi_fract[1], delay[n][1], phi_fract[0]);
+        for (m = 0; m < PS_AP_LINKS; m++) {
+            INTFLOAT a_re                = AAC_MUL31(ag[m], in_re);
+            INTFLOAT a_im                = AAC_MUL31(ag[m], in_im);
+            INTFLOAT link_delay_re       = ap_delay[m][n+2-m][0];
+            INTFLOAT link_delay_im       = ap_delay[m][n+2-m][1];
+            INTFLOAT fractional_delay_re = Q_fract[m][0];
+            INTFLOAT fractional_delay_im = Q_fract[m][1];
+            INTFLOAT apd_re = in_re;
+            INTFLOAT apd_im = in_im;
+            in_re = AAC_MSUB30(link_delay_re, fractional_delay_re,
+                    link_delay_im, fractional_delay_im);
+            in_re -= a_re;
+            in_im = AAC_MADD30(link_delay_re, fractional_delay_im,
+                    link_delay_im, fractional_delay_re);
+            in_im -= a_im;
+            ap_delay[m][n+5][0] = apd_re + AAC_MUL31(ag[m], in_re);
+            ap_delay[m][n+5][1] = apd_im + AAC_MUL31(ag[m], in_im);
+        }
+        out[n][0] = AAC_MUL16(transient_gain[n], in_re);
+        out[n][1] = AAC_MUL16(transient_gain[n], in_im);
+    }
+}
+
+static void ps_stereo_interpolate_c(INTFLOAT (*l)[2], INTFLOAT (*r)[2],
+                                    INTFLOAT h[2][4], INTFLOAT h_step[2][4],
+                                    int len)
+{
+    INTFLOAT h0 = h[0][0];
+    INTFLOAT h1 = h[0][1];
+    INTFLOAT h2 = h[0][2];
+    INTFLOAT h3 = h[0][3];
+    INTFLOAT hs0 = h_step[0][0];
+    INTFLOAT hs1 = h_step[0][1];
+    INTFLOAT hs2 = h_step[0][2];
+    INTFLOAT hs3 = h_step[0][3];
+    int n;
+
+    for (n = 0; n < len; n++) {
+        //l is s, r is d
+        INTFLOAT l_re = l[n][0];
+        INTFLOAT l_im = l[n][1];
+        INTFLOAT r_re = r[n][0];
+        INTFLOAT r_im = r[n][1];
+        h0 += hs0;
+        h1 += hs1;
+        h2 += hs2;
+        h3 += hs3;
+        l[n][0] = AAC_MADD30(h0,  l_re,  h2, r_re);
+        l[n][1] = AAC_MADD30(h0,  l_im,  h2,  r_im);
+        r[n][0] = AAC_MADD30(h1,  l_re,  h3,  r_re);
+        r[n][1] = AAC_MADD30(h1,  l_im,  h3,  r_im);
+    }
+}
+
+static void ps_stereo_interpolate_ipdopd_c(INTFLOAT (*l)[2], INTFLOAT (*r)[2],
+                                           INTFLOAT h[2][4], INTFLOAT h_step[2][4],
+                                           int len)
+{
+    INTFLOAT h00  = h[0][0],      h10  = h[1][0];
+    INTFLOAT h01  = h[0][1],      h11  = h[1][1];
+    INTFLOAT h02  = h[0][2],      h12  = h[1][2];
+    INTFLOAT h03  = h[0][3],      h13  = h[1][3];
+    INTFLOAT hs00 = h_step[0][0], hs10 = h_step[1][0];
+    INTFLOAT hs01 = h_step[0][1], hs11 = h_step[1][1];
+    INTFLOAT hs02 = h_step[0][2], hs12 = h_step[1][2];
+    INTFLOAT hs03 = h_step[0][3], hs13 = h_step[1][3];
+    int n;
+
+    for (n = 0; n < len; n++) {
+        //l is s, r is d
+        INTFLOAT l_re = l[n][0];
+        INTFLOAT l_im = l[n][1];
+        INTFLOAT r_re = r[n][0];
+        INTFLOAT r_im = r[n][1];
+        h00 += hs00;
+        h01 += hs01;
+        h02 += hs02;
+        h03 += hs03;
+        h10 += hs10;
+        h11 += hs11;
+        h12 += hs12;
+        h13 += hs13;
+
+        l[n][0] = AAC_MSUB30_V8(h00, l_re, h02, r_re, h10, l_im, h12, r_im);
+        l[n][1] = AAC_MADD30_V8(h00, l_im, h02, r_im, h10, l_re, h12, r_re);
+        r[n][0] = AAC_MSUB30_V8(h01, l_re, h03, r_re, h11, l_im, h13, r_im);
+        r[n][1] = AAC_MADD30_V8(h01, l_im, h03, r_im, h11, l_re, h13, r_re);
+    }
+}
+
+av_cold void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s)
+{
+    s->add_squares            = ps_add_squares_c;
+    s->mul_pair_single        = ps_mul_pair_single_c;
+    s->hybrid_analysis        = ps_hybrid_analysis_c;
+    s->hybrid_analysis_ileave = ps_hybrid_analysis_ileave_c;
+    s->hybrid_synthesis_deint = ps_hybrid_synthesis_deint_c;
+    s->decorrelate            = ps_decorrelate_c;
+    s->stereo_interpolate[0]  = ps_stereo_interpolate_c;
+    s->stereo_interpolate[1]  = ps_stereo_interpolate_ipdopd_c;
+
+#if !USE_FIXED
+    if (ARCH_ARM)
+        ff_psdsp_init_arm(s);
+    if (ARCH_MIPS)
+        ff_psdsp_init_mips(s);
+    if (ARCH_X86)
+        ff_psdsp_init_x86(s);
+#endif /* !USE_FIXED */
+}
diff --git a/libavcodec/aacpsy.c b/libavcodec/aacpsy.c
index af567061e0..af235c758c 100644
--- a/libavcodec/aacpsy.c
+++ b/libavcodec/aacpsy.c
@@ -2,20 +2,20 @@
  * AAC encoder psychoacoustic model
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,8 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/libm.h"
+
 #include "avcodec.h"
 #include "aactab.h"
 #include "psymodel.h"
@@ -85,6 +87,7 @@ enum {
 };
 
 #define PSY_3GPP_BITS_TO_PE(bits) ((bits) * 1.18f)
+#define PSY_3GPP_PE_TO_BITS(bits) ((bits) / 1.18f)
 
 /* LAME psy model constants */
 #define PSY_LAME_FIR_LEN 21         ///< LAME psy model FIR order
@@ -216,6 +219,10 @@ static const float psy_fir_coeffs[] = {
     -5.52212e-17 * 2, -0.313819 * 2
 };
 
+#if ARCH_MIPS
+#   include "mips/aacpsy_mips.h"
+#endif /* ARCH_MIPS */
+
 /**
  * Calculate the ABR attack threshold from the above LAME psymodel table.
  */
@@ -294,7 +301,7 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
     int i, j, g, start;
     float prev, minscale, minath, minsnr, pe_min;
     const int chan_bitrate = ctx->avctx->bit_rate / ctx->avctx->channels;
-    const int bandwidth    = ctx->avctx->cutoff ? ctx->avctx->cutoff : ctx->avctx->sample_rate / 2;
+    const int bandwidth    = ctx->avctx->cutoff ? ctx->avctx->cutoff : AAC_CUTOFF(ctx->avctx);
     const float num_bark   = calc_bark((float)bandwidth);
 
     ctx->model_priv_data = av_mallocz(sizeof(AacPsyContext));
@@ -337,7 +344,7 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
             coeff->spread_low[1] = pow(10.0, -bark_width * en_spread_low);
             coeff->spread_hi [1] = pow(10.0, -bark_width * en_spread_hi);
             pe_min = bark_pe * bark_width;
-            minsnr = pow(2.0f, pe_min / band_sizes[g]) - 1.5f;
+            minsnr = exp2(pe_min / band_sizes[g]) - 1.5f;
             coeff->min_snr = av_clipf(1.0f / minsnr, PSY_SNR_25DB, PSY_SNR_1DB);
         }
         start = 0;
@@ -350,9 +357,9 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
         }
     }
 
-    pctx->ch = av_mallocz(sizeof(AacPsyChannel) * ctx->avctx->channels);
+    pctx->ch = av_mallocz_array(ctx->avctx->channels, sizeof(AacPsyChannel));
     if (!pctx->ch) {
-        av_freep(&pctx);
+        av_freep(&ctx->model_priv_data);
         return AVERROR(ENOMEM);
     }
 
@@ -532,8 +539,11 @@ static float calc_reduction_3gpp(float a, float desired_pe, float pe,
 {
     float thr_avg, reduction;
 
-    thr_avg   = powf(2.0f, (a - pe) / (4.0f * active_lines));
-    reduction = powf(2.0f, (a - desired_pe) / (4.0f * active_lines)) - thr_avg;
+    if(active_lines == 0.0)
+        return 0;
+
+    thr_avg   = exp2f((a - pe) / (4.0f * active_lines));
+    reduction = exp2f((a - desired_pe) / (4.0f * active_lines)) - thr_avg;
 
     return FFMAX(reduction, 0.0f);
 }
@@ -544,8 +554,10 @@ static float calc_reduced_thr_3gpp(AacPsyBand *band, float min_snr,
     float thr = band->thr;
 
     if (band->energy > thr) {
-        thr = powf(thr, 0.25f) + reduction;
-        thr = powf(thr, 4.0f);
+        thr = sqrtf(thr);
+        thr = sqrtf(thr) + reduction;
+        thr *= thr;
+        thr *= thr;
 
         /* This deviates from the 3GPP spec to match the reference encoder.
          * It performs min(thr_reduced, max(thr, energy/min_snr)) only for bands
@@ -561,6 +573,52 @@ static float calc_reduced_thr_3gpp(AacPsyBand *band, float min_snr,
     return thr;
 }
 
+#ifndef calc_thr_3gpp
+static void calc_thr_3gpp(const FFPsyWindowInfo *wi, const int num_bands, AacPsyChannel *pch,
+                          const uint8_t *band_sizes, const float *coefs)
+{
+    int i, w, g;
+    int start = 0;
+    for (w = 0; w < wi->num_windows*16; w += 16) {
+        for (g = 0; g < num_bands; g++) {
+            AacPsyBand *band = &pch->band[w+g];
+
+            float form_factor = 0.0f;
+            float Temp;
+            band->energy = 0.0f;
+            for (i = 0; i < band_sizes[g]; i++) {
+                band->energy += coefs[start+i] * coefs[start+i];
+                form_factor  += sqrtf(fabs(coefs[start+i]));
+            }
+            Temp = band->energy > 0 ? sqrtf((float)band_sizes[g] / band->energy) : 0;
+            band->thr      = band->energy * 0.001258925f;
+            band->nz_lines = form_factor * sqrtf(Temp);
+
+            start += band_sizes[g];
+        }
+    }
+}
+#endif /* calc_thr_3gpp */
+
+#ifndef psy_hp_filter
+static void psy_hp_filter(const float *firbuf, float *hpfsmpl, const float *psy_fir_coeffs)
+{
+    int i, j;
+    for (i = 0; i < AAC_BLOCK_SIZE_LONG; i++) {
+        float sum1, sum2;
+        sum1 = firbuf[i + (PSY_LAME_FIR_LEN - 1) / 2];
+        sum2 = 0.0;
+        for (j = 0; j < ((PSY_LAME_FIR_LEN - 1) / 2) - 1; j += 2) {
+            sum1 += psy_fir_coeffs[j] * (firbuf[i + j] + firbuf[i + PSY_LAME_FIR_LEN - j]);
+            sum2 += psy_fir_coeffs[j + 1] * (firbuf[i + j + 1] + firbuf[i + PSY_LAME_FIR_LEN - j - 1]);
+        }
+        /* NOTE: The LAME psymodel expects it's input in the range -32768 to 32768.
+         *       Tuning this for normalized floats would be difficult. */
+        hpfsmpl[i] = (sum1 + sum2) * 32768.0f;
+    }
+}
+#endif /* psy_hp_filter */
+
 /**
  * Calculate band thresholds as suggested in 3GPP TS26.403
  */
@@ -569,9 +627,8 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
 {
     AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data;
     AacPsyChannel *pch  = &pctx->ch[channel];
-    int start = 0;
     int i, w, g;
-    float desired_bits, desired_pe, delta_pe, reduction, spread_en[128] = {0};
+    float desired_bits, desired_pe, delta_pe, reduction= NAN, spread_en[128] = {0};
     float a = 0.0f, active_lines = 0.0f, norm_fac = 0.0f;
     float pe = pctx->chan_bitrate > 32000 ? 0.0f : FFMAX(50.0f, 100.0f - pctx->chan_bitrate * 100.0f / 32000.0f);
     const int      num_bands   = ctx->num_bands[wi->num_windows == 8];
@@ -580,22 +637,8 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
     const float avoid_hole_thr = wi->num_windows == 8 ? PSY_3GPP_AH_THR_SHORT : PSY_3GPP_AH_THR_LONG;
 
     //calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation"
-    for (w = 0; w < wi->num_windows*16; w += 16) {
-        for (g = 0; g < num_bands; g++) {
-            AacPsyBand *band = &pch->band[w+g];
-
-            float form_factor = 0.0f;
-            band->energy = 0.0f;
-            for (i = 0; i < band_sizes[g]; i++) {
-                band->energy += coefs[start+i] * coefs[start+i];
-                form_factor  += sqrtf(fabs(coefs[start+i]));
-            }
-            band->thr      = band->energy * 0.001258925f;
-            band->nz_lines = form_factor / powf(band->energy / band_sizes[g], 0.25f);
+    calc_thr_3gpp(wi, num_bands, pch, band_sizes, coefs);
 
-            start += band_sizes[g];
-        }
-    }
     //modify thresholds and energies - spread, threshold in quiet, pre-echo control
     for (w = 0; w < wi->num_windows*16; w += 16) {
         AacPsyBand *bands = &pch->band[w];
@@ -645,6 +688,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
         desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
                                0.85f, 1.15f);
     pctx->pe.previous = PSY_3GPP_BITS_TO_PE(desired_bits);
+    ctx->bitres.alloc = desired_bits;
 
     if (desired_pe < pe) {
         /* 5.6.1.3.4 "First Estimation of the reduction value" */
@@ -681,7 +725,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
             }
             desired_pe_no_ah = FFMAX(desired_pe - (pe - pe_no_ah), 0.0f);
             if (active_lines > 0.0f)
-                reduction += calc_reduction_3gpp(a, desired_pe_no_ah, pe_no_ah, active_lines);
+                reduction = calc_reduction_3gpp(a, desired_pe_no_ah, pe_no_ah, active_lines);
 
             pe = 0.0f;
             for (w = 0; w < wi->num_windows*16; w += 16) {
@@ -691,7 +735,10 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
                     if (active_lines > 0.0f)
                         band->thr = calc_reduced_thr_3gpp(band, coeffs[g].min_snr, reduction);
                     pe += calc_pe_3gpp(band);
-                    band->norm_fac = band->active_lines / band->thr;
+                    if (band->thr > 0.0f)
+                        band->norm_fac = band->active_lines / band->thr;
+                    else
+                        band->norm_fac = 0.0f;
                     norm_fac += band->norm_fac;
                 }
             }
@@ -711,7 +758,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
                         float delta_sfb_pe = band->norm_fac * norm_fac * delta_pe;
                         float thr = band->thr;
 
-                        thr *= powf(2.0f, delta_sfb_pe / band->active_lines);
+                        thr *= exp2f(delta_sfb_pe / band->active_lines);
                         if (thr > coeffs[g].min_snr * band->energy && band->avoid_holes == PSY_3GPP_AH_INACTIVE)
                             thr = FFMAX(band->thr, coeffs[g].min_snr * band->energy);
                         band->thr = thr;
@@ -742,6 +789,8 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
 
             psy_band->threshold = band->thr;
             psy_band->energy    = band->energy;
+            psy_band->spread    = band->active_lines * 2.0f / band_sizes[g];
+            psy_band->bits      = PSY_3GPP_PE_TO_BITS(band->pe);
         }
     }
 
@@ -791,6 +840,7 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
     int grouping     = 0;
     int uselongblock = 1;
     int attacks[AAC_NUM_BLOCKS_SHORT + 1] = { 0 };
+    float clippings[AAC_NUM_BLOCKS_SHORT];
     int i;
     FFPsyWindowInfo wi = { { 0 } };
 
@@ -801,21 +851,10 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
         float energy_subshort[(AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS];
         float energy_short[AAC_NUM_BLOCKS_SHORT + 1] = { 0 };
         const float *firbuf = la + (AAC_BLOCK_SIZE_SHORT/4 - PSY_LAME_FIR_LEN);
-        int j, att_sum = 0;
+        int att_sum = 0;
 
         /* LAME comment: apply high pass filter of fs/4 */
-        for (i = 0; i < AAC_BLOCK_SIZE_LONG; i++) {
-            float sum1, sum2;
-            sum1 = firbuf[i + (PSY_LAME_FIR_LEN - 1) / 2];
-            sum2 = 0.0;
-            for (j = 0; j < ((PSY_LAME_FIR_LEN - 1) / 2) - 1; j += 2) {
-                sum1 += psy_fir_coeffs[j] * (firbuf[i + j] + firbuf[i + PSY_LAME_FIR_LEN - j]);
-                sum2 += psy_fir_coeffs[j + 1] * (firbuf[i + j + 1] + firbuf[i + PSY_LAME_FIR_LEN - j - 1]);
-            }
-            /* NOTE: The LAME psymodel expects its input in the range -32768 to
-             * 32768. Tuning this for normalized floats would be difficult. */
-            hpfsmpl[i] = (sum1 + sum2) * 32768.0f;
-        }
+        psy_hp_filter(firbuf, hpfsmpl, psy_fir_coeffs);
 
         /* Calculate the energies of each sub-shortblock */
         for (i = 0; i < PSY_LAME_NUM_SUBBLOCKS; i++) {
@@ -891,14 +930,35 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
 
     lame_apply_block_type(pch, &wi, uselongblock);
 
+    /* Calculate input sample maximums and evaluate clipping risk */
+    if (audio) {
+        for (i = 0; i < AAC_NUM_BLOCKS_SHORT; i++) {
+            const float *wbuf = audio + i * AAC_BLOCK_SIZE_SHORT;
+            float max = 0;
+            int j;
+            for (j = 0; j < AAC_BLOCK_SIZE_SHORT; j++)
+                max = FFMAX(max, fabsf(wbuf[j]));
+            clippings[i] = max;
+        }
+    } else {
+        for (i = 0; i < 8; i++)
+            clippings[i] = 0;
+    }
+
     wi.window_type[1] = prev_type;
     if (wi.window_type[0] != EIGHT_SHORT_SEQUENCE) {
+        float clipping = 0.0f;
+
         wi.num_windows  = 1;
         wi.grouping[0]  = 1;
         if (wi.window_type[0] == LONG_START_SEQUENCE)
             wi.window_shape = 0;
         else
             wi.window_shape = 1;
+
+        for (i = 0; i < 8; i++)
+            clipping = FFMAX(clipping, clippings[i]);
+        wi.clipping[0] = clipping;
     } else {
         int lastgrp = 0;
 
@@ -909,6 +969,14 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
                 lastgrp = i;
             wi.grouping[lastgrp]++;
         }
+
+        for (i = 0; i < 8; i += wi.grouping[i]) {
+            int w;
+            float clipping = 0.0f;
+            for (w = 0; w < wi.grouping[i] && !clipping; w++)
+                clipping = FFMAX(clipping, clippings[i+w]);
+            wi.clipping[i] = clipping;
+        }
     }
 
     /* Determine grouping, based on the location of the first attack, and save for
diff --git a/libavcodec/aacsbr.c b/libavcodec/aacsbr.c
index b389e10817..81f1902822 100644
--- a/libavcodec/aacsbr.c
+++ b/libavcodec/aacsbr.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,263 +25,30 @@
  * AAC Spectral Band Replication decoding functions
  * @author Robert Swain ( rob opendot cl )
  */
+#define USE_FIXED 0
 
 #include "aac.h"
 #include "sbr.h"
 #include "aacsbr.h"
 #include "aacsbrdata.h"
+#include "aacsbr_tablegen.h"
 #include "fft.h"
 #include "aacps.h"
 #include "sbrdsp.h"
 #include "libavutil/internal.h"
 #include "libavutil/libm.h"
+#include "libavutil/avassert.h"
 
 #include <stdint.h>
 #include <float.h>
+#include <math.h>
 
-#define ENVELOPE_ADJUSTMENT_OFFSET 2
-#define NOISE_FLOOR_OFFSET 6.0f
-
-/**
- * SBR VLC tables
- */
-enum {
-    T_HUFFMAN_ENV_1_5DB,
-    F_HUFFMAN_ENV_1_5DB,
-    T_HUFFMAN_ENV_BAL_1_5DB,
-    F_HUFFMAN_ENV_BAL_1_5DB,
-    T_HUFFMAN_ENV_3_0DB,
-    F_HUFFMAN_ENV_3_0DB,
-    T_HUFFMAN_ENV_BAL_3_0DB,
-    F_HUFFMAN_ENV_BAL_3_0DB,
-    T_HUFFMAN_NOISE_3_0DB,
-    T_HUFFMAN_NOISE_BAL_3_0DB,
-};
-
-/**
- * bs_frame_class - frame class of current SBR frame (14496-3 sp04 p98)
- */
-enum {
-    FIXFIX,
-    FIXVAR,
-    VARFIX,
-    VARVAR,
-};
-
-enum {
-    EXTENSION_ID_PS = 2,
-};
+#if ARCH_MIPS
+#include "mips/aacsbr_mips.h"
+#endif /* ARCH_MIPS */
 
 static VLC vlc_sbr[10];
-static const int8_t vlc_sbr_lav[10] =
-    { 60, 60, 24, 24, 31, 31, 12, 12, 31, 12 };
-
-#define SBR_INIT_VLC_STATIC(num, size) \
-    INIT_VLC_STATIC(&vlc_sbr[num], 9, sbr_tmp[num].table_size / sbr_tmp[num].elem_size,     \
-                    sbr_tmp[num].sbr_bits ,                      1,                      1, \
-                    sbr_tmp[num].sbr_codes, sbr_tmp[num].elem_size, sbr_tmp[num].elem_size, \
-                    size)
-
-#define SBR_VLC_ROW(name) \
-    { name ## _codes, name ## _bits, sizeof(name ## _codes), sizeof(name ## _codes[0]) }
-
-av_cold void ff_aac_sbr_init(void)
-{
-    int n;
-    static const struct {
-        const void *sbr_codes, *sbr_bits;
-        const unsigned int table_size, elem_size;
-    } sbr_tmp[] = {
-        SBR_VLC_ROW(t_huffman_env_1_5dB),
-        SBR_VLC_ROW(f_huffman_env_1_5dB),
-        SBR_VLC_ROW(t_huffman_env_bal_1_5dB),
-        SBR_VLC_ROW(f_huffman_env_bal_1_5dB),
-        SBR_VLC_ROW(t_huffman_env_3_0dB),
-        SBR_VLC_ROW(f_huffman_env_3_0dB),
-        SBR_VLC_ROW(t_huffman_env_bal_3_0dB),
-        SBR_VLC_ROW(f_huffman_env_bal_3_0dB),
-        SBR_VLC_ROW(t_huffman_noise_3_0dB),
-        SBR_VLC_ROW(t_huffman_noise_bal_3_0dB),
-    };
-
-    // SBR VLC table initialization
-    SBR_INIT_VLC_STATIC(0, 1098);
-    SBR_INIT_VLC_STATIC(1, 1092);
-    SBR_INIT_VLC_STATIC(2, 768);
-    SBR_INIT_VLC_STATIC(3, 1026);
-    SBR_INIT_VLC_STATIC(4, 1058);
-    SBR_INIT_VLC_STATIC(5, 1052);
-    SBR_INIT_VLC_STATIC(6, 544);
-    SBR_INIT_VLC_STATIC(7, 544);
-    SBR_INIT_VLC_STATIC(8, 592);
-    SBR_INIT_VLC_STATIC(9, 512);
-
-    for (n = 1; n < 320; n++)
-        sbr_qmf_window_us[320 + n] = sbr_qmf_window_us[320 - n];
-    sbr_qmf_window_us[384] = -sbr_qmf_window_us[384];
-    sbr_qmf_window_us[512] = -sbr_qmf_window_us[512];
-
-    for (n = 0; n < 320; n++)
-        sbr_qmf_window_ds[n] = sbr_qmf_window_us[2*n];
-
-    ff_ps_init();
-}
-
-/** Places SBR in pure upsampling mode. */
-static void sbr_turnoff(SpectralBandReplication *sbr) {
-    sbr->start = 0;
-    // Init defults used in pure upsampling mode
-    sbr->kx[1] = 32; //Typo in spec, kx' inits to 32
-    sbr->m[1] = 0;
-    // Reset values for first SBR header
-    sbr->data[0].e_a[1] = sbr->data[1].e_a[1] = -1;
-    memset(&sbr->spectrum_params, -1, sizeof(SpectrumParameters));
-}
-
-av_cold void ff_aac_sbr_ctx_init(AACContext *ac, SpectralBandReplication *sbr)
-{
-    sbr->kx[0] = sbr->kx[1];
-    sbr_turnoff(sbr);
-    sbr->data[0].synthesis_filterbank_samples_offset = SBR_SYNTHESIS_BUF_SIZE - (1280 - 128);
-    sbr->data[1].synthesis_filterbank_samples_offset = SBR_SYNTHESIS_BUF_SIZE - (1280 - 128);
-    /* SBR requires samples to be scaled to +/-32768.0 to work correctly.
-     * mdct scale factors are adjusted to scale up from +/-1.0 at analysis
-     * and scale back down at synthesis. */
-    ff_mdct_init(&sbr->mdct,     7, 1, 1.0 / (64 * 32768.0));
-    ff_mdct_init(&sbr->mdct_ana, 7, 1, -2.0 * 32768.0);
-    ff_ps_ctx_init(&sbr->ps);
-    ff_sbrdsp_init(&sbr->dsp);
-}
-
-av_cold void ff_aac_sbr_ctx_close(SpectralBandReplication *sbr)
-{
-    ff_mdct_end(&sbr->mdct);
-    ff_mdct_end(&sbr->mdct_ana);
-}
-
-static int qsort_comparison_function_int16(const void *a, const void *b)
-{
-    return *(const int16_t *)a - *(const int16_t *)b;
-}
-
-static inline int in_table_int16(const int16_t *table, int last_el, int16_t needle)
-{
-    int i;
-    for (i = 0; i <= last_el; i++)
-        if (table[i] == needle)
-            return 1;
-    return 0;
-}
-
-/// Limiter Frequency Band Table (14496-3 sp04 p198)
-static void sbr_make_f_tablelim(SpectralBandReplication *sbr)
-{
-    int k;
-    if (sbr->bs_limiter_bands > 0) {
-        static const float bands_warped[3] = { 1.32715174233856803909f,   //2^(0.49/1.2)
-                                               1.18509277094158210129f,   //2^(0.49/2)
-                                               1.11987160404675912501f }; //2^(0.49/3)
-        const float lim_bands_per_octave_warped = bands_warped[sbr->bs_limiter_bands - 1];
-        int16_t patch_borders[7];
-        uint16_t *in = sbr->f_tablelim + 1, *out = sbr->f_tablelim;
-
-        patch_borders[0] = sbr->kx[1];
-        for (k = 1; k <= sbr->num_patches; k++)
-            patch_borders[k] = patch_borders[k-1] + sbr->patch_num_subbands[k-1];
-
-        memcpy(sbr->f_tablelim, sbr->f_tablelow,
-               (sbr->n[0] + 1) * sizeof(sbr->f_tablelow[0]));
-        if (sbr->num_patches > 1)
-            memcpy(sbr->f_tablelim + sbr->n[0] + 1, patch_borders + 1,
-                   (sbr->num_patches - 1) * sizeof(patch_borders[0]));
-
-        qsort(sbr->f_tablelim, sbr->num_patches + sbr->n[0],
-              sizeof(sbr->f_tablelim[0]),
-              qsort_comparison_function_int16);
-
-        sbr->n_lim = sbr->n[0] + sbr->num_patches - 1;
-        while (out < sbr->f_tablelim + sbr->n_lim) {
-            if (*in >= *out * lim_bands_per_octave_warped) {
-                *++out = *in++;
-            } else if (*in == *out ||
-                !in_table_int16(patch_borders, sbr->num_patches, *in)) {
-                in++;
-                sbr->n_lim--;
-            } else if (!in_table_int16(patch_borders, sbr->num_patches, *out)) {
-                *out = *in++;
-                sbr->n_lim--;
-            } else {
-                *++out = *in++;
-            }
-        }
-    } else {
-        sbr->f_tablelim[0] = sbr->f_tablelow[0];
-        sbr->f_tablelim[1] = sbr->f_tablelow[sbr->n[0]];
-        sbr->n_lim = 1;
-    }
-}
-
-static unsigned int read_sbr_header(SpectralBandReplication *sbr, GetBitContext *gb)
-{
-    unsigned int cnt = get_bits_count(gb);
-    uint8_t bs_header_extra_1;
-    uint8_t bs_header_extra_2;
-    int old_bs_limiter_bands = sbr->bs_limiter_bands;
-    SpectrumParameters old_spectrum_params;
-
-    sbr->start = 1;
-
-    // Save last spectrum parameters variables to compare to new ones
-    memcpy(&old_spectrum_params, &sbr->spectrum_params, sizeof(SpectrumParameters));
-
-    sbr->bs_amp_res_header              = get_bits1(gb);
-    sbr->spectrum_params.bs_start_freq  = get_bits(gb, 4);
-    sbr->spectrum_params.bs_stop_freq   = get_bits(gb, 4);
-    sbr->spectrum_params.bs_xover_band  = get_bits(gb, 3);
-                                          skip_bits(gb, 2); // bs_reserved
-
-    bs_header_extra_1 = get_bits1(gb);
-    bs_header_extra_2 = get_bits1(gb);
-
-    if (bs_header_extra_1) {
-        sbr->spectrum_params.bs_freq_scale  = get_bits(gb, 2);
-        sbr->spectrum_params.bs_alter_scale = get_bits1(gb);
-        sbr->spectrum_params.bs_noise_bands = get_bits(gb, 2);
-    } else {
-        sbr->spectrum_params.bs_freq_scale  = 2;
-        sbr->spectrum_params.bs_alter_scale = 1;
-        sbr->spectrum_params.bs_noise_bands = 2;
-    }
-
-    // Check if spectrum parameters changed
-    if (memcmp(&old_spectrum_params, &sbr->spectrum_params, sizeof(SpectrumParameters)))
-        sbr->reset = 1;
-
-    if (bs_header_extra_2) {
-        sbr->bs_limiter_bands  = get_bits(gb, 2);
-        sbr->bs_limiter_gains  = get_bits(gb, 2);
-        sbr->bs_interpol_freq  = get_bits1(gb);
-        sbr->bs_smoothing_mode = get_bits1(gb);
-    } else {
-        sbr->bs_limiter_bands  = 2;
-        sbr->bs_limiter_gains  = 2;
-        sbr->bs_interpol_freq  = 1;
-        sbr->bs_smoothing_mode = 1;
-    }
-
-    if (sbr->bs_limiter_bands != old_bs_limiter_bands && !sbr->reset)
-        sbr_make_f_tablelim(sbr);
-
-    return get_bits_count(gb) - cnt;
-}
-
-static int array_min_int16(const int16_t *array, int nel)
-{
-    int i, min = array[0];
-    for (i = 1; i < nel; i++)
-        min = FFMIN(array[i], min);
-    return min;
-}
+static void aacsbr_func_ptr_init(AACSBRContext *c);
 
 static void make_bands(int16_t* bands, int start, int stop, int num_bands)
 {
@@ -301,812 +68,6 @@ static void make_bands(int16_t* bands, int start, int stop, int num_bands)
     bands[num_bands-1] = stop - previous;
 }
 
-static int check_n_master(AVCodecContext *avctx, int n_master, int bs_xover_band)
-{
-    // Requirements (14496-3 sp04 p205)
-    if (n_master <= 0) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid n_master: %d\n", n_master);
-        return -1;
-    }
-    if (bs_xover_band >= n_master) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Invalid bitstream, crossover band index beyond array bounds: %d\n",
-               bs_xover_band);
-        return -1;
-    }
-    return 0;
-}
-
-/// Master Frequency Band Table (14496-3 sp04 p194)
-static int sbr_make_f_master(AACContext *ac, SpectralBandReplication *sbr,
-                             SpectrumParameters *spectrum)
-{
-    unsigned int temp, max_qmf_subbands = 0;
-    unsigned int start_min, stop_min;
-    int k;
-    const int8_t *sbr_offset_ptr;
-    int16_t stop_dk[13];
-
-    if (sbr->sample_rate < 32000) {
-        temp = 3000;
-    } else if (sbr->sample_rate < 64000) {
-        temp = 4000;
-    } else
-        temp = 5000;
-
-    start_min = ((temp << 7) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
-    stop_min  = ((temp << 8) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
-
-    switch (sbr->sample_rate) {
-    case 16000:
-        sbr_offset_ptr = sbr_offset[0];
-        break;
-    case 22050:
-        sbr_offset_ptr = sbr_offset[1];
-        break;
-    case 24000:
-        sbr_offset_ptr = sbr_offset[2];
-        break;
-    case 32000:
-        sbr_offset_ptr = sbr_offset[3];
-        break;
-    case 44100: case 48000: case 64000:
-        sbr_offset_ptr = sbr_offset[4];
-        break;
-    case 88200: case 96000: case 128000: case 176400: case 192000:
-        sbr_offset_ptr = sbr_offset[5];
-        break;
-    default:
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Unsupported sample rate for SBR: %d\n", sbr->sample_rate);
-        return -1;
-    }
-
-    sbr->k[0] = start_min + sbr_offset_ptr[spectrum->bs_start_freq];
-
-    if (spectrum->bs_stop_freq < 14) {
-        sbr->k[2] = stop_min;
-        make_bands(stop_dk, stop_min, 64, 13);
-        qsort(stop_dk, 13, sizeof(stop_dk[0]), qsort_comparison_function_int16);
-        for (k = 0; k < spectrum->bs_stop_freq; k++)
-            sbr->k[2] += stop_dk[k];
-    } else if (spectrum->bs_stop_freq == 14) {
-        sbr->k[2] = 2*sbr->k[0];
-    } else if (spectrum->bs_stop_freq == 15) {
-        sbr->k[2] = 3*sbr->k[0];
-    } else {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Invalid bs_stop_freq: %d\n", spectrum->bs_stop_freq);
-        return -1;
-    }
-    sbr->k[2] = FFMIN(64, sbr->k[2]);
-
-    // Requirements (14496-3 sp04 p205)
-    if (sbr->sample_rate <= 32000) {
-        max_qmf_subbands = 48;
-    } else if (sbr->sample_rate == 44100) {
-        max_qmf_subbands = 35;
-    } else if (sbr->sample_rate >= 48000)
-        max_qmf_subbands = 32;
-
-    if (sbr->k[2] - sbr->k[0] > max_qmf_subbands) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Invalid bitstream, too many QMF subbands: %d\n", sbr->k[2] - sbr->k[0]);
-        return -1;
-    }
-
-    if (!spectrum->bs_freq_scale) {
-        int dk, k2diff;
-
-        dk = spectrum->bs_alter_scale + 1;
-        sbr->n_master = ((sbr->k[2] - sbr->k[0] + (dk&2)) >> dk) << 1;
-        if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
-            return -1;
-
-        for (k = 1; k <= sbr->n_master; k++)
-            sbr->f_master[k] = dk;
-
-        k2diff = sbr->k[2] - sbr->k[0] - sbr->n_master * dk;
-        if (k2diff < 0) {
-            sbr->f_master[1]--;
-            sbr->f_master[2]-= (k2diff < -1);
-        } else if (k2diff) {
-            sbr->f_master[sbr->n_master]++;
-        }
-
-        sbr->f_master[0] = sbr->k[0];
-        for (k = 1; k <= sbr->n_master; k++)
-            sbr->f_master[k] += sbr->f_master[k - 1];
-
-    } else {
-        int half_bands = 7 - spectrum->bs_freq_scale;      // bs_freq_scale  = {1,2,3}
-        int two_regions, num_bands_0;
-        int vdk0_max, vdk1_min;
-        int16_t vk0[49];
-
-        if (49 * sbr->k[2] > 110 * sbr->k[0]) {
-            two_regions = 1;
-            sbr->k[1] = 2 * sbr->k[0];
-        } else {
-            two_regions = 0;
-            sbr->k[1] = sbr->k[2];
-        }
-
-        num_bands_0 = lrintf(half_bands * log2f(sbr->k[1] / (float)sbr->k[0])) * 2;
-
-        if (num_bands_0 <= 0) { // Requirements (14496-3 sp04 p205)
-            av_log(ac->avctx, AV_LOG_ERROR, "Invalid num_bands_0: %d\n", num_bands_0);
-            return -1;
-        }
-
-        vk0[0] = 0;
-
-        make_bands(vk0+1, sbr->k[0], sbr->k[1], num_bands_0);
-
-        qsort(vk0 + 1, num_bands_0, sizeof(vk0[1]), qsort_comparison_function_int16);
-        vdk0_max = vk0[num_bands_0];
-
-        vk0[0] = sbr->k[0];
-        for (k = 1; k <= num_bands_0; k++) {
-            if (vk0[k] <= 0) { // Requirements (14496-3 sp04 p205)
-                av_log(ac->avctx, AV_LOG_ERROR, "Invalid vDk0[%d]: %d\n", k, vk0[k]);
-                return -1;
-            }
-            vk0[k] += vk0[k-1];
-        }
-
-        if (two_regions) {
-            int16_t vk1[49];
-            float invwarp = spectrum->bs_alter_scale ? 0.76923076923076923077f
-                                                     : 1.0f; // bs_alter_scale = {0,1}
-            int num_bands_1 = lrintf(half_bands * invwarp *
-                                     log2f(sbr->k[2] / (float)sbr->k[1])) * 2;
-
-            make_bands(vk1+1, sbr->k[1], sbr->k[2], num_bands_1);
-
-            vdk1_min = array_min_int16(vk1 + 1, num_bands_1);
-
-            if (vdk1_min < vdk0_max) {
-                int change;
-                qsort(vk1 + 1, num_bands_1, sizeof(vk1[1]), qsort_comparison_function_int16);
-                change = FFMIN(vdk0_max - vk1[1], (vk1[num_bands_1] - vk1[1]) >> 1);
-                vk1[1]           += change;
-                vk1[num_bands_1] -= change;
-            }
-
-            qsort(vk1 + 1, num_bands_1, sizeof(vk1[1]), qsort_comparison_function_int16);
-
-            vk1[0] = sbr->k[1];
-            for (k = 1; k <= num_bands_1; k++) {
-                if (vk1[k] <= 0) { // Requirements (14496-3 sp04 p205)
-                    av_log(ac->avctx, AV_LOG_ERROR, "Invalid vDk1[%d]: %d\n", k, vk1[k]);
-                    return -1;
-                }
-                vk1[k] += vk1[k-1];
-            }
-
-            sbr->n_master = num_bands_0 + num_bands_1;
-            if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
-                return -1;
-            memcpy(&sbr->f_master[0],               vk0,
-                   (num_bands_0 + 1) * sizeof(sbr->f_master[0]));
-            memcpy(&sbr->f_master[num_bands_0 + 1], vk1 + 1,
-                    num_bands_1      * sizeof(sbr->f_master[0]));
-
-        } else {
-            sbr->n_master = num_bands_0;
-            if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
-                return -1;
-            memcpy(sbr->f_master, vk0, (num_bands_0 + 1) * sizeof(sbr->f_master[0]));
-        }
-    }
-
-    return 0;
-}
-
-/// High Frequency Generation - Patch Construction (14496-3 sp04 p216 fig. 4.46)
-static int sbr_hf_calc_npatches(AACContext *ac, SpectralBandReplication *sbr)
-{
-    int i, k, sb = 0;
-    int msb = sbr->k[0];
-    int usb = sbr->kx[1];
-    int goal_sb = ((1000 << 11) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
-
-    sbr->num_patches = 0;
-
-    if (goal_sb < sbr->kx[1] + sbr->m[1]) {
-        for (k = 0; sbr->f_master[k] < goal_sb; k++) ;
-    } else
-        k = sbr->n_master;
-
-    do {
-        int odd = 0;
-        for (i = k; i == k || sb > (sbr->k[0] - 1 + msb - odd); i--) {
-            sb = sbr->f_master[i];
-            odd = (sb + sbr->k[0]) & 1;
-        }
-
-        // Requirements (14496-3 sp04 p205) sets the maximum number of patches to 5.
-        // After this check the final number of patches can still be six which is
-        // illegal however the Coding Technologies decoder check stream has a final
-        // count of 6 patches
-        if (sbr->num_patches > 5) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Too many patches: %d\n", sbr->num_patches);
-            return -1;
-        }
-
-        sbr->patch_num_subbands[sbr->num_patches]  = FFMAX(sb - usb, 0);
-        sbr->patch_start_subband[sbr->num_patches] = sbr->k[0] - odd - sbr->patch_num_subbands[sbr->num_patches];
-
-        if (sbr->patch_num_subbands[sbr->num_patches] > 0) {
-            usb = sb;
-            msb = sb;
-            sbr->num_patches++;
-        } else
-            msb = sbr->kx[1];
-
-        if (sbr->f_master[k] - sb < 3)
-            k = sbr->n_master;
-    } while (sb != sbr->kx[1] + sbr->m[1]);
-
-    if (sbr->num_patches > 1 &&
-        sbr->patch_num_subbands[sbr->num_patches - 1] < 3)
-        sbr->num_patches--;
-
-    return 0;
-}
-
-/// Derived Frequency Band Tables (14496-3 sp04 p197)
-static int sbr_make_f_derived(AACContext *ac, SpectralBandReplication *sbr)
-{
-    int k, temp;
-
-    sbr->n[1] = sbr->n_master - sbr->spectrum_params.bs_xover_band;
-    sbr->n[0] = (sbr->n[1] + 1) >> 1;
-
-    memcpy(sbr->f_tablehigh, &sbr->f_master[sbr->spectrum_params.bs_xover_band],
-           (sbr->n[1] + 1) * sizeof(sbr->f_master[0]));
-    sbr->m[1] = sbr->f_tablehigh[sbr->n[1]] - sbr->f_tablehigh[0];
-    sbr->kx[1] = sbr->f_tablehigh[0];
-
-    // Requirements (14496-3 sp04 p205)
-    if (sbr->kx[1] + sbr->m[1] > 64) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Stop frequency border too high: %d\n", sbr->kx[1] + sbr->m[1]);
-        return -1;
-    }
-    if (sbr->kx[1] > 32) {
-        av_log(ac->avctx, AV_LOG_ERROR, "Start frequency border too high: %d\n", sbr->kx[1]);
-        return -1;
-    }
-
-    sbr->f_tablelow[0] = sbr->f_tablehigh[0];
-    temp = sbr->n[1] & 1;
-    for (k = 1; k <= sbr->n[0]; k++)
-        sbr->f_tablelow[k] = sbr->f_tablehigh[2 * k - temp];
-
-    sbr->n_q = FFMAX(1, lrintf(sbr->spectrum_params.bs_noise_bands *
-                               log2f(sbr->k[2] / (float)sbr->kx[1]))); // 0 <= bs_noise_bands <= 3
-    if (sbr->n_q > 5) {
-        av_log(ac->avctx, AV_LOG_ERROR, "Too many noise floor scale factors: %d\n", sbr->n_q);
-        return -1;
-    }
-
-    sbr->f_tablenoise[0] = sbr->f_tablelow[0];
-    temp = 0;
-    for (k = 1; k <= sbr->n_q; k++) {
-        temp += (sbr->n[0] - temp) / (sbr->n_q + 1 - k);
-        sbr->f_tablenoise[k] = sbr->f_tablelow[temp];
-    }
-
-    if (sbr_hf_calc_npatches(ac, sbr) < 0)
-        return -1;
-
-    sbr_make_f_tablelim(sbr);
-
-    sbr->data[0].f_indexnoise = 0;
-    sbr->data[1].f_indexnoise = 0;
-
-    return 0;
-}
-
-static av_always_inline void get_bits1_vector(GetBitContext *gb, uint8_t *vec,
-                                              int elements)
-{
-    int i;
-    for (i = 0; i < elements; i++) {
-        vec[i] = get_bits1(gb);
-    }
-}
-
-/** ceil(log2(index+1)) */
-static const int8_t ceil_log2[] = {
-    0, 1, 2, 2, 3, 3,
-};
-
-static int read_sbr_grid(AACContext *ac, SpectralBandReplication *sbr,
-                         GetBitContext *gb, SBRData *ch_data)
-{
-    int i;
-    int bs_pointer = 0;
-    // frameLengthFlag ? 15 : 16; 960 sample length frames unsupported; this value is numTimeSlots
-    int abs_bord_trail = 16;
-    int num_rel_lead, num_rel_trail;
-    unsigned bs_num_env_old = ch_data->bs_num_env;
-
-    ch_data->bs_freq_res[0] = ch_data->bs_freq_res[ch_data->bs_num_env];
-    ch_data->bs_amp_res = sbr->bs_amp_res_header;
-    ch_data->t_env_num_env_old = ch_data->t_env[bs_num_env_old];
-
-    switch (ch_data->bs_frame_class = get_bits(gb, 2)) {
-    case FIXFIX:
-        ch_data->bs_num_env                 = 1 << get_bits(gb, 2);
-        num_rel_lead                        = ch_data->bs_num_env - 1;
-        if (ch_data->bs_num_env == 1)
-            ch_data->bs_amp_res = 0;
-
-        if (ch_data->bs_num_env > 4) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "Invalid bitstream, too many SBR envelopes in FIXFIX type SBR frame: %d\n",
-                   ch_data->bs_num_env);
-            return -1;
-        }
-
-        ch_data->t_env[0]                   = 0;
-        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
-
-        abs_bord_trail = (abs_bord_trail + (ch_data->bs_num_env >> 1)) /
-                   ch_data->bs_num_env;
-        for (i = 0; i < num_rel_lead; i++)
-            ch_data->t_env[i + 1] = ch_data->t_env[i] + abs_bord_trail;
-
-        ch_data->bs_freq_res[1] = get_bits1(gb);
-        for (i = 1; i < ch_data->bs_num_env; i++)
-            ch_data->bs_freq_res[i + 1] = ch_data->bs_freq_res[1];
-        break;
-    case FIXVAR:
-        abs_bord_trail                     += get_bits(gb, 2);
-        num_rel_trail                       = get_bits(gb, 2);
-        ch_data->bs_num_env                 = num_rel_trail + 1;
-        ch_data->t_env[0]                   = 0;
-        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
-
-        for (i = 0; i < num_rel_trail; i++)
-            ch_data->t_env[ch_data->bs_num_env - 1 - i] =
-                ch_data->t_env[ch_data->bs_num_env - i] - 2 * get_bits(gb, 2) - 2;
-
-        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
-
-        for (i = 0; i < ch_data->bs_num_env; i++)
-            ch_data->bs_freq_res[ch_data->bs_num_env - i] = get_bits1(gb);
-        break;
-    case VARFIX:
-        ch_data->t_env[0]                   = get_bits(gb, 2);
-        num_rel_lead                        = get_bits(gb, 2);
-        ch_data->bs_num_env                 = num_rel_lead + 1;
-        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
-
-        for (i = 0; i < num_rel_lead; i++)
-            ch_data->t_env[i + 1] = ch_data->t_env[i] + 2 * get_bits(gb, 2) + 2;
-
-        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
-
-        get_bits1_vector(gb, ch_data->bs_freq_res + 1, ch_data->bs_num_env);
-        break;
-    case VARVAR:
-        ch_data->t_env[0]                   = get_bits(gb, 2);
-        abs_bord_trail                     += get_bits(gb, 2);
-        num_rel_lead                        = get_bits(gb, 2);
-        num_rel_trail                       = get_bits(gb, 2);
-        ch_data->bs_num_env                 = num_rel_lead + num_rel_trail + 1;
-
-        if (ch_data->bs_num_env > 5) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "Invalid bitstream, too many SBR envelopes in VARVAR type SBR frame: %d\n",
-                   ch_data->bs_num_env);
-            return -1;
-        }
-
-        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
-
-        for (i = 0; i < num_rel_lead; i++)
-            ch_data->t_env[i + 1] = ch_data->t_env[i] + 2 * get_bits(gb, 2) + 2;
-        for (i = 0; i < num_rel_trail; i++)
-            ch_data->t_env[ch_data->bs_num_env - 1 - i] =
-                ch_data->t_env[ch_data->bs_num_env - i] - 2 * get_bits(gb, 2) - 2;
-
-        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
-
-        get_bits1_vector(gb, ch_data->bs_freq_res + 1, ch_data->bs_num_env);
-        break;
-    }
-
-    if (bs_pointer < 0 || bs_pointer > ch_data->bs_num_env + 1) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Invalid bitstream, bs_pointer points to a middle noise border outside the time borders table: %d\n",
-               bs_pointer);
-        return -1;
-    }
-
-    for (i = 1; i <= ch_data->bs_num_env; i++) {
-        if (ch_data->t_env[i-1] > ch_data->t_env[i]) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Non monotone time borders\n");
-            return -1;
-        }
-    }
-
-    ch_data->bs_num_noise = (ch_data->bs_num_env > 1) + 1;
-
-    ch_data->t_q[0]                     = ch_data->t_env[0];
-    ch_data->t_q[ch_data->bs_num_noise] = ch_data->t_env[ch_data->bs_num_env];
-    if (ch_data->bs_num_noise > 1) {
-        int idx;
-        if (ch_data->bs_frame_class == FIXFIX) {
-            idx = ch_data->bs_num_env >> 1;
-        } else if (ch_data->bs_frame_class & 1) { // FIXVAR or VARVAR
-            idx = ch_data->bs_num_env - FFMAX(bs_pointer - 1, 1);
-        } else { // VARFIX
-            if (!bs_pointer)
-                idx = 1;
-            else if (bs_pointer == 1)
-                idx = ch_data->bs_num_env - 1;
-            else // bs_pointer > 1
-                idx = bs_pointer - 1;
-        }
-        ch_data->t_q[1] = ch_data->t_env[idx];
-    }
-
-    ch_data->e_a[0] = -(ch_data->e_a[1] != bs_num_env_old); // l_APrev
-    ch_data->e_a[1] = -1;
-    if ((ch_data->bs_frame_class & 1) && bs_pointer) { // FIXVAR or VARVAR and bs_pointer != 0
-        ch_data->e_a[1] = ch_data->bs_num_env + 1 - bs_pointer;
-    } else if ((ch_data->bs_frame_class == 2) && (bs_pointer > 1)) // VARFIX and bs_pointer > 1
-        ch_data->e_a[1] = bs_pointer - 1;
-
-    return 0;
-}
-
-static void copy_sbr_grid(SBRData *dst, const SBRData *src) {
-    //These variables are saved from the previous frame rather than copied
-    dst->bs_freq_res[0]    = dst->bs_freq_res[dst->bs_num_env];
-    dst->t_env_num_env_old = dst->t_env[dst->bs_num_env];
-    dst->e_a[0]            = -(dst->e_a[1] != dst->bs_num_env);
-
-    //These variables are read from the bitstream and therefore copied
-    memcpy(dst->bs_freq_res+1, src->bs_freq_res+1, sizeof(dst->bs_freq_res)-sizeof(*dst->bs_freq_res));
-    memcpy(dst->t_env,         src->t_env,         sizeof(dst->t_env));
-    memcpy(dst->t_q,           src->t_q,           sizeof(dst->t_q));
-    dst->bs_num_env        = src->bs_num_env;
-    dst->bs_amp_res        = src->bs_amp_res;
-    dst->bs_num_noise      = src->bs_num_noise;
-    dst->bs_frame_class    = src->bs_frame_class;
-    dst->e_a[1]            = src->e_a[1];
-}
-
-/// Read how the envelope and noise floor data is delta coded
-static void read_sbr_dtdf(SpectralBandReplication *sbr, GetBitContext *gb,
-                          SBRData *ch_data)
-{
-    get_bits1_vector(gb, ch_data->bs_df_env,   ch_data->bs_num_env);
-    get_bits1_vector(gb, ch_data->bs_df_noise, ch_data->bs_num_noise);
-}
-
-/// Read inverse filtering data
-static void read_sbr_invf(SpectralBandReplication *sbr, GetBitContext *gb,
-                          SBRData *ch_data)
-{
-    int i;
-
-    memcpy(ch_data->bs_invf_mode[1], ch_data->bs_invf_mode[0], 5 * sizeof(uint8_t));
-    for (i = 0; i < sbr->n_q; i++)
-        ch_data->bs_invf_mode[0][i] = get_bits(gb, 2);
-}
-
-static void read_sbr_envelope(SpectralBandReplication *sbr, GetBitContext *gb,
-                              SBRData *ch_data, int ch)
-{
-    int bits;
-    int i, j, k;
-    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
-    int t_lav, f_lav;
-    const int delta = (ch == 1 && sbr->bs_coupling == 1) + 1;
-    const int odd = sbr->n[1] & 1;
-
-    if (sbr->bs_coupling && ch) {
-        if (ch_data->bs_amp_res) {
-            bits   = 5;
-            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_3_0DB].table;
-            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_3_0DB];
-            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
-            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
-        } else {
-            bits   = 6;
-            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_1_5DB].table;
-            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_1_5DB];
-            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_1_5DB].table;
-            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_1_5DB];
-        }
-    } else {
-        if (ch_data->bs_amp_res) {
-            bits   = 6;
-            t_huff = vlc_sbr[T_HUFFMAN_ENV_3_0DB].table;
-            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_3_0DB];
-            f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
-            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
-        } else {
-            bits   = 7;
-            t_huff = vlc_sbr[T_HUFFMAN_ENV_1_5DB].table;
-            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_1_5DB];
-            f_huff = vlc_sbr[F_HUFFMAN_ENV_1_5DB].table;
-            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_1_5DB];
-        }
-    }
-
-    for (i = 0; i < ch_data->bs_num_env; i++) {
-        if (ch_data->bs_df_env[i]) {
-            // bs_freq_res[0] == bs_freq_res[bs_num_env] from prev frame
-            if (ch_data->bs_freq_res[i + 1] == ch_data->bs_freq_res[i]) {
-                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++)
-                    ch_data->env_facs[i + 1][j] = ch_data->env_facs[i][j] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
-            } else if (ch_data->bs_freq_res[i + 1]) {
-                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
-                    k = (j + odd) >> 1; // find k such that f_tablelow[k] <= f_tablehigh[j] < f_tablelow[k + 1]
-                    ch_data->env_facs[i + 1][j] = ch_data->env_facs[i][k] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
-                }
-            } else {
-                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
-                    k = j ? 2*j - odd : 0; // find k such that f_tablehigh[k] == f_tablelow[j]
-                    ch_data->env_facs[i + 1][j] = ch_data->env_facs[i][k] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
-                }
-            }
-        } else {
-            ch_data->env_facs[i + 1][0] = delta * get_bits(gb, bits); // bs_env_start_value_balance
-            for (j = 1; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++)
-                ch_data->env_facs[i + 1][j] = ch_data->env_facs[i + 1][j - 1] + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
-        }
-    }
-
-    //assign 0th elements of env_facs from last elements
-    memcpy(ch_data->env_facs[0], ch_data->env_facs[ch_data->bs_num_env],
-           sizeof(ch_data->env_facs[0]));
-}
-
-static void read_sbr_noise(SpectralBandReplication *sbr, GetBitContext *gb,
-                           SBRData *ch_data, int ch)
-{
-    int i, j;
-    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
-    int t_lav, f_lav;
-    int delta = (ch == 1 && sbr->bs_coupling == 1) + 1;
-
-    if (sbr->bs_coupling && ch) {
-        t_huff = vlc_sbr[T_HUFFMAN_NOISE_BAL_3_0DB].table;
-        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_BAL_3_0DB];
-        f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
-        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
-    } else {
-        t_huff = vlc_sbr[T_HUFFMAN_NOISE_3_0DB].table;
-        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_3_0DB];
-        f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
-        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
-    }
-
-    for (i = 0; i < ch_data->bs_num_noise; i++) {
-        if (ch_data->bs_df_noise[i]) {
-            for (j = 0; j < sbr->n_q; j++)
-                ch_data->noise_facs[i + 1][j] = ch_data->noise_facs[i][j] + delta * (get_vlc2(gb, t_huff, 9, 2) - t_lav);
-        } else {
-            ch_data->noise_facs[i + 1][0] = delta * get_bits(gb, 5); // bs_noise_start_value_balance or bs_noise_start_value_level
-            for (j = 1; j < sbr->n_q; j++)
-                ch_data->noise_facs[i + 1][j] = ch_data->noise_facs[i + 1][j - 1] + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
-        }
-    }
-
-    //assign 0th elements of noise_facs from last elements
-    memcpy(ch_data->noise_facs[0], ch_data->noise_facs[ch_data->bs_num_noise],
-           sizeof(ch_data->noise_facs[0]));
-}
-
-static void read_sbr_extension(AACContext *ac, SpectralBandReplication *sbr,
-                               GetBitContext *gb,
-                               int bs_extension_id, int *num_bits_left)
-{
-    switch (bs_extension_id) {
-    case EXTENSION_ID_PS:
-        if (!ac->oc[1].m4ac.ps) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Parametric Stereo signaled to be not-present but was found in the bitstream.\n");
-            skip_bits_long(gb, *num_bits_left); // bs_fill_bits
-            *num_bits_left = 0;
-        } else {
-#if 1
-            *num_bits_left -= ff_ps_read_data(ac->avctx, gb, &sbr->ps, *num_bits_left);
-            ac->avctx->profile = FF_PROFILE_AAC_HE_V2;
-#else
-            avpriv_report_missing_feature(ac->avctx, "Parametric Stereo");
-            skip_bits_long(gb, *num_bits_left); // bs_fill_bits
-            *num_bits_left = 0;
-#endif
-        }
-        break;
-    default:
-        // some files contain 0-padding
-        if (bs_extension_id || *num_bits_left > 16 || show_bits(gb, *num_bits_left))
-            avpriv_request_sample(ac->avctx, "Reserved SBR extensions");
-        skip_bits_long(gb, *num_bits_left); // bs_fill_bits
-        *num_bits_left = 0;
-        break;
-    }
-}
-
-static int read_sbr_single_channel_element(AACContext *ac,
-                                            SpectralBandReplication *sbr,
-                                            GetBitContext *gb)
-{
-    if (get_bits1(gb)) // bs_data_extra
-        skip_bits(gb, 4); // bs_reserved
-
-    if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
-        return -1;
-    read_sbr_dtdf(sbr, gb, &sbr->data[0]);
-    read_sbr_invf(sbr, gb, &sbr->data[0]);
-    read_sbr_envelope(sbr, gb, &sbr->data[0], 0);
-    read_sbr_noise(sbr, gb, &sbr->data[0], 0);
-
-    if ((sbr->data[0].bs_add_harmonic_flag = get_bits1(gb)))
-        get_bits1_vector(gb, sbr->data[0].bs_add_harmonic, sbr->n[1]);
-
-    return 0;
-}
-
-static int read_sbr_channel_pair_element(AACContext *ac,
-                                          SpectralBandReplication *sbr,
-                                          GetBitContext *gb)
-{
-    if (get_bits1(gb))    // bs_data_extra
-        skip_bits(gb, 8); // bs_reserved
-
-    if ((sbr->bs_coupling = get_bits1(gb))) {
-        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
-            return -1;
-        copy_sbr_grid(&sbr->data[1], &sbr->data[0]);
-        read_sbr_dtdf(sbr, gb, &sbr->data[0]);
-        read_sbr_dtdf(sbr, gb, &sbr->data[1]);
-        read_sbr_invf(sbr, gb, &sbr->data[0]);
-        memcpy(sbr->data[1].bs_invf_mode[1], sbr->data[1].bs_invf_mode[0], sizeof(sbr->data[1].bs_invf_mode[0]));
-        memcpy(sbr->data[1].bs_invf_mode[0], sbr->data[0].bs_invf_mode[0], sizeof(sbr->data[1].bs_invf_mode[0]));
-        read_sbr_envelope(sbr, gb, &sbr->data[0], 0);
-        read_sbr_noise(sbr, gb, &sbr->data[0], 0);
-        read_sbr_envelope(sbr, gb, &sbr->data[1], 1);
-        read_sbr_noise(sbr, gb, &sbr->data[1], 1);
-    } else {
-        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]) ||
-            read_sbr_grid(ac, sbr, gb, &sbr->data[1]))
-            return -1;
-        read_sbr_dtdf(sbr, gb, &sbr->data[0]);
-        read_sbr_dtdf(sbr, gb, &sbr->data[1]);
-        read_sbr_invf(sbr, gb, &sbr->data[0]);
-        read_sbr_invf(sbr, gb, &sbr->data[1]);
-        read_sbr_envelope(sbr, gb, &sbr->data[0], 0);
-        read_sbr_envelope(sbr, gb, &sbr->data[1], 1);
-        read_sbr_noise(sbr, gb, &sbr->data[0], 0);
-        read_sbr_noise(sbr, gb, &sbr->data[1], 1);
-    }
-
-    if ((sbr->data[0].bs_add_harmonic_flag = get_bits1(gb)))
-        get_bits1_vector(gb, sbr->data[0].bs_add_harmonic, sbr->n[1]);
-    if ((sbr->data[1].bs_add_harmonic_flag = get_bits1(gb)))
-        get_bits1_vector(gb, sbr->data[1].bs_add_harmonic, sbr->n[1]);
-
-    return 0;
-}
-
-static unsigned int read_sbr_data(AACContext *ac, SpectralBandReplication *sbr,
-                                  GetBitContext *gb, int id_aac)
-{
-    unsigned int cnt = get_bits_count(gb);
-
-    if (id_aac == TYPE_SCE || id_aac == TYPE_CCE) {
-        if (read_sbr_single_channel_element(ac, sbr, gb)) {
-            sbr_turnoff(sbr);
-            return get_bits_count(gb) - cnt;
-        }
-    } else if (id_aac == TYPE_CPE) {
-        if (read_sbr_channel_pair_element(ac, sbr, gb)) {
-            sbr_turnoff(sbr);
-            return get_bits_count(gb) - cnt;
-        }
-    } else {
-        av_log(ac->avctx, AV_LOG_ERROR,
-            "Invalid bitstream - cannot apply SBR to element type %d\n", id_aac);
-        sbr_turnoff(sbr);
-        return get_bits_count(gb) - cnt;
-    }
-    if (get_bits1(gb)) { // bs_extended_data
-        int num_bits_left = get_bits(gb, 4); // bs_extension_size
-        if (num_bits_left == 15)
-            num_bits_left += get_bits(gb, 8); // bs_esc_count
-
-        num_bits_left <<= 3;
-        while (num_bits_left > 7) {
-            num_bits_left -= 2;
-            read_sbr_extension(ac, sbr, gb, get_bits(gb, 2), &num_bits_left); // bs_extension_id
-        }
-        if (num_bits_left < 0) {
-            av_log(ac->avctx, AV_LOG_ERROR, "SBR Extension over read.\n");
-        }
-        if (num_bits_left > 0)
-            skip_bits(gb, num_bits_left);
-    }
-
-    return get_bits_count(gb) - cnt;
-}
-
-static void sbr_reset(AACContext *ac, SpectralBandReplication *sbr)
-{
-    int err;
-    err = sbr_make_f_master(ac, sbr, &sbr->spectrum_params);
-    if (err >= 0)
-        err = sbr_make_f_derived(ac, sbr);
-    if (err < 0) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "SBR reset failed. Switching SBR to pure upsampling mode.\n");
-        sbr_turnoff(sbr);
-    }
-}
-
-/**
- * Decode Spectral Band Replication extension data; reference: table 4.55.
- *
- * @param   crc flag indicating the presence of CRC checksum
- * @param   cnt length of TYPE_FIL syntactic element in bytes
- *
- * @return  Returns number of bytes consumed from the TYPE_FIL element.
- */
-int ff_decode_sbr_extension(AACContext *ac, SpectralBandReplication *sbr,
-                            GetBitContext *gb_host, int crc, int cnt, int id_aac)
-{
-    unsigned int num_sbr_bits = 0, num_align_bits;
-    unsigned bytes_read;
-    GetBitContext gbc = *gb_host, *gb = &gbc;
-    skip_bits_long(gb_host, cnt*8 - 4);
-
-    sbr->reset = 0;
-
-    if (!sbr->sample_rate)
-        sbr->sample_rate = 2 * ac->oc[1].m4ac.sample_rate; //TODO use the nominal sample rate for arbitrary sample rate support
-    if (!ac->oc[1].m4ac.ext_sample_rate)
-        ac->oc[1].m4ac.ext_sample_rate = 2 * ac->oc[1].m4ac.sample_rate;
-
-    if (crc) {
-        skip_bits(gb, 10); // bs_sbr_crc_bits; TODO - implement CRC check
-        num_sbr_bits += 10;
-    }
-
-    //Save some state from the previous frame.
-    sbr->kx[0] = sbr->kx[1];
-    sbr->m[0] = sbr->m[1];
-    sbr->kx_and_m_pushed = 1;
-
-    num_sbr_bits++;
-    if (get_bits1(gb)) // bs_header_flag
-        num_sbr_bits += read_sbr_header(sbr, gb);
-
-    if (sbr->reset)
-        sbr_reset(ac, sbr);
-
-    if (sbr->start)
-        num_sbr_bits  += read_sbr_data(ac, sbr, gb, id_aac);
-
-    num_align_bits = ((cnt << 3) - 4 - num_sbr_bits) & 7;
-    bytes_read = ((num_sbr_bits + num_align_bits + 4) >> 3);
-
-    if (bytes_read > cnt) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Expected to read %d SBR bytes actually read %d.\n", cnt, bytes_read);
-    }
-    return cnt;
-}
-
 /// Dequantization and stereo decoding (14496-3 sp04 p203)
 static void sbr_dequant(SpectralBandReplication *sbr, int id_aac)
 {
@@ -1120,7 +81,12 @@ static void sbr_dequant(SpectralBandReplication *sbr, int id_aac)
             for (k = 0; k < sbr->n[sbr->data[0].bs_freq_res[e]]; k++) {
                 float temp1 = exp2f(sbr->data[0].env_facs[e][k] * alpha + 7.0f);
                 float temp2 = exp2f((pan_offset - sbr->data[1].env_facs[e][k]) * alpha);
-                float fac   = temp1 / (1.0f + temp2);
+                float fac;
+                if (temp1 > 1E20) {
+                    av_log(NULL, AV_LOG_ERROR, "envelope scalefactor overflow in dequant\n");
+                    temp1 = 1;
+                }
+                fac   = temp1 / (1.0f + temp2);
                 sbr->data[0].env_facs[e][k] = fac;
                 sbr->data[1].env_facs[e][k] = fac * temp2;
             }
@@ -1129,7 +95,12 @@ static void sbr_dequant(SpectralBandReplication *sbr, int id_aac)
             for (k = 0; k < sbr->n_q; k++) {
                 float temp1 = exp2f(NOISE_FLOOR_OFFSET - sbr->data[0].noise_facs[e][k] + 1);
                 float temp2 = exp2f(12 - sbr->data[1].noise_facs[e][k]);
-                float fac   = temp1 / (1.0f + temp2);
+                float fac;
+                if (temp1 > 1E20) {
+                    av_log(NULL, AV_LOG_ERROR, "envelope scalefactor overflow in dequant\n");
+                    temp1 = 1;
+                }
+                fac = temp1 / (1.0f + temp2);
                 sbr->data[0].noise_facs[e][k] = fac;
                 sbr->data[1].noise_facs[e][k] = fac * temp2;
             }
@@ -1138,9 +109,15 @@ static void sbr_dequant(SpectralBandReplication *sbr, int id_aac)
         for (ch = 0; ch < (id_aac == TYPE_CPE) + 1; ch++) {
             float alpha = sbr->data[ch].bs_amp_res ? 1.0f : 0.5f;
             for (e = 1; e <= sbr->data[ch].bs_num_env; e++)
-                for (k = 0; k < sbr->n[sbr->data[ch].bs_freq_res[e]]; k++)
+                for (k = 0; k < sbr->n[sbr->data[ch].bs_freq_res[e]]; k++){
                     sbr->data[ch].env_facs[e][k] =
                         exp2f(alpha * sbr->data[ch].env_facs[e][k] + 6.0f);
+                    if (sbr->data[ch].env_facs[e][k] > 1E20) {
+                        av_log(NULL, AV_LOG_ERROR, "envelope scalefactor overflow in dequant\n");
+                        sbr->data[ch].env_facs[e][k] = 1;
+                    }
+                }
+
             for (e = 1; e <= sbr->data[ch].bs_num_noise; e++)
                 for (k = 0; k < sbr->n_q; k++)
                     sbr->data[ch].noise_facs[e][k] =
@@ -1149,80 +126,6 @@ static void sbr_dequant(SpectralBandReplication *sbr, int id_aac)
     }
 }
 
-/**
- * Analysis QMF Bank (14496-3 sp04 p206)
- *
- * @param   x       pointer to the beginning of the first sample window
- * @param   W       array of complex-valued samples split into subbands
- */
-static void sbr_qmf_analysis(AVFloatDSPContext *dsp, FFTContext *mdct,
-                             SBRDSPContext *sbrdsp, const float *in, float *x,
-                             float z[320], float W[2][32][32][2], int buf_idx)
-{
-    int i;
-    memcpy(x    , x+1024, (320-32)*sizeof(x[0]));
-    memcpy(x+288, in,         1024*sizeof(x[0]));
-    for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample frames
-                               // are not supported
-        dsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320);
-        sbrdsp->sum64x5(z);
-        sbrdsp->qmf_pre_shuffle(z);
-        mdct->imdct_half(mdct, z, z+64);
-        sbrdsp->qmf_post_shuffle(W[buf_idx][i], z);
-        x += 32;
-    }
-}
-
-/**
- * Synthesis QMF Bank (14496-3 sp04 p206) and Downsampled Synthesis QMF Bank
- * (14496-3 sp04 p206)
- */
-static void sbr_qmf_synthesis(FFTContext *mdct,
-                              SBRDSPContext *sbrdsp, AVFloatDSPContext *dsp,
-                              float *out, float X[2][38][64],
-                              float mdct_buf[2][64],
-                              float *v0, int *v_off, const unsigned int div)
-{
-    int i, n;
-    const float *sbr_qmf_window = div ? sbr_qmf_window_ds : sbr_qmf_window_us;
-    const int step = 128 >> div;
-    float *v;
-    for (i = 0; i < 32; i++) {
-        if (*v_off < step) {
-            int saved_samples = (1280 - 128) >> div;
-            memcpy(&v0[SBR_SYNTHESIS_BUF_SIZE - saved_samples], v0, saved_samples * sizeof(float));
-            *v_off = SBR_SYNTHESIS_BUF_SIZE - saved_samples - step;
-        } else {
-            *v_off -= step;
-        }
-        v = v0 + *v_off;
-        if (div) {
-            for (n = 0; n < 32; n++) {
-                X[0][i][   n] = -X[0][i][n];
-                X[0][i][32+n] =  X[1][i][31-n];
-            }
-            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
-            sbrdsp->qmf_deint_neg(v, mdct_buf[0]);
-        } else {
-            sbrdsp->neg_odd_64(X[1][i]);
-            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
-            mdct->imdct_half(mdct, mdct_buf[1], X[1][i]);
-            sbrdsp->qmf_deint_bfly(v, mdct_buf[1], mdct_buf[0]);
-        }
-        dsp->vector_fmul    (out, v                , sbr_qmf_window                       , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 256 >> div), sbr_qmf_window + (128 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 448 >> div), sbr_qmf_window + (192 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 512 >> div), sbr_qmf_window + (256 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 704 >> div), sbr_qmf_window + (320 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 768 >> div), sbr_qmf_window + (384 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 960 >> div), sbr_qmf_window + (448 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + (1024 >> div), sbr_qmf_window + (512 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + (1216 >> div), sbr_qmf_window + (576 >> div), out   , 64 >> div);
-        out += 64 >> div;
-    }
-}
-
 /** High Frequency Generation (14496-3 sp04 p214+) and Inverse Filtering
  * (14496-3 sp04 p214)
  * Warning: This routine does not seem numerically stable.
@@ -1302,203 +205,6 @@ static void sbr_chirp(SpectralBandReplication *sbr, SBRData *ch_data)
     }
 }
 
-/// Generate the subband filtered lowband
-static int sbr_lf_gen(AACContext *ac, SpectralBandReplication *sbr,
-                      float X_low[32][40][2], const float W[2][32][32][2],
-                      int buf_idx)
-{
-    int i, k;
-    const int t_HFGen = 8;
-    const int i_f = 32;
-    memset(X_low, 0, 32*sizeof(*X_low));
-    for (k = 0; k < sbr->kx[1]; k++) {
-        for (i = t_HFGen; i < i_f + t_HFGen; i++) {
-            X_low[k][i][0] = W[buf_idx][i - t_HFGen][k][0];
-            X_low[k][i][1] = W[buf_idx][i - t_HFGen][k][1];
-        }
-    }
-    buf_idx = 1-buf_idx;
-    for (k = 0; k < sbr->kx[0]; k++) {
-        for (i = 0; i < t_HFGen; i++) {
-            X_low[k][i][0] = W[buf_idx][i + i_f - t_HFGen][k][0];
-            X_low[k][i][1] = W[buf_idx][i + i_f - t_HFGen][k][1];
-        }
-    }
-    return 0;
-}
-
-/// High Frequency Generator (14496-3 sp04 p215)
-static int sbr_hf_gen(AACContext *ac, SpectralBandReplication *sbr,
-                      float X_high[64][40][2], const float X_low[32][40][2],
-                      const float (*alpha0)[2], const float (*alpha1)[2],
-                      const float bw_array[5], const uint8_t *t_env,
-                      int bs_num_env)
-{
-    int j, x;
-    int g = 0;
-    int k = sbr->kx[1];
-    for (j = 0; j < sbr->num_patches; j++) {
-        for (x = 0; x < sbr->patch_num_subbands[j]; x++, k++) {
-            const int p = sbr->patch_start_subband[j] + x;
-            while (g <= sbr->n_q && k >= sbr->f_tablenoise[g])
-                g++;
-            g--;
-
-            if (g < 0) {
-                av_log(ac->avctx, AV_LOG_ERROR,
-                       "ERROR : no subband found for frequency %d\n", k);
-                return -1;
-            }
-
-            sbr->dsp.hf_gen(X_high[k] + ENVELOPE_ADJUSTMENT_OFFSET,
-                            X_low[p]  + ENVELOPE_ADJUSTMENT_OFFSET,
-                            alpha0[p], alpha1[p], bw_array[g],
-                            2 * t_env[0], 2 * t_env[bs_num_env]);
-        }
-    }
-    if (k < sbr->m[1] + sbr->kx[1])
-        memset(X_high + k, 0, (sbr->m[1] + sbr->kx[1] - k) * sizeof(*X_high));
-
-    return 0;
-}
-
-/// Generate the subband filtered lowband
-static int sbr_x_gen(SpectralBandReplication *sbr, float X[2][38][64],
-                     const float Y0[38][64][2], const float Y1[38][64][2],
-                     const float X_low[32][40][2], int ch)
-{
-    int k, i;
-    const int i_f = 32;
-    const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);
-    memset(X, 0, 2*sizeof(*X));
-    for (k = 0; k < sbr->kx[0]; k++) {
-        for (i = 0; i < i_Temp; i++) {
-            X[0][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][0];
-            X[1][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][1];
-        }
-    }
-    for (; k < sbr->kx[0] + sbr->m[0]; k++) {
-        for (i = 0; i < i_Temp; i++) {
-            X[0][i][k] = Y0[i + i_f][k][0];
-            X[1][i][k] = Y0[i + i_f][k][1];
-        }
-    }
-
-    for (k = 0; k < sbr->kx[1]; k++) {
-        for (i = i_Temp; i < 38; i++) {
-            X[0][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][0];
-            X[1][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][1];
-        }
-    }
-    for (; k < sbr->kx[1] + sbr->m[1]; k++) {
-        for (i = i_Temp; i < i_f; i++) {
-            X[0][i][k] = Y1[i][k][0];
-            X[1][i][k] = Y1[i][k][1];
-        }
-    }
-    return 0;
-}
-
-/** High Frequency Adjustment (14496-3 sp04 p217) and Mapping
- * (14496-3 sp04 p217)
- */
-static int sbr_mapping(AACContext *ac, SpectralBandReplication *sbr,
-                        SBRData *ch_data, int e_a[2])
-{
-    int e, i, m;
-
-    memset(ch_data->s_indexmapped[1], 0, 7*sizeof(ch_data->s_indexmapped[1]));
-    for (e = 0; e < ch_data->bs_num_env; e++) {
-        const unsigned int ilim = sbr->n[ch_data->bs_freq_res[e + 1]];
-        uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow;
-        int k;
-
-        if (sbr->kx[1] != table[0]) {
-            av_log(ac->avctx, AV_LOG_ERROR, "kx != f_table{high,low}[0]. "
-                   "Derived frequency tables were not regenerated.\n");
-            sbr_turnoff(sbr);
-            return AVERROR_BUG;
-        }
-        for (i = 0; i < ilim; i++)
-            for (m = table[i]; m < table[i + 1]; m++)
-                sbr->e_origmapped[e][m - sbr->kx[1]] = ch_data->env_facs[e+1][i];
-
-        // ch_data->bs_num_noise > 1 => 2 noise floors
-        k = (ch_data->bs_num_noise > 1) && (ch_data->t_env[e] >= ch_data->t_q[1]);
-        for (i = 0; i < sbr->n_q; i++)
-            for (m = sbr->f_tablenoise[i]; m < sbr->f_tablenoise[i + 1]; m++)
-                sbr->q_mapped[e][m - sbr->kx[1]] = ch_data->noise_facs[k+1][i];
-
-        for (i = 0; i < sbr->n[1]; i++) {
-            if (ch_data->bs_add_harmonic_flag) {
-                const unsigned int m_midpoint =
-                    (sbr->f_tablehigh[i] + sbr->f_tablehigh[i + 1]) >> 1;
-
-                ch_data->s_indexmapped[e + 1][m_midpoint - sbr->kx[1]] = ch_data->bs_add_harmonic[i] *
-                    (e >= e_a[1] || (ch_data->s_indexmapped[0][m_midpoint - sbr->kx[1]] == 1));
-            }
-        }
-
-        for (i = 0; i < ilim; i++) {
-            int additional_sinusoid_present = 0;
-            for (m = table[i]; m < table[i + 1]; m++) {
-                if (ch_data->s_indexmapped[e + 1][m - sbr->kx[1]]) {
-                    additional_sinusoid_present = 1;
-                    break;
-                }
-            }
-            memset(&sbr->s_mapped[e][table[i] - sbr->kx[1]], additional_sinusoid_present,
-                   (table[i + 1] - table[i]) * sizeof(sbr->s_mapped[e][0]));
-        }
-    }
-
-    memcpy(ch_data->s_indexmapped[0], ch_data->s_indexmapped[ch_data->bs_num_env], sizeof(ch_data->s_indexmapped[0]));
-    return 0;
-}
-
-/// Estimation of current envelope (14496-3 sp04 p218)
-static void sbr_env_estimate(float (*e_curr)[48], float X_high[64][40][2],
-                             SpectralBandReplication *sbr, SBRData *ch_data)
-{
-    int e, m;
-    int kx1 = sbr->kx[1];
-
-    if (sbr->bs_interpol_freq) {
-        for (e = 0; e < ch_data->bs_num_env; e++) {
-            const float recip_env_size = 0.5f / (ch_data->t_env[e + 1] - ch_data->t_env[e]);
-            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
-            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
-
-            for (m = 0; m < sbr->m[1]; m++) {
-                float sum = sbr->dsp.sum_square(X_high[m+kx1] + ilb, iub - ilb);
-                e_curr[e][m] = sum * recip_env_size;
-            }
-        }
-    } else {
-        int k, p;
-
-        for (e = 0; e < ch_data->bs_num_env; e++) {
-            const int env_size = 2 * (ch_data->t_env[e + 1] - ch_data->t_env[e]);
-            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
-            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
-            const uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow;
-
-            for (p = 0; p < sbr->n[ch_data->bs_freq_res[e + 1]]; p++) {
-                float sum = 0.0f;
-                const int den = env_size * (table[p + 1] - table[p]);
-
-                for (k = table[p]; k < table[p + 1]; k++) {
-                    sum += sbr->dsp.sum_square(X_high[k] + ilb, iub - ilb);
-                }
-                sum /= den;
-                for (k = table[p]; k < table[p + 1]; k++) {
-                    e_curr[e][k - kx1] = sum;
-                }
-            }
-        }
-    }
-}
-
 /**
  * Calculation of levels of additional HF signal components (14496-3 sp04 p219)
  * and Calculation of gain (14496-3 sp04 p219)
@@ -1575,10 +281,6 @@ static void sbr_hf_assemble(float Y1[38][64][2],
         0.11516383427084,
         0.03183050093751,
     };
-    static const int8_t phi[2][4] = {
-        {  1,  0, -1,  0}, // real
-        {  0,  1,  0, -1}, // imaginary
-    };
     float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
     int indexnoise = ch_data->f_indexnoise;
     int indexsine  = ch_data->f_indexsine;
@@ -1608,7 +310,6 @@ static void sbr_hf_assemble(float Y1[38][64][2],
 
     for (e = 0; e < ch_data->bs_num_env; e++) {
         for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
-            int phi_sign = (1 - 2*(kx & 1));
             LOCAL_ALIGNED_16(float, g_filt_tab, [48]);
             LOCAL_ALIGNED_16(float, q_filt_tab, [48]);
             float *g_filt, *q_filt;
@@ -1638,13 +339,17 @@ static void sbr_hf_assemble(float Y1[38][64][2],
                                                    q_filt, indexnoise,
                                                    kx, m_max);
             } else {
-                for (m = 0; m < m_max; m++) {
-                    Y1[i][m + kx][0] +=
-                        sbr->s_m[e][m] * phi[0][indexsine];
-                    Y1[i][m + kx][1] +=
-                        sbr->s_m[e][m] * (phi[1][indexsine] * phi_sign);
-                    phi_sign = -phi_sign;
+                int idx = indexsine&1;
+                int A = (1-((indexsine+(kx & 1))&2));
+                int B = (A^(-idx)) + idx;
+                float *out = &Y1[i][kx][idx];
+                float *in  = sbr->s_m[e];
+                for (m = 0; m+1 < m_max; m+=2) {
+                    out[2*m  ] += in[m  ] * A;
+                    out[2*m+2] += in[m+1] * B;
                 }
+                if(m_max&1)
+                    out[2*m  ] += in[m  ] * A;
             }
             indexnoise = (indexnoise + m_max) & 0x1ff;
             indexsine = (indexsine + 1) & 3;
@@ -1654,81 +359,4 @@ static void sbr_hf_assemble(float Y1[38][64][2],
     ch_data->f_indexsine  = indexsine;
 }
 
-void ff_sbr_apply(AACContext *ac, SpectralBandReplication *sbr, int id_aac,
-                  float* L, float* R)
-{
-    int downsampled = ac->oc[1].m4ac.ext_sample_rate < sbr->sample_rate;
-    int ch;
-    int nch = (id_aac == TYPE_CPE) ? 2 : 1;
-    int err;
-
-    if (!sbr->kx_and_m_pushed) {
-        sbr->kx[0] = sbr->kx[1];
-        sbr->m[0] = sbr->m[1];
-    } else {
-        sbr->kx_and_m_pushed = 0;
-    }
-
-    if (sbr->start) {
-        sbr_dequant(sbr, id_aac);
-    }
-    for (ch = 0; ch < nch; ch++) {
-        /* decode channel */
-        sbr_qmf_analysis(&ac->fdsp, &sbr->mdct_ana, &sbr->dsp, ch ? R : L, sbr->data[ch].analysis_filterbank_samples,
-                         (float*)sbr->qmf_filter_scratch,
-                         sbr->data[ch].W, sbr->data[ch].Ypos);
-        sbr_lf_gen(ac, sbr, sbr->X_low,
-                   (const float (*)[32][32][2]) sbr->data[ch].W,
-                   sbr->data[ch].Ypos);
-        sbr->data[ch].Ypos ^= 1;
-        if (sbr->start) {
-            sbr_hf_inverse_filter(&sbr->dsp, sbr->alpha0, sbr->alpha1,
-                                  (const float (*)[40][2]) sbr->X_low, sbr->k[0]);
-            sbr_chirp(sbr, &sbr->data[ch]);
-            sbr_hf_gen(ac, sbr, sbr->X_high,
-                       (const float (*)[40][2]) sbr->X_low,
-                       (const float (*)[2]) sbr->alpha0,
-                       (const float (*)[2]) sbr->alpha1,
-                       sbr->data[ch].bw_array, sbr->data[ch].t_env,
-                       sbr->data[ch].bs_num_env);
-
-            // hf_adj
-            err = sbr_mapping(ac, sbr, &sbr->data[ch], sbr->data[ch].e_a);
-            if (!err) {
-                sbr_env_estimate(sbr->e_curr, sbr->X_high, sbr, &sbr->data[ch]);
-                sbr_gain_calc(ac, sbr, &sbr->data[ch], sbr->data[ch].e_a);
-                sbr_hf_assemble(sbr->data[ch].Y[sbr->data[ch].Ypos],
-                                (const float (*)[40][2]) sbr->X_high,
-                                sbr, &sbr->data[ch],
-                                sbr->data[ch].e_a);
-            }
-        }
-
-        /* synthesis */
-        sbr_x_gen(sbr, sbr->X[ch],
-                  (const float (*)[64][2]) sbr->data[ch].Y[1-sbr->data[ch].Ypos],
-                  (const float (*)[64][2]) sbr->data[ch].Y[  sbr->data[ch].Ypos],
-                  (const float (*)[40][2]) sbr->X_low, ch);
-    }
-
-    if (ac->oc[1].m4ac.ps == 1) {
-        if (sbr->ps.start) {
-            ff_ps_apply(ac->avctx, &sbr->ps, sbr->X[0], sbr->X[1], sbr->kx[1] + sbr->m[1]);
-        } else {
-            memcpy(sbr->X[1], sbr->X[0], sizeof(sbr->X[0]));
-        }
-        nch = 2;
-    }
-
-    sbr_qmf_synthesis(&sbr->mdct, &sbr->dsp, &ac->fdsp,
-                      L, sbr->X[0], sbr->qmf_filter_scratch,
-                      sbr->data[0].synthesis_filterbank_samples,
-                      &sbr->data[0].synthesis_filterbank_samples_offset,
-                      downsampled);
-    if (nch == 2)
-        sbr_qmf_synthesis(&sbr->mdct, &sbr->dsp, &ac->fdsp,
-                          R, sbr->X[1], sbr->qmf_filter_scratch,
-                          sbr->data[1].synthesis_filterbank_samples,
-                          &sbr->data[1].synthesis_filterbank_samples_offset,
-                          downsampled);
-}
+#include "aacsbr_template.c"
diff --git a/libavcodec/aacsbr.h b/libavcodec/aacsbr.h
index 9bc5e29d33..ed1a7f9265 100644
--- a/libavcodec/aacsbr.h
+++ b/libavcodec/aacsbr.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  * Copyright (c) 2010      Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,17 +33,64 @@
 #include "aac.h"
 #include "sbr.h"
 
+#define ENVELOPE_ADJUSTMENT_OFFSET 2
+#define NOISE_FLOOR_OFFSET FIXR(6.0f)
+
+/**
+ * SBR VLC tables
+ */
+enum {
+    T_HUFFMAN_ENV_1_5DB,
+    F_HUFFMAN_ENV_1_5DB,
+    T_HUFFMAN_ENV_BAL_1_5DB,
+    F_HUFFMAN_ENV_BAL_1_5DB,
+    T_HUFFMAN_ENV_3_0DB,
+    F_HUFFMAN_ENV_3_0DB,
+    T_HUFFMAN_ENV_BAL_3_0DB,
+    F_HUFFMAN_ENV_BAL_3_0DB,
+    T_HUFFMAN_NOISE_3_0DB,
+    T_HUFFMAN_NOISE_BAL_3_0DB,
+};
+
+/**
+ * bs_frame_class - frame class of current SBR frame (14496-3 sp04 p98)
+ */
+enum {
+    FIXFIX,
+    FIXVAR,
+    VARFIX,
+    VARVAR,
+};
+
+enum {
+    EXTENSION_ID_PS = 2,
+};
+
+static const int8_t vlc_sbr_lav[10] =
+    { 60, 60, 24, 24, 31, 31, 12, 12, 31, 12 };
+
+#define SBR_INIT_VLC_STATIC(num, size) \
+    INIT_VLC_STATIC(&vlc_sbr[num], 9, sbr_tmp[num].table_size / sbr_tmp[num].elem_size,     \
+                    sbr_tmp[num].sbr_bits ,                      1,                      1, \
+                    sbr_tmp[num].sbr_codes, sbr_tmp[num].elem_size, sbr_tmp[num].elem_size, \
+                    size)
+
+#define SBR_VLC_ROW(name) \
+    { name ## _codes, name ## _bits, sizeof(name ## _codes), sizeof(name ## _codes[0]) }
+
 /** Initialize SBR. */
-void ff_aac_sbr_init(void);
+void AAC_RENAME(ff_aac_sbr_init)(void);
 /** Initialize one SBR context. */
-void ff_aac_sbr_ctx_init(AACContext *ac, SpectralBandReplication *sbr);
+void AAC_RENAME(ff_aac_sbr_ctx_init)(AACContext *ac, SpectralBandReplication *sbr);
 /** Close one SBR context. */
-void ff_aac_sbr_ctx_close(SpectralBandReplication *sbr);
+void AAC_RENAME(ff_aac_sbr_ctx_close)(SpectralBandReplication *sbr);
 /** Decode one SBR element. */
-int ff_decode_sbr_extension(AACContext *ac, SpectralBandReplication *sbr,
+int AAC_RENAME(ff_decode_sbr_extension)(AACContext *ac, SpectralBandReplication *sbr,
                             GetBitContext *gb, int crc, int cnt, int id_aac);
 /** Apply one SBR element to one AAC element. */
-void ff_sbr_apply(AACContext *ac, SpectralBandReplication *sbr, int id_aac,
-                  float* L, float *R);
+void AAC_RENAME(ff_sbr_apply)(AACContext *ac, SpectralBandReplication *sbr, int id_aac,
+                  INTFLOAT* L, INTFLOAT *R);
+
+void ff_aacsbr_func_ptr_init_mips(AACSBRContext *c);
 
 #endif /* AVCODEC_AACSBR_H */
diff --git a/libavcodec/aacsbr_fixed.c b/libavcodec/aacsbr_fixed.c
new file mode 100644
index 0000000000..e048069359
--- /dev/null
+++ b/libavcodec/aacsbr_fixed.c
@@ -0,0 +1,585 @@
+/*
+ * Copyright (c) 2013
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * AAC Spectral Band Replication decoding functions (fixed-point)
+ * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
+ * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC Spectral Band Replication decoding functions (fixed-point)
+ * Note: Rounding-to-nearest used unless otherwise stated
+ * @author Robert Swain ( rob opendot cl )
+ * @author Stanislav Ocovaj ( stanislav.ocovaj imgtec com )
+ */
+#define USE_FIXED 1
+
+#include "aac.h"
+#include "sbr.h"
+#include "aacsbr.h"
+#include "aacsbrdata.h"
+#include "aacsbr_fixed_tablegen.h"
+#include "fft.h"
+#include "aacps.h"
+#include "sbrdsp.h"
+#include "libavutil/internal.h"
+#include "libavutil/libm.h"
+#include "libavutil/avassert.h"
+
+#include <stdint.h>
+#include <float.h>
+#include <math.h>
+
+static VLC vlc_sbr[10];
+static void aacsbr_func_ptr_init(AACSBRContext *c);
+static const int CONST_LN2       = Q31(0.6931471806/256);  // ln(2)/256
+static const int CONST_RECIP_LN2 = Q31(0.7213475204);      // 0.5/ln(2)
+static const int CONST_076923    = Q31(0.76923076923076923077f);
+
+static const int fixed_log_table[10] =
+{
+    Q31(1.0/2), Q31(1.0/3), Q31(1.0/4), Q31(1.0/5), Q31(1.0/6),
+    Q31(1.0/7), Q31(1.0/8), Q31(1.0/9), Q31(1.0/10), Q31(1.0/11)
+};
+
+static int fixed_log(int x)
+{
+    int i, ret, xpow, tmp;
+
+    ret = x;
+    xpow = x;
+    for (i=0; i<10; i+=2){
+        xpow = (int)(((int64_t)xpow * x + 0x40000000) >> 31);
+        tmp = (int)(((int64_t)xpow * fixed_log_table[i] + 0x40000000) >> 31);
+        ret -= tmp;
+
+        xpow = (int)(((int64_t)xpow * x + 0x40000000) >> 31);
+        tmp = (int)(((int64_t)xpow * fixed_log_table[i+1] + 0x40000000) >> 31);
+        ret += tmp;
+    }
+
+    return ret;
+}
+
+static const int fixed_exp_table[7] =
+{
+    Q31(1.0/2), Q31(1.0/6), Q31(1.0/24), Q31(1.0/120),
+    Q31(1.0/720), Q31(1.0/5040), Q31(1.0/40320)
+};
+
+static int fixed_exp(int x)
+{
+    int i, ret, xpow, tmp;
+
+    ret = 0x800000 + x;
+    xpow = x;
+    for (i=0; i<7; i++){
+        xpow = (int)(((int64_t)xpow * x + 0x400000) >> 23);
+        tmp = (int)(((int64_t)xpow * fixed_exp_table[i] + 0x40000000) >> 31);
+        ret += tmp;
+    }
+
+    return ret;
+}
+
+static void make_bands(int16_t* bands, int start, int stop, int num_bands)
+{
+    int k, previous, present;
+    int base, prod, nz = 0;
+
+    base = (stop << 23) / start;
+    while (base < 0x40000000){
+        base <<= 1;
+        nz++;
+    }
+    base = fixed_log(base - 0x80000000);
+    base = (((base + 0x80) >> 8) + (8-nz)*CONST_LN2) / num_bands;
+    base = fixed_exp(base);
+
+    previous = start;
+    prod = start << 23;
+
+    for (k = 0; k < num_bands-1; k++) {
+        prod = (int)(((int64_t)prod * base + 0x400000) >> 23);
+        present = (prod + 0x400000) >> 23;
+        bands[k] = present - previous;
+        previous = present;
+    }
+    bands[num_bands-1] = stop - previous;
+}
+
+/// Dequantization and stereo decoding (14496-3 sp04 p203)
+static void sbr_dequant(SpectralBandReplication *sbr, int id_aac)
+{
+    int k, e;
+    int ch;
+
+    if (id_aac == TYPE_CPE && sbr->bs_coupling) {
+        int alpha      = sbr->data[0].bs_amp_res ?  2 :  1;
+        int pan_offset = sbr->data[0].bs_amp_res ? 12 : 24;
+        for (e = 1; e <= sbr->data[0].bs_num_env; e++) {
+            for (k = 0; k < sbr->n[sbr->data[0].bs_freq_res[e]]; k++) {
+                SoftFloat temp1, temp2, fac;
+
+                temp1.exp = sbr->data[0].env_facs[e][k].mant * alpha + 14;
+                if (temp1.exp & 1)
+                  temp1.mant = 759250125;
+                else
+                  temp1.mant = 0x20000000;
+                temp1.exp = (temp1.exp >> 1) + 1;
+
+                temp2.exp = (pan_offset - sbr->data[1].env_facs[e][k].mant) * alpha;
+                if (temp2.exp & 1)
+                  temp2.mant = 759250125;
+                else
+                  temp2.mant = 0x20000000;
+                temp2.exp = (temp2.exp >> 1) + 1;
+                fac   = av_div_sf(temp1, av_add_sf(FLOAT_1, temp2));
+                sbr->data[0].env_facs[e][k] = fac;
+                sbr->data[1].env_facs[e][k] = av_mul_sf(fac, temp2);
+            }
+        }
+        for (e = 1; e <= sbr->data[0].bs_num_noise; e++) {
+            for (k = 0; k < sbr->n_q; k++) {
+                SoftFloat temp1, temp2, fac;
+
+                temp1.exp = NOISE_FLOOR_OFFSET - \
+                    sbr->data[0].noise_facs[e][k].mant + 2;
+                temp1.mant = 0x20000000;
+                temp2.exp = 12 - sbr->data[1].noise_facs[e][k].mant + 1;
+                temp2.mant = 0x20000000;
+                fac   = av_div_sf(temp1, av_add_sf(FLOAT_1, temp2));
+                sbr->data[0].noise_facs[e][k] = fac;
+                sbr->data[1].noise_facs[e][k] = av_mul_sf(fac, temp2);
+            }
+        }
+    } else { // SCE or one non-coupled CPE
+        for (ch = 0; ch < (id_aac == TYPE_CPE) + 1; ch++) {
+            int alpha = sbr->data[ch].bs_amp_res ? 2 : 1;
+            for (e = 1; e <= sbr->data[ch].bs_num_env; e++)
+                for (k = 0; k < sbr->n[sbr->data[ch].bs_freq_res[e]]; k++){
+                    SoftFloat temp1;
+
+                    temp1.exp = alpha * sbr->data[ch].env_facs[e][k].mant + 12;
+                    if (temp1.exp & 1)
+                        temp1.mant = 759250125;
+                    else
+                        temp1.mant = 0x20000000;
+                    temp1.exp = (temp1.exp >> 1) + 1;
+
+                    sbr->data[ch].env_facs[e][k] = temp1;
+                }
+            for (e = 1; e <= sbr->data[ch].bs_num_noise; e++)
+                for (k = 0; k < sbr->n_q; k++){
+                    sbr->data[ch].noise_facs[e][k].exp = NOISE_FLOOR_OFFSET - \
+                        sbr->data[ch].noise_facs[e][k].mant + 1;
+                    sbr->data[ch].noise_facs[e][k].mant = 0x20000000;
+                }
+        }
+    }
+}
+
+/** High Frequency Generation (14496-3 sp04 p214+) and Inverse Filtering
+ * (14496-3 sp04 p214)
+ * Warning: This routine does not seem numerically stable.
+ */
+static void sbr_hf_inverse_filter(SBRDSPContext *dsp,
+                                  int (*alpha0)[2], int (*alpha1)[2],
+                                  const int X_low[32][40][2], int k0)
+{
+    int k;
+    int shift, round;
+
+    for (k = 0; k < k0; k++) {
+        SoftFloat phi[3][2][2];
+        SoftFloat a00, a01, a10, a11;
+        SoftFloat dk;
+
+        dsp->autocorrelate(X_low[k], phi);
+
+        dk = av_sub_sf(av_mul_sf(phi[2][1][0], phi[1][0][0]),
+             av_mul_sf(av_add_sf(av_mul_sf(phi[1][1][0], phi[1][1][0]),
+             av_mul_sf(phi[1][1][1], phi[1][1][1])), FLOAT_0999999));
+
+        if (!dk.mant) {
+            a10 = FLOAT_0;
+            a11 = FLOAT_0;
+        } else {
+            SoftFloat temp_real, temp_im;
+            temp_real = av_sub_sf(av_sub_sf(av_mul_sf(phi[0][0][0], phi[1][1][0]),
+                                            av_mul_sf(phi[0][0][1], phi[1][1][1])),
+                                  av_mul_sf(phi[0][1][0], phi[1][0][0]));
+            temp_im   = av_sub_sf(av_add_sf(av_mul_sf(phi[0][0][0], phi[1][1][1]),
+                                            av_mul_sf(phi[0][0][1], phi[1][1][0])),
+                                  av_mul_sf(phi[0][1][1], phi[1][0][0]));
+
+            a10 = av_div_sf(temp_real, dk);
+            a11 = av_div_sf(temp_im,   dk);
+        }
+
+        if (!phi[1][0][0].mant) {
+            a00 = FLOAT_0;
+            a01 = FLOAT_0;
+        } else {
+            SoftFloat temp_real, temp_im;
+            temp_real = av_add_sf(phi[0][0][0],
+                                  av_add_sf(av_mul_sf(a10, phi[1][1][0]),
+                                            av_mul_sf(a11, phi[1][1][1])));
+            temp_im   = av_add_sf(phi[0][0][1],
+                                  av_sub_sf(av_mul_sf(a11, phi[1][1][0]),
+                                            av_mul_sf(a10, phi[1][1][1])));
+
+            temp_real.mant = -temp_real.mant;
+            temp_im.mant   = -temp_im.mant;
+            a00 = av_div_sf(temp_real, phi[1][0][0]);
+            a01 = av_div_sf(temp_im,   phi[1][0][0]);
+        }
+
+        shift = a00.exp;
+        if (shift >= 3)
+            alpha0[k][0] = 0x7fffffff;
+        else {
+            a00.mant <<= 1;
+            shift = 2-shift;
+            if (shift == 0)
+                alpha0[k][0] = a00.mant;
+            else {
+                round = 1 << (shift-1);
+                alpha0[k][0] = (a00.mant + round) >> shift;
+            }
+        }
+
+        shift = a01.exp;
+        if (shift >= 3)
+            alpha0[k][1] = 0x7fffffff;
+        else {
+            a01.mant <<= 1;
+            shift = 2-shift;
+            if (shift == 0)
+                alpha0[k][1] = a01.mant;
+            else {
+                round = 1 << (shift-1);
+                alpha0[k][1] = (a01.mant + round) >> shift;
+            }
+        }
+        shift = a10.exp;
+        if (shift >= 3)
+            alpha1[k][0] = 0x7fffffff;
+        else {
+            a10.mant <<= 1;
+            shift = 2-shift;
+            if (shift == 0)
+                alpha1[k][0] = a10.mant;
+            else {
+                round = 1 << (shift-1);
+                alpha1[k][0] = (a10.mant + round) >> shift;
+            }
+        }
+
+        shift = a11.exp;
+        if (shift >= 3)
+            alpha1[k][1] = 0x7fffffff;
+        else {
+            a11.mant <<= 1;
+            shift = 2-shift;
+            if (shift == 0)
+                alpha1[k][1] = a11.mant;
+            else {
+                round = 1 << (shift-1);
+                alpha1[k][1] = (a11.mant + round) >> shift;
+            }
+        }
+
+        shift = (int)(((int64_t)(alpha1[k][0]>>1) * (alpha1[k][0]>>1) + \
+                       (int64_t)(alpha1[k][1]>>1) * (alpha1[k][1]>>1) + \
+                       0x40000000) >> 31);
+        if (shift >= 0x20000000){
+            alpha1[k][0] = 0;
+            alpha1[k][1] = 0;
+            alpha0[k][0] = 0;
+            alpha0[k][1] = 0;
+        }
+
+        shift = (int)(((int64_t)(alpha0[k][0]>>1) * (alpha0[k][0]>>1) + \
+                       (int64_t)(alpha0[k][1]>>1) * (alpha0[k][1]>>1) + \
+                       0x40000000) >> 31);
+        if (shift >= 0x20000000){
+            alpha1[k][0] = 0;
+            alpha1[k][1] = 0;
+            alpha0[k][0] = 0;
+            alpha0[k][1] = 0;
+        }
+    }
+}
+
+/// Chirp Factors (14496-3 sp04 p214)
+static void sbr_chirp(SpectralBandReplication *sbr, SBRData *ch_data)
+{
+    int i;
+    int new_bw;
+    static const int bw_tab[] = { 0, 1610612736, 1932735283, 2104533975 };
+    int64_t accu;
+
+    for (i = 0; i < sbr->n_q; i++) {
+        if (ch_data->bs_invf_mode[0][i] + ch_data->bs_invf_mode[1][i] == 1)
+            new_bw = 1288490189;
+        else
+            new_bw = bw_tab[ch_data->bs_invf_mode[0][i]];
+
+        if (new_bw < ch_data->bw_array[i]){
+            accu  = (int64_t)new_bw * 1610612736;
+            accu += (int64_t)ch_data->bw_array[i] * 0x20000000;
+            new_bw = (int)((accu + 0x40000000) >> 31);
+        } else {
+            accu  = (int64_t)new_bw * 1946157056;
+            accu += (int64_t)ch_data->bw_array[i] * 201326592;
+            new_bw = (int)((accu + 0x40000000) >> 31);
+        }
+        ch_data->bw_array[i] = new_bw < 0x2000000 ? 0 : new_bw;
+    }
+}
+
+/**
+ * Calculation of levels of additional HF signal components (14496-3 sp04 p219)
+ * and Calculation of gain (14496-3 sp04 p219)
+ */
+static void sbr_gain_calc(AACContext *ac, SpectralBandReplication *sbr,
+                          SBRData *ch_data, const int e_a[2])
+{
+    int e, k, m;
+    // max gain limits : -3dB, 0dB, 3dB, inf dB (limiter off)
+    static const SoftFloat limgain[4] = { { 760155524,  0 }, { 0x20000000,  1 },
+                                            { 758351638,  1 }, { 625000000, 34 } };
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        int delta = !((e == e_a[1]) || (e == e_a[0]));
+        for (k = 0; k < sbr->n_lim; k++) {
+            SoftFloat gain_boost, gain_max;
+            SoftFloat sum[2] = { { 0, 0}, { 0, 0 } };
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) {
+                const SoftFloat temp = av_div_sf(sbr->e_origmapped[e][m],
+                                            av_add_sf(FLOAT_1, sbr->q_mapped[e][m]));
+                sbr->q_m[e][m] = av_sqrt_sf(av_mul_sf(temp, sbr->q_mapped[e][m]));
+                sbr->s_m[e][m] = av_sqrt_sf(av_mul_sf(temp, av_int2sf(ch_data->s_indexmapped[e + 1][m], 0)));
+                if (!sbr->s_mapped[e][m]) {
+                    if (delta) {
+                      sbr->gain[e][m] = av_sqrt_sf(av_div_sf(sbr->e_origmapped[e][m],
+                                            av_mul_sf(av_add_sf(FLOAT_1, sbr->e_curr[e][m]),
+                                            av_add_sf(FLOAT_1, sbr->q_mapped[e][m]))));
+                    } else {
+                      sbr->gain[e][m] = av_sqrt_sf(av_div_sf(sbr->e_origmapped[e][m],
+                                            av_add_sf(FLOAT_1, sbr->e_curr[e][m])));
+                    }
+                } else {
+                    sbr->gain[e][m] = av_sqrt_sf(
+                                        av_div_sf(
+                                            av_mul_sf(sbr->e_origmapped[e][m], sbr->q_mapped[e][m]),
+                                            av_mul_sf(
+                                                av_add_sf(FLOAT_1, sbr->e_curr[e][m]),
+                                                av_add_sf(FLOAT_1, sbr->q_mapped[e][m]))));
+                }
+            }
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) {
+                sum[0] = av_add_sf(sum[0], sbr->e_origmapped[e][m]);
+                sum[1] = av_add_sf(sum[1], sbr->e_curr[e][m]);
+            }
+            gain_max = av_mul_sf(limgain[sbr->bs_limiter_gains],
+                            av_sqrt_sf(
+                                av_div_sf(
+                                    av_add_sf(FLOAT_EPSILON, sum[0]),
+                                    av_add_sf(FLOAT_EPSILON, sum[1]))));
+            if (av_gt_sf(gain_max, FLOAT_100000))
+              gain_max = FLOAT_100000;
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) {
+                SoftFloat q_m_max = av_div_sf(
+                                        av_mul_sf(sbr->q_m[e][m], gain_max),
+                                        sbr->gain[e][m]);
+                if (av_gt_sf(sbr->q_m[e][m], q_m_max))
+                  sbr->q_m[e][m] = q_m_max;
+                if (av_gt_sf(sbr->gain[e][m], gain_max))
+                  sbr->gain[e][m] = gain_max;
+            }
+            sum[0] = sum[1] = FLOAT_0;
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) {
+                sum[0] = av_add_sf(sum[0], sbr->e_origmapped[e][m]);
+                sum[1] = av_add_sf(sum[1],
+                            av_mul_sf(
+                                av_mul_sf(sbr->e_curr[e][m],
+                                          sbr->gain[e][m]),
+                                sbr->gain[e][m]));
+                sum[1] = av_add_sf(sum[1],
+                            av_mul_sf(sbr->s_m[e][m], sbr->s_m[e][m]));
+                if (delta && !sbr->s_m[e][m].mant)
+                  sum[1] = av_add_sf(sum[1],
+                                av_mul_sf(sbr->q_m[e][m], sbr->q_m[e][m]));
+            }
+            gain_boost = av_sqrt_sf(
+                            av_div_sf(
+                                av_add_sf(FLOAT_EPSILON, sum[0]),
+                                av_add_sf(FLOAT_EPSILON, sum[1])));
+            if (av_gt_sf(gain_boost, FLOAT_1584893192))
+              gain_boost = FLOAT_1584893192;
+
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) {
+                sbr->gain[e][m] = av_mul_sf(sbr->gain[e][m], gain_boost);
+                sbr->q_m[e][m]  = av_mul_sf(sbr->q_m[e][m], gain_boost);
+                sbr->s_m[e][m]  = av_mul_sf(sbr->s_m[e][m], gain_boost);
+            }
+        }
+    }
+}
+
+/// Assembling HF Signals (14496-3 sp04 p220)
+static void sbr_hf_assemble(int Y1[38][64][2],
+                            const int X_high[64][40][2],
+                            SpectralBandReplication *sbr, SBRData *ch_data,
+                            const int e_a[2])
+{
+    int e, i, j, m;
+    const int h_SL = 4 * !sbr->bs_smoothing_mode;
+    const int kx = sbr->kx[1];
+    const int m_max = sbr->m[1];
+    static const SoftFloat h_smooth[5] = {
+      { 715827883, -1 },
+      { 647472402, -1 },
+      { 937030863, -2 },
+      { 989249804, -3 },
+      { 546843842, -4 },
+    };
+    SoftFloat (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
+    int indexnoise = ch_data->f_indexnoise;
+    int indexsine  = ch_data->f_indexsine;
+
+    if (sbr->reset) {
+        for (i = 0; i < h_SL; i++) {
+            memcpy(g_temp[i + 2*ch_data->t_env[0]], sbr->gain[0], m_max * sizeof(sbr->gain[0][0]));
+            memcpy(q_temp[i + 2*ch_data->t_env[0]], sbr->q_m[0],  m_max * sizeof(sbr->q_m[0][0]));
+        }
+    } else if (h_SL) {
+        for (i = 0; i < 4; i++) {
+            memcpy(g_temp[i + 2 * ch_data->t_env[0]],
+                   g_temp[i + 2 * ch_data->t_env_num_env_old],
+                   sizeof(g_temp[0]));
+            memcpy(q_temp[i + 2 * ch_data->t_env[0]],
+                   q_temp[i + 2 * ch_data->t_env_num_env_old],
+                   sizeof(q_temp[0]));
+        }
+    }
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
+            memcpy(g_temp[h_SL + i], sbr->gain[e], m_max * sizeof(sbr->gain[0][0]));
+            memcpy(q_temp[h_SL + i], sbr->q_m[e],  m_max * sizeof(sbr->q_m[0][0]));
+        }
+    }
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
+            SoftFloat g_filt_tab[48];
+            SoftFloat q_filt_tab[48];
+            SoftFloat *g_filt, *q_filt;
+
+            if (h_SL && e != e_a[0] && e != e_a[1]) {
+                g_filt = g_filt_tab;
+                q_filt = q_filt_tab;
+                for (m = 0; m < m_max; m++) {
+                    const int idx1 = i + h_SL;
+                    g_filt[m].mant = g_filt[m].exp = 0;
+                    q_filt[m].mant = q_filt[m].exp = 0;
+                    for (j = 0; j <= h_SL; j++) {
+                        g_filt[m] = av_add_sf(g_filt[m],
+                                        av_mul_sf(g_temp[idx1 - j][m],
+                                            h_smooth[j]));
+                        q_filt[m] = av_add_sf(q_filt[m],
+                                        av_mul_sf(q_temp[idx1 - j][m],
+                                            h_smooth[j]));
+                    }
+                }
+            } else {
+                g_filt = g_temp[i + h_SL];
+                q_filt = q_temp[i];
+            }
+
+            sbr->dsp.hf_g_filt(Y1[i] + kx, X_high + kx, g_filt, m_max,
+                               i + ENVELOPE_ADJUSTMENT_OFFSET);
+
+            if (e != e_a[0] && e != e_a[1]) {
+                sbr->dsp.hf_apply_noise[indexsine](Y1[i] + kx, sbr->s_m[e],
+                                                   q_filt, indexnoise,
+                                                   kx, m_max);
+            } else {
+                int idx = indexsine&1;
+                int A = (1-((indexsine+(kx & 1))&2));
+                int B = (A^(-idx)) + idx;
+                int *out = &Y1[i][kx][idx];
+                int shift, round;
+
+                SoftFloat *in  = sbr->s_m[e];
+                for (m = 0; m+1 < m_max; m+=2) {
+                  shift = 22 - in[m  ].exp;
+                  round = 1 << (shift-1);
+                  out[2*m  ] += (in[m  ].mant * A + round) >> shift;
+
+                  shift = 22 - in[m+1].exp;
+                  round = 1 << (shift-1);
+                  out[2*m+2] += (in[m+1].mant * B + round) >> shift;
+                }
+                if(m_max&1)
+                {
+                  shift = 22 - in[m  ].exp;
+                  round = 1 << (shift-1);
+
+                  out[2*m  ] += (in[m  ].mant * A + round) >> shift;
+                }
+            }
+            indexnoise = (indexnoise + m_max) & 0x1ff;
+            indexsine = (indexsine + 1) & 3;
+        }
+    }
+    ch_data->f_indexnoise = indexnoise;
+    ch_data->f_indexsine  = indexsine;
+}
+
+#include "aacsbr_template.c"
diff --git a/libavcodec/aacsbr_fixed_tablegen.c b/libavcodec/aacsbr_fixed_tablegen.c
new file mode 100644
index 0000000000..b896d75880
--- /dev/null
+++ b/libavcodec/aacsbr_fixed_tablegen.c
@@ -0,0 +1,42 @@
+/*
+ * Header file for hardcoded AAC SBR windows
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#include "libavutil/internal.h"
+#include "libavutil/common.h"
+#undef CONFIG_HARDCODED_TABLES
+#define CONFIG_HARDCODED_TABLES 0
+#define USE_FIXED 1
+#include "aacsbr_fixed_tablegen.h"
+#include "tableprint.h"
+
+int main(void)
+{
+    aacsbr_tableinit();
+
+    write_fileheader();
+
+    WRITE_ARRAY_ALIGNED("static const", 32, int32_t, sbr_qmf_window_ds);
+    WRITE_ARRAY_ALIGNED("static const", 32, int32_t, sbr_qmf_window_us);
+
+    return 0;
+}
diff --git a/libavcodec/aacsbr_fixed_tablegen.h b/libavcodec/aacsbr_fixed_tablegen.h
new file mode 100644
index 0000000000..1439ebe3a4
--- /dev/null
+++ b/libavcodec/aacsbr_fixed_tablegen.h
@@ -0,0 +1,32 @@
+/*
+ * Header file for hardcoded AAC SBR windows
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AACSBR_FIXED_TABLEGEN_H
+#define AVCODEC_AACSBR_FIXED_TABLEGEN_H
+
+#include "aacsbr_tablegen_common.h"
+
+#if CONFIG_HARDCODED_TABLES
+#include "libavcodec/aacsbr_fixed_tables.h"
+#endif /* CONFIG_HARDCODED_TABLES */
+
+#endif /* AVCODEC_AACSBR_FIXED_TABLEGEN_H */
diff --git a/libavcodec/aacsbr_tablegen.c b/libavcodec/aacsbr_tablegen.c
new file mode 100644
index 0000000000..ee0d818f81
--- /dev/null
+++ b/libavcodec/aacsbr_tablegen.c
@@ -0,0 +1,42 @@
+/*
+ * Header file for hardcoded AAC SBR windows
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#include "libavutil/internal.h"
+#include "libavutil/common.h"
+#undef CONFIG_HARDCODED_TABLES
+#define CONFIG_HARDCODED_TABLES 0
+#define USE_FIXED 0
+#include "aacsbr_tablegen.h"
+#include "tableprint.h"
+
+int main(void)
+{
+    aacsbr_tableinit();
+
+    write_fileheader();
+
+    WRITE_ARRAY_ALIGNED("static const", 32, float, sbr_qmf_window_ds);
+    WRITE_ARRAY_ALIGNED("static const", 32, float, sbr_qmf_window_us);
+
+    return 0;
+}
diff --git a/libavcodec/aacsbr_tablegen.h b/libavcodec/aacsbr_tablegen.h
new file mode 100644
index 0000000000..d86eba7540
--- /dev/null
+++ b/libavcodec/aacsbr_tablegen.h
@@ -0,0 +1,32 @@
+/*
+ * Header file for hardcoded AAC SBR windows
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AACSBR_TABLEGEN_H
+#define AVCODEC_AACSBR_TABLEGEN_H
+
+#include "aacsbr_tablegen_common.h"
+
+#if CONFIG_HARDCODED_TABLES
+#include "libavcodec/aacsbr_tables.h"
+#endif /* CONFIG_HARDCODED_TABLES */
+
+#endif /* AVCODEC_AACSBR_TABLEGEN_H */
diff --git a/libavcodec/aacsbr_tablegen_common.h b/libavcodec/aacsbr_tablegen_common.h
new file mode 100644
index 0000000000..0a64552ed6
--- /dev/null
+++ b/libavcodec/aacsbr_tablegen_common.h
@@ -0,0 +1,129 @@
+/*
+ * Header file for hardcoded AAC SBR windows
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AACSBR_TABLEGEN_COMMON_H
+#define AVCODEC_AACSBR_TABLEGEN_COMMON_H
+#include "aac.h"
+
+#if CONFIG_HARDCODED_TABLES
+#define aacsbr_tableinit()
+#else
+///< window coefficients for analysis/synthesis QMF banks
+static DECLARE_ALIGNED(32, INTFLOAT, sbr_qmf_window_ds)[320];
+static DECLARE_ALIGNED(32, INTFLOAT, sbr_qmf_window_us)[640] = {
+    Q31( 0.0000000000f), Q31(-0.0005525286f), Q31(-0.0005617692f), Q31(-0.0004947518f),
+    Q31(-0.0004875227f), Q31(-0.0004893791f), Q31(-0.0005040714f), Q31(-0.0005226564f),
+    Q31(-0.0005466565f), Q31(-0.0005677802f), Q31(-0.0005870930f), Q31(-0.0006132747f),
+    Q31(-0.0006312493f), Q31(-0.0006540333f), Q31(-0.0006777690f), Q31(-0.0006941614f),
+    Q31(-0.0007157736f), Q31(-0.0007255043f), Q31(-0.0007440941f), Q31(-0.0007490598f),
+    Q31(-0.0007681371f), Q31(-0.0007724848f), Q31(-0.0007834332f), Q31(-0.0007779869f),
+    Q31(-0.0007803664f), Q31(-0.0007801449f), Q31(-0.0007757977f), Q31(-0.0007630793f),
+    Q31(-0.0007530001f), Q31(-0.0007319357f), Q31(-0.0007215391f), Q31(-0.0006917937f),
+    Q31(-0.0006650415f), Q31(-0.0006341594f), Q31(-0.0005946118f), Q31(-0.0005564576f),
+    Q31(-0.0005145572f), Q31(-0.0004606325f), Q31(-0.0004095121f), Q31(-0.0003501175f),
+    Q31(-0.0002896981f), Q31(-0.0002098337f), Q31(-0.0001446380f), Q31(-0.0000617334f),
+    Q31( 0.0000134949f), Q31( 0.0001094383f), Q31( 0.0002043017f), Q31( 0.0002949531f),
+    Q31( 0.0004026540f), Q31( 0.0005107388f), Q31( 0.0006239376f), Q31( 0.0007458025f),
+    Q31( 0.0008608443f), Q31( 0.0009885988f), Q31( 0.0011250155f), Q31( 0.0012577884f),
+    Q31( 0.0013902494f), Q31( 0.0015443219f), Q31( 0.0016868083f), Q31( 0.0018348265f),
+    Q31( 0.0019841140f), Q31( 0.0021461583f), Q31( 0.0023017254f), Q31( 0.0024625616f),
+    Q31( 0.0026201758f), Q31( 0.0027870464f), Q31( 0.0029469447f), Q31( 0.0031125420f),
+    Q31( 0.0032739613f), Q31( 0.0034418874f), Q31( 0.0036008268f), Q31( 0.0037603922f),
+    Q31( 0.0039207432f), Q31( 0.0040819753f), Q31( 0.0042264269f), Q31( 0.0043730719f),
+    Q31( 0.0045209852f), Q31( 0.0046606460f), Q31( 0.0047932560f), Q31( 0.0049137603f),
+    Q31( 0.0050393022f), Q31( 0.0051407353f), Q31( 0.0052461166f), Q31( 0.0053471681f),
+    Q31( 0.0054196775f), Q31( 0.0054876040f), Q31( 0.0055475714f), Q31( 0.0055938023f),
+    Q31( 0.0056220643f), Q31( 0.0056455196f), Q31( 0.0056389199f), Q31( 0.0056266114f),
+    Q31( 0.0055917128f), Q31( 0.0055404363f), Q31( 0.0054753783f), Q31( 0.0053838975f),
+    Q31( 0.0052715758f), Q31( 0.0051382275f), Q31( 0.0049839687f), Q31( 0.0048109469f),
+    Q31( 0.0046039530f), Q31( 0.0043801861f), Q31( 0.0041251642f), Q31( 0.0038456408f),
+    Q31( 0.0035401246f), Q31( 0.0032091885f), Q31( 0.0028446757f), Q31( 0.0024508540f),
+    Q31( 0.0020274176f), Q31( 0.0015784682f), Q31( 0.0010902329f), Q31( 0.0005832264f),
+    Q31( 0.0000276045f), Q31(-0.0005464280f), Q31(-0.0011568135f), Q31(-0.0018039472f),
+    Q31(-0.0024826723f), Q31(-0.0031933778f), Q31(-0.0039401124f), Q31(-0.0047222596f),
+    Q31(-0.0055337211f), Q31(-0.0063792293f), Q31(-0.0072615816f), Q31(-0.0081798233f),
+    Q31(-0.0091325329f), Q31(-0.0101150215f), Q31(-0.0111315548f), Q31(-0.0121849995f),
+    Q31( 0.0132718220f), Q31( 0.0143904666f), Q31( 0.0155405553f), Q31( 0.0167324712f),
+    Q31( 0.0179433381f), Q31( 0.0191872431f), Q31( 0.0204531793f), Q31( 0.0217467550f),
+    Q31( 0.0230680169f), Q31( 0.0244160992f), Q31( 0.0257875847f), Q31( 0.0271859429f),
+    Q31( 0.0286072173f), Q31( 0.0300502657f), Q31( 0.0315017608f), Q31( 0.0329754081f),
+    Q31( 0.0344620948f), Q31( 0.0359697560f), Q31( 0.0374812850f), Q31( 0.0390053679f),
+    Q31( 0.0405349170f), Q31( 0.0420649094f), Q31( 0.0436097542f), Q31( 0.0451488405f),
+    Q31( 0.0466843027f), Q31( 0.0482165720f), Q31( 0.0497385755f), Q31( 0.0512556155f),
+    Q31( 0.0527630746f), Q31( 0.0542452768f), Q31( 0.0557173648f), Q31( 0.0571616450f),
+    Q31( 0.0585915683f), Q31( 0.0599837480f), Q31( 0.0613455171f), Q31( 0.0626857808f),
+    Q31( 0.0639715898f), Q31( 0.0652247106f), Q31( 0.0664367512f), Q31( 0.0676075985f),
+    Q31( 0.0687043828f), Q31( 0.0697630244f), Q31( 0.0707628710f), Q31( 0.0717002673f),
+    Q31( 0.0725682583f), Q31( 0.0733620255f), Q31( 0.0741003642f), Q31( 0.0747452558f),
+    Q31( 0.0753137336f), Q31( 0.0758008358f), Q31( 0.0761992479f), Q31( 0.0764992170f),
+    Q31( 0.0767093490f), Q31( 0.0768173975f), Q31( 0.0768230011f), Q31( 0.0767204924f),
+    Q31( 0.0765050718f), Q31( 0.0761748321f), Q31( 0.0757305756f), Q31( 0.0751576255f),
+    Q31( 0.0744664394f), Q31( 0.0736406005f), Q31( 0.0726774642f), Q31( 0.0715826364f),
+    Q31( 0.0703533073f), Q31( 0.0689664013f), Q31( 0.0674525021f), Q31( 0.0657690668f),
+    Q31( 0.0639444805f), Q31( 0.0619602779f), Q31( 0.0598166570f), Q31( 0.0575152691f),
+    Q31( 0.0550460034f), Q31( 0.0524093821f), Q31( 0.0495978676f), Q31( 0.0466303305f),
+    Q31( 0.0434768782f), Q31( 0.0401458278f), Q31( 0.0366418116f), Q31( 0.0329583930f),
+    Q31( 0.0290824006f), Q31( 0.0250307561f), Q31( 0.0207997072f), Q31( 0.0163701258f),
+    Q31( 0.0117623832f), Q31( 0.0069636862f), Q31( 0.0019765601f), Q31(-0.0032086896f),
+    Q31(-0.0085711749f), Q31(-0.0141288827f), Q31(-0.0198834129f), Q31(-0.0258227288f),
+    Q31(-0.0319531274f), Q31(-0.0382776572f), Q31(-0.0447806821f), Q31(-0.0514804176f),
+    Q31(-0.0583705326f), Q31(-0.0654409853f), Q31(-0.0726943300f), Q31(-0.0801372934f),
+    Q31(-0.0877547536f), Q31(-0.0955533352f), Q31(-0.1035329531f), Q31(-0.1116826931f),
+    Q31(-0.1200077984f), Q31(-0.1285002850f), Q31(-0.1371551761f), Q31(-0.1459766491f),
+    Q31(-0.1549607071f), Q31(-0.1640958855f), Q31(-0.1733808172f), Q31(-0.1828172548f),
+    Q31(-0.1923966745f), Q31(-0.2021250176f), Q31(-0.2119735853f), Q31(-0.2219652696f),
+    Q31(-0.2320690870f), Q31(-0.2423016884f), Q31(-0.2526480309f), Q31(-0.2631053299f),
+    Q31(-0.2736634040f), Q31(-0.2843214189f), Q31(-0.2950716717f), Q31(-0.3059098575f),
+    Q31(-0.3168278913f), Q31(-0.3278113727f), Q31(-0.3388722693f), Q31(-0.3499914122f),
+    Q31( 0.3611589903f), Q31( 0.3723795546f), Q31( 0.3836350013f), Q31( 0.3949211761f),
+    Q31( 0.4062317676f), Q31( 0.4175696896f), Q31( 0.4289119920f), Q31( 0.4402553754f),
+    Q31( 0.4515996535f), Q31( 0.4629308085f), Q31( 0.4742453214f), Q31( 0.4855253091f),
+    Q31( 0.4967708254f), Q31( 0.5079817500f), Q31( 0.5191234970f), Q31( 0.5302240895f),
+    Q31( 0.5412553448f), Q31( 0.5522051258f), Q31( 0.5630789140f), Q31( 0.5738524131f),
+    Q31( 0.5845403235f), Q31( 0.5951123086f), Q31( 0.6055783538f), Q31( 0.6159109932f),
+    Q31( 0.6261242695f), Q31( 0.6361980107f), Q31( 0.6461269695f), Q31( 0.6559016302f),
+    Q31( 0.6655139880f), Q31( 0.6749663190f), Q31( 0.6842353293f), Q31( 0.6933282376f),
+    Q31( 0.7022388719f), Q31( 0.7109410426f), Q31( 0.7194462634f), Q31( 0.7277448900f),
+    Q31( 0.7358211758f), Q31( 0.7436827863f), Q31( 0.7513137456f), Q31( 0.7587080760f),
+    Q31( 0.7658674865f), Q31( 0.7727780881f), Q31( 0.7794287519f), Q31( 0.7858353120f),
+    Q31( 0.7919735841f), Q31( 0.7978466413f), Q31( 0.8034485751f), Q31( 0.8087695004f),
+    Q31( 0.8138191270f), Q31( 0.8185776004f), Q31( 0.8230419890f), Q31( 0.8272275347f),
+    Q31( 0.8311038457f), Q31( 0.8346937361f), Q31( 0.8379717337f), Q31( 0.8409541392f),
+    Q31( 0.8436238281f), Q31( 0.8459818469f), Q31( 0.8480315777f), Q31( 0.8497805198f),
+    Q31( 0.8511971524f), Q31( 0.8523047035f), Q31( 0.8531020949f), Q31( 0.8535720573f),
+    Q31( 0.8537385600f),
+};
+
+static av_cold void aacsbr_tableinit(void)
+{
+    int n;
+    for (n = 1; n < 320; n++)
+        sbr_qmf_window_us[320 + n] = sbr_qmf_window_us[320 - n];
+    sbr_qmf_window_us[384] = -sbr_qmf_window_us[384];
+    sbr_qmf_window_us[512] = -sbr_qmf_window_us[512];
+
+    for (n = 0; n < 320; n++)
+        sbr_qmf_window_ds[n] = sbr_qmf_window_us[2*n];
+}
+#endif /* CONFIG_HARDCODED_TABLES */
+
+#endif /* AVCODEC_AACSBR_TABLEGEN_COMMON_H */
diff --git a/libavcodec/aacsbr_template.c b/libavcodec/aacsbr_template.c
new file mode 100644
index 0000000000..d31b71e0a8
--- /dev/null
+++ b/libavcodec/aacsbr_template.c
@@ -0,0 +1,1535 @@
+/*
+ * AAC Spectral Band Replication decoding functions
+ * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
+ * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * Fixed point code
+ * Copyright (c) 2013
+ *      MIPS Technologies, Inc., California.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC Spectral Band Replication decoding functions
+ * @author Robert Swain ( rob opendot cl )
+ * @author Stanislav Ocovaj ( stanislav.ocovaj@imgtec.com )
+ * @author Zoran Basaric ( zoran.basaric@imgtec.com )
+ */
+
+av_cold void AAC_RENAME(ff_aac_sbr_init)(void)
+{
+    static const struct {
+        const void *sbr_codes, *sbr_bits;
+        const unsigned int table_size, elem_size;
+    } sbr_tmp[] = {
+        SBR_VLC_ROW(t_huffman_env_1_5dB),
+        SBR_VLC_ROW(f_huffman_env_1_5dB),
+        SBR_VLC_ROW(t_huffman_env_bal_1_5dB),
+        SBR_VLC_ROW(f_huffman_env_bal_1_5dB),
+        SBR_VLC_ROW(t_huffman_env_3_0dB),
+        SBR_VLC_ROW(f_huffman_env_3_0dB),
+        SBR_VLC_ROW(t_huffman_env_bal_3_0dB),
+        SBR_VLC_ROW(f_huffman_env_bal_3_0dB),
+        SBR_VLC_ROW(t_huffman_noise_3_0dB),
+        SBR_VLC_ROW(t_huffman_noise_bal_3_0dB),
+    };
+
+    // SBR VLC table initialization
+    SBR_INIT_VLC_STATIC(0, 1098);
+    SBR_INIT_VLC_STATIC(1, 1092);
+    SBR_INIT_VLC_STATIC(2, 768);
+    SBR_INIT_VLC_STATIC(3, 1026);
+    SBR_INIT_VLC_STATIC(4, 1058);
+    SBR_INIT_VLC_STATIC(5, 1052);
+    SBR_INIT_VLC_STATIC(6, 544);
+    SBR_INIT_VLC_STATIC(7, 544);
+    SBR_INIT_VLC_STATIC(8, 592);
+    SBR_INIT_VLC_STATIC(9, 512);
+
+    aacsbr_tableinit();
+
+    AAC_RENAME(ff_ps_init)();
+}
+
+/** Places SBR in pure upsampling mode. */
+static void sbr_turnoff(SpectralBandReplication *sbr) {
+    sbr->start = 0;
+    // Init defults used in pure upsampling mode
+    sbr->kx[1] = 32; //Typo in spec, kx' inits to 32
+    sbr->m[1] = 0;
+    // Reset values for first SBR header
+    sbr->data[0].e_a[1] = sbr->data[1].e_a[1] = -1;
+    memset(&sbr->spectrum_params, -1, sizeof(SpectrumParameters));
+}
+
+av_cold void AAC_RENAME(ff_aac_sbr_ctx_init)(AACContext *ac, SpectralBandReplication *sbr)
+{
+    if(sbr->mdct.mdct_bits)
+        return;
+    sbr->kx[0] = sbr->kx[1];
+    sbr_turnoff(sbr);
+    sbr->data[0].synthesis_filterbank_samples_offset = SBR_SYNTHESIS_BUF_SIZE - (1280 - 128);
+    sbr->data[1].synthesis_filterbank_samples_offset = SBR_SYNTHESIS_BUF_SIZE - (1280 - 128);
+    /* SBR requires samples to be scaled to +/-32768.0 to work correctly.
+     * mdct scale factors are adjusted to scale up from +/-1.0 at analysis
+     * and scale back down at synthesis. */
+    AAC_RENAME_32(ff_mdct_init)(&sbr->mdct,     7, 1, 1.0 / (64 * 32768.0));
+    AAC_RENAME_32(ff_mdct_init)(&sbr->mdct_ana, 7, 1, -2.0 * 32768.0);
+    AAC_RENAME(ff_ps_ctx_init)(&sbr->ps);
+    AAC_RENAME(ff_sbrdsp_init)(&sbr->dsp);
+    aacsbr_func_ptr_init(&sbr->c);
+}
+
+av_cold void AAC_RENAME(ff_aac_sbr_ctx_close)(SpectralBandReplication *sbr)
+{
+    AAC_RENAME_32(ff_mdct_end)(&sbr->mdct);
+    AAC_RENAME_32(ff_mdct_end)(&sbr->mdct_ana);
+}
+
+static int qsort_comparison_function_int16(const void *a, const void *b)
+{
+    return *(const int16_t *)a - *(const int16_t *)b;
+}
+
+static inline int in_table_int16(const int16_t *table, int last_el, int16_t needle)
+{
+    int i;
+    for (i = 0; i <= last_el; i++)
+        if (table[i] == needle)
+            return 1;
+    return 0;
+}
+
+/// Limiter Frequency Band Table (14496-3 sp04 p198)
+static void sbr_make_f_tablelim(SpectralBandReplication *sbr)
+{
+    int k;
+    if (sbr->bs_limiter_bands > 0) {
+        static const INTFLOAT bands_warped[3] = { Q23(1.32715174233856803909f),   //2^(0.49/1.2)
+                                               Q23(1.18509277094158210129f),   //2^(0.49/2)
+                                               Q23(1.11987160404675912501f) }; //2^(0.49/3)
+        const INTFLOAT lim_bands_per_octave_warped = bands_warped[sbr->bs_limiter_bands - 1];
+        int16_t patch_borders[7];
+        uint16_t *in = sbr->f_tablelim + 1, *out = sbr->f_tablelim;
+
+        patch_borders[0] = sbr->kx[1];
+        for (k = 1; k <= sbr->num_patches; k++)
+            patch_borders[k] = patch_borders[k-1] + sbr->patch_num_subbands[k-1];
+
+        memcpy(sbr->f_tablelim, sbr->f_tablelow,
+               (sbr->n[0] + 1) * sizeof(sbr->f_tablelow[0]));
+        if (sbr->num_patches > 1)
+            memcpy(sbr->f_tablelim + sbr->n[0] + 1, patch_borders + 1,
+                   (sbr->num_patches - 1) * sizeof(patch_borders[0]));
+
+        qsort(sbr->f_tablelim, sbr->num_patches + sbr->n[0],
+              sizeof(sbr->f_tablelim[0]),
+              qsort_comparison_function_int16);
+
+        sbr->n_lim = sbr->n[0] + sbr->num_patches - 1;
+        while (out < sbr->f_tablelim + sbr->n_lim) {
+#if USE_FIXED
+            if ((*in << 23) >= *out * lim_bands_per_octave_warped) {
+#else
+            if (*in >= *out * lim_bands_per_octave_warped) {
+#endif /* USE_FIXED */
+                *++out = *in++;
+            } else if (*in == *out ||
+                !in_table_int16(patch_borders, sbr->num_patches, *in)) {
+                in++;
+                sbr->n_lim--;
+            } else if (!in_table_int16(patch_borders, sbr->num_patches, *out)) {
+                *out = *in++;
+                sbr->n_lim--;
+            } else {
+                *++out = *in++;
+            }
+        }
+    } else {
+        sbr->f_tablelim[0] = sbr->f_tablelow[0];
+        sbr->f_tablelim[1] = sbr->f_tablelow[sbr->n[0]];
+        sbr->n_lim = 1;
+    }
+}
+
+static unsigned int read_sbr_header(SpectralBandReplication *sbr, GetBitContext *gb)
+{
+    unsigned int cnt = get_bits_count(gb);
+    uint8_t bs_header_extra_1;
+    uint8_t bs_header_extra_2;
+    int old_bs_limiter_bands = sbr->bs_limiter_bands;
+    SpectrumParameters old_spectrum_params;
+
+    sbr->start = 1;
+
+    // Save last spectrum parameters variables to compare to new ones
+    memcpy(&old_spectrum_params, &sbr->spectrum_params, sizeof(SpectrumParameters));
+
+    sbr->bs_amp_res_header              = get_bits1(gb);
+    sbr->spectrum_params.bs_start_freq  = get_bits(gb, 4);
+    sbr->spectrum_params.bs_stop_freq   = get_bits(gb, 4);
+    sbr->spectrum_params.bs_xover_band  = get_bits(gb, 3);
+                                          skip_bits(gb, 2); // bs_reserved
+
+    bs_header_extra_1 = get_bits1(gb);
+    bs_header_extra_2 = get_bits1(gb);
+
+    if (bs_header_extra_1) {
+        sbr->spectrum_params.bs_freq_scale  = get_bits(gb, 2);
+        sbr->spectrum_params.bs_alter_scale = get_bits1(gb);
+        sbr->spectrum_params.bs_noise_bands = get_bits(gb, 2);
+    } else {
+        sbr->spectrum_params.bs_freq_scale  = 2;
+        sbr->spectrum_params.bs_alter_scale = 1;
+        sbr->spectrum_params.bs_noise_bands = 2;
+    }
+
+    // Check if spectrum parameters changed
+    if (memcmp(&old_spectrum_params, &sbr->spectrum_params, sizeof(SpectrumParameters)))
+        sbr->reset = 1;
+
+    if (bs_header_extra_2) {
+        sbr->bs_limiter_bands  = get_bits(gb, 2);
+        sbr->bs_limiter_gains  = get_bits(gb, 2);
+        sbr->bs_interpol_freq  = get_bits1(gb);
+        sbr->bs_smoothing_mode = get_bits1(gb);
+    } else {
+        sbr->bs_limiter_bands  = 2;
+        sbr->bs_limiter_gains  = 2;
+        sbr->bs_interpol_freq  = 1;
+        sbr->bs_smoothing_mode = 1;
+    }
+
+    if (sbr->bs_limiter_bands != old_bs_limiter_bands && !sbr->reset)
+        sbr_make_f_tablelim(sbr);
+
+    return get_bits_count(gb) - cnt;
+}
+
+static int array_min_int16(const int16_t *array, int nel)
+{
+    int i, min = array[0];
+    for (i = 1; i < nel; i++)
+        min = FFMIN(array[i], min);
+    return min;
+}
+
+static int check_n_master(AVCodecContext *avctx, int n_master, int bs_xover_band)
+{
+    // Requirements (14496-3 sp04 p205)
+    if (n_master <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid n_master: %d\n", n_master);
+        return -1;
+    }
+    if (bs_xover_band >= n_master) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Invalid bitstream, crossover band index beyond array bounds: %d\n",
+               bs_xover_band);
+        return -1;
+    }
+    return 0;
+}
+
+/// Master Frequency Band Table (14496-3 sp04 p194)
+static int sbr_make_f_master(AACContext *ac, SpectralBandReplication *sbr,
+                             SpectrumParameters *spectrum)
+{
+    unsigned int temp, max_qmf_subbands = 0;
+    unsigned int start_min, stop_min;
+    int k;
+    const int8_t *sbr_offset_ptr;
+    int16_t stop_dk[13];
+
+    if (sbr->sample_rate < 32000) {
+        temp = 3000;
+    } else if (sbr->sample_rate < 64000) {
+        temp = 4000;
+    } else
+        temp = 5000;
+
+    switch (sbr->sample_rate) {
+    case 16000:
+        sbr_offset_ptr = sbr_offset[0];
+        break;
+    case 22050:
+        sbr_offset_ptr = sbr_offset[1];
+        break;
+    case 24000:
+        sbr_offset_ptr = sbr_offset[2];
+        break;
+    case 32000:
+        sbr_offset_ptr = sbr_offset[3];
+        break;
+    case 44100: case 48000: case 64000:
+        sbr_offset_ptr = sbr_offset[4];
+        break;
+    case 88200: case 96000: case 128000: case 176400: case 192000:
+        sbr_offset_ptr = sbr_offset[5];
+        break;
+    default:
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Unsupported sample rate for SBR: %d\n", sbr->sample_rate);
+        return -1;
+    }
+
+    start_min = ((temp << 7) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
+    stop_min  = ((temp << 8) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
+
+    sbr->k[0] = start_min + sbr_offset_ptr[spectrum->bs_start_freq];
+
+    if (spectrum->bs_stop_freq < 14) {
+        sbr->k[2] = stop_min;
+        make_bands(stop_dk, stop_min, 64, 13);
+        qsort(stop_dk, 13, sizeof(stop_dk[0]), qsort_comparison_function_int16);
+        for (k = 0; k < spectrum->bs_stop_freq; k++)
+            sbr->k[2] += stop_dk[k];
+    } else if (spectrum->bs_stop_freq == 14) {
+        sbr->k[2] = 2*sbr->k[0];
+    } else if (spectrum->bs_stop_freq == 15) {
+        sbr->k[2] = 3*sbr->k[0];
+    } else {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Invalid bs_stop_freq: %d\n", spectrum->bs_stop_freq);
+        return -1;
+    }
+    sbr->k[2] = FFMIN(64, sbr->k[2]);
+
+    // Requirements (14496-3 sp04 p205)
+    if (sbr->sample_rate <= 32000) {
+        max_qmf_subbands = 48;
+    } else if (sbr->sample_rate == 44100) {
+        max_qmf_subbands = 35;
+    } else if (sbr->sample_rate >= 48000)
+        max_qmf_subbands = 32;
+    else
+        av_assert0(0);
+
+    if (sbr->k[2] - sbr->k[0] > max_qmf_subbands) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Invalid bitstream, too many QMF subbands: %d\n", sbr->k[2] - sbr->k[0]);
+        return -1;
+    }
+
+    if (!spectrum->bs_freq_scale) {
+        int dk, k2diff;
+
+        dk = spectrum->bs_alter_scale + 1;
+        sbr->n_master = ((sbr->k[2] - sbr->k[0] + (dk&2)) >> dk) << 1;
+        if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
+            return -1;
+
+        for (k = 1; k <= sbr->n_master; k++)
+            sbr->f_master[k] = dk;
+
+        k2diff = sbr->k[2] - sbr->k[0] - sbr->n_master * dk;
+        if (k2diff < 0) {
+            sbr->f_master[1]--;
+            sbr->f_master[2]-= (k2diff < -1);
+        } else if (k2diff) {
+            sbr->f_master[sbr->n_master]++;
+        }
+
+        sbr->f_master[0] = sbr->k[0];
+        for (k = 1; k <= sbr->n_master; k++)
+            sbr->f_master[k] += sbr->f_master[k - 1];
+
+    } else {
+        int half_bands = 7 - spectrum->bs_freq_scale;      // bs_freq_scale  = {1,2,3}
+        int two_regions, num_bands_0;
+        int vdk0_max, vdk1_min;
+        int16_t vk0[49];
+#if USE_FIXED
+        int tmp, nz = 0;
+#endif /* USE_FIXED */
+
+        if (49 * sbr->k[2] > 110 * sbr->k[0]) {
+            two_regions = 1;
+            sbr->k[1] = 2 * sbr->k[0];
+        } else {
+            two_regions = 0;
+            sbr->k[1] = sbr->k[2];
+        }
+
+#if USE_FIXED
+        tmp = (sbr->k[1] << 23) / sbr->k[0];
+        while (tmp < 0x40000000) {
+          tmp <<= 1;
+          nz++;
+        }
+        tmp = fixed_log(tmp - 0x80000000);
+        tmp = (int)(((int64_t)tmp * CONST_RECIP_LN2 + 0x20000000) >> 30);
+        tmp = (((tmp + 0x80) >> 8) + ((8 - nz) << 23)) * half_bands;
+        num_bands_0 = ((tmp + 0x400000) >> 23) * 2;
+#else
+        num_bands_0 = lrintf(half_bands * log2f(sbr->k[1] / (float)sbr->k[0])) * 2;
+#endif /* USE_FIXED */
+
+        if (num_bands_0 <= 0) { // Requirements (14496-3 sp04 p205)
+            av_log(ac->avctx, AV_LOG_ERROR, "Invalid num_bands_0: %d\n", num_bands_0);
+            return -1;
+        }
+
+        vk0[0] = 0;
+
+        make_bands(vk0+1, sbr->k[0], sbr->k[1], num_bands_0);
+
+        qsort(vk0 + 1, num_bands_0, sizeof(vk0[1]), qsort_comparison_function_int16);
+        vdk0_max = vk0[num_bands_0];
+
+        vk0[0] = sbr->k[0];
+        for (k = 1; k <= num_bands_0; k++) {
+            if (vk0[k] <= 0) { // Requirements (14496-3 sp04 p205)
+                av_log(ac->avctx, AV_LOG_ERROR, "Invalid vDk0[%d]: %d\n", k, vk0[k]);
+                return -1;
+            }
+            vk0[k] += vk0[k-1];
+        }
+
+        if (two_regions) {
+            int16_t vk1[49];
+#if USE_FIXED
+            int num_bands_1;
+
+            tmp = (sbr->k[2] << 23) / sbr->k[1];
+            nz = 0;
+            while (tmp < 0x40000000) {
+              tmp <<= 1;
+              nz++;
+            }
+            tmp = fixed_log(tmp - 0x80000000);
+            tmp = (int)(((int64_t)tmp * CONST_RECIP_LN2 + 0x20000000) >> 30);
+            tmp = (((tmp + 0x80) >> 8) + ((8 - nz) << 23)) * half_bands;
+            if (spectrum->bs_alter_scale)
+                tmp = (int)(((int64_t)tmp * CONST_076923 + 0x40000000) >> 31);
+            num_bands_1 = ((tmp + 0x400000) >> 23) * 2;
+#else
+            float invwarp = spectrum->bs_alter_scale ? 0.76923076923076923077f
+                                                     : 1.0f; // bs_alter_scale = {0,1}
+            int num_bands_1 = lrintf(half_bands * invwarp *
+                                     log2f(sbr->k[2] / (float)sbr->k[1])) * 2;
+#endif /* USE_FIXED */
+            make_bands(vk1+1, sbr->k[1], sbr->k[2], num_bands_1);
+
+            vdk1_min = array_min_int16(vk1 + 1, num_bands_1);
+
+            if (vdk1_min < vdk0_max) {
+                int change;
+                qsort(vk1 + 1, num_bands_1, sizeof(vk1[1]), qsort_comparison_function_int16);
+                change = FFMIN(vdk0_max - vk1[1], (vk1[num_bands_1] - vk1[1]) >> 1);
+                vk1[1]           += change;
+                vk1[num_bands_1] -= change;
+            }
+
+            qsort(vk1 + 1, num_bands_1, sizeof(vk1[1]), qsort_comparison_function_int16);
+
+            vk1[0] = sbr->k[1];
+            for (k = 1; k <= num_bands_1; k++) {
+                if (vk1[k] <= 0) { // Requirements (14496-3 sp04 p205)
+                    av_log(ac->avctx, AV_LOG_ERROR, "Invalid vDk1[%d]: %d\n", k, vk1[k]);
+                    return -1;
+                }
+                vk1[k] += vk1[k-1];
+            }
+
+            sbr->n_master = num_bands_0 + num_bands_1;
+            if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
+                return -1;
+            memcpy(&sbr->f_master[0],               vk0,
+                   (num_bands_0 + 1) * sizeof(sbr->f_master[0]));
+            memcpy(&sbr->f_master[num_bands_0 + 1], vk1 + 1,
+                    num_bands_1      * sizeof(sbr->f_master[0]));
+
+        } else {
+            sbr->n_master = num_bands_0;
+            if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
+                return -1;
+            memcpy(sbr->f_master, vk0, (num_bands_0 + 1) * sizeof(sbr->f_master[0]));
+        }
+    }
+
+    return 0;
+}
+
+/// High Frequency Generation - Patch Construction (14496-3 sp04 p216 fig. 4.46)
+static int sbr_hf_calc_npatches(AACContext *ac, SpectralBandReplication *sbr)
+{
+    int i, k, last_k = -1, last_msb = -1, sb = 0;
+    int msb = sbr->k[0];
+    int usb = sbr->kx[1];
+    int goal_sb = ((1000 << 11) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
+
+    sbr->num_patches = 0;
+
+    if (goal_sb < sbr->kx[1] + sbr->m[1]) {
+        for (k = 0; sbr->f_master[k] < goal_sb; k++) ;
+    } else
+        k = sbr->n_master;
+
+    do {
+        int odd = 0;
+        if (k == last_k && msb == last_msb) {
+            av_log(ac->avctx, AV_LOG_ERROR, "patch construction failed\n");
+            return AVERROR_INVALIDDATA;
+        }
+        last_k = k;
+        last_msb = msb;
+        for (i = k; i == k || sb > (sbr->k[0] - 1 + msb - odd); i--) {
+            sb = sbr->f_master[i];
+            odd = (sb + sbr->k[0]) & 1;
+        }
+
+        // Requirements (14496-3 sp04 p205) sets the maximum number of patches to 5.
+        // After this check the final number of patches can still be six which is
+        // illegal however the Coding Technologies decoder check stream has a final
+        // count of 6 patches
+        if (sbr->num_patches > 5) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Too many patches: %d\n", sbr->num_patches);
+            return -1;
+        }
+
+        sbr->patch_num_subbands[sbr->num_patches]  = FFMAX(sb - usb, 0);
+        sbr->patch_start_subband[sbr->num_patches] = sbr->k[0] - odd - sbr->patch_num_subbands[sbr->num_patches];
+
+        if (sbr->patch_num_subbands[sbr->num_patches] > 0) {
+            usb = sb;
+            msb = sb;
+            sbr->num_patches++;
+        } else
+            msb = sbr->kx[1];
+
+        if (sbr->f_master[k] - sb < 3)
+            k = sbr->n_master;
+    } while (sb != sbr->kx[1] + sbr->m[1]);
+
+    if (sbr->num_patches > 1 &&
+        sbr->patch_num_subbands[sbr->num_patches - 1] < 3)
+        sbr->num_patches--;
+
+    return 0;
+}
+
+/// Derived Frequency Band Tables (14496-3 sp04 p197)
+static int sbr_make_f_derived(AACContext *ac, SpectralBandReplication *sbr)
+{
+    int k, temp;
+#if USE_FIXED
+    int nz = 0;
+#endif /* USE_FIXED */
+
+    sbr->n[1] = sbr->n_master - sbr->spectrum_params.bs_xover_band;
+    sbr->n[0] = (sbr->n[1] + 1) >> 1;
+
+    memcpy(sbr->f_tablehigh, &sbr->f_master[sbr->spectrum_params.bs_xover_band],
+           (sbr->n[1] + 1) * sizeof(sbr->f_master[0]));
+    sbr->m[1] = sbr->f_tablehigh[sbr->n[1]] - sbr->f_tablehigh[0];
+    sbr->kx[1] = sbr->f_tablehigh[0];
+
+    // Requirements (14496-3 sp04 p205)
+    if (sbr->kx[1] + sbr->m[1] > 64) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Stop frequency border too high: %d\n", sbr->kx[1] + sbr->m[1]);
+        return -1;
+    }
+    if (sbr->kx[1] > 32) {
+        av_log(ac->avctx, AV_LOG_ERROR, "Start frequency border too high: %d\n", sbr->kx[1]);
+        return -1;
+    }
+
+    sbr->f_tablelow[0] = sbr->f_tablehigh[0];
+    temp = sbr->n[1] & 1;
+    for (k = 1; k <= sbr->n[0]; k++)
+        sbr->f_tablelow[k] = sbr->f_tablehigh[2 * k - temp];
+#if USE_FIXED
+    temp = (sbr->k[2] << 23) / sbr->kx[1];
+    while (temp < 0x40000000) {
+        temp <<= 1;
+        nz++;
+    }
+    temp = fixed_log(temp - 0x80000000);
+    temp = (int)(((int64_t)temp * CONST_RECIP_LN2 + 0x20000000) >> 30);
+    temp = (((temp + 0x80) >> 8) + ((8 - nz) << 23)) * sbr->spectrum_params.bs_noise_bands;
+
+    sbr->n_q = (temp + 0x400000) >> 23;
+    if (sbr->n_q < 1)
+        sbr->n_q = 1;
+#else
+    sbr->n_q = FFMAX(1, lrintf(sbr->spectrum_params.bs_noise_bands *
+                               log2f(sbr->k[2] / (float)sbr->kx[1]))); // 0 <= bs_noise_bands <= 3
+#endif /* USE_FIXED */
+
+    if (sbr->n_q > 5) {
+        av_log(ac->avctx, AV_LOG_ERROR, "Too many noise floor scale factors: %d\n", sbr->n_q);
+        return -1;
+    }
+
+    sbr->f_tablenoise[0] = sbr->f_tablelow[0];
+    temp = 0;
+    for (k = 1; k <= sbr->n_q; k++) {
+        temp += (sbr->n[0] - temp) / (sbr->n_q + 1 - k);
+        sbr->f_tablenoise[k] = sbr->f_tablelow[temp];
+    }
+
+    if (sbr_hf_calc_npatches(ac, sbr) < 0)
+        return -1;
+
+    sbr_make_f_tablelim(sbr);
+
+    sbr->data[0].f_indexnoise = 0;
+    sbr->data[1].f_indexnoise = 0;
+
+    return 0;
+}
+
+static av_always_inline void get_bits1_vector(GetBitContext *gb, uint8_t *vec,
+                                              int elements)
+{
+    int i;
+    for (i = 0; i < elements; i++) {
+        vec[i] = get_bits1(gb);
+    }
+}
+
+/** ceil(log2(index+1)) */
+static const int8_t ceil_log2[] = {
+    0, 1, 2, 2, 3, 3,
+};
+
+static int read_sbr_grid(AACContext *ac, SpectralBandReplication *sbr,
+                         GetBitContext *gb, SBRData *ch_data)
+{
+    int i;
+    int bs_pointer = 0;
+    // frameLengthFlag ? 15 : 16; 960 sample length frames unsupported; this value is numTimeSlots
+    int abs_bord_trail = 16;
+    int num_rel_lead, num_rel_trail;
+    unsigned bs_num_env_old = ch_data->bs_num_env;
+
+    ch_data->bs_freq_res[0] = ch_data->bs_freq_res[ch_data->bs_num_env];
+    ch_data->bs_amp_res = sbr->bs_amp_res_header;
+    ch_data->t_env_num_env_old = ch_data->t_env[bs_num_env_old];
+
+    switch (ch_data->bs_frame_class = get_bits(gb, 2)) {
+    case FIXFIX:
+        ch_data->bs_num_env                 = 1 << get_bits(gb, 2);
+        num_rel_lead                        = ch_data->bs_num_env - 1;
+        if (ch_data->bs_num_env == 1)
+            ch_data->bs_amp_res = 0;
+
+        if (ch_data->bs_num_env > 4) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "Invalid bitstream, too many SBR envelopes in FIXFIX type SBR frame: %d\n",
+                   ch_data->bs_num_env);
+            return -1;
+        }
+
+        ch_data->t_env[0]                   = 0;
+        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
+
+        abs_bord_trail = (abs_bord_trail + (ch_data->bs_num_env >> 1)) /
+                   ch_data->bs_num_env;
+        for (i = 0; i < num_rel_lead; i++)
+            ch_data->t_env[i + 1] = ch_data->t_env[i] + abs_bord_trail;
+
+        ch_data->bs_freq_res[1] = get_bits1(gb);
+        for (i = 1; i < ch_data->bs_num_env; i++)
+            ch_data->bs_freq_res[i + 1] = ch_data->bs_freq_res[1];
+        break;
+    case FIXVAR:
+        abs_bord_trail                     += get_bits(gb, 2);
+        num_rel_trail                       = get_bits(gb, 2);
+        ch_data->bs_num_env                 = num_rel_trail + 1;
+        ch_data->t_env[0]                   = 0;
+        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
+
+        for (i = 0; i < num_rel_trail; i++)
+            ch_data->t_env[ch_data->bs_num_env - 1 - i] =
+                ch_data->t_env[ch_data->bs_num_env - i] - 2 * get_bits(gb, 2) - 2;
+
+        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
+
+        for (i = 0; i < ch_data->bs_num_env; i++)
+            ch_data->bs_freq_res[ch_data->bs_num_env - i] = get_bits1(gb);
+        break;
+    case VARFIX:
+        ch_data->t_env[0]                   = get_bits(gb, 2);
+        num_rel_lead                        = get_bits(gb, 2);
+        ch_data->bs_num_env                 = num_rel_lead + 1;
+        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
+
+        for (i = 0; i < num_rel_lead; i++)
+            ch_data->t_env[i + 1] = ch_data->t_env[i] + 2 * get_bits(gb, 2) + 2;
+
+        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
+
+        get_bits1_vector(gb, ch_data->bs_freq_res + 1, ch_data->bs_num_env);
+        break;
+    case VARVAR:
+        ch_data->t_env[0]                   = get_bits(gb, 2);
+        abs_bord_trail                     += get_bits(gb, 2);
+        num_rel_lead                        = get_bits(gb, 2);
+        num_rel_trail                       = get_bits(gb, 2);
+        ch_data->bs_num_env                 = num_rel_lead + num_rel_trail + 1;
+
+        if (ch_data->bs_num_env > 5) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "Invalid bitstream, too many SBR envelopes in VARVAR type SBR frame: %d\n",
+                   ch_data->bs_num_env);
+            return -1;
+        }
+
+        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
+
+        for (i = 0; i < num_rel_lead; i++)
+            ch_data->t_env[i + 1] = ch_data->t_env[i] + 2 * get_bits(gb, 2) + 2;
+        for (i = 0; i < num_rel_trail; i++)
+            ch_data->t_env[ch_data->bs_num_env - 1 - i] =
+                ch_data->t_env[ch_data->bs_num_env - i] - 2 * get_bits(gb, 2) - 2;
+
+        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
+
+        get_bits1_vector(gb, ch_data->bs_freq_res + 1, ch_data->bs_num_env);
+        break;
+    }
+
+    av_assert0(bs_pointer >= 0);
+    if (bs_pointer > ch_data->bs_num_env + 1) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Invalid bitstream, bs_pointer points to a middle noise border outside the time borders table: %d\n",
+               bs_pointer);
+        return -1;
+    }
+
+    for (i = 1; i <= ch_data->bs_num_env; i++) {
+        if (ch_data->t_env[i-1] > ch_data->t_env[i]) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Non monotone time borders\n");
+            return -1;
+        }
+    }
+
+    ch_data->bs_num_noise = (ch_data->bs_num_env > 1) + 1;
+
+    ch_data->t_q[0]                     = ch_data->t_env[0];
+    ch_data->t_q[ch_data->bs_num_noise] = ch_data->t_env[ch_data->bs_num_env];
+    if (ch_data->bs_num_noise > 1) {
+        int idx;
+        if (ch_data->bs_frame_class == FIXFIX) {
+            idx = ch_data->bs_num_env >> 1;
+        } else if (ch_data->bs_frame_class & 1) { // FIXVAR or VARVAR
+            idx = ch_data->bs_num_env - FFMAX(bs_pointer - 1, 1);
+        } else { // VARFIX
+            if (!bs_pointer)
+                idx = 1;
+            else if (bs_pointer == 1)
+                idx = ch_data->bs_num_env - 1;
+            else // bs_pointer > 1
+                idx = bs_pointer - 1;
+        }
+        ch_data->t_q[1] = ch_data->t_env[idx];
+    }
+
+    ch_data->e_a[0] = -(ch_data->e_a[1] != bs_num_env_old); // l_APrev
+    ch_data->e_a[1] = -1;
+    if ((ch_data->bs_frame_class & 1) && bs_pointer) { // FIXVAR or VARVAR and bs_pointer != 0
+        ch_data->e_a[1] = ch_data->bs_num_env + 1 - bs_pointer;
+    } else if ((ch_data->bs_frame_class == 2) && (bs_pointer > 1)) // VARFIX and bs_pointer > 1
+        ch_data->e_a[1] = bs_pointer - 1;
+
+    return 0;
+}
+
+static void copy_sbr_grid(SBRData *dst, const SBRData *src) {
+    //These variables are saved from the previous frame rather than copied
+    dst->bs_freq_res[0]    = dst->bs_freq_res[dst->bs_num_env];
+    dst->t_env_num_env_old = dst->t_env[dst->bs_num_env];
+    dst->e_a[0]            = -(dst->e_a[1] != dst->bs_num_env);
+
+    //These variables are read from the bitstream and therefore copied
+    memcpy(dst->bs_freq_res+1, src->bs_freq_res+1, sizeof(dst->bs_freq_res)-sizeof(*dst->bs_freq_res));
+    memcpy(dst->t_env,         src->t_env,         sizeof(dst->t_env));
+    memcpy(dst->t_q,           src->t_q,           sizeof(dst->t_q));
+    dst->bs_num_env        = src->bs_num_env;
+    dst->bs_amp_res        = src->bs_amp_res;
+    dst->bs_num_noise      = src->bs_num_noise;
+    dst->bs_frame_class    = src->bs_frame_class;
+    dst->e_a[1]            = src->e_a[1];
+}
+
+/// Read how the envelope and noise floor data is delta coded
+static void read_sbr_dtdf(SpectralBandReplication *sbr, GetBitContext *gb,
+                          SBRData *ch_data)
+{
+    get_bits1_vector(gb, ch_data->bs_df_env,   ch_data->bs_num_env);
+    get_bits1_vector(gb, ch_data->bs_df_noise, ch_data->bs_num_noise);
+}
+
+/// Read inverse filtering data
+static void read_sbr_invf(SpectralBandReplication *sbr, GetBitContext *gb,
+                          SBRData *ch_data)
+{
+    int i;
+
+    memcpy(ch_data->bs_invf_mode[1], ch_data->bs_invf_mode[0], 5 * sizeof(uint8_t));
+    for (i = 0; i < sbr->n_q; i++)
+        ch_data->bs_invf_mode[0][i] = get_bits(gb, 2);
+}
+
+static void read_sbr_envelope(SpectralBandReplication *sbr, GetBitContext *gb,
+                              SBRData *ch_data, int ch)
+{
+    int bits;
+    int i, j, k;
+    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
+    int t_lav, f_lav;
+    const int delta = (ch == 1 && sbr->bs_coupling == 1) + 1;
+    const int odd = sbr->n[1] & 1;
+
+    if (sbr->bs_coupling && ch) {
+        if (ch_data->bs_amp_res) {
+            bits   = 5;
+            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_3_0DB].table;
+            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_3_0DB];
+            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
+            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
+        } else {
+            bits   = 6;
+            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_1_5DB].table;
+            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_1_5DB];
+            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_1_5DB].table;
+            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_1_5DB];
+        }
+    } else {
+        if (ch_data->bs_amp_res) {
+            bits   = 6;
+            t_huff = vlc_sbr[T_HUFFMAN_ENV_3_0DB].table;
+            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_3_0DB];
+            f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
+            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
+        } else {
+            bits   = 7;
+            t_huff = vlc_sbr[T_HUFFMAN_ENV_1_5DB].table;
+            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_1_5DB];
+            f_huff = vlc_sbr[F_HUFFMAN_ENV_1_5DB].table;
+            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_1_5DB];
+        }
+    }
+
+#if USE_FIXED
+    for (i = 0; i < ch_data->bs_num_env; i++) {
+        if (ch_data->bs_df_env[i]) {
+            // bs_freq_res[0] == bs_freq_res[bs_num_env] from prev frame
+            if (ch_data->bs_freq_res[i + 1] == ch_data->bs_freq_res[i]) {
+                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++)
+                    ch_data->env_facs[i + 1][j].mant = ch_data->env_facs[i][j].mant + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
+            } else if (ch_data->bs_freq_res[i + 1]) {
+                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
+                    k = (j + odd) >> 1; // find k such that f_tablelow[k] <= f_tablehigh[j] < f_tablelow[k + 1]
+                    ch_data->env_facs[i + 1][j].mant = ch_data->env_facs[i][k].mant + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
+                }
+            } else {
+                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
+                    k = j ? 2*j - odd : 0; // find k such that f_tablehigh[k] == f_tablelow[j]
+                    ch_data->env_facs[i + 1][j].mant = ch_data->env_facs[i][k].mant + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
+                }
+            }
+        } else {
+            ch_data->env_facs[i + 1][0].mant = delta * get_bits(gb, bits); // bs_env_start_value_balance
+            for (j = 1; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++)
+                ch_data->env_facs[i + 1][j].mant = ch_data->env_facs[i + 1][j - 1].mant + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
+        }
+    }
+#else
+    for (i = 0; i < ch_data->bs_num_env; i++) {
+        if (ch_data->bs_df_env[i]) {
+            // bs_freq_res[0] == bs_freq_res[bs_num_env] from prev frame
+            if (ch_data->bs_freq_res[i + 1] == ch_data->bs_freq_res[i]) {
+                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++)
+                    ch_data->env_facs[i + 1][j] = ch_data->env_facs[i][j] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
+            } else if (ch_data->bs_freq_res[i + 1]) {
+                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
+                    k = (j + odd) >> 1; // find k such that f_tablelow[k] <= f_tablehigh[j] < f_tablelow[k + 1]
+                    ch_data->env_facs[i + 1][j] = ch_data->env_facs[i][k] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
+                }
+            } else {
+                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
+                    k = j ? 2*j - odd : 0; // find k such that f_tablehigh[k] == f_tablelow[j]
+                    ch_data->env_facs[i + 1][j] = ch_data->env_facs[i][k] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
+                }
+            }
+        } else {
+            ch_data->env_facs[i + 1][0] = delta * get_bits(gb, bits); // bs_env_start_value_balance
+            for (j = 1; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++)
+                ch_data->env_facs[i + 1][j] = ch_data->env_facs[i + 1][j - 1] + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
+        }
+    }
+#endif /* USE_FIXED */
+
+    //assign 0th elements of env_facs from last elements
+    memcpy(ch_data->env_facs[0], ch_data->env_facs[ch_data->bs_num_env],
+           sizeof(ch_data->env_facs[0]));
+}
+
+static void read_sbr_noise(SpectralBandReplication *sbr, GetBitContext *gb,
+                           SBRData *ch_data, int ch)
+{
+    int i, j;
+    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
+    int t_lav, f_lav;
+    int delta = (ch == 1 && sbr->bs_coupling == 1) + 1;
+
+    if (sbr->bs_coupling && ch) {
+        t_huff = vlc_sbr[T_HUFFMAN_NOISE_BAL_3_0DB].table;
+        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_BAL_3_0DB];
+        f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
+        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
+    } else {
+        t_huff = vlc_sbr[T_HUFFMAN_NOISE_3_0DB].table;
+        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_3_0DB];
+        f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
+        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
+    }
+
+#if USE_FIXED
+    for (i = 0; i < ch_data->bs_num_noise; i++) {
+        if (ch_data->bs_df_noise[i]) {
+            for (j = 0; j < sbr->n_q; j++)
+                ch_data->noise_facs[i + 1][j].mant = ch_data->noise_facs[i][j].mant + delta * (get_vlc2(gb, t_huff, 9, 2) - t_lav);
+        } else {
+            ch_data->noise_facs[i + 1][0].mant = delta * get_bits(gb, 5); // bs_noise_start_value_balance or bs_noise_start_value_level
+            for (j = 1; j < sbr->n_q; j++)
+                ch_data->noise_facs[i + 1][j].mant = ch_data->noise_facs[i + 1][j - 1].mant + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
+        }
+    }
+#else
+    for (i = 0; i < ch_data->bs_num_noise; i++) {
+        if (ch_data->bs_df_noise[i]) {
+            for (j = 0; j < sbr->n_q; j++)
+                ch_data->noise_facs[i + 1][j] = ch_data->noise_facs[i][j] + delta * (get_vlc2(gb, t_huff, 9, 2) - t_lav);
+        } else {
+            ch_data->noise_facs[i + 1][0] = delta * get_bits(gb, 5); // bs_noise_start_value_balance or bs_noise_start_value_level
+            for (j = 1; j < sbr->n_q; j++)
+                ch_data->noise_facs[i + 1][j] = ch_data->noise_facs[i + 1][j - 1] + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
+        }
+    }
+#endif /* USE_FIXED */
+
+    //assign 0th elements of noise_facs from last elements
+    memcpy(ch_data->noise_facs[0], ch_data->noise_facs[ch_data->bs_num_noise],
+           sizeof(ch_data->noise_facs[0]));
+}
+
+static void read_sbr_extension(AACContext *ac, SpectralBandReplication *sbr,
+                               GetBitContext *gb,
+                               int bs_extension_id, int *num_bits_left)
+{
+    switch (bs_extension_id) {
+    case EXTENSION_ID_PS:
+        if (!ac->oc[1].m4ac.ps) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Parametric Stereo signaled to be not-present but was found in the bitstream.\n");
+            skip_bits_long(gb, *num_bits_left); // bs_fill_bits
+            *num_bits_left = 0;
+        } else {
+#if 1
+            *num_bits_left -= AAC_RENAME(ff_ps_read_data)(ac->avctx, gb, &sbr->ps, *num_bits_left);
+            ac->avctx->profile = FF_PROFILE_AAC_HE_V2;
+#else
+            avpriv_report_missing_feature(ac->avctx, "Parametric Stereo");
+            skip_bits_long(gb, *num_bits_left); // bs_fill_bits
+            *num_bits_left = 0;
+#endif
+        }
+        break;
+    default:
+        // some files contain 0-padding
+        if (bs_extension_id || *num_bits_left > 16 || show_bits(gb, *num_bits_left))
+            avpriv_request_sample(ac->avctx, "Reserved SBR extensions");
+        skip_bits_long(gb, *num_bits_left); // bs_fill_bits
+        *num_bits_left = 0;
+        break;
+    }
+}
+
+static int read_sbr_single_channel_element(AACContext *ac,
+                                            SpectralBandReplication *sbr,
+                                            GetBitContext *gb)
+{
+    if (get_bits1(gb)) // bs_data_extra
+        skip_bits(gb, 4); // bs_reserved
+
+    if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
+        return -1;
+    read_sbr_dtdf(sbr, gb, &sbr->data[0]);
+    read_sbr_invf(sbr, gb, &sbr->data[0]);
+    read_sbr_envelope(sbr, gb, &sbr->data[0], 0);
+    read_sbr_noise(sbr, gb, &sbr->data[0], 0);
+
+    if ((sbr->data[0].bs_add_harmonic_flag = get_bits1(gb)))
+        get_bits1_vector(gb, sbr->data[0].bs_add_harmonic, sbr->n[1]);
+
+    return 0;
+}
+
+static int read_sbr_channel_pair_element(AACContext *ac,
+                                          SpectralBandReplication *sbr,
+                                          GetBitContext *gb)
+{
+    if (get_bits1(gb))    // bs_data_extra
+        skip_bits(gb, 8); // bs_reserved
+
+    if ((sbr->bs_coupling = get_bits1(gb))) {
+        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
+            return -1;
+        copy_sbr_grid(&sbr->data[1], &sbr->data[0]);
+        read_sbr_dtdf(sbr, gb, &sbr->data[0]);
+        read_sbr_dtdf(sbr, gb, &sbr->data[1]);
+        read_sbr_invf(sbr, gb, &sbr->data[0]);
+        memcpy(sbr->data[1].bs_invf_mode[1], sbr->data[1].bs_invf_mode[0], sizeof(sbr->data[1].bs_invf_mode[0]));
+        memcpy(sbr->data[1].bs_invf_mode[0], sbr->data[0].bs_invf_mode[0], sizeof(sbr->data[1].bs_invf_mode[0]));
+        read_sbr_envelope(sbr, gb, &sbr->data[0], 0);
+        read_sbr_noise(sbr, gb, &sbr->data[0], 0);
+        read_sbr_envelope(sbr, gb, &sbr->data[1], 1);
+        read_sbr_noise(sbr, gb, &sbr->data[1], 1);
+    } else {
+        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]) ||
+            read_sbr_grid(ac, sbr, gb, &sbr->data[1]))
+            return -1;
+        read_sbr_dtdf(sbr, gb, &sbr->data[0]);
+        read_sbr_dtdf(sbr, gb, &sbr->data[1]);
+        read_sbr_invf(sbr, gb, &sbr->data[0]);
+        read_sbr_invf(sbr, gb, &sbr->data[1]);
+        read_sbr_envelope(sbr, gb, &sbr->data[0], 0);
+        read_sbr_envelope(sbr, gb, &sbr->data[1], 1);
+        read_sbr_noise(sbr, gb, &sbr->data[0], 0);
+        read_sbr_noise(sbr, gb, &sbr->data[1], 1);
+    }
+
+    if ((sbr->data[0].bs_add_harmonic_flag = get_bits1(gb)))
+        get_bits1_vector(gb, sbr->data[0].bs_add_harmonic, sbr->n[1]);
+    if ((sbr->data[1].bs_add_harmonic_flag = get_bits1(gb)))
+        get_bits1_vector(gb, sbr->data[1].bs_add_harmonic, sbr->n[1]);
+
+    return 0;
+}
+
+static unsigned int read_sbr_data(AACContext *ac, SpectralBandReplication *sbr,
+                                  GetBitContext *gb, int id_aac)
+{
+    unsigned int cnt = get_bits_count(gb);
+
+    sbr->id_aac = id_aac;
+
+    if (id_aac == TYPE_SCE || id_aac == TYPE_CCE) {
+        if (read_sbr_single_channel_element(ac, sbr, gb)) {
+            sbr_turnoff(sbr);
+            return get_bits_count(gb) - cnt;
+        }
+    } else if (id_aac == TYPE_CPE) {
+        if (read_sbr_channel_pair_element(ac, sbr, gb)) {
+            sbr_turnoff(sbr);
+            return get_bits_count(gb) - cnt;
+        }
+    } else {
+        av_log(ac->avctx, AV_LOG_ERROR,
+            "Invalid bitstream - cannot apply SBR to element type %d\n", id_aac);
+        sbr_turnoff(sbr);
+        return get_bits_count(gb) - cnt;
+    }
+    if (get_bits1(gb)) { // bs_extended_data
+        int num_bits_left = get_bits(gb, 4); // bs_extension_size
+        if (num_bits_left == 15)
+            num_bits_left += get_bits(gb, 8); // bs_esc_count
+
+        num_bits_left <<= 3;
+        while (num_bits_left > 7) {
+            num_bits_left -= 2;
+            read_sbr_extension(ac, sbr, gb, get_bits(gb, 2), &num_bits_left); // bs_extension_id
+        }
+        if (num_bits_left < 0) {
+            av_log(ac->avctx, AV_LOG_ERROR, "SBR Extension over read.\n");
+        }
+        if (num_bits_left > 0)
+            skip_bits(gb, num_bits_left);
+    }
+
+    return get_bits_count(gb) - cnt;
+}
+
+static void sbr_reset(AACContext *ac, SpectralBandReplication *sbr)
+{
+    int err;
+    err = sbr_make_f_master(ac, sbr, &sbr->spectrum_params);
+    if (err >= 0)
+        err = sbr_make_f_derived(ac, sbr);
+    if (err < 0) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "SBR reset failed. Switching SBR to pure upsampling mode.\n");
+        sbr_turnoff(sbr);
+    }
+}
+
+/**
+ * Decode Spectral Band Replication extension data; reference: table 4.55.
+ *
+ * @param   crc flag indicating the presence of CRC checksum
+ * @param   cnt length of TYPE_FIL syntactic element in bytes
+ *
+ * @return  Returns number of bytes consumed from the TYPE_FIL element.
+ */
+int AAC_RENAME(ff_decode_sbr_extension)(AACContext *ac, SpectralBandReplication *sbr,
+                            GetBitContext *gb_host, int crc, int cnt, int id_aac)
+{
+    unsigned int num_sbr_bits = 0, num_align_bits;
+    unsigned bytes_read;
+    GetBitContext gbc = *gb_host, *gb = &gbc;
+    skip_bits_long(gb_host, cnt*8 - 4);
+
+    sbr->reset = 0;
+
+    if (!sbr->sample_rate)
+        sbr->sample_rate = 2 * ac->oc[1].m4ac.sample_rate; //TODO use the nominal sample rate for arbitrary sample rate support
+    if (!ac->oc[1].m4ac.ext_sample_rate)
+        ac->oc[1].m4ac.ext_sample_rate = 2 * ac->oc[1].m4ac.sample_rate;
+
+    if (crc) {
+        skip_bits(gb, 10); // bs_sbr_crc_bits; TODO - implement CRC check
+        num_sbr_bits += 10;
+    }
+
+    //Save some state from the previous frame.
+    sbr->kx[0] = sbr->kx[1];
+    sbr->m[0] = sbr->m[1];
+    sbr->kx_and_m_pushed = 1;
+
+    num_sbr_bits++;
+    if (get_bits1(gb)) // bs_header_flag
+        num_sbr_bits += read_sbr_header(sbr, gb);
+
+    if (sbr->reset)
+        sbr_reset(ac, sbr);
+
+    if (sbr->start)
+        num_sbr_bits  += read_sbr_data(ac, sbr, gb, id_aac);
+
+    num_align_bits = ((cnt << 3) - 4 - num_sbr_bits) & 7;
+    bytes_read = ((num_sbr_bits + num_align_bits + 4) >> 3);
+
+    if (bytes_read > cnt) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Expected to read %d SBR bytes actually read %d.\n", cnt, bytes_read);
+    }
+    return cnt;
+}
+
+/**
+ * Analysis QMF Bank (14496-3 sp04 p206)
+ *
+ * @param   x       pointer to the beginning of the first sample window
+ * @param   W       array of complex-valued samples split into subbands
+ */
+#ifndef sbr_qmf_analysis
+#if USE_FIXED
+static void sbr_qmf_analysis(AVFixedDSPContext *dsp, FFTContext *mdct,
+#else
+static void sbr_qmf_analysis(AVFloatDSPContext *dsp, FFTContext *mdct,
+#endif /* USE_FIXED */
+                             SBRDSPContext *sbrdsp, const INTFLOAT *in, INTFLOAT *x,
+                             INTFLOAT z[320], INTFLOAT W[2][32][32][2], int buf_idx)
+{
+    int i;
+    memcpy(x    , x+1024, (320-32)*sizeof(x[0]));
+    memcpy(x+288, in,         1024*sizeof(x[0]));
+    for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample frames
+                               // are not supported
+        dsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320);
+        sbrdsp->sum64x5(z);
+        sbrdsp->qmf_pre_shuffle(z);
+        mdct->imdct_half(mdct, z, z+64);
+        sbrdsp->qmf_post_shuffle(W[buf_idx][i], z);
+        x += 32;
+    }
+}
+#endif
+
+/**
+ * Synthesis QMF Bank (14496-3 sp04 p206) and Downsampled Synthesis QMF Bank
+ * (14496-3 sp04 p206)
+ */
+#ifndef sbr_qmf_synthesis
+static void sbr_qmf_synthesis(FFTContext *mdct,
+#if USE_FIXED
+                              SBRDSPContext *sbrdsp, AVFixedDSPContext *dsp,
+#else
+                              SBRDSPContext *sbrdsp, AVFloatDSPContext *dsp,
+#endif /* USE_FIXED */
+                              INTFLOAT *out, INTFLOAT X[2][38][64],
+                              INTFLOAT mdct_buf[2][64],
+                              INTFLOAT *v0, int *v_off, const unsigned int div)
+{
+    int i, n;
+    const INTFLOAT *sbr_qmf_window = div ? sbr_qmf_window_ds : sbr_qmf_window_us;
+    const int step = 128 >> div;
+    INTFLOAT *v;
+    for (i = 0; i < 32; i++) {
+        if (*v_off < step) {
+            int saved_samples = (1280 - 128) >> div;
+            memcpy(&v0[SBR_SYNTHESIS_BUF_SIZE - saved_samples], v0, saved_samples * sizeof(INTFLOAT));
+            *v_off = SBR_SYNTHESIS_BUF_SIZE - saved_samples - step;
+        } else {
+            *v_off -= step;
+        }
+        v = v0 + *v_off;
+        if (div) {
+            for (n = 0; n < 32; n++) {
+                X[0][i][   n] = -X[0][i][n];
+                X[0][i][32+n] =  X[1][i][31-n];
+            }
+            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
+            sbrdsp->qmf_deint_neg(v, mdct_buf[0]);
+        } else {
+            sbrdsp->neg_odd_64(X[1][i]);
+            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
+            mdct->imdct_half(mdct, mdct_buf[1], X[1][i]);
+            sbrdsp->qmf_deint_bfly(v, mdct_buf[1], mdct_buf[0]);
+        }
+        dsp->vector_fmul    (out, v                , sbr_qmf_window                       , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 256 >> div), sbr_qmf_window + (128 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 448 >> div), sbr_qmf_window + (192 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 512 >> div), sbr_qmf_window + (256 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 704 >> div), sbr_qmf_window + (320 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 768 >> div), sbr_qmf_window + (384 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 960 >> div), sbr_qmf_window + (448 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + (1024 >> div), sbr_qmf_window + (512 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + (1216 >> div), sbr_qmf_window + (576 >> div), out   , 64 >> div);
+        out += 64 >> div;
+    }
+}
+#endif
+
+/// Generate the subband filtered lowband
+static int sbr_lf_gen(AACContext *ac, SpectralBandReplication *sbr,
+                      INTFLOAT X_low[32][40][2], const INTFLOAT W[2][32][32][2],
+                      int buf_idx)
+{
+    int i, k;
+    const int t_HFGen = 8;
+    const int i_f = 32;
+    memset(X_low, 0, 32*sizeof(*X_low));
+    for (k = 0; k < sbr->kx[1]; k++) {
+        for (i = t_HFGen; i < i_f + t_HFGen; i++) {
+            X_low[k][i][0] = W[buf_idx][i - t_HFGen][k][0];
+            X_low[k][i][1] = W[buf_idx][i - t_HFGen][k][1];
+        }
+    }
+    buf_idx = 1-buf_idx;
+    for (k = 0; k < sbr->kx[0]; k++) {
+        for (i = 0; i < t_HFGen; i++) {
+            X_low[k][i][0] = W[buf_idx][i + i_f - t_HFGen][k][0];
+            X_low[k][i][1] = W[buf_idx][i + i_f - t_HFGen][k][1];
+        }
+    }
+    return 0;
+}
+
+/// High Frequency Generator (14496-3 sp04 p215)
+static int sbr_hf_gen(AACContext *ac, SpectralBandReplication *sbr,
+                      INTFLOAT X_high[64][40][2], const INTFLOAT X_low[32][40][2],
+                      const INTFLOAT (*alpha0)[2], const INTFLOAT (*alpha1)[2],
+                      const INTFLOAT bw_array[5], const uint8_t *t_env,
+                      int bs_num_env)
+{
+    int j, x;
+    int g = 0;
+    int k = sbr->kx[1];
+    for (j = 0; j < sbr->num_patches; j++) {
+        for (x = 0; x < sbr->patch_num_subbands[j]; x++, k++) {
+            const int p = sbr->patch_start_subband[j] + x;
+            while (g <= sbr->n_q && k >= sbr->f_tablenoise[g])
+                g++;
+            g--;
+
+            if (g < 0) {
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "ERROR : no subband found for frequency %d\n", k);
+                return -1;
+            }
+
+            sbr->dsp.hf_gen(X_high[k] + ENVELOPE_ADJUSTMENT_OFFSET,
+                            X_low[p]  + ENVELOPE_ADJUSTMENT_OFFSET,
+                            alpha0[p], alpha1[p], bw_array[g],
+                            2 * t_env[0], 2 * t_env[bs_num_env]);
+        }
+    }
+    if (k < sbr->m[1] + sbr->kx[1])
+        memset(X_high + k, 0, (sbr->m[1] + sbr->kx[1] - k) * sizeof(*X_high));
+
+    return 0;
+}
+
+/// Generate the subband filtered lowband
+static int sbr_x_gen(SpectralBandReplication *sbr, INTFLOAT X[2][38][64],
+                     const INTFLOAT Y0[38][64][2], const INTFLOAT Y1[38][64][2],
+                     const INTFLOAT X_low[32][40][2], int ch)
+{
+    int k, i;
+    const int i_f = 32;
+    const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);
+    memset(X, 0, 2*sizeof(*X));
+    for (k = 0; k < sbr->kx[0]; k++) {
+        for (i = 0; i < i_Temp; i++) {
+            X[0][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][0];
+            X[1][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][1];
+        }
+    }
+    for (; k < sbr->kx[0] + sbr->m[0]; k++) {
+        for (i = 0; i < i_Temp; i++) {
+            X[0][i][k] = Y0[i + i_f][k][0];
+            X[1][i][k] = Y0[i + i_f][k][1];
+        }
+    }
+
+    for (k = 0; k < sbr->kx[1]; k++) {
+        for (i = i_Temp; i < 38; i++) {
+            X[0][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][0];
+            X[1][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][1];
+        }
+    }
+    for (; k < sbr->kx[1] + sbr->m[1]; k++) {
+        for (i = i_Temp; i < i_f; i++) {
+            X[0][i][k] = Y1[i][k][0];
+            X[1][i][k] = Y1[i][k][1];
+        }
+    }
+    return 0;
+}
+
+/** High Frequency Adjustment (14496-3 sp04 p217) and Mapping
+ * (14496-3 sp04 p217)
+ */
+static int sbr_mapping(AACContext *ac, SpectralBandReplication *sbr,
+                        SBRData *ch_data, int e_a[2])
+{
+    int e, i, m;
+
+    memset(ch_data->s_indexmapped[1], 0, 7*sizeof(ch_data->s_indexmapped[1]));
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        const unsigned int ilim = sbr->n[ch_data->bs_freq_res[e + 1]];
+        uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow;
+        int k;
+
+        if (sbr->kx[1] != table[0]) {
+            av_log(ac->avctx, AV_LOG_ERROR, "kx != f_table{high,low}[0]. "
+                   "Derived frequency tables were not regenerated.\n");
+            sbr_turnoff(sbr);
+            return AVERROR_BUG;
+        }
+        for (i = 0; i < ilim; i++)
+            for (m = table[i]; m < table[i + 1]; m++)
+                sbr->e_origmapped[e][m - sbr->kx[1]] = ch_data->env_facs[e+1][i];
+
+        // ch_data->bs_num_noise > 1 => 2 noise floors
+        k = (ch_data->bs_num_noise > 1) && (ch_data->t_env[e] >= ch_data->t_q[1]);
+        for (i = 0; i < sbr->n_q; i++)
+            for (m = sbr->f_tablenoise[i]; m < sbr->f_tablenoise[i + 1]; m++)
+                sbr->q_mapped[e][m - sbr->kx[1]] = ch_data->noise_facs[k+1][i];
+
+        for (i = 0; i < sbr->n[1]; i++) {
+            if (ch_data->bs_add_harmonic_flag) {
+                const unsigned int m_midpoint =
+                    (sbr->f_tablehigh[i] + sbr->f_tablehigh[i + 1]) >> 1;
+
+                ch_data->s_indexmapped[e + 1][m_midpoint - sbr->kx[1]] = ch_data->bs_add_harmonic[i] *
+                    (e >= e_a[1] || (ch_data->s_indexmapped[0][m_midpoint - sbr->kx[1]] == 1));
+            }
+        }
+
+        for (i = 0; i < ilim; i++) {
+            int additional_sinusoid_present = 0;
+            for (m = table[i]; m < table[i + 1]; m++) {
+                if (ch_data->s_indexmapped[e + 1][m - sbr->kx[1]]) {
+                    additional_sinusoid_present = 1;
+                    break;
+                }
+            }
+            memset(&sbr->s_mapped[e][table[i] - sbr->kx[1]], additional_sinusoid_present,
+                   (table[i + 1] - table[i]) * sizeof(sbr->s_mapped[e][0]));
+        }
+    }
+
+    memcpy(ch_data->s_indexmapped[0], ch_data->s_indexmapped[ch_data->bs_num_env], sizeof(ch_data->s_indexmapped[0]));
+    return 0;
+}
+
+/// Estimation of current envelope (14496-3 sp04 p218)
+static void sbr_env_estimate(AAC_FLOAT (*e_curr)[48], INTFLOAT X_high[64][40][2],
+                             SpectralBandReplication *sbr, SBRData *ch_data)
+{
+    int e, m;
+    int kx1 = sbr->kx[1];
+
+    if (sbr->bs_interpol_freq) {
+        for (e = 0; e < ch_data->bs_num_env; e++) {
+#if USE_FIXED
+            const SoftFloat recip_env_size = av_int2sf(0x20000000 / (ch_data->t_env[e + 1] - ch_data->t_env[e]), 30);
+#else
+            const float recip_env_size = 0.5f / (ch_data->t_env[e + 1] - ch_data->t_env[e]);
+#endif /* USE_FIXED */
+            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
+            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
+
+            for (m = 0; m < sbr->m[1]; m++) {
+                AAC_FLOAT sum = sbr->dsp.sum_square(X_high[m+kx1] + ilb, iub - ilb);
+#if USE_FIXED
+                e_curr[e][m] = av_mul_sf(sum, recip_env_size);
+#else
+                e_curr[e][m] = sum * recip_env_size;
+#endif /* USE_FIXED */
+            }
+        }
+    } else {
+        int k, p;
+
+        for (e = 0; e < ch_data->bs_num_env; e++) {
+            const int env_size = 2 * (ch_data->t_env[e + 1] - ch_data->t_env[e]);
+            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
+            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
+            const uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow;
+
+            for (p = 0; p < sbr->n[ch_data->bs_freq_res[e + 1]]; p++) {
+#if USE_FIXED
+                SoftFloat sum = { 0, 0 };
+                const SoftFloat den = av_int2sf(0x20000000 / (env_size * (table[p + 1] - table[p])), 29);
+                for (k = table[p]; k < table[p + 1]; k++) {
+                    sum = av_add_sf(sum, sbr->dsp.sum_square(X_high[k] + ilb, iub - ilb));
+                }
+                sum = av_mul_sf(sum, den);
+#else
+                float sum = 0.0f;
+                const int den = env_size * (table[p + 1] - table[p]);
+
+                for (k = table[p]; k < table[p + 1]; k++) {
+                    sum += sbr->dsp.sum_square(X_high[k] + ilb, iub - ilb);
+                }
+                sum /= den;
+#endif /* USE_FIXED */
+                for (k = table[p]; k < table[p + 1]; k++) {
+                    e_curr[e][k - kx1] = sum;
+                }
+            }
+        }
+    }
+}
+
+void AAC_RENAME(ff_sbr_apply)(AACContext *ac, SpectralBandReplication *sbr, int id_aac,
+                  INTFLOAT* L, INTFLOAT* R)
+{
+    int downsampled = ac->oc[1].m4ac.ext_sample_rate < sbr->sample_rate;
+    int ch;
+    int nch = (id_aac == TYPE_CPE) ? 2 : 1;
+    int err;
+
+    if (id_aac != sbr->id_aac) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+            "element type mismatch %d != %d\n", id_aac, sbr->id_aac);
+        sbr_turnoff(sbr);
+    }
+
+    if (!sbr->kx_and_m_pushed) {
+        sbr->kx[0] = sbr->kx[1];
+        sbr->m[0] = sbr->m[1];
+    } else {
+        sbr->kx_and_m_pushed = 0;
+    }
+
+    if (sbr->start) {
+        sbr_dequant(sbr, id_aac);
+    }
+    for (ch = 0; ch < nch; ch++) {
+        /* decode channel */
+        sbr_qmf_analysis(ac->fdsp, &sbr->mdct_ana, &sbr->dsp, ch ? R : L, sbr->data[ch].analysis_filterbank_samples,
+                         (INTFLOAT*)sbr->qmf_filter_scratch,
+                         sbr->data[ch].W, sbr->data[ch].Ypos);
+        sbr->c.sbr_lf_gen(ac, sbr, sbr->X_low,
+                          (const INTFLOAT (*)[32][32][2]) sbr->data[ch].W,
+                          sbr->data[ch].Ypos);
+        sbr->data[ch].Ypos ^= 1;
+        if (sbr->start) {
+            sbr->c.sbr_hf_inverse_filter(&sbr->dsp, sbr->alpha0, sbr->alpha1,
+                                         (const INTFLOAT (*)[40][2]) sbr->X_low, sbr->k[0]);
+            sbr_chirp(sbr, &sbr->data[ch]);
+            av_assert0(sbr->data[ch].bs_num_env > 0);
+            sbr_hf_gen(ac, sbr, sbr->X_high,
+                       (const INTFLOAT (*)[40][2]) sbr->X_low,
+                       (const INTFLOAT (*)[2]) sbr->alpha0,
+                       (const INTFLOAT (*)[2]) sbr->alpha1,
+                       sbr->data[ch].bw_array, sbr->data[ch].t_env,
+                       sbr->data[ch].bs_num_env);
+
+            // hf_adj
+            err = sbr_mapping(ac, sbr, &sbr->data[ch], sbr->data[ch].e_a);
+            if (!err) {
+                sbr_env_estimate(sbr->e_curr, sbr->X_high, sbr, &sbr->data[ch]);
+                sbr_gain_calc(ac, sbr, &sbr->data[ch], sbr->data[ch].e_a);
+                sbr->c.sbr_hf_assemble(sbr->data[ch].Y[sbr->data[ch].Ypos],
+                                (const INTFLOAT (*)[40][2]) sbr->X_high,
+                                sbr, &sbr->data[ch],
+                                sbr->data[ch].e_a);
+            }
+        }
+
+        /* synthesis */
+        sbr->c.sbr_x_gen(sbr, sbr->X[ch],
+                  (const INTFLOAT (*)[64][2]) sbr->data[ch].Y[1-sbr->data[ch].Ypos],
+                  (const INTFLOAT (*)[64][2]) sbr->data[ch].Y[  sbr->data[ch].Ypos],
+                  (const INTFLOAT (*)[40][2]) sbr->X_low, ch);
+    }
+
+    if (ac->oc[1].m4ac.ps == 1) {
+        if (sbr->ps.start) {
+            AAC_RENAME(ff_ps_apply)(ac->avctx, &sbr->ps, sbr->X[0], sbr->X[1], sbr->kx[1] + sbr->m[1]);
+        } else {
+            memcpy(sbr->X[1], sbr->X[0], sizeof(sbr->X[0]));
+        }
+        nch = 2;
+    }
+
+    sbr_qmf_synthesis(&sbr->mdct, &sbr->dsp, ac->fdsp,
+                      L, sbr->X[0], sbr->qmf_filter_scratch,
+                      sbr->data[0].synthesis_filterbank_samples,
+                      &sbr->data[0].synthesis_filterbank_samples_offset,
+                      downsampled);
+    if (nch == 2)
+        sbr_qmf_synthesis(&sbr->mdct, &sbr->dsp, ac->fdsp,
+                          R, sbr->X[1], sbr->qmf_filter_scratch,
+                          sbr->data[1].synthesis_filterbank_samples,
+                          &sbr->data[1].synthesis_filterbank_samples_offset,
+                          downsampled);
+}
+
+static void aacsbr_func_ptr_init(AACSBRContext *c)
+{
+    c->sbr_lf_gen            = sbr_lf_gen;
+    c->sbr_hf_assemble       = sbr_hf_assemble;
+    c->sbr_x_gen             = sbr_x_gen;
+    c->sbr_hf_inverse_filter = sbr_hf_inverse_filter;
+
+#if !USE_FIXED
+    if(ARCH_MIPS)
+        ff_aacsbr_func_ptr_init_mips(c);
+#endif
+}
diff --git a/libavcodec/aacsbrdata.h b/libavcodec/aacsbrdata.h
index f3090591ed..4ff8fae913 100644
--- a/libavcodec/aacsbrdata.h
+++ b/libavcodec/aacsbrdata.h
@@ -2,20 +2,20 @@
  * AAC Spectral Band Replication decoding data
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,7 @@
 
 #include <stdint.h>
 #include "libavutil/mem.h"
+#include "aac_defines.h"
 
 ///< Huffman tables for SBR
 
@@ -266,351 +267,269 @@ static const int8_t sbr_offset[6][16] = {
     {-2, -1,  0,  1,  2,  3,  4,  5,  6,  7,  9, 11, 13, 16, 20, 24}, // 64000 Hz <  fs_sbr
 };
 
-///< window coefficients for analysis/synthesis QMF banks
-static DECLARE_ALIGNED(32, float, sbr_qmf_window_ds)[320];
-static DECLARE_ALIGNED(32, float, sbr_qmf_window_us)[640] = {
-     0.0000000000, -0.0005525286, -0.0005617692, -0.0004947518,
-    -0.0004875227, -0.0004893791, -0.0005040714, -0.0005226564,
-    -0.0005466565, -0.0005677802, -0.0005870930, -0.0006132747,
-    -0.0006312493, -0.0006540333, -0.0006777690, -0.0006941614,
-    -0.0007157736, -0.0007255043, -0.0007440941, -0.0007490598,
-    -0.0007681371, -0.0007724848, -0.0007834332, -0.0007779869,
-    -0.0007803664, -0.0007801449, -0.0007757977, -0.0007630793,
-    -0.0007530001, -0.0007319357, -0.0007215391, -0.0006917937,
-    -0.0006650415, -0.0006341594, -0.0005946118, -0.0005564576,
-    -0.0005145572, -0.0004606325, -0.0004095121, -0.0003501175,
-    -0.0002896981, -0.0002098337, -0.0001446380, -0.0000617334,
-     0.0000134949,  0.0001094383,  0.0002043017,  0.0002949531,
-     0.0004026540,  0.0005107388,  0.0006239376,  0.0007458025,
-     0.0008608443,  0.0009885988,  0.0011250155,  0.0012577884,
-     0.0013902494,  0.0015443219,  0.0016868083,  0.0018348265,
-     0.0019841140,  0.0021461583,  0.0023017254,  0.0024625616,
-     0.0026201758,  0.0027870464,  0.0029469447,  0.0031125420,
-     0.0032739613,  0.0034418874,  0.0036008268,  0.0037603922,
-     0.0039207432,  0.0040819753,  0.0042264269,  0.0043730719,
-     0.0045209852,  0.0046606460,  0.0047932560,  0.0049137603,
-     0.0050393022,  0.0051407353,  0.0052461166,  0.0053471681,
-     0.0054196775,  0.0054876040,  0.0055475714,  0.0055938023,
-     0.0056220643,  0.0056455196,  0.0056389199,  0.0056266114,
-     0.0055917128,  0.0055404363,  0.0054753783,  0.0053838975,
-     0.0052715758,  0.0051382275,  0.0049839687,  0.0048109469,
-     0.0046039530,  0.0043801861,  0.0041251642,  0.0038456408,
-     0.0035401246,  0.0032091885,  0.0028446757,  0.0024508540,
-     0.0020274176,  0.0015784682,  0.0010902329,  0.0005832264,
-     0.0000276045, -0.0005464280, -0.0011568135, -0.0018039472,
-    -0.0024826723, -0.0031933778, -0.0039401124, -0.0047222596,
-    -0.0055337211, -0.0063792293, -0.0072615816, -0.0081798233,
-    -0.0091325329, -0.0101150215, -0.0111315548, -0.0121849995,
-     0.0132718220,  0.0143904666,  0.0155405553,  0.0167324712,
-     0.0179433381,  0.0191872431,  0.0204531793,  0.0217467550,
-     0.0230680169,  0.0244160992,  0.0257875847,  0.0271859429,
-     0.0286072173,  0.0300502657,  0.0315017608,  0.0329754081,
-     0.0344620948,  0.0359697560,  0.0374812850,  0.0390053679,
-     0.0405349170,  0.0420649094,  0.0436097542,  0.0451488405,
-     0.0466843027,  0.0482165720,  0.0497385755,  0.0512556155,
-     0.0527630746,  0.0542452768,  0.0557173648,  0.0571616450,
-     0.0585915683,  0.0599837480,  0.0613455171,  0.0626857808,
-     0.0639715898,  0.0652247106,  0.0664367512,  0.0676075985,
-     0.0687043828,  0.0697630244,  0.0707628710,  0.0717002673,
-     0.0725682583,  0.0733620255,  0.0741003642,  0.0747452558,
-     0.0753137336,  0.0758008358,  0.0761992479,  0.0764992170,
-     0.0767093490,  0.0768173975,  0.0768230011,  0.0767204924,
-     0.0765050718,  0.0761748321,  0.0757305756,  0.0751576255,
-     0.0744664394,  0.0736406005,  0.0726774642,  0.0715826364,
-     0.0703533073,  0.0689664013,  0.0674525021,  0.0657690668,
-     0.0639444805,  0.0619602779,  0.0598166570,  0.0575152691,
-     0.0550460034,  0.0524093821,  0.0495978676,  0.0466303305,
-     0.0434768782,  0.0401458278,  0.0366418116,  0.0329583930,
-     0.0290824006,  0.0250307561,  0.0207997072,  0.0163701258,
-     0.0117623832,  0.0069636862,  0.0019765601, -0.0032086896,
-    -0.0085711749, -0.0141288827, -0.0198834129, -0.0258227288,
-    -0.0319531274, -0.0382776572, -0.0447806821, -0.0514804176,
-    -0.0583705326, -0.0654409853, -0.0726943300, -0.0801372934,
-    -0.0877547536, -0.0955533352, -0.1035329531, -0.1116826931,
-    -0.1200077984, -0.1285002850, -0.1371551761, -0.1459766491,
-    -0.1549607071, -0.1640958855, -0.1733808172, -0.1828172548,
-    -0.1923966745, -0.2021250176, -0.2119735853, -0.2219652696,
-    -0.2320690870, -0.2423016884, -0.2526480309, -0.2631053299,
-    -0.2736634040, -0.2843214189, -0.2950716717, -0.3059098575,
-    -0.3168278913, -0.3278113727, -0.3388722693, -0.3499914122,
-     0.3611589903,  0.3723795546,  0.3836350013,  0.3949211761,
-     0.4062317676,  0.4175696896,  0.4289119920,  0.4402553754,
-     0.4515996535,  0.4629308085,  0.4742453214,  0.4855253091,
-     0.4967708254,  0.5079817500,  0.5191234970,  0.5302240895,
-     0.5412553448,  0.5522051258,  0.5630789140,  0.5738524131,
-     0.5845403235,  0.5951123086,  0.6055783538,  0.6159109932,
-     0.6261242695,  0.6361980107,  0.6461269695,  0.6559016302,
-     0.6655139880,  0.6749663190,  0.6842353293,  0.6933282376,
-     0.7022388719,  0.7109410426,  0.7194462634,  0.7277448900,
-     0.7358211758,  0.7436827863,  0.7513137456,  0.7587080760,
-     0.7658674865,  0.7727780881,  0.7794287519,  0.7858353120,
-     0.7919735841,  0.7978466413,  0.8034485751,  0.8087695004,
-     0.8138191270,  0.8185776004,  0.8230419890,  0.8272275347,
-     0.8311038457,  0.8346937361,  0.8379717337,  0.8409541392,
-     0.8436238281,  0.8459818469,  0.8480315777,  0.8497805198,
-     0.8511971524,  0.8523047035,  0.8531020949,  0.8535720573,
-     0.8537385600,
-};
-
-/* First two entries repeated at end to simplify SIMD implementations. */
-const DECLARE_ALIGNED(16, float, ff_sbr_noise_table)[][2] = {
-{-0.99948153278296, -0.59483417516607}, { 0.97113454393991, -0.67528515225647},
-{ 0.14130051758487, -0.95090983575689}, {-0.47005496701697, -0.37340549728647},
-{ 0.80705063769351,  0.29653668284408}, {-0.38981478896926,  0.89572605717087},
-{-0.01053049862020, -0.66959058036166}, {-0.91266367957293, -0.11522938140034},
-{ 0.54840422910309,  0.75221367176302}, { 0.40009252867955, -0.98929400334421},
-{-0.99867974711855, -0.88147068645358}, {-0.95531076805040,  0.90908757154593},
-{-0.45725933317144, -0.56716323646760}, {-0.72929675029275, -0.98008272727324},
-{ 0.75622801399036,  0.20950329995549}, { 0.07069442601050, -0.78247898470706},
-{ 0.74496252926055, -0.91169004445807}, {-0.96440182703856, -0.94739918296622},
-{ 0.30424629369539, -0.49438267012479}, { 0.66565033746925,  0.64652935542491},
-{ 0.91697008020594,  0.17514097332009}, {-0.70774918760427,  0.52548653416543},
-{-0.70051415345560, -0.45340028808763}, {-0.99496513054797, -0.90071908066973},
-{ 0.98164490790123, -0.77463155528697}, {-0.54671580548181, -0.02570928536004},
-{-0.01689629065389,  0.00287506445732}, {-0.86110349531986,  0.42548583726477},
-{-0.98892980586032, -0.87881132267556}, { 0.51756627678691,  0.66926784710139},
-{-0.99635026409640, -0.58107730574765}, {-0.99969370862163,  0.98369989360250},
-{ 0.55266258627194,  0.59449057465591}, { 0.34581177741673,  0.94879421061866},
-{ 0.62664209577999, -0.74402970906471}, {-0.77149701404973, -0.33883658042801},
-{-0.91592244254432,  0.03687901376713}, {-0.76285492357887, -0.91371867919124},
-{ 0.79788337195331, -0.93180971199849}, { 0.54473080610200, -0.11919206037186},
-{-0.85639281671058,  0.42429854760451}, {-0.92882402971423,  0.27871809078609},
-{-0.11708371046774, -0.99800843444966}, { 0.21356749817493, -0.90716295627033},
-{-0.76191692573909,  0.99768118356265}, { 0.98111043100884, -0.95854459734407},
-{-0.85913269895572,  0.95766566168880}, {-0.93307242253692,  0.49431757696466},
-{ 0.30485754879632, -0.70540034357529}, { 0.85289650925190,  0.46766131791044},
-{ 0.91328082618125, -0.99839597361769}, {-0.05890199924154,  0.70741827819497},
-{ 0.28398686150148,  0.34633555702188}, { 0.95258164539612, -0.54893416026939},
-{-0.78566324168507, -0.75568541079691}, {-0.95789495447877, -0.20423194696966},
-{ 0.82411158711197,  0.96654618432562}, {-0.65185446735885, -0.88734990773289},
-{-0.93643603134666,  0.99870790442385}, { 0.91427159529618, -0.98290505544444},
-{-0.70395684036886,  0.58796798221039}, { 0.00563771969365,  0.61768196727244},
-{ 0.89065051931895,  0.52783352697585}, {-0.68683707712762,  0.80806944710339},
-{ 0.72165342518718, -0.69259857349564}, {-0.62928247730667,  0.13627037407335},
-{ 0.29938434065514, -0.46051329682246}, {-0.91781958879280, -0.74012716684186},
-{ 0.99298717043688,  0.40816610075661}, { 0.82368298622748, -0.74036047190173},
-{-0.98512833386833, -0.99972330709594}, {-0.95915368242257, -0.99237800466040},
-{-0.21411126572790, -0.93424819052545}, {-0.68821476106884, -0.26892306315457},
-{ 0.91851997982317,  0.09358228901785}, {-0.96062769559127,  0.36099095133739},
-{ 0.51646184922287, -0.71373332873917}, { 0.61130721139669,  0.46950141175917},
-{ 0.47336129371299, -0.27333178296162}, { 0.90998308703519,  0.96715662938132},
-{ 0.44844799194357,  0.99211574628306}, { 0.66614891079092,  0.96590176169121},
-{ 0.74922239129237, -0.89879858826087}, {-0.99571588506485,  0.52785521494349},
-{ 0.97401082477563, -0.16855870075190}, { 0.72683747733879, -0.48060774432251},
-{ 0.95432193457128,  0.68849603408441}, {-0.72962208425191, -0.76608443420917},
-{-0.85359479233537,  0.88738125901579}, {-0.81412430338535, -0.97480768049637},
-{-0.87930772356786,  0.74748307690436}, {-0.71573331064977, -0.98570608178923},
-{ 0.83524300028228,  0.83702537075163}, {-0.48086065601423, -0.98848504923531},
-{ 0.97139128574778,  0.80093621198236}, { 0.51992825347895,  0.80247631400510},
-{-0.00848591195325, -0.76670128000486}, {-0.70294374303036,  0.55359910445577},
-{-0.95894428168140, -0.43265504344783}, { 0.97079252950321,  0.09325857238682},
-{-0.92404293670797,  0.85507704027855}, {-0.69506469500450,  0.98633412625459},
-{ 0.26559203620024,  0.73314307966524}, { 0.28038443336943,  0.14537913654427},
-{-0.74138124825523,  0.99310339807762}, {-0.01752795995444, -0.82616635284178},
-{-0.55126773094930, -0.98898543862153}, { 0.97960898850996, -0.94021446752851},
-{-0.99196309146936,  0.67019017358456}, {-0.67684928085260,  0.12631491649378},
-{ 0.09140039465500, -0.20537731453108}, {-0.71658965751996, -0.97788200391224},
-{ 0.81014640078925,  0.53722648362443}, { 0.40616991671205, -0.26469008598449},
-{-0.67680188682972,  0.94502052337695}, { 0.86849774348749, -0.18333598647899},
-{-0.99500381284851, -0.02634122068550}, { 0.84329189340667,  0.10406957462213},
-{-0.09215968531446,  0.69540012101253}, { 0.99956173327206, -0.12358542001404},
-{-0.79732779473535, -0.91582524736159}, { 0.96349973642406,  0.96640458041000},
-{-0.79942778496547,  0.64323902822857}, {-0.11566039853896,  0.28587846253726},
-{-0.39922954514662,  0.94129601616966}, { 0.99089197565987, -0.92062625581587},
-{ 0.28631285179909, -0.91035047143603}, {-0.83302725605608, -0.67330410892084},
-{ 0.95404443402072,  0.49162765398743}, {-0.06449863579434,  0.03250560813135},
-{-0.99575054486311,  0.42389784469507}, {-0.65501142790847,  0.82546114655624},
-{-0.81254441908887, -0.51627234660629}, {-0.99646369485481,  0.84490533520752},
-{ 0.00287840603348,  0.64768261158166}, { 0.70176989408455, -0.20453028573322},
-{ 0.96361882270190,  0.40706967140989}, {-0.68883758192426,  0.91338958840772},
-{-0.34875585502238,  0.71472290693300}, { 0.91980081243087,  0.66507455644919},
-{-0.99009048343881,  0.85868021604848}, { 0.68865791458395,  0.55660316809678},
-{-0.99484402129368, -0.20052559254934}, { 0.94214511408023, -0.99696425367461},
-{-0.67414626793544,  0.49548221180078}, {-0.47339353684664, -0.85904328834047},
-{ 0.14323651387360, -0.94145598222488}, {-0.29268293575672,  0.05759224927952},
-{ 0.43793861458754, -0.78904969892724}, {-0.36345126374441,  0.64874435357162},
-{-0.08750604656825,  0.97686944362527}, {-0.96495267812511, -0.53960305946511},
-{ 0.55526940659947,  0.78891523734774}, { 0.73538215752630,  0.96452072373404},
-{-0.30889773919437, -0.80664389776860}, { 0.03574995626194, -0.97325616900959},
-{ 0.98720684660488,  0.48409133691962}, {-0.81689296271203, -0.90827703628298},
-{ 0.67866860118215,  0.81284503870856}, {-0.15808569732583,  0.85279555024382},
-{ 0.80723395114371, -0.24717418514605}, { 0.47788757329038, -0.46333147839295},
-{ 0.96367554763201,  0.38486749303242}, {-0.99143875716818, -0.24945277239809},
-{ 0.83081876925833, -0.94780851414763}, {-0.58753191905341,  0.01290772389163},
-{ 0.95538108220960, -0.85557052096538}, {-0.96490920476211, -0.64020970923102},
-{-0.97327101028521,  0.12378128133110}, { 0.91400366022124,  0.57972471346930},
-{-0.99925837363824,  0.71084847864067}, {-0.86875903507313, -0.20291699203564},
-{-0.26240034795124, -0.68264554369108}, {-0.24664412953388, -0.87642273115183},
-{ 0.02416275806869,  0.27192914288905}, { 0.82068619590515, -0.85087787994476},
-{ 0.88547373760759, -0.89636802901469}, {-0.18173078152226, -0.26152145156800},
-{ 0.09355476558534,  0.54845123045604}, {-0.54668414224090,  0.95980774020221},
-{ 0.37050990604091, -0.59910140383171}, {-0.70373594262891,  0.91227665827081},
-{-0.34600785879594, -0.99441426144200}, {-0.68774481731008, -0.30238837956299},
-{-0.26843291251234,  0.83115668004362}, { 0.49072334613242, -0.45359708737775},
-{ 0.38975993093975,  0.95515358099121}, {-0.97757125224150,  0.05305894580606},
-{-0.17325552859616, -0.92770672250494}, { 0.99948035025744,  0.58285545563426},
-{-0.64946246527458,  0.68645507104960}, {-0.12016920576437, -0.57147322153312},
-{-0.58947456517751, -0.34847132454388}, {-0.41815140454465,  0.16276422358861},
-{ 0.99885650204884,  0.11136095490444}, {-0.56649614128386, -0.90494866361587},
-{ 0.94138021032330,  0.35281916733018}, {-0.75725076534641,  0.53650549640587},
-{ 0.20541973692630, -0.94435144369918}, { 0.99980371023351,  0.79835913565599},
-{ 0.29078277605775,  0.35393777921520}, {-0.62858772103030,  0.38765693387102},
-{ 0.43440904467688, -0.98546330463232}, {-0.98298583762390,  0.21021524625209},
-{ 0.19513029146934, -0.94239832251867}, {-0.95476662400101,  0.98364554179143},
-{ 0.93379635304810, -0.70881994583682}, {-0.85235410573336, -0.08342347966410},
-{-0.86425093011245, -0.45795025029466}, { 0.38879779059045,  0.97274429344593},
-{ 0.92045124735495, -0.62433652524220}, { 0.89162532251878,  0.54950955570563},
-{-0.36834336949252,  0.96458298020975}, { 0.93891760988045, -0.89968353740388},
-{ 0.99267657565094, -0.03757034316958}, {-0.94063471614176,  0.41332338538963},
-{ 0.99740224117019, -0.16830494996370}, {-0.35899413170555, -0.46633226649613},
-{ 0.05237237274947, -0.25640361602661}, { 0.36703583957424, -0.38653265641875},
-{ 0.91653180367913, -0.30587628726597}, { 0.69000803499316,  0.90952171386132},
-{-0.38658751133527,  0.99501571208985}, {-0.29250814029851,  0.37444994344615},
-{-0.60182204677608,  0.86779651036123}, {-0.97418588163217,  0.96468523666475},
-{ 0.88461574003963,  0.57508405276414}, { 0.05198933055162,  0.21269661669964},
-{-0.53499621979720,  0.97241553731237}, {-0.49429560226497,  0.98183865291903},
-{-0.98935142339139, -0.40249159006933}, {-0.98081380091130, -0.72856895534041},
-{-0.27338148835532,  0.99950922447209}, { 0.06310802338302, -0.54539587529618},
-{-0.20461677199539, -0.14209977628489}, { 0.66223843141647,  0.72528579940326},
-{-0.84764345483665,  0.02372316801261}, {-0.89039863483811,  0.88866581484602},
-{ 0.95903308477986,  0.76744927173873}, { 0.73504123909879, -0.03747203173192},
-{-0.31744434966056, -0.36834111883652}, {-0.34110827591623,  0.40211222807691},
-{ 0.47803883714199, -0.39423219786288}, { 0.98299195879514,  0.01989791390047},
-{-0.30963073129751, -0.18076720599336}, { 0.99992588229018, -0.26281872094289},
-{-0.93149731080767, -0.98313162570490}, { 0.99923472302773, -0.80142993767554},
-{-0.26024169633417, -0.75999759855752}, {-0.35712514743563,  0.19298963768574},
-{-0.99899084509530,  0.74645156992493}, { 0.86557171579452,  0.55593866696299},
-{ 0.33408042438752,  0.86185953874709}, { 0.99010736374716,  0.04602397576623},
-{-0.66694269691195, -0.91643611810148}, { 0.64016792079480,  0.15649530836856},
-{ 0.99570534804836,  0.45844586038111}, {-0.63431466947340,  0.21079116459234},
-{-0.07706847005931, -0.89581437101329}, { 0.98590090577724,  0.88241721133981},
-{ 0.80099335254678, -0.36851896710853}, { 0.78368131392666,  0.45506999802597},
-{ 0.08707806671691,  0.80938994918745}, {-0.86811883080712,  0.39347308654705},
-{-0.39466529740375, -0.66809432114456}, { 0.97875325649683, -0.72467840967746},
-{-0.95038560288864,  0.89563219587625}, { 0.17005239424212,  0.54683053962658},
-{-0.76910792026848, -0.96226617549298}, { 0.99743281016846,  0.42697157037567},
-{ 0.95437383549973,  0.97002324109952}, { 0.99578905365569, -0.54106826257356},
-{ 0.28058259829990, -0.85361420634036}, { 0.85256524470573, -0.64567607735589},
-{-0.50608540105128, -0.65846015480300}, {-0.97210735183243, -0.23095213067791},
-{ 0.95424048234441, -0.99240147091219}, {-0.96926570524023,  0.73775654896574},
-{ 0.30872163214726,  0.41514960556126}, {-0.24523839572639,  0.63206633394807},
-{-0.33813265086024, -0.38661779441897}, {-0.05826828420146, -0.06940774188029},
-{-0.22898461455054,  0.97054853316316}, {-0.18509915019881,  0.47565762892084},
-{-0.10488238045009, -0.87769947402394}, {-0.71886586182037,  0.78030982480538},
-{ 0.99793873738654,  0.90041310491497}, { 0.57563307626120, -0.91034337352097},
-{ 0.28909646383717,  0.96307783970534}, { 0.42188998312520,  0.48148651230437},
-{ 0.93335049681047, -0.43537023883588}, {-0.97087374418267,  0.86636445711364},
-{ 0.36722871286923,  0.65291654172961}, {-0.81093025665696,  0.08778370229363},
-{-0.26240603062237, -0.92774095379098}, { 0.83996497984604,  0.55839849139647},
-{-0.99909615720225, -0.96024605713970}, { 0.74649464155061,  0.12144893606462},
-{-0.74774595569805, -0.26898062008959}, { 0.95781667469567, -0.79047927052628},
-{ 0.95472308713099, -0.08588776019550}, { 0.48708332746299,  0.99999041579432},
-{ 0.46332038247497,  0.10964126185063}, {-0.76497004940162,  0.89210929242238},
-{ 0.57397389364339,  0.35289703373760}, { 0.75374316974495,  0.96705214651335},
-{-0.59174397685714, -0.89405370422752}, { 0.75087906691890, -0.29612672982396},
-{-0.98607857336230,  0.25034911730023}, {-0.40761056640505, -0.90045573444695},
-{ 0.66929266740477,  0.98629493401748}, {-0.97463695257310, -0.00190223301301},
-{ 0.90145509409859,  0.99781390365446}, {-0.87259289048043,  0.99233587353666},
-{-0.91529461447692, -0.15698707534206}, {-0.03305738840705, -0.37205262859764},
-{ 0.07223051368337, -0.88805001733626}, { 0.99498012188353,  0.97094358113387},
-{-0.74904939500519,  0.99985483641521}, { 0.04585228574211,  0.99812337444082},
-{-0.89054954257993, -0.31791913188064}, {-0.83782144651251,  0.97637632547466},
-{ 0.33454804933804, -0.86231516800408}, {-0.99707579362824,  0.93237990079441},
-{-0.22827527843994,  0.18874759397997}, { 0.67248046289143, -0.03646211390569},
-{-0.05146538187944, -0.92599700120679}, { 0.99947295749905,  0.93625229707912},
-{ 0.66951124390363,  0.98905825623893}, {-0.99602956559179, -0.44654715757688},
-{ 0.82104905483590,  0.99540741724928}, { 0.99186510988782,  0.72023001312947},
-{-0.65284592392918,  0.52186723253637}, { 0.93885443798188, -0.74895312615259},
-{ 0.96735248738388,  0.90891816978629}, {-0.22225968841114,  0.57124029781228},
-{-0.44132783753414, -0.92688840659280}, {-0.85694974219574,  0.88844532719844},
-{ 0.91783042091762, -0.46356892383970}, { 0.72556974415690, -0.99899555770747},
-{-0.99711581834508,  0.58211560180426}, { 0.77638976371966,  0.94321834873819},
-{ 0.07717324253925,  0.58638399856595}, {-0.56049829194163,  0.82522301569036},
-{ 0.98398893639988,  0.39467440420569}, { 0.47546946844938,  0.68613044836811},
-{ 0.65675089314631,  0.18331637134880}, { 0.03273375457980, -0.74933109564108},
-{-0.38684144784738,  0.51337349030406}, {-0.97346267944545, -0.96549364384098},
-{-0.53282156061942, -0.91423265091354}, { 0.99817310731176,  0.61133572482148},
-{-0.50254500772635, -0.88829338134294}, { 0.01995873238855,  0.85223515096765},
-{ 0.99930381973804,  0.94578896296649}, { 0.82907767600783, -0.06323442598128},
-{-0.58660709669728,  0.96840773806582}, {-0.17573736667267, -0.48166920859485},
-{ 0.83434292401346, -0.13023450646997}, { 0.05946491307025,  0.20511047074866},
-{ 0.81505484574602, -0.94685947861369}, {-0.44976380954860,  0.40894572671545},
-{-0.89746474625671,  0.99846578838537}, { 0.39677256130792, -0.74854668609359},
-{-0.07588948563079,  0.74096214084170}, { 0.76343198951445,  0.41746629422634},
-{-0.74490104699626,  0.94725911744610}, { 0.64880119792759,  0.41336660830571},
-{ 0.62319537462542, -0.93098313552599}, { 0.42215817594807, -0.07712787385208},
-{ 0.02704554141885, -0.05417518053666}, { 0.80001773566818,  0.91542195141039},
-{-0.79351832348816, -0.36208897989136}, { 0.63872359151636,  0.08128252493444},
-{ 0.52890520960295,  0.60048872455592}, { 0.74238552914587,  0.04491915291044},
-{ 0.99096131449250, -0.19451182854402}, {-0.80412329643109, -0.88513818199457},
-{-0.64612616129736,  0.72198674804544}, { 0.11657770663191, -0.83662833815041},
-{-0.95053182488101, -0.96939905138082}, {-0.62228872928622,  0.82767262846661},
-{ 0.03004475787316, -0.99738896333384}, {-0.97987214341034,  0.36526129686425},
-{-0.99986980746200, -0.36021610299715}, { 0.89110648599879, -0.97894250343044},
-{ 0.10407960510582,  0.77357793811619}, { 0.95964737821728, -0.35435818285502},
-{ 0.50843233159162,  0.96107691266205}, { 0.17006334670615, -0.76854025314829},
-{ 0.25872675063360,  0.99893303933816}, {-0.01115998681937,  0.98496019742444},
-{-0.79598702973261,  0.97138411318894}, {-0.99264708948101, -0.99542822402536},
-{-0.99829663752818,  0.01877138824311}, {-0.70801016548184,  0.33680685948117},
-{-0.70467057786826,  0.93272777501857}, { 0.99846021905254, -0.98725746254433},
-{-0.63364968534650, -0.16473594423746}, {-0.16258217500792, -0.95939125400802},
-{-0.43645594360633, -0.94805030113284}, {-0.99848471702976,  0.96245166923809},
-{-0.16796458968998, -0.98987511890470}, {-0.87979225745213, -0.71725725041680},
-{ 0.44183099021786, -0.93568974498761}, { 0.93310180125532, -0.99913308068246},
-{-0.93941931782002, -0.56409379640356}, {-0.88590003188677,  0.47624600491382},
-{ 0.99971463703691, -0.83889954253462}, {-0.75376385639978,  0.00814643438625},
-{ 0.93887685615875, -0.11284528204636}, { 0.85126435782309,  0.52349251543547},
-{ 0.39701421446381,  0.81779634174316}, {-0.37024464187437, -0.87071656222959},
-{-0.36024828242896,  0.34655735648287}, {-0.93388812549209, -0.84476541096429},
-{-0.65298804552119, -0.18439575450921}, { 0.11960319006843,  0.99899346780168},
-{ 0.94292565553160,  0.83163906518293}, { 0.75081145286948, -0.35533223142265},
-{ 0.56721979748394, -0.24076836414499}, { 0.46857766746029, -0.30140233457198},
-{ 0.97312313923635, -0.99548191630031}, {-0.38299976567017,  0.98516909715427},
-{ 0.41025800019463,  0.02116736935734}, { 0.09638062008048,  0.04411984381457},
-{-0.85283249275397,  0.91475563922421}, { 0.88866808958124, -0.99735267083226},
-{-0.48202429536989, -0.96805608884164}, { 0.27572582416567,  0.58634753335832},
-{-0.65889129659168,  0.58835634138583}, { 0.98838086953732,  0.99994349600236},
-{-0.20651349620689,  0.54593044066355}, {-0.62126416356920, -0.59893681700392},
-{ 0.20320105410437, -0.86879180355289}, {-0.97790548600584,  0.96290806999242},
-{ 0.11112534735126,  0.21484763313301}, {-0.41368337314182,  0.28216837680365},
-{ 0.24133038992960,  0.51294362630238}, {-0.66393410674885, -0.08249679629081},
-{-0.53697829178752, -0.97649903936228}, {-0.97224737889348,  0.22081333579837},
-{ 0.87392477144549, -0.12796173740361}, { 0.19050361015753,  0.01602615387195},
-{-0.46353441212724, -0.95249041539006}, {-0.07064096339021, -0.94479803205886},
-{-0.92444085484466, -0.10457590187436}, {-0.83822593578728, -0.01695043208885},
-{ 0.75214681811150, -0.99955681042665}, {-0.42102998829339,  0.99720941999394},
-{-0.72094786237696, -0.35008961934255}, { 0.78843311019251,  0.52851398958271},
-{ 0.97394027897442, -0.26695944086561}, { 0.99206463477946, -0.57010120849429},
-{ 0.76789609461795, -0.76519356730966}, {-0.82002421836409, -0.73530179553767},
-{ 0.81924990025724,  0.99698425250579}, {-0.26719850873357,  0.68903369776193},
-{-0.43311260380975,  0.85321815947490}, { 0.99194979673836,  0.91876249766422},
-{-0.80692001248487, -0.32627540663214}, { 0.43080003649976, -0.21919095636638},
-{ 0.67709491937357, -0.95478075822906}, { 0.56151770568316, -0.70693811747778},
-{ 0.10831862810749, -0.08628837174592}, { 0.91229417540436, -0.65987351408410},
-{-0.48972893932274,  0.56289246362686}, {-0.89033658689697, -0.71656563987082},
-{ 0.65269447475094,  0.65916004833932}, { 0.67439478141121, -0.81684380846796},
-{-0.47770832416973, -0.16789556203025}, {-0.99715979260878, -0.93565784007648},
-{-0.90889593602546,  0.62034397054380}, {-0.06618622548177, -0.23812217221359},
-{ 0.99430266919728,  0.18812555317553}, { 0.97686402381843, -0.28664534366620},
-{ 0.94813650221268, -0.97506640027128}, {-0.95434497492853, -0.79607978501983},
-{-0.49104783137150,  0.32895214359663}, { 0.99881175120751,  0.88993983831354},
-{ 0.50449166760303, -0.85995072408434}, { 0.47162891065108, -0.18680204049569},
-{-0.62081581361840,  0.75000676218956}, {-0.43867015250812,  0.99998069244322},
-{ 0.98630563232075, -0.53578899600662}, {-0.61510362277374, -0.89515019899997},
-{-0.03841517601843, -0.69888815681179}, {-0.30102157304644, -0.07667808922205},
-{ 0.41881284182683,  0.02188098922282}, {-0.86135454941237,  0.98947480909359},
-{ 0.67226861393788, -0.13494389011014}, {-0.70737398842068, -0.76547349325992},
-{ 0.94044946687963,  0.09026201157416}, {-0.82386352534327,  0.08924768823676},
-{-0.32070666698656,  0.50143421908753}, { 0.57593163224487, -0.98966422921509},
-{-0.36326018419965,  0.07440243123228}, { 0.99979044674350, -0.14130287347405},
-{-0.92366023326932, -0.97979298068180}, {-0.44607178518598, -0.54233252016394},
-{ 0.44226800932956,  0.71326756742752}, { 0.03671907158312,  0.63606389366675},
-{ 0.52175424682195, -0.85396826735705}, {-0.94701139690956, -0.01826348194255},
-{-0.98759606946049,  0.82288714303073}, { 0.87434794743625,  0.89399495655433},
-{-0.93412041758744,  0.41374052024363}, { 0.96063943315511,  0.93116709541280},
-{ 0.97534253457837,  0.86150930812689}, { 0.99642466504163,  0.70190043427512},
-{-0.94705089665984, -0.29580042814306}, { 0.91599807087376, -0.98147830385781},
-{-0.99948153278296, -0.59483417516607}, { 0.97113454393991, -0.67528515225647},
+/* First eight entries repeated at end to simplify SIMD implementations. */
+const DECLARE_ALIGNED(16, INTFLOAT, AAC_RENAME(ff_sbr_noise_table))[][2] = {
+{Q31(-0.99948153278296f), Q31(-0.59483417516607f)}, {Q31( 0.97113454393991f), Q31(-0.67528515225647f)},
+{Q31( 0.14130051758487f), Q31(-0.95090983575689f)}, {Q31(-0.47005496701697f), Q31(-0.37340549728647f)},
+{Q31( 0.80705063769351f), Q31( 0.29653668284408f)}, {Q31(-0.38981478896926f), Q31( 0.89572605717087f)},
+{Q31(-0.01053049862020f), Q31(-0.66959058036166f)}, {Q31(-0.91266367957293f), Q31(-0.11522938140034f)},
+{Q31( 0.54840422910309f), Q31( 0.75221367176302f)}, {Q31( 0.40009252867955f), Q31(-0.98929400334421f)},
+{Q31(-0.99867974711855f), Q31(-0.88147068645358f)}, {Q31(-0.95531076805040f), Q31( 0.90908757154593f)},
+{Q31(-0.45725933317144f), Q31(-0.56716323646760f)}, {Q31(-0.72929675029275f), Q31(-0.98008272727324f)},
+{Q31( 0.75622801399036f), Q31( 0.20950329995549f)}, {Q31( 0.07069442601050f), Q31(-0.78247898470706f)},
+{Q31( 0.74496252926055f), Q31(-0.91169004445807f)}, {Q31(-0.96440182703856f), Q31(-0.94739918296622f)},
+{Q31( 0.30424629369539f), Q31(-0.49438267012479f)}, {Q31( 0.66565033746925f), Q31( 0.64652935542491f)},
+{Q31( 0.91697008020594f), Q31( 0.17514097332009f)}, {Q31(-0.70774918760427f), Q31( 0.52548653416543f)},
+{Q31(-0.70051415345560f), Q31(-0.45340028808763f)}, {Q31(-0.99496513054797f), Q31(-0.90071908066973f)},
+{Q31( 0.98164490790123f), Q31(-0.77463155528697f)}, {Q31(-0.54671580548181f), Q31(-0.02570928536004f)},
+{Q31(-0.01689629065389f), Q31( 0.00287506445732f)}, {Q31(-0.86110349531986f), Q31( 0.42548583726477f)},
+{Q31(-0.98892980586032f), Q31(-0.87881132267556f)}, {Q31( 0.51756627678691f), Q31( 0.66926784710139f)},
+{Q31(-0.99635026409640f), Q31(-0.58107730574765f)}, {Q31(-0.99969370862163f), Q31( 0.98369989360250f)},
+{Q31( 0.55266258627194f), Q31( 0.59449057465591f)}, {Q31( 0.34581177741673f), Q31( 0.94879421061866f)},
+{Q31( 0.62664209577999f), Q31(-0.74402970906471f)}, {Q31(-0.77149701404973f), Q31(-0.33883658042801f)},
+{Q31(-0.91592244254432f), Q31( 0.03687901376713f)}, {Q31(-0.76285492357887f), Q31(-0.91371867919124f)},
+{Q31( 0.79788337195331f), Q31(-0.93180971199849f)}, {Q31( 0.54473080610200f), Q31(-0.11919206037186f)},
+{Q31(-0.85639281671058f), Q31( 0.42429854760451f)}, {Q31(-0.92882402971423f), Q31( 0.27871809078609f)},
+{Q31(-0.11708371046774f), Q31(-0.99800843444966f)}, {Q31( 0.21356749817493f), Q31(-0.90716295627033f)},
+{Q31(-0.76191692573909f), Q31( 0.99768118356265f)}, {Q31( 0.98111043100884f), Q31(-0.95854459734407f)},
+{Q31(-0.85913269895572f), Q31( 0.95766566168880f)}, {Q31(-0.93307242253692f), Q31( 0.49431757696466f)},
+{Q31( 0.30485754879632f), Q31(-0.70540034357529f)}, {Q31( 0.85289650925190f), Q31( 0.46766131791044f)},
+{Q31( 0.91328082618125f), Q31(-0.99839597361769f)}, {Q31(-0.05890199924154f), Q31( 0.70741827819497f)},
+{Q31( 0.28398686150148f), Q31( 0.34633555702188f)}, {Q31( 0.95258164539612f), Q31(-0.54893416026939f)},
+{Q31(-0.78566324168507f), Q31(-0.75568541079691f)}, {Q31(-0.95789495447877f), Q31(-0.20423194696966f)},
+{Q31( 0.82411158711197f), Q31( 0.96654618432562f)}, {Q31(-0.65185446735885f), Q31(-0.88734990773289f)},
+{Q31(-0.93643603134666f), Q31( 0.99870790442385f)}, {Q31( 0.91427159529618f), Q31(-0.98290505544444f)},
+{Q31(-0.70395684036886f), Q31( 0.58796798221039f)}, {Q31( 0.00563771969365f), Q31( 0.61768196727244f)},
+{Q31( 0.89065051931895f), Q31( 0.52783352697585f)}, {Q31(-0.68683707712762f), Q31( 0.80806944710339f)},
+{Q31( 0.72165342518718f), Q31(-0.69259857349564f)}, {Q31(-0.62928247730667f), Q31( 0.13627037407335f)},
+{Q31( 0.29938434065514f), Q31(-0.46051329682246f)}, {Q31(-0.91781958879280f), Q31(-0.74012716684186f)},
+{Q31( 0.99298717043688f), Q31( 0.40816610075661f)}, {Q31( 0.82368298622748f), Q31(-0.74036047190173f)},
+{Q31(-0.98512833386833f), Q31(-0.99972330709594f)}, {Q31(-0.95915368242257f), Q31(-0.99237800466040f)},
+{Q31(-0.21411126572790f), Q31(-0.93424819052545f)}, {Q31(-0.68821476106884f), Q31(-0.26892306315457f)},
+{Q31( 0.91851997982317f), Q31( 0.09358228901785f)}, {Q31(-0.96062769559127f), Q31( 0.36099095133739f)},
+{Q31( 0.51646184922287f), Q31(-0.71373332873917f)}, {Q31( 0.61130721139669f), Q31( 0.46950141175917f)},
+{Q31( 0.47336129371299f), Q31(-0.27333178296162f)}, {Q31( 0.90998308703519f), Q31( 0.96715662938132f)},
+{Q31( 0.44844799194357f), Q31( 0.99211574628306f)}, {Q31( 0.66614891079092f), Q31( 0.96590176169121f)},
+{Q31( 0.74922239129237f), Q31(-0.89879858826087f)}, {Q31(-0.99571588506485f), Q31( 0.52785521494349f)},
+{Q31( 0.97401082477563f), Q31(-0.16855870075190f)}, {Q31( 0.72683747733879f), Q31(-0.48060774432251f)},
+{Q31( 0.95432193457128f), Q31( 0.68849603408441f)}, {Q31(-0.72962208425191f), Q31(-0.76608443420917f)},
+{Q31(-0.85359479233537f), Q31( 0.88738125901579f)}, {Q31(-0.81412430338535f), Q31(-0.97480768049637f)},
+{Q31(-0.87930772356786f), Q31( 0.74748307690436f)}, {Q31(-0.71573331064977f), Q31(-0.98570608178923f)},
+{Q31( 0.83524300028228f), Q31( 0.83702537075163f)}, {Q31(-0.48086065601423f), Q31(-0.98848504923531f)},
+{Q31( 0.97139128574778f), Q31( 0.80093621198236f)}, {Q31( 0.51992825347895f), Q31( 0.80247631400510f)},
+{Q31(-0.00848591195325f), Q31(-0.76670128000486f)}, {Q31(-0.70294374303036f), Q31( 0.55359910445577f)},
+{Q31(-0.95894428168140f), Q31(-0.43265504344783f)}, {Q31( 0.97079252950321f), Q31( 0.09325857238682f)},
+{Q31(-0.92404293670797f), Q31( 0.85507704027855f)}, {Q31(-0.69506469500450f), Q31( 0.98633412625459f)},
+{Q31( 0.26559203620024f), Q31( 0.73314307966524f)}, {Q31( 0.28038443336943f), Q31( 0.14537913654427f)},
+{Q31(-0.74138124825523f), Q31( 0.99310339807762f)}, {Q31(-0.01752795995444f), Q31(-0.82616635284178f)},
+{Q31(-0.55126773094930f), Q31(-0.98898543862153f)}, {Q31( 0.97960898850996f), Q31(-0.94021446752851f)},
+{Q31(-0.99196309146936f), Q31( 0.67019017358456f)}, {Q31(-0.67684928085260f), Q31( 0.12631491649378f)},
+{Q31( 0.09140039465500f), Q31(-0.20537731453108f)}, {Q31(-0.71658965751996f), Q31(-0.97788200391224f)},
+{Q31( 0.81014640078925f), Q31( 0.53722648362443f)}, {Q31( 0.40616991671205f), Q31(-0.26469008598449f)},
+{Q31(-0.67680188682972f), Q31( 0.94502052337695f)}, {Q31( 0.86849774348749f), Q31(-0.18333598647899f)},
+{Q31(-0.99500381284851f), Q31(-0.02634122068550f)}, {Q31( 0.84329189340667f), Q31( 0.10406957462213f)},
+{Q31(-0.09215968531446f), Q31( 0.69540012101253f)}, {Q31( 0.99956173327206f), Q31(-0.12358542001404f)},
+{Q31(-0.79732779473535f), Q31(-0.91582524736159f)}, {Q31( 0.96349973642406f), Q31( 0.96640458041000f)},
+{Q31(-0.79942778496547f), Q31( 0.64323902822857f)}, {Q31(-0.11566039853896f), Q31( 0.28587846253726f)},
+{Q31(-0.39922954514662f), Q31( 0.94129601616966f)}, {Q31( 0.99089197565987f), Q31(-0.92062625581587f)},
+{Q31( 0.28631285179909f), Q31(-0.91035047143603f)}, {Q31(-0.83302725605608f), Q31(-0.67330410892084f)},
+{Q31( 0.95404443402072f), Q31( 0.49162765398743f)}, {Q31(-0.06449863579434f), Q31( 0.03250560813135f)},
+{Q31(-0.99575054486311f), Q31( 0.42389784469507f)}, {Q31(-0.65501142790847f), Q31( 0.82546114655624f)},
+{Q31(-0.81254441908887f), Q31(-0.51627234660629f)}, {Q31(-0.99646369485481f), Q31( 0.84490533520752f)},
+{Q31( 0.00287840603348f), Q31( 0.64768261158166f)}, {Q31( 0.70176989408455f), Q31(-0.20453028573322f)},
+{Q31( 0.96361882270190f), Q31( 0.40706967140989f)}, {Q31(-0.68883758192426f), Q31( 0.91338958840772f)},
+{Q31(-0.34875585502238f), Q31( 0.71472290693300f)}, {Q31( 0.91980081243087f), Q31( 0.66507455644919f)},
+{Q31(-0.99009048343881f), Q31( 0.85868021604848f)}, {Q31( 0.68865791458395f), Q31( 0.55660316809678f)},
+{Q31(-0.99484402129368f), Q31(-0.20052559254934f)}, {Q31( 0.94214511408023f), Q31(-0.99696425367461f)},
+{Q31(-0.67414626793544f), Q31( 0.49548221180078f)}, {Q31(-0.47339353684664f), Q31(-0.85904328834047f)},
+{Q31( 0.14323651387360f), Q31(-0.94145598222488f)}, {Q31(-0.29268293575672f), Q31( 0.05759224927952f)},
+{Q31( 0.43793861458754f), Q31(-0.78904969892724f)}, {Q31(-0.36345126374441f), Q31( 0.64874435357162f)},
+{Q31(-0.08750604656825f), Q31( 0.97686944362527f)}, {Q31(-0.96495267812511f), Q31(-0.53960305946511f)},
+{Q31( 0.55526940659947f), Q31( 0.78891523734774f)}, {Q31( 0.73538215752630f), Q31( 0.96452072373404f)},
+{Q31(-0.30889773919437f), Q31(-0.80664389776860f)}, {Q31( 0.03574995626194f), Q31(-0.97325616900959f)},
+{Q31( 0.98720684660488f), Q31( 0.48409133691962f)}, {Q31(-0.81689296271203f), Q31(-0.90827703628298f)},
+{Q31( 0.67866860118215f), Q31( 0.81284503870856f)}, {Q31(-0.15808569732583f), Q31( 0.85279555024382f)},
+{Q31( 0.80723395114371f), Q31(-0.24717418514605f)}, {Q31( 0.47788757329038f), Q31(-0.46333147839295f)},
+{Q31( 0.96367554763201f), Q31( 0.38486749303242f)}, {Q31(-0.99143875716818f), Q31(-0.24945277239809f)},
+{Q31( 0.83081876925833f), Q31(-0.94780851414763f)}, {Q31(-0.58753191905341f), Q31( 0.01290772389163f)},
+{Q31( 0.95538108220960f), Q31(-0.85557052096538f)}, {Q31(-0.96490920476211f), Q31(-0.64020970923102f)},
+{Q31(-0.97327101028521f), Q31( 0.12378128133110f)}, {Q31( 0.91400366022124f), Q31( 0.57972471346930f)},
+{Q31(-0.99925837363824f), Q31( 0.71084847864067f)}, {Q31(-0.86875903507313f), Q31(-0.20291699203564f)},
+{Q31(-0.26240034795124f), Q31(-0.68264554369108f)}, {Q31(-0.24664412953388f), Q31(-0.87642273115183f)},
+{Q31( 0.02416275806869f), Q31( 0.27192914288905f)}, {Q31( 0.82068619590515f), Q31(-0.85087787994476f)},
+{Q31( 0.88547373760759f), Q31(-0.89636802901469f)}, {Q31(-0.18173078152226f), Q31(-0.26152145156800f)},
+{Q31( 0.09355476558534f), Q31( 0.54845123045604f)}, {Q31(-0.54668414224090f), Q31( 0.95980774020221f)},
+{Q31( 0.37050990604091f), Q31(-0.59910140383171f)}, {Q31(-0.70373594262891f), Q31( 0.91227665827081f)},
+{Q31(-0.34600785879594f), Q31(-0.99441426144200f)}, {Q31(-0.68774481731008f), Q31(-0.30238837956299f)},
+{Q31(-0.26843291251234f), Q31( 0.83115668004362f)}, {Q31( 0.49072334613242f), Q31(-0.45359708737775f)},
+{Q31( 0.38975993093975f), Q31( 0.95515358099121f)}, {Q31(-0.97757125224150f), Q31( 0.05305894580606f)},
+{Q31(-0.17325552859616f), Q31(-0.92770672250494f)}, {Q31( 0.99948035025744f), Q31( 0.58285545563426f)},
+{Q31(-0.64946246527458f), Q31( 0.68645507104960f)}, {Q31(-0.12016920576437f), Q31(-0.57147322153312f)},
+{Q31(-0.58947456517751f), Q31(-0.34847132454388f)}, {Q31(-0.41815140454465f), Q31( 0.16276422358861f)},
+{Q31( 0.99885650204884f), Q31( 0.11136095490444f)}, {Q31(-0.56649614128386f), Q31(-0.90494866361587f)},
+{Q31( 0.94138021032330f), Q31( 0.35281916733018f)}, {Q31(-0.75725076534641f), Q31( 0.53650549640587f)},
+{Q31( 0.20541973692630f), Q31(-0.94435144369918f)}, {Q31( 0.99980371023351f), Q31( 0.79835913565599f)},
+{Q31( 0.29078277605775f), Q31( 0.35393777921520f)}, {Q31(-0.62858772103030f), Q31( 0.38765693387102f)},
+{Q31( 0.43440904467688f), Q31(-0.98546330463232f)}, {Q31(-0.98298583762390f), Q31( 0.21021524625209f)},
+{Q31( 0.19513029146934f), Q31(-0.94239832251867f)}, {Q31(-0.95476662400101f), Q31( 0.98364554179143f)},
+{Q31( 0.93379635304810f), Q31(-0.70881994583682f)}, {Q31(-0.85235410573336f), Q31(-0.08342347966410f)},
+{Q31(-0.86425093011245f), Q31(-0.45795025029466f)}, {Q31( 0.38879779059045f), Q31( 0.97274429344593f)},
+{Q31( 0.92045124735495f), Q31(-0.62433652524220f)}, {Q31( 0.89162532251878f), Q31( 0.54950955570563f)},
+{Q31(-0.36834336949252f), Q31( 0.96458298020975f)}, {Q31( 0.93891760988045f), Q31(-0.89968353740388f)},
+{Q31( 0.99267657565094f), Q31(-0.03757034316958f)}, {Q31(-0.94063471614176f), Q31( 0.41332338538963f)},
+{Q31( 0.99740224117019f), Q31(-0.16830494996370f)}, {Q31(-0.35899413170555f), Q31(-0.46633226649613f)},
+{Q31( 0.05237237274947f), Q31(-0.25640361602661f)}, {Q31( 0.36703583957424f), Q31(-0.38653265641875f)},
+{Q31( 0.91653180367913f), Q31(-0.30587628726597f)}, {Q31( 0.69000803499316f), Q31( 0.90952171386132f)},
+{Q31(-0.38658751133527f), Q31( 0.99501571208985f)}, {Q31(-0.29250814029851f), Q31( 0.37444994344615f)},
+{Q31(-0.60182204677608f), Q31( 0.86779651036123f)}, {Q31(-0.97418588163217f), Q31( 0.96468523666475f)},
+{Q31( 0.88461574003963f), Q31( 0.57508405276414f)}, {Q31( 0.05198933055162f), Q31( 0.21269661669964f)},
+{Q31(-0.53499621979720f), Q31( 0.97241553731237f)}, {Q31(-0.49429560226497f), Q31( 0.98183865291903f)},
+{Q31(-0.98935142339139f), Q31(-0.40249159006933f)}, {Q31(-0.98081380091130f), Q31(-0.72856895534041f)},
+{Q31(-0.27338148835532f), Q31( 0.99950922447209f)}, {Q31( 0.06310802338302f), Q31(-0.54539587529618f)},
+{Q31(-0.20461677199539f), Q31(-0.14209977628489f)}, {Q31( 0.66223843141647f), Q31( 0.72528579940326f)},
+{Q31(-0.84764345483665f), Q31( 0.02372316801261f)}, {Q31(-0.89039863483811f), Q31( 0.88866581484602f)},
+{Q31( 0.95903308477986f), Q31( 0.76744927173873f)}, {Q31( 0.73504123909879f), Q31(-0.03747203173192f)},
+{Q31(-0.31744434966056f), Q31(-0.36834111883652f)}, {Q31(-0.34110827591623f), Q31( 0.40211222807691f)},
+{Q31( 0.47803883714199f), Q31(-0.39423219786288f)}, {Q31( 0.98299195879514f), Q31( 0.01989791390047f)},
+{Q31(-0.30963073129751f), Q31(-0.18076720599336f)}, {Q31( 0.99992588229018f), Q31(-0.26281872094289f)},
+{Q31(-0.93149731080767f), Q31(-0.98313162570490f)}, {Q31( 0.99923472302773f), Q31(-0.80142993767554f)},
+{Q31(-0.26024169633417f), Q31(-0.75999759855752f)}, {Q31(-0.35712514743563f), Q31( 0.19298963768574f)},
+{Q31(-0.99899084509530f), Q31( 0.74645156992493f)}, {Q31( 0.86557171579452f), Q31( 0.55593866696299f)},
+{Q31( 0.33408042438752f), Q31( 0.86185953874709f)}, {Q31( 0.99010736374716f), Q31( 0.04602397576623f)},
+{Q31(-0.66694269691195f), Q31(-0.91643611810148f)}, {Q31( 0.64016792079480f), Q31( 0.15649530836856f)},
+{Q31( 0.99570534804836f), Q31( 0.45844586038111f)}, {Q31(-0.63431466947340f), Q31( 0.21079116459234f)},
+{Q31(-0.07706847005931f), Q31(-0.89581437101329f)}, {Q31( 0.98590090577724f), Q31( 0.88241721133981f)},
+{Q31( 0.80099335254678f), Q31(-0.36851896710853f)}, {Q31( 0.78368131392666f), Q31( 0.45506999802597f)},
+{Q31( 0.08707806671691f), Q31( 0.80938994918745f)}, {Q31(-0.86811883080712f), Q31( 0.39347308654705f)},
+{Q31(-0.39466529740375f), Q31(-0.66809432114456f)}, {Q31( 0.97875325649683f), Q31(-0.72467840967746f)},
+{Q31(-0.95038560288864f), Q31( 0.89563219587625f)}, {Q31( 0.17005239424212f), Q31( 0.54683053962658f)},
+{Q31(-0.76910792026848f), Q31(-0.96226617549298f)}, {Q31( 0.99743281016846f), Q31( 0.42697157037567f)},
+{Q31( 0.95437383549973f), Q31( 0.97002324109952f)}, {Q31( 0.99578905365569f), Q31(-0.54106826257356f)},
+{Q31( 0.28058259829990f), Q31(-0.85361420634036f)}, {Q31( 0.85256524470573f), Q31(-0.64567607735589f)},
+{Q31(-0.50608540105128f), Q31(-0.65846015480300f)}, {Q31(-0.97210735183243f), Q31(-0.23095213067791f)},
+{Q31( 0.95424048234441f), Q31(-0.99240147091219f)}, {Q31(-0.96926570524023f), Q31( 0.73775654896574f)},
+{Q31( 0.30872163214726f), Q31( 0.41514960556126f)}, {Q31(-0.24523839572639f), Q31( 0.63206633394807f)},
+{Q31(-0.33813265086024f), Q31(-0.38661779441897f)}, {Q31(-0.05826828420146f), Q31(-0.06940774188029f)},
+{Q31(-0.22898461455054f), Q31( 0.97054853316316f)}, {Q31(-0.18509915019881f), Q31( 0.47565762892084f)},
+{Q31(-0.10488238045009f), Q31(-0.87769947402394f)}, {Q31(-0.71886586182037f), Q31( 0.78030982480538f)},
+{Q31( 0.99793873738654f), Q31( 0.90041310491497f)}, {Q31( 0.57563307626120f), Q31(-0.91034337352097f)},
+{Q31( 0.28909646383717f), Q31( 0.96307783970534f)}, {Q31( 0.42188998312520f), Q31( 0.48148651230437f)},
+{Q31( 0.93335049681047f), Q31(-0.43537023883588f)}, {Q31(-0.97087374418267f), Q31( 0.86636445711364f)},
+{Q31( 0.36722871286923f), Q31( 0.65291654172961f)}, {Q31(-0.81093025665696f), Q31( 0.08778370229363f)},
+{Q31(-0.26240603062237f), Q31(-0.92774095379098f)}, {Q31( 0.83996497984604f), Q31( 0.55839849139647f)},
+{Q31(-0.99909615720225f), Q31(-0.96024605713970f)}, {Q31( 0.74649464155061f), Q31( 0.12144893606462f)},
+{Q31(-0.74774595569805f), Q31(-0.26898062008959f)}, {Q31( 0.95781667469567f), Q31(-0.79047927052628f)},
+{Q31( 0.95472308713099f), Q31(-0.08588776019550f)}, {Q31( 0.48708332746299f), Q31( 0.99999041579432f)},
+{Q31( 0.46332038247497f), Q31( 0.10964126185063f)}, {Q31(-0.76497004940162f), Q31( 0.89210929242238f)},
+{Q31( 0.57397389364339f), Q31( 0.35289703373760f)}, {Q31( 0.75374316974495f), Q31( 0.96705214651335f)},
+{Q31(-0.59174397685714f), Q31(-0.89405370422752f)}, {Q31( 0.75087906691890f), Q31(-0.29612672982396f)},
+{Q31(-0.98607857336230f), Q31( 0.25034911730023f)}, {Q31(-0.40761056640505f), Q31(-0.90045573444695f)},
+{Q31( 0.66929266740477f), Q31( 0.98629493401748f)}, {Q31(-0.97463695257310f), Q31(-0.00190223301301f)},
+{Q31( 0.90145509409859f), Q31( 0.99781390365446f)}, {Q31(-0.87259289048043f), Q31( 0.99233587353666f)},
+{Q31(-0.91529461447692f), Q31(-0.15698707534206f)}, {Q31(-0.03305738840705f), Q31(-0.37205262859764f)},
+{Q31( 0.07223051368337f), Q31(-0.88805001733626f)}, {Q31( 0.99498012188353f), Q31( 0.97094358113387f)},
+{Q31(-0.74904939500519f), Q31( 0.99985483641521f)}, {Q31( 0.04585228574211f), Q31( 0.99812337444082f)},
+{Q31(-0.89054954257993f), Q31(-0.31791913188064f)}, {Q31(-0.83782144651251f), Q31( 0.97637632547466f)},
+{Q31( 0.33454804933804f), Q31(-0.86231516800408f)}, {Q31(-0.99707579362824f), Q31( 0.93237990079441f)},
+{Q31(-0.22827527843994f), Q31( 0.18874759397997f)}, {Q31( 0.67248046289143f), Q31(-0.03646211390569f)},
+{Q31(-0.05146538187944f), Q31(-0.92599700120679f)}, {Q31( 0.99947295749905f), Q31( 0.93625229707912f)},
+{Q31( 0.66951124390363f), Q31( 0.98905825623893f)}, {Q31(-0.99602956559179f), Q31(-0.44654715757688f)},
+{Q31( 0.82104905483590f), Q31( 0.99540741724928f)}, {Q31( 0.99186510988782f), Q31( 0.72023001312947f)},
+{Q31(-0.65284592392918f), Q31( 0.52186723253637f)}, {Q31( 0.93885443798188f), Q31(-0.74895312615259f)},
+{Q31( 0.96735248738388f), Q31( 0.90891816978629f)}, {Q31(-0.22225968841114f), Q31( 0.57124029781228f)},
+{Q31(-0.44132783753414f), Q31(-0.92688840659280f)}, {Q31(-0.85694974219574f), Q31( 0.88844532719844f)},
+{Q31( 0.91783042091762f), Q31(-0.46356892383970f)}, {Q31( 0.72556974415690f), Q31(-0.99899555770747f)},
+{Q31(-0.99711581834508f), Q31( 0.58211560180426f)}, {Q31( 0.77638976371966f), Q31( 0.94321834873819f)},
+{Q31( 0.07717324253925f), Q31( 0.58638399856595f)}, {Q31(-0.56049829194163f), Q31( 0.82522301569036f)},
+{Q31( 0.98398893639988f), Q31( 0.39467440420569f)}, {Q31( 0.47546946844938f), Q31( 0.68613044836811f)},
+{Q31( 0.65675089314631f), Q31( 0.18331637134880f)}, {Q31( 0.03273375457980f), Q31(-0.74933109564108f)},
+{Q31(-0.38684144784738f), Q31( 0.51337349030406f)}, {Q31(-0.97346267944545f), Q31(-0.96549364384098f)},
+{Q31(-0.53282156061942f), Q31(-0.91423265091354f)}, {Q31( 0.99817310731176f), Q31( 0.61133572482148f)},
+{Q31(-0.50254500772635f), Q31(-0.88829338134294f)}, {Q31( 0.01995873238855f), Q31( 0.85223515096765f)},
+{Q31( 0.99930381973804f), Q31( 0.94578896296649f)}, {Q31( 0.82907767600783f), Q31(-0.06323442598128f)},
+{Q31(-0.58660709669728f), Q31( 0.96840773806582f)}, {Q31(-0.17573736667267f), Q31(-0.48166920859485f)},
+{Q31( 0.83434292401346f), Q31(-0.13023450646997f)}, {Q31( 0.05946491307025f), Q31( 0.20511047074866f)},
+{Q31( 0.81505484574602f), Q31(-0.94685947861369f)}, {Q31(-0.44976380954860f), Q31( 0.40894572671545f)},
+{Q31(-0.89746474625671f), Q31( 0.99846578838537f)}, {Q31( 0.39677256130792f), Q31(-0.74854668609359f)},
+{Q31(-0.07588948563079f), Q31( 0.74096214084170f)}, {Q31( 0.76343198951445f), Q31( 0.41746629422634f)},
+{Q31(-0.74490104699626f), Q31( 0.94725911744610f)}, {Q31( 0.64880119792759f), Q31( 0.41336660830571f)},
+{Q31( 0.62319537462542f), Q31(-0.93098313552599f)}, {Q31( 0.42215817594807f), Q31(-0.07712787385208f)},
+{Q31( 0.02704554141885f), Q31(-0.05417518053666f)}, {Q31( 0.80001773566818f), Q31( 0.91542195141039f)},
+{Q31(-0.79351832348816f), Q31(-0.36208897989136f)}, {Q31( 0.63872359151636f), Q31( 0.08128252493444f)},
+{Q31( 0.52890520960295f), Q31( 0.60048872455592f)}, {Q31( 0.74238552914587f), Q31( 0.04491915291044f)},
+{Q31( 0.99096131449250f), Q31(-0.19451182854402f)}, {Q31(-0.80412329643109f), Q31(-0.88513818199457f)},
+{Q31(-0.64612616129736f), Q31( 0.72198674804544f)}, {Q31( 0.11657770663191f), Q31(-0.83662833815041f)},
+{Q31(-0.95053182488101f), Q31(-0.96939905138082f)}, {Q31(-0.62228872928622f), Q31( 0.82767262846661f)},
+{Q31( 0.03004475787316f), Q31(-0.99738896333384f)}, {Q31(-0.97987214341034f), Q31( 0.36526129686425f)},
+{Q31(-0.99986980746200f), Q31(-0.36021610299715f)}, {Q31( 0.89110648599879f), Q31(-0.97894250343044f)},
+{Q31( 0.10407960510582f), Q31( 0.77357793811619f)}, {Q31( 0.95964737821728f), Q31(-0.35435818285502f)},
+{Q31( 0.50843233159162f), Q31( 0.96107691266205f)}, {Q31( 0.17006334670615f), Q31(-0.76854025314829f)},
+{Q31( 0.25872675063360f), Q31( 0.99893303933816f)}, {Q31(-0.01115998681937f), Q31( 0.98496019742444f)},
+{Q31(-0.79598702973261f), Q31( 0.97138411318894f)}, {Q31(-0.99264708948101f), Q31(-0.99542822402536f)},
+{Q31(-0.99829663752818f), Q31( 0.01877138824311f)}, {Q31(-0.70801016548184f), Q31( 0.33680685948117f)},
+{Q31(-0.70467057786826f), Q31( 0.93272777501857f)}, {Q31( 0.99846021905254f), Q31(-0.98725746254433f)},
+{Q31(-0.63364968534650f), Q31(-0.16473594423746f)}, {Q31(-0.16258217500792f), Q31(-0.95939125400802f)},
+{Q31(-0.43645594360633f), Q31(-0.94805030113284f)}, {Q31(-0.99848471702976f), Q31( 0.96245166923809f)},
+{Q31(-0.16796458968998f), Q31(-0.98987511890470f)}, {Q31(-0.87979225745213f), Q31(-0.71725725041680f)},
+{Q31( 0.44183099021786f), Q31(-0.93568974498761f)}, {Q31( 0.93310180125532f), Q31(-0.99913308068246f)},
+{Q31(-0.93941931782002f), Q31(-0.56409379640356f)}, {Q31(-0.88590003188677f), Q31( 0.47624600491382f)},
+{Q31( 0.99971463703691f), Q31(-0.83889954253462f)}, {Q31(-0.75376385639978f), Q31( 0.00814643438625f)},
+{Q31( 0.93887685615875f), Q31(-0.11284528204636f)}, {Q31( 0.85126435782309f), Q31( 0.52349251543547f)},
+{Q31( 0.39701421446381f), Q31( 0.81779634174316f)}, {Q31(-0.37024464187437f), Q31(-0.87071656222959f)},
+{Q31(-0.36024828242896f), Q31( 0.34655735648287f)}, {Q31(-0.93388812549209f), Q31(-0.84476541096429f)},
+{Q31(-0.65298804552119f), Q31(-0.18439575450921f)}, {Q31( 0.11960319006843f), Q31( 0.99899346780168f)},
+{Q31( 0.94292565553160f), Q31( 0.83163906518293f)}, {Q31( 0.75081145286948f), Q31(-0.35533223142265f)},
+{Q31( 0.56721979748394f), Q31(-0.24076836414499f)}, {Q31( 0.46857766746029f), Q31(-0.30140233457198f)},
+{Q31( 0.97312313923635f), Q31(-0.99548191630031f)}, {Q31(-0.38299976567017f), Q31( 0.98516909715427f)},
+{Q31( 0.41025800019463f), Q31( 0.02116736935734f)}, {Q31( 0.09638062008048f), Q31( 0.04411984381457f)},
+{Q31(-0.85283249275397f), Q31( 0.91475563922421f)}, {Q31( 0.88866808958124f), Q31(-0.99735267083226f)},
+{Q31(-0.48202429536989f), Q31(-0.96805608884164f)}, {Q31( 0.27572582416567f), Q31( 0.58634753335832f)},
+{Q31(-0.65889129659168f), Q31( 0.58835634138583f)}, {Q31( 0.98838086953732f), Q31( 0.99994349600236f)},
+{Q31(-0.20651349620689f), Q31( 0.54593044066355f)}, {Q31(-0.62126416356920f), Q31(-0.59893681700392f)},
+{Q31( 0.20320105410437f), Q31(-0.86879180355289f)}, {Q31(-0.97790548600584f), Q31( 0.96290806999242f)},
+{Q31( 0.11112534735126f), Q31( 0.21484763313301f)}, {Q31(-0.41368337314182f), Q31( 0.28216837680365f)},
+{Q31( 0.24133038992960f), Q31( 0.51294362630238f)}, {Q31(-0.66393410674885f), Q31(-0.08249679629081f)},
+{Q31(-0.53697829178752f), Q31(-0.97649903936228f)}, {Q31(-0.97224737889348f), Q31( 0.22081333579837f)},
+{Q31( 0.87392477144549f), Q31(-0.12796173740361f)}, {Q31( 0.19050361015753f), Q31( 0.01602615387195f)},
+{Q31(-0.46353441212724f), Q31(-0.95249041539006f)}, {Q31(-0.07064096339021f), Q31(-0.94479803205886f)},
+{Q31(-0.92444085484466f), Q31(-0.10457590187436f)}, {Q31(-0.83822593578728f), Q31(-0.01695043208885f)},
+{Q31( 0.75214681811150f), Q31(-0.99955681042665f)}, {Q31(-0.42102998829339f), Q31( 0.99720941999394f)},
+{Q31(-0.72094786237696f), Q31(-0.35008961934255f)}, {Q31( 0.78843311019251f), Q31( 0.52851398958271f)},
+{Q31( 0.97394027897442f), Q31(-0.26695944086561f)}, {Q31( 0.99206463477946f), Q31(-0.57010120849429f)},
+{Q31( 0.76789609461795f), Q31(-0.76519356730966f)}, {Q31(-0.82002421836409f), Q31(-0.73530179553767f)},
+{Q31( 0.81924990025724f), Q31( 0.99698425250579f)}, {Q31(-0.26719850873357f), Q31( 0.68903369776193f)},
+{Q31(-0.43311260380975f), Q31( 0.85321815947490f)}, {Q31( 0.99194979673836f), Q31( 0.91876249766422f)},
+{Q31(-0.80692001248487f), Q31(-0.32627540663214f)}, {Q31( 0.43080003649976f), Q31(-0.21919095636638f)},
+{Q31( 0.67709491937357f), Q31(-0.95478075822906f)}, {Q31( 0.56151770568316f), Q31(-0.70693811747778f)},
+{Q31( 0.10831862810749f), Q31(-0.08628837174592f)}, {Q31( 0.91229417540436f), Q31(-0.65987351408410f)},
+{Q31(-0.48972893932274f), Q31( 0.56289246362686f)}, {Q31(-0.89033658689697f), Q31(-0.71656563987082f)},
+{Q31( 0.65269447475094f), Q31( 0.65916004833932f)}, {Q31( 0.67439478141121f), Q31(-0.81684380846796f)},
+{Q31(-0.47770832416973f), Q31(-0.16789556203025f)}, {Q31(-0.99715979260878f), Q31(-0.93565784007648f)},
+{Q31(-0.90889593602546f), Q31( 0.62034397054380f)}, {Q31(-0.06618622548177f), Q31(-0.23812217221359f)},
+{Q31( 0.99430266919728f), Q31( 0.18812555317553f)}, {Q31( 0.97686402381843f), Q31(-0.28664534366620f)},
+{Q31( 0.94813650221268f), Q31(-0.97506640027128f)}, {Q31(-0.95434497492853f), Q31(-0.79607978501983f)},
+{Q31(-0.49104783137150f), Q31( 0.32895214359663f)}, {Q31( 0.99881175120751f), Q31( 0.88993983831354f)},
+{Q31( 0.50449166760303f), Q31(-0.85995072408434f)}, {Q31( 0.47162891065108f), Q31(-0.18680204049569f)},
+{Q31(-0.62081581361840f), Q31( 0.75000676218956f)}, {Q31(-0.43867015250812f), Q31( 0.99998069244322f)},
+{Q31( 0.98630563232075f), Q31(-0.53578899600662f)}, {Q31(-0.61510362277374f), Q31(-0.89515019899997f)},
+{Q31(-0.03841517601843f), Q31(-0.69888815681179f)}, {Q31(-0.30102157304644f), Q31(-0.07667808922205f)},
+{Q31( 0.41881284182683f), Q31( 0.02188098922282f)}, {Q31(-0.86135454941237f), Q31( 0.98947480909359f)},
+{Q31( 0.67226861393788f), Q31(-0.13494389011014f)}, {Q31(-0.70737398842068f), Q31(-0.76547349325992f)},
+{Q31( 0.94044946687963f), Q31( 0.09026201157416f)}, {Q31(-0.82386352534327f), Q31( 0.08924768823676f)},
+{Q31(-0.32070666698656f), Q31( 0.50143421908753f)}, {Q31( 0.57593163224487f), Q31(-0.98966422921509f)},
+{Q31(-0.36326018419965f), Q31( 0.07440243123228f)}, {Q31( 0.99979044674350f), Q31(-0.14130287347405f)},
+{Q31(-0.92366023326932f), Q31(-0.97979298068180f)}, {Q31(-0.44607178518598f), Q31(-0.54233252016394f)},
+{Q31( 0.44226800932956f), Q31( 0.71326756742752f)}, {Q31( 0.03671907158312f), Q31( 0.63606389366675f)},
+{Q31( 0.52175424682195f), Q31(-0.85396826735705f)}, {Q31(-0.94701139690956f), Q31(-0.01826348194255f)},
+{Q31(-0.98759606946049f), Q31( 0.82288714303073f)}, {Q31( 0.87434794743625f), Q31( 0.89399495655433f)},
+{Q31(-0.93412041758744f), Q31( 0.41374052024363f)}, {Q31( 0.96063943315511f), Q31( 0.93116709541280f)},
+{Q31( 0.97534253457837f), Q31( 0.86150930812689f)}, {Q31( 0.99642466504163f), Q31( 0.70190043427512f)},
+{Q31(-0.94705089665984f), Q31(-0.29580042814306f)}, {Q31( 0.91599807087376f), Q31(-0.98147830385781f)},
+// Start of duplicated table
+{Q31(-0.99948153278296f), Q31(-0.59483417516607f)}, {Q31( 0.97113454393991f), Q31(-0.67528515225647f)},
+{Q31( 0.14130051758487f), Q31(-0.95090983575689f)}, {Q31(-0.47005496701697f), Q31(-0.37340549728647f)},
+{Q31( 0.80705063769351f), Q31( 0.29653668284408f)}, {Q31(-0.38981478896926f), Q31( 0.89572605717087f)},
+{Q31(-0.01053049862020f), Q31(-0.66959058036166f)}, {Q31(-0.91266367957293f), Q31(-0.11522938140034f)},
 };
 
 #endif /* AVCODEC_AACSBRDATA_H */
diff --git a/libavcodec/aactab.c b/libavcodec/aactab.c
index 9f1e8afafd..dc9acc1bf3 100644
--- a/libavcodec/aactab.c
+++ b/libavcodec/aactab.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
  * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,6 +35,8 @@
 
 DECLARE_ALIGNED(32, float,  ff_aac_kbd_long_1024)[1024];
 DECLARE_ALIGNED(32, float,  ff_aac_kbd_short_128)[128];
+DECLARE_ALIGNED(32, int,    ff_aac_kbd_long_1024_fixed)[1024];
+DECLARE_ALIGNED(32, int,    ff_aac_kbd_short_128_fixed)[128];
 
 const uint8_t ff_aac_num_swb_1024[] = {
     41, 41, 47, 49, 49, 51, 47, 47, 43, 43, 43, 40, 40
@@ -1767,6 +1769,490 @@ const DECLARE_ALIGNED(32, float, ff_aac_eld_window_512)[1920] = {
     -0.00111144, -0.00109764, -0.00108377, -0.00106989,
 };
 
+/* Q30 representation of ff_aac_eld_window_512 table */
+const DECLARE_ALIGNED(32, int, ff_aac_eld_window_512_fixed)[1920] = {
+    0x003783ba, 0x005d04f4, 0x008ae226, 0x00c02021,
+    0x00fb1804, 0x013a30a8, 0x017be9e6, 0x01bf296c,
+    0x02033204, 0x0247502c, 0x028adab0, 0x02cd9568,
+    0x030fa980, 0x03513dc0, 0x03927274, 0x03d363e0,
+    0x04142e40, 0x0454edc0, 0x0495bd48, 0x04d6a060,
+    0x051786d8, 0x05586548, 0x059935e8, 0x05d9feb0,
+    0x061acea0, 0x065bb680, 0x069cc800, 0x06de13f0,
+    0x071fa748, 0x07618b80, 0x07a3c7a8, 0x07e66da0,
+    0x082999d0, 0x086d6590, 0x08b1e640, 0x08f72850,
+    0x093d3120, 0x09840550, 0x09cba880, 0x0a1415f0,
+    0x0a5d41b0, 0x0aa720d0, 0x0af1a9a0, 0x0b3cce70,
+    0x0b887ec0, 0x0bd4ac10, 0x0c214a70, 0x0c6e5130,
+    0x0cbbba50, 0x0d098130, 0x0d57a240, 0x0da61a60,
+    0x0df4e620, 0x0e4401d0, 0x0e9369f0, 0x0ee31de0,
+    0x0f332000, 0x0f837180, 0x0fd412a0, 0x10250260,
+    0x10763f20, 0x10c7c660, 0x11199560, 0x116baa00,
+    0x11be0400, 0x1210a1c0, 0x12638180, 0x12b69ee0,
+    0x1309f3e0, 0x135d7ac0, 0x13b12dc0, 0x1404ffa0,
+    0x1458dd40, 0x14acb720, 0x15008120, 0x15543260,
+    0x15a7c460, 0x15fb3160, 0x164e7520, 0x16a193c0,
+    0x16f49740, 0x17478720, 0x179a6720, 0x17ed3720,
+    0x183ff460, 0x18929c20, 0x18e52b00, 0x19379c00,
+    0x1989e900, 0x19dc0ca0, 0x1a2e0280, 0x1a7fc400,
+    0x1ad14a00, 0x1b228ec0, 0x1b738ea0, 0x1bc44540,
+    0x1c14ada0, 0x1c64c380, 0x1cb48440, 0x1d03f420,
+    0x1d531c00, 0x1da20160, 0x1df0a660, 0x1e3f0860,
+    0x1e8d2340, 0x1edaf340, 0x1f2875e0, 0x1f75a700,
+    0x1fc281e0, 0x200f0380, 0x205b2ac0, 0x20a6f980,
+    0x20f27200, 0x213d9600, 0x21886580, 0x21d2e040,
+    0x221d0640, 0x2266d6c0, 0x22b05180, 0x22f97580,
+    0x23424280, 0x238ab880, 0x23d2d780, 0x241aa040,
+    0x246213c0, 0x24a93300, 0x24efff80, 0x25367b40,
+    0x256f68c0, 0x25b53580, 0x25faa580, 0x263fb940,
+    0x26847080, 0x26c8cbc0, 0x270ccb00, 0x27506e40,
+    0x2793b600, 0x27d6a200, 0x281932c0, 0x285b6880,
+    0x289d4400, 0x28dec5c0, 0x291feec0, 0x2960bf80,
+    0x29a137c0, 0x29e15800, 0x2a212000, 0x2a609080,
+    0x2a9fa980, 0x2ade6b40, 0x2b1cd600, 0x2b5aea00,
+    0x2b98a740, 0x2bd60d80, 0x2c131cc0, 0x2c4fd500,
+    0x2c8c3600, 0x2cc83f00, 0x2d03f040, 0x2d3f48c0,
+    0x2d7a48c0, 0x2db4ef40, 0x2def3c40, 0x2e292ec0,
+    0x2e62c700, 0x2e9c0400, 0x2ed4e580, 0x2f0d6ac0,
+    0x2f4592c0, 0x2f7d5c80, 0x2fb4c6c0, 0x2febd140,
+    0x30227b40, 0x3058c400, 0x308eab40, 0x30c43040,
+    0x30f95100, 0x312e0d00, 0x31626240, 0x31965040,
+    0x31c9d5c0, 0x31fcf240, 0x322fa480, 0x3261ec00,
+    0x3293c7c0, 0x32c53680, 0x32f63780, 0x3326c9c0,
+    0x3356ec00, 0x33869d00, 0x33b5db80, 0x33e4a700,
+    0x3412fdc0, 0x3440df40, 0x346e4a80, 0x349b3e40,
+    0x34c7ba00, 0x34f3bd80, 0x351f47c0, 0x354a5840,
+    0x3574ee40, 0x359f0900, 0x35c8a840, 0x35f1cb80,
+    0x361a71c0, 0x36429a80, 0x366a4580, 0x36917280,
+    0x36b82100, 0x36de5180, 0x37040340, 0x372936c0,
+    0x374dec40, 0x37722340, 0x3795dc40, 0x37b91780,
+    0x37dbd600, 0x37fe18c0, 0x381fe080, 0x38412e00,
+    0x38620280, 0x38825f40, 0x38a24540, 0x38c1b680,
+    0x38e0b5c0, 0x38ff4540, 0x391d6800, 0x393b20c0,
+    0x39587280, 0x39755fc0, 0x3991eb80, 0x39ae1a80,
+    0x39c9f280, 0x39e57980, 0x3a00b600, 0x3a1bae00,
+    0x3a366800, 0x3a50e9c0, 0x3a6b3a40, 0x3a8560c0,
+    0x3a9f6640, 0x3ab95400, 0x3ad332c0, 0x3aed0680,
+    0x3b06cf80, 0x3b208d40, 0x3b3a3e80, 0x3b53cb80,
+    0x3b6d0780, 0x3b85c380, 0x3b9dd0c0, 0x3bb4eb40,
+    0x3bcabac0, 0x3bdee680, 0x3bf11680, 0x3c011440,
+    0x3c179ac0, 0x3c1c4f00, 0x3c21aa40, 0x3c278880,
+    0x3c2dba80, 0x3c341140, 0x3c3a5e80, 0x3c409100,
+    0x3c46b480, 0x3c4cd5c0, 0x3c530180, 0x3c593cc0,
+    0x3c5f84c0, 0x3c65d640, 0x3c6c2e40, 0x3c728b40,
+    0x3c78ee80, 0x3c7f5840, 0x3c85c940, 0x3c8c4240,
+    0x3c92c380, 0x3c994cc0, 0x3c9fde40, 0x3ca67880,
+    0x3cad1ac0, 0x3cb3c540, 0x3cba7800, 0x3cc132c0,
+    0x3cc7f640, 0x3ccec280, 0x3cd59800, 0x3cdc76c0,
+    0x3ce35e80, 0x3cea4f00, 0x3cf147c0, 0x3cf84900,
+    0x3cff5340, 0x3d0666c0, 0x3d0d8400, 0x3d14ab40,
+    0x3d1bdc00, 0x3d2315c0, 0x3d2a5880, 0x3d31a440,
+    0x3d38f900, 0x3d405780, 0x3d47c040, 0x3d4f3300,
+    0x3d56af40, 0x3d5e3500, 0x3d65c380, 0x3d6d5ac0,
+    0x3d74fb40, 0x3d7ca540, 0x3d845900, 0x3d8c1680,
+    0x3d93dd00, 0x3d9bac80, 0x3da38400, 0x3dab6400,
+    0x3db34c80, 0x3dbb3dc0, 0x3dc33840, 0x3dcb3bc0,
+    0x3dd347c0, 0x3ddb5bc0, 0x3de37780, 0x3deb9b00,
+    0x3df3c600, 0x3dfbf940, 0x3e0434c0, 0x3e0c7840,
+    0x3e14c3c0, 0x3e1d1640, 0x3e256f80, 0x3e2dcf40,
+    0x3e363580, 0x3e3ea300, 0x3e4717c0, 0x3e4f9380,
+    0x3e581600, 0x3e609e40, 0x3e692c40, 0x3e71bf80,
+    0x3e7a5840, 0x3e82f740, 0x3e8b9c40, 0x3e944700,
+    0x3e9cf780, 0x3ea5ad00, 0x3eae66c0, 0x3eb72500,
+    0x3ebfe780, 0x3ec8af00, 0x3ed17b80, 0x3eda4d00,
+    0x3ee32340, 0x3eebfd40, 0x3ef4dac0, 0x3efdbbc0,
+    0x3f06a040, 0x3f0f88c0, 0x3f187540, 0x3f216600,
+    0x3f2a5a80, 0x3f335200, 0x3f3c4c40, 0x3f454940,
+    0x3f4e4940, 0x3f574c80, 0x3f605340, 0x3f695dc0,
+    0x3f726b40, 0x3f7b7b40, 0x3f848dc0, 0x3f8da240,
+    0x3f96b940, 0x3f9fd300, 0x3fa8f040, 0x3fb21080,
+    0x3fbb33c0, 0x3fc459c0, 0x3fcd81c0, 0x3fd6abc0,
+    0x3fdfd780, 0x3fe90480, 0x3ff23280, 0x3ffb6100,
+    0x40049f80, 0x400dd080, 0x40170400, 0x40203880,
+    0x40296f00, 0x4032a600, 0x403bde00, 0x40451680,
+    0x404e4f00, 0x40578700, 0x4060be80, 0x4069f500,
+    0x40732b80, 0x407c6280, 0x40859980, 0x408ed100,
+    0x40980800, 0x40a13f00, 0x40aa7500, 0x40b3a980,
+    0x40bcdd80, 0x40c61180, 0x40cf4500, 0x40d87800,
+    0x40e1ab00, 0x40eadc80, 0x40f40c80, 0x40fd3a80,
+    0x41066700, 0x410f9300, 0x4118bd80, 0x4121e700,
+    0x412b0f80, 0x41343580, 0x413d5880, 0x41467980,
+    0x414f9780, 0x4158b380, 0x4161cd80, 0x416ae580,
+    0x4173fb00, 0x417d0d00, 0x41861b80, 0x418f2600,
+    0x41982c80, 0x41a12f80, 0x41aa3000, 0x41b32c80,
+    0x41bc2580, 0x41c51a00, 0x41ce0900, 0x41d6f300,
+    0x41dfd800, 0x41e8b880, 0x41f19400, 0x41fa6b80,
+    0x42033d00, 0x420c0900, 0x4214cf00, 0x421d8e00,
+    0x42264680, 0x422ef980, 0x4237a680, 0x42404d80,
+    0x4248ee00, 0x42518780, 0x425a1a00, 0x4262a480,
+    0x426b2800, 0x4273a400, 0x427c1980, 0x42848880,
+    0x428cef80, 0x42954f00, 0x429da680, 0x42a5f500,
+    0x42ae3b80, 0x42b67a00, 0x42beb100, 0x42c6e080,
+    0x42cf0780, 0x42d72680, 0x42df3c00, 0x42e74880,
+    0x42ef4c80, 0x42f74880, 0x42ff3c80, 0x43072880,
+    0x430f0c80, 0x4316e800, 0x431eba00, 0x43268380,
+    0x432e4480, 0x4335fd00, 0x433dae80, 0x43455800,
+    0x434cfa00, 0x43549400, 0x435c2500, 0x4363ad80,
+    0x436b2e00, 0x4372a700, 0x437a1800, 0x43818200,
+    0x4388e400, 0x43903f00, 0x43979200, 0x439edd00,
+    0x43a62080, 0x43ad5c80, 0x43b49180, 0x43bbbf80,
+    0x43c2e800, 0x43ca0b00, 0x43d12980, 0x43d84280,
+    0x43df5200, 0x43e65500, 0x43ed4800, 0x43f43080,
+    0x43fb1c80, 0x44021b80, 0x44093a00, 0x44106480,
+    0x44176700, 0x441e0c00, 0x44241e00, 0x44297380,
+    0x4425dc00, 0x44240180, 0x441ff300, 0x4419e300,
+    0x44123f80, 0x44097500, 0x43ffe900, 0x43f5e700,
+    0x43eb9f00, 0x43e13f00, 0x43d6f200, 0x43ccbd80,
+    0x43c28400, 0x43b82780, 0x43ad8b00, 0x43a29c80,
+    0x43975180, 0x438ba080, 0x437f8180, 0x4372fd00,
+    0x43662b00, 0x43592480, 0x434c0000, 0x433ecd00,
+    0x43319180, 0x43245300, 0x43171700, 0x4309da80,
+    0x42fc9300, 0x42ef3500, 0x42e1b600, 0x42d40280,
+    0x42c60000, 0x42b79300, 0x42a8a180, 0x42991a00,
+    0x4288f200, 0x42782100, 0x42669e00, 0x42546880,
+    0x42418800, 0x422e0480, 0x4219e500, 0x42053680,
+    0x41f00980, 0x41da7080, 0x41c47b00, 0x41ae3600,
+    0x4197ab80, 0x4180e400, 0x4169e780, 0x4152bb00,
+    0x413b5e80, 0x4123d180, 0x410c1480, 0x40f42100,
+    0x40dbed00, 0x40c36c80, 0x40aa9600, 0x40915f80,
+    0x4077c100, 0x405db280, 0x40432c80, 0x40282580,
+    0x400c9280, 0x3ff068c0, 0x3fd39dc0, 0x3fb62bc0,
+    0x3f981200, 0x3f795080, 0x3f59e780, 0x3f39ebc0,
+    0x3f198680, 0x3ef8e100, 0x3ed82440, 0x3eb76c80,
+    0x3e96c940, 0x3e764900, 0x3e55f980, 0x3e35cb00,
+    0x3e1590c0, 0x3df51cc0, 0x3dd44200, 0x3db2e640,
+    0x3d910200, 0x3d6e8e40, 0x3d4b8480, 0x3d27e600,
+    0x3d03bc00, 0x3cdf0fc0, 0x3cb9eb80, 0x3c946240,
+    0x3c6e9180, 0x3c489700, 0x3c229000, 0x3bfc95c0,
+    0x3bd6bd00, 0x3bb11a80, 0x3b8bc180, 0x3b669bc0,
+    0x3b416a00, 0x3b1beb80, 0x3af5e140, 0x3acf3300,
+    0x3aa7ef80, 0x3a802780, 0x3a57eb80, 0x3a2f5880,
+    0x3a069640, 0x39ddcd40, 0x39b524c0, 0x398ca540,
+    0x39643800, 0x393bc540, 0x39133580, 0x38ea7ac0,
+    0x38c19040, 0x389871c0, 0x386f1b40, 0x38458e00,
+    0x381bd000, 0x37f1e780, 0x37c7db00, 0x379db080,
+    0x37736e80, 0x37491b00, 0x371ebcc0, 0x36f45980,
+    0x36c96600, 0x369ed300, 0x36740380, 0x3648ffc0,
+    0x361dcf40, 0x35f27a00, 0x35c70780, 0x359b7f80,
+    0x356fe9c0, 0x35444dc0, 0x3518b280, 0x34ed1940,
+    0x34c17c00, 0x3495d4c0, 0x346a1d40, 0x343e4300,
+    0x34122840, 0x33e5ae00, 0x33b8b780, 0x338b4dc0,
+    0x335d9f00, 0x332fdc00, 0x33023440, 0x32d4cc40,
+    0x32a7bc80, 0x327b1d40, 0x324f04c0, 0x32235280,
+    0x31f7b100, 0x31cbc7c0, 0x319f4140, 0x3171fb40,
+    0x31440840, 0x31157d00, 0x30e66e80, 0x30b6fc40,
+    0x30875080, 0x30579600, 0x3027f700, 0x2ff89140,
+    0x2fc976c0, 0x2f9ab880, 0x2f6c6780, 0x2f3e8780,
+    0x2f111000, 0x2ee3f800, 0x2eb73480, 0x2e8a9840,
+    0x2e5dd340, 0x2e3093c0, 0x2e028ac0, 0x2dd39680,
+    0x2da3c480, 0x2d732380, 0x2d41c400, 0x2d0fd300,
+    0x2cdd9ac0, 0x2cab6640, 0x2c797f00, 0x2c480d40,
+    0x2c171700, 0x2be6a0c0, 0x2bb6ae80, 0x2b8739c0,
+    0x2b583200, 0x2b298600, 0x2afb2400, 0x2accfa40,
+    0x2a9ef500, 0x2a710100, 0x2a430ac0, 0x2a14f9c0,
+    0x29e6b0c0, 0x29b81240, 0x29890140, 0x29596900,
+    0x29293e00, 0x28f87500, 0x28c70340, 0x2894efc0,
+    0x28625140, 0x282f4040, 0x27fbd5c0, 0x27c83540,
+    0x27948ec0, 0x27611240, 0x272def80, 0x26fb4cc0,
+    0x26c94780, 0x2697fcc0, 0x26678880, 0x2637f740,
+    0x26094540, 0x25db6dc0, 0x25ae6b40, 0x25821680,
+    0x255627c0, 0x252a55c0, 0x24fe5680, 0x24d1db40,
+    0x24a48fc0, 0x24761f40, 0x244637c0, 0x2414c900,
+    0x23e20240, 0x23ae1740, 0x23793bc0, 0x2343cc00,
+    0x230e4ac0, 0x22d93c80, 0x22a52400, 0x22725180,
+    0x2240e480, 0x2210f9c0, 0x21e2ab40, 0x21b5c7c0,
+    0x2189d2c0, 0x215e4d40, 0x2132b900, 0x2106ba80,
+    0x20da1940, 0x20ac9d80, 0x207e11c0, 0x204e77c0,
+    0x201e0880, 0x1fecfea0, 0x1fbb94e0, 0x1f8a0500,
+    0x1f59d340, 0x1f27ac20, 0x1ef67c60, 0x1ec64e40,
+    0x1e96fdc0, 0x1e686400, 0x1e3a5a00, 0x1e0cae80,
+    0x1ddf25e0, 0x1db18460, 0x1d839020, 0x1d5536e0,
+    0x1d268e80, 0x1cf7ae60, 0x1cc8aea0, 0x1c99af00,
+    0x1c6ad820, 0x1c3c5280, 0x1c0e4500, 0x1be0ab60,
+    0x1bb35620, 0x1b861400, 0x1b58b480, 0x1b2b1a00,
+    0x1afd39c0, 0x1acf09a0, 0x1aa080c0, 0x1a71b020,
+    0x1a42c2a0, 0x1a13e420, 0x19e53fc0, 0x19b6eb00,
+    0x1988e620, 0x195b3060, 0x192dc8a0, 0x1900a8a0,
+    0x18d3c4e0, 0x18a711e0, 0x187a83e0, 0x184e10e0,
+    0x1821b060, 0x17f55a00, 0x17c90580, 0x179cb100,
+    0x177060a0, 0x17441880, 0x1717dd20, 0x16ebb080,
+    0x16bf9260, 0x169382e0, 0x166781c0, 0x163b8f80,
+    0x160fade0, 0x15e3de40, 0x15b82220, 0x158c7ae0,
+    0x1560ea80, 0x15357240, 0x150a1400, 0x14ded020,
+    0x14b3a640, 0x148895a0, 0x145d9dc0, 0x1432bde0,
+    0x1407f540, 0x13dd4380, 0x13b2a860, 0x13882460,
+    0x135db880, 0x133365a0, 0x13092cc0, 0x12df0e60,
+    0x12b50aa0, 0x128b2120, 0x12615200, 0x12379da0,
+    0x120e04c0, 0x11e48820, 0x11bb2860, 0x1191e600,
+    0x1168c080, 0x113fb7a0, 0x1116cb40, 0x10edfba0,
+    0x10c54a00, 0x109cb7a0, 0x10744560, 0x104bf420,
+    0x1023c3e0, 0x0ffbb500, 0x0fd3c790, 0x0fabfbe0,
+    0x0f845290, 0x0f5ccc40, 0x0f356970, 0x0f0e2a60,
+    0x0ee70eb0, 0x0ec01610, 0x0e994040, 0x0e728d50,
+    0x0e4bfdf0, 0x0e2592c0, 0x0dff4c70, 0x0dd92af0,
+    0x0db32da0, 0x0d8d53e0, 0x0d679cf0, 0x0d420880,
+    0x0d1c9680, 0x0cf74700, 0x0cd219f0, 0x0cad0eb0,
+    0x0c882450, 0x0c6359a0, 0x0c3ead90, 0x0c1a1f80,
+    0x0bf5af40, 0x0bd15cf0, 0x0bad2870, 0x0b891440,
+    0x0b652530, 0x0b416020, 0x0b1dca30, 0x0afa6810,
+    0x0ad73ee0, 0x0ab45370, 0x0a91aac0, 0x0a6f49b0,
+    0x0a4da7f0, 0x0a2c7e20, 0x0a0ba310, 0x09eb1220,
+    0x09cac6e0, 0x09aabc70, 0x098aee40, 0x096b57a0,
+    0x094bf400, 0x092cbea0, 0x090db2e0, 0x08eecef0,
+    0x08d01360, 0x08b18110, 0x089318b0, 0x0874db00,
+    0x0856c880, 0x0838e1b0, 0x081b2730, 0x07fd99a8,
+    0x07e03a28, 0x07c309a8, 0x07a60910, 0x07893918,
+    0x076c99d0, 0x07502b90, 0x0733ee70, 0x0717e2f8,
+    0x06fc09b8, 0x06e06378, 0x06c4f0b8, 0x06a9b1c8,
+    0x068ea6a0, 0x0673cf18, 0x06592b18, 0x063ebad0,
+    0x06247ed0, 0x060a7780, 0x05f0a570, 0x05d708b8,
+    0x05bda128, 0x05a46e80, 0x058b7078, 0x0572a740,
+    0x055a1330, 0x0541b4d8, 0x05298c98, 0x05119a88,
+    0x04f9de50, 0x04e257a0, 0x04cb0630, 0x04b3ea00,
+    0x049d0378, 0x04865308, 0x046fd918, 0x045995a8,
+    0x04438860, 0x042db0d0, 0x04180ea0, 0x0402a1d0,
+    0x03ed6abc, 0x03d869b8, 0x03c39f28, 0x03af0af0,
+    0x039aaca0, 0x038683b4, 0x03728fc0, 0x035ed0b0,
+    0x034b46c4, 0x0337f254, 0x0324d3a0, 0x0311eab0,
+    0x02ff370c, 0x02ecb85c, 0x02da6e34, 0x02c858a8,
+    0x02b67820, 0x02a4cd28, 0x02935820, 0x02821920,
+    0x02710fac, 0x02603b54, 0x024f9bb4, 0x023f308c,
+    0x022ef9e8, 0x021ef7c8, 0x020f2a40, 0x01ff908e,
+    0x01f02974, 0x01e0f38a, 0x01d1ed94, 0x01c316d6,
+    0x01b46f5e, 0x01a5f720, 0x0197ae28, 0x018994ea,
+    0x017bac54, 0x016df546, 0x016070ae, 0x01532078,
+    0x01460760, 0x01392834, 0x012c85a4, 0x01201f7a,
+    0x0113f27c, 0x0107fb6c, 0x00fc36fd, 0x00f0a2d5,
+    0x00e53d51, 0x00da050f, 0x00cef88c, 0x00c41869,
+    0x00b9671f, 0x00aee754, 0x00a49b80, 0x009a8384,
+    0x00909ca6, 0x0086e400, 0x007d56e3, 0x0073f48e,
+    0x006abe70, 0x0061b5de, 0x0058dc65, 0x005033b4,
+    0x0047be30, 0x003f7e30, 0x00377619, 0x002fa4d4,
+    0x002805ee, 0x002094cb, 0x00194cb8, 0x00122856,
+    0x000b215c, 0x00043148, 0xfffd51f0, 0xfff683a0,
+    0xffefcd4d, 0xffe9362f, 0xffe2c57d, 0xffdc855c,
+    0xffd682c4, 0xffd0cad4, 0xffcb6a2c, 0xffc663bc,
+    0xffc1b06f, 0xffbd48e1, 0xffb92570, 0xffb53a54,
+    0xffb1779c, 0xffadcd38, 0xffaa2b42, 0xffa68855,
+    0xffa2e141, 0xff9f332c, 0xff9b7b9c, 0xff97bf2e,
+    0xff9409e2, 0xff9067e2, 0xff8ce556, 0xff898bf0,
+    0xff866306, 0xff8371d0, 0xff80bf63, 0xff7e4eba,
+    0xff7c1eaa, 0xff7a2e04, 0xff787b47, 0xff770280,
+    0xff75bd06, 0xff74a3f7, 0xff73b0b2, 0xff72dd02,
+    0xff72237e, 0xff717ebe, 0xff70e94c, 0xff705f59,
+    0xff6fde6a, 0xff6f6426, 0xff6eee40, 0xff6e7d0b,
+    0xff6e1359, 0xff6db403, 0xff6d61f8, 0xff6d2054,
+    0xff6cf267, 0xff6cdb76, 0xff6cdebb, 0xff6cff47,
+    0xff6d3fc9, 0xff6da306, 0xff6e2b82, 0xff6eda13,
+    0xff6fad6d, 0xff70a463, 0xff71bd9d, 0xff72f662,
+    0xff744a80, 0xff75b5c4, 0xff773409, 0xff78c0a6,
+    0xff7a5693, 0xff7bf0dc, 0xff7d8abb, 0xff7f2301,
+    0xff80bc08, 0xff825854, 0xff83fa56, 0xff85a55c,
+    0xff875d22, 0xff892598, 0xff8b025d, 0xff8cf53c,
+    0xff8efdf4, 0xff911c48, 0xff934fc9, 0xff959675,
+    0xff97ec86, 0xff9a4e35, 0xff9cb7d2, 0xff9f26cc,
+    0xffa199ce, 0xffa40f74, 0xffa6867c, 0xffa8feb2,
+    0xffab78e0, 0xffadf5c7, 0xffb07640, 0xffb2fba0,
+    0xffb587a2, 0xffb81bfb, 0xffbaba46, 0xffbd6236,
+    0xffc011a8, 0xffc2c679, 0xffc57e84, 0xffc83894,
+    0xffcaf41a, 0xffcdb0b8, 0xffd06e17, 0xffd32bf7,
+    0xffd5ea38, 0xffd8a8c3, 0xffdb6764, 0xffde25fb,
+    0xffe0e471, 0xffe3a2b2, 0xffe66087, 0xffe91da6,
+    0xffebd978, 0xffee9351, 0xfff14ab0, 0xfff3fef6,
+    0xfff6af94, 0xfff95c0c, 0xfffc03c7, 0xfffea659,
+    0x00015885, 0x0003f2e9, 0x00068a73, 0x00091e8d,
+    0x000bae7f, 0x000e39bf, 0x0010bf96, 0x00133f78,
+    0x0015b8c4, 0x00182ae4, 0x001a9558, 0x001cf7b2,
+    0x001f51e0, 0x0021a3b4, 0x0023ed25, 0x00262df2,
+    0x002865c5, 0x002a9469, 0x002cb967, 0x002ed4aa,
+    0x0030e607, 0x0032ed88, 0x0034eb2f, 0x0036de23,
+    0x0038c503, 0x003a9e4c, 0x003c68a6, 0x003e23dd,
+    0x003fd0db, 0x00417083, 0x0043038b, 0x00448adf,
+    0x00460740, 0x0047799c, 0x0048e2b2, 0x004a42af,
+    0x004b98fb, 0x004ce50b, 0x004e2654, 0x004f5b5d,
+    0x005081c3, 0x00519716, 0x00529920, 0x005386d0,
+    0x0054603f, 0x00552581, 0x0055d6cc, 0x00567558,
+    0x0057033c, 0x005782b4, 0x0057f5b6, 0x00585e46,
+    0x0058be68, 0x005917ff, 0x00596ce4, 0x0059bcc0,
+    0x005a053a, 0x005a43ee, 0x005a76ae, 0x005a9b37,
+    0x005aaf38, 0x005ab07a, 0x005a9cef, 0x005a7349,
+    0x005a3328, 0x0059dc0a, 0x00596db0, 0x0058e8e5,
+    0x00584f98, 0x0057a3c0, 0x0056e738, 0x00561bec,
+    0x005543df, 0x0054610b, 0x0053753e, 0x0052824e,
+    0x005189f6, 0x00508dec, 0x004f8fc0, 0x004e8fd0,
+    0x004d8d26, 0x004c86d7, 0x004b7c0a, 0x004a6b33,
+    0x00495239, 0x00482f0e, 0x0046ffc4, 0x0045c201,
+    0x00447337, 0x004310cc, 0x00419871, 0x004008e4,
+    0x003e6231, 0x003ca460, 0x003acf8a, 0x0038e57a,
+    0x0036e981, 0x0034defa, 0x0032c94b, 0x0030acc6,
+    0x002e8eb4, 0x002c7452, 0x002a62aa, 0x00285bbf,
+    0x00265eda, 0x00246b24, 0x00227f9c, 0x002098e7,
+    0x001eb13b, 0x001cc2ef, 0x001ac899, 0x0018be3d,
+    0x0016a198, 0x00147065, 0x00122897, 0x000fcbc5,
+    0x000d5f03, 0x000ae77a, 0x00086a52, 0x0005eb92,
+    0x00036e4a, 0x0000f57e, 0xfffe8414, 0xfffc1a78,
+    0xfff9b6bb, 0xfff756d9, 0xfff4f8d0, 0xfff29add,
+    0xfff03b87, 0xffedd94c, 0xffeb7295, 0xffe9072b,
+    0xffe6981a, 0xffe4265b, 0xffe1b30e, 0xffdf3f2b,
+    0xffdccb9e, 0xffda5993, 0xffd7ea0c, 0xffd57d60,
+    0xffd31302, 0xffd0aa27, 0xffce4243, 0xffcbdb40,
+    0xffc97595, 0xffc711a2, 0xffc4af9d, 0xffc24fa6,
+    0xffbff1de, 0xffbd9699, 0xffbb3e44, 0xffb8e8d5,
+    0xffb695f4, 0xffb44522, 0xffb1f627, 0xffafa8f0,
+    0xffad5d91, 0xffab140a, 0xffa8cc1c, 0xffa68590,
+    0xffa44066, 0xffa1fca0, 0xff9fba30, 0xff9d7902,
+    0xff9b3916, 0xff98fa6d, 0xff96bd06, 0xff9480b6,
+    0xff924532, 0xff900a24, 0xff8dcf41, 0xff8b9433,
+    0xff895884, 0xff871bd3, 0xff84dd8a, 0xff829d34,
+    0xff805a43, 0xff7e142d, 0xff7bca71, 0xff797c83,
+    0xff7729e3, 0xff74d204, 0xff727451, 0xff70101e,
+    0xff6da493, 0xff6b30d1, 0xff68b3f4, 0xff662d31,
+    0xff639bd1, 0xff60ff09, 0xff5e562c, 0xff5ba3e0,
+    0xff58ee39, 0xff563c22, 0xff5394f3, 0xff50fd1e,
+    0xff4e7599, 0xff4bff32, 0xff499ad4, 0xff47490a,
+    0xff450a36, 0xff42deb7, 0xff40c6cf, 0xff3ec2be,
+    0xff3cd299, 0xff3af681, 0xff392e6a, 0xff377a4a,
+    0xff35d9f7, 0xff344d44, 0xff32d3e8, 0xff316d96,
+    0xff3019d9, 0xff2ed83a, 0xff2da82f, 0xff2c88bf,
+    0xff2b78b4, 0xff2a76cc, 0xff298184, 0xff289890,
+    0xff27bc7d, 0xff26ee21, 0xff262e28, 0xff257cdc,
+    0xff24d9f4, 0xff244524, 0xff23be15, 0xff234488,
+    0xff22d852, 0xff227947, 0xff22273d, 0xff21e1d2,
+    0xff21a871, 0xff217a79, 0xff215748, 0xff213eca,
+    0xff21319e, 0xff21305c, 0xff213baf, 0xff2153c2,
+    0xff21782b, 0xff21a892, 0xff21e477, 0xff222bda,
+    0xff227f26, 0xff22debd, 0xff234b09, 0xff23c394,
+    0xff24471d, 0xff24d42b, 0xff25695c, 0xff260538,
+    0xff26a652, 0xff274b28, 0xff27f22d, 0xff2899d2,
+    0xff295975, 0xff29f2ad, 0xff2a96d7, 0xff2b45f4,
+    0xff2bffe3, 0xff2cc4ba, 0xff2d9458, 0xff2e6ede,
+    0xff2f544c, 0xff3044b7, 0xff314034, 0xff3246fa,
+    0xff33591e, 0xff3476e0, 0xff35a060, 0xff36d534,
+    0xff38148f, 0xff395daf, 0xff3aafd4, 0xff3c0ac8,
+    0xff3d6ed6, 0xff3edc54, 0xff405382, 0xff41d3f5,
+    0xff435ccc, 0xff44ed0f, 0xff4683d3, 0xff482080,
+    0xff49c297, 0xff4b69ab, 0xff4d1547, 0xff4ec4f5,
+    0xff50781d, 0xff522e20, 0xff53e692, 0xff55a15d,
+    0xff575f17, 0xff592022, 0xff5ae4de, 0xff5cacb4,
+    0xff5e75e2, 0xff603ee5, 0xff62062f, 0xff63caab,
+    0xff658b55, 0xff67476d, 0xff68fe11, 0xff6aaea0,
+    0xff6c5899, 0xff6dfb86, 0xff6f96e7, 0xff712a65,
+    0xff72b59f, 0xff74382b, 0xff75b1d3, 0xff772276,
+    0xff788a20, 0xff79e8e5, 0xff7b3ef0, 0xff7c8c98,
+    0xff7dd249, 0xff7f108c, 0xff804804, 0xff817d0e,
+    0xff82b74a, 0xff83fde6, 0xff855762, 0xff86c622,
+    0xff884904, 0xff89ded1, 0xff8b8646, 0xff8d3e4c,
+    0xff8f05cc, 0xff90dbc6, 0xff92bf2a, 0xff94af04,
+    0xff96aa26, 0xff98af9a, 0xff9abe48, 0xff9cd543,
+    0xff9ef3c1, 0xffa118ea, 0xffa343fd, 0xffa57423,
+    0xffa7a890, 0xffa9e084, 0xffac1b31, 0xffae5802,
+    0xffb09680, 0xffb2d621, 0xffb51678, 0xffb75704,
+    0xffb99726, 0xffbbd645, 0xffbe13d7, 0xffc04f26,
+    0xffc2879a, 0xffc4bc72, 0xffc6ed24, 0xffc918e3,
+    0xffcb3eb8, 0xffcd5dcc, 0xffcf7549, 0xffd184d8,
+    0xffd38c8f, 0xffd58ca4, 0xffd7854d, 0xffd97694,
+    0xffdb606e, 0xffdd42d1, 0xffdf1da8, 0xffe0f09b,
+    0xffe2bb00, 0xffe47c41, 0xffe633c6, 0xffe7e150,
+    0xffe98534, 0xffeb1fb4, 0xffecb10e, 0xffee3944,
+    0xffefb7e9, 0xfff12cbe, 0xfff29762, 0xfff3f789,
+    0xfff54cbe, 0xfff69695, 0xfff7d4b8, 0xfff90748,
+    0xfffa2ee5, 0xfffb4c3c, 0xfffc6003, 0xfffd6af0,
+    0xfffe6dda, 0xffff69b8, 0x00005f4b, 0x00014e7f,
+    0x00023646, 0x000315b4, 0x0003ebd3, 0x0004b74a,
+    0x00057677, 0x000627e2, 0x0006ca09, 0x00075ce1,
+    0x0007e196, 0x00085955, 0x0008c556, 0x00092751,
+    0x00098153, 0x0009d581, 0x000a25be, 0x000a732b,
+    0x000abe1f, 0x000b06e4, 0x000b4db1, 0x000b91fa,
+    0x000bd266, 0x000c0da0, 0x000c426e, 0x000c6ffb,
+    0x000c95b0, 0x000cb2f7, 0x000cc76e, 0x000cd317,
+    0x000cd647, 0x000cd17f, 0x000cc52b, 0x000cb1ea,
+    0x000c98c0, 0x000c7a62, 0x000c57c7, 0x000c3187,
+    0x000c0862, 0x000bdcd8, 0x000baf81, 0x000b80c7,
+    0x000b50ec, 0x000b202f, 0x000aeec6, 0x000abcb2,
+    0x000a89d2, 0x000a5605, 0x000a2116, 0x0009eafb,
+    0x0009b37d, 0x00097a9d, 0x00094030, 0x00090440,
+    0x0008c6b9, 0x000887ae, 0x0008470c, 0x00080512,
+    0x0007c1f6, 0x00077df9, 0x0007395a, 0x0006f45b,
+    0x0006af67, 0x00066abe, 0x000626b6, 0x0005e38f,
+    0x0005a1a0, 0x0005611e, 0x00052234, 0x0004e502,
+    0x0004a95d, 0x00046f46, 0x00043691, 0x0003ff33,
+    0x0003c90d, 0x0003941f, 0x00036047, 0x00032d9c,
+    0x0002fc1e, 0x0002cbed, 0x00029d1e, 0x00026fbc,
+    0x000243f2, 0x000219d6, 0x0001f17d, 0x0001caf1,
+    0x0001a63e, 0x00018363, 0x00016256, 0x00014316,
+    0x0001258f, 0x000109cb, 0x0000efaa, 0x0000d720,
+    0x0000c03a, 0x0000aacb, 0x000096de, 0x0000846a,
+    0x0000736d, 0x000063d3, 0x000055a6, 0x000048d0,
+    0x00003d47, 0x000032f6, 0x000029dc, 0x000021d9,
+    0x00001ae3, 0x000014ee, 0x00000fdb, 0x00000ba9,
+    0x00000839, 0x00000589, 0x00000370, 0x000001ee,
+    0x000000d7, 0x00000036, 0xffffffe0, 0xffffffc0,
+    0xffffffd5, 0xfffffff5, 0x0000000b, 0x0000000b,
+    0x0000000b, 0x0000000b, 0xfffffff5, 0xffffffd5,
+    0xffffffca, 0xffffffe0, 0x00000036, 0x000000d7,
+    0x000001ce, 0x0000033b, 0x00000529, 0x000007ad,
+    0x00000ac8, 0x00000e99, 0x00001316, 0x0000185e,
+    0x00001e7e, 0x00002575, 0x00002d4c, 0x0000361b,
+    0x00003fd6, 0x00004a93, 0x00005647, 0x00006312,
+    0x000070de, 0x00007fad, 0x00008f87, 0x0000a064,
+    0x0000b242, 0x0000c52d, 0x0000d919, 0x0000ee12,
+    0x0001040c, 0x00011b13, 0x0001331b, 0x00014c30,
+    0x0001663c, 0x0001814a, 0x00019d4f, 0x0001ba35,
+    0x0001d7e7, 0x0001f645, 0x00021544, 0x000234c3,
+    0x000254b9, 0x00027505, 0x000295a7, 0x0002b67e,
+    0x0002d7a1, 0x0002f904, 0x00031ab2, 0x00033ca0,
+    0x00035ee5, 0x0003818a, 0x0003a485, 0x0003c7e1,
+    0x0003eb72, 0x00040f0e, 0x0004329f, 0x000455e6,
+    0x000478c0, 0x00049aef, 0x0004bc52, 0x0004dca9,
+    0x0004fbde, 0x000519c5, 0x00053635, 0x0005512d,
+    0x00056aae, 0x000582a1, 0x00059927, 0x0005ae40,
+    0x0005c1f6, 0x0005d455, 0x0005e572, 0x0005f56d,
+    0x00060446, 0x0006121e, 0x00061f09, 0x00062b08,
+    0x00063605, 0x00063feb, 0x00064899, 0x00064ff0,
+    0x000655a5, 0x00065996, 0x00065b6f, 0x00065af8,
+    0x000657e9, 0x000651d4, 0x00064884, 0x00063bae,
+    0x00062b33, 0x00061706, 0x0005fefd, 0x0005e344,
+    0x0005c404, 0x0005a195, 0x00057c41, 0x00055473,
+    0x00052ac2, 0x0004ffc4, 0x0004d410, 0x0004a7e5,
+    0x00047b4f, 0x00044e39, 0x00042096, 0x0003f208,
+    0x0003c1e1, 0x00038f77, 0x00035a12, 0x00032127,
+    0x0002e476, 0x0002a389, 0x00025e29, 0x0002146d,
+    0x0001c700, 0x00017682, 0x000123a1, 0x0000cefd,
+    0x000078f7, 0x0000221a, 0xffffcad1, 0xffff7332,
+    0xffff1b1e, 0xfffec253, 0xfffe6891, 0xfffe0da2,
+    0xfffdb15c, 0xfffd5393, 0xfffcf412, 0xfffc92e3,
+    0xfffc3032, 0xfffbcc29, 0xfffb6714, 0xfffb0113,
+    0xfffa9a5b, 0xfffa3337, 0xfff9cbd4, 0xfff96450,
+    0xfff8fcac, 0xfff894dc, 0xfff82cd8, 0xfff7c4a8,
+    0xfff75c6d, 0xfff6f45e, 0xfff68c84, 0xfff62500,
+    0xfff5bde8, 0xfff5575a, 0xfff4f179, 0xfff48c64,
+    0xfff42810, 0xfff3c488, 0xfff361d7, 0xfff30008,
+    0xfff29f3a, 0xfff23f78, 0xfff1e0d8, 0xfff1835b,
+    0xfff1272a, 0xfff0cc46, 0xfff072cf, 0xfff01ad0,
+    0xffefc469, 0xffef6fa4, 0xffef1ca3, 0xffeecb7a,
+    0xffee7c1f, 0xffee2eb2, 0xffede33d, 0xffed99c1,
+    0xffed5249, 0xffed0cde, 0xffecc98d, 0xffec8849,
+    0xffec4934, 0xffec0c38, 0xffebd175, 0xffeb98eb,
+    0xffeb62a4, 0xffeb2ead, 0xffeafd19, 0xffeacdea,
+    0xffeaa129, 0xffea76cc, 0xffea4ef4, 0xffea299f,
+    0xffea06e5, 0xffe9e6ce, 0xffe9c97d, 0xffe9aebb,
+    0xffe99651, 0xffe97fd6, 0xffe96ad3, 0xffe95711,
+    0xffe9447d, 0xffe93315, 0xffe922ce, 0xffe913a0,
+    0xffe90588, 0xffe8f887, 0xffe8ec93, 0xffe8e1c1,
+    0xffe8d806, 0xffe8cf77, 0xffe8c816, 0xffe8c1eb,
+    0xffe8bd03, 0xffe8b967, 0xffe8b72e, 0xffe8b64d,
+    0xffe8b6d8, 0xffe8b8dc, 0xffe8bc6c, 0xffe8c18a,
+    0xffe8c840, 0xffe8d0a4, 0xffe8daca, 0xffe8e69e,
+    0xffe8f42a, 0xffe9035a, 0xffe9142b, 0xffe926a0,
+    0xffe93ab7, 0xffe95066, 0xffe967b8, 0xffe980ad,
+    0xffe99b3a, 0xffe9b754, 0xffe9d511, 0xffe9f45b,
+    0xffea1532, 0xffea3797, 0xffea5b89, 0xffea8108,
+    0xffeaa7ff, 0xffead079, 0xffeafa55, 0xffeb259e,
+    0xffeb5254, 0xffeb8061, 0xffebafdc, 0xffebe0ae,
+    0xffec12ce, 0xffec462f, 0xffec7add, 0xffecb0a3,
+    0xffece774, 0xffed1f32, 0xffed57a7, 0xffed90b2,
+    0xffedca48, 0xffee042a, 0xffee3e57, 0xffee788e,
+};
+
 const DECLARE_ALIGNED(32, float, ff_aac_eld_window_480)[1800] = {
      0.00101191,  0.00440397,  0.00718669,  0.01072130,
      0.01459757,  0.01875954,  0.02308987,  0.02751541,
@@ -2219,3 +2705,456 @@ const DECLARE_ALIGNED(32, float, ff_aac_eld_window_480)[1800] = {
     -0.00115988, -0.00114605, -0.00113200, -0.00111778,
     -0.00110343, -0.00108898, -0.00107448, -0.00105995,
 };
+
+const DECLARE_ALIGNED(32, int, ff_aac_eld_window_480_fixed)[1800] = {
+    0x00109442, 0x00482797, 0x0075bf2a, 0x00afa864,
+    0x00ef2aa5, 0x01335b36, 0x017a4df0, 0x01c2cffe,
+    0x020bfb4c, 0x0254fd74, 0x029d557c, 0x02e50574,
+    0x032c41a8, 0x03732c08, 0x03b9cb88, 0x040032e8,
+    0x044686f0, 0x048cd578, 0x04d30738, 0x05190500,
+    0x055ec210, 0x05a44750, 0x05e9aeb8, 0x062f0c80,
+    0x067477a0, 0x06ba1ac0, 0x07001998, 0x074680e0,
+    0x078d5ec0, 0x07d4d038, 0x081cf8f0, 0x0865f8b0,
+    0x08afe0e0, 0x08fab150, 0x09466cd0, 0x09931910,
+    0x09e0adb0, 0x0a2f1640, 0x0a7e43f0, 0x0ace2960,
+    0x0b1eb180, 0x0b6fc4b0, 0x0bc15050, 0x0c134710,
+    0x0c65a420, 0x0cb86340, 0x0d0b7df0, 0x0d5ef450,
+    0x0db2cb60, 0x0e070180, 0x0e5b91f0, 0x0eb07f20,
+    0x0f05d0a0, 0x0f5b8920, 0x0fb1a950, 0x10082e40,
+    0x105f1400, 0x10b65820, 0x110df780, 0x1165f120,
+    0x11be43e0, 0x1216eea0, 0x126feac0, 0x12c92b00,
+    0x1322a620, 0x137c55c0, 0x13d61ae0, 0x142fc940,
+    0x148949e0, 0x14e28da0, 0x153b9a80, 0x15947640,
+    0x15ed1840, 0x16458660, 0x169deb20, 0x16f663c0,
+    0x174ef8c0, 0x17a7a120, 0x180041c0, 0x1858d000,
+    0x18b14940, 0x1909a140, 0x1961c820, 0x19b9b620,
+    0x1a116480, 0x1a68c1a0, 0x1abfbd00, 0x1b164f60,
+    0x1b6c7580, 0x1bc23120, 0x1c1780e0, 0x1c6c5d00,
+    0x1cc0dbe0, 0x1d1532a0, 0x1d697660, 0x1dbdac20,
+    0x1e11b280, 0x1e655b80, 0x1eb89e80, 0x1f0b7720,
+    0x1f5dd680, 0x1fafaec0, 0x2000fb00, 0x2051c340,
+    0x20a22ac0, 0x20f24580, 0x214213c0, 0x21919140,
+    0x21e0b300, 0x222f7580, 0x227dd900, 0x22cbd880,
+    0x23196ec0, 0x23669b00, 0x23b35d80, 0x23ffb6c0,
+    0x244ba7c0, 0x249731c0, 0x24e25700, 0x252d1940,
+    0x2594ae40, 0x25deea40, 0x2628bd00, 0x26722680,
+    0x26bb2740, 0x2703bf40, 0x274beec0, 0x2793b600,
+    0x27db1500, 0x28220c00, 0x28689b80, 0x28aec4c0,
+    0x28f48800, 0x2939e680, 0x297ee080, 0x29c37600,
+    0x2a07a740, 0x2a4b74c0, 0x2a8ede80, 0x2ad1e500,
+    0x2b148880, 0x2b56c940, 0x2b98a740, 0x2bda2240,
+    0x2c1b3a80, 0x2c5bef80, 0x2c9c4100, 0x2cdc2e80,
+    0x2d1bb800, 0x2d5adc80, 0x2d999b80, 0x2dd7f500,
+    0x2e15e800, 0x2e537400, 0x2e9098c0, 0x2ecd5540,
+    0x2f09a900, 0x2f4592c0, 0x2f811140, 0x2fbc2340,
+    0x2ff6c7c0, 0x3030fe80, 0x306ac6c0, 0x30a41f80,
+    0x30dd07c0, 0x31157dc0, 0x314d7fc0, 0x31850c80,
+    0x31bc22c0, 0x31f2c1c0, 0x3228e840, 0x325e9540,
+    0x3293c7c0, 0x32c87e40, 0x32fcb800, 0x33307340,
+    0x3363aec0, 0x33966940, 0x33c8a140, 0x33fa5580,
+    0x342b84c0, 0x345c2dc0, 0x348c4f80, 0x34bbe900,
+    0x34eaf9c0, 0x35198080, 0x35477d00, 0x3574ee40,
+    0x35a1d340, 0x35ce2bc0, 0x35f9f6c0, 0x36253380,
+    0x364fe180, 0x367a0040, 0x36a38f80, 0x36cc8ec0,
+    0x36f4fe80, 0x371cde80, 0x37442e80, 0x376aef00,
+    0x37912000, 0x37b6c200, 0x37dbd600, 0x38005d00,
+    0x38245840, 0x3847c880, 0x386aaf80, 0x388d0e80,
+    0x38aee700, 0x38d03bc0, 0x38f11000, 0x39116700,
+    0x39314440, 0x3950ab00, 0x396f9e80, 0x398e22c0,
+    0x39ac3c40, 0x39c9f280, 0x39e74cc0, 0x3a045280,
+    0x3a210b40, 0x3a3d7ec0, 0x3a59b480, 0x3a75b480,
+    0x3a918900, 0x3aad3cc0, 0x3ac8db00, 0x3ae46bc0,
+    0x3afff080, 0x3b1b6840, 0x3b36d2c0, 0x3b521980,
+    0x3b6d0780, 0x3b876400, 0x3ba0f4c0, 0x3bb96740,
+    0x3bd03dc0, 0x3be56580, 0x3bf6dec0, 0x3c0c6140,
+    0x3c15a9c0, 0x3c1a5780, 0x3c1fd0c0, 0x3c25edc0,
+    0x3c2c78c0, 0x3c333880, 0x3c39f3c0, 0x3c409100,
+    0x3c471d00, 0x3c4da780, 0x3c543f40, 0x3c5ae880,
+    0x3c619f00, 0x3c685f00, 0x3c6f25c0, 0x3c75f280,
+    0x3c7cc6c0, 0x3c83a2c0, 0x3c8a87c0, 0x3c9175c0,
+    0x3c986d00, 0x3c9f6e00, 0x3ca67880, 0x3cad8c40,
+    0x3cb4a980, 0x3cbbd000, 0x3cc2ffc0, 0x3cca3940,
+    0x3cd17d40, 0x3cd8cb80, 0x3ce02480, 0x3ce78740,
+    0x3ceef3c0, 0x3cf66a00, 0x3cfdea00, 0x3d0574c0,
+    0x3d0d0a40, 0x3d14ab40, 0x3d1c5700, 0x3d240d00,
+    0x3d2bcd40, 0x3d3397c0, 0x3d3b6cc0, 0x3d434d00,
+    0x3d4b38c0, 0x3d532fc0, 0x3d5b3180, 0x3d633dc0,
+    0x3d6b53c0, 0x3d737400, 0x3d7b9f00, 0x3d83d540,
+    0x3d8c1680, 0x3d946200, 0x3d9cb780, 0x3da51680,
+    0x3dad7f00, 0x3db5f140, 0x3dbe6dc0, 0x3dc6f480,
+    0x3dcf8540, 0x3dd81fc0, 0x3de0c300, 0x3de96ec0,
+    0x3df22340, 0x3dfae0c0, 0x3e03a800, 0x3e0c7840,
+    0x3e155180, 0x3e1e32c0, 0x3e271bc0, 0x3e300c00,
+    0x3e390400, 0x3e420400, 0x3e4b0c40, 0x3e541c80,
+    0x3e5d33c0, 0x3e6651c0, 0x3e6f7580, 0x3e789fc0,
+    0x3e81d080, 0x3e8b0880, 0x3e944700, 0x3e9d8c00,
+    0x3ea6d680, 0x3eb02600, 0x3eb97a80, 0x3ec2d400,
+    0x3ecc3340, 0x3ed59880, 0x3edf0300, 0x3ee87280,
+    0x3ef1e600, 0x3efb5d40, 0x3f04d880, 0x3f0e5840,
+    0x3f17dcc0, 0x3f216600, 0x3f2af340, 0x3f348440,
+    0x3f3e1840, 0x3f47af40, 0x3f514a00, 0x3f5ae840,
+    0x3f648b00, 0x3f6e3140, 0x3f77db00, 0x3f818740,
+    0x3f8b3600, 0x3f94e780, 0x3f9e9c40, 0x3fa85480,
+    0x3fb21080, 0x3fbbcfc0, 0x3fc59200, 0x3fcf56c0,
+    0x3fd91dc0, 0x3fe2e640, 0x3fecb040, 0x3ff67b40,
+    0x40098600, 0x40135580, 0x401d2700, 0x4026fa00,
+    0x4030ce80, 0x403aa380, 0x40447900, 0x404e4f00,
+    0x40582400, 0x4061f900, 0x406bcd00, 0x4075a080,
+    0x407f7480, 0x40894900, 0x40931e00, 0x409cf280,
+    0x40a6c600, 0x40b09800, 0x40ba6980, 0x40c43a80,
+    0x40ce0b00, 0x40d7db00, 0x40e1ab00, 0x40eb7980,
+    0x40f54600, 0x40ff1080, 0x4108d980, 0x4112a100,
+    0x411c6800, 0x41262d80, 0x412ff080, 0x4139b180,
+    0x41436e80, 0x414d2980, 0x4156e100, 0x41609700,
+    0x416a4a80, 0x4173fb00, 0x417da800, 0x41875000,
+    0x4190f400, 0x419a9400, 0x41a43000, 0x41adc880,
+    0x41b75d00, 0x41c0ec80, 0x41ca7700, 0x41d3fb00,
+    0x41dd7980, 0x41e6f280, 0x41f06600, 0x41f9d480,
+    0x42033d00, 0x420c9f00, 0x4215f980, 0x421f4d00,
+    0x42289900, 0x4231de80, 0x423b1d00, 0x42445500,
+    0x424d8500, 0x4256ad00, 0x425fcc80, 0x4268e380,
+    0x4271f200, 0x427af900, 0x4283f880, 0x428cef80,
+    0x4295de00, 0x429ec280, 0x42a79d80, 0x42b06f00,
+    0x42b93800, 0x42c1f800, 0x42caaf80, 0x42d35d80,
+    0x42dc0100, 0x42e49b00, 0x42ed2a80, 0x42f5b080,
+    0x42fe2d80, 0x4306a180, 0x430f0c80, 0x43176d80,
+    0x431fc480, 0x43281100, 0x43305400, 0x43388e80,
+    0x4340c000, 0x4348e900, 0x43510900, 0x43591f00,
+    0x43612b80, 0x43692f00, 0x43712900, 0x43791a80,
+    0x43810380, 0x4388e400, 0x4390bc00, 0x43988b00,
+    0x43a05180, 0x43a80f00, 0x43afc480, 0x43b77180,
+    0x43bf1780, 0x43c6b700, 0x43ce5100, 0x43d5e580,
+    0x43dd7100, 0x43e4ef80, 0x43ec5b80, 0x43f3ba80,
+    0x43fb1c80, 0x44029400, 0x440a2e80, 0x4411d080,
+    0x44193800, 0x44202480, 0x44265880, 0x442ba780,
+    0x442d8680, 0x4428a500, 0x44241380, 0x441ccb00,
+    0x44140100, 0x440a1200, 0x43ff7280, 0x43f46980,
+    0x43e93200, 0x43ddff00, 0x43d2dc80, 0x43c7ac00,
+    0x43bc4900, 0x43b09400, 0x43a47d80, 0x4397fd80,
+    0x438b0780, 0x437d9b80, 0x436fd380, 0x4361cd80,
+    0x4353a800, 0x43457500, 0x43373c80, 0x43290500,
+    0x431ad400, 0x430ca280, 0x42fe6000, 0x42f00080,
+    0x42e17380, 0x42d29e00, 0x42c35d80, 0x42b39200,
+    0x42a32080, 0x4291fc00, 0x42801900, 0x426d6d80,
+    0x4259f680, 0x4245bd00, 0x4230ca80, 0x421b2900,
+    0x4204e800, 0x41ee1d00, 0x41d6dd80, 0x41bf3c80,
+    0x41a74680, 0x418f0680, 0x41768800, 0x415dd100,
+    0x4144e400, 0x412bbf80, 0x41126400, 0x40f8cc00,
+    0x40deea00, 0x40c4b100, 0x40aa1400, 0x408f0800,
+    0x40738380, 0x40577d80, 0x403aeb80, 0x401dc180,
+    0x3ffff240, 0x3fe170c0, 0x3fc232c0, 0x3fa23680,
+    0x3f817c40, 0x3f6002c0, 0x3f3ddec0, 0x3f1b4180,
+    0x3ef85d40, 0x3ed56340, 0x3eb27240, 0x3e8f9c40,
+    0x3e6cf400, 0x3e4a81c0, 0x3e282140, 0x3e059980,
+    0x3de2b280, 0x3dbf4100, 0x3d9b3640, 0x3d768b00,
+    0x3d513640, 0x3d2b3840, 0x3d049b80, 0x3cdd6b40,
+    0x3cb5b400, 0x3c8d8f40, 0x3c652080, 0x3c3c8c40,
+    0x3c13f480, 0x3beb7580, 0x3bc327c0, 0x3b9b2680,
+    0x3b737000, 0x3b4bc580, 0x3b23d740, 0x3afb5640,
+    0x3ad21c40, 0x3aa83780, 0x3a7dbc40, 0x3a52bf80,
+    0x3a276600, 0x39fbe0c0, 0x39d06140, 0x39a50ec0,
+    0x3979e300, 0x394ebf40, 0x392386c0, 0x38f82280,
+    0x38cc89c0, 0x38a0b7c0, 0x3874a740, 0x38485840,
+    0x381bd1c0, 0x37ef1b40, 0x37c23cc0, 0x37953dc0,
+    0x376825c0, 0x373afc80, 0x370dc980, 0x36e09440,
+    0x36b41dc0, 0x36862100, 0x3657e480, 0x36297240,
+    0x35fad380, 0x35cc1200, 0x359d36c0, 0x356e4b40,
+    0x353f5880, 0x35106780, 0x34e17780, 0x34b28240,
+    0x34838040, 0x345466c0, 0x34251940, 0x33f57280,
+    0x33c54bc0, 0x33949840, 0x33638380, 0x33324980,
+    0x33012500, 0x32d04480, 0x329fc7c0, 0x326fcbc0,
+    0x324068c0, 0x32116fc0, 0x31e27600, 0x31b30fc0,
+    0x3182e300, 0x3151e240, 0x312029c0, 0x30edd080,
+    0x30baf700, 0x3087cd00, 0x30548600, 0x30215680,
+    0x2fee65c0, 0x2fbbca40, 0x2f899980, 0x2f57e6c0,
+    0x2f26b540, 0x2ef5f980, 0x2ec5aa00, 0x2e95afc0,
+    0x2e65c180, 0x2e357b40, 0x2e047840, 0x2dd27380,
+    0x2d9f6c40, 0x2d6b7780, 0x2d36a6c0, 0x2d012940,
+    0x2ccb5680, 0x2c958a00, 0x2c601b80, 0x2c2b3640,
+    0x2bf6dfc0, 0x2bc31ec0, 0x2b8ff500, 0x2b5d5540,
+    0x2b2b2a00, 0x2af95e80, 0x2ac7dd80, 0x2a968f80,
+    0x2a655d40, 0x2a342f00, 0x2a02e8c0, 0x29d16700,
+    0x299f8640, 0x296d2380, 0x293a2740, 0x29068400,
+    0x28d22b40, 0x289d1540, 0x28675280, 0x28310180,
+    0x27fa3f00, 0x27c32f80, 0x278c08c0, 0x275505c0,
+    0x271e60c0, 0x26e84b00, 0x26b2e880, 0x267e5cc0,
+    0x264ac940, 0x26183a40, 0x25e6aa80, 0x25b615c0,
+    0x25866b80, 0x25576b40, 0x2528ba00, 0x24f9ffc0,
+    0x24cadfc0, 0x249af540, 0x2469da80, 0x24372780,
+    0x2402b800, 0x23ccbfc0, 0x23957cc0, 0x235d3140,
+    0x23245200, 0x22eb8000, 0x22b35cc0, 0x227c7940,
+    0x22471d40, 0x22136840, 0x21e18240, 0x21b15d80,
+    0x21827dc0, 0x21544600, 0x21261b00, 0x20f78600,
+    0x20c83e00, 0x20980000, 0x20668e00, 0x2033f300,
+    0x20007400, 0x1fcc64e0, 0x1f97d120, 0x1f642320,
+    0x1f2f49e0, 0x1efaa840, 0x1ec73580, 0x1e94d880,
+    0x1e636120, 0x1e32a160, 0x1e025ba0, 0x1dd24300,
+    0x1da20e60, 0x1d717940, 0x1d407560, 0x1d0f2040,
+    0x1cdd95c0, 0x1cabf500, 0x1c7a6940, 0x1c492340,
+    0x1c185680, 0x1be818c0, 0x1bb83f60, 0x1b888d20,
+    0x1b58c640, 0x1b28c240, 0x1af871e0, 0x1ac7c960,
+    0x1a96bf00, 0x1a656b60, 0x1a340360, 0x1a02bd20,
+    0x19d1c6c0, 0x19a12f40, 0x1970f480, 0x19411640,
+    0x19119000, 0x18e255a0, 0x18b358a0, 0x18848b20,
+    0x1855e040, 0x18274e00, 0x17f8c9e0, 0x17ca4a80,
+    0x179bce40, 0x176d5a60, 0x173ef400, 0x17109fe0,
+    0x16e25f60, 0x16b43240, 0x16861880, 0x16581220,
+    0x162a20c0, 0x15fc4620, 0x15ce8420, 0x15a0dca0,
+    0x157351c0, 0x1545e580, 0x151899a0, 0x14eb6ec0,
+    0x14be63a0, 0x14917a00, 0x14649ae0, 0x14377060,
+    0x1409d0c0, 0x13dbbb20, 0x13ad58e0, 0x137f0160,
+    0x1350cc80, 0x1322b8c0, 0x12f4ca60, 0x12c704e0,
+    0x129968a0, 0x126bf5c0, 0x123eade0, 0x12119300,
+    0x11e4a660, 0x11b7e860, 0x118b5940, 0x115ef8a0,
+    0x1132c600, 0x1106c1a0, 0x10daecc0, 0x10af4900,
+    0x1083d7a0, 0x10589c00, 0x102d9a00, 0x1002d1e0,
+    0x0fd842c0, 0x0fadde80, 0x0f839a50, 0x0f597700,
+    0x0f2f76e0, 0x0f05a170, 0x0edbf9c0, 0x0eb27f30,
+    0x0e8930d0, 0x0e600d70, 0x0e371550, 0x0e0e4950,
+    0x0de5ab50, 0x0dbd3d20, 0x0d94fe10, 0x0d6cecb0,
+    0x0d450220, 0x0d1d38f0, 0x0cf59130, 0x0cce0c30,
+    0x0ca6af10, 0x0c7f7b80, 0x0c587010, 0x0c318960,
+    0x0c0ac200, 0x0be418d0, 0x0bbd8da0, 0x0b9724e0,
+    0x0b70e6c0, 0x0b4ad970, 0x0b2502f0, 0x0aff6930,
+    0x0ada1250, 0x0ab50430, 0x0a9044d0, 0x0a6bda30,
+    0x0a3bedf0, 0x0a18be40, 0x09f5e530, 0x09d35cf0,
+    0x09b11ff0, 0x098f2890, 0x096d7120, 0x094bf400,
+    0x092aab80, 0x09099240, 0x08e8a620, 0x08c7e850,
+    0x08a75990, 0x0886fae0, 0x0866ccf0, 0x0846d070,
+    0x08270610, 0x08076e70, 0x07e80ac8, 0x07c8dc60,
+    0x07a9e440, 0x078b2348, 0x076c99d0, 0x074e4818,
+    0x07302e50, 0x07124d18, 0x06f4a530, 0x06d73778,
+    0x06ba0488, 0x069d0c88, 0x06804f68, 0x0663cce0,
+    0x06478528, 0x062b78a0, 0x060fa7e8, 0x05f413b8,
+    0x05d8bc38, 0x05bda128, 0x05a2c258, 0x05881f60,
+    0x056db888, 0x05538e60, 0x0539a170, 0x051ff218,
+    0x05068040, 0x04ed4b90, 0x04d45398, 0x04bb9820,
+    0x04a31988, 0x048ad860, 0x0472d528, 0x045b0ff0,
+    0x04438860, 0x042c3de8, 0x04153040, 0x03fe5f4c,
+    0x03e7cb98, 0x03d17580, 0x03bb5d64, 0x03a582e8,
+    0x038fe588, 0x037a8494, 0x03655fcc, 0x03507768,
+    0x033bcbb4, 0x03275d28, 0x03132bc0, 0x02ff370c,
+    0x02eb7e94, 0x02d801e8, 0x02c4c11c, 0x02b1bcbc,
+    0x029ef578, 0x028c6ba8, 0x027a1f20, 0x02680f54,
+    0x02563bac, 0x0244a3c8, 0x023347a0, 0x02222730,
+    0x0211429c, 0x02009938, 0x01f02974, 0x01dff1ae,
+    0x01cff058, 0x01c024c8, 0x01b08ef4, 0x01a12eda,
+    0x019204b0, 0x01831138, 0x01745588, 0x0165d2c2,
+    0x01578a96, 0x01497ffc, 0x013bb670, 0x012e3160,
+    0x0120f146, 0x0113f27c, 0x0107310c, 0x00faa909,
+    0x00ee57a1, 0x00e23b09, 0x00d6515b, 0x00ca9977,
+    0x00bf1509, 0x00b3c74d, 0x00a8b388, 0x009ddb3d,
+    0x00933bf2, 0x0088d22c, 0x007e9a70, 0x0074935a,
+    0x006abe70, 0x00611d5c, 0x0057b1f8, 0x004e7e73,
+    0x0045859b, 0x003cca96, 0x00344f32, 0x002c1074,
+    0x00240873, 0x001c31ba, 0x0014863f, 0x000cfe8b,
+    0x00059307, 0xfffe3b9a, 0xfff6f718, 0xffefcd4d,
+    0xffe8c6f4, 0xffe1ed10, 0xffdb4c57, 0xffd4f484,
+    0xffcef5dc, 0xffc95d0c, 0xffc4284e, 0xffbf4e14,
+    0xffbac5ae, 0xffb68360, 0xffb27548, 0xffae87be,
+    0xffaaa733, 0xffa6c67e, 0xffa2e141, 0xff9ef40c,
+    0xff9afc25, 0xff970058, 0xff930f7c, 0xff8f3857,
+    0xff8b8900, 0xff880bfe, 0xff84c9ea, 0xff81cbbd,
+    0xff7f17ad, 0xff7cadc6, 0xff7a8c4e, 0xff78b1cd,
+    0xff7719f3, 0xff75bd06, 0xff7492a4, 0xff7392bf,
+    0xff72b600, 0xff71f5c6, 0xff714b72, 0xff70b0ed,
+    0xff702232, 0xff6f9c90, 0xff6f1cee, 0xff6ea21f,
+    0xff6e2e9c, 0xff6dc617, 0xff6d6c09, 0xff6d2425,
+    0xff6cf267, 0xff6cdaca, 0xff6ce155, 0xff6d0983,
+    0xff6d56bb, 0xff6dcc4c, 0xff6e6cd0, 0xff6f3832,
+    0xff702cc4, 0xff71492e, 0xff728ae2, 0xff73ed63,
+    0xff756b7c, 0xff77001c, 0xff78a5d9, 0xff7a5693,
+    0xff7c0c40, 0xff7dc141, 0xff7f74aa, 0xff81298b,
+    0xff82e2de, 0xff84a3de, 0xff8670bd, 0xff884e42,
+    0xff8a410c, 0xff8c4c7f, 0xff8e70fc, 0xff90ae18,
+    0xff93037e, 0xff956f12, 0xff97ec86, 0xff9a7724,
+    0xff9d0a9d, 0xff9fa3ea, 0xffa2417e, 0xffa4e1ac,
+    0xffa78332, 0xffaa265a, 0xffaccc26, 0xffaf758e,
+    0xffb223d4, 0xffb4d906, 0xffb79726, 0xffba604e,
+    0xffbd349e, 0xffc011a8, 0xffc2f4d2, 0xffc5db82,
+    0xffc8c45f, 0xffcbaed5, 0xffce9a6d, 0xffd186c6,
+    0xffd473aa, 0xffd760e5, 0xffda4e55, 0xffdd3bd0,
+    0xffe0292b, 0xffe31645, 0xffe602ff, 0xffe8eef7,
+    0xffebd978, 0xffeec1bf, 0xfff1a72c, 0xfff488fe,
+    0xfff76689, 0xfffa3f2c, 0xfffd1245, 0xffffdf33,
+    0x000020ac, 0x0002e66f, 0x0005a937, 0x00086839,
+    0x000b22b3, 0x000dd7da, 0x001086ec, 0x00132f3c,
+    0x0015d001, 0x00186897, 0x001af849, 0x001d7eb6,
+    0x001ffbbe, 0x00226f41, 0x0024d8e8, 0x00273874,
+    0x00298d82, 0x002bd7aa, 0x002e16d4, 0x00304af6,
+    0x00327406, 0x00349203, 0x0036a416, 0x0038a893,
+    0x003a9da0, 0x003c8170, 0x003e53b8, 0x0040159a,
+    0x0041c816, 0x00436c92, 0x0045042c, 0x00468ff2,
+    0x00481106, 0x004987fe, 0x004af466, 0x004c5599,
+    0x004daae4, 0x004ef28c, 0x005029c4, 0x00514d9a,
+    0x00525b57, 0x005351f7, 0x00543190, 0x0054fa43,
+    0x0055ac2f, 0x00564938, 0x0056d3f7, 0x00574f3c,
+    0x0057bdd7, 0x00582260, 0x00587f28, 0x0058d6b1,
+    0x0059293c, 0x0059741a, 0x0059b472, 0x0059e73c,
+    0x005a0976, 0x005a1870, 0x005a116e, 0x0059f224,
+    0x0059b964, 0x005966ce, 0x0058f9e2, 0x005872e8,
+    0x0057d407, 0x00571f82, 0x005657b0, 0x00557ecd,
+    0x00549731, 0x0053a34b, 0x0052a56a, 0x00519fc6,
+    0x00509482, 0x004f85a4, 0x004e74ee, 0x004d6214,
+    0x004c4bd3, 0x004b314c, 0x004a1110, 0x0048e8c8,
+    0x0047b5f7, 0x00467626, 0x00452690, 0x0043c405,
+    0x00424b7f, 0x0040ba04, 0x003f0e53, 0x003d488b,
+    0x003b688c, 0x00396eb6, 0x00375dfb, 0x00353aaa,
+    0x003308ac, 0x0030ccb1, 0x002e8cf1, 0x002c4fd5,
+    0x002a1be8, 0x0027f486, 0x0025d90d, 0x0023c852,
+    0x0021c13b, 0x001fbf23, 0x001dbafc, 0x001badc6,
+    0x00199136, 0x00176150, 0x00151b86, 0x0012bcd1,
+    0x001044d1, 0x000db8d0, 0x000b1f43, 0x00087e89,
+    0x0005dbe2, 0x00033b1e, 0x00009fee, 0xfffe0d82,
+    0xfffb83cf, 0xfff90047, 0xfff6805a, 0xfff4019a,
+    0xfff18203, 0xffeeffb2, 0xffec78ba, 0xffe9ec4d,
+    0xffe75b4e, 0xffe4c71f, 0xffe23138, 0xffdf9ae6,
+    0xffdd0574, 0xffda723c, 0xffd7e24a, 0xffd55567,
+    0xffd2cabe, 0xffd04161, 0xffcdb890, 0xffcb306a,
+    0xffc8a95c, 0xffc62406, 0xffc3a140, 0xffc12188,
+    0xffbea542, 0xffbc2cc2, 0xffb9b7d2, 0xffb745f2,
+    0xffb4d6ac, 0xffb268fe, 0xffaffc72, 0xffad90e8,
+    0xffab263e, 0xffa8bcb8, 0xffa6547e, 0xffa3ed7b,
+    0xffa187ba, 0xff9f2351, 0xff9cc055, 0xff9a5ebc,
+    0xff97fe84, 0xff959f84, 0xff934146, 0xff90e37d,
+    0xff8e858a, 0xff8c26c0, 0xff89c69e, 0xff876483,
+    0xff84ffe4, 0xff82982b, 0xff802cb6, 0xff7dbccf,
+    0xff7b47b4, 0xff78ccd0, 0xff764b6c, 0xff73c2db,
+    0xff713227, 0xff6e9864, 0xff6bf470, 0xff694553,
+    0xff668a0d, 0xff63c1a6, 0xff60ec34, 0xff5e0e9e,
+    0xff5b30d3, 0xff585b8c, 0xff5595c9, 0xff52e1da,
+    0xff5040a0, 0xff4db31c, 0xff4b3a3b, 0xff48d67e,
+    0xff468850, 0xff445011, 0xff422ded, 0xff4021f9,
+    0xff3e2c56, 0xff3c4cf8, 0xff3a83df, 0xff38d0ec,
+    0xff3733c9, 0xff35ac14, 0xff343963, 0xff32db09,
+    0xff319066, 0xff305898, 0xff2f323d, 0xff2e1bb2,
+    0xff2d1369, 0xff2c18f8, 0xff2b2d2a, 0xff2a50e1,
+    0xff2984f4, 0xff28c978, 0xff281e01, 0xff278245,
+    0xff26f5c3, 0xff26785a, 0xff2609bf, 0xff25a9c8,
+    0xff255814, 0xff2513f6, 0xff24dcc4, 0xff24b1a6,
+    0xff2492b1, 0xff248093, 0xff247c0b, 0xff2485c6,
+    0xff249daf, 0xff24c359, 0xff24f639, 0xff253605,
+    0xff258312, 0xff25ddd5, 0xff2646e7, 0xff26be25,
+    0xff274264, 0xff27d1f6, 0xff286b19, 0xff290c13,
+    0xff29b30d, 0xff2a5e38, 0xff2b0bbd, 0xff2bb9a2,
+    0xff29a9d2, 0xff2a53dc, 0xff2b0a5a, 0xff2bcd43,
+    0xff2c9c76, 0xff2d7808, 0xff2e5ffa, 0xff2f544c,
+    0xff305528, 0xff316299, 0xff327ce0, 0xff33a432,
+    0xff34d8ba, 0xff361a8e, 0xff3768f8, 0xff38c2f5,
+    0xff3a2784, 0xff3b9623, 0xff3d0ef4, 0xff3e9277,
+    0xff4020ed, 0xff41ba14, 0xff435ccc, 0xff4507fd,
+    0xff46ba84, 0xff4873ac, 0xff4a32ea, 0xff4bf7bb,
+    0xff4dc17f, 0xff4f8fa0, 0xff516167, 0xff53361d,
+    0xff550d79, 0xff56e7ee, 0xff58c5ff, 0xff5aa84d,
+    0xff5c8e41, 0xff5e75e2, 0xff605d4d, 0xff6242b6,
+    0xff6424b8, 0xff66023d, 0xff67da44, 0xff69abd6,
+    0xff6b7646, 0xff6d38e8, 0xff6ef348, 0xff70a4ce,
+    0xff724d0f, 0xff73eb95, 0xff757fff, 0xff770a2d,
+    0xff788a20, 0xff79fff6, 0xff7b6be7, 0xff7cce52,
+    0xff7e27e4, 0xff7f78fc, 0xff80c38a, 0xff820e98,
+    0xff836378, 0xff84caaa, 0xff864990, 0xff87dff4,
+    0xff898c30, 0xff8b4cda, 0xff8d207a, 0xff8f05cc,
+    0xff90fb9b, 0xff930098, 0xff95138e, 0xff97332d,
+    0xff995e2a, 0xff9b934e, 0xff9dd18c, 0xffa017e3,
+    0xffa26550, 0xffa4b8e7, 0xffa711a8, 0xffa96eae,
+    0xffabcefc, 0xffae31cc, 0xffb09680, 0xffb2fc82,
+    0xffb5635a, 0xffb7ca52, 0xffba30a8, 0xffbc95a8,
+    0xffbef8a4, 0xffc158d0, 0xffc3b557, 0xffc60d6b,
+    0xffc86041, 0xffcaacb7, 0xffccf1cb, 0xffcf2e5c,
+    0xffd161e8, 0xffd38c8f, 0xffd5ae88, 0xffd7c808,
+    0xffd9d925, 0xffdbe1c8, 0xffdde1f3, 0xffdfd964,
+    0xffe1c79b, 0xffe3abcc, 0xffe5852a, 0xffe75341,
+    0xffe9162f, 0xffeace55, 0xffec7c15, 0xffee1f63,
+    0xffefb7e9, 0xfff1453d, 0xfff2c6fd, 0xfff43ca8,
+    0xfff5a5d4, 0xfff701ea, 0xfff850b4, 0xfff99288,
+    0xfffac853, 0xfffbf2d5, 0xfffd12e6, 0xfffe2991,
+    0xffff37e4, 0x00003eea, 0x00013ec4, 0x00023646,
+    0x0003244d, 0x00040797, 0x0004de8c, 0x0005a734,
+    0x00065fab, 0x0007068f, 0x00079c82, 0x000822fa,
+    0x00089b70, 0x000907a6, 0x00096a01, 0x0009c506,
+    0x000a1b37, 0x000a6e18, 0x000abe1f, 0x000b0bac,
+    0x000b5701, 0x000b9f3b, 0x000be2c2, 0x000c1fff,
+    0x000c5599, 0x000c829a, 0x000ca661, 0x000cc058,
+    0x000cd028, 0x000cd63d, 0x000cd317, 0x000cc739,
+    0x000cb36d, 0x000c98c0, 0x000c7833, 0x000c52df,
+    0x000c2984, 0x000bfcf9, 0x000bcdea, 0x000b9cf7,
+    0x000b6a97, 0x000b3700, 0x000b029d, 0x000acd79,
+    0x000a977e, 0x000a6076, 0x000a2838, 0x0009eea1,
+    0x0009b37d, 0x000976c2, 0x0009384e, 0x0008f816,
+    0x0008b612, 0x0008724a, 0x00082cd5, 0x0007e5e8,
+    0x00079dce, 0x000754de, 0x00070b62, 0x0006c1c6,
+    0x0006786a, 0x00062fba, 0x0005e801, 0x0005a1a0,
+    0x00055ce1, 0x000519fb, 0x0004d8f8, 0x000499b8,
+    0x00045c30, 0x00042040, 0x0003e5c8, 0x0003acb3,
+    0x000374df, 0x00033e59, 0x00030934, 0x0002d57d,
+    0x0002a348, 0x000272b6, 0x000243f2, 0x00021711,
+    0x0001ec3e, 0x0001c37a, 0x00019cc3, 0x00017830,
+    0x000155a0, 0x00013514, 0x0001168b, 0x0000f9e6,
+    0x0000df23, 0x0000c62e, 0x0000aef2, 0x00009978,
+    0x000085a1, 0x0000736d, 0x000062dc, 0x000053d8,
+    0x0000466c, 0x00003a62, 0x00002fd1, 0x00002681,
+    0x00001e73, 0x00001792, 0x000011c9, 0x00000cf6,
+    0x0000091a, 0x000005ff, 0x000003b1, 0x00000203,
+    0x000000d7, 0x0000002b, 0xffffffd5, 0xffffffc0,
+    0xffffffd5, 0x00000000, 0x00000015, 0x00000000,
+    0x00000000, 0x00000015, 0x00000000, 0xffffffd5,
+    0xffffffca, 0xffffffd5, 0x0000002b, 0x000000cc,
+    0x000001e3, 0x0000037b, 0x0000059f, 0x0000086e,
+    0x00000bf4, 0x0000103b, 0x00001564, 0x00001b6e,
+    0x0000226f, 0x00002a68, 0x00003377, 0x00003d93,
+    0x000048c5, 0x00005525, 0x000062a6, 0x00007155,
+    0x0000812f, 0x00009237, 0x0000a455, 0x0000b7ab,
+    0x0000cc18, 0x0000e1bd, 0x0000f878, 0x0001106c,
+    0x00012981, 0x000143c2, 0x00015f30, 0x00017bb6,
+    0x00019948, 0x0001b7e6, 0x0001d771, 0x0001f7bc,
+    0x000218b4, 0x00023a42, 0x00025c3b, 0x00027ea0,
+    0x0002a150, 0x0002c440, 0x0002e771, 0x00030aed,
+    0x00032eb4, 0x000352db, 0x00037759, 0x00039c4c,
+    0x0003c1ac, 0x0003e74b, 0x00040d00, 0x0004329f,
+    0x000457de, 0x00047c9c, 0x0004a083, 0x0004c35e,
+    0x0004e502, 0x00050543, 0x000523ec, 0x000540e7,
+    0x00055c2b, 0x000575c0, 0x00058da9, 0x0005a3e4,
+    0x0005b886, 0x0005cbb1, 0x0005dd65, 0x0005edcb,
+    0x0005fcfa, 0x00060afc, 0x00061808, 0x000623fc,
+    0x00062ec3, 0x00063849, 0x0006404b, 0x000646ac,
+    0x00064b13, 0x00064d37, 0x00064cd6, 0x0006497b,
+    0x000642c5, 0x0006385e, 0x000629f0, 0x00061766,
+    0x000600a0, 0x0005e57d, 0x0005c63e, 0x0005a322,
+    0x00057c97, 0x00055306, 0x00052711, 0x0004f96f,
+    0x0004caeb, 0x00049bfc, 0x00046c96, 0x00043cbb,
+    0x00040c3f, 0x0003daab, 0x0003a734, 0x000370f9,
+    0x0003372d, 0x0002f944, 0x0002b6d4, 0x00026f71,
+    0x000222fb, 0x0001d212, 0x00017d84, 0x00012630,
+    0x0000ccda, 0x00007200, 0x0000163b, 0xffffba15,
+    0xffff5da3, 0xffff0091, 0xfffea293, 0xfffe4367,
+    0xfffde2da, 0xfffd809f, 0xfffd1c81, 0xfffcb66a,
+    0xfffc4e90, 0xfffbe53e, 0xfffb7aa0, 0xfffb0f0a,
+    0xfffaa2c9, 0xfffa3612, 0xfff9c92f, 0xfff95c2d,
+    0xfff8eef4, 0xfff8817c, 0xfff813c3, 0xfff7a5d4,
+    0xfff737e5, 0xfff6ca17, 0xfff65c9e, 0xfff5efbc,
+    0xfff58390, 0xfff51830, 0xfff4adbc, 0xfff44435,
+    0xfff3db9a, 0xfff373d6, 0xfff30cfd, 0xfff2a71c,
+    0xfff24248, 0xfff1de9f, 0xfff17c44, 0xfff11b56,
+    0xfff0bbea, 0xfff05e17, 0xfff00206, 0xffefa7d9,
+    0xffef4f99, 0xffeef95d, 0xffeea53a, 0xffee533a,
+    0xffee035e, 0xffedb5b0, 0xffed6a3c, 0xffed20f5,
+    0xffecd9fe, 0xffec9555, 0xffec5305, 0xffec1319,
+    0xffebd591, 0xffeb9a83, 0xffeb61f9, 0xffeb2bfe,
+    0xffeaf89c, 0xffeac7ea, 0xffea99d2, 0xffea6e7e,
+    0xffea45ef, 0xffea203a, 0xffe9fda0, 0xffe9decc,
+    0xffe9c3de, 0xffe9ac56, 0xffe99789, 0xffe9845e,
+    0xffe97295, 0xffe96219, 0xffe952ea, 0xffe944f3,
+    0xffe93833, 0xffe92c9f, 0xffe92238, 0xffe918fe,
+    0xffe910fb, 0xffe90a3a, 0xffe904c6, 0xffe900a0,
+    0xffe8fddb, 0xffe8fc83, 0xffe8fca4, 0xffe8fe3c,
+    0xffe9016c, 0xffe9061e, 0xffe90c74, 0xffe9146c,
+    0xffe91e11, 0xffe929a5, 0xffe93731, 0xffe946c0,
+    0xffe95833, 0xffe96b7e, 0xffe98082, 0xffe9975e,
+    0xffe9affd, 0xffe9ca5e, 0xffe9e68e, 0xffea0481,
+    0xffea242b, 0xffea458e, 0xffea6894, 0xffea8d52,
+    0xffeab3c8, 0xffeadc0c, 0xffeb05fe, 0xffeb31a7,
+    0xffeb5ede, 0xffeb8da2, 0xffebbdf4, 0xffebefbd,
+    0xffec231f, 0xffec5802, 0xffec8e5e, 0xffecc61c,
+    0xffecff1c, 0xffed391e, 0xffed740c, 0xffedafb1,
+    0xffedebe1, 0xffee287d, 0xffee654e, 0xffeea23f,
+};
diff --git a/libavcodec/aactab.h b/libavcodec/aactab.h
index fc26db8a4d..9f333801e9 100644
--- a/libavcodec/aactab.h
+++ b/libavcodec/aactab.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
  * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,13 +41,53 @@
  * encoder.
  */
 
+/* @name tns_tmp2_map
+ * Tables of the tmp2[] arrays of LPC coefficients used for TNS.
+ * The suffix _M_N[] indicate the values of coef_compress and coef_res
+ * respectively.
+ * @{
+ */
+static const INTFLOAT tns_tmp2_map_1_3[4] = {
+    Q31(0.00000000f), Q31(-0.43388373f),  Q31(0.64278758f),  Q31(0.34202015f),
+};
+
+static const INTFLOAT tns_tmp2_map_0_3[8] = {
+    Q31(0.00000000f), Q31(-0.43388373f), Q31(-0.78183150f), Q31(-0.97492790f),
+    Q31(0.98480773f), Q31( 0.86602539f), Q31( 0.64278758f), Q31( 0.34202015f),
+};
+
+static const INTFLOAT tns_tmp2_map_1_4[8] = {
+    Q31(0.00000000f), Q31(-0.20791170f), Q31(-0.40673664f), Q31(-0.58778524f),
+    Q31(0.67369562f), Q31( 0.52643216f), Q31( 0.36124167f), Q31( 0.18374951f),
+};
+
+static const INTFLOAT tns_tmp2_map_0_4[16] = {
+    Q31( 0.00000000f), Q31(-0.20791170f), Q31(-0.40673664f), Q31(-0.58778524f),
+    Q31(-0.74314481f), Q31(-0.86602539f), Q31(-0.95105654f), Q31(-0.99452192f),
+    Q31( 0.99573416f), Q31( 0.96182561f), Q31( 0.89516330f), Q31( 0.79801720f),
+    Q31( 0.67369562f), Q31( 0.52643216f), Q31( 0.36124167f), Q31( 0.18374951f),
+};
+
+static const INTFLOAT * const tns_tmp2_map[4] = {
+    tns_tmp2_map_0_3,
+    tns_tmp2_map_0_4,
+    tns_tmp2_map_1_3,
+    tns_tmp2_map_1_4
+};
+// @}
+
 /* @name window coefficients
  * @{
  */
 DECLARE_ALIGNED(32, extern float,  ff_aac_kbd_long_1024)[1024];
 DECLARE_ALIGNED(32, extern float,  ff_aac_kbd_short_128)[128];
+DECLARE_ALIGNED(32, extern int,    ff_aac_kbd_long_1024_fixed)[1024];
+DECLARE_ALIGNED(32, extern int,    ff_aac_kbd_long_512_fixed)[512];
+DECLARE_ALIGNED(32, extern int,    ff_aac_kbd_short_128_fixed)[128];
 const DECLARE_ALIGNED(32, extern float, ff_aac_eld_window_512)[1920];
+const DECLARE_ALIGNED(32, extern int,   ff_aac_eld_window_512_fixed)[1920];
 const DECLARE_ALIGNED(32, extern float, ff_aac_eld_window_480)[1800];
+const DECLARE_ALIGNED(32, extern int,   ff_aac_eld_window_480_fixed)[1800];
 // @}
 
 /* @name number of scalefactor window bands for long and short transform windows respectively
diff --git a/libavcodec/aandcttab.c b/libavcodec/aandcttab.c
index 0c5b573412..97013d2b52 100644
--- a/libavcodec/aandcttab.c
+++ b/libavcodec/aandcttab.c
@@ -1,24 +1,24 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * AAN (Arai Agui Aakajima) (I)DCT tables
+ * AAN (Arai, Agui and Nakajima) (I)DCT tables
  */
 
 #include <stdint.h>
diff --git a/libavcodec/aandcttab.h b/libavcodec/aandcttab.h
index daccb7bb0b..b0a2f44ecd 100644
--- a/libavcodec/aandcttab.h
+++ b/libavcodec/aandcttab.h
@@ -1,24 +1,24 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * AAN (Arai Agui Nakajima) (I)DCT tables
+ * AAN (Arai, Agui and Nakajima) (I)DCT tables
  */
 
 #ifndef AVCODEC_AANDCTTAB_H
diff --git a/libavcodec/aarch64/asm-offsets.h b/libavcodec/aarch64/asm-offsets.h
index 45b5c40f80..8defd7c9ec 100644
--- a/libavcodec/aarch64/asm-offsets.h
+++ b/libavcodec/aarch64/asm-offsets.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/cabac.h b/libavcodec/aarch64/cabac.h
index e12953e86c..6b9b77eb30 100644
--- a/libavcodec/aarch64/cabac.h
+++ b/libavcodec/aarch64/cabac.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/fft_init_aarch64.c b/libavcodec/aarch64/fft_init_aarch64.c
index 589e82d331..8514d3b07b 100644
--- a/libavcodec/aarch64/fft_init_aarch64.c
+++ b/libavcodec/aarch64/fft_init_aarch64.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S
index e205e23d88..862039f97d 100644
--- a/libavcodec/aarch64/fft_neon.S
+++ b/libavcodec/aarch64/fft_neon.S
@@ -8,20 +8,20 @@
  * This algorithm (though not any of the implementation details) is
  * based on libdjbfft by D. J. Bernstein.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264chroma_init_aarch64.c b/libavcodec/aarch64/h264chroma_init_aarch64.c
index c7679ab4cc..2af62becf7 100644
--- a/libavcodec/aarch64/h264chroma_init_aarch64.c
+++ b/libavcodec/aarch64/h264chroma_init_aarch64.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised H.264 chroma functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264cmc_neon.S b/libavcodec/aarch64/h264cmc_neon.S
index d1025c7a25..486079f87c 100644
--- a/libavcodec/aarch64/h264cmc_neon.S
+++ b/libavcodec/aarch64/h264cmc_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index b106f11134..e0f378f5ab 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -78,6 +78,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
         c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
         c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
         c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+        if (chroma_format_idc <= 1)
         c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
 
         c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index 9b4610a4d4..4ec35f2905 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 99c2cb5030..04b5a47f04 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
index 8f912cbca9..b144376f90 100644
--- a/libavcodec/aarch64/h264pred_init.c
+++ b/libavcodec/aarch64/h264pred_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index a38a27f186..213b40b3e7 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
index 74088b216c..77f41d9a21 100644
--- a/libavcodec/aarch64/h264qpel_init_aarch64.c
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index 731dc0658d..d27cfac494 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/hpeldsp_init_aarch64.c b/libavcodec/aarch64/hpeldsp_init_aarch64.c
index 6bc4c09f6c..144ae2bcc4 100644
--- a/libavcodec/aarch64/hpeldsp_init_aarch64.c
+++ b/libavcodec/aarch64/hpeldsp_init_aarch64.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/hpeldsp_neon.S b/libavcodec/aarch64/hpeldsp_neon.S
index 29782908f8..a491c173bb 100644
--- a/libavcodec/aarch64/hpeldsp_neon.S
+++ b/libavcodec/aarch64/hpeldsp_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/imdct15_init.c b/libavcodec/aarch64/imdct15_init.c
index 38018f2b4a..58af9f00c0 100644
--- a/libavcodec/aarch64/imdct15_init.c
+++ b/libavcodec/aarch64/imdct15_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/imdct15_neon.S b/libavcodec/aarch64/imdct15_neon.S
index d99edf4108..97e1442ccc 100644
--- a/libavcodec/aarch64/imdct15_neon.S
+++ b/libavcodec/aarch64/imdct15_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S
index bccd8323fd..1fd199c972 100644
--- a/libavcodec/aarch64/mdct_neon.S
+++ b/libavcodec/aarch64/mdct_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/mpegaudiodsp_init.c b/libavcodec/aarch64/mpegaudiodsp_init.c
index a8b2baf955..b94514645f 100644
--- a/libavcodec/aarch64/mpegaudiodsp_init.c
+++ b/libavcodec/aarch64/mpegaudiodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/mpegaudiodsp_neon.S b/libavcodec/aarch64/mpegaudiodsp_neon.S
index 808576af72..733fc8406b 100644
--- a/libavcodec/aarch64/mpegaudiodsp_neon.S
+++ b/libavcodec/aarch64/mpegaudiodsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index f1072b73e5..619aec6426 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/neontest.c b/libavcodec/aarch64/neontest.c
index 041482913f..6e41f3736e 100644
--- a/libavcodec/aarch64/neontest.c
+++ b/libavcodec/aarch64/neontest.c
@@ -2,20 +2,20 @@
  * check NEON registers for clobbers
  * Copyright (c) 2013 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/rv40dsp_init_aarch64.c b/libavcodec/aarch64/rv40dsp_init_aarch64.c
index 0bb404f6d2..764bc1ef39 100644
--- a/libavcodec/aarch64/rv40dsp_init_aarch64.c
+++ b/libavcodec/aarch64/rv40dsp_init_aarch64.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index 11cd81eb5c..e59e55ec88 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/videodsp.S b/libavcodec/aarch64/videodsp.S
index 7ce5a7ddf6..24067cc2af 100644
--- a/libavcodec/aarch64/videodsp.S
+++ b/libavcodec/aarch64/videodsp.S
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/videodsp_init.c b/libavcodec/aarch64/videodsp_init.c
index 59b697d4f4..6f667a6d3e 100644
--- a/libavcodec/aarch64/videodsp_init.c
+++ b/libavcodec/aarch64/videodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vorbisdsp_init.c b/libavcodec/aarch64/vorbisdsp_init.c
index 3559b54a30..c796f95e61 100644
--- a/libavcodec/aarch64/vorbisdsp_init.c
+++ b/libavcodec/aarch64/vorbisdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vorbisdsp_neon.S b/libavcodec/aarch64/vorbisdsp_neon.S
index 11f71f1d89..e76feebc54 100644
--- a/libavcodec/aarch64/vorbisdsp_neon.S
+++ b/libavcodec/aarch64/vorbisdsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aasc.c b/libavcodec/aasc.c
index 462ee67786..a91721e28d 100644
--- a/libavcodec/aasc.c
+++ b/libavcodec/aasc.c
@@ -1,21 +1,21 @@
 /*
  * Autodesk RLE Decoder
- * Copyright (C) 2005 the ffmpeg project
+ * Copyright (c) 2005 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,15 +36,39 @@ typedef struct AascContext {
     AVCodecContext *avctx;
     GetByteContext gb;
     AVFrame *frame;
+
+    uint32_t palette[AVPALETTE_COUNT];
+    int palette_size;
 } AascContext;
 
 static av_cold int aasc_decode_init(AVCodecContext *avctx)
 {
     AascContext *s = avctx->priv_data;
+    uint8_t *ptr;
+    int i;
 
     s->avctx = avctx;
-
-    avctx->pix_fmt = AV_PIX_FMT_BGR24;
+    switch (avctx->bits_per_coded_sample) {
+    case 8:
+        avctx->pix_fmt = AV_PIX_FMT_PAL8;
+
+        ptr = avctx->extradata;
+        s->palette_size = FFMIN(avctx->extradata_size, AVPALETTE_SIZE);
+        for (i = 0; i < s->palette_size / 4; i++) {
+            s->palette[i] = 0xFFU << 24 | AV_RL32(ptr);
+            ptr += 4;
+        }
+        break;
+    case 16:
+        avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
+        break;
+    case 24:
+        avctx->pix_fmt = AV_PIX_FMT_BGR24;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported bit depth: %d\n", avctx->bits_per_coded_sample);
+        return -1;
+    }
 
     s->frame = av_frame_alloc();
     if (!s->frame)
@@ -60,27 +84,35 @@ static int aasc_decode_frame(AVCodecContext *avctx,
     const uint8_t *buf = avpkt->data;
     int buf_size       = avpkt->size;
     AascContext *s     = avctx->priv_data;
-    int compr, i, stride, ret;
+    int compr, i, stride, psize, ret;
 
-    if (buf_size < 4)
+    if (buf_size < 4) {
+        av_log(avctx, AV_LOG_ERROR, "frame too short\n");
         return AVERROR_INVALIDDATA;
+    }
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     compr     = AV_RL32(buf);
     buf      += 4;
     buf_size -= 4;
+    psize = avctx->bits_per_coded_sample / 8;
+    switch (avctx->codec_tag) {
+    case MKTAG('A', 'A', 'S', '4'):
+        bytestream2_init(&s->gb, buf - 4, buf_size + 4);
+        ff_msrle_decode(avctx, (AVPicture*)s->frame, 8, &s->gb);
+        break;
+    case MKTAG('A', 'A', 'S', 'C'):
     switch (compr) {
     case 0:
-        stride = (avctx->width * 3 + 3) & ~3;
+        stride = (avctx->width * psize + psize) & ~psize;
         if (buf_size < stride * avctx->height)
             return AVERROR_INVALIDDATA;
         for (i = avctx->height - 1; i >= 0; i--) {
-            memcpy(s->frame->data[0] + i * s->frame->linesize[0], buf, avctx->width * 3);
+            memcpy(s->frame->data[0] + i * s->frame->linesize[0], buf, avctx->width * psize);
             buf += stride;
+            buf_size -= stride;
         }
         break;
     case 1:
@@ -91,6 +123,14 @@ static int aasc_decode_frame(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Unknown compression type %d\n", compr);
         return AVERROR_INVALIDDATA;
     }
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unknown FourCC: %X\n", avctx->codec_tag);
+        return -1;
+    }
+
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
+        memcpy(s->frame->data[1], s->palette, s->palette_size);
 
     *got_frame = 1;
     if ((ret = av_frame_ref(data, s->frame)) < 0)
diff --git a/libavcodec/ac3.c b/libavcodec/ac3.c
index 99e5b50acb..b54315dcb3 100644
--- a/libavcodec/ac3.c
+++ b/libavcodec/ac3.c
@@ -2,20 +2,20 @@
  * Common code between the AC-3 encoder and decoder
  * Copyright (c) 2000 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -131,6 +131,9 @@ int ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
     int band_start, band_end, begin, end1;
     int lowcomp, fastleak, slowleak;
 
+    if (end <= 0)
+        return AVERROR_INVALIDDATA;
+
     /* excitation function */
     band_start = ff_ac3_bin_to_band_tab[start];
     band_end   = ff_ac3_bin_to_band_tab[end-1] + 1;
@@ -200,9 +203,9 @@ int ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
             if (band >= AC3_CRITICAL_BANDS || dba_lengths[seg] > AC3_CRITICAL_BANDS-band)
                 return -1;
             if (dba_values[seg] >= 4) {
-                delta = (dba_values[seg] - 3) << 7;
+                delta = (dba_values[seg] - 3) * 128;
             } else {
-                delta = (dba_values[seg] - 4) << 7;
+                delta = (dba_values[seg] - 4) * 128;
             }
             for (i = 0; i < dba_lengths[seg]; i++) {
                 mask[band++] += delta;
diff --git a/libavcodec/ac3.h b/libavcodec/ac3.h
index f2cb6c39df..3f67e09aea 100644
--- a/libavcodec/ac3.h
+++ b/libavcodec/ac3.h
@@ -2,20 +2,20 @@
  * Common code between the AC-3 encoder and decoder
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,6 +39,8 @@
 #define AC3_CRITICAL_BANDS 50
 #define AC3_MAX_CPL_BANDS  18
 
+#include "libavutil/opt.h"
+#include "avcodec.h"
 #include "ac3tab.h"
 
 /* exponent encoding strategy */
@@ -49,6 +51,54 @@
 #define EXP_D25   2
 #define EXP_D45   3
 
+#ifndef USE_FIXED
+#define USE_FIXED 0
+#endif
+
+#if USE_FIXED
+
+#define FFT_FLOAT 0
+
+#define FIXR(a)                 ((int)((a) * 0 + 0.5))
+#define FIXR12(a)               ((int)((a) * 4096 + 0.5))
+#define FIXR15(a)               ((int)((a) * 32768 + 0.5))
+#define ROUND15(x)              ((x) + 16384) >> 15
+
+#define AC3_RENAME(x)           x ## _fixed
+#define AC3_NORM(norm)          (1<<24)/(norm)
+#define AC3_MUL(a,b)            ((((int64_t) (a)) * (b))>>12)
+#define AC3_RANGE(x)            ((x)|(((x)&128)<<1))
+#define AC3_HEAVY_RANGE(x)      ((x)<<1)
+#define AC3_DYNAMIC_RANGE(x)    (x)
+#define AC3_SPX_BLEND(x)        (x)
+#define AC3_DYNAMIC_RANGE1      0
+
+#define INTFLOAT                int
+#define SHORTFLOAT              int16_t
+
+#else /* USE_FIXED */
+
+#define FIXR(x)                 ((float)(x))
+#define FIXR12(x)               ((float)(x))
+#define FIXR15(x)               ((float)(x))
+#define ROUND15(x)              (x)
+
+#define AC3_RENAME(x)           x
+#define AC3_NORM(norm)          (1.0f/(norm))
+#define AC3_MUL(a,b)            ((a) * (b))
+#define AC3_RANGE(x)            (dynamic_range_tab[(x)])
+#define AC3_HEAVY_RANGE(x)      (heavy_dynamic_range_tab[(x)])
+#define AC3_DYNAMIC_RANGE(x)    (powf(x,  s->drc_scale))
+#define AC3_SPX_BLEND(x)        (x)* (1.0f/32)
+#define AC3_DYNAMIC_RANGE1      1.0f
+
+#define INTFLOAT                float
+#define SHORTFLOAT              float
+
+#endif /* USE_FIXED */
+
+#define AC3_LEVEL(x)            ROUND15((x) * FIXR15(0.7071067811865476))
+
 /* pre-defined gain values */
 #define LEVEL_PLUS_3DB          1.4142135623730950
 #define LEVEL_PLUS_1POINT5DB    1.1892071150027209
diff --git a/libavcodec/ac3_parser.c b/libavcodec/ac3_parser.c
index 970484810b..5ab5627a46 100644
--- a/libavcodec/ac3_parser.c
+++ b/libavcodec/ac3_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,9 +47,16 @@ static const uint8_t center_levels[4] = { 4, 5, 6, 5 };
 static const uint8_t surround_levels[4] = { 4, 6, 7, 6 };
 
 
-int avpriv_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr)
+int avpriv_ac3_parse_header2(GetBitContext *gbc, AC3HeaderInfo **phdr)
 {
     int frame_size_code;
+    AC3HeaderInfo *hdr;
+
+    if (!*phdr)
+        *phdr = av_mallocz(sizeof(AC3HeaderInfo));
+    if (!*phdr)
+        return AVERROR(ENOMEM);
+    hdr = *phdr;
 
     memset(hdr, 0, sizeof(*hdr));
 
@@ -133,8 +140,8 @@ int avpriv_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr)
         hdr->channel_mode = get_bits(gbc, 3);
         hdr->lfe_on = get_bits1(gbc);
 
-        hdr->bit_rate = (uint32_t)(8.0 * hdr->frame_size * hdr->sample_rate /
-                        (hdr->num_blocks * 256.0));
+        hdr->bit_rate = 8LL * hdr->frame_size * hdr->sample_rate /
+                        (hdr->num_blocks * 256);
         hdr->channels = ff_ac3_channels_tab[hdr->channel_mode] + hdr->lfe_on;
     }
     hdr->channel_layout = avpriv_ac3_channel_layout_tab[hdr->channel_mode];
@@ -144,6 +151,15 @@ int avpriv_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr)
     return 0;
 }
 
+int avpriv_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr)
+{
+    AC3HeaderInfo tmp, *ptmp = &tmp;
+    int ret = avpriv_ac3_parse_header2(gbc, &ptmp);
+
+    memcpy(hdr, ptmp, ((intptr_t)&tmp.channel_layout) - ((intptr_t)&tmp) + sizeof(uint64_t));
+    return ret;
+}
+
 static int ac3_sync(uint64_t state, AACAC3ParseContext *hdr_info,
         int *need_next_header, int *new_frame_start)
 {
@@ -152,11 +168,11 @@ static int ac3_sync(uint64_t state, AACAC3ParseContext *hdr_info,
         uint64_t u64;
         uint8_t  u8[8 + AV_INPUT_BUFFER_PADDING_SIZE];
     } tmp = { av_be2ne64(state) };
-    AC3HeaderInfo hdr;
+    AC3HeaderInfo hdr, *phdr = &hdr;
     GetBitContext gbc;
 
     init_get_bits(&gbc, tmp.u8+8-AC3_HEADER_SIZE, 54);
-    err = avpriv_ac3_parse_header(&gbc, &hdr);
+    err = avpriv_ac3_parse_header2(&gbc, &phdr);
 
     if(err < 0)
         return 0;
diff --git a/libavcodec/ac3_parser.h b/libavcodec/ac3_parser.h
index 9322550ea5..f37387d76c 100644
--- a/libavcodec/ac3_parser.h
+++ b/libavcodec/ac3_parser.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,11 +31,14 @@
  * Parse the header up to the lfeon element, which is the first 52 or 54 bits
  * depending on the audio coding mode.
  * @param[in]  gbc BitContext containing the first 54 bits of the frame.
- * @param[out] hdr Pointer to struct where header info is written.
+ * @param[out] hdr Pointer to Pointer to struct where header info is written.
+ *                 will be allocated if NULL
  * @return Returns 0 on success, -1 if there is a sync word mismatch,
  * -2 if the bsid (version) element is invalid, -3 if the fscod (sample rate)
  * element is invalid, or -4 if the frmsizecod (bit rate) element is invalid.
  */
+int avpriv_ac3_parse_header2(GetBitContext *gbc, AC3HeaderInfo **hdr);
+
 int avpriv_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr);
 
 #endif /* AVCODEC_AC3_PARSER_H */
diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c
index 97ce2878a8..6df697e855 100644
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@@ -7,20 +7,20 @@
  * Copyright (c) 2007-2008 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  * Copyright (c) 2007 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,6 +65,7 @@ static const uint8_t quantization_tab[16] = {
 
 /** dynamic range table. converts codes to scale factors. */
 static float dynamic_range_tab[256];
+static float heavy_dynamic_range_tab[256];
 
 /** Adjustments in dB gain */
 static const float gain_levels[9] = {
@@ -111,7 +112,7 @@ static const uint8_t ac3_default_coeffs[8][5][2] = {
 static inline int
 symmetric_dequant(int code, int levels)
 {
-    return ((code - (levels >> 1)) << 24) / levels;
+    return ((code - (levels >> 1)) * (1 << 24)) / levels;
 }
 
 /*
@@ -164,6 +165,14 @@ static av_cold void ac3_tables_init(void)
         int v = (i >> 5) - ((i >> 7) << 3) - 5;
         dynamic_range_tab[i] = powf(2.0f, v) * ((i & 0x1F) | 0x20);
     }
+
+    /* generate compr dynamic range table
+       reference: Section 7.7.2 Heavy Compression */
+    for (i = 0; i < 256; i++) {
+        int v = (i >> 4) - ((i >> 7) << 4) - 4;
+        heavy_dynamic_range_tab[i] = powf(2.0f, v) * ((i & 0xF) | 0x10);
+    }
+
 }
 
 /**
@@ -180,14 +189,23 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
     ac3_tables_init();
     ff_mdct_init(&s->imdct_256, 8, 1, 1.0);
     ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
-    ff_kbd_window_init(s->window, 5.0, 256);
+    AC3_RENAME(ff_kbd_window_init)(s->window, 5.0, 256);
     ff_bswapdsp_init(&s->bdsp);
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
-    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+
+#if (USE_FIXED)
+    s->fdsp = avpriv_alloc_fixed_dsp(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+#else
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
     ff_fmt_convert_init(&s->fmt_conv, avctx);
+#endif
+
+    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
     av_lfg_init(&s->dith_state, 0);
 
-    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+    if (USE_FIXED)
+        avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
+    else
+        avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
 
     /* allow downmixing to stereo or mono */
     if (avctx->channels > 1 &&
@@ -219,9 +237,19 @@ static int ac3_parse_header(AC3DecodeContext *s)
     /* read the rest of the bsi. read twice for dual mono mode. */
     i = !s->channel_mode;
     do {
-        skip_bits(gbc, 5); // skip dialog normalization
-        if (get_bits1(gbc))
-            skip_bits(gbc, 8); //skip compression
+        s->dialog_normalization[(!s->channel_mode)-i] = -get_bits(gbc, 5);
+        if (s->dialog_normalization[(!s->channel_mode)-i] == 0) {
+            s->dialog_normalization[(!s->channel_mode)-i] = -31;
+        }
+        if (s->target_level != 0) {
+            s->level_gain[(!s->channel_mode)-i] = powf(2.0f,
+                (float)(s->target_level -
+                s->dialog_normalization[(!s->channel_mode)-i])/6.0f);
+        }
+        if (s->compression_exists[(!s->channel_mode)-i] = get_bits1(gbc)) {
+            s->heavy_dynamic_range[(!s->channel_mode)-i] =
+                AC3_HEAVY_RANGE(get_bits(gbc, 8));
+        }
         if (get_bits1(gbc))
             skip_bits(gbc, 8); //skip language code
         if (get_bits1(gbc))
@@ -267,10 +295,10 @@ static int ac3_parse_header(AC3DecodeContext *s)
  */
 static int parse_frame_header(AC3DecodeContext *s)
 {
-    AC3HeaderInfo hdr;
+    AC3HeaderInfo hdr, *phdr=&hdr;
     int err;
 
-    err = avpriv_ac3_parse_header(&s->gbc, &hdr);
+    err = avpriv_ac3_parse_header2(&s->gbc, &phdr);
     if (err)
         return err;
 
@@ -338,40 +366,45 @@ static void set_downmix_coeffs(AC3DecodeContext *s)
     float cmix = gain_levels[s->  center_mix_level];
     float smix = gain_levels[s->surround_mix_level];
     float norm0, norm1;
+    float downmix_coeffs[AC3_MAX_CHANNELS][2];
 
     for (i = 0; i < s->fbw_channels; i++) {
-        s->downmix_coeffs[i][0] = gain_levels[ac3_default_coeffs[s->channel_mode][i][0]];
-        s->downmix_coeffs[i][1] = gain_levels[ac3_default_coeffs[s->channel_mode][i][1]];
+        downmix_coeffs[i][0] = gain_levels[ac3_default_coeffs[s->channel_mode][i][0]];
+        downmix_coeffs[i][1] = gain_levels[ac3_default_coeffs[s->channel_mode][i][1]];
     }
     if (s->channel_mode > 1 && s->channel_mode & 1) {
-        s->downmix_coeffs[1][0] = s->downmix_coeffs[1][1] = cmix;
+        downmix_coeffs[1][0] = downmix_coeffs[1][1] = cmix;
     }
     if (s->channel_mode == AC3_CHMODE_2F1R || s->channel_mode == AC3_CHMODE_3F1R) {
         int nf = s->channel_mode - 2;
-        s->downmix_coeffs[nf][0] = s->downmix_coeffs[nf][1] = smix * LEVEL_MINUS_3DB;
+        downmix_coeffs[nf][0] = downmix_coeffs[nf][1] = smix * LEVEL_MINUS_3DB;
     }
     if (s->channel_mode == AC3_CHMODE_2F2R || s->channel_mode == AC3_CHMODE_3F2R) {
         int nf = s->channel_mode - 4;
-        s->downmix_coeffs[nf][0] = s->downmix_coeffs[nf+1][1] = smix;
+        downmix_coeffs[nf][0] = downmix_coeffs[nf+1][1] = smix;
     }
 
     /* renormalize */
     norm0 = norm1 = 0.0;
     for (i = 0; i < s->fbw_channels; i++) {
-        norm0 += s->downmix_coeffs[i][0];
-        norm1 += s->downmix_coeffs[i][1];
+        norm0 += downmix_coeffs[i][0];
+        norm1 += downmix_coeffs[i][1];
     }
     norm0 = 1.0f / norm0;
     norm1 = 1.0f / norm1;
     for (i = 0; i < s->fbw_channels; i++) {
-        s->downmix_coeffs[i][0] *= norm0;
-        s->downmix_coeffs[i][1] *= norm1;
+        downmix_coeffs[i][0] *= norm0;
+        downmix_coeffs[i][1] *= norm1;
     }
 
     if (s->output_mode == AC3_CHMODE_MONO) {
         for (i = 0; i < s->fbw_channels; i++)
-            s->downmix_coeffs[i][0] = (s->downmix_coeffs[i][0] +
-                                       s->downmix_coeffs[i][1]) * LEVEL_MINUS_3DB;
+            downmix_coeffs[i][0] = (downmix_coeffs[i][0] +
+                                    downmix_coeffs[i][1]) * LEVEL_MINUS_3DB;
+    }
+    for (i = 0; i < s->fbw_channels; i++) {
+        s->downmix_coeffs[i][0] = FIXR12(downmix_coeffs[i][0]);
+        s->downmix_coeffs[i][1] = FIXR12(downmix_coeffs[i][1]);
     }
 }
 
@@ -429,7 +462,7 @@ static void calc_transform_coeffs_cpl(AC3DecodeContext *s)
                 int cpl_coord = s->cpl_coords[ch][band] << 5;
                 for (bin = band_start; bin < band_end; bin++) {
                     s->fixed_coeffs[ch][bin] =
-                        MULH(s->fixed_coeffs[CPL_CH][bin] << 4, cpl_coord);
+                        MULH(s->fixed_coeffs[CPL_CH][bin] * (1 << 4), cpl_coord);
                 }
                 if (ch == 2 && s->phase_flags[band]) {
                     for (bin = band_start; bin < band_end; bin++)
@@ -475,7 +508,7 @@ static void ac3_decode_transform_coeffs_ch(AC3DecodeContext *s, int ch_index, ma
         case 0:
             /* random noise with approximate range of -0.707 to 0.707 */
             if (dither)
-                mantissa = (av_lfg_get(&s->dith_state) / 362) - 5932275;
+                mantissa = (((av_lfg_get(&s->dith_state)>>8)*181)>>8) - 5931008;
             else
                 mantissa = 0;
             break;
@@ -522,8 +555,11 @@ static void ac3_decode_transform_coeffs_ch(AC3DecodeContext *s, int ch_index, ma
             break;
         default: /* 6 to 15 */
             /* Shift mantissa and sign-extend it. */
-            mantissa = get_sbits(gbc, quantization_tab[bap]);
-            mantissa <<= 24 - quantization_tab[bap];
+            if (bap > 15) {
+                av_log(s->avctx, AV_LOG_ERROR, "bap %d is invalid in plain AC-3\n", bap);
+                bap = 15;
+            }
+            mantissa = (unsigned)get_sbits(gbc, quantization_tab[bap]) << (24 - quantization_tab[bap]);
             break;
         }
         coeffs[freq] = mantissa >> exps[freq];
@@ -557,7 +593,7 @@ static void decode_transform_coeffs_ch(AC3DecodeContext *s, int blk, int ch,
         /* if AHT is used, mantissas for all blocks are encoded in the first
            block of the frame. */
         int bin;
-        if (!blk && CONFIG_EAC3_DECODER)
+        if (CONFIG_EAC3_DECODER && !blk)
             ff_eac3_decode_transform_coeffs_aht_ch(s, ch);
         for (bin = s->start_freq[ch]; bin < s->end_freq[ch]; bin++) {
             s->fixed_coeffs[ch][bin] = s->pre_mantissa[ch][bin][blk] >> s->dexps[ch][bin];
@@ -635,20 +671,30 @@ static inline void do_imdct(AC3DecodeContext *s, int channels)
     for (ch = 1; ch <= channels; ch++) {
         if (s->block_switch[ch]) {
             int i;
-            float *x = s->tmp_output + 128;
+            FFTSample *x = s->tmp_output + 128;
             for (i = 0; i < 128; i++)
                 x[i] = s->transform_coeffs[ch][2 * i];
             s->imdct_256.imdct_half(&s->imdct_256, s->tmp_output, x);
-            s->fdsp.vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
+#if USE_FIXED
+            s->fdsp->vector_fmul_window_scaled(s->outptr[ch - 1], s->delay[ch - 1],
+                                       s->tmp_output, s->window, 128, 8);
+#else
+            s->fdsp->vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
                                        s->tmp_output, s->window, 128);
+#endif
             for (i = 0; i < 128; i++)
                 x[i] = s->transform_coeffs[ch][2 * i + 1];
             s->imdct_256.imdct_half(&s->imdct_256, s->delay[ch - 1], x);
         } else {
             s->imdct_512.imdct_half(&s->imdct_512, s->tmp_output, s->transform_coeffs[ch]);
-            s->fdsp.vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
+#if USE_FIXED
+            s->fdsp->vector_fmul_window_scaled(s->outptr[ch - 1], s->delay[ch - 1],
+                                       s->tmp_output, s->window, 128, 8);
+#else
+            s->fdsp->vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
                                        s->tmp_output, s->window, 128);
-            memcpy(s->delay[ch - 1], s->tmp_output + 128, 128 * sizeof(float));
+#endif
+            memcpy(s->delay[ch - 1], s->tmp_output + 128, 128 * sizeof(FFTSample));
         }
     }
 }
@@ -783,13 +829,14 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
         if (get_bits1(gbc)) {
             /* Allow asymmetric application of DRC when drc_scale > 1.
                Amplification of quiet sounds is enhanced */
-            float range = dynamic_range_tab[get_bits(gbc, 8)];
-            if (range > 1.0 || s->drc_scale <= 1.0)
-                s->dynamic_range[i] = powf(range, s->drc_scale);
+            int range_bits = get_bits(gbc, 8);
+            INTFLOAT range = AC3_RANGE(range_bits);
+            if (range_bits <= 127 || s->drc_scale <= 1.0)
+                s->dynamic_range[i] = AC3_DYNAMIC_RANGE(range);
             else
                 s->dynamic_range[i] = range;
         } else if (blk == 0) {
-            s->dynamic_range[i] = 1.0f;
+            s->dynamic_range[i] = AC3_DYNAMIC_RANGE1;
         }
     } while (i--);
 
@@ -815,6 +862,9 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
             if (start_subband > 7)
                 start_subband += start_subband - 7;
             end_subband    = get_bits(gbc, 3) + 5;
+#if USE_FIXED
+            s->spx_dst_end_freq = end_freq_inv_tab[end_subband-5];
+#endif
             if (end_subband   > 7)
                 end_subband   += end_subband   - 7;
             dst_start_freq = dst_start_freq * 12 + 25;
@@ -835,7 +885,8 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
 
             s->spx_dst_start_freq = dst_start_freq;
             s->spx_src_start_freq = src_start_freq;
-            s->spx_dst_end_freq   = dst_end_freq;
+            if (!USE_FIXED)
+                s->spx_dst_end_freq   = dst_end_freq;
 
             decode_band_structure(gbc, blk, s->eac3, 0,
                                   start_subband, end_subband,
@@ -855,26 +906,47 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
         for (ch = 1; ch <= fbw_channels; ch++) {
             if (s->channel_uses_spx[ch]) {
                 if (s->first_spx_coords[ch] || get_bits1(gbc)) {
-                    float spx_blend;
+                    INTFLOAT spx_blend;
                     int bin, master_spx_coord;
 
                     s->first_spx_coords[ch] = 0;
-                    spx_blend = get_bits(gbc, 5) * (1.0f/32);
+                    spx_blend = AC3_SPX_BLEND(get_bits(gbc, 5));
                     master_spx_coord = get_bits(gbc, 2) * 3;
 
                     bin = s->spx_src_start_freq;
                     for (bnd = 0; bnd < s->num_spx_bands; bnd++) {
-                        int bandsize;
+                        int bandsize = s->spx_band_sizes[bnd];
                         int spx_coord_exp, spx_coord_mant;
-                        float nratio, sblend, nblend, spx_coord;
+                        INTFLOAT nratio, sblend, nblend;
+#if USE_FIXED
+                        /* calculate blending factors */
+                        int64_t accu = ((bin << 23) + (bandsize << 22))
+                                     * (int64_t)s->spx_dst_end_freq;
+                        nratio = (int)(accu >> 32);
+                        nratio -= spx_blend << 18;
+
+                        if (nratio < 0) {
+                            nblend = 0;
+                            sblend = 0x800000;
+                        } else if (nratio > 0x7fffff) {
+                            nblend = 14529495; // sqrt(3) in FP.23
+                            sblend = 0;
+                        } else {
+                            nblend = fixed_sqrt(nratio, 23);
+                            accu = (int64_t)nblend * 1859775393;
+                            nblend = (int)((accu + (1<<29)) >> 30);
+                            sblend = fixed_sqrt(0x800000 - nratio, 23);
+                        }
+#else
+                        float spx_coord;
 
                         /* calculate blending factors */
-                        bandsize = s->spx_band_sizes[bnd];
                         nratio = ((float)((bin + (bandsize >> 1))) / s->spx_dst_end_freq) - spx_blend;
                         nratio = av_clipf(nratio, 0.0f, 1.0f);
                         nblend = sqrtf(3.0f * nratio); // noise is scaled by sqrt(3)
                                                        // to give unity variance
                         sblend = sqrtf(1.0f - nratio);
+#endif
                         bin += bandsize;
 
                         /* decode spx coordinates */
@@ -883,11 +955,18 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
                         if (spx_coord_exp == 15) spx_coord_mant <<= 1;
                         else                     spx_coord_mant += 4;
                         spx_coord_mant <<= (25 - spx_coord_exp - master_spx_coord);
-                        spx_coord = spx_coord_mant * (1.0f / (1 << 23));
 
                         /* multiply noise and signal blending factors by spx coordinate */
+#if USE_FIXED
+                        accu = (int64_t)nblend * spx_coord_mant;
+                        s->spx_noise_blend[ch][bnd]  = (int)((accu + (1<<22)) >> 23);
+                        accu = (int64_t)sblend * spx_coord_mant;
+                        s->spx_signal_blend[ch][bnd] = (int)((accu + (1<<22)) >> 23);
+#else
+                        spx_coord = spx_coord_mant * (1.0f / (1 << 23));
                         s->spx_noise_blend [ch][bnd] = nblend * spx_coord;
                         s->spx_signal_blend[ch][bnd] = sblend * spx_coord;
+#endif
                     }
                 }
             } else {
@@ -1244,18 +1323,28 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
 
     /* apply scaling to coefficients (headroom, dynrng) */
     for (ch = 1; ch <= s->channels; ch++) {
-        float gain = 1.0 / 4194304.0f;
-        if (s->channel_mode == AC3_CHMODE_DUALMONO) {
-            gain *= s->dynamic_range[2 - ch];
-        } else {
-            gain *= s->dynamic_range[0];
-        }
+        int audio_channel = 0;
+        INTFLOAT gain;
+        if (s->channel_mode == AC3_CHMODE_DUALMONO)
+            audio_channel = 2-ch;
+        if (s->heavy_compression && s->compression_exists[audio_channel])
+            gain = s->heavy_dynamic_range[audio_channel];
+        else
+            gain = s->dynamic_range[audio_channel];
+
+#if USE_FIXED
+        scale_coefs(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
+#else
+        if (s->target_level != 0)
+          gain = gain * s->level_gain[audio_channel];
+        gain *= 1.0 / 4194304.0f;
         s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch],
                                                s->fixed_coeffs[ch], gain, 256);
+#endif
     }
 
     /* apply spectral extension to high frequency bins */
-    if (s->spx_in_use && CONFIG_EAC3_DECODER) {
+    if (CONFIG_EAC3_DECODER && s->spx_in_use) {
         ff_eac3_apply_spectral_extension(s);
     }
 
@@ -1276,19 +1365,24 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
         do_imdct(s, s->channels);
 
         if (downmix_output) {
+#if USE_FIXED
+            ac3_downmix_c_fixed16(s->outptr, s->downmix_coeffs,
+                              s->out_channels, s->fbw_channels, 256);
+#else
             s->ac3dsp.downmix(s->outptr, s->downmix_coeffs,
                               s->out_channels, s->fbw_channels, 256);
+#endif
         }
     } else {
         if (downmix_output) {
-            s->ac3dsp.downmix(s->xcfptr + 1, s->downmix_coeffs,
-                              s->out_channels, s->fbw_channels, 256);
+            s->ac3dsp.AC3_RENAME(downmix)(s->xcfptr + 1, s->downmix_coeffs,
+                                          s->out_channels, s->fbw_channels, 256);
         }
 
         if (downmix_output && !s->downmixed) {
             s->downmixed = 1;
-            s->ac3dsp.downmix(s->dlyptr, s->downmix_coeffs, s->out_channels,
-                              s->fbw_channels, 128);
+            s->ac3dsp.AC3_RENAME(downmix)(s->dlyptr, s->downmix_coeffs,
+                                          s->out_channels, s->fbw_channels, 128);
         }
 
         do_imdct(s, s->out_channels);
@@ -1309,7 +1403,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
     AC3DecodeContext *s = avctx->priv_data;
     int blk, ch, err, ret;
     const uint8_t *channel_map;
-    const float *output[AC3_MAX_CHANNELS];
+    const SHORTFLOAT *output[AC3_MAX_CHANNELS];
     enum AVMatrixEncoding matrix_encoding;
     AVDownmixInfo *downmix_info;
 
@@ -1324,7 +1418,8 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
         memcpy(s->input_buffer, buf, FFMIN(buf_size, AC3_FRAME_BUFFER_SIZE));
     buf = s->input_buffer;
     /* initialize the GetBitContext with the start of valid AC-3 Frame */
-    init_get_bits(&s->gbc, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&s->gbc, buf, buf_size)) < 0)
+        return ret;
 
     /* parse the syncinfo */
     err = parse_frame_header(s);
@@ -1367,7 +1462,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
         if (s->frame_size > buf_size) {
             av_log(avctx, AV_LOG_ERROR, "incomplete frame\n");
             err = AAC_AC3_PARSE_ERROR_FRAME_SIZE;
-        } else if (avctx->err_recognition & AV_EF_CRCCHECK) {
+        } else if (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_CAREFUL)) {
             /* check for crc mismatch */
             if (av_crc(av_crc_get_table(AV_CRC_16_ANSI), 0, &buf[2],
                        s->frame_size - 2)) {
@@ -1401,6 +1496,10 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
             s->output_mode  = AC3_CHMODE_STEREO;
         }
 
+        s->loro_center_mix_level   = gain_levels[s->  center_mix_level];
+        s->loro_surround_mix_level = gain_levels[s->surround_mix_level];
+        s->ltrt_center_mix_level   = LEVEL_MINUS_3DB;
+        s->ltrt_surround_mix_level = LEVEL_MINUS_3DB;
         /* set downmixing coefficients if needed */
         if (s->channels != s->out_channels && !((s->output_mode & AC3_OUTPUT_LFEON) &&
                 s->fbw_channels == s->out_channels)) {
@@ -1422,19 +1521,18 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = s->num_blocks * AC3_BLOCK_SIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     /* decode the audio blocks */
     channel_map = ff_ac3_dec_channel_map[s->output_mode & ~AC3_OUTPUT_LFEON][s->lfe_on];
+    for (ch = 0; ch < AC3_MAX_CHANNELS; ch++) {
+        output[ch] = s->output[ch];
+        s->outptr[ch] = s->output[ch];
+    }
     for (ch = 0; ch < s->channels; ch++) {
         if (ch < s->out_channels)
-            s->outptr[channel_map[ch]] = (float *)frame->data[ch];
-        else
-            s->outptr[ch] = s->output[ch];
-        output[ch] = s->output[ch];
+            s->outptr[channel_map[ch]] = (SHORTFLOAT *)frame->data[ch];
     }
     for (blk = 0; blk < s->num_blocks; blk++) {
         if (!err && decode_audio_block(s, blk)) {
@@ -1443,16 +1541,20 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
         }
         if (err)
             for (ch = 0; ch < s->out_channels; ch++)
-                memcpy(s->outptr[channel_map[ch]], output[ch], sizeof(**output) * AC3_BLOCK_SIZE);
+                memcpy(((SHORTFLOAT*)frame->data[ch]) + AC3_BLOCK_SIZE*blk, output[ch], AC3_BLOCK_SIZE*sizeof(SHORTFLOAT));
         for (ch = 0; ch < s->out_channels; ch++)
             output[ch] = s->outptr[channel_map[ch]];
-        for (ch = 0; ch < s->out_channels; ch++)
-            s->outptr[ch] += AC3_BLOCK_SIZE;
+        for (ch = 0; ch < s->out_channels; ch++) {
+            if (!ch || channel_map[ch])
+                s->outptr[channel_map[ch]] += AC3_BLOCK_SIZE;
+        }
     }
 
+    av_frame_set_decode_error_flags(frame, err ? FF_DECODE_ERROR_INVALID_BITSTREAM : 0);
+
     /* keep last block for error concealment in next frame */
     for (ch = 0; ch < s->out_channels; ch++)
-        memcpy(s->output[ch], output[ch], sizeof(**output) * AC3_BLOCK_SIZE);
+        memcpy(s->output[ch], output[ch], AC3_BLOCK_SIZE*sizeof(SHORTFLOAT));
 
     /*
      * AVMatrixEncoding
@@ -1523,59 +1625,10 @@ static av_cold int ac3_decode_end(AVCodecContext *avctx)
     AC3DecodeContext *s = avctx->priv_data;
     ff_mdct_end(&s->imdct_512);
     ff_mdct_end(&s->imdct_256);
+    av_freep(&s->fdsp);
 
     return 0;
 }
 
 #define OFFSET(x) offsetof(AC3DecodeContext, x)
 #define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM)
-static const AVOption options[] = {
-    { "drc_scale", "percentage of dynamic range compression to apply", OFFSET(drc_scale), AV_OPT_TYPE_FLOAT, {.dbl = 1.0}, 0.0, 6.0, PAR },
-    { NULL},
-};
-
-static const AVClass ac3_decoder_class = {
-    .class_name = "AC3 decoder",
-    .item_name  = av_default_item_name,
-    .option     = options,
-    .version    = LIBAVUTIL_VERSION_INT,
-};
-
-AVCodec ff_ac3_decoder = {
-    .name           = "ac3",
-    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52A (AC-3)"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_AC3,
-    .priv_data_size = sizeof (AC3DecodeContext),
-    .init           = ac3_decode_init,
-    .close          = ac3_decode_end,
-    .decode         = ac3_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
-    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
-                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &ac3_decoder_class,
-};
-
-#if CONFIG_EAC3_DECODER
-static const AVClass eac3_decoder_class = {
-    .class_name = "E-AC3 decoder",
-    .item_name  = av_default_item_name,
-    .option     = options,
-    .version    = LIBAVUTIL_VERSION_INT,
-};
-
-AVCodec ff_eac3_decoder = {
-    .name           = "eac3",
-    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52B (AC-3, E-AC-3)"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_EAC3,
-    .priv_data_size = sizeof (AC3DecodeContext),
-    .init           = ac3_decode_init,
-    .close          = ac3_decode_end,
-    .decode         = ac3_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
-    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
-                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &eac3_decoder_class,
-};
-#endif
diff --git a/libavcodec/ac3dec.h b/libavcodec/ac3dec.h
index e4d443c9a8..b3498fec96 100644
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@@ -2,20 +2,20 @@
  * Common code between the AC-3 and E-AC-3 decoders
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -51,6 +51,7 @@
 #define AVCODEC_AC3DEC_H
 
 #include "libavutil/float_dsp.h"
+#include "libavutil/fixed_dsp.h"
 #include "libavutil/lfg.h"
 #include "ac3.h"
 #include "ac3dsp.h"
@@ -83,6 +84,9 @@ typedef struct AC3DecodeContext {
     int bitstream_mode;                     ///< bitstream mode                         (bsmod)
     int channel_mode;                       ///< channel mode                           (acmod)
     int lfe_on;                             ///< lfe channel in use
+    int dialog_normalization[2];            ///< dialog level in dBFS                   (dialnorm)
+    int compression_exists[2];              ///< compression field is valid for frame   (compre)
+    int compression_gain[2];                ///< gain to apply for heavy compression    (compr)
     int channel_map;                        ///< custom channel map
     int preferred_downmix;                  ///< Preferred 2-channel downmix mode       (dmixmod)
     int center_mix_level;                   ///< Center mix level index
@@ -97,6 +101,14 @@ typedef struct AC3DecodeContext {
     int dolby_headphone_mode;               ///< dolby headphone mode                   (dheadphonmod)
 ///@}
 
+    int preferred_stereo_downmix;
+    float ltrt_center_mix_level;
+    float ltrt_surround_mix_level;
+    float loro_center_mix_level;
+    float loro_surround_mix_level;
+    int target_level;                       ///< target level in dBFS
+    float level_gain[2];
+
 ///@name Frame syntax parameters
     int snr_offset_strategy;                ///< SNR offset strategy                    (snroffststr)
     int block_switch_syntax;                ///< block switch syntax enabled            (blkswe)
@@ -132,8 +144,8 @@ typedef struct AC3DecodeContext {
     int num_spx_bands;                          ///< number of spx bands                    (nspxbnds)
     uint8_t spx_band_sizes[SPX_MAX_BANDS];      ///< number of bins in each spx band
     uint8_t first_spx_coords[AC3_MAX_CHANNELS]; ///< first spx coordinates states           (firstspxcos)
-    float spx_noise_blend[AC3_MAX_CHANNELS][SPX_MAX_BANDS]; ///< spx noise blending factor  (nblendfact)
-    float spx_signal_blend[AC3_MAX_CHANNELS][SPX_MAX_BANDS];///< spx signal blending factor (sblendfact)
+    INTFLOAT spx_noise_blend[AC3_MAX_CHANNELS][SPX_MAX_BANDS]; ///< spx noise blending factor  (nblendfact)
+    INTFLOAT spx_signal_blend[AC3_MAX_CHANNELS][SPX_MAX_BANDS];///< spx signal blending factor (sblendfact)
 ///@}
 
 ///@name Adaptive hybrid transform
@@ -145,15 +157,17 @@ typedef struct AC3DecodeContext {
     int fbw_channels;                           ///< number of full-bandwidth channels
     int channels;                               ///< number of total channels
     int lfe_ch;                                 ///< index of LFE channel
-    float downmix_coeffs[AC3_MAX_CHANNELS][2];  ///< stereo downmix coefficients
+    SHORTFLOAT downmix_coeffs[AC3_MAX_CHANNELS][2];  ///< stereo downmix coefficients
     int downmixed;                              ///< indicates if coeffs are currently downmixed
     int output_mode;                            ///< output channel configuration
     int out_channels;                           ///< number of output channels
 ///@}
 
 ///@name Dynamic range
-    float dynamic_range[2];                 ///< dynamic range
-    float drc_scale;                        ///< percentage of dynamic range compression to be applied
+    INTFLOAT dynamic_range[2];                 ///< dynamic range
+    INTFLOAT drc_scale;                        ///< percentage of dynamic range compression to be applied
+    int heavy_compression;                     ///< apply heavy compression
+    INTFLOAT heavy_dynamic_range[2];           ///< heavy dynamic range compression
 ///@}
 
 ///@name Bandwidth
@@ -201,22 +215,26 @@ typedef struct AC3DecodeContext {
 
 ///@name Optimization
     BswapDSPContext bdsp;
-    AVFloatDSPContext fdsp;
+#if USE_FIXED
+    AVFixedDSPContext *fdsp;
+#else
+    AVFloatDSPContext *fdsp;
+#endif
     AC3DSPContext ac3dsp;
     FmtConvertContext fmt_conv;             ///< optimized conversion functions
 ///@}
 
-    float *outptr[AC3_MAX_CHANNELS];
-    float *xcfptr[AC3_MAX_CHANNELS];
-    float *dlyptr[AC3_MAX_CHANNELS];
+    SHORTFLOAT *outptr[AC3_MAX_CHANNELS];
+    INTFLOAT *xcfptr[AC3_MAX_CHANNELS];
+    INTFLOAT *dlyptr[AC3_MAX_CHANNELS];
 
 ///@name Aligned arrays
-    DECLARE_ALIGNED(16, int32_t, fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];     ///< fixed-point transform coefficients
-    DECLARE_ALIGNED(32, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
-    DECLARE_ALIGNED(32, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];             ///< delay - added to the next block
-    DECLARE_ALIGNED(32, float, window)[AC3_BLOCK_SIZE];                              ///< window coefficients
-    DECLARE_ALIGNED(32, float, tmp_output)[AC3_BLOCK_SIZE];                          ///< temporary storage for output before windowing
-    DECLARE_ALIGNED(32, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];            ///< output after imdct transform and windowing
+    DECLARE_ALIGNED(16, int,   fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];       ///< fixed-point transform coefficients
+    DECLARE_ALIGNED(32, INTFLOAT, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
+    DECLARE_ALIGNED(32, INTFLOAT, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];             ///< delay - added to the next block
+    DECLARE_ALIGNED(32, INTFLOAT, window)[AC3_BLOCK_SIZE];                              ///< window coefficients
+    DECLARE_ALIGNED(32, INTFLOAT, tmp_output)[AC3_BLOCK_SIZE];                          ///< temporary storage for output before windowing
+    DECLARE_ALIGNED(32, SHORTFLOAT, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];            ///< output after imdct transform and windowing
     DECLARE_ALIGNED(32, uint8_t, input_buffer)[AC3_FRAME_BUFFER_SIZE + AV_INPUT_BUFFER_PADDING_SIZE]; ///< temp buffer to prevent overread
 ///@}
 } AC3DecodeContext;
@@ -225,19 +243,19 @@ typedef struct AC3DecodeContext {
  * Parse the E-AC-3 frame header.
  * This parses both the bit stream info and audio frame header.
  */
-int ff_eac3_parse_header(AC3DecodeContext *s);
+static int ff_eac3_parse_header(AC3DecodeContext *s);
 
 /**
  * Decode mantissas in a single channel for the entire frame.
  * This is used when AHT mode is enabled.
  */
-void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch);
+static void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch);
 
 /**
  * Apply spectral extension to each channel by copying lower frequency
  * coefficients to higher frequency bins and applying side information to
  * approximate the original high frequency signal.
  */
-void ff_eac3_apply_spectral_extension(AC3DecodeContext *s);
+static void ff_eac3_apply_spectral_extension(AC3DecodeContext *s);
 
 #endif /* AVCODEC_AC3DEC_H */
diff --git a/libavcodec/ac3dec_data.c b/libavcodec/ac3dec_data.c
index 272a963f08..d0a9b1ec40 100644
--- a/libavcodec/ac3dec_data.c
+++ b/libavcodec/ac3dec_data.c
@@ -2,20 +2,20 @@
  * AC-3 and E-AC-3 decoder tables
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ac3dec_data.h b/libavcodec/ac3dec_data.h
index c0a584e7b3..975b52ef2c 100644
--- a/libavcodec/ac3dec_data.h
+++ b/libavcodec/ac3dec_data.h
@@ -2,20 +2,20 @@
  * AC-3 and E-AC-3 decoder tables
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ac3dec_fixed.c b/libavcodec/ac3dec_fixed.c
new file mode 100644
index 0000000000..21756eac1b
--- /dev/null
+++ b/libavcodec/ac3dec_fixed.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
+ *
+ * AC3 fixed-point decoder for MIPS platforms
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0
+#define USE_FIXED 1
+#define FFT_FIXED_32 1
+#include "ac3dec.h"
+
+
+static const int end_freq_inv_tab[8] =
+{
+    50529027, 44278013, 39403370, 32292987, 27356480, 23729101, 20951060, 18755316
+};
+
+static void scale_coefs (
+    int32_t *dst,
+    const int32_t *src,
+    int dynrng,
+    int len)
+{
+    int i, shift, round;
+    int16_t mul;
+    int temp, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    mul = (dynrng & 0x1f) + 0x20;
+    shift = 4 - ((dynrng << 23) >> 28);
+    if (shift > 0 ) {
+      round = 1 << (shift-1);
+      for (i=0; i<len; i+=8) {
+
+          temp = src[i] * mul;
+          temp1 = src[i+1] * mul;
+          temp = temp + round;
+          temp2 = src[i+2] * mul;
+
+          temp1 = temp1 + round;
+          dst[i] = temp >> shift;
+          temp3 = src[i+3] * mul;
+          temp2 = temp2 + round;
+
+          dst[i+1] = temp1 >> shift;
+          temp4 = src[i + 4] * mul;
+          temp3 = temp3 + round;
+          dst[i+2] = temp2 >> shift;
+
+          temp5 = src[i+5] * mul;
+          temp4 = temp4 + round;
+          dst[i+3] = temp3 >> shift;
+          temp6 = src[i+6] * mul;
+
+          dst[i+4] = temp4 >> shift;
+          temp5 = temp5 + round;
+          temp7 = src[i+7] * mul;
+          temp6 = temp6 + round;
+
+          dst[i+5] = temp5 >> shift;
+          temp7 = temp7 + round;
+          dst[i+6] = temp6 >> shift;
+          dst[i+7] = temp7 >> shift;
+
+      }
+    } else {
+      shift = -shift;
+      for (i=0; i<len; i+=8) {
+
+          temp = src[i] * mul;
+          temp1 = src[i+1] * mul;
+          temp2 = src[i+2] * mul;
+
+          dst[i] = temp << shift;
+          temp3 = src[i+3] * mul;
+
+          dst[i+1] = temp1 << shift;
+          temp4 = src[i + 4] * mul;
+          dst[i+2] = temp2 << shift;
+
+          temp5 = src[i+5] * mul;
+          dst[i+3] = temp3 << shift;
+          temp6 = src[i+6] * mul;
+
+          dst[i+4] = temp4 << shift;
+          temp7 = src[i+7] * mul;
+
+          dst[i+5] = temp5 << shift;
+          dst[i+6] = temp6 << shift;
+          dst[i+7] = temp7 << shift;
+
+      }
+    }
+}
+
+/**
+ * Downmix samples from original signal to stereo or mono (this is for 16-bit samples
+ * and fixed point decoder - original (for 32-bit samples) is in ac3dsp.c).
+ */
+static void ac3_downmix_c_fixed16(int16_t **samples, int16_t (*matrix)[2],
+                                  int out_ch, int in_ch, int len)
+{
+    int i, j;
+    int v0, v1;
+    if (out_ch == 2) {
+        for (i = 0; i < len; i++) {
+            v0 = v1 = 0;
+            for (j = 0; j < in_ch; j++) {
+                v0 += samples[j][i] * matrix[j][0];
+                v1 += samples[j][i] * matrix[j][1];
+            }
+            samples[0][i] = (v0+2048)>>12;
+            samples[1][i] = (v1+2048)>>12;
+        }
+    } else if (out_ch == 1) {
+        for (i = 0; i < len; i++) {
+            v0 = 0;
+            for (j = 0; j < in_ch; j++)
+                v0 += samples[j][i] * matrix[j][0];
+            samples[0][i] = (v0+2048)>>12;
+        }
+    }
+}
+
+#include "eac3dec.c"
+#include "ac3dec.c"
+
+static const AVOption options[] = {
+    { "drc_scale", "percentage of dynamic range compression to apply", OFFSET(drc_scale), AV_OPT_TYPE_FLOAT, {.dbl = 1.0}, 0.0, 6.0, PAR },
+    { "heavy_compr", "heavy dynamic range compression enabled", OFFSET(heavy_compression), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 1, PAR },
+    { NULL},
+};
+
+static const AVClass ac3_decoder_class = {
+    .class_name = "Fixed-Point AC-3 Decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_ac3_fixed_decoder = {
+    .name           = "ac3_fixed",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_AC3,
+    .priv_data_size = sizeof (AC3DecodeContext),
+    .init           = ac3_decode_init,
+    .close          = ac3_decode_end,
+    .decode         = ac3_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52A (AC-3)"),
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P,
+                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &ac3_decoder_class,
+};
diff --git a/libavcodec/ac3dec_float.c b/libavcodec/ac3dec_float.c
new file mode 100644
index 0000000000..6b46de27fd
--- /dev/null
+++ b/libavcodec/ac3dec_float.c
@@ -0,0 +1,92 @@
+/*
+ * AC-3 Audio Decoder
+ * This code was developed as part of Google Summer of Code 2006.
+ * E-AC-3 support was added as part of Google Summer of Code 2007.
+ *
+ * Copyright (c) 2006 Kartikey Mahendra BHATT (bhattkm at gmail dot com)
+ * Copyright (c) 2007-2008 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
+ * Copyright (c) 2007 Justin Ruggles <justin.ruggles@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * Upmix delay samples from stereo to original channel layout.
+ */
+#include "ac3dec.h"
+#include "eac3dec.c"
+#include "ac3dec.c"
+
+static const AVOption options[] = {
+    { "drc_scale", "percentage of dynamic range compression to apply", OFFSET(drc_scale), AV_OPT_TYPE_FLOAT, {.dbl = 1.0}, 0.0, 6.0, PAR },
+    { "heavy_compr", "heavy dynamic range compression enabled", OFFSET(heavy_compression), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 1, PAR },
+    { "target_level", "target level in -dBFS (0 not applied)", OFFSET(target_level), AV_OPT_TYPE_INT, {.i64 = 0 }, -31, 0, PAR },
+
+{"dmix_mode", "Preferred Stereo Downmix Mode", OFFSET(preferred_stereo_downmix), AV_OPT_TYPE_INT, {.i64 = -1 }, -1, 2, 0, "dmix_mode"},
+{"ltrt_cmixlev",   "Lt/Rt Center Mix Level",   OFFSET(ltrt_center_mix_level),    AV_OPT_TYPE_FLOAT, {.dbl = -1.0 }, -1.0, 2.0, 0},
+{"ltrt_surmixlev", "Lt/Rt Surround Mix Level", OFFSET(ltrt_surround_mix_level),  AV_OPT_TYPE_FLOAT, {.dbl = -1.0 }, -1.0, 2.0, 0},
+{"loro_cmixlev",   "Lo/Ro Center Mix Level",   OFFSET(loro_center_mix_level),    AV_OPT_TYPE_FLOAT, {.dbl = -1.0 }, -1.0, 2.0, 0},
+{"loro_surmixlev", "Lo/Ro Surround Mix Level", OFFSET(loro_surround_mix_level),  AV_OPT_TYPE_FLOAT, {.dbl = -1.0 }, -1.0, 2.0, 0},
+
+    { NULL},
+};
+
+static const AVClass ac3_decoder_class = {
+    .class_name = "AC3 decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_ac3_decoder = {
+    .name           = "ac3",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_AC3,
+    .priv_data_size = sizeof (AC3DecodeContext),
+    .init           = ac3_decode_init,
+    .close          = ac3_decode_end,
+    .decode         = ac3_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52A (AC-3)"),
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &ac3_decoder_class,
+};
+
+#if CONFIG_EAC3_DECODER
+static const AVClass eac3_decoder_class = {
+    .class_name = "E-AC3 decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_eac3_decoder = {
+    .name           = "eac3",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_EAC3,
+    .priv_data_size = sizeof (AC3DecodeContext),
+    .init           = ac3_decode_init,
+    .close          = ac3_decode_end,
+    .decode         = ac3_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52B (AC-3, E-AC-3)"),
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &eac3_decoder_class,
+};
+#endif
diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 933550bfdc..fe87b5bde5 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -2,20 +2,20 @@
  * AC-3 DSP functions
  * Copyright (c) 2011 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -172,6 +172,48 @@ static void ac3_extract_exponents_c(uint8_t *exp, int32_t *coef, int nb_coefs)
     }
 }
 
+static void ac3_sum_square_butterfly_int32_c(int64_t sum[4],
+                                             const int32_t *coef0,
+                                             const int32_t *coef1,
+                                             int len)
+{
+    int i;
+
+    sum[0] = sum[1] = sum[2] = sum[3] = 0;
+
+    for (i = 0; i < len; i++) {
+        int lt = coef0[i];
+        int rt = coef1[i];
+        int md = lt + rt;
+        int sd = lt - rt;
+        MAC64(sum[0], lt, lt);
+        MAC64(sum[1], rt, rt);
+        MAC64(sum[2], md, md);
+        MAC64(sum[3], sd, sd);
+    }
+}
+
+static void ac3_sum_square_butterfly_float_c(float sum[4],
+                                             const float *coef0,
+                                             const float *coef1,
+                                             int len)
+{
+    int i;
+
+    sum[0] = sum[1] = sum[2] = sum[3] = 0;
+
+    for (i = 0; i < len; i++) {
+        float lt = coef0[i];
+        float rt = coef1[i];
+        float md = lt + rt;
+        float sd = lt - rt;
+        sum[0] += lt * lt;
+        sum[1] += rt * rt;
+        sum[2] += md * md;
+        sum[3] += sd * sd;
+    }
+}
+
 static void ac3_downmix_c(float **samples, float (*matrix)[2],
                           int out_ch, int in_ch, int len)
 {
@@ -197,6 +239,31 @@ static void ac3_downmix_c(float **samples, float (*matrix)[2],
     }
 }
 
+static void ac3_downmix_c_fixed(int32_t **samples, int16_t (*matrix)[2],
+                                int out_ch, int in_ch, int len)
+{
+    int i, j;
+    int64_t v0, v1;
+    if (out_ch == 2) {
+        for (i = 0; i < len; i++) {
+            v0 = v1 = 0;
+            for (j = 0; j < in_ch; j++) {
+                v0 += (int64_t)samples[j][i] * matrix[j][0];
+                v1 += (int64_t)samples[j][i] * matrix[j][1];
+            }
+            samples[0][i] = (v0+2048)>>12;
+            samples[1][i] = (v1+2048)>>12;
+        }
+    } else if (out_ch == 1) {
+        for (i = 0; i < len; i++) {
+            v0 = 0;
+            for (j = 0; j < in_ch; j++)
+                v0 += (int64_t)samples[j][i] * matrix[j][0];
+            samples[0][i] = (v0+2048)>>12;
+        }
+    }
+}
+
 static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len)
 {
@@ -221,11 +288,16 @@ av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
     c->update_bap_counts = ac3_update_bap_counts_c;
     c->compute_mantissa_size = ac3_compute_mantissa_size_c;
     c->extract_exponents = ac3_extract_exponents_c;
+    c->sum_square_butterfly_int32 = ac3_sum_square_butterfly_int32_c;
+    c->sum_square_butterfly_float = ac3_sum_square_butterfly_float_c;
     c->downmix = ac3_downmix_c;
+    c->downmix_fixed = ac3_downmix_c_fixed;
     c->apply_window_int16 = apply_window_int16_c;
 
     if (ARCH_ARM)
         ff_ac3dsp_init_arm(c, bit_exact);
     if (ARCH_X86)
         ff_ac3dsp_init_x86(c, bit_exact);
+    if (ARCH_MIPS)
+        ff_ac3dsp_init_mips(c, bit_exact);
 }
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index 6ca0c5b8e8..ed98c8ce6a 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@@ -2,20 +2,20 @@
  * AC-3 DSP functions
  * Copyright (c) 2011 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -126,9 +126,18 @@ typedef struct AC3DSPContext {
 
     void (*extract_exponents)(uint8_t *exp, int32_t *coef, int nb_coefs);
 
+    void (*sum_square_butterfly_int32)(int64_t sum[4], const int32_t *coef0,
+                                       const int32_t *coef1, int len);
+
+    void (*sum_square_butterfly_float)(float sum[4], const float *coef0,
+                                       const float *coef1, int len);
+
     void (*downmix)(float **samples, float (*matrix)[2], int out_ch,
                     int in_ch, int len);
 
+    void (*downmix_fixed)(int32_t **samples, int16_t (*matrix)[2], int out_ch,
+                          int in_ch, int len);
+
     /**
      * Apply symmetric window in 16-bit fixed-point.
      * @param output destination array
@@ -147,5 +156,6 @@ typedef struct AC3DSPContext {
 void ff_ac3dsp_init    (AC3DSPContext *c, int bit_exact);
 void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact);
 void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
+void ff_ac3dsp_init_mips(AC3DSPContext *c, int bit_exact);
 
 #endif /* AVCODEC_AC3DSP_H */
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index b10767a378..35e721a964 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2010 Justin Ruggles <justin.ruggles@gmail.com>
  * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,6 +36,7 @@
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
+#include "internal.h"
 #include "me_cmp.h"
 #include "put_bits.h"
 #include "audiodsp.h"
@@ -273,7 +274,7 @@ void ff_ac3_apply_rematrixing(AC3EncodeContext *s)
     int nb_coefs;
     int blk, bnd, i;
     int start, end;
-    uint8_t *flags;
+    uint8_t *flags = NULL;
 
     if (!s->rematrixing_enabled)
         return;
@@ -1210,14 +1211,11 @@ static void quantize_mantissas_blk_ch(AC3Mant *s, int32_t *fixed_coef,
     int i;
 
     for (i = start_freq; i < end_freq; i++) {
-        int v;
         int c = fixed_coef[i];
         int e = exp[i];
-        int b = bap[i];
-        switch (b) {
-        case 0:
-            v = 0;
-            break;
+        int v = bap[i];
+        if (v)
+        switch (v) {
         case 1:
             v = sym_quant(c, e, 3);
             switch (s->mant1_cnt) {
@@ -1286,7 +1284,7 @@ static void quantize_mantissas_blk_ch(AC3Mant *s, int32_t *fixed_coef,
             v = asym_quant(c, e, 16);
             break;
         default:
-            v = asym_quant(c, e, b - 1);
+            v = asym_quant(c, e, v - 1);
             break;
         }
         qmant[i] = v;
@@ -1386,7 +1384,7 @@ static void ac3_output_frame_header(AC3EncodeContext *s)
  */
 static void output_audio_block(AC3EncodeContext *s, int blk)
 {
-    int ch, i, baie, bnd, got_cpl, ch0;
+    int ch, i, baie, bnd, got_cpl, av_uninit(ch0);
     AC3Block *block = &s->blocks[blk];
 
     /* block switching */
@@ -2022,6 +2020,7 @@ av_cold int ff_ac3_encode_close(AVCodecContext *avctx)
     AC3EncodeContext *s = avctx->priv_data;
 
     av_freep(&s->windowed_samples);
+    if (s->planar_samples)
     for (ch = 0; ch < s->channels; ch++)
         av_freep(&s->planar_samples[ch]);
     av_freep(&s->planar_samples);
@@ -2037,6 +2036,7 @@ av_cold int ff_ac3_encode_close(AVCodecContext *avctx)
     av_freep(&s->qmant_buffer);
     av_freep(&s->cpl_coord_exp_buffer);
     av_freep(&s->cpl_coord_mant_buffer);
+    av_freep(&s->fdsp);
     for (blk = 0; blk < s->num_blocks; blk++) {
         AC3Block *block = &s->blocks[blk];
         av_freep(&block->mdct_coef);
@@ -2153,8 +2153,9 @@ static av_cold int validate_options(AC3EncodeContext *s)
 
     /* validate bit rate */
     if (s->eac3) {
-        int max_br, min_br, wpf, min_br_dist, min_br_code;
+        int max_br, min_br, wpf, min_br_code;
         int num_blks_code, num_blocks, frame_samples;
+        long long min_br_dist;
 
         /* calculate min/max bitrate */
         /* TODO: More testing with 3 and 2 blocks. All E-AC-3 samples I've
@@ -2184,9 +2185,9 @@ static av_cold int validate_options(AC3EncodeContext *s)
            this is needed for lookup tables for bandwidth and coupling
            parameter selection */
         min_br_code = -1;
-        min_br_dist = INT_MAX;
+        min_br_dist = INT64_MAX;
         for (i = 0; i < 19; i++) {
-            int br_dist = abs(ff_ac3_bitrate_tab[i] * 1000 - avctx->bit_rate);
+            long long br_dist = llabs(ff_ac3_bitrate_tab[i] * 1000 - avctx->bit_rate);
             if (br_dist < min_br_dist) {
                 min_br_dist = br_dist;
                 min_br_code = i;
@@ -2199,10 +2200,11 @@ static av_cold int validate_options(AC3EncodeContext *s)
             wpf--;
         s->frame_size_min = 2 * wpf;
     } else {
-        int best_br = 0, best_code = 0, best_diff = INT_MAX;
+        int best_br = 0, best_code = 0;
+        long long best_diff = INT64_MAX;
         for (i = 0; i < 19; i++) {
             int br   = (ff_ac3_bitrate_tab[i] >> s->bit_alloc.sr_shift) * 1000;
-            int diff = abs(br - avctx->bit_rate);
+            long long diff = llabs(br - avctx->bit_rate);
             if (diff < best_diff) {
                 best_br   = br;
                 best_code = i;
@@ -2250,7 +2252,7 @@ static av_cold int validate_options(AC3EncodeContext *s)
  */
 static av_cold void set_bandwidth(AC3EncodeContext *s)
 {
-    int blk, ch, cpl_start;
+    int blk, ch, av_uninit(cpl_start);
 
     if (s->cutoff) {
         /* calculate bandwidth based on user-specified cutoff frequency */
@@ -2329,50 +2331,50 @@ static av_cold int allocate_buffers(AC3EncodeContext *s)
     if (s->allocate_sample_buffers(s))
         goto alloc_fail;
 
-    FF_ALLOC_OR_GOTO(avctx, s->bap_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->bap_buffer, total_coefs,
                      sizeof(*s->bap_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->bap1_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->bap1_buffer, total_coefs,
                      sizeof(*s->bap1_buffer), alloc_fail);
-    FF_ALLOCZ_OR_GOTO(avctx, s->mdct_coef_buffer, total_coefs *
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->mdct_coef_buffer, total_coefs,
                       sizeof(*s->mdct_coef_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->exp_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->exp_buffer, total_coefs,
                      sizeof(*s->exp_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->grouped_exp_buffer, channel_blocks * 128 *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->grouped_exp_buffer, channel_blocks, 128 *
                      sizeof(*s->grouped_exp_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->psd_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->psd_buffer, total_coefs,
                      sizeof(*s->psd_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->band_psd_buffer, channel_blocks * 64 *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->band_psd_buffer, channel_blocks, 64 *
                      sizeof(*s->band_psd_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->mask_buffer, channel_blocks * 64 *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->mask_buffer, channel_blocks, 64 *
                      sizeof(*s->mask_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->qmant_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->qmant_buffer, total_coefs,
                      sizeof(*s->qmant_buffer), alloc_fail);
     if (s->cpl_enabled) {
-        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_exp_buffer, channel_blocks * 16 *
+        FF_ALLOC_ARRAY_OR_GOTO(avctx, s->cpl_coord_exp_buffer, channel_blocks, 16 *
                          sizeof(*s->cpl_coord_exp_buffer), alloc_fail);
-        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_mant_buffer, channel_blocks * 16 *
+        FF_ALLOC_ARRAY_OR_GOTO(avctx, s->cpl_coord_mant_buffer, channel_blocks, 16 *
                          sizeof(*s->cpl_coord_mant_buffer), alloc_fail);
     }
     for (blk = 0; blk < s->num_blocks; blk++) {
         AC3Block *block = &s->blocks[blk];
-        FF_ALLOCZ_OR_GOTO(avctx, block->mdct_coef, channels * sizeof(*block->mdct_coef),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->mdct_coef, channels, sizeof(*block->mdct_coef),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->exp, channels * sizeof(*block->exp),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->exp, channels, sizeof(*block->exp),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->grouped_exp, channels * sizeof(*block->grouped_exp),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->grouped_exp, channels, sizeof(*block->grouped_exp),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->psd, channels * sizeof(*block->psd),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->psd, channels, sizeof(*block->psd),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->band_psd, channels * sizeof(*block->band_psd),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->band_psd, channels, sizeof(*block->band_psd),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->mask, channels * sizeof(*block->mask),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->mask, channels, sizeof(*block->mask),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->qmant, channels * sizeof(*block->qmant),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->qmant, channels, sizeof(*block->qmant),
                           alloc_fail);
         if (s->cpl_enabled) {
-            FF_ALLOCZ_OR_GOTO(avctx, block->cpl_coord_exp, channels * sizeof(*block->cpl_coord_exp),
+            FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->cpl_coord_exp, channels, sizeof(*block->cpl_coord_exp),
                               alloc_fail);
-            FF_ALLOCZ_OR_GOTO(avctx, block->cpl_coord_mant, channels * sizeof(*block->cpl_coord_mant),
+            FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->cpl_coord_mant, channels, sizeof(*block->cpl_coord_mant),
                               alloc_fail);
         }
 
@@ -2395,11 +2397,11 @@ static av_cold int allocate_buffers(AC3EncodeContext *s)
     }
 
     if (!s->fixed_point) {
-        FF_ALLOCZ_OR_GOTO(avctx, s->fixed_coef_buffer, total_coefs *
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->fixed_coef_buffer, total_coefs,
                           sizeof(*s->fixed_coef_buffer), alloc_fail);
         for (blk = 0; blk < s->num_blocks; blk++) {
             AC3Block *block = &s->blocks[blk];
-            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, channels *
+            FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->fixed_coef, channels,
                               sizeof(*block->fixed_coef), alloc_fail);
             for (ch = 0; ch < channels; ch++)
                 block->fixed_coef[ch] = &s->fixed_coef_buffer[AC3_MAX_COEFS * (s->num_blocks * ch + blk)];
@@ -2407,7 +2409,7 @@ static av_cold int allocate_buffers(AC3EncodeContext *s)
     } else {
         for (blk = 0; blk < s->num_blocks; blk++) {
             AC3Block *block = &s->blocks[blk];
-            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, channels *
+            FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->fixed_coef, channels,
                               sizeof(*block->fixed_coef), alloc_fail);
             for (ch = 0; ch < channels; ch++)
                 block->fixed_coef[ch] = (int32_t *)block->mdct_coef[ch];
diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h
index 76b6d7f6e5..a2442d0e55 100644
--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000 Fabrice Bellard
  * Copyright (c) 2006-2010 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -165,7 +165,7 @@ typedef struct AC3EncodeContext {
     AVCodecContext *avctx;                  ///< parent AVCodecContext
     PutBitContext pb;                       ///< bitstream writer context
     AudioDSPContext adsp;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     MECmpContext mecc;
     AC3DSPContext ac3dsp;                   ///< AC-3 optimized functions
     FFTContext mdct;                        ///< FFT context for MDCT calculation
diff --git a/libavcodec/ac3enc_fixed.c b/libavcodec/ac3enc_fixed.c
index 2bb82ef3b6..9d39026dd5 100644
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2010 Justin Ruggles <justin.ruggles@gmail.com>
  * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,8 +35,13 @@
 
 #define AC3ENC_TYPE AC3ENC_TYPE_AC3_FIXED
 #include "ac3enc_opts_template.c"
-static const AVClass ac3enc_class = { "Fixed-Point AC-3 Encoder", av_default_item_name,
-                                      ac3_options, LIBAVUTIL_VERSION_INT };
+
+static const AVClass ac3enc_class = {
+    .class_name = "Fixed-Point AC-3 Encoder",
+    .item_name  = av_default_item_name,
+    .option     = ac3_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 #include "ac3enc_template.c"
 
@@ -97,6 +102,12 @@ static void scale_coefficients(AC3EncodeContext *s)
     }
 }
 
+static void sum_square_butterfly(AC3EncodeContext *s, int64_t sum[4],
+                                 const int32_t *coef0, const int32_t *coef1,
+                                 int len)
+{
+    s->ac3dsp.sum_square_butterfly_int32(sum, coef0, coef1, len);
+}
 
 /*
  * Clip MDCT coefficients to allowable range.
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index 822f431b44..6c91f459d0 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2010 Justin Ruggles <justin.ruggles@gmail.com>
  * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,8 +36,12 @@
 
 #define AC3ENC_TYPE AC3ENC_TYPE_AC3
 #include "ac3enc_opts_template.c"
-static const AVClass ac3enc_class = { "AC-3 Encoder", av_default_item_name,
-                                      ac3_options, LIBAVUTIL_VERSION_INT };
+static const AVClass ac3enc_class = {
+    .class_name = "AC-3 Encoder",
+    .item_name  = av_default_item_name,
+    .option     = ac3_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 #include "ac3enc_template.c"
 
@@ -68,7 +72,7 @@ av_cold int ff_ac3_float_mdct_init(AC3EncodeContext *s)
     n  = 1 << 9;
     n2 = n >> 1;
 
-    window = av_malloc(n * sizeof(*window));
+    window = av_malloc_array(n, sizeof(*window));
     if (!window) {
         av_log(s->avctx, AV_LOG_ERROR, "Cannot allocate memory.\n");
         return AVERROR(ENOMEM);
@@ -104,6 +108,12 @@ static void scale_coefficients(AC3EncodeContext *s)
                                chan_size * (s->channels + cpl));
 }
 
+static void sum_square_butterfly(AC3EncodeContext *s, float sum[4],
+                                 const float *coef0, const float *coef1,
+                                 int len)
+{
+    s->ac3dsp.sum_square_butterfly_float(sum, coef0, coef1, len);
+}
 
 /*
  * Clip MDCT coefficients to allowable range.
@@ -129,7 +139,9 @@ static CoefType calc_cpl_coord(CoefSumType energy_ch, CoefSumType energy_cpl)
 av_cold int ff_ac3_float_encode_init(AVCodecContext *avctx)
 {
     AC3EncodeContext *s = avctx->priv_data;
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
     return ff_ac3_encode_init(avctx);
 }
 
diff --git a/libavcodec/ac3enc_opts_template.c b/libavcodec/ac3enc_opts_template.c
index a08c70d2c5..83113b8d12 100644
--- a/libavcodec/ac3enc_opts_template.c
+++ b/libavcodec/ac3enc_opts_template.c
@@ -2,20 +2,20 @@
  * AC-3 encoder options
  * Copyright (c) 2011 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ac3enc_template.c b/libavcodec/ac3enc_template.c
index 8febf858ef..9dec9ae9cf 100644
--- a/libavcodec/ac3enc_template.c
+++ b/libavcodec/ac3enc_template.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2011 Justin Ruggles <justin.ruggles@gmail.com>
  * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,6 +47,9 @@ static void clip_coefficients(AudioDSPContext *adsp, CoefType *coef,
 
 static CoefType calc_cpl_coord(CoefSumType energy_ch, CoefSumType energy_cpl);
 
+static void sum_square_butterfly(AC3EncodeContext *s, CoefSumType sum[4],
+                                 const CoefType *coef0, const CoefType *coef1,
+                                 int len);
 
 int AC3_NAME(allocate_sample_buffers)(AC3EncodeContext *s)
 {
@@ -54,7 +57,7 @@ int AC3_NAME(allocate_sample_buffers)(AC3EncodeContext *s)
 
     FF_ALLOC_OR_GOTO(s->avctx, s->windowed_samples, AC3_WINDOW_SIZE *
                      sizeof(*s->windowed_samples), alloc_fail);
-    FF_ALLOC_OR_GOTO(s->avctx, s->planar_samples, s->channels * sizeof(*s->planar_samples),
+    FF_ALLOC_ARRAY_OR_GOTO(s->avctx, s->planar_samples, s->channels, sizeof(*s->planar_samples),
                      alloc_fail);
     for (ch = 0; ch < s->channels; ch++) {
         FF_ALLOCZ_OR_GOTO(s->avctx, s->planar_samples[ch],
@@ -70,7 +73,7 @@ alloc_fail:
 
 /*
  * Copy input samples.
- * Channels are reordered from Libav's default order to AC-3 order.
+ * Channels are reordered from FFmpeg's default order to AC-3 order.
  */
 static void copy_input_samples(AC3EncodeContext *s, SampleType **samples)
 {
@@ -105,7 +108,7 @@ static void apply_mdct(AC3EncodeContext *s)
             const SampleType *input_samples = &s->planar_samples[ch][blk * AC3_BLOCK_SIZE];
 
 #if CONFIG_AC3ENC_FLOAT
-            s->fdsp.vector_fmul(s->windowed_samples, input_samples,
+            s->fdsp->vector_fmul(s->windowed_samples, input_samples,
                                 s->mdct_window, AC3_WINDOW_SIZE);
 #else
             s->ac3dsp.apply_window_int16(s->windowed_samples, input_samples,
@@ -133,7 +136,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
 #else
     int32_t (*fixed_cpl_coords)[AC3_MAX_CHANNELS][16] = cpl_coords;
 #endif
-    int blk, ch, bnd, i, j;
+    int av_uninit(blk), ch, bnd, i, j;
     CoefSumType energy[AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16] = {{{0}}};
     int cpl_start, num_cpl_coefs;
 
@@ -260,7 +263,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
                 energy_cpl = energy[blk][CPL_CH][bnd];
                 energy_ch = energy[blk][ch][bnd];
                 blk1 = blk+1;
-                while (!s->blocks[blk1].new_cpl_coords[ch] && blk1 < s->num_blocks) {
+                while (blk1 < s->num_blocks && !s->blocks[blk1].new_cpl_coords[ch]) {
                     if (s->blocks[blk1].cpl_in_use) {
                         energy_cpl += energy[blk1][CPL_CH][bnd];
                         energy_ch += energy[blk1][ch][bnd];
@@ -336,8 +339,8 @@ static void apply_channel_coupling(AC3EncodeContext *s)
 static void compute_rematrixing_strategy(AC3EncodeContext *s)
 {
     int nb_coefs;
-    int blk, bnd, i;
-    AC3Block *block, *block0;
+    int blk, bnd;
+    AC3Block *block, *block0 = NULL;
 
     if (s->channel_mode != AC3_CHMODE_STEREO)
         return;
@@ -361,20 +364,12 @@ static void compute_rematrixing_strategy(AC3EncodeContext *s)
         }
 
         for (bnd = 0; bnd < block->num_rematrixing_bands; bnd++) {
-            /* calculate calculate sum of squared coeffs for one band in one block */
+            /* calculate sum of squared coeffs for one band in one block */
             int start = ff_ac3_rematrix_band_tab[bnd];
             int end   = FFMIN(nb_coefs, ff_ac3_rematrix_band_tab[bnd+1]);
-            CoefSumType sum[4] = {0,};
-            for (i = start; i < end; i++) {
-                CoefType lt = block->mdct_coef[1][i];
-                CoefType rt = block->mdct_coef[2][i];
-                CoefType md = lt + rt;
-                CoefType sd = lt - rt;
-                MAC_COEF(sum[0], lt, lt);
-                MAC_COEF(sum[1], rt, rt);
-                MAC_COEF(sum[2], md, md);
-                MAC_COEF(sum[3], sd, sd);
-            }
+            CoefSumType sum[4];
+            sum_square_butterfly(s, sum, block->mdct_coef[1] + start,
+                                 block->mdct_coef[2] + start, end - start);
 
             /* compare sums to determine if rematrixing will be used for this band */
             if (FFMIN(sum[2], sum[3]) < FFMIN(sum[0], sum[1]))
@@ -443,10 +438,8 @@ int AC3_NAME(encode_frame)(AVCodecContext *avctx, AVPacket *avpkt,
 
     ff_ac3_quantize_mantissas(s);
 
-    if ((ret = ff_alloc_packet(avpkt, s->frame_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, s->frame_size, 0)) < 0)
         return ret;
-    }
     ff_ac3_output_frame(s, avpkt->data);
 
     if (frame->pts != AV_NOPTS_VALUE)
diff --git a/libavcodec/ac3tab.c b/libavcodec/ac3tab.c
index 3cd07f9e4b..d62d8bfbf5 100644
--- a/libavcodec/ac3tab.c
+++ b/libavcodec/ac3tab.c
@@ -2,20 +2,20 @@
  * AC-3 tables
  * copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -116,7 +116,7 @@ const uint8_t ff_ac3_enc_channel_map[8][2][6] = {
 };
 
 /**
- * Table to remap channels from from AC-3 order to SMPTE order.
+ * Table to remap channels from AC-3 order to SMPTE order.
  * [channel_mode][lfe][ch]
  */
 const uint8_t ff_ac3_dec_channel_map[8][2][6] = {
diff --git a/libavcodec/ac3tab.h b/libavcodec/ac3tab.h
index 83edec52d1..74cbd9ed65 100644
--- a/libavcodec/ac3tab.h
+++ b/libavcodec/ac3tab.h
@@ -2,20 +2,20 @@
  * AC-3 tables
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/acelp_filters.c b/libavcodec/acelp_filters.c
index 93bec6589a..9ab758b977 100644
--- a/libavcodec/acelp_filters.c
+++ b/libavcodec/acelp_filters.c
@@ -3,25 +3,26 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <inttypes.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "avcodec.h"
 #include "acelp_filters.h"
@@ -46,7 +47,7 @@ void ff_acelp_interpolate(int16_t* out, const int16_t* in,
 {
     int n, i;
 
-    assert(frac_pos >= 0 && frac_pos < precision);
+    av_assert1(frac_pos >= 0 && frac_pos < precision);
 
     for (n = 0; n < length; n++) {
         int idx = 0;
@@ -143,3 +144,12 @@ void ff_tilt_compensation(float *mem, float tilt, float *samples, int size)
     samples[0] -= tilt * *mem;
     *mem = new_tilt_mem;
 }
+
+void ff_acelp_filter_init(ACELPFContext *c)
+{
+    c->acelp_interpolatef                      = ff_acelp_interpolatef;
+    c->acelp_apply_order_2_transfer_function   = ff_acelp_apply_order_2_transfer_function;
+
+    if(HAVE_MIPSFPU)
+        ff_acelp_filter_init_mips(c);
+}
diff --git a/libavcodec/acelp_filters.h b/libavcodec/acelp_filters.h
index 6a9ebd943e..7a3061bd1f 100644
--- a/libavcodec/acelp_filters.h
+++ b/libavcodec/acelp_filters.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,39 @@
 
 #include <stdint.h>
 
+typedef struct ACELPFContext {
+    /**
+    * Floating point version of ff_acelp_interpolate()
+    */
+    void (*acelp_interpolatef)(float *out, const float *in,
+                            const float *filter_coeffs, int precision,
+                            int frac_pos, int filter_length, int length);
+
+    /**
+     * Apply an order 2 rational transfer function in-place.
+     *
+     * @param out output buffer for filtered speech samples
+     * @param in input buffer containing speech data (may be the same as out)
+     * @param zero_coeffs z^-1 and z^-2 coefficients of the numerator
+     * @param pole_coeffs z^-1 and z^-2 coefficients of the denominator
+     * @param gain scale factor for final output
+     * @param mem intermediate values used by filter (should be 0 initially)
+     * @param n number of samples (should be a multiple of eight)
+     */
+    void (*acelp_apply_order_2_transfer_function)(float *out, const float *in,
+                                                  const float zero_coeffs[2],
+                                                  const float pole_coeffs[2],
+                                                  float gain,
+                                                  float mem[2], int n);
+
+}ACELPFContext;
+
+/**
+ * Initialize ACELPFContext.
+ */
+void ff_acelp_filter_init(ACELPFContext *c);
+void ff_acelp_filter_init_mips(ACELPFContext *c);
+
 /**
  * low-pass Finite Impulse Response filter coefficients.
  *
@@ -76,7 +109,7 @@ void ff_acelp_interpolatef(float *out, const float *in,
  *
  * The filter has a cut-off frequency of 1/80 of the sampling freq
  *
- * @note Two items before the top of the out buffer must contain two items from the
+ * @note Two items before the top of the in buffer must contain two items from the
  *       tail of the previous subframe.
  *
  * @remark It is safe to pass the same array in in and out parameters.
diff --git a/libavcodec/acelp_pitch_delay.c b/libavcodec/acelp_pitch_delay.c
index 19657729ce..3ecec01cbe 100644
--- a/libavcodec/acelp_pitch_delay.c
+++ b/libavcodec/acelp_pitch_delay.c
@@ -3,25 +3,26 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/common.h"
 #include "libavutil/float_dsp.h"
+#include "libavutil/libm.h"
 #include "libavutil/mathematics.h"
 #include "avcodec.h"
 #include "acelp_pitch_delay.h"
@@ -107,9 +108,20 @@ int16_t ff_acelp_decode_gain_code(
     for(i=0; i<ma_pred_order; i++)
         mr_energy += quant_energy[i] * ma_prediction_coeff[i];
 
+#ifdef G729_BITEXACT
+    mr_energy += (((-6165LL * ff_log2(dsp->scalarproduct_int16(fc_v, fc_v, subframe_size, 0))) >> 3) & ~0x3ff);
+
+    mr_energy = (5439 * (mr_energy >> 15)) >> 8;           // (0.15) = (0.15) * (7.23)
+
+    return bidir_sal(
+               ((ff_exp2(mr_energy & 0x7fff) + 16) >> 5) * (gain_corr_factor >> 1),
+               (mr_energy >> 15) - 25
+           );
+#else
     mr_energy = gain_corr_factor * exp(M_LN10 / (20 << 23) * mr_energy) /
                 sqrt(adsp->scalarproduct_int16(fc_v, fc_v, subframe_size));
     return mr_energy >> 12;
+#endif
 }
 
 float ff_amr_set_fixed_gain(float fixed_gain_factor, float fixed_mean_energy,
diff --git a/libavcodec/acelp_pitch_delay.h b/libavcodec/acelp_pitch_delay.h
index 7b5b33d9b4..2aade2f226 100644
--- a/libavcodec/acelp_pitch_delay.h
+++ b/libavcodec/acelp_pitch_delay.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/acelp_vectors.c b/libavcodec/acelp_vectors.c
index 0c660acb5a..798217d73b 100644
--- a/libavcodec/acelp_vectors.c
+++ b/libavcodec/acelp_vectors.c
@@ -3,25 +3,26 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <inttypes.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/float_dsp.h"
 #include "avcodec.h"
@@ -50,6 +51,26 @@ const uint8_t ff_fc_2pulses_9bits_track1_gray[16] =
   28, 26,
 };
 
+const uint8_t ff_fc_2pulses_9bits_track2_gray[32] =
+{
+  0,  2,
+  5,  4,
+  12, 10,
+  7,  9,
+  25, 24,
+  20, 22,
+  14, 15,
+  19, 17,
+  36, 31,
+  21, 26,
+  1,  6,
+  16, 11,
+  27, 29,
+  32, 30,
+  39, 37,
+  34, 35,
+};
+
 const uint8_t ff_fc_4pulses_8bits_tracks_13[16] =
 {
   0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75,
@@ -219,11 +240,13 @@ void ff_set_fixed_vector(float *out, const AMRFixed *in, float scale, int size)
         int x   = in->x[i], repeats = !((in->no_repeat_mask >> i) & 1);
         float y = in->y[i] * scale;
 
-        do {
-            out[x] += y;
-            y *= in->pitch_fac;
-            x += in->pitch_lag;
-        } while (x < size && repeats);
+        if (in->pitch_lag > 0)
+            av_assert0(x < size);
+            do {
+                out[x] += y;
+                y *= in->pitch_fac;
+                x += in->pitch_lag;
+            } while (x < size && repeats);
     }
 }
 
@@ -234,9 +257,18 @@ void ff_clear_fixed_vector(float *out, const AMRFixed *in, int size)
     for (i=0; i < in->n; i++) {
         int x  = in->x[i], repeats = !((in->no_repeat_mask >> i) & 1);
 
-        do {
-            out[x] = 0.0;
-            x += in->pitch_lag;
-        } while (x < size && repeats);
+        if (in->pitch_lag > 0)
+            do {
+                out[x] = 0.0;
+                x += in->pitch_lag;
+            } while (x < size && repeats);
     }
 }
+
+void ff_acelp_vectors_init(ACELPVContext *c)
+{
+    c->weighted_vector_sumf   = ff_weighted_vector_sumf;
+
+    if(HAVE_MIPSFPU)
+        ff_acelp_vectors_init_mips(c);
+}
diff --git a/libavcodec/acelp_vectors.h b/libavcodec/acelp_vectors.h
index d6226bf020..fae834dac1 100644
--- a/libavcodec/acelp_vectors.h
+++ b/libavcodec/acelp_vectors.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,30 @@
 
 #include <stdint.h>
 
+typedef struct ACELPVContext {
+    /**
+     * float implementation of weighted sum of two vectors.
+     * @param[out] out result of addition
+     * @param in_a first vector
+     * @param in_b second vector
+     * @param weight_coeff_a first vector weight coefficient
+     * @param weight_coeff_a second vector weight coefficient
+     * @param length vectors length (should be a multiple of two)
+     *
+     * @note It is safe to pass the same buffer for out and in_a or in_b.
+     */
+    void (*weighted_vector_sumf)(float *out, const float *in_a, const float *in_b,
+                                 float weight_coeff_a, float weight_coeff_b,
+                                 int length);
+
+}ACELPVContext;
+
+/**
+ * Initialize ACELPVContext.
+ */
+void ff_acelp_vectors_init(ACELPVContext *c);
+void ff_acelp_vectors_init_mips(ACELPVContext *c);
+
 /** Sparse representation for the algebraic codebook (fixed) vector */
 typedef struct AMRFixed {
     int      n;
@@ -82,6 +106,37 @@ extern const uint8_t ff_fc_2pulses_9bits_track1[16];
 extern const uint8_t ff_fc_2pulses_9bits_track1_gray[16];
 
 /**
+ * Track|Pulse|        Positions
+ * -----------------------------------------
+ *  2   | 1   | 0, 7, 14, 20, 27, 34,  1, 21
+ *      |     | 2, 9, 15, 22, 29, 35,  6, 26
+ *      |     | 4,10, 17, 24, 30, 37, 11, 31
+ *      |     | 5,12, 19, 25, 32, 39, 16, 36
+ * -----------------------------------------
+ *
+ * @remark Track in the table should be read top-to-bottom, left-to-right.
+ *
+ * @note (EE.1) This table (from the reference code) does not comply with
+ *              the specification.
+ *              The specification contains the following table:
+ *
+ * Track|Pulse|        Positions
+ * -----------------------------------------
+ *  2   | 1   | 0, 5, 10, 15, 20, 25, 30, 35
+ *      |     | 1, 6, 11, 16, 21, 26, 31, 36
+ *      |     | 2, 7, 12, 17, 22, 27, 32, 37
+ *      |     | 4, 9, 14, 19, 24, 29, 34, 39
+ *
+ * -----------------------------------------
+ *
+ * @note (EE.2) Reference G.729D code also uses gray decoding for each
+ *              pulse index before looking up the value in the table.
+ *
+ * Used in G.729 @@6.4k (with gray coding)
+ */
+extern const uint8_t ff_fc_2pulses_9bits_track2_gray[32];
+
+/**
  * b60 hamming windowed sinc function coefficients
  */
 extern const float ff_b60_sinc[61];
diff --git a/libavcodec/adpcm.c b/libavcodec/adpcm.c
index aa3c7b9262..ba38041f4a 100644
--- a/libavcodec/adpcm.c
+++ b/libavcodec/adpcm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001-2003 The ffmpeg Project
+ * Copyright (c) 2001-2003 The FFmpeg Project
  *
  * first version by Francois Revol (revol@free.fr)
  * fringe ADPCM codecs (e.g., DK3, DK4, Westwood)
@@ -13,25 +13,24 @@
  * MAXIS EA ADPCM decoder by Robert Marston (rmarston@gmail.com)
  * THP ADPCM decoder by Marco Gerards (mgerards@xs4all.nl)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
 #include "get_bits.h"
-#include "put_bits.h"
 #include "bytestream.h"
 #include "adpcm.h"
 #include "adpcm_data.h"
@@ -85,8 +84,9 @@ static const int swf_index_tables[4][16] = {
 /* end of tables */
 
 typedef struct ADPCMDecodeContext {
-    ADPCMChannelStatus status[6];
+    ADPCMChannelStatus status[14];
     int vqa_version;                /**< VQA version. Used for ADPCM_IMA_WS */
+    int has_status;
 } ADPCMDecodeContext;
 
 static av_cold int adpcm_decode_init(AVCodecContext * avctx)
@@ -96,15 +96,21 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
     unsigned int max_channels = 2;
 
     switch(avctx->codec->id) {
+    case AV_CODEC_ID_ADPCM_DTK:
     case AV_CODEC_ID_ADPCM_EA:
         min_channels = 2;
         break;
+    case AV_CODEC_ID_ADPCM_AFC:
     case AV_CODEC_ID_ADPCM_EA_R1:
     case AV_CODEC_ID_ADPCM_EA_R2:
     case AV_CODEC_ID_ADPCM_EA_R3:
     case AV_CODEC_ID_ADPCM_EA_XAS:
         max_channels = 6;
         break;
+    case AV_CODEC_ID_ADPCM_THP:
+    case AV_CODEC_ID_ADPCM_THP_LE:
+        max_channels = 14;
+        break;
     }
     if (avctx->channels < min_channels || avctx->channels > max_channels) {
         av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n");
@@ -116,10 +122,8 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
         c->status[0].step = c->status[1].step = 511;
         break;
     case AV_CODEC_ID_ADPCM_IMA_WAV:
-        if (avctx->bits_per_coded_sample != 4) {
-            av_log(avctx, AV_LOG_ERROR, "Only 4-bit ADPCM IMA WAV files are supported\n");
-            return -1;
-        }
+        if (avctx->bits_per_coded_sample < 2 || avctx->bits_per_coded_sample > 5)
+            return AVERROR_INVALIDDATA;
         break;
     case AV_CODEC_ID_ADPCM_IMA_APC:
         if (avctx->extradata && avctx->extradata_size >= 8) {
@@ -145,6 +149,9 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
         case AV_CODEC_ID_ADPCM_EA_R3:
         case AV_CODEC_ID_ADPCM_EA_XAS:
         case AV_CODEC_ID_ADPCM_THP:
+        case AV_CODEC_ID_ADPCM_THP_LE:
+        case AV_CODEC_ID_ADPCM_AFC:
+        case AV_CODEC_ID_ADPCM_DTK:
             avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
             break;
         case AV_CODEC_ID_ADPCM_IMA_WS:
@@ -184,6 +191,29 @@ static inline short adpcm_ima_expand_nibble(ADPCMChannelStatus *c, char nibble,
     return (short)c->predictor;
 }
 
+static inline int16_t adpcm_ima_wav_expand_nibble(ADPCMChannelStatus *c, GetBitContext *gb, int bps)
+{
+    int nibble, step_index, predictor, sign, delta, diff, step, shift;
+
+    shift = bps - 1;
+    nibble = get_bits_le(gb, bps),
+    step = ff_adpcm_step_table[c->step_index];
+    step_index = c->step_index + ff_adpcm_index_tables[bps - 2][nibble];
+    step_index = av_clip(step_index, 0, 88);
+
+    sign = nibble & (1 << shift);
+    delta = av_mod_uintp2(nibble, shift);
+    diff = ((2 * delta + 1) * step) >> shift;
+    predictor = c->predictor;
+    if (sign) predictor -= diff;
+    else predictor += diff;
+
+    c->predictor = av_clip_int16(predictor);
+    c->step_index = step_index;
+
+    return (int16_t)c->predictor;
+}
+
 static inline int adpcm_ima_qt_expand_nibble(ADPCMChannelStatus *c, int nibble, int shift)
 {
     int step_index;
@@ -221,10 +251,35 @@ static inline short adpcm_ms_expand_nibble(ADPCMChannelStatus *c, int nibble)
     c->sample1 = av_clip_int16(predictor);
     c->idelta = (ff_adpcm_AdaptationTable[(int)nibble] * c->idelta) >> 8;
     if (c->idelta < 16) c->idelta = 16;
+    if (c->idelta > INT_MAX/768) {
+        av_log(NULL, AV_LOG_WARNING, "idelta overflow\n");
+        c->idelta = INT_MAX/768;
+    }
 
     return c->sample1;
 }
 
+static inline short adpcm_ima_oki_expand_nibble(ADPCMChannelStatus *c, int nibble)
+{
+    int step_index, predictor, sign, delta, diff, step;
+
+    step = ff_adpcm_oki_step_table[c->step_index];
+    step_index = c->step_index + ff_adpcm_index_table[(unsigned)nibble];
+    step_index = av_clip(step_index, 0, 48);
+
+    sign = nibble & 8;
+    delta = nibble & 7;
+    diff = ((2 * delta + 1) * step) >> 3;
+    predictor = c->predictor;
+    if (sign) predictor -= diff;
+    else predictor += diff;
+
+    c->predictor = av_clip_intp2(predictor, 11);
+    c->step_index = step_index;
+
+    return c->predictor << 4;
+}
+
 static inline short adpcm_ct_expand_nibble(ADPCMChannelStatus *c, char nibble)
 {
     int sign, delta, diff;
@@ -298,11 +353,9 @@ static int xa_decode(AVCodecContext *avctx, int16_t *out0, int16_t *out1,
     for(i=0;i<4;i++) {
         shift  = 12 - (in[4+i*2] & 15);
         filter = in[4+i*2] >> 4;
-        if (filter > 4) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Invalid XA-ADPCM filter %d (max. allowed is 4)\n",
-                   filter);
-            return AVERROR_INVALIDDATA;
+        if (filter >= FF_ARRAY_ELEMS(xa_adpcm_table)) {
+            avpriv_request_sample(avctx, "unknown XA-ADPCM filter %d", filter);
+            filter=0;
         }
         f0 = xa_adpcm_table[filter][0];
         f1 = xa_adpcm_table[filter][1];
@@ -329,12 +382,11 @@ static int xa_decode(AVCodecContext *avctx, int16_t *out0, int16_t *out1,
 
         shift  = 12 - (in[5+i*2] & 15);
         filter = in[5+i*2] >> 4;
-        if (filter > 4) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Invalid XA-ADPCM filter %d (max. allowed is 4)\n",
-                   filter);
-            return AVERROR_INVALIDDATA;
+        if (filter >= FF_ARRAY_ELEMS(xa_adpcm_table)) {
+            avpriv_request_sample(avctx, "unknown XA-ADPCM filter %d", filter);
+            filter=0;
         }
+
         f0 = xa_adpcm_table[filter][0];
         f1 = xa_adpcm_table[filter][1];
 
@@ -428,9 +480,11 @@ static void adpcm_swf_decode(AVCodecContext *avctx, const uint8_t *buf, int buf_
  * @param[out] coded_samples set to the number of samples as coded in the
  *                           packet, or 0 if the codec does not encode the
  *                           number of samples in each frame.
+ * @param[out] approx_nb_samples set to non-zero if the number of samples
+ *                               returned is an approximation.
  */
 static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
-                          int buf_size, int *coded_samples)
+                          int buf_size, int *coded_samples, int *approx_nb_samples)
 {
     ADPCMDecodeContext *s = avctx->priv_data;
     int nb_samples        = 0;
@@ -439,6 +493,10 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
     int header_size;
 
     *coded_samples = 0;
+    *approx_nb_samples = 0;
+
+    if(ch <= 0)
+        return 0;
 
     switch (avctx->codec->id) {
     /* constant, only check buf_size */
@@ -456,6 +514,7 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
     case AV_CODEC_ID_ADPCM_CT:
     case AV_CODEC_ID_ADPCM_IMA_APC:
     case AV_CODEC_ID_ADPCM_IMA_EA_SEAD:
+    case AV_CODEC_ID_ADPCM_IMA_OKI:
     case AV_CODEC_ID_ADPCM_IMA_WS:
     case AV_CODEC_ID_ADPCM_YAMAHA:
         nb_samples = buf_size * 2 / ch;
@@ -470,7 +529,7 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
         case AV_CODEC_ID_ADPCM_4XM:
         case AV_CODEC_ID_ADPCM_IMA_ISS:     header_size = 4 * ch;      break;
         case AV_CODEC_ID_ADPCM_IMA_AMV:     header_size = 8;           break;
-        case AV_CODEC_ID_ADPCM_IMA_SMJPEG:  header_size = 4;           break;
+        case AV_CODEC_ID_ADPCM_IMA_SMJPEG:  header_size = 4 * ch;      break;
     }
     if (header_size > 0)
         return (buf_size - header_size) * 2 / ch;
@@ -514,6 +573,7 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
         *coded_samples -= *coded_samples % 28;
         nb_samples      = (buf_size - header_size) * 2 / ch;
         nb_samples     -= nb_samples % 28;
+        *approx_nb_samples = 1;
         break;
     case AV_CODEC_ID_ADPCM_IMA_DK3:
         if (avctx->block_align > 0)
@@ -523,17 +583,30 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
     case AV_CODEC_ID_ADPCM_IMA_DK4:
         if (avctx->block_align > 0)
             buf_size = FFMIN(buf_size, avctx->block_align);
+        if (buf_size < 4 * ch)
+            return AVERROR_INVALIDDATA;
         nb_samples = 1 + (buf_size - 4 * ch) * 2 / ch;
         break;
+    case AV_CODEC_ID_ADPCM_IMA_RAD:
+        if (avctx->block_align > 0)
+            buf_size = FFMIN(buf_size, avctx->block_align);
+        nb_samples = (buf_size - 4 * ch) * 2 / ch;
+        break;
     case AV_CODEC_ID_ADPCM_IMA_WAV:
+    {
+        int bsize = ff_adpcm_ima_block_sizes[avctx->bits_per_coded_sample - 2];
+        int bsamples = ff_adpcm_ima_block_samples[avctx->bits_per_coded_sample - 2];
         if (avctx->block_align > 0)
             buf_size = FFMIN(buf_size, avctx->block_align);
-        nb_samples = 1 + (buf_size - 4 * ch) / (4 * ch) * 8;
+        if (buf_size < 4 * ch)
+            return AVERROR_INVALIDDATA;
+        nb_samples = 1 + (buf_size - 4 * ch) / (bsize * ch) * bsamples;
         break;
+    }
     case AV_CODEC_ID_ADPCM_MS:
         if (avctx->block_align > 0)
             buf_size = FFMIN(buf_size, avctx->block_align);
-        nb_samples = 2 + (buf_size - 7 * ch) * 2 / ch;
+        nb_samples = (buf_size - 6 * ch) * 2 / ch;
         break;
     case AV_CODEC_ID_ADPCM_SBPRO_2:
     case AV_CODEC_ID_ADPCM_SBPRO_3:
@@ -546,6 +619,8 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
         case AV_CODEC_ID_ADPCM_SBPRO_4: samples_per_byte = 2; break;
         }
         if (!s->status[0].step_index) {
+            if (buf_size < ch)
+                return AVERROR_INVALIDDATA;
             nb_samples++;
             buf_size -= ch;
         }
@@ -566,15 +641,32 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
         break;
     }
     case AV_CODEC_ID_ADPCM_THP:
+    case AV_CODEC_ID_ADPCM_THP_LE:
+        if (avctx->extradata) {
+            nb_samples = buf_size * 14 / (8 * ch);
+            break;
+        }
         has_coded_samples = 1;
         bytestream2_skip(gb, 4); // channel size
-        *coded_samples  = bytestream2_get_be32(gb);
-        *coded_samples -= *coded_samples % 14;
-        nb_samples      = (buf_size - 80) / (8 * ch) * 14;
+        *coded_samples  = (avctx->codec->id == AV_CODEC_ID_ADPCM_THP_LE) ?
+                          bytestream2_get_le32(gb) :
+                          bytestream2_get_be32(gb);
+        buf_size       -= 8 + 36 * ch;
+        buf_size       /= ch;
+        nb_samples      = buf_size / 8 * 14;
+        if (buf_size % 8 > 1)
+            nb_samples     += (buf_size % 8 - 1) * 2;
+        *approx_nb_samples = 1;
+        break;
+    case AV_CODEC_ID_ADPCM_AFC:
+        nb_samples = buf_size / (9 * ch) * 16;
         break;
     case AV_CODEC_ID_ADPCM_XA:
         nb_samples = (buf_size / 128) * 224 / ch;
         break;
+    case AV_CODEC_ID_ADPCM_DTK:
+        nb_samples = buf_size / (16 * ch) * 28;
+        break;
     }
 
     /* validate coded sample count */
@@ -597,11 +689,11 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
     int16_t **samples_p;
     int st; /* stereo */
     int count1, count2;
-    int nb_samples, coded_samples, ret;
+    int nb_samples, coded_samples, approx_nb_samples, ret;
     GetByteContext gb;
 
     bytestream2_init(&gb, buf, buf_size);
-    nb_samples = get_nb_samples(avctx, &gb, buf_size, &coded_samples);
+    nb_samples = get_nb_samples(avctx, &gb, buf_size, &coded_samples, &approx_nb_samples);
     if (nb_samples <= 0) {
         av_log(avctx, AV_LOG_ERROR, "invalid number of samples in packet\n");
         return AVERROR_INVALIDDATA;
@@ -609,17 +701,15 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = nb_samples;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (short *)frame->data[0];
     samples_p = (int16_t **)frame->extended_data;
 
     /* use coded_samples when applicable */
     /* it is always <= nb_samples, so the output buffer will be large enough */
     if (coded_samples) {
-        if (coded_samples != nb_samples)
+        if (!approx_nb_samples && coded_samples != nb_samples)
             av_log(avctx, AV_LOG_WARNING, "mismatch in coded sample count\n");
         frame->nb_samples = nb_samples = coded_samples;
     }
@@ -681,6 +771,25 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             }
         }
 
+        if (avctx->bits_per_coded_sample != 4) {
+            int samples_per_block = ff_adpcm_ima_block_samples[avctx->bits_per_coded_sample - 2];
+            GetBitContext g;
+
+            ret = init_get_bits8(&g, gb.buffer, bytestream2_get_bytes_left(&gb));
+            if (ret < 0)
+                return ret;
+            for (n = 0; n < (nb_samples - 1) / samples_per_block; n++) {
+                for (i = 0; i < avctx->channels; i++) {
+                    cs = &c->status[i];
+                    samples = &samples_p[i][1 + n * samples_per_block];
+                    for (m = 0; m < samples_per_block; m++) {
+                        samples[m] = adpcm_ima_wav_expand_nibble(cs, &g,
+                                          avctx->bits_per_coded_sample);
+                    }
+                }
+            }
+            bytestream2_skip(&gb, avctx->block_align - avctx->channels * 4);
+        } else {
         for (n = 0; n < (nb_samples - 1) / 8; n++) {
             for (i = 0; i < avctx->channels; i++) {
                 cs = &c->status[i];
@@ -692,6 +801,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                 }
             }
         }
+        }
         break;
     case AV_CODEC_ID_ADPCM_4XM:
         for (i = 0; i < avctx->channels; i++)
@@ -770,7 +880,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                 return AVERROR_INVALIDDATA;
             }
         }
-        for (n = (nb_samples >> (1 - st)) - 1; n > 0; n--) {
+        for (n = (nb_samples - 1) >> (1 - st); n > 0; n--) {
             int v = bytestream2_get_byteu(&gb);
             *samples++ = adpcm_ima_expand_nibble(&c->status[0 ], v >> 4  , 3);
             *samples++ = adpcm_ima_expand_nibble(&c->status[st], v & 0x0F, 3);
@@ -835,6 +945,9 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             *samples++ = c->status[0].predictor + c->status[1].predictor;
             *samples++ = c->status[0].predictor - c->status[1].predictor;
         }
+
+        if ((bytestream2_tell(&gb) & 1))
+            bytestream2_skip(&gb, 1);
         break;
     }
     case AV_CODEC_ID_ADPCM_IMA_ISS:
@@ -871,6 +984,38 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             *samples++ = adpcm_ima_expand_nibble(&c->status[st], v & 0x0F, 3);
         }
         break;
+    case AV_CODEC_ID_ADPCM_IMA_OKI:
+        while (bytestream2_get_bytes_left(&gb) > 0) {
+            int v = bytestream2_get_byteu(&gb);
+            *samples++ = adpcm_ima_oki_expand_nibble(&c->status[0],  v >> 4  );
+            *samples++ = adpcm_ima_oki_expand_nibble(&c->status[st], v & 0x0F);
+        }
+        break;
+    case AV_CODEC_ID_ADPCM_IMA_RAD:
+        for (channel = 0; channel < avctx->channels; channel++) {
+            cs = &c->status[channel];
+            cs->step_index = sign_extend(bytestream2_get_le16u(&gb), 16);
+            cs->predictor  = sign_extend(bytestream2_get_le16u(&gb), 16);
+            if (cs->step_index > 88u){
+                av_log(avctx, AV_LOG_ERROR, "ERROR: step_index[%d] = %i\n",
+                       channel, cs->step_index);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+        for (n = 0; n < nb_samples / 2; n++) {
+            int byte[2];
+
+            byte[0] = bytestream2_get_byteu(&gb);
+            if (st)
+                byte[1] = bytestream2_get_byteu(&gb);
+            for(channel = 0; channel < avctx->channels; channel++) {
+                *samples++ = adpcm_ima_expand_nibble(&c->status[channel], byte[channel] & 0x0F, 3);
+            }
+            for(channel = 0; channel < avctx->channels; channel++) {
+                *samples++ = adpcm_ima_expand_nibble(&c->status[channel], byte[channel] >> 4  , 3);
+            }
+        }
+        break;
     case AV_CODEC_ID_ADPCM_IMA_WS:
         if (c->vqa_version == 3) {
             for (channel = 0; channel < avctx->channels; channel++) {
@@ -946,6 +1091,9 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
         /* Each EA ADPCM frame has a 12-byte header followed by 30-byte pieces,
            each coding 28 stereo samples. */
 
+        if(avctx->channels != 2)
+            return AVERROR_INVALIDDATA;
+
         current_left_sample   = sign_extend(bytestream2_get_le16u(&gb), 16);
         previous_left_sample  = sign_extend(bytestream2_get_le16u(&gb), 16);
         current_right_sample  = sign_extend(bytestream2_get_le16u(&gb), 16);
@@ -1131,16 +1279,9 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
         }
         break;
     case AV_CODEC_ID_ADPCM_IMA_AMV:
-    case AV_CODEC_ID_ADPCM_IMA_SMJPEG:
-        if (avctx->codec->id == AV_CODEC_ID_ADPCM_IMA_AMV) {
-            c->status[0].predictor = sign_extend(bytestream2_get_le16u(&gb), 16);
-            c->status[0].step_index = bytestream2_get_le16u(&gb);
-            bytestream2_skipu(&gb, 4);
-        } else {
-            c->status[0].predictor = sign_extend(bytestream2_get_be16u(&gb), 16);
-            c->status[0].step_index = bytestream2_get_byteu(&gb);
-            bytestream2_skipu(&gb, 1);
-        }
+        c->status[0].predictor = sign_extend(bytestream2_get_le16u(&gb), 16);
+        c->status[0].step_index = bytestream2_get_le16u(&gb);
+        bytestream2_skipu(&gb, 4);
         if (c->status[0].step_index > 88u) {
             av_log(avctx, AV_LOG_ERROR, "ERROR: step_index = %i\n",
                    c->status[0].step_index);
@@ -1148,18 +1289,29 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
         }
 
         for (n = nb_samples >> (1 - st); n > 0; n--) {
-            int hi, lo, v = bytestream2_get_byteu(&gb);
+            int v = bytestream2_get_byteu(&gb);
 
-            if (avctx->codec->id == AV_CODEC_ID_ADPCM_IMA_AMV) {
-                hi = v & 0x0F;
-                lo = v >> 4;
-            } else {
-                lo = v & 0x0F;
-                hi = v >> 4;
+            *samples++ = adpcm_ima_expand_nibble(&c->status[0], v >> 4, 3);
+            *samples++ = adpcm_ima_expand_nibble(&c->status[0], v & 0xf, 3);
+        }
+        break;
+    case AV_CODEC_ID_ADPCM_IMA_SMJPEG:
+        for (i = 0; i < avctx->channels; i++) {
+            c->status[i].predictor = sign_extend(bytestream2_get_be16u(&gb), 16);
+            c->status[i].step_index = bytestream2_get_byteu(&gb);
+            bytestream2_skipu(&gb, 1);
+            if (c->status[i].step_index > 88u) {
+                av_log(avctx, AV_LOG_ERROR, "ERROR: step_index = %i\n",
+                       c->status[i].step_index);
+                return AVERROR_INVALIDDATA;
             }
+        }
+
+        for (n = nb_samples >> (1 - st); n > 0; n--) {
+            int v = bytestream2_get_byteu(&gb);
 
-            *samples++ = adpcm_ima_expand_nibble(&c->status[0], lo, 3);
-            *samples++ = adpcm_ima_expand_nibble(&c->status[0], hi, 3);
+            *samples++ = adpcm_ima_qt_expand_nibble(&c->status[0 ], v >> 4, 3);
+            *samples++ = adpcm_ima_qt_expand_nibble(&c->status[st], v & 0xf, 3);
         }
         break;
     case AV_CODEC_ID_ADPCM_CT:
@@ -1189,7 +1341,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                                                        byte & 0x0F, 4, 0);
             }
         } else if (avctx->codec->id == AV_CODEC_ID_ADPCM_SBPRO_3) {
-            for (n = nb_samples / 3; n > 0; n--) {
+            for (n = (nb_samples<<st) / 3; n > 0; n--) {
                 int byte = bytestream2_get_byteu(&gb);
                 *samples++ = adpcm_sbpro_expand_nibble(&c->status[0],
                                                         byte >> 5        , 3, 0);
@@ -1223,26 +1375,104 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             *samples++ = adpcm_yamaha_expand_nibble(&c->status[st], v >> 4  );
         }
         break;
+    case AV_CODEC_ID_ADPCM_AFC:
+    {
+        int samples_per_block;
+        int blocks;
+
+        if (avctx->extradata && avctx->extradata_size == 1 && avctx->extradata[0]) {
+            samples_per_block = avctx->extradata[0] / 16;
+            blocks = nb_samples / avctx->extradata[0];
+        } else {
+            samples_per_block = nb_samples / 16;
+            blocks = 1;
+        }
+
+        for (m = 0; m < blocks; m++) {
+        for (channel = 0; channel < avctx->channels; channel++) {
+            int prev1 = c->status[channel].sample1;
+            int prev2 = c->status[channel].sample2;
+
+            samples = samples_p[channel] + m * 16;
+            /* Read in every sample for this channel.  */
+            for (i = 0; i < samples_per_block; i++) {
+                int byte = bytestream2_get_byteu(&gb);
+                int scale = 1 << (byte >> 4);
+                int index = byte & 0xf;
+                int factor1 = ff_adpcm_afc_coeffs[0][index];
+                int factor2 = ff_adpcm_afc_coeffs[1][index];
+
+                /* Decode 16 samples.  */
+                for (n = 0; n < 16; n++) {
+                    int32_t sampledat;
+
+                    if (n & 1) {
+                        sampledat = sign_extend(byte, 4);
+                    } else {
+                        byte = bytestream2_get_byteu(&gb);
+                        sampledat = sign_extend(byte >> 4, 4);
+                    }
+
+                    sampledat = ((prev1 * factor1 + prev2 * factor2) +
+                                 ((sampledat * scale) << 11)) >> 11;
+                    *samples = av_clip_int16(sampledat);
+                    prev2 = prev1;
+                    prev1 = *samples++;
+                }
+            }
+
+            c->status[channel].sample1 = prev1;
+            c->status[channel].sample2 = prev2;
+        }
+        }
+        bytestream2_seek(&gb, 0, SEEK_END);
+        break;
+    }
     case AV_CODEC_ID_ADPCM_THP:
+    case AV_CODEC_ID_ADPCM_THP_LE:
     {
-        int table[2][16];
-        int prev[2][2];
+        int table[14][16];
         int ch;
 
-        for (i = 0; i < 2; i++)
-            for (n = 0; n < 16; n++)
-                table[i][n] = sign_extend(bytestream2_get_be16u(&gb), 16);
+#define THP_GET16(g) \
+    sign_extend( \
+        avctx->codec->id == AV_CODEC_ID_ADPCM_THP_LE ? \
+        bytestream2_get_le16u(&(g)) : \
+        bytestream2_get_be16u(&(g)), 16)
+
+        if (avctx->extradata) {
+            GetByteContext tb;
+            if (avctx->extradata_size < 32 * avctx->channels) {
+                av_log(avctx, AV_LOG_ERROR, "Missing coeff table\n");
+                return AVERROR_INVALIDDATA;
+            }
 
-        /* Initialize the previous sample.  */
-        for (i = 0; i < 2; i++)
-            for (n = 0; n < 2; n++)
-                prev[i][n] = sign_extend(bytestream2_get_be16u(&gb), 16);
+            bytestream2_init(&tb, avctx->extradata, avctx->extradata_size);
+            for (i = 0; i < avctx->channels; i++)
+                for (n = 0; n < 16; n++)
+                    table[i][n] = THP_GET16(tb);
+        } else {
+            for (i = 0; i < avctx->channels; i++)
+                for (n = 0; n < 16; n++)
+                    table[i][n] = THP_GET16(gb);
+
+            if (!c->has_status) {
+                /* Initialize the previous sample.  */
+                for (i = 0; i < avctx->channels; i++) {
+                    c->status[i].sample1 = THP_GET16(gb);
+                    c->status[i].sample2 = THP_GET16(gb);
+                }
+                c->has_status = 1;
+            } else {
+                bytestream2_skip(&gb, avctx->channels * 4);
+            }
+        }
 
-        for (ch = 0; ch <= st; ch++) {
+        for (ch = 0; ch < avctx->channels; ch++) {
             samples = samples_p[ch];
 
             /* Read in every sample for this channel.  */
-            for (i = 0; i < nb_samples / 14; i++) {
+            for (i = 0; i < (nb_samples + 13) / 14; i++) {
                 int byte = bytestream2_get_byteu(&gb);
                 int index = (byte >> 4) & 7;
                 unsigned int exp = byte & 0x0F;
@@ -1250,7 +1480,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                 int factor2 = table[ch][index * 2 + 1];
 
                 /* Decode 14 samples.  */
-                for (n = 0; n < 14; n++) {
+                for (n = 0; n < 14 && (i * 14 + n < nb_samples); n++) {
                     int32_t sampledat;
 
                     if (n & 1) {
@@ -1260,30 +1490,94 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                         sampledat = sign_extend(byte >> 4, 4);
                     }
 
-                    sampledat = ((prev[ch][0]*factor1
-                                + prev[ch][1]*factor2) >> 11) + (sampledat << exp);
+                    sampledat = ((c->status[ch].sample1 * factor1
+                                + c->status[ch].sample2 * factor2) >> 11) + (sampledat << exp);
                     *samples = av_clip_int16(sampledat);
-                    prev[ch][1] = prev[ch][0];
-                    prev[ch][0] = *samples++;
+                    c->status[ch].sample2 = c->status[ch].sample1;
+                    c->status[ch].sample1 = *samples++;
                 }
             }
         }
         break;
     }
+    case AV_CODEC_ID_ADPCM_DTK:
+        for (channel = 0; channel < avctx->channels; channel++) {
+            samples = samples_p[channel];
+
+            /* Read in every sample for this channel.  */
+            for (i = 0; i < nb_samples / 28; i++) {
+                int byte, header;
+                if (channel)
+                    bytestream2_skipu(&gb, 1);
+                header = bytestream2_get_byteu(&gb);
+                bytestream2_skipu(&gb, 3 - channel);
+
+                /* Decode 28 samples.  */
+                for (n = 0; n < 28; n++) {
+                    int32_t sampledat, prev;
+
+                    switch (header >> 4) {
+                    case 1:
+                        prev = (c->status[channel].sample1 * 0x3c);
+                        break;
+                    case 2:
+                        prev = (c->status[channel].sample1 * 0x73) - (c->status[channel].sample2 * 0x34);
+                        break;
+                    case 3:
+                        prev = (c->status[channel].sample1 * 0x62) - (c->status[channel].sample2 * 0x37);
+                        break;
+                    default:
+                        prev = 0;
+                    }
+
+                    prev = av_clip_intp2((prev + 0x20) >> 6, 21);
+
+                    byte = bytestream2_get_byteu(&gb);
+                    if (!channel)
+                        sampledat = sign_extend(byte, 4);
+                    else
+                        sampledat = sign_extend(byte >> 4, 4);
+
+                    sampledat = (((sampledat << 12) >> (header & 0xf)) << 6) + prev;
+                    *samples++ = av_clip_int16(sampledat >> 6);
+                    c->status[channel].sample2 = c->status[channel].sample1;
+                    c->status[channel].sample1 = sampledat;
+                }
+            }
+            if (!channel)
+                bytestream2_seek(&gb, 0, SEEK_SET);
+        }
+        break;
 
     default:
         return -1;
     }
 
+    if (avpkt->size && bytestream2_tell(&gb) == 0) {
+        av_log(avctx, AV_LOG_ERROR, "Nothing consumed\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     *got_frame_ptr = 1;
 
+    if (avpkt->size < bytestream2_tell(&gb)) {
+        av_log(avctx, AV_LOG_ERROR, "Overread of %d < %d\n", avpkt->size, bytestream2_tell(&gb));
+        return avpkt->size;
+    }
+
     return bytestream2_tell(&gb);
 }
 
+static void adpcm_flush(AVCodecContext *avctx)
+{
+    ADPCMDecodeContext *c = avctx->priv_data;
+    c->has_status = 0;
+}
+
 
 static const enum AVSampleFormat sample_fmts_s16[]  = { AV_SAMPLE_FMT_S16,
                                                         AV_SAMPLE_FMT_NONE };
-static const enum AVSampleFormat sample_fmts_s16p[] = { AV_SAMPLE_FMT_S16,
+static const enum AVSampleFormat sample_fmts_s16p[] = { AV_SAMPLE_FMT_S16P,
                                                         AV_SAMPLE_FMT_NONE };
 static const enum AVSampleFormat sample_fmts_both[] = { AV_SAMPLE_FMT_S16,
                                                         AV_SAMPLE_FMT_S16P,
@@ -1298,13 +1592,16 @@ AVCodec ff_ ## name_ ## _decoder = {                        \
     .priv_data_size = sizeof(ADPCMDecodeContext),           \
     .init           = adpcm_decode_init,                    \
     .decode         = adpcm_decode_frame,                   \
+    .flush          = adpcm_flush,                          \
     .capabilities   = AV_CODEC_CAP_DR1,                     \
     .sample_fmts    = sample_fmts_,                         \
 }
 
 /* Note: Do not forget to add new entries to the Makefile as well. */
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_4XM,         sample_fmts_s16p, adpcm_4xm,         "ADPCM 4X Movie");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_AFC,         sample_fmts_s16p, adpcm_afc,         "ADPCM Nintendo Gamecube AFC");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_CT,          sample_fmts_s16,  adpcm_ct,          "ADPCM Creative Technology");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_DTK,         sample_fmts_s16p, adpcm_dtk,         "ADPCM Nintendo Gamecube DTK");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA,          sample_fmts_s16,  adpcm_ea,          "ADPCM Electronic Arts");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA_MAXIS_XA, sample_fmts_s16,  adpcm_ea_maxis_xa, "ADPCM Electronic Arts Maxis CDROM XA");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA_R1,       sample_fmts_s16p, adpcm_ea_r1,       "ADPCM Electronic Arts R1");
@@ -1318,7 +1615,9 @@ ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_DK4,     sample_fmts_s16,  adpcm_ima_dk4,
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_EA_EACS, sample_fmts_s16,  adpcm_ima_ea_eacs, "ADPCM IMA Electronic Arts EACS");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_EA_SEAD, sample_fmts_s16,  adpcm_ima_ea_sead, "ADPCM IMA Electronic Arts SEAD");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_ISS,     sample_fmts_s16,  adpcm_ima_iss,     "ADPCM IMA Funcom ISS");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_OKI,     sample_fmts_s16,  adpcm_ima_oki,     "ADPCM IMA Dialogic OKI");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_QT,      sample_fmts_s16p, adpcm_ima_qt,      "ADPCM IMA QuickTime");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_RAD,     sample_fmts_s16,  adpcm_ima_rad,     "ADPCM IMA Radical");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_SMJPEG,  sample_fmts_s16,  adpcm_ima_smjpeg,  "ADPCM IMA Loki SDL MJPEG");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_WAV,     sample_fmts_s16p, adpcm_ima_wav,     "ADPCM IMA WAV");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_WS,      sample_fmts_both, adpcm_ima_ws,      "ADPCM IMA Westwood");
@@ -1327,6 +1626,7 @@ ADPCM_DECODER(AV_CODEC_ID_ADPCM_SBPRO_2,     sample_fmts_s16,  adpcm_sbpro_2,
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_SBPRO_3,     sample_fmts_s16,  adpcm_sbpro_3,     "ADPCM Sound Blaster Pro 2.6-bit");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_SBPRO_4,     sample_fmts_s16,  adpcm_sbpro_4,     "ADPCM Sound Blaster Pro 4-bit");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_SWF,         sample_fmts_s16,  adpcm_swf,         "ADPCM Shockwave Flash");
-ADPCM_DECODER(AV_CODEC_ID_ADPCM_THP,         sample_fmts_s16p, adpcm_thp,         "ADPCM Nintendo Gamecube THP");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_THP_LE,      sample_fmts_s16p, adpcm_thp_le,      "ADPCM Nintendo THP (little-endian)");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_THP,         sample_fmts_s16p, adpcm_thp,         "ADPCM Nintendo THP");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_XA,          sample_fmts_s16p, adpcm_xa,          "ADPCM CDROM XA");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_YAMAHA,      sample_fmts_s16,  adpcm_yamaha,      "ADPCM Yamaha");
diff --git a/libavcodec/adpcm.h b/libavcodec/adpcm.h
index 16facb6d0f..fcbb70b3ca 100644
--- a/libavcodec/adpcm.h
+++ b/libavcodec/adpcm.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2001-2003 The ffmpeg Project
+ * Copyright (c) 2001-2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,8 +38,8 @@ typedef struct ADPCMChannelStatus {
     int prev_sample;
 
     /* MS version */
-    int16_t sample1;
-    int16_t sample2;
+    int sample1;
+    int sample2;
     int coeff1;
     int coeff2;
     int idelta;
diff --git a/libavcodec/adpcm_data.c b/libavcodec/adpcm_data.c
index 3bc5de23b3..9c38360a91 100644
--- a/libavcodec/adpcm_data.c
+++ b/libavcodec/adpcm_data.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2001-2003 The ffmpeg Project
+ * Copyright (c) 2001-2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,12 +27,33 @@
 
 /* ff_adpcm_step_table[] and ff_adpcm_index_table[] are from the ADPCM
    reference source */
-/* This is the index table: */
+static const int8_t adpcm_index_table2[4] = {
+    -1,  2,
+    -1,  2,
+};
+
+static const int8_t adpcm_index_table3[8] = {
+    -1, -1,  1,  2,
+    -1, -1,  1,  2,
+};
+
 const int8_t ff_adpcm_index_table[16] = {
     -1, -1, -1, -1, 2, 4, 6, 8,
     -1, -1, -1, -1, 2, 4, 6, 8,
 };
 
+static const int8_t adpcm_index_table5[32] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 4, 6, 8, 10, 13, 16,
+    -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 4, 6, 8, 10, 13, 16,
+};
+
+const int8_t * const ff_adpcm_index_tables[4] = {
+    &adpcm_index_table2[0],
+    &adpcm_index_table3[0],
+    &ff_adpcm_index_table[0],
+    &adpcm_index_table5[0],
+};
+
 /**
  * This is the step table. Note that many programs use slight deviations from
  * this table, but such deviations are negligible:
@@ -49,6 +70,14 @@ const int16_t ff_adpcm_step_table[89] = {
     15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794, 32767
 };
 
+const int16_t ff_adpcm_oki_step_table[49] = {
+     16,  17,  19,  21,   23,   25,   28,   31,   34,  37,
+     41,  45,  50,  55,   60,   66,   73,   80,   88,  97,
+    107, 118, 130, 143,  157,  173,  190,  209,  230, 253,
+    279, 307, 337, 371,  408,  449,  494,  544,  598, 658,
+    724, 796, 876, 963, 1060, 1166, 1282, 1411, 1552
+};
+
 /* These are for MS-ADPCM */
 /* ff_adpcm_AdaptationTable[], ff_adpcm_AdaptCoeff1[], and
    ff_adpcm_AdaptCoeff2[] are from libsndfile */
@@ -76,3 +105,8 @@ const int8_t ff_adpcm_yamaha_difflookup[] = {
      1,  3,  5,  7,  9,  11,  13,  15,
     -1, -3, -5, -7, -9, -11, -13, -15
 };
+
+const int16_t ff_adpcm_afc_coeffs[2][16] = {
+    { 0, 2048, 0, 1024, 4096, 3584, 3072, 4608, 4200, 4800, 5120, 2048, 1024, 64512, 64512, 63488 },
+    { 0, 0, 2048, 1024, 63488, 64000, 64512, 62976, 63288, 63236, 62464, 63488, 64512, 1024, 0, 0 }
+};
diff --git a/libavcodec/adpcm_data.h b/libavcodec/adpcm_data.h
index a46cb5bdec..b179d65f7a 100644
--- a/libavcodec/adpcm_data.h
+++ b/libavcodec/adpcm_data.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2001-2003 The ffmpeg Project
+ * Copyright (c) 2001-2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,12 +28,18 @@
 
 #include <stdint.h>
 
+static const uint8_t ff_adpcm_ima_block_sizes[4]   = {  4, 12, 4, 20 };
+static const uint8_t ff_adpcm_ima_block_samples[4] = { 16, 32, 8, 32 };
+
+extern const int8_t * const ff_adpcm_index_tables[4];
 extern const int8_t  ff_adpcm_index_table[16];
 extern const int16_t ff_adpcm_step_table[89];
+extern const int16_t ff_adpcm_oki_step_table[49];
 extern const int16_t ff_adpcm_AdaptationTable[];
 extern const uint8_t ff_adpcm_AdaptCoeff1[];
 extern const int8_t  ff_adpcm_AdaptCoeff2[];
 extern const int16_t ff_adpcm_yamaha_indexscale[];
 extern const int8_t  ff_adpcm_yamaha_difflookup[];
+extern const int16_t ff_adpcm_afc_coeffs[2][16];
 
 #endif /* AVCODEC_ADPCM_DATA_H */
diff --git a/libavcodec/adpcmenc.c b/libavcodec/adpcmenc.c
index 65e1f8e636..9ceea09452 100644
--- a/libavcodec/adpcmenc.c
+++ b/libavcodec/adpcmenc.c
@@ -1,29 +1,28 @@
 /*
- * Copyright (c) 2001-2003 The ffmpeg Project
+ * Copyright (c) 2001-2003 The FFmpeg Project
  *
  * first version by Francois Revol (revol@free.fr)
  * fringe ADPCM codecs (e.g., DK3, DK4, Westwood)
  *   by Mike Melanson (melanson@pcisys.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
-#include "get_bits.h"
 #include "put_bits.h"
 #include "bytestream.h"
 #include "adpcm.h"
@@ -59,6 +58,8 @@ typedef struct ADPCMEncodeContext {
 
 #define FREEZE_INTERVAL 128
 
+static av_cold int adpcm_encode_close(AVCodecContext *avctx);
+
 static av_cold int adpcm_encode_init(AVCodecContext *avctx)
 {
     ADPCMEncodeContext *s = avctx->priv_data;
@@ -100,6 +101,7 @@ static av_cold int adpcm_encode_init(AVCodecContext *avctx)
         /* seems frame_size isn't taken into account...
            have to buffer the samples :-( */
         avctx->block_align = BLKSIZE;
+        avctx->bits_per_coded_sample = 4;
         break;
     case AV_CODEC_ID_ADPCM_IMA_QT:
         avctx->frame_size  = 64;
@@ -108,8 +110,8 @@ static av_cold int adpcm_encode_init(AVCodecContext *avctx)
     case AV_CODEC_ID_ADPCM_MS:
         /* each 16 bits sample gives one nibble
            and we have 7 bytes per channel overhead */
-        avctx->frame_size = (BLKSIZE - 7 * avctx->channels) * 2 /
-                             avctx->channels + 2;
+        avctx->frame_size = (BLKSIZE - 7 * avctx->channels) * 2 / avctx->channels + 2;
+        avctx->bits_per_coded_sample = 4;
         avctx->block_align    = BLKSIZE;
         if (!(avctx->extradata = av_malloc(32 + AV_INPUT_BUFFER_PADDING_SIZE)))
             goto error;
@@ -144,10 +146,7 @@ static av_cold int adpcm_encode_init(AVCodecContext *avctx)
 
     return 0;
 error:
-    av_freep(&s->paths);
-    av_freep(&s->node_buf);
-    av_freep(&s->nodep_buf);
-    av_freep(&s->trellis_hash);
+    adpcm_encode_close(avctx);
     return ret;
 }
 
@@ -180,24 +179,27 @@ static inline uint8_t adpcm_ima_qt_compress_sample(ADPCMChannelStatus *c,
                                                    int16_t sample)
 {
     int delta  = sample - c->prev_sample;
-    int mask, step = ff_adpcm_step_table[c->step_index];
-    int diff   = step >> 3;
-    int nibble = 0;
+    int diff, step = ff_adpcm_step_table[c->step_index];
+    int nibble = 8*(delta < 0);
 
-    if (delta < 0) {
-        nibble = 8;
-        delta  = -delta;
-    }
+    delta= abs(delta);
+    diff = delta + (step >> 3);
 
-    for (mask = 4; mask;) {
-        if (delta >= step) {
-            nibble |= mask;
-            delta  -= step;
-            diff   += step;
-        }
-        step >>= 1;
-        mask >>= 1;
+    if (delta >= step) {
+        nibble |= 4;
+        delta  -= step;
+    }
+    step >>= 1;
+    if (delta >= step) {
+        nibble |= 2;
+        delta  -= step;
     }
+    step >>= 1;
+    if (delta >= step) {
+        nibble |= 1;
+        delta  -= step;
+    }
+    diff -= delta;
 
     if (nibble & 8)
         c->prev_sample -= diff;
@@ -225,7 +227,7 @@ static inline uint8_t adpcm_ms_compress_sample(ADPCMChannelStatus *c,
         bias = -c->idelta / 2;
 
     nibble = (nibble + bias) / c->idelta;
-    nibble = av_clip(nibble, -8, 7) & 0x0F;
+    nibble = av_clip_intp2(nibble, 3) & 0x0F;
 
     predictor += ((nibble & 0x08) ? (nibble - 0x10) : nibble) * c->idelta;
 
@@ -330,7 +332,7 @@ static void adpcm_compress_trellis(AVCodecContext *avctx,
                     uint8_t *h;\
                     dec_sample = av_clip_int16(dec_sample);\
                     d = sample - dec_sample;\
-                    ssd = nodes[j]->ssd + d*d;\
+                    ssd = nodes[j]->ssd + d*(unsigned)d;\
                     /* Check for wraparound, skip such samples completely. \
                      * Note, changing ssd to a 64 bit variable would be \
                      * simpler, avoiding this check, but it's slower on \
@@ -365,7 +367,7 @@ static void adpcm_compress_trellis(AVCodecContext *avctx,
                     *h = generation;\
                     u  = nodes_next[pos];\
                     if (!u) {\
-                        assert(pathn < FREEZE_INTERVAL << avctx->trellis);\
+                        av_assert1(pathn < FREEZE_INTERVAL << avctx->trellis);\
                         u = t++;\
                         nodes_next[pos] = u;\
                         u->path = pathn++;\
@@ -484,10 +486,8 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         pkt_size = (2 + avctx->channels * (22 + 4 * (frame->nb_samples - 1)) + 7) / 8;
     else
         pkt_size = avctx->block_align;
-    if ((ret = ff_alloc_packet(avpkt, pkt_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, pkt_size, 0)) < 0)
         return ret;
-    }
     dst = avpkt->data;
 
     switch(avctx->codec->id) {
@@ -509,7 +509,7 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
         /* stereo: 4 bytes (8 samples) for left, 4 bytes for right */
         if (avctx->trellis > 0) {
-            FF_ALLOC_OR_GOTO(avctx, buf, avctx->channels * blocks * 8, error);
+            FF_ALLOC_ARRAY_OR_GOTO(avctx, buf, avctx->channels, blocks * 8, error);
             for (ch = 0; ch < avctx->channels; ch++) {
                 adpcm_compress_trellis(avctx, &samples_p[ch][1],
                                        buf + ch * blocks * 8, &c->status[ch],
@@ -541,7 +541,7 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_ADPCM_IMA_QT:
     {
         PutBitContext pb;
-        init_put_bits(&pb, dst, pkt_size * 8);
+        init_put_bits(&pb, dst, pkt_size);
 
         for (ch = 0; ch < avctx->channels; ch++) {
             ADPCMChannelStatus *status = &c->status[ch];
@@ -571,7 +571,7 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_ADPCM_SWF:
     {
         PutBitContext pb;
-        init_put_bits(&pb, dst, pkt_size * 8);
+        init_put_bits(&pb, dst, pkt_size);
 
         n = frame->nb_samples - 1;
 
@@ -581,7 +581,7 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         // init the encoder state
         for (i = 0; i < avctx->channels; i++) {
             // clip step so it fits 6 bits
-            c->status[i].step_index = av_clip(c->status[i].step_index, 0, 63);
+            c->status[i].step_index = av_clip_uintp2(c->status[i].step_index, 6);
             put_sbits(&pb, 16, samples[i]);
             put_bits(&pb, 6, c->status[i].step_index);
             c->status[i].prev_sample = samples[i];
diff --git a/libavcodec/adx.c b/libavcodec/adx.c
index d941d7b89c..cd88b16660 100644
--- a/libavcodec/adx.c
+++ b/libavcodec/adx.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011  Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adx.h b/libavcodec/adx.h
index 9ae84dcec1..08f749a046 100644
--- a/libavcodec/adx.h
+++ b/libavcodec/adx.h
@@ -2,20 +2,20 @@
  * ADX ADPCM codecs
  * Copyright (c) 2001,2003 BERO
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adx_parser.c b/libavcodec/adx_parser.c
index 706e242c82..1fa718f694 100644
--- a/libavcodec/adx_parser.c
+++ b/libavcodec/adx_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011  Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adxdec.c b/libavcodec/adxdec.c
index dc587b2733..32cc0f005a 100644
--- a/libavcodec/adxdec.c
+++ b/libavcodec/adxdec.c
@@ -2,20 +2,20 @@
  * ADX ADPCM codecs
  * Copyright (c) 2001,2003 BERO
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -101,6 +101,7 @@ static int adx_decode_frame(AVCodecContext *avctx, void *data,
     int16_t **samples;
     int samples_offset;
     const uint8_t *buf  = avpkt->data;
+    const uint8_t *buf_end = buf + avpkt->size;
     int num_blocks, ch, ret;
 
     if (c->eof) {
@@ -141,16 +142,14 @@ static int adx_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = num_blocks * BLOCK_SAMPLES;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t **)frame->extended_data;
     samples_offset = 0;
 
     while (num_blocks--) {
         for (ch = 0; ch < c->channels; ch++) {
-            if (adx_decode(c, samples[ch], samples_offset, buf, ch)) {
+            if (buf_end - buf < BLOCK_SIZE || adx_decode(c, samples[ch], samples_offset, buf, ch)) {
                 c->eof = 1;
                 buf = avpkt->data + avpkt->size;
                 break;
@@ -158,9 +157,11 @@ static int adx_decode_frame(AVCodecContext *avctx, void *data,
             buf_size -= BLOCK_SIZE;
             buf      += BLOCK_SIZE;
         }
-        samples_offset += BLOCK_SAMPLES;
+        if (!c->eof)
+            samples_offset += BLOCK_SAMPLES;
     }
 
+    frame->nb_samples = samples_offset;
     *got_frame_ptr = 1;
 
     return buf - avpkt->data;
diff --git a/libavcodec/adxenc.c b/libavcodec/adxenc.c
index e730811744..f1ba5911b3 100644
--- a/libavcodec/adxenc.c
+++ b/libavcodec/adxenc.c
@@ -2,20 +2,20 @@
  * ADX ADPCM codecs
  * Copyright (c) 2001,2003 BERO
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,14 +43,12 @@ static void adx_encode(ADXContext *c, uint8_t *adx, const int16_t *wav,
     int s0, s1, s2, d;
     int max = 0;
     int min = 0;
-    int data[BLOCK_SAMPLES];
 
     s1 = prev->s1;
     s2 = prev->s2;
     for (i = 0, j = 0; j < 32; i += channels, j++) {
         s0 = wav[i];
         d = ((s0 << COEFF_BITS) - c->coeff[0] * s1 - c->coeff[1] * s2) >> COEFF_BITS;
-        data[j] = d;
         if (max < d)
             max = d;
         if (min > d)
@@ -58,10 +56,10 @@ static void adx_encode(ADXContext *c, uint8_t *adx, const int16_t *wav,
         s2 = s1;
         s1 = s0;
     }
-    prev->s1 = s1;
-    prev->s2 = s2;
 
     if (max == 0 && min == 0) {
+        prev->s1 = s1;
+        prev->s2 = s2;
         memset(adx, 0, BLOCK_SIZE);
         return;
     }
@@ -77,8 +75,23 @@ static void adx_encode(ADXContext *c, uint8_t *adx, const int16_t *wav,
     AV_WB16(adx, scale);
 
     init_put_bits(&pb, adx + 2, 16);
-    for (i = 0; i < BLOCK_SAMPLES; i++)
-        put_sbits(&pb, 4, av_clip(data[i] / scale, -8, 7));
+
+    s1 = prev->s1;
+    s2 = prev->s2;
+    for (i = 0, j = 0; j < 32; i += channels, j++) {
+        d = ((wav[i] << COEFF_BITS) - c->coeff[0] * s1 - c->coeff[1] * s2) >> COEFF_BITS;
+
+        d = av_clip_intp2(ROUNDED_DIV(d, scale), 3);
+
+        put_sbits(&pb, 4, d);
+
+        s0 = ((d << COEFF_BITS) * scale + c->coeff[0] * s1 + c->coeff[1] * s2) >> COEFF_BITS;
+        s2 = s1;
+        s1 = s0;
+    }
+    prev->s1 = s1;
+    prev->s2 = s2;
+
     flush_put_bits(&pb);
 }
 
@@ -133,10 +146,8 @@ static int adx_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int ch, out_size, ret;
 
     out_size = BLOCK_SIZE * avctx->channels + !c->header_parsed * HEADER_SIZE;
-    if ((ret = ff_alloc_packet(avpkt, out_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, out_size, 0)) < 0)
         return ret;
-    }
     dst = avpkt->data;
 
     if (!c->header_parsed) {
diff --git a/libavcodec/aic.c b/libavcodec/aic.c
index 9ea27a7ecd..5decc78744 100644
--- a/libavcodec/aic.c
+++ b/libavcodec/aic.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -132,7 +132,7 @@ static const uint8_t aic_c_ext_scan[192] = {
     177, 184, 176, 169, 162, 161, 168, 160,
 };
 
-static const uint8_t *aic_scan[NUM_BANDS] = {
+static const uint8_t * const aic_scan[NUM_BANDS] = {
     aic_y_scan, aic_c_scan, aic_y_ext_scan, aic_c_ext_scan
 };
 
@@ -152,6 +152,7 @@ typedef struct AICContext {
     int16_t        *data_ptr[NUM_BANDS];
 
     DECLARE_ALIGNED(16, int16_t, block)[64];
+    DECLARE_ALIGNED(16, uint8_t, quant_matrix)[64];
 } AICContext;
 
 static int aic_decode_header(AICContext *ctx, const uint8_t *src, int size)
@@ -203,7 +204,8 @@ static int aic_decode_coeffs(GetBitContext *gb, int16_t *dst,
     int has_skips, coeff_type, coeff_bits, skip_type, skip_bits;
     const int num_coeffs = aic_num_band_coeffs[band];
     const uint8_t *scan = aic_scan[band | force_chroma];
-    int mb, idx, val;
+    int mb, idx;
+    unsigned val;
 
     has_skips  = get_bits1(gb);
     coeff_type = get_bits1(gb);
@@ -217,14 +219,14 @@ static int aic_decode_coeffs(GetBitContext *gb, int16_t *dst,
             idx = -1;
             do {
                 GET_CODE(val, skip_type, skip_bits);
-                if (val < 0)
+                if (val >= 0x10000)
                     return AVERROR_INVALIDDATA;
                 idx += val + 1;
                 if (idx >= num_coeffs)
                     break;
                 GET_CODE(val, coeff_type, coeff_bits);
                 val++;
-                if (val >= 0x10000 || val < 0)
+                if (val >= 0x10000)
                     return AVERROR_INVALIDDATA;
                 dst[scan[idx]] = val;
             } while (idx < num_coeffs - 1);
@@ -234,7 +236,7 @@ static int aic_decode_coeffs(GetBitContext *gb, int16_t *dst,
         for (mb = 0; mb < slice_width; mb++) {
             for (idx = 0; idx < num_coeffs; idx++) {
                 GET_CODE(val, coeff_type, coeff_bits);
-                if (val >= 0x10000 || val < 0)
+                if (val >= 0x10000)
                     return AVERROR_INVALIDDATA;
                 dst[scan[idx]] = val;
             }
@@ -286,7 +288,7 @@ static void recombine_block_il(int16_t *dst, const uint8_t *scan,
     }
 }
 
-static void unquant_block(int16_t *block, int q)
+static void unquant_block(int16_t *block, int q, uint8_t *quant_matrix)
 {
     int i;
 
@@ -294,7 +296,7 @@ static void unquant_block(int16_t *block, int q)
         int val  = (uint16_t)block[i];
         int sign = val & 1;
 
-        block[i] = (((val >> 1) ^ -sign) * q * aic_quant_matrix[i] >> 4)
+        block[i] = (((val >> 1) ^ -sign) * q * quant_matrix[i] >> 4)
                    + sign;
     }
 }
@@ -335,7 +337,7 @@ static int aic_decode_slice(AICContext *ctx, int mb_x, int mb_y,
             else
                 recombine_block_il(ctx->block, ctx->scantable.permutated,
                                    &base_y, &ext_y, blk);
-            unquant_block(ctx->block, ctx->quant);
+            unquant_block(ctx->block, ctx->quant, ctx->quant_matrix);
             ctx->idsp.idct(ctx->block);
 
             if (!ctx->interlaced) {
@@ -352,7 +354,7 @@ static int aic_decode_slice(AICContext *ctx, int mb_x, int mb_y,
         for (blk = 0; blk < 2; blk++) {
             recombine_block(ctx->block, ctx->scantable.permutated,
                             &base_c, &ext_c);
-            unquant_block(ctx->block, ctx->quant);
+            unquant_block(ctx->block, ctx->quant, ctx->quant_matrix);
             ctx->idsp.idct(ctx->block);
             ctx->idsp.put_signed_pixels_clamped(ctx->block, C[blk],
                                                 ctx->frame->linesize[blk + 1]);
@@ -437,6 +439,8 @@ static av_cold int aic_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 64; i++)
         scan[i] = i;
     ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable, scan);
+    for (i = 0; i < 64; i++)
+        ctx->quant_matrix[ctx->idsp.idct_permutation[i]] = aic_quant_matrix[i];
 
     ctx->mb_width  = FFALIGN(avctx->width,  16) >> 4;
     ctx->mb_height = FFALIGN(avctx->height, 16) >> 4;
@@ -451,7 +455,7 @@ static av_cold int aic_decode_init(AVCodecContext *avctx)
         }
     }
 
-    ctx->slice_data = av_malloc(ctx->slice_width * AIC_BAND_COEFFS
+    ctx->slice_data = av_malloc_array(ctx->slice_width, AIC_BAND_COEFFS
                                 * sizeof(*ctx->slice_data));
     if (!ctx->slice_data) {
         av_log(avctx, AV_LOG_ERROR, "Error allocating slice buffer\n");
diff --git a/libavcodec/alac.c b/libavcodec/alac.c
index 72e8a5717b..146668ec26 100644
--- a/libavcodec/alac.c
+++ b/libavcodec/alac.c
@@ -2,20 +2,20 @@
  * ALAC (Apple Lossless Audio Codec) decoder
  * Copyright (c) 2005 David Hammerton
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,8 +36,8 @@
  *  8bit  compatible version   (0)
  *  8bit  sample size
  *  8bit  history mult         (40)
- *  8bit  initial history      (14)
- *  8bit  rice param limit     (10)
+ *  8bit  initial history      (10)
+ *  8bit  rice param limit     (14)
  *  8bit  channels
  * 16bit  maxRun               (255)
  * 32bit  max coded frame size (0 means unknown)
@@ -48,17 +48,21 @@
 #include <inttypes.h>
 
 #include "libavutil/channel_layout.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "bytestream.h"
 #include "internal.h"
+#include "thread.h"
 #include "unary.h"
 #include "mathops.h"
 #include "alac_data.h"
+#include "alacdsp.h"
 
 #define ALAC_EXTRADATA_SIZE 36
 
 typedef struct ALACContext {
+    AVClass *class;
     AVCodecContext *avctx;
     GetBitContext gb;
     int channels;
@@ -75,6 +79,11 @@ typedef struct ALACContext {
 
     int extra_bits;     /**< number of extra bits beyond 16-bit */
     int nb_samples;     /**< number of samples in the current frame */
+
+    int direct_output;
+    int extra_bit_bug;
+
+    ALACDSPContext dsp;
 } ALACContext;
 
 static inline unsigned int decode_scalar(GetBitContext *gb, int k, int bps)
@@ -99,7 +108,7 @@ static inline unsigned int decode_scalar(GetBitContext *gb, int k, int bps)
     return x;
 }
 
-static void rice_decompress(ALACContext *alac, int32_t *output_buffer,
+static int rice_decompress(ALACContext *alac, int32_t *output_buffer,
                             int nb_samples, int bps, int rice_history_mult)
 {
     int i;
@@ -110,6 +119,9 @@ static void rice_decompress(ALACContext *alac, int32_t *output_buffer,
         int k;
         unsigned int x;
 
+        if(get_bits_left(&alac->gb) <= 0)
+            return -1;
+
         /* calculate rice param and decode next value */
         k = av_log2((history >> 9) + 3);
         k = FFMIN(k, alac->rice_limit);
@@ -150,6 +162,7 @@ static void rice_decompress(ALACContext *alac, int32_t *output_buffer,
             history = 0;
         }
     }
+    return 0;
 }
 
 static inline int sign_only(int v)
@@ -186,7 +199,7 @@ static void lpc_prediction(int32_t *error_buffer, int32_t *buffer_out,
     }
 
     /* read warm-up samples */
-    for (i = 1; i <= lpc_order; i++)
+    for (i = 1; i <= lpc_order && i < nb_samples; i++)
         buffer_out[i] = sign_extend(buffer_out[i - 1] + error_buffer[i], bps);
 
     /* NOTE: 4 and 8 are very common cases that could be optimized. */
@@ -220,35 +233,6 @@ static void lpc_prediction(int32_t *error_buffer, int32_t *buffer_out,
     }
 }
 
-static void decorrelate_stereo(int32_t *buffer[2], int nb_samples,
-                               int decorr_shift, int decorr_left_weight)
-{
-    int i;
-
-    for (i = 0; i < nb_samples; i++) {
-        int32_t a, b;
-
-        a = buffer[0][i];
-        b = buffer[1][i];
-
-        a -= (b * decorr_left_weight) >> decorr_shift;
-        b += a;
-
-        buffer[0][i] = b;
-        buffer[1][i] = a;
-    }
-}
-
-static void append_extra_bits(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
-                              int extra_bits, int channels, int nb_samples)
-{
-    int i, ch;
-
-    for (ch = 0; ch < channels; ch++)
-        for (i = 0; i < nb_samples; i++)
-            buffer[ch][i] = (buffer[ch][i] << extra_bits) | extra_bits_buffer[ch][i];
-}
-
 static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
                           int channels)
 {
@@ -265,7 +249,7 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
 
     alac->extra_bits = get_bits(&alac->gb, 2) << 3;
     bps = alac->sample_size - alac->extra_bits + channels - 1;
-    if (bps > 32) {
+    if (bps > 32U) {
         av_log(avctx, AV_LOG_ERROR, "bps is unsupported: %d\n", bps);
         return AVERROR_PATCHWELCOME;
     }
@@ -283,19 +267,18 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
         return AVERROR_INVALIDDATA;
     }
     if (!alac->nb_samples) {
+        ThreadFrame tframe = { .f = frame };
         /* get output buffer */
         frame->nb_samples = output_samples;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
             return ret;
-        }
     } else if (output_samples != alac->nb_samples) {
         av_log(avctx, AV_LOG_ERROR, "sample count mismatch: %"PRIu32" != %d\n",
                output_samples, alac->nb_samples);
         return AVERROR_INVALIDDATA;
     }
     alac->nb_samples = output_samples;
-    if (alac->sample_size > 16) {
+    if (alac->direct_output) {
         for (ch = 0; ch < channels; ch++)
             alac->output_samples_buffer[ch] = (int32_t *)frame->extended_data[ch_index + ch];
     }
@@ -332,14 +315,18 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
 
         if (alac->extra_bits) {
             for (i = 0; i < alac->nb_samples; i++) {
+                if(get_bits_left(&alac->gb) <= 0)
+                    return -1;
                 for (ch = 0; ch < channels; ch++)
                     alac->extra_bits_buffer[ch][i] = get_bits(&alac->gb, alac->extra_bits);
             }
         }
         for (ch = 0; ch < channels; ch++) {
-            rice_decompress(alac, alac->predict_error_buffer[ch],
+            int ret=rice_decompress(alac, alac->predict_error_buffer[ch],
                             alac->nb_samples, bps,
                             rice_history_mult[ch] * alac->rice_history_mult / 4);
+            if(ret<0)
+                return ret;
 
             /* adaptive FIR filter */
             if (prediction_type[ch] == 15) {
@@ -364,6 +351,8 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
     } else {
         /* not compressed, easy case */
         for (i = 0; i < alac->nb_samples; i++) {
+            if(get_bits_left(&alac->gb) <= 0)
+                return -1;
             for (ch = 0; ch < channels; ch++) {
                 alac->output_samples_buffer[ch][i] =
                          get_sbits_long(&alac->gb, alac->sample_size);
@@ -374,14 +363,24 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
         decorr_left_weight = 0;
     }
 
-    if (channels == 2 && decorr_left_weight) {
-        decorrelate_stereo(alac->output_samples_buffer, alac->nb_samples,
-                           decorr_shift, decorr_left_weight);
-    }
+    if (channels == 2) {
+        if (alac->extra_bits && alac->extra_bit_bug) {
+            alac->dsp.append_extra_bits[1](alac->output_samples_buffer, alac->extra_bits_buffer,
+                                           alac->extra_bits, channels, alac->nb_samples);
+        }
 
-    if (alac->extra_bits) {
-        append_extra_bits(alac->output_samples_buffer, alac->extra_bits_buffer,
-                          alac->extra_bits, channels, alac->nb_samples);
+        if (decorr_left_weight) {
+            alac->dsp.decorrelate_stereo(alac->output_samples_buffer, alac->nb_samples,
+                                         decorr_shift, decorr_left_weight);
+        }
+
+        if (alac->extra_bits && !alac->extra_bit_bug) {
+            alac->dsp.append_extra_bits[1](alac->output_samples_buffer, alac->extra_bits_buffer,
+                                           alac->extra_bits, channels, alac->nb_samples);
+        }
+    } else if (alac->extra_bits) {
+        alac->dsp.append_extra_bits[0](alac->output_samples_buffer, alac->extra_bits_buffer,
+                                       alac->extra_bits, channels, alac->nb_samples);
     }
 
     switch(alac->sample_size) {
@@ -412,7 +411,8 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
     int channels;
     int ch, ret, got_end;
 
-    init_get_bits(&alac->gb, avpkt->data, avpkt->size * 8);
+    if ((ret = init_get_bits8(&alac->gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
 
     got_end = 0;
     alac->nb_samples = 0;
@@ -424,7 +424,7 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
             break;
         }
         if (element > TYPE_CPE && element != TYPE_LFE) {
-            av_log(avctx, AV_LOG_ERROR, "syntax element unsupported: %d", element);
+            av_log(avctx, AV_LOG_ERROR, "syntax element unsupported: %d\n", element);
             return AVERROR_PATCHWELCOME;
         }
 
@@ -453,7 +453,10 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
                avpkt->size * 8 - get_bits_count(&alac->gb));
     }
 
-    *got_frame_ptr = 1;
+    if (alac->channels == ch)
+        *got_frame_ptr = 1;
+    else
+        av_log(avctx, AV_LOG_WARNING, "Failed to decode all channels\n");
 
     return avpkt->size;
 }
@@ -465,7 +468,7 @@ static av_cold int alac_decode_close(AVCodecContext *avctx)
     int ch;
     for (ch = 0; ch < FFMIN(alac->channels, 2); ch++) {
         av_freep(&alac->predict_error_buffer[ch]);
-        if (alac->sample_size == 16)
+        if (!alac->direct_output)
             av_freep(&alac->output_samples_buffer[ch]);
         av_freep(&alac->extra_bits_buffer[ch]);
     }
@@ -478,11 +481,18 @@ static int allocate_buffers(ALACContext *alac)
     int ch;
     int buf_size = alac->max_samples_per_frame * sizeof(int32_t);
 
+    for (ch = 0; ch < 2; ch++) {
+        alac->predict_error_buffer[ch]  = NULL;
+        alac->output_samples_buffer[ch] = NULL;
+        alac->extra_bits_buffer[ch]     = NULL;
+    }
+
     for (ch = 0; ch < FFMIN(alac->channels, 2); ch++) {
         FF_ALLOC_OR_GOTO(alac->avctx, alac->predict_error_buffer[ch],
                          buf_size, buf_alloc_fail);
 
-        if (alac->sample_size == 16) {
+        alac->direct_output = alac->sample_size > 16;
+        if (!alac->direct_output) {
             FF_ALLOC_OR_GOTO(alac->avctx, alac->output_samples_buffer[ch],
                              buf_size, buf_alloc_fail);
         }
@@ -535,11 +545,11 @@ static av_cold int alac_decode_init(AVCodecContext * avctx)
 
     /* initialize from the extradata */
     if (alac->avctx->extradata_size < ALAC_EXTRADATA_SIZE) {
-        av_log(avctx, AV_LOG_ERROR, "alac: extradata is too small\n");
+        av_log(avctx, AV_LOG_ERROR, "extradata is too small\n");
         return AVERROR_INVALIDDATA;
     }
     if (alac_set_info(alac)) {
-        av_log(avctx, AV_LOG_ERROR, "alac: set_info failed\n");
+        av_log(avctx, AV_LOG_ERROR, "set_info failed\n");
         return -1;
     }
 
@@ -563,7 +573,7 @@ static av_cold int alac_decode_init(AVCodecContext * avctx)
         else
             avctx->channels = alac->channels;
     }
-    if (avctx->channels > ALAC_MAX_CHANNELS) {
+    if (avctx->channels > ALAC_MAX_CHANNELS || avctx->channels <= 0 ) {
         av_log(avctx, AV_LOG_ERROR, "Unsupported channel count: %d\n",
                avctx->channels);
         return AVERROR_PATCHWELCOME;
@@ -575,9 +585,34 @@ static av_cold int alac_decode_init(AVCodecContext * avctx)
         return ret;
     }
 
+    ff_alacdsp_init(&alac->dsp);
+
     return 0;
 }
 
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    ALACContext *alac = avctx->priv_data;
+    alac->avctx = avctx;
+    return allocate_buffers(alac);
+}
+#endif
+
+static const AVOption options[] = {
+    { "extra_bits_bug", "Force non-standard decoding process",
+      offsetof(ALACContext, extra_bit_bug), AV_OPT_TYPE_INT, { .i64 = 0 },
+      0, 1, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM },
+    { NULL },
+};
+
+static const AVClass alac_class = {
+    .class_name = "alac",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_alac_decoder = {
     .name           = "alac",
     .long_name      = NULL_IF_CONFIG_SMALL("ALAC (Apple Lossless Audio Codec)"),
@@ -587,5 +622,7 @@ AVCodec ff_alac_decoder = {
     .init           = alac_decode_init,
     .close          = alac_decode_close,
     .decode         = alac_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .priv_class     = &alac_class
 };
diff --git a/libavcodec/alac_data.c b/libavcodec/alac_data.c
index 9e131199b1..0bcb06c075 100644
--- a/libavcodec/alac_data.c
+++ b/libavcodec/alac_data.c
@@ -1,20 +1,20 @@
 /*
  * ALAC encoder and decoder common data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/alac_data.h b/libavcodec/alac_data.h
index ebb1f33a93..650d6dcd15 100644
--- a/libavcodec/alac_data.h
+++ b/libavcodec/alac_data.h
@@ -1,20 +1,20 @@
 /*
  * ALAC encoder and decoder common data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/alacdsp.c b/libavcodec/alacdsp.c
new file mode 100644
index 0000000000..608c0d8915
--- /dev/null
+++ b/libavcodec/alacdsp.c
@@ -0,0 +1,60 @@
+/*
+ * ALAC (Apple Lossless Audio Codec) decoder
+ * Copyright (c) 2005 David Hammerton
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "alacdsp.h"
+#include "config.h"
+
+static void decorrelate_stereo(int32_t *buffer[2], int nb_samples,
+                               int decorr_shift, int decorr_left_weight)
+{
+    int i;
+
+    for (i = 0; i < nb_samples; i++) {
+        int32_t a, b;
+
+        a = buffer[0][i];
+        b = buffer[1][i];
+
+        a -= (b * decorr_left_weight) >> decorr_shift;
+        b += a;
+
+        buffer[0][i] = b;
+        buffer[1][i] = a;
+    }
+}
+
+static void append_extra_bits(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+                              int extra_bits, int channels, int nb_samples)
+{
+    int i, ch;
+
+    for (ch = 0; ch < channels; ch++)
+        for (i = 0; i < nb_samples; i++)
+            buffer[ch][i] = (buffer[ch][i] << extra_bits) | extra_bits_buffer[ch][i];
+}
+
+av_cold void ff_alacdsp_init(ALACDSPContext *c)
+{
+    c->decorrelate_stereo   = decorrelate_stereo;
+    c->append_extra_bits[0] =
+    c->append_extra_bits[1] = append_extra_bits;
+}
diff --git a/libavcodec/alacdsp.h b/libavcodec/alacdsp.h
new file mode 100644
index 0000000000..a7acdbb56a
--- /dev/null
+++ b/libavcodec/alacdsp.h
@@ -0,0 +1,33 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALACDSP_H
+#define AVCODEC_ALACDSP_H
+
+#include <stdint.h>
+
+typedef struct ALACDSPContext {
+    void (*decorrelate_stereo)(int32_t *buffer[2], int nb_samples,
+                               int decorr_shift, int decorr_left_weight);
+    void (*append_extra_bits[2])(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+                                 int extra_bits, int channels, int nb_samples);
+} ALACDSPContext;
+
+void ff_alacdsp_init(ALACDSPContext *c);
+
+#endif /* AVCODEC_ALACDSP_H */
diff --git a/libavcodec/alacenc.c b/libavcodec/alacenc.c
index 8a94f81bb4..a87c373e25 100644
--- a/libavcodec/alacenc.c
+++ b/libavcodec/alacenc.c
@@ -2,20 +2,20 @@
  * ALAC audio encoder
  * Copyright (c) 2008  Jaikrishnan Menon <realityman@gmx.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -66,7 +66,7 @@ typedef struct AlacEncodeContext {
     int write_sample_size;
     int extra_bits;
     int32_t sample_buf[2][DEFAULT_FRAME_SIZE];
-    int32_t predictor_buf[DEFAULT_FRAME_SIZE];
+    int32_t predictor_buf[2][DEFAULT_FRAME_SIZE];
     int interlacing_shift;
     int interlacing_leftweight;
     PutBitContext pbctx;
@@ -253,13 +253,14 @@ static void alac_linear_predictor(AlacEncodeContext *s, int ch)
 {
     int i;
     AlacLPCContext lpc = s->lpc[ch];
+    int32_t *residual = s->predictor_buf[ch];
 
     if (lpc.lpc_order == 31) {
-        s->predictor_buf[0] = s->sample_buf[ch][0];
+        residual[0] = s->sample_buf[ch][0];
 
         for (i = 1; i < s->frame_size; i++) {
-            s->predictor_buf[i] = s->sample_buf[ch][i    ] -
-                                  s->sample_buf[ch][i - 1];
+            residual[i] = s->sample_buf[ch][i    ] -
+                          s->sample_buf[ch][i - 1];
         }
 
         return;
@@ -269,12 +270,11 @@ static void alac_linear_predictor(AlacEncodeContext *s, int ch)
 
     if (lpc.lpc_order > 0) {
         int32_t *samples  = s->sample_buf[ch];
-        int32_t *residual = s->predictor_buf;
 
         // generate warm-up samples
         residual[0] = samples[0];
         for (i = 1; i <= lpc.lpc_order; i++)
-            residual[i] = samples[i] - samples[i-1];
+            residual[i] = sign_extend(samples[i] - samples[i-1], s->write_sample_size);
 
         // perform lpc on remaining samples
         for (i = lpc.lpc_order + 1; i < s->frame_size; i++) {
@@ -313,11 +313,11 @@ static void alac_linear_predictor(AlacEncodeContext *s, int ch)
     }
 }
 
-static void alac_entropy_coder(AlacEncodeContext *s)
+static void alac_entropy_coder(AlacEncodeContext *s, int ch)
 {
     unsigned int history = s->rc.initial_history;
     int sign_modifier = 0, i, k;
-    int32_t *samples = s->predictor_buf;
+    int32_t *samples = s->predictor_buf[ch];
 
     for (i = 0; i < s->frame_size;) {
         int x;
@@ -394,6 +394,19 @@ static void write_element(AlacEncodeContext *s,
         init_sample_buffers(s, channels, samples);
         write_element_header(s, element, instance);
 
+        // extract extra bits if needed
+        if (s->extra_bits) {
+            uint32_t mask = (1 << s->extra_bits) - 1;
+            for (j = 0; j < channels; j++) {
+                int32_t *extra = s->predictor_buf[j];
+                int32_t *smp   = s->sample_buf[j];
+                for (i = 0; i < s->frame_size; i++) {
+                    extra[i] = smp[i] & mask;
+                    smp[i] >>= s->extra_bits;
+                }
+            }
+        }
+
         if (channels == 2)
             alac_stereo_decorrelation(s);
         else
@@ -416,11 +429,9 @@ static void write_element(AlacEncodeContext *s,
 
         // write extra bits if needed
         if (s->extra_bits) {
-            uint32_t mask = (1 << s->extra_bits) - 1;
             for (i = 0; i < s->frame_size; i++) {
                 for (j = 0; j < channels; j++) {
-                    put_bits(pb, s->extra_bits, s->sample_buf[j][i] & mask);
-                    s->sample_buf[j][i] >>= s->extra_bits;
+                    put_bits(pb, s->extra_bits, s->predictor_buf[j][i]);
                 }
             }
         }
@@ -432,10 +443,11 @@ static void write_element(AlacEncodeContext *s,
             // TODO: determine when this will actually help. for now it's not used.
             if (prediction_type == 15) {
                 // 2nd pass 1st order filter
+                int32_t *residual = s->predictor_buf[i];
                 for (j = s->frame_size - 1; j > 0; j--)
-                    s->predictor_buf[j] -= s->predictor_buf[j - 1];
+                    residual[j] -= residual[j - 1];
             }
-            alac_entropy_coder(s);
+            alac_entropy_coder(s, i);
         }
     }
 }
@@ -606,10 +618,8 @@ static int alac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     else
         max_frame_size = s->max_coded_frame_size;
 
-    if ((ret = ff_alloc_packet(avpkt, 2 * max_frame_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 2 * max_frame_size, 0)) < 0)
         return ret;
-    }
 
     /* use verbatim mode for compression_level 0 */
     if (s->compression_level) {
diff --git a/libavcodec/aliaspixdec.c b/libavcodec/aliaspixdec.c
index 8c1892490b..087b18fb91 100644
--- a/libavcodec/aliaspixdec.c
+++ b/libavcodec/aliaspixdec.c
@@ -2,20 +2,20 @@
  * Alias PIX image decoder
  * Copyright (C) 2014 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aliaspixenc.c b/libavcodec/aliaspixenc.c
index 63016afc63..a9ba00cd29 100644
--- a/libavcodec/aliaspixenc.c
+++ b/libavcodec/aliaspixenc.c
@@ -2,20 +2,20 @@
  * Alias PIX image encoder
  * Copyright (C) 2014 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -61,7 +61,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     length = ALIAS_HEADER_SIZE + 4 * width * height; // max possible
-    if ((ret = ff_alloc_packet(pkt, length)) < 0) {
+    if ((ret = ff_alloc_packet2(avctx, pkt, length, ALIAS_HEADER_SIZE + height*2)) < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", length);
         return ret;
     }
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 2d8474a7c4..3870cf6f12 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -2,20 +2,20 @@
  * Provide registration of all codecs, parsers and bitstream filters for libavcodec.
  * Copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -76,6 +76,7 @@ void avcodec_register_all(void)
     /* hardware accelerators */
     REGISTER_HWACCEL(H263_VAAPI,        h263_vaapi);
     REGISTER_HWACCEL(H263_VDPAU,        h263_vdpau);
+    REGISTER_HWACCEL(H263_VIDEOTOOLBOX, h263_videotoolbox);
     REGISTER_HWACCEL(H264_D3D11VA,      h264_d3d11va);
     REGISTER_HWACCEL(H264_DXVA2,        h264_dxva2);
     REGISTER_HWACCEL(H264_MMAL,         h264_mmal);
@@ -84,21 +85,30 @@ void avcodec_register_all(void)
     REGISTER_HWACCEL(H264_VDA,          h264_vda);
     REGISTER_HWACCEL(H264_VDA_OLD,      h264_vda_old);
     REGISTER_HWACCEL(H264_VDPAU,        h264_vdpau);
+    REGISTER_HWACCEL(H264_VIDEOTOOLBOX, h264_videotoolbox);
     REGISTER_HWACCEL(HEVC_D3D11VA,      hevc_d3d11va);
     REGISTER_HWACCEL(HEVC_DXVA2,        hevc_dxva2);
     REGISTER_HWACCEL(HEVC_QSV,          hevc_qsv);
+    REGISTER_HWACCEL(HEVC_VAAPI,        hevc_vaapi);
+    REGISTER_HWACCEL(HEVC_VDPAU,        hevc_vdpau);
+    REGISTER_HWACCEL(MPEG1_XVMC,        mpeg1_xvmc);
     REGISTER_HWACCEL(MPEG1_VDPAU,       mpeg1_vdpau);
+    REGISTER_HWACCEL(MPEG1_VIDEOTOOLBOX, mpeg1_videotoolbox);
+    REGISTER_HWACCEL(MPEG2_XVMC,        mpeg2_xvmc);
     REGISTER_HWACCEL(MPEG2_D3D11VA,     mpeg2_d3d11va);
     REGISTER_HWACCEL(MPEG2_DXVA2,       mpeg2_dxva2);
     REGISTER_HWACCEL(MPEG2_QSV,         mpeg2_qsv);
     REGISTER_HWACCEL(MPEG2_VAAPI,       mpeg2_vaapi);
     REGISTER_HWACCEL(MPEG2_VDPAU,       mpeg2_vdpau);
+    REGISTER_HWACCEL(MPEG2_VIDEOTOOLBOX, mpeg2_videotoolbox);
     REGISTER_HWACCEL(MPEG4_VAAPI,       mpeg4_vaapi);
     REGISTER_HWACCEL(MPEG4_VDPAU,       mpeg4_vdpau);
+    REGISTER_HWACCEL(MPEG4_VIDEOTOOLBOX, mpeg4_videotoolbox);
     REGISTER_HWACCEL(VC1_D3D11VA,       vc1_d3d11va);
     REGISTER_HWACCEL(VC1_DXVA2,         vc1_dxva2);
     REGISTER_HWACCEL(VC1_VAAPI,         vc1_vaapi);
     REGISTER_HWACCEL(VC1_VDPAU,         vc1_vdpau);
+    REGISTER_HWACCEL(VC1_QSV,           vc1_qsv);
     REGISTER_HWACCEL(WMV3_D3D11VA,      wmv3_d3d11va);
     REGISTER_HWACCEL(WMV3_DXVA2,        wmv3_dxva2);
     REGISTER_HWACCEL(WMV3_VAAPI,        wmv3_vaapi);
@@ -110,14 +120,19 @@ void avcodec_register_all(void)
     REGISTER_DECODER(AASC,              aasc);
     REGISTER_DECODER(AIC,               aic);
     REGISTER_ENCDEC (ALIAS_PIX,         alias_pix);
-    REGISTER_DECODER(AMV,               amv);
+    REGISTER_ENCDEC (AMV,               amv);
     REGISTER_DECODER(ANM,               anm);
     REGISTER_DECODER(ANSI,              ansi);
+    REGISTER_ENCDEC (APNG,              apng);
     REGISTER_ENCDEC (ASV1,              asv1);
     REGISTER_ENCDEC (ASV2,              asv2);
     REGISTER_DECODER(AURA,              aura);
     REGISTER_DECODER(AURA2,             aura2);
+    REGISTER_ENCDEC (AVRP,              avrp);
+    REGISTER_DECODER(AVRN,              avrn);
     REGISTER_DECODER(AVS,               avs);
+    REGISTER_ENCDEC (AVUI,              avui);
+    REGISTER_ENCDEC (AYUV,              ayuv);
     REGISTER_DECODER(BETHSOFTVID,       bethsoftvid);
     REGISTER_DECODER(BFI,               bfi);
     REGISTER_DECODER(BINK,              bink);
@@ -128,14 +143,16 @@ void avcodec_register_all(void)
     REGISTER_DECODER(CAVS,              cavs);
     REGISTER_DECODER(CDGRAPHICS,        cdgraphics);
     REGISTER_DECODER(CDXL,              cdxl);
-    REGISTER_DECODER(CINEPAK,           cinepak);
+    REGISTER_ENCDEC (CINEPAK,           cinepak);
     REGISTER_ENCDEC (CLJR,              cljr);
     REGISTER_DECODER(CLLC,              cllc);
     REGISTER_ENCDEC (COMFORTNOISE,      comfortnoise);
+    REGISTER_DECODER(CPIA,              cpia);
     REGISTER_DECODER(CSCD,              cscd);
     REGISTER_DECODER(CYUV,              cyuv);
     REGISTER_DECODER(DDS,               dds);
     REGISTER_DECODER(DFA,               dfa);
+    REGISTER_DECODER(DIRAC,             dirac);
     REGISTER_ENCDEC (DNXHD,             dnxhd);
     REGISTER_ENCDEC (DPX,               dpx);
     REGISTER_DECODER(DSICINVIDEO,       dsicinvideo);
@@ -158,7 +175,7 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (FFVHUFF,           ffvhuff);
     REGISTER_DECODER(FIC,               fic);
     REGISTER_ENCDEC (FLASHSV,           flashsv);
-    REGISTER_DECODER(FLASHSV2,          flashsv2);
+    REGISTER_ENCDEC (FLASHSV2,          flashsv2);
     REGISTER_DECODER(FLIC,              flic);
     REGISTER_ENCDEC (FLV,               flv);
     REGISTER_DECODER(FOURXM,            fourxm);
@@ -169,10 +186,15 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (H261,              h261);
     REGISTER_ENCDEC (H263,              h263);
     REGISTER_DECODER(H263I,             h263i);
-    REGISTER_ENCODER(H263P,             h263p);
+    REGISTER_ENCDEC (H263P,             h263p);
     REGISTER_DECODER(H264,              h264);
+    REGISTER_DECODER(H264_CRYSTALHD,    h264_crystalhd);
     REGISTER_DECODER(H264_MMAL,         h264_mmal);
     REGISTER_DECODER(H264_QSV,          h264_qsv);
+    REGISTER_DECODER(H264_VDA,          h264_vda);
+#if FF_API_VDPAU
+    REGISTER_DECODER(H264_VDPAU,        h264_vdpau);
+#endif
     REGISTER_ENCDEC (HAP,               hap);
     REGISTER_DECODER(HEVC,              hevc);
     REGISTER_DECODER(HEVC_QSV,          hevc_qsv);
@@ -188,7 +210,7 @@ void avcodec_register_all(void)
     REGISTER_DECODER(INDEO4,            indeo4);
     REGISTER_DECODER(INDEO5,            indeo5);
     REGISTER_DECODER(INTERPLAY_VIDEO,   interplay_video);
-    REGISTER_DECODER(JPEG2000,          jpeg2000);
+    REGISTER_ENCDEC (JPEG2000,          jpeg2000);
     REGISTER_ENCDEC (JPEGLS,            jpegls);
     REGISTER_DECODER(JV,                jv);
     REGISTER_DECODER(KGV1,              kgv1);
@@ -207,16 +229,27 @@ void avcodec_register_all(void)
 #endif /* FF_API_XVMC */
     REGISTER_ENCDEC (MPEG1VIDEO,        mpeg1video);
     REGISTER_ENCDEC (MPEG2VIDEO,        mpeg2video);
-    REGISTER_DECODER(MPEG2_QSV,         mpeg2_qsv);
     REGISTER_ENCDEC (MPEG4,             mpeg4);
+    REGISTER_DECODER(MPEG4_CRYSTALHD,   mpeg4_crystalhd);
+#if FF_API_VDPAU
+    REGISTER_DECODER(MPEG4_VDPAU,       mpeg4_vdpau);
+#endif
+    REGISTER_DECODER(MPEGVIDEO,         mpegvideo);
+#if FF_API_VDPAU
+    REGISTER_DECODER(MPEG_VDPAU,        mpeg_vdpau);
+    REGISTER_DECODER(MPEG1_VDPAU,       mpeg1_vdpau);
+#endif
+    REGISTER_DECODER(MPEG2_CRYSTALHD,   mpeg2_crystalhd);
+    REGISTER_DECODER(MPEG2_QSV,         mpeg2_qsv);
     REGISTER_DECODER(MSA1,              msa1);
+    REGISTER_DECODER(MSMPEG4_CRYSTALHD, msmpeg4_crystalhd);
     REGISTER_DECODER(MSMPEG4V1,         msmpeg4v1);
     REGISTER_ENCDEC (MSMPEG4V2,         msmpeg4v2);
     REGISTER_ENCDEC (MSMPEG4V3,         msmpeg4v3);
     REGISTER_DECODER(MSRLE,             msrle);
     REGISTER_DECODER(MSS1,              mss1);
     REGISTER_DECODER(MSS2,              mss2);
-    REGISTER_DECODER(MSVIDEO1,          msvideo1);
+    REGISTER_ENCDEC (MSVIDEO1,          msvideo1);
     REGISTER_DECODER(MSZH,              mszh);
     REGISTER_DECODER(MTS2,              mts2);
     REGISTER_DECODER(MVC1,              mvc1);
@@ -233,12 +266,15 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (PNG,               png);
     REGISTER_ENCDEC (PPM,               ppm);
     REGISTER_ENCDEC (PRORES,            prores);
+    REGISTER_ENCODER(PRORES_AW,         prores_aw);
+    REGISTER_ENCODER(PRORES_KS,         prores_ks);
+    REGISTER_DECODER(PRORES_LGPL,       prores_lgpl);
     REGISTER_DECODER(PTX,               ptx);
     REGISTER_DECODER(QDRAW,             qdraw);
     REGISTER_DECODER(QPEG,              qpeg);
     REGISTER_ENCDEC (QTRLE,             qtrle);
-    REGISTER_DECODER(R10K,              r10k);
-    REGISTER_DECODER(R210,              r210);
+    REGISTER_ENCDEC (R10K,              r10k);
+    REGISTER_ENCDEC (R210,              r210);
     REGISTER_ENCDEC (RAWVIDEO,          rawvideo);
     REGISTER_DECODER(RL2,               rl2);
     REGISTER_ENCDEC (ROQ,               roq);
@@ -247,18 +283,21 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (RV20,              rv20);
     REGISTER_DECODER(RV30,              rv30);
     REGISTER_DECODER(RV40,              rv40);
-    REGISTER_DECODER(S302M,             s302m);
+    REGISTER_ENCDEC (S302M,             s302m);
     REGISTER_DECODER(SANM,              sanm);
     REGISTER_DECODER(SCREENPRESSO,      screenpresso);
     REGISTER_ENCDEC (SGI,               sgi);
     REGISTER_DECODER(SGIRLE,            sgirle);
     REGISTER_DECODER(SMACKER,           smacker);
     REGISTER_DECODER(SMC,               smc);
+    REGISTER_DECODER(SMVJPEG,           smvjpeg);
+    REGISTER_ENCDEC (SNOW,              snow);
     REGISTER_DECODER(SP5X,              sp5x);
     REGISTER_ENCDEC (SUNRAST,           sunrast);
     REGISTER_ENCDEC (SVQ1,              svq1);
     REGISTER_DECODER(SVQ3,              svq3);
     REGISTER_ENCDEC (TARGA,             targa);
+    REGISTER_DECODER(TARGA_Y216,        targa_y216);
     REGISTER_DECODER(TDSC,              tdsc);
     REGISTER_DECODER(THEORA,            theora);
     REGISTER_DECODER(THP,               thp);
@@ -274,11 +313,18 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (UTVIDEO,           utvideo);
     REGISTER_ENCDEC (V210,              v210);
     REGISTER_DECODER(V210X,             v210x);
+    REGISTER_ENCDEC (V308,              v308);
+    REGISTER_ENCDEC (V408,              v408);
     REGISTER_ENCDEC (V410,              v410);
     REGISTER_DECODER(VB,                vb);
     REGISTER_DECODER(VBLE,              vble);
     REGISTER_DECODER(VC1,               vc1);
+    REGISTER_DECODER(VC1_CRYSTALHD,     vc1_crystalhd);
+#if FF_API_VDPAU
+    REGISTER_DECODER(VC1_VDPAU,         vc1_vdpau);
+#endif
     REGISTER_DECODER(VC1IMAGE,          vc1image);
+    REGISTER_DECODER(VC1_QSV,           vc1_qsv);
     REGISTER_DECODER(VCR1,              vcr1);
     REGISTER_DECODER(VMDVIDEO,          vmdvideo);
     REGISTER_DECODER(VMNC,              vmnc);
@@ -295,23 +341,32 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (WMV1,              wmv1);
     REGISTER_ENCDEC (WMV2,              wmv2);
     REGISTER_DECODER(WMV3,              wmv3);
+    REGISTER_DECODER(WMV3_CRYSTALHD,    wmv3_crystalhd);
+#if FF_API_VDPAU
+    REGISTER_DECODER(WMV3_VDPAU,        wmv3_vdpau);
+#endif
     REGISTER_DECODER(WMV3IMAGE,         wmv3image);
     REGISTER_DECODER(WNV1,              wnv1);
     REGISTER_DECODER(XAN_WC3,           xan_wc3);
     REGISTER_DECODER(XAN_WC4,           xan_wc4);
     REGISTER_ENCDEC (XBM,               xbm);
+    REGISTER_ENCDEC (XFACE,             xface);
     REGISTER_DECODER(XL,                xl);
     REGISTER_ENCDEC (XWD,               xwd);
+    REGISTER_ENCDEC (Y41P,              y41p);
     REGISTER_DECODER(YOP,               yop);
+    REGISTER_ENCDEC (YUV4,              yuv4);
+    REGISTER_DECODER(ZERO12V,           zero12v);
     REGISTER_DECODER(ZEROCODEC,         zerocodec);
     REGISTER_ENCDEC (ZLIB,              zlib);
     REGISTER_ENCDEC (ZMBV,              zmbv);
 
     /* audio codecs */
     REGISTER_ENCDEC (AAC,               aac);
+    REGISTER_DECODER(AAC_FIXED,         aac_fixed);
     REGISTER_DECODER(AAC_LATM,          aac_latm);
     REGISTER_ENCDEC (AC3,               ac3);
-    REGISTER_ENCODER(AC3_FIXED,         ac3_fixed);
+    REGISTER_ENCDEC (AC3_FIXED,         ac3_fixed);
     REGISTER_ENCDEC (ALAC,              alac);
     REGISTER_DECODER(ALS,               als);
     REGISTER_DECODER(AMRNB,             amrnb);
@@ -324,12 +379,19 @@ void avcodec_register_all(void)
     REGISTER_DECODER(BINKAUDIO_RDFT,    binkaudio_rdft);
     REGISTER_DECODER(BMV_AUDIO,         bmv_audio);
     REGISTER_DECODER(COOK,              cook);
-    REGISTER_DECODER(DCA,               dca);
+    REGISTER_ENCDEC (DCA,               dca);
+    REGISTER_DECODER(DSD_LSBF,          dsd_lsbf);
+    REGISTER_DECODER(DSD_MSBF,          dsd_msbf);
+    REGISTER_DECODER(DSD_LSBF_PLANAR,   dsd_lsbf_planar);
+    REGISTER_DECODER(DSD_MSBF_PLANAR,   dsd_msbf_planar);
     REGISTER_DECODER(DSICINAUDIO,       dsicinaudio);
     REGISTER_DECODER(DSS_SP,            dss_sp);
     REGISTER_ENCDEC (EAC3,              eac3);
+    REGISTER_DECODER(EVRC,              evrc);
+    REGISTER_DECODER(FFWAVESYNTH,       ffwavesynth);
     REGISTER_ENCDEC (FLAC,              flac);
-    REGISTER_DECODER(G723_1,            g723_1);
+    REGISTER_ENCDEC (G723_1,            g723_1);
+    REGISTER_DECODER(G729,              g729);
     REGISTER_DECODER(GSM,               gsm);
     REGISTER_DECODER(GSM_MS,            gsm_ms);
     REGISTER_DECODER(IAC,               iac);
@@ -342,6 +404,7 @@ void avcodec_register_all(void)
     REGISTER_DECODER(MP1FLOAT,          mp1float);
     REGISTER_ENCDEC (MP2,               mp2);
     REGISTER_DECODER(MP2FLOAT,          mp2float);
+    REGISTER_ENCODER(MP2FIXED,          mp2fixed);
     REGISTER_DECODER(MP3,               mp3);
     REGISTER_DECODER(MP3FLOAT,          mp3float);
     REGISTER_DECODER(MP3ADU,            mp3adu);
@@ -362,14 +425,16 @@ void avcodec_register_all(void)
     REGISTER_DECODER(SHORTEN,           shorten);
     REGISTER_DECODER(SIPR,              sipr);
     REGISTER_DECODER(SMACKAUD,          smackaud);
+    REGISTER_ENCDEC (SONIC,             sonic);
+    REGISTER_ENCODER(SONIC_LS,          sonic_ls);
     REGISTER_DECODER(TAK,               tak);
     REGISTER_DECODER(TRUEHD,            truehd);
     REGISTER_DECODER(TRUESPEECH,        truespeech);
-    REGISTER_DECODER(TTA,               tta);
+    REGISTER_ENCDEC (TTA,               tta);
     REGISTER_DECODER(TWINVQ,            twinvq);
     REGISTER_DECODER(VMDAUDIO,          vmdaudio);
     REGISTER_ENCDEC (VORBIS,            vorbis);
-    REGISTER_DECODER(WAVPACK,           wavpack);
+    REGISTER_ENCDEC (WAVPACK,           wavpack);
     REGISTER_DECODER(WMALOSSLESS,       wmalossless);
     REGISTER_DECODER(WMAPRO,            wmapro);
     REGISTER_ENCDEC (WMAV1,             wmav1);
@@ -388,18 +453,18 @@ void avcodec_register_all(void)
     REGISTER_DECODER(PCM_LXF,           pcm_lxf);
     REGISTER_ENCDEC (PCM_MULAW,         pcm_mulaw);
     REGISTER_ENCDEC (PCM_S8,            pcm_s8);
-    REGISTER_DECODER(PCM_S8_PLANAR,     pcm_s8_planar);
+    REGISTER_ENCDEC (PCM_S8_PLANAR,     pcm_s8_planar);
     REGISTER_ENCDEC (PCM_S16BE,         pcm_s16be);
-    REGISTER_DECODER(PCM_S16BE_PLANAR,  pcm_s16be_planar);
+    REGISTER_ENCDEC (PCM_S16BE_PLANAR,  pcm_s16be_planar);
     REGISTER_ENCDEC (PCM_S16LE,         pcm_s16le);
-    REGISTER_DECODER(PCM_S16LE_PLANAR,  pcm_s16le_planar);
+    REGISTER_ENCDEC (PCM_S16LE_PLANAR,  pcm_s16le_planar);
     REGISTER_ENCDEC (PCM_S24BE,         pcm_s24be);
     REGISTER_ENCDEC (PCM_S24DAUD,       pcm_s24daud);
     REGISTER_ENCDEC (PCM_S24LE,         pcm_s24le);
-    REGISTER_DECODER(PCM_S24LE_PLANAR,  pcm_s24le_planar);
+    REGISTER_ENCDEC (PCM_S24LE_PLANAR,  pcm_s24le_planar);
     REGISTER_ENCDEC (PCM_S32BE,         pcm_s32be);
     REGISTER_ENCDEC (PCM_S32LE,         pcm_s32le);
-    REGISTER_DECODER(PCM_S32LE_PLANAR,  pcm_s32le_planar);
+    REGISTER_ENCDEC (PCM_S32LE_PLANAR,  pcm_s32le_planar);
     REGISTER_ENCDEC (PCM_U8,            pcm_u8);
     REGISTER_ENCDEC (PCM_U16BE,         pcm_u16be);
     REGISTER_ENCDEC (PCM_U16LE,         pcm_u16le);
@@ -407,7 +472,7 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (PCM_U24LE,         pcm_u24le);
     REGISTER_ENCDEC (PCM_U32BE,         pcm_u32be);
     REGISTER_ENCDEC (PCM_U32LE,         pcm_u32le);
-    REGISTER_DECODER(PCM_ZORK ,         pcm_zork);
+    REGISTER_DECODER(PCM_ZORK,          pcm_zork);
 
     /* DPCM codecs */
     REGISTER_DECODER(INTERPLAY_DPCM,    interplay_dpcm);
@@ -418,7 +483,9 @@ void avcodec_register_all(void)
     /* ADPCM codecs */
     REGISTER_DECODER(ADPCM_4XM,         adpcm_4xm);
     REGISTER_ENCDEC (ADPCM_ADX,         adpcm_adx);
+    REGISTER_DECODER(ADPCM_AFC,         adpcm_afc);
     REGISTER_DECODER(ADPCM_CT,          adpcm_ct);
+    REGISTER_DECODER(ADPCM_DTK,         adpcm_dtk);
     REGISTER_DECODER(ADPCM_EA,          adpcm_ea);
     REGISTER_DECODER(ADPCM_EA_MAXIS_XA, adpcm_ea_maxis_xa);
     REGISTER_DECODER(ADPCM_EA_R1,       adpcm_ea_r1);
@@ -427,6 +494,7 @@ void avcodec_register_all(void)
     REGISTER_DECODER(ADPCM_EA_XAS,      adpcm_ea_xas);
     REGISTER_ENCDEC (ADPCM_G722,        adpcm_g722);
     REGISTER_ENCDEC (ADPCM_G726,        adpcm_g726);
+    REGISTER_DECODER(ADPCM_G726LE,      adpcm_g726le);
     REGISTER_DECODER(ADPCM_IMA_AMV,     adpcm_ima_amv);
     REGISTER_DECODER(ADPCM_IMA_APC,     adpcm_ima_apc);
     REGISTER_DECODER(ADPCM_IMA_DK3,     adpcm_ima_dk3);
@@ -434,7 +502,9 @@ void avcodec_register_all(void)
     REGISTER_DECODER(ADPCM_IMA_EA_EACS, adpcm_ima_ea_eacs);
     REGISTER_DECODER(ADPCM_IMA_EA_SEAD, adpcm_ima_ea_sead);
     REGISTER_DECODER(ADPCM_IMA_ISS,     adpcm_ima_iss);
+    REGISTER_DECODER(ADPCM_IMA_OKI,     adpcm_ima_oki);
     REGISTER_ENCDEC (ADPCM_IMA_QT,      adpcm_ima_qt);
+    REGISTER_DECODER(ADPCM_IMA_RAD,     adpcm_ima_rad);
     REGISTER_DECODER(ADPCM_IMA_SMJPEG,  adpcm_ima_smjpeg);
     REGISTER_ENCDEC (ADPCM_IMA_WAV,     adpcm_ima_wav);
     REGISTER_DECODER(ADPCM_IMA_WS,      adpcm_ima_ws);
@@ -444,19 +514,37 @@ void avcodec_register_all(void)
     REGISTER_DECODER(ADPCM_SBPRO_4,     adpcm_sbpro_4);
     REGISTER_ENCDEC (ADPCM_SWF,         adpcm_swf);
     REGISTER_DECODER(ADPCM_THP,         adpcm_thp);
+    REGISTER_DECODER(ADPCM_THP_LE,      adpcm_thp_le);
     REGISTER_DECODER(ADPCM_VIMA,        adpcm_vima);
     REGISTER_DECODER(ADPCM_XA,          adpcm_xa);
     REGISTER_ENCDEC (ADPCM_YAMAHA,      adpcm_yamaha);
 
     /* subtitles */
+    REGISTER_ENCDEC (SSA,               ssa);
     REGISTER_ENCDEC (ASS,               ass);
+    REGISTER_DECODER(CCAPTION,          ccaption);
     REGISTER_ENCDEC (DVBSUB,            dvbsub);
     REGISTER_ENCDEC (DVDSUB,            dvdsub);
+    REGISTER_DECODER(JACOSUB,           jacosub);
+    REGISTER_DECODER(MICRODVD,          microdvd);
+    REGISTER_ENCDEC (MOVTEXT,           movtext);
+    REGISTER_DECODER(MPL2,              mpl2);
     REGISTER_DECODER(PGSSUB,            pgssub);
-    REGISTER_DECODER(SRT,               srt);
+    REGISTER_DECODER(PJS,               pjs);
+    REGISTER_DECODER(REALTEXT,          realtext);
+    REGISTER_DECODER(SAMI,              sami);
+    REGISTER_ENCDEC (SRT,               srt);
+    REGISTER_DECODER(STL,               stl);
+    REGISTER_ENCDEC (SUBRIP,            subrip);
+    REGISTER_DECODER(SUBVIEWER,         subviewer);
+    REGISTER_DECODER(SUBVIEWER1,        subviewer1);
+    REGISTER_DECODER(TEXT,              text);
+    REGISTER_DECODER(VPLAYER,           vplayer);
+    REGISTER_ENCDEC (WEBVTT,            webvtt);
     REGISTER_ENCDEC (XSUB,              xsub);
 
     /* external libraries */
+    REGISTER_DECODER(LIBCELT,           libcelt);
     REGISTER_DECODER(LIBDCADEC,         libdcadec)
     REGISTER_ENCODER(LIBFAAC,           libfaac);
     REGISTER_ENCDEC (LIBFDK_AAC,        libfdk_aac);
@@ -469,29 +557,43 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (LIBOPENJPEG,       libopenjpeg);
     REGISTER_ENCDEC (LIBOPUS,           libopus);
     REGISTER_ENCDEC (LIBSCHROEDINGER,   libschroedinger);
+    REGISTER_ENCODER(LIBSHINE,          libshine);
     REGISTER_ENCDEC (LIBSPEEX,          libspeex);
+    REGISTER_DECODER(LIBSTAGEFRIGHT_H264, libstagefright_h264);
     REGISTER_ENCODER(LIBTHEORA,         libtheora);
     REGISTER_ENCODER(LIBTWOLAME,        libtwolame);
+    REGISTER_ENCDEC (LIBUTVIDEO,        libutvideo);
     REGISTER_ENCODER(LIBVO_AACENC,      libvo_aacenc);
     REGISTER_ENCODER(LIBVO_AMRWBENC,    libvo_amrwbenc);
-    REGISTER_ENCODER(LIBVORBIS,         libvorbis);
+    REGISTER_ENCDEC (LIBVORBIS,         libvorbis);
     REGISTER_ENCDEC (LIBVPX_VP8,        libvpx_vp8);
     REGISTER_ENCDEC (LIBVPX_VP9,        libvpx_vp9);
     REGISTER_ENCODER(LIBWAVPACK,        libwavpack);
+    REGISTER_ENCODER(LIBWEBP_ANIM,      libwebp_anim);  /* preferred over libwebp */
     REGISTER_ENCODER(LIBWEBP,           libwebp);
     REGISTER_ENCODER(LIBX262,           libx262);
     REGISTER_ENCODER(LIBX264,           libx264);
+    REGISTER_ENCODER(LIBX264RGB,        libx264rgb);
     REGISTER_ENCODER(LIBX265,           libx265);
     REGISTER_ENCODER(LIBXAVS,           libxavs);
     REGISTER_ENCODER(LIBXVID,           libxvid);
+    REGISTER_DECODER(LIBZVBI_TELETEXT,  libzvbi_teletext);
+    REGISTER_ENCODER(LIBAACPLUS,        libaacplus);
+
+    /* text */
+    REGISTER_DECODER(BINTEXT,           bintext);
+    REGISTER_DECODER(XBIN,              xbin);
+    REGISTER_DECODER(IDF,               idf);
 
     /* external libraries, that shouldn't be used by default if one of the
      * above is available */
     REGISTER_ENCODER(LIBOPENH264,       libopenh264);
-    REGISTER_ENCODER(H264_NVENC,        h264_nvenc);
     REGISTER_ENCODER(H264_QSV,          h264_qsv);
-    REGISTER_ENCODER(HEVC_NVENC,        hevc_nvenc);
+    REGISTER_ENCODER(NVENC,             nvenc);
+    REGISTER_ENCODER(NVENC_H264,        nvenc_h264);
+    REGISTER_ENCODER(NVENC_HEVC,        nvenc_hevc);
     REGISTER_ENCODER(HEVC_QSV,          hevc_qsv);
+    REGISTER_ENCODER(LIBKVAZAAR,        libkvazaar);
     REGISTER_ENCODER(MPEG2_QSV,         mpeg2_qsv);
 
     /* parsers */
@@ -508,7 +610,9 @@ void avcodec_register_all(void)
     REGISTER_PARSER(DPX,                dpx);
     REGISTER_PARSER(DVBSUB,             dvbsub);
     REGISTER_PARSER(DVDSUB,             dvdsub);
+    REGISTER_PARSER(DVD_NAV,            dvd_nav);
     REGISTER_PARSER(FLAC,               flac);
+    REGISTER_PARSER(G729,               g729);
     REGISTER_PARSER(GSM,                gsm);
     REGISTER_PARSER(H261,               h261);
     REGISTER_PARSER(H263,               h263);
@@ -529,6 +633,7 @@ void avcodec_register_all(void)
     REGISTER_PARSER(VORBIS,             vorbis);
     REGISTER_PARSER(VP3,                vp3);
     REGISTER_PARSER(VP8,                vp8);
+    REGISTER_PARSER(VP9,                vp9);
 
     /* bitstream filters */
     REGISTER_BSF(AAC_ADTSTOASC,         aac_adtstoasc);
@@ -539,6 +644,8 @@ void avcodec_register_all(void)
     REGISTER_BSF(IMX_DUMP_HEADER,       imx_dump_header);
     REGISTER_BSF(MJPEG2JPEG,            mjpeg2jpeg);
     REGISTER_BSF(MJPEGA_DUMP_HEADER,    mjpega_dump_header);
+    REGISTER_BSF(MP3_HEADER_DECOMPRESS, mp3_header_decompress);
+    REGISTER_BSF(MPEG4_UNPACK_BFRAMES,  mpeg4_unpack_bframes);
     REGISTER_BSF(MOV2TEXTSUB,           mov2textsub);
     REGISTER_BSF(NOISE,                 noise);
     REGISTER_BSF(REMOVE_EXTRADATA,      remove_extradata);
diff --git a/libavcodec/alpha/Makefile b/libavcodec/alpha/Makefile
new file mode 100644
index 0000000000..796d9762b3
--- /dev/null
+++ b/libavcodec/alpha/Makefile
@@ -0,0 +1,10 @@
+OBJS-$(CONFIG_BLOCKDSP)                 += alpha/blockdsp_alpha.o
+OBJS-$(CONFIG_ME_CMP)                   += alpha/me_cmp_alpha.o         \
+                                           alpha/me_cmp_mvi_asm.o
+OBJS-$(CONFIG_HPELDSP)                  += alpha/hpeldsp_alpha.o        \
+                                           alpha/hpeldsp_alpha_asm.o
+OBJS-$(CONFIG_IDCTDSP)                  += alpha/idctdsp_alpha.o        \
+                                           alpha/idctdsp_alpha_asm.o    \
+                                           alpha/simple_idct_alpha.o
+OBJS-$(CONFIG_MPEGVIDEO)                += alpha/mpegvideo_alpha.o
+OBJS-$(CONFIG_PIXBLOCKDSP)              += alpha/pixblockdsp_alpha.o
diff --git a/libavcodec/alpha/asm.h b/libavcodec/alpha/asm.h
new file mode 100644
index 0000000000..827721e777
--- /dev/null
+++ b/libavcodec/alpha/asm.h
@@ -0,0 +1,186 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALPHA_ASM_H
+#define AVCODEC_ALPHA_ASM_H
+
+#include <inttypes.h>
+
+#include "libavutil/common.h"
+
+#if AV_GCC_VERSION_AT_LEAST(2,96)
+# define likely(x)      __builtin_expect((x) != 0, 1)
+# define unlikely(x)    __builtin_expect((x) != 0, 0)
+#else
+# define likely(x)      (x)
+# define unlikely(x)    (x)
+#endif
+
+#define AMASK_BWX (1 << 0)
+#define AMASK_FIX (1 << 1)
+#define AMASK_CIX (1 << 2)
+#define AMASK_MVI (1 << 8)
+
+static inline uint64_t BYTE_VEC(uint64_t x)
+{
+    x |= x <<  8;
+    x |= x << 16;
+    x |= x << 32;
+    return x;
+}
+static inline uint64_t WORD_VEC(uint64_t x)
+{
+    x |= x << 16;
+    x |= x << 32;
+    return x;
+}
+
+#define sextw(x) ((int16_t) (x))
+
+#ifdef __GNUC__
+#define ldq(p)                                                  \
+    (((const union {                                            \
+        uint64_t __l;                                           \
+        __typeof__(*(p)) __s[sizeof (uint64_t) / sizeof *(p)];  \
+    } *) (p))->__l)
+#define ldl(p)                                                  \
+    (((const union {                                            \
+        int32_t __l;                                            \
+        __typeof__(*(p)) __s[sizeof (int32_t) / sizeof *(p)];   \
+    } *) (p))->__l)
+#define stq(l, p)                                                       \
+    do {                                                                \
+        (((union {                                                      \
+            uint64_t __l;                                               \
+            __typeof__(*(p)) __s[sizeof (uint64_t) / sizeof *(p)];      \
+        } *) (p))->__l) = l;                                            \
+    } while (0)
+#define stl(l, p)                                                       \
+    do {                                                                \
+        (((union {                                                      \
+            int32_t __l;                                                \
+            __typeof__(*(p)) __s[sizeof (int32_t) / sizeof *(p)];       \
+        } *) (p))->__l) = l;                                            \
+    } while (0)
+struct unaligned_long { uint64_t l; } __attribute__((packed));
+#define ldq_u(p)        (*(const uint64_t *) (((uint64_t) (p)) & ~7ul))
+#define uldq(a)         (((const struct unaligned_long *) (a))->l)
+
+#if AV_GCC_VERSION_AT_LEAST(3,3)
+#define prefetch(p)     __builtin_prefetch((p), 0, 1)
+#define prefetch_en(p)  __builtin_prefetch((p), 0, 0)
+#define prefetch_m(p)   __builtin_prefetch((p), 1, 1)
+#define prefetch_men(p) __builtin_prefetch((p), 1, 0)
+#define cmpbge          __builtin_alpha_cmpbge
+/* Avoid warnings.  */
+#define extql(a, b)     __builtin_alpha_extql(a, (uint64_t) (b))
+#define extwl(a, b)     __builtin_alpha_extwl(a, (uint64_t) (b))
+#define extqh(a, b)     __builtin_alpha_extqh(a, (uint64_t) (b))
+#define zap             __builtin_alpha_zap
+#define zapnot          __builtin_alpha_zapnot
+#define amask           __builtin_alpha_amask
+#define implver         __builtin_alpha_implver
+#define rpcc            __builtin_alpha_rpcc
+#else
+#define prefetch(p)     __asm__ volatile("ldl $31,%0"  : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_en(p)  __asm__ volatile("ldq $31,%0"  : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_m(p)   __asm__ volatile("lds $f31,%0" : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_men(p) __asm__ volatile("ldt $f31,%0" : : "m"(*(const char *) (p)) : "memory")
+#define cmpbge(a, b) ({ uint64_t __r; __asm__ ("cmpbge  %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define extql(a, b)  ({ uint64_t __r; __asm__ ("extql   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define extwl(a, b)  ({ uint64_t __r; __asm__ ("extwl   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define extqh(a, b)  ({ uint64_t __r; __asm__ ("extqh   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define zap(a, b)    ({ uint64_t __r; __asm__ ("zap     %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define zapnot(a, b) ({ uint64_t __r; __asm__ ("zapnot  %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define amask(a)     ({ uint64_t __r; __asm__ ("amask   %1,%0"      : "=r" (__r) : "rI"  (a));           __r; })
+#define implver()    ({ uint64_t __r; __asm__ ("implver %0"         : "=r" (__r));                       __r; })
+#define rpcc()       ({ uint64_t __r; __asm__ volatile ("rpcc %0"   : "=r" (__r));                       __r; })
+#endif
+#define wh64(p) __asm__ volatile("wh64 (%0)" : : "r"(p) : "memory")
+
+#if AV_GCC_VERSION_AT_LEAST(3,3) && defined(__alpha_max__)
+#define minub8  __builtin_alpha_minub8
+#define minsb8  __builtin_alpha_minsb8
+#define minuw4  __builtin_alpha_minuw4
+#define minsw4  __builtin_alpha_minsw4
+#define maxub8  __builtin_alpha_maxub8
+#define maxsb8  __builtin_alpha_maxsb8
+#define maxuw4  __builtin_alpha_maxuw4
+#define maxsw4  __builtin_alpha_maxsw4
+#define perr    __builtin_alpha_perr
+#define pklb    __builtin_alpha_pklb
+#define pkwb    __builtin_alpha_pkwb
+#define unpkbl  __builtin_alpha_unpkbl
+#define unpkbw  __builtin_alpha_unpkbw
+#else
+#define minub8(a, b) ({ uint64_t __r; __asm__ (".arch ev6; minub8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsb8(a, b) ({ uint64_t __r; __asm__ (".arch ev6; minsb8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minuw4(a, b) ({ uint64_t __r; __asm__ (".arch ev6; minuw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsw4(a, b) ({ uint64_t __r; __asm__ (".arch ev6; minsw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxub8(a, b) ({ uint64_t __r; __asm__ (".arch ev6; maxub8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsb8(a, b) ({ uint64_t __r; __asm__ (".arch ev6; maxsb8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxuw4(a, b) ({ uint64_t __r; __asm__ (".arch ev6; maxuw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsw4(a, b) ({ uint64_t __r; __asm__ (".arch ev6; maxsw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define perr(a, b)   ({ uint64_t __r; __asm__ (".arch ev6; perr    %r1,%r2,%0" : "=r" (__r) : "%rJ" (a), "rJ" (b)); __r; })
+#define pklb(a)      ({ uint64_t __r; __asm__ (".arch ev6; pklb    %r1,%0"     : "=r" (__r) : "rJ"  (a));           __r; })
+#define pkwb(a)      ({ uint64_t __r; __asm__ (".arch ev6; pkwb    %r1,%0"     : "=r" (__r) : "rJ"  (a));           __r; })
+#define unpkbl(a)    ({ uint64_t __r; __asm__ (".arch ev6; unpkbl  %r1,%0"     : "=r" (__r) : "rJ"  (a));           __r; })
+#define unpkbw(a)    ({ uint64_t __r; __asm__ (".arch ev6; unpkbw  %r1,%0"     : "=r" (__r) : "rJ"  (a));           __r; })
+#endif
+
+#elif defined(__DECC)           /* Digital/Compaq/hp "ccc" compiler */
+
+#include <c_asm.h>
+#define ldq(p) (*(const uint64_t *) (p))
+#define ldl(p) (*(const int32_t *)  (p))
+#define stq(l, p) do { *(uint64_t *) (p) = (l); } while (0)
+#define stl(l, p) do { *(int32_t *)  (p) = (l); } while (0)
+#define ldq_u(a)     asm ("ldq_u   %v0,0(%a0)", a)
+#define uldq(a)      (*(const __unaligned uint64_t *) (a))
+#define cmpbge(a, b) asm ("cmpbge  %a0,%a1,%v0", a, b)
+#define extql(a, b)  asm ("extql   %a0,%a1,%v0", a, b)
+#define extwl(a, b)  asm ("extwl   %a0,%a1,%v0", a, b)
+#define extqh(a, b)  asm ("extqh   %a0,%a1,%v0", a, b)
+#define zap(a, b)    asm ("zap     %a0,%a1,%v0", a, b)
+#define zapnot(a, b) asm ("zapnot  %a0,%a1,%v0", a, b)
+#define amask(a)     asm ("amask   %a0,%v0", a)
+#define implver()    asm ("implver %v0")
+#define rpcc()       asm ("rpcc           %v0")
+#define minub8(a, b) asm ("minub8  %a0,%a1,%v0", a, b)
+#define minsb8(a, b) asm ("minsb8  %a0,%a1,%v0", a, b)
+#define minuw4(a, b) asm ("minuw4  %a0,%a1,%v0", a, b)
+#define minsw4(a, b) asm ("minsw4  %a0,%a1,%v0", a, b)
+#define maxub8(a, b) asm ("maxub8  %a0,%a1,%v0", a, b)
+#define maxsb8(a, b) asm ("maxsb8  %a0,%a1,%v0", a, b)
+#define maxuw4(a, b) asm ("maxuw4  %a0,%a1,%v0", a, b)
+#define maxsw4(a, b) asm ("maxsw4  %a0,%a1,%v0", a, b)
+#define perr(a, b)   asm ("perr    %a0,%a1,%v0", a, b)
+#define pklb(a)      asm ("pklb    %a0,%v0", a)
+#define pkwb(a)      asm ("pkwb    %a0,%v0", a)
+#define unpkbl(a)    asm ("unpkbl  %a0,%v0", a)
+#define unpkbw(a)    asm ("unpkbw  %a0,%v0", a)
+#define wh64(a)      asm ("wh64    %a0", a)
+
+#else
+#error "Unknown compiler!"
+#endif
+
+#endif /* AVCODEC_ALPHA_ASM_H */
diff --git a/libavcodec/alpha/blockdsp_alpha.c b/libavcodec/alpha/blockdsp_alpha.c
new file mode 100644
index 0000000000..c6f0964607
--- /dev/null
+++ b/libavcodec/alpha/blockdsp_alpha.c
@@ -0,0 +1,49 @@
+/*
+ * Alpha optimised block operations
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/blockdsp.h"
+#include "asm.h"
+
+static void clear_blocks_axp(int16_t *blocks) {
+    uint64_t *p = (uint64_t *) blocks;
+    int n = sizeof(int16_t) * 6 * 64;
+
+    do {
+        p[0] = 0;
+        p[1] = 0;
+        p[2] = 0;
+        p[3] = 0;
+        p[4] = 0;
+        p[5] = 0;
+        p[6] = 0;
+        p[7] = 0;
+        p += 8;
+        n -= 8 * 8;
+    } while (n);
+}
+
+av_cold void ff_blockdsp_init_alpha(BlockDSPContext *c)
+{
+    c->clear_blocks = clear_blocks_axp;
+}
diff --git a/libavcodec/alpha/hpeldsp_alpha.c b/libavcodec/alpha/hpeldsp_alpha.c
new file mode 100644
index 0000000000..8d54807d8f
--- /dev/null
+++ b/libavcodec/alpha/hpeldsp_alpha.c
@@ -0,0 +1,213 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/hpeldsp.h"
+#include "hpeldsp_alpha.h"
+#include "asm.h"
+
+static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
+{
+    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+static inline uint64_t avg2(uint64_t a, uint64_t b)
+{
+    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+#if 0
+/* The XY2 routines basically utilize this scheme, but reuse parts in
+   each iteration.  */
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
+{
+    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
+                    + (l2 & BYTE_VEC(0x03))
+                    + (l3 & BYTE_VEC(0x03))
+                    + (l4 & BYTE_VEC(0x03))
+                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
+    return r1 + r2;
+}
+#endif
+
+#define OP(LOAD, STORE)                         \
+    do {                                        \
+        STORE(LOAD(pixels), block);             \
+        pixels += line_size;                    \
+        block += line_size;                     \
+    } while (--h)
+
+#define OP_X2(LOAD, STORE)                                      \
+    do {                                                        \
+        uint64_t pix1, pix2;                                    \
+                                                                \
+        pix1 = LOAD(pixels);                                    \
+        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
+        STORE(AVG2(pix1, pix2), block);                         \
+        pixels += line_size;                                    \
+        block += line_size;                                     \
+    } while (--h)
+
+#define OP_Y2(LOAD, STORE)                      \
+    do {                                        \
+        uint64_t pix = LOAD(pixels);            \
+        do {                                    \
+            uint64_t next_pix;                  \
+                                                \
+            pixels += line_size;                \
+            next_pix = LOAD(pixels);            \
+            STORE(AVG2(pix, next_pix), block);  \
+            block += line_size;                 \
+            pix = next_pix;                     \
+        } while (--h);                          \
+    } while (0)
+
+#define OP_XY2(LOAD, STORE)                                                 \
+    do {                                                                    \
+        uint64_t pix1 = LOAD(pixels);                                       \
+        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);           \
+        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
+                       + (pix2 & BYTE_VEC(0x03));                           \
+        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
+                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
+                                                                            \
+        do {                                                                \
+            uint64_t npix1, npix2;                                          \
+            uint64_t npix_l, npix_h;                                        \
+            uint64_t avg;                                                   \
+                                                                            \
+            pixels += line_size;                                            \
+            npix1 = LOAD(pixels);                                           \
+            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56);              \
+            npix_l = (npix1 & BYTE_VEC(0x03))                               \
+                   + (npix2 & BYTE_VEC(0x03));                              \
+            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
+                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
+            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
+                + pix_h + npix_h;                                           \
+            STORE(avg, block);                                              \
+                                                                            \
+            block += line_size;                                             \
+            pix_l = npix_l;                                                 \
+            pix_h = npix_h;                                                 \
+        } while (--h);                                                      \
+    } while (0)
+
+#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
+static void OPNAME ## _pixels ## SUFF ## _axp                               \
+        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
+         ptrdiff_t line_size, int h)                                        \
+{                                                                           \
+    if ((size_t) pixels & 0x7) {                                            \
+        OPKIND(uldq, STORE);                                                \
+    } else {                                                                \
+        OPKIND(ldq, STORE);                                                 \
+    }                                                                       \
+}                                                                           \
+                                                                            \
+static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
+        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
+         ptrdiff_t line_size, int h)                                        \
+{                                                                           \
+    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
+    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
+}
+
+#define PIXOP(OPNAME, STORE)                    \
+    MAKE_OP(OPNAME, ,     OP,     STORE)        \
+    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
+    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
+    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
+
+/* Rounding primitives.  */
+#define AVG2 avg2
+#define AVG4 avg4
+#define AVG4_ROUNDER BYTE_VEC(0x02)
+#define STORE(l, b) stq(l, b)
+PIXOP(put, STORE);
+
+#undef STORE
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(avg, STORE);
+
+/* Not rounding primitives.  */
+#undef AVG2
+#undef AVG4
+#undef AVG4_ROUNDER
+#undef STORE
+#define AVG2 avg2_no_rnd
+#define AVG4 avg4_no_rnd
+#define AVG4_ROUNDER BYTE_VEC(0x01)
+#define STORE(l, b) stq(l, b)
+PIXOP(put_no_rnd, STORE);
+
+#undef STORE
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(avg_no_rnd, STORE);
+
+static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
+                                 ptrdiff_t line_size, int h)
+{
+    put_pixels_axp_asm(block,     pixels,     line_size, h);
+    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
+}
+
+av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags)
+{
+    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
+    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
+    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
+    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
+
+    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
+    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
+    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
+    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
+
+    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
+    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
+    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
+
+    c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
+    c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
+    c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
+    c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
+
+    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
+    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
+    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
+    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
+
+    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
+    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
+    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
+    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
+
+    c->avg_pixels_tab[1][0] = avg_pixels_axp;
+    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
+    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
+    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
+}
diff --git a/libavcodec/alpha/hpeldsp_alpha.h b/libavcodec/alpha/hpeldsp_alpha.h
new file mode 100644
index 0000000000..985182c67b
--- /dev/null
+++ b/libavcodec/alpha/hpeldsp_alpha.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALPHA_HPELDSP_ALPHA_H
+#define AVCODEC_ALPHA_HPELDSP_ALPHA_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+
+#endif /* AVCODEC_ALPHA_HPELDSP_ALPHA_H */
diff --git a/libavcodec/alpha/hpeldsp_alpha_asm.S b/libavcodec/alpha/hpeldsp_alpha_asm.S
new file mode 100644
index 0000000000..df386c429e
--- /dev/null
+++ b/libavcodec/alpha/hpeldsp_alpha_asm.S
@@ -0,0 +1,125 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+
+
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/************************************************************************
+ * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+ *                         int line_size, int h)
+ */
+        .align 6
+        .globl put_pixels_axp_asm
+        .ent put_pixels_axp_asm
+put_pixels_axp_asm:
+        .frame sp, 0, ra
+        .prologue 0
+
+        and     a1, 7, t0
+        beq     t0, $aligned
+
+        .align 4
+$unaligned:
+        ldq_u   t0, 0(a1)
+        ldq_u   t1, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t2, 0(a1)
+        ldq_u   t3, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t4, 0(a1)
+        ldq_u   t5, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t6, 0(a1)
+        ldq_u   t7, 8(a1)
+        extql   t0, a1, t0
+        addq    a1, a2, a1
+
+        extqh   t1, a1, t1
+        addq    a0, a2, t8
+        extql   t2, a1, t2
+        addq    t8, a2, t9
+
+        extqh   t3, a1, t3
+        addq    t9, a2, ta
+        extql   t4, a1, t4
+        or      t0, t1, t0
+
+        extqh   t5, a1, t5
+        or      t2, t3, t2
+        extql   t6, a1, t6
+        or      t4, t5, t4
+
+        extqh   t7, a1, t7
+        or      t6, t7, t6
+        stq     t0, 0(a0)
+        stq     t2, 0(t8)
+
+        stq     t4, 0(t9)
+        subq    a3, 4, a3
+        stq     t6, 0(ta)
+        addq    ta, a2, a0
+
+        bne     a3, $unaligned
+        ret
+
+        .align 4
+$aligned:
+        ldq     t0, 0(a1)
+        addq    a1, a2, a1
+        ldq     t1, 0(a1)
+        addq    a1, a2, a1
+
+        ldq     t2, 0(a1)
+        addq    a1, a2, a1
+        ldq     t3, 0(a1)
+
+        addq    a0, a2, t4
+        addq    a1, a2, a1
+        addq    t4, a2, t5
+        subq    a3, 4, a3
+
+        stq     t0, 0(a0)
+        addq    t5, a2, t6
+        stq     t1, 0(t4)
+        addq    t6, a2, a0
+
+        stq     t2, 0(t5)
+        stq     t3, 0(t6)
+
+        bne     a3, $aligned
+        ret
+        .end put_pixels_axp_asm
diff --git a/libavcodec/alpha/idctdsp_alpha.c b/libavcodec/alpha/idctdsp_alpha.c
new file mode 100644
index 0000000000..1923ebbc5f
--- /dev/null
+++ b/libavcodec/alpha/idctdsp_alpha.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/idctdsp.h"
+#include "idctdsp_alpha.h"
+#include "asm.h"
+
+void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
+void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
+
+void (*put_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
+                                 ptrdiff_t line_size);
+void (*add_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
+                                 ptrdiff_t line_size);
+
+#if 0
+/* These functions were the base for the optimized assembler routines,
+   and remain here for documentation purposes.  */
+static void put_pixels_clamped_mvi(const int16_t *block, uint8_t *pixels,
+                                   ptrdiff_t line_size)
+{
+    int i = 8;
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+
+    do {
+        uint64_t shorts0, shorts1;
+
+        shorts0 = ldq(block);
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+        stl(pkwb(shorts0), pixels);
+
+        shorts1 = ldq(block + 4);
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--i);
+}
+
+void add_pixels_clamped_mvi(const int16_t *block, uint8_t *pixels,
+                            ptrdiff_t line_size)
+{
+    int h = 8;
+    /* Keep this function a leaf function by generating the constants
+       manually (mainly for the hack value ;-).  */
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+    uint64_t signmask  = zap(-1, 0x33);
+    signmask ^= signmask >> 1;  /* 0x8000800080008000 */
+
+    do {
+        uint64_t shorts0, pix0, signs0;
+        uint64_t shorts1, pix1, signs1;
+
+        shorts0 = ldq(block);
+        shorts1 = ldq(block + 4);
+
+        pix0    = unpkbw(ldl(pixels));
+        /* Signed subword add (MMX paddw).  */
+        signs0  = shorts0 & signmask;
+        shorts0 &= ~signmask;
+        shorts0 += pix0;
+        shorts0 ^= signs0;
+        /* Clamp. */
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+
+        /* Next 4.  */
+        pix1    = unpkbw(ldl(pixels + 4));
+        signs1  = shorts1 & signmask;
+        shorts1 &= ~signmask;
+        shorts1 += pix1;
+        shorts1 ^= signs1;
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+
+        stl(pkwb(shorts0), pixels);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--h);
+}
+#endif
+
+av_cold void ff_idctdsp_init_alpha(IDCTDSPContext *c, AVCodecContext *avctx,
+                                   unsigned high_bit_depth)
+{
+    /* amask clears all bits that correspond to present features.  */
+    if (amask(AMASK_MVI) == 0) {
+        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
+        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;
+    }
+
+    put_pixels_clamped_axp_p = c->put_pixels_clamped;
+    add_pixels_clamped_axp_p = c->add_pixels_clamped;
+
+    if (!high_bit_depth && !avctx->lowres &&
+        (avctx->idct_algo == FF_IDCT_AUTO ||
+         avctx->idct_algo == FF_IDCT_SIMPLEALPHA)) {
+        c->idct_put = ff_simple_idct_put_axp;
+        c->idct_add = ff_simple_idct_add_axp;
+        c->idct =     ff_simple_idct_axp;
+    }
+}
diff --git a/libavcodec/alpha/idctdsp_alpha.h b/libavcodec/alpha/idctdsp_alpha.h
new file mode 100644
index 0000000000..bf984950f2
--- /dev/null
+++ b/libavcodec/alpha/idctdsp_alpha.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALPHA_IDCTDSP_ALPHA_H
+#define AVCODEC_ALPHA_IDCTDSP_ALPHA_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+extern void (*put_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
+                                        ptrdiff_t line_size);
+extern void (*add_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
+                                        ptrdiff_t line_size);
+
+void ff_simple_idct_axp(int16_t *block);
+void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block);
+
+#endif /* AVCODEC_ALPHA_IDCTDSP_ALPHA_H */
diff --git a/libavcodec/alpha/idctdsp_alpha_asm.S b/libavcodec/alpha/idctdsp_alpha_asm.S
new file mode 100644
index 0000000000..f545df9e4f
--- /dev/null
+++ b/libavcodec/alpha/idctdsp_alpha_asm.S
@@ -0,0 +1,167 @@
+/*
+ * Alpha optimized IDCT-related routines
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/************************************************************************
+ * void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
+ *                                 ptrdiff_t line_size)
+ */
+        .align 6
+        .globl put_pixels_clamped_mvi_asm
+        .ent put_pixels_clamped_mvi_asm
+put_pixels_clamped_mvi_asm:
+        .frame sp, 0, ra
+        .prologue 0
+
+        lda     t8, -1
+        lda     t9, 8           # loop counter
+        zap     t8, 0xaa, t8    # 00ff00ff00ff00ff
+
+        .align 4
+1:      ldq     t0,  0(a0)
+        ldq     t1,  8(a0)
+        ldq     t2, 16(a0)
+        ldq     t3, 24(a0)
+
+        maxsw4  t0, zero, t0
+        subq    t9, 2, t9
+        maxsw4  t1, zero, t1
+        lda     a0, 32(a0)
+
+        maxsw4  t2, zero, t2
+        addq    a1, a2, ta
+        maxsw4  t3, zero, t3
+        minsw4  t0, t8, t0
+
+        minsw4  t1, t8, t1
+        minsw4  t2, t8, t2
+        minsw4  t3, t8, t3
+        pkwb    t0, t0
+
+        pkwb    t1, t1
+        pkwb    t2, t2
+        pkwb    t3, t3
+        stl     t0, 0(a1)
+
+        stl     t1, 4(a1)
+        addq    ta, a2, a1
+        stl     t2, 0(ta)
+        stl     t3, 4(ta)
+
+        bne     t9, 1b
+        ret
+        .end put_pixels_clamped_mvi_asm
+
+/************************************************************************
+ * void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
+ *                                 ptrdiff_t line_size)
+ */
+        .align 6
+        .globl add_pixels_clamped_mvi_asm
+        .ent add_pixels_clamped_mvi_asm
+add_pixels_clamped_mvi_asm:
+        .frame sp, 0, ra
+        .prologue 0
+
+        lda     t1, -1
+        lda     th, 8
+        zap     t1, 0x33, tg
+        nop
+
+        srl     tg, 1, t0
+        xor     tg, t0, tg      # 0x8000800080008000
+        zap     t1, 0xaa, tf    # 0x00ff00ff00ff00ff
+
+        .align 4
+1:      ldl     t1, 0(a1)       # pix0 (try to hit cache line soon)
+        ldl     t4, 4(a1)       # pix1
+        addq    a1, a2, te      # pixels += line_size
+        ldq     t0, 0(a0)       # shorts0
+
+        ldl     t7, 0(te)       # pix2 (try to hit cache line soon)
+        ldl     ta, 4(te)       # pix3
+        ldq     t3, 8(a0)       # shorts1
+        ldq     t6, 16(a0)      # shorts2
+
+        ldq     t9, 24(a0)      # shorts3
+        unpkbw  t1, t1          # 0 0 (quarter/op no.)
+        and     t0, tg, t2      # 0 1
+        unpkbw  t4, t4          # 1 0
+
+        bic     t0, tg, t0      # 0 2
+        unpkbw  t7, t7          # 2 0
+        and     t3, tg, t5      # 1 1
+        addq    t0, t1, t0      # 0 3
+
+        xor     t0, t2, t0      # 0 4
+        unpkbw  ta, ta          # 3 0
+        and     t6, tg, t8      # 2 1
+        maxsw4  t0, zero, t0    # 0 5
+
+        bic     t3, tg, t3      # 1 2
+        bic     t6, tg, t6      # 2 2
+        minsw4  t0, tf, t0      # 0 6
+        addq    t3, t4, t3      # 1 3
+
+        pkwb    t0, t0          # 0 7
+        xor     t3, t5, t3      # 1 4
+        maxsw4  t3, zero, t3    # 1 5
+        addq    t6, t7, t6      # 2 3
+
+        xor     t6, t8, t6      # 2 4
+        and     t9, tg, tb      # 3 1
+        minsw4  t3, tf, t3      # 1 6
+        bic     t9, tg, t9      # 3 2
+
+        maxsw4  t6, zero, t6    # 2 5
+        addq    t9, ta, t9      # 3 3
+        stl     t0, 0(a1)       # 0 8
+        minsw4  t6, tf, t6      # 2 6
+
+        xor     t9, tb, t9      # 3 4
+        maxsw4  t9, zero, t9    # 3 5
+        lda     a0, 32(a0)      # block += 16;
+        pkwb    t3, t3          # 1 7
+
+        minsw4  t9, tf, t9      # 3 6
+        subq    th, 2, th
+        pkwb    t6, t6          # 2 7
+        pkwb    t9, t9          # 3 7
+
+        stl     t3, 4(a1)       # 1 8
+        addq    te, a2, a1      # pixels += line_size
+        stl     t6, 0(te)       # 2 8
+        stl     t9, 4(te)       # 3 8
+
+        bne     th, 1b
+        ret
+        .end add_pixels_clamped_mvi_asm
diff --git a/libavcodec/alpha/me_cmp_alpha.c b/libavcodec/alpha/me_cmp_alpha.c
new file mode 100644
index 0000000000..8f360190f4
--- /dev/null
+++ b/libavcodec/alpha/me_cmp_alpha.c
@@ -0,0 +1,317 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/me_cmp.h"
+#include "asm.h"
+
+int pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);
+
+static inline uint64_t avg2(uint64_t a, uint64_t b)
+{
+    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
+{
+    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
+                    + (l2 & BYTE_VEC(0x03))
+                    + (l3 & BYTE_VEC(0x03))
+                    + (l4 & BYTE_VEC(0x03))
+                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
+    return r1 + r2;
+}
+
+static int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+{
+    int result = 0;
+
+    if ((size_t) pix2 & 0x7) {
+        /* works only when pix2 is actually unaligned */
+        do {                    /* do 8 pixel a time */
+            uint64_t p1, p2;
+
+            p1  = ldq(pix1);
+            p2  = uldq(pix2);
+            result += perr(p1, p2);
+
+            pix1 += line_size;
+            pix2 += line_size;
+        } while (--h);
+    } else {
+        do {
+            uint64_t p1, p2;
+
+            p1 = ldq(pix1);
+            p2 = ldq(pix2);
+            result += perr(p1, p2);
+
+            pix1 += line_size;
+            pix2 += line_size;
+        } while (--h);
+    }
+
+    return result;
+}
+
+#if 0                           /* now done in assembly */
+int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
+{
+    int result = 0;
+    int h = 16;
+
+    if ((size_t) pix2 & 0x7) {
+        /* works only when pix2 is actually unaligned */
+        do {                    /* do 16 pixel a time */
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t t;
+
+            p1_l  = ldq(pix1);
+            p1_r  = ldq(pix1 + 8);
+            t     = ldq_u(pix2 + 8);
+            p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
+            p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+    } else {
+        do {
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            p2_l = ldq(pix2);
+            p2_r = ldq(pix2 + 8);
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+    }
+
+    return result;
+}
+#endif
+
+static int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+{
+    int result = 0;
+    uint64_t disalign = (size_t) pix2 & 0x7;
+
+    switch (disalign) {
+    case 0:
+        do {
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t l, r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            l    = ldq(pix2);
+            r    = ldq(pix2 + 8);
+            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
+            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+        break;
+    case 7:
+        /* |.......l|lllllllr|rrrrrrr*|
+           This case is special because disalign1 would be 8, which
+           gets treated as 0 by extqh.  At least it is a bit faster
+           that way :)  */
+        do {
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t l, m, r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            l     = ldq_u(pix2);
+            m     = ldq_u(pix2 + 8);
+            r     = ldq_u(pix2 + 16);
+            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign), m);
+            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign), r);
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+        break;
+    default:
+        do {
+            uint64_t disalign1 = disalign + 1;
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t l, m, r;
+
+            p1_l  = ldq(pix1);
+            p1_r  = ldq(pix1 + 8);
+            l     = ldq_u(pix2);
+            m     = ldq_u(pix2 + 8);
+            r     = ldq_u(pix2 + 16);
+            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign),
+                         extql(l, disalign1) | extqh(m, disalign1));
+            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign),
+                         extql(m, disalign1) | extqh(r, disalign1));
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+        break;
+    }
+    return result;
+}
+
+static int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+{
+    int result = 0;
+
+    if ((size_t) pix2 & 0x7) {
+        uint64_t t, p2_l, p2_r;
+        t     = ldq_u(pix2 + 8);
+        p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
+        p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
+
+        do {
+            uint64_t p1_l, p1_r, np2_l, np2_r;
+            uint64_t t;
+
+            p1_l  = ldq(pix1);
+            p1_r  = ldq(pix1 + 8);
+            pix2 += line_size;
+            t     = ldq_u(pix2 + 8);
+            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
+            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
+
+            result += perr(p1_l, avg2(p2_l, np2_l))
+                    + perr(p1_r, avg2(p2_r, np2_r));
+
+            pix1 += line_size;
+            p2_l  = np2_l;
+            p2_r  = np2_r;
+
+        } while (--h);
+    } else {
+        uint64_t p2_l, p2_r;
+        p2_l = ldq(pix2);
+        p2_r = ldq(pix2 + 8);
+        do {
+            uint64_t p1_l, p1_r, np2_l, np2_r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            pix2 += line_size;
+            np2_l = ldq(pix2);
+            np2_r = ldq(pix2 + 8);
+
+            result += perr(p1_l, avg2(p2_l, np2_l))
+                    + perr(p1_r, avg2(p2_r, np2_r));
+
+            pix1 += line_size;
+            p2_l  = np2_l;
+            p2_r  = np2_r;
+        } while (--h);
+    }
+    return result;
+}
+
+static int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+{
+    int result = 0;
+
+    uint64_t p1_l, p1_r;
+    uint64_t p2_l, p2_r, p2_x;
+
+    p1_l = ldq(pix1);
+    p1_r = ldq(pix1 + 8);
+
+    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
+        p2_l = uldq(pix2);
+        p2_r = uldq(pix2 + 8);
+        p2_x = (uint64_t) pix2[16] << 56;
+    } else {
+        p2_l = ldq(pix2);
+        p2_r = ldq(pix2 + 8);
+        p2_x = ldq(pix2 + 16) << 56;
+    }
+
+    do {
+        uint64_t np1_l, np1_r;
+        uint64_t np2_l, np2_r, np2_x;
+
+        pix1 += line_size;
+        pix2 += line_size;
+
+        np1_l = ldq(pix1);
+        np1_r = ldq(pix1 + 8);
+
+        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
+            np2_l = uldq(pix2);
+            np2_r = uldq(pix2 + 8);
+            np2_x = (uint64_t) pix2[16] << 56;
+        } else {
+            np2_l = ldq(pix2);
+            np2_r = ldq(pix2 + 8);
+            np2_x = ldq(pix2 + 16) << 56;
+        }
+
+        result += perr(p1_l,
+                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
+                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
+                + perr(p1_r,
+                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
+                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));
+
+        p1_l = np1_l;
+        p1_r = np1_r;
+        p2_l = np2_l;
+        p2_r = np2_r;
+        p2_x = np2_x;
+    } while (--h);
+
+    return result;
+}
+
+av_cold void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx)
+{
+    /* amask clears all bits that correspond to present features.  */
+    if (amask(AMASK_MVI) == 0) {
+        c->sad[0]           = pix_abs16x16_mvi_asm;
+        c->sad[1]           = pix_abs8x8_mvi;
+        c->pix_abs[0][0]    = pix_abs16x16_mvi_asm;
+        c->pix_abs[1][0]    = pix_abs8x8_mvi;
+        c->pix_abs[0][1]    = pix_abs16x16_x2_mvi;
+        c->pix_abs[0][2]    = pix_abs16x16_y2_mvi;
+        c->pix_abs[0][3]    = pix_abs16x16_xy2_mvi;
+    }
+}
diff --git a/libavcodec/alpha/me_cmp_mvi_asm.S b/libavcodec/alpha/me_cmp_mvi_asm.S
new file mode 100644
index 0000000000..2399085bcb
--- /dev/null
+++ b/libavcodec/alpha/me_cmp_mvi_asm.S
@@ -0,0 +1,179 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "regdef.h"
+
+/* Some nicer register names.  */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: these overlap with the argument list and the return value */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/*****************************************************************************
+ * int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size)
+ *
+ * This code is written with a pca56 in mind. For ev6, one should
+ * really take the increased latency of 3 cycles for MVI instructions
+ * into account.
+ *
+ * It is important to keep the loading and first use of a register as
+ * far apart as possible, because if a register is accessed before it
+ * has been fetched from memory, the CPU will stall.
+ */
+        .align 4
+        .globl pix_abs16x16_mvi_asm
+        .ent pix_abs16x16_mvi_asm
+pix_abs16x16_mvi_asm:
+        .frame sp, 0, ra, 0
+        .prologue 0
+
+        and     a2, 7, t0
+        clr     v0
+        beq     t0, $aligned
+        .align 4
+$unaligned:
+        /* Registers:
+           line 0:
+           t0:  left_u -> left lo -> left
+           t1:  mid
+           t2:  right_u -> right hi -> right
+           t3:  ref left
+           t4:  ref right
+           line 1:
+           t5:  left_u -> left lo -> left
+           t6:  mid
+           t7:  right_u -> right hi -> right
+           t8:  ref left
+           t9:  ref right
+           temp:
+           ta:  left hi
+           tb:  right lo
+           tc:  error left
+           td:  error right  */
+
+        /* load line 0 */
+        ldq_u   t0, 0(a2)       # left_u
+        ldq_u   t1, 8(a2)       # mid
+        ldq_u   t2, 16(a2)      # right_u
+        ldq     t3, 0(a1)       # ref left
+        ldq     t4, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        addq    a2, a3, a2      # pix2
+        /* load line 1 */
+        ldq_u   t5, 0(a2)       # left_u
+        ldq_u   t6, 8(a2)       # mid
+        ldq_u   t7, 16(a2)      # right_u
+        ldq     t8, 0(a1)       # ref left
+        ldq     t9, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        addq    a2, a3, a2      # pix2
+        /* calc line 0 */
+        extql   t0, a2, t0      # left lo
+        extqh   t1, a2, ta      # left hi
+        extql   t1, a2, tb      # right lo
+        or      t0, ta, t0      # left
+        extqh   t2, a2, t2      # right hi
+        perr    t3, t0, tc      # error left
+        or      t2, tb, t2      # right
+        perr    t4, t2, td      # error right
+        addq    v0, tc, v0      # add error left
+        addq    v0, td, v0      # add error left
+        /* calc line 1 */
+        extql   t5, a2, t5      # left lo
+        extqh   t6, a2, ta      # left hi
+        extql   t6, a2, tb      # right lo
+        or      t5, ta, t5      # left
+        extqh   t7, a2, t7      # right hi
+        perr    t8, t5, tc      # error left
+        or      t7, tb, t7      # right
+        perr    t9, t7, td      # error right
+        addq    v0, tc, v0      # add error left
+        addq    v0, td, v0      # add error left
+        /* loop */
+        subq    a4,  2, a4      # h -= 2
+        bne     a4, $unaligned
+        ret
+
+        .align 4
+$aligned:
+        /* load line 0 */
+        ldq     t0, 0(a2)       # left
+        ldq     t1, 8(a2)       # right
+        addq    a2, a3, a2      # pix2
+        ldq     t2, 0(a1)       # ref left
+        ldq     t3, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        /* load line 1 */
+        ldq     t4, 0(a2)       # left
+        ldq     t5, 8(a2)       # right
+        addq    a2, a3, a2      # pix2
+        ldq     t6, 0(a1)       # ref left
+        ldq     t7, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        /* load line 2 */
+        ldq     t8, 0(a2)       # left
+        ldq     t9, 8(a2)       # right
+        addq    a2, a3, a2      # pix2
+        ldq     ta, 0(a1)       # ref left
+        ldq     tb, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        /* load line 3 */
+        ldq     tc, 0(a2)       # left
+        ldq     td, 8(a2)       # right
+        addq    a2, a3, a2      # pix2
+        ldq     te, 0(a1)       # ref left
+        ldq     a0, 8(a1)       # ref right
+        /* calc line 0 */
+        perr    t0, t2, t0      # error left
+        addq    a1, a3, a1      # pix1
+        perr    t1, t3, t1      # error right
+        addq    v0, t0, v0      # add error left
+        /* calc line 1 */
+        perr    t4, t6, t0      # error left
+        addq    v0, t1, v0      # add error right
+        perr    t5, t7, t1      # error right
+        addq    v0, t0, v0      # add error left
+        /* calc line 2 */
+        perr    t8, ta, t0      # error left
+        addq    v0, t1, v0      # add error right
+        perr    t9, tb, t1      # error right
+        addq    v0, t0, v0      # add error left
+        /* calc line 3 */
+        perr    tc, te, t0      # error left
+        addq    v0, t1, v0      # add error right
+        perr    td, a0, t1      # error right
+        addq    v0, t0, v0      # add error left
+        addq    v0, t1, v0      # add error right
+        /* loop */
+        subq    a4,  4, a4      # h -= 4
+        bne     a4, $aligned
+        ret
+        .end pix_abs16x16_mvi_asm
diff --git a/libavcodec/alpha/mpegvideo_alpha.c b/libavcodec/alpha/mpegvideo_alpha.c
new file mode 100644
index 0000000000..126fe264a1
--- /dev/null
+++ b/libavcodec/alpha/mpegvideo_alpha.c
@@ -0,0 +1,110 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/mpegvideo.h"
+#include "asm.h"
+
+static void dct_unquantize_h263_axp(int16_t *block, int n_coeffs,
+                                    uint64_t qscale, uint64_t qadd)
+{
+    uint64_t qmul = qscale << 1;
+    uint64_t correction = WORD_VEC(qmul * 255 >> 8);
+    int i;
+
+    qadd = WORD_VEC(qadd);
+
+    for(i = 0; i <= n_coeffs; block += 4, i += 4) {
+        uint64_t levels, negmask, zeros, add, sub;
+
+        levels = ldq(block);
+        if (levels == 0)
+            continue;
+
+#ifdef __alpha_max__
+        /* I don't think the speed difference justifies runtime
+           detection.  */
+        negmask = maxsw4(levels, -1); /* negative -> ffff (-1) */
+        negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */
+#else
+        negmask = cmpbge(WORD_VEC(0x7fff), levels);
+        negmask &= (negmask >> 1) | (1 << 7);
+        negmask = zap(-1, negmask);
+#endif
+
+        zeros = cmpbge(0, levels);
+        zeros &= zeros >> 1;
+        /* zeros |= zeros << 1 is not needed since qadd <= 255, so
+           zapping the lower byte suffices.  */
+
+        levels *= qmul;
+        levels -= correction & (negmask << 16);
+
+        add = qadd & ~negmask;
+        sub = qadd &  negmask;
+        /* Set qadd to 0 for levels == 0.  */
+        add = zap(add, zeros);
+        levels += add;
+        levels -= sub;
+
+        stq(levels, block);
+    }
+}
+
+static void dct_unquantize_h263_intra_axp(MpegEncContext *s, int16_t *block,
+                                    int n, int qscale)
+{
+    int n_coeffs;
+    uint64_t qadd;
+    int16_t block0 = block[0];
+
+    if (!s->h263_aic) {
+        if (n < 4)
+            block0 *= s->y_dc_scale;
+        else
+            block0 *= s->c_dc_scale;
+        qadd = (qscale - 1) | 1;
+    } else {
+        qadd = 0;
+    }
+
+    if(s->ac_pred)
+        n_coeffs = 63;
+    else
+        n_coeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+
+    dct_unquantize_h263_axp(block, n_coeffs, qscale, qadd);
+
+    block[0] = block0;
+}
+
+static void dct_unquantize_h263_inter_axp(MpegEncContext *s, int16_t *block,
+                                    int n, int qscale)
+{
+    int n_coeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+    dct_unquantize_h263_axp(block, n_coeffs, qscale, (qscale - 1) | 1);
+}
+
+av_cold void ff_mpv_common_init_axp(MpegEncContext *s)
+{
+    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_axp;
+    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_axp;
+}
diff --git a/libavcodec/alpha/pixblockdsp_alpha.c b/libavcodec/alpha/pixblockdsp_alpha.c
new file mode 100644
index 0000000000..866b762b16
--- /dev/null
+++ b/libavcodec/alpha/pixblockdsp_alpha.c
@@ -0,0 +1,78 @@
+/*
+ * SIMD-optimized pixel operations
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/pixblockdsp.h"
+#include "asm.h"
+
+static void get_pixels_mvi(int16_t *restrict block,
+                           const uint8_t *restrict pixels, ptrdiff_t line_size)
+{
+    int h = 8;
+
+    do {
+        uint64_t p;
+
+        p = ldq(pixels);
+        stq(unpkbw(p),       block);
+        stq(unpkbw(p >> 32), block + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--h);
+}
+
+static void diff_pixels_mvi(int16_t *block, const uint8_t *s1, const uint8_t *s2,
+                            int stride) {
+    int h = 8;
+    uint64_t mask = 0x4040;
+
+    mask |= mask << 16;
+    mask |= mask << 32;
+    do {
+        uint64_t x, y, c, d, a;
+        uint64_t signs;
+
+        x = ldq(s1);
+        y = ldq(s2);
+        c = cmpbge(x, y);
+        d = x - y;
+        a = zap(mask, c);       /* We use 0x4040404040404040 here...  */
+        d += 4 * a;             /* ...so we can use s4addq here.      */
+        signs = zap(-1, c);
+
+        stq(unpkbw(d)       | (unpkbw(signs)       << 8), block);
+        stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4);
+
+        s1 += stride;
+        s2 += stride;
+        block += 8;
+    } while (--h);
+}
+
+av_cold void ff_pixblockdsp_init_alpha(PixblockDSPContext *c, AVCodecContext *avctx,
+                                       unsigned high_bit_depth)
+{
+    if (amask(AMASK_MVI) == 0) {
+        if (!high_bit_depth)
+            c->get_pixels = get_pixels_mvi;
+        c->diff_pixels = diff_pixels_mvi;
+    }
+}
diff --git a/libavcodec/alpha/regdef.h b/libavcodec/alpha/regdef.h
new file mode 100644
index 0000000000..f05577a89b
--- /dev/null
+++ b/libavcodec/alpha/regdef.h
@@ -0,0 +1,77 @@
+/*
+ * Alpha optimized DSP utils
+ * copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Some BSDs don't seem to have regdef.h... sigh  */
+#ifndef AVCODEC_ALPHA_REGDEF_H
+#define AVCODEC_ALPHA_REGDEF_H
+
+#define v0      $0      /* function return value */
+
+#define t0      $1      /* temporary registers (caller-saved) */
+#define t1      $2
+#define t2      $3
+#define t3      $4
+#define t4      $5
+#define t5      $6
+#define t6      $7
+#define t7      $8
+
+#define s0      $9      /* saved-registers (callee-saved registers) */
+#define s1      $10
+#define s2      $11
+#define s3      $12
+#define s4      $13
+#define s5      $14
+#define s6      $15
+#define fp      s6      /* frame-pointer (s6 in frame-less procedures) */
+
+#define a0      $16     /* argument registers (caller-saved) */
+#define a1      $17
+#define a2      $18
+#define a3      $19
+#define a4      $20
+#define a5      $21
+
+#define t8      $22     /* more temps (caller-saved) */
+#define t9      $23
+#define t10     $24
+#define t11     $25
+#define ra      $26     /* return address register */
+#define t12     $27
+
+#define pv      t12     /* procedure-variable register */
+#define AT      $at     /* assembler temporary */
+#define gp      $29     /* global pointer */
+#define sp      $30     /* stack pointer */
+#define zero    $31     /* reads as zero, writes are noops */
+
+/* Some nicer register names.  */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: these overlap with the argument list and the return value */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+
+#endif /* AVCODEC_ALPHA_REGDEF_H */
diff --git a/libavcodec/alpha/simple_idct_alpha.c b/libavcodec/alpha/simple_idct_alpha.c
new file mode 100644
index 0000000000..04be0cef06
--- /dev/null
+++ b/libavcodec/alpha/simple_idct_alpha.c
@@ -0,0 +1,303 @@
+/*
+ * Simple IDCT (Alpha optimized)
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * based upon some outcommented C code from mpeg2dec (idct_mmx.c
+ * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
+ *
+ * Alpha optimizations by Måns Rullgård <mans@mansr.com>
+ *                     and Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_alpha.h"
+#include "asm.h"
+
+// cos(i * M_PI / 16) * sqrt(2) * (1 << 14)
+// W4 is actually exactly 16384, but using 16383 works around
+// accumulating rounding errors for some encoders
+#define W1 22725
+#define W2 21407
+#define W3 19266
+#define W4 16383
+#define W5 12873
+#define W6  8867
+#define W7  4520
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+/* 0: all entries 0, 1: only first entry nonzero, 2: otherwise  */
+static inline int idct_row(int16_t *row)
+{
+    int a0, a1, a2, a3, b0, b1, b2, b3, t;
+    uint64_t l, r, t2;
+    l = ldq(row);
+    r = ldq(row + 4);
+
+    if (l == 0 && r == 0)
+        return 0;
+
+    a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1));
+
+    if (((l & ~0xffffUL) | r) == 0) {
+        a0 >>= ROW_SHIFT;
+        t2 = (uint16_t) a0;
+        t2 |= t2 << 16;
+        t2 |= t2 << 32;
+
+        stq(t2, row);
+        stq(t2, row + 4);
+        return 1;
+    }
+
+    a1 = a0;
+    a2 = a0;
+    a3 = a0;
+
+    t = extwl(l, 4);            /* row[2] */
+    if (t != 0) {
+        t = sextw(t);
+        a0 += W2 * t;
+        a1 += W6 * t;
+        a2 -= W6 * t;
+        a3 -= W2 * t;
+    }
+
+    t = extwl(r, 0);            /* row[4] */
+    if (t != 0) {
+        t = sextw(t);
+        a0 += W4 * t;
+        a1 -= W4 * t;
+        a2 -= W4 * t;
+        a3 += W4 * t;
+    }
+
+    t = extwl(r, 4);            /* row[6] */
+    if (t != 0) {
+        t = sextw(t);
+        a0 += W6 * t;
+        a1 -= W2 * t;
+        a2 += W2 * t;
+        a3 -= W6 * t;
+    }
+
+    t = extwl(l, 2);            /* row[1] */
+    if (t != 0) {
+        t = sextw(t);
+        b0 = W1 * t;
+        b1 = W3 * t;
+        b2 = W5 * t;
+        b3 = W7 * t;
+    } else {
+        b0 = 0;
+        b1 = 0;
+        b2 = 0;
+        b3 = 0;
+    }
+
+    t = extwl(l, 6);            /* row[3] */
+    if (t) {
+        t = sextw(t);
+        b0 += W3 * t;
+        b1 -= W7 * t;
+        b2 -= W1 * t;
+        b3 -= W5 * t;
+    }
+
+
+    t = extwl(r, 2);            /* row[5] */
+    if (t) {
+        t = sextw(t);
+        b0 += W5 * t;
+        b1 -= W1 * t;
+        b2 += W7 * t;
+        b3 += W3 * t;
+    }
+
+    t = extwl(r, 6);            /* row[7] */
+    if (t) {
+        t = sextw(t);
+        b0 += W7 * t;
+        b1 -= W5 * t;
+        b2 += W3 * t;
+        b3 -= W1 * t;
+    }
+
+    row[0] = (a0 + b0) >> ROW_SHIFT;
+    row[1] = (a1 + b1) >> ROW_SHIFT;
+    row[2] = (a2 + b2) >> ROW_SHIFT;
+    row[3] = (a3 + b3) >> ROW_SHIFT;
+    row[4] = (a3 - b3) >> ROW_SHIFT;
+    row[5] = (a2 - b2) >> ROW_SHIFT;
+    row[6] = (a1 - b1) >> ROW_SHIFT;
+    row[7] = (a0 - b0) >> ROW_SHIFT;
+
+    return 2;
+}
+
+static inline void idct_col(int16_t *col)
+{
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+
+    col[0] += (1 << (COL_SHIFT - 1)) / W4;
+
+    a0 = W4 * col[8 * 0];
+    a1 = W4 * col[8 * 0];
+    a2 = W4 * col[8 * 0];
+    a3 = W4 * col[8 * 0];
+
+    if (col[8 * 2]) {
+        a0 += W2 * col[8 * 2];
+        a1 += W6 * col[8 * 2];
+        a2 -= W6 * col[8 * 2];
+        a3 -= W2 * col[8 * 2];
+    }
+
+    if (col[8 * 4]) {
+        a0 += W4 * col[8 * 4];
+        a1 -= W4 * col[8 * 4];
+        a2 -= W4 * col[8 * 4];
+        a3 += W4 * col[8 * 4];
+    }
+
+    if (col[8 * 6]) {
+        a0 += W6 * col[8 * 6];
+        a1 -= W2 * col[8 * 6];
+        a2 += W2 * col[8 * 6];
+        a3 -= W6 * col[8 * 6];
+    }
+
+    if (col[8 * 1]) {
+        b0 = W1 * col[8 * 1];
+        b1 = W3 * col[8 * 1];
+        b2 = W5 * col[8 * 1];
+        b3 = W7 * col[8 * 1];
+    } else {
+        b0 = 0;
+        b1 = 0;
+        b2 = 0;
+        b3 = 0;
+    }
+
+    if (col[8 * 3]) {
+        b0 += W3 * col[8 * 3];
+        b1 -= W7 * col[8 * 3];
+        b2 -= W1 * col[8 * 3];
+        b3 -= W5 * col[8 * 3];
+    }
+
+    if (col[8 * 5]) {
+        b0 += W5 * col[8 * 5];
+        b1 -= W1 * col[8 * 5];
+        b2 += W7 * col[8 * 5];
+        b3 += W3 * col[8 * 5];
+    }
+
+    if (col[8 * 7]) {
+        b0 += W7 * col[8 * 7];
+        b1 -= W5 * col[8 * 7];
+        b2 += W3 * col[8 * 7];
+        b3 -= W1 * col[8 * 7];
+    }
+
+    col[8 * 0] = (a0 + b0) >> COL_SHIFT;
+    col[8 * 7] = (a0 - b0) >> COL_SHIFT;
+    col[8 * 1] = (a1 + b1) >> COL_SHIFT;
+    col[8 * 6] = (a1 - b1) >> COL_SHIFT;
+    col[8 * 2] = (a2 + b2) >> COL_SHIFT;
+    col[8 * 5] = (a2 - b2) >> COL_SHIFT;
+    col[8 * 3] = (a3 + b3) >> COL_SHIFT;
+    col[8 * 4] = (a3 - b3) >> COL_SHIFT;
+}
+
+/* If all rows but the first one are zero after row transformation,
+   all rows will be identical after column transformation.  */
+static inline void idct_col2(int16_t *col)
+{
+    int i;
+    uint64_t l, r;
+
+    for (i = 0; i < 8; ++i) {
+        int a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4;
+
+        a0 *= W4;
+        col[i] = a0 >> COL_SHIFT;
+    }
+
+    l = ldq(col + 0 * 4); r = ldq(col + 1 * 4);
+    stq(l, col +  2 * 4); stq(r, col +  3 * 4);
+    stq(l, col +  4 * 4); stq(r, col +  5 * 4);
+    stq(l, col +  6 * 4); stq(r, col +  7 * 4);
+    stq(l, col +  8 * 4); stq(r, col +  9 * 4);
+    stq(l, col + 10 * 4); stq(r, col + 11 * 4);
+    stq(l, col + 12 * 4); stq(r, col + 13 * 4);
+    stq(l, col + 14 * 4); stq(r, col + 15 * 4);
+}
+
+void ff_simple_idct_axp(int16_t *block)
+{
+
+    int i;
+    int rowsZero = 1;           /* all rows except row 0 zero */
+    int rowsConstant = 1;       /* all rows consist of a constant value */
+
+    for (i = 0; i < 8; i++) {
+        int sparseness = idct_row(block + 8 * i);
+
+        if (i > 0 && sparseness > 0)
+            rowsZero = 0;
+        if (sparseness == 2)
+            rowsConstant = 0;
+    }
+
+    if (rowsZero) {
+        idct_col2(block);
+    } else if (rowsConstant) {
+        idct_col(block);
+        for (i = 0; i < 8; i += 2) {
+            uint64_t v = (uint16_t) block[0];
+            uint64_t w = (uint16_t) block[8];
+
+            v |= v << 16;
+            w |= w << 16;
+            v |= v << 32;
+            w |= w << 32;
+            stq(v, block + 0 * 4);
+            stq(v, block + 1 * 4);
+            stq(w, block + 2 * 4);
+            stq(w, block + 3 * 4);
+            block += 4 * 4;
+        }
+    } else {
+        for (i = 0; i < 8; i++)
+            idct_col(block + i);
+    }
+}
+
+void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_simple_idct_axp(block);
+    put_pixels_clamped_axp_p(block, dest, line_size);
+}
+
+void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_simple_idct_axp(block);
+    add_pixels_clamped_axp_p(block, dest, line_size);
+}
diff --git a/libavcodec/alsdec.c b/libavcodec/alsdec.c
index 9d25b57d8f..ebd364e085 100644
--- a/libavcodec/alsdec.c
+++ b/libavcodec/alsdec.c
@@ -1,28 +1,28 @@
 /*
  * MPEG-4 ALS decoder
- * Copyright (c) 2009 Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * Copyright (c) 2009 Thilo Borgmann <thilo.borgmann _at_ mail.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * MPEG-4 ALS decoder
- * @author Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
  */
 
 #include <inttypes.h>
@@ -199,6 +199,7 @@ typedef struct ALSDecContext {
     unsigned int cur_frame_length;  ///< length of the current frame to decode
     unsigned int frame_id;          ///< the frame ID / number of the current frame
     unsigned int js_switch;         ///< if true, joint-stereo decoding is enforced
+    unsigned int cs_switch;         ///< if true, channel rearrangement is done
     unsigned int num_blocks;        ///< number of blocks used in the current frame
     unsigned int s_max;             ///< maximum Rice parameter allowed in entropy coding
     uint8_t *bgmc_lut;              ///< pointer at lookup tables used for BGMC
@@ -281,12 +282,14 @@ static av_cold int read_specific_config(ALSDecContext *ctx)
     GetBitContext gb;
     uint64_t ht_size;
     int i, config_offset;
-    MPEG4AudioConfig m4ac;
+    MPEG4AudioConfig m4ac = {0};
     ALSSpecificConfig *sconf = &ctx->sconf;
     AVCodecContext *avctx    = ctx->avctx;
     uint32_t als_id, header_size, trailer_size;
+    int ret;
 
-    init_get_bits(&gb, avctx->extradata, avctx->extradata_size * 8);
+    if ((ret = init_get_bits8(&gb, avctx->extradata, avctx->extradata_size)) < 0)
+        return ret;
 
     config_offset = avpriv_mpeg4audio_get_config(&m4ac, avctx->extradata,
                                                  avctx->extradata_size * 8, 1);
@@ -305,7 +308,7 @@ static av_cold int read_specific_config(ALSDecContext *ctx)
     skip_bits_long(&gb, 32); // sample rate already known
     sconf->samples              = get_bits_long(&gb, 32);
     avctx->channels             = m4ac.channels;
-    skip_bits(&gb, 16);      // number of channels already knwon
+    skip_bits(&gb, 16);      // number of channels already known
     skip_bits(&gb, 3);       // skip file_type
     sconf->resolution           = get_bits(&gb, 3);
     sconf->floating             = get_bits1(&gb);
@@ -349,16 +352,28 @@ static av_cold int read_specific_config(ALSDecContext *ctx)
         if (get_bits_left(&gb) < bits_needed)
             return AVERROR_INVALIDDATA;
 
-        if (!(sconf->chan_pos = av_malloc(avctx->channels * sizeof(*sconf->chan_pos))))
+        if (!(sconf->chan_pos = av_malloc_array(avctx->channels, sizeof(*sconf->chan_pos))))
             return AVERROR(ENOMEM);
 
-        for (i = 0; i < avctx->channels; i++)
-            sconf->chan_pos[i] = get_bits(&gb, chan_pos_bits);
+        ctx->cs_switch = 1;
+
+        for (i = 0; i < avctx->channels; i++) {
+            sconf->chan_pos[i] = -1;
+        }
+
+        for (i = 0; i < avctx->channels; i++) {
+            int idx;
+
+            idx = get_bits(&gb, chan_pos_bits);
+            if (idx >= avctx->channels || sconf->chan_pos[idx] != -1) {
+                av_log(avctx, AV_LOG_WARNING, "Invalid channel reordering.\n");
+                ctx->cs_switch = 0;
+                break;
+            }
+            sconf->chan_pos[idx] = i;
+        }
 
         align_get_bits(&gb);
-        // TODO: use this to actually do channel sorting
-    } else {
-        sconf->chan_sort = 0;
     }
 
 
@@ -392,7 +407,7 @@ static av_cold int read_specific_config(ALSDecContext *ctx)
         if (get_bits_left(&gb) < 32)
             return AVERROR_INVALIDDATA;
 
-        if (avctx->err_recognition & AV_EF_CRCCHECK) {
+        if (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_CAREFUL)) {
             ctx->crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
             ctx->crc       = 0xFFFFFFFF;
             ctx->crc_org   = ~get_bits_long(&gb, 32);
@@ -428,7 +443,6 @@ static int check_specific_config(ALSDecContext *ctx)
 
     MISSING_ERR(sconf->floating,  "Floating point decoding",     AVERROR_PATCHWELCOME);
     MISSING_ERR(sconf->rlslms,    "Adaptive RLS-LMS prediction", AVERROR_PATCHWELCOME);
-    MISSING_ERR(sconf->chan_sort, "Channel sorting",             0);
 
     return error;
 }
@@ -551,12 +565,15 @@ static void get_block_sizes(ALSDecContext *ctx, unsigned int *div_blocks,
 
 /** Read the block data for a constant block
  */
-static void read_const_block_data(ALSDecContext *ctx, ALSBlockData *bd)
+static int read_const_block_data(ALSDecContext *ctx, ALSBlockData *bd)
 {
     ALSSpecificConfig *sconf = &ctx->sconf;
     AVCodecContext *avctx    = ctx->avctx;
     GetBitContext *gb        = &ctx->gb;
 
+    if (bd->block_length <= 0)
+        return AVERROR_INVALIDDATA;
+
     *bd->raw_samples = 0;
     *bd->const_block = get_bits1(gb);    // 1 = constant value, 0 = zero block (silence)
     bd->js_blocks    = get_bits1(gb);
@@ -571,6 +588,8 @@ static void read_const_block_data(ALSDecContext *ctx, ALSBlockData *bd)
 
     // ensure constant block decoding by reusing this field
     *bd->const_block = 1;
+
+    return 0;
 }
 
 
@@ -669,13 +688,17 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
             *bd->opt_order       = get_bits(gb, opt_order_length);
             if (*bd->opt_order > sconf->max_order) {
                 *bd->opt_order = sconf->max_order;
-                av_log(avctx, AV_LOG_ERROR, "Predictor order too large!\n");
+                av_log(avctx, AV_LOG_ERROR, "Predictor order too large.\n");
                 return AVERROR_INVALIDDATA;
             }
         } else {
             *bd->opt_order = sconf->max_order;
         }
-
+        if (*bd->opt_order > bd->block_length) {
+            *bd->opt_order = bd->block_length;
+            av_log(avctx, AV_LOG_ERROR, "Predictor order too large.\n");
+            return AVERROR_INVALIDDATA;
+        }
         opt_order = *bd->opt_order;
 
         if (opt_order) {
@@ -706,7 +729,7 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
                     quant_cof[k] = decode_rice(gb, rice_param) + offset;
                     if (quant_cof[k] < -64 || quant_cof[k] > 63) {
                         av_log(avctx, AV_LOG_ERROR,
-                               "quant_cof %"PRIu32" is out of range\n",
+                               "quant_cof %"PRIu32" is out of range.\n",
                                quant_cof[k]);
                         return AVERROR_INVALIDDATA;
                     }
@@ -964,7 +987,7 @@ static int decode_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
  */
 static int read_block(ALSDecContext *ctx, ALSBlockData *bd)
 {
-    int ret = 0;
+    int ret;
     GetBitContext *gb        = &ctx->gb;
 
     *bd->shift_lsbs = 0;
@@ -972,7 +995,7 @@ static int read_block(ALSDecContext *ctx, ALSBlockData *bd)
     if (get_bits1(gb)) {
         ret = read_var_block_data(ctx, bd);
     } else {
-        read_const_block_data(ctx, bd);
+        ret = read_const_block_data(ctx, bd);
     }
 
     return ret;
@@ -1026,8 +1049,8 @@ static void zero_remaining(unsigned int b, unsigned int b_max,
 {
     unsigned int count = 0;
 
-    for (; b < b_max; b++)
-        count += div_blocks[b];
+    while (b < b_max)
+        count += div_blocks[b++];
 
     if (count)
         memset(buf, 0, sizeof(*buf) * count);
@@ -1132,7 +1155,7 @@ static int decode_blocks(ALSDecContext *ctx, unsigned int ra_frame,
         // reconstruct joint-stereo blocks
         if (bd[0].js_blocks) {
             if (bd[1].js_blocks)
-                av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel pair!\n");
+                av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel pair.\n");
 
             for (s = 0; s < div_blocks[b]; s++)
                 bd[0].raw_samples[s] = bd[1].raw_samples[s] - bd[0].raw_samples[s];
@@ -1180,7 +1203,7 @@ static int read_channel_data(ALSDecContext *ctx, ALSChannelData *cd, int c)
         current->master_channel = get_bits_long(gb, av_ceil_log2(channels));
 
         if (current->master_channel >= channels) {
-            av_log(ctx->avctx, AV_LOG_ERROR, "Invalid master channel!\n");
+            av_log(ctx->avctx, AV_LOG_ERROR, "Invalid master channel.\n");
             return AVERROR_INVALIDDATA;
         }
 
@@ -1205,7 +1228,7 @@ static int read_channel_data(ALSDecContext *ctx, ALSChannelData *cd, int c)
     }
 
     if (entries == channels) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "Damaged channel data!\n");
+        av_log(ctx->avctx, AV_LOG_ERROR, "Damaged channel data.\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -1238,7 +1261,7 @@ static int revert_channel_correlation(ALSDecContext *ctx, ALSBlockData *bd,
     }
 
     if (dep == channels) {
-        av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel correlation!\n");
+        av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel correlation.\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -1253,21 +1276,31 @@ static int revert_channel_correlation(ALSDecContext *ctx, ALSBlockData *bd,
     bd->quant_cof   = ctx->quant_cof[c];
     bd->raw_samples = ctx->raw_samples[c] + offset;
 
-    dep = 0;
-    while (!ch[dep].stop_flag) {
+    for (dep = 0; !ch[dep].stop_flag; dep++) {
         ptrdiff_t smp;
         ptrdiff_t begin = 1;
         ptrdiff_t end   = bd->block_length - 1;
         int64_t y;
         int32_t *master = ctx->raw_samples[ch[dep].master_channel] + offset;
 
+        if (ch[dep].master_channel == c)
+            continue;
+
         if (ch[dep].time_diff_flag) {
             int t = ch[dep].time_diff_index;
 
             if (ch[dep].time_diff_sign) {
                 t      = -t;
+                if (begin < t) {
+                    av_log(ctx->avctx, AV_LOG_ERROR, "begin %td smaller than time diff index %d.\n", begin, t);
+                    return AVERROR_INVALIDDATA;
+                }
                 begin -= t;
             } else {
+                if (end < t) {
+                    av_log(ctx->avctx, AV_LOG_ERROR, "end %td smaller than time diff index %d.\n", end, t);
+                    return AVERROR_INVALIDDATA;
+                }
                 end   -= t;
             }
 
@@ -1311,8 +1344,6 @@ static int revert_channel_correlation(ALSDecContext *ctx, ALSBlockData *bd,
                 bd->raw_samples[smp] += y >> 7;
             }
         }
-
-        dep++;
     }
 
     return 0;
@@ -1387,7 +1418,7 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
 
         for (c = 0; c < avctx->channels; c++)
             if (ctx->chan_data[c] < ctx->chan_data_buffer) {
-                av_log(ctx->avctx, AV_LOG_ERROR, "Invalid channel data!\n");
+                av_log(ctx->avctx, AV_LOG_ERROR, "Invalid channel data.\n");
                 return AVERROR_INVALIDDATA;
             }
 
@@ -1443,6 +1474,7 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
                 bd.lpc_cof     = ctx->lpc_cof[c];
                 bd.quant_cof   = ctx->quant_cof[c];
                 bd.raw_samples = ctx->raw_samples[c] + offset;
+
                 if ((ret = decode_block(ctx, &bd)) < 0)
                     return ret;
             }
@@ -1461,6 +1493,11 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
 
     // TODO: read_diff_float_data
 
+    if (get_bits_left(gb) < 0) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "Overread %d\n", -get_bits_left(gb));
+        return AVERROR_INVALIDDATA;
+    }
+
     return 0;
 }
 
@@ -1478,7 +1515,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     int invalid_frame, ret;
     unsigned int c, sample, ra_frame, bytes_read, shift;
 
-    init_get_bits(&ctx->gb, buffer, buffer_size * 8);
+    if ((ret = init_get_bits8(&ctx->gb, buffer, buffer_size)) < 0)
+        return ret;
 
     // In the case that the distance between random access frames is set to zero
     // (sconf->ra_distance == 0) no frame is treated as a random access frame.
@@ -1502,19 +1540,23 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
 
     /* get output buffer */
     frame->nb_samples = ctx->cur_frame_length;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     // transform decoded frame into output format
-    #define INTERLEAVE_OUTPUT(bps)                                 \
-    {                                                              \
-        int##bps##_t *dest = (int##bps##_t*)frame->data[0];        \
-        shift = bps - ctx->avctx->bits_per_raw_sample;             \
-        for (sample = 0; sample < ctx->cur_frame_length; sample++) \
-            for (c = 0; c < avctx->channels; c++)                  \
-                *dest++ = ctx->raw_samples[c][sample] << shift;    \
+    #define INTERLEAVE_OUTPUT(bps)                                                   \
+    {                                                                                \
+        int##bps##_t *dest = (int##bps##_t*)frame->data[0];                          \
+        shift = bps - ctx->avctx->bits_per_raw_sample;                               \
+        if (!ctx->cs_switch) {                                                       \
+            for (sample = 0; sample < ctx->cur_frame_length; sample++)               \
+                for (c = 0; c < avctx->channels; c++)                                \
+                    *dest++ = ctx->raw_samples[c][sample] << shift;                  \
+        } else {                                                                     \
+            for (sample = 0; sample < ctx->cur_frame_length; sample++)               \
+                for (c = 0; c < avctx->channels; c++)                                \
+                    *dest++ = ctx->raw_samples[sconf->chan_pos[c]][sample] << shift; \
+        }                                                                            \
     }
 
     if (ctx->avctx->bits_per_raw_sample <= 16) {
@@ -1524,7 +1566,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     }
 
     // update CRC
-    if (sconf->crc_enabled && (avctx->err_recognition & AV_EF_CRCCHECK)) {
+    if (sconf->crc_enabled && (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_CAREFUL))) {
         int swap = HAVE_BIGENDIAN != sconf->msb_first;
 
         if (ctx->avctx->bits_per_raw_sample == 24) {
@@ -1681,14 +1723,14 @@ static av_cold int decode_init(AVCodecContext *avctx)
     // allocate quantized parcor coefficient buffer
     num_buffers = sconf->mc_coding ? avctx->channels : 1;
 
-    ctx->quant_cof        = av_malloc(sizeof(*ctx->quant_cof) * num_buffers);
-    ctx->lpc_cof          = av_malloc(sizeof(*ctx->lpc_cof)   * num_buffers);
-    ctx->quant_cof_buffer = av_malloc(sizeof(*ctx->quant_cof_buffer) *
-                                      num_buffers * sconf->max_order);
-    ctx->lpc_cof_buffer   = av_malloc(sizeof(*ctx->lpc_cof_buffer) *
-                                      num_buffers * sconf->max_order);
-    ctx->lpc_cof_reversed_buffer = av_malloc(sizeof(*ctx->lpc_cof_buffer) *
-                                             sconf->max_order);
+    ctx->quant_cof        = av_malloc_array(num_buffers, sizeof(*ctx->quant_cof));
+    ctx->lpc_cof          = av_malloc_array(num_buffers, sizeof(*ctx->lpc_cof));
+    ctx->quant_cof_buffer = av_malloc_array(num_buffers * sconf->max_order,
+                                            sizeof(*ctx->quant_cof_buffer));
+    ctx->lpc_cof_buffer   = av_malloc_array(num_buffers * sconf->max_order,
+                                            sizeof(*ctx->lpc_cof_buffer));
+    ctx->lpc_cof_reversed_buffer = av_malloc_array(sconf->max_order,
+                                                   sizeof(*ctx->lpc_cof_buffer));
 
     if (!ctx->quant_cof              || !ctx->lpc_cof        ||
         !ctx->quant_cof_buffer       || !ctx->lpc_cof_buffer ||
@@ -1705,15 +1747,14 @@ static av_cold int decode_init(AVCodecContext *avctx)
     }
 
     // allocate and assign lag and gain data buffer for ltp mode
-    ctx->const_block     = av_malloc (sizeof(*ctx->const_block) * num_buffers);
-    ctx->shift_lsbs      = av_malloc (sizeof(*ctx->shift_lsbs)  * num_buffers);
-    ctx->opt_order       = av_malloc (sizeof(*ctx->opt_order)   * num_buffers);
-    ctx->store_prev_samples = av_malloc(sizeof(*ctx->store_prev_samples) * num_buffers);
-    ctx->use_ltp         = av_mallocz(sizeof(*ctx->use_ltp)  * num_buffers);
-    ctx->ltp_lag         = av_malloc (sizeof(*ctx->ltp_lag)  * num_buffers);
-    ctx->ltp_gain        = av_malloc (sizeof(*ctx->ltp_gain) * num_buffers);
-    ctx->ltp_gain_buffer = av_malloc (sizeof(*ctx->ltp_gain_buffer) *
-                                      num_buffers * 5);
+    ctx->const_block     = av_malloc_array(num_buffers, sizeof(*ctx->const_block));
+    ctx->shift_lsbs      = av_malloc_array(num_buffers, sizeof(*ctx->shift_lsbs));
+    ctx->opt_order       = av_malloc_array(num_buffers, sizeof(*ctx->opt_order));
+    ctx->store_prev_samples = av_malloc_array(num_buffers, sizeof(*ctx->store_prev_samples));
+    ctx->use_ltp         = av_mallocz_array(num_buffers, sizeof(*ctx->use_ltp));
+    ctx->ltp_lag         = av_malloc_array(num_buffers, sizeof(*ctx->ltp_lag));
+    ctx->ltp_gain        = av_malloc_array(num_buffers, sizeof(*ctx->ltp_gain));
+    ctx->ltp_gain_buffer = av_malloc_array(num_buffers * 5, sizeof(*ctx->ltp_gain_buffer));
 
     if (!ctx->const_block || !ctx->shift_lsbs ||
         !ctx->opt_order || !ctx->store_prev_samples ||
@@ -1729,12 +1770,12 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     // allocate and assign channel data buffer for mcc mode
     if (sconf->mc_coding) {
-        ctx->chan_data_buffer  = av_malloc(sizeof(*ctx->chan_data_buffer) *
-                                           num_buffers * num_buffers);
-        ctx->chan_data         = av_malloc(sizeof(*ctx->chan_data) *
-                                           num_buffers);
-        ctx->reverted_channels = av_malloc(sizeof(*ctx->reverted_channels) *
-                                           num_buffers);
+        ctx->chan_data_buffer  = av_mallocz_array(num_buffers * num_buffers,
+                                                 sizeof(*ctx->chan_data_buffer));
+        ctx->chan_data         = av_mallocz_array(num_buffers,
+                                                 sizeof(*ctx->chan_data));
+        ctx->reverted_channels = av_malloc_array(num_buffers,
+                                                 sizeof(*ctx->reverted_channels));
 
         if (!ctx->chan_data_buffer || !ctx->chan_data || !ctx->reverted_channels) {
             av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
@@ -1752,9 +1793,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     channel_size      = sconf->frame_length + sconf->max_order;
 
-    ctx->prev_raw_samples = av_malloc (sizeof(*ctx->prev_raw_samples) * sconf->max_order);
-    ctx->raw_buffer       = av_mallocz(sizeof(*ctx->     raw_buffer)  * avctx->channels * channel_size);
-    ctx->raw_samples      = av_malloc (sizeof(*ctx->     raw_samples) * avctx->channels);
+    ctx->prev_raw_samples = av_malloc_array(sconf->max_order, sizeof(*ctx->prev_raw_samples));
+    ctx->raw_buffer       = av_mallocz_array(avctx->channels * channel_size, sizeof(*ctx->raw_buffer));
+    ctx->raw_samples      = av_malloc_array(avctx->channels, sizeof(*ctx->raw_samples));
 
     // allocate previous raw sample buffer
     if (!ctx->prev_raw_samples || !ctx->raw_buffer|| !ctx->raw_samples) {
@@ -1770,11 +1811,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     // allocate crc buffer
     if (HAVE_BIGENDIAN != sconf->msb_first && sconf->crc_enabled &&
-        (avctx->err_recognition & AV_EF_CRCCHECK)) {
-        ctx->crc_buffer = av_malloc(sizeof(*ctx->crc_buffer) *
-                                    ctx->cur_frame_length *
-                                    avctx->channels *
-                                    av_get_bytes_per_sample(avctx->sample_fmt));
+        (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_CAREFUL))) {
+        ctx->crc_buffer = av_malloc_array(ctx->cur_frame_length *
+                                          avctx->channels *
+                                          av_get_bytes_per_sample(avctx->sample_fmt),
+                                          sizeof(*ctx->crc_buffer));
         if (!ctx->crc_buffer) {
             av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
             ret = AVERROR(ENOMEM);
diff --git a/libavcodec/amr.h b/libavcodec/amr.h
index 676c9630df..1ac73abe86 100644
--- a/libavcodec/amr.h
+++ b/libavcodec/amr.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Marcelo Galvao Povoa
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/amrnbdata.h b/libavcodec/amrnbdata.h
index b7d1b89608..435fd9924b 100644
--- a/libavcodec/amrnbdata.h
+++ b/libavcodec/amrnbdata.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Robert Swain
  * Copyright (c) 2009 Colin McQuillan
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -1655,10 +1655,10 @@ static const float ir_filter_medium[AMR_SUBFRAME_SIZE] = {
  0.016998,  0.023804, -0.041779,  0.025696,  0.019989,
 };
 
-static const float *ir_filters_lookup[2]           = {
+static const float * const ir_filters_lookup[2]           = {
     ir_filter_strong,           ir_filter_medium
 };
-static const float *ir_filters_lookup_MODE_7k95[2] = {
+static const float * const ir_filters_lookup_MODE_7k95[2] = {
     ir_filter_strong_MODE_7k95, ir_filter_medium
 };
 
diff --git a/libavcodec/amrnbdec.c b/libavcodec/amrnbdec.c
index 60d5683604..2299a253ab 100644
--- a/libavcodec/amrnbdec.c
+++ b/libavcodec/amrnbdec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Robert Swain
  * Copyright (c) 2009 Colin McQuillan
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,6 +47,8 @@
 #include "libavutil/float_dsp.h"
 #include "avcodec.h"
 #include "libavutil/common.h"
+#include "libavutil/avassert.h"
+#include "celp_math.h"
 #include "celp_filters.h"
 #include "acelp_filters.h"
 #include "acelp_vectors.h"
@@ -84,7 +86,7 @@
 /** Maximum sharpening factor
  *
  * The specification says 0.8, which should be 13107, but the reference C code
- * uses 13017 instead. (Amusingly the same applies to SHARP_MAX in bitexact G.729.)
+ * uses 13017 instead. (Amusingly the same applies to SHARP_MAX in g729dec.c.)
  */
 #define SHARP_MAX 0.79449462890625
 
@@ -136,6 +138,11 @@ typedef struct AMRContext {
 
     float samples_in[LP_FILTER_ORDER + AMR_SUBFRAME_SIZE]; ///< floating point samples
 
+    ACELPFContext                     acelpf_ctx; ///< context for filters for ACELP-based codecs
+    ACELPVContext                     acelpv_ctx; ///< context for vector operations for ACELP-based codecs
+    CELPFContext                       celpf_ctx; ///< context for filters for CELP-based codecs
+    CELPMContext                       celpm_ctx; ///< context for fixed point math operations
+
 } AMRContext;
 
 /** Double version of ff_weighted_vector_sumf() */
@@ -162,7 +169,8 @@ static av_cold int amrnb_decode_init(AVCodecContext *avctx)
 
     avctx->channels       = 1;
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
-    avctx->sample_rate    = 8000;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 8000;
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLT;
 
     // p->excitation always points to the same position in p->excitation_buf
@@ -176,6 +184,11 @@ static av_cold int amrnb_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 4; i++)
         p->prediction_error[i] = MIN_ENERGY;
 
+    ff_acelp_filter_init(&p->acelpf_ctx);
+    ff_acelp_vectors_init(&p->acelpv_ctx);
+    ff_celp_filter_init(&p->celpf_ctx);
+    ff_celp_math_init(&p->celpm_ctx);
+
     return 0;
 }
 
@@ -219,15 +232,16 @@ static enum Mode unpack_bitstream(AMRContext *p, const uint8_t *buf,
  * Interpolate the LSF vector (used for fixed gain smoothing).
  * The interpolation is done over all four subframes even in MODE_12k2.
  *
+ * @param[in]     ctx       The Context
  * @param[in,out] lsf_q     LSFs in [0,1] for each subframe
  * @param[in]     lsf_new   New LSFs in [0,1] for subframe 4
  */
-static void interpolate_lsf(float lsf_q[4][LP_FILTER_ORDER], float *lsf_new)
+static void interpolate_lsf(ACELPVContext *ctx, float lsf_q[4][LP_FILTER_ORDER], float *lsf_new)
 {
     int i;
 
     for (i = 0; i < 4; i++)
-        ff_weighted_vector_sumf(lsf_q[i], lsf_q[3], lsf_new,
+        ctx->weighted_vector_sumf(lsf_q[i], lsf_q[3], lsf_new,
                                 0.25 * (3 - i), 0.25 * (i + 1),
                                 LP_FILTER_ORDER);
 }
@@ -271,7 +285,7 @@ static void lsf2lsp_for_mode12k2(AMRContext *p, double lsp[LP_FILTER_ORDER],
     ff_set_min_dist_lsf(lsf_q, MIN_LSF_SPACING, LP_FILTER_ORDER);
 
     if (update)
-        interpolate_lsf(p->lsf_q, lsf_q);
+        interpolate_lsf(&p->acelpv_ctx, p->lsf_q, lsf_q);
 
     ff_acelp_lsf2lspd(lsp, lsf_q, LP_FILTER_ORDER);
 }
@@ -334,7 +348,7 @@ static void lsf2lsp_3(AMRContext *p)
     ff_set_min_dist_lsf(lsf_q, MIN_LSF_SPACING, LP_FILTER_ORDER);
 
     // store data for computing the next frame's LSFs
-    interpolate_lsf(p->lsf_q, lsf_q);
+    interpolate_lsf(&p->acelpv_ctx, p->lsf_q, lsf_q);
     memcpy(p->prev_lsf_r, lsf_r, LP_FILTER_ORDER * sizeof(*lsf_r));
 
     ff_acelp_lsf2lspd(p->lsp[3], lsf_q, LP_FILTER_ORDER);
@@ -385,22 +399,23 @@ static void decode_pitch_vector(AMRContext *p,
         decode_pitch_lag_1_6(&pitch_lag_int, &pitch_lag_frac,
                              amr_subframe->p_lag, p->pitch_lag_int,
                              subframe);
-    } else
+    } else {
         ff_decode_pitch_lag(&pitch_lag_int, &pitch_lag_frac,
                             amr_subframe->p_lag,
                             p->pitch_lag_int, subframe,
                             mode != MODE_4k75 && mode != MODE_5k15,
                             mode <= MODE_6k7 ? 4 : (mode == MODE_7k95 ? 5 : 6));
+        pitch_lag_frac *= 2;
+    }
 
     p->pitch_lag_int = pitch_lag_int; // store previous lag in a uint8_t
 
-    pitch_lag_frac <<= (p->cur_frame_mode != MODE_12k2);
-
     pitch_lag_int += pitch_lag_frac > 0;
 
     /* Calculate the pitch vector by interpolating the past excitation at the
        pitch lag using a b60 hamming windowed sinc function.   */
-    ff_acelp_interpolatef(p->excitation, p->excitation + 1 - pitch_lag_int,
+    p->acelpf_ctx.acelp_interpolatef(p->excitation,
+                          p->excitation + 1 - pitch_lag_int,
                           ff_b60_sinc, 6,
                           pitch_lag_frac + 6 - 6*(pitch_lag_frac > 0),
                           10, AMR_SUBFRAME_SIZE);
@@ -484,7 +499,7 @@ static void decode_8_pulses_31bits(const int16_t *fixed_index,
 static void decode_fixed_sparse(AMRFixed *fixed_sparse, const uint16_t *pulses,
                                 const enum Mode mode, const int subframe)
 {
-    assert(MODE_4k75 <= mode && mode <= MODE_12k2);
+    av_assert1(MODE_4k75 <= (signed)mode && mode <= MODE_12k2);
 
     if (mode == MODE_12k2) {
         ff_decode_10_pulses_35bits(pulses, fixed_sparse, gray_decode, 5, 3);
@@ -785,12 +800,12 @@ static int synthesis(AMRContext *p, float *lpc,
         for (i = 0; i < AMR_SUBFRAME_SIZE; i++)
             p->pitch_vector[i] *= 0.25;
 
-    ff_weighted_vector_sumf(excitation, p->pitch_vector, fixed_vector,
+    p->acelpv_ctx.weighted_vector_sumf(excitation, p->pitch_vector, fixed_vector,
                             p->pitch_gain[4], fixed_gain, AMR_SUBFRAME_SIZE);
 
     // emphasize pitch vector contribution
     if (p->pitch_gain[4] > 0.5 && !overflow) {
-        float energy = avpriv_scalarproduct_float_c(excitation, excitation,
+        float energy = p->celpm_ctx.dot_productf(excitation, excitation,
                                                     AMR_SUBFRAME_SIZE);
         float pitch_factor =
             p->pitch_gain[4] *
@@ -805,7 +820,8 @@ static int synthesis(AMRContext *p, float *lpc,
                                                 AMR_SUBFRAME_SIZE);
     }
 
-    ff_celp_lp_synthesis_filterf(samples, lpc, excitation, AMR_SUBFRAME_SIZE,
+    p->celpf_ctx.celp_lp_synthesis_filterf(samples, lpc, excitation,
+                                 AMR_SUBFRAME_SIZE,
                                  LP_FILTER_ORDER);
 
     // detect overflow
@@ -851,10 +867,11 @@ static void update_state(AMRContext *p)
 /**
  * Get the tilt factor of a formant filter from its transfer function
  *
+ * @param p     The Context
  * @param lpc_n LP_FILTER_ORDER coefficients of the numerator
  * @param lpc_d LP_FILTER_ORDER coefficients of the denominator
  */
-static float tilt_factor(float *lpc_n, float *lpc_d)
+static float tilt_factor(AMRContext *p, float *lpc_n, float *lpc_d)
 {
     float rh0, rh1; // autocorrelation at lag 0 and 1
 
@@ -864,11 +881,12 @@ static float tilt_factor(float *lpc_n, float *lpc_d)
 
     hf[0] = 1.0;
     memcpy(hf + 1, lpc_n, sizeof(float) * LP_FILTER_ORDER);
-    ff_celp_lp_synthesis_filterf(hf, lpc_d, hf, AMR_TILT_RESPONSE,
+    p->celpf_ctx.celp_lp_synthesis_filterf(hf, lpc_d, hf,
+                                 AMR_TILT_RESPONSE,
                                  LP_FILTER_ORDER);
 
-    rh0 = avpriv_scalarproduct_float_c(hf, hf,     AMR_TILT_RESPONSE);
-    rh1 = avpriv_scalarproduct_float_c(hf, hf + 1, AMR_TILT_RESPONSE - 1);
+    rh0 = p->celpm_ctx.dot_productf(hf, hf,     AMR_TILT_RESPONSE);
+    rh1 = p->celpm_ctx.dot_productf(hf, hf + 1, AMR_TILT_RESPONSE - 1);
 
     // The spec only specifies this check for 12.2 and 10.2 kbit/s
     // modes. But in the ref source the tilt is always non-negative.
@@ -888,7 +906,7 @@ static void postfilter(AMRContext *p, float *lpc, float *buf_out)
     int i;
     float *samples          = p->samples_in + LP_FILTER_ORDER; // Start of input
 
-    float speech_gain       = avpriv_scalarproduct_float_c(samples, samples,
+    float speech_gain       = p->celpm_ctx.dot_productf(samples, samples,
                                                            AMR_SUBFRAME_SIZE);
 
     float pole_out[AMR_SUBFRAME_SIZE + LP_FILTER_ORDER];  // Output of pole filter
@@ -909,16 +927,16 @@ static void postfilter(AMRContext *p, float *lpc, float *buf_out)
     }
 
     memcpy(pole_out, p->postfilter_mem, sizeof(float) * LP_FILTER_ORDER);
-    ff_celp_lp_synthesis_filterf(pole_out + LP_FILTER_ORDER, lpc_d, samples,
+    p->celpf_ctx.celp_lp_synthesis_filterf(pole_out + LP_FILTER_ORDER, lpc_d, samples,
                                  AMR_SUBFRAME_SIZE, LP_FILTER_ORDER);
     memcpy(p->postfilter_mem, pole_out + AMR_SUBFRAME_SIZE,
            sizeof(float) * LP_FILTER_ORDER);
 
-    ff_celp_lp_zero_synthesis_filterf(buf_out, lpc_n,
+    p->celpf_ctx.celp_lp_zero_synthesis_filterf(buf_out, lpc_n,
                                       pole_out + LP_FILTER_ORDER,
                                       AMR_SUBFRAME_SIZE, LP_FILTER_ORDER);
 
-    ff_tilt_compensation(&p->tilt_mem, tilt_factor(lpc_n, lpc_d), buf_out,
+    ff_tilt_compensation(&p->tilt_mem, tilt_factor(p, lpc_n, lpc_d), buf_out,
                          AMR_SUBFRAME_SIZE);
 
     ff_adaptive_gain_control(buf_out, buf_out, speech_gain, AMR_SUBFRAME_SIZE,
@@ -945,10 +963,8 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = AMR_BLOCK_SIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     buf_out = (float *)frame->data[0];
 
     p->cur_frame_mode = unpack_bitstream(p, buf, buf_size);
@@ -957,7 +973,8 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
     if (p->cur_frame_mode == MODE_DTX) {
-        avpriv_request_sample(avctx, "dtx mode");
+        avpriv_report_missing_feature(avctx, "dtx mode");
+        av_log(avctx, AV_LOG_INFO, "Note: libopencore_amrnb supports dtx\n");
         return AVERROR_PATCHWELCOME;
     }
 
@@ -995,7 +1012,7 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
 
         p->fixed_gain[4] =
             ff_amr_set_fixed_gain(fixed_gain_factor,
-                                  avpriv_scalarproduct_float_c(p->fixed_vector,
+                       p->celpm_ctx.dot_productf(p->fixed_vector,
                                                                p->fixed_vector,
                                                                AMR_SUBFRAME_SIZE) /
                                   AMR_SUBFRAME_SIZE,
@@ -1041,7 +1058,8 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
         update_state(p);
     }
 
-    ff_acelp_apply_order_2_transfer_function(buf_out, buf_out, highpass_zeros,
+    p->acelpf_ctx.acelp_apply_order_2_transfer_function(buf_out,
+                                             buf_out, highpass_zeros,
                                              highpass_poles,
                                              highpass_gain * AMR_SAMPLE_SCALE,
                                              p->high_pass_mem, AMR_BLOCK_SIZE);
@@ -1052,7 +1070,7 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
      * for fixed_gain_smooth.
      * The specification has an incorrect formula: the reference decoder uses
      * qbar(n-1) rather than qbar(n) in section 6.1(4) equation 71. */
-    ff_weighted_vector_sumf(p->lsf_avg, p->lsf_avg, p->lsf_q[3],
+    p->acelpv_ctx.weighted_vector_sumf(p->lsf_avg, p->lsf_avg, p->lsf_q[3],
                             0.84, 0.16, LP_FILTER_ORDER);
 
     *got_frame_ptr = 1;
diff --git a/libavcodec/amrwbdata.h b/libavcodec/amrwbdata.h
index 81f8b47d1b..8390582b05 100644
--- a/libavcodec/amrwbdata.h
+++ b/libavcodec/amrwbdata.h
@@ -2,20 +2,20 @@
  * AMR wideband data and definitions
  * Copyright (c) 2010 Marcelo Galvao Povoa
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -1805,7 +1805,7 @@ static const float ir_filter_mid[64] = {
     -7.501221e-02,  2.920532e-02,  1.660156e-02,  7.751465e-02
 };
 
-static const float *ir_filters_lookup[2] = {
+static const float * const ir_filters_lookup[2] = {
     ir_filter_str, ir_filter_mid
 };
 
diff --git a/libavcodec/amrwbdec.c b/libavcodec/amrwbdec.c
index bf679e82d9..b73b700a71 100644
--- a/libavcodec/amrwbdec.c
+++ b/libavcodec/amrwbdec.c
@@ -2,20 +2,20 @@
  * AMR wideband decoder
  * Copyright (c) 2010 Marcelo Galvao Povoa
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A particular PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include "avcodec.h"
 #include "lsp.h"
 #include "celp_filters.h"
+#include "celp_math.h"
 #include "acelp_filters.h"
 #include "acelp_vectors.h"
 #include "acelp_pitch_delay.h"
@@ -41,6 +42,7 @@
 #include "amr.h"
 
 #include "amrwbdata.h"
+#include "mips/amrwbdec_mips.h"
 
 typedef struct AMRWBContext {
     AMRWBFrame                             frame; ///< AMRWB parameters decoded from bitstream
@@ -84,6 +86,11 @@ typedef struct AMRWBContext {
 
     AVLFG                                   prng; ///< random number generator for white noise excitation
     uint8_t                          first_frame; ///< flag active during decoding of the first frame
+    ACELPFContext                     acelpf_ctx; ///< context for filters for ACELP-based codecs
+    ACELPVContext                     acelpv_ctx; ///< context for vector operations for ACELP-based codecs
+    CELPFContext                       celpf_ctx; ///< context for filters for CELP-based codecs
+    CELPMContext                       celpm_ctx; ///< context for fixed point math operations
+
 } AMRWBContext;
 
 static av_cold int amrwb_decode_init(AVCodecContext *avctx)
@@ -98,7 +105,8 @@ static av_cold int amrwb_decode_init(AVCodecContext *avctx)
 
     avctx->channels       = 1;
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
-    avctx->sample_rate    = 16000;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 16000;
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLT;
 
     av_lfg_init(&ctx->prng, 1);
@@ -112,6 +120,11 @@ static av_cold int amrwb_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 4; i++)
         ctx->prediction_error[i] = MIN_ENERGY;
 
+    ff_acelp_filter_init(&ctx->acelpf_ctx);
+    ff_acelp_vectors_init(&ctx->acelpv_ctx);
+    ff_celp_filter_init(&ctx->celpf_ctx);
+    ff_celp_math_init(&ctx->celpm_ctx);
+
     return 0;
 }
 
@@ -326,7 +339,8 @@ static void decode_pitch_vector(AMRWBContext *ctx,
 
     /* Calculate the pitch vector by interpolating the past excitation at the
        pitch lag using a hamming windowed sinc function */
-    ff_acelp_interpolatef(exc, exc + 1 - pitch_lag_int,
+    ctx->acelpf_ctx.acelp_interpolatef(exc,
+                          exc + 1 - pitch_lag_int,
                           ac_inter, 4,
                           pitch_lag_frac + (pitch_lag_frac > 0 ? 0 : 4),
                           LP_ORDER, AMRWB_SFR_SIZE + 1);
@@ -585,16 +599,18 @@ static void pitch_sharpening(AMRWBContext *ctx, float *fixed_vector)
  *
  * @param[in] p_vector, f_vector   Pitch and fixed excitation vectors
  * @param[in] p_gain, f_gain       Pitch and fixed gains
+ * @param[in] ctx                  The context
  */
 // XXX: There is something wrong with the precision here! The magnitudes
 // of the energies are not correct. Please check the reference code carefully
 static float voice_factor(float *p_vector, float p_gain,
-                          float *f_vector, float f_gain)
+                          float *f_vector, float f_gain,
+                          CELPMContext *ctx)
 {
-    double p_ener = (double) avpriv_scalarproduct_float_c(p_vector, p_vector,
+    double p_ener = (double) ctx->dot_productf(p_vector, p_vector,
                                                           AMRWB_SFR_SIZE) *
                     p_gain * p_gain;
-    double f_ener = (double) avpriv_scalarproduct_float_c(f_vector, f_vector,
+    double f_ener = (double) ctx->dot_productf(f_vector, f_vector,
                                                           AMRWB_SFR_SIZE) *
                     f_gain * f_gain;
 
@@ -758,13 +774,13 @@ static void synthesis(AMRWBContext *ctx, float *lpc, float *excitation,
                       float fixed_gain, const float *fixed_vector,
                       float *samples)
 {
-    ff_weighted_vector_sumf(excitation, ctx->pitch_vector, fixed_vector,
+    ctx->acelpv_ctx.weighted_vector_sumf(excitation, ctx->pitch_vector, fixed_vector,
                             ctx->pitch_gain[0], fixed_gain, AMRWB_SFR_SIZE);
 
     /* emphasize pitch vector contribution in low bitrate modes */
     if (ctx->pitch_gain[0] > 0.5 && ctx->fr_cur_mode <= MODE_8k85) {
         int i;
-        float energy = avpriv_scalarproduct_float_c(excitation, excitation,
+        float energy = ctx->celpm_ctx.dot_productf(excitation, excitation,
                                                     AMRWB_SFR_SIZE);
 
         // XXX: Weird part in both ref code and spec. A unknown parameter
@@ -778,7 +794,7 @@ static void synthesis(AMRWBContext *ctx, float *lpc, float *excitation,
                                                 energy, AMRWB_SFR_SIZE);
     }
 
-    ff_celp_lp_synthesis_filterf(samples, lpc, excitation,
+    ctx->celpf_ctx.celp_lp_synthesis_filterf(samples, lpc, excitation,
                                  AMRWB_SFR_SIZE, LP_ORDER);
 }
 
@@ -810,8 +826,9 @@ static void de_emphasis(float *out, float *in, float m, float mem[1])
  * @param[out] out                 Buffer for interpolated signal
  * @param[in]  in                  Current signal data (length 0.8*o_size)
  * @param[in]  o_size              Output signal length
+ * @param[in] ctx                  The context
  */
-static void upsample_5_4(float *out, const float *in, int o_size)
+static void upsample_5_4(float *out, const float *in, int o_size, CELPMContext *ctx)
 {
     const float *in0 = in - UPS_FIR_SIZE + 1;
     int i, j, k;
@@ -824,7 +841,7 @@ static void upsample_5_4(float *out, const float *in, int o_size)
         i++;
 
         for (k = 1; k < 5; k++) {
-            out[i] = avpriv_scalarproduct_float_c(in0 + int_part,
+            out[i] = ctx->dot_productf(in0 + int_part,
                                                   upsample_fir[4 - frac_part],
                                                   UPS_MEM_SIZE);
             int_part++;
@@ -852,8 +869,8 @@ static float find_hb_gain(AMRWBContext *ctx, const float *synth,
     if (ctx->fr_cur_mode == MODE_23k85)
         return qua_hb_gain[hb_idx] * (1.0f / (1 << 14));
 
-    tilt = avpriv_scalarproduct_float_c(synth, synth + 1, AMRWB_SFR_SIZE - 1) /
-           avpriv_scalarproduct_float_c(synth, synth, AMRWB_SFR_SIZE);
+    tilt = ctx->celpm_ctx.dot_productf(synth, synth + 1, AMRWB_SFR_SIZE - 1) /
+           ctx->celpm_ctx.dot_productf(synth, synth, AMRWB_SFR_SIZE);
 
     /* return gain bounded by [0.1, 1.0] */
     return av_clipf((1.0 - FFMAX(0.0, tilt)) * (1.25 - 0.25 * wsp), 0.1, 1.0);
@@ -872,7 +889,7 @@ static void scaled_hb_excitation(AMRWBContext *ctx, float *hb_exc,
                                  const float *synth_exc, float hb_gain)
 {
     int i;
-    float energy = avpriv_scalarproduct_float_c(synth_exc, synth_exc,
+    float energy = ctx->celpm_ctx.dot_productf(synth_exc, synth_exc,
                                                 AMRWB_SFR_SIZE);
 
     /* Generate a white-noise excitation */
@@ -1003,7 +1020,7 @@ static void hb_synthesis(AMRWBContext *ctx, int subframe, float *samples,
         float e_isf[LP_ORDER_16k]; // ISF vector for extrapolation
         double e_isp[LP_ORDER_16k];
 
-        ff_weighted_vector_sumf(e_isf, isf_past, isf, isfp_inter[subframe],
+        ctx->acelpv_ctx.weighted_vector_sumf(e_isf, isf_past, isf, isfp_inter[subframe],
                                 1.0 - isfp_inter[subframe], LP_ORDER);
 
         extrapolate_isf(e_isf);
@@ -1017,7 +1034,7 @@ static void hb_synthesis(AMRWBContext *ctx, int subframe, float *samples,
         lpc_weighting(hb_lpc, ctx->lp_coef[subframe], 0.6, LP_ORDER);
     }
 
-    ff_celp_lp_synthesis_filterf(samples, hb_lpc, exc, AMRWB_SFR_SIZE_16k,
+    ctx->celpf_ctx.celp_lp_synthesis_filterf(samples, hb_lpc, exc, AMRWB_SFR_SIZE_16k,
                                  (mode == MODE_6k60) ? LP_ORDER_16k : LP_ORDER);
 }
 
@@ -1032,6 +1049,8 @@ static void hb_synthesis(AMRWBContext *ctx, int subframe, float *samples,
  *
  * @remark It is safe to pass the same array in in and out parameters
  */
+
+#ifndef hb_fir_filter
 static void hb_fir_filter(float *out, const float fir_coef[HB_FIR_SIZE + 1],
                           float mem[HB_FIR_SIZE], const float *in)
 {
@@ -1049,6 +1068,7 @@ static void hb_fir_filter(float *out, const float fir_coef[HB_FIR_SIZE + 1],
 
     memcpy(mem, data + AMRWB_SFR_SIZE_16k, HB_FIR_SIZE * sizeof(float));
 }
+#endif /* hb_fir_filter */
 
 /**
  * Update context state before the next subframe.
@@ -1092,10 +1112,8 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 4 * AMRWB_SFR_SIZE_16k;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     buf_out = (float *)frame->data[0];
 
     header_size      = decode_mime_header(ctx, buf);
@@ -1166,7 +1184,7 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
 
         ctx->fixed_gain[0] =
             ff_amr_set_fixed_gain(fixed_gain_factor,
-                                  avpriv_scalarproduct_float_c(ctx->fixed_vector,
+                                  ctx->celpm_ctx.dot_productf(ctx->fixed_vector,
                                                                ctx->fixed_vector,
                                                                AMRWB_SFR_SIZE) /
                                   AMRWB_SFR_SIZE,
@@ -1175,7 +1193,8 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
 
         /* Calculate voice factor and store tilt for next subframe */
         voice_fac      = voice_factor(ctx->pitch_vector, ctx->pitch_gain[0],
-                                      ctx->fixed_vector, ctx->fixed_gain[0]);
+                                      ctx->fixed_vector, ctx->fixed_gain[0],
+                                      &ctx->celpm_ctx);
         ctx->tilt_coef = voice_fac * 0.25 + 0.25;
 
         /* Construct current excitation */
@@ -1201,15 +1220,15 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
         de_emphasis(&ctx->samples_up[UPS_MEM_SIZE],
                     &ctx->samples_az[LP_ORDER], PREEMPH_FAC, ctx->demph_mem);
 
-        ff_acelp_apply_order_2_transfer_function(&ctx->samples_up[UPS_MEM_SIZE],
+        ctx->acelpf_ctx.acelp_apply_order_2_transfer_function(&ctx->samples_up[UPS_MEM_SIZE],
             &ctx->samples_up[UPS_MEM_SIZE], hpf_zeros, hpf_31_poles,
             hpf_31_gain, ctx->hpf_31_mem, AMRWB_SFR_SIZE);
 
         upsample_5_4(sub_buf, &ctx->samples_up[UPS_FIR_SIZE],
-                     AMRWB_SFR_SIZE_16k);
+                     AMRWB_SFR_SIZE_16k, &ctx->celpm_ctx);
 
         /* High frequency band (6.4 - 7.0 kHz) generation part */
-        ff_acelp_apply_order_2_transfer_function(hb_samples,
+        ctx->acelpf_ctx.acelp_apply_order_2_transfer_function(hb_samples,
             &ctx->samples_up[UPS_MEM_SIZE], hpf_zeros, hpf_400_poles,
             hpf_400_gain, ctx->hpf_400_mem, AMRWB_SFR_SIZE);
 
diff --git a/libavcodec/anm.c b/libavcodec/anm.c
index b70d220c9a..29d59fbceb 100644
--- a/libavcodec/anm.c
+++ b/libavcodec/anm.c
@@ -2,20 +2,20 @@
  * Deluxe Paint Animation decoder
  * Copyright (c) 2009 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,8 +47,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     bytestream2_init(&s->gb, avctx->extradata, avctx->extradata_size);
-    if (bytestream2_get_bytes_left(&s->gb) < 16 * 8 + 4 * 256)
+    if (bytestream2_get_bytes_left(&s->gb) < 16 * 8 + 4 * 256) {
+        av_frame_free(&s->frame);
         return AVERROR_INVALIDDATA;
+    }
 
     bytestream2_skipu(&s->gb, 16 * 8);
     for (i = 0; i < 256; i++)
@@ -117,10 +119,8 @@ static int decode_frame(AVCodecContext *avctx,
     uint8_t *dst, *dst_end;
     int count, ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0){
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
     dst     = s->frame->data[0];
     dst_end = s->frame->data[0] + s->frame->linesize[0]*avctx->height;
 
@@ -128,11 +128,11 @@ static int decode_frame(AVCodecContext *avctx,
 
     if (bytestream2_get_byte(&s->gb) != 0x42) {
         avpriv_request_sample(avctx, "Unknown record type");
-        return buf_size;
+        return AVERROR_INVALIDDATA;
     }
     if (bytestream2_get_byte(&s->gb)) {
         avpriv_request_sample(avctx, "Padding bytes");
-        return buf_size;
+        return AVERROR_PATCHWELCOME;
     }
     bytestream2_skip(&s->gb, 2);
 
diff --git a/libavcodec/ansi.c b/libavcodec/ansi.c
index 070192ad46..21d5ae1db2 100644
--- a/libavcodec/ansi.c
+++ b/libavcodec/ansi.c
@@ -2,20 +2,20 @@
  * ASCII/ANSI art decoder
  * Copyright (c) 2010 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include "libavutil/common.h"
 #include "libavutil/frame.h"
 #include "libavutil/lfg.h"
+#include "libavutil/xga_font_data.h"
 #include "avcodec.h"
 #include "cga_data.h"
 #include "internal.h"
@@ -60,6 +61,7 @@ typedef struct AnsiContext {
     int attributes;       /**< attribute flags */
     int fg;               /**< foreground color */
     int bg;               /**< background color */
+    int first_frame;
 
     /* ansi parser state machine */
     enum {
@@ -83,7 +85,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     /* defaults */
-    s->font        = ff_vga16_font;
+    s->font        = avpriv_vga16_font;
     s->font_height = 16;
     s->fg          = DEFAULT_FG_COLOR;
     s->bg          = DEFAULT_BG_COLOR;
@@ -96,12 +98,27 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+static void set_palette(uint32_t *pal)
+{
+    int r, g, b;
+    memcpy(pal, ff_cga_palette, 16 * 4);
+    pal += 16;
+#define COLOR(x) ((x) * 40 + 55)
+    for (r = 0; r < 6; r++)
+        for (g = 0; g < 6; g++)
+            for (b = 0; b < 6; b++)
+                *pal++ = 0xFF000000 | (COLOR(r) << 16) | (COLOR(g) << 8) | COLOR(b);
+#define GRAY(x) ((x) * 10 + 8)
+    for (g = 0; g < 24; g++)
+        *pal++ = 0xFF000000 | (GRAY(g) << 16) | (GRAY(g) << 8) | GRAY(g);
+}
+
 static void hscroll(AVCodecContext *avctx)
 {
     AnsiContext *s = avctx->priv_data;
     int i;
 
-    if (s->y < avctx->height - s->font_height) {
+    if (s->y <= avctx->height - 2*s->font_height) {
         s->y += s->font_height;
         return;
     }
@@ -154,7 +171,7 @@ static void draw_char(AVCodecContext *avctx, int c)
     ff_draw_pc_font(s->frame->data[0] + s->y * s->frame->linesize[0] + s->x,
                     s->frame->linesize[0], s->font, s->font_height, c, fg, bg);
     s->x += FONT_WIDTH;
-    if (s->x >= avctx->width) {
+    if (s->x > avctx->width - FONT_WIDTH) {
         s->x = 0;
         hscroll(avctx);
     }
@@ -168,8 +185,8 @@ static int execute_code(AVCodecContext * avctx, int c)
 {
     AnsiContext *s = avctx->priv_data;
     int ret, i;
-    int width = 0;
-    int height = 0;
+    int width  = avctx->width;
+    int height = avctx->height;
 
     switch(c) {
     case 'A': //Cursor Up
@@ -195,19 +212,19 @@ static int execute_code(AVCodecContext * avctx, int c)
             s->args[0] = DEFAULT_SCREEN_MODE;
         switch(s->args[0]) {
         case 0: case 1: case 4: case 5: case 13: case 19: //320x200 (25 rows)
-            s->font = ff_cga_font;
+            s->font = avpriv_cga_font;
             s->font_height = 8;
             width  = 40<<3;
             height = 25<<3;
             break;
         case 2: case 3: //640x400 (25 rows)
-            s->font = ff_vga16_font;
+            s->font = avpriv_vga16_font;
             s->font_height = 16;
             width  = 80<<3;
             height = 25<<4;
             break;
         case 6: case 14: //640x200 (25 rows)
-            s->font = ff_cga_font;
+            s->font = avpriv_cga_font;
             s->font_height = 8;
             width  = 80<<3;
             height = 25<<3;
@@ -215,13 +232,13 @@ static int execute_code(AVCodecContext * avctx, int c)
         case 7: //set line wrapping
             break;
         case 15: case 16: //640x350 (43 rows)
-            s->font = ff_cga_font;
+            s->font = avpriv_cga_font;
             s->font_height = 8;
             width  = 80<<3;
             height = 43<<3;
             break;
         case 17: case 18: //640x480 (60 rows)
-            s->font = ff_cga_font;
+            s->font = avpriv_cga_font;
             s->font_height = 8;
             width  = 80<<3;
             height = 60<<4;
@@ -229,20 +246,19 @@ static int execute_code(AVCodecContext * avctx, int c)
         default:
             avpriv_request_sample(avctx, "Unsupported screen mode");
         }
-        if (width != 0 && height != 0 &&
-            (width != avctx->width || height != avctx->height)) {
+        s->x = av_clip(s->x, 0, width  - FONT_WIDTH);
+        s->y = av_clip(s->y, 0, height - s->font_height);
+        if (width != avctx->width || height != avctx->height) {
             av_frame_unref(s->frame);
             ret = ff_set_dimensions(avctx, width, height);
             if (ret < 0)
                 return ret;
-            ret = ff_get_buffer(avctx, s->frame, AV_GET_BUFFER_FLAG_REF);
-            if (ret < 0) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+            if ((ret = ff_get_buffer(avctx, s->frame,
+                                     AV_GET_BUFFER_FLAG_REF)) < 0)
                 return ret;
-            }
             s->frame->pict_type           = AV_PICTURE_TYPE_I;
             s->frame->palette_has_changed = 1;
-            memcpy(s->frame->data[1], ff_cga_palette, 16 * 4);
+            set_palette((uint32_t *)s->frame->data[1]);
             erase_screen(avctx);
         } else if (c == 'l') {
             erase_screen(avctx);
@@ -290,12 +306,20 @@ static int execute_code(AVCodecContext * avctx, int c)
                 s->bg = DEFAULT_BG_COLOR;
             } else if (m == 1 || m == 2 || m == 4 || m == 5 || m == 7 || m == 8) {
                 s->attributes |= 1 << (m - 1);
-            } else if (m >= 30 && m <= 38) {
+            } else if (m >= 30 && m <= 37) {
                 s->fg = ansi_to_cga[m - 30];
+            } else if (m == 38 && i + 2 < FFMIN(s->nb_args, MAX_NB_ARGS) && s->args[i + 1] == 5 && s->args[i + 2] < 256) {
+                int index = s->args[i + 2];
+                s->fg = index < 16 ? ansi_to_cga[index] : index;
+                i += 2;
             } else if (m == 39) {
                 s->fg = ansi_to_cga[DEFAULT_FG_COLOR];
             } else if (m >= 40 && m <= 47) {
                 s->bg = ansi_to_cga[m - 40];
+            } else if (m == 48 && i + 2 < FFMIN(s->nb_args, MAX_NB_ARGS) && s->args[i + 1] == 5 && s->args[i + 2] < 256) {
+                int index = s->args[i + 2];
+                s->bg = index < 16 ? ansi_to_cga[index] : index;
+                i += 2;
             } else if (m == 49) {
                 s->fg = ansi_to_cga[DEFAULT_BG_COLOR];
             } else {
@@ -319,6 +343,8 @@ static int execute_code(AVCodecContext * avctx, int c)
         avpriv_request_sample(avctx, "Unknown escape code");
         break;
     }
+    s->x = av_clip(s->x, 0, avctx->width  - FONT_WIDTH);
+    s->y = av_clip(s->y, 0, avctx->height - s->font_height);
     return 0;
 }
 
@@ -332,19 +358,21 @@ static int decode_frame(AVCodecContext *avctx,
     const uint8_t *buf_end   = buf+buf_size;
     int ret, i, count;
 
-    ret = ff_reget_buffer(avctx, s->frame);
-    if (ret < 0){
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
     if (!avctx->frame_number) {
-        memset(s->frame->data[0], 0, avctx->height * FFABS(s->frame->linesize[0]));
+        for (i=0; i<avctx->height; i++)
+            memset(s->frame->data[0]+ i*s->frame->linesize[0], 0, avctx->width);
         memset(s->frame->data[1], 0, AVPALETTE_SIZE);
     }
 
     s->frame->pict_type           = AV_PICTURE_TYPE_I;
     s->frame->palette_has_changed = 1;
-    memcpy(s->frame->data[1], ff_cga_palette, 16 * 4);
+    set_palette((uint32_t *)s->frame->data[1]);
+    if (!s->first_frame) {
+        erase_screen(avctx);
+        s->first_frame = 1;
+    }
 
     while(buf < buf_end) {
         switch(s->state) {
@@ -383,7 +411,7 @@ static int decode_frame(AVCodecContext *avctx,
             if (buf[0] == '[') {
                 s->state   = STATE_CODE;
                 s->nb_args = 0;
-                s->args[0] = 0;
+                s->args[0] = -1;
             } else {
                 s->state = STATE_NORMAL;
                 draw_char(avctx, 0x1B);
@@ -394,8 +422,8 @@ static int decode_frame(AVCodecContext *avctx,
             switch(buf[0]) {
             case '0': case '1': case '2': case '3': case '4':
             case '5': case '6': case '7': case '8': case '9':
-                if (s->nb_args < MAX_NB_ARGS)
-                    s->args[s->nb_args] = s->args[s->nb_args] * 10 + buf[0] - '0';
+                if (s->nb_args < MAX_NB_ARGS && s->args[s->nb_args] < 6553)
+                    s->args[s->nb_args] = FFMAX(s->args[s->nb_args], 0) * 10 + buf[0] - '0';
                 break;
             case ';':
                 s->nb_args++;
@@ -411,7 +439,7 @@ static int decode_frame(AVCodecContext *avctx,
             default:
                 if (s->nb_args > MAX_NB_ARGS)
                     av_log(avctx, AV_LOG_WARNING, "args overflow (%i)\n", s->nb_args);
-                if (s->nb_args < MAX_NB_ARGS && s->args[s->nb_args])
+                if (s->nb_args < MAX_NB_ARGS && s->args[s->nb_args] >= 0)
                     s->nb_args++;
                 if ((ret = execute_code(avctx, buf[0])) < 0)
                     return ret;
diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index d33139be7d..fcccfbe6d4 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
  *  based upon libdemac from Dave Chapman.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/opt.h"
-#include "apedsp.h"
+#include "lossless_audiodsp.h"
 #include "avcodec.h"
 #include "bswapdsp.h"
 #include "bytestream.h"
@@ -137,7 +137,7 @@ typedef struct APEContext {
     AVClass *class;                          ///< class for AVOptions
     AVCodecContext *avctx;
     BswapDSPContext bdsp;
-    APEDSPContext adsp;
+    LLAudDSPContext adsp;
     int channels;
     int samples;                             ///< samples left to decode in current frame
     int bps;
@@ -212,19 +212,6 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
-static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul)
-{
-    int res = 0;
-
-    while (order--) {
-        res   += *v1 * *v2++;
-        *v1++ += mul * *v3++;
-    }
-    return res;
-}
-
 static av_cold int ape_decode_init(AVCodecContext *avctx)
 {
     APEContext *s = avctx->priv_data;
@@ -263,6 +250,7 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
     av_log(avctx, AV_LOG_DEBUG, "Compression Level: %d - Flags: %d\n",
            s->compression_level, s->flags);
     if (s->compression_level % 1000 || s->compression_level > COMPRESSION_LEVEL_INSANE ||
+        !s->compression_level ||
         (s->fileversion < 3930 && s->compression_level == COMPRESSION_LEVEL_INSANE)) {
         av_log(avctx, AV_LOG_ERROR, "Incorrect compression level %d\n",
                s->compression_level);
@@ -305,16 +293,8 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
         s->predictor_decode_stereo = predictor_decode_stereo_3950;
     }
 
-    s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
-
-    if (ARCH_ARM)
-        ff_apedsp_init_arm(&s->adsp);
-    if (ARCH_PPC)
-        ff_apedsp_init_ppc(&s->adsp);
-    if (ARCH_X86)
-        ff_apedsp_init_x86(&s->adsp);
-
     ff_bswapdsp_init(&s->bdsp);
+    ff_llauddsp_init(&s->adsp);
     avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
 
     return 0;
@@ -512,9 +492,12 @@ static inline int ape_decode_value_3860(APEContext *ctx, GetBitContext *gb,
 
     if (!rice->k)
         x = overflow;
-    else
+    else if(rice->k <= MIN_CACHE_BITS) {
         x = (overflow << rice->k) + get_bits(gb, rice->k);
-
+    } else {
+        av_log(ctx->avctx, AV_LOG_ERROR, "Too many bits: %d\n", rice->k);
+        return AVERROR_INVALIDDATA;
+    }
     rice->ksum += x - (rice->ksum + 8 >> 4);
     if (rice->ksum < (rice->k ? 1 << (rice->k + 4) : 0))
         rice->k--;
@@ -522,10 +505,7 @@ static inline int ape_decode_value_3860(APEContext *ctx, GetBitContext *gb,
         rice->k++;
 
     /* Convert to signed */
-    if (x & 1)
-        return (x >> 1) + 1;
-    else
-        return -(x >> 1);
+    return ((x >> 1) ^ ((x & 1) - 1)) + 1;
 }
 
 static inline int ape_decode_value_3900(APEContext *ctx, APERice *rice)
@@ -541,9 +521,13 @@ static inline int ape_decode_value_3900(APEContext *ctx, APERice *rice)
     } else
         tmpk = (rice->k < 1) ? 0 : rice->k - 1;
 
-    if (tmpk <= 16 || ctx->fileversion < 3910)
+    if (tmpk <= 16 || ctx->fileversion < 3910) {
+        if (tmpk > 23) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "Too many bits: %d\n", tmpk);
+            return AVERROR_INVALIDDATA;
+        }
         x = range_decode_bits(ctx, tmpk);
-    else if (tmpk <= 32) {
+    } else if (tmpk <= 31) {
         x = range_decode_bits(ctx, 16);
         x |= (range_decode_bits(ctx, tmpk - 16) << 16);
     } else {
@@ -555,10 +539,7 @@ static inline int ape_decode_value_3900(APEContext *ctx, APERice *rice)
     update_rice(rice, x);
 
     /* Convert to signed */
-    if (x & 1)
-        return (x >> 1) + 1;
-    else
-        return -(x >> 1);
+    return ((x >> 1) ^ ((x & 1) - 1)) + 1;
 }
 
 static inline int ape_decode_value_3990(APEContext *ctx, APERice *rice)
@@ -601,10 +582,7 @@ static inline int ape_decode_value_3990(APEContext *ctx, APERice *rice)
     update_rice(rice, x);
 
     /* Convert to signed */
-    if (x & 1)
-        return (x >> 1) + 1;
-    else
-        return -(x >> 1);
+    return ((x >> 1) ^ ((x & 1) - 1)) + 1;
 }
 
 static void decode_array_0000(APEContext *ctx, GetBitContext *gb,
@@ -619,10 +597,14 @@ static void decode_array_0000(APEContext *ctx, GetBitContext *gb,
         rice->ksum += out[i];
     }
     rice->k = av_log2(rice->ksum / 10) + 1;
+    if (rice->k >= 24)
+        return;
     for (; i < FFMIN(blockstodecode, 64); i++) {
         out[i] = get_rice_ook(&ctx->gb, rice->k);
         rice->ksum += out[i];
         rice->k = av_log2(rice->ksum / ((i + 1) * 2)) + 1;
+        if (rice->k >= 24)
+            return;
     }
     ksummax = 1 << rice->k + 7;
     ksummin = rice->k ? (1 << rice->k + 6) : 0;
@@ -643,12 +625,8 @@ static void decode_array_0000(APEContext *ctx, GetBitContext *gb,
         }
     }
 
-    for (i = 0; i < blockstodecode; i++) {
-        if (out[i] & 1)
-            out[i] = (out[i] >> 1) + 1;
-        else
-            out[i] = -(out[i] >> 1);
-    }
+    for (i = 0; i < blockstodecode; i++)
+        out[i] = ((out[i] >> 1) ^ ((out[i] & 1) - 1)) + 1;
 }
 
 static void entropy_decode_mono_0000(APEContext *ctx, int blockstodecode)
@@ -908,11 +886,11 @@ static av_always_inline int filter_3800(APEPredictor *p,
     return p->filterA[filter];
 }
 
-static void long_filter_high_3800(int32_t *buffer, int order, int shift,
-                                  int32_t *coeffs, int32_t *delay, int length)
+static void long_filter_high_3800(int32_t *buffer, int order, int shift, int length)
 {
     int i, j;
     int32_t dotprod, sign;
+    int32_t coeffs[256], delay[256];
 
     memset(coeffs, 0, order * sizeof(*coeffs));
     for (i = 0; i < order; i++)
@@ -922,7 +900,7 @@ static void long_filter_high_3800(int32_t *buffer, int order, int shift,
         sign = APESIGN(buffer[i]);
         for (j = 0; j < order; j++) {
             dotprod += delay[j] * coeffs[j];
-            coeffs[j] -= (((delay[j] >> 30) & 2) - 1) * sign;
+            coeffs[j] += ((delay[j] >> 31) | 1) * sign;
         }
         buffer[i] -= dotprod >> shift;
         for (j = 0; j < order - 1; j++)
@@ -942,7 +920,7 @@ static void long_filter_ehigh_3830(int32_t *buffer, int length)
         sign = APESIGN(buffer[i]);
         for (j = 7; j >= 0; j--) {
             dotprod += delay[j] * coeffs[j];
-            coeffs[j] -= (((delay[j] >> 30) & 2) - 1) * sign;
+            coeffs[j] += ((delay[j] >> 31) | 1) * sign;
         }
         for (j = 7; j > 0; j--)
             delay[j] = delay[j - 1];
@@ -956,13 +934,12 @@ static void predictor_decode_stereo_3800(APEContext *ctx, int count)
     APEPredictor *p = &ctx->predictor;
     int32_t *decoded0 = ctx->decoded[0];
     int32_t *decoded1 = ctx->decoded[1];
-    int32_t coeffs[256], delay[256];
     int start = 4, shift = 10;
 
     if (ctx->compression_level == COMPRESSION_LEVEL_HIGH) {
         start = 16;
-        long_filter_high_3800(decoded0, 16, 9, coeffs, delay, count);
-        long_filter_high_3800(decoded1, 16, 9, coeffs, delay, count);
+        long_filter_high_3800(decoded0, 16, 9, count);
+        long_filter_high_3800(decoded1, 16, 9, count);
     } else if (ctx->compression_level == COMPRESSION_LEVEL_EXTRA_HIGH) {
         int order = 128, shift2 = 11;
 
@@ -974,8 +951,8 @@ static void predictor_decode_stereo_3800(APEContext *ctx, int count)
             long_filter_ehigh_3830(decoded1 + order, count - order);
         }
         start = order;
-        long_filter_high_3800(decoded0, order, shift2, coeffs, delay, count);
-        long_filter_high_3800(decoded1, order, shift2, coeffs, delay, count);
+        long_filter_high_3800(decoded0, order, shift2, count);
+        long_filter_high_3800(decoded1, order, shift2, count);
     }
 
     while (count--) {
@@ -1011,12 +988,11 @@ static void predictor_decode_mono_3800(APEContext *ctx, int count)
 {
     APEPredictor *p = &ctx->predictor;
     int32_t *decoded0 = ctx->decoded[0];
-    int32_t coeffs[256], delay[256];
     int start = 4, shift = 10;
 
     if (ctx->compression_level == COMPRESSION_LEVEL_HIGH) {
         start = 16;
-        long_filter_high_3800(decoded0, 16, 9, coeffs, delay, count);
+        long_filter_high_3800(decoded0, 16, 9, count);
     } else if (ctx->compression_level == COMPRESSION_LEVEL_EXTRA_HIGH) {
         int order = 128, shift2 = 11;
 
@@ -1027,7 +1003,7 @@ static void predictor_decode_mono_3800(APEContext *ctx, int count)
             long_filter_ehigh_3830(decoded0 + order, count - order);
         }
         start = order;
-        long_filter_high_3800(decoded0, order, shift2, coeffs, delay, count);
+        long_filter_high_3800(decoded0, order, shift2, count);
     }
 
     while (count--) {
@@ -1305,7 +1281,7 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
             /* Update the adaption coefficients */
             absres = FFABS(res);
             if (absres)
-                *f->adaptcoeffs = ((res & (-1<<31)) ^ (-1<<30)) >>
+                *f->adaptcoeffs = ((res & INT32_MIN) ^ (-(1<<30))) >>
                                   (25 + (absres <= f->avg*3) + (absres <= f->avg*4/3));
             else
                 *f->adaptcoeffs = 0;
@@ -1449,7 +1425,7 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
         }
         if (s->fileversion < 3950) // previous versions overread two bytes
             buf_size += 2;
-        av_fast_malloc(&s->data, &s->data_size, buf_size);
+        av_fast_padded_malloc(&s->data, &s->data_size, buf_size);
         if (!s->data)
             return AVERROR(ENOMEM);
         s->bdsp.bswap_buf((uint32_t *) s->data, (const uint32_t *) buf,
@@ -1472,7 +1448,8 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
             }
             s->ptr += offset;
         } else {
-            init_get_bits(&s->gb, s->ptr, (s->data_end - s->ptr) * 8);
+            if ((ret = init_get_bits8(&s->gb, s->ptr, s->data_end - s->ptr)) < 0)
+                return ret;
             if (s->fileversion > 3800)
                 skip_bits_long(&s->gb, offset * 8);
             else
@@ -1484,14 +1461,13 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
                    nblocks);
             return AVERROR_INVALIDDATA;
         }
-        s->samples = nblocks;
 
         /* Initialize the frame decoder */
         if (init_frame_decoder(s) < 0) {
             av_log(avctx, AV_LOG_ERROR, "Error reading frame header\n");
             return AVERROR_INVALIDDATA;
         }
-
+        s->samples = nblocks;
     }
 
     if (!s->data) {
@@ -1516,10 +1492,8 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = blockstodecode;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     s->error=0;
 
@@ -1563,7 +1537,7 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 1;
 
-    return (s->samples == 0) ? avpkt->size : 0;
+    return !s->samples ? avpkt->size : 0;
 }
 
 static void ape_flush(AVCodecContext *avctx)
diff --git a/libavcodec/apng.h b/libavcodec/apng.h
new file mode 100644
index 0000000000..41249e0df0
--- /dev/null
+++ b/libavcodec/apng.h
@@ -0,0 +1,41 @@
+/*
+ * APNG common header
+ * Copyright (c) 2014 Benoit Fouet
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * APNG common header
+ */
+
+#ifndef AVCODEC_APNG_H
+#define AVCODEC_APNG_H
+
+enum {
+   APNG_DISPOSE_OP_NONE       = 0,
+   APNG_DISPOSE_OP_BACKGROUND = 1,
+   APNG_DISPOSE_OP_PREVIOUS   = 2,
+};
+
+enum {
+    APNG_BLEND_OP_SOURCE = 0,
+    APNG_BLEND_OP_OVER   = 1,
+};
+
+#endif /* AVCODEC_APNG_H */
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 71e55a9fd6..cdd35b08ea 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -21,6 +21,7 @@ OBJS-$(CONFIG_IDCTDSP)                 += arm/idctdsp_init_arm.o        \
 OBJS-$(CONFIG_FLACDSP)                 += arm/flacdsp_init_arm.o        \
                                           arm/flacdsp_arm.o
 OBJS-$(CONFIG_G722DSP)                 += arm/g722dsp_init_arm.o
+OBJS-$(CONFIG_LLAUDDSP)                += arm/lossless_audiodsp_init_arm.o
 OBJS-$(CONFIG_ME_CMP)                  += arm/me_cmp_init_arm.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
 OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
@@ -35,8 +36,8 @@ OBJS-$(CONFIG_VP8DSP)                  += arm/vp8dsp_init_arm.o
 # decoders/encoders
 OBJS-$(CONFIG_AAC_DECODER)             += arm/aacpsdsp_init_arm.o       \
                                           arm/sbrdsp_init_arm.o
-OBJS-$(CONFIG_APE_DECODER)             += arm/apedsp_init_arm.o
 OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o
+OBJS-$(CONFIG_HEVC_DECODER)            += arm/hevcdsp_init_arm.o
 OBJS-$(CONFIG_MLP_DECODER)             += arm/mlpdsp_init_arm.o
 OBJS-$(CONFIG_RV40_DECODER)            += arm/rv40dsp_init_arm.o
 OBJS-$(CONFIG_VC1_DECODER)             += arm/vc1dsp_init_arm.o
@@ -125,9 +126,13 @@ NEON-OBJS-$(CONFIG_VP8DSP)             += arm/vp8dsp_init_neon.o        \
 # decoders/encoders
 NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o           \
                                           arm/sbrdsp_neon.o
-NEON-OBJS-$(CONFIG_APE_DECODER)        += arm/apedsp_neon.o
+NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
 NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
                                           arm/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
+                                          arm/hevcdsp_deblock_neon.o    \
+                                          arm/hevcdsp_idct_neon.o       \
+                                          arm/hevcdsp_qpel_neon.o
 NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
 NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
                                           arm/rv40dsp_neon.o
diff --git a/libavcodec/arm/aac.h b/libavcodec/arm/aac.h
index 4f143cb8a9..cafa881fc7 100644
--- a/libavcodec/arm/aac.h
+++ b/libavcodec/arm/aac.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/aacpsdsp_init_arm.c b/libavcodec/arm/aacpsdsp_init_arm.c
index 6326376004..e04787caae 100644
--- a/libavcodec/arm/aacpsdsp_init_arm.c
+++ b/libavcodec/arm/aacpsdsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/aacpsdsp_neon.S b/libavcodec/arm/aacpsdsp_neon.S
index fb00900a4d..a93bbfea9c 100644
--- a/libavcodec/arm/aacpsdsp_neon.S
+++ b/libavcodec/arm/aacpsdsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/ac3dsp_arm.S b/libavcodec/arm/ac3dsp_arm.S
index ed8eb37845..1aea190de9 100644
--- a/libavcodec/arm/ac3dsp_arm.S
+++ b/libavcodec/arm/ac3dsp_arm.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/ac3dsp_armv6.S b/libavcodec/arm/ac3dsp_armv6.S
index 2028d0b89f..1d2563d4f7 100644
--- a/libavcodec/arm/ac3dsp_armv6.S
+++ b/libavcodec/arm/ac3dsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/ac3dsp_init_arm.c b/libavcodec/arm/ac3dsp_init_arm.c
index a48353a099..a3c32ff407 100644
--- a/libavcodec/arm/ac3dsp_init_arm.c
+++ b/libavcodec/arm/ac3dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,14 @@ void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len);
 void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,
                                 const int16_t *window, unsigned n);
+void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
+                                            const int32_t *coef0,
+                                            const int32_t *coef1,
+                                            int len);
+void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
+                                            const float *coef0,
+                                            const float *coef1,
+                                            int len);
 
 void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd,
                                      int start, int end,
@@ -59,5 +67,7 @@ av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact)
         c->float_to_fixed24      = ff_float_to_fixed24_neon;
         c->extract_exponents     = ff_ac3_extract_exponents_neon;
         c->apply_window_int16    = ff_apply_window_int16_neon;
+        c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
+        c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
     }
 }
diff --git a/libavcodec/arm/ac3dsp_neon.S b/libavcodec/arm/ac3dsp_neon.S
index f97b1907df..89d0ae8048 100644
--- a/libavcodec/arm/ac3dsp_neon.S
+++ b/libavcodec/arm/ac3dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -131,3 +131,47 @@ function ff_apply_window_int16_neon, export=1
 
         pop             {r4,pc}
 endfunc
+
+function ff_ac3_sum_square_butterfly_int32_neon, export=1
+        vmov.i64        q0,  #0
+        vmov.i64        q1,  #0
+        vmov.i64        q2,  #0
+        vmov.i64        q3,  #0
+1:
+        vld1.32         {d16},    [r1]!
+        vld1.32         {d17},    [r2]!
+        vadd.s32        d18, d16, d17
+        vsub.s32        d19, d16, d17
+        vmlal.s32       q0,  d16, d16
+        vmlal.s32       q1,  d17, d17
+        vmlal.s32       q2,  d18, d18
+        vmlal.s32       q3,  d19, d19
+        subs            r3,  r3,  #2
+        bgt             1b
+        vadd.s64        d0,  d0,  d1
+        vadd.s64        d1,  d2,  d3
+        vadd.s64        d2,  d4,  d5
+        vadd.s64        d3,  d6,  d7
+        vst1.64         {q0-q1},  [r0]
+        bx              lr
+endfunc
+
+function ff_ac3_sum_square_butterfly_float_neon, export=1
+        vmov.f32        q0,  #0.0
+        vmov.f32        q1,  #0.0
+1:
+        vld1.32         {d16},    [r1]!
+        vld1.32         {d17},    [r2]!
+        vadd.f32        d18, d16, d17
+        vsub.f32        d19, d16, d17
+        vmla.f32        d0,  d16, d16
+        vmla.f32        d1,  d17, d17
+        vmla.f32        d2,  d18, d18
+        vmla.f32        d3,  d19, d19
+        subs            r3,  r3,  #2
+        bgt             1b
+        vpadd.f32       d0,  d0,  d1
+        vpadd.f32       d1,  d2,  d3
+        vst1.32         {q0},     [r0]
+        bx              lr
+endfunc
diff --git a/libavcodec/arm/asm-offsets.h b/libavcodec/arm/asm-offsets.h
index 0ea2f04e4a..a2174b0a08 100644
--- a/libavcodec/arm/asm-offsets.h
+++ b/libavcodec/arm/asm-offsets.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/audiodsp_arm.h b/libavcodec/arm/audiodsp_arm.h
index e97e804de7..213660dae7 100644
--- a/libavcodec/arm/audiodsp_arm.h
+++ b/libavcodec/arm/audiodsp_arm.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/audiodsp_init_arm.c b/libavcodec/arm/audiodsp_init_arm.c
index ea9ec3ca10..74aa52a4ef 100644
--- a/libavcodec/arm/audiodsp_init_arm.c
+++ b/libavcodec/arm/audiodsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * ARM optimized audio functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/audiodsp_init_neon.c b/libavcodec/arm/audiodsp_init_neon.c
index af532724c8..f7bd162482 100644
--- a/libavcodec/arm/audiodsp_init_neon.c
+++ b/libavcodec/arm/audiodsp_init_neon.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised audio functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/audiodsp_neon.S b/libavcodec/arm/audiodsp_neon.S
index dfb998de32..ab32cef7ab 100644
--- a/libavcodec/arm/audiodsp_neon.S
+++ b/libavcodec/arm/audiodsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised audio functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/blockdsp_arm.h b/libavcodec/arm/blockdsp_arm.h
index 6d9c2c3ed2..59ebeb8466 100644
--- a/libavcodec/arm/blockdsp_arm.h
+++ b/libavcodec/arm/blockdsp_arm.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,6 @@
 
 #include "libavcodec/blockdsp.h"
 
-void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth);
+void ff_blockdsp_init_neon(BlockDSPContext *c);
 
 #endif /* AVCODEC_ARM_BLOCKDSP_ARM_H */
diff --git a/libavcodec/arm/blockdsp_init_arm.c b/libavcodec/arm/blockdsp_init_arm.c
index a0c03674d7..2080d5253f 100644
--- a/libavcodec/arm/blockdsp_init_arm.c
+++ b/libavcodec/arm/blockdsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * ARM optimized block operations
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,10 +24,10 @@
 #include "libavcodec/blockdsp.h"
 #include "blockdsp_arm.h"
 
-av_cold void ff_blockdsp_init_arm(BlockDSPContext *c, unsigned high_bit_depth)
+av_cold void ff_blockdsp_init_arm(BlockDSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
     if (have_neon(cpu_flags))
-        ff_blockdsp_init_neon(c, high_bit_depth);
+        ff_blockdsp_init_neon(c);
 }
diff --git a/libavcodec/arm/blockdsp_init_neon.c b/libavcodec/arm/blockdsp_init_neon.c
index 5081cf0cdf..87c0d6d6eb 100644
--- a/libavcodec/arm/blockdsp_init_neon.c
+++ b/libavcodec/arm/blockdsp_init_neon.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised block operations
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,10 +28,8 @@
 void ff_clear_block_neon(int16_t *block);
 void ff_clear_blocks_neon(int16_t *blocks);
 
-av_cold void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth)
+av_cold void ff_blockdsp_init_neon(BlockDSPContext *c)
 {
-    if (!high_bit_depth) {
-        c->clear_block  = ff_clear_block_neon;
-        c->clear_blocks = ff_clear_blocks_neon;
-    }
+      c->clear_block  = ff_clear_block_neon;
+      c->clear_blocks = ff_clear_blocks_neon;
 }
diff --git a/libavcodec/arm/blockdsp_neon.S b/libavcodec/arm/blockdsp_neon.S
index 98df2c60c4..9fc63cba5b 100644
--- a/libavcodec/arm/blockdsp_neon.S
+++ b/libavcodec/arm/blockdsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised block functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
index 6ff5f1a385..fdbf86b45e 100644
--- a/libavcodec/arm/cabac.h
+++ b/libavcodec/arm/cabac.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,12 +59,18 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
         "tst        %[r_c]        , %[r_c]                      \n\t"
         "bne        2f                                          \n\t"
         "ldr        %[r_c]        , [%[c], %[byte]]             \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
+        "add        %[r_c]        , %[r_c]      , #2            \n\t"
+        "str        %[r_c]        , [%[c], %[byte]]             \n\t"
+#else
         "ldr        %[r_b]        , [%[c], %[end]]              \n\t"
         "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
         "cmp        %[r_c]        , %[r_b]                      \n\t"
         "itt        lt                                          \n\t"
         "addlt      %[r_c]        , %[r_c]      , #2            \n\t"
         "strlt      %[r_c]        , [%[c], %[byte]]             \n\t"
+#endif
         "sub        %[r_c]        , %[low]      , #1            \n\t"
         "add        %[r_b]        , %[tables]   , %[norm_off]   \n\t"
         "eor        %[r_c]        , %[low]      , %[r_c]        \n\t"
diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
index 4aed57603e..6e87111a32 100644
--- a/libavcodec/arm/dca.h
+++ b/libavcodec/arm/dca.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "libavcodec/dcadsp.h"
 #include "libavcodec/mathops.h"
 
-#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4)
+#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB
 
 #define decode_blockcodes decode_blockcodes
 static inline int decode_blockcodes(int code1, int code2, int levels,
@@ -35,46 +35,44 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
 {
     int32_t v0, v1, v2, v3, v4, v5;
 
-    __asm__ ("smmul   %8,  %14, %18           \n"
-             "smmul   %11, %15, %18           \n"
-             "smlabb  %14, %8,  %17, %14      \n"
-             "smlabb  %15, %11, %17, %15      \n"
-             "smmul   %9,  %8,  %18           \n"
-             "smmul   %12, %11, %18           \n"
-             "sub     %14, %14, %16, lsr #1   \n"
-             "sub     %15, %15, %16, lsr #1   \n"
-             "smlabb  %8,  %9,  %17, %8       \n"
-             "smlabb  %11, %12, %17, %11      \n"
-             "smmul   %10, %9,  %18           \n"
-             "smmul   %13, %12, %18           \n"
-             "str     %14, %0                 \n"
-             "str     %15, %4                 \n"
-             "sub     %8,  %8,  %16, lsr #1   \n"
-             "sub     %11, %11, %16, lsr #1   \n"
-             "smlabb  %9,  %10, %17, %9       \n"
-             "smlabb  %12, %13, %17, %12      \n"
-             "smmul   %14, %10, %18           \n"
-             "smmul   %15, %13, %18           \n"
-             "str     %8,  %1                 \n"
-             "str     %11, %5                 \n"
-             "sub     %9,  %9,  %16, lsr #1   \n"
-             "sub     %12, %12, %16, lsr #1   \n"
-             "smlabb  %10, %14, %17, %10      \n"
-             "smlabb  %13, %15, %17, %13      \n"
-             "str     %9,  %2                 \n"
-             "str     %12, %6                 \n"
-             "sub     %10, %10, %16, lsr #1   \n"
-             "sub     %13, %13, %16, lsr #1   \n"
-             "str     %10, %3                 \n"
-             "str     %13, %7                 \n"
-             : "=m"(values[0]), "=m"(values[1]),
-               "=m"(values[2]), "=m"(values[3]),
-               "=m"(values[4]), "=m"(values[5]),
-               "=m"(values[6]), "=m"(values[7]),
-               "=&r"(v0), "=&r"(v1), "=&r"(v2),
+    __asm__ ("smmul   %0,  %6,  %10           \n"
+             "smmul   %3,  %7,  %10           \n"
+             "smlabb  %6,  %0,  %9,  %6       \n"
+             "smlabb  %7,  %3,  %9,  %7       \n"
+             "smmul   %1,  %0,  %10           \n"
+             "smmul   %4,  %3,  %10           \n"
+             "sub     %6,  %6,  %8,  lsr #1   \n"
+             "sub     %7,  %7,  %8,  lsr #1   \n"
+             "smlabb  %0,  %1,  %9,  %0       \n"
+             "smlabb  %3,  %4,  %9,  %3       \n"
+             "smmul   %2,  %1,  %10           \n"
+             "smmul   %5,  %4,  %10           \n"
+             "str     %6,  [%11, #0]          \n"
+             "str     %7,  [%11, #16]         \n"
+             "sub     %0,  %0,  %8,  lsr #1   \n"
+             "sub     %3,  %3,  %8,  lsr #1   \n"
+             "smlabb  %1,  %2,  %9,  %1       \n"
+             "smlabb  %4,  %5,  %9,  %4       \n"
+             "smmul   %6,  %2,  %10           \n"
+             "smmul   %7,  %5,  %10           \n"
+             "str     %0,  [%11, #4]          \n"
+             "str     %3,  [%11, #20]         \n"
+             "sub     %1,  %1,  %8,  lsr #1   \n"
+             "sub     %4,  %4,  %8,  lsr #1   \n"
+             "smlabb  %2,  %6,  %9,  %2       \n"
+             "smlabb  %5,  %7,  %9,  %5       \n"
+             "str     %1,  [%11, #8]          \n"
+             "str     %4,  [%11, #24]         \n"
+             "sub     %2,  %2,  %8,  lsr #1   \n"
+             "sub     %5,  %5,  %8,  lsr #1   \n"
+             "str     %2,  [%11, #12]         \n"
+             "str     %5,  [%11, #28]         \n"
+             : "=&r"(v0), "=&r"(v1), "=&r"(v2),
                "=&r"(v3), "=&r"(v4), "=&r"(v5),
                "+&r"(code1), "+&r"(code2)
-             : "r"(levels - 1), "r"(-levels), "r"(ff_inverse[levels]));
+             : "r"(levels - 1), "r"(-levels),
+               "r"(ff_inverse[levels]), "r"(values)
+             : "memory");
 
     return code1 | code2;
 }
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
index 540048415f..a54951584f 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S
index 70580cdeec..cdc41367e9 100644
--- a/libavcodec/arm/dcadsp_neon.S
+++ b/libavcodec/arm/dcadsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
index c9114d499a..2e09f0ee5d 100644
--- a/libavcodec/arm/dcadsp_vfp.S
+++ b/libavcodec/arm/dcadsp_vfp.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/dct-test.c b/libavcodec/arm/dct-test.c
index 70e5c1cb82..f9076b394f 100644
--- a/libavcodec/arm/dct-test.c
+++ b/libavcodec/arm/dct-test.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fft_fixed_init_arm.c b/libavcodec/arm/fft_fixed_init_arm.c
index 2f749e49df..b60bb9fa85 100644
--- a/libavcodec/arm/fft_fixed_init_arm.c
+++ b/libavcodec/arm/fft_fixed_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,9 @@ av_cold void ff_fft_fixed_init_arm(FFTContext *s)
 
     if (have_neon(cpu_flags)) {
         s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
+#if CONFIG_FFT
         s->fft_calc        = ff_fft_fixed_calc_neon;
+#endif
 
 #if CONFIG_MDCT
         if (!s->inverse && s->nbits >= 3) {
diff --git a/libavcodec/arm/fft_fixed_neon.S b/libavcodec/arm/fft_fixed_neon.S
index c70a18991a..2651607544 100644
--- a/libavcodec/arm/fft_fixed_neon.S
+++ b/libavcodec/arm/fft_fixed_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index bc143c10fb..5087f5f64d 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,8 +48,10 @@ av_cold void ff_fft_init_arm(FFTContext *s)
     }
 
     if (have_neon(cpu_flags)) {
+#if CONFIG_FFT
         s->fft_permute  = ff_fft_permute_neon;
         s->fft_calc     = ff_fft_calc_neon;
+#endif
 #if CONFIG_MDCT
         s->imdct_calc   = ff_imdct_calc_neon;
         s->imdct_half   = ff_imdct_half_neon;
diff --git a/libavcodec/arm/fft_neon.S b/libavcodec/arm/fft_neon.S
index b161015e39..48f8dfc424 100644
--- a/libavcodec/arm/fft_neon.S
+++ b/libavcodec/arm/fft_neon.S
@@ -7,20 +7,20 @@
  * This algorithm (though not any of the implementation details) is
  * based on libdjbfft by D. J. Bernstein.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
index c2801fa1a9..ac601325f2 100644
--- a/libavcodec/arm/fft_vfp.S
+++ b/libavcodec/arm/fft_vfp.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/flacdsp_arm.S b/libavcodec/arm/flacdsp_arm.S
index d4441da1bb..f8861c5967 100644
--- a/libavcodec/arm/flacdsp_arm.S
+++ b/libavcodec/arm/flacdsp_arm.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/flacdsp_init_arm.c b/libavcodec/arm/flacdsp_init_arm.c
index 0530cf7a85..564e3dc79b 100644
--- a/libavcodec/arm/flacdsp_init_arm.c
+++ b/libavcodec/arm/flacdsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,9 +24,9 @@
 void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order,
                         int qlevel, int len);
 
-av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt,
+av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
                                  int bps)
 {
-    if (bps <= 16)
-        c->lpc = ff_flac_lpc_16_arm;
+    if (CONFIG_FLAC_DECODER)
+        c->lpc16 = ff_flac_lpc_16_arm;
 }
diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c
index 27d3c88011..58589c46ef 100644
--- a/libavcodec/arm/fmtconvert_init_arm.c
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * ARM optimized Format Conversion Utils
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S
index 5e0ac68843..a9ad57f793 100644
--- a/libavcodec/arm/fmtconvert_neon.S
+++ b/libavcodec/arm/fmtconvert_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised Format Conversion Utils
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
index 4e43f425a5..b14af454eb 100644
--- a/libavcodec/arm/fmtconvert_vfp.S
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/g722dsp_init_arm.c b/libavcodec/arm/g722dsp_init_arm.c
index 5edf619f17..c0e5d8b989 100644
--- a/libavcodec/arm/g722dsp_init_arm.c
+++ b/libavcodec/arm/g722dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/g722dsp_neon.S b/libavcodec/arm/g722dsp_neon.S
index 5fa3c279e9..757e53f167 100644
--- a/libavcodec/arm/g722dsp_neon.S
+++ b/libavcodec/arm/g722dsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions for G722 coding
  * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264chroma_init_arm.c b/libavcodec/arm/h264chroma_init_arm.c
index 6f365533cf..13f7e0d702 100644
--- a/libavcodec/arm/h264chroma_init_arm.c
+++ b/libavcodec/arm/h264chroma_init_arm.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised H.264 chroma functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
index ee7011b00b..77ed3c0daa 100644
--- a/libavcodec/arm/h264cmc_neon.S
+++ b/libavcodec/arm/h264cmc_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index 7afd350890..90144d0da2 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -72,11 +72,14 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
 static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
                                       const int chroma_format_idc)
 {
+#if HAVE_NEON
     if (bit_depth == 8) {
         c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
         c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
         c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+        if(chroma_format_idc == 1){
         c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+        }
 
         c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
         c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
@@ -96,6 +99,7 @@ static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
         c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon;
         c->h264_idct8_add4      = ff_h264_idct8_add4_neon;
     }
+#endif // HAVE_NEON
 }
 
 av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
@@ -103,8 +107,10 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if HAVE_ARMV6
     if (have_setend(cpu_flags))
         c->startcode_find_candidate = ff_startcode_find_candidate_armv6;
+#endif
     if (have_neon(cpu_flags))
         h264dsp_init_neon(c, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 5e75565b3e..274a547f26 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index f588f3e744..4f68bdb9f5 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264pred_init_arm.c b/libavcodec/arm/h264pred_init_arm.c
index a445d4d667..cc324d7dca 100644
--- a/libavcodec/arm/h264pred_init_arm.c
+++ b/libavcodec/arm/h264pred_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,6 +49,7 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                         const int bit_depth,
                                         const int chroma_format_idc)
 {
+#if HAVE_NEON
     const int high_depth = bit_depth > 8;
 
     if (high_depth)
@@ -81,6 +82,7 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
     if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
         codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
         h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+#endif // HAVE_NEON
 }
 
 av_cold void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
diff --git a/libavcodec/arm/h264pred_neon.S b/libavcodec/arm/h264pred_neon.S
index 332f94bd53..4dc47ba8f1 100644
--- a/libavcodec/arm/h264pred_neon.S
+++ b/libavcodec/arm/h264pred_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264qpel_init_arm.c b/libavcodec/arm/h264qpel_init_arm.c
index 01615b5719..71237be359 100644
--- a/libavcodec/arm/h264qpel_init_arm.c
+++ b/libavcodec/arm/h264qpel_init_arm.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264qpel_neon.S b/libavcodec/arm/h264qpel_neon.S
index 6c51250d5b..21336c6c32 100644
--- a/libavcodec/arm/h264qpel_neon.S
+++ b/libavcodec/arm/h264qpel_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hevcdsp_arm.h b/libavcodec/arm/hevcdsp_arm.h
new file mode 100644
index 0000000000..7735df9cd2
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_arm.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVCDSP_ARM_H
+#define AVCODEC_ARM_HEVCDSP_ARM_H
+
+#include "libavcodec/hevcdsp.h"
+
+void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth);
+
+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
new file mode 100644
index 0000000000..166bddb104
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+.macro hevc_loop_filter_chroma_start
+        ldr      r12, [r2]
+        ldr      r3, [r2, #4]
+        add      r2, r3, r12
+        cmp      r2, #0
+        it       eq
+        bxeq     lr
+.endm
+
+.macro hevc_loop_filter_chroma_body
+        vsubl.u8  q3, d4, d2
+        vsubl.u8  q11, d18, d19
+        vshl.i16  q3, #2
+        vadd.i16  q11, q3
+        vdup.16   d0, r12
+        vdup.16   d1, r3
+        vrshr.s16 q11, q11, #3
+        vneg.s16  q12, q0
+        vmovl.u8  q2, d4
+        vmin.s16  q11, q11, q0
+        vmax.s16  q11, q11, q12
+        vaddw.u8  q1, q11, d2
+        vsub.i16  q2, q11
+        vqmovun.s16 d2, q1
+        vqmovun.s16 d4, q2
+.endm
+
+.macro hevc_loop_filter_luma_start
+        ldr     r12, [r3]
+        ldr      r3, [r3, #4]
+        lsl      r3, #16
+        orr      r3, r12
+        cmp      r3, #0
+        it       eq
+        bxeq     lr
+        lsr      r3, #16
+.endm
+
+.macro hevc_loop_filter_luma_body
+        vmovl.u8  q8, d16
+        vmovl.u8  q9, d18
+        vmovl.u8  q10, d20
+        vmovl.u8  q11, d22
+        vmovl.u8  q12, d24
+        vmovl.u8  q13, d26
+        vmovl.u8  q14, d28
+        vmovl.u8  q15, d30
+
+        vadd.i16   q7, q9, q11
+        vadd.i16   q6, q14, q12
+        vsub.i16   q7, q10
+        vsub.i16   q6, q13
+        vabd.s16   q7, q7, q10
+        vabd.s16   q6, q6, q13
+
+
+        vdup.16    q0, r2
+        vmov       q4, q7
+        vmov       q5, q6
+        vdup.16    d4, r12
+        vtrn.16    q7, q4
+        vtrn.16    q6, q5
+
+        vshl.u64   q7, #32
+        vshr.u64   q4, #32
+        vshl.u64   q6, #32
+        vshr.u64   q5, #32
+        vshr.u64   q7, #32
+        vshr.u64   q6, #32
+        vshl.u64   q5, #32
+        vshl.u64   q4, #32
+        vorr       q6, q5
+        vorr       q7, q4
+        vdup.16    d5, r3
+        vadd.i16   q5, q7, q6
+
+        vmov       q4, q5
+        vmov       q3, q5
+        vtrn.32    q3, q4
+
+        vadd.i16   q4, q3
+
+        vshl.s16   q5, q5, #1
+        vcgt.s16   q3, q0, q4
+
+        vmovn.i16  d6, q3
+        vshr.s16   q1, q0, #2
+        vmovn.i16  d6, q3
+        vcgt.s16   q5, q1, q5
+        vmov       r7, s12
+        cmp        r7, #0
+        beq        bypasswrite
+
+        vpadd.i32  d0, d14, d12
+        vpadd.i32  d1, d15, d13
+        vmov       q4, q2
+        vshl.s16   q2, #2
+        vshr.s16   q1, q1, #1
+        vrhadd.s16 q2, q4
+
+        vabd.s16   q7, q8, q11
+        vaba.s16   q7, q15, q12
+
+        vmovn.i32  d0, q0
+        vmov       r5, r6, s0, s1
+        vcgt.s16   q6, q1, q7
+        vand       q5, q5, q6
+        vabd.s16   q7, q11, q12
+        vcgt.s16   q6, q2, q7
+        vand       q5, q5, q6
+
+        vmov       q2, q5
+        vtrn.s16   q5, q2
+        vshr.u64   q2, #32
+        vshl.u64   q5, #32
+        vshl.u64   q2, #32
+        vshr.u64   q5, #32
+        vorr       q5, q2
+
+        vmov       q2, q5
+        vshl.i16   q7, q4, #1
+        vtrn.32    q2, q5
+        vand       q5, q2
+        vneg.s16   q6, q7
+        vmovn.i16  d4, q5
+        vmovn.i16  d4, q2
+        vmov       r8, s8
+
+        and        r9, r8, r7
+        cmp        r9, #0
+        beq        weakfilter_\@
+
+        vadd.i16  q2, q11, q12
+        vadd.i16  q4, q9, q8
+        vadd.i16  q1, q2, q10
+        vdup.16   d10, r9
+        vadd.i16  q0, q1, q9
+        vshl.i16  q4, #1
+        lsr        r9, #16
+        vadd.i16  q1, q0
+        vrshr.s16 q3, q0, #2
+        vadd.i16  q1, q13
+        vadd.i16  q4, q0
+        vsub.i16  q3, q10
+        vrshr.s16 q1, #3
+        vrshr.s16 q4, #3
+        vmax.s16  q3, q6
+        vsub.i16  q1, q11
+        vsub.i16  q4, q9
+        vmin.s16  q3, q7
+        vmax.s16  q4, q6
+        vmax.s16  q1, q6
+        vadd.i16  q3, q10
+        vmin.s16  q4, q7
+        vmin.s16  q1, q7
+        vdup.16   d11, r9
+        vadd.i16  q4, q9
+        vadd.i16  q1, q11
+        vbit      q9, q4, q5
+        vadd.i16  q4, q2, q13
+        vbit      q11, q1, q5
+        vadd.i16  q0, q4, q14
+        vadd.i16  q2, q15, q14
+        vadd.i16  q4, q0
+
+        vshl.i16  q2, #1
+        vadd.i16  q4, q10
+        vbit      q10, q3, q5
+        vrshr.s16 q4, #3
+        vadd.i16  q2, q0
+        vrshr.s16 q3, q0, #2
+        vsub.i16  q4, q12
+        vrshr.s16 q2, #3
+        vsub.i16  q3, q13
+        vmax.s16  q4, q6
+        vsub.i16  q2, q14
+        vmax.s16  q3, q6
+        vmin.s16  q4, q7
+        vmax.s16  q2, q6
+        vmin.s16  q3, q7
+        vadd.i16  q4, q12
+        vmin.s16  q2, q7
+        vadd.i16  q3, q13
+        vbit      q12, q4, q5
+        vadd.i16  q2, q14
+        vbit      q13, q3, q5
+        vbit      q14, q2, q5
+
+weakfilter_\@:
+        mvn       r8, r8
+        and       r9, r8, r7
+        cmp       r9, #0
+        beq       ready_\@
+
+        vdup.16    q4, r2
+
+        vdup.16   d10, r9
+        lsr       r9, #16
+        vmov       q1, q4
+        vdup.16   d11, r9
+        vshr.s16   q1, #1
+        vsub.i16  q2, q12, q11
+        vadd.i16   q4, q1
+        vshl.s16  q0, q2, #3
+        vshr.s16   q4, #3
+        vadd.i16  q2, q0
+        vsub.i16  q0, q13, q10
+        vsub.i16  q2, q0
+        vshl.i16  q0, q0, #1
+        vsub.i16  q2, q0
+        vshl.s16  q1, q7, 2
+        vrshr.s16 q2, q2, #4
+        vadd.i16  q1, q7
+        vabs.s16  q3, q2
+        vshr.s16  q6, q6, #1
+        vcgt.s16  q1, q1, q3
+        vand      q5, q1
+        vshr.s16  q7, q7, #1
+        vmax.s16  q2, q2, q6
+        vmin.s16  q2, q2, q7
+
+        vshr.s16  q7, q7, #1
+        vrhadd.s16 q3, q9, q11
+        vneg.s16  q6, q7
+        vsub.s16  q3, q10
+        vdup.16   d2, r5
+        vhadd.s16 q3, q2
+        vdup.16   d3, r6
+        vmax.s16  q3, q3, q6
+        vcgt.s16  q1, q4, q1
+        vmin.s16  q3, q3, q7
+        vand      q1, q5
+        vadd.i16  q3, q10
+        lsr       r5, #16
+        lsr       r6, #16
+        vbit      q10, q3, q1
+
+        vrhadd.s16 q3, q14, q12
+        vdup.16   d2, r5
+        vsub.s16  q3, q13
+        vdup.16   d3, r6
+        vhsub.s16 q3, q2
+        vcgt.s16  q1, q4, q1
+        vmax.s16  q3, q3, q6
+        vand      q1, q5
+        vmin.s16  q3, q3, q7
+        vadd.i16  q3, q13
+        vbit      q13, q3, q1
+        vadd.i16  q0, q11, q2
+        vsub.i16  q4, q12, q2
+        vbit      q11, q0, q5
+        vbit      q12, q4, q5
+
+ready_\@:
+        vqmovun.s16 d16, q8
+        vqmovun.s16 d18, q9
+        vqmovun.s16 d20, q10
+        vqmovun.s16 d22, q11
+        vqmovun.s16 d24, q12
+        vqmovun.s16 d26, q13
+        vqmovun.s16 d28, q14
+        vqmovun.s16 d30, q15
+.endm
+
+function ff_hevc_v_loop_filter_luma_neon, export=1
+        hevc_loop_filter_luma_start
+        push     {r5-r11}
+        vpush    {d8-d15}
+        sub      r0, #4
+        vld1.8   {d16}, [r0], r1
+        vld1.8   {d18}, [r0], r1
+        vld1.8   {d20}, [r0], r1
+        vld1.8   {d22}, [r0], r1
+        vld1.8   {d24}, [r0], r1
+        vld1.8   {d26}, [r0], r1
+        vld1.8   {d28}, [r0], r1
+        vld1.8   {d30}, [r0], r1
+        sub      r0, r0, r1, lsl #3
+        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+        hevc_loop_filter_luma_body
+        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+        vst1.8   {d16}, [r0], r1
+        vst1.8   {d18}, [r0], r1
+        vst1.8   {d20}, [r0], r1
+        vst1.8   {d22}, [r0], r1
+        vst1.8   {d24}, [r0], r1
+        vst1.8   {d26}, [r0], r1
+        vst1.8   {d28}, [r0], r1
+        vst1.8   {d30}, [r0]
+        vpop     {d8-d15}
+        pop      {r5-r11}
+        bx lr
+endfunc
+
+function ff_hevc_h_loop_filter_luma_neon, export=1
+        hevc_loop_filter_luma_start
+        push     {r5-r11}
+        vpush    {d8-d15}
+        sub      r0, r0, r1, lsl #2
+        vld1.8  {d16}, [r0], r1
+        vld1.8  {d18}, [r0], r1
+        vld1.8  {d20}, [r0], r1
+        vld1.8  {d22}, [r0], r1
+        vld1.8  {d24}, [r0], r1
+        vld1.8  {d26}, [r0], r1
+        vld1.8  {d28}, [r0], r1
+        vld1.8  {d30}, [r0], r1
+        sub        r0, r0, r1, lsl #3
+        add        r0, r1
+        hevc_loop_filter_luma_body
+        vst1.8   {d18}, [r0], r1
+        vst1.8   {d20}, [r0], r1
+        vst1.8   {d22}, [r0], r1
+        vst1.8   {d24}, [r0], r1
+        vst1.8   {d26}, [r0], r1
+        vst1.8   {d28}, [r0]
+bypasswrite:
+        vpop     {d8-d15}
+        pop      {r5-r11}
+        bx lr
+endfunc
+
+function ff_hevc_v_loop_filter_chroma_neon, export=1
+        hevc_loop_filter_chroma_start
+        sub      r0, #4
+        vld1.8   {d16}, [r0], r1
+        vld1.8   {d17}, [r0], r1
+        vld1.8   {d18}, [r0], r1
+        vld1.8   {d2},  [r0], r1
+        vld1.8   {d4},  [r0], r1
+        vld1.8   {d19}, [r0], r1
+        vld1.8   {d20}, [r0], r1
+        vld1.8   {d21}, [r0], r1
+        sub      r0, r0, r1, lsl #3
+        transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
+        hevc_loop_filter_chroma_body
+        transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
+        vst1.8   {d16}, [r0], r1
+        vst1.8   {d17}, [r0], r1
+        vst1.8   {d18}, [r0], r1
+        vst1.8   {d2},  [r0], r1
+        vst1.8   {d4},  [r0], r1
+        vst1.8   {d19}, [r0], r1
+        vst1.8   {d20}, [r0], r1
+        vst1.8   {d21}, [r0]
+        bx       lr
+endfunc
+
+function ff_hevc_h_loop_filter_chroma_neon, export=1
+        hevc_loop_filter_chroma_start
+        sub      r0, r0, r1, lsl #1
+        vld1.8   {d18}, [r0], r1
+        vld1.8   {d2}, [r0], r1
+        vld1.8   {d4}, [r0], r1
+        vld1.8   {d19}, [r0]
+        sub      r0, r0, r1, lsl #1
+        hevc_loop_filter_chroma_body
+        vst1.8   {d2}, [r0], r1
+        vst1.8   {d4}, [r0]
+        bx       lr
+endfunc
diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S
new file mode 100644
index 0000000000..13d540e5ff
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+function ff_hevc_idct_4x4_dc_neon_8, export=1
+        ldrsh       r1, [r0]
+        ldr         r2, =0x20
+        add         r1, #1
+        asr         r1, #1
+        add         r1, r2
+        asr         r1, #6
+        vdup.16     q0, r1
+        vdup.16     q1, r1
+        vst1.16     {q0, q1}, [r0]
+        bx lr
+endfunc
+
+function ff_hevc_idct_8x8_dc_neon_8, export=1
+        ldrsh       r1, [r0]
+        ldr         r2, =0x20
+        add         r1, #1
+        asr         r1, #1
+        add         r1, r2
+        asr         r1, #6
+        vdup.16     q8, r1
+        vdup.16     q9, r1
+        vmov.16     q10, q8
+        vmov.16     q11, q8
+        vmov.16     q12, q8
+        vmov.16     q13, q8
+        vmov.16     q14, q8
+        vmov.16     q15, q8
+        vstm        r0, {q8-q15}
+        bx lr
+endfunc
+
+function ff_hevc_idct_16x16_dc_neon_8, export=1
+        ldrsh       r1, [r0]
+        ldr         r2, =0x20
+        add         r1, #1
+        asr         r1, #1
+        add         r1, r2
+        asr         r1, #6
+        vdup.16     q8, r1
+        vdup.16     q9, r1
+        vmov.16     q10, q8
+        vmov.16     q11, q8
+        vmov.16     q12, q8
+        vmov.16     q13, q8
+        vmov.16     q14, q8
+        vmov.16     q15, q8
+        vstm        r0!, {q8-q15}
+        vstm        r0!, {q8-q15}
+        vstm        r0!, {q8-q15}
+        vstm        r0, {q8-q15}
+        bx lr
+endfunc
+
+function ff_hevc_idct_32x32_dc_neon_8, export=1
+        ldrsh       r1, [r0]
+        ldr         r2, =0x20
+        add         r1, #1
+        asr         r1, #1
+        add         r1, r2
+        asr         r1, #6
+        mov         r3, #16
+        vdup.16     q8, r1
+        vdup.16     q9, r1
+        vmov.16     q10, q8
+        vmov.16     q11, q8
+        vmov.16     q12, q8
+        vmov.16     q13, q8
+        vmov.16     q14, q8
+        vmov.16     q15, q8
+1:      subs        r3, #1
+        vstm        r0!, {q8-q15}
+        bne         1b
+        bx lr
+endfunc
+
+function ff_hevc_transform_add_4x4_neon_8, export=1
+        vldm        r1, {q0-q1}
+        vld1.32     d4[0], [r0], r2
+        vld1.32     d4[1], [r0], r2
+        vld1.32     d5[0], [r0], r2
+        vld1.32     d5[1], [r0], r2
+        sub         r0, r0, r2, lsl #2
+        vmovl.u8    q8, d4
+        vmovl.u8    q9, d5
+        vqadd.s16   q0, q0, q8
+        vqadd.s16   q1, q1, q9
+        vqmovun.s16 d0, q0
+        vqmovun.s16 d1, q1
+        vst1.32     d0[0], [r0], r2
+        vst1.32     d0[1], [r0], r2
+        vst1.32     d1[0], [r0], r2
+        vst1.32     d1[1], [r0], r2
+        bx          lr
+endfunc
+
+function ff_hevc_transform_add_8x8_neon_8, export=1
+        mov         r3,   #8
+1:      subs        r3,   #1
+        vld1.16     {q0}, [r1]!
+        vld1.8      d16,  [r0]
+        vmovl.u8    q8,   d16
+        vqadd.s16   q0,   q8
+        vqmovun.s16 d0,   q0
+        vst1.32     d0,   [r0], r2
+        bne         1b
+        bx          lr
+endfunc
+
+function ff_hevc_transform_add_16x16_neon_8, export=1
+        mov         r3,   #16
+1:      subs        r3,   #1
+        vld1.16     {q0, q1}, [r1]!
+        vld1.8      {q8},  [r0]
+        vmovl.u8    q9,  d16
+        vmovl.u8    q10, d17
+        vqadd.s16   q0,  q9
+        vqadd.s16   q1,  q10
+        vqmovun.s16 d0,  q0
+        vqmovun.s16 d1,  q1
+        vst1.8      {q0},   [r0], r2
+        bne         1b
+        bx          lr
+endfunc
+
+function ff_hevc_transform_add_32x32_neon_8, export=1
+        mov         r3,   #32
+1:      subs        r3,   #1
+        vldm        r1!, {q0-q3}
+        vld1.8      {q8, q9},  [r0]
+        vmovl.u8    q10, d16
+        vmovl.u8    q11, d17
+        vmovl.u8    q12, d18
+        vmovl.u8    q13, d19
+        vqadd.s16   q0,  q10
+        vqadd.s16   q1,  q11
+        vqadd.s16   q2,  q12
+        vqadd.s16   q3,  q13
+        vqmovun.s16 d0,  q0
+        vqmovun.s16 d1,  q1
+        vqmovun.s16 d2,  q2
+        vqmovun.s16 d3,  q3
+        vst1.8     {q0, q1},   [r0], r2
+        bne         1b
+        bx          lr
+endfunc
+
+.macro  transpose_16b_8x8   r0, r1, r2, r3, r4, r5, r6, r7
+        vtrn.64         \r0, \r4
+        vtrn.64         \r1, \r5
+        vtrn.64         \r2, \r6
+        vtrn.64         \r3, \r7
+        vtrn.32         \r0, \r2
+        vtrn.32         \r1, \r3
+        vtrn.32         \r4, \r6
+        vtrn.32         \r5, \r7
+        vtrn.16         \r0, \r1
+        vtrn.16         \r2, \r3
+        vtrn.16         \r4, \r5
+        vtrn.16         \r6, \r7
+.endm
+
+// in 4 q regs
+// output 8 d regs
+.macro transpose_16b_4x4    r0, r1, r2, r3
+        vtrn.32         \r0, \r2
+        vtrn.32         \r1, \r3
+        vtrn.16         \r0, \r1
+        vtrn.16         \r2, \r3
+.endm
+
+/* uses registers q2 - q9 for temp values */
+/* TODO: reorder */
+.macro tr4_luma_shift r0, r1, r2, r3, shift
+        vaddl.s16   q5, \r0, \r2    // c0 = src0 + src2
+        vaddl.s16   q2, \r2, \r3    // c1 = src2 + src3
+        vsubl.s16   q4, \r0, \r3    // c2 = src0 - src3
+        vmull.s16   q6, \r1, d0[0]  // c3 = 74 * src1
+
+        vaddl.s16   q7, \r0, \r3    // src0 + src3
+        vsubw.s16   q7, q7, \r2     // src0 - src2 + src3
+        vmul.s32    q7, q7, d0[0]   // dst2 = 74 * (src0 - src2 + src3)
+
+        vmul.s32    q8, q5, d0[1]   // 29 * c0
+        vmul.s32    q9, q2, d1[0]   // 55 * c1
+        vadd.s32    q8, q9          // 29 * c0 + 55 * c1
+        vadd.s32    q8, q6          // dst0 = 29 * c0 + 55 * c1 + c3
+
+        vmul.s32    q2, q2, d0[1]   // 29 * c1
+        vmul.s32    q9, q4, d1[0]   // 55 * c2
+        vsub.s32    q9, q2          // 55 * c2 - 29 * c1
+        vadd.s32    q9, q6          // dst1 = 55 * c2 - 29 * c1 + c3
+
+        vmul.s32    q5, q5, d1[0]   // 55 * c0
+        vmul.s32    q4, q4, d0[1]   // 29 * c2
+        vadd.s32    q5, q4          // 55 * c0 + 29 * c2
+        vsub.s32    q5, q6          // dst3 = 55 * c0 + 29 * c2 - c3
+
+        vqrshrn.s32   \r0, q8, \shift
+        vqrshrn.s32   \r1, q9, \shift
+        vqrshrn.s32   \r2, q7, \shift
+        vqrshrn.s32   \r3, q5, \shift
+.endm
+
+/* uses registers q2 - q6 for temp values */
+.macro tr4 r0, r1, r2, r3
+        vmull.s16  q4, \r1, d0[0]   // 83 * src1
+        vmull.s16  q6, \r1, d0[1]   // 36 * src1
+        vshll.s16  q2, \r0, #6   // 64 * src0
+        vshll.s16  q3, \r2, #6   // 64 * src2
+        vadd.s32   q5, q2, q3    // 64 * (src0 + src2)     e0
+        vsub.s32   q2, q2, q3    // 64 * (src0 - src2)     e1
+        vmlal.s16  q4, \r3, d0[1]   // 83 * src1 + 36 * src3  o0
+        vmlsl.s16  q6, \r3, d0[0]   // 36 * src1 - 83 * src3  o1
+
+        vsub.s32   q3, q5, q4    // e0 - o0
+        vadd.s32   q4, q5, q4    // e0 + o0
+        vadd.s32   q5, q2, q6    // e1 + o1
+        vsub.s32   q6, q2, q6    // e1 - o1
+.endm
+
+.macro tr4_shift r0, r1, r2, r3, shift
+        vmull.s16  q4, \r1, d0[0]   // 83 * src1
+        vmull.s16  q6, \r1, d0[1]   // 36 * src1
+        vshll.s16  q2, \r0, #6   // 64 * src0
+        vshll.s16  q3, \r2, #6   // 64 * src2
+        vadd.s32   q5, q2, q3    // 64 * (src0 + src2)     e0
+        vsub.s32   q2, q2, q3    // 64 * (src0 - src2)     e1
+        vmlal.s16  q4, \r3, d0[1]   // 83 * src1 + 36 * src3  o0
+        vmlsl.s16  q6, \r3, d0[0]   // 36 * src1 - 83 * src3  o1
+
+        vsub.s32   q3, q5, q4    // e0 - o0
+        vadd.s32   q4, q5, q4    // e0 + o0
+        vadd.s32   q5, q2, q6    // e1 + o1
+        vsub.s32   q6, q2, q6    // e1 - o1
+
+        vqrshrn.s32   \r0, q4, \shift
+        vqrshrn.s32   \r1, q5, \shift
+        vqrshrn.s32   \r2, q6, \shift
+        vqrshrn.s32   \r3, q3, \shift
+.endm
+
+function ff_hevc_transform_4x4_neon_8, export=1
+        vpush       {d8-d15}
+        vld1.16     {q14, q15}, [r0]  // coeffs
+        ldr         r3, =0x00240053 // 36 and 83
+        vmov.32     d0[0], r3
+
+        tr4_shift d28, d29, d30, d31, #7
+
+        vtrn.16     d28, d29
+        vtrn.16     d30, d31
+        vtrn.32     q14, q15
+
+        tr4_shift d28, d29, d30, d31, #12
+
+        vtrn.16     d28, d29
+        vtrn.16     d30, d31
+        vtrn.32     q14, q15
+
+        vst1.16     {q14, q15}, [r0]
+        vpop        {d8-d15}
+        bx lr
+endfunc
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+        vpush       {d8-d15}
+        vld1.16     {q14, q15}, [r0]  // coeffs
+        ldr         r3, =0x4a  // 74
+        vmov.32     d0[0], r3
+        ldr         r3, =0x1d  // 29
+        vmov.32     d0[1], r3
+        ldr         r3, =0x37  // 55
+        vmov.32     d1[0], r3
+
+        tr4_luma_shift d28, d29, d30, d31, #7
+
+        vtrn.16     d28, d29
+        vtrn.16     d30, d31
+        vtrn.32     q14, q15
+
+        tr4_luma_shift d28, d29, d30, d31, #12
+
+        vtrn.16     d28, d29
+        vtrn.16     d30, d31
+        vtrn.32     q14, q15
+        vst1.16     {q14, q15}, [r0]
+        vpop        {d8-d15}
+        bx lr
+endfunc
+
+.macro tr8_begin in0, in1, in2, in3
+        vmull.s16  q7, \in0, d1[1]   // 89 * src1
+        vmull.s16  q8, \in0, d1[0]   // 75 * src1
+        vmull.s16  q9, \in0, d1[3]   // 50 * src1
+        vmull.s16  q10, \in0, d1[2]  // 18 * src1
+
+        vmlal.s16  q7, \in1, d1[0]   // 75 * src3
+        vmlsl.s16  q8, \in1, d1[2]   //-18 * src3
+        vmlsl.s16  q9, \in1, d1[1]   //-89 * src3
+        vmlsl.s16  q10, \in1, d1[3]  //-50 * src3
+
+        vmlal.s16  q7, \in2, d1[3]   // 50 * src5
+        vmlsl.s16  q8, \in2, d1[1]   //-89 * src5
+        vmlal.s16  q9, \in2, d1[2]   // 18 * src5
+        vmlal.s16  q10, \in2, d1[0]  // 75 * src5
+
+        vmlal.s16  q7, \in3, d1[2]   // 18 * src7
+        vmlsl.s16  q8, \in3, d1[3]   //-50 * src7
+        vmlal.s16  q9, \in3, d1[0]   // 75 * src7
+        vmlsl.s16  q10, \in3, d1[1]  //-89 * src7
+.endm
+
+.macro tr8_end shift
+        vadd.s32   q1, q4, q7   //  e_8[0] + o_8[0], dst[0]
+        vsub.s32   q4, q4, q7   //  e_8[0] - o_8[0], dst[7]
+
+        vadd.s32   q2, q5, q8   // e_8[1] + o_8[1], dst[1]
+        vsub.s32   q5, q5, q8   // e_8[1] - o_8[1], dst[6]
+
+        vadd.s32   q11, q6, q9  // e_8[2] + o_8[2], dst[2]
+        vsub.s32    q6, q6, q9  // e_8[2] - o_8[2], dst[5]
+
+        vadd.s32   q12, q3, q10 // e_8[3] + o_8[3], dst[3]
+        vsub.s32   q3, q3, q10  // e_8[3] - o_8[3], dst[4]
+        vqrshrn.s32   d2, q1, \shift
+        vqrshrn.s32   d3, q2, \shift
+        vqrshrn.s32   d4, q11, \shift
+        vqrshrn.s32   d5, q12, \shift
+        vqrshrn.s32   d6, q3, \shift
+        vqrshrn.s32   d7, q6, \shift
+        vqrshrn.s32   d9, q4, \shift
+        vqrshrn.s32   d8, q5, \shift
+.endm
+
+function ff_hevc_transform_8x8_neon_8, export=1
+        push   {r4-r8}
+        vpush {d8-d15}
+        mov    r5, #16
+
+        adr       r3, tr4f
+        vld1.16   {d0, d1}, [r3]
+
+        // left half
+        vld1.16 {d24}, [r0], r5
+        vld1.16 {d25}, [r0], r5
+        vld1.16 {d26}, [r0], r5
+        vld1.16 {d27}, [r0], r5
+        vld1.16 {d28}, [r0], r5
+        vld1.16 {d29}, [r0], r5
+        vld1.16 {d30}, [r0], r5
+        vld1.16 {d31}, [r0], r5
+        sub      r0, #128
+        tr8_begin d25, d27, d29, d31
+        tr4       d24, d26, d28, d30
+        tr8_end   #7
+        vst1.16 {d2}, [r0], r5
+        vst1.16 {d3}, [r0], r5
+        vst1.16 {d4}, [r0], r5
+        vst1.16 {d5}, [r0], r5
+        vst1.16 {d6}, [r0], r5
+        vst1.16 {d7}, [r0], r5
+        vst1.16 {d8}, [r0], r5
+        vst1.16 {d9}, [r0], r5
+        sub      r0, #128
+        //skip right half if col_limit in r1 is less than 4
+        cmp      r1, #4
+        blt      1f
+        //right half
+        add      r0, #8
+        vld1.16 {d24}, [r0], r5
+        vld1.16 {d25}, [r0], r5
+        vld1.16 {d26}, [r0], r5
+        vld1.16 {d27}, [r0], r5
+        vld1.16 {d28}, [r0], r5
+        vld1.16 {d29}, [r0], r5
+        vld1.16 {d30}, [r0], r5
+        vld1.16 {d31}, [r0], r5
+        sub      r0, #128
+        tr8_begin d25, d27, d29, d31
+        tr4       d24, d26, d28, d30
+        tr8_end   #7
+        vst1.16 {d2}, [r0], r5
+        vst1.16 {d3}, [r0], r5
+        vst1.16 {d4}, [r0], r5
+        vst1.16 {d5}, [r0], r5
+        vst1.16 {d6}, [r0], r5
+        vst1.16 {d7}, [r0], r5
+        vst1.16 {d8}, [r0], r5
+        vst1.16 {d9}, [r0], r5
+        sub      r0, #136
+1:
+        // top half
+        vldm r0, {q12-q15} // coeffs
+        transpose_16b_4x4 d24, d26, d28, d30
+        transpose_16b_4x4 d25, d27, d29, d31
+        tr8_begin d26, d30, d27, d31
+        tr4 d24, d28, d25, d29
+        tr8_end #12
+        transpose_16b_4x4 d2, d3, d4, d5
+        transpose_16b_4x4 d6, d7, d8, d9
+        vswp     d7, d5
+        vswp     d7, d8
+        vswp     d3, d6
+        vswp     d6, d4
+        vstm r0!, {q1-q4}
+
+        // bottom half
+        vldm r0, {q12-q15} // coeffs
+        transpose_16b_4x4 d24, d26, d28, d30
+        transpose_16b_4x4 d25, d27, d29, d31
+        tr8_begin d26, d30, d27, d31
+        tr4 d24, d28, d25, d29
+        tr8_end #12
+        transpose_16b_4x4 d2, d3, d4, d5
+        transpose_16b_4x4 d6, d7, d8, d9
+        vswp     d7, d5
+        vswp     d7, d8
+        vswp     d3, d6
+        vswp     d6, d4
+        //vstm     r0, {q1-q4}
+        vst1.16 {q1-q2}, [r0]
+        add     r0, #32
+        vst1.16 {q3-q4}, [r0]
+        sub     r0, #32
+        vpop {d8-d15}
+        pop {r4-r8}
+        bx lr
+endfunc
+
+.align 4
+tr4f:
+.word 0x00240053  // 36 and d1[0] = 83
+.word 0x00000000
+tr8f:
+.word 0x0059004b  // 89, d0[0] = 75
+.word 0x00320012  // 50, d0[2] = 18
+tr16:
+.word 0x005a0057  // 90, d2[0] = 87
+.word 0x00500046  // 80, d2[2] = 70
+.word 0x0039002b  // 57, d2[0] = 43
+.word 0x00190009  // 25, d2[2] = 9
diff --git a/libavcodec/arm/hevcdsp_init_arm.c b/libavcodec/arm/hevcdsp_init_arm.c
new file mode 100644
index 0000000000..adcc454511
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_init_arm.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_arm.h"
+
+av_cold void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags))
+        ff_hevcdsp_init_neon(c, bit_depth);
+}
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
new file mode 100644
index 0000000000..55918077e2
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_arm.h"
+
+void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
+void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+
+#define PUT_PIXELS(name) \
+    void name(int16_t *dst, uint8_t *src, \
+                                ptrdiff_t srcstride, int height, \
+                                intptr_t mx, intptr_t my, int width)
+PUT_PIXELS(ff_hevc_put_pixels_w2_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w4_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w6_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w8_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w12_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w16_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w24_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
+#undef PUT_PIXELS
+
+static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, int width);
+static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                   int width, int height, int16_t* src2, ptrdiff_t src2stride);
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                       int16_t *src2,
+                                       int height, intptr_t mx, intptr_t my, int width);
+#define QPEL_FUNC(name) \
+    void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
+                                   int height, int width)
+
+QPEL_FUNC(ff_hevc_put_qpel_v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v3_neon_8);
+#undef QPEL_FUNC
+
+#define QPEL_FUNC_UW_PIX(name) \
+    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+                                   int height, intptr_t mx, intptr_t my, int width);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w4_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w8_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w16_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w24_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w32_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w48_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w64_neon_8);
+#undef QPEL_FUNC_UW_PIX
+
+#define QPEL_FUNC_UW(name) \
+    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+                                   int width, int height, int16_t* src2, ptrdiff_t src2stride);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_pixels_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
+#undef QPEL_FUNC_UW
+
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width) {
+
+    put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width);
+}
+
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width) {
+
+    put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0);
+}
+
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                       int16_t *src2,
+                                       int height, intptr_t mx, intptr_t my, int width) {
+    put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+}
+
+av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+{
+    if (bit_depth == 8) {
+        int x;
+        c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_neon;
+        c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_neon;
+        c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_neon;
+        c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_neon;
+        c->idct[0]                     = ff_hevc_transform_4x4_neon_8;
+        c->idct[1]                     = ff_hevc_transform_8x8_neon_8;
+        c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_neon_8;
+        c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_neon_8;
+        c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_neon_8;
+        c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_neon_8;
+        c->transform_add[0]            = ff_hevc_transform_add_4x4_neon_8;
+        c->transform_add[1]            = ff_hevc_transform_add_8x8_neon_8;
+        c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
+        c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
+        c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
+        put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
+        put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
+        put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
+        put_hevc_qpel_neon[0][1]       = ff_hevc_put_qpel_h1_neon_8;
+        put_hevc_qpel_neon[0][2]       = ff_hevc_put_qpel_h2_neon_8;
+        put_hevc_qpel_neon[0][3]       = ff_hevc_put_qpel_h3_neon_8;
+        put_hevc_qpel_neon[1][1]       = ff_hevc_put_qpel_h1v1_neon_8;
+        put_hevc_qpel_neon[1][2]       = ff_hevc_put_qpel_h2v1_neon_8;
+        put_hevc_qpel_neon[1][3]       = ff_hevc_put_qpel_h3v1_neon_8;
+        put_hevc_qpel_neon[2][1]       = ff_hevc_put_qpel_h1v2_neon_8;
+        put_hevc_qpel_neon[2][2]       = ff_hevc_put_qpel_h2v2_neon_8;
+        put_hevc_qpel_neon[2][3]       = ff_hevc_put_qpel_h3v2_neon_8;
+        put_hevc_qpel_neon[3][1]       = ff_hevc_put_qpel_h1v3_neon_8;
+        put_hevc_qpel_neon[3][2]       = ff_hevc_put_qpel_h2v3_neon_8;
+        put_hevc_qpel_neon[3][3]       = ff_hevc_put_qpel_h3v3_neon_8;
+        put_hevc_qpel_uw_neon[1][0]      = ff_hevc_put_qpel_uw_v1_neon_8;
+        put_hevc_qpel_uw_neon[2][0]      = ff_hevc_put_qpel_uw_v2_neon_8;
+        put_hevc_qpel_uw_neon[3][0]      = ff_hevc_put_qpel_uw_v3_neon_8;
+        put_hevc_qpel_uw_neon[0][1]      = ff_hevc_put_qpel_uw_h1_neon_8;
+        put_hevc_qpel_uw_neon[0][2]      = ff_hevc_put_qpel_uw_h2_neon_8;
+        put_hevc_qpel_uw_neon[0][3]      = ff_hevc_put_qpel_uw_h3_neon_8;
+        put_hevc_qpel_uw_neon[1][1]      = ff_hevc_put_qpel_uw_h1v1_neon_8;
+        put_hevc_qpel_uw_neon[1][2]      = ff_hevc_put_qpel_uw_h2v1_neon_8;
+        put_hevc_qpel_uw_neon[1][3]      = ff_hevc_put_qpel_uw_h3v1_neon_8;
+        put_hevc_qpel_uw_neon[2][1]      = ff_hevc_put_qpel_uw_h1v2_neon_8;
+        put_hevc_qpel_uw_neon[2][2]      = ff_hevc_put_qpel_uw_h2v2_neon_8;
+        put_hevc_qpel_uw_neon[2][3]      = ff_hevc_put_qpel_uw_h3v2_neon_8;
+        put_hevc_qpel_uw_neon[3][1]      = ff_hevc_put_qpel_uw_h1v3_neon_8;
+        put_hevc_qpel_uw_neon[3][2]      = ff_hevc_put_qpel_uw_h2v3_neon_8;
+        put_hevc_qpel_uw_neon[3][3]      = ff_hevc_put_qpel_uw_h3v3_neon_8;
+        for (x = 0; x < 10; x++) {
+            c->put_hevc_qpel[x][1][0]         = ff_hevc_put_qpel_neon_wrapper;
+            c->put_hevc_qpel[x][0][1]         = ff_hevc_put_qpel_neon_wrapper;
+            c->put_hevc_qpel[x][1][1]         = ff_hevc_put_qpel_neon_wrapper;
+            c->put_hevc_qpel_uni[x][1][0]     = ff_hevc_put_qpel_uni_neon_wrapper;
+            c->put_hevc_qpel_uni[x][0][1]     = ff_hevc_put_qpel_uni_neon_wrapper;
+            c->put_hevc_qpel_uni[x][1][1]     = ff_hevc_put_qpel_uni_neon_wrapper;
+            c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
+            c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+            c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+        }
+        c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
+        c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
+        c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
+        c->put_hevc_qpel[3][0][0]  = ff_hevc_put_pixels_w8_neon_8;
+        c->put_hevc_qpel[4][0][0]  = ff_hevc_put_pixels_w12_neon_8;
+        c->put_hevc_qpel[5][0][0]  = ff_hevc_put_pixels_w16_neon_8;
+        c->put_hevc_qpel[6][0][0]  = ff_hevc_put_pixels_w24_neon_8;
+        c->put_hevc_qpel[7][0][0]  = ff_hevc_put_pixels_w32_neon_8;
+        c->put_hevc_qpel[8][0][0]  = ff_hevc_put_pixels_w48_neon_8;
+        c->put_hevc_qpel[9][0][0]  = ff_hevc_put_pixels_w64_neon_8;
+
+        c->put_hevc_qpel_uni[1][0][0]  = ff_hevc_put_qpel_uw_pixels_w4_neon_8;
+        c->put_hevc_qpel_uni[3][0][0]  = ff_hevc_put_qpel_uw_pixels_w8_neon_8;
+        c->put_hevc_qpel_uni[5][0][0]  = ff_hevc_put_qpel_uw_pixels_w16_neon_8;
+        c->put_hevc_qpel_uni[6][0][0]  = ff_hevc_put_qpel_uw_pixels_w24_neon_8;
+        c->put_hevc_qpel_uni[7][0][0]  = ff_hevc_put_qpel_uw_pixels_w32_neon_8;
+        c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
+        c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+    }
+}
diff --git a/libavcodec/arm/hevcdsp_qpel_neon.S b/libavcodec/arm/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..86f92cf75a
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_qpel_neon.S
@@ -0,0 +1,999 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+
+.macro regshuffle_d8
+    vmov d16, d17
+    vmov d17, d18
+    vmov d18, d19
+    vmov d19, d20
+    vmov d20, d21
+    vmov d21, d22
+    vmov d22, d23
+.endm
+
+.macro regshuffle_q8
+    vmov q0, q1
+    vmov q1, q2
+    vmov q2, q3
+    vmov q3, q4
+    vmov q4, q5
+    vmov q5, q6
+    vmov q6, q7
+.endm
+
+.macro vextin8
+        pld       [r2]
+        vld1.8    {q11}, [r2], r3
+        vext.8    d16, d22, d23, #1
+        vext.8    d17, d22, d23, #2
+        vext.8    d18, d22, d23, #3
+        vext.8    d19, d22, d23, #4
+        vext.8    d20, d22, d23, #5
+        vext.8    d21, d22, d23, #6
+        vext.8    d22, d22, d23, #7
+.endm
+
+.macro loadin8
+        pld       [r2]
+        vld1.8    {d16}, [r2], r3
+        pld       [r2]
+        vld1.8    {d17}, [r2], r3
+        pld       [r2]
+        vld1.8    {d18}, [r2], r3
+        pld       [r2]
+        vld1.8    {d19}, [r2], r3
+        pld       [r2]
+        vld1.8    {d20}, [r2], r3
+        pld       [r2]
+        vld1.8    {d21}, [r2], r3
+        pld       [r2]
+        vld1.8    {d22}, [r2], r3
+        pld       [r2]
+        vld1.8    {d23}, [r2], r3
+.endm
+
+.macro qpel_filter_1_32b
+        vmov.i16   d16, #58
+        vmov.i16   d17, #10
+        vmull.s16   q9, d6, d16   // 58 * d0
+        vmull.s16  q10, d7, d16   // 58 * d1
+        vmov.i16   d16, #17
+        vmull.s16  q11, d4, d17   // 10 * c0
+        vmull.s16  q12, d5, d17   // 10 * c1
+        vmov.i16   d17, #5
+        vmull.s16  q13, d8, d16   // 17 * e0
+        vmull.s16  q14, d9, d16   // 17 * e1
+        vmull.s16  q15, d10, d17  //  5 * f0
+        vmull.s16   q8, d11, d17  //  5 * f1
+        vsub.s32    q9, q11       // 58 * d0 - 10 * c0
+        vsub.s32   q10, q12       // 58 * d1 - 10 * c1
+        vshll.s16  q11, d2, #2    // 4 * b0
+        vshll.s16  q12, d3, #2    // 4 * b1
+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0
+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1
+        vsubl.s16  q13, d12, d0   // g0 - a0
+        vsubl.s16  q14, d13, d1   // g1 - a1
+        vadd.s32    q9, q11       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
+        vadd.s32   q10, q12       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
+        vsub.s32   q13, q15       // g0 - a0 - 5 * f0
+        vsub.s32   q14, q8        // g1 - a1 - 5 * f1
+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
+        vqshrn.s32  d16, q9, #6
+        vqshrn.s32  d17, q10, #6
+.endm
+
+// input  q0 - q7
+// output q8
+.macro qpel_filter_2_32b
+        vmov.i32   q8, #11
+        vaddl.s16   q9, d6, d8   // d0 + e0
+        vaddl.s16  q10, d7, d9   // d1 + e1
+        vaddl.s16  q11, d4, d10  // c0 + f0
+        vaddl.s16  q12, d5, d11  // c1 + f1
+        vmul.s32   q11, q8       // 11 * (c0 + f0)
+        vmul.s32   q12, q8       // 11 * (c1 + f1)
+        vmov.i32   q8, #40
+        vaddl.s16  q15, d2, d12  // b0 + g0
+        vmul.s32    q9, q8       // 40 * (d0 + e0)
+        vmul.s32   q10, q8       // 40 * (d1 + e1)
+        vaddl.s16   q8, d3, d13  // b1 + g1
+        vaddl.s16  q13, d0, d14  // a0 + h0
+        vaddl.s16  q14, d1, d15  // a1 + h1
+        vshl.s32   q15, #2       // 4*(b0+g0)
+        vshl.s32    q8, #2       // 4*(b1+g1)
+        vadd.s32   q11, q13      // 11 * (c0 + f0) + a0 + h0
+        vadd.s32   q12, q14      // 11 * (c1 + f1) + a1 + h1
+        vadd.s32   q9, q15       // 40 * (d0 + e0) + 4*(b0+g0)
+        vadd.s32   q10, q8       // 40 * (d1 + e1) + 4*(b1+g1)
+        vsub.s32   q9, q11       // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
+        vsub.s32   q10, q12      // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
+        vqshrn.s32  d16, q9, #6
+        vqshrn.s32  d17, q10, #6
+.endm
+
+.macro qpel_filter_3_32b
+        vmov.i16   d16, #58
+        vmov.i16   d17, #10
+        vmull.s16   q9, d8, d16   // 58 * d0
+        vmull.s16  q10, d9, d16   // 58 * d1
+        vmov.i16   d16, #17
+        vmull.s16  q11, d10, d17  // 10 * c0
+        vmull.s16  q12, d11, d17  // 10 * c1
+        vmov.i16   d17, #5
+        vmull.s16  q13, d6, d16   // 17 * e0
+        vmull.s16  q14, d7, d16   // 17 * e1
+        vmull.s16  q15, d4, d17   //  5 * f0
+        vmull.s16   q8, d5, d17   //  5 * f1
+        vsub.s32    q9, q11       // 58 * d0 - 10 * c0
+        vsub.s32   q10, q12       // 58 * d1 - 10 * c1
+        vshll.s16  q11, d12, #2   // 4 * b0
+        vshll.s16  q12, d13, #2   // 4 * b1
+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0
+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1
+        vsubl.s16  q13, d2, d14   // g0 - a0
+        vsubl.s16  q14, d3, d15   // g1 - a1
+        vadd.s32    q9, q11       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
+        vadd.s32   q10, q12       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
+        vsub.s32   q13, q15       // g0 - a0 - 5 * f0
+        vsub.s32   q14, q8        // g1 - a1 - 5 * f1
+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
+        vqshrn.s32  d16, q9, #6
+        vqshrn.s32  d17, q10, #6
+.endm
+
+.macro qpel_filter_1 out=q7
+        vmov.u8    d24, #58
+        vmov.u8    d25, #10
+        vshll.u8   q13, d20, #4   // 16*e
+        vshll.u8   q14, d21, #2   // 4*f
+        vmull.u8  \out, d19, d24  // 58*d
+        vaddw.u8   q13, q13, d20  // 17*e
+        vmull.u8   q15, d18, d25  // 10*c
+        vaddw.u8   q14, q14, d21  // 5*f
+        vsubl.u8   q12, d22, d16  // g - a
+        vadd.u16  \out, q13       // 58d + 17e
+        vshll.u8   q13, d17, #2   // 4*b
+        vadd.u16   q15, q14       // 10*c + 5*f
+        vadd.s16   q13, q12       // - a + 4*b + g
+        vsub.s16  \out, q15       // -10*c + 58*d + 17*e -5*f
+        vadd.s16  \out, q13       // -a + 4*b -10*c + 58*d + 17*e -5*f
+.endm
+
+.macro qpel_filter_2 out=q7
+        vmov.i16   q12, #10
+        vmov.i16   q14, #11
+        vaddl.u8   q13, d19, d20   // d + e
+        vaddl.u8   q15, d18, d21   // c + f
+        vmul.u16   q13, q12        // 10 * (d+e)
+        vmul.u16   q15, q14        // 11 * ( c + f)
+        vaddl.u8  \out, d17, d22   // b + g
+        vaddl.u8   q12, d16, d23   // a + h
+        vadd.u16  \out, q13        // b + 10 * (d + e) + g
+        vadd.s16   q12, q15
+        vshl.u16  \out, #2         // 4 * (b + 10 * (d + e) + g)
+        vsub.s16  \out, q12
+.endm
+
+.macro qpel_filter_3 out=q7
+        vmov.u8    d24, #58
+        vmov.u8    d25, #10
+        vshll.u8   q13, d19, #4     // 16*e
+        vshll.u8   q14, d18, #2     // 4*f
+        vmull.u8  \out, d20, d24    // 58*d
+        vaddw.u8   q13, q13, d19    // 17*e
+        vmull.u8   q15, d21, d25    // 10*c
+        vaddw.u8   q14, q14, d18    // 5*f
+        vsubl.u8   q12, d17, d23    // g - a
+        vadd.u16  \out, q13         // 58d + 17e
+        vshll.u8   q13, d22, #2     // 4*b
+        vadd.u16   q15, q14         // 10*c + 5*f
+        vadd.s16   q13, q12         // - a + 4*b + g
+        vsub.s16  \out, q15         // -10*c + 58*d + 17*e -5*f
+        vadd.s16  \out, q13         // -a + 4*b -10*c + 58*d + 17*e -5*f
+.endm
+
+.macro  hevc_put_qpel_vX_neon_8 filter
+        push   {r4, r5, r6, r7}
+        ldr    r4, [sp, #16] // height
+        ldr    r5, [sp, #20] // width
+        vpush {d8-d15}
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3
+        mov       r12, r4
+        mov       r6, r0
+        mov       r7, r2
+        lsl       r1, #1
+0:      loadin8
+        cmp       r5, #4
+        beq       4f
+8:      subs r4, #1
+        \filter
+        vst1.16    {q7}, [r0], r1
+        regshuffle_d8
+        vld1.8    {d23}, [r2], r3
+        bne 8b
+        subs  r5, #8
+        beq       99f
+        mov r4, r12
+        add r6, #16
+        mov r0, r6
+        add r7, #8
+        mov r2, r7
+        b     0b
+4:      subs r4, #1
+        \filter
+        vst1.16    d14, [r0], r1
+        regshuffle_d8
+        vld1.32    {d23[0]}, [r2], r3
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4, r5, r6, r7}
+        bx lr
+.endm
+
+.macro  hevc_put_qpel_uw_vX_neon_8 filter
+        push   {r4-r10}
+        ldr    r5, [sp, #28] // width
+        ldr    r4, [sp, #32] // height
+        ldr    r8, [sp, #36] // src2
+        ldr    r9, [sp, #40] // src2stride
+        vpush {d8-d15}
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3
+        mov       r12, r4
+        mov       r6, r0
+        mov       r7, r2
+        cmp       r8, #0
+        bne       .Lbi\@
+0:      loadin8
+        cmp       r5, #4
+        beq       4f
+8:      subs r4, #1
+        \filter
+        vqrshrun.s16   d0, q7, #6
+        vst1.8    d0, [r0], r1
+        regshuffle_d8
+        vld1.8    {d23}, [r2], r3
+        bne 8b
+        subs  r5, #8
+        beq       99f
+        mov r4, r12
+        add r6, #8
+        mov r0, r6
+        add r7, #8
+        mov r2, r7
+        b     0b
+4:      subs r4, #1
+        \filter
+        vqrshrun.s16   d0, q7, #6
+        vst1.32    d0[0], [r0], r1
+        regshuffle_d8
+        vld1.32    {d23[0]}, [r2], r3
+        bne 4b
+        b   99f
+.Lbi\@: lsl       r9, #1
+        mov       r10, r8
+0:      loadin8
+        cmp       r5, #4
+        beq       4f
+8:      subs r4, #1
+        \filter
+        vld1.16        {q0}, [r8], r9
+        vqadd.s16      q0, q7
+        vqrshrun.s16   d0, q0, #7
+        vst1.8         d0, [r0], r1
+        regshuffle_d8
+        vld1.8    {d23}, [r2], r3
+        bne 8b
+        subs  r5, #8
+        beq       99f
+        mov r4, r12
+        add r6, #8
+        mov r0, r6
+        add r10, #16
+        mov r8, r10
+        add r7, #8
+        mov r2, r7
+        b     0b
+4:      subs r4, #1
+        \filter
+        vld1.16      d0, [r8], r9
+        vqadd.s16    d0, d14
+        vqrshrun.s16 d0, q0, #7
+        vst1.32      d0[0], [r0], r1
+        regshuffle_d8
+        vld1.32    {d23[0]}, [r2], r3
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4-r10}
+        bx lr
+.endm
+
+function ff_hevc_put_qpel_v1_neon_8, export=1
+        hevc_put_qpel_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_v2_neon_8, export=1
+        hevc_put_qpel_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_v3_neon_8, export=1
+        hevc_put_qpel_vX_neon_8 qpel_filter_3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_v1_neon_8, export=1
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_v2_neon_8, export=1
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_v3_neon_8, export=1
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hX_neon_8 filter
+        push     {r4, r5, r6, r7}
+        ldr    r4, [sp, #16] // height
+        ldr    r5, [sp, #20] // width
+
+        vpush    {d8-d15}
+        sub       r2, #4
+        lsl       r1, #1
+        mov      r12, r4
+        mov       r6, r0
+        mov       r7, r2
+        cmp       r5, #4
+        beq       4f
+8:      subs      r4, #1
+        vextin8
+        \filter
+        vst1.16   {q7}, [r0], r1
+        bne       8b
+        subs      r5, #8
+        beq      99f
+        mov       r4, r12
+        add       r6, #16
+        mov       r0, r6
+        add       r7, #8
+        mov       r2, r7
+        cmp       r5, #4
+        bne       8b
+4:      subs      r4, #1
+        vextin8
+        \filter
+        vst1.16  d14, [r0], r1
+        bne       4b
+99:     vpop     {d8-d15}
+        pop      {r4, r5, r6, r7}
+        bx lr
+.endm
+
+.macro hevc_put_qpel_uw_hX_neon_8 filter
+        push     {r4-r10}
+        ldr       r5, [sp, #28] // width
+        ldr       r4, [sp, #32] // height
+        ldr       r8, [sp, #36] // src2
+        ldr       r9, [sp, #40] // src2stride
+        vpush    {d8-d15}
+        sub       r2, #4
+        mov      r12, r4
+        mov       r6, r0
+        mov       r7, r2
+        cmp       r8, #0
+        bne       .Lbi\@
+        cmp       r5, #4
+        beq       4f
+8:      subs      r4, #1
+        vextin8
+        \filter
+        vqrshrun.s16   d0, q7, #6
+        vst1.8    d0, [r0], r1
+        bne       8b
+        subs      r5, #8
+        beq      99f
+        mov       r4, r12
+        add       r6, #8
+        mov       r0, r6
+        add       r7, #8
+        mov       r2, r7
+        cmp       r5, #4
+        bne       8b
+4:      subs      r4, #1
+        vextin8
+        \filter
+        vqrshrun.s16   d0, q7, #6
+        vst1.32  d0[0], [r0], r1
+        bne       4b
+        b         99f
+.Lbi\@:
+        lsl       r9, #1
+        cmp       r5, #4
+        beq       4f
+        mov       r10, r8
+8:      subs      r4, #1
+        vextin8
+        \filter
+        vld1.16        {q0}, [r8], r9
+        vqadd.s16      q0, q7
+        vqrshrun.s16   d0, q0, #7
+        vst1.8         d0, [r0], r1
+        bne       8b
+        subs      r5, #8
+        beq      99f
+        mov       r4, r12
+        add       r6, #8
+        add       r10, #16
+        mov       r8, r10
+        mov       r0, r6
+        add       r7, #8
+        mov       r2, r7
+        cmp       r5, #4
+        bne       8b
+4:      subs      r4, #1
+        vextin8
+        \filter
+        vld1.16      d0, [r8], r9
+        vqadd.s16    d0, d14
+        vqrshrun.s16 d0, q0, #7
+        vst1.32      d0[0], [r0], r1
+        bne       4b
+99:     vpop     {d8-d15}
+        pop      {r4-r10}
+        bx lr
+.endm
+
+function ff_hevc_put_qpel_h1_neon_8, export=1
+        hevc_put_qpel_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_h2_neon_8, export=1
+        hevc_put_qpel_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_h3_neon_8, export=1
+        hevc_put_qpel_hX_neon_8 qpel_filter_3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_h1_neon_8, export=1
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_h2_neon_8, export=1
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_h3_neon_8, export=1
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hXvY_neon_8 filterh filterv
+        push   {r4, r5, r6, r7}
+        ldr    r4, [sp, #16] // height
+        ldr    r5, [sp, #20] // width
+
+        vpush {d8-d15}
+        sub       r2, #4
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3  // extra_before 3
+        lsl       r1, #1
+        mov       r12, r4
+        mov       r6, r0
+        mov       r7, r2
+0:      vextin8
+        \filterh q0
+        vextin8
+        \filterh q1
+        vextin8
+        \filterh q2
+        vextin8
+        \filterh q3
+        vextin8
+        \filterh q4
+        vextin8
+        \filterh q5
+        vextin8
+        \filterh q6
+        vextin8
+        \filterh q7
+        cmp r5, #4
+        beq 4f
+8:      subs  r4, #1
+        \filterv
+        vst1.16    {q8}, [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 8b
+        subs  r5, #8
+        beq 99f
+        mov r4, r12
+        add r6, #16
+        mov r0, r6
+        add r7, #8
+        mov r2, r7
+        b 0b
+4:      subs  r4, #1
+        \filterv
+        vst1.16    d16, [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4, r5, r6, r7}
+        bx lr
+.endm
+
+.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv
+        push     {r4-r10}
+        ldr       r5, [sp, #28] // width
+        ldr       r4, [sp, #32] // height
+        ldr       r8, [sp, #36] // src2
+        ldr       r9, [sp, #40] // src2stride
+        vpush {d8-d15}
+        sub       r2, #4
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3  // extra_before 3
+        mov       r12, r4
+        mov       r6, r0
+        mov       r7, r2
+        cmp       r8, #0
+        bne       .Lbi\@
+0:      vextin8
+        \filterh q0
+        vextin8
+        \filterh q1
+        vextin8
+        \filterh q2
+        vextin8
+        \filterh q3
+        vextin8
+        \filterh q4
+        vextin8
+        \filterh q5
+        vextin8
+        \filterh q6
+        vextin8
+        \filterh q7
+        cmp r5, #4
+        beq 4f
+8:      subs  r4, #1
+        \filterv
+        vqrshrun.s16   d0, q8, #6
+        vst1.8    d0, [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 8b
+        subs  r5, #8
+        beq 99f
+        mov r4, r12
+        add r6, #8
+        mov r0, r6
+        add r7, #8
+        mov r2, r7
+        b 0b
+4:      subs  r4, #1
+        \filterv
+        vqrshrun.s16   d0, q8, #6
+        vst1.32        d0[0], [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 4b
+        b   99f
+.Lbi\@: lsl      r9, #1
+        mov      r10, r8
+0:      vextin8
+        \filterh q0
+        vextin8
+        \filterh q1
+        vextin8
+        \filterh q2
+        vextin8
+        \filterh q3
+        vextin8
+        \filterh q4
+        vextin8
+        \filterh q5
+        vextin8
+        \filterh q6
+        vextin8
+        \filterh q7
+        cmp r5, #4
+        beq 4f
+8:      subs  r4, #1
+        \filterv
+        vld1.16        {q0}, [r8], r9
+        vqadd.s16      q0, q8
+        vqrshrun.s16   d0, q0, #7
+        vst1.8         d0, [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 8b
+        subs  r5, #8
+        beq 99f
+        mov r4, r12
+        add r6, #8
+        mov r0, r6
+        add r10, #16
+        mov r8, r10
+        add r7, #8
+        mov r2, r7
+        b 0b
+4:      subs  r4, #1
+        \filterv
+        vld1.16      d0, [r8], r9
+        vqadd.s16    d0, d16
+        vqrshrun.s16 d0, q0, #7
+        vst1.32      d0[0], [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4-r10}
+        bx lr
+.endm
+
+
+function ff_hevc_put_qpel_h1v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
+
+function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
+.macro init_put_pixels
+        pld    [r1]
+        pld    [r1, r2]
+        mov    r12, MAX_PB_SIZE
+        lsl    r12, #1
+.endm
+
+function ff_hevc_put_pixels_w2_neon_8, export=1
+        init_put_pixels
+        vmov.u8      d5, #255
+        vshr.u64     d5, #32
+0:      subs r3, #1
+        vld1.32     {d0[0]}, [r1], r2
+        pld [r1]
+        vld1.32     d6, [r0]
+        vshll.u8    q0, d0, #6
+        vbit        d6, d0, d5
+        vst1.32     d6, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w4_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #2
+        vld1.32   {d0[0]}, [r1], r2
+        vld1.32   {d0[1]}, [r1], r2
+        pld       [r1]
+        pld       [r1, r2]
+        vshll.u8   q0, d0, #6
+        vst1.64   {d0}, [r0], r12
+        vst1.64   {d1}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w6_neon_8, export=1
+        init_put_pixels
+        vmov.u8      q10, #255
+        vshr.u64     d21, #32
+0:      subs r3, #1
+        vld1.16     {d0}, [r1], r2
+        pld [r1]
+        vshll.u8    q0, d0, #6
+        vld1.8      {q12}, [r0]
+        vbit        q12, q0, q10
+        vst1.8      {q12}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w8_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #2
+        vld1.8   {d0}, [r1], r2
+        vld1.8   {d2}, [r1], r2
+        pld        [r1]
+        pld        [r1, r2]
+        vshll.u8   q0, d0, #6
+        vshll.u8   q1, d2, #6
+        vst1.16   {q0}, [r0], r12
+        vst1.16   {q1}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w12_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #2
+        vld1.64    {d0}, [r1]
+        add       r1, #8
+        vld1.32   {d1[0]}, [r1], r2
+        sub       r1, #8
+        vld1.64    {d2}, [r1]
+        add       r1, #8
+        vld1.32   {d1[1]}, [r1], r2
+        sub       r1, #8
+        pld       [r1]
+        pld       [r1, r2]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vmov      d22, d19
+        vst1.64   {d16, d17, d18}, [r0], r12
+        vst1.64   {d20, d21, d22}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w16_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #2
+        vld1.8   {q0}, [r1], r2
+        vld1.8   {q1}, [r1], r2
+        pld       [r1]
+        pld       [r1, r2]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vshll.u8  q11, d3, #6
+        vst1.8    {q8, q9}, [r0], r12
+        vst1.8    {q10, q11}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w24_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #1
+        vld1.8   {d0, d1, d2}, [r1], r2
+        pld       [r1]
+        vshll.u8  q10, d0, #6
+        vshll.u8  q11, d1, #6
+        vshll.u8  q12, d2, #6
+        vstm     r0, {q10, q11, q12}
+        add      r0, r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w32_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #1
+        vld1.8 {q0, q1}, [r1], r2
+        pld       [r1]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vshll.u8  q11, d3, #6
+        vstm    r0, {q8, q9, q10, q11}
+        add     r0, r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w48_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #1
+        vld1.8    {q0, q1}, [r1]
+        add r1, #32
+        vld1.8    {q2}, [r1], r2
+        sub r1, #32
+        pld       [r1]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vshll.u8  q11, d3, #6
+        vshll.u8  q12, d4, #6
+        vshll.u8  q13, d5, #6
+        vstm r0, {q8, q9, q10, q11, q12, q13}
+        add  r0, r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w64_neon_8, export=1
+        init_put_pixels
+0:      subs r3, #1
+        vld1.8    {q0, q1}, [r1]
+        add      r1, #32
+        vld1.8    {q2, q3}, [r1], r2
+        sub      r1, #32
+        pld       [r1]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vshll.u8  q11, d3, #6
+        vshll.u8  q12, d4, #6
+        vshll.u8  q13, d5, #6
+        vshll.u8  q14, d6, #6
+        vshll.u8  q15, d7, #6
+        vstm    r0, {q8, q9, q10, q11, q12, q13, q14, q15}
+        add r0, r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_qpel_uw_pixels_neon_8, export=1
+        push   {r4-r9}
+        ldr    r5, [sp, #24] // width
+        ldr    r4, [sp, #28] // height
+        ldr    r8, [sp, #32] // src2
+        ldr    r9, [sp, #36] // src2stride
+        vpush {d8-d15}
+        cmp    r8, #0
+        bne    2f
+1:      subs r4, #1
+        vld1.8     {d0}, [r2], r3
+        vst1.8      d0, [r0], r1
+        bne 1b
+        vpop {d8-d15}
+        pop   {r4-r9}
+        bx lr
+2:      subs  r4, #1
+        vld1.8         {d0}, [r2], r3
+        vld1.16        {q1}, [r8], r9
+        vshll.u8       q0, d0, #6
+        vqadd.s16      q0, q1
+        vqrshrun.s16   d0, q0, #7
+        vst1.8      d0, [r0], r1
+        bne 2b
+        vpop {d8-d15}
+        pop   {r4-r9}
+        bx lr
+endfunc
+
+.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4
+function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
+        ldr    r12, [sp] // height
+1:      subs   r12, #4
+        vld1.32     {\regs}  , [r2], r3
+        vld1.32     {\regs2} , [r2], r3
+        vld1.32     {\regs3} , [r2], r3
+        vld1.32     {\regs4} , [r2], r3
+        vst1.32     {\regs}  , [r0], r1
+        vst1.32     {\regs2} , [r0], r1
+        vst1.32     {\regs3} , [r0], r1
+        vst1.32     {\regs4} , [r0], r1
+        bne 1b
+        bx lr
+endfunc
+.endm
+
+.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4
+function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
+        push   {r4-r5}
+        ldr    r12, [sp, #8] // height
+1:      subs r12, #2
+        mov      r4, r2
+        vld1.32   {\regs} , [r2]!
+        vld1.32   {\regs2} , [r2]
+        add      r2, r4, r3
+        mov      r4, r2
+        vld1.32   {\regs3} , [r2]!
+        vld1.32   {\regs4} , [r2]
+        add      r2, r4, r3
+        mov      r5, r0
+        vst1.32   {\regs} , [r0]!
+        vst1.32   {\regs2} , [r0]
+        add      r0, r5, r1
+        mov      r5, r0
+        vst1.32   {\regs3} , [r0]!
+        vst1.32   {\regs4} , [r0]
+        add      r0, r5, r1
+        bne 1b
+        pop   {r4-r5}
+        bx lr
+endfunc
+.endm
+
+put_qpel_uw_pixels    4, d0[0], d0[1], d1[0], d1[1]
+put_qpel_uw_pixels    8, d0,    d1,    d2,    d3
+put_qpel_uw_pixels_m 12, d0,    d1[0], d2,    d3[0]
+put_qpel_uw_pixels   16, q0,    q1,    q2,    q3
+put_qpel_uw_pixels   24, d0-d2, d3-d5, d16-d18, d19-d21
+put_qpel_uw_pixels   32, q0-q1, q2-q3, q8-q9, q10-q11
+put_qpel_uw_pixels_m 48, q0-q1, q2,    q8-q9, q10
+put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11
diff --git a/libavcodec/arm/hpeldsp_arm.S b/libavcodec/arm/hpeldsp_arm.S
index 0f8092e15e..219f793d99 100644
--- a/libavcodec/arm/hpeldsp_arm.S
+++ b/libavcodec/arm/hpeldsp_arm.S
@@ -2,20 +2,20 @@
 @ ARMv4-optimized halfpel functions
 @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
 @
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
 @
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
 @ modify it under the terms of the GNU Lesser General Public
 @ License as published by the Free Software Foundation; either
 @ version 2.1 of the License, or (at your option) any later version.
 @
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 @ Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 @
 
diff --git a/libavcodec/arm/hpeldsp_arm.h b/libavcodec/arm/hpeldsp_arm.h
index a8641529d5..5f3c7741c1 100644
--- a/libavcodec/arm/hpeldsp_arm.h
+++ b/libavcodec/arm/hpeldsp_arm.h
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_armv6.S b/libavcodec/arm/hpeldsp_armv6.S
index f1abc328eb..a8bd459c20 100644
--- a/libavcodec/arm/hpeldsp_armv6.S
+++ b/libavcodec/arm/hpeldsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_init_arm.c b/libavcodec/arm/hpeldsp_init_arm.c
index 63906606a2..1977b1379b 100644
--- a/libavcodec/arm/hpeldsp_init_arm.c
+++ b/libavcodec/arm/hpeldsp_init_arm.c
@@ -2,20 +2,20 @@
  * ARM-optimized halfpel functions
  * Copyright (c) 2001 Lionel Ulmer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_init_armv6.c b/libavcodec/arm/hpeldsp_init_armv6.c
index 67a500d513..967a8e0427 100644
--- a/libavcodec/arm/hpeldsp_init_armv6.c
+++ b/libavcodec/arm/hpeldsp_init_armv6.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_init_neon.c b/libavcodec/arm/hpeldsp_init_neon.c
index 76d4eafceb..d9feadd1dd 100644
--- a/libavcodec/arm/hpeldsp_init_neon.c
+++ b/libavcodec/arm/hpeldsp_init_neon.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_neon.S b/libavcodec/arm/hpeldsp_neon.S
index 90bc3cb8ae..cf4a6cfb8d 100644
--- a/libavcodec/arm/hpeldsp_neon.S
+++ b/libavcodec/arm/hpeldsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/idct.h b/libavcodec/arm/idct.h
index 168d64b666..39cef3a874 100644
--- a/libavcodec/arm/idct.h
+++ b/libavcodec/arm/idct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/idctdsp_arm.S b/libavcodec/arm/idctdsp_arm.S
index 34f467e86f..057eff9be8 100644
--- a/libavcodec/arm/idctdsp_arm.S
+++ b/libavcodec/arm/idctdsp_arm.S
@@ -2,27 +2,27 @@
 @ ARMv4-optimized IDCT functions
 @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
 @
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
 @
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
 @ modify it under the terms of the GNU Lesser General Public
 @ License as published by the Free Software Foundation; either
 @ version 2.1 of the License, or (at your option) any later version.
 @
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 @ Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 @
 
 #include "config.h"
 #include "libavutil/arm/asm.S"
 
-@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
+@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, ptrdiff_t stride)
 function ff_add_pixels_clamped_arm, export=1, align=5
         push            {r4-r10}
         mov             r10, #8
diff --git a/libavcodec/arm/idctdsp_arm.h b/libavcodec/arm/idctdsp_arm.h
index 9012b82904..d7bc5cd02a 100644
--- a/libavcodec/arm/idctdsp_arm.h
+++ b/libavcodec/arm/idctdsp_arm.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/idctdsp_armv6.S b/libavcodec/arm/idctdsp_armv6.S
index c180d732fa..a6e77d6da1 100644
--- a/libavcodec/arm/idctdsp_armv6.S
+++ b/libavcodec/arm/idctdsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/idctdsp_init_arm.c b/libavcodec/arm/idctdsp_init_arm.c
index 8207c31589..0068e3f86c 100644
--- a/libavcodec/arm/idctdsp_init_arm.c
+++ b/libavcodec/arm/idctdsp_init_arm.c
@@ -2,20 +2,20 @@
  * ARM-optimized IDCT functions
  * Copyright (c) 2001 Lionel Ulmer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
 #include "idctdsp_arm.h"
 
 void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
-                               int line_size);
+                               ptrdiff_t line_size);
 
 /* XXX: those functions should be suppressed ASAP when all IDCTs are
  * converted */
@@ -63,8 +63,8 @@ av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (!high_bit_depth) {
-        if (avctx->idct_algo == FF_IDCT_AUTO ||
+    if (!avctx->lowres && !high_bit_depth) {
+        if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
             avctx->idct_algo == FF_IDCT_ARM) {
             c->idct_put  = j_rev_dct_arm_put;
             c->idct_add  = j_rev_dct_arm_add;
diff --git a/libavcodec/arm/idctdsp_init_armv5te.c b/libavcodec/arm/idctdsp_init_armv5te.c
index 251165dd74..3d881e1f18 100644
--- a/libavcodec/arm/idctdsp_init_armv5te.c
+++ b/libavcodec/arm/idctdsp_init_armv5te.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,8 +29,9 @@
 av_cold void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx,
                                      unsigned high_bit_depth)
 {
-    if (!high_bit_depth &&
+    if (!avctx->lowres && !high_bit_depth &&
         (avctx->idct_algo == FF_IDCT_AUTO ||
+         avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
          avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
         c->idct_put  = ff_simple_idct_put_armv5te;
         c->idct_add  = ff_simple_idct_add_armv5te;
diff --git a/libavcodec/arm/idctdsp_init_armv6.c b/libavcodec/arm/idctdsp_init_armv6.c
index 8f0c49b142..edf3070e15 100644
--- a/libavcodec/arm/idctdsp_init_armv6.c
+++ b/libavcodec/arm/idctdsp_init_armv6.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,13 +27,13 @@
 #include "idctdsp_arm.h"
 
 void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels,
-                                 int line_size);
+                                 ptrdiff_t line_size);
 
 av_cold void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
 {
-    if (!high_bit_depth) {
-        if (avctx->idct_algo == FF_IDCT_AUTO ||
+    if (!avctx->lowres && !high_bit_depth) {
+        if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
             avctx->idct_algo == FF_IDCT_SIMPLEARMV6) {
             c->idct_put  = ff_simple_idct_put_armv6;
             c->idct_add  = ff_simple_idct_add_armv6;
diff --git a/libavcodec/arm/idctdsp_init_neon.c b/libavcodec/arm/idctdsp_init_neon.c
index c94f7b6e5d..b70c5b0d44 100644
--- a/libavcodec/arm/idctdsp_init_neon.c
+++ b/libavcodec/arm/idctdsp_init_neon.c
@@ -2,20 +2,20 @@
  * ARM-NEON-optimized IDCT functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,15 +27,16 @@
 #include "idct.h"
 #include "idctdsp_arm.h"
 
-void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
+void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
 
 av_cold void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx,
                                   unsigned high_bit_depth)
 {
-    if (!high_bit_depth) {
+    if (!avctx->lowres && !high_bit_depth) {
         if (avctx->idct_algo == FF_IDCT_AUTO ||
+            avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLENEON) {
             c->idct_put  = ff_simple_idct_put_neon;
             c->idct_add  = ff_simple_idct_add_neon;
diff --git a/libavcodec/arm/idctdsp_neon.S b/libavcodec/arm/idctdsp_neon.S
index 7095879bae..1911a33468 100644
--- a/libavcodec/arm/idctdsp_neon.S
+++ b/libavcodec/arm/idctdsp_neon.S
@@ -2,20 +2,20 @@
  * ARM-NEON-optimized IDCT functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S
index 42f37392e1..72c4c77c45 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/int_neon.S
@@ -1,21 +1,21 @@
 /*
  * ARM NEON optimised integer operations
- * Copyright (c) 2009 Kostya Shishkov
+ * Copyright (c) 2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,7 @@ function ff_scalarproduct_int16_neon, export=1
         vmlal.s16       q2,  d18,  d22
         vmlal.s16       q3,  d19,  d23
         subs            r2,  r2,   #16
-        bne             1b
+        bgt             1b
 
         vpadd.s32       d16, d0,   d1
         vpadd.s32       d17, d2,   d3
@@ -48,3 +48,4 @@ function ff_scalarproduct_int16_neon, export=1
         vmov.32         r0,  d3[0]
         bx              lr
 endfunc
+
diff --git a/libavcodec/arm/apedsp_init_arm.c b/libavcodec/arm/lossless_audiodsp_init_arm.c
index 47ea034359..981a39aff9 100644
--- a/libavcodec/arm/apedsp_init_arm.c
+++ b/libavcodec/arm/lossless_audiodsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,12 +23,12 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/arm/cpu.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
 
 int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3, int len, int mul);
 
-av_cold void ff_apedsp_init_arm(APEDSPContext *c)
+av_cold void ff_llauddsp_init_arm(LLAudDSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
diff --git a/libavcodec/arm/apedsp_neon.S b/libavcodec/arm/lossless_audiodsp_neon.S
index 7cfbf43c6d..ba7c45fcef 100644
--- a/libavcodec/arm/apedsp_neon.S
+++ b/libavcodec/arm/lossless_audiodsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised integer operations
  * Copyright (c) 2009 Kostya Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,7 +47,7 @@ function ff_scalarproduct_and_madd_int16_neon, export=1
         vst1.16         {q10},     [r12,:128]!
         subs            r3,  r3,   #16
         vst1.16         {q13},     [r12,:128]!
-        bne             1b
+        bgt             1b
 
         vpadd.s32       d16, d0,   d1
         vpadd.s32       d17, d2,   d3
diff --git a/libavcodec/arm/mathops.h b/libavcodec/arm/mathops.h
index 45ac67d436..dc57c5571c 100644
--- a/libavcodec/arm/mathops.h
+++ b/libavcodec/arm/mathops.h
@@ -2,20 +2,20 @@
  * simple math operations
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mdct_fixed_neon.S b/libavcodec/arm/mdct_fixed_neon.S
index c77be59c65..365c5e7faf 100644
--- a/libavcodec/arm/mdct_fixed_neon.S
+++ b/libavcodec/arm/mdct_fixed_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
index bfe259c396..a6952fa571 100644
--- a/libavcodec/arm/mdct_neon.S
+++ b/libavcodec/arm/mdct_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised MDCT
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
index f3fe668eae..43f6d14c0c 100644
--- a/libavcodec/arm/mdct_vfp.S
+++ b/libavcodec/arm/mdct_vfp.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/me_cmp_armv6.S b/libavcodec/arm/me_cmp_armv6.S
index 436e20dd25..fa5a82301e 100644
--- a/libavcodec/arm/me_cmp_armv6.S
+++ b/libavcodec/arm/me_cmp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/me_cmp_init_arm.c b/libavcodec/arm/me_cmp_init_arm.c
index 4d73f3e0fd..03870a2bfa 100644
--- a/libavcodec/arm/me_cmp_init_arm.c
+++ b/libavcodec/arm/me_cmp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mlpdsp_armv5te.S b/libavcodec/arm/mlpdsp_armv5te.S
index 4272dae029..4f9aa485fd 100644
--- a/libavcodec/arm/mlpdsp_armv5te.S
+++ b/libavcodec/arm/mlpdsp_armv5te.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2014 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mlpdsp_armv6.S b/libavcodec/arm/mlpdsp_armv6.S
index 3c88021a35..d98f807559 100644
--- a/libavcodec/arm/mlpdsp_armv6.S
+++ b/libavcodec/arm/mlpdsp_armv6.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2014 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index 4cdd10caf5..34a5f61e1d 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2014 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
index 49bd0bcaf2..977abb6939 100644
--- a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
+++ b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegaudiodsp_init_arm.c b/libavcodec/arm/mpegaudiodsp_init_arm.c
index e73aee6a2b..98e0c8a3a8 100644
--- a/libavcodec/arm/mpegaudiodsp_init_arm.c
+++ b/libavcodec/arm/mpegaudiodsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
index 34e9cf18b5..918be16d03 100644
--- a/libavcodec/arm/mpegvideo_arm.c
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2002 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideo_arm.h b/libavcodec/arm/mpegvideo_arm.h
index 17e3a5b024..709ae6b247 100644
--- a/libavcodec/arm/mpegvideo_arm.h
+++ b/libavcodec/arm/mpegvideo_arm.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideo_armv5te.c b/libavcodec/arm/mpegvideo_armv5te.c
index 3c44cd80b5..88c5f4fdad 100644
--- a/libavcodec/arm/mpegvideo_armv5te.c
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -2,24 +2,25 @@
  * Optimization of some functions from mpegvideo.c for armv5te
  * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideo.h"
 #include "mpegvideo_arm.h"
@@ -55,7 +56,7 @@ static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
     int level, qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     qmul = qscale << 1;
 
@@ -84,7 +85,7 @@ static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
     int qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     qadd = (qscale - 1) | 1;
     qmul = qscale << 1;
diff --git a/libavcodec/arm/mpegvideo_armv5te_s.S b/libavcodec/arm/mpegvideo_armv5te_s.S
index 4426e15e91..8687d6b31c 100644
--- a/libavcodec/arm/mpegvideo_armv5te_s.S
+++ b/libavcodec/arm/mpegvideo_armv5te_s.S
@@ -2,20 +2,20 @@
  * Optimization of some functions from mpegvideo.c for armv5te
  * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index 3e1f7b53e2..1889d7a912 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideoencdsp_armv6.S b/libavcodec/arm/mpegvideoencdsp_armv6.S
index 99db501b25..ab0dad7b18 100644
--- a/libavcodec/arm/mpegvideoencdsp_armv6.S
+++ b/libavcodec/arm/mpegvideoencdsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideoencdsp_init_arm.c b/libavcodec/arm/mpegvideoencdsp_init_arm.c
index ab9ba3e1be..4bfe835684 100644
--- a/libavcodec/arm/mpegvideoencdsp_init_arm.c
+++ b/libavcodec/arm/mpegvideoencdsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/neon.S b/libavcodec/arm/neon.S
index 716a607af7..787bc4bf36 100644
--- a/libavcodec/arm/neon.S
+++ b/libavcodec/arm/neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/neontest.c b/libavcodec/arm/neontest.c
index b77bcd7223..a3b5b8e2e3 100644
--- a/libavcodec/arm/neontest.c
+++ b/libavcodec/arm/neontest.c
@@ -2,20 +2,20 @@
  * check NEON registers for clobbers
  * Copyright (c) 2013 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/pixblockdsp_armv6.S b/libavcodec/arm/pixblockdsp_armv6.S
index 4c925a4daa..b10ea78e88 100644
--- a/libavcodec/arm/pixblockdsp_armv6.S
+++ b/libavcodec/arm/pixblockdsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/pixblockdsp_init_arm.c b/libavcodec/arm/pixblockdsp_init_arm.c
index f20769b3bc..76d7509728 100644
--- a/libavcodec/arm/pixblockdsp_init_arm.c
+++ b/libavcodec/arm/pixblockdsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,7 +24,7 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/pixblockdsp.h"
 
-void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, int stride);
+void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
 void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1,
                           const uint8_t *s2, int stride);
 
diff --git a/libavcodec/arm/rdft_neon.S b/libavcodec/arm/rdft_neon.S
index 7d01d53f1a..781d976354 100644
--- a/libavcodec/arm/rdft_neon.S
+++ b/libavcodec/arm/rdft_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised RDFT
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rv34dsp_init_arm.c b/libavcodec/arm/rv34dsp_init_arm.c
index 5ce787ba7f..8bfe90b3d3 100644
--- a/libavcodec/arm/rv34dsp_init_arm.c
+++ b/libavcodec/arm/rv34dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
index a29123f772..3d4a83d9ac 100644
--- a/libavcodec/arm/rv34dsp_neon.S
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rv40dsp_init_arm.c b/libavcodec/arm/rv40dsp_init_arm.c
index df3e4611a1..c24854d1cd 100644
--- a/libavcodec/arm/rv40dsp_init_arm.c
+++ b/libavcodec/arm/rv40dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
index 6bd45eb5ad..099f88c092 100644
--- a/libavcodec/arm/rv40dsp_neon.S
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/sbrdsp_init_arm.c b/libavcodec/arm/sbrdsp_init_arm.c
index 4da7967b49..4fb69f922b 100644
--- a/libavcodec/arm/sbrdsp_init_arm.c
+++ b/libavcodec/arm/sbrdsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/sbrdsp_neon.S b/libavcodec/arm/sbrdsp_neon.S
index 610397f9e2..e66abd682a 100644
--- a/libavcodec/arm/sbrdsp_neon.S
+++ b/libavcodec/arm/sbrdsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/simple_idct_arm.S b/libavcodec/arm/simple_idct_arm.S
index bf9ee3d4e6..c6300737a8 100644
--- a/libavcodec/arm/simple_idct_arm.S
+++ b/libavcodec/arm/simple_idct_arm.S
@@ -4,22 +4,22 @@
  * Author: Frederic Boulay <dilb@handhelds.org>
  *
  * The function defined in this file is derived from the simple_idct function
- * from the libavcodec library part of the Libav project.
+ * from the libavcodec library part of the FFmpeg project.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S
index bf509eeffc..d1f10b75cb 100644
--- a/libavcodec/arm/simple_idct_armv5te.S
+++ b/libavcodec/arm/simple_idct_armv5te.S
@@ -4,20 +4,20 @@
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S
index 60723467a0..79cf5d41fb 100644
--- a/libavcodec/arm/simple_idct_armv6.S
+++ b/libavcodec/arm/simple_idct_armv6.S
@@ -4,20 +4,20 @@
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S
index a1cde8d80a..c3e573c00a 100644
--- a/libavcodec/arm/simple_idct_neon.S
+++ b/libavcodec/arm/simple_idct_neon.S
@@ -6,20 +6,20 @@
  * Based on Simple IDCT
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/startcode.h b/libavcodec/arm/startcode.h
index d7996c1a4b..cf25d9d4df 100644
--- a/libavcodec/arm/startcode.h
+++ b/libavcodec/arm/startcode.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/startcode_armv6.S b/libavcodec/arm/startcode_armv6.S
index 64078b2898..a46f009375 100644
--- a/libavcodec/arm/startcode_armv6.S
+++ b/libavcodec/arm/startcode_armv6.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/synth_filter_neon.S b/libavcodec/arm/synth_filter_neon.S
index 62bb6674ed..5417be7d53 100644
--- a/libavcodec/arm/synth_filter_neon.S
+++ b/libavcodec/arm/synth_filter_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S
index 5d79e509f9..596734c5bc 100644
--- a/libavcodec/arm/synth_filter_vfp.S
+++ b/libavcodec/arm/synth_filter_vfp.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vc1dsp.h b/libavcodec/arm/vc1dsp.h
index 30f059f28c..cd01ac5384 100644
--- a/libavcodec/arm/vc1dsp.h
+++ b/libavcodec/arm/vc1dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vc1dsp_init_arm.c b/libavcodec/arm/vc1dsp_init_arm.c
index a6a97c8bf9..5f2c759048 100644
--- a/libavcodec/arm/vc1dsp_init_arm.c
+++ b/libavcodec/arm/vc1dsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,8 +28,10 @@ av_cold void ff_vc1dsp_init_arm(VC1DSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if HAVE_ARMV6
     if (have_setend(cpu_flags))
         dsp->startcode_find_candidate = ff_startcode_find_candidate_armv6;
+#endif
     if (have_neon(cpu_flags))
         ff_vc1dsp_init_neon(dsp);
 }
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index 9ded7a28b9..bb873e687e 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,40 +37,38 @@ void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, int linesize, int16_t *block);
 void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int rnd);
 
-void ff_put_vc1_mspel_mc10_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc20_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc30_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc01_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc02_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc03_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc11_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc12_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc13_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc21_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc22_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc23_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc31_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc32_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc33_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
+#define DECL_PUT(X, Y) \
+void ff_put_vc1_mspel_mc##X##Y##_neon(uint8_t *dst, const uint8_t *src, \
+                                      ptrdiff_t stride, int rnd); \
+static void ff_put_vc1_mspel_mc##X##Y##_16_neon(uint8_t *dst, const uint8_t *src, \
+                                         ptrdiff_t stride, int rnd) \
+{ \
+  ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \
+  ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \
+  dst += 8*stride; src += 8*stride; \
+  ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \
+  ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \
+}
+
+DECL_PUT(1, 0)
+DECL_PUT(2, 0)
+DECL_PUT(3, 0)
+
+DECL_PUT(0, 1)
+DECL_PUT(0, 2)
+DECL_PUT(0, 3)
+
+DECL_PUT(1, 1)
+DECL_PUT(1, 2)
+DECL_PUT(1, 3)
+
+DECL_PUT(2, 1)
+DECL_PUT(2, 2)
+DECL_PUT(2, 3)
+
+DECL_PUT(3, 1)
+DECL_PUT(3, 2)
+DECL_PUT(3, 3)
 
 void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
                                 int x, int y);
@@ -81,6 +79,10 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
 void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
                                 int x, int y);
 
+#define FN_ASSIGN(X, Y) \
+    dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
+    dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
+
 av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
 {
     dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
@@ -92,23 +94,26 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
     dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
     dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
 
-    dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_pixels8x8_neon;
+    dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
     if (HAVE_AS_DN_DIRECTIVE) {
-    dsp->put_vc1_mspel_pixels_tab[ 1] = ff_put_vc1_mspel_mc10_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 2] = ff_put_vc1_mspel_mc20_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 3] = ff_put_vc1_mspel_mc30_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 4] = ff_put_vc1_mspel_mc01_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 5] = ff_put_vc1_mspel_mc11_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 6] = ff_put_vc1_mspel_mc21_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 7] = ff_put_vc1_mspel_mc31_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 8] = ff_put_vc1_mspel_mc02_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 9] = ff_put_vc1_mspel_mc12_neon;
-    dsp->put_vc1_mspel_pixels_tab[10] = ff_put_vc1_mspel_mc22_neon;
-    dsp->put_vc1_mspel_pixels_tab[11] = ff_put_vc1_mspel_mc32_neon;
-    dsp->put_vc1_mspel_pixels_tab[12] = ff_put_vc1_mspel_mc03_neon;
-    dsp->put_vc1_mspel_pixels_tab[13] = ff_put_vc1_mspel_mc13_neon;
-    dsp->put_vc1_mspel_pixels_tab[14] = ff_put_vc1_mspel_mc23_neon;
-    dsp->put_vc1_mspel_pixels_tab[15] = ff_put_vc1_mspel_mc33_neon;
+    FN_ASSIGN(1, 0);
+    FN_ASSIGN(2, 0);
+    FN_ASSIGN(3, 0);
+
+    FN_ASSIGN(0, 1);
+    FN_ASSIGN(1, 1);
+    FN_ASSIGN(2, 1);
+    FN_ASSIGN(3, 1);
+
+    FN_ASSIGN(0, 2);
+    FN_ASSIGN(1, 2);
+    FN_ASSIGN(2, 2);
+    FN_ASSIGN(3, 2);
+
+    FN_ASSIGN(0, 3);
+    FN_ASSIGN(1, 3);
+    FN_ASSIGN(2, 3);
+    FN_ASSIGN(3, 3);
     }
 
     dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index fa87eded61..c4f4db9c8e 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -4,20 +4,20 @@
  * Copyright (c) 2010 Rob Clark <rob@ti.com>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/videodsp_arm.h b/libavcodec/arm/videodsp_arm.h
index a7087599cc..112cbb86c7 100644
--- a/libavcodec/arm/videodsp_arm.h
+++ b/libavcodec/arm/videodsp_arm.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/videodsp_armv5te.S b/libavcodec/arm/videodsp_armv5te.S
index 0510019f03..aff1161ada 100644
--- a/libavcodec/arm/videodsp_armv5te.S
+++ b/libavcodec/arm/videodsp_armv5te.S
@@ -2,20 +2,20 @@
 @ ARMv5te-optimized core video DSP functions
 @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
 @
-@ This file is part of Libav.
+@ This file is part of FFmpeg
 @
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
 @ modify it under the terms of the GNU Lesser General Public
 @ License as published by the Free Software Foundation; either
 @ version 2.1 of the License, or (at your option) any later version.
 @
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 @ Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 @
 
diff --git a/libavcodec/arm/videodsp_init_arm.c b/libavcodec/arm/videodsp_init_arm.c
index 20c6e4a605..a89abb25d5 100644
--- a/libavcodec/arm/videodsp_init_arm.c
+++ b/libavcodec/arm/videodsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/videodsp_init_armv5te.c b/libavcodec/arm/videodsp_init_armv5te.c
index 832191f6d2..1ea1f3438d 100644
--- a/libavcodec/arm/videodsp_init_armv5te.c
+++ b/libavcodec/arm/videodsp_init_armv5te.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,5 +27,7 @@ void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h);
 
 av_cold void ff_videodsp_init_armv5te(VideoDSPContext *ctx, int bpc)
 {
+#if HAVE_ARMV5TE_EXTERNAL
     ctx->prefetch = ff_prefetch_arm;
+#endif
 }
diff --git a/libavcodec/arm/vorbisdsp_init_arm.c b/libavcodec/arm/vorbisdsp_init_arm.c
index 853ba2d865..f4b3d80ef6 100644
--- a/libavcodec/arm/vorbisdsp_init_arm.c
+++ b/libavcodec/arm/vorbisdsp_init_arm.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vorbisdsp_neon.S b/libavcodec/arm/vorbisdsp_neon.S
index 7df876c2bc..79ce54f938 100644
--- a/libavcodec/arm/vorbisdsp_neon.S
+++ b/libavcodec/arm/vorbisdsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp3dsp_init_arm.c b/libavcodec/arm/vp3dsp_init_arm.c
index 11e1f1ca11..d9246365df 100644
--- a/libavcodec/arm/vp3dsp_init_arm.c
+++ b/libavcodec/arm/vp3dsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index 58bd97d548..2942d488f5 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp56_arith.h b/libavcodec/arm/vp56_arith.h
index 6bc9456336..feb1247916 100644
--- a/libavcodec/arm/vp56_arith.h
+++ b/libavcodec/arm/vp56_arith.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp6dsp_init_arm.c b/libavcodec/arm/vp6dsp_init_arm.c
index 4ec41ed9f1..ed6832145e 100644
--- a/libavcodec/arm/vp6dsp_init_arm.c
+++ b/libavcodec/arm/vp6dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp6dsp_neon.S b/libavcodec/arm/vp6dsp_neon.S
index 10b4d0f14c..03dd28d1cb 100644
--- a/libavcodec/arm/vp6dsp_neon.S
+++ b/libavcodec/arm/vp6dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8.h b/libavcodec/arm/vp8.h
index 93b2788835..965342d93b 100644
--- a/libavcodec/arm/vp8.h
+++ b/libavcodec/arm/vp8.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8_armv6.S b/libavcodec/arm/vp8_armv6.S
index 3863dc31a5..e7d25a45c1 100644
--- a/libavcodec/arm/vp8_armv6.S
+++ b/libavcodec/arm/vp8_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2010 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp.h b/libavcodec/arm/vp8dsp.h
index 0d55e0ffc0..7281d0bfb1 100644
--- a/libavcodec/arm/vp8dsp.h
+++ b/libavcodec/arm/vp8dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp_armv6.S b/libavcodec/arm/vp8dsp_armv6.S
index 03100cdf2f..1adcbbdbb1 100644
--- a/libavcodec/arm/vp8dsp_armv6.S
+++ b/libavcodec/arm/vp8dsp_armv6.S
@@ -5,20 +5,20 @@
  * Copyright (c) 2010 Rob Clark <rob@ti.com>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * This code was partially ported from libvpx, which uses this license:
diff --git a/libavcodec/arm/vp8dsp_init_arm.c b/libavcodec/arm/vp8dsp_init_arm.c
index aa77dbab98..8b801766d7 100644
--- a/libavcodec/arm/vp8dsp_init_arm.c
+++ b/libavcodec/arm/vp8dsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp_init_armv6.c b/libavcodec/arm/vp8dsp_init_armv6.c
index febe4e71a2..a5bcd733e0 100644
--- a/libavcodec/arm/vp8dsp_init_armv6.c
+++ b/libavcodec/arm/vp8dsp_init_armv6.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp_init_neon.c b/libavcodec/arm/vp8dsp_init_neon.c
index 2b6c7750d3..53f1f23380 100644
--- a/libavcodec/arm/vp8dsp_init_neon.c
+++ b/libavcodec/arm/vp8dsp_init_neon.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S
index 544332c3c2..5319346951 100644
--- a/libavcodec/arm/vp8dsp_neon.S
+++ b/libavcodec/arm/vp8dsp_neon.S
@@ -4,20 +4,20 @@
  * Copyright (c) 2010 Rob Clark <rob@ti.com>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ass.c b/libavcodec/ass.c
index 098050a1a6..227d571af1 100644
--- a/libavcodec/ass.c
+++ b/libavcodec/ass.c
@@ -1,66 +1,76 @@
 /*
- * SSA/ASS common funtions
+ * SSA/ASS common functions
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
 #include "ass.h"
+#include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
 #include "libavutil/common.h"
 
-/**
- * Generate a suitable AVCodecContext.subtitle_header for SUBTITLE_ASS.
- *
- * @param avctx pointer to the AVCodecContext
- * @param font name of the default font face to use
- * @param font_size default font size to use
- * @param color default text color to use (ABGR)
- * @param back_color default background color to use (ABGR)
- * @param bold 1 for bold text, 0 for normal text
- * @param italic 1 for italic text, 0 for normal text
- * @param underline 1 for underline text, 0 for normal text
- * @param alignment position of the text (left, center, top...), defined after
- *                  the layout of the numpad (1-3 sub, 4-6 mid, 7-9 top)
- * @return >= 0 on success otherwise an error code <0
- */
-static int ass_subtitle_header(AVCodecContext *avctx,
-                               const char *font, int font_size,
-                               int color, int back_color,
-                               int bold, int italic, int underline,
-                               int alignment)
+int ff_ass_subtitle_header(AVCodecContext *avctx,
+                           const char *font, int font_size,
+                           int color, int back_color,
+                           int bold, int italic, int underline,
+                           int alignment)
 {
-    char header[512];
-
-    snprintf(header, sizeof(header),
+    avctx->subtitle_header = av_asprintf(
              "[Script Info]\r\n"
+             "; Script generated by FFmpeg/Lavc%s\r\n"
              "ScriptType: v4.00+\r\n"
+             "PlayResX: %d\r\n"
+             "PlayResY: %d\r\n"
              "\r\n"
              "[V4+ Styles]\r\n"
-             "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, AlphaLevel, Encoding\r\n"
-             "Style: Default,%s,%d,&H%x,&H%x,&H%x,&H%x,%d,%d,%d,1,1,0,%d,10,10,10,0,0\r\n"
+
+             /* ASSv4 header */
+             "Format: Name, "
+             "Fontname, Fontsize, "
+             "PrimaryColour, SecondaryColour, OutlineColour, BackColour, "
+             "Bold, Italic, Underline, StrikeOut, "
+             "ScaleX, ScaleY, "
+             "Spacing, Angle, "
+             "BorderStyle, Outline, Shadow, "
+             "Alignment, MarginL, MarginR, MarginV, "
+             "Encoding\r\n"
+
+             "Style: "
+             "Default,"             /* Name */
+             "%s,%d,"               /* Font{name,size} */
+             "&H%x,&H%x,&H%x,&H%x," /* {Primary,Secondary,Outline,Back}Colour */
+             "%d,%d,%d,0,"          /* Bold, Italic, Underline, StrikeOut */
+             "100,100,"             /* Scale{X,Y} */
+             "0,0,"                 /* Spacing, Angle */
+             "1,1,0,"               /* BorderStyle, Outline, Shadow */
+             "%d,10,10,10,"         /* Alignment, Margin[LRV] */
+             "0\r\n"                /* Encoding */
+
              "\r\n"
              "[Events]\r\n"
-             "Format: Layer, Start, End, Text\r\n",
+             "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\r\n",
+             !(avctx->flags & AV_CODEC_FLAG_BITEXACT) ? AV_STRINGIFY(LIBAVCODEC_VERSION) : "",
+             ASS_DEFAULT_PLAYRESX, ASS_DEFAULT_PLAYRESY,
              font, font_size, color, color, back_color, back_color,
              -bold, -italic, -underline, alignment);
 
-    avctx->subtitle_header = av_strdup(header);
     if (!avctx->subtitle_header)
         return AVERROR(ENOMEM);
     avctx->subtitle_header_size = strlen(avctx->subtitle_header);
@@ -69,7 +79,7 @@ static int ass_subtitle_header(AVCodecContext *avctx,
 
 int ff_ass_subtitle_header_default(AVCodecContext *avctx)
 {
-    return ass_subtitle_header(avctx, ASS_DEFAULT_FONT,
+    return ff_ass_subtitle_header(avctx, ASS_DEFAULT_FONT,
                                ASS_DEFAULT_FONT_SIZE,
                                ASS_DEFAULT_COLOR,
                                ASS_DEFAULT_BACK_COLOR,
@@ -79,47 +89,135 @@ int ff_ass_subtitle_header_default(AVCodecContext *avctx)
                                ASS_DEFAULT_ALIGNMENT);
 }
 
-void ff_ass_init(AVSubtitle *sub)
+static void insert_ts(AVBPrint *buf, int ts)
 {
-    memset(sub, 0, sizeof(*sub));
-}
+    if (ts == -1) {
+        av_bprintf(buf, "9:59:59.99,");
+    } else {
+        int h, m, s;
 
-static int ts_to_string(char *str, int strlen, int ts)
-{
-    int h, m, s;
-    h = ts/360000;  ts -= 360000*h;
-    m = ts/  6000;  ts -=   6000*m;
-    s = ts/   100;  ts -=    100*s;
-    return snprintf(str, strlen, "%d:%02d:%02d.%02d", h, m, s, ts);
+        h = ts/360000;  ts -= 360000*h;
+        m = ts/  6000;  ts -=   6000*m;
+        s = ts/   100;  ts -=    100*s;
+        av_bprintf(buf, "%d:%02d:%02d.%02d,", h, m, s, ts);
+    }
 }
 
-int ff_ass_add_rect(AVSubtitle *sub, const char *dialog,
-                    int ts_start, int ts_end, int raw)
+int ff_ass_bprint_dialog(AVBPrint *buf, const char *dialog,
+                         int ts_start, int duration, int raw)
 {
-    int len = 0, dlen, duration = ts_end - ts_start;
-    char s_start[16], s_end[16], header[48] = {0};
-    AVSubtitleRect **rects;
+    int dlen;
+
+    if (!raw || raw == 2) {
+        long int layer = 0;
+
+        if (raw == 2) {
+            /* skip ReadOrder */
+            dialog = strchr(dialog, ',');
+            if (!dialog)
+                return AVERROR_INVALIDDATA;
+            dialog++;
 
-    if (!raw) {
-        ts_to_string(s_start, sizeof(s_start), ts_start);
-        ts_to_string(s_end,   sizeof(s_end),   ts_end  );
-        len = snprintf(header, sizeof(header), "Dialogue: 0,%s,%s,",
-                       s_start, s_end);
+            /* extract Layer or Marked */
+            layer = strtol(dialog, (char**)&dialog, 10);
+            if (*dialog != ',')
+                return AVERROR_INVALIDDATA;
+            dialog++;
+        }
+        av_bprintf(buf, "Dialogue: %ld,", layer);
+        insert_ts(buf, ts_start);
+        insert_ts(buf, duration == -1 ? -1 : ts_start + duration);
+        if (raw != 2)
+            av_bprintf(buf, "Default,,0,0,0,,");
     }
 
     dlen = strcspn(dialog, "\n");
     dlen += dialog[dlen] == '\n';
 
-    rects = av_realloc(sub->rects, (sub->num_rects+1) * sizeof(*sub->rects));
+    av_bprintf(buf, "%.*s", dlen, dialog);
+    if (raw == 2)
+        av_bprintf(buf, "\r\n");
+
+    return dlen;
+}
+
+int ff_ass_add_rect(AVSubtitle *sub, const char *dialog,
+                    int ts_start, int duration, int raw)
+{
+    AVBPrint buf;
+    int ret, dlen;
+    AVSubtitleRect **rects;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if ((ret = ff_ass_bprint_dialog(&buf, dialog, ts_start, duration, raw)) < 0)
+        goto err;
+    dlen = ret;
+    if (!av_bprint_is_complete(&buf))
+        goto errnomem;
+
+    rects = av_realloc_array(sub->rects, (sub->num_rects+1), sizeof(*sub->rects));
     if (!rects)
-        return AVERROR(ENOMEM);
+        goto errnomem;
     sub->rects = rects;
     sub->end_display_time = FFMAX(sub->end_display_time, 10 * duration);
     rects[sub->num_rects]       = av_mallocz(sizeof(*rects[0]));
     rects[sub->num_rects]->type = SUBTITLE_ASS;
-    rects[sub->num_rects]->ass  = av_malloc(len + dlen + 1);
-    strcpy (rects[sub->num_rects]->ass      , header);
-    av_strlcpy(rects[sub->num_rects]->ass + len, dialog, dlen + 1);
+    ret = av_bprint_finalize(&buf, &rects[sub->num_rects]->ass);
+    if (ret < 0)
+        goto err;
     sub->num_rects++;
     return dlen;
+
+errnomem:
+    ret = AVERROR(ENOMEM);
+err:
+    av_bprint_finalize(&buf, NULL);
+    return ret;
+}
+
+int ff_ass_add_rect_bprint(AVSubtitle *sub, AVBPrint *buf,
+                           int ts_start, int duration)
+{
+    av_bprintf(buf, "\r\n");
+    if (!av_bprint_is_complete(buf))
+        return AVERROR(ENOMEM);
+    return ff_ass_add_rect(sub, buf->str, ts_start, duration, 0);
+}
+
+void ff_ass_bprint_text_event(AVBPrint *buf, const char *p, int size,
+                             const char *linebreaks, int keep_ass_markup)
+{
+    const char *p_end = p + size;
+
+    for (; p < p_end && *p; p++) {
+
+        /* forced custom line breaks, not accounted as "normal" EOL */
+        if (linebreaks && strchr(linebreaks, *p)) {
+            av_bprintf(buf, "\\N");
+
+        /* standard ASS escaping so random characters don't get mis-interpreted
+         * as ASS */
+        } else if (!keep_ass_markup && strchr("{}\\", *p)) {
+            av_bprintf(buf, "\\%c", *p);
+
+        /* some packets might end abruptly (no \0 at the end, like for example
+         * in some cases of demuxing from a classic video container), some
+         * might be terminated with \n or \r\n which we have to remove (for
+         * consistency with those who haven't), and we also have to deal with
+         * evil cases such as \r at the end of the buffer (and no \0 terminated
+         * character) */
+        } else if (p[0] == '\n') {
+            /* some stuff left so we can insert a line break */
+            if (p < p_end - 1)
+                av_bprintf(buf, "\\N");
+        } else if (p[0] == '\r' && p < p_end - 1 && p[1] == '\n') {
+            /* \r followed by a \n, we can skip it. We don't insert the \N yet
+             * because we don't know if it is followed by more text */
+            continue;
+
+        /* finally, a sane character */
+        } else {
+            av_bprint_chars(buf, *p, 1);
+        }
+    }
 }
diff --git a/libavcodec/ass.h b/libavcodec/ass.h
index 594b5f3ac6..f3046efb79 100644
--- a/libavcodec/ass.h
+++ b/libavcodec/ass.h
@@ -1,21 +1,21 @@
 /*
- * SSA/ASS common funtions
+ * SSA/ASS common functions
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,10 @@
 #define AVCODEC_ASS_H
 
 #include "avcodec.h"
+#include "libavutil/bprint.h"
+
+#define ASS_DEFAULT_PLAYRESX 384
+#define ASS_DEFAULT_PLAYRESY 288
 
 /**
  * @name Default values for ASS style
@@ -39,6 +43,27 @@
 /** @} */
 
 /**
+ * Generate a suitable AVCodecContext.subtitle_header for SUBTITLE_ASS.
+ *
+ * @param avctx pointer to the AVCodecContext
+ * @param font name of the default font face to use
+ * @param font_size default font size to use
+ * @param color default text color to use (ABGR)
+ * @param back_color default background color to use (ABGR)
+ * @param bold 1 for bold text, 0 for normal text
+ * @param italic 1 for italic text, 0 for normal text
+ * @param underline 1 for underline text, 0 for normal text
+ * @param alignment position of the text (left, center, top...), defined after
+ *                  the layout of the numpad (1-3 sub, 4-6 mid, 7-9 top)
+ * @return >= 0 on success otherwise an error code <0
+ */
+int ff_ass_subtitle_header(AVCodecContext *avctx,
+                           const char *font, int font_size,
+                           int color, int back_color,
+                           int bold, int italic, int underline,
+                           int alignment);
+
+/**
  * Generate a suitable AVCodecContext.subtitle_header for SUBTITLE_ASS
  * with default style.
  *
@@ -48,20 +73,45 @@
 int ff_ass_subtitle_header_default(AVCodecContext *avctx);
 
 /**
- * Initialize an AVSubtitle structure for use with ff_ass_add_rect().
+ * Add an ASS dialog line to an AVSubtitle as a new AVSubtitleRect.
  *
  * @param sub pointer to the AVSubtitle
+ * @param dialog ASS dialog to add to sub
+ * @param ts_start start timestamp for this dialog (in 1/100 second unit)
+ * @param duration duration for this dialog (in 1/100 second unit), can be -1
+ *                 to last until the end of the presentation
+ * @param raw when set to 2, it indicates that dialog contains an ASS
+ *                           dialog line as muxed in Matroska
+ *            when set to 1, it indicates that dialog contains a whole SSA
+ *                           dialog line which should be copied as is.
+ *            when set to 0, it indicates that dialog contains only the Text
+ *                           part of the ASS dialog line, the rest of the line
+ *                           will be generated.
+ * @return number of characters read from dialog. It can be less than the whole
+ *         length of dialog, if dialog contains several lines of text.
+ *         A negative value indicates an error.
  */
-void ff_ass_init(AVSubtitle *sub);
+int ff_ass_add_rect(AVSubtitle *sub, const char *dialog,
+                    int ts_start, int duration, int raw);
 
 /**
- * Add an ASS dialog line to an AVSubtitle as a new AVSubtitleRect.
+ * Same as ff_ass_add_rect_bprint, but taking an AVBPrint buffer instead of a
+ * string, and assuming raw=0.
+ */
+int ff_ass_add_rect_bprint(AVSubtitle *sub, AVBPrint *buf,
+                           int ts_start, int duration);
+
+/**
+ * Add an ASS dialog line to an AVBPrint buffer.
  *
- * @param sub pointer to the AVSubtitle
+ * @param buf pointer to an initialized AVBPrint buffer
  * @param dialog ASS dialog to add to sub
  * @param ts_start start timestamp for this dialog (in 1/100 second unit)
- * @param ts_end end timestamp for this dialog (in 1/100 second unit)
- * @param raw when set to 1, it indicates that dialog contains a whole ASS
+ * @param duration duration for this dialog (in 1/100 second unit), can be -1
+ *                 to last until the end of the presentation
+ * @param raw when set to 2, it indicates that dialog contains an ASS
+ *                           dialog line as muxed in Matroska
+ *            when set to 1, it indicates that dialog contains a whole SSA
  *                           dialog line which should be copied as is.
  *            when set to 0, it indicates that dialog contains only the Text
  *                           part of the ASS dialog line, the rest of the line
@@ -70,7 +120,19 @@ void ff_ass_init(AVSubtitle *sub);
  *         length of dialog, if dialog contains several lines of text.
  *         A negative value indicates an error.
  */
-int ff_ass_add_rect(AVSubtitle *sub, const char *dialog,
-                    int ts_start, int ts_end, int raw);
+int ff_ass_bprint_dialog(AVBPrint *buf, const char *dialog,
+                         int ts_start, int duration, int raw);
 
+/**
+ * Escape a text subtitle using ASS syntax into an AVBPrint buffer.
+ * Newline characters will be escaped to \N.
+ *
+ * @param buf pointer to an initialized AVBPrint buffer
+ * @param p source text
+ * @param size size of the source text
+ * @param linebreaks additional newline chars, which will be escaped to \N
+ * @param keep_ass_markup braces and backslash will not be escaped if set
+ */
+void ff_ass_bprint_text_event(AVBPrint *buf, const char *p, int size,
+                             const char *linebreaks, int keep_ass_markup);
 #endif /* AVCODEC_ASS_H */
diff --git a/libavcodec/ass_split.c b/libavcodec/ass_split.c
new file mode 100644
index 0000000000..9bc7b9d5d6
--- /dev/null
+++ b/libavcodec/ass_split.c
@@ -0,0 +1,531 @@
+/*
+ * SSA/ASS spliting functions
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "ass_split.h"
+
+typedef enum {
+    ASS_STR,
+    ASS_INT,
+    ASS_FLT,
+    ASS_COLOR,
+    ASS_TIMESTAMP,
+    ASS_ALGN,
+} ASSFieldType;
+
+typedef struct {
+    const char *name;
+    int type;
+    int offset;
+} ASSFields;
+
+typedef struct {
+    const char *section;
+    const char *format_header;
+    const char *fields_header;
+    int         size;
+    int         offset;
+    int         offset_count;
+    ASSFields   fields[24];
+} ASSSection;
+
+static const ASSSection ass_sections[] = {
+    { .section       = "Script Info",
+      .offset        = offsetof(ASS, script_info),
+      .fields = {{"ScriptType", ASS_STR, offsetof(ASSScriptInfo, script_type)},
+                 {"Collisions", ASS_STR, offsetof(ASSScriptInfo, collisions) },
+                 {"PlayResX",   ASS_INT, offsetof(ASSScriptInfo, play_res_x) },
+                 {"PlayResY",   ASS_INT, offsetof(ASSScriptInfo, play_res_y) },
+                 {"Timer",      ASS_FLT, offsetof(ASSScriptInfo, timer)      },
+                 {0},
+        }
+    },
+    { .section       = "V4+ Styles",
+      .format_header = "Format",
+      .fields_header = "Style",
+      .size          = sizeof(ASSStyle),
+      .offset        = offsetof(ASS, styles),
+      .offset_count  = offsetof(ASS, styles_count),
+      .fields = {{"Name",            ASS_STR,   offsetof(ASSStyle, name)           },
+                 {"Fontname",        ASS_STR,   offsetof(ASSStyle, font_name)      },
+                 {"Fontsize",        ASS_INT,   offsetof(ASSStyle, font_size)      },
+                 {"PrimaryColour",   ASS_COLOR, offsetof(ASSStyle, primary_color)  },
+                 {"SecondaryColour", ASS_COLOR, offsetof(ASSStyle, secondary_color)},
+                 {"OutlineColour",   ASS_COLOR, offsetof(ASSStyle, outline_color)  },
+                 {"BackColour",      ASS_COLOR, offsetof(ASSStyle, back_color)     },
+                 {"Bold",            ASS_INT,   offsetof(ASSStyle, bold)           },
+                 {"Italic",          ASS_INT,   offsetof(ASSStyle, italic)         },
+                 {"Underline",       ASS_INT,   offsetof(ASSStyle, underline)      },
+                 {"StrikeOut",       ASS_INT,   offsetof(ASSStyle, strikeout)      },
+                 {"ScaleX",          ASS_FLT,   offsetof(ASSStyle, scalex)         },
+                 {"ScaleY",          ASS_FLT,   offsetof(ASSStyle, scaley)         },
+                 {"Spacing",         ASS_FLT,   offsetof(ASSStyle, spacing)        },
+                 {"Angle",           ASS_FLT,   offsetof(ASSStyle, angle)          },
+                 {"BorderStyle",     ASS_INT,   offsetof(ASSStyle, border_style)   },
+                 {"Outline",         ASS_FLT,   offsetof(ASSStyle, outline)        },
+                 {"Shadow",          ASS_FLT,   offsetof(ASSStyle, shadow)         },
+                 {"Alignment",       ASS_INT,   offsetof(ASSStyle, alignment)      },
+                 {"MarginL",         ASS_INT,   offsetof(ASSStyle, margin_l)       },
+                 {"MarginR",         ASS_INT,   offsetof(ASSStyle, margin_r)       },
+                 {"MarginV",         ASS_INT,   offsetof(ASSStyle, margin_v)       },
+                 {"Encoding",        ASS_INT,   offsetof(ASSStyle, encoding)       },
+                 {0},
+        }
+    },
+    { .section       = "V4 Styles",
+      .format_header = "Format",
+      .fields_header = "Style",
+      .size          = sizeof(ASSStyle),
+      .offset        = offsetof(ASS, styles),
+      .offset_count  = offsetof(ASS, styles_count),
+      .fields = {{"Name",            ASS_STR,   offsetof(ASSStyle, name)           },
+                 {"Fontname",        ASS_STR,   offsetof(ASSStyle, font_name)      },
+                 {"Fontsize",        ASS_INT,   offsetof(ASSStyle, font_size)      },
+                 {"PrimaryColour",   ASS_COLOR, offsetof(ASSStyle, primary_color)  },
+                 {"SecondaryColour", ASS_COLOR, offsetof(ASSStyle, secondary_color)},
+                 {"TertiaryColour",  ASS_COLOR, offsetof(ASSStyle, outline_color)  },
+                 {"BackColour",      ASS_COLOR, offsetof(ASSStyle, back_color)     },
+                 {"Bold",            ASS_INT,   offsetof(ASSStyle, bold)           },
+                 {"Italic",          ASS_INT,   offsetof(ASSStyle, italic)         },
+                 {"BorderStyle",     ASS_INT,   offsetof(ASSStyle, border_style)   },
+                 {"Outline",         ASS_FLT,   offsetof(ASSStyle, outline)        },
+                 {"Shadow",          ASS_FLT,   offsetof(ASSStyle, shadow)         },
+                 {"Alignment",       ASS_ALGN,  offsetof(ASSStyle, alignment)      },
+                 {"MarginL",         ASS_INT,   offsetof(ASSStyle, margin_l)       },
+                 {"MarginR",         ASS_INT,   offsetof(ASSStyle, margin_r)       },
+                 {"MarginV",         ASS_INT,   offsetof(ASSStyle, margin_v)       },
+                 {"AlphaLevel",      ASS_INT,   offsetof(ASSStyle, alpha_level)    },
+                 {"Encoding",        ASS_INT,   offsetof(ASSStyle, encoding)       },
+                 {0},
+        }
+    },
+    { .section       = "Events",
+      .format_header = "Format",
+      .fields_header = "Dialogue",
+      .size          = sizeof(ASSDialog),
+      .offset        = offsetof(ASS, dialogs),
+      .offset_count  = offsetof(ASS, dialogs_count),
+      .fields = {{"Layer",   ASS_INT,        offsetof(ASSDialog, layer)   },
+                 {"Start",   ASS_TIMESTAMP,  offsetof(ASSDialog, start)   },
+                 {"End",     ASS_TIMESTAMP,  offsetof(ASSDialog, end)     },
+                 {"Style",   ASS_STR,        offsetof(ASSDialog, style)   },
+                 {"Name",    ASS_STR,        offsetof(ASSDialog, name)    },
+                 {"MarginL", ASS_INT,        offsetof(ASSDialog, margin_l)},
+                 {"MarginR", ASS_INT,        offsetof(ASSDialog, margin_r)},
+                 {"MarginV", ASS_INT,        offsetof(ASSDialog, margin_v)},
+                 {"Effect",  ASS_STR,        offsetof(ASSDialog, effect)  },
+                 {"Text",    ASS_STR,        offsetof(ASSDialog, text)    },
+                 {0},
+        }
+    },
+};
+
+
+typedef int (*ASSConvertFunc)(void *dest, const char *buf, int len);
+
+static int convert_str(void *dest, const char *buf, int len)
+{
+    char *str = av_malloc(len + 1);
+    if (str) {
+        memcpy(str, buf, len);
+        str[len] = 0;
+        if (*(void **)dest)
+            av_free(*(void **)dest);
+        *(char **)dest = str;
+    }
+    return !str;
+}
+static int convert_int(void *dest, const char *buf, int len)
+{
+    return sscanf(buf, "%d", (int *)dest) == 1;
+}
+static int convert_flt(void *dest, const char *buf, int len)
+{
+    return sscanf(buf, "%f", (float *)dest) == 1;
+}
+static int convert_color(void *dest, const char *buf, int len)
+{
+    return sscanf(buf, "&H%8x", (int *)dest) == 1 ||
+           sscanf(buf, "%d",    (int *)dest) == 1;
+}
+static int convert_timestamp(void *dest, const char *buf, int len)
+{
+    int c, h, m, s, cs;
+    if ((c = sscanf(buf, "%d:%02d:%02d.%02d", &h, &m, &s, &cs)) == 4)
+        *(int *)dest = 360000*h + 6000*m + 100*s + cs;
+    return c == 4;
+}
+static int convert_alignment(void *dest, const char *buf, int len)
+{
+    int a;
+    if (sscanf(buf, "%d", &a) == 1) {
+        /* convert V4 Style alignment to V4+ Style */
+        *(int *)dest = a + ((a&4) >> 1) - 5*!!(a&8);
+        return 1;
+    }
+    return 0;
+}
+
+static const ASSConvertFunc convert_func[] = {
+    [ASS_STR]       = convert_str,
+    [ASS_INT]       = convert_int,
+    [ASS_FLT]       = convert_flt,
+    [ASS_COLOR]     = convert_color,
+    [ASS_TIMESTAMP] = convert_timestamp,
+    [ASS_ALGN]      = convert_alignment,
+};
+
+
+struct ASSSplitContext {
+    ASS ass;
+    int current_section;
+    int field_number[FF_ARRAY_ELEMS(ass_sections)];
+    int *field_order[FF_ARRAY_ELEMS(ass_sections)];
+};
+
+
+static uint8_t *realloc_section_array(ASSSplitContext *ctx)
+{
+    const ASSSection *section = &ass_sections[ctx->current_section];
+    int *count = (int *)((uint8_t *)&ctx->ass + section->offset_count);
+    void **section_ptr = (void **)((uint8_t *)&ctx->ass + section->offset);
+    uint8_t *tmp = av_realloc_array(*section_ptr, (*count+1), section->size);
+    if (!tmp)
+        return NULL;
+    *section_ptr = tmp;
+    tmp += *count * section->size;
+    memset(tmp, 0, section->size);
+    (*count)++;
+    return tmp;
+}
+
+static inline int is_eol(char buf)
+{
+    return buf == '\r' || buf == '\n' || buf == 0;
+}
+
+static inline const char *skip_space(const char *buf)
+{
+    while (*buf == ' ')
+        buf++;
+    return buf;
+}
+
+static int *get_default_field_orders(const ASSSection *section)
+{
+    int i;
+    int *order = av_malloc_array(FF_ARRAY_ELEMS(section->fields), sizeof(*order));
+
+    if (!order)
+        return NULL;
+    for (i = 0; section->fields[i].name; i++)
+        order[i] = i;
+    while (i < FF_ARRAY_ELEMS(section->fields))
+        order[i] = -1;
+    return order;
+}
+
+static const char *ass_split_section(ASSSplitContext *ctx, const char *buf)
+{
+    const ASSSection *section = &ass_sections[ctx->current_section];
+    int *number = &ctx->field_number[ctx->current_section];
+    int *order = ctx->field_order[ctx->current_section];
+    int *tmp, i, len;
+
+    while (buf && *buf) {
+        if (buf[0] == '[') {
+            ctx->current_section = -1;
+            break;
+        }
+        if (buf[0] == ';' || (buf[0] == '!' && buf[1] == ':')) {
+            /* skip comments */
+        } else if (section->format_header && !order) {
+            len = strlen(section->format_header);
+            if (strncmp(buf, section->format_header, len) || buf[len] != ':')
+                goto next_line;
+            buf += len + 1;
+            while (!is_eol(*buf)) {
+                buf = skip_space(buf);
+                len = strcspn(buf, ", \r\n");
+                if (!(tmp = av_realloc_array(order, (*number + 1), sizeof(*order))))
+                    return NULL;
+                order = tmp;
+                order[*number] = -1;
+                for (i=0; section->fields[i].name; i++)
+                    if (!strncmp(buf, section->fields[i].name, len)) {
+                        order[*number] = i;
+                        break;
+                    }
+                (*number)++;
+                buf = skip_space(buf + len + (buf[len] == ','));
+            }
+            ctx->field_order[ctx->current_section] = order;
+        } else if (section->fields_header) {
+            len = strlen(section->fields_header);
+            if (!strncmp(buf, section->fields_header, len) && buf[len] == ':') {
+                uint8_t *ptr, *struct_ptr = realloc_section_array(ctx);
+                if (!struct_ptr)  return NULL;
+
+                /* No format header line found so far, assume default */
+                if (!order) {
+                    order = get_default_field_orders(section);
+                    if (!order)
+                        return NULL;
+                    ctx->field_order[ctx->current_section] = order;
+                }
+
+                buf += len + 1;
+                for (i=0; !is_eol(*buf) && i < *number; i++) {
+                    int last = i == *number - 1;
+                    buf = skip_space(buf);
+                    len = strcspn(buf, last ? "\r\n" : ",\r\n");
+                    if (order[i] >= 0) {
+                        ASSFieldType type = section->fields[order[i]].type;
+                        ptr = struct_ptr + section->fields[order[i]].offset;
+                        convert_func[type](ptr, buf, len);
+                    }
+                    buf += len;
+                    if (!last && *buf) buf++;
+                    buf = skip_space(buf);
+                }
+            }
+        } else {
+            len = strcspn(buf, ":\r\n");
+            if (buf[len] == ':') {
+                for (i=0; section->fields[i].name; i++)
+                    if (!strncmp(buf, section->fields[i].name, len)) {
+                        ASSFieldType type = section->fields[i].type;
+                        uint8_t *ptr = (uint8_t *)&ctx->ass + section->offset;
+                        ptr += section->fields[i].offset;
+                        buf = skip_space(buf + len + 1);
+                        convert_func[type](ptr, buf, strcspn(buf, "\r\n"));
+                        break;
+                    }
+            }
+        }
+next_line:
+        buf += strcspn(buf, "\n");
+        buf += !!*buf;
+    }
+    return buf;
+}
+
+static int ass_split(ASSSplitContext *ctx, const char *buf)
+{
+    char c, section[16];
+    int i;
+
+    if (ctx->current_section >= 0)
+        buf = ass_split_section(ctx, buf);
+
+    while (buf && *buf) {
+        if (sscanf(buf, "[%15[0-9A-Za-z+ ]]%c", section, &c) == 2) {
+            buf += strcspn(buf, "\n");
+            buf += !!*buf;
+            for (i=0; i<FF_ARRAY_ELEMS(ass_sections); i++)
+                if (!strcmp(section, ass_sections[i].section)) {
+                    ctx->current_section = i;
+                    buf = ass_split_section(ctx, buf);
+                }
+        } else {
+            buf += strcspn(buf, "\n");
+            buf += !!*buf;
+        }
+    }
+    return buf ? 0 : AVERROR_INVALIDDATA;
+}
+
+ASSSplitContext *ff_ass_split(const char *buf)
+{
+    ASSSplitContext *ctx = av_mallocz(sizeof(*ctx));
+    if (!ctx)
+        return NULL;
+    ctx->current_section = -1;
+    if (ass_split(ctx, buf) < 0) {
+        ff_ass_split_free(ctx);
+        return NULL;
+    }
+    return ctx;
+}
+
+static void free_section(ASSSplitContext *ctx, const ASSSection *section)
+{
+    uint8_t *ptr = (uint8_t *)&ctx->ass + section->offset;
+    int i, j, *count, c = 1;
+
+    if (section->format_header) {
+        ptr   = *(void **)ptr;
+        count = (int *)((uint8_t *)&ctx->ass + section->offset_count);
+    } else
+        count = &c;
+
+    if (ptr)
+        for (i=0; i<*count; i++, ptr += section->size)
+            for (j=0; section->fields[j].name; j++) {
+                const ASSFields *field = &section->fields[j];
+                if (field->type == ASS_STR)
+                    av_freep(ptr + field->offset);
+            }
+    *count = 0;
+
+    if (section->format_header)
+        av_freep((uint8_t *)&ctx->ass + section->offset);
+}
+
+ASSDialog *ff_ass_split_dialog(ASSSplitContext *ctx, const char *buf,
+                               int cache, int *number)
+{
+    ASSDialog *dialog = NULL;
+    int i, count;
+    if (!cache)
+        for (i=0; i<FF_ARRAY_ELEMS(ass_sections); i++)
+            if (!strcmp(ass_sections[i].section, "Events")) {
+                free_section(ctx, &ass_sections[i]);
+                break;
+            }
+    count = ctx->ass.dialogs_count;
+    if (ass_split(ctx, buf) == 0)
+        dialog = ctx->ass.dialogs + count;
+    if (number)
+        *number = ctx->ass.dialogs_count - count;
+    return dialog;
+}
+
+void ff_ass_split_free(ASSSplitContext *ctx)
+{
+    if (ctx) {
+        int i;
+        for (i=0; i<FF_ARRAY_ELEMS(ass_sections); i++) {
+            free_section(ctx, &ass_sections[i]);
+            av_freep(&(ctx->field_order[i]));
+        }
+        av_free(ctx);
+    }
+}
+
+
+int ff_ass_split_override_codes(const ASSCodesCallbacks *callbacks, void *priv,
+                                const char *buf)
+{
+    const char *text = NULL;
+    char new_line[2];
+    int text_len = 0;
+
+    while (buf && *buf) {
+        if (text && callbacks->text &&
+            (sscanf(buf, "\\%1[nN]", new_line) == 1 ||
+             !strncmp(buf, "{\\", 2))) {
+            callbacks->text(priv, text, text_len);
+            text = NULL;
+        }
+        if (sscanf(buf, "\\%1[nN]", new_line) == 1) {
+            if (callbacks->new_line)
+                callbacks->new_line(priv, new_line[0] == 'N');
+            buf += 2;
+        } else if (!strncmp(buf, "{\\", 2)) {
+            buf++;
+            while (*buf == '\\') {
+                char style[2], c[2], sep[2], c_num[2] = "0", tmp[128] = {0};
+                unsigned int color = 0xFFFFFFFF;
+                int len, size = -1, an = -1, alpha = -1;
+                int x1, y1, x2, y2, t1 = -1, t2 = -1;
+                if (sscanf(buf, "\\%1[bisu]%1[01\\}]%n", style, c, &len) > 1) {
+                    int close = c[0] == '0' ? 1 : c[0] == '1' ? 0 : -1;
+                    len += close != -1;
+                    if (callbacks->style)
+                        callbacks->style(priv, style[0], close);
+                } else if (sscanf(buf, "\\c%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\c&H%X&%1[\\}]%n", &color, sep, &len) > 1 ||
+                           sscanf(buf, "\\%1[1234]c%1[\\}]%n", c_num, sep, &len) > 1 ||
+                           sscanf(buf, "\\%1[1234]c&H%X&%1[\\}]%n", c_num, &color, sep, &len) > 2) {
+                    if (callbacks->color)
+                        callbacks->color(priv, color, c_num[0] - '0');
+                } else if (sscanf(buf, "\\alpha%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\alpha&H%2X&%1[\\}]%n", &alpha, sep, &len) > 1 ||
+                           sscanf(buf, "\\%1[1234]a%1[\\}]%n", c_num, sep, &len) > 1 ||
+                           sscanf(buf, "\\%1[1234]a&H%2X&%1[\\}]%n", c_num, &alpha, sep, &len) > 2) {
+                    if (callbacks->alpha)
+                        callbacks->alpha(priv, alpha, c_num[0] - '0');
+                } else if (sscanf(buf, "\\fn%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\fn%127[^\\}]%1[\\}]%n", tmp, sep, &len) > 1) {
+                    if (callbacks->font_name)
+                        callbacks->font_name(priv, tmp[0] ? tmp : NULL);
+                } else if (sscanf(buf, "\\fs%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\fs%u%1[\\}]%n", &size, sep, &len) > 1) {
+                    if (callbacks->font_size)
+                        callbacks->font_size(priv, size);
+                } else if (sscanf(buf, "\\a%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\a%2u%1[\\}]%n", &an, sep, &len) > 1 ||
+                           sscanf(buf, "\\an%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\an%1u%1[\\}]%n", &an, sep, &len) > 1) {
+                    if (an != -1 && buf[2] != 'n')
+                        an = (an&3) + (an&4 ? 6 : an&8 ? 3 : 0);
+                    if (callbacks->alignment)
+                        callbacks->alignment(priv, an);
+                } else if (sscanf(buf, "\\r%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\r%127[^\\}]%1[\\}]%n", tmp, sep, &len) > 1) {
+                    if (callbacks->cancel_overrides)
+                        callbacks->cancel_overrides(priv, tmp);
+                } else if (sscanf(buf, "\\move(%d,%d,%d,%d)%1[\\}]%n", &x1, &y1, &x2, &y2, sep, &len) > 4 ||
+                           sscanf(buf, "\\move(%d,%d,%d,%d,%d,%d)%1[\\}]%n", &x1, &y1, &x2, &y2, &t1, &t2, sep, &len) > 6) {
+                    if (callbacks->move)
+                        callbacks->move(priv, x1, y1, x2, y2, t1, t2);
+                } else if (sscanf(buf, "\\pos(%d,%d)%1[\\}]%n", &x1, &y1, sep, &len) > 2) {
+                    if (callbacks->move)
+                        callbacks->move(priv, x1, y1, x1, y1, -1, -1);
+                } else if (sscanf(buf, "\\org(%d,%d)%1[\\}]%n", &x1, &y1, sep, &len) > 2) {
+                    if (callbacks->origin)
+                        callbacks->origin(priv, x1, y1);
+                } else {
+                    len = strcspn(buf+1, "\\}") + 2;  /* skip unknown code */
+                }
+                buf += len - 1;
+            }
+            if (*buf++ != '}')
+                return AVERROR_INVALIDDATA;
+        } else {
+            if (!text) {
+                text = buf;
+                text_len = 1;
+            } else
+                text_len++;
+            buf++;
+        }
+    }
+    if (text && callbacks->text)
+        callbacks->text(priv, text, text_len);
+    if (callbacks->end)
+        callbacks->end(priv);
+    return 0;
+}
+
+ASSStyle *ff_ass_style_get(ASSSplitContext *ctx, const char *style)
+{
+    ASS *ass = &ctx->ass;
+    int i;
+
+    if (!style || !*style)
+        style = "Default";
+    for (i=0; i<ass->styles_count; i++)
+        if (!strcmp(ass->styles[i].name, style))
+            return ass->styles + i;
+    return NULL;
+}
diff --git a/libavcodec/ass_split.h b/libavcodec/ass_split.h
new file mode 100644
index 0000000000..defb5ccd74
--- /dev/null
+++ b/libavcodec/ass_split.h
@@ -0,0 +1,192 @@
+/*
+ * SSA/ASS spliting functions
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ASS_SPLIT_H
+#define AVCODEC_ASS_SPLIT_H
+
+/**
+ * fields extracted from the [Script Info] section
+ */
+typedef struct {
+    char *script_type;    /**< SSA script format version (eg. v4.00) */
+    char *collisions;     /**< how subtitles are moved to prevent collisions */
+    int   play_res_x;     /**< video width that ASS coords are referring to */
+    int   play_res_y;     /**< video height that ASS coords are referring to */
+    float timer;          /**< time multiplier to apply to SSA clock (in %) */
+} ASSScriptInfo;
+
+/**
+ * fields extracted from the [V4(+) Styles] section
+ */
+typedef struct {
+    char *name;           /**< name of the tyle (case sensitive) */
+    char *font_name;      /**< font face (case sensitive) */
+    int   font_size;      /**< font height */
+    int   primary_color;  /**< color that a subtitle will normally appear in */
+    int   secondary_color;
+    int   outline_color;  /**< color for outline in ASS, called tertiary in SSA */
+    int   back_color;     /**< color of the subtitle outline or shadow */
+    int   bold;           /**< whether text is bold (1) or not (0) */
+    int   italic;         /**< whether text is italic (1) or not (0) */
+    int   underline;      /**< whether text is underlined (1) or not (0) */
+    int   strikeout;
+    float scalex;
+    float scaley;
+    float spacing;
+    float angle;
+    int   border_style;
+    float outline;
+    float shadow;
+    int   alignment;      /**< position of the text (left, center, top...),
+                               defined after the layout of the numpad
+                               (1-3 sub, 4-6 mid, 7-9 top) */
+    int   margin_l;
+    int   margin_r;
+    int   margin_v;
+    int   alpha_level;
+    int   encoding;
+} ASSStyle;
+
+/**
+ * fields extracted from the [Events] section
+ */
+typedef struct {
+    int   layer;    /**< higher numbered layers are drawn over lower numbered */
+    int   start;    /**< start time of the dialog in centiseconds */
+    int   end;      /**< end time of the dialog in centiseconds */
+    char *style;    /**< name of the ASSStyle to use with this dialog */
+    char *name;
+    int   margin_l;
+    int   margin_r;
+    int   margin_v;
+    char *effect;
+    char *text;     /**< actual text which will be displayed as a subtitle,
+                         can include style override control codes (see
+                         ff_ass_split_override_codes()) */
+} ASSDialog;
+
+/**
+ * structure containing the whole split ASS data
+ */
+typedef struct {
+    ASSScriptInfo script_info;   /**< general information about the SSA script*/
+    ASSStyle     *styles;        /**< array of split out styles */
+    int           styles_count;  /**< number of ASSStyle in the styles array */
+    ASSDialog    *dialogs;       /**< array of split out dialogs */
+    int           dialogs_count; /**< number of ASSDialog in the dialogs array*/
+} ASS;
+
+/**
+ * This struct can be casted to ASS to access to the split data.
+ */
+typedef struct ASSSplitContext ASSSplitContext;
+
+/**
+ * Split a full ASS file or a ASS header from a string buffer and store
+ * the split structure in a newly allocated context.
+ *
+ * @param buf String containing the ASS formatted data.
+ * @return Newly allocated struct containing split data.
+ */
+ASSSplitContext *ff_ass_split(const char *buf);
+
+/**
+ * Split one or several ASS "Dialogue" lines from a string buffer and store
+ * them in a already initialized context.
+ *
+ * @param ctx Context previously initialized by ff_ass_split().
+ * @param buf String containing the ASS "Dialogue" lines.
+ * @param cache Set to 1 to keep all the previously split ASSDialog in
+ *              the context, or set to 0 to free all the previously split
+ *              ASSDialog.
+ * @param number If not NULL, the pointed integer will be set to the number
+ *               of split ASSDialog.
+ * @return Pointer to the first split ASSDialog.
+ */
+ASSDialog *ff_ass_split_dialog(ASSSplitContext *ctx, const char *buf,
+                               int cache, int *number);
+
+/**
+ * Free all the memory allocated for an ASSSplitContext.
+ *
+ * @param ctx Context previously initialized by ff_ass_split().
+ */
+void ff_ass_split_free(ASSSplitContext *ctx);
+
+
+/**
+ * Set of callback functions corresponding to each override codes that can
+ * be encountered in a "Dialogue" Text field.
+ */
+typedef struct {
+    /**
+     * @defgroup ass_styles    ASS styles
+     * @{
+     */
+    void (*text)(void *priv, const char *text, int len);
+    void (*new_line)(void *priv, int forced);
+    void (*style)(void *priv, char style, int close);
+    void (*color)(void *priv, unsigned int /* color */, unsigned int color_id);
+    void (*alpha)(void *priv, int alpha, int alpha_id);
+    void (*font_name)(void *priv, const char *name);
+    void (*font_size)(void *priv, int size);
+    void (*alignment)(void *priv, int alignment);
+    void (*cancel_overrides)(void *priv, const char *style);
+    /** @} */
+
+    /**
+     * @defgroup ass_functions    ASS functions
+     * @{
+     */
+    void (*move)(void *priv, int x1, int y1, int x2, int y2, int t1, int t2);
+    void (*origin)(void *priv, int x, int y);
+    /** @} */
+
+    /**
+     * @defgroup ass_end    end of Dialogue Event
+     * @{
+     */
+    void (*end)(void *priv);
+    /** @} */
+} ASSCodesCallbacks;
+
+/**
+ * Split override codes out of a ASS "Dialogue" Text field.
+ *
+ * @param callbacks Set of callback functions called for each override code
+ *                  encountered.
+ * @param priv Opaque pointer passed to the callback functions.
+ * @param buf The ASS "Dialogue" Text field to split.
+ * @return >= 0 on success otherwise an error code <0
+ */
+int ff_ass_split_override_codes(const ASSCodesCallbacks *callbacks, void *priv,
+                                const char *buf);
+
+/**
+ * Find an ASSStyle structure by its name.
+ *
+ * @param ctx Context previously initialized by ff_ass_split().
+ * @param style name of the style to search for.
+ * @return the ASSStyle corresponding to style, or NULL if style can't be found
+ */
+ASSStyle *ff_ass_style_get(ASSSplitContext *ctx, const char *style);
+
+#endif /* AVCODEC_ASS_SPLIT_H */
diff --git a/libavcodec/assdec.c b/libavcodec/assdec.c
index 48fe32ee99..11dbde0b8a 100644
--- a/libavcodec/assdec.c
+++ b/libavcodec/assdec.c
@@ -2,20 +2,20 @@
  * SSA/ASS decoder
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,29 +23,45 @@
 
 #include "avcodec.h"
 #include "ass.h"
+#include "ass_split.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 
 static av_cold int ass_decode_init(AVCodecContext *avctx)
 {
-    avctx->subtitle_header = av_malloc(avctx->extradata_size);
+    avctx->subtitle_header = av_malloc(avctx->extradata_size + 1);
     if (!avctx->subtitle_header)
         return AVERROR(ENOMEM);
     memcpy(avctx->subtitle_header, avctx->extradata, avctx->extradata_size);
+    avctx->subtitle_header[avctx->extradata_size] = 0;
     avctx->subtitle_header_size = avctx->extradata_size;
+    avctx->priv_data = ff_ass_split(avctx->extradata);
+    if(!avctx->priv_data)
+        return -1;
     return 0;
 }
 
-static int ass_decode_frame(AVCodecContext *avctx, void *data, int *got_sub_ptr,
+static int ass_decode_close(AVCodecContext *avctx)
+{
+    ff_ass_split_free(avctx->priv_data);
+    avctx->priv_data = NULL;
+    return 0;
+}
+
+#if CONFIG_SSA_DECODER
+static int ssa_decode_frame(AVCodecContext *avctx, void *data, int *got_sub_ptr,
                             AVPacket *avpkt)
 {
     const char *ptr = avpkt->data;
     int len, size = avpkt->size;
 
-    ff_ass_init(data);
-
     while (size > 0) {
-        len = ff_ass_add_rect(data, ptr, 0, 0/* FIXME: duration */, 1);
+        int duration;
+        ASSDialog *dialog = ff_ass_split_dialog(avctx->priv_data, ptr, 0, NULL);
+        if (!dialog)
+            return AVERROR_INVALIDDATA;
+        duration = dialog->end - dialog->start;
+        len = ff_ass_add_rect(data, ptr, 0, duration, 1);
         if (len < 0)
             return len;
         ptr  += len;
@@ -56,11 +72,49 @@ static int ass_decode_frame(AVCodecContext *avctx, void *data, int *got_sub_ptr,
     return avpkt->size;
 }
 
-AVCodec ff_ass_decoder = {
-    .name         = "ass",
+AVCodec ff_ssa_decoder = {
+    .name         = "ssa",
     .long_name    = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) subtitle"),
     .type         = AVMEDIA_TYPE_SUBTITLE,
     .id           = AV_CODEC_ID_SSA,
     .init         = ass_decode_init,
+    .decode       = ssa_decode_frame,
+    .close        = ass_decode_close,
+};
+#endif
+
+#if CONFIG_ASS_DECODER
+static int ass_decode_frame(AVCodecContext *avctx, void *data, int *got_sub_ptr,
+                            AVPacket *avpkt)
+{
+    int ret;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    static const AVRational ass_tb = {1, 100};
+    const int ts_start    = av_rescale_q(avpkt->pts,      avctx->time_base, ass_tb);
+    const int ts_duration = av_rescale_q(avpkt->duration, avctx->time_base, ass_tb);
+
+    if (avpkt->size <= 0)
+        return avpkt->size;
+
+    ret = ff_ass_add_rect(sub, ptr, ts_start, ts_duration, 2);
+    if (ret < 0) {
+        if (ret == AVERROR_INVALIDDATA)
+            av_log(avctx, AV_LOG_ERROR, "Invalid ASS packet\n");
+        return ret;
+    }
+
+    *got_sub_ptr = avpkt->size > 0;
+    return avpkt->size;
+}
+
+AVCodec ff_ass_decoder = {
+    .name         = "ass",
+    .long_name    = NULL_IF_CONFIG_SMALL("ASS (Advanced SubStation Alpha) subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_ASS,
+    .init         = ass_decode_init,
     .decode       = ass_decode_frame,
+    .close        = ass_decode_close,
 };
+#endif
diff --git a/libavcodec/assenc.c b/libavcodec/assenc.c
index caf266e037..5dc3b09d65 100644
--- a/libavcodec/assenc.c
+++ b/libavcodec/assenc.c
@@ -2,37 +2,44 @@
  * SSA/ASS encoder
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <string.h>
 
 #include "avcodec.h"
+#include "ass_split.h"
+#include "ass.h"
 #include "libavutil/avstring.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 
+typedef struct {
+    int id; ///< current event id, ReadOrder field
+} ASSEncodeContext;
+
 static av_cold int ass_encode_init(AVCodecContext *avctx)
 {
-    avctx->extradata = av_malloc(avctx->subtitle_header_size);
+    avctx->extradata = av_malloc(avctx->subtitle_header_size + 1);
     if (!avctx->extradata)
         return AVERROR(ENOMEM);
     memcpy(avctx->extradata, avctx->subtitle_header, avctx->subtitle_header_size);
     avctx->extradata_size = avctx->subtitle_header_size;
+    avctx->extradata[avctx->extradata_size] = 0;
     return 0;
 }
 
@@ -40,15 +47,54 @@ static int ass_encode_frame(AVCodecContext *avctx,
                             unsigned char *buf, int bufsize,
                             const AVSubtitle *sub)
 {
+    ASSEncodeContext *s = avctx->priv_data;
     int i, len, total_len = 0;
 
     for (i=0; i<sub->num_rects; i++) {
+        char ass_line[2048];
+        const char *ass = sub->rects[i]->ass;
+
         if (sub->rects[i]->type != SUBTITLE_ASS) {
             av_log(avctx, AV_LOG_ERROR, "Only SUBTITLE_ASS type supported.\n");
             return -1;
         }
 
-        len = av_strlcpy(buf+total_len, sub->rects[i]->ass, bufsize-total_len);
+        if (strncmp(ass, "Dialogue: ", 10)) {
+            av_log(avctx, AV_LOG_ERROR, "AVSubtitle rectangle ass \"%s\""
+                   " does not look like a SSA markup\n", ass);
+            return AVERROR_INVALIDDATA;
+        }
+
+        if (avctx->codec->id == AV_CODEC_ID_ASS) {
+            long int layer;
+            char *p;
+
+            if (i > 0) {
+                av_log(avctx, AV_LOG_ERROR, "ASS encoder supports only one "
+                       "ASS rectangle field.\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            ass += 10; // skip "Dialogue: "
+            /* parse Layer field. If it's a Marked field, the content
+             * will be "Marked=N" instead of the layer num, so we will
+             * have layer=0, which is fine. */
+            layer = strtol(ass, &p, 10);
+
+#define SKIP_ENTRY(ptr) do {        \
+    char *sep = strchr(ptr, ',');   \
+    if (sep)                        \
+        ptr = sep + 1;              \
+} while (0)
+
+            SKIP_ENTRY(p); // skip layer or marked
+            SKIP_ENTRY(p); // skip start timestamp
+            SKIP_ENTRY(p); // skip end timestamp
+            snprintf(ass_line, sizeof(ass_line), "%d,%ld,%s", ++s->id, layer, p);
+            ass_line[strcspn(ass_line, "\r\n")] = 0;
+            ass = ass_line;
+        }
+        len = av_strlcpy(buf+total_len, ass, bufsize-total_len);
 
         if (len > bufsize-total_len-1) {
             av_log(avctx, AV_LOG_ERROR, "Buffer too small for ASS event.\n");
@@ -61,11 +107,26 @@ static int ass_encode_frame(AVCodecContext *avctx,
     return total_len;
 }
 
-AVCodec ff_ass_encoder = {
-    .name         = "ass",
+#if CONFIG_SSA_ENCODER
+AVCodec ff_ssa_encoder = {
+    .name         = "ssa",
     .long_name    = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) subtitle"),
     .type         = AVMEDIA_TYPE_SUBTITLE,
     .id           = AV_CODEC_ID_SSA,
     .init         = ass_encode_init,
     .encode_sub   = ass_encode_frame,
+    .priv_data_size = sizeof(ASSEncodeContext),
+};
+#endif
+
+#if CONFIG_ASS_ENCODER
+AVCodec ff_ass_encoder = {
+    .name         = "ass",
+    .long_name    = NULL_IF_CONFIG_SMALL("ASS (Advanced SubStation Alpha) subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_ASS,
+    .init         = ass_encode_init,
+    .encode_sub   = ass_encode_frame,
+    .priv_data_size = sizeof(ASSEncodeContext),
 };
+#endif
diff --git a/libavcodec/asv.c b/libavcodec/asv.c
index b9e93f7989..14fdf73333 100644
--- a/libavcodec/asv.c
+++ b/libavcodec/asv.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/asv.h b/libavcodec/asv.h
index 18f7a9571d..e2cdc81300 100644
--- a/libavcodec/asv.h
+++ b/libavcodec/asv.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/asvdec.c b/libavcodec/asvdec.c
index f17f06465c..036d56ed0d 100644
--- a/libavcodec/asvdec.c
+++ b/libavcodec/asvdec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,6 @@
 #include "asv.h"
 #include "avcodec.h"
 #include "blockdsp.h"
-#include "put_bits.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "mathops.h"
@@ -211,10 +210,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     AVFrame *const p = data;
     int mb_x, mb_y, ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
@@ -277,8 +274,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     int i;
 
     if (avctx->extradata_size < 1) {
-        av_log(avctx, AV_LOG_ERROR, "No extradata provided\n");
-        return AVERROR_INVALIDDATA;
+        av_log(avctx, AV_LOG_WARNING, "No extradata provided\n");
     }
 
     ff_asv_common_init(avctx);
@@ -288,8 +284,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     ff_init_scantable(a->idsp.idct_permutation, &a->scantable, ff_asv_scantab);
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
-    a->inv_qscale = avctx->extradata[0];
-    if (a->inv_qscale == 0) {
+    if (avctx->extradata_size < 1 || (a->inv_qscale = avctx->extradata[0]) == 0) {
         av_log(avctx, AV_LOG_ERROR, "illegal qscale 0\n");
         if (avctx->codec_id == AV_CODEC_ID_ASV1)
             a->inv_qscale = 6;
@@ -317,6 +312,7 @@ static av_cold int decode_end(AVCodecContext *avctx)
     return 0;
 }
 
+#if CONFIG_ASV1_DECODER
 AVCodec ff_asv1_decoder = {
     .name           = "asv1",
     .long_name      = NULL_IF_CONFIG_SMALL("ASUS V1"),
@@ -328,7 +324,9 @@ AVCodec ff_asv1_decoder = {
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
 
+#if CONFIG_ASV2_DECODER
 AVCodec ff_asv2_decoder = {
     .name           = "asv2",
     .long_name      = NULL_IF_CONFIG_SMALL("ASUS V2"),
@@ -340,3 +338,4 @@ AVCodec ff_asv2_decoder = {
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
diff --git a/libavcodec/asvenc.c b/libavcodec/asvenc.c
index ac7c317fa1..d51df80331 100644
--- a/libavcodec/asvenc.c
+++ b/libavcodec/asvenc.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,8 +26,10 @@
 #include "libavutil/attributes.h"
 #include "libavutil/mem.h"
 
+#include "aandcttab.h"
 #include "asv.h"
 #include "avcodec.h"
+#include "dct.h"
 #include "fdctdsp.h"
 #include "internal.h"
 #include "mathops.h"
@@ -50,7 +52,7 @@ static inline void asv1_put_level(PutBitContext *pb, int level)
     }
 }
 
-static inline void asv2_put_level(PutBitContext *pb, int level)
+static inline void asv2_put_level(ASV1Context *a, PutBitContext *pb, int level)
 {
     unsigned int index = level + 31;
 
@@ -58,6 +60,10 @@ static inline void asv2_put_level(PutBitContext *pb, int level)
         put_bits(pb, ff_asv2_level_tab[index][1], ff_asv2_level_tab[index][0]);
     } else {
         put_bits(pb, ff_asv2_level_tab[31][1], ff_asv2_level_tab[31][0]);
+        if (level < -128 || level > 127) {
+            av_log(a->avctx, AV_LOG_WARNING, "Cliping level %d, increase qscale\n", level);
+            level = av_clip_int8(level);
+        }
         asv2_put_bits(pb, 8, level & 0xFF);
     }
 }
@@ -108,7 +114,7 @@ static inline void asv1_encode_block(ASV1Context *a, int16_t block[64])
     put_bits(&a->pb, ff_asv_ccp_tab[16][1], ff_asv_ccp_tab[16][0]);
 }
 
-static inline int asv2_encode_block(ASV1Context *a, int16_t block[64])
+static inline void asv2_encode_block(ASV1Context *a, int16_t block[64])
 {
     int i;
     int count = 0;
@@ -142,8 +148,7 @@ static inline int asv2_encode_block(ASV1Context *a, int16_t block[64])
                                  a->q_intra_matrix[index + 9] + (1 << 15)) >> 16))
             ccp |= 1;
 
-        if (!i && ccp >= 8)
-            return AVERROR_BUG;
+        av_assert2(i || ccp < 8);
         if (i)
             put_bits(&a->pb, ff_asv_ac_ccp_tab[ccp][1], ff_asv_ac_ccp_tab[ccp][0]);
         else
@@ -151,24 +156,22 @@ static inline int asv2_encode_block(ASV1Context *a, int16_t block[64])
 
         if (ccp) {
             if (ccp & 8)
-                asv2_put_level(&a->pb, block[index + 0]);
+                asv2_put_level(a, &a->pb, block[index + 0]);
             if (ccp & 4)
-                asv2_put_level(&a->pb, block[index + 8]);
+                asv2_put_level(a, &a->pb, block[index + 8]);
             if (ccp & 2)
-                asv2_put_level(&a->pb, block[index + 1]);
+                asv2_put_level(a, &a->pb, block[index + 1]);
             if (ccp & 1)
-                asv2_put_level(&a->pb, block[index + 9]);
+                asv2_put_level(a, &a->pb, block[index + 9]);
         }
     }
-
-    return 0;
 }
 
 #define MAX_MB_SIZE (30 * 16 * 16 * 3 / 2 / 8)
 
 static inline int encode_mb(ASV1Context *a, int16_t block[6][64])
 {
-    int i, ret;
+    int i;
 
     if (a->pb.buf_end - a->pb.buf - (put_bits_count(&a->pb) >> 3) < MAX_MB_SIZE) {
         av_log(a->avctx, AV_LOG_ERROR, "encoded frame too large\n");
@@ -180,9 +183,7 @@ static inline int encode_mb(ASV1Context *a, int16_t block[6][64])
             asv1_encode_block(a, block[i]);
     } else {
         for (i = 0; i < 6; i++) {
-            ret = asv2_encode_block(a, block[i]);
-            if (ret < 0)
-                return ret;
+            asv2_encode_block(a, block[i]);
         }
     }
     return 0;
@@ -221,13 +222,52 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int size, ret;
     int mb_x, mb_y;
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, a->mb_height * a->mb_width * MAX_MB_SIZE +
-                             AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if (pict->width % 16 || pict->height % 16) {
+        AVFrame *clone = av_frame_alloc();
+        int i;
+
+        if (!clone)
+            return AVERROR(ENOMEM);
+        clone->format = pict->format;
+        clone->width  = FFALIGN(pict->width, 16);
+        clone->height = FFALIGN(pict->height, 16);
+        ret = av_frame_get_buffer(clone, 32);
+        if (ret < 0) {
+            av_frame_free(&clone);
+            return ret;
+        }
+
+        ret = av_frame_copy(clone, pict);
+        if (ret < 0) {
+            av_frame_free(&clone);
+            return ret;
+        }
+
+        for (i = 0; i<3; i++) {
+            int x, y;
+            int w  = FF_CEIL_RSHIFT(pict->width, !!i);
+            int h  = FF_CEIL_RSHIFT(pict->height, !!i);
+            int w2 = FF_CEIL_RSHIFT(clone->width, !!i);
+            int h2 = FF_CEIL_RSHIFT(clone->height, !!i);
+            for (y=0; y<h; y++)
+                for (x=w; x<w2; x++)
+                    clone->data[i][x + y*clone->linesize[i]] =
+                        clone->data[i][w - 1 + y*clone->linesize[i]];
+            for (y=h; y<h2; y++)
+                for (x=0; x<w2; x++)
+                    clone->data[i][x + y*clone->linesize[i]] =
+                        clone->data[i][x + (h-1)*clone->linesize[i]];
+        }
+        ret = encode_frame(avctx, pkt, clone, got_packet);
+
+        av_frame_free(&clone);
         return ret;
     }
 
+    if ((ret = ff_alloc_packet2(avctx, pkt, a->mb_height * a->mb_width * MAX_MB_SIZE +
+                                AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+
     init_put_bits(&a->pb, pkt->data, pkt->size);
 
     for (mb_y = 0; mb_y < a->mb_height2; mb_y++) {
@@ -282,18 +322,11 @@ static av_cold int encode_init(AVCodecContext *avctx)
     int i;
     const int scale = avctx->codec_id == AV_CODEC_ID_ASV1 ? 1 : 2;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
     ff_asv_common_init(avctx);
     ff_fdctdsp_init(&a->fdsp, avctx);
     ff_pixblockdsp_init(&a->pdsp, avctx);
 
-    if (avctx->global_quality == 0)
+    if (avctx->global_quality <= 0)
         avctx->global_quality = 4 * FF_QUALITY_SCALE;
 
     a->inv_qscale = (32 * scale * FF_QUALITY_SCALE +
@@ -307,8 +340,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
     ((uint32_t *) avctx->extradata)[1] = av_le2ne32(AV_RL32("ASUS"));
 
     for (i = 0; i < 64; i++) {
-        int q = 32 * scale * ff_mpeg1_default_intra_matrix[i];
-        a->q_intra_matrix[i] = ((a->inv_qscale << 16) + q / 2) / q;
+        if (a->fdsp.fdct == ff_fdct_ifast) {
+            int q = 32LL * scale * ff_mpeg1_default_intra_matrix[i] * ff_aanscales[i];
+            a->q_intra_matrix[i] = (((int64_t)a->inv_qscale << 30) + q / 2) / q;
+        } else {
+            int q = 32 * scale * ff_mpeg1_default_intra_matrix[i];
+            a->q_intra_matrix[i] = ((a->inv_qscale << 16) + q / 2) / q;
+        }
     }
 
     return 0;
diff --git a/libavcodec/atrac.c b/libavcodec/atrac.c
index f36db9e8c5..12e8997dbc 100644
--- a/libavcodec/atrac.c
+++ b/libavcodec/atrac.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2013 Maxim Poliakovski
  * Copyright (c) 2006-2008 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -124,7 +124,8 @@ void ff_atrac_gain_compensation(AtracGCContext *gctx, float *in, float *prev,
     memcpy(prev, &in[num_samples], num_samples * sizeof(float));
 }
 
-void ff_atrac_iqmf (float *inlo, float *inhi, unsigned int nIn, float *pOut, float *delayBuf, float *temp)
+void ff_atrac_iqmf(float *inlo, float *inhi, unsigned int nIn, float *pOut,
+                   float *delayBuf, float *temp)
 {
     int   i, j;
     float   *p1, *p3;
diff --git a/libavcodec/atrac.h b/libavcodec/atrac.h
index 8909323b01..05208bbee6 100644
--- a/libavcodec/atrac.h
+++ b/libavcodec/atrac.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2009-2013 Maxim Poliakovski
  * Copyright (c) 2009 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -91,6 +91,7 @@ void ff_atrac_gain_compensation(AtracGCContext *gctx, float *in, float *prev,
  * @param delayBuf  delayBuf buffer
  * @param temp      temp buffer
  */
-void ff_atrac_iqmf (float *inlo, float *inhi, unsigned int nIn, float *pOut, float *delayBuf, float *temp);
+void ff_atrac_iqmf(float *inlo, float *inhi, unsigned int nIn, float *pOut,
+                   float *delayBuf, float *temp);
 
 #endif /* AVCODEC_ATRAC_H */
diff --git a/libavcodec/atrac1.c b/libavcodec/atrac1.c
index e938976d88..b70cf4fd92 100644
--- a/libavcodec/atrac1.c
+++ b/libavcodec/atrac1.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Maxim Poliakovski
  * Copyright (c) 2009 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -80,7 +80,7 @@ typedef struct AT1Ctx {
     DECLARE_ALIGNED(32, float, high)[512];
     float*              bands[3];
     FFTContext          mdct_ctx[3];
-    AVFloatDSPContext   fdsp;
+    AVFloatDSPContext   *fdsp;
 } AT1Ctx;
 
 /** size of the transform in samples in the long mode for each QMF band */
@@ -140,7 +140,7 @@ static int at1_imdct_block(AT1SUCtx* su, AT1Ctx *q)
             at1_imdct(q, &q->spec[pos], &su->spectrum[0][ref_pos + start_pos], nbits, band_num);
 
             /* overlap and window */
-            q->fdsp.vector_fmul_window(&q->bands[band_num][start_pos], prev_buf,
+            q->fdsp->vector_fmul_window(&q->bands[band_num][start_pos], prev_buf,
                                        &su->spectrum[0][ref_pos + start_pos], ff_sine_32, 16);
 
             prev_buf = &su->spectrum[0][ref_pos+start_pos + 16];
@@ -242,7 +242,7 @@ static int at1_unpack_dequant(GetBitContext* gb, AT1SUCtx* su,
                      */
                     spec[pos+i] = get_sbits(gb, word_len) * scale_factor * max_quant;
                 }
-            } else { /* word_len = 0 -> empty BFU, zero all specs in the emty BFU */
+            } else { /* word_len = 0 -> empty BFU, zero all specs in the empty BFU */
                 memset(&spec[pos], 0, num_specs * sizeof(float));
             }
         }
@@ -287,10 +287,8 @@ static int atrac1_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = AT1_SU_SAMPLES;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     for (ch = 0; ch < avctx->channels; ch++) {
         AT1SUCtx* su = &q->SUs[ch];
@@ -326,6 +324,8 @@ static av_cold int atrac1_decode_end(AVCodecContext * avctx)
     ff_mdct_end(&q->mdct_ctx[1]);
     ff_mdct_end(&q->mdct_ctx[2]);
 
+    av_freep(&q->fdsp);
+
     return 0;
 }
 
@@ -343,6 +343,11 @@ static av_cold int atrac1_decode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
+    if (avctx->block_align <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported block align.");
+        return AVERROR_PATCHWELCOME;
+    }
+
     /* Init the mdct transforms */
     if ((ret = ff_mdct_init(&q->mdct_ctx[0], 6, 1, -1.0/ (1 << 15))) ||
         (ret = ff_mdct_init(&q->mdct_ctx[1], 8, 1, -1.0/ (1 << 15))) ||
@@ -356,7 +361,7 @@ static av_cold int atrac1_decode_init(AVCodecContext *avctx)
 
     ff_atrac_generate_tables();
 
-    avpriv_float_dsp_init(&q->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    q->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
 
     q->bands[0] = q->low;
     q->bands[1] = q->mid;
diff --git a/libavcodec/atrac1data.h b/libavcodec/atrac1data.h
index d4b8cd0def..62c218b790 100644
--- a/libavcodec/atrac1data.h
+++ b/libavcodec/atrac1data.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Maxim Poliakovski
  * Copyright (c) 2009 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,7 +43,7 @@ static const uint8_t bfu_bands_t[4]  = {0, 20, 36, 52};
  */
 static const uint8_t specs_per_bfu[52] = {
      8,  8,  8,  8,  4,  4,  4,  4,  8,  8,  8,  8,  6,  6,  6,  6, 6, 6, 6, 6, // low band
-     6,  6,  6,  6,  7,  7,  7,  7,  9,  9,  9,  9, 10, 10, 10, 10,             // midle band
+     6,  6,  6,  6,  7,  7,  7,  7,  9,  9,  9,  9, 10, 10, 10, 10,             // middle band
     12, 12, 12, 12, 12, 12, 12, 12, 20, 20, 20, 20, 20, 20, 20, 20              // high band
 };
 
diff --git a/libavcodec/atrac3.c b/libavcodec/atrac3.c
index 2e1fd3c133..4bdb63f98b 100644
--- a/libavcodec/atrac3.c
+++ b/libavcodec/atrac3.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2008 Maxim Poliakovski
  * Copyright (c) 2006-2008 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,6 +38,7 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/float_dsp.h"
+#include "libavutil/libm.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "fft.h"
@@ -104,9 +105,9 @@ typedef struct ATRAC3Context {
     int scrambled_stream;
     //@}
 
-    AtracGCContext  gainc_ctx;
-    FFTContext mdct_ctx;
-    AVFloatDSPContext fdsp;
+    AtracGCContext    gainc_ctx;
+    FFTContext        mdct_ctx;
+    AVFloatDSPContext *fdsp;
 } ATRAC3Context;
 
 static DECLARE_ALIGNED(32, float, mdct_window)[MDCT_SIZE];
@@ -139,7 +140,7 @@ static void imlt(ATRAC3Context *q, float *input, float *output, int odd_band)
     q->mdct_ctx.imdct_calc(&q->mdct_ctx, output, input);
 
     /* Perform windowing on the output. */
-    q->fdsp.vector_fmul(output, output, mdct_window, MDCT_SIZE);
+    q->fdsp->vector_fmul(output, output, mdct_window, MDCT_SIZE);
 }
 
 /*
@@ -187,8 +188,9 @@ static av_cold int atrac3_decode_close(AVCodecContext *avctx)
 {
     ATRAC3Context *q = avctx->priv_data;
 
-    av_free(q->units);
-    av_free(q->decoded_bytes_buffer);
+    av_freep(&q->units);
+    av_freep(&q->decoded_bytes_buffer);
+    av_freep(&q->fdsp);
 
     ff_mdct_end(&q->mdct_ctx);
 
@@ -407,17 +409,17 @@ static int decode_tonal_components(GetBitContext *gb,
 static int decode_gain_control(GetBitContext *gb, GainBlock *block,
                                int num_bands)
 {
-    int i, j;
+    int b, j;
     int *level, *loc;
 
     AtracGainInfo *gain = block->g_block;
 
-    for (i = 0; i <= num_bands; i++) {
-        gain[i].num_points    = get_bits(gb, 3);
-        level                 = gain[i].lev_code;
-        loc                   = gain[i].loc_code;
+    for (b = 0; b <= num_bands; b++) {
+        gain[b].num_points = get_bits(gb, 3);
+        level              = gain[b].lev_code;
+        loc                = gain[b].loc_code;
 
-        for (j = 0; j < gain[i].num_points; j++) {
+        for (j = 0; j < gain[b].num_points; j++) {
             level[j] = get_bits(gb, 4);
             loc[j]   = get_bits(gb, 5);
             if (j && loc[j] <= loc[j - 1])
@@ -426,8 +428,8 @@ static int decode_gain_control(GetBitContext *gb, GainBlock *block,
     }
 
     /* Clear the unused blocks. */
-    for (; i < 4 ; i++)
-        gain[i].num_points = 0;
+    for (; b < 4 ; b++)
+        gain[b].num_points = 0;
 
     return 0;
 }
@@ -518,7 +520,7 @@ static void reverse_matrixing(float *su1, float *su2, int *prev_code,
             }
             break;
         default:
-            assert(0);
+            av_assert1(0);
         }
     }
 }
@@ -673,7 +675,7 @@ static int decode_frame(AVCodecContext *avctx, const uint8_t *databuf,
 
 
         /* set the bitstream reader at the start of the second Sound Unit*/
-        init_get_bits(&q->gb, ptr1, (avctx->block_align - i) * 8);
+        init_get_bits8(&q->gb, ptr1, q->decoded_bytes_buffer + avctx->block_align - ptr1);
 
         /* Fill the Weighting coeffs delay buffer */
         memmove(q->weighting_delay, &q->weighting_delay[2],
@@ -747,10 +749,8 @@ static int atrac3_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = SAMPLES_PER_FRAME;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     /* Check if we need to descramble and what buffer to pass on. */
     if (q->scrambled_stream) {
@@ -771,7 +771,7 @@ static int atrac3_decode_frame(AVCodecContext *avctx, void *data,
     return avctx->block_align;
 }
 
-static av_cold void atrac3_init_static_data(AVCodec *codec)
+static av_cold void atrac3_init_static_data(void)
 {
     int i;
 
@@ -791,6 +791,7 @@ static av_cold void atrac3_init_static_data(AVCodec *codec)
 
 static av_cold int atrac3_decode_init(AVCodecContext *avctx)
 {
+    static int static_init_done;
     int i, ret;
     int version, delay, samples_per_frame, frame_factor;
     const uint8_t *edata_ptr = avctx->extradata;
@@ -801,6 +802,10 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
+    if (!static_init_done)
+        atrac3_init_static_data();
+    static_init_done = 1;
+
     /* Take care of the codec-specific extradata. */
     if (avctx->extradata_size == 14) {
         /* Parse the extradata, WAV format */
@@ -829,7 +834,7 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
                    avctx->channels, frame_factor);
             return AVERROR_INVALIDDATA;
         }
-    } else if (avctx->extradata_size == 10) {
+    } else if (avctx->extradata_size == 12 || avctx->extradata_size == 10) {
         /* Parse the extradata, RM format. */
         version                = bytestream_get_be32(&edata_ptr);
         samples_per_frame      = bytestream_get_be16(&edata_ptr);
@@ -866,8 +871,10 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
     if (q->coding_mode == STEREO)
         av_log(avctx, AV_LOG_DEBUG, "Normal stereo detected.\n");
     else if (q->coding_mode == JOINT_STEREO) {
-        if (avctx->channels != 2)
+        if (avctx->channels != 2) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid coding mode\n");
             return AVERROR_INVALIDDATA;
+        }
         av_log(avctx, AV_LOG_DEBUG, "Joint stereo detected.\n");
     } else {
         av_log(avctx, AV_LOG_ERROR, "Unknown channel coding mode %x!\n",
@@ -907,10 +914,10 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
     }
 
     ff_atrac_init_gain_compensation(&q->gainc_ctx, 4, 3);
-    avpriv_float_dsp_init(&q->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    q->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
 
-    q->units = av_mallocz(sizeof(*q->units) * avctx->channels);
-    if (!q->units) {
+    q->units = av_mallocz_array(avctx->channels, sizeof(*q->units));
+    if (!q->units || !q->fdsp) {
         atrac3_decode_close(avctx);
         return AVERROR(ENOMEM);
     }
@@ -925,7 +932,6 @@ AVCodec ff_atrac3_decoder = {
     .id               = AV_CODEC_ID_ATRAC3,
     .priv_data_size   = sizeof(ATRAC3Context),
     .init             = atrac3_decode_init,
-    .init_static_data = atrac3_init_static_data,
     .close            = atrac3_decode_close,
     .decode           = atrac3_decode_frame,
     .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
diff --git a/libavcodec/atrac3data.h b/libavcodec/atrac3data.h
index 4f5c122021..5d91274f48 100644
--- a/libavcodec/atrac3data.h
+++ b/libavcodec/atrac3data.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Maxim Poliakovski
  * Copyright (c) 2006-2007 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/atrac3plus.c b/libavcodec/atrac3plus.c
index 7071596a8c..b16a13971d 100644
--- a/libavcodec/atrac3plus.c
+++ b/libavcodec/atrac3plus.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -80,56 +80,56 @@ static av_cold void build_canonical_huff(const uint8_t *cb, const uint8_t *xlat,
     *tab_offset += 1 << max_len;
 }
 
-av_cold void ff_atrac3p_init_vlcs(AVCodec *codec)
+av_cold void ff_atrac3p_init_vlcs(void)
 {
     int i, wl_vlc_offs, ct_vlc_offs, sf_vlc_offs, tab_offset;
 
-    static int wl_nb_bits[4]  = { 2, 3, 5, 5 };
-    static int wl_nb_codes[4] = { 3, 5, 8, 8 };
-    static const uint8_t *wl_bits[4] = {
+    static const int wl_nb_bits[4]  = { 2, 3, 5, 5 };
+    static const int wl_nb_codes[4] = { 3, 5, 8, 8 };
+    static const uint8_t * const wl_bits[4] = {
         atrac3p_wl_huff_bits1, atrac3p_wl_huff_bits2,
         atrac3p_wl_huff_bits3, atrac3p_wl_huff_bits4
     };
-    static const uint8_t *wl_codes[4] = {
+    static const uint8_t * const wl_codes[4] = {
         atrac3p_wl_huff_code1, atrac3p_wl_huff_code2,
         atrac3p_wl_huff_code3, atrac3p_wl_huff_code4
     };
-    static const uint8_t *wl_xlats[4] = {
+    static const uint8_t * const wl_xlats[4] = {
         atrac3p_wl_huff_xlat1, atrac3p_wl_huff_xlat2, NULL, NULL
     };
 
-    static int ct_nb_bits[4]  = { 3, 4, 4, 4 };
-    static int ct_nb_codes[4] = { 4, 8, 8, 8 };
-    static const uint8_t *ct_bits[4]  = {
+    static const int ct_nb_bits[4]  = { 3, 4, 4, 4 };
+    static const int ct_nb_codes[4] = { 4, 8, 8, 8 };
+    static const uint8_t * const ct_bits[4]  = {
         atrac3p_ct_huff_bits1, atrac3p_ct_huff_bits2,
         atrac3p_ct_huff_bits2, atrac3p_ct_huff_bits3
     };
-    static const uint8_t *ct_codes[4] = {
+    static const uint8_t * const ct_codes[4] = {
         atrac3p_ct_huff_code1, atrac3p_ct_huff_code2,
         atrac3p_ct_huff_code2, atrac3p_ct_huff_code3
     };
-    static const uint8_t *ct_xlats[4] = {
+    static const uint8_t * const ct_xlats[4] = {
         NULL, NULL, atrac3p_ct_huff_xlat1, NULL
     };
 
-    static int sf_nb_bits[8]  = {  9,  9,  9,  9,  6,  6,  7,  7 };
-    static int sf_nb_codes[8] = { 64, 64, 64, 64, 16, 16, 16, 16 };
-    static const uint8_t  *sf_bits[8]  = {
+    static const  int sf_nb_bits[8]  = {  9,  9,  9,  9,  6,  6,  7,  7 };
+    static const  int sf_nb_codes[8] = { 64, 64, 64, 64, 16, 16, 16, 16 };
+    static const uint8_t  * const sf_bits[8]  = {
         atrac3p_sf_huff_bits1, atrac3p_sf_huff_bits1, atrac3p_sf_huff_bits2,
         atrac3p_sf_huff_bits3, atrac3p_sf_huff_bits4, atrac3p_sf_huff_bits4,
         atrac3p_sf_huff_bits5, atrac3p_sf_huff_bits6
     };
-    static const uint16_t *sf_codes[8] = {
+    static const uint16_t * const sf_codes[8] = {
         atrac3p_sf_huff_code1, atrac3p_sf_huff_code1, atrac3p_sf_huff_code2,
         atrac3p_sf_huff_code3, atrac3p_sf_huff_code4, atrac3p_sf_huff_code4,
         atrac3p_sf_huff_code5, atrac3p_sf_huff_code6
     };
-    static const uint8_t  *sf_xlats[8] = {
+    static const uint8_t  * const sf_xlats[8] = {
         atrac3p_sf_huff_xlat1, atrac3p_sf_huff_xlat2, NULL, NULL,
         atrac3p_sf_huff_xlat4, atrac3p_sf_huff_xlat5, NULL, NULL
     };
 
-    static const uint8_t *gain_cbs[11] = {
+    static const uint8_t * const gain_cbs[11] = {
         atrac3p_huff_gain_npoints1_cb, atrac3p_huff_gain_npoints1_cb,
         atrac3p_huff_gain_lev1_cb, atrac3p_huff_gain_lev2_cb,
         atrac3p_huff_gain_lev3_cb, atrac3p_huff_gain_lev4_cb,
@@ -137,7 +137,7 @@ av_cold void ff_atrac3p_init_vlcs(AVCodec *codec)
         atrac3p_huff_gain_loc4_cb, atrac3p_huff_gain_loc2_cb,
         atrac3p_huff_gain_loc5_cb
     };
-    static const uint8_t *gain_xlats[11] = {
+    static const uint8_t * const gain_xlats[11] = {
         NULL, atrac3p_huff_gain_npoints2_xlat, atrac3p_huff_gain_lev1_xlat,
         atrac3p_huff_gain_lev2_xlat, atrac3p_huff_gain_lev3_xlat,
         atrac3p_huff_gain_lev4_xlat, atrac3p_huff_gain_loc3_xlat,
@@ -145,13 +145,13 @@ av_cold void ff_atrac3p_init_vlcs(AVCodec *codec)
         atrac3p_huff_gain_loc2_xlat, atrac3p_huff_gain_loc5_xlat
     };
 
-    static const uint8_t *tone_cbs[7] = {
+    static const uint8_t * const tone_cbs[7] = {
         atrac3p_huff_tonebands_cb,  atrac3p_huff_numwavs1_cb,
         atrac3p_huff_numwavs2_cb,   atrac3p_huff_wav_ampsf1_cb,
         atrac3p_huff_wav_ampsf2_cb, atrac3p_huff_wav_ampsf3_cb,
         atrac3p_huff_freq_cb
     };
-    static const uint8_t *tone_xlats[7] = {
+    static const uint8_t * const tone_xlats[7] = {
         NULL, NULL, atrac3p_huff_numwavs2_xlat, atrac3p_huff_wav_ampsf1_xlat,
         atrac3p_huff_wav_ampsf2_xlat, atrac3p_huff_wav_ampsf3_xlat,
         atrac3p_huff_freq_xlat
@@ -820,7 +820,7 @@ static void decode_qu_spectra(GetBitContext *gb, const Atrac3pSpecCodeTab *tab,
     int num_coeffs = tab->num_coeffs;
     int bits       = tab->bits;
     int is_signed  = tab->is_signed;
-    unsigned val, mask = (1 << bits) - 1;
+    unsigned val;
 
     for (pos = 0; pos < num_specs;) {
         if (group_size == 1 || get_bits1(gb)) {
@@ -828,7 +828,7 @@ static void decode_qu_spectra(GetBitContext *gb, const Atrac3pSpecCodeTab *tab,
                 val = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
 
                 for (i = 0; i < num_coeffs; i++) {
-                    cf = val & mask;
+                    cf = av_mod_uintp2(val, bits);
                     if (is_signed)
                         cf = sign_extend(cf, bits);
                     else if (cf && get_bits1(gb))
@@ -1724,11 +1724,7 @@ static int decode_tones_info(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
     if (num_channels == 2) {
         get_subband_flags(gb, ctx->waves_info->tone_sharing, ctx->waves_info->num_tone_bands);
         get_subband_flags(gb, ctx->waves_info->tone_master,  ctx->waves_info->num_tone_bands);
-        if (get_subband_flags(gb, ctx->waves_info->phase_shift,
-                              ctx->waves_info->num_tone_bands)) {
-            avpriv_report_missing_feature(avctx, "GHA Phase shifting");
-            return AVERROR_PATCHWELCOME;
-        }
+        get_subband_flags(gb, ctx->waves_info->invert_phase, ctx->waves_info->num_tone_bands);
     }
 
     ctx->waves_info->tones_index = 0;
diff --git a/libavcodec/atrac3plus.h b/libavcodec/atrac3plus.h
index e56c4441b6..a33c38a3ee 100644
--- a/libavcodec/atrac3plus.h
+++ b/libavcodec/atrac3plus.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -122,7 +122,7 @@ typedef struct Atrac3pWaveSynthParams {
     int num_tone_bands;                     ///< number of PQF bands with tones
     uint8_t tone_sharing[ATRAC3P_SUBBANDS]; ///< 1 - subband-wise tone sharing flags
     uint8_t tone_master[ATRAC3P_SUBBANDS];  ///< 1 - subband-wise tone channel swapping
-    uint8_t phase_shift[ATRAC3P_SUBBANDS];  ///< 1 - subband-wise 180° phase shifting
+    uint8_t invert_phase[ATRAC3P_SUBBANDS]; ///< 1 - subband-wise phase inversion
     int tones_index;                        ///< total sum of tones in this unit
     Atrac3pWaveParam waves[48];
 } Atrac3pWaveSynthParams;
@@ -155,10 +155,8 @@ typedef struct Atrac3pChanUnitCtx {
 
 /**
  * Initialize VLC tables for bitstream parsing.
- *
- * @param[in]   codec    ptr to the AVCodec
  */
-void ff_atrac3p_init_vlcs(AVCodec *codec);
+void ff_atrac3p_init_vlcs(void);
 
 /**
  * Decode bitstream data of a channel unit.
@@ -169,8 +167,8 @@ void ff_atrac3p_init_vlcs(AVCodec *codec);
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-int  ff_atrac3p_decode_channel_unit(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
-                                    int num_channels, AVCodecContext *avctx);
+int ff_atrac3p_decode_channel_unit(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
+                                   int num_channels, AVCodecContext *avctx);
 
 /**
  * Initialize IMDCT transform.
diff --git a/libavcodec/atrac3plus_data.h b/libavcodec/atrac3plus_data.h
index 5026a59930..2a107eef1f 100644
--- a/libavcodec/atrac3plus_data.h
+++ b/libavcodec/atrac3plus_data.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/atrac3plusdec.c b/libavcodec/atrac3plusdec.c
index 4a742c159e..ec2b1ad3b2 100644
--- a/libavcodec/atrac3plusdec.c
+++ b/libavcodec/atrac3plusdec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,7 +47,7 @@
 
 typedef struct ATRAC3PContext {
     GetBitContext gb;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
 
     DECLARE_ALIGNED(32, float, samples)[2][ATRAC3P_FRAME_SAMPLES];  ///< quantized MDCT spectrum
     DECLARE_ALIGNED(32, float, mdct_buf)[2][ATRAC3P_FRAME_SAMPLES]; ///< output of the IMDCT
@@ -67,7 +67,13 @@ typedef struct ATRAC3PContext {
 
 static av_cold int atrac3p_decode_close(AVCodecContext *avctx)
 {
-    av_free(((ATRAC3PContext *)(avctx->priv_data))->ch_units);
+    ATRAC3PContext *ctx = avctx->priv_data;
+
+    av_freep(&ctx->ch_units);
+    av_freep(&ctx->fdsp);
+
+    ff_mdct_end(&ctx->mdct_ctx);
+    ff_mdct_end(&ctx->ipqf_dct_ctx);
 
     return 0;
 }
@@ -148,7 +154,7 @@ static av_cold int atrac3p_decode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
-    avpriv_float_dsp_init(&ctx->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    ff_atrac3p_init_vlcs();
 
     /* initialize IPQF */
     ff_mdct_init(&ctx->ipqf_dct_ctx, 5, 1, 32.0 / 32768.0);
@@ -164,9 +170,10 @@ static av_cold int atrac3p_decode_init(AVCodecContext *avctx)
 
     ctx->my_channel_layout = avctx->channel_layout;
 
-    ctx->ch_units = av_mallocz(sizeof(*ctx->ch_units) *
-                               ctx->num_channel_blocks);
-    if (!ctx->ch_units) {
+    ctx->ch_units = av_mallocz_array(ctx->num_channel_blocks, sizeof(*ctx->ch_units));
+    ctx->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+
+    if (!ctx->ch_units || !ctx->fdsp) {
         atrac3p_decode_close(avctx);
         return AVERROR(ENOMEM);
     }
@@ -263,7 +270,7 @@ static void reconstruct_frame(ATRAC3PContext *ctx, Atrac3pChanUnitCtx *ch_unit,
     for (ch = 0; ch < num_channels; ch++) {
         for (sb = 0; sb < ch_unit->num_subbands; sb++) {
             /* inverse transform and windowing */
-            ff_atrac3p_imdct(&ctx->fdsp, &ctx->mdct_ctx,
+            ff_atrac3p_imdct(ctx->fdsp, &ctx->mdct_ctx,
                              &ctx->samples[ch][sb * ATRAC3P_SUBBAND_SAMPLES],
                              &ctx->mdct_buf[ch][sb * ATRAC3P_SUBBAND_SAMPLES],
                              (ch_unit->channels[ch].wnd_shape_prev[sb] << 1) +
@@ -297,7 +304,7 @@ static void reconstruct_frame(ATRAC3PContext *ctx, Atrac3pChanUnitCtx *ch_unit,
             for (sb = 0; sb < ch_unit->num_subbands; sb++)
                 if (ch_unit->channels[ch].tones_info[sb].num_wavs ||
                     ch_unit->channels[ch].tones_info_prev[sb].num_wavs) {
-                    ff_atrac3p_generate_tones(ch_unit, &ctx->fdsp, ch, sb,
+                    ff_atrac3p_generate_tones(ch_unit, ctx->fdsp, ch, sb,
                                               &ctx->time_buf[ch][sb * 128]);
                 }
         }
@@ -329,10 +336,8 @@ static int atrac3p_decode_frame(AVCodecContext *avctx, void *data,
     float **samples_p = (float **)frame->extended_data;
 
     frame->nb_samples = ATRAC3P_FRAME_SAMPLES;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if ((ret = init_get_bits8(&ctx->gb, avpkt->data, avpkt->size)) < 0)
         return ret;
@@ -379,18 +384,17 @@ static int atrac3p_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 1;
 
-    return avctx->block_align;
+    return FFMIN(avctx->block_align, avpkt->size);
 }
 
 AVCodec ff_atrac3p_decoder = {
-    .name             = "atrac3plus",
-    .long_name        = NULL_IF_CONFIG_SMALL("ATRAC3+ (Adaptive TRansform Acoustic Coding 3+)"),
-    .type             = AVMEDIA_TYPE_AUDIO,
-    .id               = AV_CODEC_ID_ATRAC3P,
-    .capabilities     = AV_CODEC_CAP_DR1,
-    .priv_data_size   = sizeof(ATRAC3PContext),
-    .init             = atrac3p_decode_init,
-    .init_static_data = ff_atrac3p_init_vlcs,
-    .close            = atrac3p_decode_close,
-    .decode           = atrac3p_decode_frame,
+    .name           = "atrac3plus",
+    .long_name      = NULL_IF_CONFIG_SMALL("ATRAC3+ (Adaptive TRansform Acoustic Coding 3+)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_ATRAC3P,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_data_size = sizeof(ATRAC3PContext),
+    .init           = atrac3p_decode_init,
+    .close          = atrac3p_decode_close,
+    .decode         = atrac3p_decode_frame,
 };
diff --git a/libavcodec/atrac3plusdsp.c b/libavcodec/atrac3plusdsp.c
index 468f098383..17c6437707 100644
--- a/libavcodec/atrac3plusdsp.c
+++ b/libavcodec/atrac3plusdsp.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -116,14 +116,16 @@ av_cold void ff_atrac3p_init_wave_synth(void)
  *  @param[in]    synth_param   ptr to common synthesis parameters
  *  @param[in]    waves_info    parameters for each sine wave
  *  @param[in]    envelope      envelope data for all waves in a group
- *  @param[in]    phase_shift   flag indicates 180° phase shift
+ *  @param[in]    fdsp          ptr to floating-point DSP context
+ *  @param[in]    invert_phase  flag indicating 180° phase shift
  *  @param[in]    reg_offset    region offset for trimming envelope data
  *  @param[out]   out           receives sythesized data
  */
 static void waves_synth(Atrac3pWaveSynthParams *synth_param,
                         Atrac3pWavesData *waves_info,
                         Atrac3pWaveEnvelope *envelope,
-                        int phase_shift, int reg_offset, float *out)
+                        AVFloatDSPContext *fdsp,
+                        int invert_phase, int reg_offset, float *out)
 {
     int i, wn, inc, pos;
     double amp;
@@ -146,6 +148,10 @@ static void waves_synth(Atrac3pWaveSynthParams *synth_param,
         }
     }
 
+    /* invert phase if requested */
+    if (invert_phase)
+        fdsp->vector_fmul_scalar(out, out, -1.0f, 128);
+
     /* fade in with steep Hann window if requested */
     if (envelope->has_start_point) {
         pos = (envelope->start_pos << 2) - reg_offset;
@@ -216,12 +222,12 @@ void ff_atrac3p_generate_tones(Atrac3pChanUnitCtx *ch_unit, AVFloatDSPContext *f
     /* synthesize waves for both overlapping regions */
     if (tones_now->num_wavs && reg1_env_nonzero)
         waves_synth(ch_unit->waves_info_prev, tones_now, &tones_now->curr_env,
-                    ch_unit->waves_info_prev->phase_shift[sb] & ch_num,
+                    fdsp, ch_unit->waves_info_prev->invert_phase[sb] & ch_num,
                     128, wavreg1);
 
     if (tones_next->num_wavs && reg2_env_nonzero)
-        waves_synth(ch_unit->waves_info, tones_next, &tones_next->curr_env,
-                    ch_unit->waves_info->phase_shift[sb] & ch_num, 0, wavreg2);
+        waves_synth(ch_unit->waves_info, tones_next, &tones_next->curr_env, fdsp,
+                    ch_unit->waves_info->invert_phase[sb] & ch_num, 0, wavreg2);
 
     /* Hann windowing for non-faded wave signals */
     if (tones_now->num_wavs && tones_next->num_wavs &&
@@ -599,8 +605,8 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist,
                      const float *in, float *out)
 {
     int i, s, sb, t, pos_now, pos_next;
-    DECLARE_ALIGNED(32, float, idct_in)[ATRAC3P_SUBBANDS];
-    DECLARE_ALIGNED(32, float, idct_out)[ATRAC3P_SUBBANDS];
+    LOCAL_ALIGNED(32, float, idct_in, [ATRAC3P_SUBBANDS]);
+    LOCAL_ALIGNED(32, float, idct_out, [ATRAC3P_SUBBANDS]);
 
     memset(out, 0, ATRAC3P_FRAME_SAMPLES * sizeof(*out));
 
diff --git a/libavcodec/audio_frame_queue.c b/libavcodec/audio_frame_queue.c
index c4ca02b01f..f2ccd69281 100644
--- a/libavcodec/audio_frame_queue.c
+++ b/libavcodec/audio_frame_queue.c
@@ -2,110 +2,72 @@
  * Audio Frame Queue
  * Copyright (c) 2012 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
-#include "libavutil/mathematics.h"
-#include "internal.h"
 #include "audio_frame_queue.h"
+#include "internal.h"
+#include "libavutil/avassert.h"
 
 av_cold void ff_af_queue_init(AVCodecContext *avctx, AudioFrameQueue *afq)
 {
-    afq->avctx             = avctx;
-    afq->next_pts          = AV_NOPTS_VALUE;
+    afq->avctx = avctx;
     afq->remaining_delay   = avctx->initial_padding;
     afq->remaining_samples = avctx->initial_padding;
-    afq->frame_queue       = NULL;
-}
-
-static void delete_next_frame(AudioFrameQueue *afq)
-{
-    AudioFrame *f = afq->frame_queue;
-    if (f) {
-        afq->frame_queue = f->next;
-        f->next = NULL;
-        av_freep(&f);
-    }
+    afq->frame_count       = 0;
 }
 
 void ff_af_queue_close(AudioFrameQueue *afq)
 {
-    /* remove/free any remaining frames */
-    while (afq->frame_queue)
-        delete_next_frame(afq);
+    if(afq->frame_count)
+        av_log(afq->avctx, AV_LOG_WARNING, "%d frames left in the queue on closing\n", afq->frame_count);
+    av_freep(&afq->frames);
     memset(afq, 0, sizeof(*afq));
 }
 
-#ifdef DEBUG
-static void af_queue_log_state(AudioFrameQueue *afq)
-{
-    AudioFrame *f;
-    ff_dlog(afq->avctx, "remaining delay   = %d\n", afq->remaining_delay);
-    ff_dlog(afq->avctx, "remaining samples = %d\n", afq->remaining_samples);
-    ff_dlog(afq->avctx, "frames:\n");
-    f = afq->frame_queue;
-    while (f) {
-        ff_dlog(afq->avctx, "  [ pts=%9"PRId64" duration=%d ]\n",
-                f->pts, f->duration);
-        f = f->next;
-    }
-}
-#endif /* DEBUG */
-
 int ff_af_queue_add(AudioFrameQueue *afq, const AVFrame *f)
 {
-    AudioFrame *new_frame;
-    AudioFrame *queue_end = afq->frame_queue;
-
-    /* find the end of the queue */
-    while (queue_end && queue_end->next)
-        queue_end = queue_end->next;
-
-    /* allocate new frame queue entry */
-    if (!(new_frame = av_malloc(sizeof(*new_frame))))
+    AudioFrame *new = av_fast_realloc(afq->frames, &afq->frame_alloc, sizeof(*afq->frames)*(afq->frame_count+1));
+    if(!new)
         return AVERROR(ENOMEM);
+    afq->frames = new;
+    new += afq->frame_count;
 
     /* get frame parameters */
-    new_frame->next = NULL;
-    new_frame->duration = f->nb_samples;
+    new->duration = f->nb_samples;
+    new->duration += afq->remaining_delay;
     if (f->pts != AV_NOPTS_VALUE) {
-        new_frame->pts = av_rescale_q(f->pts,
+        new->pts = av_rescale_q(f->pts,
                                       afq->avctx->time_base,
                                       (AVRational){ 1, afq->avctx->sample_rate });
-        afq->next_pts = new_frame->pts + new_frame->duration;
+        new->pts -= afq->remaining_delay;
+        if(afq->frame_count && new[-1].pts >= new->pts)
+            av_log(afq->avctx, AV_LOG_WARNING, "Queue input is backward in time\n");
     } else {
-        new_frame->pts = AV_NOPTS_VALUE;
-        afq->next_pts  = AV_NOPTS_VALUE;
+        new->pts = AV_NOPTS_VALUE;
     }
-
-    /* add new frame to the end of the queue */
-    if (!queue_end)
-        afq->frame_queue = new_frame;
-    else
-        queue_end->next = new_frame;
+    afq->remaining_delay = 0;
 
     /* add frame sample count */
     afq->remaining_samples += f->nb_samples;
 
-#ifdef DEBUG
-    af_queue_log_state(afq);
-#endif
+    afq->frame_count++;
 
     return 0;
 }
@@ -115,50 +77,37 @@ void ff_af_queue_remove(AudioFrameQueue *afq, int nb_samples, int64_t *pts,
 {
     int64_t out_pts = AV_NOPTS_VALUE;
     int removed_samples = 0;
+    int i;
 
-#ifdef DEBUG
-    af_queue_log_state(afq);
-#endif
-
-    /* get output pts from the next frame or generated pts */
-    if (afq->frame_queue) {
-        if (afq->frame_queue->pts != AV_NOPTS_VALUE)
-            out_pts = afq->frame_queue->pts - afq->remaining_delay;
-    } else {
-        if (afq->next_pts != AV_NOPTS_VALUE)
-            out_pts = afq->next_pts - afq->remaining_delay;
+    if (afq->frame_count || afq->frame_alloc) {
+        if (afq->frames->pts != AV_NOPTS_VALUE)
+            out_pts = afq->frames->pts;
     }
-    if (pts) {
-        if (out_pts != AV_NOPTS_VALUE)
-            *pts = ff_samples_to_time_base(afq->avctx, out_pts);
-        else
-            *pts = AV_NOPTS_VALUE;
-    }
-
-    /* if the delay is larger than the packet duration, we use up delay samples
-       for the output packet and leave all frames in the queue */
-    if (afq->remaining_delay >= nb_samples) {
-        removed_samples      += nb_samples;
-        afq->remaining_delay -= nb_samples;
-    }
-    /* remove frames from the queue until we have enough to cover the
-       requested number of samples or until the queue is empty */
-    while (removed_samples < nb_samples && afq->frame_queue) {
-        removed_samples += afq->frame_queue->duration;
-        delete_next_frame(afq);
+    if(!afq->frame_count)
+        av_log(afq->avctx, AV_LOG_WARNING, "Trying to remove %d samples, but the queue is empty\n", nb_samples);
+    if (pts)
+        *pts = ff_samples_to_time_base(afq->avctx, out_pts);
+
+    for(i=0; nb_samples && i<afq->frame_count; i++){
+        int n= FFMIN(afq->frames[i].duration, nb_samples);
+        afq->frames[i].duration -= n;
+        nb_samples              -= n;
+        removed_samples         += n;
+        if(afq->frames[i].pts != AV_NOPTS_VALUE)
+            afq->frames[i].pts      += n;
     }
     afq->remaining_samples -= removed_samples;
-
-    /* if there are no frames left and we have room for more samples, use
-       any remaining delay samples */
-    if (removed_samples < nb_samples && afq->remaining_samples > 0) {
-        int add_samples = FFMIN(afq->remaining_samples,
-                                nb_samples - removed_samples);
-        removed_samples        += add_samples;
-        afq->remaining_samples -= add_samples;
+    i -= i && afq->frames[i-1].duration;
+    memmove(afq->frames, afq->frames + i, sizeof(*afq->frames) * (afq->frame_count - i));
+    afq->frame_count -= i;
+
+    if(nb_samples){
+        av_assert0(!afq->frame_count);
+        av_assert0(afq->remaining_samples == afq->remaining_delay);
+        if(afq->frames && afq->frames[0].pts != AV_NOPTS_VALUE)
+            afq->frames[0].pts += nb_samples;
+        av_log(afq->avctx, AV_LOG_DEBUG, "Trying to remove %d more samples than there are in the queue\n", nb_samples);
     }
-    if (removed_samples > nb_samples)
-        av_log(afq->avctx, AV_LOG_WARNING, "frame_size is too large\n");
     if (duration)
         *duration = ff_samples_to_time_base(afq->avctx, removed_samples);
 }
diff --git a/libavcodec/audio_frame_queue.h b/libavcodec/audio_frame_queue.h
index 1250ec22eb..d8076eae54 100644
--- a/libavcodec/audio_frame_queue.h
+++ b/libavcodec/audio_frame_queue.h
@@ -2,20 +2,20 @@
  * Audio Frame Queue
  * Copyright (c) 2012 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,15 +27,15 @@
 typedef struct AudioFrame {
     int64_t pts;
     int duration;
-    struct AudioFrame *next;
 } AudioFrame;
 
 typedef struct AudioFrameQueue {
     AVCodecContext *avctx;
-    int64_t next_pts;
     int remaining_delay;
     int remaining_samples;
-    AudioFrame *frame_queue;
+    AudioFrame *frames;
+    unsigned frame_count;
+    unsigned frame_alloc;
 } AudioFrameQueue;
 
 /**
diff --git a/libavcodec/audioconvert.c b/libavcodec/audioconvert.c
new file mode 100644
index 0000000000..5e46fae2df
--- /dev/null
+++ b/libavcodec/audioconvert.c
@@ -0,0 +1,120 @@
+/*
+ * audio conversion
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * audio conversion
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/libm.h"
+#include "libavutil/samplefmt.h"
+#include "avcodec.h"
+#include "audioconvert.h"
+
+#if FF_API_AUDIO_CONVERT
+
+struct AVAudioConvert {
+    int in_channels, out_channels;
+    int fmt_pair;
+};
+
+AVAudioConvert *av_audio_convert_alloc(enum AVSampleFormat out_fmt, int out_channels,
+                                       enum AVSampleFormat in_fmt, int in_channels,
+                                       const float *matrix, int flags)
+{
+    AVAudioConvert *ctx;
+    if (in_channels!=out_channels)
+        return NULL;  /* FIXME: not supported */
+    ctx = av_malloc(sizeof(AVAudioConvert));
+    if (!ctx)
+        return NULL;
+    ctx->in_channels = in_channels;
+    ctx->out_channels = out_channels;
+    ctx->fmt_pair = out_fmt + AV_SAMPLE_FMT_NB*in_fmt;
+    return ctx;
+}
+
+void av_audio_convert_free(AVAudioConvert *ctx)
+{
+    av_free(ctx);
+}
+
+int av_audio_convert(AVAudioConvert *ctx,
+                           void * const out[6], const int out_stride[6],
+                     const void * const  in[6], const int  in_stride[6], int len)
+{
+    int ch;
+
+    //FIXME optimize common cases
+
+    for(ch=0; ch<ctx->out_channels; ch++){
+        const int is=  in_stride[ch];
+        const int os= out_stride[ch];
+        const uint8_t *pi=  in[ch];
+        uint8_t *po= out[ch];
+        uint8_t *end= po + os*len;
+        if(!out[ch])
+            continue;
+
+#define CONV(ofmt, otype, ifmt, expr)\
+if(ctx->fmt_pair == ofmt + AV_SAMPLE_FMT_NB*ifmt){\
+    do{\
+        *(otype*)po = expr; pi += is; po += os;\
+    }while(po < end);\
+}
+
+//FIXME put things below under ifdefs so we do not waste space for cases no codec will need
+//FIXME rounding ?
+
+             CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_U8 ,  *(const uint8_t*)pi)
+        else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_U8 , (*(const uint8_t*)pi - 0x80)<<8)
+        else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_U8 , (*(const uint8_t*)pi - 0x80)<<24)
+        else CONV(AV_SAMPLE_FMT_FLT, float  , AV_SAMPLE_FMT_U8 , (*(const uint8_t*)pi - 0x80)*(1.0 / (1<<7)))
+        else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_U8 , (*(const uint8_t*)pi - 0x80)*(1.0 / (1<<7)))
+        else CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_S16, (*(const int16_t*)pi>>8) + 0x80)
+        else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_S16,  *(const int16_t*)pi)
+        else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_S16,  *(const int16_t*)pi<<16)
+        else CONV(AV_SAMPLE_FMT_FLT, float  , AV_SAMPLE_FMT_S16,  *(const int16_t*)pi*(1.0 / (1<<15)))
+        else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_S16,  *(const int16_t*)pi*(1.0 / (1<<15)))
+        else CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_S32, (*(const int32_t*)pi>>24) + 0x80)
+        else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_S32,  *(const int32_t*)pi>>16)
+        else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_S32,  *(const int32_t*)pi)
+        else CONV(AV_SAMPLE_FMT_FLT, float  , AV_SAMPLE_FMT_S32,  *(const int32_t*)pi*(1.0 / (1U<<31)))
+        else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_S32,  *(const int32_t*)pi*(1.0 / (1U<<31)))
+        else CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_FLT, av_clip_uint8(  lrintf(*(const float*)pi * (1<<7)) + 0x80))
+        else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, av_clip_int16(  lrintf(*(const float*)pi * (1<<15))))
+        else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, av_clipl_int32(llrintf(*(const float*)pi * (1U<<31))))
+        else CONV(AV_SAMPLE_FMT_FLT, float  , AV_SAMPLE_FMT_FLT, *(const float*)pi)
+        else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_FLT, *(const float*)pi)
+        else CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_DBL, av_clip_uint8(  lrint(*(const double*)pi * (1<<7)) + 0x80))
+        else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, av_clip_int16(  lrint(*(const double*)pi * (1<<15))))
+        else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, av_clipl_int32(llrint(*(const double*)pi * (1U<<31))))
+        else CONV(AV_SAMPLE_FMT_FLT, float  , AV_SAMPLE_FMT_DBL, *(const double*)pi)
+        else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_DBL, *(const double*)pi)
+        else return -1;
+    }
+    return 0;
+}
+
+#endif /* FF_API_AUDIO_CONVERT */
diff --git a/libavcodec/audioconvert.h b/libavcodec/audioconvert.h
new file mode 100644
index 0000000000..996c3f37ad
--- /dev/null
+++ b/libavcodec/audioconvert.h
@@ -0,0 +1,86 @@
+/*
+ * audio conversion
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2008 Peter Ross
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AUDIOCONVERT_H
+#define AVCODEC_AUDIOCONVERT_H
+
+#include "version.h"
+
+/**
+ * @file
+ * Audio format conversion routines
+ * This interface is deprecated and will be dropped in a future
+ * version. You should use the libswresample library instead.
+ */
+
+#if FF_API_AUDIO_CONVERT
+
+#include "libavutil/cpu.h"
+#include "avcodec.h"
+#include "libavutil/channel_layout.h"
+
+struct AVAudioConvert;
+typedef struct AVAudioConvert AVAudioConvert;
+
+/**
+ * Create an audio sample format converter context
+ * @param out_fmt Output sample format
+ * @param out_channels Number of output channels
+ * @param in_fmt Input sample format
+ * @param in_channels Number of input channels
+ * @param[in] matrix Channel mixing matrix (of dimension in_channel*out_channels). Set to NULL to ignore.
+ * @param flags See AV_CPU_FLAG_xx
+ * @return NULL on error
+ * @deprecated See libswresample
+ */
+
+attribute_deprecated
+AVAudioConvert *av_audio_convert_alloc(enum AVSampleFormat out_fmt, int out_channels,
+                                       enum AVSampleFormat in_fmt, int in_channels,
+                                       const float *matrix, int flags);
+
+/**
+ * Free audio sample format converter context
+ * @deprecated See libswresample
+ */
+
+attribute_deprecated
+void av_audio_convert_free(AVAudioConvert *ctx);
+
+/**
+ * Convert between audio sample formats
+ * @param[in] out array of output buffers for each channel. set to NULL to ignore processing of the given channel.
+ * @param[in] out_stride distance between consecutive output samples (measured in bytes)
+ * @param[in] in array of input buffers for each channel
+ * @param[in] in_stride distance between consecutive input samples (measured in bytes)
+ * @param len length of audio frame size (measured in samples)
+ * @deprecated See libswresample
+ */
+
+attribute_deprecated
+int av_audio_convert(AVAudioConvert *ctx,
+                           void * const out[6], const int out_stride[6],
+                     const void * const  in[6], const int  in_stride[6], int len);
+
+#endif /* FF_API_AUDIO_CONVERT */
+
+#endif /* AVCODEC_AUDIOCONVERT_H */
diff --git a/libavcodec/audiodsp.c b/libavcodec/audiodsp.c
index f7e6167cb0..85b5a74947 100644
--- a/libavcodec/audiodsp.c
+++ b/libavcodec/audiodsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/audiodsp.h b/libavcodec/audiodsp.h
index 58205a1f19..b55bf858e0 100644
--- a/libavcodec/audiodsp.h
+++ b/libavcodec/audiodsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aura.c b/libavcodec/aura.c
index a1ef6f8d10..5f84d957fc 100644
--- a/libavcodec/aura.c
+++ b/libavcodec/aura.c
@@ -1,20 +1,20 @@
 /*
  * Aura 2 decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,10 +59,8 @@ static int aura_decode_frame(AVCodecContext *avctx,
     /* pixel data starts 48 bytes in, after 3x16-byte tables */
     buf += 48;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     Y = frame->data[0];
     U = frame->data[1];
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 11ae1fcd91..ff70d254af 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@
 #include "libavutil/avutil.h"
 #include "libavutil/buffer.h"
 #include "libavutil/cpu.h"
+#include "libavutil/channel_layout.h"
 #include "libavutil/dict.h"
 #include "libavutil/frame.h"
 #include "libavutil/log.h"
@@ -41,11 +42,6 @@
 
 #include "version.h"
 
-#if FF_API_FAST_MALLOC
-// to provide fast_*alloc
-#include "libavutil/mem.h"
-#endif
-
 /**
  * @defgroup libavc Encoding/Decoding Library
  * @{
@@ -97,7 +93,7 @@
  *
  * If you add a codec ID to this list, add it so that
  * 1. no value of a existing codec ID changes (that would break ABI),
- * 2. it is as close as possible to similar codecs.
+ * 2. it is as close as possible to similar codecs
  *
  * After adding new codec IDs, do not forget to add an entry to the codec
  * descriptor list and bump libavcodec minor version.
@@ -283,6 +279,7 @@ enum AVCodecID {
     AV_CODEC_ID_WEBP,
     AV_CODEC_ID_HNM4_VIDEO,
     AV_CODEC_ID_HEVC,
+#define AV_CODEC_ID_H265 AV_CODEC_ID_HEVC
     AV_CODEC_ID_FIC,
     AV_CODEC_ID_ALIAS_PIX,
     AV_CODEC_ID_BRENDER_PIX,
@@ -301,6 +298,22 @@ enum AVCodecID {
     AV_CODEC_ID_DXV,
     AV_CODEC_ID_SCREENPRESSO,
 
+    AV_CODEC_ID_Y41P = 0x8000,
+    AV_CODEC_ID_AVRP,
+    AV_CODEC_ID_012V,
+    AV_CODEC_ID_AVUI,
+    AV_CODEC_ID_AYUV,
+    AV_CODEC_ID_TARGA_Y216,
+    AV_CODEC_ID_V308,
+    AV_CODEC_ID_V408,
+    AV_CODEC_ID_YUV4,
+    AV_CODEC_ID_AVRN,
+    AV_CODEC_ID_CPIA,
+    AV_CODEC_ID_XFACE,
+    AV_CODEC_ID_SNOW,
+    AV_CODEC_ID_SMVJPEG,
+    AV_CODEC_ID_APNG,
+
     /* various PCM "codecs" */
     AV_CODEC_ID_FIRST_AUDIO = 0x10000,     ///< A dummy id pointing at the start of audio codecs
     AV_CODEC_ID_PCM_S16LE = 0x10000,
@@ -334,6 +347,9 @@ enum AVCodecID {
     AV_CODEC_ID_PCM_S24LE_PLANAR,
     AV_CODEC_ID_PCM_S32LE_PLANAR,
     AV_CODEC_ID_PCM_S16BE_PLANAR,
+    /* new PCM "codecs" should be added right below this line starting with
+     * an explicit value of for example 0x10800
+     */
 
     /* various ADPCM codecs */
     AV_CODEC_ID_ADPCM_IMA_QT = 0x11000,
@@ -367,6 +383,16 @@ enum AVCodecID {
     AV_CODEC_ID_ADPCM_G722,
     AV_CODEC_ID_ADPCM_IMA_APC,
     AV_CODEC_ID_ADPCM_VIMA,
+#if FF_API_VIMA_DECODER
+    AV_CODEC_ID_VIMA = AV_CODEC_ID_ADPCM_VIMA,
+#endif
+
+    AV_CODEC_ID_ADPCM_AFC = 0x11800,
+    AV_CODEC_ID_ADPCM_IMA_OKI,
+    AV_CODEC_ID_ADPCM_DTK,
+    AV_CODEC_ID_ADPCM_IMA_RAD,
+    AV_CODEC_ID_ADPCM_G726LE,
+    AV_CODEC_ID_ADPCM_THP_LE,
 
     /* AMR */
     AV_CODEC_ID_AMR_NB = 0x12000,
@@ -454,6 +480,17 @@ enum AVCodecID {
     AV_CODEC_ID_ON2AVC,
     AV_CODEC_ID_DSS_SP,
 
+    AV_CODEC_ID_FFWAVESYNTH = 0x15800,
+    AV_CODEC_ID_SONIC,
+    AV_CODEC_ID_SONIC_LS,
+    AV_CODEC_ID_EVRC,
+    AV_CODEC_ID_SMV,
+    AV_CODEC_ID_DSD_LSBF,
+    AV_CODEC_ID_DSD_MSBF,
+    AV_CODEC_ID_DSD_LSBF_PLANAR,
+    AV_CODEC_ID_DSD_MSBF_PLANAR,
+    AV_CODEC_ID_4GV,
+
     /* subtitle codecs */
     AV_CODEC_ID_FIRST_SUBTITLE = 0x17000,          ///< A dummy ID pointing at the start of subtitle codecs.
     AV_CODEC_ID_DVD_SUBTITLE = 0x17000,
@@ -466,10 +503,36 @@ enum AVCodecID {
     AV_CODEC_ID_DVB_TELETEXT,
     AV_CODEC_ID_SRT,
 
+    AV_CODEC_ID_MICRODVD   = 0x17800,
+    AV_CODEC_ID_EIA_608,
+    AV_CODEC_ID_JACOSUB,
+    AV_CODEC_ID_SAMI,
+    AV_CODEC_ID_REALTEXT,
+    AV_CODEC_ID_STL,
+    AV_CODEC_ID_SUBVIEWER1,
+    AV_CODEC_ID_SUBVIEWER,
+    AV_CODEC_ID_SUBRIP,
+    AV_CODEC_ID_WEBVTT,
+    AV_CODEC_ID_MPL2,
+    AV_CODEC_ID_VPLAYER,
+    AV_CODEC_ID_PJS,
+    AV_CODEC_ID_ASS,
+    AV_CODEC_ID_HDMV_TEXT_SUBTITLE,
+
     /* other specific kind of codecs (generally used for attachments) */
     AV_CODEC_ID_FIRST_UNKNOWN = 0x18000,           ///< A dummy ID pointing at the start of various fake codecs.
     AV_CODEC_ID_TTF = 0x18000,
 
+    AV_CODEC_ID_BINTEXT    = 0x18800,
+    AV_CODEC_ID_XBIN,
+    AV_CODEC_ID_IDF,
+    AV_CODEC_ID_OTF,
+    AV_CODEC_ID_SMPTE_KLV,
+    AV_CODEC_ID_DVD_NAV,
+    AV_CODEC_ID_TIMED_ID3,
+    AV_CODEC_ID_BIN_DATA,
+
+
     AV_CODEC_ID_PROBE = 0x19000, ///< codec_id is not known (like AV_CODEC_ID_NONE) but lavf should attempt to identify it
 
     AV_CODEC_ID_MPEG2TS = 0x20000, /**< _FAKE_ codec to indicate a raw MPEG-2 TS
@@ -501,6 +564,13 @@ typedef struct AVCodecDescriptor {
      * Codec properties, a combination of AV_CODEC_PROP_* flags.
      */
     int             props;
+
+    /**
+     * MIME type(s) associated with the codec.
+     * May be NULL; if not, a NULL-terminated array of MIME types.
+     * The first item is always non-NULL and is the preferred MIME type.
+     */
+    const char *const *mime_types;
 } AVCodecDescriptor;
 
 /**
@@ -528,6 +598,16 @@ typedef struct AVCodecDescriptor {
  * equal.
  */
 #define AV_CODEC_PROP_REORDER       (1 << 3)
+/**
+ * Subtitle codec is bitmap based
+ * Decoded AVSubtitle data can be read from the AVSubtitleRect->pict field.
+ */
+#define AV_CODEC_PROP_BITMAP_SUB    (1 << 16)
+/**
+ * Subtitle codec is text based.
+ * Decoded AVSubtitle data can be read from the AVSubtitleRect->ass field.
+ */
+#define AV_CODEC_PROP_TEXT_SUB      (1 << 17)
 
 /**
  * @ingroup lavc_decoding
@@ -537,7 +617,7 @@ typedef struct AVCodecDescriptor {
  * Note: If the first 23 bits of the additional bytes are not 0, then damaged
  * MPEG bitstreams could cause overread and segfault.
  */
-#define AV_INPUT_BUFFER_PADDING_SIZE 8
+#define AV_INPUT_BUFFER_PADDING_SIZE 32
 
 /**
  * @ingroup lavc_encoding
@@ -550,7 +630,7 @@ typedef struct AVCodecDescriptor {
 /**
  * @deprecated use AV_INPUT_BUFFER_PADDING_SIZE instead
  */
-#define FF_INPUT_BUFFER_PADDING_SIZE 8
+#define FF_INPUT_BUFFER_PADDING_SIZE 32
 
 /**
  * @deprecated use AV_INPUT_BUFFER_MIN_SIZE instead
@@ -574,6 +654,7 @@ enum Motion_Est_ID {
     ME_HEX,         ///< hexagon based search
     ME_UMH,         ///< uneven multi-hexagon search
     ME_TESA,        ///< transformed exhaustive search algorithm
+    ME_ITER=50,     ///< iterative search
 };
 #endif
 
@@ -587,6 +668,7 @@ enum AVDiscard{
     AVDISCARD_DEFAULT =  0, ///< discard useless packets like 0 size packets in avi
     AVDISCARD_NONREF  =  8, ///< discard all non reference
     AVDISCARD_BIDIR   = 16, ///< discard all bidirectional frames
+    AVDISCARD_NONINTRA= 24, ///< discard all non intra frames
     AVDISCARD_NONKEY  = 32, ///< discard all frames except keyframes
     AVDISCARD_ALL     = 48, ///< discard all
 };
@@ -700,7 +782,7 @@ typedef struct RcOverride{
 /**
  * Allow non spec compliant speedup tricks.
  */
-#define AV_CODEC_FLAG_CLOSED_GOP      (1 << 31)
+#define AV_CODEC_FLAG_CLOSED_GOP      (1U << 31)
 
 #define AV_CODEC_FLAG2_FAST           (1 <<  0)
 /**
@@ -711,6 +793,12 @@ typedef struct RcOverride{
  * Place global headers at every keyframe instead of in extradata.
  */
 #define AV_CODEC_FLAG2_LOCAL_HEADER   (1 <<  3)
+
+/**
+ * timecode is in drop frame format. DEPRECATED!!!!
+ */
+#define AV_CODEC_FLAG2_DROP_FRAME_TIMECODE (1 << 13)
+
 /**
  * Input bitstream might be truncated at a packet boundaries
  * instead of only at frame boundaries.
@@ -721,6 +809,19 @@ typedef struct RcOverride{
  */
 #define AV_CODEC_FLAG2_IGNORE_CROP    (1 << 16)
 
+/**
+ * Show all frames before the first keyframe
+ */
+#define AV_CODEC_FLAG2_SHOW_ALL       (1 << 22)
+/**
+ * Export motion vectors through frame side data
+ */
+#define AV_CODEC_FLAG2_EXPORT_MVS     (1 << 28)
+/**
+ * Do not skip samples and export skip information as frame side data
+ */
+#define AV_CODEC_FLAG2_SKIP_MANUAL    (1 << 29)
+
 /* Unsupported options :
  *              Syntax Arithmetic coding (SAC)
  *              Reference Picture Selection
@@ -768,6 +869,14 @@ typedef struct RcOverride{
  * This can be used to prevent truncation of the last audio samples.
  */
 #define AV_CODEC_CAP_SMALL_LAST_FRAME    (1 <<  6)
+
+#if FF_API_CAP_VDPAU
+/**
+ * Codec can export data for HW decoding (VDPAU).
+ */
+#define AV_CODEC_CAP_HWACCEL_VDPAU       (1 <<  7)
+#endif
+
 /**
  * Codec can output multiple frames per AVPacket
  * Normally demuxers return one frame at a time, demuxers which do not do
@@ -809,17 +918,26 @@ typedef struct RcOverride{
  * Audio encoder supports receiving a different number of samples in each call.
  */
 #define AV_CODEC_CAP_VARIABLE_FRAME_SIZE (1 << 16)
+/**
+ * Codec is intra only.
+ */
+#define AV_CODEC_CAP_INTRA_ONLY       0x40000000
+/**
+ * Codec is lossless.
+ */
+#define AV_CODEC_CAP_LOSSLESS         0x80000000
+
 
 #if FF_API_WITHOUT_PREFIX
 /**
  * Allow decoders to produce frames with data planes that are not aligned
  * to CPU requirements (e.g. due to cropping).
  */
-#define CODEC_FLAG_UNALIGNED 0x0001
-#define CODEC_FLAG_QSCALE 0x0002  ///< Use fixed qscale.
-#define CODEC_FLAG_4MV    0x0004  ///< 4 MV per MB allowed / advanced prediction for H.263.
-#define CODEC_FLAG_OUTPUT_CORRUPT 0x0008 ///< Output even those frames that might be corrupted
-#define CODEC_FLAG_QPEL   0x0010  ///< Use qpel MC.
+#define CODEC_FLAG_UNALIGNED AV_CODEC_FLAG_UNALIGNED
+#define CODEC_FLAG_QSCALE AV_CODEC_FLAG_QSCALE
+#define CODEC_FLAG_4MV    AV_CODEC_FLAG_4MV
+#define CODEC_FLAG_OUTPUT_CORRUPT AV_CODEC_FLAG_OUTPUT_CORRUPT
+#define CODEC_FLAG_QPEL   AV_CODEC_FLAG_QPEL
 #if FF_API_GMC
 /**
  * @deprecated use the "gmc" private option of the libxvid encoder
@@ -840,9 +958,9 @@ typedef struct RcOverride{
  */
 #define CODEC_FLAG_INPUT_PRESERVED 0x0100
 #endif
-#define CODEC_FLAG_PASS1           0x0200   ///< Use internal 2pass ratecontrol in first pass mode.
-#define CODEC_FLAG_PASS2           0x0400   ///< Use internal 2pass ratecontrol in second pass mode.
-#define CODEC_FLAG_GRAY            0x2000   ///< Only decode/encode grayscale.
+#define CODEC_FLAG_PASS1           AV_CODEC_FLAG_PASS1
+#define CODEC_FLAG_PASS2           AV_CODEC_FLAG_PASS2
+#define CODEC_FLAG_GRAY            AV_CODEC_FLAG_GRAY
 #if FF_API_EMU_EDGE
 /**
  * @deprecated edges are not used/required anymore. I.e. this flag is now always
@@ -850,9 +968,9 @@ typedef struct RcOverride{
  */
 #define CODEC_FLAG_EMU_EDGE        0x4000
 #endif
-#define CODEC_FLAG_PSNR            0x8000   ///< error[?] variables will be set during encoding.
-#define CODEC_FLAG_TRUNCATED       0x00010000 /** Input bitstream might be truncated at a random
-                                                  location instead of only at frame boundaries. */
+#define CODEC_FLAG_PSNR            AV_CODEC_FLAG_PSNR
+#define CODEC_FLAG_TRUNCATED       AV_CODEC_FLAG_TRUNCATED
+
 #if FF_API_NORMALIZE_AQP
 /**
  * @deprecated use the flag "naq" in the "mpv_flags" private option of the
@@ -860,21 +978,24 @@ typedef struct RcOverride{
  */
 #define CODEC_FLAG_NORMALIZE_AQP  0x00020000
 #endif
-#define CODEC_FLAG_INTERLACED_DCT 0x00040000 ///< Use interlaced DCT.
-#define CODEC_FLAG_LOW_DELAY      0x00080000 ///< Force low delay.
-#define CODEC_FLAG_GLOBAL_HEADER  0x00400000 ///< Place global headers in extradata instead of every keyframe.
-#define CODEC_FLAG_BITEXACT       0x00800000 ///< Use only bitexact stuff (except (I)DCT).
-/* Fx : Flag for h263+ extra options */
-#define CODEC_FLAG_AC_PRED        0x01000000 ///< H.263 advanced intra coding / MPEG-4 AC prediction
-#define CODEC_FLAG_LOOP_FILTER    0x00000800 ///< loop filter
-#define CODEC_FLAG_INTERLACED_ME  0x20000000 ///< interlaced motion estimation
-#define CODEC_FLAG_CLOSED_GOP     0x80000000
-#define CODEC_FLAG2_FAST          0x00000001 ///< Allow non spec compliant speedup tricks.
-#define CODEC_FLAG2_NO_OUTPUT     0x00000004 ///< Skip bitstream encoding.
-#define CODEC_FLAG2_LOCAL_HEADER  0x00000008 ///< Place global headers at every keyframe instead of in extradata.
-#define CODEC_FLAG2_IGNORE_CROP   0x00010000 ///< Discard cropping information from SPS.
-
-#define CODEC_FLAG2_CHUNKS        0x00008000 ///< Input bitstream might be truncated at a packet boundaries instead of only at frame boundaries.
+#define CODEC_FLAG_INTERLACED_DCT AV_CODEC_FLAG_INTERLACED_DCT
+#define CODEC_FLAG_LOW_DELAY      AV_CODEC_FLAG_LOW_DELAY
+#define CODEC_FLAG_GLOBAL_HEADER  AV_CODEC_FLAG_GLOBAL_HEADER
+#define CODEC_FLAG_BITEXACT       AV_CODEC_FLAG_BITEXACT
+#define CODEC_FLAG_AC_PRED        AV_CODEC_FLAG_AC_PRED
+#define CODEC_FLAG_LOOP_FILTER    AV_CODEC_FLAG_LOOP_FILTER
+#define CODEC_FLAG_INTERLACED_ME  AV_CODEC_FLAG_INTERLACED_ME
+#define CODEC_FLAG_CLOSED_GOP     AV_CODEC_FLAG_CLOSED_GOP
+#define CODEC_FLAG2_FAST          AV_CODEC_FLAG2_FAST
+#define CODEC_FLAG2_NO_OUTPUT     AV_CODEC_FLAG2_NO_OUTPUT
+#define CODEC_FLAG2_LOCAL_HEADER  AV_CODEC_FLAG2_LOCAL_HEADER
+#define CODEC_FLAG2_DROP_FRAME_TIMECODE AV_CODEC_FLAG2_DROP_FRAME_TIMECODE
+#define CODEC_FLAG2_IGNORE_CROP   AV_CODEC_FLAG2_IGNORE_CROP
+
+#define CODEC_FLAG2_CHUNKS        AV_CODEC_FLAG2_CHUNKS
+#define CODEC_FLAG2_SHOW_ALL      AV_CODEC_FLAG2_SHOW_ALL
+#define CODEC_FLAG2_EXPORT_MVS    AV_CODEC_FLAG2_EXPORT_MVS
+#define CODEC_FLAG2_SKIP_MANUAL   AV_CODEC_FLAG2_SKIP_MANUAL
 
 /* Unsupported options :
  *              Syntax Arithmetic coding (SAC)
@@ -883,16 +1004,22 @@ typedef struct RcOverride{
 /* /Fx */
 /* codec capabilities */
 
-#define CODEC_CAP_DRAW_HORIZ_BAND 0x0001 ///< Decoder can use draw_horiz_band callback.
+#define CODEC_CAP_DRAW_HORIZ_BAND AV_CODEC_CAP_DRAW_HORIZ_BAND ///< Decoder can use draw_horiz_band callback.
 /**
  * Codec uses get_buffer() for allocating buffers and supports custom allocators.
  * If not set, it might not use get_buffer() at all or use operations that
  * assume the buffer was allocated by avcodec_default_get_buffer.
  */
-#define CODEC_CAP_DR1             0x0002
-#define CODEC_CAP_TRUNCATED       0x0008
+#define CODEC_CAP_DR1             AV_CODEC_CAP_DR1
+#define CODEC_CAP_TRUNCATED       AV_CODEC_CAP_TRUNCATED
 #if FF_API_XVMC
-/* Codec can export data for HW decoding (XvMC). */
+/* Codec can export data for HW decoding. This flag indicates that
+ * the codec would call get_format() with list that might contain HW accelerated
+ * pixel formats (XvMC, VDPAU, VAAPI, etc). The application can pick any of them
+ * including raw image format.
+ * The application can use the passed context to determine bitstream version,
+ * chroma format, resolution etc.
+ */
 #define CODEC_CAP_HWACCEL         0x0010
 #endif /* FF_API_XVMC */
 /**
@@ -918,17 +1045,17 @@ typedef struct RcOverride{
  *       each output packet. If this flag is not set, the pts and duration will
  *       be determined by libavcodec from the input frame.
  */
-#define CODEC_CAP_DELAY           0x0020
+#define CODEC_CAP_DELAY           AV_CODEC_CAP_DELAY
 /**
  * Codec can be fed a final frame with a smaller size.
  * This can be used to prevent truncation of the last audio samples.
  */
-#define CODEC_CAP_SMALL_LAST_FRAME 0x0040
+#define CODEC_CAP_SMALL_LAST_FRAME AV_CODEC_CAP_SMALL_LAST_FRAME
 #if FF_API_CAP_VDPAU
 /**
  * Codec can export data for HW decoding (VDPAU).
  */
-#define CODEC_CAP_HWACCEL_VDPAU    0x0080
+#define CODEC_CAP_HWACCEL_VDPAU    AV_CODEC_CAP_HWACCEL_VDPAU
 #endif
 /**
  * Codec can output multiple frames per AVPacket
@@ -941,16 +1068,16 @@ typedef struct RcOverride{
  * prohibiting stream copy in many cases thus it should only be considered
  * as a last resort.
  */
-#define CODEC_CAP_SUBFRAMES        0x0100
+#define CODEC_CAP_SUBFRAMES        AV_CODEC_CAP_SUBFRAMES
 /**
  * Codec is experimental and is thus avoided in favor of non experimental
  * encoders
  */
-#define CODEC_CAP_EXPERIMENTAL     0x0200
+#define CODEC_CAP_EXPERIMENTAL     AV_CODEC_CAP_EXPERIMENTAL
 /**
  * Codec should fill in channel configuration and samplerate instead of container
  */
-#define CODEC_CAP_CHANNEL_CONF     0x0400
+#define CODEC_CAP_CHANNEL_CONF     AV_CODEC_CAP_CHANNEL_CONF
 #if FF_API_NEG_LINESIZES
 /**
  * @deprecated no codecs use this capability
@@ -960,23 +1087,37 @@ typedef struct RcOverride{
 /**
  * Codec supports frame-level multithreading.
  */
-#define CODEC_CAP_FRAME_THREADS    0x1000
+#define CODEC_CAP_FRAME_THREADS    AV_CODEC_CAP_FRAME_THREADS
 /**
  * Codec supports slice-based (or partition-based) multithreading.
  */
-#define CODEC_CAP_SLICE_THREADS    0x2000
+#define CODEC_CAP_SLICE_THREADS    AV_CODEC_CAP_SLICE_THREADS
 /**
  * Codec supports changed parameters at any point.
  */
-#define CODEC_CAP_PARAM_CHANGE     0x4000
+#define CODEC_CAP_PARAM_CHANGE     AV_CODEC_CAP_PARAM_CHANGE
 /**
  * Codec supports avctx->thread_count == 0 (auto).
  */
-#define CODEC_CAP_AUTO_THREADS     0x8000
+#define CODEC_CAP_AUTO_THREADS     AV_CODEC_CAP_AUTO_THREADS
 /**
  * Audio encoder supports receiving a different number of samples in each call.
  */
-#define CODEC_CAP_VARIABLE_FRAME_SIZE 0x10000
+#define CODEC_CAP_VARIABLE_FRAME_SIZE AV_CODEC_CAP_VARIABLE_FRAME_SIZE
+/**
+ * Codec is intra only.
+ */
+#define CODEC_CAP_INTRA_ONLY       AV_CODEC_CAP_INTRA_ONLY
+/**
+ * Codec is lossless.
+ */
+#define CODEC_CAP_LOSSLESS         AV_CODEC_CAP_LOSSLESS
+
+/**
+ * HWAccel is experimental and is thus avoided in favor of non experimental
+ * codecs
+ */
+#define HWACCEL_CODEC_CAP_EXPERIMENTAL     0x0200
 #endif /* FF_API_WITHOUT_PREFIX */
 
 #if FF_API_MB_TYPE
@@ -1122,13 +1263,84 @@ enum AVPacketSideDataType {
     AV_PKT_DATA_AUDIO_SERVICE_TYPE,
 
     /**
-     * This side data contains an integer value representing the quality
-     * factor of the compressed frame. Allowed range is between 1 (good)
-     * and FF_LAMBDA_MAX (bad).
+     * This side data contains quality related information from the encoder.
+     * @code
+     * u32le quality factor of the compressed frame. Allowed range is between 1 (good) and FF_LAMBDA_MAX (bad).
+     * u8    picture type
+     * u8    error count
+     * u16   reserved
+     * u64le[error count] sum of squared differences between encoder in and output
+     * @endcode
+     */
+    AV_PKT_DATA_QUALITY_STATS,
+
+    /**
+     * Recommmends skipping the specified number of samples
+     * @code
+     * u32le number of samples to skip from start of this packet
+     * u32le number of samples to skip from end of this packet
+     * u8    reason for start skip
+     * u8    reason for end   skip (0=padding silence, 1=convergence)
+     * @endcode
      */
-    AV_PKT_DATA_QUALITY_FACTOR,
+    AV_PKT_DATA_SKIP_SAMPLES=70,
+
+    /**
+     * An AV_PKT_DATA_JP_DUALMONO side data packet indicates that
+     * the packet may contain "dual mono" audio specific to Japanese DTV
+     * and if it is true, recommends only the selected channel to be used.
+     * @code
+     * u8    selected channels (0=mail/left, 1=sub/right, 2=both)
+     * @endcode
+     */
+    AV_PKT_DATA_JP_DUALMONO,
+
+    /**
+     * A list of zero terminated key/value strings. There is no end marker for
+     * the list, so it is required to rely on the side data size to stop.
+     */
+    AV_PKT_DATA_STRINGS_METADATA,
+
+    /**
+     * Subtitle event position
+     * @code
+     * u32le x1
+     * u32le y1
+     * u32le x2
+     * u32le y2
+     * @endcode
+     */
+    AV_PKT_DATA_SUBTITLE_POSITION,
+
+    /**
+     * Data found in BlockAdditional element of matroska container. There is
+     * no end marker for the data, so it is required to rely on the side data
+     * size to recognize the end. 8 byte id (as found in BlockAddId) followed
+     * by data.
+     */
+    AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL,
+
+    /**
+     * The optional first identifier line of a WebVTT cue.
+     */
+    AV_PKT_DATA_WEBVTT_IDENTIFIER,
+
+    /**
+     * The optional settings (rendering instructions) that immediately
+     * follow the timestamp specifier of a WebVTT cue.
+     */
+    AV_PKT_DATA_WEBVTT_SETTINGS,
+
+    /**
+     * A list of zero terminated key/value strings. There is no end marker for
+     * the list, so it is required to rely on the side data size to stop. This
+     * side data includes updated metadata which appeared in the stream.
+     */
+    AV_PKT_DATA_METADATA_UPDATE,
 };
 
+#define AV_PKT_DATA_QUALITY_FACTOR AV_PKT_DATA_QUALITY_STATS //DEPRECATED
+
 typedef struct AVPacketSideData {
     uint8_t *data;
     int      size;
@@ -1145,7 +1357,7 @@ typedef struct AVPacketSideData {
  * packets, with no compressed data, containing only side data
  * (e.g. to update some stream parameters at the end of encoding).
  *
- * AVPacket is one of the few structs in Libav, whose size is a part of public
+ * AVPacket is one of the few structs in FFmpeg, whose size is a part of public
  * ABI. Thus it may be allocated on stack and no new fields can be added to it
  * without libavcodec and libavformat major bump.
  *
@@ -1243,6 +1455,8 @@ enum AVFieldOrder {
  * New fields can be added to the end with minor version bumps.
  * Removal, reordering and changes to existing fields require a major
  * version bump.
+ * Please use AVOptions (av_opt* / av_set/get*()) to access these fields from user
+ * applications.
  * sizeof(AVCodecContext) must not be used outside libav*.
  */
 typedef struct AVCodecContext {
@@ -1307,9 +1521,10 @@ typedef struct AVCodecContext {
     /**
      * the average bitrate
      * - encoding: Set by user; unused for constant quantizer encoding.
-     * - decoding: Set by libavcodec. 0 or some bitrate if this info is available in the stream.
+     * - decoding: Set by user, may be overwritten by libavcodec
+     *             if this info is available in the stream
      */
-    int bit_rate;
+    int64_t bit_rate;
 
     /**
      * number of bits the bitstream is allowed to diverge from the reference.
@@ -1354,7 +1569,7 @@ typedef struct AVCodecContext {
      * rv10: additional flags
      * mpeg4: global headers (they can be in the bitstream or here)
      * The allocated memory should be AV_INPUT_BUFFER_PADDING_SIZE bytes larger
-     * than extradata_size to avoid prolems if it is read with the bitstream reader.
+     * than extradata_size to avoid problems if it is read with the bitstream reader.
      * The bytewise contents of extradata must not depend on the architecture or CPU endianness.
      * - encoding: Set/allocated/freed by libavcodec.
      * - decoding: Set/allocated/freed by user.
@@ -1367,6 +1582,8 @@ typedef struct AVCodecContext {
      * of which frame timestamps are represented. For fixed-fps content,
      * timebase should be 1/framerate and timestamp increments should be
      * identically 1.
+     * This often, but not always is the inverse of the frame rate or field rate
+     * for video.
      * - encoding: MUST be set by user.
      * - decoding: the use of this field for decoding is deprecated.
      *             Use framerate instead.
@@ -1385,6 +1602,11 @@ typedef struct AVCodecContext {
     /**
      * Codec delay.
      *
+     * Encoding: Number of frames delay there will be from the encoder input to
+     *           the decoder output. (we assume the decoder matches the spec)
+     * Decoding: Number of frames delay in addition to what a standard decoder
+     *           as specified in the spec would produce.
+     *
      * Video:
      *   Number of frames the decoded output will be delayed relative to the
      *   encoded input.
@@ -1420,7 +1642,7 @@ typedef struct AVCodecContext {
 
     /**
      * Bitstream width / height, may be different from width/height e.g. when
-     * the decoded frame is cropped before being output.
+     * the decoded frame is cropped before being output or lowres is enabled.
      *
      * @note Those field may not match the value of the last
      * AVFrame outputted by avcodec_decode_video2 due frame
@@ -1447,7 +1669,7 @@ typedef struct AVCodecContext {
     /**
      * Pixel format, see AV_PIX_FMT_xxx.
      * May be set by the demuxer if known from headers.
-     * May be overriden by the decoder if it knows better.
+     * May be overridden by the decoder if it knows better.
      *
      * @note This field may not match the value of the last
      * AVFrame outputted by avcodec_decode_video2 due frame
@@ -1674,6 +1896,8 @@ typedef struct AVCodecContext {
 #define FF_CMP_VSAD   8
 #define FF_CMP_VSSE   9
 #define FF_CMP_NSSE   10
+#define FF_CMP_W53    11
+#define FF_CMP_W97    12
 #define FF_CMP_DCTMAX 13
 #define FF_CMP_DCT264 14
 #define FF_CMP_CHROMA 256
@@ -1777,7 +2001,7 @@ typedef struct AVCodecContext {
      * XVideo Motion Acceleration
      * - encoding: forbidden
      * - decoding: set by decoder
-     * @deprecated XvMC support is slated for removal.
+     * @deprecated XvMC doesn't need it anymore.
      */
     attribute_deprecated int xvmc_acceleration;
 #endif /* FF_API_XVMC */
@@ -1838,7 +2062,7 @@ typedef struct AVCodecContext {
     /**
      * precision of the intra DC coefficient - 8
      * - encoding: Set by user.
-     * - decoding: unused
+     * - decoding: Set by libavcodec
      */
     int intra_dc_precision;
 
@@ -1990,7 +2214,7 @@ typedef struct AVCodecContext {
 
     /** Field order
      * - encoding: set by libavcodec
-     * - decoding: Set by libavcodec
+     * - decoding: Set by user.
      */
     enum AVFieldOrder field_order;
 
@@ -2044,7 +2268,7 @@ typedef struct AVCodecContext {
     /**
      * Audio channel layout.
      * - encoding: set by user.
-     * - decoding: set by libavcodec.
+     * - decoding: set by user, may be overwritten by libavcodec.
      */
     uint64_t channel_layout;
 
@@ -2063,9 +2287,10 @@ typedef struct AVCodecContext {
     enum AVAudioServiceType audio_service_type;
 
     /**
-     * Used to request a sample format from the decoder.
-     * - encoding: unused.
+     * desired sample format
+     * - encoding: Not used.
      * - decoding: Set by user.
+     * Decoder will decode to this format if it can.
      */
     enum AVSampleFormat request_sample_fmt;
 
@@ -2123,6 +2348,8 @@ typedef struct AVCodecContext {
      * avcodec_align_dimensions2() should be used to find the required width and
      * height, as they normally need to be rounded up to the next multiple of 16.
      *
+     * Some decoders do not support linesizes changing between frames.
+     *
      * If frame multithreading is used and thread_safe_callbacks is set,
      * this callback may be called from a different thread, but not from more
      * than one at once. Does not need to be reentrant.
@@ -2226,16 +2453,16 @@ typedef struct AVCodecContext {
     /**
      * maximum bitrate
      * - encoding: Set by user.
-     * - decoding: unused
+     * - decoding: Set by user, may be overwritten by libavcodec.
      */
-    int rc_max_rate;
+    int64_t rc_max_rate;
 
     /**
      * minimum bitrate
      * - encoding: Set by user.
      * - decoding: unused
      */
-    int rc_min_rate;
+    int64_t rc_min_rate;
 
 #if FF_API_MPV_OPT
     /**
@@ -2352,9 +2579,9 @@ typedef struct AVCodecContext {
     int max_prediction_order;
 
     /**
-     * GOP timecode frame start number, in non drop frame format
-     * - encoding: Set by user.
-     * - decoding: unused
+     * GOP timecode frame start number
+     * - encoding: Set by user, in non drop frame format
+     * - decoding: Set by libavcodec (timecode in the 25 bits format, -1 if unset)
      */
     int64_t timecode_frame_start;
 
@@ -2459,6 +2686,7 @@ typedef struct AVCodecContext {
     int error_concealment;
 #define FF_EC_GUESS_MVS   1
 #define FF_EC_DEBLOCK     2
+#define FF_EC_FAVOR_INTER 256
 
     /**
      * debug
@@ -2487,17 +2715,21 @@ typedef struct AVCodecContext {
 #define FF_DEBUG_MMCO        0x00000800
 #define FF_DEBUG_BUGS        0x00001000
 #if FF_API_DEBUG_MV
-#define FF_DEBUG_VIS_QP      0x00002000
-#define FF_DEBUG_VIS_MB_TYPE 0x00004000
+#define FF_DEBUG_VIS_QP      0x00002000 ///< only access through AVOptions from outside libavcodec
+#define FF_DEBUG_VIS_MB_TYPE 0x00004000 ///< only access through AVOptions from outside libavcodec
 #endif
 #define FF_DEBUG_BUFFERS     0x00008000
 #define FF_DEBUG_THREADS     0x00010000
+#define FF_DEBUG_GREEN_MD    0x00800000
+#define FF_DEBUG_NOMC        0x01000000
 
 #if FF_API_DEBUG_MV
     /**
-     * @deprecated this option does not have any effect
+     * debug
+     * Code outside libavcodec should access this field using AVOptions
+     * - encoding: Set by user.
+     * - decoding: Set by user.
      */
-    attribute_deprecated
     int debug_mv;
 #define FF_DEBUG_VIS_MV_P_FOR  0x00000001 //visualize forward predicted MVs of P frames
 #define FF_DEBUG_VIS_MV_B_FOR  0x00000002 //visualize forward predicted MVs of B frames
@@ -2518,9 +2750,15 @@ typedef struct AVCodecContext {
  * decoder returning an error.
  */
 #define AV_EF_CRCCHECK  (1<<0)
-#define AV_EF_BITSTREAM (1<<1)
-#define AV_EF_BUFFER    (1<<2)
-#define AV_EF_EXPLODE   (1<<3)
+#define AV_EF_BITSTREAM (1<<1)          ///< detect bitstream specification deviations
+#define AV_EF_BUFFER    (1<<2)          ///< detect improper bitstream length
+#define AV_EF_EXPLODE   (1<<3)          ///< abort decoding on minor error detection
+
+#define AV_EF_IGNORE_ERR (1<<15)        ///< ignore errors and continue
+#define AV_EF_CAREFUL    (1<<16)        ///< consider things that violate the spec, are fast to calculate and have not been seen in the wild as errors
+#define AV_EF_COMPLIANT  (1<<17)        ///< consider all spec non compliances as errors
+#define AV_EF_AGGRESSIVE (1<<18)        ///< consider things that a sane encoder should not do as an error
+
 
     /**
      * opaque 64bit number (generally a PTS) that will be reordered and
@@ -2541,8 +2779,8 @@ typedef struct AVCodecContext {
      * Hardware accelerator context.
      * For some hardware accelerators, a global context needs to be
      * provided by the user. In that case, this holds display-dependent
-     * data Libav cannot instantiate itself. Please refer to the
-     * Libav HW accelerator documentation to know how to fill this
+     * data FFmpeg cannot instantiate itself. Please refer to the
+     * FFmpeg HW accelerator documentation to know how to fill this
      * is. e.g. for VA API, this is a struct vaapi_context.
      * - encoding: unused
      * - decoding: Set by user
@@ -2602,6 +2840,7 @@ typedef struct AVCodecContext {
 #if FF_API_ARCH_ALPHA
 #define FF_IDCT_SIMPLEALPHA   23
 #endif
+#define FF_IDCT_SIMPLEAUTO    128
 
     /**
      * bits per sample/pixel from the demuxer (needed for huffyuv).
@@ -2622,10 +2861,10 @@ typedef struct AVCodecContext {
      * low resolution decoding, 1-> 1/2 size, 2->1/4 size
      * - encoding: unused
      * - decoding: Set by user.
-     *
-     * @deprecated use decoder private options instead
+     * Code outside libavcodec should access this field using:
+     * av_codec_{get,set}_lowres(avctx)
      */
-    attribute_deprecated int lowres;
+     int lowres;
 #endif
 
 #if FF_API_CODED_FRAME
@@ -2708,7 +2947,7 @@ typedef struct AVCodecContext {
     int (*execute2)(struct AVCodecContext *c, int (*func)(struct AVCodecContext *c2, void *arg, int jobnr, int threadnr), void *arg2, int *ret, int count);
 
     /**
-     * noise vs. sse weight for the nsse comparsion function
+     * noise vs. sse weight for the nsse comparison function
      * - encoding: Set by user.
      * - decoding: unused
      */
@@ -2801,6 +3040,7 @@ typedef struct AVCodecContext {
 #define FF_PROFILE_HEVC_MAIN                        1
 #define FF_PROFILE_HEVC_MAIN_10                     2
 #define FF_PROFILE_HEVC_MAIN_STILL_PICTURE          3
+#define FF_PROFILE_HEVC_REXT                        4
 
     /**
      * level
@@ -2811,21 +3051,21 @@ typedef struct AVCodecContext {
 #define FF_LEVEL_UNKNOWN -99
 
     /**
-     *
+     * Skip loop filtering for selected frames.
      * - encoding: unused
      * - decoding: Set by user.
      */
     enum AVDiscard skip_loop_filter;
 
     /**
-     *
+     * Skip IDCT/dequantization for selected frames.
      * - encoding: unused
      * - decoding: Set by user.
      */
     enum AVDiscard skip_idct;
 
     /**
-     *
+     * Skip decoding for selected frames.
      * - encoding: unused
      * - decoding: Set by user.
      */
@@ -2891,7 +3131,7 @@ typedef struct AVCodecContext {
      */
     int initial_padding;
 
-    /*
+    /**
      * - decoding: For codecs that store a framerate value in the compressed
      *             bitstream, the decoder may export it here. { 0, 1} when
      *             unknown.
@@ -2905,8 +3145,153 @@ typedef struct AVCodecContext {
      * - decoding: Set by libavcodec before calling get_format()
      */
     enum AVPixelFormat sw_pix_fmt;
+
+    /**
+     * Timebase in which pkt_dts/pts and AVPacket.dts/pts are.
+     * Code outside libavcodec should access this field using:
+     * av_codec_{get,set}_pkt_timebase(avctx)
+     * - encoding unused.
+     * - decoding set by user.
+     */
+    AVRational pkt_timebase;
+
+    /**
+     * AVCodecDescriptor
+     * Code outside libavcodec should access this field using:
+     * av_codec_{get,set}_codec_descriptor(avctx)
+     * - encoding: unused.
+     * - decoding: set by libavcodec.
+     */
+    const AVCodecDescriptor *codec_descriptor;
+
+#if !FF_API_LOWRES
+    /**
+     * low resolution decoding, 1-> 1/2 size, 2->1/4 size
+     * - encoding: unused
+     * - decoding: Set by user.
+     * Code outside libavcodec should access this field using:
+     * av_codec_{get,set}_lowres(avctx)
+     */
+     int lowres;
+#endif
+
+    /**
+     * Current statistics for PTS correction.
+     * - decoding: maintained and used by libavcodec, not intended to be used by user apps
+     * - encoding: unused
+     */
+    int64_t pts_correction_num_faulty_pts; /// Number of incorrect PTS values so far
+    int64_t pts_correction_num_faulty_dts; /// Number of incorrect DTS values so far
+    int64_t pts_correction_last_pts;       /// PTS of the last frame
+    int64_t pts_correction_last_dts;       /// DTS of the last frame
+
+    /**
+     * Character encoding of the input subtitles file.
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    char *sub_charenc;
+
+    /**
+     * Subtitles character encoding mode. Formats or codecs might be adjusting
+     * this setting (if they are doing the conversion themselves for instance).
+     * - decoding: set by libavcodec
+     * - encoding: unused
+     */
+    int sub_charenc_mode;
+#define FF_SUB_CHARENC_MODE_DO_NOTHING  -1  ///< do nothing (demuxer outputs a stream supposed to be already in UTF-8, or the codec is bitmap for instance)
+#define FF_SUB_CHARENC_MODE_AUTOMATIC    0  ///< libavcodec will select the mode itself
+#define FF_SUB_CHARENC_MODE_PRE_DECODER  1  ///< the AVPacket data needs to be recoded to UTF-8 before being fed to the decoder, requires iconv
+
+    /**
+     * Skip processing alpha if supported by codec.
+     * Note that if the format uses pre-multiplied alpha (common with VP6,
+     * and recommended due to better video quality/compression)
+     * the image will look as if alpha-blended onto a black background.
+     * However for formats that do not use pre-multiplied alpha
+     * there might be serious artefacts (though e.g. libswscale currently
+     * assumes pre-multiplied alpha anyway).
+     * Code outside libavcodec should access this field using AVOptions
+     *
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    int skip_alpha;
+
+    /**
+     * Number of samples to skip after a discontinuity
+     * - decoding: unused
+     * - encoding: set by libavcodec
+     */
+    int seek_preroll;
+
+#if !FF_API_DEBUG_MV
+    /**
+     * debug motion vectors
+     * Code outside libavcodec should access this field using AVOptions
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    int debug_mv;
+#define FF_DEBUG_VIS_MV_P_FOR  0x00000001 //visualize forward predicted MVs of P frames
+#define FF_DEBUG_VIS_MV_B_FOR  0x00000002 //visualize forward predicted MVs of B frames
+#define FF_DEBUG_VIS_MV_B_BACK 0x00000004 //visualize backward predicted MVs of B frames
+#endif
+
+    /**
+     * custom intra quantization matrix
+     * Code outside libavcodec should access this field using av_codec_g/set_chroma_intra_matrix()
+     * - encoding: Set by user, can be NULL.
+     * - decoding: unused.
+     */
+    uint16_t *chroma_intra_matrix;
+
+    /**
+     * dump format separator.
+     * can be ", " or "\n      " or anything else
+     * Code outside libavcodec should access this field using AVOptions
+     * (NO direct access).
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    uint8_t *dump_separator;
+
+    /**
+     * ',' separated list of allowed decoders.
+     * If NULL then all are allowed
+     * - encoding: unused
+     * - decoding: set by user through AVOPtions (NO direct access)
+     */
+    char *codec_whitelist;
+
+    /*
+     * Properties of the stream that gets decoded
+     * To be accessed through av_codec_get_properties() (NO direct access)
+     * - encoding: unused
+     * - decoding: set by libavcodec
+     */
+    unsigned properties;
+#define FF_CODEC_PROPERTY_LOSSLESS        0x00000001
+#define FF_CODEC_PROPERTY_CLOSED_CAPTIONS 0x00000002
 } AVCodecContext;
 
+AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
+void       av_codec_set_pkt_timebase         (AVCodecContext *avctx, AVRational val);
+
+const AVCodecDescriptor *av_codec_get_codec_descriptor(const AVCodecContext *avctx);
+void                     av_codec_set_codec_descriptor(AVCodecContext *avctx, const AVCodecDescriptor *desc);
+
+unsigned av_codec_get_codec_properties(const AVCodecContext *avctx);
+
+int  av_codec_get_lowres(const AVCodecContext *avctx);
+void av_codec_set_lowres(AVCodecContext *avctx, int val);
+
+int  av_codec_get_seek_preroll(const AVCodecContext *avctx);
+void av_codec_set_seek_preroll(AVCodecContext *avctx, int val);
+
+uint16_t *av_codec_get_chroma_intra_matrix(const AVCodecContext *avctx);
+void av_codec_set_chroma_intra_matrix(AVCodecContext *avctx, uint16_t *val);
+
 /**
  * AVProfile.
  */
@@ -2947,9 +3332,7 @@ typedef struct AVCodec {
     const int *supported_samplerates;       ///< array of supported audio samplerates, or NULL if unknown, array is terminated by 0
     const enum AVSampleFormat *sample_fmts; ///< array of supported sample formats, or NULL if unknown, array is terminated by -1
     const uint64_t *channel_layouts;         ///< array of support channel layouts, or NULL if unknown. array is terminated by 0
-#if FF_API_LOWRES
-    attribute_deprecated uint8_t max_lowres; ///< maximum value for lowres supported by the decoder
-#endif
+    uint8_t max_lowres;                     ///< maximum value for lowres supported by the decoder, no direct access, use av_codec_get_max_lowres()
     const AVClass *priv_class;              ///< AVClass for the private context
     const AVProfile *profiles;              ///< array of recognized profiles, or NULL if unknown, array is terminated by {FF_PROFILE_UNKNOWN}
 
@@ -3021,6 +3404,10 @@ typedef struct AVCodec {
     int caps_internal;
 } AVCodec;
 
+int av_codec_get_max_lowres(const AVCodec *codec);
+
+struct MpegEncContext;
+
 /**
  * @defgroup lavc_hwaccel AVHWAccel
  * @{
@@ -3056,7 +3443,7 @@ typedef struct AVHWAccel {
 
     /**
      * Hardware accelerated codec capabilities.
-     * see FF_HWACCEL_CODEC_CAP_*
+     * see HWACCEL_CODEC_CAP_*
      */
     int capabilities;
 
@@ -3095,6 +3482,7 @@ typedef struct AVHWAccel {
      *
      * Meaningful slice information (codec specific) is guaranteed to
      * be parsed at this point. This function is mandatory.
+     * The only exception is XvMC, that works on MB level.
      *
      * @param avctx the codec context
      * @param buf the slice data buffer base
@@ -3124,6 +3512,17 @@ typedef struct AVHWAccel {
     int frame_priv_data_size;
 
     /**
+     * Called for every Macroblock in a slice.
+     *
+     * XvMC uses it to replace the ff_mpv_decode_mb().
+     * Instead of decoding to raw picture, MB parameters are
+     * stored in an array provided by the video driver.
+     *
+     * @param s the mpeg context
+     */
+    void (*decode_mb)(struct MpegEncContext *s);
+
+    /**
      * Initialize the hwaccel private data.
      *
      * This will be called from ff_get_format(), after hwaccel and
@@ -3151,6 +3550,9 @@ typedef struct AVHWAccel {
  * Hardware acceleration should be used for decoding even if the codec level
  * used is unknown or higher than the maximum supported level reported by the
  * hardware driver.
+ *
+ * It's generally a good idea to pass this flag unless you have a specific
+ * reason not to, as hardware tends to under-report supported levels.
  */
 #define AV_HWACCEL_FLAG_IGNORE_LEVEL (1 << 0)
 
@@ -3172,11 +3574,13 @@ typedef struct AVHWAccel {
  */
 
 /**
- * four components are given, that's all.
- * the last component is alpha
+ * Picture data structure.
+ *
+ * Up to four components can be stored into it, the last component is
+ * alpha.
  */
 typedef struct AVPicture {
-    uint8_t *data[AV_NUM_DATA_POINTERS];
+    uint8_t *data[AV_NUM_DATA_POINTERS];    ///< pointers to the image data planes
     int linesize[AV_NUM_DATA_POINTERS];     ///< number of bytes per line
 } AVPicture;
 
@@ -3184,9 +3588,6 @@ typedef struct AVPicture {
  * @}
  */
 
-#define AVPALETTE_SIZE 1024
-#define AVPALETTE_COUNT 256
-
 enum AVSubtitleType {
     SUBTITLE_NONE,
 
@@ -3216,7 +3617,7 @@ typedef struct AVSubtitleRect {
 
     /**
      * data+linesize for the bitmap of this subtitle.
-     * can be set for text/ass as well once they where rendered
+     * can be set for text/ass as well once they are rendered
      */
     AVPicture pict;
     enum AVSubtitleType type;
@@ -3225,10 +3626,11 @@ typedef struct AVSubtitleRect {
 
     /**
      * 0 terminated ASS/SSA compatible event line.
-     * The pressentation of this is unaffected by the other values in this
+     * The presentation of this is unaffected by the other values in this
      * struct.
      */
     char *ass;
+
     int flags;
 } AVSubtitleRect;
 
@@ -3327,13 +3729,29 @@ int avcodec_get_context_defaults3(AVCodecContext *s, const AVCodec *codec);
 const AVClass *avcodec_get_class(void);
 
 /**
+ * Get the AVClass for AVFrame. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *avcodec_get_frame_class(void);
+
+/**
+ * Get the AVClass for AVSubtitleRect. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *avcodec_get_subtitle_rect_class(void);
+
+/**
  * Copy the settings of the source AVCodecContext into the destination
  * AVCodecContext. The resulting destination codec context will be
  * unopened, i.e. you are required to call avcodec_open2() before you
  * can use this AVCodecContext to decode/encode video/audio data.
  *
  * @param dest target codec context, should be initialized with
- *             avcodec_alloc_context3(), but otherwise uninitialized
+ *             avcodec_alloc_context3(NULL), but otherwise uninitialized
  * @param src source codec context
  * @return AVERROR() on error (e.g. memory allocation error), 0 on success
  */
@@ -3465,6 +3883,20 @@ int av_packet_from_data(AVPacket *pkt, uint8_t *data, int size);
 int av_dup_packet(AVPacket *pkt);
 
 /**
+ * Copy packet, including contents
+ *
+ * @return 0 on success, negative AVERROR on fail
+ */
+int av_copy_packet(AVPacket *dst, const AVPacket *src);
+
+/**
+ * Copy packet side data
+ *
+ * @return 0 on success, negative AVERROR on fail
+ */
+int av_copy_packet_side_data(AVPacket *dst, const AVPacket *src);
+
+/**
  * Free a packet.
  *
  * @param pkt packet to free
@@ -3504,6 +3936,31 @@ int av_packet_shrink_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
 uint8_t* av_packet_get_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
                                  int *size);
 
+int av_packet_merge_side_data(AVPacket *pkt);
+
+int av_packet_split_side_data(AVPacket *pkt);
+
+const char *av_packet_side_data_name(enum AVPacketSideDataType type);
+
+/**
+ * Pack a dictionary for use in side_data.
+ *
+ * @param dict The dictionary to pack.
+ * @param size pointer to store the size of the returned data
+ * @return pointer to data if successful, NULL otherwise
+ */
+uint8_t *av_packet_pack_dictionary(AVDictionary *dict, int *size);
+/**
+ * Unpack a dictionary from side_data.
+ *
+ * @param data data from side_data
+ * @param size size of the data
+ * @param dict the metadata storage dictionary
+ * @return 0 on success, < 0 on failure
+ */
+int av_packet_unpack_dictionary(const uint8_t *data, int size, AVDictionary **dict);
+
+
 /**
  * Convenience function to free all the side data stored.
  * All the other fields stay untouched.
@@ -3528,7 +3985,7 @@ void av_packet_free_side_data(AVPacket *pkt);
  *
  * @return 0 on success, a negative AVERROR on error.
  */
-int av_packet_ref(AVPacket *dst, AVPacket *src);
+int av_packet_ref(AVPacket *dst, const AVPacket *src);
 
 /**
  * Wipe the packet.
@@ -3644,6 +4101,28 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
                                int linesize_align[AV_NUM_DATA_POINTERS]);
 
 /**
+ * Converts AVChromaLocation to swscale x/y chroma position.
+ *
+ * The positions represent the chroma (0,0) position in a coordinates system
+ * with luma (0,0) representing the origin and luma(1,1) representing 256,256
+ *
+ * @param xpos  horizontal chroma sample position
+ * @param ypos  vertical   chroma sample position
+ */
+int avcodec_enum_to_chroma_pos(int *xpos, int *ypos, enum AVChromaLocation pos);
+
+/**
+ * Converts swscale x/y chroma position to AVChromaLocation.
+ *
+ * The positions represent the chroma (0,0) position in a coordinates system
+ * with luma (0,0) representing the origin and luma(1,1) representing 256,256
+ *
+ * @param xpos  horizontal chroma sample position
+ * @param ypos  vertical   chroma sample position
+ */
+enum AVChromaLocation avcodec_chroma_pos_to_enum(int xpos, int ypos);
+
+/**
  * Decode the audio frame of size avpkt->size from avpkt->data into frame.
  *
  * Some decoders may support multiple frames in a single AVPacket. Such
@@ -3695,7 +4174,7 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
  *         AVPacket is returned.
  */
 int avcodec_decode_audio4(AVCodecContext *avctx, AVFrame *frame,
-                          int *got_frame_ptr, AVPacket *avpkt);
+                          int *got_frame_ptr, const AVPacket *avpkt);
 
 /**
  * Decode the video frame of size avpkt->size from avpkt->data into picture.
@@ -3731,7 +4210,7 @@ int avcodec_decode_audio4(AVCodecContext *avctx, AVFrame *frame,
  *             next call to this function or until closing or flushing the
  *             decoder. The caller may not write to it.
  *
- * @param[in] avpkt The input AVpacket containing the input buffer.
+ * @param[in] avpkt The input AVPacket containing the input buffer.
  *            You can create such packet with av_init_packet() and by then setting
  *            data and size, some decoders might in addition need other fields like
  *            flags&AV_PKT_FLAG_KEY. All decoders are designed to use the least
@@ -3742,7 +4221,7 @@ int avcodec_decode_audio4(AVCodecContext *avctx, AVFrame *frame,
  */
 int avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
                          int *got_picture_ptr,
-                         AVPacket *avpkt);
+                         const AVPacket *avpkt);
 
 /**
  * Decode a subtitle message.
@@ -3754,12 +4233,20 @@ int avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
  * and reusing a get_buffer written for video codecs would probably perform badly
  * due to a potentially very different allocation pattern.
  *
+ * Some decoders (those marked with CODEC_CAP_DELAY) have a delay between input
+ * and output. This means that for some packets they will not immediately
+ * produce decoded output and need to be flushed at the end of decoding to get
+ * all the decoded data. Flushing is done by calling this function with packets
+ * with avpkt->data set to NULL and avpkt->size set to 0 until it stops
+ * returning subtitles. It is safe to flush even those decoders that are not
+ * marked with CODEC_CAP_DELAY, then no subtitles will be returned.
+ *
  * @note The AVCodecContext MUST have been opened with @ref avcodec_open2()
  * before packets may be fed to the decoder.
  *
  * @param avctx the codec context
- * @param[out] sub The AVSubtitle in which the decoded subtitle will be stored, must be
-                   freed with avsubtitle_free if *got_sub_ptr is set.
+ * @param[out] sub The Preallocated AVSubtitle in which the decoded subtitle will be stored,
+ *                 must be freed with avsubtitle_free if *got_sub_ptr is set.
  * @param[in,out] got_sub_ptr Zero if no subtitle could be decompressed, otherwise, it is nonzero.
  * @param[in] avpkt The input AVPacket containing the input buffer.
  */
@@ -3817,6 +4304,7 @@ typedef struct AVCodecParserContext {
 #define PARSER_FLAG_ONCE                      0x0002
 /// Set if the parser has a valid file offset
 #define PARSER_FLAG_FETCHED_OFFSET            0x0004
+#define PARSER_FLAG_USE_CODEC_TS              0x1000
 
     int64_t offset;      ///< byte offset from starting packet start
     int64_t cur_frame_end[AV_PARSER_PTS_NB];
@@ -4002,7 +4490,7 @@ int av_parser_parse2(AVCodecParserContext *s,
 
 /**
  * @return 0 if the output buffer is a subset of the input, 1 if it is allocated and must be freed
- * @deprecated use AVBitstreamFilter
+ * @deprecated use AVBitStreamFilter
  */
 int av_parser_change(AVCodecParserContext *s,
                      AVCodecContext *avctx,
@@ -4049,11 +4537,12 @@ AVCodec *avcodec_find_encoder_by_name(const char *name);
  *                  The user can supply an output buffer by setting
  *                  avpkt->data and avpkt->size prior to calling the
  *                  function, but if the size of the user-provided data is not
- *                  large enough, encoding will fail. All other AVPacket fields
- *                  will be reset by the encoder using av_init_packet(). If
- *                  avpkt->data is NULL, the encoder will allocate it.
- *                  The encoder will set avpkt->size to the size of the
- *                  output packet.
+ *                  large enough, encoding will fail. If avpkt->data and
+ *                  avpkt->size are set, avpkt->destruct must also be set. All
+ *                  other AVPacket fields will be reset by the encoder using
+ *                  av_init_packet(). If avpkt->data is NULL, the encoder will
+ *                  allocate it. The encoder will set avpkt->size to the size
+ *                  of the output packet.
  *
  *                  If this function fails or produces no output, avpkt will be
  *                  freed using av_free_packet() (i.e. avpkt->destruct will be
@@ -4122,21 +4611,121 @@ int avcodec_encode_subtitle(AVCodecContext *avctx, uint8_t *buf, int buf_size,
  * @}
  */
 
+#if FF_API_AVCODEC_RESAMPLE
+/**
+ * @defgroup lavc_resample Audio resampling
+ * @ingroup libavc
+ * @deprecated use libswresample instead
+ *
+ * @{
+ */
+struct ReSampleContext;
+struct AVResampleContext;
+
+typedef struct ReSampleContext ReSampleContext;
+
+/**
+ *  Initialize audio resampling context.
+ *
+ * @param output_channels  number of output channels
+ * @param input_channels   number of input channels
+ * @param output_rate      output sample rate
+ * @param input_rate       input sample rate
+ * @param sample_fmt_out   requested output sample format
+ * @param sample_fmt_in    input sample format
+ * @param filter_length    length of each FIR filter in the filterbank relative to the cutoff frequency
+ * @param log2_phase_count log2 of the number of entries in the polyphase filterbank
+ * @param linear           if 1 then the used FIR filter will be linearly interpolated
+                           between the 2 closest, if 0 the closest will be used
+ * @param cutoff           cutoff frequency, 1.0 corresponds to half the output sampling rate
+ * @return allocated ReSampleContext, NULL if error occurred
+ */
+attribute_deprecated
+ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
+                                        int output_rate, int input_rate,
+                                        enum AVSampleFormat sample_fmt_out,
+                                        enum AVSampleFormat sample_fmt_in,
+                                        int filter_length, int log2_phase_count,
+                                        int linear, double cutoff);
+
+attribute_deprecated
+int audio_resample(ReSampleContext *s, short *output, short *input, int nb_samples);
+
+/**
+ * Free resample context.
+ *
+ * @param s a non-NULL pointer to a resample context previously
+ *          created with av_audio_resample_init()
+ */
+attribute_deprecated
+void audio_resample_close(ReSampleContext *s);
+
+
+/**
+ * Initialize an audio resampler.
+ * Note, if either rate is not an integer then simply scale both rates up so they are.
+ * @param filter_length length of each FIR filter in the filterbank relative to the cutoff freq
+ * @param log2_phase_count log2 of the number of entries in the polyphase filterbank
+ * @param linear If 1 then the used FIR filter will be linearly interpolated
+                 between the 2 closest, if 0 the closest will be used
+ * @param cutoff cutoff frequency, 1.0 corresponds to half the output sampling rate
+ */
+attribute_deprecated
+struct AVResampleContext *av_resample_init(int out_rate, int in_rate, int filter_length, int log2_phase_count, int linear, double cutoff);
+
+/**
+ * Resample an array of samples using a previously configured context.
+ * @param src an array of unconsumed samples
+ * @param consumed the number of samples of src which have been consumed are returned here
+ * @param src_size the number of unconsumed samples available
+ * @param dst_size the amount of space in samples available in dst
+ * @param update_ctx If this is 0 then the context will not be modified, that way several channels can be resampled with the same context.
+ * @return the number of samples written in dst or -1 if an error occurred
+ */
+attribute_deprecated
+int av_resample(struct AVResampleContext *c, short *dst, short *src, int *consumed, int src_size, int dst_size, int update_ctx);
+
+
+/**
+ * Compensate samplerate/timestamp drift. The compensation is done by changing
+ * the resampler parameters, so no audible clicks or similar distortions occur
+ * @param compensation_distance distance in output samples over which the compensation should be performed
+ * @param sample_delta number of output samples which should be output less
+ *
+ * example: av_resample_compensate(c, 10, 500)
+ * here instead of 510 samples only 500 samples would be output
+ *
+ * note, due to rounding the actual compensation might be slightly different,
+ * especially if the compensation_distance is large and the in_rate used during init is small
+ */
+attribute_deprecated
+void av_resample_compensate(struct AVResampleContext *c, int sample_delta, int compensation_distance);
+attribute_deprecated
+void av_resample_close(struct AVResampleContext *c);
+
+/**
+ * @}
+ */
+#endif
+
 /**
  * @addtogroup lavc_picture
  * @{
  */
 
 /**
- * Allocate memory for a picture.  Call avpicture_free() to free it.
+ * Allocate memory for the pixels of a picture and setup the AVPicture
+ * fields for it.
+ *
+ * Call avpicture_free() to free it.
  *
- * @see avpicture_fill()
+ * @param picture            the picture structure to be filled in
+ * @param pix_fmt            the pixel format of the picture
+ * @param width              the width of the picture
+ * @param height             the height of the picture
+ * @return zero if successful, a negative error code otherwise
  *
- * @param picture the picture to be filled in
- * @param pix_fmt the format of the picture
- * @param width the width of the picture
- * @param height the height of the picture
- * @return zero if successful, a negative value if not
+ * @see av_image_alloc(), avpicture_fill()
  */
 int avpicture_alloc(AVPicture *picture, enum AVPixelFormat pix_fmt, int width, int height);
 
@@ -4150,34 +4739,69 @@ int avpicture_alloc(AVPicture *picture, enum AVPixelFormat pix_fmt, int width, i
 void avpicture_free(AVPicture *picture);
 
 /**
- * Fill in the AVPicture fields, always assume a linesize alignment of 1.
+ * Setup the picture fields based on the specified image parameters
+ * and the provided image data buffer.
  *
- * @see av_image_fill_arrays().
+ * The picture fields are filled in by using the image data buffer
+ * pointed to by ptr.
+ *
+ * If ptr is NULL, the function will fill only the picture linesize
+ * array and return the required size for the image buffer.
+ *
+ * To allocate an image buffer and fill the picture data in one call,
+ * use avpicture_alloc().
+ *
+ * @param picture       the picture to be filled in
+ * @param ptr           buffer where the image data is stored, or NULL
+ * @param pix_fmt       the pixel format of the image
+ * @param width         the width of the image in pixels
+ * @param height        the height of the image in pixels
+ * @return the size in bytes required for src, a negative error code
+ * in case of failure
+ *
+ * @see av_image_fill_arrays()
  */
-int avpicture_fill(AVPicture *picture, uint8_t *ptr,
+int avpicture_fill(AVPicture *picture, const uint8_t *ptr,
                    enum AVPixelFormat pix_fmt, int width, int height);
 
 /**
- * Copy pixel data from an AVPicture into a buffer, always assume a
- * linesize alignment of 1.
+ * Copy pixel data from an AVPicture into a buffer.
+ *
+ * avpicture_get_size() can be used to compute the required size for
+ * the buffer to fill.
  *
- * @see av_image_copy_to_buffer().
+ * @param src        source picture with filled data
+ * @param pix_fmt    picture pixel format
+ * @param width      picture width
+ * @param height     picture height
+ * @param dest       destination buffer
+ * @param dest_size  destination buffer size in bytes
+ * @return the number of bytes written to dest, or a negative value
+ * (error code) on error, for example if the destination buffer is not
+ * big enough
+ *
+ * @see av_image_copy_to_buffer()
  */
-int avpicture_layout(const AVPicture* src, enum AVPixelFormat pix_fmt,
+int avpicture_layout(const AVPicture *src, enum AVPixelFormat pix_fmt,
                      int width, int height,
                      unsigned char *dest, int dest_size);
 
 /**
  * Calculate the size in bytes that a picture of the given width and height
  * would occupy if stored in the given picture format.
- * Always assume a linesize alignment of 1.
+ *
+ * @param pix_fmt    picture pixel format
+ * @param width      picture width
+ * @param height     picture height
+ * @return the computed picture buffer size or a negative error code
+ * in case of error
  *
  * @see av_image_get_buffer_size().
  */
 int avpicture_get_size(enum AVPixelFormat pix_fmt, int width, int height);
 
 /**
- * Copy image src to dst. Wraps av_picture_data_copy() above.
+ * Copy image src to dst. Wraps av_image_copy().
  */
 void av_picture_copy(AVPicture *dst, const AVPicture *src,
                      enum AVPixelFormat pix_fmt, int width, int height);
@@ -4215,10 +4839,21 @@ int av_picture_pad(AVPicture *dst, const AVPicture *src, int height, int width,
  */
 
 /**
- * @deprecated Use av_pix_fmt_get_chroma_sub_sample
+ * Utility function to access log2_chroma_w log2_chroma_h from
+ * the pixel format AVPixFmtDescriptor.
+ *
+ * This function asserts that pix_fmt is valid. See av_pix_fmt_get_chroma_sub_sample
+ * for one that returns a failure code and continues in case of invalid
+ * pix_fmts.
+ *
+ * @param[in]  pix_fmt the pixel format
+ * @param[out] h_shift store log2_chroma_w
+ * @param[out] v_shift store log2_chroma_h
+ *
+ * @see av_pix_fmt_get_chroma_sub_sample
  */
 
-void attribute_deprecated avcodec_get_chroma_sub_sample(enum AVPixelFormat pix_fmt, int *h_shift, int *v_shift);
+void avcodec_get_chroma_sub_sample(enum AVPixelFormat pix_fmt, int *h_shift, int *v_shift);
 
 /**
  * Return a value representing the fourCC code associated to the
@@ -4227,29 +4862,8 @@ void attribute_deprecated avcodec_get_chroma_sub_sample(enum AVPixelFormat pix_f
  */
 unsigned int avcodec_pix_fmt_to_codec_tag(enum AVPixelFormat pix_fmt);
 
-#define FF_LOSS_RESOLUTION  0x0001 /**< loss due to resolution change */
-#define FF_LOSS_DEPTH       0x0002 /**< loss due to color depth change */
-#define FF_LOSS_COLORSPACE  0x0004 /**< loss due to color space conversion */
-#define FF_LOSS_ALPHA       0x0008 /**< loss of alpha bits */
-#define FF_LOSS_COLORQUANT  0x0010 /**< loss due to color quantization */
-#define FF_LOSS_CHROMA      0x0020 /**< loss of chroma (e.g. RGB to gray conversion) */
-
-/**
- * Compute what kind of losses will occur when converting from one specific
- * pixel format to another.
- * When converting from one pixel format to another, information loss may occur.
- * For example, when converting from RGB24 to GRAY, the color information will
- * be lost. Similarly, other losses occur when converting from some formats to
- * other formats. These losses can involve loss of chroma, but also loss of
- * resolution, loss of color depth, loss due to the color space conversion, loss
- * of the alpha bits or loss due to color quantization.
- * avcodec_get_fix_fmt_loss() informs you about the various types of losses
- * which will occur when converting from one pixel format to another.
- *
- * @param[in] dst_pix_fmt destination pixel format
- * @param[in] src_pix_fmt source pixel format
- * @param[in] has_alpha Whether the source pixel format alpha channel is used.
- * @return Combination of flags informing you what kind of losses will occur.
+/**
+ * @deprecated see av_get_pix_fmt_loss()
  */
 int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt, enum AVPixelFormat src_pix_fmt,
                              int has_alpha);
@@ -4259,7 +4873,7 @@ int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt, enum AVPixelFormat
  * format.  When converting from one pixel format to another, information loss
  * may occur.  For example, when converting from RGB24 to GRAY, the color
  * information will be lost. Similarly, other losses occur when converting from
- * some formats to other formats. avcodec_find_best_pix_fmt2() searches which of
+ * some formats to other formats. avcodec_find_best_pix_fmt_of_2() searches which of
  * the given pixel formats should be used to suffer the least amount of loss.
  * The pixel formats from which it chooses one, are determined by the
  * pix_fmt_list parameter.
@@ -4271,9 +4885,26 @@ int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt, enum AVPixelFormat
  * @param[out] loss_ptr Combination of flags informing you what kind of losses will occur.
  * @return The best pixel format to convert to or -1 if none was found.
  */
-enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat *pix_fmt_list,
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_list(const enum AVPixelFormat *pix_fmt_list,
+                                            enum AVPixelFormat src_pix_fmt,
+                                            int has_alpha, int *loss_ptr);
+
+/**
+ * @deprecated see av_find_best_pix_fmt_of_2()
+ */
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr);
+
+attribute_deprecated
+#if AV_HAVE_INCOMPATIBLE_LIBAV_ABI
+enum AVPixelFormat avcodec_find_best_pix_fmt2(const enum AVPixelFormat *pix_fmt_list,
                                               enum AVPixelFormat src_pix_fmt,
                                               int has_alpha, int *loss_ptr);
+#else
+enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr);
+#endif
+
 
 enum AVPixelFormat avcodec_default_get_format(struct AVCodecContext *s, const enum AVPixelFormat * fmt);
 
@@ -4316,7 +4947,12 @@ int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2,
 //FIXME func typedef
 
 /**
- * Fill audio frame data and linesize.
+ * Fill AVFrame audio data and linesize pointers.
+ *
+ * The buffer buf must be a preallocated buffer with a size big enough
+ * to contain the specified samples amount. The filled AVFrame data
+ * pointers will point to this buffer.
+ *
  * AVFrame extended_data channel pointers are allocated if necessary for
  * planar audio.
  *
@@ -4329,7 +4965,9 @@ int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2,
  * @param buf         buffer to use for frame data
  * @param buf_size    size of buffer
  * @param align       plane size sample alignment (0 = default)
- * @return            0 on success, negative error code on failure
+ * @return            >=0 on success, negative error code on failure
+ * @todo return the size in bytes required to store the samples in
+ * case of success, at the next libavutil bump
  */
 int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
                              enum AVSampleFormat sample_fmt, const uint8_t *buf,
@@ -4355,6 +4993,14 @@ void avcodec_flush_buffers(AVCodecContext *avctx);
 int av_get_bits_per_sample(enum AVCodecID codec_id);
 
 /**
+ * Return the PCM codec associated with a sample format.
+ * @param be  endianness, 0 for little, 1 for big,
+ *            -1 (or anything else) for native
+ * @return  AV_CODEC_ID_PCM_* or AV_CODEC_ID_NONE
+ */
+enum AVCodecID av_get_pcm_codec(enum AVSampleFormat fmt, int be);
+
+/**
  * Return codec bits per sample.
  * Only return non-zero if the bits per sample is exactly correct, not an
  * approximation.
@@ -4394,28 +5040,98 @@ typedef struct AVBitStreamFilter {
     struct AVBitStreamFilter *next;
 } AVBitStreamFilter;
 
+/**
+ * Register a bitstream filter.
+ *
+ * The filter will be accessible to the application code through
+ * av_bitstream_filter_next() or can be directly initialized with
+ * av_bitstream_filter_init().
+ *
+ * @see avcodec_register_all()
+ */
 void av_register_bitstream_filter(AVBitStreamFilter *bsf);
+
+/**
+ * Create and initialize a bitstream filter context given a bitstream
+ * filter name.
+ *
+ * The returned context must be freed with av_bitstream_filter_close().
+ *
+ * @param name    the name of the bitstream filter
+ * @return a bitstream filter context if a matching filter was found
+ * and successfully initialized, NULL otherwise
+ */
 AVBitStreamFilterContext *av_bitstream_filter_init(const char *name);
+
+/**
+ * Filter bitstream.
+ *
+ * This function filters the buffer buf with size buf_size, and places the
+ * filtered buffer in the buffer pointed to by poutbuf.
+ *
+ * The output buffer must be freed by the caller.
+ *
+ * @param bsfc            bitstream filter context created by av_bitstream_filter_init()
+ * @param avctx           AVCodecContext accessed by the filter, may be NULL.
+ *                        If specified, this must point to the encoder context of the
+ *                        output stream the packet is sent to.
+ * @param args            arguments which specify the filter configuration, may be NULL
+ * @param poutbuf         pointer which is updated to point to the filtered buffer
+ * @param poutbuf_size    pointer which is updated to the filtered buffer size in bytes
+ * @param buf             buffer containing the data to filter
+ * @param buf_size        size in bytes of buf
+ * @param keyframe        set to non-zero if the buffer to filter corresponds to a key-frame packet data
+ * @return >= 0 in case of success, or a negative error code in case of failure
+ *
+ * If the return value is positive, an output buffer is allocated and
+ * is available in *poutbuf, and is distinct from the input buffer.
+ *
+ * If the return value is 0, the output buffer is not allocated and
+ * should be considered identical to the input buffer, or in case
+ * *poutbuf was set it points to the input buffer (not necessarily to
+ * its starting address).
+ */
 int av_bitstream_filter_filter(AVBitStreamFilterContext *bsfc,
                                AVCodecContext *avctx, const char *args,
                                uint8_t **poutbuf, int *poutbuf_size,
                                const uint8_t *buf, int buf_size, int keyframe);
+
+/**
+ * Release bitstream filter context.
+ *
+ * @param bsf the bitstream filter context created with
+ * av_bitstream_filter_init(), can be NULL
+ */
 void av_bitstream_filter_close(AVBitStreamFilterContext *bsf);
 
+/**
+ * If f is NULL, return the first registered bitstream filter,
+ * if f is non-NULL, return the next registered bitstream filter
+ * after f, or NULL if f is the last one.
+ *
+ * This function can be used to iterate over all registered bitstream
+ * filters.
+ */
 AVBitStreamFilter *av_bitstream_filter_next(const AVBitStreamFilter *f);
 
 /* memory */
 
 /**
- * Allocate a buffer with padding, reusing the given one if large enough.
- *
  * Same behaviour av_fast_malloc but the buffer has additional
- * AV_INPUT_PADDING_SIZE at the end which will always memset to 0.
+ * AV_INPUT_BUFFER_PADDING_SIZE at the end which will always be 0.
  *
+ * In addition the whole buffer will initially and after resizes
+ * be 0-initialized so that no uninitialized data will ever appear.
  */
 void av_fast_padded_malloc(void *ptr, unsigned int *size, size_t min_size);
 
 /**
+ * Same behaviour av_fast_padded_malloc except that buffer will always
+ * be 0-initialized after call.
+ */
+void av_fast_padded_mallocz(void *ptr, unsigned int *size, size_t min_size);
+
+/**
  * Encode extradata length to a buffer. Used by xiph codecs.
  *
  * @param s buffer to write to; must be at least (v/255+1) bytes long
@@ -4427,7 +5143,7 @@ unsigned int av_xiphlacing(unsigned char *s, unsigned int v);
 #if FF_API_MISSING_SAMPLE
 /**
  * Log a generic warning message about a missing feature. This function is
- * intended to be used internally by Libav (libavcodec, libavformat, etc.)
+ * intended to be used internally by FFmpeg (libavcodec, libavformat, etc.)
  * only, and would normally not be used by applications.
  * @param[in] avc a pointer to an arbitrary struct of which the first field is
  * a pointer to an AVClass struct
@@ -4443,7 +5159,7 @@ void av_log_missing_feature(void *avc, const char *feature, int want_sample);
 
 /**
  * Log a generic warning message asking for a sample. This function is
- * intended to be used internally by Libav (libavcodec, libavformat, etc.)
+ * intended to be used internally by FFmpeg (libavcodec, libavformat, etc.)
  * only, and would normally not be used by applications.
  * @param[in] avc a pointer to an arbitrary struct of which the first field is
  * a pointer to an AVClass struct
@@ -4508,6 +5224,12 @@ int av_lockmgr_register(int (*cb)(void **mutex, enum AVLockOp op));
 enum AVMediaType avcodec_get_type(enum AVCodecID codec_id);
 
 /**
+ * Get the name of a codec.
+ * @return  a static string identifying the codec; never NULL
+ */
+const char *avcodec_get_name(enum AVCodecID id);
+
+/**
  * @return a positive value if s is open (i.e. avcodec_open2() was called on it
  * with no corresponding avcodec_close()), 0 otherwise.
  */
diff --git a/libavcodec/avcodecres.rc b/libavcodec/avcodecres.rc
new file mode 100644
index 0000000000..4b69686177
--- /dev/null
+++ b/libavcodec/avcodecres.rc
@@ -0,0 +1,55 @@
+/*
+ * Windows resource file for libavcodec
+ *
+ * Copyright (C) 2012 James Almer
+ * Copyright (C) 2013 Tiancheng "Timothy" Gu
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <windows.h>
+#include "libavcodec/version.h"
+#include "libavutil/ffversion.h"
+#include "config.h"
+
+1 VERSIONINFO
+FILEVERSION     LIBAVCODEC_VERSION_MAJOR, LIBAVCODEC_VERSION_MINOR, LIBAVCODEC_VERSION_MICRO, 0
+PRODUCTVERSION  LIBAVCODEC_VERSION_MAJOR, LIBAVCODEC_VERSION_MINOR, LIBAVCODEC_VERSION_MICRO, 0
+FILEFLAGSMASK   VS_FFI_FILEFLAGSMASK
+FILEOS          VOS_NT_WINDOWS32
+FILETYPE        VFT_DLL
+{
+    BLOCK "StringFileInfo"
+    {
+        BLOCK "040904B0"
+        {
+            VALUE "CompanyName",      "FFmpeg Project"
+            VALUE "FileDescription",  "FFmpeg codec library"
+            VALUE "FileVersion",      AV_STRINGIFY(LIBAVCODEC_VERSION)
+            VALUE "InternalName",     "libavcodec"
+            VALUE "LegalCopyright",   "Copyright (C) 2000-" AV_STRINGIFY(CONFIG_THIS_YEAR) " FFmpeg Project"
+            VALUE "OriginalFilename", "avcodec" BUILDSUF "-" AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR) SLIBSUF
+            VALUE "ProductName",      "FFmpeg"
+            VALUE "ProductVersion",   FFMPEG_VERSION
+        }
+    }
+
+    BLOCK "VarFileInfo"
+    {
+        VALUE "Translation", 0x0409, 0x04B0
+    }
+}
diff --git a/libavcodec/avdct.c b/libavcodec/avdct.c
new file mode 100644
index 0000000000..3b622bac23
--- /dev/null
+++ b/libavcodec/avdct.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2014 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "idctdsp.h"
+#include "fdctdsp.h"
+#include "pixblockdsp.h"
+#include "avdct.h"
+
+#define OFFSET(x) offsetof(AVDCT,x)
+#define DEFAULT 0 //should be NAN but it does not work as it is not a constant in glibc as required by ANSI/ISO C
+//these names are too long to be readable
+#define V AV_OPT_FLAG_VIDEO_PARAM
+#define A AV_OPT_FLAG_AUDIO_PARAM
+#define E AV_OPT_FLAG_ENCODING_PARAM
+#define D AV_OPT_FLAG_DECODING_PARAM
+
+static const AVOption avdct_options[] = {
+{"dct", "DCT algorithm", OFFSET(dct_algo), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|E, "dct"},
+{"auto", "autoselect a good one (default)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_AUTO }, INT_MIN, INT_MAX, V|E, "dct"},
+{"fastint", "fast integer (experimental / for debugging)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FASTINT }, INT_MIN, INT_MAX, V|E, "dct"},
+{"int", "accurate integer", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_INT }, INT_MIN, INT_MAX, V|E, "dct"},
+{"mmx", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_MMX }, INT_MIN, INT_MAX, V|E, "dct"},
+{"altivec", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_ALTIVEC }, INT_MIN, INT_MAX, V|E, "dct"},
+{"faan", "floating point AAN DCT (experimental / for debugging)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FAAN }, INT_MIN, INT_MAX, V|E, "dct"},
+
+{"idct", "select IDCT implementation", OFFSET(idct_algo), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|E|D, "idct"},
+{"auto", "autoselect a good one (default)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_AUTO }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"int", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_INT }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simple", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLE }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simplemmx", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEMMX }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"arm", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_ARM }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"altivec", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_ALTIVEC }, INT_MIN, INT_MAX, V|E|D, "idct"},
+#if FF_API_ARCH_SH4
+{"sh4", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SH4 }, INT_MIN, INT_MAX, V|E|D, "idct"},
+#endif
+{"simplearm", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARM }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simplearmv5te", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV5TE }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simplearmv6", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV6 }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simpleneon", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLENEON }, INT_MIN, INT_MAX, V|E|D, "idct"},
+#if FF_API_ARCH_ALPHA
+{"simplealpha", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEALPHA }, INT_MIN, INT_MAX, V|E|D, "idct"},
+#endif
+#if FF_API_UNUSED_MEMBERS
+{"ipp", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_IPP }, INT_MIN, INT_MAX, V|E|D, "idct"},
+#endif
+{"xvid", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"xvidmmx", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"faani", "floating point AAN IDCT (experimental / for debugging)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_FAAN }, INT_MIN, INT_MAX, V|D|E, "idct"},
+{"simpleauto", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEAUTO }, INT_MIN, INT_MAX, V|E|D, "idct"},
+
+{"bits_per_sample", "", OFFSET(bits_per_sample), AV_OPT_TYPE_INT, {.i64 = 8 }, 0, 14, 0,},
+{NULL},
+};
+
+static const AVClass avdct_class = {
+    .class_name              = "AVDCT",
+    .option                  = avdct_options,
+    .version                 = LIBAVUTIL_VERSION_INT,
+};
+
+const AVClass *avcodec_dct_get_class(void)
+{
+    return &avdct_class;
+}
+
+AVDCT *avcodec_dct_alloc(void)
+{
+    AVDCT *dsp = av_mallocz(sizeof(AVDCT));
+
+    if (!dsp)
+        return NULL;
+
+    dsp->av_class = &avdct_class;
+    av_opt_set_defaults(dsp);
+
+    return dsp;
+}
+
+int avcodec_dct_init(AVDCT *dsp)
+{
+    AVCodecContext *avctx = avcodec_alloc_context3(NULL);
+
+    if (!avctx)
+        return AVERROR(ENOMEM);
+
+    avctx->idct_algo = dsp->idct_algo;
+    avctx->dct_algo  = dsp->dct_algo;
+    avctx->bits_per_raw_sample = dsp->bits_per_sample;
+
+#define COPY(src, name) memcpy(&dsp->name, &src.name, sizeof(dsp->name))
+
+#if CONFIG_IDCTDSP
+    {
+        IDCTDSPContext idsp;
+        ff_idctdsp_init(&idsp, avctx);
+        COPY(idsp, idct);
+        COPY(idsp, idct_permutation);
+    }
+#endif
+
+#if CONFIG_FDCTDSP
+    {
+        FDCTDSPContext fdsp;
+        ff_fdctdsp_init(&fdsp, avctx);
+        COPY(fdsp, fdct);
+    }
+#endif
+
+#if CONFIG_PIXBLOCKDSP
+    {
+        PixblockDSPContext pdsp;
+        ff_pixblockdsp_init(&pdsp, avctx);
+        COPY(pdsp, get_pixels);
+    }
+#endif
+
+    avcodec_close(avctx);
+    av_free(avctx);
+
+    return 0;
+}
diff --git a/libavcodec/avdct.h b/libavcodec/avdct.h
new file mode 100644
index 0000000000..272422e44c
--- /dev/null
+++ b/libavcodec/avdct.h
@@ -0,0 +1,84 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AVDCT_H
+#define AVCODEC_AVDCT_H
+
+#include "libavutil/opt.h"
+
+/**
+ * AVDCT context.
+ * @note function pointers can be NULL if the specific features have been
+ *       disabled at build time.
+ */
+typedef struct AVDCT {
+    const AVClass *av_class;
+
+    void (*idct)(int16_t *block /* align 16 */);
+
+    /**
+     * IDCT input permutation.
+     * Several optimized IDCTs need a permutated input (relative to the
+     * normal order of the reference IDCT).
+     * This permutation must be performed before the idct_put/add.
+     * Note, normally this can be merged with the zigzag/alternate scan<br>
+     * An example to avoid confusion:
+     * - (->decode coeffs -> zigzag reorder -> dequant -> reference IDCT -> ...)
+     * - (x -> reference DCT -> reference IDCT -> x)
+     * - (x -> reference DCT -> simple_mmx_perm = idct_permutation
+     *    -> simple_idct_mmx -> x)
+     * - (-> decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant
+     *    -> simple_idct_mmx -> ...)
+     */
+    uint8_t idct_permutation[64];
+
+    void (*fdct)(int16_t *block /* align 16 */);
+
+
+    /**
+     * DCT algorithm.
+     * must use AVOptions to set this field.
+     */
+    int dct_algo;
+
+    /**
+     * IDCT algorithm.
+     * must use AVOptions to set this field.
+     */
+    int idct_algo;
+
+    void (*get_pixels)(int16_t *block /* align 16 */,
+                       const uint8_t *pixels /* align 8 */,
+                       ptrdiff_t line_size);
+
+    int bits_per_sample;
+} AVDCT;
+
+/**
+ * Allocates a AVDCT context.
+ * This needs to be initialized with avcodec_dct_init() after optionally
+ * configuring it with AVOptions.
+ *
+ * To free it use av_free()
+ */
+AVDCT *avcodec_dct_alloc(void);
+int avcodec_dct_init(AVDCT *);
+
+const AVClass *avcodec_dct_get_class(void);
+
+#endif /* AVCODEC_AVDCT_H */
diff --git a/libavcodec/avfft.c b/libavcodec/avfft.c
index 513f57e5f4..675d2b906b 100644
--- a/libavcodec/avfft.c
+++ b/libavcodec/avfft.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 
 FFTContext *av_fft_init(int nbits, int inverse)
 {
-    FFTContext *s = av_malloc(sizeof(*s));
+    FFTContext *s = av_mallocz(sizeof(*s));
 
     if (s && ff_fft_init(s, nbits, inverse))
         av_freep(&s);
@@ -142,4 +142,38 @@ av_cold void av_dct_end(DCTContext *s)
     }
 }
 
+#ifdef TEST
+int main(int argc, char **argv)
+{
+    int i;
+#define LEN 1024
+    FFTSample *ref  = av_malloc_array(LEN, sizeof(*ref));
+    FFTSample *data = av_malloc_array(LEN, sizeof(*data));
+    RDFTContext *rdft_context  = av_rdft_init(10, DFT_R2C);
+    RDFTContext *irdft_context = av_rdft_init(10, IDFT_C2R);
+
+    if (!ref || !data || !rdft_context || !irdft_context)
+        return 2;
+    for (i=0; i<LEN; i++) {
+        ref[i] = data[i] = i*456 + 123 + i*i;
+    }
+    av_rdft_calc(rdft_context, data);
+    av_rdft_calc(irdft_context, data);
+
+    for (i=0; i<LEN; i++) {
+        if (fabs(ref[i] - data[i]/LEN*2) > 1) {
+            fprintf(stderr, "Failed at %d (%f %f)\n", i, ref[i], data[i]/LEN*2);
+            return 1;
+        }
+    }
+
+    av_rdft_end(rdft_context);
+    av_rdft_end(irdft_context);
+    av_free(data);
+    av_free(ref);
+
+    return 0;
+}
+#endif
+
 #endif /* CONFIG_DCT */
diff --git a/libavcodec/avfft.h b/libavcodec/avfft.h
index e2e727da9e..0c0f9b8d8d 100644
--- a/libavcodec/avfft.h
+++ b/libavcodec/avfft.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/avpacket.c b/libavcodec/avpacket.c
index e762a8782a..0496254cf4 100644
--- a/libavcodec/avpacket.c
+++ b/libavcodec/avpacket.c
@@ -2,20 +2,20 @@
  * AVPacket functions for libavcodec
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,8 @@
 #include "libavutil/mathematics.h"
 #include "libavutil/mem.h"
 #include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
 
 void av_init_packet(AVPacket *pkt)
 {
@@ -155,32 +157,50 @@ do {                                         \
         dst = data;                                                     \
     } while (0)
 
-int av_dup_packet(AVPacket *pkt)
+/* Makes duplicates of data, side_data, but does not copy any other fields */
+static int copy_packet_data(AVPacket *pkt, const AVPacket *src, int dup)
 {
-    AVPacket tmp_pkt;
-
-    if (!pkt->buf && pkt->data) {
-        tmp_pkt = *pkt;
-
-        pkt->data      = NULL;
-        pkt->side_data = NULL;
-        DUP_DATA(pkt->data, tmp_pkt.data, pkt->size, 1, ALLOC_BUF);
+    pkt->data      = NULL;
+    pkt->side_data = NULL;
+    if (pkt->buf) {
+        AVBufferRef *ref = av_buffer_ref(src->buf);
+        if (!ref)
+            return AVERROR(ENOMEM);
+        pkt->buf  = ref;
+        pkt->data = ref->data;
+    } else {
+        DUP_DATA(pkt->data, src->data, pkt->size, 1, ALLOC_BUF);
+    }
+    if (pkt->side_data_elems && dup)
+        pkt->side_data = src->side_data;
+    if (pkt->side_data_elems && !dup) {
+        return av_copy_packet_side_data(pkt, src);
+    }
+    return 0;
 
-        if (pkt->side_data_elems) {
-            int i;
+failed_alloc:
+    av_free_packet(pkt);
+    return AVERROR(ENOMEM);
+}
 
-            DUP_DATA(pkt->side_data, tmp_pkt.side_data,
-                     pkt->side_data_elems * sizeof(*pkt->side_data), 0, ALLOC_MALLOC);
+int av_copy_packet_side_data(AVPacket *pkt, const AVPacket *src)
+{
+    if (src->side_data_elems) {
+        int i;
+        DUP_DATA(pkt->side_data, src->side_data,
+                src->side_data_elems * sizeof(*src->side_data), 0, ALLOC_MALLOC);
+        if (src != pkt) {
             memset(pkt->side_data, 0,
-                   pkt->side_data_elems * sizeof(*pkt->side_data));
-            for (i = 0; i < pkt->side_data_elems; i++) {
-                DUP_DATA(pkt->side_data[i].data, tmp_pkt.side_data[i].data,
-                         tmp_pkt.side_data[i].size, 1, ALLOC_MALLOC);
-                pkt->side_data[i].size = tmp_pkt.side_data[i].size;
-                pkt->side_data[i].type = tmp_pkt.side_data[i].type;
-            }
+                   src->side_data_elems * sizeof(*src->side_data));
+        }
+        for (i = 0; i < src->side_data_elems; i++) {
+            DUP_DATA(pkt->side_data[i].data, src->side_data[i].data,
+                    src->side_data[i].size, 1, ALLOC_MALLOC);
+            pkt->side_data[i].size = src->side_data[i].size;
+            pkt->side_data[i].type = src->side_data[i].type;
         }
     }
+    pkt->side_data_elems = src->side_data_elems;
     return 0;
 
 failed_alloc:
@@ -188,11 +208,28 @@ failed_alloc:
     return AVERROR(ENOMEM);
 }
 
+int av_dup_packet(AVPacket *pkt)
+{
+    AVPacket tmp_pkt;
+
+    if (!pkt->buf && pkt->data) {
+        tmp_pkt = *pkt;
+        return copy_packet_data(pkt, &tmp_pkt, 1);
+    }
+    return 0;
+}
+
+int av_copy_packet(AVPacket *dst, const AVPacket *src)
+{
+    *dst = *src;
+    return copy_packet_data(dst, src, 0);
+}
+
 void av_packet_free_side_data(AVPacket *pkt)
 {
     int i;
     for (i = 0; i < pkt->side_data_elems; i++)
-        av_free(pkt->side_data[i].data);
+        av_freep(&pkt->side_data[i].data);
     av_freep(&pkt->side_data);
     pkt->side_data_elems = 0;
 }
@@ -224,7 +261,7 @@ uint8_t *av_packet_new_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
     if (!pkt->side_data)
         return NULL;
 
-    pkt->side_data[elems].data = av_malloc(size + AV_INPUT_BUFFER_PADDING_SIZE);
+    pkt->side_data[elems].data = av_mallocz(size + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!pkt->side_data[elems].data)
         return NULL;
     pkt->side_data[elems].size = size;
@@ -249,6 +286,168 @@ uint8_t *av_packet_get_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
     return NULL;
 }
 
+const char *av_packet_side_data_name(enum AVPacketSideDataType type)
+{
+    switch(type) {
+    case AV_PKT_DATA_PALETTE:                   return "Palette";
+    case AV_PKT_DATA_NEW_EXTRADATA:             return "New Extradata";
+    case AV_PKT_DATA_PARAM_CHANGE:              return "Param Change";
+    case AV_PKT_DATA_H263_MB_INFO:              return "H263 MB Info";
+    case AV_PKT_DATA_REPLAYGAIN:                return "Replay Gain";
+    case AV_PKT_DATA_DISPLAYMATRIX:             return "Display Matrix";
+    case AV_PKT_DATA_STEREO3D:                  return "Stereo 3D";
+    case AV_PKT_DATA_AUDIO_SERVICE_TYPE:        return "Audio Service Type";
+    case AV_PKT_DATA_SKIP_SAMPLES:              return "Skip Samples";
+    case AV_PKT_DATA_JP_DUALMONO:               return "JP Dual Mono";
+    case AV_PKT_DATA_STRINGS_METADATA:          return "Strings Metadata";
+    case AV_PKT_DATA_SUBTITLE_POSITION:         return "Subtitle Position";
+    case AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL:  return "Matroska BlockAdditional";
+    case AV_PKT_DATA_WEBVTT_IDENTIFIER:         return "WebVTT ID";
+    case AV_PKT_DATA_WEBVTT_SETTINGS:           return "WebVTT Settings";
+    case AV_PKT_DATA_METADATA_UPDATE:           return "Metadata Update";
+    }
+    return NULL;
+}
+
+#define FF_MERGE_MARKER 0x8c4d9d108e25e9feULL
+
+int av_packet_merge_side_data(AVPacket *pkt){
+    if(pkt->side_data_elems){
+        AVBufferRef *buf;
+        int i;
+        uint8_t *p;
+        uint64_t size= pkt->size + 8LL + AV_INPUT_BUFFER_PADDING_SIZE;
+        AVPacket old= *pkt;
+        for (i=0; i<old.side_data_elems; i++) {
+            size += old.side_data[i].size + 5LL;
+        }
+        if (size > INT_MAX)
+            return AVERROR(EINVAL);
+        buf = av_buffer_alloc(size);
+        if (!buf)
+            return AVERROR(ENOMEM);
+        pkt->buf = buf;
+        pkt->data = p = buf->data;
+        pkt->size = size - AV_INPUT_BUFFER_PADDING_SIZE;
+        bytestream_put_buffer(&p, old.data, old.size);
+        for (i=old.side_data_elems-1; i>=0; i--) {
+            bytestream_put_buffer(&p, old.side_data[i].data, old.side_data[i].size);
+            bytestream_put_be32(&p, old.side_data[i].size);
+            *p++ = old.side_data[i].type | ((i==old.side_data_elems-1)*128);
+        }
+        bytestream_put_be64(&p, FF_MERGE_MARKER);
+        av_assert0(p-pkt->data == pkt->size);
+        memset(p, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+        av_free_packet(&old);
+        pkt->side_data_elems = 0;
+        pkt->side_data = NULL;
+        return 1;
+    }
+    return 0;
+}
+
+int av_packet_split_side_data(AVPacket *pkt){
+    if (!pkt->side_data_elems && pkt->size >12 && AV_RB64(pkt->data + pkt->size - 8) == FF_MERGE_MARKER){
+        int i;
+        unsigned int size;
+        uint8_t *p;
+
+        p = pkt->data + pkt->size - 8 - 5;
+        for (i=1; ; i++){
+            size = AV_RB32(p);
+            if (size>INT_MAX || p - pkt->data < size)
+                return 0;
+            if (p[4]&128)
+                break;
+            p-= size+5;
+        }
+
+        pkt->side_data = av_malloc_array(i, sizeof(*pkt->side_data));
+        if (!pkt->side_data)
+            return AVERROR(ENOMEM);
+
+        p= pkt->data + pkt->size - 8 - 5;
+        for (i=0; ; i++){
+            size= AV_RB32(p);
+            av_assert0(size<=INT_MAX && p - pkt->data >= size);
+            pkt->side_data[i].data = av_mallocz(size + AV_INPUT_BUFFER_PADDING_SIZE);
+            pkt->side_data[i].size = size;
+            pkt->side_data[i].type = p[4]&127;
+            if (!pkt->side_data[i].data)
+                return AVERROR(ENOMEM);
+            memcpy(pkt->side_data[i].data, p-size, size);
+            pkt->size -= size + 5;
+            if(p[4]&128)
+                break;
+            p-= size+5;
+        }
+        pkt->size -= 8;
+        pkt->side_data_elems = i+1;
+        return 1;
+    }
+    return 0;
+}
+
+uint8_t *av_packet_pack_dictionary(AVDictionary *dict, int *size)
+{
+    AVDictionaryEntry *t = NULL;
+    uint8_t *data = NULL;
+    *size = 0;
+
+    if (!dict)
+        return NULL;
+
+    while ((t = av_dict_get(dict, "", t, AV_DICT_IGNORE_SUFFIX))) {
+        const size_t keylen   = strlen(t->key);
+        const size_t valuelen = strlen(t->value);
+        const size_t new_size = *size + keylen + 1 + valuelen + 1;
+        uint8_t *const new_data = av_realloc(data, new_size);
+
+        if (!new_data)
+            goto fail;
+        data = new_data;
+        if (new_size > INT_MAX)
+            goto fail;
+
+        memcpy(data + *size, t->key, keylen + 1);
+        memcpy(data + *size + keylen + 1, t->value, valuelen + 1);
+
+        *size = new_size;
+    }
+
+    return data;
+
+fail:
+    av_freep(&data);
+    *size = 0;
+    return NULL;
+}
+
+int av_packet_unpack_dictionary(const uint8_t *data, int size, AVDictionary **dict)
+{
+    const uint8_t *end = data + size;
+    int ret = 0;
+
+    if (!dict || !data || !size)
+        return ret;
+    if (size && end[-1])
+        return AVERROR_INVALIDDATA;
+    while (data < end) {
+        const uint8_t *key = data;
+        const uint8_t *val = data + strlen(key) + 1;
+
+        if (val >= end)
+            return AVERROR_INVALIDDATA;
+
+        ret = av_dict_set(dict, key, val, 0);
+        if (ret < 0)
+            break;
+        data = val + strlen(val) + 1;
+    }
+
+    return ret;
+}
+
 int av_packet_shrink_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
                                int size)
 {
@@ -306,7 +505,7 @@ void av_packet_unref(AVPacket *pkt)
     pkt->size = 0;
 }
 
-int av_packet_ref(AVPacket *dst, AVPacket *src)
+int av_packet_ref(AVPacket *dst, const AVPacket *src)
 {
     int ret;
 
@@ -356,3 +555,28 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 }
+
+int ff_side_data_set_encoder_stats(AVPacket *pkt, int quality, int64_t *error, int error_count, int pict_type)
+{
+    uint8_t *side_data;
+    int side_data_size;
+    int i;
+
+    side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_QUALITY_STATS, &side_data_size);
+    if (!side_data) {
+        side_data_size = 4+4+8*error_count;
+        side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_STATS,
+                                            side_data_size);
+    }
+
+    if (!side_data || side_data_size < 4+4+8*error_count)
+        return AVERROR(ENOMEM);
+
+    AV_WL32(side_data   , quality  );
+    side_data[4] = pict_type;
+    side_data[5] = error_count;
+    for (i = 0; i<error_count; i++)
+        AV_WL64(side_data+8 + 8*i , error[i]);
+
+    return 0;
+}
diff --git a/libavcodec/avpicture.c b/libavcodec/avpicture.c
index 6bde0f8445..0484dc3f7b 100644
--- a/libavcodec/avpicture.c
+++ b/libavcodec/avpicture.c
@@ -2,20 +2,20 @@
  * AVPicture management routines
  * Copyright (c) 2001, 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,19 +31,18 @@
 #include "libavutil/imgutils.h"
 #include "libavutil/colorspace.h"
 
-int avpicture_fill(AVPicture *picture, uint8_t *ptr,
+int avpicture_fill(AVPicture *picture, const uint8_t *ptr,
                    enum AVPixelFormat pix_fmt, int width, int height)
 {
     return av_image_fill_arrays(picture->data, picture->linesize,
                                 ptr, pix_fmt, width, height, 1);
 }
 
-int avpicture_layout(const AVPicture* src, enum AVPixelFormat pix_fmt,
-                     int width, int height,
+int avpicture_layout(const AVPicture* src, enum AVPixelFormat pix_fmt, int width, int height,
                      unsigned char *dest, int dest_size)
 {
     return av_image_copy_to_buffer(dest, dest_size,
-                                   src->data, src->linesize,
+                                   (const uint8_t * const*)src->data, src->linesize,
                                    pix_fmt, width, height, 1);
 }
 
@@ -67,13 +66,13 @@ int avpicture_alloc(AVPicture *picture,
 
 void avpicture_free(AVPicture *picture)
 {
-    av_free(picture->data[0]);
+    av_freep(&picture->data[0]);
 }
 
 void av_picture_copy(AVPicture *dst, const AVPicture *src,
                      enum AVPixelFormat pix_fmt, int width, int height)
 {
-    av_image_copy(dst->data, dst->linesize, src->data,
+    av_image_copy(dst->data, dst->linesize, (const uint8_t **)src->data,
                   src->linesize, pix_fmt, width, height);
 }
 
diff --git a/libavcodec/avr32/mathops.h b/libavcodec/avr32/mathops.h
index 528b7adb33..85f42b594d 100644
--- a/libavcodec/avr32/mathops.h
+++ b/libavcodec/avr32/mathops.h
@@ -2,20 +2,20 @@
  * Simple math operations
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/avrndec.c b/libavcodec/avrndec.c
new file mode 100644
index 0000000000..5e40d664dc
--- /dev/null
+++ b/libavcodec/avrndec.c
@@ -0,0 +1,173 @@
+/*
+ * AVRn decoder
+ * Copyright (c) 2012 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "mjpeg.h"
+#include "mjpegdec.h"
+#include "libavutil/imgutils.h"
+
+typedef struct {
+    AVCodecContext *mjpeg_avctx;
+    int is_mjpeg;
+    int interlace; //FIXME use frame.interlaced_frame
+    int tff;
+} AVRnContext;
+
+static av_cold int init(AVCodecContext *avctx)
+{
+    AVRnContext *a = avctx->priv_data;
+    int ret;
+
+    // Support "Resolution 1:1" for Avid AVI Codec
+    a->is_mjpeg = avctx->extradata_size < 31 || memcmp(&avctx->extradata[28], "1:1", 3);
+
+    if(!a->is_mjpeg && avctx->lowres) {
+        av_log(avctx, AV_LOG_ERROR, "lowres is not possible with rawvideo\n");
+        return AVERROR(EINVAL);
+    }
+
+    if(a->is_mjpeg) {
+        AVCodec *codec = avcodec_find_decoder(AV_CODEC_ID_MJPEG);
+        AVDictionary *thread_opt = NULL;
+        if (!codec) {
+            av_log(avctx, AV_LOG_ERROR, "MJPEG codec not found\n");
+            return AVERROR_DECODER_NOT_FOUND;
+        }
+
+        a->mjpeg_avctx = avcodec_alloc_context3(codec);
+
+        av_dict_set(&thread_opt, "threads", "1", 0); // Is this needed ?
+        a->mjpeg_avctx->refcounted_frames = 1;
+        a->mjpeg_avctx->flags = avctx->flags;
+        a->mjpeg_avctx->idct_algo = avctx->idct_algo;
+        a->mjpeg_avctx->lowres = avctx->lowres;
+        a->mjpeg_avctx->width = avctx->width;
+        a->mjpeg_avctx->height = avctx->height;
+
+        if ((ret = ff_codec_open2_recursive(a->mjpeg_avctx, codec, &thread_opt)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "MJPEG codec failed to open\n");
+        }
+        av_dict_free(&thread_opt);
+
+        return ret;
+    }
+
+    if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
+        return ret;
+
+    avctx->pix_fmt = AV_PIX_FMT_UYVY422;
+
+    if(avctx->extradata_size >= 9 && avctx->extradata[4]+28 < avctx->extradata_size) {
+        int ndx = avctx->extradata[4] + 4;
+        a->interlace = !memcmp(avctx->extradata + ndx, "1:1(", 4);
+        if(a->interlace) {
+            a->tff = avctx->extradata[ndx + 24] == 1;
+        }
+    }
+
+    return 0;
+}
+
+static av_cold int end(AVCodecContext *avctx)
+{
+    AVRnContext *a = avctx->priv_data;
+
+    avcodec_close(a->mjpeg_avctx);
+    av_freep(&a->mjpeg_avctx);
+
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame, AVPacket *avpkt)
+{
+    AVRnContext *a = avctx->priv_data;
+    AVFrame *p = data;
+    const uint8_t *buf = avpkt->data;
+    int buf_size       = avpkt->size;
+    int y, ret, true_height;
+
+    if(a->is_mjpeg) {
+        ret = avcodec_decode_video2(a->mjpeg_avctx, data, got_frame, avpkt);
+
+        if (ret >= 0 && *got_frame && avctx->width <= p->width && avctx->height <= p->height) {
+            int shift = p->height - avctx->height;
+            int subsample_h, subsample_v;
+
+            av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &subsample_h, &subsample_v);
+
+            p->data[0] += p->linesize[0] * shift;
+            if (p->data[2]) {
+                p->data[1] += p->linesize[1] * (shift>>subsample_v);
+                p->data[2] += p->linesize[2] * (shift>>subsample_v);
+            }
+
+            p->width  = avctx->width;
+            p->height = avctx->height;
+        }
+        avctx->pix_fmt = a->mjpeg_avctx->pix_fmt;
+        return ret;
+    }
+
+    true_height    = buf_size / (2*avctx->width);
+
+    if(buf_size < 2*avctx->width * avctx->height) {
+        av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
+        return ret;
+    p->pict_type= AV_PICTURE_TYPE_I;
+    p->key_frame= 1;
+
+    if(a->interlace) {
+        buf += (true_height - avctx->height)*avctx->width;
+        for(y = 0; y < avctx->height-1; y+=2) {
+            memcpy(p->data[0] + (y+ a->tff)*p->linesize[0], buf                             , 2*avctx->width);
+            memcpy(p->data[0] + (y+!a->tff)*p->linesize[0], buf + avctx->width*true_height+4, 2*avctx->width);
+            buf += 2*avctx->width;
+        }
+    } else {
+        buf += (true_height - avctx->height)*avctx->width*2;
+        for(y = 0; y < avctx->height; y++) {
+            memcpy(p->data[0] + y*p->linesize[0], buf, 2*avctx->width);
+            buf += 2*avctx->width;
+        }
+    }
+
+    *got_frame      = 1;
+    return buf_size;
+}
+
+AVCodec ff_avrn_decoder = {
+    .name           = "avrn",
+    .long_name      = NULL_IF_CONFIG_SMALL("Avid AVI Codec"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AVRN,
+    .priv_data_size = sizeof(AVRnContext),
+    .init           = init,
+    .close          = end,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
+};
diff --git a/libavcodec/avs.c b/libavcodec/avs.c
index 0d127f85d1..345d628dd5 100644
--- a/libavcodec/avs.c
+++ b/libavcodec/avs.c
@@ -2,20 +2,20 @@
  * AVS video decoder.
  * Copyright (c) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -57,12 +57,10 @@ avs_decode_frame(AVCodecContext * avctx,
     int i, j, x, y, stride, ret, vect_w = 3, vect_h = 3;
     AvsVideoSubType sub_type;
     AvsBlockType type;
-    GetBitContext change_map;
+    GetBitContext change_map = {0}; //init to silence warning
 
-    if ((ret = ff_reget_buffer(avctx, p)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, p)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_P;
     p->key_frame = 0;
 
@@ -84,8 +82,10 @@ avs_decode_frame(AVCodecContext * avctx,
         if (first >= 256 || last > 256 || buf_end - buf < 4 + 4 + 3 * (last - first))
             return AVERROR_INVALIDDATA;
         buf += 4;
-        for (i=first; i<last; i++, buf+=3)
+        for (i=first; i<last; i++, buf+=3) {
             pal[i] = (buf[0] << 18) | (buf[1] << 10) | (buf[2] << 2);
+            pal[i] |= 0xFFU << 24 | (pal[i] >> 6) & 0x30303;
+        }
 
         sub_type = buf[0];
         type = buf[1];
diff --git a/libavcodec/avuidec.c b/libavcodec/avuidec.c
new file mode 100644
index 0000000000..5117844fa9
--- /dev/null
+++ b/libavcodec/avuidec.c
@@ -0,0 +1,130 @@
+/*
+ * AVID Meridien decoder
+ *
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+
+static av_cold int avui_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
+    return 0;
+}
+
+static int avui_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    int ret;
+    AVFrame *pic = data;
+    const uint8_t *src = avpkt->data, *extradata = avctx->extradata;
+    const uint8_t *srca;
+    uint8_t *y, *u, *v, *a;
+    int transparent, interlaced = 1, skip, opaque_length, i, j, k;
+    uint32_t extradata_size = avctx->extradata_size;
+
+    while (extradata_size >= 24) {
+        uint32_t atom_size = AV_RB32(extradata);
+        if (!memcmp(&extradata[4], "APRGAPRG0001", 12)) {
+            interlaced = extradata[19] != 1;
+            break;
+        }
+        if (atom_size && atom_size <= extradata_size) {
+            extradata      += atom_size;
+            extradata_size -= atom_size;
+        } else {
+            break;
+        }
+    }
+    if (avctx->height == 486) {
+        skip = 10;
+    } else {
+        skip = 16;
+    }
+    opaque_length = 2 * avctx->width * (avctx->height + skip) + 4 * interlaced;
+    if (avpkt->size < opaque_length) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+    transparent = avctx->bits_per_coded_sample == 32 &&
+                  avpkt->size >= opaque_length * 2 + 4;
+    srca = src + opaque_length + 5;
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    if (!interlaced) {
+        src  += avctx->width * skip;
+        srca += avctx->width * skip;
+    }
+
+    for (i = 0; i < interlaced + 1; i++) {
+        src  += avctx->width * skip;
+        srca += avctx->width * skip;
+        if (interlaced && avctx->height == 486) {
+            y = pic->data[0] + (1 - i) * pic->linesize[0];
+            u = pic->data[1] + (1 - i) * pic->linesize[1];
+            v = pic->data[2] + (1 - i) * pic->linesize[2];
+            a = pic->data[3] + (1 - i) * pic->linesize[3];
+        } else {
+            y = pic->data[0] + i * pic->linesize[0];
+            u = pic->data[1] + i * pic->linesize[1];
+            v = pic->data[2] + i * pic->linesize[2];
+            a = pic->data[3] + i * pic->linesize[3];
+        }
+
+        for (j = 0; j < avctx->height >> interlaced; j++) {
+            for (k = 0; k < avctx->width >> 1; k++) {
+                u[    k    ] = *src++;
+                y[2 * k    ] = *src++;
+                a[2 * k    ] = 0xFF - (transparent ? *srca++ : 0);
+                srca++;
+                v[    k    ] = *src++;
+                y[2 * k + 1] = *src++;
+                a[2 * k + 1] = 0xFF - (transparent ? *srca++ : 0);
+                srca++;
+            }
+
+            y += (interlaced + 1) * pic->linesize[0];
+            u += (interlaced + 1) * pic->linesize[1];
+            v += (interlaced + 1) * pic->linesize[2];
+            a += (interlaced + 1) * pic->linesize[3];
+        }
+        src  += 4;
+        srca += 4;
+    }
+    *got_frame       = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_avui_decoder = {
+    .name         = "avui",
+    .long_name    = NULL_IF_CONFIG_SMALL("Avid Meridien Uncompressed"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_AVUI,
+    .init         = avui_decode_init,
+    .decode       = avui_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/avuienc.c b/libavcodec/avuienc.c
new file mode 100644
index 0000000000..b219906706
--- /dev/null
+++ b/libavcodec/avuienc.c
@@ -0,0 +1,103 @@
+/*
+ * AVID Meridien encoder
+ *
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+
+static av_cold int avui_encode_init(AVCodecContext *avctx)
+{
+    if (avctx->width != 720 || avctx->height != 486 && avctx->height != 576) {
+        av_log(avctx, AV_LOG_ERROR, "Only 720x486 and 720x576 are supported.\n");
+        return AVERROR(EINVAL);
+    }
+    if (!(avctx->extradata = av_mallocz(144 + AV_INPUT_BUFFER_PADDING_SIZE)))
+        return AVERROR(ENOMEM);
+    avctx->extradata_size = 144;
+    memcpy(avctx->extradata, "\0\0\0\x18""APRGAPRG0001", 16);
+    if (avctx->field_order > AV_FIELD_PROGRESSIVE) {
+        avctx->extradata[19] = 2;
+    } else {
+        avctx->extradata[19] = 1;
+    }
+    memcpy(avctx->extradata + 24, "\0\0\0\x78""ARESARES0001""\0\0\0\x98", 20);
+    AV_WB32(avctx->extradata + 44, avctx->width);
+    AV_WB32(avctx->extradata + 48, avctx->height);
+    memcpy(avctx->extradata + 52, "\0\0\0\x1\0\0\0\x20\0\0\0\x2", 12);
+
+
+    return 0;
+}
+
+static int avui_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    uint8_t *dst;
+    int i, j, skip, ret, size, interlaced;
+
+    interlaced = avctx->field_order > AV_FIELD_PROGRESSIVE;
+
+    if (avctx->height == 486) {
+        skip = 10;
+    } else {
+        skip = 16;
+    }
+    size = 2 * avctx->width * (avctx->height + skip) + 8 * interlaced;
+    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
+        return ret;
+    dst = pkt->data;
+    if (!interlaced) {
+        memset(dst, 0, avctx->width * skip);
+        dst += avctx->width * skip;
+    }
+
+    for (i = 0; i <= interlaced; i++) {
+        uint8_t *src;
+        if (interlaced && avctx->height == 486) {
+            src = pic->data[0] + (1 - i) * pic->linesize[0];
+        } else {
+            src = pic->data[0] + i * pic->linesize[0];
+        }
+        memset(dst, 0, avctx->width * skip + 4 * i);
+        dst += avctx->width * skip + 4 * i;
+        for (j = 0; j < avctx->height; j += interlaced + 1) {
+            memcpy(dst, src, avctx->width * 2);
+            src += (interlaced + 1) * pic->linesize[0];
+            dst += avctx->width * 2;
+        }
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+AVCodec ff_avui_encoder = {
+    .name         = "avui",
+    .long_name    = NULL_IF_CONFIG_SMALL("Avid Meridien Uncompressed"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_AVUI,
+    .init         = avui_encode_init,
+    .encode2      = avui_encode_frame,
+    .capabilities = AV_CODEC_CAP_EXPERIMENTAL | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_UYVY422, AV_PIX_FMT_NONE },
+};
diff --git a/libavcodec/bethsoftvideo.c b/libavcodec/bethsoftvideo.c
index 11e2cfaab2..97b745d38a 100644
--- a/libavcodec/bethsoftvideo.c
+++ b/libavcodec/bethsoftvideo.c
@@ -2,20 +2,20 @@
  * Bethesda VID video decoder
  * Copyright (C) 2007 Nicholas Tung
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,7 +59,8 @@ static int set_palette(BethsoftvidContext *ctx)
         return AVERROR_INVALIDDATA;
 
     for(a = 0; a < 256; a++){
-        palette[a] = bytestream2_get_be24u(&ctx->g) * 4;
+        palette[a] = 0xFFU << 24 | bytestream2_get_be24u(&ctx->g) * 4;
+        palette[a] |= palette[a] >> 6 & 0x30303;
     }
     ctx->frame->palette_has_changed = 1;
     return 0;
@@ -78,10 +79,8 @@ static int bethsoftvid_decode_frame(AVCodecContext *avctx,
     int code, ret;
     int yoffset;
 
-    if ((ret = ff_reget_buffer(avctx, vid->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, vid->frame)) < 0)
         return ret;
-    }
     wrap_to_next_line = vid->frame->linesize[0] - avctx->width;
 
     if (avpkt->side_data_elems > 0 &&
diff --git a/libavcodec/bethsoftvideo.h b/libavcodec/bethsoftvideo.h
index 5cbbdfdff0..d5b5d0a525 100644
--- a/libavcodec/bethsoftvideo.h
+++ b/libavcodec/bethsoftvideo.h
@@ -2,20 +2,20 @@
  * Bethesda VID video decoder
  * Copyright (C) 2007 Nicholas Tung
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bfi.c b/libavcodec/bfi.c
index 8335e9d125..6727629b52 100644
--- a/libavcodec/bfi.c
+++ b/libavcodec/bfi.c
@@ -2,20 +2,20 @@
  * Brute Force & Ignorance (BFI) video decoder
  * Copyright (c) 2008 Sisir Koppaka
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@
 typedef struct BFIContext {
     AVCodecContext *avctx;
     uint8_t *dst;
+    uint32_t pal[256];
 } BFIContext;
 
 static av_cold int bfi_decode_init(AVCodecContext *avctx)
@@ -41,6 +42,8 @@ static av_cold int bfi_decode_init(AVCodecContext *avctx)
     BFIContext *bfi = avctx->priv_data;
     avctx->pix_fmt  = AV_PIX_FMT_PAL8;
     bfi->dst        = av_mallocz(avctx->width * avctx->height);
+    if (!bfi->dst)
+        return AVERROR(ENOMEM);
     return 0;
 }
 
@@ -57,10 +60,8 @@ static int bfi_decode_frame(AVCodecContext *avctx, void *data,
     uint32_t *pal;
     int i, j, ret, height = avctx->height;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     bytestream2_init(&g, avpkt->data, buf_size);
 
@@ -76,16 +77,19 @@ static int bfi_decode_frame(AVCodecContext *avctx, void *data,
         pal = (uint32_t *)frame->data[1];
         for (i = 0; i < avctx->extradata_size / 3; i++) {
             int shift = 16;
-            *pal = 0;
+            *pal = 0xFFU << 24;
             for (j = 0; j < 3; j++, shift -= 8)
                 *pal += ((avctx->extradata[i * 3 + j] << 2) |
                          (avctx->extradata[i * 3 + j] >> 4)) << shift;
             pal++;
         }
+        memcpy(bfi->pal, frame->data[1], sizeof(bfi->pal));
         frame->palette_has_changed = 1;
     } else {
         frame->pict_type = AV_PICTURE_TYPE_P;
         frame->key_frame = 0;
+        frame->palette_has_changed = 0;
+        memcpy(frame->data[1], bfi->pal, sizeof(bfi->pal));
     }
 
     bytestream2_skip(&g, 4); // Unpacked size, not required.
@@ -167,7 +171,7 @@ static int bfi_decode_frame(AVCodecContext *avctx, void *data,
 static av_cold int bfi_decode_close(AVCodecContext *avctx)
 {
     BFIContext *bfi = avctx->priv_data;
-    av_free(bfi->dst);
+    av_freep(&bfi->dst);
     return 0;
 }
 
diff --git a/libavcodec/bfin/README b/libavcodec/bfin/README
new file mode 100644
index 0000000000..afb3461b72
--- /dev/null
+++ b/libavcodec/bfin/README
@@ -0,0 +1,6 @@
+BFIN optimizations have been removed in
+commit 880e2aa23645ed9871c66ee1cbd00f93c72d2d73
+The last revission with the optimizations is fa4e17c14035ebf43130fb369e1728cdd98d0b72
+
+If you want to maintain these (or other) BFIN optimizations in ffmpeg, then please
+contact ffmpeg-devel@ffmpeg.org
diff --git a/libavcodec/bgmc.c b/libavcodec/bgmc.c
index ad8baaecac..1a6817b73f 100644
--- a/libavcodec/bgmc.c
+++ b/libavcodec/bgmc.c
@@ -1,28 +1,28 @@
 /*
  * Block Gilbert-Moore decoder
- * Copyright (c) 2010 Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * Copyright (c) 2010 Thilo Borgmann <thilo.borgmann _at_ mail.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * Block Gilbert-Moore decoder as used by MPEG-4 ALS
- * @author Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
  */
 
 #include "libavutil/attributes.h"
diff --git a/libavcodec/bgmc.h b/libavcodec/bgmc.h
index 3d5b49034d..4893736af5 100644
--- a/libavcodec/bgmc.h
+++ b/libavcodec/bgmc.h
@@ -1,28 +1,28 @@
 /*
  * Block Gilbert-Moore decoder
- * Copyright (c) 2010 Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * Copyright (c) 2010 Thilo Borgmann <thilo.borgmann _at_ mail.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * Block Gilbert-Moore decoder header
- * @author Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
  */
 
 
diff --git a/libavcodec/bink.c b/libavcodec/bink.c
index 4e72267667..131eaa16f7 100644
--- a/libavcodec/bink.c
+++ b/libavcodec/bink.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Konstantin Shishkov
  * Copyright (C) 2011 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -120,6 +120,7 @@ typedef struct BinkContext {
     int            version;              ///< internal Bink file version
     int            has_alpha;
     int            swap_planes;
+    unsigned       frame_num;
 
     Bundle         bundle[BINKB_NB_SRC]; ///< bundles for decoding all data types
     Tree           col_high[16];         ///< trees for decoding high nibble in "colours" data type
@@ -143,7 +144,7 @@ enum BlockTypes {
 };
 
 /**
- * Initialize length length in all bundles.
+ * Initialize length in all bundles.
  *
  * @param c     decoder context
  * @param width plane width
@@ -174,7 +175,7 @@ static void init_lengths(BinkContext *c, int width, int bw)
  *
  * @param c decoder context
  */
-static av_cold void init_bundles(BinkContext *c)
+static av_cold int init_bundles(BinkContext *c)
 {
     int bw, bh, blocks;
     int i;
@@ -184,9 +185,13 @@ static av_cold void init_bundles(BinkContext *c)
     blocks = bw * bh;
 
     for (i = 0; i < BINKB_NB_SRC; i++) {
-        c->bundle[i].data = av_malloc(blocks * 64);
+        c->bundle[i].data = av_mallocz(blocks * 64);
+        if (!c->bundle[i].data)
+            return AVERROR(ENOMEM);
         c->bundle[i].data_end = c->bundle[i].data + blocks * 64;
     }
+
+    return 0;
 }
 
 /**
@@ -679,11 +684,12 @@ static int read_dct_coeffs(GetBitContext *gb, int32_t block[64], const uint8_t *
         quant_idx = get_bits(gb, 4);
     } else {
         quant_idx = q;
+        if (quant_idx > 15U) {
+            av_log(NULL, AV_LOG_ERROR, "quant_index %d out of range\n", quant_idx);
+            return AVERROR_INVALIDDATA;
+        }
     }
 
-    if (quant_idx >= 16)
-        return AVERROR_INVALIDDATA;
-
     quant = quant_matrices[quant_idx];
 
     block[0] = (block[0] * quant[0]) >> 11;
@@ -866,7 +872,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                 memset(dctblock, 0, sizeof(*dctblock) * 64);
                 dctblock[0] = binkb_get_value(c, BINKB_SRC_INTRA_DC);
                 qp = binkb_get_value(c, BINKB_SRC_INTRA_Q);
-                read_dct_coeffs(gb, dctblock, bink_scan, binkb_intra_quant, qp);
+                read_dct_coeffs(gb, dctblock, bink_scan, (const int32_t (*)[64])binkb_intra_quant, qp);
                 c->binkdsp.idct_put(dst, stride, dctblock);
                 break;
             case 3:
@@ -899,7 +905,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                 memset(dctblock, 0, sizeof(*dctblock) * 64);
                 dctblock[0] = binkb_get_value(c, BINKB_SRC_INTER_DC);
                 qp = binkb_get_value(c, BINKB_SRC_INTER_Q);
-                read_dct_coeffs(gb, dctblock, bink_scan, binkb_inter_quant, qp);
+                read_dct_coeffs(gb, dctblock, bink_scan, (const int32_t (*)[64])binkb_inter_quant, qp);
                 c->binkdsp.idct_add(dst, stride, dctblock);
                 break;
             case 5:
@@ -1184,15 +1190,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     int bits_count = pkt->size << 3;
 
     if (c->version > 'b') {
-        if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
             return ret;
-        }
     } else {
-        if ((ret = ff_reget_buffer(avctx, c->last)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+        if ((ret = ff_reget_buffer(avctx, c->last)) < 0)
             return ret;
-        }
         if ((ret = av_frame_ref(frame, c->last)) < 0)
             return ret;
     }
@@ -1207,6 +1209,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     if (c->version >= 'i')
         skip_bits_long(&gb, 32);
 
+    c->frame_num++;
+
     for (plane = 0; plane < 3; plane++) {
         plane_idx = (!plane || !c->swap_planes) ? plane : (plane ^ 3);
 
@@ -1215,7 +1219,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
                 return ret;
         } else {
             if ((ret = binkb_decode_plane(c, frame, &gb, plane_idx,
-                                          !avctx->frame_number, !!plane)) < 0)
+                                          c->frame_num == 1, !!plane)) < 0)
                 return ret;
         }
         if (get_bits_count(&gb) >= bits_count)
@@ -1241,41 +1245,28 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
 static av_cold void binkb_calc_quant(void)
 {
     uint8_t inv_bink_scan[64];
-    double s[64];
+    static const int s[64]={
+        1073741824,1489322693,1402911301,1262586814,1073741824, 843633538, 581104888, 296244703,
+        1489322693,2065749918,1945893874,1751258219,1489322693,1170153332, 806015634, 410903207,
+        1402911301,1945893874,1832991949,1649649171,1402911301,1102260336, 759250125, 387062357,
+        1262586814,1751258219,1649649171,1484645031,1262586814, 992008094, 683307060, 348346918,
+        1073741824,1489322693,1402911301,1262586814,1073741824, 843633538, 581104888, 296244703,
+         843633538,1170153332,1102260336, 992008094, 843633538, 662838617, 456571181, 232757969,
+         581104888, 806015634, 759250125, 683307060, 581104888, 456571181, 314491699, 160326478,
+         296244703, 410903207, 387062357, 348346918, 296244703, 232757969, 160326478,  81733730,
+    };
     int i, j;
-
-    for (j = 0; j < 8; j++) {
-        for (i = 0; i < 8; i++) {
-            if (j && j != 4)
-               if (i && i != 4)
-                   s[j*8 + i] = cos(j * M_PI/16.0) * cos(i * M_PI/16.0) * 2.0;
-               else
-                   s[j*8 + i] = cos(j * M_PI/16.0) * sqrt(2.0);
-            else
-               if (i && i != 4)
-                   s[j*8 + i] = cos(i * M_PI/16.0) * sqrt(2.0);
-               else
-                   s[j*8 + i] = 1.0;
-        }
-    }
-
+#define C (1LL<<30)
     for (i = 0; i < 64; i++)
         inv_bink_scan[bink_scan[i]] = i;
 
     for (j = 0; j < 16; j++) {
         for (i = 0; i < 64; i++) {
             int k = inv_bink_scan[i];
-            if (s[i] == 1.0) {
-                binkb_intra_quant[j][k] = (1L << 12) * binkb_intra_seed[i] *
-                                          binkb_num[j]/binkb_den[j];
-                binkb_inter_quant[j][k] = (1L << 12) * binkb_inter_seed[i] *
-                                          binkb_num[j]/binkb_den[j];
-            } else {
-                binkb_intra_quant[j][k] = (1L << 12) * binkb_intra_seed[i] * s[i] *
-                                          binkb_num[j]/(double)binkb_den[j];
-                binkb_inter_quant[j][k] = (1L << 12) * binkb_inter_seed[i] * s[i] *
-                                          binkb_num[j]/(double)binkb_den[j];
-            }
+            binkb_intra_quant[j][k] = binkb_intra_seed[i] * (int64_t)s[i] *
+                                        binkb_num[j]/(binkb_den[j] * (C>>12));
+            binkb_inter_quant[j][k] = binkb_inter_seed[i] * (int64_t)s[i] *
+                                        binkb_num[j]/(binkb_den[j] * (C>>12));
         }
     }
 }
@@ -1321,7 +1312,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
     ff_hpeldsp_init(&c->hdsp, avctx->flags);
     ff_binkdsp_init(&c->binkdsp);
 
-    init_bundles(c);
+    if ((ret = init_bundles(c)) < 0) {
+        free_bundles(c);
+        return ret;
+    }
 
     if (c->version == 'b') {
         if (!binkb_initialised) {
@@ -1343,6 +1337,13 @@ static av_cold int decode_end(AVCodecContext *avctx)
     return 0;
 }
 
+static void flush(AVCodecContext *avctx)
+{
+    BinkContext * const c = avctx->priv_data;
+
+    c->frame_num = 0;
+}
+
 AVCodec ff_bink_decoder = {
     .name           = "binkvideo",
     .long_name      = NULL_IF_CONFIG_SMALL("Bink video"),
@@ -1352,5 +1353,6 @@ AVCodec ff_bink_decoder = {
     .init           = decode_init,
     .close          = decode_end,
     .decode         = decode_frame,
+    .flush          = flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/binkaudio.c b/libavcodec/binkaudio.c
index 71ad344bbf..5cc23311f5 100644
--- a/libavcodec/binkaudio.c
+++ b/libavcodec/binkaudio.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007-2011 Peter Ross (pross@xvid.org)
  * Copyright (c) 2009 Daniel Verkamp (daniel@drv.nu)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -81,14 +81,14 @@ static av_cold int decode_init(AVCodecContext *avctx)
         frame_len_bits = 11;
     }
 
-    if (avctx->channels > MAX_CHANNELS) {
-        av_log(avctx, AV_LOG_ERROR, "too many channels: %d\n", avctx->channels);
-        return -1;
+    if (avctx->channels < 1 || avctx->channels > MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "invalid number of channels: %d\n", avctx->channels);
+        return AVERROR_INVALIDDATA;
     }
     avctx->channel_layout = avctx->channels == 1 ? AV_CH_LAYOUT_MONO :
                                                    AV_CH_LAYOUT_STEREO;
 
-    s->version_b = avctx->extradata && avctx->extradata[3] == 'b';
+    s->version_b = avctx->extradata_size >= 4 && avctx->extradata[3] == 'b';
 
     if (avctx->codec->id == AV_CODEC_ID_BINKAUDIO_RDFT) {
         // audio is already interleaved for the RDFT format variant
@@ -305,9 +305,11 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         buf = av_realloc(s->packet_buffer, avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
         if (!buf)
             return AVERROR(ENOMEM);
+        memset(buf + avpkt->size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
         s->packet_buffer = buf;
         memcpy(s->packet_buffer, avpkt->data, avpkt->size);
-        init_get_bits(gb, s->packet_buffer, avpkt->size * 8);
+        if ((ret = init_get_bits8(gb, s->packet_buffer, avpkt->size)) < 0)
+            return ret;
         consumed = avpkt->size;
 
         /* skip reported size */
@@ -316,10 +318,8 @@ static int decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = s->frame_len;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if (decode_block(s, (float **)frame->extended_data,
                      avctx->codec->id == AV_CODEC_ID_BINKAUDIO_DCT)) {
diff --git a/libavcodec/binkdata.h b/libavcodec/binkdata.h
index 3da6b7e987..57619beee2 100644
--- a/libavcodec/binkdata.h
+++ b/libavcodec/binkdata.h
@@ -2,20 +2,20 @@
  * Bink video decoder
  * Copyright (C) 2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/binkdsp.c b/libavcodec/binkdsp.c
index 0dfe12cd4f..9d70e2326f 100644
--- a/libavcodec/binkdsp.c
+++ b/libavcodec/binkdsp.c
@@ -2,20 +2,20 @@
  * Bink DSP routines
  * Copyright (c) 2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -129,7 +129,7 @@ static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align
     }
 }
 
-static void add_pixels8_c(uint8_t *restrict pixels, int16_t *block,
+static void add_pixels8_c(uint8_t *av_restrict pixels, int16_t *block,
                           int line_size)
 {
     int i;
diff --git a/libavcodec/binkdsp.h b/libavcodec/binkdsp.h
index 418afb9895..f319d1ffe5 100644
--- a/libavcodec/binkdsp.h
+++ b/libavcodec/binkdsp.h
@@ -2,20 +2,20 @@
  * Bink DSP routines
  * Copyright (c) 2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bintext.c b/libavcodec/bintext.c
new file mode 100644
index 0000000000..90bbe67b59
--- /dev/null
+++ b/libavcodec/bintext.c
@@ -0,0 +1,258 @@
+/*
+ * Binary text decoder
+ * eXtended BINary text (XBIN) decoder
+ * iCEDraw File decoder
+ * Copyright (c) 2010 Peter Ross (pross@xvid.org)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Binary text decoder
+ * eXtended BINary text (XBIN) decoder
+ * iCEDraw File decoder
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/xga_font_data.h"
+#include "avcodec.h"
+#include "cga_data.h"
+#include "bintext.h"
+#include "internal.h"
+
+typedef struct XbinContext {
+    AVFrame *frame;
+    int palette[16];
+    int flags;
+    int font_height;
+    const uint8_t *font;
+    int x, y;
+} XbinContext;
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    XbinContext *s = avctx->priv_data;
+    uint8_t *p;
+    int i;
+
+    avctx->pix_fmt = AV_PIX_FMT_PAL8;
+    p = avctx->extradata;
+    if (p) {
+        s->font_height = p[0];
+        s->flags = p[1];
+        p += 2;
+        if(avctx->extradata_size < 2 + (!!(s->flags & BINTEXT_PALETTE))*3*16
+                                     + (!!(s->flags & BINTEXT_FONT))*s->font_height*256) {
+            av_log(avctx, AV_LOG_ERROR, "not enough extradata\n");
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        s->font_height = 8;
+        s->flags = 0;
+    }
+
+    if ((s->flags & BINTEXT_PALETTE)) {
+        for (i = 0; i < 16; i++) {
+            s->palette[i] = 0xFF000000 | (AV_RB24(p) << 2) | ((AV_RB24(p) >> 4) & 0x30303);
+            p += 3;
+        }
+    } else {
+        for (i = 0; i < 16; i++)
+            s->palette[i] = 0xFF000000 | ff_cga_palette[i];
+    }
+
+    if ((s->flags & BINTEXT_FONT)) {
+        s->font = p;
+    } else {
+        switch(s->font_height) {
+        default:
+            av_log(avctx, AV_LOG_WARNING, "font height %i not supported\n", s->font_height);
+            s->font_height = 8;
+        case 8:
+            s->font = avpriv_cga_font;
+            break;
+        case 16:
+            s->font = avpriv_vga16_font;
+            break;
+        }
+    }
+
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+#define DEFAULT_BG_COLOR 0
+av_unused static void hscroll(AVCodecContext *avctx)
+{
+    XbinContext *s = avctx->priv_data;
+    if (s->y < avctx->height - s->font_height) {
+        s->y += s->font_height;
+    } else {
+        memmove(s->frame->data[0], s->frame->data[0] + s->font_height*s->frame->linesize[0],
+            (avctx->height - s->font_height)*s->frame->linesize[0]);
+        memset(s->frame->data[0] + (avctx->height - s->font_height)*s->frame->linesize[0],
+            DEFAULT_BG_COLOR, s->font_height * s->frame->linesize[0]);
+    }
+}
+
+#define FONT_WIDTH 8
+
+/**
+ * Draw character to screen
+ */
+static void draw_char(AVCodecContext *avctx, int c, int a)
+{
+    XbinContext *s = avctx->priv_data;
+    if (s->y > avctx->height - s->font_height)
+        return;
+    ff_draw_pc_font(s->frame->data[0] + s->y * s->frame->linesize[0] + s->x,
+                    s->frame->linesize[0], s->font, s->font_height, c,
+                    a & 0x0F, a >> 4);
+    s->x += FONT_WIDTH;
+    if (s->x > avctx->width - FONT_WIDTH) {
+        s->x = 0;
+        s->y += s->font_height;
+    }
+}
+
+static int decode_frame(AVCodecContext *avctx,
+                            void *data, int *got_frame,
+                            AVPacket *avpkt)
+{
+    XbinContext *s = avctx->priv_data;
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    const uint8_t *buf_end = buf+buf_size;
+    int ret;
+
+    s->x = s->y = 0;
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+        return ret;
+    s->frame->pict_type           = AV_PICTURE_TYPE_I;
+    s->frame->palette_has_changed = 1;
+    memcpy(s->frame->data[1], s->palette, 16 * 4);
+
+    if (avctx->codec_id == AV_CODEC_ID_XBIN) {
+        while (buf + 2 < buf_end) {
+            int i,c,a;
+            int type  = *buf >> 6;
+            int count = (*buf & 0x3F) + 1;
+            buf++;
+            switch (type) {
+            case 0: //no compression
+                for (i = 0; i < count && buf + 1 < buf_end; i++) {
+                    draw_char(avctx, buf[0], buf[1]);
+                    buf += 2;
+                }
+                break;
+            case 1: //character compression
+                c = *buf++;
+                for (i = 0; i < count && buf < buf_end; i++)
+                    draw_char(avctx, c, *buf++);
+                break;
+            case 2: //attribute compression
+                a = *buf++;
+                for (i = 0; i < count && buf < buf_end; i++)
+                    draw_char(avctx, *buf++, a);
+                break;
+            case 3: //character/attribute compression
+                c = *buf++;
+                a = *buf++;
+                for (i = 0; i < count && buf < buf_end; i++)
+                    draw_char(avctx, c, a);
+                break;
+            }
+        }
+    } else if (avctx->codec_id == AV_CODEC_ID_IDF) {
+        while (buf + 2 < buf_end) {
+            if (AV_RL16(buf) == 1) {
+               int i;
+               if (buf + 6 > buf_end)
+                   break;
+               for (i = 0; i < buf[2]; i++)
+                   draw_char(avctx, buf[4], buf[5]);
+               buf += 6;
+            } else {
+               draw_char(avctx, buf[0], buf[1]);
+               buf += 2;
+            }
+        }
+    } else {
+        while (buf + 1 < buf_end) {
+            draw_char(avctx, buf[0], buf[1]);
+            buf += 2;
+        }
+    }
+
+    if ((ret = av_frame_ref(data, s->frame)) < 0)
+        return ret;
+    *got_frame      = 1;
+    return buf_size;
+}
+
+static av_cold int decode_end(AVCodecContext *avctx)
+{
+    XbinContext *s = avctx->priv_data;
+
+    av_frame_free(&s->frame);
+
+    return 0;
+}
+
+#if CONFIG_BINTEXT_DECODER
+AVCodec ff_bintext_decoder = {
+    .name           = "bintext",
+    .long_name      = NULL_IF_CONFIG_SMALL("Binary text"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_BINTEXT,
+    .priv_data_size = sizeof(XbinContext),
+    .init           = decode_init,
+    .close          = decode_end,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
+#endif
+#if CONFIG_XBIN_DECODER
+AVCodec ff_xbin_decoder = {
+    .name           = "xbin",
+    .long_name      = NULL_IF_CONFIG_SMALL("eXtended BINary text"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_XBIN,
+    .priv_data_size = sizeof(XbinContext),
+    .init           = decode_init,
+    .close          = decode_end,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
+#endif
+#if CONFIG_IDF_DECODER
+AVCodec ff_idf_decoder = {
+    .name           = "idf",
+    .long_name      = NULL_IF_CONFIG_SMALL("iCEDraw text"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_IDF,
+    .priv_data_size = sizeof(XbinContext),
+    .init           = decode_init,
+    .close          = decode_end,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
+#endif
diff --git a/libavcodec/bintext.h b/libavcodec/bintext.h
new file mode 100644
index 0000000000..21428ba476
--- /dev/null
+++ b/libavcodec/bintext.h
@@ -0,0 +1,37 @@
+/*
+ * Binary text decoder
+ * Copyright (c) 2010 Peter Ross (pross@xvid.org)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Binary text decoder
+ */
+
+#ifndef AVCODEC_BINTEXT_H
+#define AVCODEC_BINTEXT_H
+
+/* flag values passed between avformat and avcodec;
+ * while these are identical to the XBIN flags, they are also used
+ * for the BINTEXT and IDF decoders.
+ */
+#define BINTEXT_PALETTE  0x1
+#define BINTEXT_FONT     0x2
+
+#endif /* AVCODEC_BINTEXT_H */
diff --git a/libavcodec/bit_depth_template.c b/libavcodec/bit_depth_template.c
index 27e658ba75..80184892f5 100644
--- a/libavcodec/bit_depth_template.c
+++ b/libavcodec/bit_depth_template.c
@@ -1,23 +1,24 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "mathops.h"
 #include "rnd_avg.h"
+#include "libavutil/intreadwrite.h"
 
 #ifndef BIT_DEPTH
 #define BIT_DEPTH 8
@@ -71,7 +72,7 @@
 #   define pixel4 uint32_t
 #   define dctcoef int16_t
 
-#   define INIT_CLIP const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
+#   define INIT_CLIP
 #   define no_rnd_avg_pixel4 no_rnd_avg32
 #   define    rnd_avg_pixel4    rnd_avg32
 #   define AV_RN2P  AV_RN16
@@ -83,7 +84,7 @@
 #   define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
 
 #   define av_clip_pixel(a) av_clip_uint8(a)
-#   define CLIP(a) cm[a]
+#   define CLIP(a) av_clip_uint8(a)
 #endif
 
 #define FUNC3(a, b, c)  a ## _ ## b ## c
diff --git a/libavcodec/bitstream.c b/libavcodec/bitstream.c
index 93047a2b91..924cc519a2 100644
--- a/libavcodec/bitstream.c
+++ b/libavcodec/bitstream.c
@@ -6,20 +6,20 @@
  *
  * alternative bitstream reader & writer by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,8 @@
  * bitstream api.
  */
 
+#include "libavutil/atomic.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "mathops.h"
@@ -68,6 +70,8 @@ void avpriv_copy_bits(PutBitContext *pb, const uint8_t *src, int length)
     if (length == 0)
         return;
 
+    av_assert0(length <= put_bits_left(pb));
+
     if (CONFIG_SMALL || words < 16 || put_bits_count(pb) & 7) {
         for (i = 0; i < words; i++)
             put_bits(pb, 16, AV_RB16(src + 2 * i));
@@ -107,17 +111,16 @@ static int alloc_table(VLC *vlc, int size, int use_static)
 
     vlc->table_size += size;
     if (vlc->table_size > vlc->table_allocated) {
-        int err;
         if (use_static)
-            return AVERROR_BUG;
+            abort(); // cannot do anything, init_vlc() is used with too little memory
         vlc->table_allocated += (1 << vlc->bits);
-        if ((err = av_reallocp(&vlc->table,
-                               sizeof(VLC_TYPE) * 2 *
-                               vlc->table_allocated)) < 0) {
+        vlc->table = av_realloc_f(vlc->table, vlc->table_allocated, sizeof(VLC_TYPE) * 2);
+        if (!vlc->table) {
             vlc->table_allocated = 0;
             vlc->table_size = 0;
-            return err;
+            return AVERROR(ENOMEM);
         }
+        memset(vlc->table + vlc->table_allocated - (1 << vlc->bits), 0, sizeof(VLC_TYPE) * 2 << vlc->bits);
     }
     return index;
 }
@@ -163,19 +166,16 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
     int table_size, table_index, index, code_prefix, symbol, subtable_bits;
     int i, j, k, n, nb, inc;
     uint32_t code;
-    VLC_TYPE (*table)[2];
+    volatile VLC_TYPE (* volatile table)[2]; // the double volatile is needed to prevent a internal compiler error in gcc 4.2
 
     table_size = 1 << table_nb_bits;
+    if (table_nb_bits > 30)
+       return -1;
     table_index = alloc_table(vlc, table_size, flags & INIT_VLC_USE_NEW_STATIC);
     ff_dlog(NULL, "new table index=%d size=%d\n", table_index, table_size);
     if (table_index < 0)
         return table_index;
-    table = &vlc->table[table_index];
-
-    for (i = 0; i < table_size; i++) {
-        table[i][1] = 0; //bits
-        table[i][0] = -1; //codes
-    }
+    table = (volatile VLC_TYPE (*)[2])&vlc->table[table_index];
 
     /* first pass: map codes and compute auxiliary table sizes */
     for (i = 0; i < nb_codes; i++) {
@@ -193,8 +193,9 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
                 inc = 1 << n;
             }
             for (k = 0; k < nb; k++) {
+                int bits = table[j][1];
                 ff_dlog(NULL, "%4x: code=%d n=%d\n", j, i, n);
-                if (table[j][1] /*bits*/ != 0) {
+                if (bits != 0 && bits != n) {
                     av_log(NULL, AV_LOG_ERROR, "incorrect codes\n");
                     return AVERROR_INVALIDDATA;
                 }
@@ -229,18 +230,24 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
             if (index < 0)
                 return index;
             /* note: realloc has been done, so reload tables */
-            table = &vlc->table[table_index];
+            table = (volatile VLC_TYPE (*)[2])&vlc->table[table_index];
             table[j][0] = index; //code
             i = k-1;
         }
     }
+
+    for (i = 0; i < table_size; i++) {
+        if (table[i][1] == 0) //bits
+            table[i][0] = -1; //codes
+    }
+
     return table_index;
 }
 
 
 /* Build VLC decoding tables suitable for use with get_vlc().
 
-   'nb_bits' set thee decoding table size (2^nb_bits) entries. The
+   'nb_bits' set the decoding table size (2^nb_bits) entries. The
    bigger it is, the faster is the decoding. But it should not be too
    big to save memory and L1 cache. '9' is a good compromise.
 
@@ -258,13 +265,13 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
    'xxx_size' : gives the number of bytes of each entry of the 'bits'
    or 'codes' tables.
 
-   'wrap' and 'size' allows to use any memory configuration and types
+   'wrap' and 'size' make it possible to use any memory configuration and types
    (byte/word/long) to store the 'bits', 'codes', and 'symbols' tables.
 
    'use_static' should be set to 1 for tables, which should be freed
    with av_free_static(), 0 if ff_free_vlc() will be used.
 */
-int ff_init_vlc_sparse(VLC *vlc, int nb_bits, int nb_codes,
+int ff_init_vlc_sparse(VLC *vlc_arg, int nb_bits, int nb_codes,
                        const void *bits, int bits_wrap, int bits_size,
                        const void *codes, int codes_wrap, int codes_size,
                        const void *symbols, int symbols_wrap, int symbols_size,
@@ -272,42 +279,56 @@ int ff_init_vlc_sparse(VLC *vlc, int nb_bits, int nb_codes,
 {
     VLCcode *buf;
     int i, j, ret;
+    VLCcode localbuf[1500]; // the maximum currently needed is 1296 by rv34
+    VLC localvlc, *vlc;
 
+    vlc = vlc_arg;
     vlc->bits = nb_bits;
     if (flags & INIT_VLC_USE_NEW_STATIC) {
-        if (vlc->table_size && vlc->table_size == vlc->table_allocated) {
-            return 0;
-        } else if (vlc->table_size) {
-            return AVERROR_BUG;
-        }
+        av_assert0(nb_codes + 1 <= FF_ARRAY_ELEMS(localbuf));
+        buf = localbuf;
+        localvlc = *vlc_arg;
+        vlc = &localvlc;
+        vlc->table_size = 0;
     } else {
         vlc->table           = NULL;
         vlc->table_allocated = 0;
         vlc->table_size      = 0;
-    }
 
-    ff_dlog(NULL, "build table nb_codes=%d\n", nb_codes);
+        buf = av_malloc_array((nb_codes + 1), sizeof(VLCcode));
+        if (!buf)
+            return AVERROR(ENOMEM);
+    }
 
-    buf = av_malloc((nb_codes + 1) * sizeof(VLCcode));
-    if (!buf)
-        return AVERROR(ENOMEM);
 
-    assert(symbols_size <= 2 || !symbols);
+    av_assert0(symbols_size <= 2 || !symbols);
     j = 0;
-#define COPY(condition)                                                     \
+#define COPY(condition)\
     for (i = 0; i < nb_codes; i++) {                                        \
         GET_DATA(buf[j].bits, bits, i, bits_wrap, bits_size);               \
         if (!(condition))                                                   \
             continue;                                                       \
+        if (buf[j].bits > 3*nb_bits || buf[j].bits>32) {                    \
+            av_log(NULL, AV_LOG_ERROR, "Too long VLC (%d) in init_vlc\n", buf[j].bits);\
+            if (!(flags & INIT_VLC_USE_NEW_STATIC))                         \
+                av_free(buf);                                               \
+            return -1;                                                      \
+        }                                                                   \
         GET_DATA(buf[j].code, codes, i, codes_wrap, codes_size);            \
+        if (buf[j].code >= (1LL<<buf[j].bits)) {                            \
+            av_log(NULL, AV_LOG_ERROR, "Invalid code in init_vlc\n");       \
+            if (!(flags & INIT_VLC_USE_NEW_STATIC))                         \
+                av_free(buf);                                               \
+            return -1;                                                      \
+        }                                                                   \
         if (flags & INIT_VLC_LE)                                            \
             buf[j].code = bitswap_32(buf[j].code);                          \
         else                                                                \
             buf[j].code <<= 32 - buf[j].bits;                               \
         if (symbols)                                                        \
             GET_DATA(buf[j].symbol, symbols, i, symbols_wrap, symbols_size) \
-            else                                                            \
-                buf[j].symbol = i;                                          \
+        else                                                                \
+            buf[j].symbol = i;                                              \
         j++;                                                                \
     }
     COPY(buf[j].bits > nb_bits);
@@ -318,15 +339,19 @@ int ff_init_vlc_sparse(VLC *vlc, int nb_bits, int nb_codes,
 
     ret = build_table(vlc, nb_bits, nb_codes, buf, flags);
 
-    av_free(buf);
-    if (ret < 0) {
-        av_freep(&vlc->table);
-        return ret;
+    if (flags & INIT_VLC_USE_NEW_STATIC) {
+        if(vlc->table_size != vlc->table_allocated)
+            av_log(NULL, AV_LOG_ERROR, "needed %d had %d\n", vlc->table_size, vlc->table_allocated);
+
+        av_assert0(ret >= 0);
+        *vlc_arg = *vlc;
+    } else {
+        av_free(buf);
+        if (ret < 0) {
+            av_freep(&vlc->table);
+            return ret;
+        }
     }
-    if ((flags & INIT_VLC_USE_NEW_STATIC) &&
-        vlc->table_size != vlc->table_allocated)
-        av_log(NULL, AV_LOG_ERROR, "needed %d had %d\n",
-               vlc->table_size, vlc->table_allocated);
     return 0;
 }
 
diff --git a/libavcodec/bitstream_filter.c b/libavcodec/bitstream_filter.c
index 3b19bbdc76..a4e437df5f 100644
--- a/libavcodec/bitstream_filter.c
+++ b/libavcodec/bitstream_filter.c
@@ -1,26 +1,27 @@
 /*
  * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <string.h>
 
 #include "avcodec.h"
+#include "libavutil/atomic.h"
 #include "libavutil/mem.h"
 
 static AVBitStreamFilter *first_bitstream_filter = NULL;
@@ -35,15 +36,16 @@ AVBitStreamFilter *av_bitstream_filter_next(const AVBitStreamFilter *f)
 
 void av_register_bitstream_filter(AVBitStreamFilter *bsf)
 {
-    bsf->next              = first_bitstream_filter;
-    first_bitstream_filter = bsf;
+    do {
+        bsf->next = first_bitstream_filter;
+    } while(bsf->next != avpriv_atomic_ptr_cas((void * volatile *)&first_bitstream_filter, bsf->next, bsf));
 }
 
 AVBitStreamFilterContext *av_bitstream_filter_init(const char *name)
 {
-    AVBitStreamFilter *bsf = first_bitstream_filter;
+    AVBitStreamFilter *bsf = NULL;
 
-    while (bsf) {
+    while (bsf = av_bitstream_filter_next(bsf)) {
         if (!strcmp(name, bsf->name)) {
             AVBitStreamFilterContext *bsfc =
                 av_mallocz(sizeof(AVBitStreamFilterContext));
@@ -60,13 +62,14 @@ AVBitStreamFilterContext *av_bitstream_filter_init(const char *name)
             }
             return bsfc;
         }
-        bsf = bsf->next;
     }
     return NULL;
 }
 
 void av_bitstream_filter_close(AVBitStreamFilterContext *bsfc)
 {
+    if (!bsfc)
+        return;
     if (bsfc->filter->close)
         bsfc->filter->close(bsfc);
     av_freep(&bsfc->priv_data);
diff --git a/libavcodec/blockdsp.c b/libavcodec/blockdsp.c
index e3d2ca1fdc..a5c527a4da 100644
--- a/libavcodec/blockdsp.c
+++ b/libavcodec/blockdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,12 +25,12 @@
 #include "blockdsp.h"
 #include "version.h"
 
-static void clear_block_8_c(int16_t *block)
+static void clear_block_c(int16_t *block)
 {
     memset(block, 0, sizeof(int16_t) * 64);
 }
 
-static void clear_blocks_8_c(int16_t *blocks)
+static void clear_blocks_c(int16_t *blocks)
 {
     memset(blocks, 0, sizeof(int16_t) * 6 * 64);
 }
@@ -57,22 +57,20 @@ static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 
 av_cold void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx)
 {
-    const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
-
-    c->clear_block  = clear_block_8_c;
-    c->clear_blocks = clear_blocks_8_c;
+    c->clear_block  = clear_block_c;
+    c->clear_blocks = clear_blocks_c;
 
     c->fill_block_tab[0] = fill_block16_c;
     c->fill_block_tab[1] = fill_block8_c;
 
+    if (ARCH_ALPHA)
+        ff_blockdsp_init_alpha(c);
     if (ARCH_ARM)
-        ff_blockdsp_init_arm(c, high_bit_depth);
+        ff_blockdsp_init_arm(c);
     if (ARCH_PPC)
-        ff_blockdsp_init_ppc(c, high_bit_depth);
+        ff_blockdsp_init_ppc(c);
     if (ARCH_X86)
-#if FF_API_XVMC
-        ff_blockdsp_init_x86(c, high_bit_depth, avctx);
-#else
-        ff_blockdsp_init_x86(c, high_bit_depth);
-#endif /* FF_API_XVMC */
+        ff_blockdsp_init_x86(c, avctx);
+    if (ARCH_MIPS)
+        ff_blockdsp_init_mips(c);
 }
diff --git a/libavcodec/blockdsp.h b/libavcodec/blockdsp.h
index 32c671cf5a..95e1d0f32e 100644
--- a/libavcodec/blockdsp.h
+++ b/libavcodec/blockdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,13 +40,10 @@ typedef struct BlockDSPContext {
 
 void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx);
 
-void ff_blockdsp_init_arm(BlockDSPContext *c, unsigned high_bit_depth);
-void ff_blockdsp_init_ppc(BlockDSPContext *c, unsigned high_bit_depth);
-#if FF_API_XVMC
-void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth,
-                          AVCodecContext *avctx);
-#else
-void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth);
-#endif /* FF_API_XVMC */
+void ff_blockdsp_init_alpha(BlockDSPContext *c);
+void ff_blockdsp_init_arm(BlockDSPContext *c);
+void ff_blockdsp_init_ppc(BlockDSPContext *c);
+void ff_blockdsp_init_x86(BlockDSPContext *c, AVCodecContext *avctx);
+void ff_blockdsp_init_mips(BlockDSPContext *c);
 
 #endif /* AVCODEC_BLOCKDSP_H */
diff --git a/libavcodec/bmp.c b/libavcodec/bmp.c
index f545e784b8..3019d01016 100644
--- a/libavcodec/bmp.c
+++ b/libavcodec/bmp.c
@@ -2,20 +2,20 @@
  * BMP image format decoder
  * Copyright (c) 2005 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,7 +40,8 @@ static int bmp_decode_frame(AVCodecContext *avctx,
     BiCompression comp;
     unsigned int ihsize;
     int i, j, n, linesize, ret;
-    uint32_t rgb[3];
+    uint32_t rgb[3] = {0};
+    uint32_t alpha = 0;
     uint8_t *ptr;
     int dsize;
     const uint8_t *buf0 = buf;
@@ -69,7 +70,7 @@ static int bmp_decode_frame(AVCodecContext *avctx,
 
     hsize  = bytestream_get_le32(&buf); /* header size */
     ihsize = bytestream_get_le32(&buf); /* more header size */
-    if (ihsize + 14 > hsize) {
+    if (ihsize + 14LL > hsize) {
         av_log(avctx, AV_LOG_ERROR, "invalid header size %u\n", hsize);
         return AVERROR_INVALIDDATA;
     }
@@ -86,7 +87,8 @@ static int bmp_decode_frame(AVCodecContext *avctx,
     }
 
     switch (ihsize) {
-    case  40: // windib v3
+    case  40: // windib
+    case  56: // windib v3
     case  64: // OS/2 v2
     case 108: // windib v4
     case 124: // windib v5
@@ -110,7 +112,7 @@ static int bmp_decode_frame(AVCodecContext *avctx,
 
     depth = bytestream_get_le16(&buf);
 
-    if (ihsize == 40)
+    if (ihsize >= 40)
         comp = bytestream_get_le32(&buf);
     else
         comp = BMP_RGB;
@@ -126,6 +128,8 @@ static int bmp_decode_frame(AVCodecContext *avctx,
         rgb[0] = bytestream_get_le32(&buf);
         rgb[1] = bytestream_get_le32(&buf);
         rgb[2] = bytestream_get_le32(&buf);
+        if (ihsize > 40)
+        alpha = bytestream_get_le32(&buf);
     }
 
     avctx->width  = width;
@@ -136,21 +140,21 @@ static int bmp_decode_frame(AVCodecContext *avctx,
     switch (depth) {
     case 32:
         if (comp == BMP_BITFIELDS) {
-            rgb[0] = (rgb[0] >> 15) & 3;
-            rgb[1] = (rgb[1] >> 15) & 3;
-            rgb[2] = (rgb[2] >> 15) & 3;
-
-            if (rgb[0] + rgb[1] + rgb[2] != 3 ||
-                rgb[0] == rgb[1] || rgb[0] == rgb[2] || rgb[1] == rgb[2]) {
-                break;
+            if (rgb[0] == 0xFF000000 && rgb[1] == 0x00FF0000 && rgb[2] == 0x0000FF00)
+                avctx->pix_fmt = alpha ? AV_PIX_FMT_ABGR : AV_PIX_FMT_0BGR;
+            else if (rgb[0] == 0x00FF0000 && rgb[1] == 0x0000FF00 && rgb[2] == 0x000000FF)
+                avctx->pix_fmt = alpha ? AV_PIX_FMT_BGRA : AV_PIX_FMT_BGR0;
+            else if (rgb[0] == 0x0000FF00 && rgb[1] == 0x00FF0000 && rgb[2] == 0xFF000000)
+                avctx->pix_fmt = alpha ? AV_PIX_FMT_ARGB : AV_PIX_FMT_0RGB;
+            else if (rgb[0] == 0x000000FF && rgb[1] == 0x0000FF00 && rgb[2] == 0x00FF0000)
+                avctx->pix_fmt = alpha ? AV_PIX_FMT_RGBA : AV_PIX_FMT_RGB0;
+            else {
+                av_log(avctx, AV_LOG_ERROR, "Unknown bitfields %0X %0X %0X\n", rgb[0], rgb[1], rgb[2]);
+                return AVERROR(EINVAL);
             }
         } else {
-            rgb[0] = 2;
-            rgb[1] = 1;
-            rgb[2] = 0;
+            avctx->pix_fmt = AV_PIX_FMT_BGRA;
         }
-
-        avctx->pix_fmt = AV_PIX_FMT_BGR24;
         break;
     case 24:
         avctx->pix_fmt = AV_PIX_FMT_BGR24;
@@ -199,10 +203,8 @@ static int bmp_decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
@@ -210,12 +212,16 @@ static int bmp_decode_frame(AVCodecContext *avctx,
     dsize = buf_size - hsize;
 
     /* Line size in file multiple of 4 */
-    n = ((avctx->width * depth) / 8 + 3) & ~3;
+    n = ((avctx->width * depth + 31) / 8) & ~3;
 
     if (n * avctx->height > dsize && comp != BMP_RLE4 && comp != BMP_RLE8) {
-        av_log(avctx, AV_LOG_ERROR, "not enough data (%d < %d)\n",
-               dsize, n * avctx->height);
-        return AVERROR_INVALIDDATA;
+        n = (avctx->width * depth + 7) / 8;
+        if (n * avctx->height > dsize) {
+            av_log(avctx, AV_LOG_ERROR, "not enough data (%d < %d)\n",
+                   dsize, n * avctx->height);
+            return AVERROR_INVALIDDATA;
+        }
+        av_log(avctx, AV_LOG_ERROR, "data size too small, assuming missing line alignment\n");
     }
 
     // RLE may skip decoding some picture areas, so blank picture before decoding
@@ -246,20 +252,26 @@ static int bmp_decode_frame(AVCodecContext *avctx,
             } else if (t) {
                 colors = t;
             }
+        } else {
+            colors = FFMIN(256, (hsize-ihsize-14) / 3);
         }
         buf = buf0 + 14 + ihsize; //palette location
         // OS/2 bitmap, 3 bytes per palette entry
         if ((hsize-ihsize-14) < (colors << 2)) {
+            if ((hsize-ihsize-14) < colors * 3) {
+                av_log(avctx, AV_LOG_ERROR, "palette doesn't fit in packet\n");
+                return AVERROR_INVALIDDATA;
+            }
             for (i = 0; i < colors; i++)
-                ((uint32_t*)p->data[1])[i] = bytestream_get_le24(&buf);
+                ((uint32_t*)p->data[1])[i] = (0xFFU<<24) | bytestream_get_le24(&buf);
         } else {
             for (i = 0; i < colors; i++)
-                ((uint32_t*)p->data[1])[i] = bytestream_get_le32(&buf);
+                ((uint32_t*)p->data[1])[i] = 0xFFU << 24 | bytestream_get_le32(&buf);
         }
         buf = buf0 + hsize;
     }
     if (comp == BMP_RLE4 || comp == BMP_RLE8) {
-        if (height < 0) {
+        if (comp == BMP_RLE8 && height < 0) {
             p->data[0]    +=  p->linesize[0] * (avctx->height - 1);
             p->linesize[0] = -p->linesize[0];
         }
@@ -290,6 +302,7 @@ static int bmp_decode_frame(AVCodecContext *avctx,
             break;
         case 8:
         case 24:
+        case 32:
             for (i = 0; i < avctx->height; i++) {
                 memcpy(ptr, buf, n);
                 buf += n;
@@ -319,28 +332,25 @@ static int bmp_decode_frame(AVCodecContext *avctx,
                 ptr += linesize;
             }
             break;
-        case 32:
-            for (i = 0; i < avctx->height; i++) {
-                const uint8_t *src = buf;
-                uint8_t *dst       = ptr;
-
-                for (j = 0; j < avctx->width; j++) {
-                    dst[0] = src[rgb[2]];
-                    dst[1] = src[rgb[1]];
-                    dst[2] = src[rgb[0]];
-                    dst += 3;
-                    src += 4;
-                }
-
-                buf += n;
-                ptr += linesize;
-            }
-            break;
         default:
             av_log(avctx, AV_LOG_ERROR, "BMP decoder is broken\n");
             return AVERROR_INVALIDDATA;
         }
     }
+    if (avctx->pix_fmt == AV_PIX_FMT_BGRA) {
+        for (i = 0; i < avctx->height; i++) {
+            int j;
+            uint8_t *ptr = p->data[0] + p->linesize[0]*i + 3;
+            for (j = 0; j < avctx->width; j++) {
+                if (ptr[4*j])
+                    break;
+            }
+            if (j < avctx->width)
+                break;
+        }
+        if (i == avctx->height)
+            avctx->pix_fmt = p->format = AV_PIX_FMT_BGR0;
+    }
 
     *got_frame = 1;
 
diff --git a/libavcodec/bmp.h b/libavcodec/bmp.h
index a472f59d3c..fb21090678 100644
--- a/libavcodec/bmp.h
+++ b/libavcodec/bmp.h
@@ -2,20 +2,20 @@
  * internals for BMP codecs
  * Copyright (c) 2005 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bmp_parser.c b/libavcodec/bmp_parser.c
index b85dd8bcb5..c9493dc32d 100644
--- a/libavcodec/bmp_parser.c
+++ b/libavcodec/bmp_parser.c
@@ -2,20 +2,20 @@
  * BMP parser
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,21 +45,32 @@ static int bmp_parse(AVCodecParserContext *s, AVCodecContext *avctx,
     int i = 0;
 
     *poutbuf_size = 0;
-    if (buf_size == 0)
-        return 0;
 
-    if (!bpc->pc.frame_start_found) {
+restart:
+    if (bpc->pc.frame_start_found <= 2+4+4) {
         for (; i < buf_size; i++) {
             state = (state << 8) | buf[i];
-            if ((state >> 48) == (('B' << 8) | 'M')) {
-                bpc->fsize = av_bswap32(state >> 16);
-                bpc->pc.frame_start_found = 1;
-                if (bpc->fsize > buf_size - i + 7)
-                    bpc->remaining_size = bpc->fsize - buf_size + i - 7;
-                else
-                    next = bpc->fsize + i - 7;
-                break;
-            }
+            if (bpc->pc.frame_start_found == 0) {
+                if ((state >> 48) == (('B' << 8) | 'M')) {
+                    bpc->fsize = av_bswap32(state >> 16);
+                    bpc->pc.frame_start_found = 1;
+                }
+            } else if (bpc->pc.frame_start_found == 2+4+4) {
+//                 unsigned hsize = av_bswap32(state>>32);
+                unsigned ihsize = av_bswap32(state);
+                if (ihsize < 12 || ihsize > 200) {
+                    bpc->pc.frame_start_found = 0;
+                    continue;
+                }
+                bpc->pc.frame_start_found++;
+                bpc->remaining_size = bpc->fsize + i - 17;
+
+                if (bpc->pc.index + i > 17) {
+                    next = i - 17;
+                } else
+                    goto restart;
+            } else if (bpc->pc.frame_start_found)
+                bpc->pc.frame_start_found++;
         }
         bpc->pc.state64 = state;
     } else {
@@ -68,7 +79,9 @@ static int bmp_parse(AVCodecParserContext *s, AVCodecContext *avctx,
             bpc->remaining_size -= i;
             if (bpc->remaining_size)
                 goto flush;
-            next = i;
+
+            bpc->pc.frame_start_found = 0;
+            goto restart;
         }
     }
 
diff --git a/libavcodec/bmpenc.c b/libavcodec/bmpenc.c
index 915c3968a7..e829d68475 100644
--- a/libavcodec/bmpenc.c
+++ b/libavcodec/bmpenc.c
@@ -3,24 +3,25 @@
  * Copyright (c) 2006, 2007 Michel Bardiaux
  * Copyright (c) 2009 Daniel Verkamp <daniel at drv.nu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "bmp.h"
@@ -32,6 +33,9 @@ static const uint32_t rgb444_masks[]  = { 0x0F00, 0x00F0, 0x000F };
 
 static av_cold int bmp_encode_init(AVCodecContext *avctx){
     switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_BGRA:
+        avctx->bits_per_coded_sample = 32;
+        break;
     case AV_PIX_FMT_BGR24:
         avctx->bits_per_coded_sample = 24;
         break;
@@ -53,7 +57,7 @@ static av_cold int bmp_encode_init(AVCodecContext *avctx){
         break;
     default:
         av_log(avctx, AV_LOG_INFO, "unsupported pixel format\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     return 0;
@@ -65,6 +69,7 @@ static int bmp_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     const AVFrame * const p = pict;
     int n_bytes_image, n_bytes_per_row, n_bytes, i, n, hsize, ret;
     const uint32_t *pal = NULL;
+    uint32_t palette256[256];
     int pad_bytes_per_row, pal_entries = 0, compression = BMP_RGB;
     int bit_count = avctx->bits_per_coded_sample;
     uint8_t *ptr, *buf;
@@ -91,7 +96,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
     case AV_PIX_FMT_RGB4_BYTE:
     case AV_PIX_FMT_BGR4_BYTE:
     case AV_PIX_FMT_GRAY8:
-        avpriv_set_systematic_pal2((uint32_t*)p->data[1], avctx->pix_fmt);
+        av_assert1(bit_count == 8);
+        avpriv_set_systematic_pal2(palette256, avctx->pix_fmt);
+        pal = palette256;
+        break;
     case AV_PIX_FMT_PAL8:
         pal = (uint32_t *)p->data[1];
         break;
@@ -110,10 +118,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #define SIZE_BITMAPINFOHEADER 40
     hsize = SIZE_BITMAPFILEHEADER + SIZE_BITMAPINFOHEADER + (pal_entries << 2);
     n_bytes = n_bytes_image + hsize;
-    if ((ret = ff_alloc_packet(pkt, n_bytes)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, n_bytes, 0)) < 0)
         return ret;
-    }
     buf = pkt->data;
     bytestream_put_byte(&buf, 'B');                   // BITMAPFILEHEADER.bfType
     bytestream_put_byte(&buf, 'M');                   // do.
@@ -165,8 +171,8 @@ AVCodec ff_bmp_encoder = {
     .init           = bmp_encode_init,
     .encode2        = bmp_encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_BGR24,
-        AV_PIX_FMT_RGB555, AV_PIX_FMT_RGB444, AV_PIX_FMT_RGB565,
+        AV_PIX_FMT_BGRA, AV_PIX_FMT_BGR24,
+        AV_PIX_FMT_RGB565, AV_PIX_FMT_RGB555, AV_PIX_FMT_RGB444,
         AV_PIX_FMT_RGB8, AV_PIX_FMT_BGR8, AV_PIX_FMT_RGB4_BYTE, AV_PIX_FMT_BGR4_BYTE, AV_PIX_FMT_GRAY8, AV_PIX_FMT_PAL8,
         AV_PIX_FMT_MONOBLACK,
         AV_PIX_FMT_NONE
diff --git a/libavcodec/bmvaudio.c b/libavcodec/bmvaudio.c
index 8b4bd784a1..b1587ab366 100644
--- a/libavcodec/bmvaudio.c
+++ b/libavcodec/bmvaudio.c
@@ -2,20 +2,20 @@
  * Discworld II BMV audio decoder
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,10 +58,8 @@ static int bmv_aud_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = total_blocks * 32;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     output_samples = (int16_t *)frame->data[0];
 
     for (blocks = 0; blocks < total_blocks; blocks++) {
diff --git a/libavcodec/bmvvideo.c b/libavcodec/bmvvideo.c
index f4b8f29f55..97f850dbae 100644
--- a/libavcodec/bmvvideo.c
+++ b/libavcodec/bmvvideo.c
@@ -2,23 +2,24 @@
  * Discworld II BMV video decoder
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 
 #include "avcodec.h"
@@ -50,7 +51,7 @@ typedef struct BMVDecContext {
     const uint8_t *stream;
 } BMVDecContext;
 
-#define NEXT_BYTE(v) v = forward ? v + 1 : v - 1;
+#define NEXT_BYTE(v) (v) = forward ? (v) + 1 : (v) - 1;
 
 static int decode_bmv_frame(const uint8_t *source, int src_len, uint8_t *frame, int frame_off)
 {
@@ -100,6 +101,8 @@ static int decode_bmv_frame(const uint8_t *source, int src_len, uint8_t *frame,
         }
         if (!(val & 0xC)) {
             for (;;) {
+                if(shift>22)
+                    return -1;
                 if (!read_two_nibbles) {
                     if (src < source || src >= source_end)
                         return AVERROR_INVALIDDATA;
@@ -133,6 +136,7 @@ static int decode_bmv_frame(const uint8_t *source, int src_len, uint8_t *frame,
         }
         advance_mode = val & 1;
         len = (val >> 1) - 1;
+        av_assert0(len>0);
         mode += 1 + advance_mode;
         if (mode >= 4)
             mode -= 3;
@@ -185,8 +189,6 @@ static int decode_bmv_frame(const uint8_t *source, int src_len, uint8_t *frame,
                 memset(dst, val, len);
             }
             break;
-        default:
-            break;
         }
         if (dst == dst_end)
             return 0;
@@ -227,7 +229,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return AVERROR_INVALIDDATA;
         }
         for (i = 0; i < 256; i++)
-            c->pal[i] = bytestream_get_be24(&c->stream);
+            c->pal[i] = 0xFFU << 24 | bytestream_get_be24(&c->stream);
     }
     if (type & BMV_SCROLL) {
         if (c->stream - pkt->data > pkt->size - 2) {
@@ -241,10 +243,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         scr_off = 0;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if (decode_bmv_frame(c->stream, pkt->size - (c->stream - pkt->data), c->frame, scr_off)) {
         av_log(avctx, AV_LOG_ERROR, "Error decoding frame data\n");
@@ -276,6 +276,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
     c->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
+    if (avctx->width != SCREEN_WIDE || avctx->height != SCREEN_HIGH) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid dimension %dx%d\n", avctx->width, avctx->height);
+        return AVERROR_INVALIDDATA;
+    }
+
     c->frame = c->frame_base + 640;
 
     return 0;
diff --git a/libavcodec/brenderpix.c b/libavcodec/brenderpix.c
index a4b4c874c4..0556858de1 100644
--- a/libavcodec/brenderpix.c
+++ b/libavcodec/brenderpix.c
@@ -2,20 +2,20 @@
  * BRender PIX (.pix) image decoder
  * Copyright (c) 2012 Aleksi Nurmi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -134,7 +134,7 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 {
     AVFrame *frame = data;
 
-    int ret, i, j;
+    int ret, i;
     GetByteContext gb;
 
     unsigned int bytes_pp;
@@ -142,6 +142,7 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     unsigned int chunk_type;
     unsigned int data_len;
     unsigned int bytes_per_scanline;
+    unsigned int bytes_left;
     PixHeader hdr;
 
     bytestream2_init(&gb, avpkt->data, avpkt->size);
@@ -168,7 +169,7 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     ret = pix_decode_header(&hdr, &gb);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Invalid header length.\n");
-        return AVERROR_INVALIDDATA;
+        return ret;
     }
     switch (hdr.format) {
     case 3:
@@ -187,7 +188,10 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         avctx->pix_fmt = AV_PIX_FMT_RGB24;
         bytes_pp = 3;
         break;
-    case 7: // XRGB
+    case 7:
+        avctx->pix_fmt = AV_PIX_FMT_0RGB;
+        bytes_pp = 4;
+        break;
     case 8: // ARGB
         avctx->pix_fmt = AV_PIX_FMT_ARGB;
         bytes_pp = 4;
@@ -219,22 +223,21 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         ret = pix_decode_header(&palhdr, &gb);
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR, "Invalid palette header length.\n");
-            return AVERROR_INVALIDDATA;
+            return ret;
         }
         if (palhdr.format != 7)
             avpriv_request_sample(avctx, "Palette not in RGB format");
 
         chunk_type = bytestream2_get_be32(&gb);
         data_len = bytestream2_get_be32(&gb);
-        if (chunk_type != IMAGE_DATA_CHUNK ||
-            bytestream2_get_bytes_left(&gb) < data_len) {
+        bytestream2_skip(&gb, 8);
+        if (chunk_type != IMAGE_DATA_CHUNK || data_len != 1032 ||
+            bytestream2_get_bytes_left(&gb) < 1032) {
             av_log(avctx, AV_LOG_ERROR, "Invalid palette data.\n");
             return AVERROR_INVALIDDATA;
         }
-
         // palette data is surrounded by 8 null bytes (both top and bottom)
-        bytestream2_skip(&gb, 8);
-        // convert to machine endian format (ARGB)
+        // convert 0RGB to machine endian format (ARGB32)
         for (i = 0; i < 256; ++i)
             *pal_out++ = (0xFFU << 24) | bytestream2_get_be32u(&gb);
         bytestream2_skip(&gb, 8);
@@ -259,9 +262,10 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     // read the image data to the buffer
     bytes_per_scanline = bytes_pp * hdr.width;
-    if (chunk_type != IMAGE_DATA_CHUNK ||
-        data_len < bytes_per_scanline * hdr.height ||
-        bytestream2_get_bytes_left(&gb) < data_len) {
+    bytes_left = bytestream2_get_bytes_left(&gb);
+
+    if (chunk_type != IMAGE_DATA_CHUNK || data_len != bytes_left ||
+        bytes_left / bytes_per_scanline < hdr.height) {
         av_log(avctx, AV_LOG_ERROR, "Invalid image data.\n");
         return AVERROR_INVALIDDATA;
     }
@@ -271,12 +275,6 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         bytes_per_scanline,
                         bytes_per_scanline, hdr.height);
 
-    // make alpha opaque for XRGB
-    if (hdr.format == 7)
-        for (j = 0; j < frame->height; j++)
-            for (i = 0; i < frame->linesize[0]; i += 4)
-                frame->data[0][j * frame->linesize[0] + i] = 0xFF;
-
     frame->pict_type = AV_PICTURE_TYPE_I;
     frame->key_frame = 1;
     *got_frame = 1;
diff --git a/libavcodec/bswapdsp.c b/libavcodec/bswapdsp.c
index 6700cfd980..a6e1ec069c 100644
--- a/libavcodec/bswapdsp.c
+++ b/libavcodec/bswapdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bswapdsp.h b/libavcodec/bswapdsp.h
index fd10a8892c..f167d77fea 100644
--- a/libavcodec/bswapdsp.h
+++ b/libavcodec/bswapdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bytestream.h b/libavcodec/bytestream.h
index cb3573b869..7c05ea6cf5 100644
--- a/libavcodec/bytestream.h
+++ b/libavcodec/bytestream.h
@@ -3,20 +3,20 @@
  * copyright (c) 2006 Baptiste Coudurier <baptiste.coudurier@free.fr>
  * Copyright (c) 2012 Aneesh Dogra (lionaneesh) <lionaneesh@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include <stdint.h>
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 
@@ -133,6 +134,7 @@ static av_always_inline void bytestream2_init(GetByteContext *g,
                                               const uint8_t *buf,
                                               int buf_size)
 {
+    av_assert0(buf_size >= 0);
     g->buffer       = buf;
     g->buffer_start = buf;
     g->buffer_end   = buf + buf_size;
@@ -142,6 +144,7 @@ static av_always_inline void bytestream2_init_writer(PutByteContext *p,
                                                      uint8_t *buf,
                                                      int buf_size)
 {
+    av_assert0(buf_size >= 0);
     p->buffer       = buf;
     p->buffer_start = buf;
     p->buffer_end   = buf + buf_size;
diff --git a/libavcodec/c93.c b/libavcodec/c93.c
index 18df9586d0..fd026acb8a 100644
--- a/libavcodec/c93.c
+++ b/libavcodec/c93.c
@@ -2,20 +2,20 @@
  * Interplay C93 video decoder
  * Copyright (c) 2007 Anssi Hannula <anssi.hannula@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -133,12 +133,13 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     uint8_t *out;
     int stride, ret, i, x, y, b, bt = 0;
 
+    if ((ret = ff_set_dimensions(avctx, WIDTH, HEIGHT)) < 0)
+        return ret;
+
     c93->currentpic ^= 1;
 
-    if ((ret = ff_reget_buffer(avctx, newpic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, newpic)) < 0)
         return ret;
-    }
 
     stride = newpic->linesize[0];
 
@@ -176,7 +177,14 @@ static int decode_frame(AVCodecContext *avctx, void *data,
             case C93_4X4_FROM_PREV:
                 for (j = 0; j < 8; j += 4) {
                     for (i = 0; i < 8; i += 4) {
-                        offset = bytestream2_get_le16(&gb);
+                        int offset = bytestream2_get_le16(&gb);
+                        int from_x = offset % WIDTH;
+                        int from_y = offset / WIDTH;
+                        if (block_type == C93_4X4_FROM_CURR && from_y == y+j &&
+                            (FFABS(from_x - x-i) < 4 || FFABS(from_x - x-i) > WIDTH-4)) {
+                            avpriv_request_sample(avctx, "block overlap %d %d %d %d", from_x, x+i, from_y, y+j);
+                            return AVERROR_INVALIDDATA;
+                        }
                         if ((ret = copy_block(avctx, &out[j*stride+i],
                                               copy_from, offset, 4, stride)) < 0)
                             return ret;
@@ -236,7 +244,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     if (b & C93_HAS_PALETTE) {
         uint32_t *palette = (uint32_t *) newpic->data[1];
         for (i = 0; i < 256; i++) {
-            palette[i] = bytestream2_get_be24(&gb);
+            palette[i] = 0xFFU << 24 | bytestream2_get_be24(&gb);
         }
         newpic->palette_has_changed = 1;
     } else {
diff --git a/libavcodec/cabac.c b/libavcodec/cabac.c
index b6f56f05ec..8cc9333e09 100644
--- a/libavcodec/cabac.c
+++ b/libavcodec/cabac.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,80 +27,25 @@
 #include <string.h>
 
 #include "libavutil/common.h"
+#include "libavutil/timer.h"
 #include "get_bits.h"
 #include "cabac.h"
 #include "cabac_functions.h"
 
-uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63] = {
- 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5,
- 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-};
-
-static const uint8_t lps_range[64][4]= {
-{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205},
-{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166},
-{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135},
-{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110},
-{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89},
-{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72},
-{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59},
-{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48},
-{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39},
-{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31},
-{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25},
-{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21},
-{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17},
-{ 10, 12, 14, 16}, {  9, 11, 13, 15}, {  9, 11, 12, 14}, {  8, 10, 12, 14},
-{  8,  9, 11, 13}, {  7,  9, 11, 12}, {  7,  9, 10, 12}, {  7,  8, 10, 11},
-{  6,  8,  9, 11}, {  6,  7,  9, 10}, {  6,  7,  8,  9}, {  2,  2,  2,  2},
-};
-
-static const uint8_t mps_state[64]= {
-  1, 2, 3, 4, 5, 6, 7, 8,
-  9,10,11,12,13,14,15,16,
- 17,18,19,20,21,22,23,24,
- 25,26,27,28,29,30,31,32,
- 33,34,35,36,37,38,39,40,
- 41,42,43,44,45,46,47,48,
- 49,50,51,52,53,54,55,56,
- 57,58,59,60,61,62,62,63,
-};
-
-static const uint8_t lps_state[64]= {
-  0, 0, 1, 2, 2, 4, 4, 5,
-  6, 7, 8, 9, 9,11,11,12,
- 13,13,15,15,16,16,18,18,
- 19,19,21,21,22,22,23,24,
- 24,25,26,26,27,27,28,29,
- 29,30,30,30,31,32,32,33,
- 33,33,34,34,35,35,35,36,
- 36,36,37,37,37,38,38,63,
-};
-
-static const uint8_t last_coeff_flag_offset_8x8[63] = {
- 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
- 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
-};
+#include "cabac_tablegen.h"
+
+/**
+ *
+ * @param buf_size size of buf in bits
+ */
+void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size){
+    init_put_bits(&c->pb, buf, buf_size);
+
+    c->low= 0;
+    c->range= 0x1FE;
+    c->outstanding_count= 0;
+    c->pb.bit_left++; //avoids firstBitFlag
+}
 
 /**
  *
@@ -123,32 +68,158 @@ void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
 
 void ff_init_cabac_states(void)
 {
-    int i, j;
     static int initialized = 0;
 
     if (initialized)
         return;
 
-    for(i=0; i<64; i++){
-        for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save
-            ff_h264_lps_range[j*2*64+2*i+0]=
-            ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j];
-        }
+    cabac_tableinit();
+
+    initialized = 1;
+}
 
-        ff_h264_mlps_state[128 + 2 * i + 0] = 2 * mps_state[i] + 0;
-        ff_h264_mlps_state[128 + 2 * i + 1] = 2 * mps_state[i] + 1;
+#ifdef TEST
+#define SIZE 10240
 
-        if( i ){
-            ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0;
-            ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1;
+#include "libavutil/lfg.h"
+#include "avcodec.h"
+
+static inline void put_cabac_bit(CABACContext *c, int b){
+    put_bits(&c->pb, 1, b);
+    for(;c->outstanding_count; c->outstanding_count--){
+        put_bits(&c->pb, 1, 1-b);
+    }
+}
+
+static inline void renorm_cabac_encoder(CABACContext *c){
+    while(c->range < 0x100){
+        //FIXME optimize
+        if(c->low<0x100){
+            put_cabac_bit(c, 0);
+        }else if(c->low<0x200){
+            c->outstanding_count++;
+            c->low -= 0x100;
         }else{
-            ff_h264_mlps_state[128-2*i-1]= 1;
-            ff_h264_mlps_state[128-2*i-2]= 0;
+            put_cabac_bit(c, 1);
+            c->low -= 0x200;
         }
+
+        c->range+= c->range;
+        c->low += c->low;
     }
-    for(i=0; i< 63; i++){
-      ff_h264_last_coeff_flag_offset_8x8[i] = last_coeff_flag_offset_8x8[i];
+}
+
+static void put_cabac(CABACContext *c, uint8_t * const state, int bit){
+    int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + *state];
+
+    if(bit == ((*state)&1)){
+        c->range -= RangeLPS;
+        *state    = ff_h264_mlps_state[128 + *state];
+    }else{
+        c->low += c->range - RangeLPS;
+        c->range = RangeLPS;
+        *state= ff_h264_mlps_state[127 - *state];
     }
 
-    initialized = 1;
+    renorm_cabac_encoder(c);
+}
+
+/**
+ * @param bit 0 -> write zero bit, !=0 write one bit
+ */
+static void put_cabac_bypass(CABACContext *c, int bit){
+    c->low += c->low;
+
+    if(bit){
+        c->low += c->range;
+    }
+//FIXME optimize
+    if(c->low<0x200){
+        put_cabac_bit(c, 0);
+    }else if(c->low<0x400){
+        c->outstanding_count++;
+        c->low -= 0x200;
+    }else{
+        put_cabac_bit(c, 1);
+        c->low -= 0x400;
+    }
+}
+
+/**
+ *
+ * @return the number of bytes written
+ */
+static int put_cabac_terminate(CABACContext *c, int bit){
+    c->range -= 2;
+
+    if(!bit){
+        renorm_cabac_encoder(c);
+    }else{
+        c->low += c->range;
+        c->range= 2;
+
+        renorm_cabac_encoder(c);
+
+        av_assert0(c->low <= 0x1FF);
+        put_cabac_bit(c, c->low>>9);
+        put_bits(&c->pb, 2, ((c->low>>7)&3)|1);
+
+        flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong
+    }
+
+    return (put_bits_count(&c->pb)+7)>>3;
 }
+
+int main(void){
+    CABACContext c;
+    uint8_t b[9*SIZE];
+    uint8_t r[9*SIZE];
+    int i, ret = 0;
+    uint8_t state[10]= {0};
+    AVLFG prng;
+
+    av_lfg_init(&prng, 1);
+    ff_init_cabac_encoder(&c, b, SIZE);
+    ff_init_cabac_states();
+
+    for(i=0; i<SIZE; i++){
+        if(2*i<SIZE) r[i] = av_lfg_get(&prng) % 7;
+        else         r[i] = (i>>8)&1;
+    }
+
+    for(i=0; i<SIZE; i++){
+        put_cabac_bypass(&c, r[i]&1);
+    }
+
+    for(i=0; i<SIZE; i++){
+        put_cabac(&c, state, r[i]&1);
+    }
+
+    put_cabac_terminate(&c, 1);
+
+    ff_init_cabac_decoder(&c, b, SIZE);
+
+    memset(state, 0, sizeof(state));
+
+    for(i=0; i<SIZE; i++){
+        if( (r[i]&1) != get_cabac_bypass(&c) ) {
+            av_log(NULL, AV_LOG_ERROR, "CABAC bypass failure at %d\n", i);
+            ret = 1;
+        }
+    }
+
+    for(i=0; i<SIZE; i++){
+        if( (r[i]&1) != get_cabac_noinline(&c, state) ) {
+            av_log(NULL, AV_LOG_ERROR, "CABAC failure at %d\n", i);
+            ret = 1;
+        }
+    }
+    if(!get_cabac_terminate(&c)) {
+        av_log(NULL, AV_LOG_ERROR, "where's the Terminator?\n");
+        ret = 1;
+    }
+
+    return ret;
+}
+
+#endif /* TEST */
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 426f338e34..f9eafed105 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,12 @@
 
 #include "put_bits.h"
 
-extern uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
+#if CONFIG_HARDCODED_TABLES
+#define CABAC_TABLE_CONST const
+#else
+#define CABAC_TABLE_CONST
+#endif
+extern CABAC_TABLE_CONST uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
 #define H264_NORM_SHIFT_OFFSET 0
 #define H264_LPS_RANGE_OFFSET 512
 #define H264_MLPS_STATE_OFFSET 1024
@@ -43,11 +48,14 @@ extern uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
 typedef struct CABACContext{
     int low;
     int range;
+    int outstanding_count;
     const uint8_t *bytestream_start;
     const uint8_t *bytestream;
     const uint8_t *bytestream_end;
+    PutBitContext pb;
 }CABACContext;
 
+void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size);
 void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
 void ff_init_cabac_states(void);
 
diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index 4b8f1bca8a..15dba29f8e 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,10 @@
 #include "cabac.h"
 #include "config.h"
 
+#ifndef UNCHECKED_BITSTREAM_READER
+#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
+#endif
+
 #if ARCH_AARCH64
 #   include "aarch64/cabac.h"
 #endif
@@ -42,10 +46,10 @@
 #   include "x86/cabac.h"
 #endif
 
-static uint8_t * const ff_h264_norm_shift = ff_h264_cabac_tables + H264_NORM_SHIFT_OFFSET;
-static uint8_t * const ff_h264_lps_range = ff_h264_cabac_tables + H264_LPS_RANGE_OFFSET;
-static uint8_t * const ff_h264_mlps_state = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET;
-static uint8_t * const ff_h264_last_coeff_flag_offset_8x8 = ff_h264_cabac_tables + H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET;
+static CABAC_TABLE_CONST uint8_t * const ff_h264_norm_shift = ff_h264_cabac_tables + H264_NORM_SHIFT_OFFSET;
+static CABAC_TABLE_CONST uint8_t * const ff_h264_lps_range = ff_h264_cabac_tables + H264_LPS_RANGE_OFFSET;
+static CABAC_TABLE_CONST uint8_t * const ff_h264_mlps_state = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET;
+static CABAC_TABLE_CONST uint8_t * const ff_h264_last_coeff_flag_offset_8x8 = ff_h264_cabac_tables + H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET;
 
 static void refill(CABACContext *c){
 #if CABAC_BITS == 16
@@ -54,7 +58,9 @@ static void refill(CABACContext *c){
         c->low+= c->bytestream[0]<<1;
 #endif
     c->low -= CABAC_MASK;
+#if !UNCHECKED_BITSTREAM_READER
     if (c->bytestream < c->bytestream_end)
+#endif
         c->bytestream += CABAC_BITS / 8;
 }
 
@@ -82,7 +88,9 @@ static void refill2(CABACContext *c){
 #endif
 
     c->low += x<<i;
+#if !UNCHECKED_BITSTREAM_READER
     if (c->bytestream < c->bytestream_end)
+#endif
         c->bytestream += CABAC_BITS/8;
 }
 
diff --git a/libavcodec/cabac_tablegen.c b/libavcodec/cabac_tablegen.c
new file mode 100644
index 0000000000..ed5444759a
--- /dev/null
+++ b/libavcodec/cabac_tablegen.c
@@ -0,0 +1,41 @@
+/*
+ * Header file for hardcoded AAC SBR windows
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#include "libavutil/common.h"
+#include "cabac_functions.h"
+#undef CONFIG_HARDCODED_TABLES
+#define CONFIG_HARDCODED_TABLES 0
+av_const int av_log2(unsigned v) { int r = 0; while (v >>= 1) r++; return r; }
+#include "cabac_tablegen.h"
+#include "tableprint.h"
+
+int main(void)
+{
+    cabac_tableinit();
+
+    write_fileheader();
+
+    WRITE_ARRAY("const", uint8_t, ff_h264_cabac_tables);
+
+    return 0;
+}
diff --git a/libavcodec/cabac_tablegen.h b/libavcodec/cabac_tablegen.h
new file mode 100644
index 0000000000..a637912204
--- /dev/null
+++ b/libavcodec/cabac_tablegen.h
@@ -0,0 +1,108 @@
+/*
+ * Header file for hardcoded CABAC table
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_CABAC_TABLEGEN_H
+#define AVCODEC_CABAC_TABLEGEN_H
+
+#if CONFIG_HARDCODED_TABLES
+#define cabac_tableinit()
+#include "libavcodec/cabac_tables.h"
+#else
+uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
+
+static const uint8_t lps_range[64][4]= {
+{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205},
+{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166},
+{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135},
+{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110},
+{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89},
+{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72},
+{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59},
+{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48},
+{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39},
+{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31},
+{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25},
+{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21},
+{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17},
+{ 10, 12, 14, 16}, {  9, 11, 13, 15}, {  9, 11, 12, 14}, {  8, 10, 12, 14},
+{  8,  9, 11, 13}, {  7,  9, 11, 12}, {  7,  9, 10, 12}, {  7,  8, 10, 11},
+{  6,  8,  9, 11}, {  6,  7,  9, 10}, {  6,  7,  8,  9}, {  2,  2,  2,  2},
+};
+
+static const uint8_t mps_state[64]= {
+  1, 2, 3, 4, 5, 6, 7, 8,
+  9,10,11,12,13,14,15,16,
+ 17,18,19,20,21,22,23,24,
+ 25,26,27,28,29,30,31,32,
+ 33,34,35,36,37,38,39,40,
+ 41,42,43,44,45,46,47,48,
+ 49,50,51,52,53,54,55,56,
+ 57,58,59,60,61,62,62,63,
+};
+
+static const uint8_t lps_state[64]= {
+  0, 0, 1, 2, 2, 4, 4, 5,
+  6, 7, 8, 9, 9,11,11,12,
+ 13,13,15,15,16,16,18,18,
+ 19,19,21,21,22,22,23,24,
+ 24,25,26,26,27,27,28,29,
+ 29,30,30,30,31,32,32,33,
+ 33,33,34,34,35,35,35,36,
+ 36,36,37,37,37,38,38,63,
+};
+
+static const uint8_t last_coeff_flag_offset_8x8[63] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
+};
+
+static av_cold void cabac_tableinit(void)
+{
+    int i, j;
+    for (i = 0; i < 512; i++)
+        ff_h264_norm_shift[i] = i ? 8 - av_log2(i) : 9;
+
+    for(i=0; i<64; i++){
+        for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save
+            ff_h264_lps_range[j*2*64+2*i+0]=
+            ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j];
+        }
+        ff_h264_mlps_state[128 + 2 * i + 0] = 2 * mps_state[i] + 0;
+        ff_h264_mlps_state[128 + 2 * i + 1] = 2 * mps_state[i] + 1;
+
+        if( i ){
+            ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0;
+            ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1;
+        }else{
+            ff_h264_mlps_state[128-2*i-1]= 1;
+            ff_h264_mlps_state[128-2*i-2]= 0;
+        }
+    }
+    for(i=0; i< 63; i++){
+      ff_h264_last_coeff_flag_offset_8x8[i] = last_coeff_flag_offset_8x8[i];
+    }
+}
+#endif /* CONFIG_HARDCODED_TABLES */
+
+#endif /* AVCODEC_CABAC_TABLEGEN_H */
diff --git a/libavcodec/canopus.c b/libavcodec/canopus.c
index 729e7ef876..ea6cc647d3 100644
--- a/libavcodec/canopus.c
+++ b/libavcodec/canopus.c
@@ -2,20 +2,20 @@
  * Canopus common routines
  * Copyright (c) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/canopus.h b/libavcodec/canopus.h
index 9e5702da30..faa1e8dfbf 100644
--- a/libavcodec/canopus.h
+++ b/libavcodec/canopus.h
@@ -2,20 +2,20 @@
  * Canopus common routines
  * Copyright (c) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cavs.c b/libavcodec/cavs.c
index fac695843c..10a25d8749 100644
--- a/libavcodec/cavs.c
+++ b/libavcodec/cavs.c
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -75,15 +75,16 @@ static inline int get_bs(cavs_vector *mvP, cavs_vector *mvQ, int b)
 {
     if ((mvP->ref == REF_INTRA) || (mvQ->ref == REF_INTRA))
         return 2;
-    if ((abs(mvP->x - mvQ->x) >= 4) || (abs(mvP->y - mvQ->y) >= 4))
+    if((abs(mvP->x - mvQ->x) >= 4) ||
+       (abs(mvP->y - mvQ->y) >= 4) ||
+       (mvP->ref != mvQ->ref))
         return 1;
     if (b) {
         mvP += MV_BWD_OFFS;
         mvQ += MV_BWD_OFFS;
-        if ((abs(mvP->x - mvQ->x) >= 4) || (abs(mvP->y - mvQ->y) >= 4))
-            return 1;
-    } else {
-        if (mvP->ref != mvQ->ref)
+        if((abs(mvP->x - mvQ->x) >= 4) ||
+           (abs(mvP->y - mvQ->y) >= 4) ||
+           (mvP->ref != mvQ->ref))
             return 1;
     }
     return 0;
@@ -149,6 +150,8 @@ void ff_cavs_filter(AVSContext *h, enum cavs_mb mb_type)
                 qp_avg = (h->qp + h->left_qp + 1) >> 1;
                 SET_PARAMS;
                 h->cdsp.cavs_filter_lv(h->cy, h->l_stride, alpha, beta, tc, bs[0], bs[1]);
+                qp_avg = (ff_cavs_chroma_qp[h->qp] + ff_cavs_chroma_qp[h->left_qp] + 1) >> 1;
+                SET_PARAMS;
                 h->cdsp.cavs_filter_cv(h->cu, h->c_stride, alpha, beta, tc, bs[0], bs[1]);
                 h->cdsp.cavs_filter_cv(h->cv, h->c_stride, alpha, beta, tc, bs[0], bs[1]);
             }
@@ -161,6 +164,8 @@ void ff_cavs_filter(AVSContext *h, enum cavs_mb mb_type)
                 qp_avg = (h->qp + h->top_qp[h->mbx] + 1) >> 1;
                 SET_PARAMS;
                 h->cdsp.cavs_filter_lh(h->cy, h->l_stride, alpha, beta, tc, bs[4], bs[5]);
+                qp_avg = (ff_cavs_chroma_qp[h->qp] + ff_cavs_chroma_qp[h->top_qp[h->mbx]] + 1) >> 1;
+                SET_PARAMS;
                 h->cdsp.cavs_filter_ch(h->cu, h->c_stride, alpha, beta, tc, bs[4], bs[5]);
                 h->cdsp.cavs_filter_ch(h->cv, h->c_stride, alpha, beta, tc, bs[4], bs[5]);
             }
@@ -234,9 +239,14 @@ void ff_cavs_load_intra_pred_chroma(AVSContext *h)
     /* extend borders by one pixel */
     h->left_border_u[9]              = h->left_border_u[8];
     h->left_border_v[9]              = h->left_border_v[8];
-    h->top_border_u[h->mbx * 10 + 9] = h->top_border_u[h->mbx * 10 + 8];
-    h->top_border_v[h->mbx * 10 + 9] = h->top_border_v[h->mbx * 10 + 8];
-    if (h->mbx && h->mby) {
+    if(h->flags & C_AVAIL) {
+        h->top_border_u[h->mbx*10 + 9] = h->top_border_u[h->mbx*10 + 11];
+        h->top_border_v[h->mbx*10 + 9] = h->top_border_v[h->mbx*10 + 11];
+    } else {
+        h->top_border_u[h->mbx * 10 + 9] = h->top_border_u[h->mbx * 10 + 8];
+        h->top_border_v[h->mbx * 10 + 9] = h->top_border_v[h->mbx * 10 + 8];
+    }
+    if((h->flags & A_AVAIL) && (h->flags & B_AVAIL)) {
         h->top_border_u[h->mbx * 10] = h->left_border_u[0] = h->topleft_border_u;
         h->top_border_v[h->mbx * 10] = h->left_border_v[0] = h->topleft_border_v;
     } else {
@@ -528,7 +538,7 @@ void ff_cavs_inter(AVSContext *h, enum cavs_mb mb_type)
 static inline void scale_mv(AVSContext *h, int *d_x, int *d_y,
                             cavs_vector *src, int distp)
 {
-    int den = h->scale_den[src->ref];
+    int den = h->scale_den[FFMAX(src->ref, 0)];
 
     *d_x = (src->x * distp * den + 256 + FF_SIGNBIT(src->x)) >> 9;
     *d_y = (src->y * distp * den + 256 + FF_SIGNBIT(src->y)) >> 9;
@@ -575,7 +585,7 @@ void ff_cavs_mv(AVSContext *h, enum cavs_mv_loc nP, enum cavs_mv_loc nC,
 
     mvP->ref  = ref;
     mvP->dist = h->dist[mvP->ref];
-    if (mvC->ref == NOT_AVAIL)
+    if (mvC->ref == NOT_AVAIL || (nP == MV_FWD_X3) || (nP == MV_BWD_X3 ))
         mvC = &h->mv[nP - 5];  // set to top-left (mvD)
     if (mode == MV_PRED_PSKIP &&
         (mvA->ref == NOT_AVAIL ||
@@ -705,7 +715,7 @@ int ff_cavs_next_mb(AVSContext *h)
  *
  ****************************************************************************/
 
-void ff_cavs_init_pic(AVSContext *h)
+int ff_cavs_init_pic(AVSContext *h)
 {
     int i;
 
@@ -726,6 +736,8 @@ void ff_cavs_init_pic(AVSContext *h)
     h->luma_scan[3]   = 8 * h->l_stride + 8;
     h->mbx            = h->mby = h->mbidx = 0;
     h->flags          = 0;
+
+    return 0;
 }
 
 /*****************************************************************************
@@ -739,22 +751,39 @@ void ff_cavs_init_pic(AVSContext *h)
  * this data has to be stored for one complete row of macroblocks
  * and this storage space is allocated here
  */
-void ff_cavs_init_top_lines(AVSContext *h)
+int ff_cavs_init_top_lines(AVSContext *h)
 {
     /* alloc top line of predictors */
     h->top_qp       = av_mallocz(h->mb_width);
-    h->top_mv[0]    = av_mallocz((h->mb_width * 2 + 1) * sizeof(cavs_vector));
-    h->top_mv[1]    = av_mallocz((h->mb_width * 2 + 1) * sizeof(cavs_vector));
-    h->top_pred_Y   = av_mallocz(h->mb_width * 2 * sizeof(*h->top_pred_Y));
-    h->top_border_y = av_mallocz((h->mb_width + 1) * 16);
-    h->top_border_u = av_mallocz(h->mb_width * 10);
-    h->top_border_v = av_mallocz(h->mb_width * 10);
+    h->top_mv[0]    = av_mallocz_array(h->mb_width * 2 + 1,  sizeof(cavs_vector));
+    h->top_mv[1]    = av_mallocz_array(h->mb_width * 2 + 1,  sizeof(cavs_vector));
+    h->top_pred_Y   = av_mallocz_array(h->mb_width * 2,  sizeof(*h->top_pred_Y));
+    h->top_border_y = av_mallocz_array(h->mb_width + 1,  16);
+    h->top_border_u = av_mallocz_array(h->mb_width,  10);
+    h->top_border_v = av_mallocz_array(h->mb_width,  10);
 
     /* alloc space for co-located MVs and types */
-    h->col_mv        = av_mallocz(h->mb_width * h->mb_height * 4 *
-                                  sizeof(cavs_vector));
+    h->col_mv        = av_mallocz_array(h->mb_width * h->mb_height,
+                                        4 * sizeof(cavs_vector));
     h->col_type_base = av_mallocz(h->mb_width * h->mb_height);
     h->block         = av_mallocz(64 * sizeof(int16_t));
+
+    if (!h->top_qp || !h->top_mv[0] || !h->top_mv[1] || !h->top_pred_Y ||
+        !h->top_border_y || !h->top_border_u || !h->top_border_v ||
+        !h->col_mv || !h->col_type_base || !h->block) {
+        av_freep(&h->top_qp);
+        av_freep(&h->top_mv[0]);
+        av_freep(&h->top_mv[1]);
+        av_freep(&h->top_pred_Y);
+        av_freep(&h->top_border_y);
+        av_freep(&h->top_border_u);
+        av_freep(&h->top_border_v);
+        av_freep(&h->col_mv);
+        av_freep(&h->col_type_base);
+        av_freep(&h->block);
+        return AVERROR(ENOMEM);
+    }
+    return 0;
 }
 
 av_cold int ff_cavs_init(AVCodecContext *avctx)
@@ -811,16 +840,16 @@ av_cold int ff_cavs_end(AVCodecContext *avctx)
     av_frame_free(&h->DPB[0].f);
     av_frame_free(&h->DPB[1].f);
 
-    av_free(h->top_qp);
-    av_free(h->top_mv[0]);
-    av_free(h->top_mv[1]);
-    av_free(h->top_pred_Y);
-    av_free(h->top_border_y);
-    av_free(h->top_border_u);
-    av_free(h->top_border_v);
-    av_free(h->col_mv);
-    av_free(h->col_type_base);
-    av_free(h->block);
+    av_freep(&h->top_qp);
+    av_freep(&h->top_mv[0]);
+    av_freep(&h->top_mv[1]);
+    av_freep(&h->top_pred_Y);
+    av_freep(&h->top_border_y);
+    av_freep(&h->top_border_u);
+    av_freep(&h->top_border_v);
+    av_freep(&h->col_mv);
+    av_freep(&h->col_type_base);
+    av_freep(&h->block);
     av_freep(&h->edge_emu_buffer);
     return 0;
 }
diff --git a/libavcodec/cavs.h b/libavcodec/cavs.h
index cfae05576b..fb9df15194 100644
--- a/libavcodec/cavs.h
+++ b/libavcodec/cavs.h
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -216,6 +216,7 @@ typedef struct AVSContext {
     int luma_scan[4];
     int qp;
     int qp_fixed;
+    int pic_qp_fixed;
     int cbp;
     ScanTable scantable;
 
@@ -241,6 +242,7 @@ typedef struct AVSContext {
     int16_t *block;
 } AVSContext;
 
+extern const uint8_t     ff_cavs_chroma_qp[64];
 extern const uint8_t     ff_cavs_partition_flags[30];
 extern const cavs_vector ff_cavs_intra_mv;
 extern const cavs_vector ff_cavs_dir_mv;
@@ -269,8 +271,8 @@ void ff_cavs_mv(AVSContext *h, enum cavs_mv_loc nP, enum cavs_mv_loc nC,
                 enum cavs_mv_pred mode, enum cavs_block size, int ref);
 void ff_cavs_init_mb(AVSContext *h);
 int  ff_cavs_next_mb(AVSContext *h);
-void ff_cavs_init_pic(AVSContext *h);
-void ff_cavs_init_top_lines(AVSContext *h);
+int ff_cavs_init_pic(AVSContext *h);
+int ff_cavs_init_top_lines(AVSContext *h);
 int ff_cavs_init(AVCodecContext *avctx);
 int ff_cavs_end (AVCodecContext *avctx);
 
diff --git a/libavcodec/cavs_parser.c b/libavcodec/cavs_parser.c
index 84f647c196..6067a39826 100644
--- a/libavcodec/cavs_parser.c
+++ b/libavcodec/cavs_parser.c
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) parser.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cavsdata.c b/libavcodec/cavsdata.c
index 4e4a131987..2835a4be09 100644
--- a/libavcodec/cavsdata.c
+++ b/libavcodec/cavsdata.c
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,6 +54,13 @@ const uint8_t ff_cavs_partition_flags[30] = {
                       SPLITH|SPLITV, //B_8X8 = 29
 };
 
+const uint8_t ff_cavs_chroma_qp[64] = {
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 43, 44, 44,
+  45, 45, 46, 46, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51
+};
+
 /** mark block as "no prediction from this direction"
     e.g. forward motion vector in BWD partition */
 const cavs_vector ff_cavs_dir_mv   = {0,0,1,REF_DIR};
diff --git a/libavcodec/cavsdec.c b/libavcodec/cavsdec.c
index a455a34bd0..70ac6f8a42 100644
--- a/libavcodec/cavsdec.c
+++ b/libavcodec/cavsdec.c
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  * @author Stefan Gehrer <stefan.gehrer@gmx.de>
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "golomb.h"
@@ -50,13 +51,6 @@ static const uint8_t cbp_tab[64][2] = {
 
 static const uint8_t scan3x3[4] = { 4, 5, 7, 8 };
 
-static const uint8_t cavs_chroma_qp[64] = {
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 43, 44, 44,
-  45, 45, 46, 46, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51
-};
-
 static const uint8_t dequant_shift[64] = {
   14, 14, 14, 14, 14, 14, 14, 14,
   13, 13, 13, 13, 13, 13, 13, 13,
@@ -508,11 +502,15 @@ static inline void mv_pred_sym(AVSContext *h, cavs_vector *src,
 /** kth-order exponential golomb code */
 static inline int get_ue_code(GetBitContext *gb, int order)
 {
+    unsigned ret = get_ue_golomb(gb);
+    if (ret >= ((1U<<31)>>order)) {
+        av_log(NULL, AV_LOG_ERROR, "get_ue_code: value too larger\n");
+        return AVERROR_INVALIDDATA;
+    }
     if (order) {
-        int ret = get_ue_golomb(gb) << order;
-        return ret + get_bits(gb, order);
+        return (ret<<order) + get_bits(gb, order);
     }
-    return get_ue_golomb(gb);
+    return ret;
 }
 
 static inline int dequant(AVSContext *h, int16_t *level_buf, uint8_t *run_buf,
@@ -549,29 +547,37 @@ static int decode_residual_block(AVSContext *h, GetBitContext *gb,
                                  const struct dec_2dvlc *r, int esc_golomb_order,
                                  int qp, uint8_t *dst, int stride)
 {
-    int i, level_code, esc_code, level, run, mask, ret;
+    int i, esc_code, level, mask, ret;
+    unsigned int level_code, run;
     int16_t level_buf[65];
     uint8_t run_buf[65];
     int16_t *block = h->block;
 
-    for (i = 0;i < 65; i++) {
+    for (i = 0; i < 65; i++) {
         level_code = get_ue_code(gb, r->golomb_order);
         if (level_code >= ESCAPE_CODE) {
             run      = ((level_code - ESCAPE_CODE) >> 1) + 1;
+            if(run > 64) {
+                av_log(h->avctx, AV_LOG_ERROR, "run %d is too large\n", run);
+                return AVERROR_INVALIDDATA;
+            }
             esc_code = get_ue_code(gb, esc_golomb_order);
+            if (esc_code < 0 || esc_code > 32767) {
+                av_log(h->avctx, AV_LOG_ERROR, "esc_code invalid\n");
+                return AVERROR_INVALIDDATA;
+            }
+
             level    = esc_code + (run > r->max_run ? 1 : r->level_add[run]);
             while (level > r->inc_limit)
                 r++;
             mask  = -(level_code & 1);
             level = (level ^ mask) - mask;
-        } else if (level_code >= 0) {
+        } else {
             level = r->rltab[level_code][0];
             if (!level) //end of block signal
                 break;
             run = r->rltab[level_code][1];
             r  += r->rltab[level_code][2];
-        } else {
-            break;
         }
         level_buf[i] = level;
         run_buf[i]   = run;
@@ -589,10 +595,10 @@ static inline void decode_residual_chroma(AVSContext *h)
 {
     if (h->cbp & (1 << 4))
         decode_residual_block(h, &h->gb, chroma_dec, 0,
-                              cavs_chroma_qp[h->qp], h->cu, h->c_stride);
+                              ff_cavs_chroma_qp[h->qp], h->cu, h->c_stride);
     if (h->cbp & (1 << 5))
         decode_residual_block(h, &h->gb, chroma_dec, 0,
-                              cavs_chroma_qp[h->qp], h->cv, h->c_stride);
+                              ff_cavs_chroma_qp[h->qp], h->cv, h->c_stride);
 }
 
 static inline int decode_residual_inter(AVSContext *h)
@@ -601,7 +607,7 @@ static inline int decode_residual_inter(AVSContext *h)
 
     /* get coded block pattern */
     int cbp = get_ue_golomb(&h->gb);
-    if (cbp > 63 || cbp < 0) {
+    if (cbp > 63U) {
         av_log(h->avctx, AV_LOG_ERROR, "illegal inter cbp %d\n", cbp);
         return AVERROR_INVALIDDATA;
     }
@@ -672,7 +678,7 @@ static int decode_mb_i(AVSContext *h, int cbp_code)
     /* get coded block pattern */
     if (h->cur.f->pict_type == AV_PICTURE_TYPE_I)
         cbp_code = get_ue_golomb(gb);
-    if (cbp_code > 63 || cbp_code < 0) {
+    if (cbp_code > 63U) {
         av_log(h->avctx, AV_LOG_ERROR, "illegal intra cbp\n");
         return AVERROR_INVALIDDATA;
     }
@@ -759,7 +765,7 @@ static void decode_mb_p(AVSContext *h, enum cavs_mb mb_type)
     h->col_type_base[h->mbidx] = mb_type;
 }
 
-static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
+static int decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
 {
     int block;
     enum cavs_sub_mb sub_type[4];
@@ -796,6 +802,8 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
         ff_cavs_mv(h, MV_BWD_X0, MV_BWD_C2, MV_PRED_MEDIAN, BLK_16X16, 0);
         break;
     case B_8X8:
+#define TMP_UNUSED_INX  7
+        flags = 0;
         for (block = 0; block < 4; block++)
             sub_type[block] = get_bits(&h->gb, 2);
         for (block = 0; block < 4; block++) {
@@ -803,11 +811,30 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
             case B_SUB_DIRECT:
                 if (!h->col_type_base[h->mbidx]) {
                     /* intra MB at co-location, do in-plane prediction */
-                    ff_cavs_mv(h, mv_scan[block], mv_scan[block] - 3,
-                               MV_PRED_BSKIP, BLK_8X8, 1);
-                    ff_cavs_mv(h, mv_scan[block] + MV_BWD_OFFS,
-                               mv_scan[block] - 3 + MV_BWD_OFFS,
-                               MV_PRED_BSKIP, BLK_8X8, 0);
+                    if(flags==0) {
+                        // if col-MB is a Intra MB, current Block size is 16x16.
+                        // AVS standard section 9.9.1
+                        if(block>0){
+                            h->mv[TMP_UNUSED_INX              ] = h->mv[MV_FWD_X0              ];
+                            h->mv[TMP_UNUSED_INX + MV_BWD_OFFS] = h->mv[MV_FWD_X0 + MV_BWD_OFFS];
+                        }
+                        ff_cavs_mv(h, MV_FWD_X0, MV_FWD_C2,
+                                   MV_PRED_BSKIP, BLK_8X8, 1);
+                        ff_cavs_mv(h, MV_FWD_X0+MV_BWD_OFFS,
+                                   MV_FWD_C2+MV_BWD_OFFS,
+                                   MV_PRED_BSKIP, BLK_8X8, 0);
+                        if(block>0) {
+                            flags = mv_scan[block];
+                            h->mv[flags              ] = h->mv[MV_FWD_X0              ];
+                            h->mv[flags + MV_BWD_OFFS] = h->mv[MV_FWD_X0 + MV_BWD_OFFS];
+                            h->mv[MV_FWD_X0              ] = h->mv[TMP_UNUSED_INX              ];
+                            h->mv[MV_FWD_X0 + MV_BWD_OFFS] = h->mv[TMP_UNUSED_INX + MV_BWD_OFFS];
+                        } else
+                            flags = MV_FWD_X0;
+                    } else {
+                        h->mv[mv_scan[block]              ] = h->mv[flags              ];
+                        h->mv[mv_scan[block] + MV_BWD_OFFS] = h->mv[flags + MV_BWD_OFFS];
+                    }
                 } else
                     mv_pred_direct(h, &h->mv[mv_scan[block]],
                                    &h->col_mv[h->mbidx * 4 + block]);
@@ -823,6 +850,7 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
                 break;
             }
         }
+#undef TMP_UNUSED_INX
         for (block = 0; block < 4; block++) {
             if (sub_type[block] == B_SUB_BWD)
                 ff_cavs_mv(h, mv_scan[block] + MV_BWD_OFFS,
@@ -831,7 +859,11 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
         }
         break;
     default:
-        assert((mb_type > B_SYM_16X16) && (mb_type < B_8X8));
+        if (mb_type <= B_SYM_16X16) {
+            av_log(h->avctx, AV_LOG_ERROR, "Invalid mb_type %d in B frame\n", mb_type);
+            return AVERROR_INVALIDDATA;
+        }
+        av_assert2(mb_type < B_8X8);
         flags = ff_cavs_partition_flags[mb_type];
         if (mb_type & 1) { /* 16x8 macroblock types */
             if (flags & FWD0)
@@ -866,6 +898,8 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
     if (mb_type != B_SKIP)
         decode_residual_inter(h);
     ff_cavs_filter(h, mb_type);
+
+    return 0;
 }
 
 /*****************************************************************************
@@ -878,12 +912,18 @@ static inline int decode_slice_header(AVSContext *h, GetBitContext *gb)
 {
     if (h->stc > 0xAF)
         av_log(h->avctx, AV_LOG_ERROR, "unexpected start code 0x%02x\n", h->stc);
+
+    if (h->stc >= h->mb_height) {
+        av_log(h->avctx, AV_LOG_ERROR, "stc 0x%02x is too large\n", h->stc);
+        return AVERROR_INVALIDDATA;
+    }
+
     h->mby   = h->stc;
     h->mbidx = h->mby * h->mb_width;
 
     /* mark top macroblocks as unavailable */
     h->flags &= ~(B_AVAIL | C_AVAIL);
-    if ((h->mby == 0) && (!h->qp_fixed)) {
+    if (!h->pic_qp_fixed) {
         h->qp_fixed = get_bits1(gb);
         h->qp       = get_bits(gb, 6);
     }
@@ -976,16 +1016,17 @@ static int decode_pic(AVSContext *h)
             return AVERROR(ENOMEM);
     }
 
-    ff_cavs_init_pic(h);
+    if ((ret = ff_cavs_init_pic(h)) < 0)
+        return ret;
     h->cur.poc = get_bits(&h->gb, 8) * 2;
 
     /* get temporal distances and MV scaling factors */
     if (h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
-        h->dist[0] = (h->cur.poc - h->DPB[0].poc  + 512) % 512;
+        h->dist[0] = (h->cur.poc - h->DPB[0].poc) & 511;
     } else {
-        h->dist[0] = (h->DPB[0].poc  - h->cur.poc + 512) % 512;
+        h->dist[0] = (h->DPB[0].poc  - h->cur.poc) & 511;
     }
-    h->dist[1] = (h->cur.poc - h->DPB[1].poc  + 512) % 512;
+    h->dist[1] = (h->cur.poc - h->DPB[1].poc) & 511;
     h->scale_den[0] = h->dist[0] ? 512/h->dist[0] : 0;
     h->scale_den[1] = h->dist[1] ? 512/h->dist[1] : 0;
     if (h->cur.f->pict_type == AV_PICTURE_TYPE_B) {
@@ -1005,6 +1046,7 @@ static int decode_pic(AVSContext *h)
         skip_bits1(&h->gb);     //advanced_pred_mode_disable
     skip_bits1(&h->gb);        //top_field_first
     skip_bits1(&h->gb);        //repeat_first_field
+    h->pic_qp_fixed =
     h->qp_fixed = get_bits1(&h->gb);
     h->qp       = get_bits(&h->gb, 6);
     if (h->cur.f->pict_type == AV_PICTURE_TYPE_I) {
@@ -1080,6 +1122,7 @@ static int decode_seq_header(AVSContext *h)
 {
     int frame_rate_code;
     int width, height;
+    int ret;
 
     h->profile = get_bits(&h->gb, 8);
     h->level   = get_bits(&h->gb, 8);
@@ -1092,24 +1135,36 @@ static int decode_seq_header(AVSContext *h)
                                       "Width/height changing in CAVS");
         return AVERROR_PATCHWELCOME;
     }
-    h->width  = width;
-    h->height = height;
-
+    if (width <= 0 || height <= 0) {
+        av_log(h->avctx, AV_LOG_ERROR, "Dimensions invalid\n");
+        return AVERROR_INVALIDDATA;
+    }
     skip_bits(&h->gb, 2); //chroma format
     skip_bits(&h->gb, 3); //sample_precision
     h->aspect_ratio = get_bits(&h->gb, 4);
     frame_rate_code = get_bits(&h->gb, 4);
+    if (frame_rate_code == 0 || frame_rate_code > 13) {
+        av_log(h->avctx, AV_LOG_WARNING,
+               "frame_rate_code %d is invalid\n", frame_rate_code);
+        frame_rate_code = 1;
+    }
+
     skip_bits(&h->gb, 18); //bit_rate_lower
     skip_bits1(&h->gb);    //marker_bit
     skip_bits(&h->gb, 12); //bit_rate_upper
     h->low_delay =  get_bits1(&h->gb);
+
+    ret = ff_set_dimensions(h->avctx, width, height);
+    if (ret < 0)
+        return ret;
+
+    h->width  = width;
+    h->height = height;
     h->mb_width  = (h->width  + 15) >> 4;
     h->mb_height = (h->height + 15) >> 4;
     h->avctx->framerate = ff_mpeg12_frame_rate_tab[frame_rate_code];
-    h->avctx->width  = h->width;
-    h->avctx->height = h->height;
     if (!h->top_qp)
-        ff_cavs_init_top_lines(h);
+        return ff_cavs_init_top_lines(h);
     return 0;
 }
 
@@ -1138,12 +1193,17 @@ static int cavs_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return 0;
     }
 
+    h->stc = 0;
+
     buf_ptr = buf;
     buf_end = buf + buf_size;
     for(;;) {
         buf_ptr = avpriv_find_start_code(buf_ptr, buf_end, &stc);
-        if ((stc & 0xFFFFFE00) || buf_ptr == buf_end)
+        if ((stc & 0xFFFFFE00) || buf_ptr == buf_end) {
+            if (!h->stc)
+                av_log(h->avctx, AV_LOG_WARNING, "no frame decoded\n");
             return FFMAX(0, buf_ptr - buf);
+        }
         input_size = (buf_end - buf_ptr) * 8;
         switch (stc) {
         case CAVS_START_CODE:
@@ -1166,8 +1226,8 @@ static int cavs_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 break;
             *got_frame = 1;
             if (h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
-                if (h->DPB[1].f->data[0]) {
-                    if ((ret = av_frame_ref(data, h->DPB[1].f)) < 0)
+                if (h->DPB[!h->low_delay].f->data[0]) {
+                    if ((ret = av_frame_ref(data, h->DPB[!h->low_delay].f)) < 0)
                         return ret;
                 } else {
                     *got_frame = 0;
diff --git a/libavcodec/cavsdsp.c b/libavcodec/cavsdsp.c
index cc7898968f..91f6d7350b 100644
--- a/libavcodec/cavsdsp.c
+++ b/libavcodec/cavsdsp.c
@@ -5,20 +5,20 @@
  *
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cavsdsp.h b/libavcodec/cavsdsp.h
index 248afd508c..847f5c4c87 100644
--- a/libavcodec/cavsdsp.h
+++ b/libavcodec/cavsdsp.h
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cbrt_fixed_tablegen.c b/libavcodec/cbrt_fixed_tablegen.c
new file mode 100644
index 0000000000..24d2fbb7e0
--- /dev/null
+++ b/libavcodec/cbrt_fixed_tablegen.c
@@ -0,0 +1,24 @@
+/*
+ * Generate a header file for hardcoded AAC cube-root table
+ *
+ * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+#include "cbrt_tablegen_template.c"
diff --git a/libavcodec/cbrt_tablegen.c b/libavcodec/cbrt_tablegen.c
index e92c0f1db1..8c2235e987 100644
--- a/libavcodec/cbrt_tablegen.c
+++ b/libavcodec/cbrt_tablegen.c
@@ -3,35 +3,22 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <stdlib.h>
-#define CONFIG_HARDCODED_TABLES 0
-#include "cbrt_tablegen.h"
-#include "tableprint.h"
-
-int main(void)
-{
-    cbrt_tableinit();
-
-    write_fileheader();
-
-    WRITE_ARRAY("static const", uint32_t, cbrt_tab);
-
-    return 0;
-}
+#define USE_FIXED 0
+#include "cbrt_tablegen_template.c"
diff --git a/libavcodec/cbrt_tablegen.h b/libavcodec/cbrt_tablegen.h
index 60d900a94f..27a3e3ae29 100644
--- a/libavcodec/cbrt_tablegen.h
+++ b/libavcodec/cbrt_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,14 +25,27 @@
 
 #include <stdint.h>
 #include <math.h>
+#include "libavutil/attributes.h"
+#include "libavcodec/aac_defines.h"
+
+#if USE_FIXED
+#define CBRT(x) (int)floor((x).f * 8192 + 0.5)
+#else
+#define CBRT(x) x.i
+#endif
 
 #if CONFIG_HARDCODED_TABLES
+#if USE_FIXED
+#define cbrt_tableinit_fixed()
+#include "libavcodec/cbrt_fixed_tables.h"
+#else
 #define cbrt_tableinit()
 #include "libavcodec/cbrt_tables.h"
+#endif
 #else
 static uint32_t cbrt_tab[1 << 13];
 
-static void cbrt_tableinit(void)
+static av_cold void AAC_RENAME(cbrt_tableinit)(void)
 {
     if (!cbrt_tab[(1<<13) - 1]) {
         int i;
@@ -42,8 +55,8 @@ static void cbrt_tableinit(void)
                 float f;
                 uint32_t i;
             } f;
-            f.f = powf(i, 1.0 / 3.0) * i;
-            cbrt_tab[i] = f.i;
+            f.f = pow(i, 1.0 / 3.0) * i;
+            cbrt_tab[i] = CBRT(f);
         }
     }
 }
diff --git a/libavcodec/cbrt_tablegen_template.c b/libavcodec/cbrt_tablegen_template.c
new file mode 100644
index 0000000000..9dd2cf5de6
--- /dev/null
+++ b/libavcodec/cbrt_tablegen_template.c
@@ -0,0 +1,37 @@
+/*
+ * Generate a header file for hardcoded AAC cube-root table
+ *
+ * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#define CONFIG_HARDCODED_TABLES 0
+#include "cbrt_tablegen.h"
+#include "tableprint.h"
+
+int main(void)
+{
+    AAC_RENAME(cbrt_tableinit)();
+
+    write_fileheader();
+
+    WRITE_ARRAY("static const", uint32_t, cbrt_tab);
+
+    return 0;
+}
diff --git a/libavcodec/ccaption_dec.c b/libavcodec/ccaption_dec.c
new file mode 100644
index 0000000000..9f67caa028
--- /dev/null
+++ b/libavcodec/ccaption_dec.c
@@ -0,0 +1,592 @@
+/*
+ * Closed Caption Decoding
+ * Copyright (c) 2015 Anshul Maheshwari
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/internal.h"
+#include "libavutil/opt.h"
+
+#define SCREEN_ROWS 15
+#define SCREEN_COLUMNS 32
+
+#define SET_FLAG(var, val)   ( (var) |=   ( 1 << (val)) )
+#define UNSET_FLAG(var, val) ( (var) &=  ~( 1 << (val)) )
+#define CHECK_FLAG(var, val) ( (var) &    ( 1 << (val)) )
+
+/*
+ * TODO list
+ * 1) handle font and color completely
+ */
+enum cc_mode {
+    CCMODE_POPON,
+    CCMODE_PAINTON,
+    CCMODE_ROLLUP_2,
+    CCMODE_ROLLUP_3,
+    CCMODE_ROLLUP_4,
+    CCMODE_TEXT,
+};
+
+enum cc_color_code {
+    CCCOL_WHITE,
+    CCCOL_GREEN,
+    CCCOL_BLUE,
+    CCCOL_CYAN,
+    CCCOL_RED,
+    CCCOL_YELLOW,
+    CCCOL_MAGENTA,
+    CCCOL_USERDEFINED,
+    CCCOL_BLACK,
+    CCCOL_TRANSPARENT,
+};
+
+enum cc_font {
+    CCFONT_REGULAR,
+    CCFONT_ITALICS,
+    CCFONT_UNDERLINED,
+    CCFONT_UNDERLINED_ITALICS,
+};
+
+static const unsigned char pac2_attribs[32][3] = // Color, font, ident
+{
+    { CCCOL_WHITE,   CCFONT_REGULAR,            0 },  // 0x40 || 0x60
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,         0 },  // 0x41 || 0x61
+    { CCCOL_GREEN,   CCFONT_REGULAR,            0 },  // 0x42 || 0x62
+    { CCCOL_GREEN,   CCFONT_UNDERLINED,         0 },  // 0x43 || 0x63
+    { CCCOL_BLUE,    CCFONT_REGULAR,            0 },  // 0x44 || 0x64
+    { CCCOL_BLUE,    CCFONT_UNDERLINED,         0 },  // 0x45 || 0x65
+    { CCCOL_CYAN,    CCFONT_REGULAR,            0 },  // 0x46 || 0x66
+    { CCCOL_CYAN,    CCFONT_UNDERLINED,         0 },  // 0x47 || 0x67
+    { CCCOL_RED,     CCFONT_REGULAR,            0 },  // 0x48 || 0x68
+    { CCCOL_RED,     CCFONT_UNDERLINED,         0 },  // 0x49 || 0x69
+    { CCCOL_YELLOW,  CCFONT_REGULAR,            0 },  // 0x4a || 0x6a
+    { CCCOL_YELLOW,  CCFONT_UNDERLINED,         0 },  // 0x4b || 0x6b
+    { CCCOL_MAGENTA, CCFONT_REGULAR,            0 },  // 0x4c || 0x6c
+    { CCCOL_MAGENTA, CCFONT_UNDERLINED,         0 },  // 0x4d || 0x6d
+    { CCCOL_WHITE,   CCFONT_ITALICS,            0 },  // 0x4e || 0x6e
+    { CCCOL_WHITE,   CCFONT_UNDERLINED_ITALICS, 0 },  // 0x4f || 0x6f
+    { CCCOL_WHITE,   CCFONT_REGULAR,            0 },  // 0x50 || 0x70
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,         0 },  // 0x51 || 0x71
+    { CCCOL_WHITE,   CCFONT_REGULAR,            4 },  // 0x52 || 0x72
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,         4 },  // 0x53 || 0x73
+    { CCCOL_WHITE,   CCFONT_REGULAR,            8 },  // 0x54 || 0x74
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,         8 },  // 0x55 || 0x75
+    { CCCOL_WHITE,   CCFONT_REGULAR,           12 },  // 0x56 || 0x76
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        12 },  // 0x57 || 0x77
+    { CCCOL_WHITE,   CCFONT_REGULAR,           16 },  // 0x58 || 0x78
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        16 },  // 0x59 || 0x79
+    { CCCOL_WHITE,   CCFONT_REGULAR,           20 },  // 0x5a || 0x7a
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        20 },  // 0x5b || 0x7b
+    { CCCOL_WHITE,   CCFONT_REGULAR,           24 },  // 0x5c || 0x7c
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        24 },  // 0x5d || 0x7d
+    { CCCOL_WHITE,   CCFONT_REGULAR,           28 },  // 0x5e || 0x7e
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        28 }   // 0x5f || 0x7f
+    /* total 32 entries */
+};
+
+/* 0-255 needs 256 spaces */
+static const uint8_t parity_table[256] = { 0, 1, 1, 0, 1, 0, 0, 1,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           1, 0, 0, 1, 0, 1, 1, 0,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           0, 1, 1, 0, 1, 0, 0, 1,
+                                           1, 0, 0, 1, 0, 1, 1, 0 };
+
+struct Screen {
+    /* +1 is used to compensate null character of string */
+    uint8_t characters[SCREEN_ROWS][SCREEN_COLUMNS+1];
+    uint8_t colors[SCREEN_ROWS][SCREEN_COLUMNS+1];
+    uint8_t fonts[SCREEN_ROWS][SCREEN_COLUMNS+1];
+    /*
+     * Bitmask of used rows; if a bit is not set, the
+     * corresponding row is not used.
+     * for setting row 1  use row | (1 << 0)
+     * for setting row 15 use row | (1 << 14)
+     */
+    int16_t  row_used;
+};
+
+
+typedef struct CCaptionSubContext {
+    AVClass *class;
+    struct Screen screen[2];
+    int active_screen;
+    uint8_t cursor_row;
+    uint8_t cursor_column;
+    uint8_t cursor_color;
+    uint8_t cursor_font;
+    AVBPrint buffer;
+    int screen_changed;
+    int rollup;
+    enum  cc_mode mode;
+    int64_t start_time;
+    /* visible screen time */
+    int64_t startv_time;
+    int64_t end_time;
+    char prev_cmd[2];
+    /* buffer to store pkt data */
+    AVBufferRef *pktbuf;
+}CCaptionSubContext;
+
+
+static av_cold int init_decoder(AVCodecContext *avctx)
+{
+    int ret;
+    CCaptionSubContext *ctx = avctx->priv_data;
+
+    av_bprint_init(&ctx->buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
+    /* taking by default roll up to 2 */
+    ctx->mode = CCMODE_ROLLUP_2;
+    ctx->rollup = 2;
+    ret = ff_ass_subtitle_header_default(avctx);
+    if(ret < 0) {
+        return ret;
+    }
+    /* allocate pkt buffer */
+    ctx->pktbuf = av_buffer_alloc(128);
+    if( !ctx->pktbuf) {
+        ret = AVERROR(ENOMEM);
+    }
+    return ret;
+}
+
+static av_cold int close_decoder(AVCodecContext *avctx)
+{
+    CCaptionSubContext *ctx = avctx->priv_data;
+    av_bprint_finalize( &ctx->buffer, NULL);
+    av_buffer_unref(&ctx->pktbuf);
+    return 0;
+}
+
+/**
+ * @param ctx closed caption context just to print log
+ */
+static int write_char (CCaptionSubContext *ctx, char *row,uint8_t col, char ch)
+{
+    if(col < SCREEN_COLUMNS) {
+        row[col] = ch;
+        return 0;
+    }
+    /* We have extra space at end only for null character */
+    else if ( col == SCREEN_COLUMNS && ch == 0) {
+        row[col] = ch;
+        return 0;
+    }
+    else {
+        av_log(ctx, AV_LOG_WARNING,"Data Ignored since exceeding screen width\n");
+        return AVERROR_INVALIDDATA;
+    }
+}
+
+/**
+ * This function after validating parity bit, also remove it from data pair.
+ * The first byte doesn't pass parity, we replace it with a solid blank
+ * and process the pair.
+ * If the second byte doesn't pass parity, it returns INVALIDDATA
+ * user can ignore the whole pair and pass the other pair.
+ */
+static int validate_cc_data_pair (uint8_t *cc_data_pair)
+{
+    uint8_t cc_valid = (*cc_data_pair & 4) >>2;
+    uint8_t cc_type = *cc_data_pair & 3;
+
+    if (!cc_valid)
+        return AVERROR_INVALIDDATA;
+
+    // if EIA-608 data then verify parity.
+    if (cc_type==0 || cc_type==1) {
+        if (!parity_table[cc_data_pair[2]]) {
+            return AVERROR_INVALIDDATA;
+        }
+        if (!parity_table[cc_data_pair[1]]) {
+            cc_data_pair[1]=0x7F;
+        }
+    }
+
+    //Skip non-data
+    if( (cc_data_pair[0] == 0xFA || cc_data_pair[0] == 0xFC || cc_data_pair[0] == 0xFD )
+         && (cc_data_pair[1] & 0x7F) == 0 && (cc_data_pair[2] & 0x7F) == 0)
+        return AVERROR_PATCHWELCOME;
+
+    //skip 708 data
+    if(cc_type == 3 || cc_type == 2 )
+        return AVERROR_PATCHWELCOME;
+
+    /* remove parity bit */
+    cc_data_pair[1] &= 0x7F;
+    cc_data_pair[2] &= 0x7F;
+
+
+    return 0;
+
+}
+
+static struct Screen *get_writing_screen(CCaptionSubContext *ctx)
+{
+    switch (ctx->mode) {
+    case CCMODE_POPON:
+        // use Inactive screen
+        return ctx->screen + !ctx->active_screen;
+    case CCMODE_PAINTON:
+    case CCMODE_ROLLUP_2:
+    case CCMODE_ROLLUP_3:
+    case CCMODE_ROLLUP_4:
+    case CCMODE_TEXT:
+        // use active screen
+        return ctx->screen + ctx->active_screen;
+    }
+    /* It was never an option */
+    return NULL;
+}
+
+static void roll_up(CCaptionSubContext *ctx)
+{
+    struct Screen *screen;
+    int i, keep_lines;
+
+    if(ctx->mode == CCMODE_TEXT)
+        return;
+
+    screen = get_writing_screen(ctx);
+
+    /* +1 signify cursor_row starts from 0
+     * Can't keep lines less then row cursor pos
+     */
+    keep_lines = FFMIN(ctx->cursor_row + 1, ctx->rollup);
+
+    for( i = 0; i < ctx->cursor_row - keep_lines; i++ )
+        UNSET_FLAG(screen->row_used, i);
+
+
+    for( i = 0; i < keep_lines && screen->row_used; i++ ) {
+        const int i_row = ctx->cursor_row - keep_lines + i + 1;
+
+        memcpy( screen->characters[i_row], screen->characters[i_row+1], SCREEN_COLUMNS );
+        memcpy( screen->colors[i_row], screen->colors[i_row+1], SCREEN_COLUMNS);
+        memcpy( screen->fonts[i_row], screen->fonts[i_row+1], SCREEN_COLUMNS);
+        if(CHECK_FLAG(screen->row_used, i_row + 1))
+            SET_FLAG(screen->row_used, i_row);
+
+    }
+    UNSET_FLAG(screen->row_used, ctx->cursor_row);
+
+}
+
+static int reap_screen(CCaptionSubContext *ctx, int64_t pts)
+{
+    int i;
+    int ret = 0;
+    struct Screen *screen = ctx->screen + ctx->active_screen;
+    ctx->start_time = ctx->startv_time;
+
+    for( i = 0; screen->row_used && i < SCREEN_ROWS; i++)
+    {
+        if(CHECK_FLAG(screen->row_used,i)) {
+            char *str = screen->characters[i];
+            /* skip space */
+            while (*str == ' ')
+                str++;
+
+            av_bprintf(&ctx->buffer, "%s\\N", str);
+            ret = av_bprint_is_complete(&ctx->buffer);
+            if( ret == 0) {
+                ret = AVERROR(ENOMEM);
+                break;
+            }
+        }
+
+    }
+    if(screen->row_used && ctx->buffer.len >= 2 ) {
+        ctx->buffer.len -= 2;
+        ctx->buffer.str[ctx->buffer.len] = 0;
+    }
+    ctx->startv_time = pts;
+    ctx->end_time = pts;
+    return ret;
+}
+
+static void handle_textattr( CCaptionSubContext *ctx, uint8_t hi, uint8_t lo )
+{
+    int i = lo - 0x20;
+    int ret;
+    struct Screen *screen = get_writing_screen(ctx);
+    char *row = screen->characters[ctx->cursor_row];
+
+    if( i >= 32)
+        return;
+
+    ctx->cursor_color =  pac2_attribs[i][0];
+    ctx->cursor_font = pac2_attribs[i][1];
+
+    SET_FLAG(screen->row_used,ctx->cursor_row);
+    ret = write_char(ctx, row, ctx->cursor_column, ' ');
+    if(ret == 0)
+        ctx->cursor_column++;
+}
+
+static void handle_pac( CCaptionSubContext *ctx, uint8_t hi, uint8_t lo )
+{
+    static const int8_t row_map[] = {
+        11, -1, 1, 2, 3, 4, 12, 13, 14, 15, 5, 6, 7, 8, 9, 10
+    };
+    const int index = ( (hi<<1) & 0x0e) | ( (lo>>5) & 0x01 );
+    struct Screen *screen = get_writing_screen(ctx);
+    char *row;
+    int indent,i,ret;
+
+    if( row_map[index] <= 0 ) {
+        av_log(ctx, AV_LOG_DEBUG,"Invalid pac index encountered\n");
+        return;
+    }
+
+    lo &= 0x1f;
+
+    ctx->cursor_row = row_map[index] - 1;
+    ctx->cursor_color =  pac2_attribs[lo][0];
+    ctx->cursor_font = pac2_attribs[lo][1];
+    ctx->cursor_column = 0;
+    indent = pac2_attribs[lo][2];
+    row = screen->characters[ctx->cursor_row];
+    for(i = 0;i < indent; i++) {
+        ret = write_char(ctx, row, ctx->cursor_column, ' ');
+        if(  ret == 0 )
+            ctx->cursor_column++;
+    }
+
+}
+
+/**
+ * @param pts it is required to set end time
+ */
+static int handle_edm(CCaptionSubContext *ctx,int64_t pts)
+{
+    int ret = 0;
+    struct Screen *screen = ctx->screen + ctx->active_screen;
+
+    reap_screen(ctx, pts);
+    screen->row_used = 0;
+    ctx->screen_changed = 1;
+    return ret;
+}
+
+static int handle_eoc(CCaptionSubContext *ctx, int64_t pts)
+{
+    int ret;
+    ret = handle_edm(ctx,pts);
+    ctx->active_screen = !ctx->active_screen;
+    ctx->cursor_column = 0;
+    return ret;
+}
+
+static void handle_delete_end_of_row( CCaptionSubContext *ctx, char hi, char lo)
+{
+    struct Screen *screen = get_writing_screen(ctx);
+    char *row = screen->characters[ctx->cursor_row];
+    write_char(ctx, row, ctx->cursor_column, 0);
+
+}
+
+static void handle_char(CCaptionSubContext *ctx, char hi, char lo, int64_t pts)
+{
+    struct Screen *screen = get_writing_screen(ctx);
+    char *row = screen->characters[ctx->cursor_row];
+    int ret;
+
+    SET_FLAG(screen->row_used,ctx->cursor_row);
+
+    ret = write_char(ctx, row, ctx->cursor_column, hi);
+    if( ret == 0 )
+        ctx->cursor_column++;
+
+    if(lo) {
+        ret = write_char(ctx, row, ctx->cursor_column, lo);
+        if ( ret == 0 )
+            ctx->cursor_column++;
+    }
+    write_char(ctx, row, ctx->cursor_column, 0);
+
+    /* reset prev command since character can repeat */
+    ctx->prev_cmd[0] = 0;
+    ctx->prev_cmd[1] = 0;
+    if (lo)
+       ff_dlog(ctx, "(%c,%c)\n",hi,lo);
+    else
+       ff_dlog(ctx, "(%c)\n",hi);
+}
+
+static int process_cc608(CCaptionSubContext *ctx, int64_t pts, uint8_t hi, uint8_t lo)
+{
+    int ret = 0;
+#define COR3(var, with1, with2, with3)  ( (var) == (with1) ||  (var) == (with2) || (var) == (with3) )
+    if ( hi == ctx->prev_cmd[0] && lo == ctx->prev_cmd[1]) {
+    /* ignore redundant command */
+    } else if ( (hi == 0x10 && (lo >= 0x40 || lo <= 0x5f)) ||
+              ( (hi >= 0x11 && hi <= 0x17) && (lo >= 0x40 && lo <= 0x7f) ) ) {
+        handle_pac(ctx, hi, lo);
+    } else if ( ( hi == 0x11 && lo >= 0x20 && lo <= 0x2f ) ||
+                ( hi == 0x17 && lo >= 0x2e && lo <= 0x2f) ) {
+        handle_textattr(ctx, hi, lo);
+    } else if ( COR3(hi, 0x14, 0x15, 0x1C) && lo == 0x20 ) {
+    /* resume caption loading */
+        ctx->mode = CCMODE_POPON;
+    } else if ( COR3(hi, 0x14, 0x15, 0x1C) && lo == 0x24 ) {
+        handle_delete_end_of_row(ctx, hi, lo);
+    } else if ( COR3(hi, 0x14, 0x15, 0x1C) && lo == 0x25 ) {
+        ctx->rollup = 2;
+        ctx->mode = CCMODE_ROLLUP_2;
+    } else if ( COR3(hi, 0x14, 0x15, 0x1C) && lo == 0x26 ) {
+        ctx->rollup = 3;
+        ctx->mode = CCMODE_ROLLUP_3;
+    } else if ( COR3(hi, 0x14, 0x15, 0x1C) && lo == 0x27 ) {
+        ctx->rollup = 4;
+        ctx->mode = CCMODE_ROLLUP_4;
+    } else if ( COR3(hi, 0x14, 0x15, 0x1C) && lo == 0x29 ) {
+    /* resume direct captioning */
+        ctx->mode = CCMODE_PAINTON;
+    } else if ( COR3(hi, 0x14, 0x15, 0x1C) && lo == 0x2B ) {
+    /* resume text display */
+        ctx->mode = CCMODE_TEXT;
+    } else if ( COR3(hi, 0x14, 0x15, 0x1C) && lo == 0x2C ) {
+    /* erase display memory */
+        ret = handle_edm(ctx, pts);
+    } else if ( COR3(hi, 0x14, 0x15, 0x1C) && lo == 0x2D ) {
+    /* carriage return */
+        ff_dlog(ctx, "carriage return\n");
+        reap_screen(ctx, pts);
+        roll_up(ctx);
+        ctx->screen_changed = 1;
+        ctx->cursor_column = 0;
+    } else if ( COR3(hi, 0x14, 0x15, 0x1C) && lo == 0x2F ) {
+    /* end of caption */
+        ff_dlog(ctx, "handle_eoc\n");
+        ret = handle_eoc(ctx, pts);
+    } else if (hi>=0x20) {
+    /* Standard characters (always in pairs) */
+        handle_char(ctx, hi, lo, pts);
+    } else {
+    /* Ignoring all other non data code */
+        ff_dlog(ctx, "Unknown command 0x%hhx 0x%hhx\n", hi, lo);
+    }
+
+    /* set prev command */
+     ctx->prev_cmd[0] = hi;
+     ctx->prev_cmd[1] = lo;
+
+#undef COR3
+    return ret;
+
+}
+
+static int decode(AVCodecContext *avctx, void *data, int *got_sub, AVPacket *avpkt)
+{
+    CCaptionSubContext *ctx = avctx->priv_data;
+    AVSubtitle *sub = data;
+    uint8_t *bptr = NULL;
+    int len = avpkt->size;
+    int ret = 0;
+    int i;
+
+    if ( ctx->pktbuf->size < len) {
+        ret = av_buffer_realloc(&ctx->pktbuf, len);
+         if(ret < 0) {
+            av_log(ctx, AV_LOG_WARNING, "Insufficient Memory of %d truncated to %d\n",len, ctx->pktbuf->size);
+            len = ctx->pktbuf->size;
+            ret = 0;
+        }
+    }
+    memcpy(ctx->pktbuf->data, avpkt->data, len);
+    bptr = ctx->pktbuf->data;
+
+
+    for (i  = 0; i < len; i += 3) {
+        uint8_t cc_type = *(bptr + i) & 3;
+        if (validate_cc_data_pair( bptr + i) )
+            continue;
+        /* ignoring data field 1 */
+        if(cc_type == 1)
+            continue;
+        else
+            process_cc608(ctx, avpkt->pts, *(bptr + i + 1) & 0x7f, *(bptr + i + 2) & 0x7f);
+        if(ctx->screen_changed && *ctx->buffer.str)
+        {
+            int start_time = av_rescale_q(ctx->start_time, avctx->time_base, (AVRational){ 1, 100 });
+            int end_time = av_rescale_q(ctx->end_time, avctx->time_base, (AVRational){ 1, 100 });
+            ff_dlog(ctx, "cdp writing data (%s)\n",ctx->buffer.str);
+            ret = ff_ass_add_rect_bprint(sub, &ctx->buffer, start_time, end_time - start_time);
+            if (ret < 0)
+                return ret;
+            sub->pts = av_rescale_q(ctx->start_time, avctx->time_base, AV_TIME_BASE_Q);
+            ctx->screen_changed = 0;
+            av_bprint_clear(&ctx->buffer);
+        }
+    }
+
+    *got_sub = sub->num_rects > 0;
+    return ret;
+}
+
+static const AVOption options[] = {
+    {NULL}
+};
+
+static const AVClass ccaption_dec_class = {
+    .class_name = "Closed caption Decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_ccaption_decoder = {
+    .name           = "cc_dec",
+    .long_name      = NULL_IF_CONFIG_SMALL("Closed Caption (EIA-608 / CEA-708) Decoder"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_EIA_608,
+    .priv_data_size = sizeof(CCaptionSubContext),
+    .init           = init_decoder,
+    .close          = close_decoder,
+    .decode         = decode,
+    .priv_class     = &ccaption_dec_class,
+};
diff --git a/libavcodec/cdgraphics.c b/libavcodec/cdgraphics.c
index 3b68f452f4..aca7cb057b 100644
--- a/libavcodec/cdgraphics.c
+++ b/libavcodec/cdgraphics.c
@@ -2,20 +2,20 @@
  * CD Graphics Video Decoder
  * Copyright (c) 2009 Michael Tison
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -119,7 +119,7 @@ static void cdg_load_palette(CDGraphicsContext *cc, uint8_t *data, int low)
         r = ((color >> 8) & 0x000F) * 17;
         g = ((color >> 4) & 0x000F) * 17;
         b = ((color     ) & 0x000F) * 17;
-        palette[i + array_offset] = r << 16 | g << 8 | b;
+        palette[i + array_offset] = 0xFFU << 24 | r << 16 | g << 8 | b;
     }
     cc->frame->palette_has_changed = 1;
 }
@@ -265,20 +265,27 @@ static int cdg_decode_frame(AVCodecContext *avctx,
     int buf_size       = avpkt->size;
     int ret;
     uint8_t command, inst;
-    uint8_t cdg_data[CDG_DATA_SIZE];
+    uint8_t cdg_data[CDG_DATA_SIZE] = {0};
     AVFrame *frame = data;
     CDGraphicsContext *cc = avctx->priv_data;
 
-    bytestream2_init(&gb, avpkt->data, avpkt->size);
+    if (buf_size < CDG_MINIMUM_PKT_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "buffer too small for decoder\n");
+        return AVERROR(EINVAL);
+    }
+    if (buf_size > CDG_HEADER_SIZE + CDG_DATA_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "buffer too big for decoder\n");
+        return AVERROR(EINVAL);
+    }
 
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
 
-    ret = ff_reget_buffer(avctx, cc->frame);
-    if (ret) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, cc->frame)) < 0)
         return ret;
-    }
-    if (!avctx->frame_number)
+    if (!avctx->frame_number) {
         memset(cc->frame->data[0], 0, cc->frame->linesize[0] * avctx->height);
+        memset(cc->frame->data[1], 0, AVPALETTE_SIZE);
+    }
 
     command = bytestream2_get_byte(&gb);
     inst    = bytestream2_get_byte(&gb);
@@ -325,11 +332,8 @@ static int cdg_decode_frame(AVCodecContext *avctx,
                 return AVERROR(EINVAL);
             }
 
-            ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF);
-            if (ret) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+            if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
                 return ret;
-            }
 
             cdg_scroll(cc, cdg_data, frame, inst == CDG_INST_SCROLL_COPY);
             av_frame_unref(cc->frame);
diff --git a/libavcodec/cdxl.c b/libavcodec/cdxl.c
index 99e96eb502..50d514b25d 100644
--- a/libavcodec/cdxl.c
+++ b/libavcodec/cdxl.c
@@ -2,23 +2,31 @@
  * CDXL video decoder
  * Copyright (c) 2011-2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+/**
+ * @file
+ * Commodore CDXL video decoder
+ * @author Paul B Mahol
+ */
+
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
@@ -26,8 +34,8 @@
 #include "internal.h"
 
 #define BIT_PLANAR   0x00
-#define BYTE_PLANAR  0x20
-#define CHUNKY       0x40
+#define CHUNKY       0x20
+#define BYTE_PLANAR  0x40
 #define BIT_LINE     0x80
 #define BYTE_LINE    0xC0
 
@@ -63,7 +71,7 @@ static void import_palette(CDXLVideoContext *c, uint32_t *new_palette)
         unsigned r   = ((rgb >> 8) & 0xF) * 0x11;
         unsigned g   = ((rgb >> 4) & 0xF) * 0x11;
         unsigned b   =  (rgb       & 0xF) * 0x11;
-        AV_WN32(&new_palette[i], (r << 16) | (g << 8) | b);
+        AV_WN32(&new_palette[i], (0xFFU << 24) | (r << 16) | (g << 8) | b);
     }
 }
 
@@ -72,7 +80,8 @@ static void bitplanar2chunky(CDXLVideoContext *c, int linesize, uint8_t *out)
     GetBitContext gb;
     int x, y, plane;
 
-    init_get_bits(&gb, c->video, c->video_size * 8);
+    if (init_get_bits8(&gb, c->video, c->video_size) < 0)
+        return;
     for (plane = 0; plane < c->bpp; plane++) {
         for (y = 0; y < c->avctx->height; y++) {
             for (x = 0; x < c->avctx->width; x++)
@@ -87,7 +96,8 @@ static void bitline2chunky(CDXLVideoContext *c, int linesize, uint8_t *out)
     GetBitContext  gb;
     int x, y, plane;
 
-    init_get_bits(&gb, c->video, c->video_size * 8);
+    if (init_get_bits8(&gb, c->video, c->video_size) < 0)
+        return;
     for (y = 0; y < c->avctx->height; y++) {
         for (plane = 0; plane < c->bpp; plane++) {
             for (x = 0; x < c->avctx->width; x++)
@@ -115,6 +125,7 @@ static void cdxl_decode_rgb(CDXLVideoContext *c, AVFrame *frame)
 {
     uint32_t *new_palette = (uint32_t *)frame->data[1];
 
+    memset(frame->data[1], 0, AVPALETTE_SIZE);
     import_palette(c, new_palette);
     import_format(c, frame->linesize[0], frame->data[0]);
 }
@@ -255,10 +266,8 @@ static int cdxl_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_PATCHWELCOME;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
 
     if (encoding) {
@@ -282,7 +291,7 @@ static av_cold int cdxl_decode_end(AVCodecContext *avctx)
 {
     CDXLVideoContext *c = avctx->priv_data;
 
-    av_free(c->new_video);
+    av_freep(&c->new_video);
 
     return 0;
 }
diff --git a/libavcodec/celp_filters.c b/libavcodec/celp_filters.c
index 61474f5906..a81fd8831b 100644
--- a/libavcodec/celp_filters.c
+++ b/libavcodec/celp_filters.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 
 #include "avcodec.h"
 #include "celp_filters.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 
 void ff_celp_convolve_circ(int16_t* fc_out, const int16_t* fc_in,
@@ -104,6 +105,8 @@ void ff_celp_lp_synthesis_filterf(float *out, const float *filter_coeffs,
     c -= filter_coeffs[1] * filter_coeffs[0];
     c -= filter_coeffs[0] * b;
 
+    av_assert2((filter_length&1)==0 && filter_length>=4);
+
     old_out0 = out[-4];
     old_out1 = out[-3];
     old_out2 = out[-2];
@@ -205,3 +208,12 @@ void ff_celp_lp_zero_synthesis_filterf(float *out, const float *filter_coeffs,
             out[n] += filter_coeffs[i-1] * in[n-i];
     }
 }
+
+void ff_celp_filter_init(CELPFContext *c)
+{
+    c->celp_lp_synthesis_filterf        = ff_celp_lp_synthesis_filterf;
+    c->celp_lp_zero_synthesis_filterf   = ff_celp_lp_zero_synthesis_filterf;
+
+    if(HAVE_MIPSFPU)
+        ff_celp_filter_init_mips(c);
+}
diff --git a/libavcodec/celp_filters.h b/libavcodec/celp_filters.h
index c328258460..f644ec325e 100644
--- a/libavcodec/celp_filters.h
+++ b/libavcodec/celp_filters.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,55 @@
 
 #include <stdint.h>
 
+typedef struct CELPFContext {
+    /**
+     * LP synthesis filter.
+     * @param[out] out pointer to output buffer
+     *        - the array out[-filter_length, -1] must
+     *        contain the previous result of this filter
+     * @param filter_coeffs filter coefficients.
+     * @param in input signal
+     * @param buffer_length amount of data to process
+     * @param filter_length filter length (10 for 10th order LP filter). Must be
+     *                      greater than 4 and even.
+     *
+     * @note Output buffer must contain filter_length samples of past
+     *       speech data before pointer.
+     *
+     * Routine applies 1/A(z) filter to given speech data.
+     */
+    void (*celp_lp_synthesis_filterf)(float *out, const float *filter_coeffs,
+                                      const float *in, int buffer_length,
+                                      int filter_length);
+
+    /**
+     * LP zero synthesis filter.
+     * @param[out] out pointer to output buffer
+     * @param filter_coeffs filter coefficients.
+     * @param in input signal
+     *        - the array in[-filter_length, -1] must
+     *        contain the previous input of this filter
+     * @param buffer_length amount of data to process (should be a multiple of eight)
+     * @param filter_length filter length (10 for 10th order LP filter;
+     *                                      should be a multiple of two)
+     *
+     * @note Output buffer must contain filter_length samples of past
+     *       speech data before pointer.
+     *
+     * Routine applies A(z) filter to given speech data.
+     */
+    void (*celp_lp_zero_synthesis_filterf)(float *out, const float *filter_coeffs,
+                                           const float *in, int buffer_length,
+                                           int filter_length);
+
+}CELPFContext;
+
+/**
+ * Initialize CELPFContext.
+ */
+void ff_celp_filter_init(CELPFContext *c);
+void ff_celp_filter_init_mips(CELPFContext *c);
+
 /**
  * Circularly convolve fixed vector with a phase dispersion impulse
  *        response filter (D.6.2 of G.729 and 6.1.5 of AMR).
diff --git a/libavcodec/celp_math.c b/libavcodec/celp_math.c
index a9ebef697c..a96b1aed9e 100644
--- a/libavcodec/celp_math.c
+++ b/libavcodec/celp_math.c
@@ -3,28 +3,29 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <inttypes.h>
 #include <limits.h>
-#include <assert.h>
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
+#include "mathops.h"
 #include "celp_math.h"
 #include "libavutil/common.h"
 
@@ -48,7 +49,7 @@ int ff_exp2(uint16_t power)
 {
     unsigned int result= exp2a[power>>10] + 0x10000;
 
-    assert(power <= 0x7fff);
+    av_assert2(power <= 0x7fff);
 
     result= (result<<3) + ((result*exp2b[(power>>5)&31])>>17);
     return result + ((result*(power&31)*89)>>22);
@@ -61,10 +62,17 @@ int ff_exp2(uint16_t power)
  */
 static const uint16_t tab_log2[33] =
 {
+#ifdef G729_BITEXACT
+      0,   1455,   2866,   4236,   5568,   6863,   8124,   9352,
+  10549,  11716,  12855,  13967,  15054,  16117,  17156,  18172,
+  19167,  20142,  21097,  22033,  22951,  23852,  24735,  25603,
+  26455,  27291,  28113,  28922,  29716,  30497,  31266,  32023,  32767,
+#else
       4,   1459,   2870,   4240,   5572,   6867,   8127,   9355,
   10552,  11719,  12858,  13971,  15057,  16120,  17158,  18175,
   19170,  20145,  21100,  22036,  22954,  23854,  24738,  25605,
   26457,  27294,  28116,  28924,  29719,  30500,  31269,  32025,  32769,
+#endif
 };
 
 int ff_log2_q15(uint32_t value)
@@ -86,3 +94,33 @@ int ff_log2_q15(uint32_t value)
 
     return (power_int << 15) + value;
 }
+
+int64_t ff_dot_product(const int16_t *a, const int16_t *b, int length)
+{
+    int i;
+    int64_t sum = 0;
+
+    for (i = 0; i < length; i++)
+        sum += MUL16(a[i], b[i]);
+
+    return sum;
+}
+
+float ff_dot_productf(const float* a, const float* b, int length)
+{
+    float sum = 0;
+    int i;
+
+    for(i=0; i<length; i++)
+        sum += a[i] * b[i];
+
+    return sum;
+}
+
+void ff_celp_math_init(CELPMContext *c)
+{
+    c->dot_productf   = ff_dot_productf;
+
+    if(HAVE_MIPSFPU)
+        ff_celp_math_init_mips(c);
+}
diff --git a/libavcodec/celp_math.h b/libavcodec/celp_math.h
index ed3f8c0fe2..18d3ad94d1 100644
--- a/libavcodec/celp_math.h
+++ b/libavcodec/celp_math.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,25 @@
 
 #include <stdint.h>
 
+typedef struct CELPMContext {
+    /**
+     * Return the dot product.
+     * @param a input data array
+     * @param b input data array
+     * @param length number of elements
+     *
+     * @return dot product = sum of elementwise products
+     */
+    float (*dot_productf)(const float* a, const float* b, int length);
+
+}CELPMContext;
+
+/**
+ * Initialize CELPMContext.
+ */
+void ff_celp_math_init(CELPMContext *c);
+void ff_celp_math_init_mips(CELPMContext *c);
+
 /**
  * fixed-point implementation of exp2(x) in [0; 1] domain.
  * @param power argument to exp2, 0 <= power <= 0x7fff
@@ -55,4 +74,24 @@ static inline int bidir_sal(int value, int offset)
     else           return value <<  offset;
 }
 
+/**
+ * returns the dot product of 2 int16_t vectors.
+ * @param a input data array
+ * @param b input data array
+ * @param length number of elements
+ *
+ * @return dot product = sum of elementwise products
+ */
+int64_t ff_dot_product(const int16_t *a, const int16_t *b, int length);
+
+/**
+ * Return the dot product.
+ * @param a input data array
+ * @param b input data array
+ * @param length number of elements
+ *
+ * @return dot product = sum of elementwise products
+ */
+float ff_dot_productf(const float* a, const float* b, int length);
+
 #endif /* AVCODEC_CELP_MATH_H */
diff --git a/libavcodec/cga_data.c b/libavcodec/cga_data.c
index 2c63ff2001..023a86b175 100644
--- a/libavcodec/cga_data.c
+++ b/libavcodec/cga_data.c
@@ -1,435 +1,46 @@
 /*
  * CGA/EGA/VGA ROM data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * CGA/EGA/VGA ROM data
+ * @note fonts are in libavutil/xga_font_data.[ch]
  */
 
 #include <stdint.h>
 #include "cga_data.h"
 
-const uint8_t ff_cga_font[2048] = {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x81, 0xa5, 0x81, 0xbd, 0x99, 0x81, 0x7e,
- 0x7e, 0xff, 0xdb, 0xff, 0xc3, 0xe7, 0xff, 0x7e, 0x6c, 0xfe, 0xfe, 0xfe, 0x7c, 0x38, 0x10, 0x00,
- 0x10, 0x38, 0x7c, 0xfe, 0x7c, 0x38, 0x10, 0x00, 0x38, 0x7c, 0x38, 0xfe, 0xfe, 0x7c, 0x38, 0x7c,
- 0x10, 0x10, 0x38, 0x7c, 0xfe, 0x7c, 0x38, 0x7c, 0x00, 0x00, 0x18, 0x3c, 0x3c, 0x18, 0x00, 0x00,
- 0xff, 0xff, 0xe7, 0xc3, 0xc3, 0xe7, 0xff, 0xff, 0x00, 0x3c, 0x66, 0x42, 0x42, 0x66, 0x3c, 0x00,
- 0xff, 0xc3, 0x99, 0xbd, 0xbd, 0x99, 0xc3, 0xff, 0x0f, 0x07, 0x0f, 0x7d, 0xcc, 0xcc, 0xcc, 0x78,
- 0x3c, 0x66, 0x66, 0x66, 0x3c, 0x18, 0x7e, 0x18, 0x3f, 0x33, 0x3f, 0x30, 0x30, 0x70, 0xf0, 0xe0,
- 0x7f, 0x63, 0x7f, 0x63, 0x63, 0x67, 0xe6, 0xc0, 0x99, 0x5a, 0x3c, 0xe7, 0xe7, 0x3c, 0x5a, 0x99,
- 0x80, 0xe0, 0xf8, 0xfe, 0xf8, 0xe0, 0x80, 0x00, 0x02, 0x0e, 0x3e, 0xfe, 0x3e, 0x0e, 0x02, 0x00,
- 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x66, 0x00,
- 0x7f, 0xdb, 0xdb, 0x7b, 0x1b, 0x1b, 0x1b, 0x00, 0x3e, 0x63, 0x38, 0x6c, 0x6c, 0x38, 0xcc, 0x78,
- 0x00, 0x00, 0x00, 0x00, 0x7e, 0x7e, 0x7e, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x7e, 0x3c, 0x18, 0xff,
- 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x00,
- 0x00, 0x18, 0x0c, 0xfe, 0x0c, 0x18, 0x00, 0x00, 0x00, 0x30, 0x60, 0xfe, 0x60, 0x30, 0x00, 0x00,
- 0x00, 0x00, 0xc0, 0xc0, 0xc0, 0xfe, 0x00, 0x00, 0x00, 0x24, 0x66, 0xff, 0x66, 0x24, 0x00, 0x00,
- 0x00, 0x18, 0x3c, 0x7e, 0xff, 0xff, 0x00, 0x00, 0x00, 0xff, 0xff, 0x7e, 0x3c, 0x18, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x78, 0x78, 0x30, 0x30, 0x00, 0x30, 0x00,
- 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6c, 0x6c, 0xfe, 0x6c, 0xfe, 0x6c, 0x6c, 0x00,
- 0x30, 0x7c, 0xc0, 0x78, 0x0c, 0xf8, 0x30, 0x00, 0x00, 0xc6, 0xcc, 0x18, 0x30, 0x66, 0xc6, 0x00,
- 0x38, 0x6c, 0x38, 0x76, 0xdc, 0xcc, 0x76, 0x00, 0x60, 0x60, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x30, 0x60, 0x60, 0x60, 0x30, 0x18, 0x00, 0x60, 0x30, 0x18, 0x18, 0x18, 0x30, 0x60, 0x00,
- 0x00, 0x66, 0x3c, 0xff, 0x3c, 0x66, 0x00, 0x00, 0x00, 0x30, 0x30, 0xfc, 0x30, 0x30, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x30, 0x60, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x30, 0x00, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0x80, 0x00,
- 0x7c, 0xc6, 0xce, 0xde, 0xf6, 0xe6, 0x7c, 0x00, 0x30, 0x70, 0x30, 0x30, 0x30, 0x30, 0xfc, 0x00,
- 0x78, 0xcc, 0x0c, 0x38, 0x60, 0xcc, 0xfc, 0x00, 0x78, 0xcc, 0x0c, 0x38, 0x0c, 0xcc, 0x78, 0x00,
- 0x1c, 0x3c, 0x6c, 0xcc, 0xfe, 0x0c, 0x1e, 0x00, 0xfc, 0xc0, 0xf8, 0x0c, 0x0c, 0xcc, 0x78, 0x00,
- 0x38, 0x60, 0xc0, 0xf8, 0xcc, 0xcc, 0x78, 0x00, 0xfc, 0xcc, 0x0c, 0x18, 0x30, 0x30, 0x30, 0x00,
- 0x78, 0xcc, 0xcc, 0x78, 0xcc, 0xcc, 0x78, 0x00, 0x78, 0xcc, 0xcc, 0x7c, 0x0c, 0x18, 0x70, 0x00,
- 0x00, 0x30, 0x30, 0x00, 0x00, 0x30, 0x30, 0x00, 0x00, 0x30, 0x30, 0x00, 0x00, 0x30, 0x30, 0x60,
- 0x18, 0x30, 0x60, 0xc0, 0x60, 0x30, 0x18, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0xfc, 0x00, 0x00,
- 0x60, 0x30, 0x18, 0x0c, 0x18, 0x30, 0x60, 0x00, 0x78, 0xcc, 0x0c, 0x18, 0x30, 0x00, 0x30, 0x00,
- 0x7c, 0xc6, 0xde, 0xde, 0xde, 0xc0, 0x78, 0x00, 0x30, 0x78, 0xcc, 0xcc, 0xfc, 0xcc, 0xcc, 0x00,
- 0xfc, 0x66, 0x66, 0x7c, 0x66, 0x66, 0xfc, 0x00, 0x3c, 0x66, 0xc0, 0xc0, 0xc0, 0x66, 0x3c, 0x00,
- 0xf8, 0x6c, 0x66, 0x66, 0x66, 0x6c, 0xf8, 0x00, 0xfe, 0x62, 0x68, 0x78, 0x68, 0x62, 0xfe, 0x00,
- 0xfe, 0x62, 0x68, 0x78, 0x68, 0x60, 0xf0, 0x00, 0x3c, 0x66, 0xc0, 0xc0, 0xce, 0x66, 0x3e, 0x00,
- 0xcc, 0xcc, 0xcc, 0xfc, 0xcc, 0xcc, 0xcc, 0x00, 0x78, 0x30, 0x30, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0x1e, 0x0c, 0x0c, 0x0c, 0xcc, 0xcc, 0x78, 0x00, 0xe6, 0x66, 0x6c, 0x78, 0x6c, 0x66, 0xe6, 0x00,
- 0xf0, 0x60, 0x60, 0x60, 0x62, 0x66, 0xfe, 0x00, 0xc6, 0xee, 0xfe, 0xfe, 0xd6, 0xc6, 0xc6, 0x00,
- 0xc6, 0xe6, 0xf6, 0xde, 0xce, 0xc6, 0xc6, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xc6, 0x6c, 0x38, 0x00,
- 0xfc, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xf0, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0xdc, 0x78, 0x1c, 0x00,
- 0xfc, 0x66, 0x66, 0x7c, 0x6c, 0x66, 0xe6, 0x00, 0x78, 0xcc, 0xe0, 0x70, 0x1c, 0xcc, 0x78, 0x00,
- 0xfc, 0xb4, 0x30, 0x30, 0x30, 0x30, 0x78, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xfc, 0x00,
- 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x78, 0x30, 0x00, 0xc6, 0xc6, 0xc6, 0xd6, 0xfe, 0xee, 0xc6, 0x00,
- 0xc6, 0xc6, 0x6c, 0x38, 0x38, 0x6c, 0xc6, 0x00, 0xcc, 0xcc, 0xcc, 0x78, 0x30, 0x30, 0x78, 0x00,
- 0xfe, 0xc6, 0x8c, 0x18, 0x32, 0x66, 0xfe, 0x00, 0x78, 0x60, 0x60, 0x60, 0x60, 0x60, 0x78, 0x00,
- 0xc0, 0x60, 0x30, 0x18, 0x0c, 0x06, 0x02, 0x00, 0x78, 0x18, 0x18, 0x18, 0x18, 0x18, 0x78, 0x00,
- 0x10, 0x38, 0x6c, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
- 0x30, 0x30, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0x76, 0x00,
- 0xe0, 0x60, 0x60, 0x7c, 0x66, 0x66, 0xdc, 0x00, 0x00, 0x00, 0x78, 0xcc, 0xc0, 0xcc, 0x78, 0x00,
- 0x1c, 0x0c, 0x0c, 0x7c, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x78, 0xcc, 0xfc, 0xc0, 0x78, 0x00,
- 0x38, 0x6c, 0x60, 0xf0, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x76, 0xcc, 0xcc, 0x7c, 0x0c, 0xf8,
- 0xe0, 0x60, 0x6c, 0x76, 0x66, 0x66, 0xe6, 0x00, 0x30, 0x00, 0x70, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0x0c, 0x00, 0x0c, 0x0c, 0x0c, 0xcc, 0xcc, 0x78, 0xe0, 0x60, 0x66, 0x6c, 0x78, 0x6c, 0xe6, 0x00,
- 0x70, 0x30, 0x30, 0x30, 0x30, 0x30, 0x78, 0x00, 0x00, 0x00, 0xcc, 0xfe, 0xfe, 0xd6, 0xc6, 0x00,
- 0x00, 0x00, 0xf8, 0xcc, 0xcc, 0xcc, 0xcc, 0x00, 0x00, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0x78, 0x00,
- 0x00, 0x00, 0xdc, 0x66, 0x66, 0x7c, 0x60, 0xf0, 0x00, 0x00, 0x76, 0xcc, 0xcc, 0x7c, 0x0c, 0x1e,
- 0x00, 0x00, 0xdc, 0x76, 0x66, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x7c, 0xc0, 0x78, 0x0c, 0xf8, 0x00,
- 0x10, 0x30, 0x7c, 0x30, 0x30, 0x34, 0x18, 0x00, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00,
- 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0x78, 0x30, 0x00, 0x00, 0x00, 0xc6, 0xd6, 0xfe, 0xfe, 0x6c, 0x00,
- 0x00, 0x00, 0xc6, 0x6c, 0x38, 0x6c, 0xc6, 0x00, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0xf8,
- 0x00, 0x00, 0xfc, 0x98, 0x30, 0x64, 0xfc, 0x00, 0x1c, 0x30, 0x30, 0xe0, 0x30, 0x30, 0x1c, 0x00,
- 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x00, 0xe0, 0x30, 0x30, 0x1c, 0x30, 0x30, 0xe0, 0x00,
- 0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0x00,
- 0x78, 0xcc, 0xc0, 0xcc, 0x78, 0x18, 0x0c, 0x78, 0x00, 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0x7e, 0x00,
- 0x1c, 0x00, 0x78, 0xcc, 0xfc, 0xc0, 0x78, 0x00, 0x7e, 0xc3, 0x3c, 0x06, 0x3e, 0x66, 0x3f, 0x00,
- 0xcc, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0x7e, 0x00, 0xe0, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0x7e, 0x00,
- 0x30, 0x30, 0x78, 0x0c, 0x7c, 0xcc, 0x7e, 0x00, 0x00, 0x00, 0x78, 0xc0, 0xc0, 0x78, 0x0c, 0x38,
- 0x7e, 0xc3, 0x3c, 0x66, 0x7e, 0x60, 0x3c, 0x00, 0xcc, 0x00, 0x78, 0xcc, 0xfc, 0xc0, 0x78, 0x00,
- 0xe0, 0x00, 0x78, 0xcc, 0xfc, 0xc0, 0x78, 0x00, 0xcc, 0x00, 0x70, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0x7c, 0xc6, 0x38, 0x18, 0x18, 0x18, 0x3c, 0x00, 0xe0, 0x00, 0x70, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0xc6, 0x38, 0x6c, 0xc6, 0xfe, 0xc6, 0xc6, 0x00, 0x30, 0x30, 0x00, 0x78, 0xcc, 0xfc, 0xcc, 0x00,
- 0x1c, 0x00, 0xfc, 0x60, 0x78, 0x60, 0xfc, 0x00, 0x00, 0x00, 0x7f, 0x0c, 0x7f, 0xcc, 0x7f, 0x00,
- 0x3e, 0x6c, 0xcc, 0xfe, 0xcc, 0xcc, 0xce, 0x00, 0x78, 0xcc, 0x00, 0x78, 0xcc, 0xcc, 0x78, 0x00,
- 0x00, 0xcc, 0x00, 0x78, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0xe0, 0x00, 0x78, 0xcc, 0xcc, 0x78, 0x00,
- 0x78, 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0x7e, 0x00, 0x00, 0xe0, 0x00, 0xcc, 0xcc, 0xcc, 0x7e, 0x00,
- 0x00, 0xcc, 0x00, 0xcc, 0xcc, 0x7c, 0x0c, 0xf8, 0xc3, 0x18, 0x3c, 0x66, 0x66, 0x3c, 0x18, 0x00,
- 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x18, 0x18, 0x7e, 0xc0, 0xc0, 0x7e, 0x18, 0x18,
- 0x38, 0x6c, 0x64, 0xf0, 0x60, 0xe6, 0xfc, 0x00, 0xcc, 0xcc, 0x78, 0xfc, 0x30, 0xfc, 0x30, 0x30,
- 0xf8, 0xcc, 0xcc, 0xfa, 0xc6, 0xcf, 0xc6, 0xc7, 0x0e, 0x1b, 0x18, 0x3c, 0x18, 0x18, 0xd8, 0x70,
- 0x1c, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0x7e, 0x00, 0x38, 0x00, 0x70, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0x00, 0x1c, 0x00, 0x78, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x1c, 0x00, 0xcc, 0xcc, 0xcc, 0x7e, 0x00,
- 0x00, 0xf8, 0x00, 0xf8, 0xcc, 0xcc, 0xcc, 0x00, 0xfc, 0x00, 0xcc, 0xec, 0xfc, 0xdc, 0xcc, 0x00,
- 0x3c, 0x6c, 0x6c, 0x3e, 0x00, 0x7e, 0x00, 0x00, 0x38, 0x6c, 0x6c, 0x38, 0x00, 0x7c, 0x00, 0x00,
- 0x30, 0x00, 0x30, 0x60, 0xc0, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00, 0xfc, 0xc0, 0xc0, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0xfc, 0x0c, 0x0c, 0x00, 0x00, 0xc3, 0xc6, 0xcc, 0xde, 0x33, 0x66, 0xcc, 0x0f,
- 0xc3, 0xc6, 0xcc, 0xdb, 0x37, 0x6f, 0xcf, 0x03, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x18, 0x00,
- 0x00, 0x33, 0x66, 0xcc, 0x66, 0x33, 0x00, 0x00, 0x00, 0xcc, 0x66, 0x33, 0x66, 0xcc, 0x00, 0x00,
- 0x22, 0x88, 0x22, 0x88, 0x22, 0x88, 0x22, 0x88, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa,
- 0xdb, 0x77, 0xdb, 0xee, 0xdb, 0x77, 0xdb, 0xee, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8, 0x18, 0x18, 0x18,
- 0x36, 0x36, 0x36, 0x36, 0xf6, 0x36, 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0xf8, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x36, 0x36, 0xf6, 0x06, 0xf6, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x00, 0x00, 0xfe, 0x06, 0xf6, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0xf6, 0x06, 0xfe, 0x00, 0x00, 0x00, 0x36, 0x36, 0x36, 0x36, 0xfe, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0xf8, 0x18, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0xff, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0xff, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x36, 0x36, 0x36, 0x36, 0x37, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x37, 0x30, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x30, 0x37, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0xf7, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xf7, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x37, 0x30, 0x37, 0x36, 0x36, 0x36, 0x00, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0xf7, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x18, 0x18, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0x36, 0x36, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x3f, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0x3f, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xff, 0x36, 0x36, 0x36,
- 0x18, 0x18, 0xff, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x18, 0x18, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
- 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x76, 0xdc, 0xc8, 0xdc, 0x76, 0x00, 0x00, 0x78, 0xcc, 0xf8, 0xcc, 0xf8, 0xc0, 0xc0,
- 0x00, 0xfc, 0xcc, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0xfe, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00,
- 0xfc, 0xcc, 0x60, 0x30, 0x60, 0xcc, 0xfc, 0x00, 0x00, 0x00, 0x7e, 0xd8, 0xd8, 0xd8, 0x70, 0x00,
- 0x00, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x60, 0xc0, 0x00, 0x76, 0xdc, 0x18, 0x18, 0x18, 0x18, 0x00,
- 0xfc, 0x30, 0x78, 0xcc, 0xcc, 0x78, 0x30, 0xfc, 0x38, 0x6c, 0xc6, 0xfe, 0xc6, 0x6c, 0x38, 0x00,
- 0x38, 0x6c, 0xc6, 0xc6, 0x6c, 0x6c, 0xee, 0x00, 0x1c, 0x30, 0x18, 0x7c, 0xcc, 0xcc, 0x78, 0x00,
- 0x00, 0x00, 0x7e, 0xdb, 0xdb, 0x7e, 0x00, 0x00, 0x06, 0x0c, 0x7e, 0xdb, 0xdb, 0x7e, 0x60, 0xc0,
- 0x38, 0x60, 0xc0, 0xf8, 0xc0, 0x60, 0x38, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x00,
- 0x00, 0xfc, 0x00, 0xfc, 0x00, 0xfc, 0x00, 0x00, 0x30, 0x30, 0xfc, 0x30, 0x30, 0x00, 0xfc, 0x00,
- 0x60, 0x30, 0x18, 0x30, 0x60, 0x00, 0xfc, 0x00, 0x18, 0x30, 0x60, 0x30, 0x18, 0x00, 0xfc, 0x00,
- 0x0e, 0x1b, 0x1b, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xd8, 0xd8, 0x70,
- 0x30, 0x30, 0x00, 0xfc, 0x00, 0x30, 0x30, 0x00, 0x00, 0x76, 0xdc, 0x00, 0x76, 0xdc, 0x00, 0x00,
- 0x38, 0x6c, 0x6c, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0f, 0x0c, 0x0c, 0x0c, 0xec, 0x6c, 0x3c, 0x1c,
- 0x78, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x70, 0x18, 0x30, 0x60, 0x78, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x3c, 0x3c, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-};
-
-const uint8_t ff_vga16_font[4096] = {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7e, 0x81, 0xa5, 0x81, 0x81, 0xbd, 0x99, 0x81, 0x81, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7e, 0xff, 0xdb, 0xff, 0xff, 0xc3, 0xe7, 0xff, 0xff, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xfe, 0xfe, 0xfe, 0x7c, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x7c, 0xfe, 0x7c, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x18, 0x3c, 0x3c, 0xe7, 0xe7, 0xe7, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x18, 0x3c, 0x7e, 0xff, 0xff, 0x7e, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xe7, 0xc3, 0xc3, 0xe7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0x42, 0x42, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xc3, 0x99, 0xbd, 0xbd, 0x99, 0xc3, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x1e, 0x0e, 0x1a, 0x32, 0x78, 0xcc, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0x66, 0x66, 0x66, 0x3c, 0x18, 0x7e, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3f, 0x33, 0x3f, 0x30, 0x30, 0x30, 0x30, 0x70, 0xf0, 0xe0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7f, 0x63, 0x7f, 0x63, 0x63, 0x63, 0x63, 0x67, 0xe7, 0xe6, 0xc0, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x18, 0x18, 0xdb, 0x3c, 0xe7, 0x3c, 0xdb, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfe, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x02, 0x06, 0x0e, 0x1e, 0x3e, 0xfe, 0x3e, 0x1e, 0x0e, 0x06, 0x02, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7f, 0xdb, 0xdb, 0xdb, 0x7b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x7c, 0xc6, 0x60, 0x38, 0x6c, 0xc6, 0xc6, 0x6c, 0x38, 0x0c, 0xc6, 0x7c, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0xfe, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x0c, 0xfe, 0x0c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x60, 0xfe, 0x60, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0xc0, 0xc0, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x24, 0x66, 0xff, 0x66, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x38, 0x7c, 0x7c, 0xfe, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0x7c, 0x7c, 0x38, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x3c, 0x3c, 0x3c, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x66, 0x66, 0x66, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x6c, 0x6c, 0xfe, 0x6c, 0x6c, 0x6c, 0xfe, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x7c, 0xc6, 0xc2, 0xc0, 0x7c, 0x06, 0x06, 0x86, 0xc6, 0x7c, 0x18, 0x18, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xc2, 0xc6, 0x0c, 0x18, 0x30, 0x60, 0xc6, 0x86, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x6c, 0x6c, 0x38, 0x76, 0xdc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x30, 0x30, 0x30, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x0c, 0x18, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x18, 0x0c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x30, 0x18, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x18, 0x30, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x3c, 0xff, 0x3c, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x30, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x02, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0x80, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0xc3, 0xc3, 0xdb, 0xdb, 0xc3, 0xc3, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x38, 0x78, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0x06, 0x06, 0x3c, 0x06, 0x06, 0x06, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x0c, 0x1c, 0x3c, 0x6c, 0xcc, 0xfe, 0x0c, 0x0c, 0x0c, 0x1e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0xc0, 0xc0, 0xc0, 0xfc, 0x06, 0x06, 0x06, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x60, 0xc0, 0xc0, 0xfc, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0xc6, 0x06, 0x06, 0x0c, 0x18, 0x30, 0x30, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x06, 0x06, 0x0c, 0x78, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x18, 0x18, 0x30, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x06, 0x0c, 0x18, 0x30, 0x60, 0x30, 0x18, 0x0c, 0x06, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x60, 0x30, 0x18, 0x0c, 0x06, 0x0c, 0x18, 0x30, 0x60, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0x0c, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xde, 0xde, 0xde, 0xdc, 0xc0, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x66, 0x66, 0x66, 0x66, 0xfc, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0xc2, 0xc0, 0xc0, 0xc0, 0xc0, 0xc2, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xf8, 0x6c, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x6c, 0xf8, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0x66, 0x62, 0x68, 0x78, 0x68, 0x60, 0x62, 0x66, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0x66, 0x62, 0x68, 0x78, 0x68, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0xc2, 0xc0, 0xc0, 0xde, 0xc6, 0xc6, 0x66, 0x3a, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x1e, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xe6, 0x66, 0x66, 0x6c, 0x78, 0x78, 0x6c, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xf0, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x62, 0x66, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xe7, 0xff, 0xff, 0xdb, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0xe6, 0xf6, 0xfe, 0xde, 0xce, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xd6, 0xde, 0x7c, 0x0c, 0x0e, 0x00, 0x00,
- 0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x6c, 0x66, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0x60, 0x38, 0x0c, 0x06, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xff, 0xdb, 0x99, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xdb, 0xdb, 0xff, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x18, 0x3c, 0x66, 0xc3, 0xc3, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xff, 0xc3, 0x86, 0x0c, 0x18, 0x30, 0x60, 0xc1, 0xc3, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x80, 0xc0, 0xe0, 0x70, 0x38, 0x1c, 0x0e, 0x06, 0x02, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x10, 0x38, 0x6c, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
- 0x30, 0x30, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xe0, 0x60, 0x60, 0x78, 0x6c, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc0, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x1c, 0x0c, 0x0c, 0x3c, 0x6c, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x6c, 0x64, 0x60, 0xf0, 0x60, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0xcc, 0x78, 0x00,
- 0x00, 0x00, 0xe0, 0x60, 0x60, 0x6c, 0x76, 0x66, 0x66, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x18, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x06, 0x06, 0x00, 0x0e, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x66, 0x66, 0x3c, 0x00,
- 0x00, 0x00, 0xe0, 0x60, 0x60, 0x66, 0x6c, 0x78, 0x78, 0x6c, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xe6, 0xff, 0xdb, 0xdb, 0xdb, 0xdb, 0xdb, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xf0, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0x0c, 0x1e, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x76, 0x66, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0x60, 0x38, 0x0c, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x10, 0x30, 0x30, 0xfc, 0x30, 0x30, 0x30, 0x30, 0x36, 0x1c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xdb, 0xdb, 0xff, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0x66, 0x3c, 0x18, 0x3c, 0x66, 0xc3, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x0c, 0xf8, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xcc, 0x18, 0x30, 0x60, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x0e, 0x18, 0x18, 0x18, 0x70, 0x18, 0x18, 0x18, 0x18, 0x0e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x70, 0x18, 0x18, 0x18, 0x0e, 0x18, 0x18, 0x18, 0x18, 0x70, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0xc2, 0xc0, 0xc0, 0xc0, 0xc2, 0x66, 0x3c, 0x0c, 0x06, 0x7c, 0x00, 0x00,
- 0x00, 0x00, 0xcc, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x0c, 0x18, 0x30, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x10, 0x38, 0x6c, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xcc, 0x00, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x38, 0x6c, 0x38, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0x60, 0x60, 0x66, 0x3c, 0x0c, 0x06, 0x3c, 0x00, 0x00, 0x00,
- 0x00, 0x10, 0x38, 0x6c, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0x00, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x66, 0x00, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x18, 0x3c, 0x66, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xc6, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x38, 0x6c, 0x38, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x30, 0x60, 0x00, 0xfe, 0x66, 0x60, 0x7c, 0x60, 0x60, 0x66, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x6e, 0x3b, 0x1b, 0x7e, 0xd8, 0xdc, 0x77, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3e, 0x6c, 0xcc, 0xcc, 0xfe, 0xcc, 0xcc, 0xcc, 0xcc, 0xce, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x10, 0x38, 0x6c, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x30, 0x78, 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x0c, 0x78, 0x00,
- 0x00, 0xc6, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xc6, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x18, 0x18, 0x7e, 0xc3, 0xc0, 0xc0, 0xc0, 0xc3, 0x7e, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x38, 0x6c, 0x64, 0x60, 0xf0, 0x60, 0x60, 0x60, 0x60, 0xe6, 0xfc, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0x66, 0x3c, 0x18, 0xff, 0x18, 0xff, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xfc, 0x66, 0x66, 0x7c, 0x62, 0x66, 0x6f, 0x66, 0x66, 0x66, 0xf3, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x0e, 0x1b, 0x18, 0x18, 0x18, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x18, 0xd8, 0x70, 0x00, 0x00,
- 0x00, 0x18, 0x30, 0x60, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x0c, 0x18, 0x30, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x18, 0x30, 0x60, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x18, 0x30, 0x60, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x76, 0xdc, 0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x76, 0xdc, 0x00, 0xc6, 0xe6, 0xf6, 0xfe, 0xde, 0xce, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x3c, 0x6c, 0x6c, 0x3e, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x38, 0x6c, 0x6c, 0x38, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x30, 0x30, 0x00, 0x30, 0x30, 0x60, 0xc0, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x06, 0x06, 0x06, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xc0, 0xc0, 0xc2, 0xc6, 0xcc, 0x18, 0x30, 0x60, 0xce, 0x9b, 0x06, 0x0c, 0x1f, 0x00, 0x00,
- 0x00, 0xc0, 0xc0, 0xc2, 0xc6, 0xcc, 0x18, 0x30, 0x66, 0xce, 0x96, 0x3e, 0x06, 0x06, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x3c, 0x3c, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x6c, 0xd8, 0x6c, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xd8, 0x6c, 0x36, 0x6c, 0xd8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44,
- 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa,
- 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0xf6, 0x06, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x06, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0xf6, 0x06, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x37, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x37, 0x30, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x30, 0x37, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0xf7, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x37, 0x30, 0x37, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0xf7, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xff, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
- 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0xd8, 0xd8, 0xd8, 0xdc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0xd8, 0xcc, 0xc6, 0xc6, 0xc6, 0xcc, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0xc6, 0xc6, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xfe, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0xfe, 0xc6, 0x60, 0x30, 0x18, 0x30, 0x60, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0x70, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x66, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xc0, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x7e, 0x18, 0x3c, 0x66, 0x66, 0x66, 0x3c, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0x6c, 0x38, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xc6, 0x6c, 0x6c, 0x6c, 0x6c, 0xee, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x1e, 0x30, 0x18, 0x0c, 0x3e, 0x66, 0x66, 0x66, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xdb, 0xdb, 0xdb, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x03, 0x06, 0x7e, 0xdb, 0xdb, 0xf3, 0x7e, 0x60, 0xc0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x1c, 0x30, 0x60, 0x60, 0x7c, 0x60, 0x60, 0x60, 0x30, 0x1c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xfe, 0x00, 0x00, 0xfe, 0x00, 0x00, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e, 0x18, 0x18, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x30, 0x18, 0x0c, 0x06, 0x0c, 0x18, 0x30, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, 0x60, 0x30, 0x18, 0x0c, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x0e, 0x1b, 0x1b, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xd8, 0xd8, 0xd8, 0x70, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x7e, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0x00, 0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x38, 0x6c, 0x6c, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x0f, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0xec, 0x6c, 0x6c, 0x3c, 0x1c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xd8, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x70, 0xd8, 0x30, 0x60, 0xc8, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-};
-
 const uint32_t ff_cga_palette[16] = {
-    0x000000, 0x0000AA, 0x00AA00, 0x00AAAA, 0xAA0000, 0xAA00AA, 0xAA5500, 0xAAAAAA,
-    0x555555, 0x5555FF, 0x55FF55, 0x55FFFF, 0xFF5555, 0xFF55FF, 0xFFFF55, 0xFFFFFF,
+    0xFF000000, 0xFF0000AA, 0xFF00AA00, 0xFF00AAAA, 0xFFAA0000, 0xFFAA00AA, 0xFFAA5500, 0xFFAAAAAA,
+    0xFF555555, 0xFF5555FF, 0xFF55FF55, 0xFF55FFFF, 0xFFFF5555, 0xFFFF55FF, 0xFFFFFF55, 0xFFFFFFFF,
 };
 
 const uint32_t ff_ega_palette[64] = {
-    0x000000, 0x0000AA, 0x00AA00, 0x00AAAA, 0xAA0000, 0xAA00AA, 0xAAAA00, 0xAAAAAA,
-    0x000055, 0x0000FF, 0x00AA55, 0x00AAFF, 0xAA0055, 0xAA00FF, 0xAAAA55, 0xAAAAFF,
-    0x005500, 0x0055AA, 0x00FF00, 0x00FFAA, 0xAA5500, 0xAA55AA, 0xAAFF00, 0xAAFFAA,
-    0x005555, 0x0055FF, 0x00FF55, 0x00FFFF, 0xAA5555, 0xAA55FF, 0xAAFF55, 0xAAFFFF,
-    0x550000, 0x5500AA, 0x55AA00, 0x55AAAA, 0xFF0000, 0xFF00AA, 0xFFAA00, 0xFFAAAA,
-    0x550055, 0x5500FF, 0x55AA55, 0x55AAFF, 0xFF0055, 0xFF00FF, 0xFFAA55, 0xFFAAFF,
-    0x555500, 0x5555AA, 0x55FF00, 0x55FFAA, 0xFF5500, 0xFF55AA, 0xFFFF00, 0xFFFFAA,
-    0x555555, 0x5555FF, 0x55FF55, 0x55FFFF, 0xFF5555, 0xFF55FF, 0xFFFF55, 0xFFFFFF
+    0xFF000000, 0xFF0000AA, 0xFF00AA00, 0xFF00AAAA, 0xFFAA0000, 0xFFAA00AA, 0xFFAAAA00, 0xFFAAAAAA,
+    0xFF000055, 0xFF0000FF, 0xFF00AA55, 0xFF00AAFF, 0xFFAA0055, 0xFFAA00FF, 0xFFAAAA55, 0xFFAAAAFF,
+    0xFF005500, 0xFF0055AA, 0xFF00FF00, 0xFF00FFAA, 0xFFAA5500, 0xFFAA55AA, 0xFFAAFF00, 0xFFAAFFAA,
+    0xFF005555, 0xFF0055FF, 0xFF00FF55, 0xFF00FFFF, 0xFFAA5555, 0xFFAA55FF, 0xFFAAFF55, 0xFFAAFFFF,
+    0xFF550000, 0xFF5500AA, 0xFF55AA00, 0xFF55AAAA, 0xFFFF0000, 0xFFFF00AA, 0xFFFFAA00, 0xFFFFAAAA,
+    0xFF550055, 0xFF5500FF, 0xFF55AA55, 0xFF55AAFF, 0xFFFF0055, 0xFFFF00FF, 0xFFFFAA55, 0xFFFFAAFF,
+    0xFF555500, 0xFF5555AA, 0xFF55FF00, 0xFF55FFAA, 0xFFFF5500, 0xFFFF55AA, 0xFFFFFF00, 0xFFFFFFAA,
+    0xFF555555, 0xFF5555FF, 0xFF55FF55, 0xFF55FFFF, 0xFFFF5555, 0xFFFF55FF, 0xFFFFFF55, 0xFFFFFFFF
 };
 
 void ff_draw_pc_font(uint8_t *dst, int linesize, const uint8_t *font, int font_height, int ch, int fg, int bg)
diff --git a/libavcodec/cga_data.h b/libavcodec/cga_data.h
index 2149cfd2f1..3f5281a264 100644
--- a/libavcodec/cga_data.h
+++ b/libavcodec/cga_data.h
@@ -1,26 +1,27 @@
 /*
  * CGA/EGA/VGA ROM data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * CGA/EGA/VGA ROM data
+ * @note fonts are in libavutil/xga_font_data.[ch]
  */
 
 #ifndef AVCODEC_CGA_DATA_H
@@ -28,8 +29,6 @@
 
 #include <stdint.h>
 
-extern const uint8_t ff_cga_font[2048];
-extern const uint8_t ff_vga16_font[4096];
 extern const uint32_t ff_cga_palette[16];
 extern const uint32_t ff_ega_palette[64];
 
diff --git a/libavcodec/chomp_bsf.c b/libavcodec/chomp_bsf.c
index 9ed7496930..2b93fa999e 100644
--- a/libavcodec/chomp_bsf.c
+++ b/libavcodec/chomp_bsf.c
@@ -2,20 +2,20 @@
  * Chomp bitstream filter
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,7 +41,6 @@ static int chomp_filter(AVBitStreamFilterContext *bsfc,
  * This filter removes a string of NULL bytes from the end of a packet.
  */
 AVBitStreamFilter ff_chomp_bsf = {
-    "chomp",
-    0,
-    chomp_filter,
+    .name   = "chomp",
+    .filter = chomp_filter,
 };
diff --git a/libavcodec/cinepak.c b/libavcodec/cinepak.c
index a3ca817479..f1a4656303 100644
--- a/libavcodec/cinepak.c
+++ b/libavcodec/cinepak.c
@@ -1,21 +1,21 @@
 /*
  * Cinepak Video Decoder
- * Copyright (C) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,9 @@
  *   http://www.csse.monash.edu.au/~timf/
  * @see For more information on the quirky data inside Sega FILM/CPK files, visit:
  *   http://wiki.multimedia.cx/index.php?title=Sega_FILM
+ *
+ * Cinepak colorspace support (c) 2013 Rl, Aetey Global Technologies AB
+ * @author Cinepak colorspace, Rl, Aetey Global Technologies AB
  */
 
 #include <stdio.h>
@@ -40,10 +43,7 @@
 #include "internal.h"
 
 
-typedef struct cvid_codebook {
-    uint8_t  y0, y1, y2, y3;
-    uint8_t  u, v;
-} cvid_codebook;
+typedef uint8_t cvid_codebook[12];
 
 #define MAX_STRIPS      32
 
@@ -79,12 +79,14 @@ static void cinepak_decode_codebook (cvid_codebook *codebook,
     const uint8_t *eod = (data + size);
     uint32_t flag, mask;
     int      i, n;
+    uint8_t *p;
 
     /* check if this chunk contains 4- or 6-element vectors */
     n    = (chunk_id & 0x04) ? 4 : 6;
     flag = 0;
     mask = 0;
 
+    p = codebook[0];
     for (i=0; i < 256; i++) {
         if ((chunk_id & 0x01) && !(mask >>= 1)) {
             if ((data + 4) > eod)
@@ -96,28 +98,33 @@ static void cinepak_decode_codebook (cvid_codebook *codebook,
         }
 
         if (!(chunk_id & 0x01) || (flag & mask)) {
+            int k, kk;
+
             if ((data + n) > eod)
                 break;
 
+            for (k = 0; k < 4; ++k) {
+                int r = *data++;
+                for (kk = 0; kk < 3; ++kk)
+                    *p++ = r;
+            }
             if (n == 6) {
-                codebook[i].y0 = *data++;
-                codebook[i].y1 = *data++;
-                codebook[i].y2 = *data++;
-                codebook[i].y3 = *data++;
-                codebook[i].u  = 128 + *data++;
-                codebook[i].v  = 128 + *data++;
-            } else {
-                /* this codebook type indicates either greyscale or
-                 * palettized video; if palettized, U & V components will
-                 * not be used so it is safe to set them to 128 for the
-                 * benefit of greyscale rendering in YUV420P */
-                codebook[i].y0 = *data++;
-                codebook[i].y1 = *data++;
-                codebook[i].y2 = *data++;
-                codebook[i].y3 = *data++;
-                codebook[i].u  = 128;
-                codebook[i].v  = 128;
+                int r, g, b, u, v;
+                u = *(int8_t *)data++;
+                v = *(int8_t *)data++;
+                p -= 12;
+                for(k=0; k<4; ++k) {
+                    r = *p++ + v*2;
+                    g = *p++ - (u/2) - v;
+                    b = *p   + u*2;
+                    p -= 2;
+                    *p++ = av_clip_uint8(r);
+                    *p++ = av_clip_uint8(g);
+                    *p++ = av_clip_uint8(b);
+                }
             }
+        } else {
+            p += 12;
         }
     }
 }
@@ -127,25 +134,31 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
 {
     const uint8_t   *eod = (data + size);
     uint32_t         flag, mask;
-    cvid_codebook   *codebook;
-    unsigned int     x, y;
-    uint32_t         iy[4];
-    uint32_t         iu[2];
-    uint32_t         iv[2];
+    uint8_t         *cb0, *cb1, *cb2, *cb3;
+    int             x, y;
+    char            *ip0, *ip1, *ip2, *ip3;
 
     flag = 0;
     mask = 0;
 
     for (y=strip->y1; y < strip->y2; y+=4) {
 
-        iy[0] = strip->x1 + (y * s->frame->linesize[0]);
-        iy[1] = iy[0] + s->frame->linesize[0];
-        iy[2] = iy[1] + s->frame->linesize[0];
-        iy[3] = iy[2] + s->frame->linesize[0];
-        iu[0] = (strip->x1/2) + ((y/2) * s->frame->linesize[1]);
-        iu[1] = iu[0] + s->frame->linesize[1];
-        iv[0] = (strip->x1/2) + ((y/2) * s->frame->linesize[2]);
-        iv[1] = iv[0] + s->frame->linesize[2];
+/* take care of y dimension not being multiple of 4, such streams exist */
+        ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
+          (s->palette_video?strip->x1:strip->x1*3) + (y * s->frame->linesize[0]);
+        if(s->avctx->height - y > 1) {
+            ip1 = ip0 + s->frame->linesize[0];
+            if(s->avctx->height - y > 2) {
+                ip2 = ip1 + s->frame->linesize[0];
+                if(s->avctx->height - y > 3) {
+                    ip3 = ip2 + s->frame->linesize[0];
+                }
+            }
+        }
+/* to get the correct picture for not-multiple-of-4 cases let us fill
+ * each block from the bottom up, thus possibly overwriting the top line
+ * more than once but ending with the correct data in place
+ * (instead of in-loop checking) */
 
         for (x=strip->x1; x < strip->x2; x+=4) {
             if ((chunk_id & 0x01) && !(mask >>= 1)) {
@@ -168,97 +181,82 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
                 }
 
                 if ((chunk_id & 0x02) || (~flag & mask)) {
+                    uint8_t *p;
                     if (data >= eod)
                         return AVERROR_INVALIDDATA;
 
-                    codebook = &strip->v1_codebook[*data++];
-                    s->frame->data[0][iy[0] + 0] = codebook->y0;
-                    s->frame->data[0][iy[0] + 1] = codebook->y0;
-                    s->frame->data[0][iy[1] + 0] = codebook->y0;
-                    s->frame->data[0][iy[1] + 1] = codebook->y0;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[0]] = codebook->u;
-                        s->frame->data[2][iv[0]] = codebook->v;
-                    }
-
-                    s->frame->data[0][iy[0] + 2] = codebook->y1;
-                    s->frame->data[0][iy[0] + 3] = codebook->y1;
-                    s->frame->data[0][iy[1] + 2] = codebook->y1;
-                    s->frame->data[0][iy[1] + 3] = codebook->y1;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[0] + 1] = codebook->u;
-                        s->frame->data[2][iv[0] + 1] = codebook->v;
-                    }
-
-                    s->frame->data[0][iy[2] + 0] = codebook->y2;
-                    s->frame->data[0][iy[2] + 1] = codebook->y2;
-                    s->frame->data[0][iy[3] + 0] = codebook->y2;
-                    s->frame->data[0][iy[3] + 1] = codebook->y2;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[1]] = codebook->u;
-                        s->frame->data[2][iv[1]] = codebook->v;
-                    }
-
-                    s->frame->data[0][iy[2] + 2] = codebook->y3;
-                    s->frame->data[0][iy[2] + 3] = codebook->y3;
-                    s->frame->data[0][iy[3] + 2] = codebook->y3;
-                    s->frame->data[0][iy[3] + 3] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[1] + 1] = codebook->u;
-                        s->frame->data[2][iv[1] + 1] = codebook->v;
+                    p = strip->v1_codebook[*data++];
+                    if (s->palette_video) {
+                        ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[6];
+                        ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[9];
+                        ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0];
+                        ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[3];
+                    } else {
+                        p += 6;
+                        memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3);
+                        memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3);
+                        p += 3; /* ... + 9 */
+                        memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3);
+                        memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3);
+                        p -= 9; /* ... + 0 */
+                        memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3);
+                        memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3);
+                        p += 3; /* ... + 3 */
+                        memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3);
+                        memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3);
                     }
 
                 } else if (flag & mask) {
                     if ((data + 4) > eod)
                         return AVERROR_INVALIDDATA;
 
-                    codebook = &strip->v4_codebook[*data++];
-                    s->frame->data[0][iy[0] + 0] = codebook->y0;
-                    s->frame->data[0][iy[0] + 1] = codebook->y1;
-                    s->frame->data[0][iy[1] + 0] = codebook->y2;
-                    s->frame->data[0][iy[1] + 1] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[0]] = codebook->u;
-                        s->frame->data[2][iv[0]] = codebook->v;
-                    }
-
-                    codebook = &strip->v4_codebook[*data++];
-                    s->frame->data[0][iy[0] + 2] = codebook->y0;
-                    s->frame->data[0][iy[0] + 3] = codebook->y1;
-                    s->frame->data[0][iy[1] + 2] = codebook->y2;
-                    s->frame->data[0][iy[1] + 3] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[0] + 1] = codebook->u;
-                        s->frame->data[2][iv[0] + 1] = codebook->v;
-                    }
-
-                    codebook = &strip->v4_codebook[*data++];
-                    s->frame->data[0][iy[2] + 0] = codebook->y0;
-                    s->frame->data[0][iy[2] + 1] = codebook->y1;
-                    s->frame->data[0][iy[3] + 0] = codebook->y2;
-                    s->frame->data[0][iy[3] + 1] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[1]] = codebook->u;
-                        s->frame->data[2][iv[1]] = codebook->v;
-                    }
-
-                    codebook = &strip->v4_codebook[*data++];
-                    s->frame->data[0][iy[2] + 2] = codebook->y0;
-                    s->frame->data[0][iy[2] + 3] = codebook->y1;
-                    s->frame->data[0][iy[3] + 2] = codebook->y2;
-                    s->frame->data[0][iy[3] + 3] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[1] + 1] = codebook->u;
-                        s->frame->data[2][iv[1] + 1] = codebook->v;
+                    cb0 = strip->v4_codebook[*data++];
+                    cb1 = strip->v4_codebook[*data++];
+                    cb2 = strip->v4_codebook[*data++];
+                    cb3 = strip->v4_codebook[*data++];
+                    if (s->palette_video) {
+                        uint8_t *p;
+                        p = ip3;
+                        *p++ = cb2[6];
+                        *p++ = cb2[9];
+                        *p++ = cb3[6];
+                        *p   = cb3[9];
+                        p = ip2;
+                        *p++ = cb2[0];
+                        *p++ = cb2[3];
+                        *p++ = cb3[0];
+                        *p   = cb3[3];
+                        p = ip1;
+                        *p++ = cb0[6];
+                        *p++ = cb0[9];
+                        *p++ = cb1[6];
+                        *p   = cb1[9];
+                        p = ip0;
+                        *p++ = cb0[0];
+                        *p++ = cb0[3];
+                        *p++ = cb1[0];
+                        *p   = cb1[3];
+                    } else {
+                        memcpy(ip3 + 0, cb2 + 6, 6);
+                        memcpy(ip3 + 6, cb3 + 6, 6);
+                        memcpy(ip2 + 0, cb2 + 0, 6);
+                        memcpy(ip2 + 6, cb3 + 0, 6);
+                        memcpy(ip1 + 0, cb0 + 6, 6);
+                        memcpy(ip1 + 6, cb1 + 6, 6);
+                        memcpy(ip0 + 0, cb0 + 0, 6);
+                        memcpy(ip0 + 6, cb1 + 0, 6);
                     }
 
                 }
             }
 
-            iy[0] += 4;  iy[1] += 4;
-            iy[2] += 4;  iy[3] += 4;
-            iu[0] += 2;  iu[1] += 2;
-            iv[0] += 2;  iv[1] += 2;
+            if (s->palette_video) {
+                ip0 += 4;  ip1 += 4;
+                ip2 += 4;  ip3 += 4;
+            } else {
+                ip0 += 12;  ip1 += 12;
+                ip2 += 12;  ip3 += 12;
+            }
         }
     }
 
@@ -362,15 +360,23 @@ static int cinepak_decode (CinepakContext *s)
 
     num_strips = FFMIN(num_strips, MAX_STRIPS);
 
+    s->frame->key_frame = 0;
+
     for (i=0; i < num_strips; i++) {
         if ((s->data + 12) > eod)
             return AVERROR_INVALIDDATA;
 
         s->strips[i].id = s->data[0];
-        s->strips[i].y1 = y0;
-        s->strips[i].x1 = 0;
-        s->strips[i].y2 = y0 + AV_RB16 (&s->data[8]);
-        s->strips[i].x2 = s->avctx->width;
+/* zero y1 means "relative to the previous stripe" */
+        if (!(s->strips[i].y1 = AV_RB16 (&s->data[4])))
+            s->strips[i].y2 = (s->strips[i].y1 = y0) + AV_RB16 (&s->data[8]);
+        else
+            s->strips[i].y2 = AV_RB16 (&s->data[8]);
+        s->strips[i].x1 = AV_RB16 (&s->data[6]);
+        s->strips[i].x2 = AV_RB16 (&s->data[10]);
+
+        if (s->strips[i].id == 0x10)
+            s->frame->key_frame = 1;
 
         strip_size = AV_RB24 (&s->data[1]) - 12;
         if (strip_size < 0)
@@ -403,12 +409,13 @@ static av_cold int cinepak_decode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     s->width = (avctx->width + 3) & ~3;
     s->height = (avctx->height + 3) & ~3;
+
     s->sega_film_skip_bytes = -1;  /* uninitialized state */
 
     // check for paletted data
     if (avctx->bits_per_coded_sample != 8) {
         s->palette_video = 0;
-        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        avctx->pix_fmt = AV_PIX_FMT_RGB24;
     } else {
         s->palette_video = 1;
         avctx->pix_fmt = AV_PIX_FMT_PAL8;
@@ -432,10 +439,8 @@ static int cinepak_decode_frame(AVCodecContext *avctx,
     s->data = buf;
     s->size = buf_size;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame))) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     if (s->palette_video) {
         const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
@@ -445,7 +450,9 @@ static int cinepak_decode_frame(AVCodecContext *avctx,
         }
     }
 
-    cinepak_decode(s);
+    if ((ret = cinepak_decode(s)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "cinepak_decode failed\n");
+    }
 
     if (s->palette_video)
         memcpy (s->frame->data[1], s->pal, AVPALETTE_SIZE);
diff --git a/libavcodec/cinepakenc.c b/libavcodec/cinepakenc.c
new file mode 100644
index 0000000000..0f0b8cde7a
--- /dev/null
+++ b/libavcodec/cinepakenc.c
@@ -0,0 +1,1335 @@
+/*
+ * Cinepak encoder (c) 2011 Tomas H�rdin
+ * http://titan.codemill.se/~tomhar/cinepakenc.patch
+ *
+ * Fixes and improvements, vintage decoders compatibility
+ *  (c) 2013, 2014 Rl, Aetey Global Technologies AB
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+ * TODO:
+ * - optimize: color space conversion, ...
+ * - implement options to set the min/max number of strips?
+ * MAYBE:
+ * - "optimally" split the frame into several non-regular areas
+ *   using a separate codebook pair for each area and approximating
+ *   the area by several rectangular strips (generally not full width ones)
+ *   (use quadtree splitting? a simple fixed-granularity grid?)
+ *
+ *
+ * version 2014-01-23 Rl
+ * - added option handling for flexibility
+ *
+ * version 2014-01-21 Rl
+ * - believe it or not, now we get even smaller files, with better quality
+ *   (which means I missed an optimization earlier :)
+ *
+ * version 2014-01-20 Rl
+ * - made the encoder compatible with vintage decoders
+ *   and added some yet unused code for possible future
+ *   incremental codebook updates
+ * - fixed a small memory leak
+ *
+ * version 2013-04-28 Rl
+ * - bugfixed codebook optimization logic
+ *
+ * version 2013-02-14 Rl
+ * "Valentine's Day" version:
+ * - made strip division more robust
+ * - minimized bruteforcing the number of strips,
+ *   (costs some R/D but speeds up compession a lot), the heuristic
+ *   assumption is that score as a function of the number of strips has
+ *   one wide minimum which moves slowly, of course not fully true
+ * - simplified codebook generation,
+ *   the old code was meant for other optimizations than we actually do
+ * - optimized the codebook generation / error estimation for MODE_MC
+ *
+ * version 2013-02-12 Rl
+ * - separated codebook training sets, avoided the transfer of wasted bytes,
+ *   which yields both better quality and smaller files
+ * - now using the correct colorspace (TODO: move conversion to libswscale)
+ *
+ * version 2013-02-08 Rl
+ * - fixes/optimization in multistrip encoding and codebook size choice,
+ *   quality/bitrate is now better than that of the binary proprietary encoder
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "libavutil/lfg.h"
+#include "elbg.h"
+#include "internal.h"
+
+#include "libavutil/avassert.h"
+#include "libavutil/opt.h"
+
+#define CVID_HEADER_SIZE 10
+#define STRIP_HEADER_SIZE 12
+#define CHUNK_HEADER_SIZE 4
+
+#define MB_SIZE 4           //4x4 MBs
+#define MB_AREA (MB_SIZE*MB_SIZE)
+
+#define VECTOR_MAX 6        //six or four entries per vector depending on format
+#define CODEBOOK_MAX 256    //size of a codebook
+
+#define MAX_STRIPS  32      //Note: having fewer choices regarding the number of strips speeds up encoding (obviously)
+#define MIN_STRIPS  1       //Note: having more strips speeds up encoding the frame (this is less obvious)
+// MAX_STRIPS limits the maximum quality you can reach
+//            when you want hight quality on high resolutions,
+// MIN_STRIPS limits the minimum efficiently encodable bit rate
+//            on low resolutions
+// the numbers are only used for brute force optimization for the first frame,
+// for the following frames they are adaptively readjusted
+// NOTE the decoder in ffmpeg has its own arbitrary limitation on the number
+// of strips, currently 32
+
+typedef enum {
+    MODE_V1_ONLY = 0,
+    MODE_V1_V4,
+    MODE_MC,
+
+    MODE_COUNT,
+} CinepakMode;
+
+typedef enum {
+    ENC_V1,
+    ENC_V4,
+    ENC_SKIP,
+
+    ENC_UNCERTAIN
+} mb_encoding;
+
+typedef struct {
+    int v1_vector;                  //index into v1 codebook
+    int v1_error;                   //error when using V1 encoding
+    int v4_vector[4];               //indices into v4 codebooks
+    int v4_error;                   //error when using V4 encoding
+    int skip_error;                 //error when block is skipped (aka copied from last frame)
+    mb_encoding best_encoding;      //last result from calculate_mode_score()
+} mb_info;
+
+typedef struct {
+    int v1_codebook[CODEBOOK_MAX*VECTOR_MAX];
+    int v4_codebook[CODEBOOK_MAX*VECTOR_MAX];
+    int v1_size;
+    int v4_size;
+    CinepakMode mode;
+} strip_info;
+
+typedef struct {
+    const AVClass *class;
+    AVCodecContext *avctx;
+    unsigned char *pict_bufs[4], *strip_buf, *frame_buf;
+    AVFrame *last_frame;
+    AVFrame *best_frame;
+    AVFrame *scratch_frame;
+    AVFrame *input_frame;
+    enum AVPixelFormat pix_fmt;
+    int w, h;
+    int frame_buf_size;
+    int curframe, keyint;
+    AVLFG randctx;
+    uint64_t lambda;
+    int *codebook_input;
+    int *codebook_closest;
+    mb_info *mb;                                //MB RD state
+    int min_strips;          //the current limit
+    int max_strips;          //the current limit
+#ifdef CINEPAKENC_DEBUG
+    mb_info *best_mb;                           //TODO: remove. only used for printing stats
+    int num_v1_mode, num_v4_mode, num_mc_mode;
+    int num_v1_encs, num_v4_encs, num_skips;
+#endif
+// options
+    int max_extra_cb_iterations;
+    int skip_empty_cb;
+    int min_min_strips;
+    int max_max_strips;
+    int strip_number_delta_range;
+} CinepakEncContext;
+
+#define OFFSET(x) offsetof(CinepakEncContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "max_extra_cb_iterations", "Max extra codebook recalculation passes, more is better and slower", OFFSET(max_extra_cb_iterations), AV_OPT_TYPE_INT, { .i64 = 2 }, 0, INT_MAX, VE },
+    { "skip_empty_cb", "Avoid wasting bytes, ignore vintage MacOS decoder", OFFSET(skip_empty_cb), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "max_strips", "Limit strips/frame, vintage compatible is 1..3, otherwise the more the better", OFFSET(max_max_strips), AV_OPT_TYPE_INT, { .i64 = 3 }, MIN_STRIPS, MAX_STRIPS, VE },
+    { "min_strips", "Enforce min strips/frame, more is worse and faster, must be <= max_strips", OFFSET(min_min_strips), AV_OPT_TYPE_INT, { .i64 = MIN_STRIPS }, MIN_STRIPS, MAX_STRIPS, VE },
+    { "strip_number_adaptivity", "How fast the strip number adapts, more is slightly better, much slower", OFFSET(strip_number_delta_range), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, MAX_STRIPS-MIN_STRIPS, VE },
+    { NULL },
+};
+
+static const AVClass cinepak_class = {
+    .class_name = "cinepak",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static av_cold int cinepak_encode_init(AVCodecContext *avctx)
+{
+    CinepakEncContext *s = avctx->priv_data;
+    int x, mb_count, strip_buf_size, frame_buf_size;
+
+    if (avctx->width & 3 || avctx->height & 3) {
+        av_log(avctx, AV_LOG_ERROR, "width and height must be multiples of four (got %ix%i)\n",
+                avctx->width, avctx->height);
+        return AVERROR(EINVAL);
+    }
+
+    if (s->min_min_strips > s->max_max_strips) {
+        av_log(avctx, AV_LOG_ERROR, "minimal number of strips can not exceed maximal (got %i and %i)\n",
+                s->min_min_strips, s->max_max_strips);
+        return AVERROR(EINVAL);
+    }
+
+    if (!(s->last_frame = av_frame_alloc()))
+        return AVERROR(ENOMEM);
+    if (!(s->best_frame = av_frame_alloc()))
+        goto enomem;
+    if (!(s->scratch_frame = av_frame_alloc()))
+        goto enomem;
+    if (avctx->pix_fmt == AV_PIX_FMT_RGB24)
+        if (!(s->input_frame = av_frame_alloc()))
+            goto enomem;
+
+    if (!(s->codebook_input = av_malloc(sizeof(int) * (avctx->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4) * (avctx->width * avctx->height) >> 2)))
+        goto enomem;
+
+    if (!(s->codebook_closest = av_malloc(sizeof(int) * (avctx->width * avctx->height) >> 2)))
+        goto enomem;
+
+    for(x = 0; x < (avctx->pix_fmt == AV_PIX_FMT_RGB24 ? 4 : 3); x++)
+        if(!(s->pict_bufs[x] = av_malloc((avctx->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4) * (avctx->width * avctx->height) >> 2)))
+            goto enomem;
+
+    mb_count = avctx->width * avctx->height / MB_AREA;
+
+    //the largest possible chunk is 0x31 with all MBs encoded in V4 mode
+    //and full codebooks being replaced in INTER mode,
+    // which is 34 bits per MB
+    //and 2*256 extra flag bits per strip
+    strip_buf_size = STRIP_HEADER_SIZE + 3 * CHUNK_HEADER_SIZE + 2 * VECTOR_MAX * CODEBOOK_MAX + 4 * (mb_count + (mb_count + 15) / 16) + (2 * CODEBOOK_MAX)/8;
+
+    frame_buf_size = CVID_HEADER_SIZE + s->max_max_strips * strip_buf_size;
+
+    if (!(s->strip_buf = av_malloc(strip_buf_size)))
+        goto enomem;
+
+    if (!(s->frame_buf = av_malloc(frame_buf_size)))
+        goto enomem;
+
+    if (!(s->mb = av_malloc_array(mb_count, sizeof(mb_info))))
+        goto enomem;
+
+#ifdef CINEPAKENC_DEBUG
+    if (!(s->best_mb = av_malloc_array(mb_count, sizeof(mb_info))))
+        goto enomem;
+#endif
+
+    av_lfg_init(&s->randctx, 1);
+    s->avctx = avctx;
+    s->w = avctx->width;
+    s->h = avctx->height;
+    s->frame_buf_size = frame_buf_size;
+    s->curframe = 0;
+    s->keyint = avctx->keyint_min;
+    s->pix_fmt = avctx->pix_fmt;
+
+    //set up AVFrames
+    s->last_frame->data[0]        = s->pict_bufs[0];
+    s->last_frame->linesize[0]    = s->w;
+    s->best_frame->data[0]        = s->pict_bufs[1];
+    s->best_frame->linesize[0]    = s->w;
+    s->scratch_frame->data[0]     = s->pict_bufs[2];
+    s->scratch_frame->linesize[0] = s->w;
+
+    if (s->pix_fmt == AV_PIX_FMT_RGB24) {
+        s->last_frame->data[1]        = s->last_frame->data[0] + s->w * s->h;
+        s->last_frame->data[2]        = s->last_frame->data[1] + ((s->w * s->h) >> 2);
+        s->last_frame->linesize[1]    = s->last_frame->linesize[2] = s->w >> 1;
+
+        s->best_frame->data[1]        = s->best_frame->data[0] + s->w * s->h;
+        s->best_frame->data[2]        = s->best_frame->data[1] + ((s->w * s->h) >> 2);
+        s->best_frame->linesize[1]    = s->best_frame->linesize[2] = s->w >> 1;
+
+        s->scratch_frame->data[1]     = s->scratch_frame->data[0] + s->w * s->h;
+        s->scratch_frame->data[2]     = s->scratch_frame->data[1] + ((s->w * s->h) >> 2);
+        s->scratch_frame->linesize[1] = s->scratch_frame->linesize[2] = s->w >> 1;
+
+        s->input_frame->data[0]       = s->pict_bufs[3];
+        s->input_frame->linesize[0]   = s->w;
+        s->input_frame->data[1]       = s->input_frame->data[0] + s->w * s->h;
+        s->input_frame->data[2]       = s->input_frame->data[1] + ((s->w * s->h) >> 2);
+        s->input_frame->linesize[1]   = s->input_frame->linesize[2] = s->w >> 1;
+    }
+
+    s->min_strips = s->min_min_strips;
+    s->max_strips = s->max_max_strips;
+
+#ifdef CINEPAKENC_DEBUG
+    s->num_v1_mode = s->num_v4_mode = s->num_mc_mode = s->num_v1_encs = s->num_v4_encs = s->num_skips = 0;
+#endif
+
+    return 0;
+
+enomem:
+    av_frame_free(&s->last_frame);
+    av_frame_free(&s->best_frame);
+    av_frame_free(&s->scratch_frame);
+    if (avctx->pix_fmt == AV_PIX_FMT_RGB24)
+        av_frame_free(&s->input_frame);
+    av_freep(&s->codebook_input);
+    av_freep(&s->codebook_closest);
+    av_freep(&s->strip_buf);
+    av_freep(&s->frame_buf);
+    av_freep(&s->mb);
+#ifdef CINEPAKENC_DEBUG
+    av_freep(&s->best_mb);
+#endif
+
+    for(x = 0; x < (avctx->pix_fmt == AV_PIX_FMT_RGB24 ? 4 : 3); x++)
+        av_freep(&s->pict_bufs[x]);
+
+    return AVERROR(ENOMEM);
+}
+
+static int64_t calculate_mode_score(CinepakEncContext *s, int h, strip_info *info, int report, int *training_set_v1_shrunk, int *training_set_v4_shrunk
+#ifdef CINEPAK_REPORT_SERR
+, int64_t *serr
+#endif
+)
+{
+    //score = FF_LAMBDA_SCALE * error + lambda * bits
+    int x;
+    int entry_size = s->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4;
+    int mb_count = s->w * h / MB_AREA;
+    mb_info *mb;
+    int64_t score1, score2, score3;
+    int64_t ret = s->lambda * ((info->v1_size ? CHUNK_HEADER_SIZE + info->v1_size * entry_size : 0) +
+                   (info->v4_size ? CHUNK_HEADER_SIZE + info->v4_size * entry_size : 0) +
+                   CHUNK_HEADER_SIZE) << 3;
+
+    //av_log(s->avctx, AV_LOG_INFO, "sizes %3i %3i -> %9"PRId64" score mb_count %i", info->v1_size, info->v4_size, ret, mb_count);
+
+#ifdef CINEPAK_REPORT_SERR
+    *serr = 0;
+#endif
+
+    switch(info->mode) {
+    case MODE_V1_ONLY:
+        //one byte per MB
+        ret += s->lambda * 8 * mb_count;
+
+// while calculating we assume all blocks are ENC_V1
+        for(x = 0; x < mb_count; x++) {
+            mb = &s->mb[x];
+            ret += FF_LAMBDA_SCALE * mb->v1_error;
+#ifdef CINEPAK_REPORT_SERR
+            *serr += mb->v1_error;
+#endif
+// this function is never called for report in MODE_V1_ONLY
+//            if(!report)
+            mb->best_encoding = ENC_V1;
+        }
+
+        break;
+    case MODE_V1_V4:
+        //9 or 33 bits per MB
+        if(report) {
+// no moves between the corresponding training sets are allowed
+            *training_set_v1_shrunk = *training_set_v4_shrunk = 0;
+            for(x = 0; x < mb_count; x++) {
+                int mberr;
+                mb = &s->mb[x];
+                if(mb->best_encoding == ENC_V1)
+                    score1 = s->lambda * 9  + FF_LAMBDA_SCALE * (mberr=mb->v1_error);
+                else
+                    score1 = s->lambda * 33 + FF_LAMBDA_SCALE * (mberr=mb->v4_error);
+                ret += score1;
+#ifdef CINEPAK_REPORT_SERR
+                *serr += mberr;
+#endif
+            }
+        } else { // find best mode per block
+            for(x = 0; x < mb_count; x++) {
+                mb = &s->mb[x];
+                score1 = s->lambda * 9  + FF_LAMBDA_SCALE * mb->v1_error;
+                score2 = s->lambda * 33 + FF_LAMBDA_SCALE * mb->v4_error;
+
+                if(score1 <= score2) {
+                    ret += score1;
+#ifdef CINEPAK_REPORT_SERR
+                    *serr += mb->v1_error;
+#endif
+                    mb->best_encoding = ENC_V1;
+                } else {
+                    ret += score2;
+#ifdef CINEPAK_REPORT_SERR
+                    *serr += mb->v4_error;
+#endif
+                    mb->best_encoding = ENC_V4;
+                }
+            }
+        }
+
+        break;
+    case MODE_MC:
+        //1, 10 or 34 bits per MB
+        if(report) {
+            int v1_shrunk = 0, v4_shrunk = 0;
+            for(x = 0; x < mb_count; x++) {
+                mb = &s->mb[x];
+// it is OK to move blocks to ENC_SKIP here
+// but not to any codebook encoding!
+                score1 = s->lambda * 1  + FF_LAMBDA_SCALE * mb->skip_error;
+                if(mb->best_encoding == ENC_SKIP) {
+                    ret += score1;
+#ifdef CINEPAK_REPORT_SERR
+                    *serr += mb->skip_error;
+#endif
+                } else if(mb->best_encoding == ENC_V1) {
+                    if((score2=s->lambda * 10 + FF_LAMBDA_SCALE * mb->v1_error) >= score1) {
+                        mb->best_encoding = ENC_SKIP;
+                        ++v1_shrunk;
+                        ret += score1;
+#ifdef CINEPAK_REPORT_SERR
+                        *serr += mb->skip_error;
+#endif
+                    } else {
+                        ret += score2;
+#ifdef CINEPAK_REPORT_SERR
+                        *serr += mb->v1_error;
+#endif
+                    }
+                } else {
+                    if((score3=s->lambda * 34 + FF_LAMBDA_SCALE * mb->v4_error) >= score1) {
+                        mb->best_encoding = ENC_SKIP;
+                        ++v4_shrunk;
+                        ret += score1;
+#ifdef CINEPAK_REPORT_SERR
+                        *serr += mb->skip_error;
+#endif
+                    } else {
+                        ret += score3;
+#ifdef CINEPAK_REPORT_SERR
+                        *serr += mb->v4_error;
+#endif
+                    }
+                }
+            }
+            *training_set_v1_shrunk = v1_shrunk;
+            *training_set_v4_shrunk = v4_shrunk;
+        } else { // find best mode per block
+            for(x = 0; x < mb_count; x++) {
+                mb = &s->mb[x];
+                score1 = s->lambda * 1  + FF_LAMBDA_SCALE * mb->skip_error;
+                score2 = s->lambda * 10 + FF_LAMBDA_SCALE * mb->v1_error;
+                score3 = s->lambda * 34 + FF_LAMBDA_SCALE * mb->v4_error;
+
+                if(score1 <= score2 && score1 <= score3) {
+                    ret += score1;
+#ifdef CINEPAK_REPORT_SERR
+                    *serr += mb->skip_error;
+#endif
+                    mb->best_encoding = ENC_SKIP;
+                } else if(score2 <= score3) {
+                    ret += score2;
+#ifdef CINEPAK_REPORT_SERR
+                    *serr += mb->v1_error;
+#endif
+                    mb->best_encoding = ENC_V1;
+                } else {
+                    ret += score3;
+#ifdef CINEPAK_REPORT_SERR
+                    *serr += mb->v4_error;
+#endif
+                    mb->best_encoding = ENC_V4;
+                }
+            }
+        }
+
+        break;
+    }
+
+    return ret;
+}
+
+static int write_chunk_header(unsigned char *buf, int chunk_type, int chunk_size)
+{
+    buf[0] = chunk_type;
+    AV_WB24(&buf[1], chunk_size + CHUNK_HEADER_SIZE);
+    return CHUNK_HEADER_SIZE;
+}
+
+static int encode_codebook(CinepakEncContext *s, int *codebook, int size, int chunk_type_yuv, int chunk_type_gray, unsigned char *buf)
+{
+    int x, y, ret, entry_size = s->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4;
+    int incremental_codebook_replacement_mode = 0; // hardcoded here,
+                // the compiler should notice that this is a constant -- rl
+
+    ret = write_chunk_header(buf,
+          s->pix_fmt == AV_PIX_FMT_RGB24 ?
+           chunk_type_yuv+(incremental_codebook_replacement_mode?1:0) :
+           chunk_type_gray+(incremental_codebook_replacement_mode?1:0),
+          entry_size * size
+           + (incremental_codebook_replacement_mode?(size+31)/32*4:0) );
+
+// we do codebook encoding according to the "intra" mode
+// but we keep the "dead" code for reference in case we will want
+// to use incremental codebook updates (which actually would give us
+// "kind of" motion compensation, especially in 1 strip/frame case) -- rl
+// (of course, the code will be not useful as-is)
+    if(incremental_codebook_replacement_mode) {
+        int flags = 0;
+        int flagsind;
+        for(x = 0; x < size; x++) {
+            if(flags == 0) {
+                flagsind = ret;
+                ret += 4;
+                flags = 0x80000000;
+            } else
+                flags = ((flags>>1) | 0x80000000);
+            for(y = 0; y < entry_size; y++)
+                buf[ret++] = codebook[y + x*entry_size] ^ (y >= 4 ? 0x80 : 0);
+            if((flags&0xffffffff) == 0xffffffff) {
+                AV_WB32(&buf[flagsind], flags);
+                flags = 0;
+            }
+        }
+        if(flags)
+            AV_WB32(&buf[flagsind], flags);
+    } else
+        for(x = 0; x < size; x++)
+            for(y = 0; y < entry_size; y++)
+                buf[ret++] = codebook[y + x*entry_size] ^ (y >= 4 ? 0x80 : 0);
+
+    return ret;
+}
+
+//sets out to the sub picture starting at (x,y) in in
+static void get_sub_picture(CinepakEncContext *s, int x, int y, AVPicture *in, AVPicture *out)
+{
+    out->data[0] = in->data[0] + x + y * in->linesize[0];
+    out->linesize[0] = in->linesize[0];
+
+    if(s->pix_fmt == AV_PIX_FMT_RGB24) {
+        out->data[1] = in->data[1] + (x >> 1) + (y >> 1) * in->linesize[1];
+        out->linesize[1] = in->linesize[1];
+
+        out->data[2] = in->data[2] + (x >> 1) + (y >> 1) * in->linesize[2];
+        out->linesize[2] = in->linesize[2];
+    }
+}
+
+//decodes the V1 vector in mb into the 4x4 MB pointed to by sub_pict
+static void decode_v1_vector(CinepakEncContext *s, AVPicture *sub_pict, int v1_vector, strip_info *info)
+{
+    int entry_size = s->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4;
+
+    sub_pict->data[0][0] =
+            sub_pict->data[0][1] =
+            sub_pict->data[0][    sub_pict->linesize[0]] =
+            sub_pict->data[0][1+  sub_pict->linesize[0]] = info->v1_codebook[v1_vector*entry_size];
+
+    sub_pict->data[0][2] =
+            sub_pict->data[0][3] =
+            sub_pict->data[0][2+  sub_pict->linesize[0]] =
+            sub_pict->data[0][3+  sub_pict->linesize[0]] = info->v1_codebook[v1_vector*entry_size+1];
+
+    sub_pict->data[0][2*sub_pict->linesize[0]] =
+            sub_pict->data[0][1+2*sub_pict->linesize[0]] =
+            sub_pict->data[0][  3*sub_pict->linesize[0]] =
+            sub_pict->data[0][1+3*sub_pict->linesize[0]] = info->v1_codebook[v1_vector*entry_size+2];
+
+    sub_pict->data[0][2+2*sub_pict->linesize[0]] =
+            sub_pict->data[0][3+2*sub_pict->linesize[0]] =
+            sub_pict->data[0][2+3*sub_pict->linesize[0]] =
+            sub_pict->data[0][3+3*sub_pict->linesize[0]] = info->v1_codebook[v1_vector*entry_size+3];
+
+    if(s->pix_fmt == AV_PIX_FMT_RGB24) {
+        sub_pict->data[1][0] =
+            sub_pict->data[1][1] =
+            sub_pict->data[1][    sub_pict->linesize[1]] =
+            sub_pict->data[1][1+  sub_pict->linesize[1]] = info->v1_codebook[v1_vector*entry_size+4];
+
+        sub_pict->data[2][0] =
+            sub_pict->data[2][1] =
+            sub_pict->data[2][    sub_pict->linesize[2]] =
+            sub_pict->data[2][1+  sub_pict->linesize[2]] = info->v1_codebook[v1_vector*entry_size+5];
+    }
+}
+
+//decodes the V4 vectors in mb into the 4x4 MB pointed to by sub_pict
+static void decode_v4_vector(CinepakEncContext *s, AVPicture *sub_pict, int *v4_vector, strip_info *info)
+{
+    int i, x, y, entry_size = s->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4;
+
+    for(i = y = 0; y < 4; y += 2) {
+        for(x = 0; x < 4; x += 2, i++) {
+            sub_pict->data[0][x   +     y*sub_pict->linesize[0]] = info->v4_codebook[v4_vector[i]*entry_size];
+            sub_pict->data[0][x+1 +     y*sub_pict->linesize[0]] = info->v4_codebook[v4_vector[i]*entry_size+1];
+            sub_pict->data[0][x   + (y+1)*sub_pict->linesize[0]] = info->v4_codebook[v4_vector[i]*entry_size+2];
+            sub_pict->data[0][x+1 + (y+1)*sub_pict->linesize[0]] = info->v4_codebook[v4_vector[i]*entry_size+3];
+
+            if(s->pix_fmt == AV_PIX_FMT_RGB24) {
+                sub_pict->data[1][(x>>1) + (y>>1)*sub_pict->linesize[1]] = info->v4_codebook[v4_vector[i]*entry_size+4];
+                sub_pict->data[2][(x>>1) + (y>>1)*sub_pict->linesize[2]] = info->v4_codebook[v4_vector[i]*entry_size+5];
+            }
+        }
+    }
+}
+
+static void copy_mb(CinepakEncContext *s, AVPicture *a, AVPicture *b)
+{
+    int y, p;
+
+    for(y = 0; y < MB_SIZE; y++) {
+        memcpy(a->data[0]+y*a->linesize[0], b->data[0]+y*b->linesize[0],
+               MB_SIZE);
+    }
+
+    if(s->pix_fmt == AV_PIX_FMT_RGB24) {
+        for(p = 1; p <= 2; p++) {
+            for(y = 0; y < MB_SIZE/2; y++) {
+                memcpy(a->data[p] + y*a->linesize[p],
+                       b->data[p] + y*b->linesize[p],
+                       MB_SIZE/2);
+            }
+        }
+    }
+}
+
+static int encode_mode(CinepakEncContext *s, int h, AVPicture *scratch_pict, AVPicture *last_pict, strip_info *info, unsigned char *buf)
+{
+    int x, y, z, flags, bits, temp_size, header_ofs, ret = 0, mb_count = s->w * h / MB_AREA;
+    int needs_extra_bit, should_write_temp;
+    unsigned char temp[64]; //32/2 = 16 V4 blocks at 4 B each -> 64 B
+    mb_info *mb;
+    AVPicture sub_scratch = {{0}}, sub_last = {{0}};
+
+    //encode codebooks
+////// MacOS vintage decoder compatibility dictates the presence of
+////// the codebook chunk even when the codebook is empty - pretty dumb...
+////// and also the certain order of the codebook chunks -- rl
+    if(info->v4_size || !s->skip_empty_cb)
+        ret += encode_codebook(s, info->v4_codebook, info->v4_size, 0x20, 0x24, buf + ret);
+
+    if(info->v1_size || !s->skip_empty_cb)
+        ret += encode_codebook(s, info->v1_codebook, info->v1_size, 0x22, 0x26, buf + ret);
+
+    //update scratch picture
+    for(z = y = 0; y < h; y += MB_SIZE) {
+        for(x = 0; x < s->w; x += MB_SIZE, z++) {
+            mb = &s->mb[z];
+
+            get_sub_picture(s, x, y, scratch_pict, &sub_scratch);
+
+            if(info->mode == MODE_MC && mb->best_encoding == ENC_SKIP) {
+                get_sub_picture(s, x, y, last_pict, &sub_last);
+                copy_mb(s, &sub_scratch, &sub_last);
+            } else if(info->mode == MODE_V1_ONLY || mb->best_encoding == ENC_V1)
+                decode_v1_vector(s, &sub_scratch, mb->v1_vector, info);
+            else
+                decode_v4_vector(s, &sub_scratch, mb->v4_vector, info);
+        }
+    }
+
+    switch(info->mode) {
+    case MODE_V1_ONLY:
+        //av_log(s->avctx, AV_LOG_INFO, "mb_count = %i\n", mb_count);
+        ret += write_chunk_header(buf + ret, 0x32, mb_count);
+
+        for(x = 0; x < mb_count; x++)
+            buf[ret++] = s->mb[x].v1_vector;
+
+        break;
+    case MODE_V1_V4:
+        //remember header position
+        header_ofs = ret;
+        ret += CHUNK_HEADER_SIZE;
+
+        for(x = 0; x < mb_count; x += 32) {
+            flags = 0;
+            for(y = x; y < FFMIN(x+32, mb_count); y++)
+                if(s->mb[y].best_encoding == ENC_V4)
+                    flags |= 1 << (31 - y + x);
+
+            AV_WB32(&buf[ret], flags);
+            ret += 4;
+
+            for(y = x; y < FFMIN(x+32, mb_count); y++) {
+                mb = &s->mb[y];
+
+                if(mb->best_encoding == ENC_V1)
+                    buf[ret++] = mb->v1_vector;
+                else
+                    for(z = 0; z < 4; z++)
+                        buf[ret++] = mb->v4_vector[z];
+            }
+        }
+
+        write_chunk_header(buf + header_ofs, 0x30, ret - header_ofs - CHUNK_HEADER_SIZE);
+
+        break;
+    case MODE_MC:
+        //remember header position
+        header_ofs = ret;
+        ret += CHUNK_HEADER_SIZE;
+        flags = bits = temp_size = 0;
+
+        for(x = 0; x < mb_count; x++) {
+            mb = &s->mb[x];
+            flags |= (mb->best_encoding != ENC_SKIP) << (31 - bits++);
+            needs_extra_bit = 0;
+            should_write_temp = 0;
+
+            if(mb->best_encoding != ENC_SKIP) {
+                if(bits < 32)
+                    flags |= (mb->best_encoding == ENC_V4) << (31 - bits++);
+                else
+                    needs_extra_bit = 1;
+            }
+
+            if(bits == 32) {
+                AV_WB32(&buf[ret], flags);
+                ret += 4;
+                flags = bits = 0;
+
+                if(mb->best_encoding == ENC_SKIP || needs_extra_bit) {
+                    memcpy(&buf[ret], temp, temp_size);
+                    ret += temp_size;
+                    temp_size = 0;
+                } else
+                    should_write_temp = 1;
+            }
+
+            if(needs_extra_bit) {
+                flags = (mb->best_encoding == ENC_V4) << 31;
+                bits = 1;
+            }
+
+            if(mb->best_encoding == ENC_V1)
+                temp[temp_size++] = mb->v1_vector;
+            else if(mb->best_encoding == ENC_V4)
+                for(z = 0; z < 4; z++)
+                    temp[temp_size++] = mb->v4_vector[z];
+
+            if(should_write_temp) {
+                memcpy(&buf[ret], temp, temp_size);
+                ret += temp_size;
+                temp_size = 0;
+            }
+        }
+
+        if(bits > 0) {
+            AV_WB32(&buf[ret], flags);
+            ret += 4;
+            memcpy(&buf[ret], temp, temp_size);
+            ret += temp_size;
+        }
+
+        write_chunk_header(buf + header_ofs, 0x31, ret - header_ofs - CHUNK_HEADER_SIZE);
+
+        break;
+    }
+
+    return ret;
+}
+
+//computes distortion of 4x4 MB in b compared to a
+static int compute_mb_distortion(CinepakEncContext *s, AVPicture *a, AVPicture *b)
+{
+    int x, y, p, d, ret = 0;
+
+    for(y = 0; y < MB_SIZE; y++) {
+        for(x = 0; x < MB_SIZE; x++) {
+            d = a->data[0][x + y*a->linesize[0]] - b->data[0][x + y*b->linesize[0]];
+            ret += d*d;
+        }
+    }
+
+    if(s->pix_fmt == AV_PIX_FMT_RGB24) {
+        for(p = 1; p <= 2; p++) {
+            for(y = 0; y < MB_SIZE/2; y++) {
+                for(x = 0; x < MB_SIZE/2; x++) {
+                    d = a->data[p][x + y*a->linesize[p]] - b->data[p][x + y*b->linesize[p]];
+                    ret += d*d;
+                }
+            }
+        }
+    }
+
+    return ret;
+}
+
+// return the possibly adjusted size of the codebook
+#define CERTAIN(x) ((x)!=ENC_UNCERTAIN)
+static int quantize(CinepakEncContext *s, int h, AVPicture *pict,
+                    int v1mode, strip_info *info,
+                    mb_encoding encoding)
+{
+    int x, y, i, j, k, x2, y2, x3, y3, plane, shift, mbn;
+    int entry_size = s->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4;
+    int *codebook = v1mode ? info->v1_codebook : info->v4_codebook;
+    int size = v1mode ? info->v1_size : info->v4_size;
+    int64_t total_error = 0;
+    uint8_t vq_pict_buf[(MB_AREA*3)/2];
+    AVPicture sub_pict, vq_pict;
+
+    for(mbn = i = y = 0; y < h; y += MB_SIZE) {
+        for(x = 0; x < s->w; x += MB_SIZE, ++mbn) {
+            int *base;
+
+            if(CERTAIN(encoding)) {
+// use for the training only the blocks known to be to be encoded [sic:-]
+               if(s->mb[mbn].best_encoding != encoding) continue;
+            }
+
+            base = s->codebook_input + i*entry_size;
+            if(v1mode) {
+                //subsample
+                for(j = y2 = 0; y2 < entry_size; y2 += 2) {
+                    for(x2 = 0; x2 < 4; x2 += 2, j++) {
+                        plane = y2 < 4 ? 0 : 1 + (x2 >> 1);
+                        shift = y2 < 4 ? 0 : 1;
+                        x3 = shift ? 0 : x2;
+                        y3 = shift ? 0 : y2;
+                        base[j] = (pict->data[plane][((x+x3) >> shift) +      ((y+y3) >> shift)      * pict->linesize[plane]] +
+                                   pict->data[plane][((x+x3) >> shift) + 1 +  ((y+y3) >> shift)      * pict->linesize[plane]] +
+                                   pict->data[plane][((x+x3) >> shift) +     (((y+y3) >> shift) + 1) * pict->linesize[plane]] +
+                                   pict->data[plane][((x+x3) >> shift) + 1 + (((y+y3) >> shift) + 1) * pict->linesize[plane]]) >> 2;
+                    }
+                }
+            } else {
+                //copy
+                for(j = y2 = 0; y2 < MB_SIZE; y2 += 2) {
+                    for(x2 = 0; x2 < MB_SIZE; x2 += 2) {
+                        for(k = 0; k < entry_size; k++, j++) {
+                            plane = k >= 4 ? k - 3 : 0;
+
+                            if(k >= 4) {
+                                x3 = (x+x2) >> 1;
+                                y3 = (y+y2) >> 1;
+                            } else {
+                                x3 = x + x2 + (k & 1);
+                                y3 = y + y2 + (k >> 1);
+                            }
+
+                            base[j] = pict->data[plane][x3 + y3*pict->linesize[plane]];
+                        }
+                    }
+                }
+            }
+            i += v1mode ? 1 : 4;
+        }
+    }
+//    if(i < mbn*(v1mode ? 1 : 4)) {
+//        av_log(s->avctx, AV_LOG_INFO, "reducing training set for %s from %i to %i (encoding %i)\n", v1mode?"v1":"v4", mbn*(v1mode ? 1 : 4), i, encoding);
+//    }
+
+    if(i == 0) // empty training set, nothing to do
+        return 0;
+    if(i < size) {
+        //av_log(s->avctx, (CERTAIN(encoding) ? AV_LOG_ERROR : AV_LOG_INFO), "WOULD WASTE: %s cbsize %i bigger than training set size %i (encoding %i)\n", v1mode?"v1":"v4", size, i, encoding);
+        size = i;
+    }
+
+    avpriv_init_elbg(s->codebook_input, entry_size, i, codebook, size, 1, s->codebook_closest, &s->randctx);
+    avpriv_do_elbg(s->codebook_input, entry_size, i, codebook, size, 1, s->codebook_closest, &s->randctx);
+
+    //setup vq_pict, which contains a single MB
+    vq_pict.data[0] = vq_pict_buf;
+    vq_pict.linesize[0] = MB_SIZE;
+    vq_pict.data[1] = &vq_pict_buf[MB_AREA];
+    vq_pict.data[2] = vq_pict.data[1] + (MB_AREA >> 2);
+    vq_pict.linesize[1] = vq_pict.linesize[2] = MB_SIZE >> 1;
+
+    //copy indices
+    for(i = j = y = 0; y < h; y += MB_SIZE) {
+        for(x = 0; x < s->w; x += MB_SIZE, j++) {
+            mb_info *mb = &s->mb[j];
+// skip uninteresting blocks if we know their preferred encoding
+            if(CERTAIN(encoding) && mb->best_encoding != encoding)
+                continue;
+
+            //point sub_pict to current MB
+            get_sub_picture(s, x, y, pict, &sub_pict);
+
+            if(v1mode) {
+                mb->v1_vector = s->codebook_closest[i];
+
+                //fill in vq_pict with V1 data
+                decode_v1_vector(s, &vq_pict, mb->v1_vector, info);
+
+                mb->v1_error = compute_mb_distortion(s, &sub_pict, &vq_pict);
+                total_error += mb->v1_error;
+            } else {
+                for(k = 0; k < 4; k++)
+                    mb->v4_vector[k] = s->codebook_closest[i+k];
+
+                //fill in vq_pict with V4 data
+                decode_v4_vector(s, &vq_pict, mb->v4_vector, info);
+
+                mb->v4_error = compute_mb_distortion(s, &sub_pict, &vq_pict);
+                total_error += mb->v4_error;
+            }
+            i += v1mode ? 1 : 4;
+        }
+    }
+// check that we did it right in the beginning of the function
+    av_assert0(i >= size); // training set is no smaller than the codebook
+
+    //av_log(s->avctx, AV_LOG_INFO, "isv1 %i size= %i i= %i error %"PRId64"\n", v1mode, size, i, total_error);
+
+    return size;
+}
+
+static void calculate_skip_errors(CinepakEncContext *s, int h, AVPicture *last_pict, AVPicture *pict, strip_info *info)
+{
+    int x, y, i;
+    AVPicture sub_last, sub_pict;
+
+    for(i = y = 0; y < h; y += MB_SIZE) {
+        for(x = 0; x < s->w; x += MB_SIZE, i++) {
+            get_sub_picture(s, x, y, last_pict, &sub_last);
+            get_sub_picture(s, x, y, pict,      &sub_pict);
+
+            s->mb[i].skip_error = compute_mb_distortion(s, &sub_last, &sub_pict);
+        }
+    }
+}
+
+static void write_strip_header(CinepakEncContext *s, int y, int h, int keyframe, unsigned char *buf, int strip_size)
+{
+// actually we are exclusively using intra strip coding (how much can we win
+// otherwise? how to choose which part of a codebook to update?),
+// keyframes are different only because we disallow ENC_SKIP on them -- rl
+// (besides, the logic here used to be inverted: )
+//    buf[0] = keyframe ? 0x11: 0x10;
+    buf[0] = keyframe ? 0x10: 0x11;
+    AV_WB24(&buf[1], strip_size + STRIP_HEADER_SIZE);
+//    AV_WB16(&buf[4], y); /* using absolute y values works -- rl */
+    AV_WB16(&buf[4], 0); /* using relative values works as well -- rl */
+    AV_WB16(&buf[6], 0);
+//    AV_WB16(&buf[8], y+h); /* using absolute y values works -- rl */
+    AV_WB16(&buf[8], h); /* using relative values works as well -- rl */
+    AV_WB16(&buf[10], s->w);
+    //av_log(s->avctx, AV_LOG_INFO, "write_strip_header() %x keyframe=%d\n", buf[0], keyframe);
+}
+
+static int rd_strip(CinepakEncContext *s, int y, int h, int keyframe, AVPicture *last_pict, AVPicture *pict, AVPicture *scratch_pict, unsigned char *buf, int64_t *best_score
+#ifdef CINEPAK_REPORT_SERR
+, int64_t *best_serr
+#endif
+)
+{
+    int64_t score = 0;
+#ifdef CINEPAK_REPORT_SERR
+    int64_t serr;
+#endif
+    int best_size = 0;
+    strip_info info;
+// for codebook optimization:
+    int v1enough, v1_size, v4enough, v4_size;
+    int new_v1_size, new_v4_size;
+    int v1shrunk, v4shrunk;
+
+    if(!keyframe)
+        calculate_skip_errors(s, h, last_pict, pict, &info);
+
+    //try some powers of 4 for the size of the codebooks
+    //constraint the v4 codebook to be no bigger than v1 one,
+    //(and no less than v1_size/4)
+    //thus making v1 preferable and possibly losing small details? should be ok
+#define SMALLEST_CODEBOOK 1
+    for(v1enough = 0, v1_size = SMALLEST_CODEBOOK; v1_size <= CODEBOOK_MAX && !v1enough; v1_size <<= 2) {
+        for(v4enough = 0, v4_size = 0; v4_size <= v1_size && !v4enough; v4_size = v4_size ? v4_size << 2 : v1_size >= SMALLEST_CODEBOOK << 2 ? v1_size >> 2 : SMALLEST_CODEBOOK) {
+            //try all modes
+            for(CinepakMode mode = 0; mode < MODE_COUNT; mode++) {
+                //don't allow MODE_MC in intra frames
+                if(keyframe && mode == MODE_MC)
+                    continue;
+
+                if(mode == MODE_V1_ONLY) {
+                    info.v1_size = v1_size;
+// the size may shrink even before optimizations if the input is short:
+                    info.v1_size = quantize(s, h, pict, 1, &info, ENC_UNCERTAIN);
+                    if(info.v1_size < v1_size)
+// too few eligible blocks, no sense in trying bigger sizes
+                        v1enough = 1;
+
+                    info.v4_size = 0;
+                } else { // mode != MODE_V1_ONLY
+                    // if v4 codebook is empty then only allow V1-only mode
+                    if(!v4_size)
+                        continue;
+
+                    if(mode == MODE_V1_V4) {
+                        info.v4_size = v4_size;
+                        info.v4_size = quantize(s, h, pict, 0, &info, ENC_UNCERTAIN);
+                        if(info.v4_size < v4_size)
+// too few eligible blocks, no sense in trying bigger sizes
+                            v4enough = 1;
+                    }
+                }
+
+                info.mode = mode;
+// choose the best encoding per block, based on current experience
+                score = calculate_mode_score(s, h, &info, 0,
+                                             &v1shrunk, &v4shrunk
+#ifdef CINEPAK_REPORT_SERR
+, &serr
+#endif
+);
+
+                if(mode != MODE_V1_ONLY){
+                    int extra_iterations_limit = s->max_extra_cb_iterations;
+// recompute the codebooks, omitting the extra blocks
+// we assume we _may_ come here with more blocks to encode than before
+                    info.v1_size = v1_size;
+                    new_v1_size = quantize(s, h, pict, 1, &info, ENC_V1);
+                    if(new_v1_size < info.v1_size){
+                        //av_log(s->avctx, AV_LOG_INFO, "mode %i, %3i, %3i: cut v1 codebook to %i entries\n", mode, v1_size, v4_size, new_v1_size);
+                        info.v1_size = new_v1_size;
+                    }
+// we assume we _may_ come here with more blocks to encode than before
+                    info.v4_size = v4_size;
+                    new_v4_size = quantize(s, h, pict, 0, &info, ENC_V4);
+                    if(new_v4_size < info.v4_size) {
+                        //av_log(s->avctx, AV_LOG_INFO, "mode %i, %3i, %3i: cut v4 codebook to %i entries at first iteration\n", mode, v1_size, v4_size, new_v4_size);
+                        info.v4_size = new_v4_size;
+                    }
+// calculate the resulting score
+// (do not move blocks to codebook encodings now, as some blocks may have
+// got bigger errors despite a smaller training set - but we do not
+// ever grow the training sets back)
+                    for(;;) {
+                        score = calculate_mode_score(s, h, &info, 1,
+                                                     &v1shrunk, &v4shrunk
+#ifdef CINEPAK_REPORT_SERR
+, &serr
+#endif
+);
+// do we have a reason to reiterate? if so, have we reached the limit?
+                        if((!v1shrunk && !v4shrunk) || !extra_iterations_limit--) break;
+// recompute the codebooks, omitting the extra blocks
+                        if(v1shrunk) {
+                            info.v1_size = v1_size;
+                            new_v1_size = quantize(s, h, pict, 1, &info, ENC_V1);
+                            if(new_v1_size < info.v1_size){
+                                //av_log(s->avctx, AV_LOG_INFO, "mode %i, %3i, %3i: cut v1 codebook to %i entries\n", mode, v1_size, v4_size, new_v1_size);
+                                info.v1_size = new_v1_size;
+                            }
+                        }
+                        if(v4shrunk) {
+                            info.v4_size = v4_size;
+                            new_v4_size = quantize(s, h, pict, 0, &info, ENC_V4);
+                            if(new_v4_size < info.v4_size) {
+                                //av_log(s->avctx, AV_LOG_INFO, "mode %i, %3i, %3i: cut v4 codebook to %i entries\n", mode, v1_size, v4_size, new_v4_size);
+                                info.v4_size = new_v4_size;
+                            }
+                        }
+                    }
+                }
+
+                //av_log(s->avctx, AV_LOG_INFO, "%3i %3i score = %"PRId64"\n", v1_size, v4_size, score);
+
+                if(best_size == 0 || score < *best_score) {
+
+                    *best_score = score;
+#ifdef CINEPAK_REPORT_SERR
+                    *best_serr = serr;
+#endif
+                    best_size = encode_mode(s, h, scratch_pict, last_pict, &info, s->strip_buf + STRIP_HEADER_SIZE);
+
+                    //av_log(s->avctx, AV_LOG_INFO, "mode %i, %3i, %3i: %18"PRId64" %i B", mode, info.v1_size, info.v4_size, score, best_size);
+                    //av_log(s->avctx, AV_LOG_INFO, "\n");
+#ifdef CINEPAK_REPORT_SERR
+                    av_log(s->avctx, AV_LOG_INFO, "mode %i, %3i, %3i: %18"PRId64" %i B\n", mode, v1_size, v4_size, serr, best_size);
+#endif
+
+#ifdef CINEPAKENC_DEBUG
+                    //save MB encoding choices
+                    memcpy(s->best_mb, s->mb, mb_count*sizeof(mb_info));
+#endif
+
+                    //memcpy(strip_temp + STRIP_HEADER_SIZE, strip_temp, best_size);
+                    write_strip_header(s, y, h, keyframe, s->strip_buf, best_size);
+
+                }
+            }
+        }
+    }
+
+#ifdef CINEPAKENC_DEBUG
+    //gather stats. this will only work properly of MAX_STRIPS == 1
+    if(best_info.mode == MODE_V1_ONLY) {
+        s->num_v1_mode++;
+        s->num_v1_encs += s->w*h/MB_AREA;
+    } else {
+        if(best_info.mode == MODE_V1_V4)
+            s->num_v4_mode++;
+        else
+            s->num_mc_mode++;
+
+        int x;
+        for(x = 0; x < s->w*h/MB_AREA; x++)
+            if(s->best_mb[x].best_encoding == ENC_V1)
+                s->num_v1_encs++;
+            else if(s->best_mb[x].best_encoding == ENC_V4)
+                s->num_v4_encs++;
+            else
+                s->num_skips++;
+    }
+#endif
+
+    best_size += STRIP_HEADER_SIZE;
+    memcpy(buf, s->strip_buf, best_size);
+
+    return best_size;
+}
+
+static int write_cvid_header(CinepakEncContext *s, unsigned char *buf, int num_strips, int data_size, int isakeyframe)
+{
+    buf[0] = isakeyframe ? 0 : 1;
+    AV_WB24(&buf[1], data_size + CVID_HEADER_SIZE);
+    AV_WB16(&buf[4], s->w);
+    AV_WB16(&buf[6], s->h);
+    AV_WB16(&buf[8], num_strips);
+
+    return CVID_HEADER_SIZE;
+}
+
+static int rd_frame(CinepakEncContext *s, const AVFrame *frame, int isakeyframe, unsigned char *buf, int buf_size)
+{
+    int num_strips, strip, i, y, nexty, size, temp_size;
+    AVPicture last_pict, pict, scratch_pict;
+    int64_t best_score = 0, score, score_temp;
+#ifdef CINEPAK_REPORT_SERR
+    int64_t best_serr = 0, serr, serr_temp;
+#endif
+
+    int best_nstrips = -1, best_size = -1; // mark as uninitialzed
+
+    if(s->pix_fmt == AV_PIX_FMT_RGB24) {
+        int x;
+// build a copy of the given frame in the correct colorspace
+        for(y = 0; y < s->h; y += 2) {
+            for(x = 0; x < s->w; x += 2) {
+                uint8_t *ir[2]; int32_t r, g, b, rr, gg, bb;
+                ir[0] = ((AVPicture*)frame)->data[0] + x*3 + y*((AVPicture*)frame)->linesize[0];
+                ir[1] = ir[0] + ((AVPicture*)frame)->linesize[0];
+                get_sub_picture(s, x, y, (AVPicture*)s->input_frame, &scratch_pict);
+                r = g = b = 0;
+                for(i=0; i<4; ++i) {
+                    int i1, i2;
+                    i1 = (i&1); i2 = (i>=2);
+                    rr = ir[i2][i1*3+0];
+                    gg = ir[i2][i1*3+1];
+                    bb = ir[i2][i1*3+2];
+                    r += rr; g += gg; b += bb;
+// using fixed point arithmetic for portable repeatability, scaling by 2^23
+// "Y"
+//                    rr = 0.2857*rr + 0.5714*gg + 0.1429*bb;
+                    rr = (2396625*rr + 4793251*gg + 1198732*bb) >> 23;
+                    if(      rr <   0) rr =   0;
+                    else if (rr > 255) rr = 255;
+                    scratch_pict.data[0][i1 + i2*scratch_pict.linesize[0]] = rr;
+                }
+// let us scale down as late as possible
+//                r /= 4; g /= 4; b /= 4;
+// "U"
+//                rr = -0.1429*r - 0.2857*g + 0.4286*b;
+                rr = (-299683*r - 599156*g + 898839*b) >> 23;
+                if(      rr < -128) rr = -128;
+                else if (rr >  127) rr =  127;
+                scratch_pict.data[1][0] = rr + 128; // quantize needs unsigned
+// "V"
+//                rr = 0.3571*r - 0.2857*g - 0.0714*b;
+                rr = (748893*r - 599156*g - 149737*b) >> 23;
+                if(      rr < -128) rr = -128;
+                else if (rr >  127) rr =  127;
+                scratch_pict.data[2][0] = rr + 128; // quantize needs unsigned
+            }
+        }
+    }
+
+    //would be nice but quite certainly incompatible with vintage players:
+    // support encoding zero strips (meaning skip the whole frame)
+    for(num_strips = s->min_strips; num_strips <= s->max_strips && num_strips <= s->h / MB_SIZE; num_strips++) {
+        score = 0;
+        size = 0;
+#ifdef CINEPAK_REPORT_SERR
+        serr = 0;
+#endif
+
+        for(y = 0, strip = 1; y < s->h; strip++, y = nexty) {
+            int strip_height;
+
+            nexty = strip * s->h / num_strips; // <= s->h
+            //make nexty the next multiple of 4 if not already there
+            if(nexty & 3)
+                nexty += 4 - (nexty & 3);
+
+            strip_height = nexty - y;
+            if(strip_height <= 0) { // can this ever happen?
+                av_log(s->avctx, AV_LOG_INFO, "skipping zero height strip %i of %i\n", strip, num_strips);
+                continue;
+            }
+
+            if(s->pix_fmt == AV_PIX_FMT_RGB24)
+                get_sub_picture(s, 0, y, (AVPicture*)s->input_frame,    &pict);
+            else
+                get_sub_picture(s, 0, y, (AVPicture*)frame,              &pict);
+            get_sub_picture(s, 0, y, (AVPicture*)s->last_frame,    &last_pict);
+            get_sub_picture(s, 0, y, (AVPicture*)s->scratch_frame, &scratch_pict);
+
+            if((temp_size = rd_strip(s, y, strip_height, isakeyframe, &last_pict, &pict, &scratch_pict, s->frame_buf + size + CVID_HEADER_SIZE, &score_temp
+#ifdef CINEPAK_REPORT_SERR
+, &serr_temp
+#endif
+)) < 0)
+                return temp_size;
+
+            score += score_temp;
+#ifdef CINEPAK_REPORT_SERR
+            serr += serr_temp;
+#endif
+            size += temp_size;
+            //av_log(s->avctx, AV_LOG_INFO, "strip %d, isakeyframe=%d", strip, isakeyframe);
+            //av_log(s->avctx, AV_LOG_INFO, "\n");
+        }
+
+        if(best_score == 0 || score < best_score) {
+            best_score = score;
+#ifdef CINEPAK_REPORT_SERR
+            best_serr = serr;
+#endif
+            best_size = size + write_cvid_header(s, s->frame_buf, num_strips, size, isakeyframe);
+            //av_log(s->avctx, AV_LOG_INFO, "best number of strips so far: %2i, %12"PRId64", %i B\n", num_strips, score, best_size);
+#ifdef CINEPAK_REPORT_SERR
+            av_log(s->avctx, AV_LOG_INFO, "best number of strips so far: %2i, %12"PRId64", %i B\n", num_strips, serr, best_size);
+#endif
+
+            FFSWAP(AVFrame *, s->best_frame, s->scratch_frame);
+            memcpy(buf, s->frame_buf, best_size);
+            best_nstrips = num_strips;
+        }
+// avoid trying too many strip numbers without a real reason
+// (this makes the processing of the very first frame faster)
+        if(num_strips - best_nstrips > 4)
+            break;
+    }
+
+    av_assert0(best_nstrips >= 0 && best_size >= 0);
+
+// let the number of strips slowly adapt to the changes in the contents,
+// compared to full bruteforcing every time this will occasionally lead
+// to some r/d performance loss but makes encoding up to several times faster
+    if(!s->strip_number_delta_range) {
+        if(best_nstrips == s->max_strips) { // let us try to step up
+            s->max_strips = best_nstrips + 1;
+            if(s->max_strips >= s->max_max_strips)
+                s->max_strips = s->max_max_strips;
+        } else { // try to step down
+            s->max_strips = best_nstrips;
+        }
+        s->min_strips = s->max_strips - 1;
+        if(s->min_strips < s->min_min_strips)
+            s->min_strips = s->min_min_strips;
+    } else {
+        s->max_strips = best_nstrips + s->strip_number_delta_range;
+        if(s->max_strips >= s->max_max_strips)
+            s->max_strips = s->max_max_strips;
+        s->min_strips = best_nstrips - s->strip_number_delta_range;
+        if(s->min_strips < s->min_min_strips)
+            s->min_strips = s->min_min_strips;
+    }
+
+    return best_size;
+}
+
+static int cinepak_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                const AVFrame *frame, int *got_packet)
+{
+    CinepakEncContext *s = avctx->priv_data;
+    int ret;
+
+    s->lambda = frame->quality ? frame->quality - 1 : 2 * FF_LAMBDA_SCALE;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->frame_buf_size, 0)) < 0)
+        return ret;
+    ret = rd_frame(s, frame, (s->curframe == 0), pkt->data, s->frame_buf_size);
+    pkt->size = ret;
+    if (s->curframe == 0)
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    FFSWAP(AVFrame *, s->last_frame, s->best_frame);
+
+    if (++s->curframe >= s->keyint)
+        s->curframe = 0;
+
+    return 0;
+}
+
+static av_cold int cinepak_encode_end(AVCodecContext *avctx)
+{
+    CinepakEncContext *s = avctx->priv_data;
+    int x;
+
+    av_frame_free(&s->last_frame);
+    av_frame_free(&s->best_frame);
+    av_frame_free(&s->scratch_frame);
+    if (avctx->pix_fmt == AV_PIX_FMT_RGB24)
+        av_frame_free(&s->input_frame);
+    av_freep(&s->codebook_input);
+    av_freep(&s->codebook_closest);
+    av_freep(&s->strip_buf);
+    av_freep(&s->frame_buf);
+    av_freep(&s->mb);
+#ifdef CINEPAKENC_DEBUG
+    av_freep(&s->best_mb);
+#endif
+
+    for(x = 0; x < (avctx->pix_fmt == AV_PIX_FMT_RGB24 ? 4 : 3); x++)
+        av_freep(&s->pict_bufs[x]);
+
+#ifdef CINEPAKENC_DEBUG
+    av_log(avctx, AV_LOG_INFO, "strip coding stats: %i V1 mode, %i V4 mode, %i MC mode (%i V1 encs, %i V4 encs, %i skips)\n",
+        s->num_v1_mode, s->num_v4_mode, s->num_mc_mode, s->num_v1_encs, s->num_v4_encs, s->num_skips);
+#endif
+
+    return 0;
+}
+
+AVCodec ff_cinepak_encoder = {
+    .name           = "cinepak",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_CINEPAK,
+    .priv_data_size = sizeof(CinepakEncContext),
+    .init           = cinepak_encode_init,
+    .encode2        = cinepak_encode_frame,
+    .close          = cinepak_encode_end,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_RGB24, AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE},
+    .long_name      = NULL_IF_CONFIG_SMALL("Cinepak / CVID"),
+    .priv_class     = &cinepak_class,
+};
diff --git a/libavcodec/cljrdec.c b/libavcodec/cljrdec.c
index 33d8023429..4b187f8cf3 100644
--- a/libavcodec/cljrdec.c
+++ b/libavcodec/cljrdec.c
@@ -2,20 +2,20 @@
  * Cirrus Logic AccuPak (CLJR) decoder
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,16 +43,14 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
-    if (buf_size < avctx->height * avctx->width) {
+    if (buf_size / avctx->height < avctx->width) {
         av_log(avctx, AV_LOG_ERROR,
                "Resolution larger than buffer size. Invalid header?\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
@@ -63,10 +61,10 @@ static int decode_frame(AVCodecContext *avctx,
         uint8_t *cb   = &p->data[1][y * p->linesize[1]];
         uint8_t *cr   = &p->data[2][y * p->linesize[2]];
         for (x = 0; x < avctx->width; x += 4) {
-            luma[3] = get_bits(&gb, 5) << 3;
-            luma[2] = get_bits(&gb, 5) << 3;
-            luma[1] = get_bits(&gb, 5) << 3;
-            luma[0] = get_bits(&gb, 5) << 3;
+            luma[3] = (get_bits(&gb, 5)*33) >> 2;
+            luma[2] = (get_bits(&gb, 5)*33) >> 2;
+            luma[1] = (get_bits(&gb, 5)*33) >> 2;
+            luma[0] = (get_bits(&gb, 5)*33) >> 2;
             luma += 4;
             *(cb++) = get_bits(&gb, 6) << 2;
             *(cr++) = get_bits(&gb, 6) << 2;
@@ -93,3 +91,4 @@ AVCodec ff_cljr_decoder = {
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+
diff --git a/libavcodec/cljrenc.c b/libavcodec/cljrenc.c
index 0687e3004b..a3718259d1 100644
--- a/libavcodec/cljrenc.c
+++ b/libavcodec/cljrenc.c
@@ -2,20 +2,20 @@
  * Cirrus Logic AccuPak (CLJR) encoder
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,28 +25,39 @@
  */
 
 #include "libavutil/common.h"
+#include "libavutil/opt.h"
 
 #include "avcodec.h"
 #include "internal.h"
 #include "put_bits.h"
 
+typedef struct CLJRContext {
+    AVClass        *avclass;
+    int             dither_type;
+} CLJRContext;
+
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *p, int *got_packet)
 {
+    CLJRContext *a = avctx->priv_data;
     PutBitContext pb;
     int x, y, ret;
+    uint32_t dither= avctx->frame_number;
+    static const uint32_t ordered_dither[2][2] =
+    {
+        { 0x10400000, 0x104F0000 },
+        { 0xCB2A0000, 0xCB250000 },
+    };
 
-    if ((ret = ff_alloc_packet(pkt, 32*avctx->height*avctx->width/4)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
-        return ret;
+    if (avctx->width%4 && avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
+         av_log(avctx, AV_LOG_ERROR,
+                "Widths which are not a multiple of 4 might fail with some decoders, "
+                "use vstrict=-1 / -strict -1 to use %d anyway.\n", avctx->width);
+         return AVERROR_EXPERIMENTAL;
     }
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
+    if ((ret = ff_alloc_packet2(avctx, pkt, 32*avctx->height*avctx->width/4, 0)) < 0)
+        return ret;
 
     init_put_bits(&pb, pkt->data, pkt->size);
 
@@ -54,14 +65,25 @@ FF_ENABLE_DEPRECATION_WARNINGS
         uint8_t *luma = &p->data[0][y * p->linesize[0]];
         uint8_t *cb   = &p->data[1][y * p->linesize[1]];
         uint8_t *cr   = &p->data[2][y * p->linesize[2]];
+        uint8_t luma_tmp[4];
         for (x = 0; x < avctx->width; x += 4) {
-            put_bits(&pb, 5, luma[3] >> 3);
-            put_bits(&pb, 5, luma[2] >> 3);
-            put_bits(&pb, 5, luma[1] >> 3);
-            put_bits(&pb, 5, luma[0] >> 3);
+            switch (a->dither_type) {
+            case 0: dither = 0x492A0000;                       break;
+            case 1: dither = dither * 1664525 + 1013904223;    break;
+            case 2: dither = ordered_dither[ y&1 ][ (x>>2)&1 ];break;
+            }
+            if (x+3 >= avctx->width) {
+                memset(luma_tmp, 0, sizeof(luma_tmp));
+                memcpy(luma_tmp, luma, avctx->width - x);
+                luma = luma_tmp;
+            }
+            put_bits(&pb, 5, (249*(luma[3] +  (dither>>29)   )) >> 11);
+            put_bits(&pb, 5, (249*(luma[2] + ((dither>>26)&7))) >> 11);
+            put_bits(&pb, 5, (249*(luma[1] + ((dither>>23)&7))) >> 11);
+            put_bits(&pb, 5, (249*(luma[0] + ((dither>>20)&7))) >> 11);
             luma += 4;
-            put_bits(&pb, 6, *(cb++) >> 2);
-            put_bits(&pb, 6, *(cr++) >> 2);
+            put_bits(&pb, 6, (253*(*(cb++) + ((dither>>18)&3))) >> 10);
+            put_bits(&pb, 6, (253*(*(cr++) + ((dither>>16)&3))) >> 10);
         }
     }
 
@@ -73,12 +95,28 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
+#define OFFSET(x) offsetof(CLJRContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "dither_type",   "Dither type",   OFFSET(dither_type),        AV_OPT_TYPE_INT, { .i64=1 }, 0, 2, VE},
+    { NULL },
+};
+
+static const AVClass cljr_class = {
+    .class_name = "cljr encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_cljr_encoder = {
     .name           = "cljr",
     .long_name      = NULL_IF_CONFIG_SMALL("Cirrus Logic AccuPak"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_CLJR,
+    .priv_data_size = sizeof(CLJRContext),
     .encode2        = encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV411P,
                                                    AV_PIX_FMT_NONE },
+    .priv_class     = &cljr_class,
 };
diff --git a/libavcodec/cllc.c b/libavcodec/cllc.c
index cdbed74474..1c6902afd4 100644
--- a/libavcodec/cllc.c
+++ b/libavcodec/cllc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012-2013 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -398,7 +398,8 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
     ctx->bdsp.bswap16_buf((uint16_t *) ctx->swapped_buf, (uint16_t *) src,
                           data_size / 2);
 
-    init_get_bits(&gb, ctx->swapped_buf, data_size * 8);
+    if ((ret = init_get_bits8(&gb, ctx->swapped_buf, data_size)) < 0)
+        return ret;
 
     /*
      * Read in coding type. The types are as follows:
@@ -416,11 +417,8 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
         avctx->pix_fmt             = AV_PIX_FMT_YUV422P;
         avctx->bits_per_raw_sample = 8;
 
-        ret = ff_get_buffer(avctx, pic, 0);
-        if (ret < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+        if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
             return ret;
-        }
 
         ret = decode_yuv_frame(ctx, &gb, pic);
         if (ret < 0)
@@ -432,11 +430,8 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
         avctx->pix_fmt             = AV_PIX_FMT_RGB24;
         avctx->bits_per_raw_sample = 8;
 
-        ret = ff_get_buffer(avctx, pic, 0);
-        if (ret < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+        if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
             return ret;
-        }
 
         ret = decode_rgb24_frame(ctx, &gb, pic);
         if (ret < 0)
@@ -447,11 +442,8 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
         avctx->pix_fmt             = AV_PIX_FMT_ARGB;
         avctx->bits_per_raw_sample = 8;
 
-        ret = ff_get_buffer(avctx, pic, 0);
-        if (ret < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+        if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
             return ret;
-        }
 
         ret = decode_argb_frame(ctx, &gb, pic);
         if (ret < 0)
diff --git a/libavcodec/cngdec.c b/libavcodec/cngdec.c
index 482ef94a32..c0295cd77f 100644
--- a/libavcodec/cngdec.c
+++ b/libavcodec/cngdec.c
@@ -2,20 +2,20 @@
  * RFC 3389 comfort noise generator
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,11 +41,11 @@ typedef struct CNGContext {
 static av_cold int cng_decode_close(AVCodecContext *avctx)
 {
     CNGContext *p = avctx->priv_data;
-    av_free(p->refl_coef);
-    av_free(p->target_refl_coef);
-    av_free(p->lpc_coef);
-    av_free(p->filter_out);
-    av_free(p->excitation);
+    av_freep(&p->refl_coef);
+    av_freep(&p->target_refl_coef);
+    av_freep(&p->lpc_coef);
+    av_freep(&p->filter_out);
+    av_freep(&p->excitation);
     return 0;
 }
 
@@ -59,12 +59,12 @@ static av_cold int cng_decode_init(AVCodecContext *avctx)
 
     p->order            = 12;
     avctx->frame_size   = 640;
-    p->refl_coef        = av_mallocz(p->order * sizeof(*p->refl_coef));
-    p->target_refl_coef = av_mallocz(p->order * sizeof(*p->target_refl_coef));
-    p->lpc_coef         = av_mallocz(p->order * sizeof(*p->lpc_coef));
-    p->filter_out       = av_mallocz((avctx->frame_size + p->order) *
+    p->refl_coef        = av_mallocz_array(p->order, sizeof(*p->refl_coef));
+    p->target_refl_coef = av_mallocz_array(p->order, sizeof(*p->target_refl_coef));
+    p->lpc_coef         = av_mallocz_array(p->order, sizeof(*p->lpc_coef));
+    p->filter_out       = av_mallocz_array(avctx->frame_size + p->order,
                                      sizeof(*p->filter_out));
-    p->excitation       = av_mallocz(avctx->frame_size * sizeof(*p->excitation));
+    p->excitation       = av_mallocz_array(avctx->frame_size, sizeof(*p->excitation));
     if (!p->refl_coef || !p->target_refl_coef || !p->lpc_coef ||
         !p->filter_out || !p->excitation) {
         cng_decode_close(avctx);
@@ -142,10 +142,8 @@ static int cng_decode_frame(AVCodecContext *avctx, void *data,
                                  p->excitation, avctx->frame_size, p->order);
 
     frame->nb_samples = avctx->frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     buf_out = (int16_t *)frame->data[0];
     for (i = 0; i < avctx->frame_size; i++)
         buf_out[i] = p->filter_out[i + p->order];
diff --git a/libavcodec/cngenc.c b/libavcodec/cngenc.c
index 98f3c4e91a..302c703f72 100644
--- a/libavcodec/cngenc.c
+++ b/libavcodec/cngenc.c
@@ -2,20 +2,20 @@
  * RFC 3389 comfort noise generator
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,8 +56,8 @@ static av_cold int cng_encode_init(AVCodecContext *avctx)
     p->order = 10;
     if ((ret = ff_lpc_init(&p->lpc, avctx->frame_size, p->order, FF_LPC_TYPE_LEVINSON)) < 0)
         return ret;
-    p->samples32 = av_malloc(avctx->frame_size * sizeof(*p->samples32));
-    p->ref_coef = av_malloc(p->order * sizeof(*p->ref_coef));
+    p->samples32 = av_malloc_array(avctx->frame_size, sizeof(*p->samples32));
+    p->ref_coef = av_malloc_array(p->order, sizeof(*p->ref_coef));
     if (!p->samples32 || !p->ref_coef) {
         cng_encode_close(avctx);
         return AVERROR(ENOMEM);
@@ -75,7 +75,7 @@ static int cng_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int qdbov;
     int16_t *samples = (int16_t*) frame->data[0];
 
-    if ((ret = ff_alloc_packet(avpkt, 1 + p->order))) {
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 1 + p->order, 1 + p->order))) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
         return ret;
     }
@@ -87,7 +87,7 @@ static int cng_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     energy /= frame->nb_samples;
     if (energy > 0) {
         double dbov = 10 * log10(energy / 1081109975);
-        qdbov = av_clip(-floor(dbov), 0, 127);
+        qdbov = av_clip_uintp2(-floor(dbov), 7);
     } else {
         qdbov = 127;
     }
@@ -97,7 +97,7 @@ static int cng_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         avpkt->data[1 + i] = p->ref_coef[i] * 127 + 127;
 
     *got_packet_ptr = 1;
-    avpkt->size = 1 + p->order;
+    av_assert1(avpkt->size == 1 + p->order);
 
     return 0;
 }
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index d2c7a9179c..da8d65816e 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -1,18 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * This table was generated from the long and short names of AVCodecs
+ * please see the respective codec sources for authorship
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +26,8 @@
 #include "avcodec.h"
 #include "version.h"
 
+#define MT(...) (const char *const[]){ __VA_ARGS__, NULL }
+
 static const AVCodecDescriptor codec_descriptors[] = {
     /* video codecs */
     {
@@ -82,6 +87,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "mjpeg",
         .long_name = NULL_IF_CONFIG_SMALL("Motion JPEG"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+        .mime_types= MT("image/jpeg"),
     },
     {
         .id        = AV_CODEC_ID_MJPEGB,
@@ -399,6 +405,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
+        .id        = AV_CODEC_ID_SNOW,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "snow",
+        .long_name = NULL_IF_CONFIG_SMALL("Snow"),
+        .props     = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
         .id        = AV_CODEC_ID_TSCC,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "tscc",
@@ -516,6 +529,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "bmp",
         .long_name = NULL_IF_CONFIG_SMALL("BMP (Windows and OS/2 bitmap)"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-ms-bmp"),
     },
     {
         .id        = AV_CODEC_ID_CSCD,
@@ -587,6 +601,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("JPEG 2000"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
                      AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/jp2"),
     },
     {
         .id        = AV_CODEC_ID_VMNC,
@@ -1013,7 +1028,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "cdxl",
         .long_name = NULL_IF_CONFIG_SMALL("Commodore CDXL video"),
-        .props     = AV_CODEC_PROP_LOSSY,
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
     },
     {
         .id        = AV_CODEC_ID_ZEROCODEC,
@@ -1072,6 +1087,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_Y41P,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "y41p",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed YUV 4:1:1 12-bit"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
         .id        = AV_CODEC_ID_ESCAPE130,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "escape130",
@@ -1079,6 +1101,88 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_AVRP,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "avrp",
+        .long_name = NULL_IF_CONFIG_SMALL("Avid 1:1 10-bit RGB Packer"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_012V,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "012v",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_AVUI,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "avui",
+        .long_name = NULL_IF_CONFIG_SMALL("Avid Meridien Uncompressed"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_AYUV,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "ayuv",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed MS 4:4:4:4"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_TARGA_Y216,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "targa_y216",
+        .long_name = NULL_IF_CONFIG_SMALL("Pinnacle TARGA CineWave YUV16"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_V308,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "v308",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:4:4"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_V408,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "v408",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed QT 4:4:4:4"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_YUV4,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "yuv4",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:2:0"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_AVRN,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "avrn",
+        .long_name = NULL_IF_CONFIG_SMALL("Avid AVI Codec"),
+    },
+    {
+        .id        = AV_CODEC_ID_CPIA,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "cpia",
+        .long_name = NULL_IF_CONFIG_SMALL("CPiA video format"),
+    },
+    {
+        .id        = AV_CODEC_ID_XFACE,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "xface",
+        .long_name = NULL_IF_CONFIG_SMALL("X-face image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_SMVJPEG,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "smvjpeg",
+        .long_name = NULL_IF_CONFIG_SMALL("Sigmatel Motion Video"),
+    },
+
+    {
         .id        = AV_CODEC_ID_G2M,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "g2m",
@@ -1096,7 +1200,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_HEVC,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "hevc",
-        .long_name = NULL_IF_CONFIG_SMALL("HEVC (High Efficiency Video Coding)"),
+        .long_name = NULL_IF_CONFIG_SMALL("H.265 / HEVC (High Efficiency Video Coding)"),
         .props     = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER,
     },
     {
@@ -1124,7 +1228,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_SANM,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "sanm",
-        .long_name = NULL_IF_CONFIG_SMALL("LucasArts SANM video"),
+        .long_name = NULL_IF_CONFIG_SMALL("LucasArts SANM/SMUSH video"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
@@ -1204,7 +1308,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_DPX,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "dpx",
-        .long_name = NULL_IF_CONFIG_SMALL("DPX image"),
+        .long_name = NULL_IF_CONFIG_SMALL("DPX (Digital Picture Exchange) image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
@@ -1221,6 +1325,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "gif",
         .long_name = NULL_IF_CONFIG_SMALL("GIF (Graphics Interchange Format)"),
         .props     = AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/gif"),
     },
     {
         .id        = AV_CODEC_ID_JPEGLS,
@@ -1243,6 +1348,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "pam",
         .long_name = NULL_IF_CONFIG_SMALL("PAM (Portable AnyMap) image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-portable-pixmap"),
     },
     {
         .id        = AV_CODEC_ID_PBM,
@@ -1257,6 +1363,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "pcx",
         .long_name = NULL_IF_CONFIG_SMALL("PC Paintbrush PCX image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-pcx"),
     },
     {
         .id        = AV_CODEC_ID_PGM,
@@ -1278,6 +1385,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "png",
         .long_name = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"),
         .props     = AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/png"),
     },
     {
         .id        = AV_CODEC_ID_PPM,
@@ -1320,6 +1428,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "targa",
         .long_name = NULL_IF_CONFIG_SMALL("Truevision Targa image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-targa", "image/x-tga"),
     },
     {
         .id        = AV_CODEC_ID_TDSC,
@@ -1334,6 +1443,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "tiff",
         .long_name = NULL_IF_CONFIG_SMALL("TIFF image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/tiff"),
     },
     {
         .id        = AV_CODEC_ID_TXD,
@@ -1356,6 +1466,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("WebP"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
                      AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/webp"),
     },
     {
         .id        = AV_CODEC_ID_WMV3IMAGE,
@@ -1377,6 +1488,15 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "xwd",
         .long_name = NULL_IF_CONFIG_SMALL("XWD (X Window Dump) image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-xwindowdump"),
+    },
+    {
+        .id        = AV_CODEC_ID_APNG,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "apng",
+        .long_name = NULL_IF_CONFIG_SMALL("APNG (Animated Portable Network Graphics) image"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/png"),
     },
 
     /* various PCM "codecs" */
@@ -1426,13 +1546,15 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_PCM_MULAW,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "pcm_mulaw",
-        .long_name = NULL_IF_CONFIG_SMALL("PCM mu-law"),
+        .long_name = NULL_IF_CONFIG_SMALL("PCM mu-law / G.711 mu-law"),
+        .props     = AV_CODEC_PROP_LOSSY,
     },
     {
         .id        = AV_CODEC_ID_PCM_ALAW,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "pcm_alaw",
-        .long_name = NULL_IF_CONFIG_SMALL("PCM A-law"),
+        .long_name = NULL_IF_CONFIG_SMALL("PCM A-law / G.711 A-law"),
+        .props     = AV_CODEC_PROP_LOSSY,
     },
     {
         .id        = AV_CODEC_ID_PCM_S32LE,
@@ -1515,7 +1637,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_PCM_S16LE_PLANAR,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "pcm_s16le_planar",
-        .long_name = NULL_IF_CONFIG_SMALL("PCM 16-bit little-endian planar"),
+        .long_name = NULL_IF_CONFIG_SMALL("PCM signed 16-bit little-endian planar"),
         .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
@@ -1586,7 +1708,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "s302m",
         .long_name = NULL_IF_CONFIG_SMALL("SMPTE 302M"),
-        .props     = AV_CODEC_PROP_LOSSY,
+        .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
         .id        = AV_CODEC_ID_PCM_S8_PLANAR,
@@ -1727,7 +1849,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_ADPCM_THP,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "adpcm_thp",
-        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo Gamecube THP"),
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo THP"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_THP_LE,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_thp_le",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo THP (Little-Endian)"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
@@ -1808,6 +1937,41 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_ADPCM_AFC,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_afc",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo Gamecube AFC"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_IMA_OKI,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_ima_oki",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Dialogic OKI"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_DTK,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_dtk",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo Gamecube DTK"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_IMA_RAD,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_ima_rad",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Radical"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_G726LE,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_g726le",
+        .long_name = NULL_IF_CONFIG_SMALL("G.726 ADPCM little-endian"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
         .id        = AV_CODEC_ID_ADPCM_VIMA,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "adpcm_vima",
@@ -2315,6 +2479,24 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_FFWAVESYNTH,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "wavesynth",
+        .long_name = NULL_IF_CONFIG_SMALL("Wave synthesis pseudo-codec"),
+    },
+    {
+        .id        = AV_CODEC_ID_SONIC,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "sonic",
+        .long_name = NULL_IF_CONFIG_SMALL("Sonic"),
+    },
+    {
+        .id        = AV_CODEC_ID_SONIC_LS,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "sonicls",
+        .long_name = NULL_IF_CONFIG_SMALL("Sonic lossless"),
+    },
+    {
         .id        = AV_CODEC_ID_OPUS,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "opus",
@@ -2356,6 +2538,55 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("On2 Audio for Video Codec"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
+    {
+        .id        = AV_CODEC_ID_EVRC,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "evrc",
+        .long_name = NULL_IF_CONFIG_SMALL("EVRC (Enhanced Variable Rate Codec)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_SMV,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "smv",
+        .long_name = NULL_IF_CONFIG_SMALL("SMV (Selectable Mode Vocoder)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_4GV,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "4gv",
+        .long_name = NULL_IF_CONFIG_SMALL("4GV (Fourth Generation Vocoder)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DSD_LSBF,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dsd_lsbf",
+        .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), least significant bit first"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DSD_MSBF,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dsd_msbf",
+        .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), most significant bit first"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DSD_LSBF_PLANAR,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dsd_lsbf_planar",
+        .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), least significant bit first, planar"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DSD_MSBF_PLANAR,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dsd_msbf_planar",
+        .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), most significant bit first, planar"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
 
     /* subtitle codecs */
     {
@@ -2363,42 +2594,56 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "dvd_subtitle",
         .long_name = NULL_IF_CONFIG_SMALL("DVD subtitles"),
+        .props     = AV_CODEC_PROP_BITMAP_SUB,
     },
     {
         .id        = AV_CODEC_ID_DVB_SUBTITLE,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "dvb_subtitle",
         .long_name = NULL_IF_CONFIG_SMALL("DVB subtitles"),
+        .props     = AV_CODEC_PROP_BITMAP_SUB,
     },
     {
         .id        = AV_CODEC_ID_TEXT,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "text",
         .long_name = NULL_IF_CONFIG_SMALL("raw UTF-8 text"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
     {
         .id        = AV_CODEC_ID_XSUB,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "xsub",
         .long_name = NULL_IF_CONFIG_SMALL("XSUB"),
+        .props     = AV_CODEC_PROP_BITMAP_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_ASS,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "ass",
+        .long_name = NULL_IF_CONFIG_SMALL("ASS (Advanced SSA) subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
     {
         .id        = AV_CODEC_ID_SSA,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "ssa",
-        .long_name = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) / ASS (Advanced SSA) subtitle"),
+        .long_name = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
     {
         .id        = AV_CODEC_ID_MOV_TEXT,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "mov_text",
         .long_name = NULL_IF_CONFIG_SMALL("MOV text"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
     {
         .id        = AV_CODEC_ID_HDMV_PGS_SUBTITLE,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "hdmv_pgs_subtitle",
         .long_name = NULL_IF_CONFIG_SMALL("HDMV Presentation Graphic Stream subtitles"),
+        .props     = AV_CODEC_PROP_BITMAP_SUB,
     },
     {
         .id        = AV_CODEC_ID_DVB_TELETEXT,
@@ -2410,8 +2655,171 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_SRT,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "srt",
-        .long_name = NULL_IF_CONFIG_SMALL("SubRip Text"),
+        .long_name = NULL_IF_CONFIG_SMALL("SubRip subtitle with embedded timing"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_SUBRIP,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "subrip",
+        .long_name = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_MICRODVD,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "microdvd",
+        .long_name = NULL_IF_CONFIG_SMALL("MicroDVD subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
+    {
+        .id        = AV_CODEC_ID_MPL2,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "mpl2",
+        .long_name = NULL_IF_CONFIG_SMALL("MPL2 subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_EIA_608,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "eia_608",
+        .long_name = NULL_IF_CONFIG_SMALL("EIA-608 closed captions"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_JACOSUB,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "jacosub",
+        .long_name = NULL_IF_CONFIG_SMALL("JACOsub subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_PJS,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "pjs",
+        .long_name = NULL_IF_CONFIG_SMALL("PJS (Phoenix Japanimation Society) subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_SAMI,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "sami",
+        .long_name = NULL_IF_CONFIG_SMALL("SAMI subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_REALTEXT,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "realtext",
+        .long_name = NULL_IF_CONFIG_SMALL("RealText subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_STL,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "stl",
+        .long_name = NULL_IF_CONFIG_SMALL("Spruce subtitle format"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_SUBVIEWER1,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "subviewer1",
+        .long_name = NULL_IF_CONFIG_SMALL("SubViewer v1 subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_SUBVIEWER,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "subviewer",
+        .long_name = NULL_IF_CONFIG_SMALL("SubViewer subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_VPLAYER,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "vplayer",
+        .long_name = NULL_IF_CONFIG_SMALL("VPlayer subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_WEBVTT,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "webvtt",
+        .long_name = NULL_IF_CONFIG_SMALL("WebVTT subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_HDMV_TEXT_SUBTITLE,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "hdmv_text_subtitle",
+        .long_name = NULL_IF_CONFIG_SMALL("HDMV Text subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+
+    /* other kind of codecs and pseudo-codecs */
+    {
+        .id        = AV_CODEC_ID_TTF,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "ttf",
+        .long_name = NULL_IF_CONFIG_SMALL("TrueType font"),
+        .mime_types= MT("application/x-truetype-font", "application/x-font"),
+    },
+    {
+        .id        = AV_CODEC_ID_BINTEXT,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "bintext",
+        .long_name = NULL_IF_CONFIG_SMALL("Binary text"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_XBIN,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "xbin",
+        .long_name = NULL_IF_CONFIG_SMALL("eXtended BINary text"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_IDF,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "idf",
+        .long_name = NULL_IF_CONFIG_SMALL("iCEDraw text"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_OTF,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "otf",
+        .long_name = NULL_IF_CONFIG_SMALL("OpenType font"),
+        .mime_types= MT("application/vnd.ms-opentype"),
+    },
+    {
+        .id        = AV_CODEC_ID_SMPTE_KLV,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "klv",
+        .long_name = NULL_IF_CONFIG_SMALL("SMPTE 336M Key-Length-Value (KLV) metadata"),
+    },
+    {
+        .id        = AV_CODEC_ID_DVD_NAV,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "dvd_nav_packet",
+        .long_name = NULL_IF_CONFIG_SMALL("DVD Nav packet"),
+    },
+    {
+        .id        = AV_CODEC_ID_TIMED_ID3,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "timed_id3",
+        .long_name = NULL_IF_CONFIG_SMALL("timed ID3 metadata"),
+    },
+    {
+        .id        = AV_CODEC_ID_BIN_DATA,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "bin_data",
+        .long_name = NULL_IF_CONFIG_SMALL("binary data"),
+        .mime_types= MT("application/octet-stream"),
+    },
+
+    /* deprecated codec ids */
 };
 
 const AVCodecDescriptor *avcodec_descriptor_get(enum AVCodecID id)
diff --git a/libavcodec/cook.c b/libavcodec/cook.c
index ddce57f6d3..d8fb736828 100644
--- a/libavcodec/cook.c
+++ b/libavcodec/cook.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Sascha Sommer
  * Copyright (c) 2005 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,6 +52,7 @@
 #include "fft.h"
 #include "internal.h"
 #include "sinewin.h"
+#include "unary.h"
 
 #include "cookdata.h"
 
@@ -219,7 +220,7 @@ static av_cold int init_cook_mlt(COOKContext *q)
     int j, ret;
     int mlt_size = q->samples_per_channel;
 
-    if ((q->mlt_window = av_malloc(mlt_size * sizeof(*q->mlt_window))) == 0)
+    if ((q->mlt_window = av_malloc_array(mlt_size, sizeof(*q->mlt_window))) == 0)
         return AVERROR(ENOMEM);
 
     /* Initialize the MLT window: simple sine window. */
@@ -229,7 +230,7 @@ static av_cold int init_cook_mlt(COOKContext *q)
 
     /* Initialize the MDCT. */
     if ((ret = ff_mdct_init(&q->mdct_ctx, av_log2(mlt_size) + 1, 1, 1.0 / 32768.0))) {
-        av_free(q->mlt_window);
+        av_freep(&q->mlt_window);
         return ret;
     }
     av_log(q->avctx, AV_LOG_DEBUG, "MDCT initialized, order = %d.\n",
@@ -303,8 +304,8 @@ static av_cold int cook_decode_close(AVCodecContext *avctx)
     av_log(avctx, AV_LOG_DEBUG, "Deallocating memory.\n");
 
     /* Free allocated memory buffers. */
-    av_free(q->mlt_window);
-    av_free(q->decoded_bytes_buffer);
+    av_freep(&q->mlt_window);
+    av_freep(&q->decoded_bytes_buffer);
 
     /* Free the transform. */
     ff_mdct_end(&q->mdct_ctx);
@@ -332,11 +333,7 @@ static void decode_gain_info(GetBitContext *gb, int *gaininfo)
 {
     int i, n;
 
-    while (get_bits1(gb)) {
-        /* NOTHING */
-    }
-
-    n = get_bits_count(gb) - 1;     // amount of elements*2 to update
+    n = get_unary(gb, 0, get_bits_left(gb));     // amount of elements*2 to update
 
     i = 0;
     while (n--) {
@@ -397,7 +394,7 @@ static int decode_envelope(COOKContext *q, COOKSubpacket *p,
  * @param category              pointer to the category array
  * @param category_index        pointer to the category_index array
  */
-static void categorize(COOKContext *q, COOKSubpacket *p, int *quant_index_table,
+static void categorize(COOKContext *q, COOKSubpacket *p, const int *quant_index_table,
                        int *category, int *category_index)
 {
     int exp_idx, bias, tmpbias1, tmpbias2, bits_left, num_bits, index, v, i, j;
@@ -421,7 +418,7 @@ static void categorize(COOKContext *q, COOKSubpacket *p, int *quant_index_table,
         num_bits = 0;
         index    = 0;
         for (j = p->total_subbands; j > 0; j--) {
-            exp_idx = av_clip((i - quant_index_table[index] + bias) / 2, 0, 7);
+            exp_idx = av_clip_uintp2((i - quant_index_table[index] + bias) / 2, 3);
             index++;
             num_bits += expbits_tab[exp_idx];
         }
@@ -432,7 +429,7 @@ static void categorize(COOKContext *q, COOKSubpacket *p, int *quant_index_table,
     /* Calculate total number of bits. */
     num_bits = 0;
     for (i = 0; i < p->total_subbands; i++) {
-        exp_idx = av_clip((bias - quant_index_table[i]) / 2, 0, 7);
+        exp_idx = av_clip_uintp2((bias - quant_index_table[i]) / 2, 3);
         num_bits += expbits_tab[exp_idx];
         exp_index1[i] = exp_idx;
         exp_index2[i] = exp_idx;
@@ -630,13 +627,17 @@ static int mono_decode(COOKContext *q, COOKSubpacket *p, float *mlt_buffer)
     int category_index[128] = { 0 };
     int category[128]       = { 0 };
     int quant_index_table[102];
-    int res;
+    int res, i;
 
     if ((res = decode_envelope(q, p, quant_index_table)) < 0)
         return res;
     q->num_vectors = get_bits(&q->gb, p->log2_numvector_size);
     categorize(q, p, quant_index_table, category, category_index);
     expand_category(q, category, category_index);
+    for (i=0; i<p->total_subbands; i++) {
+        if (category[i] > 7)
+            return AVERROR_INVALIDDATA;
+    }
     decode_vectors(q, p, category, quant_index_table, mlt_buffer);
 
     return 0;
@@ -736,7 +737,7 @@ static void imlt_gain(COOKContext *q, float *inbuffer,
  * @param q                 pointer to the COOKContext
  * @param decouple_tab      decoupling array
  */
-static void decouple_info(COOKContext *q, COOKSubpacket *p, int *decouple_tab)
+static int decouple_info(COOKContext *q, COOKSubpacket *p, int *decouple_tab)
 {
     int i;
     int vlc    = get_bits1(&q->gb);
@@ -745,7 +746,7 @@ static void decouple_info(COOKContext *q, COOKSubpacket *p, int *decouple_tab)
     int length = end - start + 1;
 
     if (start > end)
-        return;
+        return 0;
 
     if (vlc)
         for (i = 0; i < length; i++)
@@ -753,11 +754,18 @@ static void decouple_info(COOKContext *q, COOKSubpacket *p, int *decouple_tab)
                                                p->channel_coupling.table,
                                                p->channel_coupling.bits, 2);
     else
-        for (i = 0; i < length; i++)
-            decouple_tab[start + i] = get_bits(&q->gb, p->js_vlc_bits);
+        for (i = 0; i < length; i++) {
+            int v = get_bits(&q->gb, p->js_vlc_bits);
+            if (v == (1<<p->js_vlc_bits)-1) {
+                av_log(q->avctx, AV_LOG_ERROR, "decouple value too large\n");
+                return AVERROR_INVALIDDATA;
+            }
+            decouple_tab[start + i] = v;
+        }
+    return 0;
 }
 
-/*
+/**
  * function decouples a pair of signals from a single signal via multiplication.
  *
  * @param q                 pointer to the COOKContext
@@ -805,10 +813,10 @@ static int joint_decode(COOKContext *q, COOKSubpacket *p,
     /* Make sure the buffers are zeroed out. */
     memset(mlt_buffer_left,  0, 1024 * sizeof(*mlt_buffer_left));
     memset(mlt_buffer_right, 0, 1024 * sizeof(*mlt_buffer_right));
-    decouple_info(q, p, decouple_tab);
+    if ((res = decouple_info(q, p, decouple_tab)) < 0)
+        return res;
     if ((res = mono_decode(q, p, decode_buffer)) < 0)
         return res;
-
     /* The two channels are stored interleaved in decode_buffer. */
     for (i = 0; i < p->js_subband_start; i++) {
         for (j = 0; j < SUBBAND_SIZE; j++) {
@@ -927,7 +935,7 @@ static int decode_subpacket(COOKContext *q, COOKSubpacket *p,
                           p->mono_previous_buffer1,
                           outbuffer ? outbuffer[p->ch_idx] : NULL);
 
-    if (p->num_channels == 2)
+    if (p->num_channels == 2) {
         if (p->joint_stereo)
             mlt_compensate_output(q, q->decode_buffer_2, &p->gains1,
                                   p->mono_previous_buffer2,
@@ -936,6 +944,7 @@ static int decode_subpacket(COOKContext *q, COOKSubpacket *p,
             mlt_compensate_output(q, q->decode_buffer_2, &p->gains2,
                                   p->mono_previous_buffer2,
                                   outbuffer ? outbuffer[p->ch_idx + 1] : NULL);
+    }
 
     return 0;
 }
@@ -959,10 +968,8 @@ static int cook_decode_frame(AVCodecContext *avctx, void *data,
     /* get output buffer */
     if (q->discarded_packets >= 2) {
         frame->nb_samples = q->samples_per_channel;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
             return ret;
-        }
         samples = (float **)frame->extended_data;
     }
 
@@ -1009,7 +1016,6 @@ static int cook_decode_frame(AVCodecContext *avctx, void *data,
     return avctx->block_align;
 }
 
-#ifdef DEBUG
 static void dump_cook_context(COOKContext *q)
 {
     //int i=0;
@@ -1022,7 +1028,7 @@ static void dump_cook_context(COOKContext *q)
     }
     ff_dlog(q->avctx, "COOKContext\n");
     PRINT("nb_channels", q->avctx->channels);
-    PRINT("bit_rate", q->avctx->bit_rate);
+    PRINT("bit_rate", (int)q->avctx->bit_rate);
     PRINT("sample_rate", q->avctx->sample_rate);
     PRINT("samples_per_channel", q->subpacket[0].samples_per_channel);
     PRINT("subbands", q->subpacket[0].subbands);
@@ -1031,7 +1037,6 @@ static void dump_cook_context(COOKContext *q)
     PRINT("numvector_size", q->subpacket[0].numvector_size);
     PRINT("total_subbands", q->subpacket[0].total_subbands);
 }
-#endif
 
 /**
  * Cook initialization
@@ -1046,7 +1051,7 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
     int extradata_size = avctx->extradata_size;
     int s = 0;
     unsigned int channel_mask = 0;
-    int samples_per_frame;
+    int samples_per_frame = 0;
     int ret;
     q->avctx = avctx;
 
@@ -1080,6 +1085,11 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
         if (extradata_size >= 8) {
             bytestream_get_be32(&edata_ptr);    // Unknown unused
             q->subpacket[s].js_subband_start = bytestream_get_be16(&edata_ptr);
+            if (q->subpacket[s].js_subband_start >= 51) {
+                av_log(avctx, AV_LOG_ERROR, "js_subband_start %d is too large\n", q->subpacket[s].js_subband_start);
+                return AVERROR_INVALIDDATA;
+            }
+
             q->subpacket[s].js_vlc_bits = bytestream_get_be16(&edata_ptr);
             extradata_size -= 8;
         }
@@ -1187,15 +1197,24 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
             avpriv_request_sample(avctx, "subbands > 50");
             return AVERROR_PATCHWELCOME;
         }
+        if (q->subpacket[s].subbands == 0) {
+            avpriv_request_sample(avctx, "subbands = 0");
+            return AVERROR_PATCHWELCOME;
+        }
         q->subpacket[s].gains1.now      = q->subpacket[s].gain_1;
         q->subpacket[s].gains1.previous = q->subpacket[s].gain_2;
         q->subpacket[s].gains2.now      = q->subpacket[s].gain_3;
         q->subpacket[s].gains2.previous = q->subpacket[s].gain_4;
 
+        if (q->num_subpackets + q->subpacket[s].num_channels > q->avctx->channels) {
+            av_log(avctx, AV_LOG_ERROR, "Too many subpackets %d for channels %d\n", q->num_subpackets, q->avctx->channels);
+            return AVERROR_INVALIDDATA;
+        }
+
         q->num_subpackets++;
         s++;
-        if (s > MAX_SUBPACKETS) {
-            avpriv_request_sample(avctx, "subpackets > %d", MAX_SUBPACKETS);
+        if (s > FFMIN(MAX_SUBPACKETS, avctx->block_align)) {
+            avpriv_request_sample(avctx, "subpackets > %d", FFMIN(MAX_SUBPACKETS, avctx->block_align));
             return AVERROR_PATCHWELCOME;
         }
     }
@@ -1248,9 +1267,9 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
     else
         avctx->channel_layout = (avctx->channels == 2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
 
-#ifdef DEBUG
+
     dump_cook_context(q);
-#endif
+
     return 0;
 }
 
diff --git a/libavcodec/cook_parser.c b/libavcodec/cook_parser.c
index f140e90461..6dbbfd8a53 100644
--- a/libavcodec/cook_parser.c
+++ b/libavcodec/cook_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,11 +40,12 @@ static int cook_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
 {
     CookParseContext *s = s1->priv_data;
 
-    if (s->duration)
-        s1->duration = s->duration;
-    else if (avctx->extradata && avctx->extradata_size >= 8 && avctx->channels)
+    if (!s->duration &&
+                avctx->extradata && avctx->extradata_size >= 8 && avctx->channels)
         s->duration = AV_RB16(avctx->extradata + 4) / avctx->channels;
 
+    s1->duration = s->duration;
+
     /* always return the full packet. this parser isn't doing any splitting or
        combining, only setting packet duration */
     *poutbuf      = buf;
diff --git a/libavcodec/cookdata.h b/libavcodec/cookdata.h
index c4c26fae5f..7b9cba3c11 100644
--- a/libavcodec/cookdata.h
+++ b/libavcodec/cookdata.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Sascha Sommer
  * Copyright (c) 2005 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/copy_block.h b/libavcodec/copy_block.h
index 10718ccfda..9ed451f294 100644
--- a/libavcodec/copy_block.h
+++ b/libavcodec/copy_block.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,17 @@
 
 #include "libavutil/intreadwrite.h"
 
+static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        AV_COPY16U(dst, src);
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
 static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
 {
     int i;
diff --git a/libavcodec/cos_tablegen.c b/libavcodec/cos_tablegen.c
index 92b8295336..9af83f4d7d 100644
--- a/libavcodec/cos_tablegen.c
+++ b/libavcodec/cos_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -61,7 +61,7 @@ int main(int argc, char *argv[])
     printf("#include \"libavcodec/%s\"\n", do_sin ? "rdft.h" : "fft.h");
     for (i = 4; i <= BITS; i++) {
         int m = 1 << i;
-        double freq = 2*M_PI/m;
+        double freq = 2*3.14159265358979323846/m;
         printf("%s(%i) = {\n   ", do_sin ? "SINTABLE" : "COSTABLE", m);
         for (j = 0; j < m/2 - 1; j++) {
             int idx = j > m/4 ? m/2 - j : j;
diff --git a/libavcodec/cpia.c b/libavcodec/cpia.c
new file mode 100644
index 0000000000..6b784b2051
--- /dev/null
+++ b/libavcodec/cpia.c
@@ -0,0 +1,233 @@
+/*
+ * CPiA video decoder.
+ * Copyright (c) 2010 Hans de Goede <hdegoede@redhat.com>
+ *
+ * This decoder is based on the LGPL code available at
+ * https://v4l4j.googlecode.com/svn/v4l4j/trunk/libvideo/libv4lconvert/cpia1.c
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "internal.h"
+
+
+#define FRAME_HEADER_SIZE 64
+#define MAGIC_0         0x19    /**< First header byte */
+#define MAGIC_1         0x68    /**< Second header byte */
+#define SUBSAMPLE_420      0
+#define SUBSAMPLE_422      1
+#define YUVORDER_YUYV      0
+#define YUVORDER_UYVY      1
+#define NOT_COMPRESSED     0
+#define COMPRESSED         1
+#define NO_DECIMATION      0
+#define DECIMATION_ENAB    1
+#define EOL             0xfd    /**< End Of Line marker */
+#define EOI             0xff    /**< End Of Image marker */
+
+
+typedef struct {
+    AVFrame *frame;
+} CpiaContext;
+
+
+static int cpia_decode_frame(AVCodecContext *avctx,
+                             void *data, int *got_frame, AVPacket* avpkt)
+{
+    CpiaContext* const cpia = avctx->priv_data;
+    int i,j,ret;
+
+    uint8_t* const header = avpkt->data;
+    uint8_t* src;
+    int src_size;
+    uint16_t linelength;
+    uint8_t skip;
+
+    AVFrame *frame = cpia->frame;
+    uint8_t *y, *u, *v, *y_end, *u_end, *v_end;
+
+    // Check header
+    if ( avpkt->size < FRAME_HEADER_SIZE
+      || header[0] != MAGIC_0 || header[1] != MAGIC_1
+      || (header[17] != SUBSAMPLE_420 && header[17] != SUBSAMPLE_422)
+      || (header[18] != YUVORDER_YUYV && header[18] != YUVORDER_UYVY)
+      || (header[28] != NOT_COMPRESSED && header[28] != COMPRESSED)
+      || (header[29] != NO_DECIMATION && header[29] != DECIMATION_ENAB)
+    ) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid header!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // currently unsupported properties
+    if (header[17] == SUBSAMPLE_422) {
+        avpriv_report_missing_feature(avctx, "4:2:2 subsampling");
+        return AVERROR_PATCHWELCOME;
+    }
+    if (header[18] == YUVORDER_UYVY) {
+        avpriv_report_missing_feature(avctx, "YUV byte order UYVY");
+        return AVERROR_PATCHWELCOME;
+    }
+    if (header[29] == DECIMATION_ENAB) {
+        avpriv_report_missing_feature(avctx, "Decimation");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    src = header + FRAME_HEADER_SIZE;
+    src_size = avpkt->size - FRAME_HEADER_SIZE;
+
+    if (header[28] == NOT_COMPRESSED) {
+        frame->pict_type = AV_PICTURE_TYPE_I;
+        frame->key_frame = 1;
+    } else {
+        frame->pict_type = AV_PICTURE_TYPE_P;
+        frame->key_frame = 0;
+    }
+
+    // Get buffer filled with previous frame
+    if ((ret = ff_reget_buffer(avctx, frame)) < 0)
+        return ret;
+
+
+    for ( i = 0;
+          i < frame->height;
+          i++, src += linelength, src_size -= linelength
+    ) {
+        // Read line length, two byte little endian
+        linelength = AV_RL16(src);
+        src += 2;
+
+        if (src_size < linelength) {
+            av_frame_set_decode_error_flags(frame, FF_DECODE_ERROR_INVALID_BITSTREAM);
+            av_log(avctx, AV_LOG_WARNING, "Frame ended unexpectedly!\n");
+            break;
+        }
+        if (src[linelength - 1] != EOL) {
+            av_frame_set_decode_error_flags(frame, FF_DECODE_ERROR_INVALID_BITSTREAM);
+            av_log(avctx, AV_LOG_WARNING, "Wrong line length %d or line not terminated properly (found 0x%02x)!\n", linelength, src[linelength - 1]);
+            break;
+        }
+
+        /* Update the data pointers. Y data is on every line.
+         * U and V data on every second line
+         */
+        y = &frame->data[0][i * frame->linesize[0]];
+        u = &frame->data[1][(i >> 1) * frame->linesize[1]];
+        v = &frame->data[2][(i >> 1) * frame->linesize[2]];
+        y_end = y + frame->linesize[0] - 1;
+        u_end = u + frame->linesize[1] - 1;
+        v_end = v + frame->linesize[2] - 1;
+
+        if ((i & 1) && header[17] == SUBSAMPLE_420) {
+            /* We are on a odd line and 420 subsample is used.
+             * On this line only Y values are specified, one per pixel.
+             */
+            for (j = 0; j < linelength - 1; j++) {
+                if (y > y_end) {
+                    av_frame_set_decode_error_flags(frame, FF_DECODE_ERROR_INVALID_BITSTREAM);
+                    av_log(avctx, AV_LOG_WARNING, "Decoded data exceeded linesize!\n");
+                    break;
+                }
+                if ((src[j] & 1) && header[28] == COMPRESSED) {
+                    /* It seems that odd lines are always uncompressed, but
+                     * we do it according to specification anyways.
+                     */
+                    skip = src[j] >> 1;
+                    y += skip;
+                } else {
+                    *(y++) = src[j];
+                }
+            }
+        } else if (header[17] == SUBSAMPLE_420) {
+            /* We are on an even line and 420 subsample is used.
+             * On this line each pair of pixels is described by four bytes.
+             */
+            for (j = 0; j < linelength - 4; ) {
+                if (y + 1 > y_end || u > u_end || v > v_end) {
+                    av_frame_set_decode_error_flags(frame, FF_DECODE_ERROR_INVALID_BITSTREAM);
+                    av_log(avctx, AV_LOG_WARNING, "Decoded data exceeded linesize!\n");
+                    break;
+                }
+                if ((src[j] & 1) && header[28] == COMPRESSED) {
+                    // Skip amount of pixels and move forward one byte
+                    skip = src[j] >> 1;
+                    y += skip;
+                    u += skip >> 1;
+                    v += skip >> 1;
+                    j++;
+                } else {
+                    // Set image data as specified and move forward 4 bytes
+                    *(y++) = src[j];
+                    *(u++) = src[j+1];
+                    *(y++) = src[j+2];
+                    *(v++) = src[j+3];
+                    j += 4;
+                }
+            }
+        }
+    }
+
+    *got_frame = 1;
+    if ((ret = av_frame_ref(data, cpia->frame)) < 0)
+        return ret;
+
+    return avpkt->size;
+}
+
+static av_cold int cpia_decode_init(AVCodecContext *avctx)
+{
+    CpiaContext *s = avctx->priv_data;
+
+    // output pixel format
+    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+
+    /* The default timebase set by the v4l2 demuxer leads to probing which is buggy.
+     * Set some reasonable time_base to skip this.
+     */
+    if (avctx->time_base.num == 1 && avctx->time_base.den == 1000000) {
+        avctx->time_base.num = 1;
+        avctx->time_base.den = 60;
+    }
+
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold int cpia_decode_end(AVCodecContext *avctx)
+{
+    CpiaContext *s = avctx->priv_data;
+
+    av_frame_free(&s->frame);
+
+    return 0;
+}
+
+AVCodec ff_cpia_decoder = {
+    .name           = "cpia",
+    .long_name      = NULL_IF_CONFIG_SMALL("CPiA video format"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_CPIA,
+    .priv_data_size = sizeof(CpiaContext),
+    .init           = cpia_decode_init,
+    .close          = cpia_decode_end,
+    .decode         = cpia_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/crystalhd.c b/libavcodec/crystalhd.c
new file mode 100644
index 0000000000..3cb32a8c59
--- /dev/null
+++ b/libavcodec/crystalhd.c
@@ -0,0 +1,1226 @@
+/*
+ * - CrystalHD decoder module -
+ *
+ * Copyright(C) 2010,2011 Philip Langdale <ffmpeg.philipl@overt.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * - Principles of Operation -
+ *
+ * The CrystalHD decoder operates at the bitstream level - which is an even
+ * higher level than the decoding hardware you typically see in modern GPUs.
+ * This means it has a very simple interface, in principle. You feed demuxed
+ * packets in one end and get decoded picture (fields/frames) out the other.
+ *
+ * Of course, nothing is ever that simple. Due, at the very least, to b-frame
+ * dependencies in the supported formats, the hardware has a delay between
+ * when a packet goes in, and when a picture comes out. Furthermore, this delay
+ * is not just a function of time, but also one of the dependency on additional
+ * frames being fed into the decoder to satisfy the b-frame dependencies.
+ *
+ * As such, a pipeline will build up that is roughly equivalent to the required
+ * DPB for the file being played. If that was all it took, things would still
+ * be simple - so, of course, it isn't.
+ *
+ * The hardware has a way of indicating that a picture is ready to be copied out,
+ * but this is unreliable - and sometimes the attempt will still fail so, based
+ * on testing, the code will wait until 3 pictures are ready before starting
+ * to copy out - and this has the effect of extending the pipeline.
+ *
+ * Finally, while it is tempting to say that once the decoder starts outputting
+ * frames, the software should never fail to return a frame from a decode(),
+ * this is a hard assertion to make, because the stream may switch between
+ * differently encoded content (number of b-frames, interlacing, etc) which
+ * might require a longer pipeline than before. If that happened, you could
+ * deadlock trying to retrieve a frame that can't be decoded without feeding
+ * in additional packets.
+ *
+ * As such, the code will return in the event that a picture cannot be copied
+ * out, leading to an increase in the length of the pipeline. This in turn,
+ * means we have to be sensitive to the time it takes to decode a picture;
+ * We do not want to give up just because the hardware needed a little more
+ * time to prepare the picture! For this reason, there are delays included
+ * in the decode() path that ensure that, under normal conditions, the hardware
+ * will only fail to return a frame if it really needs additional packets to
+ * complete the decoding.
+ *
+ * Finally, to be explicit, we do not want the pipeline to grow without bound
+ * for two reasons: 1) The hardware can only buffer a finite number of packets,
+ * and 2) The client application may not be able to cope with arbitrarily long
+ * delays in the video path relative to the audio path. For example. MPlayer
+ * can only handle a 20 picture delay (although this is arbitrary, and needs
+ * to be extended to fully support the CrystalHD where the delay could be up
+ * to 32 pictures - consider PAFF H.264 content with 16 b-frames).
+ */
+
+/*****************************************************************************
+ * Includes
+ ****************************************************************************/
+
+#define _XOPEN_SOURCE 600
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <libcrystalhd/bc_dts_types.h>
+#include <libcrystalhd/bc_dts_defs.h>
+#include <libcrystalhd/libcrystalhd_if.h>
+
+#include "avcodec.h"
+#include "h264.h"
+#include "internal.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
+
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+/** Timeout parameter passed to DtsProcOutput() in us */
+#define OUTPUT_PROC_TIMEOUT 50
+/** Step between fake timestamps passed to hardware in units of 100ns */
+#define TIMESTAMP_UNIT 100000
+/** Initial value in us of the wait in decode() */
+#define BASE_WAIT 10000
+/** Increment in us to adjust wait in decode() */
+#define WAIT_UNIT 1000
+
+
+/*****************************************************************************
+ * Module private data
+ ****************************************************************************/
+
+typedef enum {
+    RET_ERROR           = -1,
+    RET_OK              = 0,
+    RET_COPY_AGAIN      = 1,
+    RET_SKIP_NEXT_COPY  = 2,
+    RET_COPY_NEXT_FIELD = 3,
+} CopyRet;
+
+typedef struct OpaqueList {
+    struct OpaqueList *next;
+    uint64_t fake_timestamp;
+    uint64_t reordered_opaque;
+    uint8_t pic_type;
+} OpaqueList;
+
+typedef struct {
+    AVClass *av_class;
+    AVCodecContext *avctx;
+    AVFrame *pic;
+    HANDLE dev;
+
+    uint8_t *orig_extradata;
+    uint32_t orig_extradata_size;
+
+    AVBitStreamFilterContext *bsfc;
+    AVCodecParserContext *parser;
+
+    uint8_t is_70012;
+    uint8_t *sps_pps_buf;
+    uint32_t sps_pps_size;
+    uint8_t is_nal;
+    uint8_t output_ready;
+    uint8_t need_second_field;
+    uint8_t skip_next_output;
+    uint64_t decode_wait;
+
+    uint64_t last_picture;
+
+    OpaqueList *head;
+    OpaqueList *tail;
+
+    /* Options */
+    uint32_t sWidth;
+    uint8_t bframe_bug;
+} CHDContext;
+
+static const AVOption options[] = {
+    { "crystalhd_downscale_width",
+      "Turn on downscaling to the specified width",
+      offsetof(CHDContext, sWidth),
+      AV_OPT_TYPE_INT, {.i64 = 0}, 0, UINT32_MAX,
+      AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM, },
+    { NULL, },
+};
+
+
+/*****************************************************************************
+ * Helper functions
+ ****************************************************************************/
+
+static inline BC_MEDIA_SUBTYPE id2subtype(CHDContext *priv, enum AVCodecID id)
+{
+    switch (id) {
+    case AV_CODEC_ID_MPEG4:
+        return BC_MSUBTYPE_DIVX;
+    case AV_CODEC_ID_MSMPEG4V3:
+        return BC_MSUBTYPE_DIVX311;
+    case AV_CODEC_ID_MPEG2VIDEO:
+        return BC_MSUBTYPE_MPEG2VIDEO;
+    case AV_CODEC_ID_VC1:
+        return BC_MSUBTYPE_VC1;
+    case AV_CODEC_ID_WMV3:
+        return BC_MSUBTYPE_WMV3;
+    case AV_CODEC_ID_H264:
+        return priv->is_nal ? BC_MSUBTYPE_AVC1 : BC_MSUBTYPE_H264;
+    default:
+        return BC_MSUBTYPE_INVALID;
+    }
+}
+
+static inline void print_frame_info(CHDContext *priv, BC_DTS_PROC_OUT *output)
+{
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tYBuffSz: %u\n", output->YbuffSz);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tYBuffDoneSz: %u\n",
+           output->YBuffDoneSz);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tUVBuffDoneSz: %u\n",
+           output->UVBuffDoneSz);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tTimestamp: %"PRIu64"\n",
+           output->PicInfo.timeStamp);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tPicture Number: %u\n",
+           output->PicInfo.picture_number);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tWidth: %u\n",
+           output->PicInfo.width);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tHeight: %u\n",
+           output->PicInfo.height);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tChroma: 0x%03x\n",
+           output->PicInfo.chroma_format);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tPulldown: %u\n",
+           output->PicInfo.pulldown);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tFlags: 0x%08x\n",
+           output->PicInfo.flags);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tFrame Rate/Res: %u\n",
+           output->PicInfo.frame_rate);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tAspect Ratio: %u\n",
+           output->PicInfo.aspect_ratio);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tColor Primaries: %u\n",
+           output->PicInfo.colour_primaries);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tMetaData: %u\n",
+           output->PicInfo.picture_meta_payload);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tSession Number: %u\n",
+           output->PicInfo.sess_num);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tycom: %u\n",
+           output->PicInfo.ycom);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tCustom Aspect: %u\n",
+           output->PicInfo.custom_aspect_ratio_width_height);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tFrames to Drop: %u\n",
+           output->PicInfo.n_drop);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tH264 Valid Fields: 0x%08x\n",
+           output->PicInfo.other.h264.valid);
+}
+
+
+/*****************************************************************************
+ * OpaqueList functions
+ ****************************************************************************/
+
+static uint64_t opaque_list_push(CHDContext *priv, uint64_t reordered_opaque,
+                                 uint8_t pic_type)
+{
+    OpaqueList *newNode = av_mallocz(sizeof (OpaqueList));
+    if (!newNode) {
+        av_log(priv->avctx, AV_LOG_ERROR,
+               "Unable to allocate new node in OpaqueList.\n");
+        return 0;
+    }
+    if (!priv->head) {
+        newNode->fake_timestamp = TIMESTAMP_UNIT;
+        priv->head              = newNode;
+    } else {
+        newNode->fake_timestamp = priv->tail->fake_timestamp + TIMESTAMP_UNIT;
+        priv->tail->next        = newNode;
+    }
+    priv->tail = newNode;
+    newNode->reordered_opaque = reordered_opaque;
+    newNode->pic_type = pic_type;
+
+    return newNode->fake_timestamp;
+}
+
+/*
+ * The OpaqueList is built in decode order, while elements will be removed
+ * in presentation order. If frames are reordered, this means we must be
+ * able to remove elements that are not the first element.
+ *
+ * Returned node must be freed by caller.
+ */
+static OpaqueList *opaque_list_pop(CHDContext *priv, uint64_t fake_timestamp)
+{
+    OpaqueList *node = priv->head;
+
+    if (!priv->head) {
+        av_log(priv->avctx, AV_LOG_ERROR,
+               "CrystalHD: Attempted to query non-existent timestamps.\n");
+        return NULL;
+    }
+
+    /*
+     * The first element is special-cased because we have to manipulate
+     * the head pointer rather than the previous element in the list.
+     */
+    if (priv->head->fake_timestamp == fake_timestamp) {
+        priv->head = node->next;
+
+        if (!priv->head->next)
+            priv->tail = priv->head;
+
+        node->next = NULL;
+        return node;
+    }
+
+    /*
+     * The list is processed at arm's length so that we have the
+     * previous element available to rewrite its next pointer.
+     */
+    while (node->next) {
+        OpaqueList *current = node->next;
+        if (current->fake_timestamp == fake_timestamp) {
+            node->next = current->next;
+
+            if (!node->next)
+               priv->tail = node;
+
+            current->next = NULL;
+            return current;
+        } else {
+            node = current;
+        }
+    }
+
+    av_log(priv->avctx, AV_LOG_VERBOSE,
+           "CrystalHD: Couldn't match fake_timestamp.\n");
+    return NULL;
+}
+
+
+/*****************************************************************************
+ * Video decoder API function definitions
+ ****************************************************************************/
+
+static void flush(AVCodecContext *avctx)
+{
+    CHDContext *priv = avctx->priv_data;
+
+    avctx->has_b_frames     = 0;
+    priv->last_picture      = -1;
+    priv->output_ready      = 0;
+    priv->need_second_field = 0;
+    priv->skip_next_output  = 0;
+    priv->decode_wait       = BASE_WAIT;
+
+    av_frame_unref (priv->pic);
+
+    /* Flush mode 4 flushes all software and hardware buffers. */
+    DtsFlushInput(priv->dev, 4);
+}
+
+
+static av_cold int uninit(AVCodecContext *avctx)
+{
+    CHDContext *priv = avctx->priv_data;
+    HANDLE device;
+
+    device = priv->dev;
+    DtsStopDecoder(device);
+    DtsCloseDecoder(device);
+    DtsDeviceClose(device);
+
+    /*
+     * Restore original extradata, so that if the decoder is
+     * reinitialised, the bitstream detection and filtering
+     * will work as expected.
+     */
+    if (priv->orig_extradata) {
+        av_free(avctx->extradata);
+        avctx->extradata = priv->orig_extradata;
+        avctx->extradata_size = priv->orig_extradata_size;
+        priv->orig_extradata = NULL;
+        priv->orig_extradata_size = 0;
+    }
+
+    av_parser_close(priv->parser);
+    if (priv->bsfc) {
+        av_bitstream_filter_close(priv->bsfc);
+    }
+
+    av_freep(&priv->sps_pps_buf);
+
+    av_frame_free (&priv->pic);
+
+    if (priv->head) {
+       OpaqueList *node = priv->head;
+       while (node) {
+          OpaqueList *next = node->next;
+          av_free(node);
+          node = next;
+       }
+    }
+
+    return 0;
+}
+
+
+static av_cold int init(AVCodecContext *avctx)
+{
+    CHDContext* priv;
+    BC_STATUS ret;
+    BC_INFO_CRYSTAL version;
+    BC_INPUT_FORMAT format = {
+        .FGTEnable   = FALSE,
+        .Progressive = TRUE,
+        .OptFlags    = 0x80000000 | vdecFrameRate59_94 | 0x40,
+        .width       = avctx->width,
+        .height      = avctx->height,
+    };
+
+    BC_MEDIA_SUBTYPE subtype;
+
+    uint32_t mode = DTS_PLAYBACK_MODE |
+                    DTS_LOAD_FILE_PLAY_FW |
+                    DTS_SKIP_TX_CHK_CPB |
+                    DTS_PLAYBACK_DROP_RPT_MODE |
+                    DTS_SINGLE_THREADED_MODE |
+                    DTS_DFLT_RESOLUTION(vdecRESOLUTION_1080p23_976);
+
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD Init for %s\n",
+           avctx->codec->name);
+
+    avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+
+    /* Initialize the library */
+    priv               = avctx->priv_data;
+    priv->avctx        = avctx;
+    priv->is_nal       = avctx->extradata_size > 0 && *(avctx->extradata) == 1;
+    priv->last_picture = -1;
+    priv->decode_wait  = BASE_WAIT;
+    priv->pic          = av_frame_alloc();
+
+    subtype = id2subtype(priv, avctx->codec->id);
+    switch (subtype) {
+    case BC_MSUBTYPE_AVC1:
+        {
+            uint8_t *dummy_p;
+            int dummy_int;
+
+            /* Back up the extradata so it can be restored at close time. */
+            priv->orig_extradata = av_malloc(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!priv->orig_extradata) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Failed to allocate copy of extradata\n");
+                return AVERROR(ENOMEM);
+            }
+            priv->orig_extradata_size = avctx->extradata_size;
+            memcpy(priv->orig_extradata, avctx->extradata, avctx->extradata_size);
+
+            priv->bsfc = av_bitstream_filter_init("h264_mp4toannexb");
+            if (!priv->bsfc) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Cannot open the h264_mp4toannexb BSF!\n");
+                return AVERROR_BSF_NOT_FOUND;
+            }
+            av_bitstream_filter_filter(priv->bsfc, avctx, NULL, &dummy_p,
+                                       &dummy_int, NULL, 0, 0);
+        }
+        subtype = BC_MSUBTYPE_H264;
+        // Fall-through
+    case BC_MSUBTYPE_H264:
+        format.startCodeSz = 4;
+        // Fall-through
+    case BC_MSUBTYPE_VC1:
+    case BC_MSUBTYPE_WVC1:
+    case BC_MSUBTYPE_WMV3:
+    case BC_MSUBTYPE_WMVA:
+    case BC_MSUBTYPE_MPEG2VIDEO:
+    case BC_MSUBTYPE_DIVX:
+    case BC_MSUBTYPE_DIVX311:
+        format.pMetaData  = avctx->extradata;
+        format.metaDataSz = avctx->extradata_size;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: Unknown codec name\n");
+        return AVERROR(EINVAL);
+    }
+    format.mSubtype = subtype;
+
+    if (priv->sWidth) {
+        format.bEnableScaling = 1;
+        format.ScalingParams.sWidth = priv->sWidth;
+    }
+
+    /* Get a decoder instance */
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: starting up\n");
+    // Initialize the Link and Decoder devices
+    ret = DtsDeviceOpen(&priv->dev, mode);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: DtsDeviceOpen failed\n");
+        goto fail;
+    }
+
+    ret = DtsCrystalHDVersion(priv->dev, &version);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_VERBOSE,
+               "CrystalHD: DtsCrystalHDVersion failed\n");
+        goto fail;
+    }
+    priv->is_70012 = version.device == 0;
+
+    if (priv->is_70012 &&
+        (subtype == BC_MSUBTYPE_DIVX || subtype == BC_MSUBTYPE_DIVX311)) {
+        av_log(avctx, AV_LOG_VERBOSE,
+               "CrystalHD: BCM70012 doesn't support MPEG4-ASP/DivX/Xvid\n");
+        goto fail;
+    }
+
+    ret = DtsSetInputFormat(priv->dev, &format);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: SetInputFormat failed\n");
+        goto fail;
+    }
+
+    ret = DtsOpenDecoder(priv->dev, BC_STREAM_TYPE_ES);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: DtsOpenDecoder failed\n");
+        goto fail;
+    }
+
+    ret = DtsSetColorSpace(priv->dev, OUTPUT_MODE422_YUY2);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: DtsSetColorSpace failed\n");
+        goto fail;
+    }
+    ret = DtsStartDecoder(priv->dev);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: DtsStartDecoder failed\n");
+        goto fail;
+    }
+    ret = DtsStartCapture(priv->dev);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: DtsStartCapture failed\n");
+        goto fail;
+    }
+
+    if (avctx->codec->id == AV_CODEC_ID_H264) {
+        priv->parser = av_parser_init(avctx->codec->id);
+        if (!priv->parser)
+            av_log(avctx, AV_LOG_WARNING,
+                   "Cannot open the h.264 parser! Interlaced h.264 content "
+                   "will not be detected reliably.\n");
+        priv->parser->flags = PARSER_FLAG_COMPLETE_FRAMES;
+    }
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Init complete.\n");
+
+    return 0;
+
+ fail:
+    uninit(avctx);
+    return -1;
+}
+
+
+static inline CopyRet copy_frame(AVCodecContext *avctx,
+                                 BC_DTS_PROC_OUT *output,
+                                 void *data, int *got_frame)
+{
+    BC_STATUS ret;
+    BC_DTS_STATUS decoder_status = { 0, };
+    uint8_t trust_interlaced;
+    uint8_t interlaced;
+
+    CHDContext *priv = avctx->priv_data;
+    int64_t pkt_pts  = AV_NOPTS_VALUE;
+    uint8_t pic_type = 0;
+
+    uint8_t bottom_field = (output->PicInfo.flags & VDEC_FLAG_BOTTOMFIELD) ==
+                           VDEC_FLAG_BOTTOMFIELD;
+    uint8_t bottom_first = !!(output->PicInfo.flags & VDEC_FLAG_BOTTOM_FIRST);
+
+    int width    = output->PicInfo.width;
+    int height   = output->PicInfo.height;
+    int bwidth;
+    uint8_t *src = output->Ybuff;
+    int sStride;
+    uint8_t *dst;
+    int dStride;
+
+    if (output->PicInfo.timeStamp != 0) {
+        OpaqueList *node = opaque_list_pop(priv, output->PicInfo.timeStamp);
+        if (node) {
+            pkt_pts = node->reordered_opaque;
+            pic_type = node->pic_type;
+            av_free(node);
+        } else {
+            /*
+             * We will encounter a situation where a timestamp cannot be
+             * popped if a second field is being returned. In this case,
+             * each field has the same timestamp and the first one will
+             * cause it to be popped. To keep subsequent calculations
+             * simple, pic_type should be set a FIELD value - doesn't
+             * matter which, but I chose BOTTOM.
+             */
+            pic_type = PICT_BOTTOM_FIELD;
+        }
+        av_log(avctx, AV_LOG_VERBOSE, "output \"pts\": %"PRIu64"\n",
+               output->PicInfo.timeStamp);
+        av_log(avctx, AV_LOG_VERBOSE, "output picture type %d\n",
+               pic_type);
+    }
+
+    ret = DtsGetDriverStatus(priv->dev, &decoder_status);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR,
+               "CrystalHD: GetDriverStatus failed: %u\n", ret);
+       return RET_ERROR;
+    }
+
+    /*
+     * For most content, we can trust the interlaced flag returned
+     * by the hardware, but sometimes we can't. These are the
+     * conditions under which we can trust the flag:
+     *
+     * 1) It's not h.264 content
+     * 2) The UNKNOWN_SRC flag is not set
+     * 3) We know we're expecting a second field
+     * 4) The hardware reports this picture and the next picture
+     *    have the same picture number.
+     *
+     * Note that there can still be interlaced content that will
+     * fail this check, if the hardware hasn't decoded the next
+     * picture or if there is a corruption in the stream. (In either
+     * case a 0 will be returned for the next picture number)
+     */
+    trust_interlaced = avctx->codec->id != AV_CODEC_ID_H264 ||
+                       !(output->PicInfo.flags & VDEC_FLAG_UNKNOWN_SRC) ||
+                       priv->need_second_field ||
+                       (decoder_status.picNumFlags & ~0x40000000) ==
+                       output->PicInfo.picture_number;
+
+    /*
+     * If we got a false negative for trust_interlaced on the first field,
+     * we will realise our mistake here when we see that the picture number is that
+     * of the previous picture. We cannot recover the frame and should discard the
+     * second field to keep the correct number of output frames.
+     */
+    if (output->PicInfo.picture_number == priv->last_picture && !priv->need_second_field) {
+        av_log(avctx, AV_LOG_WARNING,
+               "Incorrectly guessed progressive frame. Discarding second field\n");
+        /* Returning without providing a picture. */
+        return RET_OK;
+    }
+
+    interlaced = (output->PicInfo.flags & VDEC_FLAG_INTERLACED_SRC) &&
+                 trust_interlaced;
+
+    if (!trust_interlaced && (decoder_status.picNumFlags & ~0x40000000) == 0) {
+        av_log(avctx, AV_LOG_VERBOSE,
+               "Next picture number unknown. Assuming progressive frame.\n");
+    }
+
+    av_log(avctx, AV_LOG_VERBOSE, "Interlaced state: %d | trust_interlaced %d\n",
+           interlaced, trust_interlaced);
+
+    if (priv->pic->data[0] && !priv->need_second_field)
+        av_frame_unref(priv->pic);
+
+    priv->need_second_field = interlaced && !priv->need_second_field;
+
+    if (!priv->pic->data[0]) {
+        if (ff_get_buffer(avctx, priv->pic, AV_GET_BUFFER_FLAG_REF) < 0)
+            return RET_ERROR;
+    }
+
+    bwidth = av_image_get_linesize(avctx->pix_fmt, width, 0);
+    if (priv->is_70012) {
+        int pStride;
+
+        if (width <= 720)
+            pStride = 720;
+        else if (width <= 1280)
+            pStride = 1280;
+        else pStride = 1920;
+        sStride = av_image_get_linesize(avctx->pix_fmt, pStride, 0);
+    } else {
+        sStride = bwidth;
+    }
+
+    dStride = priv->pic->linesize[0];
+    dst     = priv->pic->data[0];
+
+    av_log(priv->avctx, AV_LOG_VERBOSE, "CrystalHD: Copying out frame\n");
+
+    if (interlaced) {
+        int dY = 0;
+        int sY = 0;
+
+        height /= 2;
+        if (bottom_field) {
+            av_log(priv->avctx, AV_LOG_VERBOSE, "Interlaced: bottom field\n");
+            dY = 1;
+        } else {
+            av_log(priv->avctx, AV_LOG_VERBOSE, "Interlaced: top field\n");
+            dY = 0;
+        }
+
+        for (sY = 0; sY < height; dY++, sY++) {
+            memcpy(&(dst[dY * dStride]), &(src[sY * sStride]), bwidth);
+            dY++;
+        }
+    } else {
+        av_image_copy_plane(dst, dStride, src, sStride, bwidth, height);
+    }
+
+    priv->pic->interlaced_frame = interlaced;
+    if (interlaced)
+        priv->pic->top_field_first = !bottom_first;
+
+    priv->pic->pkt_pts = pkt_pts;
+
+    if (!priv->need_second_field) {
+        *got_frame       = 1;
+        if ((ret = av_frame_ref(data, priv->pic)) < 0) {
+            return ret;
+        }
+    }
+
+    /*
+     * Two types of PAFF content have been observed. One form causes the
+     * hardware to return a field pair and the other individual fields,
+     * even though the input is always individual fields. We must skip
+     * copying on the next decode() call to maintain pipeline length in
+     * the first case.
+     */
+    if (!interlaced && (output->PicInfo.flags & VDEC_FLAG_UNKNOWN_SRC) &&
+        (pic_type == PICT_TOP_FIELD || pic_type == PICT_BOTTOM_FIELD)) {
+        av_log(priv->avctx, AV_LOG_VERBOSE, "Fieldpair from two packets.\n");
+        return RET_SKIP_NEXT_COPY;
+    }
+
+    /*
+     * The logic here is purely based on empirical testing with samples.
+     * If we need a second field, it could come from a second input packet,
+     * or it could come from the same field-pair input packet at the current
+     * field. In the first case, we should return and wait for the next time
+     * round to get the second field, while in the second case, we should
+     * ask the decoder for it immediately.
+     *
+     * Testing has shown that we are dealing with the fieldpair -> two fields
+     * case if the VDEC_FLAG_UNKNOWN_SRC is not set or if the input picture
+     * type was PICT_FRAME (in this second case, the flag might still be set)
+     */
+    return priv->need_second_field &&
+           (!(output->PicInfo.flags & VDEC_FLAG_UNKNOWN_SRC) ||
+            pic_type == PICT_FRAME) ?
+           RET_COPY_NEXT_FIELD : RET_OK;
+}
+
+
+static inline CopyRet receive_frame(AVCodecContext *avctx,
+                                    void *data, int *got_frame)
+{
+    BC_STATUS ret;
+    BC_DTS_PROC_OUT output = {
+        .PicInfo.width  = avctx->width,
+        .PicInfo.height = avctx->height,
+    };
+    CHDContext *priv = avctx->priv_data;
+    HANDLE dev       = priv->dev;
+
+    *got_frame = 0;
+
+    // Request decoded data from the driver
+    ret = DtsProcOutputNoCopy(dev, OUTPUT_PROC_TIMEOUT, &output);
+    if (ret == BC_STS_FMT_CHANGE) {
+        av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Initial format change\n");
+        avctx->width  = output.PicInfo.width;
+        avctx->height = output.PicInfo.height;
+        switch ( output.PicInfo.aspect_ratio ) {
+        case vdecAspectRatioSquare:
+            avctx->sample_aspect_ratio = (AVRational) {  1,  1};
+            break;
+        case vdecAspectRatio12_11:
+            avctx->sample_aspect_ratio = (AVRational) { 12, 11};
+            break;
+        case vdecAspectRatio10_11:
+            avctx->sample_aspect_ratio = (AVRational) { 10, 11};
+            break;
+        case vdecAspectRatio16_11:
+            avctx->sample_aspect_ratio = (AVRational) { 16, 11};
+            break;
+        case vdecAspectRatio40_33:
+            avctx->sample_aspect_ratio = (AVRational) { 40, 33};
+            break;
+        case vdecAspectRatio24_11:
+            avctx->sample_aspect_ratio = (AVRational) { 24, 11};
+            break;
+        case vdecAspectRatio20_11:
+            avctx->sample_aspect_ratio = (AVRational) { 20, 11};
+            break;
+        case vdecAspectRatio32_11:
+            avctx->sample_aspect_ratio = (AVRational) { 32, 11};
+            break;
+        case vdecAspectRatio80_33:
+            avctx->sample_aspect_ratio = (AVRational) { 80, 33};
+            break;
+        case vdecAspectRatio18_11:
+            avctx->sample_aspect_ratio = (AVRational) { 18, 11};
+            break;
+        case vdecAspectRatio15_11:
+            avctx->sample_aspect_ratio = (AVRational) { 15, 11};
+            break;
+        case vdecAspectRatio64_33:
+            avctx->sample_aspect_ratio = (AVRational) { 64, 33};
+            break;
+        case vdecAspectRatio160_99:
+            avctx->sample_aspect_ratio = (AVRational) {160, 99};
+            break;
+        case vdecAspectRatio4_3:
+            avctx->sample_aspect_ratio = (AVRational) {  4,  3};
+            break;
+        case vdecAspectRatio16_9:
+            avctx->sample_aspect_ratio = (AVRational) { 16,  9};
+            break;
+        case vdecAspectRatio221_1:
+            avctx->sample_aspect_ratio = (AVRational) {221,  1};
+            break;
+        }
+        return RET_COPY_AGAIN;
+    } else if (ret == BC_STS_SUCCESS) {
+        int copy_ret = -1;
+        if (output.PoutFlags & BC_POUT_FLAGS_PIB_VALID) {
+            if (priv->last_picture == -1) {
+                /*
+                 * Init to one less, so that the incrementing code doesn't
+                 * need to be special-cased.
+                 */
+                priv->last_picture = output.PicInfo.picture_number - 1;
+            }
+
+            if (avctx->codec->id == AV_CODEC_ID_MPEG4 &&
+                output.PicInfo.timeStamp == 0 && priv->bframe_bug) {
+                av_log(avctx, AV_LOG_VERBOSE,
+                       "CrystalHD: Not returning packed frame twice.\n");
+                priv->last_picture++;
+                DtsReleaseOutputBuffs(dev, NULL, FALSE);
+                return RET_COPY_AGAIN;
+            }
+
+            print_frame_info(priv, &output);
+
+            if (priv->last_picture + 1 < output.PicInfo.picture_number) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "CrystalHD: Picture Number discontinuity\n");
+                /*
+                 * Have we lost frames? If so, we need to shrink the
+                 * pipeline length appropriately.
+                 *
+                 * XXX: I have no idea what the semantics of this situation
+                 * are so I don't even know if we've lost frames or which
+                 * ones.
+                 *
+                 * In any case, only warn the first time.
+                 */
+               priv->last_picture = output.PicInfo.picture_number - 1;
+            }
+
+            copy_ret = copy_frame(avctx, &output, data, got_frame);
+            if (*got_frame > 0) {
+                avctx->has_b_frames--;
+                priv->last_picture++;
+                av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Pipeline length: %u\n",
+                       avctx->has_b_frames);
+            }
+        } else {
+            /*
+             * An invalid frame has been consumed.
+             */
+            av_log(avctx, AV_LOG_ERROR, "CrystalHD: ProcOutput succeeded with "
+                                        "invalid PIB\n");
+            avctx->has_b_frames--;
+            copy_ret = RET_OK;
+        }
+        DtsReleaseOutputBuffs(dev, NULL, FALSE);
+
+        return copy_ret;
+    } else if (ret == BC_STS_BUSY) {
+        return RET_COPY_AGAIN;
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: ProcOutput failed %d\n", ret);
+        return RET_ERROR;
+    }
+}
+
+
+static int decode(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
+{
+    BC_STATUS ret;
+    BC_DTS_STATUS decoder_status = { 0, };
+    CopyRet rec_ret;
+    CHDContext *priv   = avctx->priv_data;
+    HANDLE dev         = priv->dev;
+    uint8_t *in_data   = avpkt->data;
+    int len            = avpkt->size;
+    int free_data      = 0;
+    uint8_t pic_type   = 0;
+
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: decode_frame\n");
+
+    if (avpkt->size == 7 && !priv->bframe_bug) {
+        /*
+         * The use of a drop frame triggers the bug
+         */
+        av_log(avctx, AV_LOG_INFO,
+               "CrystalHD: Enabling work-around for packed b-frame bug\n");
+        priv->bframe_bug = 1;
+    } else if (avpkt->size == 8 && priv->bframe_bug) {
+        /*
+         * Delay frames don't trigger the bug
+         */
+        av_log(avctx, AV_LOG_INFO,
+               "CrystalHD: Disabling work-around for packed b-frame bug\n");
+        priv->bframe_bug = 0;
+    }
+
+    if (len) {
+        int32_t tx_free = (int32_t)DtsTxFreeSize(dev);
+
+        if (priv->parser) {
+            int ret = 0;
+
+            if (priv->bsfc) {
+                ret = av_bitstream_filter_filter(priv->bsfc, avctx, NULL,
+                                                 &in_data, &len,
+                                                 avpkt->data, len, 0);
+            }
+            free_data = ret > 0;
+
+            if (ret >= 0) {
+                uint8_t *pout;
+                int psize;
+                int index;
+                H264Context *h = priv->parser->priv_data;
+
+                index = av_parser_parse2(priv->parser, avctx, &pout, &psize,
+                                         in_data, len, avctx->internal->pkt->pts,
+                                         avctx->internal->pkt->dts, 0);
+                if (index < 0) {
+                    av_log(avctx, AV_LOG_WARNING,
+                           "CrystalHD: Failed to parse h.264 packet to "
+                           "detect interlacing.\n");
+                } else if (index != len) {
+                    av_log(avctx, AV_LOG_WARNING,
+                           "CrystalHD: Failed to parse h.264 packet "
+                           "completely. Interlaced frames may be "
+                           "incorrectly detected.\n");
+                } else {
+                    av_log(avctx, AV_LOG_VERBOSE,
+                           "CrystalHD: parser picture type %d\n",
+                           h->picture_structure);
+                    pic_type = h->picture_structure;
+                }
+            } else {
+                av_log(avctx, AV_LOG_WARNING,
+                       "CrystalHD: mp4toannexb filter failed to filter "
+                       "packet. Interlaced frames may be incorrectly "
+                       "detected.\n");
+            }
+        }
+
+        if (len < tx_free - 1024) {
+            /*
+             * Despite being notionally opaque, either libcrystalhd or
+             * the hardware itself will mangle pts values that are too
+             * small or too large. The docs claim it should be in units
+             * of 100ns. Given that we're nominally dealing with a black
+             * box on both sides, any transform we do has no guarantee of
+             * avoiding mangling so we need to build a mapping to values
+             * we know will not be mangled.
+             */
+            uint64_t pts = opaque_list_push(priv, avctx->internal->pkt->pts, pic_type);
+            if (!pts) {
+                if (free_data) {
+                    av_freep(&in_data);
+                }
+                return AVERROR(ENOMEM);
+            }
+            av_log(priv->avctx, AV_LOG_VERBOSE,
+                   "input \"pts\": %"PRIu64"\n", pts);
+            ret = DtsProcInput(dev, in_data, len, pts, 0);
+            if (free_data) {
+                av_freep(&in_data);
+            }
+            if (ret == BC_STS_BUSY) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "CrystalHD: ProcInput returned busy\n");
+                usleep(BASE_WAIT);
+                return AVERROR(EBUSY);
+            } else if (ret != BC_STS_SUCCESS) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "CrystalHD: ProcInput failed: %u\n", ret);
+                return -1;
+            }
+            avctx->has_b_frames++;
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "CrystalHD: Input buffer full\n");
+            len = 0; // We didn't consume any bytes.
+        }
+    } else {
+        av_log(avctx, AV_LOG_INFO, "CrystalHD: No more input data\n");
+    }
+
+    if (priv->skip_next_output) {
+        av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Skipping next output.\n");
+        priv->skip_next_output = 0;
+        avctx->has_b_frames--;
+        return len;
+    }
+
+    ret = DtsGetDriverStatus(dev, &decoder_status);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: GetDriverStatus failed\n");
+        return -1;
+    }
+
+    /*
+     * No frames ready. Don't try to extract.
+     *
+     * Empirical testing shows that ReadyListCount can be a damn lie,
+     * and ProcOut still fails when count > 0. The same testing showed
+     * that two more iterations were needed before ProcOutput would
+     * succeed.
+     */
+    if (priv->output_ready < 2) {
+        if (decoder_status.ReadyListCount != 0)
+            priv->output_ready++;
+        usleep(BASE_WAIT);
+        av_log(avctx, AV_LOG_INFO, "CrystalHD: Filling pipeline.\n");
+        return len;
+    } else if (decoder_status.ReadyListCount == 0) {
+        /*
+         * After the pipeline is established, if we encounter a lack of frames
+         * that probably means we're not giving the hardware enough time to
+         * decode them, so start increasing the wait time at the end of a
+         * decode call.
+         */
+        usleep(BASE_WAIT);
+        priv->decode_wait += WAIT_UNIT;
+        av_log(avctx, AV_LOG_INFO, "CrystalHD: No frames ready. Returning\n");
+        return len;
+    }
+
+    do {
+        rec_ret = receive_frame(avctx, data, got_frame);
+        if (rec_ret == RET_OK && *got_frame == 0) {
+            /*
+             * This case is for when the encoded fields are stored
+             * separately and we get a separate avpkt for each one. To keep
+             * the pipeline stable, we should return nothing and wait for
+             * the next time round to grab the second field.
+             * H.264 PAFF is an example of this.
+             */
+            av_log(avctx, AV_LOG_VERBOSE, "Returning after first field.\n");
+            avctx->has_b_frames--;
+        } else if (rec_ret == RET_COPY_NEXT_FIELD) {
+            /*
+             * This case is for when the encoded fields are stored in a
+             * single avpkt but the hardware returns then separately. Unless
+             * we grab the second field before returning, we'll slip another
+             * frame in the pipeline and if that happens a lot, we're sunk.
+             * So we have to get that second field now.
+             * Interlaced mpeg2 and vc1 are examples of this.
+             */
+            av_log(avctx, AV_LOG_VERBOSE, "Trying to get second field.\n");
+            while (1) {
+                usleep(priv->decode_wait);
+                ret = DtsGetDriverStatus(dev, &decoder_status);
+                if (ret == BC_STS_SUCCESS &&
+                    decoder_status.ReadyListCount > 0) {
+                    rec_ret = receive_frame(avctx, data, got_frame);
+                    if ((rec_ret == RET_OK && *got_frame > 0) ||
+                        rec_ret == RET_ERROR)
+                        break;
+                }
+            }
+            av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Got second field.\n");
+        } else if (rec_ret == RET_SKIP_NEXT_COPY) {
+            /*
+             * Two input packets got turned into a field pair. Gawd.
+             */
+            av_log(avctx, AV_LOG_VERBOSE,
+                   "Don't output on next decode call.\n");
+            priv->skip_next_output = 1;
+        }
+        /*
+         * If rec_ret == RET_COPY_AGAIN, that means that either we just handled
+         * a FMT_CHANGE event and need to go around again for the actual frame,
+         * we got a busy status and need to try again, or we're dealing with
+         * packed b-frames, where the hardware strangely returns the packed
+         * p-frame twice. We choose to keep the second copy as it carries the
+         * valid pts.
+         */
+    } while (rec_ret == RET_COPY_AGAIN);
+    usleep(priv->decode_wait);
+    return len;
+}
+
+
+#if CONFIG_H264_CRYSTALHD_DECODER
+static AVClass h264_class = {
+    "h264_crystalhd",
+    av_default_item_name,
+    options,
+    LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_h264_crystalhd_decoder = {
+    .name           = "h264_crystalhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (CrystalHD acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .priv_data_size = sizeof(CHDContext),
+    .init           = init,
+    .close          = uninit,
+    .decode         = decode,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
+    .flush          = flush,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE},
+    .priv_class     = &h264_class,
+};
+#endif
+
+#if CONFIG_MPEG2_CRYSTALHD_DECODER
+static AVClass mpeg2_class = {
+    "mpeg2_crystalhd",
+    av_default_item_name,
+    options,
+    LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_mpeg2_crystalhd_decoder = {
+    .name           = "mpeg2_crystalhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-2 Video (CrystalHD acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .priv_data_size = sizeof(CHDContext),
+    .init           = init,
+    .close          = uninit,
+    .decode         = decode,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
+    .flush          = flush,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE},
+    .priv_class     = &mpeg2_class,
+};
+#endif
+
+#if CONFIG_MPEG4_CRYSTALHD_DECODER
+static AVClass mpeg4_class = {
+    "mpeg4_crystalhd",
+    av_default_item_name,
+    options,
+    LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_mpeg4_crystalhd_decoder = {
+    .name           = "mpeg4_crystalhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-4 Part 2 (CrystalHD acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG4,
+    .priv_data_size = sizeof(CHDContext),
+    .init           = init,
+    .close          = uninit,
+    .decode         = decode,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
+    .flush          = flush,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE},
+    .priv_class     = &mpeg4_class,
+};
+#endif
+
+#if CONFIG_MSMPEG4_CRYSTALHD_DECODER
+static AVClass msmpeg4_class = {
+    "msmpeg4_crystalhd",
+    av_default_item_name,
+    options,
+    LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_msmpeg4_crystalhd_decoder = {
+    .name           = "msmpeg4_crystalhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-4 Part 2 Microsoft variant version 3 (CrystalHD acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MSMPEG4V3,
+    .priv_data_size = sizeof(CHDContext),
+    .init           = init,
+    .close          = uninit,
+    .decode         = decode,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_EXPERIMENTAL,
+    .flush          = flush,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE},
+    .priv_class     = &msmpeg4_class,
+};
+#endif
+
+#if CONFIG_VC1_CRYSTALHD_DECODER
+static AVClass vc1_class = {
+    "vc1_crystalhd",
+    av_default_item_name,
+    options,
+    LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_vc1_crystalhd_decoder = {
+    .name           = "vc1_crystalhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("SMPTE VC-1 (CrystalHD acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_VC1,
+    .priv_data_size = sizeof(CHDContext),
+    .init           = init,
+    .close          = uninit,
+    .decode         = decode,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
+    .flush          = flush,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE},
+    .priv_class     = &vc1_class,
+};
+#endif
+
+#if CONFIG_WMV3_CRYSTALHD_DECODER
+static AVClass wmv3_class = {
+    "wmv3_crystalhd",
+    av_default_item_name,
+    options,
+    LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_wmv3_crystalhd_decoder = {
+    .name           = "wmv3_crystalhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Video 9 (CrystalHD acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_WMV3,
+    .priv_data_size = sizeof(CHDContext),
+    .init           = init,
+    .close          = uninit,
+    .decode         = decode,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
+    .flush          = flush,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE},
+    .priv_class     = &wmv3_class,
+};
+#endif
diff --git a/libavcodec/cscd.c b/libavcodec/cscd.c
index 0cb375b60d..9e1dec9d96 100644
--- a/libavcodec/cscd.c
+++ b/libavcodec/cscd.c
@@ -2,20 +2,20 @@
  * CamStudio decoder
  * Copyright (c) 2006 Reimar Doeffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include <stdio.h>
@@ -31,14 +31,15 @@
 #include "libavutil/lzo.h"
 
 typedef struct CamStudioContext {
+    AVFrame *pic;
     int linelen, height, bpp;
     unsigned int decomp_size;
     unsigned char* decomp_buf;
 } CamStudioContext;
 
-static void copy_frame_default(AVFrame *f, const uint8_t *src, int src_stride,
+static void copy_frame_default(AVFrame *f, const uint8_t *src,
                                int linelen, int height) {
-    int i;
+    int i, src_stride = FFALIGN(linelen, 4);
     uint8_t *dst = f->data[0];
     dst += (height - 1) * f->linesize[0];
     for (i = height; i; i--) {
@@ -48,9 +49,9 @@ static void copy_frame_default(AVFrame *f, const uint8_t *src, int src_stride,
     }
 }
 
-static void add_frame_default(AVFrame *f, const uint8_t *src, int src_stride,
+static void add_frame_default(AVFrame *f, const uint8_t *src,
                               int linelen, int height) {
-    int i, j;
+    int i, j, src_stride = FFALIGN(linelen, 4);
     uint8_t *dst = f->data[0];
     dst += (height - 1) * f->linesize[0];
     for (i = height; i; i--) {
@@ -61,87 +62,11 @@ static void add_frame_default(AVFrame *f, const uint8_t *src, int src_stride,
     }
 }
 
-#if !HAVE_BIGENDIAN
-#define copy_frame_16(f, s, l, h) copy_frame_default(f, s, l, l, h)
-#define copy_frame_32(f, s, l, h) copy_frame_default(f, s, l, l, h)
-#define add_frame_16(f, s, l, h) add_frame_default(f, s, l, l, h)
-#define add_frame_32(f, s, l, h) add_frame_default(f, s, l, l, h)
-#else
-static void copy_frame_16(AVFrame *f, const uint8_t *src,
-                          int linelen, int height) {
-    int i, j;
-    uint8_t *dst = f->data[0];
-    dst += (height - 1) * f->linesize[0];
-    for (i = height; i; i--) {
-        for (j = linelen / 2; j; j--) {
-          dst[0] = src[1];
-          dst[1] = src[0];
-          src += 2;
-          dst += 2;
-        }
-        dst -= f->linesize[0] + linelen;
-    }
-}
-
-static void copy_frame_32(AVFrame *f, const uint8_t *src,
-                          int linelen, int height) {
-    int i, j;
-    uint8_t *dst = f->data[0];
-    dst += (height - 1) * f->linesize[0];
-    for (i = height; i; i--) {
-        for (j = linelen / 4; j; j--) {
-          dst[0] = src[3];
-          dst[1] = src[2];
-          dst[2] = src[1];
-          dst[3] = src[0];
-          src += 4;
-          dst += 4;
-        }
-        dst -= f->linesize[0] + linelen;
-    }
-}
-
-static void add_frame_16(AVFrame *f, const uint8_t *src,
-                         int linelen, int height) {
-    int i, j;
-    uint8_t *dst = f->data[0];
-    dst += (height - 1) * f->linesize[0];
-    for (i = height; i; i--) {
-        for (j = linelen / 2; j; j--) {
-          dst[0] += src[1];
-          dst[1] += src[0];
-          src += 2;
-          dst += 2;
-        }
-        dst -= f->linesize[0] + linelen;
-    }
-}
-
-static void add_frame_32(AVFrame *f, const uint8_t *src,
-                         int linelen, int height) {
-    int i, j;
-    uint8_t *dst = f->data[0];
-    dst += (height - 1) * f->linesize[0];
-    for (i = height; i; i--) {
-        for (j = linelen / 4; j; j--) {
-          dst[0] += src[3];
-          dst[1] += src[2];
-          dst[2] += src[1];
-          dst[3] += src[0];
-          src += 4;
-          dst += 4;
-        }
-        dst -= f->linesize[0] + linelen;
-    }
-}
-#endif
-
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt) {
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     CamStudioContext *c = avctx->priv_data;
-    AVFrame *picture = data;
     int ret;
 
     if (buf_size < 2) {
@@ -149,10 +74,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
         return ret;
-    }
 
     // decompress data
     switch ((buf[0] >> 1) & 7) {
@@ -180,36 +103,21 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     // flip upside down, add difference frame
     if (buf[0] & 1) { // keyframe
-        picture->pict_type = AV_PICTURE_TYPE_I;
-        picture->key_frame = 1;
-        switch (c->bpp) {
-          case 16:
-              copy_frame_16(picture, c->decomp_buf, c->linelen, c->height);
-              break;
-          case 32:
-              copy_frame_32(picture, c->decomp_buf, c->linelen, c->height);
-              break;
-          default:
-              copy_frame_default(picture, c->decomp_buf, FFALIGN(c->linelen, 4),
+        c->pic->pict_type = AV_PICTURE_TYPE_I;
+        c->pic->key_frame = 1;
+              copy_frame_default(c->pic, c->decomp_buf,
                                  c->linelen, c->height);
-        }
     } else {
-        picture->pict_type = AV_PICTURE_TYPE_P;
-        picture->key_frame = 0;
-        switch (c->bpp) {
-          case 16:
-              add_frame_16(picture, c->decomp_buf, c->linelen, c->height);
-              break;
-          case 32:
-              add_frame_32(picture, c->decomp_buf, c->linelen, c->height);
-              break;
-          default:
-              add_frame_default(picture, c->decomp_buf, FFALIGN(c->linelen, 4),
+        c->pic->pict_type = AV_PICTURE_TYPE_P;
+        c->pic->key_frame = 0;
+              add_frame_default(c->pic, c->decomp_buf,
                                 c->linelen, c->height);
-        }
     }
 
     *got_frame = 1;
+    if ((ret = av_frame_ref(data, c->pic)) < 0)
+        return ret;
+
     return buf_size;
 }
 
@@ -217,9 +125,9 @@ static av_cold int decode_init(AVCodecContext *avctx) {
     CamStudioContext *c = avctx->priv_data;
     int stride;
     switch (avctx->bits_per_coded_sample) {
-        case 16: avctx->pix_fmt = AV_PIX_FMT_RGB555; break;
+        case 16: avctx->pix_fmt = AV_PIX_FMT_RGB555LE; break;
         case 24: avctx->pix_fmt = AV_PIX_FMT_BGR24; break;
-        case 32: avctx->pix_fmt = AV_PIX_FMT_RGB32; break;
+        case 32: avctx->pix_fmt = AV_PIX_FMT_BGR0; break;
         default:
             av_log(avctx, AV_LOG_ERROR,
                    "CamStudio codec error: invalid depth %i bpp\n",
@@ -229,21 +137,23 @@ static av_cold int decode_init(AVCodecContext *avctx) {
     c->bpp = avctx->bits_per_coded_sample;
     c->linelen = avctx->width * avctx->bits_per_coded_sample / 8;
     c->height = avctx->height;
-    stride = c->linelen;
-    if (avctx->bits_per_coded_sample == 24)
-        stride = FFALIGN(stride, 4);
+    stride = FFALIGN(c->linelen, 4);
     c->decomp_size = c->height * stride;
     c->decomp_buf = av_malloc(c->decomp_size + AV_LZO_OUTPUT_PADDING);
     if (!c->decomp_buf) {
         av_log(avctx, AV_LOG_ERROR, "Can't allocate decompression buffer.\n");
         return AVERROR(ENOMEM);
     }
+    c->pic = av_frame_alloc();
+    if (!c->pic)
+        return AVERROR(ENOMEM);
     return 0;
 }
 
 static av_cold int decode_end(AVCodecContext *avctx) {
     CamStudioContext *c = avctx->priv_data;
     av_freep(&c->decomp_buf);
+    av_frame_free(&c->pic);
     return 0;
 }
 
diff --git a/libavcodec/cyuv.c b/libavcodec/cyuv.c
index c8d4ed3045..c7ec0085b0 100644
--- a/libavcodec/cyuv.c
+++ b/libavcodec/cyuv.c
@@ -4,22 +4,22 @@
  * based on "Creative YUV (CYUV) stream format for AVI":
  *   http://www.csse.monash.edu.au/~timf/videocodec/cyuv.txt
  *
- * Copyright (C) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,7 +52,6 @@ static av_cold int cyuv_decode_init(AVCodecContext *avctx)
     if (s->width & 0x3)
         return AVERROR_INVALIDDATA;
     s->height = avctx->height;
-    avctx->pix_fmt = AV_PIX_FMT_YUV411P;
 
     return 0;
 }
@@ -82,6 +81,7 @@ static int cyuv_decode_frame(AVCodecContext *avctx,
     int stream_ptr;
     unsigned char cur_byte;
     int pixel_groups;
+    int rawsize = s->height * FFALIGN(s->width,2) * 2;
     int ret;
 
     if (avctx->codec_id == AV_CODEC_ID_AURA) {
@@ -92,7 +92,11 @@ static int cyuv_decode_frame(AVCodecContext *avctx,
      * followed by (height) lines each with 3 bytes to represent groups
      * of 4 pixels. Thus, the total size of the buffer ought to be:
      *    (3 * 16) + height * (width * 3 / 4) */
-    if (buf_size != 48 + s->height * (s->width * 3 / 4)) {
+    if (buf_size == 48 + s->height * (s->width * 3 / 4)) {
+        avctx->pix_fmt = AV_PIX_FMT_YUV411P;
+    } else if(buf_size == rawsize ) {
+        avctx->pix_fmt = AV_PIX_FMT_UYVY422;
+    } else {
         av_log(avctx, AV_LOG_ERROR, "got a buffer with %d bytes when %d were expected\n",
                buf_size, 48 + s->height * (s->width * 3 / 4));
         return AVERROR_INVALIDDATA;
@@ -101,15 +105,22 @@ static int cyuv_decode_frame(AVCodecContext *avctx,
     /* pixel data starts 48 bytes in, after 3x16-byte tables */
     stream_ptr = 48;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     y_plane = frame->data[0];
     u_plane = frame->data[1];
     v_plane = frame->data[2];
 
+    if (buf_size == rawsize) {
+        int linesize = FFALIGN(s->width,2) * 2;
+        y_plane += frame->linesize[0] * s->height;
+        for (stream_ptr = 0; stream_ptr < rawsize; stream_ptr += linesize) {
+            y_plane -= frame->linesize[0];
+            memcpy(y_plane, buf+stream_ptr, linesize);
+        }
+    } else {
+
     /* iterate through each line in the height */
     for (y_ptr = 0, u_ptr = 0, v_ptr = 0;
          y_ptr < (s->height * frame->linesize[0]);
@@ -157,6 +168,7 @@ static int cyuv_decode_frame(AVCodecContext *avctx,
 
         }
     }
+    }
 
     *got_frame = 1;
 
diff --git a/libavcodec/d3d11va.c b/libavcodec/d3d11va.c
index 946de06cf1..9967f322c9 100644
--- a/libavcodec/d3d11va.c
+++ b/libavcodec/d3d11va.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2015 Steve Lhomme
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/d3d11va.h b/libavcodec/d3d11va.h
index 9264ec6ed4..6816b6c1e6 100644
--- a/libavcodec/d3d11va.h
+++ b/libavcodec/d3d11va.h
@@ -4,20 +4,20 @@
  * copyright (c) 2009 Laurent Aimar
  * copyright (c) 2015 Steve Lhomme
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,7 @@
 
 /**
  * This structure is used to provides the necessary configurations and data
- * to the Direct3D11 Libav HWAccel implementation.
+ * to the Direct3D11 FFmpeg HWAccel implementation.
  *
  * The application must make it available as AVCodecContext.hwaccel_context.
  *
@@ -88,7 +88,7 @@ typedef struct AVD3D11VAContext {
     uint64_t workaround;
 
     /**
-     * Private to the Libav AVHWAccel implementation
+     * Private to the FFmpeg AVHWAccel implementation
      */
     unsigned report_id;
 
diff --git a/libavcodec/dca.c b/libavcodec/dca.c
index ebe9fdb47c..8dd043088e 100644
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@@ -1,20 +1,24 @@
 /*
  * DCA compatible decoder data
+ * Copyright (C) 2004 Gildas Bazin
+ * Copyright (C) 2004 Benjamin Zores
+ * Copyright (C) 2006 Benjamin Larsson
+ * Copyright (C) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +36,7 @@ const uint32_t avpriv_dca_sample_rates[16] = {
     12000, 24000, 48000, 96000, 192000
 };
 
-int ff_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
+int avpriv_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
                              int max_size)
 {
     uint32_t mrk;
diff --git a/libavcodec/dca.h b/libavcodec/dca.h
index 887eb375f2..897ebf4b73 100644
--- a/libavcodec/dca.h
+++ b/libavcodec/dca.h
@@ -5,20 +5,20 @@
  * Copyright (C) 2006 Benjamin Larsson
  * Copyright (C) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,6 +41,8 @@
 #define DCA_SUBFRAMES_MAX     (16)
 #define DCA_BLOCKS_MAX        (16)
 #define DCA_LFE_MAX            (3)
+#define DCA_CHSETS_MAX         (4)
+#define DCA_CHSET_CHANS_MAX    (8)
 
 #define DCA_PRIM_CHANNELS_MAX  (7)
 #define DCA_ABITS_MAX         (32)      /* Should be 28 */
@@ -131,7 +133,7 @@ typedef struct QMF64_table {
 } QMF64_table;
 
 typedef struct DCAContext {
-    AVClass *class;             ///< class for AVOptions
+    const AVClass *class;       ///< class for AVOptions
     AVCodecContext *avctx;
     /* Frame header */
     int frame_type;             ///< type of the current frame
@@ -234,6 +236,20 @@ typedef struct DCAContext {
     int xch_base_channel;       ///< index of first (only) channel containing XCH data
     int xch_disable;            ///< whether the XCh extension should be decoded or not
 
+    /* XXCH extension information */
+    int xxch_chset;
+    int xxch_nbits_spk_mask;
+    uint32_t xxch_core_spkmask;
+    uint32_t xxch_spk_masks[4]; /* speaker masks, last element is core mask */
+    int xxch_chset_nch[4];
+    float xxch_dmix_sf[DCA_CHSETS_MAX];
+
+    uint32_t xxch_dmix_embedded;  /* lower layer has mix pre-embedded, per chset */
+    float xxch_dmix_coeff[DCA_PRIM_CHANNELS_MAX][32]; /* worst case sizing */
+
+    int8_t xxch_order_tab[32];
+    int8_t lfe_index;
+
     /* XLL extension information */
     int xll_disable;
     int xll_nch_sets;           ///< number of channel sets per frame
@@ -262,7 +278,7 @@ typedef struct DCAContext {
     int one2one_map_chtospkr;
 
     int debug_flag;             ///< used for suppressing repeated error messages output
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext imdct;
     SynthFilterContext synth;
     DCADSPContext dcadsp;
@@ -275,9 +291,12 @@ extern av_export const uint32_t avpriv_dca_sample_rates[16];
 /**
  * Convert bitstream to one representation based on sync marker
  */
-int ff_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
+int avpriv_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
                              int max_size);
 
+int ff_dca_xbr_parse_frame(DCAContext *s);
+int ff_dca_xxch_decode_frame(DCAContext *s);
+
 void ff_dca_exss_parse_header(DCAContext *s);
 
 int ff_dca_xll_decode_header(DCAContext *s);
diff --git a/libavcodec/dca_exss.c b/libavcodec/dca_exss.c
index 2895e2096e..ed014906ce 100644
--- a/libavcodec/dca_exss.c
+++ b/libavcodec/dca_exss.c
@@ -1,20 +1,20 @@
 /*
  * DCA ExSS extension
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -316,6 +316,8 @@ void ff_dca_exss_parse_header(DCAContext *s)
         }
     }
 
+    av_assert0(num_assets > 0); // silence a warning
+
     for (i = 0; i < num_assets; i++)
         asset_size[i] = get_bits_long(&s->gb, 16 + 4 * blownup) + 1;
 
@@ -324,7 +326,6 @@ void ff_dca_exss_parse_header(DCAContext *s)
             return;
     }
 
-    if (num_assets > 0) {
         j = get_bits_count(&s->gb);
         if (start_pos + hdrsize * 8 > j)
             skip_bits_long(&s->gb, start_pos + hdrsize * 8 - j);
@@ -337,6 +338,13 @@ void ff_dca_exss_parse_header(DCAContext *s)
 
             /* parse extensions that we know about */
             switch (mkr) {
+            case DCA_SYNCWORD_XBR:
+                ff_dca_xbr_parse_frame(s);
+                break;
+            case DCA_SYNCWORD_XXCH:
+                ff_dca_xxch_decode_frame(s);
+                s->core_ext_mask |= DCA_EXT_XXCH; /* xxx use for chan reordering */
+                break;
             case DCA_SYNCWORD_XLL:
                 if (s->xll_disable) {
                     av_log(s->avctx, AV_LOG_DEBUG,
@@ -349,11 +357,9 @@ void ff_dca_exss_parse_header(DCAContext *s)
                     ff_dca_xll_decode_navi(s, end_pos) == 0)
                     s->exss_ext_mask |= DCA_EXT_EXSS_XLL;
                 break;
-            case DCA_SYNCWORD_XBR:
-            case DCA_SYNCWORD_XXCH:
             default:
-                av_log(s->avctx, AV_LOG_VERBOSE,
-                       "DTS-ExSS: unknown marker = 0x%08"PRIx32"\n", mkr);
+                av_log(s->avctx, AV_LOG_DEBUG,
+                       "DTS-ExSS: unknown marker = 0x%08x\n", mkr);
             }
 
             /* skip to end of block */
@@ -364,5 +370,4 @@ void ff_dca_exss_parse_header(DCAContext *s)
             if (j < end_pos)
                 skip_bits_long(&s->gb, end_pos - j);
         }
-    }
 }
diff --git a/libavcodec/dca_parser.c b/libavcodec/dca_parser.c
index c33cc9acaa..337a99d2cb 100644
--- a/libavcodec/dca_parser.c
+++ b/libavcodec/dca_parser.c
@@ -5,20 +5,20 @@
  * Copyright (C) 2006 Benjamin Larsson
  * Copyright (C) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -119,7 +119,7 @@ static int dca_parse_params(const uint8_t *buf, int buf_size, int *duration,
     if (buf_size < 12)
         return AVERROR_INVALIDDATA;
 
-    if ((ret = ff_dca_convert_bitstream(buf, 12, hdr, 12)) < 0)
+    if ((ret = avpriv_dca_convert_bitstream(buf, 12, hdr, 12)) < 0)
         return ret;
 
     init_get_bits(&gb, hdr, 96);
diff --git a/libavcodec/dca_syncwords.h b/libavcodec/dca_syncwords.h
index 07b60e080d..3466b6bc6f 100644
--- a/libavcodec/dca_syncwords.h
+++ b/libavcodec/dca_syncwords.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dca_xll.c b/libavcodec/dca_xll.c
index 5a558b8c48..98fd4c8eaa 100644
--- a/libavcodec/dca_xll.c
+++ b/libavcodec/dca_xll.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2012 Paul B Mahol
  * Copyright (C) 2014 Niels Möller
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -393,7 +393,7 @@ static void dca_xll_inv_adapt_pred(int *samples, int nsamples, unsigned order,
             for (; j < order; j++)
                 s += (int64_t) c[j] * prev[DCA_XLL_AORDER_MAX + i - 1 - j];
 
-            samples[i] -= av_clip((s + 0x8000) >> 16, -0x1000000, 0xffffff);
+            samples[i] -= av_clip_intp2((s + 0x8000) >> 16, 24);
         }
     }
     for (i = order; i < nsamples; i++) {
@@ -402,7 +402,7 @@ static void dca_xll_inv_adapt_pred(int *samples, int nsamples, unsigned order,
 
         /* NOTE: Equations seem to imply addition, while the
          * pseudocode seems to use subtraction.*/
-        samples[i] -= av_clip((s + 0x8000) >> 16, -0x1000000, 0xffffff);
+        samples[i] -= av_clip_intp2((s + 0x8000) >> 16, 24);
     }
 }
 
diff --git a/libavcodec/dcadata.c b/libavcodec/dcadata.c
index 1db1938b29..5d7d5943a1 100644
--- a/libavcodec/dcadata.c
+++ b/libavcodec/dcadata.c
@@ -3,27 +3,29 @@
  * Copyright (C) 2004 Gildas Bazin
  * Copyright (c) 2006 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdint.h>
 
+#include "libavutil/channel_layout.h"
 #include "libavutil/mem.h"
 
+#include "dca.h"
 #include "dcadata.h"
 
 /* Generic tables */
@@ -8316,6 +8318,95 @@ const int32_t ff_dca_sampling_freqs[16] = {
  * where Ch(n) represents the subband samples in the (n)th audio channel.
  */
 
+const uint32_t ff_dca_map_xxch_to_native[28] = {
+    AV_CH_FRONT_CENTER,
+    AV_CH_FRONT_LEFT,
+    AV_CH_FRONT_RIGHT,
+    AV_CH_SIDE_LEFT,
+    AV_CH_SIDE_RIGHT,
+    AV_CH_LOW_FREQUENCY,
+    AV_CH_BACK_CENTER,
+    AV_CH_BACK_LEFT,
+    AV_CH_BACK_RIGHT,
+    AV_CH_SIDE_LEFT,           /* side surround left -- dup sur side L */
+    AV_CH_SIDE_RIGHT,          /* side surround right -- dup sur side R */
+    AV_CH_FRONT_LEFT_OF_CENTER,
+    AV_CH_FRONT_RIGHT_OF_CENTER,
+    AV_CH_TOP_FRONT_LEFT,
+    AV_CH_TOP_FRONT_CENTER,
+    AV_CH_TOP_FRONT_RIGHT,
+    AV_CH_LOW_FREQUENCY,        /* lfe2 -- duplicate lfe1 position */
+    AV_CH_FRONT_LEFT_OF_CENTER, /* side front left -- dup front cntr L */
+    AV_CH_FRONT_RIGHT_OF_CENTER,/* side front right -- dup front cntr R */
+    AV_CH_TOP_CENTER,           /* overhead */
+    AV_CH_TOP_FRONT_LEFT,       /* side high left -- dup */
+    AV_CH_TOP_FRONT_RIGHT,      /* side high right -- dup */
+    AV_CH_TOP_BACK_CENTER,
+    AV_CH_TOP_BACK_LEFT,
+    AV_CH_TOP_BACK_RIGHT,
+    AV_CH_BACK_CENTER,          /* rear low center -- dup */
+    AV_CH_BACK_LEFT,            /* rear low left -- dup */
+    AV_CH_BACK_RIGHT            /* read low right -- dup  */
+};
+
+/* -1 are reserved or unknown */
+const int ff_dca_ext_audio_descr_mask[8] = {
+    DCA_EXT_XCH,
+    -1,
+    DCA_EXT_X96,
+    DCA_EXT_XCH | DCA_EXT_X96,
+    -1,
+    -1,
+    DCA_EXT_XXCH,
+    -1,
+};
+
+/* Tables for mapping dts channel configurations to libavcodec multichannel api.
+ * Some compromises have been made for special configurations. Most configurations
+ * are never used so complete accuracy is not needed.
+ *
+ * L = left, R = right, C = center, S = surround, F = front, R = rear, T = total, OV = overhead.
+ * S  -> side, when both rear and back are configured move one of them to the side channel
+ * OV -> center back
+ * All 2 channel configurations -> AV_CH_LAYOUT_STEREO
+ */
+const uint64_t ff_dca_core_channel_layout[16] = {
+    AV_CH_FRONT_CENTER,                                                     ///< 1, A
+    AV_CH_LAYOUT_STEREO,                                                    ///< 2, A + B (dual mono)
+    AV_CH_LAYOUT_STEREO,                                                    ///< 2, L + R (stereo)
+    AV_CH_LAYOUT_STEREO,                                                    ///< 2, (L + R) + (L - R) (sum-difference)
+    AV_CH_LAYOUT_STEREO,                                                    ///< 2, LT + RT (left and right total)
+    AV_CH_LAYOUT_STEREO | AV_CH_FRONT_CENTER,                               ///< 3, C + L + R
+    AV_CH_LAYOUT_STEREO | AV_CH_BACK_CENTER,                                ///< 3, L + R + S
+    AV_CH_LAYOUT_STEREO | AV_CH_FRONT_CENTER | AV_CH_BACK_CENTER,           ///< 4, C + L + R + S
+    AV_CH_LAYOUT_STEREO | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT,               ///< 4, L + R + SL + SR
+
+    AV_CH_LAYOUT_STEREO | AV_CH_FRONT_CENTER | AV_CH_SIDE_LEFT |
+    AV_CH_SIDE_RIGHT,                                                       ///< 5, C + L + R + SL + SR
+
+    AV_CH_LAYOUT_STEREO | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT |
+    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER,               ///< 6, CL + CR + L + R + SL + SR
+
+    AV_CH_LAYOUT_STEREO | AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT |
+    AV_CH_FRONT_CENTER  | AV_CH_BACK_CENTER,                                ///< 6, C + L + R + LR + RR + OV
+
+    AV_CH_FRONT_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER |
+    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_BACK_CENTER   |
+    AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT,                                     ///< 6, CF + CR + LF + RF + LR + RR
+
+    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_CENTER   |
+    AV_CH_FRONT_RIGHT_OF_CENTER | AV_CH_LAYOUT_STEREO |
+    AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT,                                     ///< 7, CL + C + CR + L + R + SL + SR
+
+    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER |
+    AV_CH_LAYOUT_STEREO | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT |
+    AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT,                                     ///< 8, CL + CR + L + R + SL1 + SL2 + SR1 + SR2
+
+    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_CENTER   |
+    AV_CH_FRONT_RIGHT_OF_CENTER | AV_CH_LAYOUT_STEREO |
+    AV_CH_SIDE_LEFT | AV_CH_BACK_CENTER | AV_CH_SIDE_RIGHT,                 ///< 8, CL + C + CR + L + R + SL + S + SR
+};
+
 const int8_t ff_dca_lfe_index[16] = {
     1, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, 1, 3, 2, 3
 };
diff --git a/libavcodec/dcadata.h b/libavcodec/dcadata.h
index 7a9d994ed2..1d3d605b9e 100644
--- a/libavcodec/dcadata.h
+++ b/libavcodec/dcadata.h
@@ -1,20 +1,20 @@
 /*
  * DCA compatible decoder data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,6 +58,11 @@ extern const uint32_t ff_dca_inv_dmixtable[FF_DCA_INV_DMIXTABLE_SIZE];
 
 extern const float ff_dca_default_coeffs[10][6][2];
 
+extern const uint32_t ff_dca_map_xxch_to_native[28];
+extern const int ff_dca_ext_audio_descr_mask[8];
+
+extern const uint64_t ff_dca_core_channel_layout[16];
+
 extern const int32_t ff_dca_sampling_freqs[16];
 
 extern const int8_t ff_dca_lfe_index[16];
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
index 33658288af..bf28f3bca1 100644
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -7,20 +7,20 @@
  * Copyright (C) 2012 Paul B Mahol
  * Copyright (C) 2014 Niels Möller
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,7 +49,6 @@
 #include "get_bits.h"
 #include "internal.h"
 #include "mathops.h"
-#include "put_bits.h"
 #include "synth_filter.h"
 
 #if ARCH_ARM
@@ -70,62 +69,36 @@ enum DCAMode {
     DCA_4F2R
 };
 
-/* -1 are reserved or unknown */
-static const int dca_ext_audio_descr_mask[] = {
-    DCA_EXT_XCH,
-    -1,
-    DCA_EXT_X96,
-    DCA_EXT_XCH | DCA_EXT_X96,
-    -1,
-    -1,
-    DCA_EXT_XXCH,
-    -1,
-};
 
-/* Tables for mapping dts channel configurations to libavcodec multichannel api.
- * Some compromises have been made for special configurations. Most configurations
- * are never used so complete accuracy is not needed.
- *
- * L = left, R = right, C = center, S = surround, F = front, R = rear, T = total, OV = overhead.
- * S  -> side, when both rear and back are configured move one of them to the side channel
- * OV -> center back
- * All 2 channel configurations -> AV_CH_LAYOUT_STEREO
- */
-static const uint64_t dca_core_channel_layout[] = {
-    AV_CH_FRONT_CENTER,                                                     ///< 1, A
-    AV_CH_LAYOUT_STEREO,                                                    ///< 2, A + B (dual mono)
-    AV_CH_LAYOUT_STEREO,                                                    ///< 2, L + R (stereo)
-    AV_CH_LAYOUT_STEREO,                                                    ///< 2, (L + R) + (L - R) (sum-difference)
-    AV_CH_LAYOUT_STEREO,                                                    ///< 2, LT + RT (left and right total)
-    AV_CH_LAYOUT_STEREO | AV_CH_FRONT_CENTER,                               ///< 3, C + L + R
-    AV_CH_LAYOUT_STEREO | AV_CH_BACK_CENTER,                                ///< 3, L + R + S
-    AV_CH_LAYOUT_STEREO | AV_CH_FRONT_CENTER | AV_CH_BACK_CENTER,           ///< 4, C + L + R + S
-    AV_CH_LAYOUT_STEREO | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT,               ///< 4, L + R + SL + SR
-
-    AV_CH_LAYOUT_STEREO | AV_CH_FRONT_CENTER | AV_CH_SIDE_LEFT |
-    AV_CH_SIDE_RIGHT,                                                       ///< 5, C + L + R + SL + SR
-
-    AV_CH_LAYOUT_STEREO | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT |
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER,               ///< 6, CL + CR + L + R + SL + SR
-
-    AV_CH_LAYOUT_STEREO | AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT |
-    AV_CH_FRONT_CENTER  | AV_CH_BACK_CENTER,                                ///< 6, C + L + R + LR + RR + OV
-
-    AV_CH_FRONT_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER |
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_BACK_CENTER   |
-    AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT,                                     ///< 6, CF + CR + LF + RF + LR + RR
-
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_CENTER   |
-    AV_CH_FRONT_RIGHT_OF_CENTER | AV_CH_LAYOUT_STEREO |
-    AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT,                                     ///< 7, CL + C + CR + L + R + SL + SR
-
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER |
-    AV_CH_LAYOUT_STEREO | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT |
-    AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT,                                     ///< 8, CL + CR + L + R + SL1 + SL2 + SR1 + SR2
-
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_CENTER   |
-    AV_CH_FRONT_RIGHT_OF_CENTER | AV_CH_LAYOUT_STEREO |
-    AV_CH_SIDE_LEFT | AV_CH_BACK_CENTER | AV_CH_SIDE_RIGHT,                 ///< 8, CL + C + CR + L + R + SL + S + SR
+enum DCAXxchSpeakerMask {
+    DCA_XXCH_FRONT_CENTER          = 0x0000001,
+    DCA_XXCH_FRONT_LEFT            = 0x0000002,
+    DCA_XXCH_FRONT_RIGHT           = 0x0000004,
+    DCA_XXCH_SIDE_REAR_LEFT        = 0x0000008,
+    DCA_XXCH_SIDE_REAR_RIGHT       = 0x0000010,
+    DCA_XXCH_LFE1                  = 0x0000020,
+    DCA_XXCH_REAR_CENTER           = 0x0000040,
+    DCA_XXCH_SURROUND_REAR_LEFT    = 0x0000080,
+    DCA_XXCH_SURROUND_REAR_RIGHT   = 0x0000100,
+    DCA_XXCH_SIDE_SURROUND_LEFT    = 0x0000200,
+    DCA_XXCH_SIDE_SURROUND_RIGHT   = 0x0000400,
+    DCA_XXCH_FRONT_CENTER_LEFT     = 0x0000800,
+    DCA_XXCH_FRONT_CENTER_RIGHT    = 0x0001000,
+    DCA_XXCH_FRONT_HIGH_LEFT       = 0x0002000,
+    DCA_XXCH_FRONT_HIGH_CENTER     = 0x0004000,
+    DCA_XXCH_FRONT_HIGH_RIGHT      = 0x0008000,
+    DCA_XXCH_LFE2                  = 0x0010000,
+    DCA_XXCH_SIDE_FRONT_LEFT       = 0x0020000,
+    DCA_XXCH_SIDE_FRONT_RIGHT      = 0x0040000,
+    DCA_XXCH_OVERHEAD              = 0x0080000,
+    DCA_XXCH_SIDE_HIGH_LEFT        = 0x0100000,
+    DCA_XXCH_SIDE_HIGH_RIGHT       = 0x0200000,
+    DCA_XXCH_REAR_HIGH_CENTER      = 0x0400000,
+    DCA_XXCH_REAR_HIGH_LEFT        = 0x0800000,
+    DCA_XXCH_REAR_HIGH_RIGHT       = 0x1000000,
+    DCA_XXCH_REAR_LOW_CENTER       = 0x2000000,
+    DCA_XXCH_REAR_LOW_LEFT         = 0x4000000,
+    DCA_XXCH_REAR_LOW_RIGHT        = 0x8000000,
 };
 
 #define DCA_DOLBY                  101           /* FIXME */
@@ -161,6 +134,8 @@ static av_always_inline int get_bitalloc(GetBitContext *gb, BitAlloc *ba,
            ba->offset;
 }
 
+static float dca_dmix_code(unsigned code);
+
 static av_cold void dca_init_vlcs(void)
 {
     static int vlcs_initialized = 0;
@@ -222,16 +197,103 @@ static inline void get_array(GetBitContext *gb, int *dst, int len, int bits)
         *dst++ = get_bits(gb, bits);
 }
 
-static int dca_parse_audio_coding_header(DCAContext *s, int base_channel)
+static inline int dca_xxch2index(DCAContext *s, int xxch_ch)
+{
+    int i, base, mask;
+
+    /* locate channel set containing the channel */
+    for (i = -1, base = 0, mask = (s->xxch_core_spkmask & ~DCA_XXCH_LFE1);
+         i <= s->xxch_chset && !(mask & xxch_ch); mask = s->xxch_spk_masks[++i])
+        base += av_popcount(mask);
+
+    return base + av_popcount(mask & (xxch_ch - 1));
+}
+
+static int dca_parse_audio_coding_header(DCAContext *s, int base_channel,
+                                         int xxch)
 {
     int i, j;
     static const float adj_table[4] = { 1.0, 1.1250, 1.2500, 1.4375 };
     static const int bitlen[11] = { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3 };
     static const int thr[11]    = { 0, 1, 3, 3, 3, 3, 7, 7, 7, 7, 7 };
+    int hdr_pos = 0, hdr_size = 0;
+    float scale_factor;
+    int this_chans, acc_mask;
+    int embedded_downmix;
+    int nchans, mask[8];
+    int coeff, ichan;
+
+    /* xxch has arbitrary sized audio coding headers */
+    if (xxch) {
+        hdr_pos  = get_bits_count(&s->gb);
+        hdr_size = get_bits(&s->gb, 7) + 1;
+    }
+
+    nchans = get_bits(&s->gb, 3) + 1;
+    if (xxch && nchans >= 3) {
+        av_log(s->avctx, AV_LOG_ERROR, "nchans %d is too large\n", nchans);
+        return AVERROR_INVALIDDATA;
+    } else if (nchans + base_channel > DCA_PRIM_CHANNELS_MAX) {
+        av_log(s->avctx, AV_LOG_ERROR, "channel sum %d + %d is too large\n", nchans, base_channel);
+        return AVERROR_INVALIDDATA;
+    }
 
-    s->total_channels = get_bits(&s->gb, 3) + 1 + base_channel;
+    s->total_channels = nchans + base_channel;
     s->prim_channels  = s->total_channels;
 
+    /* obtain speaker layout mask & downmix coefficients for XXCH */
+    if (xxch) {
+        acc_mask = s->xxch_core_spkmask;
+
+        this_chans = get_bits(&s->gb, s->xxch_nbits_spk_mask - 6) << 6;
+        s->xxch_spk_masks[s->xxch_chset] = this_chans;
+        s->xxch_chset_nch[s->xxch_chset] = nchans;
+
+        for (i = 0; i <= s->xxch_chset; i++)
+            acc_mask |= s->xxch_spk_masks[i];
+
+        /* check for downmixing information */
+        if (get_bits1(&s->gb)) {
+            embedded_downmix = get_bits1(&s->gb);
+            coeff            = get_bits(&s->gb, 6);
+
+            if (coeff<1 || coeff>61) {
+                av_log(s->avctx, AV_LOG_ERROR, "6bit coeff %d is out of range\n", coeff);
+                return AVERROR_INVALIDDATA;
+            }
+
+            scale_factor     = -1.0f / dca_dmix_code((coeff<<2)-3);
+
+            s->xxch_dmix_sf[s->xxch_chset] = scale_factor;
+
+            for (i = base_channel; i < s->prim_channels; i++) {
+                mask[i] = get_bits(&s->gb, s->xxch_nbits_spk_mask);
+            }
+
+            for (j = base_channel; j < s->prim_channels; j++) {
+                memset(s->xxch_dmix_coeff[j], 0, sizeof(s->xxch_dmix_coeff[0]));
+                s->xxch_dmix_embedded |= (embedded_downmix << j);
+                for (i = 0; i < s->xxch_nbits_spk_mask; i++) {
+                    if (mask[j] & (1 << i)) {
+                        if ((1 << i) == DCA_XXCH_LFE1) {
+                            av_log(s->avctx, AV_LOG_WARNING,
+                                   "DCA-XXCH: dmix to LFE1 not supported.\n");
+                            continue;
+                        }
+
+                        coeff = get_bits(&s->gb, 7);
+                        ichan = dca_xxch2index(s, 1 << i);
+                        if ((coeff&63)<1 || (coeff&63)>61) {
+                            av_log(s->avctx, AV_LOG_ERROR, "7bit coeff %d is out of range\n", coeff);
+                            return AVERROR_INVALIDDATA;
+                        }
+                        s->xxch_dmix_coeff[j][ichan] = dca_dmix_code((coeff<<2)-3);
+                    }
+                }
+            }
+        }
+    }
+
     if (s->prim_channels > DCA_PRIM_CHANNELS_MAX)
         s->prim_channels = DCA_PRIM_CHANNELS_MAX;
 
@@ -267,9 +329,16 @@ static int dca_parse_audio_coding_header(DCAContext *s, int base_channel)
             if (s->quant_index_huffman[i][j] < thr[j])
                 s->scalefactor_adj[i][j] = adj_table[get_bits(&s->gb, 2)];
 
-    if (s->crc_present) {
-        /* Audio header CRC check */
-        get_bits(&s->gb, 16);
+    if (!xxch) {
+        if (s->crc_present) {
+            /* Audio header CRC check */
+            get_bits(&s->gb, 16);
+        }
+    } else {
+        /* Skip to the end of the header, also ignore CRC if present  */
+        i = get_bits_count(&s->gb);
+        if (hdr_pos + 8 * hdr_size > i)
+            skip_bits_long(&s->gb, hdr_pos + 8 * hdr_size - i);
     }
 
     s->current_subframe    = 0;
@@ -314,6 +383,7 @@ static int dca_parse_frame_header(DCAContext *s)
     s->predictor_history = get_bits(&s->gb, 1);
 
     if (s->lfe > 2) {
+        s->lfe = 0;
         av_log(s->avctx, AV_LOG_ERROR, "Invalid LFE value: %d\n", s->lfe);
         return AVERROR_INVALIDDATA;
     }
@@ -338,7 +408,7 @@ static int dca_parse_frame_header(DCAContext *s)
     /* Primary audio coding header */
     s->subframes = get_bits(&s->gb, 4) + 1;
 
-    return dca_parse_audio_coding_header(s, 0);
+    return dca_parse_audio_coding_header(s, 0, 0);
 }
 
 static inline int get_scale(GetBitContext *gb, int level, int value, int log2range)
@@ -368,6 +438,10 @@ static int dca_subframe_header(DCAContext *s, int base_channel, int block_index)
 
     if (!base_channel) {
         s->subsubframes[s->current_subframe]    = get_bits(&s->gb, 2) + 1;
+        if (block_index + s->subsubframes[s->current_subframe] > (s->sample_blocks / SAMPLES_PER_SUBBAND)) {
+            s->subsubframes[s->current_subframe] = 1;
+            return AVERROR_INVALIDDATA;
+        }
         s->partial_samples[s->current_subframe] = get_bits(&s->gb, 3);
     }
 
@@ -513,6 +587,7 @@ static int dca_subframe_header(DCAContext *s, int base_channel, int block_index)
 
     /* Low frequency effect data */
     if (!base_channel && s->lfe) {
+        int quant7;
         /* LFE samples */
         int lfe_samples    = 2 * s->lfe * (4 + block_index);
         int lfe_end_sample = 2 * s->lfe * (4 + block_index + s->subsubframes[s->current_subframe]);
@@ -524,8 +599,12 @@ static int dca_subframe_header(DCAContext *s, int base_channel, int block_index)
         }
 
         /* Scale factor index */
-        skip_bits(&s->gb, 1);
-        s->lfe_scale_factor = ff_dca_scale_factor_quant7[get_bits(&s->gb, 7)];
+        quant7 = get_bits(&s->gb, 8);
+        if (quant7 > 127) {
+            avpriv_request_sample(s->avctx, "LFEScaleIndex larger than 127");
+            return AVERROR_INVALIDDATA;
+        }
+        s->lfe_scale_factor = ff_dca_scale_factor_quant7[quant7];
 
         /* Quantization step size * scale factor */
         lfe_scale = 0.035 * s->lfe_scale_factor;
@@ -703,7 +782,7 @@ static void dca_downmix(float **samples, int srcfmt, int lfe_present,
     switch (srcfmt) {
     case DCA_MONO:
     case DCA_4F2R:
-        av_log(NULL, 0, "Not implemented!\n");
+        av_log(NULL, AV_LOG_ERROR, "Not implemented!\n");
         break;
     case DCA_CHANNEL:
     case DCA_STEREO:
@@ -895,7 +974,7 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
                         else if (s->predictor_history)
                             sum += ff_dca_adpcm_vb[s->prediction_vq[k][l]][n - 1] *
                                    s->subband_samples_hist[k][l][m - n + 4];
-                    subband_samples[k][l][m] += sum * 1.0f / 8192;
+                    subband_samples[k][l][m] += sum * (1.0f / 8192);
                 }
             }
         }
@@ -904,7 +983,7 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
          * Decode VQ encoded high frequencies
          */
         if (s->subband_activity[k] > s->vq_start_subband[k]) {
-            if (!s->debug_flag & 0x01) {
+            if (!(s->debug_flag & 0x01)) {
                 av_log(s->avctx, AV_LOG_DEBUG,
                        "Stream with high frequencies VQ coding\n");
                 s->debug_flag |= 0x01;
@@ -964,7 +1043,7 @@ static int dca_filter_channels(DCAContext *s, int block_index, int upsample)
 
     /* Generate LFE samples for this subsubframe FIXME!!! */
     if (s->lfe) {
-        float *samples = s->samples_chanptr[ff_dca_lfe_index[s->amode]];
+        float *samples = s->samples_chanptr[s->lfe_index];
         lfe_interpolation_fir(s,
                               s->lfe_data + 2 * s->lfe * (block_index + 4),
                               samples);
@@ -972,7 +1051,7 @@ static int dca_filter_channels(DCAContext *s, int block_index, int upsample)
             unsigned i;
             /* Should apply the filter in Table 6-11 when upsampling. For
              * now, just duplicate. */
-            for (i = 511; i > 0; i--) {
+            for (i = 255; i > 0; i--) {
                 samples[2 * i]     =
                 samples[2 * i + 1] = samples[i];
             }
@@ -1137,11 +1216,242 @@ static int dca_decode_block(DCAContext *s, int base_channel, int block_index)
     return 0;
 }
 
+int ff_dca_xbr_parse_frame(DCAContext *s)
+{
+    int scale_table_high[DCA_CHSET_CHANS_MAX][DCA_SUBBANDS][2];
+    int active_bands[DCA_CHSETS_MAX][DCA_CHSET_CHANS_MAX];
+    int abits_high[DCA_CHSET_CHANS_MAX][DCA_SUBBANDS];
+    int anctemp[DCA_CHSET_CHANS_MAX];
+    int chset_fsize[DCA_CHSETS_MAX];
+    int n_xbr_ch[DCA_CHSETS_MAX];
+    int hdr_size, num_chsets, xbr_tmode, hdr_pos;
+    int i, j, k, l, chset, chan_base;
+
+    av_log(s->avctx, AV_LOG_DEBUG, "DTS-XBR: decoding XBR extension\n");
+
+    /* get bit position of sync header */
+    hdr_pos = get_bits_count(&s->gb) - 32;
+
+    hdr_size = get_bits(&s->gb, 6) + 1;
+    num_chsets = get_bits(&s->gb, 2) + 1;
+
+    for(i = 0; i < num_chsets; i++)
+        chset_fsize[i] = get_bits(&s->gb, 14) + 1;
+
+    xbr_tmode = get_bits1(&s->gb);
+
+    for(i = 0; i < num_chsets; i++) {
+        n_xbr_ch[i] = get_bits(&s->gb, 3) + 1;
+        k = get_bits(&s->gb, 2) + 5;
+        for(j = 0; j < n_xbr_ch[i]; j++) {
+            active_bands[i][j] = get_bits(&s->gb, k) + 1;
+            if (active_bands[i][j] > DCA_SUBBANDS) {
+                av_log(s->avctx, AV_LOG_ERROR, "too many active subbands (%d)\n", active_bands[i][j]);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    }
+
+    /* skip to the end of the header */
+    i = get_bits_count(&s->gb);
+    if(hdr_pos + hdr_size * 8 > i)
+        skip_bits_long(&s->gb, hdr_pos + hdr_size * 8 - i);
+
+    /* loop over the channel data sets */
+    /* only decode as many channels as we've decoded base data for */
+    for(chset = 0, chan_base = 0;
+        chset < num_chsets && chan_base + n_xbr_ch[chset] <= s->prim_channels;
+        chan_base += n_xbr_ch[chset++]) {
+        int start_posn = get_bits_count(&s->gb);
+        int subsubframe = 0;
+        int subframe = 0;
+
+        /* loop over subframes */
+        for (k = 0; k < (s->sample_blocks / 8); k++) {
+            /* parse header if we're on first subsubframe of a block */
+            if(subsubframe == 0) {
+                /* Parse subframe header */
+                for(i = 0; i < n_xbr_ch[chset]; i++) {
+                    anctemp[i] = get_bits(&s->gb, 2) + 2;
+                }
+
+                for(i = 0; i < n_xbr_ch[chset]; i++) {
+                    get_array(&s->gb, abits_high[i], active_bands[chset][i], anctemp[i]);
+                }
+
+                for(i = 0; i < n_xbr_ch[chset]; i++) {
+                    anctemp[i] = get_bits(&s->gb, 3);
+                    if(anctemp[i] < 1) {
+                        av_log(s->avctx, AV_LOG_ERROR, "DTS-XBR: SYNC ERROR\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                }
+
+                /* generate scale factors */
+                for(i = 0; i < n_xbr_ch[chset]; i++) {
+                    const uint32_t *scale_table;
+                    int nbits;
+                    int scale_table_size;
+
+                    if (s->scalefactor_huffman[chan_base+i] == 6) {
+                        scale_table = ff_dca_scale_factor_quant7;
+                        scale_table_size = FF_ARRAY_ELEMS(ff_dca_scale_factor_quant7);
+                    } else {
+                        scale_table = ff_dca_scale_factor_quant6;
+                        scale_table_size = FF_ARRAY_ELEMS(ff_dca_scale_factor_quant6);
+                    }
+
+                    nbits = anctemp[i];
+
+                    for(j = 0; j < active_bands[chset][i]; j++) {
+                        if(abits_high[i][j] > 0) {
+                            int index = get_bits(&s->gb, nbits);
+                            if (index >= scale_table_size) {
+                                av_log(s->avctx, AV_LOG_ERROR, "scale table index %d invalid\n", index);
+                                return AVERROR_INVALIDDATA;
+                            }
+                            scale_table_high[i][j][0] = scale_table[index];
+
+                            if(xbr_tmode && s->transition_mode[i][j]) {
+                                int index = get_bits(&s->gb, nbits);
+                                if (index >= scale_table_size) {
+                                    av_log(s->avctx, AV_LOG_ERROR, "scale table index %d invalid\n", index);
+                                    return AVERROR_INVALIDDATA;
+                                }
+                                scale_table_high[i][j][1] = scale_table[index];
+                            }
+                        }
+                    }
+                }
+            }
+
+            /* decode audio array for this block */
+            for(i = 0; i < n_xbr_ch[chset]; i++) {
+                for(j = 0; j < active_bands[chset][i]; j++) {
+                    const int xbr_abits = abits_high[i][j];
+                    const float quant_step_size = ff_dca_lossless_quant_d[xbr_abits];
+                    const int sfi = xbr_tmode && s->transition_mode[i][j] && subsubframe >= s->transition_mode[i][j];
+                    const float rscale = quant_step_size * scale_table_high[i][j][sfi];
+                    float *subband_samples = s->subband_samples[k][chan_base+i][j];
+                    int block[8];
+
+                    if(xbr_abits <= 0)
+                        continue;
+
+                    if(xbr_abits > 7) {
+                        get_array(&s->gb, block, 8, xbr_abits - 3);
+                    } else {
+                        int block_code1, block_code2, size, levels, err;
+
+                        size   = abits_sizes[xbr_abits - 1];
+                        levels = abits_levels[xbr_abits - 1];
+
+                        block_code1 = get_bits(&s->gb, size);
+                        block_code2 = get_bits(&s->gb, size);
+                        err = decode_blockcodes(block_code1, block_code2,
+                                                levels, block);
+                        if (err) {
+                            av_log(s->avctx, AV_LOG_ERROR,
+                                   "ERROR: DTS-XBR: block code look-up failed\n");
+                            return AVERROR_INVALIDDATA;
+                        }
+                    }
+
+                    /* scale & sum into subband */
+                    for(l = 0; l < 8; l++)
+                        subband_samples[l] += (float)block[l] * rscale;
+                }
+            }
+
+            /* check DSYNC marker */
+            if(s->aspf || subsubframe == s->subsubframes[subframe] - 1) {
+                if(get_bits(&s->gb, 16) != 0xffff) {
+                    av_log(s->avctx, AV_LOG_ERROR, "DTS-XBR: Didn't get subframe DSYNC\n");
+                    return AVERROR_INVALIDDATA;
+                }
+            }
+
+            /* advance sub-sub-frame index */
+            if(++subsubframe >= s->subsubframes[subframe]) {
+                subsubframe = 0;
+                subframe++;
+            }
+        }
+
+        /* skip to next channel set */
+        i = get_bits_count(&s->gb);
+        if(start_posn + chset_fsize[chset] * 8 != i) {
+            j = start_posn + chset_fsize[chset] * 8 - i;
+            if(j < 0 || j >= 8)
+                av_log(s->avctx, AV_LOG_ERROR, "DTS-XBR: end of channel set,"
+                       " skipping further than expected (%d bits)\n", j);
+            skip_bits_long(&s->gb, j);
+        }
+    }
+
+    return 0;
+}
+
+
+/* parse initial header for XXCH and dump details */
+int ff_dca_xxch_decode_frame(DCAContext *s)
+{
+    int hdr_size, spkmsk_bits, num_chsets, core_spk, hdr_pos;
+    int i, chset, base_channel, chstart, fsize[8];
+
+    /* assume header word has already been parsed */
+    hdr_pos     = get_bits_count(&s->gb) - 32;
+    hdr_size    = get_bits(&s->gb, 6) + 1;
+  /*chhdr_crc   =*/ skip_bits1(&s->gb);
+    spkmsk_bits = get_bits(&s->gb, 5) + 1;
+    num_chsets  = get_bits(&s->gb, 2) + 1;
+
+    for (i = 0; i < num_chsets; i++)
+        fsize[i] = get_bits(&s->gb, 14) + 1;
+
+    core_spk               = get_bits(&s->gb, spkmsk_bits);
+    s->xxch_core_spkmask   = core_spk;
+    s->xxch_nbits_spk_mask = spkmsk_bits;
+    s->xxch_dmix_embedded  = 0;
+
+    /* skip to the end of the header */
+    i = get_bits_count(&s->gb);
+    if (hdr_pos + hdr_size * 8 > i)
+        skip_bits_long(&s->gb, hdr_pos + hdr_size * 8 - i);
+
+    for (chset = 0; chset < num_chsets; chset++) {
+        chstart       = get_bits_count(&s->gb);
+        base_channel  = s->prim_channels;
+        s->xxch_chset = chset;
+
+        /* XXCH and Core headers differ, see 6.4.2 "XXCH Channel Set Header" vs.
+           5.3.2 "Primary Audio Coding Header", DTS Spec 1.3.1 */
+        dca_parse_audio_coding_header(s, base_channel, 1);
+
+        /* decode channel data */
+        for (i = 0; i < (s->sample_blocks / 8); i++) {
+            if (dca_decode_block(s, base_channel, i)) {
+                av_log(s->avctx, AV_LOG_ERROR,
+                       "Error decoding DTS-XXCH extension\n");
+                continue;
+            }
+        }
+
+        /* skip to end of this section */
+        i = get_bits_count(&s->gb);
+        if (chstart + fsize[chset] * 8 > i)
+            skip_bits_long(&s->gb, chstart + fsize[chset] * 8 - i);
+    }
+    s->xxch_chset = num_chsets;
+
+    return 0;
+}
+
 static float dca_dmix_code(unsigned code)
 {
     int sign = (code >> 8) - 1;
     code &= 0xff;
-    return ((ff_dca_dmixtable[code] ^ sign) - sign) * (1.0 / (1U << 15));
+    return ((ff_dca_dmixtable[code] ^ sign) - sign) * (1.0 / (1 << 15));
 }
 
 static int scan_for_extensions(AVCodecContext *avctx)
@@ -1153,7 +1463,7 @@ static int scan_for_extensions(AVCodecContext *avctx)
 
     /* only scan for extensions if ext_descr was unknown or indicated a
      * supported XCh extension */
-    if (s->core_ext_mask < 0 || s->core_ext_mask & DCA_EXT_XCH) {
+    if (s->core_ext_mask < 0 || s->core_ext_mask & (DCA_EXT_XCH | DCA_EXT_XXCH)) {
         /* if ext_descr was unknown, clear s->core_ext_mask so that the
          * extensions scan can fill it up */
         s->core_ext_mask = FFMAX(s->core_ext_mask, 0);
@@ -1191,8 +1501,13 @@ static int scan_for_extensions(AVCodecContext *avctx)
                     continue;
                 }
 
+                if (s->xch_base_channel < 2) {
+                    avpriv_request_sample(avctx, "XCh with fewer than 2 base channels");
+                    continue;
+                }
+
                 /* much like core primary audio coding header */
-                dca_parse_audio_coding_header(s, s->xch_base_channel);
+                dca_parse_audio_coding_header(s, s->xch_base_channel, 0);
 
                 for (i = 0; i < (s->sample_blocks / 8); i++)
                     if ((ret = dca_decode_block(s, s->xch_base_channel, i))) {
@@ -1208,6 +1523,7 @@ static int scan_for_extensions(AVCodecContext *avctx)
                 /* usually found either in core or HD part in DTS-HD HRA streams,
                  * but not in DTS-ES which contains XCh extensions instead */
                 s->core_ext_mask |= DCA_EXT_XXCH;
+                ff_dca_xxch_decode_frame(s);
                 break;
 
             case 0x1d95f262: {
@@ -1246,95 +1562,134 @@ static int scan_for_extensions(AVCodecContext *avctx)
     return ret;
 }
 
-static int set_channel_layout(AVCodecContext *avctx, int channels, int num_core_channels)
+static int set_channel_layout(AVCodecContext *avctx, int *channels, int num_core_channels)
 {
     DCAContext *s = avctx->priv_data;
-    int i;
+    int i, j, chset, mask;
+    int channel_layout, channel_mask;
+    int posn, lavc;
+
+    /* If we have XXCH then the channel layout is managed differently */
+    /* note that XLL will also have another way to do things */
+    if (!(s->core_ext_mask & DCA_EXT_XXCH)) {
+        /* xxx should also do MA extensions */
+        if (s->amode < 16) {
+            avctx->channel_layout = ff_dca_core_channel_layout[s->amode];
+
+            if (s->prim_channels + !!s->lfe > 2 &&
+                avctx->request_channel_layout == AV_CH_LAYOUT_STEREO) {
+                /*
+                 * Neither the core's auxiliary data nor our default tables contain
+                 * downmix coefficients for the additional channel coded in the XCh
+                 * extension, so when we're doing a Stereo downmix, don't decode it.
+                 */
+                s->xch_disable = 1;
+            }
 
-    if (s->amode < 16) {
-        avctx->channel_layout = dca_core_channel_layout[s->amode];
+            if (s->xch_present && !s->xch_disable) {
+                if (avctx->channel_layout & AV_CH_BACK_CENTER) {
+                    avpriv_request_sample(avctx, "XCh with Back center channel");
+                    return AVERROR_INVALIDDATA;
+                }
+                avctx->channel_layout |= AV_CH_BACK_CENTER;
+                if (s->lfe) {
+                    avctx->channel_layout |= AV_CH_LOW_FREQUENCY;
+                    s->channel_order_tab = ff_dca_channel_reorder_lfe_xch[s->amode];
+                } else {
+                    s->channel_order_tab = ff_dca_channel_reorder_nolfe_xch[s->amode];
+                }
+                if (s->channel_order_tab[s->xch_base_channel] < 0)
+                    return AVERROR_INVALIDDATA;
+            } else {
+                *channels       = num_core_channels + !!s->lfe;
+                s->xch_present = 0; /* disable further xch processing */
+                if (s->lfe) {
+                    avctx->channel_layout |= AV_CH_LOW_FREQUENCY;
+                    s->channel_order_tab = ff_dca_channel_reorder_lfe[s->amode];
+                } else
+                    s->channel_order_tab = ff_dca_channel_reorder_nolfe[s->amode];
+            }
 
-        if (s->prim_channels + !!s->lfe > 2 &&
-            avctx->request_channel_layout == AV_CH_LAYOUT_STEREO) {
-            /*
-             * Neither the core's auxiliary data nor our default tables contain
-             * downmix coefficients for the additional channel coded in the XCh
-             * extension, so when we're doing a Stereo downmix, don't decode it.
-             */
-            s->xch_disable = 1;
-        }
+            if (*channels > !!s->lfe &&
+                s->channel_order_tab[*channels - 1 - !!s->lfe] < 0)
+                return AVERROR_INVALIDDATA;
 
-        if (s->xch_present && !s->xch_disable) {
-            avctx->channel_layout |= AV_CH_BACK_CENTER;
-            if (s->lfe) {
-                avctx->channel_layout |= AV_CH_LOW_FREQUENCY;
-                s->channel_order_tab = ff_dca_channel_reorder_lfe_xch[s->amode];
-            } else {
-                s->channel_order_tab = ff_dca_channel_reorder_nolfe_xch[s->amode];
+            if (av_get_channel_layout_nb_channels(avctx->channel_layout) != *channels) {
+                av_log(avctx, AV_LOG_ERROR, "Number of channels %d mismatches layout %d\n", *channels, av_get_channel_layout_nb_channels(avctx->channel_layout));
+                return AVERROR_INVALIDDATA;
+            }
+
+            if (num_core_channels + !!s->lfe > 2 &&
+                avctx->request_channel_layout == AV_CH_LAYOUT_STEREO) {
+                *channels              = 2;
+                s->output             = s->prim_channels == 2 ? s->amode : DCA_STEREO;
+                avctx->channel_layout = AV_CH_LAYOUT_STEREO;
             }
+            else if (avctx->request_channel_layout & AV_CH_LAYOUT_NATIVE) {
+                static const int8_t dca_channel_order_native[9] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
+                s->channel_order_tab = dca_channel_order_native;
+            }
+            s->lfe_index = ff_dca_lfe_index[s->amode];
         } else {
-            channels       = num_core_channels + !!s->lfe;
-            s->xch_present = 0; /* disable further xch processing */
-            if (s->lfe) {
-                avctx->channel_layout |= AV_CH_LOW_FREQUENCY;
-                s->channel_order_tab = ff_dca_channel_reorder_lfe[s->amode];
-            } else
-                s->channel_order_tab = ff_dca_channel_reorder_nolfe[s->amode];
+            av_log(avctx, AV_LOG_ERROR,
+                   "Non standard configuration %d !\n", s->amode);
+            return AVERROR_INVALIDDATA;
         }
 
-        if (channels > !!s->lfe &&
-            s->channel_order_tab[channels - 1 - !!s->lfe] < 0)
-            return AVERROR_INVALIDDATA;
+        s->xxch_dmix_embedded = 0;
+    } else {
+        /* we only get here if an XXCH channel set can be added to the mix */
+        channel_mask = s->xxch_core_spkmask;
 
-        if (num_core_channels + !!s->lfe > 2 &&
-            avctx->request_channel_layout == AV_CH_LAYOUT_STEREO) {
-            channels              = 2;
-            s->output             = s->prim_channels == 2 ? s->amode : DCA_STEREO;
-            avctx->channel_layout = AV_CH_LAYOUT_STEREO;
+        {
+            *channels = s->prim_channels + !!s->lfe;
+            for (i = 0; i < s->xxch_chset; i++) {
+                channel_mask |= s->xxch_spk_masks[i];
+            }
+        }
 
-            /* Stereo downmix coefficients
-             *
-             * The decoder can only downmix to 2-channel, so we need to ensure
-             * embedded downmix coefficients are actually targeting 2-channel.
-             */
-            if (s->core_downmix && (s->core_downmix_amode == DCA_STEREO ||
-                                    s->core_downmix_amode == DCA_STEREO_TOTAL)) {
-                for (i = 0; i < num_core_channels + !!s->lfe; i++) {
-                    /* Range checked earlier */
-                    s->downmix_coef[i][0] = dca_dmix_code(s->core_downmix_codes[i][0]);
-                    s->downmix_coef[i][1] = dca_dmix_code(s->core_downmix_codes[i][1]);
-                }
-                s->output = s->core_downmix_amode;
-            } else {
-                int am = s->amode & DCA_CHANNEL_MASK;
-                if (am >= FF_ARRAY_ELEMS(ff_dca_default_coeffs)) {
-                    av_log(s->avctx, AV_LOG_ERROR,
-                           "Invalid channel mode %d\n", am);
-                    return AVERROR_INVALIDDATA;
-                }
-                if (num_core_channels + !!s->lfe >
-                    FF_ARRAY_ELEMS(ff_dca_default_coeffs[0])) {
-                    avpriv_request_sample(s->avctx, "Downmixing %d channels",
-                                          s->prim_channels + !!s->lfe);
-                    return AVERROR_PATCHWELCOME;
-                }
-                for (i = 0; i < num_core_channels + !!s->lfe; i++) {
-                    s->downmix_coef[i][0] = ff_dca_default_coeffs[am][i][0];
-                    s->downmix_coef[i][1] = ff_dca_default_coeffs[am][i][1];
-                }
+        /* Given the DTS spec'ed channel mask, generate an avcodec version */
+        channel_layout = 0;
+        for (i = 0; i < s->xxch_nbits_spk_mask; ++i) {
+            if (channel_mask & (1 << i)) {
+                channel_layout |= ff_dca_map_xxch_to_native[i];
             }
-            ff_dlog(s->avctx, "Stereo downmix coeffs:\n");
-            for (i = 0; i < num_core_channels + !!s->lfe; i++) {
-                ff_dlog(s->avctx, "L, input channel %d = %f\n", i,
-                        s->downmix_coef[i][0]);
-                ff_dlog(s->avctx, "R, input channel %d = %f\n", i,
-                        s->downmix_coef[i][1]);
+        }
+
+        /* make sure that we have managed to get equivalent dts/avcodec channel
+         * masks in some sense -- unfortunately some channels could overlap */
+        if (av_popcount(channel_mask) != av_popcount(channel_layout)) {
+            av_log(avctx, AV_LOG_DEBUG,
+                   "DTS-XXCH: Inconsistent avcodec/dts channel layouts\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        avctx->channel_layout = channel_layout;
+
+        if (!(avctx->request_channel_layout & AV_CH_LAYOUT_NATIVE)) {
+            /* Estimate DTS --> avcodec ordering table */
+            for (chset = -1, j = 0; chset < s->xxch_chset; ++chset) {
+                mask = chset >= 0 ? s->xxch_spk_masks[chset]
+                                  : s->xxch_core_spkmask;
+                for (i = 0; i < s->xxch_nbits_spk_mask; i++) {
+                    if (mask & ~(DCA_XXCH_LFE1 | DCA_XXCH_LFE2) & (1 << i)) {
+                        lavc = ff_dca_map_xxch_to_native[i];
+                        posn = av_popcount(channel_layout & (lavc - 1));
+                        s->xxch_order_tab[j++] = posn;
+                    }
+                }
+
             }
-            ff_dlog(s->avctx, "\n");
+
+            s->lfe_index = av_popcount(channel_layout & (AV_CH_LOW_FREQUENCY-1));
+        } else { /* native ordering */
+            for (i = 0; i < *channels; i++)
+                s->xxch_order_tab[i] = i;
+
+            s->lfe_index = *channels - 1;
         }
-    } else {
-        av_log(avctx, AV_LOG_ERROR, "Non standard configuration %d !\n", s->amode);
-        return AVERROR_INVALIDDATA;
+
+        s->channel_order_tab = s->xxch_order_tab;
     }
 
     return 0;
@@ -1350,20 +1705,30 @@ static int dca_decode_frame(AVCodecContext *avctx, void *data,
     AVFrame *frame     = data;
     const uint8_t *buf = avpkt->data;
     int buf_size       = avpkt->size;
-
     int lfe_samples;
     int num_core_channels = 0;
     int i, ret;
-    float  **samples_flt;
+    float **samples_flt;
+    float *src_chan;
+    float *dst_chan;
     DCAContext *s = avctx->priv_data;
     int channels, full_channels;
+    float scale;
+    int achan;
+    int chset;
+    int mask;
+    int j, k;
+    int endch;
     int upsample = 0;
 
     s->exss_ext_mask = 0;
     s->xch_present   = 0;
 
-    s->dca_buffer_size = ff_dca_convert_bitstream(buf, buf_size, s->dca_buffer,
-                                                  DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE);
+    s->dca_buffer_size = AVERROR_INVALIDDATA;
+    for (i = 0; i < buf_size - 3 && s->dca_buffer_size == AVERROR_INVALIDDATA; i++)
+        s->dca_buffer_size = avpriv_dca_convert_bitstream(buf + i, buf_size - i, s->dca_buffer,
+                                                          DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE);
+
     if (s->dca_buffer_size == AVERROR_INVALIDDATA) {
         av_log(avctx, AV_LOG_ERROR, "Not a valid DCA frame\n");
         return AVERROR_INVALIDDATA;
@@ -1375,7 +1740,6 @@ static int dca_decode_frame(AVCodecContext *avctx, void *data,
     }
     // set AVCodec values with parsed data
     avctx->sample_rate = s->sample_rate;
-    avctx->bit_rate    = s->bit_rate;
 
     s->profile = FF_PROFILE_DTS;
 
@@ -1389,8 +1753,51 @@ static int dca_decode_frame(AVCodecContext *avctx, void *data,
     /* record number of core channels incase less than max channels are requested */
     num_core_channels = s->prim_channels;
 
+    if (s->prim_channels + !!s->lfe > 2 &&
+        avctx->request_channel_layout == AV_CH_LAYOUT_STEREO) {
+            /* Stereo downmix coefficients
+             *
+             * The decoder can only downmix to 2-channel, so we need to ensure
+             * embedded downmix coefficients are actually targeting 2-channel.
+             */
+            if (s->core_downmix && (s->core_downmix_amode == DCA_STEREO ||
+                                    s->core_downmix_amode == DCA_STEREO_TOTAL)) {
+                for (i = 0; i < num_core_channels + !!s->lfe; i++) {
+                    /* Range checked earlier */
+                    s->downmix_coef[i][0] = dca_dmix_code(s->core_downmix_codes[i][0]);
+                    s->downmix_coef[i][1] = dca_dmix_code(s->core_downmix_codes[i][1]);
+                }
+                s->output = s->core_downmix_amode;
+            } else {
+                int am = s->amode & DCA_CHANNEL_MASK;
+                if (am >= FF_ARRAY_ELEMS(ff_dca_default_coeffs)) {
+                    av_log(s->avctx, AV_LOG_ERROR,
+                           "Invalid channel mode %d\n", am);
+                    return AVERROR_INVALIDDATA;
+                }
+                if (num_core_channels + !!s->lfe >
+                    FF_ARRAY_ELEMS(ff_dca_default_coeffs[0])) {
+                    avpriv_request_sample(s->avctx, "Downmixing %d channels",
+                                          s->prim_channels + !!s->lfe);
+                    return AVERROR_PATCHWELCOME;
+                }
+                for (i = 0; i < num_core_channels + !!s->lfe; i++) {
+                    s->downmix_coef[i][0] = ff_dca_default_coeffs[am][i][0];
+                    s->downmix_coef[i][1] = ff_dca_default_coeffs[am][i][1];
+                }
+            }
+            ff_dlog(s->avctx, "Stereo downmix coeffs:\n");
+            for (i = 0; i < num_core_channels + !!s->lfe; i++) {
+                ff_dlog(s->avctx, "L, input channel %d = %f\n", i,
+                        s->downmix_coef[i][0]);
+                ff_dlog(s->avctx, "R, input channel %d = %f\n", i,
+                        s->downmix_coef[i][1]);
+            }
+            ff_dlog(s->avctx, "\n");
+    }
+
     if (s->ext_coding)
-        s->core_ext_mask = dca_ext_audio_descr_mask[s->ext_descr];
+        s->core_ext_mask = ff_dca_ext_audio_descr_mask[s->ext_descr];
     else
         s->core_ext_mask = 0;
 
@@ -1400,10 +1807,9 @@ static int dca_decode_frame(AVCodecContext *avctx, void *data,
 
     full_channels = channels = s->prim_channels + !!s->lfe;
 
-    ret = set_channel_layout(avctx, channels, num_core_channels);
+    ret = set_channel_layout(avctx, &channels, num_core_channels);
     if (ret < 0)
         return ret;
-    avctx->channels = channels;
 
     /* get output buffer */
     frame->nb_samples = 256 * (s->sample_blocks / SAMPLES_PER_SUBBAND);
@@ -1434,20 +1840,24 @@ static int dca_decode_frame(AVCodecContext *avctx, void *data,
             /* If downmixing to stereo, don't decode additional channels.
              * FIXME: Using the xch_disable flag for this doesn't seem right. */
             if (!s->xch_disable)
-                avctx->channels += s->xll_channels - s->xll_residual_channels;
+                channels = s->xll_channels;
         }
     }
 
+    if (avctx->channels != channels) {
+        if (avctx->channels)
+            av_log(avctx, AV_LOG_INFO, "Number of channels changed in DCA decoder (%d -> %d)\n", avctx->channels, channels);
+        avctx->channels = channels;
+    }
+
     /* FIXME: This is an ugly hack, to just revert to the default
      * layout if we have additional channels. Need to convert the XLL
-     * channel masks to libav channel_layout mask. */
+     * channel masks to ffmpeg channel_layout mask. */
     if (av_get_channel_layout_nb_channels(avctx->channel_layout) != avctx->channels)
         avctx->channel_layout = 0;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples_flt = (float **) frame->extended_data;
 
     /* allocate buffer for extra channels if downmixing */
@@ -1488,8 +1898,55 @@ static int dca_decode_frame(AVCodecContext *avctx, void *data,
             float *back_chan = s->samples_chanptr[s->channel_order_tab[s->xch_base_channel]];
             float *lt_chan   = s->samples_chanptr[s->channel_order_tab[s->xch_base_channel - 2]];
             float *rt_chan   = s->samples_chanptr[s->channel_order_tab[s->xch_base_channel - 1]];
-            s->fdsp.vector_fmac_scalar(lt_chan, back_chan, -M_SQRT1_2, 256);
-            s->fdsp.vector_fmac_scalar(rt_chan, back_chan, -M_SQRT1_2, 256);
+            s->fdsp->vector_fmac_scalar(lt_chan, back_chan, -M_SQRT1_2, 256);
+            s->fdsp->vector_fmac_scalar(rt_chan, back_chan, -M_SQRT1_2, 256);
+        }
+
+        /* If stream contains XXCH, we might need to undo an embedded downmix */
+        if (s->xxch_dmix_embedded) {
+            /* Loop over channel sets in turn */
+            ch = num_core_channels;
+            for (chset = 0; chset < s->xxch_chset; chset++) {
+                endch = ch + s->xxch_chset_nch[chset];
+                mask = s->xxch_dmix_embedded;
+
+                /* undo downmix */
+                for (j = ch; j < endch; j++) {
+                    if (mask & (1 << j)) { /* this channel has been mixed-out */
+                        src_chan = s->samples_chanptr[s->channel_order_tab[j]];
+                        for (k = 0; k < endch; k++) {
+                            achan = s->channel_order_tab[k];
+                            scale = s->xxch_dmix_coeff[j][k];
+                            if (scale != 0.0) {
+                                dst_chan = s->samples_chanptr[achan];
+                                s->fdsp->vector_fmac_scalar(dst_chan, src_chan,
+                                                           -scale, 256);
+                            }
+                        }
+                    }
+                }
+
+                /* if a downmix has been embedded then undo the pre-scaling */
+                if ((mask & (1 << ch)) && s->xxch_dmix_sf[chset] != 1.0f) {
+                    scale = s->xxch_dmix_sf[chset];
+
+                    for (j = 0; j < ch; j++) {
+                        src_chan = s->samples_chanptr[s->channel_order_tab[j]];
+                        for (k = 0; k < 256; k++)
+                            src_chan[k] *= scale;
+                    }
+
+                    /* LFE channel is always part of core, scale if it exists */
+                    if (s->lfe) {
+                        src_chan = s->samples_chanptr[s->lfe_index];
+                        for (k = 0; k < 256; k++)
+                            src_chan[k] *= scale;
+                    }
+                }
+
+                ch = endch;
+            }
+
         }
     }
 
@@ -1512,6 +1969,9 @@ static int dca_decode_frame(AVCodecContext *avctx, void *data,
     if (ret < 0)
         return ret;
 
+    if (   avctx->profile != FF_PROFILE_DTS_HD_MA
+        && avctx->profile != FF_PROFILE_DTS_HD_HRA)
+        avctx->bit_rate = s->bit_rate;
     *got_frame_ptr = 1;
 
     return buf_size;
@@ -1530,7 +1990,10 @@ static av_cold int dca_decode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     dca_init_vlcs();
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
+
     ff_mdct_init(&s->imdct, 6, 1, 1.0);
     ff_synth_filter_init(&s->synth);
     ff_dcadsp_init(&s->dcadsp);
@@ -1551,6 +2014,7 @@ static av_cold int dca_decode_end(AVCodecContext *avctx)
     DCAContext *s = avctx->priv_data;
     ff_mdct_end(&s->imdct);
     av_freep(&s->extra_channels_buffer);
+    av_freep(&s->fdsp);
     av_freep(&s->xll_sample_buf);
     av_freep(&s->qmf64_table);
     return 0;
@@ -1576,6 +2040,7 @@ static const AVClass dca_decoder_class = {
     .item_name  = av_default_item_name,
     .option     = options,
     .version    = LIBAVUTIL_VERSION_INT,
+    .category   = AV_CLASS_CATEGORY_DECODER,
 };
 
 AVCodec ff_dca_decoder = {
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index 34b5da2d24..b32d962c33 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2004 Gildas Bazin
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
index 0fa75a5ed6..abf577b61f 100644
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,7 +22,7 @@
 #include "avfft.h"
 #include "synth_filter.h"
 
-#define DCA_SUBBANDS 32
+#define DCA_SUBBANDS 64
 
 typedef struct DCADSPContext {
     void (*lfe_fir[2])(float *out, const float *in, const float *coefs);
diff --git a/libavcodec/dcaenc.c b/libavcodec/dcaenc.c
new file mode 100644
index 0000000000..5a6bdac010
--- /dev/null
+++ b/libavcodec/dcaenc.c
@@ -0,0 +1,991 @@
+/*
+ * DCA encoder
+ * Copyright (C) 2008-2012 Alexander E. Patrakov
+ *               2010 Benjamin Larsson
+ *               2011 Xiang Wang
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/channel_layout.h"
+#include "libavutil/common.h"
+#include "avcodec.h"
+#include "dca.h"
+#include "dcadata.h"
+#include "dcaenc.h"
+#include "internal.h"
+#include "mathops.h"
+#include "put_bits.h"
+
+#define MAX_CHANNELS 6
+#define DCA_MAX_FRAME_SIZE 16384
+#define DCA_HEADER_SIZE 13
+#define DCA_LFE_SAMPLES 8
+
+#define DCAENC_SUBBANDS 32
+#define SUBFRAMES 1
+#define SUBSUBFRAMES 2
+#define SUBBAND_SAMPLES (SUBFRAMES * SUBSUBFRAMES * 8)
+#define AUBANDS 25
+
+typedef struct DCAEncContext {
+    PutBitContext pb;
+    int frame_size;
+    int frame_bits;
+    int fullband_channels;
+    int channels;
+    int lfe_channel;
+    int samplerate_index;
+    int bitrate_index;
+    int channel_config;
+    const int32_t *band_interpolation;
+    const int32_t *band_spectrum;
+    int lfe_scale_factor;
+    softfloat lfe_quant;
+    int32_t lfe_peak_cb;
+    const int8_t *channel_order_tab;  ///< channel reordering table, lfe and non lfe
+
+    int32_t history[512][MAX_CHANNELS]; /* This is a circular buffer */
+    int32_t subband[SUBBAND_SAMPLES][DCAENC_SUBBANDS][MAX_CHANNELS];
+    int32_t quantized[SUBBAND_SAMPLES][DCAENC_SUBBANDS][MAX_CHANNELS];
+    int32_t peak_cb[DCAENC_SUBBANDS][MAX_CHANNELS];
+    int32_t downsampled_lfe[DCA_LFE_SAMPLES];
+    int32_t masking_curve_cb[SUBSUBFRAMES][256];
+    int abits[DCAENC_SUBBANDS][MAX_CHANNELS];
+    int scale_factor[DCAENC_SUBBANDS][MAX_CHANNELS];
+    softfloat quant[DCAENC_SUBBANDS][MAX_CHANNELS];
+    int32_t eff_masking_curve_cb[256];
+    int32_t band_masking_cb[32];
+    int32_t worst_quantization_noise;
+    int32_t worst_noise_ever;
+    int consumed_bits;
+} DCAEncContext;
+
+static int32_t cos_table[2048];
+static int32_t band_interpolation[2][512];
+static int32_t band_spectrum[2][8];
+static int32_t auf[9][AUBANDS][256];
+static int32_t cb_to_add[256];
+static int32_t cb_to_level[2048];
+static int32_t lfe_fir_64i[512];
+
+/* Transfer function of outer and middle ear, Hz -> dB */
+static double hom(double f)
+{
+    double f1 = f / 1000;
+
+    return -3.64 * pow(f1, -0.8)
+           + 6.8 * exp(-0.6 * (f1 - 3.4) * (f1 - 3.4))
+           - 6.0 * exp(-0.15 * (f1 - 8.7) * (f1 - 8.7))
+           - 0.0006 * (f1 * f1) * (f1 * f1);
+}
+
+static double gammafilter(int i, double f)
+{
+    double h = (f - fc[i]) / erb[i];
+
+    h = 1 + h * h;
+    h = 1 / (h * h);
+    return 20 * log10(h);
+}
+
+static int encode_init(AVCodecContext *avctx)
+{
+    DCAEncContext *c = avctx->priv_data;
+    uint64_t layout = avctx->channel_layout;
+    int i, min_frame_bits;
+
+    c->fullband_channels = c->channels = avctx->channels;
+    c->lfe_channel = (avctx->channels == 3 || avctx->channels == 6);
+    c->band_interpolation = band_interpolation[1];
+    c->band_spectrum = band_spectrum[1];
+    c->worst_quantization_noise = -2047;
+    c->worst_noise_ever = -2047;
+
+    if (!layout) {
+        av_log(avctx, AV_LOG_WARNING, "No channel layout specified. The "
+                                      "encoder will guess the layout, but it "
+                                      "might be incorrect.\n");
+        layout = av_get_default_channel_layout(avctx->channels);
+    }
+    switch (layout) {
+    case AV_CH_LAYOUT_MONO:         c->channel_config = 0; break;
+    case AV_CH_LAYOUT_STEREO:       c->channel_config = 2; break;
+    case AV_CH_LAYOUT_2_2:          c->channel_config = 8; break;
+    case AV_CH_LAYOUT_5POINT0:      c->channel_config = 9; break;
+    case AV_CH_LAYOUT_5POINT1:      c->channel_config = 9; break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported channel layout!\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (c->lfe_channel) {
+        c->fullband_channels--;
+        c->channel_order_tab = ff_dca_channel_reorder_lfe[c->channel_config];
+    } else {
+        c->channel_order_tab = ff_dca_channel_reorder_nolfe[c->channel_config];
+    }
+
+    for (i = 0; i < 9; i++) {
+        if (sample_rates[i] == avctx->sample_rate)
+            break;
+    }
+    if (i == 9)
+        return AVERROR(EINVAL);
+    c->samplerate_index = i;
+
+    if (avctx->bit_rate < 32000 || avctx->bit_rate > 3840000) {
+        av_log(avctx, AV_LOG_ERROR, "Bit rate %"PRId64" not supported.", (int64_t)avctx->bit_rate);
+        return AVERROR(EINVAL);
+    }
+    for (i = 0; ff_dca_bit_rates[i] < avctx->bit_rate; i++)
+        ;
+    c->bitrate_index = i;
+    avctx->bit_rate = ff_dca_bit_rates[i];
+    c->frame_bits = FFALIGN((avctx->bit_rate * 512 + avctx->sample_rate - 1) / avctx->sample_rate, 32);
+    min_frame_bits = 132 + (493 + 28 * 32) * c->fullband_channels + c->lfe_channel * 72;
+    if (c->frame_bits < min_frame_bits || c->frame_bits > (DCA_MAX_FRAME_SIZE << 3))
+        return AVERROR(EINVAL);
+
+    c->frame_size = (c->frame_bits + 7) / 8;
+
+    avctx->frame_size = 32 * SUBBAND_SAMPLES;
+
+    if (!cos_table[0]) {
+        int j, k;
+
+        for (i = 0; i < 2048; i++) {
+            cos_table[i]   = (int32_t)(0x7fffffff * cos(M_PI * i / 1024));
+            cb_to_level[i] = (int32_t)(0x7fffffff * pow(10, -0.005 * i));
+        }
+
+        for (k = 0; k < 32; k++) {
+            for (j = 0; j < 8; j++) {
+                lfe_fir_64i[64 * j + k] = (int32_t)(0xffffff800000ULL * ff_dca_lfe_fir_64[8 * k + j]);
+                lfe_fir_64i[64 * (7-j) + (63 - k)] = (int32_t)(0xffffff800000ULL * ff_dca_lfe_fir_64[8 * k + j]);
+            }
+        }
+
+        for (i = 0; i < 512; i++) {
+            band_interpolation[0][i] = (int32_t)(0x1000000000ULL * ff_dca_fir_32bands_perfect[i]);
+            band_interpolation[1][i] = (int32_t)(0x1000000000ULL * ff_dca_fir_32bands_nonperfect[i]);
+        }
+
+        for (i = 0; i < 9; i++) {
+            for (j = 0; j < AUBANDS; j++) {
+                for (k = 0; k < 256; k++) {
+                    double freq = sample_rates[i] * (k + 0.5) / 512;
+
+                    auf[i][j][k] = (int32_t)(10 * (hom(freq) + gammafilter(j, freq)));
+                }
+            }
+        }
+
+        for (i = 0; i < 256; i++) {
+            double add = 1 + pow(10, -0.01 * i);
+            cb_to_add[i] = (int32_t)(100 * log10(add));
+        }
+        for (j = 0; j < 8; j++) {
+            double accum = 0;
+            for (i = 0; i < 512; i++) {
+                double reconst = ff_dca_fir_32bands_perfect[i] * ((i & 64) ? (-1) : 1);
+                accum += reconst * cos(2 * M_PI * (i + 0.5 - 256) * (j + 0.5) / 512);
+            }
+            band_spectrum[0][j] = (int32_t)(200 * log10(accum));
+        }
+        for (j = 0; j < 8; j++) {
+            double accum = 0;
+            for (i = 0; i < 512; i++) {
+                double reconst = ff_dca_fir_32bands_nonperfect[i] * ((i & 64) ? (-1) : 1);
+                accum += reconst * cos(2 * M_PI * (i + 0.5 - 256) * (j + 0.5) / 512);
+            }
+            band_spectrum[1][j] = (int32_t)(200 * log10(accum));
+        }
+    }
+    return 0;
+}
+
+static inline int32_t cos_t(int x)
+{
+    return cos_table[x & 2047];
+}
+
+static inline int32_t sin_t(int x)
+{
+    return cos_t(x - 512);
+}
+
+static inline int32_t half32(int32_t a)
+{
+    return (a + 1) >> 1;
+}
+
+static inline int32_t mul32(int32_t a, int32_t b)
+{
+    int64_t r = (int64_t)a * b + 0x80000000ULL;
+    return r >> 32;
+}
+
+static void subband_transform(DCAEncContext *c, const int32_t *input)
+{
+    int ch, subs, i, k, j;
+
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        /* History is copied because it is also needed for PSY */
+        int32_t hist[512];
+        int hist_start = 0;
+        const int chi = c->channel_order_tab[ch];
+
+        for (i = 0; i < 512; i++)
+            hist[i] = c->history[i][ch];
+
+        for (subs = 0; subs < SUBBAND_SAMPLES; subs++) {
+            int32_t accum[64];
+            int32_t resp;
+            int band;
+
+            /* Calculate the convolutions at once */
+            for (i = 0; i < 64; i++)
+                accum[i] = 0;
+
+            for (k = 0, i = hist_start, j = 0;
+                    i < 512; k = (k + 1) & 63, i++, j++)
+                accum[k] += mul32(hist[i], c->band_interpolation[j]);
+            for (i = 0; i < hist_start; k = (k + 1) & 63, i++, j++)
+                accum[k] += mul32(hist[i], c->band_interpolation[j]);
+
+            for (k = 16; k < 32; k++)
+                accum[k] = accum[k] - accum[31 - k];
+            for (k = 32; k < 48; k++)
+                accum[k] = accum[k] + accum[95 - k];
+
+            for (band = 0; band < 32; band++) {
+                resp = 0;
+                for (i = 16; i < 48; i++) {
+                    int s = (2 * band + 1) * (2 * (i + 16) + 1);
+                    resp += mul32(accum[i], cos_t(s << 3)) >> 3;
+                }
+
+                c->subband[subs][band][ch] = ((band + 1) & 2) ? -resp : resp;
+            }
+
+            /* Copy in 32 new samples from input */
+            for (i = 0; i < 32; i++)
+                hist[i + hist_start] = input[(subs * 32 + i) * c->channels + chi];
+            hist_start = (hist_start + 32) & 511;
+        }
+    }
+}
+
+static void lfe_downsample(DCAEncContext *c, const int32_t *input)
+{
+    /* FIXME: make 128x LFE downsampling possible */
+    const int lfech = ff_dca_lfe_index[c->channel_config];
+    int i, j, lfes;
+    int32_t hist[512];
+    int32_t accum;
+    int hist_start = 0;
+
+    for (i = 0; i < 512; i++)
+        hist[i] = c->history[i][c->channels - 1];
+
+    for (lfes = 0; lfes < DCA_LFE_SAMPLES; lfes++) {
+        /* Calculate the convolution */
+        accum = 0;
+
+        for (i = hist_start, j = 0; i < 512; i++, j++)
+            accum += mul32(hist[i], lfe_fir_64i[j]);
+        for (i = 0; i < hist_start; i++, j++)
+            accum += mul32(hist[i], lfe_fir_64i[j]);
+
+        c->downsampled_lfe[lfes] = accum;
+
+        /* Copy in 64 new samples from input */
+        for (i = 0; i < 64; i++)
+            hist[i + hist_start] = input[(lfes * 64 + i) * c->channels + lfech];
+
+        hist_start = (hist_start + 64) & 511;
+    }
+}
+
+typedef struct {
+    int32_t re;
+    int32_t im;
+} cplx32;
+
+static void fft(const int32_t in[2 * 256], cplx32 out[256])
+{
+    cplx32 buf[256], rin[256], rout[256];
+    int i, j, k, l;
+
+    /* do two transforms in parallel */
+    for (i = 0; i < 256; i++) {
+        /* Apply the Hann window */
+        rin[i].re = mul32(in[2 * i], 0x3fffffff - (cos_t(8 * i + 2) >> 1));
+        rin[i].im = mul32(in[2 * i + 1], 0x3fffffff - (cos_t(8 * i + 6) >> 1));
+    }
+    /* pre-rotation */
+    for (i = 0; i < 256; i++) {
+        buf[i].re = mul32(cos_t(4 * i + 2), rin[i].re)
+                  - mul32(sin_t(4 * i + 2), rin[i].im);
+        buf[i].im = mul32(cos_t(4 * i + 2), rin[i].im)
+                  + mul32(sin_t(4 * i + 2), rin[i].re);
+    }
+
+    for (j = 256, l = 1; j != 1; j >>= 1, l <<= 1) {
+        for (k = 0; k < 256; k += j) {
+            for (i = k; i < k + j / 2; i++) {
+                cplx32 sum, diff;
+                int t = 8 * l * i;
+
+                sum.re = buf[i].re + buf[i + j / 2].re;
+                sum.im = buf[i].im + buf[i + j / 2].im;
+
+                diff.re = buf[i].re - buf[i + j / 2].re;
+                diff.im = buf[i].im - buf[i + j / 2].im;
+
+                buf[i].re = half32(sum.re);
+                buf[i].im = half32(sum.im);
+
+                buf[i + j / 2].re = mul32(diff.re, cos_t(t))
+                                  - mul32(diff.im, sin_t(t));
+                buf[i + j / 2].im = mul32(diff.im, cos_t(t))
+                                  + mul32(diff.re, sin_t(t));
+            }
+        }
+    }
+    /* post-rotation */
+    for (i = 0; i < 256; i++) {
+        int b = ff_reverse[i];
+        rout[i].re = mul32(buf[b].re, cos_t(4 * i))
+                   - mul32(buf[b].im, sin_t(4 * i));
+        rout[i].im = mul32(buf[b].im, cos_t(4 * i))
+                   + mul32(buf[b].re, sin_t(4 * i));
+    }
+    for (i = 0; i < 256; i++) {
+        /* separate the results of the two transforms */
+        cplx32 o1, o2;
+
+        o1.re =  rout[i].re - rout[255 - i].re;
+        o1.im =  rout[i].im + rout[255 - i].im;
+
+        o2.re =  rout[i].im - rout[255 - i].im;
+        o2.im = -rout[i].re - rout[255 - i].re;
+
+        /* combine them into one long transform */
+        out[i].re = mul32( o1.re + o2.re, cos_t(2 * i + 1))
+                  + mul32( o1.im - o2.im, sin_t(2 * i + 1));
+        out[i].im = mul32( o1.im + o2.im, cos_t(2 * i + 1))
+                  + mul32(-o1.re + o2.re, sin_t(2 * i + 1));
+    }
+}
+
+static int32_t get_cb(int32_t in)
+{
+    int i, res;
+
+    res = 0;
+    if (in < 0)
+        in = -in;
+    for (i = 1024; i > 0; i >>= 1) {
+        if (cb_to_level[i + res] >= in)
+            res += i;
+    }
+    return -res;
+}
+
+static int32_t add_cb(int32_t a, int32_t b)
+{
+    if (a < b)
+        FFSWAP(int32_t, a, b);
+
+    if (a - b >= 256)
+        return a;
+    return a + cb_to_add[a - b];
+}
+
+static void adjust_jnd(int samplerate_index,
+                       const int32_t in[512], int32_t out_cb[256])
+{
+    int32_t power[256];
+    cplx32 out[256];
+    int32_t out_cb_unnorm[256];
+    int32_t denom;
+    const int32_t ca_cb = -1114;
+    const int32_t cs_cb = 928;
+    int i, j;
+
+    fft(in, out);
+
+    for (j = 0; j < 256; j++) {
+        power[j] = add_cb(get_cb(out[j].re), get_cb(out[j].im));
+        out_cb_unnorm[j] = -2047; /* and can only grow */
+    }
+
+    for (i = 0; i < AUBANDS; i++) {
+        denom = ca_cb; /* and can only grow */
+        for (j = 0; j < 256; j++)
+            denom = add_cb(denom, power[j] + auf[samplerate_index][i][j]);
+        for (j = 0; j < 256; j++)
+            out_cb_unnorm[j] = add_cb(out_cb_unnorm[j],
+                    -denom + auf[samplerate_index][i][j]);
+    }
+
+    for (j = 0; j < 256; j++)
+        out_cb[j] = add_cb(out_cb[j], -out_cb_unnorm[j] - ca_cb - cs_cb);
+}
+
+typedef void (*walk_band_t)(DCAEncContext *c, int band1, int band2, int f,
+                            int32_t spectrum1, int32_t spectrum2, int channel,
+                            int32_t * arg);
+
+static void walk_band_low(DCAEncContext *c, int band, int channel,
+                          walk_band_t walk, int32_t *arg)
+{
+    int f;
+
+    if (band == 0) {
+        for (f = 0; f < 4; f++)
+            walk(c, 0, 0, f, 0, -2047, channel, arg);
+    } else {
+        for (f = 0; f < 8; f++)
+            walk(c, band, band - 1, 8 * band - 4 + f,
+                    c->band_spectrum[7 - f], c->band_spectrum[f], channel, arg);
+    }
+}
+
+static void walk_band_high(DCAEncContext *c, int band, int channel,
+                           walk_band_t walk, int32_t *arg)
+{
+    int f;
+
+    if (band == 31) {
+        for (f = 0; f < 4; f++)
+            walk(c, 31, 31, 256 - 4 + f, 0, -2047, channel, arg);
+    } else {
+        for (f = 0; f < 8; f++)
+            walk(c, band, band + 1, 8 * band + 4 + f,
+                    c->band_spectrum[f], c->band_spectrum[7 - f], channel, arg);
+    }
+}
+
+static void update_band_masking(DCAEncContext *c, int band1, int band2,
+                                int f, int32_t spectrum1, int32_t spectrum2,
+                                int channel, int32_t * arg)
+{
+    int32_t value = c->eff_masking_curve_cb[f] - spectrum1;
+
+    if (value < c->band_masking_cb[band1])
+        c->band_masking_cb[band1] = value;
+}
+
+static void calc_masking(DCAEncContext *c, const int32_t *input)
+{
+    int i, k, band, ch, ssf;
+    int32_t data[512];
+
+    for (i = 0; i < 256; i++)
+        for (ssf = 0; ssf < SUBSUBFRAMES; ssf++)
+            c->masking_curve_cb[ssf][i] = -2047;
+
+    for (ssf = 0; ssf < SUBSUBFRAMES; ssf++)
+        for (ch = 0; ch < c->fullband_channels; ch++) {
+            const int chi = c->channel_order_tab[ch];
+
+            for (i = 0, k = 128 + 256 * ssf; k < 512; i++, k++)
+                data[i] = c->history[k][ch];
+            for (k -= 512; i < 512; i++, k++)
+                data[i] = input[k * c->channels + chi];
+            adjust_jnd(c->samplerate_index, data, c->masking_curve_cb[ssf]);
+        }
+    for (i = 0; i < 256; i++) {
+        int32_t m = 2048;
+
+        for (ssf = 0; ssf < SUBSUBFRAMES; ssf++)
+            if (c->masking_curve_cb[ssf][i] < m)
+                m = c->masking_curve_cb[ssf][i];
+        c->eff_masking_curve_cb[i] = m;
+    }
+
+    for (band = 0; band < 32; band++) {
+        c->band_masking_cb[band] = 2048;
+        walk_band_low(c, band, 0, update_band_masking, NULL);
+        walk_band_high(c, band, 0, update_band_masking, NULL);
+    }
+}
+
+static void find_peaks(DCAEncContext *c)
+{
+    int band, ch;
+
+    for (band = 0; band < 32; band++)
+        for (ch = 0; ch < c->fullband_channels; ch++) {
+            int sample;
+            int32_t m = 0;
+
+            for (sample = 0; sample < SUBBAND_SAMPLES; sample++) {
+                int32_t s = abs(c->subband[sample][band][ch]);
+                if (m < s)
+                    m = s;
+            }
+            c->peak_cb[band][ch] = get_cb(m);
+        }
+
+    if (c->lfe_channel) {
+        int sample;
+        int32_t m = 0;
+
+        for (sample = 0; sample < DCA_LFE_SAMPLES; sample++)
+            if (m < abs(c->downsampled_lfe[sample]))
+                m = abs(c->downsampled_lfe[sample]);
+        c->lfe_peak_cb = get_cb(m);
+    }
+}
+
+static const int snr_fudge = 128;
+#define USED_1ABITS 1
+#define USED_NABITS 2
+#define USED_26ABITS 4
+
+static int init_quantization_noise(DCAEncContext *c, int noise)
+{
+    int ch, band, ret = 0;
+
+    c->consumed_bits = 132 + 493 * c->fullband_channels;
+    if (c->lfe_channel)
+        c->consumed_bits += 72;
+
+    /* attempt to guess the bit distribution based on the prevoius frame */
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        for (band = 0; band < 32; band++) {
+            int snr_cb = c->peak_cb[band][ch] - c->band_masking_cb[band] - noise;
+
+            if (snr_cb >= 1312) {
+                c->abits[band][ch] = 26;
+                ret |= USED_26ABITS;
+            } else if (snr_cb >= 222) {
+                c->abits[band][ch] = 8 + mul32(snr_cb - 222, 69000000);
+                ret |= USED_NABITS;
+            } else if (snr_cb >= 0) {
+                c->abits[band][ch] = 2 + mul32(snr_cb, 106000000);
+                ret |= USED_NABITS;
+            } else {
+                c->abits[band][ch] = 1;
+                ret |= USED_1ABITS;
+            }
+        }
+    }
+
+    for (band = 0; band < 32; band++)
+        for (ch = 0; ch < c->fullband_channels; ch++) {
+            c->consumed_bits += bit_consumption[c->abits[band][ch]];
+        }
+
+    return ret;
+}
+
+static void assign_bits(DCAEncContext *c)
+{
+    /* Find the bounds where the binary search should work */
+    int low, high, down;
+    int used_abits = 0;
+
+    init_quantization_noise(c, c->worst_quantization_noise);
+    low = high = c->worst_quantization_noise;
+    if (c->consumed_bits > c->frame_bits) {
+        while (c->consumed_bits > c->frame_bits) {
+            av_assert0(used_abits != USED_1ABITS);
+            low = high;
+            high += snr_fudge;
+            used_abits = init_quantization_noise(c, high);
+        }
+    } else {
+        while (c->consumed_bits <= c->frame_bits) {
+            high = low;
+            if (used_abits == USED_26ABITS)
+                goto out; /* The requested bitrate is too high, pad with zeros */
+            low -= snr_fudge;
+            used_abits = init_quantization_noise(c, low);
+        }
+    }
+
+    /* Now do a binary search between low and high to see what fits */
+    for (down = snr_fudge >> 1; down; down >>= 1) {
+        init_quantization_noise(c, high - down);
+        if (c->consumed_bits <= c->frame_bits)
+            high -= down;
+    }
+    init_quantization_noise(c, high);
+out:
+    c->worst_quantization_noise = high;
+    if (high > c->worst_noise_ever)
+        c->worst_noise_ever = high;
+}
+
+static void shift_history(DCAEncContext *c, const int32_t *input)
+{
+    int k, ch;
+
+    for (k = 0; k < 512; k++)
+        for (ch = 0; ch < c->channels; ch++) {
+            const int chi = c->channel_order_tab[ch];
+
+            c->history[k][ch] = input[k * c->channels + chi];
+        }
+}
+
+static int32_t quantize_value(int32_t value, softfloat quant)
+{
+    int32_t offset = 1 << (quant.e - 1);
+
+    value = mul32(value, quant.m) + offset;
+    value = value >> quant.e;
+    return value;
+}
+
+static int calc_one_scale(int32_t peak_cb, int abits, softfloat *quant)
+{
+    int32_t peak;
+    int our_nscale, try_remove;
+    softfloat our_quant;
+
+    av_assert0(peak_cb <= 0);
+    av_assert0(peak_cb >= -2047);
+
+    our_nscale = 127;
+    peak = cb_to_level[-peak_cb];
+
+    for (try_remove = 64; try_remove > 0; try_remove >>= 1) {
+        if (scalefactor_inv[our_nscale - try_remove].e + stepsize_inv[abits].e <= 17)
+            continue;
+        our_quant.m = mul32(scalefactor_inv[our_nscale - try_remove].m, stepsize_inv[abits].m);
+        our_quant.e = scalefactor_inv[our_nscale - try_remove].e + stepsize_inv[abits].e - 17;
+        if ((quant_levels[abits] - 1) / 2 < quantize_value(peak, our_quant))
+            continue;
+        our_nscale -= try_remove;
+    }
+
+    if (our_nscale >= 125)
+        our_nscale = 124;
+
+    quant->m = mul32(scalefactor_inv[our_nscale].m, stepsize_inv[abits].m);
+    quant->e = scalefactor_inv[our_nscale].e + stepsize_inv[abits].e - 17;
+    av_assert0((quant_levels[abits] - 1) / 2 >= quantize_value(peak, *quant));
+
+    return our_nscale;
+}
+
+static void calc_scales(DCAEncContext *c)
+{
+    int band, ch;
+
+    for (band = 0; band < 32; band++)
+        for (ch = 0; ch < c->fullband_channels; ch++)
+            c->scale_factor[band][ch] = calc_one_scale(c->peak_cb[band][ch],
+                                                       c->abits[band][ch],
+                                                       &c->quant[band][ch]);
+
+    if (c->lfe_channel)
+        c->lfe_scale_factor = calc_one_scale(c->lfe_peak_cb, 11, &c->lfe_quant);
+}
+
+static void quantize_all(DCAEncContext *c)
+{
+    int sample, band, ch;
+
+    for (sample = 0; sample < SUBBAND_SAMPLES; sample++)
+        for (band = 0; band < 32; band++)
+            for (ch = 0; ch < c->fullband_channels; ch++)
+                c->quantized[sample][band][ch] = quantize_value(c->subband[sample][band][ch], c->quant[band][ch]);
+}
+
+static void put_frame_header(DCAEncContext *c)
+{
+    /* SYNC */
+    put_bits(&c->pb, 16, 0x7ffe);
+    put_bits(&c->pb, 16, 0x8001);
+
+    /* Frame type: normal */
+    put_bits(&c->pb, 1, 1);
+
+    /* Deficit sample count: none */
+    put_bits(&c->pb, 5, 31);
+
+    /* CRC is not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* Number of PCM sample blocks */
+    put_bits(&c->pb, 7, SUBBAND_SAMPLES - 1);
+
+    /* Primary frame byte size */
+    put_bits(&c->pb, 14, c->frame_size - 1);
+
+    /* Audio channel arrangement */
+    put_bits(&c->pb, 6, c->channel_config);
+
+    /* Core audio sampling frequency */
+    put_bits(&c->pb, 4, bitstream_sfreq[c->samplerate_index]);
+
+    /* Transmission bit rate */
+    put_bits(&c->pb, 5, c->bitrate_index);
+
+    /* Embedded down mix: disabled */
+    put_bits(&c->pb, 1, 0);
+
+    /* Embedded dynamic range flag: not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* Embedded time stamp flag: not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* Auxiliary data flag: not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* HDCD source: no */
+    put_bits(&c->pb, 1, 0);
+
+    /* Extension audio ID: N/A */
+    put_bits(&c->pb, 3, 0);
+
+    /* Extended audio data: not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* Audio sync word insertion flag: after each sub-frame */
+    put_bits(&c->pb, 1, 0);
+
+    /* Low frequency effects flag: not present or 64x subsampling */
+    put_bits(&c->pb, 2, c->lfe_channel ? 2 : 0);
+
+    /* Predictor history switch flag: on */
+    put_bits(&c->pb, 1, 1);
+
+    /* No CRC */
+    /* Multirate interpolator switch: non-perfect reconstruction */
+    put_bits(&c->pb, 1, 0);
+
+    /* Encoder software revision: 7 */
+    put_bits(&c->pb, 4, 7);
+
+    /* Copy history: 0 */
+    put_bits(&c->pb, 2, 0);
+
+    /* Source PCM resolution: 16 bits, not DTS ES */
+    put_bits(&c->pb, 3, 0);
+
+    /* Front sum/difference coding: no */
+    put_bits(&c->pb, 1, 0);
+
+    /* Surrounds sum/difference coding: no */
+    put_bits(&c->pb, 1, 0);
+
+    /* Dialog normalization: 0 dB */
+    put_bits(&c->pb, 4, 0);
+}
+
+static void put_primary_audio_header(DCAEncContext *c)
+{
+    static const int bitlen[11] = { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3 };
+    static const int thr[11]    = { 0, 1, 3, 3, 3, 3, 7, 7, 7, 7, 7 };
+
+    int ch, i;
+    /* Number of subframes */
+    put_bits(&c->pb, 4, SUBFRAMES - 1);
+
+    /* Number of primary audio channels */
+    put_bits(&c->pb, 3, c->fullband_channels - 1);
+
+    /* Subband activity count */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 5, DCAENC_SUBBANDS - 2);
+
+    /* High frequency VQ start subband */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 5, DCAENC_SUBBANDS - 1);
+
+    /* Joint intensity coding index: 0, 0 */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 3, 0);
+
+    /* Transient mode codebook: A4, A4 (arbitrary) */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 2, 0);
+
+    /* Scale factor code book: 7 bit linear, 7-bit sqrt table (for each channel) */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 3, 6);
+
+    /* Bit allocation quantizer select: linear 5-bit */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 3, 6);
+
+    /* Quantization index codebook select: dummy data
+       to avoid transmission of scale factor adjustment */
+    for (i = 1; i < 11; i++)
+        for (ch = 0; ch < c->fullband_channels; ch++)
+            put_bits(&c->pb, bitlen[i], thr[i]);
+
+    /* Scale factor adjustment index: not transmitted */
+    /* Audio header CRC check word: not transmitted */
+}
+
+static void put_subframe_samples(DCAEncContext *c, int ss, int band, int ch)
+{
+    if (c->abits[band][ch] <= 7) {
+        int sum, i, j;
+        for (i = 0; i < 8; i += 4) {
+            sum = 0;
+            for (j = 3; j >= 0; j--) {
+                sum *= quant_levels[c->abits[band][ch]];
+                sum += c->quantized[ss * 8 + i + j][band][ch];
+                sum += (quant_levels[c->abits[band][ch]] - 1) / 2;
+            }
+            put_bits(&c->pb, bit_consumption[c->abits[band][ch]] / 4, sum);
+        }
+    } else {
+        int i;
+        for (i = 0; i < 8; i++) {
+            int bits = bit_consumption[c->abits[band][ch]] / 16;
+            put_sbits(&c->pb, bits, c->quantized[ss * 8 + i][band][ch]);
+        }
+    }
+}
+
+static void put_subframe(DCAEncContext *c, int subframe)
+{
+    int i, band, ss, ch;
+
+    /* Subsubframes count */
+    put_bits(&c->pb, 2, SUBSUBFRAMES -1);
+
+    /* Partial subsubframe sample count: dummy */
+    put_bits(&c->pb, 3, 0);
+
+    /* Prediction mode: no ADPCM, in each channel and subband */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        for (band = 0; band < DCAENC_SUBBANDS; band++)
+            put_bits(&c->pb, 1, 0);
+
+    /* Prediction VQ address: not transmitted */
+    /* Bit allocation index */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        for (band = 0; band < DCAENC_SUBBANDS; band++)
+            put_bits(&c->pb, 5, c->abits[band][ch]);
+
+    if (SUBSUBFRAMES > 1) {
+        /* Transition mode: none for each channel and subband */
+        for (ch = 0; ch < c->fullband_channels; ch++)
+            for (band = 0; band < DCAENC_SUBBANDS; band++)
+                put_bits(&c->pb, 1, 0); /* codebook A4 */
+    }
+
+    /* Scale factors */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        for (band = 0; band < DCAENC_SUBBANDS; band++)
+            put_bits(&c->pb, 7, c->scale_factor[band][ch]);
+
+    /* Joint subband scale factor codebook select: not transmitted */
+    /* Scale factors for joint subband coding: not transmitted */
+    /* Stereo down-mix coefficients: not transmitted */
+    /* Dynamic range coefficient: not transmitted */
+    /* Stde information CRC check word: not transmitted */
+    /* VQ encoded high frequency subbands: not transmitted */
+
+    /* LFE data: 8 samples and scalefactor */
+    if (c->lfe_channel) {
+        for (i = 0; i < DCA_LFE_SAMPLES; i++)
+            put_bits(&c->pb, 8, quantize_value(c->downsampled_lfe[i], c->lfe_quant) & 0xff);
+        put_bits(&c->pb, 8, c->lfe_scale_factor);
+    }
+
+    /* Audio data (subsubframes) */
+    for (ss = 0; ss < SUBSUBFRAMES ; ss++)
+        for (ch = 0; ch < c->fullband_channels; ch++)
+            for (band = 0; band < DCAENC_SUBBANDS; band++)
+                    put_subframe_samples(c, ss, band, ch);
+
+    /* DSYNC */
+    put_bits(&c->pb, 16, 0xffff);
+}
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                        const AVFrame *frame, int *got_packet_ptr)
+{
+    DCAEncContext *c = avctx->priv_data;
+    const int32_t *samples;
+    int ret, i;
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, c->frame_size , 0)) < 0)
+        return ret;
+
+    samples = (const int32_t *)frame->data[0];
+
+    subband_transform(c, samples);
+    if (c->lfe_channel)
+        lfe_downsample(c, samples);
+
+    calc_masking(c, samples);
+    find_peaks(c);
+    assign_bits(c);
+    calc_scales(c);
+    quantize_all(c);
+    shift_history(c, samples);
+
+    init_put_bits(&c->pb, avpkt->data, avpkt->size);
+    put_frame_header(c);
+    put_primary_audio_header(c);
+    for (i = 0; i < SUBFRAMES; i++)
+        put_subframe(c, i);
+
+
+    for (i = put_bits_count(&c->pb); i < 8*c->frame_size; i++)
+        put_bits(&c->pb, 1, 0);
+
+    flush_put_bits(&c->pb);
+
+    avpkt->pts      = frame->pts;
+    avpkt->duration = ff_samples_to_time_base(avctx, frame->nb_samples);
+    avpkt->size     = c->frame_size + 1;
+    *got_packet_ptr = 1;
+    return 0;
+}
+
+static const AVCodecDefault defaults[] = {
+    { "b",          "1411200" },
+    { NULL },
+};
+
+AVCodec ff_dca_encoder = {
+    .name                  = "dca",
+    .long_name             = NULL_IF_CONFIG_SMALL("DCA (DTS Coherent Acoustics)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_DTS,
+    .priv_data_size        = sizeof(DCAEncContext),
+    .init                  = encode_init,
+    .encode2               = encode_frame,
+    .capabilities          = AV_CODEC_CAP_EXPERIMENTAL,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S32,
+                                                            AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = sample_rates,
+    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
+                                                  AV_CH_LAYOUT_STEREO,
+                                                  AV_CH_LAYOUT_2_2,
+                                                  AV_CH_LAYOUT_5POINT0,
+                                                  AV_CH_LAYOUT_5POINT1,
+                                                  0 },
+    .defaults              = defaults,
+};
diff --git a/libavcodec/dcaenc.h b/libavcodec/dcaenc.h
new file mode 100644
index 0000000000..0443ca6511
--- /dev/null
+++ b/libavcodec/dcaenc.h
@@ -0,0 +1,113 @@
+/*
+ * DCA encoder tables
+ * Copyright (C) 2008-2012 Alexander E. Patrakov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCAENC_H
+#define AVCODEC_DCAENC_H
+
+#include <stdint.h>
+
+typedef struct {
+    int32_t m;
+    int32_t e;
+} softfloat;
+
+static const int sample_rates[] = {
+    8000, 16000, 32000, 11025, 22050, 44100, 12000, 24000, 48000, 0,
+};
+
+static const uint8_t bitstream_sfreq[] = { 1, 2, 3, 6, 7, 8, 11, 12, 13 };
+
+/* Auditory filter center frequencies and bandwidths, in Hz.
+ * The last two are made up, because there is no scientific data.
+ */
+static const uint16_t fc[] = {
+    50, 150, 250, 350, 450, 570, 700, 840, 1000, 1170, 1370, 1600, 1850, 2150,
+    2500, 2900, 3400, 4000, 4800, 5800, 7000, 8500, 10500, 13500, 17000
+};
+
+static const uint16_t erb[] = {
+    80, 100, 100, 100, 110, 120, 140, 150, 160, 190, 210, 240, 280,
+    320, 380, 450, 550, 700, 900, 1100, 1300, 1800, 2500, 3500, 4500
+};
+
+static const softfloat stepsize_inv[27] = {
+    {0, 0}, {1342177360, 21}, {2147483647, 21}, {1342177360, 20},
+    {1819901661, 20}, {2147483647, 20}, {1278263843, 19}, {1579032492, 19},
+    {1412817763, 18}, {1220162327, 17}, {1118482133, 16}, {1917391412, 16},
+    {1766017772, 15}, {1525212826, 14}, {1290553940, 13}, {2097179000, 13},
+    {1677683200, 12}, {1497972244, 11}, {1310893147, 10}, {1165354136, 9},
+    {1748031204, 9}, {1542092044, 8}, {1636178017, 7}, {1636178017, 6},
+    {1636178017, 5}, {1636178017, 4}, {1636178017, 3},
+};
+
+static const softfloat scalefactor_inv[128] = {
+    {2147483647, 1}, {2147483647, 1}, {2147483647, 2}, {2147483647, 2},
+    {2147483647, 2}, {2147483647, 2}, {1431655765, 2}, {1431655765, 2},
+    {1431655765, 2}, {2147483647, 3}, {2147483647, 3}, {1717986918, 3},
+    {1431655765, 3}, {1227133513, 3}, {1227133513, 3}, {2147483647, 4},
+    {1717986918, 4}, {1561806289, 4}, {1431655765, 4}, {1227133513, 4},
+    {2147483647, 5}, {1908874353, 5}, {1717986918, 5}, {1493901668, 5},
+    {1321528398, 5}, {1145324612, 5}, {2021161080, 6}, {1808407282, 6},
+    {1561806289, 6}, {1374389534, 6}, {1227133513, 6}, {2147483647, 7},
+    {1908874353, 7}, {1676084798, 7}, {1477838209, 7}, {1296593900, 7},
+    {1145324612, 7}, {2021161080, 8}, {1773405851, 8}, {1561806289, 8},
+    {1374389534, 8}, {1216273924, 8}, {2139127680, 9}, {1882725390, 9},
+    {1660893697, 9}, {1462116526, 9}, {1287484341, 9}, {1135859119, 9},
+    {1999112050, 10}, {1762037865, 10}, {1552982525, 10}, {1367551775, 10},
+    {1205604855, 10}, {2124660150, 11}, {1871509153, 11}, {1648443220, 11},
+    {1452459217, 11}, {1279990253, 11}, {1127704233, 11}, {1987368509, 12},
+    {1750814693, 12}, {1542632939, 12}, {1359099663, 12}, {1197398995, 12},
+    {2109880792, 13}, {1858853132, 13}, {1638006149, 13}, {1443165385, 13},
+    {1271479187, 13}, {1120235993, 13}, {1973767086, 14}, {1739045674, 14},
+    {1532153461, 14}, {1349922194, 14}, {1189384493, 14}, {2095804865, 15},
+    {1846464029, 15}, {1626872524, 15}, {1433347133, 15}, {1262853884, 15},
+    {1112619678, 15}, {1960569045, 16}, {1727349015, 16}, {1521881227, 16},
+    {1340842289, 16}, {1181357555, 16}, {2081669156, 17}, {1834047752, 17},
+    {1615889229, 17}, {1423675973, 17}, {1254322457, 17}, {1105123583, 17},
+    {1947330755, 18}, {1715693602, 18}, {1511607799, 18}, {1331801790, 18},
+    {1173384427, 18}, {2067616532, 19}, {1821667648, 19}, {1604980024, 19},
+    {1414066955, 19}, {1245861410, 19}, {1097665748, 19}, {1934193616, 20},
+    {1704119624, 20}, {1501412075, 20}, {1322817107, 20}, {1165466323, 20},
+    {2053666205, 21}, {1809379407, 21}, {1594151671, 21}, {1404526328, 21},
+    {1237455941, 21}, {1090259329, 21}, {1921143210, 22}, {1692621231, 22},
+    {1491281857, 22}, {1313892269, 22}, {1157603482, 22}, {2039810470, 23},
+    {1797172644, 23}, {1583396912, 23}, {1395050052, 23}, {1229107276, 23},
+    {1082903494, 23}, {1082903494, 23}, {1082903494, 23}, {1082903494, 23},
+};
+
+/* manually derived from
+ * Table B.5: Selection of quantization levels and codebooks
+ * FIXME: will become invalid when Huffman codes are introduced.
+ */
+static const int bit_consumption[27] = {
+    -8, 28, 40, 48, 52, 60, 68, 76, 80, 96,
+    112, 128, 144, 160, 176, 192, 208, 224, 240, 256,
+    272, 288, 304, 320, 336, 352, 368,
+};
+
+/* Table B.5: Selection of quantization levels and codebooks */
+static const int quant_levels[27] = {
+    1, 3, 5, 7, 9, 13, 17, 25, 32, 64,
+    128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536,
+    131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608,
+};
+
+#endif /* AVCODEC_DCAENC_H */
diff --git a/libavcodec/dcahuff.h b/libavcodec/dcahuff.h
index 79be4938e2..9388b636a7 100644
--- a/libavcodec/dcahuff.h
+++ b/libavcodec/dcahuff.h
@@ -3,20 +3,20 @@
  * Copyright (C) 2004 Gildas Bazin
  * Copyright (C) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
index d3233f310e..56e1a629a7 100644
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -2,20 +2,20 @@
  * (c) 2001 Fabrice Bellard
  *     2007 Marc Hoffman <marc.hoffman@analog.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,10 +65,24 @@ static const struct algo fdct_tab[] = {
 #endif /* CONFIG_FAANDCT */
 };
 
+static void ff_prores_idct_wrap(int16_t *dst){
+    LOCAL_ALIGNED(16, int16_t, qmat, [64]);
+    int i;
+
+    for(i=0; i<64; i++){
+        qmat[i]=4;
+    }
+    ff_prores_idct(dst, qmat);
+    for(i=0; i<64; i++) {
+         dst[i] -= 512;
+    }
+}
+
 static const struct algo idct_tab[] = {
     { "REF-DBL",     ff_ref_idct,          FF_IDCT_PERM_NONE },
     { "INT",         ff_j_rev_dct,         FF_IDCT_PERM_LIBMPEG2 },
     { "SIMPLE-C",    ff_simple_idct_8,     FF_IDCT_PERM_NONE },
+    { "PR-C",        ff_prores_idct_wrap,  FF_IDCT_PERM_NONE, 0, 1 },
 #if CONFIG_FAANIDCT
     { "FAANI",       ff_faanidct,          FF_IDCT_PERM_NONE },
 #endif /* CONFIG_FAANIDCT */
@@ -96,7 +110,7 @@ static const struct algo idct_tab_arch[] = { { 0 } };
 DECLARE_ALIGNED(16, static int16_t, block)[64];
 DECLARE_ALIGNED(8,  static int16_t, block1)[64];
 
-static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
+static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
 {
     int i, j;
 
@@ -105,7 +119,7 @@ static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
     switch (test) {
     case 0:
         for (i = 0; i < 64; i++)
-            block[i] = (av_lfg_get(prng) % 512) - 256;
+            block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
         if (is_idct) {
             ff_ref_fdct(block);
             for (i = 0; i < 64; i++)
@@ -114,11 +128,13 @@ static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
         break;
     case 1:
         j = av_lfg_get(prng) % 10 + 1;
-        for (i = 0; i < j; i++)
-            block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
+        for (i = 0; i < j; i++) {
+            int idx = av_lfg_get(prng) % 64;
+            block[idx] = av_lfg_get(prng) % (2*vals) -vals;
+        }
         break;
     case 2:
-        block[ 0] = av_lfg_get(prng) % 4096 - 2048;
+        block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
         block[63] = (block[0] & 1) ^ 1;
         break;
     }
@@ -143,6 +159,10 @@ static void permute(int16_t dst[64], const int16_t src[64],
         for (i = 0; i < 64; i++)
             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
         break;
+    case FF_IDCT_PERM_TRANSPOSE:
+        for (i = 0; i < 64; i++)
+            dst[(i>>3) | ((i<<3)&0x38)] = src[i];
+        break;
     default:
         for (i = 0; i < 64; i++)
             dst[i] = src[i];
@@ -150,7 +170,7 @@ static void permute(int16_t dst[64], const int16_t src[64],
     }
 }
 
-static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
+static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
 {
     void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
     int it, i, scale;
@@ -160,6 +180,7 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
     int maxout = 0;
     int blockSumErrMax = 0, blockSumErr;
     AVLFG prng;
+    const int vals=1<<bits;
     double omse, ome;
     int spec_err;
 
@@ -170,7 +191,7 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
     for (i = 0; i < 64; i++)
         sysErr[i] = 0;
     for (it = 0; it < NB_ITS; it++) {
-        init_block(block1, test, is_idct, &prng);
+        init_block(block1, test, is_idct, &prng, vals);
         permute(block, block1, dct->perm_type);
 
         dct->func(block);
@@ -184,6 +205,9 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
         }
 
         ref(block1);
+        if (!strcmp(dct->name, "PR-SSE2"))
+            for (i = 0; i < 64; i++)
+                block1[i] = av_clip(block1[i], 4-512, 1019-512);
 
         blockSumErr = 0;
         for (i = 0; i < 64; i++) {
@@ -216,7 +240,7 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
 
     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
 
-    printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
+    printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
            omse, ome, (double) sysErrMax / NB_ITS,
            maxout, blockSumErrMax);
@@ -228,7 +252,8 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
         return 0;
 
     /* speed test */
-    init_block(block, test, is_idct, &prng);
+
+    init_block(block, test, is_idct, &prng, vals);
     permute(block1, block, dct->perm_type);
 
     ti = av_gettime_relative();
@@ -238,10 +263,10 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
             memcpy(block, block1, sizeof(block));
             dct->func(block);
         }
+        emms_c();
         it1 += NB_ITS_SPEED;
         ti1 = av_gettime_relative() - ti;
     } while (ti1 < 1000000);
-    emms_c();
 
     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
            (double) it1 * 1000.0 / (double) ti1);
@@ -366,6 +391,25 @@ static void idct248_error(const char *name,
             if (v > err_max)
                 err_max = v;
         }
+#if 0
+        printf("ref=\n");
+        for(i=0;i<8;i++) {
+            int j;
+            for(j=0;j<8;j++) {
+                printf(" %3d", img_dest1[i*8+j]);
+            }
+            printf("\n");
+        }
+
+        printf("out=\n");
+        for(i=0;i<8;i++) {
+            int j;
+            for(j=0;j<8;j++) {
+                printf(" %3d", img_dest[i*8+j]);
+            }
+            printf("\n");
+        }
+#endif
     }
     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 
@@ -380,10 +424,10 @@ static void idct248_error(const char *name,
                 block[i] = block1[i];
             idct248_put(img_dest, 8, block);
         }
+        emms_c();
         it1 += NB_ITS_SPEED;
         ti1 = av_gettime_relative() - ti;
     } while (ti1 < 1000000);
-    emms_c();
 
     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
            (double) it1 * 1000.0 / (double) ti1);
@@ -391,10 +435,11 @@ static void idct248_error(const char *name,
 
 static void help(void)
 {
-    printf("dct-test [-i] [<test-number>]\n"
+    printf("dct-test [-i] [<test-number>] [<bits>]\n"
            "test-number 0 -> test with random matrixes\n"
            "            1 -> test with random sparse matrixes\n"
            "            2 -> do 3. test from mpeg4 std\n"
+           "bits        Number of time domain bits to use, 8 is default\n"
            "-i          test IDCT implementations\n"
            "-4          test IDCT248 implementations\n"
            "-t          speed test\n");
@@ -411,6 +456,7 @@ int main(int argc, char **argv)
     int test = 1;
     int speed = 0;
     int err = 0;
+    int bits=8;
 
     ff_ref_dct_init();
 
@@ -437,8 +483,9 @@ int main(int argc, char **argv)
 
     if (optind < argc)
         test = atoi(argv[optind]);
+    if(optind+1 < argc) bits= atoi(argv[optind+1]);
 
-    printf("Libav DCT/IDCT test\n");
+    printf("ffmpeg DCT/IDCT test\n");
 
     if (test_248_dct) {
         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
@@ -446,20 +493,20 @@ int main(int argc, char **argv)
         const int cpu_flags = av_get_cpu_flags();
         if (test_idct) {
             for (i = 0; i < FF_ARRAY_ELEMS(idct_tab); i++)
-                err |= dct_error(&idct_tab[i], test, test_idct, speed);
+                err |= dct_error(&idct_tab[i], test, test_idct, speed, bits);
 
             for (i = 0; idct_tab_arch[i].name; i++)
                 if (!(~cpu_flags & idct_tab_arch[i].cpu_flag))
-                    err |= dct_error(&idct_tab_arch[i], test, test_idct, speed);
+                    err |= dct_error(&idct_tab_arch[i], test, test_idct, speed, bits);
         }
 #if CONFIG_FDCTDSP
         else {
             for (i = 0; i < FF_ARRAY_ELEMS(fdct_tab); i++)
-                err |= dct_error(&fdct_tab[i], test, test_idct, speed);
+                err |= dct_error(&fdct_tab[i], test, test_idct, speed, bits);
 
             for (i = 0; fdct_tab_arch[i].name; i++)
                 if (!(~cpu_flags & fdct_tab_arch[i].cpu_flag))
-                    err |= dct_error(&fdct_tab_arch[i], test, test_idct, speed);
+                    err |= dct_error(&fdct_tab_arch[i], test, test_idct, speed, bits);
         }
 #endif /* CONFIG_FDCTDSP */
     }
diff --git a/libavcodec/dct.c b/libavcodec/dct.c
index 180477e621..cca51eeaf8 100644
--- a/libavcodec/dct.c
+++ b/libavcodec/dct.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -190,12 +190,12 @@ av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
         ff_init_ff_cos_tabs(nbits + 2);
 
         s->costab = ff_cos_tabs[nbits + 2];
-        s->csc2   = av_malloc(n / 2 * sizeof(FFTSample));
+        s->csc2   = av_malloc_array(n / 2, sizeof(FFTSample));
         if (!s->csc2)
             return AVERROR(ENOMEM);
 
         if (ff_rdft_init(&s->rdft, nbits, inverse == DCT_III) < 0) {
-            av_free(s->csc2);
+            av_freep(&s->csc2);
             return -1;
         }
 
@@ -220,5 +220,5 @@ av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
 av_cold void ff_dct_end(DCTContext *s)
 {
     ff_rdft_end(&s->rdft);
-    av_free(s->csc2);
+    av_freep(&s->csc2);
 }
diff --git a/libavcodec/dct.h b/libavcodec/dct.h
index 4a31f54fcb..05297ba9db 100644
--- a/libavcodec/dct.h
+++ b/libavcodec/dct.h
@@ -4,24 +4,24 @@
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-#ifndef AVCODEC_DCT_H
+#if !defined(AVCODEC_DCT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
 #define AVCODEC_DCT_H
 
 #include <stdint.h>
@@ -59,6 +59,9 @@ void ff_fdct248_islow_8(int16_t *data);
 void ff_fdct248_islow_10(int16_t *data);
 
 void ff_j_rev_dct(int16_t *data);
+void ff_j_rev_dct4(int16_t *data);
+void ff_j_rev_dct2(int16_t *data);
+void ff_j_rev_dct1(int16_t *data);
 void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block);
 void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block);
 
diff --git a/libavcodec/dct32.h b/libavcodec/dct32.h
index 8bf6880155..61bf223a8d 100644
--- a/libavcodec/dct32.h
+++ b/libavcodec/dct32.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dct32_fixed.c b/libavcodec/dct32_fixed.c
index 64efe8bf7a..9025d5efdd 100644
--- a/libavcodec/dct32_fixed.c
+++ b/libavcodec/dct32_fixed.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dct32_float.c b/libavcodec/dct32_float.c
index ef37ce9687..597c9bb639 100644
--- a/libavcodec/dct32_float.c
+++ b/libavcodec/dct32_float.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dct32_template.c b/libavcodec/dct32_template.c
index 272e0dbf95..fb53d53ab1 100644
--- a/libavcodec/dct32_template.c
+++ b/libavcodec/dct32_template.c
@@ -2,20 +2,20 @@
  * Template for the Discrete Cosine Transform for 32 samples
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dctref.c b/libavcodec/dctref.c
index ae3dec51cd..851014b664 100644
--- a/libavcodec/dctref.c
+++ b/libavcodec/dctref.c
@@ -2,20 +2,20 @@
  * reference discrete cosine transform (double precision)
  * Copyright (C) 2009 Dylan Yudaken
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dctref.h b/libavcodec/dctref.h
index a93b70d9f2..f6fde8863a 100644
--- a/libavcodec/dctref.h
+++ b/libavcodec/dctref.h
@@ -2,20 +2,20 @@
  * reference discrete cosine transform (double precision)
  * Copyright (C) 2009 Dylan Yudaken
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dds.c b/libavcodec/dds.c
index 04b1dd2de8..a604d5690f 100644
--- a/libavcodec/dds.c
+++ b/libavcodec/dds.c
@@ -2,20 +2,20 @@
  * DirectDraw Surface image decoder
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -358,9 +358,9 @@ static int parse_pixel_format(AVCodecContext *avctx)
             avctx->pix_fmt = AV_PIX_FMT_BGR24;
         /* 32 bpp */
         else if (bpp == 32 && r == 0xff0000 && g == 0xff00 && b == 0xff && a == 0)
-            avctx->pix_fmt = AV_PIX_FMT_BGRA; // opaque
+            avctx->pix_fmt = AV_PIX_FMT_BGR0; // opaque
         else if (bpp == 32 && r == 0xff && g == 0xff00 && b == 0xff0000 && a == 0)
-            avctx->pix_fmt = AV_PIX_FMT_RGBA; // opaque
+            avctx->pix_fmt = AV_PIX_FMT_RGB0; // opaque
         else if (bpp == 32 && r == 0xff0000 && g == 0xff00 && b == 0xff && a == 0xff000000)
             avctx->pix_fmt = AV_PIX_FMT_BGRA;
         else if (bpp == 32 && r == 0xff && g == 0xff00 && b == 0xff0000 && a == 0xff000000)
@@ -653,17 +653,15 @@ static int dds_decode(AVCodecContext *avctx, void *data,
 
         if (ctx->paletted) {
             int i;
-            uint32_t *p = (uint32_t*) frame->data[1];
-
             /* Use the first 1024 bytes as palette, then copy the rest. */
-            for (i = 0; i < 256; i++) {
-                uint32_t rgba = 0;
-                rgba |= bytestream2_get_byte(gbc) << 16;
-                rgba |= bytestream2_get_byte(gbc) << 8;
-                rgba |= bytestream2_get_byte(gbc) << 0;
-                rgba |= bytestream2_get_byte(gbc) << 24;
-                p[i] = rgba;
-            }
+            bytestream2_get_buffer(gbc, frame->data[1], 256 * 4);
+            for (i = 0; i < 256; i++)
+                AV_WN32(frame->data[1] + i*4,
+                        (frame->data[1][2+i*4]<<0)+
+                        (frame->data[1][1+i*4]<<8)+
+                        (frame->data[1][0+i*4]<<16)+
+                        (frame->data[1][3+i*4]<<24)
+                );
 
             frame->palette_has_changed = 1;
         }
@@ -676,6 +674,8 @@ static int dds_decode(AVCodecContext *avctx, void *data,
     /* Run any post processing here if needed. */
     if (avctx->pix_fmt == AV_PIX_FMT_BGRA ||
         avctx->pix_fmt == AV_PIX_FMT_RGBA ||
+        avctx->pix_fmt == AV_PIX_FMT_RGB0 ||
+        avctx->pix_fmt == AV_PIX_FMT_BGR0 ||
         avctx->pix_fmt == AV_PIX_FMT_YA8)
         run_postproc(avctx, frame);
 
diff --git a/libavcodec/dfa.c b/libavcodec/dfa.c
index 8021193894..f45d019a79 100644
--- a/libavcodec/dfa.c
+++ b/libavcodec/dfa.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2011 Konstantin Shishkov
  * based on work by Vladimir "VAG" Gneushev
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include "bytestream.h"
 #include "internal.h"
 
+#include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/mem.h"
 
@@ -37,12 +38,13 @@ typedef struct DfaContext {
 static av_cold int dfa_decode_init(AVCodecContext *avctx)
 {
     DfaContext *s = avctx->priv_data;
-    int ret;
 
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
-    if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
-        return ret;
+    if (!avctx->width || !avctx->height)
+        return AVERROR_INVALIDDATA;
+
+    av_assert0(av_image_check_size(avctx->width, avctx->height, 0, avctx) >= 0);
 
     s->frame_buf = av_mallocz(avctx->width * avctx->height);
     if (!s->frame_buf)
@@ -70,6 +72,8 @@ static int decode_tsw1(GetByteContext *gb, uint8_t *frame, int width, int height
 
     segments = bytestream2_get_le32(gb);
     offset   = bytestream2_get_le32(gb);
+    if (segments == 0 && offset == frame_end - frame)
+        return 0; // skip frame
     if (frame_end - frame <= offset)
         return AVERROR_INVALIDDATA;
     frame += offset;
@@ -252,6 +256,9 @@ static int decode_wdlt(GetByteContext *gb, uint8_t *frame, int width, int height
             y        += skip_lines;
             segments = bytestream2_get_le16(gb);
         }
+
+        if (frame_end <= frame)
+            return AVERROR_INVALIDDATA;
         if (segments & 0x8000) {
             frame[width - 1] = segments & 0xFF;
             segments = bytestream2_get_le16(gb);
@@ -289,7 +296,7 @@ static int decode_wdlt(GetByteContext *gb, uint8_t *frame, int width, int height
 static int decode_tdlt(GetByteContext *gb, uint8_t *frame, int width, int height)
 {
     const uint8_t *frame_end = frame + width * height;
-    int segments = bytestream2_get_le32(gb);
+    uint32_t segments = bytestream2_get_le32(gb);
     int skip, copy;
 
     while (segments--) {
@@ -338,11 +345,10 @@ static int dfa_decode_frame(AVCodecContext *avctx,
     uint8_t *dst;
     int ret;
     int i, pal_elems;
+    int version = avctx->extradata_size==2 ? AV_RL16(avctx->extradata) : 0;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0))) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     bytestream2_init(&gb, avpkt->data, avpkt->size);
     while (bytestream2_get_bytes_left(&gb) > 0) {
@@ -355,7 +361,7 @@ static int dfa_decode_frame(AVCodecContext *avctx,
             pal_elems = FFMIN(chunk_size / 3, 256);
             for (i = 0; i < pal_elems; i++) {
                 s->pal[i] = bytestream2_get_be24(&gb) << 2;
-                s->pal[i] |= (s->pal[i] >> 6) & 0x333;
+                s->pal[i] |= 0xFFU << 24 | (s->pal[i] >> 6) & 0x30303;
             }
             frame->palette_has_changed = 1;
         } else if (chunk_type <= 9) {
@@ -375,9 +381,17 @@ static int dfa_decode_frame(AVCodecContext *avctx,
     buf = s->frame_buf;
     dst = frame->data[0];
     for (i = 0; i < avctx->height; i++) {
-        memcpy(dst, buf, avctx->width);
+        if(version == 0x100) {
+            int j;
+            for(j = 0; j < avctx->width; j++) {
+                dst[j] = buf[ (i&3)*(avctx->width /4) + (j/4) +
+                             ((j&3)*(avctx->height/4) + (i/4))*avctx->width];
+            }
+        } else {
+            memcpy(dst, buf, avctx->width);
+            buf += avctx->width;
+        }
         dst += frame->linesize[0];
-        buf += avctx->width;
     }
     memcpy(frame->data[1], s->pal, sizeof(s->pal));
 
diff --git a/libavcodec/dirac.c b/libavcodec/dirac.c
index 5e13a849fc..07db919be4 100644
--- a/libavcodec/dirac.c
+++ b/libavcodec/dirac.c
@@ -1,28 +1,29 @@
 /*
  * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
  * Copyright (C) 2009 David Conrad
+ * Copyright (C) 2011 Jordi Ortiz
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * Dirac Decoder
- * @author Marco Gerards <marco@gnu.org>
+ * @author Marco Gerards <marco@gnu.org>, David Conrad, Jordi Ortiz <nenjordi@gmail.com>
  */
 
 #include "libavutil/imgutils.h"
@@ -33,7 +34,7 @@
 #include "internal.h"
 #include "mpeg12data.h"
 
-// defaults for source parameters
+/* defaults for source parameters */
 static const dirac_source_params dirac_source_parameters_defaults[] = {
     {  640,  480, 2, 0, 0,  1, 1,  640,  480, 0, 0, 1, 0 },
     {  176,  120, 2, 0, 0,  9, 2,  176,  120, 0, 0, 1, 1 },
@@ -122,6 +123,7 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb,
     AVRational frame_rate = { 0, 0 };
     unsigned luma_depth = 8, luma_offset = 16;
     int idx;
+    int chroma_x_shift, chroma_y_shift;
 
     /* [DIRAC_STD] 10.3.2 Frame size. frame_size(video_params) */
     /* [DIRAC_STD] custom_dimensions_flag */
@@ -136,7 +138,7 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb,
     if (get_bits1(gb))
         /* [DIRAC_STD] CHROMA_FORMAT_INDEX */
         source->chroma_format = svq3_get_ue_golomb(gb);
-    if (source->chroma_format > 2) {
+    if (source->chroma_format > 2U) {
         av_log(avctx, AV_LOG_ERROR, "Unknown chroma format %d\n",
                source->chroma_format);
         return AVERROR_INVALIDDATA;
@@ -147,14 +149,14 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb,
     if (get_bits1(gb))
         /* [DIRAC_STD] SOURCE_SAMPLING */
         source->interlaced = svq3_get_ue_golomb(gb);
-    if (source->interlaced > 1)
+    if (source->interlaced > 1U)
         return AVERROR_INVALIDDATA;
 
     /* [DIRAC_STD] 10.3.5 Frame Rate. frame_rate(video_params) */
     if (get_bits1(gb)) { /* [DIRAC_STD] custom_frame_rate_flag */
         source->frame_rate_index = svq3_get_ue_golomb(gb);
 
-        if (source->frame_rate_index > 10)
+        if (source->frame_rate_index > 10U)
             return AVERROR_INVALIDDATA;
 
         if (!source->frame_rate_index) {
@@ -180,7 +182,7 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb,
         /* [DIRAC_STD] index */
         source->aspect_ratio_index = svq3_get_ue_golomb(gb);
 
-        if (source->aspect_ratio_index > 6)
+        if (source->aspect_ratio_index > 6U)
             return AVERROR_INVALIDDATA;
 
         if (!source->aspect_ratio_index) {
@@ -213,10 +215,10 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb,
         /* [DIRAC_STD] index */
         source->pixel_range_index = svq3_get_ue_golomb(gb);
 
-        if (source->pixel_range_index > 4)
+        if (source->pixel_range_index > 4U)
             return AVERROR_INVALIDDATA;
 
-        // This assumes either fullrange or MPEG levels only
+        /* This assumes either fullrange or MPEG levels only */
         if (!source->pixel_range_index) {
             luma_offset = svq3_get_ue_golomb(gb);
             luma_depth  = av_log2(svq3_get_ue_golomb(gb)) + 1;
@@ -235,16 +237,22 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb,
     }
 
     if (luma_depth > 8)
-        av_log(avctx, AV_LOG_WARNING, "Bitdepth greater than 8");
+        av_log(avctx, AV_LOG_WARNING, "Bitdepth greater than 8\n");
 
     avctx->pix_fmt = dirac_pix_fmt[!luma_offset][source->chroma_format];
+    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &chroma_x_shift, &chroma_y_shift);
+    if ((source->width % (1<<chroma_x_shift)) || (source->height % (1<<chroma_y_shift))) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions must be an integer multiple of the chroma subsampling\n");
+        return AVERROR_INVALIDDATA;
+    }
+
 
     /* [DIRAC_STD] 10.3.9 Colour specification. colour_spec(video_params) */
     if (get_bits1(gb)) { /* [DIRAC_STD] custom_colour_spec_flag */
         /* [DIRAC_STD] index */
         idx = source->color_spec_index = svq3_get_ue_golomb(gb);
 
-        if (source->color_spec_index > 4)
+        if (source->color_spec_index > 4U)
             return AVERROR_INVALIDDATA;
 
         avctx->color_primaries = dirac_color_presets[idx].color_primaries;
@@ -255,7 +263,7 @@ static int parse_source_parameters(AVCodecContext *avctx, GetBitContext *gb,
             /* [DIRAC_STD] 10.3.9.1 Colour primaries */
             if (get_bits1(gb)) {
                 idx = svq3_get_ue_golomb(gb);
-                if (idx < 3)
+                if (idx < 3U)
                     avctx->color_primaries = dirac_primaries[idx];
             }
             /* [DIRAC_STD] 10.3.9.2 Colour matrix */
@@ -302,10 +310,10 @@ int avpriv_dirac_parse_sequence_header(AVCodecContext *avctx, GetBitContext *gb,
     else if (version_major > 2)
         av_log(avctx, AV_LOG_WARNING, "Stream may have unhandled features\n");
 
-    if (video_format > 20)
+    if (video_format > 20U)
         return AVERROR_INVALIDDATA;
 
-    // Fill in defaults for the source parameters.
+    /* Fill in defaults for the source parameters. */
     *source = dirac_source_parameters_defaults[video_format];
 
     /* [DIRAC_STD] 10.3 Source Parameters
@@ -323,7 +331,7 @@ int avpriv_dirac_parse_sequence_header(AVCodecContext *avctx, GetBitContext *gb,
      * currently only used to signal field coding */
     picture_coding_mode = svq3_get_ue_golomb(gb);
     if (picture_coding_mode != 0) {
-        av_log(avctx, AV_LOG_ERROR, "Unsupported picture coding mode %d",
+        av_log(avctx, AV_LOG_ERROR, "Unsupported picture coding mode %d\n",
                picture_coding_mode);
         return AVERROR_INVALIDDATA;
     }
diff --git a/libavcodec/dirac.h b/libavcodec/dirac.h
index e5b79b09d2..b0f955bf46 100644
--- a/libavcodec/dirac.h
+++ b/libavcodec/dirac.h
@@ -1,21 +1,22 @@
 /*
  * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
  * Copyright (C) 2009 David Conrad
+ * Copyright (C) 2011 Jordi Ortiz
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +27,8 @@
  * @file
  * Interface to Dirac Decoder/Encoder
  * @author Marco Gerards <marco@gnu.org>
+ * @author David Conrad
+ * @author Jordi Ortiz
  */
 
 #include "avcodec.h"
diff --git a/libavcodec/dirac_arith.c b/libavcodec/dirac_arith.c
new file mode 100644
index 0000000000..bf913928b3
--- /dev/null
+++ b/libavcodec/dirac_arith.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
+ * Copyright (C) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Arithmetic decoder for Dirac
+ * @author Marco Gerards <marco@gnu.org>
+ */
+
+#include "dirac_arith.h"
+
+
+const uint16_t ff_dirac_prob[256] = {
+    0,    2,    5,    8,    11,   15,   20,   24,
+    29,   35,   41,   47,   53,   60,   67,   74,
+    82,   89,   97,   106,  114,  123,  132,  141,
+    150,  160,  170,  180,  190,  201,  211,  222,
+    233,  244,  256,  267,  279,  291,  303,  315,
+    327,  340,  353,  366,  379,  392,  405,  419,
+    433,  447,  461,  475,  489,  504,  518,  533,
+    548,  563,  578,  593,  609,  624,  640,  656,
+    672,  688,  705,  721,  738,  754,  771,  788,
+    805,  822,  840,  857,  875,  892,  910,  928,
+    946,  964,  983,  1001, 1020, 1038, 1057, 1076,
+    1095, 1114, 1133, 1153, 1172, 1192, 1211, 1231,
+    1251, 1271, 1291, 1311, 1332, 1352, 1373, 1393,
+    1414, 1435, 1456, 1477, 1498, 1520, 1541, 1562,
+    1584, 1606, 1628, 1649, 1671, 1694, 1716, 1738,
+    1760, 1783, 1806, 1828, 1851, 1874, 1897, 1920,
+    1935, 1942, 1949, 1955, 1961, 1968, 1974, 1980,
+    1985, 1991, 1996, 2001, 2006, 2011, 2016, 2021,
+    2025, 2029, 2033, 2037, 2040, 2044, 2047, 2050,
+    2053, 2056, 2058, 2061, 2063, 2065, 2066, 2068,
+    2069, 2070, 2071, 2072, 2072, 2072, 2072, 2072,
+    2072, 2071, 2070, 2069, 2068, 2066, 2065, 2063,
+    2060, 2058, 2055, 2052, 2049, 2045, 2042, 2038,
+    2033, 2029, 2024, 2019, 2013, 2008, 2002, 1996,
+    1989, 1982, 1975, 1968, 1960, 1952, 1943, 1934,
+    1925, 1916, 1906, 1896, 1885, 1874, 1863, 1851,
+    1839, 1827, 1814, 1800, 1786, 1772, 1757, 1742,
+    1727, 1710, 1694, 1676, 1659, 1640, 1622, 1602,
+    1582, 1561, 1540, 1518, 1495, 1471, 1447, 1422,
+    1396, 1369, 1341, 1312, 1282, 1251, 1219, 1186,
+    1151, 1114, 1077, 1037, 995,  952,  906,  857,
+    805,  750,  690,  625,  553,  471,  376,  255
+};
+
+const uint8_t ff_dirac_next_ctx[DIRAC_CTX_COUNT] = {
+    [CTX_ZPZN_F1]   = CTX_ZP_F2,
+    [CTX_ZPNN_F1]   = CTX_ZP_F2,
+    [CTX_ZP_F2]     = CTX_ZP_F3,
+    [CTX_ZP_F3]     = CTX_ZP_F4,
+    [CTX_ZP_F4]     = CTX_ZP_F5,
+    [CTX_ZP_F5]     = CTX_ZP_F6,
+    [CTX_ZP_F6]     = CTX_ZP_F6,
+    [CTX_NPZN_F1]   = CTX_NP_F2,
+    [CTX_NPNN_F1]   = CTX_NP_F2,
+    [CTX_NP_F2]     = CTX_NP_F3,
+    [CTX_NP_F3]     = CTX_NP_F4,
+    [CTX_NP_F4]     = CTX_NP_F5,
+    [CTX_NP_F5]     = CTX_NP_F6,
+    [CTX_NP_F6]     = CTX_NP_F6,
+    [CTX_DELTA_Q_F] = CTX_DELTA_Q_F,
+};
+
+int16_t ff_dirac_prob_branchless[256][2];
+
+void ff_dirac_init_arith_decoder(DiracArith *c, GetBitContext *gb, int length)
+{
+    int i;
+    align_get_bits(gb);
+
+    length = FFMIN(length, get_bits_left(gb)/8);
+
+    c->bytestream     = gb->buffer + get_bits_count(gb)/8;
+    c->bytestream_end = c->bytestream + length;
+    skip_bits_long(gb, length*8);
+
+    c->low = 0;
+    for (i = 0; i < 4; i++) {
+        c->low <<= 8;
+        if (c->bytestream < c->bytestream_end)
+            c->low |= *c->bytestream++;
+        else
+            c->low |= 0xff;
+    }
+
+    c->counter = -16;
+    c->range   = 0xffff;
+
+    for (i = 0; i < 256; i++) {
+        ff_dirac_prob_branchless[i][0] =  ff_dirac_prob[255-i];
+        ff_dirac_prob_branchless[i][1] = -ff_dirac_prob[i];
+    }
+
+    for (i = 0; i < DIRAC_CTX_COUNT; i++)
+        c->contexts[i] = 0x8000;
+}
diff --git a/libavcodec/dirac_arith.h b/libavcodec/dirac_arith.h
new file mode 100644
index 0000000000..003430a48d
--- /dev/null
+++ b/libavcodec/dirac_arith.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
+ * Copyright (C) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Arithmetic decoder for Dirac
+ * @author Marco Gerards <marco@gnu.org>
+ */
+
+#ifndef AVCODEC_DIRAC_ARITH_H
+#define AVCODEC_DIRAC_ARITH_H
+
+#include "libavutil/x86/asm.h"
+#include "bytestream.h"
+#include "get_bits.h"
+
+enum dirac_arith_contexts {
+    CTX_ZPZN_F1,
+    CTX_ZPNN_F1,
+    CTX_NPZN_F1,
+    CTX_NPNN_F1,
+    CTX_ZP_F2,
+    CTX_ZP_F3,
+    CTX_ZP_F4,
+    CTX_ZP_F5,
+    CTX_ZP_F6,
+    CTX_NP_F2,
+    CTX_NP_F3,
+    CTX_NP_F4,
+    CTX_NP_F5,
+    CTX_NP_F6,
+    CTX_COEFF_DATA,
+    CTX_SIGN_NEG,
+    CTX_SIGN_ZERO,
+    CTX_SIGN_POS,
+    CTX_ZERO_BLOCK,
+    CTX_DELTA_Q_F,
+    CTX_DELTA_Q_DATA,
+    CTX_DELTA_Q_SIGN,
+
+    DIRAC_CTX_COUNT
+};
+
+// Dirac resets the arith decoder between decoding various types of data,
+// so many contexts are never used simultaneously. Thus, we can reduce
+// the number of contexts needed by reusing them.
+#define CTX_SB_F1        CTX_ZP_F5
+#define CTX_SB_DATA      0
+#define CTX_PMODE_REF1   0
+#define CTX_PMODE_REF2   1
+#define CTX_GLOBAL_BLOCK 2
+#define CTX_MV_F1        CTX_ZP_F2
+#define CTX_MV_DATA      0
+#define CTX_DC_F1        CTX_ZP_F5
+#define CTX_DC_DATA      0
+
+typedef struct {
+    unsigned low;
+    uint16_t range;
+    int16_t  counter;
+
+    const uint8_t *bytestream;
+    const uint8_t *bytestream_end;
+
+    uint16_t contexts[DIRAC_CTX_COUNT];
+} DiracArith;
+
+extern const uint8_t ff_dirac_next_ctx[DIRAC_CTX_COUNT];
+extern const uint16_t ff_dirac_prob[256];
+extern int16_t ff_dirac_prob_branchless[256][2];
+
+static inline void renorm(DiracArith *c)
+{
+#if HAVE_FAST_CLZ
+    int shift = 14 - av_log2_16bit(c->range-1) + ((c->range-1)>>15);
+
+    c->low    <<= shift;
+    c->range  <<= shift;
+    c->counter += shift;
+#else
+    while (c->range <= 0x4000) {
+        c->low   <<= 1;
+        c->range <<= 1;
+        c->counter++;
+    }
+#endif
+}
+
+static inline void refill(DiracArith *c)
+{
+    int counter = c->counter;
+
+    if (counter >= 0) {
+        int new = bytestream_get_be16(&c->bytestream);
+
+        // the spec defines overread bits to be 1, and streams rely on this
+        if (c->bytestream > c->bytestream_end) {
+            new |= 0xff;
+            if (c->bytestream > c->bytestream_end+1)
+                new |= 0xff00;
+
+            c->bytestream = c->bytestream_end;
+        }
+
+        c->low += new << counter;
+        counter -= 16;
+    }
+    c->counter = counter;
+}
+
+static inline int dirac_get_arith_bit(DiracArith *c, int ctx)
+{
+    int prob_zero = c->contexts[ctx];
+    int range_times_prob, bit;
+    unsigned low = c->low;
+    int    range = c->range;
+
+    range_times_prob = (c->range * prob_zero) >> 16;
+
+#if ARCH_X86 && HAVE_FAST_CMOV && HAVE_INLINE_ASM && HAVE_6REGS
+    low   -= range_times_prob << 16;
+    range -= range_times_prob;
+    bit = 0;
+    __asm__(
+        "cmpl   %5, %4 \n\t"
+        "setae  %b0    \n\t"
+        "cmovb  %3, %2 \n\t"
+        "cmovb  %5, %1 \n\t"
+        : "+q"(bit), "+r"(range), "+r"(low)
+        : "r"(c->low), "r"(c->low>>16),
+          "r"(range_times_prob)
+    );
+#else
+    bit = (low >> 16) >= range_times_prob;
+    if (bit) {
+        low   -= range_times_prob << 16;
+        range -= range_times_prob;
+    } else {
+        range  = range_times_prob;
+    }
+#endif
+
+    c->contexts[ctx] += ff_dirac_prob_branchless[prob_zero>>8][bit];
+    c->low   = low;
+    c->range = range;
+
+    renorm(c);
+    refill(c);
+    return bit;
+}
+
+static inline int dirac_get_arith_uint(DiracArith *c, int follow_ctx, int data_ctx)
+{
+    int ret = 1;
+    while (!dirac_get_arith_bit(c, follow_ctx)) {
+        if (ret >= 0x40000000) {
+            av_log(NULL, AV_LOG_ERROR, "dirac_get_arith_uint overflow\n");
+            return -1;
+        }
+        ret <<= 1;
+        ret += dirac_get_arith_bit(c, data_ctx);
+        follow_ctx = ff_dirac_next_ctx[follow_ctx];
+    }
+    return ret-1;
+}
+
+static inline int dirac_get_arith_int(DiracArith *c, int follow_ctx, int data_ctx)
+{
+    int ret = dirac_get_arith_uint(c, follow_ctx, data_ctx);
+    if (ret && dirac_get_arith_bit(c, data_ctx+1))
+        ret = -ret;
+    return ret;
+}
+
+void ff_dirac_init_arith_decoder(DiracArith *c, GetBitContext *gb, int length);
+
+#endif /* AVCODEC_DIRAC_ARITH_H */
diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c
new file mode 100644
index 0000000000..ee3665e7ce
--- /dev/null
+++ b/libavcodec/dirac_dwt.c
@@ -0,0 +1,561 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2008 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "dirac_dwt.h"
+#include "libavcodec/x86/dirac_dwt.h"
+
+
+static void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (b0[i] + b2[i] + 2) >> 2;
+}
+
+
+static av_always_inline
+void interleave(IDWTELEM *dst, IDWTELEM *src0, IDWTELEM *src1, int w2, int add, int shift)
+{
+    int i;
+    for (i = 0; i < w2; i++) {
+        dst[2*i  ] = (src0[i] + add) >> shift;
+        dst[2*i+1] = (src1[i] + add) >> shift;
+    }
+}
+
+static void horizontal_compose_dirac53i(IDWTELEM *b, IDWTELEM *temp, int w)
+{
+    const int w2 = w >> 1;
+    int x;
+
+    temp[0] = COMPOSE_53iL0(b[w2], b[0], b[w2]);
+    for (x = 1; x < w2; x++) {
+        temp[x     ] = COMPOSE_53iL0     (b[x+w2-1], b[x     ], b[x+w2]);
+        temp[x+w2-1] = COMPOSE_DIRAC53iH0(temp[x-1], b[x+w2-1], temp[x]);
+    }
+    temp[w-1] = COMPOSE_DIRAC53iH0(temp[w2-1], b[w-1], temp[w2-1]);
+
+    interleave(b, temp, temp+w2, w2, 1, 1);
+}
+
+static void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int w)
+{
+    const int w2 = w >> 1;
+    int x;
+
+    tmp[0] = COMPOSE_53iL0(b[w2], b[0], b[w2]);
+    for (x = 1; x < w2; x++)
+        tmp[x] = COMPOSE_53iL0(b[x+w2-1], b[x], b[x+w2]);
+
+    // extend the edges
+    tmp[-1]   = tmp[0];
+    tmp[w2+1] = tmp[w2] = tmp[w2-1];
+
+    for (x = 0; x < w2; x++) {
+        b[2*x  ] = (tmp[x] + 1)>>1;
+        b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+    }
+}
+
+static void horizontal_compose_dd137i(IDWTELEM *b, IDWTELEM *tmp, int w)
+{
+    const int w2 = w >> 1;
+    int x;
+
+    tmp[0] = COMPOSE_DD137iL0(b[w2], b[w2], b[0], b[w2  ], b[w2+1]);
+    tmp[1] = COMPOSE_DD137iL0(b[w2], b[w2], b[1], b[w2+1], b[w2+2]);
+    for (x = 2; x < w2-1; x++)
+        tmp[x] = COMPOSE_DD137iL0(b[x+w2-2], b[x+w2-1], b[x], b[x+w2], b[x+w2+1]);
+    tmp[w2-1] = COMPOSE_DD137iL0(b[w-3], b[w-2], b[w2-1], b[w-1], b[w-1]);
+
+    // extend the edges
+    tmp[-1]   = tmp[0];
+    tmp[w2+1] = tmp[w2] = tmp[w2-1];
+
+    for (x = 0; x < w2; x++) {
+        b[2*x  ] = (tmp[x] + 1)>>1;
+        b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+    }
+}
+
+static av_always_inline
+void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *temp, int w, int shift)
+{
+    const int w2 = w >> 1;
+    int x;
+
+    for (x = 0; x < w2; x++) {
+        temp[x   ] = COMPOSE_HAARiL0(b[x   ], b[x+w2]);
+        temp[x+w2] = COMPOSE_HAARiH0(b[x+w2], temp[x]);
+    }
+
+    interleave(b, temp, temp+w2, w2, shift, shift);
+}
+
+static void horizontal_compose_haar0i(IDWTELEM *b, IDWTELEM *temp, int w)
+{
+    horizontal_compose_haari(b, temp, w, 0);
+}
+
+static void horizontal_compose_haar1i(IDWTELEM *b, IDWTELEM *temp, int w)
+{
+    horizontal_compose_haari(b, temp, w, 1);
+}
+
+static void horizontal_compose_fidelityi(IDWTELEM *b, IDWTELEM *tmp, int w)
+{
+    const int w2 = w >> 1;
+    int i, x;
+    IDWTELEM v[8];
+
+    for (x = 0; x < w2; x++) {
+        for (i = 0; i < 8; i++)
+            v[i] = b[av_clip(x-3+i, 0, w2-1)];
+        tmp[x] = COMPOSE_FIDELITYiH0(v[0], v[1], v[2], v[3], b[x+w2], v[4], v[5], v[6], v[7]);
+    }
+
+    for (x = 0; x < w2; x++) {
+        for (i = 0; i < 8; i++)
+            v[i] = tmp[av_clip(x-4+i, 0, w2-1)];
+        tmp[x+w2] = COMPOSE_FIDELITYiL0(v[0], v[1], v[2], v[3], b[x], v[4], v[5], v[6], v[7]);
+    }
+
+    interleave(b, tmp+w2, tmp, w2, 0, 0);
+}
+
+static void horizontal_compose_daub97i(IDWTELEM *b, IDWTELEM *temp, int w)
+{
+    const int w2 = w >> 1;
+    int x, b0, b1, b2;
+
+    temp[0] = COMPOSE_DAUB97iL1(b[w2], b[0], b[w2]);
+    for (x = 1; x < w2; x++) {
+        temp[x     ] = COMPOSE_DAUB97iL1(b[x+w2-1], b[x     ], b[x+w2]);
+        temp[x+w2-1] = COMPOSE_DAUB97iH1(temp[x-1], b[x+w2-1], temp[x]);
+    }
+    temp[w-1] = COMPOSE_DAUB97iH1(temp[w2-1], b[w-1], temp[w2-1]);
+
+    // second stage combined with interleave and shift
+    b0 = b2 = COMPOSE_DAUB97iL0(temp[w2], temp[0], temp[w2]);
+    b[0] = (b0 + 1) >> 1;
+    for (x = 1; x < w2; x++) {
+        b2 = COMPOSE_DAUB97iL0(temp[x+w2-1], temp[x     ], temp[x+w2]);
+        b1 = COMPOSE_DAUB97iH0(          b0, temp[x+w2-1], b2        );
+        b[2*x-1] = (b1 + 1) >> 1;
+        b[2*x  ] = (b2 + 1) >> 1;
+        b0 = b2;
+    }
+    b[w-1] = (COMPOSE_DAUB97iH0(b2, temp[w-1], b2) + 1) >> 1;
+}
+
+static void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width)
+{
+    int i;
+
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  IDWTELEM *b3, IDWTELEM *b4, int width)
+{
+    int i;
+
+    for(i=0; i<width; i++){
+        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]);
+    }
+}
+
+static void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                      IDWTELEM *b3, IDWTELEM *b4, int width)
+{
+    int i;
+
+    for(i=0; i<width; i++){
+        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]);
+    }
+}
+
+static void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++) {
+        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]);
+        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]);
+    }
+}
+
+static void vertical_compose_fidelityiH0(IDWTELEM *dst, IDWTELEM *b[8], int width)
+{
+    int i;
+
+    for(i=0; i<width; i++){
+        dst[i] = COMPOSE_FIDELITYiH0(b[0][i], b[1][i], b[2][i], b[3][i], dst[i], b[4][i], b[5][i], b[6][i], b[7][i]);
+    }
+}
+
+static void vertical_compose_fidelityiL0(IDWTELEM *dst, IDWTELEM *b[8], int width)
+{
+    int i;
+
+    for(i=0; i<width; i++){
+        dst[i] = COMPOSE_FIDELITYiL0(b[0][i], b[1][i], b[2][i], b[3][i], dst[i], b[4][i], b[5][i], b[6][i], b[7][i]);
+    }
+}
+
+static void vertical_compose_daub97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width)
+{
+    int i;
+
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DAUB97iH0(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void vertical_compose_daub97iH1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width)
+{
+    int i;
+
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DAUB97iH1(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void vertical_compose_daub97iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width)
+{
+    int i;
+
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DAUB97iL0(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void vertical_compose_daub97iL1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width)
+{
+    int i;
+
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DAUB97iL1(b0[i], b1[i], b2[i]);
+    }
+}
+
+
+static void spatial_compose_dd97i_dy(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_3tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_5tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    DWTCompose *cs = d->cs + level;
+
+    int i, y = cs->y;
+    IDWTELEM *b[8];
+    for (i = 0; i < 6; i++)
+        b[i] = cs->b[i];
+    b[6] = d->buffer + av_clip(y+5, 0, height-2)*stride;
+    b[7] = d->buffer + av_clip(y+6, 1, height-1)*stride;
+
+        if(y+5<(unsigned)height) vertical_compose_l0(      b[5], b[6], b[7],       width);
+        if(y+1<(unsigned)height) vertical_compose_h0(b[0], b[2], b[3], b[4], b[6], width);
+
+        if(y-1<(unsigned)height) d->horizontal_compose(b[0], d->temp, width);
+        if(y+0<(unsigned)height) d->horizontal_compose(b[1], d->temp, width);
+
+    for (i = 0; i < 6; i++)
+        cs->b[i] = b[i+2];
+    cs->y += 2;
+}
+
+static void spatial_compose_dirac53i_dy(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_3tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_3tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    DWTCompose *cs = d->cs + level;
+
+    int y= cs->y;
+    IDWTELEM *b[4] = { cs->b[0], cs->b[1] };
+    b[2] = d->buffer + avpriv_mirror(y+1, height-1)*stride;
+    b[3] = d->buffer + avpriv_mirror(y+2, height-1)*stride;
+
+        if(y+1<(unsigned)height) vertical_compose_l0(b[1], b[2], b[3], width);
+        if(y+0<(unsigned)height) vertical_compose_h0(b[0], b[1], b[2], width);
+
+        if(y-1<(unsigned)height) d->horizontal_compose(b[0], d->temp, width);
+        if(y+0<(unsigned)height) d->horizontal_compose(b[1], d->temp, width);
+
+    cs->b[0] = b[2];
+    cs->b[1] = b[3];
+    cs->y += 2;
+}
+
+
+static void spatial_compose_dd137i_dy(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_5tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_5tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    DWTCompose *cs = d->cs + level;
+
+    int i, y = cs->y;
+    IDWTELEM *b[10];
+    for (i = 0; i < 8; i++)
+        b[i] = cs->b[i];
+    b[8] = d->buffer + av_clip(y+7, 0, height-2)*stride;
+    b[9] = d->buffer + av_clip(y+8, 1, height-1)*stride;
+
+        if(y+5<(unsigned)height) vertical_compose_l0(b[3], b[5], b[6], b[7], b[9], width);
+        if(y+1<(unsigned)height) vertical_compose_h0(b[0], b[2], b[3], b[4], b[6], width);
+
+        if(y-1<(unsigned)height) d->horizontal_compose(b[0], d->temp, width);
+        if(y+0<(unsigned)height) d->horizontal_compose(b[1], d->temp, width);
+
+    for (i = 0; i < 8; i++)
+        cs->b[i] = b[i+2];
+    cs->y += 2;
+}
+
+// haar makes the assumption that height is even (always true for dirac)
+static void spatial_compose_haari_dy(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_2tap vertical_compose = (void*)d->vertical_compose;
+    int y = d->cs[level].y;
+    IDWTELEM *b0 = d->buffer + (y-1)*stride;
+    IDWTELEM *b1 = d->buffer + (y  )*stride;
+
+    vertical_compose(b0, b1, width);
+    d->horizontal_compose(b0, d->temp, width);
+    d->horizontal_compose(b1, d->temp, width);
+
+    d->cs[level].y += 2;
+}
+
+// Don't do sliced idwt for fidelity; the 9 tap filter makes it a bit annoying
+// Fortunately, this filter isn't used in practice.
+static void spatial_compose_fidelity(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_9tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_9tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    int i, y;
+    IDWTELEM *b[8];
+
+    for (y = 1; y < height; y += 2) {
+        for (i = 0; i < 8; i++)
+            b[i] = d->buffer + av_clip((y-7 + 2*i), 0, height-2)*stride;
+        vertical_compose_h0(d->buffer + y*stride, b, width);
+    }
+
+    for (y = 0; y < height; y += 2) {
+        for (i = 0; i < 8; i++)
+            b[i] = d->buffer + av_clip((y-7 + 2*i), 1, height-1)*stride;
+        vertical_compose_l0(d->buffer + y*stride, b, width);
+    }
+
+    for (y = 0; y < height; y++)
+        d->horizontal_compose(d->buffer + y*stride, d->temp, width);
+
+    d->cs[level].y = height+1;
+}
+
+static void spatial_compose_daub97i_dy(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_3tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_3tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    vertical_compose_3tap vertical_compose_l1 = (void*)d->vertical_compose_l1;
+    vertical_compose_3tap vertical_compose_h1 = (void*)d->vertical_compose_h1;
+    DWTCompose *cs = d->cs + level;
+
+    int i, y = cs->y;
+    IDWTELEM *b[6];
+    for (i = 0; i < 4; i++)
+        b[i] = cs->b[i];
+    b[4] = d->buffer + avpriv_mirror(y+3, height-1)*stride;
+    b[5] = d->buffer + avpriv_mirror(y+4, height-1)*stride;
+
+        if(y+3<(unsigned)height) vertical_compose_l1(b[3], b[4], b[5], width);
+        if(y+2<(unsigned)height) vertical_compose_h1(b[2], b[3], b[4], width);
+        if(y+1<(unsigned)height) vertical_compose_l0(b[1], b[2], b[3], width);
+        if(y+0<(unsigned)height) vertical_compose_h0(b[0], b[1], b[2], width);
+
+        if(y-1<(unsigned)height) d->horizontal_compose(b[0], d->temp, width);
+        if(y+0<(unsigned)height) d->horizontal_compose(b[1], d->temp, width);
+
+    for (i = 0; i < 4; i++)
+        cs->b[i] = b[i+2];
+    cs->y += 2;
+}
+
+
+static void spatial_compose97i_init2(DWTCompose *cs, IDWTELEM *buffer, int height, int stride)
+{
+    cs->b[0] = buffer + avpriv_mirror(-3-1, height-1)*stride;
+    cs->b[1] = buffer + avpriv_mirror(-3  , height-1)*stride;
+    cs->b[2] = buffer + avpriv_mirror(-3+1, height-1)*stride;
+    cs->b[3] = buffer + avpriv_mirror(-3+2, height-1)*stride;
+    cs->y = -3;
+}
+
+static void spatial_compose53i_init2(DWTCompose *cs, IDWTELEM *buffer, int height, int stride)
+{
+    cs->b[0] = buffer + avpriv_mirror(-1-1, height-1)*stride;
+    cs->b[1] = buffer + avpriv_mirror(-1  , height-1)*stride;
+    cs->y = -1;
+}
+
+static void spatial_compose_dd97i_init(DWTCompose *cs, IDWTELEM *buffer, int height, int stride)
+{
+    cs->b[0] = buffer + av_clip(-5-1, 0, height-2)*stride;
+    cs->b[1] = buffer + av_clip(-5  , 1, height-1)*stride;
+    cs->b[2] = buffer + av_clip(-5+1, 0, height-2)*stride;
+    cs->b[3] = buffer + av_clip(-5+2, 1, height-1)*stride;
+    cs->b[4] = buffer + av_clip(-5+3, 0, height-2)*stride;
+    cs->b[5] = buffer + av_clip(-5+4, 1, height-1)*stride;
+    cs->y = -5;
+}
+
+static void spatial_compose_dd137i_init(DWTCompose *cs, IDWTELEM *buffer, int height, int stride)
+{
+    cs->b[0] = buffer + av_clip(-5-1, 0, height-2)*stride;
+    cs->b[1] = buffer + av_clip(-5  , 1, height-1)*stride;
+    cs->b[2] = buffer + av_clip(-5+1, 0, height-2)*stride;
+    cs->b[3] = buffer + av_clip(-5+2, 1, height-1)*stride;
+    cs->b[4] = buffer + av_clip(-5+3, 0, height-2)*stride;
+    cs->b[5] = buffer + av_clip(-5+4, 1, height-1)*stride;
+    cs->b[6] = buffer + av_clip(-5+5, 0, height-2)*stride;
+    cs->b[7] = buffer + av_clip(-5+6, 1, height-1)*stride;
+    cs->y = -5;
+}
+
+int ff_spatial_idwt_init2(DWTContext *d, IDWTELEM *buffer, int width, int height,
+                          int stride, enum dwt_type type, int decomposition_count,
+                          IDWTELEM *temp)
+{
+    int level;
+
+    d->buffer = buffer;
+    d->width = width;
+    d->height = height;
+    d->stride = stride;
+    d->decomposition_count = decomposition_count;
+    d->temp = temp + 8;
+
+    for(level=decomposition_count-1; level>=0; level--){
+        int hl = height >> level;
+        int stride_l = stride << level;
+
+        switch(type){
+        case DWT_DIRAC_DD9_7:
+            spatial_compose_dd97i_init(d->cs+level, buffer, hl, stride_l);
+            break;
+        case DWT_DIRAC_LEGALL5_3:
+            spatial_compose53i_init2(d->cs+level, buffer, hl, stride_l);
+            break;
+        case DWT_DIRAC_DD13_7:
+            spatial_compose_dd137i_init(d->cs+level, buffer, hl, stride_l);
+            break;
+        case DWT_DIRAC_HAAR0:
+        case DWT_DIRAC_HAAR1:
+            d->cs[level].y = 1;
+            break;
+        case DWT_DIRAC_DAUB9_7:
+            spatial_compose97i_init2(d->cs+level, buffer, hl, stride_l);
+            break;
+        default:
+            d->cs[level].y = 0;
+            break;
+        }
+    }
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->spatial_compose = spatial_compose_dd97i_dy;
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0;
+        d->horizontal_compose = horizontal_compose_dd97i;
+        d->support = 7;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->spatial_compose = spatial_compose_dirac53i_dy;
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0;
+        d->horizontal_compose = horizontal_compose_dirac53i;
+        d->support = 3;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->spatial_compose = spatial_compose_dd137i_dy;
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0;
+        d->horizontal_compose = horizontal_compose_dd137i;
+        d->support = 7;
+        break;
+    case DWT_DIRAC_HAAR0:
+    case DWT_DIRAC_HAAR1:
+        d->spatial_compose = spatial_compose_haari_dy;
+        d->vertical_compose = (void*)vertical_compose_haar;
+        if (type == DWT_DIRAC_HAAR0)
+            d->horizontal_compose = horizontal_compose_haar0i;
+        else
+            d->horizontal_compose = horizontal_compose_haar1i;
+        d->support = 1;
+        break;
+    case DWT_DIRAC_FIDELITY:
+        d->spatial_compose = spatial_compose_fidelity;
+        d->vertical_compose_l0 = (void*)vertical_compose_fidelityiL0;
+        d->vertical_compose_h0 = (void*)vertical_compose_fidelityiH0;
+        d->horizontal_compose = horizontal_compose_fidelityi;
+        d->support = 0; // not really used
+        break;
+    case DWT_DIRAC_DAUB9_7:
+        d->spatial_compose = spatial_compose_daub97i_dy;
+        d->vertical_compose_l0 = (void*)vertical_compose_daub97iL0;
+        d->vertical_compose_h0 = (void*)vertical_compose_daub97iH0;
+        d->vertical_compose_l1 = (void*)vertical_compose_daub97iL1;
+        d->vertical_compose_h1 = (void*)vertical_compose_daub97iH1;
+        d->horizontal_compose = horizontal_compose_daub97i;
+        d->support = 5;
+        break;
+    default:
+        av_log(NULL, AV_LOG_ERROR, "Unknown wavelet type %d\n", type);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (HAVE_MMX) ff_spatial_idwt_init_mmx(d, type);
+
+    return 0;
+}
+
+void ff_spatial_idwt_slice2(DWTContext *d, int y)
+{
+    int level, support = d->support;
+
+    for (level = d->decomposition_count-1; level >= 0; level--) {
+        int wl = d->width  >> level;
+        int hl = d->height >> level;
+        int stride_l = d->stride << level;
+
+        while (d->cs[level].y <= FFMIN((y>>level)+support, hl))
+            d->spatial_compose(d, level, wl, hl, stride_l);
+    }
+}
+
diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h
new file mode 100644
index 0000000000..e5e447b0ac
--- /dev/null
+++ b/libavcodec/dirac_dwt.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DIRAC_DWT_H
+#define AVCODEC_DIRAC_DWT_H
+
+#include <stdint.h>
+
+typedef int DWTELEM;
+typedef short IDWTELEM;
+
+#define MAX_DWT_SUPPORT 8
+#define MAX_DECOMPOSITIONS 8
+
+typedef struct DWTCompose {
+    IDWTELEM *b[MAX_DWT_SUPPORT];
+    int y;
+} DWTCompose;
+
+struct DWTContext;
+
+// Possible prototypes for vertical_compose functions
+typedef void (*vertical_compose_2tap)(IDWTELEM *b0, IDWTELEM *b1, int width);
+typedef void (*vertical_compose_3tap)(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width);
+typedef void (*vertical_compose_5tap)(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width);
+typedef void (*vertical_compose_9tap)(IDWTELEM *dst, IDWTELEM *b[8], int width);
+
+typedef struct DWTContext {
+    IDWTELEM *buffer;
+    IDWTELEM *temp;
+    int width;
+    int height;
+    int stride;
+    int decomposition_count;
+    int support;
+
+    void (*spatial_compose)(struct DWTContext *cs, int level, int width, int height, int stride);
+    void (*vertical_compose_l0)(void);
+    void (*vertical_compose_h0)(void);
+    void (*vertical_compose_l1)(void);
+    void (*vertical_compose_h1)(void);
+    void (*vertical_compose)(void);     ///< one set of lowpass and highpass combined
+    void (*horizontal_compose)(IDWTELEM *b, IDWTELEM *tmp, int width);
+
+    DWTCompose cs[MAX_DECOMPOSITIONS];
+} DWTContext;
+
+enum dwt_type {
+    DWT_SNOW_DAUB9_7,
+    DWT_SNOW_LEGALL5_3,
+    DWT_DIRAC_DD9_7,
+    DWT_DIRAC_LEGALL5_3,
+    DWT_DIRAC_DD13_7,
+    DWT_DIRAC_HAAR0,
+    DWT_DIRAC_HAAR1,
+    DWT_DIRAC_FIDELITY,
+    DWT_DIRAC_DAUB9_7,
+    DWT_NUM_TYPES
+};
+
+// -1 if an error occurred, e.g. the dwt_type isn't recognized
+int ff_spatial_idwt_init2(DWTContext *d, IDWTELEM *buffer, int width, int height,
+                          int stride, enum dwt_type type, int decomposition_count,
+                          IDWTELEM *temp);
+
+void ff_spatial_idwt_slice2(DWTContext *d, int y);
+
+// shared stuff for simd optimizations
+#define COMPOSE_53iL0(b0, b1, b2)\
+    (b1 - ((b0 + b2 + 2) >> 2))
+
+#define COMPOSE_DIRAC53iH0(b0, b1, b2)\
+    (b1 + ((b0 + b2 + 1) >> 1))
+
+#define COMPOSE_DD97iH0(b0, b1, b2, b3, b4)\
+    (b2 + ((-b0 + 9*b1 + 9*b3 - b4 + 8) >> 4))
+
+#define COMPOSE_DD137iL0(b0, b1, b2, b3, b4)\
+    (b2 - ((-b0 + 9*b1 + 9*b3 - b4 + 16) >> 5))
+
+#define COMPOSE_HAARiL0(b0, b1)\
+    (b0 - ((b1 + 1) >> 1))
+
+#define COMPOSE_HAARiH0(b0, b1)\
+    (b0 + b1)
+
+#define COMPOSE_FIDELITYiL0(b0, b1, b2, b3, b4, b5, b6, b7, b8)\
+    (b4 - ((-8*(b0+b8) + 21*(b1+b7) - 46*(b2+b6) + 161*(b3+b5) + 128) >> 8))
+
+#define COMPOSE_FIDELITYiH0(b0, b1, b2, b3, b4, b5, b6, b7, b8)\
+    (b4 + ((-2*(b0+b8) + 10*(b1+b7) - 25*(b2+b6) + 81*(b3+b5) + 128) >> 8))
+
+#define COMPOSE_DAUB97iL1(b0, b1, b2)\
+    (b1 - ((1817*(b0 + b2) + 2048) >> 12))
+
+#define COMPOSE_DAUB97iH1(b0, b1, b2)\
+    (b1 - (( 113*(b0 + b2) + 64) >> 7))
+
+#define COMPOSE_DAUB97iL0(b0, b1, b2)\
+    (b1 + (( 217*(b0 + b2) + 2048) >> 12))
+
+#define COMPOSE_DAUB97iH0(b0, b1, b2)\
+    (b1 + ((6497*(b0 + b2) + 2048) >> 12))
+
+
+#endif /* AVCODEC_DWT_H */
diff --git a/libavcodec/dirac_parser.c b/libavcodec/dirac_parser.c
index 5c9d266480..45ded5a779 100644
--- a/libavcodec/dirac_parser.c
+++ b/libavcodec/dirac_parser.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2007-2008 Marco Gerards <marco@gnu.org>
  * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -139,6 +139,8 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
         void *new_buffer =
             av_fast_realloc(pc->buffer, &pc->buffer_size,
                             pc->index + (*buf_size - pc->sync_offset));
+        if (!new_buffer)
+            return AVERROR(ENOMEM);
         pc->buffer = new_buffer;
         memcpy(pc->buffer + pc->index, (*buf + pc->sync_offset),
                *buf_size - pc->sync_offset);
@@ -149,6 +151,8 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
         DiracParseUnit pu1, pu;
         void *new_buffer = av_fast_realloc(pc->buffer, &pc->buffer_size,
                                            pc->index + next);
+        if (!new_buffer)
+            return AVERROR(ENOMEM);
         pc->buffer = new_buffer;
         memcpy(pc->buffer + pc->index, *buf, next);
         pc->index += next;
@@ -161,7 +165,9 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
          * we can be pretty sure that we have a valid parse unit */
         if (!unpack_parse_unit(&pu1, pc, pc->index - 13)                     ||
             !unpack_parse_unit(&pu, pc, pc->index - 13 - pu1.prev_pu_offset) ||
-            pu.next_pu_offset != pu1.prev_pu_offset) {
+            pu.next_pu_offset != pu1.prev_pu_offset                          ||
+            pc->index < pc->dirac_unit_size + 13LL + pu1.prev_pu_offset
+        ) {
             pc->index              -= 9;
             *buf_size               = next - 9;
             pc->header_bytes_needed = 9;
@@ -245,7 +251,7 @@ static void dirac_parse_close(AVCodecParserContext *s)
     DiracParseContext *pc = s->priv_data;
 
     if (pc->buffer_size > 0)
-        av_free(pc->buffer);
+        av_freep(&pc->buffer);
 }
 
 AVCodecParser ff_dirac_parser = {
diff --git a/libavcodec/diracdec.c b/libavcodec/diracdec.c
new file mode 100644
index 0000000000..ea16007f1d
--- /dev/null
+++ b/libavcodec/diracdec.c
@@ -0,0 +1,2046 @@
+/*
+ * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
+ * Copyright (C) 2009 David Conrad
+ * Copyright (C) 2011 Jordi Ortiz
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Dirac Decoder
+ * @author Marco Gerards <marco@gnu.org>, David Conrad, Jordi Ortiz <nenjordi@gmail.com>
+ */
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "golomb.h"
+#include "dirac_arith.h"
+#include "mpeg12data.h"
+#include "libavcodec/mpegvideo.h"
+#include "mpegvideoencdsp.h"
+#include "dirac_dwt.h"
+#include "dirac.h"
+#include "diracdsp.h"
+#include "videodsp.h"
+
+/**
+ * The spec limits the number of wavelet decompositions to 4 for both
+ * level 1 (VC-2) and 128 (long-gop default).
+ * 5 decompositions is the maximum before >16-bit buffers are needed.
+ * Schroedinger allows this for DD 9,7 and 13,7 wavelets only, limiting
+ * the others to 4 decompositions (or 3 for the fidelity filter).
+ *
+ * We use this instead of MAX_DECOMPOSITIONS to save some memory.
+ */
+#define MAX_DWT_LEVELS 5
+
+/**
+ * The spec limits this to 3 for frame coding, but in practice can be as high as 6
+ */
+#define MAX_REFERENCE_FRAMES 8
+#define MAX_DELAY 5         /* limit for main profile for frame coding (TODO: field coding) */
+#define MAX_FRAMES (MAX_REFERENCE_FRAMES + MAX_DELAY + 1)
+#define MAX_QUANT 68        /* max quant for VC-2 */
+#define MAX_BLOCKSIZE 32    /* maximum xblen/yblen we support */
+
+/**
+ * DiracBlock->ref flags, if set then the block does MC from the given ref
+ */
+#define DIRAC_REF_MASK_REF1   1
+#define DIRAC_REF_MASK_REF2   2
+#define DIRAC_REF_MASK_GLOBAL 4
+
+/**
+ * Value of Picture.reference when Picture is not a reference picture, but
+ * is held for delayed output.
+ */
+#define DELAYED_PIC_REF 4
+
+#define CALC_PADDING(size, depth)                       \
+    (((size + (1 << depth) - 1) >> depth) << depth)
+
+#define DIVRNDUP(a, b) (((a) + (b) - 1) / (b))
+
+typedef struct {
+    AVFrame *avframe;
+    int interpolated[3];    /* 1 if hpel[] is valid */
+    uint8_t *hpel[3][4];
+    uint8_t *hpel_base[3][4];
+    int reference;
+} DiracFrame;
+
+typedef struct {
+    union {
+        int16_t mv[2][2];
+        int16_t dc[3];
+    } u; /* anonymous unions aren't in C99 :( */
+    uint8_t ref;
+} DiracBlock;
+
+typedef struct SubBand {
+    int level;
+    int orientation;
+    int stride;
+    int width;
+    int height;
+    int quant;
+    IDWTELEM *ibuf;
+    struct SubBand *parent;
+
+    /* for low delay */
+    unsigned length;
+    const uint8_t *coeff_data;
+} SubBand;
+
+typedef struct Plane {
+    int width;
+    int height;
+    ptrdiff_t stride;
+
+    int idwt_width;
+    int idwt_height;
+    int idwt_stride;
+    IDWTELEM *idwt_buf;
+    IDWTELEM *idwt_buf_base;
+    IDWTELEM *idwt_tmp;
+
+    /* block length */
+    uint8_t xblen;
+    uint8_t yblen;
+    /* block separation (block n+1 starts after this many pixels in block n) */
+    uint8_t xbsep;
+    uint8_t ybsep;
+    /* amount of overspill on each edge (half of the overlap between blocks) */
+    uint8_t xoffset;
+    uint8_t yoffset;
+
+    SubBand band[MAX_DWT_LEVELS][4];
+} Plane;
+
+typedef struct DiracContext {
+    AVCodecContext *avctx;
+    MpegvideoEncDSPContext mpvencdsp;
+    VideoDSPContext vdsp;
+    DiracDSPContext diracdsp;
+    GetBitContext gb;
+    dirac_source_params source;
+    int seen_sequence_header;
+    int frame_number;           /* number of the next frame to display       */
+    Plane plane[3];
+    int chroma_x_shift;
+    int chroma_y_shift;
+
+    int zero_res;               /* zero residue flag                         */
+    int is_arith;               /* whether coeffs use arith or golomb coding */
+    int low_delay;              /* use the low delay syntax                  */
+    int globalmc_flag;          /* use global motion compensation            */
+    int num_refs;               /* number of reference pictures              */
+
+    /* wavelet decoding */
+    unsigned wavelet_depth;     /* depth of the IDWT                         */
+    unsigned wavelet_idx;
+
+    /**
+     * schroedinger older than 1.0.8 doesn't store
+     * quant delta if only one codebook exists in a band
+     */
+    unsigned old_delta_quant;
+    unsigned codeblock_mode;
+
+    struct {
+        unsigned width;
+        unsigned height;
+    } codeblock[MAX_DWT_LEVELS+1];
+
+    struct {
+        unsigned num_x;         /* number of horizontal slices               */
+        unsigned num_y;         /* number of vertical slices                 */
+        AVRational bytes;       /* average bytes per slice                   */
+        uint8_t quant[MAX_DWT_LEVELS][4]; /* [DIRAC_STD] E.1 */
+    } lowdelay;
+
+    struct {
+        int pan_tilt[2];        /* pan/tilt vector                           */
+        int zrs[2][2];          /* zoom/rotate/shear matrix                  */
+        int perspective[2];     /* perspective vector                        */
+        unsigned zrs_exp;
+        unsigned perspective_exp;
+    } globalmc[2];
+
+    /* motion compensation */
+    uint8_t mv_precision;       /* [DIRAC_STD] REFS_WT_PRECISION             */
+    int16_t weight[2];          /* [DIRAC_STD] REF1_WT and REF2_WT           */
+    unsigned weight_log2denom;  /* [DIRAC_STD] REFS_WT_PRECISION             */
+
+    int blwidth;                /* number of blocks (horizontally)           */
+    int blheight;               /* number of blocks (vertically)             */
+    int sbwidth;                /* number of superblocks (horizontally)      */
+    int sbheight;               /* number of superblocks (vertically)        */
+
+    uint8_t *sbsplit;
+    DiracBlock *blmotion;
+
+    uint8_t *edge_emu_buffer[4];
+    uint8_t *edge_emu_buffer_base;
+
+    uint16_t *mctmp;            /* buffer holding the MC data multiplied by OBMC weights */
+    uint8_t *mcscratch;
+    int buffer_stride;
+
+    DECLARE_ALIGNED(16, uint8_t, obmc_weight)[3][MAX_BLOCKSIZE*MAX_BLOCKSIZE];
+
+    void (*put_pixels_tab[4])(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+    void (*avg_pixels_tab[4])(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+    void (*add_obmc)(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+    dirac_weight_func weight_func;
+    dirac_biweight_func biweight_func;
+
+    DiracFrame *current_picture;
+    DiracFrame *ref_pics[2];
+
+    DiracFrame *ref_frames[MAX_REFERENCE_FRAMES+1];
+    DiracFrame *delay_frames[MAX_DELAY+1];
+    DiracFrame all_frames[MAX_FRAMES];
+} DiracContext;
+
+/**
+ * Dirac Specification ->
+ * Parse code values. 9.6.1 Table 9.1
+ */
+enum dirac_parse_code {
+    pc_seq_header         = 0x00,
+    pc_eos                = 0x10,
+    pc_aux_data           = 0x20,
+    pc_padding            = 0x30,
+};
+
+enum dirac_subband {
+    subband_ll = 0,
+    subband_hl = 1,
+    subband_lh = 2,
+    subband_hh = 3,
+    subband_nb,
+};
+
+static const uint8_t default_qmat[][4][4] = {
+    { { 5,  3,  3,  0}, { 0,  4,  4,  1}, { 0,  5,  5,  2}, { 0,  6,  6,  3} },
+    { { 4,  2,  2,  0}, { 0,  4,  4,  2}, { 0,  5,  5,  3}, { 0,  7,  7,  5} },
+    { { 5,  3,  3,  0}, { 0,  4,  4,  1}, { 0,  5,  5,  2}, { 0,  6,  6,  3} },
+    { { 8,  4,  4,  0}, { 0,  4,  4,  0}, { 0,  4,  4,  0}, { 0,  4,  4,  0} },
+    { { 8,  4,  4,  0}, { 0,  4,  4,  0}, { 0,  4,  4,  0}, { 0,  4,  4,  0} },
+    { { 0,  4,  4,  8}, { 0,  8,  8, 12}, { 0, 13, 13, 17}, { 0, 17, 17, 21} },
+    { { 3,  1,  1,  0}, { 0,  4,  4,  2}, { 0,  6,  6,  5}, { 0,  9,  9,  7} },
+};
+
+static const int qscale_tab[MAX_QUANT+1] = {
+    4,     5,     6,     7,     8,    10,    11,    13,
+    16,    19,    23,    27,    32,    38,    45,    54,
+    64,    76,    91,   108,   128,   152,   181,   215,
+    256,   304,   362,   431,   512,   609,   724,   861,
+    1024,  1218,  1448,  1722,  2048,  2435,  2896,  3444,
+    4096,  4871,  5793,  6889,  8192,  9742, 11585, 13777,
+    16384, 19484, 23170, 27554, 32768, 38968, 46341, 55109,
+    65536, 77936
+};
+
+static const int qoffset_intra_tab[MAX_QUANT+1] = {
+    1,     2,     3,     4,     4,     5,     6,     7,
+    8,    10,    12,    14,    16,    19,    23,    27,
+    32,    38,    46,    54,    64,    76,    91,   108,
+    128,   152,   181,   216,   256,   305,   362,   431,
+    512,   609,   724,   861,  1024,  1218,  1448,  1722,
+    2048,  2436,  2897,  3445,  4096,  4871,  5793,  6889,
+    8192,  9742, 11585, 13777, 16384, 19484, 23171, 27555,
+    32768, 38968
+};
+
+static const int qoffset_inter_tab[MAX_QUANT+1] = {
+    1,     2,     2,     3,     3,     4,     4,     5,
+    6,     7,     9,    10,    12,    14,    17,    20,
+    24,    29,    34,    41,    48,    57,    68,    81,
+    96,   114,   136,   162,   192,   228,   272,   323,
+    384,   457,   543,   646,   768,   913,  1086,  1292,
+    1536,  1827,  2172,  2583,  3072,  3653,  4344,  5166,
+    6144,  7307,  8689, 10333, 12288, 14613, 17378, 20666,
+    24576, 29226
+};
+
+/* magic number division by 3 from schroedinger */
+static inline int divide3(int x)
+{
+    return ((x+1)*21845 + 10922) >> 16;
+}
+
+static DiracFrame *remove_frame(DiracFrame *framelist[], int picnum)
+{
+    DiracFrame *remove_pic = NULL;
+    int i, remove_idx = -1;
+
+    for (i = 0; framelist[i]; i++)
+        if (framelist[i]->avframe->display_picture_number == picnum) {
+            remove_pic = framelist[i];
+            remove_idx = i;
+        }
+
+    if (remove_pic)
+        for (i = remove_idx; framelist[i]; i++)
+            framelist[i] = framelist[i+1];
+
+    return remove_pic;
+}
+
+static int add_frame(DiracFrame *framelist[], int maxframes, DiracFrame *frame)
+{
+    int i;
+    for (i = 0; i < maxframes; i++)
+        if (!framelist[i]) {
+            framelist[i] = frame;
+            return 0;
+        }
+    return -1;
+}
+
+static int alloc_sequence_buffers(DiracContext *s)
+{
+    int sbwidth  = DIVRNDUP(s->source.width,  4);
+    int sbheight = DIVRNDUP(s->source.height, 4);
+    int i, w, h, top_padding;
+
+    /* todo: think more about this / use or set Plane here */
+    for (i = 0; i < 3; i++) {
+        int max_xblen = MAX_BLOCKSIZE >> (i ? s->chroma_x_shift : 0);
+        int max_yblen = MAX_BLOCKSIZE >> (i ? s->chroma_y_shift : 0);
+        w = s->source.width  >> (i ? s->chroma_x_shift : 0);
+        h = s->source.height >> (i ? s->chroma_y_shift : 0);
+
+        /* we allocate the max we support here since num decompositions can
+         * change from frame to frame. Stride is aligned to 16 for SIMD, and
+         * 1<<MAX_DWT_LEVELS top padding to avoid if(y>0) in arith decoding
+         * MAX_BLOCKSIZE padding for MC: blocks can spill up to half of that
+         * on each side */
+        top_padding = FFMAX(1<<MAX_DWT_LEVELS, max_yblen/2);
+        w = FFALIGN(CALC_PADDING(w, MAX_DWT_LEVELS), 8); /* FIXME: Should this be 16 for SSE??? */
+        h = top_padding + CALC_PADDING(h, MAX_DWT_LEVELS) + max_yblen/2;
+
+        s->plane[i].idwt_buf_base = av_mallocz_array((w+max_xblen), h * sizeof(IDWTELEM));
+        s->plane[i].idwt_tmp      = av_malloc_array((w+16), sizeof(IDWTELEM));
+        s->plane[i].idwt_buf      = s->plane[i].idwt_buf_base + top_padding*w;
+        if (!s->plane[i].idwt_buf_base || !s->plane[i].idwt_tmp)
+            return AVERROR(ENOMEM);
+    }
+
+    /* fixme: allocate using real stride here */
+    s->sbsplit  = av_malloc_array(sbwidth, sbheight);
+    s->blmotion = av_malloc_array(sbwidth, sbheight * 16 * sizeof(*s->blmotion));
+
+    if (!s->sbsplit || !s->blmotion)
+        return AVERROR(ENOMEM);
+    return 0;
+}
+
+static int alloc_buffers(DiracContext *s, int stride)
+{
+    int w = s->source.width;
+    int h = s->source.height;
+
+    av_assert0(stride >= w);
+    stride += 64;
+
+    if (s->buffer_stride >= stride)
+        return 0;
+    s->buffer_stride = 0;
+
+    av_freep(&s->edge_emu_buffer_base);
+    memset(s->edge_emu_buffer, 0, sizeof(s->edge_emu_buffer));
+    av_freep(&s->mctmp);
+    av_freep(&s->mcscratch);
+
+    s->edge_emu_buffer_base = av_malloc_array(stride, MAX_BLOCKSIZE);
+
+    s->mctmp     = av_malloc_array((stride+MAX_BLOCKSIZE), (h+MAX_BLOCKSIZE) * sizeof(*s->mctmp));
+    s->mcscratch = av_malloc_array(stride, MAX_BLOCKSIZE);
+
+    if (!s->edge_emu_buffer_base || !s->mctmp || !s->mcscratch)
+        return AVERROR(ENOMEM);
+
+    s->buffer_stride = stride;
+    return 0;
+}
+
+static void free_sequence_buffers(DiracContext *s)
+{
+    int i, j, k;
+
+    for (i = 0; i < MAX_FRAMES; i++) {
+        if (s->all_frames[i].avframe->data[0]) {
+            av_frame_unref(s->all_frames[i].avframe);
+            memset(s->all_frames[i].interpolated, 0, sizeof(s->all_frames[i].interpolated));
+        }
+
+        for (j = 0; j < 3; j++)
+            for (k = 1; k < 4; k++)
+                av_freep(&s->all_frames[i].hpel_base[j][k]);
+    }
+
+    memset(s->ref_frames, 0, sizeof(s->ref_frames));
+    memset(s->delay_frames, 0, sizeof(s->delay_frames));
+
+    for (i = 0; i < 3; i++) {
+        av_freep(&s->plane[i].idwt_buf_base);
+        av_freep(&s->plane[i].idwt_tmp);
+    }
+
+    s->buffer_stride = 0;
+    av_freep(&s->sbsplit);
+    av_freep(&s->blmotion);
+    av_freep(&s->edge_emu_buffer_base);
+
+    av_freep(&s->mctmp);
+    av_freep(&s->mcscratch);
+}
+
+static av_cold int dirac_decode_init(AVCodecContext *avctx)
+{
+    DiracContext *s = avctx->priv_data;
+    int i;
+
+    s->avctx = avctx;
+    s->frame_number = -1;
+
+    ff_diracdsp_init(&s->diracdsp);
+    ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
+    ff_videodsp_init(&s->vdsp, 8);
+
+    for (i = 0; i < MAX_FRAMES; i++) {
+        s->all_frames[i].avframe = av_frame_alloc();
+        if (!s->all_frames[i].avframe) {
+            while (i > 0)
+                av_frame_free(&s->all_frames[--i].avframe);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
+static void dirac_decode_flush(AVCodecContext *avctx)
+{
+    DiracContext *s = avctx->priv_data;
+    free_sequence_buffers(s);
+    s->seen_sequence_header = 0;
+    s->frame_number = -1;
+}
+
+static av_cold int dirac_decode_end(AVCodecContext *avctx)
+{
+    DiracContext *s = avctx->priv_data;
+    int i;
+
+    dirac_decode_flush(avctx);
+    for (i = 0; i < MAX_FRAMES; i++)
+        av_frame_free(&s->all_frames[i].avframe);
+
+    return 0;
+}
+
+#define SIGN_CTX(x) (CTX_SIGN_ZERO + ((x) > 0) - ((x) < 0))
+
+static inline void coeff_unpack_arith(DiracArith *c, int qfactor, int qoffset,
+                                      SubBand *b, IDWTELEM *buf, int x, int y)
+{
+    int coeff, sign;
+    int sign_pred = 0;
+    int pred_ctx = CTX_ZPZN_F1;
+
+    /* Check if the parent subband has a 0 in the corresponding position */
+    if (b->parent)
+        pred_ctx += !!b->parent->ibuf[b->parent->stride * (y>>1) + (x>>1)] << 1;
+
+    if (b->orientation == subband_hl)
+        sign_pred = buf[-b->stride];
+
+    /* Determine if the pixel has only zeros in its neighbourhood */
+    if (x) {
+        pred_ctx += !(buf[-1] | buf[-b->stride] | buf[-1-b->stride]);
+        if (b->orientation == subband_lh)
+            sign_pred = buf[-1];
+    } else {
+        pred_ctx += !buf[-b->stride];
+    }
+
+    coeff = dirac_get_arith_uint(c, pred_ctx, CTX_COEFF_DATA);
+    if (coeff) {
+        coeff = (coeff * qfactor + qoffset + 2) >> 2;
+        sign  = dirac_get_arith_bit(c, SIGN_CTX(sign_pred));
+        coeff = (coeff ^ -sign) + sign;
+    }
+    *buf = coeff;
+}
+
+static inline int coeff_unpack_golomb(GetBitContext *gb, int qfactor, int qoffset)
+{
+    int sign, coeff;
+
+    coeff = svq3_get_ue_golomb(gb);
+    if (coeff) {
+        coeff = (coeff * qfactor + qoffset + 2) >> 2;
+        sign  = get_bits1(gb);
+        coeff = (coeff ^ -sign) + sign;
+    }
+    return coeff;
+}
+
+/**
+ * Decode the coeffs in the rectangle defined by left, right, top, bottom
+ * [DIRAC_STD] 13.4.3.2 Codeblock unpacking loop. codeblock()
+ */
+static inline void codeblock(DiracContext *s, SubBand *b,
+                             GetBitContext *gb, DiracArith *c,
+                             int left, int right, int top, int bottom,
+                             int blockcnt_one, int is_arith)
+{
+    int x, y, zero_block;
+    int qoffset, qfactor;
+    IDWTELEM *buf;
+
+    /* check for any coded coefficients in this codeblock */
+    if (!blockcnt_one) {
+        if (is_arith)
+            zero_block = dirac_get_arith_bit(c, CTX_ZERO_BLOCK);
+        else
+            zero_block = get_bits1(gb);
+
+        if (zero_block)
+            return;
+    }
+
+    if (s->codeblock_mode && !(s->old_delta_quant && blockcnt_one)) {
+        int quant = b->quant;
+        if (is_arith)
+            quant += dirac_get_arith_int(c, CTX_DELTA_Q_F, CTX_DELTA_Q_DATA);
+        else
+            quant += dirac_get_se_golomb(gb);
+        if (quant < 0) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid quant\n");
+            return;
+        }
+        b->quant = quant;
+    }
+
+    b->quant = FFMIN(b->quant, MAX_QUANT);
+
+    qfactor = qscale_tab[b->quant];
+    /* TODO: context pointer? */
+    if (!s->num_refs)
+        qoffset = qoffset_intra_tab[b->quant];
+    else
+        qoffset = qoffset_inter_tab[b->quant];
+
+    buf = b->ibuf + top * b->stride;
+    for (y = top; y < bottom; y++) {
+        for (x = left; x < right; x++) {
+            /* [DIRAC_STD] 13.4.4 Subband coefficients. coeff_unpack() */
+            if (is_arith)
+                coeff_unpack_arith(c, qfactor, qoffset, b, buf+x, x, y);
+            else
+                buf[x] = coeff_unpack_golomb(gb, qfactor, qoffset);
+        }
+        buf += b->stride;
+    }
+}
+
+/**
+ * Dirac Specification ->
+ * 13.3 intra_dc_prediction(band)
+ */
+static inline void intra_dc_prediction(SubBand *b)
+{
+    IDWTELEM *buf = b->ibuf;
+    int x, y;
+
+    for (x = 1; x < b->width; x++)
+        buf[x] += buf[x-1];
+    buf += b->stride;
+
+    for (y = 1; y < b->height; y++) {
+        buf[0] += buf[-b->stride];
+
+        for (x = 1; x < b->width; x++) {
+            int pred = buf[x - 1] + buf[x - b->stride] + buf[x - b->stride-1];
+            buf[x]  += divide3(pred);
+        }
+        buf += b->stride;
+    }
+}
+
+/**
+ * Dirac Specification ->
+ * 13.4.2 Non-skipped subbands.  subband_coeffs()
+ */
+static av_always_inline void decode_subband_internal(DiracContext *s, SubBand *b, int is_arith)
+{
+    int cb_x, cb_y, left, right, top, bottom;
+    DiracArith c;
+    GetBitContext gb;
+    int cb_width  = s->codeblock[b->level + (b->orientation != subband_ll)].width;
+    int cb_height = s->codeblock[b->level + (b->orientation != subband_ll)].height;
+    int blockcnt_one = (cb_width + cb_height) == 2;
+
+    if (!b->length)
+        return;
+
+    init_get_bits8(&gb, b->coeff_data, b->length);
+
+    if (is_arith)
+        ff_dirac_init_arith_decoder(&c, &gb, b->length);
+
+    top = 0;
+    for (cb_y = 0; cb_y < cb_height; cb_y++) {
+        bottom = (b->height * (cb_y+1LL)) / cb_height;
+        left = 0;
+        for (cb_x = 0; cb_x < cb_width; cb_x++) {
+            right = (b->width * (cb_x+1LL)) / cb_width;
+            codeblock(s, b, &gb, &c, left, right, top, bottom, blockcnt_one, is_arith);
+            left = right;
+        }
+        top = bottom;
+    }
+
+    if (b->orientation == subband_ll && s->num_refs == 0)
+        intra_dc_prediction(b);
+}
+
+static int decode_subband_arith(AVCodecContext *avctx, void *b)
+{
+    DiracContext *s = avctx->priv_data;
+    decode_subband_internal(s, b, 1);
+    return 0;
+}
+
+static int decode_subband_golomb(AVCodecContext *avctx, void *arg)
+{
+    DiracContext *s = avctx->priv_data;
+    SubBand **b     = arg;
+    decode_subband_internal(s, *b, 0);
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * [DIRAC_STD] 13.4.1 core_transform_data()
+ */
+static void decode_component(DiracContext *s, int comp)
+{
+    AVCodecContext *avctx = s->avctx;
+    SubBand *bands[3*MAX_DWT_LEVELS+1];
+    enum dirac_subband orientation;
+    int level, num_bands = 0;
+
+    /* Unpack all subbands at all levels. */
+    for (level = 0; level < s->wavelet_depth; level++) {
+        for (orientation = !!level; orientation < 4; orientation++) {
+            SubBand *b = &s->plane[comp].band[level][orientation];
+            bands[num_bands++] = b;
+
+            align_get_bits(&s->gb);
+            /* [DIRAC_STD] 13.4.2 subband() */
+            b->length = svq3_get_ue_golomb(&s->gb);
+            if (b->length) {
+                b->quant = svq3_get_ue_golomb(&s->gb);
+                align_get_bits(&s->gb);
+                b->coeff_data = s->gb.buffer + get_bits_count(&s->gb)/8;
+                b->length = FFMIN(b->length, FFMAX(get_bits_left(&s->gb)/8, 0));
+                skip_bits_long(&s->gb, b->length*8);
+            }
+        }
+        /* arithmetic coding has inter-level dependencies, so we can only execute one level at a time */
+        if (s->is_arith)
+            avctx->execute(avctx, decode_subband_arith, &s->plane[comp].band[level][!!level],
+                           NULL, 4-!!level, sizeof(SubBand));
+    }
+    /* golomb coding has no inter-level dependencies, so we can execute all subbands in parallel */
+    if (!s->is_arith)
+        avctx->execute(avctx, decode_subband_golomb, bands, NULL, num_bands, sizeof(SubBand*));
+}
+
+/* [DIRAC_STD] 13.5.5.2 Luma slice subband data. luma_slice_band(level,orient,sx,sy) --> if b2 == NULL */
+/* [DIRAC_STD] 13.5.5.3 Chroma slice subband data. chroma_slice_band(level,orient,sx,sy) --> if b2 != NULL */
+static void lowdelay_subband(DiracContext *s, GetBitContext *gb, int quant,
+                             int slice_x, int slice_y, int bits_end,
+                             SubBand *b1, SubBand *b2)
+{
+    int left   = b1->width  * slice_x    / s->lowdelay.num_x;
+    int right  = b1->width  *(slice_x+1) / s->lowdelay.num_x;
+    int top    = b1->height * slice_y    / s->lowdelay.num_y;
+    int bottom = b1->height *(slice_y+1) / s->lowdelay.num_y;
+
+    int qfactor = qscale_tab[FFMIN(quant, MAX_QUANT)];
+    int qoffset = qoffset_intra_tab[FFMIN(quant, MAX_QUANT)];
+
+    IDWTELEM *buf1 =      b1->ibuf + top * b1->stride;
+    IDWTELEM *buf2 = b2 ? b2->ibuf + top * b2->stride : NULL;
+    int x, y;
+    /* we have to constantly check for overread since the spec explicitly
+       requires this, with the meaning that all remaining coeffs are set to 0 */
+    if (get_bits_count(gb) >= bits_end)
+        return;
+
+    for (y = top; y < bottom; y++) {
+        for (x = left; x < right; x++) {
+            buf1[x] = coeff_unpack_golomb(gb, qfactor, qoffset);
+            if (get_bits_count(gb) >= bits_end)
+                return;
+            if (buf2) {
+                buf2[x] = coeff_unpack_golomb(gb, qfactor, qoffset);
+                if (get_bits_count(gb) >= bits_end)
+                    return;
+            }
+        }
+        buf1 += b1->stride;
+        if (buf2)
+            buf2 += b2->stride;
+    }
+}
+
+struct lowdelay_slice {
+    GetBitContext gb;
+    int slice_x;
+    int slice_y;
+    int bytes;
+};
+
+
+/**
+ * Dirac Specification ->
+ * 13.5.2 Slices. slice(sx,sy)
+ */
+static int decode_lowdelay_slice(AVCodecContext *avctx, void *arg)
+{
+    DiracContext *s = avctx->priv_data;
+    struct lowdelay_slice *slice = arg;
+    GetBitContext *gb = &slice->gb;
+    enum dirac_subband orientation;
+    int level, quant, chroma_bits, chroma_end;
+
+    int quant_base  = get_bits(gb, 7); /*[DIRAC_STD] qindex */
+    int length_bits = av_log2(8 * slice->bytes)+1;
+    int luma_bits   = get_bits_long(gb, length_bits);
+    int luma_end    = get_bits_count(gb) + FFMIN(luma_bits, get_bits_left(gb));
+
+    /* [DIRAC_STD] 13.5.5.2 luma_slice_band */
+    for (level = 0; level < s->wavelet_depth; level++)
+        for (orientation = !!level; orientation < 4; orientation++) {
+            quant = FFMAX(quant_base - s->lowdelay.quant[level][orientation], 0);
+            lowdelay_subband(s, gb, quant, slice->slice_x, slice->slice_y, luma_end,
+                             &s->plane[0].band[level][orientation], NULL);
+        }
+
+    /* consume any unused bits from luma */
+    skip_bits_long(gb, get_bits_count(gb) - luma_end);
+
+    chroma_bits = 8*slice->bytes - 7 - length_bits - luma_bits;
+    chroma_end  = get_bits_count(gb) + FFMIN(chroma_bits, get_bits_left(gb));
+    /* [DIRAC_STD] 13.5.5.3 chroma_slice_band */
+    for (level = 0; level < s->wavelet_depth; level++)
+        for (orientation = !!level; orientation < 4; orientation++) {
+            quant = FFMAX(quant_base - s->lowdelay.quant[level][orientation], 0);
+            lowdelay_subband(s, gb, quant, slice->slice_x, slice->slice_y, chroma_end,
+                             &s->plane[1].band[level][orientation],
+                             &s->plane[2].band[level][orientation]);
+        }
+
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 13.5.1 low_delay_transform_data()
+ */
+static int decode_lowdelay(DiracContext *s)
+{
+    AVCodecContext *avctx = s->avctx;
+    int slice_x, slice_y, bytes, bufsize;
+    const uint8_t *buf;
+    struct lowdelay_slice *slices;
+    int slice_num = 0;
+
+    slices = av_mallocz_array(s->lowdelay.num_x, s->lowdelay.num_y * sizeof(struct lowdelay_slice));
+    if (!slices)
+        return AVERROR(ENOMEM);
+
+    align_get_bits(&s->gb);
+    /*[DIRAC_STD] 13.5.2 Slices. slice(sx,sy) */
+    buf = s->gb.buffer + get_bits_count(&s->gb)/8;
+    bufsize = get_bits_left(&s->gb);
+
+    for (slice_y = 0; bufsize > 0 && slice_y < s->lowdelay.num_y; slice_y++)
+        for (slice_x = 0; bufsize > 0 && slice_x < s->lowdelay.num_x; slice_x++) {
+            bytes = (slice_num+1) * s->lowdelay.bytes.num / s->lowdelay.bytes.den
+                - slice_num    * s->lowdelay.bytes.num / s->lowdelay.bytes.den;
+
+            slices[slice_num].bytes   = bytes;
+            slices[slice_num].slice_x = slice_x;
+            slices[slice_num].slice_y = slice_y;
+            init_get_bits(&slices[slice_num].gb, buf, bufsize);
+            slice_num++;
+
+            buf     += bytes;
+            if (bufsize/8 >= bytes)
+                bufsize -= bytes*8;
+            else
+                bufsize = 0;
+        }
+
+    avctx->execute(avctx, decode_lowdelay_slice, slices, NULL, slice_num,
+                   sizeof(struct lowdelay_slice)); /* [DIRAC_STD] 13.5.2 Slices */
+    intra_dc_prediction(&s->plane[0].band[0][0]);  /* [DIRAC_STD] 13.3 intra_dc_prediction() */
+    intra_dc_prediction(&s->plane[1].band[0][0]);  /* [DIRAC_STD] 13.3 intra_dc_prediction() */
+    intra_dc_prediction(&s->plane[2].band[0][0]);  /* [DIRAC_STD] 13.3 intra_dc_prediction() */
+    av_free(slices);
+    return 0;
+}
+
+static void init_planes(DiracContext *s)
+{
+    int i, w, h, level, orientation;
+
+    for (i = 0; i < 3; i++) {
+        Plane *p = &s->plane[i];
+
+        p->width       = s->source.width  >> (i ? s->chroma_x_shift : 0);
+        p->height      = s->source.height >> (i ? s->chroma_y_shift : 0);
+        p->idwt_width  = w = CALC_PADDING(p->width , s->wavelet_depth);
+        p->idwt_height = h = CALC_PADDING(p->height, s->wavelet_depth);
+        p->idwt_stride = FFALIGN(p->idwt_width, 8);
+
+        for (level = s->wavelet_depth-1; level >= 0; level--) {
+            w = w>>1;
+            h = h>>1;
+            for (orientation = !!level; orientation < 4; orientation++) {
+                SubBand *b = &p->band[level][orientation];
+
+                b->ibuf   = p->idwt_buf;
+                b->level  = level;
+                b->stride = p->idwt_stride << (s->wavelet_depth - level);
+                b->width  = w;
+                b->height = h;
+                b->orientation = orientation;
+
+                if (orientation & 1)
+                    b->ibuf += w;
+                if (orientation > 1)
+                    b->ibuf += b->stride>>1;
+
+                if (level)
+                    b->parent = &p->band[level-1][orientation];
+            }
+        }
+
+        if (i > 0) {
+            p->xblen = s->plane[0].xblen >> s->chroma_x_shift;
+            p->yblen = s->plane[0].yblen >> s->chroma_y_shift;
+            p->xbsep = s->plane[0].xbsep >> s->chroma_x_shift;
+            p->ybsep = s->plane[0].ybsep >> s->chroma_y_shift;
+        }
+
+        p->xoffset = (p->xblen - p->xbsep)/2;
+        p->yoffset = (p->yblen - p->ybsep)/2;
+    }
+}
+
+/**
+ * Unpack the motion compensation parameters
+ * Dirac Specification ->
+ * 11.2 Picture prediction data. picture_prediction()
+ */
+static int dirac_unpack_prediction_parameters(DiracContext *s)
+{
+    static const uint8_t default_blen[] = { 4, 12, 16, 24 };
+
+    GetBitContext *gb = &s->gb;
+    unsigned idx, ref;
+
+    align_get_bits(gb);
+    /* [DIRAC_STD] 11.2.2 Block parameters. block_parameters() */
+    /* Luma and Chroma are equal. 11.2.3 */
+    idx = svq3_get_ue_golomb(gb); /* [DIRAC_STD] index */
+
+    if (idx > 4) {
+        av_log(s->avctx, AV_LOG_ERROR, "Block prediction index too high\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (idx == 0) {
+        s->plane[0].xblen = svq3_get_ue_golomb(gb);
+        s->plane[0].yblen = svq3_get_ue_golomb(gb);
+        s->plane[0].xbsep = svq3_get_ue_golomb(gb);
+        s->plane[0].ybsep = svq3_get_ue_golomb(gb);
+    } else {
+        /*[DIRAC_STD] preset_block_params(index). Table 11.1 */
+        s->plane[0].xblen = default_blen[idx-1];
+        s->plane[0].yblen = default_blen[idx-1];
+        s->plane[0].xbsep = 4 * idx;
+        s->plane[0].ybsep = 4 * idx;
+    }
+    /*[DIRAC_STD] 11.2.4 motion_data_dimensions()
+      Calculated in function dirac_unpack_block_motion_data */
+
+    if (s->plane[0].xblen % (1 << s->chroma_x_shift) != 0 ||
+        s->plane[0].yblen % (1 << s->chroma_y_shift) != 0 ||
+        !s->plane[0].xblen || !s->plane[0].yblen) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "invalid x/y block length (%d/%d) for x/y chroma shift (%d/%d)\n",
+               s->plane[0].xblen, s->plane[0].yblen, s->chroma_x_shift, s->chroma_y_shift);
+        return AVERROR_INVALIDDATA;
+    }
+    if (!s->plane[0].xbsep || !s->plane[0].ybsep || s->plane[0].xbsep < s->plane[0].xblen/2 || s->plane[0].ybsep < s->plane[0].yblen/2) {
+        av_log(s->avctx, AV_LOG_ERROR, "Block separation too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (s->plane[0].xbsep > s->plane[0].xblen || s->plane[0].ybsep > s->plane[0].yblen) {
+        av_log(s->avctx, AV_LOG_ERROR, "Block separation greater than size\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (FFMAX(s->plane[0].xblen, s->plane[0].yblen) > MAX_BLOCKSIZE) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported large block size\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    /*[DIRAC_STD] 11.2.5 Motion vector precision. motion_vector_precision()
+      Read motion vector precision */
+    s->mv_precision = svq3_get_ue_golomb(gb);
+    if (s->mv_precision > 3) {
+        av_log(s->avctx, AV_LOG_ERROR, "MV precision finer than eighth-pel\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /*[DIRAC_STD] 11.2.6 Global motion. global_motion()
+      Read the global motion compensation parameters */
+    s->globalmc_flag = get_bits1(gb);
+    if (s->globalmc_flag) {
+        memset(s->globalmc, 0, sizeof(s->globalmc));
+        /* [DIRAC_STD] pan_tilt(gparams) */
+        for (ref = 0; ref < s->num_refs; ref++) {
+            if (get_bits1(gb)) {
+                s->globalmc[ref].pan_tilt[0] = dirac_get_se_golomb(gb);
+                s->globalmc[ref].pan_tilt[1] = dirac_get_se_golomb(gb);
+            }
+            /* [DIRAC_STD] zoom_rotate_shear(gparams)
+               zoom/rotation/shear parameters */
+            if (get_bits1(gb)) {
+                s->globalmc[ref].zrs_exp   = svq3_get_ue_golomb(gb);
+                s->globalmc[ref].zrs[0][0] = dirac_get_se_golomb(gb);
+                s->globalmc[ref].zrs[0][1] = dirac_get_se_golomb(gb);
+                s->globalmc[ref].zrs[1][0] = dirac_get_se_golomb(gb);
+                s->globalmc[ref].zrs[1][1] = dirac_get_se_golomb(gb);
+            } else {
+                s->globalmc[ref].zrs[0][0] = 1;
+                s->globalmc[ref].zrs[1][1] = 1;
+            }
+            /* [DIRAC_STD] perspective(gparams) */
+            if (get_bits1(gb)) {
+                s->globalmc[ref].perspective_exp = svq3_get_ue_golomb(gb);
+                s->globalmc[ref].perspective[0]  = dirac_get_se_golomb(gb);
+                s->globalmc[ref].perspective[1]  = dirac_get_se_golomb(gb);
+            }
+        }
+    }
+
+    /*[DIRAC_STD] 11.2.7 Picture prediction mode. prediction_mode()
+      Picture prediction mode, not currently used. */
+    if (svq3_get_ue_golomb(gb)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unknown picture prediction mode\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* [DIRAC_STD] 11.2.8 Reference picture weight. reference_picture_weights()
+       just data read, weight calculation will be done later on. */
+    s->weight_log2denom = 1;
+    s->weight[0]        = 1;
+    s->weight[1]        = 1;
+
+    if (get_bits1(gb)) {
+        s->weight_log2denom = svq3_get_ue_golomb(gb);
+        s->weight[0] = dirac_get_se_golomb(gb);
+        if (s->num_refs == 2)
+            s->weight[1] = dirac_get_se_golomb(gb);
+    }
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 11.3 Wavelet transform data. wavelet_transform()
+ */
+static int dirac_unpack_idwt_params(DiracContext *s)
+{
+    GetBitContext *gb = &s->gb;
+    int i, level;
+    unsigned tmp;
+
+#define CHECKEDREAD(dst, cond, errmsg) \
+    tmp = svq3_get_ue_golomb(gb); \
+    if (cond) { \
+        av_log(s->avctx, AV_LOG_ERROR, errmsg); \
+        return AVERROR_INVALIDDATA; \
+    }\
+    dst = tmp;
+
+    align_get_bits(gb);
+
+    s->zero_res = s->num_refs ? get_bits1(gb) : 0;
+    if (s->zero_res)
+        return 0;
+
+    /*[DIRAC_STD] 11.3.1 Transform parameters. transform_parameters() */
+    CHECKEDREAD(s->wavelet_idx, tmp > 6, "wavelet_idx is too big\n")
+
+    CHECKEDREAD(s->wavelet_depth, tmp > MAX_DWT_LEVELS || tmp < 1, "invalid number of DWT decompositions\n")
+
+    if (!s->low_delay) {
+        /* Codeblock parameters (core syntax only) */
+        if (get_bits1(gb)) {
+            for (i = 0; i <= s->wavelet_depth; i++) {
+                CHECKEDREAD(s->codeblock[i].width , tmp < 1 || tmp > (s->avctx->width >>s->wavelet_depth-i), "codeblock width invalid\n")
+                CHECKEDREAD(s->codeblock[i].height, tmp < 1 || tmp > (s->avctx->height>>s->wavelet_depth-i), "codeblock height invalid\n")
+            }
+
+            CHECKEDREAD(s->codeblock_mode, tmp > 1, "unknown codeblock mode\n")
+        } else
+            for (i = 0; i <= s->wavelet_depth; i++)
+                s->codeblock[i].width = s->codeblock[i].height = 1;
+    } else {
+        /* Slice parameters + quantization matrix*/
+        /*[DIRAC_STD] 11.3.4 Slice coding Parameters (low delay syntax only). slice_parameters() */
+        s->lowdelay.num_x     = svq3_get_ue_golomb(gb);
+        s->lowdelay.num_y     = svq3_get_ue_golomb(gb);
+        s->lowdelay.bytes.num = svq3_get_ue_golomb(gb);
+        s->lowdelay.bytes.den = svq3_get_ue_golomb(gb);
+
+        if (s->lowdelay.bytes.den <= 0) {
+            av_log(s->avctx,AV_LOG_ERROR,"Invalid lowdelay.bytes.den\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        /* [DIRAC_STD] 11.3.5 Quantisation matrices (low-delay syntax). quant_matrix() */
+        if (get_bits1(gb)) {
+            av_log(s->avctx,AV_LOG_DEBUG,"Low Delay: Has Custom Quantization Matrix!\n");
+            /* custom quantization matrix */
+            s->lowdelay.quant[0][0] = svq3_get_ue_golomb(gb);
+            for (level = 0; level < s->wavelet_depth; level++) {
+                s->lowdelay.quant[level][1] = svq3_get_ue_golomb(gb);
+                s->lowdelay.quant[level][2] = svq3_get_ue_golomb(gb);
+                s->lowdelay.quant[level][3] = svq3_get_ue_golomb(gb);
+            }
+        } else {
+            if (s->wavelet_depth > 4) {
+                av_log(s->avctx,AV_LOG_ERROR,"Mandatory custom low delay matrix missing for depth %d\n", s->wavelet_depth);
+                return AVERROR_INVALIDDATA;
+            }
+            /* default quantization matrix */
+            for (level = 0; level < s->wavelet_depth; level++)
+                for (i = 0; i < 4; i++) {
+                    s->lowdelay.quant[level][i] = default_qmat[s->wavelet_idx][level][i];
+                    /* haar with no shift differs for different depths */
+                    if (s->wavelet_idx == 3)
+                        s->lowdelay.quant[level][i] += 4*(s->wavelet_depth-1 - level);
+                }
+        }
+    }
+    return 0;
+}
+
+static inline int pred_sbsplit(uint8_t *sbsplit, int stride, int x, int y)
+{
+    static const uint8_t avgsplit[7] = { 0, 0, 1, 1, 1, 2, 2 };
+
+    if (!(x|y))
+        return 0;
+    else if (!y)
+        return sbsplit[-1];
+    else if (!x)
+        return sbsplit[-stride];
+
+    return avgsplit[sbsplit[-1] + sbsplit[-stride] + sbsplit[-stride-1]];
+}
+
+static inline int pred_block_mode(DiracBlock *block, int stride, int x, int y, int refmask)
+{
+    int pred;
+
+    if (!(x|y))
+        return 0;
+    else if (!y)
+        return block[-1].ref & refmask;
+    else if (!x)
+        return block[-stride].ref & refmask;
+
+    /* return the majority */
+    pred = (block[-1].ref & refmask) + (block[-stride].ref & refmask) + (block[-stride-1].ref & refmask);
+    return (pred >> 1) & refmask;
+}
+
+static inline void pred_block_dc(DiracBlock *block, int stride, int x, int y)
+{
+    int i, n = 0;
+
+    memset(block->u.dc, 0, sizeof(block->u.dc));
+
+    if (x && !(block[-1].ref & 3)) {
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] += block[-1].u.dc[i];
+        n++;
+    }
+
+    if (y && !(block[-stride].ref & 3)) {
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] += block[-stride].u.dc[i];
+        n++;
+    }
+
+    if (x && y && !(block[-1-stride].ref & 3)) {
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] += block[-1-stride].u.dc[i];
+        n++;
+    }
+
+    if (n == 2) {
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] = (block->u.dc[i]+1)>>1;
+    } else if (n == 3) {
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] = divide3(block->u.dc[i]);
+    }
+}
+
+static inline void pred_mv(DiracBlock *block, int stride, int x, int y, int ref)
+{
+    int16_t *pred[3];
+    int refmask = ref+1;
+    int mask = refmask | DIRAC_REF_MASK_GLOBAL; /*  exclude gmc blocks */
+    int n = 0;
+
+    if (x && (block[-1].ref & mask) == refmask)
+        pred[n++] = block[-1].u.mv[ref];
+
+    if (y && (block[-stride].ref & mask) == refmask)
+        pred[n++] = block[-stride].u.mv[ref];
+
+    if (x && y && (block[-stride-1].ref & mask) == refmask)
+        pred[n++] = block[-stride-1].u.mv[ref];
+
+    switch (n) {
+    case 0:
+        block->u.mv[ref][0] = 0;
+        block->u.mv[ref][1] = 0;
+        break;
+    case 1:
+        block->u.mv[ref][0] = pred[0][0];
+        block->u.mv[ref][1] = pred[0][1];
+        break;
+    case 2:
+        block->u.mv[ref][0] = (pred[0][0] + pred[1][0] + 1) >> 1;
+        block->u.mv[ref][1] = (pred[0][1] + pred[1][1] + 1) >> 1;
+        break;
+    case 3:
+        block->u.mv[ref][0] = mid_pred(pred[0][0], pred[1][0], pred[2][0]);
+        block->u.mv[ref][1] = mid_pred(pred[0][1], pred[1][1], pred[2][1]);
+        break;
+    }
+}
+
+static void global_mv(DiracContext *s, DiracBlock *block, int x, int y, int ref)
+{
+    int ez      = s->globalmc[ref].zrs_exp;
+    int ep      = s->globalmc[ref].perspective_exp;
+    int (*A)[2] = s->globalmc[ref].zrs;
+    int *b      = s->globalmc[ref].pan_tilt;
+    int *c      = s->globalmc[ref].perspective;
+
+    int m       = (1<<ep) - (c[0]*x + c[1]*y);
+    int mx      = m * ((A[0][0] * x + A[0][1]*y) + (1<<ez) * b[0]);
+    int my      = m * ((A[1][0] * x + A[1][1]*y) + (1<<ez) * b[1]);
+
+    block->u.mv[ref][0] = (mx + (1<<(ez+ep))) >> (ez+ep);
+    block->u.mv[ref][1] = (my + (1<<(ez+ep))) >> (ez+ep);
+}
+
+static void decode_block_params(DiracContext *s, DiracArith arith[8], DiracBlock *block,
+                                int stride, int x, int y)
+{
+    int i;
+
+    block->ref  = pred_block_mode(block, stride, x, y, DIRAC_REF_MASK_REF1);
+    block->ref ^= dirac_get_arith_bit(arith, CTX_PMODE_REF1);
+
+    if (s->num_refs == 2) {
+        block->ref |= pred_block_mode(block, stride, x, y, DIRAC_REF_MASK_REF2);
+        block->ref ^= dirac_get_arith_bit(arith, CTX_PMODE_REF2) << 1;
+    }
+
+    if (!block->ref) {
+        pred_block_dc(block, stride, x, y);
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] += dirac_get_arith_int(arith+1+i, CTX_DC_F1, CTX_DC_DATA);
+        return;
+    }
+
+    if (s->globalmc_flag) {
+        block->ref |= pred_block_mode(block, stride, x, y, DIRAC_REF_MASK_GLOBAL);
+        block->ref ^= dirac_get_arith_bit(arith, CTX_GLOBAL_BLOCK) << 2;
+    }
+
+    for (i = 0; i < s->num_refs; i++)
+        if (block->ref & (i+1)) {
+            if (block->ref & DIRAC_REF_MASK_GLOBAL) {
+                global_mv(s, block, x, y, i);
+            } else {
+                pred_mv(block, stride, x, y, i);
+                block->u.mv[i][0] += dirac_get_arith_int(arith + 4 + 2 * i, CTX_MV_F1, CTX_MV_DATA);
+                block->u.mv[i][1] += dirac_get_arith_int(arith + 5 + 2 * i, CTX_MV_F1, CTX_MV_DATA);
+            }
+        }
+}
+
+/**
+ * Copies the current block to the other blocks covered by the current superblock split mode
+ */
+static void propagate_block_data(DiracBlock *block, int stride, int size)
+{
+    int x, y;
+    DiracBlock *dst = block;
+
+    for (x = 1; x < size; x++)
+        dst[x] = *block;
+
+    for (y = 1; y < size; y++) {
+        dst += stride;
+        for (x = 0; x < size; x++)
+            dst[x] = *block;
+    }
+}
+
+/**
+ * Dirac Specification ->
+ * 12. Block motion data syntax
+ */
+static int dirac_unpack_block_motion_data(DiracContext *s)
+{
+    GetBitContext *gb = &s->gb;
+    uint8_t *sbsplit = s->sbsplit;
+    int i, x, y, q, p;
+    DiracArith arith[8];
+
+    align_get_bits(gb);
+
+    /* [DIRAC_STD] 11.2.4 and 12.2.1 Number of blocks and superblocks */
+    s->sbwidth  = DIVRNDUP(s->source.width,  4*s->plane[0].xbsep);
+    s->sbheight = DIVRNDUP(s->source.height, 4*s->plane[0].ybsep);
+    s->blwidth  = 4 * s->sbwidth;
+    s->blheight = 4 * s->sbheight;
+
+    /* [DIRAC_STD] 12.3.1 Superblock splitting modes. superblock_split_modes()
+       decode superblock split modes */
+    ff_dirac_init_arith_decoder(arith, gb, svq3_get_ue_golomb(gb));     /* svq3_get_ue_golomb(gb) is the length */
+    for (y = 0; y < s->sbheight; y++) {
+        for (x = 0; x < s->sbwidth; x++) {
+            unsigned int split  = dirac_get_arith_uint(arith, CTX_SB_F1, CTX_SB_DATA);
+            if (split > 2)
+                return AVERROR_INVALIDDATA;
+            sbsplit[x] = (split + pred_sbsplit(sbsplit+x, s->sbwidth, x, y)) % 3;
+        }
+        sbsplit += s->sbwidth;
+    }
+
+    /* setup arith decoding */
+    ff_dirac_init_arith_decoder(arith, gb, svq3_get_ue_golomb(gb));
+    for (i = 0; i < s->num_refs; i++) {
+        ff_dirac_init_arith_decoder(arith + 4 + 2 * i, gb, svq3_get_ue_golomb(gb));
+        ff_dirac_init_arith_decoder(arith + 5 + 2 * i, gb, svq3_get_ue_golomb(gb));
+    }
+    for (i = 0; i < 3; i++)
+        ff_dirac_init_arith_decoder(arith+1+i, gb, svq3_get_ue_golomb(gb));
+
+    for (y = 0; y < s->sbheight; y++)
+        for (x = 0; x < s->sbwidth; x++) {
+            int blkcnt = 1 << s->sbsplit[y * s->sbwidth + x];
+            int step   = 4 >> s->sbsplit[y * s->sbwidth + x];
+
+            for (q = 0; q < blkcnt; q++)
+                for (p = 0; p < blkcnt; p++) {
+                    int bx = 4 * x + p*step;
+                    int by = 4 * y + q*step;
+                    DiracBlock *block = &s->blmotion[by*s->blwidth + bx];
+                    decode_block_params(s, arith, block, s->blwidth, bx, by);
+                    propagate_block_data(block, s->blwidth, step);
+                }
+        }
+
+    return 0;
+}
+
+static int weight(int i, int blen, int offset)
+{
+#define ROLLOFF(i) offset == 1 ? ((i) ? 5 : 3) :        \
+    (1 + (6*(i) + offset - 1) / (2*offset - 1))
+
+    if (i < 2*offset)
+        return ROLLOFF(i);
+    else if (i > blen-1 - 2*offset)
+        return ROLLOFF(blen-1 - i);
+    return 8;
+}
+
+static void init_obmc_weight_row(Plane *p, uint8_t *obmc_weight, int stride,
+                                 int left, int right, int wy)
+{
+    int x;
+    for (x = 0; left && x < p->xblen >> 1; x++)
+        obmc_weight[x] = wy*8;
+    for (; x < p->xblen >> right; x++)
+        obmc_weight[x] = wy*weight(x, p->xblen, p->xoffset);
+    for (; x < p->xblen; x++)
+        obmc_weight[x] = wy*8;
+    for (; x < stride; x++)
+        obmc_weight[x] = 0;
+}
+
+static void init_obmc_weight(Plane *p, uint8_t *obmc_weight, int stride,
+                             int left, int right, int top, int bottom)
+{
+    int y;
+    for (y = 0; top && y < p->yblen >> 1; y++) {
+        init_obmc_weight_row(p, obmc_weight, stride, left, right, 8);
+        obmc_weight += stride;
+    }
+    for (; y < p->yblen >> bottom; y++) {
+        int wy = weight(y, p->yblen, p->yoffset);
+        init_obmc_weight_row(p, obmc_weight, stride, left, right, wy);
+        obmc_weight += stride;
+    }
+    for (; y < p->yblen; y++) {
+        init_obmc_weight_row(p, obmc_weight, stride, left, right, 8);
+        obmc_weight += stride;
+    }
+}
+
+static void init_obmc_weights(DiracContext *s, Plane *p, int by)
+{
+    int top = !by;
+    int bottom = by == s->blheight-1;
+
+    /* don't bother re-initing for rows 2 to blheight-2, the weights don't change */
+    if (top || bottom || by == 1) {
+        init_obmc_weight(p, s->obmc_weight[0], MAX_BLOCKSIZE, 1, 0, top, bottom);
+        init_obmc_weight(p, s->obmc_weight[1], MAX_BLOCKSIZE, 0, 0, top, bottom);
+        init_obmc_weight(p, s->obmc_weight[2], MAX_BLOCKSIZE, 0, 1, top, bottom);
+    }
+}
+
+static const uint8_t epel_weights[4][4][4] = {
+    {{ 16,  0,  0,  0 },
+     { 12,  4,  0,  0 },
+     {  8,  8,  0,  0 },
+     {  4, 12,  0,  0 }},
+    {{ 12,  0,  4,  0 },
+     {  9,  3,  3,  1 },
+     {  6,  6,  2,  2 },
+     {  3,  9,  1,  3 }},
+    {{  8,  0,  8,  0 },
+     {  6,  2,  6,  2 },
+     {  4,  4,  4,  4 },
+     {  2,  6,  2,  6 }},
+    {{  4,  0, 12,  0 },
+     {  3,  1,  9,  3 },
+     {  2,  2,  6,  6 },
+     {  1,  3,  3,  9 }}
+};
+
+/**
+ * For block x,y, determine which of the hpel planes to do bilinear
+ * interpolation from and set src[] to the location in each hpel plane
+ * to MC from.
+ *
+ * @return the index of the put_dirac_pixels_tab function to use
+ *  0 for 1 plane (fpel,hpel), 1 for 2 planes (qpel), 2 for 4 planes (qpel), and 3 for epel
+ */
+static int mc_subpel(DiracContext *s, DiracBlock *block, const uint8_t *src[5],
+                     int x, int y, int ref, int plane)
+{
+    Plane *p = &s->plane[plane];
+    uint8_t **ref_hpel = s->ref_pics[ref]->hpel[plane];
+    int motion_x = block->u.mv[ref][0];
+    int motion_y = block->u.mv[ref][1];
+    int mx, my, i, epel, nplanes = 0;
+
+    if (plane) {
+        motion_x >>= s->chroma_x_shift;
+        motion_y >>= s->chroma_y_shift;
+    }
+
+    mx         = motion_x & ~(-1U << s->mv_precision);
+    my         = motion_y & ~(-1U << s->mv_precision);
+    motion_x >>= s->mv_precision;
+    motion_y >>= s->mv_precision;
+    /* normalize subpel coordinates to epel */
+    /* TODO: template this function? */
+    mx      <<= 3 - s->mv_precision;
+    my      <<= 3 - s->mv_precision;
+
+    x += motion_x;
+    y += motion_y;
+    epel = (mx|my)&1;
+
+    /* hpel position */
+    if (!((mx|my)&3)) {
+        nplanes = 1;
+        src[0] = ref_hpel[(my>>1)+(mx>>2)] + y*p->stride + x;
+    } else {
+        /* qpel or epel */
+        nplanes = 4;
+        for (i = 0; i < 4; i++)
+            src[i] = ref_hpel[i] + y*p->stride + x;
+
+        /* if we're interpolating in the right/bottom halves, adjust the planes as needed
+           we increment x/y because the edge changes for half of the pixels */
+        if (mx > 4) {
+            src[0] += 1;
+            src[2] += 1;
+            x++;
+        }
+        if (my > 4) {
+            src[0] += p->stride;
+            src[1] += p->stride;
+            y++;
+        }
+
+        /* hpel planes are:
+           [0]: F  [1]: H
+           [2]: V  [3]: C */
+        if (!epel) {
+            /* check if we really only need 2 planes since either mx or my is
+               a hpel position. (epel weights of 0 handle this there) */
+            if (!(mx&3)) {
+                /* mx == 0: average [0] and [2]
+                   mx == 4: average [1] and [3] */
+                src[!mx] = src[2 + !!mx];
+                nplanes = 2;
+            } else if (!(my&3)) {
+                src[0] = src[(my>>1)  ];
+                src[1] = src[(my>>1)+1];
+                nplanes = 2;
+            }
+        } else {
+            /* adjust the ordering if needed so the weights work */
+            if (mx > 4) {
+                FFSWAP(const uint8_t *, src[0], src[1]);
+                FFSWAP(const uint8_t *, src[2], src[3]);
+            }
+            if (my > 4) {
+                FFSWAP(const uint8_t *, src[0], src[2]);
+                FFSWAP(const uint8_t *, src[1], src[3]);
+            }
+            src[4] = epel_weights[my&3][mx&3];
+        }
+    }
+
+    /* fixme: v/h _edge_pos */
+    if (x + p->xblen > p->width +EDGE_WIDTH/2 ||
+        y + p->yblen > p->height+EDGE_WIDTH/2 ||
+        x < 0 || y < 0) {
+        for (i = 0; i < nplanes; i++) {
+            s->vdsp.emulated_edge_mc(s->edge_emu_buffer[i], src[i],
+                                     p->stride, p->stride,
+                                     p->xblen, p->yblen, x, y,
+                                     p->width+EDGE_WIDTH/2, p->height+EDGE_WIDTH/2);
+            src[i] = s->edge_emu_buffer[i];
+        }
+    }
+    return (nplanes>>1) + epel;
+}
+
+static void add_dc(uint16_t *dst, int dc, int stride,
+                   uint8_t *obmc_weight, int xblen, int yblen)
+{
+    int x, y;
+    dc += 128;
+
+    for (y = 0; y < yblen; y++) {
+        for (x = 0; x < xblen; x += 2) {
+            dst[x  ] += dc * obmc_weight[x  ];
+            dst[x+1] += dc * obmc_weight[x+1];
+        }
+        dst          += stride;
+        obmc_weight  += MAX_BLOCKSIZE;
+    }
+}
+
+static void block_mc(DiracContext *s, DiracBlock *block,
+                     uint16_t *mctmp, uint8_t *obmc_weight,
+                     int plane, int dstx, int dsty)
+{
+    Plane *p = &s->plane[plane];
+    const uint8_t *src[5];
+    int idx;
+
+    switch (block->ref&3) {
+    case 0: /* DC */
+        add_dc(mctmp, block->u.dc[plane], p->stride, obmc_weight, p->xblen, p->yblen);
+        return;
+    case 1:
+    case 2:
+        idx = mc_subpel(s, block, src, dstx, dsty, (block->ref&3)-1, plane);
+        s->put_pixels_tab[idx](s->mcscratch, src, p->stride, p->yblen);
+        if (s->weight_func)
+            s->weight_func(s->mcscratch, p->stride, s->weight_log2denom,
+                           s->weight[0] + s->weight[1], p->yblen);
+        break;
+    case 3:
+        idx = mc_subpel(s, block, src, dstx, dsty, 0, plane);
+        s->put_pixels_tab[idx](s->mcscratch, src, p->stride, p->yblen);
+        idx = mc_subpel(s, block, src, dstx, dsty, 1, plane);
+        if (s->biweight_func) {
+            /* fixme: +32 is a quick hack */
+            s->put_pixels_tab[idx](s->mcscratch + 32, src, p->stride, p->yblen);
+            s->biweight_func(s->mcscratch, s->mcscratch+32, p->stride, s->weight_log2denom,
+                             s->weight[0], s->weight[1], p->yblen);
+        } else
+            s->avg_pixels_tab[idx](s->mcscratch, src, p->stride, p->yblen);
+        break;
+    }
+    s->add_obmc(mctmp, s->mcscratch, p->stride, obmc_weight, p->yblen);
+}
+
+static void mc_row(DiracContext *s, DiracBlock *block, uint16_t *mctmp, int plane, int dsty)
+{
+    Plane *p = &s->plane[plane];
+    int x, dstx = p->xbsep - p->xoffset;
+
+    block_mc(s, block, mctmp, s->obmc_weight[0], plane, -p->xoffset, dsty);
+    mctmp += p->xbsep;
+
+    for (x = 1; x < s->blwidth-1; x++) {
+        block_mc(s, block+x, mctmp, s->obmc_weight[1], plane, dstx, dsty);
+        dstx  += p->xbsep;
+        mctmp += p->xbsep;
+    }
+    block_mc(s, block+x, mctmp, s->obmc_weight[2], plane, dstx, dsty);
+}
+
+static void select_dsp_funcs(DiracContext *s, int width, int height, int xblen, int yblen)
+{
+    int idx = 0;
+    if (xblen > 8)
+        idx = 1;
+    if (xblen > 16)
+        idx = 2;
+
+    memcpy(s->put_pixels_tab, s->diracdsp.put_dirac_pixels_tab[idx], sizeof(s->put_pixels_tab));
+    memcpy(s->avg_pixels_tab, s->diracdsp.avg_dirac_pixels_tab[idx], sizeof(s->avg_pixels_tab));
+    s->add_obmc = s->diracdsp.add_dirac_obmc[idx];
+    if (s->weight_log2denom > 1 || s->weight[0] != 1 || s->weight[1] != 1) {
+        s->weight_func   = s->diracdsp.weight_dirac_pixels_tab[idx];
+        s->biweight_func = s->diracdsp.biweight_dirac_pixels_tab[idx];
+    } else {
+        s->weight_func   = NULL;
+        s->biweight_func = NULL;
+    }
+}
+
+static int interpolate_refplane(DiracContext *s, DiracFrame *ref, int plane, int width, int height)
+{
+    /* chroma allocates an edge of 8 when subsampled
+       which for 4:2:2 means an h edge of 16 and v edge of 8
+       just use 8 for everything for the moment */
+    int i, edge = EDGE_WIDTH/2;
+
+    ref->hpel[plane][0] = ref->avframe->data[plane];
+    s->mpvencdsp.draw_edges(ref->hpel[plane][0], ref->avframe->linesize[plane], width, height, edge, edge, EDGE_TOP | EDGE_BOTTOM); /* EDGE_TOP | EDGE_BOTTOM values just copied to make it build, this needs to be ensured */
+
+    /* no need for hpel if we only have fpel vectors */
+    if (!s->mv_precision)
+        return 0;
+
+    for (i = 1; i < 4; i++) {
+        if (!ref->hpel_base[plane][i])
+            ref->hpel_base[plane][i] = av_malloc((height+2*edge) * ref->avframe->linesize[plane] + 32);
+        if (!ref->hpel_base[plane][i]) {
+            return AVERROR(ENOMEM);
+        }
+        /* we need to be 16-byte aligned even for chroma */
+        ref->hpel[plane][i] = ref->hpel_base[plane][i] + edge*ref->avframe->linesize[plane] + 16;
+    }
+
+    if (!ref->interpolated[plane]) {
+        s->diracdsp.dirac_hpel_filter(ref->hpel[plane][1], ref->hpel[plane][2],
+                                      ref->hpel[plane][3], ref->hpel[plane][0],
+                                      ref->avframe->linesize[plane], width, height);
+        s->mpvencdsp.draw_edges(ref->hpel[plane][1], ref->avframe->linesize[plane], width, height, edge, edge, EDGE_TOP | EDGE_BOTTOM);
+        s->mpvencdsp.draw_edges(ref->hpel[plane][2], ref->avframe->linesize[plane], width, height, edge, edge, EDGE_TOP | EDGE_BOTTOM);
+        s->mpvencdsp.draw_edges(ref->hpel[plane][3], ref->avframe->linesize[plane], width, height, edge, edge, EDGE_TOP | EDGE_BOTTOM);
+    }
+    ref->interpolated[plane] = 1;
+
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 13.0 Transform data syntax. transform_data()
+ */
+static int dirac_decode_frame_internal(DiracContext *s)
+{
+    DWTContext d;
+    int y, i, comp, dsty;
+    int ret;
+
+    if (s->low_delay) {
+        /* [DIRAC_STD] 13.5.1 low_delay_transform_data() */
+        for (comp = 0; comp < 3; comp++) {
+            Plane *p = &s->plane[comp];
+            memset(p->idwt_buf, 0, p->idwt_stride * p->idwt_height * sizeof(IDWTELEM));
+        }
+        if (!s->zero_res) {
+            if ((ret = decode_lowdelay(s)) < 0)
+                return ret;
+        }
+    }
+
+    for (comp = 0; comp < 3; comp++) {
+        Plane *p       = &s->plane[comp];
+        uint8_t *frame = s->current_picture->avframe->data[comp];
+
+        /* FIXME: small resolutions */
+        for (i = 0; i < 4; i++)
+            s->edge_emu_buffer[i] = s->edge_emu_buffer_base + i*FFALIGN(p->width, 16);
+
+        if (!s->zero_res && !s->low_delay)
+        {
+            memset(p->idwt_buf, 0, p->idwt_stride * p->idwt_height * sizeof(IDWTELEM));
+            decode_component(s, comp); /* [DIRAC_STD] 13.4.1 core_transform_data() */
+        }
+        ret = ff_spatial_idwt_init2(&d, p->idwt_buf, p->idwt_width, p->idwt_height, p->idwt_stride,
+                                    s->wavelet_idx+2, s->wavelet_depth, p->idwt_tmp);
+        if (ret < 0)
+            return ret;
+
+        if (!s->num_refs) { /* intra */
+            for (y = 0; y < p->height; y += 16) {
+                ff_spatial_idwt_slice2(&d, y+16); /* decode */
+                s->diracdsp.put_signed_rect_clamped(frame + y*p->stride, p->stride,
+                                                    p->idwt_buf + y*p->idwt_stride, p->idwt_stride, p->width, 16);
+            }
+        } else { /* inter */
+            int rowheight = p->ybsep*p->stride;
+
+            select_dsp_funcs(s, p->width, p->height, p->xblen, p->yblen);
+
+            for (i = 0; i < s->num_refs; i++) {
+                int ret = interpolate_refplane(s, s->ref_pics[i], comp, p->width, p->height);
+                if (ret < 0)
+                    return ret;
+            }
+
+            memset(s->mctmp, 0, 4*p->yoffset*p->stride);
+
+            dsty = -p->yoffset;
+            for (y = 0; y < s->blheight; y++) {
+                int h     = 0,
+                    start = FFMAX(dsty, 0);
+                uint16_t *mctmp    = s->mctmp + y*rowheight;
+                DiracBlock *blocks = s->blmotion + y*s->blwidth;
+
+                init_obmc_weights(s, p, y);
+
+                if (y == s->blheight-1 || start+p->ybsep > p->height)
+                    h = p->height - start;
+                else
+                    h = p->ybsep - (start - dsty);
+                if (h < 0)
+                    break;
+
+                memset(mctmp+2*p->yoffset*p->stride, 0, 2*rowheight);
+                mc_row(s, blocks, mctmp, comp, dsty);
+
+                mctmp += (start - dsty)*p->stride + p->xoffset;
+                ff_spatial_idwt_slice2(&d, start + h); /* decode */
+                s->diracdsp.add_rect_clamped(frame + start*p->stride, mctmp, p->stride,
+                                             p->idwt_buf + start*p->idwt_stride, p->idwt_stride, p->width, h);
+
+                dsty += p->ybsep;
+            }
+        }
+    }
+
+
+    return 0;
+}
+
+static int get_buffer_with_edge(AVCodecContext *avctx, AVFrame *f, int flags)
+{
+    int ret, i;
+    int chroma_x_shift, chroma_y_shift;
+    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &chroma_x_shift, &chroma_y_shift);
+
+    f->width  = avctx->width  + 2 * EDGE_WIDTH;
+    f->height = avctx->height + 2 * EDGE_WIDTH + 2;
+    ret = ff_get_buffer(avctx, f, flags);
+    if (ret < 0)
+        return ret;
+
+    for (i = 0; f->data[i]; i++) {
+        int offset = (EDGE_WIDTH >> (i && i<3 ? chroma_y_shift : 0)) *
+                     f->linesize[i] + 32;
+        f->data[i] += offset;
+    }
+    f->width  = avctx->width;
+    f->height = avctx->height;
+
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 11.1.1 Picture Header. picture_header()
+ */
+static int dirac_decode_picture_header(DiracContext *s)
+{
+    unsigned retire, picnum;
+    int i, j, ret;
+    int64_t refdist, refnum;
+    GetBitContext *gb = &s->gb;
+
+    /* [DIRAC_STD] 11.1.1 Picture Header. picture_header() PICTURE_NUM */
+    picnum = s->current_picture->avframe->display_picture_number = get_bits_long(gb, 32);
+
+
+    av_log(s->avctx,AV_LOG_DEBUG,"PICTURE_NUM: %d\n",picnum);
+
+    /* if this is the first keyframe after a sequence header, start our
+       reordering from here */
+    if (s->frame_number < 0)
+        s->frame_number = picnum;
+
+    s->ref_pics[0] = s->ref_pics[1] = NULL;
+    for (i = 0; i < s->num_refs; i++) {
+        refnum = (picnum + dirac_get_se_golomb(gb)) & 0xFFFFFFFF;
+        refdist = INT64_MAX;
+
+        /* find the closest reference to the one we want */
+        /* Jordi: this is needed if the referenced picture hasn't yet arrived */
+        for (j = 0; j < MAX_REFERENCE_FRAMES && refdist; j++)
+            if (s->ref_frames[j]
+                && FFABS(s->ref_frames[j]->avframe->display_picture_number - refnum) < refdist) {
+                s->ref_pics[i] = s->ref_frames[j];
+                refdist = FFABS(s->ref_frames[j]->avframe->display_picture_number - refnum);
+            }
+
+        if (!s->ref_pics[i] || refdist)
+            av_log(s->avctx, AV_LOG_DEBUG, "Reference not found\n");
+
+        /* if there were no references at all, allocate one */
+        if (!s->ref_pics[i])
+            for (j = 0; j < MAX_FRAMES; j++)
+                if (!s->all_frames[j].avframe->data[0]) {
+                    s->ref_pics[i] = &s->all_frames[j];
+                    get_buffer_with_edge(s->avctx, s->ref_pics[i]->avframe, AV_GET_BUFFER_FLAG_REF);
+                    break;
+                }
+
+        if (!s->ref_pics[i]) {
+            av_log(s->avctx, AV_LOG_ERROR, "Reference could not be allocated\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+    }
+
+    /* retire the reference frames that are not used anymore */
+    if (s->current_picture->reference) {
+        retire = (picnum + dirac_get_se_golomb(gb)) & 0xFFFFFFFF;
+        if (retire != picnum) {
+            DiracFrame *retire_pic = remove_frame(s->ref_frames, retire);
+
+            if (retire_pic)
+                retire_pic->reference &= DELAYED_PIC_REF;
+            else
+                av_log(s->avctx, AV_LOG_DEBUG, "Frame to retire not found\n");
+        }
+
+        /* if reference array is full, remove the oldest as per the spec */
+        while (add_frame(s->ref_frames, MAX_REFERENCE_FRAMES, s->current_picture)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Reference frame overflow\n");
+            remove_frame(s->ref_frames, s->ref_frames[0]->avframe->display_picture_number)->reference &= DELAYED_PIC_REF;
+        }
+    }
+
+    if (s->num_refs) {
+        ret = dirac_unpack_prediction_parameters(s);  /* [DIRAC_STD] 11.2 Picture Prediction Data. picture_prediction() */
+        if (ret < 0)
+            return ret;
+        ret = dirac_unpack_block_motion_data(s);      /* [DIRAC_STD] 12. Block motion data syntax                       */
+        if (ret < 0)
+            return ret;
+    }
+    ret = dirac_unpack_idwt_params(s);                /* [DIRAC_STD] 11.3 Wavelet transform data                        */
+    if (ret < 0)
+        return ret;
+
+    init_planes(s);
+    return 0;
+}
+
+static int get_delayed_pic(DiracContext *s, AVFrame *picture, int *got_frame)
+{
+    DiracFrame *out = s->delay_frames[0];
+    int i, out_idx  = 0;
+    int ret;
+
+    /* find frame with lowest picture number */
+    for (i = 1; s->delay_frames[i]; i++)
+        if (s->delay_frames[i]->avframe->display_picture_number < out->avframe->display_picture_number) {
+            out     = s->delay_frames[i];
+            out_idx = i;
+        }
+
+    for (i = out_idx; s->delay_frames[i]; i++)
+        s->delay_frames[i] = s->delay_frames[i+1];
+
+    if (out) {
+        out->reference ^= DELAYED_PIC_REF;
+        *got_frame = 1;
+        if((ret = av_frame_ref(picture, out->avframe)) < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 9.6 Parse Info Header Syntax. parse_info()
+ * 4 byte start code + byte parse code + 4 byte size + 4 byte previous size
+ */
+#define DATA_UNIT_HEADER_SIZE 13
+
+/* [DIRAC_STD] dirac_decode_data_unit makes reference to the while defined in 9.3
+   inside the function parse_sequence() */
+static int dirac_decode_data_unit(AVCodecContext *avctx, const uint8_t *buf, int size)
+{
+    DiracContext *s   = avctx->priv_data;
+    DiracFrame *pic   = NULL;
+    int ret, i, parse_code;
+    unsigned tmp;
+
+    if (size < DATA_UNIT_HEADER_SIZE)
+        return AVERROR_INVALIDDATA;
+
+    parse_code = buf[4];
+
+    init_get_bits(&s->gb, &buf[13], 8*(size - DATA_UNIT_HEADER_SIZE));
+
+    if (parse_code == pc_seq_header) {
+        if (s->seen_sequence_header)
+            return 0;
+
+        /* [DIRAC_STD] 10. Sequence header */
+        ret = avpriv_dirac_parse_sequence_header(avctx, &s->gb, &s->source);
+        if (ret < 0)
+            return ret;
+
+        avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_x_shift, &s->chroma_y_shift);
+
+        ret = alloc_sequence_buffers(s);
+        if (ret < 0)
+            return ret;
+
+        s->seen_sequence_header = 1;
+    } else if (parse_code == pc_eos) { /* [DIRAC_STD] End of Sequence */
+        free_sequence_buffers(s);
+        s->seen_sequence_header = 0;
+    } else if (parse_code == pc_aux_data) {
+        if (buf[13] == 1) {     /* encoder implementation/version */
+            int ver[3];
+            /* versions older than 1.0.8 don't store quant delta for
+               subbands with only one codeblock */
+            if (sscanf(buf+14, "Schroedinger %d.%d.%d", ver, ver+1, ver+2) == 3)
+                if (ver[0] == 1 && ver[1] == 0 && ver[2] <= 7)
+                    s->old_delta_quant = 1;
+        }
+    } else if (parse_code & 0x8) {  /* picture data unit */
+        if (!s->seen_sequence_header) {
+            av_log(avctx, AV_LOG_DEBUG, "Dropping frame without sequence header\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        /* find an unused frame */
+        for (i = 0; i < MAX_FRAMES; i++)
+            if (s->all_frames[i].avframe->data[0] == NULL)
+                pic = &s->all_frames[i];
+        if (!pic) {
+            av_log(avctx, AV_LOG_ERROR, "framelist full\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        av_frame_unref(pic->avframe);
+
+        /* [DIRAC_STD] Defined in 9.6.1 ... */
+        tmp            =  parse_code & 0x03;                   /* [DIRAC_STD] num_refs()      */
+        if (tmp > 2) {
+            av_log(avctx, AV_LOG_ERROR, "num_refs of 3\n");
+            return AVERROR_INVALIDDATA;
+        }
+        s->num_refs    = tmp;
+        s->is_arith    = (parse_code & 0x48) == 0x08;          /* [DIRAC_STD] using_ac()      */
+        s->low_delay   = (parse_code & 0x88) == 0x88;          /* [DIRAC_STD] is_low_delay()  */
+        pic->reference = (parse_code & 0x0C) == 0x0C;  /* [DIRAC_STD]  is_reference() */
+        pic->avframe->key_frame = s->num_refs == 0;             /* [DIRAC_STD] is_intra()      */
+        pic->avframe->pict_type = s->num_refs + 1;              /* Definition of AVPictureType in avutil.h */
+
+        if ((ret = get_buffer_with_edge(avctx, pic->avframe, (parse_code & 0x0C) == 0x0C ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
+            return ret;
+        s->current_picture = pic;
+        s->plane[0].stride = pic->avframe->linesize[0];
+        s->plane[1].stride = pic->avframe->linesize[1];
+        s->plane[2].stride = pic->avframe->linesize[2];
+
+        if (alloc_buffers(s, FFMAX3(FFABS(s->plane[0].stride), FFABS(s->plane[1].stride), FFABS(s->plane[2].stride))) < 0)
+            return AVERROR(ENOMEM);
+
+        /* [DIRAC_STD] 11.1 Picture parse. picture_parse() */
+        ret = dirac_decode_picture_header(s);
+        if (ret < 0)
+            return ret;
+
+        /* [DIRAC_STD] 13.0 Transform data syntax. transform_data() */
+        ret = dirac_decode_frame_internal(s);
+        if (ret < 0)
+            return ret;
+    }
+    return 0;
+}
+
+static int dirac_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *pkt)
+{
+    DiracContext *s     = avctx->priv_data;
+    AVFrame *picture    = data;
+    uint8_t *buf        = pkt->data;
+    int buf_size        = pkt->size;
+    int i, buf_idx      = 0;
+    int ret;
+    unsigned data_unit_size;
+
+    /* release unused frames */
+    for (i = 0; i < MAX_FRAMES; i++)
+        if (s->all_frames[i].avframe->data[0] && !s->all_frames[i].reference) {
+            av_frame_unref(s->all_frames[i].avframe);
+            memset(s->all_frames[i].interpolated, 0, sizeof(s->all_frames[i].interpolated));
+        }
+
+    s->current_picture = NULL;
+    *got_frame = 0;
+
+    /* end of stream, so flush delayed pics */
+    if (buf_size == 0)
+        return get_delayed_pic(s, (AVFrame *)data, got_frame);
+
+    for (;;) {
+        /*[DIRAC_STD] Here starts the code from parse_info() defined in 9.6
+          [DIRAC_STD] PARSE_INFO_PREFIX = "BBCD" as defined in ISO/IEC 646
+          BBCD start code search */
+        for (; buf_idx + DATA_UNIT_HEADER_SIZE < buf_size; buf_idx++) {
+            if (buf[buf_idx  ] == 'B' && buf[buf_idx+1] == 'B' &&
+                buf[buf_idx+2] == 'C' && buf[buf_idx+3] == 'D')
+                break;
+        }
+        /* BBCD found or end of data */
+        if (buf_idx + DATA_UNIT_HEADER_SIZE >= buf_size)
+            break;
+
+        data_unit_size = AV_RB32(buf+buf_idx+5);
+        if (data_unit_size > buf_size - buf_idx || !data_unit_size) {
+            if(data_unit_size > buf_size - buf_idx)
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "Data unit with size %d is larger than input buffer, discarding\n",
+                   data_unit_size);
+            buf_idx += 4;
+            continue;
+        }
+        /* [DIRAC_STD] dirac_decode_data_unit makes reference to the while defined in 9.3 inside the function parse_sequence() */
+        ret = dirac_decode_data_unit(avctx, buf+buf_idx, data_unit_size);
+        if (ret < 0)
+        {
+            av_log(s->avctx, AV_LOG_ERROR,"Error in dirac_decode_data_unit\n");
+            return ret;
+        }
+        buf_idx += data_unit_size;
+    }
+
+    if (!s->current_picture)
+        return buf_size;
+
+    if (s->current_picture->avframe->display_picture_number > s->frame_number) {
+        DiracFrame *delayed_frame = remove_frame(s->delay_frames, s->frame_number);
+
+        s->current_picture->reference |= DELAYED_PIC_REF;
+
+        if (add_frame(s->delay_frames, MAX_DELAY, s->current_picture)) {
+            int min_num = s->delay_frames[0]->avframe->display_picture_number;
+            /* Too many delayed frames, so we display the frame with the lowest pts */
+            av_log(avctx, AV_LOG_ERROR, "Delay frame overflow\n");
+
+            for (i = 1; s->delay_frames[i]; i++)
+                if (s->delay_frames[i]->avframe->display_picture_number < min_num)
+                    min_num = s->delay_frames[i]->avframe->display_picture_number;
+
+            delayed_frame = remove_frame(s->delay_frames, min_num);
+            add_frame(s->delay_frames, MAX_DELAY, s->current_picture);
+        }
+
+        if (delayed_frame) {
+            delayed_frame->reference ^= DELAYED_PIC_REF;
+            if((ret=av_frame_ref(data, delayed_frame->avframe)) < 0)
+                return ret;
+            *got_frame = 1;
+        }
+    } else if (s->current_picture->avframe->display_picture_number == s->frame_number) {
+        /* The right frame at the right time :-) */
+        if((ret=av_frame_ref(data, s->current_picture->avframe)) < 0)
+            return ret;
+        *got_frame = 1;
+    }
+
+    if (*got_frame)
+        s->frame_number = picture->display_picture_number + 1;
+
+    return buf_idx;
+}
+
+AVCodec ff_dirac_decoder = {
+    .name           = "dirac",
+    .long_name      = NULL_IF_CONFIG_SMALL("BBC Dirac VC-2"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_DIRAC,
+    .priv_data_size = sizeof(DiracContext),
+    .init           = dirac_decode_init,
+    .close          = dirac_decode_end,
+    .decode         = dirac_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DELAY,
+    .flush          = dirac_decode_flush,
+};
diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
new file mode 100644
index 0000000000..6b02779edb
--- /dev/null
+++ b/libavcodec/diracdsp.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "diracdsp.h"
+#include "libavcodec/x86/diracdsp_mmx.h"
+
+#define FILTER(src, stride)                                     \
+    ((21*((src)[ 0*stride] + (src)[1*stride])                   \
+      -7*((src)[-1*stride] + (src)[2*stride])                   \
+      +3*((src)[-2*stride] + (src)[3*stride])                   \
+      -1*((src)[-3*stride] + (src)[4*stride]) + 16) >> 5)
+
+static void dirac_hpel_filter(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, const uint8_t *src,
+                              int stride, int width, int height)
+{
+    int x, y;
+
+    for (y = 0; y < height; y++) {
+        for (x = -3; x < width+5; x++)
+            dstv[x] = av_clip_uint8(FILTER(src+x, stride));
+
+        for (x = 0; x < width; x++)
+            dstc[x] = av_clip_uint8(FILTER(dstv+x, 1));
+
+        for (x = 0; x < width; x++)
+            dsth[x] = av_clip_uint8(FILTER(src+x, 1));
+
+        src  += stride;
+        dsth += stride;
+        dstv += stride;
+        dstc += stride;
+    }
+}
+
+#define PIXOP_BILINEAR(PFX, OP, WIDTH)                                  \
+    static void ff_ ## PFX ## _dirac_pixels ## WIDTH ## _bilinear_c(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
+    {                                                                   \
+        int x;                                                          \
+        const uint8_t *s0 = src[0];                                     \
+        const uint8_t *s1 = src[1];                                     \
+        const uint8_t *s2 = src[2];                                     \
+        const uint8_t *s3 = src[3];                                     \
+        const uint8_t *w  = src[4];                                     \
+                                                                        \
+        while (h--) {                                                   \
+            for (x = 0; x < WIDTH; x++) {                               \
+                OP(dst[x], (s0[x]*w[0] + s1[x]*w[1] + s2[x]*w[2] + s3[x]*w[3] + 8) >> 4); \
+            }                                                           \
+                                                                        \
+            dst += stride;                                              \
+            s0 += stride;                                               \
+            s1 += stride;                                               \
+            s2 += stride;                                               \
+            s3 += stride;                                               \
+        }                                                               \
+    }
+
+#define OP_PUT(dst, val) (dst) = (val)
+#define OP_AVG(dst, val) (dst) = (((dst) + (val) + 1)>>1)
+
+PIXOP_BILINEAR(put, OP_PUT, 8)
+PIXOP_BILINEAR(put, OP_PUT, 16)
+PIXOP_BILINEAR(put, OP_PUT, 32)
+PIXOP_BILINEAR(avg, OP_AVG, 8)
+PIXOP_BILINEAR(avg, OP_AVG, 16)
+PIXOP_BILINEAR(avg, OP_AVG, 32)
+
+#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + (1<<(log2_denom-1))) >> log2_denom)
+#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + (1<<(log2_denom-1))) >> log2_denom)
+
+#define DIRAC_WEIGHT(W)                                                 \
+    static void weight_dirac_pixels ## W ## _c(uint8_t *block, int stride, int log2_denom, \
+                                               int weight, int h) {     \
+        int x;                                                          \
+        while (h--) {                                                   \
+            for (x = 0; x < W; x++) {                                   \
+                op_scale1(x);                                           \
+                op_scale1(x+1);                                         \
+            }                                                           \
+            block += stride;                                            \
+        }                                                               \
+    }                                                                   \
+    static void biweight_dirac_pixels ## W ## _c(uint8_t *dst, const uint8_t *src, int stride, int log2_denom, \
+                                                 int weightd, int weights, int h) { \
+        int x;                                                          \
+        while (h--) {                                                   \
+            for (x = 0; x < W; x++) {                                   \
+                op_scale2(x);                                           \
+                op_scale2(x+1);                                         \
+            }                                                           \
+            dst += stride;                                              \
+            src += stride;                                              \
+        }                                                               \
+    }
+
+DIRAC_WEIGHT(8)
+DIRAC_WEIGHT(16)
+DIRAC_WEIGHT(32)
+
+#define ADD_OBMC(xblen)                                                 \
+    static void add_obmc ## xblen ## _c(uint16_t *dst, const uint8_t *src, int stride, \
+                                        const uint8_t *obmc_weight, int yblen) \
+    {                                                                   \
+        int x;                                                          \
+        while (yblen--) {                                               \
+            for (x = 0; x < xblen; x += 2) {                            \
+                dst[x  ] += src[x  ] * obmc_weight[x  ];                \
+                dst[x+1] += src[x+1] * obmc_weight[x+1];                \
+            }                                                           \
+            dst += stride;                                              \
+            src += stride;                                              \
+            obmc_weight += 32;                                          \
+        }                                                               \
+    }
+
+ADD_OBMC(8)
+ADD_OBMC(16)
+ADD_OBMC(32)
+
+static void put_signed_rect_clamped_c(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height)
+{
+    int x, y;
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x+=4) {
+            dst[x  ] = av_clip_uint8(src[x  ] + 128);
+            dst[x+1] = av_clip_uint8(src[x+1] + 128);
+            dst[x+2] = av_clip_uint8(src[x+2] + 128);
+            dst[x+3] = av_clip_uint8(src[x+3] + 128);
+        }
+        dst += dst_stride;
+        src += src_stride;
+    }
+}
+
+static void add_rect_clamped_c(uint8_t *dst, const uint16_t *src, int stride,
+                               const int16_t *idwt, int idwt_stride,
+                               int width, int height)
+{
+    int x, y;
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x+=2) {
+            dst[x  ] = av_clip_uint8(((src[x  ]+32)>>6) + idwt[x  ]);
+            dst[x+1] = av_clip_uint8(((src[x+1]+32)>>6) + idwt[x+1]);
+        }
+        dst += stride;
+        src += stride;
+        idwt += idwt_stride;
+    }
+}
+
+#define PIXFUNC(PFX, WIDTH)                                             \
+    c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _c; \
+    c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l2_c; \
+    c->PFX ## _dirac_pixels_tab[WIDTH>>4][2] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l4_c; \
+    c->PFX ## _dirac_pixels_tab[WIDTH>>4][3] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _bilinear_c
+
+av_cold void ff_diracdsp_init(DiracDSPContext *c)
+{
+    c->dirac_hpel_filter = dirac_hpel_filter;
+    c->add_rect_clamped = add_rect_clamped_c;
+    c->put_signed_rect_clamped = put_signed_rect_clamped_c;
+
+    c->add_dirac_obmc[0] = add_obmc8_c;
+    c->add_dirac_obmc[1] = add_obmc16_c;
+    c->add_dirac_obmc[2] = add_obmc32_c;
+
+    c->weight_dirac_pixels_tab[0] = weight_dirac_pixels8_c;
+    c->weight_dirac_pixels_tab[1] = weight_dirac_pixels16_c;
+    c->weight_dirac_pixels_tab[2] = weight_dirac_pixels32_c;
+    c->biweight_dirac_pixels_tab[0] = biweight_dirac_pixels8_c;
+    c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
+    c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
+
+    PIXFUNC(put, 8);
+    PIXFUNC(put, 16);
+    PIXFUNC(put, 32);
+    PIXFUNC(avg, 8);
+    PIXFUNC(avg, 16);
+    PIXFUNC(avg, 32);
+
+    if (HAVE_MMX && HAVE_YASM) ff_diracdsp_init_mmx(c);
+}
diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
new file mode 100644
index 0000000000..613ca5bc83
--- /dev/null
+++ b/libavcodec/diracdsp.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DIRACDSP_H
+#define AVCODEC_DIRACDSP_H
+
+#include <stdint.h>
+
+typedef void (*dirac_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int h);
+typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int stride, int log2_denom, int weightd, int weights, int h);
+
+typedef struct {
+    void (*dirac_hpel_filter)(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, const uint8_t *src, int stride, int width, int height);
+    /**
+     * dirac_pixels_tab[width][subpel]
+     * width is 2 for 32, 1 for 16, 0 for 8
+     * subpel is 0 for fpel and hpel (only need to copy from the first plane in src)
+     *           1 if an average of the first 2 planes is needed (TODO: worth it?)
+     *           2 for general qpel (avg of 4)
+     *           3 for general epel (biweight of 4 using the weights in src[4])
+     * src[0-3] is each of the hpel planes
+     * src[4] is the 1/8 pel weights if needed
+     */
+    void (*put_dirac_pixels_tab[3][4])(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+    void (*avg_dirac_pixels_tab[3][4])(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+    void (*put_signed_rect_clamped)(uint8_t *dst/*align 16*/, int dst_stride, const int16_t *src/*align 16*/, int src_stride, int width, int height/*mod 2*/);
+    void (*put_rect_clamped)(uint8_t *dst/*align 16*/, int dst_stride, const int16_t *src/*align 16*/, int src_stride, int width, int height/*mod 2*/);
+    void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int idwt_stride, int width, int height/*mod 2*/);
+    void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+    dirac_weight_func weight_dirac_pixels_tab[3];
+    dirac_biweight_func biweight_dirac_pixels_tab[3];
+} DiracDSPContext;
+
+#define DECL_DIRAC_PIXOP(PFX, EXT)                                      \
+    void ff_ ## PFX ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h); \
+    void ff_ ## PFX ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h); \
+    void ff_ ## PFX ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+
+DECL_DIRAC_PIXOP(put, c);
+DECL_DIRAC_PIXOP(avg, c);
+DECL_DIRAC_PIXOP(put, l2_c);
+DECL_DIRAC_PIXOP(avg, l2_c);
+DECL_DIRAC_PIXOP(put, l4_c);
+DECL_DIRAC_PIXOP(avg, l4_c);
+
+void ff_diracdsp_init(DiracDSPContext *c);
+
+#endif /* AVCODEC_DIRACDSP_H */
diff --git a/libavcodec/dnxhd_parser.c b/libavcodec/dnxhd_parser.c
index 0de3561fc3..fffb98fa48 100644
--- a/libavcodec/dnxhd_parser.c
+++ b/libavcodec/dnxhd_parser.c
@@ -2,20 +2,20 @@
  * DNxHD/VC-3 parser
  * Copyright (c) 2008 Baptiste Coudurier <baptiste.coudurier@free.fr>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,21 +26,32 @@
 
 #include "parser.h"
 
-#define DNXHD_HEADER_PREFIX 0x0000028001
+#define DNXHD_HEADER_PREFIX 0x000002800100
 
-static int dnxhd_find_frame_end(ParseContext *pc,
+typedef struct {
+    ParseContext pc;
+    int interlaced;
+    int cur_field; /* first field is 0, second is 1 */
+} DNXHDParserContext;
+
+static int dnxhd_find_frame_end(DNXHDParserContext *dctx,
                                 const uint8_t *buf, int buf_size)
 {
+    ParseContext *pc = &dctx->pc;
     uint64_t state = pc->state64;
     int pic_found = pc->frame_start_found;
     int i = 0;
+    int interlaced = dctx->interlaced;
+    int cur_field = dctx->cur_field;
 
     if (!pic_found) {
         for (i = 0; i < buf_size; i++) {
             state = (state << 8) | buf[i];
-            if ((state & 0xffffffffffLL) == DNXHD_HEADER_PREFIX) {
+            if ((state & 0xffffffffff00LL) == DNXHD_HEADER_PREFIX) {
                 i++;
                 pic_found = 1;
+                interlaced = (state&2)>>1; /* byte following the 5-byte header prefix */
+                cur_field = state&1;
                 break;
             }
         }
@@ -51,15 +62,25 @@ static int dnxhd_find_frame_end(ParseContext *pc,
             return 0;
         for (; i < buf_size; i++) {
             state = (state << 8) | buf[i];
-            if ((state & 0xffffffffffLL) == DNXHD_HEADER_PREFIX) {
-                pc->frame_start_found = 0;
-                pc->state64 = -1;
-                return i - 4;
+            if ((state & 0xffffffffff00LL) == DNXHD_HEADER_PREFIX) {
+                if (!interlaced || dctx->cur_field) {
+                    pc->frame_start_found = 0;
+                    pc->state64 = -1;
+                    dctx->interlaced = interlaced;
+                    dctx->cur_field = 0;
+                    return i - 5;
+                } else {
+                    /* continue, to get the second field */
+                    dctx->interlaced = interlaced = (state&2)>>1;
+                    dctx->cur_field = cur_field = state&1;
+                }
             }
         }
     }
     pc->frame_start_found = pic_found;
     pc->state64 = state;
+    dctx->interlaced = interlaced;
+    dctx->cur_field = cur_field;
     return END_NOT_FOUND;
 }
 
@@ -68,13 +89,14 @@ static int dnxhd_parse(AVCodecParserContext *s,
                        const uint8_t **poutbuf, int *poutbuf_size,
                        const uint8_t *buf, int buf_size)
 {
-    ParseContext *pc = s->priv_data;
+    DNXHDParserContext *dctx = s->priv_data;
+    ParseContext *pc = &dctx->pc;
     int next;
 
     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
         next = buf_size;
     } else {
-        next = dnxhd_find_frame_end(pc, buf, buf_size);
+        next = dnxhd_find_frame_end(dctx, buf, buf_size);
         if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
             *poutbuf      = NULL;
             *poutbuf_size = 0;
@@ -88,7 +110,7 @@ static int dnxhd_parse(AVCodecParserContext *s,
 
 AVCodecParser ff_dnxhd_parser = {
     .codec_ids      = { AV_CODEC_ID_DNXHD },
-    .priv_data_size = sizeof(ParseContext),
+    .priv_data_size = sizeof(DNXHDParserContext),
     .parser_parse   = dnxhd_parse,
     .parser_close   = ff_parse_close,
 };
diff --git a/libavcodec/dnxhddata.c b/libavcodec/dnxhddata.c
index 282829eeb2..c45a225bc9 100644
--- a/libavcodec/dnxhddata.c
+++ b/libavcodec/dnxhddata.c
@@ -2,20 +2,20 @@
  * VC3/DNxHD data.
  * Copyright (c) 2007 SmartJog S.A., Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -331,65 +331,44 @@ static const uint8_t dnxhd_1237_ac_bits[257] = {
 
 /* Used in CID 1237, 1242, 1253, 1259, 1260 */
 static const uint8_t dnxhd_1237_ac_level[257] = {
-     1,  1,  2,  0,  3,  4,  2,  5,  6,  7,  3,  8,  9, 10, 11, 12,
-     4,  5, 13, 14, 15, 16,  6, 17, 18, 19, 20, 21,  7, 22, 23, 24,
-    25, 26, 27,  8,  9, 28, 29, 30, 31, 32, 33, 34, 10, 11, 12, 35,
-    36, 37, 38, 39, 40, 41, 13, 14, 15, 16, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 17, 18, 19, 20, 21, 53, 54, 55, 56, 57, 58,
-    59, 60, 61, 64,  1, 22, 23, 24, 25, 26, 27, 62, 63,  2,  3,  4,
-     5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
-    21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
-    37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
-    53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
+      3,  3,  5,  0,  7,  9,  5, 11, 13, 15,  7, 17, 19, 21, 23, 25,
+      9, 11, 27, 29, 31, 33, 13, 35, 37, 39, 41, 43, 15, 45, 47, 49,
+     51, 53, 55, 17, 19, 57, 59, 61, 63, 65, 67, 69, 21, 23, 25, 71,
+     73, 75, 77, 79, 81, 83, 27, 29, 31, 33, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105, 35, 37, 39, 41, 43,107,109,111,113,115,117,
+    119,121,123,129,  3, 45, 47, 49, 51, 53, 55,125,127,  5,  7,  9,
+     11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
+     43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73,
+     75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99,101,103,105,
+    107,109,111,113,115,117,119,121,123,125,127,129, 57, 59, 61, 63,
+     65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+     33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+     65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129,
 };
 
 /* Used in CID 1237, 1242, 1253, 1259, 1260 */
-static const uint8_t dnxhd_1237_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-    1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-    0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
+static const uint8_t dnxhd_1237_ac_flags[257] = {
+    0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0,
+    2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0,
+    0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0,
+    0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
-static const uint8_t dnxhd_1237_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3,
 };
 
 /* Used in CID 1238, 1243 */
@@ -452,65 +431,43 @@ static const uint8_t dnxhd_1238_ac_bits[257] = {
 
 /* Used in CID 1238, 1243 */
 static const uint8_t dnxhd_1238_ac_level[257] = {
-     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
-    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21, 22,  6,  7, 23, 24,
-    25, 26, 27, 28, 29,  8,  9, 30, 31, 32, 33, 34, 35, 36, 37, 10,
-    11, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 12, 13, 14, 49,
-    50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 15, 16, 17, 18,
-    62, 63, 64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
-    14, 15, 16, 19, 20, 21, 22, 23, 24, 17, 18, 19, 20, 21, 22, 23,
-    24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 40, 25,
-    26, 27, 28, 29, 30, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49,
-    50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
+      3,  3,  5,  7,  0,  9, 11,  5, 13, 15, 17,  7, 19, 21, 23,  9,
+     25, 27, 29, 31, 33, 11, 35, 37, 39, 41, 43, 45, 13, 15, 47, 49,
+     51, 53, 55, 57, 59, 17, 19, 61, 63, 65, 67, 69, 71, 73, 75, 21,
+     23, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 25, 27, 29, 99,
+    101,103,105,107,109,111,113,115,117,119,121,123, 31, 33, 35, 37,
+    125,127,129,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27,
+     29, 31, 33, 39, 41, 43, 45, 47, 49, 35, 37, 39, 41, 43, 45, 47,
+     49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 81, 51,
+     53, 55, 57, 59, 61, 77, 79, 83, 85, 87, 89, 91, 93, 95, 97, 99,
+    101,103,105,107,109,111,113,115,117,119,121,123,125,127,129, 63,
+     65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+     33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+     65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129,
 }; /* 0 is EOB */
 
-/* Used in CID 1238, 1243 */
-static const uint8_t dnxhd_1238_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1238, 1243 */
-static const uint8_t dnxhd_1238_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+static const uint8_t dnxhd_1238_ac_flags[257] = {
+    0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2,
+    0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0,
+    0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2,
+    2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2,
     0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
+    1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
+    2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3,
 };
 
 /* Used in CID 1235, 1241, 1256 */
@@ -573,65 +530,44 @@ static const uint8_t dnxhd_1235_ac_bits[257] = {
 
 /* Used in CID 1235, 1241, 1256 */
 static const uint8_t dnxhd_1235_ac_level[257] = {
-     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
-    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21,  6,  7, 22, 23, 24,
-    25, 26, 27, 28, 29,  8,  9, 30, 31, 32, 33, 34, 35, 36, 37, 38,
-    10, 11, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 12, 13,
-    14, 15, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,  1,
-    16, 17, 18, 19, 64,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,
-    13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 18, 19, 20, 21, 22, 23,
-    24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
-    40, 41, 42, 25, 26, 27, 28, 29, 30, 31, 32, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
+      3,  3,  5,  7,  0,  9, 11,  5, 13, 15, 17,  7, 19, 21, 23,  9,
+     25, 27, 29, 31, 33, 11, 35, 37, 39, 41, 43, 13, 15, 45, 47, 49,
+     51, 53, 55, 57, 59, 17, 19, 61, 63, 65, 67, 69, 71, 73, 75, 77,
+     21, 23, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99,101, 25, 27,
+     29, 31,103,105,107,109,111,113,115,117,119,121,123,125,127,  3,
+     33, 35, 37, 39,129,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25,
+     27, 29, 31, 33, 35, 41, 43, 45, 47, 49, 37, 39, 41, 43, 45, 47,
+     49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79,
+     81, 83, 85, 51, 53, 55, 57, 59, 61, 63, 65, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+     33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+     65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129,
 };
 
 /* Used in CID 1235, 1241, 1256 */
-static const uint8_t dnxhd_1235_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
-    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+static const uint8_t dnxhd_1235_ac_flags[257] = {
+    0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2,
+    0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0,
+    0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,
+    2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+    2, 2, 2, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1235, 1241, 1256 */
-static const uint8_t dnxhd_1235_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3,
 };
 
 static const uint16_t dnxhd_1250_ac_codes[257] = {
@@ -689,61 +625,42 @@ static const uint8_t dnxhd_1250_ac_bits[257] = {
     16
 };
 static const uint8_t dnxhd_1250_ac_level[257] = {
-     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
-    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21, 22,  6, 23, 24, 25,
-    26, 27, 28, 29,  7,  8, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
-     9, 10, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 11,
-    12, 13, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,  1,  2,
-     3,  4,  5, 14, 15, 16, 17,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 18, 19, 20, 21,
-    27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
-    43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 22, 23, 24,
-    25, 26, 27, 54, 57, 58, 59, 60, 61, 62, 63, 64, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64
-};
-static const uint8_t dnxhd_1250_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-    0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
-    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      3,  3,  5,  7,  0,  9, 11,  5, 13, 15, 17,  7, 19, 21, 23,  9,
+     25, 27, 29, 31, 33, 11, 35, 37, 39, 41, 43, 45, 13, 47, 49, 51,
+     53, 55, 57, 59, 15, 17, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79,
+     19, 21, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99,101,103,105, 23,
+     25, 27,107,109,111,113,115,117,119,121,123,125,127,129,  3,  5,
+      7,  9, 11, 29, 31, 33, 35, 13, 15, 17, 19, 21, 23, 25, 27, 29,
+     31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 37, 39, 41, 43,
+     55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85,
+     87, 89, 91, 93, 95, 97, 99,101,103,105,107,111,113, 45, 47, 49,
+     51, 53, 55,109,115,117,119,121,123,125,127,129, 57, 59, 61, 63,
+     65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+     33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+     65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129
+};
+static const uint8_t dnxhd_1250_ac_flags[257] = {
+    0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2,
+    0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
+    0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
+    2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+    1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1
-};
-static const uint8_t dnxhd_1250_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
-    1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
+    2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3,
 };
 
 static const uint16_t dnxhd_1251_ac_codes[257] = {
@@ -803,63 +720,43 @@ static const uint8_t dnxhd_1251_ac_bits[257] = {
 };
 
 static const uint8_t dnxhd_1251_ac_level[257] = {
-     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
-    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21,  6, 22, 23, 24, 25,
-    26, 27, 28, 29,  7,  8, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
-    40,  9, 10, 11, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
-    12, 13, 14, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,  1,
-     2,  3,  4,  5,  6,  7,  8, 15, 16, 17,  9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 18,
-    19, 20, 21, 22, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
-    42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
-    58, 23, 24, 25, 26, 27, 28, 59, 60, 61, 62, 63, 64, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
-};
-
-static const uint8_t dnxhd_1251_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
-    0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      3,  3,  5,  7,  0,  9, 11,  5, 13, 15, 17,  7, 19, 21, 23,  9,
+     25, 27, 29, 31, 33, 11, 35, 37, 39, 41, 43, 13, 45, 47, 49, 51,
+     53, 55, 57, 59, 15, 17, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79,
+     81, 19, 21, 23, 83, 85, 87, 89, 91, 93, 95, 97, 99,101,103,105,
+     25, 27, 29,107,109,111,113,115,117,119,121,123,125,127,129,  3,
+      5,  7,  9, 11, 13, 15, 17, 31, 33, 35, 19, 21, 23, 25, 27, 29,
+     31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 37,
+     39, 41, 43, 45, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83,
+     85, 87, 89, 91, 93, 95, 97, 99,101,103,105,107,109,111,113,115,
+    117, 47, 49, 51, 53, 55, 57,119,121,123,125,127,129, 59, 61, 63,
+     65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+     33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+     65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129,
+};
+
+static const uint8_t dnxhd_1251_ac_flags[257] = {
+    0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2,
+    0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
+    0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+    1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
+    2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-static const uint8_t dnxhd_1251_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
-    0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
+    1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3,
 };
 
 /* Used in CID 1252, 1258 */
@@ -922,65 +819,43 @@ static const uint8_t dnxhd_1252_ac_bits[257] = {
 
 /* Used in CID 1252, 1258 */
 static const uint8_t dnxhd_1252_ac_level[257] = {
-     1,  1,  2,  3,  2,  0,  4,  5,  6,  7,  3,  8,  9, 10, 11, 12,
-    13, 14,  4,  5, 15, 16, 17, 18,  6, 19, 20, 21, 22, 23, 24,  7,
-     8, 25, 26, 27, 28, 29, 30, 31, 32,  9, 10, 33, 34, 35, 36, 37,
-    38, 39, 40, 41, 11, 12, 13, 42, 43, 44, 45, 46, 47, 48, 49, 50,
-    51, 52, 53, 14, 15, 16, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3, 17, 18, 19, 20,  4,  5,  6,  7,  8,  9, 10, 11,
-    12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 22,
-    23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
-    39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-    55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
-};
-
-/* Used in CID 1252, 1258 */
-static const uint8_t dnxhd_1252_ac_run_flag[257] = {
-    0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-    0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
-    1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      3,  3,  5,  7,  5,  0,  9, 11, 13, 15,  7, 17, 19, 21, 23, 25,
+     27, 29,  9, 11, 31, 33, 35, 37, 13, 39, 41, 43, 45, 47, 49, 15,
+     17, 51, 53, 55, 57, 59, 61, 63, 65, 19, 21, 67, 69, 71, 73, 75,
+     77, 79, 81, 83, 23, 25, 27, 85, 87, 89, 91, 93, 95, 97, 99,101,
+    103,105,107, 29, 31, 33,109,111,113,115,117,119,121,123,125,127,
+    129,  3,  5,  7, 35, 37, 39, 41,  9, 11, 13, 15, 17, 19, 21, 23,
+     25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 43, 45, 47, 49, 51, 45,
+     47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77,
+     79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99,101,103,105,107,109,
+    111,113,115,117,119,121,123,125,127,129, 53, 55, 57, 59, 61, 63,
+     65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+     33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+     65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+     97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127,
+    129,
+};
+
+static const uint8_t dnxhd_1252_ac_flags[257] = {
+    0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
+    0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2,
+    2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1252, 1258 */
-static const uint8_t dnxhd_1252_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3,
 };
 
 /* Used in CID 1235, 1238, 1241, 1256 */
@@ -1075,102 +950,113 @@ static const uint8_t dnxhd_1250_run[62] = {
 };
 
 const CIDEntry ff_dnxhd_cid_table[] = {
-    { 1235, 1920, 1080, 0, 917504, 917504, 6, 10,
+    { 1235, 1920, 1080, 0, 917504, 917504, 6, 10, 4,
       dnxhd_1235_luma_weight, dnxhd_1235_chroma_weight,
       dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
       dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_level,
-      dnxhd_1235_ac_run_flag, dnxhd_1235_ac_index_flag,
+      dnxhd_1235_ac_flags,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
-      { 175, 185, 365, 440 } },
-    { 1237, 1920, 1080, 0, 606208, 606208, 4, 8,
+      { 175, 185, 365, 440 },
+      { { 24000, 1001 }, { 25, 1 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1237, 1920, 1080, 0, 606208, 606208, 4, 8, 3,
       dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
       dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_flags,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
-      { 115, 120, 145, 240, 290 } },
-    { 1238, 1920, 1080, 0, 917504, 917504, 4, 8,
+      { 115, 120, 145, 240, 290 },
+      { { 24000, 1001 }, { 25, 1 }, { 30000, 1001 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1238, 1920, 1080, 0, 917504, 917504, 4, 8, 4,
       dnxhd_1238_luma_weight, dnxhd_1238_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
       dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_level,
-      dnxhd_1238_ac_run_flag, dnxhd_1238_ac_index_flag,
+      dnxhd_1238_ac_flags,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1238_run,
-      { 175, 185, 220, 365, 440 } },
-    { 1241, 1920, 1080, 1, 917504, 458752, 6, 10,
+      { 175, 185, 220, 365, 440 },
+      { { 24000, 1001 }, { 25, 1 }, { 30000, 1001 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1241, 1920, 1080, 1, 917504, 458752, 6, 10, 4,
       dnxhd_1241_luma_weight, dnxhd_1241_chroma_weight,
       dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
       dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_level,
-      dnxhd_1235_ac_run_flag, dnxhd_1235_ac_index_flag,
+      dnxhd_1235_ac_flags,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
-      { 185, 220 } },
-    { 1242, 1920, 1080, 1, 606208, 303104, 4, 8,
+      { 185, 220 },
+      { { 25, 1 }, { 30000, 1001 } } },
+    { 1242, 1920, 1080, 1, 606208, 303104, 4, 8, 3,
       dnxhd_1242_luma_weight, dnxhd_1242_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
       dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_flags,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
-      { 120, 145 } },
-    { 1243, 1920, 1080, 1, 917504, 458752, 4, 8,
+      { 120, 145 },
+      { { 25, 1 }, { 30000, 1001 } } },
+    { 1243, 1920, 1080, 1, 917504, 458752, 4, 8, 4,
       dnxhd_1243_luma_weight, dnxhd_1243_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
       dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_level,
-      dnxhd_1238_ac_run_flag, dnxhd_1238_ac_index_flag,
+      dnxhd_1238_ac_flags,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1238_run,
-      { 185, 220 } },
-    { 1250, 1280,  720, 0, 458752, 458752, 6, 10,
+      { 185, 220 },
+      { { 25, 1 }, { 30000, 1001 } } },
+    { 1250, 1280,  720, 0, 458752, 458752, 6, 10, 4,
       dnxhd_1250_luma_weight, dnxhd_1250_chroma_weight,
       dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
       dnxhd_1250_ac_codes, dnxhd_1250_ac_bits, dnxhd_1250_ac_level,
-      dnxhd_1250_ac_run_flag, dnxhd_1250_ac_index_flag,
+      dnxhd_1250_ac_flags,
       dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
-      { 90, 180, 220 } },
-    { 1251, 1280,  720, 0, 458752, 458752, 4, 8,
+      { 90, 90, 180, 220 },
+      { { 24000, 1001 }, { 25, 1 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1251, 1280,  720, 0, 458752, 458752, 4, 8, 4,
       dnxhd_1251_luma_weight, dnxhd_1251_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
       dnxhd_1251_ac_codes, dnxhd_1251_ac_bits, dnxhd_1251_ac_level,
-      dnxhd_1251_ac_run_flag, dnxhd_1251_ac_index_flag,
+      dnxhd_1251_ac_flags,
       dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
-      { 90, 110, 175, 220 } },
-    { 1252, 1280,  720, 0, 303104, 303104, 4, 8,
+      { 90, 90, 110, 180, 220 },
+      { { 24000, 1001 }, { 25, 1 }, { 30000, 1001 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1252, 1280,  720, 0, 303104, 303104, 4, 8, 5,
       dnxhd_1252_luma_weight, dnxhd_1252_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
       dnxhd_1252_ac_codes, dnxhd_1252_ac_bits, dnxhd_1252_ac_level,
-      dnxhd_1252_ac_run_flag, dnxhd_1252_ac_index_flag,
+      dnxhd_1252_ac_flags,
       dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
-      { 60, 75, 115, 145 } },
-    { 1253, 1920, 1080, 0, 188416, 188416, 4, 8,
+      { 60, 60, 75, 120, 145 },
+      { { 24000, 1001 }, { 25, 1 }, { 30000, 1001 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1253, 1920, 1080, 0, 188416, 188416, 4, 8, 3,
       dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
       dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_flags,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
-      { 36, 45, 75, 90 } },
-    { 1256, 1920, 1080, 0, 1835008, 1835008, 6, 10,
-      dnxhd_1235_luma_weight, dnxhd_1235_chroma_weight,
+      { 36, 36, 45, 75, 90 },
+      { { 24000, 1001 }, { 25, 1 }, { 30000, 1001 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1256, 1920, 1080, 0, 1835008, 1835008, 6, 10, 4,
+      dnxhd_1235_luma_weight, dnxhd_1235_luma_weight,
       dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
       dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_level,
-      dnxhd_1235_ac_run_flag, dnxhd_1235_ac_index_flag,
+      dnxhd_1235_ac_flags,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
-      { 350, 390, 440, 730, 880 } },
-    { 1258, 960, 720, 0, 212992, 212992, 4, 8,
+      { 350, 390, 440, 730, 880 },
+      { { 24000, 1001 }, { 25, 1 }, { 30000, 1001 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1258, 960, 720, 0, 212992, 212992, 4, 8, 5,
       dnxhd_1252_luma_weight, dnxhd_1252_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
       dnxhd_1252_ac_codes, dnxhd_1252_ac_bits, dnxhd_1252_ac_level,
-      dnxhd_1252_ac_run_flag, dnxhd_1252_ac_index_flag,
+      dnxhd_1252_ac_flags,
       dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
       { 42, 60, 75, 115 } },
-    { 1259, 1440, 1080, 0, 417792, 417792, 4, 8,
+    { 1259, 1440, 1080, 0, 417792, 417792, 4, 8, 3,
       dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
       dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_flags,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
       { 63, 84, 100, 110 } },
-    { 1260, 1440, 1080, 1, 835584, 417792, 4, 8,
+    { 1260, 1440, 1080, 1, 835584, 417792, 4, 8, 3,
       dnxhd_1260_luma_weight, dnxhd_1260_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
       dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_flags,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
       { 80, 90, 100, 110 } },
 };
@@ -1184,6 +1070,22 @@ int ff_dnxhd_get_cid_table(int cid)
     return -1;
 }
 
+int avpriv_dnxhd_get_frame_size(int cid)
+{
+    int i = ff_dnxhd_get_cid_table(cid);
+    if (i<0)
+        return i;
+    return ff_dnxhd_cid_table[i].frame_size;
+}
+
+int avpriv_dnxhd_get_interlaced(int cid)
+{
+    int i = ff_dnxhd_get_cid_table(cid);
+    if (i < 0)
+        return i;
+    return ff_dnxhd_cid_table[i].interlaced;
+}
+
 int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth)
 {
     int i, j;
@@ -1195,7 +1097,7 @@ int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth)
         if (cid->width == avctx->width && cid->height == avctx->height &&
             cid->interlaced == !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) &&
             cid->bit_depth == bit_depth) {
-            for (j = 0; j < sizeof(cid->bit_rates); j++) {
+            for (j = 0; j < FF_ARRAY_ELEMS(cid->bit_rates); j++) {
                 if (cid->bit_rates[j] == mbs)
                     return cid->cid;
             }
@@ -1203,3 +1105,19 @@ int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth)
     }
     return 0;
 }
+
+void ff_dnxhd_print_profiles(AVCodecContext *avctx, int loglevel)
+{
+    int i, j;
+    for (i = 0; i < FF_ARRAY_ELEMS(ff_dnxhd_cid_table); i++) {
+        const CIDEntry *cid = &ff_dnxhd_cid_table[i];
+        for (j = 0; j < FF_ARRAY_ELEMS(cid->bit_rates); j++) {
+            if (!cid->bit_rates[j])
+                break;
+
+            av_log(avctx, loglevel, "Frame size: %dx%d%c; bitrate: %dMbps; pixel format: %s; framerate: %d/%d\n",
+                   cid->width, cid->height, cid->interlaced ? 'i' : 'p', cid->bit_rates[j],
+                   cid->bit_depth == 10 ? "yuv422p10" : "yuv422p", cid->frame_rates[j].num, cid->frame_rates[j].den);
+        }
+    }
+}
diff --git a/libavcodec/dnxhddata.h b/libavcodec/dnxhddata.h
index 66b03499d7..8cc27e88e7 100644
--- a/libavcodec/dnxhddata.h
+++ b/libavcodec/dnxhddata.h
@@ -2,20 +2,20 @@
  * VC3/DNxHD decoder.
  * Copyright (c) 2007 SmartJog S.A., Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 
 #include <stdint.h>
 #include "avcodec.h"
+#include "libavutil/internal.h"
 
 typedef struct CIDEntry {
     int cid;
@@ -33,19 +34,25 @@ typedef struct CIDEntry {
     unsigned int coding_unit_size;
     int index_bits;
     int bit_depth;
+    int eob_index;
     const uint8_t *luma_weight, *chroma_weight;
     const uint8_t *dc_codes, *dc_bits;
     const uint16_t *ac_codes;
     const uint8_t *ac_bits, *ac_level;
-    const uint8_t *ac_run_flag, *ac_index_flag;
+    const uint8_t *ac_flags;
     const uint16_t *run_codes;
     const uint8_t *run_bits, *run;
-    int bit_rates[5]; ///< Helpher to choose variants, rounded to nearest 5Mb/s
+    int bit_rates[5]; ///< Helper to choose variants, rounded to nearest 5Mb/s
+    AVRational frame_rates[5];
 } CIDEntry;
 
 extern const CIDEntry ff_dnxhd_cid_table[];
 
 int ff_dnxhd_get_cid_table(int cid);
 int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth);
+void ff_dnxhd_print_profiles(AVCodecContext *avctx, int loglevel);
+
+int avpriv_dnxhd_get_frame_size(int cid);
+int avpriv_dnxhd_get_interlaced(int cid);
 
 #endif /* AVCODEC_DNXHDDATA_H */
diff --git a/libavcodec/dnxhddec.c b/libavcodec/dnxhddec.c
index 50747ea489..296f7f74d6 100644
--- a/libavcodec/dnxhddec.c
+++ b/libavcodec/dnxhddec.c
@@ -2,23 +2,25 @@
  * VC3/DNxHD decoder.
  * Copyright (c) 2007 SmartJog S.A., Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
  * Copyright (c) 2011 MirriAd Ltd
+ * Copyright (c) 2015 Christophe Gisquet
  *
  * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
+ * Slice multithreading and MB interlaced support added by Christophe Gisquet
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,53 +28,76 @@
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "blockdsp.h"
+#define  UNCHECKED_BITSTREAM_READER 1
 #include "get_bits.h"
 #include "dnxhddata.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "thread.h"
 
+typedef struct RowContext {
+    DECLARE_ALIGNED(16, int16_t, blocks)[12][64];
+    int luma_scale[64];
+    int chroma_scale[64];
+    GetBitContext gb;
+    int last_dc[3];
+    int last_qscale;
+    int errors;
+} RowContext;
+
 typedef struct DNXHDContext {
     AVCodecContext *avctx;
-    GetBitContext gb;
+    RowContext *rows;
     BlockDSPContext bdsp;
-    int cid;                            ///< compression id
+    const uint8_t* buf;
+    int buf_size;
+    int64_t cid;                        ///< compression id
     unsigned int width, height;
+    enum AVPixelFormat pix_fmt;
     unsigned int mb_width, mb_height;
     uint32_t mb_scan_index[68];         /* max for 1080p */
     int cur_field;                      ///< current interlaced field
     VLC ac_vlc, dc_vlc, run_vlc;
-    int last_dc[3];
     IDCTDSPContext idsp;
-    DECLARE_ALIGNED(16, int16_t, blocks)[12][64];
     ScanTable scantable;
     const CIDEntry *cid_table;
     int bit_depth; // 8, 10 or 0 if not initialized at all.
     int is_444;
     int mbaff;
-    void (*decode_dct_block)(struct DNXHDContext *ctx, int16_t *block,
-                             int n, int qscale);
+    int act;
+    int (*decode_dct_block)(const struct DNXHDContext *ctx,
+                            RowContext *row, int n);
 } DNXHDContext;
 
 #define DNXHD_VLC_BITS 9
 #define DNXHD_DC_VLC_BITS 7
 
-static void dnxhd_decode_dct_block_8(DNXHDContext *ctx, int16_t *block,
-                                     int n, int qscale);
-static void dnxhd_decode_dct_block_10(DNXHDContext *ctx, int16_t *block,
-                                      int n, int qscale);
-static void dnxhd_decode_dct_block_10_444(DNXHDContext *ctx, int16_t *block,
-                                          int n, int qscale);
+static int dnxhd_decode_dct_block_8(const DNXHDContext *ctx,
+                                    RowContext *row, int n);
+static int dnxhd_decode_dct_block_10(const DNXHDContext *ctx,
+                                     RowContext *row, int n);
+static int dnxhd_decode_dct_block_10_444(const DNXHDContext *ctx,
+                                         RowContext *row, int n);
 
 static av_cold int dnxhd_decode_init(AVCodecContext *avctx)
 {
     DNXHDContext *ctx = avctx->priv_data;
 
     ctx->avctx = avctx;
+    ctx->cid = -1;
+    avctx->colorspace = AVCOL_SPC_BT709;
+
+    avctx->coded_width  = FFALIGN(avctx->width,  16);
+    avctx->coded_height = FFALIGN(avctx->height, 16);
+
+    ctx->rows = av_mallocz_array(avctx->thread_count, sizeof(RowContext));
+    if (!ctx->rows)
+        return AVERROR(ENOMEM);
+
     return 0;
 }
 
-static int dnxhd_init_vlc(DNXHDContext *ctx, int cid)
+static int dnxhd_init_vlc(DNXHDContext *ctx, uint32_t cid)
 {
     if (cid != ctx->cid) {
         int index;
@@ -81,6 +106,10 @@ static int dnxhd_init_vlc(DNXHDContext *ctx, int cid)
             av_log(ctx->avctx, AV_LOG_ERROR, "unsupported cid %d\n", cid);
             return AVERROR(ENOSYS);
         }
+        if (ff_dnxhd_cid_table[index].bit_depth != ctx->bit_depth) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "bit depth mismatches %d %d\n", ff_dnxhd_cid_table[index].bit_depth, ctx->bit_depth);
+            return AVERROR_INVALIDDATA;
+        }
         ctx->cid_table = &ff_dnxhd_cid_table[index];
         av_log(ctx->avctx, AV_LOG_VERBOSE, "Profile cid %d.\n", cid);
 
@@ -98,13 +127,25 @@ static int dnxhd_init_vlc(DNXHDContext *ctx, int cid)
                  ctx->cid_table->run_bits, 1, 1,
                  ctx->cid_table->run_codes, 2, 2, 0);
 
-        ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable,
-                          ff_zigzag_direct);
         ctx->cid = cid;
     }
     return 0;
 }
 
+static av_cold int dnxhd_decode_init_thread_copy(AVCodecContext *avctx)
+{
+    DNXHDContext *ctx = avctx->priv_data;
+
+    // make sure VLC tables will be loaded when cid is parsed
+    ctx->cid = -1;
+
+    ctx->rows = av_mallocz_array(avctx->thread_count, sizeof(RowContext));
+    if (!ctx->rows)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
 static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
                                const uint8_t *buf, int buf_size,
                                int first_field)
@@ -115,8 +156,8 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
     int old_bit_depth = ctx->bit_depth;
 
     if (buf_size < 0x280) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "buffer too small (%d < 640).\n",
-               buf_size);
+        av_log(ctx->avctx, AV_LOG_ERROR,
+               "buffer too small (%d < 640).\n", buf_size);
         return AVERROR_INVALIDDATA;
     }
 
@@ -132,6 +173,8 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
         frame->top_field_first  = first_field ^ ctx->cur_field;
         av_log(ctx->avctx, AV_LOG_DEBUG,
                "interlaced %d, cur field %d\n", buf[5] & 3, ctx->cur_field);
+    } else {
+        ctx->cur_field = 0;
     }
     ctx->mbaff = buf[0x6] & 32;
 
@@ -145,25 +188,29 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
 
         if (buf[0x4] == 0x2) {
             ctx->decode_dct_block = dnxhd_decode_dct_block_10_444;
-            ctx->avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
+            ctx->pix_fmt = AV_PIX_FMT_YUV444P10;
             ctx->is_444 = 1;
         } else {
             ctx->decode_dct_block = dnxhd_decode_dct_block_10;
-            ctx->avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+            ctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+            ctx->is_444 = 0;
         }
     } else if (buf[0x21] == 0x38) { /* 8 bit */
         ctx->bit_depth = ctx->avctx->bits_per_raw_sample = 8;
 
-        ctx->avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        ctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        ctx->is_444 = 0;
         ctx->decode_dct_block = dnxhd_decode_dct_block_8;
     } else {
-        av_log(ctx->avctx, AV_LOG_ERROR, "invalid bit depth value (%d).\n",
-               buf[0x21]);
+        av_log(ctx->avctx, AV_LOG_ERROR,
+               "invalid bit depth value (%d).\n", buf[0x21]);
         return AVERROR_INVALIDDATA;
     }
     if (ctx->bit_depth != old_bit_depth) {
         ff_blockdsp_init(&ctx->bdsp, ctx->avctx);
         ff_idctdsp_init(&ctx->idsp, ctx->avctx);
+        ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable,
+                          ff_zigzag_direct);
     }
 
     cid = AV_RB32(buf + 0x28);
@@ -175,6 +222,11 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
         av_log(ctx->avctx, AV_LOG_WARNING,
                "Adaptive MB interlace flag in an unsupported profile.\n");
 
+    ctx->act = buf[0x2C] & 7;
+    if (ctx->act && ctx->cid_table->cid != 1256)
+        av_log(ctx->avctx, AV_LOG_WARNING,
+               "Adaptive color transform in an unsupported profile.\n");
+
     // make sure profile size constraints are respected
     // DNx100 allows 1920->1440 and 1280->960 subsampling
     if (ctx->width != ctx->cid_table->width) {
@@ -209,7 +261,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
     for (i = 0; i < ctx->mb_height; i++) {
         ctx->mb_scan_index[i] = AV_RB32(buf + 0x170 + (i << 2));
         ff_dlog(ctx->avctx, "mb scan index %d\n", ctx->mb_scan_index[i]);
-        if (buf_size < ctx->mb_scan_index[i] + 0x280) {
+        if (buf_size < ctx->mb_scan_index[i] + 0x280LL) {
             av_log(ctx->avctx, AV_LOG_ERROR,
                    "invalid mb scan index (%d < %d).\n",
                    buf_size, ctx->mb_scan_index[i] + 0x280);
@@ -220,132 +272,163 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
     return 0;
 }
 
-static av_always_inline void dnxhd_decode_dct_block(DNXHDContext *ctx,
-                                                    int16_t *block, int n,
-                                                    int qscale,
-                                                    int index_bits,
-                                                    int level_bias,
-                                                    int level_shift)
+static av_always_inline int dnxhd_decode_dct_block(const DNXHDContext *ctx,
+                                                   RowContext *row,
+                                                   int n,
+                                                   int index_bits,
+                                                   int level_bias,
+                                                   int level_shift)
 {
-    int i, j, index1, index2, len;
+    int i, j, index1, index2, len, flags;
     int level, component, sign;
+    const int *scale;
     const uint8_t *weight_matrix;
-    OPEN_READER(bs, &ctx->gb);
+    const uint8_t *ac_level = ctx->cid_table->ac_level;
+    const uint8_t *ac_flags = ctx->cid_table->ac_flags;
+    int16_t *block = row->blocks[n];
+    const int eob_index     = ctx->cid_table->eob_index;
+    int ret = 0;
+    OPEN_READER(bs, &row->gb);
+
+    ctx->bdsp.clear_block(block);
 
     if (!ctx->is_444) {
         if (n & 2) {
             component     = 1 + (n & 1);
+            scale = row->chroma_scale;
             weight_matrix = ctx->cid_table->chroma_weight;
         } else {
             component     = 0;
+            scale = row->luma_scale;
             weight_matrix = ctx->cid_table->luma_weight;
         }
     } else {
         component = (n >> 1) % 3;
         if (component) {
+            scale = row->chroma_scale;
             weight_matrix = ctx->cid_table->chroma_weight;
         } else {
+            scale = row->luma_scale;
             weight_matrix = ctx->cid_table->luma_weight;
         }
     }
 
-    UPDATE_CACHE(bs, &ctx->gb);
-    GET_VLC(len, bs, &ctx->gb, ctx->dc_vlc.table, DNXHD_DC_VLC_BITS, 1);
+    UPDATE_CACHE(bs, &row->gb);
+    GET_VLC(len, bs, &row->gb, ctx->dc_vlc.table, DNXHD_DC_VLC_BITS, 1);
     if (len) {
-        level = GET_CACHE(bs, &ctx->gb);
-        LAST_SKIP_BITS(bs, &ctx->gb, len);
+        level = GET_CACHE(bs, &row->gb);
+        LAST_SKIP_BITS(bs, &row->gb, len);
         sign  = ~level >> 31;
         level = (NEG_USR32(sign ^ level, len) ^ sign) - sign;
-        ctx->last_dc[component] += level;
+        row->last_dc[component] += level;
     }
-    block[0] = ctx->last_dc[component];
+    block[0] = row->last_dc[component];
 
-    for (i = 1; ; i++) {
-        UPDATE_CACHE(bs, &ctx->gb);
-        GET_VLC(index1, bs, &ctx->gb, ctx->ac_vlc.table,
-                DNXHD_VLC_BITS, 2);
-        level = ctx->cid_table->ac_level[index1];
-        if (!level) /* EOB */
-            break;
+    i = 0;
 
-        sign = SHOW_SBITS(bs, &ctx->gb, 1);
-        SKIP_BITS(bs, &ctx->gb, 1);
+    UPDATE_CACHE(bs, &row->gb);
+    GET_VLC(index1, bs, &row->gb, ctx->ac_vlc.table,
+            DNXHD_VLC_BITS, 2);
 
-        if (ctx->cid_table->ac_index_flag[index1]) {
-            level += SHOW_UBITS(bs, &ctx->gb, index_bits) << 6;
-            SKIP_BITS(bs, &ctx->gb, index_bits);
+    while (index1 != eob_index) {
+        level = ac_level[index1];
+        flags = ac_flags[index1];
+
+        sign = SHOW_SBITS(bs, &row->gb, 1);
+        SKIP_BITS(bs, &row->gb, 1);
+
+        if (flags & 1) {
+            level += SHOW_UBITS(bs, &row->gb, index_bits) << 7;
+            SKIP_BITS(bs, &row->gb, index_bits);
         }
 
-        if (ctx->cid_table->ac_run_flag[index1]) {
-            UPDATE_CACHE(bs, &ctx->gb);
-            GET_VLC(index2, bs, &ctx->gb, ctx->run_vlc.table,
+        if (flags & 2) {
+            UPDATE_CACHE(bs, &row->gb);
+            GET_VLC(index2, bs, &row->gb, ctx->run_vlc.table,
                     DNXHD_VLC_BITS, 2);
             i += ctx->cid_table->run[index2];
         }
 
-        if (i > 63) {
+        if (++i > 63) {
             av_log(ctx->avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", n, i);
+            ret = -1;
             break;
         }
 
         j     = ctx->scantable.permutated[i];
-        level = (2 * level + 1) * qscale * weight_matrix[i];
+        level *= scale[i];
+        level += scale[i] >> 1;
         if (level_bias < 32 || weight_matrix[i] != level_bias)
-            level += level_bias;
+            level += level_bias; // 1<<(level_shift-1)
         level >>= level_shift;
 
         block[j] = (level ^ sign) - sign;
+
+        UPDATE_CACHE(bs, &row->gb);
+        GET_VLC(index1, bs, &row->gb, ctx->ac_vlc.table,
+                DNXHD_VLC_BITS, 2);
     }
 
-    CLOSE_READER(bs, &ctx->gb);
+    CLOSE_READER(bs, &row->gb);
+    return ret;
 }
 
-static void dnxhd_decode_dct_block_8(DNXHDContext *ctx, int16_t *block,
-                                     int n, int qscale)
+static int dnxhd_decode_dct_block_8(const DNXHDContext *ctx,
+                                    RowContext *row, int n)
 {
-    dnxhd_decode_dct_block(ctx, block, n, qscale, 4, 32, 6);
+    return dnxhd_decode_dct_block(ctx, row, n, 4, 32, 6);
 }
 
-static void dnxhd_decode_dct_block_10(DNXHDContext *ctx, int16_t *block,
-                                      int n, int qscale)
+static int dnxhd_decode_dct_block_10(const DNXHDContext *ctx,
+                                     RowContext *row, int n)
 {
-    dnxhd_decode_dct_block(ctx, block, n, qscale, 6, 8, 4);
+    return dnxhd_decode_dct_block(ctx, row, n, 6, 8, 4);
 }
 
-static void dnxhd_decode_dct_block_10_444(DNXHDContext *ctx, int16_t *block,
-                                          int n, int qscale)
+static int dnxhd_decode_dct_block_10_444(const DNXHDContext *ctx,
+                                         RowContext *row, int n)
 {
-    dnxhd_decode_dct_block(ctx, block, n, qscale, 6, 32, 6);
+    return dnxhd_decode_dct_block(ctx, row, n, 6, 32, 6);
 }
 
-static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame,
-                                   int x, int y)
+static int dnxhd_decode_macroblock(const DNXHDContext *ctx, RowContext *row,
+                                   AVFrame *frame, int x, int y)
 {
     int shift1 = ctx->bit_depth == 10;
     int dct_linesize_luma   = frame->linesize[0];
     int dct_linesize_chroma = frame->linesize[1];
     uint8_t *dest_y, *dest_u, *dest_v;
     int dct_y_offset, dct_x_offset;
-    int qscale, i;
+    int qscale, i, act;
     int interlaced_mb = 0;
 
     if (ctx->mbaff) {
-        interlaced_mb = get_bits1(&ctx->gb);
-        qscale = get_bits(&ctx->gb, 10);
+        interlaced_mb = get_bits1(&row->gb);
+        qscale = get_bits(&row->gb, 10);
     } else {
-        qscale = get_bits(&ctx->gb, 11);
+        qscale = get_bits(&row->gb, 11);
     }
-    skip_bits1(&ctx->gb);
-
-    for (i = 0; i < 8; i++) {
-        ctx->bdsp.clear_block(ctx->blocks[i]);
-        ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale);
+    act = get_bits1(&row->gb);
+    if (act) {
+        static int warned = 0;
+        if (!warned) {
+            warned = 1;
+            av_log(ctx->avctx, AV_LOG_ERROR,
+                   "Unsupported adaptive color transform, patch welcome.\n");
+        }
     }
-    if (ctx->is_444) {
-        for (; i < 12; i++) {
-            ctx->bdsp.clear_block(ctx->blocks[i]);
-            ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale);
+
+    if (qscale != row->last_qscale) {
+        for (i = 0; i < 64; i++) {
+            row->luma_scale[i]   = qscale * ctx->cid_table->luma_weight[i];
+            row->chroma_scale[i] = qscale * ctx->cid_table->chroma_weight[i];
         }
+        row->last_qscale = qscale;
+    }
+
+    for (i = 0; i < 8 + 4 * ctx->is_444; i++) {
+        if (ctx->decode_dct_block(ctx, row, i) < 0)
+            return AVERROR_INVALIDDATA;
     }
 
     if (frame->interlaced_frame) {
@@ -357,7 +440,7 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame,
     dest_u = frame->data[1] + ((y * dct_linesize_chroma) << 4) + (x << (3 + shift1 + ctx->is_444));
     dest_v = frame->data[2] + ((y * dct_linesize_chroma) << 4) + (x << (3 + shift1 + ctx->is_444));
 
-    if (ctx->cur_field) {
+    if (frame->interlaced_frame && ctx->cur_field) {
         dest_y += frame->linesize[0];
         dest_u += frame->linesize[1];
         dest_v += frame->linesize[2];
@@ -370,55 +453,62 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame,
     dct_y_offset = interlaced_mb ? frame->linesize[0] : (dct_linesize_luma << 3);
     dct_x_offset = 8 << shift1;
     if (!ctx->is_444) {
-        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, ctx->blocks[0]);
-        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, ctx->blocks[1]);
-        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, ctx->blocks[4]);
-        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[5]);
+        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, row->blocks[0]);
+        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, row->blocks[1]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, row->blocks[4]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, row->blocks[5]);
 
         if (!(ctx->avctx->flags & AV_CODEC_FLAG_GRAY)) {
             dct_y_offset = interlaced_mb ? frame->linesize[1] : (dct_linesize_chroma << 3);
-            ctx->idsp.idct_put(dest_u,                dct_linesize_chroma, ctx->blocks[2]);
-            ctx->idsp.idct_put(dest_v,                dct_linesize_chroma, ctx->blocks[3]);
-            ctx->idsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, ctx->blocks[6]);
-            ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, ctx->blocks[7]);
+            ctx->idsp.idct_put(dest_u,                dct_linesize_chroma, row->blocks[2]);
+            ctx->idsp.idct_put(dest_v,                dct_linesize_chroma, row->blocks[3]);
+            ctx->idsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, row->blocks[6]);
+            ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, row->blocks[7]);
         }
     } else {
-        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, ctx->blocks[0]);
-        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, ctx->blocks[1]);
-        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, ctx->blocks[6]);
-        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[7]);
+        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, row->blocks[0]);
+        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, row->blocks[1]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, row->blocks[6]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, row->blocks[7]);
 
         if (!(ctx->avctx->flags & AV_CODEC_FLAG_GRAY)) {
             dct_y_offset = interlaced_mb ? frame->linesize[1] : (dct_linesize_chroma << 3);
-            ctx->idsp.idct_put(dest_u,                               dct_linesize_chroma, ctx->blocks[2]);
-            ctx->idsp.idct_put(dest_u + dct_x_offset,                dct_linesize_chroma, ctx->blocks[3]);
-            ctx->idsp.idct_put(dest_u + dct_y_offset,                dct_linesize_chroma, ctx->blocks[8]);
-            ctx->idsp.idct_put(dest_u + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[9]);
-            ctx->idsp.idct_put(dest_v,                               dct_linesize_chroma, ctx->blocks[4]);
-            ctx->idsp.idct_put(dest_v + dct_x_offset,                dct_linesize_chroma, ctx->blocks[5]);
-            ctx->idsp.idct_put(dest_v + dct_y_offset,                dct_linesize_chroma, ctx->blocks[10]);
-            ctx->idsp.idct_put(dest_v + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[11]);
+            ctx->idsp.idct_put(dest_u,                               dct_linesize_chroma, row->blocks[2]);
+            ctx->idsp.idct_put(dest_u + dct_x_offset,                dct_linesize_chroma, row->blocks[3]);
+            ctx->idsp.idct_put(dest_u + dct_y_offset,                dct_linesize_chroma, row->blocks[8]);
+            ctx->idsp.idct_put(dest_u + dct_y_offset + dct_x_offset, dct_linesize_chroma, row->blocks[9]);
+            ctx->idsp.idct_put(dest_v,                               dct_linesize_chroma, row->blocks[4]);
+            ctx->idsp.idct_put(dest_v + dct_x_offset,                dct_linesize_chroma, row->blocks[5]);
+            ctx->idsp.idct_put(dest_v + dct_y_offset,                dct_linesize_chroma, row->blocks[10]);
+            ctx->idsp.idct_put(dest_v + dct_y_offset + dct_x_offset, dct_linesize_chroma, row->blocks[11]);
         }
     }
 
     return 0;
 }
 
-static int dnxhd_decode_macroblocks(DNXHDContext *ctx, AVFrame *frame,
-                                    const uint8_t *buf, int buf_size)
+static int dnxhd_decode_row(AVCodecContext *avctx, void *data,
+                            int rownb, int threadnb)
 {
-    int x, y;
-    for (y = 0; y < ctx->mb_height; y++) {
-        ctx->last_dc[0] =
-        ctx->last_dc[1] =
-        ctx->last_dc[2] = 1 << (ctx->bit_depth + 2); // for levels +2^(bitdepth-1)
-        init_get_bits(&ctx->gb, buf + ctx->mb_scan_index[y], (buf_size - ctx->mb_scan_index[y]) << 3);
-        for (x = 0; x < ctx->mb_width; x++) {
-            //START_TIMER;
-            dnxhd_decode_macroblock(ctx, frame, x, y);
-            //STOP_TIMER("decode macroblock");
+    const DNXHDContext *ctx = avctx->priv_data;
+    uint32_t offset = ctx->mb_scan_index[rownb];
+    RowContext *row = ctx->rows + threadnb;
+    int x;
+
+    row->last_dc[0] =
+    row->last_dc[1] =
+    row->last_dc[2] = 1 << (ctx->bit_depth + 2); // for levels +2^(bitdepth-1)
+    init_get_bits(&row->gb, ctx->buf + offset, (ctx->buf_size - offset) << 3);
+    for (x = 0; x < ctx->mb_width; x++) {
+        //START_TIMER;
+        int ret = dnxhd_decode_macroblock(ctx, row, data, x, rownb);
+        if (ret < 0) {
+            row->errors++;
+            return ret;
         }
+        //STOP_TIMER("decode macroblock");
     }
+
     return 0;
 }
 
@@ -428,16 +518,15 @@ static int dnxhd_decode_frame(AVCodecContext *avctx, void *data,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     DNXHDContext *ctx = avctx->priv_data;
-    ThreadFrame tf;
+    ThreadFrame frame = { .f = data };
+    AVFrame *picture = data;
     int first_field = 1;
-    int ret;
-
-    tf.f = data;
+    int ret, i;
 
     ff_dlog(avctx, "frame size %d\n", buf_size);
 
 decode_coding_unit:
-    if ((ret = dnxhd_decode_header(ctx, tf.f, buf, buf_size, first_field)) < 0)
+    if ((ret = dnxhd_decode_header(ctx, picture, buf, buf_size, first_field)) < 0)
         return ret;
 
     if ((avctx->width || avctx->height) &&
@@ -446,31 +535,48 @@ decode_coding_unit:
                avctx->width, avctx->height, ctx->width, ctx->height);
         first_field = 1;
     }
+    if (avctx->pix_fmt != AV_PIX_FMT_NONE && avctx->pix_fmt != ctx->pix_fmt) {
+        av_log(avctx, AV_LOG_WARNING, "pix_fmt changed: %s -> %s\n",
+               av_get_pix_fmt_name(avctx->pix_fmt), av_get_pix_fmt_name(ctx->pix_fmt));
+        first_field = 1;
+    }
 
+    avctx->pix_fmt = ctx->pix_fmt;
     ret = ff_set_dimensions(avctx, ctx->width, ctx->height);
     if (ret < 0)
         return ret;
 
     if (first_field) {
-        if ((ret = ff_thread_get_buffer(avctx, &tf, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
             return ret;
-        }
-        tf.f->pict_type = AV_PICTURE_TYPE_I;
-        tf.f->key_frame = 1;
+        picture->pict_type = AV_PICTURE_TYPE_I;
+        picture->key_frame = 1;
     }
 
-    dnxhd_decode_macroblocks(ctx, tf.f, buf + 0x280, buf_size - 0x280);
+    ctx->buf_size = buf_size - 0x280;
+    ctx->buf = buf + 0x280;
+    avctx->execute2(avctx, dnxhd_decode_row, picture, NULL, ctx->mb_height);
 
-    if (first_field && tf.f->interlaced_frame) {
+    if (first_field && picture->interlaced_frame) {
         buf      += ctx->cid_table->coding_unit_size;
         buf_size -= ctx->cid_table->coding_unit_size;
         first_field = 0;
         goto decode_coding_unit;
     }
 
+    ret = 0;
+    for (i = 0; i < avctx->thread_count; i++) {
+        ret += ctx->rows[i].errors;
+        ctx->rows[i].errors = 0;
+    }
+
+    if (ret) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "%d lines with errors\n", ret);
+        return AVERROR_INVALIDDATA;
+    }
+
     *got_frame = 1;
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int dnxhd_decode_close(AVCodecContext *avctx)
@@ -480,6 +586,9 @@ static av_cold int dnxhd_decode_close(AVCodecContext *avctx)
     ff_free_vlc(&ctx->ac_vlc);
     ff_free_vlc(&ctx->dc_vlc);
     ff_free_vlc(&ctx->run_vlc);
+
+    av_freep(&ctx->rows);
+
     return 0;
 }
 
@@ -492,5 +601,7 @@ AVCodec ff_dnxhd_decoder = {
     .init           = dnxhd_decode_init,
     .close          = dnxhd_decode_close,
     .decode         = dnxhd_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
+                      AV_CODEC_CAP_SLICE_THREADS,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(dnxhd_decode_init_thread_copy),
 };
diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c
index d2fbc167d2..7d96cd4f78 100644
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -6,20 +6,20 @@
  * VC-3 encoder funded by the British Broadcasting Corporation
  * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,6 +36,7 @@
 #include "pixblockdsp.h"
 #include "dnxhdenc.h"
 
+
 // The largest value that will not lead to overflow for 10bit samples.
 #define DNX10BIT_QMAT_SHIFT 18
 #define RC_VARIANCE 1 // use variance or ssd for fast rc
@@ -47,18 +48,18 @@ static const AVOption options[] = {
         offsetof(DNXHDEncContext, nitris_compat), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
     { "ibias", "intra quant bias",
         offsetof(DNXHDEncContext, intra_quant_bias), AV_OPT_TYPE_INT,
-        { .i64 = FF_DEFAULT_QUANT_BIAS }, INT_MIN, INT_MAX, VE },
+        { .i64 = 0 }, INT_MIN, INT_MAX, VE },
     { NULL }
 };
 
-static const AVClass class = {
-    "dnxhd",
-    av_default_item_name,
-    options,
-    LIBAVUTIL_VERSION_INT
+static const AVClass dnxhd_class = {
+    .class_name = "dnxhd",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
-static void dnxhd_8bit_get_pixels_8x4_sym(int16_t *restrict block,
+static void dnxhd_8bit_get_pixels_8x4_sym(int16_t *av_restrict block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size)
 {
@@ -82,25 +83,33 @@ static void dnxhd_8bit_get_pixels_8x4_sym(int16_t *restrict block,
 }
 
 static av_always_inline
-void dnxhd_10bit_get_pixels_8x4_sym(int16_t *restrict block,
+void dnxhd_10bit_get_pixels_8x4_sym(int16_t *av_restrict block,
                                     const uint8_t *pixels,
                                     ptrdiff_t line_size)
 {
     int i;
-
-    block += 32;
+    const uint16_t* pixels16 = (const uint16_t*)pixels;
+    line_size >>= 1;
 
     for (i = 0; i < 4; i++) {
-        memcpy(block + i * 8, pixels + i * line_size, 8 * sizeof(*block));
-        memcpy(block - (i + 1) * 8, pixels + i * line_size, 8 * sizeof(*block));
+        block[0] = pixels16[0]; block[1] = pixels16[1];
+        block[2] = pixels16[2]; block[3] = pixels16[3];
+        block[4] = pixels16[4]; block[5] = pixels16[5];
+        block[6] = pixels16[6]; block[7] = pixels16[7];
+        pixels16 += line_size;
+        block += 8;
     }
+    memcpy(block,      block -  8, sizeof(*block) * 8);
+    memcpy(block +  8, block - 16, sizeof(*block) * 8);
+    memcpy(block + 16, block - 24, sizeof(*block) * 8);
+    memcpy(block + 24, block - 32, sizeof(*block) * 8);
 }
 
 static int dnxhd_10bit_dct_quantize(MpegEncContext *ctx, int16_t *block,
                                     int n, int qscale, int *overflow)
 {
     const uint8_t *scantable= ctx->intra_scantable.scantable;
-    const int *qmat = ctx->q_intra_matrix[qscale];
+    const int *qmat = n<4 ? ctx->q_intra_matrix[qscale] : ctx->q_chroma_intra_matrix[qscale];
     int last_non_zero = 0;
     int i;
 
@@ -127,10 +136,10 @@ static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
     int i, j, level, run;
     int max_level = 1 << (ctx->cid_table->bit_depth + 2);
 
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_codes,
-                      max_level * 4 * sizeof(*ctx->vlc_codes), fail);
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_bits,
-                      max_level * 4 * sizeof(*ctx->vlc_bits), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->vlc_codes,
+                      max_level, 4 * sizeof(*ctx->vlc_codes), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->vlc_bits,
+                      max_level, 4 * sizeof(*ctx->vlc_bits), fail);
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_codes,
                       63 * 2, fail);
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_bits,
@@ -149,10 +158,10 @@ static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
                 alevel -= offset << 6;
             }
             for (j = 0; j < 257; j++) {
-                if (ctx->cid_table->ac_level[j] == alevel &&
-                    (!offset || (ctx->cid_table->ac_index_flag[j] && offset)) &&
-                    (!run    || (ctx->cid_table->ac_run_flag  [j] && run))) {
-                    assert(!ctx->vlc_codes[index]);
+                if (ctx->cid_table->ac_level[j] >> 1 == alevel &&
+                    (!offset || (ctx->cid_table->ac_flags[j] & 1) && offset) &&
+                    (!run    || (ctx->cid_table->ac_flags[j] & 2) && run)) {
+                    av_assert1(!ctx->vlc_codes[index]);
                     if (alevel) {
                         ctx->vlc_codes[index] =
                             (ctx->cid_table->ac_codes[j] << 1) | (sign & 1);
@@ -164,7 +173,7 @@ static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
                     break;
                 }
             }
-            assert(!alevel || j < 257);
+            av_assert0(!alevel || j < 257);
             if (offset) {
                 ctx->vlc_codes[index] =
                     (ctx->vlc_codes[index] << ctx->cid_table->index_bits) | offset;
@@ -174,7 +183,7 @@ static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
     }
     for (i = 0; i < 62; i++) {
         int run = ctx->cid_table->run[i];
-        assert(run < 63);
+        av_assert0(run < 63);
         ctx->run_codes[run] = ctx->cid_table->run_codes[i];
         ctx->run_bits[run]  = ctx->cid_table->run_bits[i];
     }
@@ -191,15 +200,15 @@ static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
     const uint8_t *luma_weight_table   = ctx->cid_table->luma_weight;
     const uint8_t *chroma_weight_table = ctx->cid_table->chroma_weight;
 
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l,
-                      (ctx->m.avctx->qmax + 1) * 64 * sizeof(int), fail);
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c,
-                      (ctx->m.avctx->qmax + 1) * 64 * sizeof(int), fail);
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16,
-                      (ctx->m.avctx->qmax + 1) * 64 * 2 * sizeof(uint16_t),
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l,
+                      (ctx->m.avctx->qmax + 1), 64 * sizeof(int), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c,
+                      (ctx->m.avctx->qmax + 1), 64 * sizeof(int), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16,
+                      (ctx->m.avctx->qmax + 1), 64 * 2 * sizeof(uint16_t),
                       fail);
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16,
-                      (ctx->m.avctx->qmax + 1) * 64 * 2 * sizeof(uint16_t),
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16,
+                      (ctx->m.avctx->qmax + 1), 64 * 2 * sizeof(uint16_t),
                       fail);
 
     if (ctx->cid_table->bit_depth == 8) {
@@ -253,6 +262,11 @@ static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
         }
     }
 
+    ctx->m.q_chroma_intra_matrix16 = ctx->qmatrix_c16;
+    ctx->m.q_chroma_intra_matrix   = ctx->qmatrix_c;
+    ctx->m.q_intra_matrix16        = ctx->qmatrix_l16;
+    ctx->m.q_intra_matrix          = ctx->qmatrix_l;
+
     return 0;
 fail:
     return AVERROR(ENOMEM);
@@ -260,11 +274,10 @@ fail:
 
 static av_cold int dnxhd_init_rc(DNXHDEncContext *ctx)
 {
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_rc,
-                      8160 * ctx->m.avctx->qmax * sizeof(RCEntry), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->mb_rc, (ctx->m.avctx->qmax + 1), 8160 * sizeof(RCEntry), fail);
     if (ctx->m.avctx->mb_decision != FF_MB_DECISION_RD)
-        FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_cmp,
-                          ctx->m.mb_num * sizeof(RCCMPEntry), fail);
+        FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->mb_cmp,
+                          ctx->m.mb_num, sizeof(RCCMPEntry), fail);
 
     ctx->frame_bits = (ctx->cid_table->coding_unit_size -
                        640 - 4 - ctx->min_padding) * 8;
@@ -296,14 +309,15 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
     ctx->cid = ff_dnxhd_find_cid(avctx, bit_depth);
     if (!ctx->cid) {
         av_log(avctx, AV_LOG_ERROR,
-               "video parameters incompatible with DNxHD\n");
+               "video parameters incompatible with DNxHD. Valid DNxHD profiles:\n");
+        ff_dnxhd_print_profiles(avctx, AV_LOG_ERROR);
         return AVERROR(EINVAL);
     }
     av_log(avctx, AV_LOG_DEBUG, "cid %d\n", ctx->cid);
 
     index = ff_dnxhd_get_cid_table(ctx->cid);
-    if (index < 0)
-        return index;
+    av_assert0(index >= 0);
+
     ctx->cid_table = &ff_dnxhd_cid_table[index];
 
     ctx->m.avctx    = avctx;
@@ -317,6 +331,8 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
     ff_mpv_idct_init(&ctx->m);
     ff_mpegvideoencdsp_init(&ctx->m.mpvencdsp, avctx);
     ff_pixblockdsp_init(&ctx->m.pdsp, avctx);
+    ff_dct_encode_init(&ctx->m);
+
     if (!ctx->m.dct_quantize)
         ctx->m.dct_quantize = ff_dct_quantize_c;
 
@@ -383,6 +399,11 @@ FF_ENABLE_DEPRECATION_WARNINGS
         return AVERROR(EINVAL);
     }
 
+    if (avctx->qmax <= 1) {
+        av_log(avctx, AV_LOG_ERROR, "qmax must be at least 2\n");
+        return AVERROR(EINVAL);
+    }
+
     ctx->thread[0] = ctx;
     for (i = 1; i < avctx->thread_count; i++) {
         ctx->thread[i] = av_malloc(sizeof(DNXHDEncContext));
@@ -397,7 +418,7 @@ fail:  // for FF_ALLOCZ_OR_GOTO
 static int dnxhd_write_header(AVCodecContext *avctx, uint8_t *buf)
 {
     DNXHDEncContext *ctx = avctx->priv_data;
-    const uint8_t header_prefix[5] = { 0x00, 0x00, 0x02, 0x80, 0x01 };
+    static const uint8_t header_prefix[5] = { 0x00, 0x00, 0x02, 0x80, 0x01 };
 
     memset(buf, 0, 640);
 
@@ -436,7 +457,7 @@ static av_always_inline void dnxhd_encode_dc(DNXHDEncContext *ctx, int diff)
     }
     put_bits(&ctx->m.pb, ctx->cid_table->dc_bits[nbits] + nbits,
              (ctx->cid_table->dc_codes[nbits] << nbits) +
-             (diff & ((1 << nbits) - 1)));
+             av_mod_uintp2(diff, nbits));
 }
 
 static av_always_inline
@@ -590,15 +611,8 @@ void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y)
 static av_always_inline
 int dnxhd_switch_matrix(DNXHDEncContext *ctx, int i)
 {
-    if (i & 2) {
-        ctx->m.q_intra_matrix16 = ctx->qmatrix_c16;
-        ctx->m.q_intra_matrix   = ctx->qmatrix_c;
-        return 1 + (i & 1);
-    } else {
-        ctx->m.q_intra_matrix16 = ctx->qmatrix_l16;
-        ctx->m.q_intra_matrix   = ctx->qmatrix_l;
-        return 0;
-    }
+    const static uint8_t component[8]={0,0,1,2,0,0,1,2};
+    return component[i];
 }
 
 static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
@@ -629,7 +643,7 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
             int n = dnxhd_switch_matrix(ctx, i);
 
             memcpy(block, src_block, 64 * sizeof(*block));
-            last_index = ctx->m.dct_quantize(&ctx->m, block, i,
+            last_index = ctx->m.dct_quantize(&ctx->m, block, 4 & (2*i),
                                              qscale, &overflow);
             ac_bits   += dnxhd_calc_ac_bits(ctx, block, last_index);
 
@@ -639,7 +653,7 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
             else
                 nbits = av_log2_16bit(2 * diff);
 
-            assert(nbits < ctx->cid_table->bit_depth + 4);
+            av_assert1(nbits < ctx->cid_table->bit_depth + 4);
             dc_bits += ctx->cid_table->dc_bits[nbits] + nbits;
 
             ctx->m.last_dc[n] = block[0];
@@ -681,7 +695,7 @@ static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg,
         for (i = 0; i < 8; i++) {
             int16_t *block = ctx->blocks[i];
             int overflow, n = dnxhd_switch_matrix(ctx, i);
-            int last_index = ctx->m.dct_quantize(&ctx->m, block, i,
+            int last_index = ctx->m.dct_quantize(&ctx->m, block, 4 & (2*i),
                                                  qscale, &overflow);
             // START_TIMER;
             dnxhd_encode_block(ctx, block, last_index, n);
@@ -940,13 +954,13 @@ static void radix_count(const RCCMPEntry *data, int size,
             buckets[j][get_bucket(v, 0)]++;
             v >>= BUCKET_BITS;
         }
-        assert(!v);
+        av_assert1(!v);
     }
     for (j = 0; j < RADIX_PASSES; j++) {
         int offset = size;
         for (i = NBUCKETS - 1; i >= 0; i--)
             buckets[j][i] = offset -= buckets[j][i];
-        assert(!buckets[j][0]);
+        av_assert1(!buckets[j][0]);
     }
 }
 
@@ -965,7 +979,7 @@ static void radix_sort_pass(RCCMPEntry *dst, const RCCMPEntry *data,
 static void radix_sort(RCCMPEntry *data, int size)
 {
     int buckets[RADIX_PASSES][NBUCKETS];
-    RCCMPEntry *tmp = av_malloc(sizeof(*tmp) * size);
+    RCCMPEntry *tmp = av_malloc_array(size, sizeof(*tmp));
     radix_count(data, size, buckets);
     radix_sort_pass(tmp, data, size, buckets[0], 0);
     radix_sort_pass(data, tmp, size, buckets[1], 1);
@@ -1043,13 +1057,10 @@ static int dnxhd_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
     DNXHDEncContext *ctx = avctx->priv_data;
     int first_field = 1;
     int offset, i, ret;
-    uint8_t *buf, *sd;
+    uint8_t *buf;
 
-    if ((ret = ff_alloc_packet(pkt, ctx->cid_table->frame_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR,
-               "output buffer is too small to compress picture\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, ctx->cid_table->frame_size, 0)) < 0)
         return ret;
-    }
     buf = pkt->data;
 
     dnxhd_load_picture(ctx, frame);
@@ -1079,12 +1090,12 @@ encode_coding_unit:
     for (i = 0; i < ctx->m.mb_height; i++) {
         AV_WB32(ctx->msip + i * 4, offset);
         offset += ctx->slice_size[i];
-        assert(!(ctx->slice_size[i] & 3));
+        av_assert1(!(ctx->slice_size[i] & 3));
     }
 
     avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height);
 
-    assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size);
+    av_assert1(640 + offset + 4 <= ctx->cid_table->coding_unit_size);
     memset(buf + 640 + offset, 0,
            ctx->cid_table->coding_unit_size - 4 - offset - 640);
 
@@ -1103,10 +1114,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR, sizeof(int));
-    if (!sd)
-        return AVERROR(ENOMEM);
-    *(int *)sd = ctx->qscale * FF_QP2LAMBDA;
+    ff_side_data_set_encoder_stats(pkt, ctx->qscale * FF_QP2LAMBDA, NULL, 0, AV_PICTURE_TYPE_I);
 
     pkt->flags |= AV_PKT_FLAG_KEY;
     *got_packet = 1;
@@ -1142,6 +1150,11 @@ static av_cold int dnxhd_encode_end(AVCodecContext *avctx)
     return 0;
 }
 
+static const AVCodecDefault dnxhd_defaults[] = {
+    { "qmax", "1024" }, /* Maximum quantization scale factor allowed for VC-3 */
+    { NULL },
+};
+
 AVCodec ff_dnxhd_encoder = {
     .name           = "dnxhd",
     .long_name      = NULL_IF_CONFIG_SMALL("VC3/DNxHD"),
@@ -1157,5 +1170,6 @@ AVCodec ff_dnxhd_encoder = {
         AV_PIX_FMT_YUV422P10,
         AV_PIX_FMT_NONE
     },
-    .priv_class     = &class,
+    .priv_class     = &dnxhd_class,
+    .defaults       = dnxhd_defaults,
 };
diff --git a/libavcodec/dnxhdenc.h b/libavcodec/dnxhdenc.h
index d3df0e0001..3f531efcfb 100644
--- a/libavcodec/dnxhdenc.h
+++ b/libavcodec/dnxhdenc.h
@@ -4,20 +4,20 @@
  *
  * VC-3 encoder funded by the British Broadcasting Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -85,8 +85,6 @@ typedef struct DNXHDEncContext {
     unsigned qscale;
     unsigned lambda;
 
-    unsigned thread_size;
-
     uint16_t *mb_bits;
     uint8_t  *mb_qscale;
 
diff --git a/libavcodec/dpcm.c b/libavcodec/dpcm.c
index b9f5b93ebc..c13945edb6 100644
--- a/libavcodec/dpcm.c
+++ b/libavcodec/dpcm.c
@@ -1,21 +1,21 @@
 /*
  * Assorted DPCM codecs
- * Copyright (c) 2003 The ffmpeg Project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -118,7 +118,7 @@ static av_cold int dpcm_decode_init(AVCodecContext *avctx)
     int i;
 
     if (avctx->channels < 1 || avctx->channels > 2) {
-        av_log(avctx, AV_LOG_INFO, "invalid number of channels\n");
+        av_log(avctx, AV_LOG_ERROR, "invalid number of channels\n");
         return AVERROR(EINVAL);
     }
 
@@ -205,13 +205,14 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
         av_log(avctx, AV_LOG_ERROR, "packet is too small\n");
         return AVERROR(EINVAL);
     }
+    if (out % avctx->channels) {
+        av_log(avctx, AV_LOG_WARNING, "channels have differing number of samples\n");
+    }
 
     /* get output buffer */
-    frame->nb_samples = out / avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    frame->nb_samples = (out + avctx->channels - 1) / avctx->channels;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     output_samples = (int16_t *)frame->data[0];
     samples_end = output_samples + out;
 
diff --git a/libavcodec/dpx.c b/libavcodec/dpx.c
index d4effa4c48..15c939fcad 100644
--- a/libavcodec/dpx.c
+++ b/libavcodec/dpx.c
@@ -2,29 +2,42 @@
  * DPX (.dpx) image decoder
  * Copyright (c) 2009 Jimmy Christensen
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/intreadwrite.h"
+#include "libavutil/intfloat.h"
 #include "libavutil/imgutils.h"
 #include "bytestream.h"
 #include "avcodec.h"
 #include "internal.h"
 
+static unsigned int read16(const uint8_t **ptr, int is_big)
+{
+    unsigned int temp;
+    if (is_big) {
+        temp = AV_RB16(*ptr);
+    } else {
+        temp = AV_RL16(*ptr);
+    }
+    *ptr += 2;
+    return temp;
+}
+
 static unsigned int read32(const uint8_t **ptr, int is_big)
 {
     unsigned int temp;
@@ -37,12 +50,19 @@ static unsigned int read32(const uint8_t **ptr, int is_big)
     return temp;
 }
 
-static inline unsigned make_16bit(unsigned value)
+static uint16_t read10in32(const uint8_t **ptr, uint32_t * lbuf,
+                                  int * n_datum, int is_big)
 {
-    // mask away invalid bits
-    value &= 0xFFC0;
-    // correctly expand to 16 bits
-    return value + (value >> 10);
+    if (*n_datum)
+        (*n_datum)--;
+    else {
+        *lbuf = read32(ptr, is_big);
+        *n_datum = 2;
+    }
+
+    *lbuf = (*lbuf << 10) | (*lbuf >> 22);
+
+    return *lbuf & 0x3FF;
 }
 
 static int decode_frame(AVCodecContext *avctx,
@@ -51,17 +71,18 @@ static int decode_frame(AVCodecContext *avctx,
                         AVPacket *avpkt)
 {
     const uint8_t *buf = avpkt->data;
-    const uint8_t *buf_end = avpkt->data + avpkt->size;
     int buf_size       = avpkt->size;
     AVFrame *const p = data;
-    uint8_t *ptr;
+    uint8_t *ptr[AV_NUM_DATA_POINTERS];
 
     unsigned int offset;
     int magic_num, endian;
-    int x, y, ret;
-    int w, h, stride, bits_per_color, descriptor, elements, target_packet_size, source_packet_size;
+    int x, y, stride, i, ret;
+    int w, h, bits_per_color, descriptor, elements, packing;
+    int encoding, need_align = 0;
 
-    unsigned int rgbBuffer;
+    unsigned int rgbBuffer = 0;
+    int n_datum = 0;
 
     if (avpkt->size <= 1634) {
         av_log(avctx, AV_LOG_ERROR, "Packet too small for DPX header\n");
@@ -87,11 +108,24 @@ static int decode_frame(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Invalid data start offset\n");
         return AVERROR_INVALIDDATA;
     }
+
+    // Check encryption
+    buf = avpkt->data + 660;
+    ret = read32(&buf, endian);
+    if (ret != 0xFFFFFFFF) {
+        avpriv_report_missing_feature(avctx, "Encryption");
+        av_log(avctx, AV_LOG_WARNING, "The image is encrypted and may "
+               "not properly decode.\n");
+    }
+
     // Need to end in 0x304 offset from start of file
     buf = avpkt->data + 0x304;
     w = read32(&buf, endian);
     h = read32(&buf, endian);
 
+    if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
+        return ret;
+
     // Need to end in 0x320 to read the descriptor
     buf += 20;
     descriptor = buf[0];
@@ -100,108 +134,251 @@ static int decode_frame(AVCodecContext *avctx,
     buf += 3;
     avctx->bits_per_raw_sample =
     bits_per_color = buf[0];
+    buf++;
+    packing = read16(&buf, endian);
+    encoding = read16(&buf, endian);
+
+    if (packing > 1) {
+        avpriv_report_missing_feature(avctx, "Packing %d", packing);
+        return AVERROR_PATCHWELCOME;
+    }
+    if (encoding) {
+        avpriv_report_missing_feature(avctx, "Encoding %d", encoding);
+        return AVERROR_PATCHWELCOME;
+    }
 
-    buf += 825;
+    buf += 820;
     avctx->sample_aspect_ratio.num = read32(&buf, endian);
     avctx->sample_aspect_ratio.den = read32(&buf, endian);
+    if (avctx->sample_aspect_ratio.num > 0 && avctx->sample_aspect_ratio.den > 0)
+        av_reduce(&avctx->sample_aspect_ratio.num, &avctx->sample_aspect_ratio.den,
+                   avctx->sample_aspect_ratio.num,  avctx->sample_aspect_ratio.den,
+                  0x10000);
+    else
+        avctx->sample_aspect_ratio = (AVRational){ 0, 1 };
+
+    if (offset >= 1724 + 4) {
+        buf = avpkt->data + 1724;
+        i = read32(&buf, endian);
+        if(i) {
+            AVRational q = av_d2q(av_int2float(i), 4096);
+            if (q.num > 0 && q.den > 0)
+                avctx->framerate = q;
+        }
+    }
 
     switch (descriptor) {
-        case 51: // RGBA
-            elements = 4;
-            break;
-        case 50: // RGB
-            elements = 3;
-            break;
-        default:
-            av_log(avctx, AV_LOG_ERROR, "Unsupported descriptor %d\n", descriptor);
-            return AVERROR_INVALIDDATA;
+    case 6:  // Y
+        elements = 1;
+        break;
+    case 52: // ABGR
+    case 51: // RGBA
+    case 103: // UYVA4444
+        elements = 4;
+        break;
+    case 50: // RGB
+    case 102: // UYV444
+        elements = 3;
+        break;
+    case 100: // UYVY422
+        elements = 2;
+        break;
+    default:
+        avpriv_report_missing_feature(avctx, "Descriptor %d", descriptor);
+        return AVERROR_PATCHWELCOME;
     }
 
     switch (bits_per_color) {
-        case 8:
-            if (elements == 4) {
-                avctx->pix_fmt = AV_PIX_FMT_RGBA;
-            } else {
-                avctx->pix_fmt = AV_PIX_FMT_RGB24;
-            }
-            source_packet_size = elements;
-            target_packet_size = elements;
-            break;
-        case 10:
-            avctx->pix_fmt = AV_PIX_FMT_RGB48;
-            target_packet_size = 6;
-            source_packet_size = 4;
-            break;
-        case 12:
-        case 16:
-            if (endian) {
-                avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
-            } else {
-                avctx->pix_fmt = AV_PIX_FMT_RGB48LE;
-            }
-            target_packet_size = 6;
-            source_packet_size = elements * 2;
-            break;
-        default:
-            av_log(avctx, AV_LOG_ERROR, "Unsupported color depth : %d\n", bits_per_color);
+    case 8:
+        stride = avctx->width * elements;
+        break;
+    case 10:
+        if (!packing) {
+            av_log(avctx, AV_LOG_ERROR, "Packing to 32bit required\n");
+            return -1;
+        }
+        stride = (avctx->width * elements + 2) / 3 * 4;
+        break;
+    case 12:
+        if (!packing) {
+            av_log(avctx, AV_LOG_ERROR, "Packing to 16bit required\n");
+            return -1;
+        }
+        stride = 2 * avctx->width * elements;
+        break;
+    case 16:
+        stride = 2 * avctx->width * elements;
+        break;
+    case 1:
+    case 32:
+    case 64:
+        avpriv_report_missing_feature(avctx, "Depth %d", bits_per_color);
+        return AVERROR_PATCHWELCOME;
+    default:
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Table 3c: Runs will always break at scan line boundaries. Packing
+    // will always break to the next 32-bit word at scan-line boundaries.
+    // Unfortunately, the encoder produced invalid files, so attempt
+    // to detect it
+    need_align = FFALIGN(stride, 4);
+    if (need_align*avctx->height + (int64_t)offset > avpkt->size) {
+        // Alignment seems unappliable, try without
+        if (stride*avctx->height + (int64_t)offset > avpkt->size) {
+            av_log(avctx, AV_LOG_ERROR, "Overread buffer. Invalid header?\n");
             return AVERROR_INVALIDDATA;
+        } else {
+            av_log(avctx, AV_LOG_INFO, "Decoding DPX without scanline "
+                   "alignment.\n");
+            need_align = 0;
+        }
+    } else {
+        need_align -= stride;
+        stride = FFALIGN(stride, 4);
     }
 
-    if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
-        return ret;
+    switch (1000 * descriptor + 10 * bits_per_color + endian) {
+    case 6081:
+    case 6080:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        break;
+    case 50081:
+    case 50080:
+        avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        break;
+    case 52081:
+    case 52080:
+        avctx->pix_fmt = AV_PIX_FMT_ABGR;
+        break;
+    case 51081:
+    case 51080:
+        avctx->pix_fmt = AV_PIX_FMT_RGBA;
+        break;
+    case 50100:
+    case 51100:
+    case 50101:
+    case 51101:
+        avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+        break;
+    case 50120:
+    case 51120:
+    case 50121:
+    case 51121:
+        avctx->pix_fmt = AV_PIX_FMT_GBRP12;
+        break;
+    case 6161:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
+        break;
+    case 6160:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY16LE;
+        break;
+    case 50161:
+        avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
+        break;
+    case 50160:
+        avctx->pix_fmt = AV_PIX_FMT_RGB48LE;
+        break;
+    case 51161:
+        avctx->pix_fmt = AV_PIX_FMT_RGBA64BE;
+        break;
+    case 51160:
+        avctx->pix_fmt = AV_PIX_FMT_RGBA64LE;
+        break;
+    case 100081:
+        avctx->pix_fmt = AV_PIX_FMT_UYVY422;
+        break;
+    case 102081:
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+        break;
+    case 103081:
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported format\n");
+        return AVERROR_PATCHWELCOME;
+    }
 
     ff_set_sar(avctx, avctx->sample_aspect_ratio);
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     // Move pointer to offset from start of file
     buf =  avpkt->data + offset;
 
-    ptr    = p->data[0];
-    stride = p->linesize[0];
+    for (i=0; i<AV_NUM_DATA_POINTERS; i++)
+        ptr[i] = p->data[i];
 
-    if (source_packet_size*avctx->width*avctx->height > buf_end - buf) {
-        av_log(avctx, AV_LOG_ERROR, "Overread buffer. Invalid header?\n");
-        return AVERROR_INVALIDDATA;
-    }
     switch (bits_per_color) {
-        case 10:
-            for (x = 0; x < avctx->height; x++) {
-               uint16_t *dst = (uint16_t*)ptr;
-               for (y = 0; y < avctx->width; y++) {
-                   rgbBuffer = read32(&buf, endian);
-                   // Read out the 10-bit colors and convert to 16-bit
-                   *dst++ = make_16bit(rgbBuffer >> 16);
-                   *dst++ = make_16bit(rgbBuffer >>  6);
-                   *dst++ = make_16bit(rgbBuffer <<  4);
-               }
-               ptr += stride;
+    case 10:
+        for (x = 0; x < avctx->height; x++) {
+            uint16_t *dst[3] = {(uint16_t*)ptr[0],
+                                (uint16_t*)ptr[1],
+                                (uint16_t*)ptr[2]};
+            for (y = 0; y < avctx->width; y++) {
+                *dst[2]++ = read10in32(&buf, &rgbBuffer,
+                                       &n_datum, endian);
+                *dst[0]++ = read10in32(&buf, &rgbBuffer,
+                                       &n_datum, endian);
+                *dst[1]++ = read10in32(&buf, &rgbBuffer,
+                                       &n_datum, endian);
+                // For 10 bit, ignore alpha
+                if (elements == 4)
+                    read10in32(&buf, &rgbBuffer,
+                               &n_datum, endian);
             }
-            break;
-        case 8:
-        case 12: // Treat 12-bit as 16-bit
-        case 16:
-            if (source_packet_size == target_packet_size) {
-                for (x = 0; x < avctx->height; x++) {
-                    memcpy(ptr, buf, target_packet_size*avctx->width);
-                    ptr += stride;
-                    buf += source_packet_size*avctx->width;
-                }
-            } else {
-                for (x = 0; x < avctx->height; x++) {
-                    uint8_t *dst = ptr;
-                    for (y = 0; y < avctx->width; y++) {
-                        memcpy(dst, buf, target_packet_size);
-                        dst += target_packet_size;
-                        buf += source_packet_size;
-                    }
-                    ptr += stride;
+            n_datum = 0;
+            for (i = 0; i < 3; i++)
+                ptr[i] += p->linesize[i];
+        }
+        break;
+    case 12:
+        for (x = 0; x < avctx->height; x++) {
+            uint16_t *dst[3] = {(uint16_t*)ptr[0],
+                                (uint16_t*)ptr[1],
+                                (uint16_t*)ptr[2]};
+            for (y = 0; y < avctx->width; y++) {
+                *dst[2] = read16(&buf, endian) >> 4;
+                dst[2]++;
+                *dst[0] = read16(&buf, endian) >> 4;
+                dst[0]++;
+                *dst[1] = read16(&buf, endian) >> 4;
+                dst[1]++;
+                // For 12 bit, ignore alpha
+                if (elements == 4)
+                    buf += 2;
+                // Jump to next aligned position
+                buf += need_align;
+            }
+            for (i = 0; i < 3; i++)
+                ptr[i] += p->linesize[i];
+        }
+        break;
+    case 16:
+        elements *= 2;
+    case 8:
+        if (   avctx->pix_fmt == AV_PIX_FMT_YUVA444P
+            || avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
+            for (x = 0; x < avctx->height; x++) {
+                ptr[0] = p->data[0] + x * p->linesize[0];
+                ptr[1] = p->data[1] + x * p->linesize[1];
+                ptr[2] = p->data[2] + x * p->linesize[2];
+                ptr[3] = p->data[3] + x * p->linesize[3];
+                for (y = 0; y < avctx->width; y++) {
+                    *ptr[1]++ = *buf++;
+                    *ptr[0]++ = *buf++;
+                    *ptr[2]++ = *buf++;
+                    if (avctx->pix_fmt == AV_PIX_FMT_YUVA444P)
+                        *ptr[3]++ = *buf++;
                 }
             }
-            break;
+        } else {
+        av_image_copy_plane(ptr[0], p->linesize[0],
+                            buf, stride,
+                            elements * avctx->width, avctx->height);
+        }
+        break;
     }
 
     *got_frame = 1;
@@ -211,7 +388,7 @@ static int decode_frame(AVCodecContext *avctx,
 
 AVCodec ff_dpx_decoder = {
     .name           = "dpx",
-    .long_name      = NULL_IF_CONFIG_SMALL("DPX image"),
+    .long_name      = NULL_IF_CONFIG_SMALL("DPX (Digital Picture Exchange) image"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_DPX,
     .decode         = decode_frame,
diff --git a/libavcodec/dpx_parser.c b/libavcodec/dpx_parser.c
index e3a7ac5c35..8e4a01e09d 100644
--- a/libavcodec/dpx_parser.c
+++ b/libavcodec/dpx_parser.c
@@ -2,20 +2,20 @@
  * DPX parser
  * Copyright (c) 2013 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dpxenc.c b/libavcodec/dpxenc.c
index 17db94a414..9f3cbbe977 100644
--- a/libavcodec/dpxenc.c
+++ b/libavcodec/dpxenc.c
@@ -2,20 +2,20 @@
  * DPX (.dpx) image encoder
  * Copyright (c) 2011 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,35 +28,44 @@
 typedef struct DPXContext {
     int big_endian;
     int bits_per_component;
+    int num_components;
     int descriptor;
+    int planar;
 } DPXContext;
 
 static av_cold int encode_init(AVCodecContext *avctx)
 {
     DPXContext *s = avctx->priv_data;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
-    s->big_endian         = 1;
-    s->bits_per_component = 8;
-    s->descriptor         = 50; /* RGB */
+    s->big_endian         = !!(desc->flags & AV_PIX_FMT_FLAG_BE);
+    s->bits_per_component = desc->comp[0].depth;
+    s->num_components     = desc->nb_components;
+    s->descriptor         = (desc->flags & AV_PIX_FMT_FLAG_ALPHA) ? 51 : 50;
+    s->planar             = !!(desc->flags & AV_PIX_FMT_FLAG_PLANAR);
 
     switch (avctx->pix_fmt) {
-    case AV_PIX_FMT_RGB24:
+    case AV_PIX_FMT_ABGR:
+        s->descriptor = 52;
+        break;
+    case AV_PIX_FMT_GRAY16BE:
+    case AV_PIX_FMT_GRAY16LE:
+    case AV_PIX_FMT_GRAY8:
+        s->descriptor = 6;
         break;
+    case AV_PIX_FMT_GBRP10BE:
+    case AV_PIX_FMT_GBRP10LE:
+    case AV_PIX_FMT_GBRP12BE:
+    case AV_PIX_FMT_GBRP12LE:
+    case AV_PIX_FMT_RGB24:
+    case AV_PIX_FMT_RGBA64BE:
+    case AV_PIX_FMT_RGBA64LE:
     case AV_PIX_FMT_RGBA:
-        s->descriptor = 51; /* RGBA */
         break;
     case AV_PIX_FMT_RGB48LE:
-        s->big_endian = 0;
-        /* fall-through */
     case AV_PIX_FMT_RGB48BE:
-        s->bits_per_component = avctx->bits_per_raw_sample ? avctx->bits_per_raw_sample : 16;
+        if (avctx->bits_per_raw_sample)
+            s->bits_per_component = avctx->bits_per_raw_sample;
         break;
     default:
         av_log(avctx, AV_LOG_INFO, "unsupported pixel format\n");
@@ -66,20 +75,22 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
-#define write16(p, value) \
-do { \
-    if (s->big_endian) AV_WB16(p, value); \
-    else               AV_WL16(p, value); \
-} while(0)
+static av_always_inline void write16_internal(int big_endian, void *p, int value)
+{
+    if (big_endian) AV_WB16(p, value);
+    else            AV_WL16(p, value);
+}
+
+static av_always_inline void write32_internal(int big_endian, void *p, int value)
+{
+    if (big_endian) AV_WB32(p, value);
+    else            AV_WL32(p, value);
+}
 
-#define write32(p, value) \
-do { \
-    if (s->big_endian) AV_WB32(p, value); \
-    else               AV_WL32(p, value); \
-} while(0)
+#define write16(p, value) write16_internal(s->big_endian, p, value)
+#define write32(p, value) write32_internal(s->big_endian, p, value)
 
-static void encode_rgb48_10bit(AVCodecContext *avctx, const AVPicture *pic,
-                               uint8_t *dst)
+static void encode_rgb48_10bit(AVCodecContext *avctx, const AVPicture *pic, uint8_t *dst)
 {
     DPXContext *s = avctx->priv_data;
     const uint8_t *src = pic->data[0];
@@ -88,14 +99,14 @@ static void encode_rgb48_10bit(AVCodecContext *avctx, const AVPicture *pic,
     for (y = 0; y < avctx->height; y++) {
         for (x = 0; x < avctx->width; x++) {
             int value;
-            if ((avctx->pix_fmt & 1)) {
-                value = ((AV_RB16(src + 6*x + 4) & 0xFFC0) >> 4)
-                      | ((AV_RB16(src + 6*x + 2) & 0xFFC0) << 6)
-                      | ((AV_RB16(src + 6*x + 0) & 0xFFC0) << 16);
+            if (s->big_endian) {
+                value = ((AV_RB16(src + 6*x + 4) & 0xFFC0U) >> 4)
+                      | ((AV_RB16(src + 6*x + 2) & 0xFFC0U) << 6)
+                      | ((AV_RB16(src + 6*x + 0) & 0xFFC0U) << 16);
             } else {
-                value = ((AV_RL16(src + 6*x + 4) & 0xFFC0) >> 4)
-                      | ((AV_RL16(src + 6*x + 2) & 0xFFC0) << 6)
-                      | ((AV_RL16(src + 6*x + 0) & 0xFFC0) << 16);
+                value = ((AV_RL16(src + 6*x + 4) & 0xFFC0U) >> 4)
+                      | ((AV_RL16(src + 6*x + 2) & 0xFFC0U) << 6)
+                      | ((AV_RL16(src + 6*x + 0) & 0xFFC0U) << 16);
             }
             write32(dst, value);
             dst += 4;
@@ -104,22 +115,88 @@ static void encode_rgb48_10bit(AVCodecContext *avctx, const AVPicture *pic,
     }
 }
 
+static void encode_gbrp10(AVCodecContext *avctx, const AVPicture *pic, uint8_t *dst)
+{
+    DPXContext *s = avctx->priv_data;
+    const uint8_t *src[3] = {pic->data[0], pic->data[1], pic->data[2]};
+    int x, y, i;
+
+    for (y = 0; y < avctx->height; y++) {
+        for (x = 0; x < avctx->width; x++) {
+            int value;
+            if (s->big_endian) {
+                value = (AV_RB16(src[0] + 2*x) << 12)
+                      | (AV_RB16(src[1] + 2*x) << 2)
+                      | ((unsigned)AV_RB16(src[2] + 2*x) << 22);
+            } else {
+                value = (AV_RL16(src[0] + 2*x) << 12)
+                      | (AV_RL16(src[1] + 2*x) << 2)
+                      | ((unsigned)AV_RL16(src[2] + 2*x) << 22);
+            }
+            write32(dst, value);
+            dst += 4;
+        }
+        for (i = 0; i < 3; i++)
+            src[i] += pic->linesize[i];
+    }
+}
+
+static void encode_gbrp12(AVCodecContext *avctx, const AVPicture *pic, uint16_t *dst)
+{
+    DPXContext *s = avctx->priv_data;
+    const uint16_t *src[3] = {(uint16_t*)pic->data[0],
+                              (uint16_t*)pic->data[1],
+                              (uint16_t*)pic->data[2]};
+    int x, y, i, pad;
+    pad = avctx->width*6;
+    pad = (FFALIGN(pad, 4) - pad) >> 1;
+    for (y = 0; y < avctx->height; y++) {
+        for (x = 0; x < avctx->width; x++) {
+            uint16_t value[3];
+            if (s->big_endian) {
+                value[1] = AV_RB16(src[0] + x) << 4;
+                value[2] = AV_RB16(src[1] + x) << 4;
+                value[0] = AV_RB16(src[2] + x) << 4;
+            } else {
+                value[1] = AV_RL16(src[0] + x) << 4;
+                value[2] = AV_RL16(src[1] + x) << 4;
+                value[0] = AV_RL16(src[2] + x) << 4;
+            }
+            for (i = 0; i < 3; i++)
+                write16(dst++, value[i]);
+        }
+        for (i = 0; i < pad; i++)
+            *dst++ = 0;
+        for (i = 0; i < 3; i++)
+            src[i] += pic->linesize[i]/2;
+    }
+}
+
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *frame, int *got_packet)
 {
     DPXContext *s = avctx->priv_data;
-    int size, ret;
+    int size, ret, need_align, len;
     uint8_t *buf;
 
 #define HEADER_SIZE 1664  /* DPX Generic header */
     if (s->bits_per_component == 10)
         size = avctx->height * avctx->width * 4;
-    else
-        size = avpicture_get_size(avctx->pix_fmt, avctx->width, avctx->height);
-    if ((ret = ff_alloc_packet(pkt, size + HEADER_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
-        return ret;
+    else if (s->bits_per_component == 12) {
+        // 3 components, 12 bits put on 16 bits
+        len  = avctx->width*6;
+        size = FFALIGN(len, 4);
+        need_align = size - len;
+        size *= avctx->height;
+    } else {
+        // N components, M bits
+        len = avctx->width * s->num_components * s->bits_per_component >> 3;
+        size = FFALIGN(len, 4);
+        need_align = size - len;
+        size *= avctx->height;
     }
+    if ((ret = ff_alloc_packet2(avctx, pkt, size + HEADER_SIZE, 0)) < 0)
+        return ret;
     buf = pkt->data;
 
     memset(buf, 0, HEADER_SIZE);
@@ -143,24 +220,44 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     buf[801] = 2; /* linear transfer */
     buf[802] = 2; /* linear colorimetric */
     buf[803] = s->bits_per_component;
-    write16(buf + 804, s->bits_per_component == 10 ? 1 : 0); /* packing method */
+    write16(buf + 804, (s->bits_per_component == 10 || s->bits_per_component == 12) ?
+                       1 : 0); /* packing method */
     write32(buf + 808, HEADER_SIZE); /* data offset */
 
     /* Image source information header */
     write32(buf + 1628, avctx->sample_aspect_ratio.num);
     write32(buf + 1632, avctx->sample_aspect_ratio.den);
 
-    switch (s->bits_per_component) {
+    switch(s->bits_per_component) {
     case 8:
     case 16:
-        size = avpicture_layout((const AVPicture*)frame, avctx->pix_fmt,
-                                avctx->width, avctx->height,
-                                buf + HEADER_SIZE, pkt->size - HEADER_SIZE);
+        if (need_align) {
+            int j;
+            const uint8_t *src = frame->data[0];
+            uint8_t *dst = pkt->data + HEADER_SIZE;
+            size = (len + need_align) * avctx->height;
+            for (j=0; j<avctx->height; j++) {
+                memcpy(dst, src, len);
+                memset(dst + len, 0, need_align);
+                dst += len + need_align;
+                src += frame->linesize[0];
+            }
+        } else {
+            size = avpicture_layout((const AVPicture*)frame, avctx->pix_fmt,
+                                    avctx->width, avctx->height,
+                                    buf + HEADER_SIZE, pkt->size - HEADER_SIZE);
+        }
         if (size < 0)
             return size;
         break;
     case 10:
-        encode_rgb48_10bit(avctx, (const AVPicture*)frame, buf + HEADER_SIZE);
+        if (s->planar)
+            encode_gbrp10(avctx, (const AVPicture*)frame, buf + HEADER_SIZE);
+        else
+            encode_rgb48_10bit(avctx, (const AVPicture*)frame, buf + HEADER_SIZE);
+        break;
+    case 12:
+        encode_gbrp12(avctx, (const AVPicture*)frame, (uint16_t*)(buf + HEADER_SIZE));
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Unsupported bit depth: %d\n", s->bits_per_component);
@@ -178,17 +275,20 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 }
 
 AVCodec ff_dpx_encoder = {
-    .name = "dpx",
-    .long_name = NULL_IF_CONFIG_SMALL("DPX image"),
-    .type = AVMEDIA_TYPE_VIDEO,
-    .id   = AV_CODEC_ID_DPX,
+    .name           = "dpx",
+    .long_name      = NULL_IF_CONFIG_SMALL("DPX (Digital Picture Exchange) image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_DPX,
     .priv_data_size = sizeof(DPXContext),
-    .init   = encode_init,
-    .encode2 = encode_frame,
-    .pix_fmts = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_RGB24,
-        AV_PIX_FMT_RGBA,
-        AV_PIX_FMT_RGB48LE,
-        AV_PIX_FMT_RGB48BE,
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]){
+        AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_RGB24,    AV_PIX_FMT_RGBA, AV_PIX_FMT_ABGR,
+        AV_PIX_FMT_GRAY16LE, AV_PIX_FMT_GRAY16BE,
+        AV_PIX_FMT_RGB48LE,  AV_PIX_FMT_RGB48BE,
+        AV_PIX_FMT_RGBA64LE, AV_PIX_FMT_RGBA64BE,
+        AV_PIX_FMT_GBRP10LE, AV_PIX_FMT_GBRP10BE,
+        AV_PIX_FMT_GBRP12LE, AV_PIX_FMT_GBRP12BE,
         AV_PIX_FMT_NONE},
 };
diff --git a/libavcodec/dsd_tablegen.c b/libavcodec/dsd_tablegen.c
new file mode 100644
index 0000000000..dbeb9fe205
--- /dev/null
+++ b/libavcodec/dsd_tablegen.c
@@ -0,0 +1,38 @@
+/*
+ * Generate a header file for hardcoded DSD tables
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#define CONFIG_HARDCODED_TABLES 0
+#include "dsd_tablegen.h"
+#include "tableprint.h"
+#include <inttypes.h>
+
+int main(void)
+{
+    dsd_ctables_tableinit();
+
+    write_fileheader();
+
+    printf("static const double ctables[CTABLES][256] = {\n");
+    write_float_2d_array(ctables, CTABLES, 256);
+    printf("};\n");
+
+    return 0;
+}
diff --git a/libavcodec/dsd_tablegen.h b/libavcodec/dsd_tablegen.h
new file mode 100644
index 0000000000..6afb4167da
--- /dev/null
+++ b/libavcodec/dsd_tablegen.h
@@ -0,0 +1,95 @@
+/*
+ * Header file for hardcoded DSD tables
+ * based on BSD licensed dsd2pcm by Sebastian Gesemann
+ * Copyright (c) 2009, 2011 Sebastian Gesemann. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DSD_TABLEGEN_H
+#define AVCODEC_DSD_TABLEGEN_H
+
+#include <stdint.h>
+#include "libavutil/attributes.h"
+
+#define HTAPS   48                /** number of FIR constants */
+#define CTABLES ((HTAPS + 7) / 8) /** number of "8 MACs" lookup tables */
+
+#if CONFIG_HARDCODED_TABLES
+#define dsd_ctables_tableinit()
+#include "libavcodec/dsd_tables.h"
+#else
+#include "libavutil/common.h"
+
+/*
+ * Properties of this 96-tap lowpass filter when applied on a signal
+ * with sampling rate of 44100*64 Hz:
+ *
+ * () has a delay of 17 microseconds.
+ *
+ * () flat response up to 48 kHz
+ *
+ * () if you downsample afterwards by a factor of 8, the
+ *    spectrum below 70 kHz is practically alias-free.
+ *
+ * () stopband rejection is about 160 dB
+ *
+ * The coefficient tables ("ctables") take only 6 Kibi Bytes and
+ * should fit into a modern processor's fast cache.
+ */
+
+/**
+ * The 2nd half (48 coeffs) of a 96-tap symmetric lowpass filter
+ */
+static const double htaps[HTAPS] = {
+     0.09950731974056658,    0.09562845727714668,    0.08819647126516944,
+     0.07782552527068175,    0.06534876523171299,    0.05172629311427257,
+     0.0379429484910187,     0.02490921351762261,    0.0133774746265897,
+     0.003883043418804416,  -0.003284703416210726,  -0.008080250212687497,
+    -0.01067241812471033,   -0.01139427235000863,   -0.0106813877974587,
+    -0.009007905078766049,  -0.006828859761015335,  -0.004535184322001496,
+    -0.002425035959059578,  -0.0006922187080790708,  0.0005700762133516592,
+     0.001353838005269448,   0.001713709169690937,   0.001742046839472948,
+     0.001545601648013235,   0.001226696225277855,   0.0008704322683580222,
+     0.0005381636200535649,  0.000266446345425276,   7.002968738383528e-05,
+    -5.279407053811266e-05, -0.0001140625650874684, -0.0001304796361231895,
+    -0.0001189970287491285, -9.396247155265073e-05, -6.577634378272832e-05,
+    -4.07492895872535e-05,  -2.17407957554587e-05,  -9.163058931391722e-06,
+    -2.017460145032201e-06,  1.249721855219005e-06,  2.166655190537392e-06,
+     1.930520892991082e-06,  1.319400334374195e-06,  7.410039764949091e-07,
+     3.423230509967409e-07,  1.244182214744588e-07,  3.130441005359396e-08
+};
+
+static float ctables[CTABLES][256];
+
+static av_cold void dsd_ctables_tableinit(void)
+{
+    int t, e, m, k;
+    double acc;
+    for (t = 0; t < CTABLES; ++t) {
+        k = FFMIN(HTAPS - t * 8, 8);
+        for (e = 0; e < 256; ++e) {
+            acc = 0.0;
+            for (m = 0; m < k; ++m)
+                acc += (((e >> (7 - m)) & 1) * 2 - 1) * htaps[t * 8 + m];
+            ctables[CTABLES - 1 - t][e] = (float)acc;
+        }
+    }
+}
+#endif /* CONFIG_HARDCODED_TABLES */
+
+#endif /* AVCODEC_DSD_TABLEGEN_H */
diff --git a/libavcodec/dsddec.c b/libavcodec/dsddec.c
new file mode 100644
index 0000000000..f1dfd4b231
--- /dev/null
+++ b/libavcodec/dsddec.c
@@ -0,0 +1,167 @@
+/*
+ * Direct Stream Digital (DSD) decoder
+ * based on BSD licensed dsd2pcm by Sebastian Gesemann
+ * Copyright (c) 2009, 2011 Sebastian Gesemann. All rights reserved.
+ * Copyright (c) 2014 Peter Ross
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Direct Stream Digital (DSD) decoder
+ */
+
+#include "libavcodec/internal.h"
+#include "libavcodec/mathops.h"
+#include "avcodec.h"
+#include "dsd_tablegen.h"
+
+#define FIFOSIZE 16              /** must be a power of two */
+#define FIFOMASK (FIFOSIZE - 1)  /** bit mask for FIFO offsets */
+
+#if FIFOSIZE * 8 < HTAPS * 2
+#error "FIFOSIZE too small"
+#endif
+
+/**
+ * Per-channel buffer
+ */
+typedef struct {
+    unsigned char buf[FIFOSIZE];
+    unsigned pos;
+} DSDContext;
+
+static void dsd2pcm_translate(DSDContext* s, size_t samples, int lsbf,
+                              const unsigned char *src, ptrdiff_t src_stride,
+                              float *dst, ptrdiff_t dst_stride)
+{
+    unsigned pos, i;
+    unsigned char* p;
+    double sum;
+
+    pos = s->pos;
+
+    while (samples-- > 0) {
+        s->buf[pos] = lsbf ? ff_reverse[*src] : *src;
+        src += src_stride;
+
+        p = s->buf + ((pos - CTABLES) & FIFOMASK);
+        *p = ff_reverse[*p];
+
+        sum = 0.0;
+        for (i = 0; i < CTABLES; i++) {
+            unsigned char a = s->buf[(pos                   - i) & FIFOMASK];
+            unsigned char b = s->buf[(pos - (CTABLES*2 - 1) + i) & FIFOMASK];
+            sum += ctables[i][a] + ctables[i][b];
+        }
+
+        *dst = (float)sum;
+        dst += dst_stride;
+
+        pos = (pos + 1) & FIFOMASK;
+    }
+
+    s->pos = pos;
+}
+
+static av_cold void init_static_data(void)
+{
+    static int done = 0;
+    if (done)
+        return;
+    dsd_ctables_tableinit();
+    done = 1;
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    DSDContext * s;
+    int i;
+
+    init_static_data();
+
+    s = av_malloc_array(sizeof(DSDContext), avctx->channels);
+    if (!s)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < avctx->channels; i++) {
+        s[i].pos = 0;
+        memset(s[i].buf, 0x69, sizeof(s[i].buf));
+
+        /* 0x69 = 01101001
+         * This pattern "on repeat" makes a low energy 352.8 kHz tone
+         * and a high energy 1.0584 MHz tone which should be filtered
+         * out completely by any playback system --> silence
+         */
+    }
+
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+    avctx->priv_data  = s;
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame_ptr, AVPacket *avpkt)
+{
+    DSDContext * s = avctx->priv_data;
+    AVFrame *frame = data;
+    int ret, i;
+    int lsbf = avctx->codec_id == AV_CODEC_ID_DSD_LSBF || avctx->codec_id == AV_CODEC_ID_DSD_LSBF_PLANAR;
+    int src_next;
+    int src_stride;
+
+    frame->nb_samples = avpkt->size / avctx->channels;
+
+    if (avctx->codec_id == AV_CODEC_ID_DSD_LSBF_PLANAR || avctx->codec_id == AV_CODEC_ID_DSD_MSBF_PLANAR) {
+        src_next   = frame->nb_samples;
+        src_stride = 1;
+    } else {
+        src_next   = 1;
+        src_stride = avctx->channels;
+    }
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    for (i = 0; i < avctx->channels; i++) {
+        float * dst = ((float **)frame->extended_data)[i];
+        dsd2pcm_translate(&s[i], frame->nb_samples, lsbf,
+            avpkt->data + i * src_next, src_stride,
+            dst, 1);
+    }
+
+    *got_frame_ptr = 1;
+    return frame->nb_samples * avctx->channels;
+}
+
+#define DSD_DECODER(id_, name_, long_name_) \
+AVCodec ff_##name_##_decoder = { \
+    .name         = #name_, \
+    .long_name    = NULL_IF_CONFIG_SMALL(long_name_), \
+    .type         = AVMEDIA_TYPE_AUDIO, \
+    .id           = AV_CODEC_ID_##id_, \
+    .init         = decode_init, \
+    .decode       = decode_frame, \
+    .sample_fmts  = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_FLTP, \
+                                                   AV_SAMPLE_FMT_NONE }, \
+};
+
+DSD_DECODER(DSD_LSBF, dsd_lsbf, "DSD (Direct Stream Digital), least significant bit first")
+DSD_DECODER(DSD_MSBF, dsd_msbf, "DSD (Direct Stream Digital), most significant bit first")
+DSD_DECODER(DSD_MSBF_PLANAR, dsd_msbf_planar, "DSD (Direct Stream Digital), most significant bit first, planar")
+DSD_DECODER(DSD_LSBF_PLANAR, dsd_lsbf_planar, "DSD (Direct Stream Digital), least significant bit first, planar")
diff --git a/libavcodec/dsicinaudio.c b/libavcodec/dsicinaudio.c
index e0fecbeb03..290dab41a5 100644
--- a/libavcodec/dsicinaudio.c
+++ b/libavcodec/dsicinaudio.c
@@ -2,20 +2,20 @@
  * Delphine Software International CIN audio decoder
  * Copyright (c) 2006 Gregory Montoir (cyx@users.sourceforge.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -98,10 +98,8 @@ static int cinaudio_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = avpkt->size - cin->initial_decode_frame;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
     delta = cin->delta;
diff --git a/libavcodec/dsicinvideo.c b/libavcodec/dsicinvideo.c
index 7c62dcf15b..f95cbc74a0 100644
--- a/libavcodec/dsicinvideo.c
+++ b/libavcodec/dsicinvideo.c
@@ -2,20 +2,20 @@
  * Delphine Software International CIN video decoder
  * Copyright (c) 2006 Gregory Montoir (cyx@users.sourceforge.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,10 +42,33 @@ typedef struct CinVideoContext {
     uint8_t *bitmap_table[3];
 } CinVideoContext;
 
+static av_cold void destroy_buffers(CinVideoContext *cin)
+{
+    int i;
+
+    for (i = 0; i < 3; ++i)
+        av_freep(&cin->bitmap_table[i]);
+}
+
+static av_cold int allocate_buffers(CinVideoContext *cin)
+{
+    int i;
+
+    for (i = 0; i < 3; ++i) {
+        cin->bitmap_table[i] = av_mallocz(cin->bitmap_size);
+        if (!cin->bitmap_table[i]) {
+            av_log(cin->avctx, AV_LOG_ERROR, "Can't allocate bitmap buffers.\n");
+            destroy_buffers(cin);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
 static av_cold int cinvideo_decode_init(AVCodecContext *avctx)
 {
     CinVideoContext *cin = avctx->priv_data;
-    unsigned int i;
 
     cin->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
@@ -55,11 +78,8 @@ static av_cold int cinvideo_decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     cin->bitmap_size = avctx->width * avctx->height;
-    for (i = 0; i < 3; ++i) {
-        cin->bitmap_table[i] = av_mallocz(cin->bitmap_size);
-        if (!cin->bitmap_table[i])
-            av_log(avctx, AV_LOG_ERROR, "Can't allocate bitmap buffers.\n");
-    }
+    if (allocate_buffers(cin))
+        return AVERROR(ENOMEM);
 
     return 0;
 }
@@ -141,27 +161,30 @@ static int cin_decode_lzss(const unsigned char *src, int src_size,
     return 0;
 }
 
-static void cin_decode_rle(const unsigned char *src, int src_size,
+static int cin_decode_rle(const unsigned char *src, int src_size,
                            unsigned char *dst, int dst_size)
 {
     int len, code;
     unsigned char *dst_end       = dst + dst_size;
     const unsigned char *src_end = src + src_size;
 
-    while (src < src_end && dst < dst_end) {
+    while (src + 1 < src_end && dst < dst_end) {
         code = *src++;
         if (code & 0x80) {
-            if (src >= src_end)
-                break;
             len = code - 0x7F;
             memset(dst, *src++, FFMIN(len, dst_end - dst));
         } else {
             len = code + 1;
+            if (len > src_end-src) {
+                av_log(NULL, AV_LOG_ERROR, "RLE overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             memcpy(dst, src, FFMIN3(len, dst_end - dst, src_end - src));
             src += len;
         }
         dst += len;
     }
+    return 0;
 }
 
 static int cinvideo_decode_frame(AVCodecContext *avctx,
@@ -188,19 +211,17 @@ static int cinvideo_decode_frame(AVCodecContext *avctx,
         if (palette_colors_count > 256)
             return AVERROR_INVALIDDATA;
         for (i = 0; i < palette_colors_count; ++i) {
-            cin->palette[i]    = bytestream_get_le24(&buf);
+            cin->palette[i]    = 0xFFU << 24 | bytestream_get_le24(&buf);
             bitmap_frame_size -= 3;
         }
     } else {
         for (i = 0; i < palette_colors_count; ++i) {
-            cin->palette[buf[0]] = AV_RL24(buf + 1);
+            cin->palette[buf[0]] = 0xFFU << 24 | AV_RL24(buf + 1);
             buf                 += 4;
             bitmap_frame_size   -= 4;
         }
     }
 
-    bitmap_frame_size = FFMIN(cin->bitmap_size, bitmap_frame_size);
-
     /* note: the decoding routines below assumes that
      * surface.width = surface.pitch */
     switch (bitmap_frame_type) {
@@ -215,7 +236,7 @@ static int cinvideo_decode_frame(AVCodecContext *avctx,
                              cin->bitmap_table[CIN_CUR_BMP], cin->bitmap_size);
         break;
     case 35:
-        cin_decode_huffman(buf, bitmap_frame_size,
+        bitmap_frame_size = cin_decode_huffman(buf, bitmap_frame_size,
                            cin->bitmap_table[CIN_INT_BMP], cin->bitmap_size);
         cin_decode_rle(cin->bitmap_table[CIN_INT_BMP], bitmap_frame_size,
                        cin->bitmap_table[CIN_CUR_BMP], cin->bitmap_size);
@@ -251,11 +272,8 @@ static int cinvideo_decode_frame(AVCodecContext *avctx,
         break;
     }
 
-    if ((res = ff_reget_buffer(avctx, cin->frame)) < 0) {
-        av_log(cin->avctx, AV_LOG_ERROR,
-               "delphinecinvideo: reget_buffer() failed to allocate a frame\n");
+    if ((res = ff_reget_buffer(avctx, cin->frame)) < 0)
         return res;
-    }
 
     memcpy(cin->frame->data[1], cin->palette, sizeof(cin->palette));
     cin->frame->palette_has_changed = 1;
@@ -278,12 +296,10 @@ static int cinvideo_decode_frame(AVCodecContext *avctx,
 static av_cold int cinvideo_decode_end(AVCodecContext *avctx)
 {
     CinVideoContext *cin = avctx->priv_data;
-    int i;
 
     av_frame_free(&cin->frame);
 
-    for (i = 0; i < 3; ++i)
-        av_free(cin->bitmap_table[i]);
+    destroy_buffers(cin);
 
     return 0;
 }
diff --git a/libavcodec/dss_sp.c b/libavcodec/dss_sp.c
index 20b05287a7..7cf84899fc 100644
--- a/libavcodec/dss_sp.c
+++ b/libavcodec/dss_sp.c
@@ -2,20 +2,20 @@
  * Digital Speech Standard - Standard Play mode (DSS SP) audio decoder.
  * Copyright (C) 2014 Oleksij Rempel <linux@rempel-privat.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@
 
 #define DSS_SP_FRAME_SIZE        42
 #define DSS_SP_SAMPLE_COUNT     (66 * SUBFRAMES)
-#define DSS_SP_FORMULA(a, b, c) ((((a) << 15) + (b) * (c)) + 0x4000) >> 15
+#define DSS_SP_FORMULA(a, b, c) (((((a) << 15) + (b) * (c)) + 0x4000) >> 15)
 
 typedef struct DssSpSubframe {
     int16_t gain;
@@ -50,6 +50,7 @@ typedef struct DssSpFrame {
 } DssSpFrame;
 
 typedef struct DssSpContext {
+    AVCodecContext *avctx;
     int32_t excitation[288 + 6];
     int32_t history[187];
     DssSpFrame fparam;
@@ -296,6 +297,7 @@ static av_cold int dss_sp_decode_init(AVCodecContext *avctx)
 
     memset(p->history, 0, sizeof(p->history));
     p->pulse_dec_mode = 1;
+    p->avctx          = avctx;
 
     return 0;
 }
@@ -378,7 +380,7 @@ static void dss_sp_unpack_coeffs(DssSpContext *p, const uint8_t *src)
                 if (C72_binomials[index] <= combined_pulse_pos) {
                     combined_pulse_pos -= C72_binomials[index];
 
-                    fparam->sf[subframe_idx].pulse_pos[(index ^ 7) - 1] = i;
+                    fparam->sf[subframe_idx].pulse_pos[6 - index] = i;
 
                     if (!index)
                         break;
@@ -400,10 +402,15 @@ static void dss_sp_unpack_coeffs(DssSpContext *p, const uint8_t *src)
 
     combined_pitch /= 151;
 
-    for (i = 1; i < SUBFRAMES; i++) {
+    for (i = 1; i < SUBFRAMES - 1; i++) {
         fparam->pitch_lag[i] = combined_pitch % 48;
         combined_pitch      /= 48;
     }
+    if (combined_pitch > 47) {
+        av_log (p->avctx, AV_LOG_WARNING, "combined_pitch was too large\n");
+        combined_pitch = 0;
+    }
+    fparam->pitch_lag[i] = combined_pitch;
 
     pitch_lag = fparam->pitch_lag[0];
     for (i = 1; i < SUBFRAMES; i++) {
diff --git a/libavcodec/dump_extradata_bsf.c b/libavcodec/dump_extradata_bsf.c
index 3f54899794..08c422704d 100644
--- a/libavcodec/dump_extradata_bsf.c
+++ b/libavcodec/dump_extradata_bsf.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,7 +49,6 @@ static int dump_extradata(AVBitStreamFilterContext *bsfc, AVCodecContext *avctx,
 }
 
 AVBitStreamFilter ff_dump_extradata_bsf={
-    "dump_extra",
-    0,
-    dump_extradata,
+    .name   = "dump_extra",
+    .filter = dump_extradata,
 };
diff --git a/libavcodec/dv.c b/libavcodec/dv.c
index 81d28d1fb3..31d1315f38 100644
--- a/libavcodec/dv.c
+++ b/libavcodec/dv.c
@@ -16,20 +16,20 @@
  * Many thanks to Dan Dennedy <dan@dennedy.org> for providing wealth
  * of DV technical info.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,7 @@
 #include "simple_idct.h"
 
 /* XXX: also include quantization */
-RL_VLC_ELEM ff_dv_rl_vlc[1184];
+RL_VLC_ELEM ff_dv_rl_vlc[1664];
 
 static inline void dv_calc_mb_coordinates(const AVDVProfile *d, int chan,
                                           int seq, int slot, uint16_t *tbl)
@@ -173,20 +173,9 @@ static inline void dv_calc_mb_coordinates(const AVDVProfile *d, int chan,
     }
 }
 
-/* quantization quanta by QNO for DV100 */
-static const uint8_t dv100_qstep[16] = {
-    1, /* QNO = 0 and 1 both have no quantization */
-    1,
-    2, 3, 4, 5, 6, 7, 8, 16, 18, 20, 22, 24, 28, 52
-};
-
-static const uint8_t dv_quant_areas[4] = { 6, 21, 43, 64 };
-
 int ff_dv_init_dynamic_tables(DVVideoContext *ctx, const AVDVProfile *d)
 {
     int j, i, c, s, p;
-    uint32_t *factor1, *factor2;
-    const int *iweight1, *iweight2;
 
     p = i = 0;
     for (c = 0; c < d->n_difchan; c++) {
@@ -204,38 +193,6 @@ int ff_dv_init_dynamic_tables(DVVideoContext *ctx, const AVDVProfile *d)
         }
     }
 
-    factor1 = &ctx->idct_factor[0];
-    factor2 = &ctx->idct_factor[DV_PROFILE_IS_HD(d) ? 4096 : 2816];
-    if (d->height == 720) {
-        iweight1 = &ff_dv_iweight_720_y[0];
-        iweight2 = &ff_dv_iweight_720_c[0];
-    } else {
-        iweight1 = &ff_dv_iweight_1080_y[0];
-        iweight2 = &ff_dv_iweight_1080_c[0];
-    }
-    if (DV_PROFILE_IS_HD(d)) {
-        for (c = 0; c < 4; c++) {
-            for (s = 0; s < 16; s++) {
-                for (i = 0; i < 64; i++) {
-                    *factor1++ = (dv100_qstep[s] << (c + 9)) * iweight1[i];
-                    *factor2++ = (dv100_qstep[s] << (c + 9)) * iweight2[i];
-                }
-            }
-        }
-    } else {
-        iweight1 = &ff_dv_iweight_88[0];
-        for (j = 0; j < 2; j++, iweight1 = &ff_dv_iweight_248[0]) {
-            for (s = 0; s < 22; s++) {
-                for (i = c = 0; c < 4; c++) {
-                    for (; i < dv_quant_areas[c]; i++) {
-                        *factor1   = iweight1[i] << (ff_dv_quant_shifts[s][c] + 1);
-                        *factor2++ = (*factor1++) << 1;
-                    }
-                }
-            }
-        }
-    }
-
     return 0;
 }
 
@@ -277,7 +234,7 @@ av_cold int ff_dvvideo_init(AVCodecContext *avctx)
          * to accelerate the parsing of partial codes */
         init_vlc(&dv_vlc, TEX_VLC_BITS, j, new_dv_vlc_len,
                  1, 1, new_dv_vlc_bits, 2, 2, 0);
-        assert(dv_vlc.table_size == 1184);
+        av_assert1(dv_vlc.table_size == 1664);
 
         for (i = 0; i < dv_vlc.table_size; i++) {
             int code = dv_vlc.table[i][0];
@@ -303,3 +260,4 @@ av_cold int ff_dvvideo_init(AVCodecContext *avctx)
 
     return 0;
 }
+
diff --git a/libavcodec/dv.h b/libavcodec/dv.h
index b458aeab29..5d282633dd 100644
--- a/libavcodec/dv.h
+++ b/libavcodec/dv.h
@@ -2,20 +2,20 @@
  * Constants for DV codec
  * Copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,7 +45,7 @@ typedef struct DVVideoContext {
 
     uint8_t dv_zigzag[2][64];
 
-    void (*get_pixels)(int16_t *block, const uint8_t *pixels, int line_size);
+    void (*get_pixels)(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size);
     void (*fdct[2])(int16_t *block);
     void (*idct_put[2])(uint8_t *dest, int line_size, int16_t *block);
     me_cmp_func ildct_cmp;
@@ -80,10 +80,6 @@ enum dv_pack_type {
 #define DV_PROFILE_IS_1080i50(p) (((p)->video_stype == 0x14) && ((p)->dsf == 1))
 #define DV_PROFILE_IS_720p50(p)  (((p)->video_stype == 0x18) && ((p)->dsf == 1))
 
-/* minimum number of bytes to read from a DV stream in order to
- * determine the profile */
-#define DV_PROFILE_BYTES (6 * 80) /* 6 DIF blocks */
-
 /**
  * largest possible DV frame, in bytes (1080i50)
  */
@@ -94,11 +90,12 @@ enum dv_pack_type {
  */
 #define DV_MAX_BPM 8
 
-#define TEX_VLC_BITS 9
+#define TEX_VLC_BITS 10
 
-extern RL_VLC_ELEM ff_dv_rl_vlc[1184];
+extern RL_VLC_ELEM ff_dv_rl_vlc[1664];
 
 int ff_dv_init_dynamic_tables(DVVideoContext *s, const AVDVProfile *d);
+
 int ff_dvvideo_init(AVCodecContext *avctx);
 
 static inline int dv_work_pool_size(const AVDVProfile *d)
diff --git a/libavcodec/dv_profile.c b/libavcodec/dv_profile.c
index 74c529d727..66505c886b 100644
--- a/libavcodec/dv_profile.c
+++ b/libavcodec/dv_profile.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,7 @@
 #include <stdint.h>
 
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/log.h"
 #include "libavutil/pixdesc.h"
 
@@ -256,22 +257,30 @@ void ff_dv_print_profiles(void *logctx, int loglevel)
 
 #endif /* CONFIG_DVPROFILE */
 
-const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
+const AVDVProfile* ff_dv_frame_profile(AVCodecContext* codec, const AVDVProfile *sys,
                                        const uint8_t *frame, unsigned buf_size)
 {
 #if CONFIG_DVPROFILE
     int i, dsf, stype;
 
-    if (buf_size < 80 * 5 + 48 + 4)
+    if(buf_size < DV_PROFILE_BYTES)
         return NULL;
 
     dsf   = (frame[3] & 0x80) >> 7;
     stype = frame[80 * 5 + 48 + 3] & 0x1f;
 
     /* 576i50 25Mbps 4:1:1 is a special case */
-    if (dsf == 1 && stype == 0 && frame[4] & 0x07 /* the APT field */)
+    if ((dsf == 1 && stype == 0 && frame[4] & 0x07 /* the APT field */) ||
+        (stype == 31 && codec && codec->codec_tag==AV_RL32("SL25") && codec->coded_width==720 && codec->coded_height==576))
         return &dv_profiles[2];
 
+    if(   stype == 0
+       && codec
+       && (codec->codec_tag==AV_RL32("dvsd") || codec->codec_tag==AV_RL32("CDVC"))
+       && codec->coded_width ==720
+       && codec->coded_height==576)
+        return &dv_profiles[1];
+
     for (i = 0; i < FF_ARRAY_ELEMS(dv_profiles); i++)
         if (dsf == dv_profiles[i].dsf && stype == dv_profiles[i].video_stype)
             return &dv_profiles[i];
@@ -279,23 +288,54 @@ const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
     /* check if old sys matches and assumes corrupted input */
     if (sys && buf_size == sys->frame_size)
         return sys;
+
+    /* hack for trac issue #217, dv files created with QuickTime 3 */
+    if ((frame[3] & 0x7f) == 0x3f && frame[80 * 5 + 48 + 3] == 0xff)
+        return &dv_profiles[dsf];
 #endif
 
     return NULL;
 }
 
+const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
+                                       const uint8_t *frame, unsigned buf_size)
+{
+    return ff_dv_frame_profile(NULL, sys, frame, buf_size);
+}
+
 const AVDVProfile *av_dv_codec_profile(int width, int height,
                                        enum AVPixelFormat pix_fmt)
 {
 #if CONFIG_DVPROFILE
+    return av_dv_codec_profile2(width, height, pix_fmt, (AVRational){0, 0});
+#endif
+
+    return NULL;
+}
+
+const AVDVProfile *av_dv_codec_profile2(int width, int height,
+                                       enum AVPixelFormat pix_fmt,
+                                       AVRational frame_rate)
+{
+    const AVDVProfile *p = NULL;
+#if CONFIG_DVPROFILE
     int i;
+    /* frame rate is necessary to select between 720p50 and 720p60 profiles */
+    int invalid_framerate = frame_rate.num == 0 || frame_rate.den == 0;
 
     for (i = 0; i < FF_ARRAY_ELEMS(dv_profiles); i++)
         if (height  == dv_profiles[i].height  &&
             pix_fmt == dv_profiles[i].pix_fmt &&
             width   == dv_profiles[i].width)
-            return &dv_profiles[i];
+        {
+            if( invalid_framerate || av_div_q(dv_profiles[i].time_base, frame_rate).num == 1 )
+                return &dv_profiles[i];
+
+            if(!p)
+                p = &dv_profiles[i];
+        }
 #endif
 
-    return NULL;
+    return p;
 }
+
diff --git a/libavcodec/dv_profile.h b/libavcodec/dv_profile.h
index 5ad7b4f649..9380a66f07 100644
--- a/libavcodec/dv_profile.h
+++ b/libavcodec/dv_profile.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,11 @@
 #include "libavutil/rational.h"
 #include "avcodec.h"
 
+/* minimum number of bytes to read from a DV stream in order to
+ * determine the profile */
+#define DV_PROFILE_BYTES (6 * 80) /* 6 DIF blocks */
+
+
 /*
  * AVDVProfile is used to express the differences between various
  * DV flavors. For now it's primarily used for differentiating
@@ -69,4 +74,10 @@ const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
  */
 const AVDVProfile *av_dv_codec_profile(int width, int height, enum AVPixelFormat pix_fmt);
 
+/**
+ * Get a DV profile for the provided stream parameters.
+ * The frame rate is used as a best-effort parameter.
+ */
+const AVDVProfile *av_dv_codec_profile2(int width, int height, enum AVPixelFormat pix_fmt, AVRational frame_rate);
+
 #endif /* AVCODEC_DV_PROFILE_H */
diff --git a/libavcodec/dv_profile_internal.h b/libavcodec/dv_profile_internal.h
index f93e7cad17..67d3a2b79a 100644
--- a/libavcodec/dv_profile_internal.h
+++ b/libavcodec/dv_profile_internal.h
@@ -1,27 +1,35 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_DV_PROFILE_INTERNAL_H
 #define AVCODEC_DV_PROFILE_INTERNAL_H
 
+#include "dv_profile.h"
+
 /**
  *  Print all allowed DV profiles into logctx at specified logging level.
  */
 void ff_dv_print_profiles(void *logctx, int loglevel);
 
+/**
+ * Get a DV profile for the provided compressed frame.
+ */
+const AVDVProfile* ff_dv_frame_profile(AVCodecContext* codec, const AVDVProfile *sys,
+                                       const uint8_t *frame, unsigned buf_size);
+
 #endif /* AVCODEC_DV_PROFILE_INTERNAL_H */
diff --git a/libavcodec/dv_tablegen.c b/libavcodec/dv_tablegen.c
index 9b2b9546d5..257934101a 100644
--- a/libavcodec/dv_tablegen.c
+++ b/libavcodec/dv_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dv_tablegen.h b/libavcodec/dv_tablegen.h
index b69721b591..941b5572be 100644
--- a/libavcodec/dv_tablegen.h
+++ b/libavcodec/dv_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 #define AVCODEC_DV_TABLEGEN_H
 
 #include <stdint.h>
+#include "libavutil/attributes.h"
 
 #include "dvdata.h"
 
@@ -47,7 +48,7 @@ typedef struct dv_vlc_pair {
 #else
 static struct dv_vlc_pair dv_vlc_map[DV_VLC_MAP_RUN_SIZE][DV_VLC_MAP_LEV_SIZE];
 
-static void dv_vlc_map_tableinit(void)
+static av_cold void dv_vlc_map_tableinit(void)
 {
     int i, j;
     for (i = 0; i < NB_DV_VLC - 1; i++) {
diff --git a/libavcodec/dvbsub.c b/libavcodec/dvbsub.c
index 720e78669a..dd84a07664 100644
--- a/libavcodec/dvbsub.c
+++ b/libavcodec/dvbsub.c
@@ -2,20 +2,20 @@
  * DVB subtitle encoding
  * Copyright (c) 2005 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
@@ -23,7 +23,6 @@
 #include "libavutil/colorspace.h"
 
 typedef struct DVBSubtitleContext {
-    int hide_state;
     int object_version;
 } DVBSubtitleContext;
 
@@ -194,6 +193,60 @@ static void dvb_encode_rle4(uint8_t **pq,
     *pq = q;
 }
 
+static void dvb_encode_rle8(uint8_t **pq,
+                            const uint8_t *bitmap, int linesize,
+                            int w, int h)
+{
+    uint8_t *q;
+    int x, y, len, x1, color;
+
+    q = *pq;
+
+    for (y = 0; y < h; y++) {
+        *q++ = 0x12;
+
+        x = 0;
+        while (x < w) {
+            x1 = x;
+            color = bitmap[x1++];
+            while (x1 < w && bitmap[x1] == color)
+                x1++;
+            len = x1 - x;
+            if (len == 1 && color) {
+                // 00000001 to 11111111           1 pixel in colour x
+                *q++ = color;
+            } else {
+                if (color == 0x00) {
+                    // 00000000 0LLLLLLL          L pixels (1-127) in colour 0 (L > 0)
+                    len = FFMIN(len, 127);
+                    *q++ = 0x00;
+                    *q++ = len;
+                } else if (len > 2) {
+                    // 00000000 1LLLLLLL CCCCCCCC L pixels (3-127) in colour C (L > 2)
+                    len = FFMIN(len, 127);
+                    *q++ = 0x00;
+                    *q++ = 0x80+len;
+                    *q++ = color;
+                }
+                else if (len == 2) {
+                    *q++ = color;
+                    *q++ = color;
+                } else {
+                    *q++ = color;
+                    len = 1;
+                }
+            }
+            x += len;
+        }
+        /* end of line */
+        // 00000000 00000000 end of 8-bit/pixel_code_string
+        *q++ = 0x00;
+        *q++ = 0x00;
+        bitmap += linesize;
+    }
+    *pq = q;
+}
+
 static int encode_dvb_subtitles(DVBSubtitleContext *s,
                                 uint8_t *outbuf, const AVSubtitle *h)
 {
@@ -205,11 +258,9 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
 
     page_id = 1;
 
-    if (h->num_rects == 0 || !h->rects)
+    if (h->num_rects && !h->rects)
         return -1;
 
-    *q++ = 0x00; /* subtitle_stream_id */
-
     /* page composition segment */
 
     *q++ = 0x0f; /* sync_byte */
@@ -218,10 +269,7 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
     pseg_len = q;
     q += 2; /* segment length */
     *q++ = 30; /* page_timeout (seconds) */
-    if (s->hide_state)
-        page_state = 0; /* normal case */
-    else
-        page_state = 2; /* mode change */
+    page_state = 2; /* mode change */
     /* page_version = 0 + page_state */
     *q++ = (s->object_version << 4) | (page_state << 2) | 3;
 
@@ -234,7 +282,7 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
 
     bytestream_put_be16(&pseg_len, q - pseg_len - 2);
 
-    if (!s->hide_state) {
+    if (h->num_rects) {
         for (clut_id = 0; clut_id < h->num_rects; clut_id++) {
 
             /* CLUT segment */
@@ -245,10 +293,15 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
             } else if (h->rects[clut_id]->nb_colors <= 16) {
                 /* 4 bpp, standard encoding */
                 bpp_index = 1;
+            } else if (h->rects[clut_id]->nb_colors <= 256) {
+                /* 8 bpp, standard encoding */
+                bpp_index = 2;
             } else {
                 return -1;
             }
 
+
+            /* CLUT segment */
             *q++ = 0x0f; /* sync byte */
             *q++ = 0x12; /* CLUT definition segment */
             bytestream_put_be16(&q, page_id);
@@ -307,32 +360,37 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
         *q++ = 0; /* 8 bit fill colors */
         *q++ = 0x03; /* 4 bit and 2 bit fill colors */
 
-        if (!s->hide_state) {
-            bytestream_put_be16(&q, region_id); /* object_id == region_id */
-            *q++ = (0 << 6) | (0 << 4);
-            *q++ = 0;
-            *q++ = 0xf0;
-            *q++ = 0;
-        }
+        bytestream_put_be16(&q, region_id); /* object_id == region_id */
+        *q++ = (0 << 6) | (0 << 4);
+        *q++ = 0;
+        *q++ = 0xf0;
+        *q++ = 0;
 
         bytestream_put_be16(&pseg_len, q - pseg_len - 2);
     }
 
-    if (!s->hide_state) {
+    if (h->num_rects) {
 
         for (object_id = 0; object_id < h->num_rects; object_id++) {
-            /* Object Data segment */
+            void (*dvb_encode_rle)(uint8_t **pq,
+                                    const uint8_t *bitmap, int linesize,
+                                    int w, int h);
 
+            /* bpp_index maths */
             if (h->rects[object_id]->nb_colors <= 4) {
                 /* 2 bpp, some decoders do not support it correctly */
-                bpp_index = 0;
+                dvb_encode_rle = dvb_encode_rle2;
             } else if (h->rects[object_id]->nb_colors <= 16) {
                 /* 4 bpp, standard encoding */
-                bpp_index = 1;
+                dvb_encode_rle = dvb_encode_rle4;
+            } else if (h->rects[object_id]->nb_colors <= 256) {
+                /* 8 bpp, standard encoding */
+                dvb_encode_rle = dvb_encode_rle8;
             } else {
                 return -1;
             }
 
+            /* Object Data segment */
             *q++ = 0x0f; /* sync byte */
             *q++ = 0x13;
             bytestream_put_be16(&q, page_id);
@@ -345,19 +403,12 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
                                                                        non_modifying_color_flag */
             {
                 uint8_t *ptop_field_len, *pbottom_field_len, *top_ptr, *bottom_ptr;
-                void (*dvb_encode_rle)(uint8_t **pq,
-                                        const uint8_t *bitmap, int linesize,
-                                        int w, int h);
+
                 ptop_field_len = q;
                 q += 2;
                 pbottom_field_len = q;
                 q += 2;
 
-                if (bpp_index == 0)
-                    dvb_encode_rle = dvb_encode_rle2;
-                else
-                    dvb_encode_rle = dvb_encode_rle4;
-
                 top_ptr = q;
                 dvb_encode_rle(&q, h->rects[object_id]->pict.data[0], h->rects[object_id]->w * 2,
                                     h->rects[object_id]->w, h->rects[object_id]->h >> 1);
@@ -384,10 +435,7 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
 
     bytestream_put_be16(&pseg_len, q - pseg_len - 2);
 
-    *q++ = 0xff; /* end of PES data */
-
     s->object_version = (s->object_version + 1) & 0xf;
-    s->hide_state = !s->hide_state;
     return q - outbuf;
 }
 
diff --git a/libavcodec/dvbsub_parser.c b/libavcodec/dvbsub_parser.c
index 2e7d8c2aee..af467f7b64 100644
--- a/libavcodec/dvbsub_parser.c
+++ b/libavcodec/dvbsub_parser.c
@@ -1,21 +1,21 @@
 /*
- * DVB subtitle parser for Libav
+ * DVB subtitle parser for FFmpeg
  * Copyright (c) 2005 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
@@ -123,11 +123,11 @@ static int dvbsub_parse(AVCodecParserContext *s,
     {
         if (*p == 0x0f)
         {
-            if (p + 6 <= p_end)
+            if (6 <= p_end - p)
             {
                 len = AV_RB16(p + 4);
 
-                if (p + len + 6 <= p_end)
+                if (len + 6 <= p_end - p)
                 {
                     *poutbuf_size += len + 6;
 
@@ -137,7 +137,7 @@ static int dvbsub_parse(AVCodecParserContext *s,
             } else
                 break;
         } else if (*p == 0xff) {
-            if (p + 1 < p_end)
+            if (1 < p_end - p)
             {
                 ff_dlog(avctx, "Junk at end of packet\n");
             }
diff --git a/libavcodec/dvbsubdec.c b/libavcodec/dvbsubdec.c
index d2b95bc53d..ab6054e99a 100644
--- a/libavcodec/dvbsubdec.c
+++ b/libavcodec/dvbsubdec.c
@@ -2,20 +2,20 @@
  * DVB subtitle decoding
  * Copyright (c) 2005 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 #include "bytestream.h"
 #include "internal.h"
 #include "libavutil/colorspace.h"
+#include "libavutil/opt.h"
 
 #define DVBSUB_PAGE_SEGMENT     0x10
 #define DVBSUB_REGION_SEGMENT   0x11
@@ -153,6 +154,7 @@ static void png_save2(const char *filename, uint32_t *bitmap, int w, int h)
 
 typedef struct DVBSubCLUT {
     int id;
+    int version;
 
     uint32_t clut4[4];
     uint32_t clut16[16];
@@ -179,6 +181,7 @@ typedef struct DVBSubObjectDisplay {
 
 typedef struct DVBSubObject {
     int id;
+    int version;
 
     int type;
 
@@ -198,6 +201,7 @@ typedef struct DVBSubRegionDisplay {
 
 typedef struct DVBSubRegion {
     int id;
+    int version;
 
     int width;
     int height;
@@ -208,6 +212,7 @@ typedef struct DVBSubRegion {
 
     uint8_t *pbuf;
     int buf_size;
+    int dirty;
 
     DVBSubObjectDisplay *display_list;
 
@@ -224,15 +229,21 @@ typedef struct DVBSubDisplayDefinition {
 } DVBSubDisplayDefinition;
 
 typedef struct DVBSubContext {
+    AVClass *class;
     int composition_id;
     int ancillary_id;
 
+    int version;
     int time_out;
+    int compute_edt; /**< if 1 end display time calculated using pts
+                          if 0 (Default) calculated using time out */
+    int compute_clut;
+    int substream;
+    int64_t prev_start;
     DVBSubRegion *region_list;
     DVBSubCLUT   *clut_list;
     DVBSubObject *object_list;
 
-    int display_list_size;
     DVBSubRegionDisplay *display_list;
     DVBSubDisplayDefinition *display_definition;
 } DVBSubContext;
@@ -298,53 +309,59 @@ static void delete_region_display_list(DVBSubContext *ctx, DVBSubRegion *region)
                     obj2 = *obj2_ptr;
 
                     while (obj2 != object) {
-                        assert(obj2);
+                        av_assert0(obj2);
                         obj2_ptr = &obj2->next;
                         obj2 = *obj2_ptr;
                     }
 
                     *obj2_ptr = obj2->next;
 
-                    av_free(obj2);
+                    av_freep(&obj2);
                 }
             }
         }
 
         region->display_list = display->region_list_next;
 
-        av_free(display);
+        av_freep(&display);
     }
 
 }
 
-static void delete_state(DVBSubContext *ctx)
+static void delete_cluts(DVBSubContext *ctx)
 {
-    DVBSubRegion *region;
-    DVBSubCLUT *clut;
-
-    while (ctx->region_list) {
-        region = ctx->region_list;
+    while (ctx->clut_list) {
+        DVBSubCLUT *clut = ctx->clut_list;
 
-        ctx->region_list = region->next;
+        ctx->clut_list = clut->next;
 
-        delete_region_display_list(ctx, region);
-        av_free(region->pbuf);
-        av_free(region);
+        av_freep(&clut);
     }
+}
 
-    while (ctx->clut_list) {
-        clut = ctx->clut_list;
+static void delete_objects(DVBSubContext *ctx)
+{
+    while (ctx->object_list) {
+        DVBSubObject *object = ctx->object_list;
 
-        ctx->clut_list = clut->next;
+        ctx->object_list = object->next;
 
-        av_free(clut);
+        av_freep(&object);
     }
+}
 
-    av_freep(&ctx->display_definition);
+static void delete_regions(DVBSubContext *ctx)
+{
+    while (ctx->region_list) {
+        DVBSubRegion *region = ctx->region_list;
 
-    /* Should already be null */
-    if (ctx->object_list)
-        av_log(0, AV_LOG_ERROR, "Memory deallocation error!\n");
+        ctx->region_list = region->next;
+
+        delete_region_display_list(ctx, region);
+
+        av_freep(&region->pbuf);
+        av_freep(&region);
+    }
 }
 
 static av_cold int dvbsub_init_decoder(AVCodecContext *avctx)
@@ -352,15 +369,27 @@ static av_cold int dvbsub_init_decoder(AVCodecContext *avctx)
     int i, r, g, b, a = 0;
     DVBSubContext *ctx = avctx->priv_data;
 
-    if (!avctx->extradata || avctx->extradata_size != 4) {
-        av_log(avctx, AV_LOG_WARNING, "Invalid extradata, subtitle streams may be combined!\n");
+    if (ctx->substream < 0) {
+        ctx->composition_id = -1;
+        ctx->ancillary_id   = -1;
+    } else if (!avctx->extradata || (avctx->extradata_size < 4) || ((avctx->extradata_size % 5 != 0) && (avctx->extradata_size != 4))) {
+        av_log(avctx, AV_LOG_WARNING, "Invalid DVB subtitles stream extradata!\n");
         ctx->composition_id = -1;
         ctx->ancillary_id   = -1;
     } else {
-        ctx->composition_id = AV_RB16(avctx->extradata);
-        ctx->ancillary_id   = AV_RB16(avctx->extradata + 2);
+        if (avctx->extradata_size > 5*ctx->substream + 2) {
+            ctx->composition_id = AV_RB16(avctx->extradata + 5*ctx->substream);
+            ctx->ancillary_id   = AV_RB16(avctx->extradata + 5*ctx->substream + 2);
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "Selected DVB subtitles sub-stream %d is not available\n", ctx->substream);
+            ctx->composition_id = AV_RB16(avctx->extradata);
+            ctx->ancillary_id   = AV_RB16(avctx->extradata + 2);
+        }
     }
 
+    ctx->version = -1;
+    ctx->prev_start = AV_NOPTS_VALUE;
+
     default_clut.id = -1;
     default_clut.next = NULL;
 
@@ -429,30 +458,39 @@ static av_cold int dvbsub_close_decoder(AVCodecContext *avctx)
     DVBSubContext *ctx = avctx->priv_data;
     DVBSubRegionDisplay *display;
 
-    delete_state(ctx);
+    delete_regions(ctx);
+
+    delete_objects(ctx);
+
+    delete_cluts(ctx);
+
+    av_freep(&ctx->display_definition);
 
     while (ctx->display_list) {
         display = ctx->display_list;
         ctx->display_list = display->next;
 
-        av_free(display);
+        av_freep(&display);
     }
 
     return 0;
 }
 
-static int dvbsub_read_2bit_string(uint8_t *destbuf, int dbuf_len,
+static int dvbsub_read_2bit_string(AVCodecContext *avctx,
+                                   uint8_t *destbuf, int dbuf_len,
                                    const uint8_t **srcbuf, int buf_size,
-                                   int non_mod, uint8_t *map_table)
+                                   int non_mod, uint8_t *map_table, int x_pos)
 {
     GetBitContext gb;
 
     int bits;
     int run_length;
-    int pixels_read = 0;
+    int pixels_read = x_pos;
 
     init_get_bits(&gb, *srcbuf, buf_size << 3);
 
+    destbuf += x_pos;
+
     while (get_bits_count(&gb) < buf_size << 3 && pixels_read < dbuf_len) {
         bits = get_bits(&gb, 2);
 
@@ -513,14 +551,14 @@ static int dvbsub_read_2bit_string(uint8_t *destbuf, int dbuf_len,
                             }
                         }
                     } else if (bits == 1) {
-                        pixels_read += 2;
                         if (map_table)
                             bits = map_table[0];
                         else
                             bits = 0;
-                        if (pixels_read <= dbuf_len) {
-                            *destbuf++ = bits;
+                        run_length = 2;
+                        while (run_length-- > 0 && pixels_read < dbuf_len) {
                             *destbuf++ = bits;
+                            pixels_read++;
                         }
                     } else {
                         (*srcbuf) += (get_bits_count(&gb) + 7) >> 3;
@@ -539,25 +577,27 @@ static int dvbsub_read_2bit_string(uint8_t *destbuf, int dbuf_len,
     }
 
     if (get_bits(&gb, 6))
-        av_log(0, AV_LOG_ERROR, "DVBSub error: line overflow\n");
+        av_log(avctx, AV_LOG_ERROR, "line overflow\n");
 
     (*srcbuf) += (get_bits_count(&gb) + 7) >> 3;
 
     return pixels_read;
 }
 
-static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
+static int dvbsub_read_4bit_string(AVCodecContext *avctx, uint8_t *destbuf, int dbuf_len,
                                    const uint8_t **srcbuf, int buf_size,
-                                   int non_mod, uint8_t *map_table)
+                                   int non_mod, uint8_t *map_table, int x_pos)
 {
     GetBitContext gb;
 
     int bits;
     int run_length;
-    int pixels_read = 0;
+    int pixels_read = x_pos;
 
     init_get_bits(&gb, *srcbuf, buf_size << 3);
 
+    destbuf += x_pos;
+
     while (get_bits_count(&gb) < buf_size << 3 && pixels_read < dbuf_len) {
         bits = get_bits(&gb, 4);
 
@@ -637,14 +677,14 @@ static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
                             }
                         }
                     } else if (bits == 1) {
-                        pixels_read += 2;
                         if (map_table)
                             bits = map_table[0];
                         else
                             bits = 0;
-                        if (pixels_read <= dbuf_len) {
-                            *destbuf++ = bits;
+                        run_length = 2;
+                        while (run_length-- > 0 && pixels_read < dbuf_len) {
                             *destbuf++ = bits;
+                            pixels_read++;
                         }
                     } else {
                         if (map_table)
@@ -660,21 +700,24 @@ static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
     }
 
     if (get_bits(&gb, 8))
-        av_log(0, AV_LOG_ERROR, "DVBSub error: line overflow\n");
+        av_log(avctx, AV_LOG_ERROR, "line overflow\n");
 
     (*srcbuf) += (get_bits_count(&gb) + 7) >> 3;
 
     return pixels_read;
 }
 
-static int dvbsub_read_8bit_string(uint8_t *destbuf, int dbuf_len,
+static int dvbsub_read_8bit_string(AVCodecContext *avctx,
+                                   uint8_t *destbuf, int dbuf_len,
                                     const uint8_t **srcbuf, int buf_size,
-                                    int non_mod, uint8_t *map_table)
+                                    int non_mod, uint8_t *map_table, int x_pos)
 {
     const uint8_t *sbuf_end = (*srcbuf) + buf_size;
     int bits;
     int run_length;
-    int pixels_read = 0;
+    int pixels_read = x_pos;
+
+    destbuf += x_pos;
 
     while (*srcbuf < sbuf_end && pixels_read < dbuf_len) {
         bits = *(*srcbuf)++;
@@ -694,30 +737,211 @@ static int dvbsub_read_8bit_string(uint8_t *destbuf, int dbuf_len,
                 if (run_length == 0) {
                     return pixels_read;
                 }
+
+                bits = 0;
             } else {
                 bits = *(*srcbuf)++;
-
-                if (non_mod == 1 && bits == 1)
-                    pixels_read += run_length;
             }
-            if (map_table)
-                bits = map_table[0];
-            else
-                bits = 0;
-            while (run_length-- > 0 && pixels_read < dbuf_len) {
-                *destbuf++ = bits;
-                pixels_read++;
+            if (non_mod == 1 && bits == 1)
+                pixels_read += run_length;
+            else {
+                if (map_table)
+                    bits = map_table[bits];
+                while (run_length-- > 0 && pixels_read < dbuf_len) {
+                    *destbuf++ = bits;
+                    pixels_read++;
+                }
             }
         }
     }
 
     if (*(*srcbuf)++)
-        av_log(0, AV_LOG_ERROR, "DVBSub error: line overflow\n");
+        av_log(avctx, AV_LOG_ERROR, "line overflow\n");
 
     return pixels_read;
 }
 
+static void compute_default_clut(AVPicture *frame, int w, int h)
+{
+    uint8_t list[256] = {0};
+    uint8_t list_inv[256];
+    int counttab[256] = {0};
+    int count, i, x, y;
+
+#define V(x,y) frame->data[0][(x) + (y)*frame->linesize[0]]
+    for (y = 0; y<h; y++) {
+        for (x = 0; x<w; x++) {
+            int v = V(x,y) + 1;
+            int vl = x     ? V(x-1,y) + 1 : 0;
+            int vr = x+1<w ? V(x+1,y) + 1 : 0;
+            int vt = y     ? V(x,y-1) + 1 : 0;
+            int vb = y+1<h ? V(x,y+1) + 1 : 0;
+            counttab[v-1] += !!((v!=vl) + (v!=vr) + (v!=vt) + (v!=vb));
+        }
+    }
+#define L(x,y) list[ frame->data[0][(x) + (y)*frame->linesize[0]] ]
+
+    for (i = 0; i<256; i++) {
+        int scoretab[256] = {0};
+        int bestscore = 0;
+        int bestv = 0;
+        for (y = 0; y<h; y++) {
+            for (x = 0; x<w; x++) {
+                int v = frame->data[0][x + y*frame->linesize[0]];
+                int l_m = list[v];
+                int l_l = x     ? L(x-1, y) : 1;
+                int l_r = x+1<w ? L(x+1, y) : 1;
+                int l_t = y     ? L(x, y-1) : 1;
+                int l_b = y+1<h ? L(x, y+1) : 1;
+                int score;
+                if (l_m)
+                    continue;
+                scoretab[v] += l_l + l_r + l_t + l_b;
+                score = 1024LL*scoretab[v] / counttab[v];
+                if (score > bestscore) {
+                    bestscore = score;
+                    bestv = v;
+                }
+            }
+        }
+        if (!bestscore)
+            break;
+        list    [ bestv ] = 1;
+        list_inv[     i ] = bestv;
+    }
+
+    count = i - 1;
+    for (i--; i>=0; i--) {
+        int v = i*255/count;
+        AV_WN32(frame->data[1] + 4*list_inv[i], RGBA(v/2,v,v/2,v));
+    }
+}
+
+
+static int save_subtitle_set(AVCodecContext *avctx, AVSubtitle *sub, int *got_output)
+{
+    DVBSubContext *ctx = avctx->priv_data;
+    DVBSubRegionDisplay *display;
+    DVBSubDisplayDefinition *display_def = ctx->display_definition;
+    DVBSubRegion *region;
+    AVSubtitleRect *rect;
+    DVBSubCLUT *clut;
+    uint32_t *clut_table;
+    int i;
+    int offset_x=0, offset_y=0;
+    int ret = 0;
+
+
+    if (display_def) {
+        offset_x = display_def->x;
+        offset_y = display_def->y;
+    }
+
+    /* Not touching AVSubtitles again*/
+    if(sub->num_rects) {
+        avpriv_request_sample(ctx, "Different Version of Segment asked Twice");
+        return AVERROR_PATCHWELCOME;
+    }
+    for (display = ctx->display_list; display; display = display->next) {
+        region = get_region(ctx, display->region_id);
+        if (region && region->dirty)
+            sub->num_rects++;
+    }
+
+    if(ctx->compute_edt == 0) {
+        sub->end_display_time = ctx->time_out * 1000;
+        *got_output = 1;
+    } else if (ctx->prev_start != AV_NOPTS_VALUE) {
+        sub->end_display_time = av_rescale_q((sub->pts - ctx->prev_start ), AV_TIME_BASE_Q, (AVRational){ 1, 1000 }) - 1;
+        *got_output = 1;
+    }
+    if (sub->num_rects > 0) {
+
+        sub->rects = av_mallocz_array(sizeof(*sub->rects), sub->num_rects);
+        if (!sub->rects) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        for(i=0; i<sub->num_rects; i++)
+            sub->rects[i] = av_mallocz(sizeof(*sub->rects[i]));
+
+        i = 0;
+
+        for (display = ctx->display_list; display; display = display->next) {
+            region = get_region(ctx, display->region_id);
+
+            if (!region)
+                continue;
+
+            if (!region->dirty)
+                continue;
+
+            rect = sub->rects[i];
+            rect->x = display->x_pos + offset_x;
+            rect->y = display->y_pos + offset_y;
+            rect->w = region->width;
+            rect->h = region->height;
+            rect->nb_colors = (1 << region->depth);
+            rect->type      = SUBTITLE_BITMAP;
+            rect->pict.linesize[0] = region->width;
+
+            clut = get_clut(ctx, region->clut);
+
+            if (!clut)
+                clut = &default_clut;
+
+            switch (region->depth) {
+            case 2:
+                clut_table = clut->clut4;
+                break;
+            case 8:
+                clut_table = clut->clut256;
+                break;
+            case 4:
+            default:
+                clut_table = clut->clut16;
+                break;
+            }
+
+            rect->pict.data[1] = av_mallocz(AVPALETTE_SIZE);
+            if (!rect->pict.data[1]) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+            memcpy(rect->pict.data[1], clut_table, (1 << region->depth) * sizeof(uint32_t));
+
+            rect->pict.data[0] = av_malloc(region->buf_size);
+            if (!rect->pict.data[0]) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            memcpy(rect->pict.data[0], region->pbuf, region->buf_size);
+
+            if ((clut == &default_clut && ctx->compute_clut == -1) || ctx->compute_clut == 1)
+                compute_default_clut(&rect->pict, rect->w, rect->h);
 
+            i++;
+        }
+    }
+
+    return 0;
+fail:
+    if (sub->rects) {
+        for(i=0; i<sub->num_rects; i++) {
+            rect = sub->rects[i];
+            if (rect) {
+                av_freep(&rect->pict.data[0]);
+                av_freep(&rect->pict.data[1]);
+            }
+            av_freep(&sub->rects[i]);
+        }
+        av_freep(&sub->rects);
+    }
+    sub->num_rects = 0;
+    return ret;
+}
 
 static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDisplay *display,
                                           const uint8_t *buf, int buf_size, int top_bottom, int non_mod)
@@ -736,6 +960,7 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
                          0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff};
     uint8_t *map_table;
 
+#if 0
     ff_dlog(avctx, "DVB pixel block size %d, %s field:\n", buf_size,
             top_bottom ? "bottom" : "top");
 
@@ -750,21 +975,22 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
 
     if (i % 16)
         ff_dlog(avctx, "\n");
+#endif
 
-    if (region == 0)
+    if (!region)
         return;
 
     pbuf = region->pbuf;
+    region->dirty = 1;
 
     x_pos = display->x_pos;
     y_pos = display->y_pos;
 
-    if ((y_pos & 1) != top_bottom)
-        y_pos++;
+    y_pos += top_bottom;
 
     while (buf < buf_end) {
-        if (x_pos > region->width || y_pos > region->height) {
-            av_log(avctx, AV_LOG_ERROR, "Invalid object location!\n");
+        if ((*buf!=0xf0 && x_pos >= region->width) || y_pos >= region->height) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid object location! %d-%d %d-%d %02x\n", x_pos, region->width, y_pos, region->height, *buf);
             return;
         }
 
@@ -777,9 +1003,9 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
             else
                 map_table = NULL;
 
-            x_pos += dvbsub_read_2bit_string(pbuf + (y_pos * region->width) + x_pos,
-                                                region->width - x_pos, &buf, buf_end - buf,
-                                                non_mod, map_table);
+            x_pos = dvbsub_read_2bit_string(avctx, pbuf + (y_pos * region->width),
+                                            region->width, &buf, buf_end - buf,
+                                            non_mod, map_table, x_pos);
             break;
         case 0x11:
             if (region->depth < 4) {
@@ -792,9 +1018,9 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
             else
                 map_table = NULL;
 
-            x_pos += dvbsub_read_4bit_string(pbuf + (y_pos * region->width) + x_pos,
-                                                region->width - x_pos, &buf, buf_end - buf,
-                                                non_mod, map_table);
+            x_pos = dvbsub_read_4bit_string(avctx, pbuf + (y_pos * region->width),
+                                            region->width, &buf, buf_end - buf,
+                                            non_mod, map_table, x_pos);
             break;
         case 0x12:
             if (region->depth < 8) {
@@ -802,9 +1028,9 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
                 return;
             }
 
-            x_pos += dvbsub_read_8bit_string(pbuf + (y_pos * region->width) + x_pos,
-                                                region->width - x_pos, &buf, buf_end - buf,
-                                                non_mod, NULL);
+            x_pos = dvbsub_read_8bit_string(avctx, pbuf + (y_pos * region->width),
+                                            region->width, &buf, buf_end - buf,
+                                            non_mod, NULL, x_pos);
             break;
 
         case 0x20:
@@ -839,7 +1065,6 @@ static int dvbsub_parse_object_segment(AVCodecContext *avctx,
     DVBSubContext *ctx = avctx->priv_data;
 
     const uint8_t *buf_end = buf + buf_size;
-    const uint8_t *block;
     int object_id;
     DVBSubObject *object;
     DVBSubObjectDisplay *display;
@@ -865,12 +1090,13 @@ static int dvbsub_parse_object_segment(AVCodecContext *avctx,
         buf += 2;
 
         if (buf + top_field_len + bottom_field_len > buf_end) {
-            av_log(avctx, AV_LOG_ERROR, "Field data size too large\n");
+            av_log(avctx, AV_LOG_ERROR, "Field data size %d+%d too large\n", top_field_len, bottom_field_len);
             return AVERROR_INVALIDDATA;
         }
 
         for (display = object->display_list; display; display = display->object_list_next) {
-            block = buf;
+            const uint8_t *block = buf;
+            int bfl = bottom_field_len;
 
             dvbsub_parse_pixel_data_block(avctx, display, block, top_field_len, 0,
                                             non_modifying_color);
@@ -878,9 +1104,9 @@ static int dvbsub_parse_object_segment(AVCodecContext *avctx,
             if (bottom_field_len > 0)
                 block = buf + top_field_len;
             else
-                bottom_field_len = top_field_len;
+                bfl = top_field_len;
 
-            dvbsub_parse_pixel_data_block(avctx, display, block, bottom_field_len, 1,
+            dvbsub_parse_pixel_data_block(avctx, display, block, bfl, 1,
                                             non_modifying_color);
         }
 
@@ -900,6 +1126,7 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
 
     const uint8_t *buf_end = buf + buf_size;
     int i, clut_id;
+    int version;
     DVBSubCLUT *clut;
     int entry_id, depth , full_range;
     int y, cr, cb, alpha;
@@ -917,6 +1144,7 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
         ff_dlog(avctx, "\n");
 
     clut_id = *buf++;
+    version = ((*buf)>>4)&15;
     buf += 1;
 
     clut = get_clut(ctx, clut_id);
@@ -929,11 +1157,16 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
         memcpy(clut, &default_clut, sizeof(DVBSubCLUT));
 
         clut->id = clut_id;
+        clut->version = -1;
 
         clut->next = ctx->clut_list;
         ctx->clut_list = clut;
     }
 
+    if (clut->version != version) {
+
+    clut->version = version;
+
     while (buf + 4 < buf_end) {
         entry_id = *buf++;
 
@@ -941,7 +1174,6 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
 
         if (depth == 0) {
             av_log(avctx, AV_LOG_ERROR, "Invalid clut depth 0x%x!\n", *buf);
-            return AVERROR_INVALIDDATA;
         }
 
         full_range = (*buf++) & 1;
@@ -967,14 +1199,20 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
         YUV_TO_RGB2_CCIR(r, g, b, y);
 
         ff_dlog(avctx, "clut %d := (%d,%d,%d,%d)\n", entry_id, r, g, b, alpha);
+        if (!!(depth & 0x80) + !!(depth & 0x40) + !!(depth & 0x20) > 1) {
+            ff_dlog(avctx, "More than one bit level marked: %x\n", depth);
+            if (avctx->strict_std_compliance > FF_COMPLIANCE_NORMAL)
+                return AVERROR_INVALIDDATA;
+        }
 
         if (depth & 0x80)
             clut->clut4[entry_id] = RGBA(r,g,b,255 - alpha);
-        if (depth & 0x40)
+        else if (depth & 0x40)
             clut->clut16[entry_id] = RGBA(r,g,b,255 - alpha);
-        if (depth & 0x20)
+        else if (depth & 0x20)
             clut->clut256[entry_id] = RGBA(r,g,b,255 - alpha);
     }
+    }
 
     return 0;
 }
@@ -987,6 +1225,7 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
 
     const uint8_t *buf_end = buf + buf_size;
     int region_id, object_id;
+    int av_unused version;
     DVBSubRegion *region;
     DVBSubObject *object;
     DVBSubObjectDisplay *display;
@@ -1005,11 +1244,13 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
             return AVERROR(ENOMEM);
 
         region->id = region_id;
+        region->version = -1;
 
         region->next = ctx->region_list;
         ctx->region_list = region;
     }
 
+    version = ((*buf)>>4) & 15;
     fill = ((*buf++) >> 3) & 1;
 
     region->width = AV_RB16(buf);
@@ -1023,10 +1264,15 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
         region->buf_size = region->width * region->height;
 
         region->pbuf = av_malloc(region->buf_size);
-        if (!region->pbuf)
+        if (!region->pbuf) {
+            region->buf_size =
+            region->width =
+            region->height = 0;
             return AVERROR(ENOMEM);
+        }
 
         fill = 1;
+        region->dirty = 0;
     }
 
     region->depth = 1 << (((*buf++) >> 2) & 7);
@@ -1036,9 +1282,10 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
     }
     region->clut = *buf++;
 
-    if (region->depth == 8)
+    if (region->depth == 8) {
         region->bgcolor = *buf++;
-    else {
+        buf += 1;
+    } else {
         buf += 1;
 
         if (region->depth == 4)
@@ -1102,7 +1349,7 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
 }
 
 static int dvbsub_parse_page_segment(AVCodecContext *avctx,
-                                     const uint8_t *buf, int buf_size)
+                                     const uint8_t *buf, int buf_size, AVSubtitle *sub, int *got_output)
 {
     DVBSubContext *ctx = avctx->priv_data;
     DVBSubRegionDisplay *display;
@@ -1111,22 +1358,36 @@ static int dvbsub_parse_page_segment(AVCodecContext *avctx,
     const uint8_t *buf_end = buf + buf_size;
     int region_id;
     int page_state;
+    int timeout;
+    int version;
 
     if (buf_size < 1)
         return AVERROR_INVALIDDATA;
 
-    ctx->time_out = *buf++;
+    timeout = *buf++;
+    version = ((*buf)>>4) & 15;
     page_state = ((*buf++) >> 2) & 3;
 
+    if (ctx->version == version) {
+        return 0;
+    }
+
+    ctx->time_out = timeout;
+    ctx->version = version;
+
     ff_dlog(avctx, "Page time out %ds, state %d\n", ctx->time_out, page_state);
 
-    if (page_state == 2) {
-        delete_state(ctx);
+    if(ctx->compute_edt == 1)
+        save_subtitle_set(avctx, sub, got_output);
+
+    if (page_state == 1 || page_state == 2) {
+        delete_regions(ctx);
+        delete_objects(ctx);
+        delete_cluts(ctx);
     }
 
     tmp_display_list = ctx->display_list;
     ctx->display_list = NULL;
-    ctx->display_list_size = 0;
 
     while (buf + 5 < buf_end) {
         region_id = *buf++;
@@ -1157,7 +1418,6 @@ static int dvbsub_parse_page_segment(AVCodecContext *avctx,
 
         display->next = ctx->display_list;
         ctx->display_list = display;
-        ctx->display_list_size++;
 
         ff_dlog(avctx, "Region %d, (%d,%d)\n", region_id, display->x_pos, display->y_pos);
     }
@@ -1167,7 +1427,7 @@ static int dvbsub_parse_page_segment(AVCodecContext *avctx,
 
         tmp_display_list = display->next;
 
-        av_free(display);
+        av_freep(&display);
     }
 
     return 0;
@@ -1195,6 +1455,9 @@ static void save_display_set(DVBSubContext *ctx)
     for (display = ctx->display_list; display; display = display->next) {
         region = get_region(ctx, display->region_id);
 
+        if (!region)
+            return;
+
         if (x_pos == -1) {
             x_pos = display->x_pos;
             y_pos = display->y_pos;
@@ -1225,17 +1488,20 @@ static void save_display_set(DVBSubContext *ctx)
 
         pbuf = av_malloc(width * height * 4);
         if (!pbuf)
-            return AVERROR(ENOMEM);
+            return;
 
         for (display = ctx->display_list; display; display = display->next) {
             region = get_region(ctx, display->region_id);
 
+            if (!region)
+                return;
+
             x_off = display->x_pos - x_pos;
             y_off = display->y_pos - y_pos;
 
             clut = get_clut(ctx, region->clut);
 
-            if (clut == 0)
+            if (!clut)
                 clut = &default_clut;
 
             switch (region->depth) {
@@ -1264,7 +1530,7 @@ static void save_display_set(DVBSubContext *ctx)
 
         png_save2(filename, pbuf, width, height);
 
-        av_free(pbuf);
+        av_freep(&pbuf);
     }
 
     fileno_index++;
@@ -1299,14 +1565,18 @@ static int dvbsub_parse_display_definition_segment(AVCodecContext *avctx,
     display_def->y       = 0;
     display_def->width   = bytestream_get_be16(&buf) + 1;
     display_def->height  = bytestream_get_be16(&buf) + 1;
-
-    if (buf_size < 13)
-        return AVERROR_INVALIDDATA;
+    if (!avctx->width || !avctx->height) {
+        avctx->width  = display_def->width;
+        avctx->height = display_def->height;
+    }
 
     if (info_byte & 1<<3) { // display_window_flag
+        if (buf_size < 13)
+            return AVERROR_INVALIDDATA;
+
         display_def->x = bytestream_get_be16(&buf);
-        display_def->y = bytestream_get_be16(&buf);
         display_def->width  = bytestream_get_be16(&buf) - display_def->x + 1;
+        display_def->y = bytestream_get_be16(&buf);
         display_def->height = bytestream_get_be16(&buf) - display_def->y + 1;
     }
 
@@ -1314,98 +1584,16 @@ static int dvbsub_parse_display_definition_segment(AVCodecContext *avctx,
 }
 
 static int dvbsub_display_end_segment(AVCodecContext *avctx, const uint8_t *buf,
-                                      int buf_size, AVSubtitle *sub)
+                                      int buf_size, AVSubtitle *sub,int *got_output)
 {
     DVBSubContext *ctx = avctx->priv_data;
-    DVBSubDisplayDefinition *display_def = ctx->display_definition;
-
-    DVBSubRegion *region;
-    DVBSubRegionDisplay *display;
-    AVSubtitleRect *rect;
-    DVBSubCLUT *clut;
-    uint32_t *clut_table;
-    int i;
-    int offset_x=0, offset_y=0;
-
-    sub->rects = NULL;
-    sub->start_display_time = 0;
-    sub->end_display_time = ctx->time_out * 1000;
-    sub->format = 0;
-
-    if (display_def) {
-        offset_x = display_def->x;
-        offset_y = display_def->y;
-    }
-
-    sub->num_rects = ctx->display_list_size;
-    if (sub->num_rects <= 0)
-        return AVERROR_INVALIDDATA;
-
-    sub->rects = av_mallocz_array(sub->num_rects * sub->num_rects,
-                                  sizeof(*sub->rects));
-    if (!sub->rects)
-        return AVERROR(ENOMEM);
-
-    i = 0;
-
-    for (display = ctx->display_list; display; display = display->next) {
-        region = get_region(ctx, display->region_id);
-        rect = sub->rects[i];
-
-        if (!region)
-            continue;
-
-        rect->x = display->x_pos + offset_x;
-        rect->y = display->y_pos + offset_y;
-        rect->w = region->width;
-        rect->h = region->height;
-        rect->nb_colors = 16;
-        rect->type      = SUBTITLE_BITMAP;
-        rect->pict.linesize[0] = region->width;
-
-        clut = get_clut(ctx, region->clut);
-
-        if (!clut)
-            clut = &default_clut;
-
-        switch (region->depth) {
-        case 2:
-            clut_table = clut->clut4;
-            break;
-        case 8:
-            clut_table = clut->clut256;
-            break;
-        case 4:
-        default:
-            clut_table = clut->clut16;
-            break;
-        }
-
-        rect->pict.data[1] = av_mallocz(AVPALETTE_SIZE);
-        if (!rect->pict.data[1]) {
-            av_free(sub->rects);
-            return AVERROR(ENOMEM);
-        }
-        memcpy(rect->pict.data[1], clut_table, (1 << region->depth) * sizeof(uint32_t));
-
-        rect->pict.data[0] = av_malloc(region->buf_size);
-        if (!rect->pict.data[0]) {
-            av_free(rect->pict.data[1]);
-            av_free(sub->rects);
-            return AVERROR(ENOMEM);
-        }
-        memcpy(rect->pict.data[0], region->pbuf, region->buf_size);
-
-        i++;
-    }
-
-    sub->num_rects = i;
 
+    if(ctx->compute_edt == 0)
+        save_subtitle_set(avctx, sub, got_output);
 #ifdef DEBUG
     save_display_set(ctx);
 #endif
-
-    return 1;
+    return 0;
 }
 
 static int dvbsub_decode(AVCodecContext *avctx,
@@ -1421,6 +1609,9 @@ static int dvbsub_decode(AVCodecContext *avctx,
     int page_id;
     int segment_length;
     int i;
+    int ret = 0;
+    int got_segment = 0;
+    int got_dds = 0;
 
     ff_dlog(avctx, "DVB sub packet:\n");
 
@@ -1449,9 +1640,14 @@ static int dvbsub_decode(AVCodecContext *avctx,
         segment_length = AV_RB16(p);
         p += 2;
 
+        if (avctx->debug & FF_DEBUG_STARTCODE) {
+            av_log(avctx, AV_LOG_DEBUG, "segment_type:%d page_id:%d segment_length:%d\n", segment_type, page_id, segment_length);
+        }
+
         if (p_end - p < segment_length) {
             ff_dlog(avctx, "incomplete or broken packet");
-            return -1;
+            ret = -1;
+            goto end;
         }
 
         if (page_id == ctx->composition_id || page_id == ctx->ancillary_id ||
@@ -1459,24 +1655,35 @@ static int dvbsub_decode(AVCodecContext *avctx,
             int ret = 0;
             switch (segment_type) {
             case DVBSUB_PAGE_SEGMENT:
-                ret = dvbsub_parse_page_segment(avctx, p, segment_length);
+                ret = dvbsub_parse_page_segment(avctx, p, segment_length, sub, data_size);
+                got_segment |= 1;
                 break;
             case DVBSUB_REGION_SEGMENT:
                 ret = dvbsub_parse_region_segment(avctx, p, segment_length);
+                got_segment |= 2;
                 break;
             case DVBSUB_CLUT_SEGMENT:
                 ret = dvbsub_parse_clut_segment(avctx, p, segment_length);
+                if (ret < 0) goto end;
+                got_segment |= 4;
                 break;
             case DVBSUB_OBJECT_SEGMENT:
                 ret = dvbsub_parse_object_segment(avctx, p, segment_length);
+                got_segment |= 8;
                 break;
             case DVBSUB_DISPLAYDEFINITION_SEGMENT:
                 ret = dvbsub_parse_display_definition_segment(avctx, p,
                                                               segment_length);
+                got_dds = 1;
                 break;
             case DVBSUB_DISPLAY_SEGMENT:
-                ret = dvbsub_display_end_segment(avctx, p, segment_length, sub);
-                *data_size = ret;
+                ret = dvbsub_display_end_segment(avctx, p, segment_length, sub, data_size);
+                if (got_segment == 15 && !got_dds && !avctx->width && !avctx->height) {
+                    // Default from ETSI EN 300 743 V1.3.1 (7.2.1)
+                    avctx->width  = 720;
+                    avctx->height = 576;
+                }
+                got_segment |= 16;
                 break;
             default:
                 ff_dlog(avctx, "Subtitling segment type 0x%x, page id %d, length %d\n",
@@ -1484,15 +1691,44 @@ static int dvbsub_decode(AVCodecContext *avctx,
                 break;
             }
             if (ret < 0)
-                return ret;
+                goto end;
         }
 
         p += segment_length;
     }
+    // Some streams do not send a display segment but if we have all the other
+    // segments then we need no further data.
+    if (got_segment == 15) {
+        av_log(avctx, AV_LOG_DEBUG, "Missing display_end_segment, emulating\n");
+        dvbsub_display_end_segment(avctx, p, 0, sub, data_size);
+    }
+
+end:
+    if(ret < 0) {
+        *data_size = 0;
+        avsubtitle_free(sub);
+        return ret;
+    } else {
+        if(ctx->compute_edt == 1 )
+            FFSWAP(int64_t, ctx->prev_start, sub->pts);
+    }
 
     return p - buf;
 }
 
+#define DS AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_SUBTITLE_PARAM
+static const AVOption options[] = {
+    {"compute_edt", "compute end of time using pts or timeout", offsetof(DVBSubContext, compute_edt), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, DS},
+    {"compute_clut", "compute clut when not available(-1) or always(1) or never(0)", offsetof(DVBSubContext, compute_clut), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 1, DS},
+    {"dvb_substream", "", offsetof(DVBSubContext, substream), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 63, DS},
+    {NULL}
+};
+static const AVClass dvbsubdec_class = {
+    .class_name = "DVB Sub Decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 AVCodec ff_dvbsub_decoder = {
     .name           = "dvbsub",
@@ -1503,4 +1739,5 @@ AVCodec ff_dvbsub_decoder = {
     .init           = dvbsub_init_decoder,
     .close          = dvbsub_close_decoder,
     .decode         = dvbsub_decode,
+    .priv_class     = &dvbsubdec_class,
 };
diff --git a/libavcodec/dvd_nav_parser.c b/libavcodec/dvd_nav_parser.c
new file mode 100644
index 0000000000..6e2352dc3c
--- /dev/null
+++ b/libavcodec/dvd_nav_parser.c
@@ -0,0 +1,115 @@
+/*
+ * DVD navigation block parser for FFmpeg
+ * Copyright (c) 2013 The FFmpeg Project
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "avcodec.h"
+#include "get_bits.h"
+#include "parser.h"
+
+#define PCI_SIZE  980
+#define DSI_SIZE 1018
+
+/* parser definition */
+typedef struct DVDNavParseContext {
+    uint32_t     lba;
+    uint8_t      buffer[PCI_SIZE+DSI_SIZE];
+    int          copied;
+} DVDNavParseContext;
+
+static av_cold int dvd_nav_parse_init(AVCodecParserContext *s)
+{
+    DVDNavParseContext *pc = s->priv_data;
+
+    pc->lba    = 0xFFFFFFFF;
+    pc->copied = 0;
+    return 0;
+}
+
+static int dvd_nav_parse(AVCodecParserContext *s,
+                         AVCodecContext *avctx,
+                         const uint8_t **poutbuf, int *poutbuf_size,
+                         const uint8_t *buf, int buf_size)
+{
+    DVDNavParseContext *pc1 = s->priv_data;
+    int lastPacket          = 0;
+    int valid               = 0;
+
+    s->pict_type = AV_PICTURE_TYPE_NONE;
+
+    avctx->time_base.num = 1;
+    avctx->time_base.den = 90000;
+
+    if (buf && buf_size) {
+        switch(buf[0]) {
+            case 0x00:
+                if (buf_size == PCI_SIZE) {
+                    /* PCI */
+                    uint32_t lba      = AV_RB32(&buf[0x01]);
+                    uint32_t startpts = AV_RB32(&buf[0x0D]);
+                    uint32_t endpts   = AV_RB32(&buf[0x11]);
+
+                    if (endpts > startpts) {
+                        pc1->lba    = lba;
+                        s->pts      = (int64_t)startpts;
+                        s->duration = endpts - startpts;
+
+                        memcpy(pc1->buffer, buf, PCI_SIZE);
+                        pc1->copied = PCI_SIZE;
+                        valid       = 1;
+                    }
+                }
+                break;
+
+            case 0x01:
+                if ((buf_size == DSI_SIZE) && (pc1->copied == PCI_SIZE)) {
+                    /* DSI */
+                    uint32_t lba = AV_RB32(&buf[0x05]);
+
+                    if (lba == pc1->lba) {
+                        memcpy(pc1->buffer + pc1->copied, buf, DSI_SIZE);
+                        lastPacket  = 1;
+                        valid       = 1;
+                    }
+                }
+                break;
+        }
+    }
+
+    if (!valid || lastPacket) {
+        pc1->copied = 0;
+        pc1->lba    = 0xFFFFFFFF;
+    }
+
+    if (lastPacket) {
+        *poutbuf      = pc1->buffer;
+        *poutbuf_size = sizeof(pc1->buffer);
+    } else {
+        *poutbuf      = NULL;
+        *poutbuf_size = 0;
+    }
+
+    return buf_size;
+}
+
+AVCodecParser ff_dvd_nav_parser = {
+    .codec_ids      = { AV_CODEC_ID_DVD_NAV },
+    .priv_data_size = sizeof(DVDNavParseContext),
+    .parser_init    = dvd_nav_parse_init,
+    .parser_parse   = dvd_nav_parse,
+};
diff --git a/libavcodec/dvdata.c b/libavcodec/dvdata.c
index 985cda7ab5..231569a328 100644
--- a/libavcodec/dvdata.c
+++ b/libavcodec/dvdata.c
@@ -2,20 +2,20 @@
  * Constants for DV codec
  * Copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,71 +69,6 @@ const uint8_t ff_dv_quant_shifts[22][4] = {
 
 const uint8_t ff_dv_quant_offset[4] = { 6, 3, 0, 1 };
 
-const int ff_dv_iweight_88[64] = {
-    32768, 16710, 16710, 17735, 17015, 17735, 18197, 18079,
-    18079, 18197, 18725, 18559, 19196, 18559, 18725, 19284,
-    19108, 19692, 19692, 19108, 19284, 21400, 19645, 20262,
-    20214, 20262, 19645, 21400, 22733, 21845, 20867, 20815,
-    20815, 20867, 21845, 22733, 23173, 23173, 21400, 21400,
-    21400, 23173, 23173, 24600, 23764, 22017, 22017, 23764,
-    24600, 25267, 24457, 22672, 24457, 25267, 25971, 25191,
-    25191, 25971, 26715, 27962, 26715, 29642, 29642, 31536,
-};
-const int ff_dv_iweight_248[64] = {
-    32768, 17735, 16710, 18079, 18725, 21400, 17735, 19196,
-    19108, 21845, 16384, 17735, 18725, 21400, 16710, 18079,
-    20262, 23173, 18197, 19692, 18725, 20262, 20815, 23764,
-    17735, 19196, 19108, 21845, 20262, 23173, 18197, 19692,
-    21400, 24457, 19284, 20867, 21400, 23173, 22017, 25191,
-    18725, 20262, 20815, 23764, 21400, 24457, 19284, 20867,
-    24457, 27962, 22733, 24600, 25971, 29642, 21400, 23173,
-    22017, 25191, 24457, 27962, 22733, 24600, 25971, 29642,
-};
-
-/**
- * The "inverse" DV100 weights are actually just the spec weights (zig-zagged).
- */
-const int ff_dv_iweight_1080_y[64] = {
-    128,  16,  16,  17,  17,  17,  18,  18,
-     18,  18,  18,  18,  19,  18,  18,  19,
-     19,  19,  19,  19,  19,  42,  38,  40,
-     40,  40,  38,  42,  44,  43,  41,  41,
-     41,  41,  43,  44,  45,  45,  42,  42,
-     42,  45,  45,  48,  46,  43,  43,  46,
-     48,  49,  48,  44,  48,  49, 101,  98,
-     98, 101, 104, 109, 104, 116, 116, 123,
-};
-const int ff_dv_iweight_1080_c[64] = {
-    128,  16,  16,  17,  17,  17,  25,  25,
-     25,  25,  26,  25,  26,  25,  26,  26,
-     26,  27,  27,  26,  26,  42,  38,  40,
-     40,  40,  38,  42,  44,  43,  41,  41,
-     41,  41,  43,  44,  91,  91,  84,  84,
-     84,  91,  91,  96,  93,  86,  86,  93,
-     96, 197, 191, 177, 191, 197, 203, 197,
-    197, 203, 209, 219, 209, 232, 232, 246,
-};
-const int ff_dv_iweight_720_y[64] = {
-    128,  16,  16,  17,  17,  17,  18,  18,
-     18,  18,  18,  18,  19,  18,  18,  19,
-     19,  19,  19,  19,  19,  42,  38,  40,
-     40,  40,  38,  42,  44,  43,  41,  41,
-     41,  41,  43,  44,  68,  68,  63,  63,
-     63,  68,  68,  96,  92,  86,  86,  92,
-     96,  98,  96,  88,  96,  98, 202, 196,
-    196, 202, 208, 218, 208, 232, 232, 246,
-};
-const int ff_dv_iweight_720_c[64] = {
-    128,  24,  24,  26,  26,  26,  36,  36,
-     36,  36,  36,  36,  38,  36,  36,  38,
-     38,  38,  38,  38,  38,  84,  76,  80,
-     80,  80,  76,  84,  88,  86,  82,  82,
-     82,  82,  86,  88, 182, 182, 168, 168,
-    168, 182, 182, 192, 186, 192, 172, 186,
-    192, 394, 382, 354, 382, 394, 406, 394,
-    394, 406, 418, 438, 418, 464, 464, 492,
-};
-
 /*
  * There's a catch about the following three tables: the mapping they establish
  * between (run, level) and vlc is not 1-1. So you have to watch out for that
diff --git a/libavcodec/dvdata.h b/libavcodec/dvdata.h
index 8e7c0fbcdf..e0ed043c47 100644
--- a/libavcodec/dvdata.h
+++ b/libavcodec/dvdata.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,13 +26,6 @@ extern const uint8_t ff_dv_zigzag248_direct[64];
 extern const uint8_t ff_dv_quant_shifts[22][4];
 extern const uint8_t ff_dv_quant_offset[4];
 
-extern const int ff_dv_iweight_88[64];
-extern const int ff_dv_iweight_248[64];
-extern const int ff_dv_iweight_1080_y[64];
-extern const int ff_dv_iweight_1080_c[64];
-extern const int ff_dv_iweight_720_y[64];
-extern const int ff_dv_iweight_720_c[64];
-
 #define NB_DV_VLC 409
 
 extern const uint16_t ff_dv_vlc_bits[NB_DV_VLC];
diff --git a/libavcodec/dvdec.c b/libavcodec/dvdec.c
index 9ee9933cdc..679075e6a9 100644
--- a/libavcodec/dvdec.c
+++ b/libavcodec/dvdec.c
@@ -13,20 +13,20 @@
  * Many thanks to Dan Dennedy <dan@dennedy.org> for providing wealth
  * of DV technical info.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,12 +35,14 @@
  * DV decoder
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 #include "libavutil/pixdesc.h"
 
 #include "avcodec.h"
 #include "dv.h"
+#include "dv_profile_internal.h"
 #include "dvdata.h"
 #include "get_bits.h"
 #include "idctdsp.h"
@@ -60,18 +62,136 @@ typedef struct BlockInfo {
 
 static const int dv_iweight_bits = 14;
 
+static const uint16_t dv_iweight_88[64] = {
+    32768, 16705, 16705, 17734, 17032, 17734, 18205, 18081,
+    18081, 18205, 18725, 18562, 19195, 18562, 18725, 19266,
+    19091, 19705, 19705, 19091, 19266, 21407, 19643, 20267,
+    20228, 20267, 19643, 21407, 22725, 21826, 20853, 20806,
+    20806, 20853, 21826, 22725, 23170, 23170, 21407, 21400,
+    21407, 23170, 23170, 24598, 23786, 22018, 22018, 23786,
+    24598, 25251, 24465, 22654, 24465, 25251, 25972, 25172,
+    25172, 25972, 26722, 27969, 26722, 29692, 29692, 31521,
+};
+static const uint16_t dv_iweight_248[64] = {
+    32768, 16384, 16705, 16705, 17734, 17734, 17734, 17734,
+    18081, 18081, 18725, 18725, 21407, 21407, 19091, 19091,
+    19195, 19195, 18205, 18205, 18725, 18725, 19705, 19705,
+    20267, 20267, 21826, 21826, 23170, 23170, 20806, 20806,
+    20267, 20267, 19266, 19266, 21407, 21407, 20853, 20853,
+    21400, 21400, 23786, 23786, 24465, 24465, 22018, 22018,
+    23170, 23170, 22725, 22725, 24598, 24598, 24465, 24465,
+    25172, 25172, 27969, 27969, 25972, 25972, 29692, 29692
+};
+
+/**
+ * The "inverse" DV100 weights are actually just the spec weights (zig-zagged).
+ */
+static const uint16_t dv_iweight_1080_y[64] = {
+    128,  16,  16,  17,  17,  17,  18,  18,
+     18,  18,  18,  18,  19,  18,  18,  19,
+     19,  19,  19,  19,  19,  42,  38,  40,
+     40,  40,  38,  42,  44,  43,  41,  41,
+     41,  41,  43,  44,  45,  45,  42,  42,
+     42,  45,  45,  48,  46,  43,  43,  46,
+     48,  49,  48,  44,  48,  49, 101,  98,
+     98, 101, 104, 109, 104, 116, 116, 123,
+};
+static const uint16_t dv_iweight_1080_c[64] = {
+    128,  16,  16,  17,  17,  17,  25,  25,
+     25,  25,  26,  25,  26,  25,  26,  26,
+     26,  27,  27,  26,  26,  42,  38,  40,
+     40,  40,  38,  42,  44,  43,  41,  41,
+     41,  41,  43,  44,  91,  91,  84,  84,
+     84,  91,  91,  96,  93,  86,  86,  93,
+     96, 197, 191, 177, 191, 197, 203, 197,
+    197, 203, 209, 219, 209, 232, 232, 246,
+};
+static const uint16_t dv_iweight_720_y[64] = {
+    128,  16,  16,  17,  17,  17,  18,  18,
+     18,  18,  18,  18,  19,  18,  18,  19,
+     19,  19,  19,  19,  19,  42,  38,  40,
+     40,  40,  38,  42,  44,  43,  41,  41,
+     41,  41,  43,  44,  68,  68,  63,  63,
+     63,  68,  68,  96,  92,  86,  86,  92,
+     96,  98,  96,  88,  96,  98, 202, 196,
+    196, 202, 208, 218, 208, 232, 232, 246,
+};
+static const uint16_t dv_iweight_720_c[64] = {
+    128,  24,  24,  26,  26,  26,  36,  36,
+     36,  36,  36,  36,  38,  36,  36,  38,
+     38,  38,  38,  38,  38,  84,  76,  80,
+     80,  80,  76,  84,  88,  86,  82,  82,
+     82,  82,  86,  88, 182, 182, 168, 168,
+    168, 182, 182, 192, 186, 192, 172, 186,
+    192, 394, 382, 354, 382, 394, 406, 394,
+    394, 406, 418, 438, 418, 464, 464, 492,
+};
+
+static void dv_init_weight_tables(DVVideoContext *ctx, const AVDVProfile *d)
+{
+    int j, i, c, s;
+    uint32_t *factor1 = &ctx->idct_factor[0],
+             *factor2 = &ctx->idct_factor[DV_PROFILE_IS_HD(d) ? 4096 : 2816];
+
+    if (DV_PROFILE_IS_HD(d)) {
+        /* quantization quanta by QNO for DV100 */
+        static const uint8_t dv100_qstep[16] = {
+            1, /* QNO = 0 and 1 both have no quantization */
+            1,
+            2, 3, 4, 5, 6, 7, 8, 16, 18, 20, 22, 24, 28, 52
+        };
+        const uint16_t *iweight1, *iweight2;
+
+        if (d->height == 720) {
+            iweight1 = &dv_iweight_720_y[0];
+            iweight2 = &dv_iweight_720_c[0];
+        } else {
+            iweight1 = &dv_iweight_1080_y[0];
+            iweight2 = &dv_iweight_1080_c[0];
+        }
+        for (c = 0; c < 4; c++) {
+            for (s = 0; s < 16; s++) {
+                for (i = 0; i < 64; i++) {
+                    *factor1++ = (dv100_qstep[s] << (c + 9)) * iweight1[i];
+                    *factor2++ = (dv100_qstep[s] << (c + 9)) * iweight2[i];
+                }
+            }
+        }
+    } else {
+        static const uint8_t dv_quant_areas[4] = { 6, 21, 43, 64 };
+        const uint16_t *iweight1 = &dv_iweight_88[0];
+        for (j = 0; j < 2; j++, iweight1 = &dv_iweight_248[0]) {
+            for (s = 0; s < 22; s++) {
+                for (i = c = 0; c < 4; c++) {
+                    for (; i < dv_quant_areas[c]; i++) {
+                        *factor1   = iweight1[i] << (ff_dv_quant_shifts[s][c] + 1);
+                        *factor2++ = (*factor1++) << 1;
+                    }
+                }
+            }
+        }
+    }
+}
+
 static av_cold int dvvideo_decode_init(AVCodecContext *avctx)
 {
     DVVideoContext *s = avctx->priv_data;
     IDCTDSPContext idsp;
     int i;
 
+    memset(&idsp,0, sizeof(idsp));
     ff_idctdsp_init(&idsp, avctx);
 
     for (i = 0; i < 64; i++)
         s->dv_zigzag[0][i] = idsp.idct_permutation[ff_zigzag_direct[i]];
 
-    memcpy(s->dv_zigzag[1], ff_dv_zigzag248_direct, sizeof(s->dv_zigzag[1]));
+    if (avctx->lowres){
+        for (i = 0; i < 64; i++){
+            int j = ff_dv_zigzag248_direct[i];
+            s->dv_zigzag[1][i] = idsp.idct_permutation[(j & 7) + (j & 8) * 4 + (j & 48) / 2];
+        }
+    }else
+        memcpy(s->dv_zigzag[1], ff_dv_zigzag248_direct, sizeof(s->dv_zigzag[1]));
 
     s->idct_put[0] = idsp.idct_put;
     s->idct_put[1] = ff_simple_idct248_put;
@@ -169,11 +289,17 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
     LOCAL_ALIGNED_16(int16_t, sblock, [5 * DV_MAX_BPM], [64]);
     LOCAL_ALIGNED_16(uint8_t, mb_bit_buffer, [80     + AV_INPUT_BUFFER_PADDING_SIZE]); /* allow some slack */
     LOCAL_ALIGNED_16(uint8_t, vs_bit_buffer, [80 * 5 + AV_INPUT_BUFFER_PADDING_SIZE]); /* allow some slack */
-    const int log2_blocksize = 3;
+    const int log2_blocksize = 3-s->avctx->lowres;
     int is_field_mode[5];
+    int vs_bit_buffer_damaged = 0;
+    int mb_bit_buffer_damaged[5] = {0};
+    int retried = 0;
+    int sta;
+
+    av_assert1((((int) mb_bit_buffer) & 7) == 0);
+    av_assert1((((int) vs_bit_buffer) & 7) == 0);
 
-    assert((((int) mb_bit_buffer) & 7) == 0);
-    assert((((int) vs_bit_buffer) & 7) == 0);
+retry:
 
     memset(sblock, 0, 5 * DV_MAX_BPM * sizeof(*sblock));
 
@@ -185,6 +311,14 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
     for (mb_index = 0; mb_index < 5; mb_index++, mb1 += s->sys->bpm, block1 += s->sys->bpm * 64) {
         /* skip header */
         quant    = buf_ptr[3] & 0x0f;
+        if (avctx->error_concealment) {
+            if ((buf_ptr[3] >> 4) == 0x0E)
+                vs_bit_buffer_damaged = 1;
+            if (!mb_index) {
+                sta = buf_ptr[3] >> 4;
+            } else if (sta != (buf_ptr[3] >> 4))
+                vs_bit_buffer_damaged = 1;
+        }
         buf_ptr += 4;
         init_put_bits(&pb, mb_bit_buffer, 80);
         mb    = mb1;
@@ -229,11 +363,16 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
              * block is finished */
             if (mb->pos >= 64)
                 bit_copy(&pb, &gb);
+            if (mb->pos >= 64 && mb->pos < 127)
+                vs_bit_buffer_damaged = mb_bit_buffer_damaged[mb_index] = 1;
 
             block += 64;
             mb++;
         }
 
+        if (mb_bit_buffer_damaged[mb_index] > 0)
+            continue;
+
         /* pass 2: we can do it just after */
         ff_dlog(avctx, "***pass 2 size=%d MB#=%d\n", put_bits_count(&pb), mb_index);
         block = block1;
@@ -247,6 +386,8 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
                 /* if still not finished, no need to parse other blocks */
                 if (mb->pos < 64)
                     break;
+                if (mb->pos < 127)
+                    vs_bit_buffer_damaged = mb_bit_buffer_damaged[mb_index] = 1;
             }
         }
         /* all blocks are finished, so the extra bytes can be used at
@@ -264,17 +405,25 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
     flush_put_bits(&vs_pb);
     for (mb_index = 0; mb_index < 5; mb_index++) {
         for (j = 0; j < s->sys->bpm; j++) {
-            if (mb->pos < 64) {
+            if (mb->pos < 64 && get_bits_left(&gb) > 0 && !vs_bit_buffer_damaged) {
                 ff_dlog(avctx, "start %d:%d\n", mb_index, j);
                 dv_decode_ac(&gb, mb, block);
             }
-            if (mb->pos >= 64 && mb->pos < 127)
+
+            if (mb->pos >= 64 && mb->pos < 127) {
                 av_log(avctx, AV_LOG_ERROR,
                        "AC EOB marker is absent pos=%d\n", mb->pos);
+                vs_bit_buffer_damaged = 1;
+            }
             block += 64;
             mb++;
         }
     }
+    if (vs_bit_buffer_damaged && !retried) {
+        av_log(avctx, AV_LOG_ERROR, "Concealing bitstream errors\n");
+        retried = 1;
+        goto retry;
+    }
 
     /* compute idct and place blocks */
     block = &sblock[0][0];
@@ -317,9 +466,9 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
                 int x, y;
                 mb->idct_put(pixels, 8, block);
                 for (y = 0; y < (1 << log2_blocksize); y++, c_ptr += s->frame->linesize[j], pixels += 8) {
-                    ptr1   = pixels + (1 << (log2_blocksize - 1));
+                    ptr1   = pixels + ((1 << (log2_blocksize))>>1);
                     c_ptr1 = c_ptr + (s->frame->linesize[j] << log2_blocksize);
-                    for (x = 0; x < (1 << (log2_blocksize - 1)); x++) {
+                    for (x = 0; x < (1 << FFMAX(log2_blocksize - 1, 0)); x++) {
                         c_ptr[x]  = pixels[x];
                         c_ptr1[x] = ptr1[x];
                     }
@@ -354,7 +503,7 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, void *data,
     int apt, is16_9, ret;
     const AVDVProfile *sys;
 
-    sys = av_dv_frame_profile(s->sys, buf, buf_size);
+    sys = ff_dv_frame_profile(avctx, s->sys, buf, buf_size);
     if (!sys || buf_size < sys->frame_size) {
         av_log(avctx, AV_LOG_ERROR, "could not find dv frame profile\n");
         return -1; /* NOTE: we only accept several full frames */
@@ -366,6 +515,7 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, void *data,
             av_log(avctx, AV_LOG_ERROR, "Error initializing the work tables.\n");
             return ret;
         }
+        dv_init_weight_tables(s, sys);
         s->sys = sys;
     }
 
@@ -388,13 +538,16 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, void *data,
         ff_set_sar(avctx, s->sys->sar[is16_9]);
     }
 
-    if (ff_get_buffer(avctx, s->frame, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return -1;
-    }
+    if ((ret = ff_get_buffer(avctx, s->frame, 0)) < 0)
+        return ret;
     s->frame->interlaced_frame = 1;
     s->frame->top_field_first  = 0;
 
+    /* Determine the codec's field order from the packet */
+    if ( *vsc_pack == dv_video_control ) {
+        s->frame->top_field_first = !(vsc_pack[3] & 0x40);
+    }
+
     s->buf = buf;
     avctx->execute(avctx, dv_decode_video_segment, s->work_chunks, NULL,
                    dv_work_pool_size(s->sys), sizeof(DVwork_chunk));
@@ -416,4 +569,5 @@ AVCodec ff_dvvideo_decoder = {
     .init           = dvvideo_decode_init,
     .decode         = dvvideo_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS,
+    .max_lowres     = 3,
 };
diff --git a/libavcodec/dvdsub_parser.c b/libavcodec/dvdsub_parser.c
index 2ad3b33602..32a945ed65 100644
--- a/libavcodec/dvdsub_parser.c
+++ b/libavcodec/dvdsub_parser.c
@@ -2,20 +2,20 @@
  * DVD subtitle decoding
  * Copyright (c) 2005 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,8 +45,11 @@ static int dvdsub_parse(AVCodecParserContext *s,
     DVDSubParseContext *pc = s->priv_data;
 
     if (pc->packet_index == 0) {
-        if (buf_size < 2)
-            return 0;
+        if (buf_size < 2 || AV_RB16(buf) && buf_size < 6) {
+            if (buf_size)
+                av_log(avctx, AV_LOG_DEBUG, "Parser input %d too small\n", buf_size);
+            return buf_size;
+        }
         pc->packet_len = AV_RB16(buf);
         if (pc->packet_len == 0) /* HD-DVD subpicture packet */
             pc->packet_len = AV_RB32(buf+2);
diff --git a/libavcodec/dvdsubdec.c b/libavcodec/dvdsubdec.c
index a309e74346..6f3fda3a29 100644
--- a/libavcodec/dvdsubdec.c
+++ b/libavcodec/dvdsubdec.c
@@ -2,20 +2,20 @@
  * DVD subtitle decoding
  * Copyright (c) 2005 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,12 +25,26 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/colorspace.h"
+#include "libavutil/opt.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/avstring.h"
+#include "libavutil/bswap.h"
 
-typedef struct DVDSubContext {
-    uint32_t palette[16];
-    int      has_palette;
+typedef struct DVDSubContext
+{
+  AVClass *class;
+  uint32_t palette[16];
+  char    *palette_str;
+  char    *ifo_str;
+  int      has_palette;
+  uint8_t  colormap[4];
+  uint8_t  alpha[256];
+  uint8_t  buf[0x10000];
+  int      buf_size;
+  int      forced_subs_only;
+#ifdef DEBUG
+  int sub_id;
+#endif
 } DVDSubContext;
 
 static void yuv_a_to_rgba(const uint8_t *ycbcr, const uint8_t *alpha, uint32_t *rgba, int num_values)
@@ -94,6 +108,12 @@ static int decode_rle(uint8_t *bitmap, int linesize, int w, int h,
     int x, y, len, color;
     uint8_t *d;
 
+    if (start >= buf_size)
+        return -1;
+
+    if (w <= 0 || h <= 0)
+        return -1;
+
     bit_len = (buf_size - start) * 8;
     init_get_bits(&gb, buf + start, bit_len);
 
@@ -125,17 +145,24 @@ static int decode_rle(uint8_t *bitmap, int linesize, int w, int h,
 
 static void guess_palette(DVDSubContext* ctx,
                           uint32_t *rgba_palette,
-                          uint8_t *colormap,
-                          uint8_t *alpha,
                           uint32_t subtitle_color)
 {
+    static const uint8_t level_map[4][4] = {
+        // this configuration (full range, lowest to highest) in tests
+        // seemed most common, so assume this
+        {0xff},
+        {0x00, 0xff},
+        {0x00, 0x80, 0xff},
+        {0x00, 0x55, 0xaa, 0xff},
+    };
     uint8_t color_used[16] = { 0 };
     int nb_opaque_colors, i, level, j, r, g, b;
+    uint8_t *colormap = ctx->colormap, *alpha = ctx->alpha;
 
-    if (ctx->has_palette) {
-        for (i = 0; i < 4; i++)
+    if(ctx->has_palette) {
+        for(i = 0; i < 4; i++)
             rgba_palette[i] = (ctx->palette[colormap[i]] & 0x00ffffff)
-                              | ((alpha[i] * 17) << 24);
+                              | ((alpha[i] * 17U) << 24);
         return;
     }
 
@@ -153,18 +180,18 @@ static void guess_palette(DVDSubContext* ctx,
     if (nb_opaque_colors == 0)
         return;
 
-    j = nb_opaque_colors;
+    j = 0;
     memset(color_used, 0, 16);
     for(i = 0; i < 4; i++) {
         if (alpha[i] != 0) {
             if (!color_used[colormap[i]])  {
-                level = (0xff * j) / nb_opaque_colors;
+                level = level_map[nb_opaque_colors][j];
                 r = (((subtitle_color >> 16) & 0xff) * level) >> 8;
                 g = (((subtitle_color >> 8) & 0xff) * level) >> 8;
                 b = (((subtitle_color >> 0) & 0xff) * level) >> 8;
                 rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17) << 24);
                 color_used[colormap[i]] = (i + 1);
-                j--;
+                j++;
             } else {
                 rgba_palette[i] = (rgba_palette[color_used[colormap[i]] - 1] & 0x00ffffff) |
                                     ((alpha[i] * 17) << 24);
@@ -173,6 +200,21 @@ static void guess_palette(DVDSubContext* ctx,
     }
 }
 
+static void reset_rects(AVSubtitle *sub_header)
+{
+    int i;
+
+    if (sub_header->rects) {
+        for (i = 0; i < sub_header->num_rects; i++) {
+            av_freep(&sub_header->rects[i]->pict.data[0]);
+            av_freep(&sub_header->rects[i]->pict.data[1]);
+            av_freep(&sub_header->rects[i]);
+        }
+        av_freep(&sub_header->rects);
+        sub_header->num_rects = 0;
+    }
+}
+
 #define READ_OFFSET(a) (big_offsets ? AV_RB32(a) : AV_RB16(a))
 
 static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
@@ -180,15 +222,15 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
 {
     int cmd_pos, pos, cmd, x1, y1, x2, y2, offset1, offset2, next_cmd_pos;
     int big_offsets, offset_size, is_8bit = 0;
-    const uint8_t *yuv_palette = 0;
-    uint8_t colormap[4] = { 0 }, alpha[256] = { 0 };
+    const uint8_t *yuv_palette = NULL;
+    uint8_t *colormap = ctx->colormap, *alpha = ctx->alpha;
     int date;
     int i;
     int is_menu = 0;
+    uint32_t size;
 
     if (buf_size < 10)
         return -1;
-    memset(sub_header, 0, sizeof(*sub_header));
 
     if (AV_RB16(buf) == 0) {   /* HD subpicture with 4-byte offsets */
         big_offsets = 1;
@@ -200,8 +242,17 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
         cmd_pos = 2;
     }
 
+    size = READ_OFFSET(buf + (big_offsets ? 2 : 0));
     cmd_pos = READ_OFFSET(buf + cmd_pos);
 
+    if (cmd_pos < 0 || cmd_pos > buf_size - 2 - offset_size) {
+        if (cmd_pos > size) {
+            av_log(ctx, AV_LOG_ERROR, "Discarding invalid packet\n");
+            return 0;
+        }
+        return AVERROR(EAGAIN);
+    }
+
     while (cmd_pos > 0 && cmd_pos < buf_size - 2 - offset_size) {
         date = AV_RB16(buf + cmd_pos);
         next_cmd_pos = READ_OFFSET(buf + cmd_pos + 2);
@@ -246,7 +297,7 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                 alpha[1] = buf[pos + 1] >> 4;
                 alpha[0] = buf[pos + 1] & 0x0f;
                 pos += 2;
-            ff_dlog(NULL, "alpha=%x%x%x%x\n", alpha[0],alpha[1],alpha[2],alpha[3]);
+                ff_dlog(NULL, "alpha=%x%x%x%x\n", alpha[0],alpha[1],alpha[2],alpha[3]);
                 break;
             case 0x05:
             case 0x85:
@@ -310,19 +361,11 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
             w = x2 - x1 + 1;
             if (w < 0)
                 w = 0;
-            h = y2 - y1;
+            h = y2 - y1 + 1;
             if (h < 0)
                 h = 0;
             if (w > 0 && h > 0) {
-                if (sub_header->rects) {
-                    for (i = 0; i < sub_header->num_rects; i++) {
-                        av_freep(&sub_header->rects[i]->pict.data[0]);
-                        av_freep(&sub_header->rects[i]->pict.data[1]);
-                        av_freep(&sub_header->rects[i]);
-                    }
-                    av_freep(&sub_header->rects);
-                    sub_header->num_rects = 0;
-                }
+                reset_rects(sub_header);
 
                 sub_header->rects = av_mallocz(sizeof(*sub_header->rects));
                 if (!sub_header->rects)
@@ -334,23 +377,24 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                 bitmap = sub_header->rects[0]->pict.data[0] = av_malloc(w * h);
                 if (!bitmap)
                     goto fail;
-                decode_rle(bitmap, w * 2, w, (h + 1) / 2,
-                           buf, offset1, buf_size, is_8bit);
-                decode_rle(bitmap + w, w * 2, w, h / 2,
-                           buf, offset2, buf_size, is_8bit);
+                if (decode_rle(bitmap, w * 2, w, (h + 1) / 2,
+                               buf, offset1, buf_size, is_8bit) < 0)
+                    goto fail;
+                if (decode_rle(bitmap + w, w * 2, w, h / 2,
+                               buf, offset2, buf_size, is_8bit) < 0)
+                    goto fail;
                 sub_header->rects[0]->pict.data[1] = av_mallocz(AVPALETTE_SIZE);
                 if (!sub_header->rects[0]->pict.data[1])
                     goto fail;
                 if (is_8bit) {
-                    if (yuv_palette == 0)
+                    if (!yuv_palette)
                         goto fail;
                     sub_header->rects[0]->nb_colors = 256;
                     yuv_a_to_rgba(yuv_palette, alpha, (uint32_t*)sub_header->rects[0]->pict.data[1], 256);
                 } else {
                     sub_header->rects[0]->nb_colors = 4;
-                    guess_palette(ctx,
-                                  (uint32_t*)sub_header->rects[0]->pict.data[1],
-                                  colormap, alpha, 0xffff00);
+                    guess_palette(ctx, (uint32_t*)sub_header->rects[0]->pict.data[1],
+                                  0xffff00);
                 }
                 sub_header->rects[0]->x = x1;
                 sub_header->rects[0]->y = y1;
@@ -358,8 +402,13 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                 sub_header->rects[0]->h = h;
                 sub_header->rects[0]->type = SUBTITLE_BITMAP;
                 sub_header->rects[0]->pict.linesize[0] = w;
+                sub_header->rects[0]->flags = is_menu ? AV_SUBTITLE_FLAG_FORCED : 0;
             }
         }
+        if (next_cmd_pos < cmd_pos) {
+            av_log(ctx, AV_LOG_ERROR, "Invalid command offset\n");
+            break;
+        }
         if (next_cmd_pos == cmd_pos)
             break;
         cmd_pos = next_cmd_pos;
@@ -367,15 +416,7 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
     if (sub_header->num_rects > 0)
         return is_menu;
  fail:
-    if (!sub_header->rects) {
-        for (i = 0; i < sub_header->num_rects; i++) {
-            av_freep(&sub_header->rects[i]->pict.data[0]);
-            av_freep(&sub_header->rects[i]->pict.data[1]);
-            av_freep(&sub_header->rects[i]);
-        }
-        av_freep(&sub_header->rects);
-        sub_header->num_rects = 0;
-    }
+    reset_rects(sub_header);
     return -1;
 }
 
@@ -446,16 +487,19 @@ static int find_smallest_bounding_rectangle(AVSubtitle *s)
 }
 
 #ifdef DEBUG
+#define ALPHA_MIX(A,BACK,FORE) (((255-(A)) * (BACK) + (A) * (FORE)) / 255)
 static void ppm_save(const char *filename, uint8_t *bitmap, int w, int h,
                      uint32_t *rgba_palette)
 {
-    int x, y, v;
+    int x, y, alpha;
+    uint32_t v;
+    int back[3] = {0, 255, 0};  /* green background */
     FILE *f;
 
     f = fopen(filename, "w");
     if (!f) {
         perror(filename);
-        exit(1);
+        return;
     }
     fprintf(f, "P6\n"
             "%d %d\n"
@@ -464,15 +508,32 @@ static void ppm_save(const char *filename, uint8_t *bitmap, int w, int h,
     for(y = 0; y < h; y++) {
         for(x = 0; x < w; x++) {
             v = rgba_palette[bitmap[y * w + x]];
-            putc((v >> 16) & 0xff, f);
-            putc((v >> 8) & 0xff, f);
-            putc((v >> 0) & 0xff, f);
+            alpha = v >> 24;
+            putc(ALPHA_MIX(alpha, back[0], (v >> 16) & 0xff), f);
+            putc(ALPHA_MIX(alpha, back[1], (v >> 8) & 0xff), f);
+            putc(ALPHA_MIX(alpha, back[2], (v >> 0) & 0xff), f);
         }
     }
     fclose(f);
 }
 #endif
 
+static int append_to_cached_buf(AVCodecContext *avctx,
+                                const uint8_t *buf, int buf_size)
+{
+    DVDSubContext *ctx = avctx->priv_data;
+
+    if (ctx->buf_size >= sizeof(ctx->buf) - buf_size) {
+        av_log(avctx, AV_LOG_WARNING, "Attempt to reconstruct "
+               "too large SPU packets aborted.\n");
+        ctx->buf_size = 0;
+        return AVERROR_INVALIDDATA;
+    }
+    memcpy(ctx->buf + ctx->buf_size, buf, buf_size);
+    ctx->buf_size += buf_size;
+    return 0;
+}
+
 static int dvdsub_decode(AVCodecContext *avctx,
                          void *data, int *data_size,
                          AVPacket *avpkt)
@@ -481,12 +542,29 @@ static int dvdsub_decode(AVCodecContext *avctx,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     AVSubtitle *sub = data;
+    int appended = 0;
     int is_menu;
 
+    if (ctx->buf_size) {
+        int ret = append_to_cached_buf(avctx, buf, buf_size);
+        if (ret < 0) {
+            *data_size = 0;
+            return ret;
+        }
+        buf = ctx->buf;
+        buf_size = ctx->buf_size;
+        appended = 1;
+    }
+
     is_menu = decode_dvd_subtitles(ctx, sub, buf, buf_size);
+    if (is_menu == AVERROR(EAGAIN)) {
+        *data_size = 0;
+        return appended ? 0 : append_to_cached_buf(avctx, buf, buf_size);
+    }
 
     if (is_menu < 0) {
     no_subtitle:
+        reset_rects(sub);
         *data_size = 0;
 
         return buf_size;
@@ -494,61 +572,189 @@ static int dvdsub_decode(AVCodecContext *avctx,
     if (!is_menu && find_smallest_bounding_rectangle(sub) == 0)
         goto no_subtitle;
 
+    if (ctx->forced_subs_only && !(sub->rects[0]->flags & AV_SUBTITLE_FLAG_FORCED))
+        goto no_subtitle;
+
 #if defined(DEBUG)
+    {
+    char ppm_name[32];
+
+    snprintf(ppm_name, sizeof(ppm_name), "/tmp/%05d.ppm", ctx->sub_id++);
     ff_dlog(NULL, "start=%d ms end =%d ms\n",
             sub->start_display_time,
             sub->end_display_time);
-    ppm_save("/tmp/a.ppm", sub->rects[0]->pict.data[0],
-             sub->rects[0]->w, sub->rects[0]->h, sub->rects[0]->pict.data[1]);
+    ppm_save(ppm_name, sub->rects[0]->pict.data[0],
+             sub->rects[0]->w, sub->rects[0]->h, (uint32_t*) sub->rects[0]->pict.data[1]);
+    }
 #endif
 
+    ctx->buf_size = 0;
     *data_size = 1;
     return buf_size;
 }
 
-static av_cold int dvdsub_init(AVCodecContext *avctx)
+static void parse_palette(DVDSubContext *ctx, char *p)
 {
-    DVDSubContext *ctx = avctx->priv_data;
-    char *data, *cur;
+    int i;
+
+    ctx->has_palette = 1;
+    for(i=0;i<16;i++) {
+        ctx->palette[i] = strtoul(p, &p, 16);
+        while(*p == ',' || av_isspace(*p))
+            p++;
+    }
+}
+
+static int parse_ifo_palette(DVDSubContext *ctx, char *p)
+{
+    FILE *ifo;
+    char ifostr[12];
+    uint32_t sp_pgci, pgci, off_pgc, pgc;
+    uint8_t r, g, b, yuv[65], *buf;
+    int i, y, cb, cr, r_add, g_add, b_add;
     int ret = 0;
+    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
+
+    ctx->has_palette = 0;
+    if ((ifo = fopen(p, "r")) == NULL) {
+        av_log(ctx, AV_LOG_WARNING, "Unable to open IFO file \"%s\": %s\n", p, av_err2str(AVERROR(errno)));
+        return AVERROR_EOF;
+    }
+    if (fread(ifostr, 12, 1, ifo) != 1 || memcmp(ifostr, "DVDVIDEO-VTS", 12)) {
+        av_log(ctx, AV_LOG_WARNING, "\"%s\" is not a proper IFO file\n", p);
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+    if (fseek(ifo, 0xCC, SEEK_SET) == -1) {
+        ret = AVERROR(errno);
+        goto end;
+    }
+    if (fread(&sp_pgci, 4, 1, ifo) == 1) {
+        pgci = av_be2ne32(sp_pgci) * 2048;
+        if (fseek(ifo, pgci + 0x0C, SEEK_SET) == -1) {
+            ret = AVERROR(errno);
+            goto end;
+        }
+        if (fread(&off_pgc, 4, 1, ifo) == 1) {
+            pgc = pgci + av_be2ne32(off_pgc);
+            if (fseek(ifo, pgc + 0xA4, SEEK_SET) == -1) {
+                ret = AVERROR(errno);
+                goto end;
+            }
+            if (fread(yuv, 64, 1, ifo) == 1) {
+                buf = yuv;
+                for(i=0; i<16; i++) {
+                    y  = *++buf;
+                    cr = *++buf;
+                    cb = *++buf;
+                    YUV_TO_RGB1_CCIR(cb, cr);
+                    YUV_TO_RGB2_CCIR(r, g, b, y);
+                    ctx->palette[i] = (r << 16) + (g << 8) + b;
+                    buf++;
+                }
+                ctx->has_palette = 1;
+            }
+        }
+    }
+    if (ctx->has_palette == 0) {
+        av_log(ctx, AV_LOG_WARNING, "Failed to read palette from IFO file \"%s\"\n", p);
+        ret = AVERROR_INVALIDDATA;
+    }
+end:
+    fclose(ifo);
+    return ret;
+}
+
+static int dvdsub_parse_extradata(AVCodecContext *avctx)
+{
+    DVDSubContext *ctx = (DVDSubContext*) avctx->priv_data;
+    char *dataorig, *data;
+    int ret = 1;
 
     if (!avctx->extradata || !avctx->extradata_size)
-        return 0;
+        return 1;
 
-    data = av_malloc(avctx->extradata_size + 1);
+    dataorig = data = av_malloc(avctx->extradata_size+1);
     if (!data)
         return AVERROR(ENOMEM);
     memcpy(data, avctx->extradata, avctx->extradata_size);
     data[avctx->extradata_size] = '\0';
-    cur = data;
-
-    while (*cur) {
-        if (strncmp("palette:", cur, 8) == 0) {
-            int i;
-            char *p = cur + 8;
-            ctx->has_palette = 1;
-            for (i = 0; i < 16; i++) {
-                ctx->palette[i] = strtoul(p, &p, 16);
-                while (*p == ',' || av_isspace(*p))
-                    p++;
-            }
-        } else if (!strncmp("size:", cur, 5)) {
+
+    for(;;) {
+        int pos = strcspn(data, "\n\r");
+        if (pos==0 && *data==0)
+            break;
+
+        if (strncmp("palette:", data, 8) == 0) {
+            parse_palette(ctx, data + 8);
+        } else if (strncmp("size:", data, 5) == 0) {
             int w, h;
-            if (sscanf(cur + 5, "%dx%d", &w, &h) == 2) {
+            if (sscanf(data + 5, "%dx%d", &w, &h) == 2) {
                ret = ff_set_dimensions(avctx, w, h);
                if (ret < 0)
                    goto fail;
             }
         }
-        cur += strcspn(cur, "\n\r");
-        cur += strspn(cur, "\n\r");
+
+        data += pos;
+        data += strspn(data, "\n\r");
     }
 
 fail:
-    av_free(data);
+    av_free(dataorig);
     return ret;
 }
 
+static av_cold int dvdsub_init(AVCodecContext *avctx)
+{
+    DVDSubContext *ctx = avctx->priv_data;
+    int ret;
+
+    if ((ret = dvdsub_parse_extradata(avctx)) < 0)
+        return ret;
+
+    if (ctx->ifo_str)
+        parse_ifo_palette(ctx, ctx->ifo_str);
+    if (ctx->palette_str)
+        parse_palette(ctx, ctx->palette_str);
+    if (ctx->has_palette) {
+        int i;
+        av_log(avctx, AV_LOG_DEBUG, "palette:");
+        for(i=0;i<16;i++)
+            av_log(avctx, AV_LOG_DEBUG, " 0x%06x", ctx->palette[i]);
+        av_log(avctx, AV_LOG_DEBUG, "\n");
+    }
+
+    return 1;
+}
+
+static void dvdsub_flush(AVCodecContext *avctx)
+{
+    DVDSubContext *ctx = avctx->priv_data;
+    ctx->buf_size = 0;
+}
+
+static av_cold int dvdsub_close(AVCodecContext *avctx)
+{
+    dvdsub_flush(avctx);
+    return 0;
+}
+
+#define OFFSET(field) offsetof(DVDSubContext, field)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    { "palette", "set the global palette", OFFSET(palette_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, SD },
+    { "ifo_palette", "obtain the global palette from .IFO file", OFFSET(ifo_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, SD },
+    { "forced_subs_only", "Only show forced subtitles", OFFSET(forced_subs_only), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, SD},
+    { NULL }
+};
+static const AVClass dvdsub_class = {
+    .class_name = "dvdsubdec",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_dvdsub_decoder = {
     .name           = "dvdsub",
     .long_name      = NULL_IF_CONFIG_SMALL("DVD subtitles"),
@@ -557,4 +763,7 @@ AVCodec ff_dvdsub_decoder = {
     .priv_data_size = sizeof(DVDSubContext),
     .init           = dvdsub_init,
     .decode         = dvdsub_decode,
+    .flush          = dvdsub_flush,
+    .close          = dvdsub_close,
+    .priv_class     = &dvdsub_class,
 };
diff --git a/libavcodec/dvdsubenc.c b/libavcodec/dvdsubenc.c
index f62e943e7f..425f0af9ee 100644
--- a/libavcodec/dvdsubenc.c
+++ b/libavcodec/dvdsubenc.c
@@ -2,27 +2,35 @@
  * DVD subtitle encoding
  * Copyright (c) 2005 Wolfram Gloger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
 #include "bytestream.h"
+#include "internal.h"
+#include "libavutil/avassert.h"
+#include "libavutil/bprint.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 
-#undef NDEBUG
-#include <assert.h>
+typedef struct {
+    AVClass *class;
+    uint32_t global_palette[16];
+    int even_rows_fix;
+} DVDSubtitleContext;
 
 // ncnt is the nibble counter
 #define PUTNIBBLE(val)\
@@ -53,7 +61,7 @@ static void dvd_encode_rle(uint8_t **pq,
                 if (bitmap[x+len] != color)
                     break;
             color = cmap[color];
-            assert(color < 4);
+            av_assert0(color < 4);
             if (len < 0x04) {
                 PUTNIBBLE((len << 2)|color);
             } else if (len < 0x10) {
@@ -86,72 +94,265 @@ static void dvd_encode_rle(uint8_t **pq,
     *pq = q;
 }
 
-static int encode_dvd_subtitles(uint8_t *outbuf, int outbuf_size,
+static int color_distance(uint32_t a, uint32_t b)
+{
+    int r = 0, d, i;
+    int alpha_a = 8, alpha_b = 8;
+
+    for (i = 24; i >= 0; i -= 8) {
+        d = alpha_a * (int)((a >> i) & 0xFF) -
+            alpha_b * (int)((b >> i) & 0xFF);
+        r += d * d;
+        alpha_a = a >> 28;
+        alpha_b = b >> 28;
+    }
+    return r;
+}
+
+/**
+ * Count colors used in a rectangle, quantizing alpha and grouping by
+ * nearest global palette entry.
+ */
+static void count_colors(AVCodecContext *avctx, unsigned hits[33],
+                         const AVSubtitleRect *r)
+{
+    DVDSubtitleContext *dvdc = avctx->priv_data;
+    unsigned count[256] = { 0 };
+    uint32_t *palette = (uint32_t *)r->pict.data[1];
+    uint32_t color;
+    int x, y, i, j, match, d, best_d, av_uninit(best_j);
+    uint8_t *p = r->pict.data[0];
+
+    for (y = 0; y < r->h; y++) {
+        for (x = 0; x < r->w; x++)
+            count[*(p++)]++;
+        p += r->pict.linesize[0] - r->w;
+    }
+    for (i = 0; i < 256; i++) {
+        if (!count[i]) /* avoid useless search */
+            continue;
+        color = palette[i];
+        /* 0: transparent, 1-16: semi-transparent, 17-33 opaque */
+        match = color < 0x33000000 ? 0 : color < 0xCC000000 ? 1 : 17;
+        if (match) {
+            best_d = INT_MAX;
+            for (j = 0; j < 16; j++) {
+                d = color_distance(0xFF000000 | color,
+                                   0xFF000000 | dvdc->global_palette[j]);
+                if (d < best_d) {
+                    best_d = d;
+                    best_j = j;
+                }
+            }
+            match += best_j;
+        }
+        hits[match] += count[i];
+    }
+}
+
+static void select_palette(AVCodecContext *avctx, int out_palette[4],
+                           int out_alpha[4], unsigned hits[33])
+{
+    DVDSubtitleContext *dvdc = avctx->priv_data;
+    int i, j, bright, mult;
+    uint32_t color;
+    int selected[4] = { 0 };
+    uint32_t pseudopal[33] = { 0 };
+    uint32_t refcolor[3] = { 0x00000000, 0xFFFFFFFF, 0xFF000000 };
+
+    /* Bonus for transparent: if the rectangle fits tightly the text, the
+       background color can be quite rare, but it would be ugly without it */
+    hits[0] *= 16;
+    /* Bonus for bright colors */
+    for (i = 0; i < 16; i++) {
+        if (!(hits[1 + i] + hits[17 + i]))
+            continue; /* skip unused colors to gain time */
+        color = dvdc->global_palette[i];
+        bright = 0;
+        for (j = 0; j < 3; j++, color >>= 8)
+            bright += (color & 0xFF) < 0x40 || (color & 0xFF) >= 0xC0;
+        mult = 2 + FFMIN(bright, 2);
+        hits[ 1 + i] *= mult;
+        hits[17 + i] *= mult;
+    }
+
+    /* Select four most frequent colors */
+    for (i = 0; i < 4; i++) {
+        for (j = 0; j < 33; j++)
+            if (hits[j] > hits[selected[i]])
+                selected[i] = j;
+        hits[selected[i]] = 0;
+    }
+
+    /* Order the colors like in most DVDs:
+       0: background, 1: foreground, 2: outline */
+    for (i = 0; i < 16; i++) {
+        pseudopal[ 1 + i] = 0x80000000 | dvdc->global_palette[i];
+        pseudopal[17 + i] = 0xFF000000 | dvdc->global_palette[i];
+    }
+    for (i = 0; i < 3; i++) {
+        int best_d = color_distance(refcolor[i], pseudopal[selected[i]]);
+        for (j = i + 1; j < 4; j++) {
+            int d = color_distance(refcolor[i], pseudopal[selected[j]]);
+            if (d < best_d) {
+                FFSWAP(int, selected[i], selected[j]);
+                best_d = d;
+            }
+        }
+    }
+
+    /* Output */
+    for (i = 0; i < 4; i++) {
+        out_palette[i] = selected[i] ? (selected[i] - 1) & 0xF : 0;
+        out_alpha  [i] = !selected[i] ? 0 : selected[i] < 17 ? 0x80 : 0xFF;
+    }
+}
+
+static void build_color_map(AVCodecContext *avctx, int cmap[],
+                            const uint32_t palette[],
+                            const int out_palette[], unsigned int const out_alpha[])
+{
+    DVDSubtitleContext *dvdc = avctx->priv_data;
+    int i, j, d, best_d;
+    uint32_t pseudopal[4];
+
+    for (i = 0; i < 4; i++)
+        pseudopal[i] = (out_alpha[i] << 24) |
+                       dvdc->global_palette[out_palette[i]];
+    for (i = 0; i < 256; i++) {
+        best_d = INT_MAX;
+        for (j = 0; j < 4; j++) {
+            d = color_distance(pseudopal[j], palette[i]);
+            if (d < best_d) {
+                cmap[i] = j;
+                best_d = d;
+            }
+        }
+    }
+}
+
+static void copy_rectangle(AVSubtitleRect *dst, AVSubtitleRect *src, int cmap[])
+{
+    int x, y;
+    uint8_t *p, *q;
+
+    p = src->pict.data[0];
+    q = dst->pict.data[0] + (src->x - dst->x) +
+                            (src->y - dst->y) * dst->pict.linesize[0];
+    for (y = 0; y < src->h; y++) {
+        for (x = 0; x < src->w; x++)
+            *(q++) = cmap[*(p++)];
+        p += src->pict.linesize[0] - src->w;
+        q += dst->pict.linesize[0] - src->w;
+    }
+}
+
+static int encode_dvd_subtitles(AVCodecContext *avctx,
+                                uint8_t *outbuf, int outbuf_size,
                                 const AVSubtitle *h)
 {
+    DVDSubtitleContext *dvdc = avctx->priv_data;
     uint8_t *q, *qq;
-    int object_id;
-    int offset1[20], offset2[20];
-    int i, imax, color, alpha, rects = h->num_rects;
-    unsigned long hmax;
-    unsigned long hist[256];
-    int           cmap[256];
+    int offset1, offset2;
+    int i, rects = h->num_rects, ret;
+    unsigned global_palette_hits[33] = { 0 };
+    int cmap[256];
+    int out_palette[4];
+    int out_alpha[4];
+    AVSubtitleRect vrect;
+    uint8_t *vrect_data = NULL;
+    int x2, y2;
+    int forced = 0;
 
     if (rects == 0 || !h->rects)
-        return -1;
-    if (rects > 20)
-        rects = 20;
-
-    // analyze bitmaps, compress to 4 colors
-    for (i=0; i<256; ++i) {
-        hist[i] = 0;
-        cmap[i] = 0;
-    }
-    for (object_id = 0; object_id < rects; object_id++)
-        for (i=0; i<h->rects[object_id]->w*h->rects[object_id]->h; ++i) {
-            color = h->rects[object_id]->pict.data[0][i];
-            // only count non-transparent pixels
-            alpha = ((uint32_t*)h->rects[object_id]->pict.data[1])[color] >> 24;
-            hist[color] += alpha;
+        return AVERROR(EINVAL);
+    for (i = 0; i < rects; i++)
+        if (h->rects[i]->type != SUBTITLE_BITMAP) {
+            av_log(avctx, AV_LOG_ERROR, "Bitmap subtitle required\n");
+            return AVERROR(EINVAL);
         }
-    for (color=3;; --color) {
-        hmax = 0;
-        imax = 0;
-        for (i=0; i<256; ++i)
-            if (hist[i] > hmax) {
-                imax = i;
-                hmax = hist[i];
-            }
-        if (hmax == 0)
+    /* Mark this subtitle forced if any of the rectangles is forced. */
+    for (i = 0; i < rects; i++)
+        if ((h->rects[i]->flags & AV_SUBTITLE_FLAG_FORCED) != 0) {
+            forced = 1;
             break;
-        if (color == 0)
-            color = 3;
-        av_log(NULL, AV_LOG_DEBUG, "dvd_subtitle hist[%d]=%ld -> col %d\n",
-               imax, hist[imax], color);
-        cmap[imax] = color;
-        hist[imax] = 0;
+        }
+    vrect = *h->rects[0];
+
+    if (rects > 1) {
+        /* DVD subtitles can have only one rectangle: build a virtual
+           rectangle containing all actual rectangles.
+           The data of the rectangles will be copied later, when the palette
+           is decided, because the rectangles may have different palettes. */
+        int xmin = h->rects[0]->x, xmax = xmin + h->rects[0]->w;
+        int ymin = h->rects[0]->y, ymax = ymin + h->rects[0]->h;
+        for (i = 1; i < rects; i++) {
+            xmin = FFMIN(xmin, h->rects[i]->x);
+            ymin = FFMIN(ymin, h->rects[i]->y);
+            xmax = FFMAX(xmax, h->rects[i]->x + h->rects[i]->w);
+            ymax = FFMAX(ymax, h->rects[i]->y + h->rects[i]->h);
+        }
+        vrect.x = xmin;
+        vrect.y = ymin;
+        vrect.w = xmax - xmin;
+        vrect.h = ymax - ymin;
+        if ((ret = av_image_check_size(vrect.w, vrect.h, 0, avctx)) < 0)
+            return ret;
+
+        /* Count pixels outside the virtual rectangle as transparent */
+        global_palette_hits[0] = vrect.w * vrect.h;
+        for (i = 0; i < rects; i++)
+            global_palette_hits[0] -= h->rects[i]->w * h->rects[i]->h;
     }
 
+    for (i = 0; i < rects; i++)
+        count_colors(avctx, global_palette_hits, h->rects[i]);
+    select_palette(avctx, out_palette, out_alpha, global_palette_hits);
+
+    if (rects > 1) {
+        if (!(vrect_data = av_calloc(vrect.w, vrect.h)))
+            return AVERROR(ENOMEM);
+        vrect.pict.data    [0] = vrect_data;
+        vrect.pict.linesize[0] = vrect.w;
+        for (i = 0; i < rects; i++) {
+            build_color_map(avctx, cmap, (uint32_t *)h->rects[i]->pict.data[1],
+                            out_palette, out_alpha);
+            copy_rectangle(&vrect, h->rects[i], cmap);
+        }
+        for (i = 0; i < 4; i++)
+            cmap[i] = i;
+    } else {
+        build_color_map(avctx, cmap, (uint32_t *)h->rects[0]->pict.data[1],
+                        out_palette, out_alpha);
+    }
+
+    av_log(avctx, AV_LOG_DEBUG, "Selected palette:");
+    for (i = 0; i < 4; i++)
+        av_log(avctx, AV_LOG_DEBUG, " 0x%06x@@%02x (0x%x,0x%x)",
+               dvdc->global_palette[out_palette[i]], out_alpha[i],
+               out_palette[i], out_alpha[i] >> 4);
+    av_log(avctx, AV_LOG_DEBUG, "\n");
 
     // encode data block
     q = outbuf + 4;
-    for (object_id = 0; object_id < rects; object_id++) {
-        offset1[object_id] = q - outbuf;
-        // worst case memory requirement: 1 nibble per pixel..
-        if ((q - outbuf) + h->rects[object_id]->w*h->rects[object_id]->h/2
-            + 17*rects + 21 > outbuf_size) {
-            av_log(NULL, AV_LOG_ERROR, "dvd_subtitle too big\n");
-            return -1;
-        }
-        dvd_encode_rle(&q, h->rects[object_id]->pict.data[0],
-                       h->rects[object_id]->w*2,
-                       h->rects[object_id]->w, h->rects[object_id]->h >> 1,
-                       cmap);
-        offset2[object_id] = q - outbuf;
-        dvd_encode_rle(&q, h->rects[object_id]->pict.data[0] + h->rects[object_id]->w,
-                       h->rects[object_id]->w*2,
-                       h->rects[object_id]->w, h->rects[object_id]->h >> 1,
-                       cmap);
+    offset1 = q - outbuf;
+    // worst case memory requirement: 1 nibble per pixel..
+    if ((q - outbuf) + vrect.w * vrect.h / 2 + 17 + 21 > outbuf_size) {
+        av_log(NULL, AV_LOG_ERROR, "dvd_subtitle too big\n");
+        ret = AVERROR_BUFFER_TOO_SMALL;
+        goto fail;
+    }
+    dvd_encode_rle(&q, vrect.pict.data[0], vrect.w * 2,
+                   vrect.w, (vrect.h + 1) >> 1, cmap);
+    offset2 = q - outbuf;
+    dvd_encode_rle(&q, vrect.pict.data[0] + vrect.w, vrect.w * 2,
+                   vrect.w, vrect.h >> 1, cmap);
+
+    if (dvdc->even_rows_fix && (vrect.h & 1)) {
+        // Work-around for some players that want the height to be even.
+        vrect.h++;
+        *q++ = 0x00; // 0x00 0x00 == empty row, i.e. fully transparent
+        *q++ = 0x00;
     }
 
     // set data packet size
@@ -160,35 +361,34 @@ static int encode_dvd_subtitles(uint8_t *outbuf, int outbuf_size,
 
     // send start display command
     bytestream_put_be16(&q, (h->start_display_time*90) >> 10);
-    bytestream_put_be16(&q, (q - outbuf) /*- 2 */ + 8 + 12*rects + 2);
+    bytestream_put_be16(&q, (q - outbuf) /*- 2 */ + 8 + 12 + 2);
     *q++ = 0x03; // palette - 4 nibbles
-    *q++ = 0x03; *q++ = 0x7f;
+    *q++ = (out_palette[3] << 4) | out_palette[2];
+    *q++ = (out_palette[1] << 4) | out_palette[0];
     *q++ = 0x04; // alpha - 4 nibbles
-    *q++ = 0xf0; *q++ = 0x00;
-    //*q++ = 0x0f; *q++ = 0xff;
+    *q++ = (out_alpha[3] & 0xF0) | (out_alpha[2] >> 4);
+    *q++ = (out_alpha[1] & 0xF0) | (out_alpha[0] >> 4);
 
-    // XXX not sure if more than one rect can really be encoded..
     // 12 bytes per rect
-    for (object_id = 0; object_id < rects; object_id++) {
-        int x2 = h->rects[object_id]->x + h->rects[object_id]->w - 1;
-        int y2 = h->rects[object_id]->y + h->rects[object_id]->h - 1;
-
-        *q++ = 0x05;
-        // x1 x2 -> 6 nibbles
-        *q++ = h->rects[object_id]->x >> 4;
-        *q++ = (h->rects[object_id]->x << 4) | ((x2 >> 8) & 0xf);
-        *q++ = x2;
-        // y1 y2 -> 6 nibbles
-        *q++ = h->rects[object_id]->y >> 4;
-        *q++ = (h->rects[object_id]->y << 4) | ((y2 >> 8) & 0xf);
-        *q++ = y2;
-
-        *q++ = 0x06;
-        // offset1, offset2
-        bytestream_put_be16(&q, offset1[object_id]);
-        bytestream_put_be16(&q, offset2[object_id]);
-    }
-    *q++ = 0x01; // start command
+    x2 = vrect.x + vrect.w - 1;
+    y2 = vrect.y + vrect.h - 1;
+
+    *q++ = 0x05;
+    // x1 x2 -> 6 nibbles
+    *q++ = vrect.x >> 4;
+    *q++ = (vrect.x << 4) | ((x2 >> 8) & 0xf);
+    *q++ = x2;
+    // y1 y2 -> 6 nibbles
+    *q++ = vrect.y >> 4;
+    *q++ = (vrect.y << 4) | ((y2 >> 8) & 0xf);
+    *q++ = y2;
+
+    *q++ = 0x06;
+    // offset1, offset2
+    bytestream_put_be16(&q, offset1);
+    bytestream_put_be16(&q, offset2);
+
+    *q++ = forced ? 0x00 : 0x01; // start command
     *q++ = 0xff; // terminating command
 
     // send stop display command last
@@ -200,8 +400,42 @@ static int encode_dvd_subtitles(uint8_t *outbuf, int outbuf_size,
     qq = outbuf;
     bytestream_put_be16(&qq, q - outbuf);
 
-    av_log(NULL, AV_LOG_DEBUG, "subtitle_packet size=%td\n", q - outbuf);
-    return q - outbuf;
+    av_log(NULL, AV_LOG_DEBUG, "subtitle_packet size=%"PTRDIFF_SPECIFIER"\n", q - outbuf);
+    ret = q - outbuf;
+
+fail:
+    av_free(vrect_data);
+    return ret;
+}
+
+static int dvdsub_init(AVCodecContext *avctx)
+{
+    DVDSubtitleContext *dvdc = avctx->priv_data;
+    static const uint32_t default_palette[16] = {
+        0x000000, 0x0000FF, 0x00FF00, 0xFF0000,
+        0xFFFF00, 0xFF00FF, 0x00FFFF, 0xFFFFFF,
+        0x808000, 0x8080FF, 0x800080, 0x80FF80,
+        0x008080, 0xFF8080, 0x555555, 0xAAAAAA,
+    };
+    AVBPrint extradata;
+    int i, ret;
+
+    av_assert0(sizeof(dvdc->global_palette) == sizeof(default_palette));
+    memcpy(dvdc->global_palette, default_palette, sizeof(dvdc->global_palette));
+
+    av_bprint_init(&extradata, 0, 1);
+    if (avctx->width && avctx->height)
+        av_bprintf(&extradata, "size: %dx%d\n", avctx->width, avctx->height);
+    av_bprintf(&extradata, "palette:");
+    for (i = 0; i < 16; i++)
+        av_bprintf(&extradata, " %06"PRIx32"%c",
+                   dvdc->global_palette[i] & 0xFFFFFF, i < 15 ? ',' : '\n');
+
+    ret = avpriv_bprint_to_extradata(avctx, &extradata);
+    if (ret < 0)
+        return ret;
+
+    return 0;
 }
 
 static int dvdsub_encode(AVCodecContext *avctx,
@@ -211,14 +445,31 @@ static int dvdsub_encode(AVCodecContext *avctx,
     //DVDSubtitleContext *s = avctx->priv_data;
     int ret;
 
-    ret = encode_dvd_subtitles(buf, buf_size, sub);
+    ret = encode_dvd_subtitles(avctx, buf, buf_size, sub);
     return ret;
 }
 
+#define OFFSET(x) offsetof(DVDSubtitleContext, x)
+#define SE AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    {"even_rows_fix", "Make number of rows even (workaround for some players)", OFFSET(even_rows_fix), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, SE},
+    { NULL },
+};
+
+static const AVClass dvdsubenc_class = {
+    .class_name = "VOBSUB subtitle encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_dvdsub_encoder = {
     .name           = "dvdsub",
     .long_name      = NULL_IF_CONFIG_SMALL("DVD subtitles"),
     .type           = AVMEDIA_TYPE_SUBTITLE,
     .id             = AV_CODEC_ID_DVD_SUBTITLE,
+    .init           = dvdsub_init,
     .encode_sub     = dvdsub_encode,
+    .priv_class     = &dvdsubenc_class,
+    .priv_data_size = sizeof(DVDSubtitleContext),
 };
diff --git a/libavcodec/dvenc.c b/libavcodec/dvenc.c
index 85e2741082..5de12cc843 100644
--- a/libavcodec/dvenc.c
+++ b/libavcodec/dvenc.c
@@ -2,20 +2,20 @@
  * DV encoder
  * Copyright (c) 2003 Roman Shaposhnik
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,7 +47,7 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
     PixblockDSPContext pdsp;
     int ret;
 
-    s->sys = av_dv_codec_profile(avctx->width, avctx->height, avctx->pix_fmt);
+    s->sys = av_dv_codec_profile2(avctx->width, avctx->height, avctx->pix_fmt, avctx->time_base);
     if (!s->sys) {
         av_log(avctx, AV_LOG_ERROR, "Found no DV profile for %ix%i %s video. "
                                     "Valid DV profiles are:\n",
@@ -55,6 +55,10 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
         ff_dv_print_profiles(avctx, AV_LOG_ERROR);
         return AVERROR(EINVAL);
     }
+    if (avctx->height > 576) {
+        av_log(avctx, AV_LOG_ERROR, "DVCPRO HD encoding is not supported.\n");
+        return AVERROR_PATCHWELCOME;
+    }
     ret = ff_dv_init_dynamic_tables(s, s->sys);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error initializing work tables.\n");
@@ -63,6 +67,9 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
 
     dv_vlc_map_tableinit();
 
+    memset(&fdsp,0, sizeof(fdsp));
+    memset(&mecc,0, sizeof(mecc));
+    memset(&pdsp,0, sizeof(pdsp));
     ff_fdctdsp_init(&fdsp, avctx);
     ff_me_cmp_init(&mecc, avctx);
     ff_pixblockdsp_init(&pdsp, avctx);
@@ -165,7 +172,7 @@ static av_always_inline PutBitContext *dv_encode_ac(EncBlockInfo *bi,
             if (bits_left) {
                 size -= bits_left;
                 put_bits(pb, bits_left, vlc >> size);
-                vlc = vlc & ((1 << size) - 1);
+                vlc = av_mod_uintp2(vlc, size);
             }
             if (pb + 1 >= pb_end) {
                 bi->partial_bit_count  = size;
@@ -221,14 +228,14 @@ static const int dv_weight_88[64] = {
     170627, 165371, 160727, 153560, 160727, 144651, 144651, 136258,
 };
 static const int dv_weight_248[64] = {
-    131072, 242189, 257107, 237536, 229376, 200636, 242189, 223754,
-    224969, 196781, 262144, 242189, 229376, 200636, 257107, 237536,
-    211916, 185364, 235923, 217965, 229376, 211916, 206433, 180568,
-    242189, 223754, 224969, 196781, 211916, 185364, 235923, 217965,
-    200704, 175557, 222935, 205965, 200636, 185364, 195068, 170627,
-    229376, 211916, 206433, 180568, 200704, 175557, 222935, 205965,
-    175557, 153560, 188995, 174609, 165371, 144651, 200636, 185364,
-    195068, 170627, 175557, 153560, 188995, 174609, 165371, 144651,
+    131072, 262144, 257107, 257107, 242189, 242189, 242189, 242189,
+    237536, 237536, 229376, 229376, 200636, 200636, 224973, 224973,
+    223754, 223754, 235923, 235923, 229376, 229376, 217965, 217965,
+    211916, 211916, 196781, 196781, 185364, 185364, 206433, 206433,
+    211916, 211916, 222935, 222935, 200636, 200636, 205964, 205964,
+    200704, 200704, 180568, 180568, 175557, 175557, 195068, 195068,
+    185364, 185364, 188995, 188995, 174606, 174606, 175557, 175557,
+    170627, 170627, 153560, 153560, 165371, 165371, 144651, 144651,
 };
 
 static av_always_inline int dv_init_enc_block(EncBlockInfo *bi, uint8_t *data,
@@ -243,7 +250,7 @@ static av_always_inline int dv_init_enc_block(EncBlockInfo *bi, uint8_t *data,
      * method suggested in SMPTE 314M Table 22, and an improved
      * method. The SMPTE method is very conservative; it assigns class
      * 3 (i.e. severe quantization) to any block where the largest AC
-     * component is greater than 36. Libav's DV encoder tracks AC bit
+     * component is greater than 36. FFmpeg's DV encoder tracks AC bit
      * consumption precisely, so there is no need to bias most blocks
      * towards strongly lossy compression. Instead, we assign class 2
      * to most blocks, and use class 3 only when strictly necessary
@@ -251,13 +258,13 @@ static av_always_inline int dv_init_enc_block(EncBlockInfo *bi, uint8_t *data,
 
 #if 0 /* SMPTE spec method */
     static const int classes[] = { 12, 24, 36, 0xffff };
-#else /* improved Libav method */
+#else /* improved FFmpeg method */
     static const int classes[] = { -1, -1, 255, 0xffff };
 #endif
     int max  = classes[0];
     int prev = 0;
 
-    assert((((int) blk) & 15) == 0);
+    av_assert2((((int) blk) & 15) == 0);
 
     bi->area_q[0]          =
     bi->area_q[1]          =
@@ -290,7 +297,7 @@ static av_always_inline int dv_init_enc_block(EncBlockInfo *bi, uint8_t *data,
 
             if (level + 15 > 30U) {
                 bi->sign[i] = (level >> 31) & 1;
-                /* Weight it and and shift down into range, adding for rounding.
+                /* Weight it and shift down into range, adding for rounding.
                  * The extra division by a factor of 2^4 reverses the 8x
                  * expansion of the DCT AND the 2x doubling of the weights. */
                 level     = (FFABS(level) * weight[i] + (1 << (dv_weight_bits + 3))) >>
@@ -359,7 +366,7 @@ static inline void dv_guess_qnos(EncBlockInfo *blks, int *qnos)
                         b->bit_size[a] = 1; // 4 areas 4 bits for EOB :)
                         b->area_q[a]++;
                         prev = b->prev[a];
-                        assert(b->next[prev] >= mb_area_start[a + 1] || b->mb[prev]);
+                        av_assert2(b->next[prev] >= mb_area_start[a + 1] || b->mb[prev]);
                         for (k = b->next[prev]; k < mb_area_start[a + 1]; k = b->next[k]) {
                             b->mb[k] >>= 1;
                             if (b->mb[k]) {
@@ -369,11 +376,11 @@ static inline void dv_guess_qnos(EncBlockInfo *blks, int *qnos)
                                 if (b->next[k] >= mb_area_start[a + 1] && b->next[k] < 64) {
                                     for (a2 = a + 1; b->next[k] >= mb_area_start[a2 + 1]; a2++)
                                         b->prev[a2] = prev;
-                                    assert(a2 < 4);
-                                    assert(b->mb[b->next[k]]);
+                                    av_assert2(a2 < 4);
+                                    av_assert2(b->mb[b->next[k]]);
                                     b->bit_size[a2] += dv_rl2vlc_size(b->next[k] - prev - 1, b->mb[b->next[k]]) -
                                                        dv_rl2vlc_size(b->next[k] - k    - 1, b->mb[b->next[k]]);
-                                    assert(b->prev[a2] == k && (a2 + 1 >= 4 || b->prev[a2 + 1] != k));
+                                    av_assert2(b->prev[a2] == k && (a2 + 1 >= 4 || b->prev[a2 + 1] != k));
                                     b->prev[a2] = prev;
                                 }
                                 b->next[prev] = b->next[k];
@@ -567,6 +574,7 @@ static inline int dv_write_pack(enum dv_pack_type pack_id, DVVideoContext *c,
      *      compression scheme (if any).
      */
     int apt = (c->sys->pix_fmt == AV_PIX_FMT_YUV420P ? 0 : 1);
+    int fs  = c->frame->top_field_first ? 0x00 : 0x40;
 
     uint8_t aspect = 0;
     if ((int) (av_q2d(c->avctx->sample_aspect_ratio) *
@@ -606,7 +614,7 @@ static inline int dv_write_pack(enum dv_pack_type pack_id, DVVideoContext *c,
         buf[2] = 0xc8 |        /* reserved -- always b11001xxx */
                  aspect;
         buf[3] = (1 << 7) |    /* frame/field flag 1 -- frame, 0 -- field */
-                 (1 << 6) |    /* first/second field flag 0 -- field 2, 1 -- field 1 */
+                 fs       |    /* first/second field flag 0 -- field 2, 1 -- field 1 */
                  (1 << 5) |    /* frame change flag 0 -- same picture as before, 1 -- different */
                  (1 << 4) |    /* 1 - interlaced, 0 - noninterlaced */
                  0xc;          /* reserved -- always b1100 */
@@ -709,10 +717,8 @@ static int dvvideo_encode_frame(AVCodecContext *c, AVPacket *pkt,
     DVVideoContext *s = c->priv_data;
     int ret;
 
-    if ((ret = ff_alloc_packet(pkt, s->sys->frame_size)) < 0) {
-        av_log(c, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(c, pkt, s->sys->frame_size, 0)) < 0)
         return ret;
-    }
 
     c->pix_fmt                = s->sys->pix_fmt;
     s->frame                  = frame;
@@ -745,7 +751,7 @@ AVCodec ff_dvvideo_encoder = {
     .priv_data_size = sizeof(DVVideoContext),
     .init           = dvvideo_encode_init,
     .encode2        = dvvideo_encode_frame,
-    .capabilities   = AV_CODEC_CAP_SLICE_THREADS,
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUV422P,
         AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE
diff --git a/libavcodec/dxa.c b/libavcodec/dxa.c
index b80493580d..f6edc03e1a 100644
--- a/libavcodec/dxa.c
+++ b/libavcodec/dxa.c
@@ -2,20 +2,20 @@
  * Feeble Files/ScummVM DXA decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,7 @@
 
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
+#include "bytestream.h"
 #include "avcodec.h"
 #include "internal.h"
 
@@ -41,6 +42,7 @@ typedef struct DxaDecContext {
     AVFrame *prev;
 
     int dsize;
+#define DECOMP_BUF_PADDING 16
     uint8_t *decomp_buf;
     uint32_t pal[256];
 } DxaDecContext;
@@ -49,13 +51,17 @@ static const int shift1[6] = { 0, 8, 8, 8, 4, 4 };
 static const int shift2[6] = { 0, 0, 8, 4, 0, 4 };
 
 static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
-                     int stride, uint8_t *src, uint8_t *ref)
+                     int stride, uint8_t *src, int srcsize, uint8_t *ref)
 {
     uint8_t *code, *data, *mv, *msk, *tmp, *tmp2;
+    uint8_t *src_end = src + srcsize;
     int i, j, k;
     int type, x, y, d, d2;
     uint32_t mask;
 
+    if (12ULL  + ((avctx->width * avctx->height) >> 4) + AV_RB32(src + 0) + AV_RB32(src + 4) > srcsize)
+        return AVERROR_INVALIDDATA;
+
     code = src  + 12;
     data = code + ((avctx->width * avctx->height) >> 4);
     mv   = data + AV_RB32(src + 0);
@@ -63,6 +69,8 @@ static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
 
     for(j = 0; j < avctx->height; j += 4){
         for(i = 0; i < avctx->width; i += 4){
+            if (data > src_end || mv > src_end || msk > src_end)
+                return AVERROR_INVALIDDATA;
             tmp  = dst + i;
             tmp2 = ref + i;
             type = *code++;
@@ -70,6 +78,11 @@ static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
             case 4: // motion compensation
                 x = (*mv) >> 4;    if(x & 8) x = 8 - x;
                 y = (*mv++) & 0xF; if(y & 8) y = 8 - y;
+                if (i < -x || avctx->width  - i - 4 < x ||
+                    j < -y || avctx->height - j - 4 < y) {
+                    av_log(avctx, AV_LOG_ERROR, "MV %d %d out of bounds\n", x,y);
+                    return AVERROR_INVALIDDATA;
+                }
                 tmp2 += x + y*stride;
             case 0: // skip
             case 5: // skip in method 12
@@ -127,6 +140,11 @@ static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
                     case 0x80: // motion compensation
                         x = (*mv) >> 4;    if(x & 8) x = 8 - x;
                         y = (*mv++) & 0xF; if(y & 8) y = 8 - y;
+                        if (i + 2*(k & 1) < -x || avctx->width  - i - 2*(k & 1) - 2 < x ||
+                            j +   (k & 2) < -y || avctx->height - j -   (k & 2) - 2 < y) {
+                            av_log(avctx, AV_LOG_ERROR, "MV %d %d out of bounds\n", x,y);
+                            return AVERROR_INVALIDDATA;
+                        }
                         tmp2 += x + y*stride;
                     case 0x00: // skip
                         tmp[d + 0         ] = tmp2[0];
@@ -192,35 +210,27 @@ static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
 {
     AVFrame *frame = data;
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
     DxaDecContext * const c = avctx->priv_data;
     uint8_t *outptr, *srcptr, *tmpptr;
     unsigned long dsize;
     int i, j, compr, ret;
     int stride;
-    int orig_buf_size = buf_size;
     int pc = 0;
+    GetByteContext gb;
 
-    /* make the palette available on the way out */
-    if(buf[0]=='C' && buf[1]=='M' && buf[2]=='A' && buf[3]=='P'){
-        int r, g, b;
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
 
-        buf += 4;
+    /* make the palette available on the way out */
+    if (bytestream2_peek_le32(&gb) == MKTAG('C','M','A','P')) {
+        bytestream2_skip(&gb, 4);
         for(i = 0; i < 256; i++){
-            r = *buf++;
-            g = *buf++;
-            b = *buf++;
-            c->pal[i] = (r << 16) | (g << 8) | b;
+            c->pal[i] = 0xFFU << 24 | bytestream2_get_be24(&gb);
         }
         pc = 1;
-        buf_size -= 768+4;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
     memcpy(frame->data[1], c->pal, AVPALETTE_SIZE);
     frame->palette_has_changed = pc;
 
@@ -229,16 +239,25 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     tmpptr = c->prev->data[0];
     stride = frame->linesize[0];
 
-    if(buf[0]=='N' && buf[1]=='U' && buf[2]=='L' && buf[3]=='L')
+    if (bytestream2_get_le32(&gb) == MKTAG('N','U','L','L'))
         compr = -1;
     else
-        compr = buf[4];
+        compr = bytestream2_get_byte(&gb);
 
     dsize = c->dsize;
-    if((compr != 4 && compr != -1) && uncompress(c->decomp_buf, &dsize, buf + 9, buf_size - 9) != Z_OK){
-        av_log(avctx, AV_LOG_ERROR, "Uncompress failed!\n");
-        return AVERROR_UNKNOWN;
+    if (compr != 4 && compr != -1) {
+        bytestream2_skip(&gb, 4);
+        if (uncompress(c->decomp_buf, &dsize, avpkt->data + bytestream2_tell(&gb),
+                       bytestream2_get_bytes_left(&gb)) != Z_OK) {
+            av_log(avctx, AV_LOG_ERROR, "Uncompress failed!\n");
+            return AVERROR_UNKNOWN;
+        }
+        memset(c->decomp_buf + dsize, 0, DECOMP_BUF_PADDING);
     }
+
+    if (avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(avctx, AV_LOG_DEBUG, "compr:%2d, dsize:%d\n", compr, (int)dsize);
+
     switch(compr){
     case -1:
         frame->key_frame = 0;
@@ -265,14 +284,18 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     case 5:
         if (!tmpptr) {
             av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n");
-            return AVERROR_INVALIDDATA;
+            if (!(avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL))
+                return AVERROR_INVALIDDATA;
         }
         frame->key_frame = 0;
         frame->pict_type = AV_PICTURE_TYPE_P;
         for (j = 0; j < avctx->height; j++) {
-            for (i = 0; i < avctx->width; i++)
-                outptr[i] = srcptr[i] ^ tmpptr[i];
-            tmpptr += stride;
+            if(tmpptr){
+                for(i = 0; i < avctx->width; i++)
+                    outptr[i] = srcptr[i] ^ tmpptr[i];
+                tmpptr += stride;
+            }else
+                memcpy(outptr, srcptr, avctx->width);
             outptr += stride;
             srcptr += avctx->width;
         }
@@ -281,10 +304,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     case 13:
         frame->key_frame = 0;
         frame->pict_type = AV_PICTURE_TYPE_P;
-        decode_13(avctx, c, frame->data[0], frame->linesize[0], srcptr, c->prev->data[0]);
+        if (!c->prev->data[0]) {
+            av_log(avctx, AV_LOG_ERROR, "Missing reference frame\n");
+            return AVERROR_INVALIDDATA;
+        }
+        decode_13(avctx, c, frame->data[0], frame->linesize[0], srcptr, dsize, c->prev->data[0]);
         break;
     default:
-        av_log(avctx, AV_LOG_ERROR, "Unknown/unsupported compression type %d\n", buf[4]);
+        av_log(avctx, AV_LOG_ERROR, "Unknown/unsupported compression type %d\n", compr);
         return AVERROR_INVALIDDATA;
     }
 
@@ -295,13 +322,18 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     *got_frame = 1;
 
     /* always report that the buffer was completely consumed */
-    return orig_buf_size;
+    return avpkt->size;
 }
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     DxaDecContext * const c = avctx->priv_data;
 
+    if (avctx->width%4 || avctx->height%4) {
+        avpriv_request_sample(avctx, "dimensions are not a multiple of 4");
+        return AVERROR_INVALIDDATA;
+    }
+
     c->prev = av_frame_alloc();
     if (!c->prev)
         return AVERROR(ENOMEM);
@@ -309,7 +341,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
     c->dsize = avctx->width * avctx->height * 2;
-    if (!(c->decomp_buf = av_malloc(c->dsize))) {
+    c->decomp_buf = av_malloc(c->dsize + DECOMP_BUF_PADDING);
+    if (!c->decomp_buf) {
+        av_frame_free(&c->prev);
         av_log(avctx, AV_LOG_ERROR, "Can't allocate decompression buffer.\n");
         return AVERROR(ENOMEM);
     }
diff --git a/libavcodec/dxtory.c b/libavcodec/dxtory.c
index 0a6f331c4b..6f68c10889 100644
--- a/libavcodec/dxtory.c
+++ b/libavcodec/dxtory.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,7 +39,7 @@ static int dxtory_decode_v1_rgb(AVCodecContext *avctx, AVFrame *pic,
     uint8_t *dst;
     int ret;
 
-    if (src_size < avctx->width * avctx->height * bpp) {
+    if (src_size < avctx->width * avctx->height * (int64_t)bpp) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
         return AVERROR_INVALIDDATA;
     }
@@ -65,7 +65,7 @@ static int dxtory_decode_v1_410(AVCodecContext *avctx, AVFrame *pic,
     uint8_t *Y1, *Y2, *Y3, *Y4, *U, *V;
     int ret;
 
-    if (src_size < avctx->width * avctx->height * 18 / 16) {
+    if (src_size < avctx->width * avctx->height * 9LL / 8) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
         return AVERROR_INVALIDDATA;
     }
@@ -82,10 +82,10 @@ static int dxtory_decode_v1_410(AVCodecContext *avctx, AVFrame *pic,
     V  = pic->data[2];
     for (h = 0; h < avctx->height; h += 4) {
         for (w = 0; w < avctx->width; w += 4) {
-            AV_COPY32(Y1 + w, src);
-            AV_COPY32(Y2 + w, src + 4);
-            AV_COPY32(Y3 + w, src + 8);
-            AV_COPY32(Y4 + w, src + 12);
+            AV_COPY32U(Y1 + w, src);
+            AV_COPY32U(Y2 + w, src + 4);
+            AV_COPY32U(Y3 + w, src + 8);
+            AV_COPY32U(Y4 + w, src + 12);
             U[w >> 2] = src[16] + 0x80;
             V[w >> 2] = src[17] + 0x80;
             src += 18;
@@ -108,7 +108,7 @@ static int dxtory_decode_v1_420(AVCodecContext *avctx, AVFrame *pic,
     uint8_t *Y1, *Y2, *U, *V;
     int ret;
 
-    if (src_size < avctx->width * avctx->height * 3 / 2) {
+    if (src_size < avctx->width * avctx->height * 3LL / 2) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
         return AVERROR_INVALIDDATA;
     }
@@ -145,7 +145,7 @@ static int dxtory_decode_v1_444(AVCodecContext *avctx, AVFrame *pic,
     uint8_t *Y, *U, *V;
     int ret;
 
-    if (src_size < avctx->width * avctx->height * 3) {
+    if (src_size < avctx->width * avctx->height * 3LL) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
         return AVERROR_INVALIDDATA;
     }
@@ -171,9 +171,9 @@ static int dxtory_decode_v1_444(AVCodecContext *avctx, AVFrame *pic,
     return 0;
 }
 
-const uint8_t def_lru[8] = { 0x00, 0x20, 0x40, 0x60, 0x80, 0xA0, 0xC0, 0xFF };
-const uint8_t def_lru_555[8] = { 0x00, 0x08, 0x10, 0x18, 0x1F };
-const uint8_t def_lru_565[8] = { 0x00, 0x08, 0x10, 0x20, 0x30, 0x3F };
+static const uint8_t def_lru[8] = { 0x00, 0x20, 0x40, 0x60, 0x80, 0xA0, 0xC0, 0xFF };
+static const uint8_t def_lru_555[8] = { 0x00, 0x08, 0x10, 0x18, 0x1F };
+static const uint8_t def_lru_565[8] = { 0x00, 0x08, 0x10, 0x20, 0x30, 0x3F };
 
 static inline uint8_t decode_sym(GetBitContext *gb, uint8_t lru[8])
 {
@@ -295,7 +295,8 @@ static int dxtory_decode_v2(AVCodecContext *avctx, AVFrame *pic,
         if (ret < 0)
             return ret;
 
-        init_get_bits(&gb2, src + off + 16, (slice_size - 16) * 8);
+        if ((ret = init_get_bits8(&gb2, src + off + 16, slice_size - 16)) < 0)
+            return ret;
 
         line += decode_slice(&gb2, pic, line, avctx->height - line, lru);
 
diff --git a/libavcodec/dxv.c b/libavcodec/dxv.c
index 32137f5392..4fdfd65404 100644
--- a/libavcodec/dxv.c
+++ b/libavcodec/dxv.c
@@ -2,20 +2,20 @@
  * Resolume DXV decoder
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -432,7 +432,6 @@ static int dxv_decode(AVCodecContext *avctx, void *data,
     ret = ff_thread_get_buffer(avctx, &tframe, 0);
     if (ret < 0)
         return ret;
-    ff_thread_finish_setup(avctx);
 
     /* Now decompress the texture with the standard functions. */
     avctx->execute2(avctx, decompress_texture_thread,
diff --git a/libavcodec/dxva2.c b/libavcodec/dxva2.c
index 8b0e6869ae..2cf57ad653 100644
--- a/libavcodec/dxva2.c
+++ b/libavcodec/dxva2.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2010 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,7 +58,7 @@ int ff_dxva2_commit_buffer(AVCodecContext *avctx,
     void     *dxva_data;
     unsigned dxva_size;
     int      result;
-    HRESULT hr;
+    HRESULT hr = 0;
 
 #if CONFIG_D3D11VA
     if (avctx->pix_fmt == AV_PIX_FMT_D3D11VA_VLD)
@@ -137,7 +137,7 @@ int ff_dxva2_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
 #if CONFIG_DXVA2
     DXVA2_DecodeBufferDesc          buffer2[4];
 #endif
-    DECODER_BUFFER_DESC             *buffer,*buffer_slice;
+    DECODER_BUFFER_DESC             *buffer = NULL, *buffer_slice = NULL;
     int result, runs = 0;
     HRESULT hr;
     unsigned type;
diff --git a/libavcodec/dxva2.h b/libavcodec/dxva2.h
index ec448a4c2c..9e3ab86a82 100644
--- a/libavcodec/dxva2.h
+++ b/libavcodec/dxva2.h
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2009 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,7 @@
 
 /**
  * This structure is used to provides the necessary configurations and data
- * to the DXVA2 Libav HWAccel implementation.
+ * to the DXVA2 FFmpeg HWAccel implementation.
  *
  * The application must make it available as AVCodecContext.hwaccel_context.
  */
@@ -81,7 +81,7 @@ struct dxva_context {
     uint64_t workaround;
 
     /**
-     * Private to the Libav AVHWAccel implementation
+     * Private to the FFmpeg AVHWAccel implementation
      */
     unsigned report_id;
 };
diff --git a/libavcodec/dxva2_h264.c b/libavcodec/dxva2_h264.c
index a4a88ce61d..99b80ba373 100644
--- a/libavcodec/dxva2_h264.c
+++ b/libavcodec/dxva2_h264.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2009 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -96,7 +96,7 @@ static void fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *
                                         ((h->sps.mb_aff &&
                                         (h->picture_structure == PICT_FRAME)) <<  1) |
                                         (h->sps.residual_color_transform_flag <<  2) |
-                                        /* sp_for_switch_flag (not implemented by Libav) */
+                                        /* sp_for_switch_flag (not implemented by FFmpeg) */
                                         (0                                    <<  3) |
                                         (h->sps.chroma_format_idc             <<  4) |
                                         ((h->nal_ref_idc != 0)                <<  6) |
@@ -152,8 +152,8 @@ static void fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *
     pp->deblocking_filter_control_present_flag = h->pps.deblocking_filter_parameters_present;
     pp->redundant_pic_cnt_present_flag= h->pps.redundant_pic_cnt_present;
     pp->Reserved8BitsB                = 0;
-    pp->slice_group_change_rate_minus1= 0;  /* XXX not implemented by Libav */
-    //pp->SliceGroupMap[810];               /* XXX not implemented by Libav */
+    pp->slice_group_change_rate_minus1= 0;  /* XXX not implemented by FFmpeg */
+    //pp->SliceGroupMap[810];               /* XXX not implemented by FFmpeg */
 }
 
 static void fill_scaling_lists(const AVCodecContext *avctx, AVDXVAContext *ctx, const H264Context *h, DXVA_Qmatrix_H264 *qm)
@@ -275,7 +275,7 @@ static void fill_slice_long(AVCodecContext *avctx, DXVA_Slice_H264_Long *slice,
             }
         }
     }
-    slice->slice_qs_delta    = 0; /* XXX not implemented by Libav */
+    slice->slice_qs_delta    = 0; /* XXX not implemented by FFmpeg */
     slice->slice_qp_delta    = sl->qscale - h->pps.init_qp;
     slice->redundant_pic_cnt = sl->redundant_pic_count;
     if (sl->slice_type == AV_PICTURE_TYPE_B)
@@ -298,9 +298,9 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
     const H264Picture *current_picture = h->cur_pic_ptr;
     struct dxva2_picture_context *ctx_pic = current_picture->hwaccel_picture_private;
     DXVA_Slice_H264_Short *slice = NULL;
-    void     *dxva_data_ptr;
+    void     *dxva_data_ptr = NULL;
     uint8_t  *dxva_data, *current, *end;
-    unsigned dxva_size;
+    unsigned dxva_size = 0;
     void     *slice_data;
     unsigned slice_size;
     unsigned padding;
diff --git a/libavcodec/dxva2_hevc.c b/libavcodec/dxva2_hevc.c
index 88d47823b8..79d2d2824e 100644
--- a/libavcodec/dxva2_hevc.c
+++ b/libavcodec/dxva2_hevc.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2014 - 2015 Hendrik Leppkes
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -92,7 +92,7 @@ static void fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *
     pp->init_qp_minus26                          = pps->pic_init_qp_minus26;
 
     if (h->sh.short_term_ref_pic_set_sps_flag == 0 && h->sh.short_term_rps) {
-        pp->ucNumDeltaPocsOfRefRpsIdx            = h->sh.short_term_rps->num_delta_pocs;
+        pp->ucNumDeltaPocsOfRefRpsIdx            = h->sh.short_term_rps->rps_idx_num_delta_pocs;
         pp->wNumBitsForShortTermRPSInSlice       = h->sh.short_term_ref_pic_set_size;
     }
 
diff --git a/libavcodec/dxva2_internal.h b/libavcodec/dxva2_internal.h
index 30aec8b93c..c2499e5bf7 100644
--- a/libavcodec/dxva2_internal.h
+++ b/libavcodec/dxva2_internal.h
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2010 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,6 +36,7 @@
 #if CONFIG_D3D11VA
 #include "d3d11va.h"
 #endif
+
 #if HAVE_DXVA_H
 /* When targeting WINAPI_FAMILY_PHONE_APP or WINAPI_FAMILY_APP, dxva.h
  * defines nothing. Force the struct definitions to be visible. */
diff --git a/libavcodec/dxva2_mpeg2.c b/libavcodec/dxva2_mpeg2.c
index 62308b403e..64fa196ca6 100644
--- a/libavcodec/dxva2_mpeg2.c
+++ b/libavcodec/dxva2_mpeg2.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2010 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -112,10 +112,10 @@ static void fill_quantization_matrices(AVCodecContext *avctx,
         qm->bNewQmatrix[i] = 1;
     for (i = 0; i < 64; i++) {
         int n = s->idsp.idct_permutation[ff_zigzag_direct[i]];
-        qm->Qmatrix[0][i] = s->intra_matrix[n];;
-        qm->Qmatrix[1][i] = s->inter_matrix[n];;
-        qm->Qmatrix[2][i] = s->chroma_intra_matrix[n];;
-        qm->Qmatrix[3][i] = s->chroma_inter_matrix[n];;
+        qm->Qmatrix[0][i] = s->intra_matrix[n];
+        qm->Qmatrix[1][i] = s->inter_matrix[n];
+        qm->Qmatrix[2][i] = s->chroma_intra_matrix[n];
+        qm->Qmatrix[3][i] = s->chroma_inter_matrix[n];
     }
 }
 
@@ -142,8 +142,7 @@ static void fill_slice(AVCodecContext *avctx,
     init_get_bits(&gb, &buffer[4], 8 * (size - 4));
 
     slice->wQuantizerScaleCode = get_bits(&gb, 5);
-    while (get_bits1(&gb))
-        skip_bits(&gb, 8);
+    skip_1stop_8data_bits(&gb);
 
     slice->wMBbitOffset        = 4 * 8 + get_bits_count(&gb);
 }
diff --git a/libavcodec/dxva2_vc1.c b/libavcodec/dxva2_vc1.c
index 73f9fb4109..1eb425383e 100644
--- a/libavcodec/dxva2_vc1.c
+++ b/libavcodec/dxva2_vc1.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2010 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,6 +39,15 @@ static void fill_picture_parameters(AVCodecContext *avctx,
 {
     const MpegEncContext *s = &v->s;
     const Picture *current_picture = s->current_picture_ptr;
+    int intcomp = 0;
+
+    // determine if intensity compensation is needed
+    if (s->pict_type == AV_PICTURE_TYPE_P) {
+      if ((v->fcm == ILACE_FRAME && v->intcomp) || (v->fcm != ILACE_FRAME && v->mv_mode == MV_PMODE_INTENSITY_COMP)) {
+        if (v->lumscale != 32 || v->lumshift != 0 || (s->picture_structure != PICT_FRAME && (v->lumscale2 != 32 || v->lumshift2 != 0)))
+          intcomp = 1;
+      }
+    }
 
     memset(pp, 0, sizeof(*pp));
     pp->wDecodedPictureIndex    =
@@ -69,13 +78,13 @@ static void fill_picture_parameters(AVCodecContext *avctx,
         pp->bPicStructure      |= 0x01;
     if (s->picture_structure & PICT_BOTTOM_FIELD)
         pp->bPicStructure      |= 0x02;
-    pp->bSecondField            = v->interlace && v->fcm != ILACE_FIELD && !s->first_field;
+    pp->bSecondField            = v->interlace && v->fcm == ILACE_FIELD && v->second_field;
     pp->bPicIntra               = s->pict_type == AV_PICTURE_TYPE_I || v->bi_type;
     pp->bPicBackwardPrediction  = s->pict_type == AV_PICTURE_TYPE_B && !v->bi_type;
     pp->bBidirectionalAveragingMode = (1                                           << 7) |
                                       ((DXVA_CONTEXT_CFG_INTRARESID(avctx, ctx) != 0) << 6) |
                                       ((DXVA_CONTEXT_CFG_RESIDACCEL(avctx, ctx) != 0) << 5) |
-                                      ((v->lumscale != 32 || v->lumshift != 0)     << 4) |
+                                      (intcomp                                     << 4) |
                                       ((v->profile == PROFILE_ADVANCED)            << 3);
     pp->bMVprecisionAndChromaRelation = ((v->mv_mode == MV_PMODE_1MV_HPEL_BILIN) << 3) |
                                         (1                                       << 2) |
@@ -123,15 +132,25 @@ static void fill_picture_parameters(AVCodecContext *avctx,
                                   (v->range_mapuv_flag << 3) |
                                   (v->range_mapuv          );
     pp->bPicBinPB               = 0;
-    pp->bMV_RPS                 = 0;
-    pp->bReservedBits           = 0;
+    pp->bMV_RPS                 = (v->fcm == ILACE_FIELD && pp->bPicBackwardPrediction) ? v->refdist + 9 : 0;
+    pp->bReservedBits           = v->pq;
     if (s->picture_structure == PICT_FRAME) {
-        pp->wBitstreamFcodes        = v->lumscale;
-        pp->wBitstreamPCEelements   = v->lumshift;
+        if (intcomp) {
+            pp->wBitstreamFcodes      = v->lumscale;
+            pp->wBitstreamPCEelements = v->lumshift;
+        } else {
+            pp->wBitstreamFcodes      = 32;
+            pp->wBitstreamPCEelements = 0;
+        }
     } else {
         /* Syntax: (top_field_param << 8) | bottom_field_param */
-        pp->wBitstreamFcodes        = (v->lumscale << 8) | v->lumscale;
-        pp->wBitstreamPCEelements   = (v->lumshift << 8) | v->lumshift;
+        if (intcomp) {
+            pp->wBitstreamFcodes      = (v->lumscale << 8) | v->lumscale2;
+            pp->wBitstreamPCEelements = (v->lumshift << 8) | v->lumshift2;
+        } else {
+            pp->wBitstreamFcodes      = (32 << 8) | 32;
+            pp->wBitstreamPCEelements = 0;
+        }
     }
     pp->bBitstreamConcealmentNeed   = 0;
     pp->bBitstreamConcealmentMethod = 0;
@@ -149,8 +168,8 @@ static void fill_slice(AVCodecContext *avctx, DXVA_SliceInfo *slice,
     slice->dwSliceBitsInBuffer = 8 * size;
     slice->dwSliceDataLocation = position;
     slice->bStartCodeBitOffset = 0;
-    slice->bReservedBits       = 0;
-    slice->wMBbitOffset        = get_bits_count(&s->gb);
+    slice->bReservedBits       = (s->pict_type == AV_PICTURE_TYPE_B && !v->bi_type) ? v->bfraction_lut_index + 9 : 0;
+    slice->wMBbitOffset        = v->p_frame_skipped ? 0xffff : get_bits_count(&s->gb) + (avctx->codec_id == AV_CODEC_ID_VC1 ? 32 : 0);
     slice->wNumberMBsInSlice   = s->mb_width * s->mb_height; /* XXX We assume 1 slice */
     slice->wQuantizerScaleCode = v->pq;
     slice->wBadSliceChopping   = 0;
@@ -202,8 +221,11 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
     dxva_data = dxva_data_ptr;
     result = data_size <= dxva_size ? 0 : -1;
     if (!result) {
-        if (start_code_size > 0)
+        if (start_code_size > 0) {
             memcpy(dxva_data, start_code, start_code_size);
+            if (v->second_field)
+                dxva_data[3] = 0x0c;
+        }
         memcpy(dxva_data + start_code_size,
                ctx_pic->bitstream + slice->dwSliceDataLocation, slice_size);
         if (padding > 0)
diff --git a/libavcodec/eac3_data.c b/libavcodec/eac3_data.c
index b0416f3ce7..b159e1682f 100644
--- a/libavcodec/eac3_data.c
+++ b/libavcodec/eac3_data.c
@@ -2,20 +2,20 @@
  * E-AC-3 tables
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eac3_data.h b/libavcodec/eac3_data.h
index 4d88ce0a53..10a67f16d2 100644
--- a/libavcodec/eac3_data.h
+++ b/libavcodec/eac3_data.h
@@ -2,20 +2,20 @@
  * E-AC-3 tables
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eac3dec.c b/libavcodec/eac3dec.c
index b9d079ce8d..ef815afb55 100644
--- a/libavcodec/eac3dec.c
+++ b/libavcodec/eac3dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  * Copyright (c) 2008 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -63,7 +63,7 @@ typedef enum {
 
 #define EAC3_SR_CODE_REDUCED  3
 
-void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
+static void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
 {
     int bin, bnd, ch, i;
     uint8_t wrapflag[SPX_MAX_BANDS]={1,0,}, num_copy_sections, copy_sizes[SPX_MAX_BANDS];
@@ -101,7 +101,7 @@ void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
         for (i = 0; i < num_copy_sections; i++) {
             memcpy(&s->transform_coeffs[ch][bin],
                    &s->transform_coeffs[ch][s->spx_dst_start_freq],
-                   copy_sizes[i]*sizeof(float));
+                   copy_sizes[i]*sizeof(INTFLOAT));
             bin += copy_sizes[i];
         }
 
@@ -124,7 +124,7 @@ void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
             bin = s->spx_src_start_freq - 2;
             for (bnd = 0; bnd < s->num_spx_bands; bnd++) {
                 if (wrapflag[bnd]) {
-                    float *coeffs = &s->transform_coeffs[ch][bin];
+                    INTFLOAT *coeffs = &s->transform_coeffs[ch][bin];
                     coeffs[0] *= atten_tab[0];
                     coeffs[1] *= atten_tab[1];
                     coeffs[2] *= atten_tab[2];
@@ -142,6 +142,11 @@ void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
         for (bnd = 0; bnd < s->num_spx_bands; bnd++) {
             float nscale = s->spx_noise_blend[ch][bnd] * rms_energy[bnd] * (1.0f / INT32_MIN);
             float sscale = s->spx_signal_blend[ch][bnd];
+#if USE_FIXED
+            // spx_noise_blend and spx_signal_blend are both FP.23
+            nscale *= 1.0 / (1<<23);
+            sscale *= 1.0 / (1<<23);
+#endif
             for (i = 0; i < s->spx_band_sizes[bnd]; i++) {
                 float noise  = nscale * (int32_t)av_lfg_get(&s->dith_state);
                 s->transform_coeffs[ch][bin]   *= sscale;
@@ -195,7 +200,7 @@ static void idct6(int pre_mant[6])
     pre_mant[5] = even0 - odd0;
 }
 
-void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch)
+static void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch)
 {
     int bin, blk, gs;
     int end_bap, gaq_mode;
@@ -288,7 +293,7 @@ void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch)
     }
 }
 
-int ff_eac3_parse_header(AC3DecodeContext *s)
+static int ff_eac3_parse_header(AC3DecodeContext *s)
 {
     int i, blk, ch;
     int ac3_exponent_strategy, parse_aht_info, parse_spx_atten_data;
diff --git a/libavcodec/eac3enc.c b/libavcodec/eac3enc.c
index 3aa2d54013..e1d61f68bf 100644
--- a/libavcodec/eac3enc.c
+++ b/libavcodec/eac3enc.c
@@ -2,20 +2,20 @@
  * E-AC-3 encoder
  * Copyright (c) 2011 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,9 +34,13 @@
 
 #define AC3ENC_TYPE AC3ENC_TYPE_EAC3
 #include "ac3enc_opts_template.c"
-static const AVClass eac3enc_class = { "E-AC-3 Encoder", av_default_item_name,
-                                       ac3_options, LIBAVUTIL_VERSION_INT };
 
+static const AVClass eac3enc_class = {
+    .class_name = "E-AC-3 Encoder",
+    .item_name  = av_default_item_name,
+    .option     = ac3_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 /**
  * LUT for finding a matching frame exponent strategy index from a set of
diff --git a/libavcodec/eac3enc.h b/libavcodec/eac3enc.h
index a92a24c2fa..7d6155975d 100644
--- a/libavcodec/eac3enc.h
+++ b/libavcodec/eac3enc.h
@@ -2,20 +2,20 @@
  * E-AC-3 encoder
  * Copyright (c) 2011 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eacmv.c b/libavcodec/eacmv.c
index 9668f64fbf..047be813b1 100644
--- a/libavcodec/eacmv.c
+++ b/libavcodec/eacmv.c
@@ -2,20 +2,20 @@
  * Electronic Arts CMV Video Decoder
  * Copyright (c) 2007-2008 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -44,6 +44,7 @@ typedef struct CmvContext {
 
 static av_cold int cmv_decode_init(AVCodecContext *avctx){
     CmvContext *s = avctx->priv_data;
+
     s->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
@@ -160,7 +161,7 @@ static int cmv_process_header(CmvContext *s, const uint8_t *buf, const uint8_t *
 
     buf += 16;
     for (i=pal_start; i<pal_start+pal_count && i<AVPALETTE_COUNT && buf_end - buf >= 3; i++) {
-        s->palette[i] = AV_RB24(buf);
+        s->palette[i] = 0xFFU << 24 | AV_RB24(buf);
         buf += 3;
     }
 
@@ -185,19 +186,20 @@ static int cmv_decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
 
     if (AV_RL32(buf)==MVIh_TAG||AV_RB32(buf)==MVIh_TAG) {
+        unsigned size = AV_RL32(buf + 4);
         ret = cmv_process_header(s, buf+EA_PREAMBLE_SIZE, buf_end);
         if (ret < 0)
             return ret;
-        return buf_size;
+        if (size > buf_end - buf - EA_PREAMBLE_SIZE)
+            return -1;
+        buf += size;
     }
 
     if (av_image_check_size(s->width, s->height, 0, s->avctx))
         return -1;
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     memcpy(frame->data[1], s->palette, AVPALETTE_SIZE);
 
diff --git a/libavcodec/eaidct.c b/libavcodec/eaidct.c
index 5b2db44aff..e4840f2653 100644
--- a/libavcodec/eaidct.c
+++ b/libavcodec/eaidct.c
@@ -2,20 +2,20 @@
  * Electronic Arts TGQ/TQI/MAD IDCT algorithm
  * Copyright (c) 2007-2008 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eaidct.h b/libavcodec/eaidct.h
index e78de04178..6b9ec1c91c 100644
--- a/libavcodec/eaidct.h
+++ b/libavcodec/eaidct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eamad.c b/libavcodec/eamad.c
index 8f7dd25d81..4e202f98a3 100644
--- a/libavcodec/eamad.c
+++ b/libavcodec/eamad.c
@@ -2,20 +2,20 @@
  * Electronic Arts Madcow Video Decoder
  * Copyright (c) 2007-2009 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -94,15 +94,21 @@ static inline void comp_block(MadContext *t, AVFrame *frame,
                               int j, int mv_x, int mv_y, int add)
 {
     if (j < 4) {
+        unsigned offset = (mb_y*16 + ((j&2)<<2) + mv_y)*t->last_frame->linesize[0] + mb_x*16 + ((j&1)<<3) + mv_x;
+        if (offset >= (t->avctx->height - 7) * t->last_frame->linesize[0] - 7)
+            return;
         comp(frame->data[0] + (mb_y*16 + ((j&2)<<2))*frame->linesize[0] + mb_x*16 + ((j&1)<<3),
              frame->linesize[0],
-             t->last_frame->data[0] + (mb_y*16 + ((j&2)<<2) + mv_y)*t->last_frame->linesize[0] + mb_x*16 + ((j&1)<<3) + mv_x,
+             t->last_frame->data[0] + offset,
              t->last_frame->linesize[0], add);
     } else if (!(t->avctx->flags & AV_CODEC_FLAG_GRAY)) {
         int index = j - 3;
+        unsigned offset = (mb_y * 8 + (mv_y/2))*t->last_frame->linesize[index] + mb_x * 8 + (mv_x/2);
+        if (offset >= (t->avctx->height/2 - 7) * t->last_frame->linesize[index] - 7)
+            return;
         comp(frame->data[index] + (mb_y*8)*frame->linesize[index] + mb_x * 8,
              frame->linesize[index],
-             t->last_frame->data[index] + (mb_y * 8 + (mv_y/2))*t->last_frame->linesize[index] + mb_x * 8 + (mv_x/2),
+             t->last_frame->data[index] + offset,
              t->last_frame->linesize[index], add);
     }
 }
@@ -122,7 +128,7 @@ static inline void idct_put(MadContext *t, AVFrame *frame, int16_t *block,
     }
 }
 
-static inline void decode_block_intra(MadContext *s, int16_t * block)
+static inline int decode_block_intra(MadContext *s, int16_t * block)
 {
     int level, i, j, run;
     RLTable *rl = &ff_rl_mpeg1;
@@ -148,7 +154,7 @@ static inline void decode_block_intra(MadContext *s, int16_t * block)
                 if (i > 63) {
                     av_log(s->avctx, AV_LOG_ERROR,
                            "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                    return;
+                    return -1;
                 }
                 j = scantable[i];
                 level = (level*quant_matrix[j]) >> 4;
@@ -167,7 +173,7 @@ static inline void decode_block_intra(MadContext *s, int16_t * block)
                 if (i > 63) {
                     av_log(s->avctx, AV_LOG_ERROR,
                            "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                    return;
+                    return -1;
                 }
                 j = scantable[i];
                 if (level < 0) {
@@ -185,6 +191,7 @@ static inline void decode_block_intra(MadContext *s, int16_t * block)
         }
         CLOSE_READER(re, &s->gb);
     }
+    return 0;
 }
 
 static int decode_motion(GetBitContext *gb)
@@ -198,10 +205,10 @@ static int decode_motion(GetBitContext *gb)
     return value;
 }
 
-static void decode_mb(MadContext *s, AVFrame *frame, int inter)
+static int decode_mb(MadContext *s, AVFrame *frame, int inter)
 {
     int mv_map = 0;
-    int mv_x, mv_y;
+    int av_uninit(mv_x), av_uninit(mv_y);
     int j;
 
     if (inter) {
@@ -210,21 +217,22 @@ static void decode_mb(MadContext *s, AVFrame *frame, int inter)
             mv_map = v ? get_bits(&s->gb, 6) : 63;
             mv_x = decode_motion(&s->gb);
             mv_y = decode_motion(&s->gb);
-        } else {
-            mv_map = 0;
         }
     }
 
     for (j=0; j<6; j++) {
         if (mv_map & (1<<j)) {  // mv_x and mv_y are guarded by mv_map
             int add = 2*decode_motion(&s->gb);
-            comp_block(s, frame, s->mb_x, s->mb_y, j, mv_x, mv_y, add);
+            if (s->last_frame->data[0])
+                comp_block(s, frame, s->mb_x, s->mb_y, j, mv_x, mv_y, add);
         } else {
             s->bdsp.clear_block(s->block);
-            decode_block_intra(s, s->block);
+            if(decode_block_intra(s, s->block) < 0)
+                return -1;
             idct_put(s, frame, s->block, s->mb_x, s->mb_y, j);
         }
     }
+    return 0;
 }
 
 static void calc_quant_matrix(MadContext *s, int qscale)
@@ -269,16 +277,21 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
+    if (width < 16 || height < 16) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     if (avctx->width != width || avctx->height != height) {
         av_frame_unref(s->last_frame);
+        if((width * height)/2048*7 > bytestream2_get_bytes_left(&gb))
+            return AVERROR_INVALIDDATA;
         if ((ret = ff_set_dimensions(avctx, width, height)) < 0)
             return ret;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if (inter && !s->last_frame->data[0]) {
         av_log(avctx, AV_LOG_WARNING, "Missing reference frame.\n");
@@ -299,11 +312,13 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR(ENOMEM);
     s->bbdsp.bswap16_buf(s->bitstream_buf, (const uint16_t *)(buf + bytestream2_tell(&gb)),
                          bytestream2_get_bytes_left(&gb) / 2);
+    memset((uint8_t*)s->bitstream_buf + bytestream2_get_bytes_left(&gb), 0, AV_INPUT_BUFFER_PADDING_SIZE);
     init_get_bits(&s->gb, s->bitstream_buf, 8*(bytestream2_get_bytes_left(&gb)));
 
     for (s->mb_y=0; s->mb_y < (avctx->height+15)/16; s->mb_y++)
         for (s->mb_x=0; s->mb_x < (avctx->width +15)/16; s->mb_x++)
-            decode_mb(s, frame, inter);
+            if(decode_mb(s, frame, inter) < 0)
+                return AVERROR_INVALIDDATA;
 
     *got_frame = 1;
 
@@ -320,7 +335,7 @@ static av_cold int decode_end(AVCodecContext *avctx)
 {
     MadContext *t = avctx->priv_data;
     av_frame_free(&t->last_frame);
-    av_free(t->bitstream_buf);
+    av_freep(&t->bitstream_buf);
     return 0;
 }
 
diff --git a/libavcodec/eatgq.c b/libavcodec/eatgq.c
index a0496a00cb..f8a47cb1a7 100644
--- a/libavcodec/eatgq.c
+++ b/libavcodec/eatgq.c
@@ -2,20 +2,20 @@
  * Electronic Arts TGQ Video Decoder
  * Copyright (c) 2007-2008 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -148,7 +148,7 @@ static void tgq_idct_put_mb_dconly(TgqContext *s, AVFrame *frame,
     }
 }
 
-static void tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
+static int tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
 {
     int mode;
     int i;
@@ -157,7 +157,10 @@ static void tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
     mode = bytestream2_get_byte(&s->gb);
     if (mode > 12) {
         GetBitContext gb;
-        init_get_bits(&gb, s->gb.buffer, FFMIN(s->gb.buffer_end - s->gb.buffer, mode) * 8);
+        int ret = init_get_bits8(&gb, s->gb.buffer, FFMIN(bytestream2_get_bytes_left(&s->gb), mode));
+        if (ret < 0)
+            return ret;
+
         for (i = 0; i < 6; i++)
             tgq_decode_block(s, s->block[i], &gb);
         tgq_idct_put_mb(s, s->block, frame, mb_x, mb_y);
@@ -176,9 +179,11 @@ static void tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
             }
         } else {
             av_log(s->avctx, AV_LOG_ERROR, "unsupported mb mode %i\n", mode);
+            return -1;
         }
         tgq_idct_put_mb_dconly(s, frame, mb_x, mb_y, dc);
     }
+    return 0;
 }
 
 static void tgq_calculate_qtable(TgqContext *s, int quant)
@@ -201,12 +206,13 @@ static int tgq_decode_frame(AVCodecContext *avctx,
     TgqContext *s      = avctx->priv_data;
     AVFrame *frame     = data;
     int x, y, ret;
-    int big_endian = AV_RL32(&buf[4]) > 0x000FFFFF;
+    int big_endian;
 
     if (buf_size < 16) {
         av_log(avctx, AV_LOG_WARNING, "truncated header\n");
         return AVERROR_INVALIDDATA;
     }
+    big_endian = AV_RL32(&buf[4]) > 0x000FFFFF;
     bytestream2_init(&s->gb, buf + 8, buf_size - 8);
     if (big_endian) {
         s->width  = bytestream2_get_be16u(&s->gb);
@@ -223,16 +229,15 @@ static int tgq_decode_frame(AVCodecContext *avctx,
     tgq_calculate_qtable(s, bytestream2_get_byteu(&s->gb));
     bytestream2_skip(&s->gb, 3);
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     frame->key_frame = 1;
     frame->pict_type = AV_PICTURE_TYPE_I;
 
     for (y = 0; y < FFALIGN(avctx->height, 16) >> 4; y++)
         for (x = 0; x < FFALIGN(avctx->width, 16) >> 4; x++)
-            tgq_decode_mb(s, frame, y, x);
+            if (tgq_decode_mb(s, frame, y, x) < 0)
+                return AVERROR_INVALIDDATA;
 
     *got_frame = 1;
 
diff --git a/libavcodec/eatgv.c b/libavcodec/eatgv.c
index 4faae50a18..882bf077b2 100644
--- a/libavcodec/eatgv.c
+++ b/libavcodec/eatgv.c
@@ -2,20 +2,20 @@
  * Electronic Arts TGV Video Decoder
  * Copyright (c) 2007-2008 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -81,7 +81,7 @@ static int unpack(const uint8_t *src, const uint8_t *src_end,
     else
         src += 2;
 
-    if (src + 3 > src_end)
+    if (src_end - src < 3)
         return AVERROR_INVALIDDATA;
     size = AV_RB24(src);
     src += 3;
@@ -156,7 +156,7 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
     int mvbits;
     const uint8_t *blocks_raw;
 
-    if (buf + 12 > buf_end)
+    if(buf_end - buf < 12)
         return AVERROR_INVALIDDATA;
 
     num_mvs           = AV_RL16(&buf[0]);
@@ -173,9 +173,11 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
 
     /* allocate codebook buffers as necessary */
     if (num_mvs > s->num_mvs) {
-        int err = av_reallocp(&s->mv_codebook, num_mvs * 2 * sizeof(int));
-        if (err < 0)
+        int err = av_reallocp_array(&s->mv_codebook, num_mvs, sizeof(*s->mv_codebook));
+        if (err < 0) {
+            s->num_mvs = 0;
             return err;
+        }
         s->num_mvs = num_mvs;
     }
 
@@ -191,7 +193,7 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
     /* read motion vectors */
     mvbits = (num_mvs * 2 * 10 + 31) & ~31;
 
-    if (buf + (mvbits >> 3) + 16 * num_blocks_raw + 8 * num_blocks_packed > buf_end)
+    if (buf_end - buf < (mvbits>>3) + 16*num_blocks_raw + 8*num_blocks_packed)
         return AVERROR_INVALIDDATA;
 
     init_get_bits(&gb, buf, mvbits);
@@ -231,8 +233,10 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
                 int my = y * 4 + s->mv_codebook[vector][1];
 
                 if (mx < 0 || mx + 4 > s->avctx->width ||
-                    my < 0 || my + 4 > s->avctx->height)
+                    my < 0 || my + 4 > s->avctx->height) {
+                    av_log(s->avctx, AV_LOG_ERROR, "MV %d %d out of picture\n", mx, my);
                     continue;
+                }
 
                 src = s->last_frame->data[0] + mx + my * s->last_frame->linesize[0];
                 src_stride = s->last_frame->linesize[0];
@@ -267,12 +271,15 @@ static int tgv_decode_frame(AVCodecContext *avctx,
     AVFrame *frame         = data;
     int chunk_type, ret;
 
+    if (buf_end - buf < EA_PREAMBLE_SIZE)
+        return AVERROR_INVALIDDATA;
+
     chunk_type = AV_RL32(&buf[0]);
     buf       += EA_PREAMBLE_SIZE;
 
     if (chunk_type == kVGT_TAG) {
         int pal_count, i;
-        if (buf + 12 > buf_end) {
+        if(buf_end - buf < 12) {
             av_log(avctx, AV_LOG_WARNING, "truncated header\n");
             return AVERROR_INVALIDDATA;
         }
@@ -288,8 +295,8 @@ static int tgv_decode_frame(AVCodecContext *avctx,
 
         pal_count = AV_RL16(&buf[6]);
         buf += 12;
-        for (i = 0; i < pal_count && i < AVPALETTE_COUNT && buf + 2 < buf_end; i++) {
-            s->palette[i] = AV_RB24(buf);
+        for(i = 0; i < pal_count && i < AVPALETTE_COUNT && buf_end - buf >= 3; i++) {
+            s->palette[i] = 0xFFU << 24 | AV_RB24(buf);
             buf += 3;
         }
     }
@@ -305,7 +312,7 @@ static int tgv_decode_frame(AVCodecContext *avctx,
         frame->pict_type = AV_PICTURE_TYPE_I;
 
         if (!s->frame_buffer &&
-            !(s->frame_buffer = av_malloc(s->width * s->height)))
+            !(s->frame_buffer = av_mallocz(s->width * s->height)))
             return AVERROR(ENOMEM);
 
         if (unpack(buf, buf_end, s->frame_buffer, s->avctx->width, s->avctx->height) < 0) {
@@ -343,8 +350,8 @@ static av_cold int tgv_decode_end(AVCodecContext *avctx)
     TgvContext *s = avctx->priv_data;
     av_frame_free(&s->last_frame);
     av_freep(&s->frame_buffer);
-    av_free(s->mv_codebook);
-    av_free(s->block_codebook);
+    av_freep(&s->mv_codebook);
+    av_freep(&s->block_codebook);
     return 0;
 }
 
diff --git a/libavcodec/eatqi.c b/libavcodec/eatqi.c
index f5dcb53692..2423e21a41 100644
--- a/libavcodec/eatqi.c
+++ b/libavcodec/eatqi.c
@@ -2,20 +2,20 @@
  * Electronic Arts TQI Video Decoder
  * Copyright (c) 2007-2009 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -121,10 +121,8 @@ static int tqi_decode_frame(AVCodecContext *avctx,
     if (ret < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     av_fast_padded_malloc(&t->bitstream_buf, &t->bitstream_buf_size,
                           buf_end - buf);
@@ -139,9 +137,10 @@ static int tqi_decode_frame(AVCodecContext *avctx,
     for (s->mb_x=0; s->mb_x<(avctx->width+15)/16; s->mb_x++)
     {
         if (tqi_decode_mb(s, t->block) < 0)
-            break;
+            goto end;
         tqi_idct_put(t, frame, t->block);
     }
+    end:
 
     *got_frame = 1;
     return buf_size;
@@ -150,7 +149,7 @@ static int tqi_decode_frame(AVCodecContext *avctx,
 static av_cold int tqi_decode_end(AVCodecContext *avctx)
 {
     TqiContext *t = avctx->priv_data;
-    av_free(t->bitstream_buf);
+    av_freep(&t->bitstream_buf);
     return 0;
 }
 
diff --git a/libavcodec/elbg.c b/libavcodec/elbg.c
index e2b03a6f9b..23ae9483f7 100644
--- a/libavcodec/elbg.c
+++ b/libavcodec/elbg.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2007 Vitor Sessak <vitor1001@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
 
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/lfg.h"
 #include "elbg.h"
@@ -50,7 +51,7 @@ typedef struct elbg_data {
     int *codebook;
     cell **cells;
     int *utility;
-    int *utility_inc;
+    int64_t *utility_inc;
     int *nearest_cb;
     int *points;
     AVLFG *rand_state;
@@ -107,11 +108,20 @@ static int get_high_utility_cell(elbg_data *elbg)
 {
     int i=0;
     /* Using linear search, do binary if it ever turns to be speed critical */
-    int r = av_lfg_get(elbg->rand_state)%elbg->utility_inc[elbg->numCB-1] + 1;
-    while (elbg->utility_inc[i] < r)
+    uint64_t r;
+
+    if (elbg->utility_inc[elbg->numCB-1] < INT_MAX) {
+        r = av_lfg_get(elbg->rand_state) % (unsigned int)elbg->utility_inc[elbg->numCB-1] + 1;
+    } else {
+        r = av_lfg_get(elbg->rand_state);
+        r = (av_lfg_get(elbg->rand_state) + (r<<32)) % elbg->utility_inc[elbg->numCB-1] + 1;
+    }
+
+    while (elbg->utility_inc[i] < r) {
         i++;
+    }
 
-    assert(elbg->cells[i]);
+    av_assert2(elbg->cells[i]);
 
     return i;
 }
@@ -189,7 +199,7 @@ static void get_new_centroids(elbg_data *elbg, int huc, int *newcentroid_i,
 
 /**
  * Add the points in the low utility cell to its closest cell. Split the high
- * utility cell, putting the separed points in the (now empty) low utility
+ * utility cell, putting the separate points in the (now empty) low utility
  * cell.
  *
  * @param elbg         Internal elbg data
@@ -226,7 +236,8 @@ static void shift_codebook(elbg_data *elbg, int *indexes,
 
 static void evaluate_utility_inc(elbg_data *elbg)
 {
-    int i, inc=0;
+    int i;
+    int64_t inc=0;
 
     for (i=0; i < elbg->numCB; i++) {
         if (elbg->numCB*elbg->utility[i] > elbg->error)
@@ -323,7 +334,7 @@ static void do_shiftings(elbg_data *elbg)
 
 #define BIG_PRIME 433494437LL
 
-int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
+int avpriv_init_elbg(int *points, int dim, int numpoints, int *codebook,
                  int numCB, int max_steps, int *closest_cb,
                  AVLFG *rand_state)
 {
@@ -332,7 +343,7 @@ int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
     if (numpoints > 24*numCB) {
         /* ELBG is very costly for a big number of points. So if we have a lot
            of them, get a good initial codebook to save on iterations       */
-        int *temp_points = av_malloc(dim*(numpoints/8)*sizeof(int));
+        int *temp_points = av_malloc_array(dim, (numpoints/8)*sizeof(int));
         if (!temp_points)
             return AVERROR(ENOMEM);
         for (i=0; i<numpoints/8; i++) {
@@ -340,14 +351,14 @@ int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
             memcpy(temp_points + i*dim, points + k*dim, dim*sizeof(int));
         }
 
-        ret = ff_init_elbg(temp_points, dim, numpoints / 8, codebook,
-                           numCB, 2 * max_steps, closest_cb, rand_state);
+        ret = avpriv_init_elbg(temp_points, dim, numpoints / 8, codebook,
+                               numCB, 2 * max_steps, closest_cb, rand_state);
         if (ret < 0) {
             av_freep(&temp_points);
             return ret;
         }
-        ret = ff_do_elbg(temp_points, dim, numpoints / 8, codebook,
-                         numCB, 2 * max_steps, closest_cb, rand_state);
+        ret = avpriv_do_elbg(temp_points, dim, numpoints / 8, codebook,
+                             numCB, 2 * max_steps, closest_cb, rand_state);
         av_free(temp_points);
 
     } else  // If not, initialize the codebook with random positions
@@ -357,7 +368,7 @@ int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
     return ret;
 }
 
-int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
+int avpriv_do_elbg(int *points, int dim, int numpoints, int *codebook,
                 int numCB, int max_steps, int *closest_cb,
                 AVLFG *rand_state)
 {
@@ -365,9 +376,9 @@ int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
     elbg_data elbg_d;
     elbg_data *elbg = &elbg_d;
     int i, j, k, last_error, steps = 0, ret = 0;
-    int *dist_cb = av_malloc(numpoints*sizeof(int));
-    int *size_part = av_malloc(numCB*sizeof(int));
-    cell *list_buffer = av_malloc(numpoints*sizeof(cell));
+    int *dist_cb = av_malloc_array(numpoints, sizeof(int));
+    int *size_part = av_malloc_array(numCB, sizeof(int));
+    cell *list_buffer = av_malloc_array(numpoints, sizeof(cell));
     cell *free_cells;
     int best_dist, best_idx = 0;
 
@@ -375,12 +386,12 @@ int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
     elbg->dim = dim;
     elbg->numCB = numCB;
     elbg->codebook = codebook;
-    elbg->cells = av_malloc(numCB*sizeof(cell *));
-    elbg->utility = av_malloc(numCB*sizeof(int));
+    elbg->cells = av_malloc_array(numCB, sizeof(cell *));
+    elbg->utility = av_malloc_array(numCB, sizeof(int));
     elbg->nearest_cb = closest_cb;
     elbg->points = points;
-    elbg->utility_inc = av_malloc(numCB*sizeof(int));
-    elbg->scratchbuf = av_malloc(5*dim*sizeof(int));
+    elbg->utility_inc = av_malloc_array(numCB, sizeof(*elbg->utility_inc));
+    elbg->scratchbuf = av_malloc_array(5*dim, sizeof(int));
 
     if (!dist_cb || !size_part || !list_buffer || !elbg->cells ||
         !elbg->utility || !elbg->utility_inc || !elbg->scratchbuf) {
diff --git a/libavcodec/elbg.h b/libavcodec/elbg.h
index 3b1587a3ab..f48aa3b443 100644
--- a/libavcodec/elbg.h
+++ b/libavcodec/elbg.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2007 Vitor Sessak <vitor1001@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@
  * @param rand_state A random number generator state. Should be already initialized by av_lfg_init().
  * @return < 0 in case of error, 0 otherwise
  */
-int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
+int avpriv_do_elbg(int *points, int dim, int numpoints, int *codebook,
                int numCB, int num_steps, int *closest_cb,
                AVLFG *rand_state);
 
@@ -46,11 +46,11 @@ int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
  * Initialize the **codebook vector for the elbg algorithm. If you have already
  * a codebook and you want to refine it, you shouldn't call this function.
  * If numpoints < 8*numCB this function fills **codebook with random numbers.
- * If not, it calls ff_do_elbg for a (smaller) random sample of the points in
- * **points. Get the same parameters as ff_do_elbg.
+ * If not, it calls avpriv_do_elbg for a (smaller) random sample of the points in
+ * **points. Get the same parameters as avpriv_do_elbg.
  * @return < 0 in case of error, 0 otherwise
  */
-int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
+int avpriv_init_elbg(int *points, int dim, int numpoints, int *codebook,
                  int numCB, int num_steps, int *closest_cb,
                  AVLFG *rand_state);
 
diff --git a/libavcodec/elsdec.c b/libavcodec/elsdec.c
index 10a1a9d5e8..4797965457 100644
--- a/libavcodec/elsdec.c
+++ b/libavcodec/elsdec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/elsdec.h b/libavcodec/elsdec.h
index 515b49aa71..139a24ab8b 100644
--- a/libavcodec/elsdec.h
+++ b/libavcodec/elsdec.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/error_resilience.c b/libavcodec/error_resilience.c
index 2c7836f23d..2c741a4a16 100644
--- a/libavcodec/error_resilience.c
+++ b/libavcodec/error_resilience.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 
 #include <limits.h>
 
+#include "libavutil/atomic.h"
 #include "libavutil/internal.h"
 #include "avcodec.h"
 #include "error_resilience.h"
@@ -44,7 +45,7 @@
 static void set_mv_strides(ERContext *s, int *mv_step, int *stride)
 {
     if (s->avctx->codec_id == AV_CODEC_ID_H264) {
-        assert(s->quarter_sample);
+        av_assert0(s->quarter_sample);
         *mv_step = 4;
         *stride  = s->mb_width * 4;
     } else {
@@ -83,6 +84,8 @@ static void put_dc(ERContext *s, uint8_t *dest_y, uint8_t *dest_cb,
         dcv = 0;
     else if (dcv > 2040)
         dcv = 2040;
+
+    if (dest_cr)
     for (y = 0; y < 8; y++) {
         int x;
         for (x = 0; x < 8; x++) {
@@ -137,11 +140,73 @@ static void guess_dc(ERContext *s, int16_t *dc, int w,
                      int h, int stride, int is_luma)
 {
     int b_x, b_y;
+    int16_t  (*col )[4] = av_malloc_array(stride, h*sizeof( int16_t)*4);
+    uint32_t (*dist)[4] = av_malloc_array(stride, h*sizeof(uint32_t)*4);
+
+    if(!col || !dist) {
+        av_log(s->avctx, AV_LOG_ERROR, "guess_dc() is out of memory\n");
+        goto fail;
+    }
+
+    for(b_y=0; b_y<h; b_y++){
+        int color= 1024;
+        int distance= -1;
+        for(b_x=0; b_x<w; b_x++){
+            int mb_index_j= (b_x>>is_luma) + (b_y>>is_luma)*s->mb_stride;
+            int error_j= s->error_status_table[mb_index_j];
+            int intra_j = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
+            if(intra_j==0 || !(error_j&ER_DC_ERROR)){
+                color= dc[b_x + b_y*stride];
+                distance= b_x;
+            }
+            col [b_x + b_y*stride][1]= color;
+            dist[b_x + b_y*stride][1]= distance >= 0 ? b_x-distance : 9999;
+        }
+        color= 1024;
+        distance= -1;
+        for(b_x=w-1; b_x>=0; b_x--){
+            int mb_index_j= (b_x>>is_luma) + (b_y>>is_luma)*s->mb_stride;
+            int error_j= s->error_status_table[mb_index_j];
+            int intra_j = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
+            if(intra_j==0 || !(error_j&ER_DC_ERROR)){
+                color= dc[b_x + b_y*stride];
+                distance= b_x;
+            }
+            col [b_x + b_y*stride][0]= color;
+            dist[b_x + b_y*stride][0]= distance >= 0 ? distance-b_x : 9999;
+        }
+    }
+    for(b_x=0; b_x<w; b_x++){
+        int color= 1024;
+        int distance= -1;
+        for(b_y=0; b_y<h; b_y++){
+            int mb_index_j= (b_x>>is_luma) + (b_y>>is_luma)*s->mb_stride;
+            int error_j= s->error_status_table[mb_index_j];
+            int intra_j = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
+            if(intra_j==0 || !(error_j&ER_DC_ERROR)){
+                color= dc[b_x + b_y*stride];
+                distance= b_y;
+            }
+            col [b_x + b_y*stride][3]= color;
+            dist[b_x + b_y*stride][3]= distance >= 0 ? b_y-distance : 9999;
+        }
+        color= 1024;
+        distance= -1;
+        for(b_y=h-1; b_y>=0; b_y--){
+            int mb_index_j= (b_x>>is_luma) + (b_y>>is_luma)*s->mb_stride;
+            int error_j= s->error_status_table[mb_index_j];
+            int intra_j = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
+            if(intra_j==0 || !(error_j&ER_DC_ERROR)){
+                color= dc[b_x + b_y*stride];
+                distance= b_y;
+            }
+            col [b_x + b_y*stride][2]= color;
+            dist[b_x + b_y*stride][2]= distance >= 0 ? distance-b_y : 9999;
+        }
+    }
 
     for (b_y = 0; b_y < h; b_y++) {
         for (b_x = 0; b_x < w; b_x++) {
-            int color[4]    = { 1024, 1024, 1024, 1024 };
-            int distance[4] = { 9999, 9999, 9999, 9999 };
             int mb_index, error, j;
             int64_t guess, weight_sum;
             mb_index = (b_x >> is_luma) + (b_y >> is_luma) * s->mb_stride;
@@ -152,66 +217,21 @@ static void guess_dc(ERContext *s, int16_t *dc, int w,
             if (!(error & ER_DC_ERROR))
                 continue; // dc-ok
 
-            /* right block */
-            for (j = b_x + 1; j < w; j++) {
-                int mb_index_j = (j >> is_luma) + (b_y >> is_luma) * s->mb_stride;
-                int error_j    = s->error_status_table[mb_index_j];
-                int intra_j    = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
-                if (intra_j == 0 || !(error_j & ER_DC_ERROR)) {
-                    color[0]    = dc[j + b_y * stride];
-                    distance[0] = j - b_x;
-                    break;
-                }
-            }
-
-            /* left block */
-            for (j = b_x - 1; j >= 0; j--) {
-                int mb_index_j = (j >> is_luma) + (b_y >> is_luma) * s->mb_stride;
-                int error_j    = s->error_status_table[mb_index_j];
-                int intra_j    = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
-                if (intra_j == 0 || !(error_j & ER_DC_ERROR)) {
-                    color[1]    = dc[j + b_y * stride];
-                    distance[1] = b_x - j;
-                    break;
-                }
-            }
-
-            /* bottom block */
-            for (j = b_y + 1; j < h; j++) {
-                int mb_index_j = (b_x >> is_luma) + (j >> is_luma) * s->mb_stride;
-                int error_j    = s->error_status_table[mb_index_j];
-                int intra_j    = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
-
-                if (intra_j == 0 || !(error_j & ER_DC_ERROR)) {
-                    color[2]    = dc[b_x + j * stride];
-                    distance[2] = j - b_y;
-                    break;
-                }
-            }
-
-            /* top block */
-            for (j = b_y - 1; j >= 0; j--) {
-                int mb_index_j = (b_x >> is_luma) + (j >> is_luma) * s->mb_stride;
-                int error_j    = s->error_status_table[mb_index_j];
-                int intra_j    = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
-                if (intra_j == 0 || !(error_j & ER_DC_ERROR)) {
-                    color[3]    = dc[b_x + j * stride];
-                    distance[3] = b_y - j;
-                    break;
-                }
-            }
-
             weight_sum = 0;
             guess      = 0;
             for (j = 0; j < 4; j++) {
-                int64_t weight  = 256 * 256 * 256 * 16 / distance[j];
-                guess          += weight * (int64_t) color[j];
+                int64_t weight  = 256 * 256 * 256 * 16 / FFMAX(dist[b_x + b_y*stride][j], 1);
+                guess          += weight*(int64_t)col[b_x + b_y*stride][j];
                 weight_sum     += weight;
             }
             guess = (guess + weight_sum / 2) / weight_sum;
             dc[b_x + b_y * stride] = guess;
         }
     }
+
+fail:
+    av_freep(&col);
+    av_freep(&dist);
 }
 
 /**
@@ -381,6 +401,14 @@ static void guess_mv(ERContext *s)
         fixed[mb_xy] = f;
         if (f == MV_FROZEN)
             num_avail++;
+        else if(s->last_pic.f->data[0] && s->last_pic.motion_val[0]){
+            const int mb_y= mb_xy / s->mb_stride;
+            const int mb_x= mb_xy % s->mb_stride;
+            const int mot_index= (mb_x + mb_y*mot_stride) * mot_step;
+            s->cur_pic.motion_val[0][mot_index][0]= s->last_pic.motion_val[0][mot_index][0];
+            s->cur_pic.motion_val[0][mot_index][1]= s->last_pic.motion_val[0][mot_index][1];
+            s->cur_pic.ref_index[0][4*mb_xy]      = s->last_pic.ref_index[0][4*mb_xy];
+        }
     }
 
     if ((!(s->avctx->error_concealment&FF_EC_GUESS_MVS)) ||
@@ -431,6 +459,8 @@ static void guess_mv(ERContext *s)
 
                     if (fixed[mb_xy] == MV_FROZEN)
                         continue;
+                    av_assert1(!IS_INTRA(s->cur_pic.mb_type[mb_xy]));
+                    av_assert1(s->last_pic.f && s->last_pic.f->data[0]);
 
                     j = 0;
                     if (mb_x > 0             && fixed[mb_xy - 1]         == MV_FROZEN)
@@ -545,7 +575,7 @@ skip_mean_and_median:
                     /* zero MV */
                     pred_count++;
 
-                    if (!fixed[mb_xy]) {
+                    if (!fixed[mb_xy] && 0) {
                         if (s->avctx->codec_id == AV_CODEC_ID_H264) {
                             // FIXME
                         } else {
@@ -660,6 +690,9 @@ static int is_intra_more_likely(ERContext *s)
     if (!s->last_pic.f || !s->last_pic.f->data[0])
         return 1; // no previous frame available -> use spatial prediction
 
+    if (s->avctx->error_concealment & FF_EC_FAVOR_INTER)
+        return 0;
+
     undamaged_count = 0;
     for (i = 0; i < s->mb_num; i++) {
         const int mb_xy = s->mb_index2xy[i];
@@ -668,21 +701,14 @@ static int is_intra_more_likely(ERContext *s)
             undamaged_count++;
     }
 
-    if (s->avctx->codec_id == AV_CODEC_ID_H264 && s->ref_count <= 0)
-        return 1;
-
     if (undamaged_count < 5)
         return 0; // almost all MBs damaged -> use temporal prediction
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
     // prevent dsp.sad() check, that requires access to the image
-    if (CONFIG_MPEG_XVMC_DECODER    &&
-        s->avctx->xvmc_acceleration &&
+    if (CONFIG_XVMC    &&
+        s->avctx->hwaccel && s->avctx->hwaccel->decode_mb &&
         s->cur_pic.f->pict_type == AV_PICTURE_TYPE_I)
         return 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
 
     skip_amount     = FFMAX(undamaged_count / 50, 1); // check only up to 50 MBs
     is_intra_likely = 0;
@@ -716,6 +742,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 }
                 is_intra_likely += s->mecc.sad[0](NULL, last_mb_ptr, mb_ptr,
                                                   linesize[0], 16);
+                // FIXME need await_progress() here
                 is_intra_likely -= s->mecc.sad[0](NULL, last_mb_ptr,
                                                   last_mb_ptr + linesize[0] * 16,
                                                   linesize[0], 16);
@@ -727,6 +754,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             }
         }
     }
+//      av_log(NULL, AV_LOG_ERROR, "is_intra_likely: %d type:%d\n", is_intra_likely, s->pict_type);
     return is_intra_likely > 0;
 }
 
@@ -746,6 +774,19 @@ void ff_er_frame_start(ERContext *s)
     s->error_occurred = 0;
 }
 
+static int er_supported(ERContext *s)
+{
+    if(s->avctx->hwaccel && s->avctx->hwaccel->decode_slice           ||
+#if FF_API_CAP_VDPAU
+       s->avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU          ||
+#endif
+       !s->cur_pic.f                                                  ||
+       s->cur_pic.field_picture
+    )
+        return 0;
+    return 1;
+}
+
 /**
  * Add a slice.
  * @param endx   x component of the last macroblock, can be -1
@@ -762,7 +803,7 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
     const int end_xy   = s->mb_index2xy[end_i];
     int mask           = -1;
 
-    if (s->avctx->hwaccel)
+    if (s->avctx->hwaccel && s->avctx->hwaccel->decode_slice)
         return;
 
     if (start_i > end_i || start_xy > end_xy) {
@@ -777,20 +818,20 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
     mask &= ~VP_START;
     if (status & (ER_AC_ERROR | ER_AC_END)) {
         mask           &= ~(ER_AC_ERROR | ER_AC_END);
-        s->error_count -= end_i - start_i + 1;
+        avpriv_atomic_int_add_and_fetch(&s->error_count, start_i - end_i - 1);
     }
     if (status & (ER_DC_ERROR | ER_DC_END)) {
         mask           &= ~(ER_DC_ERROR | ER_DC_END);
-        s->error_count -= end_i - start_i + 1;
+        avpriv_atomic_int_add_and_fetch(&s->error_count, start_i - end_i - 1);
     }
     if (status & (ER_MV_ERROR | ER_MV_END)) {
         mask           &= ~(ER_MV_ERROR | ER_MV_END);
-        s->error_count -= end_i - start_i + 1;
+        avpriv_atomic_int_add_and_fetch(&s->error_count, start_i - end_i - 1);
     }
 
     if (status & ER_MB_ERROR) {
         s->error_occurred = 1;
-        s->error_count    = INT_MAX;
+        avpriv_atomic_int_set(&s->error_count, INT_MAX);
     }
 
     if (mask == ~0x7F) {
@@ -803,7 +844,7 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
     }
 
     if (end_i == s->mb_num)
-        s->error_count = INT_MAX;
+        avpriv_atomic_int_set(&s->error_count, INT_MAX);
     else {
         s->error_status_table[end_xy] &= mask;
         s->error_status_table[end_xy] |= status;
@@ -811,41 +852,92 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
 
     s->error_status_table[start_xy] |= VP_START;
 
-    if (start_xy > 0 && s->avctx->thread_count <= 1 &&
-        s->avctx->skip_top * s->mb_width < start_i) {
+    if (start_xy > 0 && !(s->avctx->active_thread_type & FF_THREAD_SLICE) &&
+        er_supported(s) && s->avctx->skip_top * s->mb_width < start_i) {
         int prev_status = s->error_status_table[s->mb_index2xy[start_i - 1]];
 
         prev_status &= ~ VP_START;
-        if (prev_status != (ER_MV_END | ER_DC_END | ER_AC_END))
-            s->error_count = INT_MAX;
+        if (prev_status != (ER_MV_END | ER_DC_END | ER_AC_END)) {
+            s->error_occurred = 1;
+            avpriv_atomic_int_set(&s->error_count, INT_MAX);
+        }
     }
 }
 
 void ff_er_frame_end(ERContext *s)
 {
-    int *linesize = s->cur_pic.f->linesize;
+    int *linesize = NULL;
     int i, mb_x, mb_y, error, error_type, dc_error, mv_error, ac_error;
     int distance;
     int threshold_part[4] = { 100, 100, 100 };
     int threshold = 50;
     int is_intra_likely;
+    int size = s->b8_stride * 2 * s->mb_height;
 
     /* We do not support ER of field pictures yet,
      * though it should not crash if enabled. */
     if (!s->avctx->error_concealment || s->error_count == 0            ||
-        s->avctx->hwaccel                                              ||
-        !s->cur_pic.f                                                  ||
-        s->cur_pic.field_picture                                       ||
+        s->avctx->lowres                                               ||
+        !er_supported(s)                                               ||
         s->error_count == 3 * s->mb_width *
                           (s->avctx->skip_top + s->avctx->skip_bottom)) {
         return;
-    };
+    }
+    linesize = s->cur_pic.f->linesize;
+    for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
+        int status = s->error_status_table[mb_x + (s->mb_height - 1) * s->mb_stride];
+        if (status != 0x7F)
+            break;
+    }
 
-    if (!s->cur_pic.motion_val[0] || !s->cur_pic.ref_index[0]) {
-        av_log(s->avctx, AV_LOG_ERROR, "MVs not available, ER not possible.\n");
+    if (   mb_x == s->mb_width
+        && s->avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO
+        && (s->avctx->height&16)
+        && s->error_count == 3 * s->mb_width * (s->avctx->skip_top + s->avctx->skip_bottom + 1)
+    ) {
+        av_log(s->avctx, AV_LOG_DEBUG, "ignoring last missing slice\n");
         return;
     }
 
+    if (s->last_pic.f) {
+        if (s->last_pic.f->width  != s->cur_pic.f->width  ||
+            s->last_pic.f->height != s->cur_pic.f->height ||
+            s->last_pic.f->format != s->cur_pic.f->format) {
+            av_log(s->avctx, AV_LOG_WARNING, "Cannot use previous picture in error concealment\n");
+            memset(&s->last_pic, 0, sizeof(s->last_pic));
+        }
+    }
+    if (s->next_pic.f) {
+        if (s->next_pic.f->width  != s->cur_pic.f->width  ||
+            s->next_pic.f->height != s->cur_pic.f->height ||
+            s->next_pic.f->format != s->cur_pic.f->format) {
+            av_log(s->avctx, AV_LOG_WARNING, "Cannot use next picture in error concealment\n");
+            memset(&s->next_pic, 0, sizeof(s->next_pic));
+        }
+    }
+
+    if (!s->cur_pic.motion_val[0] || !s->cur_pic.ref_index[0]) {
+        av_log(s->avctx, AV_LOG_ERROR, "Warning MVs not available\n");
+
+        for (i = 0; i < 2; i++) {
+            s->ref_index_buf[i]  = av_buffer_allocz(s->mb_stride * s->mb_height * 4 * sizeof(uint8_t));
+            s->motion_val_buf[i] = av_buffer_allocz((size + 4) * 2 * sizeof(uint16_t));
+            if (!s->ref_index_buf[i] || !s->motion_val_buf[i])
+                break;
+            s->cur_pic.ref_index[i]  = s->ref_index_buf[i]->data;
+            s->cur_pic.motion_val[i] = (int16_t (*)[2])s->motion_val_buf[i]->data + 4;
+        }
+        if (i < 2) {
+            for (i = 0; i < 2; i++) {
+                av_buffer_unref(&s->ref_index_buf[i]);
+                av_buffer_unref(&s->motion_val_buf[i]);
+                s->cur_pic.ref_index[i]  = NULL;
+                s->cur_pic.motion_val[i] = NULL;
+            }
+            return;
+        }
+    }
+
     if (s->avctx->debug & FF_DEBUG_ER) {
         for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
             for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
@@ -857,6 +949,7 @@ void ff_er_frame_end(ERContext *s)
         }
     }
 
+#if 1
     /* handle overlapping slices */
     for (error_type = 1; error_type <= 3; error_type++) {
         int end_ok = 0;
@@ -877,7 +970,8 @@ void ff_er_frame_end(ERContext *s)
                 end_ok = 0;
         }
     }
-
+#endif
+#if 1
     /* handle slices with partitions of different length */
     if (s->partitioned_frame) {
         int end_ok = 0;
@@ -900,7 +994,7 @@ void ff_er_frame_end(ERContext *s)
                 end_ok = 0;
         }
     }
-
+#endif
     /* handle missing slices */
     if (s->avctx->err_recognition & AV_EF_EXPLODE) {
         int end_ok = 1;
@@ -927,6 +1021,7 @@ void ff_er_frame_end(ERContext *s)
         }
     }
 
+#if 1
     /* backward mark errors */
     distance = 9999999;
     for (error_type = 1; error_type <= 3; error_type++) {
@@ -934,7 +1029,7 @@ void ff_er_frame_end(ERContext *s)
             const int mb_xy = s->mb_index2xy[i];
             int       error = s->error_status_table[mb_xy];
 
-            if (s->mbskip_table && !s->mbskip_table[mb_xy]) // FIXME partition specific
+            if (!s->mbskip_table || !s->mbskip_table[mb_xy]) // FIXME partition specific
                 distance++;
             if (error & (1 << error_type))
                 distance = 0;
@@ -951,6 +1046,7 @@ void ff_er_frame_end(ERContext *s)
                 distance = 9999999;
         }
     }
+#endif
 
     /* forward mark errors */
     error = 0;
@@ -965,22 +1061,23 @@ void ff_er_frame_end(ERContext *s)
             s->error_status_table[mb_xy] |= error;
         }
     }
-
+#if 1
     /* handle not partitioned case */
     if (!s->partitioned_frame) {
         for (i = 0; i < s->mb_num; i++) {
             const int mb_xy = s->mb_index2xy[i];
-            error = s->error_status_table[mb_xy];
+            int error = s->error_status_table[mb_xy];
             if (error & ER_MB_ERROR)
                 error |= ER_MB_ERROR;
             s->error_status_table[mb_xy] = error;
         }
     }
+#endif
 
     dc_error = ac_error = mv_error = 0;
     for (i = 0; i < s->mb_num; i++) {
         const int mb_xy = s->mb_index2xy[i];
-        error = s->error_status_table[mb_xy];
+        int error = s->error_status_table[mb_xy];
         if (error & ER_DC_ERROR)
             dc_error++;
         if (error & ER_AC_ERROR)
@@ -988,15 +1085,15 @@ void ff_er_frame_end(ERContext *s)
         if (error & ER_MV_ERROR)
             mv_error++;
     }
-    av_log(s->avctx, AV_LOG_INFO, "concealing %d DC, %d AC, %d MV errors\n",
-           dc_error, ac_error, mv_error);
+    av_log(s->avctx, AV_LOG_INFO, "concealing %d DC, %d AC, %d MV errors in %c frame\n",
+           dc_error, ac_error, mv_error, av_get_picture_type_char(s->cur_pic.f->pict_type));
 
     is_intra_likely = is_intra_more_likely(s);
 
     /* set unknown mb-type to most likely */
     for (i = 0; i < s->mb_num; i++) {
         const int mb_xy = s->mb_index2xy[i];
-        error = s->error_status_table[mb_xy];
+        int error = s->error_status_table[mb_xy];
         if (!((error & ER_DC_ERROR) && (error & ER_MV_ERROR)))
             continue;
 
@@ -1024,7 +1121,7 @@ void ff_er_frame_end(ERContext *s)
             const int mv_dir  = dir ? MV_DIR_BACKWARD : MV_DIR_FORWARD;
             int mv_type;
 
-            error = s->error_status_table[mb_xy];
+            int error = s->error_status_table[mb_xy];
 
             if (IS_INTRA(mb_type))
                 continue; // intra
@@ -1061,7 +1158,7 @@ void ff_er_frame_end(ERContext *s)
                 const int mb_type = s->cur_pic.mb_type[mb_xy];
                 int mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
 
-                error = s->error_status_table[mb_xy];
+                int error = s->error_status_table[mb_xy];
 
                 if (IS_INTRA(mb_type))
                     continue;
@@ -1079,6 +1176,7 @@ void ff_er_frame_end(ERContext *s)
                     int time_pp = s->pp_time;
                     int time_pb = s->pb_time;
 
+                    av_assert0(s->avctx->codec_id != AV_CODEC_ID_H264);
                     ff_thread_await_progress(s->next_pic.tf, mb_y, 0);
 
                     s->mv[0][0][0] = s->next_pic.motion_val[0][xy][0] *  time_pb            / time_pp;
@@ -1099,13 +1197,9 @@ void ff_er_frame_end(ERContext *s)
     } else
         guess_mv(s);
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    /* the filters below are not XvMC compatible, skip them */
-    if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration)
+    /* the filters below manipulate raw image, skip them */
+    if (CONFIG_XVMC && s->avctx->hwaccel && s->avctx->hwaccel->decode_mb)
         goto ec_clean;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
     /* fill DC for inter blocks */
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
@@ -1115,7 +1209,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             const int mb_xy   = mb_x + mb_y * s->mb_stride;
             const int mb_type = s->cur_pic.mb_type[mb_xy];
 
-            error = s->error_status_table[mb_xy];
+            // error = s->error_status_table[mb_xy];
 
             if (IS_INTRA(mb_type) && s->partitioned_frame)
                 continue;
@@ -1138,6 +1232,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 dc_ptr[(n & 1) + (n >> 1) * s->b8_stride] = (dc + 4) >> 3;
             }
 
+            if (!s->cur_pic.f->data[2])
+                continue;
+
             dcu = dcv = 0;
             for (y = 0; y < 8; y++) {
                 int x;
@@ -1150,15 +1247,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
             s->dc_val[2][mb_x + mb_y * s->mb_stride] = (dcv + 4) >> 3;
         }
     }
-
+#if 1
     /* guess DC for damaged blocks */
-    guess_dc(s, s->dc_val[0], s->mb_width * 2, s->mb_height * 2, s->b8_stride, 1);
-    guess_dc(s, s->dc_val[1], s->mb_width, s->mb_height, s->mb_stride, 0);
-    guess_dc(s, s->dc_val[2], s->mb_width, s->mb_height, s->mb_stride, 0);
+    guess_dc(s, s->dc_val[0], s->mb_width*2, s->mb_height*2, s->b8_stride, 1);
+    guess_dc(s, s->dc_val[1], s->mb_width  , s->mb_height  , s->mb_stride, 0);
+    guess_dc(s, s->dc_val[2], s->mb_width  , s->mb_height  , s->mb_stride, 0);
+#endif
 
     /* filter luma DC */
     filter181(s->dc_val[0], s->mb_width * 2, s->mb_height * 2, s->b8_stride);
 
+#if 1
     /* render DC only intra */
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
@@ -1166,7 +1265,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             const int mb_xy   = mb_x + mb_y * s->mb_stride;
             const int mb_type = s->cur_pic.mb_type[mb_xy];
 
-            error = s->error_status_table[mb_xy];
+            int error = s->error_status_table[mb_xy];
 
             if (IS_INTER(mb_type))
                 continue;
@@ -1176,27 +1275,33 @@ FF_ENABLE_DEPRECATION_WARNINGS
             dest_y  = s->cur_pic.f->data[0] + mb_x * 16 + mb_y * 16 * linesize[0];
             dest_cb = s->cur_pic.f->data[1] + mb_x *  8 + mb_y *  8 * linesize[1];
             dest_cr = s->cur_pic.f->data[2] + mb_x *  8 + mb_y *  8 * linesize[2];
+            if (!s->cur_pic.f->data[2])
+                dest_cb = dest_cr = NULL;
 
             put_dc(s, dest_y, dest_cb, dest_cr, mb_x, mb_y);
         }
     }
+#endif
 
     if (s->avctx->error_concealment & FF_EC_DEBLOCK) {
         /* filter horizontal block boundaries */
         h_block_filter(s, s->cur_pic.f->data[0], s->mb_width * 2,
                        s->mb_height * 2, linesize[0], 1);
-        h_block_filter(s, s->cur_pic.f->data[1], s->mb_width,
-                       s->mb_height, linesize[1], 0);
-        h_block_filter(s, s->cur_pic.f->data[2], s->mb_width,
-                       s->mb_height, linesize[2], 0);
 
         /* filter vertical block boundaries */
         v_block_filter(s, s->cur_pic.f->data[0], s->mb_width * 2,
                        s->mb_height * 2, linesize[0], 1);
-        v_block_filter(s, s->cur_pic.f->data[1], s->mb_width,
-                       s->mb_height, linesize[1], 0);
-        v_block_filter(s, s->cur_pic.f->data[2], s->mb_width,
-                       s->mb_height, linesize[2], 0);
+
+        if (s->cur_pic.f->data[2]) {
+            h_block_filter(s, s->cur_pic.f->data[1], s->mb_width,
+                        s->mb_height, linesize[1], 0);
+            h_block_filter(s, s->cur_pic.f->data[2], s->mb_width,
+                        s->mb_height, linesize[2], 0);
+            v_block_filter(s, s->cur_pic.f->data[1], s->mb_width,
+                        s->mb_height, linesize[1], 0);
+            v_block_filter(s, s->cur_pic.f->data[2], s->mb_width,
+                        s->mb_height, linesize[2], 0);
+        }
     }
 
 ec_clean:
@@ -1213,6 +1318,13 @@ ec_clean:
             s->mbintra_table[mb_xy] = 1;
     }
 
+    for (i = 0; i < 2; i++) {
+        av_buffer_unref(&s->ref_index_buf[i]);
+        av_buffer_unref(&s->motion_val_buf[i]);
+        s->cur_pic.ref_index[i]  = NULL;
+        s->cur_pic.motion_val[i] = NULL;
+    }
+
     memset(&s->cur_pic, 0, sizeof(ERPicture));
     memset(&s->last_pic, 0, sizeof(ERPicture));
     memset(&s->next_pic, 0, sizeof(ERPicture));
diff --git a/libavcodec/error_resilience.h b/libavcodec/error_resilience.h
index 611b529311..1f52f200f2 100644
--- a/libavcodec/error_resilience.h
+++ b/libavcodec/error_resilience.h
@@ -1,19 +1,19 @@
 /*
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,7 +42,7 @@ typedef struct ERPicture {
     AVFrame *f;
     ThreadFrame *tf;
 
-    // it's the caller responsability to allocate these buffers
+    // it's the caller's responsibility to allocate these buffers
     int16_t (*motion_val[2])[2];
     int8_t *ref_index[2];
 
@@ -61,7 +61,8 @@ typedef struct ERContext {
     int mb_stride;
     int b8_stride;
 
-    int error_count, error_occurred;
+    volatile int error_count;
+    int error_occurred;
     uint8_t *error_status_table;
     uint8_t *er_temp_buffer;
     int16_t *dc_val[3];
@@ -73,6 +74,9 @@ typedef struct ERContext {
     ERPicture last_pic;
     ERPicture next_pic;
 
+    AVBufferRef *ref_index_buf[2];
+    AVBufferRef *motion_val_buf[2];
+
     uint16_t pp_time;
     uint16_t pb_time;
     int quarter_sample;
diff --git a/libavcodec/escape124.c b/libavcodec/escape124.c
index f652198e5b..efcac64128 100644
--- a/libavcodec/escape124.c
+++ b/libavcodec/escape124.c
@@ -2,20 +2,20 @@
  * Escape 124 Video Decoder
  * Copyright (C) 2008 Eli Friedman (eli.friedman@gmail.com)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,10 +49,6 @@ typedef struct Escape124Context {
     CodeBook codebooks[3];
 } Escape124Context;
 
-static int can_safely_read(GetBitContext* gb, int bits) {
-    return get_bits_left(gb) >= bits;
-}
-
 /**
  * Initialize the decoder
  * @param avctx decoder context
@@ -80,7 +76,7 @@ static av_cold int escape124_decode_close(AVCodecContext *avctx)
     Escape124Context *s = avctx->priv_data;
 
     for (i = 0; i < 3; i++)
-        av_free(s->codebooks[i].blocks);
+        av_freep(&s->codebooks[i].blocks);
 
     av_frame_free(&s->frame);
 
@@ -93,7 +89,7 @@ static CodeBook unpack_codebook(GetBitContext* gb, unsigned depth,
     unsigned i, j;
     CodeBook cb = { 0 };
 
-    if (!can_safely_read(gb, size * 34))
+    if (size >= INT_MAX / 34 || get_bits_left(gb) < size * 34)
         return cb;
 
     if (size >= INT_MAX / sizeof(MacroBlock))
@@ -124,7 +120,7 @@ static unsigned decode_skip_count(GetBitContext* gb)
     unsigned value;
     // This function reads a maximum of 23 bits,
     // which is within the padding space
-    if (!can_safely_read(gb, 1))
+    if (get_bits_left(gb) < 1)
         return -1;
     value = get_bits1(gb);
     if (!value)
@@ -204,7 +200,6 @@ static int escape124_decode_frame(AVCodecContext *avctx,
                                   void *data, int *got_frame,
                                   AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     Escape124Context *s = avctx->priv_data;
     AVFrame *frame = data;
@@ -219,13 +214,15 @@ static int escape124_decode_frame(AVCodecContext *avctx,
 
     uint16_t* old_frame_data, *new_frame_data;
     unsigned old_stride, new_stride;
+
     int ret;
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
 
     // This call also guards the potential depth reads for the
     // codebook unpacking.
-    if (!can_safely_read(&gb, 64))
+    if (get_bits_left(&gb) < 64)
         return -1;
 
     frame_flags = get_bits_long(&gb, 32);
@@ -237,7 +234,7 @@ static int escape124_decode_frame(AVCodecContext *avctx,
         if (!s->frame->data[0])
             return AVERROR_INVALIDDATA;
 
-        av_log(NULL, AV_LOG_DEBUG, "Skipping frame\n");
+        av_log(avctx, AV_LOG_DEBUG, "Skipping frame\n");
 
         *got_frame = 1;
         if ((ret = av_frame_ref(frame, s->frame)) < 0)
@@ -267,17 +264,15 @@ static int escape124_decode_frame(AVCodecContext *avctx,
                     cb_size = s->num_superblocks << cb_depth;
                 }
             }
-            av_free(s->codebooks[i].blocks);
+            av_freep(&s->codebooks[i].blocks);
             s->codebooks[i] = unpack_codebook(&gb, cb_depth, cb_size);
             if (!s->codebooks[i].blocks)
                 return -1;
         }
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     new_frame_data = (uint16_t*)frame->data[0];
     new_stride = frame->linesize[0] / 2;
@@ -303,7 +298,7 @@ static int escape124_decode_frame(AVCodecContext *avctx,
             copy_superblock(sb.pixels, 8,
                             old_frame_data, old_stride);
 
-            while (can_safely_read(&gb, 1) && !get_bits1(&gb)) {
+            while (get_bits_left(&gb) >= 1 && !get_bits1(&gb)) {
                 unsigned mask;
                 mb = decode_macroblock(s, &gb, &cb_index, superblock_index);
                 mask = get_bits(&gb, 16);
@@ -315,7 +310,7 @@ static int escape124_decode_frame(AVCodecContext *avctx,
                 }
             }
 
-            if (can_safely_read(&gb, 1) && !get_bits1(&gb)) {
+            if (!get_bits1(&gb)) {
                 unsigned inv_mask = get_bits(&gb, 4);
                 for (i = 0; i < 4; i++) {
                     if (inv_mask & (1 << i)) {
@@ -327,15 +322,13 @@ static int escape124_decode_frame(AVCodecContext *avctx,
 
                 for (i = 0; i < 16; i++) {
                     if (multi_mask & mask_matrix[i]) {
-                        if (!can_safely_read(&gb, 1))
-                            break;
                         mb = decode_macroblock(s, &gb, &cb_index,
                                                superblock_index);
                         insert_mb_into_sb(&sb, mb, i);
                     }
                 }
             } else if (frame_flags & (1 << 16)) {
-                while (can_safely_read(&gb, 1) && !get_bits1(&gb)) {
+                while (get_bits_left(&gb) >= 1 && !get_bits1(&gb)) {
                     mb = decode_macroblock(s, &gb, &cb_index, superblock_index);
                     insert_mb_into_sb(&sb, mb, get_bits(&gb, 4));
                 }
@@ -357,7 +350,7 @@ static int escape124_decode_frame(AVCodecContext *avctx,
         skip--;
     }
 
-    av_log(NULL, AV_LOG_DEBUG,
+    av_log(avctx, AV_LOG_DEBUG,
            "Escape sizes: %i, %i, %i\n",
            frame_size, buf_size, get_bits_count(&gb) / 8);
 
diff --git a/libavcodec/escape130.c b/libavcodec/escape130.c
index f134138f77..f4f64d8419 100644
--- a/libavcodec/escape130.c
+++ b/libavcodec/escape130.c
@@ -2,20 +2,20 @@
  * Escape 130 video decoder
  * Copyright (C) 2008 Eli Friedman (eli.friedman <at> gmail.com)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,7 +105,7 @@ static const int8_t chroma_adjust[2][8] = {
     { 0, 1, 1,  1,  0, -1, -1, -1 }
 };
 
-const uint8_t chroma_vals[] = {
+static const uint8_t chroma_vals[] = {
      20,  28,  36,  44,  52,  60,  68,  76,
      84,  92, 100, 106, 112, 116, 120, 124,
     128, 132, 136, 140, 144, 150, 156, 164,
@@ -166,6 +166,9 @@ static int decode_skip_count(GetBitContext* gb)
 {
     int value;
 
+    if (get_bits_left(gb) < 1+3)
+        return -1;
+
     value = get_bits1(gb);
     if (value)
         return 0;
@@ -188,7 +191,6 @@ static int decode_skip_count(GetBitContext* gb)
 static int escape130_decode_frame(AVCodecContext *avctx, void *data,
                                   int *got_frame, AVPacket *avpkt)
 {
-    const uint8_t *buf  = avpkt->data;
     int buf_size        = avpkt->size;
     Escape130Context *s = avctx->priv_data;
     AVFrame *pic        = data;
@@ -215,7 +217,9 @@ static int escape130_decode_frame(AVCodecContext *avctx, void *data,
     if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
         return ret;
 
-    init_get_bits(&gb, buf + 16, (buf_size - 16) * 8);
+    if ((ret = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
+    skip_bits_long(&gb, 16 * 8);
 
     new_y  = s->new_y;
     new_cb = s->new_u;
diff --git a/libavcodec/evrcdata.h b/libavcodec/evrcdata.h
new file mode 100644
index 0000000000..8cfc2028dd
--- /dev/null
+++ b/libavcodec/evrcdata.h
@@ -0,0 +1,1499 @@
+/*
+ * Enhanced Variable Rate Codec, Service Option 3 decoder
+ * Copyright (c) 2013 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_EVRCDATA_H
+#define AVCODEC_EVRCDATA_H
+
+/**
+ * @file
+ * Data tables for the EVRC decoder
+ * @author Paul B Mahol
+ */
+
+#include "libavutil/common.h"
+
+/**
+ * Rate 1/8 frame energy quantization
+ *
+ * TIA/IS-127 table 8-18
+ */
+static const float evrc_energy_quant[][3] = {
+{-0.2464E-01,-0.4005E-02,-0.1107E+00 }, { 0.8734E+00, 0.1004E+01, 0.9930E+00 },
+{ 0.4222E+00, 0.3894E+00, 0.5020E+00 }, { 0.1450E+01, 0.1328E+01, 0.1278E+01 },
+{ 0.1957E+00, 0.2169E+00, 0.2735E+00 }, { 0.1142E+01, 0.1240E+01, 0.1157E+01 },
+{ 0.7881E+00, 0.6778E+00, 0.4185E+00 }, { 0.1504E+01, 0.1468E+01, 0.1534E+01 },
+{ 0.3173E+00, 0.2693E+00,-0.9526E-01 }, { 0.1141E+01, 0.1154E+01, 0.1044E+01 },
+{ 0.5147E+00, 0.5784E+00, 0.8802E+00 }, { 0.1502E+01, 0.1407E+01, 0.1409E+01 },
+{ 0.3163E+00, 0.3592E+00, 0.2830E+00 }, { 0.1217E+01, 0.1213E+01, 0.1216E+01 },
+{ 0.1023E+01, 0.1139E+01,-0.9526E-01 }, { 0.1619E+01, 0.1655E+01, 0.1642E+01 },
+{ 0.1437E+00, 0.1505E+00, 0.6838E-01 }, { 0.9794E+00, 0.1021E+01, 0.1117E+01 },
+{ 0.4701E+00, 0.6426E+00, 0.5519E+00 }, { 0.1366E+01, 0.1397E+01, 0.1406E+01 },
+{ 0.2918E+00, 0.3022E+00, 0.2420E+00 }, { 0.1309E+01, 0.1241E+01, 0.1220E+01 },
+{ 0.7989E+00, 0.7654E+00, 0.7391E+00 }, { 0.1612E+01, 0.1502E+01, 0.1447E+01 },
+{ 0.2594E+00, 0.1948E+00, 0.2555E+00 }, { 0.1091E+01, 0.1150E+01, 0.1272E+01 },
+{ 0.3423E+00, 0.4150E+00, 0.1294E+01 }, { 0.1729E+01, 0.1377E+01, 0.1065E+01 },
+{ 0.4103E+00, 0.3287E+00, 0.3228E+00 }, { 0.1144E+01, 0.1281E+01, 0.1416E+01 },
+{ 0.1047E+01, 0.1117E+01, 0.6188E+00 }, { 0.1914E+01, 0.1777E+01, 0.1516E+01 },
+{-0.2117E-01, 0.2159E+00, 0.2351E+00 }, { 0.1093E+01, 0.1088E+01, 0.1026E+01 },
+{ 0.5567E+00, 0.5092E+00, 0.4654E+00 }, { 0.1510E+01, 0.1449E+01, 0.1201E+01 },
+{ 0.2362E+00, 0.3426E+00, 0.2549E+00 }, { 0.1340E+01, 0.1225E+01, 0.1117E+01 },
+{ 0.1203E+01, 0.3819E+00, 0.2269E+00 }, { 0.1373E+01, 0.1404E+01, 0.1830E+01 },
+{ 0.2570E+00, 0.2668E+00, 0.1636E+00 }, { 0.1219E+01, 0.1098E+01, 0.1122E+01 },
+{ 0.6985E+00, 0.8456E+00, 0.1069E+01 }, { 0.1550E+01, 0.1501E+01, 0.1388E+01 },
+{ 0.2870E+00, 0.3060E+00, 0.3599E+00 }, { 0.1178E+01, 0.1345E+01, 0.1302E+01 },
+{ 0.1270E+01, 0.1215E+01, 0.1812E+00 }, { 0.1725E+01, 0.1777E+01, 0.1693E+01 },
+{ 0.2074E+00, 0.2104E+00, 0.1539E+00 }, { 0.1105E+01, 0.1034E+01, 0.1104E+01 },
+{ 0.6683E+00, 0.6646E+00, 0.6639E+00 }, { 0.1403E+01, 0.1462E+01, 0.1435E+01 },
+{ 0.3389E+00, 0.3754E+00, 0.2150E+00 }, { 0.1288E+01, 0.1325E+01, 0.1257E+01 },
+{ 0.8933E+00, 0.8253E+00, 0.8133E+00 }, { 0.1555E+01, 0.1579E+01, 0.1565E+01 },
+{ 0.3264E+00, 0.2434E+00, 0.2852E+00 }, { 0.1242E+01, 0.1180E+01, 0.1202E+01 },
+{ 0.1314E+00, 0.1698E+00, 0.1646E+01 }, { 0.1797E+01, 0.1597E+01, 0.1241E+01 },
+{ 0.4721E+00, 0.5346E+00, 0.3066E+00 }, { 0.1274E+01, 0.1401E+01, 0.1351E+01 },
+{ 0.1455E+01, 0.1386E+01, 0.6430E+00 }, { 0.1828E+01, 0.1867E+01, 0.1825E+01 },
+{-0.3265E+00,-0.2956E+00,-0.2462E+00 }, { 0.1035E+01, 0.1020E+01, 0.1003E+01 },
+{ 0.3702E+00, 0.4307E+00, 0.7072E+00 }, { 0.1424E+01, 0.1345E+01, 0.1352E+01 },
+{ 0.2267E+00, 0.2680E+00, 0.3037E+00 }, { 0.1235E+01, 0.1249E+01, 0.1146E+01 },
+{ 0.9944E+00, 0.6485E+00, 0.5248E+00 }, { 0.1539E+01, 0.1492E+01, 0.1612E+01 },
+{ 0.3815E+00, 0.3360E+00,-0.9526E-01 }, { 0.1163E+01, 0.1144E+01, 0.1117E+01 },
+{ 0.6734E+00, 0.7656E+00, 0.1014E+01 }, { 0.1568E+01, 0.1438E+01, 0.1455E+01 },
+{ 0.3409E+00, 0.3317E+00, 0.3856E+00 }, { 0.1180E+01, 0.1284E+01, 0.1284E+01 },
+{ 0.1244E+01, 0.1214E+01,-0.9526E-01 }, { 0.1753E+01, 0.1598E+01, 0.1744E+01 },
+{ 0.1548E+00, 0.1388E+00, 0.2020E+00 }, { 0.1027E+01, 0.1133E+01, 0.1093E+01 },
+{ 0.3906E+00, 0.7505E+00, 0.5705E+00 }, { 0.1420E+01, 0.1357E+01, 0.1543E+01 },
+{ 0.3252E+00, 0.3136E+00, 0.2804E+00 }, { 0.1351E+01, 0.1309E+01, 0.1224E+01 },
+{ 0.8781E+00, 0.8095E+00, 0.7109E+00 }, { 0.1614E+01, 0.1580E+01, 0.1433E+01 },
+{ 0.3222E+00, 0.2298E+00, 0.2157E+00 }, { 0.1216E+01, 0.1077E+01, 0.1247E+01 },
+{ 0.1363E+01, 0.1280E+01, 0.1317E+01 }, { 0.1751E+01, 0.1457E+01, 0.1182E+01 },
+{ 0.4428E+00, 0.4082E+00, 0.3181E+00 }, { 0.1157E+01, 0.1227E+01, 0.1604E+01 },
+{ 0.1286E+01, 0.1268E+01, 0.8167E+00 }, { 0.1994E+01, 0.2018E+01, 0.1307E+01 },
+{ 0.2671E-01, 0.2594E+00, 0.3397E+00 }, { 0.1164E+01, 0.1080E+01, 0.9321E+00 },
+{ 0.5998E+00, 0.6076E+00, 0.5081E+00 }, { 0.1442E+01, 0.1442E+01, 0.1375E+01 },
+{ 0.2390E+00, 0.3554E+00, 0.3426E+00 }, { 0.1287E+01, 0.1307E+01, 0.1144E+01 },
+{ 0.1200E+01, 0.7495E+00, 0.3967E+00 }, { 0.1561E+01, 0.1517E+01, 0.1898E+01 },
+{ 0.3598E+00, 0.3463E+00, 0.1200E+00 }, { 0.1298E+01, 0.1125E+01, 0.1062E+01 },
+{ 0.7577E+00, 0.1013E+01, 0.1194E+01 }, { 0.1537E+01, 0.1513E+01, 0.1464E+01 },
+{ 0.4041E+00, 0.4038E+00, 0.3897E+00 }, { 0.1293E+01, 0.1219E+01, 0.1378E+01 },
+{ 0.1250E+01, 0.1391E+01, 0.2451E+00 }, { 0.1558E+01, 0.1764E+01, 0.1728E+01 },
+{ 0.2700E+00, 0.1894E+00, 0.1924E+00 }, { 0.1111E+01, 0.1112E+01, 0.1173E+01 },
+{ 0.7579E+00, 0.8342E+00, 0.4781E+00 }, { 0.1464E+01, 0.1477E+01, 0.1469E+01 },
+{ 0.4001E+00, 0.3104E+00, 0.2217E+00 }, { 0.1346E+01, 0.1421E+01, 0.1312E+01 },
+{ 0.1071E+01, 0.8967E+00, 0.7511E+00 }, { 0.1616E+01, 0.1551E+01, 0.1574E+01 },
+{ 0.3329E+00, 0.2785E+00, 0.3140E+00 }, { 0.1281E+01, 0.1209E+01, 0.1239E+01 },
+{ 0.2805E+00, 0.2687E+00, 0.1646E+01 }, { 0.1814E+01, 0.1514E+01, 0.1510E+01 },
+{ 0.6231E+00, 0.4200E+00, 0.3701E+00 }, { 0.1255E+01, 0.1429E+01, 0.1454E+01 },
+{ 0.1642E+01, 0.1581E+01, 0.7112E+00 }, { 0.1844E+01, 0.1963E+01, 0.1895E+01 },
+{-0.4208E-01,-0.1491E+00,-0.7639E-01 }, { 0.1046E+01, 0.9598E+00, 0.9176E+00 },
+{ 0.4478E+00, 0.4605E+00, 0.5111E+00 }, { 0.1521E+01, 0.1292E+01, 0.1342E+01 },
+{ 0.2220E+00, 0.2549E+00, 0.2510E+00 }, { 0.1186E+01, 0.1254E+01, 0.1171E+01 },
+{ 0.8999E+00, 0.4960E+00, 0.4943E+00 }, { 0.1423E+01, 0.1484E+01, 0.1620E+01 },
+{ 0.2796E+00, 0.2778E+00,-0.2820E+00 }, { 0.1170E+01, 0.1181E+01, 0.1076E+01 },
+{ 0.4068E+00, 0.8541E+00, 0.9352E+00 }, { 0.1584E+01, 0.1416E+01, 0.1387E+01 },
+{ 0.3325E+00, 0.3655E+00, 0.3340E+00 }, { 0.1224E+01, 0.1257E+01, 0.1245E+01 },
+{ 0.1061E+01, 0.1138E+01,-0.9526E-01 }, { 0.1681E+01, 0.1704E+01, 0.1673E+01 },
+{ 0.1932E+00, 0.1489E+00, 0.1258E+00 }, { 0.1023E+01, 0.1088E+01, 0.1145E+01 },
+{ 0.5190E+00, 0.6873E+00, 0.5172E+00 }, { 0.1380E+01, 0.1405E+01, 0.1474E+01 },
+{ 0.3393E+00, 0.3100E+00, 0.2231E+00 }, { 0.1354E+01, 0.1249E+01, 0.1270E+01 },
+{ 0.7363E+00, 0.8508E+00, 0.8247E+00 }, { 0.1612E+01, 0.1537E+01, 0.1509E+01 },
+{ 0.2952E+00, 0.2053E+00, 0.2590E+00 }, { 0.1138E+01, 0.1219E+01, 0.1262E+01 },
+{ 0.1345E+01, 0.1289E+01, 0.1338E+01 }, { 0.1437E+01, 0.1360E+01, 0.1442E+01 },
+{ 0.4826E+00, 0.3298E+00, 0.3842E+00 }, { 0.1219E+01, 0.1311E+01, 0.1413E+01 },
+{ 0.1212E+01, 0.1186E+01, 0.6357E+00 }, { 0.1873E+01, 0.1939E+01, 0.1674E+01 },
+{ 0.1260E+01, 0.1306E+01, 0.1368E+01 }, { 0.1146E+01, 0.1077E+01, 0.1025E+01 },
+{ 0.6029E+00, 0.5039E+00, 0.5781E+00 }, { 0.1514E+01, 0.1420E+01, 0.1324E+01 },
+{ 0.2652E+00, 0.3192E+00, 0.3042E+00 }, { 0.1368E+01, 0.1198E+01, 0.1200E+01 },
+{ 0.1234E+01, 0.4910E+00, 0.3464E-01 }, { 0.1347E+01, 0.1560E+01, 0.1861E+01 },
+{ 0.2766E+00, 0.2887E+00, 0.2029E+00 }, { 0.1257E+01, 0.1105E+01, 0.1145E+01 },
+{ 0.1351E+01, 0.1353E+01, 0.1406E+01 }, { 0.1506E+01, 0.1580E+01, 0.1362E+01 },
+{ 0.2794E+00, 0.3868E+00, 0.4277E+00 }, { 0.1234E+01, 0.1334E+01, 0.1336E+01 },
+{ 0.1280E+01, 0.1252E+01, 0.1805E+00 }, { 0.1387E+01, 0.1396E+01, 0.1434E+01 },
+{ 0.2902E+00, 0.1170E+00, 0.1698E+00 }, { 0.1134E+01, 0.1077E+01, 0.1117E+01 },
+{ 0.6986E+00, 0.7177E+00, 0.7366E+00 }, { 0.1370E+01, 0.1491E+01, 0.1495E+01 },
+{ 0.4031E+00, 0.5144E+00, 0.1751E+00 }, { 0.1333E+01, 0.1377E+01, 0.1257E+01 },
+{ 0.9212E+00, 0.8934E+00, 0.8897E+00 }, { 0.1589E+01, 0.1614E+01, 0.1523E+01 },
+{ 0.3152E+00, 0.2164E+00, 0.3230E+00 }, { 0.1300E+01, 0.1145E+01, 0.1212E+01 },
+{ 0.1269E+01, 0.1245E+01, 0.1497E+01 }, { 0.1763E+01, 0.1716E+01, 0.1311E+01 },
+{ 0.4702E+00, 0.5422E+00, 0.4306E+00 }, { 0.1342E+01, 0.1433E+01, 0.1423E+01 },
+{ 0.1472E+01, 0.1404E+01, 0.8371E+00 }, { 0.1936E+01, 0.1883E+01, 0.1838E+01 },
+{ 0.1266E+01, 0.1295E+01, 0.1302E+01 }, { 0.1074E+01, 0.1002E+01, 0.1023E+01 },
+{ 0.5206E+00, 0.4045E+00, 0.6549E+00 }, { 0.1457E+01, 0.1378E+01, 0.1363E+01 },
+{ 0.2715E+00, 0.2629E+00, 0.2841E+00 }, { 0.1264E+01, 0.1271E+01, 0.1175E+01 },
+{ 0.1337E+01, 0.1305E+01, 0.1306E+01 }, { 0.1555E+01, 0.1571E+01, 0.1657E+01 },
+{ 0.3341E+00, 0.4147E+00,-0.3648E+00 }, { 0.1188E+01, 0.1185E+01, 0.1161E+01 },
+{ 0.6198E+00, 0.7208E+00, 0.1157E+01 }, { 0.1582E+01, 0.1465E+01, 0.1513E+01 },
+{ 0.3839E+00, 0.3651E+00, 0.3814E+00 }, { 0.1214E+01, 0.1256E+01, 0.1292E+01 },
+{ 0.1361E+01, 0.1363E+01, 0.1312E+01 }, { 0.1793E+01, 0.1693E+01, 0.1669E+01 },
+{ 0.1889E+00, 0.1275E+00, 0.2534E+00 }, { 0.1066E+01, 0.1174E+01, 0.1133E+01 },
+{ 0.4999E+00, 0.8207E+00, 0.5813E+00 }, { 0.1478E+01, 0.1416E+01, 0.1497E+01 },
+{ 0.3814E+00, 0.3138E+00, 0.2889E+00 }, { 0.1396E+01, 0.1265E+01, 0.1233E+01 },
+{ 0.9458E+00, 0.9161E+00, 0.5875E+00 }, { 0.1672E+01, 0.1632E+01, 0.1553E+01 },
+{ 0.3505E+00, 0.2525E+00, 0.2364E+00 }, { 0.1211E+01, 0.1138E+01, 0.1235E+01 },
+{ 0.1391E+01, 0.1231E+01, 0.1355E+01 }, { 0.1783E+01, 0.1510E+01, 0.1199E+01 },
+{ 0.4227E+00, 0.4548E+00, 0.3671E+00 }, { 0.1281E+01, 0.1254E+01, 0.1661E+01 },
+{ 0.1338E+01, 0.1379E+01, 0.9531E+00 }, { 0.2148E+01, 0.1965E+01, 0.1584E+01 },
+{ 0.9324E-01, 0.3575E+00, 0.3522E+00 }, { 0.1212E+01, 0.1086E+01, 0.1044E+01 },
+{ 0.6128E+00, 0.6136E+00, 0.6060E+00 }, { 0.1484E+01, 0.1507E+01, 0.1396E+01 },
+{ 0.2820E+00, 0.3848E+00, 0.3156E+00 }, { 0.1368E+01, 0.1287E+01, 0.1128E+01 },
+{ 0.1369E+01, 0.1352E+01, 0.1358E+01 }, { 0.1381E+01, 0.1765E+01, 0.2113E+01 },
+{ 0.1314E+01, 0.1345E+01, 0.1334E+01 }, { 0.1290E+01, 0.1172E+01, 0.1119E+01 },
+{ 0.1304E+01, 0.1377E+01, 0.1427E+01 }, { 0.1490E+01, 0.1540E+01, 0.1536E+01 },
+{ 0.3994E+00, 0.4402E+00, 0.4173E+00 }, { 0.1323E+01, 0.1307E+01, 0.1392E+01 },
+{ 0.1400E+01, 0.1388E+01, 0.1369E+01 }, { 0.1669E+01, 0.1818E+01, 0.1834E+01 },
+{ 0.2742E+00, 0.2235E+00, 0.1986E+00 }, { 0.1137E+01, 0.1139E+01, 0.1201E+01 },
+{ 0.1324E+01, 0.1385E+01, 0.1349E+01 }, { 0.1455E+01, 0.1574E+01, 0.1454E+01 },
+{ 0.5019E+00, 0.3255E+00, 0.2555E+00 }, { 0.1388E+01, 0.1438E+01, 0.1300E+01 },
+{ 0.1394E+01, 0.1349E+01, 0.1411E+01 }, { 0.1639E+01, 0.1580E+01, 0.1681E+01 },
+{ 0.3920E+00, 0.2498E+00, 0.3523E+00 }, { 0.1301E+01, 0.1221E+01, 0.1285E+01 },
+{ 0.1318E+01, 0.1342E+01, 0.1494E+01 }, { 0.1910E+01, 0.1680E+01, 0.1470E+01 },
+{ 0.6082E+00, 0.5270E+00, 0.4173E+00 }, { 0.1255E+01, 0.1477E+01, 0.1503E+01 },
+{ 0.1807E+01, 0.1742E+01, 0.6553E+00 }, { 0.2000E+01, 0.2072E+01, 0.2051E+01 }};
+
+/**
+ * LSP vector quantization tables
+ *
+ * TIA/IS-127 tables 8-1 through 8-9
+ */
+
+static const float evrc_lspq_full_codebook1[64][2] = {
+{1.42016308E-2, 1.93881616E-2}, {2.91667543E-2, 6.51749149E-2},
+{2.06693150E-2, 4.97564934E-2}, {3.94719802E-2, 9.55850929E-2},
+{2.27012448E-2, 3.96625809E-2}, {5.38789518E-2, 6.28347769E-2},
+{2.90525518E-2, 5.73435798E-2}, {4.48280610E-2, 1.15364626E-1},
+{1.94110647E-2, 3.46889682E-2}, {4.37502973E-2, 6.75228462E-2},
+{3.55497338E-2, 4.94086780E-2}, {6.99219853E-2, 8.67279768E-2},
+{2.77880151E-2, 4.65748496E-2}, {5.79111017E-2, 6.74542487E-2},
+{4.74664383E-2, 5.50271496E-2}, {7.88898915E-2, 1.22443043E-1},
+{2.21715886E-2, 3.02628800E-2}, {3.39134485E-2, 7.17703998E-2},
+{3.17989141E-2, 4.98996116E-2}, {6.11555986E-2, 8.73361230E-2},
+{2.67506503E-2, 3.96735854E-2}, {4.44100983E-2, 8.26731324E-2},
+{3.89172547E-2, 5.65788932E-2}, {6.04800619E-2, 1.04536951E-1},
+{2.69156620E-2, 3.57168876E-2}, {4.11117189E-2, 7.33322948E-2},
+{4.12660725E-2, 4.85165231E-2}, {7.18049556E-2, 1.06202349E-1},
+{3.38037871E-2, 4.24300395E-2}, {5.91818243E-2, 7.97467977E-2},
+{4.70107906E-2, 6.28563762E-2}, {9.42011923E-2, 1.30053163E-1},
+{1.94244273E-2, 2.72732340E-2}, {3.70831676E-2, 6.64898157E-2},
+{2.80136354E-2, 5.15984930E-2}, {5.34461029E-2, 9.25904214E-2},
+{2.54959203E-2, 4.32844795E-2}, {5.51860742E-2, 7.36182332E-2},
+{3.39851119E-2, 6.05329126E-2}, {6.18182123E-2, 1.34581268E-1},
+{2.35669166E-2, 3.55242006E-2}, {5.10804243E-2, 6.79562539E-2},
+{3.83464955E-2, 5.23469411E-2}, {7.44275749E-2, 9.66108292E-2},
+{3.18591148E-2, 4.62123118E-2}, {6.18909821E-2, 7.33231753E-2},
+{4.41718437E-2, 5.79240918E-2}, {7.93596208E-2, 1.41177371E-1},
+{2.47412287E-2, 3.23629379E-2}, {3.36563922E-2, 8.04650635E-2},
+{3.37943695E-2, 5.44977151E-2}, {6.53648973E-2, 9.52775925E-2},
+{2.93364152E-2, 4.28411029E-2}, {5.27870469E-2, 8.16159397E-2},
+{4.00724895E-2, 6.18144684E-2}, {6.75848573E-2, 1.17196076E-1},
+{3.03064957E-2, 3.86914052E-2}, {4.83106263E-2, 7.42383003E-2},
+{4.37548272E-2, 5.22842295E-2}, {8.32310021E-2, 1.09881967E-1},
+{3.75600643E-2, 4.53217216E-2}, {6.60113171E-2, 7.97580183E-2},
+{5.03225066E-2, 5.90176322E-2}, {8.77133310E-2, 1.63187444E-1}};
+
+static const float evrc_lspq_full_codebook2[64][2] = {
+{5.21959551E-2, 8.38445649E-2}, {1.05874076E-1, 1.28694162E-1},
+{5.48323877E-2, 1.33842856E-1}, {1.17768474E-1, 1.94037274E-1},
+{5.36086522E-2, 1.11398734E-1}, {1.19989693E-1, 1.47474691E-1},
+{8.00373554E-2, 1.42999724E-1}, {1.64086595E-1, 2.09821835E-1},
+{5.21059223E-2, 9.95229408E-2}, {8.67567956E-2, 1.85966507E-1},
+{7.77341127E-2, 1.31506845E-1}, {1.60545513E-1, 1.81930289E-1},
+{7.42243677E-2, 1.10437103E-1}, {1.18635088E-1, 1.75306752E-1},
+{6.61557764E-2, 1.64441928E-1}, {1.96810856E-1, 2.16682002E-1},
+{6.05317838E-2, 9.45408568E-2}, {1.06271386E-1, 1.48013934E-1},
+{5.87486550E-2, 1.47724584E-1}, {1.34816468E-1, 2.01517954E-1},
+{6.59698322E-2, 1.16447397E-1}, {1.32297173E-1, 1.53267249E-1},
+{9.26660746E-2, 1.46725491E-1}, {1.79285541E-1, 2.19705954E-1},
+{7.06458464E-2, 9.99924466E-2}, {1.06500491E-1, 1.79443434E-1},
+{8.79249722E-2, 1.25287697E-1}, {1.53640196E-1, 1.97852716E-1},
+{8.88430104E-2, 1.12465657E-1}, {1.48286715E-1, 1.67517021E-1},
+{8.16568136E-2, 1.69274017E-1}, {2.07810536E-1, 2.31033549E-1},
+{6.14927970E-2, 8.36263224E-2}, {1.14473253E-1, 1.36779979E-1},
+{6.87129870E-2, 1.38099059E-1}, {1.10511415E-1, 2.15352878E-1},
+{5.55652268E-2, 1.22242786E-1}, {1.20557591E-1, 1.61072448E-1},
+{8.32249671E-2, 1.55475482E-1}, {1.61638483E-1, 2.28268847E-1},
+{6.29152283E-2, 1.06229566E-1}, {8.29186887E-2, 2.06774518E-1},
+{8.84756893E-2, 1.35799959E-1}, {1.69772223E-1, 1.93773940E-1},
+{7.77297840E-2, 1.20287232E-1}, {1.30648017E-1, 1.84331819E-1},
+{6.91939592E-2, 1.84218004E-1}, {2.03904077E-1, 2.49715164E-1},
+{7.07671717E-2, 9.03186128E-2}, {1.08471557E-1, 1.61966518E-1},
+{7.16886371E-2, 1.51093170E-1}, {1.38779536E-1, 2.18801782E-1},
+{6.75907061E-2, 1.26740307E-1}, {1.33412346E-1, 1.68838874E-1},
+{9.61822569E-2, 1.58728704E-1}, {1.86485633E-1, 2.36560926E-1},
+{8.23447108E-2, 1.02126025E-1}, {1.00336641E-1, 1.94918498E-1},
+{9.95981991E-2, 1.36425093E-1}, {1.82448462E-1, 2.03655198E-1},
+{9.78890732E-2, 1.21145472E-1}, {1.45453140E-1, 1.83604524E-1},
+{9.58395451E-2, 1.72194853E-1}, {2.23295853E-1, 2.46418610E-1}};
+
+static const float evrc_lspq_full_codebook3[512][3] = {
+{1.36425778E-1, 1.68651849E-1, 2.04688221E-1},
+{1.85717627E-1, 2.28756160E-1, 2.51958042E-1},
+{1.22760192E-1, 1.85950696E-1, 2.79446691E-1},
+{1.96468458E-1, 2.64484435E-1, 2.89318889E-1},
+{1.25653744E-1, 1.50529265E-1, 2.76144296E-1},
+{1.96301565E-1, 2.41699994E-1, 2.88230687E-1},
+{1.40099391E-1, 2.22365588E-1, 2.74666578E-1},
+{2.59952307E-1, 2.75394946E-1, 3.10975939E-1},
+{1.58452198E-1, 1.88591003E-1, 2.07339197E-1},
+{1.95616230E-1, 2.21379519E-1, 2.87022918E-1},
+{1.69424579E-1, 2.01614648E-1, 2.75669187E-1},
+{2.12393746E-1, 2.64250666E-1, 3.17967504E-1},
+{1.82965085E-1, 1.99547559E-1, 2.29538843E-1},
+{2.15200707E-1, 2.62409419E-1, 2.82432705E-1},
+{1.46404549E-1, 2.36966729E-1, 2.90067106E-1},
+{2.45338634E-1, 3.03358108E-1, 3.42260152E-1},
+{1.37478963E-1, 1.58276558E-1, 2.39217222E-1},
+{2.01999024E-1, 2.20102608E-1, 2.69546896E-1},
+{1.18350029E-1, 2.30206400E-1, 2.83554822E-1},
+{2.25519255E-1, 2.72272140E-1, 3.06072980E-1},
+{1.35661438E-1, 1.91633970E-1, 2.65912026E-1},
+{1.95733085E-1, 2.31926173E-1, 3.14376086E-1},
+{1.67998984E-1, 2.27706313E-1, 2.76947826E-1},
+{2.50170559E-1, 3.01627070E-1, 3.21084231E-1},
+{1.33492306E-1, 2.01223105E-1, 2.33893991E-1},
+{2.06442133E-1, 2.38704175E-1, 2.77560145E-1},
+{1.79048792E-1, 1.95776582E-1, 2.80656606E-1},
+{2.06193641E-1, 2.64055401E-1, 3.33098441E-1},
+{1.75185278E-1, 1.91166341E-1, 2.57540315E-1},
+{2.28398636E-1, 2.45296657E-1, 3.08980793E-1},
+{1.80859819E-1, 2.43579060E-1, 2.96631068E-1},
+{2.76152968E-1, 3.08256060E-1, 3.46822590E-1},
+{1.37115732E-1, 1.80057764E-1, 2.20953465E-1},
+{1.81370094E-1, 2.26770103E-1, 2.70392686E-1},
+{1.25246510E-1, 1.79606944E-1, 3.10376436E-1},
+{1.90708354E-1, 2.87734240E-1, 3.13476235E-1},
+{1.30486086E-1, 1.60435289E-1, 3.00243706E-1},
+{1.97318628E-1, 2.56378502E-1, 2.78474301E-1},
+{1.58597067E-1, 2.37381399E-1, 2.62910336E-1},
+{2.61825919E-1, 2.77717203E-1, 3.31382245E-1},
+{1.64160743E-1, 1.85841531E-1, 2.35615849E-1},
+{2.09486142E-1, 2.21452802E-1, 2.92153865E-1},
+{1.66807845E-1, 2.13641763E-1, 2.70675927E-1},
+{2.29834273E-1, 2.88374633E-1, 3.06238323E-1},
+{1.82154253E-1, 2.00822473E-1, 2.40169376E-1},
+{2.24944726E-1, 2.69813925E-1, 2.91401237E-1},
+{1.63940564E-1, 2.50341147E-1, 2.78307766E-1},
+{2.56727993E-1, 2.95103759E-1, 3.53297085E-1},
+{1.40218839E-1, 1.76687688E-1, 2.46773273E-1},
+{2.15291306E-1, 2.29216009E-1, 2.64283627E-1},
+{1.21002659E-1, 2.18333840E-1, 3.22341293E-1},
+{2.54243195E-1, 2.73986191E-1, 2.96262473E-1},
+{1.60385415E-1, 1.83762908E-1, 2.81598717E-1},
+{1.87832162E-1, 2.37420350E-1, 3.29777509E-1},
+{1.77788362E-1, 2.26703495E-1, 3.02322537E-1},
+{2.75108218E-1, 2.93730587E-1, 3.12373787E-1},
+{1.70116410E-1, 1.85232103E-1, 2.46125028E-1},
+{2.21754774E-1, 2.39912242E-1, 2.86891907E-1},
+{1.95083722E-1, 2.08337873E-1, 2.88349718E-1},
+{2.37536535E-1, 2.75004476E-1, 3.39786023E-1},
+{1.88369319E-1, 2.04371840E-1, 2.57375032E-1},
+{2.47250155E-1, 2.60551840E-1, 3.02137524E-1},
+{1.66944191E-1, 2.46912360E-1, 3.18894416E-1},
+{2.78118610E-1, 3.13011140E-1, 3.65329295E-1},
+{1.45213529E-1, 1.63051456E-1, 2.24912614E-1},
+{2.05692515E-1, 2.20831484E-1, 2.52817810E-1},
+{1.21125661E-1, 1.96374118E-1, 3.00122708E-1},
+{2.15566799E-1, 2.65657336E-1, 2.99202889E-1},
+{1.09134212E-1, 1.78472102E-1, 2.88323194E-1},
+{2.03508541E-1, 2.40347922E-1, 2.96309739E-1},
+{1.53101787E-1, 2.25415319E-1, 2.84843713E-1},
+{2.50233442E-1, 2.77736932E-1, 3.24840695E-1},
+{1.66308925E-1, 1.94173396E-1, 2.11635381E-1},
+{2.01289460E-1, 2.26062179E-1, 2.93246478E-1},
+{1.49518773E-1, 2.14201719E-1, 2.83894747E-1},
+{2.21836135E-1, 2.85231501E-1, 3.20082635E-1},
+{1.89573213E-1, 2.06577629E-1, 2.30332345E-1},
+{2.31247649E-1, 2.46864259E-1, 2.89846569E-1},
+{1.39116928E-1, 2.59189934E-1, 2.98019558E-1},
+{2.44512573E-1, 2.82671362E-1, 3.61258298E-1},
+{1.22530967E-1, 1.68514788E-1, 2.70879298E-1},
+{2.04372838E-1, 2.30398357E-1, 2.71792918E-1},
+{1.42643943E-1, 2.22405583E-1, 2.92057186E-1},
+{2.42643669E-1, 2.77429372E-1, 2.97135502E-1},
+{1.52048603E-1, 1.96921080E-1, 2.61013240E-1},
+{2.17875019E-1, 2.45840371E-1, 3.08138579E-1},
+{1.90109268E-1, 2.31099129E-1, 2.80178159E-1},
+{2.54314184E-1, 2.94079810E-1, 3.39649171E-1},
+{1.56698599E-1, 2.08597451E-1, 2.28010774E-1},
+{2.25088730E-1, 2.50014484E-1, 2.76250154E-1},
+{1.78219035E-1, 1.98228240E-1, 3.04198891E-1},
+{2.08567217E-1, 2.92395383E-1, 3.46786886E-1},
+{1.71052113E-1, 2.03438759E-1, 2.62644321E-1},
+{2.30275467E-1, 2.58817524E-1, 3.11986536E-1},
+{1.85333565E-1, 2.45760202E-1, 3.10553998E-1},
+{2.89413869E-1, 3.11095625E-1, 3.46476167E-1},
+{1.50332406E-1, 1.67538226E-1, 2.40182847E-1},
+{1.79971650E-1, 2.37168610E-1, 2.60899693E-1},
+{1.49866179E-1, 1.97890073E-1, 3.07916552E-1},
+{2.10799649E-1, 2.88180083E-1, 3.29747230E-1},
+{1.31711140E-1, 1.65906459E-1, 3.22898000E-1},
+{2.14832023E-1, 2.52822131E-1, 2.97547072E-1},
+{1.83760419E-1, 2.37523615E-1, 2.74610013E-1},
+{2.55575180E-1, 2.75439233E-1, 3.46021861E-1},
+{1.82662204E-1, 1.99470907E-1, 2.16051653E-1},
+{2.09240332E-1, 2.22406715E-1, 3.02382857E-1},
+{1.84088245E-1, 2.11327791E-1, 2.82538086E-1},
+{2.41171077E-1, 2.97036022E-1, 3.15979272E-1},
+{1.96804658E-1, 2.11815894E-1, 2.41647676E-1},
+{2.42761984E-1, 2.58586556E-1, 2.93204397E-1},
+{1.58905461E-1, 2.65077025E-1, 2.89881319E-1},
+{2.58060575E-1, 3.18903178E-1, 3.47846836E-1},
+{1.48766384E-1, 1.66853935E-1, 2.66827434E-1},
+{2.15942249E-1, 2.29938298E-1, 2.76041597E-1},
+{1.38410494E-1, 2.39283442E-1, 3.27972382E-1},
+{2.43765280E-1, 2.88408488E-1, 3.06048721E-1},
+{1.70157120E-1, 1.89986289E-1, 2.81219155E-1},
+{2.19117031E-1, 2.58005291E-1, 3.26571971E-1},
+{1.92163572E-1, 2.23614186E-1, 2.98683077E-1},
+{2.73545444E-1, 3.12078089E-1, 3.30766588E-1},
+{1.62452087E-1, 2.04930902E-1, 2.53337711E-1},
+{2.23855302E-1, 2.37671077E-1, 3.03202003E-1},
+{1.93955287E-1, 2.12335557E-1, 3.07566851E-1},
+{2.29912683E-1, 2.97581047E-1, 3.37499231E-1},
+{1.89335391E-1, 2.04148144E-1, 2.78609782E-1},
+{2.42303565E-1, 2.73163110E-1, 3.15361649E-1},
+{1.55009672E-1, 2.88095146E-1, 3.35996419E-1},
+{2.73716152E-1, 3.31215471E-1, 3.62539083E-1},
+{1.52389362E-1, 1.72619134E-1, 1.90585673E-1},
+{1.96988270E-1, 2.26309747E-1, 2.46197492E-1},
+{1.20555148E-1, 2.06369758E-1, 2.81199783E-1},
+{1.93709418E-1, 2.71900505E-1, 3.01332921E-1},
+{1.36701152E-1, 1.54093146E-1, 2.82258362E-1},
+{1.97299168E-1, 2.53656298E-1, 2.90315062E-1},
+{1.43463776E-1, 2.43872911E-1, 2.75533706E-1},
+{2.58477271E-1, 2.73279876E-1, 3.21119100E-1},
+{1.54406175E-1, 1.93793535E-1, 2.15884149E-1},
+{2.05979452E-1, 2.24277020E-1, 2.85732359E-1},
+{1.74535319E-1, 2.08482355E-1, 2.79668540E-1},
+{2.18844578E-1, 2.72486299E-1, 3.27095598E-1},
+{1.77609727E-1, 2.12990195E-1, 2.39119649E-1},
+{2.29163751E-1, 2.59165913E-1, 2.83514649E-1},
+{1.57353148E-1, 2.39961296E-1, 3.04263145E-1},
+{2.45613828E-1, 3.16824526E-1, 3.42909366E-1},
+{1.42953232E-1, 1.61905348E-1, 2.53710240E-1},
+{2.10192814E-1, 2.22847700E-1, 2.71103770E-1},
+{1.26843944E-1, 2.16709048E-1, 2.97734648E-1},
+{2.31000140E-1, 2.80109137E-1, 2.99707443E-1},
+{1.52980462E-1, 1.93996876E-1, 2.72895664E-1},
+{2.12860718E-1, 2.41545349E-1, 3.16518754E-1},
+{1.71154693E-1, 2.22469687E-1, 2.93786496E-1},
+{2.51988232E-1, 3.04254979E-1, 3.31269950E-1},
+{1.33188918E-1, 2.07924992E-1, 2.55362093E-1},
+{2.12044910E-1, 2.42189646E-1, 2.88903743E-1},
+{1.84612468E-1, 2.01143622E-1, 2.86360770E-1},
+{2.18286708E-1, 2.76752442E-1, 3.44581515E-1},
+{1.83562174E-1, 1.99478507E-1, 2.62156576E-1},
+{2.33130530E-1, 2.49596909E-1, 3.15842837E-1},
+{1.89898983E-1, 2.46874869E-1, 2.97132462E-1},
+{2.75022447E-1, 3.22490305E-1, 3.46977681E-1},
+{1.42305329E-1, 1.92689180E-1, 2.16155857E-1},
+{1.95676163E-1, 2.22268641E-1, 2.76587397E-1},
+{1.33241490E-1, 1.97791785E-1, 3.22897941E-1},
+{1.84865132E-1, 2.97106177E-1, 3.26105148E-1},
+{1.50203660E-1, 1.76781267E-1, 2.91536182E-1},
+{2.03144446E-1, 2.59616166E-1, 2.99156040E-1},
+{1.65488973E-1, 2.38342047E-1, 2.87493914E-1},
+{2.71071255E-1, 2.89544493E-1, 3.19521040E-1},
+{1.68598369E-1, 1.98825568E-1, 2.30347604E-1},
+{2.13811651E-1, 2.34471768E-1, 2.90959626E-1},
+{1.74605444E-1, 2.17256010E-1, 2.85688072E-1},
+{2.28503481E-1, 2.96190292E-1, 3.16534668E-1},
+{1.87172607E-1, 2.20547438E-1, 2.39688724E-1},
+{2.28884771E-1, 2.63583153E-1, 3.01329464E-1},
+{1.77897051E-1, 2.58131474E-1, 2.81487674E-1},
+{2.59513617E-1, 3.07204396E-1, 3.48793596E-1},
+{1.45224437E-1, 1.78715974E-1, 2.59186983E-1},
+{2.19062313E-1, 2.38223523E-1, 2.60461539E-1},
+{1.43650874E-1, 2.09760785E-1, 3.15830201E-1},
+{2.50127465E-1, 2.79182345E-1, 3.05153579E-1},
+{1.48986444E-1, 2.01226771E-1, 2.82543689E-1},
+{2.08387777E-1, 2.35603899E-1, 3.45363885E-1},
+{1.85830340E-1, 2.21607298E-1, 3.10773641E-1},
+{2.80904710E-1, 2.95469791E-1, 3.25499445E-1},
+{1.72967300E-1, 1.97078109E-1, 2.45801106E-1},
+{2.19495699E-1, 2.44767100E-1, 2.93587774E-1},
+{1.83909580E-1, 2.15004295E-1, 3.00334543E-1},
+{2.45338634E-1, 2.68595248E-1, 3.48330349E-1},
+{1.92957386E-1, 2.06625074E-1, 2.67336398E-1},
+{2.54845560E-1, 2.68642277E-1, 3.03547889E-1},
+{1.76853105E-1, 2.59330958E-1, 3.16200763E-1},
+{2.90929139E-1, 3.15634757E-1, 3.68723541E-1},
+{1.57116994E-1, 1.73552901E-1, 2.28736520E-1},
+{2.12509260E-1, 2.30501205E-1, 2.52217978E-1},
+{1.42521843E-1, 2.01979935E-1, 2.93012232E-1},
+{2.14919671E-1, 2.78065056E-1, 3.14176053E-1},
+{1.35947272E-1, 1.81055903E-1, 2.75475413E-1},
+{1.98416695E-1, 2.41673797E-1, 3.05173427E-1},
+{1.59517333E-1, 2.31580108E-1, 2.95412451E-1},
+{2.58203626E-1, 2.87348121E-1, 3.20351988E-1},
+{1.74840674E-1, 1.92883253E-1, 2.11250007E-1},
+{2.02168509E-1, 2.27025688E-1, 3.04884046E-1},
+{1.69532105E-1, 2.11826235E-1, 2.97355384E-1},
+{2.30033740E-1, 2.91504353E-1, 3.26589435E-1},
+{1.95046112E-1, 2.11709172E-1, 2.27705747E-1},
+{2.37926885E-1, 2.52411634E-1, 2.97752172E-1},
+{1.53762922E-1, 2.46541560E-1, 3.14768940E-1},
+{2.36075714E-1, 3.03568929E-1, 3.70624453E-1},
+{1.38660327E-1, 1.67949975E-1, 2.73515254E-1},
+{2.13806167E-1, 2.27267206E-1, 2.86276251E-1},
+{1.25080630E-1, 2.44098395E-1, 3.02548796E-1},
+{2.35714868E-1, 2.81208843E-1, 3.08903724E-1},
+{1.51691392E-1, 2.10877746E-1, 2.63812989E-1},
+{2.20730439E-1, 2.52777904E-1, 3.16413730E-1},
+{1.84924737E-1, 2.39424765E-1, 2.85120815E-1},
+{2.59548545E-1, 3.09809893E-1, 3.26423734E-1},
+{1.62930742E-1, 2.19900876E-1, 2.36148626E-1},
+{2.34194234E-1, 2.49944329E-1, 2.77549058E-1},
+{1.70870200E-1, 1.98291600E-1, 3.21412593E-1},
+{2.31566861E-1, 2.75015086E-1, 3.69710356E-1},
+{1.80002406E-1, 2.06701040E-1, 2.71204919E-1},
+{2.38075271E-1, 2.54006237E-1, 3.23827595E-1},
+{1.99148253E-1, 2.54273921E-1, 3.07479709E-1},
+{2.87428617E-1, 3.25045079E-1, 3.48634571E-1},
+{1.45285025E-1, 1.91359162E-1, 2.49691397E-1},
+{1.94659308E-1, 2.40821242E-1, 2.77302653E-1},
+{1.53150991E-1, 1.94375664E-1, 3.27550441E-1},
+{2.04085842E-1, 2.98595697E-1, 3.21480066E-1},
+{1.56009689E-1, 1.81012720E-1, 3.00931662E-1},
+{2.10962430E-1, 2.55770296E-1, 3.08086127E-1},
+{1.85444072E-1, 2.49021322E-1, 2.74029821E-1},
+{2.74493456E-1, 2.89441973E-1, 3.38794917E-1},
+{1.76941887E-1, 1.94476932E-1, 2.22077265E-1},
+{2.16377512E-1, 2.30735779E-1, 3.03689271E-1},
+{1.89683452E-1, 2.14660764E-1, 2.88445383E-1},
+{2.40827337E-1, 2.98141748E-1, 3.27378422E-1},
+{2.01787844E-1, 2.19441772E-1, 2.39327446E-1},
+{2.48812512E-1, 2.65865892E-1, 2.93382376E-1},
+{1.82027832E-1, 2.68279046E-1, 2.93991417E-1},
+{2.56498635E-1, 3.19984466E-1, 3.62663239E-1},
+{1.58799276E-1, 1.75433666E-1, 2.67389864E-1},
+{2.24259302E-1, 2.36668259E-1, 2.77639121E-1},
+{1.49203405E-1, 2.26585329E-1, 3.45255584E-1},
+{2.50655770E-1, 2.92264849E-1, 3.13574284E-1},
+{1.58096299E-1, 2.02193201E-1, 2.98711687E-1},
+{2.28820905E-1, 2.48557344E-1, 3.44726473E-1},
+{1.87972054E-1, 2.34109432E-1, 3.04235607E-1},
+{2.85657108E-1, 3.14878136E-1, 3.36931497E-1},
+{1.62680015E-1, 2.17820048E-1, 2.57436782E-1},
+{2.24049792E-1, 2.46739820E-1, 3.00795883E-1},
+{2.01354548E-1, 2.18286663E-1, 3.13036293E-1},
+{2.38028511E-1, 2.98103482E-1, 3.53503793E-1},
+{1.98829994E-1, 2.12877125E-1, 2.72980839E-1},
+{2.50616491E-1, 2.67659992E-1, 3.20611864E-1},
+{1.70901820E-1, 2.69330353E-1, 3.34428221E-1},
+{3.04988861E-1, 3.36196691E-1, 3.65235358E-1},
+{1.47624031E-1, 1.81272805E-1, 2.04707921E-1},
+{1.93751350E-1, 2.20973969E-1, 2.61775166E-1},
+{1.32089809E-1, 1.94851607E-1, 2.83547610E-1},
+{2.07739428E-1, 2.70596832E-1, 2.92264789E-1},
+{1.27733424E-1, 1.66896015E-1, 2.83891350E-1},
+{2.05309406E-1, 2.47807533E-1, 2.83632785E-1},
+{1.54211894E-1, 2.25014091E-1, 2.70082027E-1},
+{2.67574131E-1, 2.84426898E-1, 3.09334785E-1},
+{1.68846920E-1, 1.87004536E-1, 2.02433169E-1},
+{2.02441111E-1, 2.16733068E-1, 2.93079227E-1},
+{1.63621262E-1, 2.15616465E-1, 2.82792896E-1},
+{2.25509301E-1, 2.66283005E-1, 3.17886561E-1},
+{1.89110294E-1, 2.05609441E-1, 2.22113580E-1},
+{2.21240178E-1, 2.60288864E-1, 2.92541057E-1},
+{1.55563369E-1, 2.46850818E-1, 2.89648801E-1},
+{2.48406157E-1, 3.05291861E-1, 3.55316669E-1},
+{1.27122149E-1, 1.58053726E-1, 2.54164368E-1},
+{2.04998836E-1, 2.19476849E-1, 2.78342038E-1},
+{1.33302316E-1, 2.29614019E-1, 2.86947161E-1},
+{2.36777052E-1, 2.67918199E-1, 3.08230907E-1},
+{1.40853569E-1, 2.03414679E-1, 2.73257107E-1},
+{2.07684264E-1, 2.34520018E-1, 3.24583262E-1},
+{1.77181646E-1, 2.29595393E-1, 2.83539146E-1},
+{2.61378348E-1, 3.01160187E-1, 3.21707100E-1},
+{1.48595735E-1, 2.07772017E-1, 2.46946126E-1},
+{2.14334831E-1, 2.48061299E-1, 2.72259146E-1},
+{1.76380262E-1, 1.96897894E-1, 2.92286903E-1},
+{1.98193476E-1, 2.75483340E-1, 3.49037558E-1},
+{1.76153168E-1, 1.93248957E-1, 2.69548506E-1},
+{2.36968622E-1, 2.50065804E-1, 3.06820840E-1},
+{1.76060721E-1, 2.54037619E-1, 3.03566784E-1},
+{2.82952905E-1, 3.01765054E-1, 3.53956312E-1},
+{1.45353720E-1, 1.83678836E-1, 2.34750062E-1},
+{1.93842635E-1, 2.30635554E-1, 2.67817765E-1},
+{1.38958976E-1, 1.86760783E-1, 3.13113242E-1},
+{1.99944481E-1, 2.77624756E-1, 3.25046331E-1},
+{1.42966077E-1, 1.71310842E-1, 3.03013414E-1},
+{2.07741663E-1, 2.58691758E-1, 2.88766950E-1},
+{1.71776935E-1, 2.40246087E-1, 2.73284525E-1},
+{2.71046638E-1, 2.85170943E-1, 3.27401131E-1},
+{1.69854626E-1, 1.87545776E-1, 2.24484712E-1},
+{2.15221986E-1, 2.27339745E-1, 2.95008808E-1},
+{1.75596640E-1, 2.17936546E-1, 2.74879605E-1},
+{2.34665439E-1, 2.89530903E-1, 3.16494375E-1},
+{1.89946994E-1, 2.04953820E-1, 2.46955171E-1},
+{2.37297818E-1, 2.68316716E-1, 2.90684313E-1},
+{1.69963166E-1, 2.53367484E-1, 2.92533010E-1},
+{2.70659864E-1, 2.97146112E-1, 3.56183976E-1},
+{1.52539685E-1, 1.70138955E-1, 2.52703935E-1},
+{2.19119206E-1, 2.35900700E-1, 2.69739121E-1},
+{1.42245665E-1, 2.18184620E-1, 3.28218073E-1},
+{2.61472821E-1, 2.78025657E-1, 3.02375883E-1},
+{1.53526023E-1, 1.90727741E-1, 2.92820841E-1},
+{2.09240988E-1, 2.49808684E-1, 3.24709088E-1},
+{1.75176397E-1, 2.38646746E-1, 3.06392699E-1},
+{2.73218870E-1, 3.03954989E-1, 3.20513874E-1},
+{1.63911596E-1, 1.89611584E-1, 2.56272525E-1},
+{2.26953760E-1, 2.40120232E-1, 2.92728513E-1},
+{1.95565715E-1, 2.11956203E-1, 2.97374696E-1},
+{2.41045550E-1, 2.88497001E-1, 3.36352319E-1},
+{1.94948331E-1, 2.09475279E-1, 2.56309658E-1},
+{2.47884631E-1, 2.63356417E-1, 3.11270863E-1},
+{1.69189706E-1, 2.35864580E-1, 3.36249381E-1},
+{2.86001563E-1, 3.25423747E-1, 3.59607369E-1},
+{1.56258598E-1, 1.76704943E-1, 2.14393437E-1},
+{2.08996847E-1, 2.23968685E-1, 2.60886759E-1},
+{1.35765389E-1, 2.03580052E-1, 3.05503219E-1},
+{2.18961373E-1, 2.79463500E-1, 2.99450845E-1},
+{1.34064749E-1, 1.78332120E-1, 2.90169626E-1},
+{2.13298395E-1, 2.40031511E-1, 3.00345927E-1},
+{1.64373413E-1, 2.26438701E-1, 2.87171155E-1},
+{2.50739604E-1, 2.80812472E-1, 3.35349351E-1},
+{1.63649514E-1, 1.97108001E-1, 2.21165180E-1},
+{2.08139613E-1, 2.30869800E-1, 2.96137065E-1},
+{1.59113124E-1, 2.18189180E-1, 2.95531958E-1},
+{2.39883497E-1, 2.81831235E-1, 3.26045603E-1},
+{1.89394727E-1, 2.08127141E-1, 2.38446414E-1},
+{2.32995704E-1, 2.59603471E-1, 2.93427974E-1},
+{1.60558835E-1, 2.55164832E-1, 3.02872926E-1},
+{2.53509283E-1, 2.96028465E-1, 3.67721587E-1},
+{1.30124375E-1, 1.74838990E-1, 2.60486037E-1},
+{2.10203990E-1, 2.33570784E-1, 2.83061892E-1},
+{1.52365491E-1, 2.25338757E-1, 3.03720981E-1},
+{2.40558609E-1, 2.77192205E-1, 3.05891901E-1},
+{1.63728818E-1, 1.94779396E-1, 2.69253582E-1},
+{2.25709423E-1, 2.40902692E-1, 3.18060607E-1},
+{1.92055091E-1, 2.29857832E-1, 2.89826721E-1},
+{2.62759686E-1, 3.04292172E-1, 3.35680574E-1},
+{1.66071162E-1, 2.06819177E-1, 2.39712462E-1},
+{2.23915562E-1, 2.50106871E-1, 2.85296232E-1},
+{1.88402340E-1, 2.03793734E-1, 3.03041130E-1},
+{2.30698988E-1, 2.87044138E-1, 3.49802762E-1},
+{1.82025358E-1, 2.14073509E-1, 2.63470024E-1},
+{2.37297758E-1, 2.65025407E-1, 3.17815512E-1},
+{1.89278707E-1, 2.58802205E-1, 3.04866165E-1},
+{2.97243059E-1, 3.17153066E-1, 3.56583923E-1},
+{1.58607468E-1, 1.78659767E-1, 2.41919369E-1},
+{1.94887385E-1, 2.41695851E-1, 2.62176663E-1},
+{1.58124432E-1, 2.11753070E-1, 3.11352164E-1},
+{2.16902718E-1, 2.98796803E-1, 3.20994049E-1},
+{1.49272785E-1, 1.74964130E-1, 3.15334409E-1},
+{2.21622273E-1, 2.56179065E-1, 3.03902954E-1},
+{1.75979599E-1, 2.43505448E-1, 2.85801739E-1},
+{2.64590383E-1, 2.85541564E-1, 3.45107764E-1},
+{1.80137083E-1, 2.05279350E-1, 2.22255990E-1},
+{2.10796222E-1, 2.26315439E-1, 3.14426929E-1},
+{1.79151163E-1, 2.09439725E-1, 2.93280870E-1},
+{2.49719024E-1, 2.91257650E-1, 3.27162296E-1},
+{1.98700234E-1, 2.15896755E-1, 2.49960214E-1},
+{2.40726396E-1, 2.64857739E-1, 2.99639553E-1},
+{1.71249732E-1, 2.68166155E-1, 3.03572744E-1},
+{2.69555569E-1, 3.16100627E-1, 3.56570691E-1},
+{1.50564745E-1, 1.84190869E-1, 2.68674821E-1},
+{2.16941193E-1, 2.40813971E-1, 2.78942198E-1},
+{1.35399476E-1, 2.60586530E-1, 3.32604855E-1},
+{2.56150961E-1, 2.87822872E-1, 3.06156367E-1},
+{1.66398838E-1, 1.88721806E-1, 2.93023735E-1},
+{2.29214087E-1, 2.61565417E-1, 3.27494055E-1},
+{1.98266640E-1, 2.32970506E-1, 2.99134284E-1},
+{2.87046254E-1, 3.07103783E-1, 3.27298075E-1},
+{1.75898686E-1, 2.11898595E-1, 2.51332909E-1},
+{2.32067421E-1, 2.44622201E-1, 2.99443692E-1},
+{1.90780059E-1, 2.12090015E-1, 3.25059265E-1},
+{2.31531218E-1, 3.14166099E-1, 3.42735857E-1},
+{1.95099846E-1, 2.09554315E-1, 2.79483467E-1},
+{2.40416065E-1, 2.69604772E-1, 3.28015476E-1},
+{1.71800867E-1, 2.82233089E-1, 3.14749271E-1},
+{2.69243777E-1, 3.38462502E-1, 3.79935652E-1},
+{1.59934625E-1, 1.77966774E-1, 2.00818628E-1},
+{2.01979712E-1, 2.30668545E-1, 2.56773323E-1},
+{1.34024277E-1, 2.10961610E-1, 2.84687728E-1},
+{2.03712896E-1, 2.83053070E-1, 3.03309411E-1},
+{1.44528881E-1, 1.64728075E-1, 2.85079390E-1},
+{2.06285611E-1, 2.48649031E-1, 2.96383053E-1},
+{1.58138171E-1, 2.34317720E-1, 2.79650003E-1},
+{2.64995635E-1, 2.79900700E-1, 3.18619400E-1},
+{1.66537479E-1, 1.84279412E-1, 2.14547485E-1},
+{2.03051880E-1, 2.35110492E-1, 2.88755983E-1},
+{1.68422714E-1, 2.03946173E-1, 2.87478894E-1},
+{2.31727019E-1, 2.74086386E-1, 3.24755162E-1},
+{1.85356215E-1, 2.14113116E-1, 2.29030401E-1},
+{2.42482558E-1, 2.60655493E-1, 2.83030301E-1},
+{1.67562261E-1, 2.42027491E-1, 2.99461991E-1},
+{2.38809898E-1, 3.19003850E-1, 3.58415872E-1},
+{1.37908265E-1, 1.54787809E-1, 2.65611202E-1},
+{2.11019263E-1, 2.24607319E-1, 2.79954702E-1},
+{1.37569889E-1, 2.25128531E-1, 3.09312850E-1},
+{2.29239866E-1, 2.76150972E-1, 3.15241843E-1},
+{1.60487458E-1, 1.95461214E-1, 2.83169478E-1},
+{2.18505666E-1, 2.38197207E-1, 3.30340117E-1},
+{1.81991324E-1, 2.33026952E-1, 2.93276042E-1},
+{2.54552305E-1, 3.14394146E-1, 3.36392254E-1},
+{1.44095764E-1, 2.26640165E-1, 2.50595063E-1},
+{2.15188012E-1, 2.51417249E-1, 2.85043985E-1},
+{1.87674388E-1, 2.04458863E-1, 2.94168979E-1},
+{2.30494842E-1, 2.68452436E-1, 3.52370054E-1},
+{1.85022101E-1, 1.99075252E-1, 2.71930546E-1},
+{2.42569372E-1, 2.55389154E-1, 3.11399311E-1},
+{1.95166096E-1, 2.49102056E-1, 2.98998445E-1},
+{2.83654153E-1, 3.14600259E-1, 3.55619401E-1},
+{1.51490018E-1, 1.97729796E-1, 2.32467473E-1},
+{2.00029895E-1, 2.30101258E-1, 2.81933933E-1},
+{1.38711318E-1, 1.91816628E-1, 3.45780402E-1},
+{1.96580395E-1, 3.04714769E-1, 3.40553433E-1},
+{1.38154253E-1, 1.88543141E-1, 2.99461216E-1},
+{2.05666468E-1, 2.68904895E-1, 3.05537194E-1},
+{1.72447845E-1, 2.33558387E-1, 2.93625206E-1},
+{2.70145416E-1, 2.98654765E-1, 3.28556389E-1},
+{1.75489411E-1, 1.91361547E-1, 2.35585332E-1},
+{2.20548794E-1, 2.34773993E-1, 2.95397669E-1},
+{1.85652360E-1, 2.22349137E-1, 2.79883891E-1},
+{2.29456946E-1, 3.04546326E-1, 3.24684292E-1},
+{1.86900780E-1, 2.15469390E-1, 2.51856804E-1},
+{2.34910533E-1, 2.71217376E-1, 2.99894661E-1},
+{1.85142443E-1, 2.56071001E-1, 2.93291301E-1},
+{2.63883710E-1, 3.07127446E-1, 3.62546653E-1},
+{1.60997644E-1, 1.78937852E-1, 2.55808324E-1},
+{2.25671068E-1, 2.43735075E-1, 2.68624991E-1},
+{1.55076161E-1, 2.30396181E-1, 3.21005553E-1},
+{2.51760483E-1, 2.79653400E-1, 3.14202160E-1},
+{1.56988814E-1, 2.07466930E-1, 2.89933950E-1},
+{2.17479482E-1, 2.59626418E-1, 3.40659052E-1},
+{1.76811531E-1, 2.31087089E-1, 3.17562491E-1},
+{2.82952607E-1, 2.99844354E-1, 3.36822897E-1},
+{1.82060316E-1, 1.98734730E-1, 2.51980305E-1},
+{2.25874200E-1, 2.52469152E-1, 2.93356389E-1},
+{2.00799957E-1, 2.17786849E-1, 3.02210063E-1},
+{2.47423753E-1, 2.86882848E-1, 3.47820610E-1},
+{2.01128140E-1, 2.14746892E-1, 2.62269646E-1},
+{2.53963351E-1, 2.69477993E-1, 3.12133819E-1},
+{1.91034868E-1, 2.55738169E-1, 3.32559615E-1},
+{2.91053712E-1, 3.31458420E-1, 3.68588477E-1},
+{1.57229915E-1, 1.85374141E-1, 2.25361317E-1},
+{2.08051339E-1, 2.38350868E-1, 2.64212936E-1},
+{1.46848336E-1, 2.13000089E-1, 3.00192565E-1},
+{2.18630567E-1, 2.90263802E-1, 3.09045762E-1},
+{1.43699184E-1, 1.87815160E-1, 2.83769876E-1},
+{2.07328036E-1, 2.45088696E-1, 3.08956414E-1},
+{1.64228097E-1, 2.27826655E-1, 3.08907896E-1},
+{2.61919737E-1, 2.91333705E-1, 3.31527978E-1},
+{1.70648888E-1, 2.02157527E-1, 2.17827827E-1},
+{2.07796112E-1, 2.34704822E-1, 3.06783766E-1},
+{1.72118798E-1, 2.14057386E-1, 3.10151786E-1},
+{2.29116157E-1, 2.80949861E-1, 3.33774298E-1},
+{1.96622208E-1, 2.16653049E-1, 2.33279720E-1},
+{2.37789229E-1, 2.58971304E-1, 3.04609209E-1},
+{1.55182019E-1, 2.63032585E-1, 3.18943053E-1},
+{2.49388829E-1, 3.16970855E-1, 3.77762467E-1},
+{1.51363596E-1, 1.75010651E-1, 2.78245836E-1},
+{2.19810233E-1, 2.32360214E-1, 2.85034925E-1},
+{1.42630622E-1, 2.40602851E-1, 3.04125100E-1},
+{2.42764875E-1, 2.83762127E-1, 3.15481216E-1},
+{1.57467470E-1, 2.07524061E-1, 2.75674909E-1},
+{2.28758618E-1, 2.49092206E-1, 3.28139395E-1},
+{1.90872714E-1, 2.38125205E-1, 2.94894546E-1},
+{2.66389251E-1, 3.14321429E-1, 3.38669509E-1},
+{1.70644209E-1, 2.25980043E-1, 2.47372389E-1},
+{2.36442789E-1, 2.53003448E-1, 2.88220435E-1},
+{1.85423777E-1, 2.04888850E-1, 3.14608842E-1},
+{2.17379019E-1, 2.94553548E-1, 3.67831022E-1},
+{1.88563988E-1, 2.15174288E-1, 2.72999734E-1},
+{2.45102122E-1, 2.59770364E-1, 3.21885556E-1},
+{1.98444173E-1, 2.61160702E-1, 3.17097872E-1},
+{2.99013853E-1, 3.28965336E-1, 3.56681198E-1},
+{1.58248767E-1, 1.92205697E-1, 2.46059090E-1},
+{2.02385351E-1, 2.47965842E-1, 2.71749645E-1},
+{1.61710784E-1, 2.13708103E-1, 3.27384740E-1},
+{2.14419708E-1, 3.05552453E-1, 3.33721548E-1},
+{1.61819980E-1, 1.89897299E-1, 3.10501546E-1},
+{2.19436333E-1, 2.65029579E-1, 3.09288830E-1},
+{1.88303933E-1, 2.49633163E-1, 2.85499543E-1},
+{2.69325376E-1, 2.99807042E-1, 3.41722459E-1},
+{1.72406003E-1, 2.10977256E-1, 2.27773219E-1},
+{2.20281526E-1, 2.34015763E-1, 3.12846094E-1},
+{1.83257267E-1, 2.22061962E-1, 2.91052371E-1},
+{2.42531225E-1, 3.09527606E-1, 3.30389649E-1},
+{2.07546696E-1, 2.24662632E-1, 2.44420141E-1},
+{2.45858207E-1, 2.70285994E-1, 3.05132121E-1},
+{1.84840545E-1, 2.72096783E-1, 3.12531084E-1},
+{2.74252594E-1, 3.21252435E-1, 3.74658197E-1},
+{1.66425839E-1, 1.84491634E-1, 2.68278092E-1},
+{2.28423670E-1, 2.43025422E-1, 2.81184882E-1},
+{1.60091296E-1, 2.52953321E-1, 3.35822314E-1},
+{2.62109995E-1, 2.95581907E-1, 3.13354105E-1},
+{1.67702749E-1, 2.01536924E-1, 3.01801592E-1},
+{2.37822965E-1, 2.59894758E-1, 3.38231117E-1},
+{1.97206214E-1, 2.45490909E-1, 3.17895442E-1},
+{2.98455298E-1, 3.19209784E-1, 3.40971738E-1},
+{1.71195343E-1, 2.24327832E-1, 2.62736112E-1},
+{2.30626896E-1, 2.53310233E-1, 3.01206797E-1},
+{2.04814211E-1, 2.21881568E-1, 3.25966567E-1},
+{2.22987518E-1, 3.06339115E-1, 3.50717157E-1},
+{2.00855389E-1, 2.15359926E-1, 2.84143478E-1},
+{2.50951648E-1, 2.66189247E-1, 3.33360583E-1},
+{1.75610259E-1, 2.93791324E-1, 3.40326935E-1},
+{2.91745067E-1, 3.40602487E-1, 3.81397158E-1}};
+
+static const float evrc_lspq_full_codebook4[128][3] = {
+{2.77461529E-1, 3.16972077E-1, 3.95498335E-1},
+{3.36560428E-1, 3.60156953E-1, 3.81473005E-1},
+{3.10509324E-1, 3.31732392E-1, 3.66864383E-1},
+{3.37470949E-1, 3.96795273E-1, 4.12356317E-1},
+{2.79660404E-1, 3.66520107E-1, 3.85313451E-1},
+{3.16038966E-1, 3.85609329E-1, 4.01304781E-1},
+{3.09960425E-1, 3.43410730E-1, 4.24745500E-1},
+{3.54243636E-1, 4.08699274E-1, 4.22167957E-1},
+{2.95587242E-1, 3.33741128E-1, 3.87421668E-1},
+{3.33446383E-1, 3.86974752E-1, 4.01353061E-1},
+{3.23412836E-1, 3.65269661E-1, 3.85193288E-1},
+{3.42731953E-1, 4.03192520E-1, 4.19920385E-1},
+{2.77681828E-1, 3.82494986E-1, 4.04274166E-1},
+{3.18247974E-1, 3.95985305E-1, 4.31353152E-1},
+{3.03711414E-1, 3.80319715E-1, 4.37173545E-1},
+{3.78288805E-1, 4.07077312E-1, 4.22679126E-1},
+{2.38116503E-1, 3.42454314E-1, 4.24624741E-1},
+{3.45615685E-1, 3.68681073E-1, 4.00817335E-1},
+{3.17688107E-1, 3.41902673E-1, 4.05601799E-1},
+{3.66368949E-1, 3.89039934E-1, 4.06154454E-1},
+{2.99398005E-1, 3.52021694E-1, 3.99955690E-1},
+{3.24991941E-1, 3.90028834E-1, 4.19478714E-1},
+{3.23025763E-1, 3.68114293E-1, 4.02087748E-1},
+{3.62326264E-1, 4.16927993E-1, 4.32773650E-1},
+{2.72696435E-1, 3.59205008E-1, 4.26880658E-1},
+{3.46539855E-1, 3.69616628E-1, 4.15621221E-1},
+{3.34109128E-1, 3.55736315E-1, 3.96749556E-1},
+{3.37468982E-1, 4.10392702E-1, 4.25986826E-1},
+{2.99468994E-1, 3.80648255E-1, 4.18284118E-1},
+{3.21378171E-1, 4.11198020E-1, 4.28792536E-1},
+{3.27841163E-1, 3.69345129E-1, 4.34395611E-1},
+{3.80669057E-1, 4.26086366E-1, 4.42754567E-1},
+{2.68943667E-1, 3.42942953E-1, 3.98681462E-1},
+{3.38102877E-1, 3.76338840E-1, 3.92043173E-1},
+{3.23593497E-1, 3.48742068E-1, 3.72551978E-1},
+{3.47550809E-1, 3.92885387E-1, 4.21169937E-1},
+{3.04182827E-1, 3.59816670E-1, 3.81633341E-1},
+{3.14221382E-1, 4.02108550E-1, 4.20085251E-1},
+{3.01306546E-1, 3.62662733E-1, 4.29262817E-1},
+{3.71770263E-1, 3.98696363E-1, 4.31438982E-1},
+{2.74591267E-1, 3.35595489E-1, 4.20079648E-1},
+{3.44540834E-1, 3.90451789E-1, 4.06412065E-1},
+{3.25239837E-1, 3.78344476E-1, 3.94673288E-1},
+{3.56683493E-1, 3.90574157E-1, 4.33851063E-1},
+{2.63501287E-1, 3.95260096E-1, 4.23116386E-1},
+{3.37520659E-1, 3.92563462E-1, 4.43415821E-1},
+{3.14522266E-1, 3.80968630E-1, 4.22676384E-1},
+{3.76235068E-1, 4.17298734E-1, 4.31451261E-1},
+{2.61855006E-1, 3.68646085E-1, 4.04260576E-1},
+{3.55580151E-1, 3.77994478E-1, 3.95868242E-1},
+{3.27742815E-1, 3.53872776E-1, 4.11040604E-1},
+{3.62960637E-1, 3.99466991E-1, 4.14690197E-1},
+{3.09410870E-1, 3.73796046E-1, 3.92672479E-1},
+{3.31016302E-1, 4.00801599E-1, 4.31759298E-1},
+{3.23573053E-1, 3.68619561E-1, 4.17455137E-1},
+{3.49115849E-1, 4.26840067E-1, 4.43913996E-1},
+{2.89738595E-1, 3.63759339E-1, 4.10511792E-1},
+{3.55286479E-1, 3.89331281E-1, 4.13432419E-1},
+{3.36565912E-1, 3.60222459E-1, 4.24179018E-1},
+{3.39932680E-1, 4.09228802E-1, 4.40184891E-1},
+{3.00889730E-1, 4.00081098E-1, 4.17955697E-1},
+{3.17052066E-1, 4.22288120E-1, 4.42229569E-1},
+{3.27336788E-1, 3.84311676E-1, 4.30288613E-1},
+{3.98990929E-1, 4.29498434E-1, 4.43475187E-1},
+{2.49110118E-1, 3.25696886E-1, 4.11728263E-1},
+{3.45929205E-1, 3.68577540E-1, 3.88473272E-1},
+{3.13219666E-1, 3.39229465E-1, 3.87597919E-1},
+{3.51453960E-1, 3.98730278E-1, 4.12656188E-1},
+{2.93487132E-1, 3.75763118E-1, 3.94488096E-1},
+{3.24470758E-1, 3.94202888E-1, 4.08882737E-1},
+{3.12710822E-1, 3.57720256E-1, 4.14061189E-1},
+{3.66507173E-1, 4.08171296E-1, 4.23891425E-1},
+{2.99965680E-1, 3.31993401E-1, 4.07860160E-1},
+{3.34925175E-1, 3.86143029E-1, 4.11538124E-1},
+{3.34788024E-1, 3.66196156E-1, 3.93347144E-1},
+{3.47847939E-1, 4.05926466E-1, 4.30507302E-1},
+{2.85952926E-1, 3.95283282E-1, 4.16119337E-1},
+{3.23867381E-1, 4.06476676E-1, 4.42482829E-1},
+{3.16716671E-1, 3.84451628E-1, 4.39411044E-1},
+{3.86772931E-1, 4.11824584E-1, 4.27831531E-1},
+{2.38072395E-1, 3.62342358E-1, 4.30931687E-1},
+{3.46450031E-1, 3.79082918E-1, 4.06567812E-1},
+{3.16576600E-1, 3.56468618E-1, 3.96218300E-1},
+{3.66539180E-1, 3.89590919E-1, 4.21055555E-1},
+{3.08291376E-1, 3.71324301E-1, 4.07867432E-1},
+{3.36435199E-1, 3.91514421E-1, 4.22977090E-1},
+{3.23035538E-1, 3.80447328E-1, 4.09550190E-1},
+{3.65228057E-1, 4.27910388E-1, 4.43691254E-1},
+{2.72038043E-1, 3.76596808E-1, 4.33685899E-1},
+{3.57665777E-1, 3.77761602E-1, 4.09178972E-1},
+{3.36498559E-1, 3.64215910E-1, 4.09255505E-1},
+{3.48082423E-1, 4.17631805E-1, 4.33284521E-1},
+{3.02754521E-1, 3.95974755E-1, 4.33717251E-1},
+{3.31676304E-1, 4.17587161E-1, 4.36239839E-1},
+{3.33287597E-1, 3.80799115E-1, 4.39620733E-1},
+{3.88112009E-1, 4.36933577E-1, 4.50829268E-1},
+{2.56026626E-1, 3.48015189E-1, 4.22922611E-1},
+{3.45773995E-1, 3.81725788E-1, 3.96794081E-1},
+{3.25623751E-1, 3.50391924E-1, 3.87330651E-1},
+{3.56868088E-1, 3.98574769E-1, 4.23177242E-1},
+{3.01226199E-1, 3.86906981E-1, 4.03335571E-1},
+{3.28178406E-1, 4.02090192E-1, 4.19389248E-1},
+{3.14385355E-1, 3.69043887E-1, 4.34375286E-1},
+{3.72321129E-1, 4.11672413E-1, 4.40518737E-1},
+{2.90479720E-1, 3.48121881E-1, 4.26216483E-1},
+{3.44438791E-1, 3.82666349E-1, 4.17321086E-1},
+{3.34866822E-1, 3.76235664E-1, 4.04475212E-1},
+{3.59025359E-1, 4.04721916E-1, 4.34838414E-1},
+{2.79127955E-1, 4.11106586E-1, 4.35360551E-1},
+{3.48125517E-1, 3.98732066E-1, 4.46927428E-1},
+{3.27018857E-1, 3.90107334E-1, 4.41707492E-1},
+{3.90858352E-1, 4.19813931E-1, 4.35153484E-1},
+{2.55319297E-1, 3.70405972E-1, 4.32188630E-1},
+{3.54651988E-1, 3.88332665E-1, 4.02956128E-1},
+{3.21608186E-1, 3.54489803E-1, 4.28299785E-1},
+{3.75163496E-1, 3.98833990E-1, 4.14177418E-1},
+{3.11953604E-1, 3.91430676E-1, 4.12552476E-1},
+{3.42528820E-1, 3.96365345E-1, 4.32497382E-1},
+{3.33744347E-1, 3.76422405E-1, 4.20536995E-1},
+{3.53529096E-1, 4.29231048E-1, 4.59699273E-1},
+{2.88017929E-1, 3.77999961E-1, 4.34011698E-1},
+{3.55683446E-1, 3.80780041E-1, 4.23145533E-1},
+{3.44358265E-1, 3.72184873E-1, 4.31265354E-1},
+{3.53966117E-1, 4.14166689E-1, 4.42941308E-1},
+{3.04770231E-1, 4.12517488E-1, 4.34183121E-1},
+{3.35913360E-1, 4.24590766E-1, 4.46378469E-1},
+{3.43738198E-1, 3.84766221E-1, 4.35271382E-1},
+{4.10941303E-1, 4.40662980E-1, 4.52113390E-1}};
+
+static const float evrc_lspq_half_codebook1[128][3] = {
+{1.35226343E-2, 1.82081293E-2, 3.93940695E-2},
+{2.29392890E-2, 3.57831158E-2, 1.05352886E-1},
+{2.09106486E-2, 3.04159056E-2, 8.93941075E-2},
+{1.88909005E-2, 3.82722206E-2, 1.37820408E-1},
+{2.05143820E-2, 2.85481159E-2, 7.39762187E-2},
+{4.69510332E-2, 6.84031919E-2, 1.09123811E-1},
+{3.15557197E-2, 5.69139980E-2, 8.57057571E-2},
+{3.81181911E-2, 7.77784660E-2, 1.92532852E-1},
+{2.16297153E-2, 2.92908940E-2, 6.25042021E-2},
+{3.11414022E-2, 5.99079318E-2, 1.02860682E-1},
+{3.02799307E-2, 5.35012372E-2, 7.80925751E-2},
+{6.50846213E-2, 9.06624720E-2, 1.42850950E-1},
+{3.27340364E-2, 5.04027791E-2, 6.26492277E-2},
+{5.27439862E-2, 6.22574277E-2, 1.22198336E-1},
+{3.48840356E-2, 6.42222390E-2, 9.16024595E-2},
+{4.88984436E-2, 1.05058022E-1, 1.68813452E-1},
+{2.35791076E-2, 3.21034677E-2, 5.60899563E-2},
+{2.77252812E-2, 4.87281792E-2, 1.01224191E-1},
+{2.74348017E-2, 4.04965915E-2, 9.34926122E-2},
+{4.38360050E-2, 6.03261292E-2, 1.52400866E-1},
+{2.68994924E-2, 4.52906378E-2, 6.49800375E-2},
+{5.16058952E-2, 6.08312152E-2, 1.08799636E-1},
+{4.20064926E-2, 6.11845106E-2, 8.54474008E-2},
+{7.13502690E-2, 1.01972111E-1, 1.74640998E-1},
+{2.88906675E-2, 4.13964354E-2, 5.25928028E-2},
+{3.16364467E-2, 6.63532093E-2, 1.24950245E-1},
+{4.30289507E-2, 5.14023267E-2, 7.96877742E-2},
+{5.70970774E-2, 1.08444504E-1, 1.44075617E-1},
+{3.38840261E-2, 5.04746847E-2, 7.29765445E-2},
+{6.54265657E-2, 7.90987685E-2, 1.15570590E-1},
+{3.85423526E-2, 7.33125433E-2, 1.02307513E-1},
+{6.57824501E-2, 1.02909811E-1, 2.11874440E-1},
+{1.54727865E-2, 2.04559695E-2, 5.46121262E-2},
+{2.27950197E-2, 3.90954204E-2, 1.19443826E-1},
+{3.06889173E-2, 4.54540215E-2, 8.20418894E-2},
+{2.25957241E-2, 4.79101725E-2, 1.71844408E-1},
+{2.71088015E-2, 4.01739590E-2, 7.01922849E-2},
+{4.95789349E-2, 7.92963281E-2, 1.04862511E-1},
+{3.06095853E-2, 5.64059429E-2, 9.49584097E-2},
+{6.34224564E-2, 9.11655501E-2, 1.84724405E-1},
+{2.43342388E-2, 3.91998328E-2, 6.31406233E-2},
+{3.38011980E-2, 6.60846457E-2, 1.11031540E-1},
+{3.51784080E-2, 5.79397269E-2, 7.20702857E-2},
+{6.49054050E-2, 8.65831897E-2, 1.54648736E-1},
+{2.91934665E-2, 5.16204573E-2, 6.94437325E-2},
+{5.94522804E-2, 7.19829276E-2, 1.27434507E-1},
+{5.31888530E-2, 6.38182089E-2, 9.88218486E-2},
+{8.68290961E-2, 1.41135350E-1, 1.91728458E-1},
+{2.49991138E-2, 3.62556018E-2, 5.03724031E-2},
+{2.82246377E-2, 5.44572286E-2, 1.12663500E-1},
+{3.62618119E-2, 4.59073223E-2, 9.43343639E-2},
+{5.70455343E-2, 7.46300444E-2, 1.59157172E-1},
+{2.72987466E-2, 4.56625856E-2, 7.52529651E-2},
+{5.12860194E-2, 8.51126984E-2, 1.23587973E-1},
+{4.91451994E-2, 5.93483113E-2, 9.22686011E-2},
+{7.06961900E-2, 1.05451979E-1, 1.92602143E-1},
+{2.80733760E-2, 4.18509208E-2, 5.87159805E-2},
+{4.64449003E-2, 7.06698820E-2, 1.26038432E-1},
+{4.18453738E-2, 6.30445331E-2, 7.66169876E-2},
+{8.42416435E-2, 1.13282882E-1, 1.43687114E-1},
+{4.17615622E-2, 5.59472926E-2, 7.09872842E-2},
+{5.55161387E-2, 9.50126722E-2, 1.27727196E-1},
+{5.90935498E-2, 7.36730024E-2, 9.65935886E-2},
+{7.84136653E-2, 1.41432360E-1, 2.17428640E-1},
+{2.10490543E-2, 2.91891042E-2, 4.60035764E-2},
+{3.64863276E-2, 4.62387018E-2, 1.07044168E-1},
+{2.68652122E-2, 3.92937548E-2, 8.41179937E-2},
+{2.72903945E-2, 5.53805046E-2, 1.41586170E-1},
+{2.48476695E-2, 3.63277681E-2, 7.62430876E-2},
+{5.25430813E-2, 7.75778666E-2, 1.14567965E-1},
+{4.07741442E-2, 5.39923795E-2, 9.07640457E-2},
+{5.73043302E-2, 7.65803084E-2, 1.79578975E-1},
+{2.46032421E-2, 3.41408364E-2, 6.78990781E-2},
+{4.08220068E-2, 6.29783794E-2, 9.95191261E-2},
+{3.83025035E-2, 5.52857481E-2, 7.90019333E-2},
+{7.24111274E-2, 1.01903863E-1, 1.46979645E-1},
+{3.73902172E-2, 4.70463894E-2, 6.54684529E-2},
+{5.27397543E-2, 6.72770366E-2, 1.39680430E-1},
+{4.05365378E-2, 7.05081299E-2, 9.25668627E-2},
+{4.43425253E-2, 1.10367171E-1, 1.99636266E-1},
+{2.54920740E-2, 3.47603969E-2, 6.05902039E-2},
+{4.35465500E-2, 5.32369502E-2, 1.08325966E-1},
+{2.79599819E-2, 4.91324775E-2, 8.84284526E-2},
+{4.98051867E-2, 8.81728902E-2, 1.52597323E-1},
+{3.19346264E-2, 4.62169312E-2, 6.85206428E-2},
+{5.80246300E-2, 6.84268698E-2, 1.15085281E-1},
+{4.33904678E-2, 6.90575615E-2, 8.44984353E-2},
+{7.39691556E-2, 1.19240515E-1, 1.77340195E-1},
+{3.18767503E-2, 4.59697433E-2, 5.72372638E-2},
+{4.50873822E-2, 5.66509366E-2, 1.32005826E-1},
+{4.59097028E-2, 5.45580424E-2, 8.61423314E-2},
+{7.44685754E-2, 1.13815404E-1, 1.61570594E-1},
+{3.97509560E-2, 4.95359488E-2, 7.22542256E-2},
+{6.76257759E-2, 8.31029043E-2, 1.27990112E-1},
+{5.76258078E-2, 6.95326403E-2, 1.05012968E-1},
+{6.85313493E-2, 1.21758826E-1, 2.20626548E-1},
+{2.18480472E-2, 2.99130920E-2, 5.16208000E-2},
+{3.64343151E-2, 4.91795056E-2, 1.23277210E-1},
+{3.89611274E-2, 4.76634987E-2, 8.61716568E-2},
+{4.14635167E-2, 6.88006952E-2, 1.69356152E-1},
+{3.35514620E-2, 4.17815186E-2, 7.37159401E-2},
+{5.80224693E-2, 8.70314166E-2, 1.12917498E-1},
+{4.80243117E-2, 5.69486506E-2, 1.00755706E-1},
+{5.98873124E-2, 8.57942328E-2, 2.01388851E-1},
+{2.99309995E-2, 3.94828431E-2, 6.46376088E-2},
+{3.88626605E-2, 8.07443634E-2, 1.15519784E-1},
+{3.49444002E-2, 6.28911033E-2, 8.04982036E-2},
+{6.88817874E-2, 9.92431119E-2, 1.60393253E-1},
+{3.64237651E-2, 5.34016453E-2, 6.70152009E-2},
+{5.83492741E-2, 7.85285756E-2, 1.41746715E-1},
+{4.86469641E-2, 7.26736858E-2, 9.48315859E-2},
+{5.85533604E-2, 1.36289746E-1, 1.98639736E-1},
+{2.60888506E-2, 3.73406820E-2, 5.57853170E-2},
+{4.58504409E-2, 5.60512505E-2, 1.17927872E-1},
+{4.28801328E-2, 5.14739119E-2, 9.75309014E-2},
+{6.37611598E-2, 8.73552933E-2, 1.68334916E-1},
+{3.76709923E-2, 4.58216034E-2, 7.86528140E-2},
+{6.75194561E-2, 8.98697898E-2, 1.19418114E-1},
+{5.46374246E-2, 6.66805878E-2, 8.93813819E-2},
+{7.73086548E-2, 1.21754415E-1, 1.99579224E-1},
+{3.15621309E-2, 4.51702215E-2, 6.25768527E-2},
+{3.78782675E-2, 8.03486481E-2, 1.38961688E-1},
+{5.08303270E-2, 6.18740581E-2, 8.31153840E-2},
+{8.96311402E-2, 1.28753766E-1, 1.64891586E-1},
+{4.73503470E-2, 5.75724356E-2, 7.65264630E-2},
+{7.16898590E-2, 9.89895687E-2, 1.30078360E-1},
+{6.29082546E-2, 7.90778771E-2, 1.05111063E-1},
+{8.80649835E-2, 1.65206164E-1, 2.13214174E-1}};
+
+static const float evrc_lspq_half_codebook2[128][3] = {
+{9.75915268E-2, 1.23701490E-1, 1.69437975E-1},
+{9.49536338E-2, 2.01081768E-1, 2.26855248E-1},
+{9.00496617E-2, 1.49164870E-1, 2.26532787E-1},
+{1.70302704E-1, 1.97222874E-1, 2.49974832E-1},
+{1.08773641E-1, 1.51972428E-1, 1.75123364E-1},
+{1.30278930E-1, 2.13229164E-1, 2.29646355E-1},
+{1.24917991E-1, 1.87347755E-1, 2.04712003E-1},
+{2.00670198E-1, 2.28963569E-1, 2.69420803E-1},
+{8.98375586E-2, 1.25332758E-1, 2.10539430E-1},
+{9.62376669E-2, 2.07185850E-1, 2.54174471E-1},
+{1.05694629E-1, 1.78856418E-1, 2.00121015E-1},
+{1.56048968E-1, 2.19573721E-1, 2.91079402E-1},
+{1.37392268E-1, 1.59993336E-1, 1.94698542E-1},
+{1.07262500E-1, 2.37790957E-1, 2.70740807E-1},
+{1.42976448E-1, 2.01550499E-1, 2.18468934E-1},
+{2.14270487E-1, 2.71881402E-1, 3.01200211E-1},
+{1.10729210E-1, 1.33688226E-1, 1.54877156E-1},
+{1.06667660E-1, 1.76678821E-1, 2.62798905E-1},
+{9.16352943E-2, 1.74592838E-1, 2.19329327E-1},
+{1.84038624E-1, 2.27964059E-1, 2.47762203E-1},
+{1.10572360E-1, 1.58207163E-1, 1.96013063E-1},
+{1.33543387E-1, 2.32269660E-1, 2.51828164E-1},
+{1.55922309E-1, 1.77941337E-1, 2.18096644E-1},
+{1.92260072E-1, 2.49512479E-1, 2.89911509E-1},
+{1.13708906E-1, 1.37872443E-1, 2.02929884E-1},
+{1.02557532E-1, 1.84820071E-1, 2.92164624E-1},
+{1.36595622E-1, 1.58687428E-1, 2.41399556E-1},
+{1.72813818E-1, 2.49303415E-1, 3.00458610E-1},
+{1.36871174E-1, 1.57249823E-1, 2.10913152E-1},
+{1.28974810E-1, 2.45167866E-1, 2.67653584E-1},
+{1.66812256E-1, 1.88998029E-1, 2.31345922E-1},
+{2.32248470E-1, 2.63196051E-1, 3.16754937E-1},
+{9.24560949E-2, 1.19977452E-1, 1.91262275E-1},
+{1.13085262E-1, 2.08461538E-1, 2.29368120E-1},
+{1.00716405E-1, 1.40670076E-1, 2.58062959E-1},
+{1.67010382E-1, 2.18105540E-1, 2.62592494E-1},
+{1.25487238E-1, 1.62686959E-1, 1.84409231E-1},
+{1.52406558E-1, 2.07131729E-1, 2.47582436E-1},
+{1.37441203E-1, 1.80262372E-1, 2.17698842E-1},
+{2.07853511E-1, 2.49209508E-1, 2.69830108E-1},
+{9.35257301E-2, 1.49197355E-1, 2.04652041E-1},
+{1.11997180E-1, 2.25233063E-1, 2.47003049E-1},
+{1.09315015E-1, 1.93811879E-1, 2.13802189E-1},
+{1.75118580E-1, 2.52520263E-1, 2.75082767E-1},
+{1.36918738E-1, 1.77440569E-1, 1.97931141E-1},
+{1.36811242E-1, 2.37426177E-1, 2.84737825E-1},
+{1.60759792E-1, 2.00833157E-1, 2.18084484E-1},
+{2.33710244E-1, 2.66372561E-1, 2.91802049E-1},
+{1.19171090E-1, 1.39703169E-1, 1.87723249E-1},
+{1.31049946E-1, 1.93696663E-1, 2.60426998E-1},
+{1.08267047E-1, 1.65194795E-1, 2.39523023E-1},
+{2.03195021E-1, 2.25942209E-1, 2.49403238E-1},
+{1.23842932E-1, 1.45794615E-1, 2.15635628E-1},
+{1.71226338E-1, 2.38054529E-1, 2.57975638E-1},
+{1.66923836E-1, 1.88604668E-1, 2.11124212E-1},
+{2.10620746E-1, 2.62442708E-1, 2.83127964E-1},
+{1.05748810E-1, 1.36286482E-1, 2.20050186E-1},
+{9.72945765E-2, 2.33471528E-1, 2.96113968E-1},
+{1.34298369E-1, 1.93955436E-1, 2.39148825E-1},
+{1.64229318E-1, 2.70067751E-1, 2.94142485E-1},
+{1.42760262E-1, 1.65033355E-1, 2.24100381E-1},
+{1.46414533E-1, 2.47942328E-1, 3.00708115E-1},
+{1.74778774E-1, 2.19349250E-1, 2.38162965E-1},
+{2.36311123E-1, 2.90669680E-1, 3.28010976E-1},
+{1.14076428E-1, 1.33071408E-1, 1.73181504E-1},
+{1.13575839E-1, 1.90307274E-1, 2.41681188E-1},
+{8.59165266E-2, 1.63920239E-1, 2.37934500E-1},
+{1.92916945E-1, 2.15082392E-1, 2.39128128E-1},
+{1.37291834E-1, 1.59423307E-1, 1.79722220E-1},
+{1.40435383E-1, 2.22092256E-1, 2.40960747E-1},
+{1.40387163E-1, 1.89601168E-1, 2.05635697E-1},
+{2.11695507E-1, 2.36578360E-1, 2.81248927E-1},
+{9.03010592E-2, 1.27157405E-1, 2.33567923E-1},
+{1.10118054E-1, 2.09328398E-1, 2.72836268E-1},
+{1.16710417E-1, 1.77853987E-1, 2.22808748E-1},
+{1.81691542E-1, 2.32265159E-1, 2.74991214E-1},
+{1.46553472E-1, 1.69474706E-1, 1.90245956E-1},
+{1.09213792E-1, 2.63291955E-1, 2.88490772E-1},
+{1.49815127E-1, 2.11342707E-1, 2.28899449E-1},
+{1.97645500E-1, 2.83229947E-1, 3.14882278E-1},
+{1.24495603E-1, 1.46097973E-1, 1.66125208E-1},
+{1.34878591E-1, 1.83030054E-1, 2.89288282E-1},
+{9.33032110E-2, 1.83962211E-1, 2.38543004E-1},
+{1.92844257E-1, 2.39588335E-1, 2.58421540E-1},
+{1.23796798E-1, 1.65556595E-1, 2.08408386E-1},
+{1.51144341E-1, 2.35801116E-1, 2.59280622E-1},
+{1.50657728E-1, 1.90052524E-1, 2.28362590E-1},
+{1.98180959E-1, 2.56794214E-1, 3.08975637E-1},
+{1.28490031E-1, 1.49084017E-1, 1.98376507E-1},
+{9.20595750E-2, 2.12231293E-1, 2.92948842E-1},
+{1.41698137E-1, 1.72356680E-1, 2.58454144E-1},
+{1.96733460E-1, 2.29709730E-1, 2.95780182E-1},
+{1.47062227E-1, 1.68918088E-1, 2.07363635E-1},
+{1.36309877E-1, 2.60373056E-1, 2.82607377E-1},
+{1.81041077E-1, 2.01826140E-1, 2.38867551E-1},
+{2.45326266E-1, 2.80183077E-1, 3.11954319E-1},
+{1.04131766E-1, 1.33040652E-1, 1.89834684E-1},
+{1.23298146E-1, 2.09621087E-1, 2.47813210E-1},
+{1.24040775E-1, 1.59827366E-1, 2.58856058E-1},
+{1.87048867E-1, 2.12488100E-1, 2.59629130E-1},
+{1.24255307E-1, 1.73768952E-1, 1.92850024E-1},
+{1.58917829E-1, 2.25389823E-1, 2.43284762E-1},
+{1.53421149E-1, 1.91807315E-1, 2.09249526E-1},
+{2.27154449E-1, 2.51181155E-1, 2.72600353E-1},
+{1.09922059E-1, 1.57100275E-1, 2.20024973E-1},
+{1.32782355E-1, 2.19485506E-1, 2.67028928E-1},
+{1.26857504E-1, 1.98836312E-1, 2.17928499E-1},
+{1.91415027E-1, 2.52424240E-1, 2.72652745E-1},
+{1.55277625E-1, 1.79573521E-1, 2.00773627E-1},
+{1.17547743E-1, 2.47869864E-1, 3.08279335E-1},
+{1.65706977E-1, 2.10339502E-1, 2.29199320E-1},
+{2.25694910E-1, 2.84438193E-1, 3.12106073E-1},
+{1.29503176E-1, 1.48420051E-1, 1.80180401E-1},
+{1.54752508E-1, 1.97748467E-1, 2.67275035E-1},
+{1.28590241E-1, 1.76178381E-1, 2.39905864E-1},
+{2.14926764E-1, 2.37634435E-1, 2.58794010E-1},
+{1.28322318E-1, 1.59338519E-1, 2.26626605E-1},
+{1.55747548E-1, 2.47740522E-1, 2.73726821E-1},
+{1.75741687E-1, 1.97952345E-1, 2.19115943E-1},
+{2.18626365E-1, 2.45809183E-1, 3.00479650E-1},
+{1.17709018E-1, 1.45512864E-1, 2.38044471E-1},
+{1.18006893E-1, 2.23775521E-1, 2.94175088E-1},
+{1.51349202E-1, 1.88157812E-1, 2.48743281E-1},
+{1.89312205E-1, 2.69580543E-1, 2.93785989E-1},
+{1.49895594E-1, 1.74537256E-1, 2.37430006E-1},
+{1.39775530E-1, 2.71709383E-1, 3.07839513E-1},
+{1.83945730E-1, 2.07717165E-1, 2.26722151E-1},
+{2.54552156E-1, 2.96640933E-1, 3.24801445E-1}};
+
+static const float evrc_lspq_half_codebook3[256][4] = {
+{2.36904725E-1, 2.56104350E-1, 3.16955745E-1, 4.07520533E-1},
+{2.97596931E-1, 3.23482454E-1, 3.47667515E-1, 3.74551237E-1},
+{2.73721159E-1, 2.98297524E-1, 3.29923928E-1, 3.83599102E-1},
+{3.07849586E-1, 3.32836270E-1, 3.89340341E-1, 4.05575991E-1},
+{2.33803615E-1, 2.60296524E-1, 3.67351949E-1, 4.04388249E-1},
+{2.97513664E-1, 3.15356553E-1, 3.85135233E-1, 4.02197123E-1},
+{2.85618782E-1, 3.10872793E-1, 3.65022361E-1, 3.84816766E-1},
+{3.35271597E-1, 3.55222225E-1, 3.81921113E-1, 3.98685753E-1},
+{2.00265601E-1, 2.50502288E-1, 3.70398223E-1, 4.32012677E-1},
+{3.07982087E-1, 3.33767712E-1, 3.58199060E-1, 3.78386796E-1},
+{2.60086119E-1, 3.25520277E-1, 3.56873333E-1, 3.84737790E-1},
+{3.01356375E-1, 3.41369390E-1, 4.00296748E-1, 4.17337179E-1},
+{2.67080963E-1, 2.97674358E-1, 3.69702041E-1, 3.89139235E-1},
+{2.72669852E-1, 3.49704087E-1, 3.91925275E-1, 4.06383276E-1},
+{2.52825916E-1, 3.49636555E-1, 3.84550989E-1, 4.05930996E-1},
+{3.42927098E-1, 3.74274015E-1, 4.05468166E-1, 4.20351923E-1},
+{2.52408743E-1, 2.80375838E-1, 3.21436584E-1, 3.88436913E-1},
+{2.96970189E-1, 3.17173600E-1, 3.65342557E-1, 4.02736843E-1},
+{2.81905174E-1, 3.01479161E-1, 3.34335625E-1, 4.07633483E-1},
+{3.26872945E-1, 3.47177684E-1, 3.75017703E-1, 4.05372381E-1},
+{2.36371145E-1, 3.16441059E-1, 3.48707020E-1, 3.82030427E-1},
+{2.87817597E-1, 3.13627005E-1, 4.05129731E-1, 4.23379660E-1},
+{2.77502477E-1, 3.01843822E-1, 3.72250855E-1, 4.19212818E-1},
+{3.28988850E-1, 3.61901104E-1, 4.02015507E-1, 4.19229805E-1},
+{2.24960461E-1, 2.74636388E-1, 3.77016127E-1, 3.94726515E-1},
+{3.01045477E-1, 3.40486169E-1, 3.74888122E-1, 4.02532160E-1},
+{2.59897947E-1, 3.30334961E-1, 3.57493818E-1, 4.08657968E-1},
+{3.00961852E-1, 3.56449068E-1, 4.04779494E-1, 4.22508955E-1},
+{2.20979586E-1, 3.16477656E-1, 4.01744068E-1, 4.20735776E-1},
+{2.79754996E-1, 3.30776095E-1, 4.11152899E-1, 4.32687044E-1},
+{2.64246881E-1, 3.16610634E-1, 3.83876741E-1, 4.36683774E-1},
+{3.44381154E-1, 3.85365665E-1, 4.24949467E-1, 4.41560209E-1},
+{2.19488308E-1, 2.36459881E-1, 3.42465997E-1, 4.24989998E-1},
+{2.91465104E-1, 3.22282016E-1, 3.72852802E-1, 3.91635895E-1},
+{2.74792433E-1, 3.16536307E-1, 3.45392585E-1, 3.74555230E-1},
+{3.10583472E-1, 3.35264921E-1, 3.87527227E-1, 4.23076212E-1},
+{2.23211512E-1, 2.98497617E-1, 3.68426204E-1, 3.90213728E-1},
+{2.89078832E-1, 3.26512754E-1, 3.76308680E-1, 4.09553707E-1},
+{2.63830125E-1, 3.08977246E-1, 3.81453037E-1, 4.04660761E-1},
+{3.47073615E-1, 3.64797831E-1, 3.86763453E-1, 4.04511690E-1},
+{2.18452707E-1, 2.75614083E-1, 3.62711072E-1, 4.18278992E-1},
+{3.15042794E-1, 3.40813220E-1, 3.78627181E-1, 3.96316767E-1},
+{2.79727697E-1, 3.31259727E-1, 3.60061288E-1, 3.81175518E-1},
+{3.18602443E-1, 3.38044286E-1, 4.09010768E-1, 4.30300415E-1},
+{2.64196932E-1, 2.90672481E-1, 3.68595004E-1, 4.31856751E-1},
+{2.72645593E-1, 3.63514841E-1, 3.96518826E-1, 4.20091212E-1},
+{2.26540968E-1, 3.50055099E-1, 3.93851519E-1, 4.12597001E-1},
+{3.53053868E-1, 3.69929552E-1, 4.09656048E-1, 4.26387310E-1},
+{2.60788381E-1, 2.85172462E-1, 3.45943332E-1, 3.97500694E-1},
+{3.01113129E-1, 3.28201890E-1, 3.56068015E-1, 4.10803795E-1},
+{2.88101614E-1, 3.09559643E-1, 3.43756795E-1, 4.24872875E-1},
+{3.10489357E-1, 3.51421893E-1, 3.93717408E-1, 4.15550530E-1},
+{2.22308263E-1, 3.26798201E-1, 3.77981663E-1, 3.98635030E-1},
+{3.02915514E-1, 3.22781920E-1, 3.98558855E-1, 4.25489604E-1},
+{2.77136803E-1, 3.19992602E-1, 3.77490878E-1, 4.29177463E-1},
+{3.38731766E-1, 3.58164370E-1, 4.08386350E-1, 4.25495386E-1},
+{2.18726233E-1, 2.84384966E-1, 3.94053698E-1, 4.16346967E-1},
+{3.01005960E-1, 3.44093680E-1, 3.69013667E-1, 4.15091276E-1},
+{2.80783713E-1, 3.33053648E-1, 3.76726151E-1, 3.97526860E-1},
+{3.14394057E-1, 3.62678826E-1, 4.23668981E-1, 4.41899240E-1},
+{2.66453624E-1, 3.08513761E-1, 3.97407174E-1, 4.17450190E-1},
+{2.94222653E-1, 3.41904402E-1, 4.12726879E-1, 4.34888899E-1},
+{2.87300706E-1, 3.32434595E-1, 3.78856659E-1, 4.38234031E-1},
+{3.57146621E-1, 3.98147047E-1, 4.29875731E-1, 4.44243908E-1},
+{2.29671344E-1, 2.51018614E-1, 3.41046572E-1, 4.04376328E-1},
+{2.94472575E-1, 3.34944606E-1, 3.60409737E-1, 3.83682847E-1},
+{2.88250983E-1, 3.11722696E-1, 3.31680059E-1, 3.65104675E-1},
+{3.24881613E-1, 3.45656693E-1, 3.88306379E-1, 4.05954897E-1},
+{2.50829220E-1, 2.77623534E-1, 3.70799541E-1, 3.90479207E-1},
+{2.93523371E-1, 3.28319192E-1, 3.92112255E-1, 4.09464061E-1},
+{2.83608794E-1, 3.03885639E-1, 3.78504395E-1, 3.97310555E-1},
+{3.34039807E-1, 3.52837384E-1, 3.97272944E-1, 4.14322019E-1},
+{2.21891895E-1, 2.51877457E-1, 3.71723533E-1, 4.31791008E-1},
+{3.13201427E-1, 3.41175437E-1, 3.65503550E-1, 3.88567209E-1},
+{2.71330535E-1, 3.39163721E-1, 3.62616420E-1, 3.95735979E-1},
+{3.07550132E-1, 3.47777665E-1, 4.01049614E-1, 4.32767451E-1},
+{2.59387434E-1, 2.87243843E-1, 3.86817336E-1, 4.06042695E-1},
+{2.85485208E-1, 3.44094992E-1, 4.02050495E-1, 4.19413745E-1},
+{2.65781403E-1, 3.40084374E-1, 3.69407654E-1, 4.27031696E-1},
+{3.53740931E-1, 3.84463251E-1, 4.11747813E-1, 4.26181793E-1},
+{2.43866488E-1, 2.68350184E-1, 3.42201948E-1, 3.98457229E-1},
+{2.93145239E-1, 3.34754169E-1, 3.61702800E-1, 3.98416638E-1},
+{2.91342974E-1, 3.13155174E-1, 3.36525917E-1, 3.87748599E-1},
+{3.05656791E-1, 3.62904549E-1, 3.88153434E-1, 4.05543149E-1},
+{2.17492327E-1, 3.11723530E-1, 3.75984788E-1, 4.28997755E-1},
+{2.91149259E-1, 3.29380929E-1, 4.03900385E-1, 4.22333181E-1},
+{2.90362060E-1, 3.09530973E-1, 3.78994226E-1, 4.13688362E-1},
+{3.29564869E-1, 3.77404690E-1, 4.06584859E-1, 4.24739718E-1},
+{2.46461585E-1, 2.71593273E-1, 3.66338253E-1, 4.30753767E-1},
+{3.14107716E-1, 3.37011874E-1, 3.80409718E-1, 4.11099434E-1},
+{2.76568413E-1, 3.27320695E-1, 3.58844280E-1, 4.28949475E-1},
+{3.17179084E-1, 3.58972430E-1, 4.04765844E-1, 4.40376341E-1},
+{2.42777750E-1, 3.34954798E-1, 3.96943450E-1, 4.13318396E-1},
+{2.88895488E-1, 3.25691164E-1, 4.22859550E-1, 4.43758667E-1},
+{2.77583301E-1, 3.25479031E-1, 3.89144659E-1, 4.41075861E-1},
+{3.59125674E-1, 3.90694141E-1, 4.21009541E-1, 4.35708523E-1},
+{2.20172390E-1, 2.47719273E-1, 3.54381859E-1, 4.25398111E-1},
+{3.06046784E-1, 3.27924728E-1, 3.66992772E-1, 3.93192589E-1},
+{2.70805597E-1, 3.16826642E-1, 3.45648706E-1, 4.11717594E-1},
+{3.23188901E-1, 3.45463097E-1, 3.89778793E-1, 4.21570778E-1},
+{2.46136114E-1, 3.12391996E-1, 3.72188628E-1, 3.95842731E-1},
+{3.03856730E-1, 3.24354768E-1, 3.85747254E-1, 4.14155006E-1},
+{2.81075418E-1, 3.18608463E-1, 3.85646880E-1, 4.02703643E-1},
+{3.53517115E-1, 3.72702539E-1, 3.96264613E-1, 4.13074911E-1},
+{2.09221140E-1, 2.95262218E-1, 3.80314291E-1, 4.31278229E-1},
+{3.25313628E-1, 3.46735477E-1, 3.70724022E-1, 3.91045630E-1},
+{2.86396503E-1, 3.43560040E-1, 3.69713604E-1, 3.89867842E-1},
+{3.27794671E-1, 3.47367823E-1, 4.05465066E-1, 4.24566150E-1},
+{2.53054976E-1, 3.02656293E-1, 3.82165134E-1, 4.29898322E-1},
+{2.94418454E-1, 3.70745420E-1, 3.95443261E-1, 4.19514775E-1},
+{2.62873113E-1, 3.45069230E-1, 4.04140890E-1, 4.21902061E-1},
+{3.65063488E-1, 3.82435143E-1, 4.13424790E-1, 4.31241691E-1},
+{2.48788506E-1, 2.82372773E-1, 3.65772307E-1, 4.10981059E-1},
+{3.07288766E-1, 3.27828944E-1, 3.77664983E-1, 4.36220944E-1},
+{2.98542321E-1, 3.20627332E-1, 3.50569665E-1, 4.27620232E-1},
+{3.16258013E-1, 3.62903833E-1, 3.88225138E-1, 4.25608873E-1},
+{2.39077866E-1, 3.31310451E-1, 3.70317876E-1, 4.15995896E-1},
+{3.03735793E-1, 3.32806051E-1, 4.10232842E-1, 4.27751064E-1},
+{2.96002507E-1, 3.19014788E-1, 3.81062448E-1, 4.26954985E-1},
+{3.32508922E-1, 3.62516999E-1, 4.23315108E-1, 4.40995157E-1},
+{2.35128701E-1, 2.74731100E-1, 4.12070572E-1, 4.35478806E-1},
+{2.98073769E-1, 3.55338752E-1, 3.79087746E-1, 4.15318787E-1},
+{2.83429801E-1, 3.45264912E-1, 3.70376289E-1, 4.09900844E-1},
+{3.23593080E-1, 3.65412831E-1, 4.12813127E-1, 4.31023479E-1},
+{2.76626348E-1, 3.00508440E-1, 4.02236879E-1, 4.26638782E-1},
+{2.94512928E-1, 3.61443222E-1, 4.19635236E-1, 4.36999202E-1},
+{2.90807247E-1, 3.41689348E-1, 3.92779291E-1, 4.43490267E-1},
+{3.59391451E-1, 4.03985143E-1, 4.40843761E-1, 4.53028619E-1},
+{2.23295465E-1, 2.39192486E-1, 3.23768020E-1, 4.21689451E-1},
+{2.94778049E-1, 3.18798721E-1, 3.53217840E-1, 3.91906381E-1},
+{2.59032130E-1, 3.10240507E-1, 3.43569040E-1, 3.95064235E-1},
+{3.16474676E-1, 3.38544369E-1, 3.93329024E-1, 4.12235558E-1},
+{2.40108207E-1, 2.84631193E-1, 3.60280991E-1, 3.79973769E-1},
+{2.96909094E-1, 3.15798342E-1, 3.94964337E-1, 4.15127575E-1},
+{2.85434067E-1, 3.04921508E-1, 3.61974716E-1, 4.05767262E-1},
+{3.37407053E-1, 3.56672168E-1, 3.85155082E-1, 4.11186695E-1},
+{2.24014923E-1, 2.60116160E-1, 3.94772530E-1, 4.19585884E-1},
+{3.00647914E-1, 3.41640651E-1, 3.70223522E-1, 3.89520049E-1},
+{2.65946031E-1, 3.25039148E-1, 3.74339938E-1, 3.92346144E-1},
+{3.16029310E-1, 3.40491295E-1, 4.02355313E-1, 4.20484245E-1},
+{2.69841492E-1, 2.94562399E-1, 3.62341762E-1, 4.06415462E-1},
+{2.78897285E-1, 3.59831035E-1, 3.82025838E-1, 4.10577476E-1},
+{2.60760844E-1, 3.31088543E-1, 3.88826251E-1, 4.05486643E-1},
+{3.43372285E-1, 3.82647038E-1, 4.14716601E-1, 4.31592941E-1},
+{2.47998103E-1, 2.73393154E-1, 3.31160426E-1, 4.18943226E-1},
+{3.03579569E-1, 3.25202465E-1, 3.70984435E-1, 4.14420485E-1},
+{2.76896894E-1, 3.00499499E-1, 3.54178190E-1, 4.28807020E-1},
+{3.23655546E-1, 3.59816968E-1, 3.89525414E-1, 4.09288704E-1},
+{2.38927796E-1, 3.09919238E-1, 3.53915572E-1, 4.16634321E-1},
+{2.81171739E-1, 3.07520270E-1, 4.16264892E-1, 4.38523829E-1},
+{2.88858652E-1, 3.09810817E-1, 3.67845178E-1, 4.36035573E-1},
+{3.38423491E-1, 3.70634377E-1, 4.15449977E-1, 4.31534529E-1},
+{2.41260394E-1, 2.73617864E-1, 3.89554620E-1, 4.12539542E-1},
+{2.98046708E-1, 3.40122104E-1, 3.86183739E-1, 4.13826346E-1},
+{2.82436430E-1, 3.31597507E-1, 3.57941389E-1, 4.12115216E-1},
+{3.03820193E-1, 3.70588601E-1, 4.05774951E-1, 4.31517065E-1},
+{2.39077732E-1, 3.11638474E-1, 4.13935781E-1, 4.35304046E-1},
+{2.67116845E-1, 3.41937900E-1, 4.17409420E-1, 4.39184844E-1},
+{2.67946839E-1, 3.33343923E-1, 3.86481404E-1, 4.37462509E-1},
+{3.40510964E-1, 3.90878022E-1, 4.35485125E-1, 4.49101925E-1},
+{2.10069850E-1, 2.32524484E-1, 3.61781418E-1, 4.31357861E-1},
+{2.94509888E-1, 3.33709776E-1, 3.82278621E-1, 3.98638904E-1},
+{2.80525148E-1, 3.25905204E-1, 3.50647032E-1, 3.92873943E-1},
+{3.19999635E-1, 3.43674660E-1, 3.91070545E-1, 4.37501073E-1},
+{2.20581010E-1, 3.03151906E-1, 3.81765544E-1, 4.04488146E-1},
+{2.86122739E-1, 3.29746544E-1, 3.88102829E-1, 4.24247742E-1},
+{2.69807100E-1, 3.25332284E-1, 3.79154503E-1, 4.15138245E-1},
+{3.34858894E-1, 3.69258404E-1, 3.94743145E-1, 4.11922157E-1},
+{2.07109794E-1, 2.72779524E-1, 3.78566444E-1, 4.34579968E-1},
+{3.06466222E-1, 3.46695721E-1, 3.87138307E-1, 4.03558314E-1},
+{2.70148575E-1, 3.46654534E-1, 3.77696693E-1, 3.96434486E-1},
+{3.18745911E-1, 3.40225697E-1, 4.14991558E-1, 4.41578746E-1},
+{2.58592844E-1, 3.14370096E-1, 3.65083754E-1, 4.21615183E-1},
+{2.82712996E-1, 3.54137123E-1, 4.06745970E-1, 4.29267883E-1},
+{2.52021760E-1, 3.59105110E-1, 3.95102918E-1, 4.18148398E-1},
+{3.54906201E-1, 3.74952912E-1, 4.18965995E-1, 4.36144412E-1},
+{2.64841139E-1, 2.92941809E-1, 3.27751458E-1, 4.08790469E-1},
+{3.07774246E-1, 3.35586190E-1, 3.62209618E-1, 4.25394237E-1},
+{2.88466334E-1, 3.16075742E-1, 3.60989630E-1, 4.19551432E-1},
+{3.17128420E-1, 3.55772197E-1, 4.05808747E-1, 4.23972964E-1},
+{2.47089684E-1, 3.38184595E-1, 3.71859610E-1, 3.95971477E-1},
+{3.07981730E-1, 3.32691789E-1, 4.00534213E-1, 4.38273668E-1},
+{2.79484808E-1, 3.16183507E-1, 3.97237718E-1, 4.34746623E-1},
+{3.44490469E-1, 3.66153181E-1, 4.10959423E-1, 4.41727102E-1},
+{2.35741779E-1, 2.94587255E-1, 3.98072541E-1, 4.16833401E-1},
+{3.14038455E-1, 3.52272034E-1, 3.79138887E-1, 4.10969079E-1},
+{2.83002496E-1, 3.38136256E-1, 3.88641894E-1, 4.06193316E-1},
+{3.23625326E-1, 3.50243390E-1, 4.28089559E-1, 4.46630359E-1},
+{2.61252105E-1, 3.24970961E-1, 4.00214493E-1, 4.25321758E-1},
+{3.05284500E-1, 3.42164159E-1, 4.24475133E-1, 4.43830967E-1},
+{2.87374794E-1, 3.32500637E-1, 3.94308269E-1, 4.42538500E-1},
+{3.74075353E-1, 4.02026355E-1, 4.30933535E-1, 4.44160044E-1},
+{2.34503999E-1, 2.56218612E-1, 3.41238797E-1, 4.23045278E-1},
+{3.05492580E-1, 3.29156995E-1, 3.52709830E-1, 3.92439067E-1},
+{2.81323552E-1, 3.03292334E-1, 3.48925412E-1, 3.93163860E-1},
+{3.21893454E-1, 3.50419939E-1, 3.97317469E-1, 4.14560318E-1},
+{2.39684582E-1, 2.92451501E-1, 3.78937423E-1, 3.96535456E-1},
+{3.07307243E-1, 3.29127908E-1, 3.98455560E-1, 4.16143298E-1},
+{2.85274565E-1, 3.08774531E-1, 3.92916501E-1, 4.14437652E-1},
+{3.44446361E-1, 3.62201869E-1, 3.97619784E-1, 4.17743623E-1},
+{2.32083067E-1, 2.67807961E-1, 3.78075659E-1, 4.34560895E-1},
+{3.04738700E-1, 3.51865292E-1, 3.75973165E-1, 3.95293653E-1},
+{2.61990905E-1, 3.46207321E-1, 3.71296942E-1, 4.12438929E-1},
+{3.11080933E-1, 3.51040900E-1, 4.16082799E-1, 4.34340119E-1},
+{2.74980426E-1, 2.96631455E-1, 3.87520492E-1, 4.09243762E-1},
+{2.90939093E-1, 3.54455590E-1, 3.93426955E-1, 4.08220291E-1},
+{2.71871865E-1, 3.45510781E-1, 3.87125313E-1, 4.22590613E-1},
+{3.63245904E-1, 3.81932199E-1, 4.04114902E-1, 4.18370664E-1},
+{2.45770738E-1, 2.72909343E-1, 3.48317921E-1, 4.25161839E-1},
+{3.14139009E-1, 3.37872326E-1, 3.65195215E-1, 4.04423416E-1},
+{2.94075787E-1, 3.16935539E-1, 3.43047202E-1, 4.06130373E-1},
+{3.14627469E-1, 3.72413397E-1, 4.00660694E-1, 4.17930841E-1},
+{2.34014243E-1, 3.14007223E-1, 3.83003533E-1, 4.34829175E-1},
+{2.93635666E-1, 3.20529997E-1, 4.10837352E-1, 4.36393142E-1},
+{2.89505839E-1, 3.11828852E-1, 3.86311471E-1, 4.38771248E-1},
+{3.26317430E-1, 3.80858183E-1, 4.19721425E-1, 4.38795507E-1},
+{2.50809520E-1, 2.83018053E-1, 3.82247388E-1, 4.34244394E-1},
+{3.18994045E-1, 3.44855130E-1, 3.72690141E-1, 4.23067033E-1},
+{2.88380086E-1, 3.36622238E-1, 3.69742334E-1, 4.25057590E-1},
+{3.06107700E-1, 3.81856918E-1, 4.18206155E-1, 4.32868361E-1},
+{2.33898312E-1, 3.44861805E-1, 4.12176549E-1, 4.29216206E-1},
+{2.85980880E-1, 3.42903793E-1, 4.25112903E-1, 4.44299698E-1},
+{2.79858828E-1, 3.38789344E-1, 3.92085373E-1, 4.40541029E-1},
+{3.64509344E-1, 3.82202744E-1, 4.29830611E-1, 4.45818365E-1},
+{2.34392300E-1, 2.57377386E-1, 3.59567046E-1, 4.30088580E-1},
+{3.05031896E-1, 3.27589393E-1, 3.78305554E-1, 4.01026130E-1},
+{2.77522624E-1, 3.18130314E-1, 3.67794275E-1, 4.01543021E-1},
+{3.33035767E-1, 3.55820954E-1, 3.87548923E-1, 4.24628675E-1},
+{2.45021001E-1, 3.12560678E-1, 3.91147614E-1, 4.08762813E-1},
+{2.97059119E-1, 3.40246916E-1, 3.92919302E-1, 4.28899705E-1},
+{2.77839303E-1, 3.25019777E-1, 3.97436380E-1, 4.15920913E-1},
+{3.49465251E-1, 3.70362461E-1, 3.95482540E-1, 4.31923389E-1},
+{2.31485590E-1, 2.91023374E-1, 3.77909541E-1, 4.32259738E-1},
+{3.19283485E-1, 3.53671074E-1, 3.80982876E-1, 3.97843361E-1},
+{2.89689243E-1, 3.50265682E-1, 3.80729675E-1, 3.97969365E-1},
+{3.28987300E-1, 3.52005422E-1, 4.12557244E-1, 4.37597930E-1},
+{2.76273251E-1, 3.02267194E-1, 3.81723404E-1, 4.34989095E-1},
+{2.79627264E-1, 3.73727322E-1, 4.12374616E-1, 4.30626333E-1},
+{2.53442764E-1, 3.65940034E-1, 4.14937019E-1, 4.32743609E-1},
+{3.76107216E-1, 3.95142019E-1, 4.16787744E-1, 4.33023572E-1},
+{2.62815833E-1, 2.88270533E-1, 3.47397208E-1, 4.24182594E-1},
+{3.01931322E-1, 3.43652546E-1, 3.77031326E-1, 4.34204459E-1},
+{2.97834277E-1, 3.23495388E-1, 3.64492416E-1, 4.33550835E-1},
+{3.31774473E-1, 3.64324927E-1, 3.98243546E-1, 4.35078323E-1},
+{2.49049723E-1, 3.27870786E-1, 3.83587003E-1, 4.35558081E-1},
+{3.04653406E-1, 3.27671230E-1, 4.18484688E-1, 4.41378772E-1},
+{2.96960890E-1, 3.23898911E-1, 3.90463710E-1, 4.39915955E-1},
+{3.43923748E-1, 3.67100477E-1, 4.29523230E-1, 4.45214987E-1},
+{2.59399652E-1, 2.91602671E-1, 4.04372454E-1, 4.31413233E-1},
+{2.97537506E-1, 3.57573807E-1, 3.88991833E-1, 4.30006981E-1},
+{2.84068942E-1, 3.49574566E-1, 3.81042838E-1, 4.29712772E-1},
+{3.25716257E-1, 3.74875903E-1, 4.31959271E-1, 4.47290838E-1},
+{2.65302956E-1, 3.14745963E-1, 4.16703463E-1, 4.37294722E-1},
+{3.00398588E-1, 3.54147255E-1, 4.28538084E-1, 4.60336387E-1},
+{2.98077166E-1, 3.49304914E-1, 4.00429249E-1, 4.48213518E-1},
+{3.75576198E-1, 4.16657329E-1, 4.42136765E-1, 4.52728629E-1}};
+
+static const float evrc_lspq_quant_codebook1[16][5] = {
+{0.42091064E-1, 0.69474973E-1, 0.11168948E+0, 0.14571965E+0, 0.20893581E+0},
+{0.54944664E-1, 0.98242261E-1, 0.11007882E+0, 0.15890779E+0, 0.20548241E+0},
+{0.45188572E-1, 0.75199433E-1, 0.11423391E+0, 0.15469728E+0, 0.19746706E+0},
+{0.49474996E-1, 0.79667501E-1, 0.12571351E+0, 0.16944779E+0, 0.20775315E+0},
+{0.41789379E-1, 0.63459560E-1, 0.12068028E+0, 0.15850765E+0, 0.20406815E+0},
+{0.47159236E-1, 0.79129547E-1, 0.12183110E+0, 0.15650047E+0, 0.22309226E+0},
+{0.54539919E-1, 0.80343045E-1, 0.12947764E+0, 0.15186153E+0, 0.20171718E+0},
+{0.55852082E-1, 0.94114847E-1, 0.14016025E+0, 0.17807084E+0, 0.22955489E+0},
+{0.45443531E-1, 0.73541410E-1, 0.11937657E+0, 0.15442030E+0, 0.21010752E+0},
+{0.63178010E-1, 0.95231488E-1, 0.12364983E+0, 0.17672543E+0, 0.21743731E+0},
+{0.52765369E-1, 0.84351443E-1, 0.11589085E+0, 0.15790924E+0, 0.20732352E+0},
+{0.51865745E-1, 0.81328541E-1, 0.13756232E+0, 0.18322878E+0, 0.21640070E+0},
+{0.44419531E-1, 0.68874463E-1, 0.13115251E+0, 0.16263582E+0, 0.21659100E+0},
+{0.49378436E-1, 0.81882551E-1, 0.13067168E+0, 0.16821896E+0, 0.23136081E+0},
+{0.55909779E-1, 0.90783298E-1, 0.13348848E+0, 0.16298474E+0, 0.20961523E+0},
+{0.61378211E-1, 0.98602772E-1, 0.14793332E+0, 0.19283190E+0, 0.23156509E+0}};
+
+static const float evrc_lspq_quant_codebook2[16][5] = {
+{0.26822963, 0.30585295, 0.31110349, 0.36823335, 0.40774474},
+{0.24418014, 0.28970167, 0.32573757, 0.39021483, 0.41345838},
+{0.23341830, 0.30078292, 0.32893899, 0.38557330, 0.41068462},
+{0.25905868, 0.29756859, 0.34196618, 0.38531172, 0.41295227},
+{0.24290450, 0.29223618, 0.32718554, 0.37788135, 0.40332928},
+{0.24674191, 0.29749370, 0.33631226, 0.39426059, 0.42258954},
+{0.21377595, 0.33140418, 0.34067687, 0.38222077, 0.40939021},
+{0.26673481, 0.30791649, 0.34419721, 0.39611506, 0.42387524},
+{0.26121426, 0.30492544, 0.32997236, 0.38486803, 0.42023736},
+{0.24954870, 0.29372856, 0.33382735, 0.37850669, 0.41714057},
+{0.24158891, 0.30173415, 0.34128246, 0.38428575, 0.41619650},
+{0.25818908, 0.31736413, 0.34904337, 0.38769925, 0.41551358},
+{0.24450587, 0.30673453, 0.33579323, 0.37844428, 0.40557048},
+{0.25164026, 0.31225079, 0.33847794, 0.39554194, 0.42396802},
+{0.22787990, 0.31779197, 0.33831909, 0.40044111, 0.41185561},
+{0.27896860, 0.32261974, 0.35658112, 0.40206763, 0.42370448}};
+
+static const float * const evrc_lspq_full_codebooks[] = {
+    evrc_lspq_full_codebook1[0], evrc_lspq_full_codebook2[0],
+    evrc_lspq_full_codebook3[0], evrc_lspq_full_codebook4[0],
+};
+
+static const float * const evrc_lspq_half_codebooks[] = {
+    evrc_lspq_half_codebook1[0], evrc_lspq_half_codebook2[0],
+    evrc_lspq_half_codebook3[0],
+};
+
+static const float * const evrc_lspq_quant_codebooks[] = {
+    evrc_lspq_quant_codebook1[0], evrc_lspq_quant_codebook2[0],
+};
+
+static const float * const * const evrc_lspq_codebooks[] = {
+    0,
+    evrc_lspq_quant_codebooks,
+    0,
+    evrc_lspq_half_codebooks,
+    evrc_lspq_full_codebooks,
+};
+
+static const uint8_t evrc_lspq_nb_codebooks[] = {
+    0,
+    FF_ARRAY_ELEMS(evrc_lspq_quant_codebooks),
+    0,
+    FF_ARRAY_ELEMS(evrc_lspq_half_codebooks),
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebooks),
+};
+
+static const uint8_t evrc_lspq_full_codebooks_row_sizes[] = {
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebook1[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebook2[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebook3[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebook4[0]),
+};
+
+static const uint8_t evrc_lspq_half_codebooks_row_sizes[] = {
+    FF_ARRAY_ELEMS(evrc_lspq_half_codebook1[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_half_codebook2[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_half_codebook3[0]),
+};
+
+static const uint8_t evrc_lspq_quant_codebooks_row_sizes[] = {
+    FF_ARRAY_ELEMS(evrc_lspq_quant_codebook1[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_quant_codebook2[0]),
+};
+
+static const uint8_t* const evrc_lspq_codebooks_row_sizes[] = {
+    NULL,
+    evrc_lspq_quant_codebooks_row_sizes,
+    NULL,
+    evrc_lspq_half_codebooks_row_sizes,
+    evrc_lspq_full_codebooks_row_sizes,
+};
+
+static const float pitch_gain_vq[] = { 0, 0.3, 0.55, 0.7, 0.8, 0.9, 1, 1.2 };
+static const float estimation_delay[] = { 55.0, 80.0, 39.0, 71.0, 33.0 }; // 5.2.3.4
+static const uint8_t subframe_sizes[] = { 53, 53, 54 };
+#endif /* AVCODEC_EVRCDATA_H */
diff --git a/libavcodec/evrcdec.c b/libavcodec/evrcdec.c
new file mode 100644
index 0000000000..cbbc62976f
--- /dev/null
+++ b/libavcodec/evrcdec.c
@@ -0,0 +1,941 @@
+/*
+ * Enhanced Variable Rate Codec, Service Option 3 decoder
+ * Copyright (c) 2013 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Enhanced Variable Rate Codec, Service Option 3 decoder
+ * @author Paul B Mahol
+ */
+
+#include "libavutil/mathematics.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "evrcdata.h"
+#include "acelp_vectors.h"
+#include "lsp.h"
+
+#define MIN_LSP_SEP (0.05 / (2.0 * M_PI))
+#define MIN_DELAY      20
+#define MAX_DELAY     120
+#define NB_SUBFRAMES    3
+#define SUBFRAME_SIZE  54
+#define FILTER_ORDER   10
+#define ACB_SIZE      128
+
+typedef enum {
+    RATE_ERRS = -1,
+    SILENCE,
+    RATE_QUANT,
+    RATE_QUARTER,
+    RATE_HALF,
+    RATE_FULL,
+} evrc_packet_rate;
+
+/**
+ * EVRC-A unpacked data frame
+ */
+typedef struct EVRCAFrame {
+    uint8_t  lpc_flag;        ///< spectral change indicator
+    uint16_t lsp[4];          ///< index into LSP codebook
+    uint8_t  pitch_delay;     ///< pitch delay for entire frame
+    uint8_t  delay_diff;      ///< delay difference for entire frame
+    uint8_t  acb_gain[3];     ///< adaptive codebook gain
+    uint16_t fcb_shape[3][4]; ///< fixed codebook shape
+    uint8_t  fcb_gain[3];     ///< fixed codebook gain index
+    uint8_t  energy_gain;     ///< frame energy gain index
+    uint8_t  tty;             ///< tty baud rate bit
+} EVRCAFrame;
+
+typedef struct EVRCContext {
+    AVClass *class;
+
+    int              postfilter;
+
+    GetBitContext    gb;
+    evrc_packet_rate bitrate;
+    evrc_packet_rate last_valid_bitrate;
+    EVRCAFrame       frame;
+
+    float            lspf[FILTER_ORDER];
+    float            prev_lspf[FILTER_ORDER];
+    float            synthesis[FILTER_ORDER];
+    float            postfilter_fir[FILTER_ORDER];
+    float            postfilter_iir[FILTER_ORDER];
+    float            postfilter_residual[ACB_SIZE + SUBFRAME_SIZE];
+    float            pitch_delay;
+    float            prev_pitch_delay;
+    float            avg_acb_gain;  ///< average adaptive codebook gain
+    float            avg_fcb_gain;  ///< average fixed codebook gain
+    float            pitch[ACB_SIZE + FILTER_ORDER + SUBFRAME_SIZE];
+    float            pitch_back[ACB_SIZE];
+    float            interpolation_coeffs[136];
+    float            energy_vector[NB_SUBFRAMES];
+    float            fade_scale;
+    float            last;
+
+    uint8_t          prev_energy_gain;
+    uint8_t          prev_error_flag;
+    uint8_t          warned_buf_mismatch_bitrate;
+} EVRCContext;
+
+/**
+ * Frame unpacking for RATE_FULL, RATE_HALF and RATE_QUANT
+ *
+ * @param e the context
+ *
+ * TIA/IS-127 Table 4.21-1
+ */
+static void unpack_frame(EVRCContext *e)
+{
+    EVRCAFrame *frame = &e->frame;
+    GetBitContext *gb = &e->gb;
+
+    switch (e->bitrate) {
+    case RATE_FULL:
+        frame->lpc_flag        = get_bits1(gb);
+        frame->lsp[0]          = get_bits(gb,  6);
+        frame->lsp[1]          = get_bits(gb,  6);
+        frame->lsp[2]          = get_bits(gb,  9);
+        frame->lsp[3]          = get_bits(gb,  7);
+        frame->pitch_delay     = get_bits(gb,  7);
+        frame->delay_diff      = get_bits(gb,  5);
+        frame->acb_gain[0]     = get_bits(gb,  3);
+        frame->fcb_shape[0][0] = get_bits(gb,  8);
+        frame->fcb_shape[0][1] = get_bits(gb,  8);
+        frame->fcb_shape[0][2] = get_bits(gb,  8);
+        frame->fcb_shape[0][3] = get_bits(gb, 11);
+        frame->fcb_gain[0]     = get_bits(gb,  5);
+        frame->acb_gain[1]     = get_bits(gb,  3);
+        frame->fcb_shape[1][0] = get_bits(gb,  8);
+        frame->fcb_shape[1][1] = get_bits(gb,  8);
+        frame->fcb_shape[1][2] = get_bits(gb,  8);
+        frame->fcb_shape[1][3] = get_bits(gb, 11);
+        frame->fcb_gain    [1] = get_bits(gb,  5);
+        frame->acb_gain    [2] = get_bits(gb,  3);
+        frame->fcb_shape[2][0] = get_bits(gb,  8);
+        frame->fcb_shape[2][1] = get_bits(gb,  8);
+        frame->fcb_shape[2][2] = get_bits(gb,  8);
+        frame->fcb_shape[2][3] = get_bits(gb, 11);
+        frame->fcb_gain    [2] = get_bits(gb,  5);
+        frame->tty             = get_bits1(gb);
+        break;
+    case RATE_HALF:
+        frame->lsp         [0] = get_bits(gb,  7);
+        frame->lsp         [1] = get_bits(gb,  7);
+        frame->lsp         [2] = get_bits(gb,  8);
+        frame->pitch_delay     = get_bits(gb,  7);
+        frame->acb_gain    [0] = get_bits(gb,  3);
+        frame->fcb_shape[0][0] = get_bits(gb, 10);
+        frame->fcb_gain    [0] = get_bits(gb,  4);
+        frame->acb_gain    [1] = get_bits(gb,  3);
+        frame->fcb_shape[1][0] = get_bits(gb, 10);
+        frame->fcb_gain    [1] = get_bits(gb,  4);
+        frame->acb_gain    [2] = get_bits(gb,  3);
+        frame->fcb_shape[2][0] = get_bits(gb, 10);
+        frame->fcb_gain    [2] = get_bits(gb,  4);
+        break;
+    case RATE_QUANT:
+        frame->lsp         [0] = get_bits(gb, 4);
+        frame->lsp         [1] = get_bits(gb, 4);
+        frame->energy_gain     = get_bits(gb, 8);
+        break;
+    }
+}
+
+static evrc_packet_rate buf_size2bitrate(const int buf_size)
+{
+    switch (buf_size) {
+    case 23: return RATE_FULL;
+    case 11: return RATE_HALF;
+    case  6: return RATE_QUARTER;
+    case  3: return RATE_QUANT;
+    case  1: return SILENCE;
+    }
+
+    return RATE_ERRS;
+}
+
+/**
+ * Determine the bitrate from the frame size and/or the first byte of the frame.
+ *
+ * @param avctx the AV codec context
+ * @param buf_size length of the buffer
+ * @param buf the bufffer
+ *
+ * @return the bitrate on success,
+ *         RATE_ERRS  if the bitrate cannot be satisfactorily determined
+ */
+static evrc_packet_rate determine_bitrate(AVCodecContext *avctx,
+                                          int *buf_size,
+                                          const uint8_t **buf)
+{
+    evrc_packet_rate bitrate;
+
+    if ((bitrate = buf_size2bitrate(*buf_size)) >= 0) {
+        if (bitrate > **buf) {
+            EVRCContext *e = avctx->priv_data;
+            if (!e->warned_buf_mismatch_bitrate) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Claimed bitrate and buffer size mismatch.\n");
+                e->warned_buf_mismatch_bitrate = 1;
+            }
+            bitrate = **buf;
+        } else if (bitrate < **buf) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Buffer is too small for the claimed bitrate.\n");
+            return RATE_ERRS;
+        }
+        (*buf)++;
+        *buf_size -= 1;
+    } else if ((bitrate = buf_size2bitrate(*buf_size + 1)) >= 0) {
+        av_log(avctx, AV_LOG_DEBUG,
+               "Bitrate byte is missing, guessing the bitrate from packet size.\n");
+    } else
+        return RATE_ERRS;
+
+    return bitrate;
+}
+
+static void warn_insufficient_frame_quality(AVCodecContext *avctx,
+                                            const char *message)
+{
+    av_log(avctx, AV_LOG_WARNING, "Frame #%d, %s\n",
+           avctx->frame_number, message);
+}
+
+/**
+ * Initialize the speech codec according to the specification.
+ *
+ * TIA/IS-127 5.2
+ */
+static av_cold int evrc_decode_init(AVCodecContext *avctx)
+{
+    EVRCContext *e = avctx->priv_data;
+    int i, n, idx = 0;
+    float denom = 2.0 / (2.0 * 8.0 + 1.0);
+
+    avctx->channels       = 1;
+    avctx->channel_layout = AV_CH_LAYOUT_MONO;
+    avctx->sample_fmt     = AV_SAMPLE_FMT_FLT;
+
+    for (i = 0; i < FILTER_ORDER; i++) {
+        e->prev_lspf[i] = (i + 1) * 0.048;
+        e->synthesis[i] = 0.0;
+    }
+
+    for (i = 0; i < ACB_SIZE; i++)
+        e->pitch[i] = e->pitch_back[i] = 0.0;
+
+    e->last_valid_bitrate = RATE_QUANT;
+    e->prev_pitch_delay   = 40.0;
+    e->fade_scale         = 1.0;
+    e->prev_error_flag    = 0;
+    e->avg_acb_gain = e->avg_fcb_gain = 0.0;
+
+    for (i = 0; i < 8; i++) {
+        float tt = ((float)i - 8.0 / 2.0) / 8.0;
+
+        for (n = -8; n <= 8; n++, idx++) {
+            float arg1 = M_PI * 0.9 * (tt - n);
+            float arg2 = M_PI * (tt - n);
+
+            e->interpolation_coeffs[idx] = 0.9;
+            if (arg1)
+                e->interpolation_coeffs[idx] *= (0.54 + 0.46 * cos(arg2 * denom)) *
+                                                 sin(arg1) / arg1;
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Decode the 10 vector quantized line spectral pair frequencies from the LSP
+ * transmission codes of any bitrate and check for badly received packets.
+ *
+ * @param e the context
+ *
+ * @return 0 on success, -1 if the packet is badly received
+ *
+ * TIA/IS-127 5.2.1, 5.7.1
+ */
+static int decode_lspf(EVRCContext *e)
+{
+    const float * const *codebooks = evrc_lspq_codebooks[e->bitrate];
+    int i, j, k = 0;
+
+    for (i = 0; i < evrc_lspq_nb_codebooks[e->bitrate]; i++) {
+        int row_size = evrc_lspq_codebooks_row_sizes[e->bitrate][i];
+        const float *codebook = codebooks[i];
+
+        for (j = 0; j < row_size; j++)
+            e->lspf[k++] = codebook[e->frame.lsp[i] * row_size + j];
+    }
+
+    // check for monotonic LSPs
+    for (i = 1; i < FILTER_ORDER; i++)
+        if (e->lspf[i] <= e->lspf[i - 1])
+            return -1;
+
+    // check for minimum separation of LSPs at the splits
+    for (i = 0, k = 0; i < evrc_lspq_nb_codebooks[e->bitrate] - 1; i++) {
+        k += evrc_lspq_codebooks_row_sizes[e->bitrate][i];
+        if (e->lspf[k] - e->lspf[k - 1] <= MIN_LSP_SEP)
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Interpolation of LSP parameters.
+ *
+ * TIA/IS-127 5.2.3.1, 5.7.3.2
+ */
+static void interpolate_lsp(float *ilsp, const float *lsp,
+                            const float *prev, int index)
+{
+    static const float lsp_interpolation_factors[] = { 0.1667, 0.5, 0.8333 };
+    ff_weighted_vector_sumf(ilsp, prev, lsp,
+                            1.0 - lsp_interpolation_factors[index],
+                            lsp_interpolation_factors[index], FILTER_ORDER);
+}
+
+/*
+ * Reconstruction of the delay contour.
+ *
+ * TIA/IS-127 5.2.2.3.2
+ */
+static void interpolate_delay(float *dst, float current, float prev, int index)
+{
+    static const float d_interpolation_factors[] = { 0, 0.3313, 0.6625, 1, 1 };
+    dst[0] = (1.0 - d_interpolation_factors[index    ]) * prev
+                  + d_interpolation_factors[index    ]  * current;
+    dst[1] = (1.0 - d_interpolation_factors[index + 1]) * prev
+                  + d_interpolation_factors[index + 1]  * current;
+    dst[2] = (1.0 - d_interpolation_factors[index + 2]) * prev
+                  + d_interpolation_factors[index + 2]  * current;
+}
+
+/*
+ * Convert the quantized, interpolated line spectral frequencies,
+ * to prediction coefficients.
+ *
+ * TIA/IS-127 5.2.3.2, 4.7.2.2
+ */
+static void decode_predictor_coeffs(const float *ilspf, float *ilpc)
+{
+    double lsp[FILTER_ORDER];
+    float a[FILTER_ORDER / 2 + 1], b[FILTER_ORDER / 2 + 1];
+    float a1[FILTER_ORDER / 2] = { 0 };
+    float a2[FILTER_ORDER / 2] = { 0 };
+    float b1[FILTER_ORDER / 2] = { 0 };
+    float b2[FILTER_ORDER / 2] = { 0 };
+    int i, k;
+
+    ff_acelp_lsf2lspd(lsp, ilspf, FILTER_ORDER);
+
+    for (k = 0; k <= FILTER_ORDER; k++) {
+        a[0] = k < 2 ? 0.25 : 0;
+        b[0] = k < 2 ? k < 1 ? 0.25 : -0.25 : 0;
+
+        for (i = 0; i < FILTER_ORDER / 2; i++) {
+            a[i + 1] = a[i] - 2 * lsp[i * 2    ] * a1[i] + a2[i];
+            b[i + 1] = b[i] - 2 * lsp[i * 2 + 1] * b1[i] + b2[i];
+            a2[i] = a1[i];
+            a1[i] = a[i];
+            b2[i] = b1[i];
+            b1[i] = b[i];
+        }
+
+        if (k)
+            ilpc[k - 1] = 2.0 * (a[FILTER_ORDER / 2] + b[FILTER_ORDER / 2]);
+    }
+}
+
+static void bl_intrp(EVRCContext *e, float *ex, float delay)
+{
+    float *f;
+    int offset, i, coef_idx;
+    int16_t t;
+
+    offset = lrintf(delay);
+
+    t = (offset - delay + 0.5) * 8.0 + 0.5;
+    if (t == 8) {
+        t = 0;
+        offset--;
+    }
+
+    f = ex - offset - 8;
+
+    coef_idx = t * (2 * 8 + 1);
+
+    ex[0] = 0.0;
+    for (i = 0; i < 2 * 8 + 1; i++)
+        ex[0] += e->interpolation_coeffs[coef_idx + i] * f[i];
+}
+
+/*
+ * Adaptive codebook excitation.
+ *
+ * TIA/IS-127 5.2.2.3.3, 4.12.5.2
+ */
+static void acb_excitation(EVRCContext *e, float *excitation, float gain,
+                           const float delay[3], int length)
+{
+    float denom, locdelay, dpr, invl;
+    int i;
+
+    invl = 1.0 / ((float) length);
+    dpr = length;
+
+    /* first at-most extra samples */
+    denom = (delay[1] - delay[0]) * invl;
+    for (i = 0; i < dpr; i++) {
+        locdelay = delay[0] + i * denom;
+        bl_intrp(e, excitation + i, locdelay);
+    }
+
+    denom = (delay[2] - delay[1]) * invl;
+    /* interpolation */
+    for (i = dpr; i < dpr + 10; i++) {
+        locdelay = delay[1] + (i - dpr) * denom;
+        bl_intrp(e, excitation + i, locdelay);
+    }
+
+    for (i = 0; i < length; i++)
+        excitation[i] *= gain;
+}
+
+static void decode_8_pulses_35bits(const uint16_t *fixed_index, float *cod)
+{
+    int i, pos1, pos2, offset;
+
+    offset = (fixed_index[3] >> 9) & 3;
+
+    for (i = 0; i < 3; i++) {
+        pos1 = ((fixed_index[i] & 0x7f) / 11) * 5 + ((i + offset) % 5);
+        pos2 = ((fixed_index[i] & 0x7f) % 11) * 5 + ((i + offset) % 5);
+
+        cod[pos1] = (fixed_index[i] & 0x80) ? -1.0 : 1.0;
+
+        if (pos2 < pos1)
+            cod[pos2]  = -cod[pos1];
+        else
+            cod[pos2] +=  cod[pos1];
+    }
+
+    pos1 = ((fixed_index[3] & 0x7f) / 11) * 5 + ((3 + offset) % 5);
+    pos2 = ((fixed_index[3] & 0x7f) % 11) * 5 + ((4 + offset) % 5);
+
+    cod[pos1] = (fixed_index[3] & 0x100) ? -1.0 : 1.0;
+    cod[pos2] = (fixed_index[3] & 0x80 ) ? -1.0 : 1.0;
+}
+
+static void decode_3_pulses_10bits(uint16_t fixed_index, float *cod)
+{
+    float sign;
+    int pos;
+
+    sign = (fixed_index & 0x200) ? -1.0 : 1.0;
+
+    pos = ((fixed_index        & 0x7) * 7) + 4;
+    cod[pos] += sign;
+    pos = (((fixed_index >> 3) & 0x7) * 7) + 2;
+    cod[pos] -= sign;
+    pos = (((fixed_index >> 6) & 0x7) * 7);
+    cod[pos] += sign;
+}
+
+/*
+ * Reconstruction of ACELP fixed codebook excitation for full and half rate.
+ *
+ * TIA/IS-127 5.2.3.7
+ */
+static void fcb_excitation(EVRCContext *e, const uint16_t *codebook,
+                           float *excitation, float pitch_gain,
+                           int pitch_lag, int subframe_size)
+{
+    int i;
+
+    if (e->bitrate == RATE_FULL)
+        decode_8_pulses_35bits(codebook, excitation);
+    else
+        decode_3_pulses_10bits(*codebook, excitation);
+
+    pitch_gain = av_clipf(pitch_gain, 0.2, 0.9);
+
+    for (i = pitch_lag; i < subframe_size; i++)
+        excitation[i] += pitch_gain * excitation[i - pitch_lag];
+}
+
+/**
+ * Synthesis of the decoder output signal.
+ *
+ * param[in]     in              input signal
+ * param[in]     filter_coeffs   LPC coefficients
+ * param[in/out] memory          synthesis filter memory
+ * param         buffer_length   amount of data to process
+ * param[out]    samples         output samples
+ *
+ * TIA/IS-127 5.2.3.15, 5.7.3.4
+ */
+static void synthesis_filter(const float *in, const float *filter_coeffs,
+                             float *memory, int buffer_length, float *samples)
+{
+    int i, j;
+
+    for (i = 0; i < buffer_length; i++) {
+        samples[i] = in[i];
+        for (j = FILTER_ORDER - 1; j > 0; j--) {
+            samples[i] -= filter_coeffs[j] * memory[j];
+            memory[j]   = memory[j - 1];
+        }
+        samples[i] -= filter_coeffs[0] * memory[0];
+        memory[0]   = samples[i];
+    }
+}
+
+static void bandwidth_expansion(float *coeff, const float *inbuf, float gamma)
+{
+    double fac = gamma;
+    int i;
+
+    for (i = 0; i < FILTER_ORDER; i++) {
+        coeff[i] = inbuf[i] * fac;
+        fac *= gamma;
+    }
+}
+
+static void residual_filter(float *output, const float *input,
+                            const float *coef, float *memory, int length)
+{
+    float sum;
+    int i, j;
+
+    for (i = 0; i < length; i++) {
+        sum = input[i];
+
+        for (j = FILTER_ORDER - 1; j > 0; j--) {
+            sum      += coef[j] * memory[j];
+            memory[j] = memory[j - 1];
+        }
+        sum += coef[0] * memory[0];
+        memory[0] = input[i];
+        output[i] = sum;
+    }
+}
+
+/*
+ * TIA/IS-127 Table 5.9.1-1.
+ */
+static const struct PfCoeff {
+    float tilt;
+    float ltgain;
+    float p1;
+    float p2;
+} postfilter_coeffs[5] = {
+    { 0.0 , 0.0 , 0.0 , 0.0  },
+    { 0.0 , 0.0 , 0.57, 0.57 },
+    { 0.0 , 0.0 , 0.0 , 0.0  },
+    { 0.35, 0.50, 0.50, 0.75 },
+    { 0.20, 0.50, 0.57, 0.75 },
+};
+
+/*
+ * Adaptive postfilter.
+ *
+ * TIA/IS-127 5.9
+ */
+static void postfilter(EVRCContext *e, float *in, const float *coeff,
+                       float *out, int idx, const struct PfCoeff *pfc,
+                       int length)
+{
+    float wcoef1[FILTER_ORDER], wcoef2[FILTER_ORDER],
+          scratch[SUBFRAME_SIZE], temp[SUBFRAME_SIZE],
+          mem[SUBFRAME_SIZE];
+    float sum1 = 0.0, sum2 = 0.0, gamma, gain;
+    float tilt = pfc->tilt;
+    int i, n, best;
+
+    bandwidth_expansion(wcoef1, coeff, pfc->p1);
+    bandwidth_expansion(wcoef2, coeff, pfc->p2);
+
+    /* Tilt compensation filter, TIA/IS-127 5.9.1 */
+    for (i = 0; i < length - 1; i++)
+        sum2 += in[i] * in[i + 1];
+    if (sum2 < 0.0)
+        tilt = 0.0;
+
+    for (i = 0; i < length; i++) {
+        scratch[i] = in[i] - tilt * e->last;
+        e->last = in[i];
+    }
+
+    /* Short term residual filter, TIA/IS-127 5.9.2 */
+    residual_filter(&e->postfilter_residual[ACB_SIZE], scratch, wcoef1, e->postfilter_fir, length);
+
+    /* Long term postfilter */
+    best = idx;
+    for (i = FFMIN(MIN_DELAY, idx - 3); i <= FFMAX(MAX_DELAY, idx + 3); i++) {
+        for (n = ACB_SIZE, sum2 = 0; n < ACB_SIZE + length; n++)
+            sum2 += e->postfilter_residual[n] * e->postfilter_residual[n - i];
+        if (sum2 > sum1) {
+            sum1 = sum2;
+            best = i;
+        }
+    }
+
+    for (i = ACB_SIZE, sum1 = 0; i < ACB_SIZE + length; i++)
+        sum1 += e->postfilter_residual[i - best] * e->postfilter_residual[i - best];
+    for (i = ACB_SIZE, sum2 = 0; i < ACB_SIZE + length; i++)
+        sum2 += e->postfilter_residual[i] * e->postfilter_residual[i - best];
+
+    if (sum2 * sum1 == 0 || e->bitrate == RATE_QUANT) {
+        memcpy(temp, e->postfilter_residual + ACB_SIZE, length * sizeof(float));
+    } else {
+        gamma = sum2 / sum1;
+        if (gamma < 0.5)
+            memcpy(temp, e->postfilter_residual + ACB_SIZE, length * sizeof(float));
+        else {
+            gamma = FFMIN(gamma, 1.0);
+
+            for (i = 0; i < length; i++) {
+                temp[i] = e->postfilter_residual[ACB_SIZE + i] + gamma *
+                    pfc->ltgain * e->postfilter_residual[ACB_SIZE + i - best];
+            }
+        }
+    }
+
+    memcpy(scratch, temp, length * sizeof(float));
+    memcpy(mem, e->postfilter_iir, FILTER_ORDER * sizeof(float));
+    synthesis_filter(scratch, wcoef2, mem, length, scratch);
+
+    /* Gain computation, TIA/IS-127 5.9.4-2 */
+    for (i = 0, sum1 = 0, sum2 = 0; i < length; i++) {
+        sum1 += in[i] * in[i];
+        sum2 += scratch[i] * scratch[i];
+    }
+    gain = sum2 ? sqrt(sum1 / sum2) : 1.0;
+
+    for (i = 0; i < length; i++)
+        temp[i] *= gain;
+
+    /* Short term postfilter */
+    synthesis_filter(temp, wcoef2, e->postfilter_iir, length, out);
+
+    memmove(e->postfilter_residual,
+           e->postfilter_residual + length, ACB_SIZE * sizeof(float));
+}
+
+static void frame_erasure(EVRCContext *e, float *samples)
+{
+    float ilspf[FILTER_ORDER], ilpc[FILTER_ORDER], idelay[NB_SUBFRAMES],
+          tmp[SUBFRAME_SIZE + 6], f;
+    int i, j;
+
+    for (i = 0; i < FILTER_ORDER; i++) {
+        if (e->bitrate != RATE_QUANT)
+            e->lspf[i] = e->prev_lspf[i] * 0.875 + 0.125 * (i + 1) * 0.048;
+        else
+            e->lspf[i] = e->prev_lspf[i];
+    }
+
+    if (e->prev_error_flag)
+        e->avg_acb_gain *= 0.75;
+    if (e->bitrate == RATE_FULL)
+        memcpy(e->pitch_back, e->pitch, ACB_SIZE * sizeof(float));
+    if (e->last_valid_bitrate == RATE_QUANT)
+        e->bitrate = RATE_QUANT;
+    else
+        e->bitrate = RATE_FULL;
+
+    if (e->bitrate == RATE_FULL || e->bitrate == RATE_HALF) {
+        e->pitch_delay = e->prev_pitch_delay;
+    } else {
+        float sum = 0;
+
+        idelay[0] = idelay[1] = idelay[2] = MIN_DELAY;
+
+        for (i = 0; i < NB_SUBFRAMES; i++)
+            sum += evrc_energy_quant[e->prev_energy_gain][i];
+        sum /= (float) NB_SUBFRAMES;
+        sum  = pow(10, sum);
+        for (i = 0; i < NB_SUBFRAMES; i++)
+            e->energy_vector[i] = sum;
+    }
+
+    if (fabs(e->pitch_delay - e->prev_pitch_delay) > 15)
+        e->prev_pitch_delay = e->pitch_delay;
+
+    for (i = 0; i < NB_SUBFRAMES; i++) {
+        int subframe_size = subframe_sizes[i];
+        int pitch_lag;
+
+        interpolate_lsp(ilspf, e->lspf, e->prev_lspf, i);
+
+        if (e->bitrate != RATE_QUANT) {
+            if (e->avg_acb_gain < 0.3) {
+                idelay[0] = estimation_delay[i];
+                idelay[1] = estimation_delay[i + 1];
+                idelay[2] = estimation_delay[i + 2];
+            } else {
+                interpolate_delay(idelay, e->pitch_delay, e->prev_pitch_delay, i);
+            }
+        }
+
+        pitch_lag = lrintf((idelay[1] + idelay[0]) / 2.0);
+        decode_predictor_coeffs(ilspf, ilpc);
+
+        if (e->bitrate != RATE_QUANT) {
+            acb_excitation(e, e->pitch + ACB_SIZE,
+                           e->avg_acb_gain, idelay, subframe_size);
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] *= e->fade_scale;
+            e->fade_scale = FFMAX(e->fade_scale - 0.05, 0.0);
+        } else {
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] = e->energy_vector[i];
+        }
+
+        memmove(e->pitch, e->pitch + subframe_size, ACB_SIZE * sizeof(float));
+
+        if (e->bitrate != RATE_QUANT && e->avg_acb_gain < 0.4) {
+            f = 0.1 * e->avg_fcb_gain;
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] += f;
+        } else if (e->bitrate == RATE_QUANT) {
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] = e->energy_vector[i];
+        }
+
+        synthesis_filter(e->pitch + ACB_SIZE, ilpc,
+                         e->synthesis, subframe_size, tmp);
+        postfilter(e, tmp, ilpc, samples, pitch_lag,
+                   &postfilter_coeffs[e->bitrate], subframe_size);
+
+        samples += subframe_size;
+    }
+}
+
+static int evrc_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame_ptr, AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    AVFrame *frame     = data;
+    EVRCContext *e     = avctx->priv_data;
+    int buf_size       = avpkt->size;
+    float ilspf[FILTER_ORDER], ilpc[FILTER_ORDER], idelay[NB_SUBFRAMES];
+    float *samples;
+    int   i, j, ret, error_flag = 0;
+
+    frame->nb_samples = 160;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    samples = (float *)frame->data[0];
+
+    if ((e->bitrate = determine_bitrate(avctx, &buf_size, &buf)) == RATE_ERRS) {
+        warn_insufficient_frame_quality(avctx, "bitrate cannot be determined.");
+        goto erasure;
+    }
+    if (e->bitrate <= SILENCE || e->bitrate == RATE_QUARTER)
+        goto erasure;
+    if (e->bitrate == RATE_QUANT && e->last_valid_bitrate == RATE_FULL
+                                 && !e->prev_error_flag)
+        goto erasure;
+
+    if ((ret = init_get_bits8(&e->gb, buf, buf_size)) < 0)
+        return ret;
+    memset(&e->frame, 0, sizeof(EVRCAFrame));
+
+    unpack_frame(e);
+
+    if (e->bitrate != RATE_QUANT) {
+        uint8_t *p = (uint8_t *) &e->frame;
+        for (i = 0; i < sizeof(EVRCAFrame); i++) {
+            if (p[i])
+                break;
+        }
+        if (i == sizeof(EVRCAFrame))
+            goto erasure;
+    } else if (e->frame.lsp[0] == 0xf &&
+               e->frame.lsp[1] == 0xf &&
+               e->frame.energy_gain == 0xff) {
+        goto erasure;
+    }
+
+    if (decode_lspf(e) < 0)
+        goto erasure;
+
+    if (e->bitrate == RATE_FULL || e->bitrate == RATE_HALF) {
+        /* Pitch delay parameter checking as per TIA/IS-127 5.1.5.1 */
+        if (e->frame.pitch_delay > MAX_DELAY - MIN_DELAY)
+            goto erasure;
+
+        e->pitch_delay = e->frame.pitch_delay + MIN_DELAY;
+
+        /* Delay diff parameter checking as per TIA/IS-127 5.1.5.2 */
+        if (e->frame.delay_diff) {
+            int p = e->pitch_delay - e->frame.delay_diff + 16;
+            if (p < MIN_DELAY || p > MAX_DELAY)
+                goto erasure;
+        }
+
+        /* Delay contour reconstruction as per TIA/IS-127 5.2.2.2 */
+        if (e->frame.delay_diff &&
+            e->bitrate == RATE_FULL && e->prev_error_flag) {
+            float delay;
+
+            memcpy(e->pitch, e->pitch_back, ACB_SIZE * sizeof(float));
+
+            delay = e->prev_pitch_delay;
+            e->prev_pitch_delay = delay - e->frame.delay_diff + 16.0;
+
+            if (fabs(e->pitch_delay - delay) > 15)
+                delay = e->pitch_delay;
+
+            for (i = 0; i < NB_SUBFRAMES; i++) {
+                int subframe_size = subframe_sizes[i];
+
+                interpolate_delay(idelay, delay, e->prev_pitch_delay, i);
+                acb_excitation(e, e->pitch + ACB_SIZE, e->avg_acb_gain, idelay, subframe_size);
+                memmove(e->pitch, e->pitch + subframe_size, ACB_SIZE * sizeof(float));
+            }
+        }
+
+        /* Smoothing of the decoded delay as per TIA/IS-127 5.2.2.5 */
+        if (fabs(e->pitch_delay - e->prev_pitch_delay) > 15)
+            e->prev_pitch_delay = e->pitch_delay;
+
+        e->avg_acb_gain = e->avg_fcb_gain = 0.0;
+    } else {
+        idelay[0] = idelay[1] = idelay[2] = MIN_DELAY;
+
+        /* Decode frame energy vectors as per TIA/IS-127 5.7.2 */
+        for (i = 0; i < NB_SUBFRAMES; i++)
+            e->energy_vector[i] = pow(10, evrc_energy_quant[e->frame.energy_gain][i]);
+        e->prev_energy_gain = e->frame.energy_gain;
+    }
+
+    for (i = 0; i < NB_SUBFRAMES; i++) {
+        float tmp[SUBFRAME_SIZE + 6] = { 0 };
+        int subframe_size = subframe_sizes[i];
+        int pitch_lag;
+
+        interpolate_lsp(ilspf, e->lspf, e->prev_lspf, i);
+
+        if (e->bitrate != RATE_QUANT)
+            interpolate_delay(idelay, e->pitch_delay, e->prev_pitch_delay, i);
+
+        pitch_lag = lrintf((idelay[1] + idelay[0]) / 2.0);
+        decode_predictor_coeffs(ilspf, ilpc);
+
+        /* Bandwidth expansion as per TIA/IS-127 5.2.3.3 */
+        if (e->frame.lpc_flag && e->prev_error_flag)
+            bandwidth_expansion(ilpc, ilpc, 0.75);
+
+        if (e->bitrate != RATE_QUANT) {
+            float acb_sum, f;
+
+            f = exp((e->bitrate == RATE_HALF ? 0.5 : 0.25)
+                         * (e->frame.fcb_gain[i] + 1));
+            acb_sum = pitch_gain_vq[e->frame.acb_gain[i]];
+            e->avg_acb_gain += acb_sum / NB_SUBFRAMES;
+            e->avg_fcb_gain += f / NB_SUBFRAMES;
+
+            acb_excitation(e, e->pitch + ACB_SIZE,
+                           acb_sum, idelay, subframe_size);
+            fcb_excitation(e, e->frame.fcb_shape[i], tmp,
+                           acb_sum, pitch_lag, subframe_size);
+
+            /* Total excitation generation as per TIA/IS-127 5.2.3.9 */
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] += f * tmp[j];
+            e->fade_scale = FFMIN(e->fade_scale + 0.2, 1.0);
+        } else {
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] = e->energy_vector[i];
+        }
+
+        memmove(e->pitch, e->pitch + subframe_size, ACB_SIZE * sizeof(float));
+
+        synthesis_filter(e->pitch + ACB_SIZE, ilpc,
+                         e->synthesis, subframe_size,
+                         e->postfilter ? tmp : samples);
+        if (e->postfilter)
+            postfilter(e, tmp, ilpc, samples, pitch_lag,
+                       &postfilter_coeffs[e->bitrate], subframe_size);
+
+        samples += subframe_size;
+    }
+
+    if (error_flag) {
+erasure:
+        error_flag = 1;
+        av_log(avctx, AV_LOG_WARNING, "frame erasure\n");
+        frame_erasure(e, samples);
+    }
+
+    memcpy(e->prev_lspf, e->lspf, sizeof(e->prev_lspf));
+    e->prev_error_flag    = error_flag;
+    e->last_valid_bitrate = e->bitrate;
+
+    if (e->bitrate != RATE_QUANT)
+        e->prev_pitch_delay = e->pitch_delay;
+
+    samples = (float *)frame->data[0];
+    for (i = 0; i < 160; i++)
+        samples[i] /= 32768;
+
+    *got_frame_ptr   = 1;
+
+    return avpkt->size;
+}
+
+#define OFFSET(x) offsetof(EVRCContext, x)
+#define AD AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM
+
+static const AVOption options[] = {
+    { "postfilter", "enable postfilter", OFFSET(postfilter), AV_OPT_TYPE_INT, {.i64 = 1}, 0, 1, AD },
+    { NULL }
+};
+
+static const AVClass evrcdec_class = {
+    .class_name = "evrc",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_evrc_decoder = {
+    .name           = "evrc",
+    .long_name      = NULL_IF_CONFIG_SMALL("EVRC (Enhanced Variable Rate Codec)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_EVRC,
+    .init           = evrc_decode_init,
+    .decode         = evrc_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_data_size = sizeof(EVRCContext),
+    .priv_class     = &evrcdec_class,
+};
diff --git a/libavcodec/exif.c b/libavcodec/exif.c
new file mode 100644
index 0000000000..fa30f0573f
--- /dev/null
+++ b/libavcodec/exif.c
@@ -0,0 +1,142 @@
+/*
+ * EXIF metadata parser
+ * Copyright (c) 2013 Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * EXIF metadata parser
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ */
+
+#include "exif.h"
+
+
+static const char *exif_get_tag_name(uint16_t id)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(tag_list); i++) {
+        if (tag_list[i].id == id)
+            return tag_list[i].name;
+    }
+
+    return NULL;
+}
+
+
+static int exif_add_metadata(AVCodecContext *avctx, int count, int type,
+                             const char *name, const char *sep,
+                             GetByteContext *gb, int le,
+                             AVDictionary **metadata)
+{
+    switch(type) {
+    case 0:
+        av_log(avctx, AV_LOG_WARNING,
+               "Invalid TIFF tag type 0 found for %s with size %d\n",
+               name, count);
+        return 0;
+    case TIFF_DOUBLE   : return ff_tadd_doubles_metadata(count, name, sep, gb, le, metadata);
+    case TIFF_SSHORT   : return ff_tadd_shorts_metadata(count, name, sep, gb, le, 1, metadata);
+    case TIFF_SHORT    : return ff_tadd_shorts_metadata(count, name, sep, gb, le, 0, metadata);
+    case TIFF_SBYTE    : return ff_tadd_bytes_metadata(count, name, sep, gb, le, 1, metadata);
+    case TIFF_BYTE     :
+    case TIFF_UNDEFINED: return ff_tadd_bytes_metadata(count, name, sep, gb, le, 0, metadata);
+    case TIFF_STRING   : return ff_tadd_string_metadata(count, name, gb, le, metadata);
+    case TIFF_SRATIONAL:
+    case TIFF_RATIONAL : return ff_tadd_rational_metadata(count, name, sep, gb, le, metadata);
+    case TIFF_SLONG    :
+    case TIFF_LONG     : return ff_tadd_long_metadata(count, name, sep, gb, le, metadata);
+    default:
+        avpriv_request_sample(avctx, "TIFF tag type (%u)", type);
+        return 0;
+    };
+}
+
+
+static int exif_decode_tag(AVCodecContext *avctx, GetByteContext *gbytes, int le,
+                           int depth, AVDictionary **metadata)
+{
+    int ret, cur_pos;
+    unsigned id, count;
+    enum TiffTypes type;
+
+    if (depth > 2) {
+        return 0;
+    }
+
+    ff_tread_tag(gbytes, le, &id, &type, &count, &cur_pos);
+
+    if (!bytestream2_tell(gbytes)) {
+        bytestream2_seek(gbytes, cur_pos, SEEK_SET);
+        return 0;
+    }
+
+    // read count values and add it metadata
+    // store metadata or proceed with next IFD
+    ret = ff_tis_ifd(id);
+    if (ret) {
+        ret = avpriv_exif_decode_ifd(avctx, gbytes, le, depth + 1, metadata);
+    } else {
+        const char *name = exif_get_tag_name(id);
+        char *use_name   = (char*) name;
+
+        if (!use_name) {
+            use_name = av_malloc(7);
+            if (!use_name) {
+                return AVERROR(ENOMEM);
+            }
+            snprintf(use_name, 7, "0x%04X", id);
+        }
+
+        ret = exif_add_metadata(avctx, count, type, use_name, NULL,
+                                gbytes, le, metadata);
+
+        if (!name) {
+            av_freep(&use_name);
+        }
+    }
+
+    bytestream2_seek(gbytes, cur_pos, SEEK_SET);
+
+    return ret;
+}
+
+
+int avpriv_exif_decode_ifd(AVCodecContext *avctx, GetByteContext *gbytes, int le,
+                           int depth, AVDictionary **metadata)
+{
+    int i, ret;
+    int entries;
+
+    entries = ff_tget_short(gbytes, le);
+
+    if (bytestream2_get_bytes_left(gbytes) < entries * 12) {
+        return AVERROR_INVALIDDATA;
+    }
+
+    for (i = 0; i < entries; i++) {
+        if ((ret = exif_decode_tag(avctx, gbytes, le, depth, metadata)) < 0) {
+            return ret;
+        }
+    }
+
+    // return next IDF offset or 0x000000000 or a value < 0 for failure
+    return ff_tget_long(gbytes, le);
+}
diff --git a/libavcodec/exif.h b/libavcodec/exif.h
new file mode 100644
index 0000000000..2f509ba1e9
--- /dev/null
+++ b/libavcodec/exif.h
@@ -0,0 +1,170 @@
+/*
+ * EXIF metadata parser
+ * Copyright (c) 2013 Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * EXIF metadata parser
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ */
+
+#ifndef AVCODEC_EXIF_H
+#define AVCODEC_EXIF_H
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "tiff.h"
+
+#define EXIF_MAX_IFD_RECURSION 2
+#define EXIF_TAG_NAME_LENGTH   32
+
+struct exif_tag {
+    char      name[EXIF_TAG_NAME_LENGTH];
+    uint16_t  id;
+};
+
+static const struct exif_tag tag_list[] = { // JEITA CP-3451 EXIF specification:
+    {"GPSVersionID",               0x00}, // <- Table 12 GPS Attribute Information
+    {"GPSLatitudeRef",             0x01},
+    {"GPSLatitude",                0x02},
+    {"GPSLongitudeRef",            0x03},
+    {"GPSLongitude",               0x04},
+    {"GPSAltitudeRef",             0x05},
+    {"GPSAltitude",                0x06},
+    {"GPSTimeStamp",               0x07},
+    {"GPSSatellites",              0x08},
+    {"GPSStatus",                  0x09},
+    {"GPSMeasureMode",             0x0A},
+    {"GPSDOP",                     0x0B},
+    {"GPSSpeedRef",                0x0C},
+    {"GPSSpeed",                   0x0D},
+    {"GPSTrackRef",                0x0E},
+    {"GPSTrack",                   0x0F},
+    {"GPSImgDirectionRef",         0x10},
+    {"GPSImgDirection",            0x11},
+    {"GPSMapDatum",                0x12},
+    {"GPSDestLatitudeRef",         0x13},
+    {"GPSDestLatitude",            0x14},
+    {"GPSDestLongitudeRef",        0x15},
+    {"GPSDestLongitude",           0x16},
+    {"GPSDestBearingRef",          0x17},
+    {"GPSDestBearing",             0x18},
+    {"GPSDestDistanceRef",         0x19},
+    {"GPSDestDistance",            0x1A},
+    {"GPSProcessingMethod",        0x1B},
+    {"GPSAreaInformation",         0x1C},
+    {"GPSDateStamp",               0x1D},
+    {"GPSDifferential",            0x1E},
+    {"ImageWidth",                 0x100}, // <- Table 3 TIFF Rev. 6.0 Attribute Information Used in Exif
+    {"ImageLength",                0x101},
+    {"BitsPerSample",              0x102},
+    {"Compression",                0x103},
+    {"PhotometricInterpretation",  0x106},
+    {"Orientation",                0x112},
+    {"SamplesPerPixel",            0x115},
+    {"PlanarConfiguration",        0x11C},
+    {"YCbCrSubSampling",           0x212},
+    {"YCbCrPositioning",           0x213},
+    {"XResolution",                0x11A},
+    {"YResolution",                0x11B},
+    {"ResolutionUnit",             0x128},
+    {"StripOffsets",               0x111},
+    {"RowsPerStrip",               0x116},
+    {"StripByteCounts",            0x117},
+    {"JPEGInterchangeFormat",      0x201},
+    {"JPEGInterchangeFormatLength",0x202},
+    {"TransferFunction",           0x12D},
+    {"WhitePoint",                 0x13E},
+    {"PrimaryChromaticities",      0x13F},
+    {"YCbCrCoefficients",          0x211},
+    {"ReferenceBlackWhite",        0x214},
+    {"DateTime",                   0x132},
+    {"ImageDescription",           0x10E},
+    {"Make",                       0x10F},
+    {"Model",                      0x110},
+    {"Software",                   0x131},
+    {"Artist",                     0x13B},
+    {"Copyright",                  0x8298},
+    {"ExifVersion",                0x9000}, // <- Table 4 Exif IFD Attribute Information (1)
+    {"FlashpixVersion",            0xA000},
+    {"ColorSpace",                 0xA001},
+    {"ComponentsConfiguration",    0x9101},
+    {"CompressedBitsPerPixel",     0x9102},
+    {"PixelXDimension",            0xA002},
+    {"PixelYDimension",            0xA003},
+    {"MakerNote",                  0x927C},
+    {"UserComment",                0x9286},
+    {"RelatedSoundFile",           0xA004},
+    {"DateTimeOriginal",           0x9003},
+    {"DateTimeDigitized",          0x9004},
+    {"SubSecTime",                 0x9290},
+    {"SubSecTimeOriginal",         0x9291},
+    {"SubSecTimeDigitized",        0x9292},
+    {"ImageUniqueID",              0xA420},
+    {"ExposureTime",               0x829A}, // <- Table 5 Exif IFD Attribute Information (2)
+    {"FNumber",                    0x829D},
+    {"ExposureProgram",            0x8822},
+    {"SpectralSensitivity",        0x8824},
+    {"ISOSpeedRatings",            0x8827},
+    {"OECF",                       0x8828},
+    {"ShutterSpeedValue",          0x9201},
+    {"ApertureValue",              0x9202},
+    {"BrightnessValue",            0x9203},
+    {"ExposureBiasValue",          0x9204},
+    {"MaxApertureValue",           0x9205},
+    {"SubjectDistance",            0x9206},
+    {"MeteringMode",               0x9207},
+    {"LightSource",                0x9208},
+    {"Flash",                      0x9209},
+    {"FocalLength",                0x920A},
+    {"SubjectArea",                0x9214},
+    {"FlashEnergy",                0xA20B},
+    {"SpatialFrequencyResponse",   0xA20C},
+    {"FocalPlaneXResolution",      0xA20E},
+    {"FocalPlaneYResolution",      0xA20F},
+    {"FocalPlaneResolutionUnit",   0xA210},
+    {"SubjectLocation",            0xA214},
+    {"ExposureIndex",              0xA215},
+    {"SensingMethod",              0xA217},
+    {"FileSource",                 0xA300},
+    {"SceneType",                  0xA301},
+    {"CFAPattern",                 0xA302},
+    {"CustomRendered",             0xA401},
+    {"ExposureMode",               0xA402},
+    {"WhiteBalance",               0xA403},
+    {"DigitalZoomRatio",           0xA404},
+    {"FocalLengthIn35mmFilm",      0xA405},
+    {"SceneCaptureType",           0xA406},
+    {"GainControl",                0xA407},
+    {"Contrast",                   0xA408},
+    {"Saturation",                 0xA409},
+    {"Sharpness",                  0xA40A},
+    {"DeviceSettingDescription",   0xA40B},
+    {"SubjectDistanceRange",       0xA40C}
+//    {"InteroperabilityIndex",      0x1}, // <- Table 13 Interoperability IFD Attribute Information
+//    {"",                           0x0}
+};
+
+/** Recursively decodes all IFD's and
+ *  adds included TAGS into the metadata dictionary. */
+int avpriv_exif_decode_ifd(AVCodecContext *avctx, GetByteContext *gbytes, int le,
+                           int depth, AVDictionary **metadata);
+
+#endif /* AVCODEC_EXIF_H */
diff --git a/libavcodec/exr.c b/libavcodec/exr.c
index cdcdd7cc6c..86a99086fe 100644
--- a/libavcodec/exr.c
+++ b/libavcodec/exr.c
@@ -2,20 +2,20 @@
  * OpenEXR (.exr) image decoder
  * Copyright (c) 2009 Jimmy Christensen
  *
- * This file is part of Libav
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,6 +37,7 @@
 #include "libavutil/imgutils.h"
 #include "libavutil/intfloat.h"
 #include "libavutil/opt.h"
+#include "libavutil/color_utils.h"
 
 #include "avcodec.h"
 #include "bytestream.h"
@@ -110,6 +111,7 @@ typedef struct EXRContext {
 
     const char *layer;
 
+    enum AVColorTransferCharacteristic apply_trc_type;
     float gamma;
     uint16_t gamma_table[65536];
 } EXRContext;
@@ -722,8 +724,8 @@ static int piz_uncompress(EXRContext *s, const uint8_t *src, int ssize,
     if (!td->lut)
         td->lut = av_malloc(1 << 17);
     if (!td->bitmap || !td->lut) {
-        av_free(td->bitmap);
-        av_free(td->lut);
+        av_freep(&td->bitmap);
+        av_freep(&td->lut);
         return AVERROR(ENOMEM);
     }
 
@@ -842,6 +844,7 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
     int bxmin = s->xmin * 2 * s->desc->nb_components;
     int i, x, buf_size = s->buf_size;
     float one_gamma = 1.0f / s->gamma;
+    avpriv_trc_function trc_func = avpriv_get_trc_function_from_trc(s->apply_trc_type);
     int ret;
 
     line_offset = AV_RL64(s->gb.buffer + jobnr * 8);
@@ -921,24 +924,43 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
         ptr_x += s->xmin * s->desc->nb_components;
         if (s->pixel_type == EXR_FLOAT) {
             // 32-bit
-            for (x = 0; x < xdelta; x++) {
-                union av_intfloat32 t;
-                t.i = bytestream_get_le32(&r);
-                if (t.f > 0.0f)  /* avoid negative values */
-                    t.f = powf(t.f, one_gamma);
-                *ptr_x++ = exr_flt2uint(t.i);
-
-                t.i = bytestream_get_le32(&g);
-                if (t.f > 0.0f)
-                    t.f = powf(t.f, one_gamma);
-                *ptr_x++ = exr_flt2uint(t.i);
-
-                t.i = bytestream_get_le32(&b);
-                if (t.f > 0.0f)
-                    t.f = powf(t.f, one_gamma);
-                *ptr_x++ = exr_flt2uint(t.i);
-                if (channel_buffer[3])
-                    *ptr_x++ = exr_flt2uint(bytestream_get_le32(&a));
+            if (trc_func) {
+                for (x = 0; x < xdelta; x++) {
+                    union av_intfloat32 t;
+                    t.i = bytestream_get_le32(&r);
+                    t.f = trc_func(t.f);
+                    *ptr_x++ = exr_flt2uint(t.i);
+
+                    t.i = bytestream_get_le32(&g);
+                    t.f = trc_func(t.f);
+                    *ptr_x++ = exr_flt2uint(t.i);
+
+                    t.i = bytestream_get_le32(&b);
+                    t.f = trc_func(t.f);
+                    *ptr_x++ = exr_flt2uint(t.i);
+                    if (channel_buffer[3])
+                        *ptr_x++ = exr_flt2uint(bytestream_get_le32(&a));
+                }
+            } else {
+                for (x = 0; x < xdelta; x++) {
+                    union av_intfloat32 t;
+                    t.i = bytestream_get_le32(&r);
+                    if (t.f > 0.0f)  /* avoid negative values */
+                        t.f = powf(t.f, one_gamma);
+                    *ptr_x++ = exr_flt2uint(t.i);
+
+                    t.i = bytestream_get_le32(&g);
+                    if (t.f > 0.0f)
+                        t.f = powf(t.f, one_gamma);
+                    *ptr_x++ = exr_flt2uint(t.i);
+
+                    t.i = bytestream_get_le32(&b);
+                    if (t.f > 0.0f)
+                        t.f = powf(t.f, one_gamma);
+                    *ptr_x++ = exr_flt2uint(t.i);
+                    if (channel_buffer[3])
+                        *ptr_x++ = exr_flt2uint(bytestream_get_le32(&a));
+                }
             }
         } else {
             // 16-bit
@@ -1010,6 +1032,22 @@ static int decode_header(EXRContext *s)
     int current_channel_offset = 0;
     int magic_number, version, flags, i;
 
+    s->xmin               = ~0;
+    s->xmax               = ~0;
+    s->ymin               = ~0;
+    s->ymax               = ~0;
+    s->xdelta             = ~0;
+    s->ydelta             = ~0;
+    s->channel_offsets[0] = -1;
+    s->channel_offsets[1] = -1;
+    s->channel_offsets[2] = -1;
+    s->channel_offsets[3] = -1;
+    s->pixel_type         = EXR_UNKNOWN;
+    s->compression        = EXR_UNKN;
+    s->nb_channels        = 0;
+    s->w                  = 0;
+    s->h                  = 0;
+
     if (bytestream2_get_bytes_left(&s->gb) < 10) {
         av_log(s->avctx, AV_LOG_ERROR, "Header too short to parse.\n");
         return AVERROR_INVALIDDATA;
@@ -1271,6 +1309,9 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
+    if (s->apply_trc_type != AVCOL_TRC_UNSPECIFIED)
+        avctx->color_trc = s->apply_trc_type;
+
     switch (s->compression) {
     case EXR_RAW:
     case EXR_RLE:
@@ -1348,36 +1389,31 @@ static av_cold int decode_init(AVCodecContext *avctx)
     uint32_t i;
     union av_intfloat32 t;
     float one_gamma = 1.0f / s->gamma;
+    avpriv_trc_function trc_func = NULL;
 
     s->avctx              = avctx;
-    s->xmin               = ~0;
-    s->xmax               = ~0;
-    s->ymin               = ~0;
-    s->ymax               = ~0;
-    s->xdelta             = ~0;
-    s->ydelta             = ~0;
-    s->channel_offsets[0] = -1;
-    s->channel_offsets[1] = -1;
-    s->channel_offsets[2] = -1;
-    s->channel_offsets[3] = -1;
-    s->pixel_type         = EXR_UNKNOWN;
-    s->compression        = EXR_UNKN;
-    s->nb_channels        = 0;
-    s->w                  = 0;
-    s->h                  = 0;
 
-    if (one_gamma > 0.9999f && one_gamma < 1.0001f) {
-        for (i = 0; i < 65536; ++i)
-            s->gamma_table[i] = exr_halflt2uint(i);
-    } else {
+    trc_func = avpriv_get_trc_function_from_trc(s->apply_trc_type);
+    if (trc_func) {
         for (i = 0; i < 65536; ++i) {
             t = exr_half2float(i);
-            /* If negative value we reuse half value */
-            if (t.f <= 0.0f) {
+            t.f = trc_func(t.f);
+            s->gamma_table[i] = exr_flt2uint(t.i);
+        }
+    } else {
+        if (one_gamma > 0.9999f && one_gamma < 1.0001f) {
+            for (i = 0; i < 65536; ++i)
                 s->gamma_table[i] = exr_halflt2uint(i);
-            } else {
-                t.f = powf(t.f, one_gamma);
-                s->gamma_table[i] = exr_flt2uint(t.i);
+        } else {
+            for (i = 0; i < 65536; ++i) {
+                t = exr_half2float(i);
+                /* If negative value we reuse half value */
+                if (t.f <= 0.0f) {
+                    s->gamma_table[i] = exr_halflt2uint(i);
+                } else {
+                    t.f = powf(t.f, one_gamma);
+                    s->gamma_table[i] = exr_flt2uint(t.i);
+                }
             }
         }
     }
@@ -1390,6 +1426,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+#if HAVE_THREADS
 static int decode_init_thread_copy(AVCodecContext *avctx)
 {    EXRContext *s = avctx->priv_data;
 
@@ -1400,6 +1437,7 @@ static int decode_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 static av_cold int decode_end(AVCodecContext *avctx)
 {
@@ -1426,6 +1464,43 @@ static const AVOption options[] = {
         AV_OPT_TYPE_STRING, { .str = "" }, 0, 0, VD },
     { "gamma", "Set the float gamma value when decoding", OFFSET(gamma),
         AV_OPT_TYPE_FLOAT, { .dbl = 1.0f }, 0.001, FLT_MAX, VD },
+
+    // XXX: Note the abuse of the enum using AVCOL_TRC_UNSPECIFIED to subsume the existing gamma option
+    { "apply_trc", "color transfer characteristics to apply to EXR linear input", OFFSET(apply_trc_type),
+        AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_UNSPECIFIED }, 1, AVCOL_TRC_NB-1, VD, "apply_trc_type"},
+    { "bt709",        "BT.709",           0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709 },        INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "gamma",        "gamma",            0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_UNSPECIFIED },  INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "gamma22",      "BT.470 M",         0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_GAMMA22 },      INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "gamma28",      "BT.470 BG",        0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_GAMMA28 },      INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "smpte170m",    "SMPTE 170 M",      0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE170M },    INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "smpte240m",    "SMPTE 240 M",      0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE240M },    INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "linear",       "Linear",           0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LINEAR },       INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "log",          "Log",              0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LOG },          INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "log_sqrt",     "Log square root",  0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LOG_SQRT },     INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "iec61966_2_4", "IEC 61966-2-4",    0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_4 }, INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "bt1361",       "BT.1361",          0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT1361_ECG },   INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "iec61966_2_1", "IEC 61966-2-1",    0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_1 }, INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "bt2020_10bit", "BT.2020 - 10 bit", 0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10 },    INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "bt2020_12bit", "BT.2020 - 12 bit", 0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_12 },    INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "smpte2084",    "SMPTE ST 2084",    0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST2084 },  INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "smpte428_1",   "SMPTE ST 428-1",   0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST428_1 }, INT_MIN, INT_MAX, VD, "apply_trc_type"},
+
     { NULL },
 };
 
diff --git a/libavcodec/faandct.h b/libavcodec/faandct.h
index 59d5ff3534..c5ef96dcf1 100644
--- a/libavcodec/faandct.h
+++ b/libavcodec/faandct.h
@@ -2,20 +2,20 @@
  * Floating point AAN DCT
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/faanidct.c b/libavcodec/faanidct.c
index 2e9ce9ce1e..ca82f778ba 100644
--- a/libavcodec/faanidct.c
+++ b/libavcodec/faanidct.c
@@ -2,20 +2,20 @@
  * Floating point AAN IDCT
  * Copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "faanidct.h"
diff --git a/libavcodec/faanidct.h b/libavcodec/faanidct.h
index 0c015201cc..4cd2c7858f 100644
--- a/libavcodec/faanidct.h
+++ b/libavcodec/faanidct.h
@@ -2,20 +2,20 @@
  * Floating point AAN IDCT
  * Copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/faxcompr.c b/libavcodec/faxcompr.c
index 4cbda3f126..80df418a29 100644
--- a/libavcodec/faxcompr.c
+++ b/libavcodec/faxcompr.c
@@ -2,20 +2,20 @@
  * CCITT Fax Group 3 and 4 decompression
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -122,6 +122,77 @@ av_cold void ff_ccitt_unpack_init(void)
     initialized = 1;
 }
 
+static int decode_uncompressed(AVCodecContext *avctx, GetBitContext *gb,
+                               unsigned int *pix_left, int **runs,
+                               const int *runend, int *mode)
+{
+    int eob = 0;
+    int newmode;
+    int saved_run = 0;
+
+    do {
+        int cwi, k;
+        int cw = 0;
+        int codes[2];
+        do {
+            cwi = show_bits(gb, 11);
+            if (!cwi) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid uncompressed codeword\n");
+                return AVERROR_INVALIDDATA;
+            }
+            cwi = 10 - av_log2(cwi);
+            skip_bits(gb, cwi + 1);
+            if (cwi > 5) {
+                newmode = get_bits1(gb);
+                eob = 1;
+                cwi -= 6;
+            }
+            cw += cwi;
+        } while(cwi == 5);
+
+        codes[0] = cw;
+        codes[1] = !eob;
+
+        for (k = 0; k < 2; k++) {
+            if (codes[k]) {
+                if (*mode == !k) {
+                    *(*runs)++ = saved_run;
+                    if (*runs >= runend) {
+                        av_log(avctx, AV_LOG_ERROR, "uncompressed run overrun\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    if (*pix_left <= saved_run) {
+                        av_log(avctx, AV_LOG_ERROR, "uncompressed run went out of bounds\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    *pix_left -= saved_run;
+                    saved_run = 0;
+                    *mode = !*mode;
+                }
+                saved_run += codes[k];
+            }
+        }
+    } while (!eob);
+    *(*runs)++ = saved_run;
+    if (*runs >= runend) {
+        av_log(avctx, AV_LOG_ERROR, "uncompressed run overrun\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (*pix_left <= saved_run) {
+        if (*pix_left == saved_run)
+            return 1;
+        av_log(avctx, AV_LOG_ERROR, "uncompressed run went out of boundsE\n");
+        return AVERROR_INVALIDDATA;
+    }
+    *pix_left -= saved_run;
+    saved_run = 0;
+    *mode = !*mode;
+    if (newmode != *mode) { //FIXME CHECK
+        *(*runs)++ = 0;
+        *mode = newmode;
+    }
+    return 0;
+}
 
 static int decode_group3_1d_line(AVCodecContext *avctx, GetBitContext *gb,
                                  unsigned int pix_left, int *runs,
@@ -149,8 +220,18 @@ static int decode_group3_1d_line(AVCodecContext *avctx, GetBitContext *gb,
             run       = 0;
             mode      = !mode;
         } else if ((int)t == -1) {
-            av_log(avctx, AV_LOG_ERROR, "Incorrect code\n");
-            return AVERROR_INVALIDDATA;
+            if (show_bits(gb, 12) == 15) {
+                int ret;
+                skip_bits(gb, 12);
+                ret = decode_uncompressed(avctx, gb, &pix_left, &runs, runend, &mode);
+                if (ret < 0) {
+                    return ret;
+                } else if (ret)
+                    break;
+            } else {
+                av_log(avctx, AV_LOG_ERROR, "Incorrect code\n");
+                return AVERROR_INVALIDDATA;
+            }
         }
     }
     *runs++ = 0;
@@ -165,8 +246,6 @@ static int decode_group3_2d_line(AVCodecContext *avctx, GetBitContext *gb,
     int run_off       = *ref++;
     unsigned int offs = 0, run = 0;
 
-    runend--; // for the last written 0
-
     while (offs < width) {
         int cmode = get_vlc2(gb, ccitt_group3_2d_vlc.table, 9, 1);
         if (cmode == -1) {
@@ -174,10 +253,12 @@ static int decode_group3_2d_line(AVCodecContext *avctx, GetBitContext *gb,
             return AVERROR_INVALIDDATA;
         }
         if (!cmode) { //pass mode
-            run_off += *ref++;
+            if (run_off < width)
+                run_off += *ref++;
             run      = run_off - offs;
             offs     = run_off;
-            run_off += *ref++;
+            if (run_off < width)
+                run_off += *ref++;
             if (offs > width) {
                 av_log(avctx, AV_LOG_ERROR, "Run went out of bounds\n");
                 return AVERROR_INVALIDDATA;
@@ -211,8 +292,25 @@ static int decode_group3_2d_line(AVCodecContext *avctx, GetBitContext *gb,
                 mode = !mode;
             }
         } else if (cmode == 9 || cmode == 10) {
-            avpriv_report_missing_feature(avctx, "Special modes support");
-            return AVERROR_PATCHWELCOME;
+            int xxx = get_bits(gb, 3);
+            if (cmode == 9 && xxx == 7) {
+                int ret;
+                int pix_left = width - offs;
+
+                if (saved_run) {
+                    av_log(avctx, AV_LOG_ERROR, "saved run %d on entering uncompressed mode\n", saved_run);
+                    return AVERROR_INVALIDDATA;
+                }
+                ret = decode_uncompressed(avctx, gb, &pix_left, &runs, runend, &mode);
+                offs = width - pix_left;
+                if (ret < 0) {
+                    return ret;
+                } else if (ret)
+                    break;
+            } else {
+                avpriv_report_missing_feature(avctx, "Special mode %d xxx=%d support", cmode, xxx);
+                return AVERROR_PATCHWELCOME;
+            }
         } else { //vertical mode
             run      = run_off - offs + (cmode - 5);
             run_off -= *--ref;
@@ -230,13 +328,19 @@ static int decode_group3_2d_line(AVCodecContext *avctx, GetBitContext *gb,
             mode      = !mode;
         }
         //sync line pointers
-        while (run_off <= offs) {
+        while (offs < width && run_off <= offs) {
             run_off += *ref++;
             run_off += *ref++;
         }
     }
     *runs++ = saved_run;
-    *runs++ = 0;
+    if (saved_run) {
+        if (runs >= runend) {
+            av_log(avctx, AV_LOG_ERROR, "Run overrun\n");
+            return -1;
+        }
+        *runs++ = 0;
+    }
     return 0;
 }
 
@@ -245,7 +349,7 @@ static void put_line(uint8_t *dst, int size, int width, const int *runs)
     PutBitContext pb;
     int run, mode = ~0, pix_left = width, run_idx = 0;
 
-    init_put_bits(&pb, dst, size * 8);
+    init_put_bits(&pb, dst, size);
     while (pix_left > 0) {
         run       = runs[run_idx++];
         mode      = ~mode;
@@ -279,9 +383,10 @@ int ff_ccitt_unpack(AVCodecContext *avctx, const uint8_t *src, int srcsize,
     int *runs, *ref = NULL, *runend;
     int ret;
     int runsize = avctx->width + 2;
+    int has_eol;
 
-    runs = av_malloc(runsize * sizeof(runs[0]));
-    ref  = av_malloc(runsize * sizeof(ref[0]));
+    runs = av_malloc_array(runsize, sizeof(runs[0]));
+    ref  = av_malloc_array(runsize, sizeof(ref[0]));
     if (!runs || !ref) {
         ret = AVERROR(ENOMEM);
         goto fail;
@@ -289,7 +394,10 @@ int ff_ccitt_unpack(AVCodecContext *avctx, const uint8_t *src, int srcsize,
     ref[0] = avctx->width;
     ref[1] = 0;
     ref[2] = 0;
-    init_get_bits(&gb, src, srcsize * 8);
+    if ((ret = init_get_bits8(&gb, src, srcsize)) < 0)
+        goto fail;
+    has_eol = show_bits(&gb, 12) == 1 || show_bits(&gb, 16) == 1;
+
     for (j = 0; j < height; j++) {
         runend = runs + runsize;
         if (compr == TIFF_G4) {
@@ -300,6 +408,7 @@ int ff_ccitt_unpack(AVCodecContext *avctx, const uint8_t *src, int srcsize,
         } else {
             int g3d1 = (compr == TIFF_G3) && !(opts & 1);
             if (compr != TIFF_CCITT_RLE &&
+                has_eol &&
                 find_group3_syncmarker(&gb, srcsize * 8) < 0)
                 break;
             if (compr == TIFF_CCITT_RLE || g3d1 || get_bits1(&gb))
diff --git a/libavcodec/faxcompr.h b/libavcodec/faxcompr.h
index 8157f1fc21..53d11681b2 100644
--- a/libavcodec/faxcompr.h
+++ b/libavcodec/faxcompr.h
@@ -2,20 +2,20 @@
  * CCITT Fax Group 3 and 4 decompression
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/fdctdsp.c b/libavcodec/fdctdsp.c
index f299eae272..b9c2c86322 100644
--- a/libavcodec/fdctdsp.c
+++ b/libavcodec/fdctdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@ av_cold void ff_fdctdsp_init(FDCTDSPContext *c, AVCodecContext *avctx)
 {
     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
 
-    if (avctx->bits_per_raw_sample == 10) {
+    if (avctx->bits_per_raw_sample == 10 || avctx->bits_per_raw_sample == 9) {
         c->fdct    = ff_jpeg_fdct_islow_10;
         c->fdct248 = ff_fdct248_islow_10;
     } else if (avctx->dct_algo == FF_DCT_FASTINT) {
diff --git a/libavcodec/fdctdsp.h b/libavcodec/fdctdsp.h
index 944dc6d1bf..3e1f683b9e 100644
--- a/libavcodec/fdctdsp.h
+++ b/libavcodec/fdctdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/fft-fixed-test.c b/libavcodec/fft-fixed-test.c
index d6ea987fa3..330211ebf6 100644
--- a/libavcodec/fft-fixed-test.c
+++ b/libavcodec/fft-fixed-test.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/fft-fixed32-test.c b/libavcodec/fft-fixed32-test.c
new file mode 100644
index 0000000000..4bd11cea61
--- /dev/null
+++ b/libavcodec/fft-fixed32-test.c
@@ -0,0 +1,21 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0
+#define FFT_FIXED_32 1
+#include "fft-test.c"
diff --git a/libavcodec/fft-internal.h b/libavcodec/fft-internal.h
index a449ec058a..0a8f7d05cf 100644
--- a/libavcodec/fft-internal.h
+++ b/libavcodec/fft-internal.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,12 +36,29 @@
 
 #else
 
+#define SCALE_FLOAT(a, bits) lrint((a) * (double)(1 << (bits)))
+
+#if FFT_FIXED_32
+
+#define CMUL(dre, dim, are, aim, bre, bim) do {             \
+        int64_t accu;                                     \
+        (accu)  = (int64_t)(bre) * (are);                 \
+        (accu) -= (int64_t)(bim) * (aim);                 \
+        (dre)   = (int)(((accu) + 0x40000000) >> 31);       \
+        (accu)  = (int64_t)(bre) * (aim);                 \
+        (accu) += (int64_t)(bim) * (are);                 \
+        (dim)   = (int)(((accu) + 0x40000000) >> 31);       \
+    } while (0)
+
+#define FIX15(a) av_clip(SCALE_FLOAT(a, 31), -2147483647, 2147483647)
+
+#else /* FFT_FIXED_32 */
+
 #include "fft.h"
 #include "mathops.h"
 
 void ff_mdct_calcw_c(FFTContext *s, FFTDouble *output, const FFTSample *input);
 
-#define SCALE_FLOAT(a, bits) lrint((a) * (double)(1 << (bits)))
 #define FIX15(a) av_clip(SCALE_FLOAT(a, 15), -32767, 32767)
 
 #define sqrthalf ((int16_t)((1<<15)*M_SQRT1_2))
@@ -62,6 +79,8 @@ void ff_mdct_calcw_c(FFTContext *s, FFTDouble *output, const FFTSample *input);
 #define CMULL(dre, dim, are, aim, bre, bim)     \
     CMULS(dre, dim, are, aim, bre, bim, 0)
 
+#endif /* FFT_FIXED_32 */
+
 #endif /* FFT_FLOAT */
 
 #define ff_imdct_calc_c FFT_NAME(ff_imdct_calc_c)
diff --git a/libavcodec/fft-test.c b/libavcodec/fft-test.c
index 83b5546f08..d647fde119 100644
--- a/libavcodec/fft-test.c
+++ b/libavcodec/fft-test.c
@@ -1,20 +1,20 @@
 /*
  * (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,6 +59,10 @@
 #define RANGE 1.0
 #define REF_SCALE(x, bits)  (x)
 #define FMT "%10.6f"
+#elif FFT_FIXED_32
+#define RANGE 8388608
+#define REF_SCALE(x, bits) (x)
+#define FMT "%6d"
 #else
 #define RANGE 16384
 #define REF_SCALE(x, bits) ((x) / (1 << (bits)))
@@ -73,7 +77,7 @@ static int fft_ref_init(int nbits, int inverse)
 {
     int i, n = 1 << nbits;
 
-    exptab = av_malloc((n / 2) * sizeof(*exptab));
+    exptab = av_malloc_array((n / 2), sizeof(*exptab));
     if (!exptab)
         return AVERROR(ENOMEM);
 
@@ -150,7 +154,7 @@ static void mdct_ref(FFTSample *output, FFTSample *input, int nbits)
 
 #if FFT_FLOAT
 #if CONFIG_DCT
-static void idct_ref(float *output, float *input, int nbits)
+static void idct_ref(FFTSample *output, FFTSample *input, int nbits)
 {
     int i, k, n = 1 << nbits;
 
@@ -165,7 +169,7 @@ static void idct_ref(float *output, float *input, int nbits)
     }
 }
 
-static void dct_ref(float *output, float *input, int nbits)
+static void dct_ref(FFTSample *output, FFTSample *input, int nbits)
 {
     int i, k, n = 1 << nbits;
 
@@ -203,7 +207,7 @@ static int check_diff(FFTSample *tab1, FFTSample *tab2, int n, double scale)
         if (e > max)
             max = e;
     }
-    av_log(NULL, AV_LOG_INFO, "max:%f e:%g\n", max, sqrt(error) / n);
+    av_log(NULL, AV_LOG_INFO, "max:%f e:%g\n", max, sqrt(error / n));
     return err;
 }
 
@@ -281,20 +285,22 @@ int main(int argc, char **argv)
             break;
         case 'c':
         {
-            int cpuflags = av_parse_cpu_flags(optarg);
-            if (cpuflags < 0)
+            unsigned cpuflags = av_get_cpu_flags();
+
+            if (av_parse_cpu_caps(&cpuflags, optarg) < 0)
                 return 1;
-            av_set_cpu_flags_mask(cpuflags);
+
+            av_force_cpu_flags(cpuflags);
             break;
         }
         }
     }
 
     fft_size = 1 << fft_nbits;
-    tab      = av_malloc(fft_size * sizeof(FFTComplex));
-    tab1     = av_malloc(fft_size * sizeof(FFTComplex));
-    tab_ref  = av_malloc(fft_size * sizeof(FFTComplex));
-    tab2     = av_malloc(fft_size * sizeof(FFTSample));
+    tab      = av_malloc_array(fft_size, sizeof(FFTComplex));
+    tab1     = av_malloc_array(fft_size, sizeof(FFTComplex));
+    tab_ref  = av_malloc_array(fft_size, sizeof(FFTComplex));
+    tab2     = av_malloc_array(fft_size, sizeof(FFTSample));
 
     if (!(tab && tab1 && tab_ref && tab2))
         goto cleanup;
@@ -316,22 +322,22 @@ int main(int argc, char **argv)
         else
             av_log(NULL, AV_LOG_INFO, "FFT");
         ff_fft_init(&s, fft_nbits, do_inverse);
-        if (err = fft_ref_init(fft_nbits, do_inverse) < 0)
+        if ((err = fft_ref_init(fft_nbits, do_inverse)) < 0)
             goto cleanup;
         break;
 #if FFT_FLOAT
-#if CONFIG_RDFT
+#    if CONFIG_RDFT
     case TRANSFORM_RDFT:
         if (do_inverse)
             av_log(NULL, AV_LOG_INFO, "IDFT_C2R");
         else
             av_log(NULL, AV_LOG_INFO, "DFT_R2C");
         ff_rdft_init(&r, fft_nbits, do_inverse ? IDFT_C2R : DFT_R2C);
-        if (err = fft_ref_init(fft_nbits, do_inverse) < 0)
+        if ((err = fft_ref_init(fft_nbits, do_inverse)) < 0)
             goto cleanup;
         break;
-#endif /* CONFIG_RDFT */
-#if CONFIG_DCT
+#    endif /* CONFIG_RDFT */
+#    if CONFIG_DCT
     case TRANSFORM_DCT:
         if (do_inverse)
             av_log(NULL, AV_LOG_INFO, "DCT_III");
@@ -339,7 +345,7 @@ int main(int argc, char **argv)
             av_log(NULL, AV_LOG_INFO, "DCT_II");
         ff_dct_init(&d, fft_nbits, do_inverse ? DCT_III : DCT_II);
         break;
-#endif /* CONFIG_DCT */
+#    endif /* CONFIG_DCT */
 #endif /* FFT_FLOAT */
     default:
         av_log(NULL, AV_LOG_ERROR, "Requested transform not supported\n");
@@ -486,16 +492,16 @@ int main(int argc, char **argv)
         ff_fft_end(&s);
         break;
 #if FFT_FLOAT
-#if CONFIG_RDFT
+#    if CONFIG_RDFT
     case TRANSFORM_RDFT:
         ff_rdft_end(&r);
         break;
-#endif /* CONFIG_RDFT */
-#if CONFIG_DCT
+#    endif /* CONFIG_RDFT */
+#    if CONFIG_DCT
     case TRANSFORM_DCT:
         ff_dct_end(&d);
         break;
-#endif /* CONFIG_DCT */
+#    endif /* CONFIG_DCT */
 #endif /* FFT_FLOAT */
     }
 
diff --git a/libavcodec/fft.h b/libavcodec/fft.h
index 7daae24ca3..64f0f6331e 100644
--- a/libavcodec/fft.h
+++ b/libavcodec/fft.h
@@ -2,20 +2,20 @@
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,10 @@
 #define FFT_FLOAT 1
 #endif
 
+#ifndef FFT_FIXED_32
+#define FFT_FIXED_32 0
+#endif
+
 #include <stdint.h>
 #include "config.h"
 #include "libavutil/mem.h"
@@ -40,15 +44,26 @@ typedef float FFTDouble;
 
 #else
 
+#if FFT_FIXED_32
+
+#define Q31(x) (int)((x)*2147483648.0 + 0.5)
+#define FFT_NAME(x) x ## _fixed_32
+
+typedef int32_t FFTSample;
+
+#else /* FFT_FIXED_32 */
+
 #define FFT_NAME(x) x ## _fixed
 
 typedef int16_t FFTSample;
-typedef int     FFTDouble;
+
+#endif /* FFT_FIXED_32 */
 
 typedef struct FFTComplex {
-    int16_t re, im;
+    FFTSample re, im;
 } FFTComplex;
 
+typedef int    FFTDouble;
 typedef struct FFTContext FFTContext;
 
 #endif /* FFT_FLOAT */
@@ -142,6 +157,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse);
 void ff_fft_init_aarch64(FFTContext *s);
 void ff_fft_init_x86(FFTContext *s);
 void ff_fft_init_arm(FFTContext *s);
+void ff_fft_init_mips(FFTContext *s);
 void ff_fft_init_ppc(FFTContext *s);
 
 void ff_fft_fixed_init_arm(FFTContext *s);
diff --git a/libavcodec/fft_fixed.c b/libavcodec/fft_fixed.c
index bad482148b..3d3bd2fca6 100644
--- a/libavcodec/fft_fixed.c
+++ b/libavcodec/fft_fixed.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 0
+#define FFT_FIXED_32 0
 #include "fft_template.c"
diff --git a/libavcodec/fft_fixed_32.c b/libavcodec/fft_fixed_32.c
new file mode 100644
index 0000000000..fbdbf847e2
--- /dev/null
+++ b/libavcodec/fft_fixed_32.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj@mips.com)
+ *           Goran Cordasic   (goran@mips.com)
+ *           Djordje Pesut    (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0
+#define FFT_FIXED_32 1
+#include "fft_template.c"
diff --git a/libavcodec/fft_float.c b/libavcodec/fft_float.c
index ed4cffa11c..73cc98d0d4 100644
--- a/libavcodec/fft_float.c
+++ b/libavcodec/fft_float.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 1
+#define FFT_FIXED_32 0
 #include "fft_template.c"
diff --git a/libavcodec/fft_init_table.c b/libavcodec/fft_init_table.c
new file mode 100644
index 0000000000..4d74f8a2a2
--- /dev/null
+++ b/libavcodec/fft_init_table.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj@mips.com)
+ *           Goran Cordasic   (goran@mips.com)
+ *           Djordje Pesut    (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * definitions and initialization of LUT table for FFT
+ */
+#include "libavcodec/fft_table.h"
+
+const int32_t ff_w_tab_sr[MAX_FFT_SIZE/(4*16)] = {
+    2147483647, 2147481121, 2147473542, 2147460908, 2147443222, 2147420483, 2147392690, 2147359845,
+    2147321946, 2147278995, 2147230991, 2147177934, 2147119825, 2147056664, 2146988450, 2146915184,
+    2146836866, 2146753497, 2146665076, 2146571603, 2146473080, 2146369505, 2146260881, 2146147205,
+    2146028480, 2145904705, 2145775880, 2145642006, 2145503083, 2145359112, 2145210092, 2145056025,
+    2144896910, 2144732748, 2144563539, 2144389283, 2144209982, 2144025635, 2143836244, 2143641807,
+    2143442326, 2143237802, 2143028234, 2142813624, 2142593971, 2142369276, 2142139541, 2141904764,
+    2141664948, 2141420092, 2141170197, 2140915264, 2140655293, 2140390284, 2140120240, 2139845159,
+    2139565043, 2139279892, 2138989708, 2138694490, 2138394240, 2138088958, 2137778644, 2137463301,
+    2137142927, 2136817525, 2136487095, 2136151637, 2135811153, 2135465642, 2135115107, 2134759548,
+    2134398966, 2134033361, 2133662734, 2133287087, 2132906420, 2132520734, 2132130030, 2131734309,
+    2131333572, 2130927819, 2130517052, 2130101272, 2129680480, 2129254676, 2128823862, 2128388038,
+    2127947206, 2127501367, 2127050522, 2126594672, 2126133817, 2125667960, 2125197100, 2124721240,
+    2124240380, 2123754522, 2123263666, 2122767814, 2122266967, 2121761126, 2121250292, 2120734467,
+    2120213651, 2119687847, 2119157054, 2118621275, 2118080511, 2117534762, 2116984031, 2116428319,
+    2115867626, 2115301954, 2114731305, 2114155680, 2113575080, 2112989506, 2112398960, 2111803444,
+    2111202959, 2110597505, 2109987085, 2109371700, 2108751352, 2108126041, 2107495770, 2106860540,
+    2106220352, 2105575208, 2104925109, 2104270057, 2103610054, 2102945101, 2102275199, 2101600350,
+    2100920556, 2100235819, 2099546139, 2098851519, 2098151960, 2097447464, 2096738032, 2096023667,
+    2095304370, 2094580142, 2093850985, 2093116901, 2092377892, 2091633960, 2090885105, 2090131331,
+    2089372638, 2088609029, 2087840505, 2087067068, 2086288720, 2085505463, 2084717298, 2083924228,
+    2083126254, 2082323379, 2081515603, 2080702930, 2079885360, 2079062896, 2078235540, 2077403294,
+    2076566160, 2075724139, 2074877233, 2074025446, 2073168777, 2072307231, 2071440808, 2070569511,
+    2069693342, 2068812302, 2067926394, 2067035621, 2066139983, 2065239484, 2064334124, 2063423908,
+    2062508835, 2061588910, 2060664133, 2059734508, 2058800036, 2057860719, 2056916560, 2055967560,
+    2055013723, 2054055050, 2053091544, 2052123207, 2051150040, 2050172048, 2049189231, 2048201592,
+    2047209133, 2046211857, 2045209767, 2044202863, 2043191150, 2042174628, 2041153301, 2040127172,
+    2039096241, 2038060512, 2037019988, 2035974670, 2034924562, 2033869665, 2032809982, 2031745516,
+    2030676269, 2029602243, 2028523442, 2027439867, 2026351522, 2025258408, 2024160529, 2023057887,
+    2021950484, 2020838323, 2019721407, 2018599739, 2017473321, 2016342155, 2015206245, 2014065592,
+    2012920201, 2011770073, 2010615210, 2009455617, 2008291295, 2007122248, 2005948478, 2004769987,
+    2003586779, 2002398857, 2001206222, 2000008879, 1998806829, 1997600076, 1996388622, 1995172471,
+    1993951625, 1992726087, 1991495860, 1990260946, 1989021350, 1987777073, 1986528118, 1985274489,
+    1984016189, 1982753220, 1981485585, 1980213288, 1978936331, 1977654717, 1976368450, 1975077532,
+    1973781967, 1972481757, 1971176906, 1969867417, 1968553292, 1967234535, 1965911148, 1964583136,
+    1963250501, 1961913246, 1960571375, 1959224890, 1957873796, 1956518093, 1955157788, 1953792881,
+    1952423377, 1951049279, 1949670589, 1948287312, 1946899451, 1945507008, 1944109987, 1942708392,
+    1941302225, 1939891490, 1938476190, 1937056329, 1935631910, 1934202936, 1932769411, 1931331338,
+    1929888720, 1928441561, 1926989864, 1925533633, 1924072871, 1922607581, 1921137767, 1919663432,
+    1918184581, 1916701216, 1915213340, 1913720958, 1912224073, 1910722688, 1909216806, 1907706433,
+    1906191570, 1904672222, 1903148392, 1901620084, 1900087301, 1898550047, 1897008325, 1895462140,
+    1893911494, 1892356392, 1890796837, 1889232832, 1887664383, 1886091491, 1884514161, 1882932397,
+    1881346202, 1879755580, 1878160535, 1876561070, 1874957189, 1873348897, 1871736196, 1870119091,
+    1868497586, 1866871683, 1865241388, 1863606704, 1861967634, 1860324183, 1858676355, 1857024153,
+    1855367581, 1853706643, 1852041343, 1850371686, 1848697674, 1847019312, 1845336604, 1843649553,
+    1841958164, 1840262441, 1838562388, 1836858008, 1835149306, 1833436286, 1831718951, 1829997307,
+    1828271356, 1826541103, 1824806552, 1823067707, 1821324572, 1819577151, 1817825449, 1816069469,
+    1814309216, 1812544694, 1810775906, 1809002858, 1807225553, 1805443995, 1803658189, 1801868139,
+    1800073849, 1798275323, 1796472565, 1794665580, 1792854372, 1791038946, 1789219305, 1787395453,
+    1785567396, 1783735137, 1781898681, 1780058032, 1778213194, 1776364172, 1774510970, 1772653593,
+    1770792044, 1768926328, 1767056450, 1765182414, 1763304224, 1761421885, 1759535401, 1757644777,
+    1755750017, 1753851126, 1751948107, 1750040966, 1748129707, 1746214334, 1744294853, 1742371267,
+    1740443581, 1738511799, 1736575927, 1734635968, 1732691928, 1730743810, 1728791620, 1726835361,
+    1724875040, 1722910659, 1720942225, 1718969740, 1716993211, 1715012642, 1713028037, 1711039401,
+    1709046739, 1707050055, 1705049355, 1703044642, 1701035922, 1699023199, 1697006479, 1694985765,
+    1692961062, 1690932376, 1688899711, 1686863072, 1684822463, 1682777890, 1680729357, 1678676870,
+    1676620432, 1674560049, 1672495725, 1670427466, 1668355276, 1666279161, 1664199124, 1662115172,
+    1660027308, 1657935539, 1655839867, 1653740300, 1651636841, 1649529496, 1647418269, 1645303166,
+    1643184191, 1641061349, 1638934646, 1636804087, 1634669676, 1632531418, 1630389319, 1628243383,
+    1626093616, 1623940023, 1621782608, 1619621377, 1617456335, 1615287487, 1613114838, 1610938393,
+    1608758157, 1606574136, 1604386335, 1602194758, 1599999411, 1597800299, 1595597428, 1593390801,
+    1591180426, 1588966306, 1586748447, 1584526854, 1582301533, 1580072489, 1577839726, 1575603251,
+    1573363068, 1571119183, 1568871601, 1566620327, 1564365367, 1562106725, 1559844408, 1557578421,
+    1555308768, 1553035455, 1550758488, 1548477872, 1546193612, 1543905714, 1541614183, 1539319024,
+    1537020244, 1534717846, 1532411837, 1530102222, 1527789007, 1525472197, 1523151797, 1520827813,
+    1518500250, 1516169114, 1513834411, 1511496145, 1509154322, 1506808949, 1504460029, 1502107570,
+    1499751576, 1497392053, 1495029006, 1492662441, 1490292364, 1487918781, 1485541696, 1483161115,
+    1480777044, 1478389489, 1475998456, 1473603949, 1471205974, 1468804538, 1466399645, 1463991302,
+    1461579514, 1459164286, 1456745625, 1454323536, 1451898025, 1449469098, 1447036760, 1444601017,
+    1442161874, 1439719338, 1437273414, 1434824109, 1432371426, 1429915374, 1427455956, 1424993180,
+    1422527051, 1420057574, 1417584755, 1415108601, 1412629117, 1410146309, 1407660183, 1405170745,
+    1402678000, 1400181954, 1397682613, 1395179984, 1392674072, 1390164882, 1387652422, 1385136696,
+    1382617710, 1380095472, 1377569986, 1375041258, 1372509294, 1369974101, 1367435685, 1364894050,
+    1362349204, 1359801152, 1357249901, 1354695455, 1352137822, 1349577007, 1347013017, 1344445857,
+    1341875533, 1339302052, 1336725419, 1334145641, 1331562723, 1328976672, 1326387494, 1323795195,
+    1321199781, 1318601257, 1315999631, 1313394909, 1310787095, 1308176198, 1305562222, 1302945174,
+    1300325060, 1297701886, 1295075659, 1292446384, 1289814068, 1287178717, 1284540337, 1281898935,
+    1279254516, 1276607086, 1273956653, 1271303222, 1268646800, 1265987392, 1263325005, 1260659646,
+    1257991320, 1255320034, 1252645794, 1249968606, 1247288478, 1244605414, 1241919421, 1239230506,
+    1236538675, 1233843935, 1231146291, 1228445750, 1225742318, 1223036002, 1220326809, 1217614743,
+    1214899813, 1212182024, 1209461382, 1206737894, 1204011567, 1201282407, 1198550419, 1195815612,
+    1193077991, 1190337562, 1187594332, 1184848308, 1182099496, 1179347902, 1176593533, 1173836395,
+    1171076495, 1168313840, 1165548435, 1162780288, 1160009405, 1157235792, 1154459456, 1151680403,
+    1148898640, 1146114174, 1143327011, 1140537158, 1137744621, 1134949406, 1132151521, 1129350972,
+    1126547765, 1123741908, 1120933406, 1118122267, 1115308496, 1112492101, 1109673089, 1106851465,
+    1104027237, 1101200410, 1098370993, 1095538991, 1092704411, 1089867259, 1087027544, 1084185270,
+    1081340445, 1078493076, 1075643169, 1072790730, 1069935768, 1067078288, 1064218296, 1061355801,
+    1058490808, 1055623324, 1052753357, 1049880912, 1047005996, 1044128617, 1041248781, 1038366495,
+    1035481766, 1032594600, 1029705004, 1026812985, 1023918550, 1021021705, 1018122458, 1015220816,
+    1012316784, 1009410370, 1006501581, 1003590424, 1000676905,  997761031,  994842810,  991922248,
+     988999351,  986074127,  983146583,  980216726,  977284562,  974350098,  971413342,  968474300,
+     965532978,  962589385,  959643527,  956695411,  953745043,  950792431,  947837582,  944880503,
+     941921200,  938959681,  935995952,  933030021,  930061894,  927091579,  924119082,  921144411,
+     918167572,  915188572,  912207419,  909224120,  906238681,  903251110,  900261413,  897269597,
+     894275671,  891279640,  888281512,  885281293,  882278992,  879274614,  876268167,  873259659,
+     870249095,  867236484,  864221832,  861205147,  858186435,  855165703,  852142959,  849118210,
+     846091463,  843062726,  840032004,  836999305,  833964638,  830928007,  827889422,  824848888,
+     821806413,  818762005,  815715670,  812667415,  809617249,  806565177,  803511207,  800455346,
+     797397602,  794337982,  791276492,  788213141,  785147934,  782080880,  779011986,  775941259,
+     772868706,  769794334,  766718151,  763640164,  760560380,  757478806,  754395449,  751310318,
+     748223418,  745134758,  742044345,  738952186,  735858287,  732762657,  729665303,  726566232,
+     723465451,  720362968,  717258790,  714152924,  711045377,  707936158,  704825272,  701712728,
+     698598533,  695482694,  692365218,  689246113,  686125387,  683003045,  679879097,  676753549,
+     673626408,  670497682,  667367379,  664235505,  661102068,  657967075,  654830535,  651692453,
+     648552838,  645411696,  642269036,  639124865,  635979190,  632832018,  629683357,  626533215,
+     623381598,  620228514,  617073971,  613917975,  610760536,  607601658,  604441352,  601279623,
+     598116479,  594951927,  591785976,  588618632,  585449903,  582279796,  579108320,  575935480,
+     572761285,  569585743,  566408860,  563230645,  560051104,  556870245,  553688076,  550504604,
+     547319836,  544133781,  540946445,  537757837,  534567963,  531376831,  528184449,  524990824,
+     521795963,  518599875,  515402566,  512204045,  509004318,  505803394,  502601279,  499397982,
+     496193509,  492987869,  489781069,  486573117,  483364019,  480153784,  476942419,  473729932,
+     470516330,  467301622,  464085813,  460868912,  457650927,  454431865,  451211734,  447990541,
+     444768294,  441545000,  438320667,  435095303,  431868915,  428641511,  425413098,  422183684,
+     418953276,  415721883,  412489512,  409256170,  406021865,  402786604,  399550396,  396313247,
+     393075166,  389836160,  386596237,  383355404,  380113669,  376871039,  373627523,  370383128,
+     367137861,  363891730,  360644742,  357396906,  354148230,  350898719,  347648383,  344397230,
+     341145265,  337892498,  334638936,  331384586,  328129457,  324873555,  321616889,  318359466,
+     315101295,  311842381,  308582734,  305322361,  302061269,  298799466,  295536961,  292273760,
+     289009871,  285745302,  282480061,  279214155,  275947592,  272680379,  269412525,  266144038,
+     262874923,  259605191,  256334847,  253063900,  249792358,  246520228,  243247518,  239974235,
+     236700388,  233425984,  230151030,  226875535,  223599506,  220322951,  217045878,  213768293,
+     210490206,  207211624,  203932553,  200653003,  197372981,  194092495,  190811551,  187530159,
+     184248325,  180966058,  177683365,  174400254,  171116733,  167832808,  164548489,  161263783,
+     157978697,  154693240,  151407418,  148121241,  144834714,  141547847,  138260647,  134973122,
+     131685278,  128397125,  125108670,  121819921,  118530885,  115241570,  111951983,  108662134,
+     105372028,  102081675,   98791081,   95500255,   92209205,   88917937,   85626460,   82334782,
+      79042909,   75750851,   72458615,   69166208,   65873638,   62580914,   59288042,   55995030,
+      52701887,   49408620,   46115236,   42821744,   39528151,   36234466,   32940695,   29646846,
+      26352928,   23058947,   19764913,   16470832,   13176712,    9882561,    6588387,    3294197
+};
+
+uint16_t ff_fft_offsets_lut[0x2aab];
+
+void ff_fft_lut_init(uint16_t *table, int off, int size, int *index)
+{
+    if (size < 16) {
+        table[*index] = off >> 2;
+        (*index)++;
+    }
+    else {
+        ff_fft_lut_init(table, off, size>>1, index);
+        ff_fft_lut_init(table, off+(size>>1), size>>2, index);
+        ff_fft_lut_init(table, off+3*(size>>2), size>>2, index);
+    }
+}
diff --git a/libavcodec/fft_table.h b/libavcodec/fft_table.h
new file mode 100644
index 0000000000..e5e54b6efa
--- /dev/null
+++ b/libavcodec/fft_table.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj@mips.com)
+ *           Goran Cordasic   (goran@mips.com)
+ *           Djordje Pesut    (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * definitions and tables for FFT
+ */
+#ifndef AVCODEC_FFT_TABLE_H
+#define AVCODEC_FFT_TABLE_H
+
+#include "libavcodec/fft.h"
+
+#define MAX_LOG2_NFFT 16 //!< Specifies maximum allowed fft size
+#define MAX_FFT_SIZE (1 << MAX_LOG2_NFFT)
+
+extern const int32_t ff_w_tab_sr[];
+extern uint16_t ff_fft_offsets_lut[];
+void ff_fft_lut_init(uint16_t *table, int off, int size, int *index);
+
+#endif /* AVCODEC_FFT_TABLE_H */
diff --git a/libavcodec/fft_template.c b/libavcodec/fft_template.c
index 808f317c17..23ea453938 100644
--- a/libavcodec/fft_template.c
+++ b/libavcodec/fft_template.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2002 Fabrice Bellard
  * Partly based on libdjbfft by D. J. Bernstein
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,10 @@
 #include "fft.h"
 #include "fft-internal.h"
 
+#if FFT_FIXED_32
+#include "fft_table.h"
+#else /* FFT_FIXED_32 */
+
 /* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
 #if !CONFIG_HARDCODED_TABLES
 COSTABLE(16);
@@ -65,6 +69,8 @@ COSTABLE_CONST FFTSample * const FFT_NAME(ff_cos_tabs)[] = {
     FFT_NAME(ff_cos_65536),
 };
 
+#endif /* FFT_FIXED_32 */
+
 static void fft_permute_c(FFTContext *s, FFTComplex *z);
 static void fft_calc_c(FFTContext *s, FFTComplex *z);
 
@@ -81,7 +87,7 @@ static int split_radix_permutation(int i, int n, int inverse)
 
 av_cold void ff_init_ff_cos_tabs(int index)
 {
-#if !CONFIG_HARDCODED_TABLES
+#if (!CONFIG_HARDCODED_TABLES) && (!FFT_FIXED_32)
     int i;
     int m = 1<<index;
     double freq = 2*M_PI/m;
@@ -157,26 +163,34 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
     s->mdct_calc   = ff_mdct_calc_c;
 #endif
 
+#if FFT_FIXED_32
+    {
+        int n=0;
+        ff_fft_lut_init(ff_fft_offsets_lut, 0, 1 << 16, &n);
+    }
+#else /* FFT_FIXED_32 */
 #if FFT_FLOAT
     if (ARCH_AARCH64) ff_fft_init_aarch64(s);
     if (ARCH_ARM)     ff_fft_init_arm(s);
     if (ARCH_PPC)     ff_fft_init_ppc(s);
     if (ARCH_X86)     ff_fft_init_x86(s);
     if (CONFIG_MDCT)  s->mdct_calcw = s->mdct_calc;
+    if (HAVE_MIPSFPU) ff_fft_init_mips(s);
 #else
     if (CONFIG_MDCT)  s->mdct_calcw = ff_mdct_calcw_c;
     if (ARCH_ARM)     ff_fft_fixed_init_arm(s);
 #endif
-
     for(j=4; j<=nbits; j++) {
         ff_init_ff_cos_tabs(j);
     }
+#endif /* FFT_FIXED_32 */
+
 
     if (s->fft_permutation == FF_FFT_PERM_AVX) {
         fft_perm_avx(s);
     } else {
         for(i=0; i<n; i++) {
-            int j = i;
+            j = i;
             if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
                 j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
             s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
@@ -206,6 +220,166 @@ av_cold void ff_fft_end(FFTContext *s)
     av_freep(&s->tmp_buf);
 }
 
+#if FFT_FIXED_32
+
+static void fft_calc_c(FFTContext *s, FFTComplex *z) {
+
+    int nbits, i, n, num_transforms, offset, step;
+    int n4, n2, n34;
+    FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    FFTComplex *tmpz;
+    const int fft_size = (1 << s->nbits);
+    int64_t accu;
+
+    num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
+
+    for (n=0; n<num_transforms; n++){
+        offset = ff_fft_offsets_lut[n] << 2;
+        tmpz = z + offset;
+
+        tmp1 = tmpz[0].re + tmpz[1].re;
+        tmp5 = tmpz[2].re + tmpz[3].re;
+        tmp2 = tmpz[0].im + tmpz[1].im;
+        tmp6 = tmpz[2].im + tmpz[3].im;
+        tmp3 = tmpz[0].re - tmpz[1].re;
+        tmp8 = tmpz[2].im - tmpz[3].im;
+        tmp4 = tmpz[0].im - tmpz[1].im;
+        tmp7 = tmpz[2].re - tmpz[3].re;
+
+        tmpz[0].re = tmp1 + tmp5;
+        tmpz[2].re = tmp1 - tmp5;
+        tmpz[0].im = tmp2 + tmp6;
+        tmpz[2].im = tmp2 - tmp6;
+        tmpz[1].re = tmp3 + tmp8;
+        tmpz[3].re = tmp3 - tmp8;
+        tmpz[1].im = tmp4 - tmp7;
+        tmpz[3].im = tmp4 + tmp7;
+    }
+
+    if (fft_size < 8)
+        return;
+
+    num_transforms = (num_transforms >> 1) | 1;
+
+    for (n=0; n<num_transforms; n++){
+        offset = ff_fft_offsets_lut[n] << 3;
+        tmpz = z + offset;
+
+        tmp1 = tmpz[4].re + tmpz[5].re;
+        tmp3 = tmpz[6].re + tmpz[7].re;
+        tmp2 = tmpz[4].im + tmpz[5].im;
+        tmp4 = tmpz[6].im + tmpz[7].im;
+        tmp5 = tmp1 + tmp3;
+        tmp7 = tmp1 - tmp3;
+        tmp6 = tmp2 + tmp4;
+        tmp8 = tmp2 - tmp4;
+
+        tmp1 = tmpz[4].re - tmpz[5].re;
+        tmp2 = tmpz[4].im - tmpz[5].im;
+        tmp3 = tmpz[6].re - tmpz[7].re;
+        tmp4 = tmpz[6].im - tmpz[7].im;
+
+        tmpz[4].re = tmpz[0].re - tmp5;
+        tmpz[0].re = tmpz[0].re + tmp5;
+        tmpz[4].im = tmpz[0].im - tmp6;
+        tmpz[0].im = tmpz[0].im + tmp6;
+        tmpz[6].re = tmpz[2].re - tmp8;
+        tmpz[2].re = tmpz[2].re + tmp8;
+        tmpz[6].im = tmpz[2].im + tmp7;
+        tmpz[2].im = tmpz[2].im - tmp7;
+
+        accu = (int64_t)Q31(M_SQRT1_2)*(tmp1 + tmp2);
+        tmp5 = (int32_t)((accu + 0x40000000) >> 31);
+        accu = (int64_t)Q31(M_SQRT1_2)*(tmp3 - tmp4);
+        tmp7 = (int32_t)((accu + 0x40000000) >> 31);
+        accu = (int64_t)Q31(M_SQRT1_2)*(tmp2 - tmp1);
+        tmp6 = (int32_t)((accu + 0x40000000) >> 31);
+        accu = (int64_t)Q31(M_SQRT1_2)*(tmp3 + tmp4);
+        tmp8 = (int32_t)((accu + 0x40000000) >> 31);
+        tmp1 = tmp5 + tmp7;
+        tmp3 = tmp5 - tmp7;
+        tmp2 = tmp6 + tmp8;
+        tmp4 = tmp6 - tmp8;
+
+        tmpz[5].re = tmpz[1].re - tmp1;
+        tmpz[1].re = tmpz[1].re + tmp1;
+        tmpz[5].im = tmpz[1].im - tmp2;
+        tmpz[1].im = tmpz[1].im + tmp2;
+        tmpz[7].re = tmpz[3].re - tmp4;
+        tmpz[3].re = tmpz[3].re + tmp4;
+        tmpz[7].im = tmpz[3].im + tmp3;
+        tmpz[3].im = tmpz[3].im - tmp3;
+    }
+
+    step = 1 << ((MAX_LOG2_NFFT-4) - 4);
+    n4 = 4;
+
+    for (nbits=4; nbits<=s->nbits; nbits++){
+        n2  = 2*n4;
+        n34 = 3*n4;
+        num_transforms = (num_transforms >> 1) | 1;
+
+        for (n=0; n<num_transforms; n++){
+            const FFTSample *w_re_ptr = ff_w_tab_sr + step;
+            const FFTSample *w_im_ptr = ff_w_tab_sr + MAX_FFT_SIZE/(4*16) - step;
+            offset = ff_fft_offsets_lut[n] << nbits;
+            tmpz = z + offset;
+
+            tmp5 = tmpz[ n2].re + tmpz[n34].re;
+            tmp1 = tmpz[ n2].re - tmpz[n34].re;
+            tmp6 = tmpz[ n2].im + tmpz[n34].im;
+            tmp2 = tmpz[ n2].im - tmpz[n34].im;
+
+            tmpz[ n2].re = tmpz[ 0].re - tmp5;
+            tmpz[  0].re = tmpz[ 0].re + tmp5;
+            tmpz[ n2].im = tmpz[ 0].im - tmp6;
+            tmpz[  0].im = tmpz[ 0].im + tmp6;
+            tmpz[n34].re = tmpz[n4].re - tmp2;
+            tmpz[ n4].re = tmpz[n4].re + tmp2;
+            tmpz[n34].im = tmpz[n4].im + tmp1;
+            tmpz[ n4].im = tmpz[n4].im - tmp1;
+
+            for (i=1; i<n4; i++){
+                FFTSample w_re = w_re_ptr[0];
+                FFTSample w_im = w_im_ptr[0];
+                accu  = (int64_t)w_re*tmpz[ n2+i].re;
+                accu += (int64_t)w_im*tmpz[ n2+i].im;
+                tmp1 = (int32_t)((accu + 0x40000000) >> 31);
+                accu  = (int64_t)w_re*tmpz[ n2+i].im;
+                accu -= (int64_t)w_im*tmpz[ n2+i].re;
+                tmp2 = (int32_t)((accu + 0x40000000) >> 31);
+                accu  = (int64_t)w_re*tmpz[n34+i].re;
+                accu -= (int64_t)w_im*tmpz[n34+i].im;
+                tmp3 = (int32_t)((accu + 0x40000000) >> 31);
+                accu  = (int64_t)w_re*tmpz[n34+i].im;
+                accu += (int64_t)w_im*tmpz[n34+i].re;
+                tmp4 = (int32_t)((accu + 0x40000000) >> 31);
+
+                tmp5 = tmp1 + tmp3;
+                tmp1 = tmp1 - tmp3;
+                tmp6 = tmp2 + tmp4;
+                tmp2 = tmp2 - tmp4;
+
+                tmpz[ n2+i].re = tmpz[   i].re - tmp5;
+                tmpz[    i].re = tmpz[   i].re + tmp5;
+                tmpz[ n2+i].im = tmpz[   i].im - tmp6;
+                tmpz[    i].im = tmpz[   i].im + tmp6;
+                tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
+                tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
+                tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
+                tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
+
+                w_re_ptr += step;
+                w_im_ptr -= step;
+            }
+        }
+        step >>= 1;
+        n4   <<= 1;
+    }
+}
+
+#else /* FFT_FIXED_32 */
+
 #define BUTTERFLIES(a0,a1,a2,a3) {\
     BF(t3, t5, t5, t1);\
     BF(a2.re, a0.re, a0.re, t5);\
@@ -351,3 +525,4 @@ static void fft_calc_c(FFTContext *s, FFTComplex *z)
 {
     fft_dispatch[s->nbits-2](z);
 }
+#endif /* FFT_FIXED_32 */
diff --git a/libavcodec/ffv1.c b/libavcodec/ffv1.c
index 0836d42012..13d3be2b56 100644
--- a/libavcodec/ffv1.c
+++ b/libavcodec/ffv1.c
@@ -1,22 +1,22 @@
 /*
  * FFV1 codec for libavcodec
  *
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2003-2013 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,119 +27,32 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/avassert.h"
+#include "libavutil/crc.h"
+#include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/timer.h"
 #include "avcodec.h"
-#include "get_bits.h"
-#include "put_bits.h"
+#include "internal.h"
 #include "rangecoder.h"
 #include "golomb.h"
 #include "mathops.h"
 #include "ffv1.h"
 
-const int8_t ffv1_quant5_10bit[256] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,
-     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-     1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0,
-};
-
-const int8_t ffv1_quant5[256] = {
-     0,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1,
-};
-
-const int8_t ffv1_quant9_10bit[256] = {
-     0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,
-     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
-     3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3,
-    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
-    -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -0, -0, -0, -0,
-};
-
-const int8_t ffv1_quant11[256] = {
-     0,  1,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -1,
-};
-
-const uint8_t ffv1_ver2_state[256] = {
-      0,  10,  10,  10,  10,  16,  16,  16,  28,  16,  16,  29,  42,  49,  20,  49,
-     59,  25,  26,  26,  27,  31,  33,  33,  33,  34,  34,  37,  67,  38,  39,  39,
-     40,  40,  41,  79,  43,  44,  45,  45,  48,  48,  64,  50,  51,  52,  88,  52,
-     53,  74,  55,  57,  58,  58,  74,  60,  101, 61,  62,  84,  66,  66,  68,  69,
-     87,  82,  71,  97,  73,  73,  82,  75,  111, 77,  94,  78,  87,  81,  83,  97,
-     85,  83,  94,  86,  99,  89,  90,  99,  111, 92,  93,  134, 95,  98,  105, 98,
-    105, 110, 102, 108, 102, 118, 103, 106, 106, 113, 109, 112, 114, 112, 116, 125,
-    115, 116, 117, 117, 126, 119, 125, 121, 121, 123, 145, 124, 126, 131, 127, 129,
-    165, 130, 132, 138, 133, 135, 145, 136, 137, 139, 146, 141, 143, 142, 144, 148,
-    147, 155, 151, 149, 151, 150, 152, 157, 153, 154, 156, 168, 158, 162, 161, 160,
-    172, 163, 169, 164, 166, 184, 167, 170, 177, 174, 171, 173, 182, 176, 180, 178,
-    175, 189, 179, 181, 186, 183, 192, 185, 200, 187, 191, 188, 190, 197, 193, 196,
-    197, 194, 195, 196, 198, 202, 199, 201, 210, 203, 207, 204, 205, 206, 208, 214,
-    209, 211, 221, 212, 213, 215, 224, 216, 217, 218, 219, 220, 222, 228, 223, 225,
-    226, 224, 227, 229, 240, 230, 231, 232, 233, 234, 235, 236, 238, 239, 237, 242,
-    241, 243, 242, 244, 245, 246, 247, 248, 249, 250, 251, 252, 252, 253, 254, 255,
-};
-
-
-av_cold int ffv1_common_init(AVCodecContext *avctx)
+av_cold int ff_ffv1_common_init(AVCodecContext *avctx)
 {
     FFV1Context *s = avctx->priv_data;
 
+    if (!avctx->width || !avctx->height)
+        return AVERROR_INVALIDDATA;
+
     s->avctx = avctx;
     s->flags = avctx->flags;
 
-    if (!avctx->width || !avctx->height)
-        return AVERROR_INVALIDDATA;
+    s->picture.f = av_frame_alloc();
+    s->last_picture.f = av_frame_alloc();
+    if (!s->picture.f || !s->last_picture.f)
+        return AVERROR(ENOMEM);
 
     s->width  = avctx->width;
     s->height = avctx->height;
@@ -151,7 +64,7 @@ av_cold int ffv1_common_init(AVCodecContext *avctx)
     return 0;
 }
 
-int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
+av_cold int ff_ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
 {
     int j;
 
@@ -162,13 +75,13 @@ int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
 
         if (fs->ac) {
             if (!p->state)
-                p->state = av_malloc(CONTEXT_SIZE * p->context_count *
+                p->state = av_malloc_array(p->context_count, CONTEXT_SIZE *
                                      sizeof(uint8_t));
             if (!p->state)
                 return AVERROR(ENOMEM);
         } else {
             if (!p->vlc_state)
-                p->vlc_state = av_malloc(p->context_count * sizeof(VlcState));
+                p->vlc_state = av_malloc_array(p->context_count, sizeof(VlcState));
             if (!p->vlc_state)
                 return AVERROR(ENOMEM);
         }
@@ -177,7 +90,7 @@ int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
     if (fs->ac > 1) {
         //FIXME only redo if state_transition changed
         for (j = 1; j < 256; j++) {
-            fs->c.one_state[j]        = f->state_transition[j];
+            fs->c. one_state[      j] = f->state_transition[j];
             fs->c.zero_state[256 - j] = 256 - fs->c.one_state[j];
         }
     }
@@ -185,17 +98,25 @@ int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
     return 0;
 }
 
-av_cold int ffv1_init_slice_contexts(FFV1Context *f)
+av_cold int ff_ffv1_init_slices_state(FFV1Context *f)
 {
-    int i, j;
-
-    f->slice_count = f->num_h_slices * f->num_v_slices;
-    if (f->slice_count <= 0) {
-        av_log(f->avctx, AV_LOG_ERROR, "Invalid number of slices\n");
-        return AVERROR(EINVAL);
+    int i, ret;
+    for (i = 0; i < f->max_slice_count; i++) {
+        FFV1Context *fs = f->slice_context[i];
+        if ((ret = ff_ffv1_init_slice_state(f, fs)) < 0)
+            return AVERROR(ENOMEM);
     }
+    return 0;
+}
+
+av_cold int ff_ffv1_init_slice_contexts(FFV1Context *f)
+{
+    int i;
+
+    f->max_slice_count = f->num_h_slices * f->num_v_slices;
+    av_assert0(f->max_slice_count > 0);
 
-    for (i = 0; i < f->slice_count; i++) {
+    for (i = 0; i < f->max_slice_count; i++) {
         int sx          = i % f->num_h_slices;
         int sy          = i / f->num_h_slices;
         int sxs         = f->avctx->width  *  sx      / f->num_h_slices;
@@ -203,6 +124,7 @@ av_cold int ffv1_init_slice_contexts(FFV1Context *f)
         int sys         = f->avctx->height *  sy      / f->num_v_slices;
         int sye         = f->avctx->height * (sy + 1) / f->num_v_slices;
         FFV1Context *fs = av_mallocz(sizeof(*fs));
+
         if (!fs)
             goto memfail;
 
@@ -215,29 +137,29 @@ av_cold int ffv1_init_slice_contexts(FFV1Context *f)
         fs->slice_x      = sxs;
         fs->slice_y      = sys;
 
-        fs->sample_buffer = av_malloc(3 * MAX_PLANES * (fs->width + 6) *
+        fs->sample_buffer = av_malloc_array((fs->width + 6), 3 * MAX_PLANES *
                                       sizeof(*fs->sample_buffer));
         if (!fs->sample_buffer) {
-            av_free(fs);
+            av_freep(&f->slice_context[i]);
             goto memfail;
         }
     }
     return 0;
 
 memfail:
-    for (j = 0; j < i; j++) {
-        av_free(f->slice_context[j]->sample_buffer);
-        av_free(f->slice_context[j]);
+    while(--i >= 0) {
+        av_freep(&f->slice_context[i]->sample_buffer);
+        av_freep(&f->slice_context[i]);
     }
     return AVERROR(ENOMEM);
 }
 
-int ffv1_allocate_initial_states(FFV1Context *f)
+int ff_ffv1_allocate_initial_states(FFV1Context *f)
 {
     int i;
 
     for (i = 0; i < f->quant_table_count; i++) {
-        f->initial_states[i] = av_malloc(f->context_count[i] *
+        f->initial_states[i] = av_malloc_array(f->context_count[i],
                                          sizeof(*f->initial_states[i]));
         if (!f->initial_states[i])
             return AVERROR(ENOMEM);
@@ -247,7 +169,7 @@ int ffv1_allocate_initial_states(FFV1Context *f)
     return 0;
 }
 
-void ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs)
+void ff_ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs)
 {
     int i, j;
 
@@ -274,12 +196,21 @@ void ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs)
     }
 }
 
-av_cold int ffv1_close(AVCodecContext *avctx)
+
+av_cold int ff_ffv1_close(AVCodecContext *avctx)
 {
     FFV1Context *s = avctx->priv_data;
     int i, j;
 
-    for (j = 0; j < s->slice_count; j++) {
+    if (s->picture.f)
+        ff_thread_release_buffer(avctx, &s->picture);
+    av_frame_free(&s->picture.f);
+
+    if (s->last_picture.f)
+        ff_thread_release_buffer(avctx, &s->last_picture);
+    av_frame_free(&s->last_picture.f);
+
+    for (j = 0; j < s->max_slice_count; j++) {
         FFV1Context *fs = s->slice_context[j];
         for (i = 0; i < s->plane_count; i++) {
             PlaneContext *p = &fs->plane[i];
@@ -293,14 +224,14 @@ av_cold int ffv1_close(AVCodecContext *avctx)
     av_freep(&avctx->stats_out);
     for (j = 0; j < s->quant_table_count; j++) {
         av_freep(&s->initial_states[j]);
-        for (i = 0; i < s->slice_count; i++) {
+        for (i = 0; i < s->max_slice_count; i++) {
             FFV1Context *sf = s->slice_context[i];
             av_freep(&sf->rc_stat2[j]);
         }
         av_freep(&s->rc_stat2[j]);
     }
 
-    for (i = 0; i < s->slice_count; i++)
+    for (i = 0; i < s->max_slice_count; i++)
         av_freep(&s->slice_context[i]);
 
     return 0;
diff --git a/libavcodec/ffv1.h b/libavcodec/ffv1.h
index 6e8c798b6a..b68ed2c4c1 100644
--- a/libavcodec/ffv1.h
+++ b/libavcodec/ffv1.h
@@ -3,32 +3,49 @@
  *
  * Copyright (c) 2003-2012 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_FFV1_H
 #define AVCODEC_FFV1_H
 
-#include <stdint.h>
+/**
+ * @file
+ * FF Video Codec 1 (a lossless codec)
+ */
 
+#include "libavutil/avassert.h"
+#include "libavutil/crc.h"
+#include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/timer.h"
 #include "avcodec.h"
 #include "get_bits.h"
+#include "internal.h"
+#include "mathops.h"
 #include "put_bits.h"
 #include "rangecoder.h"
+#include "thread.h"
+
+#ifdef __INTEL_COMPILER
+#undef av_flatten
+#define av_flatten
+#endif
 
 #define MAX_PLANES 4
 #define CONTEXT_SIZE 32
@@ -36,14 +53,6 @@
 #define MAX_QUANT_TABLES 8
 #define MAX_CONTEXT_INPUTS 5
 
-extern const uint8_t ff_log2_run[41];
-
-extern const int8_t ffv1_quant5_10bit[256];
-extern const int8_t ffv1_quant5[256];
-extern const int8_t ffv1_quant9_10bit[256];
-extern const int8_t ffv1_quant11[256];
-extern const uint8_t ffv1_ver2_state[256];
-
 typedef struct VlcState {
     int16_t drift;
     uint16_t error_sum;
@@ -71,7 +80,7 @@ typedef struct FFV1Context {
     uint64_t rc_stat[256][2];
     uint64_t (*rc_stat2[MAX_QUANT_TABLES])[32][2];
     int version;
-    int minor_version;
+    int micro_version;
     int width, height;
     int chroma_planes;
     int chroma_h_shift, chroma_v_shift;
@@ -79,13 +88,13 @@ typedef struct FFV1Context {
     int flags;
     int picture_number;
     int key_frame;
-    const AVFrame *frame;
-    AVFrame *last_picture;
+    ThreadFrame picture, last_picture;
+    struct FFV1Context *fsrc;
 
     AVFrame *cur;
     int plane_count;
-    int ac;     // 1 = range coder <-> 0 = golomb rice
-    int ac_byte_count;      // number of bytes used for AC coding
+    int ac;                              ///< 1=range coder <-> 0=golomb rice
+    int ac_byte_count;                   ///< number of bytes used for AC coding
     PlaneContext plane[MAX_PLANES];
     int16_t quant_table[MAX_CONTEXT_INPUTS][256];
     int16_t quant_tables[MAX_QUANT_TABLES][MAX_CONTEXT_INPUTS][256];
@@ -97,6 +106,7 @@ typedef struct FFV1Context {
     int16_t *sample_buffer;
 
     int ec;
+    int intra;
     int slice_damaged;
     int key_frame_ok;
 
@@ -108,21 +118,34 @@ typedef struct FFV1Context {
 
     struct FFV1Context *slice_context[MAX_SLICES];
     int slice_count;
+    int max_slice_count;
     int num_v_slices;
     int num_h_slices;
     int slice_width;
     int slice_height;
     int slice_x;
     int slice_y;
+    int slice_reset_contexts;
+    int slice_coding_mode;
+    int slice_rct_by_coef;
+    int slice_rct_ry_coef;
 } FFV1Context;
 
+int ff_ffv1_common_init(AVCodecContext *avctx);
+int ff_ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs);
+int ff_ffv1_init_slices_state(FFV1Context *f);
+int ff_ffv1_init_slice_contexts(FFV1Context *f);
+int ff_ffv1_allocate_initial_states(FFV1Context *f);
+void ff_ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs);
+int ff_ffv1_close(AVCodecContext *avctx);
+
 static av_always_inline int fold(int diff, int bits)
 {
     if (bits == 8)
         diff = (int8_t)diff;
     else {
         diff +=  1 << (bits  - 1);
-        diff &= (1 <<  bits) - 1;
+        diff  = av_mod_uintp2(diff, bits);
         diff -=  1 << (bits  - 1);
     }
 
@@ -194,11 +217,4 @@ static inline void update_vlc_state(VlcState *const state, const int v)
     state->count = count;
 }
 
-int ffv1_common_init(AVCodecContext *avctx);
-int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs);
-int ffv1_init_slice_contexts(FFV1Context *f);
-int ffv1_allocate_initial_states(FFV1Context *f);
-void ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs);
-int ffv1_close(AVCodecContext *avctx);
-
 #endif /* AVCODEC_FFV1_H */
diff --git a/libavcodec/ffv1dec.c b/libavcodec/ffv1dec.c
index 01c58a14ca..a8d5e33da9 100644
--- a/libavcodec/ffv1dec.c
+++ b/libavcodec/ffv1dec.c
@@ -1,22 +1,22 @@
 /*
  * FFV1 decoder
  *
- * Copyright (c) 2003-2012 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2003-2013 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,15 +26,14 @@
  */
 
 #include "libavutil/avassert.h"
-#include "libavutil/pixdesc.h"
 #include "libavutil/crc.h"
 #include "libavutil/opt.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "get_bits.h"
-#include "put_bits.h"
 #include "rangecoder.h"
 #include "golomb.h"
 #include "mathops.h"
@@ -48,8 +47,11 @@ static inline av_flatten int get_symbol_inline(RangeCoder *c, uint8_t *state,
     else {
         int i, e, a;
         e = 0;
-        while (get_rac(c, state + 1 + FFMIN(e, 9))) // 1..10
+        while (get_rac(c, state + 1 + FFMIN(e, 9))) { // 1..10
             e++;
+            if (e > 31)
+                return AVERROR_INVALIDDATA;
+        }
 
         a = 1;
         for (i = e - 1; i >= 0; i--)
@@ -77,8 +79,6 @@ static inline int get_vlc_symbol(GetBitContext *gb, VlcState *const state,
         i += i;
     }
 
-    assert(k <= 8);
-
     v = get_sr_golomb(gb, k, 12, bits);
     ff_dlog(NULL, "v:%d bias:%d error:%d drift:%d count:%d k:%d",
             v, state->bias, state->error_sum, state->drift, state->count, k);
@@ -108,6 +108,19 @@ static av_always_inline void decode_line(FFV1Context *s, int w,
     int run_mode  = 0;
     int run_index = s->run_index;
 
+    if (s->slice_coding_mode == 1) {
+        int i;
+        for (x = 0; x < w; x++) {
+            int v = 0;
+            for (i=0; i<bits; i++) {
+                uint8_t state = 128;
+                v += v + get_rac(c, &state);
+            }
+            sample[1][x] = v;
+        }
+        return;
+    }
+
     for (x = 0; x < w; x++) {
         int diff, context, sign;
 
@@ -162,8 +175,7 @@ static av_always_inline void decode_line(FFV1Context *s, int w,
         if (sign)
             diff = -diff;
 
-        sample[1][x] = (predict(sample[1] + x, sample[0] + x) + diff) &
-                       ((1 << bits) - 1);
+        sample[1][x] = av_mod_uintp2(predict(sample[1] + x, sample[0] + x) + diff, bits);
     }
     s->run_index = run_index;
 }
@@ -195,29 +207,27 @@ static void decode_plane(FFV1Context *s, uint8_t *src,
             for (x = 0; x < w; x++)
                 src[x + stride * y] = sample[1][x];
         } else {
-            decode_line(s, w, sample, plane_index,
-                        s->avctx->bits_per_raw_sample);
+            decode_line(s, w, sample, plane_index, s->avctx->bits_per_raw_sample);
             if (s->packed_at_lsb) {
-                for (x = 0; x < w; x++)
-                    ((uint16_t *)(src + stride * y))[x] = sample[1][x];
+                for (x = 0; x < w; x++) {
+                    ((uint16_t*)(src + stride*y))[x] = sample[1][x];
+                }
             } else {
-                for (x = 0; x < w; x++)
-                    ((uint16_t *)(src + stride * y))[x] = sample[1][x] << (16 - s->avctx->bits_per_raw_sample);
+                for (x = 0; x < w; x++) {
+                    ((uint16_t*)(src + stride*y))[x] = sample[1][x] << (16 - s->avctx->bits_per_raw_sample);
+                }
             }
         }
 // STOP_TIMER("decode-line") }
     }
 }
 
-static void decode_rgb_frame(FFV1Context *s, uint8_t *src[3], int w, int h,
-                             int stride[3])
+static void decode_rgb_frame(FFV1Context *s, uint8_t *src[3], int w, int h, int stride[3])
 {
     int x, y, p;
     int16_t *sample[4][2];
-    int lbd  = s->avctx->bits_per_raw_sample <= 8;
-    int bits = s->avctx->bits_per_raw_sample > 0
-               ? s->avctx->bits_per_raw_sample
-               : 8;
+    int lbd    = s->avctx->bits_per_raw_sample <= 8;
+    int bits   = s->avctx->bits_per_raw_sample > 0 ? s->avctx->bits_per_raw_sample : 8;
     int offset = 1 << bits;
 
     for (x = 0; x < 4; x++) {
@@ -231,17 +241,17 @@ static void decode_rgb_frame(FFV1Context *s, uint8_t *src[3], int w, int h,
 
     for (y = 0; y < h; y++) {
         for (p = 0; p < 3 + s->transparency; p++) {
-            int16_t *temp = sample[p][0]; //FIXME try a normal buffer
+            int16_t *temp = sample[p][0]; // FIXME: try a normal buffer
 
             sample[p][0] = sample[p][1];
             sample[p][1] = temp;
 
-            sample[p][1][-1] = sample[p][0][0];
-            sample[p][0][w]  = sample[p][0][w - 1];
-            if (lbd)
-                decode_line(s, w, sample[p], (p + 1) / 2, 9);
+            sample[p][1][-1]= sample[p][0][0  ];
+            sample[p][0][ w]= sample[p][0][w-1];
+            if (lbd && s->slice_coding_mode == 0)
+                decode_line(s, w, sample[p], (p + 1)/2, 9);
             else
-                decode_line(s, w, sample[p], (p + 1) / 2, bits + 1);
+                decode_line(s, w, sample[p], (p + 1)/2, bits + (s->slice_coding_mode != 1));
         }
         for (x = 0; x < w; x++) {
             int g = sample[0][1][x];
@@ -249,19 +259,20 @@ static void decode_rgb_frame(FFV1Context *s, uint8_t *src[3], int w, int h,
             int r = sample[2][1][x];
             int a = sample[3][1][x];
 
-            b -= offset;
-            r -= offset;
-            g -= (b + r) >> 2;
-            b += g;
-            r += g;
+            if (s->slice_coding_mode != 1) {
+                b -= offset;
+                r -= offset;
+                g -= (b * s->slice_rct_by_coef + r * s->slice_rct_ry_coef) >> 2;
+                b += g;
+                r += g;
+            }
 
             if (lbd)
-                *((uint32_t *)(src[0] + x * 4 + stride[0] * y)) = b +
-                    (g << 8) + (r << 16) + (a << 24);
+                *((uint32_t*)(src[0] + x*4 + stride[0]*y)) = b + (g<<8) + (r<<16) + (a<<24);
             else {
-                *((uint16_t *)(src[0] + x * 2 + stride[0] * y)) = b;
-                *((uint16_t *)(src[1] + x * 2 + stride[1] * y)) = g;
-                *((uint16_t *)(src[2] + x * 2 + stride[2] * y)) = r;
+                *((uint16_t*)(src[0] + x*2 + stride[0]*y)) = b;
+                *((uint16_t*)(src[1] + x*2 + stride[1]*y)) = g;
+                *((uint16_t*)(src[2] + x*2 + stride[2]*y)) = r;
             }
         }
     }
@@ -274,35 +285,29 @@ static int decode_slice_header(FFV1Context *f, FFV1Context *fs)
     unsigned ps, i, context_count;
     memset(state, 128, sizeof(state));
 
-    if (fs->ac > 1) {
-        for (i = 1; i < 256; i++) {
-            fs->c.one_state[i]        = f->state_transition[i];
-            fs->c.zero_state[256 - i] = 256 - fs->c.one_state[i];
-        }
-    }
+    av_assert0(f->version > 2);
 
-    fs->slice_x      = get_symbol(c, state, 0) * f->width;
-    fs->slice_y      = get_symbol(c, state, 0) * f->height;
-    fs->slice_width  = (get_symbol(c, state, 0) + 1) * f->width + fs->slice_x;
+    fs->slice_x      =  get_symbol(c, state, 0)      * f->width ;
+    fs->slice_y      =  get_symbol(c, state, 0)      * f->height;
+    fs->slice_width  = (get_symbol(c, state, 0) + 1) * f->width  + fs->slice_x;
     fs->slice_height = (get_symbol(c, state, 0) + 1) * f->height + fs->slice_y;
 
-    fs->slice_x     /= f->num_h_slices;
-    fs->slice_y     /= f->num_v_slices;
-    fs->slice_width  = fs->slice_width / f->num_h_slices - fs->slice_x;
-    fs->slice_height = fs->slice_height / f->num_v_slices - fs->slice_y;
-    if ((unsigned)fs->slice_width  > f->width ||
-        (unsigned)fs->slice_height > f->height)
-        return AVERROR_INVALIDDATA;
-    if ((unsigned)fs->slice_x + (uint64_t)fs->slice_width  > f->width ||
-        (unsigned)fs->slice_y + (uint64_t)fs->slice_height > f->height)
-        return AVERROR_INVALIDDATA;
+    fs->slice_x /= f->num_h_slices;
+    fs->slice_y /= f->num_v_slices;
+    fs->slice_width  = fs->slice_width /f->num_h_slices - fs->slice_x;
+    fs->slice_height = fs->slice_height/f->num_v_slices - fs->slice_y;
+    if ((unsigned)fs->slice_width > f->width || (unsigned)fs->slice_height > f->height)
+        return -1;
+    if (    (unsigned)fs->slice_x + (uint64_t)fs->slice_width  > f->width
+         || (unsigned)fs->slice_y + (uint64_t)fs->slice_height > f->height)
+        return -1;
 
     for (i = 0; i < f->plane_count; i++) {
-        PlaneContext *const p = &fs->plane[i];
-        int idx               = get_symbol(c, state, 0);
-        if (idx > (unsigned)f->quant_table_count) {
+        PlaneContext * const p = &fs->plane[i];
+        int idx = get_symbol(c, state, 0);
+        if (idx >= (unsigned)f->quant_table_count) {
             av_log(f->avctx, AV_LOG_ERROR, "quant_table_index out of range\n");
-            return AVERROR_INVALIDDATA;
+            return -1;
         }
         p->quant_table_index = idx;
         memcpy(p->quant_table, f->quant_tables[idx], sizeof(p->quant_table));
@@ -336,65 +341,110 @@ static int decode_slice_header(FFV1Context *f, FFV1Context *fs)
         f->cur->sample_aspect_ratio = (AVRational){ 0, 1 };
     }
 
+    if (fs->version > 3) {
+        fs->slice_reset_contexts = get_rac(c, state);
+        fs->slice_coding_mode = get_symbol(c, state, 0);
+        if (fs->slice_coding_mode != 1) {
+            fs->slice_rct_by_coef = get_symbol(c, state, 0);
+            fs->slice_rct_ry_coef = get_symbol(c, state, 0);
+            if ((uint64_t)fs->slice_rct_by_coef + (uint64_t)fs->slice_rct_ry_coef > 4) {
+                av_log(f->avctx, AV_LOG_ERROR, "slice_rct_y_coef out of range\n");
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    }
+
     return 0;
 }
 
 static int decode_slice(AVCodecContext *c, void *arg)
 {
-    FFV1Context *fs = *(void **)arg;
-    FFV1Context *f  = fs->avctx->priv_data;
+    FFV1Context *fs   = *(void **)arg;
+    FFV1Context *f    = fs->avctx->priv_data;
     int width, height, x, y, ret;
-    const int ps = (av_pix_fmt_desc_get(c->pix_fmt)->flags & AV_PIX_FMT_FLAG_PLANAR)
-                   ? (c->bits_per_raw_sample > 8) + 1
-                   : 4;
-    AVFrame *const p = f->cur;
+    const int ps      = av_pix_fmt_desc_get(c->pix_fmt)->comp[0].step;
+    AVFrame * const p = f->cur;
+    int i, si;
+
+    for( si=0; fs != f->slice_context[si]; si ++)
+        ;
+
+    if(f->fsrc && !p->key_frame)
+        ff_thread_await_progress(&f->last_picture, si, 0);
+
+    if(f->fsrc && !p->key_frame) {
+        FFV1Context *fssrc = f->fsrc->slice_context[si];
+        FFV1Context *fsdst = f->slice_context[si];
+        av_assert1(fsdst->plane_count == fssrc->plane_count);
+        av_assert1(fsdst == fs);
+
+        if (!p->key_frame)
+            fsdst->slice_damaged |= fssrc->slice_damaged;
+
+        for (i = 0; i < f->plane_count; i++) {
+            PlaneContext *psrc = &fssrc->plane[i];
+            PlaneContext *pdst = &fsdst->plane[i];
+
+            av_free(pdst->state);
+            av_free(pdst->vlc_state);
+            memcpy(pdst, psrc, sizeof(*pdst));
+            pdst->state = NULL;
+            pdst->vlc_state = NULL;
+
+            if (fssrc->ac) {
+                pdst->state = av_malloc_array(CONTEXT_SIZE,  psrc->context_count);
+                memcpy(pdst->state, psrc->state, CONTEXT_SIZE * psrc->context_count);
+            } else {
+                pdst->vlc_state = av_malloc_array(sizeof(*pdst->vlc_state), psrc->context_count);
+                memcpy(pdst->vlc_state, psrc->vlc_state, sizeof(*pdst->vlc_state) * psrc->context_count);
+            }
+        }
+    }
+
+    fs->slice_rct_by_coef = 1;
+    fs->slice_rct_ry_coef = 1;
 
     if (f->version > 2) {
+        if (ff_ffv1_init_slice_state(f, fs) < 0)
+            return AVERROR(ENOMEM);
         if (decode_slice_header(f, fs) < 0) {
             fs->slice_damaged = 1;
             return AVERROR_INVALIDDATA;
         }
     }
-    if ((ret = ffv1_init_slice_state(f, fs)) < 0)
+    if ((ret = ff_ffv1_init_slice_state(f, fs)) < 0)
         return ret;
-    if (f->cur->key_frame)
-        ffv1_clear_slice_state(f, fs);
+    if (f->cur->key_frame || fs->slice_reset_contexts)
+        ff_ffv1_clear_slice_state(f, fs);
+
     width  = fs->slice_width;
     height = fs->slice_height;
     x      = fs->slice_x;
     y      = fs->slice_y;
 
     if (!fs->ac) {
-        if (f->version == 3 && f->minor_version > 1 || f->version > 3)
+        if (f->version == 3 && f->micro_version > 1 || f->version > 3)
             get_rac(&fs->c, (uint8_t[]) { 129 });
         fs->ac_byte_count = f->version > 2 || (!x && !y) ? fs->c.bytestream - fs->c.bytestream_start - 1 : 0;
-        init_get_bits(&fs->gb, fs->c.bytestream_start + fs->ac_byte_count,
-                      (fs->c.bytestream_end - fs->c.bytestream_start -
-                       fs->ac_byte_count) * 8);
+        init_get_bits(&fs->gb,
+                      fs->c.bytestream_start + fs->ac_byte_count,
+                      (fs->c.bytestream_end - fs->c.bytestream_start - fs->ac_byte_count) * 8);
     }
 
     av_assert1(width && height);
     if (f->colorspace == 0) {
-        const int chroma_width  = -((-width) >> f->chroma_h_shift);
-        const int chroma_height = -((-height) >> f->chroma_v_shift);
+        const int chroma_width  = FF_CEIL_RSHIFT(width,  f->chroma_h_shift);
+        const int chroma_height = FF_CEIL_RSHIFT(height, f->chroma_v_shift);
         const int cx            = x >> f->chroma_h_shift;
         const int cy            = y >> f->chroma_v_shift;
-        decode_plane(fs, p->data[0] + ps * x + y * p->linesize[0], width,
-                     height, p->linesize[0],
-                     0);
+        decode_plane(fs, p->data[0] + ps*x + y*p->linesize[0], width, height, p->linesize[0], 0);
 
         if (f->chroma_planes) {
-            decode_plane(fs, p->data[1] + ps * cx + cy * p->linesize[1],
-                         chroma_width, chroma_height, p->linesize[1],
-                         1);
-            decode_plane(fs, p->data[2] + ps * cx + cy * p->linesize[2],
-                         chroma_width, chroma_height, p->linesize[2],
-                         1);
+            decode_plane(fs, p->data[1] + ps*cx+cy*p->linesize[1], chroma_width, chroma_height, p->linesize[1], 1);
+            decode_plane(fs, p->data[2] + ps*cx+cy*p->linesize[2], chroma_width, chroma_height, p->linesize[2], 1);
         }
         if (fs->transparency)
-            decode_plane(fs, p->data[3] + ps * x + y * p->linesize[3], width,
-                         height, p->linesize[3],
-                         2);
+            decode_plane(fs, p->data[3] + ps*x + y*p->linesize[3], width, height, p->linesize[3], (f->version >= 4 && !f->chroma_planes) ? 1 : 2);
     } else {
         uint8_t *planes[3] = { p->data[0] + ps * x + y * p->linesize[0],
                                p->data[1] + ps * x + y * p->linesize[1],
@@ -404,16 +454,17 @@ static int decode_slice(AVCodecContext *c, void *arg)
     if (fs->ac && f->version > 2) {
         int v;
         get_rac(&fs->c, (uint8_t[]) { 129 });
-        v = fs->c.bytestream_end - fs->c.bytestream - 2 - 5 * f->ec;
+        v = fs->c.bytestream_end - fs->c.bytestream - 2 - 5*f->ec;
         if (v) {
-            av_log(f->avctx, AV_LOG_ERROR, "bytestream end mismatching by %d\n",
-                   v);
+            av_log(f->avctx, AV_LOG_ERROR, "bytestream end mismatching by %d\n", v);
             fs->slice_damaged = 1;
         }
     }
 
     emms_c();
 
+    ff_thread_report_progress(&f->picture, si, 0);
+
     return 0;
 }
 
@@ -428,8 +479,8 @@ static int read_quant_table(RangeCoder *c, int16_t *quant_table, int scale)
     for (v = 0; i < 128; v++) {
         unsigned len = get_symbol(c, state, 0) + 1;
 
-        if (len > 128 - i)
-            return -1;
+        if (len > 128 - i || !len)
+            return AVERROR_INVALIDDATA;
 
         while (len--) {
             quant_table[i] = scale * v;
@@ -451,9 +502,12 @@ static int read_quant_tables(RangeCoder *c,
     int context_count = 1;
 
     for (i = 0; i < 5; i++) {
-        context_count *= read_quant_table(c, quant_table[i], context_count);
+        int ret = read_quant_table(c, quant_table[i], context_count);
+        if (ret < 0)
+            return ret;
+        context_count *= ret;
         if (context_count > 32768U) {
-            return -1;
+            return AVERROR_INVALIDDATA;
         }
     }
     return (context_count + 1) / 2;
@@ -465,6 +519,7 @@ static int read_extra_header(FFV1Context *f)
     uint8_t state[CONTEXT_SIZE];
     int i, j, k, ret;
     uint8_t state2[32][CONTEXT_SIZE];
+    unsigned crc = 0;
 
     memset(state2, 128, sizeof(state2));
     memset(state, 128, sizeof(state));
@@ -473,12 +528,17 @@ static int read_extra_header(FFV1Context *f)
     ff_build_rac_states(c, 0.05 * (1LL << 32), 256 - 8);
 
     f->version = get_symbol(c, state, 0);
+    if (f->version < 2) {
+        av_log(f->avctx, AV_LOG_ERROR, "Invalid version in global header\n");
+        return AVERROR_INVALIDDATA;
+    }
     if (f->version > 2) {
         c->bytestream_end -= 4;
-        f->minor_version   = get_symbol(c, state, 0);
+        f->micro_version = get_symbol(c, state, 0);
+        if (f->micro_version < 0)
+            return AVERROR_INVALIDDATA;
     }
     f->ac = f->avctx->coder_type = get_symbol(c, state, 0);
-
     if (f->ac > 1) {
         for (i = 1; i < 256; i++)
             f->state_transition[i] = get_symbol(c, state, 1) + c->one_state[i];
@@ -490,19 +550,27 @@ static int read_extra_header(FFV1Context *f)
     f->chroma_h_shift             = get_symbol(c, state, 0);
     f->chroma_v_shift             = get_symbol(c, state, 0);
     f->transparency               = get_rac(c, state);
-    f->plane_count                = 2 + f->transparency;
+    f->plane_count                = 1 + (f->chroma_planes || f->version<4) + f->transparency;
     f->num_h_slices               = 1 + get_symbol(c, state, 0);
     f->num_v_slices               = 1 + get_symbol(c, state, 0);
 
-    if (f->num_h_slices > (unsigned)f->width ||
-        f->num_v_slices > (unsigned)f->height) {
-        av_log(f->avctx, AV_LOG_ERROR, "too many slices\n");
+    if (f->chroma_h_shift > 4U || f->chroma_v_shift > 4U) {
+        av_log(f->avctx, AV_LOG_ERROR, "chroma shift parameters %d %d are invalid\n",
+               f->chroma_h_shift, f->chroma_v_shift);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (f->num_h_slices > (unsigned)f->width  || !f->num_h_slices ||
+        f->num_v_slices > (unsigned)f->height || !f->num_v_slices
+       ) {
+        av_log(f->avctx, AV_LOG_ERROR, "slice count invalid\n");
         return AVERROR_INVALIDDATA;
     }
 
     f->quant_table_count = get_symbol(c, state, 0);
     if (f->quant_table_count > (unsigned)MAX_QUANT_TABLES)
         return AVERROR_INVALIDDATA;
+
     for (i = 0; i < f->quant_table_count; i++) {
         f->context_count[i] = read_quant_tables(c, f->quant_tables[i]);
         if (f->context_count[i] < 0) {
@@ -510,7 +578,7 @@ static int read_extra_header(FFV1Context *f)
             return AVERROR_INVALIDDATA;
         }
     }
-    if ((ret = ffv1_allocate_initial_states(f)) < 0)
+    if ((ret = ff_ffv1_allocate_initial_states(f)) < 0)
         return ret;
 
     for (i = 0; i < f->quant_table_count; i++)
@@ -525,46 +593,59 @@ static int read_extra_header(FFV1Context *f)
 
     if (f->version > 2) {
         f->ec = get_symbol(c, state, 0);
+        if (f->micro_version > 2)
+            f->intra = get_symbol(c, state, 0);
     }
 
     if (f->version > 2) {
         unsigned v;
         v = av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0,
                    f->avctx->extradata, f->avctx->extradata_size);
-        if (v) {
+        if (v || f->avctx->extradata_size < 4) {
             av_log(f->avctx, AV_LOG_ERROR, "CRC mismatch %X!\n", v);
             return AVERROR_INVALIDDATA;
         }
+        crc = AV_RB32(f->avctx->extradata + f->avctx->extradata_size - 4);
     }
 
+    if (f->avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(f->avctx, AV_LOG_DEBUG,
+               "global: ver:%d.%d, coder:%d, colorspace: %d bpr:%d chroma:%d(%d:%d), alpha:%d slices:%dx%d qtabs:%d ec:%d intra:%d CRC:0x%08X\n",
+               f->version, f->micro_version,
+               f->ac,
+               f->colorspace,
+               f->avctx->bits_per_raw_sample,
+               f->chroma_planes, f->chroma_h_shift, f->chroma_v_shift,
+               f->transparency,
+               f->num_h_slices, f->num_v_slices,
+               f->quant_table_count,
+               f->ec,
+               f->intra,
+               crc
+              );
     return 0;
 }
 
-
 static int read_header(FFV1Context *f)
 {
     uint8_t state[CONTEXT_SIZE];
-    int i, j, context_count = -1;
+    int i, j, context_count = -1; //-1 to avoid warning
     RangeCoder *const c = &f->slice_context[0]->c;
 
     memset(state, 128, sizeof(state));
 
     if (f->version < 2) {
         int chroma_planes, chroma_h_shift, chroma_v_shift, transparency, colorspace, bits_per_raw_sample;
-        unsigned v = get_symbol(c, state, 0);
-        if (v > 1) {
-            av_log(f->avctx, AV_LOG_ERROR,
-                   "invalid version %d in version 1 header\n", v);
+        unsigned v= get_symbol(c, state, 0);
+        if (v >= 2) {
+            av_log(f->avctx, AV_LOG_ERROR, "invalid version %d in ver01 header\n", v);
             return AVERROR_INVALIDDATA;
         }
         f->version = v;
-
-        f->ac = f->avctx->coder_type = get_symbol(c, state, 0);
-
+        f->ac      = f->avctx->coder_type = get_symbol(c, state, 0);
         if (f->ac > 1) {
             for (i = 1; i < 256; i++)
-                f->state_transition[i] =
-                    get_symbol(c, state, 1) + c->one_state[i];
+                f->state_transition[i] = get_symbol(c, state, 1) + c->one_state[i];
         }
 
         colorspace          = get_symbol(c, state, 0); //YUV cs type
@@ -573,6 +654,8 @@ static int read_header(FFV1Context *f)
         chroma_h_shift      = get_symbol(c, state, 0);
         chroma_v_shift      = get_symbol(c, state, 0);
         transparency        = get_rac(c, state);
+        if (colorspace == 0 && f->avctx->skip_alpha)
+            transparency = 0;
 
         if (f->plane_count) {
             if (colorspace          != f->colorspace                 ||
@@ -586,6 +669,12 @@ static int read_header(FFV1Context *f)
             }
         }
 
+        if (chroma_h_shift > 4U || chroma_v_shift > 4U) {
+            av_log(f->avctx, AV_LOG_ERROR, "chroma shift parameters %d %d are invalid\n",
+                   chroma_h_shift, chroma_v_shift);
+            return AVERROR_INVALIDDATA;
+        }
+
         f->colorspace                 = colorspace;
         f->avctx->bits_per_raw_sample = bits_per_raw_sample;
         f->chroma_planes              = chroma_planes;
@@ -602,91 +691,60 @@ static int read_header(FFV1Context *f)
                 f->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
             else
                 f->avctx->pix_fmt = AV_PIX_FMT_GRAY16;
-        } else if (f->avctx->bits_per_raw_sample <= 8 && !f->transparency) {
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV444P;
-                break;
-            case 0x01:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV440P;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV422P;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-                break;
-            case 0x20:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV411P;
-                break;
-            case 0x22:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV410P;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+        } else if (f->avctx->bits_per_raw_sample<=8 && !f->transparency) {
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P; break;
+            case 0x01: f->avctx->pix_fmt = AV_PIX_FMT_YUV440P; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P; break;
+            case 0x20: f->avctx->pix_fmt = AV_PIX_FMT_YUV411P; break;
+            case 0x22: f->avctx->pix_fmt = AV_PIX_FMT_YUV410P; break;
             }
         } else if (f->avctx->bits_per_raw_sample <= 8 && f->transparency) {
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+            switch(16*f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P; break;
             }
-        } else if (f->avctx->bits_per_raw_sample == 9) {
+        } else if (f->avctx->bits_per_raw_sample == 9 && !f->transparency) {
             f->packed_at_lsb = 1;
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV444P9;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV422P9;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV420P9;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P9; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P9; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P9; break;
             }
-        } else if (f->avctx->bits_per_raw_sample == 10) {
+        } else if (f->avctx->bits_per_raw_sample == 9 && f->transparency) {
             f->packed_at_lsb = 1;
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV420P10;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P9; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P9; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P9; break;
             }
-        } else {
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV444P16;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV422P16;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV420P16;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+        } else if (f->avctx->bits_per_raw_sample == 10 && !f->transparency) {
+            f->packed_at_lsb = 1;
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P10; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P10; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P10; break;
+            }
+        } else if (f->avctx->bits_per_raw_sample == 10 && f->transparency) {
+            f->packed_at_lsb = 1;
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P10; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P10; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P10; break;
+            }
+        } else if (f->avctx->bits_per_raw_sample == 16 && !f->transparency){
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P16; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P16; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P16; break;
+            }
+        } else if (f->avctx->bits_per_raw_sample == 16 && f->transparency){
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P16; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P16; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P16; break;
             }
         }
     } else if (f->colorspace == 1) {
@@ -695,27 +753,25 @@ static int read_header(FFV1Context *f)
                    "chroma subsampling not supported in this colorspace\n");
             return AVERROR(ENOSYS);
         }
-        switch (f->avctx->bits_per_raw_sample) {
-        case 0:
-        case 8:
-            f->avctx->pix_fmt = AV_PIX_FMT_RGB32;
-            break;
-        case 9:
+        if (     f->avctx->bits_per_raw_sample ==  9)
             f->avctx->pix_fmt = AV_PIX_FMT_GBRP9;
-            break;
-        case 10:
+        else if (f->avctx->bits_per_raw_sample == 10)
             f->avctx->pix_fmt = AV_PIX_FMT_GBRP10;
-            break;
-        default:
-            av_log(f->avctx, AV_LOG_ERROR,
-                   "bit depth %d not supported\n",
-                   f->avctx->bits_per_raw_sample);
-            return AVERROR(ENOSYS);
-        }
+        else if (f->avctx->bits_per_raw_sample == 12)
+            f->avctx->pix_fmt = AV_PIX_FMT_GBRP12;
+        else if (f->avctx->bits_per_raw_sample == 14)
+            f->avctx->pix_fmt = AV_PIX_FMT_GBRP14;
+        else
+        if (f->transparency) f->avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        else                 f->avctx->pix_fmt = AV_PIX_FMT_0RGB32;
     } else {
         av_log(f->avctx, AV_LOG_ERROR, "colorspace not supported\n");
         return AVERROR(ENOSYS);
     }
+    if (f->avctx->pix_fmt == AV_PIX_FMT_NONE) {
+        av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
+        return AVERROR(ENOSYS);
+    }
 
     ff_dlog(f->avctx, "%d %d %d\n",
             f->chroma_h_shift, f->chroma_v_shift, f->avctx->pix_fmt);
@@ -725,6 +781,7 @@ static int read_header(FFV1Context *f)
             av_log(f->avctx, AV_LOG_ERROR, "read_quant_table error\n");
             return AVERROR_INVALIDDATA;
         }
+        f->slice_count = f->max_slice_count;
     } else if (f->version < 3) {
         f->slice_count = get_symbol(c, state, 0);
     } else {
@@ -732,16 +789,15 @@ static int read_header(FFV1Context *f)
         for (f->slice_count = 0;
              f->slice_count < MAX_SLICES && 3 < p - c->bytestream_start;
              f->slice_count++) {
-            int trailer = 3 + 5 * !!f->ec;
-            int size    = AV_RB24(p - trailer);
+            int trailer = 3 + 5*!!f->ec;
+            int size = AV_RB24(p-trailer);
             if (size + trailer > p - c->bytestream_start)
                 break;
             p -= size + trailer;
         }
     }
-    if (f->slice_count > (unsigned)MAX_SLICES || f->slice_count <= 0) {
-        av_log(f->avctx, AV_LOG_ERROR, "slice count %d is invalid\n",
-               f->slice_count);
+    if (f->slice_count > (unsigned)MAX_SLICES || f->slice_count <= 0 || f->slice_count > f->max_slice_count) {
+        av_log(f->avctx, AV_LOG_ERROR, "slice count %d is invalid (max=%d)\n", f->slice_count, f->max_slice_count);
         return AVERROR_INVALIDDATA;
     }
 
@@ -753,23 +809,20 @@ static int read_header(FFV1Context *f)
         fs->slice_damaged = 0;
 
         if (f->version == 2) {
-            fs->slice_x     = get_symbol(c, state, 0) * f->width;
-            fs->slice_y     = get_symbol(c, state, 0) * f->height;
-            fs->slice_width =
-                (get_symbol(c, state, 0) + 1) * f->width + fs->slice_x;
-            fs->slice_height =
-                (get_symbol(c, state, 0) + 1) * f->height + fs->slice_y;
-
-            fs->slice_x      /= f->num_h_slices;
-            fs->slice_y      /= f->num_v_slices;
+            fs->slice_x      =  get_symbol(c, state, 0)      * f->width ;
+            fs->slice_y      =  get_symbol(c, state, 0)      * f->height;
+            fs->slice_width  = (get_symbol(c, state, 0) + 1) * f->width  + fs->slice_x;
+            fs->slice_height = (get_symbol(c, state, 0) + 1) * f->height + fs->slice_y;
+
+            fs->slice_x     /= f->num_h_slices;
+            fs->slice_y     /= f->num_v_slices;
             fs->slice_width  = fs->slice_width  / f->num_h_slices - fs->slice_x;
             fs->slice_height = fs->slice_height / f->num_v_slices - fs->slice_y;
-            if ((unsigned)fs->slice_width > f->width ||
+            if ((unsigned)fs->slice_width  > f->width ||
                 (unsigned)fs->slice_height > f->height)
                 return AVERROR_INVALIDDATA;
-            if ((unsigned)fs->slice_x + (uint64_t)fs->slice_width > f->width
-                || (unsigned)fs->slice_y + (uint64_t)fs->slice_height >
-                f->height)
+            if (   (unsigned)fs->slice_x + (uint64_t)fs->slice_width  > f->width
+                || (unsigned)fs->slice_y + (uint64_t)fs->slice_height > f->height)
                 return AVERROR_INVALIDDATA;
         }
 
@@ -804,28 +857,26 @@ static int read_header(FFV1Context *f)
     return 0;
 }
 
-static av_cold int ffv1_decode_init(AVCodecContext *avctx)
+static av_cold int decode_init(AVCodecContext *avctx)
 {
     FFV1Context *f = avctx->priv_data;
     int ret;
 
-    ffv1_common_init(avctx);
-
-    f->last_picture = av_frame_alloc();
-    if (!f->last_picture)
-        return AVERROR(ENOMEM);
+    if ((ret = ff_ffv1_common_init(avctx)) < 0)
+        return ret;
 
     if (avctx->extradata && (ret = read_extra_header(f)) < 0)
         return ret;
 
-    if ((ret = ffv1_init_slice_contexts(f)) < 0)
+    if ((ret = ff_ffv1_init_slice_contexts(f)) < 0)
         return ret;
 
+    avctx->internal->allocate_progress = 1;
+
     return 0;
 }
 
-static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
-                             int *got_frame, AVPacket *avpkt)
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
 {
     uint8_t *buf        = avpkt->data;
     int buf_size        = avpkt->size;
@@ -834,10 +885,22 @@ static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
     int i, ret;
     uint8_t keystate = 128;
     uint8_t *buf_p;
-    AVFrame *const p    = data;
+    AVFrame *p;
+
+    if (f->last_picture.f)
+        ff_thread_release_buffer(avctx, &f->last_picture);
+    FFSWAP(ThreadFrame, f->picture, f->last_picture);
 
-    f->cur = p;
+    f->cur = p = f->picture.f;
+
+    if (f->version < 3 && avctx->field_order > AV_FIELD_PROGRESSIVE) {
+        /* we have interlaced material flagged in container */
+        p->interlaced_frame = 1;
+        if (avctx->field_order == AV_FIELD_TT || avctx->field_order == AV_FIELD_TB)
+            p->top_field_first = 1;
+    }
 
+    f->avctx = avctx;
     ff_init_range_decoder(c, buf, buf_size);
     ff_build_rac_states(c, 0.05 * (1LL << 32), 256 - 8);
 
@@ -857,27 +920,23 @@ static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
         p->key_frame = 0;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &f->picture, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if (avctx->debug & FF_DEBUG_PICT_INFO)
-        av_log(avctx, AV_LOG_DEBUG,
-               "ver:%d keyframe:%d coder:%d ec:%d slices:%d bps:%d\n",
-               f->version, p->key_frame, f->ac, f->ec, f->slice_count,
-               f->avctx->bits_per_raw_sample);
+        av_log(avctx, AV_LOG_DEBUG, "ver:%d keyframe:%d coder:%d ec:%d slices:%d bps:%d\n",
+               f->version, p->key_frame, f->ac, f->ec, f->slice_count, f->avctx->bits_per_raw_sample);
+
+    ff_thread_finish_setup(avctx);
 
     buf_p = buf + buf_size;
     for (i = f->slice_count - 1; i >= 0; i--) {
         FFV1Context *fs = f->slice_context[i];
-        int trailer     = 3 + 5 * !!f->ec;
+        int trailer = 3 + 5*!!f->ec;
         int v;
 
-        if (i || f->version > 2)
-            v = AV_RB24(buf_p - trailer) + trailer;
-        else
-            v = buf_p - c->bytestream_start;
+        if (i || f->version > 2) v = AV_RB24(buf_p-trailer) + trailer;
+        else                     v = buf_p - c->bytestream_start;
         if (buf_p - c->bytestream_start < v) {
             av_log(avctx, AV_LOG_ERROR, "Slice pointer chain broken\n");
             return AVERROR_INVALIDDATA;
@@ -887,9 +946,20 @@ static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
         if (f->ec) {
             unsigned crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0, buf_p, v);
             if (crc) {
-                av_log(f->avctx, AV_LOG_ERROR, "CRC mismatch %X!\n", crc);
+                int64_t ts = avpkt->pts != AV_NOPTS_VALUE ? avpkt->pts : avpkt->dts;
+                av_log(f->avctx, AV_LOG_ERROR, "CRC mismatch %X!", crc);
+                if (ts != AV_NOPTS_VALUE && avctx->pkt_timebase.num) {
+                    av_log(f->avctx, AV_LOG_ERROR, "at %f seconds\n", ts*av_q2d(avctx->pkt_timebase));
+                } else if (ts != AV_NOPTS_VALUE) {
+                    av_log(f->avctx, AV_LOG_ERROR, "at %"PRId64"\n", ts);
+                } else {
+                    av_log(f->avctx, AV_LOG_ERROR, "\n");
+                }
                 fs->slice_damaged = 1;
             }
+            if (avctx->debug & FF_DEBUG_PICT_INFO) {
+                av_log(avctx, AV_LOG_DEBUG, "slice %d, CRC: 0x%08X\n", i, AV_RB32(buf_p + v - 4));
+            }
         }
 
         if (i) {
@@ -897,57 +967,157 @@ static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
         } else
             fs->c.bytestream_end = buf_p + v;
 
+        fs->avctx = avctx;
         fs->cur = p;
     }
 
-    avctx->execute(avctx, decode_slice, &f->slice_context[0], NULL,
+    avctx->execute(avctx,
+                   decode_slice,
+                   &f->slice_context[0],
+                   NULL,
                    f->slice_count,
-                   sizeof(void *));
+                   sizeof(void*));
 
     for (i = f->slice_count - 1; i >= 0; i--) {
         FFV1Context *fs = f->slice_context[i];
         int j;
-        if (fs->slice_damaged && f->last_picture->data[0]) {
+        if (fs->slice_damaged && f->last_picture.f->data[0]) {
+            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
             const uint8_t *src[4];
             uint8_t *dst[4];
+            ff_thread_await_progress(&f->last_picture, INT_MAX, 0);
             for (j = 0; j < 4; j++) {
+                int pixshift = desc->comp[j].depth > 8;
                 int sh = (j == 1 || j == 2) ? f->chroma_h_shift : 0;
                 int sv = (j == 1 || j == 2) ? f->chroma_v_shift : 0;
                 dst[j] = p->data[j] + p->linesize[j] *
-                         (fs->slice_y >> sv) + (fs->slice_x >> sh);
-                src[j] = f->last_picture->data[j] +
-                         f->last_picture->linesize[j] *
-                         (fs->slice_y >> sv) + (fs->slice_x >> sh);
+                         (fs->slice_y >> sv) + ((fs->slice_x >> sh) << pixshift);
+                src[j] = f->last_picture.f->data[j] + f->last_picture.f->linesize[j] *
+                         (fs->slice_y >> sv) + ((fs->slice_x >> sh) << pixshift);
             }
             av_image_copy(dst, p->linesize, src,
-                          f->last_picture->linesize,
-                          avctx->pix_fmt, fs->slice_width,
+                          f->last_picture.f->linesize,
+                          avctx->pix_fmt,
+                          fs->slice_width,
                           fs->slice_height);
         }
     }
+    ff_thread_report_progress(&f->picture, INT_MAX, 0);
 
     f->picture_number++;
 
-    av_frame_unref(f->last_picture);
-    if ((ret = av_frame_ref(f->last_picture, p)) < 0)
-        return ret;
+    if (f->last_picture.f)
+        ff_thread_release_buffer(avctx, &f->last_picture);
     f->cur = NULL;
+    if ((ret = av_frame_ref(data, f->picture.f)) < 0)
+        return ret;
 
     *got_frame = 1;
 
     return buf_size;
 }
 
-static av_cold int ffv1_decode_close(AVCodecContext *avctx)
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
 {
-    FFV1Context *s = avctx->priv_data;;
+    FFV1Context *f = avctx->priv_data;
+    int i, ret;
 
-    av_frame_free(&s->last_picture);
+    f->picture.f      = NULL;
+    f->last_picture.f = NULL;
+    f->sample_buffer  = NULL;
+    f->max_slice_count = 0;
+    f->slice_count = 0;
 
-    ffv1_close(avctx);
+    for (i = 0; i < f->quant_table_count; i++) {
+        av_assert0(f->version > 1);
+        f->initial_states[i] = av_memdup(f->initial_states[i],
+                                         f->context_count[i] * sizeof(*f->initial_states[i]));
+    }
+
+    f->picture.f      = av_frame_alloc();
+    f->last_picture.f = av_frame_alloc();
+
+    if ((ret = ff_ffv1_init_slice_contexts(f)) < 0)
+        return ret;
 
     return 0;
 }
+#endif
+
+static void copy_fields(FFV1Context *fsdst, FFV1Context *fssrc, FFV1Context *fsrc)
+{
+    fsdst->version             = fsrc->version;
+    fsdst->micro_version       = fsrc->micro_version;
+    fsdst->chroma_planes       = fsrc->chroma_planes;
+    fsdst->chroma_h_shift      = fsrc->chroma_h_shift;
+    fsdst->chroma_v_shift      = fsrc->chroma_v_shift;
+    fsdst->transparency        = fsrc->transparency;
+    fsdst->plane_count         = fsrc->plane_count;
+    fsdst->ac                  = fsrc->ac;
+    fsdst->colorspace          = fsrc->colorspace;
+
+    fsdst->ec                  = fsrc->ec;
+    fsdst->intra               = fsrc->intra;
+    fsdst->slice_damaged       = fssrc->slice_damaged;
+    fsdst->key_frame_ok        = fsrc->key_frame_ok;
+
+    fsdst->bits_per_raw_sample = fsrc->bits_per_raw_sample;
+    fsdst->packed_at_lsb       = fsrc->packed_at_lsb;
+    fsdst->slice_count         = fsrc->slice_count;
+    if (fsrc->version<3){
+        fsdst->slice_x             = fssrc->slice_x;
+        fsdst->slice_y             = fssrc->slice_y;
+        fsdst->slice_width         = fssrc->slice_width;
+        fsdst->slice_height        = fssrc->slice_height;
+    }
+}
+
+#if HAVE_THREADS
+static int update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
+{
+    FFV1Context *fsrc = src->priv_data;
+    FFV1Context *fdst = dst->priv_data;
+    int i, ret;
+
+    if (dst == src)
+        return 0;
+
+    {
+        ThreadFrame picture = fdst->picture, last_picture = fdst->last_picture;
+        uint8_t (*initial_states[MAX_QUANT_TABLES])[32];
+        struct FFV1Context *slice_context[MAX_SLICES];
+        memcpy(initial_states, fdst->initial_states, sizeof(fdst->initial_states));
+        memcpy(slice_context,  fdst->slice_context , sizeof(fdst->slice_context));
+
+        memcpy(fdst, fsrc, sizeof(*fdst));
+        memcpy(fdst->initial_states, initial_states, sizeof(fdst->initial_states));
+        memcpy(fdst->slice_context,  slice_context , sizeof(fdst->slice_context));
+        fdst->picture      = picture;
+        fdst->last_picture = last_picture;
+        for (i = 0; i<fdst->num_h_slices * fdst->num_v_slices; i++) {
+            FFV1Context *fssrc = fsrc->slice_context[i];
+            FFV1Context *fsdst = fdst->slice_context[i];
+            copy_fields(fsdst, fssrc, fsrc);
+        }
+        av_assert0(!fdst->plane[0].state);
+        av_assert0(!fdst->sample_buffer);
+    }
+
+    av_assert1(fdst->max_slice_count == fsrc->max_slice_count);
+
+
+    ff_thread_release_buffer(dst, &fdst->picture);
+    if (fsrc->picture.f->data[0]) {
+        if ((ret = ff_thread_ref_frame(&fdst->picture, &fsrc->picture)) < 0)
+            return ret;
+    }
+
+    fdst->fsrc = fsrc;
+
+    return 0;
+}
+#endif
 
 AVCodec ff_ffv1_decoder = {
     .name           = "ffv1",
@@ -955,9 +1125,11 @@ AVCodec ff_ffv1_decoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_FFV1,
     .priv_data_size = sizeof(FFV1Context),
-    .init           = ffv1_decode_init,
-    .close          = ffv1_decode_close,
-    .decode         = ffv1_decode_frame,
+    .init           = decode_init,
+    .close          = ff_ffv1_close,
+    .decode         = decode_frame,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(update_thread_context),
     .capabilities   = AV_CODEC_CAP_DR1 /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/ |
-                      AV_CODEC_CAP_SLICE_THREADS,
+                      AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_SLICE_THREADS,
 };
diff --git a/libavcodec/ffv1enc.c b/libavcodec/ffv1enc.c
index 32b3711e16..5bd93fb2e9 100644
--- a/libavcodec/ffv1enc.c
+++ b/libavcodec/ffv1enc.c
@@ -1,22 +1,22 @@
 /*
- * FFV1 encoder for libavcodec
+ * FFV1 encoder
  *
- * Copyright (c) 2003-2012 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2003-2013 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,19 +27,114 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/avassert.h"
-#include "libavutil/pixdesc.h"
 #include "libavutil/crc.h"
 #include "libavutil/opt.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/timer.h"
 #include "avcodec.h"
 #include "internal.h"
-#include "get_bits.h"
 #include "put_bits.h"
 #include "rangecoder.h"
 #include "golomb.h"
 #include "mathops.h"
 #include "ffv1.h"
 
+static const int8_t quant5_10bit[256] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,
+     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+     1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0,
+};
+
+static const int8_t quant5[256] = {
+     0,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1,
+};
+
+static const int8_t quant9_10bit[256] = {
+     0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,
+     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+     3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3,
+    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
+    -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -0, -0, -0, -0,
+};
+
+static const int8_t quant11[256] = {
+     0,  1,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -1,
+};
+
+static const uint8_t ver2_state[256] = {
+      0,  10,  10,  10,  10,  16,  16,  16, 28,   16,  16,  29,  42,  49,  20,  49,
+     59,  25,  26,  26,  27,  31,  33,  33, 33,   34,  34,  37,  67,  38,  39,  39,
+     40,  40,  41,  79,  43,  44,  45,  45, 48,   48,  64,  50,  51,  52,  88,  52,
+     53,  74,  55,  57,  58,  58,  74,  60, 101,  61,  62,  84,  66,  66,  68,  69,
+     87,  82,  71,  97,  73,  73,  82,  75, 111,  77,  94,  78,  87,  81,  83,  97,
+     85,  83,  94,  86,  99,  89,  90,  99, 111,  92,  93,  134, 95,  98,  105, 98,
+    105, 110, 102, 108, 102, 118, 103, 106, 106, 113, 109, 112, 114, 112, 116, 125,
+    115, 116, 117, 117, 126, 119, 125, 121, 121, 123, 145, 124, 126, 131, 127, 129,
+    165, 130, 132, 138, 133, 135, 145, 136, 137, 139, 146, 141, 143, 142, 144, 148,
+    147, 155, 151, 149, 151, 150, 152, 157, 153, 154, 156, 168, 158, 162, 161, 160,
+    172, 163, 169, 164, 166, 184, 167, 170, 177, 174, 171, 173, 182, 176, 180, 178,
+    175, 189, 179, 181, 186, 183, 192, 185, 200, 187, 191, 188, 190, 197, 193, 196,
+    197, 194, 195, 196, 198, 202, 199, 201, 210, 203, 207, 204, 205, 206, 208, 214,
+    209, 211, 221, 212, 213, 215, 224, 216, 217, 218, 219, 220, 222, 228, 223, 225,
+    226, 224, 227, 229, 240, 230, 231, 232, 233, 234, 235, 236, 238, 239, 237, 242,
+    241, 243, 242, 244, 245, 246, 247, 248, 249, 250, 251, 252, 252, 253, 254, 255,
+};
+
 static void find_best_state(uint8_t best_state[256][256],
                             const uint8_t one_state[256])
 {
@@ -64,8 +159,8 @@ static void find_best_state(uint8_t best_state[256][256],
                 double newocc[256] = { 0 };
                 for (m = 1; m < 256; m++)
                     if (occ[m]) {
-                        len -= occ[m] *     (p  * l2tab[m] +
-                                        (1 - p) * l2tab[256 - m]);
+                        len -=occ[m]*(     p *l2tab[    m]
+                                      + (1-p)*l2tab[256-m]);
                     }
                 if (len < best_len[k]) {
                     best_len[k]      = len;
@@ -73,7 +168,7 @@ static void find_best_state(uint8_t best_state[256][256],
                 }
                 for (m = 1; m < 256; m++)
                     if (occ[m]) {
-                        newocc[one_state[m]]             += occ[m] * p;
+                        newocc[      one_state[      m]] += occ[m] * p;
                         newocc[256 - one_state[256 - m]] += occ[m] * (1 - p);
                     }
                 memcpy(occ, newocc, sizeof(occ));
@@ -136,6 +231,7 @@ static av_noinline void put_symbol(RangeCoder *c, uint8_t *state,
     put_symbol_inline(c, state, v, is_signed, NULL, NULL);
 }
 
+
 static inline void put_vlc_symbol(PutBitContext *pb, VlcState *const state,
                                   int v, int bits)
 {
@@ -149,7 +245,7 @@ static inline void put_vlc_symbol(PutBitContext *pb, VlcState *const state,
         i += i;
     }
 
-    assert(k <= 13);
+    av_assert2(k <= 13);
 
 #if 0 // JPEG LS
     if (k == 0 && 2 * state->drift <= -state->count)
@@ -179,7 +275,7 @@ static av_always_inline int encode_line(FFV1Context *s, int w,
     int run_mode  = 0;
 
     if (s->ac) {
-        if (c->bytestream_end - c->bytestream < w * 20) {
+        if (c->bytestream_end - c->bytestream < w * 35) {
             av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
             return AVERROR_INVALIDDATA;
         }
@@ -190,6 +286,18 @@ static av_always_inline int encode_line(FFV1Context *s, int w,
         }
     }
 
+    if (s->slice_coding_mode == 1) {
+        for (x = 0; x < w; x++) {
+            int i;
+            int v = sample[0][x];
+            for (i = bits-1; i>=0; i--) {
+                uint8_t state = 128;
+                put_rac(c, &state, (v>>i) & 1);
+            }
+        }
+        return 0;
+    }
+
     for (x = 0; x < w; x++) {
         int diff, context;
 
@@ -257,10 +365,10 @@ static av_always_inline int encode_line(FFV1Context *s, int w,
     return 0;
 }
 
-static void encode_plane(FFV1Context *s, uint8_t *src, int w, int h,
+static int encode_plane(FFV1Context *s, uint8_t *src, int w, int h,
                          int stride, int plane_index)
 {
-    int x, y, i;
+    int x, y, i, ret;
     const int ring_size = s->avctx->context_model ? 3 : 2;
     int16_t *sample[3];
     s->run_index = 0;
@@ -271,38 +379,40 @@ static void encode_plane(FFV1Context *s, uint8_t *src, int w, int h,
         for (i = 0; i < ring_size; i++)
             sample[i] = s->sample_buffer + (w + 6) * ((h + i - y) % ring_size) + 3;
 
-        sample[0][-1] = sample[1][0];
-        sample[1][w]  = sample[1][w - 1];
+        sample[0][-1]= sample[1][0  ];
+        sample[1][ w]= sample[1][w-1];
 // { START_TIMER
         if (s->bits_per_raw_sample <= 8) {
             for (x = 0; x < w; x++)
                 sample[0][x] = src[x + stride * y];
-            encode_line(s, w, sample, plane_index, 8);
+            if((ret = encode_line(s, w, sample, plane_index, 8)) < 0)
+                return ret;
         } else {
             if (s->packed_at_lsb) {
-                for (x = 0; x < w; x++)
-                    sample[0][x] = ((uint16_t *)(src + stride * y))[x];
+                for (x = 0; x < w; x++) {
+                    sample[0][x] = ((uint16_t*)(src + stride*y))[x];
+                }
             } else {
-                for (x = 0; x < w; x++)
-                    sample[0][x] =
-                        ((uint16_t *)(src + stride * y))[x] >> (16 - s->bits_per_raw_sample);
+                for (x = 0; x < w; x++) {
+                    sample[0][x] = ((uint16_t*)(src + stride*y))[x] >> (16 - s->bits_per_raw_sample);
+                }
             }
-            encode_line(s, w, sample, plane_index, s->bits_per_raw_sample);
+            if((ret = encode_line(s, w, sample, plane_index, s->bits_per_raw_sample)) < 0)
+                return ret;
         }
 // STOP_TIMER("encode line") }
     }
+    return 0;
 }
 
-static void encode_rgb_frame(FFV1Context *s, const uint8_t *src[3],
+static int encode_rgb_frame(FFV1Context *s, const uint8_t *src[3],
                              int w, int h, const int stride[3])
 {
     int x, y, p, i;
     const int ring_size = s->avctx->context_model ? 3 : 2;
-    int16_t *sample[MAX_PLANES][3];
-    int lbd  = s->avctx->bits_per_raw_sample <= 8;
-    int bits = s->avctx->bits_per_raw_sample > 0
-               ? s->avctx->bits_per_raw_sample
-               : 8;
+    int16_t *sample[4][3];
+    int lbd    = s->bits_per_raw_sample <= 8;
+    int bits   = s->bits_per_raw_sample > 0 ? s->bits_per_raw_sample : 8;
     int offset = 1 << bits;
 
     s->run_index = 0;
@@ -313,29 +423,29 @@ static void encode_rgb_frame(FFV1Context *s, const uint8_t *src[3],
     for (y = 0; y < h; y++) {
         for (i = 0; i < ring_size; i++)
             for (p = 0; p < MAX_PLANES; p++)
-                sample[p][i] = s->sample_buffer + p * ring_size *
-                               (w + 6) +
-                               ((h + i - y) % ring_size) * (w + 6) + 3;
+                sample[p][i]= s->sample_buffer + p*ring_size*(w+6) + ((h+i-y)%ring_size)*(w+6) + 3;
 
         for (x = 0; x < w; x++) {
             int b, g, r, av_uninit(a);
             if (lbd) {
-                unsigned v = *((const uint32_t *)(src[0] + x * 4 + stride[0] * y));
-                b = v & 0xFF;
-                g = (v >> 8) & 0xFF;
+                unsigned v = *((const uint32_t*)(src[0] + x*4 + stride[0]*y));
+                b =  v        & 0xFF;
+                g = (v >>  8) & 0xFF;
                 r = (v >> 16) & 0xFF;
-                a = v >> 24;
+                a =  v >> 24;
             } else {
-                b = *((const uint16_t *)(src[0] + x * 2 + stride[0] * y));
-                g = *((const uint16_t *)(src[1] + x * 2 + stride[1] * y));
-                r = *((const uint16_t *)(src[2] + x * 2 + stride[2] * y));
+                b = *((const uint16_t *)(src[0] + x*2 + stride[0]*y));
+                g = *((const uint16_t *)(src[1] + x*2 + stride[1]*y));
+                r = *((const uint16_t *)(src[2] + x*2 + stride[2]*y));
             }
 
-            b -= g;
-            r -= g;
-            g += (b + r) >> 2;
-            b += offset;
-            r += offset;
+            if (s->slice_coding_mode != 1) {
+                b -= g;
+                r -= g;
+                g += (b * s->slice_rct_by_coef + r * s->slice_rct_ry_coef) >> 2;
+                b += offset;
+                r += offset;
+            }
 
             sample[0][0][x] = g;
             sample[1][0][x] = b;
@@ -343,17 +453,20 @@ static void encode_rgb_frame(FFV1Context *s, const uint8_t *src[3],
             sample[3][0][x] = a;
         }
         for (p = 0; p < 3 + s->transparency; p++) {
-            sample[p][0][-1] = sample[p][1][0];
-            sample[p][1][w]  = sample[p][1][w - 1];
-            if (lbd)
-                encode_line(s, w, sample[p], (p + 1) / 2, 9);
+            int ret;
+            sample[p][0][-1] = sample[p][1][0  ];
+            sample[p][1][ w] = sample[p][1][w-1];
+            if (lbd && s->slice_coding_mode == 0)
+                ret = encode_line(s, w, sample[p], (p + 1) / 2, 9);
             else
-                encode_line(s, w, sample[p], (p + 1) / 2, bits + 1);
+                ret = encode_line(s, w, sample[p], (p + 1) / 2, bits + (s->slice_coding_mode != 1));
+            if (ret < 0)
+                return ret;
         }
     }
+    return 0;
 }
 
-
 static void write_quant_table(RangeCoder *c, int16_t *quant_table)
 {
     int last = 0;
@@ -393,7 +506,7 @@ static void write_header(FFV1Context *f)
                 put_symbol(c, state,
                            f->state_transition[i] - c->one_state[i], 1);
         }
-        put_symbol(c, state, f->colorspace, 0); // YUV cs type
+        put_symbol(c, state, f->colorspace, 0); //YUV cs type
         if (f->version > 0)
             put_symbol(c, state, f->bits_per_raw_sample, 0);
         put_rac(c, state, f->chroma_planes);
@@ -437,15 +550,19 @@ static int write_extradata(FFV1Context *f)
 
     f->avctx->extradata_size = 10000 + 4 +
                                     (11 * 11 * 5 * 5 * 5 + 11 * 11 * 11) * 32;
-    f->avctx->extradata = av_malloc(f->avctx->extradata_size);
+    f->avctx->extradata = av_malloc(f->avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!f->avctx->extradata)
+        return AVERROR(ENOMEM);
     ff_init_range_encoder(c, f->avctx->extradata, f->avctx->extradata_size);
     ff_build_rac_states(c, 0.05 * (1LL << 32), 256 - 8);
 
     put_symbol(c, state, f->version, 0);
     if (f->version > 2) {
-        if (f->version == 3)
-            f->minor_version = 2;
-        put_symbol(c, state, f->minor_version, 0);
+        if (f->version == 3) {
+            f->micro_version = 4;
+        } else if (f->version == 4)
+            f->micro_version = 2;
+        put_symbol(c, state, f->micro_version, 0);
     }
 
     put_symbol(c, state, f->ac, 0);
@@ -485,12 +602,11 @@ static int write_extradata(FFV1Context *f)
 
     if (f->version > 2) {
         put_symbol(c, state, f->ec, 0);
+        put_symbol(c, state, f->intra = (f->avctx->gop_size < 2), 0);
     }
 
     f->avctx->extradata_size = ff_rac_terminate(c);
-
-    v = av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0,
-               f->avctx->extradata, f->avctx->extradata_size);
+    v = av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0, f->avctx->extradata, f->avctx->extradata_size);
     AV_WL32(f->avctx->extradata + f->avctx->extradata_size, v);
     f->avctx->extradata_size += 4;
 
@@ -515,7 +631,7 @@ static int sort_stt(FFV1Context *s, uint8_t stt[256])
 
                 double size0 = COST2(i,  i) + COST2(i2, i2);
                 double sizeX = COST2(i, i2) + COST2(i2, i);
-                if (sizeX < size0 && i != 128 && i2 != 128) {
+                if (size0 - sizeX > size0*(1e-14) && i != 128 && i2 != 128) {
                     int j;
                     FFSWAP(int, stt[i], stt[i2]);
                     FFSWAP(int, s->rc_stat[i][0], s->rc_stat[i2][0]);
@@ -545,24 +661,14 @@ static int sort_stt(FFV1Context *s, uint8_t stt[256])
     return print;
 }
 
-static av_cold int init_slices_state(FFV1Context *f)
-{
-    int i, ret;
-    for (i = 0; i < f->slice_count; i++) {
-        FFV1Context *fs = f->slice_context[i];
-        if ((ret = ffv1_init_slice_state(f, fs)) < 0)
-            return AVERROR(ENOMEM);
-    }
-    return 0;
-}
-
-static av_cold int ffv1_encode_init(AVCodecContext *avctx)
+static av_cold int encode_init(AVCodecContext *avctx)
 {
     FFV1Context *s = avctx->priv_data;
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
     int i, j, k, m, ret;
 
-    ffv1_common_init(avctx);
+    if ((ret = ff_ffv1_common_init(avctx)) < 0)
+        return ret;
 
     s->version = 0;
 
@@ -570,35 +676,48 @@ static av_cold int ffv1_encode_init(AVCodecContext *avctx)
         avctx->slices > 1)
         s->version = FFMAX(s->version, 2);
 
-    if (avctx->level == 3) {
+    // Unspecified level & slices, we choose version 1.2+ to ensure multithreaded decodability
+    if (avctx->slices == 0 && avctx->level < 0 && avctx->width * avctx->height > 720*576)
+        s->version = FFMAX(s->version, 2);
+
+    if (avctx->level <= 0 && s->version == 2) {
         s->version = 3;
     }
+    if (avctx->level >= 0 && avctx->level <= 4) {
+        if (avctx->level < s->version) {
+            av_log(avctx, AV_LOG_ERROR, "Version %d needed for requested features but %d requested\n", s->version, avctx->level);
+            return AVERROR(EINVAL);
+        }
+        s->version = avctx->level;
+    }
 
     if (s->ec < 0) {
         s->ec = (s->version >= 3);
     }
 
-    if (s->version >= 2 &&
-        avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Version %d requested, please set -strict experimental in "
-               "order to enable it\n",
-               s->version);
-        return AVERROR(ENOSYS);
+    if ((s->version == 2 || s->version>3) && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
+        av_log(avctx, AV_LOG_ERROR, "Version 2 needed for requested features but version 2 is experimental and not enabled\n");
+        return AVERROR_INVALIDDATA;
     }
 
     s->ac = avctx->coder_type > 0 ? 2 : 0;
 
     s->plane_count = 3;
-    switch (avctx->pix_fmt) {
+    switch(avctx->pix_fmt) {
     case AV_PIX_FMT_YUV444P9:
     case AV_PIX_FMT_YUV422P9:
     case AV_PIX_FMT_YUV420P9:
+    case AV_PIX_FMT_YUVA444P9:
+    case AV_PIX_FMT_YUVA422P9:
+    case AV_PIX_FMT_YUVA420P9:
         if (!avctx->bits_per_raw_sample)
             s->bits_per_raw_sample = 9;
     case AV_PIX_FMT_YUV444P10:
     case AV_PIX_FMT_YUV420P10:
     case AV_PIX_FMT_YUV422P10:
+    case AV_PIX_FMT_YUVA444P10:
+    case AV_PIX_FMT_YUVA422P10:
+    case AV_PIX_FMT_YUVA420P10:
         s->packed_at_lsb = 1;
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
             s->bits_per_raw_sample = 10;
@@ -606,6 +725,9 @@ static av_cold int ffv1_encode_init(AVCodecContext *avctx)
     case AV_PIX_FMT_YUV444P16:
     case AV_PIX_FMT_YUV422P16:
     case AV_PIX_FMT_YUV420P16:
+    case AV_PIX_FMT_YUVA444P16:
+    case AV_PIX_FMT_YUVA422P16:
+    case AV_PIX_FMT_YUVA420P16:
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample) {
             s->bits_per_raw_sample = 16;
         } else if (!s->bits_per_raw_sample) {
@@ -616,15 +738,12 @@ static av_cold int ffv1_encode_init(AVCodecContext *avctx)
             return AVERROR_INVALIDDATA;
         }
         if (!s->ac && avctx->coder_type == -1) {
-            av_log(avctx, AV_LOG_INFO,
-                   "bits_per_raw_sample > 8, forcing coder 1\n");
+            av_log(avctx, AV_LOG_INFO, "bits_per_raw_sample > 8, forcing coder 1\n");
             s->ac = 2;
         }
         if (!s->ac) {
-            av_log(
-                avctx, AV_LOG_ERROR,
-                "bits_per_raw_sample of more than 8 needs -coder 1 currently\n");
-            return AVERROR_INVALIDDATA;
+            av_log(avctx, AV_LOG_ERROR, "bits_per_raw_sample of more than 8 needs -coder 1 currently\n");
+            return AVERROR(ENOSYS);
         }
         s->version = FFMAX(s->version, 1);
     case AV_PIX_FMT_GRAY8:
@@ -634,19 +753,29 @@ static av_cold int ffv1_encode_init(AVCodecContext *avctx)
     case AV_PIX_FMT_YUV420P:
     case AV_PIX_FMT_YUV411P:
     case AV_PIX_FMT_YUV410P:
-        s->chroma_planes = desc->nb_components < 3 ? 0 : 1;
-        s->colorspace    = 0;
-        break;
     case AV_PIX_FMT_YUVA444P:
     case AV_PIX_FMT_YUVA422P:
     case AV_PIX_FMT_YUVA420P:
-        s->chroma_planes = 1;
-        s->colorspace    = 0;
-        s->transparency  = 1;
+        s->chroma_planes = desc->nb_components < 3 ? 0 : 1;
+        s->colorspace = 0;
+        s->transparency = desc->nb_components == 4;
+        if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 8;
+        else if (!s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 8;
         break;
     case AV_PIX_FMT_RGB32:
-        s->colorspace   = 1;
+        s->colorspace = 1;
         s->transparency = 1;
+        s->chroma_planes = 1;
+        if (!avctx->bits_per_raw_sample)
+            s->bits_per_raw_sample = 8;
+        break;
+    case AV_PIX_FMT_0RGB32:
+        s->colorspace = 1;
+        s->chroma_planes = 1;
+        if (!avctx->bits_per_raw_sample)
+            s->bits_per_raw_sample = 8;
         break;
     case AV_PIX_FMT_GBRP9:
         if (!avctx->bits_per_raw_sample)
@@ -654,55 +783,64 @@ static av_cold int ffv1_encode_init(AVCodecContext *avctx)
     case AV_PIX_FMT_GBRP10:
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
             s->bits_per_raw_sample = 10;
-    case AV_PIX_FMT_GBRP16:
+    case AV_PIX_FMT_GBRP12:
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
-            s->bits_per_raw_sample = 16;
+            s->bits_per_raw_sample = 12;
+    case AV_PIX_FMT_GBRP14:
+        if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 14;
         else if (!s->bits_per_raw_sample)
             s->bits_per_raw_sample = avctx->bits_per_raw_sample;
-        s->colorspace    = 1;
+        s->colorspace = 1;
         s->chroma_planes = 1;
-        s->version       = FFMAX(s->version, 1);
+        s->version = FFMAX(s->version, 1);
+        if (!s->ac && avctx->coder_type == -1) {
+            av_log(avctx, AV_LOG_INFO, "bits_per_raw_sample > 8, forcing coder 1\n");
+            s->ac = 2;
+        }
+        if (!s->ac) {
+            av_log(avctx, AV_LOG_ERROR, "bits_per_raw_sample of more than 8 needs -coder 1 currently\n");
+            return AVERROR(ENOSYS);
+        }
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "format not supported\n");
-        return AVERROR_INVALIDDATA;
+        return AVERROR(ENOSYS);
     }
+    av_assert0(s->bits_per_raw_sample >= 8);
+
     if (s->transparency) {
-        av_log(
-            avctx, AV_LOG_WARNING,
-            "Storing alpha plane, this will require a recent FFV1 decoder to playback!\n");
+        av_log(avctx, AV_LOG_WARNING, "Storing alpha plane, this will require a recent FFV1 decoder to playback!\n");
     }
     if (avctx->context_model > 1U) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Invalid context model %d, valid values are 0 and 1\n",
-               avctx->context_model);
+        av_log(avctx, AV_LOG_ERROR, "Invalid context model %d, valid values are 0 and 1\n", avctx->context_model);
         return AVERROR(EINVAL);
     }
 
     if (s->ac > 1)
         for (i = 1; i < 256; i++)
-            s->state_transition[i] = ffv1_ver2_state[i];
+            s->state_transition[i] = ver2_state[i];
 
     for (i = 0; i < 256; i++) {
         s->quant_table_count = 2;
         if (s->bits_per_raw_sample <= 8) {
-            s->quant_tables[0][0][i] = ffv1_quant11[i];
-            s->quant_tables[0][1][i] = ffv1_quant11[i] * 11;
-            s->quant_tables[0][2][i] = ffv1_quant11[i] * 11 * 11;
-            s->quant_tables[1][0][i] = ffv1_quant11[i];
-            s->quant_tables[1][1][i] = ffv1_quant11[i] * 11;
-            s->quant_tables[1][2][i] = ffv1_quant5[i]  * 11 * 11;
-            s->quant_tables[1][3][i] = ffv1_quant5[i]  *  5 * 11 * 11;
-            s->quant_tables[1][4][i] = ffv1_quant5[i]  *  5 *  5 * 11 * 11;
+            s->quant_tables[0][0][i]=           quant11[i];
+            s->quant_tables[0][1][i]=        11*quant11[i];
+            s->quant_tables[0][2][i]=     11*11*quant11[i];
+            s->quant_tables[1][0][i]=           quant11[i];
+            s->quant_tables[1][1][i]=        11*quant11[i];
+            s->quant_tables[1][2][i]=     11*11*quant5 [i];
+            s->quant_tables[1][3][i]=   5*11*11*quant5 [i];
+            s->quant_tables[1][4][i]= 5*5*11*11*quant5 [i];
         } else {
-            s->quant_tables[0][0][i] = ffv1_quant9_10bit[i];
-            s->quant_tables[0][1][i] = ffv1_quant9_10bit[i] * 11;
-            s->quant_tables[0][2][i] = ffv1_quant9_10bit[i] * 11 * 11;
-            s->quant_tables[1][0][i] = ffv1_quant9_10bit[i];
-            s->quant_tables[1][1][i] = ffv1_quant9_10bit[i] * 11;
-            s->quant_tables[1][2][i] = ffv1_quant5_10bit[i] * 11 * 11;
-            s->quant_tables[1][3][i] = ffv1_quant5_10bit[i] *  5 * 11 * 11;
-            s->quant_tables[1][4][i] = ffv1_quant5_10bit[i] *  5 *  5 * 11 * 11;
+            s->quant_tables[0][0][i]=           quant9_10bit[i];
+            s->quant_tables[0][1][i]=        11*quant9_10bit[i];
+            s->quant_tables[0][2][i]=     11*11*quant9_10bit[i];
+            s->quant_tables[1][0][i]=           quant9_10bit[i];
+            s->quant_tables[1][1][i]=        11*quant9_10bit[i];
+            s->quant_tables[1][2][i]=     11*11*quant5_10bit[i];
+            s->quant_tables[1][3][i]=   5*11*11*quant5_10bit[i];
+            s->quant_tables[1][4][i]= 5*5*11*11*quant5_10bit[i];
         }
     }
     s->context_count[0] = (11 * 11 * 11        + 1) / 2;
@@ -718,7 +856,7 @@ static av_cold int ffv1_encode_init(AVCodecContext *avctx)
         p->context_count     = s->context_count[p->quant_table_index];
     }
 
-    if ((ret = ffv1_allocate_initial_states(s)) < 0)
+    if ((ret = ff_ffv1_allocate_initial_states(s)) < 0)
         return ret;
 
 #if FF_API_CODED_FRAME
@@ -729,10 +867,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     if (!s->transparency)
         s->plane_count = 2;
+    if (!s->chroma_planes && s->version > 3)
+        s->plane_count--;
 
-    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift,
-                                     &s->chroma_v_shift);
-
+    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
     s->picture_number = 0;
 
     if (avctx->flags & (AV_CODEC_FLAG_PASS1 | AV_CODEC_FLAG_PASS2)) {
@@ -745,19 +883,22 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
     if (avctx->stats_in) {
         char *p = avctx->stats_in;
-        uint8_t best_state[256][256];
+        uint8_t (*best_state)[256] = av_malloc_array(256, 256);
         int gob_count = 0;
         char *next;
+        if (!best_state)
+            return AVERROR(ENOMEM);
 
         av_assert0(s->version >= 2);
 
-        for (;; ) {
+        for (;;) {
             for (j = 0; j < 256; j++)
                 for (i = 0; i < 2; i++) {
                     s->rc_stat[j][i] = strtol(p, &next, 0);
                     if (next == p) {
                         av_log(avctx, AV_LOG_ERROR,
                                "2Pass file invalid at %d %d [%s]\n", j, i, p);
+                        av_freep(&best_state);
                         return AVERROR_INVALIDDATA;
                     }
                     p = next;
@@ -771,6 +912,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                                 av_log(avctx, AV_LOG_ERROR,
                                        "2Pass file invalid at %d %d %d %d [%s]\n",
                                        i, j, k, m, p);
+                                av_freep(&best_state);
                                 return AVERROR_INVALIDDATA;
                             }
                             p = next;
@@ -779,6 +921,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             gob_count = strtol(p, &next, 0);
             if (next == p || gob_count <= 0) {
                 av_log(avctx, AV_LOG_ERROR, "2Pass file invalid\n");
+                av_freep(&best_state);
                 return AVERROR_INVALIDDATA;
             }
             p = next;
@@ -792,47 +935,64 @@ FF_ENABLE_DEPRECATION_WARNINGS
         find_best_state(best_state, s->state_transition);
 
         for (i = 0; i < s->quant_table_count; i++) {
-            for (j = 0; j < s->context_count[i]; j++)
-                for (k = 0; k < 32; k++) {
+            for (k = 0; k < 32; k++) {
+                double a=0, b=0;
+                int jp = 0;
+                for (j = 0; j < s->context_count[i]; j++) {
                     double p = 128;
-                    if (s->rc_stat2[i][j][k][0] + s->rc_stat2[i][j][k][1]) {
-                        p = 256.0 * s->rc_stat2[i][j][k][1] /
-                            (s->rc_stat2[i][j][k][0] + s->rc_stat2[i][j][k][1]);
+                    if (s->rc_stat2[i][j][k][0] + s->rc_stat2[i][j][k][1] > 200 && j || a+b > 200) {
+                        if (a+b)
+                            p = 256.0 * b / (a + b);
+                        s->initial_states[i][jp][k] =
+                            best_state[av_clip(round(p), 1, 255)][av_clip_uint8((a + b) / gob_count)];
+                        for(jp++; jp<j; jp++)
+                            s->initial_states[i][jp][k] = s->initial_states[i][jp-1][k];
+                        a=b=0;
+                    }
+                    a += s->rc_stat2[i][j][k][0];
+                    b += s->rc_stat2[i][j][k][1];
+                    if (a+b) {
+                        p = 256.0 * b / (a + b);
                     }
                     s->initial_states[i][j][k] =
-                        best_state[av_clip(round(p), 1, 255)][av_clip((s->rc_stat2[i][j][k][0] +
-                                                                       s->rc_stat2[i][j][k][1]) /
-                                                                      gob_count, 0, 255)];
+                        best_state[av_clip(round(p), 1, 255)][av_clip_uint8((a + b) / gob_count)];
                 }
+            }
         }
+        av_freep(&best_state);
     }
 
     if (s->version > 1) {
-        for (s->num_v_slices = 2; s->num_v_slices < 9; s->num_v_slices++)
-            for (s->num_h_slices = s->num_v_slices;
-                 s->num_h_slices < 2 * s->num_v_slices; s->num_h_slices++)
-                if (avctx->slices == s->num_h_slices * s->num_v_slices &&
-                    avctx->slices <= 64 || !avctx->slices)
+        s->num_v_slices = (avctx->width > 352 || avctx->height > 288 || !avctx->slices) ? 2 : 1;
+        for (; s->num_v_slices < 9; s->num_v_slices++) {
+            for (s->num_h_slices = s->num_v_slices; s->num_h_slices < 2*s->num_v_slices; s->num_h_slices++) {
+                if (avctx->slices == s->num_h_slices * s->num_v_slices && avctx->slices <= 64 || !avctx->slices)
                     goto slices_ok;
+            }
+        }
         av_log(avctx, AV_LOG_ERROR,
                "Unsupported number %d of slices requested, please specify a "
                "supported number with -slices (ex:4,6,9,12,16, ...)\n",
                avctx->slices);
         return AVERROR(ENOSYS);
 slices_ok:
-        write_extradata(s);
+        if ((ret = write_extradata(s)) < 0)
+            return ret;
     }
 
-    if ((ret = ffv1_init_slice_contexts(s)) < 0)
+    if ((ret = ff_ffv1_init_slice_contexts(s)) < 0)
         return ret;
-    if ((ret = init_slices_state(s)) < 0)
+    s->slice_count = s->max_slice_count;
+    if ((ret = ff_ffv1_init_slices_state(s)) < 0)
         return ret;
 
 #define STATS_OUT_SIZE 1024 * 1024 * 6
     if (avctx->flags & AV_CODEC_FLAG_PASS1) {
         avctx->stats_out = av_mallocz(STATS_OUT_SIZE);
+        if (!avctx->stats_out)
+            return AVERROR(ENOMEM);
         for (i = 0; i < s->quant_table_count; i++)
-            for (j = 0; j < s->slice_count; j++) {
+            for (j = 0; j < s->max_slice_count; j++) {
                 FFV1Context *sf = s->slice_context[j];
                 av_assert0(!sf->rc_stat2[i]);
                 sf->rc_stat2[i] = av_mallocz(s->context_count[i] *
@@ -852,23 +1012,112 @@ static void encode_slice_header(FFV1Context *f, FFV1Context *fs)
     int j;
     memset(state, 128, sizeof(state));
 
-    put_symbol(c, state, (fs->slice_x + 1) * f->num_h_slices / f->width, 0);
-    put_symbol(c, state, (fs->slice_y + 1) * f->num_v_slices / f->height, 0);
-    put_symbol(c, state, (fs->slice_width + 1) * f->num_h_slices / f->width - 1,
-               0);
-    put_symbol(c, state,
-               (fs->slice_height + 1) * f->num_v_slices / f->height - 1,
-               0);
-    for (j = 0; j < f->plane_count; j++) {
+    put_symbol(c, state, (fs->slice_x     +1)*f->num_h_slices / f->width   , 0);
+    put_symbol(c, state, (fs->slice_y     +1)*f->num_v_slices / f->height  , 0);
+    put_symbol(c, state, (fs->slice_width +1)*f->num_h_slices / f->width -1, 0);
+    put_symbol(c, state, (fs->slice_height+1)*f->num_v_slices / f->height-1, 0);
+    for (j=0; j<f->plane_count; j++) {
         put_symbol(c, state, f->plane[j].quant_table_index, 0);
         av_assert0(f->plane[j].quant_table_index == f->avctx->context_model);
     }
-    if (!f->frame->interlaced_frame)
+    if (!f->picture.f->interlaced_frame)
         put_symbol(c, state, 3, 0);
     else
-        put_symbol(c, state, 1 + !f->frame->top_field_first, 0);
-    put_symbol(c, state, f->frame->sample_aspect_ratio.num, 0);
-    put_symbol(c, state, f->frame->sample_aspect_ratio.den, 0);
+        put_symbol(c, state, 1 + !f->picture.f->top_field_first, 0);
+    put_symbol(c, state, f->picture.f->sample_aspect_ratio.num, 0);
+    put_symbol(c, state, f->picture.f->sample_aspect_ratio.den, 0);
+    if (f->version > 3) {
+        put_rac(c, state, fs->slice_coding_mode == 1);
+        if (fs->slice_coding_mode == 1)
+            ff_ffv1_clear_slice_state(f, fs);
+        put_symbol(c, state, fs->slice_coding_mode, 0);
+        if (fs->slice_coding_mode != 1) {
+            put_symbol(c, state, fs->slice_rct_by_coef, 0);
+            put_symbol(c, state, fs->slice_rct_ry_coef, 0);
+        }
+    }
+}
+
+static void choose_rct_params(FFV1Context *fs, const uint8_t *src[3], const int stride[3], int w, int h)
+{
+#define NB_Y_COEFF 15
+    static const int rct_y_coeff[15][2] = {
+        {0, 0}, //      4G
+        {1, 1}, //  R + 2G + B
+        {2, 2}, // 2R      + 2B
+        {0, 2}, //      2G + 2B
+        {2, 0}, // 2R + 2G
+        {4, 0}, // 4R
+        {0, 4}, //           4B
+
+        {0, 3}, //      1G + 3B
+        {3, 0}, // 3R + 1G
+        {3, 1}, // 3R      +  B
+        {1, 3}, //  R      + 3B
+        {1, 2}, //  R +  G + 2B
+        {2, 1}, // 2R +  G +  B
+        {0, 1}, //      3G +  B
+        {1, 0}, //  R + 3G
+    };
+
+    int stat[NB_Y_COEFF] = {0};
+    int x, y, i, p, best;
+    int16_t *sample[3];
+    int lbd = fs->bits_per_raw_sample <= 8;
+
+    for (y = 0; y < h; y++) {
+        int lastr=0, lastg=0, lastb=0;
+        for (p = 0; p < 3; p++)
+            sample[p] = fs->sample_buffer + p*w;
+
+        for (x = 0; x < w; x++) {
+            int b, g, r;
+            int ab, ag, ar;
+            if (lbd) {
+                unsigned v = *((const uint32_t*)(src[0] + x*4 + stride[0]*y));
+                b =  v        & 0xFF;
+                g = (v >>  8) & 0xFF;
+                r = (v >> 16) & 0xFF;
+            } else {
+                b = *((const uint16_t*)(src[0] + x*2 + stride[0]*y));
+                g = *((const uint16_t*)(src[1] + x*2 + stride[1]*y));
+                r = *((const uint16_t*)(src[2] + x*2 + stride[2]*y));
+            }
+
+            ar = r - lastr;
+            ag = g - lastg;
+            ab = b - lastb;
+            if (x && y) {
+                int bg = ag - sample[0][x];
+                int bb = ab - sample[1][x];
+                int br = ar - sample[2][x];
+
+                br -= bg;
+                bb -= bg;
+
+                for (i = 0; i<NB_Y_COEFF; i++) {
+                    stat[i] += FFABS(bg + ((br*rct_y_coeff[i][0] + bb*rct_y_coeff[i][1])>>2));
+                }
+
+            }
+            sample[0][x] = ag;
+            sample[1][x] = ab;
+            sample[2][x] = ar;
+
+            lastr = r;
+            lastg = g;
+            lastb = b;
+        }
+    }
+
+    best = 0;
+    for (i=1; i<NB_Y_COEFF; i++) {
+        if (stat[i] < stat[best])
+            best = i;
+    }
+
+    fs->slice_rct_by_coef = rct_y_coeff[best][1];
+    fs->slice_rct_ry_coef = rct_y_coeff[best][0];
 }
 
 static int encode_slice(AVCodecContext *c, void *arg)
@@ -879,75 +1128,144 @@ static int encode_slice(AVCodecContext *c, void *arg)
     int height       = fs->slice_height;
     int x            = fs->slice_x;
     int y            = fs->slice_y;
-    const AVFrame *const p = f->frame;
-    const int ps     = (av_pix_fmt_desc_get(c->pix_fmt)->flags & AV_PIX_FMT_FLAG_PLANAR)
-                       ? (f->bits_per_raw_sample > 8) + 1
-                       : 4;
+    const AVFrame *const p = f->picture.f;
+    const int ps     = av_pix_fmt_desc_get(c->pix_fmt)->comp[0].step;
+    int ret;
+    RangeCoder c_bak = fs->c;
+    const uint8_t *planes[3] = {p->data[0] + ps*x + y*p->linesize[0],
+                                p->data[1] + ps*x + y*p->linesize[1],
+                                p->data[2] + ps*x + y*p->linesize[2]};
+
+    fs->slice_coding_mode = 0;
+    if (f->version > 3) {
+        choose_rct_params(fs, planes, p->linesize, width, height);
+    } else {
+        fs->slice_rct_by_coef = 1;
+        fs->slice_rct_ry_coef = 1;
+    }
 
+retry:
     if (f->key_frame)
-        ffv1_clear_slice_state(f, fs);
+        ff_ffv1_clear_slice_state(f, fs);
     if (f->version > 2) {
         encode_slice_header(f, fs);
     }
     if (!fs->ac) {
         if (f->version > 2)
             put_rac(&fs->c, (uint8_t[]) { 129 }, 0);
-        fs->ac_byte_count = f->version > 2 || (!x && !y) ? ff_rac_terminate( &fs->c) : 0;
-        init_put_bits(&fs->pb, fs->c.bytestream_start + fs->ac_byte_count,
+        fs->ac_byte_count = f->version > 2 || (!x && !y) ? ff_rac_terminate(&fs->c) : 0;
+        init_put_bits(&fs->pb,
+                      fs->c.bytestream_start + fs->ac_byte_count,
                       fs->c.bytestream_end - fs->c.bytestream_start - fs->ac_byte_count);
     }
 
     if (f->colorspace == 0) {
-        const int chroma_width  = -((-width) >> f->chroma_h_shift);
-        const int chroma_height = -((-height) >> f->chroma_v_shift);
+        const int chroma_width  = FF_CEIL_RSHIFT(width,  f->chroma_h_shift);
+        const int chroma_height = FF_CEIL_RSHIFT(height, f->chroma_v_shift);
         const int cx            = x >> f->chroma_h_shift;
         const int cy            = y >> f->chroma_v_shift;
 
-        encode_plane(fs, p->data[0] + ps * x + y * p->linesize[0],
-                     width, height, p->linesize[0], 0);
+        ret = encode_plane(fs, p->data[0] + ps*x + y*p->linesize[0], width, height, p->linesize[0], 0);
 
         if (f->chroma_planes) {
-            encode_plane(fs, p->data[1] + ps * cx + cy * p->linesize[1],
-                         chroma_width, chroma_height, p->linesize[1], 1);
-            encode_plane(fs, p->data[2] + ps * cx + cy * p->linesize[2],
-                         chroma_width, chroma_height, p->linesize[2], 1);
+            ret |= encode_plane(fs, p->data[1] + ps*cx+cy*p->linesize[1], chroma_width, chroma_height, p->linesize[1], 1);
+            ret |= encode_plane(fs, p->data[2] + ps*cx+cy*p->linesize[2], chroma_width, chroma_height, p->linesize[2], 1);
         }
         if (fs->transparency)
-            encode_plane(fs, p->data[3] + ps * x + y * p->linesize[3], width,
-                         height, p->linesize[3], 2);
+            ret |= encode_plane(fs, p->data[3] + ps*x + y*p->linesize[3], width, height, p->linesize[3], 2);
     } else {
-        const uint8_t *planes[3] = { p->data[0] + ps * x + y * p->linesize[0],
-                                     p->data[1] + ps * x + y * p->linesize[1],
-                                     p->data[2] + ps * x + y * p->linesize[2] };
-        encode_rgb_frame(fs, planes, width, height, p->linesize);
+        ret = encode_rgb_frame(fs, planes, width, height, p->linesize);
     }
     emms_c();
 
+    if (ret < 0) {
+        av_assert0(fs->slice_coding_mode == 0);
+        if (fs->version < 4 || !fs->ac) {
+            av_log(c, AV_LOG_ERROR, "Buffer too small\n");
+            return ret;
+        }
+        av_log(c, AV_LOG_DEBUG, "Coding slice as PCM\n");
+        fs->slice_coding_mode = 1;
+        fs->c = c_bak;
+        goto retry;
+    }
+
     return 0;
 }
 
-static int ffv1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *pict, int *got_packet)
 {
     FFV1Context *f      = avctx->priv_data;
     RangeCoder *const c = &f->slice_context[0]->c;
+    AVFrame *const p    = f->picture.f;
     int used_count      = 0;
     uint8_t keystate    = 128;
     uint8_t *buf_p;
     int i, ret;
+    int64_t maxsize =   AV_INPUT_BUFFER_MIN_SIZE
+                      + avctx->width*avctx->height*35LL*4;
+
+    if(!pict) {
+        if (avctx->flags & AV_CODEC_FLAG_PASS1) {
+            int j, k, m;
+            char *p   = avctx->stats_out;
+            char *end = p + STATS_OUT_SIZE;
+
+            memset(f->rc_stat, 0, sizeof(f->rc_stat));
+            for (i = 0; i < f->quant_table_count; i++)
+                memset(f->rc_stat2[i], 0, f->context_count[i] * sizeof(*f->rc_stat2[i]));
+
+            av_assert0(f->slice_count == f->max_slice_count);
+            for (j = 0; j < f->slice_count; j++) {
+                FFV1Context *fs = f->slice_context[j];
+                for (i = 0; i < 256; i++) {
+                    f->rc_stat[i][0] += fs->rc_stat[i][0];
+                    f->rc_stat[i][1] += fs->rc_stat[i][1];
+                }
+                for (i = 0; i < f->quant_table_count; i++) {
+                    for (k = 0; k < f->context_count[i]; k++)
+                        for (m = 0; m < 32; m++) {
+                            f->rc_stat2[i][k][m][0] += fs->rc_stat2[i][k][m][0];
+                            f->rc_stat2[i][k][m][1] += fs->rc_stat2[i][k][m][1];
+                        }
+                }
+            }
 
-    f->frame = pict;
+            for (j = 0; j < 256; j++) {
+                snprintf(p, end - p, "%" PRIu64 " %" PRIu64 " ",
+                        f->rc_stat[j][0], f->rc_stat[j][1]);
+                p += strlen(p);
+            }
+            snprintf(p, end - p, "\n");
 
-    if ((ret = ff_alloc_packet(pkt, avctx->width * avctx->height *
-                             ((8 * 2 + 1 + 1) * 4) / 8 +
-                             AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
-        return ret;
+            for (i = 0; i < f->quant_table_count; i++) {
+                for (j = 0; j < f->context_count[i]; j++)
+                    for (m = 0; m < 32; m++) {
+                        snprintf(p, end - p, "%" PRIu64 " %" PRIu64 " ",
+                                f->rc_stat2[i][j][m][0], f->rc_stat2[i][j][m][1]);
+                        p += strlen(p);
+                    }
+            }
+            snprintf(p, end - p, "%d\n", f->gob_count);
+        }
+        return 0;
     }
 
+    if (f->version > 3)
+        maxsize = AV_INPUT_BUFFER_MIN_SIZE + avctx->width*avctx->height*3LL*4;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, maxsize, 0)) < 0)
+        return ret;
+
     ff_init_range_encoder(c, pkt->data, pkt->size);
     ff_build_rac_states(c, 0.05 * (1LL << 32), 256 - 8);
 
+    av_frame_unref(p);
+    if ((ret = av_frame_ref(p, pict)) < 0)
+        return ret;
+    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+
     if (avctx->gop_size == 0 || f->picture_number % avctx->gop_size == 0) {
         put_rac(c, &keystate, 1);
         f->key_frame = 1;
@@ -968,9 +1286,8 @@ static int ffv1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     for (i = 1; i < f->slice_count; i++) {
         FFV1Context *fs = f->slice_context[i];
-        uint8_t *start  = pkt->data +
-                          (pkt->size - used_count) * (int64_t)i / f->slice_count;
-        int len = pkt->size / f->slice_count;
+        uint8_t *start  = pkt->data + (pkt->size - used_count) * (int64_t)i / f->slice_count;
+        int len         = pkt->size / f->slice_count;
         ff_init_range_encoder(&fs->c, start, len);
     }
     avctx->execute(avctx, encode_slice, &f->slice_context[0], NULL,
@@ -1006,47 +1323,7 @@ static int ffv1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         buf_p += bytes;
     }
 
-    if ((avctx->flags & AV_CODEC_FLAG_PASS1) && (f->picture_number & 31) == 0) {
-        int j, k, m;
-        char *p   = avctx->stats_out;
-        char *end = p + STATS_OUT_SIZE;
-
-        memset(f->rc_stat, 0, sizeof(f->rc_stat));
-        for (i = 0; i < f->quant_table_count; i++)
-            memset(f->rc_stat2[i], 0, f->context_count[i] * sizeof(*f->rc_stat2[i]));
-
-        for (j = 0; j < f->slice_count; j++) {
-            FFV1Context *fs = f->slice_context[j];
-            for (i = 0; i < 256; i++) {
-                f->rc_stat[i][0] += fs->rc_stat[i][0];
-                f->rc_stat[i][1] += fs->rc_stat[i][1];
-            }
-            for (i = 0; i < f->quant_table_count; i++) {
-                for (k = 0; k < f->context_count[i]; k++)
-                    for (m = 0; m < 32; m++) {
-                        f->rc_stat2[i][k][m][0] += fs->rc_stat2[i][k][m][0];
-                        f->rc_stat2[i][k][m][1] += fs->rc_stat2[i][k][m][1];
-                    }
-            }
-        }
-
-        for (j = 0; j < 256; j++) {
-            snprintf(p, end - p, "%" PRIu64 " %" PRIu64 " ",
-                     f->rc_stat[j][0], f->rc_stat[j][1]);
-            p += strlen(p);
-        }
-        snprintf(p, end - p, "\n");
-
-        for (i = 0; i < f->quant_table_count; i++) {
-            for (j = 0; j < f->context_count[i]; j++)
-                for (m = 0; m < 32; m++) {
-                    snprintf(p, end - p, "%" PRIu64 " %" PRIu64 " ",
-                             f->rc_stat2[i][j][m][0], f->rc_stat2[i][j][m][1]);
-                    p += strlen(p);
-                }
-        }
-        snprintf(p, end - p, "%d\n", f->gob_count);
-    } else if (avctx->flags & AV_CODEC_FLAG_PASS1)
+    if (avctx->flags & AV_CODEC_FLAG_PASS1)
         avctx->stats_out[0] = '\0';
 
 #if FF_API_CODED_FRAME
@@ -1057,27 +1334,28 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     f->picture_number++;
     pkt->size   = buf_p - pkt->data;
+    pkt->pts    =
+    pkt->dts    = pict->pts;
     pkt->flags |= AV_PKT_FLAG_KEY * f->key_frame;
     *got_packet = 1;
 
     return 0;
 }
 
-static av_cold int ffv1_encode_close(AVCodecContext *avctx)
+static av_cold int encode_close(AVCodecContext *avctx)
 {
-    ffv1_close(avctx);
+    ff_ffv1_close(avctx);
     return 0;
 }
 
 #define OFFSET(x) offsetof(FFV1Context, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "slicecrc", "Protect slices with CRCs", OFFSET(ec), AV_OPT_TYPE_INT,
-             { .i64 = -1 }, -1, 1, VE },
+    { "slicecrc", "Protect slices with CRCs", OFFSET(ec), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1, VE },
     { NULL }
 };
 
-static const AVClass class = {
+static const AVClass ffv1_class = {
     .class_name = "ffv1 encoder",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -1095,23 +1373,24 @@ AVCodec ff_ffv1_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_FFV1,
     .priv_data_size = sizeof(FFV1Context),
-    .init           = ffv1_encode_init,
-    .encode2        = ffv1_encode_frame,
-    .close          = ffv1_encode_close,
-    .capabilities   = AV_CODEC_CAP_SLICE_THREADS,
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .close          = encode_close,
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_DELAY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
-        AV_PIX_FMT_YUV420P,   AV_PIX_FMT_YUV422P,   AV_PIX_FMT_YUV444P,
-        AV_PIX_FMT_YUV411P,   AV_PIX_FMT_YUV410P,
-        AV_PIX_FMT_YUV444P9,  AV_PIX_FMT_YUV422P9,  AV_PIX_FMT_YUV420P9,
-        AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
-        AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16,
-        AV_PIX_FMT_RGB32,
-        AV_PIX_FMT_GBRP9,     AV_PIX_FMT_GBRP10,
-        AV_PIX_FMT_YUVA420P,  AV_PIX_FMT_YUVA422P,  AV_PIX_FMT_YUVA444P,
-        AV_PIX_FMT_GRAY16,    AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_YUV420P,   AV_PIX_FMT_YUVA420P,  AV_PIX_FMT_YUVA422P,  AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_YUVA444P,  AV_PIX_FMT_YUV440P,   AV_PIX_FMT_YUV422P,   AV_PIX_FMT_YUV411P,
+        AV_PIX_FMT_YUV410P,   AV_PIX_FMT_0RGB32,    AV_PIX_FMT_RGB32,     AV_PIX_FMT_YUV420P16,
+        AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16, AV_PIX_FMT_YUV444P9,  AV_PIX_FMT_YUV422P9,
+        AV_PIX_FMT_YUV420P9,  AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
+        AV_PIX_FMT_YUVA444P16, AV_PIX_FMT_YUVA422P16, AV_PIX_FMT_YUVA420P16,
+        AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA420P10,
+        AV_PIX_FMT_YUVA444P9, AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA420P9,
+        AV_PIX_FMT_GRAY16,    AV_PIX_FMT_GRAY8,     AV_PIX_FMT_GBRP9,     AV_PIX_FMT_GBRP10,
+        AV_PIX_FMT_GBRP12,    AV_PIX_FMT_GBRP14,
         AV_PIX_FMT_NONE
 
     },
     .defaults       = ffv1_defaults,
-    .priv_class     = &class,
+    .priv_class     = &ffv1_class,
 };
diff --git a/libavcodec/ffwavesynth.c b/libavcodec/ffwavesynth.c
new file mode 100644
index 0000000000..9d055e4019
--- /dev/null
+++ b/libavcodec/ffwavesynth.c
@@ -0,0 +1,481 @@
+/*
+ * Wavesynth pseudo-codec
+ * Copyright (c) 2011 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/log.h"
+#include "avcodec.h"
+#include "internal.h"
+
+
+#define SIN_BITS 14
+#define WS_MAX_CHANNELS 32
+#define INF_TS 0x7FFFFFFFFFFFFFFF
+
+#define PINK_UNIT 128
+
+/*
+   Format of the extradata and packets
+
+   THIS INFORMATION IS NOT PART OF THE PUBLIC API OR ABI.
+   IT CAN CHANGE WITHOUT NOTIFICATION.
+
+   All numbers are in little endian.
+
+   The codec extradata define a set of intervals with uniform content.
+   Overlapping intervals are added together.
+
+   extradata:
+       uint32      number of intervals
+       ...         intervals
+
+   interval:
+       int64       start timestamp; time_base must be 1/sample_rate;
+                   start timestamps must be in ascending order
+       int64       end timestamp
+       uint32      type
+       uint32      channels mask
+       ...         additional information, depends on type
+
+   sine interval (type fourcc "SINE"):
+       int32       start frequency, in 1/(1<<16) Hz
+       int32       end frequency
+       int32       start amplitude, 1<<16 is the full amplitude
+       int32       end amplitude
+       uint32      start phase, 0 is sin(0), 0x20000000 is sin(pi/2), etc.;
+                   n | (1<<31) means to match the phase of previous channel #n
+
+   pink noise interval (type fourcc "NOIS"):
+       int32       start amplitude
+       int32       end amplitude
+
+   The input packets encode the time and duration of the requested segment.
+
+   packet:
+       int64       start timestamp
+       int32       duration
+
+*/
+
+enum ws_interval_type {
+    WS_SINE  = MKTAG('S','I','N','E'),
+    WS_NOISE = MKTAG('N','O','I','S'),
+};
+
+struct ws_interval {
+    int64_t ts_start, ts_end;
+    uint64_t phi0, dphi0, ddphi;
+    uint64_t amp0, damp;
+    uint64_t phi, dphi, amp;
+    uint32_t channels;
+    enum ws_interval_type type;
+    int next;
+};
+
+struct wavesynth_context {
+    int64_t cur_ts;
+    int64_t next_ts;
+    int32_t *sin;
+    struct ws_interval *inter;
+    uint32_t dither_state;
+    uint32_t pink_state;
+    int32_t pink_pool[PINK_UNIT];
+    unsigned pink_need, pink_pos;
+    int nb_inter;
+    int cur_inter;
+    int next_inter;
+};
+
+#define LCG_A 1284865837
+#define LCG_C 4150755663
+#define LCG_AI 849225893 /* A*AI = 1 [mod 1<<32] */
+
+static uint32_t lcg_next(uint32_t *s)
+{
+    *s = *s * LCG_A + LCG_C;
+    return *s;
+}
+
+static void lcg_seek(uint32_t *s, int64_t dt)
+{
+    uint32_t a, c, t = *s;
+
+    if (dt >= 0) {
+        a = LCG_A;
+        c = LCG_C;
+    } else { /* coefficients for a step backward */
+        a = LCG_AI;
+        c = (uint32_t)(LCG_AI * LCG_C);
+        dt = -dt;
+    }
+    while (dt) {
+        if (dt & 1)
+            t = a * t + c;
+        c *= a + 1; /* coefficients for a double step */
+        a *= a;
+        dt >>= 1;
+    }
+    *s = t;
+}
+
+/* Emulate pink noise by summing white noise at the sampling frequency,
+ * white noise at half the sampling frequency (each value taken twice),
+ * etc., with a total of 8 octaves.
+ * This is known as the Voss-McCartney algorithm. */
+
+static void pink_fill(struct wavesynth_context *ws)
+{
+    int32_t vt[7] = { 0 }, v = 0;
+    int i, j;
+
+    ws->pink_pos = 0;
+    if (!ws->pink_need)
+        return;
+    for (i = 0; i < PINK_UNIT; i++) {
+        for (j = 0; j < 7; j++) {
+            if ((i >> j) & 1)
+                break;
+            v -= vt[j];
+            vt[j] = (int32_t)lcg_next(&ws->pink_state) >> 3;
+            v += vt[j];
+        }
+        ws->pink_pool[i] = v + ((int32_t)lcg_next(&ws->pink_state) >> 3);
+    }
+    lcg_next(&ws->pink_state); /* so we use exactly 256 steps */
+}
+
+/**
+ * @return  (1<<64) * a / b, without overflow, if a < b
+ */
+static uint64_t frac64(uint64_t a, uint64_t b)
+{
+    uint64_t r = 0;
+    int i;
+
+    if (b < (uint64_t)1 << 32) { /* b small, use two 32-bits steps */
+        a <<= 32;
+        return ((a / b) << 32) | ((a % b) << 32) / b;
+    }
+    if (b < (uint64_t)1 << 48) { /* b medium, use four 16-bits steps */
+        for (i = 0; i < 4; i++) {
+            a <<= 16;
+            r = (r << 16) | (a / b);
+            a %= b;
+        }
+        return r;
+    }
+    for (i = 63; i >= 0; i--) {
+        if (a >= (uint64_t)1 << 63 || a << 1 >= b) {
+            r |= (uint64_t)1 << i;
+            a = (a << 1) - b;
+        } else {
+            a <<= 1;
+        }
+    }
+    return r;
+}
+
+static uint64_t phi_at(struct ws_interval *in, int64_t ts)
+{
+    uint64_t dt = ts - in->ts_start;
+    uint64_t dt2 = dt & 1 ? /* dt * (dt - 1) / 2 without overflow */
+                   dt * ((dt - 1) >> 1) : (dt >> 1) * (dt - 1);
+    return in->phi0 + dt * in->dphi0 + dt2 * in->ddphi;
+}
+
+static void wavesynth_seek(struct wavesynth_context *ws, int64_t ts)
+{
+    int *last, i;
+    struct ws_interval *in;
+
+    last = &ws->cur_inter;
+    for (i = 0; i < ws->nb_inter; i++) {
+        in = &ws->inter[i];
+        if (ts < in->ts_start)
+            break;
+        if (ts >= in->ts_end)
+            continue;
+        *last = i;
+        last = &in->next;
+        in->phi  = phi_at(in, ts);
+        in->dphi = in->dphi0 + (ts - in->ts_start) * in->ddphi;
+        in->amp  = in->amp0  + (ts - in->ts_start) * in->damp;
+    }
+    ws->next_inter = i;
+    ws->next_ts = i < ws->nb_inter ? ws->inter[i].ts_start : INF_TS;
+    *last = -1;
+    lcg_seek(&ws->dither_state, ts - ws->cur_ts);
+    if (ws->pink_need) {
+        int64_t pink_ts_cur  = (ws->cur_ts + PINK_UNIT - 1) & ~(PINK_UNIT - 1);
+        int64_t pink_ts_next = ts & ~(PINK_UNIT - 1);
+        int pos = ts & (PINK_UNIT - 1);
+        lcg_seek(&ws->pink_state, (pink_ts_next - pink_ts_cur) << 1);
+        if (pos) {
+            pink_fill(ws);
+            ws->pink_pos = pos;
+        } else {
+            ws->pink_pos = PINK_UNIT;
+        }
+    }
+    ws->cur_ts = ts;
+}
+
+static int wavesynth_parse_extradata(AVCodecContext *avc)
+{
+    struct wavesynth_context *ws = avc->priv_data;
+    struct ws_interval *in;
+    uint8_t *edata, *edata_end;
+    int32_t f1, f2, a1, a2;
+    uint32_t phi;
+    int64_t dphi1, dphi2, dt, cur_ts = -0x8000000000000000;
+    int i;
+
+    if (avc->extradata_size < 4)
+        return AVERROR(EINVAL);
+    edata = avc->extradata;
+    edata_end = edata + avc->extradata_size;
+    ws->nb_inter = AV_RL32(edata);
+    edata += 4;
+    if (ws->nb_inter < 0)
+        return AVERROR(EINVAL);
+    ws->inter = av_calloc(ws->nb_inter, sizeof(*ws->inter));
+    if (!ws->inter)
+        return AVERROR(ENOMEM);
+    for (i = 0; i < ws->nb_inter; i++) {
+        in = &ws->inter[i];
+        if (edata_end - edata < 24)
+            return AVERROR(EINVAL);
+        in->ts_start = AV_RL64(edata +  0);
+        in->ts_end   = AV_RL64(edata +  8);
+        in->type     = AV_RL32(edata + 16);
+        in->channels = AV_RL32(edata + 20);
+        edata += 24;
+        if (in->ts_start < cur_ts || in->ts_end <= in->ts_start)
+            return AVERROR(EINVAL);
+        cur_ts = in->ts_start;
+        dt = in->ts_end - in->ts_start;
+        switch (in->type) {
+            case WS_SINE:
+                if (edata_end - edata < 20)
+                    return AVERROR(EINVAL);
+                f1  = AV_RL32(edata +  0);
+                f2  = AV_RL32(edata +  4);
+                a1  = AV_RL32(edata +  8);
+                a2  = AV_RL32(edata + 12);
+                phi = AV_RL32(edata + 16);
+                edata += 20;
+                dphi1 = frac64(f1, (int64_t)avc->sample_rate << 16);
+                dphi2 = frac64(f2, (int64_t)avc->sample_rate << 16);
+                in->dphi0 = dphi1;
+                in->ddphi = (dphi2 - dphi1) / dt;
+                if (phi & 0x80000000) {
+                    phi &= ~0x80000000;
+                    if (phi >= i)
+                        return AVERROR(EINVAL);
+                    in->phi0 = phi_at(&ws->inter[phi], in->ts_start);
+                } else {
+                    in->phi0 = (uint64_t)phi << 33;
+                }
+                break;
+            case WS_NOISE:
+                if (edata_end - edata < 8)
+                    return AVERROR(EINVAL);
+                a1  = AV_RL32(edata +  0);
+                a2  = AV_RL32(edata +  4);
+                edata += 8;
+                break;
+            default:
+                return AVERROR(EINVAL);
+        }
+        in->amp0 = (int64_t)a1 << 32;
+        in->damp = (((int64_t)a2 << 32) - ((int64_t)a1 << 32)) / dt;
+    }
+    if (edata != edata_end)
+        return AVERROR(EINVAL);
+    return 0;
+}
+
+static av_cold int wavesynth_init(AVCodecContext *avc)
+{
+    struct wavesynth_context *ws = avc->priv_data;
+    int i, r;
+
+    if (avc->channels > WS_MAX_CHANNELS) {
+        av_log(avc, AV_LOG_ERROR,
+               "This implementation is limited to %d channels.\n",
+               WS_MAX_CHANNELS);
+        return AVERROR(EINVAL);
+    }
+    r = wavesynth_parse_extradata(avc);
+    if (r < 0) {
+        av_log(avc, AV_LOG_ERROR, "Invalid intervals definitions.\n");
+        goto fail;
+    }
+    ws->sin = av_malloc(sizeof(*ws->sin) << SIN_BITS);
+    if (!ws->sin) {
+        r = AVERROR(ENOMEM);
+        goto fail;
+    }
+    for (i = 0; i < 1 << SIN_BITS; i++)
+        ws->sin[i] = floor(32767 * sin(2 * M_PI * i / (1 << SIN_BITS)));
+    ws->dither_state = MKTAG('D','I','T','H');
+    for (i = 0; i < ws->nb_inter; i++)
+        ws->pink_need += ws->inter[i].type == WS_NOISE;
+    ws->pink_state = MKTAG('P','I','N','K');
+    ws->pink_pos = PINK_UNIT;
+    wavesynth_seek(ws, 0);
+    avc->sample_fmt = AV_SAMPLE_FMT_S16;
+    return 0;
+
+fail:
+    av_freep(&ws->inter);
+    av_freep(&ws->sin);
+    return r;
+}
+
+static void wavesynth_synth_sample(struct wavesynth_context *ws, int64_t ts,
+                                   int32_t *channels)
+{
+    int32_t amp, val, *cv;
+    struct ws_interval *in;
+    int i, *last, pink;
+    uint32_t c, all_ch = 0;
+
+    i = ws->cur_inter;
+    last = &ws->cur_inter;
+    if (ws->pink_pos == PINK_UNIT)
+        pink_fill(ws);
+    pink = ws->pink_pool[ws->pink_pos++] >> 16;
+    while (i >= 0) {
+        in = &ws->inter[i];
+        i = in->next;
+        if (ts >= in->ts_end) {
+            *last = i;
+            continue;
+        }
+        last = &in->next;
+        amp = in->amp >> 32;
+        in->amp  += in->damp;
+        switch (in->type) {
+            case WS_SINE:
+                val = amp * ws->sin[in->phi >> (64 - SIN_BITS)];
+                in->phi  += in->dphi;
+                in->dphi += in->ddphi;
+                break;
+            case WS_NOISE:
+                val = amp * pink;
+                break;
+            default:
+                val = 0;
+        }
+        all_ch |= in->channels;
+        for (c = in->channels, cv = channels; c; c >>= 1, cv++)
+            if (c & 1)
+                *cv += val;
+    }
+    val = (int32_t)lcg_next(&ws->dither_state) >> 16;
+    for (c = all_ch, cv = channels; c; c >>= 1, cv++)
+        if (c & 1)
+            *cv += val;
+}
+
+static void wavesynth_enter_intervals(struct wavesynth_context *ws, int64_t ts)
+{
+    int *last, i;
+    struct ws_interval *in;
+
+    last = &ws->cur_inter;
+    for (i = ws->cur_inter; i >= 0; i = ws->inter[i].next)
+        last = &ws->inter[i].next;
+    for (i = ws->next_inter; i < ws->nb_inter; i++) {
+        in = &ws->inter[i];
+        if (ts < in->ts_start)
+            break;
+        if (ts >= in->ts_end)
+            continue;
+        *last = i;
+        last = &in->next;
+        in->phi = in->phi0;
+        in->dphi = in->dphi0;
+        in->amp = in->amp0;
+    }
+    ws->next_inter = i;
+    ws->next_ts = i < ws->nb_inter ? ws->inter[i].ts_start : INF_TS;
+    *last = -1;
+}
+
+static int wavesynth_decode(AVCodecContext *avc, void *rframe, int *rgot_frame,
+                            AVPacket *packet)
+{
+    struct wavesynth_context *ws = avc->priv_data;
+    AVFrame *frame = rframe;
+    int64_t ts;
+    int duration;
+    int s, c, r;
+    int16_t *pcm;
+    int32_t channels[WS_MAX_CHANNELS];
+
+    *rgot_frame = 0;
+    if (packet->size != 12)
+        return AVERROR_INVALIDDATA;
+    ts = AV_RL64(packet->data);
+    if (ts != ws->cur_ts)
+        wavesynth_seek(ws, ts);
+    duration = AV_RL32(packet->data + 8);
+    if (duration <= 0)
+        return AVERROR(EINVAL);
+    frame->nb_samples = duration;
+    r = ff_get_buffer(avc, frame, 0);
+    if (r < 0)
+        return r;
+    pcm = (int16_t *)frame->data[0];
+    for (s = 0; s < duration; s++, ts++) {
+        memset(channels, 0, avc->channels * sizeof(*channels));
+        if (ts >= ws->next_ts)
+            wavesynth_enter_intervals(ws, ts);
+        wavesynth_synth_sample(ws, ts, channels);
+        for (c = 0; c < avc->channels; c++)
+            *(pcm++) = channels[c] >> 16;
+    }
+    ws->cur_ts += duration;
+    *rgot_frame = 1;
+    return packet->size;
+}
+
+static av_cold int wavesynth_close(AVCodecContext *avc)
+{
+    struct wavesynth_context *ws = avc->priv_data;
+
+    av_freep(&ws->sin);
+    av_freep(&ws->inter);
+    return 0;
+}
+
+AVCodec ff_ffwavesynth_decoder = {
+    .name           = "wavesynth",
+    .long_name      = NULL_IF_CONFIG_SMALL("Wave synthesis pseudo-codec"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_FFWAVESYNTH,
+    .priv_data_size = sizeof(struct wavesynth_context),
+    .init           = wavesynth_init,
+    .close          = wavesynth_close,
+    .decode         = wavesynth_decode,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/fic.c b/libavcodec/fic.c
index b1286ebe65..b58b017c77 100644
--- a/libavcodec/fic.c
+++ b/libavcodec/fic.c
@@ -4,24 +4,25 @@
  * Copyright (c) 2014 Konstantin Shishkov
  * Copyright (c) 2014 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/common.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "get_bits.h"
@@ -36,6 +37,7 @@ typedef struct FICThreadContext {
 } FICThreadContext;
 
 typedef struct FICContext {
+    AVClass *class;
     AVCodecContext *avctx;
     AVFrame *frame;
     AVFrame *final_frame;
@@ -51,6 +53,7 @@ typedef struct FICContext {
     int num_slices, slice_h;
 
     uint8_t cursor_buf[4096];
+    int skip_cursor;
 } FICContext;
 
 static const uint8_t fic_qmat_hq[64] = {
@@ -79,7 +82,7 @@ static const uint8_t fic_header[7] = { 0, 0, 1, 'F', 'I', 'C', 'V' };
 
 #define FIC_HEADER_SIZE 27
 
-static av_always_inline void fic_idct(int16_t *blk, int step, int shift)
+static av_always_inline void fic_idct(int16_t *blk, int step, int shift, int rnd)
 {
     const int t0 =  27246 * blk[3 * step] + 18405 * blk[5 * step];
     const int t1 =  27246 * blk[5 * step] - 18405 * blk[3 * step];
@@ -91,8 +94,8 @@ static av_always_inline void fic_idct(int16_t *blk, int step, int shift)
     const int t7 = t3 - t1;
     const int t8 =  17734 * blk[2 * step] - 42813 * blk[6 * step];
     const int t9 =  17734 * blk[6 * step] + 42814 * blk[2 * step];
-    const int tA = (blk[0 * step] - blk[4 * step] << 15) + (1 << shift - 1);
-    const int tB = (blk[0 * step] + blk[4 * step] << 15) + (1 << shift - 1);
+    const int tA = (blk[0 * step] - blk[4 * step] << 15) + rnd;
+    const int tB = (blk[0 * step] + blk[4 * step] << 15) + rnd;
     blk[0 * step] = (  t4       + t9 + tB) >> shift;
     blk[1 * step] = (  t6 + t7  + t8 + tA) >> shift;
     blk[2 * step] = (  t6 - t7  - t8 + tA) >> shift;
@@ -109,14 +112,15 @@ static void fic_idct_put(uint8_t *dst, int stride, int16_t *block)
     int16_t *ptr;
 
     ptr = block;
-    for (i = 0; i < 8; i++) {
-        fic_idct(ptr, 8, 13);
+    fic_idct(ptr++, 8, 13, (1 << 12) + (1 << 17));
+    for (i = 1; i < 8; i++) {
+        fic_idct(ptr, 8, 13, 1 << 12);
         ptr++;
     }
 
     ptr = block;
     for (i = 0; i < 8; i++) {
-        fic_idct(ptr, 1, 20);
+        fic_idct(ptr, 1, 20, 0);
         ptr += 8;
     }
 
@@ -262,13 +266,11 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
     int msize;
     int tsize;
     int cur_x, cur_y;
-    int skip_cursor = 0;
+    int skip_cursor = ctx->skip_cursor;
     uint8_t *sdata;
 
-    if ((ret = ff_reget_buffer(avctx, ctx->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, ctx->frame)) < 0)
         return ret;
-    }
 
     /* Header + at least one slice (4) */
     if (avpkt->size < FIC_HEADER_SIZE + 4) {
@@ -281,8 +283,13 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
         av_log(avctx, AV_LOG_WARNING, "Invalid FIC Header.\n");
 
     /* Is it a skip frame? */
-    if (src[17])
+    if (src[17]) {
+        if (!ctx->final_frame) {
+            av_log(avctx, AV_LOG_WARNING, "Initial frame is skipped\n");
+            return AVERROR_INVALIDDATA;
+        }
         goto skip;
+    }
 
     nslices = src[13];
     if (!nslices) {
@@ -302,7 +309,10 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
-    if (tsize < 32) {
+    if (!tsize)
+        skip_cursor = 1;
+
+    if (!skip_cursor && tsize < 32) {
         av_log(avctx, AV_LOG_WARNING,
                "Cursor data too small. Skipping cursor.\n");
         skip_cursor = 1;
@@ -311,14 +321,14 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
     /* Cursor position. */
     cur_x = AV_RL16(src + 33);
     cur_y = AV_RL16(src + 35);
-    if (cur_x > avctx->width || cur_y > avctx->height) {
+    if (!skip_cursor && (cur_x > avctx->width || cur_y > avctx->height)) {
         av_log(avctx, AV_LOG_WARNING,
                "Invalid cursor position: (%d,%d). Skipping cusor.\n",
                cur_x, cur_y);
         skip_cursor = 1;
     }
 
-    if (AV_RL16(src + 37) != 32 || AV_RL16(src + 39) != 32) {
+    if (!skip_cursor && (AV_RL16(src + 37) != 32 || AV_RL16(src + 39) != 32)) {
         av_log(avctx, AV_LOG_WARNING,
                "Invalid cursor size. Skipping cursor.\n");
         skip_cursor = 1;
@@ -445,6 +455,18 @@ static av_cold int fic_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+static const AVOption options[] = {
+{ "skip_cursor", "skip the cursor", offsetof(FICContext, skip_cursor), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM },
+{ NULL },
+};
+
+static const AVClass fic_decoder_class = {
+    .class_name = "FIC encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_fic_decoder = {
     .name           = "fic",
     .long_name      = NULL_IF_CONFIG_SMALL("Mirillis FIC"),
@@ -455,4 +477,5 @@ AVCodec ff_fic_decoder = {
     .decode         = fic_decode_frame,
     .close          = fic_decode_close,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS,
+    .priv_class     = &fic_decoder_class,
 };
diff --git a/libavcodec/flac.c b/libavcodec/flac.c
index 3e51fdeb98..f5154b9149 100644
--- a/libavcodec/flac.c
+++ b/libavcodec/flac.c
@@ -2,20 +2,20 @@
  * FLAC common code
  * Copyright (c) 2009 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -230,24 +230,8 @@ void ff_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
         av_get_channel_layout_nb_channels(avctx->channel_layout) != avctx->channels)
         ff_flac_set_channel_layout(avctx);
 
-    s->samples  = get_bits_long(&gb, 32) << 4;
-    s->samples |= get_bits(&gb, 4);
+    s->samples = get_bits64(&gb, 36);
 
     skip_bits_long(&gb, 64); /* md5 sum */
     skip_bits_long(&gb, 64); /* md5 sum */
 }
-
-#if LIBAVCODEC_VERSION_MAJOR < 57
-void avpriv_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
-                              const uint8_t *buffer)
-{
-    ff_flac_parse_streaminfo(avctx, s, buffer);
-}
-
-int avpriv_flac_is_extradata_valid(AVCodecContext *avctx,
-                               enum FLACExtradataFormat *format,
-                               uint8_t **streaminfo_start)
-{
-    return ff_flac_is_extradata_valid(avctx, format, streaminfo_start);
-}
-#endif
diff --git a/libavcodec/flac.h b/libavcodec/flac.h
index 3229682047..96d971c9d7 100644
--- a/libavcodec/flac.h
+++ b/libavcodec/flac.h
@@ -2,20 +2,20 @@
  * FLAC (Free Lossless Audio Codec) decoder/demuxer common functions
  * Copyright (c) 2008 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -99,14 +99,6 @@ typedef struct FLACFrameInfo {
 void ff_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
                               const uint8_t *buffer);
 
-#if LIBAVCODEC_VERSION_MAJOR < 57
-void avpriv_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
-                                  const uint8_t *buffer);
-int avpriv_flac_is_extradata_valid(AVCodecContext *avctx,
-                                   enum FLACExtradataFormat *format,
-                                   uint8_t **streaminfo_start);
-#endif
-
 /**
  * Validate the FLAC extradata.
  * @param[in]  avctx codec context containing the extradata.
diff --git a/libavcodec/flac_parser.c b/libavcodec/flac_parser.c
index 70b9a651e9..3723716441 100644
--- a/libavcodec/flac_parser.c
+++ b/libavcodec/flac_parser.c
@@ -2,20 +2,20 @@
  * FLAC parser
  * Copyright (c) 2010 Michael Chinen
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -87,6 +87,8 @@ typedef struct FLACParseContext {
     int end_padded;                /**< specifies if fifo_buf's end is padded */
     uint8_t *wrap_buf;             /**< general fifo read buffer when wrapped */
     int wrap_buf_allocated_size;   /**< actual allocated size of the buffer   */
+    FLACFrameInfo last_fi;         /**< last decoded frame header info        */
+    int last_fi_valid;             /**< set if last_fi is valid               */
 } FLACParseContext;
 
 static int frame_header_is_valid(AVCodecContext *avctx, const uint8_t *buf,
@@ -180,7 +182,7 @@ static int find_headers_search_validate(FLACParseContext *fpc, int offset)
             size++;
         }
 
-        *end_handle = av_mallocz(sizeof(FLACHeaderMarker));
+        *end_handle = av_mallocz(sizeof(**end_handle));
         if (!*end_handle) {
             av_log(fpc->avctx, AV_LOG_ERROR,
                    "couldn't allocate FLACHeaderMarker\n");
@@ -190,6 +192,13 @@ static int find_headers_search_validate(FLACParseContext *fpc, int offset)
         (*end_handle)->offset       = offset;
         (*end_handle)->link_penalty = av_malloc(sizeof(int) *
                                             FLAC_MAX_SEQUENTIAL_HEADERS);
+        if (!(*end_handle)->link_penalty) {
+            av_freep(end_handle);
+            av_log(fpc->avctx, AV_LOG_ERROR,
+                   "couldn't allocate link_penalty\n");
+            return AVERROR(ENOMEM);
+        }
+
         for (i = 0; i < FLAC_MAX_SEQUENTIAL_HEADERS; i++)
             (*end_handle)->link_penalty[i] = FLAC_HEADER_NOT_PENALIZED_YET;
 
@@ -267,13 +276,12 @@ static int find_new_headers(FLACParseContext *fpc, int search_start)
     return size;
 }
 
-static int check_header_mismatch(FLACParseContext  *fpc,
-                                 FLACHeaderMarker  *header,
-                                 FLACHeaderMarker  *child,
-                                 int                log_level_offset)
+static int check_header_fi_mismatch(FLACParseContext  *fpc,
+                                    FLACFrameInfo     *header_fi,
+                                    FLACFrameInfo     *child_fi,
+                                    int                log_level_offset)
 {
-    FLACFrameInfo  *header_fi = &header->fi, *child_fi = &child->fi;
-    int deduction = 0, deduction_expected = 0, i;
+    int deduction = 0;
     if (child_fi->samplerate != header_fi->samplerate) {
         deduction += FLAC_HEADER_CHANGED_PENALTY;
         av_log(fpc->avctx, AV_LOG_WARNING + log_level_offset,
@@ -288,13 +296,25 @@ static int check_header_mismatch(FLACParseContext  *fpc,
         /* Changing blocking strategy not allowed per the spec */
         deduction += FLAC_HEADER_BASE_SCORE;
         av_log(fpc->avctx, AV_LOG_WARNING + log_level_offset,
-                   "blocking strategy change detected in adjacent frames\n");
+               "blocking strategy change detected in adjacent frames\n");
     }
     if (child_fi->channels != header_fi->channels) {
         deduction += FLAC_HEADER_CHANGED_PENALTY;
         av_log(fpc->avctx, AV_LOG_WARNING + log_level_offset,
-                   "number of channels change detected in adjacent frames\n");
+               "number of channels change detected in adjacent frames\n");
     }
+    return deduction;
+}
+
+static int check_header_mismatch(FLACParseContext  *fpc,
+                                 FLACHeaderMarker  *header,
+                                 FLACHeaderMarker  *child,
+                                 int                log_level_offset)
+{
+    FLACFrameInfo  *header_fi = &header->fi, *child_fi = &child->fi;
+    int deduction, deduction_expected = 0, i;
+    deduction = check_header_fi_mismatch(fpc, header_fi, child_fi,
+                                         log_level_offset);
     /* Check sample and frame numbers. */
     if ((child_fi->frame_or_sample_num - header_fi->frame_or_sample_num
          != header_fi->blocksize) &&
@@ -399,11 +419,18 @@ static int score_header(FLACParseContext *fpc, FLACHeaderMarker *header)
     FLACHeaderMarker *child;
     int dist = 0;
     int child_score;
-
+    int base_score = FLAC_HEADER_BASE_SCORE;
     if (header->max_score != FLAC_HEADER_NOT_SCORED_YET)
         return header->max_score;
 
-    header->max_score = FLAC_HEADER_BASE_SCORE;
+    /* Modify the base score with changes from the last output header */
+    if (fpc->last_fi_valid) {
+        /* Silence the log since this will be repeated if selected */
+        base_score -= check_header_fi_mismatch(fpc, &fpc->last_fi, &header->fi,
+                                               AV_LOG_DEBUG);
+    }
+
+    header->max_score = base_score;
 
     /* Check and compute the children's scores. */
     child = header->next;
@@ -419,7 +446,7 @@ static int score_header(FLACParseContext *fpc, FLACHeaderMarker *header)
         if (FLAC_HEADER_BASE_SCORE + child_score > header->max_score) {
             /* Keep the child because the frame scoring is dynamic. */
             header->best_child = child;
-            header->max_score  = FLAC_HEADER_BASE_SCORE + child_score;
+            header->max_score  = base_score + child_score;
         }
         child = child->next;
     }
@@ -430,7 +457,7 @@ static int score_header(FLACParseContext *fpc, FLACHeaderMarker *header)
 static void score_sequences(FLACParseContext *fpc)
 {
     FLACHeaderMarker *curr;
-    int best_score = FLAC_HEADER_NOT_SCORED_YET;
+    int best_score = 0;//FLAC_HEADER_NOT_SCORED_YET;
     /* First pass to clear all old scores. */
     for (curr = fpc->headers; curr; curr = curr->next)
         curr->max_score = FLAC_HEADER_NOT_SCORED_YET;
@@ -469,7 +496,18 @@ static int get_best_header(FLACParseContext* fpc, const uint8_t **poutbuf,
                                         &fpc->wrap_buf,
                                         &fpc->wrap_buf_allocated_size);
 
+
+    if (fpc->pc->flags & PARSER_FLAG_USE_CODEC_TS){
+        if (header->fi.is_var_size)
+          fpc->pc->pts = header->fi.frame_or_sample_num;
+        else if (header->best_child)
+          fpc->pc->pts = header->fi.frame_or_sample_num * header->fi.blocksize;
+    }
+
     fpc->best_header_valid = 0;
+    fpc->last_fi_valid = 1;
+    fpc->last_fi = header->fi;
+
     /* Return the negative overread index so the client can compute pos.
        This should be the amount overread to the beginning of the child */
     if (child)
@@ -489,8 +527,16 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 
     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
         FLACFrameInfo fi;
-        if (frame_header_is_valid(avctx, buf, &fi))
+        if (frame_header_is_valid(avctx, buf, &fi)) {
             s->duration = fi.blocksize;
+            if (!avctx->sample_rate)
+                avctx->sample_rate = fi.samplerate;
+            if (fpc->pc->flags & PARSER_FLAG_USE_CODEC_TS){
+                fpc->pc->pts = fi.frame_or_sample_num;
+                if (!fi.is_var_size)
+                  fpc->pc->pts *= fi.blocksize;
+            }
+        }
         *poutbuf      = buf;
         *poutbuf_size = buf_size;
         return buf_size;
@@ -546,14 +592,18 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
         av_freep(&fpc->best_header);
     }
 
-    /* Find and score new headers. */
-    while ((buf && read_end < buf + buf_size &&
+    /* Find and score new headers.                                     */
+    /* buf_size is to zero when padding, so check for this since we do */
+    /* not want to try to read more input once we have found the end.  */
+    /* Note that as (non-modified) parameters, buf can be non-NULL,    */
+    /* while buf_size is 0.                                            */
+    while ((buf && buf_size && read_end < buf + buf_size &&
             fpc->nb_headers_buffered < FLAC_MIN_HEADERS)
-           || (!buf && !fpc->end_padded)) {
+           || ((!buf || !buf_size) && !fpc->end_padded)) {
         int start_offset;
 
         /* Pad the end once if EOF, to check the final region for headers. */
-        if (!buf) {
+        if (!buf || !buf_size) {
             fpc->end_padded      = 1;
             buf_size = MAX_FRAME_HEADER_SIZE;
             read_end = read_start + MAX_FRAME_HEADER_SIZE;
@@ -575,15 +625,15 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
         }
 
         /* Fill the buffer. */
-        if (av_fifo_realloc2(fpc->fifo_buf,
-                             (read_end - read_start) + av_fifo_size(fpc->fifo_buf)) < 0) {
+        if (   av_fifo_space(fpc->fifo_buf) < read_end - read_start
+            && av_fifo_realloc2(fpc->fifo_buf, (read_end - read_start) + 2*av_fifo_size(fpc->fifo_buf)) < 0) {
             av_log(avctx, AV_LOG_ERROR,
-                   "couldn't reallocate buffer of size %td\n",
+                   "couldn't reallocate buffer of size %"PTRDIFF_SPECIFIER"\n",
                    (read_end - read_start) + av_fifo_size(fpc->fifo_buf));
             goto handle_error;
         }
 
-        if (buf) {
+        if (buf && buf_size) {
             av_fifo_generic_write(fpc->fifo_buf, (void*) read_start,
                                   read_end - read_start, NULL);
         } else {
@@ -620,10 +670,11 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 
         /* restore the state pre-padding */
         if (fpc->end_padded) {
+            int warp = fpc->fifo_buf->wptr - fpc->fifo_buf->buffer < MAX_FRAME_HEADER_SIZE;
             /* HACK: drain the tail of the fifo */
             fpc->fifo_buf->wptr -= MAX_FRAME_HEADER_SIZE;
             fpc->fifo_buf->wndx -= MAX_FRAME_HEADER_SIZE;
-            if (fpc->fifo_buf->wptr < 0) {
+            if (warp) {
                 fpc->fifo_buf->wptr += fpc->fifo_buf->end -
                     fpc->fifo_buf->buffer;
             }
@@ -632,10 +683,12 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
         }
     }
 
-    curr = fpc->headers;
-    for (curr = fpc->headers; curr; curr = curr->next)
-        if (!fpc->best_header || curr->max_score > fpc->best_header->max_score)
+    for (curr = fpc->headers; curr; curr = curr->next) {
+        if (curr->max_score > 0 &&
+            (!fpc->best_header || curr->max_score > fpc->best_header->max_score)) {
             fpc->best_header = curr;
+        }
+    }
 
     if (fpc->best_header) {
         fpc->best_header_valid = 1;
@@ -660,7 +713,7 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 handle_error:
     *poutbuf      = NULL;
     *poutbuf_size = 0;
-    return read_end - buf;
+    return buf_size ? read_end - buf : 0;
 }
 
 static av_cold int flac_parse_init(AVCodecParserContext *c)
@@ -669,7 +722,12 @@ static av_cold int flac_parse_init(AVCodecParserContext *c)
     fpc->pc = c;
     /* There will generally be FLAC_MIN_HEADERS buffered in the fifo before
        it drains.  This is allocated early to avoid slow reallocation. */
-    fpc->fifo_buf = av_fifo_alloc(FLAC_AVG_FRAME_SIZE * (FLAC_MIN_HEADERS + 3));
+    fpc->fifo_buf = av_fifo_alloc_array(FLAC_MIN_HEADERS + 3, FLAC_AVG_FRAME_SIZE);
+    if (!fpc->fifo_buf) {
+        av_log(fpc->avctx, AV_LOG_ERROR,
+                "couldn't allocate fifo_buf\n");
+        return AVERROR(ENOMEM);
+    }
     return 0;
 }
 
@@ -684,8 +742,8 @@ static void flac_parse_close(AVCodecParserContext *c)
         av_free(curr);
         curr = temp;
     }
-    av_fifo_free(fpc->fifo_buf);
-    av_free(fpc->wrap_buf);
+    av_fifo_freep(&fpc->fifo_buf);
+    av_freep(&fpc->wrap_buf);
 }
 
 AVCodecParser ff_flac_parser = {
diff --git a/libavcodec/flacdata.c b/libavcodec/flacdata.c
index 820c3aa492..1954f32d32 100644
--- a/libavcodec/flacdata.c
+++ b/libavcodec/flacdata.c
@@ -2,20 +2,20 @@
  * FLAC data
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@ const int ff_flac_sample_rate_table[16] =
   8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000,
   0, 0, 0, 0 };
 
-const int16_t ff_flac_blocksize_table[16] = {
+const int32_t ff_flac_blocksize_table[16] = {
      0,    192, 576<<0, 576<<1, 576<<2, 576<<3,      0,      0,
 256<<0, 256<<1, 256<<2, 256<<3, 256<<4, 256<<5, 256<<6, 256<<7
 };
diff --git a/libavcodec/flacdata.h b/libavcodec/flacdata.h
index f56637773c..e2c1e5d7f2 100644
--- a/libavcodec/flacdata.h
+++ b/libavcodec/flacdata.h
@@ -2,20 +2,20 @@
  * FLAC data header
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,6 @@
 
 extern const int ff_flac_sample_rate_table[16];
 
-extern const int16_t ff_flac_blocksize_table[16];
+extern const int32_t ff_flac_blocksize_table[16];
 
 #endif /* AVCODEC_FLACDATA_H */
diff --git a/libavcodec/flacdec.c b/libavcodec/flacdec.c
index 7af71f3c0c..126f4e00df 100644
--- a/libavcodec/flacdec.c
+++ b/libavcodec/flacdec.c
@@ -2,20 +2,20 @@
  * FLAC (Free Lossless Audio Codec) decoder
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,9 @@
 
 #include <limits.h>
 
+#include "libavutil/avassert.h"
+#include "libavutil/crc.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "get_bits.h"
@@ -41,9 +44,13 @@
 #include "flac.h"
 #include "flacdata.h"
 #include "flacdsp.h"
+#include "thread.h"
+#include "unary.h"
+
 
 typedef struct FLACContext {
-    FLACSTREAMINFO
+    AVClass *class;
+    struct FLACStreaminfo flac_stream_info;
 
     AVCodecContext *avctx;                  ///< parent AVCodecContext
     GetBitContext gb;                       ///< GetBitContext initialized to start at the current frame
@@ -56,6 +63,7 @@ typedef struct FLACContext {
     int32_t *decoded[FLAC_MAX_CHANNELS];    ///< decoded samples
     uint8_t *decoded_buffer;
     unsigned int decoded_buffer_size;
+    int buggy_lpc;                          ///< use workaround for old lavc encoded files
 
     FLACDSPContext dsp;
 } FLACContext;
@@ -65,7 +73,7 @@ static int allocate_buffers(FLACContext *s);
 static void flac_set_bps(FLACContext *s)
 {
     enum AVSampleFormat req = s->avctx->request_sample_fmt;
-    int need32 = s->bps > 16;
+    int need32 = s->flac_stream_info.bps > 16;
     int want32 = av_get_bytes_per_sample(req) > 2;
     int planar = av_sample_fmt_is_planar(req);
 
@@ -74,13 +82,13 @@ static void flac_set_bps(FLACContext *s)
             s->avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
         else
             s->avctx->sample_fmt = AV_SAMPLE_FMT_S32;
-        s->sample_shift = 32 - s->bps;
+        s->sample_shift = 32 - s->flac_stream_info.bps;
     } else {
         if (planar)
             s->avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
         else
             s->avctx->sample_fmt = AV_SAMPLE_FMT_S16;
-        s->sample_shift = 16 - s->bps;
+        s->sample_shift = 16 - s->flac_stream_info.bps;
     }
 }
 
@@ -101,12 +109,13 @@ static av_cold int flac_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
 
     /* initialize based on the demuxer-supplied streamdata header */
-    ff_flac_parse_streaminfo(avctx, (FLACStreaminfo *)s, streaminfo);
+    ff_flac_parse_streaminfo(avctx, &s->flac_stream_info, streaminfo);
     ret = allocate_buffers(s);
     if (ret < 0)
         return ret;
     flac_set_bps(s);
-    ff_flacdsp_init(&s->dsp, avctx->sample_fmt, s->bps);
+    ff_flacdsp_init(&s->dsp, avctx->sample_fmt,
+                    s->flac_stream_info.channels, s->flac_stream_info.bps);
     s->got_streaminfo = 1;
 
     return 0;
@@ -124,8 +133,12 @@ static void dump_headers(AVCodecContext *avctx, FLACStreaminfo *s)
 static int allocate_buffers(FLACContext *s)
 {
     int buf_size;
+    int ret;
+
+    av_assert0(s->flac_stream_info.max_blocksize);
 
-    buf_size = av_samples_get_buffer_size(NULL, s->channels, s->max_blocksize,
+    buf_size = av_samples_get_buffer_size(NULL, s->flac_stream_info.channels,
+                                          s->flac_stream_info.max_blocksize,
                                           AV_SAMPLE_FMT_S32P, 0);
     if (buf_size < 0)
         return buf_size;
@@ -134,9 +147,12 @@ static int allocate_buffers(FLACContext *s)
     if (!s->decoded_buffer)
         return AVERROR(ENOMEM);
 
-    return av_samples_fill_arrays((uint8_t **)s->decoded, NULL,
-                                  s->decoded_buffer, s->channels,
-                                  s->max_blocksize, AV_SAMPLE_FMT_S32P, 0);
+    ret = av_samples_fill_arrays((uint8_t **)s->decoded, NULL,
+                                 s->decoded_buffer,
+                                 s->flac_stream_info.channels,
+                                 s->flac_stream_info.max_blocksize,
+                                 AV_SAMPLE_FMT_S32P, 0);
+    return ret < 0 ? ret : 0;
 }
 
 /**
@@ -159,12 +175,13 @@ static int parse_streaminfo(FLACContext *s, const uint8_t *buf, int buf_size)
         metadata_size != FLAC_STREAMINFO_SIZE) {
         return AVERROR_INVALIDDATA;
     }
-    ff_flac_parse_streaminfo(s->avctx, (FLACStreaminfo *)s, &buf[8]);
+    ff_flac_parse_streaminfo(s->avctx, &s->flac_stream_info, &buf[8]);
     ret = allocate_buffers(s);
     if (ret < 0)
         return ret;
     flac_set_bps(s);
-    ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt, s->bps);
+    ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt,
+                    s->flac_stream_info.channels, s->flac_stream_info.bps);
     s->got_streaminfo = 1;
 
     return 0;
@@ -213,6 +230,12 @@ static int decode_residuals(FLACContext *s, int32_t *decoded, int pred_order)
     rice_order = get_bits(&s->gb, 4);
 
     samples= s->blocksize >> rice_order;
+    if (samples << rice_order != s->blocksize) {
+        av_log(s->avctx, AV_LOG_ERROR, "invalid rice order: %i blocksize %i\n",
+               rice_order, s->blocksize);
+        return AVERROR_INVALIDDATA;
+    }
+
     if (pred_order > samples) {
         av_log(s->avctx, AV_LOG_ERROR, "invalid predictor order: %i > %i\n",
                pred_order, samples);
@@ -245,7 +268,8 @@ static int decode_subframe_fixed(FLACContext *s, int32_t *decoded,
                                  int pred_order, int bps)
 {
     const int blocksize = s->blocksize;
-    int a, b, c, d, i, ret;
+    int av_uninit(a), av_uninit(b), av_uninit(c), av_uninit(d), i;
+    int ret;
 
     /* warm up samples */
     for (i = 0; i < pred_order; i++) {
@@ -291,6 +315,33 @@ static int decode_subframe_fixed(FLACContext *s, int32_t *decoded,
     return 0;
 }
 
+static void lpc_analyze_remodulate(int32_t *decoded, const int coeffs[32],
+                                   int order, int qlevel, int len, int bps)
+{
+    int i, j;
+    int ebps = 1 << (bps-1);
+    unsigned sigma = 0;
+
+    for (i = order; i < len; i++)
+        sigma |= decoded[i] + ebps;
+
+    if (sigma < 2*ebps)
+        return;
+
+    for (i = len - 1; i >= order; i--) {
+        int64_t p = 0;
+        for (j = 0; j < order; j++)
+            p += coeffs[j] * (int64_t)decoded[i-order+j];
+        decoded[i] -= p >> qlevel;
+    }
+    for (i = order; i < len; i++, decoded++) {
+        int32_t p = 0;
+        for (j = 0; j < order; j++)
+            p += coeffs[j] * (uint32_t)decoded[j];
+        decoded[j] += p >> qlevel;
+    }
+}
+
 static int decode_subframe_lpc(FLACContext *s, int32_t *decoded, int pred_order,
                                int bps)
 {
@@ -322,7 +373,15 @@ static int decode_subframe_lpc(FLACContext *s, int32_t *decoded, int pred_order,
     if ((ret = decode_residuals(s, decoded, pred_order)) < 0)
         return ret;
 
-    s->dsp.lpc(decoded, coeffs, pred_order, qlevel, s->blocksize);
+    if (   (    s->buggy_lpc && s->flac_stream_info.bps <= 16)
+        || (   !s->buggy_lpc && bps <= 16
+            && bps + coeff_prec + av_log2(pred_order) <= 32)) {
+        s->dsp.lpc16(decoded, coeffs, pred_order, qlevel, s->blocksize);
+    } else {
+        s->dsp.lpc32(decoded, coeffs, pred_order, qlevel, s->blocksize);
+        if (s->flac_stream_info.bps <= 16)
+            lpc_analyze_remodulate(decoded, coeffs, pred_order, qlevel, s->blocksize, bps);
+    }
 
     return 0;
 }
@@ -331,7 +390,7 @@ static inline int decode_subframe(FLACContext *s, int channel)
 {
     int32_t *decoded = s->decoded[channel];
     int type, wasted = 0;
-    int bps = s->bps;
+    int bps = s->flac_stream_info.bps;
     int i, tmp, ret;
 
     if (channel == 0) {
@@ -350,8 +409,7 @@ static inline int decode_subframe(FLACContext *s, int channel)
 
     if (get_bits1(&s->gb)) {
         int left = get_bits_left(&s->gb);
-        wasted = 1;
-        if ( left < 0 ||
+        if ( left <= 0 ||
             (left < bps && !show_bits_long(&s->gb, left)) ||
                            !show_bits_long(&s->gb, bps)) {
             av_log(s->avctx, AV_LOG_ERROR,
@@ -359,8 +417,7 @@ static inline int decode_subframe(FLACContext *s, int channel)
                    bps, left);
             return AVERROR_INVALIDDATA;
         }
-        while (!get_bits1(&s->gb))
-            wasted++;
+        wasted = 1 + get_unary(&s->gb, 1, get_bits_left(&s->gb));
         bps -= wasted;
     }
     if (bps > 32) {
@@ -407,66 +464,69 @@ static int decode_frame(FLACContext *s)
         return ret;
     }
 
-    if (s->channels && fi.channels != s->channels && s->got_streaminfo) {
-        s->channels = s->avctx->channels = fi.channels;
+    if (   s->flac_stream_info.channels
+        && fi.channels != s->flac_stream_info.channels
+        && s->got_streaminfo) {
+        s->flac_stream_info.channels = s->avctx->channels = fi.channels;
         ff_flac_set_channel_layout(s->avctx);
         ret = allocate_buffers(s);
         if (ret < 0)
             return ret;
     }
-    s->channels = s->avctx->channels = fi.channels;
+    s->flac_stream_info.channels = s->avctx->channels = fi.channels;
     if (!s->avctx->channel_layout)
         ff_flac_set_channel_layout(s->avctx);
     s->ch_mode = fi.ch_mode;
 
-    if (!s->bps && !fi.bps) {
+    if (!s->flac_stream_info.bps && !fi.bps) {
         av_log(s->avctx, AV_LOG_ERROR, "bps not found in STREAMINFO or frame header\n");
         return AVERROR_INVALIDDATA;
     }
     if (!fi.bps) {
-        fi.bps = s->bps;
-    } else if (s->bps && fi.bps != s->bps) {
+        fi.bps = s->flac_stream_info.bps;
+    } else if (s->flac_stream_info.bps && fi.bps != s->flac_stream_info.bps) {
         av_log(s->avctx, AV_LOG_ERROR, "switching bps mid-stream is not "
                                        "supported\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if (!s->bps) {
-        s->bps = s->avctx->bits_per_raw_sample = fi.bps;
+    if (!s->flac_stream_info.bps) {
+        s->flac_stream_info.bps = s->avctx->bits_per_raw_sample = fi.bps;
         flac_set_bps(s);
     }
 
-    if (!s->max_blocksize)
-        s->max_blocksize = FLAC_MAX_BLOCKSIZE;
-    if (fi.blocksize > s->max_blocksize) {
+    if (!s->flac_stream_info.max_blocksize)
+        s->flac_stream_info.max_blocksize = FLAC_MAX_BLOCKSIZE;
+    if (fi.blocksize > s->flac_stream_info.max_blocksize) {
         av_log(s->avctx, AV_LOG_ERROR, "blocksize %d > %d\n", fi.blocksize,
-               s->max_blocksize);
+               s->flac_stream_info.max_blocksize);
         return AVERROR_INVALIDDATA;
     }
     s->blocksize = fi.blocksize;
 
-    if (!s->samplerate && !fi.samplerate) {
+    if (!s->flac_stream_info.samplerate && !fi.samplerate) {
         av_log(s->avctx, AV_LOG_ERROR, "sample rate not found in STREAMINFO"
                                         " or frame header\n");
         return AVERROR_INVALIDDATA;
     }
     if (fi.samplerate == 0)
-        fi.samplerate = s->samplerate;
-    s->samplerate = s->avctx->sample_rate = fi.samplerate;
+        fi.samplerate = s->flac_stream_info.samplerate;
+    s->flac_stream_info.samplerate = s->avctx->sample_rate = fi.samplerate;
 
     if (!s->got_streaminfo) {
         ret = allocate_buffers(s);
         if (ret < 0)
             return ret;
-        ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt, s->bps);
         s->got_streaminfo = 1;
-        dump_headers(s->avctx, (FLACStreaminfo *)s);
+        dump_headers(s->avctx, &s->flac_stream_info);
     }
+    ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt,
+                    s->flac_stream_info.channels, s->flac_stream_info.bps);
 
-//    dump_headers(s->avctx, (FLACStreaminfo *)s);
+//    dump_headers(s->avctx, &s->flac_stream_info);
 
     /* subframes */
-    for (i = 0; i < s->channels; i++) {
+    for (i = 0; i < s->flac_stream_info.channels; i++) {
         if ((ret = decode_subframe(s, i)) < 0)
             return ret;
     }
@@ -483,6 +543,7 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame_ptr, AVPacket *avpkt)
 {
     AVFrame *frame     = data;
+    ThreadFrame tframe = { .f = data };
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     FLACContext *s = avctx->priv_data;
@@ -491,12 +552,22 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 0;
 
-    if (s->max_framesize == 0) {
-        s->max_framesize =
-            ff_flac_get_max_frame_size(s->max_blocksize ? s->max_blocksize : FLAC_MAX_BLOCKSIZE,
+    if (s->flac_stream_info.max_framesize == 0) {
+        s->flac_stream_info.max_framesize =
+            ff_flac_get_max_frame_size(s->flac_stream_info.max_blocksize ? s->flac_stream_info.max_blocksize : FLAC_MAX_BLOCKSIZE,
                                        FLAC_MAX_CHANNELS, 32);
     }
 
+    if (buf_size > 5 && !memcmp(buf, "\177FLAC", 5)) {
+        av_log(s->avctx, AV_LOG_DEBUG, "skipping flac header packet 1\n");
+        return buf_size;
+    }
+
+    if (buf_size > 0 && (*buf & 0x7F) == FLAC_METADATA_TYPE_VORBIS_COMMENT) {
+        av_log(s->avctx, AV_LOG_DEBUG, "skipping vorbis comment\n");
+        return buf_size;
+    }
+
     /* check that there is at least the smallest decodable amount of data.
        this amount corresponds to the smallest valid FLAC frame possible.
        FF F8 69 02 00 00 9A 00 00 34 46 */
@@ -513,21 +584,29 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     /* decode frame */
-    init_get_bits(&s->gb, buf, buf_size*8);
+    if ((ret = init_get_bits8(&s->gb, buf, buf_size)) < 0)
+        return ret;
     if ((ret = decode_frame(s)) < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "decode_frame() failed\n");
         return ret;
     }
-    bytes_read = (get_bits_count(&s->gb)+7)/8;
+    bytes_read = get_bits_count(&s->gb)/8;
+
+    if ((s->avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_COMPLIANT)) &&
+        av_crc(av_crc_get_table(AV_CRC_16_ANSI),
+               0, buf, bytes_read)) {
+        av_log(s->avctx, AV_LOG_ERROR, "CRC error at PTS %"PRId64"\n", avpkt->pts);
+        if (s->avctx->err_recognition & AV_EF_EXPLODE)
+            return AVERROR_INVALIDDATA;
+    }
 
     /* get output buffer */
     frame->nb_samples = s->blocksize;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
         return ret;
-    }
 
-    s->dsp.decorrelate[s->ch_mode](frame->data, s->decoded, s->channels,
+    s->dsp.decorrelate[s->ch_mode](frame->data, s->decoded,
+                                   s->flac_stream_info.channels,
                                    s->blocksize, s->sample_shift);
 
     if (bytes_read > buf_size) {
@@ -544,6 +623,19 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
     return bytes_read;
 }
 
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    FLACContext *s = avctx->priv_data;
+    s->decoded_buffer = NULL;
+    s->decoded_buffer_size = 0;
+    s->avctx = avctx;
+    if (s->flac_stream_info.max_blocksize)
+        return allocate_buffers(s);
+    return 0;
+}
+#endif
+
 static av_cold int flac_decode_close(AVCodecContext *avctx)
 {
     FLACContext *s = avctx->priv_data;
@@ -553,6 +645,18 @@ static av_cold int flac_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
+static const AVOption options[] = {
+{ "use_buggy_lpc", "emulate old buggy lavc behavior", offsetof(FLACContext, buggy_lpc), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM },
+{ NULL },
+};
+
+static const AVClass flac_decoder_class = {
+    "FLAC decoder",
+    av_default_item_name,
+    options,
+    LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_flac_decoder = {
     .name           = "flac",
     .long_name      = NULL_IF_CONFIG_SMALL("FLAC (Free Lossless Audio Codec)"),
@@ -562,10 +666,12 @@ AVCodec ff_flac_decoder = {
     .init           = flac_decode_init,
     .close          = flac_decode_close,
     .decode         = flac_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16,
                                                       AV_SAMPLE_FMT_S16P,
                                                       AV_SAMPLE_FMT_S32,
                                                       AV_SAMPLE_FMT_S32P,
-                                                      -1 },
+                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &flac_decoder_class,
 };
diff --git a/libavcodec/flacdsp.c b/libavcodec/flacdsp.c
index b916869321..30b66484e8 100644
--- a/libavcodec/flacdsp.c
+++ b/libavcodec/flacdsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -85,16 +85,13 @@ static void flac_lpc_32_c(int32_t *decoded, const int coeffs[32],
 
 }
 
-av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt,
+av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
                              int bps)
 {
-    if (bps > 16) {
-        c->lpc            = flac_lpc_32_c;
-        c->lpc_encode     = flac_lpc_encode_c_32;
-    } else {
-        c->lpc            = flac_lpc_16_c;
-        c->lpc_encode     = flac_lpc_encode_c_16;
-    }
+    c->lpc16        = flac_lpc_16_c;
+    c->lpc32        = flac_lpc_32_c;
+    c->lpc16_encode = flac_lpc_encode_c_16;
+    c->lpc32_encode = flac_lpc_encode_c_32;
 
     switch (fmt) {
     case AV_SAMPLE_FMT_S32:
@@ -127,5 +124,7 @@ av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt,
     }
 
     if (ARCH_ARM)
-        ff_flacdsp_init_arm(c, fmt, bps);
+        ff_flacdsp_init_arm(c, fmt, channels, bps);
+    if (ARCH_X86)
+        ff_flacdsp_init_x86(c, fmt, channels, bps);
 }
diff --git a/libavcodec/flacdsp.h b/libavcodec/flacdsp.h
index 33184b589d..f5cbd94724 100644
--- a/libavcodec/flacdsp.h
+++ b/libavcodec/flacdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,13 +25,18 @@
 typedef struct FLACDSPContext {
     void (*decorrelate[4])(uint8_t **out, int32_t **in, int channels,
                            int len, int shift);
-    void (*lpc)(int32_t *samples, const int coeffs[32], int order,
-                int qlevel, int len);
-    void (*lpc_encode)(int32_t *res, const int32_t *smp, int len, int order,
-                       const int32_t *coefs, int shift);
+    void (*lpc16)(int32_t *samples, const int coeffs[32], int order,
+                  int qlevel, int len);
+    void (*lpc32)(int32_t *samples, const int coeffs[32], int order,
+                  int qlevel, int len);
+    void (*lpc16_encode)(int32_t *res, const int32_t *smp, int len, int order,
+                         const int32_t coefs[32], int shift);
+    void (*lpc32_encode)(int32_t *res, const int32_t *smp, int len, int order,
+                         const int32_t coefs[32], int shift);
 } FLACDSPContext;
 
-void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
-void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
+void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps);
+void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps);
+void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps);
 
 #endif /* AVCODEC_FLACDSP_H */
diff --git a/libavcodec/flacdsp_lpc_template.c b/libavcodec/flacdsp_lpc_template.c
index 269e64b3a8..5d532e0673 100644
--- a/libavcodec/flacdsp_lpc_template.c
+++ b/libavcodec/flacdsp_lpc_template.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -139,3 +139,21 @@ static void FUNC(flac_lpc_encode_c)(int32_t *res, const int32_t *smp, int len,
     }
 #endif
 }
+
+/* Comment for clarity/de-obfuscation.
+ *
+ * for (int i = order; i < len; i++) {
+ *     int32_t p = 0;
+ *     for (int j = 0; j < order; j++) {
+ *         int c = coefs[j];
+ *         int s = smp[(i-1)-j];
+ *         p    += c*s;
+ *     }
+ *     res[i] = smp[i] - (p >> shift);
+ * }
+ *
+ * The CONFIG_SMALL code above simplifies to this, in the case of SAMPLE_SIZE
+ * not being equal to 32 (at the present time that means for 16-bit audio). The
+ * code above does 2 samples per iteration.  Commit bfdd5bc (made all the way
+ * back in 2007) says that way is faster.
+ */
diff --git a/libavcodec/flacdsp_template.c b/libavcodec/flacdsp_template.c
index 0affe22ddb..62c0a15ff6 100644
--- a/libavcodec/flacdsp_template.c
+++ b/libavcodec/flacdsp_template.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index 58961b8dfa..f849ffcca4 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -2,30 +2,31 @@
  * FLAC audio encoder
  * Copyright (c) 2006  Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/crc.h"
 #include "libavutil/intmath.h"
 #include "libavutil/md5.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "bswapdsp.h"
-#include "get_bits.h"
+#include "put_bits.h"
 #include "golomb.h"
 #include "internal.h"
 #include "lpc.h"
@@ -61,13 +62,14 @@ typedef struct CompressionOptions {
     int min_partition_order;
     int max_partition_order;
     int ch_mode;
+    int exact_rice_parameters;
+    int multi_dim_quant;
 } CompressionOptions;
 
 typedef struct RiceContext {
     enum CodingMode coding_mode;
     int porder;
     int params[MAX_PARTITIONS];
-    uint32_t udata[FLAC_MAX_BLOCKSIZE];
 } RiceContext;
 
 typedef struct FlacSubframe {
@@ -78,9 +80,13 @@ typedef struct FlacSubframe {
     int order;
     int32_t coefs[MAX_LPC_ORDER];
     int shift;
+
     RiceContext rc;
+    uint32_t rc_udata[FLAC_MAX_BLOCKSIZE];
+    uint64_t rc_sums[32][MAX_PARTITIONS];
+
     int32_t samples[FLAC_MAX_BLOCKSIZE];
-    int32_t residual[FLAC_MAX_BLOCKSIZE+1];
+    int32_t residual[FLAC_MAX_BLOCKSIZE+11];
 } FlacSubframe;
 
 typedef struct FlacFrame {
@@ -157,7 +163,7 @@ static int select_blocksize(int samplerate, int block_time_ms)
     int target;
     int blocksize;
 
-    assert(samplerate > 0);
+    av_assert0(samplerate > 0);
     blocksize = ff_flac_blocksize_table[1];
     target    = (samplerate * block_time_ms) / 1000;
     for (i = 0; i < 16; i++) {
@@ -251,8 +257,11 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
         break;
     }
 
-    if (channels < 1 || channels > FLAC_MAX_CHANNELS)
-        return -1;
+    if (channels < 1 || channels > FLAC_MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "%d channels not supported (max %d)\n",
+               channels, FLAC_MAX_CHANNELS);
+        return AVERROR(EINVAL);
+    }
     s->channels = channels;
 
     /* find samplerate in table */
@@ -278,7 +287,8 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
             s->sr_code[0] = 13;
             s->sr_code[1] = freq;
         } else {
-            return -1;
+            av_log(avctx, AV_LOG_ERROR, "%d Hz not supported\n", freq);
+            return AVERROR(EINVAL);
         }
         s->samplerate = freq;
     }
@@ -293,7 +303,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
     if (level > 12) {
         av_log(avctx, AV_LOG_ERROR, "invalid compression level: %d\n",
                s->options.compression_level);
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     s->options.block_time_ms = ((int[]){ 27, 27, 27,105,105,105,105,105,105,105,105,105,105})[level];
@@ -332,13 +342,13 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
             if (avctx->min_prediction_order > MAX_FIXED_ORDER) {
                 av_log(avctx, AV_LOG_ERROR, "invalid min prediction order: %d\n",
                        avctx->min_prediction_order);
-                return -1;
+                return AVERROR(EINVAL);
             }
         } else if (avctx->min_prediction_order < MIN_LPC_ORDER ||
                    avctx->min_prediction_order > MAX_LPC_ORDER) {
             av_log(avctx, AV_LOG_ERROR, "invalid min prediction order: %d\n",
                    avctx->min_prediction_order);
-            return -1;
+            return AVERROR(EINVAL);
         }
         s->options.min_prediction_order = avctx->min_prediction_order;
     }
@@ -349,20 +359,20 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
             if (avctx->max_prediction_order > MAX_FIXED_ORDER) {
                 av_log(avctx, AV_LOG_ERROR, "invalid max prediction order: %d\n",
                        avctx->max_prediction_order);
-                return -1;
+                return AVERROR(EINVAL);
             }
         } else if (avctx->max_prediction_order < MIN_LPC_ORDER ||
                    avctx->max_prediction_order > MAX_LPC_ORDER) {
             av_log(avctx, AV_LOG_ERROR, "invalid max prediction order: %d\n",
                    avctx->max_prediction_order);
-            return -1;
+            return AVERROR(EINVAL);
         }
         s->options.max_prediction_order = avctx->max_prediction_order;
     }
     if (s->options.max_prediction_order < s->options.min_prediction_order) {
         av_log(avctx, AV_LOG_ERROR, "invalid prediction orders: min=%d max=%d\n",
                s->options.min_prediction_order, s->options.max_prediction_order);
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     if (avctx->frame_size > 0) {
@@ -370,7 +380,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
                 avctx->frame_size > FLAC_MAX_BLOCKSIZE) {
             av_log(avctx, AV_LOG_ERROR, "invalid block size: %d\n",
                    avctx->frame_size);
-            return -1;
+            return AVERROR(EINVAL);
         }
     } else {
         s->avctx->frame_size = select_blocksize(s->samplerate, s->options.block_time_ms);
@@ -398,11 +408,33 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
     s->frame_count   = 0;
     s->min_framesize = s->max_framesize;
 
+    if (channels == 3 &&
+            avctx->channel_layout != (AV_CH_LAYOUT_STEREO|AV_CH_FRONT_CENTER) ||
+        channels == 4 &&
+            avctx->channel_layout != AV_CH_LAYOUT_2_2 &&
+            avctx->channel_layout != AV_CH_LAYOUT_QUAD ||
+        channels == 5 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT0 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT0_BACK ||
+        channels == 6 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT1 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT1_BACK) {
+        if (avctx->channel_layout) {
+            av_log(avctx, AV_LOG_ERROR, "Channel layout not supported by Flac, "
+                                             "output stream will have incorrect "
+                                             "channel layout.\n");
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "No channel layout specified. The encoder "
+                                               "will use Flac channel layout for "
+                                               "%d channels.\n", channels);
+        }
+    }
+
     ret = ff_lpc_init(&s->lpc_ctx, avctx->frame_size,
                       s->options.max_prediction_order, FF_LPC_TYPE_LEVINSON);
 
     ff_bswapdsp_init(&s->bdsp);
-    ff_flacdsp_init(&s->flac_dsp, avctx->sample_fmt,
+    ff_flacdsp_init(&s->flac_dsp, avctx->sample_fmt, channels,
                     avctx->bits_per_raw_sample);
 
     dprint_compression_options(s);
@@ -478,7 +510,7 @@ static void copy_samples(FlacEncodeContext *s, const void *samples)
 }
 
 
-static uint64_t rice_count_exact(int32_t *res, int n, int k)
+static uint64_t rice_count_exact(const int32_t *res, int n, int k)
 {
     int i;
     uint64_t count = 0;
@@ -502,6 +534,9 @@ static uint64_t subframe_count_exact(FlacEncodeContext *s, FlacSubframe *sub,
     /* subframe header */
     count += 8;
 
+    if (sub->wasted)
+        count += sub->wasted;
+
     /* subframe */
     if (sub->type == FLAC_SUBFRAME_CONSTANT) {
         count += sub->obits;
@@ -556,24 +591,44 @@ static int find_optimal_param(uint64_t sum, int n, int max_param)
     return FFMIN(k, max_param);
 }
 
+static int find_optimal_param_exact(uint64_t sums[32][MAX_PARTITIONS], int i, int max_param)
+{
+    int bestk = 0;
+    int64_t bestbits = INT64_MAX;
+    int k;
+
+    for (k = 0; k <= max_param; k++) {
+        int64_t bits = sums[k][i];
+        if (bits < bestbits) {
+            bestbits = bits;
+            bestk = k;
+        }
+    }
+
+    return bestk;
+}
 
 static uint64_t calc_optimal_rice_params(RiceContext *rc, int porder,
-                                         uint64_t *sums, int n, int pred_order)
+                                         uint64_t sums[32][MAX_PARTITIONS],
+                                         int n, int pred_order, int max_param, int exact)
 {
     int i;
-    int k, cnt, part, max_param;
+    int k, cnt, part;
     uint64_t all_bits;
 
-    max_param = (1 << rc->coding_mode) - 2;
-
     part     = (1 << porder);
     all_bits = 4 * part;
 
     cnt = (n >> porder) - pred_order;
     for (i = 0; i < part; i++) {
-        k = find_optimal_param(sums[i], cnt, max_param);
+        if (exact) {
+            k = find_optimal_param_exact(sums, i, max_param);
+            all_bits += sums[k][i];
+        } else {
+            k = find_optimal_param(sums[0][i], cnt, max_param);
+            all_bits += rice_encode_count(sums[0][i], cnt, k);
+        }
         rc->params[i] = k;
-        all_bits += rice_encode_count(sums[i], cnt, k);
         cnt = n >> porder;
     }
 
@@ -583,61 +638,80 @@ static uint64_t calc_optimal_rice_params(RiceContext *rc, int porder,
 }
 
 
-static void calc_sums(int pmin, int pmax, uint32_t *data, int n, int pred_order,
-                      uint64_t sums[][MAX_PARTITIONS])
+static void calc_sum_top(int pmax, int kmax, const uint32_t *data, int n, int pred_order,
+                         uint64_t sums[32][MAX_PARTITIONS])
 {
-    int i, j;
+    int i, k;
     int parts;
-    uint32_t *res, *res_end;
+    const uint32_t *res, *res_end;
 
     /* sums for highest level */
     parts   = (1 << pmax);
-    res     = &data[pred_order];
-    res_end = &data[n >> pmax];
-    for (i = 0; i < parts; i++) {
-        uint64_t sum = 0;
-        while (res < res_end)
-            sum += *(res++);
-        sums[pmax][i] = sum;
-        res_end += n >> pmax;
-    }
-    /* sums for lower levels */
-    for (i = pmax - 1; i >= pmin; i--) {
-        parts = (1 << i);
-        for (j = 0; j < parts; j++)
-            sums[i][j] = sums[i+1][2*j] + sums[i+1][2*j+1];
+
+    for (k = 0; k <= kmax; k++) {
+        res     = &data[pred_order];
+        res_end = &data[n >> pmax];
+        for (i = 0; i < parts; i++) {
+            if (kmax) {
+                uint64_t sum = (1LL + k) * (res_end - res);
+                while (res < res_end)
+                    sum += *(res++) >> k;
+                sums[k][i] = sum;
+            } else {
+                uint64_t sum = 0;
+                while (res < res_end)
+                    sum += *(res++);
+                sums[k][i] = sum;
+            }
+            res_end += n >> pmax;
+        }
     }
 }
 
+static void calc_sum_next(int level, uint64_t sums[32][MAX_PARTITIONS], int kmax)
+{
+    int i, k;
+    int parts = (1 << level);
+    for (i = 0; i < parts; i++) {
+        for (k=0; k<=kmax; k++)
+            sums[k][i] = sums[k][2*i] + sums[k][2*i+1];
+    }
+}
 
-static uint64_t calc_rice_params(RiceContext *rc, int pmin, int pmax,
-                                 int32_t *data, int n, int pred_order)
+static uint64_t calc_rice_params(RiceContext *rc,
+                                 uint32_t udata[FLAC_MAX_BLOCKSIZE],
+                                 uint64_t sums[32][MAX_PARTITIONS],
+                                 int pmin, int pmax,
+                                 const int32_t *data, int n, int pred_order, int exact)
 {
     int i;
     uint64_t bits[MAX_PARTITION_ORDER+1];
     int opt_porder;
     RiceContext tmp_rc;
-    uint64_t sums[MAX_PARTITION_ORDER + 1][MAX_PARTITIONS] = { { 0 } };
+    int kmax = (1 << rc->coding_mode) - 2;
 
-    assert(pmin >= 0 && pmin <= MAX_PARTITION_ORDER);
-    assert(pmax >= 0 && pmax <= MAX_PARTITION_ORDER);
-    assert(pmin <= pmax);
+    av_assert1(pmin >= 0 && pmin <= MAX_PARTITION_ORDER);
+    av_assert1(pmax >= 0 && pmax <= MAX_PARTITION_ORDER);
+    av_assert1(pmin <= pmax);
 
     tmp_rc.coding_mode = rc->coding_mode;
 
     for (i = 0; i < n; i++)
-        rc->udata[i] = (2 * data[i]) ^ (data[i] >> 31);
+        udata[i] = (2 * data[i]) ^ (data[i] >> 31);
 
-    calc_sums(pmin, pmax, rc->udata, n, pred_order, sums);
+    calc_sum_top(pmax, exact ? kmax : 0, udata, n, pred_order, sums);
 
     opt_porder = pmin;
     bits[pmin] = UINT32_MAX;
-    for (i = pmin; i <= pmax; i++) {
-        bits[i] = calc_optimal_rice_params(&tmp_rc, i, sums[i], n, pred_order);
-        if (bits[i] <= bits[opt_porder]) {
+    for (i = pmax; ; ) {
+        bits[i] = calc_optimal_rice_params(&tmp_rc, i, sums, n, pred_order, kmax, exact);
+        if (bits[i] < bits[opt_porder] || pmax == pmin) {
             opt_porder = i;
             *rc = tmp_rc;
         }
+        if (i == pmin)
+            break;
+        calc_sum_next(--i, sums, exact ? kmax : 0);
     }
 
     return bits[opt_porder];
@@ -664,8 +738,8 @@ static uint64_t find_subframe_rice_params(FlacEncodeContext *s,
     uint64_t bits = 8 + pred_order * sub->obits + 2 + sub->rc.coding_mode;
     if (sub->type == FLAC_SUBFRAME_LPC)
         bits += 4 + 5 + pred_order * s->options.lpc_coeff_precision;
-    bits += calc_rice_params(&sub->rc, pmin, pmax, sub->residual,
-                             s->frame.blocksize, pred_order);
+    bits += calc_rice_params(&sub->rc, sub->rc_udata, sub->rc_sums, pmin, pmax, sub->residual,
+                             s->frame.blocksize, pred_order, s->options.exact_rice_parameters);
     return bits;
 }
 
@@ -804,8 +878,13 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
             order = av_clip(order, min_order - 1, max_order - 1);
             if (order == last_order)
                 continue;
-            s->flac_dsp.lpc_encode(res, smp, n, order+1, coefs[order],
-                                   shift[order]);
+            if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(order) <= 32) {
+                s->flac_dsp.lpc16_encode(res, smp, n, order+1, coefs[order],
+                                         shift[order]);
+            } else {
+                s->flac_dsp.lpc32_encode(res, smp, n, order+1, coefs[order],
+                                         shift[order]);
+            }
             bits[i] = find_subframe_rice_params(s, sub, order+1);
             if (bits[i] < bits[opt_index]) {
                 opt_index = i;
@@ -819,7 +898,11 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
         opt_order = 0;
         bits[0]   = UINT32_MAX;
         for (i = min_order-1; i < max_order; i++) {
-            s->flac_dsp.lpc_encode(res, smp, n, i+1, coefs[i], shift[i]);
+            if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(i) <= 32) {
+                s->flac_dsp.lpc16_encode(res, smp, n, i+1, coefs[i], shift[i]);
+            } else {
+                s->flac_dsp.lpc32_encode(res, smp, n, i+1, coefs[i], shift[i]);
+            }
             bits[i] = find_subframe_rice_params(s, sub, i+1);
             if (bits[i] < bits[opt_order])
                 opt_order = i;
@@ -837,7 +920,11 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
             for (i = last-step; i <= last+step; i += step) {
                 if (i < min_order-1 || i >= max_order || bits[i] < UINT32_MAX)
                     continue;
-                s->flac_dsp.lpc_encode(res, smp, n, i+1, coefs[i], shift[i]);
+                if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(i) <= 32) {
+                    s->flac_dsp.lpc32_encode(res, smp, n, i+1, coefs[i], shift[i]);
+                } else {
+                    s->flac_dsp.lpc16_encode(res, smp, n, i+1, coefs[i], shift[i]);
+                }
                 bits[i] = find_subframe_rice_params(s, sub, i+1);
                 if (bits[i] < bits[opt_order])
                     opt_order = i;
@@ -846,13 +933,60 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
         opt_order++;
     }
 
+    if (s->options.multi_dim_quant) {
+        int allsteps = 1;
+        int i, step, improved;
+        int64_t best_score = INT64_MAX;
+        int32_t qmax;
+
+        qmax = (1 << (s->options.lpc_coeff_precision - 1)) - 1;
+
+        for (i=0; i<opt_order; i++)
+            allsteps *= 3;
+
+        do {
+            improved = 0;
+            for (step = 0; step < allsteps; step++) {
+                int tmp = step;
+                int32_t lpc_try[MAX_LPC_ORDER];
+                int64_t score = 0;
+                int diffsum = 0;
+
+                for (i=0; i<opt_order; i++) {
+                    int diff = ((tmp + 1) % 3) - 1;
+                    lpc_try[i] = av_clip(coefs[opt_order - 1][i] + diff, -qmax, qmax);
+                    tmp /= 3;
+                    diffsum += !!diff;
+                }
+                if (diffsum >8)
+                    continue;
+
+                if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(opt_order - 1) <= 32) {
+                    s->flac_dsp.lpc16_encode(res, smp, n, opt_order, lpc_try, shift[opt_order-1]);
+                } else {
+                    s->flac_dsp.lpc32_encode(res, smp, n, opt_order, lpc_try, shift[opt_order-1]);
+                }
+                score = find_subframe_rice_params(s, sub, opt_order);
+                if (score < best_score) {
+                    best_score = score;
+                    memcpy(coefs[opt_order-1], lpc_try, sizeof(*coefs));
+                    improved=1;
+                }
+            }
+        } while(improved);
+    }
+
     sub->order     = opt_order;
     sub->type_code = sub->type | (sub->order-1);
     sub->shift     = shift[sub->order-1];
     for (i = 0; i < sub->order; i++)
         sub->coefs[i] = coefs[sub->order-1][i];
 
-    s->flac_dsp.lpc_encode(res, smp, n, sub->order, sub->coefs, sub->shift);
+    if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(opt_order) <= 32) {
+        s->flac_dsp.lpc16_encode(res, smp, n, sub->order, sub->coefs, sub->shift);
+    } else {
+        s->flac_dsp.lpc32_encode(res, smp, n, sub->order, sub->coefs, sub->shift);
+    }
 
     find_subframe_rice_params(s, sub, sub->order);
 
@@ -948,7 +1082,7 @@ static void remove_wasted_bits(FlacEncodeContext *s)
 }
 
 
-static int estimate_stereo_mode(int32_t *left_ch, int32_t *right_ch, int n,
+static int estimate_stereo_mode(const int32_t *left_ch, const int32_t *right_ch, int n,
                                 int max_rice_param)
 {
     int i, best;
@@ -1188,9 +1322,7 @@ static int update_md5_sum(FlacEncodeContext *s, const void *samples)
 
         for (i = 0; i < s->frame.blocksize * s->channels; i++) {
             int32_t v = samples0[i] >> 8;
-            *tmp++    = (v      ) & 0xFF;
-            *tmp++    = (v >>  8) & 0xFF;
-            *tmp++    = (v >> 16) & 0xFF;
+            AV_WL24(tmp + 3*i, v);
         }
         buf = s->md5_buffer;
     }
@@ -1264,10 +1396,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
         }
     }
 
-    if ((ret = ff_alloc_packet(avpkt, frame_bytes))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, frame_bytes, 0)) < 0)
         return ret;
-    }
 
     out_bytes = write_frame(s, avpkt);
 
@@ -1314,7 +1444,7 @@ static const AVOption options[] = {
 { "fixed",    NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LPC_TYPE_FIXED },    INT_MIN, INT_MAX, FLAGS, "lpc_type" },
 { "levinson", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LPC_TYPE_LEVINSON }, INT_MIN, INT_MAX, FLAGS, "lpc_type" },
 { "cholesky", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LPC_TYPE_CHOLESKY }, INT_MIN, INT_MAX, FLAGS, "lpc_type" },
-{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes),  AV_OPT_TYPE_INT, {.i64 = 1 }, 1, INT_MAX, FLAGS },
+{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes),  AV_OPT_TYPE_INT, {.i64 = 2 }, 1, INT_MAX, FLAGS },
 { "min_partition_order",  NULL, offsetof(FlacEncodeContext, options.min_partition_order),  AV_OPT_TYPE_INT, {.i64 = -1 },      -1, MAX_PARTITION_ORDER, FLAGS },
 { "max_partition_order",  NULL, offsetof(FlacEncodeContext, options.max_partition_order),  AV_OPT_TYPE_INT, {.i64 = -1 },      -1, MAX_PARTITION_ORDER, FLAGS },
 { "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), AV_OPT_TYPE_INT, {.i64 = -1 }, -1, ORDER_METHOD_LOG, FLAGS, "predm" },
@@ -1330,6 +1460,8 @@ static const AVOption options[] = {
 { "left_side",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FLAC_CHMODE_LEFT_SIDE   }, INT_MIN, INT_MAX, FLAGS, "ch_mode" },
 { "right_side", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FLAC_CHMODE_RIGHT_SIDE  }, INT_MIN, INT_MAX, FLAGS, "ch_mode" },
 { "mid_side",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FLAC_CHMODE_MID_SIDE    }, INT_MIN, INT_MAX, FLAGS, "ch_mode" },
+{ "exact_rice_parameters", "Calculate rice parameters exactly", offsetof(FlacEncodeContext, options.exact_rice_parameters), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS },
+{ "multi_dim_quant",       "Multi-dimensional quantization",    offsetof(FlacEncodeContext, options.multi_dim_quant),       AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS },
 { NULL },
 };
 
@@ -1349,7 +1481,7 @@ AVCodec ff_flac_encoder = {
     .init           = flac_encode_init,
     .encode2        = flac_encode_frame,
     .close          = flac_encode_close,
-    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_LOSSLESS,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_S32,
                                                      AV_SAMPLE_FMT_NONE },
diff --git a/libavcodec/flashsv.c b/libavcodec/flashsv.c
index ee854acd5c..69b56d116c 100644
--- a/libavcodec/flashsv.c
+++ b/libavcodec/flashsv.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2004 Alex Beregszaszi
  * Copyright (C) 2006 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,7 +69,7 @@ typedef struct FlashSVContext {
     int             diff_start, diff_height;
 } FlashSVContext;
 
-static int decode_hybrid(const uint8_t *sptr, uint8_t *dptr, int dx, int dy,
+static int decode_hybrid(const uint8_t *sptr, const uint8_t *sptr_end, uint8_t *dptr, int dx, int dy,
                          int h, int w, int stride, const uint32_t *pal)
 {
     int x, y;
@@ -78,6 +78,8 @@ static int decode_hybrid(const uint8_t *sptr, uint8_t *dptr, int dx, int dy,
     for (y = dx + h; y > dx; y--) {
         uint8_t *dst = dptr + (y * stride) + dy * 3;
         for (x = 0; x < w; x++) {
+            if (sptr >= sptr_end)
+                return AVERROR_INVALIDDATA;
             if (*sptr & 0x80) {
                 /* 15-bit color */
                 unsigned c = AV_RB16(sptr) & ~0x8000;
@@ -107,7 +109,7 @@ static av_cold int flashsv_decode_end(AVCodecContext *avctx)
     av_frame_free(&s->frame);
 
     /* free the tmpblock */
-    av_free(s->tmpblock);
+    av_freep(&s->tmpblock);
 
     return 0;
 }
@@ -142,6 +144,9 @@ static int flashsv2_prime(FlashSVContext *s, uint8_t *src, int size)
     z_stream zs;
     int zret; // Zlib return code
 
+    if (!src)
+        return AVERROR_INVALIDDATA;
+
     zs.zalloc = NULL;
     zs.zfree  = NULL;
     zs.opaque = NULL;
@@ -152,7 +157,8 @@ static int flashsv2_prime(FlashSVContext *s, uint8_t *src, int size)
     s->zstream.avail_out = s->block_size * 3;
     inflate(&s->zstream, Z_SYNC_FLUSH);
 
-    deflateInit(&zs, 0);
+    if (deflateInit(&zs, 0) != Z_OK)
+        return -1;
     zs.next_in   = s->tmpblock;
     zs.avail_in  = s->block_size * 3 - s->zstream.avail_out;
     zs.next_out  = s->deflate_block;
@@ -228,10 +234,15 @@ static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt,
         }
     } else {
         /* hybrid 15-bit/palette mode */
-        decode_hybrid(s->tmpblock, s->frame->data[0],
+        ret = decode_hybrid(s->tmpblock, s->zstream.next_out,
+                      s->frame->data[0],
                       s->image_height - (y_pos + 1 + s->diff_height),
                       x_pos, s->diff_height, width,
                       s->frame->linesize[0], s->pal);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "decode_hybrid failed\n");
+            return ret;
+        }
     }
     skip_bits_long(gb, 8 * block_size); /* skip the consumed bits */
     return 0;
@@ -260,6 +271,8 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
     FlashSVContext *s = avctx->priv_data;
     int h_blocks, v_blocks, h_part, v_part, i, j, ret;
     GetBitContext gb;
+    int last_blockwidth = s->block_width;
+    int last_blockheight= s->block_height;
 
     /* no supplementary picture */
     if (buf_size == 0)
@@ -267,7 +280,8 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
     if (buf_size < 4)
         return -1;
 
-    init_get_bits(&gb, avpkt->data, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, avpkt->data, buf_size)) < 0)
+        return ret;
 
     /* start to parse the bitstream */
     s->block_width  = 16 * (get_bits(&gb, 4) + 1);
@@ -275,6 +289,10 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
     s->block_height = 16 * (get_bits(&gb, 4) + 1);
     s->image_height = get_bits(&gb, 12);
 
+    if (   last_blockwidth != s->block_width
+        || last_blockheight!= s->block_height)
+        av_freep(&s->blocks);
+
     if (s->ver == 2) {
         skip_bits(&gb, 6);
         if (get_bits1(&gb)) {
@@ -322,8 +340,8 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
 
     /* initialize the image size once */
     if (avctx->width == 0 && avctx->height == 0) {
-        avctx->width  = s->image_width;
-        avctx->height = s->image_height;
+        if ((ret = ff_set_dimensions(avctx, s->image_width, s->image_height)) < 0)
+            return ret;
     }
 
     /* check for changes of image width and image height */
@@ -342,19 +360,17 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
         if ((err = av_reallocp(&s->keyframedata, avpkt->size)) < 0)
             return err;
         memcpy(s->keyframedata, avpkt->data, avpkt->size);
-        if ((err = av_reallocp(&s->blocks, (v_blocks + !!v_part) *
-                               (h_blocks + !!h_part) * sizeof(s->blocks[0]))) < 0)
-            return err;
     }
+    if(s->ver == 2 && !s->blocks)
+        s->blocks = av_mallocz((v_blocks + !!v_part) * (h_blocks + !!h_part) *
+                               sizeof(s->blocks[0]));
 
     ff_dlog(avctx, "image: %dx%d block: %dx%d num: %dx%d part: %dx%d\n",
             s->image_width, s->image_height, s->block_width, s->block_height,
             h_blocks, v_blocks, h_part, v_part);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     /* loop over all block columns */
     for (j = 0; j < v_blocks + (v_part ? 1 : 0); j++) {
diff --git a/libavcodec/flashsv2enc.c b/libavcodec/flashsv2enc.c
new file mode 100644
index 0000000000..65db112696
--- /dev/null
+++ b/libavcodec/flashsv2enc.c
@@ -0,0 +1,922 @@
+/*
+ * Flash Screen Video Version 2 encoder
+ * Copyright (C) 2009 Joshua Warner
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Flash Screen Video Version 2 encoder
+ * @author Joshua Warner
+ */
+
+/* Differences from version 1 stream:
+ * NOTE: Currently, the only player that supports version 2 streams is Adobe Flash Player itself.
+ * * Supports sending only a range of scanlines in a block,
+ *   indicating a difference from the corresponding block in the last keyframe.
+ * * Supports initializing the zlib dictionary with data from the corresponding
+ *   block in the last keyframe, to improve compression.
+ * * Supports a hybrid 15-bit rgb / 7-bit palette color space.
+ */
+
+/* TODO:
+ * Don't keep Block structures for both current frame and keyframe.
+ * Make better heuristics for deciding stream parameters (optimum_* functions).  Currently these return constants.
+ * Figure out how to encode palette information in the stream, choose an optimum palette at each keyframe.
+ * Figure out how the zlibPrimeCompressCurrent flag works, implement support.
+ * Find other sample files (that weren't generated here), develop a decoder.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <zlib.h>
+
+#include "libavutil/imgutils.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "put_bits.h"
+#include "bytestream.h"
+
+#define HAS_IFRAME_IMAGE 0x02
+#define HAS_PALLET_INFO 0x01
+
+#define COLORSPACE_BGR 0x00
+#define COLORSPACE_15_7 0x10
+#define HAS_DIFF_BLOCKS 0x04
+#define ZLIB_PRIME_COMPRESS_CURRENT 0x02
+#define ZLIB_PRIME_COMPRESS_PREVIOUS 0x01
+
+// Disables experimental "smart" parameter-choosing code, as well as the statistics that it depends on.
+// At the moment, the "smart" code is a great example of how the parameters *shouldn't* be chosen.
+#define FLASHSV2_DUMB
+
+typedef struct Block {
+    uint8_t *enc;
+    uint8_t *sl_begin, *sl_end;
+    int enc_size;
+    uint8_t *data;
+    unsigned long data_size;
+
+    uint8_t start, len;
+    uint8_t dirty;
+    uint8_t col, row, width, height;
+    uint8_t flags;
+} Block;
+
+typedef struct Palette {
+    unsigned colors[128];
+    uint8_t index[1 << 15];
+} Palette;
+
+typedef struct FlashSV2Context {
+    AVCodecContext *avctx;
+    uint8_t *current_frame;
+    uint8_t *key_frame;
+    uint8_t *encbuffer;
+    uint8_t *keybuffer;
+    uint8_t *databuffer;
+
+    uint8_t *blockbuffer;
+    int blockbuffer_size;
+
+    Block *frame_blocks;
+    Block *key_blocks;
+    int frame_size;
+    int blocks_size;
+
+    int use15_7, dist, comp;
+
+    int rows, cols;
+
+    int last_key_frame;
+
+    int image_width, image_height;
+    int block_width, block_height;
+    uint8_t flags;
+    uint8_t use_custom_palette;
+    uint8_t palette_type;       ///< 0=>default, 1=>custom - changed when palette regenerated.
+    Palette palette;
+#ifndef FLASHSV2_DUMB
+    double tot_blocks;          ///< blocks encoded since last keyframe
+    double diff_blocks;         ///< blocks that were different since last keyframe
+    double tot_lines;           ///< total scanlines in image since last keyframe
+    double diff_lines;          ///< scanlines that were different since last keyframe
+    double raw_size;            ///< size of raw frames since last keyframe
+    double comp_size;           ///< size of compressed data since last keyframe
+    double uncomp_size;         ///< size of uncompressed data since last keyframe
+
+    double total_bits;          ///< total bits written to stream so far
+#endif
+} FlashSV2Context;
+
+static av_cold void cleanup(FlashSV2Context * s)
+{
+    av_freep(&s->encbuffer);
+    av_freep(&s->keybuffer);
+    av_freep(&s->databuffer);
+    av_freep(&s->blockbuffer);
+    av_freep(&s->current_frame);
+    av_freep(&s->key_frame);
+
+    av_freep(&s->frame_blocks);
+    av_freep(&s->key_blocks);
+}
+
+static void init_blocks(FlashSV2Context * s, Block * blocks,
+                        uint8_t * encbuf, uint8_t * databuf)
+{
+    int row, col;
+    Block *b;
+    for (col = 0; col < s->cols; col++) {
+        for (row = 0; row < s->rows; row++) {
+            b = blocks + (col + row * s->cols);
+            b->width = (col < s->cols - 1) ?
+                s->block_width :
+                s->image_width - col * s->block_width;
+
+            b->height = (row < s->rows - 1) ?
+                s->block_height :
+                s->image_height - row * s->block_height;
+
+            b->row   = row;
+            b->col   = col;
+            b->enc   = encbuf;
+            b->data  = databuf;
+            encbuf  += b->width * b->height * 3;
+            databuf += !databuf ? 0 : b->width * b->height * 6;
+        }
+    }
+}
+
+static void reset_stats(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    s->diff_blocks = 0.1;
+    s->tot_blocks = 1;
+    s->diff_lines = 0.1;
+    s->tot_lines = 1;
+    s->raw_size = s->comp_size = s->uncomp_size = 10;
+#endif
+}
+
+static av_cold int flashsv2_encode_init(AVCodecContext * avctx)
+{
+    FlashSV2Context *s = avctx->priv_data;
+
+    s->avctx = avctx;
+
+    s->comp = avctx->compression_level;
+    if (s->comp == -1)
+        s->comp = 9;
+    if (s->comp < 0 || s->comp > 9) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Compression level should be 0-9, not %d\n", s->comp);
+        return -1;
+    }
+
+
+    if ((avctx->width > 4095) || (avctx->height > 4095)) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Input dimensions too large, input must be max 4095x4095 !\n");
+        return -1;
+    }
+    if ((avctx->width < 16) || (avctx->height < 16)) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Input dimensions too small, input must be at least 16x16 !\n");
+        return -1;
+    }
+
+    if (av_image_check_size(avctx->width, avctx->height, 0, avctx) < 0)
+        return -1;
+
+
+    s->last_key_frame = 0;
+
+    s->image_width  = avctx->width;
+    s->image_height = avctx->height;
+
+    s->block_width  = (s->image_width /  12) & ~15;
+    s->block_height = (s->image_height / 12) & ~15;
+
+    if(!s->block_width)
+        s->block_width = 1;
+    if(!s->block_height)
+        s->block_height = 1;
+
+    s->rows = (s->image_height + s->block_height - 1) / s->block_height;
+    s->cols = (s->image_width +  s->block_width -  1) / s->block_width;
+
+    s->frame_size  = s->image_width * s->image_height * 3;
+    s->blocks_size = s->rows * s->cols * sizeof(Block);
+
+    s->encbuffer     = av_mallocz(s->frame_size);
+    s->keybuffer     = av_mallocz(s->frame_size);
+    s->databuffer    = av_mallocz(s->frame_size * 6);
+    s->current_frame = av_mallocz(s->frame_size);
+    s->key_frame     = av_mallocz(s->frame_size);
+    s->frame_blocks  = av_mallocz(s->blocks_size);
+    s->key_blocks    = av_mallocz(s->blocks_size);
+
+    s->blockbuffer      = NULL;
+    s->blockbuffer_size = 0;
+
+    init_blocks(s, s->frame_blocks, s->encbuffer, s->databuffer);
+    init_blocks(s, s->key_blocks,   s->keybuffer, 0);
+    reset_stats(s);
+#ifndef FLASHSV2_DUMB
+    s->total_bits = 1;
+#endif
+
+    s->use_custom_palette =  0;
+    s->palette_type       = -1;        // so that the palette will be generated in reconfigure_at_keyframe
+
+    if (!s->encbuffer || !s->keybuffer || !s->databuffer
+        || !s->current_frame || !s->key_frame || !s->key_blocks
+        || !s->frame_blocks) {
+        av_log(avctx, AV_LOG_ERROR, "Memory allocation failed.\n");
+        cleanup(s);
+        return -1;
+    }
+
+    return 0;
+}
+
+static int new_key_frame(FlashSV2Context * s)
+{
+    int i;
+    memcpy(s->key_blocks, s->frame_blocks, s->blocks_size);
+    memcpy(s->key_frame, s->current_frame, s->frame_size);
+
+    for (i = 0; i < s->rows * s->cols; i++) {
+        s->key_blocks[i].enc += (s->keybuffer - s->encbuffer);
+        s->key_blocks[i].sl_begin = 0;
+        s->key_blocks[i].sl_end   = 0;
+        s->key_blocks[i].data     = 0;
+    }
+    memcpy(s->keybuffer, s->encbuffer, s->frame_size);
+
+    return 0;
+}
+
+static int write_palette(FlashSV2Context * s, uint8_t * buf, int buf_size)
+{
+    //this isn't implemented yet!  Default palette only!
+    return -1;
+}
+
+static int write_header(FlashSV2Context * s, uint8_t * buf, int buf_size)
+{
+    PutBitContext pb;
+    int buf_pos, len;
+
+    if (buf_size < 5)
+        return -1;
+
+    init_put_bits(&pb, buf, buf_size);
+
+    put_bits(&pb, 4, (s->block_width  >> 4) - 1);
+    put_bits(&pb, 12, s->image_width);
+    put_bits(&pb, 4, (s->block_height >> 4) - 1);
+    put_bits(&pb, 12, s->image_height);
+
+    flush_put_bits(&pb);
+    buf_pos = 4;
+
+    buf[buf_pos++] = s->flags;
+
+    if (s->flags & HAS_PALLET_INFO) {
+        len = write_palette(s, buf + buf_pos, buf_size - buf_pos);
+        if (len < 0)
+            return -1;
+        buf_pos += len;
+    }
+
+    return buf_pos;
+}
+
+static int write_block(Block * b, uint8_t * buf, int buf_size)
+{
+    int buf_pos = 0;
+    unsigned block_size = b->data_size;
+
+    if (b->flags & HAS_DIFF_BLOCKS)
+        block_size += 2;
+    if (b->flags & ZLIB_PRIME_COMPRESS_CURRENT)
+        block_size += 2;
+    if (block_size > 0)
+        block_size += 1;
+    if (buf_size < block_size + 2)
+        return -1;
+
+    buf[buf_pos++] = block_size >> 8;
+    buf[buf_pos++] = block_size;
+
+    if (block_size == 0)
+        return buf_pos;
+
+    buf[buf_pos++] = b->flags;
+
+    if (b->flags & HAS_DIFF_BLOCKS) {
+        buf[buf_pos++] = (b->start);
+        buf[buf_pos++] = (b->len);
+    }
+
+    if (b->flags & ZLIB_PRIME_COMPRESS_CURRENT) {
+        //This feature of the format is poorly understood, and as of now, unused.
+        buf[buf_pos++] = (b->col);
+        buf[buf_pos++] = (b->row);
+    }
+
+    memcpy(buf + buf_pos, b->data, b->data_size);
+
+    buf_pos += b->data_size;
+
+    return buf_pos;
+}
+
+static int encode_zlib(Block * b, uint8_t * buf, unsigned long *buf_size, int comp)
+{
+    int res = compress2(buf, buf_size, b->sl_begin, b->sl_end - b->sl_begin, comp);
+    return res == Z_OK ? 0 : -1;
+}
+
+static int encode_zlibprime(Block * b, Block * prime, uint8_t * buf,
+                            int *buf_size, int comp)
+{
+    z_stream s;
+    int res;
+    s.zalloc = NULL;
+    s.zfree  = NULL;
+    s.opaque = NULL;
+    res = deflateInit(&s, comp);
+    if (res < 0)
+        return -1;
+
+    s.next_in  = prime->enc;
+    s.avail_in = prime->enc_size;
+    while (s.avail_in > 0) {
+        s.next_out  = buf;
+        s.avail_out = *buf_size;
+        res = deflate(&s, Z_SYNC_FLUSH);
+        if (res < 0)
+            return -1;
+    }
+
+    s.next_in   = b->sl_begin;
+    s.avail_in  = b->sl_end - b->sl_begin;
+    s.next_out  = buf;
+    s.avail_out = *buf_size;
+    res = deflate(&s, Z_FINISH);
+    deflateEnd(&s);
+    *buf_size -= s.avail_out;
+    if (res != Z_STREAM_END)
+        return -1;
+    return 0;
+}
+
+static int encode_bgr(Block * b, const uint8_t * src, int stride)
+{
+    int i;
+    uint8_t *ptr = b->enc;
+    for (i = 0; i < b->start; i++)
+        memcpy(ptr + i * b->width * 3, src + i * stride, b->width * 3);
+    b->sl_begin = ptr + i * b->width * 3;
+    for (; i < b->start + b->len; i++)
+        memcpy(ptr + i * b->width * 3, src + i * stride, b->width * 3);
+    b->sl_end = ptr + i * b->width * 3;
+    for (; i < b->height; i++)
+        memcpy(ptr + i * b->width * 3, src + i * stride, b->width * 3);
+    b->enc_size = ptr + i * b->width * 3 - b->enc;
+    return b->enc_size;
+}
+
+static inline unsigned pixel_color15(const uint8_t * src)
+{
+    return (src[0] >> 3) | ((src[1] & 0xf8) << 2) | ((src[2] & 0xf8) << 7);
+}
+
+static inline unsigned int chroma_diff(unsigned int c1, unsigned int c2)
+{
+#define ABSDIFF(a,b) (abs((int)(a)-(int)(b)))
+
+    unsigned int t1 = (c1 & 0x000000ff) + ((c1 & 0x0000ff00) >> 8) + ((c1 & 0x00ff0000) >> 16);
+    unsigned int t2 = (c2 & 0x000000ff) + ((c2 & 0x0000ff00) >> 8) + ((c2 & 0x00ff0000) >> 16);
+
+    return ABSDIFF(t1, t2) + ABSDIFF(c1 & 0x000000ff, c2 & 0x000000ff) +
+        ABSDIFF((c1 & 0x0000ff00) >> 8 , (c2 & 0x0000ff00) >> 8) +
+        ABSDIFF((c1 & 0x00ff0000) >> 16, (c2 & 0x00ff0000) >> 16);
+}
+
+static inline int pixel_color7_fast(Palette * palette, unsigned c15)
+{
+    return palette->index[c15];
+}
+
+static int pixel_color7_slow(Palette * palette, unsigned color)
+{
+    int i, min = 0x7fffffff;
+    int minc = -1;
+    for (i = 0; i < 128; i++) {
+        int c1 = palette->colors[i];
+        int diff = chroma_diff(c1, color);
+        if (diff < min) {
+            min = diff;
+            minc = i;
+        }
+    }
+    return minc;
+}
+
+static inline unsigned pixel_bgr(const uint8_t * src)
+{
+    return (src[0]) | (src[1] << 8) | (src[2] << 16);
+}
+
+static int write_pixel_15_7(Palette * palette, uint8_t * dest, const uint8_t * src,
+                            int dist)
+{
+    unsigned c15 = pixel_color15(src);
+    unsigned color = pixel_bgr(src);
+    int d15 = chroma_diff(color, color & 0x00f8f8f8);
+    int c7 = pixel_color7_fast(palette, c15);
+    int d7 = chroma_diff(color, palette->colors[c7]);
+    if (dist + d15 >= d7) {
+        dest[0] = c7;
+        return 1;
+    } else {
+        dest[0] = 0x80 | (c15 >> 8);
+        dest[1] = c15 & 0xff;
+        return 2;
+    }
+}
+
+static int update_palette_index(Palette * palette)
+{
+    int r, g, b;
+    unsigned int bgr, c15, index;
+    for (r = 4; r < 256; r += 8) {
+        for (g = 4; g < 256; g += 8) {
+            for (b = 4; b < 256; b += 8) {
+                bgr = b | (g << 8) | (r << 16);
+                c15 = (b >> 3) | ((g & 0xf8) << 2) | ((r & 0xf8) << 7);
+                index = pixel_color7_slow(palette, bgr);
+
+                palette->index[c15] = index;
+            }
+        }
+    }
+    return 0;
+}
+
+static const unsigned int default_screen_video_v2_palette[128] = {
+    0x00000000, 0x00333333, 0x00666666, 0x00999999, 0x00CCCCCC, 0x00FFFFFF,
+    0x00330000, 0x00660000, 0x00990000, 0x00CC0000, 0x00FF0000, 0x00003300,
+    0x00006600, 0x00009900, 0x0000CC00, 0x0000FF00, 0x00000033, 0x00000066,
+    0x00000099, 0x000000CC, 0x000000FF, 0x00333300, 0x00666600, 0x00999900,
+    0x00CCCC00, 0x00FFFF00, 0x00003333, 0x00006666, 0x00009999, 0x0000CCCC,
+    0x0000FFFF, 0x00330033, 0x00660066, 0x00990099, 0x00CC00CC, 0x00FF00FF,
+    0x00FFFF33, 0x00FFFF66, 0x00FFFF99, 0x00FFFFCC, 0x00FF33FF, 0x00FF66FF,
+    0x00FF99FF, 0x00FFCCFF, 0x0033FFFF, 0x0066FFFF, 0x0099FFFF, 0x00CCFFFF,
+    0x00CCCC33, 0x00CCCC66, 0x00CCCC99, 0x00CCCCFF, 0x00CC33CC, 0x00CC66CC,
+    0x00CC99CC, 0x00CCFFCC, 0x0033CCCC, 0x0066CCCC, 0x0099CCCC, 0x00FFCCCC,
+    0x00999933, 0x00999966, 0x009999CC, 0x009999FF, 0x00993399, 0x00996699,
+    0x0099CC99, 0x0099FF99, 0x00339999, 0x00669999, 0x00CC9999, 0x00FF9999,
+    0x00666633, 0x00666699, 0x006666CC, 0x006666FF, 0x00663366, 0x00669966,
+    0x0066CC66, 0x0066FF66, 0x00336666, 0x00996666, 0x00CC6666, 0x00FF6666,
+    0x00333366, 0x00333399, 0x003333CC, 0x003333FF, 0x00336633, 0x00339933,
+    0x0033CC33, 0x0033FF33, 0x00663333, 0x00993333, 0x00CC3333, 0x00FF3333,
+    0x00003366, 0x00336600, 0x00660033, 0x00006633, 0x00330066, 0x00663300,
+    0x00336699, 0x00669933, 0x00993366, 0x00339966, 0x00663399, 0x00996633,
+    0x006699CC, 0x0099CC66, 0x00CC6699, 0x0066CC99, 0x009966CC, 0x00CC9966,
+    0x0099CCFF, 0x00CCFF99, 0x00FF99CC, 0x0099FFCC, 0x00CC99FF, 0x00FFCC99,
+    0x00111111, 0x00222222, 0x00444444, 0x00555555, 0x00AAAAAA, 0x00BBBBBB,
+    0x00DDDDDD, 0x00EEEEEE
+};
+
+static int generate_default_palette(Palette * palette)
+{
+    memcpy(palette->colors, default_screen_video_v2_palette,
+           sizeof(default_screen_video_v2_palette));
+
+    return update_palette_index(palette);
+}
+
+static int generate_optimum_palette(Palette * palette, const uint8_t * image,
+                                   int width, int height, int stride)
+{
+    //this isn't implemented yet!  Default palette only!
+    return -1;
+}
+
+static inline int encode_15_7_sl(Palette * palette, uint8_t * dest,
+                                 const uint8_t * src, int width, int dist)
+{
+    int len = 0, x;
+    for (x = 0; x < width; x++) {
+        len += write_pixel_15_7(palette, dest + len, src + 3 * x, dist);
+    }
+    return len;
+}
+
+static int encode_15_7(Palette * palette, Block * b, const uint8_t * src,
+                       int stride, int dist)
+{
+    int i;
+    uint8_t *ptr = b->enc;
+    for (i = 0; i < b->start; i++)
+        ptr += encode_15_7_sl(palette, ptr, src + i * stride, b->width, dist);
+    b->sl_begin = ptr;
+    for (; i < b->start + b->len; i++)
+        ptr += encode_15_7_sl(palette, ptr, src + i * stride, b->width, dist);
+    b->sl_end = ptr;
+    for (; i < b->height; i++)
+        ptr += encode_15_7_sl(palette, ptr, src + i * stride, b->width, dist);
+    b->enc_size = ptr - b->enc;
+    return b->enc_size;
+}
+
+static int encode_block(FlashSV2Context *s, Palette * palette, Block * b,
+                        Block * prev, const uint8_t * src, int stride, int comp,
+                        int dist, int keyframe)
+{
+    unsigned buf_size = b->width * b->height * 6;
+    uint8_t *buf = s->blockbuffer;
+    int res;
+
+    if (b->flags & COLORSPACE_15_7) {
+        encode_15_7(palette, b, src, stride, dist);
+    } else {
+        encode_bgr(b, src, stride);
+    }
+
+    if (b->len > 0) {
+        b->data_size = buf_size;
+        res = encode_zlib(b, b->data, &b->data_size, comp);
+        if (res)
+            return res;
+
+        if (!keyframe) {
+            res = encode_zlibprime(b, prev, buf, &buf_size, comp);
+            if (res)
+                return res;
+
+            if (buf_size < b->data_size) {
+                b->data_size = buf_size;
+                memcpy(b->data, buf, buf_size);
+                b->flags |= ZLIB_PRIME_COMPRESS_PREVIOUS;
+            }
+        }
+    } else {
+        b->data_size = 0;
+    }
+    return 0;
+}
+
+static int compare_sl(FlashSV2Context * s, Block * b, const uint8_t * src,
+                      uint8_t * frame, uint8_t * key, int y, int keyframe)
+{
+    if (memcmp(src, frame, b->width * 3) != 0) {
+        b->dirty = 1;
+        memcpy(frame, src, b->width * 3);
+#ifndef FLASHSV2_DUMB
+        s->diff_lines++;
+#endif
+    }
+    if (memcmp(src, key, b->width * 3) != 0) {
+        if (b->len == 0)
+            b->start = y;
+        b->len = y + 1 - b->start;
+    }
+    return 0;
+}
+
+static int mark_all_blocks(FlashSV2Context * s, const uint8_t * src, int stride,
+                           int keyframe)
+{
+    int sl, rsl, col, pos, possl;
+    Block *b;
+    for (sl = s->image_height - 1; sl >= 0; sl--) {
+        for (col = 0; col < s->cols; col++) {
+            rsl = s->image_height - sl - 1;
+            b = s->frame_blocks + col + rsl / s->block_height * s->cols;
+            possl = stride * sl + col * s->block_width * 3;
+            pos = s->image_width * rsl * 3 + col * s->block_width * 3;
+            compare_sl(s, b, src + possl, s->current_frame + pos,
+                       s->key_frame + pos, rsl % s->block_height, keyframe);
+        }
+    }
+#ifndef FLASHSV2_DUMB
+    s->tot_lines += s->image_height * s->cols;
+#endif
+    return 0;
+}
+
+static int encode_all_blocks(FlashSV2Context * s, int keyframe)
+{
+    int row, col, res;
+    uint8_t *data;
+    Block *b, *prev;
+    for (row = 0; row < s->rows; row++) {
+        for (col = 0; col < s->cols; col++) {
+            b = s->frame_blocks + (row * s->cols + col);
+            prev = s->key_blocks + (row * s->cols + col);
+            b->flags = s->use15_7 ? COLORSPACE_15_7 : 0;
+            if (keyframe) {
+                b->start = 0;
+                b->len = b->height;
+            } else if (!b->dirty) {
+                b->start = 0;
+                b->len = 0;
+                b->data_size = 0;
+                continue;
+            } else if (b->start != 0 || b->len != b->height) {
+                b->flags |= HAS_DIFF_BLOCKS;
+            }
+            data = s->current_frame + s->image_width * 3 * s->block_height * row + s->block_width * col * 3;
+            res = encode_block(s, &s->palette, b, prev, data, s->image_width * 3, s->comp, s->dist, keyframe);
+#ifndef FLASHSV2_DUMB
+            if (b->dirty)
+                s->diff_blocks++;
+            s->comp_size += b->data_size;
+            s->uncomp_size += b->enc_size;
+#endif
+            if (res)
+                return res;
+        }
+    }
+#ifndef FLASHSV2_DUMB
+    s->raw_size += s->image_width * s->image_height * 3;
+    s->tot_blocks += s->rows * s->cols;
+#endif
+    return 0;
+}
+
+static int write_all_blocks(FlashSV2Context * s, uint8_t * buf,
+                            int buf_size)
+{
+    int row, col, buf_pos = 0, len;
+    Block *b;
+    for (row = 0; row < s->rows; row++) {
+        for (col = 0; col < s->cols; col++) {
+            b = s->frame_blocks + row * s->cols + col;
+            len = write_block(b, buf + buf_pos, buf_size - buf_pos);
+            b->start = b->len = b->dirty = 0;
+            if (len < 0)
+                return len;
+            buf_pos += len;
+        }
+    }
+    return buf_pos;
+}
+
+static int write_bitstream(FlashSV2Context * s, const uint8_t * src, int stride,
+                           uint8_t * buf, int buf_size, int keyframe)
+{
+    int buf_pos, res;
+
+    res = mark_all_blocks(s, src, stride, keyframe);
+    if (res)
+        return res;
+    res = encode_all_blocks(s, keyframe);
+    if (res)
+        return res;
+
+    res = write_header(s, buf, buf_size);
+    if (res < 0) {
+        return res;
+    } else {
+        buf_pos = res;
+    }
+    res = write_all_blocks(s, buf + buf_pos, buf_size - buf_pos);
+    if (res < 0)
+        return res;
+    buf_pos += res;
+#ifndef FLASHSV2_DUMB
+    s->total_bits += ((double) buf_pos) * 8.0;
+#endif
+
+    return buf_pos;
+}
+
+static void recommend_keyframe(FlashSV2Context * s, int *keyframe)
+{
+#ifndef FLASHSV2_DUMB
+    double block_ratio, line_ratio, enc_ratio, comp_ratio, data_ratio;
+    if (s->avctx->gop_size > 0) {
+        block_ratio = s->diff_blocks / s->tot_blocks;
+        line_ratio = s->diff_lines / s->tot_lines;
+        enc_ratio = s->uncomp_size / s->raw_size;
+        comp_ratio = s->comp_size / s->uncomp_size;
+        data_ratio = s->comp_size / s->raw_size;
+
+        if ((block_ratio >= 0.5 && line_ratio / block_ratio <= 0.5) || line_ratio >= 0.95) {
+            *keyframe = 1;
+            return;
+        }
+    }
+#else
+    return;
+#endif
+}
+
+#ifndef FLASHSV2_DUMB
+static const double block_size_fraction = 1.0 / 300;
+static const double use15_7_threshold = 8192;
+static const double color15_7_factor = 100;
+#endif
+static int optimum_block_width(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    double save = (1-pow(s->diff_lines/s->diff_blocks/s->block_height, 0.5)) * s->comp_size/s->tot_blocks;
+    double width = block_size_fraction * sqrt(0.5 * save * s->rows * s->cols) * s->image_width;
+    int pwidth = ((int) width);
+    return FFCLIP(pwidth & ~15, 256, 16);
+#else
+    return 64;
+#endif
+}
+
+static int optimum_block_height(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    double save = (1-pow(s->diff_lines/s->diff_blocks/s->block_height, 0.5)) * s->comp_size/s->tot_blocks;
+    double height = block_size_fraction * sqrt(0.5 * save * s->rows * s->cols) * s->image_height;
+    int pheight = ((int) height);
+    return FFCLIP(pheight & ~15, 256, 16);
+#else
+    return 64;
+#endif
+}
+
+static int optimum_use15_7(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    double ideal = ((double)(s->avctx->bit_rate * s->avctx->time_base.den * s->avctx->ticks_per_frame)) /
+        ((double) s->avctx->time_base.num) * s->avctx->frame_number;
+    if (ideal + use15_7_threshold < s->total_bits) {
+        return 1;
+    } else {
+        return 0;
+    }
+#else
+    return s->avctx->global_quality == 0;
+#endif
+}
+
+static int optimum_dist(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    double ideal =
+        s->avctx->bit_rate * s->avctx->time_base.den *
+        s->avctx->ticks_per_frame;
+    int dist = pow((s->total_bits / ideal) * color15_7_factor, 3);
+    av_log(s->avctx, AV_LOG_DEBUG, "dist: %d\n", dist);
+    return dist;
+#else
+    return 15;
+#endif
+}
+
+
+static int reconfigure_at_keyframe(FlashSV2Context * s, const uint8_t * image,
+                                   int stride)
+{
+    int update_palette = 0;
+    int res;
+    int block_width  = optimum_block_width (s);
+    int block_height = optimum_block_height(s);
+
+    s->rows = (s->image_height + block_height - 1) / block_height;
+    s->cols = (s->image_width  + block_width  - 1) / block_width;
+
+    if (block_width != s->block_width || block_height != s->block_height) {
+        s->block_width  = block_width;
+        s->block_height = block_height;
+        if (s->rows * s->cols > s->blocks_size / sizeof(Block)) {
+            s->frame_blocks = av_realloc_array(s->frame_blocks, s->rows, s->cols * sizeof(Block));
+            s->key_blocks = av_realloc_array(s->key_blocks, s->cols, s->rows * sizeof(Block));
+            if (!s->frame_blocks || !s->key_blocks) {
+                av_log(s->avctx, AV_LOG_ERROR, "Memory allocation failed.\n");
+                return -1;
+            }
+            s->blocks_size = s->rows * s->cols * sizeof(Block);
+        }
+        init_blocks(s, s->frame_blocks, s->encbuffer, s->databuffer);
+        init_blocks(s, s->key_blocks, s->keybuffer, 0);
+
+        av_fast_malloc(&s->blockbuffer, &s->blockbuffer_size, block_width * block_height * 6);
+        if (!s->blockbuffer) {
+            av_log(s->avctx, AV_LOG_ERROR, "Could not allocate block buffer.\n");
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    s->use15_7 = optimum_use15_7(s);
+    if (s->use15_7) {
+        if ((s->use_custom_palette && s->palette_type != 1) || update_palette) {
+            res = generate_optimum_palette(&s->palette, image, s->image_width, s->image_height, stride);
+            if (res)
+                return res;
+            s->palette_type = 1;
+            av_log(s->avctx, AV_LOG_DEBUG, "Generated optimum palette\n");
+        } else if (!s->use_custom_palette && s->palette_type != 0) {
+            res = generate_default_palette(&s->palette);
+            if (res)
+                return res;
+            s->palette_type = 0;
+            av_log(s->avctx, AV_LOG_DEBUG, "Generated default palette\n");
+        }
+    }
+
+
+    reset_stats(s);
+
+    return 0;
+}
+
+static int flashsv2_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                 const AVFrame *p, int *got_packet)
+{
+    FlashSV2Context *const s = avctx->priv_data;
+    int res;
+    int keyframe = 0;
+
+    if ((res = ff_alloc_packet2(avctx, pkt, s->frame_size + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return res;
+
+    /* First frame needs to be a keyframe */
+    if (avctx->frame_number == 0)
+        keyframe = 1;
+
+    /* Check the placement of keyframes */
+    if (avctx->gop_size > 0) {
+        if (avctx->frame_number >= s->last_key_frame + avctx->gop_size)
+            keyframe = 1;
+    }
+
+    if (!keyframe
+        && avctx->frame_number > s->last_key_frame + avctx->keyint_min) {
+        recommend_keyframe(s, &keyframe);
+        if (keyframe)
+            av_log(avctx, AV_LOG_DEBUG, "Recommending key frame at frame %d\n", avctx->frame_number);
+    }
+
+    if (keyframe) {
+        res = reconfigure_at_keyframe(s, p->data[0], p->linesize[0]);
+        if (res)
+            return res;
+    }
+
+    if (s->use15_7)
+        s->dist = optimum_dist(s);
+
+    res = write_bitstream(s, p->data[0], p->linesize[0], pkt->data, pkt->size, keyframe);
+
+    if (keyframe) {
+        new_key_frame(s);
+        s->last_key_frame = avctx->frame_number;
+        pkt->flags |= AV_PKT_FLAG_KEY;
+        av_log(avctx, AV_LOG_DEBUG, "Inserting key frame at frame %d\n", avctx->frame_number);
+    }
+
+    pkt->size = res;
+    *got_packet = 1;
+
+    return 0;
+}
+
+static av_cold int flashsv2_encode_end(AVCodecContext * avctx)
+{
+    FlashSV2Context *s = avctx->priv_data;
+
+    cleanup(s);
+
+    return 0;
+}
+
+AVCodec ff_flashsv2_encoder = {
+    .name           = "flashsv2",
+    .long_name      = NULL_IF_CONFIG_SMALL("Flash Screen Video Version 2"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_FLASHSV2,
+    .priv_data_size = sizeof(FlashSV2Context),
+    .init           = flashsv2_encode_init,
+    .encode2        = flashsv2_encode_frame,
+    .close          = flashsv2_encode_end,
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_BGR24, AV_PIX_FMT_NONE },
+};
diff --git a/libavcodec/flashsvenc.c b/libavcodec/flashsvenc.c
index 7e14e47990..f7f98efde3 100644
--- a/libavcodec/flashsvenc.c
+++ b/libavcodec/flashsvenc.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2004 Alex Beregszaszi
  * Copyright (C) 2006 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -94,9 +94,9 @@ static av_cold int flashsv_encode_end(AVCodecContext *avctx)
 
     deflateEnd(&s->zstream);
 
-    av_free(s->encbuffer);
-    av_free(s->previous_frame);
-    av_free(s->tmpblock);
+    av_freep(&s->encbuffer);
+    av_freep(&s->previous_frame);
+    av_freep(&s->tmpblock);
 
     return 0;
 }
@@ -109,7 +109,7 @@ static av_cold int flashsv_encode_init(AVCodecContext *avctx)
 
     if (avctx->width > 4095 || avctx->height > 4095) {
         av_log(avctx, AV_LOG_ERROR,
-               "Input dimensions too large, input must be max 4096x4096 !\n");
+               "Input dimensions too large, input must be max 4095x4095 !\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -143,7 +143,7 @@ static int encode_bitstream(FlashSVContext *s, const AVFrame *p, uint8_t *buf,
     int buf_pos, res;
     int pred_blocks = 0;
 
-    init_put_bits(&pb, buf, buf_size * 8);
+    init_put_bits(&pb, buf, buf_size);
 
     put_bits(&pb,  4, block_width / 16 - 1);
     put_bits(&pb, 12, s->image_width);
@@ -238,12 +238,8 @@ static int flashsv_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         I_frame = 1;
     }
 
-    if ((res = ff_alloc_packet(pkt, s->image_width * s->image_height * 3)) < 0) {
-        //Conservative upper bound check for compressed data
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n",
-               s->image_width * s->image_height * 3);
+    if ((res = ff_alloc_packet2(avctx, pkt, s->image_width * s->image_height * 3, 0)) < 0)
         return res;
-    }
 
     pkt->size = encode_bitstream(s, p, pkt->data, pkt->size, opt_w * 16, opt_h * 16,
                                  pfptr, &I_frame);
diff --git a/libavcodec/flicvideo.c b/libavcodec/flicvideo.c
index 3a595435b0..3e0573af93 100644
--- a/libavcodec/flicvideo.c
+++ b/libavcodec/flicvideo.c
@@ -1,21 +1,21 @@
 /*
  * FLI/FLC Animation Video Decoder
- * Copyright (C) 2003, 2004 the ffmpeg project
+ * Copyright (c) 2003, 2004 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,7 +64,7 @@
 
 #define CHECK_PIXEL_PTR(n) \
     if (pixel_ptr + n > pixel_limit) { \
-        av_log (s->avctx, AV_LOG_INFO, "Problem: pixel_ptr >= pixel_limit (%d >= %d)\n", \
+        av_log (s->avctx, AV_LOG_ERROR, "Invalid pixel_ptr = %d > pixel_limit = %d\n", \
         pixel_ptr + n, pixel_limit); \
         return AVERROR_INVALIDDATA; \
     } \
@@ -84,22 +84,40 @@ static av_cold int flic_decode_init(AVCodecContext *avctx)
     unsigned char *fli_header = (unsigned char *)avctx->extradata;
     int depth;
 
-    if (avctx->extradata_size != 12 &&
-        avctx->extradata_size != 128) {
-        av_log(avctx, AV_LOG_ERROR, "Expected extradata of 12 or 128 bytes\n");
+    if (avctx->extradata_size != 0 &&
+        avctx->extradata_size != 12 &&
+        avctx->extradata_size != 128 &&
+        avctx->extradata_size != 256 &&
+        avctx->extradata_size != 904 &&
+        avctx->extradata_size != 1024) {
+        av_log(avctx, AV_LOG_ERROR, "Unexpected extradata size %d\n", avctx->extradata_size);
         return AVERROR_INVALIDDATA;
     }
 
     s->avctx = avctx;
 
-    s->fli_type = AV_RL16(&fli_header[4]); /* Might be overridden if a Magic Carpet FLC */
-
-    depth = 0;
     if (s->avctx->extradata_size == 12) {
         /* special case for magic carpet FLIs */
         s->fli_type = FLC_MAGIC_CARPET_SYNTHETIC_TYPE_CODE;
         depth = 8;
+    } else if (avctx->extradata_size == 1024) {
+        uint8_t *ptr = avctx->extradata;
+        int i;
+
+        for (i = 0; i < 256; i++) {
+            s->palette[i] = AV_RL32(ptr);
+            ptr += 4;
+        }
+        depth = 8;
+        /* FLI in MOV, see e.g. FFmpeg trac issue #626 */
+    } else if (avctx->extradata_size == 0 ||
+               avctx->extradata_size == 256 ||
+        /* see FFmpeg ticket #1234 */
+               avctx->extradata_size == 904) {
+        s->fli_type = FLI_TYPE_CODE;
+        depth = 8;
     } else {
+        s->fli_type = AV_RL16(&fli_header[4]);
         depth = AV_RL16(&fli_header[12]);
     }
 
@@ -116,7 +134,7 @@ static av_cold int flic_decode_init(AVCodecContext *avctx)
         case 15 : avctx->pix_fmt = AV_PIX_FMT_RGB555; break;
         case 16 : avctx->pix_fmt = AV_PIX_FMT_RGB565; break;
         case 24 : avctx->pix_fmt = AV_PIX_FMT_BGR24; /* Supposedly BGR, but havent any files to test with */
-                  av_log(avctx, AV_LOG_ERROR, "24Bpp FLC/FLX is unsupported due to no test files.\n");
+                  avpriv_request_sample(avctx, "24Bpp FLC/FLX");
                   return AVERROR_PATCHWELCOME;
         default :
                   av_log(avctx, AV_LOG_ERROR, "Unknown FLC/FLX depth of %d Bpp is unsupported.\n",depth);
@@ -139,7 +157,6 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
     FlicDecodeContext *s = avctx->priv_data;
 
     GetByteContext g2;
-    int stream_ptr_after_color_chunk;
     int pixel_ptr;
     int palette_ptr;
     unsigned char palette_idx1;
@@ -171,14 +188,16 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
 
     bytestream2_init(&g2, buf, buf_size);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     pixels = s->frame->data[0];
     pixel_limit = s->avctx->height * s->frame->linesize[0];
+    if (buf_size < 16 || buf_size > INT_MAX - (3 * 256 + AV_INPUT_BUFFER_PADDING_SIZE))
+        return AVERROR_INVALIDDATA;
     frame_size = bytestream2_get_le32(&g2);
+    if (frame_size > buf_size)
+        frame_size = buf_size;
     bytestream2_skip(&g2, 2); /* skip the magic number */
     num_chunks = bytestream2_get_le16(&g2);
     bytestream2_skip(&g2, 8);  /* skip padding */
@@ -186,15 +205,22 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
     frame_size -= 16;
 
     /* iterate through the chunks */
-    while ((frame_size > 0) && (num_chunks > 0)) {
+    while ((frame_size >= 6) && (num_chunks > 0) &&
+            bytestream2_get_bytes_left(&g2) >= 4) {
+        int stream_ptr_after_chunk;
         chunk_size = bytestream2_get_le32(&g2);
+        if (chunk_size > frame_size) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Invalid chunk_size = %u > frame_size = %u\n", chunk_size, frame_size);
+            chunk_size = frame_size;
+        }
+        stream_ptr_after_chunk = bytestream2_tell(&g2) - 4 + chunk_size;
+
         chunk_type = bytestream2_get_le16(&g2);
 
         switch (chunk_type) {
         case FLI_256_COLOR:
         case FLI_COLOR:
-            stream_ptr_after_color_chunk = bytestream2_tell(&g2) + chunk_size - 6;
-
             /* check special case: If this file is from the Magic Carpet
              * game and uses 6-bit colors even though it reports 256-color
              * chunks in a 0xAF12-type file (fli_type is set to 0xAF13 during
@@ -217,6 +243,9 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                 if (color_changes == 0)
                     color_changes = 256;
 
+                if (bytestream2_tell(&g2) + color_changes * 3 > stream_ptr_after_chunk)
+                    break;
+
                 for (j = 0; j < color_changes; j++) {
                     unsigned int entry;
 
@@ -227,26 +256,22 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                     r = bytestream2_get_byte(&g2) << color_shift;
                     g = bytestream2_get_byte(&g2) << color_shift;
                     b = bytestream2_get_byte(&g2) << color_shift;
-                    entry = (r << 16) | (g << 8) | b;
+                    entry = 0xFFU << 24 | r << 16 | g << 8 | b;
+                    if (color_shift == 2)
+                        entry |= entry >> 6 & 0x30303;
                     if (s->palette[palette_ptr] != entry)
                         s->new_palette = 1;
                     s->palette[palette_ptr++] = entry;
                 }
             }
-
-            /* color chunks sometimes have weird 16-bit alignment issues;
-             * therefore, take the hardline approach and skip
-             * to the value calculated w.r.t. the size specified by the color
-             * chunk header */
-            if (stream_ptr_after_color_chunk - bytestream2_tell(&g2) > 0)
-                bytestream2_skip(&g2, stream_ptr_after_color_chunk - bytestream2_tell(&g2));
-
             break;
 
         case FLI_DELTA:
             y_ptr = 0;
             compressed_lines = bytestream2_get_le16(&g2);
             while (compressed_lines > 0) {
+                if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                    break;
                 line_packets = bytestream2_get_le16(&g2);
                 if ((line_packets & 0xC000) == 0xC000) {
                     // line skip opcode
@@ -265,6 +290,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                     CHECK_PIXEL_PTR(0);
                     pixel_countdown = s->avctx->width;
                     for (i = 0; i < line_packets; i++) {
+                        if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                            break;
                         /* account for the skip bytes */
                         pixel_skip = bytestream2_get_byte(&g2);
                         pixel_ptr += pixel_skip;
@@ -281,6 +308,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                             }
                         } else {
                             CHECK_PIXEL_PTR(byte_run * 2);
+                            if (bytestream2_tell(&g2) + byte_run * 2 > stream_ptr_after_chunk)
+                                break;
                             for (j = 0; j < byte_run * 2; j++, pixel_countdown--) {
                                 pixels[pixel_ptr++] = bytestream2_get_byte(&g2);
                             }
@@ -303,16 +332,22 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                 pixel_ptr = y_ptr;
                 CHECK_PIXEL_PTR(0);
                 pixel_countdown = s->avctx->width;
+                if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                    break;
                 line_packets = bytestream2_get_byte(&g2);
                 if (line_packets > 0) {
                     for (i = 0; i < line_packets; i++) {
                         /* account for the skip bytes */
+                        if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                            break;
                         pixel_skip = bytestream2_get_byte(&g2);
                         pixel_ptr += pixel_skip;
                         pixel_countdown -= pixel_skip;
                         byte_run = sign_extend(bytestream2_get_byte(&g2),8);
                         if (byte_run > 0) {
                             CHECK_PIXEL_PTR(byte_run);
+                            if (bytestream2_tell(&g2) + byte_run > stream_ptr_after_chunk)
+                                break;
                             for (j = 0; j < byte_run; j++, pixel_countdown--) {
                                 pixels[pixel_ptr++] = bytestream2_get_byte(&g2);
                             }
@@ -349,6 +384,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                  bytestream2_skip(&g2, 1);
                 pixel_countdown = s->avctx->width;
                 while (pixel_countdown > 0) {
+                    if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                        break;
                     byte_run = sign_extend(bytestream2_get_byte(&g2), 8);
                     if (!byte_run) {
                         av_log(avctx, AV_LOG_ERROR, "Invalid byte run value.\n");
@@ -368,6 +405,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                     } else {  /* copy bytes if byte_run < 0 */
                         byte_run = -byte_run;
                         CHECK_PIXEL_PTR(byte_run);
+                        if (bytestream2_tell(&g2) + byte_run > stream_ptr_after_chunk)
+                            break;
                         for (j = 0; j < byte_run; j++) {
                             pixels[pixel_ptr++] = bytestream2_get_byte(&g2);
                             pixel_countdown--;
@@ -384,9 +423,9 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
 
         case FLI_COPY:
             /* copy the chunk (uncompressed frame) */
-            if (chunk_size - 6 > s->avctx->width * s->avctx->height) {
+            if (chunk_size - 6 != s->avctx->width * s->avctx->height) {
                 av_log(avctx, AV_LOG_ERROR, "In chunk FLI_COPY : source data (%d bytes) " \
-                       "bigger than image, skipping chunk\n", chunk_size - 6);
+                       "has incorrect size, skipping chunk\n", chunk_size - 6);
                 bytestream2_skip(&g2, chunk_size - 6);
             } else {
                 for (y_ptr = 0; y_ptr < s->frame->linesize[0] * s->avctx->height;
@@ -399,7 +438,6 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
 
         case FLI_MINI:
             /* some sort of a thumbnail? disregard this chunk... */
-            bytestream2_skip(&g2, chunk_size - 6);
             break;
 
         default:
@@ -407,14 +445,16 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
             break;
         }
 
+        if (stream_ptr_after_chunk - bytestream2_tell(&g2) > 0)
+            bytestream2_skip(&g2, stream_ptr_after_chunk - bytestream2_tell(&g2));
+
         frame_size -= chunk_size;
         num_chunks--;
     }
 
     /* by the end of the chunk, the stream ptr should equal the frame
-     * size (minus 1, possibly); if it doesn't, issue a warning */
-    if ((bytestream2_get_bytes_left(&g2) != 0) &&
-        (bytestream2_get_bytes_left(&g2) != 1))
+     * size (minus 1 or 2, possibly); if it doesn't, issue a warning */
+    if (bytestream2_get_bytes_left(&g2) > 2)
         av_log(avctx, AV_LOG_ERROR, "Processed FLI chunk where chunk size = %d " \
                "and final chunk ptr = %d\n", buf_size,
                buf_size - bytestream2_get_bytes_left(&g2));
@@ -467,10 +507,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
 
     bytestream2_init(&g2, buf, buf_size);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     pixels = s->frame->data[0];
     pixel_limit = s->avctx->height * s->frame->linesize[0];
@@ -479,14 +517,26 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
     bytestream2_skip(&g2, 2);  /* skip the magic number */
     num_chunks = bytestream2_get_le16(&g2);
     bytestream2_skip(&g2, 8);  /* skip padding */
+    if (frame_size > buf_size)
+        frame_size = buf_size;
 
     frame_size -= 16;
 
     /* iterate through the chunks */
-    while ((frame_size > 0) && (num_chunks > 0)) {
+    while ((frame_size > 0) && (num_chunks > 0) &&
+            bytestream2_get_bytes_left(&g2) >= 4) {
+        int stream_ptr_after_chunk;
         chunk_size = bytestream2_get_le32(&g2);
+        if (chunk_size > frame_size) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Invalid chunk_size = %u > frame_size = %u\n", chunk_size, frame_size);
+            chunk_size = frame_size;
+        }
+        stream_ptr_after_chunk = bytestream2_tell(&g2) - 4 + chunk_size;
+
         chunk_type = bytestream2_get_le16(&g2);
 
+
         switch (chunk_type) {
         case FLI_256_COLOR:
         case FLI_COLOR:
@@ -504,6 +554,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
             y_ptr = 0;
             compressed_lines = bytestream2_get_le16(&g2);
             while (compressed_lines > 0) {
+                if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                    break;
                 line_packets = bytestream2_get_le16(&g2);
                 if (line_packets < 0) {
                     line_packets = -line_packets;
@@ -515,6 +567,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                     pixel_countdown = s->avctx->width;
                     for (i = 0; i < line_packets; i++) {
                         /* account for the skip bytes */
+                        if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                            break;
                         pixel_skip = bytestream2_get_byte(&g2);
                         pixel_ptr += (pixel_skip*2); /* Pixel is 2 bytes wide */
                         pixel_countdown -= pixel_skip;
@@ -528,6 +582,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                                 pixel_ptr += 2;
                             }
                         } else {
+                            if (bytestream2_tell(&g2) + 2*byte_run > stream_ptr_after_chunk)
+                                break;
                             CHECK_PIXEL_PTR(2 * byte_run);
                             for (j = 0; j < byte_run; j++, pixel_countdown--) {
                                 *((signed short*)(&pixels[pixel_ptr])) = bytestream2_get_le16(&g2);
@@ -562,6 +618,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                 pixel_countdown = (s->avctx->width * 2);
 
                 while (pixel_countdown > 0) {
+                    if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                        break;
                     byte_run = sign_extend(bytestream2_get_byte(&g2), 8);
                     if (byte_run > 0) {
                         palette_idx1 = bytestream2_get_byte(&g2);
@@ -575,6 +633,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                         }
                     } else {  /* copy bytes if byte_run < 0 */
                         byte_run = -byte_run;
+                        if (bytestream2_tell(&g2) + byte_run > stream_ptr_after_chunk)
+                            break;
                         CHECK_PIXEL_PTR(byte_run);
                         for (j = 0; j < byte_run; j++) {
                             palette_idx1 = bytestream2_get_byte(&g2);
@@ -614,6 +674,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                 pixel_countdown = s->avctx->width; /* Width is in pixels, not bytes */
 
                 while (pixel_countdown > 0) {
+                    if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                        break;
                     byte_run = sign_extend(bytestream2_get_byte(&g2), 8);
                     if (byte_run > 0) {
                         pixel    = bytestream2_get_le16(&g2);
@@ -628,6 +690,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                         }
                     } else {  /* copy pixels if byte_run < 0 */
                         byte_run = -byte_run;
+                        if (bytestream2_tell(&g2) + 2 * byte_run > stream_ptr_after_chunk)
+                            break;
                         CHECK_PIXEL_PTR(2 * byte_run);
                         for (j = 0; j < byte_run; j++) {
                             *((signed short*)(&pixels[pixel_ptr])) = bytestream2_get_le16(&g2);
diff --git a/libavcodec/flv.h b/libavcodec/flv.h
index 801e357e7a..561cfe0baa 100644
--- a/libavcodec/flv.h
+++ b/libavcodec/flv.h
@@ -1,20 +1,20 @@
 /*
  * FLV specific private header.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,5 @@ void ff_flv2_encode_ac_esc(PutBitContext *pb, int slevel, int level, int run,
                            int last);
 
 int ff_flv_decode_picture_header(MpegEncContext *s);
-void ff_flv2_decode_ac_esc(GetBitContext *gb, int *level, int *run, int *last);
 
 #endif /* AVCODEC_FLV_H */
diff --git a/libavcodec/flvdec.c b/libavcodec/flvdec.c
index f2d49294c6..f74ba3f018 100644
--- a/libavcodec/flvdec.c
+++ b/libavcodec/flvdec.c
@@ -1,20 +1,20 @@
 /*
  * FLV decoding.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,17 +25,6 @@
 #include "mpegvideo.h"
 #include "mpegvideodata.h"
 
-void ff_flv2_decode_ac_esc(GetBitContext *gb, int *level, int *run, int *last)
-{
-    int is11 = get_bits1(gb);
-    *last = get_bits1(gb);
-    *run  = get_bits(gb, 6);
-    if (is11)
-        *level = get_sbits(gb, 11);
-    else
-        *level = get_sbits(gb, 7);
-}
-
 int ff_flv_decode_picture_header(MpegEncContext *s)
 {
     int format, width, height;
@@ -43,12 +32,12 @@ int ff_flv_decode_picture_header(MpegEncContext *s)
     /* picture header */
     if (get_bits_long(&s->gb, 17) != 1) {
         av_log(s->avctx, AV_LOG_ERROR, "Bad picture start code\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     format = get_bits(&s->gb, 5);
     if (format != 0 && format != 1) {
         av_log(s->avctx, AV_LOG_ERROR, "Bad picture format\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     s->h263_flv       = format + 1;
     s->picture_number = get_bits(&s->gb, 8); /* picture timestamp */
@@ -87,7 +76,7 @@ int ff_flv_decode_picture_header(MpegEncContext *s)
         break;
     }
     if (av_image_check_size(width, height, 0, s->avctx))
-        return -1;
+        return AVERROR(EINVAL);
     s->width  = width;
     s->height = height;
 
@@ -105,10 +94,14 @@ int ff_flv_decode_picture_header(MpegEncContext *s)
     s->h263_long_vectors = 0;
 
     /* PEI */
-    while (get_bits1(&s->gb) != 0)
-        skip_bits(&s->gb, 8);
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
     s->f_code = 1;
 
+    if (s->ehc_mode)
+        s->avctx->sample_aspect_ratio= (AVRational){1,2};
+
     if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
         av_log(s->avctx, AV_LOG_DEBUG, "%c esc_type:%d, qp:%d num:%d\n",
                s->droppable ? 'D' : av_get_picture_type_char(s->pict_type),
@@ -130,6 +123,7 @@ AVCodec ff_flv_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV420P,
                                                      AV_PIX_FMT_NONE },
 };
diff --git a/libavcodec/flvenc.c b/libavcodec/flvenc.c
index ab89b25203..a2cd39955e 100644
--- a/libavcodec/flvenc.c
+++ b/libavcodec/flvenc.c
@@ -1,20 +1,20 @@
 /*
  * FLV Encoding specific code.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
index 2dff704aa2..1de1d318d2 100644
--- a/libavcodec/fmtconvert.c
+++ b/libavcodec/fmtconvert.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,4 +49,5 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
     if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
     if (ARCH_PPC) ff_fmt_convert_init_ppc(c, avctx);
     if (ARCH_X86) ff_fmt_convert_init_x86(c, avctx);
+    if (HAVE_MIPSFPU) ff_fmt_convert_init_mips(c);
 }
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index 7de890bd6a..4b8b9580d2 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -61,5 +61,6 @@ void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
+void ff_fmt_convert_init_mips(FmtConvertContext *c);
 
 #endif /* AVCODEC_FMTCONVERT_H */
diff --git a/libavcodec/frame_thread_encoder.c b/libavcodec/frame_thread_encoder.c
new file mode 100644
index 0000000000..14f2ea241b
--- /dev/null
+++ b/libavcodec/frame_thread_encoder.c
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2012 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "frame_thread_encoder.h"
+
+#include "libavutil/fifo.h"
+#include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "thread.h"
+
+#if HAVE_PTHREADS
+#include <pthread.h>
+#elif HAVE_W32THREADS
+#include "compat/w32pthreads.h"
+#elif HAVE_OS2THREADS
+#include "compat/os2threads.h"
+#endif
+
+#define MAX_THREADS 64
+#define BUFFER_SIZE (2*MAX_THREADS)
+
+typedef struct{
+    void *indata;
+    void *outdata;
+    int64_t return_code;
+    unsigned index;
+} Task;
+
+typedef struct{
+    AVCodecContext *parent_avctx;
+    pthread_mutex_t buffer_mutex;
+
+    AVFifoBuffer *task_fifo;
+    pthread_mutex_t task_fifo_mutex;
+    pthread_cond_t task_fifo_cond;
+
+    Task finished_tasks[BUFFER_SIZE];
+    pthread_mutex_t finished_task_mutex;
+    pthread_cond_t finished_task_cond;
+
+    unsigned task_index;
+    unsigned finished_task_index;
+
+    pthread_t worker[MAX_THREADS];
+    int exit;
+} ThreadContext;
+
+static void * attribute_align_arg worker(void *v){
+    AVCodecContext *avctx = v;
+    ThreadContext *c = avctx->internal->frame_thread_encoder;
+    AVPacket *pkt = NULL;
+
+    while(!c->exit){
+        int got_packet, ret;
+        AVFrame *frame;
+        Task task;
+
+        if(!pkt) pkt= av_mallocz(sizeof(*pkt));
+        if(!pkt) continue;
+        av_init_packet(pkt);
+
+        pthread_mutex_lock(&c->task_fifo_mutex);
+        while (av_fifo_size(c->task_fifo) <= 0 || c->exit) {
+            if(c->exit){
+                pthread_mutex_unlock(&c->task_fifo_mutex);
+                goto end;
+            }
+            pthread_cond_wait(&c->task_fifo_cond, &c->task_fifo_mutex);
+        }
+        av_fifo_generic_read(c->task_fifo, &task, sizeof(task), NULL);
+        pthread_mutex_unlock(&c->task_fifo_mutex);
+        frame = task.indata;
+
+        ret = avcodec_encode_video2(avctx, pkt, frame, &got_packet);
+        pthread_mutex_lock(&c->buffer_mutex);
+        av_frame_unref(frame);
+        pthread_mutex_unlock(&c->buffer_mutex);
+        av_frame_free(&frame);
+        if(got_packet) {
+            av_dup_packet(pkt);
+        } else {
+            pkt->data = NULL;
+            pkt->size = 0;
+        }
+        pthread_mutex_lock(&c->finished_task_mutex);
+        c->finished_tasks[task.index].outdata = pkt; pkt = NULL;
+        c->finished_tasks[task.index].return_code = ret;
+        pthread_cond_signal(&c->finished_task_cond);
+        pthread_mutex_unlock(&c->finished_task_mutex);
+    }
+end:
+    av_free(pkt);
+    pthread_mutex_lock(&c->buffer_mutex);
+    avcodec_close(avctx);
+    pthread_mutex_unlock(&c->buffer_mutex);
+    av_freep(&avctx);
+    return NULL;
+}
+
+int ff_frame_thread_encoder_init(AVCodecContext *avctx, AVDictionary *options){
+    int i=0;
+    ThreadContext *c;
+
+
+    if(   !(avctx->thread_type & FF_THREAD_FRAME)
+       || !(avctx->codec->capabilities & AV_CODEC_CAP_INTRA_ONLY))
+        return 0;
+
+    if(   !avctx->thread_count
+       && avctx->codec_id == AV_CODEC_ID_MJPEG
+       && !(avctx->flags & AV_CODEC_FLAG_QSCALE)) {
+        av_log(avctx, AV_LOG_DEBUG,
+               "Forcing thread count to 1 for MJPEG encoding, use -thread_type slice "
+               "or a constant quantizer if you want to use multiple cpu cores\n");
+        avctx->thread_count = 1;
+    }
+    if(   avctx->thread_count > 1
+       && avctx->codec_id == AV_CODEC_ID_MJPEG
+       && !(avctx->flags & AV_CODEC_FLAG_QSCALE))
+        av_log(avctx, AV_LOG_WARNING,
+               "MJPEG CBR encoding works badly with frame multi-threading, consider "
+               "using -threads 1, -thread_type slice or a constant quantizer.\n");
+
+    if (avctx->codec_id == AV_CODEC_ID_HUFFYUV ||
+        avctx->codec_id == AV_CODEC_ID_FFVHUFF) {
+        int warn = 0;
+        if (avctx->flags & AV_CODEC_FLAG_PASS1)
+            warn = 1;
+        else if(avctx->context_model > 0) {
+            AVDictionaryEntry *t = av_dict_get(options, "non_deterministic",
+                                               NULL, AV_DICT_MATCH_CASE);
+            warn = !t || !t->value || !atoi(t->value) ? 1 : 0;
+        }
+        // huffyuv does not support these with multiple frame threads currently
+        if (warn) {
+            av_log(avctx, AV_LOG_WARNING,
+               "Forcing thread count to 1 for huffyuv encoding with first pass or context 1\n");
+            avctx->thread_count = 1;
+        }
+    }
+
+    if(!avctx->thread_count) {
+        avctx->thread_count = av_cpu_count();
+        avctx->thread_count = FFMIN(avctx->thread_count, MAX_THREADS);
+    }
+
+    if(avctx->thread_count <= 1)
+        return 0;
+
+    if(avctx->thread_count > MAX_THREADS)
+        return AVERROR(EINVAL);
+
+    av_assert0(!avctx->internal->frame_thread_encoder);
+    c = avctx->internal->frame_thread_encoder = av_mallocz(sizeof(ThreadContext));
+    if(!c)
+        return AVERROR(ENOMEM);
+
+    c->parent_avctx = avctx;
+
+    c->task_fifo = av_fifo_alloc_array(BUFFER_SIZE, sizeof(Task));
+    if(!c->task_fifo)
+        goto fail;
+
+    pthread_mutex_init(&c->task_fifo_mutex, NULL);
+    pthread_mutex_init(&c->finished_task_mutex, NULL);
+    pthread_mutex_init(&c->buffer_mutex, NULL);
+    pthread_cond_init(&c->task_fifo_cond, NULL);
+    pthread_cond_init(&c->finished_task_cond, NULL);
+
+    for(i=0; i<avctx->thread_count ; i++){
+        AVDictionary *tmp = NULL;
+        void *tmpv;
+        AVCodecContext *thread_avctx = avcodec_alloc_context3(avctx->codec);
+        if(!thread_avctx)
+            goto fail;
+        tmpv = thread_avctx->priv_data;
+        *thread_avctx = *avctx;
+        thread_avctx->priv_data = tmpv;
+        thread_avctx->internal = NULL;
+        memcpy(thread_avctx->priv_data, avctx->priv_data, avctx->codec->priv_data_size);
+        thread_avctx->thread_count = 1;
+        thread_avctx->active_thread_type &= ~FF_THREAD_FRAME;
+
+        av_dict_copy(&tmp, options, 0);
+        av_dict_set(&tmp, "threads", "1", 0);
+        if(avcodec_open2(thread_avctx, avctx->codec, &tmp) < 0) {
+            av_dict_free(&tmp);
+            goto fail;
+        }
+        av_dict_free(&tmp);
+        av_assert0(!thread_avctx->internal->frame_thread_encoder);
+        thread_avctx->internal->frame_thread_encoder = c;
+        if(pthread_create(&c->worker[i], NULL, worker, thread_avctx)) {
+            goto fail;
+        }
+    }
+
+    avctx->active_thread_type = FF_THREAD_FRAME;
+
+    return 0;
+fail:
+    avctx->thread_count = i;
+    av_log(avctx, AV_LOG_ERROR, "ff_frame_thread_encoder_init failed\n");
+    ff_frame_thread_encoder_free(avctx);
+    return -1;
+}
+
+void ff_frame_thread_encoder_free(AVCodecContext *avctx){
+    int i;
+    ThreadContext *c= avctx->internal->frame_thread_encoder;
+
+    pthread_mutex_lock(&c->task_fifo_mutex);
+    c->exit = 1;
+    pthread_cond_broadcast(&c->task_fifo_cond);
+    pthread_mutex_unlock(&c->task_fifo_mutex);
+
+    for (i=0; i<avctx->thread_count; i++) {
+         pthread_join(c->worker[i], NULL);
+    }
+
+    pthread_mutex_destroy(&c->task_fifo_mutex);
+    pthread_mutex_destroy(&c->finished_task_mutex);
+    pthread_mutex_destroy(&c->buffer_mutex);
+    pthread_cond_destroy(&c->task_fifo_cond);
+    pthread_cond_destroy(&c->finished_task_cond);
+    av_fifo_freep(&c->task_fifo);
+    av_freep(&avctx->internal->frame_thread_encoder);
+}
+
+int ff_thread_video_encode_frame(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet_ptr){
+    ThreadContext *c = avctx->internal->frame_thread_encoder;
+    Task task;
+    int ret;
+
+    av_assert1(!*got_packet_ptr);
+
+    if(frame){
+        AVFrame *new = av_frame_alloc();
+        if(!new)
+            return AVERROR(ENOMEM);
+        ret = av_frame_ref(new, frame);
+        if(ret < 0) {
+            av_frame_free(&new);
+            return ret;
+        }
+
+        task.index = c->task_index;
+        task.indata = (void*)new;
+        pthread_mutex_lock(&c->task_fifo_mutex);
+        av_fifo_generic_write(c->task_fifo, &task, sizeof(task), NULL);
+        pthread_cond_signal(&c->task_fifo_cond);
+        pthread_mutex_unlock(&c->task_fifo_mutex);
+
+        c->task_index = (c->task_index+1) % BUFFER_SIZE;
+
+        if(!c->finished_tasks[c->finished_task_index].outdata && (c->task_index - c->finished_task_index) % BUFFER_SIZE <= avctx->thread_count)
+            return 0;
+    }
+
+    if(c->task_index == c->finished_task_index)
+        return 0;
+
+    pthread_mutex_lock(&c->finished_task_mutex);
+    while (!c->finished_tasks[c->finished_task_index].outdata) {
+        pthread_cond_wait(&c->finished_task_cond, &c->finished_task_mutex);
+    }
+    task = c->finished_tasks[c->finished_task_index];
+    *pkt = *(AVPacket*)(task.outdata);
+    if(pkt->data)
+        *got_packet_ptr = 1;
+    av_freep(&c->finished_tasks[c->finished_task_index].outdata);
+    c->finished_task_index = (c->finished_task_index+1) % BUFFER_SIZE;
+    pthread_mutex_unlock(&c->finished_task_mutex);
+
+    return task.return_code;
+}
diff --git a/libavcodec/frame_thread_encoder.h b/libavcodec/frame_thread_encoder.h
new file mode 100644
index 0000000000..1da0ce1808
--- /dev/null
+++ b/libavcodec/frame_thread_encoder.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2012 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+
+int ff_frame_thread_encoder_init(AVCodecContext *avctx, AVDictionary *options);
+void ff_frame_thread_encoder_free(AVCodecContext *avctx);
+int ff_thread_video_encode_frame(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet_ptr);
+
diff --git a/libavcodec/fraps.c b/libavcodec/fraps.c
index eb61c70c49..2d4d5c48c4 100644
--- a/libavcodec/fraps.c
+++ b/libavcodec/fraps.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005 Roine Gustafsson
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,6 +37,7 @@
 #include "bytestream.h"
 #include "bswapdsp.h"
 #include "internal.h"
+#include "thread.h"
 
 #define FPS_TAG MKTAG('F', 'P', 'S', 'x')
 #define VLC_BITS 11
@@ -47,7 +48,6 @@
 typedef struct FrapsContext {
     AVCodecContext *avctx;
     BswapDSPContext bdsp;
-    AVFrame *frame;
     uint8_t *tmpbuf;
     int tmpbuf_size;
 } FrapsContext;
@@ -62,15 +62,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
 {
     FrapsContext * const s = avctx->priv_data;
 
-    avctx->pix_fmt     = AV_PIX_FMT_NONE; /* set in decode_frame */
-
     s->avctx  = avctx;
     s->tmpbuf = NULL;
 
-    s->frame = av_frame_alloc();
-    if (!s->frame)
-        return AVERROR(ENOMEM);
-
     ff_bswapdsp_init(&s->bdsp);
 
     return 0;
@@ -111,7 +105,9 @@ static int fraps2_decode_plane(FrapsContext *s, uint8_t *dst, int stride, int w,
     s->bdsp.bswap_buf((uint32_t *) s->tmpbuf,
                       (const uint32_t *) src, size >> 2);
 
-    init_get_bits(&gb, s->tmpbuf, size * 8);
+    if ((ret = init_get_bits8(&gb, s->tmpbuf, size)) < 0)
+        return ret;
+
     for (j = 0; j < h; j++) {
         for (i = 0; i < w*step; i += step) {
             dst[i] = get_vlc2(&gb, vlc.table, VLC_BITS, 3);
@@ -140,17 +136,17 @@ static int decode_frame(AVCodecContext *avctx,
     FrapsContext * const s = avctx->priv_data;
     const uint8_t *buf     = avpkt->data;
     int buf_size           = avpkt->size;
-    AVFrame *frame         = data;
-    AVFrame * const f      = s->frame;
+    ThreadFrame frame = { .f = data };
+    AVFrame * const f = data;
     uint32_t header;
     unsigned int version,header_size;
     unsigned int x, y;
     const uint32_t *buf32;
     uint32_t *luma1,*luma2,*cb,*cr;
     uint32_t offs[4];
-    int i, j, ret, is_chroma, planes;
-    enum AVPixelFormat pix_fmt;
-    int prev_pic_bit, expected_size;
+    int i, j, ret, is_chroma;
+    const int planes = 3;
+    uint8_t *out;
 
     if (buf_size < 4) {
         av_log(avctx, AV_LOG_ERROR, "Packet is too short\n");
@@ -160,7 +156,6 @@ static int decode_frame(AVCodecContext *avctx,
     header      = AV_RL32(buf);
     version     = header & 0xff;
     header_size = (header & (1<<30))? 8 : 4; /* bit 30 means pad to 8 bytes */
-    prev_pic_bit = header & (1U << 31); /* bit 31 means same as previous pic */
 
     if (version > 5) {
         av_log(avctx, AV_LOG_ERROR,
@@ -169,89 +164,92 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_PATCHWELCOME;
     }
 
-    buf += 4;
-    if (header_size == 8)
-        buf += 4;
+    buf += header_size;
 
-    pix_fmt = version & 1 ? AV_PIX_FMT_BGR24 : AV_PIX_FMT_YUVJ420P;
-    if (avctx->pix_fmt != pix_fmt && f->data[0]) {
-        av_frame_unref(f);
+    if (version < 2) {
+        unsigned needed_size = avctx->width * avctx->height * 3;
+        if (version == 0) needed_size /= 2;
+        needed_size += header_size;
+        /* bit 31 means same as previous pic */
+        if (header & (1U<<31)) {
+            *got_frame = 0;
+            return buf_size;
+        }
+        if (buf_size != needed_size) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid frame length %d (should be %d)\n",
+                   buf_size, needed_size);
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        /* skip frame */
+        if (buf_size == 8) {
+            *got_frame = 0;
+            return buf_size;
+        }
+        if (AV_RL32(buf) != FPS_TAG || buf_size < planes*1024 + 24) {
+            av_log(avctx, AV_LOG_ERROR, "Fraps: error in data stream\n");
+            return AVERROR_INVALIDDATA;
+        }
+        for (i = 0; i < planes; i++) {
+            offs[i] = AV_RL32(buf + 4 + i * 4);
+            if (offs[i] >= buf_size - header_size || (i && offs[i] <= offs[i - 1] + 1024)) {
+                av_log(avctx, AV_LOG_ERROR, "Fraps: plane %i offset is out of bounds\n", i);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+        offs[planes] = buf_size - header_size;
+        for (i = 0; i < planes; i++) {
+            av_fast_padded_malloc(&s->tmpbuf, &s->tmpbuf_size, offs[i + 1] - offs[i] - 1024);
+            if (!s->tmpbuf)
+                return AVERROR(ENOMEM);
+        }
     }
-    avctx->pix_fmt = pix_fmt;
+
+    f->pict_type = AV_PICTURE_TYPE_I;
+    f->key_frame = 1;
+
+    avctx->pix_fmt = version & 1 ? AV_PIX_FMT_BGR24 : AV_PIX_FMT_YUVJ420P;
     avctx->color_range = version & 1 ? AVCOL_RANGE_UNSPECIFIED
                                      : AVCOL_RANGE_JPEG;
+    avctx->colorspace = version & 1 ? AVCOL_SPC_UNSPECIFIED : AVCOL_SPC_BT709;
 
-    expected_size = header_size;
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
 
     switch (version) {
     case 0:
     default:
         /* Fraps v0 is a reordered YUV420 */
-        if (!prev_pic_bit)
-            expected_size += avctx->width * avctx->height * 3 / 2;
-        if (buf_size != expected_size) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Invalid frame length %d (should be %d)\n",
-                   buf_size, expected_size);
-            return AVERROR_INVALIDDATA;
-        }
-
         if (((avctx->width % 8) != 0) || ((avctx->height % 2) != 0)) {
             av_log(avctx, AV_LOG_ERROR, "Invalid frame size %dx%d\n",
                    avctx->width, avctx->height);
             return AVERROR_INVALIDDATA;
         }
 
-        if ((ret = ff_reget_buffer(avctx, f)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-            return ret;
-        }
-        f->pict_type = prev_pic_bit ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
-        f->key_frame = f->pict_type == AV_PICTURE_TYPE_I;
-
-        if (f->pict_type == AV_PICTURE_TYPE_I) {
-            buf32 = (const uint32_t*)buf;
-            for (y = 0; y < avctx->height / 2; y++) {
-                luma1 = (uint32_t*)&f->data[0][ y * 2      * f->linesize[0]];
-                luma2 = (uint32_t*)&f->data[0][(y * 2 + 1) * f->linesize[0]];
-                cr    = (uint32_t*)&f->data[1][ y          * f->linesize[1]];
-                cb    = (uint32_t*)&f->data[2][ y          * f->linesize[2]];
-                for (x = 0; x < avctx->width; x += 8) {
-                    *(luma1++) = *(buf32++);
-                    *(luma1++) = *(buf32++);
-                    *(luma2++) = *(buf32++);
-                    *(luma2++) = *(buf32++);
-                    *(cr++) = *(buf32++);
-                    *(cb++) = *(buf32++);
-                }
+        buf32 = (const uint32_t*)buf;
+        for (y = 0; y < avctx->height / 2; y++) {
+            luma1 = (uint32_t*)&f->data[0][  y * 2      * f->linesize[0] ];
+            luma2 = (uint32_t*)&f->data[0][ (y * 2 + 1) * f->linesize[0] ];
+            cr    = (uint32_t*)&f->data[1][  y          * f->linesize[1] ];
+            cb    = (uint32_t*)&f->data[2][  y          * f->linesize[2] ];
+            for (x = 0; x < avctx->width; x += 8) {
+                *luma1++ = *buf32++;
+                *luma1++ = *buf32++;
+                *luma2++ = *buf32++;
+                *luma2++ = *buf32++;
+                *cr++    = *buf32++;
+                *cb++    = *buf32++;
             }
         }
         break;
 
     case 1:
         /* Fraps v1 is an upside-down BGR24 */
-        if (!prev_pic_bit)
-            expected_size += avctx->width * avctx->height * 3;
-        if (buf_size != expected_size) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Invalid frame length %d (should be %d)\n",
-                   buf_size, expected_size);
-            return AVERROR_INVALIDDATA;
-        }
-
-        if ((ret = ff_reget_buffer(avctx, f)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-            return ret;
-        }
-        f->pict_type = prev_pic_bit ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
-        f->key_frame = f->pict_type == AV_PICTURE_TYPE_I;
-
-        if (f->pict_type == AV_PICTURE_TYPE_I) {
             for (y = 0; y<avctx->height; y++)
                 memcpy(&f->data[0][(avctx->height - y - 1) * f->linesize[0]],
                        &buf[y * avctx->width * 3],
                        3 * avctx->width);
-        }
         break;
 
     case 2:
@@ -260,37 +258,8 @@ static int decode_frame(AVCodecContext *avctx,
          * Fraps v2 is Huffman-coded YUV420 planes
          * Fraps v4 is virtually the same
          */
-        planes = 3;
-        if ((ret = ff_reget_buffer(avctx, f)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-            return ret;
-        }
-        /* skip frame */
-        if (buf_size == 8) {
-            f->pict_type = AV_PICTURE_TYPE_P;
-            f->key_frame = 0;
-            break;
-        }
-        f->pict_type = AV_PICTURE_TYPE_I;
-        f->key_frame = 1;
-        if ((AV_RL32(buf) != FPS_TAG) || (buf_size < (planes * 1024 + 24))) {
-            av_log(avctx, AV_LOG_ERROR, "Fraps: error in data stream\n");
-            return AVERROR_INVALIDDATA;
-        }
-        for (i = 0; i < planes; i++) {
-            offs[i] = AV_RL32(buf + 4 + i * 4);
-            if (offs[i] >= buf_size || (i && offs[i] <= offs[i - 1] + 1024)) {
-                av_log(avctx, AV_LOG_ERROR, "Fraps: plane %i offset is out of bounds\n", i);
-                return AVERROR_INVALIDDATA;
-            }
-        }
-        offs[planes] = buf_size;
         for (i = 0; i < planes; i++) {
             is_chroma = !!i;
-            av_fast_padded_malloc(&s->tmpbuf, &s->tmpbuf_size,
-                                  offs[i + 1] - offs[i] - 1024);
-            if (!s->tmpbuf)
-                return AVERROR(ENOMEM);
             if ((ret = fraps2_decode_plane(s, f->data[i], f->linesize[i],
                                            avctx->width  >> is_chroma,
                                            avctx->height >> is_chroma,
@@ -304,36 +273,7 @@ static int decode_frame(AVCodecContext *avctx,
     case 3:
     case 5:
         /* Virtually the same as version 4, but is for RGB24 */
-        planes = 3;
-        if ((ret = ff_reget_buffer(avctx, f)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-            return ret;
-        }
-        /* skip frame */
-        if (buf_size == 8) {
-            f->pict_type = AV_PICTURE_TYPE_P;
-            f->key_frame = 0;
-            break;
-        }
-        f->pict_type = AV_PICTURE_TYPE_I;
-        f->key_frame = 1;
-        if ((AV_RL32(buf) != FPS_TAG)||(buf_size < (planes*1024 + 24))) {
-            av_log(avctx, AV_LOG_ERROR, "Fraps: error in data stream\n");
-            return AVERROR_INVALIDDATA;
-        }
-        for (i = 0; i < planes; i++) {
-            offs[i] = AV_RL32(buf + 4 + i * 4);
-            if (offs[i] >= buf_size || (i && offs[i] <= offs[i - 1] + 1024)) {
-                av_log(avctx, AV_LOG_ERROR, "Fraps: plane %i offset is out of bounds\n", i);
-                return AVERROR_INVALIDDATA;
-            }
-        }
-        offs[planes] = buf_size;
         for (i = 0; i < planes; i++) {
-            av_fast_padded_malloc(&s->tmpbuf, &s->tmpbuf_size,
-                                  offs[i + 1] - offs[i] - 1024);
-            if (!s->tmpbuf)
-                return AVERROR(ENOMEM);
             if ((ret = fraps2_decode_plane(s, f->data[0] + i + (f->linesize[0] * (avctx->height - 1)),
                                            -f->linesize[0], avctx->width, avctx->height,
                                            buf + offs[i], offs[i + 1] - offs[i], 0, 3)) < 0) {
@@ -341,18 +281,20 @@ static int decode_frame(AVCodecContext *avctx,
                 return ret;
             }
         }
+        out = f->data[0];
         // convert pseudo-YUV into real RGB
         for (j = 0; j < avctx->height; j++) {
-            for (i = 0; i < avctx->width; i++) {
-                f->data[0][0 + i*3 + j*f->linesize[0]] += f->data[0][1 + i*3 + j*f->linesize[0]];
-                f->data[0][2 + i*3 + j*f->linesize[0]] += f->data[0][1 + i*3 + j*f->linesize[0]];
+            uint8_t *line_end = out + 3*avctx->width;
+            while (out < line_end) {
+                out[0]  += out[1];
+                out[2]  += out[1];
+                out += 3;
             }
+            out += f->linesize[0] - 3*avctx->width;
         }
         break;
     }
 
-    if ((ret = av_frame_ref(frame, f)) < 0)
-        return ret;
     *got_frame = 1;
 
     return buf_size;
@@ -368,8 +310,6 @@ static av_cold int decode_end(AVCodecContext *avctx)
 {
     FrapsContext *s = (FrapsContext*)avctx->priv_data;
 
-    av_frame_free(&s->frame);
-
     av_freep(&s->tmpbuf);
     return 0;
 }
@@ -384,5 +324,5 @@ AVCodec ff_fraps_decoder = {
     .init           = decode_init,
     .close          = decode_end,
     .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
 };
diff --git a/libavcodec/frwu.c b/libavcodec/frwu.c
index 61cd3156be..1aabefeb6f 100644
--- a/libavcodec/frwu.c
+++ b/libavcodec/frwu.c
@@ -3,26 +3,32 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
+#include "libavutil/opt.h"
+
+typedef struct {
+    AVClass *av_class;
+    int change_field_order;
+} FRWUContext;
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
@@ -38,6 +44,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
+    FRWUContext *s = avctx->priv_data;
     int field, ret;
     AVFrame *pic = data;
     const uint8_t *buf = avpkt->data;
@@ -52,15 +59,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
         return ret;
-    }
 
     pic->pict_type = AV_PICTURE_TYPE_I;
     pic->key_frame = 1;
-    pic->interlaced_frame = 1;
-    pic->top_field_first = 1;
 
     for (field = 0; field < 2; field++) {
         int i;
@@ -79,9 +82,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             av_log(avctx, AV_LOG_ERROR, "Packet is too small, need %i, have %i\n", field_size, (int)(buf_end - buf));
             return AVERROR_INVALIDDATA;
         }
-        if (field)
+        if (field ^ s->change_field_order) {
             dst += pic->linesize[0];
+        } else if (s->change_field_order) {
+            dst += 2 * pic->linesize[0];
+        }
         for (i = 0; i < field_h; i++) {
+            if (s->change_field_order && field && i == field_h - 1)
+                dst = pic->data[0];
             memcpy(dst, buf, avctx->width * 2);
             buf += avctx->width * 2;
             dst += pic->linesize[0] << 1;
@@ -94,12 +102,27 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     return avpkt->size;
 }
 
+static const AVOption frwu_options[] = {
+    {"change_field_order", "Change field order", offsetof(FRWUContext, change_field_order), AV_OPT_TYPE_INT,
+     {.i64 = 0}, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM},
+    {NULL}
+};
+
+static const AVClass frwu_class = {
+    .class_name = "frwu Decoder",
+    .item_name  = av_default_item_name,
+    .option     = frwu_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_frwu_decoder = {
     .name           = "frwu",
     .long_name      = NULL_IF_CONFIG_SMALL("Forward Uncompressed"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_FRWU,
+    .priv_data_size = sizeof(FRWUContext),
     .init           = decode_init,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_class     = &frwu_class,
 };
diff --git a/libavcodec/g2meet.c b/libavcodec/g2meet.c
index 85f53e2bbe..ba83c67337 100644
--- a/libavcodec/g2meet.c
+++ b/libavcodec/g2meet.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2012 Konstantin Shishkov
  * Copyright (c) 2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -298,7 +298,8 @@ static int jpg_decode_data(JPGContext *c, int width, int height,
         return ret;
     jpg_unescape(src, src_size, c->buf, &unesc_size);
     memset(c->buf + unesc_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
-    init_get_bits(&gb, c->buf, unesc_size * 8);
+    if((ret = init_get_bits8(&gb, c->buf, unesc_size)) < 0)
+        return ret;
 
     width = FFALIGN(width, 16);
     mb_w  =  width        >> 4;
@@ -554,6 +555,11 @@ static uint32_t epic_decode_pixel_pred(ePICContext *dc, int x, int y,
         B     = ((pred >> B_shift) & 0xFF) - TOSIGNED(delta);
     }
 
+    if (R<0 || G<0 || B<0) {
+        av_log(NULL, AV_LOG_ERROR, "RGB %d %d %d is out of range\n", R, G, B);
+        return 0;
+    }
+
     return (R << R_shift) | (G << G_shift) | (B << B_shift);
 }
 
@@ -1005,7 +1011,7 @@ static int epic_jb_decode_tile(G2MContext *c, int tile_x, int tile_y,
     return 0;
 }
 
-static void kempf_restore_buf(const uint8_t *src, int len,
+static int kempf_restore_buf(const uint8_t *src, int len,
                               uint8_t *dst, int stride,
                               const uint8_t *jpeg_tile, int tile_stride,
                               int width, int height,
@@ -1013,9 +1019,11 @@ static void kempf_restore_buf(const uint8_t *src, int len,
 {
     GetBitContext gb;
     int i, j, nb, col;
+    int ret;
     int align_width = FFALIGN(width, 16);
 
-    init_get_bits(&gb, src, len * 8);
+    if ((ret = init_get_bits8(&gb, src, len)) < 0)
+        return ret;
 
     if (npal <= 2)       nb = 1;
     else if (npal <= 4)  nb = 2;
@@ -1034,6 +1042,8 @@ static void kempf_restore_buf(const uint8_t *src, int len,
         }
         skip_bits_long(&gb, nb * (align_width - width));
     }
+
+    return 0;
 }
 
 static int kempf_decode_tile(G2MContext *c, int tile_x, int tile_y,
@@ -1077,6 +1087,8 @@ static int kempf_decode_tile(G2MContext *c, int tile_x, int tile_y,
         src += 3;
     }
     npal = *src++ + 1;
+    if (src_end - src < npal * 3)
+        return AVERROR_INVALIDDATA;
     memcpy(pal, src, npal * 3);
     src += npal * 3;
     if (sub_type != 2) {
@@ -1093,7 +1105,7 @@ static int kempf_decode_tile(G2MContext *c, int tile_x, int tile_y,
     zsize = (src[0] << 8) | src[1];
     src  += 2;
 
-    if (src_end - src < zsize)
+    if (src_end - src < zsize + (sub_type != 2))
         return AVERROR_INVALIDDATA;
 
     ret = uncompress(c->kempf_buf, &dlen, src, zsize);
@@ -1115,6 +1127,8 @@ static int kempf_decode_tile(G2MContext *c, int tile_x, int tile_y,
     for (i = 0; i < (FFALIGN(height, 16) >> 4); i++) {
         for (j = 0; j < (FFALIGN(width, 16) >> 4); j++) {
             if (!bits) {
+                if (src >= src_end)
+                    return AVERROR_INVALIDDATA;
                 bitbuf = *src++;
                 bits   = 8;
             }
@@ -1148,10 +1162,10 @@ static int g2m_init_buffers(G2MContext *c)
     int aligned_height;
 
     if (!c->framebuf || c->old_width < c->width || c->old_height < c->height) {
-        c->framebuf_stride = FFALIGN(c->width * 3, 16);
-        aligned_height     = FFALIGN(c->height,    16);
+        c->framebuf_stride = FFALIGN(c->width + 15, 16) * 3;
+        aligned_height     = c->height + 15;
         av_free(c->framebuf);
-        c->framebuf = av_mallocz(c->framebuf_stride * aligned_height);
+        c->framebuf = av_mallocz_array(c->framebuf_stride, aligned_height);
         if (!c->framebuf)
             return AVERROR(ENOMEM);
     }
@@ -1159,14 +1173,15 @@ static int g2m_init_buffers(G2MContext *c)
         (c->compression == 2 && !c->epic_buf_base) ||
         c->old_tile_w < c->tile_width ||
         c->old_tile_h < c->tile_height) {
-        c->tile_stride     = FFALIGN(c->tile_width * 3, 16);
+        c->tile_stride     = FFALIGN(c->tile_width, 16) * 3;
         c->epic_buf_stride = FFALIGN(c->tile_width * 4, 16);
         aligned_height     = FFALIGN(c->tile_height,    16);
-        av_free(c->synth_tile);
-        av_free(c->jpeg_tile);
-        av_free(c->kempf_buf);
-        av_free(c->kempf_flags);
-        av_free(c->epic_buf_base);
+        av_freep(&c->synth_tile);
+        av_freep(&c->jpeg_tile);
+        av_freep(&c->kempf_buf);
+        av_freep(&c->kempf_flags);
+        av_freep(&c->epic_buf_base);
+        c->epic_buf    = NULL;
         c->synth_tile  = av_mallocz(c->tile_stride      * aligned_height);
         c->jpeg_tile   = av_mallocz(c->tile_stride      * aligned_height);
         c->kempf_buf   = av_mallocz((c->tile_width + 1) * aligned_height +
@@ -1203,7 +1218,7 @@ static int g2m_load_cursor(AVCodecContext *avctx, G2MContext *c,
     cursor_hot_y = bytestream2_get_byte(gb);
     cursor_fmt   = bytestream2_get_byte(gb);
 
-    cursor_stride = FFALIGN(cursor_w, 32) * 4;
+    cursor_stride = FFALIGN(cursor_w, cursor_fmt==1 ? 32 : 1) * 4;
 
     if (cursor_w < 1 || cursor_w > 256 ||
         cursor_h < 1 || cursor_h > 256) {
@@ -1253,7 +1268,6 @@ static int g2m_load_cursor(AVCodecContext *avctx, G2MContext *c,
                     bits <<= 1;
                 }
             }
-            dst += c->cursor_stride - c->cursor_w * 4;
         }
 
         dst = c->cursor;
@@ -1285,7 +1299,6 @@ static int g2m_load_cursor(AVCodecContext *avctx, G2MContext *c,
                     bits <<= 1;
                 }
             }
-            dst += c->cursor_stride - c->cursor_w * 4;
         }
         break;
     case 32: // full colour
@@ -1299,7 +1312,6 @@ static int g2m_load_cursor(AVCodecContext *avctx, G2MContext *c,
                 *dst++ = val >> 16;
                 *dst++ = val >> 24;
             }
-            dst += c->cursor_stride - c->cursor_w * 4;
         }
         break;
     default:
@@ -1402,6 +1414,7 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
         }
         switch (chunk_type) {
         case DISPLAY_INFO:
+            got_header =
             c->got_header = 0;
             if (chunk_size < 21) {
                 av_log(avctx, AV_LOG_ERROR, "Invalid display info size %"PRIu32"\n",
@@ -1421,19 +1434,22 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
             if (c->width != avctx->width || c->height != avctx->height) {
                 ret = ff_set_dimensions(avctx, c->width, c->height);
                 if (ret < 0)
-                    return ret;
+                    goto header_fail;
             }
             c->compression = bytestream2_get_be32(&bc);
             if (c->compression != 2 && c->compression != 3) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Unknown compression method %d\n",
                        c->compression);
-                return AVERROR_PATCHWELCOME;
+                ret = AVERROR_PATCHWELCOME;
+                goto header_fail;
             }
             c->tile_width  = bytestream2_get_be32(&bc);
             c->tile_height = bytestream2_get_be32(&bc);
-            if (!c->tile_width || !c->tile_height ||
-                ((c->tile_width | c->tile_height) & 0xF)) {
+            if (c->tile_width <= 0 || c->tile_height <= 0 ||
+                ((c->tile_width | c->tile_height) & 0xF) ||
+                c->tile_width * (uint64_t)c->tile_height >= INT_MAX / 4
+            ) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Invalid tile dimensions %dx%d\n",
                        c->tile_width, c->tile_height);
@@ -1448,7 +1464,8 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
                     (chunk_size - 21) < 16) {
                     av_log(avctx, AV_LOG_ERROR,
                            "Display info: missing bitmasks!\n");
-                    return AVERROR_INVALIDDATA;
+                    ret = AVERROR_INVALIDDATA;
+                    goto header_fail;
                 }
                 r_mask = bytestream2_get_be32(&bc);
                 g_mask = bytestream2_get_be32(&bc);
@@ -1457,11 +1474,13 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
                     av_log(avctx, AV_LOG_ERROR,
                            "Invalid or unsupported bitmasks: R=%"PRIX32", G=%"PRIX32", B=%"PRIX32"\n",
                            r_mask, g_mask, b_mask);
-                    return AVERROR_PATCHWELCOME;
+                    ret = AVERROR_PATCHWELCOME;
+                    goto header_fail;
                 }
             } else {
                 avpriv_request_sample(avctx, "bpp=%d", c->bpp);
-                return AVERROR_PATCHWELCOME;
+                ret = AVERROR_PATCHWELCOME;
+                goto header_fail;
             }
             if (g2m_init_buffers(c)) {
                 ret = AVERROR(ENOMEM);
@@ -1538,11 +1557,9 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
     if (got_header)
         c->got_header = 1;
 
-    if (c->width && c->height) {
-        if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if (c->width && c->height && c->framebuf) {
+        if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
             return ret;
-        }
 
         pic->key_frame = got_header;
         pic->pict_type = got_header ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
@@ -1563,6 +1580,8 @@ header_fail:
     c->height  = 0;
     c->tiles_x =
     c->tiles_y = 0;
+    c->tile_width =
+    c->tile_height = 0;
     return ret;
 }
 
@@ -1593,6 +1612,7 @@ static av_cold int g2m_decode_end(AVCodecContext *avctx)
     jpg_free_context(&c->jc);
 
     av_freep(&c->epic_buf_base);
+    c->epic_buf = NULL;
     av_freep(&c->kempf_buf);
     av_freep(&c->kempf_flags);
     av_freep(&c->synth_tile);
diff --git a/libavcodec/g722.c b/libavcodec/g722.c
index 830877e336..ee3b85f845 100644
--- a/libavcodec/g722.c
+++ b/libavcodec/g722.c
@@ -7,20 +7,20 @@
  * Copyright (c) 2009 Kenan Gillet
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/g722.h b/libavcodec/g722.h
index 483017018b..25676a326e 100644
--- a/libavcodec/g722.h
+++ b/libavcodec/g722.h
@@ -5,20 +5,20 @@
  * Copyright (c) 2009 Kenan Gillet
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/g722dec.c b/libavcodec/g722dec.c
index c4c0ec8fee..0bfa82a394 100644
--- a/libavcodec/g722dec.c
+++ b/libavcodec/g722dec.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2009 Kenan Gillet
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -79,7 +79,7 @@ static const int16_t low_inv_quant5[32] = {
      587,   473,   370,   276,   190,   110,    35,   -35
 };
 
-static const int16_t *low_inv_quants[3] = { ff_g722_low_inv_quant6,
+static const int16_t * const low_inv_quants[3] = { ff_g722_low_inv_quant6,
                                                     low_inv_quant5,
                                             ff_g722_low_inv_quant4 };
 
@@ -96,10 +96,8 @@ static int g722_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = avpkt->size * 2;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     out_buf = (int16_t *)frame->data[0];
 
     init_get_bits(&gb, avpkt->data, avpkt->size * 8);
diff --git a/libavcodec/g722dsp.c b/libavcodec/g722dsp.c
index c7e41ff9c7..f1480536d0 100644
--- a/libavcodec/g722dsp.c
+++ b/libavcodec/g722dsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -71,4 +71,6 @@ av_cold void ff_g722dsp_init(G722DSPContext *c)
 
     if (ARCH_ARM)
         ff_g722dsp_init_arm(c);
+    if (ARCH_X86)
+        ff_g722dsp_init_x86(c);
 }
diff --git a/libavcodec/g722dsp.h b/libavcodec/g722dsp.h
index ecd6a47fcb..c956a1e161 100644
--- a/libavcodec/g722dsp.h
+++ b/libavcodec/g722dsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,5 +29,6 @@ typedef struct G722DSPContext {
 
 void ff_g722dsp_init(G722DSPContext *c);
 void ff_g722dsp_init_arm(G722DSPContext *c);
+void ff_g722dsp_init_x86(G722DSPContext *c);
 
 #endif /* AVCODEC_G722DSP_H */
diff --git a/libavcodec/g722enc.c b/libavcodec/g722enc.c
index 545825b057..01a3db26fd 100644
--- a/libavcodec/g722enc.c
+++ b/libavcodec/g722enc.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2009 Kenan Gillet
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
  * G.722 ADPCM audio encoder
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "g722.h"
@@ -74,9 +75,9 @@ static av_cold int g722_encode_init(AVCodecContext * avctx)
         int max_paths = frontier * FREEZE_INTERVAL;
         int i;
         for (i = 0; i < 2; i++) {
-            c->paths[i] = av_mallocz(max_paths * sizeof(**c->paths));
-            c->node_buf[i] = av_mallocz(2 * frontier * sizeof(**c->node_buf));
-            c->nodep_buf[i] = av_mallocz(2 * frontier * sizeof(**c->nodep_buf));
+            c->paths[i] = av_mallocz_array(max_paths, sizeof(**c->paths));
+            c->node_buf[i] = av_mallocz_array(frontier, 2 * sizeof(**c->node_buf));
+            c->nodep_buf[i] = av_mallocz_array(frontier, 2 * sizeof(**c->nodep_buf));
             if (!c->paths[i] || !c->node_buf[i] || !c->nodep_buf[i]) {
                 ret = AVERROR(ENOMEM);
                 goto error;
@@ -238,7 +239,7 @@ static void g722_encode_trellis(G722Context *c, int trellis,
                     continue;\
                 if (heap_pos[index] < frontier) {\
                     pos = heap_pos[index]++;\
-                    assert(pathn[index] < FREEZE_INTERVAL * frontier);\
+                    av_assert2(pathn[index] < FREEZE_INTERVAL * frontier);\
                     node = nodes_next[index][pos] = next[index]++;\
                     node->path = pathn[index]++;\
                 } else {\
@@ -357,10 +358,8 @@ static int g722_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int nb_samples, out_size, ret;
 
     out_size = (frame->nb_samples + 1) / 2;
-    if ((ret = ff_alloc_packet(avpkt, out_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, out_size, 0)) < 0)
         return ret;
-    }
 
     nb_samples = frame->nb_samples - (frame->nb_samples & 1);
 
diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 4904e52ac0..68a6d9179c 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2010 Mohamed Naufal Basheer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,47 +33,12 @@
 #include "get_bits.h"
 #include "acelp_vectors.h"
 #include "celp_filters.h"
+#include "celp_math.h"
 #include "g723_1_data.h"
 #include "internal.h"
 
 #define CNG_RANDOM_SEED 12345
 
-/**
- * G723.1 frame types
- */
-enum FrameType {
-    ACTIVE_FRAME,        ///< Active speech
-    SID_FRAME,           ///< Silence Insertion Descriptor frame
-    UNTRANSMITTED_FRAME
-};
-
-enum Rate {
-    RATE_6300,
-    RATE_5300
-};
-
-/**
- * G723.1 unpacked data subframe
- */
-typedef struct G723_1_Subframe {
-    int ad_cb_lag;     ///< adaptive codebook lag
-    int ad_cb_gain;
-    int dirac_train;
-    int pulse_sign;
-    int grid_index;
-    int amp_index;
-    int pulse_pos;
-} G723_1_Subframe;
-
-/**
- * Pitch postfilter parameters
- */
-typedef struct PPFParam {
-    int     index;    ///< postfilter backward/forward lag
-    int16_t opt_gain; ///< optimal gain
-    int16_t sc_gain;  ///< scaling gain
-} PPFParam;
-
 typedef struct g723_1_context {
     AVClass *class;
 
@@ -100,10 +65,21 @@ typedef struct g723_1_context {
     int sid_gain;
     int cur_gain;
     int reflection_coef;
-    int pf_gain;
+    int pf_gain;                 ///< formant postfilter
+                                 ///< gain scaling unit memory
     int postfilter;
 
     int16_t audio[FRAME_LEN + LPC_ORDER + PITCH_MAX + 4];
+    int16_t prev_data[HALF_FRAME_LEN];
+    int16_t prev_weight_sig[PITCH_MAX];
+
+
+    int16_t hpf_fir_mem;                   ///< highpass filter fir
+    int     hpf_iir_mem;                   ///< and iir memories
+    int16_t perf_fir_mem[LPC_ORDER];       ///< perceptual filter fir
+    int16_t perf_iir_mem[LPC_ORDER];       ///< and iir memories
+
+    int16_t harmonic_mem[PITCH_MAX];
 } G723_1_Context;
 
 static av_cold int g723_1_decode_init(AVCodecContext *avctx)
@@ -113,7 +89,6 @@ static av_cold int g723_1_decode_init(AVCodecContext *avctx)
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
     avctx->sample_fmt     = AV_SAMPLE_FMT_S16;
     avctx->channels       = 1;
-    avctx->sample_rate    = 8000;
     p->pf_gain            = 1 << 12;
 
     memcpy(p->prev_lsp, dc_lsp, LPC_ORDER * sizeof(*p->prev_lsp));
@@ -197,13 +172,13 @@ static int unpack_bitstream(G723_1_Context *p, const uint8_t *buf,
         }
     }
 
-    p->subframe[0].grid_index = get_bits(&gb, 1);
-    p->subframe[1].grid_index = get_bits(&gb, 1);
-    p->subframe[2].grid_index = get_bits(&gb, 1);
-    p->subframe[3].grid_index = get_bits(&gb, 1);
+    p->subframe[0].grid_index = get_bits1(&gb);
+    p->subframe[1].grid_index = get_bits1(&gb);
+    p->subframe[2].grid_index = get_bits1(&gb);
+    p->subframe[3].grid_index = get_bits1(&gb);
 
     if (p->cur_rate == RATE_6300) {
-        skip_bits(&gb, 1);  /* skip reserved bit */
+        skip_bits1(&gb);  /* skip reserved bit */
 
         /* Compute pulse_pos index using the 13-bit combined position index */
         temp = get_bits(&gb, 13);
@@ -247,32 +222,27 @@ static int unpack_bitstream(G723_1_Context *p, const uint8_t *buf,
 /**
  * Bitexact implementation of sqrt(val/2).
  */
-static int16_t square_root(int val)
+static int16_t square_root(unsigned val)
 {
-    int16_t res = 0;
-    int16_t exp = 0x4000;
-    int i;
+    av_assert2(!(val & 0x80000000));
 
-    for (i = 0; i < 14; i ++) {
-        int res_exp = res + exp;
-        if (val >= res_exp * res_exp << 1)
-            res += exp;
-        exp >>= 1;
-    }
-    return res;
+    return (ff_sqrt(val << 1) >> 1) & (~1);
 }
 
 /**
  * Calculate the number of left-shifts required for normalizing the input.
  *
  * @param num   input number
- * @param width width of the input, 16 bits(0) / 32 bits(1)
+ * @param width width of the input, 15 or 31 bits
  */
 static int normalize_bits(int num, int width)
 {
     return width - av_log2(num) - 1;
 }
 
+#define normalize_bits_int16(num) normalize_bits(num, 15)
+#define normalize_bits_int32(num) normalize_bits(num, 31)
+
 /**
  * Scale vector contents based on the largest of their absolutes.
  */
@@ -281,12 +251,11 @@ static int scale_vector(int16_t *dst, const int16_t *vector, int length)
     int bits, max = 0;
     int i;
 
-
     for (i = 0; i < length; i++)
         max |= FFABS(vector[i]);
 
-    max   = FFMIN(max, 0x7FFF);
-    bits  = normalize_bits(max, 15);
+    bits= 14 - av_log2_16bit(max);
+    bits= FFMAX(bits, 0);
 
     for (i = 0; i < length; i++)
         dst[i] = vector[i] << bits >> 3;
@@ -371,7 +340,7 @@ static void inverse_quant(int16_t *cur_lsp, int16_t *prev_lsp,
  * @param b 16 bit multiplier
  */
 #define MULL2(a, b) \
-        ((((a) >> 16) * (b) << 1) + (((a) & 0xffff) * (b) >> 15))
+        MULL(a,b,15)
 
 /**
  * Convert LSP frequencies to LPC coefficients.
@@ -386,7 +355,7 @@ static void lsp2lpc(int16_t *lpc)
 
     /* Calculate negative cosine */
     for (j = 0; j < LPC_ORDER; j++) {
-        int index     = lpc[j] >> 7;
+        int index     = (lpc[j] >> 7) & 0x1FF;
         int offset    = lpc[j] & 0x7f;
         int temp1     = cos_tab[index] << 16;
         int temp2     = (cos_tab[index + 1] - cos_tab[index]) *
@@ -567,13 +536,8 @@ static void get_residual(int16_t *residual, int16_t *prev_excitation, int lag)
 
 static int dot_product(const int16_t *a, const int16_t *b, int length)
 {
-    int i, sum = 0;
-
-    for (i = 0; i < length; i++) {
-        int prod = a[i] * b[i];
-        sum = av_sat_dadd32(sum, prod);
-    }
-    return sum;
+    int sum = ff_dot_product(a,b,length);
+    return av_sat_add32(sum, sum);
 }
 
 /**
@@ -593,16 +557,16 @@ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation,
     get_residual(residual, prev_excitation, lag);
 
     /* Select quantization table */
-    if (cur_rate == RATE_6300 && pitch_lag < SUBFRAME_LEN - 2)
+    if (cur_rate == RATE_6300 && pitch_lag < SUBFRAME_LEN - 2) {
         cb_ptr = adaptive_cb_gain85;
-    else
+    } else
         cb_ptr = adaptive_cb_gain170;
 
     /* Calculate adaptive vector */
     cb_ptr += subfrm->ad_cb_gain * 20;
     for (i = 0; i < SUBFRAME_LEN; i++) {
-        sum = dot_product(residual + i, cb_ptr, PITCH_ORDER);
-        vector[i] = av_sat_dadd32(1 << 15, sum) >> 16;
+        sum = ff_dot_product(residual + i, cb_ptr, PITCH_ORDER);
+        vector[i] = av_sat_dadd32(1 << 15, av_sat_add32(sum, sum)) >> 16;
     }
 }
 
@@ -810,9 +774,9 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag,
 
     temp = best_eng * *exc_eng >> 3;
 
-    if (temp < ccr * ccr)
+    if (temp < ccr * ccr) {
         return index;
-    else
+    } else
         return 0;
 }
 
@@ -852,21 +816,24 @@ static void residual_interp(int16_t *buf, int16_t *out, int lag,
  * @param iir_coef IIR coefficients
  * @param src      source vector
  * @param dest     destination vector
+ * @param width    width of the output, 16 bits(0) / 32 bits(1)
  */
-static inline void iir_filter(int16_t *fir_coef, int16_t *iir_coef,
-                              int16_t *src, int *dest)
-{
-    int m, n;
-
-    for (m = 0; m < SUBFRAME_LEN; m++) {
-        int64_t filter = 0;
-        for (n = 1; n <= LPC_ORDER; n++) {
-            filter -= fir_coef[n - 1] * src[m - n] -
-                      iir_coef[n - 1] * (dest[m - n] >> 16);
-        }
-
-        dest[m] = av_clipl_int32((src[m] << 16) + (filter << 3) + (1 << 15));
-    }
+#define iir_filter(fir_coef, iir_coef, src, dest, width)\
+{\
+    int m, n;\
+    int res_shift = 16 & ~-(width);\
+    int in_shift  = 16 - res_shift;\
+\
+    for (m = 0; m < SUBFRAME_LEN; m++) {\
+        int64_t filter = 0;\
+        for (n = 1; n <= LPC_ORDER; n++) {\
+            filter -= (fir_coef)[n - 1] * (src)[m - n] -\
+                      (iir_coef)[n - 1] * ((dest)[m - n] >> in_shift);\
+        }\
+\
+        (dest)[m] = av_clipl_int32(((src)[m] << 16) + (filter << 3) +\
+                                   (1 << 15)) >> res_shift;\
+    }\
 }
 
 /**
@@ -937,13 +904,12 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc,
                                  (1 << 14)) >> 15;
         }
         iir_filter(filter_coef[0], filter_coef[1], buf + i,
-                   filter_signal + i);
+                   filter_signal + i, 1);
         lpc += LPC_ORDER;
     }
 
-    memcpy(p->fir_mem, buf + FRAME_LEN, LPC_ORDER * sizeof(*p->fir_mem));
-    memcpy(p->iir_mem, filter_signal + FRAME_LEN,
-           LPC_ORDER * sizeof(*p->iir_mem));
+    memcpy(p->fir_mem, buf + FRAME_LEN, LPC_ORDER * sizeof(int16_t));
+    memcpy(p->iir_mem, filter_signal + FRAME_LEN, LPC_ORDER * sizeof(int));
 
     buf += LPC_ORDER;
     signal_ptr = filter_signal + LPC_ORDER;
@@ -1218,10 +1184,8 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     frame->nb_samples = FRAME_LEN;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-         return ret;
-    }
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
 
     out = (int16_t *)frame->data[0];
 
@@ -1376,3 +1340,1145 @@ AVCodec ff_g723_1_decoder = {
     .capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
     .priv_class     = &g723_1dec_class,
 };
+
+#if CONFIG_G723_1_ENCODER
+#define BITSTREAM_WRITER_LE
+#include "put_bits.h"
+
+static av_cold int g723_1_encode_init(AVCodecContext *avctx)
+{
+    G723_1_Context *p = avctx->priv_data;
+
+    if (avctx->sample_rate != 8000) {
+        av_log(avctx, AV_LOG_ERROR, "Only 8000Hz sample rate supported\n");
+        return -1;
+    }
+
+    if (avctx->channels != 1) {
+        av_log(avctx, AV_LOG_ERROR, "Only mono supported\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->bit_rate == 6300) {
+        p->cur_rate = RATE_6300;
+    } else if (avctx->bit_rate == 5300) {
+        av_log(avctx, AV_LOG_ERROR, "Bitrate not supported yet, use 6.3k\n");
+        return AVERROR_PATCHWELCOME;
+    } else {
+        av_log(avctx, AV_LOG_ERROR,
+               "Bitrate not supported, use 6.3k\n");
+        return AVERROR(EINVAL);
+    }
+    avctx->frame_size = 240;
+    memcpy(p->prev_lsp, dc_lsp, LPC_ORDER * sizeof(int16_t));
+
+    return 0;
+}
+
+/**
+ * Remove DC component from the input signal.
+ *
+ * @param buf input signal
+ * @param fir zero memory
+ * @param iir pole memory
+ */
+static void highpass_filter(int16_t *buf, int16_t *fir, int *iir)
+{
+    int i;
+    for (i = 0; i < FRAME_LEN; i++) {
+        *iir   = (buf[i] << 15) + ((-*fir) << 15) + MULL2(*iir, 0x7f00);
+        *fir   = buf[i];
+        buf[i] = av_clipl_int32((int64_t)*iir + (1 << 15)) >> 16;
+    }
+}
+
+/**
+ * Estimate autocorrelation of the input vector.
+ *
+ * @param buf      input buffer
+ * @param autocorr autocorrelation coefficients vector
+ */
+static void comp_autocorr(int16_t *buf, int16_t *autocorr)
+{
+    int i, scale, temp;
+    int16_t vector[LPC_FRAME];
+
+    scale_vector(vector, buf, LPC_FRAME);
+
+    /* Apply the Hamming window */
+    for (i = 0; i < LPC_FRAME; i++)
+        vector[i] = (vector[i] * hamming_window[i] + (1 << 14)) >> 15;
+
+    /* Compute the first autocorrelation coefficient */
+    temp = ff_dot_product(vector, vector, LPC_FRAME);
+
+    /* Apply a white noise correlation factor of (1025/1024) */
+    temp += temp >> 10;
+
+    /* Normalize */
+    scale = normalize_bits_int32(temp);
+    autocorr[0] = av_clipl_int32((int64_t)(temp << scale) +
+                                 (1 << 15)) >> 16;
+
+    /* Compute the remaining coefficients */
+    if (!autocorr[0]) {
+        memset(autocorr + 1, 0, LPC_ORDER * sizeof(int16_t));
+    } else {
+        for (i = 1; i <= LPC_ORDER; i++) {
+           temp = ff_dot_product(vector, vector + i, LPC_FRAME - i);
+           temp = MULL2((temp << scale), binomial_window[i - 1]);
+           autocorr[i] = av_clipl_int32((int64_t)temp + (1 << 15)) >> 16;
+        }
+    }
+}
+
+/**
+ * Use Levinson-Durbin recursion to compute LPC coefficients from
+ * autocorrelation values.
+ *
+ * @param lpc      LPC coefficients vector
+ * @param autocorr autocorrelation coefficients vector
+ * @param error    prediction error
+ */
+static void levinson_durbin(int16_t *lpc, int16_t *autocorr, int16_t error)
+{
+    int16_t vector[LPC_ORDER];
+    int16_t partial_corr;
+    int i, j, temp;
+
+    memset(lpc, 0, LPC_ORDER * sizeof(int16_t));
+
+    for (i = 0; i < LPC_ORDER; i++) {
+        /* Compute the partial correlation coefficient */
+        temp = 0;
+        for (j = 0; j < i; j++)
+            temp -= lpc[j] * autocorr[i - j - 1];
+        temp = ((autocorr[i] << 13) + temp) << 3;
+
+        if (FFABS(temp) >= (error << 16))
+            break;
+
+        partial_corr = temp / (error << 1);
+
+        lpc[i] = av_clipl_int32((int64_t)(partial_corr << 14) +
+                                (1 << 15)) >> 16;
+
+        /* Update the prediction error */
+        temp  = MULL2(temp, partial_corr);
+        error = av_clipl_int32((int64_t)(error << 16) - temp +
+                                (1 << 15)) >> 16;
+
+        memcpy(vector, lpc, i * sizeof(int16_t));
+        for (j = 0; j < i; j++) {
+            temp = partial_corr * vector[i - j - 1] << 1;
+            lpc[j] = av_clipl_int32((int64_t)(lpc[j] << 16) - temp +
+                                    (1 << 15)) >> 16;
+        }
+    }
+}
+
+/**
+ * Calculate LPC coefficients for the current frame.
+ *
+ * @param buf       current frame
+ * @param prev_data 2 trailing subframes of the previous frame
+ * @param lpc       LPC coefficients vector
+ */
+static void comp_lpc_coeff(int16_t *buf, int16_t *lpc)
+{
+    int16_t autocorr[(LPC_ORDER + 1) * SUBFRAMES];
+    int16_t *autocorr_ptr = autocorr;
+    int16_t *lpc_ptr      = lpc;
+    int i, j;
+
+    for (i = 0, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++) {
+        comp_autocorr(buf + i, autocorr_ptr);
+        levinson_durbin(lpc_ptr, autocorr_ptr + 1, autocorr_ptr[0]);
+
+        lpc_ptr += LPC_ORDER;
+        autocorr_ptr += LPC_ORDER + 1;
+    }
+}
+
+static void lpc2lsp(int16_t *lpc, int16_t *prev_lsp, int16_t *lsp)
+{
+    int f[LPC_ORDER + 2]; ///< coefficients of the sum and difference
+                          ///< polynomials (F1, F2) ordered as
+                          ///< f1[0], f2[0], ...., f1[5], f2[5]
+
+    int max, shift, cur_val, prev_val, count, p;
+    int i, j;
+    int64_t temp;
+
+    /* Initialize f1[0] and f2[0] to 1 in Q25 */
+    for (i = 0; i < LPC_ORDER; i++)
+        lsp[i] = (lpc[i] * bandwidth_expand[i] + (1 << 14)) >> 15;
+
+    /* Apply bandwidth expansion on the LPC coefficients */
+    f[0] = f[1] = 1 << 25;
+
+    /* Compute the remaining coefficients */
+    for (i = 0; i < LPC_ORDER / 2; i++) {
+        /* f1 */
+        f[2 * i + 2] = -f[2 * i] - ((lsp[i] + lsp[LPC_ORDER - 1 - i]) << 12);
+        /* f2 */
+        f[2 * i + 3] = f[2 * i + 1] - ((lsp[i] - lsp[LPC_ORDER - 1 - i]) << 12);
+    }
+
+    /* Divide f1[5] and f2[5] by 2 for use in polynomial evaluation */
+    f[LPC_ORDER] >>= 1;
+    f[LPC_ORDER + 1] >>= 1;
+
+    /* Normalize and shorten */
+    max = FFABS(f[0]);
+    for (i = 1; i < LPC_ORDER + 2; i++)
+        max = FFMAX(max, FFABS(f[i]));
+
+    shift = normalize_bits_int32(max);
+
+    for (i = 0; i < LPC_ORDER + 2; i++)
+        f[i] = av_clipl_int32((int64_t)(f[i] << shift) + (1 << 15)) >> 16;
+
+    /**
+     * Evaluate F1 and F2 at uniform intervals of pi/256 along the
+     * unit circle and check for zero crossings.
+     */
+    p    = 0;
+    temp = 0;
+    for (i = 0; i <= LPC_ORDER / 2; i++)
+        temp += f[2 * i] * cos_tab[0];
+    prev_val = av_clipl_int32(temp << 1);
+    count    = 0;
+    for ( i = 1; i < COS_TBL_SIZE / 2; i++) {
+        /* Evaluate */
+        temp = 0;
+        for (j = 0; j <= LPC_ORDER / 2; j++)
+            temp += f[LPC_ORDER - 2 * j + p] * cos_tab[i * j % COS_TBL_SIZE];
+        cur_val = av_clipl_int32(temp << 1);
+
+        /* Check for sign change, indicating a zero crossing */
+        if ((cur_val ^ prev_val) < 0) {
+            int abs_cur  = FFABS(cur_val);
+            int abs_prev = FFABS(prev_val);
+            int sum      = abs_cur + abs_prev;
+
+            shift        = normalize_bits_int32(sum);
+            sum          <<= shift;
+            abs_prev     = abs_prev << shift >> 8;
+            lsp[count++] = ((i - 1) << 7) + (abs_prev >> 1) / (sum >> 16);
+
+            if (count == LPC_ORDER)
+                break;
+
+            /* Switch between sum and difference polynomials */
+            p ^= 1;
+
+            /* Evaluate */
+            temp = 0;
+            for (j = 0; j <= LPC_ORDER / 2; j++){
+                temp += f[LPC_ORDER - 2 * j + p] *
+                        cos_tab[i * j % COS_TBL_SIZE];
+            }
+            cur_val = av_clipl_int32(temp<<1);
+        }
+        prev_val = cur_val;
+    }
+
+    if (count != LPC_ORDER)
+        memcpy(lsp, prev_lsp, LPC_ORDER * sizeof(int16_t));
+}
+
+/**
+ * Quantize the current LSP subvector.
+ *
+ * @param num    band number
+ * @param offset offset of the current subvector in an LPC_ORDER vector
+ * @param size   size of the current subvector
+ */
+#define get_index(num, offset, size) \
+{\
+    int error, max = -1;\
+    int16_t temp[4];\
+    int i, j;\
+    for (i = 0; i < LSP_CB_SIZE; i++) {\
+        for (j = 0; j < size; j++){\
+            temp[j] = (weight[j + (offset)] * lsp_band##num[i][j] +\
+                      (1 << 14)) >> 15;\
+        }\
+        error =  dot_product(lsp + (offset), temp, size) << 1;\
+        error -= dot_product(lsp_band##num[i], temp, size);\
+        if (error > max) {\
+            max = error;\
+            lsp_index[num] = i;\
+        }\
+    }\
+}
+
+/**
+ * Vector quantize the LSP frequencies.
+ *
+ * @param lsp      the current lsp vector
+ * @param prev_lsp the previous lsp vector
+ */
+static void lsp_quantize(uint8_t *lsp_index, int16_t *lsp, int16_t *prev_lsp)
+{
+    int16_t weight[LPC_ORDER];
+    int16_t min, max;
+    int shift, i;
+
+    /* Calculate the VQ weighting vector */
+    weight[0] = (1 << 20) / (lsp[1] - lsp[0]);
+    weight[LPC_ORDER - 1] = (1 << 20) /
+                            (lsp[LPC_ORDER - 1] - lsp[LPC_ORDER - 2]);
+
+    for (i = 1; i < LPC_ORDER - 1; i++) {
+        min  = FFMIN(lsp[i] - lsp[i - 1], lsp[i + 1] - lsp[i]);
+        if (min > 0x20)
+            weight[i] = (1 << 20) / min;
+        else
+            weight[i] = INT16_MAX;
+    }
+
+    /* Normalize */
+    max = 0;
+    for (i = 0; i < LPC_ORDER; i++)
+        max = FFMAX(weight[i], max);
+
+    shift = normalize_bits_int16(max);
+    for (i = 0; i < LPC_ORDER; i++) {
+        weight[i] <<= shift;
+    }
+
+    /* Compute the VQ target vector */
+    for (i = 0; i < LPC_ORDER; i++) {
+        lsp[i] -= dc_lsp[i] +
+                  (((prev_lsp[i] - dc_lsp[i]) * 12288 + (1 << 14)) >> 15);
+    }
+
+    get_index(0, 0, 3);
+    get_index(1, 3, 3);
+    get_index(2, 6, 4);
+}
+
+/**
+ * Apply the formant perceptual weighting filter.
+ *
+ * @param flt_coef filter coefficients
+ * @param unq_lpc  unquantized lpc vector
+ */
+static void perceptual_filter(G723_1_Context *p, int16_t *flt_coef,
+                              int16_t *unq_lpc, int16_t *buf)
+{
+    int16_t vector[FRAME_LEN + LPC_ORDER];
+    int i, j, k, l = 0;
+
+    memcpy(buf, p->iir_mem, sizeof(int16_t) * LPC_ORDER);
+    memcpy(vector, p->fir_mem, sizeof(int16_t) * LPC_ORDER);
+    memcpy(vector + LPC_ORDER, buf + LPC_ORDER, sizeof(int16_t) * FRAME_LEN);
+
+    for (i = LPC_ORDER, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++) {
+        for (k = 0; k < LPC_ORDER; k++) {
+            flt_coef[k + 2 * l] = (unq_lpc[k + l] * percept_flt_tbl[0][k] +
+                                  (1 << 14)) >> 15;
+            flt_coef[k + 2 * l + LPC_ORDER] = (unq_lpc[k + l] *
+                                             percept_flt_tbl[1][k] +
+                                             (1 << 14)) >> 15;
+        }
+        iir_filter(flt_coef + 2 * l, flt_coef + 2 * l + LPC_ORDER, vector + i,
+                   buf + i, 0);
+        l += LPC_ORDER;
+    }
+    memcpy(p->iir_mem, buf + FRAME_LEN, sizeof(int16_t) * LPC_ORDER);
+    memcpy(p->fir_mem, vector + FRAME_LEN, sizeof(int16_t) * LPC_ORDER);
+}
+
+/**
+ * Estimate the open loop pitch period.
+ *
+ * @param buf   perceptually weighted speech
+ * @param start estimation is carried out from this position
+ */
+static int estimate_pitch(int16_t *buf, int start)
+{
+    int max_exp = 32;
+    int max_ccr = 0x4000;
+    int max_eng = 0x7fff;
+    int index   = PITCH_MIN;
+    int offset  = start - PITCH_MIN + 1;
+
+    int ccr, eng, orig_eng, ccr_eng, exp;
+    int diff, temp;
+
+    int i;
+
+    orig_eng = ff_dot_product(buf + offset, buf + offset, HALF_FRAME_LEN);
+
+    for (i = PITCH_MIN; i <= PITCH_MAX - 3; i++) {
+        offset--;
+
+        /* Update energy and compute correlation */
+        orig_eng += buf[offset] * buf[offset] -
+                    buf[offset + HALF_FRAME_LEN] * buf[offset + HALF_FRAME_LEN];
+        ccr      =  ff_dot_product(buf + start, buf + offset, HALF_FRAME_LEN);
+        if (ccr <= 0)
+            continue;
+
+        /* Split into mantissa and exponent to maintain precision */
+        exp  =   normalize_bits_int32(ccr);
+        ccr  =   av_clipl_int32((int64_t)(ccr << exp) + (1 << 15)) >> 16;
+        exp  <<= 1;
+        ccr  *=  ccr;
+        temp =   normalize_bits_int32(ccr);
+        ccr  =   ccr << temp >> 16;
+        exp  +=  temp;
+
+        temp =   normalize_bits_int32(orig_eng);
+        eng  =   av_clipl_int32((int64_t)(orig_eng << temp) + (1 << 15)) >> 16;
+        exp  -=  temp;
+
+        if (ccr >= eng) {
+            exp--;
+            ccr >>= 1;
+        }
+        if (exp > max_exp)
+            continue;
+
+        if (exp + 1 < max_exp)
+            goto update;
+
+        /* Equalize exponents before comparison */
+        if (exp + 1 == max_exp)
+            temp = max_ccr >> 1;
+        else
+            temp = max_ccr;
+        ccr_eng = ccr * max_eng;
+        diff    = ccr_eng - eng * temp;
+        if (diff > 0 && (i - index < PITCH_MIN || diff > ccr_eng >> 2)) {
+update:
+            index   = i;
+            max_exp = exp;
+            max_ccr = ccr;
+            max_eng = eng;
+        }
+    }
+    return index;
+}
+
+/**
+ * Compute harmonic noise filter parameters.
+ *
+ * @param buf       perceptually weighted speech
+ * @param pitch_lag open loop pitch period
+ * @param hf        harmonic filter parameters
+ */
+static void comp_harmonic_coeff(int16_t *buf, int16_t pitch_lag, HFParam *hf)
+{
+    int ccr, eng, max_ccr, max_eng;
+    int exp, max, diff;
+    int energy[15];
+    int i, j;
+
+    for (i = 0, j = pitch_lag - 3; j <= pitch_lag + 3; i++, j++) {
+        /* Compute residual energy */
+        energy[i << 1] = ff_dot_product(buf - j, buf - j, SUBFRAME_LEN);
+        /* Compute correlation */
+        energy[(i << 1) + 1] = ff_dot_product(buf, buf - j, SUBFRAME_LEN);
+    }
+
+    /* Compute target energy */
+    energy[14] = ff_dot_product(buf, buf, SUBFRAME_LEN);
+
+    /* Normalize */
+    max = 0;
+    for (i = 0; i < 15; i++)
+        max = FFMAX(max, FFABS(energy[i]));
+
+    exp = normalize_bits_int32(max);
+    for (i = 0; i < 15; i++) {
+        energy[i] = av_clipl_int32((int64_t)(energy[i] << exp) +
+                                   (1 << 15)) >> 16;
+    }
+
+    hf->index = -1;
+    hf->gain  =  0;
+    max_ccr   =  1;
+    max_eng   =  0x7fff;
+
+    for (i = 0; i <= 6; i++) {
+        eng = energy[i << 1];
+        ccr = energy[(i << 1) + 1];
+
+        if (ccr <= 0)
+            continue;
+
+        ccr  = (ccr * ccr + (1 << 14)) >> 15;
+        diff = ccr * max_eng - eng * max_ccr;
+        if (diff > 0) {
+            max_ccr   = ccr;
+            max_eng   = eng;
+            hf->index = i;
+        }
+    }
+
+    if (hf->index == -1) {
+        hf->index = pitch_lag;
+        return;
+    }
+
+    eng = energy[14] * max_eng;
+    eng = (eng >> 2) + (eng >> 3);
+    ccr = energy[(hf->index << 1) + 1] * energy[(hf->index << 1) + 1];
+    if (eng < ccr) {
+        eng = energy[(hf->index << 1) + 1];
+
+        if (eng >= max_eng)
+            hf->gain = 0x2800;
+        else
+            hf->gain = ((eng << 15) / max_eng * 0x2800 + (1 << 14)) >> 15;
+    }
+    hf->index += pitch_lag - 3;
+}
+
+/**
+ * Apply the harmonic noise shaping filter.
+ *
+ * @param hf filter parameters
+ */
+static void harmonic_filter(HFParam *hf, const int16_t *src, int16_t *dest)
+{
+    int i;
+
+    for (i = 0; i < SUBFRAME_LEN; i++) {
+        int64_t temp = hf->gain * src[i - hf->index] << 1;
+        dest[i] = av_clipl_int32((src[i] << 16) - temp + (1 << 15)) >> 16;
+    }
+}
+
+static void harmonic_noise_sub(HFParam *hf, const int16_t *src, int16_t *dest)
+{
+    int i;
+    for (i = 0; i < SUBFRAME_LEN; i++) {
+        int64_t temp = hf->gain * src[i - hf->index] << 1;
+        dest[i] = av_clipl_int32(((dest[i] - src[i]) << 16) + temp +
+                                 (1 << 15)) >> 16;
+
+    }
+}
+
+/**
+ * Combined synthesis and formant perceptual weighting filer.
+ *
+ * @param qnt_lpc  quantized lpc coefficients
+ * @param perf_lpc perceptual filter coefficients
+ * @param perf_fir perceptual filter fir memory
+ * @param perf_iir perceptual filter iir memory
+ * @param scale    the filter output will be scaled by 2^scale
+ */
+static void synth_percept_filter(int16_t *qnt_lpc, int16_t *perf_lpc,
+                                 int16_t *perf_fir, int16_t *perf_iir,
+                                 const int16_t *src, int16_t *dest, int scale)
+{
+    int i, j;
+    int16_t buf_16[SUBFRAME_LEN + LPC_ORDER];
+    int64_t buf[SUBFRAME_LEN];
+
+    int16_t *bptr_16 = buf_16 + LPC_ORDER;
+
+    memcpy(buf_16, perf_fir, sizeof(int16_t) * LPC_ORDER);
+    memcpy(dest - LPC_ORDER, perf_iir, sizeof(int16_t) * LPC_ORDER);
+
+    for (i = 0; i < SUBFRAME_LEN; i++) {
+        int64_t temp = 0;
+        for (j = 1; j <= LPC_ORDER; j++)
+            temp -= qnt_lpc[j - 1] * bptr_16[i - j];
+
+        buf[i]     = (src[i] << 15) + (temp << 3);
+        bptr_16[i] = av_clipl_int32(buf[i] + (1 << 15)) >> 16;
+    }
+
+    for (i = 0; i < SUBFRAME_LEN; i++) {
+        int64_t fir = 0, iir = 0;
+        for (j = 1; j <= LPC_ORDER; j++) {
+            fir -= perf_lpc[j - 1] * bptr_16[i - j];
+            iir += perf_lpc[j + LPC_ORDER - 1] * dest[i - j];
+        }
+        dest[i] = av_clipl_int32(((buf[i] + (fir << 3)) << scale) + (iir << 3) +
+                                 (1 << 15)) >> 16;
+    }
+    memcpy(perf_fir, buf_16 + SUBFRAME_LEN, sizeof(int16_t) * LPC_ORDER);
+    memcpy(perf_iir, dest + SUBFRAME_LEN - LPC_ORDER,
+           sizeof(int16_t) * LPC_ORDER);
+}
+
+/**
+ * Compute the adaptive codebook contribution.
+ *
+ * @param buf   input signal
+ * @param index the current subframe index
+ */
+static void acb_search(G723_1_Context *p, int16_t *residual,
+                       int16_t *impulse_resp, const int16_t *buf,
+                       int index)
+{
+
+    int16_t flt_buf[PITCH_ORDER][SUBFRAME_LEN];
+
+    const int16_t *cb_tbl = adaptive_cb_gain85;
+
+    int ccr_buf[PITCH_ORDER * SUBFRAMES << 2];
+
+    int pitch_lag = p->pitch_lag[index >> 1];
+    int acb_lag   = 1;
+    int acb_gain  = 0;
+    int odd_frame = index & 1;
+    int iter      = 3 + odd_frame;
+    int count     = 0;
+    int tbl_size  = 85;
+
+    int i, j, k, l, max;
+    int64_t temp;
+
+    if (!odd_frame) {
+        if (pitch_lag == PITCH_MIN)
+            pitch_lag++;
+        else
+            pitch_lag = FFMIN(pitch_lag, PITCH_MAX - 5);
+    }
+
+    for (i = 0; i < iter; i++) {
+        get_residual(residual, p->prev_excitation, pitch_lag + i - 1);
+
+        for (j = 0; j < SUBFRAME_LEN; j++) {
+            temp = 0;
+            for (k = 0; k <= j; k++)
+                temp += residual[PITCH_ORDER - 1 + k] * impulse_resp[j - k];
+            flt_buf[PITCH_ORDER - 1][j] = av_clipl_int32((temp << 1) +
+                                                         (1 << 15)) >> 16;
+        }
+
+        for (j = PITCH_ORDER - 2; j >= 0; j--) {
+            flt_buf[j][0] = ((residual[j] << 13) + (1 << 14)) >> 15;
+            for (k = 1; k < SUBFRAME_LEN; k++) {
+                temp = (flt_buf[j + 1][k - 1] << 15) +
+                       residual[j] * impulse_resp[k];
+                flt_buf[j][k] = av_clipl_int32((temp << 1) + (1 << 15)) >> 16;
+            }
+        }
+
+        /* Compute crosscorrelation with the signal */
+        for (j = 0; j < PITCH_ORDER; j++) {
+            temp = ff_dot_product(buf, flt_buf[j], SUBFRAME_LEN);
+            ccr_buf[count++] = av_clipl_int32(temp << 1);
+        }
+
+        /* Compute energies */
+        for (j = 0; j < PITCH_ORDER; j++) {
+            ccr_buf[count++] = dot_product(flt_buf[j], flt_buf[j],
+                                           SUBFRAME_LEN);
+        }
+
+        for (j = 1; j < PITCH_ORDER; j++) {
+            for (k = 0; k < j; k++) {
+                temp = ff_dot_product(flt_buf[j], flt_buf[k], SUBFRAME_LEN);
+                ccr_buf[count++] = av_clipl_int32(temp<<2);
+            }
+        }
+    }
+
+    /* Normalize and shorten */
+    max = 0;
+    for (i = 0; i < 20 * iter; i++)
+        max = FFMAX(max, FFABS(ccr_buf[i]));
+
+    temp = normalize_bits_int32(max);
+
+    for (i = 0; i < 20 * iter; i++){
+        ccr_buf[i] = av_clipl_int32((int64_t)(ccr_buf[i] << temp) +
+                                    (1 << 15)) >> 16;
+    }
+
+    max = 0;
+    for (i = 0; i < iter; i++) {
+        /* Select quantization table */
+        if (!odd_frame && pitch_lag + i - 1 >= SUBFRAME_LEN - 2 ||
+            odd_frame && pitch_lag >= SUBFRAME_LEN - 2) {
+            cb_tbl = adaptive_cb_gain170;
+            tbl_size = 170;
+        }
+
+        for (j = 0, k = 0; j < tbl_size; j++, k += 20) {
+            temp = 0;
+            for (l = 0; l < 20; l++)
+                temp += ccr_buf[20 * i + l] * cb_tbl[k + l];
+            temp =  av_clipl_int32(temp);
+
+            if (temp > max) {
+                max      = temp;
+                acb_gain = j;
+                acb_lag  = i;
+            }
+        }
+    }
+
+    if (!odd_frame) {
+        pitch_lag += acb_lag - 1;
+        acb_lag   =  1;
+    }
+
+    p->pitch_lag[index >> 1]      = pitch_lag;
+    p->subframe[index].ad_cb_lag  = acb_lag;
+    p->subframe[index].ad_cb_gain = acb_gain;
+}
+
+/**
+ * Subtract the adaptive codebook contribution from the input
+ * to obtain the residual.
+ *
+ * @param buf target vector
+ */
+static void sub_acb_contrib(const int16_t *residual, const int16_t *impulse_resp,
+                            int16_t *buf)
+{
+    int i, j;
+    /* Subtract adaptive CB contribution to obtain the residual */
+    for (i = 0; i < SUBFRAME_LEN; i++) {
+        int64_t temp = buf[i] << 14;
+        for (j = 0; j <= i; j++)
+            temp -= residual[j] * impulse_resp[i - j];
+
+        buf[i] = av_clipl_int32((temp << 2) + (1 << 15)) >> 16;
+    }
+}
+
+/**
+ * Quantize the residual signal using the fixed codebook (MP-MLQ).
+ *
+ * @param optim optimized fixed codebook parameters
+ * @param buf   excitation vector
+ */
+static void get_fcb_param(FCBParam *optim, int16_t *impulse_resp,
+                          int16_t *buf, int pulse_cnt, int pitch_lag)
+{
+    FCBParam param;
+    int16_t impulse_r[SUBFRAME_LEN];
+    int16_t temp_corr[SUBFRAME_LEN];
+    int16_t impulse_corr[SUBFRAME_LEN];
+
+    int ccr1[SUBFRAME_LEN];
+    int ccr2[SUBFRAME_LEN];
+    int amp, err, max, max_amp_index, min, scale, i, j, k, l;
+
+    int64_t temp;
+
+    /* Update impulse response */
+    memcpy(impulse_r, impulse_resp, sizeof(int16_t) * SUBFRAME_LEN);
+    param.dirac_train = 0;
+    if (pitch_lag < SUBFRAME_LEN - 2) {
+        param.dirac_train = 1;
+        gen_dirac_train(impulse_r, pitch_lag);
+    }
+
+    for (i = 0; i < SUBFRAME_LEN; i++)
+        temp_corr[i] = impulse_r[i] >> 1;
+
+    /* Compute impulse response autocorrelation */
+    temp = dot_product(temp_corr, temp_corr, SUBFRAME_LEN);
+
+    scale = normalize_bits_int32(temp);
+    impulse_corr[0] = av_clipl_int32((temp << scale) + (1 << 15)) >> 16;
+
+    for (i = 1; i < SUBFRAME_LEN; i++) {
+        temp = dot_product(temp_corr + i, temp_corr, SUBFRAME_LEN - i);
+        impulse_corr[i] = av_clipl_int32((temp << scale) + (1 << 15)) >> 16;
+    }
+
+    /* Compute crosscorrelation of impulse response with residual signal */
+    scale -= 4;
+    for (i = 0; i < SUBFRAME_LEN; i++){
+        temp = dot_product(buf + i, impulse_r, SUBFRAME_LEN - i);
+        if (scale < 0)
+            ccr1[i] = temp >> -scale;
+        else
+            ccr1[i] = av_clipl_int32(temp << scale);
+    }
+
+    /* Search loop */
+    for (i = 0; i < GRID_SIZE; i++) {
+        /* Maximize the crosscorrelation */
+        max = 0;
+        for (j = i; j < SUBFRAME_LEN; j += GRID_SIZE) {
+            temp = FFABS(ccr1[j]);
+            if (temp >= max) {
+                max = temp;
+                param.pulse_pos[0] = j;
+            }
+        }
+
+        /* Quantize the gain (max crosscorrelation/impulse_corr[0]) */
+        amp = max;
+        min = 1 << 30;
+        max_amp_index = GAIN_LEVELS - 2;
+        for (j = max_amp_index; j >= 2; j--) {
+            temp = av_clipl_int32((int64_t)fixed_cb_gain[j] *
+                                  impulse_corr[0] << 1);
+            temp = FFABS(temp - amp);
+            if (temp < min) {
+                min = temp;
+                max_amp_index = j;
+            }
+        }
+
+        max_amp_index--;
+        /* Select additional gain values */
+        for (j = 1; j < 5; j++) {
+            for (k = i; k < SUBFRAME_LEN; k += GRID_SIZE) {
+                temp_corr[k] = 0;
+                ccr2[k]      = ccr1[k];
+            }
+            param.amp_index = max_amp_index + j - 2;
+            amp = fixed_cb_gain[param.amp_index];
+
+            param.pulse_sign[0] = (ccr2[param.pulse_pos[0]] < 0) ? -amp : amp;
+            temp_corr[param.pulse_pos[0]] = 1;
+
+            for (k = 1; k < pulse_cnt; k++) {
+                max = INT_MIN;
+                for (l = i; l < SUBFRAME_LEN; l += GRID_SIZE) {
+                    if (temp_corr[l])
+                        continue;
+                    temp = impulse_corr[FFABS(l - param.pulse_pos[k - 1])];
+                    temp = av_clipl_int32((int64_t)temp *
+                                          param.pulse_sign[k - 1] << 1);
+                    ccr2[l] -= temp;
+                    temp = FFABS(ccr2[l]);
+                    if (temp > max) {
+                        max = temp;
+                        param.pulse_pos[k] = l;
+                    }
+                }
+
+                param.pulse_sign[k] = (ccr2[param.pulse_pos[k]] < 0) ?
+                                      -amp : amp;
+                temp_corr[param.pulse_pos[k]] = 1;
+            }
+
+            /* Create the error vector */
+            memset(temp_corr, 0, sizeof(int16_t) * SUBFRAME_LEN);
+
+            for (k = 0; k < pulse_cnt; k++)
+                temp_corr[param.pulse_pos[k]] = param.pulse_sign[k];
+
+            for (k = SUBFRAME_LEN - 1; k >= 0; k--) {
+                temp = 0;
+                for (l = 0; l <= k; l++) {
+                    int prod = av_clipl_int32((int64_t)temp_corr[l] *
+                                              impulse_r[k - l] << 1);
+                    temp     = av_clipl_int32(temp + prod);
+                }
+                temp_corr[k] = temp << 2 >> 16;
+            }
+
+            /* Compute square of error */
+            err = 0;
+            for (k = 0; k < SUBFRAME_LEN; k++) {
+                int64_t prod;
+                prod = av_clipl_int32((int64_t)buf[k] * temp_corr[k] << 1);
+                err  = av_clipl_int32(err - prod);
+                prod = av_clipl_int32((int64_t)temp_corr[k] * temp_corr[k]);
+                err  = av_clipl_int32(err + prod);
+            }
+
+            /* Minimize */
+            if (err < optim->min_err) {
+                optim->min_err     = err;
+                optim->grid_index  = i;
+                optim->amp_index   = param.amp_index;
+                optim->dirac_train = param.dirac_train;
+
+                for (k = 0; k < pulse_cnt; k++) {
+                    optim->pulse_sign[k] = param.pulse_sign[k];
+                    optim->pulse_pos[k]  = param.pulse_pos[k];
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Encode the pulse position and gain of the current subframe.
+ *
+ * @param optim optimized fixed CB parameters
+ * @param buf   excitation vector
+ */
+static void pack_fcb_param(G723_1_Subframe *subfrm, FCBParam *optim,
+                           int16_t *buf, int pulse_cnt)
+{
+    int i, j;
+
+    j = PULSE_MAX - pulse_cnt;
+
+    subfrm->pulse_sign = 0;
+    subfrm->pulse_pos  = 0;
+
+    for (i = 0; i < SUBFRAME_LEN >> 1; i++) {
+        int val = buf[optim->grid_index + (i << 1)];
+        if (!val) {
+            subfrm->pulse_pos += combinatorial_table[j][i];
+        } else {
+            subfrm->pulse_sign <<= 1;
+            if (val < 0) subfrm->pulse_sign++;
+            j++;
+
+            if (j == PULSE_MAX) break;
+        }
+    }
+    subfrm->amp_index   = optim->amp_index;
+    subfrm->grid_index  = optim->grid_index;
+    subfrm->dirac_train = optim->dirac_train;
+}
+
+/**
+ * Compute the fixed codebook excitation.
+ *
+ * @param buf          target vector
+ * @param impulse_resp impulse response of the combined filter
+ */
+static void fcb_search(G723_1_Context *p, int16_t *impulse_resp,
+                       int16_t *buf, int index)
+{
+    FCBParam optim;
+    int pulse_cnt = pulses[index];
+    int i;
+
+    optim.min_err = 1 << 30;
+    get_fcb_param(&optim, impulse_resp, buf, pulse_cnt, SUBFRAME_LEN);
+
+    if (p->pitch_lag[index >> 1] < SUBFRAME_LEN - 2) {
+        get_fcb_param(&optim, impulse_resp, buf, pulse_cnt,
+                      p->pitch_lag[index >> 1]);
+    }
+
+    /* Reconstruct the excitation */
+    memset(buf, 0, sizeof(int16_t) * SUBFRAME_LEN);
+    for (i = 0; i < pulse_cnt; i++)
+        buf[optim.pulse_pos[i]] = optim.pulse_sign[i];
+
+    pack_fcb_param(&p->subframe[index], &optim, buf, pulse_cnt);
+
+    if (optim.dirac_train)
+        gen_dirac_train(buf, p->pitch_lag[index >> 1]);
+}
+
+/**
+ * Pack the frame parameters into output bitstream.
+ *
+ * @param frame output buffer
+ * @param size  size of the buffer
+ */
+static int pack_bitstream(G723_1_Context *p, unsigned char *frame, int size)
+{
+    PutBitContext pb;
+    int info_bits, i, temp;
+
+    init_put_bits(&pb, frame, size);
+
+    if (p->cur_rate == RATE_6300) {
+        info_bits = 0;
+        put_bits(&pb, 2, info_bits);
+    }else
+        av_assert0(0);
+
+    put_bits(&pb, 8, p->lsp_index[2]);
+    put_bits(&pb, 8, p->lsp_index[1]);
+    put_bits(&pb, 8, p->lsp_index[0]);
+
+    put_bits(&pb, 7, p->pitch_lag[0] - PITCH_MIN);
+    put_bits(&pb, 2, p->subframe[1].ad_cb_lag);
+    put_bits(&pb, 7, p->pitch_lag[1] - PITCH_MIN);
+    put_bits(&pb, 2, p->subframe[3].ad_cb_lag);
+
+    /* Write 12 bit combined gain */
+    for (i = 0; i < SUBFRAMES; i++) {
+        temp = p->subframe[i].ad_cb_gain * GAIN_LEVELS +
+               p->subframe[i].amp_index;
+        if (p->cur_rate ==  RATE_6300)
+            temp += p->subframe[i].dirac_train << 11;
+        put_bits(&pb, 12, temp);
+    }
+
+    put_bits(&pb, 1, p->subframe[0].grid_index);
+    put_bits(&pb, 1, p->subframe[1].grid_index);
+    put_bits(&pb, 1, p->subframe[2].grid_index);
+    put_bits(&pb, 1, p->subframe[3].grid_index);
+
+    if (p->cur_rate == RATE_6300) {
+        skip_put_bits(&pb, 1); /* reserved bit */
+
+        /* Write 13 bit combined position index */
+        temp = (p->subframe[0].pulse_pos >> 16) * 810 +
+               (p->subframe[1].pulse_pos >> 14) *  90 +
+               (p->subframe[2].pulse_pos >> 16) *   9 +
+               (p->subframe[3].pulse_pos >> 14);
+        put_bits(&pb, 13, temp);
+
+        put_bits(&pb, 16, p->subframe[0].pulse_pos & 0xffff);
+        put_bits(&pb, 14, p->subframe[1].pulse_pos & 0x3fff);
+        put_bits(&pb, 16, p->subframe[2].pulse_pos & 0xffff);
+        put_bits(&pb, 14, p->subframe[3].pulse_pos & 0x3fff);
+
+        put_bits(&pb, 6, p->subframe[0].pulse_sign);
+        put_bits(&pb, 5, p->subframe[1].pulse_sign);
+        put_bits(&pb, 6, p->subframe[2].pulse_sign);
+        put_bits(&pb, 5, p->subframe[3].pulse_sign);
+    }
+
+    flush_put_bits(&pb);
+    return frame_size[info_bits];
+}
+
+static int g723_1_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                            const AVFrame *frame, int *got_packet_ptr)
+{
+    G723_1_Context *p = avctx->priv_data;
+    int16_t unq_lpc[LPC_ORDER * SUBFRAMES];
+    int16_t qnt_lpc[LPC_ORDER * SUBFRAMES];
+    int16_t cur_lsp[LPC_ORDER];
+    int16_t weighted_lpc[LPC_ORDER * SUBFRAMES << 1];
+    int16_t vector[FRAME_LEN + PITCH_MAX];
+    int offset, ret;
+    int16_t *in_orig = av_memdup(frame->data[0], frame->nb_samples * sizeof(int16_t));
+    int16_t *in = in_orig;
+
+    HFParam hf[4];
+    int i, j;
+
+    if (!in)
+        return AVERROR(ENOMEM);
+
+    highpass_filter(in, &p->hpf_fir_mem, &p->hpf_iir_mem);
+
+    memcpy(vector, p->prev_data, HALF_FRAME_LEN * sizeof(int16_t));
+    memcpy(vector + HALF_FRAME_LEN, in, FRAME_LEN * sizeof(int16_t));
+
+    comp_lpc_coeff(vector, unq_lpc);
+    lpc2lsp(&unq_lpc[LPC_ORDER * 3], p->prev_lsp, cur_lsp);
+    lsp_quantize(p->lsp_index, cur_lsp, p->prev_lsp);
+
+    /* Update memory */
+    memcpy(vector + LPC_ORDER, p->prev_data + SUBFRAME_LEN,
+           sizeof(int16_t) * SUBFRAME_LEN);
+    memcpy(vector + LPC_ORDER + SUBFRAME_LEN, in,
+           sizeof(int16_t) * (HALF_FRAME_LEN + SUBFRAME_LEN));
+    memcpy(p->prev_data, in + HALF_FRAME_LEN,
+           sizeof(int16_t) * HALF_FRAME_LEN);
+    memcpy(in, vector + LPC_ORDER, sizeof(int16_t) * FRAME_LEN);
+
+    perceptual_filter(p, weighted_lpc, unq_lpc, vector);
+
+    memcpy(in, vector + LPC_ORDER, sizeof(int16_t) * FRAME_LEN);
+    memcpy(vector, p->prev_weight_sig, sizeof(int16_t) * PITCH_MAX);
+    memcpy(vector + PITCH_MAX, in, sizeof(int16_t) * FRAME_LEN);
+
+    scale_vector(vector, vector, FRAME_LEN + PITCH_MAX);
+
+    p->pitch_lag[0] = estimate_pitch(vector, PITCH_MAX);
+    p->pitch_lag[1] = estimate_pitch(vector, PITCH_MAX + HALF_FRAME_LEN);
+
+    for (i = PITCH_MAX, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
+        comp_harmonic_coeff(vector + i, p->pitch_lag[j >> 1], hf + j);
+
+    memcpy(vector, p->prev_weight_sig, sizeof(int16_t) * PITCH_MAX);
+    memcpy(vector + PITCH_MAX, in, sizeof(int16_t) * FRAME_LEN);
+    memcpy(p->prev_weight_sig, vector + FRAME_LEN, sizeof(int16_t) * PITCH_MAX);
+
+    for (i = 0, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++)
+        harmonic_filter(hf + j, vector + PITCH_MAX + i, in + i);
+
+    inverse_quant(cur_lsp, p->prev_lsp, p->lsp_index, 0);
+    lsp_interpolate(qnt_lpc, cur_lsp, p->prev_lsp);
+
+    memcpy(p->prev_lsp, cur_lsp, sizeof(int16_t) * LPC_ORDER);
+
+    offset = 0;
+    for (i = 0; i < SUBFRAMES; i++) {
+        int16_t impulse_resp[SUBFRAME_LEN];
+        int16_t residual[SUBFRAME_LEN + PITCH_ORDER - 1];
+        int16_t flt_in[SUBFRAME_LEN];
+        int16_t zero[LPC_ORDER], fir[LPC_ORDER], iir[LPC_ORDER];
+
+        /**
+         * Compute the combined impulse response of the synthesis filter,
+         * formant perceptual weighting filter and harmonic noise shaping filter
+         */
+        memset(zero, 0, sizeof(int16_t) * LPC_ORDER);
+        memset(vector, 0, sizeof(int16_t) * PITCH_MAX);
+        memset(flt_in, 0, sizeof(int16_t) * SUBFRAME_LEN);
+
+        flt_in[0] = 1 << 13; /* Unit impulse */
+        synth_percept_filter(qnt_lpc + offset, weighted_lpc + (offset << 1),
+                             zero, zero, flt_in, vector + PITCH_MAX, 1);
+        harmonic_filter(hf + i, vector + PITCH_MAX, impulse_resp);
+
+         /* Compute the combined zero input response */
+        flt_in[0] = 0;
+        memcpy(fir, p->perf_fir_mem, sizeof(int16_t) * LPC_ORDER);
+        memcpy(iir, p->perf_iir_mem, sizeof(int16_t) * LPC_ORDER);
+
+        synth_percept_filter(qnt_lpc + offset, weighted_lpc + (offset << 1),
+                             fir, iir, flt_in, vector + PITCH_MAX, 0);
+        memcpy(vector, p->harmonic_mem, sizeof(int16_t) * PITCH_MAX);
+        harmonic_noise_sub(hf + i, vector + PITCH_MAX, in);
+
+        acb_search(p, residual, impulse_resp, in, i);
+        gen_acb_excitation(residual, p->prev_excitation,p->pitch_lag[i >> 1],
+                           &p->subframe[i], p->cur_rate);
+        sub_acb_contrib(residual, impulse_resp, in);
+
+        fcb_search(p, impulse_resp, in, i);
+
+        /* Reconstruct the excitation */
+        gen_acb_excitation(impulse_resp, p->prev_excitation, p->pitch_lag[i >> 1],
+                           &p->subframe[i], RATE_6300);
+
+        memmove(p->prev_excitation, p->prev_excitation + SUBFRAME_LEN,
+               sizeof(int16_t) * (PITCH_MAX - SUBFRAME_LEN));
+        for (j = 0; j < SUBFRAME_LEN; j++)
+            in[j] = av_clip_int16((in[j] << 1) + impulse_resp[j]);
+        memcpy(p->prev_excitation + PITCH_MAX - SUBFRAME_LEN, in,
+               sizeof(int16_t) * SUBFRAME_LEN);
+
+        /* Update filter memories */
+        synth_percept_filter(qnt_lpc + offset, weighted_lpc + (offset << 1),
+                             p->perf_fir_mem, p->perf_iir_mem,
+                             in, vector + PITCH_MAX, 0);
+        memmove(p->harmonic_mem, p->harmonic_mem + SUBFRAME_LEN,
+                sizeof(int16_t) * (PITCH_MAX - SUBFRAME_LEN));
+        memcpy(p->harmonic_mem + PITCH_MAX - SUBFRAME_LEN, vector + PITCH_MAX,
+               sizeof(int16_t) * SUBFRAME_LEN);
+
+        in += SUBFRAME_LEN;
+        offset += LPC_ORDER;
+    }
+
+    av_freep(&in_orig); in = NULL;
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 24, 0)) < 0)
+        return ret;
+
+    *got_packet_ptr = 1;
+    avpkt->size = pack_bitstream(p, avpkt->data, avpkt->size);
+    return 0;
+}
+
+AVCodec ff_g723_1_encoder = {
+    .name           = "g723_1",
+    .long_name      = NULL_IF_CONFIG_SMALL("G.723.1"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_G723_1,
+    .priv_data_size = sizeof(G723_1_Context),
+    .init           = g723_1_encode_init,
+    .encode2        = g723_1_encode_frame,
+    .sample_fmts    = (const enum AVSampleFormat[]){AV_SAMPLE_FMT_S16,
+                                                    AV_SAMPLE_FMT_NONE},
+};
+#endif
diff --git a/libavcodec/g723_1_data.h b/libavcodec/g723_1_data.h
index 04f8a06e37..db7f6e4d3c 100644
--- a/libavcodec/g723_1_data.h
+++ b/libavcodec/g723_1_data.h
@@ -1,28 +1,28 @@
 /*
- * G.723.1 compatible decoder data tables.
+ * G723.1 compatible decoder data tables.
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2010 Mohamed Naufal Basheer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * G.723.1 compatible decoder data tables
+ * G723.1 compatible decoder data tables
  */
 
 #ifndef AVCODEC_G723_1_DATA_H
@@ -33,6 +33,8 @@
 #define SUBFRAMES       4
 #define SUBFRAME_LEN    60
 #define FRAME_LEN       (SUBFRAME_LEN << 2)
+#define HALF_FRAME_LEN  (FRAME_LEN / 2)
+#define LPC_FRAME       (HALF_FRAME_LEN + SUBFRAME_LEN)
 #define LPC_ORDER       10
 #define LSP_BANDS       3
 #define LSP_CB_SIZE     256
@@ -44,19 +46,89 @@
 #define GAIN_LEVELS     24
 #define COS_TBL_SIZE    512
 
+/**
+ * G723.1 frame types
+ */
+typedef enum FrameType {
+    ACTIVE_FRAME,        ///< Active speech
+    SID_FRAME,           ///< Silence Insertion Descriptor frame
+    UNTRANSMITTED_FRAME
+} FrameType;
+
 static const uint8_t frame_size[4] = { 24, 20, 4, 1 };
 
-/* Postfilter gain weighting factors scaled by 2^15 */
-static const int16_t ppf_gain_weight[2] = { 0x1800, 0x2000 };
+typedef enum Rate {
+    RATE_6300,
+    RATE_5300
+} Rate;
+
+/**
+ * G723.1 unpacked data subframe
+ */
+typedef struct G723_1_Subframe {
+    int ad_cb_lag;     ///< adaptive codebook lag
+    int ad_cb_gain;
+    int dirac_train;
+    int pulse_sign;
+    int grid_index;
+    int amp_index;
+    int pulse_pos;
+} G723_1_Subframe;
+
+/**
+ * Pitch postfilter parameters
+ */
+typedef struct PPFParam {
+    int     index;    ///< postfilter backward/forward lag
+    int16_t opt_gain; ///< optimal gain
+    int16_t sc_gain;  ///< scaling gain
+} PPFParam;
+
+/**
+ * Harmonic filter parameters
+ */
+typedef struct HFParam {
+    int index;
+    int gain;
+} HFParam;
+
+/**
+ * Optimized fixed codebook excitation parameters
+ */
+typedef struct FCBParam {
+    int min_err;
+    int amp_index;
+    int grid_index;
+    int dirac_train;
+    int pulse_pos[PULSE_MAX];
+    int pulse_sign[PULSE_MAX];
+} FCBParam;
+
+/**
+ * Postfilter gain weighting factors scaled by 2^15
+ */
+static const int16_t ppf_gain_weight[2] = {0x1800, 0x2000};
 
-/* LSP DC component */
+/**
+ * LSP DC component
+ */
 static const int16_t dc_lsp[LPC_ORDER] = {
-    0x0c3b, 0x1271, 0x1e0a, 0x2a36, 0x3630,
-    0x406f, 0x4d28, 0x56f4, 0x638c, 0x6c46
+    0x0c3b,
+    0x1271,
+    0x1e0a,
+    0x2a36,
+    0x3630,
+    0x406f,
+    0x4d28,
+    0x56f4,
+    0x638c,
+    0x6c46
 };
 
-/* Cosine table scaled by 2^14 */
-static const int16_t cos_tab[COS_TBL_SIZE] = {
+/**
+ * Cosine table scaled by 2^14
+ */
+static const int16_t cos_tab[COS_TBL_SIZE+1] = {
     16384,  16383,  16379,  16373,  16364,  16353,  16340,  16324,
     16305,  16284,  16261,  16235,  16207,  16176,  16143,  16107,
     16069,  16029,  15986,  15941,  15893,  15843,  15791,  15736,
@@ -121,9 +193,12 @@ static const int16_t cos_tab[COS_TBL_SIZE] = {
     15679,  15736,  15791,  15843,  15893,  15941,  15986,  16029,
     16069,  16107,  16143,  16176,  16207,  16235,  16261,  16284,
     16305,  16324,  16340,  16353,  16364,  16373,  16379,  16383,
+    16384
 };
 
-/* LSP VQ tables */
+/**
+ *  LSP VQ tables
+ */
 static const int16_t lsp_band0[LSP_CB_SIZE][3] = {
     {    0,      0,      0}, { -270,  -1372,  -1032}, { -541,  -1650,  -1382},
     { -723,  -2011,  -2213}, { -941,  -1122,  -1942}, { -780,  -1145,  -2454},
@@ -433,12 +508,12 @@ static const int16_t lsp_band2[LSP_CB_SIZE][4] = {
     { 3633,   2336,   2408,   1453}, { 2923,   3517,   2567,   1318},
 };
 
-/*
+/**
  * Used for the coding/decoding of the pulses positions
  * for the MP-MLQ codebook
  */
 static const int32_t combinatorial_table[PULSE_MAX][SUBFRAME_LEN/GRID_SIZE] = {
-    {118755, 98280, 80730, 65780L, 53130,
+    {118755, 98280, 80730,  65780, 53130,
       42504, 33649, 26334,  20349, 15504,
       11628,  8568,  6188,   4368,  3003,
        2002,  1287,   792,    462,   252,
@@ -527,10 +602,14 @@ static const int16_t pitch_contrib[340] = {
     -2, 25144,  0, 17998
 };
 
-/* Number of non-zero pulses in the MP-MLQ excitation */
+/**
+ * Number of non-zero pulses in the MP-MLQ excitation
+ */
 static const int8_t pulses[4] = {6, 5, 6, 5};
 
-/* Size of the MP-MLQ fixed excitation codebooks */
+/**
+ * Size of the MP-MLQ fixed excitation codebooks
+ */
 static const int32_t max_pos[4] = {593775, 142506, 593775, 142506};
 
 static const int16_t fixed_cb_gain[GAIN_LEVELS] = {
@@ -1183,12 +1262,62 @@ static const int16_t adaptive_cb_gain170[170 * 20] = {
     -4534,  -2487,  -3932,  -4166,  -2113,  -3341,  -3540,  -3070
 };
 
-/* 0.65^i (Zero part) and 0.75^i (Pole part) scaled by 2^15 */
+/**
+ * 0.65^i (Zero part) and 0.75^i (Pole part) scaled by 2^15
+ */
 static const int16_t postfilter_tbl[2][LPC_ORDER] = {
     /* Zero */
-    { 21299, 13844,  8999,  5849, 3802, 2471, 1606, 1044,  679,  441 },
+    {21299, 13844,  8999,  5849, 3802, 2471, 1606, 1044,  679,  441},
     /* Pole */
-    { 24576, 18432, 13824, 10368, 7776, 5832, 4374, 3281, 2460, 1845 }
+    {24576, 18432, 13824, 10368, 7776, 5832, 4374, 3281, 2460, 1845}
+};
+
+/**
+ * Hamming window coefficients scaled by 2^15
+ */
+static const int16_t hamming_window[LPC_FRAME] = {
+     2621,  2631,  2659,  2705,  2770,  2853,  2955,  3074,  3212,  3367,
+     3541,  3731,  3939,  4164,  4405,  4663,  4937,  5226,  5531,  5851,
+     6186,  6534,  6897,  7273,  7661,  8062,  8475,  8899,  9334,  9780,
+    10235, 10699, 11172, 11653, 12141, 12636, 13138, 13645, 14157, 14673,
+    15193, 15716, 16242, 16769, 17298, 17827, 18356, 18884, 19411, 19935,
+    20457, 20975, 21489, 21999, 22503, 23002, 23494, 23978, 24455, 24924,
+    25384, 25834, 26274, 26704, 27122, 27529, 27924, 28306, 28675, 29031,
+    29373, 29700, 30012, 30310, 30592, 30857, 31107, 31340, 31557, 31756,
+    31938, 32102, 32249, 32377, 32488, 32580, 32654, 32710, 32747, 32766,
+    32766, 32747, 32710, 32654, 32580, 32488, 32377, 32249, 32102, 31938,
+    31756, 31557, 31340, 31107, 30857, 30592, 30310, 30012, 29700, 29373,
+    29031, 28675, 28306, 27924, 27529, 27122, 26704, 26274, 25834, 25384,
+    24924, 24455, 23978, 23494, 23002, 22503, 21999, 21489, 20975, 20457,
+    19935, 19411, 18884, 18356, 17827, 17298, 16769, 16242, 15716, 15193,
+    14673, 14157, 13645, 13138, 12636, 12141, 11653, 11172, 10699, 10235,
+     9780, 9334,   8899,  8475,  8062,  7661,  7273,  6897,  6534,  6186,
+     5851, 5531,   5226,  4937,  4663,  4405,  4164,  3939,  3731,  3541,
+     3367, 3212,   3074,  2955,  2853,  2770,  2705,  2659,  2631,  2621
+};
+
+/**
+ * Binomial window coefficients scaled by 2^15
+ */
+static const int16_t binomial_window[LPC_ORDER] = {
+    32749, 32695, 32604, 32477, 32315, 32118, 31887, 31622, 31324, 30995
+};
+
+/**
+ * 0.994^i scaled by 2^15
+ */
+static const int16_t bandwidth_expand[LPC_ORDER] = {
+    32571, 32376, 32182, 31989, 31797, 31606, 31416, 31228, 31040, 30854
+};
+
+/**
+ * 0.5^i scaled by 2^15
+ */
+static const int16_t percept_flt_tbl[2][LPC_ORDER] = {
+    /* Zero part */
+    {29491, 26542, 23888, 21499, 19349, 17414, 15673, 14106, 12695, 11425},
+    /* Pole part */
+    {16384,  8192,  4096,  2048,  1024,   512,   256,   128,    64,    32}
 };
 
 static const int cng_adaptive_cb_lag[4] = { 1, 0, 1, 3 };
diff --git a/libavcodec/g726.c b/libavcodec/g726.c
index 4a23ff78a4..934d120b6f 100644
--- a/libavcodec/g726.c
+++ b/libavcodec/g726.c
@@ -5,20 +5,20 @@
  * This is a very straightforward rendition of the G.726
  * Section 4 "Computational Details".
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include <limits.h>
@@ -95,6 +95,7 @@ typedef struct G726Context {
     int sez;            /**< estimated second order prediction */
     int y;              /**< quantizer scaling factor for the next iteration */
     int code_size;
+    int little_endian;  /**< little-endian bitstream as used in aiff and Sun AU */
 } G726Context;
 
 static const int quant_tbl16[] =                  /**< 16kbit/s 2bits per sample */
@@ -296,7 +297,7 @@ static int16_t g726_encode(G726Context* c, int16_t sig)
 {
     uint8_t i;
 
-    i = quant(c, sig/4 - c->se) & ((1<<c->code_size) - 1);
+    i = av_mod_uintp2(quant(c, sig/4 - c->se), c->code_size);
     g726_decode(c, i);
     return i;
 }
@@ -350,10 +351,8 @@ static int g726_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int i, ret, out_size;
 
     out_size = (frame->nb_samples * c->code_size + 7) / 8;
-    if ((ret = ff_alloc_packet(avpkt, out_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, out_size, 0)) < 0)
         return ret;
-    }
     init_put_bits(&pb, avpkt->data, avpkt->size);
 
     for (i = 0; i < frame->nb_samples; i++)
@@ -373,7 +372,7 @@ static const AVOption options[] = {
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass g726_class = {
     .class_name = "g726",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -396,19 +395,25 @@ AVCodec ff_adpcm_g726_encoder = {
     .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &g726_class,
     .defaults       = defaults,
 };
 #endif
 
-#if CONFIG_ADPCM_G726_DECODER
+#if CONFIG_ADPCM_G726_DECODER || CONFIG_ADPCM_G726LE_DECODER
 static av_cold int g726_decode_init(AVCodecContext *avctx)
 {
     G726Context* c = avctx->priv_data;
 
+    if(avctx->channels > 1){
+        avpriv_request_sample(avctx, "Decoding more than one channel");
+        return AVERROR_PATCHWELCOME;
+    }
     avctx->channels       = 1;
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
 
+    c->little_endian = !strcmp(avctx->codec->name, "g726le");
+
     c->code_size = avctx->bits_per_coded_sample;
     if (c->code_size < 2 || c->code_size > 5) {
         av_log(avctx, AV_LOG_ERROR, "Invalid number of bits %d\n", c->code_size);
@@ -436,16 +441,16 @@ static int g726_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = out_samples;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
     init_get_bits(&gb, buf, buf_size * 8);
 
     while (out_samples--)
-        *samples++ = g726_decode(c, get_bits(&gb, c->code_size));
+        *samples++ = g726_decode(c, c->little_endian ?
+                                    get_bits_le(&gb, c->code_size) :
+                                    get_bits(&gb, c->code_size));
 
     if (get_bits_left(&gb) > 0)
         av_log(avctx, AV_LOG_ERROR, "Frame invalidly split, missing parser?\n");
@@ -460,7 +465,9 @@ static void g726_decode_flush(AVCodecContext *avctx)
     G726Context *c = avctx->priv_data;
     g726_reset(c);
 }
+#endif
 
+#if CONFIG_ADPCM_G726_DECODER
 AVCodec ff_adpcm_g726_decoder = {
     .name           = "g726",
     .long_name      = NULL_IF_CONFIG_SMALL("G.726 ADPCM"),
@@ -473,3 +480,17 @@ AVCodec ff_adpcm_g726_decoder = {
     .capabilities   = AV_CODEC_CAP_DR1,
 };
 #endif
+
+#if CONFIG_ADPCM_G726LE_DECODER
+AVCodec ff_adpcm_g726le_decoder = {
+    .name           = "g726le",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_ADPCM_G726LE,
+    .priv_data_size = sizeof(G726Context),
+    .init           = g726_decode_init,
+    .decode         = g726_decode_frame,
+    .flush          = g726_decode_flush,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("G.726 ADPCM little-endian"),
+};
+#endif
diff --git a/libavcodec/g729.h b/libavcodec/g729.h
new file mode 100644
index 0000000000..7c5f693a7a
--- /dev/null
+++ b/libavcodec/g729.h
@@ -0,0 +1,33 @@
+/*
+ * G.729, G729 Annex D decoders
+ * Copyright (c) 2008 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVCODEC_G729_H
+#define AVCODEC_G729_H
+
+/**
+ * subframe size
+ */
+#define SUBFRAME_SIZE 40
+
+/* bytes per block */
+#define G729_8K_BLOCK_SIZE     10
+#define G729D_6K4_BLOCK_SIZE   8
+
+#endif // AVCODEC_G729_H
diff --git a/libavcodec/g729_parser.c b/libavcodec/g729_parser.c
new file mode 100644
index 0000000000..d13c990807
--- /dev/null
+++ b/libavcodec/g729_parser.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2015  Ganesh Ajjanagadde
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * G.729 audio parser
+ *
+ * Splits packets into individual blocks.
+ */
+
+#include "libavutil/avassert.h"
+#include "parser.h"
+#include "g729.h"
+
+typedef struct G729ParseContext {
+    ParseContext pc;
+    int block_size;
+    int duration;
+    int remaining;
+} G729ParseContext;
+
+static int g729_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
+                     const uint8_t **poutbuf, int *poutbuf_size,
+                     const uint8_t *buf, int buf_size)
+{
+    G729ParseContext *s = s1->priv_data;
+    ParseContext *pc = &s->pc;
+    int next;
+
+    if (!s->block_size) {
+        av_assert1(avctx->codec_id == AV_CODEC_ID_G729);
+        /* FIXME: replace this heuristic block_size with more precise estimate */
+        s->block_size = (avctx->bit_rate < 8000) ? G729D_6K4_BLOCK_SIZE : G729_8K_BLOCK_SIZE;
+        s->duration   = avctx->frame_size;
+    }
+
+    if (!s->remaining)
+        s->remaining = s->block_size;
+    if (s->remaining <= buf_size) {
+        next = s->remaining;
+        s->remaining = 0;
+    } else {
+        next = END_NOT_FOUND;
+        s->remaining -= buf_size;
+    }
+
+    if (ff_combine_frame(pc, next, &buf, &buf_size) < 0 || !buf_size) {
+        *poutbuf      = NULL;
+        *poutbuf_size = 0;
+        return buf_size;
+    }
+
+    s1->duration = s->duration;
+
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+    return next;
+}
+
+AVCodecParser ff_g729_parser = {
+    .codec_ids      = { AV_CODEC_ID_G729 },
+    .priv_data_size = sizeof(G729ParseContext),
+    .parser_parse   = g729_parse,
+    .parser_close   = ff_parse_close,
+};
diff --git a/libavcodec/g729data.h b/libavcodec/g729data.h
new file mode 100644
index 0000000000..365ca47ec6
--- /dev/null
+++ b/libavcodec/g729data.h
@@ -0,0 +1,382 @@
+/*
+ * data for G.729, G729 Annex D decoders
+ * Copyright (c) 2007 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_G729DATA_H
+#define AVCODEC_G729DATA_H
+
+#include <stdint.h>
+
+#define MA_NP                4  ///< Moving Average (MA) prediction order
+
+#define VQ_1ST_BITS          7  ///< first stage vector of quantizer (size in bits)
+#define VQ_2ND_BITS          5  ///< second stage vector of quantizer (size in bits)
+
+#define GC_1ST_IDX_BITS_8K   3  ///< gain codebook (first stage) index, 8k mode (size in bits)
+#define GC_2ND_IDX_BITS_8K   4  ///< gain codebook (second stage) index, 8k mode (size in bits)
+
+#define GC_1ST_IDX_BITS_6K4  3  ///< gain codebook (first stage) index, 6.4k mode (size in bits)
+#define GC_2ND_IDX_BITS_6K4  3  ///< gain codebook (second stage) index, 6.4k mode (size in bits)
+
+/**
+ * first stage LSP codebook
+ * (10-dimensional, with 128 entries (3.24 of G.729)
+ */
+static const int16_t cb_lsp_1st[1<<VQ_1ST_BITS][10] = { /* (2.13) */
+  { 1486,  2168,  3751,  9074, 12134, 13944, 17983, 19173, 21190, 21820},
+  { 1730,  2640,  3450,  4870,  6126,  7876, 15644, 17817, 20294, 21902},
+  { 1568,  2256,  3088,  4874, 11063, 13393, 18307, 19293, 21109, 21741},
+  { 1733,  2512,  3357,  4708,  6977, 10296, 17024, 17956, 19145, 20350},
+  { 1744,  2436,  3308,  8731, 10432, 12007, 15614, 16639, 21359, 21913},
+  { 1786,  2369,  3372,  4521,  6795, 12963, 17674, 18988, 20855, 21640},
+  { 1631,  2433,  3361,  6328, 10709, 12013, 13277, 13904, 19441, 21088},
+  { 1489,  2364,  3291,  6250,  9227, 10403, 13843, 15278, 17721, 21451},
+  { 1869,  2533,  3475,  4365,  9152, 14513, 15908, 17022, 20611, 21411},
+  { 2070,  3025,  4333,  5854,  7805,  9231, 10597, 16047, 20109, 21834},
+  { 1910,  2673,  3419,  4261, 11168, 15111, 16577, 17591, 19310, 20265},
+  { 1141,  1815,  2624,  4623,  6495,  9588, 13968, 16428, 19351, 21286},
+  { 2192,  3171,  4707,  5808, 10904, 12500, 14162, 15664, 21124, 21789},
+  { 1286,  1907,  2548,  3453,  9574, 11964, 15978, 17344, 19691, 22495},
+  { 1921,  2720,  4604,  6684, 11503, 12992, 14350, 15262, 16997, 20791},
+  { 2052,  2759,  3897,  5246,  6638, 10267, 15834, 16814, 18149, 21675},
+  { 1798,  2497,  5617, 11449, 13189, 14711, 17050, 18195, 20307, 21182},
+  { 1009,  1647,  2889,  5709,  9541, 12354, 15231, 18494, 20966, 22033},
+  { 3016,  3794,  5406,  7469, 12488, 13984, 15328, 16334, 19952, 20791},
+  { 2203,  3040,  3796,  5442, 11987, 13512, 14931, 16370, 17856, 18803},
+  { 2912,  4292,  7988,  9572, 11562, 13244, 14556, 16529, 20004, 21073},
+  { 2861,  3607,  5923,  7034,  9234, 12054, 13729, 18056, 20262, 20974},
+  { 3069,  4311,  5967,  7367, 11482, 12699, 14309, 16233, 18333, 19172},
+  { 2434,  3661,  4866,  5798, 10383, 11722, 13049, 15668, 18862, 19831},
+  { 2020,  2605,  3860,  9241, 13275, 14644, 16010, 17099, 19268, 20251},
+  { 1877,  2809,  3590,  4707, 11056, 12441, 15622, 17168, 18761, 19907},
+  { 2107,  2873,  3673,  5799, 13579, 14687, 15938, 17077, 18890, 19831},
+  { 1612,  2284,  2944,  3572,  8219, 13959, 15924, 17239, 18592, 20117},
+  { 2420,  3156,  6542, 10215, 12061, 13534, 15305, 16452, 18717, 19880},
+  { 1667,  2612,  3534,  5237, 10513, 11696, 12940, 16798, 18058, 19378},
+  { 2388,  3017,  4839,  9333, 11413, 12730, 15024, 16248, 17449, 18677},
+  { 1875,  2786,  4231,  6320,  8694, 10149, 11785, 17013, 18608, 19960},
+  {  679,  1411,  4654,  8006, 11446, 13249, 15763, 18127, 20361, 21567},
+  { 1838,  2596,  3578,  4608,  5650, 11274, 14355, 15886, 20579, 21754},
+  { 1303,  1955,  2395,  3322, 12023, 13764, 15883, 18077, 20180, 21232},
+  { 1438,  2102,  2663,  3462,  8328, 10362, 13763, 17248, 19732, 22344},
+  {  860,  1904,  6098,  7775,  9815, 12007, 14821, 16709, 19787, 21132},
+  { 1673,  2723,  3704,  6125,  7668,  9447, 13683, 14443, 20538, 21731},
+  { 1246,  1849,  2902,  4508,  7221, 12710, 14835, 16314, 19335, 22720},
+  { 1525,  2260,  3862,  5659,  7342, 11748, 13370, 14442, 18044, 21334},
+  { 1196,  1846,  3104,  7063, 10972, 12905, 14814, 17037, 19922, 22636},
+  { 2147,  3106,  4475,  6511,  8227,  9765, 10984, 12161, 18971, 21300},
+  { 1585,  2405,  2994,  4036, 11481, 13177, 14519, 15431, 19967, 21275},
+  { 1778,  2688,  3614,  4680,  9465, 11064, 12473, 16320, 19742, 20800},
+  { 1862,  2586,  3492,  6719, 11708, 13012, 14364, 16128, 19610, 20425},
+  { 1395,  2156,  2669,  3386, 10607, 12125, 13614, 16705, 18976, 21367},
+  { 1444,  2117,  3286,  6233,  9423, 12981, 14998, 15853, 17188, 21857},
+  { 2004,  2895,  3783,  4897,  6168,  7297, 12609, 16445, 19297, 21465},
+  { 1495,  2863,  6360,  8100, 11399, 14271, 15902, 17711, 20479, 22061},
+  { 2484,  3114,  5718,  7097,  8400, 12616, 14073, 14847, 20535, 21396},
+  { 2424,  3277,  5296,  6284, 11290, 12903, 16022, 17508, 19333, 20283},
+  { 2565,  3778,  5360,  6989,  8782, 10428, 14390, 15742, 17770, 21734},
+  { 2727,  3384,  6613,  9254, 10542, 12236, 14651, 15687, 20074, 21102},
+  { 1916,  2953,  6274,  8088,  9710, 10925, 12392, 16434, 20010, 21183},
+  { 3384,  4366,  5349,  7667, 11180, 12605, 13921, 15324, 19901, 20754},
+  { 3075,  4283,  5951,  7619,  9604, 11010, 12384, 14006, 20658, 21497},
+  { 1751,  2455,  5147,  9966, 11621, 13176, 14739, 16470, 20788, 21756},
+  { 1442,  2188,  3330,  6813,  8929, 12135, 14476, 15306, 19635, 20544},
+  { 2294,  2895,  4070,  8035, 12233, 13416, 14762, 17367, 18952, 19688},
+  { 1937,  2659,  4602,  6697,  9071, 12863, 14197, 15230, 16047, 18877},
+  { 2071,  2663,  4216,  9445, 10887, 12292, 13949, 14909, 19236, 20341},
+  { 1740,  2491,  3488,  8138,  9656, 11153, 13206, 14688, 20896, 21907},
+  { 2199,  2881,  4675,  8527, 10051, 11408, 14435, 15463, 17190, 20597},
+  { 1943,  2988,  4177,  6039,  7478,  8536, 14181, 15551, 17622, 21579},
+  { 1825,  3175,  7062,  9818, 12824, 15450, 18330, 19856, 21830, 22412},
+  { 2464,  3046,  4822,  5977,  7696, 15398, 16730, 17646, 20588, 21320},
+  { 2550,  3393,  5305,  6920, 10235, 14083, 18143, 19195, 20681, 21336},
+  { 3003,  3799,  5321,  6437,  7919, 11643, 15810, 16846, 18119, 18980},
+  { 3455,  4157,  6838,  8199,  9877, 12314, 15905, 16826, 19949, 20892},
+  { 3052,  3769,  4891,  5810,  6977, 10126, 14788, 15990, 19773, 20904},
+  { 3671,  4356,  5827,  6997,  8460, 12084, 14154, 14939, 19247, 20423},
+  { 2716,  3684,  5246,  6686,  8463, 10001, 12394, 14131, 16150, 19776},
+  { 1945,  2638,  4130,  7995, 14338, 15576, 17057, 18206, 20225, 20997},
+  { 2304,  2928,  4122,  4824,  5640, 13139, 15825, 16938, 20108, 21054},
+  { 1800,  2516,  3350,  5219, 13406, 15948, 17618, 18540, 20531, 21252},
+  { 1436,  2224,  2753,  4546,  9657, 11245, 15177, 16317, 17489, 19135},
+  { 2319,  2899,  4980,  6936,  8404, 13489, 15554, 16281, 20270, 20911},
+  { 2187,  2919,  4610,  5875,  7390, 12556, 14033, 16794, 20998, 21769},
+  { 2235,  2923,  5121,  6259,  8099, 13589, 15340, 16340, 17927, 20159},
+  { 1765,  2638,  3751,  5730,  7883, 10108, 13633, 15419, 16808, 18574},
+  { 3460,  5741,  9596, 11742, 14413, 16080, 18173, 19090, 20845, 21601},
+  { 3735,  4426,  6199,  7363,  9250, 14489, 16035, 17026, 19873, 20876},
+  { 3521,  4778,  6887,  8680, 12717, 14322, 15950, 18050, 20166, 21145},
+  { 2141,  2968,  6865,  8051, 10010, 13159, 14813, 15861, 17528, 18655},
+  { 4148,  6128,  9028, 10871, 12686, 14005, 15976, 17208, 19587, 20595},
+  { 4403,  5367,  6634,  8371, 10163, 11599, 14963, 16331, 17982, 18768},
+  { 4091,  5386,  6852,  8770, 11563, 13290, 15728, 16930, 19056, 20102},
+  { 2746,  3625,  5299,  7504, 10262, 11432, 13172, 15490, 16875, 17514},
+  { 2248,  3556,  8539, 10590, 12665, 14696, 16515, 17824, 20268, 21247},
+  { 1279,  1960,  3920,  7793, 10153, 14753, 16646, 18139, 20679, 21466},
+  { 2440,  3475,  6737,  8654, 12190, 14588, 17119, 17925, 19110, 19979},
+  { 1879,  2514,  4497,  7572, 10017, 14948, 16141, 16897, 18397, 19376},
+  { 2804,  3688,  7490, 10086, 11218, 12711, 16307, 17470, 20077, 21126},
+  { 2023,  2682,  3873,  8268, 10255, 11645, 15187, 17102, 18965, 19788},
+  { 2823,  3605,  5815,  8595, 10085, 11469, 16568, 17462, 18754, 19876},
+  { 2851,  3681,  5280,  7648,  9173, 10338, 14961, 16148, 17559, 18474},
+  { 1348,  2645,  5826,  8785, 10620, 12831, 16255, 18319, 21133, 22586},
+  { 2141,  3036,  4293,  6082,  7593, 10629, 17158, 18033, 21466, 22084},
+  { 1608,  2375,  3384,  6878,  9970, 11227, 16928, 17650, 20185, 21120},
+  { 2774,  3616,  5014,  6557,  7788,  8959, 17068, 18302, 19537, 20542},
+  { 1934,  4813,  6204,  7212,  8979, 11665, 15989, 17811, 20426, 21703},
+  { 2288,  3507,  5037,  6841,  8278,  9638, 15066, 16481, 21653, 22214},
+  { 2951,  3771,  4878,  7578,  9016, 10298, 14490, 15242, 20223, 20990},
+  { 3256,  4791,  6601,  7521,  8644,  9707, 13398, 16078, 19102, 20249},
+  { 1827,  2614,  3486,  6039, 12149, 13823, 16191, 17282, 21423, 22041},
+  { 1000,  1704,  3002,  6335,  8471, 10500, 14878, 16979, 20026, 22427},
+  { 1646,  2286,  3109,  7245, 11493, 12791, 16824, 17667, 18981, 20222},
+  { 1708,  2501,  3315,  6737,  8729,  9924, 16089, 17097, 18374, 19917},
+  { 2623,  3510,  4478,  5645,  9862, 11115, 15219, 18067, 19583, 20382},
+  { 2518,  3434,  4728,  6388,  8082,  9285, 13162, 18383, 19819, 20552},
+  { 1726,  2383,  4090,  6303,  7805, 12845, 14612, 17608, 19269, 20181},
+  { 2860,  3735,  4838,  6044,  7254,  8402, 14031, 16381, 18037, 19410},
+  { 4247,  5993,  7952,  9792, 12342, 14653, 17527, 18774, 20831, 21699},
+  { 3502,  4051,  5680,  6805,  8146, 11945, 16649, 17444, 20390, 21564},
+  { 3151,  4893,  5899,  7198, 11418, 13073, 15124, 17673, 20520, 21861},
+  { 3960,  4848,  5926,  7259,  8811, 10529, 15661, 16560, 18196, 20183},
+  { 4499,  6604,  8036,  9251, 10804, 12627, 15880, 17512, 20020, 21046},
+  { 4251,  5541,  6654,  8318,  9900, 11686, 15100, 17093, 20572, 21687},
+  { 3769,  5327,  7865,  9360, 10684, 11818, 13660, 15366, 18733, 19882},
+  { 3083,  3969,  6248,  8121,  9798, 10994, 12393, 13686, 17888, 19105},
+  { 2731,  4670,  7063,  9201, 11346, 13735, 16875, 18797, 20787, 22360},
+  { 1187,  2227,  4737,  7214,  9622, 12633, 15404, 17968, 20262, 23533},
+  { 1911,  2477,  3915, 10098, 11616, 12955, 16223, 17138, 19270, 20729},
+  { 1764,  2519,  3887,  6944,  9150, 12590, 16258, 16984, 17924, 18435},
+  { 1400,  3674,  7131,  8718, 10688, 12508, 15708, 17711, 19720, 21068},
+  { 2322,  3073,  4287,  8108,  9407, 10628, 15862, 16693, 19714, 21474},
+  { 2630,  3339,  4758,  8360, 10274, 11333, 12880, 17374, 19221, 19936},
+  { 1721,  2577,  5553,  7195,  8651, 10686, 15069, 16953, 18703, 19929}
+};
+
+/**
+ * second stage LSP codebook, high and low parts
+   (both 5-dimensional, with 32 entries (3.2.4 of G.729)
+ */
+static const int16_t cb_lsp_2nd[1<<VQ_2ND_BITS][10] = { /* (2.13) */
+  { -435,  -815,  -742,  1033,  -518,   582, -1201,   829,    86,   385},
+  { -833,  -891,   463,    -8, -1251,  1450,    72,  -231,   864,   661},
+  {-1021,   231,  -306,   321,  -220,  -163,  -526,  -754, -1633,   267},
+  {   57,  -198,  -339,   -33, -1468,   573,   796,  -169,  -631,   816},
+  {  171,  -350,   294,  1660,   453,   519,   291,   159,  -640, -1296},
+  { -701,  -842,   -58,   950,   892,  1549,   715,   527,  -714,  -193},
+  {  584,    31,  -289,   356,  -333,  -457,   612,  -283, -1381,  -741},
+  { -109,  -808,   231,    77,   -87,  -344,  1341,  1087,  -654,  -569},
+  { -859,  1236,   550,   854,   714,  -543, -1752,  -195,   -98,  -276},
+  { -877,  -954, -1248,  -299,   212,  -235,  -728,   949,  1517,   895},
+  {  -77,   344,  -620,   763,   413,   502,  -362,  -960,  -483,  1386},
+  { -314,  -307,  -256, -1260,  -429,   450,  -466,  -108,  1010,  2223},
+  {  711,   693,   521,   650,  1305,   -28,  -378,   744, -1005,   240},
+  { -112,  -271,  -500,   946,  1733,   271,   -15,   909,  -259,  1688},
+  {  575,   -10,  -468,  -199,  1101, -1011,   581,   -53,  -747,   878},
+  {  145,  -285, -1280,  -398,    36,  -498, -1377,    18,  -444,  1483},
+  {-1133,  -835,  1350,  1284,   -95,  1015,  -222,   443,   372,  -354},
+  {-1459, -1237,   416,  -213,   466,   669,   659,  1640,   932,   534},
+  {  -15,    66,   468,  1019,  -748,  1385,  -182,  -907,  -721,  -262},
+  { -338,   148,  1445,    75,  -760,   569,  1247,   337,   416,  -121},
+  {  389,   239,  1568,   981,   113,   369, -1003,  -507,  -587,  -904},
+  { -312,   -98,   949,    31,  1104,    72,  -141,  1465,    63,  -785},
+  { 1127,   584,   835,   277, -1159,   208,   301,  -882,   117,  -404},
+  {  539,  -114,   856,  -493,   223,  -912,   623,   -76,   276,  -440},
+  { 2197,  2337,  1268,   670,   304,  -267,  -525,   140,   882,  -139},
+  {-1596,   550,   801,  -456,   -56,  -697,   865,  1060,   413,   446},
+  { 1154,   593,   -77,  1237,   -31,   581, -1037,  -895,   669,   297},
+  {  397,   558,   203,  -797,  -919,     3,   692,  -292,  1050,   782},
+  {  334,  1475,   632,   -80,    48, -1061,  -484,   362,  -597,  -852},
+  { -545,  -330,  -429,  -680,  1133, -1182,  -744,  1340,   262,    63},
+  { 1320,   827,  -398,  -576,   341,  -774,  -483, -1247,   -70,    98},
+  { -163,   674,   -11,  -886,   531, -1125,  -265,  -242,   724,   934}
+};
+
+/**
+ * gain codebook (first stage), 8k mode (3.9.2 of G.729)
+ */
+static const int16_t cb_gain_1st_8k[1<<GC_1ST_IDX_BITS_8K][2] = { /*(0.14) (2.13) */
+  { 3242 ,  9949 },
+  { 1551 ,  2425 },
+  { 2678 , 27162 },
+  { 1921 ,  9291 },
+  { 1831 ,  5022 },
+  {    1 ,  1516 },
+  {  356 , 14756 },
+  {   57 ,  5404 },
+};
+
+/**
+ * gain codebook (second stage), 8k mode (3.9.2 of G.729)
+ */
+static const int16_t cb_gain_2nd_8k[1<<GC_2ND_IDX_BITS_8K][2] = { /*(1.14) (1.13) */
+  {  5142 ,   592 },
+  { 17299 ,  1861 },
+  {  6160 ,  2395 },
+  { 16112 ,  3392 },
+  {   826 ,  2005 },
+  { 18973 ,  5935 },
+  {  1994 ,     0 },
+  { 15434 ,   237 },
+  { 10573 ,  2966 },
+  { 15132 ,  4914 },
+  { 11569 ,  1196 },
+  { 14194 ,  1630 },
+  {  8091 ,  4861 },
+  { 15161 , 14276 },
+  {  9120 ,   525 },
+  { 13260 ,  3256 },
+};
+
+/**
+ * gain codebook (first stage), 6.4k mode (D.3.9.2 of G.729)
+ */
+static const int16_t cb_gain_1st_6k4[1<<GC_1ST_IDX_BITS_6K4][2] =
+{ /*(0.14) (1.14)*/
+ { 5849,     0 },
+ { 3171,  9280 },
+ { 3617,  6747 },
+ { 4987, 22294 },
+ { 2929,  1078 },
+ { 6068,  6093 },
+ { 9425,  2731 },
+ { 3915, 12872 },
+};
+
+/**
+ * gain codebook (second stage), 6.4k mode (D.3.9.2 of G.729)
+ */
+static const int16_t cb_gain_2nd_6k4[1<<GC_2ND_IDX_BITS_6K4][2] =
+{ /*(1.14) (1.14)*/
+ {    0,  4175 },
+ {10828, 27602 },
+ {16423, 15724 },
+ { 4478,  7324 },
+ { 3988,     0 },
+ {10291, 11385 },
+ {11956, 10735 },
+ { 7876,  7821 },
+};
+
+/**
+ * 4th order Moving Average (MA) Predictor codebook (3.2.4 of G.729)
+ *
+ * float cb_ma_predictor_float[2][MA_NP][10] = {
+ *   {
+ *     {0.2570, 0.2780, 0.2800, 0.2736, 0.2757, 0.2764, 0.2675, 0.2678, 0.2779, 0.2647},
+ *     {0.2142, 0.2194, 0.2331, 0.2230, 0.2272, 0.2252, 0.2148, 0.2123, 0.2115, 0.2096},
+ *     {0.1670, 0.1523, 0.1567, 0.1580, 0.1601, 0.1569, 0.1589, 0.1555, 0.1474, 0.1571},
+ *     {0.1238, 0.0925, 0.0798, 0.0923, 0.0890, 0.0828, 0.1010, 0.0988, 0.0872, 0.1060},
+ *   },
+ *   {
+ *     {0.2360, 0.2405, 0.2499, 0.2495, 0.2517, 0.2591, 0.2636, 0.2625, 0.2551, 0.2310},
+ *     {0.1285, 0.0925, 0.0779, 0.1060, 0.1183, 0.1176, 0.1277, 0.1268, 0.1193, 0.1211},
+ *     {0.0981, 0.0589, 0.0401, 0.0654, 0.0761, 0.0728, 0.0841, 0.0826, 0.0776, 0.0891},
+ *     {0.0923, 0.0486, 0.0287, 0.0498, 0.0526, 0.0482, 0.0621, 0.0636, 0.0584, 0.0794},
+ *   },
+ * };
+ *                                    15
+ * cb_ma_predictor[j][k][i] = floor( 2 * cb_ma_predictor_float[j][k][i] )
+ *
+ * j=0..1, i=0..9, k=0..MA_NP-1
+ */
+static const int16_t cb_ma_predictor[2][MA_NP][10] = { /* (0.15) */
+  {
+    { 8421,  9109,  9175,  8965,  9034,  9057,  8765,  8775,  9106,  8673},
+    { 7018,  7189,  7638,  7307,  7444,  7379,  7038,  6956,  6930,  6868},
+    { 5472,  4990,  5134,  5177,  5246,  5141,  5206,  5095,  4830,  5147},
+    { 4056,  3031,  2614,  3024,  2916,  2713,  3309,  3237,  2857,  3473}
+  },
+  {
+    { 7733,  7880,  8188,  8175,  8247,  8490,  8637,  8601,  8359,  7569},
+    { 4210,  3031,  2552,  3473,  3876,  3853,  4184,  4154,  3909,  3968},
+    { 3214,  1930,  1313,  2143,  2493,  2385,  2755,  2706,  2542,  2919},
+    { 3024,  1592,   940,  1631,  1723,  1579,  2034,  2084,  1913,  2601}
+  }
+};
+
+/**
+ *                                     15         3
+ * cb_ma_predictor_sum[j][i] = floor( 2 * (1.0 - sum ( cb_ma_predictor_float[j][k][i] ) ) )
+ *                                               k=0
+ * j=0..1, i=0..9
+ */
+static const int16_t cb_ma_predictor_sum[2][10] = { /* (0.15) */
+  { 7798,  8447,  8205,  8293,  8126,  8477,  8447,  8703,  9043,  8604},
+  {14585, 18333, 19772, 17344, 16426, 16459, 15155, 15220, 16043, 15708}
+};
+
+/**
+ *                                                           12
+ *                                                          2
+ * cb_ma_predictor_sum_inv[j][i] = floor(---------------------------------------------)
+ *                                               3
+ *                                        1.0 - sum ( cb_ma_predictor_float[j][k][i] )
+ *                                              k=0
+ * j=0..1, i=0..9
+ */
+static const int16_t cb_ma_predictor_sum_inv[2][10] = { /* (3.12) */
+  {17210, 15888, 16357, 16183, 16516, 15833, 15888, 15421, 14840, 15597},
+  { 9202,  7320,  6788,  7738,  8170,  8154,  8856,  8818,  8366,  8544}
+};
+
+/**
+ * MA prediction coefficients (3.9.1 of G.729, near Equation 69)
+ */
+static const uint16_t ma_prediction_coeff[4] = { /* (0.13) */
+  5571, 4751, 2785, 1556
+};
+
+/**
+ * initial LSP coefficients belongs to virtual frame preceding  the
+ * first frame of the stream
+ */
+static const int16_t lsp_init[10]= { /* (0.15) */
+   30000, 26000, 21000, 15000, 8000, 0, -8000,-15000,-21000,-26000
+};
+
+/**
+ * additional "phase" post-processing filter impulse response (D.6.2 of G.729)
+ *
+ * Table contains three impulse responses, correspond to
+ * different amounts of spreading.
+ */
+static const int16_t phase_filter[3][40] =
+{
+  { // maximum spreading (for noise-like segments)
+    14690, 11518,  1268, -2762, -5672,  7514,  -36, -2808, -3041,  4823,
+     2952, -8425,  3785,  1455,  2179, -8638, 8051, -2104, -1455,   777,
+     1108, -2386,  2254,  -364,  -675, -2104, 6046, -5682,  1072,  3123,
+    -5059,  5312, -2330, -3729,  6924, -3890,  675, -1776,    29, 10145,
+  },
+  { // medium spreading
+    30274,  3831, -4037,  2972, -1049, -1003,  2477, -3044,  2815, -2232,
+     1753, -1612,  1714, -1776,  1543, -1009,   429,  -170,   472, -1265,
+     2176, -2707,  2523, -1622,   344,   826, -1530,  1724, -1658,  1701,
+    -2064,  2644, -3061,  2897, -1979,   557,   780, -1370,   842,   655,
+  },
+  { // no spreading (for voiced speech)
+    32767, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  }
+};
+#endif /* AVCODEC_G729DATA_H */
diff --git a/libavcodec/g729dec.c b/libavcodec/g729dec.c
new file mode 100644
index 0000000000..99053add43
--- /dev/null
+++ b/libavcodec/g729dec.c
@@ -0,0 +1,726 @@
+/*
+ * G.729, G729 Annex D decoders
+ * Copyright (c) 2008 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include <string.h>
+
+#include "avcodec.h"
+#include "libavutil/avutil.h"
+#include "get_bits.h"
+#include "audiodsp.h"
+#include "internal.h"
+
+
+#include "g729.h"
+#include "lsp.h"
+#include "celp_math.h"
+#include "celp_filters.h"
+#include "acelp_filters.h"
+#include "acelp_pitch_delay.h"
+#include "acelp_vectors.h"
+#include "g729data.h"
+#include "g729postfilter.h"
+
+/**
+ * minimum quantized LSF value (3.2.4)
+ * 0.005 in Q13
+ */
+#define LSFQ_MIN                   40
+
+/**
+ * maximum quantized LSF value (3.2.4)
+ * 3.135 in Q13
+ */
+#define LSFQ_MAX                   25681
+
+/**
+ * minimum LSF distance (3.2.4)
+ * 0.0391 in Q13
+ */
+#define LSFQ_DIFF_MIN              321
+
+/// interpolation filter length
+#define INTERPOL_LEN              11
+
+/**
+ * minimum gain pitch value (3.8, Equation 47)
+ * 0.2 in (1.14)
+ */
+#define SHARP_MIN                  3277
+
+/**
+ * maximum gain pitch value (3.8, Equation 47)
+ * (EE) This does not comply with the specification.
+ * Specification says about 0.8, which should be
+ * 13107 in (1.14), but reference C code uses
+ * 13017 (equals to 0.7945) instead of it.
+ */
+#define SHARP_MAX                  13017
+
+/**
+ * MR_ENERGY (mean removed energy) = mean_energy + 10 * log10(2^26  * subframe_size) in (7.13)
+ */
+#define MR_ENERGY 1018156
+
+#define DECISION_NOISE        0
+#define DECISION_INTERMEDIATE 1
+#define DECISION_VOICE        2
+
+typedef enum {
+    FORMAT_G729_8K = 0,
+    FORMAT_G729D_6K4,
+    FORMAT_COUNT,
+} G729Formats;
+
+typedef struct {
+    uint8_t ac_index_bits[2];   ///< adaptive codebook index for second subframe (size in bits)
+    uint8_t parity_bit;         ///< parity bit for pitch delay
+    uint8_t gc_1st_index_bits;  ///< gain codebook (first stage) index (size in bits)
+    uint8_t gc_2nd_index_bits;  ///< gain codebook (second stage) index (size in bits)
+    uint8_t fc_signs_bits;      ///< number of pulses in fixed-codebook vector
+    uint8_t fc_indexes_bits;    ///< size (in bits) of fixed-codebook index entry
+} G729FormatDescription;
+
+typedef struct {
+    AudioDSPContext adsp;
+
+    /// past excitation signal buffer
+    int16_t exc_base[2*SUBFRAME_SIZE+PITCH_DELAY_MAX+INTERPOL_LEN];
+
+    int16_t* exc;               ///< start of past excitation data in buffer
+    int pitch_delay_int_prev;   ///< integer part of previous subframe's pitch delay (4.1.3)
+
+    /// (2.13) LSP quantizer outputs
+    int16_t  past_quantizer_output_buf[MA_NP + 1][10];
+    int16_t* past_quantizer_outputs[MA_NP + 1];
+
+    int16_t lsfq[10];           ///< (2.13) quantized LSF coefficients from previous frame
+    int16_t lsp_buf[2][10];     ///< (0.15) LSP coefficients (previous and current frames) (3.2.5)
+    int16_t *lsp[2];            ///< pointers to lsp_buf
+
+    int16_t quant_energy[4];    ///< (5.10) past quantized energy
+
+    /// previous speech data for LP synthesis filter
+    int16_t syn_filter_data[10];
+
+
+    /// residual signal buffer (used in long-term postfilter)
+    int16_t residual[SUBFRAME_SIZE + RES_PREV_DATA_SIZE];
+
+    /// previous speech data for residual calculation filter
+    int16_t res_filter_data[SUBFRAME_SIZE+10];
+
+    /// previous speech data for short-term postfilter
+    int16_t pos_filter_data[SUBFRAME_SIZE+10];
+
+    /// (1.14) pitch gain of current and five previous subframes
+    int16_t past_gain_pitch[6];
+
+    /// (14.1) gain code from current and previous subframe
+    int16_t past_gain_code[2];
+
+    /// voice decision on previous subframe (0-noise, 1-intermediate, 2-voice), G.729D
+    int16_t voice_decision;
+
+    int16_t onset;              ///< detected onset level (0-2)
+    int16_t was_periodic;       ///< whether previous frame was declared as periodic or not (4.4)
+    int16_t ht_prev_data;       ///< previous data for 4.2.3, equation 86
+    int gain_coeff;             ///< (1.14) gain coefficient (4.2.4)
+    uint16_t rand_value;        ///< random number generator value (4.4.4)
+    int ma_predictor_prev;      ///< switched MA predictor of LSP quantizer from last good frame
+
+    /// (14.14) high-pass filter data (past input)
+    int hpf_f[2];
+
+    /// high-pass filter data (past output)
+    int16_t hpf_z[2];
+}  G729Context;
+
+static const G729FormatDescription format_g729_8k = {
+    .ac_index_bits     = {8,5},
+    .parity_bit        = 1,
+    .gc_1st_index_bits = GC_1ST_IDX_BITS_8K,
+    .gc_2nd_index_bits = GC_2ND_IDX_BITS_8K,
+    .fc_signs_bits     = 4,
+    .fc_indexes_bits   = 13,
+};
+
+static const G729FormatDescription format_g729d_6k4 = {
+    .ac_index_bits     = {8,4},
+    .parity_bit        = 0,
+    .gc_1st_index_bits = GC_1ST_IDX_BITS_6K4,
+    .gc_2nd_index_bits = GC_2ND_IDX_BITS_6K4,
+    .fc_signs_bits     = 2,
+    .fc_indexes_bits   = 9,
+};
+
+/**
+ * @brief pseudo random number generator
+ */
+static inline uint16_t g729_prng(uint16_t value)
+{
+    return 31821 * value + 13849;
+}
+
+/**
+ * Get parity bit of bit 2..7
+ */
+static inline int get_parity(uint8_t value)
+{
+   return (0x6996966996696996ULL >> (value >> 2)) & 1;
+}
+
+/**
+ * Decodes LSF (Line Spectral Frequencies) from L0-L3 (3.2.4).
+ * @param[out] lsfq (2.13) quantized LSF coefficients
+ * @param[in,out] past_quantizer_outputs (2.13) quantizer outputs from previous frames
+ * @param ma_predictor switched MA predictor of LSP quantizer
+ * @param vq_1st first stage vector of quantizer
+ * @param vq_2nd_low second stage lower vector of LSP quantizer
+ * @param vq_2nd_high second stage higher vector of LSP quantizer
+ */
+static void lsf_decode(int16_t* lsfq, int16_t* past_quantizer_outputs[MA_NP + 1],
+                       int16_t ma_predictor,
+                       int16_t vq_1st, int16_t vq_2nd_low, int16_t vq_2nd_high)
+{
+    int i,j;
+    static const uint8_t min_distance[2]={10, 5}; //(2.13)
+    int16_t* quantizer_output = past_quantizer_outputs[MA_NP];
+
+    for (i = 0; i < 5; i++) {
+        quantizer_output[i]     = cb_lsp_1st[vq_1st][i    ] + cb_lsp_2nd[vq_2nd_low ][i    ];
+        quantizer_output[i + 5] = cb_lsp_1st[vq_1st][i + 5] + cb_lsp_2nd[vq_2nd_high][i + 5];
+    }
+
+    for (j = 0; j < 2; j++) {
+        for (i = 1; i < 10; i++) {
+            int diff = (quantizer_output[i - 1] - quantizer_output[i] + min_distance[j]) >> 1;
+            if (diff > 0) {
+                quantizer_output[i - 1] -= diff;
+                quantizer_output[i    ] += diff;
+            }
+        }
+    }
+
+    for (i = 0; i < 10; i++) {
+        int sum = quantizer_output[i] * cb_ma_predictor_sum[ma_predictor][i];
+        for (j = 0; j < MA_NP; j++)
+            sum += past_quantizer_outputs[j][i] * cb_ma_predictor[ma_predictor][j][i];
+
+        lsfq[i] = sum >> 15;
+    }
+
+    ff_acelp_reorder_lsf(lsfq, LSFQ_DIFF_MIN, LSFQ_MIN, LSFQ_MAX, 10);
+}
+
+/**
+ * Restores past LSP quantizer output using LSF from previous frame
+ * @param[in,out] lsfq (2.13) quantized LSF coefficients
+ * @param[in,out] past_quantizer_outputs (2.13) quantizer outputs from previous frames
+ * @param ma_predictor_prev MA predictor from previous frame
+ * @param lsfq_prev (2.13) quantized LSF coefficients from previous frame
+ */
+static void lsf_restore_from_previous(int16_t* lsfq,
+                                      int16_t* past_quantizer_outputs[MA_NP + 1],
+                                      int ma_predictor_prev)
+{
+    int16_t* quantizer_output = past_quantizer_outputs[MA_NP];
+    int i,k;
+
+    for (i = 0; i < 10; i++) {
+        int tmp = lsfq[i] << 15;
+
+        for (k = 0; k < MA_NP; k++)
+            tmp -= past_quantizer_outputs[k][i] * cb_ma_predictor[ma_predictor_prev][k][i];
+
+        quantizer_output[i] = ((tmp >> 15) * cb_ma_predictor_sum_inv[ma_predictor_prev][i]) >> 12;
+    }
+}
+
+/**
+ * Constructs new excitation signal and applies phase filter to it
+ * @param[out] out constructed speech signal
+ * @param in original excitation signal
+ * @param fc_cur (2.13) original fixed-codebook vector
+ * @param gain_code (14.1) gain code
+ * @param subframe_size length of the subframe
+ */
+static void g729d_get_new_exc(
+        int16_t* out,
+        const int16_t* in,
+        const int16_t* fc_cur,
+        int dstate,
+        int gain_code,
+        int subframe_size)
+{
+    int i;
+    int16_t fc_new[SUBFRAME_SIZE];
+
+    ff_celp_convolve_circ(fc_new, fc_cur, phase_filter[dstate], subframe_size);
+
+    for(i=0; i<subframe_size; i++)
+    {
+        out[i]  = in[i];
+        out[i] -= (gain_code * fc_cur[i] + 0x2000) >> 14;
+        out[i] += (gain_code * fc_new[i] + 0x2000) >> 14;
+    }
+}
+
+/**
+ * Makes decision about onset in current subframe
+ * @param past_onset decision result of previous subframe
+ * @param past_gain_code gain code of current and previous subframe
+ *
+ * @return onset decision result for current subframe
+ */
+static int g729d_onset_decision(int past_onset, const int16_t* past_gain_code)
+{
+    if((past_gain_code[0] >> 1) > past_gain_code[1])
+        return 2;
+    else
+        return FFMAX(past_onset-1, 0);
+}
+
+/**
+ * Makes decision about voice presence in current subframe
+ * @param onset onset level
+ * @param prev_voice_decision voice decision result from previous subframe
+ * @param past_gain_pitch pitch gain of current and previous subframes
+ *
+ * @return voice decision result for current subframe
+ */
+static int16_t g729d_voice_decision(int onset, int prev_voice_decision, const int16_t* past_gain_pitch)
+{
+    int i, low_gain_pitch_cnt, voice_decision;
+
+    if(past_gain_pitch[0] >= 14745)      // 0.9
+        voice_decision = DECISION_VOICE;
+    else if (past_gain_pitch[0] <= 9830) // 0.6
+        voice_decision = DECISION_NOISE;
+    else
+        voice_decision = DECISION_INTERMEDIATE;
+
+    for(i=0, low_gain_pitch_cnt=0; i<6; i++)
+        if(past_gain_pitch[i] < 9830)
+            low_gain_pitch_cnt++;
+
+    if(low_gain_pitch_cnt > 2 && !onset)
+        voice_decision = DECISION_NOISE;
+
+    if(!onset && voice_decision > prev_voice_decision + 1)
+        voice_decision--;
+
+    if(onset && voice_decision < DECISION_VOICE)
+        voice_decision++;
+
+    return voice_decision;
+}
+
+static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
+{
+    int res = 0;
+
+    while (order--)
+        res += *v1++ * *v2++;
+
+    return res;
+}
+
+static av_cold int decoder_init(AVCodecContext * avctx)
+{
+    G729Context* ctx = avctx->priv_data;
+    int i,k;
+
+    if (avctx->channels != 1) {
+        av_log(avctx, AV_LOG_ERROR, "Only mono sound is supported (requested channels: %d).\n", avctx->channels);
+        return AVERROR(EINVAL);
+    }
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+
+    /* Both 8kbit/s and 6.4kbit/s modes uses two subframes per frame. */
+    avctx->frame_size = SUBFRAME_SIZE << 1;
+
+    ctx->gain_coeff = 16384; // 1.0 in (1.14)
+
+    for (k = 0; k < MA_NP + 1; k++) {
+        ctx->past_quantizer_outputs[k] = ctx->past_quantizer_output_buf[k];
+        for (i = 1; i < 11; i++)
+            ctx->past_quantizer_outputs[k][i - 1] = (18717 * i) >> 3;
+    }
+
+    ctx->lsp[0] = ctx->lsp_buf[0];
+    ctx->lsp[1] = ctx->lsp_buf[1];
+    memcpy(ctx->lsp[0], lsp_init, 10 * sizeof(int16_t));
+
+    ctx->exc = &ctx->exc_base[PITCH_DELAY_MAX+INTERPOL_LEN];
+
+    ctx->pitch_delay_int_prev = PITCH_DELAY_MIN;
+
+    /* random seed initialization */
+    ctx->rand_value = 21845;
+
+    /* quantized prediction error */
+    for(i=0; i<4; i++)
+        ctx->quant_energy[i] = -14336; // -14 in (5.10)
+
+    ff_audiodsp_init(&ctx->adsp);
+    ctx->adsp.scalarproduct_int16 = scalarproduct_int16_c;
+
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
+                        AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int buf_size       = avpkt->size;
+    int16_t *out_frame;
+    GetBitContext gb;
+    const G729FormatDescription *format;
+    int frame_erasure = 0;    ///< frame erasure detected during decoding
+    int bad_pitch = 0;        ///< parity check failed
+    int i;
+    int16_t *tmp;
+    G729Formats packet_type;
+    G729Context *ctx = avctx->priv_data;
+    int16_t lp[2][11];           // (3.12)
+    uint8_t ma_predictor;     ///< switched MA predictor of LSP quantizer
+    uint8_t quantizer_1st;    ///< first stage vector of quantizer
+    uint8_t quantizer_2nd_lo; ///< second stage lower vector of quantizer (size in bits)
+    uint8_t quantizer_2nd_hi; ///< second stage higher vector of quantizer (size in bits)
+
+    int pitch_delay_int[2];      // pitch delay, integer part
+    int pitch_delay_3x;          // pitch delay, multiplied by 3
+    int16_t fc[SUBFRAME_SIZE];   // fixed-codebook vector
+    int16_t synth[SUBFRAME_SIZE+10]; // fixed-codebook vector
+    int j, ret;
+    int gain_before, gain_after;
+    int is_periodic = 0;         // whether one of the subframes is declared as periodic or not
+    AVFrame *frame = data;
+
+    frame->nb_samples = SUBFRAME_SIZE<<1;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    out_frame = (int16_t*) frame->data[0];
+
+    if (buf_size % 10 == 0) {
+        packet_type = FORMAT_G729_8K;
+        format = &format_g729_8k;
+        //Reset voice decision
+        ctx->onset = 0;
+        ctx->voice_decision = DECISION_VOICE;
+        av_log(avctx, AV_LOG_DEBUG, "Packet type: %s\n", "G.729 @ 8kbit/s");
+    } else if (buf_size == 8) {
+        packet_type = FORMAT_G729D_6K4;
+        format = &format_g729d_6k4;
+        av_log(avctx, AV_LOG_DEBUG, "Packet type: %s\n", "G.729D @ 6.4kbit/s");
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Packet size %d is unknown.\n", buf_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    for (i=0; i < buf_size; i++)
+        frame_erasure |= buf[i];
+    frame_erasure = !frame_erasure;
+
+    init_get_bits(&gb, buf, 8*buf_size);
+
+    ma_predictor     = get_bits(&gb, 1);
+    quantizer_1st    = get_bits(&gb, VQ_1ST_BITS);
+    quantizer_2nd_lo = get_bits(&gb, VQ_2ND_BITS);
+    quantizer_2nd_hi = get_bits(&gb, VQ_2ND_BITS);
+
+    if(frame_erasure)
+        lsf_restore_from_previous(ctx->lsfq, ctx->past_quantizer_outputs,
+                                  ctx->ma_predictor_prev);
+    else {
+        lsf_decode(ctx->lsfq, ctx->past_quantizer_outputs,
+                   ma_predictor,
+                   quantizer_1st, quantizer_2nd_lo, quantizer_2nd_hi);
+        ctx->ma_predictor_prev = ma_predictor;
+    }
+
+    tmp = ctx->past_quantizer_outputs[MA_NP];
+    memmove(ctx->past_quantizer_outputs + 1, ctx->past_quantizer_outputs,
+            MA_NP * sizeof(int16_t*));
+    ctx->past_quantizer_outputs[0] = tmp;
+
+    ff_acelp_lsf2lsp(ctx->lsp[1], ctx->lsfq, 10);
+
+    ff_acelp_lp_decode(&lp[0][0], &lp[1][0], ctx->lsp[1], ctx->lsp[0], 10);
+
+    FFSWAP(int16_t*, ctx->lsp[1], ctx->lsp[0]);
+
+    for (i = 0; i < 2; i++) {
+        int gain_corr_factor;
+
+        uint8_t ac_index;      ///< adaptive codebook index
+        uint8_t pulses_signs;  ///< fixed-codebook vector pulse signs
+        int fc_indexes;        ///< fixed-codebook indexes
+        uint8_t gc_1st_index;  ///< gain codebook (first stage) index
+        uint8_t gc_2nd_index;  ///< gain codebook (second stage) index
+
+        ac_index      = get_bits(&gb, format->ac_index_bits[i]);
+        if(!i && format->parity_bit)
+            bad_pitch = get_parity(ac_index) == get_bits1(&gb);
+        fc_indexes    = get_bits(&gb, format->fc_indexes_bits);
+        pulses_signs  = get_bits(&gb, format->fc_signs_bits);
+        gc_1st_index  = get_bits(&gb, format->gc_1st_index_bits);
+        gc_2nd_index  = get_bits(&gb, format->gc_2nd_index_bits);
+
+        if (frame_erasure)
+            pitch_delay_3x   = 3 * ctx->pitch_delay_int_prev;
+        else if(!i) {
+            if (bad_pitch)
+                pitch_delay_3x   = 3 * ctx->pitch_delay_int_prev;
+            else
+                pitch_delay_3x = ff_acelp_decode_8bit_to_1st_delay3(ac_index);
+        } else {
+            int pitch_delay_min = av_clip(ctx->pitch_delay_int_prev - 5,
+                                          PITCH_DELAY_MIN, PITCH_DELAY_MAX - 9);
+
+            if(packet_type == FORMAT_G729D_6K4)
+                pitch_delay_3x = ff_acelp_decode_4bit_to_2nd_delay3(ac_index, pitch_delay_min);
+            else
+                pitch_delay_3x = ff_acelp_decode_5_6_bit_to_2nd_delay3(ac_index, pitch_delay_min);
+        }
+
+        /* Round pitch delay to nearest (used everywhere except ff_acelp_interpolate). */
+        pitch_delay_int[i]  = (pitch_delay_3x + 1) / 3;
+        if (pitch_delay_int[i] > PITCH_DELAY_MAX) {
+            av_log(avctx, AV_LOG_WARNING, "pitch_delay_int %d is too large\n", pitch_delay_int[i]);
+            pitch_delay_int[i] = PITCH_DELAY_MAX;
+        }
+
+        if (frame_erasure) {
+            ctx->rand_value = g729_prng(ctx->rand_value);
+            fc_indexes   = av_mod_uintp2(ctx->rand_value, format->fc_indexes_bits);
+
+            ctx->rand_value = g729_prng(ctx->rand_value);
+            pulses_signs = ctx->rand_value;
+        }
+
+
+        memset(fc, 0, sizeof(int16_t) * SUBFRAME_SIZE);
+        switch (packet_type) {
+            case FORMAT_G729_8K:
+                ff_acelp_fc_pulse_per_track(fc, ff_fc_4pulses_8bits_tracks_13,
+                                            ff_fc_4pulses_8bits_track_4,
+                                            fc_indexes, pulses_signs, 3, 3);
+                break;
+            case FORMAT_G729D_6K4:
+                ff_acelp_fc_pulse_per_track(fc, ff_fc_2pulses_9bits_track1_gray,
+                                            ff_fc_2pulses_9bits_track2_gray,
+                                            fc_indexes, pulses_signs, 1, 4);
+                break;
+        }
+
+        /*
+          This filter enhances harmonic components of the fixed-codebook vector to
+          improve the quality of the reconstructed speech.
+
+                     / fc_v[i],                                    i < pitch_delay
+          fc_v[i] = <
+                     \ fc_v[i] + gain_pitch * fc_v[i-pitch_delay], i >= pitch_delay
+        */
+        ff_acelp_weighted_vector_sum(fc + pitch_delay_int[i],
+                                     fc + pitch_delay_int[i],
+                                     fc, 1 << 14,
+                                     av_clip(ctx->past_gain_pitch[0], SHARP_MIN, SHARP_MAX),
+                                     0, 14,
+                                     SUBFRAME_SIZE - pitch_delay_int[i]);
+
+        memmove(ctx->past_gain_pitch+1, ctx->past_gain_pitch, 5 * sizeof(int16_t));
+        ctx->past_gain_code[1] = ctx->past_gain_code[0];
+
+        if (frame_erasure) {
+            ctx->past_gain_pitch[0] = (29491 * ctx->past_gain_pitch[0]) >> 15; // 0.90 (0.15)
+            ctx->past_gain_code[0]  = ( 2007 * ctx->past_gain_code[0] ) >> 11; // 0.98 (0.11)
+
+            gain_corr_factor = 0;
+        } else {
+            if (packet_type == FORMAT_G729D_6K4) {
+                ctx->past_gain_pitch[0]  = cb_gain_1st_6k4[gc_1st_index][0] +
+                                           cb_gain_2nd_6k4[gc_2nd_index][0];
+                gain_corr_factor = cb_gain_1st_6k4[gc_1st_index][1] +
+                                   cb_gain_2nd_6k4[gc_2nd_index][1];
+
+                /* Without check below overflow can occur in ff_acelp_update_past_gain.
+                   It is not issue for G.729, because gain_corr_factor in it's case is always
+                   greater than 1024, while in G.729D it can be even zero. */
+                gain_corr_factor = FFMAX(gain_corr_factor, 1024);
+#ifndef G729_BITEXACT
+                gain_corr_factor >>= 1;
+#endif
+            } else {
+                ctx->past_gain_pitch[0]  = cb_gain_1st_8k[gc_1st_index][0] +
+                                           cb_gain_2nd_8k[gc_2nd_index][0];
+                gain_corr_factor = cb_gain_1st_8k[gc_1st_index][1] +
+                                   cb_gain_2nd_8k[gc_2nd_index][1];
+            }
+
+            /* Decode the fixed-codebook gain. */
+            ctx->past_gain_code[0] = ff_acelp_decode_gain_code(&ctx->adsp, gain_corr_factor,
+                                                               fc, MR_ENERGY,
+                                                               ctx->quant_energy,
+                                                               ma_prediction_coeff,
+                                                               SUBFRAME_SIZE, 4);
+#ifdef G729_BITEXACT
+            /*
+              This correction required to get bit-exact result with
+              reference code, because gain_corr_factor in G.729D is
+              two times larger than in original G.729.
+
+              If bit-exact result is not issue then gain_corr_factor
+              can be simpler divided by 2 before call to g729_get_gain_code
+              instead of using correction below.
+            */
+            if (packet_type == FORMAT_G729D_6K4) {
+                gain_corr_factor >>= 1;
+                ctx->past_gain_code[0] >>= 1;
+            }
+#endif
+        }
+        ff_acelp_update_past_gain(ctx->quant_energy, gain_corr_factor, 2, frame_erasure);
+
+        /* Routine requires rounding to lowest. */
+        ff_acelp_interpolate(ctx->exc + i * SUBFRAME_SIZE,
+                             ctx->exc + i * SUBFRAME_SIZE - pitch_delay_3x / 3,
+                             ff_acelp_interp_filter, 6,
+                             (pitch_delay_3x % 3) << 1,
+                             10, SUBFRAME_SIZE);
+
+        ff_acelp_weighted_vector_sum(ctx->exc + i * SUBFRAME_SIZE,
+                                     ctx->exc + i * SUBFRAME_SIZE, fc,
+                                     (!ctx->was_periodic && frame_erasure) ? 0 : ctx->past_gain_pitch[0],
+                                     ( ctx->was_periodic && frame_erasure) ? 0 : ctx->past_gain_code[0],
+                                     1 << 13, 14, SUBFRAME_SIZE);
+
+        memcpy(synth, ctx->syn_filter_data, 10 * sizeof(int16_t));
+
+        if (ff_celp_lp_synthesis_filter(
+            synth+10,
+            &lp[i][1],
+            ctx->exc  + i * SUBFRAME_SIZE,
+            SUBFRAME_SIZE,
+            10,
+            1,
+            0,
+            0x800))
+            /* Overflow occurred, downscale excitation signal... */
+            for (j = 0; j < 2 * SUBFRAME_SIZE + PITCH_DELAY_MAX + INTERPOL_LEN; j++)
+                ctx->exc_base[j] >>= 2;
+
+        /* ... and make synthesis again. */
+        if (packet_type == FORMAT_G729D_6K4) {
+            int16_t exc_new[SUBFRAME_SIZE];
+
+            ctx->onset = g729d_onset_decision(ctx->onset, ctx->past_gain_code);
+            ctx->voice_decision = g729d_voice_decision(ctx->onset, ctx->voice_decision, ctx->past_gain_pitch);
+
+            g729d_get_new_exc(exc_new, ctx->exc  + i * SUBFRAME_SIZE, fc, ctx->voice_decision, ctx->past_gain_code[0], SUBFRAME_SIZE);
+
+            ff_celp_lp_synthesis_filter(
+                    synth+10,
+                    &lp[i][1],
+                    exc_new,
+                    SUBFRAME_SIZE,
+                    10,
+                    0,
+                    0,
+                    0x800);
+        } else {
+            ff_celp_lp_synthesis_filter(
+                    synth+10,
+                    &lp[i][1],
+                    ctx->exc  + i * SUBFRAME_SIZE,
+                    SUBFRAME_SIZE,
+                    10,
+                    0,
+                    0,
+                    0x800);
+        }
+        /* Save data (without postfilter) for use in next subframe. */
+        memcpy(ctx->syn_filter_data, synth+SUBFRAME_SIZE, 10 * sizeof(int16_t));
+
+        /* Calculate gain of unfiltered signal for use in AGC. */
+        gain_before = 0;
+        for (j = 0; j < SUBFRAME_SIZE; j++)
+            gain_before += FFABS(synth[j+10]);
+
+        /* Call postfilter and also update voicing decision for use in next frame. */
+        ff_g729_postfilter(
+                &ctx->adsp,
+                &ctx->ht_prev_data,
+                &is_periodic,
+                &lp[i][0],
+                pitch_delay_int[0],
+                ctx->residual,
+                ctx->res_filter_data,
+                ctx->pos_filter_data,
+                synth+10,
+                SUBFRAME_SIZE);
+
+        /* Calculate gain of filtered signal for use in AGC. */
+        gain_after = 0;
+        for(j=0; j<SUBFRAME_SIZE; j++)
+            gain_after += FFABS(synth[j+10]);
+
+        ctx->gain_coeff = ff_g729_adaptive_gain_control(
+                gain_before,
+                gain_after,
+                synth+10,
+                SUBFRAME_SIZE,
+                ctx->gain_coeff);
+
+        if (frame_erasure)
+            ctx->pitch_delay_int_prev = FFMIN(ctx->pitch_delay_int_prev + 1, PITCH_DELAY_MAX);
+        else
+            ctx->pitch_delay_int_prev = pitch_delay_int[i];
+
+        memcpy(synth+8, ctx->hpf_z, 2*sizeof(int16_t));
+        ff_acelp_high_pass_filter(
+                out_frame + i*SUBFRAME_SIZE,
+                ctx->hpf_f,
+                synth+10,
+                SUBFRAME_SIZE);
+        memcpy(ctx->hpf_z, synth+8+SUBFRAME_SIZE, 2*sizeof(int16_t));
+    }
+
+    ctx->was_periodic = is_periodic;
+
+    /* Save signal for use in next frame. */
+    memmove(ctx->exc_base, ctx->exc_base + 2 * SUBFRAME_SIZE, (PITCH_DELAY_MAX+INTERPOL_LEN)*sizeof(int16_t));
+
+    *got_frame_ptr = 1;
+    return packet_type == FORMAT_G729_8K ? 10 : 8;
+}
+
+AVCodec ff_g729_decoder = {
+    .name           = "g729",
+    .long_name      = NULL_IF_CONFIG_SMALL("G.729"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_G729,
+    .priv_data_size = sizeof(G729Context),
+    .init           = decoder_init,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/g729postfilter.c b/libavcodec/g729postfilter.c
new file mode 100644
index 0000000000..9a775c47b2
--- /dev/null
+++ b/libavcodec/g729postfilter.c
@@ -0,0 +1,610 @@
+/*
+ * G.729, G729 Annex D postfilter
+ * Copyright (c) 2008 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <inttypes.h>
+#include <limits.h>
+
+#include "avcodec.h"
+#include "g729.h"
+#include "acelp_pitch_delay.h"
+#include "g729postfilter.h"
+#include "celp_math.h"
+#include "acelp_filters.h"
+#include "acelp_vectors.h"
+#include "celp_filters.h"
+
+#define FRAC_BITS 15
+#include "mathops.h"
+
+/**
+ * short interpolation filter (of length 33, according to spec)
+ * for computing signal with non-integer delay
+ */
+static const int16_t ff_g729_interp_filt_short[(ANALYZED_FRAC_DELAYS+1)*SHORT_INT_FILT_LEN] = {
+      0, 31650, 28469, 23705, 18050, 12266,  7041,  2873,
+      0, -1597, -2147, -1992, -1492,  -933,  -484,  -188,
+};
+
+/**
+ * long interpolation filter (of length 129, according to spec)
+ * for computing signal with non-integer delay
+ */
+static const int16_t ff_g729_interp_filt_long[(ANALYZED_FRAC_DELAYS+1)*LONG_INT_FILT_LEN] = {
+   0, 31915, 29436, 25569, 20676, 15206,  9639,  4439,
+   0, -3390, -5579, -6549, -6414, -5392, -3773, -1874,
+   0,  1595,  2727,  3303,  3319,  2850,  2030,  1023,
+   0,  -887, -1527, -1860, -1876, -1614, -1150,  -579,
+   0,   501,   859,  1041,  1044,   892,   631,   315,
+   0,  -266,  -453,  -543,  -538,  -455,  -317,  -156,
+   0,   130,   218,   258,   253,   212,   147,    72,
+   0,   -59,  -101,  -122,  -123,  -106,   -77,   -40,
+};
+
+/**
+ * formant_pp_factor_num_pow[i] = FORMANT_PP_FACTOR_NUM^(i+1)
+ */
+static const int16_t formant_pp_factor_num_pow[10]= {
+  /* (0.15) */
+  18022, 9912, 5451, 2998, 1649, 907, 499, 274, 151, 83
+};
+
+/**
+ * formant_pp_factor_den_pow[i] = FORMANT_PP_FACTOR_DEN^(i+1)
+ */
+static const int16_t formant_pp_factor_den_pow[10] = {
+  /* (0.15) */
+  22938, 16057, 11240, 7868, 5508, 3856, 2699, 1889, 1322, 925
+};
+
+/**
+ * \brief Residual signal calculation (4.2.1 if G.729)
+ * \param out [out] output data filtered through A(z/FORMANT_PP_FACTOR_NUM)
+ * \param filter_coeffs (3.12) A(z/FORMANT_PP_FACTOR_NUM) filter coefficients
+ * \param in input speech data to process
+ * \param subframe_size size of one subframe
+ *
+ * \note in buffer must contain 10 items of previous speech data before top of the buffer
+ * \remark It is safe to pass the same buffer for input and output.
+ */
+static void residual_filter(int16_t* out, const int16_t* filter_coeffs, const int16_t* in,
+                            int subframe_size)
+{
+    int i, n;
+
+    for (n = subframe_size - 1; n >= 0; n--) {
+        int sum = 0x800;
+        for (i = 0; i < 10; i++)
+            sum += filter_coeffs[i] * in[n - i - 1];
+
+        out[n] = in[n] + (sum >> 12);
+    }
+}
+
+/**
+ * \brief long-term postfilter (4.2.1)
+ * \param dsp initialized DSP context
+ * \param pitch_delay_int integer part of the pitch delay in the first subframe
+ * \param residual filtering input data
+ * \param residual_filt [out] speech signal with applied A(z/FORMANT_PP_FACTOR_NUM) filter
+ * \param subframe_size size of subframe
+ *
+ * \return 0 if long-term prediction gain is less than 3dB, 1 -  otherwise
+ */
+static int16_t long_term_filter(AudioDSPContext *adsp, int pitch_delay_int,
+                                const int16_t* residual, int16_t *residual_filt,
+                                int subframe_size)
+{
+    int i, k, tmp, tmp2;
+    int sum;
+    int L_temp0;
+    int L_temp1;
+    int64_t L64_temp0;
+    int64_t L64_temp1;
+    int16_t shift;
+    int corr_int_num, corr_int_den;
+
+    int ener;
+    int16_t sh_ener;
+
+    int16_t gain_num,gain_den; //selected signal's gain numerator and denominator
+    int16_t sh_gain_num, sh_gain_den;
+    int gain_num_square;
+
+    int16_t gain_long_num,gain_long_den; //filtered through long interpolation filter signal's gain numerator and denominator
+    int16_t sh_gain_long_num, sh_gain_long_den;
+
+    int16_t best_delay_int, best_delay_frac;
+
+    int16_t delayed_signal_offset;
+    int lt_filt_factor_a, lt_filt_factor_b;
+
+    int16_t * selected_signal;
+    const int16_t * selected_signal_const; //Necessary to avoid compiler warning
+
+    int16_t sig_scaled[SUBFRAME_SIZE + RES_PREV_DATA_SIZE];
+    int16_t delayed_signal[ANALYZED_FRAC_DELAYS][SUBFRAME_SIZE+1];
+    int corr_den[ANALYZED_FRAC_DELAYS][2];
+
+    tmp = 0;
+    for(i=0; i<subframe_size + RES_PREV_DATA_SIZE; i++)
+        tmp |= FFABS(residual[i]);
+
+    if(!tmp)
+        shift = 3;
+    else
+        shift = av_log2(tmp) - 11;
+
+    if (shift > 0)
+        for (i = 0; i < subframe_size + RES_PREV_DATA_SIZE; i++)
+            sig_scaled[i] = residual[i] >> shift;
+    else
+        for (i = 0; i < subframe_size + RES_PREV_DATA_SIZE; i++)
+            sig_scaled[i] = residual[i] << -shift;
+
+    /* Start of best delay searching code */
+    gain_num = 0;
+
+    ener = adsp->scalarproduct_int16(sig_scaled + RES_PREV_DATA_SIZE,
+                                    sig_scaled + RES_PREV_DATA_SIZE,
+                                    subframe_size);
+    if (ener) {
+        sh_ener = FFMAX(av_log2(ener) - 14, 0);
+        ener >>= sh_ener;
+        /* Search for best pitch delay.
+
+                       sum{ r(n) * r(k,n) ] }^2
+           R'(k)^2 := -------------------------
+                       sum{ r(k,n) * r(k,n) }
+
+
+           R(T)    :=  sum{ r(n) * r(n-T) ] }
+
+
+           where
+           r(n-T) is integer delayed signal with delay T
+           r(k,n) is non-integer delayed signal with integer delay best_delay
+           and fractional delay k */
+
+        /* Find integer delay best_delay which maximizes correlation R(T).
+
+           This is also equals to numerator of R'(0),
+           since the fine search (second step) is done with 1/8
+           precision around best_delay. */
+        corr_int_num = 0;
+        best_delay_int = pitch_delay_int - 1;
+        for (i = pitch_delay_int - 1; i <= pitch_delay_int + 1; i++) {
+            sum = adsp->scalarproduct_int16(sig_scaled + RES_PREV_DATA_SIZE,
+                                           sig_scaled + RES_PREV_DATA_SIZE - i,
+                                           subframe_size);
+            if (sum > corr_int_num) {
+                corr_int_num = sum;
+                best_delay_int = i;
+            }
+        }
+        if (corr_int_num) {
+            /* Compute denominator of pseudo-normalized correlation R'(0). */
+            corr_int_den = adsp->scalarproduct_int16(sig_scaled - best_delay_int + RES_PREV_DATA_SIZE,
+                                                    sig_scaled - best_delay_int + RES_PREV_DATA_SIZE,
+                                                    subframe_size);
+
+            /* Compute signals with non-integer delay k (with 1/8 precision),
+               where k is in [0;6] range.
+               Entire delay is qual to best_delay+(k+1)/8
+               This is archieved by applying an interpolation filter of
+               legth 33 to source signal. */
+            for (k = 0; k < ANALYZED_FRAC_DELAYS; k++) {
+                ff_acelp_interpolate(&delayed_signal[k][0],
+                                     &sig_scaled[RES_PREV_DATA_SIZE - best_delay_int],
+                                     ff_g729_interp_filt_short,
+                                     ANALYZED_FRAC_DELAYS+1,
+                                     8 - k - 1,
+                                     SHORT_INT_FILT_LEN,
+                                     subframe_size + 1);
+            }
+
+            /* Compute denominator of pseudo-normalized correlation R'(k).
+
+                 corr_den[k][0] is square root of R'(k) denominator, for int(T) == int(T0)
+                 corr_den[k][1] is square root of R'(k) denominator, for int(T) == int(T0)+1
+
+              Also compute maximum value of above denominators over all k. */
+            tmp = corr_int_den;
+            for (k = 0; k < ANALYZED_FRAC_DELAYS; k++) {
+                sum = adsp->scalarproduct_int16(&delayed_signal[k][1],
+                                               &delayed_signal[k][1],
+                                               subframe_size - 1);
+                corr_den[k][0] = sum + delayed_signal[k][0            ] * delayed_signal[k][0            ];
+                corr_den[k][1] = sum + delayed_signal[k][subframe_size] * delayed_signal[k][subframe_size];
+
+                tmp = FFMAX3(tmp, corr_den[k][0], corr_den[k][1]);
+            }
+
+            sh_gain_den = av_log2(tmp) - 14;
+            if (sh_gain_den >= 0) {
+
+                sh_gain_num =  FFMAX(sh_gain_den, sh_ener);
+                /* Loop through all k and find delay that maximizes
+                   R'(k) correlation.
+                   Search is done in [int(T0)-1; intT(0)+1] range
+                   with 1/8 precision. */
+                delayed_signal_offset = 1;
+                best_delay_frac = 0;
+                gain_den = corr_int_den >> sh_gain_den;
+                gain_num = corr_int_num >> sh_gain_num;
+                gain_num_square = gain_num * gain_num;
+                for (k = 0; k < ANALYZED_FRAC_DELAYS; k++) {
+                    for (i = 0; i < 2; i++) {
+                        int16_t gain_num_short, gain_den_short;
+                        int gain_num_short_square;
+                        /* Compute numerator of pseudo-normalized
+                           correlation R'(k). */
+                        sum = adsp->scalarproduct_int16(&delayed_signal[k][i],
+                                                       sig_scaled + RES_PREV_DATA_SIZE,
+                                                       subframe_size);
+                        gain_num_short = FFMAX(sum >> sh_gain_num, 0);
+
+                        /*
+                                      gain_num_short_square                gain_num_square
+                           R'(T)^2 = -----------------------, max R'(T)^2= --------------
+                                           den                                 gain_den
+                        */
+                        gain_num_short_square = gain_num_short * gain_num_short;
+                        gain_den_short = corr_den[k][i] >> sh_gain_den;
+
+                        tmp = MULL(gain_num_short_square, gain_den, FRAC_BITS);
+                        tmp2 = MULL(gain_num_square, gain_den_short, FRAC_BITS);
+
+                        // R'(T)^2 > max R'(T)^2
+                        if (tmp > tmp2) {
+                            gain_num = gain_num_short;
+                            gain_den = gain_den_short;
+                            gain_num_square = gain_num_short_square;
+                            delayed_signal_offset = i;
+                            best_delay_frac = k + 1;
+                        }
+                    }
+                }
+
+                /*
+                       R'(T)^2
+                  2 * --------- < 1
+                        R(0)
+                */
+                L64_temp0 =  (int64_t)gain_num_square  << ((sh_gain_num << 1) + 1);
+                L64_temp1 = ((int64_t)gain_den * ener) << (sh_gain_den + sh_ener);
+                if (L64_temp0 < L64_temp1)
+                    gain_num = 0;
+            } // if(sh_gain_den >= 0)
+        } // if(corr_int_num)
+    } // if(ener)
+    /* End of best delay searching code  */
+
+    if (!gain_num) {
+        memcpy(residual_filt, residual + RES_PREV_DATA_SIZE, subframe_size * sizeof(int16_t));
+
+        /* Long-term prediction gain is less than 3dB. Long-term postfilter is disabled. */
+        return 0;
+    }
+    if (best_delay_frac) {
+        /* Recompute delayed signal with an interpolation filter of length 129. */
+        ff_acelp_interpolate(residual_filt,
+                             &sig_scaled[RES_PREV_DATA_SIZE - best_delay_int + delayed_signal_offset],
+                             ff_g729_interp_filt_long,
+                             ANALYZED_FRAC_DELAYS + 1,
+                             8 - best_delay_frac,
+                             LONG_INT_FILT_LEN,
+                             subframe_size + 1);
+        /* Compute R'(k) correlation's numerator. */
+        sum = adsp->scalarproduct_int16(residual_filt,
+                                       sig_scaled + RES_PREV_DATA_SIZE,
+                                       subframe_size);
+
+        if (sum < 0) {
+            gain_long_num = 0;
+            sh_gain_long_num = 0;
+        } else {
+            tmp = FFMAX(av_log2(sum) - 14, 0);
+            sum >>= tmp;
+            gain_long_num = sum;
+            sh_gain_long_num = tmp;
+        }
+
+        /* Compute R'(k) correlation's denominator. */
+        sum = adsp->scalarproduct_int16(residual_filt, residual_filt, subframe_size);
+
+        tmp = FFMAX(av_log2(sum) - 14, 0);
+        sum >>= tmp;
+        gain_long_den = sum;
+        sh_gain_long_den = tmp;
+
+        /* Select between original and delayed signal.
+           Delayed signal will be selected if it increases R'(k)
+           correlation. */
+        L_temp0 = gain_num * gain_num;
+        L_temp0 = MULL(L_temp0, gain_long_den, FRAC_BITS);
+
+        L_temp1 = gain_long_num * gain_long_num;
+        L_temp1 = MULL(L_temp1, gain_den, FRAC_BITS);
+
+        tmp = ((sh_gain_long_num - sh_gain_num) << 1) - (sh_gain_long_den - sh_gain_den);
+        if (tmp > 0)
+            L_temp0 >>= tmp;
+        else
+            L_temp1 >>= -tmp;
+
+        /* Check if longer filter increases the values of R'(k). */
+        if (L_temp1 > L_temp0) {
+            /* Select long filter. */
+            selected_signal = residual_filt;
+            gain_num = gain_long_num;
+            gain_den = gain_long_den;
+            sh_gain_num = sh_gain_long_num;
+            sh_gain_den = sh_gain_long_den;
+        } else
+            /* Select short filter. */
+            selected_signal = &delayed_signal[best_delay_frac-1][delayed_signal_offset];
+
+        /* Rescale selected signal to original value. */
+        if (shift > 0)
+            for (i = 0; i < subframe_size; i++)
+                selected_signal[i] <<= shift;
+        else
+            for (i = 0; i < subframe_size; i++)
+                selected_signal[i] >>= -shift;
+
+        /* necessary to avoid compiler warning */
+        selected_signal_const = selected_signal;
+    } // if(best_delay_frac)
+    else
+        selected_signal_const = residual + RES_PREV_DATA_SIZE - (best_delay_int + 1 - delayed_signal_offset);
+#ifdef G729_BITEXACT
+    tmp = sh_gain_num - sh_gain_den;
+    if (tmp > 0)
+        gain_den >>= tmp;
+    else
+        gain_num >>= -tmp;
+
+    if (gain_num > gain_den)
+        lt_filt_factor_a = MIN_LT_FILT_FACTOR_A;
+    else {
+        gain_num >>= 2;
+        gain_den >>= 1;
+        lt_filt_factor_a = (gain_den << 15) / (gain_den + gain_num);
+    }
+#else
+    L64_temp0 = (((int64_t)gain_num) << sh_gain_num) >> 1;
+    L64_temp1 = ((int64_t)gain_den) << sh_gain_den;
+    lt_filt_factor_a = FFMAX((L64_temp1 << 15) / (L64_temp1 + L64_temp0), MIN_LT_FILT_FACTOR_A);
+#endif
+
+    /* Filter through selected filter. */
+    lt_filt_factor_b = 32767 - lt_filt_factor_a + 1;
+
+    ff_acelp_weighted_vector_sum(residual_filt, residual + RES_PREV_DATA_SIZE,
+                                 selected_signal_const,
+                                 lt_filt_factor_a, lt_filt_factor_b,
+                                 1<<14, 15, subframe_size);
+
+    // Long-term prediction gain is larger than 3dB.
+    return 1;
+}
+
+/**
+ * \brief Calculate reflection coefficient for tilt compensation filter (4.2.3).
+ * \param dsp initialized DSP context
+ * \param lp_gn (3.12) coefficients of A(z/FORMANT_PP_FACTOR_NUM) filter
+ * \param lp_gd (3.12) coefficients of A(z/FORMANT_PP_FACTOR_DEN) filter
+ * \param speech speech to update
+ * \param subframe_size size of subframe
+ *
+ * \return (3.12) reflection coefficient
+ *
+ * \remark The routine also calculates the gain term for the short-term
+ *         filter (gf) and multiplies the speech data by 1/gf.
+ *
+ * \note All members of lp_gn, except 10-19 must be equal to zero.
+ */
+static int16_t get_tilt_comp(AudioDSPContext *adsp, int16_t *lp_gn,
+                             const int16_t *lp_gd, int16_t* speech,
+                             int subframe_size)
+{
+    int rh1,rh0; // (3.12)
+    int temp;
+    int i;
+    int gain_term;
+
+    lp_gn[10] = 4096; //1.0 in (3.12)
+
+    /* Apply 1/A(z/FORMANT_PP_FACTOR_DEN) filter to hf. */
+    ff_celp_lp_synthesis_filter(lp_gn + 11, lp_gd + 1, lp_gn + 11, 22, 10, 0, 0, 0x800);
+    /* Now lp_gn (starting with 10) contains impulse response
+       of A(z/FORMANT_PP_FACTOR_NUM)/A(z/FORMANT_PP_FACTOR_DEN) filter. */
+
+    rh0 = adsp->scalarproduct_int16(lp_gn + 10, lp_gn + 10, 20);
+    rh1 = adsp->scalarproduct_int16(lp_gn + 10, lp_gn + 11, 20);
+
+    /* downscale to avoid overflow */
+    temp = av_log2(rh0) - 14;
+    if (temp > 0) {
+        rh0 >>= temp;
+        rh1 >>= temp;
+    }
+
+    if (FFABS(rh1) > rh0 || !rh0)
+        return 0;
+
+    gain_term = 0;
+    for (i = 0; i < 20; i++)
+        gain_term += FFABS(lp_gn[i + 10]);
+    gain_term >>= 2; // (3.12) -> (5.10)
+
+    if (gain_term > 0x400) { // 1.0 in (5.10)
+        temp = 0x2000000 / gain_term; // 1.0/gain_term in (0.15)
+        for (i = 0; i < subframe_size; i++)
+            speech[i] = (speech[i] * temp + 0x4000) >> 15;
+    }
+
+    return -(rh1 << 15) / rh0;
+}
+
+/**
+ * \brief Apply tilt compensation filter (4.2.3).
+ * \param res_pst [in/out] residual signal (partially filtered)
+ * \param k1 (3.12) reflection coefficient
+ * \param subframe_size size of subframe
+ * \param ht_prev_data previous data for 4.2.3, equation 86
+ *
+ * \return new value for ht_prev_data
+*/
+static int16_t apply_tilt_comp(int16_t* out, int16_t* res_pst, int refl_coeff,
+                               int subframe_size, int16_t ht_prev_data)
+{
+    int tmp, tmp2;
+    int i;
+    int gt, ga;
+    int fact, sh_fact;
+
+    if (refl_coeff > 0) {
+        gt = (refl_coeff * G729_TILT_FACTOR_PLUS + 0x4000) >> 15;
+        fact = 0x4000; // 0.5 in (0.15)
+        sh_fact = 15;
+    } else {
+        gt = (refl_coeff * G729_TILT_FACTOR_MINUS + 0x4000) >> 15;
+        fact = 0x800; // 0.5 in (3.12)
+        sh_fact = 12;
+    }
+    ga = (fact << 15) / av_clip_int16(32768 - FFABS(gt));
+    gt >>= 1;
+
+    /* Apply tilt compensation filter to signal. */
+    tmp = res_pst[subframe_size - 1];
+
+    for (i = subframe_size - 1; i >= 1; i--) {
+        tmp2 = (res_pst[i] << 15) + ((gt * res_pst[i-1]) << 1);
+        tmp2 = (tmp2 + 0x4000) >> 15;
+
+        tmp2 = (tmp2 * ga * 2 + fact) >> sh_fact;
+        out[i] = tmp2;
+    }
+    tmp2 = (res_pst[0] << 15) + ((gt * ht_prev_data) << 1);
+    tmp2 = (tmp2 + 0x4000) >> 15;
+    tmp2 = (tmp2 * ga * 2 + fact) >> sh_fact;
+    out[0] = tmp2;
+
+    return tmp;
+}
+
+void ff_g729_postfilter(AudioDSPContext *adsp, int16_t* ht_prev_data, int* voicing,
+                     const int16_t *lp_filter_coeffs, int pitch_delay_int,
+                     int16_t* residual, int16_t* res_filter_data,
+                     int16_t* pos_filter_data, int16_t *speech, int subframe_size)
+{
+    int16_t residual_filt_buf[SUBFRAME_SIZE+11];
+    int16_t lp_gn[33]; // (3.12)
+    int16_t lp_gd[11]; // (3.12)
+    int tilt_comp_coeff;
+    int i;
+
+    /* Zero-filling is necessary for tilt-compensation filter. */
+    memset(lp_gn, 0, 33 * sizeof(int16_t));
+
+    /* Calculate A(z/FORMANT_PP_FACTOR_NUM) filter coefficients. */
+    for (i = 0; i < 10; i++)
+        lp_gn[i + 11] = (lp_filter_coeffs[i + 1] * formant_pp_factor_num_pow[i] + 0x4000) >> 15;
+
+    /* Calculate A(z/FORMANT_PP_FACTOR_DEN) filter coefficients. */
+    for (i = 0; i < 10; i++)
+        lp_gd[i + 1] = (lp_filter_coeffs[i + 1] * formant_pp_factor_den_pow[i] + 0x4000) >> 15;
+
+    /* residual signal calculation (one-half of short-term postfilter) */
+    memcpy(speech - 10, res_filter_data, 10 * sizeof(int16_t));
+    residual_filter(residual + RES_PREV_DATA_SIZE, lp_gn + 11, speech, subframe_size);
+    /* Save data to use it in the next subframe. */
+    memcpy(res_filter_data, speech + subframe_size - 10, 10 * sizeof(int16_t));
+
+    /* long-term filter. If long-term prediction gain is larger than 3dB (returned value is
+       nonzero) then declare current subframe as periodic. */
+    *voicing = FFMAX(*voicing, long_term_filter(adsp, pitch_delay_int,
+                                                residual, residual_filt_buf + 10,
+                                                subframe_size));
+
+    /* shift residual for using in next subframe */
+    memmove(residual, residual + subframe_size, RES_PREV_DATA_SIZE * sizeof(int16_t));
+
+    /* short-term filter tilt compensation */
+    tilt_comp_coeff = get_tilt_comp(adsp, lp_gn, lp_gd, residual_filt_buf + 10, subframe_size);
+
+    /* Apply second half of short-term postfilter: 1/A(z/FORMANT_PP_FACTOR_DEN) */
+    ff_celp_lp_synthesis_filter(pos_filter_data + 10, lp_gd + 1,
+                                residual_filt_buf + 10,
+                                subframe_size, 10, 0, 0, 0x800);
+    memcpy(pos_filter_data, pos_filter_data + subframe_size, 10 * sizeof(int16_t));
+
+    *ht_prev_data = apply_tilt_comp(speech, pos_filter_data + 10, tilt_comp_coeff,
+                                    subframe_size, *ht_prev_data);
+}
+
+/**
+ * \brief Adaptive gain control (4.2.4)
+ * \param gain_before gain of speech before applying postfilters
+ * \param gain_after  gain of speech after applying postfilters
+ * \param speech [in/out] signal buffer
+ * \param subframe_size length of subframe
+ * \param gain_prev (3.12) previous value of gain coefficient
+ *
+ * \return (3.12) last value of gain coefficient
+ */
+int16_t ff_g729_adaptive_gain_control(int gain_before, int gain_after, int16_t *speech,
+                                   int subframe_size, int16_t gain_prev)
+{
+    int gain; // (3.12)
+    int n;
+    int exp_before, exp_after;
+
+    if(!gain_after && gain_before)
+        return 0;
+
+    if (gain_before) {
+
+        exp_before  = 14 - av_log2(gain_before);
+        gain_before = bidir_sal(gain_before, exp_before);
+
+        exp_after  = 14 - av_log2(gain_after);
+        gain_after = bidir_sal(gain_after, exp_after);
+
+        if (gain_before < gain_after) {
+            gain = (gain_before << 15) / gain_after;
+            gain = bidir_sal(gain, exp_after - exp_before - 1);
+        } else {
+            gain = ((gain_before - gain_after) << 14) / gain_after + 0x4000;
+            gain = bidir_sal(gain, exp_after - exp_before);
+        }
+        gain = (gain * G729_AGC_FAC1 + 0x4000) >> 15; // gain * (1-0.9875)
+    } else
+        gain = 0;
+
+    for (n = 0; n < subframe_size; n++) {
+        // gain_prev = gain + 0.9875 * gain_prev
+        gain_prev = (G729_AGC_FACTOR * gain_prev + 0x4000) >> 15;
+        gain_prev = av_clip_int16(gain + gain_prev);
+        speech[n] = av_clip_int16((speech[n] * gain_prev + 0x2000) >> 14);
+    }
+    return gain_prev;
+}
diff --git a/libavcodec/g729postfilter.h b/libavcodec/g729postfilter.h
new file mode 100644
index 0000000000..89e3e40cea
--- /dev/null
+++ b/libavcodec/g729postfilter.h
@@ -0,0 +1,116 @@
+/*
+ * G.729, G729 Annex D postfilter
+ * Copyright (c) 2008 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef FFMPEG_G729POSTFILTER_H
+#define FFMPEG_G729POSTFILTER_H
+
+#include <stdint.h>
+#include "audiodsp.h"
+
+/**
+ * tilt compensation factor (G.729, k1>0)
+ * 0.2 in Q15
+ */
+#define G729_TILT_FACTOR_PLUS       6554
+
+/**
+ * tilt compensation factor (G.729, k1<0)
+ * 0.9 in Q15
+ */
+#define G729_TILT_FACTOR_MINUS     29491
+
+/* 4.2.2 */
+#define FORMANT_PP_FACTOR_NUM  18022             //0.55 in Q15
+#define FORMANT_PP_FACTOR_DEN  22938             //0.70 in Q15
+
+/**
+ * gain adjustment factor (G.729, 4.2.4)
+ * 0.9875 in Q15
+ */
+#define G729_AGC_FACTOR            32358
+#define G729_AGC_FAC1 (32768-G729_AGC_FACTOR)
+
+/**
+ * 1.0 / (1.0 + 0.5) in Q15
+ * where 0.5 is the minimum value of
+ * weight factor, controlling amount of long-term postfiltering
+ */
+#define MIN_LT_FILT_FACTOR_A       21845
+
+/**
+ * Short interpolation filter length
+ */
+#define SHORT_INT_FILT_LEN         2
+
+/**
+ * Long interpolation filter length
+ */
+#define LONG_INT_FILT_LEN          8
+
+/**
+ * Number of analyzed fractional pitch delays in second stage of long-term
+ * postfilter
+ */
+#define ANALYZED_FRAC_DELAYS       7
+
+/**
+ * Amount of past residual signal data stored in buffer
+ */
+#define RES_PREV_DATA_SIZE (PITCH_DELAY_MAX + LONG_INT_FILT_LEN + 1)
+
+/**
+ * \brief Signal postfiltering (4.2)
+ * \param dsp initialized DSP context
+ * \param ht_prev_data [in/out] (Q12) pointer to variable receiving tilt
+ *                     compensation filter data from previous subframe
+ * \param voicing [in/out] (Q0) pointer to variable receiving voicing decision
+ * \param lp_filter_coeffs (Q12) LP filter coefficients
+ * \param pitch_delay_int integer part of the pitch delay
+ * \param residual [in/out] (Q0) residual signal buffer (used in long-term postfilter)
+ * \param res_filter_data [in/out] (Q0) speech data of previous subframe
+ * \param pos_filter_data [in/out] (Q0) previous speech data for short-term postfilter
+ * \param speech [in/out] (Q0) signal buffer
+ * \param subframe_size size of subframe
+ *
+ * Filtering has the following  stages:
+ *   Long-term postfilter (4.2.1)
+ *   Short-term postfilter (4.2.2).
+ *   Tilt-compensation (4.2.3)
+ */
+void ff_g729_postfilter(AudioDSPContext *adsp, int16_t* ht_prev_data, int* voicing,
+                     const int16_t *lp_filter_coeffs, int pitch_delay_int,
+                     int16_t* residual, int16_t* res_filter_data,
+                     int16_t* pos_filter_data, int16_t *speech,
+                     int subframe_size);
+
+/**
+ * \brief Adaptive gain control (4.2.4)
+ * \param gain_before (Q0) gain of speech before applying postfilters
+ * \param gain_after  (Q0) gain of speech after applying postfilters
+ * \param speech [in/out] (Q0) signal buffer
+ * \param subframe_size length of subframe
+ * \param gain_prev (Q12) previous value of gain coefficient
+ *
+ * \return (Q12) last value of gain coefficient
+ */
+int16_t ff_g729_adaptive_gain_control(int gain_before, int gain_after, int16_t *speech,
+                                   int subframe_size, int16_t gain_prev);
+
+#endif // FFMPEG_G729POSTFILTER_H
diff --git a/libavcodec/get_bits.h b/libavcodec/get_bits.h
index ea7e0c4509..0d3db1f3cf 100644
--- a/libavcodec/get_bits.h
+++ b/libavcodec/get_bits.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/log.h"
+#include "libavutil/avassert.h"
 #include "mathops.h"
 
 /*
@@ -54,9 +55,7 @@ typedef struct GetBitContext {
     const uint8_t *buffer, *buffer_end;
     int index;
     int size_in_bits;
-#if !UNCHECKED_BITSTREAM_READER
     int size_in_bits_plus8;
-#endif
 } GetBitContext;
 
 #define VLC_TYPE int16_t
@@ -114,6 +113,9 @@ typedef struct RL_VLC_ELEM {
  * LAST_SKIP_BITS(name, gb, num)
  *   Like SKIP_BITS, to be used if next call is UPDATE_CACHE or CLOSE_READER.
  *
+ * BITS_LEFT(name, gb)
+ *   Return the number of bits left
+ *
  * For examples see get_bits, show_bits, skip_bits, get_vlc.
  */
 
@@ -125,7 +127,7 @@ typedef struct RL_VLC_ELEM {
 
 #define OPEN_READER_NOSIZE(name, gb)            \
     unsigned int name ## _index = (gb)->index;  \
-    unsigned int av_unused name ## _cache = 0
+    unsigned int av_unused name ## _cache
 
 #if UNCHECKED_BITSTREAM_READER
 #define OPEN_READER(name, gb) OPEN_READER_NOSIZE(name, gb)
@@ -141,27 +143,34 @@ typedef struct RL_VLC_ELEM {
 
 #define CLOSE_READER(name, gb) (gb)->index = name ## _index
 
+# ifdef LONG_BITSTREAM_READER
+
+# define UPDATE_CACHE_LE(name, gb) name ## _cache = \
+      AV_RL64((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7)
+
+# define UPDATE_CACHE_BE(name, gb) name ## _cache = \
+      AV_RB64((gb)->buffer + (name ## _index >> 3)) >> (32 - (name ## _index & 7))
+
+#else
+
+# define UPDATE_CACHE_LE(name, gb) name ## _cache = \
+      AV_RL32((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7)
+
+# define UPDATE_CACHE_BE(name, gb) name ## _cache = \
+      AV_RB32((gb)->buffer + (name ## _index >> 3)) << (name ## _index & 7)
+
+#endif
+
+
 #ifdef BITSTREAM_READER_LE
 
-# ifdef LONG_BITSTREAM_READER
-#   define UPDATE_CACHE(name, gb) name ## _cache = \
-        AV_RL64((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7)
-# else
-#   define UPDATE_CACHE(name, gb) name ## _cache = \
-        AV_RL32((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7)
-# endif
+# define UPDATE_CACHE(name, gb) UPDATE_CACHE_LE(name, gb)
 
 # define SKIP_CACHE(name, gb, num) name ## _cache >>= (num)
 
 #else
 
-# ifdef LONG_BITSTREAM_READER
-#   define UPDATE_CACHE(name, gb) name ## _cache = \
-        AV_RB64((gb)->buffer + (name ## _index >> 3)) >> (32 - (name ## _index & 7))
-# else
-#   define UPDATE_CACHE(name, gb) name ## _cache = \
-        AV_RB32((gb)->buffer + (name ## _index >> 3)) << (name ## _index & 7)
-# endif
+# define UPDATE_CACHE(name, gb) UPDATE_CACHE_BE(name, gb)
 
 # define SKIP_CACHE(name, gb, num) name ## _cache <<= (num)
 
@@ -174,6 +183,8 @@ typedef struct RL_VLC_ELEM {
     name ## _index = FFMIN(name ## _size_plus8, name ## _index + (num))
 #endif
 
+#define BITS_LEFT(name, gb) ((int)((gb)->size_in_bits - name ## _index))
+
 #define SKIP_BITS(name, gb, num)                \
     do {                                        \
         SKIP_CACHE(name, gb, num);              \
@@ -182,12 +193,18 @@ typedef struct RL_VLC_ELEM {
 
 #define LAST_SKIP_BITS(name, gb, num) SKIP_COUNTER(name, gb, num)
 
+#define SHOW_UBITS_LE(name, gb, num) zero_extend(name ## _cache, num)
+#define SHOW_SBITS_LE(name, gb, num) sign_extend(name ## _cache, num)
+
+#define SHOW_UBITS_BE(name, gb, num) NEG_USR32(name ## _cache, num)
+#define SHOW_SBITS_BE(name, gb, num) NEG_SSR32(name ## _cache, num)
+
 #ifdef BITSTREAM_READER_LE
-#   define SHOW_UBITS(name, gb, num) zero_extend(name ## _cache, num)
-#   define SHOW_SBITS(name, gb, num) sign_extend(name ## _cache, num)
+#   define SHOW_UBITS(name, gb, num) SHOW_UBITS_LE(name, gb, num)
+#   define SHOW_SBITS(name, gb, num) SHOW_SBITS_LE(name, gb, num)
 #else
-#   define SHOW_UBITS(name, gb, num) NEG_USR32(name ## _cache, num)
-#   define SHOW_SBITS(name, gb, num) NEG_SSR32(name ## _cache, num)
+#   define SHOW_UBITS(name, gb, num) SHOW_UBITS_BE(name, gb, num)
+#   define SHOW_SBITS(name, gb, num) SHOW_SBITS_BE(name, gb, num)
 #endif
 
 #define GET_CACHE(name, gb) ((uint32_t) name ## _cache)
@@ -207,7 +224,7 @@ static inline void skip_bits_long(GetBitContext *s, int n)
 }
 
 /**
- * read mpeg1 dc style vlc (sign bit + mantisse with no MSB).
+ * read mpeg1 dc style vlc (sign bit + mantissa with no MSB).
  * if MSB not set it is negative
  * @param n length in bits
  */
@@ -216,6 +233,7 @@ static inline int get_xbits(GetBitContext *s, int n)
     register int sign;
     register int32_t cache;
     OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25);
     UPDATE_CACHE(re, s);
     cache = GET_CACHE(re, s);
     sign  = ~cache >> 31;
@@ -228,6 +246,7 @@ static inline int get_sbits(GetBitContext *s, int n)
 {
     register int tmp;
     OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25);
     UPDATE_CACHE(re, s);
     tmp = SHOW_SBITS(re, s, n);
     LAST_SKIP_BITS(re, s, n);
@@ -242,6 +261,7 @@ static inline unsigned int get_bits(GetBitContext *s, int n)
 {
     register int tmp;
     OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25);
     UPDATE_CACHE(re, s);
     tmp = SHOW_UBITS(re, s, n);
     LAST_SKIP_BITS(re, s, n);
@@ -249,6 +269,18 @@ static inline unsigned int get_bits(GetBitContext *s, int n)
     return tmp;
 }
 
+static inline unsigned int get_bits_le(GetBitContext *s, int n)
+{
+    register int tmp;
+    OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25);
+    UPDATE_CACHE_LE(re, s);
+    tmp = SHOW_UBITS_LE(re, s, n);
+    LAST_SKIP_BITS(re, s, n);
+    CLOSE_READER(re, s);
+    return tmp;
+}
+
 /**
  * Show 1-25 bits.
  */
@@ -256,6 +288,7 @@ static inline unsigned int show_bits(GetBitContext *s, int n)
 {
     register int tmp;
     OPEN_READER_NOSIZE(re, s);
+    av_assert2(n>0 && n<=25);
     UPDATE_CACHE(re, s);
     tmp = SHOW_UBITS(re, s, n);
     return tmp;
@@ -264,7 +297,6 @@ static inline unsigned int show_bits(GetBitContext *s, int n)
 static inline void skip_bits(GetBitContext *s, int n)
 {
     OPEN_READER(re, s);
-    UPDATE_CACHE(re, s);
     LAST_SKIP_BITS(re, s, n);
     CLOSE_READER(re, s);
 }
@@ -304,20 +336,22 @@ static inline void skip_bits1(GetBitContext *s)
  */
 static inline unsigned int get_bits_long(GetBitContext *s, int n)
 {
-    if (n <= MIN_CACHE_BITS) {
+    if (!n) {
+        return 0;
+    } else if (n <= MIN_CACHE_BITS) {
         return get_bits(s, n);
     } else {
 #ifdef BITSTREAM_READER_LE
-        int ret = get_bits(s, 16);
+        unsigned ret = get_bits(s, 16);
         return ret | (get_bits(s, n - 16) << 16);
 #else
-        int ret = get_bits(s, 16) << (n - 16);
+        unsigned ret = get_bits(s, 16) << (n - 16);
         return ret | get_bits(s, n - 16);
 #endif
     }
 }
 
-/*
+/**
  * Read 0-64 bits.
  */
 static inline uint64_t get_bits64(GetBitContext *s, int n)
@@ -360,7 +394,7 @@ static inline int check_marker(GetBitContext *s, const char *msg)
 {
     int bit = get_bits1(s);
     if (!bit)
-        av_log(NULL, AV_LOG_INFO, "Marker bit missing %s\n", msg);
+        av_log(NULL, AV_LOG_INFO, "Marker bit missing at %d of %d %s\n", get_bits_count(s) - 1, s->size_in_bits, msg);
 
     return bit;
 }
@@ -379,7 +413,7 @@ static inline int init_get_bits(GetBitContext *s, const uint8_t *buffer,
     int buffer_size;
     int ret = 0;
 
-    if (bit_size > INT_MAX - 7 || bit_size < 0 || !buffer) {
+    if (bit_size >= INT_MAX - 7 || bit_size < 0 || !buffer) {
         bit_size    = 0;
         buffer      = NULL;
         ret         = AVERROR_INVALIDDATA;
@@ -389,9 +423,7 @@ static inline int init_get_bits(GetBitContext *s, const uint8_t *buffer,
 
     s->buffer             = buffer;
     s->size_in_bits       = bit_size;
-#if !UNCHECKED_BITSTREAM_READER
     s->size_in_bits_plus8 = bit_size + 8;
-#endif
     s->buffer_end         = buffer + buffer_size;
     s->index              = 0;
 
@@ -409,8 +441,8 @@ static inline int init_get_bits(GetBitContext *s, const uint8_t *buffer,
 static inline int init_get_bits8(GetBitContext *s, const uint8_t *buffer,
                                  int byte_size)
 {
-    if (byte_size > INT_MAX / 8)
-        return AVERROR_INVALIDDATA;
+    if (byte_size > INT_MAX / 8 || byte_size < 0)
+        byte_size = -1;
     return init_get_bits(s, buffer, byte_size * 8);
 }
 
@@ -486,7 +518,7 @@ void ff_free_vlc(VLC *vlc);
         SKIP_BITS(name, gb, n);                                 \
     } while (0)
 
-#define GET_RL_VLC(level, run, name, gb, table, bits,           \
+#define GET_RL_VLC_INTERNAL(level, run, name, gb, table, bits,  \
                    max_depth, need_update)                      \
     do {                                                        \
         int n, nb_bits;                                         \
@@ -558,6 +590,20 @@ static inline int get_bits_left(GetBitContext *gb)
     return gb->size_in_bits - get_bits_count(gb);
 }
 
+static inline int skip_1stop_8data_bits(GetBitContext *gb)
+{
+    if (get_bits_left(gb) <= 0)
+        return AVERROR_INVALIDDATA;
+
+    while (get_bits1(gb)) {
+        skip_bits(gb, 8);
+        if (get_bits_left(gb) <= 0)
+            return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
 //#define TRACE
 
 #ifdef TRACE
@@ -601,6 +647,25 @@ static inline int get_vlc_trace(GetBitContext *s, VLC_TYPE (*table)[2],
     return r;
 }
 
+#define GET_RL_VLC(level, run, name, gb, table, bits,           \
+                   max_depth, need_update)                      \
+    do {                                                        \
+        int show  = SHOW_UBITS(name, gb, 24);                   \
+        int len;                                                \
+        int pos = name ## _index;                               \
+                                                                \
+        GET_RL_VLC_INTERNAL(level, run, name, gb, table, bits,max_depth, need_update); \
+                                                                \
+        len = name ## _index - pos + 1;                         \
+        show = show >> (24 - len);                              \
+                                                                \
+        print_bin(show, len);                                   \
+                                                                \
+        av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d/%-3d rlv @%5d in %s %s:%d\n",\
+               show, len, run-1, level, pos, __FILE__, __PRETTY_FUNCTION__, __LINE__);\
+    } while (0)                                                 \
+
+
 static inline int get_xbits_trace(GetBitContext *s, int n, const char *file,
                                   const char *func, int line)
 {
@@ -620,6 +685,8 @@ static inline int get_xbits_trace(GetBitContext *s, int n, const char *file,
 
 #define get_vlc(s, vlc)             get_vlc_trace(s, (vlc)->table, (vlc)->bits,   3, __FILE__, __PRETTY_FUNCTION__, __LINE__)
 #define get_vlc2(s, tab, bits, max) get_vlc_trace(s,          tab,        bits, max, __FILE__, __PRETTY_FUNCTION__, __LINE__)
+#else //TRACE
+#define GET_RL_VLC GET_RL_VLC_INTERNAL
 #endif
 
 #endif /* AVCODEC_GET_BITS_H */
diff --git a/libavcodec/gif.c b/libavcodec/gif.c
index 451e335991..acdc0e1e0d 100644
--- a/libavcodec/gif.c
+++ b/libavcodec/gif.c
@@ -1,113 +1,201 @@
 /*
- * GIF encoder.
  * Copyright (c) 2000 Fabrice Bellard
  * Copyright (c) 2002 Francois Revol
  * Copyright (c) 2006 Baptiste Coudurier
  *
  * first version by Francois Revol <revol@free.fr>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-/*
- * Features and limitations:
- * - currently no compression is performed,
- *   in fact the size of the data is 9/8 the size of the image in 8bpp
- * - uses only a global standard palette
- * - tested with IE 5.0, Opera for BeOS, NetPositive (BeOS), and Mozilla (BeOS).
- *
- * Reference documents:
- * http://www.goice.co.jp/member/mo/formats/gif.html
- * http://astronomy.swin.edu.au/pbourke/dataformats/gif/
- * http://www.dcs.ed.ac.uk/home/mxr/gfx/2d/GIF89a.txt
- *
- * this url claims to have an LZW algorithm not covered by Unisys patent:
- * http://www.msg.net/utility/whirlgif/gifencod.html
- * could help reduce the size of the files _a lot_...
- * some sites mentions an RLE type compression also.
+/**
+ * @file
+ * GIF encoder
+ * @see http://www.w3.org/Graphics/GIF/spec-gif89a.txt
  */
 
+#define BITSTREAM_WRITER_LE
+#include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
 #include "lzw.h"
-
-/* The GIF format uses reversed order for bitstreams... */
-/* at least they don't use PDP_ENDIAN :) */
-#define BITSTREAM_WRITER_LE
+#include "gif.h"
 
 #include "put_bits.h"
 
 typedef struct GIFContext {
+    const AVClass *class;
     LZWState *lzw;
     uint8_t *buf;
+    AVFrame *last_frame;
+    int flags;
+    uint32_t palette[AVPALETTE_COUNT];  ///< local reference palette for !pal8
+    int palette_loaded;
+    int transparent_index;
+    uint8_t *pal_exdata;
+    uint8_t *tmpl;                      ///< temporary line buffer
 } GIFContext;
 
-/* GIF header */
-static int gif_image_write_header(AVCodecContext *avctx,
-                                  uint8_t **bytestream, uint32_t *palette)
+enum {
+    GF_OFFSETTING = 1<<0,
+    GF_TRANSDIFF  = 1<<1,
+};
+
+static int pick_palette_entry(const uint8_t *buf, int linesize, int w, int h)
 {
-    int i;
-    unsigned int v;
-
-    bytestream_put_buffer(bytestream, "GIF", 3);
-    bytestream_put_buffer(bytestream, "89a", 3);
-    bytestream_put_le16(bytestream, avctx->width);
-    bytestream_put_le16(bytestream, avctx->height);
-
-    bytestream_put_byte(bytestream, 0xf7); /* flags: global clut, 256 entries */
-    bytestream_put_byte(bytestream, 0x1f); /* background color index */
-    bytestream_put_byte(bytestream, 0); /* aspect ratio */
-
-    /* the global palette */
-    for(i=0;i<256;i++) {
-        v = palette[i];
-        bytestream_put_be24(bytestream, v);
-    }
+    int histogram[AVPALETTE_COUNT] = {0};
+    int x, y, i;
 
-    return 0;
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < w; x++)
+            histogram[buf[x]]++;
+        buf += linesize;
+    }
+    for (i = 0; i < FF_ARRAY_ELEMS(histogram); i++)
+        if (!histogram[i])
+            return i;
+    return -1;
 }
 
 static int gif_image_write_image(AVCodecContext *avctx,
                                  uint8_t **bytestream, uint8_t *end,
-                                 const uint8_t *buf, int linesize)
+                                 const uint32_t *palette,
+                                 const uint8_t *buf, const int linesize,
+                                 AVPacket *pkt)
 {
     GIFContext *s = avctx->priv_data;
-    int len = 0, height;
+    int len = 0, height = avctx->height, width = avctx->width, x, y;
+    int x_start = 0, y_start = 0, trans = s->transparent_index;
+    int honor_transparency = (s->flags & GF_TRANSDIFF) && s->last_frame;
     const uint8_t *ptr;
+
+    /* Crop image */
+    if ((s->flags & GF_OFFSETTING) && s->last_frame && !palette) {
+        const uint8_t *ref = s->last_frame->data[0];
+        const int ref_linesize = s->last_frame->linesize[0];
+        int x_end = avctx->width  - 1,
+            y_end = avctx->height - 1;
+
+        /* skip common lines */
+        while (y_start < y_end) {
+            if (memcmp(ref + y_start*ref_linesize, buf + y_start*linesize, width))
+                break;
+            y_start++;
+        }
+        while (y_end > y_start) {
+            if (memcmp(ref + y_end*ref_linesize, buf + y_end*linesize, width))
+                break;
+            y_end--;
+        }
+        height = y_end + 1 - y_start;
+
+        /* skip common columns */
+        while (x_start < x_end) {
+            int same_column = 1;
+            for (y = y_start; y <= y_end; y++) {
+                if (ref[y*ref_linesize + x_start] != buf[y*linesize + x_start]) {
+                    same_column = 0;
+                    break;
+                }
+            }
+            if (!same_column)
+                break;
+            x_start++;
+        }
+        while (x_end > x_start) {
+            int same_column = 1;
+            for (y = y_start; y <= y_end; y++) {
+                if (ref[y*ref_linesize + x_end] != buf[y*linesize + x_end]) {
+                    same_column = 0;
+                    break;
+                }
+            }
+            if (!same_column)
+                break;
+            x_end--;
+        }
+        width = x_end + 1 - x_start;
+
+        av_log(avctx, AV_LOG_DEBUG,"%dx%d image at pos (%d;%d) [area:%dx%d]\n",
+               width, height, x_start, y_start, avctx->width, avctx->height);
+    }
+
     /* image block */
+    bytestream_put_byte(bytestream, GIF_IMAGE_SEPARATOR);
+    bytestream_put_le16(bytestream, x_start);
+    bytestream_put_le16(bytestream, y_start);
+    bytestream_put_le16(bytestream, width);
+    bytestream_put_le16(bytestream, height);
 
-    bytestream_put_byte(bytestream, 0x2c);
-    bytestream_put_le16(bytestream, 0);
-    bytestream_put_le16(bytestream, 0);
-    bytestream_put_le16(bytestream, avctx->width);
-    bytestream_put_le16(bytestream, avctx->height);
-    bytestream_put_byte(bytestream, 0x00); /* flags */
-    /* no local clut */
+    if (!palette) {
+        bytestream_put_byte(bytestream, 0x00); /* flags */
+    } else {
+        unsigned i;
+        bytestream_put_byte(bytestream, 1<<7 | 0x7); /* flags */
+        for (i = 0; i < AVPALETTE_COUNT; i++) {
+            const uint32_t v = palette[i];
+            bytestream_put_be24(bytestream, v);
+        }
+    }
+
+    if (honor_transparency && trans < 0) {
+        trans = pick_palette_entry(buf + y_start*linesize + x_start,
+                                   linesize, width, height);
+        if (trans < 0) { // TODO, patch welcome
+            av_log(avctx, AV_LOG_DEBUG, "No available color, can not use transparency\n");
+        } else {
+            uint8_t *pal_exdata = s->pal_exdata;
+            if (!pal_exdata)
+                pal_exdata = av_packet_new_side_data(pkt, AV_PKT_DATA_PALETTE, AVPALETTE_SIZE);
+            if (!pal_exdata)
+                return AVERROR(ENOMEM);
+            memcpy(pal_exdata, s->palette, AVPALETTE_SIZE);
+            pal_exdata[trans*4 + 3*!HAVE_BIGENDIAN] = 0x00;
+        }
+    }
+    if (trans < 0)
+        honor_transparency = 0;
 
     bytestream_put_byte(bytestream, 0x08);
 
-    ff_lzw_encode_init(s->lzw, s->buf, avctx->width*avctx->height,
+    ff_lzw_encode_init(s->lzw, s->buf, 2 * width * height,
                        12, FF_LZW_GIF, put_bits);
 
-    ptr = buf;
-    for (height = avctx->height; height--;) {
-        len += ff_lzw_encode(s->lzw, ptr, avctx->width);
-        ptr += linesize;
+    ptr = buf + y_start*linesize + x_start;
+    if (honor_transparency) {
+        const int ref_linesize = s->last_frame->linesize[0];
+        const uint8_t *ref = s->last_frame->data[0] + y_start*ref_linesize + x_start;
+
+        for (y = 0; y < height; y++) {
+            memcpy(s->tmpl, ptr, width);
+            for (x = 0; x < width; x++)
+                if (ref[x] == ptr[x])
+                    s->tmpl[x] = trans;
+            len += ff_lzw_encode(s->lzw, s->tmpl, width);
+            ptr += linesize;
+            ref += ref_linesize;
+        }
+    } else {
+        for (y = 0; y < height; y++) {
+            len += ff_lzw_encode(s->lzw, ptr, width);
+            ptr += linesize;
+        }
     }
     len += ff_lzw_encode_flush(s->lzw, flush_put_bits);
 
@@ -122,7 +210,6 @@ static int gif_image_write_image(AVCodecContext *avctx,
         len -= size;
     }
     bytestream_put_byte(bytestream, 0x00); /* end of image block */
-    bytestream_put_byte(bytestream, 0x3b);
     return 0;
 }
 
@@ -130,6 +217,10 @@ static av_cold int gif_encode_init(AVCodecContext *avctx)
 {
     GIFContext *s = avctx->priv_data;
 
+    if (avctx->width > 65535 || avctx->height > 65535) {
+        av_log(avctx, AV_LOG_ERROR, "GIF does not support resolutions above 65535x65535\n");
+        return AVERROR(EINVAL);
+    }
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
     avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
@@ -137,31 +228,85 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
+    s->transparent_index = -1;
+
     s->lzw = av_mallocz(ff_lzw_encode_state_size);
-    if (!s->lzw)
-        return AVERROR(ENOMEM);
     s->buf = av_malloc(avctx->width*avctx->height*2);
-    if (!s->buf)
-         return AVERROR(ENOMEM);
+    s->tmpl = av_malloc(avctx->width);
+    if (!s->tmpl || !s->buf || !s->lzw)
+        return AVERROR(ENOMEM);
+
+    if (avpriv_set_systematic_pal2(s->palette, avctx->pix_fmt) < 0)
+        av_assert0(avctx->pix_fmt == AV_PIX_FMT_PAL8);
+
     return 0;
 }
 
-/* better than nothing gif encoder */
+/* FIXME: duplicated with lavc */
+static int get_palette_transparency_index(const uint32_t *palette)
+{
+    int transparent_color_index = -1;
+    unsigned i, smallest_alpha = 0xff;
+
+    if (!palette)
+        return -1;
+
+    for (i = 0; i < AVPALETTE_COUNT; i++) {
+        const uint32_t v = palette[i];
+        if (v >> 24 < smallest_alpha) {
+            smallest_alpha = v >> 24;
+            transparent_color_index = i;
+        }
+    }
+    return smallest_alpha < 128 ? transparent_color_index : -1;
+}
+
 static int gif_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                             const AVFrame *pict, int *got_packet)
 {
+    GIFContext *s = avctx->priv_data;
     uint8_t *outbuf_ptr, *end;
+    const uint32_t *palette = NULL;
     int ret;
 
-    if ((ret = ff_alloc_packet(pkt, avctx->width*avctx->height*7/5 + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width*avctx->height*7/5 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
     outbuf_ptr = pkt->data;
     end        = pkt->data + pkt->size;
 
-    gif_image_write_header(avctx, &outbuf_ptr, (uint32_t *)pict->data[1]);
-    gif_image_write_image(avctx, &outbuf_ptr, end, pict->data[0], pict->linesize[0]);
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+        uint8_t *pal_exdata = av_packet_new_side_data(pkt, AV_PKT_DATA_PALETTE, AVPALETTE_SIZE);
+        if (!pal_exdata)
+            return AVERROR(ENOMEM);
+        memcpy(pal_exdata, pict->data[1], AVPALETTE_SIZE);
+        palette = (uint32_t*)pict->data[1];
+
+        s->pal_exdata = pal_exdata;
+
+        /* The first palette with PAL8 will be used as generic palette by the
+         * muxer so we don't need to write it locally in the packet. We store
+         * it as a reference here in case it changes later. */
+        if (!s->palette_loaded) {
+            memcpy(s->palette, palette, AVPALETTE_SIZE);
+            s->transparent_index = get_palette_transparency_index(palette);
+            s->palette_loaded = 1;
+            palette = NULL;
+        } else if (!memcmp(s->palette, palette, AVPALETTE_SIZE)) {
+            palette = NULL;
+        }
+    }
+
+    gif_image_write_image(avctx, &outbuf_ptr, end, palette,
+                          pict->data[0], pict->linesize[0], pkt);
+    if (!s->last_frame) {
+        s->last_frame = av_frame_alloc();
+        if (!s->last_frame)
+            return AVERROR(ENOMEM);
+    }
+    av_frame_unref(s->last_frame);
+    ret = av_frame_ref(s->last_frame, (AVFrame*)pict);
+    if (ret < 0)
+        return ret;
 
     pkt->size   = outbuf_ptr - pkt->data;
     pkt->flags |= AV_PKT_FLAG_KEY;
@@ -176,9 +321,27 @@ static int gif_encode_close(AVCodecContext *avctx)
 
     av_freep(&s->lzw);
     av_freep(&s->buf);
+    av_frame_free(&s->last_frame);
+    av_freep(&s->tmpl);
     return 0;
 }
 
+#define OFFSET(x) offsetof(GIFContext, x)
+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption gif_options[] = {
+    { "gifflags", "set GIF flags", OFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = GF_OFFSETTING|GF_TRANSDIFF}, 0, INT_MAX, FLAGS, "flags" },
+        { "offsetting", "enable picture offsetting", 0, AV_OPT_TYPE_CONST, {.i64=GF_OFFSETTING}, INT_MIN, INT_MAX, FLAGS, "flags" },
+        { "transdiff", "enable transparency detection between frames", 0, AV_OPT_TYPE_CONST, {.i64=GF_TRANSDIFF}, INT_MIN, INT_MAX, FLAGS, "flags" },
+    { NULL }
+};
+
+static const AVClass gif_class = {
+    .class_name = "GIF encoder",
+    .item_name  = av_default_item_name,
+    .option     = gif_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_gif_encoder = {
     .name           = "gif",
     .long_name      = NULL_IF_CONFIG_SMALL("GIF (Graphics Interchange Format)"),
@@ -192,4 +355,5 @@ AVCodec ff_gif_encoder = {
         AV_PIX_FMT_RGB8, AV_PIX_FMT_BGR8, AV_PIX_FMT_RGB4_BYTE, AV_PIX_FMT_BGR4_BYTE,
         AV_PIX_FMT_GRAY8, AV_PIX_FMT_PAL8, AV_PIX_FMT_NONE
     },
+    .priv_class     = &gif_class,
 };
diff --git a/libavcodec/gif.h b/libavcodec/gif.h
new file mode 100644
index 0000000000..b4cf66546c
--- /dev/null
+++ b/libavcodec/gif.h
@@ -0,0 +1,49 @@
+/*
+ * GIF format definitions
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2006 Baptiste Coudurier
+ * Copyright (c) 2012 Vitaliy E Sugrobov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * GIF format definitions.
+ */
+
+#ifndef AVCODEC_GIFDEFS_H
+#define AVCODEC_GIFDEFS_H
+
+#include <stdint.h>
+
+static const uint8_t gif87a_sig[6] = "GIF87a";
+static const uint8_t gif89a_sig[6] = "GIF89a";
+
+#define GCE_DISPOSAL_NONE       0
+#define GCE_DISPOSAL_INPLACE    1
+#define GCE_DISPOSAL_BACKGROUND 2
+#define GCE_DISPOSAL_RESTORE    3
+
+#define GIF_TRAILER                 0x3b
+#define GIF_EXTENSION_INTRODUCER    0x21
+#define GIF_IMAGE_SEPARATOR         0x2c
+#define GIF_GCE_EXT_LABEL           0xf9
+#define GIF_APP_EXT_LABEL           0xff
+#define NETSCAPE_EXT_STR            "NETSCAPE2.0"
+
+#endif /* AVCODEC_GIFDEFS_H */
diff --git a/libavcodec/gifdec.c b/libavcodec/gifdec.c
index f08d501113..9f2e6eb3b7 100644
--- a/libavcodec/gifdec.c
+++ b/libavcodec/gifdec.c
@@ -2,122 +2,268 @@
  * GIF decoder
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2006 Baptiste Coudurier
+ * Copyright (c) 2012 Vitaliy E Sugrobov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
 #include "lzw.h"
+#include "gif.h"
 
-#define GCE_DISPOSAL_NONE       0
-#define GCE_DISPOSAL_INPLACE    1
-#define GCE_DISPOSAL_BACKGROUND 2
-#define GCE_DISPOSAL_RESTORE    3
+/* This value is intentionally set to "transparent white" color.
+ * It is much better to have white background instead of black
+ * when gif image converted to format which not support transparency.
+ */
+#define GIF_TRANSPARENT_COLOR    0x00ffffff
 
 typedef struct GifState {
+    const AVClass *class;
+    AVFrame *frame;
     int screen_width;
     int screen_height;
+    int has_global_palette;
     int bits_per_pixel;
+    uint32_t bg_color;
     int background_color_index;
     int transparent_color_index;
     int color_resolution;
-    uint32_t *image_palette;
+    /* intermediate buffer for storing color indices
+     * obtained from lzw-encoded data stream */
+    uint8_t *idx_line;
+    int idx_line_size;
 
     /* after the frame is displayed, the disposal method is used */
+    int gce_prev_disposal;
     int gce_disposal;
-    /* delay during which the frame is shown */
-    int gce_delay;
+    /* rectangle describing area that must be disposed */
+    int gce_l, gce_t, gce_w, gce_h;
+    /* depending on disposal method we store either part of the image
+     * drawn on the canvas or background color that
+     * should be used upon disposal */
+    uint32_t * stored_img;
+    int stored_img_size;
+    int stored_bg_color;
 
-    /* LZW compatible decoder */
     GetByteContext gb;
     LZWState *lzw;
 
     /* aux buffers */
-    uint8_t global_palette[256 * 3];
-    uint8_t local_palette[256 * 3];
+    uint32_t global_palette[256];
+    uint32_t local_palette[256];
 
-  AVCodecContext* avctx;
+    AVCodecContext *avctx;
+    int keyframe;
+    int keyframe_ok;
+    int trans_color;    /**< color value that is used instead of transparent color */
 } GifState;
 
-static const uint8_t gif87a_sig[6] = "GIF87a";
-static const uint8_t gif89a_sig[6] = "GIF89a";
+static void gif_read_palette(GifState *s, uint32_t *pal, int nb)
+{
+    int i;
+
+    for (i = 0; i < nb; i++, pal++)
+        *pal = (0xffu << 24) | bytestream2_get_be24u(&s->gb);
+}
+
+static void gif_fill(AVFrame *picture, uint32_t color)
+{
+    uint32_t *p = (uint32_t *)picture->data[0];
+    uint32_t *p_end = p + (picture->linesize[0] / sizeof(uint32_t)) * picture->height;
+
+    for (; p < p_end; p++)
+        *p = color;
+}
+
+static void gif_fill_rect(AVFrame *picture, uint32_t color, int l, int t, int w, int h)
+{
+    const int linesize = picture->linesize[0] / sizeof(uint32_t);
+    const uint32_t *py = (uint32_t *)picture->data[0] + t * linesize;
+    const uint32_t *pr, *pb = py + h * linesize;
+    uint32_t *px;
+
+    for (; py < pb; py += linesize) {
+        px = (uint32_t *)py + l;
+        pr = px + w;
+
+        for (; px < pr; px++)
+            *px = color;
+    }
+}
+
+static void gif_copy_img_rect(const uint32_t *src, uint32_t *dst,
+                              int linesize, int l, int t, int w, int h)
+{
+    const int y_start = t * linesize;
+    const uint32_t *src_px,
+                   *src_py = src + y_start,
+                   *dst_py = dst + y_start;
+    const uint32_t *src_pb = src_py + h * linesize;
+    uint32_t *dst_px;
+
+    for (; src_py < src_pb; src_py += linesize, dst_py += linesize) {
+        src_px = src_py + l;
+        dst_px = (uint32_t *)dst_py + l;
+
+        memcpy(dst_px, src_px, w * sizeof(uint32_t));
+    }
+}
 
 static int gif_read_image(GifState *s, AVFrame *frame)
 {
-    int left, top, width, height, bits_per_pixel, code_size, flags;
-    int is_interleaved, has_local_palette, y, pass, y1, linesize, n, i;
-    uint8_t *ptr, *spal, *palette, *ptr1;
-
-    left   = bytestream2_get_le16(&s->gb);
-    top    = bytestream2_get_le16(&s->gb);
-    width  = bytestream2_get_le16(&s->gb);
-    height = bytestream2_get_le16(&s->gb);
-    flags  = bytestream2_get_byte(&s->gb);
+    int left, top, width, height, bits_per_pixel, code_size, flags, pw;
+    int is_interleaved, has_local_palette, y, pass, y1, linesize, pal_size;
+    uint32_t *ptr, *pal, *px, *pr, *ptr1;
+    int ret;
+    uint8_t *idx;
+
+    /* At least 9 bytes of Image Descriptor. */
+    if (bytestream2_get_bytes_left(&s->gb) < 9)
+        return AVERROR_INVALIDDATA;
+
+    left   = bytestream2_get_le16u(&s->gb);
+    top    = bytestream2_get_le16u(&s->gb);
+    width  = bytestream2_get_le16u(&s->gb);
+    height = bytestream2_get_le16u(&s->gb);
+    flags  = bytestream2_get_byteu(&s->gb);
     is_interleaved = flags & 0x40;
     has_local_palette = flags & 0x80;
     bits_per_pixel = (flags & 0x07) + 1;
 
-    ff_dlog(s->avctx, "gif: image x=%d y=%d w=%d h=%d\n", left, top, width, height);
+    ff_dlog(s->avctx, "image x=%d y=%d w=%d h=%d\n", left, top, width, height);
 
     if (has_local_palette) {
-        bytestream2_get_buffer(&s->gb, s->local_palette, 3 * (1 << bits_per_pixel));
-        palette = s->local_palette;
+        pal_size = 1 << bits_per_pixel;
+
+        if (bytestream2_get_bytes_left(&s->gb) < pal_size * 3)
+            return AVERROR_INVALIDDATA;
+
+        gif_read_palette(s, s->local_palette, pal_size);
+        pal = s->local_palette;
     } else {
-        palette = s->global_palette;
-        bits_per_pixel = s->bits_per_pixel;
+        if (!s->has_global_palette) {
+            av_log(s->avctx, AV_LOG_ERROR, "picture doesn't have either global or local palette.\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        pal = s->global_palette;
+    }
+
+    if (s->keyframe) {
+        if (s->transparent_color_index == -1 && s->has_global_palette) {
+            /* transparency wasn't set before the first frame, fill with background color */
+            gif_fill(frame, s->bg_color);
+        } else {
+            /* otherwise fill with transparent color.
+             * this is necessary since by default picture filled with 0x80808080. */
+            gif_fill(frame, s->trans_color);
+        }
     }
 
     /* verify that all the image is inside the screen dimensions */
-    if (left + width > s->screen_width ||
-        top + height > s->screen_height ||
-        !width || !height) {
-        av_log(s->avctx, AV_LOG_ERROR, "Invalid image dimensions.\n");
+    if (!width || width > s->screen_width || left >= s->screen_width) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid image width.\n");
         return AVERROR_INVALIDDATA;
     }
+    if (!height || height > s->screen_height || top >= s->screen_height) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid image height.\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (left + width > s->screen_width) {
+        /* width must be kept around to avoid lzw vs line desync */
+        pw = s->screen_width - left;
+        av_log(s->avctx, AV_LOG_WARNING, "Image too wide by %d, truncating.\n",
+               left + width - s->screen_width);
+    } else {
+        pw = width;
+    }
+    if (top + height > s->screen_height) {
+        /* we don't care about the extra invisible lines */
+        av_log(s->avctx, AV_LOG_WARNING, "Image too high by %d, truncating.\n",
+               top + height - s->screen_height);
+        height = s->screen_height - top;
+    }
+
+    /* process disposal method */
+    if (s->gce_prev_disposal == GCE_DISPOSAL_BACKGROUND) {
+        gif_fill_rect(frame, s->stored_bg_color, s->gce_l, s->gce_t, s->gce_w, s->gce_h);
+    } else if (s->gce_prev_disposal == GCE_DISPOSAL_RESTORE) {
+        gif_copy_img_rect(s->stored_img, (uint32_t *)frame->data[0],
+            frame->linesize[0] / sizeof(uint32_t), s->gce_l, s->gce_t, s->gce_w, s->gce_h);
+    }
+
+    s->gce_prev_disposal = s->gce_disposal;
+
+    if (s->gce_disposal != GCE_DISPOSAL_NONE) {
+        s->gce_l = left;  s->gce_t = top;
+        s->gce_w = pw;    s->gce_h = height;
 
-    /* build the palette */
-    n = (1 << bits_per_pixel);
-    spal = palette;
-    for(i = 0; i < n; i++) {
-        s->image_palette[i] = (0xffu << 24) | AV_RB24(spal);
-        spal += 3;
+        if (s->gce_disposal == GCE_DISPOSAL_BACKGROUND) {
+            if (s->transparent_color_index >= 0)
+                s->stored_bg_color = s->trans_color;
+            else
+                s->stored_bg_color = s->bg_color;
+        } else if (s->gce_disposal == GCE_DISPOSAL_RESTORE) {
+            av_fast_malloc(&s->stored_img, &s->stored_img_size, frame->linesize[0] * frame->height);
+            if (!s->stored_img)
+                return AVERROR(ENOMEM);
+
+            gif_copy_img_rect((uint32_t *)frame->data[0], s->stored_img,
+                frame->linesize[0] / sizeof(uint32_t), left, top, pw, height);
+        }
     }
-    for(; i < 256; i++)
-        s->image_palette[i] = (0xffu << 24);
-    /* handle transparency */
-    if (s->transparent_color_index >= 0)
-        s->image_palette[s->transparent_color_index] = 0;
+
+    /* Expect at least 2 bytes: 1 for lzw code size and 1 for block size. */
+    if (bytestream2_get_bytes_left(&s->gb) < 2)
+        return AVERROR_INVALIDDATA;
 
     /* now get the image data */
-    code_size = bytestream2_get_byte(&s->gb);
-    ff_lzw_decode_init(s->lzw, code_size, s->gb.buffer,
-                       bytestream2_get_bytes_left(&s->gb), FF_LZW_GIF);
+    code_size = bytestream2_get_byteu(&s->gb);
+    if ((ret = ff_lzw_decode_init(s->lzw, code_size, s->gb.buffer,
+                                  bytestream2_get_bytes_left(&s->gb), FF_LZW_GIF)) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "LZW init failed\n");
+        return ret;
+    }
 
     /* read all the image */
-    linesize = frame->linesize[0];
-    ptr1 = frame->data[0] + top * linesize + left;
+    linesize = frame->linesize[0] / sizeof(uint32_t);
+    ptr1 = (uint32_t *)frame->data[0] + top * linesize + left;
     ptr = ptr1;
     pass = 0;
     y1 = 0;
     for (y = 0; y < height; y++) {
-        ff_lzw_decode(s->lzw, ptr, width);
+        int count = ff_lzw_decode(s->lzw, s->idx_line, width);
+        if (count != width) {
+            if (count)
+                av_log(s->avctx, AV_LOG_ERROR, "LZW decode failed\n");
+            goto decode_tail;
+        }
+
+        pr = ptr + pw;
+
+        for (px = ptr, idx = s->idx_line; px < pr; px++, idx++) {
+            if (*idx != s->transparent_color_index)
+                *px = pal[*idx];
+        }
+
         if (is_interleaved) {
             switch(pass) {
             default:
@@ -144,53 +290,76 @@ static int gif_read_image(GifState *s, AVFrame *frame)
             ptr += linesize;
         }
     }
+
+ decode_tail:
     /* read the garbage data until end marker is found */
     ff_lzw_decode_tail(s->lzw);
 
-    bytestream2_skip(&s->gb, ff_lzw_size_read(s->lzw));
+    /* Graphic Control Extension's scope is single frame.
+     * Remove its influence. */
+    s->transparent_color_index = -1;
+    s->gce_disposal = GCE_DISPOSAL_NONE;
+
     return 0;
 }
 
 static int gif_read_extension(GifState *s)
 {
-    int ext_code, ext_len, i, gce_flags, gce_transparent_index;
+    int ext_code, ext_len, gce_flags, gce_transparent_index;
+
+    /* There must be at least 2 bytes:
+     * 1 for extension label and 1 for extension length. */
+    if (bytestream2_get_bytes_left(&s->gb) < 2)
+        return AVERROR_INVALIDDATA;
 
-    /* extension */
-    ext_code = bytestream2_get_byte(&s->gb);
-    ext_len  = bytestream2_get_byte(&s->gb);
+    ext_code = bytestream2_get_byteu(&s->gb);
+    ext_len  = bytestream2_get_byteu(&s->gb);
 
-    ff_dlog(s->avctx, "gif: ext_code=0x%x len=%d\n", ext_code, ext_len);
+    ff_dlog(s->avctx, "ext_code=0x%x len=%d\n", ext_code, ext_len);
 
     switch(ext_code) {
-    case 0xf9:
+    case GIF_GCE_EXT_LABEL:
         if (ext_len != 4)
             goto discard_ext;
-        s->transparent_color_index = -1;
-        gce_flags    = bytestream2_get_byte(&s->gb);
-        s->gce_delay = bytestream2_get_le16(&s->gb);
-        gce_transparent_index = bytestream2_get_byte(&s->gb);
+
+        /* We need at least 5 bytes more: 4 is for extension body
+         * and 1 for next block size. */
+        if (bytestream2_get_bytes_left(&s->gb) < 5)
+            return AVERROR_INVALIDDATA;
+
+        gce_flags    = bytestream2_get_byteu(&s->gb);
+        bytestream2_skipu(&s->gb, 2);    // delay during which the frame is shown
+        gce_transparent_index = bytestream2_get_byteu(&s->gb);
         if (gce_flags & 0x01)
             s->transparent_color_index = gce_transparent_index;
         else
             s->transparent_color_index = -1;
         s->gce_disposal = (gce_flags >> 2) & 0x7;
 
-        ff_dlog(s->avctx, "gif: gce_flags=%x delay=%d tcolor=%d disposal=%d\n",
-               gce_flags, s->gce_delay,
+        ff_dlog(s->avctx, "gce_flags=%x tcolor=%d disposal=%d\n",
+               gce_flags,
                s->transparent_color_index, s->gce_disposal);
 
-        ext_len = bytestream2_get_byte(&s->gb);
+        if (s->gce_disposal > 3) {
+            s->gce_disposal = GCE_DISPOSAL_NONE;
+            ff_dlog(s->avctx, "invalid value in gce_disposal (%d). Using default value of 0.\n", ext_len);
+        }
+
+        ext_len = bytestream2_get_byteu(&s->gb);
         break;
     }
 
     /* NOTE: many extension blocks can come after */
  discard_ext:
-    while (ext_len != 0) {
-        for (i = 0; i < ext_len; i++)
-            bytestream2_get_byte(&s->gb);
-        ext_len = bytestream2_get_byte(&s->gb);
+    while (ext_len) {
+        /* There must be at least ext_len bytes and 1 for next block size byte. */
+        if (bytestream2_get_bytes_left(&s->gb) < ext_len + 1)
+            return AVERROR_INVALIDDATA;
+
+        bytestream2_skipu(&s->gb, ext_len);
+        ext_len = bytestream2_get_byteu(&s->gb);
 
-        ff_dlog(s->avctx, "gif: ext_len1=%d\n", ext_len);
+        ff_dlog(s->avctx, "ext_len1=%d\n", ext_len);
     }
     return 0;
 }
@@ -199,44 +368,48 @@ static int gif_read_header1(GifState *s)
 {
     uint8_t sig[6];
     int v, n;
-    int has_global_palette;
+    int background_color_index;
 
     if (bytestream2_get_bytes_left(&s->gb) < 13)
         return AVERROR_INVALIDDATA;
 
     /* read gif signature */
-    bytestream2_get_buffer(&s->gb, sig, 6);
-    if (memcmp(sig, gif87a_sig, 6) != 0 &&
-        memcmp(sig, gif89a_sig, 6) != 0)
+    bytestream2_get_bufferu(&s->gb, sig, 6);
+    if (memcmp(sig, gif87a_sig, 6) &&
+        memcmp(sig, gif89a_sig, 6))
         return AVERROR_INVALIDDATA;
 
     /* read screen header */
     s->transparent_color_index = -1;
-    s->screen_width  = bytestream2_get_le16(&s->gb);
-    s->screen_height = bytestream2_get_le16(&s->gb);
-    if(   (unsigned)s->screen_width  > 32767
-       || (unsigned)s->screen_height > 32767){
-        av_log(NULL, AV_LOG_ERROR, "picture size too large\n");
-        return AVERROR_INVALIDDATA;
-    }
+    s->screen_width  = bytestream2_get_le16u(&s->gb);
+    s->screen_height = bytestream2_get_le16u(&s->gb);
 
-    v = bytestream2_get_byte(&s->gb);
+    v = bytestream2_get_byteu(&s->gb);
     s->color_resolution = ((v & 0x70) >> 4) + 1;
-    has_global_palette = (v & 0x80);
+    s->has_global_palette = (v & 0x80);
     s->bits_per_pixel = (v & 0x07) + 1;
-    s->background_color_index = bytestream2_get_byte(&s->gb);
-    bytestream2_get_byte(&s->gb);                /* ignored */
+    background_color_index = bytestream2_get_byteu(&s->gb);
+    n = bytestream2_get_byteu(&s->gb);
+    if (n) {
+        s->avctx->sample_aspect_ratio.num = n + 15;
+        s->avctx->sample_aspect_ratio.den = 64;
+    }
 
-    ff_dlog(s->avctx, "gif: screen_w=%d screen_h=%d bpp=%d global_palette=%d\n",
+    ff_dlog(s->avctx, "screen_w=%d screen_h=%d bpp=%d global_palette=%d\n",
            s->screen_width, s->screen_height, s->bits_per_pixel,
-           has_global_palette);
+           s->has_global_palette);
 
-    if (has_global_palette) {
+    if (s->has_global_palette) {
+        s->background_color_index = background_color_index;
         n = 1 << s->bits_per_pixel;
         if (bytestream2_get_bytes_left(&s->gb) < n * 3)
             return AVERROR_INVALIDDATA;
-        bytestream2_get_buffer(&s->gb, s->global_palette, n * 3);
-    }
+
+        gif_read_palette(s, s->global_palette, n);
+        s->bg_color = s->global_palette[s->background_color_index];
+    } else
+        s->background_color_index = -1;
+
     return 0;
 }
 
@@ -246,23 +419,24 @@ static int gif_parse_next_image(GifState *s, AVFrame *frame)
         int code = bytestream2_get_byte(&s->gb);
         int ret;
 
-        ff_dlog(s->avctx, "gif: code=%02x '%c'\n", code, code);
+        av_log(s->avctx, AV_LOG_DEBUG, "code=%02x '%c'\n", code, code);
 
         switch (code) {
-        case ',':
+        case GIF_IMAGE_SEPARATOR:
             return gif_read_image(s, frame);
-        case '!':
+        case GIF_EXTENSION_INTRODUCER:
             if ((ret = gif_read_extension(s)) < 0)
                 return ret;
             break;
-        case ';':
+        case GIF_TRAILER:
             /* end of image */
+            return AVERROR_EOF;
         default:
-            /* error or erroneous EOF */
+            /* erroneous block label */
             return AVERROR_INVALIDDATA;
         }
     }
-    return AVERROR_INVALIDDATA;
+    return AVERROR_EOF;
 }
 
 static av_cold int gif_decode_init(AVCodecContext *avctx)
@@ -271,38 +445,74 @@ static av_cold int gif_decode_init(AVCodecContext *avctx)
 
     s->avctx = avctx;
 
+    avctx->pix_fmt = AV_PIX_FMT_RGB32;
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
     ff_lzw_decode_open(&s->lzw);
     return 0;
 }
 
-static int gif_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
-                            AVPacket *avpkt)
+static int gif_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
     GifState *s = avctx->priv_data;
-    AVFrame *picture = data;
     int ret;
 
-    bytestream2_init(&s->gb, buf, buf_size);
-    if ((ret = gif_read_header1(s)) < 0)
-        return ret;
+    bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
-    avctx->pix_fmt = AV_PIX_FMT_PAL8;
+    s->frame->pts     = avpkt->pts;
+    s->frame->pkt_pts = avpkt->pts;
+    s->frame->pkt_dts = avpkt->dts;
+    av_frame_set_pkt_duration(s->frame, avpkt->duration);
 
-    if ((ret = ff_set_dimensions(avctx, s->screen_width, s->screen_height)) < 0)
-        return ret;
+    if (avpkt->size >= 6) {
+        s->keyframe = memcmp(avpkt->data, gif87a_sig, 6) == 0 ||
+                      memcmp(avpkt->data, gif89a_sig, 6) == 0;
+    } else {
+        s->keyframe = 0;
+    }
 
-    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
+    if (s->keyframe) {
+        s->keyframe_ok = 0;
+        s->gce_prev_disposal = GCE_DISPOSAL_NONE;
+        if ((ret = gif_read_header1(s)) < 0)
+            return ret;
+
+        if ((ret = ff_set_dimensions(avctx, s->screen_width, s->screen_height)) < 0)
+            return ret;
+
+        av_frame_unref(s->frame);
+        if ((ret = ff_get_buffer(avctx, s->frame, 0)) < 0)
+            return ret;
+
+        av_fast_malloc(&s->idx_line, &s->idx_line_size, s->screen_width);
+        if (!s->idx_line)
+            return AVERROR(ENOMEM);
+
+        s->frame->pict_type = AV_PICTURE_TYPE_I;
+        s->frame->key_frame = 1;
+        s->keyframe_ok = 1;
+    } else {
+        if (!s->keyframe_ok) {
+            av_log(avctx, AV_LOG_ERROR, "cannot decode frame without keyframe\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+            return ret;
+
+        s->frame->pict_type = AV_PICTURE_TYPE_P;
+        s->frame->key_frame = 0;
     }
-    s->image_palette = (uint32_t *)picture->data[1];
-    ret = gif_parse_next_image(s, picture);
+
+    ret = gif_parse_next_image(s, s->frame);
     if (ret < 0)
         return ret;
 
+    if ((ret = av_frame_ref(data, s->frame)) < 0)
+        return ret;
     *got_frame = 1;
+
     return bytestream2_tell(&s->gb);
 }
 
@@ -311,9 +521,29 @@ static av_cold int gif_decode_close(AVCodecContext *avctx)
     GifState *s = avctx->priv_data;
 
     ff_lzw_decode_close(&s->lzw);
+    av_frame_free(&s->frame);
+    av_freep(&s->idx_line);
+    av_freep(&s->stored_img);
+
     return 0;
 }
 
+static const AVOption options[] = {
+    { "trans_color", "color value (ARGB) that is used instead of transparent color",
+      offsetof(GifState, trans_color), AV_OPT_TYPE_INT,
+      {.i64 = GIF_TRANSPARENT_COLOR}, 0, 0xffffffff,
+      AV_OPT_FLAG_DECODING_PARAM|AV_OPT_FLAG_VIDEO_PARAM },
+    { NULL },
+};
+
+static const AVClass decoder_class = {
+    .class_name = "gif decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+    .category   = AV_CLASS_CATEGORY_DECODER,
+};
+
 AVCodec ff_gif_decoder = {
     .name           = "gif",
     .long_name      = NULL_IF_CONFIG_SMALL("GIF (Graphics Interchange Format)"),
@@ -324,4 +554,5 @@ AVCodec ff_gif_decoder = {
     .close          = gif_decode_close,
     .decode         = gif_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_class     = &decoder_class,
 };
diff --git a/libavcodec/golomb-test.c b/libavcodec/golomb-test.c
index e740a20aea..2dfe917144 100644
--- a/libavcodec/golomb-test.c
+++ b/libavcodec/golomb-test.c
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,7 +58,7 @@ int main(void)
         }
     }
 
-#define EXTEND(i) (i << 3 | i & 7)
+#define EXTEND(i) ((i) << 3 | (i) & 7)
     init_put_bits(&pb, temp, SIZE);
     for (i = 0; i < COUNT; i++)
         set_ue_golomb(&pb, EXTEND(i));
diff --git a/libavcodec/golomb.c b/libavcodec/golomb.c
index 550c41ebfe..937ac22ce1 100644
--- a/libavcodec/golomb.c
+++ b/libavcodec/golomb.c
@@ -2,20 +2,20 @@
  * exp golomb vlc stuff
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/golomb.h b/libavcodec/golomb.h
index 22a87c64e1..d30bb6bc86 100644
--- a/libavcodec/golomb.h
+++ b/libavcodec/golomb.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -66,10 +66,14 @@ static inline int get_ue_golomb(GetBitContext *gb)
         return ff_ue_golomb_vlc_code[buf];
     } else {
         int log = 2 * av_log2(buf) - 31;
-        buf >>= log;
-        buf--;
         LAST_SKIP_BITS(re, gb, 32 - log);
         CLOSE_READER(re, gb);
+        if (CONFIG_FTRAPV && log < 0) {
+            av_log(NULL, AV_LOG_ERROR, "Invalid UE golomb code\n");
+            return AVERROR_INVALIDDATA;
+        }
+        buf >>= log;
+        buf--;
 
         return buf;
     }
@@ -138,7 +142,7 @@ static inline unsigned svq3_get_ue_golomb(GetBitContext *gb)
             ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf];
             UPDATE_CACHE(re, gb);
             buf = GET_CACHE(re, gb);
-        } while (BITS_AVAILABLE(re, gb));
+        } while (ret<0x8000000U && BITS_AVAILABLE(re, gb));
 
         CLOSE_READER(re, gb);
         return ret - 1;
@@ -150,7 +154,7 @@ static inline unsigned svq3_get_ue_golomb(GetBitContext *gb)
  */
 static inline int get_te0_golomb(GetBitContext *gb, int range)
 {
-    assert(range >= 1);
+    av_assert2(range >= 1);
 
     if (range == 1)
         return 0;
@@ -165,7 +169,7 @@ static inline int get_te0_golomb(GetBitContext *gb, int range)
  */
 static inline int get_te_golomb(GetBitContext *gb, int range)
 {
-    assert(range >= 1);
+    av_assert2(range >= 1);
 
     if (range == 2)
         return get_bits1(gb) ^ 1;
@@ -191,16 +195,18 @@ static inline int get_se_golomb(GetBitContext *gb)
 
         return ff_se_golomb_vlc_code[buf];
     } else {
-        int log = 2 * av_log2(buf) - 31;
+        int log = av_log2(buf), sign;
+        LAST_SKIP_BITS(re, gb, 31 - log);
+        UPDATE_CACHE(re, gb);
+        buf = GET_CACHE(re, gb);
+
         buf >>= log;
 
         LAST_SKIP_BITS(re, gb, 32 - log);
         CLOSE_READER(re, gb);
 
-        if (buf & 1)
-            buf = -(buf >> 1);
-        else
-            buf = (buf >> 1);
+        sign = -(buf & 1);
+        buf  = ((buf >> 1) ^ sign) - sign;
 
         return buf;
     }
@@ -209,13 +215,8 @@ static inline int get_se_golomb(GetBitContext *gb)
 static inline int get_se_golomb_long(GetBitContext *gb)
 {
     unsigned int buf = get_ue_golomb_long(gb);
-
-    if (buf & 1)
-        buf = (buf + 1) >> 1;
-    else
-        buf = -(buf >> 1);
-
-    return buf;
+    int sign = (buf & 1) - 1;
+    return ((buf >> 1) ^ sign) + 1;
 }
 
 static inline int svq3_get_se_golomb(GetBitContext *gb)
@@ -256,13 +257,8 @@ static inline int dirac_get_se_golomb(GetBitContext *gb)
     uint32_t ret = svq3_get_ue_golomb(gb);
 
     if (ret) {
-        uint32_t buf;
-        OPEN_READER(re, gb);
-        UPDATE_CACHE(re, gb);
-        buf = SHOW_SBITS(re, gb, 1);
-        LAST_SKIP_BITS(re, gb, 1);
-        ret = (ret ^ buf) - buf;
-        CLOSE_READER(re, gb);
+        int sign = -get_bits1(gb);
+        ret = (ret ^ sign) - sign;
     }
 
     return ret;
@@ -285,7 +281,7 @@ static inline int get_ur_golomb(GetBitContext *gb, int k, int limit,
 
     if (log > 31 - limit) {
         buf >>= log - k;
-        buf  += (30 - log) << k;
+        buf  += (30U - log) << k;
         LAST_SKIP_BITS(re, gb, 32 + k - log);
         CLOSE_READER(re, gb);
 
@@ -321,14 +317,16 @@ static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit,
     if (log - k >= 32 - MIN_CACHE_BITS + (MIN_CACHE_BITS == 32) &&
         32 - log < limit) {
         buf >>= log - k;
-        buf  += (30 - log) << k;
+        buf  += (30U - log) << k;
         LAST_SKIP_BITS(re, gb, 32 + k - log);
         CLOSE_READER(re, gb);
 
         return buf;
     } else {
         int i;
-        for (i = 0; i < limit && SHOW_UBITS(re, gb, 1) == 0 && BITS_AVAILABLE(re, gb); i++) {
+        for (i = 0; i < limit && SHOW_UBITS(re, gb, 1) == 0; i++) {
+            if (gb->size_in_bits <= re_index)
+                return -1;
             LAST_SKIP_BITS(re, gb, 1);
             UPDATE_CACHE(re, gb);
         }
@@ -336,8 +334,16 @@ static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit,
 
         if (i < limit - 1) {
             if (k) {
-                buf = SHOW_UBITS(re, gb, k);
-                LAST_SKIP_BITS(re, gb, k);
+                if (k > MIN_CACHE_BITS - 1) {
+                    buf = SHOW_UBITS(re, gb, 16) << (k-16);
+                    LAST_SKIP_BITS(re, gb, 16);
+                    UPDATE_CACHE(re, gb);
+                    buf |= SHOW_UBITS(re, gb, k-16);
+                    LAST_SKIP_BITS(re, gb, k-16);
+                } else {
+                    buf = SHOW_UBITS(re, gb, k);
+                    LAST_SKIP_BITS(re, gb, k);
+                }
             } else {
                 buf = 0;
             }
@@ -361,15 +367,8 @@ static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit,
 static inline int get_sr_golomb(GetBitContext *gb, int k, int limit,
                                 int esc_len)
 {
-    int v = get_ur_golomb(gb, k, limit, esc_len);
-
-    v++;
-    if (v & 1)
-        return v >> 1;
-    else
-        return -(v >> 1);
-
-//    return (v>>1) ^ -(v&1);
+    unsigned v = get_ur_golomb(gb, k, limit, esc_len);
+    return (v >> 1) ^ -(v & 1);
 }
 
 /**
@@ -378,7 +377,7 @@ static inline int get_sr_golomb(GetBitContext *gb, int k, int limit,
 static inline int get_sr_golomb_flac(GetBitContext *gb, int k, int limit,
                                      int esc_len)
 {
-    int v = get_ur_golomb_jpegls(gb, k, limit, esc_len);
+    unsigned v = get_ur_golomb_jpegls(gb, k, limit, esc_len);
     return (v >> 1) ^ -(v & 1);
 }
 
@@ -396,10 +395,7 @@ static inline unsigned int get_ur_golomb_shorten(GetBitContext *gb, int k)
 static inline int get_sr_golomb_shorten(GetBitContext *gb, int k)
 {
     int uvar = get_ur_golomb_jpegls(gb, k + 1, INT_MAX, 0);
-    if (uvar & 1)
-        return ~(uvar >> 1);
-    else
-        return uvar >> 1;
+    return (uvar >> 1) ^ -(uvar & 1);
 }
 
 #ifdef TRACE
@@ -467,14 +463,8 @@ static inline int get_te(GetBitContext *s, int r, char *file, const char *func,
  */
 static inline void set_ue_golomb(PutBitContext *pb, int i)
 {
-    assert(i >= 0);
+    av_assert2(i >= 0);
 
-#if 0
-    if (i = 0) {
-        put_bits(pb, 1, 1);
-        return;
-    }
-#endif
     if (i < 256)
         put_bits(pb, ff_ue_golomb_len[i], i + 1);
     else {
@@ -488,8 +478,8 @@ static inline void set_ue_golomb(PutBitContext *pb, int i)
  */
 static inline void set_te_golomb(PutBitContext *pb, int i, int range)
 {
-    assert(range >= 1);
-    assert(i <= range);
+    av_assert2(range >= 1);
+    av_assert2(i <= range);
 
     if (range == 2)
         put_bits(pb, 1, i ^ 1);
@@ -526,11 +516,11 @@ static inline void set_ur_golomb(PutBitContext *pb, int i, int k, int limit,
 {
     int e;
 
-    assert(i >= 0);
+    av_assert2(i >= 0);
 
     e = i >> k;
     if (e < limit)
-        put_bits(pb, e + k + 1, (1 << k) + (i & ((1 << k) - 1)));
+        put_bits(pb, e + k + 1, (1 << k) + av_mod_uintp2(i, k));
     else
         put_bits(pb, limit + esc_len, i - limit + 1);
 }
@@ -543,7 +533,7 @@ static inline void set_ur_golomb_jpegls(PutBitContext *pb, int i, int k,
 {
     int e;
 
-    assert(i >= 0);
+    av_assert2(i >= 0);
 
     e = (i >> k) + 1;
     if (e < limit) {
diff --git a/libavcodec/gsm.h b/libavcodec/gsm.h
index 238cb7359d..53d65c4dc7 100644
--- a/libavcodec/gsm.h
+++ b/libavcodec/gsm.h
@@ -1,20 +1,20 @@
 /*
  * GSM common header
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/gsm_parser.c b/libavcodec/gsm_parser.c
index c0befc7796..1054a30ca9 100644
--- a/libavcodec/gsm_parser.c
+++ b/libavcodec/gsm_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012  Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  * Splits packets into individual blocks.
  */
 
+#include "libavutil/avassert.h"
 #include "parser.h"
 #include "gsm.h"
 
@@ -55,7 +56,7 @@ static int gsm_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
             s->duration   = GSM_FRAME_SIZE * 2;
             break;
         default:
-            return AVERROR(EINVAL);
+            av_assert0(0);
         }
     }
 
diff --git a/libavcodec/gsmdec.c b/libavcodec/gsmdec.c
index a333e58bdc..cd56995183 100644
--- a/libavcodec/gsmdec.c
+++ b/libavcodec/gsmdec.c
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -79,10 +79,8 @@ static int gsm_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = avctx->frame_size;
-    if ((res = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
-    }
     samples = (int16_t *)frame->data[0];
 
     switch (avctx->codec_id) {
@@ -112,6 +110,7 @@ static void gsm_flush(AVCodecContext *avctx)
     memset(s, 0, sizeof(*s));
 }
 
+#if CONFIG_GSM_DECODER
 AVCodec ff_gsm_decoder = {
     .name           = "gsm",
     .long_name      = NULL_IF_CONFIG_SMALL("GSM"),
@@ -123,7 +122,8 @@ AVCodec ff_gsm_decoder = {
     .flush          = gsm_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
-
+#endif
+#if CONFIG_GSM_MS_DECODER
 AVCodec ff_gsm_ms_decoder = {
     .name           = "gsm_ms",
     .long_name      = NULL_IF_CONFIG_SMALL("GSM Microsoft variant"),
@@ -135,3 +135,4 @@ AVCodec ff_gsm_ms_decoder = {
     .flush          = gsm_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
diff --git a/libavcodec/gsmdec_data.c b/libavcodec/gsmdec_data.c
index c9b3183a55..d90c69b98f 100644
--- a/libavcodec/gsmdec_data.c
+++ b/libavcodec/gsmdec_data.c
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder data
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/gsmdec_data.h b/libavcodec/gsmdec_data.h
index f5581d53ba..b57194b8d1 100644
--- a/libavcodec/gsmdec_data.h
+++ b/libavcodec/gsmdec_data.h
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder data
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/gsmdec_template.c b/libavcodec/gsmdec_template.c
index 0b54dc54ce..4cb777c854 100644
--- a/libavcodec/gsmdec_template.c
+++ b/libavcodec/gsmdec_template.c
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,7 +64,7 @@ static inline int decode_log_area(int coded, int factor, int offset)
 {
     coded <<= 10;
     coded -= offset;
-    return gsm_mult(coded, factor) << 1;
+    return gsm_mult(coded, factor) * 2;
 }
 
 static av_noinline int get_rrp(int filtered)
@@ -121,7 +121,7 @@ static int postprocess(int16_t *data, int msr)
     int i;
     for (i = 0; i < 160; i++) {
         msr = av_clip_int16(data[i] + gsm_mult(msr, 28180));
-        data[i] = av_clip_int16(msr << 1) & ~7;
+        data[i] = av_clip_int16(msr * 2) & ~7;
     }
     return msr;
 }
diff --git a/libavcodec/h261.c b/libavcodec/h261.c
index bca7809773..f33d1a12f9 100644
--- a/libavcodec/h261.c
+++ b/libavcodec/h261.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h261.h b/libavcodec/h261.h
index ad7e28bb64..5586462893 100644
--- a/libavcodec/h261.h
+++ b/libavcodec/h261.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,6 @@ typedef struct H261Context {
     MpegEncContext s;
 
     int current_mba;
-    int previous_mba;
     int mba_diff;
     int mtype;
     int current_mv_x;
diff --git a/libavcodec/h261_parser.c b/libavcodec/h261_parser.c
index 2469424c1a..9c31557510 100644
--- a/libavcodec/h261_parser.c
+++ b/libavcodec/h261_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -71,11 +71,15 @@ static int h261_parse(AVCodecParserContext *s,
     ParseContext *pc = s->priv_data;
     int next;
 
-    next = h261_find_frame_end(pc, avctx, buf, buf_size);
-    if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
-        *poutbuf      = NULL;
-        *poutbuf_size = 0;
-        return buf_size;
+    if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
+        next = buf_size;
+    } else {
+        next = h261_find_frame_end(pc, avctx, buf, buf_size);
+        if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
+            *poutbuf      = NULL;
+            *poutbuf_size = 0;
+            return buf_size;
+        }
     }
     *poutbuf      = buf;
     *poutbuf_size = buf_size;
diff --git a/libavcodec/h261data.c b/libavcodec/h261data.c
index a81ccdfef9..a9891edd0a 100644
--- a/libavcodec/h261data.c
+++ b/libavcodec/h261data.c
@@ -2,20 +2,20 @@
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h261dec.c b/libavcodec/h261dec.c
index 95452f47b4..df60ac5d2e 100644
--- a/libavcodec/h261dec.c
+++ b/libavcodec/h261dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  * H.261 decoder.
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "mpeg_er.h"
 #include "mpegutils.h"
@@ -75,14 +76,11 @@ static av_cold int h261_decode_init(AVCodecContext *avctx)
 
     // set defaults
     ff_mpv_decode_defaults(s);
-    s->avctx       = avctx;
-    s->width       = s->avctx->coded_width;
-    s->height      = s->avctx->coded_height;
-    s->codec_id    = s->avctx->codec->id;
+    ff_mpv_decode_init(s, avctx);
+
     s->out_format  = FMT_H261;
     s->low_delay   = 1;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-    s->codec_id    = avctx->codec->id;
 
     ff_h261_common_init();
     h261_decode_init_vlc(h);
@@ -127,12 +125,12 @@ static int h261_decode_gob_header(H261Context *h)
     }
 
     /* GEI */
-    while (get_bits1(&s->gb) != 0)
-        skip_bits(&s->gb, 8);
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     if (s->qscale == 0) {
         av_log(s->avctx, AV_LOG_ERROR, "qscale has forbidden 0 value\n");
-        if (s->avctx->err_recognition & AV_EF_BITSTREAM)
+        if (s->avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_COMPLIANT))
             return -1;
     }
 
@@ -218,6 +216,13 @@ static int h261_decode_mb_skipped(H261Context *h, int mba1, int mba2)
         s->mb_skipped                  = 1;
         h->mtype                      &= ~MB_TYPE_H261_FIL;
 
+        if (s->current_picture.motion_val[0]) {
+            int b_stride = 2*s->mb_width + 1;
+            int b_xy     = 2 * s->mb_x + (2 * s->mb_y) * b_stride;
+            s->current_picture.motion_val[0][b_xy][0] = s->mv[0][0][0];
+            s->current_picture.motion_val[0][b_xy][1] = s->mv[0][0][1];
+        }
+
         ff_mpv_decode_mb(s, s->block);
     }
 
@@ -257,7 +262,7 @@ static int decode_mv_component(GetBitContext *gb, int v)
 static int h261_decode_block(H261Context *h, int16_t *block, int n, int coded)
 {
     MpegEncContext *const s = &h->s;
-    int code, level, i, j, run;
+    int level, i, j, run;
     RLTable *rl = &ff_h261_rl_tcoeff;
     const uint8_t *scan_table;
 
@@ -302,39 +307,47 @@ static int h261_decode_block(H261Context *h, int16_t *block, int n, int coded)
         s->block_last_index[n] = i - 1;
         return 0;
     }
+    {
+    OPEN_READER(re, &s->gb);
+    i--; // offset by -1 to allow direct indexing of scan_table
     for (;;) {
-        code = get_vlc2(&s->gb, rl->vlc.table, TCOEFF_VLC_BITS, 2);
-        if (code < 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n",
-                   s->mb_x, s->mb_y);
-            return -1;
-        }
-        if (code == rl->n) {
+        UPDATE_CACHE(re, &s->gb);
+        GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TCOEFF_VLC_BITS, 2, 0);
+        if (run == 66) {
+            if (level) {
+                CLOSE_READER(re, &s->gb);
+                av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n",
+                       s->mb_x, s->mb_y);
+                return -1;
+            }
             /* escape */
             /* The remaining combinations of (run, level) are encoded with a
              * 20-bit word consisting of 6 bits escape, 6 bits run and 8 bits
              * level. */
-            run   = get_bits(&s->gb, 6);
-            level = get_sbits(&s->gb, 8);
-        } else if (code == 0) {
+            run   = SHOW_UBITS(re, &s->gb, 6) + 1;
+            SKIP_CACHE(re, &s->gb, 6);
+            level = SHOW_SBITS(re, &s->gb, 8);
+            SKIP_COUNTER(re, &s->gb, 6 + 8);
+        } else if (level == 0) {
             break;
         } else {
-            run   = rl->table_run[code];
-            level = rl->table_level[code];
-            if (get_bits1(&s->gb))
+            if (SHOW_UBITS(re, &s->gb, 1))
                 level = -level;
+            SKIP_COUNTER(re, &s->gb, 1);
         }
         i += run;
         if (i >= 64) {
+            CLOSE_READER(re, &s->gb);
             av_log(s->avctx, AV_LOG_ERROR, "run overflow at %dx%d\n",
                    s->mb_x, s->mb_y);
             return -1;
         }
         j        = scan_table[i];
         block[j] = level;
-        i++;
     }
-    s->block_last_index[n] = i - 1;
+    CLOSE_READER(re, &s->gb);
+    }
+    s->block_last_index[n] = i;
     return 0;
 }
 
@@ -379,11 +392,12 @@ static int h261_decode_mb(H261Context *h)
 
     // Read mtype
     h->mtype = get_vlc2(&s->gb, h261_mtype_vlc.table, H261_MTYPE_VLC_BITS, 2);
-    if (h->mtype < 0 || h->mtype >= FF_ARRAY_ELEMS(ff_h261_mtype_map)) {
+    if (h->mtype < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Invalid mtype index %d\n",
                h->mtype);
         return SLICE_ERROR;
     }
+    av_assert0(h->mtype < FF_ARRAY_ELEMS(ff_h261_mtype_map));
     h->mtype = ff_h261_mtype_map[h->mtype];
 
     // Read mquant
@@ -431,6 +445,13 @@ static int h261_decode_mb(H261Context *h)
     s->mv[0][0][0]                 = h->current_mv_x * 2; // gets divided by 2 in motion compensation
     s->mv[0][0][1]                 = h->current_mv_y * 2;
 
+    if (s->current_picture.motion_val[0]) {
+        int b_stride = 2*s->mb_width + 1;
+        int b_xy     = 2 * s->mb_x + (2 * s->mb_y) * b_stride;
+        s->current_picture.motion_val[0][b_xy][0] = s->mv[0][0][0];
+        s->current_picture.motion_val[0][b_xy][1] = s->mv[0][0][1];
+    }
+
 intra:
     /* decode each block */
     if (s->mb_intra || HAS_CBP(h->mtype)) {
@@ -506,8 +527,8 @@ static int h261_decode_picture_header(H261Context *h)
     skip_bits1(&s->gb); /* Reserved */
 
     /* PEI */
-    while (get_bits1(&s->gb) != 0)
-        skip_bits(&s->gb, 8);
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     /* H.261 has no I-frames, but if we pass AV_PICTURE_TYPE_I for the first
      * frame, the codec crashes if it does not contain all I-blocks
@@ -634,12 +655,12 @@ retry:
     }
     ff_mpv_frame_end(s);
 
-    assert(s->current_picture.f->pict_type == s->current_picture_ptr->f->pict_type);
-    assert(s->current_picture.f->pict_type == s->pict_type);
+    av_assert0(s->current_picture.f->pict_type == s->current_picture_ptr->f->pict_type);
+    av_assert0(s->current_picture.f->pict_type == s->pict_type);
 
     if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
         return ret;
-    ff_print_debug_info(s, s->current_picture_ptr);
+    ff_print_debug_info(s, s->current_picture_ptr, pict);
 
     *got_frame = 1;
 
@@ -665,4 +686,5 @@ AVCodec ff_h261_decoder = {
     .close          = h261_decode_end,
     .decode         = h261_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
 };
diff --git a/libavcodec/h261enc.c b/libavcodec/h261enc.c
index 94e8cec2a2..24ef577aa6 100644
--- a/libavcodec/h261enc.c
+++ b/libavcodec/h261enc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "mpegutils.h"
 #include "mpegvideo.h"
@@ -33,6 +34,9 @@
 #include "h261.h"
 #include "mpegvideodata.h"
 
+static uint8_t uni_h261_rl_len [64*64*2*2];
+#define UNI_ENC_INDEX(last,run,level) ((last)*128*64 + (run)*128 + (level))
+
 int ff_h261_get_picture_format(int width, int height)
 {
     // QCIF
@@ -43,7 +47,7 @@ int ff_h261_get_picture_format(int width, int height)
         return 1;
     // ERROR
     else
-        return -1;
+        return AVERROR(EINVAL);
 }
 
 void ff_h261_encode_picture_header(MpegEncContext *s, int picture_number)
@@ -58,8 +62,8 @@ void ff_h261_encode_picture_header(MpegEncContext *s, int picture_number)
 
     put_bits(&s->pb, 20, 0x10); /* PSC */
 
-    temp_ref = s->picture_number * (int64_t)30000 * s->avctx->time_base.num /
-               (1001 * (int64_t)s->avctx->time_base.den);   // FIXME maybe this should use a timestamp
+    temp_ref = s->picture_number * 30000LL * s->avctx->time_base.num /
+               (1001LL * s->avctx->time_base.den);   // FIXME maybe this should use a timestamp
     put_sbits(&s->pb, 5, temp_ref); /* TemporalReference */
 
     put_bits(&s->pb, 1, 0); /* split screen off */
@@ -78,7 +82,7 @@ void ff_h261_encode_picture_header(MpegEncContext *s, int picture_number)
         h->gob_number = -1;
     else
         h->gob_number = 0;
-    h->current_mba = 0;
+    s->mb_skip_run = 0;
 }
 
 /**
@@ -96,18 +100,21 @@ static void h261_encode_gob_header(MpegEncContext *s, int mb_line)
     put_bits(&s->pb, 4, h->gob_number); /* GN */
     put_bits(&s->pb, 5, s->qscale);     /* GQUANT */
     put_bits(&s->pb, 1, 0);             /* no GEI */
-    h->current_mba  = 0;
-    h->previous_mba = 0;
-    h->current_mv_x = 0;
-    h->current_mv_y = 0;
+    s->mb_skip_run = 0;
+    s->last_mv[0][0][0] = 0;
+    s->last_mv[0][0][1] = 0;
 }
 
 void ff_h261_reorder_mb_index(MpegEncContext *s)
 {
     int index = s->mb_x + s->mb_y * s->mb_width;
 
-    if (index % 33 == 0)
-        h261_encode_gob_header(s, 0);
+    if (index % 11 == 0) {
+        if (index % 33 == 0)
+            h261_encode_gob_header(s, 0);
+        s->last_mv[0][0][0] = 0;
+        s->last_mv[0][0][1] = 0;
+    }
 
     /* for CIF the GOB's are fragmented in the middle of a scanline
      * that's why we need to adjust the x and y index of the macroblocks */
@@ -214,8 +221,8 @@ static void h261_encode_block(H261Context *h, int16_t *block, int n)
             put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
             if (code == rl->n) {
                 put_bits(&s->pb, 6, run);
-                assert(slevel != 0);
-                assert(level <= 127);
+                av_assert1(slevel != 0);
+                av_assert1(level <= 127);
                 put_sbits(&s->pb, 8, slevel);
             } else {
                 put_bits(&s->pb, 1, sign);
@@ -235,7 +242,6 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
     cbp = 63; // avoid warning
     mvd = 0;
 
-    h->current_mba++;
     h->mtype = 0;
 
     if (!s->mb_intra) {
@@ -245,19 +251,22 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
         /* mvd indicates if this block is motion compensated */
         mvd = motion_x | motion_y;
 
-        if ((cbp | mvd | s->dquant) == 0) {
+        if ((cbp | mvd) == 0) {
             /* skip macroblock */
             s->skip_count++;
-            h->current_mv_x = 0;
-            h->current_mv_y = 0;
+            s->mb_skip_run++;
+            s->last_mv[0][0][0] = 0;
+            s->last_mv[0][0][1] = 0;
+            s->qscale -= s->dquant;
             return;
         }
     }
 
     /* MB is not skipped, encode MBA */
     put_bits(&s->pb,
-             ff_h261_mba_bits[(h->current_mba - h->previous_mba) - 1],
-             ff_h261_mba_code[(h->current_mba - h->previous_mba) - 1]);
+             ff_h261_mba_bits[s->mb_skip_run],
+             ff_h261_mba_code[s->mb_skip_run]);
+    s->mb_skip_run = 0;
 
     /* calculate MTYPE */
     if (!s->mb_intra) {
@@ -267,13 +276,15 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
             h->mtype += 3;
         if (s->loop_filter)
             h->mtype += 3;
-        if (cbp || s->dquant)
+        if (cbp)
             h->mtype++;
-        assert(h->mtype > 1);
+        av_assert1(h->mtype > 1);
     }
 
-    if (s->dquant)
+    if (s->dquant && cbp) {
         h->mtype++;
+    } else
+        s->qscale -= s->dquant;
 
     put_bits(&s->pb,
              ff_h261_mtype_bits[h->mtype],
@@ -287,18 +298,16 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
     }
 
     if (IS_16X16(h->mtype)) {
-        mv_diff_x       = (motion_x >> 1) - h->current_mv_x;
-        mv_diff_y       = (motion_y >> 1) - h->current_mv_y;
-        h->current_mv_x = (motion_x >> 1);
-        h->current_mv_y = (motion_y >> 1);
+        mv_diff_x       = (motion_x >> 1) - s->last_mv[0][0][0];
+        mv_diff_y       = (motion_y >> 1) - s->last_mv[0][0][1];
+        s->last_mv[0][0][0] = (motion_x >> 1);
+        s->last_mv[0][0][1] = (motion_y >> 1);
         h261_encode_motion(h, mv_diff_x);
         h261_encode_motion(h, mv_diff_y);
     }
 
-    h->previous_mba = h->current_mba;
-
     if (HAS_CBP(h->mtype)) {
-        assert(cbp > 0);
+        av_assert1(cbp > 0);
         put_bits(&s->pb,
                  ff_h261_cbp_tab[cbp - 1][1],
                  ff_h261_cbp_tab[cbp - 1][0]);
@@ -307,10 +316,49 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
         /* encode each block */
         h261_encode_block(h, block[i], i);
 
-    if ((h->current_mba == 11) || (h->current_mba == 22) ||
-        (h->current_mba == 33) || (!IS_16X16(h->mtype))) {
-        h->current_mv_x = 0;
-        h->current_mv_y = 0;
+    if (!IS_16X16(h->mtype)) {
+        s->last_mv[0][0][0] = 0;
+        s->last_mv[0][0][1] = 0;
+    }
+}
+
+static av_cold void init_uni_h261_rl_tab(RLTable *rl, uint32_t *bits_tab,
+                                         uint8_t *len_tab)
+{
+    int slevel, run, last;
+
+    av_assert0(MAX_LEVEL >= 64);
+    av_assert0(MAX_RUN   >= 63);
+
+    for(slevel=-64; slevel<64; slevel++){
+        if(slevel==0) continue;
+        for(run=0; run<64; run++){
+            for(last=0; last<=1; last++){
+                const int index= UNI_ENC_INDEX(last, run, slevel+64);
+                int level= slevel < 0 ? -slevel : slevel;
+                int len, code;
+
+                len_tab[index]= 100;
+
+                /* ESC0 */
+                code= get_rl_index(rl, 0, run, level);
+                len=  rl->table_vlc[code][1] + 1;
+                if(last)
+                    len += 2;
+
+                if(code!=rl->n && len < len_tab[index]){
+                    len_tab [index]= len;
+                }
+                /* ESC */
+                len = rl->table_vlc[rl->n][1];
+                if(last)
+                    len += 2;
+
+                if(len < len_tab[index]){
+                    len_tab [index]= len;
+                }
+            }
+        }
     }
 }
 
@@ -322,6 +370,12 @@ av_cold void ff_h261_encode_init(MpegEncContext *s)
     s->max_qcoeff       = 127;
     s->y_dc_scale_table =
     s->c_dc_scale_table = ff_mpeg1_dc_scale_table;
+    s->ac_esc_length    = 6+6+8;
+
+    init_uni_h261_rl_tab(&ff_h261_rl_tcoeff, NULL, uni_h261_rl_len);
+
+    s->intra_ac_vlc_length      = s->inter_ac_vlc_length      = uni_h261_rl_len;
+    s->intra_ac_vlc_last_length = s->inter_ac_vlc_last_length = uni_h261_rl_len + 128*64;
 }
 
 static const AVClass h261_class = {
diff --git a/libavcodec/h263.c b/libavcodec/h263.c
index b6a68d017d..800387d1c7 100644
--- a/libavcodec/h263.c
+++ b/libavcodec/h263.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2001 Juan J. Sierralta P
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h263.h b/libavcodec/h263.h
index e97718b85b..3c3f1698ff 100644
--- a/libavcodec/h263.h
+++ b/libavcodec/h263.h
@@ -1,20 +1,20 @@
 /*
  * H263 internal header
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #ifndef AVCODEC_H263_H
@@ -95,11 +95,10 @@ int av_const h263_get_picture_format(int width, int height);
 
 void ff_clean_h263_qscales(MpegEncContext *s);
 int ff_h263_resync(MpegEncContext *s);
-const uint8_t *ff_h263_find_resync_marker(const uint8_t *p, const uint8_t *end);
-void ff_h263_encode_motion(MpegEncContext * s, int val, int f_code);
+void ff_h263_encode_motion(PutBitContext *pb, int val, int f_code);
 
 
-static inline int h263_get_motion_length(MpegEncContext * s, int val, int f_code){
+static inline int h263_get_motion_length(int val, int f_code){
     int l, bit_size, code;
 
     if (val == 0) {
@@ -119,11 +118,11 @@ static inline int h263_get_motion_length(MpegEncContext * s, int val, int f_code
 static inline void ff_h263_encode_motion_vector(MpegEncContext * s, int x, int y, int f_code){
     if (s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT) {
         skip_put_bits(&s->pb,
-            h263_get_motion_length(s, x, f_code)
-           +h263_get_motion_length(s, y, f_code));
+            h263_get_motion_length(x, f_code)
+           +h263_get_motion_length(y, f_code));
     }else{
-        ff_h263_encode_motion(s, x, f_code);
-        ff_h263_encode_motion(s, y, f_code);
+        ff_h263_encode_motion(&s->pb, x, f_code);
+        ff_h263_encode_motion(&s->pb, y, f_code);
     }
 }
 
diff --git a/libavcodec/h263_parser.c b/libavcodec/h263_parser.c
index 71e047a77e..2e7d4930c3 100644
--- a/libavcodec/h263_parser.c
+++ b/libavcodec/h263_parser.c
@@ -2,20 +2,20 @@
  * H.263 parser
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,12 +70,16 @@ static int h263_parse(AVCodecParserContext *s,
     ParseContext *pc = s->priv_data;
     int next;
 
-    next= ff_h263_find_frame_end(pc, buf, buf_size);
+    if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
+        next = buf_size;
+    } else {
+        next= ff_h263_find_frame_end(pc, buf, buf_size);
 
-    if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
-        *poutbuf = NULL;
-        *poutbuf_size = 0;
-        return buf_size;
+        if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
+            *poutbuf = NULL;
+            *poutbuf_size = 0;
+            return buf_size;
+        }
     }
 
     *poutbuf = buf;
diff --git a/libavcodec/h263_parser.h b/libavcodec/h263_parser.h
index 5bd715f49d..565a222bc1 100644
--- a/libavcodec/h263_parser.h
+++ b/libavcodec/h263_parser.h
@@ -2,20 +2,20 @@
  * H.263 parser
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h263data.c b/libavcodec/h263data.c
index ae322e03c0..ceda80fbc3 100644
--- a/libavcodec/h263data.c
+++ b/libavcodec/h263data.c
@@ -1,20 +1,20 @@
 /*
  * H263+ tables
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -264,11 +264,11 @@ const uint8_t ff_h263_chroma_qscale_table[32] = {
     0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15
 };
 
-uint16_t ff_mba_max[6] = {
+const uint16_t ff_mba_max[6] = {
     47, 98, 395, 1583, 6335, 9215
 };
 
-uint8_t ff_mba_length[7] = {
+const uint8_t ff_mba_length[7] = {
     6, 7, 9, 11, 13, 14, 14
 };
 
diff --git a/libavcodec/h263data.h b/libavcodec/h263data.h
index c081bda1e8..0ad2ef46b9 100644
--- a/libavcodec/h263data.h
+++ b/libavcodec/h263data.h
@@ -4,20 +4,20 @@
  * copyright (c) 2001 Juan J. Sierralta P
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -71,7 +71,7 @@ extern const uint8_t ff_modified_quant_tab[2][32];
 
 extern const uint8_t ff_h263_chroma_qscale_table[32];
 
-extern uint16_t ff_mba_max[6];
-extern uint8_t ff_mba_length[7];
+extern const uint16_t ff_mba_max[6];
+extern const uint8_t ff_mba_length[7];
 
 #endif /* AVCODEC_H263DATA_H */
diff --git a/libavcodec/h263dec.c b/libavcodec/h263dec.c
index 3a0f4f76c7..c85ea9d6cd 100644
--- a/libavcodec/h263dec.c
+++ b/libavcodec/h263dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,8 @@
  * H.263 decoder.
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "libavutil/cpu.h"
 #include "avcodec.h"
 #include "error_resilience.h"
@@ -39,6 +41,7 @@
 #include "mpegvideo.h"
 #include "msmpeg4.h"
 #include "qpeldsp.h"
+#include "vdpau_compat.h"
 #include "thread.h"
 #include "wmv2.h"
 
@@ -47,6 +50,12 @@ static enum AVPixelFormat h263_get_format(AVCodecContext *avctx)
     if (avctx->codec->id == AV_CODEC_ID_MSS2)
         return AV_PIX_FMT_YUV420P;
 
+    if (CONFIG_GRAY && (avctx->flags & AV_CODEC_FLAG_GRAY)) {
+        if (avctx->color_range == AVCOL_RANGE_UNSPECIFIED)
+            avctx->color_range = AVCOL_RANGE_MPEG;
+        return AV_PIX_FMT_GRAY8;
+    }
+
     return avctx->pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
 }
 
@@ -55,14 +64,12 @@ av_cold int ff_h263_decode_init(AVCodecContext *avctx)
     MpegEncContext *s = avctx->priv_data;
     int ret;
 
-    s->avctx           = avctx;
     s->out_format      = FMT_H263;
-    s->width           = avctx->coded_width;
-    s->height          = avctx->coded_height;
-    s->workaround_bugs = avctx->workaround_bugs;
 
     // set defaults
     ff_mpv_decode_defaults(s);
+    ff_mpv_decode_init(s, avctx);
+
     s->quant_precision = 5;
     s->decode_mb       = ff_h263_decode_mb;
     s->low_delay       = 1;
@@ -71,6 +78,7 @@ av_cold int ff_h263_decode_init(AVCodecContext *avctx)
     /* select sub codec */
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H263:
+    case AV_CODEC_ID_H263P:
         s->unrestricted_mv = 0;
         avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
         break;
@@ -117,8 +125,13 @@ av_cold int ff_h263_decode_init(AVCodecContext *avctx)
     }
     s->codec_id    = avctx->codec->id;
 
+    if (avctx->codec_tag == AV_RL32("L263") || avctx->codec_tag == AV_RL32("S263"))
+        if (avctx->extradata_size == 56 && avctx->extradata[0] == 1)
+            s->ehc_mode = 1;
+
     /* for h263, we allocate the images after having read the header */
     if (avctx->codec->id != AV_CODEC_ID_H263 &&
+        avctx->codec->id != AV_CODEC_ID_H263P &&
         avctx->codec->id != AV_CODEC_ID_MPEG4) {
         avctx->pix_fmt = h263_get_format(avctx);
         ff_mpv_idct_init(s);
@@ -174,7 +187,7 @@ static int decode_slice(MpegEncContext *s)
 {
     const int part_mask = s->partitioned_frame
                           ? (ER_AC_END | ER_AC_ERROR) : 0x7F;
-    const int mb_size = 16;
+    const int mb_size   = 16 >> s->avctx->lowres;
     int ret;
 
     s->last_resync_gb   = s->gb;
@@ -186,10 +199,10 @@ static int decode_slice(MpegEncContext *s)
 
     if (s->avctx->hwaccel) {
         const uint8_t *start = s->gb.buffer + get_bits_count(&s->gb) / 8;
-        const uint8_t *end   = ff_h263_find_resync_marker(start + 1,
-                                                          s->gb.buffer_end);
-        skip_bits_long(&s->gb, 8 * (end - start));
-        return s->avctx->hwaccel->decode_slice(s->avctx, start, end - start);
+        ret = s->avctx->hwaccel->decode_slice(s->avctx, start, s->gb.buffer_end - start);
+        // ensure we exit decode loop
+        s->mb_y = s->mb_height;
+        return ret;
     }
 
     if (s->partitioned_frame) {
@@ -238,6 +251,8 @@ static int decode_slice(MpegEncContext *s)
             s->mv_type = MV_TYPE_16X16;
             ff_dlog(s, "%d %d %06X\n",
                     ret, get_bits_count(&s->gb), show_bits(&s->gb, 24));
+
+            ff_tlog(NULL, "Decoding MB at %dx%d\n", s->mb_x, s->mb_y);
             ret = s->decode_mb(s, s->block);
 
             if (s->pict_type != AV_PICTURE_TYPE_B)
@@ -274,6 +289,8 @@ static int decode_slice(MpegEncContext *s)
                 ff_er_add_slice(&s->er, s->resync_mb_x, s->resync_mb_y,
                                 s->mb_x, s->mb_y, ER_MB_ERROR & part_mask);
 
+                if (s->avctx->err_recognition & AV_EF_IGNORE_ERR)
+                    continue;
                 return AVERROR_INVALIDDATA;
             }
 
@@ -288,7 +305,7 @@ static int decode_slice(MpegEncContext *s)
         s->mb_x = 0;
     }
 
-    assert(s->mb_x == 0 && s->mb_y == s->mb_height);
+    av_assert1(s->mb_x == 0 && s->mb_y == s->mb_height);
 
     if (s->codec_id == AV_CODEC_ID_MPEG4         &&
         (s->workaround_bugs & FF_BUG_AUTODETECT) &&
@@ -301,7 +318,7 @@ static int decode_slice(MpegEncContext *s)
     if (s->codec_id == AV_CODEC_ID_MPEG4         &&
         (s->workaround_bugs & FF_BUG_AUTODETECT) &&
         get_bits_left(&s->gb) >= 0               &&
-        get_bits_left(&s->gb) < 48               &&
+        get_bits_left(&s->gb) < 137              &&
         !s->data_partitioning) {
         const int bits_count = get_bits_count(&s->gb);
         const int bits_left  = s->gb.size_in_bits - bits_count;
@@ -322,8 +339,27 @@ static int decode_slice(MpegEncContext *s)
         }
     }
 
+    if (s->codec_id == AV_CODEC_ID_H263          &&
+        (s->workaround_bugs & FF_BUG_AUTODETECT) &&
+        get_bits_left(&s->gb) >= 8               &&
+        get_bits_left(&s->gb) < 300              &&
+        s->pict_type == AV_PICTURE_TYPE_I        &&
+        show_bits(&s->gb, 8) == 0                &&
+        !s->data_partitioning) {
+
+        s->padding_bug_score += 32;
+    }
+
+    if (s->codec_id == AV_CODEC_ID_H263          &&
+        (s->workaround_bugs & FF_BUG_AUTODETECT) &&
+        get_bits_left(&s->gb) >= 64              &&
+        AV_RB64(s->gb.buffer_end - 8) == 0xCDCDCDCDFC7F0000) {
+
+        s->padding_bug_score += 32;
+    }
+
     if (s->workaround_bugs & FF_BUG_AUTODETECT) {
-        if (s->codec_id == AV_CODEC_ID_H263 ||
+        if (
             (s->padding_bug_score > -2 && !s->data_partitioning))
             s->workaround_bugs |= FF_BUG_NO_PADDING;
         else
@@ -342,7 +378,7 @@ static int decode_slice(MpegEncContext *s)
         /* buggy padding but the frame should still end approximately at
          * the bitstream end */
         if ((s->workaround_bugs & FF_BUG_NO_PADDING) &&
-            (s->avctx->err_recognition & AV_EF_BUFFER))
+            (s->avctx->err_recognition & (AV_EF_BUFFER|AV_EF_AGGRESSIVE)))
             max_extra += 48;
         else if ((s->workaround_bugs & FF_BUG_NO_PADDING))
             max_extra += 256 * 256 * 256 * 64;
@@ -377,6 +413,7 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int buf_size       = avpkt->size;
     MpegEncContext *s  = avctx->priv_data;
     int ret;
+    int slice_ret = 0;
     AVFrame *pict = data;
 
     /* no supplementary picture */
@@ -400,6 +437,8 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             next = ff_mpeg4_find_frame_end(&s->parse_context, buf, buf_size);
         } else if (CONFIG_H263_DECODER && s->codec_id == AV_CODEC_ID_H263) {
             next = ff_h263_find_frame_end(&s->parse_context, buf, buf_size);
+        } else if (CONFIG_H263P_DECODER && s->codec_id == AV_CODEC_ID_H263P) {
+            next = ff_h263_find_frame_end(&s->parse_context, buf, buf_size);
         } else {
             av_log(s->avctx, AV_LOG_ERROR,
                    "this codec does not support truncated bitstreams\n");
@@ -411,13 +450,27 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return buf_size;
     }
 
-    if (s->bitstream_buffer_size && (s->divx_packed || buf_size < 20)) // divx 5.01+/xvid frame reorder
+retry:
+    if (s->divx_packed && s->bitstream_buffer_size) {
+        int i;
+        for(i=0; i < buf_size-3; i++) {
+            if (buf[i]==0 && buf[i+1]==0 && buf[i+2]==1) {
+                if (buf[i+3]==0xB0) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Discarding excessive bitstream in packed xvid\n");
+                    s->bitstream_buffer_size = 0;
+                }
+                break;
+            }
+        }
+    }
+
+    if (s->bitstream_buffer_size && (s->divx_packed || buf_size <= MAX_NVOP_SIZE)) // divx 5.01+/xvid frame reorder
         ret = init_get_bits8(&s->gb, s->bitstream_buffer,
                              s->bitstream_buffer_size);
     else
         ret = init_get_bits8(&s->gb, buf, buf_size);
-    s->bitstream_buffer_size = 0;
 
+    s->bitstream_buffer_size = 0;
     if (ret < 0)
         return ret;
 
@@ -434,11 +487,8 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         if (s->avctx->extradata_size && s->picture_number == 0) {
             GetBitContext gb;
 
-            ret = init_get_bits8(&gb, s->avctx->extradata,
-                                 s->avctx->extradata_size);
-            if (ret < 0)
-                return ret;
-            ff_mpeg4_decode_picture_header(avctx->priv_data, &gb);
+            if (init_get_bits8(&gb, s->avctx->extradata, s->avctx->extradata_size) >= 0 )
+                ff_mpeg4_decode_picture_header(avctx->priv_data, &gb);
         }
         ret = ff_mpeg4_decode_picture_header(avctx->priv_data, &s->gb);
     } else if (CONFIG_H263I_DECODER && s->codec_id == AV_CODEC_ID_H263I) {
@@ -449,6 +499,14 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         ret = ff_h263_decode_picture_header(s);
     }
 
+    if (ret < 0 || ret == FRAME_SKIPPED) {
+        if (   s->width  != avctx->coded_width
+            || s->height != avctx->coded_height) {
+                av_log(s->avctx, AV_LOG_WARNING, "Reverting picture dimensions change due to header decoding failure\n");
+                s->width = avctx->coded_width;
+                s->height= avctx->coded_height;
+        }
+    }
     if (ret == FRAME_SKIPPED)
         return get_consumed_bytes(s, buf_size);
 
@@ -473,25 +531,9 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     avctx->has_b_frames = !s->low_delay;
 
-#define SET_QPEL_FUNC(postfix1, postfix2)                           \
-    s->qdsp.put_        ## postfix1 = ff_put_        ## postfix2;   \
-    s->qdsp.put_no_rnd_ ## postfix1 = ff_put_no_rnd_ ## postfix2;   \
-    s->qdsp.avg_        ## postfix1 = ff_avg_        ## postfix2;
-
-    if (s->workaround_bugs & FF_BUG_STD_QPEL) {
-        SET_QPEL_FUNC(qpel_pixels_tab[0][5], qpel16_mc11_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][7], qpel16_mc31_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][9], qpel16_mc12_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_old_c)
-
-        SET_QPEL_FUNC(qpel_pixels_tab[1][5], qpel8_mc11_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][7], qpel8_mc31_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][9], qpel8_mc12_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_old_c)
+    if (CONFIG_MPEG4_DECODER && avctx->codec_id == AV_CODEC_ID_MPEG4) {
+        if (ff_mpeg4_workaround_bugs(avctx) == 1)
+            goto retry;
     }
 
     /* After H263 & mpeg4 header decode we have the height, width,
@@ -561,6 +603,13 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (!s->divx_packed && !avctx->hwaccel)
         ff_thread_finish_setup(avctx);
 
+#if FF_API_CAP_VDPAU
+    if (CONFIG_MPEG4_VDPAU_DECODER && (s->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU)) {
+        ff_vdpau_mpeg4_decode_picture(avctx->priv_data, s->gb.buffer, s->gb.buffer_end - s->gb.buffer);
+        goto frame_end;
+    }
+#endif
+
     if (avctx->hwaccel) {
         ret = avctx->hwaccel->start_frame(avctx, s->gb.buffer,
                                           s->gb.buffer_end - s->gb.buffer);
@@ -578,14 +627,14 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         if (ret < 0)
             return ret;
         if (ret == 1)
-            goto intrax8_decoded;
+            goto frame_end;
     }
 
     /* decode each macroblock */
     s->mb_x = 0;
     s->mb_y = 0;
 
-    ret = decode_slice(s);
+    slice_ret = decode_slice(s);
     while (s->mb_y < s->mb_height) {
         if (s->msmpeg4_version) {
             if (s->slice_height == 0 || s->mb_x != 0 ||
@@ -603,7 +652,7 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             ff_mpeg4_clean_buffers(s);
 
         if (decode_slice(s) < 0)
-            ret = AVERROR_INVALIDDATA;
+            slice_ret = AVERROR_INVALIDDATA;
     }
 
     if (s->msmpeg4_version && s->msmpeg4_version < 4 &&
@@ -612,12 +661,8 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             ff_msmpeg4_decode_ext_header(s, buf_size) < 0)
             s->er.error_status_table[s->mb_num - 1] = ER_MB_ERROR;
 
-    assert(s->bitstream_buffer_size == 0);
-
-    if (CONFIG_MPEG4_DECODER && avctx->codec_id == AV_CODEC_ID_MPEG4)
-        ff_mpeg4_frame_end(avctx, buf, buf_size);
-
-intrax8_decoded:
+    av_assert1(s->bitstream_buffer_size == 0);
+frame_end:
     ff_er_frame_end(&s->er);
 
     if (avctx->hwaccel) {
@@ -628,26 +673,46 @@ intrax8_decoded:
 
     ff_mpv_frame_end(s);
 
+    if (CONFIG_MPEG4_DECODER && avctx->codec_id == AV_CODEC_ID_MPEG4)
+        ff_mpeg4_frame_end(avctx, buf, buf_size);
+
     if (!s->divx_packed && avctx->hwaccel)
         ff_thread_finish_setup(avctx);
 
-    assert(s->current_picture.f->pict_type ==
-           s->current_picture_ptr->f->pict_type);
-    assert(s->current_picture.f->pict_type == s->pict_type);
+    av_assert1(s->current_picture.f->pict_type == s->current_picture_ptr->f->pict_type);
+    av_assert1(s->current_picture.f->pict_type == s->pict_type);
     if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
         if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
             return ret;
-        ff_print_debug_info(s, s->current_picture_ptr);
+        ff_print_debug_info(s, s->current_picture_ptr, pict);
+        ff_mpv_export_qp_table(s, pict, s->current_picture_ptr, FF_QSCALE_TYPE_MPEG1);
     } else if (s->last_picture_ptr) {
         if ((ret = av_frame_ref(pict, s->last_picture_ptr->f)) < 0)
             return ret;
-        ff_print_debug_info(s, s->last_picture_ptr);
+        ff_print_debug_info(s, s->last_picture_ptr, pict);
+        ff_mpv_export_qp_table(s, pict, s->last_picture_ptr, FF_QSCALE_TYPE_MPEG1);
     }
 
-    if (s->last_picture_ptr || s->low_delay)
+    if (s->last_picture_ptr || s->low_delay) {
+        if (   pict->format == AV_PIX_FMT_YUV420P
+            && (s->codec_tag == AV_RL32("GEOV") || s->codec_tag == AV_RL32("GEOX"))) {
+            int x, y, p;
+            av_frame_make_writable(pict);
+            for (p=0; p<3; p++) {
+                int w = FF_CEIL_RSHIFT(pict-> width, !!p);
+                int h = FF_CEIL_RSHIFT(pict->height, !!p);
+                int linesize = pict->linesize[p];
+                for (y=0; y<(h>>1); y++)
+                    for (x=0; x<w; x++)
+                        FFSWAP(int,
+                               pict->data[p][x + y*linesize],
+                               pict->data[p][x + (h-1-y)*linesize]);
+            }
+        }
         *got_frame = 1;
+    }
 
-    if (ret && (avctx->err_recognition & AV_EF_EXPLODE))
+    if (slice_ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
         return ret;
     else
         return get_consumed_bytes(s, buf_size);
@@ -655,11 +720,14 @@ intrax8_decoded:
 
 const enum AVPixelFormat ff_h263_hwaccel_pixfmt_list_420[] = {
 #if CONFIG_H263_VAAPI_HWACCEL || CONFIG_MPEG4_VAAPI_HWACCEL
-    AV_PIX_FMT_VAAPI_VLD,
+    AV_PIX_FMT_VAAPI,
 #endif
 #if CONFIG_H263_VDPAU_HWACCEL || CONFIG_MPEG4_VDPAU_HWACCEL
     AV_PIX_FMT_VDPAU,
 #endif
+#if CONFIG_H263_VIDEOTOOLBOX_HWACCEL || CONFIG_MPEG4_VIDEOTOOLBOX_HWACCEL
+    AV_PIX_FMT_VIDEOTOOLBOX,
+#endif
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_NONE
 };
@@ -676,5 +744,22 @@ AVCodec ff_h263_decoder = {
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 |
                       AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY,
     .flush          = ff_mpeg_flush,
+    .max_lowres     = 3,
+    .pix_fmts       = ff_h263_hwaccel_pixfmt_list_420,
+};
+
+AVCodec ff_h263p_decoder = {
+    .name           = "h263p",
+    .long_name      = NULL_IF_CONFIG_SMALL("H.263 / H.263-1996, H.263+ / H.263-1998 / H.263 version 2"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H263P,
+    .priv_data_size = sizeof(MpegEncContext),
+    .init           = ff_h263_decode_init,
+    .close          = ff_h263_decode_end,
+    .decode         = ff_h263_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 |
+                      AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY,
+    .flush          = ff_mpeg_flush,
+    .max_lowres     = 3,
     .pix_fmts       = ff_h263_hwaccel_pixfmt_list_420,
 };
diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c
index 70ecdb9a89..b3c0bcd450 100644
--- a/libavcodec/h263dsp.c
+++ b/libavcodec/h263dsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -121,4 +121,6 @@ av_cold void ff_h263dsp_init(H263DSPContext *ctx)
 
     if (ARCH_X86)
         ff_h263dsp_init_x86(ctx);
+    if (ARCH_MIPS)
+        ff_h263dsp_init_mips(ctx);
 }
diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h
index 40f041cd0d..1abea3ca8c 100644
--- a/libavcodec/h263dsp.h
+++ b/libavcodec/h263dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,5 +30,6 @@ typedef struct H263DSPContext {
 
 void ff_h263dsp_init(H263DSPContext *ctx);
 void ff_h263dsp_init_x86(H263DSPContext *ctx);
+void ff_h263dsp_init_mips(H263DSPContext *ctx);
 
 #endif /* AVCODEC_H263DSP_H */
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index d4cb0300c1..b662e562f5 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,8 @@
  * @author Michael Niedermayer <michaelni@gmx.at>
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "libavutil/avassert.h"
 #include "libavutil/display.h"
 #include "libavutil/imgutils.h"
@@ -47,11 +49,16 @@
 #include "rectangle.h"
 #include "svq3.h"
 #include "thread.h"
-
-#include <assert.h>
+#include "vdpau_compat.h"
 
 const uint16_t ff_h264_mb_sizes[4] = { 256, 384, 512, 768 };
 
+int avpriv_h264_has_num_reorder_frames(AVCodecContext *avctx)
+{
+    H264Context *h = avctx->priv_data;
+    return h ? h->sps.num_reorder_frames : 0;
+}
+
 static void h264_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type,
                               int (*mv)[2][4][2],
                               int mb_x, int mb_y, int mb_intra, int mb_skipped)
@@ -63,19 +70,28 @@ static void h264_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type,
     sl->mb_y = mb_y;
     sl->mb_xy = mb_x + mb_y * h->mb_stride;
     memset(sl->non_zero_count_cache, 0, sizeof(sl->non_zero_count_cache));
-    assert(ref >= 0);
+    av_assert1(ref >= 0);
     /* FIXME: It is possible albeit uncommon that slice references
      * differ between slices. We take the easy approach and ignore
      * it for now. If this turns out to have any relevance in
      * practice then correct remapping should be added. */
     if (ref >= sl->ref_count[0])
         ref = 0;
+    if (!sl->ref_list[0][ref].data[0]) {
+        av_log(h->avctx, AV_LOG_DEBUG, "Reference not available for error concealing\n");
+        ref = 0;
+    }
+    if ((sl->ref_list[0][ref].reference&3) != 3) {
+        av_log(h->avctx, AV_LOG_DEBUG, "Reference invalid\n");
+        return;
+    }
     fill_rectangle(&h->cur_pic.ref_index[0][4 * sl->mb_xy],
                    2, 2, 2, ref, 1);
     fill_rectangle(&sl->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
     fill_rectangle(sl->mv_cache[0][scan8[0]], 4, 4, 8,
                    pack16to32((*mv)[0][0][0], (*mv)[0][0][1]), 4);
-    assert(!FRAME_MBAFF(h));
+    sl->mb_mbaff =
+    sl->mb_field_decoding_flag = 0;
     ff_h264_hl_decode_mb(h, &h->slice_ctx[0]);
 }
 
@@ -190,18 +206,18 @@ int ff_h264_check_intra_pred_mode(const H264Context *h, H264SliceContext *sl,
 
     if ((sl->left_samples_available & 0x8080) != 0x8080) {
         mode = left[mode];
-        if (is_chroma && (sl->left_samples_available & 0x8080)) {
-            // mad cow disease mode, aka MBAFF + constrained_intra_pred
-            mode = ALZHEIMER_DC_L0T_PRED8x8 +
-                   (!(sl->left_samples_available & 0x8000)) +
-                   2 * (mode == DC_128_PRED8x8);
-        }
         if (mode < 0) {
             av_log(h->avctx, AV_LOG_ERROR,
                    "left block unavailable for requested intra mode at %d %d\n",
                    sl->mb_x, sl->mb_y);
             return AVERROR_INVALIDDATA;
         }
+        if (is_chroma && (sl->left_samples_available & 0x8080)) {
+            // mad cow disease mode, aka MBAFF + constrained_intra_pred
+            mode = ALZHEIMER_DC_L0T_PRED8x8 +
+                   (!(sl->left_samples_available & 0x8000)) +
+                   2 * (mode == DC_128_PRED8x8);
+        }
     }
 
     return mode;
@@ -223,7 +239,7 @@ const uint8_t *ff_h264_decode_nal(H264Context *h, H264SliceContext *sl,
 
 #define STARTCODE_TEST                                                  \
     if (i + 2 < length && src[i + 1] == 0 && src[i + 2] <= 3) {         \
-        if (src[i + 2] != 3) {                                          \
+        if (src[i + 2] != 3 && src[i + 2] != 0) {                       \
             /* startcode, so we must be past the end */                 \
             length = i;                                                 \
         }                                                               \
@@ -268,19 +284,23 @@ const uint8_t *ff_h264_decode_nal(H264Context *h, H264SliceContext *sl,
     }
 #endif
 
-    if (i >= length - 1) { // no escaped 0
-        *dst_length = length;
-        *consumed   = length + 1; // +1 for the header
-        return src;
-    }
-
-    av_fast_malloc(&sl->rbsp_buffer, &sl->rbsp_buffer_size,
-                   length + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&sl->rbsp_buffer, &sl->rbsp_buffer_size, length+MAX_MBPAIR_SIZE);
     dst = sl->rbsp_buffer;
 
     if (!dst)
         return NULL;
 
+    if(i>=length-1){ //no escaped 0
+        *dst_length= length;
+        *consumed= length+1; //+1 for the header
+        if(h->avctx->flags2 & AV_CODEC_FLAG2_FAST){
+            return src;
+        }else{
+            memcpy(dst, src, length);
+            return dst;
+        }
+    }
+
     memcpy(dst, src, i);
     si = di = i;
     while (si + 2 < length) {
@@ -288,7 +308,7 @@ const uint8_t *ff_h264_decode_nal(H264Context *h, H264SliceContext *sl,
         if (src[si + 2] > 3) {
             dst[di++] = src[si++];
             dst[di++] = src[si++];
-        } else if (src[si] == 0 && src[si + 1] == 0) {
+        } else if (src[si] == 0 && src[si + 1] == 0 && src[si + 2] != 0) {
             if (src[si + 2] == 3) { // escape
                 dst[di++]  = 0;
                 dst[di++]  = 0;
@@ -378,11 +398,11 @@ void ff_h264_free_tables(H264Context *h)
 int ff_h264_alloc_tables(H264Context *h)
 {
     const int big_mb_num = h->mb_stride * (h->mb_height + 1);
-    const int row_mb_num = h->mb_stride * 2 * h->avctx->thread_count;
+    const int row_mb_num = 2*h->mb_stride*FFMAX(h->avctx->thread_count, 1);
     int x, y;
 
-    FF_ALLOCZ_OR_GOTO(h->avctx, h->intra4x4_pred_mode,
-                      row_mb_num * 8 * sizeof(uint8_t), fail)
+    FF_ALLOCZ_ARRAY_OR_GOTO(h->avctx, h->intra4x4_pred_mode,
+                      row_mb_num, 8 * sizeof(uint8_t), fail)
     h->slice_ctx[0].intra4x4_pred_mode = h->intra4x4_pred_mode;
 
     FF_ALLOCZ_OR_GOTO(h->avctx, h->non_zero_count,
@@ -393,10 +413,10 @@ int ff_h264_alloc_tables(H264Context *h)
                       big_mb_num * sizeof(uint16_t), fail)
     FF_ALLOCZ_OR_GOTO(h->avctx, h->chroma_pred_mode_table,
                       big_mb_num * sizeof(uint8_t), fail)
-    FF_ALLOCZ_OR_GOTO(h->avctx, h->mvd_table[0],
-                      16 * row_mb_num * sizeof(uint8_t), fail);
-    FF_ALLOCZ_OR_GOTO(h->avctx, h->mvd_table[1],
-                      16 * row_mb_num * sizeof(uint8_t), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(h->avctx, h->mvd_table[0],
+                      row_mb_num, 16 * sizeof(uint8_t), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(h->avctx, h->mvd_table[1],
+                      row_mb_num, 16 * sizeof(uint8_t), fail);
     h->slice_ctx[0].mvd_table[0] = h->mvd_table[0];
     h->slice_ctx[0].mvd_table[1] = h->mvd_table[1];
 
@@ -423,7 +443,7 @@ int ff_h264_alloc_tables(H264Context *h)
         }
 
     if (!h->dequant4_coeff[0])
-        h264_init_dequant_tables(h);
+        ff_h264_init_dequant_tables(h);
 
     return 0;
 
@@ -452,7 +472,11 @@ int ff_h264_slice_context_init(H264Context *h, H264SliceContext *sl)
     sl->ref_cache[1][scan8[7]  + 1] =
     sl->ref_cache[1][scan8[13] + 1] = PART_NOT_AVAILABLE;
 
+    if (sl != h->slice_ctx) {
+        memset(er, 0, sizeof(*er));
+    } else
     if (CONFIG_ERROR_RESILIENCE) {
+
         /* init ER */
         er->avctx          = h->avctx;
         er->decode_mb      = h264_er_decode_mb;
@@ -500,20 +524,23 @@ fail:
 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size,
                             int parse_extradata);
 
-int ff_h264_decode_extradata(H264Context *h)
+int ff_h264_decode_extradata(H264Context *h, const uint8_t *buf, int size)
 {
     AVCodecContext *avctx = h->avctx;
     int ret;
 
-    if (avctx->extradata[0] == 1) {
+    if (!buf || size <= 0)
+        return -1;
+
+    if (buf[0] == 1) {
         int i, cnt, nalsize;
-        unsigned char *p = avctx->extradata;
+        const unsigned char *p = buf;
 
         h->is_avc = 1;
 
-        if (avctx->extradata_size < 7) {
+        if (size < 7) {
             av_log(avctx, AV_LOG_ERROR,
-                   "avcC %d too short\n", avctx->extradata_size);
+                   "avcC %d too short\n", size);
             return AVERROR_INVALIDDATA;
         }
         /* sps and pps in the avcC always have length coded with 2 bytes,
@@ -524,7 +551,7 @@ int ff_h264_decode_extradata(H264Context *h)
         p  += 6;
         for (i = 0; i < cnt; i++) {
             nalsize = AV_RB16(p) + 2;
-            if (p - avctx->extradata + nalsize > avctx->extradata_size)
+            if(nalsize > size - (p-buf))
                 return AVERROR_INVALIDDATA;
             ret = decode_nal_units(h, p, nalsize, 1);
             if (ret < 0) {
@@ -538,7 +565,7 @@ int ff_h264_decode_extradata(H264Context *h)
         cnt = *(p++); // Number of pps
         for (i = 0; i < cnt; i++) {
             nalsize = AV_RB16(p) + 2;
-            if (p - avctx->extradata + nalsize > avctx->extradata_size)
+            if(nalsize > size - (p-buf))
                 return AVERROR_INVALIDDATA;
             ret = decode_nal_units(h, p, nalsize, 1);
             if (ret < 0) {
@@ -549,14 +576,14 @@ int ff_h264_decode_extradata(H264Context *h)
             p += nalsize;
         }
         // Store right nal length size that will be used to parse all other nals
-        h->nal_length_size = (avctx->extradata[4] & 0x03) + 1;
+        h->nal_length_size = (buf[4] & 0x03) + 1;
     } else {
         h->is_avc = 0;
-        ret = decode_nal_units(h, avctx->extradata, avctx->extradata_size, 1);
+        ret = decode_nal_units(h, buf, size, 1);
         if (ret < 0)
             return ret;
     }
-    return 0;
+    return size;
 }
 
 static int h264_init_context(AVCodecContext *avctx, H264Context *h)
@@ -564,7 +591,12 @@ static int h264_init_context(AVCodecContext *avctx, H264Context *h)
     int i;
 
     h->avctx                 = avctx;
+    h->backup_width          = -1;
+    h->backup_height         = -1;
+    h->backup_pix_fmt        = AV_PIX_FMT_NONE;
     h->dequant_coeff_pps     = -1;
+    h->current_sps_id        = -1;
+    h->cur_chroma_format_idc = -1;
 
     h->picture_structure     = PICT_FRAME;
     h->slice_context_count   = 1;
@@ -574,6 +606,8 @@ static int h264_init_context(AVCodecContext *avctx, H264Context *h)
     h->x264_build            = -1;
     h->recovery_frame        = -1;
     h->frame_recovered       = 0;
+    h->prev_frame_num        = -1;
+    h->sei_fpa.frame_packing_arrangement_cancel_flag = -1;
 
     h->next_outputed_poc = INT_MIN;
     for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++)
@@ -600,6 +634,10 @@ static int h264_init_context(AVCodecContext *avctx, H264Context *h)
     if (!h->cur_pic.f)
         return AVERROR(ENOMEM);
 
+    h->last_pic_for_ec.f = av_frame_alloc();
+    if (!h->last_pic_for_ec.f)
+        return AVERROR(ENOMEM);
+
     for (i = 0; i < h->nb_slice_ctx; i++)
         h->slice_ctx[i].h264 = h;
 
@@ -624,17 +662,21 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx)
     ff_init_cabac_states();
 
     if (avctx->codec_id == AV_CODEC_ID_H264) {
-        if (avctx->ticks_per_frame == 1)
-            h->avctx->framerate.num *= 2;
+        if (avctx->ticks_per_frame == 1) {
+            if(h->avctx->time_base.den < INT_MAX/2) {
+                h->avctx->time_base.den *= 2;
+            } else
+                h->avctx->time_base.num /= 2;
+        }
         avctx->ticks_per_frame = 2;
     }
 
     if (avctx->extradata_size > 0 && avctx->extradata) {
-       ret = ff_h264_decode_extradata(h);
-       if (ret < 0) {
-           ff_h264_free_context(h);
-           return ret;
-       }
+        ret = ff_h264_decode_extradata(h, avctx->extradata, avctx->extradata_size);
+        if (ret < 0) {
+            ff_h264_free_context(h);
+            return ret;
+        }
     }
 
     if (h->sps.bitstream_restriction_flag &&
@@ -645,15 +687,21 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx)
 
     avctx->internal->allocate_progress = 1;
 
-    if (h->enable_er) {
+    ff_h264_flush_change(h);
+
+    if (h->enable_er < 0 && (avctx->active_thread_type & FF_THREAD_SLICE))
+        h->enable_er = 0;
+
+    if (h->enable_er && (avctx->active_thread_type & FF_THREAD_SLICE)) {
         av_log(avctx, AV_LOG_WARNING,
-               "Error resilience is enabled. It is unsafe and unsupported and may crash. "
+               "Error resilience with slice threads is enabled. It is unsafe and unsupported and may crash. "
                "Use it at your own risk\n");
     }
 
     return 0;
 }
 
+#if HAVE_THREADS
 static int decode_init_thread_copy(AVCodecContext *avctx)
 {
     H264Context *h = avctx->priv_data;
@@ -672,6 +720,7 @@ static int decode_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 /**
  * Run setup operations that must be run after slice header decoding.
@@ -686,7 +735,6 @@ static void decode_postinit(H264Context *h, int setup_finished)
     H264Picture *out = h->cur_pic_ptr;
     H264Picture *cur = h->cur_pic_ptr;
     int i, pics, out_of_order, out_idx;
-    int invalid = 0, cnt = 0;
 
     h->cur_pic_ptr->f->pict_type = h->pict_type;
 
@@ -700,7 +748,10 @@ static void decode_postinit(H264Context *h, int setup_finished)
          * yet, so we assume the worst for now. */
         // if (setup_finished)
         //    ff_thread_finish_setup(h->avctx);
-        return;
+        if (cur->field_poc[0] == INT_MAX && cur->field_poc[1] == INT_MAX)
+            return;
+        if (h->avctx->hwaccel || h->missing_fields <=1)
+            return;
     }
 
     cur->f->interlaced_frame = 0;
@@ -754,7 +805,7 @@ static void decode_postinit(H264Context *h, int setup_finished)
         /* Derive top_field_first from field pocs. */
         cur->f->top_field_first = cur->field_poc[0] < cur->field_poc[1];
     } else {
-        if (cur->f->interlaced_frame || h->sps.pic_struct_present_flag) {
+        if (h->sps.pic_struct_present_flag) {
             /* Use picture timing SEI information. Even if it is a
              * information of a past frame, better than nothing. */
             if (h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM ||
@@ -762,6 +813,10 @@ static void decode_postinit(H264Context *h, int setup_finished)
                 cur->f->top_field_first = 1;
             else
                 cur->f->top_field_first = 0;
+        } else if (cur->f->interlaced_frame) {
+            /* Default to top field first when pic_struct_present_flag
+             * is not set but interlaced frame detected */
+            cur->f->top_field_first = 1;
         } else {
             /* Most likely progressive */
             cur->f->top_field_first = 0;
@@ -774,9 +829,7 @@ static void decode_postinit(H264Context *h, int setup_finished)
         h->content_interpretation_type > 0 &&
         h->content_interpretation_type < 3) {
         AVStereo3D *stereo = av_stereo3d_create_side_data(cur->f);
-        if (!stereo)
-            return;
-
+        if (stereo) {
         switch (h->frame_packing_arrangement_type) {
         case 0:
             stereo->type = AV_STEREO3D_CHECKERBOARD;
@@ -806,6 +859,7 @@ static void decode_postinit(H264Context *h, int setup_finished)
 
         if (h->content_interpretation_type == 2)
             stereo->flags = AV_STEREO3D_FLAG_INVERT;
+        }
     }
 
     if (h->sei_display_orientation_present &&
@@ -814,36 +868,37 @@ static void decode_postinit(H264Context *h, int setup_finished)
         AVFrameSideData *rotation = av_frame_new_side_data(cur->f,
                                                            AV_FRAME_DATA_DISPLAYMATRIX,
                                                            sizeof(int32_t) * 9);
-        if (!rotation)
-            return;
-
-        av_display_rotation_set((int32_t *)rotation->data, angle);
-        av_display_matrix_flip((int32_t *)rotation->data,
-                               h->sei_hflip, h->sei_vflip);
+        if (rotation) {
+            av_display_rotation_set((int32_t *)rotation->data, angle);
+            av_display_matrix_flip((int32_t *)rotation->data,
+                                   h->sei_hflip, h->sei_vflip);
+        }
     }
 
     if (h->sei_reguserdata_afd_present) {
         AVFrameSideData *sd = av_frame_new_side_data(cur->f, AV_FRAME_DATA_AFD,
                                                      sizeof(uint8_t));
-        if (!sd)
-            return;
 
-        *sd->data = h->active_format_description;
-        h->sei_reguserdata_afd_present = 0;
+        if (sd) {
+            *sd->data = h->active_format_description;
+            h->sei_reguserdata_afd_present = 0;
+        }
     }
 
     if (h->a53_caption) {
         AVFrameSideData *sd = av_frame_new_side_data(cur->f,
                                                      AV_FRAME_DATA_A53_CC,
                                                      h->a53_caption_size);
-        if (!sd)
-            return;
-
-        memcpy(sd->data, h->a53_caption, h->a53_caption_size);
+        if (sd)
+            memcpy(sd->data, h->a53_caption, h->a53_caption_size);
         av_freep(&h->a53_caption);
         h->a53_caption_size = 0;
+        h->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
     }
 
+    cur->mmco_reset = h->mmco_reset;
+    h->mmco_reset = 0;
+
     // FIXME do something with unavailable reference frames
 
     /* Sort B-frames into display order */
@@ -860,107 +915,71 @@ static void decode_postinit(H264Context *h, int setup_finished)
         h->low_delay           = 0;
     }
 
+    for (i = 0; 1; i++) {
+        if(i == MAX_DELAYED_PIC_COUNT || cur->poc < h->last_pocs[i]){
+            if(i)
+                h->last_pocs[i-1] = cur->poc;
+            break;
+        } else if(i) {
+            h->last_pocs[i-1]= h->last_pocs[i];
+        }
+    }
+    out_of_order = MAX_DELAYED_PIC_COUNT - i;
+    if(   cur->f->pict_type == AV_PICTURE_TYPE_B
+       || (h->last_pocs[MAX_DELAYED_PIC_COUNT-2] > INT_MIN && h->last_pocs[MAX_DELAYED_PIC_COUNT-1] - h->last_pocs[MAX_DELAYED_PIC_COUNT-2] > 2))
+        out_of_order = FFMAX(out_of_order, 1);
+    if (out_of_order == MAX_DELAYED_PIC_COUNT) {
+        av_log(h->avctx, AV_LOG_VERBOSE, "Invalid POC %d<%d\n", cur->poc, h->last_pocs[0]);
+        for (i = 1; i < MAX_DELAYED_PIC_COUNT; i++)
+            h->last_pocs[i] = INT_MIN;
+        h->last_pocs[0] = cur->poc;
+        cur->mmco_reset = 1;
+    } else if(h->avctx->has_b_frames < out_of_order && !h->sps.bitstream_restriction_flag){
+        av_log(h->avctx, AV_LOG_VERBOSE, "Increasing reorder buffer to %d\n", out_of_order);
+        h->avctx->has_b_frames = out_of_order;
+        h->low_delay = 0;
+    }
+
     pics = 0;
     while (h->delayed_pic[pics])
         pics++;
 
-    assert(pics <= MAX_DELAYED_PIC_COUNT);
+    av_assert0(pics <= MAX_DELAYED_PIC_COUNT);
 
     h->delayed_pic[pics++] = cur;
     if (cur->reference == 0)
         cur->reference = DELAYED_PIC_REF;
 
-    /* Frame reordering. This code takes pictures from coding order and sorts
-     * them by their incremental POC value into display order. It supports POC
-     * gaps, MMCO reset codes and random resets.
-     * A "display group" can start either with a IDR frame (f.key_frame = 1),
-     * and/or can be closed down with a MMCO reset code. In sequences where
-     * there is no delay, we can't detect that (since the frame was already
-     * output to the user), so we also set h->mmco_reset to detect the MMCO
-     * reset code.
-     * FIXME: if we detect insufficient delays (as per h->avctx->has_b_frames),
-     * we increase the delay between input and output. All frames affected by
-     * the lag (e.g. those that should have been output before another frame
-     * that we already returned to the user) will be dropped. This is a bug
-     * that we will fix later. */
-    for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++) {
-        cnt     += out->poc < h->last_pocs[i];
-        invalid += out->poc == INT_MIN;
-    }
-    if (!h->mmco_reset && !cur->f->key_frame &&
-        cnt + invalid == MAX_DELAYED_PIC_COUNT && cnt > 0) {
-        h->mmco_reset = 2;
-        if (pics > 1)
-            h->delayed_pic[pics - 2]->mmco_reset = 2;
-    }
-    if (h->mmco_reset || cur->f->key_frame) {
-        for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++)
-            h->last_pocs[i] = INT_MIN;
-        cnt     = 0;
-        invalid = MAX_DELAYED_PIC_COUNT;
-    }
     out     = h->delayed_pic[0];
     out_idx = 0;
-    for (i = 1; i < MAX_DELAYED_PIC_COUNT &&
-                h->delayed_pic[i] &&
-                !h->delayed_pic[i - 1]->mmco_reset &&
-                !h->delayed_pic[i]->f->key_frame;
+    for (i = 1; h->delayed_pic[i] &&
+                !h->delayed_pic[i]->f->key_frame &&
+                !h->delayed_pic[i]->mmco_reset;
          i++)
         if (h->delayed_pic[i]->poc < out->poc) {
             out     = h->delayed_pic[i];
             out_idx = i;
         }
     if (h->avctx->has_b_frames == 0 &&
-        (h->delayed_pic[0]->f->key_frame || h->mmco_reset))
+        (h->delayed_pic[0]->f->key_frame || h->delayed_pic[0]->mmco_reset))
         h->next_outputed_poc = INT_MIN;
-    out_of_order = !out->f->key_frame && !h->mmco_reset &&
-                   (out->poc < h->next_outputed_poc);
-
-    if (h->sps.bitstream_restriction_flag &&
-        h->avctx->has_b_frames >= h->sps.num_reorder_frames) {
-    } else if (out_of_order && pics - 1 == h->avctx->has_b_frames &&
-               h->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT) {
-        if (invalid + cnt < MAX_DELAYED_PIC_COUNT) {
-            h->avctx->has_b_frames = FFMAX(h->avctx->has_b_frames, cnt);
-        }
-        h->low_delay = 0;
-    } else if (h->low_delay &&
-               ((h->next_outputed_poc != INT_MIN &&
-                 out->poc > h->next_outputed_poc + 2) ||
-                cur->f->pict_type == AV_PICTURE_TYPE_B)) {
-        h->low_delay = 0;
-        h->avctx->has_b_frames++;
-    }
+    out_of_order = out->poc < h->next_outputed_poc;
 
-    if (pics > h->avctx->has_b_frames) {
+    if (out_of_order || pics > h->avctx->has_b_frames) {
         out->reference &= ~DELAYED_PIC_REF;
         // for frame threading, the owner must be the second field's thread or
         // else the first thread can release the picture and reuse it unsafely
         for (i = out_idx; h->delayed_pic[i]; i++)
             h->delayed_pic[i] = h->delayed_pic[i + 1];
     }
-    memmove(h->last_pocs, &h->last_pocs[1],
-            sizeof(*h->last_pocs) * (MAX_DELAYED_PIC_COUNT - 1));
-    h->last_pocs[MAX_DELAYED_PIC_COUNT - 1] = cur->poc;
     if (!out_of_order && pics > h->avctx->has_b_frames) {
         h->next_output_pic = out;
-        if (out->mmco_reset) {
-            if (out_idx > 0) {
-                h->next_outputed_poc                    = out->poc;
-                h->delayed_pic[out_idx - 1]->mmco_reset = out->mmco_reset;
-            } else {
-                h->next_outputed_poc = INT_MIN;
-            }
-        } else {
-            if (out_idx == 0 && pics > 1 && h->delayed_pic[0]->f->key_frame) {
-                h->next_outputed_poc = INT_MIN;
-            } else {
-                h->next_outputed_poc = out->poc;
-            }
-        }
-        h->mmco_reset = 0;
+        if (out_idx == 0 && h->delayed_pic[0] && (h->delayed_pic[0]->f->key_frame || h->delayed_pic[0]->mmco_reset)) {
+            h->next_outputed_poc = INT_MIN;
+        } else
+            h->next_outputed_poc = out->poc;
     } else {
-        av_log(h->avctx, AV_LOG_DEBUG, "no picture\n");
+        av_log(h->avctx, AV_LOG_DEBUG, "no picture %s\n", out_of_order ? "ooo" : "");
     }
 
     if (h->next_output_pic) {
@@ -990,6 +1009,16 @@ int ff_pred_weight_table(H264Context *h, H264SliceContext *sl)
     sl->luma_log2_weight_denom = get_ue_golomb(&sl->gb);
     if (h->sps.chroma_format_idc)
         sl->chroma_log2_weight_denom = get_ue_golomb(&sl->gb);
+
+    if (sl->luma_log2_weight_denom > 7U) {
+        av_log(h->avctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is out of range\n", sl->luma_log2_weight_denom);
+        sl->luma_log2_weight_denom = 0;
+    }
+    if (sl->chroma_log2_weight_denom > 7U) {
+        av_log(h->avctx, AV_LOG_ERROR, "chroma_log2_weight_denom %d is out of range\n", sl->chroma_log2_weight_denom);
+        sl->chroma_log2_weight_denom = 0;
+    }
+
     luma_def   = 1 << sl->luma_log2_weight_denom;
     chroma_def = 1 << sl->chroma_log2_weight_denom;
 
@@ -1047,28 +1076,43 @@ int ff_pred_weight_table(H264Context *h, H264SliceContext *sl)
  */
 static void idr(H264Context *h)
 {
+    int i;
     ff_h264_remove_all_refs(h);
     h->prev_frame_num        =
-    h->prev_frame_num_offset =
-    h->prev_poc_msb          =
+    h->prev_frame_num_offset = 0;
+    h->prev_poc_msb          = 1<<16;
     h->prev_poc_lsb          = 0;
+    for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++)
+        h->last_pocs[i] = INT_MIN;
 }
 
 /* forget old pics after a seek */
 void ff_h264_flush_change(H264Context *h)
 {
-    int i;
-    for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++)
-        h->last_pocs[i] = INT_MIN;
+    int i, j;
+
     h->next_outputed_poc = INT_MIN;
     h->prev_interlaced_frame = 1;
     idr(h);
-    if (h->cur_pic_ptr)
+
+    h->prev_frame_num = -1;
+    if (h->cur_pic_ptr) {
         h->cur_pic_ptr->reference = 0;
+        for (j=i=0; h->delayed_pic[i]; i++)
+            if (h->delayed_pic[i] != h->cur_pic_ptr)
+                h->delayed_pic[j++] = h->delayed_pic[i];
+        h->delayed_pic[j] = NULL;
+    }
+    ff_h264_unref_picture(h, &h->last_pic_for_ec);
+
     h->first_field = 0;
     ff_h264_reset_sei(h);
     h->recovery_frame = -1;
     h->frame_recovered = 0;
+    h->current_slice = 0;
+    h->mmco_reset = 1;
+    for (i = 0; i < h->nb_slice_ctx; i++)
+        h->slice_ctx[i].list_count = 0;
 }
 
 /* forget old pics after a seek */
@@ -1200,26 +1244,34 @@ int ff_h264_get_profile(SPS *sps)
 int ff_set_ref_count(H264Context *h, H264SliceContext *sl)
 {
     int ref_count[2], list_count;
-    int num_ref_idx_active_override_flag, max_refs;
+    int num_ref_idx_active_override_flag;
 
     // set defaults, might be overridden a few lines later
     ref_count[0] = h->pps.ref_count[0];
     ref_count[1] = h->pps.ref_count[1];
 
     if (sl->slice_type_nos != AV_PICTURE_TYPE_I) {
+        unsigned max[2];
+        max[0] = max[1] = h->picture_structure == PICT_FRAME ? 15 : 31;
+
         if (sl->slice_type_nos == AV_PICTURE_TYPE_B)
             sl->direct_spatial_mv_pred = get_bits1(&sl->gb);
         num_ref_idx_active_override_flag = get_bits1(&sl->gb);
 
         if (num_ref_idx_active_override_flag) {
             ref_count[0] = get_ue_golomb(&sl->gb) + 1;
-            if (ref_count[0] < 1)
-                return AVERROR_INVALIDDATA;
             if (sl->slice_type_nos == AV_PICTURE_TYPE_B) {
                 ref_count[1] = get_ue_golomb(&sl->gb) + 1;
-                if (ref_count[1] < 1)
-                    return AVERROR_INVALIDDATA;
-            }
+            } else
+                // full range is spec-ok in this case, even for frames
+                ref_count[1] = 1;
+        }
+
+        if (ref_count[0]-1 > max[0] || ref_count[1]-1 > max[1]){
+            av_log(h->avctx, AV_LOG_ERROR, "reference overflow %u > %u or %u > %u\n", ref_count[0]-1, max[0], ref_count[1]-1, max[1]);
+            sl->ref_count[0] = sl->ref_count[1] = 0;
+            sl->list_count   = 0;
+            return AVERROR_INVALIDDATA;
         }
 
         if (sl->slice_type_nos == AV_PICTURE_TYPE_B)
@@ -1231,14 +1283,6 @@ int ff_set_ref_count(H264Context *h, H264SliceContext *sl)
         ref_count[0] = ref_count[1] = 0;
     }
 
-    max_refs = h->picture_structure == PICT_FRAME ? 16 : 32;
-
-    if (ref_count[0] > max_refs || ref_count[1] > max_refs) {
-        av_log(h->avctx, AV_LOG_ERROR, "reference overflow\n");
-        sl->ref_count[0] = sl->ref_count[1] = 0;
-        return AVERROR_INVALIDDATA;
-    }
-
     if (list_count   != sl->list_count   ||
         ref_count[0] != sl->ref_count[0] ||
         ref_count[1] != sl->ref_count[1]) {
@@ -1251,42 +1295,7 @@ int ff_set_ref_count(H264Context *h, H264SliceContext *sl)
     return 0;
 }
 
-static int find_start_code(const uint8_t *buf, int buf_size,
-                           int buf_index, int next_avc)
-{
-    // start code prefix search
-    for (; buf_index + 3 < next_avc; buf_index++)
-        // This should always succeed in the first iteration.
-        if (buf[buf_index]     == 0 &&
-            buf[buf_index + 1] == 0 &&
-            buf[buf_index + 2] == 1)
-            break;
-
-    if (buf_index + 3 >= buf_size)
-        return buf_size;
-
-    return buf_index + 3;
-}
-
-static int get_avc_nalsize(H264Context *h, const uint8_t *buf,
-                           int buf_size, int *buf_index)
-{
-    int i, nalsize = 0;
-
-    if (*buf_index >= buf_size - h->nal_length_size) {
-        // the end of the buffer is reached, refill it.
-        return AVERROR(EAGAIN);
-    }
-
-    for (i = 0; i < h->nal_length_size; i++)
-        nalsize = (nalsize << 8) | buf[(*buf_index)++];
-    if (nalsize <= 0 || nalsize > buf_size - *buf_index) {
-        av_log(h->avctx, AV_LOG_ERROR,
-               "AVC: nal size %d\n", nalsize);
-        return AVERROR_INVALIDDATA;
-    }
-    return nalsize;
-}
+static const uint8_t start_code[] = { 0x00, 0x00, 0x01 };
 
 static int get_bit_length(H264Context *h, const uint8_t *buf,
                           const uint8_t *ptr, int dst_length,
@@ -1313,6 +1322,7 @@ static int get_last_needed_nal(H264Context *h, const uint8_t *buf, int buf_size)
     int nal_index   = 0;
     int buf_index   = 0;
     int nals_needed = 0;
+    int first_slice = 0;
 
     while(1) {
         GetBitContext gb;
@@ -1329,6 +1339,8 @@ static int get_last_needed_nal(H264Context *h, const uint8_t *buf, int buf_size)
             buf_index = find_start_code(buf, buf_size, buf_index, next_avc);
             if (buf_index >= buf_size)
                 break;
+            if (buf_index >= next_avc)
+                continue;
         }
 
         ptr = ff_h264_decode_nal(h, &h->slice_ctx[0], buf + buf_index, &dst_length, &consumed,
@@ -1356,8 +1368,12 @@ static int get_last_needed_nal(H264Context *h, const uint8_t *buf, int buf_size)
         case NAL_IDR_SLICE:
         case NAL_SLICE:
             init_get_bits(&gb, ptr, bit_length);
-            if (!get_ue_golomb(&gb))
+            if (!get_ue_golomb(&gb) ||
+                !first_slice ||
+                first_slice != h->nal_unit_type)
                 nals_needed = nal_index;
+            if (!first_slice)
+                first_slice = h->nal_unit_type;
         }
     }
 
@@ -1374,8 +1390,13 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size,
     int next_avc;
     int nals_needed = 0; ///< number of NALs that need decoding before the next frame thread starts
     int nal_index;
+    int idr_cleared=0;
     int ret = 0;
 
+    h->nal_unit_type= 0;
+
+    if(!h->slice_context_count)
+         h->slice_context_count= 1;
     h->max_contexts = h->slice_context_count;
     if (!(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS)) {
         h->current_slice = 0;
@@ -1384,6 +1405,13 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size,
         ff_h264_reset_sei(h);
     }
 
+    if (h->nal_length_size == 4) {
+        if (buf_size > 8 && AV_RB32(buf) == 1 && AV_RB32(buf+5) > (unsigned)buf_size) {
+            h->is_avc = 0;
+        }else if(buf_size > 3 && AV_RB32(buf) > 1 && AV_RB32(buf) <= (unsigned)buf_size)
+            h->is_avc = 1;
+    }
+
     if (avctx->active_thread_type & FF_THREAD_FRAME)
         nals_needed = get_last_needed_nal(h, buf, buf_size);
 
@@ -1427,8 +1455,8 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size,
 
             if (h->avctx->debug & FF_DEBUG_STARTCODE)
                 av_log(h->avctx, AV_LOG_DEBUG,
-                       "NAL %d at %d/%d length %d\n",
-                       h->nal_unit_type, buf_index, buf_size, dst_length);
+                       "NAL %d/%d at %d/%d length %d\n",
+                       h->nal_unit_type, h->nal_ref_idc, buf_index, buf_size, dst_length);
 
             if (h->is_avc && (nalsize != consumed) && nalsize)
                 av_log(h->avctx, AV_LOG_DEBUG,
@@ -1444,44 +1472,76 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size,
                 continue;
 
 again:
-            /* Ignore every NAL unit type except PPS and SPS during extradata
+            /* Ignore per frame NAL unit type during extradata
              * parsing. Decoding slices is not possible in codec init
              * with frame-mt */
-            if (parse_extradata && HAVE_THREADS &&
-                (h->avctx->active_thread_type & FF_THREAD_FRAME) &&
-                (h->nal_unit_type != NAL_PPS &&
-                 h->nal_unit_type != NAL_SPS)) {
-                if (h->nal_unit_type < NAL_AUD ||
-                    h->nal_unit_type > NAL_AUXILIARY_SLICE)
-                    av_log(avctx, AV_LOG_INFO,
-                           "Ignoring NAL unit %d during extradata parsing\n",
+            if (parse_extradata) {
+                switch (h->nal_unit_type) {
+                case NAL_IDR_SLICE:
+                case NAL_SLICE:
+                case NAL_DPA:
+                case NAL_DPB:
+                case NAL_DPC:
+                    av_log(h->avctx, AV_LOG_WARNING,
+                           "Ignoring NAL %d in global header/extradata\n",
                            h->nal_unit_type);
-                h->nal_unit_type = NAL_FF_IGNORE;
+                    // fall through to next case
+                case NAL_AUXILIARY_SLICE:
+                    h->nal_unit_type = NAL_FF_IGNORE;
+                }
             }
+
             err = 0;
+
             switch (h->nal_unit_type) {
             case NAL_IDR_SLICE:
+                if ((ptr[0] & 0xFC) == 0x98) {
+                    av_log(h->avctx, AV_LOG_ERROR, "Invalid inter IDR frame\n");
+                    h->next_outputed_poc = INT_MIN;
+                    ret = -1;
+                    goto end;
+                }
                 if (h->nal_unit_type != NAL_IDR_SLICE) {
                     av_log(h->avctx, AV_LOG_ERROR,
                            "Invalid mix of idr and non-idr slices\n");
                     ret = -1;
                     goto end;
                 }
-                idr(h); // FIXME ensure we don't lose some frames if there is reordering
+                if(!idr_cleared) {
+                    if (h->current_slice && (avctx->active_thread_type & FF_THREAD_SLICE)) {
+                        av_log(h, AV_LOG_ERROR, "invalid mixed IDR / non IDR frames cannot be decoded in slice multithreading mode\n");
+                        ret = AVERROR_INVALIDDATA;
+                        goto end;
+                    }
+                    idr(h); // FIXME ensure we don't lose some frames if there is reordering
+                }
+                idr_cleared = 1;
+                h->has_recovery_point = 1;
             case NAL_SLICE:
                 init_get_bits(&sl->gb, ptr, bit_length);
 
+                if (   nals_needed >= nal_index
+                    || (!(avctx->active_thread_type & FF_THREAD_FRAME) && !context_count))
+                    h->au_pps_id = -1;
+
                 if ((err = ff_h264_decode_slice_header(h, sl)))
                     break;
 
-                if (h->sei_recovery_frame_cnt >= 0 && h->recovery_frame < 0) {
-                    h->recovery_frame = (h->frame_num + h->sei_recovery_frame_cnt) &
-                                        ((1 << h->sps.log2_max_frame_num) - 1);
+                if (h->sei_recovery_frame_cnt >= 0) {
+                    if (h->frame_num != h->sei_recovery_frame_cnt || sl->slice_type_nos != AV_PICTURE_TYPE_I)
+                        h->valid_recovery_point = 1;
+
+                    if (   h->recovery_frame < 0
+                        || av_mod_uintp2(h->recovery_frame - h->frame_num, h->sps.log2_max_frame_num) > h->sei_recovery_frame_cnt) {
+                        h->recovery_frame = av_mod_uintp2(h->frame_num + h->sei_recovery_frame_cnt, h->sps.log2_max_frame_num);
+
+                        if (!h->valid_recovery_point)
+                            h->recovery_frame = h->frame_num;
+                    }
                 }
 
                 h->cur_pic_ptr->f->key_frame |=
-                    (h->nal_unit_type == NAL_IDR_SLICE) ||
-                    (h->sei_recovery_frame_cnt >= 0);
+                    (h->nal_unit_type == NAL_IDR_SLICE);
 
                 if (h->nal_unit_type == NAL_IDR_SLICE ||
                     h->recovery_frame == h->frame_num) {
@@ -1492,31 +1552,45 @@ again:
                 // "recovered".
                 if (h->nal_unit_type == NAL_IDR_SLICE)
                     h->frame_recovered |= FRAME_RECOVERED_IDR;
+                h->frame_recovered |= 3*!!(avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL);
+                h->frame_recovered |= 3*!!(avctx->flags & AV_CODEC_FLAG_OUTPUT_CORRUPT);
+#if 1
+                h->cur_pic_ptr->recovered |= h->frame_recovered;
+#else
                 h->cur_pic_ptr->recovered |= !!(h->frame_recovered & FRAME_RECOVERED_IDR);
+#endif
 
                 if (h->current_slice == 1) {
                     if (!(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS))
                         decode_postinit(h, nal_index >= nals_needed);
 
                     if (h->avctx->hwaccel &&
-                        (ret = h->avctx->hwaccel->start_frame(h->avctx, NULL, 0)) < 0)
-                        return ret;
+                        (ret = h->avctx->hwaccel->start_frame(h->avctx, buf, buf_size)) < 0)
+                        goto end;
+#if FF_API_CAP_VDPAU
+                    if (CONFIG_H264_VDPAU_DECODER &&
+                        h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU)
+                        ff_vdpau_h264_picture_start(h);
+#endif
                 }
 
-                if (sl->redundant_pic_count == 0 &&
-                    (avctx->skip_frame < AVDISCARD_NONREF ||
-                     h->nal_ref_idc) &&
-                    (avctx->skip_frame < AVDISCARD_BIDIR  ||
-                     sl->slice_type_nos != AV_PICTURE_TYPE_B) &&
-                    (avctx->skip_frame < AVDISCARD_NONKEY ||
-                     h->cur_pic_ptr->f->key_frame) &&
-                    avctx->skip_frame < AVDISCARD_ALL) {
+                if (sl->redundant_pic_count == 0) {
                     if (avctx->hwaccel) {
                         ret = avctx->hwaccel->decode_slice(avctx,
                                                            &buf[buf_index - consumed],
                                                            consumed);
                         if (ret < 0)
-                            return ret;
+                            goto end;
+#if FF_API_CAP_VDPAU
+                    } else if (CONFIG_H264_VDPAU_DECODER &&
+                               h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU) {
+                        ff_vdpau_add_data_chunk(h->cur_pic_ptr->f->data[0],
+                                                start_code,
+                                                sizeof(start_code));
+                        ff_vdpau_add_data_chunk(h->cur_pic_ptr->f->data[0],
+                                                &buf[buf_index - consumed],
+                                                consumed);
+#endif
                     } else
                         context_count++;
                 }
@@ -1525,8 +1599,6 @@ again:
             case NAL_DPB:
             case NAL_DPC:
                 avpriv_request_sample(avctx, "data partitioning");
-                ret = AVERROR(ENOSYS);
-                goto end;
                 break;
             case NAL_SEI:
                 init_get_bits(&h->gb, ptr, bit_length);
@@ -1536,14 +1608,22 @@ again:
                 break;
             case NAL_SPS:
                 init_get_bits(&h->gb, ptr, bit_length);
-                ret = ff_h264_decode_seq_parameter_set(h);
-                if (ret < 0 && h->is_avc && (nalsize != consumed) && nalsize) {
+                if (ff_h264_decode_seq_parameter_set(h, 0) >= 0)
+                    break;
+                if (h->is_avc ? nalsize : 1) {
                     av_log(h->avctx, AV_LOG_DEBUG,
                            "SPS decoding failure, trying again with the complete NAL\n");
-                    init_get_bits(&h->gb, buf + buf_index + 1 - consumed,
-                                  8 * (nalsize - 1));
-                    ff_h264_decode_seq_parameter_set(h);
+                    if (h->is_avc)
+                        av_assert0(next_avc - buf_index + consumed == nalsize);
+                    if ((next_avc - buf_index + consumed - 1) >= INT_MAX/8)
+                        break;
+                    init_get_bits(&h->gb, &buf[buf_index + 1 - consumed],
+                                  8*(next_avc - buf_index + consumed - 1));
+                    if (ff_h264_decode_seq_parameter_set(h, 0) >= 0)
+                        break;
                 }
+                init_get_bits(&h->gb, ptr, bit_length);
+                ff_h264_decode_seq_parameter_set(h, 1);
 
                 break;
             case NAL_PPS:
@@ -1573,10 +1653,17 @@ again:
                 context_count = 0;
             }
 
-            if (err < 0) {
-                av_log(h->avctx, AV_LOG_ERROR, "decode_slice_header error\n");
+            if (err < 0 || err == SLICE_SKIPED) {
+                if (err < 0)
+                    av_log(h->avctx, AV_LOG_ERROR, "decode_slice_header error\n");
                 sl->ref_count[0] = sl->ref_count[1] = sl->list_count = 0;
-            } else if (err == 1) {
+            } else if (err == SLICE_SINGLETHREAD) {
+                if (context_count > 1) {
+                    ret = ff_h264_execute_decode_slices(h, context_count - 1);
+                    if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE))
+                        goto end;
+                    context_count = 0;
+                }
                 /* Slice could not be decoded in parallel mode, restart. Note
                  * that rbsp_buffer is not transferred, but since we no longer
                  * run in parallel mode this should not be an issue. */
@@ -1615,26 +1702,62 @@ static int get_consumed_bytes(int pos, int buf_size)
     return pos;
 }
 
-static int output_frame(H264Context *h, AVFrame *dst, AVFrame *src)
+static int output_frame(H264Context *h, AVFrame *dst, H264Picture *srcp)
 {
+    AVFrame *src = srcp->f;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format);
     int i;
     int ret = av_frame_ref(dst, src);
     if (ret < 0)
         return ret;
 
-    if (!h->sps.crop)
+    av_dict_set(&dst->metadata, "stereo_mode", ff_h264_sei_stereo_mode(h), 0);
+
+    h->backup_width   = h->avctx->width;
+    h->backup_height  = h->avctx->height;
+    h->backup_pix_fmt = h->avctx->pix_fmt;
+
+    h->avctx->width   = dst->width;
+    h->avctx->height  = dst->height;
+    h->avctx->pix_fmt = dst->format;
+
+    if (srcp->sei_recovery_frame_cnt == 0)
+        dst->key_frame = 1;
+    if (!srcp->crop)
         return 0;
 
-    for (i = 0; i < 3; i++) {
-        int hshift = (i > 0) ? h->chroma_x_shift : 0;
-        int vshift = (i > 0) ? h->chroma_y_shift : 0;
-        int off    = ((h->sps.crop_left >> hshift) << h->pixel_shift) +
-                     (h->sps.crop_top >> vshift) * dst->linesize[i];
+    for (i = 0; i < desc->nb_components; i++) {
+        int hshift = (i > 0) ? desc->log2_chroma_w : 0;
+        int vshift = (i > 0) ? desc->log2_chroma_h : 0;
+        int off    = ((srcp->crop_left >> hshift) << h->pixel_shift) +
+                      (srcp->crop_top  >> vshift) * dst->linesize[i];
         dst->data[i] += off;
     }
     return 0;
 }
 
+static int is_extra(const uint8_t *buf, int buf_size)
+{
+    int cnt= buf[5]&0x1f;
+    const uint8_t *p= buf+6;
+    while(cnt--){
+        int nalsize= AV_RB16(p) + 2;
+        if(nalsize > buf_size - (p-buf) || p[2]!=0x67)
+            return 0;
+        p += nalsize;
+    }
+    cnt = *(p++);
+    if(!cnt)
+        return 0;
+    while(cnt--){
+        int nalsize= AV_RB16(p) + 2;
+        if(nalsize > buf_size - (p-buf) || p[2]!=0x68)
+            return 0;
+        p += nalsize;
+    }
+    return 1;
+}
+
 static int h264_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame, AVPacket *avpkt)
 {
@@ -1643,18 +1766,34 @@ static int h264_decode_frame(AVCodecContext *avctx, void *data,
     H264Context *h     = avctx->priv_data;
     AVFrame *pict      = data;
     int buf_index      = 0;
+    H264Picture *out;
+    int i, out_idx;
     int ret;
 
     h->flags = avctx->flags;
     h->setup_finished = 0;
 
+    if (h->backup_width != -1) {
+        avctx->width    = h->backup_width;
+        h->backup_width = -1;
+    }
+    if (h->backup_height != -1) {
+        avctx->height    = h->backup_height;
+        h->backup_height = -1;
+    }
+    if (h->backup_pix_fmt != AV_PIX_FMT_NONE) {
+        avctx->pix_fmt    = h->backup_pix_fmt;
+        h->backup_pix_fmt = AV_PIX_FMT_NONE;
+    }
+
+    ff_h264_unref_picture(h, &h->last_pic_for_ec);
+
     /* end of stream, output what is still in the buffers */
-out:
     if (buf_size == 0) {
-        H264Picture *out;
-        int i, out_idx;
+ out:
 
         h->cur_pic_ptr = NULL;
+        h->first_field = 0;
 
         // FIXME factorize this with the output code below
         out     = h->delayed_pic[0];
@@ -1673,7 +1812,8 @@ out:
             h->delayed_pic[i] = h->delayed_pic[i + 1];
 
         if (out) {
-            ret = output_frame(h, pict, out->f);
+            out->reference &= ~DELAYED_PIC_REF;
+            ret = output_frame(h, pict, out);
             if (ret < 0)
                 return ret;
             *got_frame = 1;
@@ -1681,19 +1821,30 @@ out:
 
         return buf_index;
     }
+    if (h->is_avc && av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, NULL)) {
+        int side_size;
+        uint8_t *side = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
+        if (is_extra(side, side_size))
+            ff_h264_decode_extradata(h, side, side_size);
+    }
+    if(h->is_avc && buf_size >= 9 && buf[0]==1 && buf[2]==0 && (buf[4]&0xFC)==0xFC && (buf[5]&0x1F) && buf[8]==0x67){
+        if (is_extra(buf, buf_size))
+            return ff_h264_decode_extradata(h, buf, buf_size);
+    }
 
     buf_index = decode_nal_units(h, buf, buf_size, 0);
     if (buf_index < 0)
         return AVERROR_INVALIDDATA;
 
     if (!h->cur_pic_ptr && h->nal_unit_type == NAL_END_SEQUENCE) {
-        buf_size = 0;
+        av_assert0(buf_index <= buf_size);
         goto out;
     }
 
     if (!(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS) && !h->cur_pic_ptr) {
-        if (avctx->skip_frame >= AVDISCARD_NONREF)
-            return 0;
+        if (avctx->skip_frame >= AVDISCARD_NONREF ||
+            buf_size >= 4 && !memcmp("Q264", buf, 4))
+            return buf_size;
         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
         return AVERROR_INVALIDDATA;
     }
@@ -1703,22 +1854,57 @@ out:
         if (avctx->flags2 & AV_CODEC_FLAG2_CHUNKS)
             decode_postinit(h, 1);
 
-        ff_h264_field_end(h, &h->slice_ctx[0], 0);
+        if ((ret = ff_h264_field_end(h, &h->slice_ctx[0], 0)) < 0)
+            return ret;
 
+        /* Wait for second field. */
         *got_frame = 0;
-        if (h->next_output_pic && ((avctx->flags & AV_CODEC_FLAG_OUTPUT_CORRUPT) ||
+        if (h->next_output_pic && (
                                    h->next_output_pic->recovered)) {
             if (!h->next_output_pic->recovered)
                 h->next_output_pic->f->flags |= AV_FRAME_FLAG_CORRUPT;
 
-            ret = output_frame(h, pict, h->next_output_pic->f);
+            if (!h->avctx->hwaccel &&
+                 (h->next_output_pic->field_poc[0] == INT_MAX ||
+                  h->next_output_pic->field_poc[1] == INT_MAX)
+            ) {
+                int p;
+                AVFrame *f = h->next_output_pic->f;
+                int field = h->next_output_pic->field_poc[0] == INT_MAX;
+                uint8_t *dst_data[4];
+                int linesizes[4];
+                const uint8_t *src_data[4];
+
+                av_log(h->avctx, AV_LOG_DEBUG, "Duplicating field %d to fill missing\n", field);
+
+                for (p = 0; p<4; p++) {
+                    dst_data[p] = f->data[p] + (field^1)*f->linesize[p];
+                    src_data[p] = f->data[p] +  field   *f->linesize[p];
+                    linesizes[p] = 2*f->linesize[p];
+                }
+
+                av_image_copy(dst_data, linesizes, src_data, linesizes,
+                              f->format, f->width, f->height>>1);
+            }
+
+            ret = output_frame(h, pict, h->next_output_pic);
             if (ret < 0)
                 return ret;
             *got_frame = 1;
+            if (CONFIG_MPEGVIDEO) {
+                ff_print_debug_info2(h->avctx, pict, NULL,
+                                    h->next_output_pic->mb_type,
+                                    h->next_output_pic->qscale_table,
+                                    h->next_output_pic->motion_val,
+                                    &h->low_delay,
+                                    h->mb_width, h->mb_height, h->mb_stride, 1);
+            }
         }
     }
 
-    assert(pict->buf[0] || !*got_frame);
+    av_assert0(pict->buf[0] || !*got_frame);
+
+    ff_h264_unref_picture(h, &h->last_pic_for_ec);
 
     return get_consumed_bytes(buf_index, buf_size);
 }
@@ -1733,6 +1919,7 @@ av_cold void ff_h264_free_context(H264Context *h)
         ff_h264_unref_picture(h, &h->DPB[i]);
         av_frame_free(&h->DPB[i].f);
     }
+    memset(h->delayed_pic, 0, sizeof(h->delayed_pic));
 
     h->cur_pic_ptr = NULL;
 
@@ -1752,10 +1939,13 @@ static av_cold int h264_decode_end(AVCodecContext *avctx)
 {
     H264Context *h = avctx->priv_data;
 
+    ff_h264_remove_all_refs(h);
     ff_h264_free_context(h);
 
     ff_h264_unref_picture(h, &h->cur_pic);
     av_frame_free(&h->cur_pic.f);
+    ff_h264_unref_picture(h, &h->last_pic_for_ec);
+    av_frame_free(&h->last_pic_for_ec.f);
 
     return 0;
 }
@@ -1763,12 +1953,14 @@ static av_cold int h264_decode_end(AVCodecContext *avctx)
 #define OFFSET(x) offsetof(H264Context, x)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 static const AVOption h264_options[] = {
-    { "enable_er", "Enable error resilience on damaged frames (unsafe)", OFFSET(enable_er), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VD },
+    {"is_avc", "is avc", offsetof(H264Context, is_avc), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, 0},
+    {"nal_length_size", "nal_length_size", offsetof(H264Context, nal_length_size), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 4, 0},
+    { "enable_er", "Enable error resilience on damaged frames (unsafe)", OFFSET(enable_er), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1, VD },
     { NULL },
 };
 
 static const AVClass h264_class = {
-    .class_name = "h264",
+    .class_name = "H264 Decoder",
     .item_name  = av_default_item_name,
     .option     = h264_options,
     .version    = LIBAVUTIL_VERSION_INT,
@@ -1809,3 +2001,29 @@ AVCodec ff_h264_decoder = {
     .profiles              = NULL_IF_CONFIG_SMALL(profiles),
     .priv_class            = &h264_class,
 };
+
+#if CONFIG_H264_VDPAU_DECODER && FF_API_VDPAU
+static const AVClass h264_vdpau_class = {
+    .class_name = "H264 VDPAU Decoder",
+    .item_name  = av_default_item_name,
+    .option     = h264_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_h264_vdpau_decoder = {
+    .name           = "h264_vdpau",
+    .long_name      = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .priv_data_size = sizeof(H264Context),
+    .init           = ff_h264_decode_init,
+    .close          = h264_decode_end,
+    .decode         = h264_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HWACCEL_VDPAU,
+    .flush          = flush_dpb,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_VDPAU_H264,
+                                                     AV_PIX_FMT_NONE},
+    .profiles       = NULL_IF_CONFIG_SMALL(profiles),
+    .priv_class     = &h264_vdpau_class,
+};
+#endif
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index bf8b07e871..769abdad4b 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,8 +43,8 @@
 #include "rectangle.h"
 #include "videodsp.h"
 
-#define H264_MAX_PICTURE_COUNT 32
-#define H264_MAX_THREADS       16
+#define H264_MAX_PICTURE_COUNT 36
+#define H264_MAX_THREADS       32
 
 #define MAX_SPS_COUNT          32
 #define MAX_PPS_COUNT         256
@@ -53,6 +53,8 @@
 
 #define MAX_DELAYED_PIC_COUNT  16
 
+#define MAX_MBPAIR_SIZE (256*1024) // a tighter bound could be calculated if someone cares about a few bytes
+
 /* Compiling in interlaced support reduces the speed
  * of progressive decoding by about 2%. */
 #define ALLOW_INTERLACE
@@ -66,17 +68,17 @@
 #define MAX_SLICES 32
 
 #ifdef ALLOW_INTERLACE
-#define MB_MBAFF(h)    h->mb_mbaff
-#define MB_FIELD(h)    h->mb_field_decoding_flag
-#define FRAME_MBAFF(h) h->mb_aff_frame
-#define FIELD_PICTURE(h) (h->picture_structure != PICT_FRAME)
+#define MB_MBAFF(h)    (h)->mb_mbaff
+#define MB_FIELD(sl)  (sl)->mb_field_decoding_flag
+#define FRAME_MBAFF(h) (h)->mb_aff_frame
+#define FIELD_PICTURE(h) ((h)->picture_structure != PICT_FRAME)
 #define LEFT_MBS 2
 #define LTOP     0
 #define LBOT     1
 #define LEFT(i)  (i)
 #else
 #define MB_MBAFF(h)      0
-#define MB_FIELD(h)      0
+#define MB_FIELD(sl)     0
 #define FRAME_MBAFF(h)   0
 #define FIELD_PICTURE(h) 0
 #undef  IS_INTERLACED
@@ -89,11 +91,12 @@
 #define FIELD_OR_MBAFF_PICTURE(h) (FRAME_MBAFF(h) || FIELD_PICTURE(h))
 
 #ifndef CABAC
-#define CABAC(h) h->pps.cabac
+#define CABAC(h) (h)->pps.cabac
 #endif
 
-#define CHROMA422(h) (h->sps.chroma_format_idc == 2)
-#define CHROMA444(h) (h->sps.chroma_format_idc == 3)
+#define CHROMA(h)    ((h)->sps.chroma_format_idc)
+#define CHROMA422(h) ((h)->sps.chroma_format_idc == 2)
+#define CHROMA444(h) ((h)->sps.chroma_format_idc == 3)
 
 #define EXTENDED_SAR       255
 
@@ -102,7 +105,7 @@
 #define IS_REF0(a)         ((a) & MB_TYPE_REF0)
 #define IS_8x8DCT(a)       ((a) & MB_TYPE_8x8DCT)
 
-#define QP_MAX_NUM (51 + 2 * 6)           // The maximum supported qp
+#define QP_MAX_NUM (51 + 6*6)           // The maximum supported qp
 
 /* NAL unit types */
 enum {
@@ -134,6 +137,7 @@ typedef enum {
     SEI_TYPE_RECOVERY_POINT         = 6,   ///< recovery point (frame # to decoder sync)
     SEI_TYPE_FRAME_PACKING          = 45,  ///< frame packing arrangement
     SEI_TYPE_DISPLAY_ORIENTATION    = 47,  ///< display orientation
+    SEI_TYPE_GREEN_METADATA         = 56   ///< GreenMPEG information
 } SEI_Type;
 
 /**
@@ -152,6 +156,19 @@ typedef enum {
 } SEI_PicStructType;
 
 /**
+ * frame_packing_arrangement types
+ */
+typedef enum {
+    SEI_FPA_TYPE_CHECKERBOARD        = 0,
+    SEI_FPA_TYPE_INTERLEAVE_COLUMN   = 1,
+    SEI_FPA_TYPE_INTERLEAVE_ROW      = 2,
+    SEI_FPA_TYPE_SIDE_BY_SIDE        = 3,
+    SEI_FPA_TYPE_TOP_BOTTOM          = 4,
+    SEI_FPA_TYPE_INTERLEAVE_TEMPORAL = 5,
+    SEI_FPA_TYPE_2D                  = 6,
+} SEI_FpaType;
+
+/**
  * Sequence parameter set
  */
 typedef struct SPS {
@@ -212,6 +229,8 @@ typedef struct SPS {
     int residual_color_transform_flag;    ///< residual_colour_transform_flag
     int constraint_set_flags;             ///< constraint_set[0-3]_flag
     int new;                              ///< flag to keep track if the decoder context needs re-init due to changed SPS
+    uint8_t data[4096];
+    size_t data_size;
 } SPS;
 
 /**
@@ -235,11 +254,41 @@ typedef struct PPS {
     int transform_8x8_mode;         ///< transform_8x8_mode_flag
     uint8_t scaling_matrix4[6][16];
     uint8_t scaling_matrix8[6][64];
-    uint8_t chroma_qp_table[2][64]; ///< pre-scaled (with chroma_qp_index_offset) version of qp_table
+    uint8_t chroma_qp_table[2][QP_MAX_NUM+1];  ///< pre-scaled (with chroma_qp_index_offset) version of qp_table
     int chroma_qp_diff;
+    uint8_t data[4096];
+    size_t data_size;
 } PPS;
 
 /**
+ * Frame Packing Arrangement Type
+ */
+typedef struct FPA {
+    int         frame_packing_arrangement_id;
+    int         frame_packing_arrangement_cancel_flag; ///< is previous arrangement canceled, -1 if never received
+    SEI_FpaType frame_packing_arrangement_type;
+    int         frame_packing_arrangement_repetition_period;
+    int         content_interpretation_type;
+    int         quincunx_sampling_flag;
+} FPA;
+
+/**
+ *     Green MetaData Information Type
+ */
+typedef struct GreenMetaData {
+    uint8_t  green_metadata_type;
+    uint8_t  period_type;
+    uint16_t  num_seconds;
+    uint16_t  num_pictures;
+    uint8_t percent_non_zero_macroblocks;
+    uint8_t percent_intra_coded_macroblocks;
+    uint8_t percent_six_tap_filtering;
+    uint8_t percent_alpha_point_deblocking_instance;
+    uint8_t xsd_metric_type;
+    uint16_t xsd_metric_value;
+} GreenMetaData;
+
+/**
  * Memory management control operation opcode.
  */
 typedef enum MMCOOpcode {
@@ -288,13 +337,19 @@ typedef struct H264Picture {
     int pic_id;             /**< pic_num (short -> no wrap version of pic_num,
                                  pic_num & max_pic_num; long -> long_pic_num) */
     int long_ref;           ///< 1->long term reference 0->short term reference
-    int ref_poc[2][2][32];  ///< POCs of the frames used as reference (FIXME need per slice)
+    int ref_poc[2][2][32];  ///< POCs of the frames/fields used as reference (FIXME need per slice)
     int ref_count[2][2];    ///< number of entries in ref_poc         (FIXME need per slice)
     int mbaff;              ///< 1 -> MBAFF frame 0-> not MBAFF
     int field_picture;      ///< whether or not picture was encoded in separate fields
 
     int reference;
     int recovered;          ///< picture at IDR or recovery point + recovery count
+    int invalid_gap;
+    int sei_recovery_frame_cnt;
+
+    int crop;
+    int crop_left;
+    int crop_top;
 } H264Picture;
 
 typedef struct H264Ref {
@@ -439,7 +494,7 @@ typedef struct H264SliceContext {
 
     DECLARE_ALIGNED(8, uint16_t, sub_mb_type)[4];
 
-    ///< as a dct coeffecient is int32_t in high depth, we need to reserve twice the space.
+    ///< as a dct coefficient is int32_t in high depth, we need to reserve twice the space.
     DECLARE_ALIGNED(16, int16_t, mb)[16 * 48 * 2];
     DECLARE_ALIGNED(16, int16_t, mb_luma_dc)[3][16 * 2];
     ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either
@@ -464,6 +519,7 @@ typedef struct H264SliceContext {
  * H264Context
  */
 typedef struct H264Context {
+    AVClass *av_class;
     AVCodecContext *avctx;
     VideoDSPContext vdsp;
     H264DSPContext h264dsp;
@@ -474,6 +530,7 @@ typedef struct H264Context {
     H264Picture DPB[H264_MAX_PICTURE_COUNT];
     H264Picture *cur_pic_ptr;
     H264Picture cur_pic;
+    H264Picture last_pic_for_ec;
 
     H264SliceContext *slice_ctx;
     int            nb_slice_ctx;
@@ -484,6 +541,14 @@ typedef struct H264Context {
     int width, height;
     int chroma_x_shift, chroma_y_shift;
 
+    /**
+     * Backup frame properties: needed, because they can be different
+     * between returned frame and last decoded frame.
+     **/
+    int backup_width;
+    int backup_height;
+    enum AVPixelFormat backup_pix_fmt;
+
     int droppable;
     int coded_picture_number;
     int low_delay;
@@ -510,9 +575,13 @@ typedef struct H264Context {
     uint32_t *mb2br_xy;
     int b_stride;       // FIXME use s->b4_stride
 
+
+    unsigned current_sps_id; ///< id of the current SPS
     SPS sps; ///< current sps
     PPS pps; ///< current pps
 
+    int au_pps_id; ///< pps_id of current access unit
+
     uint32_t dequant4_buffer[6][QP_MAX_NUM + 1][16]; // FIXME should these be moved down?
     uint32_t dequant8_buffer[6][QP_MAX_NUM + 1][64];
     uint32_t(*dequant4_coeff[6])[16];
@@ -541,12 +610,12 @@ typedef struct H264Context {
     uint8_t field_scan[16];
     uint8_t field_scan8x8[64];
     uint8_t field_scan8x8_cavlc[64];
-    const uint8_t *zigzag_scan_q0;
-    const uint8_t *zigzag_scan8x8_q0;
-    const uint8_t *zigzag_scan8x8_cavlc_q0;
-    const uint8_t *field_scan_q0;
-    const uint8_t *field_scan8x8_q0;
-    const uint8_t *field_scan8x8_cavlc_q0;
+    uint8_t zigzag_scan_q0[16];
+    uint8_t zigzag_scan8x8_q0[64];
+    uint8_t zigzag_scan8x8_cavlc_q0[64];
+    uint8_t field_scan_q0[16];
+    uint8_t field_scan8x8_q0[64];
+    uint8_t field_scan8x8_cavlc_q0[64];
 
     int x264_build;
 
@@ -622,7 +691,7 @@ typedef struct H264Context {
      * @{
      */
     /**
-     * current slice number, used to initalize slice_num of each thread/context
+     * current slice number, used to initialize slice_num of each thread/context
      */
     int current_slice;
 
@@ -645,6 +714,7 @@ typedef struct H264Context {
     enum AVPictureType pict_type;
 
     int last_slice_type;
+    unsigned int last_ref_count[2];
     /** @} */
 
     /**
@@ -710,6 +780,13 @@ typedef struct H264Context {
     int sei_recovery_frame_cnt;
 
     /**
+     * Are the SEI recovery points looking valid.
+     */
+    int valid_recovery_point;
+
+    FPA sei_fpa;
+
+    /**
      * recovery_frame is the frame_num at which the next frame should
      * be fully constructed.
      *
@@ -730,7 +807,11 @@ typedef struct H264Context {
 
     int frame_recovered;    ///< Initial frame has been completely recovered
 
-    /* for frame threading, this is set to 1
+    int has_recovery_point;
+
+    int missing_fields;
+
+/* for frame threading, this is set to 1
      * after finish_setup() has been called, so we cannot modify
      * some context properties (which are supposed to stay constant between
      * slices) anymore */
@@ -740,6 +821,14 @@ typedef struct H264Context {
     int sei_buffering_period_present;   ///< Buffering period SEI flag
     int initial_cpb_removal_delay[32];  ///< Initial timestamps for CPBs
 
+    int cur_chroma_format_idc;
+    int cur_bit_depth_luma;
+    int16_t slice_row[MAX_SLICES]; ///< to detect when MAX_SLICES is too low
+
+    uint8_t parse_history[6];
+    int parse_history_count;
+    int parse_last_mb;
+
     int enable_er;
 
     AVBufferPool *qscale_table_pool;
@@ -750,9 +839,13 @@ typedef struct H264Context {
     /* Motion Estimation */
     qpel_mc_func (*qpel_put)[16];
     qpel_mc_func (*qpel_avg)[16];
+
+    /*Green Metadata */
+    GreenMetaData sei_green_metadata;
+
 } H264Context;
 
-extern const uint8_t ff_h264_chroma_qp[3][QP_MAX_NUM + 1]; ///< One chroma qp table for each supported bit depth (8, 9, 10).
+extern const uint8_t ff_h264_chroma_qp[7][QP_MAX_NUM + 1]; ///< One chroma qp table for each possible bit depth (8-14).
 extern const uint16_t ff_h264_mb_sizes[4];
 
 /**
@@ -763,7 +856,7 @@ int ff_h264_decode_sei(H264Context *h);
 /**
  * Decode SPS
  */
-int ff_h264_decode_seq_parameter_set(H264Context *h);
+int ff_h264_decode_seq_parameter_set(H264Context *h, int ignore_truncation);
 
 /**
  * compute profile from sps
@@ -836,7 +929,7 @@ int ff_h264_check_intra_pred_mode(const H264Context *h, H264SliceContext *sl,
                                   int mode, int is_chroma);
 
 void ff_h264_hl_decode_mb(const H264Context *h, H264SliceContext *sl);
-int ff_h264_decode_extradata(H264Context *h);
+int ff_h264_decode_extradata(H264Context *h, const uint8_t *buf, int size);
 int ff_h264_decode_init(AVCodecContext *avctx);
 void ff_h264_decode_init_vlc(void);
 
@@ -854,7 +947,7 @@ int ff_h264_decode_mb_cabac(const H264Context *h, H264SliceContext *sl);
 
 void ff_h264_init_cabac_states(const H264Context *h, H264SliceContext *sl);
 
-void h264_init_dequant_tables(H264Context *h);
+void ff_h264_init_dequant_tables(H264Context *h);
 
 void ff_h264_direct_dist_scale_factor(const H264Context *const h, H264SliceContext *sl);
 void ff_h264_direct_ref_list_init(const H264Context *const h, H264SliceContext *sl);
@@ -875,6 +968,12 @@ void ff_h264_filter_mb(const H264Context *h, H264SliceContext *sl, int mb_x, int
  */
 void ff_h264_reset_sei(H264Context *h);
 
+/**
+ * Get stereo_mode string from the h264 frame_packing_arrangement
+ * @param h H.264 context.
+ */
+const char* ff_h264_sei_stereo_mode(H264Context *h);
+
 /*
  * o-o o-o
  *  / / /
@@ -925,7 +1024,7 @@ static const uint8_t scan8[16 * 3 + 3] = {
     0 +  0 * 8, 0 +  5 * 8, 0 + 10 * 8
 };
 
-static av_always_inline uint32_t pack16to32(int a, int b)
+static av_always_inline uint32_t pack16to32(unsigned a, unsigned b)
 {
 #if HAVE_BIGENDIAN
     return (b & 0xFFFF) + (a << 16);
@@ -934,7 +1033,7 @@ static av_always_inline uint32_t pack16to32(int a, int b)
 #endif
 }
 
-static av_always_inline uint16_t pack8to16(int a, int b)
+static av_always_inline uint16_t pack8to16(unsigned a, unsigned b)
 {
 #if HAVE_BIGENDIAN
     return (b & 0xFF) + (a << 8);
@@ -1081,6 +1180,36 @@ static av_always_inline int get_dct8x8_allowed(const H264Context *h, H264SliceCo
                   0x0001000100010001ULL));
 }
 
+static inline int find_start_code(const uint8_t *buf, int buf_size,
+                           int buf_index, int next_avc)
+{
+    uint32_t state = -1;
+
+    buf_index = avpriv_find_start_code(buf + buf_index, buf + next_avc + 1, &state) - buf - 1;
+
+    return FFMIN(buf_index, buf_size);
+}
+
+static inline int get_avc_nalsize(H264Context *h, const uint8_t *buf,
+                           int buf_size, int *buf_index)
+{
+    int i, nalsize = 0;
+
+    if (*buf_index >= buf_size - h->nal_length_size) {
+        // the end of the buffer is reached, refill it.
+        return AVERROR(EAGAIN);
+    }
+
+    for (i = 0; i < h->nal_length_size; i++)
+        nalsize = ((unsigned)nalsize << 8) | buf[(*buf_index)++];
+    if (nalsize <= 0 || nalsize > buf_size - *buf_index) {
+        av_log(h->avctx, AV_LOG_ERROR,
+               "AVC: nal size %d\n", nalsize);
+        return AVERROR_INVALIDDATA;
+    }
+    return nalsize;
+}
+
 int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup);
 
 int ff_h264_ref_picture(H264Context *h, H264Picture *dst, H264Picture *src);
@@ -1094,6 +1223,9 @@ int ff_pred_weight_table(H264Context *h, H264SliceContext *sl);
 int ff_set_ref_count(H264Context *h, H264SliceContext *sl);
 
 int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl);
+#define SLICE_SINGLETHREAD 1
+#define SLICE_SKIPED 2
+
 int ff_h264_execute_decode_slices(H264Context *h, unsigned context_count);
 int ff_h264_update_thread_context(AVCodecContext *dst,
                                   const AVCodecContext *src);
@@ -1102,4 +1234,6 @@ void ff_h264_flush_change(H264Context *h);
 
 void ff_h264_free_tables(H264Context *h);
 
+void ff_h264_set_erpic(ERPicture *dst, H264Picture *src);
+
 #endif /* AVCODEC_H264_H */
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index f4e451d79b..c1c8b80855 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,9 +26,11 @@
  */
 
 #define CABAC(h) 1
+#define UNCHECKED_BITSTREAM_READER 1
 #define INT_BIT (CHAR_BIT * sizeof(int))
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/timer.h"
 #include "config.h"
 #include "cabac.h"
@@ -45,8 +47,6 @@
 #include "x86/h264_i386.h"
 #endif
 
-#include <assert.h>
-
 /* Cabac pre state table */
 
 static const int8_t cabac_context_init_I[1024][2] =
@@ -1284,7 +1284,7 @@ void ff_h264_init_cabac_states(const H264Context *h, H264SliceContext *sl)
 
 static int decode_cabac_field_decoding_flag(const H264Context *h, H264SliceContext *sl)
 {
-    const long mbb_xy = sl->mb_xy - 2L*h->mb_stride;
+    const int mbb_xy = sl->mb_xy - 2*h->mb_stride;
 
     unsigned long ctx = 0;
 
@@ -1501,7 +1501,7 @@ static int decode_cabac_mb_mvd(H264SliceContext *sl, int ctxbase, int amvd, int
     int mvd;
 
     if(!get_cabac(&sl->cabac, &sl->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){
-//    if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+(amvd>2)+(amvd>32)])){
+//    if(!get_cabac(&sl->cabac, &sl->cabac_state[ctxbase+(amvd>2)+(amvd>32)])){
         *mvda= 0;
         return 0;
     }
@@ -1640,7 +1640,9 @@ decode_cabac_residual_internal(const H264Context *h, H264SliceContext *sl,
     cc.range     = sl->cabac.range;
     cc.low       = sl->cabac.low;
     cc.bytestream= sl->cabac.bytestream;
+#if !UNCHECKED_BITSTREAM_READER || ARCH_AARCH64
     cc.bytestream_end = sl->cabac.bytestream_end;
+#endif
 #else
 #define CC &sl->cabac
 #endif
@@ -1689,7 +1691,7 @@ decode_cabac_residual_internal(const H264Context *h, H264SliceContext *sl,
         }
 #endif
     }
-    assert(coeff_count > 0);
+    av_assert2(coeff_count > 0);
 
     if( is_dc ) {
         if( cat == 3 )
@@ -1701,7 +1703,7 @@ decode_cabac_residual_internal(const H264Context *h, H264SliceContext *sl,
         if( max_coeff == 64 )
             fill_rectangle(&sl->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
         else {
-            assert( cat == 1 || cat ==  2 || cat ==  4 || cat == 7 || cat == 8 || cat == 11 || cat == 12 );
+            av_assert2( cat == 1 || cat ==  2 || cat ==  4 || cat == 7 || cat == 8 || cat == 11 || cat == 12 );
             sl->non_zero_count_cache[scan8[n]] = coeff_count;
         }
     }
@@ -1955,7 +1957,7 @@ int ff_h264_decode_mb_cabac(const H264Context *h, H264SliceContext *sl)
 
     if (sl->slice_type_nos == AV_PICTURE_TYPE_B) {
         int ctx = 0;
-        assert(sl->slice_type_nos == AV_PICTURE_TYPE_B);
+        av_assert2(sl->slice_type_nos == AV_PICTURE_TYPE_B);
 
         if (!IS_DIRECT(sl->left_type[LTOP] - 1))
             ctx++;
@@ -2008,7 +2010,7 @@ int ff_h264_decode_mb_cabac(const H264Context *h, H264SliceContext *sl)
         mb_type = decode_cabac_intra_mb_type(sl, 3, 1);
         if (sl->slice_type == AV_PICTURE_TYPE_SI && mb_type)
             mb_type--;
-        assert(sl->slice_type_nos == AV_PICTURE_TYPE_I);
+        av_assert2(sl->slice_type_nos == AV_PICTURE_TYPE_I);
 decode_intra_mb:
         partition_count = 0;
         cbp= i_mb_type_info[mb_type].cbp;
@@ -2071,8 +2073,8 @@ decode_intra_mb:
                     int pred = pred_intra_mode(h, sl, i);
                     sl->intra4x4_pred_mode_cache[scan8[i]] = decode_cabac_mb_intra4x4_pred_mode(sl, pred);
 
-                    ff_dlog(h->avctx, "i4x4 pred=%d mode=%d\n", pred,
-                            h->intra4x4_pred_mode_cache[scan8[i]]);
+                    ff_tlog(h->avctx, "i4x4 pred=%d mode=%d\n", pred,
+                            sl->intra4x4_pred_mode_cache[scan8[i]]);
                 }
             }
             write_back_intra_pred_mode(h, sl);
@@ -2122,10 +2124,10 @@ decode_intra_mb:
                 for( i = 0; i < 4; i++ ) {
                     if(IS_DIRECT(sl->sub_mb_type[i])) continue;
                     if(IS_DIR(sl->sub_mb_type[i], 0, list)){
-                        int rc = sl->ref_count[list] << MB_MBAFF(sl);
+                        unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                         if (rc > 1) {
                             ref[list][i] = decode_cabac_mb_ref(sl, list, 4 * i);
-                            if (ref[list][i] >= (unsigned) rc) {
+                            if (ref[list][i] >= rc) {
                                 av_log(h->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], rc);
                                 return -1;
                             }
@@ -2208,10 +2210,11 @@ decode_intra_mb:
         if(IS_16X16(mb_type)){
             for (list = 0; list < sl->list_count; list++) {
                 if(IS_DIR(mb_type, 0, list)){
-                    int ref, rc = sl->ref_count[list] << MB_MBAFF(sl);
+                    int ref;
+                    unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                     if (rc > 1) {
                         ref= decode_cabac_mb_ref(sl, list, 0);
-                        if (ref >= (unsigned) rc) {
+                        if (ref >= rc) {
                             av_log(h->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, rc);
                             return -1;
                         }
@@ -2236,10 +2239,11 @@ decode_intra_mb:
             for (list = 0; list < sl->list_count; list++) {
                     for(i=0; i<2; i++){
                         if(IS_DIR(mb_type, i, list)){
-                            int ref, rc = sl->ref_count[list] << MB_MBAFF(sl);
+                            int ref;
+                            unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                             if (rc > 1) {
                                 ref= decode_cabac_mb_ref(sl, list, 8 * i);
-                                if (ref >= (unsigned) rc) {
+                                if (ref >= rc) {
                                     av_log(h->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, rc);
                                     return -1;
                                 }
@@ -2267,14 +2271,15 @@ decode_intra_mb:
                 }
             }
         }else{
-            assert(IS_8X16(mb_type));
+            av_assert2(IS_8X16(mb_type));
             for (list = 0; list < sl->list_count; list++) {
                     for(i=0; i<2; i++){
                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
-                            int ref, rc = sl->ref_count[list] << MB_MBAFF(sl);
+                            int ref;
+                            unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                             if (rc > 1) {
                                 ref = decode_cabac_mb_ref(sl, list, 4 * i);
-                                if (ref >= (unsigned) rc) {
+                                if (ref >= rc) {
                                     av_log(h->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, rc);
                                     return -1;
                                 }
@@ -2313,6 +2318,11 @@ decode_intra_mb:
         cbp  = decode_cabac_mb_cbp_luma(sl);
         if(decode_chroma)
             cbp |= decode_cabac_mb_cbp_chroma(sl) << 4;
+    } else {
+        if (!decode_chroma && cbp>15) {
+            av_log(h->avctx, AV_LOG_ERROR, "gray chroma\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
 
     h->cbp_table[mb_xy] = sl->cbp = cbp;
diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c
index 7445788e56..b0251f405c 100644
--- a/libavcodec/h264_cavlc.c
+++ b/libavcodec/h264_cavlc.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... cavlc bitstream decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
  */
 
 #define CABAC(h) 0
+#define UNCHECKED_BITSTREAM_READER 1
 
 #include "internal.h"
 #include "avcodec.h"
@@ -34,8 +35,8 @@
 #include "h264_mvpred.h"
 #include "golomb.h"
 #include "mpegutils.h"
+#include "libavutil/avassert.h"
 
-#include <assert.h>
 
 static const uint8_t golomb_to_inter_cbp_gray[16]={
  0, 1, 2, 4, 8, 3, 5,10,12,15, 7,11,13,14, 6, 9,
@@ -360,7 +361,7 @@ av_cold void ff_h264_decode_init_vlc(void){
          * the packed static coeff_token_vlc table sizes
          * were initialized correctly.
          */
-        assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
+        av_assert0(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
 
         for(i=0; i<3; i++){
             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
@@ -483,7 +484,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
 
     trailing_ones= coeff_token&3;
     ff_tlog(h->avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
-    assert(total_coeff<=16);
+    av_assert2(total_coeff<=16);
 
     i = show_bits(gb, 3);
     skip_bits(gb, trailing_ones);
@@ -515,7 +516,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
                 else
                     level_code= prefix + get_bits(gb, 4); //part
             }else{
-                level_code= 30 + get_bits(gb, prefix-3); //part
+                level_code= 30;
                 if(prefix>=16){
                     if(prefix > 25+3){
                         av_log(h->avctx, AV_LOG_ERROR, "Invalid level prefix\n");
@@ -523,6 +524,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
                     }
                     level_code += (1<<(prefix-3))-4096;
                 }
+                level_code += get_bits(gb, prefix-3); //part
             }
 
             if(trailing_ones < 3) level_code += 2;
@@ -552,9 +554,15 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
                 if(prefix<15){
                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
                 }else{
-                    level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
-                    if(prefix>=16)
+                    level_code = 15<<suffix_length;
+                    if (prefix>=16) {
+                        if(prefix > 25+3){
+                            av_log(h->avctx, AV_LOG_ERROR, "Invalid level prefix\n");
+                            return AVERROR_INVALIDDATA;
+                        }
                         level_code += (1<<(prefix-3))-4096;
+                    }
+                    level_code += get_bits(gb, prefix-3);
                 }
                 mask= -(level_code&1);
                 level_code= (((2+level_code)>>1) ^ mask) - mask;
@@ -569,13 +577,13 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
     else{
         if (max_coeff <= 8) {
             if (max_coeff == 4)
-                zeros_left = get_vlc2(gb, chroma_dc_total_zeros_vlc[total_coeff - 1].table,
+                zeros_left = get_vlc2(gb, (chroma_dc_total_zeros_vlc-1)[total_coeff].table,
                                       CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
             else
-                zeros_left = get_vlc2(gb, chroma422_dc_total_zeros_vlc[total_coeff - 1].table,
+                zeros_left = get_vlc2(gb, (chroma422_dc_total_zeros_vlc-1)[total_coeff].table,
                                       CHROMA422_DC_TOTAL_ZEROS_VLC_BITS, 1);
         } else {
-            zeros_left= get_vlc2(gb, total_zeros_vlc[total_coeff - 1].table, TOTAL_ZEROS_VLC_BITS, 1);
+            zeros_left= get_vlc2(gb, (total_zeros_vlc-1)[ total_coeff ].table, TOTAL_ZEROS_VLC_BITS, 1);
         }
     }
 
@@ -585,7 +593,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
         ((type*)block)[*scantable] = level[0]; \
         for(i=1;i<total_coeff && zeros_left > 0;i++) { \
             if(zeros_left < 7) \
-                run_before= get_vlc2(gb, run_vlc[zeros_left - 1].table, RUN_VLC_BITS, 1); \
+                run_before= get_vlc2(gb, (run_vlc-1)[zeros_left].table, RUN_VLC_BITS, 1); \
             else \
                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2); \
             zeros_left -= run_before; \
@@ -600,7 +608,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
         ((type*)block)[*scantable] = ((int)(level[0] * qmul[*scantable] + 32))>>6; \
         for(i=1;i<total_coeff && zeros_left > 0;i++) { \
             if(zeros_left < 7) \
-                run_before= get_vlc2(gb, run_vlc[zeros_left - 1].table, RUN_VLC_BITS, 1); \
+                run_before= get_vlc2(gb, (run_vlc-1)[zeros_left].table, RUN_VLC_BITS, 1); \
             else \
                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2); \
             zeros_left -= run_before; \
@@ -613,18 +621,17 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
         } \
     }
 
-    if (zeros_left < 0) {
-        av_log(h->avctx, AV_LOG_ERROR,
-               "negative number of zero coeffs at %d %d\n", sl->mb_x, sl->mb_y);
-        return AVERROR_INVALIDDATA;
-    }
-
     if (h->pixel_shift) {
         STORE_BLOCK(int32_t)
     } else {
         STORE_BLOCK(int16_t)
     }
 
+    if(zeros_left<0){
+        av_log(h->avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", sl->mb_x, sl->mb_y);
+        return -1;
+    }
+
     return 0;
 }
 
@@ -645,7 +652,7 @@ int decode_luma_residual(const H264Context *h, H264SliceContext *sl,
             return -1; //FIXME continue if partitioned and other return -1 too
         }
 
-        assert((cbp&15) == 0 || (cbp&15) == 15);
+        av_assert2((cbp&15) == 0 || (cbp&15) == 15);
 
         if(cbp&15){
             for(i8x8=0; i8x8<4; i8x8++){
@@ -715,7 +722,7 @@ int ff_h264_decode_mb_cavlc(const H264Context *h, H264SliceContext *sl)
                 down the code */
     if (sl->slice_type_nos != AV_PICTURE_TYPE_I) {
         if (sl->mb_skip_run == -1)
-            sl->mb_skip_run = get_ue_golomb(&sl->gb);
+            sl->mb_skip_run = get_ue_golomb_long(&sl->gb);
 
         if (sl->mb_skip_run--) {
             if (FRAME_MBAFF(h) && (sl->mb_y & 1) == 0) {
@@ -751,7 +758,7 @@ int ff_h264_decode_mb_cavlc(const H264Context *h, H264SliceContext *sl)
             goto decode_intra_mb;
         }
     }else{
-       assert(sl->slice_type_nos == AV_PICTURE_TYPE_I);
+       av_assert2(sl->slice_type_nos == AV_PICTURE_TYPE_I);
         if (sl->slice_type == AV_PICTURE_TYPE_SI && mb_type)
             mb_type--;
 decode_intra_mb:
@@ -857,7 +864,7 @@ decode_intra_mb:
                 sl->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
             }
         }else{
-            assert(sl->slice_type_nos == AV_PICTURE_TYPE_P); //FIXME SP correct ?
+            av_assert2(sl->slice_type_nos == AV_PICTURE_TYPE_P); //FIXME SP correct ?
             for(i=0; i<4; i++){
                 sl->sub_mb_type[i]= get_ue_golomb_31(&sl->gb);
                 if(sl->sub_mb_type[i] >=4){
@@ -950,7 +957,7 @@ decode_intra_mb:
             for (list = 0; list < sl->list_count; list++) {
                     unsigned int val;
                     if(IS_DIR(mb_type, 0, list)){
-                        int rc = sl->ref_count[list] << MB_MBAFF(sl);
+                        unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                         if (rc == 1) {
                             val= 0;
                         } else if (rc == 2) {
@@ -981,7 +988,7 @@ decode_intra_mb:
                     for(i=0; i<2; i++){
                         unsigned int val;
                         if(IS_DIR(mb_type, i, list)){
-                            int rc = sl->ref_count[list] << MB_MBAFF(sl);
+                            unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                             if (rc == 1) {
                                 val= 0;
                             } else if (rc == 2) {
@@ -1014,12 +1021,12 @@ decode_intra_mb:
                 }
             }
         }else{
-            assert(IS_8X16(mb_type));
+            av_assert2(IS_8X16(mb_type));
             for (list = 0; list < sl->list_count; list++) {
                     for(i=0; i<2; i++){
                         unsigned int val;
                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
-                            int rc = sl->ref_count[list] << MB_MBAFF(sl);
+                            unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                             if (rc == 1) {
                                 val= 0;
                             } else if (rc == 2) {
@@ -1075,6 +1082,11 @@ decode_intra_mb:
             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
             else                     cbp= golomb_to_inter_cbp_gray[cbp];
         }
+    } else {
+        if (!decode_chroma && cbp>15) {
+            av_log(h->avctx, AV_LOG_ERROR, "gray chroma\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
 
     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
@@ -1127,12 +1139,15 @@ decode_intra_mb:
             if (decode_luma_residual(h, sl, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 2) < 0 ) {
                 return -1;
             }
-        } else if (CHROMA422(h)) {
+        } else {
+            const int num_c8x8 = h->sps.chroma_format_idc;
+
             if(cbp&0x30){
                 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
                     if (decode_residual(h, sl, gb, sl->mb + ((256 + 16*16*chroma_idx) << pixel_shift),
-                                        CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma422_dc_scan,
-                                        NULL, 8) < 0) {
+                                        CHROMA_DC_BLOCK_INDEX+chroma_idx,
+                                        CHROMA422(h) ? chroma422_dc_scan : chroma_dc_scan,
+                                        NULL, 4*num_c8x8) < 0) {
                         return -1;
                     }
             }
@@ -1141,7 +1156,7 @@ decode_intra_mb:
                 for(chroma_idx=0; chroma_idx<2; chroma_idx++){
                     const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][sl->chroma_qp[chroma_idx]];
                     int16_t *mb = sl->mb + (16*(16 + 16*chroma_idx) << pixel_shift);
-                    for (i8x8 = 0; i8x8 < 2; i8x8++) {
+                    for (i8x8 = 0; i8x8<num_c8x8; i8x8++) {
                         for (i4x4 = 0; i4x4 < 4; i4x4++) {
                             const int index = 16 + 16*chroma_idx + 8*i8x8 + i4x4;
                             if (decode_residual(h, sl, gb, mb, index, scan + 1, qmul, 15) < 0)
@@ -1154,28 +1169,6 @@ decode_intra_mb:
                 fill_rectangle(&sl->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
                 fill_rectangle(&sl->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
             }
-        } else /* yuv420 */ {
-            if(cbp&0x30){
-                for(chroma_idx=0; chroma_idx<2; chroma_idx++)
-                    if( decode_residual(h, sl, gb, sl->mb + ((256 + 16*16*chroma_idx) << pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
-                        return -1;
-                    }
-            }
-
-            if(cbp&0x20){
-                for(chroma_idx=0; chroma_idx<2; chroma_idx++){
-                    const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][sl->chroma_qp[chroma_idx]];
-                    for(i4x4=0; i4x4<4; i4x4++){
-                        const int index= 16 + 16*chroma_idx + i4x4;
-                        if( decode_residual(h, sl, gb, sl->mb + (16*index << pixel_shift), index, scan + 1, qmul, 15) < 0){
-                            return -1;
-                        }
-                    }
-                }
-            }else{
-                fill_rectangle(&sl->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
-                fill_rectangle(&sl->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
-            }
         }
     }else{
         fill_rectangle(&sl->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1);
diff --git a/libavcodec/h264_direct.c b/libavcodec/h264_direct.c
index a7966e15e9..5756a7ba66 100644
--- a/libavcodec/h264_direct.c
+++ b/libavcodec/h264_direct.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -159,11 +159,11 @@ void ff_h264_direct_ref_list_init(const H264Context *const h, H264SliceContext *
     }
 }
 
-static void await_reference_mb_row(const H264Context *const h, H264Picture *ref,
+static void await_reference_mb_row(const H264Context *const h, H264Ref *ref,
                                    int mb_y)
 {
     int ref_field         = ref->reference - 1;
-    int ref_field_picture = ref->field_picture;
+    int ref_field_picture = ref->parent->field_picture;
     int ref_height        = 16 * h->mb_height >> ref_field_picture;
 
     if (!HAVE_THREADS || !(h->avctx->active_thread_type & FF_THREAD_FRAME))
@@ -172,7 +172,7 @@ static void await_reference_mb_row(const H264Context *const h, H264Picture *ref,
     /* FIXME: It can be safe to access mb stuff
      * even if pixels aren't deblocked yet. */
 
-    ff_thread_await_progress(&ref->tf,
+    ff_thread_await_progress(&ref->parent->tf,
                              FFMIN(16 * mb_y >> ref_field_picture,
                                    ref_height - 1),
                              ref_field_picture && ref_field);
@@ -196,7 +196,7 @@ static void pred_spatial_direct_motion(const H264Context *const h, H264SliceCont
 
     assert(sl->ref_list[1][0].reference & 3);
 
-    await_reference_mb_row(h, sl->ref_list[1][0].parent,
+    await_reference_mb_row(h, &sl->ref_list[1][0],
                            sl->mb_y + !!IS_INTERLACED(*mb_type));
 
 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16 | MB_TYPE_INTRA4x4 | \
@@ -237,6 +237,7 @@ static void pred_spatial_direct_motion(const H264Context *const h, H264SliceCont
                 else
                     mv[list] = AV_RN32A(C);
             }
+            av_assert2(ref[list] < (sl->ref_count[list] << !!FRAME_MBAFF(h)));
         } else {
             int mask = ~(MB_TYPE_L0 << (2 * list));
             mv[list]  = 0;
@@ -320,10 +321,10 @@ single_col:
         }
     }
 
-    await_reference_mb_row(h, sl->ref_list[1][0].parent, mb_y);
+    await_reference_mb_row(h, &sl->ref_list[1][0], mb_y);
 
-    l1mv0  = &sl->ref_list[1][0].parent->motion_val[0][h->mb2b_xy[mb_xy]];
-    l1mv1  = &sl->ref_list[1][0].parent->motion_val[1][h->mb2b_xy[mb_xy]];
+    l1mv0  = (void*)&sl->ref_list[1][0].parent->motion_val[0][h->mb2b_xy[mb_xy]];
+    l1mv1  = (void*)&sl->ref_list[1][0].parent->motion_val[1][h->mb2b_xy[mb_xy]];
     l1ref0 = &sl->ref_list[1][0].parent->ref_index[0][4 * mb_xy];
     l1ref1 = &sl->ref_list[1][0].parent->ref_index[1][4 * mb_xy];
     if (!b8_stride) {
@@ -479,7 +480,7 @@ static void pred_temp_direct_motion(const H264Context *const h, H264SliceContext
 
     assert(sl->ref_list[1][0].reference & 3);
 
-    await_reference_mb_row(h, sl->ref_list[1][0].parent,
+    await_reference_mb_row(h, &sl->ref_list[1][0],
                            sl->mb_y + !!IS_INTERLACED(*mb_type));
 
     if (IS_INTERLACED(sl->ref_list[1][0].parent->mb_type[mb_xy])) { // AFL/AFR/FR/FL -> AFL/FL
@@ -544,10 +545,10 @@ single_col:
         }
     }
 
-    await_reference_mb_row(h, sl->ref_list[1][0].parent, mb_y);
+    await_reference_mb_row(h, &sl->ref_list[1][0], mb_y);
 
-    l1mv0  = &sl->ref_list[1][0].parent->motion_val[0][h->mb2b_xy[mb_xy]];
-    l1mv1  = &sl->ref_list[1][0].parent->motion_val[1][h->mb2b_xy[mb_xy]];
+    l1mv0  = (void*)&sl->ref_list[1][0].parent->motion_val[0][h->mb2b_xy[mb_xy]];
+    l1mv1  = (void*)&sl->ref_list[1][0].parent->motion_val[1][h->mb2b_xy[mb_xy]];
     l1ref0 = &sl->ref_list[1][0].parent->ref_index[0][4 * mb_xy];
     l1ref1 = &sl->ref_list[1][0].parent->ref_index[1][4 * mb_xy];
     if (!b8_stride) {
diff --git a/libavcodec/h264_loopfilter.c b/libavcodec/h264_loopfilter.c
index ae93681246..00149272b0 100644
--- a/libavcodec/h264_loopfilter.c
+++ b/libavcodec/h264_loopfilter.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... loop filter
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,8 +34,6 @@
 #include "mpegutils.h"
 #include "rectangle.h"
 
-#include <assert.h>
-
 /* Deblocking filter (p153) */
 static const uint8_t alpha_table[52*3] = {
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
@@ -244,7 +242,7 @@ static av_always_inline void h264_filter_mb_fast_internal(const H264Context *h,
                                                           unsigned int uvlinesize,
                                                           int pixel_shift)
 {
-    int chroma = !(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
+    int chroma = CHROMA(h) && !(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
     int chroma444 = CHROMA444(h);
     int chroma422 = CHROMA422(h);
 
@@ -358,7 +356,7 @@ static av_always_inline void h264_filter_mb_fast_internal(const H264Context *h,
         }
         return;
     } else {
-        LOCAL_ALIGNED_8(int16_t, bS, [2], [4][4]);
+        LOCAL_ALIGNED(8, int16_t, bS, [2], [4][4]);
         int edges;
         if( IS_8x8DCT(mb_type) && (sl->cbp&7) == 7 && !chroma444 ) {
             edges = 4;
@@ -421,7 +419,7 @@ void ff_h264_filter_mb_fast(const H264Context *h, H264SliceContext *sl,
                             uint8_t *img_cb, uint8_t *img_cr,
                             unsigned int linesize, unsigned int uvlinesize)
 {
-    assert(!FRAME_MBAFF(h));
+    av_assert2(!FRAME_MBAFF(h));
     if(!h->h264dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff) {
         ff_h264_filter_mb(h, sl, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
         return;
@@ -507,7 +505,7 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
             int j;
 
             for(j=0; j<2; j++, mbn_xy += h->mb_stride){
-                DECLARE_ALIGNED(8, int16_t, bS)[4];
+                LOCAL_ALIGNED(8, int16_t, bS, [4]);
                 int qp;
                 if (IS_INTRA(mb_type | h->cur_pic.mb_type[mbn_xy])) {
                     AV_WN64A(bS, 0x0003000300030003ULL);
@@ -544,7 +542,7 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
                 }
             }
         }else{
-            DECLARE_ALIGNED(8, int16_t, bS)[4];
+            LOCAL_ALIGNED(8, int16_t, bS, [4]);
             int qp;
 
             if( IS_INTRA(mb_type|mbm_type)) {
@@ -593,7 +591,9 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
             // value in IPCM macroblocks.
             if(bS[0]+bS[1]+bS[2]+bS[3]){
                 qp = (h->cur_pic.qscale_table[mb_xy] + h->cur_pic.qscale_table[mbm_xy] + 1) >> 1;
+                //ff_tlog(h->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], h->cur_pic.qscale_table[mbn_xy]);
                 ff_tlog(h->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
+                //{ int i; for (i = 0; i < 4; i++) ff_tlog(h->avctx, " bS[%d]:%d", i, bS[i]); ff_tlog(h->avctx, "\n"); }
                 chroma_qp_avg[0] = (sl->chroma_qp[0] + get_chroma_qp(h, 0, h->cur_pic.qscale_table[mbm_xy]) + 1) >> 1;
                 chroma_qp_avg[1] = (sl->chroma_qp[1] + get_chroma_qp(h, 1, h->cur_pic.qscale_table[mbm_xy]) + 1) >> 1;
                 if( dir == 0 ) {
@@ -625,7 +625,7 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
 
     /* Calculate bS */
     for( edge = 1; edge < edges; edge++ ) {
-        DECLARE_ALIGNED(8, int16_t, bS)[4];
+        LOCAL_ALIGNED(8, int16_t, bS, [4]);
         int qp;
         const int deblock_edge = !IS_8x8DCT(mb_type & (edge<<24)); // (edge&1) && IS_8x8DCT(mb_type)
 
@@ -676,7 +676,9 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
         // Do not use s->qscale as luma quantizer because it has not the same
         // value in IPCM macroblocks.
         qp = h->cur_pic.qscale_table[mb_xy];
+        //ff_tlog(h->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], h->cur_pic.qscale_table[mbn_xy]);
         ff_tlog(h->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
+        //{ int i; for (i = 0; i < 4; i++) ff_tlog(h->avctx, " bS[%d]:%d", i, bS[i]); ff_tlog(h->avctx, "\n"); }
         if( dir == 0 ) {
             filter_mb_edgev( &img_y[4*edge << h->pixel_shift], linesize, bS, qp, a, b, h, 0 );
             if (chroma) {
@@ -721,7 +723,7 @@ void ff_h264_filter_mb(const H264Context *h, H264SliceContext *sl,
     const int mb_type = h->cur_pic.mb_type[mb_xy];
     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
     int first_vertical_edge_done = 0;
-    int chroma = !(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
+    int chroma = CHROMA(h) && !(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
     int qp_bd_offset = 6 * (h->sps.bit_depth_luma - 8);
     int a = 52 + sl->slice_alpha_c0_offset - qp_bd_offset;
     int b = 52 + sl->slice_beta_offset - qp_bd_offset;
@@ -734,7 +736,7 @@ void ff_h264_filter_mb(const H264Context *h, H264SliceContext *sl,
         /* First vertical edge is different in MBAFF frames
          * There are 8 different bS to compute and 2 different Qp
          */
-        DECLARE_ALIGNED(8, int16_t, bS)[8];
+        LOCAL_ALIGNED(8, int16_t, bS, [8]);
         int qp[2];
         int bqp[2];
         int rqp[2];
diff --git a/libavcodec/h264_mb.c b/libavcodec/h264_mb.c
index 2487b96ceb..8302de040e 100644
--- a/libavcodec/h264_mb.c
+++ b/libavcodec/h264_mb.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,17 +40,17 @@ static inline int get_lowest_part_list_y(H264SliceContext *sl,
                                          int n, int height, int y_offset, int list)
 {
     int raw_my             = sl->mv_cache[list][scan8[n]][1];
-    int filter_height_up   = (raw_my & 3) ? 2 : 0;
     int filter_height_down = (raw_my & 3) ? 3 : 0;
     int full_my            = (raw_my >> 2) + y_offset;
-    int top                = full_my - filter_height_up;
     int bottom             = full_my + filter_height_down + height;
 
-    return FFMAX(abs(top), bottom);
+    av_assert2(height >= 0);
+
+    return FFMAX(0, bottom);
 }
 
 static inline void get_lowest_part_y(const H264Context *h, H264SliceContext *sl,
-                                     int refs[2][48], int n,
+                                     int16_t refs[2][48], int n,
                                      int height, int y_offset, int list0,
                                      int list1, int *nrefs)
 {
@@ -97,7 +97,7 @@ static void await_references(const H264Context *h, H264SliceContext *sl)
 {
     const int mb_xy   = sl->mb_xy;
     const int mb_type = h->cur_pic.mb_type[mb_xy];
-    int refs[2][48];
+    int16_t refs[2][48];
     int nrefs[2] = { 0 };
     int ref, list;
 
@@ -119,7 +119,7 @@ static void await_references(const H264Context *h, H264SliceContext *sl)
     } else {
         int i;
 
-        assert(IS_8X8(mb_type));
+        av_assert2(IS_8X8(mb_type));
 
         for (i = 0; i < 4; i++) {
             const int sub_mb_type = sl->sub_mb_type[i];
@@ -151,7 +151,7 @@ static void await_references(const H264Context *h, H264SliceContext *sl)
                                   nrefs);
             } else {
                 int j;
-                assert(IS_SUB_4X4(sub_mb_type));
+                av_assert2(IS_SUB_4X4(sub_mb_type));
                 for (j = 0; j < 4; j++) {
                     int sub_y_offset = y_offset + 2 * (j & 2);
                     get_lowest_part_y(h, sl, refs, n + j, 4, sub_y_offset,
@@ -176,6 +176,7 @@ static void await_references(const H264Context *h, H264SliceContext *sl)
                 nrefs[list]--;
 
                 if (!FIELD_PICTURE(h) && ref_field_picture) { // frame referencing two fields
+                    av_assert2((ref_pic->parent->reference & 3) == 3);
                     ff_thread_await_progress(&ref_pic->parent->tf,
                                              FFMIN((row >> 1) - !(row & 1),
                                                    pic_height - 1),
@@ -215,7 +216,7 @@ static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext
     const int mx      = sl->mv_cache[list][scan8[n]][0] + src_x_offset * 8;
     int my            = sl->mv_cache[list][scan8[n]][1] + src_y_offset * 8;
     const int luma_xy = (mx & 3) + ((my & 3) << 2);
-    ptrdiff_t offset  = ((mx >> 2) << pixel_shift) + (my >> 2) * sl->mb_linesize;
+    ptrdiff_t offset  = (mx >> 2) * (1 << pixel_shift) + (my >> 2) * sl->mb_linesize;
     uint8_t *src_y    = pic->data[0] + offset;
     uint8_t *src_cb, *src_cr;
     int extra_width  = 0;
@@ -290,9 +291,9 @@ static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext
         emu |= (my >> 3) < 0 || (my >> 3) + 8 >= (pic_height >> 1);
     }
 
-    src_cb = pic->data[1] + ((mx >> 3) << pixel_shift) +
+    src_cb = pic->data[1] + ((mx >> 3) * (1 << pixel_shift)) +
              (my >> ysh) * sl->mb_uvlinesize;
-    src_cr = pic->data[2] + ((mx >> 3) << pixel_shift) +
+    src_cr = pic->data[2] + ((mx >> 3) * (1 << pixel_shift)) +
              (my >> ysh) * sl->mb_uvlinesize;
 
     if (emu) {
@@ -304,7 +305,7 @@ static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext
     }
     chroma_op(dest_cb, src_cb, sl->mb_uvlinesize,
               height >> (chroma_idc == 1 /* yuv420 */),
-              mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7);
+              mx & 7, ((unsigned)my << (chroma_idc == 2 /* yuv422 */)) & 7);
 
     if (emu) {
         h->vdsp.emulated_edge_mc(sl->edge_emu_buffer, src_cr,
@@ -314,7 +315,7 @@ static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext
         src_cr = sl->edge_emu_buffer;
     }
     chroma_op(dest_cr, src_cr, sl->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
-              mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7);
+              mx & 7, ((unsigned)my << (chroma_idc == 2 /* yuv422 */)) & 7);
 }
 
 static av_always_inline void mc_part_std(const H264Context *h, H264SliceContext *sl,
@@ -424,10 +425,12 @@ static av_always_inline void mc_part_weighted(const H264Context *h, H264SliceCon
             int weight1 = 64 - weight0;
             luma_weight_avg(dest_y, tmp_y, sl->mb_linesize,
                             height, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize,
-                              chroma_height, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize,
-                              chroma_height, 5, weight0, weight1, 0);
+            if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
+                chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize,
+                                  chroma_height, 5, weight0, weight1, 0);
+                chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize,
+                                  chroma_height, 5, weight0, weight1, 0);
+            }
         } else {
             luma_weight_avg(dest_y, tmp_y, sl->mb_linesize, height,
                             sl->luma_log2_weight_denom,
@@ -435,18 +438,20 @@ static av_always_inline void mc_part_weighted(const H264Context *h, H264SliceCon
                             sl->luma_weight[refn1][1][0],
                             sl->luma_weight[refn0][0][1] +
                             sl->luma_weight[refn1][1][1]);
-            chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize, chroma_height,
-                              sl->chroma_log2_weight_denom,
-                              sl->chroma_weight[refn0][0][0][0],
-                              sl->chroma_weight[refn1][1][0][0],
-                              sl->chroma_weight[refn0][0][0][1] +
-                              sl->chroma_weight[refn1][1][0][1]);
-            chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize, chroma_height,
-                              sl->chroma_log2_weight_denom,
-                              sl->chroma_weight[refn0][0][1][0],
-                              sl->chroma_weight[refn1][1][1][0],
-                              sl->chroma_weight[refn0][0][1][1] +
-                              sl->chroma_weight[refn1][1][1][1]);
+            if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
+                chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize, chroma_height,
+                                  sl->chroma_log2_weight_denom,
+                                  sl->chroma_weight[refn0][0][0][0],
+                                  sl->chroma_weight[refn1][1][0][0],
+                                  sl->chroma_weight[refn0][0][0][1] +
+                                  sl->chroma_weight[refn1][1][0][1]);
+                chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize, chroma_height,
+                                  sl->chroma_log2_weight_denom,
+                                  sl->chroma_weight[refn0][0][1][0],
+                                  sl->chroma_weight[refn1][1][1][0],
+                                  sl->chroma_weight[refn0][0][1][1] +
+                                  sl->chroma_weight[refn1][1][1][1]);
+            }
         }
     } else {
         int list     = list1 ? 1 : 0;
@@ -460,15 +465,17 @@ static av_always_inline void mc_part_weighted(const H264Context *h, H264SliceCon
                        sl->luma_log2_weight_denom,
                        sl->luma_weight[refn][list][0],
                        sl->luma_weight[refn][list][1]);
-        if (sl->use_weight_chroma) {
-            chroma_weight_op(dest_cb, sl->mb_uvlinesize, chroma_height,
-                             sl->chroma_log2_weight_denom,
-                             sl->chroma_weight[refn][list][0][0],
-                             sl->chroma_weight[refn][list][0][1]);
-            chroma_weight_op(dest_cr, sl->mb_uvlinesize, chroma_height,
-                             sl->chroma_log2_weight_denom,
-                             sl->chroma_weight[refn][list][1][0],
-                             sl->chroma_weight[refn][list][1][1]);
+        if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
+            if (sl->use_weight_chroma) {
+                chroma_weight_op(dest_cb, sl->mb_uvlinesize, chroma_height,
+                                 sl->chroma_log2_weight_denom,
+                                 sl->chroma_weight[refn][list][0][0],
+                                 sl->chroma_weight[refn][list][0][1]);
+                chroma_weight_op(dest_cr, sl->mb_uvlinesize, chroma_height,
+                                 sl->chroma_log2_weight_denom,
+                                 sl->chroma_weight[refn][list][1][0],
+                                 sl->chroma_weight[refn][list][1][1]);
+            }
         }
     }
 }
@@ -484,7 +491,7 @@ static av_always_inline void prefetch_motion(const H264Context *h, H264SliceCont
         const int mx  = (sl->mv_cache[list][scan8[0]][0] >> 2) + 16 * sl->mb_x + 8;
         const int my  = (sl->mv_cache[list][scan8[0]][1] >> 2) + 16 * sl->mb_y;
         uint8_t **src = sl->ref_list[list][refn].data;
-        int off       = (mx << pixel_shift) +
+        int off       =  mx * (1<< pixel_shift) +
                         (my + (sl->mb_x & 3) * 4) * sl->mb_linesize +
                         (64 << pixel_shift);
         h->vdsp.prefetch(src[0] + off, sl->linesize, 4);
@@ -492,9 +499,7 @@ static av_always_inline void prefetch_motion(const H264Context *h, H264SliceCont
             h->vdsp.prefetch(src[1] + off, sl->linesize, 4);
             h->vdsp.prefetch(src[2] + off, sl->linesize, 4);
         } else {
-            off = ((mx >> 1) << pixel_shift) +
-                  ((my >> 1) + (sl->mb_x & 7)) * sl->uvlinesize +
-                  (64 << pixel_shift);
+            off= ((mx>>1)+64) * (1<<pixel_shift) + ((my>>1) + (sl->mb_x&7))*sl->uvlinesize;
             h->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
         }
     }
@@ -561,10 +566,8 @@ static av_always_inline void xchg_mb_border(const H264Context *h, H264SliceConte
             XCHG(sl->top_borders[top_idx][sl->mb_x + 1],
                  src_y + (17 << pixel_shift), 1);
         }
-    }
-    if (simple || !CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
-        if (chroma444) {
-            if (deblock_top) {
+        if (simple || !CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
+            if (chroma444) {
                 if (deblock_topleft) {
                     XCHG(top_border_m1 + (24 << pixel_shift), src_cb - (7 << pixel_shift), 1);
                     XCHG(top_border_m1 + (40 << pixel_shift), src_cr - (7 << pixel_shift), 1);
@@ -577,9 +580,7 @@ static av_always_inline void xchg_mb_border(const H264Context *h, H264SliceConte
                     XCHG(sl->top_borders[top_idx][sl->mb_x + 1] + (16 << pixel_shift), src_cb + (17 << pixel_shift), 1);
                     XCHG(sl->top_borders[top_idx][sl->mb_x + 1] + (32 << pixel_shift), src_cr + (17 << pixel_shift), 1);
                 }
-            }
-        } else {
-            if (deblock_top) {
+            } else {
                 if (deblock_topleft) {
                     XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
                     XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
@@ -637,7 +638,12 @@ static av_always_inline void hl_decode_mb_predict_luma(const H264Context *h,
                 uint8_t *const ptr = dest_y + block_offset[i];
                 const int dir      = sl->intra4x4_pred_mode_cache[scan8[i]];
                 if (transform_bypass && h->sps.profile_idc == 244 && dir <= 1) {
-                    h->hpc.pred8x8l_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
+                    if (h->x264_build != -1) {
+                        h->hpc.pred8x8l_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
+                    } else
+                        h->hpc.pred8x8l_filter_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift),
+                                                        (sl-> topleft_samples_available << i) & 0x8000,
+                                                        (sl->topright_samples_available << i) & 0x4000, linesize);
                 } else {
                     const int nnz = sl->non_zero_count_cache[scan8[i + p * 16]];
                     h->hpc.pred8x8l[dir](ptr, (sl->topleft_samples_available << i) & 0x8000,
@@ -670,7 +676,7 @@ static av_always_inline void hl_decode_mb_predict_luma(const H264Context *h,
                     uint64_t tr_high;
                     if (dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED) {
                         const int topright_avail = (sl->topright_samples_available << i) & 0x8000;
-                        assert(sl->mb_y || linesize <= block_offset[i]);
+                        av_assert2(sl->mb_y || linesize <= block_offset[i]);
                         if (!topright_avail) {
                             if (pixel_shift) {
                                 tr_high  = ((uint16_t *)ptr)[3 - linesize / 2] * 0x0001000100010001ULL;
diff --git a/libavcodec/h264_mb_template.c b/libavcodec/h264_mb_template.c
index 3b0b880647..54420b9022 100644
--- a/libavcodec/h264_mb_template.c
+++ b/libavcodec/h264_mb_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -100,8 +100,8 @@ static av_noinline void FUNC(hl_decode_mb)(const H264Context *h, H264SliceContex
     }
 
     if (!SIMPLE && IS_INTRA_PCM(mb_type)) {
+        const int bit_depth = h->sps.bit_depth_luma;
         if (PIXEL_SHIFT) {
-            const int bit_depth = h->sps.bit_depth_luma;
             int j;
             GetBitContext gb;
             init_get_bits(&gb, sl->intra_pcm_ptr,
@@ -116,13 +116,10 @@ static av_noinline void FUNC(hl_decode_mb)(const H264Context *h, H264SliceContex
                 if (!h->sps.chroma_format_idc) {
                     for (i = 0; i < block_h; i++) {
                         uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cb[j] = 1 << (bit_depth - 1);
-                    }
-                    for (i = 0; i < block_h; i++) {
                         uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cr[j] = 1 << (bit_depth - 1);
+                        for (j = 0; j < 8; j++) {
+                            tmp_cb[j] = tmp_cr[j] = 1 << (bit_depth - 1);
+                        }
                     }
                 } else {
                     for (i = 0; i < block_h; i++) {
@@ -142,9 +139,9 @@ static av_noinline void FUNC(hl_decode_mb)(const H264Context *h, H264SliceContex
                 memcpy(dest_y + i * linesize, sl->intra_pcm_ptr + i * 16, 16);
             if (SIMPLE || !CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
                 if (!h->sps.chroma_format_idc) {
-                    for (i = 0; i < block_h; i++) {
-                        memset(dest_cb + i * uvlinesize, 128, 8);
-                        memset(dest_cr + i * uvlinesize, 128, 8);
+                    for (i = 0; i < 8; i++) {
+                        memset(dest_cb + i * uvlinesize, 1 << (bit_depth - 1), 8);
+                        memset(dest_cr + i * uvlinesize, 1 << (bit_depth - 1), 8);
                     }
                 } else {
                     const uint8_t *src_cb = sl->intra_pcm_ptr + 256;
diff --git a/libavcodec/h264_mc_template.c b/libavcodec/h264_mc_template.c
index a2958598ab..eaead35bb2 100644
--- a/libavcodec/h264_mc_template.c
+++ b/libavcodec/h264_mc_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -74,7 +74,7 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
     const int mb_xy   = sl->mb_xy;
     const int mb_type = h->cur_pic.mb_type[mb_xy];
 
-    assert(IS_INTER(mb_type));
+    av_assert2(IS_INTER(mb_type));
 
     if (HAVE_THREADS && (h->avctx->active_thread_type & FF_THREAD_FRAME))
         await_references(h, sl);
@@ -106,7 +106,7 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
     } else {
         int i;
 
-        assert(IS_8X8(mb_type));
+        av_assert2(IS_8X8(mb_type));
 
         for (i = 0; i < 4; i++) {
             const int sub_mb_type = sl->sub_mb_type[i];
@@ -144,7 +144,7 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
             } else {
                 int j;
-                assert(IS_SUB_4X4(sub_mb_type));
+                av_assert2(IS_SUB_4X4(sub_mb_type));
                 for (j = 0; j < 4; j++) {
                     int sub_x_offset = x_offset + 2 * (j & 1);
                     int sub_y_offset = y_offset + (j & 2);
diff --git a/libavcodec/h264_mp4toannexb_bsf.c b/libavcodec/h264_mp4toannexb_bsf.c
index eeb250f595..a5da84a7af 100644
--- a/libavcodec/h264_mp4toannexb_bsf.c
+++ b/libavcodec/h264_mp4toannexb_bsf.c
@@ -2,20 +2,20 @@
  * H.264 MP4 to Annex B byte stream format filter
  * Copyright (c) 2007 Benoit Fouet <benoit.fouet@free.fr>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,9 +26,25 @@
 #include "avcodec.h"
 
 typedef struct H264BSFContext {
+    int32_t  sps_offset;
+    int32_t  pps_offset;
     uint8_t  length_size;
-    uint8_t  first_idr;
+    uint8_t  new_idr;
+    uint8_t  idr_sps_seen;
+    uint8_t  idr_pps_seen;
     int      extradata_parsed;
+
+    /* When private_spspps is zero then spspps_buf points to global extradata
+       and bsf does replace a global extradata to own-allocated version (default
+       behaviour).
+       When private_spspps is non-zero the bsf uses a private version of spspps buf.
+       This mode necessary when bsf uses in decoder, else bsf has issues after
+       decoder re-initialization. Use the "private_spspps_buf" argument to
+       activate this mode.
+     */
+    int      private_spspps;
+    uint8_t *spspps_buf;
+    uint32_t spspps_size;
 } H264BSFContext;
 
 static int alloc_and_copy(uint8_t **poutbuf, int *poutbuf_size,
@@ -59,7 +75,7 @@ static int alloc_and_copy(uint8_t **poutbuf, int *poutbuf_size,
     return 0;
 }
 
-static int h264_extradata_to_annexb(AVCodecContext *avctx, const int padding)
+static int h264_extradata_to_annexb(H264BSFContext *ctx, AVCodecContext *avctx, const int padding)
 {
     uint16_t unit_size;
     uint64_t total_size                 = 0;
@@ -69,18 +85,14 @@ static int h264_extradata_to_annexb(AVCodecContext *avctx, const int padding)
     static const uint8_t nalu_header[4] = { 0, 0, 0, 1 };
     int length_size = (*extradata++ & 0x3) + 1; // retrieve length coded size
 
-    if (length_size == 3)
-        return AVERROR(EINVAL);
+    ctx->sps_offset = ctx->pps_offset = -1;
 
     /* retrieve sps and pps unit(s) */
     unit_nb = *extradata++ & 0x1f; /* number of sps unit(s) */
     if (!unit_nb) {
-        unit_nb = *extradata++; /* number of pps unit(s) */
-        sps_done++;
-
-        if (unit_nb)
-            pps_seen = 1;
+        goto pps;
     } else {
+        ctx->sps_offset = 0;
         sps_seen = 1;
     }
 
@@ -89,9 +101,15 @@ static int h264_extradata_to_annexb(AVCodecContext *avctx, const int padding)
 
         unit_size   = AV_RB16(extradata);
         total_size += unit_size + 4;
-        if (total_size > INT_MAX - padding ||
-            extradata + 2 + unit_size > avctx->extradata +
-            avctx->extradata_size) {
+        if (total_size > INT_MAX - padding) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Too big extradata size, corrupted stream or invalid MP4/AVCC bitstream\n");
+            av_free(out);
+            return AVERROR(EINVAL);
+        }
+        if (extradata + 2 + unit_size > avctx->extradata + avctx->extradata_size) {
+            av_log(avctx, AV_LOG_ERROR, "Packet header is not contained in global extradata, "
+                   "corrupted stream or invalid MP4/AVCC bitstream\n");
             av_free(out);
             return AVERROR(EINVAL);
         }
@@ -100,16 +118,18 @@ static int h264_extradata_to_annexb(AVCodecContext *avctx, const int padding)
         memcpy(out + total_size - unit_size - 4, nalu_header, 4);
         memcpy(out + total_size - unit_size, extradata + 2, unit_size);
         extradata += 2 + unit_size;
-
+pps:
         if (!unit_nb && !sps_done++) {
             unit_nb = *extradata++; /* number of pps unit(s) */
-            if (unit_nb)
+            if (unit_nb) {
+                ctx->pps_offset = (extradata - 1) - (avctx->extradata + 4);
                 pps_seen = 1;
+            }
         }
     }
 
     if (out)
-        memset(out + total_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+        memset(out + total_size, 0, padding);
 
     if (!sps_seen)
         av_log(avctx, AV_LOG_WARNING,
@@ -121,9 +141,13 @@ static int h264_extradata_to_annexb(AVCodecContext *avctx, const int padding)
                "Warning: PPS NALU missing or invalid. "
                "The resulting stream may not play.\n");
 
-    av_free(avctx->extradata);
-    avctx->extradata      = out;
-    avctx->extradata_size = total_size;
+    if (!ctx->private_spspps) {
+        av_free(avctx->extradata);
+        avctx->extradata      = out;
+        avctx->extradata_size = total_size;
+    }
+    ctx->spspps_buf  = out;
+    ctx->spspps_size = total_size;
 
     return length_size;
 }
@@ -135,6 +159,7 @@ static int h264_mp4toannexb_filter(AVBitStreamFilterContext *bsfc,
                                    int keyframe)
 {
     H264BSFContext *ctx = bsfc->priv_data;
+    int i;
     uint8_t unit_type;
     int32_t nal_size;
     uint32_t cumul_size    = 0;
@@ -150,48 +175,91 @@ static int h264_mp4toannexb_filter(AVBitStreamFilterContext *bsfc,
 
     /* retrieve sps and pps NAL units from extradata */
     if (!ctx->extradata_parsed) {
-        ret = h264_extradata_to_annexb(avctx, AV_INPUT_BUFFER_PADDING_SIZE);
+        if (args && strstr(args, "private_spspps_buf"))
+            ctx->private_spspps = 1;
+
+        ret = h264_extradata_to_annexb(ctx, avctx, AV_INPUT_BUFFER_PADDING_SIZE);
         if (ret < 0)
             return ret;
         ctx->length_size      = ret;
-        ctx->first_idr        = 1;
+        ctx->new_idr          = 1;
+        ctx->idr_sps_seen     = 0;
+        ctx->idr_pps_seen     = 0;
         ctx->extradata_parsed = 1;
     }
 
     *poutbuf_size = 0;
     *poutbuf      = NULL;
     do {
+        ret= AVERROR(EINVAL);
         if (buf + ctx->length_size > buf_end)
             goto fail;
 
-        if (ctx->length_size == 1) {
-            nal_size = buf[0];
-        } else if (ctx->length_size == 2) {
-            nal_size = AV_RB16(buf);
-        } else
-            nal_size = AV_RB32(buf);
+        for (nal_size = 0, i = 0; i<ctx->length_size; i++)
+            nal_size = (nal_size << 8) | buf[i];
 
         buf      += ctx->length_size;
         unit_type = *buf & 0x1f;
 
-        if (buf + nal_size > buf_end || nal_size < 0)
+        if (nal_size > buf_end - buf || nal_size < 0)
             goto fail;
 
-        /* prepend only to the first type 5 NAL unit of an IDR picture */
-        if (ctx->first_idr && unit_type == 5) {
-            if (alloc_and_copy(poutbuf, poutbuf_size,
-                               avctx->extradata, avctx->extradata_size,
-                               buf, nal_size) < 0)
+        if (unit_type == 7)
+            ctx->idr_sps_seen = ctx->new_idr = 1;
+        else if (unit_type == 8) {
+            ctx->idr_pps_seen = ctx->new_idr = 1;
+            /* if SPS has not been seen yet, prepend the AVCC one to PPS */
+            if (!ctx->idr_sps_seen) {
+                if (ctx->sps_offset == -1)
+                    av_log(avctx, AV_LOG_WARNING, "SPS not present in the stream, nor in AVCC, stream may be unreadable\n");
+                else {
+                    if ((ret = alloc_and_copy(poutbuf, poutbuf_size,
+                                         ctx->spspps_buf + ctx->sps_offset,
+                                         ctx->pps_offset != -1 ? ctx->pps_offset : ctx->spspps_size - ctx->sps_offset,
+                                         buf, nal_size)) < 0)
+                        goto fail;
+                    ctx->idr_sps_seen = 1;
+                    goto next_nal;
+                }
+            }
+        }
+
+        /* if this is a new IDR picture following an IDR picture, reset the idr flag.
+         * Just check first_mb_in_slice to be 0 as this is the simplest solution.
+         * This could be checking idr_pic_id instead, but would complexify the parsing. */
+        if (!ctx->new_idr && unit_type == 5 && (buf[1] & 0x80))
+            ctx->new_idr = 1;
+
+        /* prepend only to the first type 5 NAL unit of an IDR picture, if no sps/pps are already present */
+        if (ctx->new_idr && unit_type == 5 && !ctx->idr_sps_seen && !ctx->idr_pps_seen) {
+            if ((ret=alloc_and_copy(poutbuf, poutbuf_size,
+                               ctx->spspps_buf, ctx->spspps_size,
+                               buf, nal_size)) < 0)
+                goto fail;
+            ctx->new_idr = 0;
+        /* if only SPS has been seen, also insert PPS */
+        } else if (ctx->new_idr && unit_type == 5 && ctx->idr_sps_seen && !ctx->idr_pps_seen) {
+            if (ctx->pps_offset == -1) {
+                av_log(avctx, AV_LOG_WARNING, "PPS not present in the stream, nor in AVCC, stream may be unreadable\n");
+                if ((ret = alloc_and_copy(poutbuf, poutbuf_size,
+                                     NULL, 0, buf, nal_size)) < 0)
+                    goto fail;
+            } else if ((ret = alloc_and_copy(poutbuf, poutbuf_size,
+                                        ctx->spspps_buf + ctx->pps_offset, ctx->spspps_size - ctx->pps_offset,
+                                        buf, nal_size)) < 0)
                 goto fail;
-            ctx->first_idr = 0;
         } else {
-            if (alloc_and_copy(poutbuf, poutbuf_size,
-                               NULL, 0, buf, nal_size) < 0)
+            if ((ret=alloc_and_copy(poutbuf, poutbuf_size,
+                               NULL, 0, buf, nal_size)) < 0)
                 goto fail;
-            if (!ctx->first_idr && unit_type == 1)
-                ctx->first_idr = 1;
+            if (!ctx->new_idr && unit_type == 1) {
+                ctx->new_idr = 1;
+                ctx->idr_sps_seen = 0;
+                ctx->idr_pps_seen = 0;
+            }
         }
 
+next_nal:
         buf        += nal_size;
         cumul_size += nal_size + ctx->length_size;
     } while (cumul_size < buf_size);
@@ -201,11 +269,19 @@ static int h264_mp4toannexb_filter(AVBitStreamFilterContext *bsfc,
 fail:
     av_freep(poutbuf);
     *poutbuf_size = 0;
-    return AVERROR(EINVAL);
+    return ret;
+}
+
+static void h264_mp4toannexb_filter_close(AVBitStreamFilterContext *bsfc)
+{
+    H264BSFContext *ctx = bsfc->priv_data;
+    if (ctx->private_spspps)
+        av_free(ctx->spspps_buf);
 }
 
 AVBitStreamFilter ff_h264_mp4toannexb_bsf = {
-    "h264_mp4toannexb",
-    sizeof(H264BSFContext),
-    h264_mp4toannexb_filter,
+    .name           = "h264_mp4toannexb",
+    .priv_data_size = sizeof(H264BSFContext),
+    .filter         = h264_mp4toannexb_filter,
+    .close          = h264_mp4toannexb_filter_close,
 };
diff --git a/libavcodec/h264_mvpred.h b/libavcodec/h264_mvpred.h
index 5e8f237f63..763746cc26 100644
--- a/libavcodec/h264_mvpred.h
+++ b/libavcodec/h264_mvpred.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... motion vector predicion
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,8 +32,8 @@
 #include "avcodec.h"
 #include "h264.h"
 #include "mpegutils.h"
+#include "libavutil/avassert.h"
 
-#include <assert.h>
 
 static av_always_inline int fetch_diagonal_mv(const H264Context *h, H264SliceContext *sl,
                                               const int16_t **C,
@@ -106,7 +106,7 @@ static av_always_inline void pred_motion(const H264Context *const h,
     const int16_t *C;
     int diagonal_ref, match_count;
 
-    assert(part_width == 1 || part_width == 2 || part_width == 4);
+    av_assert2(part_width == 1 || part_width == 2 || part_width == 4);
 
 /* mv_cache
  * B . . A T T T T
@@ -488,7 +488,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
                 } else {
                     int left_typei = h->cur_pic.mb_type[left_xy[LTOP] + h->mb_stride];
 
-                    assert(left_xy[LTOP] == left_xy[LBOT]);
+                    av_assert2(left_xy[LTOP] == left_xy[LBOT]);
                     if (!((left_typei & type_mask) && (left_type[LTOP] & type_mask))) {
                         sl->topleft_samples_available &= 0xDF5F;
                         sl->left_samples_available    &= 0x5F5F;
@@ -613,7 +613,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
             int16_t(*mv)[2]       = h->cur_pic.motion_val[list];
             if (!USES_LIST(mb_type, list))
                 continue;
-            assert(!(IS_DIRECT(mb_type) && !sl->direct_spatial_mv_pred));
+            av_assert2(!(IS_DIRECT(mb_type) && !sl->direct_spatial_mv_pred));
 
             if (USES_LIST(top_type, list)) {
                 const int b_xy = h->mb2b_xy[top_xy] + 3 * b_stride;
@@ -670,7 +670,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
                 ref_cache[4 - 1 * 8] = topright_type ? LIST_NOT_USED
                                                      : PART_NOT_AVAILABLE;
             }
-            if (ref_cache[4 - 1 * 8] < 0) {
+            if(ref_cache[2 - 1*8] < 0 || ref_cache[4 - 1 * 8] < 0) {
                 if (USES_LIST(topleft_type, list)) {
                     const int b_xy  = h->mb2b_xy[topleft_xy] + 3 + b_stride +
                                       (sl->topleft_partition & 2 * b_stride);
@@ -771,7 +771,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
 
 #define MAP_F2F(idx, mb_type)                                           \
     if (!IS_INTERLACED(mb_type) && sl->ref_cache[list][idx] >= 0) {     \
-        sl->ref_cache[list][idx]    <<= 1;                              \
+        sl->ref_cache[list][idx]     *= 2;                              \
         sl->mv_cache[list][idx][1]   /= 2;                              \
         sl->mvd_cache[list][idx][1] >>= 1;                              \
     }
@@ -783,7 +783,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
 #define MAP_F2F(idx, mb_type)                                           \
     if (IS_INTERLACED(mb_type) && sl->ref_cache[list][idx] >= 0) {      \
         sl->ref_cache[list][idx]    >>= 1;                              \
-        sl->mv_cache[list][idx][1]  <<= 1;                              \
+        sl->mv_cache[list][idx][1]   *= 2;                              \
         sl->mvd_cache[list][idx][1] <<= 1;                              \
     }
 
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index f44435532e..19d1aa3f21 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... parser
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,8 @@
  * @author Michael Niedermayer <michaelni@gmx.at>
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "libavutil/attributes.h"
 #include "parser.h"
 #include "h264data.h"
@@ -32,8 +34,6 @@
 #include "internal.h"
 #include "mpegutils.h"
 
-#include <assert.h>
-
 typedef struct H264ParseContext {
     H264Context h;
     ParseContext pc;
@@ -45,18 +45,36 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
                                int buf_size)
 {
     H264Context *h = &p->h;
-    int i;
+    int i, j;
     uint32_t state;
     ParseContext *pc = &p->pc;
+
+    int next_avc= h->is_avc ? 0 : buf_size;
 //    mb_addr= pc->mb_addr - 1;
     state = pc->state;
     if (state > 13)
         state = 7;
 
+    if (h->is_avc && !h->nal_length_size)
+        av_log(h->avctx, AV_LOG_ERROR, "AVC-parser: nal length size invalid\n");
+
     for (i = 0; i < buf_size; i++) {
+        if (i >= next_avc) {
+            int nalsize = 0;
+            i = next_avc;
+            for (j = 0; j < h->nal_length_size; j++)
+                nalsize = (nalsize << 8) | buf[i++];
+            if (nalsize <= 0 || nalsize > buf_size - i) {
+                av_log(h->avctx, AV_LOG_ERROR, "AVC-parser: nal size %d remaining %d\n", nalsize, buf_size - i);
+                return buf_size;
+            }
+            next_avc = i + nalsize;
+            state    = 5;
+        }
+
         if (state == 7) {
-            i += h->h264dsp.startcode_find_candidate(buf + i, buf_size - i);
-            if (i < buf_size)
+            i += h->h264dsp.startcode_find_candidate(buf + i, next_avc - i);
+            if (i < next_avc)
                 state = 2;
         } else if (state <= 2) {
             if (buf[i] == 1)
@@ -75,27 +93,40 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
                 }
             } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA ||
                        nalu_type == NAL_IDR_SLICE) {
+                state += 8;
+                continue;
+            }
+            state = 7;
+        } else {
+            h->parse_history[h->parse_history_count++]= buf[i];
+            if (h->parse_history_count>5) {
+                unsigned int mb, last_mb= h->parse_last_mb;
+                GetBitContext gb;
+
+                init_get_bits(&gb, h->parse_history, 8*h->parse_history_count);
+                h->parse_history_count=0;
+                mb= get_ue_golomb_long(&gb);
+                h->parse_last_mb= mb;
                 if (pc->frame_start_found) {
-                    state += 8;
-                    continue;
+                    if (mb <= last_mb)
+                        goto found;
                 } else
                     pc->frame_start_found = 1;
+                state = 7;
             }
-            state = 7;
-        } else {
-            // first_mb_in_slice is 0, probably the first nal of a new slice
-            if (buf[i] & 0x80)
-                goto found;
-            state = 7;
         }
     }
     pc->state = state;
+    if (h->is_avc)
+        return next_avc;
     return END_NOT_FOUND;
 
 found:
     pc->state             = 7;
     pc->frame_start_found = 0;
-    return i - (state & 5);
+    if (h->is_avc)
+        return next_avc;
+    return i - (state & 5) - 5 * (state > 7);
 }
 
 static int scan_mmco_reset(AVCodecParserContext *s)
@@ -180,16 +211,17 @@ static int scan_mmco_reset(AVCodecParserContext *s)
  */
 static inline int parse_nal_units(AVCodecParserContext *s,
                                   AVCodecContext *avctx,
-                                  const uint8_t *buf, int buf_size)
+                                  const uint8_t * const buf, int buf_size)
 {
     H264ParseContext *p = s->priv_data;
     H264Context      *h = &p->h;
     H264SliceContext *sl = &h->slice_ctx[0];
-    const uint8_t *buf_end = buf + buf_size;
+    int buf_index, next_avc;
     unsigned int pps_id;
     unsigned int slice_type;
     int state = -1, got_reset = 0;
     const uint8_t *ptr;
+    int q264 = buf_size >=4 && !memcmp("Q264", buf, 4);
     int field_poc[2];
 
     /* set some sane default values */
@@ -199,17 +231,31 @@ static inline int parse_nal_units(AVCodecParserContext *s,
 
     h->avctx = avctx;
     ff_h264_reset_sei(h);
+    h->sei_fpa.frame_packing_arrangement_cancel_flag = -1;
 
     if (!buf_size)
         return 0;
 
+    buf_index     = 0;
+    next_avc      = h->is_avc ? 0 : buf_size;
     for (;;) {
-        int src_length, dst_length, consumed;
-        buf = avpriv_find_start_code(buf, buf_end, &state);
-        if (buf >= buf_end)
-            break;
-        --buf;
-        src_length = buf_end - buf;
+        int src_length, dst_length, consumed, nalsize = 0;
+
+        if (buf_index >= next_avc) {
+            nalsize = get_avc_nalsize(h, buf, buf_size, &buf_index);
+            if (nalsize < 0)
+                break;
+            next_avc = buf_index + nalsize;
+        } else {
+            buf_index = find_start_code(buf, buf_size, buf_index, next_avc);
+            if (buf_index >= buf_size)
+                break;
+            if (buf_index >= next_avc)
+                continue;
+        }
+        src_length = next_avc - buf_index;
+
+        state = buf[buf_index];
         switch (state & 0x1f) {
         case NAL_SLICE:
         case NAL_IDR_SLICE:
@@ -226,14 +272,17 @@ static inline int parse_nal_units(AVCodecParserContext *s,
             }
             break;
         }
-        ptr = ff_h264_decode_nal(h, sl, buf, &dst_length, &consumed, src_length);
+        ptr = ff_h264_decode_nal(h, sl, buf + buf_index, &dst_length,
+                                 &consumed, src_length);
         if (!ptr || dst_length < 0)
             break;
 
+        buf_index += consumed;
+
         init_get_bits(&h->gb, ptr, 8 * dst_length);
         switch (h->nal_unit_type) {
         case NAL_SPS:
-            ff_h264_decode_seq_parameter_set(h);
+            ff_h264_decode_seq_parameter_set(h, 0);
             break;
         case NAL_PPS:
             ff_h264_decode_picture_parameter_set(h, h->gb.size_in_bits);
@@ -251,7 +300,7 @@ static inline int parse_nal_units(AVCodecParserContext *s,
         /* fall through */
         case NAL_SLICE:
             init_get_bits(&sl->gb, ptr, 8 * dst_length);
-            get_ue_golomb(&sl->gb);  // skip first_mb_in_slice
+            get_ue_golomb_long(&sl->gb);  // skip first_mb_in_slice
             slice_type   = get_ue_golomb_31(&sl->gb);
             s->pict_type = golomb_to_pict_type[slice_type % 5];
             if (h->sei_recovery_frame_cnt >= 0) {
@@ -278,6 +327,9 @@ static inline int parse_nal_units(AVCodecParserContext *s,
             h->sps       = *h->sps_buffers[h->pps.sps_id];
             h->frame_num = get_bits(&sl->gb, h->sps.log2_max_frame_num);
 
+            if(h->sps.ref_frame_count <= 1 && h->pps.ref_count[0] <= 1 && s->pict_type == AV_PICTURE_TYPE_I)
+                s->key_frame = 1;
+
             s->coded_width  = 16 * h->sps.mb_width;
             s->coded_height = 16 * h->sps.mb_height;
             s->width        = s->coded_width  - (h->sps.crop_right + h->sps.crop_left);
@@ -431,10 +483,11 @@ static inline int parse_nal_units(AVCodecParserContext *s,
 
             return 0; /* no need to evaluate the rest */
         }
-        buf += consumed;
     }
+    if (q264)
+        return 0;
     /* didn't find a picture! */
-    av_log(h->avctx, AV_LOG_ERROR, "missing picture in access unit\n");
+    av_log(h->avctx, AV_LOG_ERROR, "missing picture in access unit with size %d\n", buf_size);
     return -1;
 }
 
@@ -452,14 +505,13 @@ static int h264_parse(AVCodecParserContext *s,
         p->got_first = 1;
         if (avctx->extradata_size) {
             h->avctx = avctx;
-            // must be done like in the decoder.
-            // otherwise opening the parser, creating extradata,
-            // and then closing and opening again
+            // must be done like in decoder, otherwise opening the parser,
+            // letting it create extradata and then closing and opening again
             // will cause has_b_frames to be always set.
-            // NB: estimate_timings_from_pts behaves exactly like this.
+            // Note that estimate_timings_from_pts does exactly this.
             if (!avctx->has_b_frames)
                 h->low_delay = 1;
-            ff_h264_decode_extradata(h);
+            ff_h264_decode_extradata(h, avctx->extradata, avctx->extradata_size);
         }
     }
 
@@ -475,13 +527,15 @@ static int h264_parse(AVCodecParserContext *s,
         }
 
         if (next < 0 && next != END_NOT_FOUND) {
-            assert(pc->last_index + next >= 0);
+            av_assert1(pc->last_index + next >= 0);
             h264_find_frame_end(p, &pc->buffer[pc->last_index + next], -next); // update state
         }
     }
 
     parse_nal_units(s, avctx, buf, buf_size);
 
+    if (avctx->framerate.num)
+        avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
     if (h->sei_cpb_removal_delay >= 0) {
         s->dts_sync_point    = h->sei_buffering_period_present;
         s->dts_ref_dts_delta = h->sei_cpb_removal_delay;
@@ -504,41 +558,47 @@ static int h264_parse(AVCodecParserContext *s,
 static int h264_split(AVCodecContext *avctx,
                       const uint8_t *buf, int buf_size)
 {
-    int i;
     uint32_t state = -1;
     int has_sps    = 0;
+    int has_pps    = 0;
+    const uint8_t *ptr = buf, *end = buf + buf_size;
+    int nalu_type;
 
-    for (i = 0; i <= buf_size; i++) {
-        if ((state & 0xFFFFFF1F) == 0x107)
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
+        if ((state & 0xFFFFFF00) != 0x100)
+            break;
+        nalu_type = state & 0x1F;
+        if (nalu_type == NAL_SPS) {
             has_sps = 1;
-        /*  if((state&0xFFFFFF1F) == 0x101 ||
-         *     (state&0xFFFFFF1F) == 0x102 ||
-         *     (state&0xFFFFFF1F) == 0x105) {
+        } else if (nalu_type == NAL_PPS)
+            has_pps = 1;
+        /* else if (nalu_type == 0x01 ||
+         *     nalu_type == 0x02 ||
+         *     nalu_type == 0x05) {
          *  }
          */
-        if ((state & 0xFFFFFF00) == 0x100 && (state & 0xFFFFFF1F) != 0x106 &&
-            (state & 0xFFFFFF1F) != 0x107 && (state & 0xFFFFFF1F) != 0x108 &&
-            (state & 0xFFFFFF1F) != 0x109 && (state & 0xFFFFFF1F) != 0x10d &&
-            (state & 0xFFFFFF1F) != 0x10f) {
+        else if ((nalu_type != NAL_SEI || has_pps) &&
+                  nalu_type != NAL_AUD && nalu_type != NAL_SPS_EXT &&
+                  nalu_type != 0x0f) {
             if (has_sps) {
-                while (i > 4 && buf[i - 5] == 0)
-                    i--;
-                return i - 4;
+                while (ptr - 4 > buf && ptr[-5] == 0)
+                    ptr--;
+                return ptr - 4 - buf;
             }
         }
-        if (i < buf_size)
-            state = (state << 8) | buf[i];
     }
+
     return 0;
 }
 
-static void close(AVCodecParserContext *s)
+static void h264_close(AVCodecParserContext *s)
 {
     H264ParseContext *p = s->priv_data;
     H264Context      *h = &p->h;
     ParseContext *pc = &p->pc;
 
-    av_free(pc->buffer);
+    av_freep(&pc->buffer);
     ff_h264_free_context(h);
 }
 
@@ -562,6 +622,6 @@ AVCodecParser ff_h264_parser = {
     .priv_data_size = sizeof(H264ParseContext),
     .parser_init    = init,
     .parser_parse   = h264_parse,
-    .parser_close   = close,
+    .parser_close   = h264_close,
     .split          = h264_split,
 };
diff --git a/libavcodec/h264_picture.c b/libavcodec/h264_picture.c
index 3e2c84e566..731d780e67 100644
--- a/libavcodec/h264_picture.c
+++ b/libavcodec/h264_picture.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,6 +42,7 @@
 #include "mpegutils.h"
 #include "rectangle.h"
 #include "thread.h"
+#include "vdpau_compat.h"
 
 void ff_h264_unref_picture(H264Context *h, H264Picture *pic)
 {
@@ -114,7 +115,12 @@ int ff_h264_ref_picture(H264Context *h, H264Picture *dst, H264Picture *src)
     dst->mbaff         = src->mbaff;
     dst->field_picture = src->field_picture;
     dst->reference     = src->reference;
+    dst->crop          = src->crop;
+    dst->crop_left     = src->crop_left;
+    dst->crop_top      = src->crop_top;
     dst->recovered     = src->recovered;
+    dst->invalid_gap   = src->invalid_gap;
+    dst->sei_recovery_frame_cnt = src->sei_recovery_frame_cnt;
 
     return 0;
 fail:
@@ -122,11 +128,13 @@ fail:
     return ret;
 }
 
-#if CONFIG_ERROR_RESILIENCE
-static void h264_set_erpic(ERPicture *dst, H264Picture *src)
+void ff_h264_set_erpic(ERPicture *dst, H264Picture *src)
 {
+#if CONFIG_ERROR_RESILIENCE
     int i;
 
+    memset(dst, 0, sizeof(*dst));
+
     if (!src)
         return;
 
@@ -140,8 +148,8 @@ static void h264_set_erpic(ERPicture *dst, H264Picture *src)
 
     dst->mb_type = src->mb_type;
     dst->field_picture = src->field_picture;
-}
 #endif /* CONFIG_ERROR_RESILIENCE */
+}
 
 int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup)
 {
@@ -149,9 +157,11 @@ int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup)
     int err = 0;
     h->mb_y = 0;
 
-    if (!in_setup && !h->droppable)
-        ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
-                                  h->picture_structure == PICT_BOTTOM_FIELD);
+#if FF_API_CAP_VDPAU
+    if (CONFIG_H264_VDPAU_DECODER &&
+        h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU)
+        ff_vdpau_h264_set_reference_frames(h);
+#endif
 
     if (in_setup || !(avctx->active_thread_type & FF_THREAD_FRAME)) {
         if (!h->droppable) {
@@ -164,12 +174,20 @@ int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup)
     }
 
     if (avctx->hwaccel) {
-        if (avctx->hwaccel->end_frame(avctx) < 0)
+        err = avctx->hwaccel->end_frame(avctx);
+        if (err < 0)
             av_log(avctx, AV_LOG_ERROR,
                    "hardware accelerator failed to decode picture\n");
     }
 
+#if FF_API_CAP_VDPAU
+    if (CONFIG_H264_VDPAU_DECODER &&
+        h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU)
+        ff_vdpau_h264_picture_complete(h);
+#endif
+
 #if CONFIG_ERROR_RESILIENCE
+    av_assert0(sl == h->slice_ctx);
     /*
      * FIXME: Error handling code does not seem to support interlaced
      * when slices span multiple rows
@@ -182,16 +200,36 @@ int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup)
      * past end by one (callers fault) and resync_mb_y != 0
      * causes problems for the first MB line, too.
      */
-    if (!FIELD_PICTURE(h) && h->enable_er) {
-        h264_set_erpic(&sl->er.cur_pic, h->cur_pic_ptr);
-        h264_set_erpic(&sl->er.last_pic,
-                       sl->ref_count[0] ? sl->ref_list[0][0].parent : NULL);
-        h264_set_erpic(&sl->er.next_pic,
-                       sl->ref_count[1] ? sl->ref_list[1][0].parent : NULL);
+    if (!FIELD_PICTURE(h) && h->current_slice && !h->sps.new && h->enable_er) {
+        int use_last_pic = h->last_pic_for_ec.f->buf[0] && !sl->ref_count[0];
+
+        ff_h264_set_erpic(&sl->er.cur_pic, h->cur_pic_ptr);
+
+        if (use_last_pic) {
+            ff_h264_set_erpic(&sl->er.last_pic, &h->last_pic_for_ec);
+            sl->ref_list[0][0].parent = &h->last_pic_for_ec;
+            memcpy(sl->ref_list[0][0].data, h->last_pic_for_ec.f->data, sizeof(sl->ref_list[0][0].data));
+            memcpy(sl->ref_list[0][0].linesize, h->last_pic_for_ec.f->linesize, sizeof(sl->ref_list[0][0].linesize));
+            sl->ref_list[0][0].reference = h->last_pic_for_ec.reference;
+        } else if (sl->ref_count[0]) {
+            ff_h264_set_erpic(&sl->er.last_pic, sl->ref_list[0][0].parent);
+        } else
+            ff_h264_set_erpic(&sl->er.last_pic, NULL);
+
+        if (sl->ref_count[1])
+            ff_h264_set_erpic(&sl->er.next_pic, sl->ref_list[1][0].parent);
+
+        sl->er.ref_count = sl->ref_count[0];
+
         ff_er_frame_end(&sl->er);
+        if (use_last_pic)
+            memset(&sl->ref_list[0][0], 0, sizeof(sl->ref_list[0][0]));
     }
 #endif /* CONFIG_ERROR_RESILIENCE */
 
+    if (!in_setup && !h->droppable)
+        ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
+                                  h->picture_structure == PICT_BOTTOM_FIELD);
     emms_c();
 
     h->current_slice = 0;
diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c
index fa9fa7d25b..e37a6d62fa 100644
--- a/libavcodec/h264_ps.c
+++ b/libavcodec/h264_ps.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... parameter set decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,32 +31,12 @@
 #include "internal.h"
 #include "avcodec.h"
 #include "h264.h"
-#include "h264data.h" //FIXME FIXME FIXME (just for zigzag_scan)
+#include "h264data.h"
 #include "golomb.h"
 
 #define MAX_LOG2_MAX_FRAME_NUM    (12 + 4)
 #define MIN_LOG2_MAX_FRAME_NUM    4
 
-static const AVRational pixel_aspect[17] = {
-    {   0,  1 },
-    {   1,  1 },
-    {  12, 11 },
-    {  10, 11 },
-    {  16, 11 },
-    {  40, 33 },
-    {  24, 11 },
-    {  20, 11 },
-    {  32, 11 },
-    {  80, 33 },
-    {  18, 11 },
-    {  15, 11 },
-    {  64, 33 },
-    { 160, 99 },
-    {   4,  3 },
-    {   3,  2 },
-    {   2,  1 },
-};
-
 #define QP(qP, depth) ((qP) + 6 * ((depth) - 8))
 
 #define CHROMA_QP_TABLE_END(d)                                          \
@@ -70,13 +50,35 @@ static const AVRational pixel_aspect[17] = {
     QP(37, d), QP(37, d), QP(37, d), QP(38, d), QP(38, d), QP(38, d),   \
     QP(39, d), QP(39, d), QP(39, d), QP(39, d)
 
-const uint8_t ff_h264_chroma_qp[3][QP_MAX_NUM + 1] = {
+const uint8_t ff_h264_chroma_qp[7][QP_MAX_NUM + 1] = {
     { CHROMA_QP_TABLE_END(8) },
     { 0, 1, 2, 3, 4, 5,
       CHROMA_QP_TABLE_END(9) },
-    { 0, 1, 2, 3, 4, 5,
+    { 0, 1, 2, 3,  4,  5,
       6, 7, 8, 9, 10, 11,
       CHROMA_QP_TABLE_END(10) },
+    { 0,  1, 2, 3,  4,  5,
+      6,  7, 8, 9, 10, 11,
+      12,13,14,15, 16, 17,
+      CHROMA_QP_TABLE_END(11) },
+    { 0,  1, 2, 3,  4,  5,
+      6,  7, 8, 9, 10, 11,
+      12,13,14,15, 16, 17,
+      18,19,20,21, 22, 23,
+      CHROMA_QP_TABLE_END(12) },
+    { 0,  1, 2, 3,  4,  5,
+      6,  7, 8, 9, 10, 11,
+      12,13,14,15, 16, 17,
+      18,19,20,21, 22, 23,
+      24,25,26,27, 28, 29,
+      CHROMA_QP_TABLE_END(13) },
+    { 0,  1, 2, 3,  4,  5,
+      6,  7, 8, 9, 10, 11,
+      12,13,14,15, 16, 17,
+      18,19,20,21, 22, 23,
+      24,25,26,27, 28, 29,
+      30,31,32,33, 34, 35,
+      CHROMA_QP_TABLE_END(14) },
 };
 
 static const uint8_t default_scaling4[2][16] = {
@@ -142,8 +144,8 @@ static inline int decode_vui_parameters(H264Context *h, SPS *sps)
         if (aspect_ratio_idc == EXTENDED_SAR) {
             sps->sar.num = get_bits(&h->gb, 16);
             sps->sar.den = get_bits(&h->gb, 16);
-        } else if (aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)) {
-            sps->sar = pixel_aspect[aspect_ratio_idc];
+        } else if (aspect_ratio_idc < FF_ARRAY_ELEMS(ff_h264_pixel_aspect)) {
+            sps->sar = ff_h264_pixel_aspect[aspect_ratio_idc];
         } else {
             av_log(h->avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
             return AVERROR_INVALIDDATA;
@@ -182,15 +184,23 @@ static inline int decode_vui_parameters(H264Context *h, SPS *sps)
         get_ue_golomb(&h->gb);  /* chroma_sample_location_type_bottom_field */
     }
 
+    if (show_bits1(&h->gb) && get_bits_left(&h->gb) < 10) {
+        av_log(h->avctx, AV_LOG_WARNING, "Truncated VUI\n");
+        return 0;
+    }
+
     sps->timing_info_present_flag = get_bits1(&h->gb);
     if (sps->timing_info_present_flag) {
-        sps->num_units_in_tick = get_bits_long(&h->gb, 32);
-        sps->time_scale        = get_bits_long(&h->gb, 32);
-        if (!sps->num_units_in_tick || !sps->time_scale) {
+        unsigned num_units_in_tick = get_bits_long(&h->gb, 32);
+        unsigned time_scale        = get_bits_long(&h->gb, 32);
+        if (!num_units_in_tick || !time_scale) {
             av_log(h->avctx, AV_LOG_ERROR,
-                   "time_scale/num_units_in_tick invalid or unsupported (%"PRIu32"/%"PRIu32")\n",
-                   sps->time_scale, sps->num_units_in_tick);
-            return AVERROR_INVALIDDATA;
+                   "time_scale/num_units_in_tick invalid or unsupported (%u/%u)\n",
+                   time_scale, num_units_in_tick);
+            sps->timing_info_present_flag = 0;
+        } else {
+            sps->num_units_in_tick = num_units_in_tick;
+            sps->time_scale = time_scale;
         }
         sps->fixed_frame_rate_flag = get_bits1(&h->gb);
     }
@@ -207,7 +217,8 @@ static inline int decode_vui_parameters(H264Context *h, SPS *sps)
         sps->vcl_hrd_parameters_present_flag)
         get_bits1(&h->gb);     /* low_delay_hrd_flag */
     sps->pic_struct_present_flag = get_bits1(&h->gb);
-
+    if (!get_bits_left(&h->gb))
+        return 0;
     sps->bitstream_restriction_flag = get_bits1(&h->gb);
     if (sps->bitstream_restriction_flag) {
         get_bits1(&h->gb);     /* motion_vectors_over_pic_boundaries_flag */
@@ -232,11 +243,6 @@ static inline int decode_vui_parameters(H264Context *h, SPS *sps)
             return AVERROR_INVALIDDATA;
         }
     }
-    if (get_bits_left(&h->gb) < 0) {
-        av_log(h->avctx, AV_LOG_ERROR,
-               "Overread VUI by %d bits\n", -get_bits_left(&h->gb));
-        return AVERROR_INVALIDDATA;
-    }
 
     return 0;
 }
@@ -283,26 +289,35 @@ static void decode_scaling_matrices(H264Context *h, SPS *sps,
         decode_scaling_list(h, scaling_matrix4[5], 16, default_scaling4[1], scaling_matrix4[4]); // Inter, Cb
         if (is_sps || pps->transform_8x8_mode) {
             decode_scaling_list(h, scaling_matrix8[0], 64, default_scaling8[0], fallback[2]); // Intra, Y
-            if (sps->chroma_format_idc == 3) {
-                decode_scaling_list(h, scaling_matrix8[1], 64, default_scaling8[0], scaling_matrix8[0]); // Intra, Cr
-                decode_scaling_list(h, scaling_matrix8[2], 64, default_scaling8[0], scaling_matrix8[1]); // Intra, Cb
-            }
             decode_scaling_list(h, scaling_matrix8[3], 64, default_scaling8[1], fallback[3]); // Inter, Y
             if (sps->chroma_format_idc == 3) {
+                decode_scaling_list(h, scaling_matrix8[1], 64, default_scaling8[0], scaling_matrix8[0]); // Intra, Cr
                 decode_scaling_list(h, scaling_matrix8[4], 64, default_scaling8[1], scaling_matrix8[3]); // Inter, Cr
+                decode_scaling_list(h, scaling_matrix8[2], 64, default_scaling8[0], scaling_matrix8[1]); // Intra, Cb
                 decode_scaling_list(h, scaling_matrix8[5], 64, default_scaling8[1], scaling_matrix8[4]); // Inter, Cb
             }
         }
     }
 }
 
-int ff_h264_decode_seq_parameter_set(H264Context *h)
+int ff_h264_decode_seq_parameter_set(H264Context *h, int ignore_truncation)
 {
     int profile_idc, level_idc, constraint_set_flags = 0;
     unsigned int sps_id;
     int i, log2_max_frame_num_minus4;
     SPS *sps;
 
+    sps = av_mallocz(sizeof(SPS));
+    if (!sps)
+        return AVERROR(ENOMEM);
+
+    sps->data_size = h->gb.buffer_end - h->gb.buffer;
+    if (sps->data_size > sizeof(sps->data)) {
+        av_log(h->avctx, AV_LOG_WARNING, "Truncating likely oversized SPS\n");
+        sps->data_size = sizeof(sps->data);
+    }
+    memcpy(sps->data, h->gb.buffer, sps->data_size);
+
     profile_idc           = get_bits(&h->gb, 8);
     constraint_set_flags |= get_bits1(&h->gb) << 0;   // constraint_set0_flag
     constraint_set_flags |= get_bits1(&h->gb) << 1;   // constraint_set1_flag
@@ -316,21 +331,20 @@ int ff_h264_decode_seq_parameter_set(H264Context *h)
 
     if (sps_id >= MAX_SPS_COUNT) {
         av_log(h->avctx, AV_LOG_ERROR, "sps_id %u out of range\n", sps_id);
-        return AVERROR_INVALIDDATA;
+        goto fail;
     }
-    sps = av_mallocz(sizeof(SPS));
-    if (!sps)
-        return AVERROR(ENOMEM);
 
     sps->sps_id               = sps_id;
     sps->time_offset_length   = 24;
     sps->profile_idc          = profile_idc;
     sps->constraint_set_flags = constraint_set_flags;
     sps->level_idc            = level_idc;
+    sps->full_range           = -1;
 
     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
     sps->scaling_matrix_present = 0;
+    sps->colorspace = 2; //AVCOL_SPC_UNSPECIFIED
 
     if (sps->profile_idc == 100 ||  // High profile
         sps->profile_idc == 110 ||  // High10 profile
@@ -344,12 +358,16 @@ int ff_h264_decode_seq_parameter_set(H264Context *h)
         sps->profile_idc == 138 ||  // Multiview Depth High profile (MVCD)
         sps->profile_idc == 144) {  // old High444 profile
         sps->chroma_format_idc = get_ue_golomb_31(&h->gb);
-        if (sps->chroma_format_idc > 3) {
+        if (sps->chroma_format_idc > 3U) {
             avpriv_request_sample(h->avctx, "chroma_format_idc %u",
                                   sps->chroma_format_idc);
             goto fail;
         } else if (sps->chroma_format_idc == 3) {
             sps->residual_color_transform_flag = get_bits1(&h->gb);
+            if (sps->residual_color_transform_flag) {
+                av_log(h->avctx, AV_LOG_ERROR, "separate color planes are not supported\n");
+                goto fail;
+            }
         }
         sps->bit_depth_luma   = get_ue_golomb(&h->gb) + 8;
         sps->bit_depth_chroma = get_ue_golomb(&h->gb) + 8;
@@ -358,6 +376,12 @@ int ff_h264_decode_seq_parameter_set(H264Context *h)
                                   "Different chroma and luma bit depth");
             goto fail;
         }
+        if (sps->bit_depth_luma   < 8 || sps->bit_depth_luma   > 14 ||
+            sps->bit_depth_chroma < 8 || sps->bit_depth_chroma > 14) {
+            av_log(h->avctx, AV_LOG_ERROR, "illegal bit depth value (%d, %d)\n",
+                   sps->bit_depth_luma, sps->bit_depth_chroma);
+            goto fail;
+        }
         sps->transform_bypass = get_bits1(&h->gb);
         decode_scaling_matrices(h, sps, NULL, 1,
                                 sps->scaling_matrix4, sps->scaling_matrix8);
@@ -380,7 +404,12 @@ int ff_h264_decode_seq_parameter_set(H264Context *h)
     sps->poc_type = get_ue_golomb_31(&h->gb);
 
     if (sps->poc_type == 0) { // FIXME #define
-        sps->log2_max_poc_lsb = get_ue_golomb(&h->gb) + 4;
+        unsigned t = get_ue_golomb(&h->gb);
+        if (t>12) {
+            av_log(h->avctx, AV_LOG_ERROR, "log2_max_poc_lsb (%d) is out of range\n", t);
+            goto fail;
+        }
+        sps->log2_max_poc_lsb = t + 4;
     } else if (sps->poc_type == 1) { // FIXME #define
         sps->delta_pic_order_always_zero_flag = get_bits1(&h->gb);
         sps->offset_for_non_ref_pic           = get_se_golomb(&h->gb);
@@ -402,8 +431,10 @@ int ff_h264_decode_seq_parameter_set(H264Context *h)
     }
 
     sps->ref_frame_count = get_ue_golomb_31(&h->gb);
+    if (h->avctx->codec_tag == MKTAG('S', 'M', 'V', '2'))
+        sps->ref_frame_count = FFMAX(2, sps->ref_frame_count);
     if (sps->ref_frame_count > H264_MAX_PICTURE_COUNT - 2 ||
-        sps->ref_frame_count >= 32U) {
+        sps->ref_frame_count > 16U) {
         av_log(h->avctx, AV_LOG_ERROR,
                "too many reference frames %d\n", sps->ref_frame_count);
         goto fail;
@@ -426,11 +457,6 @@ int ff_h264_decode_seq_parameter_set(H264Context *h)
         sps->mb_aff = 0;
 
     sps->direct_8x8_inference_flag = get_bits1(&h->gb);
-    if (!sps->frame_mbs_only_flag && !sps->direct_8x8_inference_flag) {
-        av_log(h->avctx, AV_LOG_ERROR,
-               "This stream was generated by a broken encoder, invalid 8x8 inference\n");
-        goto fail;
-    }
 
 #ifndef ALLOW_INTERLACE
     if (sps->mb_aff)
@@ -443,6 +469,8 @@ int ff_h264_decode_seq_parameter_set(H264Context *h)
         unsigned int crop_right  = get_ue_golomb(&h->gb);
         unsigned int crop_top    = get_ue_golomb(&h->gb);
         unsigned int crop_bottom = get_ue_golomb(&h->gb);
+        int width  = 16 * sps->mb_width;
+        int height = 16 * sps->mb_height * (2 - sps->frame_mbs_only_flag);
 
         if (h->avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
             av_log(h->avctx, AV_LOG_DEBUG, "discarding sps cropping, original "
@@ -469,16 +497,15 @@ int ff_h264_decode_seq_parameter_set(H264Context *h)
                        crop_left);
             }
 
-            if (INT_MAX / step_x             <= crop_left               ||
-                INT_MAX / step_x - crop_left <= crop_right              ||
-                16 * sps->mb_width <= step_x * (crop_left + crop_right) ||
-                INT_MAX / step_y             <= crop_top                ||
-                INT_MAX / step_y - crop_top  <= crop_bottom             ||
-                16 * sps->mb_height <= step_y * (crop_top + crop_bottom)) {
-                av_log(h->avctx, AV_LOG_WARNING, "Invalid crop parameters\n");
-                if (h->avctx->err_recognition & AV_EF_EXPLODE)
-                    goto fail;
-                crop_left = crop_right = crop_top = crop_bottom = 0;
+            if (crop_left  > (unsigned)INT_MAX / 4 / step_x ||
+                crop_right > (unsigned)INT_MAX / 4 / step_x ||
+                crop_top   > (unsigned)INT_MAX / 4 / step_y ||
+                crop_bottom> (unsigned)INT_MAX / 4 / step_y ||
+                (crop_left + crop_right ) * step_x >= width ||
+                (crop_top  + crop_bottom) * step_y >= height
+            ) {
+                av_log(h->avctx, AV_LOG_ERROR, "crop values invalid %d %d %d %d / %d %d\n", crop_left, crop_right, crop_top, crop_bottom, width, height);
+                goto fail;
             }
 
             sps->crop_left   = crop_left   * step_x;
@@ -497,7 +524,14 @@ int ff_h264_decode_seq_parameter_set(H264Context *h)
     sps->vui_parameters_present_flag = get_bits1(&h->gb);
     if (sps->vui_parameters_present_flag) {
         int ret = decode_vui_parameters(h, sps);
-        if (ret < 0 && h->avctx->err_recognition & AV_EF_EXPLODE)
+        if (ret < 0)
+            goto fail;
+    }
+
+    if (get_bits_left(&h->gb) < 0) {
+        av_log(h->avctx, ignore_truncation ? AV_LOG_WARNING : AV_LOG_ERROR,
+               "Overread %s by %d bits\n", sps->vui_parameters_present_flag ? "VUI" : "SPS", -get_bits_left(&h->gb));
+        if (!ignore_truncation)
             goto fail;
     }
 
@@ -507,7 +541,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h)
     if (h->avctx->debug & FF_DEBUG_PICT_INFO) {
         static const char csp[4][5] = { "Gray", "420", "422", "444" };
         av_log(h->avctx, AV_LOG_DEBUG,
-               "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%u/%u/%u/%u %s %s %"PRId32"/%"PRId32"\n",
+               "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%u/%u/%u/%u %s %s %"PRId32"/%"PRId32" b%d reo:%d\n",
                sps_id, sps->profile_idc, sps->level_idc,
                sps->poc_type,
                sps->ref_frame_count,
@@ -519,13 +553,15 @@ int ff_h264_decode_seq_parameter_set(H264Context *h)
                sps->vui_parameters_present_flag ? "VUI" : "",
                csp[sps->chroma_format_idc],
                sps->timing_info_present_flag ? sps->num_units_in_tick : 0,
-               sps->timing_info_present_flag ? sps->time_scale : 0);
+               sps->timing_info_present_flag ? sps->time_scale : 0,
+               sps->bit_depth_luma,
+               sps->bitstream_restriction_flag ? sps->num_reorder_frames : -1
+               );
     }
     sps->new = 1;
 
     av_free(h->sps_buffers[sps_id]);
     h->sps_buffers[sps_id] = sps;
-    h->sps                 = *sps;
 
     return 0;
 
@@ -543,6 +579,21 @@ static void build_qp_table(PPS *pps, int t, int index, const int depth)
             ff_h264_chroma_qp[depth - 8][av_clip(i + index, 0, max_qp)];
 }
 
+static int more_rbsp_data_in_pps(H264Context *h, PPS *pps)
+{
+    const SPS *sps = h->sps_buffers[pps->sps_id];
+    int profile_idc = sps->profile_idc;
+
+    if ((profile_idc == 66 || profile_idc == 77 ||
+         profile_idc == 88) && (sps->constraint_set_flags & 7)) {
+        av_log(h->avctx, AV_LOG_VERBOSE,
+               "Current profile doesn't provide more RBSP data in PPS, skipping\n");
+        return 0;
+    }
+
+    return 1;
+}
+
 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length)
 {
     const SPS *sps;
@@ -560,6 +611,12 @@ int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length)
     pps = av_mallocz(sizeof(PPS));
     if (!pps)
         return AVERROR(ENOMEM);
+    pps->data_size = h->gb.buffer_end - h->gb.buffer;
+    if (pps->data_size > sizeof(pps->data)) {
+        av_log(h->avctx, AV_LOG_WARNING, "Truncating likely oversized PPS\n");
+        pps->data_size = sizeof(pps->data);
+    }
+    memcpy(pps->data, h->gb.buffer, pps->data_size);
     pps->sps_id = get_ue_golomb_31(&h->gb);
     if ((unsigned)pps->sps_id >= MAX_SPS_COUNT ||
         !h->sps_buffers[pps->sps_id]) {
@@ -568,10 +625,15 @@ int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length)
         goto fail;
     }
     sps = h->sps_buffers[pps->sps_id];
-
-    if (sps->bit_depth_luma > 10) {
+    if (sps->bit_depth_luma > 14) {
+        av_log(h->avctx, AV_LOG_ERROR,
+               "Invalid luma bit depth=%d\n",
+               sps->bit_depth_luma);
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    } else if (sps->bit_depth_luma == 11 || sps->bit_depth_luma == 13) {
         av_log(h->avctx, AV_LOG_ERROR,
-               "Unimplemented luma bit depth=%d (max=10)\n",
+               "Unimplemented luma bit depth=%d\n",
                sps->bit_depth_luma);
         ret = AVERROR_PATCHWELCOME;
         goto fail;
@@ -643,8 +705,7 @@ int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length)
            sizeof(pps->scaling_matrix8));
 
     bits_left = bit_length - get_bits_count(&h->gb);
-    if (bits_left && (bits_left > 8 ||
-                      show_bits(&h->gb, bits_left) != 1 << (bits_left - 1))) {
+    if (bits_left > 0 && more_rbsp_data_in_pps(h, pps)) {
         pps->transform_8x8_mode = get_bits1(&h->gb);
         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0,
                                 pps->scaling_matrix4, pps->scaling_matrix8);
diff --git a/libavcodec/h264_refs.c b/libavcodec/h264_refs.c
index adc2213944..619f2edf84 100644
--- a/libavcodec/h264_refs.c
+++ b/libavcodec/h264_refs.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... reference picture handling
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 
 #include <inttypes.h>
 
+#include "libavutil/avassert.h"
 #include "internal.h"
 #include "avcodec.h"
 #include "h264.h"
@@ -79,16 +80,18 @@ static int build_def_list(H264Ref *def, int def_len,
     int  i[2] = { 0 };
     int index = 0;
 
-    while ((i[0] < len || i[1] < len) && index < def_len) {
+    while (i[0] < len || i[1] < len) {
         while (i[0] < len && !(in[i[0]] && (in[i[0]]->reference & sel)))
             i[0]++;
         while (i[1] < len && !(in[i[1]] && (in[i[1]]->reference & (sel ^ 3))))
             i[1]++;
-        if (i[0] < len && index < def_len) {
+        if (i[0] < len) {
+            av_assert0(index < def_len);
             in[i[0]]->pic_id = is_long ? i[0] : in[i[0]]->frame_num;
             split_field_copy(&def[index++], in[i[0]++], sel, 1);
         }
-        if (i[1] < len && index < def_len) {
+        if (i[1] < len) {
+            av_assert0(index < def_len);
             in[i[1]]->pic_id = is_long ? i[1] : in[i[1]]->frame_num;
             split_field_copy(&def[index++], in[i[1]++], sel ^ 3, 0);
         }
@@ -122,6 +125,7 @@ static int add_sorted(H264Picture **sorted, H264Picture **src, int len, int limi
 int ff_h264_fill_default_ref_list(H264Context *h, H264SliceContext *sl)
 {
     int i, len;
+    int j;
 
     if (sl->slice_type_nos == AV_PICTURE_TYPE_B) {
         H264Picture *sorted[32];
@@ -136,13 +140,14 @@ int ff_h264_fill_default_ref_list(H264Context *h, H264SliceContext *sl)
         for (list = 0; list < 2; list++) {
             len  = add_sorted(sorted,       h->short_ref, h->short_ref_count, cur_poc, 1 ^ list);
             len += add_sorted(sorted + len, h->short_ref, h->short_ref_count, cur_poc, 0 ^ list);
-            assert(len <= 32);
+            av_assert0(len <= 32);
 
             len  = build_def_list(h->default_ref_list[list], FF_ARRAY_ELEMS(h->default_ref_list[0]),
                                   sorted, len, 0, h->picture_structure);
             len += build_def_list(h->default_ref_list[list] + len,
                                   FF_ARRAY_ELEMS(h->default_ref_list[0]) - len,
                                   h->long_ref, 16, 1, h->picture_structure);
+            av_assert0(len <= 32);
 
             if (len < sl->ref_count[list])
                 memset(&h->default_ref_list[list][len], 0, sizeof(H264Ref) * (sl->ref_count[list] - len));
@@ -163,6 +168,7 @@ int ff_h264_fill_default_ref_list(H264Context *h, H264SliceContext *sl)
         len += build_def_list(h->default_ref_list[0] + len,
                               FF_ARRAY_ELEMS(h->default_ref_list[0]) - len,
                               h-> long_ref, 16, 1, h->picture_structure);
+        av_assert0(len <= 32);
 
         if (len < sl->ref_count[0])
             memset(&h->default_ref_list[0][len], 0, sizeof(H264Ref) * (sl->ref_count[0] - len));
@@ -170,19 +176,34 @@ int ff_h264_fill_default_ref_list(H264Context *h, H264SliceContext *sl)
 #ifdef TRACE
     for (i = 0; i < sl->ref_count[0]; i++) {
         ff_tlog(h->avctx, "List0: %s fn:%d 0x%p\n",
-                (h->default_ref_list[0][i].long_ref ? "LT" : "ST"),
+                h->default_ref_list[0][i].parent ? (h->default_ref_list[0][i].parent->long_ref ? "LT" : "ST") : "NULL",
                 h->default_ref_list[0][i].pic_id,
-                h->default_ref_list[0][i].f->data[0]);
+                h->default_ref_list[0][i].parent ? h->default_ref_list[0][i].parent->f->data[0] : 0);
     }
     if (sl->slice_type_nos == AV_PICTURE_TYPE_B) {
         for (i = 0; i < sl->ref_count[1]; i++) {
             ff_tlog(h->avctx, "List1: %s fn:%d 0x%p\n",
-                    (h->default_ref_list[1][i].long_ref ? "LT" : "ST"),
+                    h->default_ref_list[1][i].parent ? (h->default_ref_list[1][i].parent->long_ref ? "LT" : "ST") : "NULL",
                     h->default_ref_list[1][i].pic_id,
-                    h->default_ref_list[1][i].f->data[0]);
+                    h->default_ref_list[1][i].parent ? h->default_ref_list[1][i].parent->f->data[0] : 0);
         }
     }
 #endif
+
+    for (j = 0; j<1+(sl->slice_type_nos == AV_PICTURE_TYPE_B); j++) {
+        for (i = 0; i < sl->ref_count[j]; i++) {
+            if (h->default_ref_list[j][i].parent) {
+                AVFrame *f = h->default_ref_list[j][i].parent->f;
+                if (h->cur_pic_ptr->f->width  != f->width ||
+                    h->cur_pic_ptr->f->height != f->height ||
+                    h->cur_pic_ptr->f->format != f->format) {
+                    av_log(h->avctx, AV_LOG_ERROR, "Discarding mismatching reference\n");
+                    memset(&h->default_ref_list[j][i], 0, sizeof(h->default_ref_list[j][i]));
+                }
+            }
+        }
+    }
+
     return 0;
 }
 
@@ -324,13 +345,19 @@ int ff_h264_decode_ref_pic_list_reordering(H264Context *h, H264SliceContext *sl)
     }
     for (list = 0; list < sl->list_count; list++) {
         for (index = 0; index < sl->ref_count[list]; index++) {
-            if (!sl->ref_list[list][index].parent) {
-                av_log(h->avctx, AV_LOG_ERROR, "Missing reference picture\n");
-                if (h->default_ref_list[list][0].parent)
+            if (   !sl->ref_list[list][index].parent
+                || (!FIELD_PICTURE(h) && (sl->ref_list[list][index].reference&3) != 3)) {
+                int i;
+                av_log(h->avctx, AV_LOG_ERROR, "Missing reference picture, default is %d\n", h->default_ref_list[list][0].poc);
+                for (i = 0; i < FF_ARRAY_ELEMS(h->last_pocs); i++)
+                    h->last_pocs[i] = INT_MIN;
+                if (h->default_ref_list[list][0].parent
+                    && !(!FIELD_PICTURE(h) && (h->default_ref_list[list][0].reference&3) != 3))
                     sl->ref_list[list][index] = h->default_ref_list[list][0];
                 else
                     return -1;
             }
+            av_assert0(av_buffer_get_ref_count(sl->ref_list[list][index].parent->f->buf[0]) > 0);
         }
     }
 
@@ -340,7 +367,7 @@ int ff_h264_decode_ref_pic_list_reordering(H264Context *h, H264SliceContext *sl)
 void ff_h264_fill_mbaff_ref_list(H264Context *h, H264SliceContext *sl)
 {
     int list, i, j;
-    for (list = 0; list < sl->list_count; list++) { //FIXME try list_count
+    for (list = 0; list < sl->list_count; list++) {
         for (i = 0; i < sl->ref_count[list]; i++) {
             H264Ref *frame = &sl->ref_list[list][i];
             H264Ref *field = &sl->ref_list[list][16 + 2 * i];
@@ -486,11 +513,23 @@ void ff_h264_remove_all_refs(H264Context *h)
     }
     assert(h->long_ref_count == 0);
 
+    if (h->short_ref_count && !h->last_pic_for_ec.f->data[0]) {
+        ff_h264_unref_picture(h, &h->last_pic_for_ec);
+        ff_h264_ref_picture(h, &h->last_pic_for_ec, h->short_ref[0]);
+    }
+
     for (i = 0; i < h->short_ref_count; i++) {
         unreference_pic(h, h->short_ref[i], 0);
         h->short_ref[i] = NULL;
     }
     h->short_ref_count = 0;
+
+    memset(h->default_ref_list, 0, sizeof(h->default_ref_list));
+    for (i = 0; i < h->nb_slice_ctx; i++) {
+        H264SliceContext *sl = &h->slice_ctx[i];
+        sl->list_count = sl->ref_count[0] = sl->ref_count[1] = 0;
+        memset(sl->ref_list, 0, sizeof(sl->ref_list));
+    }
 }
 
 /**
@@ -532,8 +571,11 @@ static int check_opcodes(MMCO *mmco1, MMCO *mmco2, int n_mmcos)
     int i;
 
     for (i = 0; i < n_mmcos; i++) {
-        if (mmco1[i].opcode != mmco2[i].opcode)
+        if (mmco1[i].opcode != mmco2[i].opcode) {
+            av_log(NULL, AV_LOG_ERROR, "MMCO opcode [%d, %d] at %d mismatches between slices\n",
+                   mmco1[i].opcode, mmco2[i].opcode, i);
             return -1;
+        }
     }
 
     return 0;
@@ -544,10 +586,8 @@ int ff_generate_sliding_window_mmcos(H264Context *h, int first_slice)
     MMCO mmco_temp[MAX_MMCO_COUNT], *mmco = first_slice ? h->mmco : mmco_temp;
     int mmco_index = 0, i = 0;
 
-    assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
-
     if (h->short_ref_count &&
-        h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
+        h->long_ref_count + h->short_ref_count >= h->sps.ref_frame_count &&
         !(FIELD_PICTURE(h) && !h->first_field && h->cur_pic_ptr->reference)) {
         mmco[0].opcode        = MMCO_SHORT2UNUSED;
         mmco[0].short_pic_num = h->short_ref[h->short_ref_count - 1]->frame_num;
@@ -566,8 +606,8 @@ int ff_generate_sliding_window_mmcos(H264Context *h, int first_slice)
                (mmco_index != h->mmco_index ||
                 (i = check_opcodes(h->mmco, mmco_temp, mmco_index)))) {
         av_log(h->avctx, AV_LOG_ERROR,
-               "Inconsistent MMCO state between slices [%d, %d, %d]\n",
-               mmco_index, h->mmco_index, i);
+               "Inconsistent MMCO state between slices [%d, %d]\n",
+               mmco_index, h->mmco_index);
         return AVERROR_INVALIDDATA;
     }
     return 0;
@@ -576,6 +616,8 @@ int ff_generate_sliding_window_mmcos(H264Context *h, int first_slice)
 int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
 {
     int i, av_uninit(j);
+    int pps_count;
+    int pps_ref_count[2] = {0};
     int current_ref_assigned = 0, err = 0;
     H264Picture *av_uninit(pic);
 
@@ -596,7 +638,7 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
                 if (mmco[i].opcode != MMCO_SHORT2LONG ||
                     !h->long_ref[mmco[i].long_arg]    ||
                     h->long_ref[mmco[i].long_arg]->frame_num != frame_num) {
-                    av_log(h->avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
+                    av_log(h->avctx, h->short_ref_count ? AV_LOG_ERROR : AV_LOG_DEBUG, "mmco: unref short failure\n");
                     err = AVERROR_INVALIDDATA;
                 }
                 continue;
@@ -637,19 +679,24 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
                      * Report the problem and keep the pair where it is,
                      * and mark this field valid.
                      */
-            if (h->short_ref[0] == h->cur_pic_ptr)
+            if (h->short_ref[0] == h->cur_pic_ptr) {
+                av_log(h->avctx, AV_LOG_ERROR, "mmco: cannot assign current picture to short and long at the same time\n");
                 remove_short_at_index(h, 0);
+            }
 
             /* make sure the current picture is not already assigned as a long ref */
             if (h->cur_pic_ptr->long_ref) {
                 for (j = 0; j < FF_ARRAY_ELEMS(h->long_ref); j++) {
-                    if (h->long_ref[j] == h->cur_pic_ptr)
+                    if (h->long_ref[j] == h->cur_pic_ptr) {
+                        if (j != mmco[i].long_arg)
+                            av_log(h->avctx, AV_LOG_ERROR, "mmco: cannot assign current picture to 2 long term references\n");
                         remove_long(h, j, 0);
+                    }
                 }
             }
 
-
             if (h->long_ref[mmco[i].long_arg] != h->cur_pic_ptr) {
+                av_assert0(!h->cur_pic_ptr->long_ref);
                 remove_long(h, mmco[i].long_arg, 0);
 
                 h->long_ref[mmco[i].long_arg]           = h->cur_pic_ptr;
@@ -677,6 +724,8 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
             h->frame_num  = h->cur_pic_ptr->frame_num = 0;
             h->mmco_reset = 1;
             h->cur_pic_ptr->mmco_reset = 1;
+            for (j = 0; j < MAX_DELAYED_PIC_COUNT; j++)
+                h->last_pocs[j] = INT_MIN;
             break;
         default: assert(0);
         }
@@ -691,7 +740,7 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
          */
         if (h->short_ref_count && h->short_ref[0] == h->cur_pic_ptr) {
             /* Just mark the second field valid */
-            h->cur_pic_ptr->reference = PICT_FRAME;
+            h->cur_pic_ptr->reference |= h->picture_structure;
         } else if (h->cur_pic_ptr->long_ref) {
             av_log(h->avctx, AV_LOG_ERROR, "illegal short term reference "
                                            "assignment for second field "
@@ -715,8 +764,7 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
         }
     }
 
-    if (h->long_ref_count + h->short_ref_count -
-        (h->short_ref[0] == h->cur_pic_ptr) > h->sps.ref_frame_count) {
+    if (h->long_ref_count + h->short_ref_count > FFMAX(h->sps.ref_frame_count, 1)) {
 
         /* We have too many reference frames, probably due to corrupted
          * stream. Need to discard one frame. Prevents overrun of the
@@ -741,8 +789,36 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
         }
     }
 
+    for (i = 0; i<h->short_ref_count; i++) {
+        pic = h->short_ref[i];
+        if (pic->invalid_gap) {
+            int d = av_mod_uintp2(h->cur_pic_ptr->frame_num - pic->frame_num, h->sps.log2_max_frame_num);
+            if (d > h->sps.ref_frame_count)
+                remove_short(h, pic->frame_num, 0);
+        }
+    }
+
     print_short_term(h);
     print_long_term(h);
+
+    pps_count = 0;
+    for (i = 0; i < FF_ARRAY_ELEMS(h->pps_buffers); i++) {
+        pps_count += !!h->pps_buffers[i];
+        pps_ref_count[0] = FFMAX(pps_ref_count[0], h->pps.ref_count[0]);
+        pps_ref_count[1] = FFMAX(pps_ref_count[1], h->pps.ref_count[1]);
+    }
+
+    if (   err >= 0
+        && h->long_ref_count==0
+        && (   h->short_ref_count<=2
+            || pps_ref_count[0] <= 1 + (h->picture_structure != PICT_FRAME) && pps_ref_count[1] <= 1)
+        && pps_ref_count[0]<=2 + (h->picture_structure != PICT_FRAME) + (2*!h->has_recovery_point)
+        && h->cur_pic_ptr->f->pict_type == AV_PICTURE_TYPE_I){
+        h->cur_pic_ptr->recovered |= 1;
+        if(!h->avctx->has_b_frames)
+            h->frame_recovered |= FRAME_RECOVERED_SEI;
+    }
+
     return (h->avctx->err_recognition & AV_EF_EXPLODE) ? err : 0;
 }
 
@@ -750,7 +826,7 @@ int ff_h264_decode_ref_pic_marking(H264Context *h, GetBitContext *gb,
                                    int first_slice)
 {
     int i, ret;
-    MMCO mmco_temp[MAX_MMCO_COUNT], *mmco = first_slice ? h->mmco : mmco_temp;
+    MMCO mmco_temp[MAX_MMCO_COUNT], *mmco = mmco_temp;
     int mmco_index = 0;
 
     if (h->nal_unit_type == NAL_IDR_SLICE) { // FIXME fields
@@ -816,6 +892,7 @@ int ff_h264_decode_ref_pic_marking(H264Context *h, GetBitContext *gb,
     }
 
     if (first_slice && mmco_index != -1) {
+        memcpy(h->mmco, mmco_temp, sizeof(h->mmco));
         h->mmco_index = mmco_index;
     } else if (!first_slice && mmco_index >= 0 &&
                (mmco_index != h->mmco_index ||
diff --git a/libavcodec/h264_sei.c b/libavcodec/h264_sei.c
index ddf1b6f9fa..818caa8dfd 100644
--- a/libavcodec/h264_sei.c
+++ b/libavcodec/h264_sei.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... sei decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,14 +50,20 @@ void ff_h264_reset_sei(H264Context *h)
 
 static int decode_picture_timing(H264Context *h)
 {
-    if (h->sps.nal_hrd_parameters_present_flag ||
-        h->sps.vcl_hrd_parameters_present_flag) {
-        h->sei_cpb_removal_delay = get_bits(&h->gb,
-                                            h->sps.cpb_removal_delay_length);
-        h->sei_dpb_output_delay  = get_bits(&h->gb,
-                                            h->sps.dpb_output_delay_length);
+    SPS *sps = &h->sps;
+    int i;
+
+    for (i = 0; i<MAX_SPS_COUNT; i++)
+        if (!sps->log2_max_frame_num && h->sps_buffers[i])
+            sps = h->sps_buffers[i];
+
+    if (sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag) {
+        h->sei_cpb_removal_delay = get_bits_long(&h->gb,
+                                                 sps->cpb_removal_delay_length);
+        h->sei_dpb_output_delay  = get_bits_long(&h->gb,
+                                                 sps->dpb_output_delay_length);
     }
-    if (h->sps.pic_struct_present_flag) {
+    if (sps->pic_struct_present_flag) {
         unsigned int i, num_clock_ts;
 
         h->sei_pic_struct = get_bits(&h->gb, 4);
@@ -93,9 +99,9 @@ static int decode_picture_timing(H264Context *h)
                         }
                     }
                 }
-                if (h->sps.time_offset_length > 0)
+                if (sps->time_offset_length > 0)
                     skip_bits(&h->gb,
-                              h->sps.time_offset_length); /* time_offset */
+                              sps->time_offset_length); /* time_offset */
             }
         }
 
@@ -122,6 +128,11 @@ static int decode_registered_user_data_afd(H264Context *h, int size)
         skip_bits(&h->gb, 4);           // reserved
         h->active_format_description   = get_bits(&h->gb, 4);
         h->sei_reguserdata_afd_present = 1;
+#if FF_API_AFD
+FF_DISABLE_DEPRECATION_WARNINGS
+        h->avctx->dtg_active_format = h->active_format_description;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif /* FF_API_AFD */
     }
 
     return 0;
@@ -171,8 +182,6 @@ static int decode_registered_user_data_closed_caption(H264Context *h, int size)
         }
     } else {
         int i;
-        avpriv_request_sample(h->avctx, "Subtitles with data type 0x%02x",
-                              user_data_type_code);
         for (i = 0; i < size - 1; i++)
             skip_bits(&h->gb, 8);
     }
@@ -228,6 +237,8 @@ static int decode_unregistered_user_data(H264Context *h, int size)
     e = sscanf(user_data + 16, "x264 - core %d", &build);
     if (e == 1 && build > 0)
         h->x264_build = build;
+    if (e == 1 && build == 1 && !strncmp(user_data+16, "x264 - core 0000", 16))
+        h->x264_build = 67;
 
     if (h->avctx->debug & FF_DEBUG_BUGS)
         av_log(h->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data + 16);
@@ -247,6 +258,11 @@ static int decode_recovery_point(H264Context *h)
      * 2b changing_slice_group_idc */
     skip_bits(&h->gb, 4);
 
+    if (h->avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(h->avctx, AV_LOG_DEBUG, "sei_recovery_frame_cnt: %d\n", h->sei_recovery_frame_cnt);
+
+    h->has_recovery_point = 1;
+
     return 0;
 }
 
@@ -268,7 +284,7 @@ static int decode_buffering_period(H264Context *h)
     if (sps->nal_hrd_parameters_present_flag) {
         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
             h->initial_cpb_removal_delay[sched_sel_idx] =
-                get_bits(&h->gb, sps->initial_cpb_removal_delay_length);
+                get_bits_long(&h->gb, sps->initial_cpb_removal_delay_length);
             // initial_cpb_removal_delay_offset
             skip_bits(&h->gb, sps->initial_cpb_removal_delay_length);
         }
@@ -276,7 +292,7 @@ static int decode_buffering_period(H264Context *h)
     if (sps->vcl_hrd_parameters_present_flag) {
         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
             h->initial_cpb_removal_delay[sched_sel_idx] =
-                get_bits(&h->gb, sps->initial_cpb_removal_delay_length);
+                get_bits_long(&h->gb, sps->initial_cpb_removal_delay_length);
             // initial_cpb_removal_delay_offset
             skip_bits(&h->gb, sps->initial_cpb_removal_delay_length);
         }
@@ -288,12 +304,16 @@ static int decode_buffering_period(H264Context *h)
 
 static int decode_frame_packing_arrangement(H264Context *h)
 {
-    get_ue_golomb(&h->gb);              // frame_packing_arrangement_id
-    h->sei_frame_packing_present = !get_bits1(&h->gb);
+    h->sei_fpa.frame_packing_arrangement_id          = get_ue_golomb(&h->gb);
+    h->sei_fpa.frame_packing_arrangement_cancel_flag = get_bits1(&h->gb);
+    h->sei_frame_packing_present = !h->sei_fpa.frame_packing_arrangement_cancel_flag;
 
     if (h->sei_frame_packing_present) {
+        h->sei_fpa.frame_packing_arrangement_type =
         h->frame_packing_arrangement_type = get_bits(&h->gb, 7);
+        h->sei_fpa.quincunx_sampling_flag         =
         h->quincunx_subsampling           = get_bits1(&h->gb);
+        h->sei_fpa.content_interpretation_type    =
         h->content_interpretation_type    = get_bits(&h->gb, 6);
 
         // the following skips: spatial_flipping_flag, frame0_flipped_flag,
@@ -304,10 +324,19 @@ static int decode_frame_packing_arrangement(H264Context *h)
         if (!h->quincunx_subsampling && h->frame_packing_arrangement_type != 5)
             skip_bits(&h->gb, 16);      // frame[01]_grid_position_[xy]
         skip_bits(&h->gb, 8);           // frame_packing_arrangement_reserved_byte
-        get_ue_golomb(&h->gb);          // frame_packing_arrangement_repetition_period
+        h->sei_fpa.frame_packing_arrangement_repetition_period = get_ue_golomb(&h->gb) /* frame_packing_arrangement_repetition_period */;
     }
     skip_bits1(&h->gb);                 // frame_packing_arrangement_extension_flag
 
+    if (h->avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(h->avctx, AV_LOG_DEBUG, "SEI FPA %d %d %d %d %d %d\n",
+                                       h->sei_fpa.frame_packing_arrangement_id,
+                                       h->sei_fpa.frame_packing_arrangement_cancel_flag,
+                                       h->sei_fpa.frame_packing_arrangement_type,
+                                       h->sei_fpa.quincunx_sampling_flag,
+                                       h->sei_fpa.content_interpretation_type,
+                                       h->sei_fpa.frame_packing_arrangement_repetition_period);
+
     return 0;
 }
 
@@ -327,32 +356,95 @@ static int decode_display_orientation(H264Context *h)
     return 0;
 }
 
+static int decode_GreenMetadata(H264Context *h)
+{
+    if (h->avctx->debug & FF_DEBUG_GREEN_MD)
+        av_log(h->avctx, AV_LOG_DEBUG,          "Green Metadata Info SEI message\n");
+
+    h->sei_green_metadata.green_metadata_type=get_bits(&h->gb, 8);
+
+    if (h->avctx->debug & FF_DEBUG_GREEN_MD)
+        av_log(h->avctx, AV_LOG_DEBUG,          "green_metadata_type                            = %d\n",
+               h->sei_green_metadata.green_metadata_type);
+
+    if (h->sei_green_metadata.green_metadata_type==0){
+        h->sei_green_metadata.period_type=get_bits(&h->gb, 8);
+
+        if (h->avctx->debug & FF_DEBUG_GREEN_MD)
+            av_log(h->avctx, AV_LOG_DEBUG,      "green_metadata_period_type                     = %d\n",
+                   h->sei_green_metadata.period_type);
+
+        if (h->sei_green_metadata.green_metadata_type==2){
+            h->sei_green_metadata.num_seconds = get_bits(&h->gb, 16);
+            if (h->avctx->debug & FF_DEBUG_GREEN_MD)
+                av_log(h->avctx, AV_LOG_DEBUG,  "green_metadata_num_seconds                     = %d\n",
+                       h->sei_green_metadata.num_seconds);
+        }
+        else if (h->sei_green_metadata.period_type==3){
+            h->sei_green_metadata.num_pictures = get_bits(&h->gb, 16);
+            if (h->avctx->debug & FF_DEBUG_GREEN_MD)
+                av_log(h->avctx, AV_LOG_DEBUG,  "green_metadata_num_pictures                    = %d\n",
+                       h->sei_green_metadata.num_pictures);
+        }
+
+        h->sei_green_metadata.percent_non_zero_macroblocks=get_bits(&h->gb, 8);
+        h->sei_green_metadata.percent_intra_coded_macroblocks=get_bits(&h->gb, 8);
+        h->sei_green_metadata.percent_six_tap_filtering=get_bits(&h->gb, 8);
+        h->sei_green_metadata.percent_alpha_point_deblocking_instance=get_bits(&h->gb, 8);
+
+        if (h->avctx->debug & FF_DEBUG_GREEN_MD)
+            av_log(h->avctx, AV_LOG_DEBUG,      "SEI GREEN Complexity Metrics                   = %f %f %f %f\n",
+                                           (float)h->sei_green_metadata.percent_non_zero_macroblocks/255,
+                                           (float)h->sei_green_metadata.percent_intra_coded_macroblocks/255,
+                                           (float)h->sei_green_metadata.percent_six_tap_filtering/255,
+                                           (float)h->sei_green_metadata.percent_alpha_point_deblocking_instance/255);
+
+    }else if( h->sei_green_metadata.green_metadata_type==1){
+        h->sei_green_metadata.xsd_metric_type=get_bits(&h->gb, 8);
+        h->sei_green_metadata.xsd_metric_value=get_bits(&h->gb, 16);
+
+        if (h->avctx->debug & FF_DEBUG_GREEN_MD)
+            av_log(h->avctx, AV_LOG_DEBUG,      "xsd_metric_type                                = %d\n",
+                   h->sei_green_metadata.xsd_metric_type);
+        if ( h->sei_green_metadata.xsd_metric_type==0){
+            if (h->avctx->debug & FF_DEBUG_GREEN_MD)
+                av_log(h->avctx, AV_LOG_DEBUG,  "xsd_metric_value                               = %f\n",
+                       (float)h->sei_green_metadata.xsd_metric_value/100);
+        }
+    }
+
+    return 0;
+}
+
 int ff_h264_decode_sei(H264Context *h)
 {
-    while (get_bits_left(&h->gb) > 16) {
-        int size = 0;
+    while (get_bits_left(&h->gb) > 16 && show_bits(&h->gb, 16)) {
         int type = 0;
+        unsigned size = 0;
+        unsigned next;
         int ret  = 0;
-        int last = 0;
 
-        while (get_bits_left(&h->gb) >= 8 &&
-               (last = get_bits(&h->gb, 8)) == 255) {
-            type += 255;
-        }
-        type += last;
+        do {
+            if (get_bits_left(&h->gb) < 8)
+                return AVERROR_INVALIDDATA;
+            type += show_bits(&h->gb, 8);
+        } while (get_bits(&h->gb, 8) == 255);
 
-        last = 0;
-        while (get_bits_left(&h->gb) >= 8 &&
-               (last = get_bits(&h->gb, 8)) == 255) {
-            size += 255;
-        }
-        size += last;
+        do {
+            if (get_bits_left(&h->gb) < 8)
+                return AVERROR_INVALIDDATA;
+            size += show_bits(&h->gb, 8);
+        } while (get_bits(&h->gb, 8) == 255);
+
+        if (h->avctx->debug&FF_DEBUG_STARTCODE)
+            av_log(h->avctx, AV_LOG_DEBUG, "SEI %d len:%d\n", type, size);
 
         if (size > get_bits_left(&h->gb) / 8) {
-            av_log(h->avctx, AV_LOG_ERROR, "SEI type %d truncated at %d\n",
-                   type, get_bits_left(&h->gb));
+            av_log(h->avctx, AV_LOG_ERROR, "SEI type %d size %d truncated at %d\n",
+                   type, 8*size, get_bits_left(&h->gb));
             return AVERROR_INVALIDDATA;
         }
+        next = get_bits_count(&h->gb) + 8 * size;
 
         switch (type) {
         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
@@ -376,16 +468,65 @@ int ff_h264_decode_sei(H264Context *h)
         case SEI_TYPE_DISPLAY_ORIENTATION:
             ret = decode_display_orientation(h);
             break;
+        case SEI_TYPE_GREEN_METADATA:
+            ret = decode_GreenMetadata(h);
+            break;
         default:
             av_log(h->avctx, AV_LOG_DEBUG, "unknown SEI type %d\n", type);
-            skip_bits(&h->gb, 8 * size);
         }
         if (ret < 0)
             return ret;
 
+        skip_bits_long(&h->gb, next - get_bits_count(&h->gb));
+
         // FIXME check bits here
         align_get_bits(&h->gb);
     }
 
     return 0;
 }
+
+const char* ff_h264_sei_stereo_mode(H264Context *h)
+{
+    if (h->sei_fpa.frame_packing_arrangement_cancel_flag == 0) {
+        switch (h->sei_fpa.frame_packing_arrangement_type) {
+            case SEI_FPA_TYPE_CHECKERBOARD:
+                if (h->sei_fpa.content_interpretation_type == 2)
+                    return "checkerboard_rl";
+                else
+                    return "checkerboard_lr";
+            case SEI_FPA_TYPE_INTERLEAVE_COLUMN:
+                if (h->sei_fpa.content_interpretation_type == 2)
+                    return "col_interleaved_rl";
+                else
+                    return "col_interleaved_lr";
+            case SEI_FPA_TYPE_INTERLEAVE_ROW:
+                if (h->sei_fpa.content_interpretation_type == 2)
+                    return "row_interleaved_rl";
+                else
+                    return "row_interleaved_lr";
+            case SEI_FPA_TYPE_SIDE_BY_SIDE:
+                if (h->sei_fpa.content_interpretation_type == 2)
+                    return "right_left";
+                else
+                    return "left_right";
+            case SEI_FPA_TYPE_TOP_BOTTOM:
+                if (h->sei_fpa.content_interpretation_type == 2)
+                    return "bottom_top";
+                else
+                    return "top_bottom";
+            case SEI_FPA_TYPE_INTERLEAVE_TEMPORAL:
+                if (h->sei_fpa.content_interpretation_type == 2)
+                    return "block_rl";
+                else
+                    return "block_lr";
+            case SEI_FPA_TYPE_2D:
+            default:
+                return "mono";
+        }
+    } else if (h->sei_fpa.frame_packing_arrangement_cancel_flag == 1) {
+        return "mono";
+    } else {
+        return NULL;
+    }
+}
diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c
index 49fd89135d..a346ccbc00 100644
--- a/libavcodec/h264_slice.c
+++ b/libavcodec/h264_slice.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,23 +47,27 @@
 static const uint8_t rem6[QP_MAX_NUM + 1] = {
     0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
     3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+    3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+    0, 1, 2, 3,
 };
 
 static const uint8_t div6[QP_MAX_NUM + 1] = {
     0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3,  3,  3,
     3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,  6,  6,
-    7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
+    7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10,
+   10,10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13, 13, 13, 13,
+   14,14,14,14,
 };
 
-static const uint8_t field_scan[16] = {
+static const uint8_t field_scan[16+1] = {
     0 + 0 * 4, 0 + 1 * 4, 1 + 0 * 4, 0 + 2 * 4,
     0 + 3 * 4, 1 + 1 * 4, 1 + 2 * 4, 1 + 3 * 4,
     2 + 0 * 4, 2 + 1 * 4, 2 + 2 * 4, 2 + 3 * 4,
     3 + 0 * 4, 3 + 1 * 4, 3 + 2 * 4, 3 + 3 * 4,
 };
 
-static const uint8_t field_scan8x8[64] = {
+static const uint8_t field_scan8x8[64+1] = {
     0 + 0 * 8, 0 + 1 * 8, 0 + 2 * 8, 1 + 0 * 8,
     1 + 1 * 8, 0 + 3 * 8, 0 + 4 * 8, 1 + 2 * 8,
     2 + 0 * 8, 1 + 3 * 8, 0 + 5 * 8, 0 + 6 * 8,
@@ -82,7 +86,7 @@ static const uint8_t field_scan8x8[64] = {
     7 + 4 * 8, 7 + 5 * 8, 7 + 6 * 8, 7 + 7 * 8,
 };
 
-static const uint8_t field_scan8x8_cavlc[64] = {
+static const uint8_t field_scan8x8_cavlc[64+1] = {
     0 + 0 * 8, 1 + 1 * 8, 2 + 0 * 8, 0 + 7 * 8,
     2 + 2 * 8, 2 + 3 * 8, 2 + 4 * 8, 3 + 3 * 8,
     3 + 4 * 8, 4 + 3 * 8, 4 + 4 * 8, 5 + 3 * 8,
@@ -102,7 +106,7 @@ static const uint8_t field_scan8x8_cavlc[64] = {
 };
 
 // zigzag_scan8x8_cavlc[i] = zigzag_scan8x8[(i/4) + 16*(i%4)]
-static const uint8_t zigzag_scan8x8_cavlc[64] = {
+static const uint8_t zigzag_scan8x8_cavlc[64+1] = {
     0 + 0 * 8, 1 + 1 * 8, 1 + 2 * 8, 2 + 2 * 8,
     4 + 1 * 8, 0 + 5 * 8, 3 + 3 * 8, 7 + 0 * 8,
     3 + 4 * 8, 1 + 7 * 8, 5 + 3 * 8, 6 + 3 * 8,
@@ -228,6 +232,10 @@ static int alloc_picture(H264Context *h, H264Picture *pic)
     if (ret < 0)
         goto fail;
 
+    pic->crop     = h->sps.crop;
+    pic->crop_top = h->sps.crop_top;
+    pic->crop_left= h->sps.crop_left;
+
     if (h->avctx->hwaccel) {
         const AVHWAccel *hwaccel = h->avctx->hwaccel;
         av_assert0(!pic->hwaccel_picture_private);
@@ -238,6 +246,18 @@ static int alloc_picture(H264Context *h, H264Picture *pic)
             pic->hwaccel_picture_private = pic->hwaccel_priv_buf->data;
         }
     }
+    if (CONFIG_GRAY && !h->avctx->hwaccel && h->flags & AV_CODEC_FLAG_GRAY && pic->f->data[2]) {
+        int h_chroma_shift, v_chroma_shift;
+        av_pix_fmt_get_chroma_sub_sample(pic->f->format,
+                                         &h_chroma_shift, &v_chroma_shift);
+
+        for(i=0; i<FF_CEIL_RSHIFT(pic->f->height, v_chroma_shift); i++) {
+            memset(pic->f->data[1] + pic->f->linesize[1]*i,
+                   0x80, FF_CEIL_RSHIFT(pic->f->width, h_chroma_shift));
+            memset(pic->f->data[2] + pic->f->linesize[2]*i,
+                   0x80, FF_CEIL_RSHIFT(pic->f->width, h_chroma_shift));
+        }
+    }
 
     if (!h->qscale_table_pool) {
         ret = init_table_pools(h);
@@ -344,10 +364,12 @@ static void init_dequant4_coeff_table(H264Context *h)
     }
 }
 
-void h264_init_dequant_tables(H264Context *h)
+void ff_h264_init_dequant_tables(H264Context *h)
 {
     int i, x;
     init_dequant4_coeff_table(h);
+    memset(h->dequant8_coeff, 0, sizeof(h->dequant8_coeff));
+
     if (h->pps.transform_8x8_mode)
         init_dequant8_coeff_table(h);
     if (h->sps.transform_bypass) {
@@ -361,12 +383,12 @@ void h264_init_dequant_tables(H264Context *h)
     }
 }
 
-#define IN_RANGE(a, b, size) (((a) >= (b)) && ((a) < ((b) + (size))))
+#define IN_RANGE(a, b, size) (((void*)(a) >= (void*)(b)) && ((void*)(a) < (void*)((b) + (size))))
 
 #define REBASE_PICTURE(pic, new_ctx, old_ctx)             \
-    ((pic && pic >= old_ctx->DPB &&                       \
-      pic < old_ctx->DPB + H264_MAX_PICTURE_COUNT) ?          \
-     &new_ctx->DPB[pic - old_ctx->DPB] : NULL)
+    (((pic) && (pic) >= (old_ctx)->DPB &&                       \
+      (pic) < (old_ctx)->DPB + H264_MAX_PICTURE_COUNT) ?          \
+     &(new_ctx)->DPB[(pic) - (old_ctx)->DPB] : NULL)
 
 static void copy_picture_range(H264Picture **to, H264Picture **from, int count,
                                H264Context *new_base,
@@ -375,10 +397,9 @@ static void copy_picture_range(H264Picture **to, H264Picture **from, int count,
     int i;
 
     for (i = 0; i < count; i++) {
-        assert((IN_RANGE(from[i], old_base, sizeof(*old_base)) ||
-                IN_RANGE(from[i], old_base->DPB,
-                         sizeof(H264Picture) * H264_MAX_PICTURE_COUNT) ||
-                !from[i]));
+        av_assert1(!from[i] ||
+                   IN_RANGE(from[i], old_base, 1) ||
+                   IN_RANGE(from[i], old_base->DPB, H264_MAX_PICTURE_COUNT));
         to[i] = REBASE_PICTURE(from[i], new_base, old_base);
     }
 }
@@ -404,8 +425,8 @@ static int copy_parameter_set(void **to, void **from, int count, int size)
 }
 
 #define copy_fields(to, from, start_field, end_field)                   \
-    memcpy(&to->start_field, &from->start_field,                        \
-           (char *)&to->end_field - (char *)&to->start_field)
+    memcpy(&(to)->start_field, &(from)->start_field,                        \
+           (char *)&(to)->end_field - (char *)&(to)->start_field)
 
 static int h264_slice_header_init(H264Context *h);
 
@@ -417,7 +438,7 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     int need_reinit = 0;
     int i, ret;
 
-    if (dst == src || !h1->context_initialized)
+    if (dst == src)
         return 0;
 
     if (inited &&
@@ -428,9 +449,13 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
          h->sps.bit_depth_luma    != h1->sps.bit_depth_luma    ||
          h->sps.chroma_format_idc != h1->sps.chroma_format_idc ||
          h->sps.colorspace        != h1->sps.colorspace)) {
+
         need_reinit = 1;
     }
 
+    /* copy block_offset since frame_start may not be called */
+    memcpy(h->block_offset, h1->block_offset, sizeof(h->block_offset));
+
     // SPS/PPS
     if ((ret = copy_parameter_set((void **)h->sps_buffers,
                                   (void **)h1->sps_buffers,
@@ -452,11 +477,12 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
         h->mb_stride = h1->mb_stride;
         h->b_stride  = h1->b_stride;
 
-        if ((err = h264_slice_header_init(h)) < 0) {
-            av_log(h->avctx, AV_LOG_ERROR, "h264_slice_header_init() failed");
-            return err;
+        if (h->context_initialized || h1->context_initialized) {
+            if ((err = h264_slice_header_init(h)) < 0) {
+                av_log(h->avctx, AV_LOG_ERROR, "h264_slice_header_init() failed");
+                return err;
+            }
         }
-
         /* copy block_offset since frame_start may not be called */
         memcpy(h->block_offset, h1->block_offset, sizeof(h->block_offset));
     }
@@ -470,6 +496,9 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     h->picture_structure    = h1->picture_structure;
     h->droppable            = h1->droppable;
     h->low_delay            = h1->low_delay;
+    h->backup_width         = h1->backup_width;
+    h->backup_height        = h1->backup_height;
+    h->backup_pix_fmt       = h1->backup_pix_fmt;
 
     for (i = 0; i < H264_MAX_PICTURE_COUNT; i++) {
         ff_h264_unref_picture(h, &h->DPB[i]);
@@ -494,6 +523,7 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     // extradata/NAL handling
     h->is_avc = h1->is_avc;
     h->nal_length_size = h1->nal_length_size;
+    h->x264_build      = h1->x264_build;
 
     // Dequantization matrices
     // FIXME these are big - can they be only copied when PPS changes?
@@ -520,7 +550,7 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     copy_picture_range(h->delayed_pic, h1->delayed_pic,
                        MAX_DELAYED_PIC_COUNT + 2, h, h1);
 
-    h->last_slice_type = h1->last_slice_type;
+    h->frame_recovered       = h1->frame_recovered;
 
     if (!h->cur_pic_ptr)
         return 0;
@@ -534,7 +564,6 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     h->prev_frame_num        = h->frame_num;
 
     h->recovery_frame        = h1->recovery_frame;
-    h->frame_recovered       = h1->frame_recovered;
 
     return err;
 }
@@ -544,6 +573,17 @@ static int h264_frame_start(H264Context *h)
     H264Picture *pic;
     int i, ret;
     const int pixel_shift = h->pixel_shift;
+    int c[4] = {
+        1<<(h->sps.bit_depth_luma-1),
+        1<<(h->sps.bit_depth_chroma-1),
+        1<<(h->sps.bit_depth_chroma-1),
+        -1
+    };
+
+    if (!ff_thread_can_start_frame(h->avctx)) {
+        av_log(h->avctx, AV_LOG_ERROR, "Attempt to start a frame outside SETUP state\n");
+        return -1;
+    }
 
     release_unused_pictures(h, 1);
     h->cur_pic_ptr = NULL;
@@ -558,6 +598,7 @@ static int h264_frame_start(H264Context *h)
     pic->reference              = h->droppable ? 0 : h->picture_structure;
     pic->f->coded_picture_number = h->coded_picture_number++;
     pic->field_picture          = h->picture_structure != PICT_FRAME;
+
     /*
      * Zero key_frame here; IDR markings per slice in frame or fields are ORed
      * in later.
@@ -566,17 +607,37 @@ static int h264_frame_start(H264Context *h)
     pic->f->key_frame = 0;
     pic->mmco_reset  = 0;
     pic->recovered   = 0;
+    pic->invalid_gap = 0;
+    pic->sei_recovery_frame_cnt = h->sei_recovery_frame_cnt;
 
     if ((ret = alloc_picture(h, pic)) < 0)
         return ret;
+    if(!h->frame_recovered && !h->avctx->hwaccel
+#if FF_API_CAP_VDPAU
+       && !(h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU)
+#endif
+       )
+        avpriv_color_frame(pic->f, c);
 
     h->cur_pic_ptr = pic;
     ff_h264_unref_picture(h, &h->cur_pic);
+    if (CONFIG_ERROR_RESILIENCE) {
+        ff_h264_set_erpic(&h->slice_ctx[0].er.cur_pic, NULL);
+    }
+
     if ((ret = ff_h264_ref_picture(h, &h->cur_pic, h->cur_pic_ptr)) < 0)
         return ret;
 
-    if (CONFIG_ERROR_RESILIENCE && h->enable_er)
+    for (i = 0; i < h->nb_slice_ctx; i++) {
+        h->slice_ctx[i].linesize   = h->cur_pic_ptr->f->linesize[0];
+        h->slice_ctx[i].uvlinesize = h->cur_pic_ptr->f->linesize[1];
+    }
+
+    if (CONFIG_ERROR_RESILIENCE && h->enable_er) {
         ff_er_frame_start(&h->slice_ctx[0].er);
+        ff_h264_set_erpic(&h->slice_ctx[0].er.last_pic, NULL);
+        ff_h264_set_erpic(&h->slice_ctx[0].er.next_pic, NULL);
+    }
 
     for (i = 0; i < 16; i++) {
         h->block_offset[i]           = (4 * ((scan8[i] - scan8[0]) & 7) << pixel_shift) + 4 * pic->f->linesize[0] * ((scan8[i] - scan8[0]) >> 3);
@@ -589,11 +650,6 @@ static int h264_frame_start(H264Context *h)
         h->block_offset[48 + 32 + i] = (4 * ((scan8[i] - scan8[0]) & 7) << pixel_shift) + 8 * pic->f->linesize[1] * ((scan8[i] - scan8[0]) >> 3);
     }
 
-    /* Some macroblocks can be accessed before they're available in case
-     * of lost slices, MBAFF or threading. */
-    memset(h->slice_table, -1,
-           (h->mb_height * h->mb_stride - 1) * sizeof(*h->slice_table));
-
     /* We mark the current picture as non-reference after allocating it, so
      * that if we break out due to an error it can be released automatically
      * in the next ff_mpv_frame_start().
@@ -779,13 +835,13 @@ static void init_scan_tables(H264Context *h)
 {
     int i;
     for (i = 0; i < 16; i++) {
-#define TRANSPOSE(x) (x >> 2) | ((x << 2) & 0xF)
+#define TRANSPOSE(x) ((x) >> 2) | (((x) << 2) & 0xF)
         h->zigzag_scan[i] = TRANSPOSE(zigzag_scan[i]);
         h->field_scan[i]  = TRANSPOSE(field_scan[i]);
 #undef TRANSPOSE
     }
     for (i = 0; i < 64; i++) {
-#define TRANSPOSE(x) (x >> 3) | ((x & 7) << 3)
+#define TRANSPOSE(x) ((x) >> 3) | (((x) & 7) << 3)
         h->zigzag_scan8x8[i]       = TRANSPOSE(ff_zigzag_direct[i]);
         h->zigzag_scan8x8_cavlc[i] = TRANSPOSE(zigzag_scan8x8_cavlc[i]);
         h->field_scan8x8[i]        = TRANSPOSE(field_scan8x8[i]);
@@ -793,31 +849,33 @@ static void init_scan_tables(H264Context *h)
 #undef TRANSPOSE
     }
     if (h->sps.transform_bypass) { // FIXME same ugly
-        h->zigzag_scan_q0          = zigzag_scan;
-        h->zigzag_scan8x8_q0       = ff_zigzag_direct;
-        h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
-        h->field_scan_q0           = field_scan;
-        h->field_scan8x8_q0        = field_scan8x8;
-        h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
+        memcpy(h->zigzag_scan_q0          , zigzag_scan             , sizeof(h->zigzag_scan_q0         ));
+        memcpy(h->zigzag_scan8x8_q0       , ff_zigzag_direct        , sizeof(h->zigzag_scan8x8_q0      ));
+        memcpy(h->zigzag_scan8x8_cavlc_q0 , zigzag_scan8x8_cavlc    , sizeof(h->zigzag_scan8x8_cavlc_q0));
+        memcpy(h->field_scan_q0           , field_scan              , sizeof(h->field_scan_q0          ));
+        memcpy(h->field_scan8x8_q0        , field_scan8x8           , sizeof(h->field_scan8x8_q0       ));
+        memcpy(h->field_scan8x8_cavlc_q0  , field_scan8x8_cavlc     , sizeof(h->field_scan8x8_cavlc_q0 ));
     } else {
-        h->zigzag_scan_q0          = h->zigzag_scan;
-        h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
-        h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
-        h->field_scan_q0           = h->field_scan;
-        h->field_scan8x8_q0        = h->field_scan8x8;
-        h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
+        memcpy(h->zigzag_scan_q0          , h->zigzag_scan          , sizeof(h->zigzag_scan_q0         ));
+        memcpy(h->zigzag_scan8x8_q0       , h->zigzag_scan8x8       , sizeof(h->zigzag_scan8x8_q0      ));
+        memcpy(h->zigzag_scan8x8_cavlc_q0 , h->zigzag_scan8x8_cavlc , sizeof(h->zigzag_scan8x8_cavlc_q0));
+        memcpy(h->field_scan_q0           , h->field_scan           , sizeof(h->field_scan_q0          ));
+        memcpy(h->field_scan8x8_q0        , h->field_scan8x8        , sizeof(h->field_scan8x8_q0       ));
+        memcpy(h->field_scan8x8_cavlc_q0  , h->field_scan8x8_cavlc  , sizeof(h->field_scan8x8_cavlc_q0 ));
     }
 }
 
-static enum AVPixelFormat get_pixel_format(H264Context *h)
+static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback)
 {
 #define HWACCEL_MAX (CONFIG_H264_DXVA2_HWACCEL + \
                      CONFIG_H264_D3D11VA_HWACCEL + \
                      CONFIG_H264_VAAPI_HWACCEL + \
                      (CONFIG_H264_VDA_HWACCEL * 2) + \
+                     CONFIG_H264_VIDEOTOOLBOX_HWACCEL + \
                      CONFIG_H264_VDPAU_HWACCEL)
     enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
     const enum AVPixelFormat *choices = pix_fmts;
+    int i;
 
     switch (h->sps.bit_depth_luma) {
     case 9:
@@ -842,6 +900,28 @@ static enum AVPixelFormat get_pixel_format(H264Context *h)
         else
             *fmt++ = AV_PIX_FMT_YUV420P10;
         break;
+    case 12:
+        if (CHROMA444(h)) {
+            if (h->avctx->colorspace == AVCOL_SPC_RGB) {
+                *fmt++ = AV_PIX_FMT_GBRP12;
+            } else
+                *fmt++ = AV_PIX_FMT_YUV444P12;
+        } else if (CHROMA422(h))
+            *fmt++ = AV_PIX_FMT_YUV422P12;
+        else
+            *fmt++ = AV_PIX_FMT_YUV420P12;
+        break;
+    case 14:
+        if (CHROMA444(h)) {
+            if (h->avctx->colorspace == AVCOL_SPC_RGB) {
+                *fmt++ = AV_PIX_FMT_GBRP14;
+            } else
+                *fmt++ = AV_PIX_FMT_YUV444P14;
+        } else if (CHROMA422(h))
+            *fmt++ = AV_PIX_FMT_YUV422P14;
+        else
+            *fmt++ = AV_PIX_FMT_YUV420P14;
+        break;
     case 8:
 #if CONFIG_H264_VDPAU_HWACCEL
         *fmt++ = AV_PIX_FMT_VDPAU;
@@ -866,12 +946,15 @@ static enum AVPixelFormat get_pixel_format(H264Context *h)
             *fmt++ = AV_PIX_FMT_D3D11VA_VLD;
 #endif
 #if CONFIG_H264_VAAPI_HWACCEL
-            *fmt++ = AV_PIX_FMT_VAAPI_VLD;
+            *fmt++ = AV_PIX_FMT_VAAPI;
 #endif
 #if CONFIG_H264_VDA_HWACCEL
             *fmt++ = AV_PIX_FMT_VDA_VLD;
             *fmt++ = AV_PIX_FMT_VDA;
 #endif
+#if CONFIG_H264_VIDEOTOOLBOX_HWACCEL
+            *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
+#endif
             if (h->avctx->codec->pix_fmts)
                 choices = h->avctx->codec->pix_fmts;
             else if (h->avctx->color_range == AVCOL_RANGE_JPEG)
@@ -888,7 +971,10 @@ static enum AVPixelFormat get_pixel_format(H264Context *h)
 
     *fmt = AV_PIX_FMT_NONE;
 
-    return ff_get_format(h->avctx, choices);
+    for (i=0; choices[i] != AV_PIX_FMT_NONE; i++)
+        if (choices[i] == h->avctx->pix_fmt && !force_callback)
+            return choices[i];
+    return ff_thread_get_format(h->avctx, choices);
 }
 
 /* export coded and cropped frame dimensions to AVCodecContext */
@@ -896,10 +982,15 @@ static int init_dimensions(H264Context *h)
 {
     int width  = h->width  - (h->sps.crop_right + h->sps.crop_left);
     int height = h->height - (h->sps.crop_top   + h->sps.crop_bottom);
+    av_assert0(h->sps.crop_right + h->sps.crop_left < (unsigned)h->width);
+    av_assert0(h->sps.crop_top + h->sps.crop_bottom < (unsigned)h->height);
 
     /* handle container cropping */
     if (FFALIGN(h->avctx->width,  16) == FFALIGN(width,  16) &&
-        FFALIGN(h->avctx->height, 16) == FFALIGN(height, 16)) {
+        FFALIGN(h->avctx->height, 16) == FFALIGN(height, 16) &&
+        h->avctx->width  <= width &&
+        h->avctx->height <= height
+    ) {
         width  = h->avctx->width;
         height = h->avctx->height;
     }
@@ -945,7 +1036,7 @@ static int h264_slice_header_init(H264Context *h)
         if (h->x264_build < 44U)
             den *= 2;
         av_reduce(&h->avctx->framerate.den, &h->avctx->framerate.num,
-                  h->sps.num_units_in_tick, den, 1 << 30);
+                  h->sps.num_units_in_tick * h->avctx->ticks_per_frame, den, 1 << 30);
     }
 
     ff_h264_free_tables(h);
@@ -957,16 +1048,32 @@ static int h264_slice_header_init(H264Context *h)
     ret = ff_h264_alloc_tables(h);
     if (ret < 0) {
         av_log(h->avctx, AV_LOG_ERROR, "Could not allocate memory\n");
-        return ret;
+        goto fail;
     }
 
-    if (h->sps.bit_depth_luma < 8 || h->sps.bit_depth_luma > 10) {
+#if FF_API_CAP_VDPAU
+    if (h->avctx->codec &&
+        h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU &&
+        (h->sps.bit_depth_luma != 8 || h->sps.chroma_format_idc > 1)) {
+        av_log(h->avctx, AV_LOG_ERROR,
+                "VDPAU decoding does not support video colorspace.\n");
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+#endif
+
+    if (h->sps.bit_depth_luma < 8 || h->sps.bit_depth_luma > 14 ||
+        h->sps.bit_depth_luma == 11 || h->sps.bit_depth_luma == 13
+    ) {
         av_log(h->avctx, AV_LOG_ERROR, "Unsupported bit depth %d\n",
                h->sps.bit_depth_luma);
-        return AVERROR_INVALIDDATA;
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
     }
 
+    h->cur_bit_depth_luma         =
     h->avctx->bits_per_raw_sample = h->sps.bit_depth_luma;
+    h->cur_chroma_format_idc      = h->sps.chroma_format_idc;
     h->pixel_shift                = h->sps.bit_depth_luma > 8;
     h->chroma_format_idc          = h->sps.chroma_format_idc;
     h->bit_depth_luma             = h->sps.bit_depth_luma;
@@ -995,7 +1102,7 @@ static int h264_slice_header_init(H264Context *h)
         ret = ff_h264_slice_context_init(h, &h->slice_ctx[0]);
         if (ret < 0) {
             av_log(h->avctx, AV_LOG_ERROR, "context_init() failed.\n");
-            return ret;
+            goto fail;
         }
     } else {
         for (i = 0; i < h->slice_context_count; i++) {
@@ -1008,7 +1115,7 @@ static int h264_slice_header_init(H264Context *h)
 
             if ((ret = ff_h264_slice_context_init(h, sl)) < 0) {
                 av_log(h->avctx, AV_LOG_ERROR, "context_init() failed.\n");
-                return ret;
+                goto fail;
             }
         }
     }
@@ -1016,6 +1123,21 @@ static int h264_slice_header_init(H264Context *h)
     h->context_initialized = 1;
 
     return 0;
+fail:
+    ff_h264_free_tables(h);
+    h->context_initialized = 0;
+    return ret;
+}
+
+static enum AVPixelFormat non_j_pixfmt(enum AVPixelFormat a)
+{
+    switch (a) {
+    case AV_PIX_FMT_YUVJ420P: return AV_PIX_FMT_YUV420P;
+    case AV_PIX_FMT_YUVJ422P: return AV_PIX_FMT_YUV422P;
+    case AV_PIX_FMT_YUVJ444P: return AV_PIX_FMT_YUV444P;
+    default:
+        return a;
+    }
 }
 
 /**
@@ -1032,24 +1154,47 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
     unsigned int pps_id;
     int ret;
     unsigned int slice_type, tmp, i, j;
-    int default_ref_list_done = 0;
     int last_pic_structure, last_pic_droppable;
+    int must_reinit;
     int needs_reinit = 0;
     int field_pic_flag, bottom_field_flag;
+    int first_slice = sl == h->slice_ctx && !h->current_slice;
     int frame_num, droppable, picture_structure;
-    int mb_aff_frame = 0;
+    int mb_aff_frame, last_mb_aff_frame;
+    PPS *pps;
+
+    if (first_slice)
+        av_assert0(!h->setup_finished);
 
     h->qpel_put = h->h264qpel.put_h264_qpel_pixels_tab;
     h->qpel_avg = h->h264qpel.avg_h264_qpel_pixels_tab;
 
-    first_mb_in_slice = get_ue_golomb(&sl->gb);
+    first_mb_in_slice = get_ue_golomb_long(&sl->gb);
 
     if (first_mb_in_slice == 0) { // FIXME better field boundary detection
-        if (h->current_slice && h->cur_pic_ptr && FIELD_PICTURE(h)) {
-            ff_h264_field_end(h, sl, 1);
+        if (h->current_slice) {
+            if (h->setup_finished) {
+                av_log(h->avctx, AV_LOG_ERROR, "Too many fields\n");
+                return AVERROR_INVALIDDATA;
+            }
+            if (h->cur_pic_ptr && FIELD_PICTURE(h) && h->first_field) {
+                ret = ff_h264_field_end(h, h->slice_ctx, 1);
+                h->current_slice = 0;
+                if (ret < 0)
+                    return ret;
+            } else if (h->cur_pic_ptr && !FIELD_PICTURE(h) && !h->first_field && h->nal_unit_type  == NAL_IDR_SLICE) {
+                av_log(h, AV_LOG_WARNING, "Broken frame packetizing\n");
+                ret = ff_h264_field_end(h, h->slice_ctx, 1);
+                h->current_slice = 0;
+                ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, 0);
+                ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, 1);
+                h->cur_pic_ptr = NULL;
+                if (ret < 0)
+                    return ret;
+            } else
+                return AVERROR_INVALIDDATA;
         }
 
-        h->current_slice = 0;
         if (!h->first_field) {
             if (h->cur_pic_ptr && !h->droppable) {
                 ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
@@ -1073,10 +1218,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
         sl->slice_type_fixed = 0;
 
     slice_type = golomb_to_pict_type[slice_type];
-    if (slice_type == AV_PICTURE_TYPE_I ||
-        (h->current_slice != 0 && slice_type == h->last_slice_type)) {
-        default_ref_list_done = 1;
-    }
+
     sl->slice_type     = slice_type;
     sl->slice_type_nos = slice_type & 3;
 
@@ -1086,6 +1228,17 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
         return AVERROR_INVALIDDATA;
     }
 
+    if (h->current_slice == 0 && !h->first_field) {
+        if (
+            (h->avctx->skip_frame >= AVDISCARD_NONREF && !h->nal_ref_idc) ||
+            (h->avctx->skip_frame >= AVDISCARD_BIDIR  && sl->slice_type_nos == AV_PICTURE_TYPE_B) ||
+            (h->avctx->skip_frame >= AVDISCARD_NONINTRA && sl->slice_type_nos != AV_PICTURE_TYPE_I) ||
+            (h->avctx->skip_frame >= AVDISCARD_NONKEY && h->nal_unit_type != NAL_IDR_SLICE && h->sei_recovery_frame_cnt < 0) ||
+            h->avctx->skip_frame >= AVDISCARD_ALL) {
+            return SLICE_SKIPED;
+        }
+    }
+
     // to make a few old functions happy, it's wrong though
     if (!h->setup_finished)
         h->pict_type = sl->slice_type;
@@ -1101,26 +1254,48 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                pps_id);
         return AVERROR_INVALIDDATA;
     }
-    if (!h->setup_finished) {
-        h->pps = *h->pps_buffers[pps_id];
-    } else if (h->dequant_coeff_pps != pps_id) {
-        av_log(h->avctx, AV_LOG_ERROR, "PPS changed between slices\n");
+    if (h->au_pps_id >= 0 && pps_id != h->au_pps_id) {
+        av_log(h->avctx, AV_LOG_ERROR,
+               "PPS change from %d to %d forbidden\n",
+               h->au_pps_id, pps_id);
         return AVERROR_INVALIDDATA;
     }
 
-    if (!h->sps_buffers[h->pps.sps_id]) {
+    pps = h->pps_buffers[pps_id];
+
+    if (!h->sps_buffers[pps->sps_id]) {
         av_log(h->avctx, AV_LOG_ERROR,
                "non-existing SPS %u referenced\n",
                h->pps.sps_id);
         return AVERROR_INVALIDDATA;
     }
 
-    if (h->pps.sps_id != h->sps.sps_id ||
-        h->sps_buffers[h->pps.sps_id]->new) {
-        h->sps_buffers[h->pps.sps_id]->new = 0;
+    if (first_slice) {
+        h->pps = *h->pps_buffers[pps_id];
+    } else if (h->setup_finished && h->dequant_coeff_pps != pps_id) {
+        av_log(h->avctx, AV_LOG_ERROR, "PPS changed between slices\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (pps->sps_id != h->sps.sps_id ||
+        pps->sps_id != h->current_sps_id ||
+        h->sps_buffers[pps->sps_id]->new) {
+
+        if (!first_slice) {
+            av_log(h->avctx, AV_LOG_ERROR,
+               "SPS changed in the middle of the frame\n");
+            return AVERROR_INVALIDDATA;
+        }
 
         h->sps = *h->sps_buffers[h->pps.sps_id];
 
+        if (h->mb_width  != h->sps.mb_width ||
+            h->mb_height != h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag) ||
+            h->cur_bit_depth_luma    != h->sps.bit_depth_luma ||
+            h->cur_chroma_format_idc != h->sps.chroma_format_idc
+        )
+            needs_reinit = 1;
+
         if (h->bit_depth_luma    != h->sps.bit_depth_luma ||
             h->chroma_format_idc != h->sps.chroma_format_idc)
             needs_reinit         = 1;
@@ -1140,15 +1315,26 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
 
     }
 
+    must_reinit = (h->context_initialized &&
+                    (   16*h->sps.mb_width != h->avctx->coded_width
+                     || 16*h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag) != h->avctx->coded_height
+                     || h->cur_bit_depth_luma    != h->sps.bit_depth_luma
+                     || h->cur_chroma_format_idc != h->sps.chroma_format_idc
+                     || h->mb_width  != h->sps.mb_width
+                     || h->mb_height != h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag)
+                    ));
+    if (h->avctx->pix_fmt == AV_PIX_FMT_NONE
+        || (non_j_pixfmt(h->avctx->pix_fmt) != non_j_pixfmt(get_pixel_format(h, 0))))
+        must_reinit = 1;
+
+    if (first_slice && av_cmp_q(h->sps.sar, h->avctx->sample_aspect_ratio))
+        must_reinit = 1;
+
     if (!h->setup_finished) {
         h->avctx->profile = ff_h264_get_profile(&h->sps);
         h->avctx->level   = h->sps.level_idc;
         h->avctx->refs    = h->sps.ref_frame_count;
 
-        if (h->mb_width  != h->sps.mb_width ||
-            h->mb_height != h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag))
-            needs_reinit = 1;
-
         h->mb_width  = h->sps.mb_width;
         h->mb_height = h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
         h->mb_num    = h->mb_width * h->mb_height;
@@ -1166,8 +1352,8 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             return ret;
 
         if (h->sps.video_signal_type_present_flag) {
-            h->avctx->color_range = h->sps.full_range ? AVCOL_RANGE_JPEG
-                : AVCOL_RANGE_MPEG;
+            h->avctx->color_range = h->sps.full_range>0 ? AVCOL_RANGE_JPEG
+                                                        : AVCOL_RANGE_MPEG;
             if (h->sps.colour_description_present_flag) {
                 if (h->avctx->colorspace != h->sps.colorspace)
                     needs_reinit = 1;
@@ -1178,7 +1364,8 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
         }
     }
 
-    if (h->context_initialized && needs_reinit) {
+    if (h->context_initialized &&
+        (must_reinit || needs_reinit)) {
         h->context_initialized = 0;
         if (sl != h->slice_ctx) {
             av_log(h->avctx, AV_LOG_ERROR,
@@ -1190,14 +1377,16 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             return AVERROR_INVALIDDATA;
         }
 
+        av_assert1(first_slice);
+
         ff_h264_flush_change(h);
 
-        if ((ret = get_pixel_format(h)) < 0)
+        if ((ret = get_pixel_format(h, 1)) < 0)
             return ret;
         h->avctx->pix_fmt = ret;
 
         av_log(h->avctx, AV_LOG_INFO, "Reinit context to %dx%d, "
-               "pix_fmt: %d\n", h->width, h->height, h->avctx->pix_fmt);
+               "pix_fmt: %s\n", h->width, h->height, av_get_pix_fmt_name(h->avctx->pix_fmt));
 
         if ((ret = h264_slice_header_init(h)) < 0) {
             av_log(h->avctx, AV_LOG_ERROR,
@@ -1212,7 +1401,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             return AVERROR_PATCHWELCOME;
         }
 
-        if ((ret = get_pixel_format(h)) < 0)
+        if ((ret = get_pixel_format(h, 1)) < 0)
             return ret;
         h->avctx->pix_fmt = ret;
 
@@ -1223,17 +1412,26 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
         }
     }
 
-    if (sl == h->slice_ctx && h->dequant_coeff_pps != pps_id) {
+    if (first_slice && h->dequant_coeff_pps != pps_id) {
         h->dequant_coeff_pps = pps_id;
-        h264_init_dequant_tables(h);
+        ff_h264_init_dequant_tables(h);
     }
 
     frame_num = get_bits(&sl->gb, h->sps.log2_max_frame_num);
+    if (!first_slice) {
+        if (h->frame_num != frame_num) {
+            av_log(h->avctx, AV_LOG_ERROR, "Frame num change from %d to %d\n",
+                   h->frame_num, frame_num);
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     if (!h->setup_finished)
         h->frame_num = frame_num;
 
     sl->mb_mbaff       = 0;
-
+    mb_aff_frame       = 0;
+    last_mb_aff_frame  = h->mb_aff_frame;
     last_pic_structure = h->picture_structure;
     last_pic_droppable = h->droppable;
 
@@ -1241,7 +1439,12 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
     if (h->sps.frame_mbs_only_flag) {
         picture_structure = PICT_FRAME;
     } else {
+        if (!h->sps.direct_8x8_inference_flag && slice_type == AV_PICTURE_TYPE_B) {
+            av_log(h->avctx, AV_LOG_ERROR, "This stream was generated by a broken encoder, invalid 8x8 inference\n");
+            return -1;
+        }
         field_pic_flag = get_bits1(&sl->gb);
+
         if (field_pic_flag) {
             bottom_field_flag = get_bits1(&sl->gb);
             picture_structure = PICT_TOP_FIELD + bottom_field_flag;
@@ -1250,16 +1453,11 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             mb_aff_frame      = h->sps.mb_aff;
         }
     }
-    if (!h->setup_finished) {
-        h->droppable         = droppable;
-        h->picture_structure = picture_structure;
-        h->mb_aff_frame      = mb_aff_frame;
-    }
-    sl->mb_field_decoding_flag = h->picture_structure != PICT_FRAME;
 
-    if (h->current_slice != 0) {
+    if (h->current_slice) {
         if (last_pic_structure != picture_structure ||
-            last_pic_droppable != droppable) {
+            last_pic_droppable != droppable ||
+            last_mb_aff_frame  != mb_aff_frame) {
             av_log(h->avctx, AV_LOG_ERROR,
                    "Changing field mode (%d -> %d) between slices is not allowed\n",
                    last_pic_structure, h->picture_structure);
@@ -1270,7 +1468,17 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                    h->current_slice + 1);
             return AVERROR_INVALIDDATA;
         }
-    } else {
+    }
+
+    h->picture_structure = picture_structure;
+    if (!h->setup_finished) {
+        h->droppable         = droppable;
+        h->picture_structure = picture_structure;
+        h->mb_aff_frame      = mb_aff_frame;
+    }
+    sl->mb_field_decoding_flag = picture_structure != PICT_FRAME;
+
+    if (h->current_slice == 0) {
         /* Shorten frame num gaps so we don't have to allocate reference
          * frames just to throw them away */
         if (h->frame_num != h->prev_frame_num) {
@@ -1293,17 +1501,23 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
          * Here, we're using that to see if we should mark previously
          * decode frames as "finished".
          * We have to do that before the "dummy" in-between frame allocation,
-         * since that can modify s->current_picture_ptr. */
+         * since that can modify h->cur_pic_ptr. */
         if (h->first_field) {
-            assert(h->cur_pic_ptr);
-            assert(h->cur_pic_ptr->f->buf[0]);
+            av_assert0(h->cur_pic_ptr);
+            av_assert0(h->cur_pic_ptr->f->buf[0]);
             assert(h->cur_pic_ptr->reference != DELAYED_PIC_REF);
 
+            /* Mark old field/frame as completed */
+            if (h->cur_pic_ptr->tf.owner == h->avctx) {
+                ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
+                                          last_pic_structure == PICT_BOTTOM_FIELD);
+            }
+
             /* figure out if we have a complementary field pair */
             if (!FIELD_PICTURE(h) || h->picture_structure == last_pic_structure) {
                 /* Previous field is unmatched. Don't display it, but let it
                  * remain for reference if marked as such. */
-                if (!last_pic_droppable && last_pic_structure != PICT_FRAME) {
+                if (last_pic_structure != PICT_FRAME) {
                     ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
                                               last_pic_structure == PICT_TOP_FIELD);
                 }
@@ -1313,7 +1527,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                      * different frame_nums. Consider this field first in
                      * pair. Throw away previous field except for reference
                      * purposes. */
-                    if (!last_pic_droppable && last_pic_structure != PICT_FRAME) {
+                    if (last_pic_structure != PICT_FRAME) {
                         ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
                                                   last_pic_structure == PICT_TOP_FIELD);
                     }
@@ -1340,11 +1554,14 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             }
         }
 
-        while (h->frame_num != h->prev_frame_num &&
+        while (h->frame_num != h->prev_frame_num && !h->first_field &&
                h->frame_num != (h->prev_frame_num + 1) % (1 << h->sps.log2_max_frame_num)) {
             H264Picture *prev = h->short_ref_count ? h->short_ref[0] : NULL;
             av_log(h->avctx, AV_LOG_DEBUG, "Frame num gap %d %d\n",
                    h->frame_num, h->prev_frame_num);
+            if (!h->sps.gaps_in_frame_num_allowed_flag)
+                for(i=0; i<FF_ARRAY_ELEMS(h->last_pocs); i++)
+                    h->last_pocs[i] = INT_MIN;
             ret = h264_frame_start(h);
             if (ret < 0) {
                 h->first_field = 0;
@@ -1354,6 +1571,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             h->prev_frame_num++;
             h->prev_frame_num        %= 1 << h->sps.log2_max_frame_num;
             h->cur_pic_ptr->frame_num = h->prev_frame_num;
+            h->cur_pic_ptr->invalid_gap = !h->sps.gaps_in_frame_num_allowed_flag;
             ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, 0);
             ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, 1);
             ret = ff_generate_sliding_window_mmcos(h, 1);
@@ -1379,8 +1597,8 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                                   (const uint8_t **)prev->f->data,
                                   prev->f->linesize,
                                   prev->f->format,
-                                  h->mb_width  * 16,
-                                  h->mb_height * 16);
+                                  prev->f->width,
+                                  prev->f->height);
                     h->short_ref[0]->poc = prev->poc + 2;
                 }
                 h->short_ref[0]->frame_num = h->prev_frame_num;
@@ -1391,18 +1609,22 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
          * We're using that to see whether to continue decoding in that
          * frame, or to allocate a new one. */
         if (h->first_field) {
-            assert(h->cur_pic_ptr);
-            assert(h->cur_pic_ptr->f->buf[0]);
+            av_assert0(h->cur_pic_ptr);
+            av_assert0(h->cur_pic_ptr->f->buf[0]);
             assert(h->cur_pic_ptr->reference != DELAYED_PIC_REF);
 
             /* figure out if we have a complementary field pair */
             if (!FIELD_PICTURE(h) || h->picture_structure == last_pic_structure) {
                 /* Previous field is unmatched. Don't display it, but let it
                  * remain for reference if marked as such. */
+                h->missing_fields ++;
                 h->cur_pic_ptr = NULL;
                 h->first_field = FIELD_PICTURE(h);
             } else {
+                h->missing_fields = 0;
                 if (h->cur_pic_ptr->frame_num != h->frame_num) {
+                    ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
+                                              h->picture_structure==PICT_BOTTOM_FIELD);
                     /* This and the previous field had different frame_nums.
                      * Consider this field first in pair. Throw away previous
                      * one except for reference purposes. */
@@ -1426,12 +1648,22 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
         } else {
             release_unused_pictures(h, 0);
         }
+        /* Some macroblocks can be accessed before they're available in case
+        * of lost slices, MBAFF or threading. */
+        if (FIELD_PICTURE(h)) {
+            for(i = (h->picture_structure == PICT_BOTTOM_FIELD); i<h->mb_height; i++)
+                memset(h->slice_table + i*h->mb_stride, -1, (h->mb_stride - (i+1==h->mb_height)) * sizeof(*h->slice_table));
+        } else {
+            memset(h->slice_table, -1,
+                (h->mb_height * h->mb_stride - 1) * sizeof(*h->slice_table));
+        }
+        h->last_slice_type = -1;
     }
 
     if (!h->setup_finished)
         h->cur_pic_ptr->frame_num = h->frame_num; // FIXME frame_num cleanup
 
-    assert(h->mb_num == h->mb_width * h->mb_height);
+    av_assert1(h->mb_num == h->mb_width * h->mb_height);
     if (first_mb_in_slice << FIELD_OR_MBAFF_PICTURE(h) >= h->mb_num ||
         first_mb_in_slice >= h->mb_num) {
         av_log(h->avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
@@ -1442,7 +1674,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                                  FIELD_OR_MBAFF_PICTURE(h);
     if (h->picture_structure == PICT_BOTTOM_FIELD)
         sl->resync_mb_y = sl->mb_y = sl->mb_y + 1;
-    assert(sl->mb_y < h->mb_height);
+    av_assert1(sl->mb_y < h->mb_height);
 
     if (h->picture_structure == PICT_FRAME) {
         h->curr_pic_num = h->frame_num;
@@ -1491,11 +1723,14 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
     ret = ff_set_ref_count(h, sl);
     if (ret < 0)
         return ret;
-    else if (ret == 1)
-        default_ref_list_done = 0;
 
-    if (!default_ref_list_done)
+    if (slice_type != AV_PICTURE_TYPE_I &&
+        (h->current_slice == 0 ||
+         slice_type != h->last_slice_type ||
+         memcmp(h->last_ref_count, sl->ref_count, sizeof(sl->ref_count)))) {
+
         ff_h264_fill_default_ref_list(h, sl);
+    }
 
     if (sl->slice_type_nos != AV_PICTURE_TYPE_I) {
        ret = ff_h264_decode_ref_pic_list_reordering(h, sl);
@@ -1602,6 +1837,8 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
 
     if (h->avctx->skip_loop_filter >= AVDISCARD_ALL ||
         (h->avctx->skip_loop_filter >= AVDISCARD_NONKEY &&
+         h->nal_unit_type != NAL_IDR_SLICE) ||
+        (h->avctx->skip_loop_filter >= AVDISCARD_NONINTRA &&
          sl->slice_type_nos != AV_PICTURE_TYPE_I) ||
         (h->avctx->skip_loop_filter >= AVDISCARD_BIDIR  &&
          sl->slice_type_nos == AV_PICTURE_TYPE_B) ||
@@ -1618,13 +1855,16 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             h->max_contexts = 1;
             if (!h->single_decode_warning) {
                 av_log(h->avctx, AV_LOG_INFO,
-                       "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
+                       "Cannot parallelize slice decoding with deblocking filter type 1, decoding such frames in sequential order\n"
+                       "To parallelize slice decoding you need video encoded with disable_deblocking_filter_idc set to 2 (deblock only edges that do not cross slices).\n"
+                       "Setting the flags2 libavcodec option to +fast (-flags2 +fast) will disable deblocking across slices and enable parallel slice decoding "
+                       "but will generate non-standard-compliant output.\n");
                 h->single_decode_warning = 1;
             }
             if (sl != h->slice_ctx) {
                 av_log(h->avctx, AV_LOG_ERROR,
                        "Deblocking switched inside frame.\n");
-                return 1;
+                return SLICE_SINGLETHREAD;
             }
         }
     }
@@ -1636,10 +1876,16 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                    6 * (h->sps.bit_depth_luma - 8);
 
     h->last_slice_type = slice_type;
+    memcpy(h->last_ref_count, sl->ref_count, sizeof(h->last_ref_count));
     sl->slice_num       = ++h->current_slice;
-    if (sl->slice_num >= MAX_SLICES) {
-        av_log(h->avctx, AV_LOG_ERROR,
-               "Too many slices, increase MAX_SLICES and recompile\n");
+
+    if (sl->slice_num)
+        h->slice_row[(sl->slice_num-1)&(MAX_SLICES-1)]= sl->resync_mb_y;
+    if (   h->slice_row[sl->slice_num&(MAX_SLICES-1)] + 3 >= sl->resync_mb_y
+        && h->slice_row[sl->slice_num&(MAX_SLICES-1)] <= sl->resync_mb_y
+        && sl->slice_num >= MAX_SLICES) {
+        //in case of ASO this check needs to be updated depending on how we decide to assign slice numbers in this case
+        av_log(h->avctx, AV_LOG_WARNING, "Possibly too many slices (%d >= %d), increase MAX_SLICES and recompile if there are artifacts\n", sl->slice_num, MAX_SLICES);
     }
 
     for (j = 0; j < 2; j++) {
@@ -1675,6 +1921,11 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                              (sl->ref_list[j][i].reference & 3);
     }
 
+    h->au_pps_id = pps_id;
+    h->sps.new =
+    h->sps_buffers[h->pps.sps_id]->new = 0;
+    h->current_sps_id = h->pps.sps_id;
+
     if (h->avctx->debug & FF_DEBUG_PICT_INFO) {
         av_log(h->avctx, AV_LOG_DEBUG,
                "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
@@ -1732,7 +1983,7 @@ static av_always_inline void fill_filter_caches_inter(const H264Context *h,
         if (USES_LIST(top_type, list)) {
             const int b_xy  = h->mb2b_xy[top_xy] + 3 * b_stride;
             const int b8_xy = 4 * top_xy + 2;
-            int (*ref2frm)[64] = sl->ref2frm[h->slice_table[top_xy] & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2);
+            int (*ref2frm)[64] = (void*)(sl->ref2frm[h->slice_table[top_xy] & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2));
             AV_COPY128(mv_dst - 1 * 8, h->cur_pic.motion_val[list][b_xy + 0]);
             ref_cache[0 - 1 * 8] =
             ref_cache[1 - 1 * 8] = ref2frm[list][h->cur_pic.ref_index[list][b8_xy + 0]];
@@ -1747,7 +1998,7 @@ static av_always_inline void fill_filter_caches_inter(const H264Context *h,
             if (USES_LIST(left_type[LTOP], list)) {
                 const int b_xy  = h->mb2b_xy[left_xy[LTOP]] + 3;
                 const int b8_xy = 4 * left_xy[LTOP] + 1;
-                int (*ref2frm)[64] = sl->ref2frm[h->slice_table[left_xy[LTOP]] & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2);
+                int (*ref2frm)[64] =(void*)( sl->ref2frm[h->slice_table[left_xy[LTOP]] & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2));
                 AV_COPY32(mv_dst - 1 +  0, h->cur_pic.motion_val[list][b_xy + b_stride * 0]);
                 AV_COPY32(mv_dst - 1 +  8, h->cur_pic.motion_val[list][b_xy + b_stride * 1]);
                 AV_COPY32(mv_dst - 1 + 16, h->cur_pic.motion_val[list][b_xy + b_stride * 2]);
@@ -1780,7 +2031,7 @@ static av_always_inline void fill_filter_caches_inter(const H264Context *h,
 
     {
         int8_t *ref = &h->cur_pic.ref_index[list][4 * mb_xy];
-        int (*ref2frm)[64] = sl->ref2frm[sl->slice_num & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2);
+        int (*ref2frm)[64] = (void*)(sl->ref2frm[sl->slice_num & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2));
         uint32_t ref01 = (pack16to32(ref2frm[list][ref[0]], ref2frm[list][ref[1]]) & 0x00FF00FF) * 0x0101;
         uint32_t ref23 = (pack16to32(ref2frm[list][ref[2]], ref2frm[list][ref[3]]) & 0x00FF00FF) * 0x0101;
         AV_WN32A(&ref_cache[0 * 8], ref01);
@@ -2053,7 +2304,7 @@ static void decode_finish_row(const H264Context *h, H264SliceContext *sl)
 
     ff_h264_draw_horiz_band(h, sl, top, height);
 
-    if (h->droppable)
+    if (h->droppable || sl->h264->slice_ctx[0].er.error_occurred)
         return;
 
     ff_thread_report_progress(&h->cur_pic_ptr->tf, top + height - 1,
@@ -2064,15 +2315,14 @@ static void er_add_slice(H264SliceContext *sl,
                          int startx, int starty,
                          int endx, int endy, int status)
 {
-#if CONFIG_ERROR_RESILIENCE
-    ERContext *er = &sl->er;
-
     if (!sl->h264->enable_er)
         return;
 
-    er->ref_count = sl->ref_count[0];
-    ff_er_add_slice(er, startx, starty, endx, endy, status);
-#endif
+    if (CONFIG_ERROR_RESILIENCE) {
+        ERContext *er = &sl->h264->slice_ctx[0].er;
+
+        ff_er_add_slice(er, startx, starty, endx, endy, status);
+    }
 }
 
 static int decode_slice(struct AVCodecContext *avctx, void *arg)
@@ -2091,10 +2341,22 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
 
     sl->mb_skip_run = -1;
 
+    av_assert0(h->block_offset[15] == (4 * ((scan8[15] - scan8[0]) & 7) << h->pixel_shift) + 4 * sl->linesize * ((scan8[15] - scan8[0]) >> 3));
+
     sl->is_complex = FRAME_MBAFF(h) || h->picture_structure != PICT_FRAME ||
                      avctx->codec_id != AV_CODEC_ID_H264 ||
                      (CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
 
+    if (!(h->avctx->active_thread_type & FF_THREAD_SLICE) && h->picture_structure == PICT_FRAME && h->slice_ctx[0].er.error_status_table) {
+        const int start_i  = av_clip(sl->resync_mb_x + sl->resync_mb_y * h->mb_width, 0, h->mb_num - 1);
+        if (start_i) {
+            int prev_status = h->slice_ctx[0].er.error_status_table[h->slice_ctx[0].er.mb_index2xy[start_i - 1]];
+            prev_status &= ~ VP_START;
+            if (prev_status != (ER_MV_END | ER_DC_END | ER_AC_END))
+                h->slice_ctx[0].er.error_occurred = 1;
+        }
+    }
+
     if (h->pps.cabac) {
         /* realign */
         align_get_bits(&sl->gb);
@@ -2109,10 +2371,11 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
         for (;;) {
             // START_TIMER
             int ret, eos;
-
             if (sl->mb_x + sl->mb_y * h->mb_width >= sl->next_slice_idx) {
                 av_log(h->avctx, AV_LOG_ERROR, "Slice overlaps with next at %d\n",
                        sl->next_slice_idx);
+                er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y, sl->mb_x,
+                             sl->mb_y, ER_MB_ERROR);
                 return AVERROR_INVALIDDATA;
             }
 
@@ -2142,9 +2405,11 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
                     loop_filter(h, sl, lf_x_start, sl->mb_x + 1);
                 return 0;
             }
-            if (ret < 0 || sl->cabac.bytestream > sl->cabac.bytestream_end + 2) {
+            if (sl->cabac.bytestream > sl->cabac.bytestream_end + 2 )
+                av_log(h->avctx, AV_LOG_DEBUG, "bytestream overread %"PTRDIFF_SPECIFIER"\n", sl->cabac.bytestream_end - sl->cabac.bytestream);
+            if (ret < 0 || sl->cabac.bytestream > sl->cabac.bytestream_end + 4) {
                 av_log(h->avctx, AV_LOG_ERROR,
-                       "error while decoding MB %d %d, bytestream %td\n",
+                       "error while decoding MB %d %d, bytestream %"PTRDIFF_SPECIFIER"\n",
                        sl->mb_x, sl->mb_y,
                        sl->cabac.bytestream_end - sl->cabac.bytestream);
                 er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y, sl->mb_x,
@@ -2181,6 +2446,8 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
             if (sl->mb_x + sl->mb_y * h->mb_width >= sl->next_slice_idx) {
                 av_log(h->avctx, AV_LOG_ERROR, "Slice overlaps with next at %d\n",
                        sl->next_slice_idx);
+                er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y, sl->mb_x,
+                             sl->mb_y, ER_MB_ERROR);
                 return AVERROR_INVALIDDATA;
             }
 
@@ -2221,14 +2488,15 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
                     ff_tlog(h->avctx, "slice end %d %d\n",
                             get_bits_count(&sl->gb), sl->gb.size_in_bits);
 
-                    if (get_bits_left(&sl->gb) == 0) {
+                    if (   get_bits_left(&sl->gb) == 0
+                        || get_bits_left(&sl->gb) > 0 && !(h->avctx->err_recognition & AV_EF_AGGRESSIVE)) {
                         er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y,
                                      sl->mb_x - 1, sl->mb_y, ER_MB_END);
 
                         return 0;
                     } else {
                         er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y,
-                                     sl->mb_x - 1, sl->mb_y, ER_MB_END);
+                                     sl->mb_x, sl->mb_y, ER_MB_END);
 
                         return AVERROR_INVALIDDATA;
                     }
@@ -2269,7 +2537,15 @@ int ff_h264_execute_decode_slices(H264Context *h, unsigned context_count)
     H264SliceContext *sl;
     int i, j;
 
-    if (h->avctx->hwaccel)
+    av_assert0(context_count && h->slice_ctx[context_count - 1].mb_y < h->mb_height);
+
+    h->slice_ctx[0].next_slice_idx = INT_MAX;
+
+    if (h->avctx->hwaccel
+#if FF_API_CAP_VDPAU
+        || h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU
+#endif
+        )
         return 0;
     if (context_count == 1) {
         int ret;
@@ -2280,12 +2556,15 @@ int ff_h264_execute_decode_slices(H264Context *h, unsigned context_count)
         h->mb_y = h->slice_ctx[0].mb_y;
         return ret;
     } else {
+        av_assert0(context_count > 0);
         for (i = 0; i < context_count; i++) {
             int next_slice_idx = h->mb_width * h->mb_height;
             int slice_idx;
 
             sl                 = &h->slice_ctx[i];
-            sl->er.error_count = 0;
+            if (CONFIG_ERROR_RESILIENCE) {
+                sl->er.error_count = 0;
+            }
 
             /* make sure none of those slices overlap */
             slice_idx = sl->mb_y * h->mb_width + sl->mb_x;
@@ -2306,8 +2585,10 @@ int ff_h264_execute_decode_slices(H264Context *h, unsigned context_count)
         /* pull back stuff from slices to master context */
         sl                   = &h->slice_ctx[context_count - 1];
         h->mb_y              = sl->mb_y;
-        for (i = 1; i < context_count; i++)
-            h->slice_ctx[0].er.error_count += h->slice_ctx[i].er.error_count;
+        if (CONFIG_ERROR_RESILIENCE) {
+            for (i = 1; i < context_count; i++)
+                h->slice_ctx[0].er.error_count += h->slice_ctx[i].er.error_count;
+        }
     }
 
     return 0;
diff --git a/libavcodec/h264addpx_template.c b/libavcodec/h264addpx_template.c
index cdbfc67dfe..046b6c2e19 100644
--- a/libavcodec/h264addpx_template.c
+++ b/libavcodec/h264addpx_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
index d5146de7b1..c2f1f30f5a 100644
--- a/libavcodec/h264chroma.c
+++ b/libavcodec/h264chroma.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,9 +32,11 @@
     c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_ ## depth ## _c; \
     c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_ ## depth ## _c; \
     c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_ ## depth ## _c; \
+    c->put_h264_chroma_pixels_tab[3] = put_h264_chroma_mc1_ ## depth ## _c; \
     c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_ ## depth ## _c; \
     c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_ ## depth ## _c; \
     c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_ ## depth ## _c; \
+    c->avg_h264_chroma_pixels_tab[3] = avg_h264_chroma_mc1_ ## depth ## _c; \
 
 av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
 {
@@ -52,4 +54,6 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
         ff_h264chroma_init_ppc(c, bit_depth);
     if (ARCH_X86)
         ff_h264chroma_init_x86(c, bit_depth);
+    if (ARCH_MIPS)
+        ff_h264chroma_init_mips(c, bit_depth);
 }
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
index 93064fe06b..e0f45ad13b 100644
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,8 +24,8 @@
 typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
 
 typedef struct H264ChromaContext {
-    h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
-    h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
+    h264_chroma_mc_func put_h264_chroma_pixels_tab[4];
+    h264_chroma_mc_func avg_h264_chroma_pixels_tab[4];
 } H264ChromaContext;
 
 void ff_h264chroma_init(H264ChromaContext *c, int bit_depth);
@@ -34,5 +34,6 @@ void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
+void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
 
 #endif /* AVCODEC_H264CHROMA_H */
diff --git a/libavcodec/h264chroma_template.c b/libavcodec/h264chroma_template.c
index 028ed132cf..072b5e03f7 100644
--- a/libavcodec/h264chroma_template.c
+++ b/libavcodec/h264chroma_template.c
@@ -2,28 +2,62 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
+#include "libavutil/avassert.h"
 
 #include "bit_depth_template.c"
 
 #define H264_CHROMA_MC(OPNAME, OP)\
+static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
+    pixel *dst = (pixel*)_dst;\
+    pixel *src = (pixel*)_src;\
+    const int A=(8-x)*(8-y);\
+    const int B=(  x)*(8-y);\
+    const int C=(8-x)*(  y);\
+    const int D=(  x)*(  y);\
+    int i;\
+    stride >>= sizeof(pixel)-1;\
+    \
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
+\
+    if(D){\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    } else if (B + C) {\
+        const int E= B+C;\
+        const int step= C ? stride : 1;\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0] + E*src[step+0]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    } else {\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    }\
+}\
 static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
     pixel *dst = (pixel*)_dst;\
     pixel *src = (pixel*)_src;\
@@ -32,9 +66,9 @@ static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *
     const int C=(8-x)*(  y);\
     const int D=(  x)*(  y);\
     int i;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i=0; i<h; i++){\
@@ -70,9 +104,9 @@ static void FUNCC(OPNAME ## h264_chroma_mc4)(uint8_t *_dst/*align 8*/, uint8_t *
     const int C=(8-x)*(  y);\
     const int D=(  x)*(  y);\
     int i;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i=0; i<h; i++){\
@@ -114,9 +148,9 @@ static void FUNCC(OPNAME ## h264_chroma_mc8)(uint8_t *_dst/*align 8*/, uint8_t *
     const int C=(8-x)*(  y);\
     const int D=(  x)*(  y);\
     int i;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i=0; i<h; i++){\
diff --git a/libavcodec/h264data.h b/libavcodec/h264data.h
index 6b40b39f0a..95ea385deb 100644
--- a/libavcodec/h264data.h
+++ b/libavcodec/h264data.h
@@ -2,20 +2,20 @@
  * H26L/H264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -51,7 +51,7 @@ static const uint8_t golomb_to_inter_cbp[48] = {
     17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41
 };
 
-static const uint8_t zigzag_scan[16] = {
+static const uint8_t zigzag_scan[16+1] = {
     0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4,
     1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4,
     1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4,
@@ -167,4 +167,23 @@ static const PMbInfo b_sub_mb_type_info[13] = {
     { MB_TYPE_8x8   | MB_TYPE_P0L0 | MB_TYPE_P0L1 | MB_TYPE_P1L0 | MB_TYPE_P1L1, 4, },
 };
 
+static const AVRational ff_h264_pixel_aspect[17] = {
+    {   0,  1 },
+    {   1,  1 },
+    {  12, 11 },
+    {  10, 11 },
+    {  16, 11 },
+    {  40, 33 },
+    {  24, 11 },
+    {  20, 11 },
+    {  32, 11 },
+    {  80, 33 },
+    {  18, 11 },
+    {  15, 11 },
+    {  64, 33 },
+    { 160, 99 },
+    {   4,  3 },
+    {   3,  2 },
+    {   2,  1 },
+};
 #endif /* AVCODEC_H264DATA_H */
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index c8edbd08eb..bfb6f1ef6c 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,8 @@
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+
 #include "avcodec.h"
 #include "h264dsp.h"
 #include "h264idct.h"
@@ -46,6 +48,14 @@
 #include "h264dsp_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "h264dsp_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 14
+#include "h264dsp_template.c"
+#undef BIT_DEPTH
+
 #define BIT_DEPTH 8
 #include "h264addpx_template.c"
 #undef BIT_DEPTH
@@ -130,7 +140,14 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
     case 10:
         H264_DSP(10);
         break;
+    case 12:
+        H264_DSP(12);
+        break;
+    case 14:
+        H264_DSP(14);
+        break;
     default:
+        av_assert0(bit_depth<=8);
         H264_DSP(8);
         break;
     }
@@ -140,4 +157,5 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
     if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
     if (ARCH_PPC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc);
     if (ARCH_X86) ff_h264dsp_init_x86(c, bit_depth, chroma_format_idc);
+    if (ARCH_MIPS) ff_h264dsp_init_mips(c, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index c4be235519..c232193054 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -126,5 +126,7 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth,
                          const int chroma_format_idc);
 void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                          const int chroma_format_idc);
+void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth,
+                          const int chroma_format_idc);
 
 #endif /* AVCODEC_H264DSP_H */
diff --git a/libavcodec/h264dsp_template.c b/libavcodec/h264dsp_template.c
index 3d99cfcfec..9b2cc2457c 100644
--- a/libavcodec/h264dsp_template.c
+++ b/libavcodec/h264dsp_template.c
@@ -1,21 +1,21 @@
 /*
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,8 +35,8 @@ static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int heig
 { \
     int y; \
     pixel *block = (pixel*)_block; \
-    stride /= sizeof(pixel); \
-    offset <<= (log2_denom + (BIT_DEPTH-8)); \
+    stride >>= sizeof(pixel)-1; \
+    offset = (unsigned)offset << (log2_denom + (BIT_DEPTH-8)); \
     if(log2_denom) offset += 1<<(log2_denom-1); \
     for (y = 0; y < height; y++, block += stride) { \
         op_scale1(0); \
@@ -66,9 +66,9 @@ static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int s
     int y; \
     pixel *dst = (pixel*)_dst; \
     pixel *src = (pixel*)_src; \
-    stride /= sizeof(pixel); \
-    offset <<= (BIT_DEPTH-8); \
-    offset = ((offset + 1) | 1) << log2_denom; \
+    stride >>= sizeof(pixel)-1; \
+    offset = (unsigned)offset << (BIT_DEPTH-8); \
+    offset = (unsigned)((offset + 1) | 1) << log2_denom; \
     for (y = 0; y < height; y++, dst += stride, src += stride) { \
         op_scale2(0); \
         op_scale2(1); \
@@ -101,16 +101,16 @@ H264_WEIGHT(2)
 #undef op_scale2
 #undef H264_WEIGHT
 
-static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *p_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
 {
-    pixel *pix = (pixel*)_pix;
+    pixel *pix = (pixel*)p_pix;
     int i, d;
-    xstride /= sizeof(pixel);
-    ystride /= sizeof(pixel);
+    xstride >>= sizeof(pixel)-1;
+    ystride >>= sizeof(pixel)-1;
     alpha <<= BIT_DEPTH - 8;
     beta  <<= BIT_DEPTH - 8;
     for( i = 0; i < 4; i++ ) {
-        const int tc_orig = tc0[i] << (BIT_DEPTH - 8);
+        const int tc_orig = tc0[i] * (1 << (BIT_DEPTH - 8));
         if( tc_orig < 0 ) {
             pix += inner_iters*ystride;
             continue;
@@ -141,7 +141,7 @@ static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *_p
                     tc++;
                 }
 
-                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+                i_delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                 pix[-xstride] = av_clip_pixel( p0 + i_delta );    /* p0' */
                 pix[0]        = av_clip_pixel( q0 - i_delta );    /* q0' */
             }
@@ -162,12 +162,12 @@ static void FUNCC(h264_h_loop_filter_luma_mbaff)(uint8_t *pix, int stride, int a
     FUNCC(h264_loop_filter_luma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0);
 }
 
-static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma_intra)(uint8_t *p_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
 {
-    pixel *pix = (pixel*)_pix;
+    pixel *pix = (pixel*)p_pix;
     int d;
-    xstride /= sizeof(pixel);
-    ystride /= sizeof(pixel);
+    xstride >>= sizeof(pixel)-1;
+    ystride >>= sizeof(pixel)-1;
     alpha <<= BIT_DEPTH - 8;
     beta  <<= BIT_DEPTH - 8;
     for( d = 0; d < 4 * inner_iters; d++ ) {
@@ -228,16 +228,16 @@ static void FUNCC(h264_h_loop_filter_luma_mbaff_intra)(uint8_t *pix, int stride,
     FUNCC(h264_loop_filter_luma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta);
 }
 
-static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *p_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
 {
-    pixel *pix = (pixel*)_pix;
+    pixel *pix = (pixel*)p_pix;
     int i, d;
-    xstride /= sizeof(pixel);
-    ystride /= sizeof(pixel);
     alpha <<= BIT_DEPTH - 8;
     beta  <<= BIT_DEPTH - 8;
+    xstride >>= sizeof(pixel)-1;
+    ystride >>= sizeof(pixel)-1;
     for( i = 0; i < 4; i++ ) {
-        const int tc = ((tc0[i] - 1) << (BIT_DEPTH - 8)) + 1;
+        const int tc = ((tc0[i] - 1U) << (BIT_DEPTH - 8)) + 1;
         if( tc <= 0 ) {
             pix += inner_iters*ystride;
             continue;
@@ -252,7 +252,7 @@ static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *
                 FFABS( p1 - p0 ) < beta &&
                 FFABS( q1 - q0 ) < beta ) {
 
-                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+                int delta = av_clip( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
 
                 pix[-xstride] = av_clip_pixel( p0 + delta );    /* p0' */
                 pix[0]        = av_clip_pixel( q0 - delta );    /* q0' */
@@ -282,12 +282,12 @@ static void FUNCC(h264_h_loop_filter_chroma422_mbaff)(uint8_t *pix, int stride,
     FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0);
 }
 
-static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uint8_t *p_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
 {
-    pixel *pix = (pixel*)_pix;
+    pixel *pix = (pixel*)p_pix;
     int d;
-    xstride /= sizeof(pixel);
-    ystride /= sizeof(pixel);
+    xstride >>= sizeof(pixel)-1;
+    ystride >>= sizeof(pixel)-1;
     alpha <<= BIT_DEPTH - 8;
     beta  <<= BIT_DEPTH - 8;
     for( d = 0; d < 4 * inner_iters; d++ ) {
diff --git a/libavcodec/h264idct.c b/libavcodec/h264idct.c
index ea08d03b07..6a771affe1 100644
--- a/libavcodec/h264idct.c
+++ b/libavcodec/h264idct.c
@@ -2,20 +2,20 @@
  * H.264 IDCT
  * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,3 +38,11 @@
 #define BIT_DEPTH 10
 #include "h264idct_template.c"
 #undef BIT_DEPTH
+
+#define BIT_DEPTH 12
+#include "h264idct_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 14
+#include "h264idct_template.c"
+#undef BIT_DEPTH
diff --git a/libavcodec/h264idct.h b/libavcodec/h264idct.h
index 816a8251ba..17e0051497 100644
--- a/libavcodec/h264idct.h
+++ b/libavcodec/h264idct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,5 +38,7 @@ void ff_h264_chroma_dc_dequant_idct_ ## depth ## _c(int16_t *block, int qmul);
 H264_IDCT( 8)
 H264_IDCT( 9)
 H264_IDCT(10)
+H264_IDCT(12)
+H264_IDCT(14)
 
 #endif /* AVCODEC_H264IDCT_H */
diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c
index 83c2a959b2..abf888ed96 100644
--- a/libavcodec/h264idct_template.c
+++ b/libavcodec/h264idct_template.c
@@ -2,20 +2,20 @@
  * H.264 IDCT
  * Copyright (c) 2004-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,7 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
     int i;
     pixel *dst = (pixel*)_dst;
     dctcoef *block = (dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     block[0] += 1 << 5;
 
@@ -70,7 +70,7 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){
     int i;
     pixel *dst = (pixel*)_dst;
     dctcoef *block = (dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     block[0] += 32;
 
diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c
index 7c3876ad20..8f15f71f55 100644
--- a/libavcodec/h264pred.c
+++ b/libavcodec/h264pred.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "h264pred.h"
@@ -42,6 +43,14 @@
 #include "h264pred_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "h264pred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 14
+#include "h264pred_template.c"
+#undef BIT_DEPTH
+
 static void pred4x4_vertical_vp8_c(uint8_t *src, const uint8_t *topright,
                                    ptrdiff_t stride)
 {
@@ -401,7 +410,7 @@ static void pred8x8_tm_vp8_c(uint8_t *src, ptrdiff_t stride)
  */
 av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
                                const int bit_depth,
-                               const int chroma_format_idc)
+                               int chroma_format_idc)
 {
 #undef FUNC
 #undef FUNCC
@@ -552,6 +561,8 @@ av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
     h->pred4x4_add  [ HOR_PRED   ]= FUNCC(pred4x4_horizontal_add          , depth);\
     h->pred8x8l_add [VERT_PRED   ]= FUNCC(pred8x8l_vertical_add           , depth);\
     h->pred8x8l_add [ HOR_PRED   ]= FUNCC(pred8x8l_horizontal_add         , depth);\
+    h->pred8x8l_filter_add [VERT_PRED   ]= FUNCC(pred8x8l_vertical_filter_add           , depth);\
+    h->pred8x8l_filter_add [ HOR_PRED   ]= FUNCC(pred8x8l_horizontal_filter_add         , depth);\
     if (chroma_format_idc <= 1) {\
     h->pred8x8_add  [VERT_PRED8x8]= FUNCC(pred8x8_vertical_add            , depth);\
     h->pred8x8_add  [ HOR_PRED8x8]= FUNCC(pred8x8_horizontal_add          , depth);\
@@ -569,7 +580,14 @@ av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
         case 10:
             H264_PRED(10)
             break;
+        case 12:
+            H264_PRED(12)
+            break;
+        case 14:
+            H264_PRED(14)
+            break;
         default:
+            av_assert0(bit_depth<=8);
             H264_PRED(8)
             break;
     }
@@ -580,4 +598,6 @@ av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
         ff_h264_pred_init_arm(h, codec_id, bit_depth, chroma_format_idc);
     if (ARCH_X86)
         ff_h264_pred_init_x86(h, codec_id, bit_depth, chroma_format_idc);
+    if (ARCH_MIPS)
+        ff_h264_pred_init_mips(h, codec_id, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/h264pred.h b/libavcodec/h264pred.h
index 673f4cd882..091dcbbf71 100644
--- a/libavcodec/h264pred.h
+++ b/libavcodec/h264pred.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -101,6 +101,8 @@ typedef struct H264PredContext {
                           int16_t *block /*align 16*/, ptrdiff_t stride);
     void(*pred8x8l_add[2])(uint8_t *pix /*align  8*/,
                            int16_t *block /*align 16*/, ptrdiff_t stride);
+    void(*pred8x8l_filter_add[2])(uint8_t *pix /*align  8*/,
+                           int16_t *block /*align 16*/, int topleft, int topright, ptrdiff_t stride);
     void(*pred8x8_add[3])(uint8_t *pix /*align  8*/,
                           const int *block_offset,
                           int16_t *block /*align 16*/, ptrdiff_t stride);
@@ -118,5 +120,7 @@ void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
                            const int bit_depth, const int chroma_format_idc);
 void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
                            const int bit_depth, const int chroma_format_idc);
+void ff_h264_pred_init_mips(H264PredContext *h, int codec_id,
+                            const int bit_depth, const int chroma_format_idc);
 
 #endif /* AVCODEC_H264PRED_H */
diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
index 48baec8f85..f684433abe 100644
--- a/libavcodec/h264pred_template.c
+++ b/libavcodec/h264pred_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,7 @@ static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright,
                                     ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a= AV_RN4PA(src-stride);
 
     AV_WN4PA(src+0*stride, a);
@@ -48,7 +48,7 @@ static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright,
                                       ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride]));
     AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride]));
     AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride]));
@@ -59,7 +59,7 @@ static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright,
                               ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                    + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
     const pixel4 a = PIXEL_SPLAT_X4(dc);
@@ -74,7 +74,7 @@ static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright,
                                    ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
     const pixel4 a = PIXEL_SPLAT_X4(dc);
 
@@ -88,7 +88,7 @@ static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
     const pixel4 a = PIXEL_SPLAT_X4(dc);
 
@@ -102,7 +102,7 @@ static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
 
     AV_WN4PA(src+0*stride, a);
@@ -115,7 +115,7 @@ static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
 
     AV_WN4PA(src+0*stride, a);
@@ -128,7 +128,7 @@ static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
 
     AV_WN4PA(src+0*stride, a);
@@ -166,7 +166,7 @@ static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright,
                                       ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
@@ -194,7 +194,7 @@ static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright,
 {
     pixel *src = (pixel*)_src;
     const pixel *topright = (const pixel*)_topright;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
 //    LOAD_LEFT_EDGE
@@ -222,7 +222,7 @@ static void FUNCC(pred4x4_vertical_right)(uint8_t *_src,
                                           ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
@@ -251,7 +251,7 @@ static void FUNCC(pred4x4_vertical_left)(uint8_t *_src,
 {
     pixel *src = (pixel*)_src;
     const pixel *topright = (const pixel*)_topright;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
 
@@ -277,7 +277,7 @@ static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright,
                                          ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     LOAD_LEFT_EDGE
 
     src[0+0*stride]=(l0 + l1 + 1)>>1;
@@ -303,7 +303,7 @@ static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src,
                                            ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
@@ -330,7 +330,7 @@ static void FUNCC(pred16x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
 {
     int i;
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0);
     const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1);
     const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2);
@@ -348,7 +348,7 @@ static void FUNCC(pred16x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
 {
     int i;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0; i<16; i++){
         const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
@@ -374,7 +374,7 @@ static void FUNCC(pred16x16_dc)(uint8_t *_src, ptrdiff_t stride)
     int i, dc=0;
     pixel *src = (pixel*)_src;
     pixel4 dcsplat;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<16; i++){
         dc+= src[-1+i*stride];
@@ -393,7 +393,7 @@ static void FUNCC(pred16x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
     int i, dc=0;
     pixel *src = (pixel*)_src;
     pixel4 dcsplat;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<16; i++){
         dc+= src[-1+i*stride];
@@ -408,7 +408,7 @@ static void FUNCC(pred16x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
     int i, dc=0;
     pixel *src = (pixel*)_src;
     pixel4 dcsplat;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<16; i++){
         dc+= src[i-stride];
@@ -423,7 +423,7 @@ static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
 {\
     int i;\
     pixel *src = (pixel*)_src;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
 }
 
@@ -440,7 +440,7 @@ static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src,
   int a;
   INIT_CLIP
   pixel *src = (pixel*)_src;
-  int stride = _stride/sizeof(pixel);
+  int stride = _stride>>(sizeof(pixel)-1);
   const pixel * const src0 = src +7-stride;
   const pixel *       src1 = src +8*stride-1;
   const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
@@ -489,7 +489,7 @@ static void FUNCC(pred8x8_vertical)(uint8_t *_src, ptrdiff_t _stride)
 {
     int i;
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
     const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
 
@@ -517,7 +517,7 @@ static void FUNCC(pred8x8_horizontal)(uint8_t *_src, ptrdiff_t stride)
 {
     int i;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0; i<8; i++){
         const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
@@ -544,7 +544,7 @@ static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
     int i;\
     const pixel4 a = PIXEL_SPLAT_X4(v);\
     pixel *src = (pixel*)_src;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     for(i=0; i<8; i++){\
         AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
         AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
@@ -567,7 +567,7 @@ static void FUNCC(pred8x8_left_dc)(uint8_t *_src, ptrdiff_t stride)
     int dc0, dc2;
     pixel4 dc0splat, dc2splat;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     dc0=dc2=0;
     for(i=0;i<4; i++){
@@ -599,7 +599,7 @@ static void FUNCC(pred8x8_top_dc)(uint8_t *_src, ptrdiff_t stride)
     int dc0, dc1;
     pixel4 dc0splat, dc1splat;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     dc0=dc1=0;
     for(i=0;i<4; i++){
@@ -647,7 +647,7 @@ static void FUNCC(pred8x8_dc)(uint8_t *_src, ptrdiff_t stride)
     int dc0, dc1, dc2;
     pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     dc0=dc1=dc2=0;
     for(i=0;i<4; i++){
@@ -713,6 +713,7 @@ static void FUNCC(pred8x16_dc)(uint8_t *_src, ptrdiff_t stride)
     }
 }
 
+//the following 4 function should not be optimized!
 static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
 {
     FUNCC(pred8x8_top_dc)(src, stride);
@@ -771,7 +772,7 @@ static void FUNCC(pred8x8_plane)(uint8_t *_src, ptrdiff_t _stride)
   int a;
   INIT_CLIP
   pixel *src = (pixel*)_src;
-  int stride = _stride/sizeof(pixel);
+  int stride = _stride>>(sizeof(pixel)-1);
   const pixel * const src0 = src +3-stride;
   const pixel *       src1 = src +4*stride-1;
   const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
@@ -885,7 +886,7 @@ static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft,
                                    int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
 
     PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
 }
@@ -893,7 +894,7 @@ static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft,
                                     int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
 
     PREDICT_8x8_LOAD_LEFT;
     const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
@@ -903,7 +904,7 @@ static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft,
                                    int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
 
     PREDICT_8x8_LOAD_TOP;
     const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
@@ -913,7 +914,7 @@ static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft,
                                int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
 
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOP;
@@ -925,7 +926,7 @@ static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft,
                                        int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     pixel4 a;
 
     PREDICT_8x8_LOAD_LEFT;
@@ -940,7 +941,7 @@ static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft,
 {
     int y;
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     pixel4 a, b;
 
     PREDICT_8x8_LOAD_TOP;
@@ -963,7 +964,7 @@ static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft,
                                       int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_TOPRIGHT;
     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
@@ -986,7 +987,7 @@ static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft,
                                        int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOPLEFT;
@@ -1010,7 +1011,7 @@ static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft,
                                            int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOPLEFT;
@@ -1041,7 +1042,7 @@ static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft,
                                             int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOPLEFT;
@@ -1072,7 +1073,7 @@ static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft,
                                           int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_TOPRIGHT;
     SRC(0,0)= (t0 + t1 + 1) >> 1;
@@ -1102,7 +1103,7 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
                                           int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_LEFT;
     SRC(0,0)= (l0 + l1 + 1) >> 1;
     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
@@ -1123,6 +1124,79 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
 }
+
+static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
+                                     int has_topright, ptrdiff_t _stride)
+{
+    int i;
+    pixel *src = (pixel*)_src;
+    const dctcoef *block = (const dctcoef*)_block;
+    pixel pix[8];
+    int stride = _stride>>(sizeof(pixel)-1);
+    PREDICT_8x8_LOAD_TOP;
+
+    pix[0] = t0;
+    pix[1] = t1;
+    pix[2] = t2;
+    pix[3] = t3;
+    pix[4] = t4;
+    pix[5] = t5;
+    pix[6] = t6;
+    pix[7] = t7;
+
+    for(i=0; i<8; i++){
+        pixel v = pix[i];
+        src[0*stride]= v += block[0];
+        src[1*stride]= v += block[8];
+        src[2*stride]= v += block[16];
+        src[3*stride]= v += block[24];
+        src[4*stride]= v += block[32];
+        src[5*stride]= v += block[40];
+        src[6*stride]= v += block[48];
+        src[7*stride]= v +  block[56];
+        src++;
+        block++;
+    }
+
+    memset(_block, 0, sizeof(dctcoef) * 64);
+}
+
+static void FUNCC(pred8x8l_horizontal_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
+                               int has_topright, ptrdiff_t _stride)
+{
+    int i;
+    pixel *src = (pixel*)_src;
+    const dctcoef *block = (const dctcoef*)_block;
+    pixel pix[8];
+    int stride = _stride>>(sizeof(pixel)-1);
+    PREDICT_8x8_LOAD_LEFT;
+
+    pix[0] = l0;
+    pix[1] = l1;
+    pix[2] = l2;
+    pix[3] = l3;
+    pix[4] = l4;
+    pix[5] = l5;
+    pix[6] = l6;
+    pix[7] = l7;
+
+    for(i=0; i<8; i++){
+        pixel v = pix[i];
+        src[0]= v += block[0];
+        src[1]= v += block[1];
+        src[2]= v += block[2];
+        src[3]= v += block[3];
+        src[4]= v += block[4];
+        src[5]= v += block[5];
+        src[6]= v += block[6];
+        src[7]= v +  block[7];
+        src+= stride;
+        block+= 8;
+    }
+
+    memset(_block, 0, sizeof(dctcoef) * 64);
+}
+
 #undef PREDICT_8x8_LOAD_LEFT
 #undef PREDICT_8x8_LOAD_TOP
 #undef PREDICT_8x8_LOAD_TOPLEFT
@@ -1139,7 +1213,7 @@ static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
     int i;
     pixel *pix = (pixel*)_pix;
     const dctcoef *block = (const dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
     pix -= stride;
     for(i=0; i<4; i++){
         pixel v = pix[0];
@@ -1160,7 +1234,7 @@ static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
     int i;
     pixel *pix = (pixel*)_pix;
     const dctcoef *block = (const dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
     for(i=0; i<4; i++){
         pixel v = pix[-1];
         pix[0]= v += block[0];
@@ -1180,7 +1254,7 @@ static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
     int i;
     pixel *pix = (pixel*)_pix;
     const dctcoef *block = (const dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
     pix -= stride;
     for(i=0; i<8; i++){
         pixel v = pix[0];
@@ -1205,7 +1279,7 @@ static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
     int i;
     pixel *pix = (pixel*)_pix;
     const dctcoef *block = (const dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
     for(i=0; i<8; i++){
         pixel v = pix[-1];
         pix[0]= v += block[0];
diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c
index ec46da244f..50e82e23b0 100644
--- a/libavcodec/h264qpel.c
+++ b/libavcodec/h264qpel.c
@@ -2,26 +2,27 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "h264qpel.h"
 
+#define pixeltmp int16_t
 #define BIT_DEPTH 8
 #include "h264qpel_template.c"
 #undef BIT_DEPTH
@@ -33,6 +34,17 @@
 #define BIT_DEPTH 10
 #include "h264qpel_template.c"
 #undef BIT_DEPTH
+#undef pixeltmp
+
+#define pixeltmp int32_t
+#define BIT_DEPTH 12
+#include "h264qpel_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 14
+#include "h264qpel_template.c"
+#undef BIT_DEPTH
+
 
 av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
 {
@@ -76,6 +88,12 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
     case 10:
         SET_QPEL(10);
         break;
+    case 12:
+        SET_QPEL(12);
+        break;
+    case 14:
+        SET_QPEL(14);
+        break;
     }
 
     if (ARCH_AARCH64)
@@ -86,4 +104,6 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
         ff_h264qpel_init_ppc(c, bit_depth);
     if (ARCH_X86)
         ff_h264qpel_init_x86(c, bit_depth);
+    if (ARCH_MIPS)
+        ff_h264qpel_init_mips(c, bit_depth);
 }
diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h
index 97ce195d6a..7c57ad001c 100644
--- a/libavcodec/h264qpel.h
+++ b/libavcodec/h264qpel.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,5 +35,6 @@ void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth);
+void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth);
 
 #endif /* AVCODEC_H264QPEL_H */
diff --git a/libavcodec/h264qpel_template.c b/libavcodec/h264qpel_template.c
index e846ac9e91..27c5b8f17f 100644
--- a/libavcodec/h264qpel_template.c
+++ b/libavcodec/h264qpel_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -75,14 +75,14 @@ static inline void FUNC(copy_block16)(uint8_t *dst, const uint8_t *src, int dstS
 }
 
 #define H264_LOWPASS(OPNAME, OP, OP2) \
-static av_unused void FUNC(OPNAME ## h264_qpel2_h_lowpass)(uint8_t *_dst, const uint8_t *_src, int dstStride, int srcStride){\
+static av_unused void FUNC(OPNAME ## h264_qpel2_h_lowpass)(uint8_t *p_dst, const uint8_t *p_src, int dstStride, int srcStride){\
     const int h=2;\
     INIT_CLIP\
     int i;\
-    pixel *dst = (pixel*)_dst;\
-    const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    pixel *dst = (pixel*)p_dst;\
+    const pixel *src = (const pixel*)p_src;\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<h; i++)\
     {\
         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
@@ -98,8 +98,8 @@ static av_unused void FUNC(OPNAME ## h264_qpel2_v_lowpass)(uint8_t *_dst, const
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<w; i++)\
     {\
         const int srcB= src[-2*srcStride];\
@@ -116,16 +116,16 @@ static av_unused void FUNC(OPNAME ## h264_qpel2_v_lowpass)(uint8_t *_dst, const
     }\
 }\
 \
-static av_unused void FUNC(OPNAME ## h264_qpel2_hv_lowpass)(uint8_t *_dst, int16_t *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
+static av_unused void FUNC(OPNAME ## h264_qpel2_hv_lowpass)(uint8_t *_dst, pixeltmp *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
     const int h=2;\
     const int w=2;\
-    const int pad = (BIT_DEPTH > 9) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
+    const int pad = (BIT_DEPTH == 10) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
     INIT_CLIP\
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     src -= 2*srcStride;\
     for(i=0; i<h+5; i++)\
     {\
@@ -156,8 +156,8 @@ static void FUNC(OPNAME ## h264_qpel4_h_lowpass)(uint8_t *_dst, const uint8_t *_
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<h; i++)\
     {\
         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
@@ -175,8 +175,8 @@ static void FUNC(OPNAME ## h264_qpel4_v_lowpass)(uint8_t *_dst, const uint8_t *_
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<w; i++)\
     {\
         const int srcB= src[-2*srcStride];\
@@ -197,16 +197,16 @@ static void FUNC(OPNAME ## h264_qpel4_v_lowpass)(uint8_t *_dst, const uint8_t *_
     }\
 }\
 \
-static void FUNC(OPNAME ## h264_qpel4_hv_lowpass)(uint8_t *_dst, int16_t *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
+static void FUNC(OPNAME ## h264_qpel4_hv_lowpass)(uint8_t *_dst, pixeltmp *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
     const int h=4;\
     const int w=4;\
-    const int pad = (BIT_DEPTH > 9) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
+    const int pad = (BIT_DEPTH == 10) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
     INIT_CLIP\
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     src -= 2*srcStride;\
     for(i=0; i<h+5; i++)\
     {\
@@ -244,8 +244,8 @@ static void FUNC(OPNAME ## h264_qpel8_h_lowpass)(uint8_t *_dst, const uint8_t *_
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<h; i++)\
     {\
         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
@@ -267,8 +267,8 @@ static void FUNC(OPNAME ## h264_qpel8_v_lowpass)(uint8_t *_dst, const uint8_t *_
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<w; i++)\
     {\
         const int srcB= src[-2*srcStride];\
@@ -297,16 +297,16 @@ static void FUNC(OPNAME ## h264_qpel8_v_lowpass)(uint8_t *_dst, const uint8_t *_
     }\
 }\
 \
-static void FUNC(OPNAME ## h264_qpel8_hv_lowpass)(uint8_t *_dst, int16_t *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
+static void FUNC(OPNAME ## h264_qpel8_hv_lowpass)(uint8_t *_dst, pixeltmp *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
     const int h=8;\
     const int w=8;\
-    const int pad = (BIT_DEPTH > 9) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
+    const int pad = (BIT_DEPTH == 10) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
     INIT_CLIP\
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     src -= 2*srcStride;\
     for(i=0; i<h+5; i++)\
     {\
@@ -368,7 +368,7 @@ static void FUNC(OPNAME ## h264_qpel16_h_lowpass)(uint8_t *dst, const uint8_t *s
     FUNC(OPNAME ## h264_qpel8_h_lowpass)(dst+8*sizeof(pixel), src+8*sizeof(pixel), dstStride, srcStride);\
 }\
 \
-static void FUNC(OPNAME ## h264_qpel16_hv_lowpass)(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+static void FUNC(OPNAME ## h264_qpel16_hv_lowpass)(uint8_t *dst, pixeltmp *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
     FUNC(OPNAME ## h264_qpel8_hv_lowpass)(dst                , tmp  , src                , dstStride, tmpStride, srcStride);\
     FUNC(OPNAME ## h264_qpel8_hv_lowpass)(dst+8*sizeof(pixel), tmp+8, src+8*sizeof(pixel), dstStride, tmpStride, srcStride);\
     src += 8*srcStride;\
@@ -480,13 +480,13 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc33)(uint8_t *dst, const uint
 \
 static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc22)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     FUNC(OPNAME ## h264_qpel ## SIZE ## _hv_lowpass)(dst, tmp, src, stride, SIZE*sizeof(pixel), stride);\
 }\
 \
 static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc21)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
     uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
     FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src, SIZE*sizeof(pixel), stride);\
@@ -496,7 +496,7 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc21)(uint8_t *dst, const uint
 \
 static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc23)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
     uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
     FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src + stride, SIZE*sizeof(pixel), stride);\
@@ -508,7 +508,7 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc12)(uint8_t *dst, const uint
 {\
     uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
     uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
     FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel),  stride, SIZE + 5);\
@@ -521,7 +521,7 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc32)(uint8_t *dst, const uint
 {\
     uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
     uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
     FUNC(copy_block ## SIZE )(full, src - stride*2 + sizeof(pixel), SIZE*sizeof(pixel),  stride, SIZE + 5);\
diff --git a/libavcodec/hap.c b/libavcodec/hap.c
index 770142cd42..5b3af5e1d0 100644
--- a/libavcodec/hap.c
+++ b/libavcodec/hap.c
@@ -2,20 +2,20 @@
  * Vidvox Hap utility functions
  * Copyright (C) 2015 Tom Butterworth <bangnoise@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hap.h b/libavcodec/hap.h
index 9d847f7b86..e4762ee438 100644
--- a/libavcodec/hap.h
+++ b/libavcodec/hap.h
@@ -3,20 +3,20 @@
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  * Copyright (C) 2015 Tom Butterworth <bangnoise@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hapdec.c b/libavcodec/hapdec.c
index 8f5365b269..6adac21bbb 100644
--- a/libavcodec/hapdec.c
+++ b/libavcodec/hapdec.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  * Copyright (C) 2015 Tom Butterworth <bangnoise@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -61,7 +61,7 @@ static int parse_section_header(GetByteContext *gbc, int *section_size,
         *section_size = bytestream2_get_le32(gbc);
     }
 
-    if (*section_size > bytestream2_get_bytes_left(gbc))
+    if (*section_size > bytestream2_get_bytes_left(gbc) || *section_size < 0)
         return AVERROR_INVALIDDATA;
     else
         return 0;
@@ -309,6 +309,7 @@ static int hap_decode(AVCodecContext *avctx, void *data,
     HapContext *ctx = avctx->priv_data;
     ThreadFrame tframe;
     int ret, i;
+    int tex_size;
 
     bytestream2_init(&ctx->gbc, avpkt->data, avpkt->size);
 
@@ -322,12 +323,14 @@ static int hap_decode(AVCodecContext *avctx, void *data,
     ret = ff_thread_get_buffer(avctx, &tframe, 0);
     if (ret < 0)
         return ret;
-    ff_thread_finish_setup(avctx);
+    if (avctx->codec->update_thread_context)
+        ff_thread_finish_setup(avctx);
 
     /* Unpack the DXT texture */
     if (hap_can_use_tex_in_place(ctx)) {
         /* Only DXTC texture compression in a contiguous block */
         ctx->tex_data = ctx->gbc.buffer;
+        tex_size = bytestream2_get_bytes_left(&ctx->gbc);
     } else {
         /* Perform the second-stage decompression */
         ret = av_reallocp(&ctx->tex_buf, ctx->tex_size);
@@ -343,6 +346,14 @@ static int hap_decode(AVCodecContext *avctx, void *data,
         }
 
         ctx->tex_data = ctx->tex_buf;
+        tex_size = ctx->tex_size;
+    }
+
+    if (tex_size < (avctx->coded_width  / TEXTURE_BLOCK_W)
+                  *(avctx->coded_height / TEXTURE_BLOCK_H)
+                  *ctx->tex_rat) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient data\n");
+        return AVERROR_INVALIDDATA;
     }
 
     /* Use the decompress function on the texture, one block per thread */
diff --git a/libavcodec/hapenc.c b/libavcodec/hapenc.c
index 4a31447c9e..7daadce254 100644
--- a/libavcodec/hapenc.c
+++ b/libavcodec/hapenc.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  * Copyright (C) 2015 Tom Butterworth <bangnoise@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -196,7 +196,7 @@ static int hap_encode(AVCodecContext *avctx, AVPacket *pkt,
     int pktsize = FFMAX(ctx->tex_size, ctx->max_snappy * ctx->chunk_count) + header_length;
 
     /* Allocate maximum size packet, shrink later. */
-    ret = ff_alloc_packet(pkt, pktsize);
+    ret = ff_alloc_packet2(avctx, pkt, pktsize, header_length);
     if (ret < 0)
         return ret;
 
diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index a802e1e4ac..02869a078c 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -1,28 +1,29 @@
 /*
- * HEVC video decoder
+ * HEVC video Decoder
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  * Copyright (C) 2012 - 2013 Mickael Raulet
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  * Copyright (C) 2012 - 2013 Wassim Hamidouche
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/atomic.h"
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
 #include "libavutil/display.h"
@@ -38,67 +39,7 @@
 #include "golomb.h"
 #include "hevc.h"
 
-const uint8_t ff_hevc_qpel_extra_before[4] = { 0, 3, 3, 2 };
-const uint8_t ff_hevc_qpel_extra_after[4]  = { 0, 3, 4, 4 };
-const uint8_t ff_hevc_qpel_extra[4]        = { 0, 6, 7, 6 };
-
-static const uint8_t scan_1x1[1] = { 0 };
-
-static const uint8_t horiz_scan2x2_x[4] = { 0, 1, 0, 1 };
-
-static const uint8_t horiz_scan2x2_y[4] = { 0, 0, 1, 1 };
-
-static const uint8_t horiz_scan4x4_x[16] = {
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-};
-
-static const uint8_t horiz_scan4x4_y[16] = {
-    0, 0, 0, 0,
-    1, 1, 1, 1,
-    2, 2, 2, 2,
-    3, 3, 3, 3,
-};
-
-static const uint8_t horiz_scan8x8_inv[8][8] = {
-    {  0,  1,  2,  3, 16, 17, 18, 19, },
-    {  4,  5,  6,  7, 20, 21, 22, 23, },
-    {  8,  9, 10, 11, 24, 25, 26, 27, },
-    { 12, 13, 14, 15, 28, 29, 30, 31, },
-    { 32, 33, 34, 35, 48, 49, 50, 51, },
-    { 36, 37, 38, 39, 52, 53, 54, 55, },
-    { 40, 41, 42, 43, 56, 57, 58, 59, },
-    { 44, 45, 46, 47, 60, 61, 62, 63, },
-};
-
-static const uint8_t diag_scan2x2_x[4] = { 0, 0, 1, 1 };
-
-static const uint8_t diag_scan2x2_y[4] = { 0, 1, 0, 1 };
-
-static const uint8_t diag_scan2x2_inv[2][2] = {
-    { 0, 2, },
-    { 1, 3, },
-};
-
-static const uint8_t diag_scan4x4_inv[4][4] = {
-    { 0,  2,  5,  9, },
-    { 1,  4,  8, 12, },
-    { 3,  7, 11, 14, },
-    { 6, 10, 13, 15, },
-};
-
-static const uint8_t diag_scan8x8_inv[8][8] = {
-    {  0,  2,  5,  9, 14, 20, 27, 35, },
-    {  1,  4,  8, 13, 19, 26, 34, 42, },
-    {  3,  7, 12, 18, 25, 33, 41, 48, },
-    {  6, 11, 17, 24, 32, 40, 47, 53, },
-    { 10, 16, 23, 31, 39, 46, 52, 57, },
-    { 15, 22, 30, 38, 45, 51, 56, 60, },
-    { 21, 29, 37, 44, 50, 55, 59, 62, },
-    { 28, 36, 43, 49, 54, 58, 61, 63, },
-};
+const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
 
 /**
  * NOTE: Each function hls_foo correspond to the function foo in the
@@ -129,6 +70,10 @@ static void pic_arrays_free(HEVCContext *s)
     av_freep(&s->horizontal_bs);
     av_freep(&s->vertical_bs);
 
+    av_freep(&s->sh.entry_point_offset);
+    av_freep(&s->sh.size);
+    av_freep(&s->sh.offset);
+
     av_buffer_pool_uninit(&s->tab_mvf_pool);
     av_buffer_pool_uninit(&s->rpl_tab_pool);
 }
@@ -144,40 +89,40 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
     int ctb_count        = sps->ctb_width * sps->ctb_height;
     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
 
-    s->bs_width  = width  >> 3;
-    s->bs_height = height >> 3;
+    s->bs_width  = (width  >> 2) + 1;
+    s->bs_height = (height >> 2) + 1;
 
     s->sao           = av_mallocz_array(ctb_count, sizeof(*s->sao));
     s->deblock       = av_mallocz_array(ctb_count, sizeof(*s->deblock));
     if (!s->sao || !s->deblock)
         goto fail;
 
-    s->skip_flag    = av_malloc(pic_size_in_ctb);
-    s->tab_ct_depth = av_malloc(sps->min_cb_height * sps->min_cb_width);
+    s->skip_flag    = av_malloc_array(sps->min_cb_height, sps->min_cb_width);
+    s->tab_ct_depth = av_malloc_array(sps->min_cb_height, sps->min_cb_width);
     if (!s->skip_flag || !s->tab_ct_depth)
         goto fail;
 
-    s->cbf_luma = av_malloc(sps->min_tb_width * sps->min_tb_height);
+    s->cbf_luma = av_malloc_array(sps->min_tb_width, sps->min_tb_height);
     s->tab_ipm  = av_mallocz(min_pu_size);
-    s->is_pcm   = av_malloc(min_pu_size);
+    s->is_pcm   = av_malloc_array(sps->min_pu_width + 1, sps->min_pu_height + 1);
     if (!s->tab_ipm || !s->cbf_luma || !s->is_pcm)
         goto fail;
 
-    s->filter_slice_edges = av_malloc(ctb_count);
-    s->tab_slice_address  = av_malloc(pic_size_in_ctb *
+    s->filter_slice_edges = av_mallocz(ctb_count);
+    s->tab_slice_address  = av_malloc_array(pic_size_in_ctb,
                                       sizeof(*s->tab_slice_address));
-    s->qp_y_tab           = av_malloc(pic_size_in_ctb *
+    s->qp_y_tab           = av_malloc_array(pic_size_in_ctb,
                                       sizeof(*s->qp_y_tab));
     if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
         goto fail;
 
-    s->horizontal_bs = av_mallocz(2 * s->bs_width * (s->bs_height + 1));
-    s->vertical_bs   = av_mallocz(2 * s->bs_width * (s->bs_height + 1));
+    s->horizontal_bs = av_mallocz_array(s->bs_width, s->bs_height);
+    s->vertical_bs   = av_mallocz_array(s->bs_width, s->bs_height);
     if (!s->horizontal_bs || !s->vertical_bs)
         goto fail;
 
     s->tab_mvf_pool = av_buffer_pool_init(min_pu_size * sizeof(MvField),
-                                          av_buffer_alloc);
+                                          av_buffer_allocz);
     s->rpl_tab_pool = av_buffer_pool_init(ctb_count * sizeof(RefPicListTab),
                                           av_buffer_allocz);
     if (!s->tab_mvf_pool || !s->rpl_tab_pool)
@@ -198,11 +143,15 @@ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
     uint8_t chroma_weight_l0_flag[16];
     uint8_t luma_weight_l1_flag[16];
     uint8_t chroma_weight_l1_flag[16];
+    int luma_log2_weight_denom;
 
-    s->sh.luma_log2_weight_denom = av_clip(get_ue_golomb_long(gb), 0, 7);
+    luma_log2_weight_denom = get_ue_golomb_long(gb);
+    if (luma_log2_weight_denom < 0 || luma_log2_weight_denom > 7)
+        av_log(s->avctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is invalid\n", luma_log2_weight_denom);
+    s->sh.luma_log2_weight_denom = av_clip_uintp2(luma_log2_weight_denom, 3);
     if (s->ps.sps->chroma_format_idc != 0) {
         int delta = get_se_golomb(gb);
-        s->sh.chroma_log2_weight_denom = av_clip(s->sh.luma_log2_weight_denom + delta, 0, 7);
+        s->sh.chroma_log2_weight_denom = av_clip_uintp2(s->sh.luma_log2_weight_denom + delta, 3);
     }
 
     for (i = 0; i < s->sh.nb_refs[L0]; i++) {
@@ -212,7 +161,7 @@ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
             s->sh.luma_offset_l0[i] = 0;
         }
     }
-    if (s->ps.sps->chroma_format_idc != 0) { // FIXME: invert "if" and "for"
+    if (s->ps.sps->chroma_format_idc != 0) {
         for (i = 0; i < s->sh.nb_refs[L0]; i++)
             chroma_weight_l0_flag[i] = get_bits1(gb);
     } else {
@@ -295,7 +244,7 @@ static int decode_lt_rps(HEVCContext *s, LongTermRPS *rps, GetBitContext *gb)
         nb_sps = get_ue_golomb_long(gb);
     nb_sh = get_ue_golomb_long(gb);
 
-    if (nb_sh + nb_sps > FF_ARRAY_ELEMS(rps->poc))
+    if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc))
         return AVERROR_INVALIDDATA;
 
     rps->nb_refs = nb_sh + nb_sps;
@@ -377,11 +326,11 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps,
                   num, den, 1 << 30);
 }
 
-static int set_sps(HEVCContext *s, const HEVCSPS *sps)
+static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt)
 {
-    #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL)
+    #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL)
     enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
-    int ret;
+    int ret, i;
 
     pic_arrays_free(s);
     s->ps.sps = NULL;
@@ -403,26 +352,50 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps)
 #if CONFIG_HEVC_D3D11VA_HWACCEL
         *fmt++ = AV_PIX_FMT_D3D11VA_VLD;
 #endif
+#if CONFIG_HEVC_VAAPI_HWACCEL
+        *fmt++ = AV_PIX_FMT_VAAPI;
+#endif
+#if CONFIG_HEVC_VDPAU_HWACCEL
+        *fmt++ = AV_PIX_FMT_VDPAU;
+#endif
     }
 
-    *fmt++ = sps->pix_fmt;
-    *fmt = AV_PIX_FMT_NONE;
+    if (pix_fmt == AV_PIX_FMT_NONE) {
+        *fmt++ = sps->pix_fmt;
+        *fmt = AV_PIX_FMT_NONE;
 
-    ret = ff_get_format(s->avctx, pix_fmts);
-    if (ret < 0)
-        goto fail;
-    s->avctx->pix_fmt = ret;
+        ret = ff_thread_get_format(s->avctx, pix_fmts);
+        if (ret < 0)
+            goto fail;
+        s->avctx->pix_fmt = ret;
+    }
+    else {
+        s->avctx->pix_fmt = pix_fmt;
+    }
 
     ff_hevc_pred_init(&s->hpc,     sps->bit_depth);
     ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth);
     ff_videodsp_init (&s->vdsp,    sps->bit_depth);
 
+    for (i = 0; i < 3; i++) {
+        av_freep(&s->sao_pixel_buffer_h[i]);
+        av_freep(&s->sao_pixel_buffer_v[i]);
+    }
+
     if (sps->sao_enabled && !s->avctx->hwaccel) {
-        av_frame_unref(s->tmp_frame);
-        ret = ff_get_buffer(s->avctx, s->tmp_frame, AV_GET_BUFFER_FLAG_REF);
-        if (ret < 0)
-            goto fail;
-        s->frame = s->tmp_frame;
+        int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
+        int c_idx;
+
+        for(c_idx = 0; c_idx < c_count; c_idx++) {
+            int w = sps->width >> sps->hshift[c_idx];
+            int h = sps->height >> sps->vshift[c_idx];
+            s->sao_pixel_buffer_h[c_idx] =
+                av_malloc((w * 2 * sps->ctb_height) <<
+                          sps->pixel_shift);
+            s->sao_pixel_buffer_v[c_idx] =
+                av_malloc((h * 2 * sps->ctb_width) <<
+                          sps->pixel_shift);
+        }
     }
 
     s->ps.sps = sps;
@@ -438,7 +411,7 @@ fail:
 
 static int hls_slice_header(HEVCContext *s)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
     SliceHeader *sh   = &s->sh;
     int i, ret;
 
@@ -450,6 +423,7 @@ static int hls_slice_header(HEVCContext *s)
         if (IS_IDR(s))
             ff_hevc_clear_refs(s);
     }
+    sh->no_output_of_prior_pics_flag = 0;
     if (IS_IRAP(s))
         sh->no_output_of_prior_pics_flag = get_bits1(gb);
 
@@ -464,12 +438,20 @@ static int hls_slice_header(HEVCContext *s)
         return AVERROR_INVALIDDATA;
     }
     s->ps.pps = (HEVCPPS*)s->ps.pps_list[sh->pps_id]->data;
+    if (s->nal_unit_type == NAL_CRA_NUT && s->last_eos == 1)
+        sh->no_output_of_prior_pics_flag = 1;
 
     if (s->ps.sps != (HEVCSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
+        const HEVCSPS* last_sps = s->ps.sps;
         s->ps.sps = (HEVCSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
-
+        if (last_sps && IS_IRAP(s) && s->nal_unit_type != NAL_CRA_NUT) {
+            if (s->ps.sps->width !=  last_sps->width || s->ps.sps->height != last_sps->height ||
+                s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering !=
+                last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering)
+                sh->no_output_of_prior_pics_flag = 0;
+        }
         ff_hevc_clear_refs(s);
-        ret = set_sps(s, s->ps.sps);
+        ret = set_sps(s, s->ps.sps, AV_PIX_FMT_NONE);
         if (ret < 0)
             return ret;
 
@@ -532,7 +514,7 @@ static int hls_slice_header(HEVCContext *s)
             sh->colour_plane_id = get_bits(gb, 2);
 
         if (!IS_IDR(s)) {
-            int poc;
+            int poc, pos;
 
             sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb);
             poc = ff_hevc_compute_poc(s, sh->pic_order_cnt_lsb);
@@ -546,13 +528,12 @@ static int hls_slice_header(HEVCContext *s)
             s->poc = poc;
 
             sh->short_term_ref_pic_set_sps_flag = get_bits1(gb);
+            pos = get_bits_left(gb);
             if (!sh->short_term_ref_pic_set_sps_flag) {
-                int pos = get_bits_left(gb);
                 ret = ff_hevc_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1);
                 if (ret < 0)
                     return ret;
 
-                sh->short_term_ref_pic_set_size = pos - get_bits_left(gb);
                 sh->short_term_rps = &sh->slice_rps;
             } else {
                 int numbits, rps_idx;
@@ -566,13 +547,16 @@ static int hls_slice_header(HEVCContext *s)
                 rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0;
                 sh->short_term_rps = &s->ps.sps->st_rps[rps_idx];
             }
+            sh->short_term_ref_pic_set_size = pos - get_bits_left(gb);
 
+            pos = get_bits_left(gb);
             ret = decode_lt_rps(s, &sh->long_term_rps, gb);
             if (ret < 0) {
                 av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n");
                 if (s->avctx->err_recognition & AV_EF_EXPLODE)
                     return AVERROR_INVALIDDATA;
             }
+            sh->long_term_ref_pic_set_size = pos - get_bits_left(gb);
 
             if (s->ps.sps->sps_temporal_mvp_enabled_flag)
                 sh->slice_temporal_mvp_enabled_flag = get_bits1(gb);
@@ -596,8 +580,10 @@ static int hls_slice_header(HEVCContext *s)
 
         if (s->ps.sps->sao_enabled) {
             sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
-            sh->slice_sample_adaptive_offset_flag[1] =
-            sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
+            if (s->ps.sps->chroma_format_idc) {
+                sh->slice_sample_adaptive_offset_flag[1] =
+                sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
+            }
         } else {
             sh->slice_sample_adaptive_offset_flag[0] = 0;
             sh->slice_sample_adaptive_offset_flag[1] = 0;
@@ -695,6 +681,11 @@ static int hls_slice_header(HEVCContext *s)
             sh->slice_cr_qp_offset = 0;
         }
 
+        if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
+            sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
+        else
+            sh->cu_chroma_qp_offset_enabled_flag = 0;
+
         if (s->ps.pps->deblocking_filter_control_present_flag) {
             int deblocking_filter_override_flag = 0;
 
@@ -733,23 +724,59 @@ static int hls_slice_header(HEVCContext *s)
 
     sh->num_entry_point_offsets = 0;
     if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
-        sh->num_entry_point_offsets = get_ue_golomb_long(gb);
+        unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
+        // It would be possible to bound this tighter but this here is simpler
+        if (num_entry_point_offsets > get_bits_left(gb)) {
+            av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets);
+            return AVERROR_INVALIDDATA;
+        }
+
+        sh->num_entry_point_offsets = num_entry_point_offsets;
         if (sh->num_entry_point_offsets > 0) {
             int offset_len = get_ue_golomb_long(gb) + 1;
 
-            for (i = 0; i < sh->num_entry_point_offsets; i++)
-                skip_bits(gb, offset_len);
-        }
+            if (offset_len < 1 || offset_len > 32) {
+                sh->num_entry_point_offsets = 0;
+                av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len);
+                return AVERROR_INVALIDDATA;
+            }
+
+            av_freep(&sh->entry_point_offset);
+            av_freep(&sh->offset);
+            av_freep(&sh->size);
+            sh->entry_point_offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(int));
+            sh->offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(int));
+            sh->size = av_malloc_array(sh->num_entry_point_offsets, sizeof(int));
+            if (!sh->entry_point_offset || !sh->offset || !sh->size) {
+                sh->num_entry_point_offsets = 0;
+                av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n");
+                return AVERROR(ENOMEM);
+            }
+            for (i = 0; i < sh->num_entry_point_offsets; i++) {
+                unsigned val = get_bits_long(gb, offset_len);
+                sh->entry_point_offset[i] = val + 1; // +1; // +1 to get the size
+            }
+            if (s->threads_number > 1 && (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)) {
+                s->enable_parallel_tiles = 0; // TODO: you can enable tiles in parallel here
+                s->threads_number = 1;
+            } else
+                s->enable_parallel_tiles = 0;
+        } else
+            s->enable_parallel_tiles = 0;
     }
 
     if (s->ps.pps->slice_header_extension_present_flag) {
         unsigned int length = get_ue_golomb_long(gb);
+        if (length*8LL > get_bits_left(gb)) {
+            av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n");
+            return AVERROR_INVALIDDATA;
+        }
         for (i = 0; i < length; i++)
             skip_bits(gb, 8);  // slice_header_extension_data_byte
     }
 
     // Inferred parameters
-    sh->slice_qp = 26 + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
+    sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
     if (sh->slice_qp > 51 ||
         sh->slice_qp < -s->ps.sps->qp_bd_offset) {
         av_log(s->avctx, AV_LOG_ERROR,
@@ -767,13 +794,22 @@ static int hls_slice_header(HEVCContext *s)
         return AVERROR_INVALIDDATA;
     }
 
-    s->HEVClc.first_qp_group = !s->sh.dependent_slice_segment_flag;
+    if (get_bits_left(gb) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Overread slice header by %d bits\n", -get_bits_left(gb));
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->HEVClc->first_qp_group = !s->sh.dependent_slice_segment_flag;
 
     if (!s->ps.pps->cu_qp_delta_enabled_flag)
-        s->HEVClc.qp_y = FFUMOD(s->sh.slice_qp + 52 + 2 * s->ps.sps->qp_bd_offset,
-                                52 + s->ps.sps->qp_bd_offset) - s->ps.sps->qp_bd_offset;
+        s->HEVClc->qp_y = s->sh.slice_qp;
 
     s->slice_initialized = 1;
+    s->HEVClc->tu.cu_qp_offset_cb = 0;
+    s->HEVClc->tu.cu_qp_offset_cr = 0;
+
+    s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == NAL_CRA_NUT && s->last_eos);
 
     return 0;
 }
@@ -794,10 +830,9 @@ do {                                                    \
 
 static void hls_sao_param(HEVCContext *s, int rx, int ry)
 {
-    HEVCLocalContext *lc    = &s->HEVClc;
+    HEVCLocalContext *lc    = s->HEVClc;
     int sao_merge_left_flag = 0;
     int sao_merge_up_flag   = 0;
-    int shift               = s->ps.sps->bit_depth - FFMIN(s->ps.sps->bit_depth, 10);
     SAOParams *sao          = &CTB(s->sao, rx, ry);
     int c_idx, i;
 
@@ -813,7 +848,10 @@ static void hls_sao_param(HEVCContext *s, int rx, int ry)
         }
     }
 
-    for (c_idx = 0; c_idx < 3; c_idx++) {
+    for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) {
+        int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma :
+                                                 s->ps.pps->log2_sao_offset_scale_chroma;
+
         if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) {
             sao->type_idx[c_idx] = SAO_NOT_APPLIED;
             continue;
@@ -849,13 +887,14 @@ static void hls_sao_param(HEVCContext *s, int rx, int ry)
         // Inferred parameters
         sao->offset_val[c_idx][0] = 0;
         for (i = 0; i < 4; i++) {
-            sao->offset_val[c_idx][i + 1] = sao->offset_abs[c_idx][i] << shift;
+            sao->offset_val[c_idx][i + 1] = sao->offset_abs[c_idx][i];
             if (sao->type_idx[c_idx] == SAO_EDGE) {
                 if (i > 1)
                     sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
             } else if (sao->offset_sign[c_idx][i]) {
                 sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
             }
+            sao->offset_val[c_idx][i + 1] *= 1 << log2_sao_offset_scale;
         }
     }
 }
@@ -863,384 +902,45 @@ static void hls_sao_param(HEVCContext *s, int rx, int ry)
 #undef SET_SAO
 #undef CTB
 
-static void hls_residual_coding(HEVCContext *s, int x0, int y0,
-                                int log2_trafo_size, enum ScanType scan_idx,
-                                int c_idx)
-{
-#define GET_COORD(offset, n)                                    \
-    do {                                                        \
-        x_c = (scan_x_cg[offset >> 4] << 2) + scan_x_off[n];    \
-        y_c = (scan_y_cg[offset >> 4] << 2) + scan_y_off[n];    \
-    } while (0)
-    HEVCLocalContext *lc    = &s->HEVClc;
-    int transform_skip_flag = 0;
-
-    int last_significant_coeff_x, last_significant_coeff_y;
-    int last_scan_pos;
-    int n_end;
-    int num_coeff    = 0;
-    int greater1_ctx = 1;
-
-    int num_last_subset;
-    int x_cg_last_sig, y_cg_last_sig;
-
-    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
-
-    ptrdiff_t stride = s->frame->linesize[c_idx];
-    int hshift       = s->ps.sps->hshift[c_idx];
-    int vshift       = s->ps.sps->vshift[c_idx];
-    uint8_t *dst     = &s->frame->data[c_idx][(y0 >> vshift) * stride +
-                                              ((x0 >> hshift) << s->ps.sps->pixel_shift)];
-    DECLARE_ALIGNED(16, int16_t, coeffs[MAX_TB_SIZE * MAX_TB_SIZE]) = { 0 };
-    DECLARE_ALIGNED(8, uint8_t, significant_coeff_group_flag[8][8]) = { { 0 } };
-
-    int trafo_size = 1 << log2_trafo_size;
-    int i, qp, shift, add, scale, scale_m;
-    const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
-    const uint8_t *scale_matrix;
-    uint8_t dc_scale;
-
-    // Derive QP for dequant
-    if (!lc->cu.cu_transquant_bypass_flag) {
-        static const int qp_c[] = {
-            29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37
-        };
-
-        static const uint8_t rem6[51 + 2 * 6 + 1] = {
-            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
-            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
-        };
-
-        static const uint8_t div6[51 + 2 * 6 + 1] = {
-            0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,  3,  3,  3,
-            3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6,  6,  6,  6,
-            7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
-        };
-        int qp_y = lc->qp_y;
-
-        if (c_idx == 0) {
-            qp = qp_y + s->ps.sps->qp_bd_offset;
-        } else {
-            int qp_i, offset;
-
-            if (c_idx == 1)
-                offset = s->ps.pps->cb_qp_offset + s->sh.slice_cb_qp_offset;
-            else
-                offset = s->ps.pps->cr_qp_offset + s->sh.slice_cr_qp_offset;
-
-            qp_i = av_clip(qp_y + offset, -s->ps.sps->qp_bd_offset, 57);
-            if (qp_i < 30)
-                qp = qp_i;
-            else if (qp_i > 43)
-                qp = qp_i - 6;
-            else
-                qp = qp_c[qp_i - 30];
-
-            qp += s->ps.sps->qp_bd_offset;
-        }
-
-        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
-        add      = 1 << (shift - 1);
-        scale    = level_scale[rem6[qp]] << (div6[qp]);
-        scale_m  = 16; // default when no custom scaling lists.
-        dc_scale = 16;
-
-        if (s->ps.sps->scaling_list_enable_flag) {
-            const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
-                                    &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
-            int matrix_id = lc->cu.pred_mode != MODE_INTRA;
-
-            if (log2_trafo_size != 5)
-                matrix_id = 3 * matrix_id + c_idx;
-
-            scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
-            if (log2_trafo_size >= 4)
-                dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
-        }
-    }
-
-    if (s->ps.pps->transform_skip_enabled_flag &&
-        !lc->cu.cu_transquant_bypass_flag   &&
-        log2_trafo_size == 2) {
-        transform_skip_flag = ff_hevc_transform_skip_flag_decode(s, c_idx);
-    }
-
-    last_significant_coeff_x =
-        ff_hevc_last_significant_coeff_x_prefix_decode(s, c_idx, log2_trafo_size);
-    last_significant_coeff_y =
-        ff_hevc_last_significant_coeff_y_prefix_decode(s, c_idx, log2_trafo_size);
+static int hls_cross_component_pred(HEVCContext *s, int idx) {
+    HEVCLocalContext *lc    = s->HEVClc;
+    int log2_res_scale_abs_plus1 = ff_hevc_log2_res_scale_abs(s, idx);
 
-    if (last_significant_coeff_x > 3) {
-        int suffix = ff_hevc_last_significant_coeff_suffix_decode(s, last_significant_coeff_x);
-        last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
-                                   (2 + (last_significant_coeff_x & 1)) +
-                                   suffix;
-    }
-
-    if (last_significant_coeff_y > 3) {
-        int suffix = ff_hevc_last_significant_coeff_suffix_decode(s, last_significant_coeff_y);
-        last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
-                                   (2 + (last_significant_coeff_y & 1)) +
-                                   suffix;
-    }
-
-    if (scan_idx == SCAN_VERT)
-        FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
-
-    x_cg_last_sig = last_significant_coeff_x >> 2;
-    y_cg_last_sig = last_significant_coeff_y >> 2;
-
-    switch (scan_idx) {
-    case SCAN_DIAG: {
-        int last_x_c = last_significant_coeff_x & 3;
-        int last_y_c = last_significant_coeff_y & 3;
-
-        scan_x_off = ff_hevc_diag_scan4x4_x;
-        scan_y_off = ff_hevc_diag_scan4x4_y;
-        num_coeff  = diag_scan4x4_inv[last_y_c][last_x_c];
-        if (trafo_size == 4) {
-            scan_x_cg = scan_1x1;
-            scan_y_cg = scan_1x1;
-        } else if (trafo_size == 8) {
-            num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-            scan_x_cg  = diag_scan2x2_x;
-            scan_y_cg  = diag_scan2x2_y;
-        } else if (trafo_size == 16) {
-            num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-            scan_x_cg  = ff_hevc_diag_scan4x4_x;
-            scan_y_cg  = ff_hevc_diag_scan4x4_y;
-        } else { // trafo_size == 32
-            num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-            scan_x_cg  = ff_hevc_diag_scan8x8_x;
-            scan_y_cg  = ff_hevc_diag_scan8x8_y;
-        }
-        break;
-    }
-    case SCAN_HORIZ:
-        scan_x_cg  = horiz_scan2x2_x;
-        scan_y_cg  = horiz_scan2x2_y;
-        scan_x_off = horiz_scan4x4_x;
-        scan_y_off = horiz_scan4x4_y;
-        num_coeff  = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
-        break;
-    default: //SCAN_VERT
-        scan_x_cg  = horiz_scan2x2_y;
-        scan_y_cg  = horiz_scan2x2_x;
-        scan_x_off = horiz_scan4x4_y;
-        scan_y_off = horiz_scan4x4_x;
-        num_coeff  = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
-        break;
+    if (log2_res_scale_abs_plus1 !=  0) {
+        int res_scale_sign_flag = ff_hevc_res_scale_sign_flag(s, idx);
+        lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) *
+                               (1 - 2 * res_scale_sign_flag);
+    } else {
+        lc->tu.res_scale_val = 0;
     }
-    num_coeff++;
-    num_last_subset = (num_coeff - 1) >> 4;
-
-    for (i = num_last_subset; i >= 0; i--) {
-        int n, m;
-        int x_cg, y_cg, x_c, y_c;
-        int implicit_non_zero_coeff = 0;
-        int64_t trans_coeff_level;
-        int prev_sig = 0;
-        int offset   = i << 4;
-
-        uint8_t significant_coeff_flag_idx[16];
-        uint8_t nb_significant_coeff_flag = 0;
-
-        x_cg = scan_x_cg[i];
-        y_cg = scan_y_cg[i];
-
-        if (i < num_last_subset && i > 0) {
-            int ctx_cg = 0;
-            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
-                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
-            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
-                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
-
-            significant_coeff_group_flag[x_cg][y_cg] =
-                ff_hevc_significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
-            implicit_non_zero_coeff = 1;
-        } else {
-            significant_coeff_group_flag[x_cg][y_cg] =
-                ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
-                 (x_cg == 0 && y_cg == 0));
-        }
-
-        last_scan_pos = num_coeff - offset - 1;
-
-        if (i == num_last_subset) {
-            n_end                         = last_scan_pos - 1;
-            significant_coeff_flag_idx[0] = last_scan_pos;
-            nb_significant_coeff_flag     = 1;
-        } else {
-            n_end = 15;
-        }
-
-        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
-            prev_sig = significant_coeff_group_flag[x_cg + 1][y_cg];
-        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
-            prev_sig += significant_coeff_group_flag[x_cg][y_cg + 1] << 1;
-
-        for (n = n_end; n >= 0; n--) {
-            GET_COORD(offset, n);
-
-            if (significant_coeff_group_flag[x_cg][y_cg] &&
-                (n > 0 || implicit_non_zero_coeff == 0)) {
-                if (ff_hevc_significant_coeff_flag_decode(s, c_idx, x_c, y_c,
-                                                          log2_trafo_size,
-                                                          scan_idx,
-                                                          prev_sig) == 1) {
-                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
-                    nb_significant_coeff_flag++;
-                    implicit_non_zero_coeff = 0;
-                }
-            } else {
-                int last_cg = (x_c == (x_cg << 2) && y_c == (y_cg << 2));
-                if (last_cg && implicit_non_zero_coeff && significant_coeff_group_flag[x_cg][y_cg]) {
-                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
-                    nb_significant_coeff_flag++;
-                }
-            }
-        }
-
-        n_end = nb_significant_coeff_flag;
-
-        if (n_end) {
-            int first_nz_pos_in_cg = 16;
-            int last_nz_pos_in_cg = -1;
-            int c_rice_param = 0;
-            int first_greater1_coeff_idx = -1;
-            uint8_t coeff_abs_level_greater1_flag[16] = { 0 };
-            uint16_t coeff_sign_flag;
-            int sum_abs = 0;
-            int sign_hidden = 0;
-
-            // initialize first elem of coeff_bas_level_greater1_flag
-            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
-
-            if (!(i == num_last_subset) && greater1_ctx == 0)
-                ctx_set++;
-            greater1_ctx      = 1;
-            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
-
-            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
-                int n_idx = significant_coeff_flag_idx[m];
-                int inc   = (ctx_set << 2) + greater1_ctx;
-                coeff_abs_level_greater1_flag[n_idx] =
-                    ff_hevc_coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
-                if (coeff_abs_level_greater1_flag[n_idx]) {
-                    greater1_ctx = 0;
-                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
-                    greater1_ctx++;
-                }
 
-                if (coeff_abs_level_greater1_flag[n_idx] &&
-                    first_greater1_coeff_idx == -1)
-                    first_greater1_coeff_idx = n_idx;
-            }
-            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
-            sign_hidden        = last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 &&
-                                 !lc->cu.cu_transquant_bypass_flag;
-
-            if (first_greater1_coeff_idx != -1) {
-                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += ff_hevc_coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
-            }
-            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden) {
-                coeff_sign_flag = ff_hevc_coeff_sign_flag(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
-            } else {
-                coeff_sign_flag = ff_hevc_coeff_sign_flag(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
-            }
-
-            for (m = 0; m < n_end; m++) {
-                n = significant_coeff_flag_idx[m];
-                GET_COORD(offset, n);
-                trans_coeff_level = 1 + coeff_abs_level_greater1_flag[n];
-                if (trans_coeff_level == ((m < 8) ?
-                                          ((n == first_greater1_coeff_idx) ? 3 : 2) : 1)) {
-                    int last_coeff_abs_level_remaining = ff_hevc_coeff_abs_level_remaining(s, trans_coeff_level, c_rice_param);
-
-                    trans_coeff_level += last_coeff_abs_level_remaining;
-                    if ((trans_coeff_level) > (3 * (1 << c_rice_param)))
-                        c_rice_param = FFMIN(c_rice_param + 1, 4);
-                }
-                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
-                    sum_abs += trans_coeff_level;
-                    if (n == first_nz_pos_in_cg && ((sum_abs & 1) == 1))
-                        trans_coeff_level = -trans_coeff_level;
-                }
-                if (coeff_sign_flag >> 15)
-                    trans_coeff_level = -trans_coeff_level;
-                coeff_sign_flag <<= 1;
-                if (!lc->cu.cu_transquant_bypass_flag) {
-                    if (s->ps.sps->scaling_list_enable_flag) {
-                        if (y_c || x_c || log2_trafo_size < 4) {
-                            int pos;
-                            switch (log2_trafo_size) {
-                            case 3:  pos = (y_c        << 3) +  x_c;       break;
-                            case 4:  pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
-                            case 5:  pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
-                            default: pos = (y_c        << 2) +  x_c;
-                            }
-                            scale_m = scale_matrix[pos];
-                        } else {
-                            scale_m = dc_scale;
-                        }
-                    }
-                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
-                    if(trans_coeff_level < 0) {
-                        if((~trans_coeff_level) & 0xFffffffffff8000)
-                            trans_coeff_level = -32768;
-                    } else {
-                        if (trans_coeff_level & 0xffffffffffff8000)
-                            trans_coeff_level = 32767;
-                    }
-                }
-                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
-            }
-        }
-    }
 
-    if (lc->cu.cu_transquant_bypass_flag) {
-        s->hevcdsp.transquant_bypass[log2_trafo_size - 2](dst, coeffs, stride);
-    } else {
-        if (transform_skip_flag)
-            s->hevcdsp.transform_skip(dst, coeffs, stride);
-        else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 &&
-                 log2_trafo_size == 2)
-            s->hevcdsp.transform_4x4_luma_add(dst, coeffs, stride);
-        else
-            s->hevcdsp.transform_add[log2_trafo_size - 2](dst, coeffs, stride);
-    }
+    return 0;
 }
 
 static int hls_transform_unit(HEVCContext *s, int x0, int y0,
                               int xBase, int yBase, int cb_xBase, int cb_yBase,
                               int log2_cb_size, int log2_trafo_size,
-                              int blk_idx, int cbf_luma, int cbf_cb, int cbf_cr)
+                              int blk_idx, int cbf_luma, int *cbf_cb, int *cbf_cr)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
+    const int log2_trafo_size_c = log2_trafo_size - s->ps.sps->hshift[1];
+    int i;
 
     if (lc->cu.pred_mode == MODE_INTRA) {
         int trafo_size = 1 << log2_trafo_size;
         ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
 
         s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
-        if (log2_trafo_size > 2) {
-            trafo_size = trafo_size << (s->ps.sps->hshift[1] - 1);
-            ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
-            s->hpc.intra_pred[log2_trafo_size - 3](s, x0, y0, 1);
-            s->hpc.intra_pred[log2_trafo_size - 3](s, x0, y0, 2);
-        } else if (blk_idx == 3) {
-            trafo_size = trafo_size << s->ps.sps->hshift[1];
-            ff_hevc_set_neighbour_available(s, xBase, yBase,
-                                            trafo_size, trafo_size);
-            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
-            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
-        }
     }
 
-    if (cbf_luma || cbf_cb || cbf_cr) {
+    if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
+        (s->ps.sps->chroma_format_idc == 2 && (cbf_cb[1] || cbf_cr[1]))) {
         int scan_idx   = SCAN_DIAG;
         int scan_idx_c = SCAN_DIAG;
+        int cbf_chroma = cbf_cb[0] || cbf_cr[0] ||
+                         (s->ps.sps->chroma_format_idc == 2 &&
+                         (cbf_cb[1] || cbf_cr[1]));
 
         if (s->ps.pps->cu_qp_delta_enabled_flag && !lc->tu.is_cu_qp_delta_coded) {
             lc->tu.cu_qp_delta = ff_hevc_cu_qp_delta_abs(s);
@@ -1260,41 +960,167 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
                 return AVERROR_INVALIDDATA;
             }
 
-            ff_hevc_set_qPy(s, x0, y0, cb_xBase, cb_yBase, log2_cb_size);
+            ff_hevc_set_qPy(s, cb_xBase, cb_yBase, log2_cb_size);
+        }
+
+        if (s->sh.cu_chroma_qp_offset_enabled_flag && cbf_chroma &&
+            !lc->cu.cu_transquant_bypass_flag  &&  !lc->tu.is_cu_chroma_qp_offset_coded) {
+            int cu_chroma_qp_offset_flag = ff_hevc_cu_chroma_qp_offset_flag(s);
+            if (cu_chroma_qp_offset_flag) {
+                int cu_chroma_qp_offset_idx  = 0;
+                if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) {
+                    cu_chroma_qp_offset_idx = ff_hevc_cu_chroma_qp_offset_idx(s);
+                    av_log(s->avctx, AV_LOG_ERROR,
+                        "cu_chroma_qp_offset_idx not yet tested.\n");
+                }
+                lc->tu.cu_qp_offset_cb = s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx];
+                lc->tu.cu_qp_offset_cr = s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx];
+            } else {
+                lc->tu.cu_qp_offset_cb = 0;
+                lc->tu.cu_qp_offset_cr = 0;
+            }
+            lc->tu.is_cu_chroma_qp_offset_coded = 1;
         }
 
         if (lc->cu.pred_mode == MODE_INTRA && log2_trafo_size < 4) {
-            if (lc->tu.cur_intra_pred_mode >= 6 &&
-                lc->tu.cur_intra_pred_mode <= 14) {
+            if (lc->tu.intra_pred_mode >= 6 &&
+                lc->tu.intra_pred_mode <= 14) {
                 scan_idx = SCAN_VERT;
-            } else if (lc->tu.cur_intra_pred_mode >= 22 &&
-                       lc->tu.cur_intra_pred_mode <= 30) {
+            } else if (lc->tu.intra_pred_mode >= 22 &&
+                       lc->tu.intra_pred_mode <= 30) {
                 scan_idx = SCAN_HORIZ;
             }
 
-            if (lc->pu.intra_pred_mode_c >=  6 &&
-                lc->pu.intra_pred_mode_c <= 14) {
+            if (lc->tu.intra_pred_mode_c >=  6 &&
+                lc->tu.intra_pred_mode_c <= 14) {
                 scan_idx_c = SCAN_VERT;
-            } else if (lc->pu.intra_pred_mode_c >= 22 &&
-                       lc->pu.intra_pred_mode_c <= 30) {
+            } else if (lc->tu.intra_pred_mode_c >= 22 &&
+                       lc->tu.intra_pred_mode_c <= 30) {
                 scan_idx_c = SCAN_HORIZ;
             }
         }
 
+        lc->tu.cross_pf = 0;
+
         if (cbf_luma)
-            hls_residual_coding(s, x0, y0, log2_trafo_size, scan_idx, 0);
-        if (log2_trafo_size > 2) {
-            if (cbf_cb)
-                hls_residual_coding(s, x0, y0, log2_trafo_size - 1, scan_idx_c, 1);
-            if (cbf_cr)
-                hls_residual_coding(s, x0, y0, log2_trafo_size - 1, scan_idx_c, 2);
+            ff_hevc_hls_residual_coding(s, x0, y0, log2_trafo_size, scan_idx, 0);
+        if (s->ps.sps->chroma_format_idc && (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3)) {
+            int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+            int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+            lc->tu.cross_pf  = (s->ps.pps->cross_component_prediction_enabled_flag && cbf_luma &&
+                                (lc->cu.pred_mode == MODE_INTER ||
+                                 (lc->tu.chroma_mode_c ==  4)));
+
+            if (lc->tu.cross_pf) {
+                hls_cross_component_pred(s, 0);
+            }
+            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                if (lc->cu.pred_mode == MODE_INTRA) {
+                    ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+                    s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+                }
+                if (cbf_cb[i])
+                    ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+                                                log2_trafo_size_c, scan_idx_c, 1);
+                else
+                    if (lc->tu.cross_pf) {
+                        ptrdiff_t stride = s->frame->linesize[1];
+                        int hshift = s->ps.sps->hshift[1];
+                        int vshift = s->ps.sps->vshift[1];
+                        int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+                        int16_t *coeffs   = (int16_t*)lc->edge_emu_buffer2;
+                        int size = 1 << log2_trafo_size_c;
+
+                        uint8_t *dst = &s->frame->data[1][(y0 >> vshift) * stride +
+                                                              ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+                        for (i = 0; i < (size * size); i++) {
+                            coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+                        }
+                        s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+                    }
+            }
+
+            if (lc->tu.cross_pf) {
+                hls_cross_component_pred(s, 1);
+            }
+            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                if (lc->cu.pred_mode == MODE_INTRA) {
+                    ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+                    s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+                }
+                if (cbf_cr[i])
+                    ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+                                                log2_trafo_size_c, scan_idx_c, 2);
+                else
+                    if (lc->tu.cross_pf) {
+                        ptrdiff_t stride = s->frame->linesize[2];
+                        int hshift = s->ps.sps->hshift[2];
+                        int vshift = s->ps.sps->vshift[2];
+                        int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+                        int16_t *coeffs   = (int16_t*)lc->edge_emu_buffer2;
+                        int size = 1 << log2_trafo_size_c;
+
+                        uint8_t *dst = &s->frame->data[2][(y0 >> vshift) * stride +
+                                                          ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+                        for (i = 0; i < (size * size); i++) {
+                            coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+                        }
+                        s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+                    }
+            }
+        } else if (s->ps.sps->chroma_format_idc && blk_idx == 3) {
+            int trafo_size_h = 1 << (log2_trafo_size + 1);
+            int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
+            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                if (lc->cu.pred_mode == MODE_INTRA) {
+                    ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+                                                    trafo_size_h, trafo_size_v);
+                    s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+                }
+                if (cbf_cb[i])
+                    ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+                                                log2_trafo_size, scan_idx_c, 1);
+            }
+            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                if (lc->cu.pred_mode == MODE_INTRA) {
+                    ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+                                                trafo_size_h, trafo_size_v);
+                    s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+                }
+                if (cbf_cr[i])
+                    ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+                                                log2_trafo_size, scan_idx_c, 2);
+            }
+        }
+    } else if (s->ps.sps->chroma_format_idc && lc->cu.pred_mode == MODE_INTRA) {
+        if (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3) {
+            int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+            int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+            ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+            s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
+            s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+            if (s->ps.sps->chroma_format_idc == 2) {
+                ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
+                                                trafo_size_h, trafo_size_v);
+                s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
+                s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+            }
         } else if (blk_idx == 3) {
-            if (cbf_cb)
-                hls_residual_coding(s, xBase, yBase, log2_trafo_size, scan_idx_c, 1);
-            if (cbf_cr)
-                hls_residual_coding(s, xBase, yBase, log2_trafo_size, scan_idx_c, 2);
+            int trafo_size_h = 1 << (log2_trafo_size + 1);
+            int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
+            ff_hevc_set_neighbour_available(s, xBase, yBase,
+                                            trafo_size_h, trafo_size_v);
+            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
+            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+            if (s->ps.sps->chroma_format_idc == 2) {
+                ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
+                                                trafo_size_h, trafo_size_v);
+                s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
+                s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+            }
         }
     }
+
     return 0;
 }
 
@@ -1317,17 +1143,34 @@ static int hls_transform_tree(HEVCContext *s, int x0, int y0,
                               int xBase, int yBase, int cb_xBase, int cb_yBase,
                               int log2_cb_size, int log2_trafo_size,
                               int trafo_depth, int blk_idx,
-                              int cbf_cb, int cbf_cr)
+                              const int *base_cbf_cb, const int *base_cbf_cr)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     uint8_t split_transform_flag;
+    int cbf_cb[2];
+    int cbf_cr[2];
     int ret;
 
+    cbf_cb[0] = base_cbf_cb[0];
+    cbf_cb[1] = base_cbf_cb[1];
+    cbf_cr[0] = base_cbf_cr[0];
+    cbf_cr[1] = base_cbf_cr[1];
+
     if (lc->cu.intra_split_flag) {
-        if (trafo_depth == 1)
-            lc->tu.cur_intra_pred_mode = lc->pu.intra_pred_mode[blk_idx];
+        if (trafo_depth == 1) {
+            lc->tu.intra_pred_mode   = lc->pu.intra_pred_mode[blk_idx];
+            if (s->ps.sps->chroma_format_idc == 3) {
+                lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx];
+                lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[blk_idx];
+            } else {
+                lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
+                lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[0];
+            }
+        }
     } else {
-        lc->tu.cur_intra_pred_mode = lc->pu.intra_pred_mode[0];
+        lc->tu.intra_pred_mode   = lc->pu.intra_pred_mode[0];
+        lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
+        lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[0];
     }
 
     if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size &&
@@ -1346,14 +1189,21 @@ static int hls_transform_tree(HEVCContext *s, int x0, int y0,
                                inter_split;
     }
 
-    if (log2_trafo_size > 2 && (trafo_depth == 0 || cbf_cb))
-        cbf_cb = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
-    else if (log2_trafo_size > 2 || trafo_depth == 0)
-        cbf_cb = 0;
-    if (log2_trafo_size > 2 && (trafo_depth == 0 || cbf_cr))
-        cbf_cr = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
-    else if (log2_trafo_size > 2 || trafo_depth == 0)
-        cbf_cr = 0;
+    if (s->ps.sps->chroma_format_idc && (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3)) {
+        if (trafo_depth == 0 || cbf_cb[0]) {
+            cbf_cb[0] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+            if (s->ps.sps->chroma_format_idc == 2 && (!split_transform_flag || log2_trafo_size == 3)) {
+                cbf_cb[1] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+            }
+        }
+
+        if (trafo_depth == 0 || cbf_cr[0]) {
+            cbf_cr[0] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+            if (s->ps.sps->chroma_format_idc == 2 && (!split_transform_flag || log2_trafo_size == 3)) {
+                cbf_cr[1] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+            }
+        }
+    }
 
     if (split_transform_flag) {
         const int trafo_size_split = 1 << (log2_trafo_size - 1);
@@ -1382,8 +1232,10 @@ do {
         int cbf_luma         = 1;
 
         if (lc->cu.pred_mode == MODE_INTRA || trafo_depth != 0 ||
-            cbf_cb || cbf_cr)
+            cbf_cb[0] || cbf_cr[0] ||
+            (s->ps.sps->chroma_format_idc == 2 && (cbf_cb[1] || cbf_cr[1]))) {
             cbf_luma = ff_hevc_cbf_luma_decode(s, trafo_depth);
+        }
 
         ret = hls_transform_unit(s, x0, y0, xBase, yBase, cb_xBase, cb_yBase,
                                  log2_cb_size, log2_trafo_size,
@@ -1412,8 +1264,7 @@ do {
 
 static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
 {
-    //TODO: non-4:2:0 support
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     GetBitContext gb;
     int cb_size   = 1 << log2_cb_size;
     int stride0   = s->frame->linesize[0];
@@ -1423,7 +1274,10 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
     int   stride2 = s->frame->linesize[2];
     uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
 
-    int length         = cb_size * cb_size * s->ps.sps->pcm.bit_depth + ((cb_size * cb_size) >> 1) * s->ps.sps->pcm.bit_depth_chroma;
+    int length         = cb_size * cb_size * s->ps.sps->pcm.bit_depth +
+                         (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) +
+                          ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) *
+                          s->ps.sps->pcm.bit_depth_chroma;
     const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3);
     int ret;
 
@@ -1434,38 +1288,23 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
     if (ret < 0)
         return ret;
 
-    s->hevcdsp.put_pcm(dst0, stride0, cb_size,     &gb, s->ps.sps->pcm.bit_depth);
-    s->hevcdsp.put_pcm(dst1, stride1, cb_size / 2, &gb, s->ps.sps->pcm.bit_depth_chroma);
-    s->hevcdsp.put_pcm(dst2, stride2, cb_size / 2, &gb, s->ps.sps->pcm.bit_depth_chroma);
-    return 0;
-}
-
-static void hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
-{
-    HEVCLocalContext *lc = &s->HEVClc;
-    int x = ff_hevc_abs_mvd_greater0_flag_decode(s);
-    int y = ff_hevc_abs_mvd_greater0_flag_decode(s);
-
-    if (x)
-        x += ff_hevc_abs_mvd_greater1_flag_decode(s);
-    if (y)
-        y += ff_hevc_abs_mvd_greater1_flag_decode(s);
-
-    switch (x) {
-    case 2: lc->pu.mvd.x = ff_hevc_mvd_decode(s);           break;
-    case 1: lc->pu.mvd.x = ff_hevc_mvd_sign_flag_decode(s); break;
-    case 0: lc->pu.mvd.x = 0;                               break;
+    s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size,     &gb, s->ps.sps->pcm.bit_depth);
+    if (s->ps.sps->chroma_format_idc) {
+        s->hevcdsp.put_pcm(dst1, stride1,
+                           cb_size >> s->ps.sps->hshift[1],
+                           cb_size >> s->ps.sps->vshift[1],
+                           &gb, s->ps.sps->pcm.bit_depth_chroma);
+        s->hevcdsp.put_pcm(dst2, stride2,
+                           cb_size >> s->ps.sps->hshift[2],
+                           cb_size >> s->ps.sps->vshift[2],
+                           &gb, s->ps.sps->pcm.bit_depth_chroma);
     }
 
-    switch (y) {
-    case 2: lc->pu.mvd.y = ff_hevc_mvd_decode(s);           break;
-    case 1: lc->pu.mvd.y = ff_hevc_mvd_sign_flag_decode(s); break;
-    case 0: lc->pu.mvd.y = 0;                               break;
-    }
+    return 0;
 }
 
 /**
- * 8.5.3.2.2.1 Luma sample interpolation process
+ * 8.5.3.2.2.1 Luma sample unidirectional interpolation process
  *
  * @param s HEVC decoding context
  * @param dst target buffer for block data at block position
@@ -1476,49 +1315,147 @@ static void hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
  * @param y_off vertical position of block from origin (0, 0)
  * @param block_w width of block
  * @param block_h height of block
+ * @param luma_weight weighting factor applied to the luma prediction
+ * @param luma_offset additive offset applied to the luma prediction value
  */
-static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride,
-                    AVFrame *ref, const Mv *mv, int x_off, int y_off,
-                    int block_w, int block_h)
+
+static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
+                        int block_w, int block_h, int luma_weight, int luma_offset)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     uint8_t *src         = ref->data[0];
     ptrdiff_t srcstride  = ref->linesize[0];
     int pic_width        = s->ps.sps->width;
     int pic_height       = s->ps.sps->height;
-
-    int mx         = mv->x & 3;
-    int my         = mv->y & 3;
-    int extra_left = ff_hevc_qpel_extra_before[mx];
-    int extra_top  = ff_hevc_qpel_extra_before[my];
+    int mx               = mv->x & 3;
+    int my               = mv->y & 3;
+    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+    int idx              = ff_hevc_pel_weight[block_w];
 
     x_off += mv->x >> 2;
     y_off += mv->y >> 2;
     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
 
-    if (x_off < extra_left || y_off < extra_top ||
-        x_off >= pic_width - block_w - ff_hevc_qpel_extra_after[mx] ||
-        y_off >= pic_height - block_h - ff_hevc_qpel_extra_after[my]) {
+    if (x_off < QPEL_EXTRA_BEFORE || y_off < QPEL_EXTRA_AFTER ||
+        x_off >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+        y_off >= pic_height - block_h - QPEL_EXTRA_AFTER) {
         const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
-        int offset = extra_top * srcstride + (extra_left << s->ps.sps->pixel_shift);
-        int buf_offset = extra_top *
-                         edge_emu_stride + (extra_left << s->ps.sps->pixel_shift);
+        int offset     = QPEL_EXTRA_BEFORE * srcstride       + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift);
+        int buf_offset = QPEL_EXTRA_BEFORE * edge_emu_stride + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift);
 
         s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src - offset,
                                  edge_emu_stride, srcstride,
-                                 block_w + ff_hevc_qpel_extra[mx],
-                                 block_h + ff_hevc_qpel_extra[my],
-                                 x_off - extra_left, y_off - extra_top,
+                                 block_w + QPEL_EXTRA,
+                                 block_h + QPEL_EXTRA,
+                                 x_off - QPEL_EXTRA_BEFORE, y_off - QPEL_EXTRA_BEFORE,
                                  pic_width, pic_height);
         src = lc->edge_emu_buffer + buf_offset;
         srcstride = edge_emu_stride;
     }
-    s->hevcdsp.put_hevc_qpel[my][mx](dst, dststride, src, srcstride, block_w,
-                                     block_h, lc->mc_buffer);
+
+    if (!weight_flag)
+        s->hevcdsp.put_hevc_qpel_uni[idx][!!my][!!mx](dst, dststride, src, srcstride,
+                                                      block_h, mx, my, block_w);
+    else
+        s->hevcdsp.put_hevc_qpel_uni_w[idx][!!my][!!mx](dst, dststride, src, srcstride,
+                                                        block_h, s->sh.luma_log2_weight_denom,
+                                                        luma_weight, luma_offset, mx, my, block_w);
+}
+
+/**
+ * 8.5.3.2.2.1 Luma sample bidirectional interpolation process
+ *
+ * @param s HEVC decoding context
+ * @param dst target buffer for block data at block position
+ * @param dststride stride of the dst buffer
+ * @param ref0 reference picture0 buffer at origin (0, 0)
+ * @param mv0 motion vector0 (relative to block position) to get pixel data from
+ * @param x_off horizontal position of block from origin (0, 0)
+ * @param y_off vertical position of block from origin (0, 0)
+ * @param block_w width of block
+ * @param block_h height of block
+ * @param ref1 reference picture1 buffer at origin (0, 0)
+ * @param mv1 motion vector1 (relative to block position) to get pixel data from
+ * @param current_mv current motion vector structure
+ */
+ static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+{
+    HEVCLocalContext *lc = s->HEVClc;
+    ptrdiff_t src0stride  = ref0->linesize[0];
+    ptrdiff_t src1stride  = ref1->linesize[0];
+    int pic_width        = s->ps.sps->width;
+    int pic_height       = s->ps.sps->height;
+    int mx0              = mv0->x & 3;
+    int my0              = mv0->y & 3;
+    int mx1              = mv1->x & 3;
+    int my1              = mv1->y & 3;
+    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+    int x_off0           = x_off + (mv0->x >> 2);
+    int y_off0           = y_off + (mv0->y >> 2);
+    int x_off1           = x_off + (mv1->x >> 2);
+    int y_off1           = y_off + (mv1->y >> 2);
+    int idx              = ff_hevc_pel_weight[block_w];
+
+    uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
+    uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+
+    if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
+        x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+        y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
+        const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        int offset     = QPEL_EXTRA_BEFORE * src0stride       + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift);
+        int buf_offset = QPEL_EXTRA_BEFORE * edge_emu_stride + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift);
+
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src0 - offset,
+                                 edge_emu_stride, src0stride,
+                                 block_w + QPEL_EXTRA,
+                                 block_h + QPEL_EXTRA,
+                                 x_off0 - QPEL_EXTRA_BEFORE, y_off0 - QPEL_EXTRA_BEFORE,
+                                 pic_width, pic_height);
+        src0 = lc->edge_emu_buffer + buf_offset;
+        src0stride = edge_emu_stride;
+    }
+
+    if (x_off1 < QPEL_EXTRA_BEFORE || y_off1 < QPEL_EXTRA_AFTER ||
+        x_off1 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+        y_off1 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
+        const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        int offset     = QPEL_EXTRA_BEFORE * src1stride       + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift);
+        int buf_offset = QPEL_EXTRA_BEFORE * edge_emu_stride + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift);
+
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer2, src1 - offset,
+                                 edge_emu_stride, src1stride,
+                                 block_w + QPEL_EXTRA,
+                                 block_h + QPEL_EXTRA,
+                                 x_off1 - QPEL_EXTRA_BEFORE, y_off1 - QPEL_EXTRA_BEFORE,
+                                 pic_width, pic_height);
+        src1 = lc->edge_emu_buffer2 + buf_offset;
+        src1stride = edge_emu_stride;
+    }
+
+    s->hevcdsp.put_hevc_qpel[idx][!!my0][!!mx0](lc->tmp, src0, src0stride,
+                                                block_h, mx0, my0, block_w);
+    if (!weight_flag)
+        s->hevcdsp.put_hevc_qpel_bi[idx][!!my1][!!mx1](dst, dststride, src1, src1stride, lc->tmp,
+                                                       block_h, mx1, my1, block_w);
+    else
+        s->hevcdsp.put_hevc_qpel_bi_w[idx][!!my1][!!mx1](dst, dststride, src1, src1stride, lc->tmp,
+                                                         block_h, s->sh.luma_log2_weight_denom,
+                                                         s->sh.luma_weight_l0[current_mv->ref_idx[0]],
+                                                         s->sh.luma_weight_l1[current_mv->ref_idx[1]],
+                                                         s->sh.luma_offset_l0[current_mv->ref_idx[0]],
+                                                         s->sh.luma_offset_l1[current_mv->ref_idx[1]],
+                                                         mx1, my1, block_w);
+
 }
 
 /**
- * 8.5.3.2.2.2 Chroma sample interpolation process
+ * 8.5.3.2.2.2 Chroma sample uniprediction interpolation process
  *
  * @param s HEVC decoding context
  * @param dst1 target buffer for block data at block position (U plane)
@@ -1530,88 +1467,184 @@ static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride,
  * @param y_off vertical position of block from origin (0, 0)
  * @param block_w width of block
  * @param block_h height of block
+ * @param chroma_weight weighting factor applied to the chroma prediction
+ * @param chroma_offset additive offset applied to the chroma prediction value
  */
-static void chroma_mc(HEVCContext *s, int16_t *dst1, int16_t *dst2,
-                      ptrdiff_t dststride, AVFrame *ref, const Mv *mv,
-                      int x_off, int y_off, int block_w, int block_h)
+
+static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
-    uint8_t *src1        = ref->data[1];
-    uint8_t *src2        = ref->data[2];
-    ptrdiff_t src1stride = ref->linesize[1];
-    ptrdiff_t src2stride = ref->linesize[2];
-    int pic_width        = s->ps.sps->width >> 1;
-    int pic_height       = s->ps.sps->height >> 1;
-
-    int mx = mv->x & 7;
-    int my = mv->y & 7;
-
-    x_off += mv->x >> 3;
-    y_off += mv->y >> 3;
-    src1  += y_off * src1stride + (x_off * (1 << s->ps.sps->pixel_shift));
-    src2  += y_off * src2stride + (x_off * (1 << s->ps.sps->pixel_shift));
+    HEVCLocalContext *lc = s->HEVClc;
+    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+    const Mv *mv         = &current_mv->mv[reflist];
+    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+    int idx              = ff_hevc_pel_weight[block_w];
+    int hshift           = s->ps.sps->hshift[1];
+    int vshift           = s->ps.sps->vshift[1];
+    intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+    intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+    intptr_t _mx         = mx << (1 - hshift);
+    intptr_t _my         = my << (1 - vshift);
+
+    x_off += mv->x >> (2 + hshift);
+    y_off += mv->y >> (2 + vshift);
+    src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
 
     if (x_off < EPEL_EXTRA_BEFORE || y_off < EPEL_EXTRA_AFTER ||
         x_off >= pic_width - block_w - EPEL_EXTRA_AFTER ||
         y_off >= pic_height - block_h - EPEL_EXTRA_AFTER) {
         const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        int offset0 = EPEL_EXTRA_BEFORE * (srcstride + (1 << s->ps.sps->pixel_shift));
+        int buf_offset0 = EPEL_EXTRA_BEFORE *
+                          (edge_emu_stride + (1 << s->ps.sps->pixel_shift));
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src0 - offset0,
+                                 edge_emu_stride, srcstride,
+                                 block_w + EPEL_EXTRA, block_h + EPEL_EXTRA,
+                                 x_off - EPEL_EXTRA_BEFORE,
+                                 y_off - EPEL_EXTRA_BEFORE,
+                                 pic_width, pic_height);
+
+        src0 = lc->edge_emu_buffer + buf_offset0;
+        srcstride = edge_emu_stride;
+    }
+    if (!weight_flag)
+        s->hevcdsp.put_hevc_epel_uni[idx][!!my][!!mx](dst0, dststride, src0, srcstride,
+                                                  block_h, _mx, _my, block_w);
+    else
+        s->hevcdsp.put_hevc_epel_uni_w[idx][!!my][!!mx](dst0, dststride, src0, srcstride,
+                                                        block_h, s->sh.chroma_log2_weight_denom,
+                                                        chroma_weight, chroma_offset, _mx, _my, block_w);
+}
+
+/**
+ * 8.5.3.2.2.2 Chroma sample bidirectional interpolation process
+ *
+ * @param s HEVC decoding context
+ * @param dst target buffer for block data at block position
+ * @param dststride stride of the dst buffer
+ * @param ref0 reference picture0 buffer at origin (0, 0)
+ * @param mv0 motion vector0 (relative to block position) to get pixel data from
+ * @param x_off horizontal position of block from origin (0, 0)
+ * @param y_off vertical position of block from origin (0, 0)
+ * @param block_w width of block
+ * @param block_h height of block
+ * @param ref1 reference picture1 buffer at origin (0, 0)
+ * @param mv1 motion vector1 (relative to block position) to get pixel data from
+ * @param current_mv current motion vector structure
+ * @param cidx chroma component(cb, cr)
+ */
+static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+{
+    HEVCLocalContext *lc = s->HEVClc;
+    uint8_t *src1        = ref0->data[cidx+1];
+    uint8_t *src2        = ref1->data[cidx+1];
+    ptrdiff_t src1stride = ref0->linesize[cidx+1];
+    ptrdiff_t src2stride = ref1->linesize[cidx+1];
+    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+    Mv *mv0              = &current_mv->mv[0];
+    Mv *mv1              = &current_mv->mv[1];
+    int hshift = s->ps.sps->hshift[1];
+    int vshift = s->ps.sps->vshift[1];
+
+    intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
+    intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
+    intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
+    intptr_t my1 = av_mod_uintp2(mv1->y, 2 + vshift);
+    intptr_t _mx0 = mx0 << (1 - hshift);
+    intptr_t _my0 = my0 << (1 - vshift);
+    intptr_t _mx1 = mx1 << (1 - hshift);
+    intptr_t _my1 = my1 << (1 - vshift);
+
+    int x_off0 = x_off + (mv0->x >> (2 + hshift));
+    int y_off0 = y_off + (mv0->y >> (2 + vshift));
+    int x_off1 = x_off + (mv1->x >> (2 + hshift));
+    int y_off1 = y_off + (mv1->y >> (2 + vshift));
+    int idx = ff_hevc_pel_weight[block_w];
+    src1  += y_off0 * src1stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
+    src2  += y_off1 * src2stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+
+    if (x_off0 < EPEL_EXTRA_BEFORE || y_off0 < EPEL_EXTRA_AFTER ||
+        x_off0 >= pic_width - block_w - EPEL_EXTRA_AFTER ||
+        y_off0 >= pic_height - block_h - EPEL_EXTRA_AFTER) {
+        const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
         int offset1 = EPEL_EXTRA_BEFORE * (src1stride + (1 << s->ps.sps->pixel_shift));
         int buf_offset1 = EPEL_EXTRA_BEFORE *
                           (edge_emu_stride + (1 << s->ps.sps->pixel_shift));
-        int offset2 = EPEL_EXTRA_BEFORE * (src2stride + (1 << s->ps.sps->pixel_shift));
-        int buf_offset2 = EPEL_EXTRA_BEFORE *
-                          (edge_emu_stride + (1 << s->ps.sps->pixel_shift));
 
         s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src1 - offset1,
                                  edge_emu_stride, src1stride,
                                  block_w + EPEL_EXTRA, block_h + EPEL_EXTRA,
-                                 x_off - EPEL_EXTRA_BEFORE,
-                                 y_off - EPEL_EXTRA_BEFORE,
+                                 x_off0 - EPEL_EXTRA_BEFORE,
+                                 y_off0 - EPEL_EXTRA_BEFORE,
                                  pic_width, pic_height);
 
         src1 = lc->edge_emu_buffer + buf_offset1;
         src1stride = edge_emu_stride;
-        s->hevcdsp.put_hevc_epel[!!my][!!mx](dst1, dststride, src1, src1stride,
-                                             block_w, block_h, mx, my, lc->mc_buffer);
+    }
 
-        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src2 - offset2,
+    if (x_off1 < EPEL_EXTRA_BEFORE || y_off1 < EPEL_EXTRA_AFTER ||
+        x_off1 >= pic_width - block_w - EPEL_EXTRA_AFTER ||
+        y_off1 >= pic_height - block_h - EPEL_EXTRA_AFTER) {
+        const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        int offset1 = EPEL_EXTRA_BEFORE * (src2stride + (1 << s->ps.sps->pixel_shift));
+        int buf_offset1 = EPEL_EXTRA_BEFORE *
+                          (edge_emu_stride + (1 << s->ps.sps->pixel_shift));
+
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer2, src2 - offset1,
                                  edge_emu_stride, src2stride,
                                  block_w + EPEL_EXTRA, block_h + EPEL_EXTRA,
-                                 x_off - EPEL_EXTRA_BEFORE,
-                                 y_off - EPEL_EXTRA_BEFORE,
+                                 x_off1 - EPEL_EXTRA_BEFORE,
+                                 y_off1 - EPEL_EXTRA_BEFORE,
                                  pic_width, pic_height);
-        src2 = lc->edge_emu_buffer + buf_offset2;
-        src2stride = edge_emu_stride;
 
-        s->hevcdsp.put_hevc_epel[!!my][!!mx](dst2, dststride, src2, src2stride,
-                                             block_w, block_h, mx, my,
-                                             lc->mc_buffer);
-    } else {
-        s->hevcdsp.put_hevc_epel[!!my][!!mx](dst1, dststride, src1, src1stride,
-                                             block_w, block_h, mx, my,
-                                             lc->mc_buffer);
-        s->hevcdsp.put_hevc_epel[!!my][!!mx](dst2, dststride, src2, src2stride,
-                                             block_w, block_h, mx, my,
-                                             lc->mc_buffer);
+        src2 = lc->edge_emu_buffer2 + buf_offset1;
+        src2stride = edge_emu_stride;
     }
+
+    s->hevcdsp.put_hevc_epel[idx][!!my0][!!mx0](lc->tmp, src1, src1stride,
+                                                block_h, _mx0, _my0, block_w);
+    if (!weight_flag)
+        s->hevcdsp.put_hevc_epel_bi[idx][!!my1][!!mx1](dst0, s->frame->linesize[cidx+1],
+                                                       src2, src2stride, lc->tmp,
+                                                       block_h, _mx1, _my1, block_w);
+    else
+        s->hevcdsp.put_hevc_epel_bi_w[idx][!!my1][!!mx1](dst0, s->frame->linesize[cidx+1],
+                                                         src2, src2stride, lc->tmp,
+                                                         block_h,
+                                                         s->sh.chroma_log2_weight_denom,
+                                                         s->sh.chroma_weight_l0[current_mv->ref_idx[0]][cidx],
+                                                         s->sh.chroma_weight_l1[current_mv->ref_idx[1]][cidx],
+                                                         s->sh.chroma_offset_l0[current_mv->ref_idx[0]][cidx],
+                                                         s->sh.chroma_offset_l1[current_mv->ref_idx[1]][cidx],
+                                                         _mx1, _my1, block_w);
 }
 
 static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref,
                                 const Mv *mv, int y0, int height)
 {
-    int y = (mv->y >> 2) + y0 + height + 9;
-    ff_thread_await_progress(&ref->tf, y, 0);
+    int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9);
+
+    if (s->threads_type == FF_THREAD_FRAME )
+        ff_thread_await_progress(&ref->tf, y, 0);
 }
 
-static void hevc_luma_mv_mpv_mode(HEVCContext *s, int x0, int y0, int nPbW,
+static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
                                   int nPbH, int log2_cb_size, int part_idx,
                                   int merge_idx, MvField *mv)
 {
-    HEVCLocalContext *lc             = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     enum InterPredIdc inter_pred_idc = PRED_L0;
     int mvp_flag;
 
     ff_hevc_set_neighbour_available(s, x0, y0, nPbW, nPbH);
+    mv->pred_flag = 0;
     if (s->sh.slice_type == B_SLICE)
         inter_pred_idc = ff_hevc_inter_pred_idc_decode(s, nPbW, nPbH);
 
@@ -1619,8 +1652,8 @@ static void hevc_luma_mv_mpv_mode(HEVCContext *s, int x0, int y0, int nPbW,
         if (s->sh.nb_refs[L0])
             mv->ref_idx[0]= ff_hevc_ref_idx_lx_decode(s, s->sh.nb_refs[L0]);
 
-        mv->pred_flag[0] = 1;
-        hls_mvd_coding(s, x0, y0, 0);
+        mv->pred_flag = PF_L0;
+        ff_hevc_hls_mvd_coding(s, x0, y0, 0);
         mvp_flag = ff_hevc_mvp_lx_flag_decode(s);
         ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
                                  part_idx, merge_idx, mv, mvp_flag, 0);
@@ -1635,10 +1668,10 @@ static void hevc_luma_mv_mpv_mode(HEVCContext *s, int x0, int y0, int nPbW,
         if (s->sh.mvd_l1_zero_flag == 1 && inter_pred_idc == PRED_BI) {
             AV_ZERO32(&lc->pu.mvd);
         } else {
-            hls_mvd_coding(s, x0, y0, 1);
+            ff_hevc_hls_mvd_coding(s, x0, y0, 1);
         }
 
-        mv->pred_flag[1] = 1;
+        mv->pred_flag += PF_L1;
         mvp_flag = ff_hevc_mvp_lx_flag_decode(s);
         ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
                                  part_idx, merge_idx, mv, mvp_flag, 1);
@@ -1649,12 +1682,12 @@ static void hevc_luma_mv_mpv_mode(HEVCContext *s, int x0, int y0, int nPbW,
 
 static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
                                 int nPbW, int nPbH,
-                                int log2_cb_size, int partIdx)
+                                int log2_cb_size, int partIdx, int idx)
 {
 #define POS(c_idx, x, y)                                                              \
     &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
                            (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     int merge_idx = 0;
     struct MvField current_mv = {{{ 0 }}};
 
@@ -1662,10 +1695,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
 
     MvField *tab_mvf = s->ref->tab_mvf;
     RefPicList  *refPicList = s->ref->refPicList;
-    HEVCFrame *ref0, *ref1;
-
-    int tmpstride = MAX_PB_SIZE;
-
+    HEVCFrame *ref0 = NULL, *ref1 = NULL;
     uint8_t *dst0 = POS(0, x0, y0);
     uint8_t *dst1 = POS(1, x0, y0);
     uint8_t *dst2 = POS(2, x0, y0);
@@ -1690,7 +1720,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
         ff_hevc_luma_mv_merge_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
                                    partIdx, merge_idx, &current_mv);
     } else {
-        hevc_luma_mv_mpv_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
+        hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
                               partIdx, merge_idx, &current_mv);
     }
 
@@ -1701,139 +1731,74 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
         for (i = 0; i < nPbW >> s->ps.sps->log2_min_pu_size; i++)
             tab_mvf[(y_pu + j) * min_pu_width + x_pu + i] = current_mv;
 
-    if (current_mv.pred_flag[0]) {
+    if (current_mv.pred_flag & PF_L0) {
         ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
         if (!ref0)
             return;
         hevc_await_progress(s, ref0, &current_mv.mv[0], y0, nPbH);
     }
-    if (current_mv.pred_flag[1]) {
+    if (current_mv.pred_flag & PF_L1) {
         ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
         if (!ref1)
             return;
         hevc_await_progress(s, ref1, &current_mv.mv[1], y0, nPbH);
     }
 
-    if (current_mv.pred_flag[0] && !current_mv.pred_flag[1]) {
-        DECLARE_ALIGNED(16, int16_t,  tmp[MAX_PB_SIZE * MAX_PB_SIZE]);
-        DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]);
+    if (current_mv.pred_flag == PF_L0) {
+        int x0_c = x0 >> s->ps.sps->hshift[1];
+        int y0_c = y0 >> s->ps.sps->vshift[1];
+        int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+        int nPbH_c = nPbH >> s->ps.sps->vshift[1];
 
-        luma_mc(s, tmp, tmpstride, ref0->frame,
-                &current_mv.mv[0], x0, y0, nPbW, nPbH);
+        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
+                    s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+                    s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
 
-        if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
-            s->hevcdsp.weighted_pred(s->sh.luma_log2_weight_denom,
-                                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
-                                     s->sh.luma_offset_l0[current_mv.ref_idx[0]],
-                                     dst0, s->frame->linesize[0], tmp,
-                                     tmpstride, nPbW, nPbH);
-        } else {
-            s->hevcdsp.put_unweighted_pred(dst0, s->frame->linesize[0], tmp, tmpstride, nPbW, nPbH);
-        }
-        chroma_mc(s, tmp, tmp2, tmpstride, ref0->frame,
-                  &current_mv.mv[0], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2);
-
-        if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
-            s->hevcdsp.weighted_pred(s->sh.chroma_log2_weight_denom,
-                                     s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0],
-                                     s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0],
-                                     dst1, s->frame->linesize[1], tmp, tmpstride,
-                                     nPbW / 2, nPbH / 2);
-            s->hevcdsp.weighted_pred(s->sh.chroma_log2_weight_denom,
-                                     s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1],
-                                     s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1],
-                                     dst2, s->frame->linesize[2], tmp2, tmpstride,
-                                     nPbW / 2, nPbH / 2);
-        } else {
-            s->hevcdsp.put_unweighted_pred(dst1, s->frame->linesize[1], tmp, tmpstride, nPbW/2, nPbH/2);
-            s->hevcdsp.put_unweighted_pred(dst2, s->frame->linesize[2], tmp2, tmpstride, nPbW/2, nPbH/2);
-        }
-    } else if (!current_mv.pred_flag[0] && current_mv.pred_flag[1]) {
-        DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
-        DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]);
-
-        luma_mc(s, tmp, tmpstride, ref1->frame,
-                &current_mv.mv[1], x0, y0, nPbW, nPbH);
-
-        if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
-            s->hevcdsp.weighted_pred(s->sh.luma_log2_weight_denom,
-                                      s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-                                      s->sh.luma_offset_l1[current_mv.ref_idx[1]],
-                                      dst0, s->frame->linesize[0], tmp, tmpstride,
-                                      nPbW, nPbH);
-        } else {
-            s->hevcdsp.put_unweighted_pred(dst0, s->frame->linesize[0], tmp, tmpstride, nPbW, nPbH);
-        }
-
-        chroma_mc(s, tmp, tmp2, tmpstride, ref1->frame,
-                  &current_mv.mv[1], x0/2, y0/2, nPbW/2, nPbH/2);
-
-        if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
-            s->hevcdsp.weighted_pred(s->sh.chroma_log2_weight_denom,
-                                     s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0],
-                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0],
-                                     dst1, s->frame->linesize[1], tmp, tmpstride, nPbW/2, nPbH/2);
-            s->hevcdsp.weighted_pred(s->sh.chroma_log2_weight_denom,
-                                     s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1],
-                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1],
-                                     dst2, s->frame->linesize[2], tmp2, tmpstride, nPbW/2, nPbH/2);
-        } else {
-            s->hevcdsp.put_unweighted_pred(dst1, s->frame->linesize[1], tmp, tmpstride, nPbW/2, nPbH/2);
-            s->hevcdsp.put_unweighted_pred(dst2, s->frame->linesize[2], tmp2, tmpstride, nPbW/2, nPbH/2);
-        }
-    } else if (current_mv.pred_flag[0] && current_mv.pred_flag[1]) {
-        DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
-        DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]);
-        DECLARE_ALIGNED(16, int16_t, tmp3[MAX_PB_SIZE * MAX_PB_SIZE]);
-        DECLARE_ALIGNED(16, int16_t, tmp4[MAX_PB_SIZE * MAX_PB_SIZE]);
-
-        luma_mc(s, tmp, tmpstride, ref0->frame,
-                &current_mv.mv[0], x0, y0, nPbW, nPbH);
-        luma_mc(s, tmp2, tmpstride, ref1->frame,
-                &current_mv.mv[1], x0, y0, nPbW, nPbH);
-
-        if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
-            s->hevcdsp.weighted_pred_avg(s->sh.luma_log2_weight_denom,
-                                         s->sh.luma_weight_l0[current_mv.ref_idx[0]],
-                                         s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-                                         s->sh.luma_offset_l0[current_mv.ref_idx[0]],
-                                         s->sh.luma_offset_l1[current_mv.ref_idx[1]],
-                                         dst0, s->frame->linesize[0],
-                                         tmp, tmp2, tmpstride, nPbW, nPbH);
-        } else {
-            s->hevcdsp.put_unweighted_pred_avg(dst0, s->frame->linesize[0],
-                                               tmp, tmp2, tmpstride, nPbW, nPbH);
-        }
-
-        chroma_mc(s, tmp, tmp2, tmpstride, ref0->frame,
-                  &current_mv.mv[0], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2);
-        chroma_mc(s, tmp3, tmp4, tmpstride, ref1->frame,
-                  &current_mv.mv[1], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2);
-
-        if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
-            s->hevcdsp.weighted_pred_avg(s->sh.chroma_log2_weight_denom,
-                                         s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0],
-                                         s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0],
-                                         s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0],
-                                         s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0],
-                                         dst1, s->frame->linesize[1], tmp, tmp3,
-                                         tmpstride, nPbW / 2, nPbH / 2);
-            s->hevcdsp.weighted_pred_avg(s->sh.chroma_log2_weight_denom,
-                                         s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1],
-                                         s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1],
-                                         s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1],
-                                         s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1],
-                                         dst2, s->frame->linesize[2], tmp2, tmp4,
-                                         tmpstride, nPbW / 2, nPbH / 2);
-        } else {
-            s->hevcdsp.put_unweighted_pred_avg(dst1, s->frame->linesize[1], tmp, tmp3, tmpstride, nPbW/2, nPbH/2);
-            s->hevcdsp.put_unweighted_pred_avg(dst2, s->frame->linesize[2], tmp2, tmp4, tmpstride, nPbW/2, nPbH/2);
+        if (s->ps.sps->chroma_format_idc) {
+            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+                          0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                          s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
+            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+                          0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                          s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
+        }
+    } else if (current_mv.pred_flag == PF_L1) {
+        int x0_c = x0 >> s->ps.sps->hshift[1];
+        int y0_c = y0 >> s->ps.sps->vshift[1];
+        int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+        int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+
+        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+                    &current_mv.mv[1], x0, y0, nPbW, nPbH,
+                    s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+                    s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+
+        if (s->ps.sps->chroma_format_idc) {
+            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+                          1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                          s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
+
+            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+                          1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                          s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
+        }
+    } else if (current_mv.pred_flag == PF_BI) {
+        int x0_c = x0 >> s->ps.sps->hshift[1];
+        int y0_c = y0 >> s->ps.sps->vshift[1];
+        int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+        int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+
+        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+                   &current_mv.mv[0], x0, y0, nPbW, nPbH,
+                   ref1->frame, &current_mv.mv[1], &current_mv);
+
+        if (s->ps.sps->chroma_format_idc) {
+            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+                         x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
+
+            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+                         x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
         }
     }
 }
@@ -1844,13 +1809,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
 static int luma_intra_pred_mode(HEVCContext *s, int x0, int y0, int pu_size,
                                 int prev_intra_luma_pred_flag)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     int x_pu             = x0 >> s->ps.sps->log2_min_pu_size;
     int y_pu             = y0 >> s->ps.sps->log2_min_pu_size;
     int min_pu_width     = s->ps.sps->min_pu_width;
     int size_in_pus      = pu_size >> s->ps.sps->log2_min_pu_size;
-    int x0b              = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b              = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
+    int x0b              = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+    int y0b              = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
 
     int cand_up   = (lc->ctb_up_flag || y0b) ?
                     s->tab_ipm[(y_pu - 1) * min_pu_width + x_pu] : INTRA_DC;
@@ -1914,15 +1879,7 @@ static int luma_intra_pred_mode(HEVCContext *s, int x0, int y0, int pu_size,
                intra_pred_mode, size_in_pus);
 
         for (j = 0; j < size_in_pus; j++) {
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].is_intra     = 1;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].pred_flag[0] = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].pred_flag[1] = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].ref_idx[0]   = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].ref_idx[1]   = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].mv[0].x      = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].mv[0].y      = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].mv[1].x      = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].mv[1].y      = 0;
+            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].pred_flag = PF_INTRA;
         }
     }
 
@@ -1942,10 +1899,14 @@ static av_always_inline void set_ct_depth(HEVCContext *s, int x0, int y0,
                ct_depth, length);
 }
 
+static const uint8_t tab_mode_idx[] = {
+     0,  1,  2,  2,  2,  2,  3,  5,  7,  8, 10, 12, 13, 15, 17, 18, 19, 20,
+    21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31};
+
 static void intra_prediction_unit(HEVCContext *s, int x0, int y0,
                                   int log2_cb_size)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 };
     uint8_t prev_intra_luma_pred_flag[4];
     int split   = lc->cu.part_mode == PART_NxN;
@@ -1971,14 +1932,42 @@ static void intra_prediction_unit(HEVCContext *s, int x0, int y0,
         }
     }
 
-    chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
-    if (chroma_mode != 4) {
-        if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
-            lc->pu.intra_pred_mode_c = 34;
-        else
-            lc->pu.intra_pred_mode_c = intra_chroma_table[chroma_mode];
-    } else {
-        lc->pu.intra_pred_mode_c = lc->pu.intra_pred_mode[0];
+    if (s->ps.sps->chroma_format_idc == 3) {
+        for (i = 0; i < side; i++) {
+            for (j = 0; j < side; j++) {
+                lc->pu.chroma_mode_c[2 * i + j] = chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
+                if (chroma_mode != 4) {
+                    if (lc->pu.intra_pred_mode[2 * i + j] == intra_chroma_table[chroma_mode])
+                        lc->pu.intra_pred_mode_c[2 * i + j] = 34;
+                    else
+                        lc->pu.intra_pred_mode_c[2 * i + j] = intra_chroma_table[chroma_mode];
+                } else {
+                    lc->pu.intra_pred_mode_c[2 * i + j] = lc->pu.intra_pred_mode[2 * i + j];
+                }
+            }
+        }
+    } else if (s->ps.sps->chroma_format_idc == 2) {
+        int mode_idx;
+        lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
+        if (chroma_mode != 4) {
+            if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
+                mode_idx = 34;
+            else
+                mode_idx = intra_chroma_table[chroma_mode];
+        } else {
+            mode_idx = lc->pu.intra_pred_mode[0];
+        }
+        lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx];
+    } else if (s->ps.sps->chroma_format_idc != 0) {
+        chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
+        if (chroma_mode != 4) {
+            if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
+                lc->pu.intra_pred_mode_c[0] = 34;
+            else
+                lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode];
+        } else {
+            lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0];
+        }
     }
 }
 
@@ -1986,7 +1975,7 @@ static void intra_prediction_unit_default_value(HEVCContext *s,
                                                 int x0, int y0,
                                                 int log2_cb_size)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     int pb_size          = 1 << log2_cb_size;
     int size_in_pus      = pb_size >> s->ps.sps->log2_min_pu_size;
     int min_pu_width     = s->ps.sps->min_pu_width;
@@ -1997,22 +1986,25 @@ static void intra_prediction_unit_default_value(HEVCContext *s,
 
     if (size_in_pus == 0)
         size_in_pus = 1;
-    for (j = 0; j < size_in_pus; j++) {
+    for (j = 0; j < size_in_pus; j++)
         memset(&s->tab_ipm[(y_pu + j) * min_pu_width + x_pu], INTRA_DC, size_in_pus);
-        for (k = 0; k < size_in_pus; k++)
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + k].is_intra = lc->cu.pred_mode == MODE_INTRA;
-    }
+    if (lc->cu.pred_mode == MODE_INTRA)
+        for (j = 0; j < size_in_pus; j++)
+            for (k = 0; k < size_in_pus; k++)
+                tab_mvf[(y_pu + j) * min_pu_width + x_pu + k].pred_flag = PF_INTRA;
 }
 
 static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
 {
     int cb_size          = 1 << log2_cb_size;
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
     int length           = cb_size >> log2_min_cb_size;
     int min_cb_width     = s->ps.sps->min_cb_width;
     int x_cb             = x0 >> log2_min_cb_size;
     int y_cb             = y0 >> log2_min_cb_size;
+    int idx              = log2_cb_size - 2;
+    int qp_block_mask    = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1;
     int x, y, ret;
 
     lc->cu.x                = x0;
@@ -2040,10 +2032,16 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
             x += min_cb_width;
         }
         lc->cu.pred_mode = skip_flag ? MODE_SKIP : MODE_INTER;
+    } else {
+        x = y_cb * min_cb_width + x_cb;
+        for (y = 0; y < length; y++) {
+            memset(&s->skip_flag[x], 0, length);
+            x += min_cb_width;
+        }
     }
 
     if (SAMPLE_CTB(s->skip_flag, x_cb, y_cb)) {
-        hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0);
+        hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
         intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
 
         if (!s->sh.disable_deblocking_filter_flag)
@@ -2081,37 +2079,37 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
             intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
             switch (lc->cu.part_mode) {
             case PART_2Nx2N:
-                hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0);
+                hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
                 break;
             case PART_2NxN:
-                hls_prediction_unit(s, x0, y0,               cb_size, cb_size / 2, log2_cb_size, 0);
-                hls_prediction_unit(s, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1);
+                hls_prediction_unit(s, x0, y0,               cb_size, cb_size / 2, log2_cb_size, 0, idx);
+                hls_prediction_unit(s, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx);
                 break;
             case PART_Nx2N:
-                hls_prediction_unit(s, x0,               y0, cb_size / 2, cb_size, log2_cb_size, 0);
-                hls_prediction_unit(s, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1);
+                hls_prediction_unit(s, x0,               y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1);
+                hls_prediction_unit(s, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1);
                 break;
             case PART_2NxnU:
-                hls_prediction_unit(s, x0, y0,               cb_size, cb_size     / 4, log2_cb_size, 0);
-                hls_prediction_unit(s, x0, y0 + cb_size / 4, cb_size, cb_size * 3 / 4, log2_cb_size, 1);
+                hls_prediction_unit(s, x0, y0,               cb_size, cb_size     / 4, log2_cb_size, 0, idx);
+                hls_prediction_unit(s, x0, y0 + cb_size / 4, cb_size, cb_size * 3 / 4, log2_cb_size, 1, idx);
                 break;
             case PART_2NxnD:
-                hls_prediction_unit(s, x0, y0,                   cb_size, cb_size * 3 / 4, log2_cb_size, 0);
-                hls_prediction_unit(s, x0, y0 + cb_size * 3 / 4, cb_size, cb_size     / 4, log2_cb_size, 1);
+                hls_prediction_unit(s, x0, y0,                   cb_size, cb_size * 3 / 4, log2_cb_size, 0, idx);
+                hls_prediction_unit(s, x0, y0 + cb_size * 3 / 4, cb_size, cb_size     / 4, log2_cb_size, 1, idx);
                 break;
             case PART_nLx2N:
-                hls_prediction_unit(s, x0,               y0, cb_size     / 4, cb_size, log2_cb_size, 0);
-                hls_prediction_unit(s, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1);
+                hls_prediction_unit(s, x0,               y0, cb_size     / 4, cb_size, log2_cb_size, 0, idx - 2);
+                hls_prediction_unit(s, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
                 break;
             case PART_nRx2N:
-                hls_prediction_unit(s, x0,                   y0, cb_size * 3 / 4, cb_size, log2_cb_size, 0);
-                hls_prediction_unit(s, x0 + cb_size * 3 / 4, y0, cb_size     / 4, cb_size, log2_cb_size, 1);
+                hls_prediction_unit(s, x0,                   y0, cb_size * 3 / 4, cb_size, log2_cb_size, 0, idx - 2);
+                hls_prediction_unit(s, x0 + cb_size * 3 / 4, y0, cb_size     / 4, cb_size, log2_cb_size, 1, idx - 2);
                 break;
             case PART_NxN:
-                hls_prediction_unit(s, x0,               y0,               cb_size / 2, cb_size / 2, log2_cb_size, 0);
-                hls_prediction_unit(s, x0 + cb_size / 2, y0,               cb_size / 2, cb_size / 2, log2_cb_size, 1);
-                hls_prediction_unit(s, x0,               y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2);
-                hls_prediction_unit(s, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3);
+                hls_prediction_unit(s, x0,               y0,               cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
+                hls_prediction_unit(s, x0 + cb_size / 2, y0,               cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
+                hls_prediction_unit(s, x0,               y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
+                hls_prediction_unit(s, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
                 break;
             }
         }
@@ -2124,12 +2122,13 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
                 rqt_root_cbf = ff_hevc_no_residual_syntax_flag_decode(s);
             }
             if (rqt_root_cbf) {
+                const static int cbf[2] = { 0 };
                 lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
                                          s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
                                          s->ps.sps->max_transform_hierarchy_depth_inter;
                 ret = hls_transform_tree(s, x0, y0, x0, y0, x0, y0,
                                          log2_cb_size,
-                                         log2_cb_size, 0, 0, 0, 0);
+                                         log2_cb_size, 0, 0, cbf, cbf);
                 if (ret < 0)
                     return ret;
             } else {
@@ -2140,7 +2139,7 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
     }
 
     if (s->ps.pps->cu_qp_delta_enabled_flag && lc->tu.is_cu_qp_delta_coded == 0)
-        ff_hevc_set_qPy(s, x0, y0, x0, y0, log2_cb_size);
+        ff_hevc_set_qPy(s, x0, y0, log2_cb_size);
 
     x = y_cb * min_cb_width + x_cb;
     for (y = 0; y < length; y++) {
@@ -2148,7 +2147,12 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
         x += min_cb_width;
     }
 
-    set_ct_depth(s, x0, y0, log2_cb_size, lc->ct.depth);
+    if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
+       ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) {
+        lc->qPy_pred = lc->qp_y;
+    }
+
+    set_ct_depth(s, x0, y0, log2_cb_size, lc->ct_depth);
 
     return 0;
 }
@@ -2156,11 +2160,12 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
 static int hls_coding_quadtree(HEVCContext *s, int x0, int y0,
                                int log2_cb_size, int cb_depth)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     const int cb_size    = 1 << log2_cb_size;
+    int ret;
     int split_cu;
 
-    lc->ct.depth = cb_depth;
+    lc->ct_depth = cb_depth;
     if (x0 + cb_size <= s->ps.sps->width  &&
         y0 + cb_size <= s->ps.sps->height &&
         log2_cb_size > s->ps.sps->log2_min_cb_size) {
@@ -2174,31 +2179,64 @@ static int hls_coding_quadtree(HEVCContext *s, int x0, int y0,
         lc->tu.cu_qp_delta          = 0;
     }
 
+    if (s->sh.cu_chroma_qp_offset_enabled_flag &&
+        log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_chroma_qp_offset_depth) {
+        lc->tu.is_cu_chroma_qp_offset_coded = 0;
+    }
+
     if (split_cu) {
+        int qp_block_mask = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1;
         const int cb_size_split = cb_size >> 1;
         const int x1 = x0 + cb_size_split;
         const int y1 = y0 + cb_size_split;
 
-        log2_cb_size--;
-        cb_depth++;
+        int more_data = 0;
 
-#define SUBDIVIDE(x, y)                                                \
-do {                                                                   \
-    if (x < s->ps.sps->width && y < s->ps.sps->height) {                     \
-        int ret = hls_coding_quadtree(s, x, y, log2_cb_size, cb_depth);\
-        if (ret < 0)                                                   \
-            return ret;                                                \
-    }                                                                  \
-} while (0)
+        more_data = hls_coding_quadtree(s, x0, y0, log2_cb_size - 1, cb_depth + 1);
+        if (more_data < 0)
+            return more_data;
+
+        if (more_data && x1 < s->ps.sps->width) {
+            more_data = hls_coding_quadtree(s, x1, y0, log2_cb_size - 1, cb_depth + 1);
+            if (more_data < 0)
+                return more_data;
+        }
+        if (more_data && y1 < s->ps.sps->height) {
+            more_data = hls_coding_quadtree(s, x0, y1, log2_cb_size - 1, cb_depth + 1);
+            if (more_data < 0)
+                return more_data;
+        }
+        if (more_data && x1 < s->ps.sps->width &&
+            y1 < s->ps.sps->height) {
+            more_data = hls_coding_quadtree(s, x1, y1, log2_cb_size - 1, cb_depth + 1);
+            if (more_data < 0)
+                return more_data;
+        }
 
-        SUBDIVIDE(x0, y0);
-        SUBDIVIDE(x1, y0);
-        SUBDIVIDE(x0, y1);
-        SUBDIVIDE(x1, y1);
+        if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
+            ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0)
+            lc->qPy_pred = lc->qp_y;
+
+        if (more_data)
+            return ((x1 + cb_size_split) < s->ps.sps->width ||
+                    (y1 + cb_size_split) < s->ps.sps->height);
+        else
+            return 0;
     } else {
-        int ret = hls_coding_unit(s, x0, y0, log2_cb_size);
+        ret = hls_coding_unit(s, x0, y0, log2_cb_size);
         if (ret < 0)
             return ret;
+        if ((!((x0 + cb_size) %
+               (1 << (s->ps.sps->log2_ctb_size))) ||
+             (x0 + cb_size >= s->ps.sps->width)) &&
+            (!((y0 + cb_size) %
+               (1 << (s->ps.sps->log2_ctb_size))) ||
+             (y0 + cb_size >= s->ps.sps->height))) {
+            int end_of_slice_flag = ff_hevc_end_of_slice_flag_decode(s);
+            return !end_of_slice_flag;
+        } else {
+            return 1;
+        }
     }
 
     return 0;
@@ -2207,7 +2245,7 @@ do {                                                                   \
 static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
                                  int ctb_addr_ts)
 {
-    HEVCLocalContext *lc  = &s->HEVClc;
+    HEVCLocalContext *lc  = s->HEVClc;
     int ctb_size          = 1 << s->ps.sps->log2_ctb_size;
     int ctb_addr_rs       = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
     int ctb_addr_in_slice = ctb_addr_rs - s->sh.slice_addr;
@@ -2221,7 +2259,6 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
     } else if (s->ps.pps->tiles_enabled_flag) {
         if (ctb_addr_ts && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1]) {
             int idxX = s->ps.pps->col_idxX[x_ctb >> s->ps.sps->log2_ctb_size];
-            lc->start_of_tiles_x = x_ctb;
             lc->end_of_tiles_x   = x_ctb + (s->ps.pps->column_width[idxX] << s->ps.sps->log2_ctb_size);
             lc->first_qp_group   = 1;
         }
@@ -2242,7 +2279,7 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
         if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - s->ps.sps->ctb_width])
             lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
     } else {
-        if (!ctb_addr_in_slice > 0)
+        if (ctb_addr_in_slice <= 0)
             lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
         if (ctb_addr_in_slice < s->ps.sps->ctb_width)
             lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
@@ -2254,14 +2291,27 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
 }
 
-static int hls_slice_data(HEVCContext *s)
+static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
 {
+    HEVCContext *s  = avctxt->priv_data;
     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
     int more_data   = 1;
     int x_ctb       = 0;
     int y_ctb       = 0;
     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-    int ret;
+
+    if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+        av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->sh.dependent_slice_segment_flag) {
+        int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
+        if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
+            av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
 
     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
@@ -2278,10 +2328,12 @@ static int hls_slice_data(HEVCContext *s)
         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
 
-        ret = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-        if (ret < 0)
-            return ret;
-        more_data = !ff_hevc_end_of_slice_flag_decode(s);
+        more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+        if (more_data < 0) {
+            s->tab_slice_address[ctb_addr_rs] = -1;
+            return more_data;
+        }
+
 
         ctb_addr_ts++;
         ff_hevc_save_states(s, ctb_addr_ts);
@@ -2290,36 +2342,182 @@ static int hls_slice_data(HEVCContext *s)
 
     if (x_ctb + ctb_size >= s->ps.sps->width &&
         y_ctb + ctb_size >= s->ps.sps->height)
-        ff_hevc_hls_filter(s, x_ctb, y_ctb);
+        ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
 
     return ctb_addr_ts;
 }
 
-static void restore_tqb_pixels(HEVCContext *s)
+static int hls_slice_data(HEVCContext *s)
 {
-    int min_pu_size = 1 << s->ps.sps->log2_min_pu_size;
-    int x, y, c_idx;
-
-    for (c_idx = 0; c_idx < 3; c_idx++) {
-        ptrdiff_t stride = s->frame->linesize[c_idx];
-        int hshift       = s->ps.sps->hshift[c_idx];
-        int vshift       = s->ps.sps->vshift[c_idx];
-        for (y = 0; y < s->ps.sps->min_pu_height; y++) {
-            for (x = 0; x < s->ps.sps->min_pu_width; x++) {
-                if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) {
-                    int n;
-                    int len      = min_pu_size >> hshift;
-                    uint8_t *src = &s->frame->data[c_idx][((y << s->ps.sps->log2_min_pu_size) >> vshift) * stride + (((x << s->ps.sps->log2_min_pu_size) >> hshift) << s->ps.sps->pixel_shift)];
-                    uint8_t *dst = &s->sao_frame->data[c_idx][((y << s->ps.sps->log2_min_pu_size) >> vshift) * stride + (((x << s->ps.sps->log2_min_pu_size) >> hshift) << s->ps.sps->pixel_shift)];
-                    for (n = 0; n < (min_pu_size >> vshift); n++) {
-                        memcpy(dst, src, len);
-                        src += stride;
-                        dst += stride;
-                    }
-                }
+    int arg[2];
+    int ret[2];
+
+    arg[0] = 0;
+    arg[1] = 1;
+
+    s->avctx->execute(s->avctx, hls_decode_entry, arg, ret , 1, sizeof(int));
+    return ret[0];
+}
+static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int job, int self_id)
+{
+    HEVCContext *s1  = avctxt->priv_data, *s;
+    HEVCLocalContext *lc;
+    int ctb_size    = 1<< s1->ps.sps->log2_ctb_size;
+    int more_data   = 1;
+    int *ctb_row_p    = input_ctb_row;
+    int ctb_row = ctb_row_p[job];
+    int ctb_addr_rs = s1->sh.slice_ctb_addr_rs + ctb_row * ((s1->ps.sps->width + ctb_size - 1) >> s1->ps.sps->log2_ctb_size);
+    int ctb_addr_ts = s1->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
+    int thread = ctb_row % s1->threads_number;
+    int ret;
+
+    s = s1->sList[self_id];
+    lc = s->HEVClc;
+
+    if(ctb_row) {
+        ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
+
+        if (ret < 0)
+            return ret;
+        ff_init_cabac_decoder(&lc->cc, s->data + s->sh.offset[(ctb_row)-1], s->sh.size[ctb_row - 1]);
+    }
+
+    while(more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
+        int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size;
+        int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size;
+
+        hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts);
+
+        ff_thread_await_progress2(s->avctx, ctb_row, thread, SHIFT_CTB_WPP);
+
+        if (avpriv_atomic_int_get(&s1->wpp_err)){
+            ff_thread_report_progress2(s->avctx, ctb_row , thread, SHIFT_CTB_WPP);
+            return 0;
+        }
+
+        ff_hevc_cabac_init(s, ctb_addr_ts);
+        hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size);
+        more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+
+        if (more_data < 0) {
+            s->tab_slice_address[ctb_addr_rs] = -1;
+            avpriv_atomic_int_set(&s1->wpp_err,  1);
+            ff_thread_report_progress2(s->avctx, ctb_row ,thread, SHIFT_CTB_WPP);
+            return more_data;
+        }
+
+        ctb_addr_ts++;
+
+        ff_hevc_save_states(s, ctb_addr_ts);
+        ff_thread_report_progress2(s->avctx, ctb_row, thread, 1);
+        ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
+
+        if (!more_data && (x_ctb+ctb_size) < s->ps.sps->width && ctb_row != s->sh.num_entry_point_offsets) {
+            avpriv_atomic_int_set(&s1->wpp_err,  1);
+            ff_thread_report_progress2(s->avctx, ctb_row ,thread, SHIFT_CTB_WPP);
+            return 0;
+        }
+
+        if ((x_ctb+ctb_size) >= s->ps.sps->width && (y_ctb+ctb_size) >= s->ps.sps->height ) {
+            ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
+            ff_thread_report_progress2(s->avctx, ctb_row , thread, SHIFT_CTB_WPP);
+            return ctb_addr_ts;
+        }
+        ctb_addr_rs       = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+        x_ctb+=ctb_size;
+
+        if(x_ctb >= s->ps.sps->width) {
+            break;
+        }
+    }
+    ff_thread_report_progress2(s->avctx, ctb_row ,thread, SHIFT_CTB_WPP);
+
+    return 0;
+}
+
+static int hls_slice_data_wpp(HEVCContext *s, const HEVCNAL *nal)
+{
+    const uint8_t *data = nal->data;
+    int length          = nal->size;
+    HEVCLocalContext *lc = s->HEVClc;
+    int *ret = av_malloc_array(s->sh.num_entry_point_offsets + 1, sizeof(int));
+    int *arg = av_malloc_array(s->sh.num_entry_point_offsets + 1, sizeof(int));
+    int offset;
+    int startheader, cmpt = 0;
+    int i, j, res = 0;
+
+    if (!ret || !arg) {
+        av_free(ret);
+        av_free(arg);
+        return AVERROR(ENOMEM);
+    }
+
+
+    if (!s->sList[1]) {
+        ff_alloc_entries(s->avctx, s->sh.num_entry_point_offsets + 1);
+
+
+        for (i = 1; i < s->threads_number; i++) {
+            s->sList[i] = av_malloc(sizeof(HEVCContext));
+            memcpy(s->sList[i], s, sizeof(HEVCContext));
+            s->HEVClcList[i] = av_mallocz(sizeof(HEVCLocalContext));
+            s->sList[i]->HEVClc = s->HEVClcList[i];
+        }
+    }
+
+    offset = (lc->gb.index >> 3);
+
+    for (j = 0, cmpt = 0, startheader = offset + s->sh.entry_point_offset[0]; j < nal->skipped_bytes; j++) {
+        if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
+            startheader--;
+            cmpt++;
+        }
+    }
+
+    for (i = 1; i < s->sh.num_entry_point_offsets; i++) {
+        offset += (s->sh.entry_point_offset[i - 1] - cmpt);
+        for (j = 0, cmpt = 0, startheader = offset
+             + s->sh.entry_point_offset[i]; j < nal->skipped_bytes; j++) {
+            if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
+                startheader--;
+                cmpt++;
             }
         }
+        s->sh.size[i - 1] = s->sh.entry_point_offset[i] - cmpt;
+        s->sh.offset[i - 1] = offset;
+
     }
+    if (s->sh.num_entry_point_offsets != 0) {
+        offset += s->sh.entry_point_offset[s->sh.num_entry_point_offsets - 1] - cmpt;
+        s->sh.size[s->sh.num_entry_point_offsets - 1] = length - offset;
+        s->sh.offset[s->sh.num_entry_point_offsets - 1] = offset;
+
+    }
+    s->data = data;
+
+    for (i = 1; i < s->threads_number; i++) {
+        s->sList[i]->HEVClc->first_qp_group = 1;
+        s->sList[i]->HEVClc->qp_y = s->sList[0]->HEVClc->qp_y;
+        memcpy(s->sList[i], s, sizeof(HEVCContext));
+        s->sList[i]->HEVClc = s->HEVClcList[i];
+    }
+
+    avpriv_atomic_int_set(&s->wpp_err, 0);
+    ff_reset_entries(s->avctx);
+
+    for (i = 0; i <= s->sh.num_entry_point_offsets; i++) {
+        arg[i] = i;
+        ret[i] = 0;
+    }
+
+    if (s->ps.pps->entropy_coding_sync_enabled_flag)
+        s->avctx->execute2(s->avctx, (void *) hls_decode_entry_wpp, arg, ret, s->sh.num_entry_point_offsets + 1);
+
+    for (i = 0; i <= s->sh.num_entry_point_offsets; i++)
+        res += ret[i];
+    av_free(ret);
+    av_free(arg);
+    return res;
 }
 
 static int set_side_data(HEVCContext *s)
@@ -2373,23 +2571,24 @@ static int set_side_data(HEVCContext *s)
 
 static int hevc_frame_start(HEVCContext *s)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
+    int pic_size_in_ctb  = ((s->ps.sps->width  >> s->ps.sps->log2_min_cb_size) + 1) *
+                           ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
     int ret;
 
-    memset(s->horizontal_bs, 0, 2 * s->bs_width * (s->bs_height + 1));
-    memset(s->vertical_bs,   0, 2 * s->bs_width * (s->bs_height + 1));
+    memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
+    memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
     memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
-    memset(s->is_pcm,        0, s->ps.sps->min_pu_width * s->ps.sps->min_pu_height);
+    memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
+    memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
 
-    lc->start_of_tiles_x = 0;
     s->is_decoded        = 0;
     s->first_nal_type    = s->nal_unit_type;
 
     if (s->ps.pps->tiles_enabled_flag)
         lc->end_of_tiles_x = s->ps.pps->column_width[0] << s->ps.sps->log2_ctb_size;
 
-    ret = ff_hevc_set_new_ref(s, s->ps.sps->sao_enabled ? &s->sao_frame : &s->frame,
-                              s->poc);
+    ret = ff_hevc_set_new_ref(s, &s->frame, s->poc);
     if (ret < 0)
         goto fail;
 
@@ -2405,12 +2604,18 @@ static int hevc_frame_start(HEVCContext *s)
     if (ret < 0)
         goto fail;
 
+    s->frame->pict_type = 3 - s->sh.slice_type;
+
+    if (!IS_IRAP(s))
+        ff_hevc_bump_frame(s);
+
     av_frame_unref(s->output_frame);
     ret = ff_hevc_output_frame(s, s->output_frame, 0);
     if (ret < 0)
         goto fail;
 
-    ff_thread_finish_setup(s->avctx);
+    if (!s->avctx->hwaccel)
+        ff_thread_finish_setup(s->avctx);
 
     return 0;
 
@@ -2423,7 +2628,7 @@ fail:
 
 static int decode_nal_unit(HEVCContext *s, const HEVCNAL *nal)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     GetBitContext *gb    = &lc->gb;
     int ctb_addr_ts, ret;
 
@@ -2529,13 +2734,12 @@ static int decode_nal_unit(HEVCContext *s, const HEVCNAL *nal)
             if (ret < 0)
                 goto fail;
         } else {
-            ctb_addr_ts = hls_slice_data(s);
+            if (s->threads_number > 1 && s->sh.num_entry_point_offsets > 0)
+                ctb_addr_ts = hls_slice_data_wpp(s, nal);
+            else
+                ctb_addr_ts = hls_slice_data(s);
             if (ctb_addr_ts >= (s->ps.sps->ctb_width * s->ps.sps->ctb_height)) {
                 s->is_decoded = 1;
-                if ((s->ps.pps->transquant_bypass_enable_flag ||
-                     (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) &&
-                    s->ps.sps->sao_enabled)
-                    restore_tqb_pixels(s);
             }
 
             if (ctb_addr_ts < 0) {
@@ -2569,11 +2773,12 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
     int i, ret = 0;
 
     s->ref = NULL;
+    s->last_eos = s->eos;
     s->eos = 0;
 
     /* split the input packet into NAL units, so we know the upper bound on the
      * number of slices in the frame */
-    ret = ff_hevc_split_packet(&s->pkt, buf, length, s->avctx, s->is_nalff,
+    ret = ff_hevc_split_packet(s, &s->pkt, buf, length, s->avctx, s->is_nalff,
                                s->nal_length_size);
     if (ret < 0) {
         av_log(s->avctx, AV_LOG_ERROR,
@@ -2598,7 +2803,7 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
     }
 
 fail:
-    if (s->ref)
+    if (s->ref && s->threads_type == FF_THREAD_FRAME)
         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
 
     return ret;
@@ -2698,9 +2903,12 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
         return ret;
 
     if (avctx->hwaccel) {
-        if (s->ref && avctx->hwaccel->end_frame(avctx) < 0)
+        if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
             av_log(avctx, AV_LOG_ERROR,
                    "hardware accelerator failed to decode picture\n");
+            ff_hevc_unref_frame(s, s->ref, ~0);
+            return ret;
+        }
     } else {
         /* verify the SEI checksum */
         if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
@@ -2729,7 +2937,9 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
 
 static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src)
 {
-    int ret = ff_thread_ref_frame(&dst->tf, &src->tf);
+    int ret;
+
+    ret = ff_thread_ref_frame(&dst->tf, &src->tf);
     if (ret < 0)
         return ret;
 
@@ -2775,7 +2985,12 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
 
     av_freep(&s->md5_ctx);
 
-    av_frame_free(&s->tmp_frame);
+    av_freep(&s->cabac_state);
+
+    for (i = 0; i < 3; i++) {
+        av_freep(&s->sao_pixel_buffer_h[i]);
+        av_freep(&s->sao_pixel_buffer_v[i]);
+    }
     av_frame_free(&s->output_frame);
 
     for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
@@ -2789,9 +3004,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
         av_buffer_unref(&s->ps.sps_list[i]);
     for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
         av_buffer_unref(&s->ps.pps_list[i]);
+    s->ps.sps = NULL;
+    s->ps.pps = NULL;
+    s->ps.vps = NULL;
+
+    av_freep(&s->sh.entry_point_offset);
+    av_freep(&s->sh.offset);
+    av_freep(&s->sh.size);
+
+    for (i = 1; i < s->threads_number; i++) {
+        HEVCLocalContext *lc = s->HEVClcList[i];
+        if (lc) {
+            av_freep(&s->HEVClcList[i]);
+            av_freep(&s->sList[i]);
+        }
+    }
+    if (s->HEVClc == s->HEVClcList[0])
+        s->HEVClc = NULL;
+    av_freep(&s->HEVClcList[0]);
 
-    for (i = 0; i < s->pkt.nals_allocated; i++)
+    for (i = 0; i < s->pkt.nals_allocated; i++) {
         av_freep(&s->pkt.nals[i].rbsp_buffer);
+        av_freep(&s->pkt.nals[i].skipped_bytes_pos);
+    }
     av_freep(&s->pkt.nals);
     s->pkt.nals_allocated = 0;
 
@@ -2805,8 +3040,14 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
 
     s->avctx = avctx;
 
-    s->tmp_frame = av_frame_alloc();
-    if (!s->tmp_frame)
+    s->HEVClc = av_mallocz(sizeof(HEVCLocalContext));
+    if (!s->HEVClc)
+        goto fail;
+    s->HEVClcList[0] = s->HEVClc;
+    s->sList[0] = s;
+
+    s->cabac_state = av_malloc(HEVC_CONTEXTS);
+    if (!s->cabac_state)
         goto fail;
 
     s->output_frame = av_frame_alloc();
@@ -2829,6 +3070,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
     ff_bswapdsp_init(&s->bdsp);
 
     s->context_initialized = 1;
+    s->eos = 0;
 
     return 0;
 
@@ -2859,6 +3101,8 @@ static int hevc_update_thread_context(AVCodecContext *dst,
         }
     }
 
+    if (s->ps.sps != s0->ps.sps)
+        s->ps.sps = NULL;
     for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
         av_buffer_unref(&s->ps.vps_list[i]);
         if (s0->ps.vps_list[i]) {
@@ -2887,16 +3131,22 @@ static int hevc_update_thread_context(AVCodecContext *dst,
     }
 
     if (s->ps.sps != s0->ps.sps)
-        ret = set_sps(s, s0->ps.sps);
+        if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
+            return ret;
 
     s->seq_decode = s0->seq_decode;
     s->seq_output = s0->seq_output;
     s->pocTid0    = s0->pocTid0;
     s->max_ra     = s0->max_ra;
+    s->eos        = s0->eos;
+    s->no_rasl_output_flag = s0->no_rasl_output_flag;
 
     s->is_nalff        = s0->is_nalff;
     s->nal_length_size = s0->nal_length_size;
 
+    s->threads_number      = s0->threads_number;
+    s->threads_type        = s0->threads_type;
+
     if (s0->eos) {
         s->seq_decode = (s->seq_decode + 1) & 0xff;
         s->max_ra = INT_MAX;
@@ -2992,6 +3242,15 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
     if (ret < 0)
         return ret;
 
+    s->enable_parallel_tiles = 0;
+    s->picture_struct = 0;
+    s->eos = 1;
+
+    if(avctx->active_thread_type & FF_THREAD_SLICE)
+        s->threads_number = avctx->thread_count;
+    else
+        s->threads_number = 1;
+
     if (avctx->extradata_size > 0 && avctx->extradata) {
         ret = hevc_decode_extradata(s);
         if (ret < 0) {
@@ -3000,6 +3259,11 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
         }
     }
 
+    if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
+            s->threads_type = FF_THREAD_FRAME;
+        else
+            s->threads_type = FF_THREAD_SLICE;
+
     return 0;
 }
 
@@ -3022,6 +3286,7 @@ static void hevc_decode_flush(AVCodecContext *avctx)
     HEVCContext *s = avctx->priv_data;
     ff_hevc_flush_dpb(s);
     s->max_ra = INT_MAX;
+    s->eos = 1;
 }
 
 #define OFFSET(x) offsetof(HEVCContext, x)
@@ -3031,12 +3296,15 @@ static const AVProfile profiles[] = {
     { FF_PROFILE_HEVC_MAIN,                 "Main"                },
     { FF_PROFILE_HEVC_MAIN_10,              "Main 10"             },
     { FF_PROFILE_HEVC_MAIN_STILL_PICTURE,   "Main Still Picture"  },
+    { FF_PROFILE_HEVC_REXT,                 "Rext"  },
     { FF_PROFILE_UNKNOWN },
 };
 
 static const AVOption options[] = {
     { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
         AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, PAR },
+    { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin),
+        AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, PAR },
     { NULL },
 };
 
@@ -3061,6 +3329,6 @@ AVCodec ff_hevc_decoder = {
     .update_thread_context = hevc_update_thread_context,
     .init_thread_copy      = hevc_init_thread_copy,
     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
-                             AV_CODEC_CAP_FRAME_THREADS,
+                             AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS,
     .profiles              = NULL_IF_CONFIG_SMALL(profiles),
 };
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
index c6e05bc0bb..66b9a2f0fc 100644
--- a/libavcodec/hevc.h
+++ b/libavcodec/hevc.h
@@ -3,29 +3,26 @@
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_HEVC_H
 #define AVCODEC_HEVC_H
 
-#include <stddef.h>
-#include <stdint.h>
-
 #include "libavutil/buffer.h"
 #include "libavutil/md5.h"
 
@@ -33,6 +30,7 @@
 #include "bswapdsp.h"
 #include "cabac.h"
 #include "get_bits.h"
+#include "hevcpred.h"
 #include "hevcdsp.h"
 #include "internal.h"
 #include "thread.h"
@@ -41,6 +39,9 @@
 #define MAX_DPB_SIZE 16 // A.4.1
 #define MAX_REFS 16
 
+#define MAX_NB_THREADS 16
+#define SHIFT_CTB_WPP 2
+
 /**
  * 7.4.2.1
  */
@@ -55,12 +56,11 @@
 #define MAX_TRANSFORM_DEPTH 5
 
 #define MAX_TB_SIZE 32
-#define MAX_PB_SIZE 64
 #define MAX_LOG2_CTB_SIZE 6
 #define MAX_QP 51
 #define DEFAULT_INTRA_TC_OFFSET 2
 
-#define HEVC_CONTEXTS 183
+#define HEVC_CONTEXTS 199
 
 #define MRG_MAX_NUM_CANDS     5
 
@@ -70,6 +70,9 @@
 #define EPEL_EXTRA_BEFORE 1
 #define EPEL_EXTRA_AFTER  2
 #define EPEL_EXTRA        3
+#define QPEL_EXTRA_BEFORE 3
+#define QPEL_EXTRA_AFTER  4
+#define QPEL_EXTRA        7
 
 #define EDGE_EMU_BUFFER_STRIDE 80
 
@@ -79,13 +82,10 @@
 #define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)])
 #define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)])
 
-#define IS_IDR(s) (s->nal_unit_type == NAL_IDR_W_RADL || s->nal_unit_type == NAL_IDR_N_LP)
-#define IS_BLA(s) (s->nal_unit_type == NAL_BLA_W_RADL || s->nal_unit_type == NAL_BLA_W_LP || \
-                   s->nal_unit_type == NAL_BLA_N_LP)
-#define IS_IRAP(s) (s->nal_unit_type >= 16 && s->nal_unit_type <= 23)
-
-#define FFUDIV(a,b) (((a) > 0 ? (a) : (a) - (b) + 1) / (b))
-#define FFUMOD(a,b) ((a) - (b) * FFUDIV(a,b))
+#define IS_IDR(s) ((s)->nal_unit_type == NAL_IDR_W_RADL || (s)->nal_unit_type == NAL_IDR_N_LP)
+#define IS_BLA(s) ((s)->nal_unit_type == NAL_BLA_W_RADL || (s)->nal_unit_type == NAL_BLA_W_LP || \
+                   (s)->nal_unit_type == NAL_BLA_N_LP)
+#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23)
 
 /**
  * Table 7-3: NAL unit type codes
@@ -167,6 +167,8 @@ enum SyntaxElement {
     CBF_LUMA,
     CBF_CB_CR,
     TRANSFORM_SKIP_FLAG,
+    EXPLICIT_RDPCM_FLAG,
+    EXPLICIT_RDPCM_DIR_FLAG,
     LAST_SIGNIFICANT_COEFF_X_PREFIX,
     LAST_SIGNIFICANT_COEFF_Y_PREFIX,
     LAST_SIGNIFICANT_COEFF_X_SUFFIX,
@@ -177,6 +179,10 @@ enum SyntaxElement {
     COEFF_ABS_LEVEL_GREATER2_FLAG,
     COEFF_ABS_LEVEL_REMAINING,
     COEFF_SIGN_FLAG,
+    LOG2_RES_SCALE_ABS,
+    RES_SCALE_SIGN_FLAG,
+    CU_CHROMA_QP_OFFSET_FLAG,
+    CU_CHROMA_QP_OFFSET_IDX,
 };
 
 enum PartMode {
@@ -202,6 +208,13 @@ enum InterPredIdc {
     PRED_BI,
 };
 
+enum PredFlag {
+    PF_INTRA = 0,
+    PF_L0,
+    PF_L1,
+    PF_BI,
+};
+
 enum IntraPredMode {
     INTRA_PLANAR = 0,
     INTRA_DC,
@@ -244,6 +257,7 @@ enum SAOType {
     SAO_NOT_APPLIED = 0,
     SAO_BAND,
     SAO_EDGE,
+    SAO_APPLIED
 };
 
 enum SAOEOClass {
@@ -262,6 +276,7 @@ enum ScanType {
 typedef struct ShortTermRPS {
     unsigned int num_negative_pics;
     int num_delta_pocs;
+    int rps_idx_num_delta_pocs;
     int32_t delta_poc[32];
     uint8_t used[32];
 } ShortTermRPS;
@@ -381,7 +396,7 @@ typedef struct ScalingList {
 } ScalingList;
 
 typedef struct HEVCSPS {
-    int vps_id;
+    unsigned vps_id;
     int chroma_format_idc;
     uint8_t separate_colour_plane_flag;
 
@@ -442,6 +457,13 @@ typedef struct HEVCSPS {
     int max_transform_hierarchy_depth_inter;
     int max_transform_hierarchy_depth_intra;
 
+    int transform_skip_rotation_enabled_flag;
+    int transform_skip_context_enabled_flag;
+    int implicit_rdpcm_enabled_flag;
+    int explicit_rdpcm_enabled_flag;
+    int intra_smoothing_disabled_flag;
+    int persistent_rice_adaptation_enabled_flag;
+
     ///< coded frame dimension in various units
     int width;
     int height;
@@ -454,6 +476,7 @@ typedef struct HEVCSPS {
     int min_tb_height;
     int min_pu_width;
     int min_pu_height;
+    int tb_mask;
 
     int hshift[3];
     int vshift[3];
@@ -510,6 +533,15 @@ typedef struct HEVCPPS {
     int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2
     int num_extra_slice_header_bits;
     uint8_t slice_header_extension_present_flag;
+    uint8_t log2_max_transform_skip_block_size;
+    uint8_t cross_component_prediction_enabled_flag;
+    uint8_t chroma_qp_offset_list_enabled_flag;
+    uint8_t diff_cu_chroma_qp_offset_depth;
+    uint8_t chroma_qp_offset_list_len_minus1;
+    int8_t  cb_qp_offset_list[5];
+    int8_t  cr_qp_offset_list[5];
+    uint8_t log2_sao_offset_scale_luma;
+    uint8_t log2_sao_offset_scale_chroma;
 
     // Inferred parameters
     unsigned int *column_width;  ///< ColumnWidth
@@ -523,6 +555,7 @@ typedef struct HEVCPPS {
     int *tile_id;           ///< TileId
     int *tile_pos_rs;       ///< TilePosRS
     int *min_tb_addr_zs;    ///< MinTbAddrZS
+    int *min_tb_addr_zs_tab;///< MinTbAddrZS
 } HEVCPPS;
 
 typedef struct HEVCParamSets {
@@ -558,6 +591,7 @@ typedef struct SliceHeader {
     int short_term_ref_pic_set_size;
     ShortTermRPS slice_rps;
     const ShortTermRPS *short_term_rps;
+    int long_term_ref_pic_set_size;
     LongTermRPS long_term_rps;
     unsigned int list_entry_lx[2][32];
 
@@ -581,11 +615,16 @@ typedef struct SliceHeader {
     int slice_cb_qp_offset;
     int slice_cr_qp_offset;
 
+    uint8_t cu_chroma_qp_offset_enabled_flag;
+
     int beta_offset;    ///< beta_offset_div2 * 2
     int tc_offset;      ///< tc_offset_div2 * 2
 
     unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
 
+    int *entry_point_offset;
+    int * offset;
+    int * size;
     int num_entry_point_offsets;
 
     int8_t slice_qp;
@@ -607,10 +646,6 @@ typedef struct SliceHeader {
     int slice_ctb_addr_rs;
 } SliceHeader;
 
-typedef struct CodingTree {
-    int depth; ///< ctDepth
-} CodingTree;
-
 typedef struct CodingUnit {
     int x;
     int y;
@@ -632,8 +667,7 @@ typedef struct Mv {
 typedef struct MvField {
     DECLARE_ALIGNED(4, Mv, mv)[2];
     int8_t ref_idx[2];
-    int8_t pred_flag[2];
-    uint8_t is_intra;
+    int8_t pred_flag;
 } MvField;
 
 typedef struct NeighbourAvailable {
@@ -651,15 +685,24 @@ typedef struct PredictionUnit {
     uint8_t intra_pred_mode[4];
     Mv mvd;
     uint8_t merge_flag;
-    uint8_t intra_pred_mode_c;
+    uint8_t intra_pred_mode_c[4];
+    uint8_t chroma_mode_c[4];
 } PredictionUnit;
 
 typedef struct TransformUnit {
     int cu_qp_delta;
 
+    int res_scale_val;
+
     // Inferred parameters;
-    int cur_intra_pred_mode;
+    int intra_pred_mode;
+    int intra_pred_mode_c;
+    int chroma_mode_c;
     uint8_t is_cu_qp_delta_coded;
+    uint8_t is_cu_chroma_qp_offset_coded;
+    int8_t  cu_qp_offset_cb;
+    int8_t  cu_qp_offset_cr;
+    uint8_t cross_pf;
 } TransformUnit;
 
 typedef struct DBParams {
@@ -670,6 +713,7 @@ typedef struct DBParams {
 #define HEVC_FRAME_FLAG_OUTPUT    (1 << 0)
 #define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
 #define HEVC_FRAME_FLAG_LONG_REF  (1 << 2)
+#define HEVC_FRAME_FLAG_BUMPING   (1 << 3)
 
 typedef struct HEVCFrame {
     AVFrame *frame;
@@ -716,6 +760,10 @@ typedef struct HEVCNAL {
 
     enum NALUnitType type;
     int temporal_id;
+
+    int skipped_bytes;
+    int skipped_bytes_pos_size;
+    int *skipped_bytes_pos;
 } HEVCNAL;
 
 /* an input packet split into unescaped NAL units */
@@ -725,24 +773,11 @@ typedef struct HEVCPacket {
     int nals_allocated;
 } HEVCPacket;
 
-struct HEVCContext;
-
-typedef struct HEVCPredContext {
-    void (*intra_pred[4])(struct HEVCContext *s, int x0, int y0, int c_idx);
-
-    void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
-                           const uint8_t *left, ptrdiff_t stride);
-    void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left,
-                    ptrdiff_t stride, int log2_size, int c_idx);
-    void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
-                            const uint8_t *left, ptrdiff_t stride,
-                            int c_idx, int mode);
-} HEVCPredContext;
-
 typedef struct HEVCLocalContext {
-    DECLARE_ALIGNED(16, int16_t, mc_buffer[(MAX_PB_SIZE + 7) * MAX_PB_SIZE]);
     uint8_t cabac_state[HEVC_CONTEXTS];
 
+    uint8_t stat_coeff[4];
+
     uint8_t first_qp_group;
 
     GetBitContext gb;
@@ -751,18 +786,23 @@ typedef struct HEVCLocalContext {
     int8_t qp_y;
     int8_t curr_qp_y;
 
+    int qPy_pred;
+
     TransformUnit tu;
 
     uint8_t ctb_left_flag;
     uint8_t ctb_up_flag;
     uint8_t ctb_up_right_flag;
     uint8_t ctb_up_left_flag;
-    int     start_of_tiles_x;
     int     end_of_tiles_x;
     int     end_of_tiles_y;
     /* +7 is for subpixel interpolation, *2 for high bit depths */
     DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
-    CodingTree ct;
+    /* The extended size between the new edge emu buffer is abused by SAO */
+    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
+    DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
+
+    int ct_depth;
     CodingUnit cu;
     PredictionUnit pu;
     NeighbourAvailable na;
@@ -780,17 +820,26 @@ typedef struct HEVCContext {
     const AVClass *c;  // needed by private avoptions
     AVCodecContext *avctx;
 
-    HEVCLocalContext HEVClc;
+    struct HEVCContext  *sList[MAX_NB_THREADS];
 
-    uint8_t cabac_state[HEVC_CONTEXTS];
+    HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
+    HEVCLocalContext    *HEVClc;
+
+    uint8_t             threads_type;
+    uint8_t             threads_number;
+
+    int                 width;
+    int                 height;
+
+    uint8_t *cabac_state;
 
     /** 1 if the independent slice segment header was successfully parsed */
     uint8_t slice_initialized;
 
     AVFrame *frame;
-    AVFrame *sao_frame;
-    AVFrame *tmp_frame;
     AVFrame *output_frame;
+    uint8_t *sao_pixel_buffer_h[3];
+    uint8_t *sao_pixel_buffer_v[3];
 
     HEVCParamSets ps;
 
@@ -811,11 +860,13 @@ typedef struct HEVCContext {
     int pocTid0;
     int slice_idx; ///< number of the slice being currently decoded
     int eos;       ///< current packet contains an EOS/EOB NAL
+    int last_eos;  ///< last packet contains an EOS/EOB NAL
     int max_ra;
     int bs_width;
     int bs_height;
 
     int is_decoded;
+    int no_rasl_output_flag;
 
     HEVCPredContext hpc;
     HEVCDSPContext hevcdsp;
@@ -850,6 +901,11 @@ typedef struct HEVCContext {
     uint16_t seq_decode;
     uint16_t seq_output;
 
+    int enable_parallel_tiles;
+    int wpp_err;
+
+    const uint8_t *data;
+
     HEVCPacket pkt;
     // type of the first VCL NAL of the current frame
     enum NALUnitType first_nal_type;
@@ -864,6 +920,8 @@ typedef struct HEVCContext {
                             ///< as a format defined in 14496-15
     int apply_defdispwin;
 
+    int active_seq_parameter_set_id;
+
     int nal_length_size;    ///< Number of bytes used for nal length (1, 2 or 4)
     int nuh_layer_id;
 
@@ -877,6 +935,8 @@ typedef struct HEVCContext {
     int sei_display_orientation_present;
     int sei_anticlockwise_rotation;
     int sei_hflip, sei_vflip;
+
+    int picture_struct;
 } HEVCContext;
 
 int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
@@ -957,32 +1017,11 @@ int ff_hevc_inter_pred_idc_decode(HEVCContext *s, int nPbW, int nPbH);
 int ff_hevc_ref_idx_lx_decode(HEVCContext *s, int num_ref_idx_lx);
 int ff_hevc_mvp_lx_flag_decode(HEVCContext *s);
 int ff_hevc_no_residual_syntax_flag_decode(HEVCContext *s);
-int ff_hevc_abs_mvd_greater0_flag_decode(HEVCContext *s);
-int ff_hevc_abs_mvd_greater1_flag_decode(HEVCContext *s);
-int ff_hevc_mvd_decode(HEVCContext *s);
-int ff_hevc_mvd_sign_flag_decode(HEVCContext *s);
 int ff_hevc_split_transform_flag_decode(HEVCContext *s, int log2_trafo_size);
 int ff_hevc_cbf_cb_cr_decode(HEVCContext *s, int trafo_depth);
 int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth);
-int ff_hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx);
-int ff_hevc_last_significant_coeff_x_prefix_decode(HEVCContext *s, int c_idx,
-                                                   int log2_size);
-int ff_hevc_last_significant_coeff_y_prefix_decode(HEVCContext *s, int c_idx,
-                                                   int log2_size);
-int ff_hevc_last_significant_coeff_suffix_decode(HEVCContext *s,
-                                                 int last_significant_coeff_prefix);
-int ff_hevc_significant_coeff_group_flag_decode(HEVCContext *s, int c_idx,
-                                                int ctx_cg);
-int ff_hevc_significant_coeff_flag_decode(HEVCContext *s, int c_idx, int x_c,
-                                          int y_c, int log2_trafo_size,
-                                          int scan_idx, int prev_sig);
-int ff_hevc_coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx,
-                                                 int ctx_set);
-int ff_hevc_coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx,
-                                                 int inc);
-int ff_hevc_coeff_abs_level_remaining(HEVCContext *s, int base_level,
-                                      int rc_rice_param);
-int ff_hevc_coeff_sign_flag(HEVCContext *s, uint8_t nb);
+int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx);
+int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx);
 
 /**
  * Get the number of candidate references for the current frame.
@@ -997,6 +1036,8 @@ int ff_hevc_set_new_ref(HEVCContext *s, AVFrame **frame, int poc);
  */
 int ff_hevc_output_frame(HEVCContext *s, AVFrame *frame, int flush);
 
+void ff_hevc_bump_frame(HEVCContext *s);
+
 void ff_hevc_unref_frame(HEVCContext *s, HEVCFrame *frame, int flags);
 
 void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0,
@@ -1008,29 +1049,33 @@ void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0,
                               int nPbW, int nPbH, int log2_cb_size,
                               int part_idx, int merge_idx,
                               MvField *mv, int mvp_lx_flag, int LX);
-void ff_hevc_set_qPy(HEVCContext *s, int xC, int yC, int xBase, int yBase,
+void ff_hevc_set_qPy(HEVCContext *s, int xBase, int yBase,
                      int log2_cb_size);
 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
                                            int log2_trafo_size);
 int ff_hevc_cu_qp_delta_sign_flag(HEVCContext *s);
 int ff_hevc_cu_qp_delta_abs(HEVCContext *s);
-void ff_hevc_hls_filter(HEVCContext *s, int x, int y);
+int ff_hevc_cu_chroma_qp_offset_flag(HEVCContext *s);
+int ff_hevc_cu_chroma_qp_offset_idx(HEVCContext *s);
+void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size);
 void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size);
+void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                 int log2_trafo_size, enum ScanType scan_idx,
+                                 int c_idx);
 
-void ff_hevc_pps_free(HEVCPPS **ppps);
+void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
 
-void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
 
 /**
  * Extract the raw (unescaped) HEVC bitstream.
  */
-int ff_hevc_extract_rbsp(const uint8_t *src, int length,
+int ff_hevc_extract_rbsp(HEVCContext *s, const uint8_t *src, int length,
                          HEVCNAL *nal);
 
 /**
  * Split an input packet into NAL units.
  */
-int ff_hevc_split_packet(HEVCPacket *pkt, const uint8_t *buf, int length,
+int ff_hevc_split_packet(HEVCContext *s, HEVCPacket *pkt, const uint8_t *buf, int length,
                          AVCodecContext *avctx, int is_nalff, int nal_length_size);
 
 int ff_hevc_encode_nal_vps(HEVCVPS *vps, unsigned int id,
diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
index b01808fd84..ffff87d4f0 100644
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2012 - 2013 Guillaume Martres
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -66,65 +66,77 @@ av_unused static const int8_t num_bins_in_se[] = {
      2, // cbf_luma
      4, // cbf_cb, cbf_cr
      2, // transform_skip_flag[][]
+     2, // explicit_rdpcm_flag[][]
+     2, // explicit_rdpcm_dir_flag[][]
     18, // last_significant_coeff_x_prefix
     18, // last_significant_coeff_y_prefix
      0, // last_significant_coeff_x_suffix
      0, // last_significant_coeff_y_suffix
      4, // significant_coeff_group_flag
-    42, // significant_coeff_flag
+    44, // significant_coeff_flag
     24, // coeff_abs_level_greater1_flag
      6, // coeff_abs_level_greater2_flag
      0, // coeff_abs_level_remaining
      0, // coeff_sign_flag
+     8, // log2_res_scale_abs
+     2, // res_scale_sign_flag
+     1, // cu_chroma_qp_offset_flag
+     1, // cu_chroma_qp_offset_idx
 };
 
 /**
  * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement.
  */
 static const int elem_offset[sizeof(num_bins_in_se)] = {
-      0,
-      1,
-      2,
-      2,
-      2,
-      2,
-      2,
-      2,
-      5,
-      6,
-      9,
-     12,
-     13,
-     17,
-     17,
-     18,
-     18,
-     18,
-     20,
-     21,
-     22,
-     27,
-     29,
-     31,
-     33,
-     35,
-     35,
-     35,
-     36,
-     37,
-     40,
-     42,
-     46,
-     48,
-     66,
-     84,
-     84,
-     84,
-     88,
-    130,
-    154,
-    160,
-    160,
+    0, // sao_merge_flag
+    1, // sao_type_idx
+    2, // sao_eo_class
+    2, // sao_band_position
+    2, // sao_offset_abs
+    2, // sao_offset_sign
+    2, // end_of_slice_flag
+    2, // split_coding_unit_flag
+    5, // cu_transquant_bypass_flag
+    6, // skip_flag
+    9, // cu_qp_delta
+    12, // pred_mode
+    13, // part_mode
+    17, // pcm_flag
+    17, // prev_intra_luma_pred_mode
+    18, // mpm_idx
+    18, // rem_intra_luma_pred_mode
+    18, // intra_chroma_pred_mode
+    20, // merge_flag
+    21, // merge_idx
+    22, // inter_pred_idc
+    27, // ref_idx_l0
+    29, // ref_idx_l1
+    31, // abs_mvd_greater0_flag
+    33, // abs_mvd_greater1_flag
+    35, // abs_mvd_minus2
+    35, // mvd_sign_flag
+    35, // mvp_lx_flag
+    36, // no_residual_data_flag
+    37, // split_transform_flag
+    40, // cbf_luma
+    42, // cbf_cb, cbf_cr
+    46, // transform_skip_flag[][]
+    48, // explicit_rdpcm_flag[][]
+    50, // explicit_rdpcm_dir_flag[][]
+    52, // last_significant_coeff_x_prefix
+    70, // last_significant_coeff_y_prefix
+    88, // last_significant_coeff_x_suffix
+    88, // last_significant_coeff_y_suffix
+    88, // significant_coeff_group_flag
+    92, // significant_coeff_flag
+    136, // coeff_abs_level_greater1_flag
+    160, // coeff_abs_level_greater2_flag
+    166, // coeff_abs_level_remaining
+    166, // coeff_sign_flag
+    166, // log2_res_scale_abs
+    174, // res_scale_sign_flag
+    176, // cu_chroma_qp_offset_flag
+    177, // cu_chroma_qp_offset_idx
 };
 
 #define CNU 154
@@ -178,6 +190,10 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       94, 138, 182, 154,
       // transform_skip_flag
       139, 139,
+      // explicit_rdpcm_flag
+      139, 139,
+      // explicit_rdpcm_dir_flag
+      139, 139,
       // last_significant_coeff_x_prefix
       110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
        79, 108, 123,  63,
@@ -190,11 +206,21 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       111, 111, 125, 110, 110,  94, 124, 108, 124, 107, 125, 141, 179, 153,
       125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140,
       139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111,
+      141, 111,
       // coeff_abs_level_greater1_flag
       140,  92, 137, 138, 140, 152, 138, 139, 153,  74, 149,  92, 139, 107,
       122, 152, 140, 179, 166, 182, 140, 227, 122, 197,
       // coeff_abs_level_greater2_flag
-      138, 153, 136, 167, 152, 152, },
+      138, 153, 136, 167, 152, 152,
+      // log2_res_scale_abs
+      154, 154, 154, 154, 154, 154, 154, 154,
+      // res_scale_sign_flag
+      154, 154,
+      // cu_chroma_qp_offset_flag
+      154,
+      // cu_chroma_qp_offset_idx
+      154,
+    },
     { // sao_merge_flag
       153,
       // sao_type_idx
@@ -241,6 +267,10 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       149, 107, 167, 154,
       // transform_skip_flag
       139, 139,
+      // explicit_rdpcm_flag
+      139, 139,
+      // explicit_rdpcm_dir_flag
+      139, 139,
       // last_significant_coeff_x_prefix
       125, 110,  94, 110,  95,  79, 125, 111, 110,  78, 110, 111, 111,  95,
        94, 108, 123, 108,
@@ -253,11 +283,21 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       155, 154, 139, 153, 139, 123, 123,  63, 153, 166, 183, 140, 136, 153,
       154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
       153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
+      140, 140,
       // coeff_abs_level_greater1_flag
       154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
       136, 137, 169, 194, 166, 167, 154, 167, 137, 182,
       // coeff_abs_level_greater2_flag
-      107, 167, 91, 122, 107, 167, },
+      107, 167, 91, 122, 107, 167,
+      // log2_res_scale_abs
+      154, 154, 154, 154, 154, 154, 154, 154,
+      // res_scale_sign_flag
+      154, 154,
+      // cu_chroma_qp_offset_flag
+      154,
+      // cu_chroma_qp_offset_idx
+      154,
+    },
     { // sao_merge_flag
       153,
       // sao_type_idx
@@ -304,6 +344,10 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       149, 92, 167, 154,
       // transform_skip_flag
       139, 139,
+      // explicit_rdpcm_flag
+      139, 139,
+      // explicit_rdpcm_dir_flag
+      139, 139,
       // last_significant_coeff_x_prefix
       125, 110, 124, 110,  95,  94, 125, 111, 111,  79, 125, 126, 111, 111,
        79, 108, 123,  93,
@@ -316,11 +360,89 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       170, 154, 139, 153, 139, 123, 123,  63, 124, 166, 183, 140, 136, 153,
       154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
       153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140,
+      140, 140,
       // coeff_abs_level_greater1_flag
       154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
       136, 122, 169, 208, 166, 167, 154, 152, 167, 182,
       // coeff_abs_level_greater2_flag
-      107, 167, 91, 107, 107, 167, },
+      107, 167, 91, 107, 107, 167,
+      // log2_res_scale_abs
+      154, 154, 154, 154, 154, 154, 154, 154,
+      // res_scale_sign_flag
+      154, 154,
+      // cu_chroma_qp_offset_flag
+      154,
+      // cu_chroma_qp_offset_idx
+      154,
+    },
+};
+
+static const uint8_t scan_1x1[1] = {
+    0,
+};
+
+static const uint8_t horiz_scan2x2_x[4] = {
+    0, 1, 0, 1,
+};
+
+static const uint8_t horiz_scan2x2_y[4] = {
+    0, 0, 1, 1
+};
+
+static const uint8_t horiz_scan4x4_x[16] = {
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+};
+
+static const uint8_t horiz_scan4x4_y[16] = {
+    0, 0, 0, 0,
+    1, 1, 1, 1,
+    2, 2, 2, 2,
+    3, 3, 3, 3,
+};
+
+static const uint8_t horiz_scan8x8_inv[8][8] = {
+    {  0,  1,  2,  3, 16, 17, 18, 19, },
+    {  4,  5,  6,  7, 20, 21, 22, 23, },
+    {  8,  9, 10, 11, 24, 25, 26, 27, },
+    { 12, 13, 14, 15, 28, 29, 30, 31, },
+    { 32, 33, 34, 35, 48, 49, 50, 51, },
+    { 36, 37, 38, 39, 52, 53, 54, 55, },
+    { 40, 41, 42, 43, 56, 57, 58, 59, },
+    { 44, 45, 46, 47, 60, 61, 62, 63, },
+};
+
+static const uint8_t diag_scan2x2_x[4] = {
+    0, 0, 1, 1,
+};
+
+static const uint8_t diag_scan2x2_y[4] = {
+    0, 1, 0, 1,
+};
+
+static const uint8_t diag_scan2x2_inv[2][2] = {
+    { 0, 2, },
+    { 1, 3, },
+};
+
+static const uint8_t diag_scan4x4_inv[4][4] = {
+    { 0,  2,  5,  9, },
+    { 1,  4,  8, 12, },
+    { 3,  7, 11, 14, },
+    { 6, 10, 13, 15, },
+};
+
+static const uint8_t diag_scan8x8_inv[8][8] = {
+    {  0,  2,  5,  9, 14, 20, 27, 35, },
+    {  1,  4,  8, 13, 19, 26, 34, 42, },
+    {  3,  7, 12, 18, 25, 33, 41, 48, },
+    {  6, 11, 17, 24, 32, 40, 47, 53, },
+    { 10, 16, 23, 31, 39, 46, 52, 57, },
+    { 15, 22, 30, 38, 45, 51, 56, 60, },
+    { 21, 29, 37, 44, 50, 55, 59, 62, },
+    { 28, 36, 43, 49, 54, 58, 61, 63, },
 };
 
 void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
@@ -329,13 +451,13 @@ void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
         (ctb_addr_ts % s->ps.sps->ctb_width == 2 ||
          (s->ps.sps->ctb_width == 2 &&
           ctb_addr_ts % s->ps.sps->ctb_width == 0))) {
-        memcpy(s->cabac_state, s->HEVClc.cabac_state, HEVC_CONTEXTS);
+        memcpy(s->cabac_state, s->HEVClc->cabac_state, HEVC_CONTEXTS);
     }
 }
 
 static void load_states(HEVCContext *s)
 {
-    memcpy(s->HEVClc.cabac_state, s->cabac_state, HEVC_CONTEXTS);
+    memcpy(s->HEVClc->cabac_state, s->cabac_state, HEVC_CONTEXTS);
 }
 
 static void cabac_reinit(HEVCLocalContext *lc)
@@ -345,10 +467,10 @@ static void cabac_reinit(HEVCLocalContext *lc)
 
 static void cabac_init_decoder(HEVCContext *s)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
     skip_bits(gb, 1);
     align_get_bits(gb);
-    ff_init_cabac_decoder(&s->HEVClc.cc,
+    ff_init_cabac_decoder(&s->HEVClc->cc,
                           gb->buffer + get_bits_count(gb) / 8,
                           (get_bits_left(gb) + 7) / 8);
 }
@@ -370,8 +492,11 @@ static void cabac_init_state(HEVCContext *s)
         pre ^= pre >> 31;
         if (pre > 124)
             pre = 124 + (pre & 1);
-        s->HEVClc.cabac_state[i] = pre;
+        s->HEVClc->cabac_state[i] = pre;
     }
+
+    for (i = 0; i < 4; i++)
+        s->HEVClc->stat_coeff[i] = 0;
 }
 
 void ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts)
@@ -395,13 +520,19 @@ void ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts)
     } else {
         if (s->ps.pps->tiles_enabled_flag &&
             s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1]) {
-            cabac_reinit(&s->HEVClc);
+            if (s->threads_number == 1)
+                cabac_reinit(s->HEVClc);
+            else
+                cabac_init_decoder(s);
             cabac_init_state(s);
         }
         if (s->ps.pps->entropy_coding_sync_enabled_flag) {
             if (ctb_addr_ts % s->ps.sps->ctb_width == 0) {
-                get_cabac_terminate(&s->HEVClc.cc);
-                cabac_reinit(&s->HEVClc);
+                get_cabac_terminate(&s->HEVClc->cc);
+                if (s->threads_number == 1)
+                    cabac_reinit(s->HEVClc);
+                else
+                    cabac_init_decoder(s);
 
                 if (s->ps.sps->ctb_width == 1)
                     cabac_init_state(s);
@@ -412,7 +543,7 @@ void ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts)
     }
 }
 
-#define GET_CABAC(ctx) get_cabac(&s->HEVClc.cc, &s->HEVClc.cabac_state[ctx])
+#define GET_CABAC(ctx) get_cabac(&s->HEVClc->cc, &s->HEVClc->cabac_state[ctx])
 
 int ff_hevc_sao_merge_flag_decode(HEVCContext *s)
 {
@@ -424,7 +555,7 @@ int ff_hevc_sao_type_idx_decode(HEVCContext *s)
     if (!GET_CABAC(elem_offset[SAO_TYPE_IDX]))
         return 0;
 
-    if (!get_cabac_bypass(&s->HEVClc.cc))
+    if (!get_cabac_bypass(&s->HEVClc->cc))
         return SAO_BAND;
     return SAO_EDGE;
 }
@@ -432,10 +563,10 @@ int ff_hevc_sao_type_idx_decode(HEVCContext *s)
 int ff_hevc_sao_band_position_decode(HEVCContext *s)
 {
     int i;
-    int value = get_cabac_bypass(&s->HEVClc.cc);
+    int value = get_cabac_bypass(&s->HEVClc->cc);
 
     for (i = 0; i < 4; i++)
-        value = (value << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        value = (value << 1) | get_cabac_bypass(&s->HEVClc->cc);
     return value;
 }
 
@@ -444,26 +575,26 @@ int ff_hevc_sao_offset_abs_decode(HEVCContext *s)
     int i = 0;
     int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1;
 
-    while (i < length && get_cabac_bypass(&s->HEVClc.cc))
+    while (i < length && get_cabac_bypass(&s->HEVClc->cc))
         i++;
     return i;
 }
 
 int ff_hevc_sao_offset_sign_decode(HEVCContext *s)
 {
-    return get_cabac_bypass(&s->HEVClc.cc);
+    return get_cabac_bypass(&s->HEVClc->cc);
 }
 
 int ff_hevc_sao_eo_class_decode(HEVCContext *s)
 {
-    int ret = get_cabac_bypass(&s->HEVClc.cc) << 1;
-    ret    |= get_cabac_bypass(&s->HEVClc.cc);
+    int ret = get_cabac_bypass(&s->HEVClc->cc) << 1;
+    ret    |= get_cabac_bypass(&s->HEVClc->cc);
     return ret;
 }
 
 int ff_hevc_end_of_slice_flag_decode(HEVCContext *s)
 {
-    return get_cabac_terminate(&s->HEVClc.cc);
+    return get_cabac_terminate(&s->HEVClc->cc);
 }
 
 int ff_hevc_cu_transquant_bypass_flag_decode(HEVCContext *s)
@@ -475,12 +606,12 @@ int ff_hevc_skip_flag_decode(HEVCContext *s, int x0, int y0, int x_cb, int y_cb)
 {
     int min_cb_width = s->ps.sps->min_cb_width;
     int inc = 0;
-    int x0b = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
+    int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+    int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
 
-    if (s->HEVClc.ctb_left_flag || x0b)
+    if (s->HEVClc->ctb_left_flag || x0b)
         inc = !!SAMPLE_CTB(s->skip_flag, x_cb - 1, y_cb);
-    if (s->HEVClc.ctb_up_flag || y0b)
+    if (s->HEVClc->ctb_up_flag || y0b)
         inc += !!SAMPLE_CTB(s->skip_flag, x_cb, y_cb - 1);
 
     return GET_CABAC(elem_offset[SKIP_FLAG] + inc);
@@ -498,7 +629,7 @@ int ff_hevc_cu_qp_delta_abs(HEVCContext *s)
     }
     if (prefix_val >= 5) {
         int k = 0;
-        while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc.cc)) {
+        while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) {
             suffix_val += 1 << k;
             k++;
         }
@@ -506,14 +637,30 @@ int ff_hevc_cu_qp_delta_abs(HEVCContext *s)
             av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
 
         while (k--)
-            suffix_val += get_cabac_bypass(&s->HEVClc.cc) << k;
+            suffix_val += get_cabac_bypass(&s->HEVClc->cc) << k;
     }
     return prefix_val + suffix_val;
 }
 
 int ff_hevc_cu_qp_delta_sign_flag(HEVCContext *s)
 {
-    return get_cabac_bypass(&s->HEVClc.cc);
+    return get_cabac_bypass(&s->HEVClc->cc);
+}
+
+int ff_hevc_cu_chroma_qp_offset_flag(HEVCContext *s)
+{
+    return GET_CABAC(elem_offset[CU_CHROMA_QP_OFFSET_FLAG]);
+}
+
+int ff_hevc_cu_chroma_qp_offset_idx(HEVCContext *s)
+{
+    int c_max= FFMAX(5, s->ps.pps->chroma_qp_offset_list_len_minus1);
+    int i = 0;
+
+    while (i < c_max && GET_CABAC(elem_offset[CU_CHROMA_QP_OFFSET_IDX]))
+        i++;
+
+    return i;
 }
 
 int ff_hevc_pred_mode_decode(HEVCContext *s)
@@ -524,14 +671,14 @@ int ff_hevc_pred_mode_decode(HEVCContext *s)
 int ff_hevc_split_coding_unit_flag_decode(HEVCContext *s, int ct_depth, int x0, int y0)
 {
     int inc = 0, depth_left = 0, depth_top = 0;
-    int x0b  = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b  = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
+    int x0b  = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+    int y0b  = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
     int x_cb = x0 >> s->ps.sps->log2_min_cb_size;
     int y_cb = y0 >> s->ps.sps->log2_min_cb_size;
 
-    if (s->HEVClc.ctb_left_flag || x0b)
+    if (s->HEVClc->ctb_left_flag || x0b)
         depth_left = s->tab_ct_depth[(y_cb) * s->ps.sps->min_cb_width + x_cb - 1];
-    if (s->HEVClc.ctb_up_flag || y0b)
+    if (s->HEVClc->ctb_up_flag || y0b)
         depth_top = s->tab_ct_depth[(y_cb - 1) * s->ps.sps->min_cb_width + x_cb];
 
     inc += (depth_left > ct_depth);
@@ -545,7 +692,7 @@ int ff_hevc_part_mode_decode(HEVCContext *s, int log2_cb_size)
     if (GET_CABAC(elem_offset[PART_MODE])) // 1
         return PART_2Nx2N;
     if (log2_cb_size == s->ps.sps->log2_min_cb_size) {
-        if (s->HEVClc.cu.pred_mode == MODE_INTRA) // 0
+        if (s->HEVClc->cu.pred_mode == MODE_INTRA) // 0
             return PART_NxN;
         if (GET_CABAC(elem_offset[PART_MODE] + 1)) // 01
             return PART_2NxN;
@@ -565,21 +712,21 @@ int ff_hevc_part_mode_decode(HEVCContext *s, int log2_cb_size)
     if (GET_CABAC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX
         if (GET_CABAC(elem_offset[PART_MODE] + 3)) // 011
             return PART_2NxN;
-        if (get_cabac_bypass(&s->HEVClc.cc)) // 0101
+        if (get_cabac_bypass(&s->HEVClc->cc)) // 0101
             return PART_2NxnD;
         return PART_2NxnU; // 0100
     }
 
     if (GET_CABAC(elem_offset[PART_MODE] + 3)) // 001
         return PART_Nx2N;
-    if (get_cabac_bypass(&s->HEVClc.cc)) // 0001
+    if (get_cabac_bypass(&s->HEVClc->cc)) // 0001
         return PART_nRx2N;
     return PART_nLx2N;  // 0000
 }
 
 int ff_hevc_pcm_flag_decode(HEVCContext *s)
 {
-    return get_cabac_terminate(&s->HEVClc.cc);
+    return get_cabac_terminate(&s->HEVClc->cc);
 }
 
 int ff_hevc_prev_intra_luma_pred_flag_decode(HEVCContext *s)
@@ -590,7 +737,7 @@ int ff_hevc_prev_intra_luma_pred_flag_decode(HEVCContext *s)
 int ff_hevc_mpm_idx_decode(HEVCContext *s)
 {
     int i = 0;
-    while (i < 2 && get_cabac_bypass(&s->HEVClc.cc))
+    while (i < 2 && get_cabac_bypass(&s->HEVClc->cc))
         i++;
     return i;
 }
@@ -598,10 +745,10 @@ int ff_hevc_mpm_idx_decode(HEVCContext *s)
 int ff_hevc_rem_intra_luma_pred_mode_decode(HEVCContext *s)
 {
     int i;
-    int value = get_cabac_bypass(&s->HEVClc.cc);
+    int value = get_cabac_bypass(&s->HEVClc->cc);
 
     for (i = 0; i < 4; i++)
-        value = (value << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        value = (value << 1) | get_cabac_bypass(&s->HEVClc->cc);
     return value;
 }
 
@@ -611,8 +758,8 @@ int ff_hevc_intra_chroma_pred_mode_decode(HEVCContext *s)
     if (!GET_CABAC(elem_offset[INTRA_CHROMA_PRED_MODE]))
         return 4;
 
-    ret  = get_cabac_bypass(&s->HEVClc.cc) << 1;
-    ret |= get_cabac_bypass(&s->HEVClc.cc);
+    ret  = get_cabac_bypass(&s->HEVClc->cc) << 1;
+    ret |= get_cabac_bypass(&s->HEVClc->cc);
     return ret;
 }
 
@@ -621,7 +768,7 @@ int ff_hevc_merge_idx_decode(HEVCContext *s)
     int i = GET_CABAC(elem_offset[MERGE_IDX]);
 
     if (i != 0) {
-        while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&s->HEVClc.cc))
+        while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&s->HEVClc->cc))
             i++;
     }
     return i;
@@ -636,7 +783,7 @@ int ff_hevc_inter_pred_idc_decode(HEVCContext *s, int nPbW, int nPbH)
 {
     if (nPbW + nPbH == 12)
         return GET_CABAC(elem_offset[INTER_PRED_IDC] + 4);
-    if (GET_CABAC(elem_offset[INTER_PRED_IDC] + s->HEVClc.ct.depth))
+    if (GET_CABAC(elem_offset[INTER_PRED_IDC] + s->HEVClc->ct_depth))
         return PRED_BI;
 
     return GET_CABAC(elem_offset[INTER_PRED_IDC] + 4);
@@ -651,7 +798,7 @@ int ff_hevc_ref_idx_lx_decode(HEVCContext *s, int num_ref_idx_lx)
     while (i < max_ctx && GET_CABAC(elem_offset[REF_IDX_L0] + i))
         i++;
     if (i == 2) {
-        while (i < max && get_cabac_bypass(&s->HEVClc.cc))
+        while (i < max && get_cabac_bypass(&s->HEVClc->cc))
             i++;
     }
 
@@ -668,35 +815,35 @@ int ff_hevc_no_residual_syntax_flag_decode(HEVCContext *s)
     return GET_CABAC(elem_offset[NO_RESIDUAL_DATA_FLAG]);
 }
 
-int ff_hevc_abs_mvd_greater0_flag_decode(HEVCContext *s)
+static av_always_inline int abs_mvd_greater0_flag_decode(HEVCContext *s)
 {
     return GET_CABAC(elem_offset[ABS_MVD_GREATER0_FLAG]);
 }
 
-int ff_hevc_abs_mvd_greater1_flag_decode(HEVCContext *s)
+static av_always_inline int abs_mvd_greater1_flag_decode(HEVCContext *s)
 {
     return GET_CABAC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1);
 }
 
-int ff_hevc_mvd_decode(HEVCContext *s)
+static av_always_inline int mvd_decode(HEVCContext *s)
 {
     int ret = 2;
     int k = 1;
 
-    while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc.cc)) {
+    while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) {
         ret += 1 << k;
         k++;
     }
     if (k == CABAC_MAX_BIN)
         av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
     while (k--)
-        ret += get_cabac_bypass(&s->HEVClc.cc) << k;
-    return get_cabac_bypass_sign(&s->HEVClc.cc, -ret);
+        ret += get_cabac_bypass(&s->HEVClc->cc) << k;
+    return get_cabac_bypass_sign(&s->HEVClc->cc, -ret);
 }
 
-int ff_hevc_mvd_sign_flag_decode(HEVCContext *s)
+static av_always_inline int mvd_sign_flag_decode(HEVCContext *s)
 {
-    return get_cabac_bypass_sign(&s->HEVClc.cc, -1);
+    return get_cabac_bypass_sign(&s->HEVClc->cc, -1);
 }
 
 int ff_hevc_split_transform_flag_decode(HEVCContext *s, int log2_trafo_size)
@@ -714,53 +861,73 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
     return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
 }
 
-int ff_hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
+static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
 {
     return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
 }
 
-#define LAST_SIG_COEFF(elem)                                                    \
-    int i = 0;                                                                  \
-    int max = (log2_size << 1) - 1;                                             \
-    int ctx_offset, ctx_shift;                                                  \
-                                                                                \
-    if (c_idx == 0) {                                                           \
-        ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);             \
-        ctx_shift = (log2_size + 1) >> 2;                                       \
-    } else {                                                                    \
-        ctx_offset = 15;                                                        \
-        ctx_shift = log2_size - 2;                                              \
-    }                                                                           \
-    while (i < max &&                                                           \
-           GET_CABAC(elem_offset[elem] + (i >> ctx_shift) + ctx_offset))        \
-        i++;                                                                    \
-    return i;
+static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
+{
+    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
+}
 
-int ff_hevc_last_significant_coeff_x_prefix_decode(HEVCContext *s, int c_idx,
-                                                   int log2_size)
+static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
 {
-    LAST_SIG_COEFF(LAST_SIGNIFICANT_COEFF_X_PREFIX)
+    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
 }
 
-int ff_hevc_last_significant_coeff_y_prefix_decode(HEVCContext *s, int c_idx,
-                                                   int log2_size)
+int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
+    int i =0;
+
+    while (i < 4 && GET_CABAC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i))
+        i++;
+
+    return i;
+}
+
+int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
+    return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
+}
+
+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
+                                                   int log2_size, int *last_scx_prefix, int *last_scy_prefix)
 {
-    LAST_SIG_COEFF(LAST_SIGNIFICANT_COEFF_Y_PREFIX)
+    int i = 0;
+    int max = (log2_size << 1) - 1;
+    int ctx_offset, ctx_shift;
+
+    if (!c_idx) {
+        ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
+        ctx_shift = (log2_size + 1) >> 2;
+    } else {
+        ctx_offset = 15;
+        ctx_shift = log2_size - 2;
+    }
+    while (i < max &&
+           GET_CABAC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset))
+        i++;
+    *last_scx_prefix = i;
+
+    i = 0;
+    while (i < max &&
+           GET_CABAC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset))
+        i++;
+    *last_scy_prefix = i;
 }
 
-int ff_hevc_last_significant_coeff_suffix_decode(HEVCContext *s,
+static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
                                                  int last_significant_coeff_prefix)
 {
     int i;
     int length = (last_significant_coeff_prefix >> 1) - 1;
-    int value = get_cabac_bypass(&s->HEVClc.cc);
+    int value = get_cabac_bypass(&s->HEVClc->cc);
 
     for (i = 1; i < length; i++)
-        value = (value << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        value = (value << 1) | get_cabac_bypass(&s->HEVClc->cc);
     return value;
 }
 
-int ff_hevc_significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
+static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
 {
     int inc;
 
@@ -768,58 +935,19 @@ int ff_hevc_significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int c
 
     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
 }
-
-int ff_hevc_significant_coeff_flag_decode(HEVCContext *s, int c_idx, int x_c, int y_c,
-                                          int log2_trafo_size, int scan_idx, int prev_sig)
+static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
+                                           int offset, const uint8_t *ctx_idx_map)
 {
-    static const uint8_t ctx_idx_map[] = {
-        0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8
-    };
-    int x_cg = x_c >> 2;
-    int y_cg = y_c >> 2;
-    int sig_ctx, inc;
-
-    if (x_c + y_c == 0) {
-        sig_ctx = 0;
-    } else if (log2_trafo_size == 2) {
-        sig_ctx = ctx_idx_map[(y_c << 2) + x_c];
-    } else {
-        switch (prev_sig) {
-        case 0: {
-                int x_off = x_c & 3;
-                int y_off = y_c & 3;
-                sig_ctx   = ((x_off + y_off) == 0) ? 2 : ((x_off + y_off) <= 2) ? 1 : 0;
-            }
-            break;
-        case 1:
-            sig_ctx = 2 - FFMIN(y_c & 3, 2);
-            break;
-        case 2:
-            sig_ctx = 2 - FFMIN(x_c & 3, 2);
-            break;
-        default:
-            sig_ctx = 2;
-        }
-
-        if (c_idx == 0 && (x_cg > 0 || y_cg > 0))
-            sig_ctx += 3;
-
-        if (log2_trafo_size == 3) {
-            sig_ctx += (scan_idx == SCAN_DIAG) ? 9 : 15;
-        } else {
-            sig_ctx += c_idx ? 12 : 21;
-        }
-    }
-
-    if (c_idx == 0)
-        inc = sig_ctx;
-    else
-        inc = sig_ctx + 27;
-
+    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
 }
 
-int ff_hevc_coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx, int inc)
+static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
+{
+    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
+}
+
+static av_always_inline int coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx, int inc)
 {
 
     if (c_idx > 0)
@@ -828,7 +956,7 @@ int ff_hevc_coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx, int
     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] + inc);
 }
 
-int ff_hevc_coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx, int inc)
+static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx, int inc)
 {
     if (c_idx > 0)
         inc += 4;
@@ -836,37 +964,572 @@ int ff_hevc_coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx, int
     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
 }
 
-int ff_hevc_coeff_abs_level_remaining(HEVCContext *s, int base_level, int rc_rice_param)
+static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
 {
     int prefix = 0;
     int suffix = 0;
     int last_coeff_abs_level_remaining;
     int i;
 
-    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc.cc))
+    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
         prefix++;
     if (prefix == CABAC_MAX_BIN)
         av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
     if (prefix < 3) {
         for (i = 0; i < rc_rice_param; i++)
-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc.cc);
+            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
         last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
     } else {
         int prefix_minus3 = prefix - 3;
         for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc.cc);
+            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
         last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
                                               << rc_rice_param) + suffix;
     }
     return last_coeff_abs_level_remaining;
 }
 
-int ff_hevc_coeff_sign_flag(HEVCContext *s, uint8_t nb)
+static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
 {
     int i;
     int ret = 0;
 
     for (i = 0; i < nb; i++)
-        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
     return ret;
 }
+
+void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                int log2_trafo_size, enum ScanType scan_idx,
+                                int c_idx)
+{
+#define GET_COORD(offset, n)                                    \
+    do {                                                        \
+        x_c = (x_cg << 2) + scan_x_off[n];                      \
+        y_c = (y_cg << 2) + scan_y_off[n];                      \
+    } while (0)
+    HEVCLocalContext *lc = s->HEVClc;
+    int transform_skip_flag = 0;
+
+    int last_significant_coeff_x, last_significant_coeff_y;
+    int last_scan_pos;
+    int n_end;
+    int num_coeff = 0;
+    int greater1_ctx = 1;
+
+    int num_last_subset;
+    int x_cg_last_sig, y_cg_last_sig;
+
+    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+
+    ptrdiff_t stride = s->frame->linesize[c_idx];
+    int hshift = s->ps.sps->hshift[c_idx];
+    int vshift = s->ps.sps->vshift[c_idx];
+    uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+                                          ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+    int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+    uint8_t significant_coeff_group_flag[8][8] = {{0}};
+    int explicit_rdpcm_flag = 0;
+    int explicit_rdpcm_dir_flag;
+
+    int trafo_size = 1 << log2_trafo_size;
+    int i;
+    int qp,shift,add,scale,scale_m;
+    const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
+    const uint8_t *scale_matrix = NULL;
+    uint8_t dc_scale;
+    int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
+                                         lc->tu.intra_pred_mode_c;
+
+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+
+    // Derive QP for dequant
+    if (!lc->cu.cu_transquant_bypass_flag) {
+        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+        static const uint8_t rem6[51 + 4 * 6 + 1] = {
+            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+            4, 5, 0, 1, 2, 3, 4, 5, 0, 1
+        };
+
+        static const uint8_t div6[51 + 4 * 6 + 1] = {
+            0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3,  3,  3,
+            3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,  6,  6,
+            7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
+            10, 10, 11, 11, 11, 11, 11, 11, 12, 12
+        };
+        int qp_y = lc->qp_y;
+
+        if (s->ps.pps->transform_skip_enabled_flag &&
+            log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
+            transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
+        }
+
+        if (c_idx == 0) {
+            qp = qp_y + s->ps.sps->qp_bd_offset;
+        } else {
+            int qp_i, offset;
+
+            if (c_idx == 1)
+                offset = s->ps.pps->cb_qp_offset + s->sh.slice_cb_qp_offset +
+                         lc->tu.cu_qp_offset_cb;
+            else
+                offset = s->ps.pps->cr_qp_offset + s->sh.slice_cr_qp_offset +
+                         lc->tu.cu_qp_offset_cr;
+
+            qp_i = av_clip(qp_y + offset, - s->ps.sps->qp_bd_offset, 57);
+            if (s->ps.sps->chroma_format_idc == 1) {
+                if (qp_i < 30)
+                    qp = qp_i;
+                else if (qp_i > 43)
+                    qp = qp_i - 6;
+                else
+                    qp = qp_c[qp_i - 30];
+            } else {
+                if (qp_i > 51)
+                    qp = 51;
+                else
+                    qp = qp_i;
+            }
+
+            qp += s->ps.sps->qp_bd_offset;
+        }
+
+        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
+        add      = 1 << (shift-1);
+        scale    = level_scale[rem6[qp]] << (div6[qp]);
+        scale_m  = 16; // default when no custom scaling lists.
+        dc_scale = 16;
+
+        if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+            const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
+            &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+            int matrix_id = lc->cu.pred_mode != MODE_INTRA;
+
+            matrix_id = 3 * matrix_id + c_idx;
+
+            scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+            if (log2_trafo_size >= 4)
+                dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
+        }
+    } else {
+        shift        = 0;
+        add          = 0;
+        scale        = 0;
+        dc_scale     = 0;
+    }
+
+    if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
+        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+        if (explicit_rdpcm_flag) {
+            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
+        }
+    }
+
+    last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
+                                           &last_significant_coeff_x, &last_significant_coeff_y);
+
+    if (last_significant_coeff_x > 3) {
+        int suffix = last_significant_coeff_suffix_decode(s, last_significant_coeff_x);
+        last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
+        (2 + (last_significant_coeff_x & 1)) +
+        suffix;
+    }
+
+    if (last_significant_coeff_y > 3) {
+        int suffix = last_significant_coeff_suffix_decode(s, last_significant_coeff_y);
+        last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
+        (2 + (last_significant_coeff_y & 1)) +
+        suffix;
+    }
+
+    if (scan_idx == SCAN_VERT)
+        FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
+
+    x_cg_last_sig = last_significant_coeff_x >> 2;
+    y_cg_last_sig = last_significant_coeff_y >> 2;
+
+    switch (scan_idx) {
+    case SCAN_DIAG: {
+        int last_x_c = last_significant_coeff_x & 3;
+        int last_y_c = last_significant_coeff_y & 3;
+
+        scan_x_off = ff_hevc_diag_scan4x4_x;
+        scan_y_off = ff_hevc_diag_scan4x4_y;
+        num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
+        if (trafo_size == 4) {
+            scan_x_cg = scan_1x1;
+            scan_y_cg = scan_1x1;
+        } else if (trafo_size == 8) {
+            num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+            scan_x_cg = diag_scan2x2_x;
+            scan_y_cg = diag_scan2x2_y;
+        } else if (trafo_size == 16) {
+            num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+            scan_x_cg = ff_hevc_diag_scan4x4_x;
+            scan_y_cg = ff_hevc_diag_scan4x4_y;
+        } else { // trafo_size == 32
+            num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+            scan_x_cg = ff_hevc_diag_scan8x8_x;
+            scan_y_cg = ff_hevc_diag_scan8x8_y;
+        }
+        break;
+    }
+    case SCAN_HORIZ:
+        scan_x_cg = horiz_scan2x2_x;
+        scan_y_cg = horiz_scan2x2_y;
+        scan_x_off = horiz_scan4x4_x;
+        scan_y_off = horiz_scan4x4_y;
+        num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
+        break;
+    default: //SCAN_VERT
+        scan_x_cg = horiz_scan2x2_y;
+        scan_y_cg = horiz_scan2x2_x;
+        scan_x_off = horiz_scan4x4_y;
+        scan_y_off = horiz_scan4x4_x;
+        num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
+        break;
+    }
+    num_coeff++;
+    num_last_subset = (num_coeff - 1) >> 4;
+
+    for (i = num_last_subset; i >= 0; i--) {
+        int n, m;
+        int x_cg, y_cg, x_c, y_c, pos;
+        int implicit_non_zero_coeff = 0;
+        int64_t trans_coeff_level;
+        int prev_sig = 0;
+        int offset = i << 4;
+        int rice_init = 0;
+
+        uint8_t significant_coeff_flag_idx[16];
+        uint8_t nb_significant_coeff_flag = 0;
+
+        x_cg = scan_x_cg[i];
+        y_cg = scan_y_cg[i];
+
+        if ((i < num_last_subset) && (i > 0)) {
+            int ctx_cg = 0;
+            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
+                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
+            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
+                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+
+            significant_coeff_group_flag[x_cg][y_cg] =
+                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
+            implicit_non_zero_coeff = 1;
+        } else {
+            significant_coeff_group_flag[x_cg][y_cg] =
+            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
+             (x_cg == 0 && y_cg == 0));
+        }
+
+        last_scan_pos = num_coeff - offset - 1;
+
+        if (i == num_last_subset) {
+            n_end = last_scan_pos - 1;
+            significant_coeff_flag_idx[0] = last_scan_pos;
+            nb_significant_coeff_flag = 1;
+        } else {
+            n_end = 15;
+        }
+
+        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
+            prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
+        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
+            prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
+
+        if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
+            static const uint8_t ctx_idx_map[] = {
+                0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
+                1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
+                2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
+                2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
+                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  // default
+            };
+            const uint8_t *ctx_idx_map_p;
+            int scf_offset = 0;
+            if (s->ps.sps->transform_skip_context_enabled_flag &&
+                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+                ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
+                if (c_idx == 0) {
+                    scf_offset = 40;
+                } else {
+                    scf_offset = 14 + 27;
+                }
+            } else {
+                if (c_idx != 0)
+                    scf_offset = 27;
+                if (log2_trafo_size == 2) {
+                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
+                } else {
+                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
+                    if (c_idx == 0) {
+                        if ((x_cg > 0 || y_cg > 0))
+                            scf_offset += 3;
+                        if (log2_trafo_size == 3) {
+                            scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
+                        } else {
+                            scf_offset += 21;
+                        }
+                    } else {
+                        if (log2_trafo_size == 3)
+                            scf_offset += 9;
+                        else
+                            scf_offset += 12;
+                    }
+                }
+            }
+            for (n = n_end; n > 0; n--) {
+                x_c = scan_x_off[n];
+                y_c = scan_y_off[n];
+                if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
+                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
+                    nb_significant_coeff_flag++;
+                    implicit_non_zero_coeff = 0;
+                }
+            }
+            if (implicit_non_zero_coeff == 0) {
+                if (s->ps.sps->transform_skip_context_enabled_flag &&
+                    (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+                    if (c_idx == 0) {
+                        scf_offset = 42;
+                    } else {
+                        scf_offset = 16 + 27;
+                    }
+                } else {
+                    if (i == 0) {
+                        if (c_idx == 0)
+                            scf_offset = 0;
+                        else
+                            scf_offset = 27;
+                    } else {
+                        scf_offset = 2 + scf_offset;
+                    }
+                }
+                if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
+                    significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+                    nb_significant_coeff_flag++;
+                }
+            } else {
+                significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+                nb_significant_coeff_flag++;
+            }
+        }
+
+        n_end = nb_significant_coeff_flag;
+
+
+        if (n_end) {
+            int first_nz_pos_in_cg;
+            int last_nz_pos_in_cg;
+            int c_rice_param = 0;
+            int first_greater1_coeff_idx = -1;
+            uint8_t coeff_abs_level_greater1_flag[8];
+            uint16_t coeff_sign_flag;
+            int sum_abs = 0;
+            int sign_hidden;
+            int sb_type;
+
+
+            // initialize first elem of coeff_bas_level_greater1_flag
+            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+
+            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
+                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
+                else
+                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
+                c_rice_param = lc->stat_coeff[sb_type] / 4;
+            }
+
+            if (!(i == num_last_subset) && greater1_ctx == 0)
+                ctx_set++;
+            greater1_ctx = 1;
+            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
+
+            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
+                int inc = (ctx_set << 2) + greater1_ctx;
+                coeff_abs_level_greater1_flag[m] =
+                    coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
+                if (coeff_abs_level_greater1_flag[m]) {
+                    greater1_ctx = 0;
+                    if (first_greater1_coeff_idx == -1)
+                        first_greater1_coeff_idx = m;
+                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
+                    greater1_ctx++;
+                }
+            }
+            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
+
+            if (lc->cu.cu_transquant_bypass_flag ||
+                (lc->cu.pred_mode ==  MODE_INTRA  &&
+                 s->ps.sps->implicit_rdpcm_enabled_flag  &&  transform_skip_flag  &&
+                 (pred_mode_intra == 10 || pred_mode_intra  ==  26 )) ||
+                 explicit_rdpcm_flag)
+                sign_hidden = 0;
+            else
+                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
+
+            if (first_greater1_coeff_idx != -1) {
+                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
+            }
+            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
+                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
+            } else {
+                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
+            }
+
+            for (m = 0; m < n_end; m++) {
+                n = significant_coeff_flag_idx[m];
+                GET_COORD(offset, n);
+                if (m < 8) {
+                    trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
+                    if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
+                        int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+
+                        trans_coeff_level += last_coeff_abs_level_remaining;
+                        if (trans_coeff_level > (3 << c_rice_param))
+                            c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
+                        if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
+                            int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
+                            if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
+                                lc->stat_coeff[sb_type]++;
+                            else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
+                                if (lc->stat_coeff[sb_type] > 0)
+                                    lc->stat_coeff[sb_type]--;
+                            rice_init = 1;
+                        }
+                    }
+                } else {
+                    int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+
+                    trans_coeff_level = 1 + last_coeff_abs_level_remaining;
+                    if (trans_coeff_level > (3 << c_rice_param))
+                        c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
+                    if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
+                        int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
+                        if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
+                            lc->stat_coeff[sb_type]++;
+                        else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
+                            if (lc->stat_coeff[sb_type] > 0)
+                                lc->stat_coeff[sb_type]--;
+                        rice_init = 1;
+                    }
+                }
+                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
+                    sum_abs += trans_coeff_level;
+                    if (n == first_nz_pos_in_cg && (sum_abs&1))
+                        trans_coeff_level = -trans_coeff_level;
+                }
+                if (coeff_sign_flag >> 15)
+                    trans_coeff_level = -trans_coeff_level;
+                coeff_sign_flag <<= 1;
+                if(!lc->cu.cu_transquant_bypass_flag) {
+                    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+                        if(y_c || x_c || log2_trafo_size < 4) {
+                            switch(log2_trafo_size) {
+                                case 3: pos = (y_c << 3) + x_c; break;
+                                case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
+                                case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
+                                default: pos = (y_c << 2) + x_c; break;
+                            }
+                            scale_m = scale_matrix[pos];
+                        } else {
+                            scale_m = dc_scale;
+                        }
+                    }
+                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
+                    if(trans_coeff_level < 0) {
+                        if((~trans_coeff_level) & 0xFffffffffff8000)
+                            trans_coeff_level = -32768;
+                    } else {
+                        if(trans_coeff_level & 0xffffffffffff8000)
+                            trans_coeff_level = 32767;
+                    }
+                }
+                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+            }
+        }
+    }
+
+    if (lc->cu.cu_transquant_bypass_flag) {
+        if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+                                    (pred_mode_intra == 10 || pred_mode_intra == 26))) {
+            int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag;
+
+            s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+        }
+    } else {
+        if (transform_skip_flag) {
+            int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
+                      log2_trafo_size == 2 &&
+                      lc->cu.pred_mode == MODE_INTRA;
+            if (rot) {
+                for (i = 0; i < 8; i++)
+                    FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
+            }
+
+            s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
+
+            if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+                                        lc->cu.pred_mode == MODE_INTRA &&
+                                        (pred_mode_intra == 10 || pred_mode_intra == 26))) {
+                int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26);
+
+                s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+            }
+        } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
+            s->hevcdsp.idct_4x4_luma(coeffs);
+        } else {
+            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+            if (max_xy == 0)
+                s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+            else {
+                int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+                if (max_xy < 4)
+                    col_limit = FFMIN(4, col_limit);
+                else if (max_xy < 8)
+                    col_limit = FFMIN(8, col_limit);
+                else if (max_xy < 12)
+                    col_limit = FFMIN(24, col_limit);
+                s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+            }
+        }
+    }
+    if (lc->tu.cross_pf) {
+        int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+
+        for (i = 0; i < (trafo_size * trafo_size); i++) {
+            coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+        }
+    }
+    s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+}
+
+void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
+{
+    HEVCLocalContext *lc = s->HEVClc;
+    int x = abs_mvd_greater0_flag_decode(s);
+    int y = abs_mvd_greater0_flag_decode(s);
+
+    if (x)
+        x += abs_mvd_greater1_flag_decode(s);
+    if (y)
+        y += abs_mvd_greater1_flag_decode(s);
+
+    switch (x) {
+    case 2: lc->pu.mvd.x = mvd_decode(s);           break;
+    case 1: lc->pu.mvd.x = mvd_sign_flag_decode(s); break;
+    case 0: lc->pu.mvd.x = 0;                       break;
+    }
+
+    switch (y) {
+    case 2: lc->pu.mvd.y = mvd_decode(s);           break;
+    case 1: lc->pu.mvd.y = mvd_sign_flag_decode(s); break;
+    case 0: lc->pu.mvd.y = 0;                       break;
+    }
+}
+
diff --git a/libavcodec/hevc_data.c b/libavcodec/hevc_data.c
index f4b6096aec..f74f2725c0 100644
--- a/libavcodec/hevc_data.c
+++ b/libavcodec/hevc_data.c
@@ -1,20 +1,20 @@
 /*
  * HEVC shared tables
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
index 769977b44d..1f33b0cdfe 100644
--- a/libavcodec/hevc_filter.c
+++ b/libavcodec/hevc_filter.c
@@ -5,20 +5,20 @@
  * Copyright (C) 2013 Seppo Tomperi
  * Copyright (C) 2013 Wassim Hamidouche
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,8 @@
 #include "golomb.h"
 #include "hevc.h"
 
+#include "bit_depth_template.c"
+
 #define LUMA 0
 #define CB 1
 #define CR 2
@@ -59,28 +61,30 @@ static int chroma_tc(HEVCContext *s, int qp_y, int c_idx, int tc_offset)
         offset = s->ps.pps->cr_qp_offset;
 
     qp_i = av_clip(qp_y + offset, 0, 57);
-    if (qp_i < 30)
-        qp = qp_i;
-    else if (qp_i > 43)
-        qp = qp_i - 6;
-    else
-        qp = qp_c[qp_i - 30];
+    if (s->ps.sps->chroma_format_idc == 1) {
+        if (qp_i < 30)
+            qp = qp_i;
+        else if (qp_i > 43)
+            qp = qp_i - 6;
+        else
+            qp = qp_c[qp_i - 30];
+    } else {
+        qp = av_clip(qp_i, 0, 51);
+    }
 
     idxt = av_clip(qp + DEFAULT_INTRA_TC_OFFSET + tc_offset, 0, 53);
     return tctable[idxt];
 }
 
-static int get_qPy_pred(HEVCContext *s, int xC, int yC,
-                        int xBase, int yBase, int log2_cb_size)
+static int get_qPy_pred(HEVCContext *s, int xBase, int yBase, int log2_cb_size)
 {
-    HEVCLocalContext *lc     = &s->HEVClc;
+    HEVCLocalContext *lc     = s->HEVClc;
     int ctb_size_mask        = (1 << s->ps.sps->log2_ctb_size) - 1;
     int MinCuQpDeltaSizeMask = (1 << (s->ps.sps->log2_ctb_size -
                                       s->ps.pps->diff_cu_qp_delta_depth)) - 1;
     int xQgBase              = xBase - (xBase & MinCuQpDeltaSizeMask);
     int yQgBase              = yBase - (yBase & MinCuQpDeltaSizeMask);
     int min_cb_width         = s->ps.sps->min_cb_width;
-    int min_cb_height        = s->ps.sps->min_cb_height;
     int x_cb                 = xQgBase >> s->ps.sps->log2_min_cb_size;
     int y_cb                 = yQgBase >> s->ps.sps->log2_min_cb_size;
     int availableA           = (xBase   & ctb_size_mask) &&
@@ -94,46 +98,7 @@ static int get_qPy_pred(HEVCContext *s, int xC, int yC,
         lc->first_qp_group = !lc->tu.is_cu_qp_delta_coded;
         qPy_pred = s->sh.slice_qp;
     } else {
-        qPy_pred = lc->qp_y;
-        if (log2_cb_size < s->ps.sps->log2_ctb_size -
-                           s->ps.pps->diff_cu_qp_delta_depth) {
-            static const int offsetX[8][8] = {
-                { -1, 1, 3, 1, 7, 1, 3, 1 },
-                {  0, 0, 0, 0, 0, 0, 0, 0 },
-                {  1, 3, 1, 3, 1, 3, 1, 3 },
-                {  2, 2, 2, 2, 2, 2, 2, 2 },
-                {  3, 5, 7, 5, 3, 5, 7, 5 },
-                {  4, 4, 4, 4, 4, 4, 4, 4 },
-                {  5, 7, 5, 7, 5, 7, 5, 7 },
-                {  6, 6, 6, 6, 6, 6, 6, 6 }
-            };
-            static const int offsetY[8][8] = {
-                { 7, 0, 1, 2, 3, 4, 5, 6 },
-                { 0, 1, 2, 3, 4, 5, 6, 7 },
-                { 1, 0, 3, 2, 5, 4, 7, 6 },
-                { 0, 1, 2, 3, 4, 5, 6, 7 },
-                { 3, 0, 1, 2, 7, 4, 5, 6 },
-                { 0, 1, 2, 3, 4, 5, 6, 7 },
-                { 1, 0, 3, 2, 5, 4, 7, 6 },
-                { 0, 1, 2, 3, 4, 5, 6, 7 }
-            };
-            int xC0b = (xC - (xC & ctb_size_mask)) >> s->ps.sps->log2_min_cb_size;
-            int yC0b = (yC - (yC & ctb_size_mask)) >> s->ps.sps->log2_min_cb_size;
-            int idxX = (xQgBase  & ctb_size_mask)  >> s->ps.sps->log2_min_cb_size;
-            int idxY = (yQgBase  & ctb_size_mask)  >> s->ps.sps->log2_min_cb_size;
-            int idx_mask = ctb_size_mask >> s->ps.sps->log2_min_cb_size;
-            int x, y;
-
-            x = FFMIN(xC0b +  offsetX[idxX][idxY],             min_cb_width  - 1);
-            y = FFMIN(yC0b + (offsetY[idxX][idxY] & idx_mask), min_cb_height - 1);
-
-            if (xC0b == (lc->start_of_tiles_x >> s->ps.sps->log2_min_cb_size) &&
-                offsetX[idxX][idxY] == -1) {
-                x = (lc->end_of_tiles_x >> s->ps.sps->log2_min_cb_size) - 1;
-                y = yC0b - 1;
-            }
-            qPy_pred = s->qp_y_tab[y * min_cb_width + x];
-        }
+        qPy_pred = lc->qPy_pred;
     }
 
     // qPy_a
@@ -148,20 +113,22 @@ static int get_qPy_pred(HEVCContext *s, int xC, int yC,
     else
         qPy_b = s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width];
 
+    av_assert2(qPy_a >= -s->ps.sps->qp_bd_offset && qPy_a < 52);
+    av_assert2(qPy_b >= -s->ps.sps->qp_bd_offset && qPy_b < 52);
+
     return (qPy_a + qPy_b + 1) >> 1;
 }
 
-void ff_hevc_set_qPy(HEVCContext *s, int xC, int yC,
-                     int xBase, int yBase, int log2_cb_size)
+void ff_hevc_set_qPy(HEVCContext *s, int xBase, int yBase, int log2_cb_size)
 {
-    int qp_y = get_qPy_pred(s, xC, yC, xBase, yBase, log2_cb_size);
+    int qp_y = get_qPy_pred(s, xBase, yBase, log2_cb_size);
 
-    if (s->HEVClc.tu.cu_qp_delta != 0) {
+    if (s->HEVClc->tu.cu_qp_delta != 0) {
         int off = s->ps.sps->qp_bd_offset;
-        s->HEVClc.qp_y = FFUMOD(qp_y + s->HEVClc.tu.cu_qp_delta + 52 + 2 * off,
-                                52 + off) - off;
+        s->HEVClc->qp_y = FFUMOD(qp_y + s->HEVClc->tu.cu_qp_delta + 52 + 2 * off,
+                                 52 + off) - off;
     } else
-        s->HEVClc.qp_y = qp_y;
+        s->HEVClc->qp_y = qp_y;
 }
 
 static int get_qPy(HEVCContext *s, int xC, int yC)
@@ -172,15 +139,106 @@ static int get_qPy(HEVCContext *s, int xC, int yC)
     return s->qp_y_tab[x + y * s->ps.sps->min_cb_width];
 }
 
-static void copy_CTB(uint8_t *dst, uint8_t *src,
-                     int width, int height, int stride)
+static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height,
+                     intptr_t stride_dst, intptr_t stride_src)
+{
+int i, j;
+
+    if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) {
+        for (i = 0; i < height; i++) {
+            for (j = 0; j < width; j+=8)
+                AV_COPY64U(dst+j, src+j);
+            dst += stride_dst;
+            src += stride_src;
+        }
+    } else {
+        for (i = 0; i < height; i++) {
+            for (j = 0; j < width; j+=16)
+                AV_COPY128(dst+j, src+j);
+            dst += stride_dst;
+            src += stride_src;
+        }
+    }
+}
+
+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
+{
+    if (pixel_shift)
+        *(uint16_t *)dst = *(uint16_t *)src;
+    else
+        *dst = *src;
+}
+
+static void copy_vert(uint8_t *dst, const uint8_t *src,
+                      int pixel_shift, int height,
+                      int stride_dst, int stride_src)
 {
     int i;
+    if (pixel_shift == 0) {
+        for (i = 0; i < height; i++) {
+            *dst = *src;
+            dst += stride_dst;
+            src += stride_src;
+        }
+    } else {
+        for (i = 0; i < height; i++) {
+            *(uint16_t *)dst = *(uint16_t *)src;
+            dst += stride_dst;
+            src += stride_src;
+        }
+    }
+}
+
+static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src,
+                           int stride_src, int x, int y, int width, int height,
+                           int c_idx, int x_ctb, int y_ctb)
+{
+    int sh = s->ps.sps->pixel_shift;
+    int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
+    int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
 
-    for (i = 0; i < height; i++) {
-        memcpy(dst, src, width);
-        dst += stride;
-        src += stride;
+    /* copy horizontal edges */
+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh),
+        src, width << sh);
+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh),
+        src + stride_src * (height - 1), width << sh);
+
+    /* copy vertical edges */
+    copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
+
+    copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
+}
+
+static void restore_tqb_pixels(HEVCContext *s,
+                               uint8_t *src1, const uint8_t *dst1,
+                               ptrdiff_t stride_src, ptrdiff_t stride_dst,
+                               int x0, int y0, int width, int height, int c_idx)
+{
+    if ( s->ps.pps->transquant_bypass_enable_flag ||
+            (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) {
+        int x, y;
+        int min_pu_size  = 1 << s->ps.sps->log2_min_pu_size;
+        int hshift       = s->ps.sps->hshift[c_idx];
+        int vshift       = s->ps.sps->vshift[c_idx];
+        int x_min        = ((x0         ) >> s->ps.sps->log2_min_pu_size);
+        int y_min        = ((y0         ) >> s->ps.sps->log2_min_pu_size);
+        int x_max        = ((x0 + width ) >> s->ps.sps->log2_min_pu_size);
+        int y_max        = ((y0 + height) >> s->ps.sps->log2_min_pu_size);
+        int len          = (min_pu_size >> hshift) << s->ps.sps->pixel_shift;
+        for (y = y_min; y < y_max; y++) {
+            for (x = x_min; x < x_max; x++) {
+                if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) {
+                    int n;
+                    uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
+                    const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
+                    for (n = 0; n < (min_pu_size >> vshift); n++) {
+                        memcpy(src, dst, len);
+                        src += stride_src;
+                        dst += stride_dst;
+                    }
+                }
+            }
+        }
     }
 }
 
@@ -188,128 +246,209 @@ static void copy_CTB(uint8_t *dst, uint8_t *src,
 
 static void sao_filter_CTB(HEVCContext *s, int x, int y)
 {
-    //  TODO: This should be easily parallelizable
-    //  TODO: skip CBs when (cu_transquant_bypass_flag || (pcm_loop_filter_disable_flag && pcm_flag))
-    int c_idx = 0;
-    int class = 1, class_index;
+    static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 };
+    HEVCLocalContext *lc = s->HEVClc;
+    int c_idx;
     int edges[4];  // 0 left 1 top 2 right 3 bottom
-    SAOParams *sao[4];
-    int classes[4];
-    int x_shift = 0, y_shift = 0;
-    int x_ctb = x >> s->ps.sps->log2_ctb_size;
-    int y_ctb = y >> s->ps.sps->log2_ctb_size;
-    int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb;
-    int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
-
+    int x_ctb                = x >> s->ps.sps->log2_ctb_size;
+    int y_ctb                = y >> s->ps.sps->log2_ctb_size;
+    int ctb_addr_rs          = y_ctb * s->ps.sps->ctb_width + x_ctb;
+    int ctb_addr_ts          = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
+    SAOParams *sao           = &CTB(s->sao, x_ctb, y_ctb);
     // flags indicating unfilterable edges
-    uint8_t vert_edge[]  = { 0, 0, 0, 0 };
-    uint8_t horiz_edge[] = { 0, 0, 0, 0 };
-    uint8_t diag_edge[]  = { 0, 0, 0, 0 };
-    uint8_t lfase[3]; // current, above, left
-    uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag &&
-                             !s->ps.pps->loop_filter_across_tiles_enabled_flag;
-    uint8_t left_tile_edge = 0, up_tile_edge = 0;
-
-    sao[0]     = &CTB(s->sao, x_ctb, y_ctb);
+    uint8_t vert_edge[]      = { 0, 0 };
+    uint8_t horiz_edge[]     = { 0, 0 };
+    uint8_t diag_edge[]      = { 0, 0, 0, 0 };
+    uint8_t lfase            = CTB(s->filter_slice_edges, x_ctb, y_ctb);
+    uint8_t no_tile_filter   = s->ps.pps->tiles_enabled_flag &&
+                               !s->ps.pps->loop_filter_across_tiles_enabled_flag;
+    uint8_t restore          = no_tile_filter || !lfase;
+    uint8_t left_tile_edge   = 0;
+    uint8_t right_tile_edge  = 0;
+    uint8_t up_tile_edge     = 0;
+    uint8_t bottom_tile_edge = 0;
+
     edges[0]   = x_ctb == 0;
     edges[1]   = y_ctb == 0;
     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
     edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
-    lfase[0]   = CTB(s->filter_slice_edges, x_ctb, y_ctb);
-    classes[0] = 0;
-
-    if (!edges[0]) {
-        left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
-        sao[class] = &CTB(s->sao, x_ctb - 1, y_ctb);
-        vert_edge[0] = (!lfase[0] && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
-        vert_edge[2] = vert_edge[0];
-        lfase[2]     = CTB(s->filter_slice_edges, x_ctb - 1, y_ctb);
-        classes[class] = 2;
-        class++;
-        x_shift = 8;
-    }
-
-    if (!edges[1]) {
-        up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
-        sao[class] = &CTB(s->sao, x_ctb, y_ctb - 1);
-        horiz_edge[0] = (!lfase[0] && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
-        horiz_edge[1] = horiz_edge[0];
-        lfase[1] = CTB(s->filter_slice_edges, x_ctb, y_ctb - 1);
-        classes[class] = 1;
-        class++;
-        y_shift = 4;
 
+    if (restore) {
         if (!edges[0]) {
-            classes[class] = 3;
-            sao[class] = &CTB(s->sao, x_ctb - 1, y_ctb - 1);
-            class++;
-
-            // Tile check here is done current CTB row/col, not above/left like you'd expect,
-            //but that is because the tile boundary always extends through the whole pic
-            vert_edge[1] = (!lfase[1] && CTB(s->tab_slice_address, x_ctb, y_ctb - 1) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge;
-            vert_edge[3] = vert_edge[1];
-            horiz_edge[2] = (!lfase[2] && CTB(s->tab_slice_address, x_ctb - 1, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || up_tile_edge;
-            horiz_edge[3] = horiz_edge[2];
-            diag_edge[0] = (!lfase[0] && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
-            diag_edge[3] = diag_edge[0];
-
-            // Does left CTB comes after above CTB?
-            if (CTB(s->tab_slice_address, x_ctb - 1, y_ctb) >
-                CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) {
-                diag_edge[2] = !lfase[2] || left_tile_edge || up_tile_edge;
-                diag_edge[1] = diag_edge[2];
-            } else if (CTB(s->tab_slice_address, x_ctb - 1, y_ctb) <
-                       CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) {
-                diag_edge[1] = !lfase[1] || left_tile_edge || up_tile_edge;
-                diag_edge[2] = diag_edge[1];
-            } else {
-                // Same slice, only consider tiles
-                diag_edge[2] = left_tile_edge || up_tile_edge;
-                diag_edge[1] = diag_edge[2];
-            }
+            left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
+            vert_edge[0]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
+        }
+        if (!edges[2]) {
+            right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]];
+            vert_edge[1]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge;
+        }
+        if (!edges[1]) {
+            up_tile_edge     = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
+            horiz_edge[0]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
+        }
+        if (!edges[3]) {
+            bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]];
+            horiz_edge[1]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge;
+        }
+        if (!edges[0] && !edges[1]) {
+            diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
+        }
+        if (!edges[1] && !edges[2]) {
+            diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge;
+        }
+        if (!edges[2] && !edges[3]) {
+            diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge;
+        }
+        if (!edges[0] && !edges[3]) {
+            diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge;
         }
     }
 
-    for (c_idx = 0; c_idx < 3; c_idx++) {
-        int chroma = c_idx ? 1 : 0;
-        int x0 = x >> chroma;
-        int y0 = y >> chroma;
-        int stride = s->frame->linesize[c_idx];
-        int ctb_size = (1 << (s->ps.sps->log2_ctb_size)) >> s->ps.sps->hshift[c_idx];
-        int width = FFMIN(ctb_size,
-                          (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0);
-        int height = FFMIN(ctb_size,
-                           (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0);
-
-        uint8_t *src = &s->frame->data[c_idx][y0 * stride + (x0 << s->ps.sps->pixel_shift)];
-        uint8_t *dst = &s->sao_frame->data[c_idx][y0 * stride + (x0 << s->ps.sps->pixel_shift)];
-        int offset = (y_shift >> chroma) * stride + ((x_shift >> chroma) << s->ps.sps->pixel_shift);
-
-        copy_CTB(dst - offset, src - offset,
-                 (edges[2] ? width  + (x_shift >> chroma) : width)  << s->ps.sps->pixel_shift,
-                 (edges[3] ? height + (y_shift >> chroma) : height), stride);
-
-        for (class_index = 0; class_index < class; class_index++) {
-
-            switch (sao[class_index]->type_idx[c_idx]) {
-            case SAO_BAND:
-                s->hevcdsp.sao_band_filter[classes[class_index]](dst, src,
-                                                                 stride,
-                                                                 sao[class_index],
-                                                                 edges, width,
-                                                                 height, c_idx);
-                break;
-            case SAO_EDGE:
-                s->hevcdsp.sao_edge_filter[classes[class_index]](dst, src,
-                                                                 stride,
-                                                                 sao[class_index],
-                                                                 edges, width,
-                                                                 height, c_idx,
-                                                                 vert_edge[classes[class_index]],
-                                                                 horiz_edge[classes[class_index]],
-                                                                 diag_edge[classes[class_index]]);
-                break;
+    for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) {
+        int x0       = x >> s->ps.sps->hshift[c_idx];
+        int y0       = y >> s->ps.sps->vshift[c_idx];
+        int stride_src = s->frame->linesize[c_idx];
+        int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> s->ps.sps->hshift[c_idx];
+        int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> s->ps.sps->vshift[c_idx];
+        int width    = FFMIN(ctb_size_h, (s->ps.sps->width  >> s->ps.sps->hshift[c_idx]) - x0);
+        int height   = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0);
+        int tab      = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
+        uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)];
+        int stride_dst;
+        uint8_t *dst;
+
+        switch (sao->type_idx[c_idx]) {
+        case SAO_BAND:
+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+                           x_ctb, y_ctb);
+            if (s->ps.pps->transquant_bypass_enable_flag ||
+                (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) {
+            dst = lc->edge_emu_buffer;
+            stride_dst = 2*MAX_PB_SIZE;
+            copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src);
+            s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
+                                            sao->offset_val[c_idx], sao->band_position[c_idx],
+                                            width, height);
+            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+                               x, y, width, height, c_idx);
+            } else {
+            s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
+                                            sao->offset_val[c_idx], sao->band_position[c_idx],
+                                            width, height);
+            }
+            sao->type_idx[c_idx] = SAO_APPLIED;
+            break;
+        case SAO_EDGE:
+        {
+            int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
+            int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
+            int left_edge = edges[0];
+            int top_edge = edges[1];
+            int right_edge = edges[2];
+            int bottom_edge = edges[3];
+            int sh = s->ps.sps->pixel_shift;
+            int left_pixels, right_pixels;
+
+            stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE;
+            dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE;
+
+            if (!top_edge) {
+                int left = 1 - left_edge;
+                int right = 1 - right_edge;
+                const uint8_t *src1[2];
+                uint8_t *dst1;
+                int src_idx, pos;
+
+                dst1 = dst - stride_dst - (left << sh);
+                src1[0] = src - stride_src - (left << sh);
+                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh);
+                pos = 0;
+                if (left) {
+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1, src1[src_idx], sh);
+                    pos += (1 << sh);
+                }
+                src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
+                           SAO_APPLIED);
+                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+                if (right) {
+                    pos += width << sh;
+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+                }
             }
+            if (!bottom_edge) {
+                int left = 1 - left_edge;
+                int right = 1 - right_edge;
+                const uint8_t *src1[2];
+                uint8_t *dst1;
+                int src_idx, pos;
+
+                dst1 = dst + height * stride_dst - (left << sh);
+                src1[0] = src + height * stride_src - (left << sh);
+                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
+                pos = 0;
+                if (left) {
+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1, src1[src_idx], sh);
+                    pos += (1 << sh);
+                }
+                src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
+                           SAO_APPLIED);
+                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+                if (right) {
+                    pos += width << sh;
+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+                }
+            }
+            left_pixels = 0;
+            if (!left_edge) {
+                if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+                    copy_vert(dst - (1 << sh),
+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
+                              sh, height, stride_dst, 1 << sh);
+                } else {
+                    left_pixels = 1;
+                }
+            }
+            right_pixels = 0;
+            if (!right_edge) {
+                if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+                    copy_vert(dst + (width << sh),
+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
+                              sh, height, stride_dst, 1 << sh);
+                } else {
+                    right_pixels = 1;
+                }
+            }
+
+            copy_CTB(dst - (left_pixels << sh),
+                     src - (left_pixels << sh),
+                     (width + left_pixels + right_pixels) << sh,
+                     height, stride_dst, stride_src);
+
+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+                           x_ctb, y_ctb);
+            s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
+                                            sao->eo_class[c_idx], width, height);
+            s->hevcdsp.sao_edge_restore[restore](src, dst,
+                                                stride_src, stride_dst,
+                                                sao,
+                                                edges, width,
+                                                height, c_idx,
+                                                vert_edge,
+                                                horiz_edge,
+                                                diag_edge);
+            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+                               x, y, width, height, c_idx);
+            sao->type_idx[c_idx] = SAO_APPLIED;
+            break;
+        }
         }
     }
 }
@@ -338,18 +477,21 @@ static int get_pcm(HEVCContext *s, int x, int y)
 static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
 {
     uint8_t *src;
-    int x, y, x_end, y_end, chroma;
-    int c_tc[2], tc[2], beta;
+    int x, y;
+    int chroma, beta;
+    int32_t c_tc[2], tc[2];
     uint8_t no_p[2] = { 0 };
     uint8_t no_q[2] = { 0 };
 
     int log2_ctb_size = s->ps.sps->log2_ctb_size;
+    int x_end, x_end2, y_end;
     int ctb_size        = 1 << log2_ctb_size;
     int ctb             = (x0 >> log2_ctb_size) +
                           (y0 >> log2_ctb_size) * s->ps.sps->ctb_width;
     int cur_tc_offset   = s->deblock[ctb].tc_offset;
     int cur_beta_offset = s->deblock[ctb].beta_offset;
-    int tc_offset, left_tc_offset, beta_offset, left_beta_offset;
+    int left_tc_offset, left_beta_offset;
+    int tc_offset, beta_offset;
     int pcmf = (s->ps.sps->pcm_enabled_flag &&
                 s->ps.sps->pcm.loop_filter_disable_flag) ||
                s->ps.pps->transquant_bypass_enable_flag;
@@ -357,6 +499,9 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
     if (x0) {
         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
         left_beta_offset = s->deblock[ctb - 1].beta_offset;
+    } else {
+        left_tc_offset   = 0;
+        left_beta_offset = 0;
     }
 
     x_end = x0 + ctb_size;
@@ -369,11 +514,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
     tc_offset   = cur_tc_offset;
     beta_offset = cur_beta_offset;
 
-    // vertical filtering luma
+    x_end2 = x_end;
+    if (x_end2 != s->ps.sps->width)
+        x_end2 -= 8;
     for (y = y0; y < y_end; y += 8) {
+        // vertical filtering luma
         for (x = x0 ? x0 : 8; x < x_end; x += 8) {
-            const int bs0 = s->vertical_bs[(x >> 3) + (y       >> 2) * s->bs_width];
-            const int bs1 = s->vertical_bs[(x >> 3) + ((y + 4) >> 2) * s->bs_width];
+            const int bs0 = s->vertical_bs[(x +  y      * s->bs_width) >> 2];
+            const int bs1 = s->vertical_bs[(x + (y + 4) * s->bs_width) >> 2];
             if (bs0 || bs1) {
                 const int qp = (get_qPy(s, x - 1, y)     + get_qPy(s, x, y)     + 1) >> 1;
 
@@ -396,45 +544,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                        beta, tc, no_p, no_q);
             }
         }
-    }
 
-    // vertical filtering chroma
-    for (chroma = 1; chroma <= 2; chroma++) {
-        for (y = y0; y < y_end; y += 16) {
-            for (x = x0 ? x0 : 16; x < x_end; x += 16) {
-                const int bs0 = s->vertical_bs[(x >> 3) + (y       >> 2) * s->bs_width];
-                const int bs1 = s->vertical_bs[(x >> 3) + ((y + 8) >> 2) * s->bs_width];
-                if ((bs0 == 2) || (bs1 == 2)) {
-                    const int qp0 = (get_qPy(s, x - 1, y)     + get_qPy(s, x, y)     + 1) >> 1;
-                    const int qp1 = (get_qPy(s, x - 1, y + 8) + get_qPy(s, x, y + 8) + 1) >> 1;
-
-                    c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
-                    c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
-                    src     = &s->frame->data[chroma][y / 2 * s->frame->linesize[chroma] + ((x / 2) << s->ps.sps->pixel_shift)];
-                    if (pcmf) {
-                        no_p[0] = get_pcm(s, x - 1, y);
-                        no_p[1] = get_pcm(s, x - 1, y + 8);
-                        no_q[0] = get_pcm(s, x, y);
-                        no_q[1] = get_pcm(s, x, y + 8);
-                        s->hevcdsp.hevc_v_loop_filter_chroma_c(src,
-                                                               s->frame->linesize[chroma],
-                                                               c_tc, no_p, no_q);
-                    } else
-                        s->hevcdsp.hevc_v_loop_filter_chroma(src,
-                                                             s->frame->linesize[chroma],
-                                                             c_tc, no_p, no_q);
-                }
-            }
-        }
-    }
+        if(!y)
+             continue;
 
-    // horizontal filtering luma
-    if (x_end != s->ps.sps->width)
-        x_end -= 8;
-    for (y = y0 ? y0 : 8; y < y_end; y += 8) {
-        for (x = x0 ? x0 - 8 : 0; x < x_end; x += 8) {
-            const int bs0 = s->horizontal_bs[(x +     y * s->bs_width) >> 2];
-            const int bs1 = s->horizontal_bs[(x + 4 + y * s->bs_width) >> 2];
+        // horizontal filtering luma
+        for (x = x0 ? x0 - 8 : 0; x < x_end2; x += 8) {
+            const int bs0 = s->horizontal_bs[( x      + y * s->bs_width) >> 2];
+            const int bs1 = s->horizontal_bs[((x + 4) + y * s->bs_width) >> 2];
             if (bs0 || bs1) {
                 const int qp = (get_qPy(s, x, y - 1)     + get_qPy(s, x, y)     + 1) >> 1;
 
@@ -461,123 +578,135 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
         }
     }
 
-    // horizontal filtering chroma
-    for (chroma = 1; chroma <= 2; chroma++) {
-        for (y = y0 ? y0 : 16; y < y_end; y += 16) {
-            for (x = x0 - 8; x < x_end; x += 16) {
-                int bs0, bs1;
-                // to make sure no memory access over boundary when x = -8
-                // TODO: simplify with row based deblocking
-                if (x < 0) {
-                    bs0 = 0;
-                    bs1 = s->horizontal_bs[(x + 8 + y * s->bs_width) >> 2];
-                } else if (x >= x_end - 8) {
-                    bs0 = s->horizontal_bs[(x +     y * s->bs_width) >> 2];
-                    bs1 = 0;
-                } else {
-                    bs0 = s->horizontal_bs[(x + y     * s->bs_width) >> 2];
-                    bs1 = s->horizontal_bs[(x + 8 + y * s->bs_width) >> 2];
+    if (s->ps.sps->chroma_format_idc) {
+        for (chroma = 1; chroma <= 2; chroma++) {
+            int h = 1 << s->ps.sps->hshift[chroma];
+            int v = 1 << s->ps.sps->vshift[chroma];
+
+            // vertical filtering chroma
+            for (y = y0; y < y_end; y += (8 * v)) {
+                for (x = x0 ? x0 : 8 * h; x < x_end; x += (8 * h)) {
+                    const int bs0 = s->vertical_bs[(x +  y            * s->bs_width) >> 2];
+                    const int bs1 = s->vertical_bs[(x + (y + (4 * v)) * s->bs_width) >> 2];
+
+                    if ((bs0 == 2) || (bs1 == 2)) {
+                        const int qp0 = (get_qPy(s, x - 1, y)           + get_qPy(s, x, y)           + 1) >> 1;
+                        const int qp1 = (get_qPy(s, x - 1, y + (4 * v)) + get_qPy(s, x, y + (4 * v)) + 1) >> 1;
+
+                        c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
+                        c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
+                        src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
+                        if (pcmf) {
+                            no_p[0] = get_pcm(s, x - 1, y);
+                            no_p[1] = get_pcm(s, x - 1, y + (4 * v));
+                            no_q[0] = get_pcm(s, x, y);
+                            no_q[1] = get_pcm(s, x, y + (4 * v));
+                            s->hevcdsp.hevc_v_loop_filter_chroma_c(src,
+                                                                   s->frame->linesize[chroma],
+                                                                   c_tc, no_p, no_q);
+                        } else
+                            s->hevcdsp.hevc_v_loop_filter_chroma(src,
+                                                                 s->frame->linesize[chroma],
+                                                                 c_tc, no_p, no_q);
+                    }
                 }
 
-                if ((bs0 == 2) || (bs1 == 2)) {
-                    const int qp0 = bs0 == 2 ? (get_qPy(s, x,     y - 1) + get_qPy(s, x,     y) + 1) >> 1 : 0;
-                    const int qp1 = bs1 == 2 ? (get_qPy(s, x + 8, y - 1) + get_qPy(s, x + 8, y) + 1) >> 1 : 0;
-
-                    tc_offset = x >= x0 ? cur_tc_offset : left_tc_offset;
-                    c_tc[0]   = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset)     : 0;
-                    c_tc[1]   = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
-                    src       = &s->frame->data[chroma][y / 2 * s->frame->linesize[chroma] + ((x / 2) << s->ps.sps->pixel_shift)];
-                    if (pcmf) {
-                        no_p[0] = get_pcm(s, x, y - 1);
-                        no_p[1] = get_pcm(s, x + 8, y - 1);
-                        no_q[0] = get_pcm(s, x, y);
-                        no_q[1] = get_pcm(s, x + 8, y);
-                        s->hevcdsp.hevc_h_loop_filter_chroma_c(src,
-                                                               s->frame->linesize[chroma],
-                                                               c_tc, no_p, no_q);
-                    } else
-                        s->hevcdsp.hevc_h_loop_filter_chroma(src,
-                                                             s->frame->linesize[chroma],
-                                                             c_tc, no_p, no_q);
+                if(!y)
+                    continue;
+
+                // horizontal filtering chroma
+                tc_offset = x0 ? left_tc_offset : cur_tc_offset;
+                x_end2 = x_end;
+                if (x_end != s->ps.sps->width)
+                    x_end2 = x_end - 8 * h;
+                for (x = x0 ? x0 - 8 * h : 0; x < x_end2; x += (8 * h)) {
+                    const int bs0 = s->horizontal_bs[( x          + y * s->bs_width) >> 2];
+                    const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2];
+                    if ((bs0 == 2) || (bs1 == 2)) {
+                        const int qp0 = bs0 == 2 ? (get_qPy(s, x,           y - 1) + get_qPy(s, x,           y) + 1) >> 1 : 0;
+                        const int qp1 = bs1 == 2 ? (get_qPy(s, x + (4 * h), y - 1) + get_qPy(s, x + (4 * h), y) + 1) >> 1 : 0;
+
+                        c_tc[0]   = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset)     : 0;
+                        c_tc[1]   = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
+                        src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+                        if (pcmf) {
+                            no_p[0] = get_pcm(s, x,           y - 1);
+                            no_p[1] = get_pcm(s, x + (4 * h), y - 1);
+                            no_q[0] = get_pcm(s, x,           y);
+                            no_q[1] = get_pcm(s, x + (4 * h), y);
+                            s->hevcdsp.hevc_h_loop_filter_chroma_c(src,
+                                                                   s->frame->linesize[chroma],
+                                                                   c_tc, no_p, no_q);
+                        } else
+                            s->hevcdsp.hevc_h_loop_filter_chroma(src,
+                                                                 s->frame->linesize[chroma],
+                                                                 c_tc, no_p, no_q);
+                    }
                 }
             }
         }
     }
 }
 
-static int boundary_strength(HEVCContext *s, MvField *curr,
-                             uint8_t curr_cbf_luma, MvField *neigh,
-                             uint8_t neigh_cbf_luma,
-                             RefPicList *neigh_refPicList,
-                             int tu_border)
+static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
+                             RefPicList *neigh_refPicList)
 {
-    int mvs = curr->pred_flag[0] + curr->pred_flag[1];
-
-    if (tu_border) {
-        if (curr->is_intra || neigh->is_intra)
-            return 2;
-        if (curr_cbf_luma || neigh_cbf_luma)
-            return 1;
-    }
-
-    if (mvs == neigh->pred_flag[0] + neigh->pred_flag[1]) {
-        if (mvs == 2) {
-            // same L0 and L1
-            if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
-                s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
-                neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
-                if ((abs(neigh->mv[0].x - curr->mv[0].x) >= 4 || abs(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-                     abs(neigh->mv[1].x - curr->mv[1].x) >= 4 || abs(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
-                    (abs(neigh->mv[1].x - curr->mv[0].x) >= 4 || abs(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-                     abs(neigh->mv[0].x - curr->mv[1].x) >= 4 || abs(neigh->mv[0].y - curr->mv[1].y) >= 4))
-                    return 1;
-                else
-                    return 0;
-            } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
-                       neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
-                if (abs(neigh->mv[0].x - curr->mv[0].x) >= 4 || abs(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-                    abs(neigh->mv[1].x - curr->mv[1].x) >= 4 || abs(neigh->mv[1].y - curr->mv[1].y) >= 4)
-                    return 1;
-                else
-                    return 0;
-            } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
-                       neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
-                if (abs(neigh->mv[1].x - curr->mv[0].x) >= 4 || abs(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-                    abs(neigh->mv[0].x - curr->mv[1].x) >= 4 || abs(neigh->mv[0].y - curr->mv[1].y) >= 4)
-                    return 1;
-                else
-                    return 0;
-            } else {
+    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
+        // same L0 and L1
+        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
+            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
+            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
+            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
                 return 1;
-            }
-        } else { // 1 MV
-            Mv A, B;
-            int ref_A, ref_B;
-
-            if (curr->pred_flag[0]) {
-                A     = curr->mv[0];
-                ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
-            } else {
-                A     = curr->mv[1];
-                ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
-            }
+            else
+                return 0;
+        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+                return 1;
+            else
+                return 0;
+        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+                return 1;
+            else
+                return 0;
+        } else {
+            return 1;
+        }
+    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+        Mv A, B;
+        int ref_A, ref_B;
+
+        if (curr->pred_flag & 1) {
+            A     = curr->mv[0];
+            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
+        } else {
+            A     = curr->mv[1];
+            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
+        }
 
-            if (neigh->pred_flag[0]) {
-                B     = neigh->mv[0];
-                ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
-            } else {
-                B     = neigh->mv[1];
-                ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
-            }
+        if (neigh->pred_flag & 1) {
+            B     = neigh->mv[0];
+            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
+        } else {
+            B     = neigh->mv[1];
+            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
+        }
 
-            if (ref_A == ref_B) {
-                if (abs(A.x - B.x) >= 4 || abs(A.y - B.y) >= 4)
-                    return 1;
-                else
-                    return 0;
-            } else
+        if (ref_A == ref_B) {
+            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
                 return 1;
-        }
+            else
+                return 0;
+        } else
+            return 1;
     }
 
     return 1;
@@ -586,14 +715,14 @@ static int boundary_strength(HEVCContext *s, MvField *curr,
 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
                                            int log2_trafo_size)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     MvField *tab_mvf     = s->ref->tab_mvf;
     int log2_min_pu_size = s->ps.sps->log2_min_pu_size;
     int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
     int min_pu_width     = s->ps.sps->min_pu_width;
     int min_tu_width     = s->ps.sps->min_tb_width;
     int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
-                           (x0 >> log2_min_pu_size)].is_intra;
+                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
     int boundary_upper, boundary_left;
     int i, j, bs;
 
@@ -611,37 +740,11 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
         RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
                               ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
                               s->ref->refPicList;
-
         int yp_pu = (y0 - 1) >> log2_min_pu_size;
         int yq_pu =  y0      >> log2_min_pu_size;
         int yp_tu = (y0 - 1) >> log2_min_tu_size;
         int yq_tu =  y0      >> log2_min_tu_size;
 
-        for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-            int x_pu = (x0 + i) >> log2_min_pu_size;
-            int x_tu = (x0 + i) >> log2_min_tu_size;
-            MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
-            MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-            uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
-            uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
-
-            bs = boundary_strength(s, curr, curr_cbf_luma,
-                                   top, top_cbf_luma, rpl_top, 1);
-            if (bs)
-                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
-        }
-    }
-
-    // bs for TU internal horizontal PU boundaries
-    if (log2_trafo_size > s->ps.sps->log2_min_pu_size && !is_intra) {
-        RefPicList *rpl = s->ref->refPicList;
-
-        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
-            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
-            int yq_pu = (y0 + j)     >> log2_min_pu_size;
-            int yp_tu = (y0 + j - 1) >> log2_min_tu_size;
-            int yq_tu = (y0 + j)     >> log2_min_tu_size;
-
             for (i = 0; i < (1 << log2_trafo_size); i += 4) {
                 int x_pu = (x0 + i) >> log2_min_pu_size;
                 int x_tu = (x0 + i) >> log2_min_tu_size;
@@ -650,12 +753,14 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
                 uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
                 uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
 
-                bs = boundary_strength(s, curr, curr_cbf_luma,
-                                       top, top_cbf_luma, rpl, 0);
-                if (bs)
-                    s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+                if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
+                    bs = 2;
+                else if (curr_cbf_luma || top_cbf_luma)
+                    bs = 1;
+                else
+                    bs = boundary_strength(s, curr, top, rpl_top);
+                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
             }
-        }
     }
 
     // bs for vertical TU boundaries
@@ -673,50 +778,59 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
         RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
                                ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
                                s->ref->refPicList;
-
         int xp_pu = (x0 - 1) >> log2_min_pu_size;
         int xq_pu =  x0      >> log2_min_pu_size;
         int xp_tu = (x0 - 1) >> log2_min_tu_size;
         int xq_tu =  x0      >> log2_min_tu_size;
 
-        for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-            int y_pu      = (y0 + i) >> log2_min_pu_size;
-            int y_tu      = (y0 + i) >> log2_min_tu_size;
-            MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
-            MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
-
-            uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
-            uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
+            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+                int y_pu      = (y0 + i) >> log2_min_pu_size;
+                int y_tu      = (y0 + i) >> log2_min_tu_size;
+                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
+                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
+                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
 
-            bs = boundary_strength(s, curr, curr_cbf_luma,
-                                   left, left_cbf_luma, rpl_left, 1);
-            if (bs)
-                s->vertical_bs[(x0 >> 3) + ((y0 + i) >> 2) * s->bs_width] = bs;
-        }
+                if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
+                    bs = 2;
+                else if (curr_cbf_luma || left_cbf_luma)
+                    bs = 1;
+                else
+                    bs = boundary_strength(s, curr, left, rpl_left);
+                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
+            }
     }
 
-    // bs for TU internal vertical PU boundaries
     if (log2_trafo_size > log2_min_pu_size && !is_intra) {
         RefPicList *rpl = s->ref->refPicList;
 
+        // bs for TU internal horizontal PU boundaries
+        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
+            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
+            int yq_pu = (y0 + j)     >> log2_min_pu_size;
+
+            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+                int x_pu = (x0 + i) >> log2_min_pu_size;
+                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
+
+                bs = boundary_strength(s, curr, top, rpl);
+                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+            }
+        }
+
+        // bs for TU internal vertical PU boundaries
         for (j = 0; j < (1 << log2_trafo_size); j += 4) {
             int y_pu = (y0 + j) >> log2_min_pu_size;
-            int y_tu = (y0 + j) >> log2_min_tu_size;
 
             for (i = 8; i < (1 << log2_trafo_size); i += 8) {
                 int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
                 int xq_pu = (x0 + i)     >> log2_min_pu_size;
-                int xp_tu = (x0 + i - 1) >> log2_min_tu_size;
-                int xq_tu = (x0 + i)     >> log2_min_tu_size;
                 MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
                 MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
-                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
-                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
 
-                bs = boundary_strength(s, curr, curr_cbf_luma,
-                                       left, left_cbf_luma, rpl, 0);
-                if (bs)
-                    s->vertical_bs[((x0 + i) >> 3) + ((y0 + j) >> 2) * s->bs_width] = bs;
+                bs = boundary_strength(s, curr, left, rpl);
+                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
             }
         }
     }
@@ -726,21 +840,39 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
 #undef CB
 #undef CR
 
-void ff_hevc_hls_filter(HEVCContext *s, int x, int y)
+void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
 {
-    deblocking_filter_CTB(s, x, y);
-    if (s->ps.sps->sao_enabled)
-        sao_filter_CTB(s, x, y);
+    int x_end = x >= s->ps.sps->width  - ctb_size;
+    if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
+        deblocking_filter_CTB(s, x, y);
+    if (s->ps.sps->sao_enabled) {
+        int y_end = y >= s->ps.sps->height - ctb_size;
+        if (y && x)
+            sao_filter_CTB(s, x - ctb_size, y - ctb_size);
+        if (x && y_end)
+            sao_filter_CTB(s, x - ctb_size, y);
+        if (y && x_end) {
+            sao_filter_CTB(s, x, y - ctb_size);
+            if (s->threads_type & FF_THREAD_FRAME )
+                ff_thread_report_progress(&s->ref->tf, y, 0);
+        }
+        if (x_end && y_end) {
+            sao_filter_CTB(s, x , y);
+            if (s->threads_type & FF_THREAD_FRAME )
+                ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+        }
+    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
+        ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
 }
 
 void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
 {
+    int x_end = x_ctb >= s->ps.sps->width  - ctb_size;
+    int y_end = y_ctb >= s->ps.sps->height - ctb_size;
     if (y_ctb && x_ctb)
-        ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb - ctb_size);
-    if (y_ctb && x_ctb >= s->ps.sps->width - ctb_size) {
-        ff_hevc_hls_filter(s, x_ctb, y_ctb - ctb_size);
-        ff_thread_report_progress(&s->ref->tf, y_ctb - ctb_size, 0);
-    }
-    if (x_ctb && y_ctb >= s->ps.sps->height - ctb_size)
-        ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb);
+        ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb - ctb_size, ctb_size);
+    if (y_ctb && x_end)
+        ff_hevc_hls_filter(s, x_ctb, y_ctb - ctb_size, ctb_size);
+    if (x_ctb && y_end)
+        ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb, ctb_size);
 }
diff --git a/libavcodec/hevc_mp4toannexb_bsf.c b/libavcodec/hevc_mp4toannexb_bsf.c
index dfecf5e99f..d6feb998f8 100644
--- a/libavcodec/hevc_mp4toannexb_bsf.c
+++ b/libavcodec/hevc_mp4toannexb_bsf.c
@@ -2,20 +2,20 @@
  * HEVC MP4 to Annex B byte stream format filter
  * copyright (c) 2015 Anton Khirnov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,16 +35,28 @@ typedef struct HEVCBSFContext {
     int      extradata_parsed;
 
     int logged_nonmp4_warning;
+
+    /* When private_spspps is zero then spspps_buf points to global extradata
+       and bsf does replace a global extradata to own-allocated version (default
+       behaviour).
+       When private_spspps is non-zero the bsf uses a private version of spspps buf.
+       This mode necessary when bsf uses in decoder, else bsf has issues after
+       decoder re-initialization. Use the "private_spspps_buf" argument to
+       activate this mode.
+     */
+    int      private_spspps;
+    uint8_t *spspps_buf;
+    uint32_t spspps_size;
 } HEVCBSFContext;
 
-static int hevc_extradata_to_annexb(AVCodecContext *avctx)
+static int hevc_extradata_to_annexb(HEVCBSFContext* ctx, AVCodecContext *avctx)
 {
     GetByteContext gb;
     int length_size, num_arrays, i, j;
     int ret = 0;
 
     uint8_t *new_extradata = NULL;
-    size_t   new_extradata_size = 0;;
+    size_t   new_extradata_size = 0;
 
     bytestream2_init(&gb, avctx->extradata, avctx->extradata_size);
 
@@ -82,9 +94,13 @@ static int hevc_extradata_to_annexb(AVCodecContext *avctx)
         }
     }
 
-    av_freep(&avctx->extradata);
-    avctx->extradata      = new_extradata;
-    avctx->extradata_size = new_extradata_size;
+    if (!ctx->private_spspps) {
+        av_freep(&avctx->extradata);
+        avctx->extradata      = new_extradata;
+        avctx->extradata_size = new_extradata_size;
+    }
+    ctx->spspps_buf  = new_extradata;
+    ctx->spspps_size = new_extradata_size;
 
     if (!new_extradata_size)
         av_log(avctx, AV_LOG_WARNING, "No parameter sets in the extradata\n");
@@ -118,12 +134,14 @@ static int hevc_mp4toannexb_filter(AVBitStreamFilterContext *bsfc,
                        "The input looks like it is Annex B already\n");
                 ctx->logged_nonmp4_warning = 1;
             }
-            *poutbuf      = buf;
+            *poutbuf      = (uint8_t *)buf;
             *poutbuf_size = buf_size;
             return 0;
         }
+        if (args && strstr(args, "private_spspps_buf"))
+            ctx->private_spspps = 1;
 
-        ret = hevc_extradata_to_annexb(avctx);
+        ret = hevc_extradata_to_annexb(ctx, avctx);
         if (ret < 0)
             return ret;
         ctx->length_size      = ret;
@@ -148,7 +166,7 @@ static int hevc_mp4toannexb_filter(AVBitStreamFilterContext *bsfc,
         /* prepend extradata to IRAP frames */
         is_irap       = nalu_type >= 16 && nalu_type <= 23;
         add_extradata = is_irap && !got_irap;
-        extra_size    = add_extradata * avctx->extradata_size;
+        extra_size    = add_extradata * ctx->spspps_size;
         got_irap     |= is_irap;
 
         if (SIZE_MAX - out_size < 4             ||
@@ -163,7 +181,7 @@ static int hevc_mp4toannexb_filter(AVBitStreamFilterContext *bsfc,
             goto fail;
 
         if (add_extradata)
-            memcpy(out + out_size, avctx->extradata, extra_size);
+            memcpy(out + out_size, ctx->spspps_buf, extra_size);
         AV_WB32(out + out_size + extra_size, 1);
         bytestream2_get_buffer(&gb, out + out_size + 4 + extra_size, nalu_size);
         out_size += 4 + nalu_size + extra_size;
@@ -179,8 +197,16 @@ fail:
     return ret;
 }
 
+static void hevc_mp4toannexb_close(AVBitStreamFilterContext *bsfc)
+{
+    HEVCBSFContext *ctx = bsfc->priv_data;
+    if (ctx->private_spspps)
+        av_freep(&ctx->spspps_buf);
+}
+
 AVBitStreamFilter ff_hevc_mp4toannexb_bsf = {
     "hevc_mp4toannexb",
     sizeof(HEVCBSFContext),
     hevc_mp4toannexb_filter,
+    hevc_mp4toannexb_close,
 };
diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c
index b99223c07b..00da575aad 100644
--- a/libavcodec/hevc_mvs.c
+++ b/libavcodec/hevc_mvs.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2012 - 2013 Guillaume Martres
  * Copyright (C) 2013 Anand Meher Kotra
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,9 +41,9 @@ static const uint8_t l0_l1_cand_idx[12][2] = {
 void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0,
                                      int nPbW, int nPbH)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
-    int x0b = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
+    HEVCLocalContext *lc = s->HEVClc;
+    int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+    int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
 
     lc->na.cand_up       = (lc->ctb_up_flag   || y0b);
     lc->na.cand_left     = (lc->ctb_left_flag || x0b);
@@ -52,8 +52,7 @@ void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0,
             ((x0b + nPbW) == (1 << s->ps.sps->log2_ctb_size)) ?
                     lc->ctb_up_right_flag && !y0b : lc->na.cand_up;
     lc->na.cand_up_right =
-            ((x0b + nPbW) == (1 << s->ps.sps->log2_ctb_size) ?
-                    lc->ctb_up_right_flag && !y0b : lc->na.cand_up )
+            lc->na.cand_up_right_sap
                      && (x0 + nPbW) < lc->end_of_tiles_x;
     lc->na.cand_bottom_left = ((y0 + nPbH) >= lc->end_of_tiles_y) ? 0 : lc->na.cand_left;
 }
@@ -61,56 +60,29 @@ void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0,
 /*
  * 6.4.1 Derivation process for z-scan order block availability
  */
-static int z_scan_block_avail(HEVCContext *s, int xCurr, int yCurr,
+static av_always_inline int z_scan_block_avail(HEVCContext *s, int xCurr, int yCurr,
                               int xN, int yN)
 {
 #define MIN_TB_ADDR_ZS(x, y)                                            \
-    s->ps.pps->min_tb_addr_zs[(y) * s->ps.sps->min_tb_width + (x)]
-    int Curr = MIN_TB_ADDR_ZS(xCurr >> s->ps.sps->log2_min_tb_size,
-                              yCurr >> s->ps.sps->log2_min_tb_size);
-    int N;
-
-    if (xN < 0 || yN < 0 ||
-        xN >= s->ps.sps->width ||
-        yN >= s->ps.sps->height)
-        return 0;
-
-    N = MIN_TB_ADDR_ZS(xN >> s->ps.sps->log2_min_tb_size,
-                       yN >> s->ps.sps->log2_min_tb_size);
-
-    return N <= Curr;
-}
-
-static int same_prediction_block(HEVCLocalContext *lc, int log2_cb_size,
-                                 int x0, int y0, int nPbW, int nPbH,
-                                 int xA1, int yA1, int partIdx)
-{
-    return !(nPbW << 1 == 1 << log2_cb_size &&
-             nPbH << 1 == 1 << log2_cb_size && partIdx == 1 &&
-             lc->cu.x + nPbW > xA1 &&
-             lc->cu.y + nPbH <= yA1);
-}
+    s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)]
 
-/*
- * 6.4.2 Derivation process for prediction block availability
- */
-static int check_prediction_block_available(HEVCContext *s, int log2_cb_size,
-                                            int x0, int y0, int nPbW, int nPbH,
-                                            int xA1, int yA1, int partIdx)
-{
-    HEVCLocalContext *lc = &s->HEVClc;
-
-    if (lc->cu.x < xA1 && lc->cu.y < yA1 &&
-        (lc->cu.x + (1 << log2_cb_size)) > xA1 &&
-        (lc->cu.y + (1 << log2_cb_size)) > yA1)
-        return same_prediction_block(lc, log2_cb_size, x0, y0,
-                                     nPbW, nPbH, xA1, yA1, partIdx);
-    else
-        return z_scan_block_avail(s, x0, y0, xA1, yA1);
+    int xCurr_ctb = xCurr >> s->ps.sps->log2_ctb_size;
+    int yCurr_ctb = yCurr >> s->ps.sps->log2_ctb_size;
+    int xN_ctb    = xN    >> s->ps.sps->log2_ctb_size;
+    int yN_ctb    = yN    >> s->ps.sps->log2_ctb_size;
+    if( yN_ctb < yCurr_ctb || xN_ctb < xCurr_ctb )
+        return 1;
+    else {
+        int Curr = MIN_TB_ADDR_ZS((xCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask,
+                (yCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask);
+        int N    = MIN_TB_ADDR_ZS((xN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask,
+                (yN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask);
+        return N <= Curr;
+    }
 }
 
 //check if the two luma locations belong to the same mostion estimation region
-static int isDiffMER(HEVCContext *s, int xN, int yN, int xP, int yP)
+static av_always_inline int is_diff_mer(HEVCContext *s, int xN, int yN, int xP, int yP)
 {
     uint8_t plevel = s->ps.pps->log2_parallel_merge_level;
 
@@ -122,18 +94,20 @@ static int isDiffMER(HEVCContext *s, int xN, int yN, int xP, int yP)
 #define MATCH(x) (A.x == B.x)
 
 // check if the mv's and refidx are the same between A and B
-static int compareMVrefidx(struct MvField A, struct MvField B)
+static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField B)
 {
-    if (A.pred_flag[0] && A.pred_flag[1] && B.pred_flag[0] && B.pred_flag[1])
-        return MATCH(ref_idx[0]) && MATCH_MV(mv[0]) &&
-               MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
-
-    if (A.pred_flag[0] && !A.pred_flag[1] && B.pred_flag[0] && !B.pred_flag[1])
-        return MATCH(ref_idx[0]) && MATCH_MV(mv[0]);
-
-    if (!A.pred_flag[0] && A.pred_flag[1] && !B.pred_flag[0] && B.pred_flag[1])
-        return MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
-
+    int a_pf = A.pred_flag;
+    int b_pf = B.pred_flag;
+    if (a_pf == b_pf) {
+        if (a_pf == PF_BI) {
+            return MATCH(ref_idx[0]) && MATCH_MV(mv[0]) &&
+                   MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
+        } else if (a_pf == PF_L0) {
+            return MATCH(ref_idx[0]) && MATCH_MV(mv[0]);
+        } else if (a_pf == PF_L1) {
+            return MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
+        }
+    }
     return 0;
 }
 
@@ -144,11 +118,11 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb)
     td = av_clip_int8(td);
     tb = av_clip_int8(tb);
     tx = (0x4000 + abs(td / 2)) / td;
-    scale_factor = av_clip((tb * tx + 32) >> 6, -4096, 4095);
+    scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12);
     dst->x = av_clip_int16((scale_factor * src->x + 127 +
-                             (scale_factor * src->x < 0)) >> 8);
+                           (scale_factor * src->x < 0)) >> 8);
     dst->y = av_clip_int16((scale_factor * src->y + 127 +
-                             (scale_factor * src->y < 0)) >> 8);
+                           (scale_factor * src->y < 0)) >> 8);
 }
 
 static int check_mvset(Mv *mvLXCol, Mv *mvCol,
@@ -169,10 +143,7 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol,
     col_poc_diff = colPic - refPicList_col[listCol].list[refidxCol];
     cur_poc_diff = poc    - refPicList[X].list[refIdxLx];
 
-    if (!col_poc_diff)
-        col_poc_diff = 1;  // error resilience
-
-    if (cur_lt || col_poc_diff == cur_poc_diff) {
+    if (cur_lt || col_poc_diff == cur_poc_diff || !col_poc_diff) {
         mvLXCol->x = mvCol->x;
         mvLXCol->y = mvCol->y;
     } else {
@@ -194,32 +165,30 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
 {
     RefPicList *refPicList = s->ref->refPicList;
 
-    if (temp_col.is_intra) {
-        mvLXCol->x = 0;
-        mvLXCol->y = 0;
+    if (temp_col.pred_flag == PF_INTRA)
         return 0;
-    }
 
-    if (temp_col.pred_flag[0] == 0)
+    if (!(temp_col.pred_flag & PF_L0))
         return CHECK_MVSET(1);
-    else if (temp_col.pred_flag[0] == 1 && temp_col.pred_flag[1] == 0)
+    else if (temp_col.pred_flag == PF_L0)
         return CHECK_MVSET(0);
-    else if (temp_col.pred_flag[0] == 1 && temp_col.pred_flag[1] == 1) {
+    else if (temp_col.pred_flag == PF_BI) {
         int check_diffpicount = 0;
-        int i = 0;
-        for (i = 0; i < refPicList[0].nb_refs; i++) {
-            if (refPicList[0].list[i] > s->poc)
-                check_diffpicount++;
-        }
-        for (i = 0; i < refPicList[1].nb_refs; i++) {
-            if (refPicList[1].list[i] > s->poc)
-                check_diffpicount++;
+        int i, j;
+        for (j = 0; j < 2; j++) {
+            for (i = 0; i < refPicList[j].nb_refs; i++) {
+                if (refPicList[j].list[i] > s->poc) {
+                    check_diffpicount++;
+                    break;
+                }
+            }
         }
-        if (check_diffpicount == 0 && X == 0)
-            return CHECK_MVSET(0);
-        else if (check_diffpicount == 0 && X == 1)
-            return CHECK_MVSET(1);
-        else {
+        if (!check_diffpicount) {
+            if (X==0)
+                return CHECK_MVSET(0);
+            else
+                return CHECK_MVSET(1);
+        } else {
             if (s->sh.collocated_list == L1)
                 return CHECK_MVSET(0);
             else
@@ -234,7 +203,8 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
     tab_mvf[(y) * min_pu_width + x]
 
 #define TAB_MVF_PU(v)                                                   \
-    TAB_MVF(x ## v ## _pu, y ## v ## _pu)
+    TAB_MVF(((x ## v) >> s->ps.sps->log2_min_pu_size),                     \
+            ((y ## v) >> s->ps.sps->log2_min_pu_size))
 
 #define DERIVE_TEMPORAL_COLOCATED_MVS                                   \
     derive_temporal_colocated_mvs(s, temp_col,                          \
@@ -275,7 +245,8 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
         x < s->ps.sps->width) {
         x                 &= ~15;
         y                 &= ~15;
-        ff_thread_await_progress(&ref->tf, y, 0);
+        if (s->threads_type == FF_THREAD_FRAME)
+            ff_thread_await_progress(&ref->tf, y, 0);
         x_pu               = x >> s->ps.sps->log2_min_pu_size;
         y_pu               = y >> s->ps.sps->log2_min_pu_size;
         temp_col           = TAB_MVF(x_pu, y_pu);
@@ -288,7 +259,8 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
         y                  = y0 + (nPbH >> 1);
         x                 &= ~15;
         y                 &= ~15;
-        ff_thread_await_progress(&ref->tf, y, 0);
+        if (s->threads_type == FF_THREAD_FRAME)
+            ff_thread_await_progress(&ref->tf, y, 0);
         x_pu               = x >> s->ps.sps->log2_min_pu_size;
         y_pu               = y >> s->ps.sps->log2_min_pu_size;
         temp_col           = TAB_MVF(x_pu, y_pu);
@@ -298,15 +270,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
 }
 
 #define AVAILABLE(cand, v)                                      \
-    (cand && !TAB_MVF_PU(v).is_intra)
+    (cand && !(TAB_MVF_PU(v).pred_flag == PF_INTRA))
 
 #define PRED_BLOCK_AVAILABLE(v)                                 \
-    check_prediction_block_available(s, log2_cb_size,           \
-                                     x0, y0, nPbW, nPbH,        \
-                                     x ## v, y ## v, part_idx)
+    z_scan_block_avail(s, x0, y0, x ## v, y ## v)
 
 #define COMPARE_MV_REFIDX(a, b)                                 \
-    compareMVrefidx(TAB_MVF_PU(a), TAB_MVF_PU(b))
+    compare_mv_ref_idx(TAB_MVF_PU(a), TAB_MVF_PU(b))
 
 /*
  * 8.5.3.1.2  Derivation process for spatial merging candidates
@@ -318,7 +288,7 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
                                             int merge_idx,
                                             struct MvField mergecandlist[])
 {
-    HEVCLocalContext *lc   = &s->HEVClc;
+    HEVCLocalContext *lc   = s->HEVClc;
     RefPicList *refPicList = s->ref->refPicList;
     MvField *tab_mvf       = s->ref->tab_mvf;
 
@@ -332,33 +302,21 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
 
     const int xA1    = x0 - 1;
     const int yA1    = y0 + nPbH - 1;
-    const int xA1_pu = xA1 >> s->ps.sps->log2_min_pu_size;
-    const int yA1_pu = yA1 >> s->ps.sps->log2_min_pu_size;
 
     const int xB1    = x0 + nPbW - 1;
     const int yB1    = y0 - 1;
-    const int xB1_pu = xB1 >> s->ps.sps->log2_min_pu_size;
-    const int yB1_pu = yB1 >> s->ps.sps->log2_min_pu_size;
 
     const int xB0    = x0 + nPbW;
     const int yB0    = y0 - 1;
-    const int xB0_pu = xB0 >> s->ps.sps->log2_min_pu_size;
-    const int yB0_pu = yB0 >> s->ps.sps->log2_min_pu_size;
 
     const int xA0    = x0 - 1;
     const int yA0    = y0 + nPbH;
-    const int xA0_pu = xA0 >> s->ps.sps->log2_min_pu_size;
-    const int yA0_pu = yA0 >> s->ps.sps->log2_min_pu_size;
 
     const int xB2    = x0 - 1;
     const int yB2    = y0 - 1;
-    const int xB2_pu = xB2 >> s->ps.sps->log2_min_pu_size;
-    const int yB2_pu = yB2 >> s->ps.sps->log2_min_pu_size;
 
     const int nb_refs = (s->sh.slice_type == P_SLICE) ?
                         s->sh.nb_refs[0] : FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]);
-    int check_MER   = 1;
-    int check_MER_1 = 1;
 
     int zero_idx = 0;
 
@@ -370,57 +328,49 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     int is_available_b0;
     int is_available_b1;
     int is_available_b2;
-    int check_B0;
-    int check_A0;
 
-    //first left spatial merge candidate
-    is_available_a1 = AVAILABLE(cand_left, A1);
 
     if (!singleMCLFlag && part_idx == 1 &&
         (lc->cu.part_mode == PART_Nx2N ||
          lc->cu.part_mode == PART_nLx2N ||
          lc->cu.part_mode == PART_nRx2N) ||
-        isDiffMER(s, xA1, yA1, x0, y0)) {
+        is_diff_mer(s, xA1, yA1, x0, y0)) {
         is_available_a1 = 0;
+    } else {
+        is_available_a1 = AVAILABLE(cand_left, A1);
+        if (is_available_a1) {
+            mergecandlist[nb_merge_cand] = TAB_MVF_PU(A1);
+            if (merge_idx == 0)
+                return;
+            nb_merge_cand++;
+        }
     }
 
-    if (is_available_a1) {
-        mergecandlist[0] = TAB_MVF_PU(A1);
-        if (merge_idx == 0)
-            return;
-        nb_merge_cand++;
-    }
-
-    // above spatial merge candidate
-    is_available_b1 = AVAILABLE(cand_up, B1);
-
     if (!singleMCLFlag && part_idx == 1 &&
         (lc->cu.part_mode == PART_2NxN ||
          lc->cu.part_mode == PART_2NxnU ||
          lc->cu.part_mode == PART_2NxnD) ||
-        isDiffMER(s, xB1, yB1, x0, y0)) {
+        is_diff_mer(s, xB1, yB1, x0, y0)) {
         is_available_b1 = 0;
+    } else {
+        is_available_b1 = AVAILABLE(cand_up, B1);
+        if (is_available_b1 &&
+            !(is_available_a1 && COMPARE_MV_REFIDX(B1, A1))) {
+            mergecandlist[nb_merge_cand] = TAB_MVF_PU(B1);
+            if (merge_idx == nb_merge_cand)
+                return;
+            nb_merge_cand++;
+        }
     }
 
-    if (is_available_a1 && is_available_b1)
-        check_MER = !COMPARE_MV_REFIDX(B1, A1);
-
-    if (is_available_b1 && check_MER)
-        mergecandlist[nb_merge_cand++] = TAB_MVF_PU(B1);
-
     // above right spatial merge candidate
-    check_MER = 1;
-    check_B0  = PRED_BLOCK_AVAILABLE(B0);
-
-    is_available_b0 = check_B0 && AVAILABLE(cand_up_right, B0);
+    is_available_b0 = AVAILABLE(cand_up_right, B0) &&
+                      xB0 < s->ps.sps->width &&
+                      PRED_BLOCK_AVAILABLE(B0) &&
+                      !is_diff_mer(s, xB0, yB0, x0, y0);
 
-    if (isDiffMER(s, xB0, yB0, x0, y0))
-        is_available_b0 = 0;
-
-    if (is_available_b1 && is_available_b0)
-        check_MER = !COMPARE_MV_REFIDX(B0, B1);
-
-    if (is_available_b0 && check_MER) {
+    if (is_available_b0 &&
+        !(is_available_b1 && COMPARE_MV_REFIDX(B0, B1))) {
         mergecandlist[nb_merge_cand] = TAB_MVF_PU(B0);
         if (merge_idx == nb_merge_cand)
             return;
@@ -428,18 +378,13 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     }
 
     // left bottom spatial merge candidate
-    check_MER = 1;
-    check_A0  = PRED_BLOCK_AVAILABLE(A0);
-
-    is_available_a0 = check_A0 && AVAILABLE(cand_bottom_left, A0);
+    is_available_a0 = AVAILABLE(cand_bottom_left, A0) &&
+                      yA0 < s->ps.sps->height &&
+                      PRED_BLOCK_AVAILABLE(A0) &&
+                      !is_diff_mer(s, xA0, yA0, x0, y0);
 
-    if (isDiffMER(s, xA0, yA0, x0, y0))
-        is_available_a0 = 0;
-
-    if (is_available_a1 && is_available_a0)
-        check_MER = !COMPARE_MV_REFIDX(A0, A1);
-
-    if (is_available_a0 && check_MER) {
+    if (is_available_a0 &&
+        !(is_available_a1 && COMPARE_MV_REFIDX(A0, A1))) {
         mergecandlist[nb_merge_cand] = TAB_MVF_PU(A0);
         if (merge_idx == nb_merge_cand)
             return;
@@ -447,20 +392,13 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     }
 
     // above left spatial merge candidate
-    check_MER = 1;
+    is_available_b2 = AVAILABLE(cand_up_left, B2) &&
+                      !is_diff_mer(s, xB2, yB2, x0, y0);
 
-    is_available_b2 = AVAILABLE(cand_up_left, B2);
-
-    if (isDiffMER(s, xB2, yB2, x0, y0))
-        is_available_b2 = 0;
-
-    if (is_available_a1 && is_available_b2)
-        check_MER = !COMPARE_MV_REFIDX(B2, A1);
-
-    if (is_available_b1 && is_available_b2)
-        check_MER_1 = !COMPARE_MV_REFIDX(B2, B1);
-
-    if (is_available_b2 && check_MER && check_MER_1 && nb_merge_cand != 4) {
+    if (is_available_b2 &&
+        !(is_available_a1 && COMPARE_MV_REFIDX(B2, A1)) &&
+        !(is_available_b1 && COMPARE_MV_REFIDX(B2, B1)) &&
+        nb_merge_cand != 4) {
         mergecandlist[nb_merge_cand] = TAB_MVF_PU(B2);
         if (merge_idx == nb_merge_cand)
             return;
@@ -478,9 +416,7 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
                                                        0, &mv_l1_col, 1) : 0;
 
         if (available_l0 || available_l1) {
-            mergecandlist[nb_merge_cand].is_intra     = 0;
-            mergecandlist[nb_merge_cand].pred_flag[0] = available_l0;
-            mergecandlist[nb_merge_cand].pred_flag[1] = available_l1;
+            mergecandlist[nb_merge_cand].pred_flag = available_l0 + (available_l1 << 1);
             AV_ZERO16(mergecandlist[nb_merge_cand].ref_idx);
             mergecandlist[nb_merge_cand].mv[0]      = mv_l0_col;
             mergecandlist[nb_merge_cand].mv[1]      = mv_l1_col;
@@ -496,7 +432,7 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     // combined bi-predictive merge candidates  (applies for B slices)
     if (s->sh.slice_type == B_SLICE && nb_orig_merge_cand > 1 &&
         nb_orig_merge_cand < s->sh.max_num_merge_cand) {
-        int comb_idx;
+        int comb_idx = 0;
 
         for (comb_idx = 0; nb_merge_cand < s->sh.max_num_merge_cand &&
                            comb_idx < nb_orig_merge_cand * (nb_orig_merge_cand - 1); comb_idx++) {
@@ -505,17 +441,15 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
             MvField l0_cand = mergecandlist[l0_cand_idx];
             MvField l1_cand = mergecandlist[l1_cand_idx];
 
-            if (l0_cand.pred_flag[0] && l1_cand.pred_flag[1] &&
+            if ((l0_cand.pred_flag & PF_L0) && (l1_cand.pred_flag & PF_L1) &&
                 (refPicList[0].list[l0_cand.ref_idx[0]] !=
                  refPicList[1].list[l1_cand.ref_idx[1]] ||
                  AV_RN32A(&l0_cand.mv[0]) != AV_RN32A(&l1_cand.mv[1]))) {
                 mergecandlist[nb_merge_cand].ref_idx[0]   = l0_cand.ref_idx[0];
                 mergecandlist[nb_merge_cand].ref_idx[1]   = l1_cand.ref_idx[1];
-                mergecandlist[nb_merge_cand].pred_flag[0] = 1;
-                mergecandlist[nb_merge_cand].pred_flag[1] = 1;
+                mergecandlist[nb_merge_cand].pred_flag    = PF_BI;
                 AV_COPY32(&mergecandlist[nb_merge_cand].mv[0], &l0_cand.mv[0]);
                 AV_COPY32(&mergecandlist[nb_merge_cand].mv[1], &l1_cand.mv[1]);
-                mergecandlist[nb_merge_cand].is_intra     = 0;
                 if (merge_idx == nb_merge_cand)
                     return;
                 nb_merge_cand++;
@@ -525,11 +459,9 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
 
     // append Zero motion vector candidates
     while (nb_merge_cand < s->sh.max_num_merge_cand) {
-        mergecandlist[nb_merge_cand].pred_flag[0] = 1;
-        mergecandlist[nb_merge_cand].pred_flag[1] = s->sh.slice_type == B_SLICE;
+        mergecandlist[nb_merge_cand].pred_flag    = PF_L0 + ((s->sh.slice_type == B_SLICE) << 1);
         AV_ZERO32(mergecandlist[nb_merge_cand].mv + 0);
         AV_ZERO32(mergecandlist[nb_merge_cand].mv + 1);
-        mergecandlist[nb_merge_cand].is_intra     = 0;
         mergecandlist[nb_merge_cand].ref_idx[0]   = zero_idx < nb_refs ? zero_idx : 0;
         mergecandlist[nb_merge_cand].ref_idx[1]   = zero_idx < nb_refs ? zero_idx : 0;
 
@@ -552,7 +484,7 @@ void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, int nPbW,
     LOCAL_ALIGNED(4, MvField, mergecand_list, [MRG_MAX_NUM_CANDS]);
     int nPbW2 = nPbW;
     int nPbH2 = nPbH;
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
 
     if (s->ps.pps->log2_parallel_merge_level > 2 && nCS == 8) {
         singleMCLFlag = 1;
@@ -568,11 +500,9 @@ void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, int nPbW,
                                     singleMCLFlag, part_idx,
                                     merge_idx, mergecand_list);
 
-    if (mergecand_list[merge_idx].pred_flag[0] == 1 &&
-        mergecand_list[merge_idx].pred_flag[1] == 1 &&
+    if (mergecand_list[merge_idx].pred_flag == PF_BI &&
         (nPbW2 + nPbH2) == 12) {
-        mergecand_list[merge_idx].ref_idx[1]   = -1;
-        mergecand_list[merge_idx].pred_flag[1] = 0;
+        mergecand_list[merge_idx].pred_flag = PF_L0;
     }
 
     *mv = mergecand_list[merge_idx];
@@ -603,7 +533,7 @@ static int mv_mp_mode_mx(HEVCContext *s, int x, int y, int pred_flag_index,
 
     RefPicList *refPicList = s->ref->refPicList;
 
-    if (TAB_MVF(x, y).pred_flag[pred_flag_index] == 1 &&
+    if (((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) &&
         refPicList[pred_flag_index].list[TAB_MVF(x, y).ref_idx[pred_flag_index]] == refPicList[ref_idx_curr].list[ref_idx]) {
         *mv = TAB_MVF(x, y).mv[pred_flag_index];
         return 1;
@@ -618,82 +548,73 @@ static int mv_mp_mode_mx_lt(HEVCContext *s, int x, int y, int pred_flag_index,
     int min_pu_width = s->ps.sps->min_pu_width;
 
     RefPicList *refPicList = s->ref->refPicList;
-    int currIsLongTerm     = refPicList[ref_idx_curr].isLongTerm[ref_idx];
 
-    int colIsLongTerm =
-        refPicList[pred_flag_index].isLongTerm[(TAB_MVF(x, y).ref_idx[pred_flag_index])];
+    if ((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) {
+        int currIsLongTerm     = refPicList[ref_idx_curr].isLongTerm[ref_idx];
 
-    if (TAB_MVF(x, y).pred_flag[pred_flag_index] &&
-        colIsLongTerm == currIsLongTerm) {
-        *mv = TAB_MVF(x, y).mv[pred_flag_index];
-        if (!currIsLongTerm)
-            dist_scale(s, mv, min_pu_width, x, y,
-                       pred_flag_index, ref_idx_curr, ref_idx);
-        return 1;
+        int colIsLongTerm =
+            refPicList[pred_flag_index].isLongTerm[(TAB_MVF(x, y).ref_idx[pred_flag_index])];
+
+        if (colIsLongTerm == currIsLongTerm) {
+            *mv = TAB_MVF(x, y).mv[pred_flag_index];
+            if (!currIsLongTerm)
+                dist_scale(s, mv, min_pu_width, x, y,
+                           pred_flag_index, ref_idx_curr, ref_idx);
+            return 1;
+        }
     }
     return 0;
 }
 
 #define MP_MX(v, pred, mx)                                      \
-    mv_mp_mode_mx(s, x ## v ## _pu, y ## v ## _pu, pred,        \
-                  &mx, ref_idx_curr, ref_idx)
+    mv_mp_mode_mx(s,                                            \
+                  (x ## v) >> s->ps.sps->log2_min_pu_size,         \
+                  (y ## v) >> s->ps.sps->log2_min_pu_size,         \
+                  pred, &mx, ref_idx_curr, ref_idx)
 
 #define MP_MX_LT(v, pred, mx)                                   \
-    mv_mp_mode_mx_lt(s, x ## v ## _pu, y ## v ## _pu, pred,     \
-                     &mx, ref_idx_curr, ref_idx)
+    mv_mp_mode_mx_lt(s,                                         \
+                     (x ## v) >> s->ps.sps->log2_min_pu_size,      \
+                     (y ## v) >> s->ps.sps->log2_min_pu_size,      \
+                     pred, &mx, ref_idx_curr, ref_idx)
 
 void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
                               int nPbH, int log2_cb_size, int part_idx,
                               int merge_idx, MvField *mv,
                               int mvp_lx_flag, int LX)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     MvField *tab_mvf = s->ref->tab_mvf;
     int isScaledFlag_L0 = 0;
-    int availableFlagLXA0 = 0;
-    int availableFlagLXB0 = 0;
+    int availableFlagLXA0 = 1;
+    int availableFlagLXB0 = 1;
     int numMVPCandLX = 0;
     int min_pu_width = s->ps.sps->min_pu_width;
 
     int xA0, yA0;
-    int xA0_pu, yA0_pu;
     int is_available_a0;
-
     int xA1, yA1;
-    int xA1_pu, yA1_pu;
     int is_available_a1;
-
     int xB0, yB0;
-    int xB0_pu, yB0_pu;
     int is_available_b0;
-
     int xB1, yB1;
-    int xB1_pu = 0, yB1_pu = 0;
-    int is_available_b1 = 0;
-
+    int is_available_b1;
     int xB2, yB2;
-    int xB2_pu = 0, yB2_pu = 0;
-    int is_available_b2 = 0;
+    int is_available_b2;
+
     Mv mvpcand_list[2] = { { 0 } };
-    Mv mxA = { 0 };
-    Mv mxB = { 0 };
-    int ref_idx_curr = 0;
+    Mv mxA;
+    Mv mxB;
+    int ref_idx_curr;
     int ref_idx = 0;
     int pred_flag_index_l0;
     int pred_flag_index_l1;
-    int x0b = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-
-    int cand_up = (lc->ctb_up_flag || y0b);
-    int cand_left = (lc->ctb_left_flag || x0b);
-    int cand_up_left =
-            (!x0b && !y0b) ? lc->ctb_up_left_flag : cand_left && cand_up;
-    int cand_up_right =
-            (x0b + nPbW == (1 << s->ps.sps->log2_ctb_size) ||
-             x0  + nPbW >= lc->end_of_tiles_x) ? lc->ctb_up_right_flag && !y0b
-                                               : cand_up;
-    int cand_bottom_left = (y0 + nPbH >= lc->end_of_tiles_y) ? 0 : cand_left;
 
+    const int cand_bottom_left = lc->na.cand_bottom_left;
+    const int cand_left        = lc->na.cand_left;
+    const int cand_up_left     = lc->na.cand_up_left;
+    const int cand_up          = lc->na.cand_up;
+    const int cand_up_right    = lc->na.cand_up_right_sap;
     ref_idx_curr       = LX;
     ref_idx            = mv->ref_idx[LX];
     pred_flag_index_l0 = LX;
@@ -702,97 +623,109 @@ void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
     // left bottom spatial candidate
     xA0 = x0 - 1;
     yA0 = y0 + nPbH;
-    xA0_pu = xA0 >> s->ps.sps->log2_min_pu_size;
-    yA0_pu = yA0 >> s->ps.sps->log2_min_pu_size;
 
-    is_available_a0 = PRED_BLOCK_AVAILABLE(A0) && AVAILABLE(cand_bottom_left, A0);
+    is_available_a0 = AVAILABLE(cand_bottom_left, A0) &&
+                      yA0 < s->ps.sps->height &&
+                      PRED_BLOCK_AVAILABLE(A0);
 
     //left spatial merge candidate
     xA1    = x0 - 1;
     yA1    = y0 + nPbH - 1;
-    xA1_pu = xA1 >> s->ps.sps->log2_min_pu_size;
-    yA1_pu = yA1 >> s->ps.sps->log2_min_pu_size;
 
     is_available_a1 = AVAILABLE(cand_left, A1);
     if (is_available_a0 || is_available_a1)
         isScaledFlag_L0 = 1;
 
     if (is_available_a0) {
-        availableFlagLXA0 = MP_MX(A0, pred_flag_index_l0, mxA);
-        if (!availableFlagLXA0)
-            availableFlagLXA0 = MP_MX(A0, pred_flag_index_l1, mxA);
-    }
-
-    if (is_available_a1 && !availableFlagLXA0) {
-        availableFlagLXA0 = MP_MX(A1, pred_flag_index_l0, mxA);
-        if (!availableFlagLXA0)
-            availableFlagLXA0 = MP_MX(A1, pred_flag_index_l1, mxA);
+        if (MP_MX(A0, pred_flag_index_l0, mxA)) {
+            goto b_candidates;
+        }
+        if (MP_MX(A0, pred_flag_index_l1, mxA)) {
+            goto b_candidates;
+        }
     }
 
-    if (is_available_a0 && !availableFlagLXA0) {
-        availableFlagLXA0 = MP_MX_LT(A0, pred_flag_index_l0, mxA);
-        if (!availableFlagLXA0)
-            availableFlagLXA0 = MP_MX_LT(A0, pred_flag_index_l1, mxA);
+    if (is_available_a1) {
+        if (MP_MX(A1, pred_flag_index_l0, mxA)) {
+            goto b_candidates;
+        }
+        if (MP_MX(A1, pred_flag_index_l1, mxA)) {
+            goto b_candidates;
+        }
     }
 
-    if (is_available_a1 && !availableFlagLXA0) {
-        availableFlagLXA0 = MP_MX_LT(A1, pred_flag_index_l0, mxA);
-        if (!availableFlagLXA0)
-            availableFlagLXA0 = MP_MX_LT(A1, pred_flag_index_l1, mxA);
+    if (is_available_a0) {
+        if (MP_MX_LT(A0, pred_flag_index_l0, mxA)) {
+            goto b_candidates;
+        }
+        if (MP_MX_LT(A0, pred_flag_index_l1, mxA)) {
+            goto b_candidates;
+        }
     }
 
-    if (availableFlagLXA0 && !mvp_lx_flag) {
-        mv->mv[LX] = mxA;
-        return;
+    if (is_available_a1) {
+        if (MP_MX_LT(A1, pred_flag_index_l0, mxA)) {
+            goto b_candidates;
+        }
+        if (MP_MX_LT(A1, pred_flag_index_l1, mxA)) {
+            goto b_candidates;
+        }
     }
+    availableFlagLXA0 = 0;
 
+b_candidates:
     // B candidates
     // above right spatial merge candidate
     xB0    = x0 + nPbW;
     yB0    = y0 - 1;
-    xB0_pu = xB0 >> s->ps.sps->log2_min_pu_size;
-    yB0_pu = yB0 >> s->ps.sps->log2_min_pu_size;
 
-    is_available_b0 = PRED_BLOCK_AVAILABLE(B0) && AVAILABLE(cand_up_right, B0);
+    is_available_b0 =  AVAILABLE(cand_up_right, B0) &&
+                       xB0 < s->ps.sps->width &&
+                       PRED_BLOCK_AVAILABLE(B0);
 
-    if (is_available_b0) {
-        availableFlagLXB0 = MP_MX(B0, pred_flag_index_l0, mxB);
-        if (!availableFlagLXB0)
-            availableFlagLXB0 = MP_MX(B0, pred_flag_index_l1, mxB);
-    }
-
-    if (!availableFlagLXB0) {
-        // above spatial merge candidate
-        xB1    = x0 + nPbW - 1;
-        yB1    = y0 - 1;
-        xB1_pu = xB1 >> s->ps.sps->log2_min_pu_size;
-        yB1_pu = yB1 >> s->ps.sps->log2_min_pu_size;
+    // above spatial merge candidate
+    xB1    = x0 + nPbW - 1;
+    yB1    = y0 - 1;
+    is_available_b1 = AVAILABLE(cand_up, B1);
 
-        is_available_b1 = AVAILABLE(cand_up, B1);
+    // above left spatial merge candidate
+    xB2 = x0 - 1;
+    yB2 = y0 - 1;
+    is_available_b2 = AVAILABLE(cand_up_left, B2);
 
-        if (is_available_b1) {
-            availableFlagLXB0 = MP_MX(B1, pred_flag_index_l0, mxB);
-            if (!availableFlagLXB0)
-                availableFlagLXB0 = MP_MX(B1, pred_flag_index_l1, mxB);
+    // above right spatial merge candidate
+    if (is_available_b0) {
+        if (MP_MX(B0, pred_flag_index_l0, mxB)) {
+            goto scalef;
+        }
+        if (MP_MX(B0, pred_flag_index_l1, mxB)) {
+            goto scalef;
         }
     }
 
-    if (!availableFlagLXB0) {
-        // above left spatial merge candidate
-        xB2 = x0 - 1;
-        yB2 = y0 - 1;
-        xB2_pu = xB2 >> s->ps.sps->log2_min_pu_size;
-        yB2_pu = yB2 >> s->ps.sps->log2_min_pu_size;
-        is_available_b2 = AVAILABLE(cand_up_left, B2);
+    // above spatial merge candidate
+    if (is_available_b1) {
+        if (MP_MX(B1, pred_flag_index_l0, mxB)) {
+            goto scalef;
+        }
+        if (MP_MX(B1, pred_flag_index_l1, mxB)) {
+            goto scalef;
+        }
+    }
 
-        if (is_available_b2) {
-            availableFlagLXB0 = MP_MX(B2, pred_flag_index_l0, mxB);
-            if (!availableFlagLXB0)
-                availableFlagLXB0 = MP_MX(B2, pred_flag_index_l1, mxB);
+    // above left spatial merge candidate
+    if (is_available_b2) {
+        if (MP_MX(B2, pred_flag_index_l0, mxB)) {
+            goto scalef;
+        }
+        if (MP_MX(B2, pred_flag_index_l1, mxB)) {
+            goto scalef;
         }
     }
+    availableFlagLXB0 = 0;
 
-    if (isScaledFlag_L0 == 0) {
+scalef:
+    if (!isScaledFlag_L0) {
         if (availableFlagLXB0) {
             availableFlagLXA0 = 1;
             mxA = mxB;
@@ -836,10 +769,5 @@ void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
             mvpcand_list[numMVPCandLX++] = mv_col;
     }
 
-    // insert zero motion vectors when the number of available candidates are less than 2
-    while (numMVPCandLX < 2)
-        mvpcand_list[numMVPCandLX++] = (Mv){ 0, 0 };
-
-    mv->mv[LX].x = mvpcand_list[mvp_lx_flag].x;
-    mv->mv[LX].y = mvpcand_list[mvp_lx_flag].y;
+    mv->mv[LX] = mvpcand_list[mvp_lx_flag];
 }
diff --git a/libavcodec/hevc_parse.c b/libavcodec/hevc_parse.c
index de00245809..63ed84a8de 100644
--- a/libavcodec/hevc_parse.c
+++ b/libavcodec/hevc_parse.c
@@ -1,20 +1,20 @@
 /*
  * HEVC common code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,12 +29,14 @@
 
 /* FIXME: This is adapted from ff_h264_decode_nal, avoiding duplication
  * between these functions would be nice. */
-int ff_hevc_extract_rbsp(const uint8_t *src, int length,
+int ff_hevc_extract_rbsp(HEVCContext *s, const uint8_t *src, int length,
                          HEVCNAL *nal)
 {
     int i, si, di;
     uint8_t *dst;
 
+    if (s)
+        nal->skipped_bytes = 0;
 #define STARTCODE_TEST                                                  \
         if (i + 2 < length && src[i + 1] == 0 && src[i + 2] <= 3) {     \
             if (src[i + 2] != 3) {                                      \
@@ -108,6 +110,22 @@ int ff_hevc_extract_rbsp(const uint8_t *src, int length,
                 dst[di++] = 0;
                 si       += 3;
 
+                if (s && nal->skipped_bytes_pos) {
+                    nal->skipped_bytes++;
+                    if (nal->skipped_bytes_pos_size < nal->skipped_bytes) {
+                        nal->skipped_bytes_pos_size *= 2;
+                        av_assert0(nal->skipped_bytes_pos_size >= nal->skipped_bytes);
+                        av_reallocp_array(&nal->skipped_bytes_pos,
+                                nal->skipped_bytes_pos_size,
+                                sizeof(*nal->skipped_bytes_pos));
+                        if (!nal->skipped_bytes_pos) {
+                            nal->skipped_bytes_pos_size = 0;
+                            return AVERROR(ENOMEM);
+                        }
+                    }
+                    if (nal->skipped_bytes_pos)
+                        nal->skipped_bytes_pos[nal->skipped_bytes-1] = di - 1;
+                }
                 continue;
             } else // next start code
                 goto nsc;
@@ -128,6 +146,38 @@ nsc:
     return si;
 }
 
+static const char *nal_unit_name(int nal_type)
+{
+    switch(nal_type) {
+    case NAL_TRAIL_N    : return "TRAIL_N";
+    case NAL_TRAIL_R    : return "TRAIL_R";
+    case NAL_TSA_N      : return "TSA_N";
+    case NAL_TSA_R      : return "TSA_R";
+    case NAL_STSA_N     : return "STSA_N";
+    case NAL_STSA_R     : return "STSA_R";
+    case NAL_RADL_N     : return "RADL_N";
+    case NAL_RADL_R     : return "RADL_R";
+    case NAL_RASL_N     : return "RASL_N";
+    case NAL_RASL_R     : return "RASL_R";
+    case NAL_BLA_W_LP   : return "BLA_W_LP";
+    case NAL_BLA_W_RADL : return "BLA_W_RADL";
+    case NAL_BLA_N_LP   : return "BLA_N_LP";
+    case NAL_IDR_W_RADL : return "IDR_W_RADL";
+    case NAL_IDR_N_LP   : return "IDR_N_LP";
+    case NAL_CRA_NUT    : return "CRA_NUT";
+    case NAL_VPS        : return "VPS";
+    case NAL_SPS        : return "SPS";
+    case NAL_PPS        : return "PPS";
+    case NAL_AUD        : return "AUD";
+    case NAL_EOS_NUT    : return "EOS_NUT";
+    case NAL_EOB_NUT    : return "EOB_NUT";
+    case NAL_FD_NUT     : return "FD_NUT";
+    case NAL_SEI_PREFIX : return "SEI_PREFIX";
+    case NAL_SEI_SUFFIX : return "SEI_SUFFIX";
+    default : return "?";
+    }
+}
+
 /**
  * @return AVERROR_INVALIDDATA if the packet is not a valid NAL unit,
  * 0 if the unit should be skipped, 1 otherwise
@@ -148,14 +198,14 @@ static int hls_nal_unit(HEVCNAL *nal, AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
 
     av_log(avctx, AV_LOG_DEBUG,
-           "nal_unit_type: %d, nuh_layer_id: %dtemporal_id: %d\n",
-           nal->type, nuh_layer_id, nal->temporal_id);
+           "nal_unit_type: %d(%s), nuh_layer_id: %d, temporal_id: %d\n",
+           nal->type, nal_unit_name(nal->type), nuh_layer_id, nal->temporal_id);
 
     return nuh_layer_id == 0;
 }
 
 
-int ff_hevc_split_packet(HEVCPacket *pkt, const uint8_t *buf, int length,
+int ff_hevc_split_packet(HEVCContext *s, HEVCPacket *pkt, const uint8_t *buf, int length,
                          AVCodecContext *avctx, int is_nalff, int nal_length_size)
 {
     int consumed, ret = 0;
@@ -177,13 +227,15 @@ int ff_hevc_split_packet(HEVCPacket *pkt, const uint8_t *buf, int length,
                 return AVERROR_INVALIDDATA;
             }
         } else {
-            if (buf[2] == 0) {
-                length--;
-                buf++;
-                continue;
+            /* search start code */
+            while (buf[0] != 0 || buf[1] != 0 || buf[2] != 1) {
+                ++buf;
+                --length;
+                if (length < 4) {
+                    av_log(avctx, AV_LOG_ERROR, "No start code is found.\n");
+                    return AVERROR_INVALIDDATA;
+                }
             }
-            if (buf[0] != 0 || buf[1] != 0 || buf[2] != 1)
-                return AVERROR_INVALIDDATA;
 
             buf           += 3;
             length        -= 3;
@@ -192,21 +244,31 @@ int ff_hevc_split_packet(HEVCPacket *pkt, const uint8_t *buf, int length,
 
         if (pkt->nals_allocated < pkt->nb_nals + 1) {
             int new_size = pkt->nals_allocated + 1;
-            HEVCNAL *tmp = av_realloc_array(pkt->nals, new_size, sizeof(*tmp));
+            void *tmp = av_realloc_array(pkt->nals, new_size, sizeof(*pkt->nals));
+
             if (!tmp)
                 return AVERROR(ENOMEM);
 
             pkt->nals = tmp;
             memset(pkt->nals + pkt->nals_allocated, 0,
-                   (new_size - pkt->nals_allocated) * sizeof(*tmp));
+                   (new_size - pkt->nals_allocated) * sizeof(*pkt->nals));
+
+            nal = &pkt->nals[pkt->nb_nals];
+            nal->skipped_bytes_pos_size = 1024; // initial buffer size
+            nal->skipped_bytes_pos = av_malloc_array(nal->skipped_bytes_pos_size, sizeof(*nal->skipped_bytes_pos));
+            if (!nal->skipped_bytes_pos)
+                return AVERROR(ENOMEM);
+
             pkt->nals_allocated = new_size;
         }
-        nal = &pkt->nals[pkt->nb_nals++];
+        nal = &pkt->nals[pkt->nb_nals];
 
-        consumed = ff_hevc_extract_rbsp(buf, extract_length, nal);
+        consumed = ff_hevc_extract_rbsp(s, buf, extract_length, nal);
         if (consumed < 0)
             return consumed;
 
+        pkt->nb_nals++;
+
         ret = init_get_bits8(&nal->gb, nal->data, nal->size);
         if (ret < 0)
             return ret;
@@ -226,3 +288,4 @@ int ff_hevc_split_packet(HEVCPacket *pkt, const uint8_t *buf, int length,
 
     return 0;
 }
+
diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c
index 030163e7e0..fc107978c6 100644
--- a/libavcodec/hevc_parser.c
+++ b/libavcodec/hevc_parser.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,8 @@
 
 #define IS_IRAP_NAL(nal) (nal->type >= 16 && nal->type <= 23)
 
+#define ADVANCED_PARSER CONFIG_HEVC_DECODER
+
 typedef struct HEVCParserContext {
     ParseContext pc;
 
@@ -37,8 +39,13 @@ typedef struct HEVCParserContext {
     HEVCParamSets ps;
 
     int parsed_extradata;
+
+#if ADVANCED_PARSER
+    HEVCContext h;
+#endif
 } HEVCParserContext;
 
+#if !ADVANCED_PARSER
 static int hevc_parse_slice_header(AVCodecParserContext *s, HEVCNAL *nal,
                                    AVCodecContext *avctx)
 {
@@ -81,7 +88,7 @@ static int parse_nal_units(AVCodecParserContext *s, const uint8_t *buf,
     HEVCParserContext *ctx = s->priv_data;
     int ret, i;
 
-    ret = ff_hevc_split_packet(&ctx->pkt, buf, buf_size, avctx, 0, 0);
+    ret = ff_hevc_split_packet(NULL, &ctx->pkt, buf, buf_size, avctx, 0, 0);
     if (ret < 0)
         return ret;
 
@@ -108,12 +115,19 @@ static int parse_nal_units(AVCodecParserContext *s, const uint8_t *buf,
         case NAL_RADL_N:
         case NAL_RADL_R:
         case NAL_RASL_N:
-        case NAL_RASL_R: hevc_parse_slice_header(s, nal, avctx); break;
+        case NAL_RASL_R:
+            if (buf == avctx->extradata) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid NAL unit: %d\n", nal->type);
+                return AVERROR_INVALIDDATA;
+            }
+            hevc_parse_slice_header(s, nal, avctx);
+            break;
         }
     }
 
     return 0;
 }
+#endif
 
 /**
  * Find the end of the current frame in the bitstream.
@@ -122,9 +136,8 @@ static int parse_nal_units(AVCodecParserContext *s, const uint8_t *buf,
 static int hevc_find_frame_end(AVCodecParserContext *s, const uint8_t *buf,
                                int buf_size)
 {
-    HEVCParserContext *ctx = s->priv_data;
-    ParseContext       *pc = &ctx->pc;
     int i;
+    ParseContext *pc = s->priv_data;
 
     for (i = 0; i < buf_size; i++) {
         int nut;
@@ -148,7 +161,6 @@ static int hevc_find_frame_end(AVCodecParserContext *s, const uint8_t *buf,
             if (first_slice_segment_in_pic_flag) {
                 if (!pc->frame_start_found) {
                     pc->frame_start_found = 1;
-                    s->key_frame = nut >= NAL_BLA_W_LP && nut <= NAL_CRA_NUT;
                 } else { // First slice of next frame found
                     pc->frame_start_found = 0;
                     return i - 5;
@@ -160,12 +172,217 @@ static int hevc_find_frame_end(AVCodecParserContext *s, const uint8_t *buf,
     return END_NOT_FOUND;
 }
 
-static int hevc_parse(AVCodecParserContext *s, AVCodecContext *avctx,
+#if ADVANCED_PARSER
+/**
+ * Parse NAL units of found picture and decode some basic information.
+ *
+ * @param s parser context.
+ * @param avctx codec context.
+ * @param buf buffer with field/frame data.
+ * @param buf_size size of the buffer.
+ */
+static inline int parse_nal_units(AVCodecParserContext *s, const uint8_t *buf,
+                           int buf_size, AVCodecContext *avctx)
+{
+    HEVCParserContext *ctx = s->priv_data;
+    HEVCContext       *h   = &ctx->h;
+    GetBitContext      *gb;
+    SliceHeader        *sh = &h->sh;
+    HEVCParamSets *ps = &h->ps;
+    HEVCPacket   *pkt = &ctx->pkt;
+    const uint8_t *buf_end = buf + buf_size;
+    int state = -1, i;
+    HEVCNAL *nal;
+    int is_global = buf == avctx->extradata;
+
+    if (!h->HEVClc)
+        h->HEVClc = av_mallocz(sizeof(HEVCLocalContext));
+    if (!h->HEVClc)
+        return AVERROR(ENOMEM);
+
+    gb = &h->HEVClc->gb;
+
+    /* set some sane default values */
+    s->pict_type         = AV_PICTURE_TYPE_I;
+    s->key_frame         = 0;
+    s->picture_structure = AV_PICTURE_STRUCTURE_UNKNOWN;
+
+    h->avctx = avctx;
+
+    if (!buf_size)
+        return 0;
+
+    if (pkt->nals_allocated < 1) {
+        HEVCNAL *tmp = av_realloc_array(pkt->nals, 1, sizeof(*tmp));
+        if (!tmp)
+            return AVERROR(ENOMEM);
+        pkt->nals = tmp;
+        memset(pkt->nals, 0, sizeof(*tmp));
+        pkt->nals_allocated = 1;
+    }
+
+    nal = &pkt->nals[0];
+
+    for (;;) {
+        int src_length, consumed;
+        int ret;
+        buf = avpriv_find_start_code(buf, buf_end, &state);
+        if (--buf + 2 >= buf_end)
+            break;
+        src_length = buf_end - buf;
+
+        h->nal_unit_type = (*buf >> 1) & 0x3f;
+        h->temporal_id   = (*(buf + 1) & 0x07) - 1;
+        if (h->nal_unit_type <= NAL_CRA_NUT) {
+            // Do not walk the whole buffer just to decode slice segment header
+            if (src_length > 20)
+                src_length = 20;
+        }
+
+        consumed = ff_hevc_extract_rbsp(NULL, buf, src_length, nal);
+        if (consumed < 0)
+            return consumed;
+
+        ret = init_get_bits8(gb, nal->data + 2, nal->size);
+        if (ret < 0)
+            return ret;
+
+        switch (h->nal_unit_type) {
+        case NAL_VPS:
+            ff_hevc_decode_nal_vps(gb, avctx, ps);
+            break;
+        case NAL_SPS:
+            ff_hevc_decode_nal_sps(gb, avctx, ps, 1);
+            break;
+        case NAL_PPS:
+            ff_hevc_decode_nal_pps(gb, avctx, ps);
+            break;
+        case NAL_SEI_PREFIX:
+        case NAL_SEI_SUFFIX:
+            ff_hevc_decode_nal_sei(h);
+            break;
+        case NAL_TRAIL_N:
+        case NAL_TRAIL_R:
+        case NAL_TSA_N:
+        case NAL_TSA_R:
+        case NAL_STSA_N:
+        case NAL_STSA_R:
+        case NAL_RADL_N:
+        case NAL_RADL_R:
+        case NAL_RASL_N:
+        case NAL_RASL_R:
+        case NAL_BLA_W_LP:
+        case NAL_BLA_W_RADL:
+        case NAL_BLA_N_LP:
+        case NAL_IDR_W_RADL:
+        case NAL_IDR_N_LP:
+        case NAL_CRA_NUT:
+
+            if (is_global) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid NAL unit: %d\n", h->nal_unit_type);
+                return AVERROR_INVALIDDATA;
+            }
+
+            sh->first_slice_in_pic_flag = get_bits1(gb);
+            s->picture_structure = h->picture_struct;
+            s->field_order = h->picture_struct;
+
+            if (IS_IRAP(h)) {
+                s->key_frame = 1;
+                sh->no_output_of_prior_pics_flag = get_bits1(gb);
+            }
+
+            sh->pps_id = get_ue_golomb(gb);
+            if (sh->pps_id >= MAX_PPS_COUNT || !ps->pps_list[sh->pps_id]) {
+                av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
+                return AVERROR_INVALIDDATA;
+            }
+            ps->pps = (HEVCPPS*)ps->pps_list[sh->pps_id]->data;
+
+            if (ps->pps->sps_id >= MAX_SPS_COUNT || !ps->sps_list[ps->pps->sps_id]) {
+                av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", ps->pps->sps_id);
+                return AVERROR_INVALIDDATA;
+            }
+            if (ps->sps != (HEVCSPS*)ps->sps_list[ps->pps->sps_id]->data) {
+                ps->sps = (HEVCSPS*)ps->sps_list[ps->pps->sps_id]->data;
+                ps->vps = (HEVCVPS*)ps->vps_list[ps->sps->vps_id]->data;
+            }
+
+            if (!sh->first_slice_in_pic_flag) {
+                int slice_address_length;
+
+                if (ps->pps->dependent_slice_segments_enabled_flag)
+                    sh->dependent_slice_segment_flag = get_bits1(gb);
+                else
+                    sh->dependent_slice_segment_flag = 0;
+
+                slice_address_length = av_ceil_log2_c(ps->sps->ctb_width *
+                                                      ps->sps->ctb_height);
+                sh->slice_segment_addr = slice_address_length ? get_bits(gb, slice_address_length) : 0;
+                if (sh->slice_segment_addr >= ps->sps->ctb_width * ps->sps->ctb_height) {
+                    av_log(avctx, AV_LOG_ERROR, "Invalid slice segment address: %u.\n",
+                           sh->slice_segment_addr);
+                    return AVERROR_INVALIDDATA;
+                }
+            } else
+                sh->dependent_slice_segment_flag = 0;
+
+            if (sh->dependent_slice_segment_flag)
+                break;
+
+            for (i = 0; i < ps->pps->num_extra_slice_header_bits; i++)
+                skip_bits(gb, 1); // slice_reserved_undetermined_flag[]
+
+            sh->slice_type = get_ue_golomb(gb);
+            if (!(sh->slice_type == I_SLICE || sh->slice_type == P_SLICE ||
+                  sh->slice_type == B_SLICE)) {
+                av_log(avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
+                       sh->slice_type);
+                return AVERROR_INVALIDDATA;
+            }
+            s->pict_type = sh->slice_type == B_SLICE ? AV_PICTURE_TYPE_B :
+                           sh->slice_type == P_SLICE ? AV_PICTURE_TYPE_P :
+                                                       AV_PICTURE_TYPE_I;
+
+            if (ps->pps->output_flag_present_flag)
+                sh->pic_output_flag = get_bits1(gb);
+
+            if (ps->sps->separate_colour_plane_flag)
+                sh->colour_plane_id = get_bits(gb, 2);
+
+            if (!IS_IDR(h)) {
+                sh->pic_order_cnt_lsb = get_bits(gb, ps->sps->log2_max_poc_lsb);
+                s->output_picture_number = h->poc = ff_hevc_compute_poc(h, sh->pic_order_cnt_lsb);
+            } else
+                s->output_picture_number = h->poc = 0;
+
+            if (h->temporal_id == 0 &&
+                h->nal_unit_type != NAL_TRAIL_N &&
+                h->nal_unit_type != NAL_TSA_N &&
+                h->nal_unit_type != NAL_STSA_N &&
+                h->nal_unit_type != NAL_RADL_N &&
+                h->nal_unit_type != NAL_RASL_N &&
+                h->nal_unit_type != NAL_RADL_R &&
+                h->nal_unit_type != NAL_RASL_R)
+                h->pocTid0 = h->poc;
+
+            return 0; /* no need to evaluate the rest */
+        }
+        buf += consumed;
+    }
+    /* didn't find a picture! */
+    if (!is_global)
+        av_log(h->avctx, AV_LOG_ERROR, "missing picture in access unit\n");
+    return -1;
+}
+#endif
+
+static int hevc_parse(AVCodecParserContext *s,
+                      AVCodecContext *avctx,
                       const uint8_t **poutbuf, int *poutbuf_size,
                       const uint8_t *buf, int buf_size)
 {
     int next;
-
     HEVCParserContext *ctx = s->priv_data;
     ParseContext *pc = &ctx->pc;
 
@@ -195,20 +412,31 @@ static int hevc_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 // Split after the parameter sets at the beginning of the stream if they exist.
 static int hevc_split(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
 {
-    int i;
+    const uint8_t *ptr = buf, *end = buf + buf_size;
     uint32_t state = -1;
-    int has_ps = 0;
-
-    for (i = 0; i < buf_size; i++) {
-        state = (state << 8) | buf[i];
-        if (((state >> 8) & 0xFFFFFF) == START_CODE) {
-            int nut = (state >> 1) & 0x3F;
-            if (nut >= NAL_VPS && nut <= NAL_PPS)
-                has_ps = 1;
-            else if (has_ps)
-                return i - 3;
-            else // no parameter set at the beginning of the stream
-                return 0;
+    int has_vps = 0;
+    int has_sps = 0;
+    int has_pps = 0;
+    int nut;
+
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
+        if ((state >> 8) != START_CODE)
+            break;
+        nut = (state >> 1) & 0x3F;
+        if (nut == NAL_VPS)
+            has_vps = 1;
+        else if (nut == NAL_SPS)
+            has_sps = 1;
+        else if (nut == NAL_PPS)
+            has_pps = 1;
+        else if ((nut != NAL_SEI_PREFIX || has_pps) &&
+                  nut != NAL_AUD) {
+            if (has_vps && has_sps) {
+                while (ptr - 4 > buf && ptr[-5] == 0)
+                    ptr--;
+                return ptr - 4 - buf;
+            }
         }
     }
     return 0;
@@ -219,6 +447,21 @@ static void hevc_parser_close(AVCodecParserContext *s)
     HEVCParserContext *ctx = s->priv_data;
     int i;
 
+#if ADVANCED_PARSER
+    HEVCContext  *h  = &ctx->h;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(h->ps.vps_list); i++)
+        av_buffer_unref(&h->ps.vps_list[i]);
+    for (i = 0; i < FF_ARRAY_ELEMS(h->ps.sps_list); i++)
+        av_buffer_unref(&h->ps.sps_list[i]);
+    for (i = 0; i < FF_ARRAY_ELEMS(h->ps.pps_list); i++)
+        av_buffer_unref(&h->ps.pps_list[i]);
+
+    h->ps.sps = NULL;
+
+    av_freep(&h->HEVClc);
+#endif
+
     for (i = 0; i < FF_ARRAY_ELEMS(ctx->ps.vps_list); i++)
         av_buffer_unref(&ctx->ps.vps_list[i]);
     for (i = 0; i < FF_ARRAY_ELEMS(ctx->ps.sps_list); i++)
@@ -226,8 +469,12 @@ static void hevc_parser_close(AVCodecParserContext *s)
     for (i = 0; i < FF_ARRAY_ELEMS(ctx->ps.pps_list); i++)
         av_buffer_unref(&ctx->ps.pps_list[i]);
 
-    for (i = 0; i < ctx->pkt.nals_allocated; i++)
+    ctx->ps.sps = NULL;
+
+    for (i = 0; i < ctx->pkt.nals_allocated; i++) {
         av_freep(&ctx->pkt.nals[i].rbsp_buffer);
+        av_freep(&ctx->pkt.nals[i].skipped_bytes_pos);
+    }
     av_freep(&ctx->pkt.nals);
     ctx->pkt.nals_allocated = 0;
 
diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
index 2faee96bbf..427cf098aa 100644
--- a/libavcodec/hevc_ps.c
+++ b/libavcodec/hevc_ps.c
@@ -6,25 +6,24 @@
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  * Copyright (C) 2013 Vittorio Giovara
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
-
 #include "golomb.h"
 #include "hevc.h"
 
@@ -88,6 +87,8 @@ static void remove_sps(HEVCParamSets *s, int id)
         for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++)
             if (s->pps_list[i] && ((HEVCPPS*)s->pps_list[i]->data)->sps_id == id)
                 remove_pps(s, i);
+
+        av_assert0(!(s->sps_list[id] && s->sps == (HEVCSPS*)s->sps_list[id]->data));
     }
     av_buffer_unref(&s->sps_list[id]);
 }
@@ -121,7 +122,8 @@ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
 
     if (rps_predict) {
         const ShortTermRPS *rps_ridx;
-        int delta_rps, abs_delta_rps;
+        int delta_rps;
+        unsigned abs_delta_rps;
         uint8_t use_delta_flag = 0;
         uint8_t delta_rps_sign;
 
@@ -134,11 +136,18 @@ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
                 return AVERROR_INVALIDDATA;
             }
             rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx];
+            rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs;
         } else
             rps_ridx = &sps->st_rps[rps - sps->st_rps - 1];
 
         delta_rps_sign = get_bits1(gb);
         abs_delta_rps  = get_ue_golomb_long(gb) + 1;
+        if (abs_delta_rps < 1 || abs_delta_rps > 32768) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid value of abs_delta_rps: %d\n",
+                   abs_delta_rps);
+            return AVERROR_INVALIDDATA;
+        }
         delta_rps      = (1 - (delta_rps_sign << 1)) * abs_delta_rps;
         for (i = 0; i <= rps_ridx->num_delta_pocs; i++) {
             int used = rps->used[k] = get_bits1(gb);
@@ -226,11 +235,14 @@ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
 }
 
 
-static void decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx,
+static int decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx,
                                       PTLCommon *ptl)
 {
     int i;
 
+    if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12)
+        return -1;
+
     ptl->profile_space = get_bits(gb, 2);
     ptl->tier_flag     = get_bits1(gb);
     ptl->profile_idc   = get_bits(gb, 5);
@@ -240,6 +252,8 @@ static void decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx,
         av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n");
     else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE)
         av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n");
+    else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT)
+        av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n");
     else
         av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc);
 
@@ -253,28 +267,48 @@ static void decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx,
     skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15]
     skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31]
     skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43]
+
+    return 0;
 }
 
-static void parse_ptl(GetBitContext *gb, AVCodecContext *avctx,
+static int parse_ptl(GetBitContext *gb, AVCodecContext *avctx,
                       PTL *ptl, int max_num_sub_layers)
 {
     int i;
-    decode_profile_tier_level(gb, avctx, &ptl->general_ptl);
+    if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 ||
+        get_bits_left(gb) < 8 + 8*2) {
+        av_log(avctx, AV_LOG_ERROR, "PTL information too short\n");
+        return -1;
+    }
+
     ptl->general_ptl.level_idc = get_bits(gb, 8);
 
     for (i = 0; i < max_num_sub_layers - 1; i++) {
         ptl->sub_layer_profile_present_flag[i] = get_bits1(gb);
         ptl->sub_layer_level_present_flag[i]   = get_bits1(gb);
     }
-    if (max_num_sub_layers - 1 > 0)
+
+    if (max_num_sub_layers - 1> 0)
         for (i = max_num_sub_layers - 1; i < 8; i++)
             skip_bits(gb, 2); // reserved_zero_2bits[i]
     for (i = 0; i < max_num_sub_layers - 1; i++) {
-        if (ptl->sub_layer_profile_present_flag[i])
-            decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]);
-        if (ptl->sub_layer_level_present_flag[i])
-            ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8);
+        if (ptl->sub_layer_profile_present_flag[i] &&
+            decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "PTL information for sublayer %i too short\n", i);
+            return -1;
+        }
+        if (ptl->sub_layer_level_present_flag[i]) {
+            if (get_bits_left(gb) < 8) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Not enough data for sublayer %i level_idc\n", i);
+                return -1;
+            } else
+                ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8);
+        }
     }
+
+    return 0;
 }
 
 static void decode_sublayer_hrd(GetBitContext *gb, unsigned int nb_cpb,
@@ -294,7 +328,7 @@ static void decode_sublayer_hrd(GetBitContext *gb, unsigned int nb_cpb,
     }
 }
 
-static void decode_hrd(GetBitContext *gb, int common_inf_present,
+static int decode_hrd(GetBitContext *gb, int common_inf_present,
                        int max_sublayers)
 {
     int nal_params_present = 0, vcl_params_present = 0;
@@ -340,14 +374,20 @@ static void decode_hrd(GetBitContext *gb, int common_inf_present,
         else
             low_delay = get_bits1(gb);
 
-        if (!low_delay)
+        if (!low_delay) {
             nb_cpb = get_ue_golomb_long(gb) + 1;
+            if (nb_cpb < 1 || nb_cpb > 32) {
+                av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb);
+                return AVERROR_INVALIDDATA;
+            }
+        }
 
         if (nal_params_present)
             decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
         if (vcl_params_present)
             decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
     }
+    return 0;
 }
 
 int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
@@ -390,7 +430,8 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
         goto err;
     }
 
-    parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers);
+    if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0)
+        goto err;
 
     vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb);
 
@@ -400,7 +441,7 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
         vps->vps_num_reorder_pics[i]      = get_ue_golomb_long(gb);
         vps->vps_max_latency_increase[i]  = get_ue_golomb_long(gb) - 1;
 
-        if (vps->vps_max_dec_pic_buffering[i] > MAX_DPB_SIZE) {
+        if (vps->vps_max_dec_pic_buffering[i] > MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) {
             av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n",
                    vps->vps_max_dec_pic_buffering[i] - 1);
             goto err;
@@ -415,6 +456,12 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
 
     vps->vps_max_layer_id   = get_bits(gb, 6);
     vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1;
+    if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 ||
+        (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) {
+        av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n");
+        goto err;
+    }
+
     for (i = 1; i < vps->vps_num_layer_sets; i++)
         for (j = 0; j <= vps->vps_max_layer_id; j++)
             skip_bits(gb, 1);  // layer_id_included_flag[i][j]
@@ -427,6 +474,11 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
         if (vps->vps_poc_proportional_to_timing_flag)
             vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1;
         vps->vps_num_hrd_parameters = get_ue_golomb_long(gb);
+        if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters);
+            goto err;
+        }
         for (i = 0; i < vps->vps_num_hrd_parameters; i++) {
             int common_inf_present = 1;
 
@@ -438,6 +490,13 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
     }
     get_bits1(gb); /* vps_extension_flag */
 
+    if (get_bits_left(gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Overread VPS by %d bits\n", -get_bits_left(gb));
+        if (ps->vps_list[vps_id])
+            goto err;
+    }
+
     if (ps->vps_list[vps_id] &&
         !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) {
         av_buffer_unref(&vps_buf);
@@ -457,7 +516,8 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
                        int apply_defdispwin, HEVCSPS *sps)
 {
     VUI *vui          = &sps->vui;
-    int sar_present;
+    GetBitContext backup;
+    int sar_present, alt = 0;
 
     av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n");
 
@@ -510,7 +570,14 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
     vui->field_seq_flag                = get_bits1(gb);
     vui->frame_field_info_present_flag = get_bits1(gb);
 
-    vui->default_display_window_flag = get_bits1(gb);
+    if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) {
+        vui->default_display_window_flag = 0;
+        av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n");
+    } else
+        vui->default_display_window_flag = get_bits1(gb);
+    // Backup context in case an alternate header is detected
+    memcpy(&backup, gb, sizeof(backup));
+
     if (vui->default_display_window_flag) {
         //TODO: * 2 is only valid for 420
         vui->def_disp_win.left_offset   = get_ue_golomb_long(gb) * 2;
@@ -536,9 +603,24 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
     }
 
     vui->vui_timing_info_present_flag = get_bits1(gb);
+
     if (vui->vui_timing_info_present_flag) {
+        if( get_bits_left(gb) < 66) {
+            // The alternate syntax seem to have timing info located
+            // at where def_disp_win is normally located
+            av_log(avctx, AV_LOG_WARNING,
+                   "Strange VUI timing information, retrying...\n");
+            vui->default_display_window_flag = 0;
+            memset(&vui->def_disp_win, 0, sizeof(vui->def_disp_win));
+            memcpy(gb, &backup, sizeof(backup));
+            alt = 1;
+        }
         vui->vui_num_units_in_tick               = get_bits_long(gb, 32);
         vui->vui_time_scale                      = get_bits_long(gb, 32);
+        if (alt) {
+            av_log(avctx, AV_LOG_INFO, "Retry got %i/%i fps\n",
+                   vui->vui_time_scale, vui->vui_num_units_in_tick);
+        }
         vui->vui_poc_proportional_to_timing_flag = get_bits1(gb);
         if (vui->vui_poc_proportional_to_timing_flag)
             vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb);
@@ -583,19 +665,24 @@ static void set_default_scaling_list_data(ScalingList *sl)
     memcpy(sl->sl[2][4], default_scaling_list_inter, 64);
     memcpy(sl->sl[2][5], default_scaling_list_inter, 64);
     memcpy(sl->sl[3][0], default_scaling_list_intra, 64);
-    memcpy(sl->sl[3][1], default_scaling_list_inter, 64);
+    memcpy(sl->sl[3][1], default_scaling_list_intra, 64);
+    memcpy(sl->sl[3][2], default_scaling_list_intra, 64);
+    memcpy(sl->sl[3][3], default_scaling_list_inter, 64);
+    memcpy(sl->sl[3][4], default_scaling_list_inter, 64);
+    memcpy(sl->sl[3][5], default_scaling_list_inter, 64);
 }
 
-static int scaling_list_data(GetBitContext *gb, AVCodecContext *avctx, ScalingList *sl)
+static int scaling_list_data(GetBitContext *gb, AVCodecContext *avctx, ScalingList *sl, HEVCSPS *sps)
 {
-    uint8_t scaling_list_pred_mode_flag[4][6];
+    uint8_t scaling_list_pred_mode_flag;
     int32_t scaling_list_dc_coef[2][6];
-    int size_id, matrix_id, i, pos;
+    int size_id, matrix_id, pos;
+    int i;
 
     for (size_id = 0; size_id < 4; size_id++)
-        for (matrix_id = 0; matrix_id < (size_id == 3 ? 2 : 6); matrix_id++) {
-            scaling_list_pred_mode_flag[size_id][matrix_id] = get_bits1(gb);
-            if (!scaling_list_pred_mode_flag[size_id][matrix_id]) {
+        for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) {
+            scaling_list_pred_mode_flag = get_bits1(gb);
+            if (!scaling_list_pred_mode_flag) {
                 unsigned int delta = get_ue_golomb_long(gb);
                 /* Only need to handle non-zero delta. Zero means default,
                  * which should already be in the arrays. */
@@ -639,26 +726,58 @@ static int scaling_list_data(GetBitContext *gb, AVCodecContext *avctx, ScalingLi
             }
         }
 
+    if (sps->chroma_format_idc == 3) {
+        for (i = 0; i < 64; i++) {
+            sl->sl[3][1][i] = sl->sl[2][1][i];
+            sl->sl[3][2][i] = sl->sl[2][2][i];
+            sl->sl[3][4][i] = sl->sl[2][4][i];
+            sl->sl[3][5][i] = sl->sl[2][5][i];
+        }
+        sl->sl_dc[1][1] = sl->sl_dc[0][1];
+        sl->sl_dc[1][2] = sl->sl_dc[0][2];
+        sl->sl_dc[1][4] = sl->sl_dc[0][4];
+        sl->sl_dc[1][5] = sl->sl_dc[0][5];
+    }
+
+
     return 0;
 }
 
 static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps)
 {
     const AVPixFmtDescriptor *desc;
-    if (sps->chroma_format_idc == 1) {
-        switch (sps->bit_depth) {
-        case 8:  sps->pix_fmt = AV_PIX_FMT_YUV420P;   break;
-        case 9:  sps->pix_fmt = AV_PIX_FMT_YUV420P9;  break;
-        case 10: sps->pix_fmt = AV_PIX_FMT_YUV420P10; break;
-        default:
-            av_log(avctx, AV_LOG_ERROR, "Unsupported bit depth: %d\n",
-                   sps->bit_depth);
-            return AVERROR_PATCHWELCOME;
-        }
-    } else {
+    switch (sps->bit_depth) {
+    case 8:
+        if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8;
+        if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P;
+        if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P;
+        if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P;
+       break;
+    case 9:
+        if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16;
+        if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P9;
+        if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P9;
+        if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P9;
+        break;
+    case 10:
+        if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16;
+        if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10;
+        if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10;
+        if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10;
+        break;
+    case 12:
+        if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16;
+        if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P12;
+        if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P12;
+        if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P12;
+        break;
+    default:
         av_log(avctx, AV_LOG_ERROR,
-               "non-4:2:0 support is currently unspecified.\n");
-        return AVERROR_PATCHWELCOME;
+               "4:2:0, 4:2:2, 4:4:4 supports are currently specified for 8, 10 and 12 bits.\n");
+        av_log(avctx, AV_LOG_ERROR,
+               "chroma_format_idc is %d, depth is %d",
+               sps->chroma_format_idc, sps->bit_depth);
+        return AVERROR_INVALIDDATA;
     }
 
     desc = av_pix_fmt_desc_get(sps->pix_fmt);
@@ -687,52 +806,46 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     sps->vps_id = get_bits(gb, 4);
     if (sps->vps_id >= MAX_VPS_COUNT) {
         av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     if (vps_list && !vps_list[sps->vps_id]) {
         av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n",
                sps->vps_id);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     sps->max_sub_layers = get_bits(gb, 3) + 1;
     if (sps->max_sub_layers > MAX_SUB_LAYERS) {
         av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n",
                sps->max_sub_layers);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     skip_bits1(gb); // temporal_id_nesting_flag
 
-    parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers);
+    if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0)
+        return ret;
 
     *sps_id = get_ue_golomb_long(gb);
     if (*sps_id >= MAX_SPS_COUNT) {
         av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     sps->chroma_format_idc = get_ue_golomb_long(gb);
-    if (sps->chroma_format_idc != 1) {
-        avpriv_report_missing_feature(avctx, "chroma_format_idc %d",
-                                      sps->chroma_format_idc);
-        ret = AVERROR_PATCHWELCOME;
-        goto err;
-    }
 
     if (sps->chroma_format_idc == 3)
         sps->separate_colour_plane_flag = get_bits1(gb);
 
+    if (sps->separate_colour_plane_flag)
+        sps->chroma_format_idc = 0;
+
     sps->width  = get_ue_golomb_long(gb);
     sps->height = get_ue_golomb_long(gb);
     if ((ret = av_image_check_size(sps->width,
                                    sps->height, 0, avctx)) < 0)
-        goto err;
+        return ret;
 
     if (get_bits1(gb)) { // pic_conformance_flag
         //TODO: * 2 is only valid for 420
@@ -760,26 +873,23 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
 
     sps->bit_depth   = get_ue_golomb_long(gb) + 8;
     bit_depth_chroma = get_ue_golomb_long(gb) + 8;
-    if (bit_depth_chroma != sps->bit_depth) {
+    if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) {
         av_log(avctx, AV_LOG_ERROR,
                "Luma bit depth (%d) is different from chroma bit depth (%d), "
                "this is unsupported.\n",
                sps->bit_depth, bit_depth_chroma);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
-
     ret = map_pixel_format(avctx, sps);
     if (ret < 0)
-        goto err;
+        return ret;
 
     sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4;
     if (sps->log2_max_poc_lsb > 16) {
         av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n",
                sps->log2_max_poc_lsb - 4);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     sublayer_ordering_info = get_bits1(gb);
@@ -791,16 +901,14 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
         if (sps->temporal_layer[i].max_dec_pic_buffering > MAX_DPB_SIZE) {
             av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n",
                    sps->temporal_layer[i].max_dec_pic_buffering - 1);
-            ret = AVERROR_INVALIDDATA;
-            goto err;
+            return AVERROR_INVALIDDATA;
         }
         if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) {
             av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n",
                    sps->temporal_layer[i].num_reorder_pics);
             if (avctx->err_recognition & AV_EF_EXPLODE ||
                 sps->temporal_layer[i].num_reorder_pics > MAX_DPB_SIZE - 1) {
-                ret = AVERROR_INVALIDDATA;
-                goto err;
+                return AVERROR_INVALIDDATA;
             }
             sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1;
         }
@@ -821,11 +929,26 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     sps->log2_max_trafo_size                 = log2_diff_max_min_transform_block_size +
                                                sps->log2_min_tb_size;
 
-    if (sps->log2_min_tb_size >= sps->log2_min_cb_size) {
+    if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (sps->log2_diff_max_min_coding_block_size > 30) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) {
         av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size");
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size);
+        return AVERROR_INVALIDDATA;
     }
+
     sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb);
     sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb);
 
@@ -834,9 +957,9 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
         set_default_scaling_list_data(&sps->scaling_list);
 
         if (get_bits1(gb)) {
-            ret = scaling_list_data(gb, avctx, &sps->scaling_list);
+            ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps);
             if (ret < 0)
-                goto err;
+                return ret;
         }
     }
 
@@ -854,8 +977,7 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
             av_log(avctx, AV_LOG_ERROR,
                    "PCM bit depth (%d) is greater than normal bit depth (%d)\n",
                    sps->pcm.bit_depth, sps->bit_depth);
-            ret = AVERROR_INVALIDDATA;
-            goto err;
+            return AVERROR_INVALIDDATA;
         }
 
         sps->pcm.loop_filter_disable_flag = get_bits1(gb);
@@ -865,18 +987,22 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     if (sps->nb_st_rps > MAX_SHORT_TERM_RPS_COUNT) {
         av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n",
                sps->nb_st_rps);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
     for (i = 0; i < sps->nb_st_rps; i++) {
         if ((ret = ff_hevc_decode_short_term_rps(gb, avctx, &sps->st_rps[i],
                                                  sps, 0)) < 0)
-            goto err;
+            return ret;
     }
 
     sps->long_term_ref_pics_present_flag = get_bits1(gb);
     if (sps->long_term_ref_pics_present_flag) {
         sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb);
+        if (sps->num_long_term_ref_pics_sps > 31U) {
+            av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n",
+                   sps->num_long_term_ref_pics_sps);
+            return AVERROR_INVALIDDATA;
+        }
         for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) {
             sps->lt_ref_pic_poc_lsb_sps[i]       = get_bits(gb, sps->log2_max_poc_lsb);
             sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb);
@@ -889,8 +1015,42 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     vui_present = get_bits1(gb);
     if (vui_present)
         decode_vui(gb, avctx, apply_defdispwin, sps);
-    skip_bits1(gb); // sps_extension_flag
 
+    if (get_bits1(gb)) { // sps_extension_flag
+        int sps_extension_flag[1];
+        for (i = 0; i < 1; i++)
+            sps_extension_flag[i] = get_bits1(gb);
+        skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
+        if (sps_extension_flag[0]) {
+            int extended_precision_processing_flag;
+            int high_precision_offsets_enabled_flag;
+            int cabac_bypass_alignment_enabled_flag;
+
+            sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
+            sps->transform_skip_context_enabled_flag  = get_bits1(gb);
+            sps->implicit_rdpcm_enabled_flag = get_bits1(gb);
+
+            sps->explicit_rdpcm_enabled_flag = get_bits1(gb);
+
+            extended_precision_processing_flag = get_bits1(gb);
+            if (extended_precision_processing_flag)
+                av_log(avctx, AV_LOG_WARNING,
+                   "extended_precision_processing_flag not yet implemented\n");
+
+            sps->intra_smoothing_disabled_flag       = get_bits1(gb);
+            high_precision_offsets_enabled_flag  = get_bits1(gb);
+            if (high_precision_offsets_enabled_flag)
+                av_log(avctx, AV_LOG_WARNING,
+                   "high_precision_offsets_enabled_flag not yet implemented\n");
+
+            sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
+
+            cabac_bypass_alignment_enabled_flag  = get_bits1(gb);
+            if (cabac_bypass_alignment_enabled_flag)
+                av_log(avctx, AV_LOG_WARNING,
+                   "cabac_bypass_alignment_enabled_flag not yet implemented\n");
+        }
+    }
     if (apply_defdispwin) {
         sps->output_window.left_offset   += sps->vui.def_disp_win.left_offset;
         sps->output_window.right_offset  += sps->vui.def_disp_win.right_offset;
@@ -908,19 +1068,17 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
                          (sps->output_window.left_offset + sps->output_window.right_offset);
     sps->output_height = sps->height -
                          (sps->output_window.top_offset + sps->output_window.bottom_offset);
-    if (sps->output_width <= 0 || sps->output_height <= 0) {
+    if (sps->width  <= sps->output_window.left_offset + (int64_t)sps->output_window.right_offset  ||
+        sps->height <= sps->output_window.top_offset  + (int64_t)sps->output_window.bottom_offset) {
         av_log(avctx, AV_LOG_WARNING, "Invalid visible frame dimensions: %dx%d.\n",
                sps->output_width, sps->output_height);
         if (avctx->err_recognition & AV_EF_EXPLODE) {
-            ret = AVERROR_INVALIDDATA;
-            goto err;
+            return AVERROR_INVALIDDATA;
         }
         av_log(avctx, AV_LOG_WARNING,
                "Displaying the whole video surface.\n");
-        sps->output_window.left_offset   =
-        sps->output_window.right_offset  =
-        sps->output_window.top_offset    =
-        sps->output_window.bottom_offset = 0;
+        memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win));
+        memset(&sps->output_window, 0, sizeof(sps->output_window));
         sps->output_width               = sps->width;
         sps->output_height              = sps->height;
     }
@@ -930,6 +1088,19 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
                          sps->log2_diff_max_min_coding_block_size;
     sps->log2_min_pu_size = sps->log2_min_cb_size - 1;
 
+    if (sps->log2_ctb_size > MAX_LOG2_CTB_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "CTB size out of range: 2^%d\n", sps->log2_ctb_size);
+        return AVERROR_INVALIDDATA;
+    }
+    if (sps->log2_ctb_size < 4) {
+        av_log(avctx,
+               AV_LOG_ERROR,
+               "log2_ctb_size %d differs from the bounds of any known profile\n",
+               sps->log2_ctb_size);
+        avpriv_request_sample(avctx, "log2_ctb_size %d", sps->log2_ctb_size);
+        return AVERROR_INVALIDDATA;
+    }
+
     sps->ctb_width  = (sps->width  + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
     sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
     sps->ctb_size   = sps->ctb_width * sps->ctb_height;
@@ -940,40 +1111,40 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     sps->min_tb_height = sps->height >> sps->log2_min_tb_size;
     sps->min_pu_width  = sps->width  >> sps->log2_min_pu_size;
     sps->min_pu_height = sps->height >> sps->log2_min_pu_size;
+    sps->tb_mask       = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1;
 
     sps->qp_bd_offset = 6 * (sps->bit_depth - 8);
 
-    if (sps->width  & ((1 << sps->log2_min_cb_size) - 1) ||
-        sps->height & ((1 << sps->log2_min_cb_size) - 1)) {
+    if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) ||
+        av_mod_uintp2(sps->height, sps->log2_min_cb_size)) {
         av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n");
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
-    if (sps->log2_ctb_size > MAX_LOG2_CTB_SIZE) {
-        av_log(avctx, AV_LOG_ERROR, "CTB size out of range: 2^%d\n", sps->log2_ctb_size);
-        goto err;
-    }
     if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) {
         av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n",
                sps->max_transform_hierarchy_depth_inter);
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
     if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) {
         av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n",
                sps->max_transform_hierarchy_depth_intra);
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
     if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) {
         av_log(avctx, AV_LOG_ERROR,
                "max transform block size out of range: %d\n",
                sps->log2_max_trafo_size);
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
-    return 0;
+    if (get_bits_left(gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Overread SPS by %d bits\n", -get_bits_left(gb));
+        return AVERROR_INVALIDDATA;
+    }
 
-err:
-    return ret < 0 ? ret : AVERROR_INVALIDDATA;
+    return 0;
 }
 
 int ff_hevc_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
@@ -1034,16 +1205,52 @@ static void hevc_pps_free(void *opaque, uint8_t *data)
     av_freep(&pps->ctb_addr_ts_to_rs);
     av_freep(&pps->tile_pos_rs);
     av_freep(&pps->tile_id);
-    av_freep(&pps->min_tb_addr_zs);
+    av_freep(&pps->min_tb_addr_zs_tab);
 
     av_freep(&pps);
 }
 
+static int pps_range_extensions(GetBitContext *gb, AVCodecContext *avctx,
+                                HEVCPPS *pps, HEVCSPS *sps) {
+    int i;
+
+    if (pps->transform_skip_enabled_flag) {
+        pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2;
+    }
+    pps->cross_component_prediction_enabled_flag = get_bits1(gb);
+    pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb);
+    if (pps->chroma_qp_offset_list_enabled_flag) {
+        pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb);
+        pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb);
+        if (pps->chroma_qp_offset_list_len_minus1 && pps->chroma_qp_offset_list_len_minus1 >= 5) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n");
+            return AVERROR_INVALIDDATA;
+        }
+        for (i = 0; i <= pps->chroma_qp_offset_list_len_minus1; i++) {
+            pps->cb_qp_offset_list[i] = get_se_golomb_long(gb);
+            if (pps->cb_qp_offset_list[i]) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "cb_qp_offset_list not tested yet.\n");
+            }
+            pps->cr_qp_offset_list[i] = get_se_golomb_long(gb);
+            if (pps->cr_qp_offset_list[i]) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "cb_qp_offset_list not tested yet.\n");
+            }
+        }
+    }
+    pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb);
+    pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb);
+
+    return(0);
+}
+
 static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb,
                             HEVCPPS *pps, HEVCSPS *sps)
 {
     int log2_diff;
-    int pic_area_in_ctbs, pic_area_in_min_tbs;
+    int pic_area_in_ctbs;
     int i, j, x, y, ctb_addr_rs, tile_id;
 
     // Inferred parameters
@@ -1090,14 +1297,13 @@ static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb,
      * 6.5
      */
     pic_area_in_ctbs     = sps->ctb_width    * sps->ctb_height;
-    pic_area_in_min_tbs  = sps->min_tb_width * sps->min_tb_height;
 
     pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_rs_to_ts));
     pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_ts_to_rs));
     pps->tile_id           = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->tile_id));
-    pps->min_tb_addr_zs    = av_malloc_array(pic_area_in_min_tbs, sizeof(*pps->min_tb_addr_zs));
+    pps->min_tb_addr_zs_tab = av_malloc_array((sps->tb_mask+2) * (sps->tb_mask+2), sizeof(*pps->min_tb_addr_zs_tab));
     if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs ||
-        !pps->tile_id || !pps->min_tb_addr_zs) {
+        !pps->tile_id || !pps->min_tb_addr_zs_tab) {
         return AVERROR(ENOMEM);
     }
 
@@ -1150,8 +1356,13 @@ static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb,
                 pps->row_bd[j] * sps->ctb_width + pps->col_bd[i];
 
     log2_diff = sps->log2_ctb_size - sps->log2_min_tb_size;
-    for (y = 0; y < sps->min_tb_height; y++) {
-        for (x = 0; x < sps->min_tb_width; x++) {
+    pps->min_tb_addr_zs = &pps->min_tb_addr_zs_tab[1*(sps->tb_mask+2)+1];
+    for (y = 0; y < sps->tb_mask+2; y++) {
+        pps->min_tb_addr_zs_tab[y*(sps->tb_mask+2)] = -1;
+        pps->min_tb_addr_zs_tab[y]    = -1;
+    }
+    for (y = 0; y < sps->tb_mask+1; y++) {
+        for (x = 0; x < sps->tb_mask+1; x++) {
             int tb_x = x >> log2_diff;
             int tb_y = y >> log2_diff;
             int rs   = sps->ctb_width * tb_y + tb_x;
@@ -1160,7 +1371,7 @@ static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb,
                 int m = 1 << i;
                 val += (m & x ? m * m : 0) + (m & y ? 2 * m * m : 0);
             }
-            pps->min_tb_addr_zs[y * sps->min_tb_width + x] = val;
+            pps->min_tb_addr_zs[y * (sps->tb_mask+2) + x] = val;
         }
     }
 
@@ -1197,6 +1408,7 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     pps->disable_dbf                           = 0;
     pps->beta_offset                           = 0;
     pps->tc_offset                             = 0;
+    pps->log2_max_transform_skip_block_size    = 2;
 
     // Coded parameters
     pps_id = get_ue_golomb_long(gb);
@@ -1239,6 +1451,14 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     if (pps->cu_qp_delta_enabled_flag)
         pps->diff_cu_qp_delta_depth = get_ue_golomb_long(gb);
 
+    if (pps->diff_cu_qp_delta_depth < 0 ||
+        pps->diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) {
+        av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %d is invalid\n",
+               pps->diff_cu_qp_delta_depth);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
     pps->cb_qp_offset = get_se_golomb(gb);
     if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) {
         av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n",
@@ -1265,14 +1485,14 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     if (pps->tiles_enabled_flag) {
         pps->num_tile_columns = get_ue_golomb_long(gb) + 1;
         pps->num_tile_rows    = get_ue_golomb_long(gb) + 1;
-        if (pps->num_tile_columns == 0 ||
+        if (pps->num_tile_columns <= 0 ||
             pps->num_tile_columns >= sps->width) {
             av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n",
                    pps->num_tile_columns - 1);
             ret = AVERROR_INVALIDDATA;
             goto err;
         }
-        if (pps->num_tile_rows == 0 ||
+        if (pps->num_tile_rows <= 0 ||
             pps->num_tile_rows >= sps->height) {
             av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n",
                    pps->num_tile_rows - 1);
@@ -1343,7 +1563,7 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     pps->scaling_list_data_present_flag = get_bits1(gb);
     if (pps->scaling_list_data_present_flag) {
         set_default_scaling_list_data(&pps->scaling_list);
-        ret = scaling_list_data(gb, avctx, &pps->scaling_list);
+        ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps);
         if (ret < 0)
             goto err;
     }
@@ -1357,12 +1577,26 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     }
 
     pps->slice_header_extension_present_flag = get_bits1(gb);
-    skip_bits1(gb);     // pps_extension_flag
+
+    if (get_bits1(gb)) { // pps_extension_present_flag
+        int pps_range_extensions_flag = get_bits1(gb);
+        /* int pps_extension_7bits = */ get_bits(gb, 7);
+        if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) {
+            if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0)
+                goto err;
+        }
+    }
 
     ret = setup_pps(avctx, gb, pps, sps);
     if (ret < 0)
         goto err;
 
+    if (get_bits_left(gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Overread PPS by %d bits\n", -get_bits_left(gb));
+        goto err;
+    }
+
     remove_pps(ps, pps_id);
     ps->pps_list[pps_id] = pps_buf;
 
diff --git a/libavcodec/hevc_ps_enc.c b/libavcodec/hevc_ps_enc.c
index 007a1321ed..c05bf63d3b 100644
--- a/libavcodec/hevc_ps_enc.c
+++ b/libavcodec/hevc_ps_enc.c
@@ -1,20 +1,20 @@
 /*
  * HEVC Parameter Set encoding
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
index b51f2594a1..611ad458de 100644
--- a/libavcodec/hevc_refs.c
+++ b/libavcodec/hevc_refs.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2012 - 2013 Guillaume Martres
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -57,8 +57,7 @@ RefPicList *ff_hevc_get_ref_list(HEVCContext *s, HEVCFrame *ref, int x0, int y0)
 {
     int x_cb         = x0 >> s->ps.sps->log2_ctb_size;
     int y_cb         = y0 >> s->ps.sps->log2_ctb_size;
-    int pic_width_cb = (s->ps.sps->width + (1 << s->ps.sps->log2_ctb_size) - 1) >>
-        s->ps.sps->log2_ctb_size;
+    int pic_width_cb = s->ps.sps->ctb_width;
     int ctb_addr_ts  = s->ps.pps->ctb_addr_rs_to_ts[y_cb * pic_width_cb + x_cb];
     return (RefPicList *)ref->rpl_tab[ctb_addr_ts];
 }
@@ -109,6 +108,9 @@ static HEVCFrame *alloc_frame(HEVCContext *s)
         for (j = 0; j < frame->ctb_count; j++)
             frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
 
+        frame->frame->top_field_first  = s->picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
+        frame->frame->interlaced_frame = (s->picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
+
         if (s->avctx->hwaccel) {
             const AVHWAccel *hwaccel = s->avctx->hwaccel;
             av_assert0(!frame->hwaccel_picture_private);
@@ -121,7 +123,6 @@ static HEVCFrame *alloc_frame(HEVCContext *s)
         }
 
         return frame;
-
 fail:
         ff_hevc_unref_frame(s, frame, ~0);
         return NULL;
@@ -173,12 +174,22 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
         int min_poc   = INT_MAX;
         int i, min_idx, ret;
 
+        if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) {
+            for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+                HEVCFrame *frame = &s->DPB[i];
+                if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc &&
+                        frame->sequence == s->seq_output) {
+                    ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
+                }
+            }
+        }
+
         for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
             HEVCFrame *frame = &s->DPB[i];
             if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) &&
                 frame->sequence == s->seq_output) {
                 nb_output++;
-                if (frame->poc < min_poc) {
+                if (frame->poc < min_poc || nb_output == 1) {
                     min_poc = frame->poc;
                     min_idx = i;
                 }
@@ -192,16 +203,16 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
 
         if (nb_output) {
             HEVCFrame *frame = &s->DPB[min_idx];
-            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->frame->format);
-            int pixel_shift;
-
-            if (!desc)
-                return AVERROR_BUG;
-
-            pixel_shift = desc->comp[0].depth > 8;
-
-            ret = av_frame_ref(out, frame->frame);
-            ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
+            AVFrame *dst = out;
+            AVFrame *src = frame->frame;
+            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format);
+            int pixel_shift = !!(desc->comp[0].depth > 8);
+
+            ret = av_frame_ref(out, src);
+            if (frame->flags & HEVC_FRAME_FLAG_BUMPING)
+                ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING);
+            else
+                ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
             if (ret < 0)
                 return ret;
 
@@ -209,8 +220,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
                 int hshift = (i > 0) ? desc->log2_chroma_w : 0;
                 int vshift = (i > 0) ? desc->log2_chroma_h : 0;
                 int off = ((frame->window.left_offset >> hshift) << pixel_shift) +
-                          (frame->window.top_offset   >> vshift) * out->linesize[i];
-                out->data[i] += off;
+                          (frame->window.top_offset   >> vshift) * dst->linesize[i];
+                dst->data[i] += off;
             }
             av_log(s->avctx, AV_LOG_DEBUG,
                    "Output frame with POC %d.\n", frame->poc);
@@ -226,6 +237,46 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
     return 0;
 }
 
+void ff_hevc_bump_frame(HEVCContext *s)
+{
+    int dpb = 0;
+    int min_poc = INT_MAX;
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+        HEVCFrame *frame = &s->DPB[i];
+        if ((frame->flags) &&
+            frame->sequence == s->seq_output &&
+            frame->poc != s->poc) {
+            dpb++;
+        }
+    }
+
+    if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) {
+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+            HEVCFrame *frame = &s->DPB[i];
+            if ((frame->flags) &&
+                frame->sequence == s->seq_output &&
+                frame->poc != s->poc) {
+                if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) {
+                    min_poc = frame->poc;
+                }
+            }
+        }
+
+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+            HEVCFrame *frame = &s->DPB[i];
+            if (frame->flags & HEVC_FRAME_FLAG_OUTPUT &&
+                frame->sequence == s->seq_output &&
+                frame->poc <= min_poc) {
+                frame->flags |= HEVC_FRAME_FLAG_BUMPING;
+            }
+        }
+
+        dpb--;
+    }
+}
+
 static int init_slice_rpl(HEVCContext *s)
 {
     HEVCFrame *frame = s->ref;
@@ -335,8 +386,9 @@ static HEVCFrame *find_ref_idx(HEVCContext *s, int poc)
         }
     }
 
-    av_log(s->avctx, AV_LOG_ERROR,
-           "Could not find ref with POC %d\n", poc);
+    if (s->nal_unit_type != NAL_CRA_NUT && !IS_BLA(s))
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Could not find ref with POC %d\n", poc);
     return NULL;
 }
 
@@ -374,7 +426,8 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc)
     frame->sequence = s->seq_decode;
     frame->flags    = 0;
 
-    ff_thread_report_progress(&frame->tf, INT_MAX, 0);
+    if (s->threads_type == FF_THREAD_FRAME)
+        ff_thread_report_progress(&frame->tf, INT_MAX, 0);
 
     return frame;
 }
diff --git a/libavcodec/hevc_sei.c b/libavcodec/hevc_sei.c
index 590d9fd53a..179b045864 100644
--- a/libavcodec/hevc_sei.c
+++ b/libavcodec/hevc_sei.c
@@ -5,20 +5,20 @@
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  * Copyright (C) 2013 Vittorio Giovara
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,10 +56,13 @@ enum HEVC_SEI_TYPE {
 static int decode_nal_sei_decoded_picture_hash(HEVCContext *s)
 {
     int cIdx, i;
-    GetBitContext *gb = &s->HEVClc.gb;
-    uint8_t hash_type = get_bits(gb, 8);
+    uint8_t hash_type;
+    //uint16_t picture_crc;
+    //uint32_t picture_checksum;
+    GetBitContext *gb = &s->HEVClc->gb;
+    hash_type = get_bits(gb, 8);
 
-    for (cIdx = 0; cIdx < 3; cIdx++) {
+    for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 1 : 3)*/; cIdx++) {
         if (hash_type == 0) {
             s->is_md5 = 1;
             for (i = 0; i < 16; i++)
@@ -68,7 +71,7 @@ static int decode_nal_sei_decoded_picture_hash(HEVCContext *s)
             // picture_crc = get_bits(gb, 16);
             skip_bits(gb, 16);
         } else if (hash_type == 2) {
-            // picture_checksum = get_bits(gb, 32);
+            // picture_checksum = get_bits_long(gb, 32);
             skip_bits(gb, 32);
         }
     }
@@ -77,7 +80,7 @@ static int decode_nal_sei_decoded_picture_hash(HEVCContext *s)
 
 static int decode_nal_sei_frame_packing_arrangement(HEVCContext *s)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
 
     get_ue_golomb(gb);                  // frame_packing_arrangement_id
     s->sei_frame_packing_present = !get_bits1(gb);
@@ -103,7 +106,7 @@ static int decode_nal_sei_frame_packing_arrangement(HEVCContext *s)
 
 static int decode_nal_sei_display_orientation(HEVCContext *s)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
 
     s->sei_display_orientation_present = !get_bits1(gb);
 
@@ -118,9 +121,64 @@ static int decode_nal_sei_display_orientation(HEVCContext *s)
     return 0;
 }
 
+static int decode_pic_timing(HEVCContext *s)
+{
+    GetBitContext *gb = &s->HEVClc->gb;
+    HEVCSPS *sps;
+
+    if (!s->ps.sps_list[s->active_seq_parameter_set_id])
+        return(AVERROR(ENOMEM));
+    sps = (HEVCSPS*)s->ps.sps_list[s->active_seq_parameter_set_id]->data;
+
+    if (sps->vui.frame_field_info_present_flag) {
+        int pic_struct = get_bits(gb, 4);
+        s->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN;
+        if (pic_struct == 2) {
+            av_log(s->avctx, AV_LOG_DEBUG, "BOTTOM Field\n");
+            s->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
+        } else if (pic_struct == 1) {
+            av_log(s->avctx, AV_LOG_DEBUG, "TOP Field\n");
+            s->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD;
+        }
+        get_bits(gb, 2);                   // source_scan_type
+        get_bits(gb, 1);                   // duplicate_flag
+    }
+    return 1;
+}
+
+static int active_parameter_sets(HEVCContext *s)
+{
+    GetBitContext *gb = &s->HEVClc->gb;
+    int num_sps_ids_minus1;
+    int i;
+    unsigned active_seq_parameter_set_id;
+
+    get_bits(gb, 4); // active_video_parameter_set_id
+    get_bits(gb, 1); // self_contained_cvs_flag
+    get_bits(gb, 1); // num_sps_ids_minus1
+    num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1
+
+    if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) {
+        av_log(s->avctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1);
+        return AVERROR_INVALIDDATA;
+    }
+
+    active_seq_parameter_set_id = get_ue_golomb_long(gb);
+    if (active_seq_parameter_set_id >= MAX_SPS_COUNT) {
+        av_log(s->avctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id);
+        return AVERROR_INVALIDDATA;
+    }
+    s->active_seq_parameter_set_id = active_seq_parameter_set_id;
+
+    for (i = 1; i <= num_sps_ids_minus1; i++)
+        get_ue_golomb_long(gb); // active_seq_parameter_set_id[i]
+
+    return 0;
+}
+
 static int decode_nal_sei_prefix(HEVCContext *s, int type, int size)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
 
     switch (type) {
     case 256:  // Mismatched value from HM 8.1
@@ -129,6 +187,17 @@ static int decode_nal_sei_prefix(HEVCContext *s, int type, int size)
         return decode_nal_sei_frame_packing_arrangement(s);
     case SEI_TYPE_DISPLAY_ORIENTATION:
         return decode_nal_sei_display_orientation(s);
+    case SEI_TYPE_PICTURE_TIMING:
+        {
+            int ret = decode_pic_timing(s);
+            av_log(s->avctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
+            skip_bits(gb, 8 * size);
+            return ret;
+        }
+    case SEI_TYPE_ACTIVE_PARAMETER_SETS:
+        active_parameter_sets(s);
+        av_log(s->avctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
+        return 0;
     default:
         av_log(s->avctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
         skip_bits_long(gb, 8 * size);
@@ -138,7 +207,7 @@ static int decode_nal_sei_prefix(HEVCContext *s, int type, int size)
 
 static int decode_nal_sei_suffix(HEVCContext *s, int type, int size)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
 
     switch (type) {
     case SEI_TYPE_DECODED_PICTURE_HASH:
@@ -152,7 +221,7 @@ static int decode_nal_sei_suffix(HEVCContext *s, int type, int size)
 
 static int decode_nal_sei_message(HEVCContext *s)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
 
     int payload_type = 0;
     int payload_size = 0;
@@ -173,7 +242,7 @@ static int decode_nal_sei_message(HEVCContext *s)
     } else { /* nal_unit_type == NAL_SEI_SUFFIX */
         return decode_nal_sei_suffix(s, payload_type, payload_size);
     }
-    return 0;
+    return 1;
 }
 
 static int more_rbsp_data(GetBitContext *gb)
@@ -183,8 +252,12 @@ static int more_rbsp_data(GetBitContext *gb)
 
 int ff_hevc_decode_nal_sei(HEVCContext *s)
 {
+    int ret;
+
     do {
-        decode_nal_sei_message(s);
-    } while (more_rbsp_data(&s->HEVClc.gb));
-    return 0;
+        ret = decode_nal_sei_message(s);
+        if (ret < 0)
+            return(AVERROR(ENOMEM));
+    } while (more_rbsp_data(&s->HEVClc->gb));
+    return 1;
 }
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 216101a083..9d773d960e 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -2,21 +2,23 @@
  * HEVC video decoder
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
  *
- * This file is part of Libav.
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -89,14 +91,20 @@ static const int8_t transform[32][32] = {
       90, -90,  88, -85,  82, -78,  73, -67,  61, -54,  46, -38,  31, -22,  13,  -4 },
 };
 
-DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][16]) = {
-    { -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
-    { -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
-    { -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
-    { -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4 },
-    { -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6 },
-    { -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4 },
-    { -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][4]) = {
+    { -2, 58, 10, -2},
+    { -4, 54, 16, -2},
+    { -6, 46, 28, -4},
+    { -4, 36, 36, -4},
+    { -4, 28, 46, -6},
+    { -2, 16, 54, -4},
+    { -2, 10, 58, -2},
+};
+
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
+    { -1,  4,-10, 58, 17, -5,  1,  0, -1,  4,-10, 58, 17, -5,  1,  0},
+    { -1,  4,-11, 40, 40,-11,  4, -1, -1,  4,-11, 40, 40,-11,  4, -1},
+    {  0,  1, -5, 17, 58,-10,  4, -1,  0,  1, -5, 17, 58,-10,  4, -1}
 };
 
 #define BIT_DEPTH 8
@@ -111,62 +119,119 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][16]) = {
 #include "hevcdsp_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "hevcdsp_template.c"
+#undef BIT_DEPTH
+
 void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
 {
 #undef FUNC
 #define FUNC(a, depth) a ## _ ## depth
 
+#undef PEL_FUNC
+#define PEL_FUNC(dst1, idx1, idx2, a, depth)                                   \
+    for(i = 0 ; i < 10 ; i++)                                                  \
+{                                                                              \
+    hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth;                            \
+}
+
+#undef EPEL_FUNCS
+#define EPEL_FUNCS(depth)                                                     \
+    PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth);                \
+    PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth);                    \
+    PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth);                    \
+    PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth)
+
+#undef EPEL_UNI_FUNCS
+#define EPEL_UNI_FUNCS(depth)                                                 \
+    PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth);        \
+    PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth);            \
+    PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth);            \
+    PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth);           \
+    PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth);    \
+    PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth);        \
+    PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth);        \
+    PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth)
+
+#undef EPEL_BI_FUNCS
+#define EPEL_BI_FUNCS(depth)                                                \
+    PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth);        \
+    PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth);            \
+    PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth);            \
+    PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth);           \
+    PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth);    \
+    PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth);        \
+    PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth);        \
+    PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth)
+
+#undef QPEL_FUNCS
+#define QPEL_FUNCS(depth)                                                     \
+    PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth);                \
+    PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth);                    \
+    PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth);                    \
+    PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth)
+
+#undef QPEL_UNI_FUNCS
+#define QPEL_UNI_FUNCS(depth)                                                 \
+    PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth);        \
+    PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth);            \
+    PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth);            \
+    PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth);           \
+    PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth);    \
+    PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth);        \
+    PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth);        \
+    PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth)
+
+#undef QPEL_BI_FUNCS
+#define QPEL_BI_FUNCS(depth)                                                  \
+    PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth);          \
+    PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth);              \
+    PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth);              \
+    PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth);             \
+    PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth);      \
+    PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth);          \
+    PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth);          \
+    PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
+
 #define HEVC_DSP(depth)                                                     \
     hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
-    hevcdsp->transquant_bypass[0]   = FUNC(transquant_bypass4x4, depth);    \
-    hevcdsp->transquant_bypass[1]   = FUNC(transquant_bypass8x8, depth);    \
-    hevcdsp->transquant_bypass[2]   = FUNC(transquant_bypass16x16, depth);  \
-    hevcdsp->transquant_bypass[3]   = FUNC(transquant_bypass32x32, depth);  \
+    hevcdsp->transform_add[0]       = FUNC(transform_add4x4, depth);        \
+    hevcdsp->transform_add[1]       = FUNC(transform_add8x8, depth);        \
+    hevcdsp->transform_add[2]       = FUNC(transform_add16x16, depth);      \
+    hevcdsp->transform_add[3]       = FUNC(transform_add32x32, depth);      \
     hevcdsp->transform_skip         = FUNC(transform_skip, depth);          \
-    hevcdsp->transform_4x4_luma_add = FUNC(transform_4x4_luma_add, depth);  \
-    hevcdsp->transform_add[0]       = FUNC(transform_4x4_add, depth);       \
-    hevcdsp->transform_add[1]       = FUNC(transform_8x8_add, depth);       \
-    hevcdsp->transform_add[2]       = FUNC(transform_16x16_add, depth);     \
-    hevcdsp->transform_add[3]       = FUNC(transform_32x32_add, depth);     \
-                                                                            \
-    hevcdsp->sao_band_filter[0] = FUNC(sao_band_filter_0, depth);           \
-    hevcdsp->sao_band_filter[1] = FUNC(sao_band_filter_1, depth);           \
-    hevcdsp->sao_band_filter[2] = FUNC(sao_band_filter_2, depth);           \
-    hevcdsp->sao_band_filter[3] = FUNC(sao_band_filter_3, depth);           \
+    hevcdsp->transform_rdpcm        = FUNC(transform_rdpcm, depth);         \
+    hevcdsp->idct_4x4_luma          = FUNC(transform_4x4_luma, depth);      \
+    hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
+    hevcdsp->idct[1]                = FUNC(idct_8x8, depth);                \
+    hevcdsp->idct[2]                = FUNC(idct_16x16, depth);              \
+    hevcdsp->idct[3]                = FUNC(idct_32x32, depth);              \
                                                                             \
-    hevcdsp->sao_edge_filter[0] = FUNC(sao_edge_filter_0, depth);           \
-    hevcdsp->sao_edge_filter[1] = FUNC(sao_edge_filter_1, depth);           \
-    hevcdsp->sao_edge_filter[2] = FUNC(sao_edge_filter_2, depth);           \
-    hevcdsp->sao_edge_filter[3] = FUNC(sao_edge_filter_3, depth);           \
-                                                                            \
-    hevcdsp->put_hevc_qpel[0][0] = FUNC(put_hevc_qpel_pixels, depth);       \
-    hevcdsp->put_hevc_qpel[0][1] = FUNC(put_hevc_qpel_h1, depth);           \
-    hevcdsp->put_hevc_qpel[0][2] = FUNC(put_hevc_qpel_h2, depth);           \
-    hevcdsp->put_hevc_qpel[0][3] = FUNC(put_hevc_qpel_h3, depth);           \
-    hevcdsp->put_hevc_qpel[1][0] = FUNC(put_hevc_qpel_v1, depth);           \
-    hevcdsp->put_hevc_qpel[1][1] = FUNC(put_hevc_qpel_h1v1, depth);         \
-    hevcdsp->put_hevc_qpel[1][2] = FUNC(put_hevc_qpel_h2v1, depth);         \
-    hevcdsp->put_hevc_qpel[1][3] = FUNC(put_hevc_qpel_h3v1, depth);         \
-    hevcdsp->put_hevc_qpel[2][0] = FUNC(put_hevc_qpel_v2, depth);           \
-    hevcdsp->put_hevc_qpel[2][1] = FUNC(put_hevc_qpel_h1v2, depth);         \
-    hevcdsp->put_hevc_qpel[2][2] = FUNC(put_hevc_qpel_h2v2, depth);         \
-    hevcdsp->put_hevc_qpel[2][3] = FUNC(put_hevc_qpel_h3v2, depth);         \
-    hevcdsp->put_hevc_qpel[3][0] = FUNC(put_hevc_qpel_v3, depth);           \
-    hevcdsp->put_hevc_qpel[3][1] = FUNC(put_hevc_qpel_h1v3, depth);         \
-    hevcdsp->put_hevc_qpel[3][2] = FUNC(put_hevc_qpel_h2v3, depth);         \
-    hevcdsp->put_hevc_qpel[3][3] = FUNC(put_hevc_qpel_h3v3, depth);         \
-                                                                            \
-    hevcdsp->put_hevc_epel[0][0] = FUNC(put_hevc_epel_pixels, depth);       \
-    hevcdsp->put_hevc_epel[0][1] = FUNC(put_hevc_epel_h, depth);            \
-    hevcdsp->put_hevc_epel[1][0] = FUNC(put_hevc_epel_v, depth);            \
-    hevcdsp->put_hevc_epel[1][1] = FUNC(put_hevc_epel_hv, depth);           \
-                                                                            \
-    hevcdsp->put_unweighted_pred   = FUNC(put_unweighted_pred, depth);      \
-    hevcdsp->put_unweighted_pred_avg = FUNC(put_unweighted_pred_avg, depth);    \
-                                                                            \
-    hevcdsp->weighted_pred         = FUNC(weighted_pred, depth);            \
-    hevcdsp->weighted_pred_avg     = FUNC(weighted_pred_avg, depth);        \
+    hevcdsp->idct_dc[0]             = FUNC(idct_4x4_dc, depth);             \
+    hevcdsp->idct_dc[1]             = FUNC(idct_8x8_dc, depth);             \
+    hevcdsp->idct_dc[2]             = FUNC(idct_16x16_dc, depth);           \
+    hevcdsp->idct_dc[3]             = FUNC(idct_32x32_dc, depth);           \
                                                                             \
+    hevcdsp->sao_band_filter[0] =                                              \
+    hevcdsp->sao_band_filter[1] =                                              \
+    hevcdsp->sao_band_filter[2] =                                              \
+    hevcdsp->sao_band_filter[3] =                                              \
+    hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth);                \
+    hevcdsp->sao_edge_filter[0] =                                              \
+    hevcdsp->sao_edge_filter[1] =                                              \
+    hevcdsp->sao_edge_filter[2] =                                              \
+    hevcdsp->sao_edge_filter[3] =                                              \
+    hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth);                \
+    hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth);            \
+    hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth);            \
+                                                                               \
+    QPEL_FUNCS(depth);                                                         \
+    QPEL_UNI_FUNCS(depth);                                                     \
+    QPEL_BI_FUNCS(depth);                                                      \
+    EPEL_FUNCS(depth);                                                         \
+    EPEL_UNI_FUNCS(depth);                                                     \
+    EPEL_BI_FUNCS(depth);                                                      \
+                                                                               \
     hevcdsp->hevc_h_loop_filter_luma     = FUNC(hevc_h_loop_filter_luma, depth);   \
     hevcdsp->hevc_v_loop_filter_luma     = FUNC(hevc_v_loop_filter_luma, depth);   \
     hevcdsp->hevc_h_loop_filter_chroma   = FUNC(hevc_h_loop_filter_chroma, depth); \
@@ -174,7 +239,8 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
     hevcdsp->hevc_h_loop_filter_luma_c   = FUNC(hevc_h_loop_filter_luma, depth);   \
     hevcdsp->hevc_v_loop_filter_luma_c   = FUNC(hevc_v_loop_filter_luma, depth);   \
     hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
-    hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth);
+    hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
+int i = 0;
 
     switch (bit_depth) {
     case 9:
@@ -183,6 +249,9 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
     case 10:
         HEVC_DSP(10);
         break;
+    case 12:
+        HEVC_DSP(12);
+        break;
     default:
         HEVC_DSP(8);
         break;
@@ -190,4 +259,8 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
 
     if (ARCH_X86)
         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
+    if (ARCH_ARM)
+        ff_hevcdsp_init_arm(hevcdsp, bit_depth);
+    if (ARCH_MIPS)
+        ff_hevc_dsp_init_mips(hevcdsp, bit_depth);
 }
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 72784645e9..9f1f6dd59f 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -2,21 +2,23 @@
  * HEVC video decoder
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
  *
- * This file is part of Libav.
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,87 +27,107 @@
 
 #include "get_bits.h"
 
+#define MAX_PB_SIZE 64
+
 typedef struct SAOParams {
     int offset_abs[3][4];   ///< sao_offset_abs
     int offset_sign[3][4];  ///< sao_offset_sign
 
-    int band_position[3];   ///< sao_band_position
+    uint8_t band_position[3];   ///< sao_band_position
 
     int eo_class[3];        ///< sao_eo_class
 
-    int offset_val[3][5];   ///<SaoOffsetVal
+    int16_t offset_val[3][5];   ///<SaoOffsetVal
 
     uint8_t type_idx[3];    ///< sao_type_idx
 } SAOParams;
 
 typedef struct HEVCDSPContext {
-    void (*put_pcm)(uint8_t *dst, ptrdiff_t stride, int size,
-                    GetBitContext *gb, int pcm_bit_depth);
-
-    void (*transquant_bypass[4])(uint8_t *dst, int16_t *coeffs,
-                                 ptrdiff_t stride);
-
-    void (*transform_skip)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-    void (*transform_4x4_luma_add)(uint8_t *dst, int16_t *coeffs,
-                                   ptrdiff_t stride);
-    void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-
-    void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-                               struct SAOParams *sao, int *borders,
-                               int width, int height, int c_idx);
-    void (*sao_edge_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-                               struct SAOParams *sao, int *borders, int width,
-                               int height, int c_idx, uint8_t vert_edge,
-                               uint8_t horiz_edge, uint8_t diag_edge);
-
-    void (*put_hevc_qpel[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                ptrdiff_t srcstride, int width, int height,
-                                int16_t *mcbuffer);
-    void (*put_hevc_epel[2][2])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                ptrdiff_t srcstride, int width, int height,
-                                int mx, int my, int16_t *mcbuffer);
-
-    void (*put_unweighted_pred)(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
-                                ptrdiff_t srcstride, int width, int height);
-    void (*put_unweighted_pred_avg)(uint8_t *dst, ptrdiff_t dststride,
-                                    int16_t *src1, int16_t *src2,
-                                    ptrdiff_t srcstride, int width, int height);
-    void (*weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
-                          uint8_t *dst, ptrdiff_t dststride, int16_t *src,
-                          ptrdiff_t srcstride, int width, int height);
-    void (*weighted_pred_avg)(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
-                              int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
-                              ptrdiff_t dststride, int16_t *src1, int16_t *src2,
-                              ptrdiff_t srcstride, int width, int height);
+    void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+                    struct GetBitContext *gb, int pcm_bit_depth);
+
+    void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+
+    void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
+
+    void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
+
+    void (*idct_4x4_luma)(int16_t *coeffs);
+
+    void (*idct[4])(int16_t *coeffs, int col_limit);
+
+    void (*idct_dc[4])(int16_t *coeffs);
+
+    void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+                               int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+    /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
+    void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+                               int16_t *sao_offset_val, int sao_eo_class, int width, int height);
+
+    void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+                                struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
+                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+
+    void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                    int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                        int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
+
+    void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int16_t *src2,
+                                         int height, int denom, int wx0, int wx1,
+                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                    int height, intptr_t mx, intptr_t my, int width);
+
+    void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int16_t *src2,
+                                         int height, int denom, int wx0, int ox0, int wx1,
+                                         int ox1, intptr_t mx, intptr_t my, int width);
 
     void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                    int beta, int *tc,
+                                    int beta, int32_t *tc,
                                     uint8_t *no_p, uint8_t *no_q);
     void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                    int beta, int *tc,
+                                    int beta, int32_t *tc,
                                     uint8_t *no_p, uint8_t *no_q);
     void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-                                      int *tc, uint8_t *no_p, uint8_t *no_q);
+                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
     void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-                                      int *tc, uint8_t *no_p, uint8_t *no_q);
+                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
     void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
-                                      int beta, int *tc,
+                                      int beta, int32_t *tc,
                                       uint8_t *no_p, uint8_t *no_q);
     void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
-                                      int beta, int *tc,
+                                      int beta, int32_t *tc,
                                       uint8_t *no_p, uint8_t *no_q);
     void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-                                        int *tc, uint8_t *no_p,
+                                        int32_t *tc, uint8_t *no_p,
                                         uint8_t *no_q);
     void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-                                        int *tc, uint8_t *no_p,
+                                        int32_t *tc, uint8_t *no_p,
                                         uint8_t *no_q);
 } HEVCDSPContext;
 
 void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
 
-void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
-
-extern const int8_t ff_hevc_epel_filters[7][16];
+extern const int8_t ff_hevc_epel_filters[7][4];
+extern const int8_t ff_hevc_qpel_filters[3][16];
 
+void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
+void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth);
+void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
 #endif /* AVCODEC_HEVCDSP_H */
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 390f683295..b840d179c3 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,8 +24,10 @@
 #include "hevc.h"
 
 #include "bit_depth_template.c"
+#include "hevcdsp.h"
 
-static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int size,
+
+static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
                           GetBitContext *gb, int pcm_bit_depth)
 {
     int x, y;
@@ -33,8 +35,8 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int size,
 
     stride /= sizeof(pixel);
 
-    for (y = 0; y < size; y++) {
-        for (x = 0; x < size; x++)
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
             dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
         dst += stride;
     }
@@ -57,48 +59,76 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe
     }
 }
 
-static void FUNC(transquant_bypass4x4)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
                                        ptrdiff_t stride)
 {
     FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
 }
 
-static void FUNC(transquant_bypass8x8)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
                                        ptrdiff_t stride)
 {
     FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
 }
 
-static void FUNC(transquant_bypass16x16)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
                                          ptrdiff_t stride)
 {
     FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
 }
 
-static void FUNC(transquant_bypass32x32)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
                                          ptrdiff_t stride)
 {
     FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
 }
 
-static void FUNC(transform_skip)(uint8_t *_dst, int16_t *coeffs,
-                                 ptrdiff_t stride)
+
+static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
 {
-    pixel *dst = (pixel *)_dst;
-    int shift  = 13 - BIT_DEPTH;
-#if BIT_DEPTH <= 13
-    int offset = 1 << (shift - 1);
-#else
-    int offset = 0;
-#endif
+    int16_t *coeffs = (int16_t *) _coeffs;
+    int x, y;
+    int size = 1 << log2_size;
+
+    if (mode) {
+        coeffs += size;
+        for (y = 0; y < size - 1; y++) {
+            for (x = 0; x < size; x++)
+                coeffs[x] += coeffs[x - size];
+            coeffs += size;
+        }
+    } else {
+        for (y = 0; y < size; y++) {
+            for (x = 1; x < size; x++)
+                coeffs[x] += coeffs[x - 1];
+            coeffs += size;
+        }
+    }
+}
+
+static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
+{
+    int shift  = 15 - BIT_DEPTH - log2_size;
     int x, y;
+    int size = 1 << log2_size;
+    int16_t *coeffs = _coeffs;
 
-    stride /= sizeof(pixel);
 
-    for (y = 0; y < 4 * 4; y += 4) {
-        for (x = 0; x < 4; x++)
-            dst[x] = av_clip_pixel(dst[x] + ((coeffs[y + x] + offset) >> shift));
-        dst += stride;
+    if (shift > 0) {
+        int offset = 1 << (shift - 1);
+        for (y = 0; y < size; y++) {
+            for (x = 0; x < size; x++) {
+                *coeffs = (*coeffs + offset) >> shift;
+                coeffs++;
+            }
+        }
+    } else {
+        for (y = 0; y < size; y++) {
+            for (x = 0; x < size; x++) {
+                *coeffs = *coeffs << -shift;
+                coeffs++;
+            }
+        }
     }
 }
 
@@ -122,17 +152,13 @@ static void FUNC(transform_skip)(uint8_t *_dst, int16_t *coeffs,
         assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
     } while (0)
 
-static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs,
-                                         ptrdiff_t stride)
+static void FUNC(transform_4x4_luma)(int16_t *coeffs)
 {
     int i;
-    pixel *dst   = (pixel *)_dst;
     int shift    = 7;
     int add      = 1 << (shift - 1);
     int16_t *src = coeffs;
 
-    stride /= sizeof(pixel);
-
     for (i = 0; i < 4; i++) {
         TR_4x4_LUMA(src, src, 4, SCALE);
         src++;
@@ -141,323 +167,226 @@ static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs,
     shift = 20 - BIT_DEPTH;
     add   = 1 << (shift - 1);
     for (i = 0; i < 4; i++) {
-        TR_4x4_LUMA(dst, coeffs, 1, ADD_AND_SCALE);
+        TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
         coeffs += 4;
-        dst    += stride;
     }
 }
 
 #undef TR_4x4_LUMA
 
-#define TR_4(dst, src, dstep, sstep, assign)                            \
-    do {                                                                \
-        const int e0 = transform[8 * 0][0] * src[0 * sstep] +           \
-                       transform[8 * 2][0] * src[2 * sstep];            \
-        const int e1 = transform[8 * 0][1] * src[0 * sstep] +           \
-                       transform[8 * 2][1] * src[2 * sstep];            \
-        const int o0 = transform[8 * 1][0] * src[1 * sstep] +           \
-                       transform[8 * 3][0] * src[3 * sstep];            \
-        const int o1 = transform[8 * 1][1] * src[1 * sstep] +           \
-                       transform[8 * 3][1] * src[3 * sstep];            \
-                                                                        \
-        assign(dst[0 * dstep], e0 + o0);                                \
-        assign(dst[1 * dstep], e1 + o1);                                \
-        assign(dst[2 * dstep], e1 - o1);                                \
-        assign(dst[3 * dstep], e0 - o0);                                \
+#define TR_4(dst, src, dstep, sstep, assign, end)                              \
+    do {                                                                       \
+        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep];              \
+        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep];              \
+        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep];              \
+        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep];              \
+                                                                               \
+        assign(dst[0 * dstep], e0 + o0);                                       \
+        assign(dst[1 * dstep], e1 + o1);                                       \
+        assign(dst[2 * dstep], e1 - o1);                                       \
+        assign(dst[3 * dstep], e0 - o0);                                       \
     } while (0)
 
-static void FUNC(transform_4x4_add)(uint8_t *_dst, int16_t *coeffs,
-                                    ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
-
-    stride /= sizeof(pixel);
-
-    for (i = 0; i < 4; i++) {
-        TR_4(src, src, 4, 4, SCALE);
-        src++;
-    }
-
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 4; i++) {
-        TR_4(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 4;
-        dst    += stride;
-    }
-}
-
-#define TR_8(dst, src, dstep, sstep, assign)                      \
-    do {                                                          \
-        int i, j;                                                 \
-        int e_8[4];                                               \
-        int o_8[4] = { 0 };                                       \
-        for (i = 0; i < 4; i++)                                   \
-            for (j = 1; j < 8; j += 2)                            \
-                o_8[i] += transform[4 * j][i] * src[j * sstep];   \
-        TR_4(e_8, src, 1, 2 * sstep, SET);                        \
-                                                                  \
-        for (i = 0; i < 4; i++) {                                 \
-            assign(dst[i * dstep], e_8[i] + o_8[i]);              \
-            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);        \
-        }                                                         \
+#define TR_8(dst, src, dstep, sstep, assign, end)                              \
+    do {                                                                       \
+        int i, j;                                                              \
+        int e_8[4];                                                            \
+        int o_8[4] = { 0 };                                                    \
+        for (i = 0; i < 4; i++)                                                \
+            for (j = 1; j < end; j += 2)                                       \
+                o_8[i] += transform[4 * j][i] * src[j * sstep];                \
+        TR_4(e_8, src, 1, 2 * sstep, SET, 4);                                  \
+                                                                               \
+        for (i = 0; i < 4; i++) {                                              \
+            assign(dst[i * dstep], e_8[i] + o_8[i]);                           \
+            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);                     \
+        }                                                                      \
     } while (0)
 
-#define TR_16(dst, src, dstep, sstep, assign)                     \
-    do {                                                          \
-        int i, j;                                                 \
-        int e_16[8];                                              \
-        int o_16[8] = { 0 };                                      \
-        for (i = 0; i < 8; i++)                                   \
-            for (j = 1; j < 16; j += 2)                           \
-                o_16[i] += transform[2 * j][i] * src[j * sstep];  \
-        TR_8(e_16, src, 1, 2 * sstep, SET);                       \
-                                                                  \
-        for (i = 0; i < 8; i++) {                                 \
-            assign(dst[i * dstep], e_16[i] + o_16[i]);            \
-            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);     \
-        }                                                         \
+#define TR_16(dst, src, dstep, sstep, assign, end)                             \
+    do {                                                                       \
+        int i, j;                                                              \
+        int e_16[8];                                                           \
+        int o_16[8] = { 0 };                                                   \
+        for (i = 0; i < 8; i++)                                                \
+            for (j = 1; j < end; j += 2)                                       \
+                o_16[i] += transform[2 * j][i] * src[j * sstep];               \
+        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                                 \
+                                                                               \
+        for (i = 0; i < 8; i++) {                                              \
+            assign(dst[i * dstep], e_16[i] + o_16[i]);                         \
+            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);                  \
+        }                                                                      \
     } while (0)
 
-#define TR_32(dst, src, dstep, sstep, assign)                     \
-    do {                                                          \
-        int i, j;                                                 \
-        int e_32[16];                                             \
-        int o_32[16] = { 0 };                                     \
-        for (i = 0; i < 16; i++)                                  \
-            for (j = 1; j < 32; j += 2)                           \
-                o_32[i] += transform[j][i] * src[j * sstep];      \
-        TR_16(e_32, src, 1, 2 * sstep, SET);                      \
-                                                                  \
-        for (i = 0; i < 16; i++) {                                \
-            assign(dst[i * dstep], e_32[i] + o_32[i]);            \
-            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);     \
-        }                                                         \
+#define TR_32(dst, src, dstep, sstep, assign, end)                             \
+    do {                                                                       \
+        int i, j;                                                              \
+        int e_32[16];                                                          \
+        int o_32[16] = { 0 };                                                  \
+        for (i = 0; i < 16; i++)                                               \
+            for (j = 1; j < end; j += 2)                                       \
+                o_32[i] += transform[j][i] * src[j * sstep];                   \
+        TR_16(e_32, src, 1, 2 * sstep, SET, end/2);                            \
+                                                                               \
+        for (i = 0; i < 16; i++) {                                             \
+            assign(dst[i * dstep], e_32[i] + o_32[i]);                         \
+            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);                  \
+        }                                                                      \
     } while (0)
 
-
-
-static void FUNC(transform_8x8_add)(uint8_t *_dst, int16_t *coeffs,
-                                    ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
-
-    stride /= sizeof(pixel);
-
-    for (i = 0; i < 8; i++) {
-        TR_8(src, src, 8, 8, SCALE);
-        src++;
-    }
-
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 8; i++) {
-        TR_8(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 8;
-        dst    += stride;
-    }
+#define IDCT_VAR4(H)                                                          \
+    int      limit2   = FFMIN(col_limit + 4, H)
+#define IDCT_VAR8(H)                                                          \
+        int      limit   = FFMIN(col_limit, H);                               \
+        int      limit2   = FFMIN(col_limit + 4, H)
+#define IDCT_VAR16(H)   IDCT_VAR8(H)
+#define IDCT_VAR32(H)   IDCT_VAR8(H)
+
+#define IDCT(H)                                                              \
+static void FUNC(idct_##H ##x ##H )(                                         \
+                   int16_t *coeffs, int col_limit) {                         \
+    int i;                                                                   \
+    int      shift   = 7;                                                    \
+    int      add     = 1 << (shift - 1);                                     \
+    int16_t *src     = coeffs;                                               \
+    IDCT_VAR ##H(H);                                                         \
+                                                                             \
+    for (i = 0; i < H; i++) {                                                \
+        TR_ ## H(src, src, H, H, SCALE, limit2);                             \
+        if (limit2 < H && i%4 == 0 && !!i)                                   \
+            limit2 -= 4;                                                     \
+        src++;                                                               \
+    }                                                                        \
+                                                                             \
+    shift   = 20 - BIT_DEPTH;                                                \
+    add     = 1 << (shift - 1);                                              \
+    for (i = 0; i < H; i++) {                                                \
+        TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);                        \
+        coeffs += H;                                                         \
+    }                                                                        \
 }
 
-static void FUNC(transform_16x16_add)(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
-
-    stride /= sizeof(pixel);
-
-    for (i = 0; i < 16; i++) {
-        TR_16(src, src, 16, 16, SCALE);
-        src++;
-    }
-
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 16; i++) {
-        TR_16(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 16;
-        dst    += stride;
-    }
+#define IDCT_DC(H)                                                           \
+static void FUNC(idct_##H ##x ##H ##_dc)(                                    \
+                   int16_t *coeffs) {                                        \
+    int i, j;                                                                \
+    int      shift   = 14 - BIT_DEPTH;                                       \
+    int      add     = 1 << (shift - 1);                                     \
+    int      coeff   = (((coeffs[0] + 1) >> 1) + add) >> shift;              \
+                                                                             \
+    for (j = 0; j < H; j++) {                                                \
+        for (i = 0; i < H; i++) {                                            \
+            coeffs[i+j*H] = coeff;                                           \
+        }                                                                    \
+    }                                                                        \
 }
 
-static void FUNC(transform_32x32_add)(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
+IDCT( 4)
+IDCT( 8)
+IDCT(16)
+IDCT(32)
 
-    stride /= sizeof(pixel);
+IDCT_DC( 4)
+IDCT_DC( 8)
+IDCT_DC(16)
+IDCT_DC(32)
 
-    for (i = 0; i < 32; i++) {
-        TR_32(src, src, 32, 32, SCALE);
-        src++;
-    }
-    src   = coeffs;
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 32; i++) {
-        TR_32(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 32;
-        dst    += stride;
-    }
-}
+#undef TR_4
+#undef TR_8
+#undef TR_16
+#undef TR_32
+
+#undef SET
+#undef SCALE
+#undef ADD_AND_SCALE
 
 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
-                                  ptrdiff_t stride, SAOParams *sao,
-                                  int *borders, int width, int height,
-                                  int c_idx, int class)
+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                  int16_t *sao_offset_val, int sao_left_class,
+                                  int width, int height)
 {
     pixel *dst = (pixel *)_dst;
     pixel *src = (pixel *)_src;
     int offset_table[32] = { 0 };
     int k, y, x;
-    int chroma = !!c_idx;
     int shift  = BIT_DEPTH - 5;
-    int *sao_offset_val = sao->offset_val[c_idx];
-    int sao_left_class  = sao->band_position[c_idx];
-    int init_y = 0, init_x = 0;
-
-    stride /= sizeof(pixel);
 
-    switch (class) {
-    case 0:
-        if (!borders[2])
-            width -= (8 >> chroma) + 2;
-        if (!borders[3])
-            height -= (4 >> chroma) + 2;
-        break;
-    case 1:
-        init_y = -(4 >> chroma) - 2;
-        if (!borders[2])
-            width -= (8 >> chroma) + 2;
-        height = (4 >> chroma) + 2;
-        break;
-    case 2:
-        init_x = -(8 >> chroma) - 2;
-        width  =  (8 >> chroma) + 2;
-        if (!borders[3])
-            height -= (4 >> chroma) + 2;
-        break;
-    case 3:
-        init_y = -(4 >> chroma) - 2;
-        init_x = -(8 >> chroma) - 2;
-        width  =  (8 >> chroma) + 2;
-        height =  (4 >> chroma) + 2;
-        break;
-    }
+    stride_dst /= sizeof(pixel);
+    stride_src /= sizeof(pixel);
 
-    dst = dst + (init_y * stride + init_x);
-    src = src + (init_y * stride + init_x);
     for (k = 0; k < 4; k++)
         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-        dst += stride;
-        src += stride;
+        dst += stride_dst;
+        src += stride_src;
     }
 }
 
-static void FUNC(sao_band_filter_0)(uint8_t *dst, uint8_t *src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int width, int height,
-                                    int c_idx)
-{
-    FUNC(sao_band_filter)(dst, src, stride, sao, borders,
-                          width, height, c_idx, 0);
-}
+#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
 
-static void FUNC(sao_band_filter_1)(uint8_t *dst, uint8_t *src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int width, int height,
-                                    int c_idx)
-{
-    FUNC(sao_band_filter)(dst, src, stride, sao, borders,
-                          width, height, c_idx, 1);
-}
+static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+                                  int eo, int width, int height) {
 
-static void FUNC(sao_band_filter_2)(uint8_t *dst, uint8_t *src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int width, int height,
-                                    int c_idx)
-{
-    FUNC(sao_band_filter)(dst, src, stride, sao, borders,
-                          width, height, c_idx, 2);
-}
+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+    static const int8_t pos[4][2][2] = {
+        { { -1,  0 }, {  1, 0 } }, // horizontal
+        { {  0, -1 }, {  0, 1 } }, // vertical
+        { { -1, -1 }, {  1, 1 } }, // 45 degree
+        { {  1, -1 }, { -1, 1 } }, // 135 degree
+    };
+    pixel *dst = (pixel *)_dst;
+    pixel *src = (pixel *)_src;
+    int a_stride, b_stride;
+    int x, y;
+    ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
+    stride_dst /= sizeof(pixel);
 
-static void FUNC(sao_band_filter_3)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int width, int height,
-                                    int c_idx)
-{
-    FUNC(sao_band_filter)(_dst, _src, stride, sao, borders,
-                          width, height, c_idx, 3);
+    a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
+    b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            int diff0 = CMP(src[x], src[x + a_stride]);
+            int diff1 = CMP(src[x], src[x + b_stride]);
+            int offset_val        = edge_idx[2 + diff0 + diff1];
+            dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
+        }
+        src += stride_src;
+        dst += stride_dst;
+    }
 }
 
-static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
+static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
+                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
                                     int *borders, int _width, int _height,
-                                    int c_idx, uint8_t vert_edge,
-                                    uint8_t horiz_edge, uint8_t diag_edge)
+                                    int c_idx, uint8_t *vert_edge,
+                                    uint8_t *horiz_edge, uint8_t *diag_edge)
 {
     int x, y;
     pixel *dst = (pixel *)_dst;
     pixel *src = (pixel *)_src;
-    int chroma = !!c_idx;
-    int *sao_offset_val = sao->offset_val[c_idx];
+    int16_t *sao_offset_val = sao->offset_val[c_idx];
     int sao_eo_class    = sao->eo_class[c_idx];
-    int init_x = 0, init_y = 0, width = _width, height = _height;
-
-    static const int8_t pos[4][2][2] = {
-        { { -1,  0 }, {  1, 0 } }, // horizontal
-        { {  0, -1 }, {  0, 1 } }, // vertical
-        { { -1, -1 }, {  1, 1 } }, // 45 degree
-        { {  1, -1 }, { -1, 1 } }, // 135 degree
-    };
-    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-
-#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
-
-    stride /= sizeof(pixel);
+    int init_x = 0, width = _width, height = _height;
 
-    if (!borders[2])
-        width -= (8 >> chroma) + 2;
-    if (!borders[3])
-        height -= (4 >> chroma) + 2;
+    stride_dst /= sizeof(pixel);
+    stride_src /= sizeof(pixel);
 
-    dst = dst + (init_y * stride + init_x);
-    src = src + (init_y * stride + init_x);
-    init_y = init_x = 0;
     if (sao_eo_class != SAO_EO_VERT) {
         if (borders[0]) {
             int offset_val = sao_offset_val[0];
-            int y_stride   = 0;
             for (y = 0; y < height; y++) {
-                dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
-                y_stride     += stride;
+                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
             }
             init_x = 1;
         }
         if (borders[2]) {
             int offset_val = sao_offset_val[0];
-            int x_stride   = width - 1;
+            int offset     = width - 1;
             for (x = 0; x < height; x++) {
-                dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
-                x_stride     += stride;
+                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
             }
             width--;
         }
@@ -467,526 +396,725 @@ static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
             int offset_val = sao_offset_val[0];
             for (x = init_x; x < width; x++)
                 dst[x] = av_clip_pixel(src[x] + offset_val);
-            init_y = 1;
         }
         if (borders[3]) {
-            int offset_val = sao_offset_val[0];
-            int y_stride   = stride * (height - 1);
+            int offset_val   = sao_offset_val[0];
+            int y_stride_dst = stride_dst * (height - 1);
+            int y_stride_src = stride_src * (height - 1);
             for (x = init_x; x < width; x++)
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
+                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
             height--;
         }
     }
-    {
-        int y_stride = init_y * stride;
-        int pos_0_0  = pos[sao_eo_class][0][0];
-        int pos_0_1  = pos[sao_eo_class][0][1];
-        int pos_1_0  = pos[sao_eo_class][1][0];
-        int pos_1_1  = pos[sao_eo_class][1][1];
-
-        int y_stride_0_1 = (init_y + pos_0_1) * stride;
-        int y_stride_1_1 = (init_y + pos_1_1) * stride;
-        for (y = init_y; y < height; y++) {
-            for (x = init_x; x < width; x++) {
-                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
-                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
-                int offset_val    = edge_idx[2 + diff0 + diff1];
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
-            }
-            y_stride     += stride;
-            y_stride_0_1 += stride;
-            y_stride_1_1 += stride;
-        }
-    }
-
-    {
-        // Restore pixels that can't be modified
-        int save_upper_left = !diag_edge && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
-        if (vert_edge && sao_eo_class != SAO_EO_VERT)
-            for (y = init_y+save_upper_left; y< height; y++)
-                dst[y*stride] = src[y*stride];
-        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
-            for(x = init_x+save_upper_left; x<width; x++)
-                dst[x] = src[x];
-        if(diag_edge && sao_eo_class == SAO_EO_135D)
-            dst[0] = src[0];
-    }
-
-#undef CMP
 }
 
-static void FUNC(sao_edge_filter_1)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
+static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
+                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
                                     int *borders, int _width, int _height,
-                                    int c_idx, uint8_t vert_edge,
-                                    uint8_t horiz_edge, uint8_t diag_edge)
+                                    int c_idx, uint8_t *vert_edge,
+                                    uint8_t *horiz_edge, uint8_t *diag_edge)
 {
     int x, y;
     pixel *dst = (pixel *)_dst;
     pixel *src = (pixel *)_src;
-    int chroma = !!c_idx;
-    int *sao_offset_val = sao->offset_val[c_idx];
+    int16_t *sao_offset_val = sao->offset_val[c_idx];
     int sao_eo_class    = sao->eo_class[c_idx];
     int init_x = 0, init_y = 0, width = _width, height = _height;
 
-    static const int8_t pos[4][2][2] = {
-        { { -1, 0  }, { 1,  0 } }, // horizontal
-        { { 0,  -1 }, { 0,  1 } }, // vertical
-        { { -1, -1 }, { 1,  1 } }, // 45 degree
-        { { 1,  -1 }, { -1, 1 } }, // 135 degree
-    };
-    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-
-#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+    stride_dst /= sizeof(pixel);
+    stride_src /= sizeof(pixel);
 
-    stride /= sizeof(pixel);
-
-    init_y = -(4 >> chroma) - 2;
-    if (!borders[2])
-        width -= (8 >> chroma) + 2;
-    height = (4 >> chroma) + 2;
-
-    dst = dst + (init_y * stride + init_x);
-    src = src + (init_y * stride + init_x);
-    init_y = init_x = 0;
     if (sao_eo_class != SAO_EO_VERT) {
         if (borders[0]) {
             int offset_val = sao_offset_val[0];
-            int y_stride   = 0;
             for (y = 0; y < height; y++) {
-                dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
-                y_stride     += stride;
+                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
             }
             init_x = 1;
         }
         if (borders[2]) {
             int offset_val = sao_offset_val[0];
-            int x_stride   = width - 1;
+            int offset     = width - 1;
             for (x = 0; x < height; x++) {
-                dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
-                x_stride     += stride;
+                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
             }
             width--;
         }
     }
-    {
-        int y_stride = init_y * stride;
-        int pos_0_0  = pos[sao_eo_class][0][0];
-        int pos_0_1  = pos[sao_eo_class][0][1];
-        int pos_1_0  = pos[sao_eo_class][1][0];
-        int pos_1_1  = pos[sao_eo_class][1][1];
-
-        int y_stride_0_1 = (init_y + pos_0_1) * stride;
-        int y_stride_1_1 = (init_y + pos_1_1) * stride;
-        for (y = init_y; y < height; y++) {
-            for (x = init_x; x < width; x++) {
-                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
-                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
-                int offset_val    = edge_idx[2 + diff0 + diff1];
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
-            }
-            y_stride     += stride;
-            y_stride_0_1 += stride;
-            y_stride_1_1 += stride;
+    if (sao_eo_class != SAO_EO_HORIZ) {
+        if (borders[1]) {
+            int offset_val = sao_offset_val[0];
+            for (x = init_x; x < width; x++)
+                dst[x] = av_clip_pixel(src[x] + offset_val);
+            init_y = 1;
+        }
+        if (borders[3]) {
+            int offset_val   = sao_offset_val[0];
+            int y_stride_dst = stride_dst * (height - 1);
+            int y_stride_src = stride_src * (height - 1);
+            for (x = init_x; x < width; x++)
+                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
+            height--;
         }
     }
 
     {
+        int save_upper_left  = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
+        int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D  && !borders[1] && !borders[2];
+        int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
+        int save_lower_left  = !diag_edge[3] && sao_eo_class == SAO_EO_45D  && !borders[0] && !borders[3];
+
         // Restore pixels that can't be modified
-        int save_lower_left = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[0];
-        if(vert_edge && sao_eo_class != SAO_EO_VERT)
-            for(y = init_y; y< height-save_lower_left; y++)
-                dst[y*stride] = src[y*stride];
-        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
-            for(x = init_x+save_lower_left; x<width; x++)
-                dst[(height-1)*stride+x] = src[(height-1)*stride+x];
-        if(diag_edge && sao_eo_class == SAO_EO_45D)
-            dst[stride*(height-1)] = src[stride*(height-1)];
+        if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
+            for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
+                dst[y*stride_dst] = src[y*stride_src];
+        }
+        if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
+            for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
+                dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
+        }
+
+        if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
+            for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
+                dst[x] = src[x];
+        }
+        if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
+            for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
+                dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
+        }
+        if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
+            dst[0] = src[0];
+        if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
+            dst[width-1] = src[width-1];
+        if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
+            dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
+        if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
+            dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
+
     }
+}
 
 #undef CMP
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
+                                      uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = src[x] << (14 - BIT_DEPTH);
+        src += srcstride;
+        dst += MAX_PB_SIZE;
+    }
 }
 
-static void FUNC(sao_edge_filter_2)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int _width, int _height,
-                                    int c_idx, uint8_t vert_edge,
-                                    uint8_t horiz_edge, uint8_t diag_edge)
+static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                          int height, intptr_t mx, intptr_t my, int width)
+{
+    int y;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    for (y = 0; y < height; y++) {
+        memcpy(dst, src, width * sizeof(pixel));
+        src += srcstride;
+        dst += dststride;
+    }
+}
+
+static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int16_t *src2,
+                                         int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
-    pixel *dst = (pixel *)_dst;
-    pixel *src = (pixel *)_src;
-    int chroma = !!c_idx;
-    int *sao_offset_val = sao->offset_val[c_idx];
-    int sao_eo_class    = sao->eo_class[c_idx];
-    int init_x = 0, init_y = 0, width = _width, height = _height;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
 
-    static const int8_t pos[4][2][2] = {
-        { { -1,  0 }, {  1, 0 } }, // horizontal
-        { {  0, -1 }, {  0, 1 } }, // vertical
-        { { -1, -1 }, {  1, 1 } }, // 45 degree
-        { {  1, -1 }, { -1, 1 } }, // 135 degree
-    };
-    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+    int shift = 14  + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
 
-#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
 
-    stride /= sizeof(pixel);
+static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                            int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
 
-    init_x = -(8 >> chroma) - 2;
-    width  =  (8 >> chroma) + 2;
-    if (!borders[3])
-        height -= (4 >> chroma) + 2;
+    ox     = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
+        src += srcstride;
+        dst += dststride;
+    }
+}
 
-    dst = dst + (init_y * stride + init_x);
-    src = src + (init_y * stride + init_x);
-    init_y = init_x = 0;
-    if (sao_eo_class != SAO_EO_HORIZ) {
-        if (borders[1]) {
-            int offset_val = sao_offset_val[0];
-            for (x = init_x; x < width; x++)
-                dst[x] = av_clip_pixel(src[x] + offset_val);
-            init_y = 1;
-        }
-        if (borders[3]) {
-            int offset_val = sao_offset_val[0];
-            int y_stride   = stride * (height - 1);
-            for (x = init_x; x < width; x++)
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
-            height--;
+static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                           int16_t *src2,
+                                           int height, int denom, int wx0, int wx1,
+                                           int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    int shift = 14  + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
         }
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
     }
-    {
-        int y_stride = init_y * stride;
-        int pos_0_0  = pos[sao_eo_class][0][0];
-        int pos_0_1  = pos[sao_eo_class][0][1];
-        int pos_1_0  = pos[sao_eo_class][1][0];
-        int pos_1_1  = pos[sao_eo_class][1][1];
-
-        int y_stride_0_1 = (init_y + pos_0_1) * stride;
-        int y_stride_1_1 = (init_y + pos_1_1) * stride;
-        for (y = init_y; y < height; y++) {
-            for (x = init_x; x < width; x++) {
-                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
-                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
-                int offset_val    = edge_idx[2 + diff0 + diff1];
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
-            }
-            y_stride     += stride;
-            y_stride_0_1 += stride;
-            y_stride_1_1 += stride;
-        }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+#define QPEL_FILTER(src, stride)                                               \
+    (filter[0] * src[x - 3 * stride] +                                         \
+     filter[1] * src[x - 2 * stride] +                                         \
+     filter[2] * src[x -     stride] +                                         \
+     filter[3] * src[x             ] +                                         \
+     filter[4] * src[x +     stride] +                                         \
+     filter[5] * src[x + 2 * stride] +                                         \
+     filter[6] * src[x + 3 * stride] +                                         \
+     filter[7] * src[x + 4 * stride])
+
+static void FUNC(put_hevc_qpel_h)(int16_t *dst,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        dst += MAX_PB_SIZE;
     }
+}
 
-    {
-        // Restore pixels that can't be modified
-        int save_upper_right = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[1];
-        if(vert_edge && sao_eo_class != SAO_EO_VERT)
-            for(y = init_y+save_upper_right; y< height; y++)
-                dst[y*stride+width-1] = src[y*stride+width-1];
-        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
-            for(x = init_x; x<width-save_upper_right; x++)
-                dst[x] = src[x];
-        if(diag_edge && sao_eo_class == SAO_EO_45D)
-            dst[width-1] = src[width-1];
+static void FUNC(put_hevc_qpel_v)(int16_t *dst,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+    for (y = 0; y < height; y++)  {
+        for (x = 0; x < width; x++)
+            dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        dst += MAX_PB_SIZE;
     }
-#undef CMP
 }
 
-static void FUNC(sao_edge_filter_3)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int _width, int _height,
-                                    int c_idx, uint8_t vert_edge,
-                                    uint8_t horiz_edge, uint8_t diag_edge)
+static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
+                                   uint8_t *_src,
+                                   ptrdiff_t _srcstride,
+                                   int height, intptr_t mx,
+                                   intptr_t my, int width)
 {
     int x, y;
-    pixel *dst = (pixel *)_dst;
-    pixel *src = (pixel *)_src;
-    int chroma = !!c_idx;
-    int *sao_offset_val = sao->offset_val[c_idx];
-    int sao_eo_class    = sao->eo_class[c_idx];
-    int init_x = 0, init_y = 0, width = _width, height = _height;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
 
-    static const int8_t pos[4][2][2] = {
-        { { -1,  0 }, {  1, 0 } }, // horizontal
-        { {  0, -1 }, {  0, 1 } }, // vertical
-        { { -1, -1 }, {  1, 1 } }, // 45 degree
-        { {  1, -1 }, { -1, 1 } }, // 135 degree
-    };
-    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
 
-#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_qpel_filters[my - 1];
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
+        tmp += MAX_PB_SIZE;
+        dst += MAX_PB_SIZE;
+    }
+}
 
-    stride /= sizeof(pixel);
+static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                      uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+    int shift = 14 - BIT_DEPTH;
 
-    init_y = -(4 >> chroma) - 2;
-    init_x = -(8 >> chroma) - 2;
-    width  =  (8 >> chroma) + 2;
-    height =  (4 >> chroma) + 2;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
 
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
+        src += srcstride;
+        dst += dststride;
+    }
+}
 
-    dst    = dst + (init_y * stride + init_x);
-    src    = src + (init_y * stride + init_x);
-    init_y = init_x = 0;
+static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
 
-    {
-        int y_stride = init_y * stride;
-        int pos_0_0  = pos[sao_eo_class][0][0];
-        int pos_0_1  = pos[sao_eo_class][0][1];
-        int pos_1_0  = pos[sao_eo_class][1][0];
-        int pos_1_1  = pos[sao_eo_class][1][1];
-
-        int y_stride_0_1 = (init_y + pos_0_1) * stride;
-        int y_stride_1_1 = (init_y + pos_1_1) * stride;
-
-        for (y = init_y; y < height; y++) {
-            for (x = init_x; x < width; x++) {
-                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
-                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
-                int offset_val    = edge_idx[2 + diff0 + diff1];
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
-            }
-            y_stride     += stride;
-            y_stride_0_1 += stride;
-            y_stride_1_1 += stride;
-        }
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+
+    int shift = 14  + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
     }
+}
 
-    {
-        // Restore pixels that can't be modified
-        int save_lower_right = !diag_edge && sao_eo_class == SAO_EO_135D;
-        if(vert_edge && sao_eo_class != SAO_EO_VERT)
-            for(y = init_y; y< height-save_lower_right; y++)
-                dst[y*stride+width-1] = src[y*stride+width-1];
-        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
-            for(x = init_x; x<width-save_lower_right; x++)
-                dst[(height-1)*stride+x] = src[(height-1)*stride+x];
-        if(diag_edge && sao_eo_class == SAO_EO_135D)
-            dst[stride*(height-1)+width-1] = src[stride*(height-1)+width-1];
+static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                     uint8_t *_src, ptrdiff_t _srcstride,
+                                     int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+    int shift = 14 - BIT_DEPTH;
+
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
+        src += srcstride;
+        dst += dststride;
     }
-#undef CMP
 }
 
-#undef SET
-#undef SCALE
-#undef ADD_AND_SCALE
-#undef TR_4
-#undef TR_8
-#undef TR_16
-#undef TR_32
 
-static void FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride,
+static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+
+    int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
+
+static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
                                        uint8_t *_src, ptrdiff_t _srcstride,
-                                       int width, int height, int16_t* mcbuffer)
+                                       int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
-    pixel *src          = (pixel *)_src;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift =  14 - BIT_DEPTH;
+
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_qpel_filters[my - 1];
 
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = src[x] << (14 - BIT_DEPTH);
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
+    }
+}
+
+static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int16_t *src2,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_qpel_filters[my - 1];
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
+
+static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                        uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox,
+                                        intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    ox = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
         src += srcstride;
         dst += dststride;
     }
 }
 
-#define QPEL_FILTER_1(src, stride)      \
-    (1 * -src[x - 3 * stride] +         \
-     4 *  src[x - 2 * stride] -         \
-    10 *  src[x -     stride] +         \
-    58 *  src[x]              +         \
-    17 *  src[x +     stride] -         \
-     5 *  src[x + 2 * stride] +         \
-     1 *  src[x + 3 * stride])
-
-#define QPEL_FILTER_2(src, stride)      \
-    (1  * -src[x - 3 * stride] +        \
-     4  *  src[x - 2 * stride] -        \
-    11  *  src[x -     stride] +        \
-    40  *  src[x]              +        \
-    40  *  src[x +     stride] -        \
-    11  *  src[x + 2 * stride] +        \
-     4  *  src[x + 3 * stride] -        \
-     1  *  src[x + 4 * stride])
-
-#define QPEL_FILTER_3(src, stride)      \
-    (1  * src[x - 2 * stride] -         \
-     5  * src[x -     stride] +         \
-    17  * src[x]              +         \
-    58  * src[x + stride]     -         \
-    10  * src[x + 2 * stride] +         \
-     4  * src[x + 3 * stride] -         \
-     1  * src[x + 4 * stride])
-
-
-#define PUT_HEVC_QPEL_H(H)                                                     \
-static void FUNC(put_hevc_qpel_h ## H)(int16_t *dst,  ptrdiff_t dststride,     \
-                                       uint8_t *_src, ptrdiff_t _srcstride,    \
-                                       int width, int height,                  \
-                                       int16_t* mcbuffer)                      \
-{                                                                              \
-    int x, y;                                                                  \
-    pixel *src = (pixel*)_src;                                                 \
-    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
-                                                                               \
-    for (y = 0; y < height; y++) {                                             \
-        for (x = 0; x < width; x++)                                            \
-            dst[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8);             \
-        src += srcstride;                                                      \
-        dst += dststride;                                                      \
-    }                                                                          \
+static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+
+    int shift = 14  + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
 }
 
-#define PUT_HEVC_QPEL_V(V)                                                     \
-static void FUNC(put_hevc_qpel_v ## V)(int16_t *dst,  ptrdiff_t dststride,     \
-                                       uint8_t *_src, ptrdiff_t _srcstride,    \
-                                       int width, int height,                  \
-                                       int16_t* mcbuffer)                      \
-{                                                                              \
-    int x, y;                                                                  \
-    pixel *src = (pixel*)_src;                                                 \
-    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
-                                                                               \
-    for (y = 0; y < height; y++)  {                                            \
-        for (x = 0; x < width; x++)                                            \
-            dst[x] = QPEL_FILTER_ ## V(src, srcstride) >> (BIT_DEPTH - 8);     \
-        src += srcstride;                                                      \
-        dst += dststride;                                                      \
-    }                                                                          \
+static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                        uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox,
+                                        intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    ox = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
+        src += srcstride;
+        dst += dststride;
+    }
 }
 
-#define PUT_HEVC_QPEL_HV(H, V)                                                 \
-static void FUNC(put_hevc_qpel_h ## H ## v ## V)(int16_t *dst,                 \
-                                                 ptrdiff_t dststride,          \
-                                                 uint8_t *_src,                \
-                                                 ptrdiff_t _srcstride,         \
-                                                 int width, int height,        \
-                                                 int16_t* mcbuffer)            \
-{                                                                              \
-    int x, y;                                                                  \
-    pixel *src = (pixel*)_src;                                                 \
-    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
-                                                                               \
-    int16_t tmp_array[(MAX_PB_SIZE + 7) * MAX_PB_SIZE];                        \
-    int16_t *tmp = tmp_array;                                                  \
-                                                                               \
-    src -= ff_hevc_qpel_extra_before[V] * srcstride;                           \
-                                                                               \
-    for (y = 0; y < height + ff_hevc_qpel_extra[V]; y++) {                     \
-        for (x = 0; x < width; x++)                                            \
-            tmp[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8);             \
-        src += srcstride;                                                      \
-        tmp += MAX_PB_SIZE;                                                    \
-    }                                                                          \
-                                                                               \
-    tmp = tmp_array + ff_hevc_qpel_extra_before[V] * MAX_PB_SIZE;              \
-                                                                               \
-    for (y = 0; y < height; y++) {                                             \
-        for (x = 0; x < width; x++)                                            \
-            dst[x] = QPEL_FILTER_ ## V(tmp, MAX_PB_SIZE) >> 6;                 \
-        tmp += MAX_PB_SIZE;                                                    \
-        dst += dststride;                                                      \
-    }                                                                          \
+static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
 }
 
-PUT_HEVC_QPEL_H(1)
-PUT_HEVC_QPEL_H(2)
-PUT_HEVC_QPEL_H(3)
-PUT_HEVC_QPEL_V(1)
-PUT_HEVC_QPEL_V(2)
-PUT_HEVC_QPEL_V(3)
-PUT_HEVC_QPEL_HV(1, 1)
-PUT_HEVC_QPEL_HV(1, 2)
-PUT_HEVC_QPEL_HV(1, 3)
-PUT_HEVC_QPEL_HV(2, 1)
-PUT_HEVC_QPEL_HV(2, 2)
-PUT_HEVC_QPEL_HV(2, 3)
-PUT_HEVC_QPEL_HV(3, 1)
-PUT_HEVC_QPEL_HV(3, 2)
-PUT_HEVC_QPEL_HV(3, 3)
-
-static void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
-                                       uint8_t *_src, ptrdiff_t _srcstride,
-                                       int width, int height, int mx, int my,
-                                       int16_t* mcbuffer)
+static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                         uint8_t *_src, ptrdiff_t _srcstride,
+                                         int height, int denom, int wx, int ox,
+                                         intptr_t mx, intptr_t my, int width)
 {
     int x, y;
-    pixel *src          = (pixel *)_src;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
 
-    for (y = 0; y < height; y++) {
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = src[x] << (14 - BIT_DEPTH);
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_qpel_filters[my - 1];
+
+    ox = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
+        tmp += MAX_PB_SIZE;
         dst += dststride;
     }
 }
 
-#define EPEL_FILTER(src, stride)                \
-    (filter_0 * src[x - stride] +               \
-     filter_1 * src[x]          +               \
-     filter_2 * src[x + stride] +               \
-     filter_3 * src[x + 2 * stride])
+static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int16_t *src2,
+                                        int height, int denom, int wx0, int wx1,
+                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_qpel_filters[my - 1];
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+#define EPEL_FILTER(src, stride)                                               \
+    (filter[0] * src[x - stride] +                                             \
+     filter[1] * src[x]          +                                             \
+     filter[2] * src[x + stride] +                                             \
+     filter[3] * src[x + 2 * stride])
 
-static void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
+static void FUNC(put_hevc_epel_h)(int16_t *dst,
                                   uint8_t *_src, ptrdiff_t _srcstride,
-                                  int width, int height, int mx, int my,
-                                  int16_t* mcbuffer)
+                                  int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
-    int8_t filter_0 = filter[0];
-    int8_t filter_1 = filter[1];
-    int8_t filter_2 = filter[2];
-    int8_t filter_3 = filter[3];
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
-        dst += dststride;
+        dst += MAX_PB_SIZE;
     }
 }
 
-static void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
+static void FUNC(put_hevc_epel_v)(int16_t *dst,
                                   uint8_t *_src, ptrdiff_t _srcstride,
-                                  int width, int height, int mx, int my,
-                                  int16_t* mcbuffer)
+                                  int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
     const int8_t *filter = ff_hevc_epel_filters[my - 1];
-    int8_t filter_0 = filter[0];
-    int8_t filter_1 = filter[1];
-    int8_t filter_2 = filter[2];
-    int8_t filter_3 = filter[3];
 
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
         src += srcstride;
-        dst += dststride;
+        dst += MAX_PB_SIZE;
     }
 }
 
-static void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
+static void FUNC(put_hevc_epel_hv)(int16_t *dst,
                                    uint8_t *_src, ptrdiff_t _srcstride,
-                                   int width, int height, int mx, int my,
-                                   int16_t* mcbuffer)
+                                   int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter_h = ff_hevc_epel_filters[mx - 1];
-    const int8_t *filter_v = ff_hevc_epel_filters[my - 1];
-    int8_t filter_0 = filter_h[0];
-    int8_t filter_1 = filter_h[1];
-    int8_t filter_2 = filter_h[2];
-    int8_t filter_3 = filter_h[3];
-    int16_t tmp_array[(MAX_PB_SIZE + 3) * MAX_PB_SIZE];
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
     int16_t *tmp = tmp_array;
 
     src -= EPEL_EXTRA_BEFORE * srcstride;
@@ -999,49 +1127,101 @@ static void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
     }
 
     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-    filter_0 = filter_v[0];
-    filter_1 = filter_v[1];
-    filter_2 = filter_v[2];
-    filter_3 = filter_v[3];
+    filter = ff_hevc_epel_filters[my - 1];
+
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
         tmp += MAX_PB_SIZE;
+        dst += MAX_PB_SIZE;
+    }
+}
+
+static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
+        src += srcstride;
         dst += dststride;
     }
 }
 
-static void FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
-                                      int16_t *src, ptrdiff_t srcstride,
-                                      int width, int height)
+static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
 
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+        }
+        dst  += dststride;
+        src  += srcstride;
+        src2 += MAX_PB_SIZE;
+    }
+}
+
+static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[my - 1];
     int shift = 14 - BIT_DEPTH;
 #if BIT_DEPTH < 14
     int offset = 1 << (shift - 1);
 #else
     int offset = 0;
 #endif
+
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel((src[x] + offset) >> shift);
-        dst += dststride;
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
         src += srcstride;
+        dst += dststride;
     }
 }
 
-static void FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
-                                          int16_t *src1, int16_t *src2,
-                                          ptrdiff_t srcstride,
-                                          int width, int height)
+static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[my - 1];
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
-
     int shift = 14 + 1 - BIT_DEPTH;
 #if BIT_DEPTH < 14
     int offset = 1 << (shift - 1);
@@ -1051,71 +1231,273 @@ static void FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
 
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel((src1[x] + src2[x] + offset) >> shift);
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
         dst  += dststride;
-        src1 += srcstride;
-        src2 += srcstride;
+        src  += srcstride;
+        src2 += MAX_PB_SIZE;
+    }
+}
+
+static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_epel_filters[my - 1];
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
     }
 }
 
-static void FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
-                                uint8_t *_dst, ptrdiff_t _dststride,
-                                int16_t *src, ptrdiff_t srcstride,
-                                int width, int height)
+static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int16_t *src2,
+                                      int height, intptr_t mx, intptr_t my, int width)
 {
-    int shift, log2Wd, wx, ox, x, y, offset;
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_epel_filters[my - 1];
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
 
-    shift  = 14 - BIT_DEPTH;
-    log2Wd = denom + shift;
-    offset = 1 << (log2Wd - 1);
-    wx     = wlxFlag;
-    ox     = olxFlag * (1 << (BIT_DEPTH - 8));
+static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
 
+    ox     = ox * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
-            if (log2Wd >= 1) {
-                dst[x] = av_clip_pixel(((src[x] * wx + offset) >> log2Wd) + ox);
-            } else {
-                dst[x] = av_clip_pixel(src[x] * wx + ox);
-            }
+            dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
         }
         dst += dststride;
         src += srcstride;
     }
 }
 
-static void FUNC(weighted_pred_avg)(uint8_t denom,
-                                    int16_t wl0Flag, int16_t wl1Flag,
-                                    int16_t ol0Flag, int16_t ol1Flag,
-                                    uint8_t *_dst, ptrdiff_t _dststride,
-                                    int16_t *src1, int16_t *src2,
-                                    ptrdiff_t srcstride,
-                                    int width, int height)
+static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 {
-    int shift, log2Wd, w0, w1, o0, o1, x, y;
-    pixel *dst = (pixel *)_dst;
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
+
+static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[my - 1];
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    ox     = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
+        }
+        dst += dststride;
+        src += srcstride;
+    }
+}
 
-    shift  = 14 - BIT_DEPTH;
-    log2Wd = denom + shift;
-    w0     = wl0Flag;
-    w1     = wl1Flag;
-    o0     = ol0Flag * (1 << (BIT_DEPTH - 8));
-    o1     = ol1Flag * (1 << (BIT_DEPTH - 8));
+static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[my - 1];
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
 
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel((src1[x] * w0 + src2[x] * w1 +
-                                    ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1));
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+        src  += srcstride;
         dst  += dststride;
-        src1 += srcstride;
-        src2 += srcstride;
+        src2 += MAX_PB_SIZE;
     }
 }
 
-// line zero
+static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_epel_filters[my - 1];
+
+    ox     = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
+    }
+}
+
+static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int16_t *src2,
+                                        int height, int denom, int wx0, int wx1,
+                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_epel_filters[my - 1];
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}// line zero
 #define P3 pix[-4 * xstride]
 #define P2 pix[-3 * xstride]
 #define P1 pix[-2 * xstride]
@@ -1266,21 +1648,21 @@ static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
 }
 
 static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-                                            int *tc, uint8_t *no_p,
+                                            int32_t *tc, uint8_t *no_p,
                                             uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
 }
 
 static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-                                            int *tc, uint8_t *no_p,
+                                            int32_t *tc, uint8_t *no_p,
                                             uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
 }
 
 static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                          int beta, int *tc, uint8_t *no_p,
+                                          int beta, int32_t *tc, uint8_t *no_p,
                                           uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
@@ -1288,7 +1670,7 @@ static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
 }
 
 static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                          int beta, int *tc, uint8_t *no_p,
+                                          int beta, int32_t *tc, uint8_t *no_p,
                                           uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c
index 1ba2487a62..02c1766059 100644
--- a/libavcodec/hevcpred.c
+++ b/libavcodec/hevcpred.c
@@ -1,27 +1,29 @@
 /*
- * HEVC video decoder
+ * HEVC video Decoder
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "hevc.h"
 
+#include "hevcpred.h"
+
 #define BIT_DEPTH 8
 #include "hevcpred_template.c"
 #undef BIT_DEPTH
@@ -34,6 +36,10 @@
 #include "hevcpred_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+
 void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
 {
 #undef FUNC
@@ -61,8 +67,14 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
     case 10:
         HEVC_PRED(10);
         break;
+    case 12:
+        HEVC_PRED(12);
+        break;
     default:
         HEVC_PRED(8);
         break;
     }
+
+    if (ARCH_MIPS)
+        ff_hevc_pred_init_mips(hpc, bit_depth);
 }
diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h
new file mode 100644
index 0000000000..eb17663683
--- /dev/null
+++ b/libavcodec/hevcpred.h
@@ -0,0 +1,46 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HEVCPRED_H
+#define AVCODEC_HEVCPRED_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+struct HEVCContext;
+
+typedef struct HEVCPredContext {
+    void (*intra_pred[4])(struct HEVCContext *s, int x0, int y0, int c_idx);
+
+    void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
+                           const uint8_t *left, ptrdiff_t stride);
+    void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left,
+                    ptrdiff_t stride, int log2_size, int c_idx);
+    void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
+                            const uint8_t *left, ptrdiff_t stride,
+                            int c_idx, int mode);
+} HEVCPredContext;
+
+void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
+void ff_hevc_pred_init_mips(HEVCPredContext *hpc, int bit_depth);
+
+#endif /* AVCODEC_HEVCPRED_H */
diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
index 039882bb52..6ae87cca13 100644
--- a/libavcodec/hevcpred_template.c
+++ b/libavcodec/hevcpred_template.c
@@ -3,28 +3,27 @@
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/pixdesc.h"
 
-#include "hevc.h"
-
 #include "bit_depth_template.c"
+#include "hevcpred.h"
 
 #define POS(x, y) src[(x) + stride * (y)]
 
@@ -38,10 +37,9 @@ static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0,
 #define MVF_PU(x, y) \
     MVF(PU(x0 + ((x) << hshift)), PU(y0 + ((y) << vshift)))
 #define IS_INTRA(x, y) \
-    MVF_PU(x, y).is_intra
+    (MVF_PU(x, y).pred_flag == PF_INTRA)
 #define MIN_TB_ADDR_ZS(x, y) \
-    s->ps.pps->min_tb_addr_zs[(y) * s->ps.sps->min_tb_width + (x)]
-
+    s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)]
 #define EXTEND(ptr, val, len)         \
 do {                                  \
     pixel4 pix = PIXEL_SPLAT_X4(val); \
@@ -49,36 +47,43 @@ do {                                  \
         AV_WN4P(ptr + i, pix);        \
 } while (0)
 
+#define EXTEND_RIGHT_CIP(ptr, start, length)                                   \
+        for (i = start; i < (start) + (length); i += 4)                        \
+            if (!IS_INTRA(i, -1))                                              \
+                AV_WN4P(&ptr[i], a);                                           \
+            else                                                               \
+                a = PIXEL_SPLAT_X4(ptr[i+3])
 #define EXTEND_LEFT_CIP(ptr, start, length) \
-        for (i = (start); i > (start) - (length); i--) \
+        for (i = start; i > (start) - (length); i--) \
             if (!IS_INTRA(i - 1, -1)) \
                 ptr[i - 1] = ptr[i]
-#define EXTEND_RIGHT_CIP(ptr, start, length) \
-        for (i = (start); i < (start) + (length); i++) \
-            if (!IS_INTRA(i, -1)) \
-                ptr[i] = ptr[i - 1]
-#define EXTEND_UP_CIP(ptr, start, length) \
-        for (i = (start); i > (start) - (length); i--) \
-            if (!IS_INTRA(-1, i - 1)) \
-                ptr[i - 1] = ptr[i]
-#define EXTEND_UP_CIP_0(ptr, start, length) \
-        for (i = (start); i > (start) - (length); i--) \
-            ptr[i - 1] = ptr[i]
-#define EXTEND_DOWN_CIP(ptr, start, length) \
-        for (i = (start); i < (start) + (length); i++) \
-            if (!IS_INTRA(-1, i)) \
-                ptr[i] = ptr[i - 1]
-    HEVCLocalContext *lc = &s->HEVClc;
+#define EXTEND_UP_CIP(ptr, start, length)                                      \
+        for (i = (start); i > (start) - (length); i -= 4)                      \
+            if (!IS_INTRA(-1, i - 3))                                          \
+                AV_WN4P(&ptr[i - 3], a);                                       \
+            else                                                               \
+                a = PIXEL_SPLAT_X4(ptr[i - 3])
+#define EXTEND_DOWN_CIP(ptr, start, length)                                    \
+        for (i = start; i < (start) + (length); i += 4)                        \
+            if (!IS_INTRA(-1, i))                                              \
+                AV_WN4P(&ptr[i], a);                                           \
+            else                                                               \
+                a = PIXEL_SPLAT_X4(ptr[i + 3])
+
+    HEVCLocalContext *lc = s->HEVClc;
     int i;
     int hshift = s->ps.sps->hshift[c_idx];
     int vshift = s->ps.sps->vshift[c_idx];
     int size = (1 << log2_size);
-    int size_in_luma = size << hshift;
-    int size_in_tbs = size_in_luma >> s->ps.sps->log2_min_tb_size;
+    int size_in_luma_h = size << hshift;
+    int size_in_tbs_h  = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
+    int size_in_luma_v = size << vshift;
+    int size_in_tbs_v  = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
     int x = x0 >> hshift;
     int y = y0 >> vshift;
-    int x_tb = x0 >> s->ps.sps->log2_min_tb_size;
-    int y_tb = y0 >> s->ps.sps->log2_min_tb_size;
+    int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+    int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+
     int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
 
     ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
@@ -86,87 +91,77 @@ do {                                  \
 
     int min_pu_width = s->ps.sps->min_pu_width;
 
-    enum IntraPredMode mode = c_idx ? lc->pu.intra_pred_mode_c :
-                              lc->tu.cur_intra_pred_mode;
-
-    pixel left_array[2 * MAX_TB_SIZE + 1];
-    pixel filtered_left_array[2 * MAX_TB_SIZE + 1];
-    pixel top_array[2 * MAX_TB_SIZE + 1];
-    pixel filtered_top_array[2 * MAX_TB_SIZE + 1];
-
-    pixel *left          = left_array + 1;
-    pixel *top           = top_array  + 1;
-    pixel *filtered_left = filtered_left_array + 1;
-    pixel *filtered_top  = filtered_top_array  + 1;
-
-    int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS(x_tb - 1, y_tb + size_in_tbs);
+    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
+                              lc->tu.intra_pred_mode;
+    pixel4 a;
+    pixel  left_array[2 * MAX_TB_SIZE + 1];
+    pixel  filtered_left_array[2 * MAX_TB_SIZE + 1];
+    pixel  top_array[2 * MAX_TB_SIZE + 1];
+    pixel  filtered_top_array[2 * MAX_TB_SIZE + 1];
+
+    pixel  *left          = left_array + 1;
+    pixel  *top           = top_array  + 1;
+    pixel  *filtered_left = filtered_left_array + 1;
+    pixel  *filtered_top  = filtered_top_array  + 1;
+    int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask);
     int cand_left        = lc->na.cand_left;
     int cand_up_left     = lc->na.cand_up_left;
     int cand_up          = lc->na.cand_up;
-    int cand_up_right    = lc->na.cand_up_right && cur_tb_addr > MIN_TB_ADDR_ZS(x_tb + size_in_tbs, y_tb - 1);
+    int cand_up_right    = lc->na.cand_up_right    && cur_tb_addr > MIN_TB_ADDR_ZS((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask, y_tb - 1);
 
-    int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma, s->ps.sps->height) -
-                            (y0 + size_in_luma)) >> vshift;
-    int top_right_size   = (FFMIN(x0 + 2 * size_in_luma, s->ps.sps->width) -
-                            (x0 + size_in_luma)) >> hshift;
+    int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma_v, s->ps.sps->height) -
+                           (y0 + size_in_luma_v)) >> vshift;
+    int top_right_size   = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
+                           (x0 + size_in_luma_h)) >> hshift;
 
     if (s->ps.pps->constrained_intra_pred_flag == 1) {
-        int size_in_luma_pu = PU(size_in_luma);
-        int on_pu_edge_x    = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
-        int on_pu_edge_y    = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
-        if (!size_in_luma_pu)
-            size_in_luma_pu++;
+        int size_in_luma_pu_v = PU(size_in_luma_v);
+        int size_in_luma_pu_h = PU(size_in_luma_h);
+        int on_pu_edge_x    = !av_mod_uintp2(x0, s->ps.sps->log2_min_pu_size);
+        int on_pu_edge_y    = !av_mod_uintp2(y0, s->ps.sps->log2_min_pu_size);
+        if (!size_in_luma_pu_h)
+            size_in_luma_pu_h++;
         if (cand_bottom_left == 1 && on_pu_edge_x) {
             int x_left_pu   = PU(x0 - 1);
-            int y_bottom_pu = PU(y0 + size_in_luma);
-            int max = FFMIN(size_in_luma_pu, s->ps.sps->min_pu_height - y_bottom_pu);
+            int y_bottom_pu = PU(y0 + size_in_luma_v);
+            int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_bottom_pu);
             cand_bottom_left = 0;
-            for (i = 0; i < max; i++)
-                cand_bottom_left |= MVF(x_left_pu, y_bottom_pu + i).is_intra;
+            for (i = 0; i < max; i += 2)
+                cand_bottom_left |= (MVF(x_left_pu, y_bottom_pu + i).pred_flag == PF_INTRA);
         }
         if (cand_left == 1 && on_pu_edge_x) {
             int x_left_pu   = PU(x0 - 1);
             int y_left_pu   = PU(y0);
-            int max = FFMIN(size_in_luma_pu, s->ps.sps->min_pu_height - y_left_pu);
+            int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_left_pu);
             cand_left = 0;
-            for (i = 0; i < max; i++)
-                cand_left |= MVF(x_left_pu, y_left_pu + i).is_intra;
+            for (i = 0; i < max; i += 2)
+                cand_left |= (MVF(x_left_pu, y_left_pu + i).pred_flag == PF_INTRA);
         }
         if (cand_up_left == 1) {
             int x_left_pu   = PU(x0 - 1);
             int y_top_pu    = PU(y0 - 1);
-            cand_up_left = MVF(x_left_pu, y_top_pu).is_intra;
+            cand_up_left = MVF(x_left_pu, y_top_pu).pred_flag == PF_INTRA;
         }
         if (cand_up == 1 && on_pu_edge_y) {
             int x_top_pu    = PU(x0);
             int y_top_pu    = PU(y0 - 1);
-            int max = FFMIN(size_in_luma_pu, s->ps.sps->min_pu_width - x_top_pu);
+            int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_top_pu);
             cand_up = 0;
-            for (i = 0; i < max; i++)
-                cand_up |= MVF(x_top_pu + i, y_top_pu).is_intra;
+            for (i = 0; i < max; i += 2)
+                cand_up |= (MVF(x_top_pu + i, y_top_pu).pred_flag == PF_INTRA);
         }
         if (cand_up_right == 1 && on_pu_edge_y) {
             int y_top_pu    = PU(y0 - 1);
-            int x_right_pu  = PU(x0 + size_in_luma);
-            int max = FFMIN(size_in_luma_pu, s->ps.sps->min_pu_width - x_right_pu);
+            int x_right_pu  = PU(x0 + size_in_luma_h);
+            int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_right_pu);
             cand_up_right = 0;
-            for (i = 0; i < max; i++)
-                cand_up_right |= MVF(x_right_pu + i, y_top_pu).is_intra;
+            for (i = 0; i < max; i += 2)
+                cand_up_right |= (MVF(x_right_pu + i, y_top_pu).pred_flag == PF_INTRA);
         }
-        for (i = 0; i < 2 * MAX_TB_SIZE; i++) {
-            left[i] = 128;
-            top[i]  = 128;
-        }
-    }
-    if (cand_bottom_left) {
-        for (i = size; i < size + bottom_left_size; i++)
-            left[i] = POS(-1, i);
-        EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1),
-               size - bottom_left_size);
+        memset(left, 128, 2 * MAX_TB_SIZE*sizeof(pixel));
+        memset(top , 128, 2 * MAX_TB_SIZE*sizeof(pixel));
+        top[-1] = 128;
     }
-    if (cand_left)
-        for (i = size - 1; i >= 0; i--)
-            left[i] = POS(-1, i);
     if (cand_up_left) {
         left[-1] = POS(-1, -1);
         top[-1]  = left[-1];
@@ -178,6 +173,15 @@ do {                                  \
         EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1),
                size - top_right_size);
     }
+    if (cand_left)
+        for (i = 0; i < size; i++)
+            left[i] = POS(-1, i);
+    if (cand_bottom_left) {
+        for (i = size; i < size + bottom_left_size; i++)
+            left[i] = POS(-1, i);
+        EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1),
+               size - bottom_left_size);
+    }
 
     if (s->ps.pps->constrained_intra_pred_flag == 1) {
         if (cand_bottom_left || cand_left || cand_up_left || cand_up || cand_up_right) {
@@ -203,7 +207,6 @@ do {                                  \
                         j++;
                     EXTEND_LEFT_CIP(top, j, j + 1);
                     left[-1] = top[-1];
-                    j        = 0;
                 }
             } else {
                 j = 0;
@@ -217,24 +220,30 @@ do {                                  \
                         top[-1] = top[0];
                     }
                 left[-1] = top[-1];
-                j        = 0;
             }
+            left[-1] = top[-1];
             if (cand_bottom_left || cand_left) {
-                EXTEND_DOWN_CIP(left, j, size_max_y - j);
+                a = PIXEL_SPLAT_X4(left[-1]);
+                EXTEND_DOWN_CIP(left, 0, size_max_y);
             }
             if (!cand_left)
                 EXTEND(left, left[-1], size);
             if (!cand_bottom_left)
                 EXTEND(left + size, left[size - 1], size);
             if (x0 != 0 && y0 != 0) {
+                a = PIXEL_SPLAT_X4(left[size_max_y - 1]);
                 EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
+                if (!IS_INTRA(-1, - 1))
+                    left[-1] = left[0];
             } else if (x0 == 0) {
-                EXTEND_UP_CIP_0(left, size_max_y - 1, size_max_y);
+                EXTEND(left, 0, size_max_y);
             } else {
-                EXTEND_UP_CIP(left, size_max_y - 1, size_max_y - 1);
+                a = PIXEL_SPLAT_X4(left[size_max_y - 1]);
+                EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
             }
             top[-1] = left[-1];
             if (y0 != 0) {
+                a = PIXEL_SPLAT_X4(left[-1]);
                 EXTEND_RIGHT_CIP(top, 0, size_max_x);
             }
         }
@@ -278,40 +287,42 @@ do {                                  \
     top[-1] = left[-1];
 
     // Filtering process
-    if (c_idx == 0 && mode != INTRA_DC && size != 4) {
-        int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
-        int min_dist_vert_hor = FFMIN(FFABS((int)mode - 26),
-                                      FFABS((int)mode - 10));
-        if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) {
-            int threshold = 1 << (BIT_DEPTH - 5);
-            if (s->ps.sps->sps_strong_intra_smoothing_enable_flag &&
-                log2_size == 5 &&
-                FFABS(top[-1]  + top[63]  - 2 * top[31])  < threshold &&
-                FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) {
-                // We can't just overwrite values in top because it could be
-                // a pointer into src
-                filtered_top[-1] = top[-1];
-                filtered_top[63] = top[63];
-                for (i = 0; i < 63; i++)
-                    filtered_top[i] = ((64 - (i + 1)) * top[-1] +
-                                             (i + 1)  * top[63] + 32) >> 6;
-                for (i = 0; i < 63; i++)
-                    left[i] = ((64 - (i + 1)) * left[-1] +
-                                     (i + 1)  * left[63] + 32) >> 6;
-                top = filtered_top;
-            } else {
-                filtered_left[2 * size - 1] = left[2 * size - 1];
-                filtered_top[2 * size - 1]  = top[2 * size - 1];
-                for (i = 2 * size - 2; i >= 0; i--)
-                    filtered_left[i] = (left[i + 1] + 2 * left[i] +
-                                        left[i - 1] + 2) >> 2;
-                filtered_top[-1]  =
-                filtered_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
-                for (i = 2 * size - 2; i >= 0; i--)
-                    filtered_top[i] = (top[i + 1] + 2 * top[i] +
-                                       top[i - 1] + 2) >> 2;
-                left = filtered_left;
-                top  = filtered_top;
+    if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0  || s->ps.sps->chroma_format_idc == 3)) {
+        if (mode != INTRA_DC && size != 4){
+            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
+            int min_dist_vert_hor = FFMIN(FFABS((int)(mode - 26U)),
+                                          FFABS((int)(mode - 10U)));
+            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) {
+                int threshold = 1 << (BIT_DEPTH - 5);
+                if (s->ps.sps->sps_strong_intra_smoothing_enable_flag && c_idx == 0 &&
+                    log2_size == 5 &&
+                    FFABS(top[-1]  + top[63]  - 2 * top[31])  < threshold &&
+                    FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) {
+                    // We can't just overwrite values in top because it could be
+                    // a pointer into src
+                    filtered_top[-1] = top[-1];
+                    filtered_top[63] = top[63];
+                    for (i = 0; i < 63; i++)
+                        filtered_top[i] = ((64 - (i + 1)) * top[-1] +
+                                           (i + 1)  * top[63] + 32) >> 6;
+                    for (i = 0; i < 63; i++)
+                        left[i] = ((64 - (i + 1)) * left[-1] +
+                                   (i + 1)  * left[63] + 32) >> 6;
+                    top = filtered_top;
+                } else {
+                    filtered_left[2 * size - 1] = left[2 * size - 1];
+                    filtered_top[2 * size - 1]  = top[2 * size - 1];
+                    for (i = 2 * size - 2; i >= 0; i--)
+                        filtered_left[i] = (left[i + 1] + 2 * left[i] +
+                                            left[i - 1] + 2) >> 2;
+                    filtered_top[-1]  =
+                    filtered_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
+                    for (i = 2 * size - 2; i >= 0; i--)
+                        filtered_top[i] = (top[i + 1] + 2 * top[i] +
+                                           top[i - 1] + 2) >> 2;
+                    left = filtered_left;
+                    top  = filtered_top;
+                }
             }
         }
     }
@@ -394,8 +405,8 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
     a = PIXEL_SPLAT_X4(dc);
 
     for (i = 0; i < size; i++)
-        for (j = 0; j < size / 4; j++)
-            AV_WN4PA(&POS(j * 4, i), a);
+        for (j = 0; j < size; j+=4)
+            AV_WN4P(&POS(j, i), a);
 
     if (c_idx == 0 && size < 32) {
         POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
@@ -427,7 +438,7 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
     };
 
     int angle = intra_pred_angle[mode - 2];
-    pixel ref_array[3 * MAX_TB_SIZE + 1];
+    pixel ref_array[3 * MAX_TB_SIZE + 4];
     pixel *ref_tmp = ref_array + size;
     const pixel *ref;
     int last = (size * angle) >> 5;
@@ -435,8 +446,8 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
     if (mode >= 18) {
         ref = top - 1;
         if (angle < 0 && last < -1) {
-            for (x = 0; x <= size; x++)
-                ref_tmp[x] = top[x - 1];
+            for (x = 0; x <= size; x += 4)
+                AV_WN4P(&ref_tmp[x], AV_RN4P(&top[x - 1]));
             for (x = last; x <= -1; x++)
                 ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
             ref = ref_tmp;
@@ -446,13 +457,19 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
             int idx  = ((y + 1) * angle) >> 5;
             int fact = ((y + 1) * angle) & 31;
             if (fact) {
-                for (x = 0; x < size; x++) {
-                    POS(x, y) = ((32 - fact) * ref[x + idx + 1] +
-                                       fact  * ref[x + idx + 2] + 16) >> 5;
+                for (x = 0; x < size; x += 4) {
+                    POS(x    , y) = ((32 - fact) * ref[x + idx + 1] +
+                                           fact  * ref[x + idx + 2] + 16) >> 5;
+                    POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] +
+                                           fact  * ref[x + 1 + idx + 2] + 16) >> 5;
+                    POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] +
+                                           fact  * ref[x + 2 + idx + 2] + 16) >> 5;
+                    POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] +
+                                           fact  * ref[x + 3 + idx + 2] + 16) >> 5;
                 }
             } else {
-                for (x = 0; x < size; x++)
-                    POS(x, y) = ref[x + idx + 1];
+                for (x = 0; x < size; x += 4)
+                    AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
             }
         }
         if (mode == 26 && c_idx == 0 && size < 32) {
@@ -462,8 +479,8 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
     } else {
         ref = left - 1;
         if (angle < 0 && last < -1) {
-            for (x = 0; x <= size; x++)
-                ref_tmp[x] = left[x - 1];
+            for (x = 0; x <= size; x += 4)
+                AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1]));
             for (x = last; x <= -1; x++)
                 ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
             ref = ref_tmp;
@@ -483,8 +500,12 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
             }
         }
         if (mode == 10 && c_idx == 0 && size < 32) {
-            for (x = 0; x < size; x++)
-                POS(x, 0) = av_clip_pixel(left[0] + ((top[x] - top[-1]) >> 1));
+            for (x = 0; x < size; x += 4) {
+                POS(x,     0) = av_clip_pixel(left[0] + ((top[x    ] - top[-1]) >> 1));
+                POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - top[-1]) >> 1));
+                POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - top[-1]) >> 1));
+                POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - top[-1]) >> 1));
+            }
         }
     }
 }
diff --git a/libavcodec/hnm4video.c b/libavcodec/hnm4video.c
index 1dc6ed3bac..a64dbb1746 100644
--- a/libavcodec/hnm4video.c
+++ b/libavcodec/hnm4video.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 David Kment
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -79,7 +79,7 @@ static void unpack_intraframe(AVCodecContext *avctx, uint8_t *src,
         if (getbit(&gb, &bitbuf, &bits)) {
             if (writeoffset >= hnm->width * hnm->height) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "Attempting to write out of bounds");
+                       "Attempting to write out of bounds\n");
                 break;
             }
             hnm->current[writeoffset++] = bytestream2_get_byte(&gb);
@@ -100,11 +100,11 @@ static void unpack_intraframe(AVCodecContext *avctx, uint8_t *src,
             count  += 2;
             offset += writeoffset;
             if (offset < 0 || offset + count >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds");
+                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
                 break;
             } else if (writeoffset + count >= hnm->width * hnm->height) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "Attempting to write out of bounds");
+                       "Attempting to write out of bounds\n");
                 break;
             }
             while (count--) {
@@ -147,7 +147,8 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
 {
     Hnm4VideoContext *hnm = avctx->priv_data;
     GetByteContext gb;
-    uint32_t writeoffset = 0, count, left, offset;
+    uint32_t writeoffset = 0;
+    int count, left, offset;
     uint8_t tag, previous, backline, backward, swap;
 
     bytestream2_init(&gb, src, size);
@@ -157,7 +158,12 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
         if (count == 0) {
             tag = bytestream2_get_byte(&gb) & 0xE0;
             tag = tag >> 5;
+
             if (tag == 0) {
+                if (writeoffset + 2 > hnm->width * hnm->height) {
+                    av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                    break;
+                }
                 hnm->current[writeoffset++] = bytestream2_get_byte(&gb);
                 hnm->current[writeoffset++] = bytestream2_get_byte(&gb);
             } else if (tag == 1) {
@@ -168,6 +174,10 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
                 writeoffset += count;
             } else if (tag == 3) {
                 count = bytestream2_get_byte(&gb) * 2;
+                if (writeoffset + count > hnm->width * hnm->height) {
+                    av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                    break;
+                }
                 while (count > 0) {
                     hnm->current[writeoffset++] = bytestream2_peek_byte(&gb);
                     count--;
@@ -176,6 +186,10 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
             } else {
                 break;
             }
+            if (writeoffset > hnm->width * hnm->height) {
+                av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                break;
+            }
         } else {
             previous = bytestream2_peek_byte(&gb) & 0x20;
             backline = bytestream2_peek_byte(&gb) & 0x40;
@@ -188,17 +202,28 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
 
             left = count;
 
-            if (!backward && offset + count >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds");
+            if (!backward && offset + 2*count > hnm->width * hnm->height) {
+                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
                 break;
-            } else if (backward && offset >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds");
+            } else if (backward && offset + 1 >= hnm->width * hnm->height) {
+                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
                 break;
-            } else if (writeoffset + count >= hnm->width * hnm->height) {
+            } else if (writeoffset + 2*count > hnm->width * hnm->height) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "Attempting to write out of bounds");
+                       "Attempting to write out of bounds\n");
                 break;
             }
+            if(backward) {
+                if (offset < (!!backline)*(2 * hnm->width - 1) + 2*(left-1)) {
+                    av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
+                    break;
+                }
+            } else {
+                if (offset < (!!backline)*(2 * hnm->width - 1)) {
+                    av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
+                    break;
+                }
+            }
 
             if (previous) {
                 while (left > 0) {
@@ -263,6 +288,10 @@ static void decode_interframe_v4a(AVCodecContext *avctx, uint8_t *src,
             if (tag == 0) {
                 writeoffset += bytestream2_get_byte(&gb);
             } else if (tag == 1) {
+                if (writeoffset + hnm->width >= hnm->width * hnm->height) {
+                    av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                    break;
+                }
                 hnm->current[writeoffset]              = bytestream2_get_byte(&gb);
                 hnm->current[writeoffset + hnm->width] = bytestream2_get_byte(&gb);
                 writeoffset++;
@@ -271,6 +300,10 @@ static void decode_interframe_v4a(AVCodecContext *avctx, uint8_t *src,
             } else if (tag == 3) {
                 break;
             }
+            if (writeoffset > hnm->width * hnm->height) {
+                av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                break;
+            }
         } else {
             delta    = bytestream2_peek_byte(&gb) & 0x80;
             previous = bytestream2_peek_byte(&gb) & 0x40;
@@ -279,14 +312,19 @@ static void decode_interframe_v4a(AVCodecContext *avctx, uint8_t *src,
             offset  = writeoffset;
             offset += bytestream2_get_le16(&gb);
 
-            if (delta)
+            if (delta) {
+                if (offset < 0x10000) {
+                    av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
+                    break;
+                }
                 offset -= 0x10000;
+            }
 
             if (offset + hnm->width + count >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds");
+                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
                 break;
             } else if (writeoffset + hnm->width + count >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to write out of bounds");
+                av_log(avctx, AV_LOG_ERROR, "Attempting to write out of bounds\n");
                 break;
             }
 
@@ -360,17 +398,23 @@ static int hnm_decode_frame(AVCodecContext *avctx, void *data,
     int ret;
     uint16_t chunk_id;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
+    if (avpkt->size < 8) {
+        av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+        return AVERROR_INVALIDDATA;
     }
 
     chunk_id = AV_RL16(avpkt->data + 4);
 
     if (chunk_id == HNM4_CHUNK_ID_PL) {
         hnm_update_palette(avctx, avpkt->data, avpkt->size);
-        frame->palette_has_changed = 1;
     } else if (chunk_id == HNM4_CHUNK_ID_IZ) {
+        if (avpkt->size < 12) {
+            av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+            return AVERROR_INVALIDDATA;
+        }
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+            return ret;
+
         unpack_intraframe(avctx, avpkt->data + 12, avpkt->size - 12);
         memcpy(hnm->previous, hnm->current, hnm->width * hnm->height);
         if (hnm->version == 0x4a)
@@ -383,6 +427,9 @@ static int hnm_decode_frame(AVCodecContext *avctx, void *data,
         memcpy(frame->data[1], hnm->palette, 256 * 4);
         *got_frame = 1;
     } else if (chunk_id == HNM4_CHUNK_ID_IU) {
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+            return ret;
+
         if (hnm->version == 0x4a) {
             decode_interframe_v4a(avctx, avpkt->data + 8, avpkt->size - 8);
             memcpy(hnm->processed, hnm->current, hnm->width * hnm->height);
@@ -427,7 +474,9 @@ static av_cold int hnm_decode_init(AVCodecContext *avctx)
     hnm->buffer2   = av_mallocz(avctx->width * avctx->height);
     hnm->processed = av_mallocz(avctx->width * avctx->height);
 
-    if (!hnm->buffer1 || !hnm->buffer2 || !hnm->processed) {
+    if (   !hnm->buffer1 || !hnm->buffer2 || !hnm->processed
+        || avctx->width * avctx->height == 0
+        || avctx->height % 2) {
         av_log(avctx, AV_LOG_ERROR, "av_mallocz() failed\n");
         av_freep(&hnm->buffer1);
         av_freep(&hnm->buffer2);
diff --git a/libavcodec/hpel_template.c b/libavcodec/hpel_template.c
index 81d3892e56..fccfe7610f 100644
--- a/libavcodec/hpel_template.c
+++ b/libavcodec/hpel_template.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
index 25694c5940..8e2fd8fcf5 100644
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -5,20 +5,20 @@
  *
  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -357,10 +357,14 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
 
     if (ARCH_AARCH64)
         ff_hpeldsp_init_aarch64(c, flags);
+    if (ARCH_ALPHA)
+        ff_hpeldsp_init_alpha(c, flags);
     if (ARCH_ARM)
         ff_hpeldsp_init_arm(c, flags);
     if (ARCH_PPC)
         ff_hpeldsp_init_ppc(c, flags);
     if (ARCH_X86)
         ff_hpeldsp_init_x86(c, flags);
+    if (ARCH_MIPS)
+        ff_hpeldsp_init_mips(c, flags);
 }
diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
index d037cba14e..1a3cea54b8 100644
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -95,8 +95,10 @@ typedef struct HpelDSPContext {
 void ff_hpeldsp_init(HpelDSPContext *c, int flags);
 
 void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags);
+void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);
+void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags);
 
 #endif /* AVCODEC_HPELDSP_H */
diff --git a/libavcodec/hq_hqa.c b/libavcodec/hq_hqa.c
index c63e5a8e55..3ef83d4eb4 100644
--- a/libavcodec/hq_hqa.c
+++ b/libavcodec/hq_hqa.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -121,7 +121,7 @@ static int hq_decode_frame(HQContext *ctx, AVFrame *pic,
     uint32_t slice_off[21];
     int slice, start_off, next_off, i, ret;
 
-    if (prof_num >= NUM_HQ_PROFILES) {
+    if ((unsigned)prof_num >= NUM_HQ_PROFILES) {
         profile = &ff_hq_profile[0];
         avpriv_request_sample(ctx->avctx, "HQ Profile %d", prof_num);
     } else {
@@ -137,10 +137,8 @@ static int hq_decode_frame(HQContext *ctx, AVFrame *pic,
     ctx->avctx->pix_fmt             = AV_PIX_FMT_YUV422P;
 
     ret = ff_get_buffer(ctx->avctx, pic, 0);
-    if (ret < 0) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+    if (ret < 0)
         return ret;
-    }
 
     /* Offsets are stored from CUV position, so adjust them accordingly. */
     for (i = 0; i < profile->num_slices + 1; i++)
@@ -267,10 +265,8 @@ static int hqa_decode_frame(HQContext *ctx, AVFrame *pic, size_t data_size)
     }
 
     ret = ff_get_buffer(ctx->avctx, pic, 0);
-    if (ret < 0) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+    if (ret < 0)
         return ret;
-    }
 
     /* Offsets are stored from HQA1 position, so adjust them accordingly. */
     for (i = 0; i < num_slices + 1; i++)
@@ -302,7 +298,8 @@ static int hq_hqa_decode_frame(AVCodecContext *avctx, void *data,
     AVFrame *pic = data;
     uint32_t info_tag;
     unsigned int data_size;
-    int tag, ret;
+    int ret;
+    unsigned tag;
 
     bytestream2_init(&ctx->gbc, avpkt->data, avpkt->size);
     if (bytestream2_get_bytes_left(&ctx->gbc) < 4 + 4) {
diff --git a/libavcodec/hq_hqa.h b/libavcodec/hq_hqa.h
index 6bd858ddf2..4286dd0752 100644
--- a/libavcodec/hq_hqa.h
+++ b/libavcodec/hq_hqa.h
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hq_hqadata.c b/libavcodec/hq_hqadata.c
index 23fefc11b0..ae9231aa02 100644
--- a/libavcodec/hq_hqadata.c
+++ b/libavcodec/hq_hqadata.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hq_hqadsp.c b/libavcodec/hq_hqadsp.c
index 93fc06709b..db1ea2e246 100644
--- a/libavcodec/hq_hqadsp.c
+++ b/libavcodec/hq_hqadsp.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hq_hqadsp.h b/libavcodec/hq_hqadsp.h
index 22b1e6145c..420ed92f76 100644
--- a/libavcodec/hq_hqadsp.h
+++ b/libavcodec/hq_hqadsp.h
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hqx.c b/libavcodec/hqx.c
index 34e36056c7..8060c7a31c 100644
--- a/libavcodec/hqx.c
+++ b/libavcodec/hqx.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQX decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -458,7 +458,7 @@ static int hqx_decode_frame(AVCodecContext *avctx, void *data,
     }
     ret = av_image_check_size(ctx->width, ctx->height, 0, avctx);
     if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid stored dimenstions %dx%d.\n",
+        av_log(avctx, AV_LOG_ERROR, "Invalid stored dimensions %dx%d.\n",
                ctx->width, ctx->height);
         return AVERROR_INVALIDDATA;
     }
@@ -492,10 +492,8 @@ static int hqx_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     ret = ff_get_buffer(avctx, ctx->pic, 0);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+    if (ret < 0)
         return ret;
-    }
 
     avctx->execute2(avctx, decode_slice_thread, NULL, NULL, 16);
 
diff --git a/libavcodec/hqx.h b/libavcodec/hqx.h
index 7f329712fd..42d382de1f 100644
--- a/libavcodec/hqx.h
+++ b/libavcodec/hqx.h
@@ -1,20 +1,20 @@
 /*
  * Canopus HQX decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hqxdsp.c b/libavcodec/hqxdsp.c
index 2a02299a29..feff9c0b68 100644
--- a/libavcodec/hqxdsp.c
+++ b/libavcodec/hqxdsp.c
@@ -1,20 +1,20 @@
 /*
  * HQX DSP routines
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -118,7 +118,7 @@ static void hqx_idct_put(uint16_t *dst, ptrdiff_t stride,
 
     for (i = 0; i < 8; i++) {
         for (j = 0; j < 8; j++) {
-            int v = av_clip(block[j + i * 8] + 0x800, 0, 0xFFF);
+            int v = av_clip_uintp2(block[j + i * 8] + 0x800, 12);
             dst[j] = (v << 4) | (v >> 8);
         }
         dst += stride >> 1;
diff --git a/libavcodec/hqxdsp.h b/libavcodec/hqxdsp.h
index 2cd2a8e9a4..39ab3e2f38 100644
--- a/libavcodec/hqxdsp.h
+++ b/libavcodec/hqxdsp.h
@@ -1,20 +1,20 @@
 /*
  * HQX DSP routines
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hqxvlc.c b/libavcodec/hqxvlc.c
index d185e861da..06a8073661 100644
--- a/libavcodec/hqxvlc.c
+++ b/libavcodec/hqxvlc.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQX decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/htmlsubtitles.c b/libavcodec/htmlsubtitles.c
new file mode 100644
index 0000000000..a2cd40fad3
--- /dev/null
+++ b/libavcodec/htmlsubtitles.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/parseutils.h"
+#include "htmlsubtitles.h"
+
+static int html_color_parse(void *log_ctx, const char *str)
+{
+    uint8_t rgba[4];
+    if (av_parse_color(rgba, str, strcspn(str, "\" >"), log_ctx) < 0)
+        return -1;
+    return rgba[0] | rgba[1] << 8 | rgba[2] << 16;
+}
+
+enum {
+    PARAM_UNKNOWN = -1,
+    PARAM_SIZE,
+    PARAM_COLOR,
+    PARAM_FACE,
+    PARAM_NUMBER
+};
+
+typedef struct SrtStack {
+    char tag[128];
+    char param[PARAM_NUMBER][128];
+} SrtStack;
+
+static void rstrip_spaces_buf(AVBPrint *buf)
+{
+    while (buf->len > 0 && buf->str[buf->len - 1] == ' ')
+        buf->str[--buf->len] = 0;
+}
+
+void ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in)
+{
+    char *param, buffer[128], tmp[128];
+    int len, tag_close, sptr = 1, line_start = 1, an = 0, end = 0;
+    SrtStack stack[16];
+
+    stack[0].tag[0] = 0;
+    strcpy(stack[0].param[PARAM_SIZE],  "{\\fs}");
+    strcpy(stack[0].param[PARAM_COLOR], "{\\c}");
+    strcpy(stack[0].param[PARAM_FACE],  "{\\fn}");
+
+    for (; !end && *in; in++) {
+        switch (*in) {
+        case '\r':
+            break;
+        case '\n':
+            if (line_start) {
+                end = 1;
+                break;
+            }
+            rstrip_spaces_buf(dst);
+            av_bprintf(dst, "\\N");
+            line_start = 1;
+            break;
+        case ' ':
+            if (!line_start)
+                av_bprint_chars(dst, *in, 1);
+            break;
+        case '{':    /* skip all {\xxx} substrings except for {\an%d}
+                        and all microdvd like styles such as {Y:xxx} */
+            len = 0;
+            an += sscanf(in, "{\\an%*1u}%n", &len) >= 0 && len > 0;
+            if ((an != 1 && (len = 0, sscanf(in, "{\\%*[^}]}%n", &len) >= 0 && len > 0)) ||
+                (len = 0, sscanf(in, "{%*1[CcFfoPSsYy]:%*[^}]}%n", &len) >= 0 && len > 0)) {
+                in += len - 1;
+            } else
+                av_bprint_chars(dst, *in, 1);
+            break;
+        case '<':
+            tag_close = in[1] == '/';
+            len = 0;
+            if (sscanf(in+tag_close+1, "%127[^>]>%n", buffer, &len) >= 1 && len > 0) {
+                const char *tagname = buffer;
+                while (*tagname == ' ')
+                    tagname++;
+                if ((param = strchr(tagname, ' ')))
+                    *param++ = 0;
+                if ((!tag_close && sptr < FF_ARRAY_ELEMS(stack)) ||
+                    ( tag_close && sptr > 0 && !strcmp(stack[sptr-1].tag, tagname))) {
+                    int i, j, unknown = 0;
+                    in += len + tag_close;
+                    if (!tag_close)
+                        memset(stack+sptr, 0, sizeof(*stack));
+                    if (!strcmp(tagname, "font")) {
+                        if (tag_close) {
+                            for (i=PARAM_NUMBER-1; i>=0; i--)
+                                if (stack[sptr-1].param[i][0])
+                                    for (j=sptr-2; j>=0; j--)
+                                        if (stack[j].param[i][0]) {
+                                            av_bprintf(dst, "%s", stack[j].param[i]);
+                                            break;
+                                        }
+                        } else {
+                            while (param) {
+                                if (!strncmp(param, "size=", 5)) {
+                                    unsigned font_size;
+                                    param += 5 + (param[5] == '"');
+                                    if (sscanf(param, "%u", &font_size) == 1) {
+                                        snprintf(stack[sptr].param[PARAM_SIZE],
+                                             sizeof(stack[0].param[PARAM_SIZE]),
+                                             "{\\fs%u}", font_size);
+                                    }
+                                } else if (!strncmp(param, "color=", 6)) {
+                                    param += 6 + (param[6] == '"');
+                                    snprintf(stack[sptr].param[PARAM_COLOR],
+                                         sizeof(stack[0].param[PARAM_COLOR]),
+                                         "{\\c&H%X&}",
+                                         html_color_parse(log_ctx, param));
+                                } else if (!strncmp(param, "face=", 5)) {
+                                    param += 5 + (param[5] == '"');
+                                    len = strcspn(param,
+                                                  param[-1] == '"' ? "\"" :" ");
+                                    av_strlcpy(tmp, param,
+                                               FFMIN(sizeof(tmp), len+1));
+                                    param += len;
+                                    snprintf(stack[sptr].param[PARAM_FACE],
+                                             sizeof(stack[0].param[PARAM_FACE]),
+                                             "{\\fn%s}", tmp);
+                                }
+                                if ((param = strchr(param, ' ')))
+                                    param++;
+                            }
+                            for (i=0; i<PARAM_NUMBER; i++)
+                                if (stack[sptr].param[i][0])
+                                    av_bprintf(dst, "%s", stack[sptr].param[i]);
+                        }
+                    } else if (!tagname[1] && strspn(tagname, "bisu") == 1) {
+                        av_bprintf(dst, "{\\%c%d}", tagname[0], !tag_close);
+                    } else {
+                        unknown = 1;
+                        snprintf(tmp, sizeof(tmp), "</%s>", tagname);
+                    }
+                    if (tag_close) {
+                        sptr--;
+                    } else if (unknown && !strstr(in, tmp)) {
+                        in -= len + tag_close;
+                        av_bprint_chars(dst, *in, 1);
+                    } else
+                        av_strlcpy(stack[sptr++].tag, tagname,
+                                   sizeof(stack[0].tag));
+                    break;
+                }
+            }
+        default:
+            av_bprint_chars(dst, *in, 1);
+            break;
+        }
+        if (*in != ' ' && *in != '\r' && *in != '\n')
+            line_start = 0;
+    }
+
+    while (dst->len >= 2 && !strncmp(&dst->str[dst->len - 2], "\\N", 2))
+        dst->len -= 2;
+    dst->str[dst->len] = 0;
+    rstrip_spaces_buf(dst);
+}
diff --git a/libavcodec/htmlsubtitles.h b/libavcodec/htmlsubtitles.h
new file mode 100644
index 0000000000..e10cdda241
--- /dev/null
+++ b/libavcodec/htmlsubtitles.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HTMLSUBTITLES_H
+#define AVCODEC_HTMLSUBTITLES_H
+
+#include "libavutil/bprint.h"
+
+void ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in);
+
+#endif /* AVCODEC_HTMLSUBTITLES_H */
diff --git a/libavcodec/huffman.c b/libavcodec/huffman.c
index f13ab296a5..c771bcfe08 100644
--- a/libavcodec/huffman.c
+++ b/libavcodec/huffman.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2006 Konstantin Shishkov
  * Copyright (c) 2007 Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,18 +52,31 @@ static void heap_sift(HeapElem *h, int root, int size)
     }
 }
 
-void ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats)
+int ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats, int stats_size, int skip0)
 {
-    HeapElem h[256];
-    int up[2*256];
-    int len[2*256];
+    HeapElem *h  = av_malloc_array(sizeof(*h), stats_size);
+    int *up      = av_malloc_array(sizeof(*up) * 2, stats_size);
+    uint8_t *len = av_malloc_array(sizeof(*len) * 2, stats_size);
+    uint16_t *map= av_malloc_array(sizeof(*map), stats_size);
     int offset, i, next;
-    int size = 256;
+    int size = 0;
+    int ret = 0;
+
+    if (!h || !up || !len || !map) {
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+
+    for (i = 0; i<stats_size; i++) {
+        dst[i] = 255;
+        if (stats[i] || !skip0)
+            map[size++] = i;
+    }
 
     for (offset = 1; ; offset <<= 1) {
         for (i=0; i < size; i++) {
             h[i].name = i;
-            h[i].val = (stats[i] << 8) + offset;
+            h[i].val = (stats[map[i]] << 14) + offset;
         }
         for (i = size / 2 - 1; i >= 0; i--)
             heap_sift(h, i, size);
@@ -84,11 +97,17 @@ void ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats)
         for (i = 2 * size - 3; i >= size; i--)
             len[i] = len[up[i]] + 1;
         for (i = 0; i < size; i++) {
-            dst[i] = len[up[i]] + 1;
-            if (dst[i] >= 32) break;
+            dst[map[i]] = len[up[i]] + 1;
+            if (dst[map[i]] >= 32) break;
         }
         if (i==size) break;
     }
+end:
+    av_free(h);
+    av_free(up);
+    av_free(len);
+    av_free(map);
+    return ret;
 }
 
 static void get_tree_codes(uint32_t *bits, int16_t *lens, uint8_t *xlat,
@@ -155,18 +174,19 @@ int ff_huff_build_tree(AVCodecContext *avctx, VLC *vlc, int nb_codes, int nb_bit
     cur_node = nb_codes;
     nodes[nb_codes*2-1].count = 0;
     for (i = 0; i < nb_codes * 2 - 1; i += 2) {
-        nodes[cur_node].sym = HNODE;
-        nodes[cur_node].count = nodes[i].count + nodes[i + 1].count;
-        nodes[cur_node].n0 = i;
-        for (j = cur_node; j > 0; j--) {
-            if (nodes[j].count > nodes[j - 1].count ||
-                (nodes[j].count == nodes[j - 1].count &&
-                 (!(flags & FF_HUFFMAN_FLAG_HNODE_FIRST) ||
-                  nodes[j].n0 == j - 1 || nodes[j].n0 == j - 2 ||
-                  (nodes[j].sym!=HNODE && nodes[j-1].sym!=HNODE))))
+        uint32_t cur_count = nodes[i].count + nodes[i+1].count;
+        // find correct place to insert new node, and
+        // make space for the new node while at it
+        for(j = cur_node; j > i + 2; j--){
+            if(cur_count > nodes[j-1].count ||
+               (cur_count == nodes[j-1].count &&
+                !(flags & FF_HUFFMAN_FLAG_HNODE_FIRST)))
                 break;
-            FFSWAP(Node, nodes[j], nodes[j - 1]);
+            nodes[j] = nodes[j - 1];
         }
+        nodes[j].sym = HNODE;
+        nodes[j].count = cur_count;
+        nodes[j].n0 = i;
         cur_node++;
     }
     if (build_huff_tree(vlc, nodes, nb_codes * 2 - 2, flags, nb_bits) < 0) {
diff --git a/libavcodec/huffman.h b/libavcodec/huffman.h
index c9eeb3736b..6ab23ae216 100644
--- a/libavcodec/huffman.h
+++ b/libavcodec/huffman.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2007  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,6 +43,6 @@ typedef int (*HuffCmp)(const void *va, const void *vb);
 int ff_huff_build_tree(AVCodecContext *avctx, VLC *vlc, int nb_codes, int nb_bits,
                        Node *nodes, HuffCmp cmp, int flags);
 
-void ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats);
+int ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats, int n, int skip0);
 
 #endif /* AVCODEC_HUFFMAN_H */
diff --git a/libavcodec/huffyuv.c b/libavcodec/huffyuv.c
index da5c52f9a6..492155550f 100644
--- a/libavcodec/huffyuv.c
+++ b/libavcodec/huffyuv.c
@@ -1,25 +1,25 @@
 /*
  * huffyuv codec for libavcodec
  *
- * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2014 Michael Niedermayer <michaelni@gmx.at>
  *
  * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
  * the algorithm used
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,13 +36,13 @@
 #include "bswapdsp.h"
 #include "huffyuv.h"
 
-int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table)
+int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table, int n)
 {
     int len, index;
     uint32_t bits = 0;
 
     for (len = 32; len > 0; len--) {
-        for (index = 0; index < 256; index++) {
+        for (index = 0; index < n; index++) {
             if (len_table[index] == len)
                 dst[index] = bits++;
         }
@@ -59,16 +59,11 @@ av_cold int ff_huffyuv_alloc_temp(HYuvContext *s)
 {
     int i;
 
-    if (s->bitstream_bpp<24) {
-        for (i=0; i<3; i++) {
-            s->temp[i]= av_malloc(s->width + 16);
-            if (!s->temp[i])
-                return AVERROR(ENOMEM);
-        }
-    } else {
-        s->temp[0]= av_mallocz(4*s->width + 16);
-        if (!s->temp[0])
+    for (i=0; i<3; i++) {
+        s->temp[i]= av_malloc(4*s->width + 16);
+        if (!s->temp[i])
             return AVERROR(ENOMEM);
+        s->temp16[i] = (uint16_t*)s->temp[i];
     }
     return 0;
 }
@@ -81,17 +76,20 @@ av_cold void ff_huffyuv_common_init(AVCodecContext *avctx)
     s->flags = avctx->flags;
 
     ff_bswapdsp_init(&s->bdsp);
+    ff_llviddsp_init(&s->llviddsp, avctx);
 
     s->width = avctx->width;
     s->height = avctx->height;
-    assert(s->width>0 && s->height>0);
+
+    av_assert1(s->width > 0 && s->height > 0);
 }
 
-void ff_huffyuv_common_end(HYuvContext *s)
+av_cold void ff_huffyuv_common_end(HYuvContext *s)
 {
     int i;
 
     for(i = 0; i < 3; i++) {
         av_freep(&s->temp[i]);
+        s->temp16[i] = NULL;
     }
 }
diff --git a/libavcodec/huffyuv.h b/libavcodec/huffyuv.h
index aed153769a..c18247ed0f 100644
--- a/libavcodec/huffyuv.h
+++ b/libavcodec/huffyuv.h
@@ -1,23 +1,23 @@
 /*
- * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2014 Michael Niedermayer <michaelni@gmx.at>
  *
  * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
  * the algorithm used
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,20 +37,13 @@
 #include "huffyuvdsp.h"
 #include "huffyuvencdsp.h"
 #include "put_bits.h"
+#include "lossless_videodsp.h"
 
-#define VLC_BITS 11
+#define VLC_BITS 12
 
-#if HAVE_BIGENDIAN
-#define B 3
-#define G 2
-#define R 1
-#define A 0
-#else
-#define B 0
-#define G 1
-#define R 2
-#define A 3
-#endif
+#define MAX_BITS 16
+#define MAX_N (1<<MAX_BITS)
+#define MAX_VLC_N 16384
 
 typedef enum Predictor {
     LEFT = 0,
@@ -59,6 +52,7 @@ typedef enum Predictor {
 } Predictor;
 
 typedef struct HYuvContext {
+    AVClass *class;
     AVCodecContext *avctx;
     Predictor predictor;
     GetBitContext gb;
@@ -69,27 +63,38 @@ typedef struct HYuvContext {
     int version;
     int yuy2;                               //use yuy2 instead of 422P
     int bgr32;                              //use bgr32 instead of bgr24
+    int bps;
+    int n;                                  // 1<<bps
+    int vlc_n;                              // number of vlc codes (FFMIN(1<<bps, MAX_VLC_N))
+    int alpha;
+    int chroma;
+    int yuv;
+    int chroma_h_shift;
+    int chroma_v_shift;
     int width, height;
     int flags;
     int context;
     int picture_number;
     int last_slice_end;
     uint8_t *temp[3];
-    uint64_t stats[3][256];
-    uint8_t len[3][256];
-    uint32_t bits[3][256];
+    uint16_t *temp16[3];                    ///< identical to temp but 16bit type
+    uint64_t stats[4][MAX_VLC_N];
+    uint8_t len[4][MAX_VLC_N];
+    uint32_t bits[4][MAX_VLC_N];
     uint32_t pix_bgr_map[1<<VLC_BITS];
-    VLC vlc[6];                             //Y,U,V,YY,YU,YV
+    VLC vlc[8];                             //Y,U,V,A,YY,YU,YV,AA
     uint8_t *bitstream_buffer;
     unsigned int bitstream_buffer_size;
     BswapDSPContext bdsp;
     HuffYUVDSPContext hdsp;
     HuffYUVEncDSPContext hencdsp;
+    LLVidDSPContext llviddsp;
+    int non_determ; // non-deterministic, multi-threaded encoder allowed
 } HYuvContext;
 
 void ff_huffyuv_common_init(AVCodecContext *s);
 void ff_huffyuv_common_end(HYuvContext *s);
 int  ff_huffyuv_alloc_temp(HYuvContext *s);
-int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table);
+int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table, int n);
 
 #endif /* AVCODEC_HUFFYUV_H */
diff --git a/libavcodec/huffyuvdec.c b/libavcodec/huffyuvdec.c
index 12eca26cff..7314519fca 100644
--- a/libavcodec/huffyuvdec.c
+++ b/libavcodec/huffyuvdec.c
@@ -1,26 +1,28 @@
 /*
  * huffyuv decoder
  *
- * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2014 Michael Niedermayer <michaelni@gmx.at>
  *
  * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
  * the algorithm used
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * yuva, gray, 4:4:4, 4:1:1, 4:1:0 and >8 bit per sample support sponsored by NOA
  */
 
 /**
@@ -28,17 +30,22 @@
  * huffyuv decoder
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "avcodec.h"
 #include "get_bits.h"
 #include "huffyuv.h"
 #include "huffyuvdsp.h"
 #include "thread.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
 
 #define classic_shift_luma_table_size 42
 static const unsigned char classic_shift_luma[classic_shift_luma_table_size + AV_INPUT_BUFFER_PADDING_SIZE] = {
     34, 36, 35, 69, 135, 232,   9, 16, 10, 24,  11,  23,  12,  16, 13, 10,
     14,  8, 15,  8,  16,   8,  17, 20, 16, 10, 207, 206, 205, 236, 11,  8,
-    10, 21,  9, 23,   8,   8, 199, 70, 69, 68,   0
+    10, 21,  9, 23,   8,   8, 199, 70, 69, 68,   0,
+  0,0,0,0,0,0,0,0,
 };
 
 #define classic_shift_chroma_table_size 59
@@ -46,7 +53,8 @@ static const unsigned char classic_shift_chroma[classic_shift_chroma_table_size
     66, 36,  37,  38, 39, 40,  41,  75,  76,  77, 110, 239, 144, 81, 82,  83,
     84, 85, 118, 183, 56, 57,  88,  89,  56,  89, 154,  57,  58, 57, 26, 141,
     57, 56,  58,  57, 58, 57, 184, 119, 214, 245, 116,  83,  82, 49, 80,  79,
-    78, 77,  44,  75, 41, 40,  39,  38,  37,  36,  34,  0
+    78, 77,  44,  75, 41, 40,  39,  38,  37,  36,  34,  0,
+  0,0,0,0,0,0,0,0,
 };
 
 static const unsigned char classic_add_luma[256] = {
@@ -87,16 +95,16 @@ static const unsigned char classic_add_chroma[256] = {
       6,  12,   8,  10,   7,   9,   6,   4,   6,   2,   2,   3,   3,   3,   3,   2,
 };
 
-static int read_len_table(uint8_t *dst, GetBitContext *gb)
+static int read_len_table(uint8_t *dst, GetBitContext *gb, int n)
 {
     int i, val, repeat;
 
-    for (i = 0; i < 256;) {
+    for (i = 0; i < n;) {
         repeat = get_bits(gb, 3);
         val    = get_bits(gb, 5);
         if (repeat == 0)
             repeat = get_bits(gb, 8);
-        if (i + repeat > 256 || get_bits_left(gb) < 0) {
+        if (i + repeat > n || get_bits_left(gb) < 0) {
             av_log(NULL, AV_LOG_ERROR, "Error reading huffman table\n");
             return AVERROR_INVALIDDATA;
         }
@@ -108,34 +116,43 @@ static int read_len_table(uint8_t *dst, GetBitContext *gb)
 
 static int generate_joint_tables(HYuvContext *s)
 {
-    uint16_t symbols[1 << VLC_BITS];
-    uint16_t bits[1 << VLC_BITS];
-    uint8_t len[1 << VLC_BITS];
     int ret;
+    uint16_t *symbols = av_mallocz(5 << VLC_BITS);
+    uint16_t *bits;
+    uint8_t *len;
+    if (!symbols)
+        return AVERROR(ENOMEM);
+    bits = symbols + (1 << VLC_BITS);
+    len = (uint8_t *)(bits + (1 << VLC_BITS));
 
-    if (s->bitstream_bpp < 24) {
+    if (s->bitstream_bpp < 24 || s->version > 2) {
         int p, i, y, u;
-        for (p = 0; p < 3; p++) {
-            for (i = y = 0; y < 256; y++) {
-                int len0  = s->len[0][y];
+        for (p = 0; p < 4; p++) {
+            int p0 = s->version > 2 ? p : 0;
+            for (i = y = 0; y < s->vlc_n; y++) {
+                int len0  = s->len[p0][y];
                 int limit = VLC_BITS - len0;
-                if (limit <= 0)
+                if (limit <= 0 || !len0)
+                    continue;
+                if ((sign_extend(y, 8) & (s->vlc_n-1)) != y)
                     continue;
-                for (u = 0; u < 256; u++) {
+                for (u = 0; u < s->vlc_n; u++) {
                     int len1 = s->len[p][u];
-                    if (len1 > limit)
+                    if (len1 > limit || !len1)
                         continue;
+                    if ((sign_extend(u, 8) & (s->vlc_n-1)) != u)
+                        continue;
+                    av_assert0(i < (1 << VLC_BITS));
                     len[i]     = len0 + len1;
-                    bits[i]    = (s->bits[0][y] << len1) + s->bits[p][u];
-                    symbols[i] = (y << 8) + u;
-                    if (symbols[i] != 0xffff) // reserved to mean "invalid"
+                    bits[i]    = (s->bits[p0][y] << len1) + s->bits[p][u];
+                    symbols[i] = (y << 8) + (u & 0xFF);
                         i++;
                 }
             }
-            ff_free_vlc(&s->vlc[3 + p]);
-            if ((ret = ff_init_vlc_sparse(&s->vlc[3 + p], VLC_BITS, i, len, 1, 1,
+            ff_free_vlc(&s->vlc[4 + p]);
+            if ((ret = ff_init_vlc_sparse(&s->vlc[4 + p], VLC_BITS, i, len, 1, 1,
                                           bits, 2, 2, symbols, 2, 2, 0)) < 0)
-                return ret;
+                goto out;
         }
     } else {
         uint8_t (*map)[4] = (uint8_t(*)[4]) s->pix_bgr_map;
@@ -148,18 +165,19 @@ static int generate_joint_tables(HYuvContext *s)
         for (i = 0, g = -16; g < 16; g++) {
             int len0   = s->len[p0][g & 255];
             int limit0 = VLC_BITS - len0;
-            if (limit0 < 2)
+            if (limit0 < 2 || !len0)
                 continue;
             for (b = -16; b < 16; b++) {
                 int len1   = s->len[p1][b & 255];
                 int limit1 = limit0 - len1;
-                if (limit1 < 1)
+                if (limit1 < 1 || !len1)
                     continue;
                 code = (s->bits[p0][g & 255] << len1) + s->bits[p1][b & 255];
                 for (r = -16; r < 16; r++) {
                     int len2 = s->len[2][r & 255];
-                    if (len2 > limit1)
+                    if (len2 > limit1 || !len2)
                         continue;
+                    av_assert0(i < (1 << VLC_BITS));
                     len[i]  = len0 + len1 + len2;
                     bits[i] = (code << len2) + s->bits[2][r & 255];
                     if (s->decorrelate) {
@@ -175,30 +193,37 @@ static int generate_joint_tables(HYuvContext *s)
                 }
             }
         }
-        ff_free_vlc(&s->vlc[3]);
-        if ((ret = init_vlc(&s->vlc[3], VLC_BITS, i, len, 1, 1,
+        ff_free_vlc(&s->vlc[4]);
+        if ((ret = init_vlc(&s->vlc[4], VLC_BITS, i, len, 1, 1,
                             bits, 2, 2, 0)) < 0)
-            return ret;
+            goto out;
     }
-    return 0;
+    ret = 0;
+out:
+    av_freep(&symbols);
+    return ret;
 }
 
 static int read_huffman_tables(HYuvContext *s, const uint8_t *src, int length)
 {
     GetBitContext gb;
     int i, ret;
+    int count = 3;
 
     if ((ret = init_get_bits(&gb, src, length * 8)) < 0)
         return ret;
 
-    for (i = 0; i < 3; i++) {
-        if ((ret = read_len_table(s->len[i], &gb)) < 0)
+    if (s->version > 2)
+        count = 1 + s->alpha + 2*s->chroma;
+
+    for (i = 0; i < count; i++) {
+        if ((ret = read_len_table(s->len[i], &gb, s->vlc_n)) < 0)
             return ret;
-        if ((ret = ff_huffyuv_generate_bits_table(s->bits[i], s->len[i])) < 0)
+        if ((ret = ff_huffyuv_generate_bits_table(s->bits[i], s->len[i], s->vlc_n)) < 0)
             return ret;
         ff_free_vlc(&s->vlc[i]);
-        if ((ret = init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1,
-                            s->bits[i], 4, 4, 0)) < 0)
+        if ((ret = init_vlc(&s->vlc[i], VLC_BITS, s->vlc_n, s->len[i], 1, 1,
+                           s->bits[i], 4, 4, 0)) < 0)
             return ret;
     }
 
@@ -213,16 +238,14 @@ static int read_old_huffman_tables(HYuvContext *s)
     GetBitContext gb;
     int i, ret;
 
-    if ((ret = init_get_bits(&gb, classic_shift_luma,
-                             classic_shift_luma_table_size * 8)) < 0)
-        return ret;
-    if ((ret = read_len_table(s->len[0], &gb)) < 0)
+    init_get_bits(&gb, classic_shift_luma,
+                  classic_shift_luma_table_size * 8);
+    if ((ret = read_len_table(s->len[0], &gb, 256)) < 0)
         return ret;
 
-    if ((ret = init_get_bits(&gb, classic_shift_chroma,
-                             classic_shift_chroma_table_size * 8)) < 0)
-        return ret;
-    if ((ret = read_len_table(s->len[1], &gb)) < 0)
+    init_get_bits(&gb, classic_shift_chroma,
+                  classic_shift_chroma_table_size * 8);
+    if ((ret = read_len_table(s->len[1], &gb, 256)) < 0)
         return ret;
 
     for (i = 0; i < 256; i++)
@@ -237,7 +260,7 @@ static int read_old_huffman_tables(HYuvContext *s)
     memcpy(s->bits[2], s->bits[1], 256 * sizeof(uint32_t));
     memcpy(s->len[2], s->len[1], 256 * sizeof(uint8_t));
 
-    for (i = 0; i < 3; i++) {
+    for (i = 0; i < 4; i++) {
         ff_free_vlc(&s->vlc[i]);
         if ((ret = init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1,
                             s->bits[i], 4, 4, 0)) < 0)
@@ -250,28 +273,51 @@ static int read_old_huffman_tables(HYuvContext *s)
     return 0;
 }
 
+static av_cold int decode_end(AVCodecContext *avctx)
+{
+    HYuvContext *s = avctx->priv_data;
+    int i;
+
+    ff_huffyuv_common_end(s);
+    av_freep(&s->bitstream_buffer);
+
+    for (i = 0; i < 8; i++)
+        ff_free_vlc(&s->vlc[i]);
+
+    return 0;
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     HYuvContext *s = avctx->priv_data;
     int ret;
 
-    ff_huffyuv_common_init(avctx);
+    ret = av_image_check_size(avctx->width, avctx->height, 0, avctx);
+    if (ret < 0)
+        return ret;
+
     ff_huffyuvdsp_init(&s->hdsp);
-    memset(s->vlc, 0, 3 * sizeof(VLC));
+    memset(s->vlc, 0, 4 * sizeof(VLC));
 
-    s->interlaced = s->height > 288;
+    s->interlaced = avctx->height > 288;
     s->bgr32      = 1;
 
     if (avctx->extradata_size) {
         if ((avctx->bits_per_coded_sample & 7) &&
             avctx->bits_per_coded_sample != 12)
             s->version = 1; // do such files exist at all?
-        else
+        else if (avctx->extradata_size > 3 && avctx->extradata[3] == 0)
             s->version = 2;
+        else
+            s->version = 3;
     } else
         s->version = 0;
 
-    if (s->version == 2) {
+    s->bps = 8;
+    s->n = 1<<s->bps;
+    s->vlc_n = FFMIN(s->n, MAX_VLC_N);
+    s->chroma = 1;
+    if (s->version >= 2) {
         int method, interlace;
 
         if (avctx->extradata_size < 4)
@@ -280,16 +326,27 @@ static av_cold int decode_init(AVCodecContext *avctx)
         method           = avctx->extradata[0];
         s->decorrelate   = method & 64 ? 1 : 0;
         s->predictor     = method & 63;
-        s->bitstream_bpp = avctx->extradata[1];
-        if (s->bitstream_bpp == 0)
-            s->bitstream_bpp = avctx->bits_per_coded_sample & ~7;
+        if (s->version == 2) {
+            s->bitstream_bpp = avctx->extradata[1];
+            if (s->bitstream_bpp == 0)
+                s->bitstream_bpp = avctx->bits_per_coded_sample & ~7;
+        } else {
+            s->bps = (avctx->extradata[1] >> 4) + 1;
+            s->n = 1<<s->bps;
+            s->vlc_n = FFMIN(s->n, MAX_VLC_N);
+            s->chroma_h_shift = avctx->extradata[1] & 3;
+            s->chroma_v_shift = (avctx->extradata[1] >> 2) & 3;
+            s->yuv   = !!(avctx->extradata[2] & 1);
+            s->chroma= !!(avctx->extradata[2] & 3);
+            s->alpha = !!(avctx->extradata[2] & 4);
+        }
         interlace     = (avctx->extradata[2] & 0x30) >> 4;
         s->interlaced = (interlace == 1) ? 1 : (interlace == 2) ? 0 : s->interlaced;
         s->context    = avctx->extradata[2] & 0x40 ? 1 : 0;
 
         if ((ret = read_huffman_tables(s, avctx->extradata + 4,
                                        avctx->extradata_size - 4)) < 0)
-            return ret;
+            goto error;
     } else {
         switch (avctx->bits_per_coded_sample & 7) {
         case 1:
@@ -317,55 +374,218 @@ static av_cold int decode_init(AVCodecContext *avctx)
         s->context       = 0;
 
         if ((ret = read_old_huffman_tables(s)) < 0)
-            return ret;
+            goto error;
     }
 
-    switch (s->bitstream_bpp) {
-    case 12:
-        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-        break;
-    case 16:
-        if (s->yuy2)
-            avctx->pix_fmt = AV_PIX_FMT_YUYV422;
-        else
-            avctx->pix_fmt = AV_PIX_FMT_YUV422P;
-        break;
-    case 24:
-    case 32:
-        if (s->bgr32)
+    if (s->version <= 2) {
+        switch (s->bitstream_bpp) {
+        case 12:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+            s->yuv = 1;
+            break;
+        case 16:
+            if (s->yuy2)
+                avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+            else
+                avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+            s->yuv = 1;
+            break;
+        case 24:
+            if (s->bgr32)
+                avctx->pix_fmt = AV_PIX_FMT_0RGB32;
+            else
+                avctx->pix_fmt = AV_PIX_FMT_BGR24;
+            break;
+        case 32:
+            av_assert0(s->bgr32);
             avctx->pix_fmt = AV_PIX_FMT_RGB32;
-        else
-            avctx->pix_fmt = AV_PIX_FMT_BGR24;
-        break;
-    default:
-        return AVERROR_INVALIDDATA;
+            s->alpha = 1;
+            break;
+        default:
+            ret = AVERROR_INVALIDDATA;
+            goto error;
+        }
+        av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt,
+                                         &s->chroma_h_shift,
+                                         &s->chroma_v_shift);
+    } else {
+        switch ( (s->chroma<<10) | (s->yuv<<9) | (s->alpha<<8) | ((s->bps-1)<<4) | s->chroma_h_shift | (s->chroma_v_shift<<2)) {
+        case 0x070:
+            avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+            break;
+        case 0x0F0:
+            avctx->pix_fmt = AV_PIX_FMT_GRAY16;
+            break;
+        case 0x170:
+            avctx->pix_fmt = AV_PIX_FMT_GRAY8A;
+            break;
+        case 0x470:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            break;
+        case 0x480:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP9;
+            break;
+        case 0x490:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+            break;
+        case 0x4B0:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP12;
+            break;
+        case 0x4D0:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP14;
+            break;
+        case 0x4F0:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP16;
+            break;
+        case 0x570:
+            avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+            break;
+        case 0x670:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+            break;
+        case 0x680:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P9;
+            break;
+        case 0x690:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
+            break;
+        case 0x6B0:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P12;
+            break;
+        case 0x6D0:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P14;
+            break;
+        case 0x6F0:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P16;
+            break;
+        case 0x671:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+            break;
+        case 0x681:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P9;
+            break;
+        case 0x691:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+            break;
+        case 0x6B1:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P12;
+            break;
+        case 0x6D1:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P14;
+            break;
+        case 0x6F1:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P16;
+            break;
+        case 0x672:
+            avctx->pix_fmt = AV_PIX_FMT_YUV411P;
+            break;
+        case 0x674:
+            avctx->pix_fmt = AV_PIX_FMT_YUV440P;
+            break;
+        case 0x675:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+            break;
+        case 0x685:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P9;
+            break;
+        case 0x695:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P10;
+            break;
+        case 0x6B5:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P12;
+            break;
+        case 0x6D5:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P14;
+            break;
+        case 0x6F5:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P16;
+            break;
+        case 0x67A:
+            avctx->pix_fmt = AV_PIX_FMT_YUV410P;
+            break;
+        case 0x770:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+            break;
+        case 0x780:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA444P9;
+            break;
+        case 0x790:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA444P10;
+            break;
+        case 0x7F0:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA444P16;
+            break;
+        case 0x771:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
+            break;
+        case 0x781:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA422P9;
+            break;
+        case 0x791:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA422P10;
+            break;
+        case 0x7F1:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA422P16;
+            break;
+        case 0x775:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
+            break;
+        case 0x785:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA420P9;
+            break;
+        case 0x795:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA420P10;
+            break;
+        case 0x7F5:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA420P16;
+            break;
+        default:
+            ret = AVERROR_INVALIDDATA;
+            goto error;
+        }
     }
 
+    ff_huffyuv_common_init(avctx);
+
+    if ((avctx->pix_fmt == AV_PIX_FMT_YUV422P || avctx->pix_fmt == AV_PIX_FMT_YUV420P) && avctx->width & 1) {
+        av_log(avctx, AV_LOG_ERROR, "width must be even for this colorspace\n");
+        ret = AVERROR_INVALIDDATA;
+        goto error;
+    }
     if (s->predictor == MEDIAN && avctx->pix_fmt == AV_PIX_FMT_YUV422P &&
         avctx->width % 4) {
-        av_log(avctx, AV_LOG_ERROR, "width must be multiple of 4 "
+        av_log(avctx, AV_LOG_ERROR, "width must be a multiple of 4 "
                "for this combination of colorspace and predictor type.\n");
-        return AVERROR_INVALIDDATA;
+        ret = AVERROR_INVALIDDATA;
+        goto error;
     }
 
-    if ((ret = ff_huffyuv_alloc_temp(s)) < 0)
-        return ret;
+    if ((ret = ff_huffyuv_alloc_temp(s)) < 0) {
+        ff_huffyuv_common_end(s);
+        goto error;
+    }
 
     return 0;
+  error:
+    decode_end(avctx);
+    return ret;
 }
 
+#if HAVE_THREADS
 static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
 {
     HYuvContext *s = avctx->priv_data;
     int i, ret;
 
-    if ((ret = ff_huffyuv_alloc_temp(s)) < 0)
+    if ((ret = ff_huffyuv_alloc_temp(s)) < 0) {
+        ff_huffyuv_common_end(s);
         return ret;
+    }
 
-    for (i = 0; i < 6; i++)
+    for (i = 0; i < 8; i++)
         s->vlc[i].table = NULL;
 
-    if (s->version == 2) {
+    if (s->version >= 2) {
         if ((ret = read_huffman_tables(s, avctx->extradata + 4,
                                        avctx->extradata_size)) < 0)
             return ret;
@@ -376,49 +596,174 @@ static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
+
+/** Subset of GET_VLC for use in hand-roller VLC code */
+#define VLC_INTERN(dst, table, gb, name, bits, max_depth)   \
+    code = table[index][0];                                 \
+    n    = table[index][1];                                 \
+    if (max_depth > 1 && n < 0) {                           \
+        LAST_SKIP_BITS(name, gb, bits);                     \
+        UPDATE_CACHE(name, gb);                             \
+                                                            \
+        nb_bits = -n;                                       \
+        index   = SHOW_UBITS(name, gb, nb_bits) + code;     \
+        code    = table[index][0];                          \
+        n       = table[index][1];                          \
+        if (max_depth > 2 && n < 0) {                       \
+            LAST_SKIP_BITS(name, gb, nb_bits);              \
+            UPDATE_CACHE(name, gb);                         \
+                                                            \
+            nb_bits = -n;                                   \
+            index   = SHOW_UBITS(name, gb, nb_bits) + code; \
+            code    = table[index][0];                      \
+            n       = table[index][1];                      \
+        }                                                   \
+    }                                                       \
+    dst = code;                                             \
+    LAST_SKIP_BITS(name, gb, n)
+
+
+#define GET_VLC_DUAL(dst0, dst1, name, gb, dtable, table1, table2,  \
+                     bits, max_depth, OP)                           \
+    do {                                                            \
+        unsigned int index = SHOW_UBITS(name, gb, bits);            \
+        int          code, n = dtable[index][1];                    \
+                                                                    \
+        if (n<=0) {                                                 \
+            int nb_bits;                                            \
+            VLC_INTERN(dst0, table1, gb, name, bits, max_depth);    \
+                                                                    \
+            UPDATE_CACHE(re, gb);                                   \
+            index = SHOW_UBITS(name, gb, bits);                     \
+            VLC_INTERN(dst1, table2, gb, name, bits, max_depth);    \
+        } else {                                                    \
+            code = dtable[index][0];                                \
+            OP(dst0, dst1, code);                                   \
+            LAST_SKIP_BITS(name, gb, n);                            \
+        }                                                           \
+    } while (0)
+
+#define OP8bits(dst0, dst1, code) dst0 = code>>8; dst1 = code
 
-/* TODO instead of restarting the read when the code isn't in the first level
- * of the joint table, jump into the 2nd level of the individual table. */
 #define READ_2PIX(dst0, dst1, plane1)                                   \
-    {                                                                   \
-        uint16_t code = get_vlc2(&s->gb, s->vlc[3 + plane1].table,      \
-                                 VLC_BITS, 1);                          \
-        if (code != 0xffff) {                                           \
-            dst0 = code >> 8;                                           \
-            dst1 = code;                                                \
-        } else {                                                        \
-            dst0 = get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);      \
-            dst1 = get_vlc2(&s->gb, s->vlc[plane1].table, VLC_BITS, 3); \
-        }                                                               \
-    }
+    UPDATE_CACHE(re, &s->gb);                                           \
+    GET_VLC_DUAL(dst0, dst1, re, &s->gb, s->vlc[4+plane1].table,        \
+                 s->vlc[0].table, s->vlc[plane1].table, VLC_BITS, 3, OP8bits)
 
 static void decode_422_bitstream(HYuvContext *s, int count)
 {
-    int i;
-
+    int i, icount;
+    OPEN_READER(re, &s->gb);
     count /= 2;
 
-    if (count >= (get_bits_left(&s->gb)) / (31 * 4)) {
-        for (i = 0; i < count && get_bits_left(&s->gb) > 0; i++) {
+    icount = get_bits_left(&s->gb) / (32 * 4);
+    if (count >= icount) {
+        for (i = 0; i < icount; i++) {
             READ_2PIX(s->temp[0][2 * i],     s->temp[1][i], 1);
             READ_2PIX(s->temp[0][2 * i + 1], s->temp[2][i], 2);
         }
+        for (; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
+            READ_2PIX(s->temp[0][2 * i    ], s->temp[1][i], 1);
+            if (BITS_LEFT(re, &s->gb) <= 0) break;
+            READ_2PIX(s->temp[0][2 * i + 1], s->temp[2][i], 2);
+        }
+        for (; i < count; i++)
+            s->temp[0][2 * i    ] = s->temp[1][i] =
+            s->temp[0][2 * i + 1] = s->temp[2][i] = 0;
     } else {
         for (i = 0; i < count; i++) {
             READ_2PIX(s->temp[0][2 * i],     s->temp[1][i], 1);
             READ_2PIX(s->temp[0][2 * i + 1], s->temp[2][i], 2);
         }
     }
+    CLOSE_READER(re, &s->gb);
+}
+
+#define READ_2PIX_PLANE(dst0, dst1, plane, OP) \
+    UPDATE_CACHE(re, &s->gb); \
+    GET_VLC_DUAL(dst0, dst1, re, &s->gb, s->vlc[4+plane].table, \
+                 s->vlc[plane].table, s->vlc[plane].table, VLC_BITS, 3, OP)
+
+#define OP14bits(dst0, dst1, code) dst0 = code>>8; dst1 = sign_extend(code, 8)
+
+/* TODO instead of restarting the read when the code isn't in the first level
+ * of the joint table, jump into the 2nd level of the individual table. */
+#define READ_2PIX_PLANE16(dst0, dst1, plane){\
+    dst0 = get_vlc2(&s->gb, s->vlc[plane].table, VLC_BITS, 3)<<2;\
+    dst0 += get_bits(&s->gb, 2);\
+    dst1 = get_vlc2(&s->gb, s->vlc[plane].table, VLC_BITS, 3)<<2;\
+    dst1 += get_bits(&s->gb, 2);\
+}
+static void decode_plane_bitstream(HYuvContext *s, int width, int plane)
+{
+    int i, count = width/2;
+
+    if (s->bps <= 8) {
+        OPEN_READER(re, &s->gb);
+        if (count >= (get_bits_left(&s->gb)) / (32 * 2)) {
+            for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
+                READ_2PIX_PLANE(s->temp[0][2 * i], s->temp[0][2 * i + 1], plane, OP8bits);
+            }
+        } else {
+            for(i=0; i<count; i++){
+                READ_2PIX_PLANE(s->temp[0][2 * i], s->temp[0][2 * i + 1], plane, OP8bits);
+            }
+        }
+        if( width&1 && BITS_LEFT(re, &s->gb)>0 ) {
+            unsigned int index;
+            int nb_bits, code, n;
+            UPDATE_CACHE(re, &s->gb);
+            index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+            VLC_INTERN(s->temp[0][width-1], s->vlc[plane].table,
+                       &s->gb, re, VLC_BITS, 3);
+        }
+        CLOSE_READER(re, &s->gb);
+    } else if (s->bps <= 14) {
+        OPEN_READER(re, &s->gb);
+        if (count >= (get_bits_left(&s->gb)) / (32 * 2)) {
+            for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
+                READ_2PIX_PLANE(s->temp16[0][2 * i], s->temp16[0][2 * i + 1], plane, OP14bits);
+            }
+        } else {
+            for(i=0; i<count; i++){
+                READ_2PIX_PLANE(s->temp16[0][2 * i], s->temp16[0][2 * i + 1], plane, OP14bits);
+            }
+        }
+        if( width&1 && BITS_LEFT(re, &s->gb)>0 ) {
+            unsigned int index;
+            int nb_bits, code, n;
+            UPDATE_CACHE(re, &s->gb);
+            index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+            VLC_INTERN(s->temp16[0][width-1], s->vlc[plane].table,
+                       &s->gb, re, VLC_BITS, 3);
+        }
+        CLOSE_READER(re, &s->gb);
+    } else {
+        if (count >= (get_bits_left(&s->gb)) / (32 * 2)) {
+            for (i = 0; i < count && get_bits_left(&s->gb) > 0; i++) {
+                READ_2PIX_PLANE16(s->temp16[0][2 * i], s->temp16[0][2 * i + 1], plane);
+            }
+        } else {
+            for(i=0; i<count; i++){
+                READ_2PIX_PLANE16(s->temp16[0][2 * i], s->temp16[0][2 * i + 1], plane);
+            }
+        }
+        if( width&1 && get_bits_left(&s->gb)>0 ) {
+            int dst = get_vlc2(&s->gb, s->vlc[plane].table, VLC_BITS, 3)<<2;
+            s->temp16[0][width-1] = dst + get_bits(&s->gb, 2);
+        }
+    }
 }
 
 static void decode_gray_bitstream(HYuvContext *s, int count)
 {
     int i;
-
+    OPEN_READER(re, &s->gb);
     count /= 2;
 
-    if (count >= (get_bits_left(&s->gb)) / (31 * 2)) {
-        for (i = 0; i < count && get_bits_left(&s->gb) > 0; i++) {
+    if (count >= (get_bits_left(&s->gb)) / (32 * 2)) {
+        for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
             READ_2PIX(s->temp[0][2 * i], s->temp[0][2 * i + 1], 0);
         }
     } else {
@@ -426,30 +771,65 @@ static void decode_gray_bitstream(HYuvContext *s, int count)
             READ_2PIX(s->temp[0][2 * i], s->temp[0][2 * i + 1], 0);
         }
     }
+    CLOSE_READER(re, &s->gb);
 }
 
 static av_always_inline void decode_bgr_1(HYuvContext *s, int count,
                                           int decorrelate, int alpha)
 {
     int i;
-    for (i = 0; i < count; i++) {
-        int code = get_vlc2(&s->gb, s->vlc[3].table, VLC_BITS, 1);
-        if (code != -1) {
+    OPEN_READER(re, &s->gb);
+
+    for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
+        unsigned int index;
+        int code, n, nb_bits;
+
+        UPDATE_CACHE(re, &s->gb);
+        index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+        n     = s->vlc[4].table[index][1];
+
+        if (n>0) {
+            code  = s->vlc[4].table[index][0];
             *(uint32_t *) &s->temp[0][4 * i] = s->pix_bgr_map[code];
-        } else if (decorrelate) {
-            s->temp[0][4 * i + G] = get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3);
-            s->temp[0][4 * i + B] = get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3) +
-                                    s->temp[0][4 * i + G];
-            s->temp[0][4 * i + R] = get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3) +
-                                    s->temp[0][4 * i + G];
+            LAST_SKIP_BITS(re, &s->gb, n);
         } else {
-            s->temp[0][4 * i + B] = get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);
-            s->temp[0][4 * i + G] = get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3);
-            s->temp[0][4 * i + R] = get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3);
+            if (decorrelate) {
+                VLC_INTERN(s->temp[0][4 * i + G], s->vlc[1].table,
+                           &s->gb, re, VLC_BITS, 3);
+
+                UPDATE_CACHE(re, &s->gb);
+                index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+                VLC_INTERN(code, s->vlc[0].table, &s->gb, re, VLC_BITS, 3);
+                s->temp[0][4 * i + B] = code + s->temp[0][4 * i + G];
+
+                UPDATE_CACHE(re, &s->gb);
+                index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+                VLC_INTERN(code, s->vlc[2].table, &s->gb, re, VLC_BITS, 3);
+                s->temp[0][4 * i + R] = code + s->temp[0][4 * i + G];
+            } else {
+                VLC_INTERN(s->temp[0][4 * i + B], s->vlc[0].table,
+                           &s->gb, re, VLC_BITS, 3);
+
+                UPDATE_CACHE(re, &s->gb);
+                index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+                VLC_INTERN(s->temp[0][4 * i + G], s->vlc[1].table,
+                           &s->gb, re, VLC_BITS, 3);
+
+                UPDATE_CACHE(re, &s->gb);
+                index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+                VLC_INTERN(s->temp[0][4 * i + R], s->vlc[2].table,
+                           &s->gb, re, VLC_BITS, 3);
+            }
         }
-        if (alpha)
-            s->temp[0][4 * i + A] = get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3);
+        if (alpha) {
+            UPDATE_CACHE(re, &s->gb);
+            index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+            VLC_INTERN(s->temp[0][4 * i + A], s->vlc[2].table,
+                       &s->gb, re, VLC_BITS, 3);
+        } else
+            s->temp[0][4 * i + A] = 0;
     }
+    CLOSE_READER(re, &s->gb);
 }
 
 static void decode_bgr_bitstream(HYuvContext *s, int count)
@@ -495,6 +875,32 @@ static void draw_slice(HYuvContext *s, AVFrame *frame, int y)
     s->last_slice_end = y + h;
 }
 
+static int left_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *src, int w, int acc)
+{
+    if (s->bps <= 8) {
+        return s->hdsp.add_hfyu_left_pred(dst, src, w, acc);
+    } else {
+        return s->llviddsp.add_hfyu_left_pred_int16((      uint16_t *)dst, (const uint16_t *)src, s->n-1, w, acc);
+    }
+}
+
+static void add_bytes(HYuvContext *s, uint8_t *dst, uint8_t *src, int w)
+{
+    if (s->bps <= 8) {
+        s->hdsp.add_bytes(dst, src, w);
+    } else {
+        s->llviddsp.add_int16((uint16_t*)dst, (const uint16_t*)src, s->n - 1, w);
+    }
+}
+
+static void add_median_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *src, const uint8_t *diff, int w, int *left, int *left_top)
+{
+    if (s->bps <= 8) {
+        s->hdsp.add_hfyu_median_pred(dst, src, diff, w, left, left_top);
+    } else {
+        s->llviddsp.add_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src, (const uint16_t *)diff, s->n-1, w, left, left_top);
+    }
+}
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
@@ -509,20 +915,17 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     AVFrame *const p = data;
     int table_size = 0, ret;
 
-    av_fast_malloc(&s->bitstream_buffer,
+    av_fast_padded_malloc(&s->bitstream_buffer,
                    &s->bitstream_buffer_size,
-                   buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+                   buf_size);
     if (!s->bitstream_buffer)
         return AVERROR(ENOMEM);
 
-    memset(s->bitstream_buffer + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
     s->bdsp.bswap_buf((uint32_t *) s->bitstream_buffer,
                       (const uint32_t *) buf, buf_size / 4);
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
-    }
 
     if (s->context) {
         table_size = read_huffman_tables(s, s->bitstream_buffer, buf_size);
@@ -543,7 +946,72 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     s->last_slice_end = 0;
 
-    if (s->bitstream_bpp < 24) {
+    if (s->version > 2) {
+        int plane;
+        for(plane = 0; plane < 1 + 2*s->chroma + s->alpha; plane++) {
+            int left, lefttop, y;
+            int w = width;
+            int h = height;
+            int fake_stride = fake_ystride;
+
+            if (s->chroma && (plane == 1 || plane == 2)) {
+                w >>= s->chroma_h_shift;
+                h >>= s->chroma_v_shift;
+                fake_stride = plane == 1 ? fake_ustride : fake_vstride;
+            }
+
+            switch (s->predictor) {
+            case LEFT:
+            case PLANE:
+                decode_plane_bitstream(s, w, plane);
+                left = left_prediction(s, p->data[plane], s->temp[0], w, 0);
+
+                for (y = 1; y < h; y++) {
+                    uint8_t *dst = p->data[plane] + p->linesize[plane]*y;
+
+                    decode_plane_bitstream(s, w, plane);
+                    left = left_prediction(s, dst, s->temp[0], w, left);
+                    if (s->predictor == PLANE) {
+                        if (y > s->interlaced) {
+                            add_bytes(s, dst, dst - fake_stride, w);
+                        }
+                    }
+                }
+
+                break;
+            case MEDIAN:
+                decode_plane_bitstream(s, w, plane);
+                left= left_prediction(s, p->data[plane], s->temp[0], w, 0);
+
+                y = 1;
+
+                /* second line is left predicted for interlaced case */
+                if (s->interlaced) {
+                    decode_plane_bitstream(s, w, plane);
+                    left = left_prediction(s, p->data[plane] + p->linesize[plane], s->temp[0], w, left);
+                    y++;
+                }
+
+                lefttop = p->data[plane][0];
+                decode_plane_bitstream(s, w, plane);
+                add_median_prediction(s, p->data[plane] + fake_stride, p->data[plane], s->temp[0], w, &left, &lefttop);
+                y++;
+
+                for (; y<h; y++) {
+                    uint8_t *dst;
+
+                    decode_plane_bitstream(s, w, plane);
+
+                    dst = p->data[plane] + p->linesize[plane] * y;
+
+                    add_median_prediction(s, dst, dst - fake_stride, s->temp[0], w, &left, &lefttop);
+                }
+
+                break;
+            }
+        }
+        draw_slice(s, p, height);
+    } else if (s->bitstream_bpp < 24) {
         int y, cy;
         int lefty, leftu, leftv;
         int lefttopy, lefttopu, lefttopv;
@@ -554,7 +1022,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             p->data[0][1] = get_bits(&s->gb, 8);
             p->data[0][0] = get_bits(&s->gb, 8);
 
-            avpriv_report_missing_feature(avctx, "YUY2 output");
+            av_log(avctx, AV_LOG_ERROR,
+                   "YUY2 output is not implemented yet\n");
             return AVERROR_PATCHWELCOME;
         } else {
             leftv         =
@@ -708,19 +1177,19 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         }
     } else {
         int y;
-        int leftr, leftg, leftb, lefta;
+        uint8_t left[4];
         const int last_line = (height - 1) * p->linesize[0];
 
         if (s->bitstream_bpp == 32) {
-            lefta = p->data[0][last_line + A] = get_bits(&s->gb, 8);
-            leftr = p->data[0][last_line + R] = get_bits(&s->gb, 8);
-            leftg = p->data[0][last_line + G] = get_bits(&s->gb, 8);
-            leftb = p->data[0][last_line + B] = get_bits(&s->gb, 8);
+            left[A] = p->data[0][last_line + A] = get_bits(&s->gb, 8);
+            left[R] = p->data[0][last_line + R] = get_bits(&s->gb, 8);
+            left[G] = p->data[0][last_line + G] = get_bits(&s->gb, 8);
+            left[B] = p->data[0][last_line + B] = get_bits(&s->gb, 8);
         } else {
-            leftr = p->data[0][last_line + R] = get_bits(&s->gb, 8);
-            leftg = p->data[0][last_line + G] = get_bits(&s->gb, 8);
-            leftb = p->data[0][last_line + B] = get_bits(&s->gb, 8);
-            lefta = p->data[0][last_line + A] = 255;
+            left[R] = p->data[0][last_line + R] = get_bits(&s->gb, 8);
+            left[G] = p->data[0][last_line + G] = get_bits(&s->gb, 8);
+            left[B] = p->data[0][last_line + B] = get_bits(&s->gb, 8);
+            left[A] = p->data[0][last_line + A] = 255;
             skip_bits(&s->gb, 8);
         }
 
@@ -730,23 +1199,20 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             case PLANE:
                 decode_bgr_bitstream(s, width - 1);
                 s->hdsp.add_hfyu_left_pred_bgr32(p->data[0] + last_line + 4,
-                                                 s->temp[0], width - 1, &leftr,
-                                                 &leftg, &leftb, &lefta);
+                                                 s->temp[0], width - 1, left);
 
                 for (y = s->height - 2; y >= 0; y--) { // Yes it is stored upside down.
                     decode_bgr_bitstream(s, width);
 
                     s->hdsp.add_hfyu_left_pred_bgr32(p->data[0] + p->linesize[0] * y,
-                                                     s->temp[0], width, &leftr,
-                                                     &leftg, &leftb, &lefta);
+                                                     s->temp[0], width, left);
                     if (s->predictor == PLANE) {
                         if (s->bitstream_bpp != 32)
-                            lefta = 0;
-                        if ((y & s->interlaced) == 0 &&
-                            y < s->height - 1 - s->interlaced) {
+                            left[A] = 0;
+                        if (y < s->height - 1 - s->interlaced) {
                             s->hdsp.add_bytes(p->data[0] + p->linesize[0] * y,
                                               p->data[0] + p->linesize[0] * y +
-                                              fake_ystride, fake_ystride);
+                                              fake_ystride, 4 * width);
                         }
                     }
                 }
@@ -758,7 +1224,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                        "prediction type not supported!\n");
             }
         } else {
-            avpriv_report_missing_feature(avctx, "BGR24 output");
+            av_log(avctx, AV_LOG_ERROR,
+                   "BGR24 output is not implemented yet\n");
             return AVERROR_PATCHWELCOME;
         }
     }
@@ -769,20 +1236,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     return (get_bits_count(&s->gb) + 31) / 32 * 4 + table_size;
 }
 
-static av_cold int decode_end(AVCodecContext *avctx)
-{
-    HYuvContext *s = avctx->priv_data;
-    int i;
-
-    ff_huffyuv_common_end(s);
-    av_freep(&s->bitstream_buffer);
-
-    for (i = 0; i < 6; i++)
-        ff_free_vlc(&s->vlc[i]);
-
-    return 0;
-}
-
 AVCodec ff_huffyuv_decoder = {
     .name             = "huffyuv",
     .long_name        = NULL_IF_CONFIG_SMALL("Huffyuv / HuffYUV"),
diff --git a/libavcodec/huffyuvdsp.c b/libavcodec/huffyuvdsp.c
index b5a714d072..e8a05f6592 100644
--- a/libavcodec/huffyuvdsp.c
+++ b/libavcodec/huffyuvdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #define pb_7f (~0UL / 255 * 0x7f)
 #define pb_80 (~0UL / 255 * 0x80)
 
-static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
+static void add_bytes_c(uint8_t *dst, uint8_t *src, intptr_t w)
 {
     long i;
 
@@ -41,7 +41,7 @@ static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
 }
 
 static void add_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
-                                   const uint8_t *diff, int w,
+                                   const uint8_t *diff, intptr_t w,
                                    int *left, int *left_top)
 {
     int i;
@@ -60,7 +60,7 @@ static void add_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
     *left_top = lt;
 }
 
-static int add_hfyu_left_pred_c(uint8_t *dst, const uint8_t *src, int w,
+static int add_hfyu_left_pred_c(uint8_t *dst, const uint8_t *src, intptr_t w,
                                 int acc)
 {
     int i;
@@ -81,22 +81,11 @@ static int add_hfyu_left_pred_c(uint8_t *dst, const uint8_t *src, int w,
     return acc;
 }
 
-#if HAVE_BIGENDIAN
-#define B 3
-#define G 2
-#define R 1
-#define A 0
-#else
-#define B 0
-#define G 1
-#define R 2
-#define A 3
-#endif
 static void add_hfyu_left_pred_bgr32_c(uint8_t *dst, const uint8_t *src,
-                                       int w, int *red, int *green,
-                                       int *blue, int *alpha)
+                                       intptr_t w, uint8_t *left)
 {
-    int i, r = *red, g = *green, b = *blue, a = *alpha;
+    int i;
+    uint8_t r = left[R], g = left[G], b = left[B], a = left[A];
 
     for (i = 0; i < w; i++) {
         b += src[4 * i + B];
@@ -110,15 +99,11 @@ static void add_hfyu_left_pred_bgr32_c(uint8_t *dst, const uint8_t *src,
         dst[4 * i + A] = a;
     }
 
-    *red   = r;
-    *green = g;
-    *blue  = b;
-    *alpha = a;
+    left[B] = b;
+    left[G] = g;
+    left[R] = r;
+    left[A] = a;
 }
-#undef B
-#undef G
-#undef R
-#undef A
 
 av_cold void ff_huffyuvdsp_init(HuffYUVDSPContext *c)
 {
diff --git a/libavcodec/huffyuvdsp.h b/libavcodec/huffyuvdsp.h
index 5e84e3aecb..db377289ee 100644
--- a/libavcodec/huffyuvdsp.h
+++ b/libavcodec/huffyuvdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,18 +20,30 @@
 #define AVCODEC_HUFFYUVDSP_H
 
 #include <stdint.h>
+#include "config.h"
+
+#if HAVE_BIGENDIAN
+#define B 3
+#define G 2
+#define R 1
+#define A 0
+#else
+#define B 0
+#define G 1
+#define R 2
+#define A 3
+#endif
 
 typedef struct HuffYUVDSPContext {
     void (*add_bytes)(uint8_t *dst /* align 16 */, uint8_t *src /* align 16 */,
-                      int w);
+                      intptr_t w);
     void (*add_hfyu_median_pred)(uint8_t *dst, const uint8_t *top,
-                                 const uint8_t *diff, int w,
+                                 const uint8_t *diff, intptr_t w,
                                  int *left, int *left_top);
     int (*add_hfyu_left_pred)(uint8_t *dst, const uint8_t *src,
-                              int w, int left);
+                              intptr_t w, int left);
     void (*add_hfyu_left_pred_bgr32)(uint8_t *dst, const uint8_t *src,
-                                     int w, int *red, int *green,
-                                     int *blue, int *alpha);
+                                     intptr_t w, uint8_t *left);
 } HuffYUVDSPContext;
 
 void ff_huffyuvdsp_init(HuffYUVDSPContext *c);
diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c
index c18b38c4c1..49d711a948 100644
--- a/libavcodec/huffyuvenc.c
+++ b/libavcodec/huffyuvenc.c
@@ -1,24 +1,26 @@
 /*
- * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2014 Michael Niedermayer <michaelni@gmx.at>
  *
  * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
  * the algorithm used
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * yuva, gray, 4:4:4, 4:1:1, 4:1:0 and >8 bit per sample support sponsored by NOA
  */
 
 /**
@@ -32,31 +34,64 @@
 #include "huffyuvencdsp.h"
 #include "internal.h"
 #include "put_bits.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+static inline void diff_bytes(HYuvContext *s, uint8_t *dst,
+                              const uint8_t *src0, const uint8_t *src1, int w)
+{
+    if (s->bps <= 8) {
+        s->hencdsp.diff_bytes(dst, src0, src1, w);
+    } else {
+        s->llviddsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->n - 1, w);
+    }
+}
 
 static inline int sub_left_prediction(HYuvContext *s, uint8_t *dst,
-                                      uint8_t *src, int w, int left)
+                                      const uint8_t *src, int w, int left)
 {
     int i;
-    if (w < 32) {
-        for (i = 0; i < w; i++) {
-            const int temp = src[i];
-            dst[i] = temp - left;
-            left   = temp;
+    if (s->bps <= 8) {
+        if (w < 32) {
+            for (i = 0; i < w; i++) {
+                const int temp = src[i];
+                dst[i] = temp - left;
+                left   = temp;
+            }
+            return left;
+        } else {
+            for (i = 0; i < 16; i++) {
+                const int temp = src[i];
+                dst[i] = temp - left;
+                left   = temp;
+            }
+            s->hencdsp.diff_bytes(dst + 16, src + 16, src + 15, w - 16);
+            return src[w-1];
         }
-        return left;
     } else {
-        for (i = 0; i < 16; i++) {
-            const int temp = src[i];
-            dst[i] = temp - left;
-            left   = temp;
+        const uint16_t *src16 = (const uint16_t *)src;
+        uint16_t       *dst16 = (      uint16_t *)dst;
+        if (w < 32) {
+            for (i = 0; i < w; i++) {
+                const int temp = src16[i];
+                dst16[i] = temp - left;
+                left   = temp;
+            }
+            return left;
+        } else {
+            for (i = 0; i < 16; i++) {
+                const int temp = src16[i];
+                dst16[i] = temp - left;
+                left   = temp;
+            }
+            s->llviddsp.diff_int16(dst16 + 16, src16 + 16, src16 + 15, s->n - 1, w - 16);
+            return src16[w-1];
         }
-        s->hencdsp.diff_bytes(dst + 16, src + 16, src + 15, w - 16);
-        return src[w-1];
     }
 }
 
 static inline void sub_left_prediction_bgr32(HYuvContext *s, uint8_t *dst,
-                                             uint8_t *src, int w,
+                                             const uint8_t *src, int w,
                                              int *red, int *green, int *blue,
                                              int *alpha)
 {
@@ -118,20 +153,30 @@ static inline void sub_left_prediction_rgb24(HYuvContext *s, uint8_t *dst,
     *blue  = src[(w - 1) * 3 + 2];
 }
 
+static void sub_median_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
+{
+    if (s->bps <= 8) {
+        s->hencdsp.sub_hfyu_median_pred(dst, src1, src2, w , left, left_top);
+    } else {
+        s->llviddsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1, (const uint16_t *)src2, s->n - 1, w , left, left_top);
+    }
+}
+
 static int store_table(HYuvContext *s, const uint8_t *len, uint8_t *buf)
 {
     int i;
     int index = 0;
+    int n = s->vlc_n;
 
-    for (i = 0; i < 256;) {
+    for (i = 0; i < n;) {
         int val = len[i];
         int repeat = 0;
 
-        for (; i < 256 && len[i] == val && repeat < 255; i++)
+        for (; i < n && len[i] == val && repeat < 255; i++)
             repeat++;
 
-        assert(val < 32 && val >0 && repeat<256 && repeat>0);
-        if ( repeat > 7) {
+        av_assert0(val < 32 && val >0 && repeat < 256 && repeat>0);
+        if (repeat > 7) {
             buf[index++] = val;
             buf[index++] = repeat;
         } else {
@@ -142,19 +187,48 @@ static int store_table(HYuvContext *s, const uint8_t *len, uint8_t *buf)
     return index;
 }
 
+static int store_huffman_tables(HYuvContext *s, uint8_t *buf)
+{
+    int i, ret;
+    int size = 0;
+    int count = 3;
+
+    if (s->version > 2)
+        count = 1 + s->alpha + 2*s->chroma;
+
+    for (i = 0; i < count; i++) {
+        if ((ret = ff_huff_gen_len_table(s->len[i], s->stats[i], s->vlc_n, 0)) < 0)
+            return ret;
+
+        if (ff_huffyuv_generate_bits_table(s->bits[i], s->len[i], s->vlc_n) < 0) {
+            return -1;
+        }
+
+        size += store_table(s, s->len[i], buf + size);
+    }
+    return size;
+}
+
 static av_cold int encode_init(AVCodecContext *avctx)
 {
     HYuvContext *s = avctx->priv_data;
     int i, j;
+    int ret;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
 
     ff_huffyuv_common_init(avctx);
     ff_huffyuvencdsp_init(&s->hencdsp);
 
-    avctx->extradata = av_mallocz(1024*30); // 256*3+4 == 772
-    avctx->stats_out = av_mallocz(1024*30); // 21*256*3(%llu ) + 3(\n) + 1(0) = 16132
+    avctx->extradata = av_mallocz(3*MAX_N + 4);
+    if (s->flags&AV_CODEC_FLAG_PASS1) {
+#define STATS_OUT_SIZE 21*MAX_N*3 + 4
+        avctx->stats_out = av_mallocz(STATS_OUT_SIZE); // 21*256*3(%llu ) + 3(\n) + 1(0) = 16132
+        if (!avctx->stats_out)
+            return AVERROR(ENOMEM);
+    }
     s->version = 2;
 
-    if (!avctx->extradata || !avctx->stats_out)
+    if (!avctx->extradata)
         return AVERROR(ENOMEM);
 
 #if FF_API_CODED_FRAME
@@ -164,15 +238,66 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
+    s->bps = desc->comp[0].depth;
+    s->yuv = !(desc->flags & AV_PIX_FMT_FLAG_RGB) && desc->nb_components >= 2;
+    s->chroma = desc->nb_components > 2;
+    s->alpha = !!(desc->flags & AV_PIX_FMT_FLAG_ALPHA);
+    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt,
+                                     &s->chroma_h_shift,
+                                     &s->chroma_v_shift);
+
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_YUV420P:
     case AV_PIX_FMT_YUV422P:
         if (s->width & 1) {
             av_log(avctx, AV_LOG_ERROR, "Width must be even for this colorspace.\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
         s->bitstream_bpp = avctx->pix_fmt == AV_PIX_FMT_YUV420P ? 12 : 16;
         break;
+    case AV_PIX_FMT_YUV444P:
+    case AV_PIX_FMT_YUV410P:
+    case AV_PIX_FMT_YUV411P:
+    case AV_PIX_FMT_YUV440P:
+    case AV_PIX_FMT_GBRP:
+    case AV_PIX_FMT_GBRP9:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRP14:
+    case AV_PIX_FMT_GBRP16:
+    case AV_PIX_FMT_GRAY8:
+    case AV_PIX_FMT_GRAY16:
+    case AV_PIX_FMT_YUVA444P:
+    case AV_PIX_FMT_YUVA420P:
+    case AV_PIX_FMT_YUVA422P:
+    case AV_PIX_FMT_GBRAP:
+    case AV_PIX_FMT_GRAY8A:
+    case AV_PIX_FMT_YUV420P9:
+    case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUV420P12:
+    case AV_PIX_FMT_YUV420P14:
+    case AV_PIX_FMT_YUV420P16:
+    case AV_PIX_FMT_YUV422P9:
+    case AV_PIX_FMT_YUV422P10:
+    case AV_PIX_FMT_YUV422P12:
+    case AV_PIX_FMT_YUV422P14:
+    case AV_PIX_FMT_YUV422P16:
+    case AV_PIX_FMT_YUV444P9:
+    case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUV444P12:
+    case AV_PIX_FMT_YUV444P14:
+    case AV_PIX_FMT_YUV444P16:
+    case AV_PIX_FMT_YUVA420P9:
+    case AV_PIX_FMT_YUVA420P10:
+    case AV_PIX_FMT_YUVA420P16:
+    case AV_PIX_FMT_YUVA422P9:
+    case AV_PIX_FMT_YUVA422P10:
+    case AV_PIX_FMT_YUVA422P16:
+    case AV_PIX_FMT_YUVA444P9:
+    case AV_PIX_FMT_YUVA444P10:
+    case AV_PIX_FMT_YUVA444P16:
+        s->version = 3;
+        break;
     case AV_PIX_FMT_RGB32:
         s->bitstream_bpp = 32;
         break;
@@ -181,10 +306,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "format not supported\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
+    s->n = 1<<s->bps;
+    s->vlc_n = FFMIN(s->n, MAX_VLC_N);
+
     avctx->bits_per_coded_sample = s->bitstream_bpp;
-    s->decorrelate = s->bitstream_bpp >= 24;
+    s->decorrelate = s->bitstream_bpp >= 24 && !s->yuv && !(desc->flags & AV_PIX_FMT_FLAG_PLANAR);
     s->predictor = avctx->prediction_method;
     s->interlaced = avctx->flags & AV_CODEC_FLAG_INTERLACED_ME ? 1 : 0;
     if (avctx->context_model == 1) {
@@ -193,7 +321,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             av_log(avctx, AV_LOG_ERROR,
                    "context=1 is not compatible with "
                    "2 pass huffyuv encoding\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
     }else s->context= 0;
 
@@ -202,45 +330,66 @@ FF_ENABLE_DEPRECATION_WARNINGS
             av_log(avctx, AV_LOG_ERROR,
                    "Error: YV12 is not supported by huffyuv; use "
                    "vcodec=ffvhuff or format=422p\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
         if (avctx->context_model) {
             av_log(avctx, AV_LOG_ERROR,
                    "Error: per-frame huffman tables are not supported "
                    "by huffyuv; use vcodec=ffvhuff\n");
-            return -1;
+            return AVERROR(EINVAL);
+        }
+        if (s->version > 2) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Error: ver>2 is not supported "
+                   "by huffyuv; use vcodec=ffvhuff\n");
+            return AVERROR(EINVAL);
         }
         if (s->interlaced != ( s->height > 288 ))
             av_log(avctx, AV_LOG_INFO,
                    "using huffyuv 2.2.0 or newer interlacing flag\n");
     }
 
-    if (s->bitstream_bpp >= 24 && s->predictor == MEDIAN) {
+    if (s->version > 3 && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
+        av_log(avctx, AV_LOG_ERROR, "Ver > 3 is under development, files encoded with it may not be decodable with future versions!!!\n"
+               "Use vstrict=-2 / -strict -2 to use it anyway.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (s->bitstream_bpp >= 24 && s->predictor == MEDIAN && s->version <= 2) {
         av_log(avctx, AV_LOG_ERROR,
                "Error: RGB is incompatible with median predictor\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     ((uint8_t*)avctx->extradata)[0] = s->predictor | (s->decorrelate << 6);
-    ((uint8_t*)avctx->extradata)[1] = s->bitstream_bpp;
     ((uint8_t*)avctx->extradata)[2] = s->interlaced ? 0x10 : 0x20;
     if (s->context)
         ((uint8_t*)avctx->extradata)[2] |= 0x40;
-    ((uint8_t*)avctx->extradata)[3] = 0;
+    if (s->version < 3) {
+        ((uint8_t*)avctx->extradata)[1] = s->bitstream_bpp;
+        ((uint8_t*)avctx->extradata)[3] = 0;
+    } else {
+        ((uint8_t*)avctx->extradata)[1] = ((s->bps-1)<<4) | s->chroma_h_shift | (s->chroma_v_shift<<2);
+        if (s->chroma)
+            ((uint8_t*)avctx->extradata)[2] |= s->yuv ? 1 : 2;
+        if (s->alpha)
+            ((uint8_t*)avctx->extradata)[2] |= 4;
+        ((uint8_t*)avctx->extradata)[3] = 1;
+    }
     s->avctx->extradata_size = 4;
 
     if (avctx->stats_in) {
         char *p = avctx->stats_in;
 
-        for (i = 0; i < 3; i++)
-            for (j = 0; j < 256; j++)
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < s->vlc_n; j++)
                 s->stats[i][j] = 1;
 
         for (;;) {
-            for (i = 0; i < 3; i++) {
+            for (i = 0; i < 4; i++) {
                 char *next;
 
-                for (j = 0; j < 256; j++) {
+                for (j = 0; j < s->vlc_n; j++) {
                     s->stats[i][j] += strtol(p, &next, 0);
                     if (next == p) return -1;
                     p = next;
@@ -249,40 +398,37 @@ FF_ENABLE_DEPRECATION_WARNINGS
             if (p[0] == 0 || p[1] == 0 || p[2] == 0) break;
         }
     } else {
-        for (i = 0; i < 3; i++)
-            for (j = 0; j < 256; j++) {
-                int d = FFMIN(j, 256 - j);
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < s->vlc_n; j++) {
+                int d = FFMIN(j, s->vlc_n - j);
 
-                s->stats[i][j] = 100000000 / (d + 1);
+                s->stats[i][j] = 100000000 / (d*d + 1);
             }
     }
 
-    for (i = 0; i < 3; i++) {
-        ff_huff_gen_len_table(s->len[i], s->stats[i]);
-
-        if (ff_huffyuv_generate_bits_table(s->bits[i], s->len[i]) < 0) {
-            return -1;
-        }
-
-        s->avctx->extradata_size +=
-            store_table(s, s->len[i], &((uint8_t*)s->avctx->extradata)[s->avctx->extradata_size]);
-    }
+    ret = store_huffman_tables(s, s->avctx->extradata + s->avctx->extradata_size);
+    if (ret < 0)
+        return ret;
+    s->avctx->extradata_size += ret;
 
     if (s->context) {
-        for (i = 0; i < 3; i++) {
+        for (i = 0; i < 4; i++) {
             int pels = s->width * s->height / (i ? 40 : 10);
-            for (j = 0; j < 256; j++) {
-                int d = FFMIN(j, 256 - j);
-                s->stats[i][j] = pels/(d + 1);
+            for (j = 0; j < s->vlc_n; j++) {
+                int d = FFMIN(j, s->vlc_n - j);
+                s->stats[i][j] = pels/(d*d + 1);
             }
         }
     } else {
-        for (i = 0; i < 3; i++)
-            for (j = 0; j < 256; j++)
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < s->vlc_n; j++)
                 s->stats[i][j]= 0;
     }
 
-    ff_huffyuv_alloc_temp(s);
+    if (ff_huffyuv_alloc_temp(s)) {
+        ff_huffyuv_common_end(s);
+        return AVERROR(ENOMEM);
+    }
 
     s->picture_number=0;
 
@@ -343,6 +489,168 @@ static int encode_422_bitstream(HYuvContext *s, int offset, int count)
     return 0;
 }
 
+static int encode_plane_bitstream(HYuvContext *s, int width, int plane)
+{
+    int i, count = width/2;
+
+    if (s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb) >> 3) < count * s->bps / 2) {
+        av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+        return -1;
+    }
+
+#define LOADEND\
+            int y0 = s->temp[0][width-1];
+#define LOADEND_14\
+            int y0 = s->temp16[0][width-1] & mask;
+#define LOADEND_16\
+            int y0 = s->temp16[0][width-1];
+#define STATEND\
+            s->stats[plane][y0]++;
+#define STATEND_16\
+            s->stats[plane][y0>>2]++;
+#define WRITEEND\
+            put_bits(&s->pb, s->len[plane][y0], s->bits[plane][y0]);
+#define WRITEEND_16\
+            put_bits(&s->pb, s->len[plane][y0>>2], s->bits[plane][y0>>2]);\
+            put_bits(&s->pb, 2, y0&3);
+
+#define LOAD2\
+            int y0 = s->temp[0][2 * i];\
+            int y1 = s->temp[0][2 * i + 1];
+#define LOAD2_14\
+            int y0 = s->temp16[0][2 * i] & mask;\
+            int y1 = s->temp16[0][2 * i + 1] & mask;
+#define LOAD2_16\
+            int y0 = s->temp16[0][2 * i];\
+            int y1 = s->temp16[0][2 * i + 1];
+#define STAT2\
+            s->stats[plane][y0]++;\
+            s->stats[plane][y1]++;
+#define STAT2_16\
+            s->stats[plane][y0>>2]++;\
+            s->stats[plane][y1>>2]++;
+#define WRITE2\
+            put_bits(&s->pb, s->len[plane][y0], s->bits[plane][y0]);\
+            put_bits(&s->pb, s->len[plane][y1], s->bits[plane][y1]);
+#define WRITE2_16\
+            put_bits(&s->pb, s->len[plane][y0>>2], s->bits[plane][y0>>2]);\
+            put_bits(&s->pb, 2, y0&3);\
+            put_bits(&s->pb, s->len[plane][y1>>2], s->bits[plane][y1>>2]);\
+            put_bits(&s->pb, 2, y1&3);
+
+    if (s->bps <= 8) {
+    if (s->flags & AV_CODEC_FLAG_PASS1) {
+        for (i = 0; i < count; i++) {
+            LOAD2;
+            STAT2;
+        }
+        if (width&1) {
+            LOADEND;
+            STATEND;
+        }
+    }
+    if (s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT)
+        return 0;
+
+    if (s->context) {
+        for (i = 0; i < count; i++) {
+            LOAD2;
+            STAT2;
+            WRITE2;
+        }
+        if (width&1) {
+            LOADEND;
+            STATEND;
+            WRITEEND;
+        }
+    } else {
+        for (i = 0; i < count; i++) {
+            LOAD2;
+            WRITE2;
+        }
+        if (width&1) {
+            LOADEND;
+            WRITEEND;
+        }
+    }
+    } else if (s->bps <= 14) {
+        int mask = s->n - 1;
+        if (s->flags & AV_CODEC_FLAG_PASS1) {
+            for (i = 0; i < count; i++) {
+                LOAD2_14;
+                STAT2;
+            }
+            if (width&1) {
+                LOADEND_14;
+                STATEND;
+            }
+        }
+        if (s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT)
+            return 0;
+
+        if (s->context) {
+            for (i = 0; i < count; i++) {
+                LOAD2_14;
+                STAT2;
+                WRITE2;
+            }
+            if (width&1) {
+                LOADEND_14;
+                STATEND;
+                WRITEEND;
+            }
+        } else {
+            for (i = 0; i < count; i++) {
+                LOAD2_14;
+                WRITE2;
+            }
+            if (width&1) {
+                LOADEND_14;
+                WRITEEND;
+            }
+        }
+    } else {
+        if (s->flags & AV_CODEC_FLAG_PASS1) {
+            for (i = 0; i < count; i++) {
+                LOAD2_16;
+                STAT2_16;
+            }
+            if (width&1) {
+                LOADEND_16;
+                STATEND_16;
+            }
+        }
+        if (s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT)
+            return 0;
+
+        if (s->context) {
+            for (i = 0; i < count; i++) {
+                LOAD2_16;
+                STAT2_16;
+                WRITE2_16;
+            }
+            if (width&1) {
+                LOADEND_16;
+                STATEND_16;
+                WRITEEND_16;
+            }
+        } else {
+            for (i = 0; i < count; i++) {
+                LOAD2_16;
+                WRITE2_16;
+            }
+            if (width&1) {
+                LOADEND_16;
+                WRITEEND_16;
+            }
+        }
+    }
+#undef LOAD2
+#undef STAT2
+#undef WRITE2
+    return 0;
+}
+
 static int encode_gray_bitstream(HYuvContext *s, int count)
 {
     int i;
@@ -400,8 +708,8 @@ static inline int encode_bgra_bitstream(HYuvContext *s, int count, int planes)
 
 #define LOAD_GBRA                                                       \
     int g = s->temp[0][planes == 3 ? 3 * i + 1 : 4 * i + G];            \
-    int b = s->temp[0][planes == 3 ? 3 * i + 2 : 4 * i + B] - g & 0xFF; \
-    int r = s->temp[0][planes == 3 ? 3 * i + 0 : 4 * i + R] - g & 0xFF; \
+    int b =(s->temp[0][planes == 3 ? 3 * i + 2 : 4 * i + B] - g) & 0xFF;\
+    int r =(s->temp[0][planes == 3 ? 3 * i + 0 : 4 * i + R] - g) & 0xFF;\
     int a = s->temp[0][planes * i + A];
 
 #define STAT_BGRA                                                       \
@@ -452,22 +760,16 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     const AVFrame * const p = pict;
     int i, j, size = 0, ret;
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, width * height * 3 * 4 + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error allocating output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, width * height * 3 * 4 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     if (s->context) {
-        for (i = 0; i < 3; i++) {
-            ff_huff_gen_len_table(s->len[i], s->stats[i]);
-            if (ff_huffyuv_generate_bits_table(s->bits[i], s->len[i]) < 0)
-                return -1;
-            size += store_table(s, s->len[i], &pkt->data[size]);
-        }
+        size = store_huffman_tables(s, pkt->data);
+        if (size < 0)
+            return size;
 
-        for (i = 0; i < 3; i++)
-            for (j = 0; j < 256; j++)
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < s->vlc_n; j++)
                 s->stats[i][j] >>= 1;
     }
 
@@ -635,6 +937,59 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             }
             encode_bgra_bitstream(s, width, 3);
         }
+    } else if (s->version > 2) {
+        int plane;
+        for (plane = 0; plane < 1 + 2*s->chroma + s->alpha; plane++) {
+            int left, y;
+            int w = width;
+            int h = height;
+            int fake_stride = fake_ystride;
+
+            if (s->chroma && (plane == 1 || plane == 2)) {
+                w >>= s->chroma_h_shift;
+                h >>= s->chroma_v_shift;
+                fake_stride = plane == 1 ? fake_ustride : fake_vstride;
+            }
+
+            left = sub_left_prediction(s, s->temp[0], p->data[plane], w , 0);
+
+            encode_plane_bitstream(s, w, plane);
+
+            if (s->predictor==MEDIAN) {
+                int lefttop;
+                y = 1;
+                if (s->interlaced) {
+                    left = sub_left_prediction(s, s->temp[0], p->data[plane] + p->linesize[plane], w , left);
+
+                    encode_plane_bitstream(s, w, plane);
+                    y++;
+                }
+
+                lefttop = p->data[plane][0];
+
+                for (; y < h; y++) {
+                    uint8_t *dst = p->data[plane] + p->linesize[plane] * y;
+
+                    sub_median_prediction(s, s->temp[0], dst - fake_stride, dst, w , &left, &lefttop);
+
+                    encode_plane_bitstream(s, w, plane);
+                }
+            } else {
+                for (y = 1; y < h; y++) {
+                    uint8_t *dst = p->data[plane] + p->linesize[plane] * y;
+
+                    if (s->predictor == PLANE && s->interlaced < y) {
+                        diff_bytes(s, s->temp[1], dst, dst - fake_stride, w);
+
+                        left = sub_left_prediction(s, s->temp[0], s->temp[1], w , left);
+                    } else {
+                        left = sub_left_prediction(s, s->temp[0], dst, w , left);
+                    }
+
+                    encode_plane_bitstream(s, w, plane);
+                }
+            }
+        }
     } else {
         av_log(avctx, AV_LOG_ERROR, "Format not supported!\n");
     }
@@ -648,17 +1003,19 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     if ((s->flags & AV_CODEC_FLAG_PASS1) && (s->picture_number & 31) == 0) {
         int j;
         char *p = avctx->stats_out;
-        char *end = p + 1024*30;
-        for (i = 0; i < 3; i++) {
-            for (j = 0; j < 256; j++) {
+        char *end = p + STATS_OUT_SIZE;
+        for (i = 0; i < 4; i++) {
+            for (j = 0; j < s->vlc_n; j++) {
                 snprintf(p, end-p, "%"PRIu64" ", s->stats[i][j]);
                 p += strlen(p);
                 s->stats[i][j]= 0;
             }
             snprintf(p, end-p, "\n");
             p++;
+            if (end <= p)
+                return AVERROR(ENOMEM);
         }
-    } else
+    } else if (avctx->stats_out)
         avctx->stats_out[0] = '\0';
     if (!(s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT)) {
         flush_put_bits(&s->pb);
@@ -686,6 +1043,27 @@ static av_cold int encode_end(AVCodecContext *avctx)
     return 0;
 }
 
+static const AVOption options[] = {
+    { "non_deterministic", "Allow multithreading for e.g. context=1 at the expense of determinism",
+      offsetof(HYuvContext, non_determ), AV_OPT_TYPE_BOOL, { .i64 = 1 },
+      0, 1, AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
+    { NULL },
+};
+
+static const AVClass normal_class = {
+    .class_name = "huffyuv",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVClass ff_class = {
+    .class_name = "ffvhuff",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_huffyuv_encoder = {
     .name           = "huffyuv",
     .long_name      = NULL_IF_CONFIG_SMALL("Huffyuv / HuffYUV"),
@@ -695,6 +1073,8 @@ AVCodec ff_huffyuv_encoder = {
     .init           = encode_init,
     .encode2        = encode_frame,
     .close          = encode_end,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .priv_class     = &normal_class,
     .pix_fmts       = (const enum AVPixelFormat[]){
         AV_PIX_FMT_YUV422P, AV_PIX_FMT_RGB24,
         AV_PIX_FMT_RGB32, AV_PIX_FMT_NONE
@@ -713,8 +1093,24 @@ AVCodec ff_ffvhuff_encoder = {
     .init           = encode_init,
     .encode2        = encode_frame,
     .close          = encode_end,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .priv_class     = &ff_class,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_RGB24,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV411P,
+        AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV440P,
+        AV_PIX_FMT_GBRP,
+        AV_PIX_FMT_GBRP9, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, AV_PIX_FMT_GBRP14,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY16,
+        AV_PIX_FMT_YUVA420P, AV_PIX_FMT_YUVA422P, AV_PIX_FMT_YUVA444P,
+        AV_PIX_FMT_GBRAP,
+        AV_PIX_FMT_GRAY8A,
+        AV_PIX_FMT_YUV420P9, AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV420P16,
+        AV_PIX_FMT_YUV422P9, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV422P16,
+        AV_PIX_FMT_YUV444P9, AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV444P14, AV_PIX_FMT_YUV444P16,
+        AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA420P16,
+        AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA422P16,
+        AV_PIX_FMT_YUVA444P9, AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA444P16,
+        AV_PIX_FMT_RGB24,
         AV_PIX_FMT_RGB32, AV_PIX_FMT_NONE
     },
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
diff --git a/libavcodec/huffyuvencdsp.c b/libavcodec/huffyuvencdsp.c
index 6c30877737..95fcc19582 100644
--- a/libavcodec/huffyuvencdsp.c
+++ b/libavcodec/huffyuvencdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,12 +25,12 @@
 #define pb_7f (~0UL / 255 * 0x7f)
 #define pb_80 (~0UL / 255 * 0x80)
 
-static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
+static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
 {
     long i;
 
 #if !HAVE_FAST_UNALIGNED
-    if ((long) src2 & (sizeof(long) - 1)) {
+    if (((long)src1 | (long)src2) & (sizeof(long) - 1)) {
         for (i = 0; i + 7 < w; i += 8) {
             dst[i + 0] = src1[i + 0] - src2[i + 0];
             dst[i + 1] = src1[i + 1] - src2[i + 1];
diff --git a/libavcodec/huffyuvencdsp.h b/libavcodec/huffyuvencdsp.h
index 603c36f7af..3a49b4a7ef 100644
--- a/libavcodec/huffyuvencdsp.h
+++ b/libavcodec/huffyuvencdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,8 +23,8 @@
 
 typedef struct HuffYUVEncDSPContext {
     void (*diff_bytes)(uint8_t *dst /* align 16 */,
-                       uint8_t *src1 /* align 16 */,
-                       uint8_t *src2 /* align 1 */,
+                       const uint8_t *src1 /* align 16 */,
+                       const uint8_t *src2 /* align 1 */,
                        int w);
     /**
      * Subtract HuffYUV's variant of median prediction.
diff --git a/libavcodec/idcinvideo.c b/libavcodec/idcinvideo.c
index 1a5a1c3a25..4a0a6fb6c0 100644
--- a/libavcodec/idcinvideo.c
+++ b/libavcodec/idcinvideo.c
@@ -1,21 +1,21 @@
 /*
  * id Quake II CIN Video Decoder
- * Copyright (C) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -75,7 +75,7 @@ typedef struct IdcinContext {
     uint32_t pal[256];
 } IdcinContext;
 
-/*
+/**
  * Find the lowest probability node in a Huffman table, and mark it as
  * being assigned to a higher probability.
  * @return the node index of the lowest unused node, or -1 if all nodes
@@ -169,7 +169,7 @@ static av_cold int idcin_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static void idcin_decode_vlcs(IdcinContext *s, AVFrame *frame)
+static int idcin_decode_vlcs(IdcinContext *s, AVFrame *frame)
 {
     hnode *hnodes;
     long x, y;
@@ -188,7 +188,7 @@ static void idcin_decode_vlcs(IdcinContext *s, AVFrame *frame)
                 if(!bit_pos) {
                     if(dat_pos >= s->size) {
                         av_log(s->avctx, AV_LOG_ERROR, "Huffman decode error.\n");
-                        return;
+                        return -1;
                     }
                     bit_pos = 8;
                     v = s->buf[dat_pos++];
@@ -203,6 +203,8 @@ static void idcin_decode_vlcs(IdcinContext *s, AVFrame *frame)
             prev = node_num;
         }
     }
+
+    return 0;
 }
 
 static int idcin_decode_frame(AVCodecContext *avctx,
@@ -219,12 +221,11 @@ static int idcin_decode_frame(AVCodecContext *avctx,
     s->buf = buf;
     s->size = buf_size;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "  id CIN Video: get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
-    idcin_decode_vlcs(s, frame);
+    if (idcin_decode_vlcs(s, frame))
+        return AVERROR_INVALIDDATA;
 
     if (pal) {
         frame->palette_has_changed = 1;
diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c
index a9b8727468..63e9b5216b 100644
--- a/libavcodec/idctdsp.c
+++ b/libavcodec/idctdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 #include "faanidct.h"
 #include "idctdsp.h"
 #include "simple_idct.h"
+#include "xvididct.h"
 
 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
                                const uint8_t *src_scantable)
@@ -79,11 +80,11 @@ av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
     }
 }
 
-void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
-void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
+void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size);
+void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size);
 
-static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
-                                 int line_size)
+static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                 ptrdiff_t line_size)
 {
     int i;
 
@@ -103,9 +104,41 @@ static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
     }
 }
 
+static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                 int line_size)
+{
+    int i;
+
+    /* read the pixels */
+    for(i=0;i<4;i++) {
+        pixels[0] = av_clip_uint8(block[0]);
+        pixels[1] = av_clip_uint8(block[1]);
+        pixels[2] = av_clip_uint8(block[2]);
+        pixels[3] = av_clip_uint8(block[3]);
+
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                 int line_size)
+{
+    int i;
+
+    /* read the pixels */
+    for(i=0;i<2;i++) {
+        pixels[0] = av_clip_uint8(block[0]);
+        pixels[1] = av_clip_uint8(block[1]);
+
+        pixels += line_size;
+        block += 8;
+    }
+}
+
 static void put_signed_pixels_clamped_c(const int16_t *block,
-                                        uint8_t *restrict pixels,
-                                        int line_size)
+                                        uint8_t *av_restrict pixels,
+                                        ptrdiff_t line_size)
 {
     int i, j;
 
@@ -124,8 +157,8 @@ static void put_signed_pixels_clamped_c(const int16_t *block,
     }
 }
 
-static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
-                                 int line_size)
+static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                 ptrdiff_t line_size)
 {
     int i;
 
@@ -144,47 +177,139 @@ static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
     }
 }
 
+static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
+                          int line_size)
+{
+    int i;
+
+    /* read the pixels */
+    for(i=0;i<4;i++) {
+        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
+        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
+        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
+        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
+                          int line_size)
+{
+    int i;
+
+    /* read the pixels */
+    for(i=0;i<2;i++) {
+        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
+        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_j_rev_dct4 (block);
+    put_pixels_clamped4_c(block, dest, line_size);
+}
+static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_j_rev_dct4 (block);
+    add_pixels_clamped4_c(block, dest, line_size);
+}
+
+static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_j_rev_dct2 (block);
+    put_pixels_clamped2_c(block, dest, line_size);
+}
+static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_j_rev_dct2 (block);
+    add_pixels_clamped2_c(block, dest, line_size);
+}
+
+static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
+{
+    dest[0] = av_clip_uint8((block[0] + 4)>>3);
+}
+static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
+{
+    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
+}
+
 av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx)
 {
     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
 
-    if (avctx->bits_per_raw_sample == 10) {
-        c->idct_put  = ff_simple_idct_put_10;
-        c->idct_add  = ff_simple_idct_add_10;
-        c->idct      = ff_simple_idct_10;
+    if (avctx->lowres==1) {
+        c->idct_put  = ff_jref_idct4_put;
+        c->idct_add  = ff_jref_idct4_add;
+        c->idct      = ff_j_rev_dct4;
         c->perm_type = FF_IDCT_PERM_NONE;
-    } else if (avctx->idct_algo == FF_IDCT_INT) {
-        c->idct_put  = ff_jref_idct_put;
-        c->idct_add  = ff_jref_idct_add;
-        c->idct      = ff_j_rev_dct;
-        c->perm_type = FF_IDCT_PERM_LIBMPEG2;
-#if CONFIG_FAANIDCT
-    } else if (avctx->idct_algo == FF_IDCT_FAAN) {
-        c->idct_put  = ff_faanidct_put;
-        c->idct_add  = ff_faanidct_add;
-        c->idct      = ff_faanidct;
+    } else if (avctx->lowres==2) {
+        c->idct_put  = ff_jref_idct2_put;
+        c->idct_add  = ff_jref_idct2_add;
+        c->idct      = ff_j_rev_dct2;
         c->perm_type = FF_IDCT_PERM_NONE;
-#endif /* CONFIG_FAANIDCT */
-    } else { // accurate/default
-        c->idct_put  = ff_simple_idct_put_8;
-        c->idct_add  = ff_simple_idct_add_8;
-        c->idct      = ff_simple_idct_8;
+    } else if (avctx->lowres==3) {
+        c->idct_put  = ff_jref_idct1_put;
+        c->idct_add  = ff_jref_idct1_add;
+        c->idct      = ff_j_rev_dct1;
         c->perm_type = FF_IDCT_PERM_NONE;
+    } else {
+        if (avctx->bits_per_raw_sample == 10 || avctx->bits_per_raw_sample == 9) {
+            c->idct_put              = ff_simple_idct_put_10;
+            c->idct_add              = ff_simple_idct_add_10;
+            c->idct                  = ff_simple_idct_10;
+            c->perm_type             = FF_IDCT_PERM_NONE;
+        } else if (avctx->bits_per_raw_sample == 12) {
+            c->idct_put              = ff_simple_idct_put_12;
+            c->idct_add              = ff_simple_idct_add_12;
+            c->idct                  = ff_simple_idct_12;
+            c->perm_type             = FF_IDCT_PERM_NONE;
+        } else {
+            if (avctx->idct_algo == FF_IDCT_INT) {
+                c->idct_put  = ff_jref_idct_put;
+                c->idct_add  = ff_jref_idct_add;
+                c->idct      = ff_j_rev_dct;
+                c->perm_type = FF_IDCT_PERM_LIBMPEG2;
+#if CONFIG_FAANIDCT
+            } else if (avctx->idct_algo == FF_IDCT_FAAN) {
+                c->idct_put  = ff_faanidct_put;
+                c->idct_add  = ff_faanidct_add;
+                c->idct      = ff_faanidct;
+                c->perm_type = FF_IDCT_PERM_NONE;
+#endif /* CONFIG_FAANIDCT */
+            } else { // accurate/default
+                c->idct_put  = ff_simple_idct_put_8;
+                c->idct_add  = ff_simple_idct_add_8;
+                c->idct      = ff_simple_idct_8;
+                c->perm_type = FF_IDCT_PERM_NONE;
+            }
+        }
     }
 
     c->put_pixels_clamped        = put_pixels_clamped_c;
     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
     c->add_pixels_clamped        = add_pixels_clamped_c;
 
-    ff_put_pixels_clamped = c->put_pixels_clamped;
-    ff_add_pixels_clamped = c->add_pixels_clamped;
+    if (CONFIG_MPEG4_DECODER && avctx->idct_algo == FF_IDCT_XVID)
+        ff_xvid_idct_init(c, avctx);
 
+    if (ARCH_ALPHA)
+        ff_idctdsp_init_alpha(c, avctx, high_bit_depth);
     if (ARCH_ARM)
         ff_idctdsp_init_arm(c, avctx, high_bit_depth);
     if (ARCH_PPC)
         ff_idctdsp_init_ppc(c, avctx, high_bit_depth);
     if (ARCH_X86)
         ff_idctdsp_init_x86(c, avctx, high_bit_depth);
+    if (ARCH_MIPS)
+        ff_idctdsp_init_mips(c, avctx, high_bit_depth);
+
+    ff_put_pixels_clamped = c->put_pixels_clamped;
+    ff_add_pixels_clamped = c->add_pixels_clamped;
 
     ff_init_scantable_permutation(c->idct_permutation,
                                   c->perm_type);
diff --git a/libavcodec/idctdsp.h b/libavcodec/idctdsp.h
index c49a4ca15a..b180a6762a 100644
--- a/libavcodec/idctdsp.h
+++ b/libavcodec/idctdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,13 +52,13 @@ typedef struct IDCTDSPContext {
     /* pixel ops : interface with DCT */
     void (*put_pixels_clamped)(const int16_t *block /* align 16 */,
                                uint8_t *pixels /* align 8 */,
-                               int line_size);
+                               ptrdiff_t line_size);
     void (*put_signed_pixels_clamped)(const int16_t *block /* align 16 */,
                                       uint8_t *pixels /* align 8 */,
-                                      int line_size);
+                                      ptrdiff_t line_size);
     void (*add_pixels_clamped)(const int16_t *block /* align 16 */,
                                uint8_t *pixels /* align 8 */,
-                               int line_size);
+                               ptrdiff_t line_size);
 
     void (*idct)(int16_t *block /* align 16 */);
 
@@ -95,16 +95,20 @@ typedef struct IDCTDSPContext {
     enum idct_permutation_type perm_type;
 } IDCTDSPContext;
 
-extern void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
-extern void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
+extern void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size);
+extern void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size);
 
 void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx);
 
+void ff_idctdsp_init_alpha(IDCTDSPContext *c, AVCodecContext *avctx,
+                           unsigned high_bit_depth);
 void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
                          unsigned high_bit_depth);
 void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
                          unsigned high_bit_depth);
 void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                          unsigned high_bit_depth);
+void ff_idctdsp_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+                          unsigned high_bit_depth);
 
 #endif /* AVCODEC_IDCTDSP_H */
diff --git a/libavcodec/iff.c b/libavcodec/iff.c
index 0d56cd5837..9d1f9a77d1 100644
--- a/libavcodec/iff.c
+++ b/libavcodec/iff.c
@@ -1,28 +1,28 @@
 /*
- * IFF PBM/ILBM bitmap decoder
+ * IFF ACBM/DEEP/ILBM/PBM bitmap decoder
  * Copyright (c) 2010 Peter Ross <pross@xvid.org>
  * Copyright (c) 2010 Sebastian Vater <cdgs.basty@googlemail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * IFF PBM/ILBM bitmap decoder
+ * IFF ACBM/DEEP/ILBM/PBM bitmap decoder
  */
 
 #include <stdint.h>
@@ -33,11 +33,30 @@
 #include "get_bits.h"
 #include "internal.h"
 
+// TODO: masking bits
+typedef enum {
+    MASK_NONE,
+    MASK_HAS_MASK,
+    MASK_HAS_TRANSPARENT_COLOR,
+    MASK_LASSO
+} mask_type;
+
 typedef struct IffContext {
     AVFrame *frame;
     int planesize;
     uint8_t * planebuf;
+    uint8_t * ham_buf;      ///< temporary buffer for planar to chunky conversation
+    uint32_t *ham_palbuf;   ///< HAM decode table
+    uint32_t *mask_buf;     ///< temporary buffer for palette indices
+    uint32_t *mask_palbuf;  ///< masking palette table
+    unsigned  compression;  ///< delta compression method used
+    unsigned  bpp;          ///< bits per plane to decode (differs from bits_per_coded_sample if HAM)
+    unsigned  ham;          ///< 0 if non-HAM or number of hold bits (6 for bpp > 6, 4 otherwise)
+    unsigned  flags;        ///< 1 for EHB, 0 is no extra half darkening
+    unsigned  transparency; ///< TODO: transparency color index in palette
+    unsigned  masking;      ///< TODO: masking method used
     int init; // 1 if buffer and palette data already initialized, 0 otherwise
+    int16_t   tvdc[16];     ///< TVDC lookup table
 } IffContext;
 
 #define LUT8_PART(plane, v)                             \
@@ -124,25 +143,180 @@ static av_always_inline uint32_t gray2rgb(const uint32_t x) {
  */
 static int cmap_read_palette(AVCodecContext *avctx, uint32_t *pal)
 {
+    IffContext *s = avctx->priv_data;
     int count, i;
+    const uint8_t *const palette = avctx->extradata + AV_RB16(avctx->extradata);
+    int palette_size = avctx->extradata_size - AV_RB16(avctx->extradata);
 
     if (avctx->bits_per_coded_sample > 8) {
-        av_log(avctx, AV_LOG_ERROR, "bit_per_coded_sample > 8 not supported\n");
+        av_log(avctx, AV_LOG_ERROR, "bits_per_coded_sample > 8 not supported\n");
         return AVERROR_INVALIDDATA;
     }
 
     count = 1 << avctx->bits_per_coded_sample;
     // If extradata is smaller than actually needed, fill the remaining with black.
-    count = FFMIN(avctx->extradata_size / 3, count);
+    count = FFMIN(palette_size / 3, count);
     if (count) {
         for (i = 0; i < count; i++)
-            pal[i] = 0xFF000000 | AV_RB24(avctx->extradata + i * 3);
+            pal[i] = 0xFF000000 | AV_RB24(palette + i*3);
+        if (s->flags && count >= 32) { // EHB
+            for (i = 0; i < 32; i++)
+                pal[i + 32] = 0xFF000000 | (AV_RB24(palette + i*3) & 0xFEFEFE) >> 1;
+            count = FFMAX(count, 64);
+        }
     } else { // Create gray-scale color palette for bps < 8
         count = 1 << avctx->bits_per_coded_sample;
 
         for (i = 0; i < count; i++)
             pal[i] = 0xFF000000 | gray2rgb((i * 255) >> avctx->bits_per_coded_sample);
     }
+    if (s->masking == MASK_HAS_MASK) {
+        memcpy(pal + (1 << avctx->bits_per_coded_sample), pal, count * 4);
+        for (i = 0; i < count; i++)
+            pal[i] &= 0xFFFFFF;
+    } else if (s->masking == MASK_HAS_TRANSPARENT_COLOR &&
+        s->transparency < 1 << avctx->bits_per_coded_sample)
+        pal[s->transparency] &= 0xFFFFFF;
+    return 0;
+}
+
+/**
+ * Extracts the IFF extra context and updates internal
+ * decoder structures.
+ *
+ * @param avctx the AVCodecContext where to extract extra context to
+ * @param avpkt the AVPacket to extract extra context from or NULL to use avctx
+ * @return >= 0 in case of success, a negative error code otherwise
+ */
+static int extract_header(AVCodecContext *const avctx,
+                          const AVPacket *const avpkt) {
+    const uint8_t *buf;
+    unsigned buf_size;
+    IffContext *s = avctx->priv_data;
+    int i, palette_size;
+
+    if (avctx->extradata_size < 2) {
+        av_log(avctx, AV_LOG_ERROR, "not enough extradata\n");
+        return AVERROR_INVALIDDATA;
+    }
+    palette_size = avctx->extradata_size - AV_RB16(avctx->extradata);
+
+    if (avpkt) {
+        int image_size;
+        if (avpkt->size < 2)
+            return AVERROR_INVALIDDATA;
+        image_size = avpkt->size - AV_RB16(avpkt->data);
+        buf = avpkt->data;
+        buf_size = bytestream_get_be16(&buf);
+        if (buf_size <= 1 || image_size <= 1) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid image size received: %u -> image data offset: %d\n",
+                   buf_size, image_size);
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        buf = avctx->extradata;
+        buf_size = bytestream_get_be16(&buf);
+        if (buf_size <= 1 || palette_size < 0) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid palette size received: %u -> palette data offset: %d\n",
+                   buf_size, palette_size);
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (buf_size >= 41) {
+        s->compression  = bytestream_get_byte(&buf);
+        s->bpp          = bytestream_get_byte(&buf);
+        s->ham          = bytestream_get_byte(&buf);
+        s->flags        = bytestream_get_byte(&buf);
+        s->transparency = bytestream_get_be16(&buf);
+        s->masking      = bytestream_get_byte(&buf);
+        for (i = 0; i < 16; i++)
+            s->tvdc[i] = bytestream_get_be16(&buf);
+
+        if (s->masking == MASK_HAS_MASK) {
+            if (s->bpp >= 8 && !s->ham) {
+                avctx->pix_fmt = AV_PIX_FMT_RGB32;
+                av_freep(&s->mask_buf);
+                av_freep(&s->mask_palbuf);
+                s->mask_buf = av_malloc((s->planesize * 32) + AV_INPUT_BUFFER_PADDING_SIZE);
+                if (!s->mask_buf)
+                    return AVERROR(ENOMEM);
+                if (s->bpp > 16) {
+                    av_log(avctx, AV_LOG_ERROR, "bpp %d too large for palette\n", s->bpp);
+                    av_freep(&s->mask_buf);
+                    return AVERROR(ENOMEM);
+                }
+                s->mask_palbuf = av_malloc((2 << s->bpp) * sizeof(uint32_t) + AV_INPUT_BUFFER_PADDING_SIZE);
+                if (!s->mask_palbuf) {
+                    av_freep(&s->mask_buf);
+                    return AVERROR(ENOMEM);
+                }
+            }
+            s->bpp++;
+        } else if (s->masking != MASK_NONE && s->masking != MASK_HAS_TRANSPARENT_COLOR) {
+            av_log(avctx, AV_LOG_ERROR, "Masking not supported\n");
+            return AVERROR_PATCHWELCOME;
+        }
+        if (!s->bpp || s->bpp > 32) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid number of bitplanes: %u\n", s->bpp);
+            return AVERROR_INVALIDDATA;
+        } else if (s->ham >= 8) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid number of hold bits for HAM: %u\n", s->ham);
+            return AVERROR_INVALIDDATA;
+        }
+
+        av_freep(&s->ham_buf);
+        av_freep(&s->ham_palbuf);
+
+        if (s->ham) {
+            int i, count = FFMIN(palette_size / 3, 1 << s->ham);
+            int ham_count;
+            const uint8_t *const palette = avctx->extradata + AV_RB16(avctx->extradata);
+
+            s->ham_buf = av_malloc((s->planesize * 8) + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!s->ham_buf)
+                return AVERROR(ENOMEM);
+
+            ham_count = 8 * (1 << s->ham);
+            s->ham_palbuf = av_malloc((ham_count << !!(s->masking == MASK_HAS_MASK)) * sizeof (uint32_t) + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!s->ham_palbuf) {
+                av_freep(&s->ham_buf);
+                return AVERROR(ENOMEM);
+            }
+
+            if (count) { // HAM with color palette attached
+                // prefill with black and palette and set HAM take direct value mask to zero
+                memset(s->ham_palbuf, 0, (1 << s->ham) * 2 * sizeof (uint32_t));
+                for (i=0; i < count; i++) {
+                    s->ham_palbuf[i*2+1] = 0xFF000000 | AV_RL24(palette + i*3);
+                }
+                count = 1 << s->ham;
+            } else { // HAM with grayscale color palette
+                count = 1 << s->ham;
+                for (i=0; i < count; i++) {
+                    s->ham_palbuf[i*2]   = 0xFF000000; // take direct color value from palette
+                    s->ham_palbuf[i*2+1] = 0xFF000000 | av_le2ne32(gray2rgb((i * 255) >> s->ham));
+                }
+            }
+            for (i=0; i < count; i++) {
+                uint32_t tmp = i << (8 - s->ham);
+                tmp |= tmp >> s->ham;
+                s->ham_palbuf[(i+count)*2]     = 0xFF00FFFF; // just modify blue color component
+                s->ham_palbuf[(i+count*2)*2]   = 0xFFFFFF00; // just modify red color component
+                s->ham_palbuf[(i+count*3)*2]   = 0xFFFF00FF; // just modify green color component
+                s->ham_palbuf[(i+count)*2+1]   = 0xFF000000 | tmp << 16;
+                s->ham_palbuf[(i+count*2)*2+1] = 0xFF000000 | tmp;
+                s->ham_palbuf[(i+count*3)*2+1] = 0xFF000000 | tmp << 8;
+            }
+            if (s->masking == MASK_HAS_MASK) {
+                for (i = 0; i < ham_count; i++)
+                    s->ham_palbuf[(1 << s->bpp) + i] = s->ham_palbuf[i] | 0xFF000000;
+            }
+        }
+    }
+
     return 0;
 }
 
@@ -151,6 +325,8 @@ static av_cold int decode_end(AVCodecContext *avctx)
     IffContext *s = avctx->priv_data;
     av_frame_free(&s->frame);
     av_freep(&s->planebuf);
+    av_freep(&s->ham_buf);
+    av_freep(&s->ham_palbuf);
     return 0;
 }
 
@@ -160,11 +336,29 @@ static av_cold int decode_init(AVCodecContext *avctx)
     int err;
 
     if (avctx->bits_per_coded_sample <= 8) {
-        avctx->pix_fmt = (avctx->bits_per_coded_sample < 8 ||
-                          avctx->extradata_size) ? AV_PIX_FMT_PAL8
-                                                 : AV_PIX_FMT_GRAY8;
+        int palette_size;
+
+        if (avctx->extradata_size >= 2)
+            palette_size = avctx->extradata_size - AV_RB16(avctx->extradata);
+        else
+            palette_size = 0;
+        avctx->pix_fmt = (avctx->bits_per_coded_sample < 8) ||
+                         (avctx->extradata_size >= 2 && palette_size) ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_GRAY8;
     } else if (avctx->bits_per_coded_sample <= 32) {
-        avctx->pix_fmt = AV_PIX_FMT_BGR32;
+        if (avctx->codec_tag == MKTAG('R', 'G', 'B', '8')) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        } else if (avctx->codec_tag == MKTAG('R', 'G', 'B', 'N')) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB444;
+        } else if (avctx->codec_tag != MKTAG('D', 'E', 'E', 'P')) {
+            if (avctx->bits_per_coded_sample == 24) {
+                avctx->pix_fmt = AV_PIX_FMT_0BGR32;
+            } else if (avctx->bits_per_coded_sample == 32) {
+                avctx->pix_fmt = AV_PIX_FMT_BGR32;
+            } else {
+                avpriv_request_sample(avctx, "unknown bits_per_coded_sample");
+                return AVERROR_PATCHWELCOME;
+            }
+        }
     } else {
         return AVERROR_INVALIDDATA;
     }
@@ -176,12 +370,16 @@ static av_cold int decode_init(AVCodecContext *avctx)
     if (!s->planebuf)
         return AVERROR(ENOMEM);
 
+    s->bpp = avctx->bits_per_coded_sample;
     s->frame = av_frame_alloc();
     if (!s->frame) {
         decode_end(avctx);
         return AVERROR(ENOMEM);
     }
 
+    if ((err = extract_header(avctx, NULL)) < 0)
+        return err;
+
     return 0;
 }
 
@@ -195,6 +393,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
 static void decodeplane8(uint8_t *dst, const uint8_t *buf, int buf_size, int plane)
 {
     const uint64_t *lut = plane8_lut[plane];
+    if (plane >= 8) {
+        av_log(NULL, AV_LOG_WARNING, "Ignoring extra planes beyond 8\n");
+        return;
+    }
     do {
         uint64_t v = AV_RN64A(dst) | lut[*buf++];
         AV_WN64A(dst, v);
@@ -227,6 +429,47 @@ static void decodeplane32(uint32_t *dst, const uint8_t *buf, int buf_size, int p
     } while (--buf_size);
 }
 
+#define DECODE_HAM_PLANE32(x)       \
+    first       = buf[x] << 1;      \
+    second      = buf[(x)+1] << 1;  \
+    delta      &= pal[first++];     \
+    delta      |= pal[first];       \
+    dst[x]      = delta;            \
+    delta      &= pal[second++];    \
+    delta      |= pal[second];      \
+    dst[(x)+1]  = delta
+
+/**
+ * Converts one line of HAM6/8-encoded chunky buffer to 24bpp.
+ *
+ * @param dst the destination 24bpp buffer
+ * @param buf the source 8bpp chunky buffer
+ * @param pal the HAM decode table
+ * @param buf_size the plane size in bytes
+ */
+static void decode_ham_plane32(uint32_t *dst, const uint8_t  *buf,
+                               const uint32_t *const pal, unsigned buf_size)
+{
+    uint32_t delta = pal[1]; /* first palette entry */
+    do {
+        uint32_t first, second;
+        DECODE_HAM_PLANE32(0);
+        DECODE_HAM_PLANE32(2);
+        DECODE_HAM_PLANE32(4);
+        DECODE_HAM_PLANE32(6);
+        buf += 8;
+        dst += 8;
+    } while (--buf_size);
+}
+
+static void lookup_pal_indicies(uint32_t *dst, const uint32_t *buf,
+                         const uint32_t *const pal, unsigned width)
+{
+    do {
+        *dst++ = pal[*buf++];
+    } while (--width);
+}
+
 /**
  * Decode one complete byterun1 encoded line.
  *
@@ -245,124 +488,385 @@ static int decode_byterun(uint8_t *dst, int dst_size,
         unsigned length;
         const int8_t value = *buf++;
         if (value >= 0) {
-            length = value + 1;
-            memcpy(dst + x, buf, FFMIN3(length, dst_size - x, buf_end - buf));
+            length = FFMIN3(value + 1, dst_size - x, buf_end - buf);
+            memcpy(dst + x, buf, length);
             buf += length;
         } else if (value > -128) {
-            length = -value + 1;
-            memset(dst + x, *buf++, FFMIN(length, dst_size - x));
+            length = FFMIN(-value + 1, dst_size - x);
+            memset(dst + x, *buf++, length);
         } else { // noop
             continue;
         }
         x += length;
     }
+    if (x < dst_size) {
+        av_log(NULL, AV_LOG_WARNING, "decode_byterun ended before plane size\n");
+        memset(dst+x, 0, dst_size - x);
+    }
     return buf - buf_start;
 }
 
-static int decode_frame_ilbm(AVCodecContext *avctx,
-                             void *data, int *got_frame,
-                             AVPacket *avpkt)
-{
-    IffContext *s          = avctx->priv_data;
-    const uint8_t *buf     = avpkt->data;
-    int buf_size           = avpkt->size;
-    const uint8_t *buf_end = buf + buf_size;
-    int y, plane, res;
+#define DECODE_RGBX_COMMON(type) \
+    if (!length) { \
+        length = bytestream2_get_byte(gb); \
+        if (!length) { \
+            length = bytestream2_get_be16(gb); \
+            if (!length) \
+                return; \
+        } \
+    } \
+    for (i = 0; i < length; i++) { \
+        *(type *)(dst + y*linesize + x * sizeof(type)) = pixel; \
+        x += 1; \
+        if (x >= width) { \
+            y += 1; \
+            if (y >= height) \
+                return; \
+            x = 0; \
+        } \
+    }
 
-    if ((res = ff_reget_buffer(avctx, s->frame)) < 0)
-        return res;
+/**
+ * Decode RGB8 buffer
+ * @param[out] dst Destination buffer
+ * @param width Width of destination buffer (pixels)
+ * @param height Height of destination buffer (pixels)
+ * @param linesize Line size of destination buffer (bytes)
+ */
+static void decode_rgb8(GetByteContext *gb, uint8_t *dst, int width, int height, int linesize)
+{
+    int x = 0, y = 0, i, length;
+    while (bytestream2_get_bytes_left(gb) >= 4) {
+        uint32_t pixel = 0xFF000000 | bytestream2_get_be24(gb);
+        length = bytestream2_get_byte(gb) & 0x7F;
+        DECODE_RGBX_COMMON(uint32_t)
+    }
+}
 
-    if (!s->init && avctx->bits_per_coded_sample <= 8 &&
-        avctx->pix_fmt != AV_PIX_FMT_GRAY8) {
-        if ((res = cmap_read_palette(avctx, (uint32_t *)s->frame->data[1])) < 0)
-            return res;
+/**
+ * Decode RGBN buffer
+ * @param[out] dst Destination buffer
+ * @param width Width of destination buffer (pixels)
+ * @param height Height of destination buffer (pixels)
+ * @param linesize Line size of destination buffer (bytes)
+ */
+static void decode_rgbn(GetByteContext *gb, uint8_t *dst, int width, int height, int linesize)
+{
+    int x = 0, y = 0, i, length;
+    while (bytestream2_get_bytes_left(gb) >= 2) {
+        uint32_t pixel = bytestream2_get_be16u(gb);
+        length = pixel & 0x7;
+        pixel >>= 4;
+        DECODE_RGBX_COMMON(uint16_t)
     }
-    s->init = 1;
+}
 
-    if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M')) { // interleaved
-        if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
-            for (y = 0; y < avctx->height; y++) {
-                uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-                memset(row, 0, avctx->width);
-                for (plane = 0; plane < avctx->bits_per_coded_sample && buf < buf_end;
-                     plane++) {
-                    decodeplane8(row, buf, FFMIN(s->planesize, buf_end - buf), plane);
-                    buf += s->planesize;
+/**
+ * Decode DEEP RLE 32-bit buffer
+ * @param[out] dst Destination buffer
+ * @param[in] src Source buffer
+ * @param src_size Source buffer size (bytes)
+ * @param width Width of destination buffer (pixels)
+ * @param height Height of destination buffer (pixels)
+ * @param linesize Line size of destination buffer (bytes)
+ */
+static void decode_deep_rle32(uint8_t *dst, const uint8_t *src, int src_size, int width, int height, int linesize)
+{
+    const uint8_t *src_end = src + src_size;
+    int x = 0, y = 0, i;
+    while (src + 5 <= src_end) {
+        int opcode;
+        opcode = *(int8_t *)src++;
+        if (opcode >= 0) {
+            int size = opcode + 1;
+            for (i = 0; i < size; i++) {
+                int length = FFMIN(size - i, width);
+                memcpy(dst + y*linesize + x * 4, src, length * 4);
+                src += length * 4;
+                x += length;
+                i += length;
+                if (x >= width) {
+                    x = 0;
+                    y += 1;
+                    if (y >= height)
+                        return;
                 }
             }
-        } else { // AV_PIX_FMT_BGR32
-            for (y = 0; y < avctx->height; y++) {
-                uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-                memset(row, 0, avctx->width << 2);
-                for (plane = 0; plane < avctx->bits_per_coded_sample && buf < buf_end;
-                     plane++) {
-                    decodeplane32((uint32_t *)row, buf,
-                                  FFMIN(s->planesize, buf_end - buf), plane);
-                    buf += s->planesize;
+        } else {
+            int size = -opcode + 1;
+            uint32_t pixel = AV_RN32(src);
+            for (i = 0; i < size; i++) {
+                *(uint32_t *)(dst + y*linesize + x * 4) = pixel;
+                x += 1;
+                if (x >= width) {
+                    x = 0;
+                    y += 1;
+                    if (y >= height)
+                        return;
                 }
             }
-        }
-    } else if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) { // IFF-PBM
-        for (y = 0; y < avctx->height && buf < buf_end; y++) {
-            uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-            memcpy(row, buf, FFMIN(avctx->width, buf_end - buf));
-            buf += avctx->width + (avctx->width % 2); // padding if odd
+            src += 4;
         }
     }
+}
 
-    if ((res = av_frame_ref(data, s->frame)) < 0)
-        return res;
-
-    *got_frame = 1;
+/**
+ * Decode DEEP TVDC 32-bit buffer
+ * @param[out] dst Destination buffer
+ * @param[in] src Source buffer
+ * @param src_size Source buffer size (bytes)
+ * @param width Width of destination buffer (pixels)
+ * @param height Height of destination buffer (pixels)
+ * @param linesize Line size of destination buffer (bytes)
+ * @param[int] tvdc TVDC lookup table
+ */
+static void decode_deep_tvdc32(uint8_t *dst, const uint8_t *src, int src_size, int width, int height, int linesize, const int16_t *tvdc)
+{
+    int x = 0, y = 0, plane = 0;
+    int8_t pixel = 0;
+    int i, j;
+
+    for (i = 0; i < src_size * 2;) {
+#define GETNIBBLE ((i & 1) ?  (src[i>>1] & 0xF) : (src[i>>1] >> 4))
+        int d = tvdc[GETNIBBLE];
+        i++;
+        if (d) {
+            pixel += d;
+            dst[y * linesize + x*4 + plane] = pixel;
+            x++;
+        } else {
+            if (i >= src_size * 2)
+                return;
+            d = GETNIBBLE + 1;
+            i++;
+            d = FFMIN(d, width - x);
+            for (j = 0; j < d; j++) {
+                dst[y * linesize + x*4 + plane] = pixel;
+                x++;
+            }
+        }
+        if (x >= width) {
+            plane++;
+            if (plane >= 4) {
+                y++;
+                if (y >= height)
+                    return;
+                plane = 0;
+            }
+            x = 0;
+            pixel = 0;
+            i = (i + 1) & ~1;
+        }
+    }
+}
 
-    return buf_size;
+static int unsupported(AVCodecContext *avctx)
+{
+    IffContext *s = avctx->priv_data;
+    avpriv_request_sample(avctx, "bitmap (compression %i, bpp %i, ham %i)", s->compression, s->bpp, s->ham);
+    return AVERROR_INVALIDDATA;
 }
 
-static int decode_frame_byterun1(AVCodecContext *avctx,
-                                 void *data, int *got_frame,
-                                 AVPacket *avpkt)
+static int decode_frame(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
 {
     IffContext *s          = avctx->priv_data;
-    const uint8_t *buf     = avpkt->data;
-    int buf_size           = avpkt->size;
+    const uint8_t *buf     = avpkt->size >= 2 ? avpkt->data + AV_RB16(avpkt->data) : NULL;
+    const int buf_size     = avpkt->size >= 2 ? avpkt->size - AV_RB16(avpkt->data) : 0;
     const uint8_t *buf_end = buf + buf_size;
     int y, plane, res;
+    GetByteContext gb;
+    const AVPixFmtDescriptor *desc;
 
+    if ((res = extract_header(avctx, avpkt)) < 0)
+        return res;
     if ((res = ff_reget_buffer(avctx, s->frame)) < 0)
         return res;
 
+    desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+
     if (!s->init && avctx->bits_per_coded_sample <= 8 &&
-        avctx->pix_fmt != AV_PIX_FMT_GRAY8) {
+        avctx->pix_fmt == AV_PIX_FMT_PAL8) {
         if ((res = cmap_read_palette(avctx, (uint32_t *)s->frame->data[1])) < 0)
             return res;
+    } else if (!s->init && avctx->bits_per_coded_sample <= 8 &&
+               avctx->pix_fmt == AV_PIX_FMT_RGB32) {
+        if ((res = cmap_read_palette(avctx, s->mask_palbuf)) < 0)
+            return res;
     }
     s->init = 1;
 
-    if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M')) { // interleaved
-        if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
-            for (y = 0; y < avctx->height; y++) {
+    switch (s->compression) {
+    case 0:
+        if (avctx->codec_tag == MKTAG('A', 'C', 'B', 'M')) {
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                memset(s->frame->data[0], 0, avctx->height * s->frame->linesize[0]);
+                for (plane = 0; plane < s->bpp; plane++) {
+                    for (y = 0; y < avctx->height && buf < buf_end; y++) {
+                        uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                        decodeplane8(row, buf, FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
+                }
+            } else if (s->ham) { // HAM to AV_PIX_FMT_BGR32
+                memset(s->frame->data[0], 0, avctx->height * s->frame->linesize[0]);
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                    memset(s->ham_buf, 0, s->planesize * 8);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        const uint8_t * start = buf + (plane * avctx->height + y) * s->planesize;
+                        if (start >= buf_end)
+                            break;
+                        decodeplane8(s->ham_buf, start, FFMIN(s->planesize, buf_end - start), plane);
+                    }
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else
+                return unsupported(avctx);
+        } else if (avctx->codec_tag == MKTAG('D', 'E', 'E', 'P')) {
+            int raw_width = avctx->width * (av_get_bits_per_pixel(desc) >> 3);
+            int x;
+            for (y = 0; y < avctx->height && buf < buf_end; y++) {
                 uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-                memset(row, 0, avctx->width);
-                for (plane = 0; plane < avctx->bits_per_coded_sample; plane++) {
-                    buf += decode_byterun(s->planebuf, s->planesize, buf, buf_end);
-                    decodeplane8(row, s->planebuf, s->planesize, plane);
+                memcpy(row, buf, FFMIN(raw_width, buf_end - buf));
+                buf += raw_width;
+                if (avctx->pix_fmt == AV_PIX_FMT_BGR32) {
+                    for (x = 0; x < avctx->width; x++)
+                        row[4 * x + 3] = row[4 * x + 3] & 0xF0 | (row[4 * x + 3] >> 4);
                 }
             }
-        } else { // AV_PIX_FMT_BGR32
-            for (y = 0; y < avctx->height; y++) {
-                uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-                memset(row, 0, avctx->width << 2);
-                for (plane = 0; plane < avctx->bits_per_coded_sample; plane++) {
-                    buf += decode_byterun(s->planebuf, s->planesize, buf, buf_end);
-                    decodeplane32((uint32_t *)row, s->planebuf, s->planesize, plane);
+        } else if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M')) { // interleaved
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                    memset(row, 0, avctx->width);
+                    for (plane = 0; plane < s->bpp && buf < buf_end; plane++) {
+                        decodeplane8(row, buf, FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
+                }
+            } else if (s->ham) { // HAM to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                    memset(s->ham_buf, 0, s->planesize * 8);
+                    for (plane = 0; plane < s->bpp && buf < buf_end; plane++) {
+                        decodeplane8(s->ham_buf, buf, FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else { // AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                    memset(row, 0, avctx->width << 2);
+                    for (plane = 0; plane < s->bpp && buf < buf_end; plane++) {
+                        decodeplane32((uint32_t *)row, buf,
+                                      FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
                 }
             }
+        } else if (avctx->codec_tag == MKTAG('P', 'B', 'M', ' ')) { // IFF-PBM
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                for (y = 0; y < avctx->height && buf_end > buf; y++) {
+                    uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                    memcpy(row, buf, FFMIN(avctx->width, buf_end - buf));
+                    buf += avctx->width + (avctx->width % 2); // padding if odd
+                }
+            } else if (s->ham) { // IFF-PBM: HAM to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height && buf_end > buf; y++) {
+                    uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                    memcpy(s->ham_buf, buf, FFMIN(avctx->width, buf_end - buf));
+                    buf += avctx->width + (avctx->width & 1); // padding if odd
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else
+                return unsupported(avctx);
         }
-    } else {
-        for (y = 0; y < avctx->height; y++) {
-            uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-            buf += decode_byterun(row, avctx->width, buf, buf_end);
+        break;
+    case 1:
+        if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M')) { // interleaved
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                    memset(row, 0, avctx->width);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        buf += decode_byterun(s->planebuf, s->planesize, buf, buf_end);
+                        decodeplane8(row, s->planebuf, s->planesize, plane);
+                    }
+                }
+            } else if (avctx->bits_per_coded_sample <= 8) { //8-bit (+ mask) to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                    memset(s->mask_buf, 0, avctx->width * sizeof(uint32_t));
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        buf += decode_byterun(s->planebuf, s->planesize, buf, buf_end);
+                        decodeplane32(s->mask_buf, s->planebuf, s->planesize, plane);
+                    }
+                    lookup_pal_indicies((uint32_t *)row, s->mask_buf, s->mask_palbuf, avctx->width);
+                }
+            } else if (s->ham) { // HAM to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                    memset(s->ham_buf, 0, s->planesize * 8);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        buf += decode_byterun(s->planebuf, s->planesize, buf, buf_end);
+                        decodeplane8(s->ham_buf, s->planebuf, s->planesize, plane);
+                    }
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else { // AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                    memset(row, 0, avctx->width << 2);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        buf += decode_byterun(s->planebuf, s->planesize, buf, buf_end);
+                        decodeplane32((uint32_t *)row, s->planebuf, s->planesize, plane);
+                    }
+                }
+            }
+        } else if (avctx->codec_tag == MKTAG('P', 'B', 'M', ' ')) { // IFF-PBM
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                    buf += decode_byterun(row, avctx->width, buf, buf_end);
+                }
+            } else if (s->ham) { // IFF-PBM: HAM to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                    buf += decode_byterun(s->ham_buf, avctx->width, buf, buf_end);
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else
+                return unsupported(avctx);
+        } else if (avctx->codec_tag == MKTAG('D', 'E', 'E', 'P')) { // IFF-DEEP
+            if (av_get_bits_per_pixel(desc) == 32)
+                decode_deep_rle32(s->frame->data[0], buf, buf_size, avctx->width, avctx->height, s->frame->linesize[0]);
+            else
+                return unsupported(avctx);
         }
+        break;
+    case 4:
+        bytestream2_init(&gb, buf, buf_size);
+        if (avctx->codec_tag == MKTAG('R', 'G', 'B', '8') && avctx->pix_fmt == AV_PIX_FMT_RGB32)
+            decode_rgb8(&gb, s->frame->data[0], avctx->width, avctx->height, s->frame->linesize[0]);
+        else if (avctx->codec_tag == MKTAG('R', 'G', 'B', 'N') && avctx->pix_fmt == AV_PIX_FMT_RGB444)
+            decode_rgbn(&gb, s->frame->data[0], avctx->width, avctx->height, s->frame->linesize[0]);
+        else
+            return unsupported(avctx);
+        break;
+    case 5:
+        if (avctx->codec_tag == MKTAG('D', 'E', 'E', 'P')) {
+            if (av_get_bits_per_pixel(desc) == 32)
+                decode_deep_tvdc32(s->frame->data[0], buf, buf_size, avctx->width, avctx->height, s->frame->linesize[0], s->tvdc);
+            else
+                return unsupported(avctx);
+        } else
+            return unsupported(avctx);
+        break;
+    default:
+        return unsupported(avctx);
     }
 
     if ((res = av_frame_ref(data, s->frame)) < 0)
@@ -373,26 +877,29 @@ static int decode_frame_byterun1(AVCodecContext *avctx,
     return buf_size;
 }
 
+#if CONFIG_IFF_ILBM_DECODER
 AVCodec ff_iff_ilbm_decoder = {
-    .name           = "iff_ilbm",
-    .long_name      = NULL_IF_CONFIG_SMALL("IFF ILBM"),
+    .name           = "iff",
+    .long_name      = NULL_IF_CONFIG_SMALL("IFF"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_IFF_ILBM,
     .priv_data_size = sizeof(IffContext),
     .init           = decode_init,
     .close          = decode_end,
-    .decode         = decode_frame_ilbm,
+    .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
-
+#endif
+#if CONFIG_IFF_BYTERUN1_DECODER
 AVCodec ff_iff_byterun1_decoder = {
-    .name           = "iff_byterun1",
-    .long_name      = NULL_IF_CONFIG_SMALL("IFF ByteRun1"),
+    .name           = "iff",
+    .long_name      = NULL_IF_CONFIG_SMALL("IFF"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_IFF_BYTERUN1,
     .priv_data_size = sizeof(IffContext),
     .init           = decode_init,
     .close          = decode_end,
-    .decode         = decode_frame_byterun1,
+    .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
diff --git a/libavcodec/iirfilter.c b/libavcodec/iirfilter.c
index 40a543da4c..cb5871cb29 100644
--- a/libavcodec/iirfilter.c
+++ b/libavcodec/iirfilter.c
@@ -2,20 +2,20 @@
  * IIR filter
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -196,7 +196,7 @@ av_cold struct FFIIRFilterCoeffs* ff_iir_filter_init_coeffs(void *avc,
         return c;
 
 init_fail:
-    ff_iir_filter_free_coeffs(c);
+    ff_iir_filter_free_coeffsp(&c);
     return NULL;
 }
 
@@ -299,18 +299,26 @@ void ff_iir_filter_flt(const struct FFIIRFilterCoeffs *c,
     }
 }
 
-av_cold void ff_iir_filter_free_state(struct FFIIRFilterState *state)
+av_cold void ff_iir_filter_free_statep(struct FFIIRFilterState **state)
 {
-    av_free(state);
+    av_freep(state);
 }
 
-av_cold void ff_iir_filter_free_coeffs(struct FFIIRFilterCoeffs *coeffs)
+av_cold void ff_iir_filter_free_coeffsp(struct FFIIRFilterCoeffs **coeffsp)
 {
+    struct FFIIRFilterCoeffs *coeffs = *coeffsp;
     if(coeffs){
-        av_free(coeffs->cx);
-        av_free(coeffs->cy);
+        av_freep(&coeffs->cx);
+        av_freep(&coeffs->cy);
     }
-    av_free(coeffs);
+    av_freep(coeffsp);
+}
+
+void ff_iir_filter_init(FFIIRFilterContext *f) {
+    f->filter_flt = ff_iir_filter_flt;
+
+    if (HAVE_MIPSFPU)
+        ff_iir_filter_init_mips(f);
 }
 
 #ifdef TEST
@@ -340,8 +348,8 @@ int main(void)
     for (i = 0; i < SIZE; i++)
         printf("%6d %6d\n", x[i], y[i]);
 
-    ff_iir_filter_free_coeffs(fcoeffs);
-    ff_iir_filter_free_state(fstate);
+    ff_iir_filter_free_coeffsp(&fcoeffs);
+    ff_iir_filter_free_statep(&fstate);
     return 0;
 }
 #endif /* TEST */
diff --git a/libavcodec/iirfilter.h b/libavcodec/iirfilter.h
index bc65a96b59..6f7bba67ac 100644
--- a/libavcodec/iirfilter.h
+++ b/libavcodec/iirfilter.h
@@ -2,20 +2,20 @@
  * IIR filter
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,6 +47,29 @@ enum IIRFilterMode{
     FF_FILTER_MODE_BANDSTOP,
 };
 
+typedef struct FFIIRFilterContext {
+    /**
+    * Perform IIR filtering on floating-point input samples.
+    *
+    * @param coeffs pointer to filter coefficients
+    * @param state  pointer to filter state
+    * @param size   input length
+    * @param src    source samples
+    * @param sstep  source stride
+    * @param dst    filtered samples (destination may be the same as input)
+    * @param dstep  destination stride
+    */
+    void (*filter_flt)(const struct FFIIRFilterCoeffs *coeffs,
+                        struct FFIIRFilterState *state, int size,
+                        const float *src, int sstep, float *dst, int dstep);
+} FFIIRFilterContext;
+
+/**
+ * Initialize FFIIRFilterContext
+ */
+void ff_iir_filter_init(FFIIRFilterContext *f);
+void ff_iir_filter_init_mips(FFIIRFilterContext *f);
+
 /**
  * Initialize filter coefficients.
  *
@@ -81,14 +104,14 @@ struct FFIIRFilterState* ff_iir_filter_init_state(int order);
  *
  * @param coeffs pointer allocated with ff_iir_filter_init_coeffs()
  */
-void ff_iir_filter_free_coeffs(struct FFIIRFilterCoeffs *coeffs);
+void ff_iir_filter_free_coeffsp(struct FFIIRFilterCoeffs **coeffs);
 
 /**
- * Free filter state.
+ * Free and zero filter state.
  *
- * @param state pointer allocated with ff_iir_filter_init_state()
+ * @param state pointer to pointer allocated with ff_iir_filter_init_state()
  */
-void ff_iir_filter_free_state(struct FFIIRFilterState *state);
+void ff_iir_filter_free_statep(struct FFIIRFilterState **state);
 
 /**
  * Perform IIR filtering on signed 16-bit input samples.
diff --git a/libavcodec/imc.c b/libavcodec/imc.c
index 498fdd578e..14f9fa30a5 100644
--- a/libavcodec/imc.c
+++ b/libavcodec/imc.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,6 +38,7 @@
 #include "libavutil/channel_layout.h"
 #include "libavutil/float_dsp.h"
 #include "libavutil/internal.h"
+#include "libavutil/libm.h"
 #include "avcodec.h"
 #include "bswapdsp.h"
 #include "get_bits.h"
@@ -95,7 +96,7 @@ typedef struct IMCContext {
     GetBitContext gb;
 
     BswapDSPContext bdsp;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext fft;
     DECLARE_ALIGNED(32, FFTComplex, samples)[COEFFS / 2];
     float *out_samples;
@@ -179,6 +180,14 @@ static av_cold int imc_decode_init(AVCodecContext *avctx)
     IMCContext *q = avctx->priv_data;
     double r1, r2;
 
+    if (avctx->codec_id == AV_CODEC_ID_IAC && avctx->sample_rate > 96000) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Strange sample rate of %i, file likely corrupt or "
+               "needing a new table derivation method.\n",
+               avctx->sample_rate);
+        return AVERROR_PATCHWELCOME;
+    }
+
     if (avctx->codec_id == AV_CODEC_ID_IMC)
         avctx->channels = 1;
 
@@ -247,7 +256,13 @@ static av_cold int imc_decode_init(AVCodecContext *avctx)
         return ret;
     }
     ff_bswapdsp_init(&q->bdsp);
-    avpriv_float_dsp_init(&q->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    q->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!q->fdsp) {
+        ff_fft_end(&q->fft);
+
+        return AVERROR(ENOMEM);
+    }
+
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLTP;
     avctx->channel_layout = avctx->channels == 1 ? AV_CH_LAYOUT_MONO
                                                  : AV_CH_LAYOUT_STEREO;
@@ -356,7 +371,7 @@ static void imc_decode_level_coefficients(IMCContext *q, int *levlCoeffBuf,
     float tmp, tmp2;
     // maybe some frequency division thingy
 
-    flcoeffs1[0] = 20000.0 / pow (2, levlCoeffBuf[0] * 0.18945); // 0.18945 = log2(10) * 0.05703125
+    flcoeffs1[0] = 20000.0 / exp2 (levlCoeffBuf[0] * 0.18945); // 0.18945 = log2(10) * 0.05703125
     flcoeffs2[0] = log2f(flcoeffs1[0]);
     tmp  = flcoeffs1[0];
     tmp2 = flcoeffs2[0];
@@ -450,8 +465,13 @@ static int bit_allocation(IMCContext *q, IMCChannel *chctx,
     for (i = 0; i < BANDS; i++)
         highest = FFMAX(highest, chctx->flcoeffs1[i]);
 
-    for (i = 0; i < BANDS - 1; i++)
+    for (i = 0; i < BANDS - 1; i++) {
+        if (chctx->flcoeffs5[i] <= 0) {
+            av_log(NULL, AV_LOG_ERROR, "flcoeffs5 %f invalid\n", chctx->flcoeffs5[i]);
+            return AVERROR_INVALIDDATA;
+        }
         chctx->flcoeffs4[i] = chctx->flcoeffs3[i] - log2f(chctx->flcoeffs5[i]);
+    }
     chctx->flcoeffs4[BANDS - 1] = limit;
 
     highest = highest * 0.25;
@@ -770,7 +790,8 @@ static int inverse_quant_coeff(IMCContext *q, IMCChannel *chctx,
 }
 
 
-static int imc_get_coeffs(IMCContext *q, IMCChannel *chctx)
+static void imc_get_coeffs(AVCodecContext *avctx,
+                           IMCContext *q, IMCChannel *chctx)
 {
     int i, j, cw_len, cw;
 
@@ -782,19 +803,19 @@ static int imc_get_coeffs(IMCContext *q, IMCChannel *chctx)
                 cw_len = chctx->CWlengthT[j];
                 cw = 0;
 
-                if (get_bits_count(&q->gb) + cw_len > 512) {
-                    ff_dlog(NULL, "Band %i coeff %i cw_len %i\n", i, j, cw_len);
-                    return AVERROR_INVALIDDATA;
+                if (cw_len && (!chctx->bandFlagsBuf[i] || !chctx->skipFlags[j])) {
+                    if (get_bits_count(&q->gb) + cw_len > 512) {
+                        av_log(avctx, AV_LOG_WARNING,
+                            "Potential problem on band %i, coefficient %i"
+                            ": cw_len=%i\n", i, j, cw_len);
+                    } else
+                        cw = get_bits(&q->gb, cw_len);
                 }
 
-                if (cw_len && (!chctx->bandFlagsBuf[i] || !chctx->skipFlags[j]))
-                    cw = get_bits(&q->gb, cw_len);
-
                 chctx->codewords[j] = cw;
             }
         }
     }
-    return 0;
 }
 
 static void imc_refine_bit_allocation(IMCContext *q, IMCChannel *chctx)
@@ -887,6 +908,13 @@ static int imc_decode_block(AVCodecContext *avctx, IMCContext *q, int ch)
         imc_decode_level_coefficients2(q, chctx->levlCoeffBuf, chctx->old_floor,
                                        chctx->flcoeffs1, chctx->flcoeffs2);
 
+    for(i=0; i<BANDS; i++) {
+        if(chctx->flcoeffs1[i] > INT_MAX) {
+            av_log(avctx, AV_LOG_ERROR, "scalefactor out of range\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     memcpy(chctx->old_floor, chctx->flcoeffs1, 32 * sizeof(float));
 
     counter = 0;
@@ -968,11 +996,7 @@ static int imc_decode_block(AVCodecContext *avctx, IMCContext *q, int ch)
 
     memset(chctx->codewords, 0, sizeof(chctx->codewords));
 
-    if (imc_get_coeffs(q, chctx) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Read coefficients failed\n");
-        chctx->decoder_reset = 1;
-        return AVERROR_INVALIDDATA;
-    }
+    imc_get_coeffs(avctx, q, chctx);
 
     if (inverse_quant_coeff(q, chctx, stream_format_code) < 0) {
         av_log(avctx, AV_LOG_ERROR, "Inverse quantization of coefficients failed\n");
@@ -1006,10 +1030,8 @@ static int imc_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = COEFFS;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     for (i = 0; i < avctx->channels; i++) {
         q->out_samples = (float *)frame->extended_data[i];
@@ -1025,7 +1047,7 @@ static int imc_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     if (avctx->channels == 2) {
-        q->fdsp.butterflies_float((float *)frame->extended_data[0],
+        q->fdsp->butterflies_float((float *)frame->extended_data[0],
                                   (float *)frame->extended_data[1], COEFFS);
     }
 
@@ -1034,17 +1056,25 @@ static int imc_decode_frame(AVCodecContext *avctx, void *data,
     return IMC_BLOCK_SIZE * avctx->channels;
 }
 
-
 static av_cold int imc_decode_close(AVCodecContext * avctx)
 {
     IMCContext *q = avctx->priv_data;
 
     ff_fft_end(&q->fft);
+    av_freep(&q->fdsp);
 
     return 0;
 }
 
+static av_cold void flush(AVCodecContext *avctx)
+{
+    IMCContext *q = avctx->priv_data;
+
+    q->chctx[0].decoder_reset =
+    q->chctx[1].decoder_reset = 1;
+}
 
+#if CONFIG_IMC_DECODER
 AVCodec ff_imc_decoder = {
     .name           = "imc",
     .long_name      = NULL_IF_CONFIG_SMALL("IMC (Intel Music Coder)"),
@@ -1054,11 +1084,13 @@ AVCodec ff_imc_decoder = {
     .init           = imc_decode_init,
     .close          = imc_decode_close,
     .decode         = imc_decode_frame,
+    .flush          = flush,
     .capabilities   = AV_CODEC_CAP_DR1,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_IAC_DECODER
 AVCodec ff_iac_decoder = {
     .name           = "iac",
     .long_name      = NULL_IF_CONFIG_SMALL("IAC (Indeo Audio Coder)"),
@@ -1068,7 +1100,9 @@ AVCodec ff_iac_decoder = {
     .init           = imc_decode_init,
     .close          = imc_decode_close,
     .decode         = imc_decode_frame,
+    .flush          = flush,
     .capabilities   = AV_CODEC_CAP_DR1,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
+#endif
diff --git a/libavcodec/imcdata.h b/libavcodec/imcdata.h
index 8e99391d61..64e7c7185e 100644
--- a/libavcodec/imcdata.h
+++ b/libavcodec/imcdata.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/imdct15.c b/libavcodec/imdct15.c
index e02e9cedf6..e91aa11085 100644
--- a/libavcodec/imdct15.c
+++ b/libavcodec/imdct15.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,11 +105,11 @@ av_cold int ff_imdct15_init(IMDCT15Context **ps, int N)
     s->len4 = len2 / 2;
     s->len2 = len2;
 
-    s->tmp  = av_malloc(len * 2 * sizeof(*s->tmp));
+    s->tmp  = av_malloc_array(len, 2 * sizeof(*s->tmp));
     if (!s->tmp)
         goto fail;
 
-    s->twiddle_exptab  = av_malloc(s->len4 * sizeof(*s->twiddle_exptab));
+    s->twiddle_exptab  = av_malloc_array(s->len4, sizeof(*s->twiddle_exptab));
     if (!s->twiddle_exptab)
         goto fail;
 
diff --git a/libavcodec/imdct15.h b/libavcodec/imdct15.h
index ed3f0039e8..1979aa76af 100644
--- a/libavcodec/imdct15.h
+++ b/libavcodec/imdct15.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/imgconvert.c b/libavcodec/imgconvert.c
index 53e7df810a..2241cc1760 100644
--- a/libavcodec/imgconvert.c
+++ b/libavcodec/imgconvert.c
@@ -2,20 +2,20 @@
  * Misc image conversion routines
  * Copyright (c) 2001, 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@
 #include "imgconvert.h"
 #include "internal.h"
 #include "mathops.h"
+#include "libavutil/avassert.h"
 #include "libavutil/colorspace.h"
 #include "libavutil/common.h"
 #include "libavutil/pixdesc.h"
@@ -42,121 +43,49 @@
 void avcodec_get_chroma_sub_sample(enum AVPixelFormat pix_fmt, int *h_shift, int *v_shift)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+    av_assert0(desc);
     *h_shift = desc->log2_chroma_w;
     *v_shift = desc->log2_chroma_h;
 }
 
-static int is_gray(const AVPixFmtDescriptor *desc)
-{
-    return desc->nb_components - (desc->flags & AV_PIX_FMT_FLAG_ALPHA) == 1;
-}
-
 int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt,
                              enum AVPixelFormat src_pix_fmt,
                              int has_alpha)
 {
-    const AVPixFmtDescriptor *src_desc = av_pix_fmt_desc_get(src_pix_fmt);
-    const AVPixFmtDescriptor *dst_desc = av_pix_fmt_desc_get(dst_pix_fmt);
-    int loss, i, nb_components = FFMIN(src_desc->nb_components,
-                                       dst_desc->nb_components);
-
-    /* compute loss */
-    loss = 0;
-
-    if (dst_pix_fmt == src_pix_fmt)
-        return 0;
-
-    for (i = 0; i < nb_components; i++)
-        if (src_desc->comp[i].depth > dst_desc->comp[i].depth)
-            loss |= FF_LOSS_DEPTH;
-
-    if (dst_desc->log2_chroma_w > src_desc->log2_chroma_w ||
-        dst_desc->log2_chroma_h > src_desc->log2_chroma_h)
-        loss |= FF_LOSS_RESOLUTION;
-
-    if ((src_desc->flags & AV_PIX_FMT_FLAG_RGB) != (dst_desc->flags & AV_PIX_FMT_FLAG_RGB))
-        loss |= FF_LOSS_COLORSPACE;
-
-    if (has_alpha && !(dst_desc->flags & AV_PIX_FMT_FLAG_ALPHA) &&
-         (src_desc->flags & AV_PIX_FMT_FLAG_ALPHA))
-        loss |= FF_LOSS_ALPHA;
-
-    if (dst_pix_fmt == AV_PIX_FMT_PAL8 && !is_gray(src_desc))
-        return loss | FF_LOSS_COLORQUANT;
-
-    if (src_desc->nb_components > dst_desc->nb_components)
-        if (is_gray(dst_desc))
-            loss |= FF_LOSS_CHROMA;
-
-    return loss;
+    return av_get_pix_fmt_loss(dst_pix_fmt, src_pix_fmt, has_alpha);
 }
 
-static enum AVPixelFormat avcodec_find_best_pix_fmt1(enum AVPixelFormat *pix_fmt_list,
-                                      enum AVPixelFormat src_pix_fmt,
-                                      int has_alpha,
-                                      int loss_mask)
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr)
 {
-    int dist, i, loss, min_dist;
-    enum AVPixelFormat dst_pix_fmt;
-
-    /* find exact color match with smallest size */
-    dst_pix_fmt = AV_PIX_FMT_NONE;
-    min_dist = 0x7fffffff;
-    i = 0;
-    while (pix_fmt_list[i] != AV_PIX_FMT_NONE) {
-        enum AVPixelFormat pix_fmt = pix_fmt_list[i];
-
-        if (i > AV_PIX_FMT_NB) {
-            av_log(NULL, AV_LOG_ERROR, "Pixel format list longer than expected, "
-                   "it is either not properly terminated or contains duplicates\n");
-            return AV_PIX_FMT_NONE;
-        }
-
-        loss = avcodec_get_pix_fmt_loss(pix_fmt, src_pix_fmt, has_alpha) & loss_mask;
-        if (loss == 0) {
-            dist = av_get_bits_per_pixel(av_pix_fmt_desc_get(pix_fmt));
-            if (dist < min_dist) {
-                min_dist = dist;
-                dst_pix_fmt = pix_fmt;
-            }
-        }
-        i++;
-    }
-    return dst_pix_fmt;
+    return av_find_best_pix_fmt_of_2(dst_pix_fmt1, dst_pix_fmt2, src_pix_fmt, has_alpha, loss_ptr);
 }
 
-enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat *pix_fmt_list,
+#if AV_HAVE_INCOMPATIBLE_LIBAV_ABI
+enum AVPixelFormat avcodec_find_best_pix_fmt2(const enum AVPixelFormat *pix_fmt_list,
                                             enum AVPixelFormat src_pix_fmt,
-                                            int has_alpha, int *loss_ptr)
+                                            int has_alpha, int *loss_ptr){
+    return avcodec_find_best_pix_fmt_of_list(pix_fmt_list, src_pix_fmt, has_alpha, loss_ptr);
+}
+#else
+enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr)
 {
-    enum AVPixelFormat dst_pix_fmt;
-    int loss_mask, i;
-    static const int loss_mask_order[] = {
-        ~0, /* no loss first */
-        ~FF_LOSS_ALPHA,
-        ~FF_LOSS_RESOLUTION,
-        ~(FF_LOSS_COLORSPACE | FF_LOSS_RESOLUTION),
-        ~FF_LOSS_COLORQUANT,
-        ~FF_LOSS_DEPTH,
-        0,
-    };
-
-    /* try with successive loss */
-    i = 0;
-    for(;;) {
-        loss_mask = loss_mask_order[i++];
-        dst_pix_fmt = avcodec_find_best_pix_fmt1(pix_fmt_list, src_pix_fmt,
-                                                 has_alpha, loss_mask);
-        if (dst_pix_fmt >= 0)
-            goto found;
-        if (loss_mask == 0)
-            break;
-    }
-    return AV_PIX_FMT_NONE;
- found:
-    if (loss_ptr)
-        *loss_ptr = avcodec_get_pix_fmt_loss(dst_pix_fmt, src_pix_fmt, has_alpha);
-    return dst_pix_fmt;
+    return avcodec_find_best_pix_fmt_of_2(dst_pix_fmt1, dst_pix_fmt2, src_pix_fmt, has_alpha, loss_ptr);
+}
+#endif
+
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_list(const enum AVPixelFormat *pix_fmt_list,
+                                            enum AVPixelFormat src_pix_fmt,
+                                            int has_alpha, int *loss_ptr){
+    int i;
+
+    enum AVPixelFormat best = AV_PIX_FMT_NONE;
+
+    for(i=0; pix_fmt_list[i] != AV_PIX_FMT_NONE; i++)
+        best = avcodec_find_best_pix_fmt_of_2(best, pix_fmt_list[i], src_pix_fmt, has_alpha, loss_ptr);
+
+    return best;
 }
 
 /* 2x2 -> 1x1 */
@@ -248,8 +177,22 @@ void ff_shrink88(uint8_t *dst, int dst_wrap,
 /* return true if yuv planar */
 static inline int is_yuv_planar(const AVPixFmtDescriptor *desc)
 {
-    return (!(desc->flags & AV_PIX_FMT_FLAG_RGB) &&
-             (desc->flags & AV_PIX_FMT_FLAG_PLANAR));
+    int i;
+    int planes[4] = { 0 };
+
+    if (     desc->flags & AV_PIX_FMT_FLAG_RGB
+        || !(desc->flags & AV_PIX_FMT_FLAG_PLANAR))
+        return 0;
+
+    /* set the used planes */
+    for (i = 0; i < desc->nb_components; i++)
+        planes[desc->comp[i].plane] = 1;
+
+    /* if there is an unused plane, the format is not planar */
+    for (i = 0; i < desc->nb_components; i++)
+        if (!planes[i])
+            return 0;
+    return 1;
 }
 
 int av_picture_crop(AVPicture *dst, const AVPicture *src,
@@ -258,16 +201,24 @@ int av_picture_crop(AVPicture *dst, const AVPicture *src,
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
     int y_shift;
     int x_shift;
+    int max_step[4];
 
-    if (pix_fmt < 0 || pix_fmt >= AV_PIX_FMT_NB || !is_yuv_planar(desc))
+    if (pix_fmt < 0 || pix_fmt >= AV_PIX_FMT_NB)
         return -1;
 
     y_shift = desc->log2_chroma_h;
     x_shift = desc->log2_chroma_w;
+    av_image_fill_max_pixsteps(max_step, NULL, desc);
 
+    if (is_yuv_planar(desc)) {
     dst->data[0] = src->data[0] + (top_band * src->linesize[0]) + left_band;
     dst->data[1] = src->data[1] + ((top_band >> y_shift) * src->linesize[1]) + (left_band >> x_shift);
     dst->data[2] = src->data[2] + ((top_band >> y_shift) * src->linesize[2]) + (left_band >> x_shift);
+    } else{
+        if(top_band % (1<<y_shift) || left_band % (1<<x_shift))
+            return -1;
+        dst->data[0] = src->data[0] + (top_band * src->linesize[0]) + (left_band * max_step[0]);
+    }
 
     dst->linesize[0] = src->linesize[0];
     dst->linesize[1] = src->linesize[1];
@@ -285,9 +236,41 @@ int av_picture_pad(AVPicture *dst, const AVPicture *src, int height, int width,
     int x_shift;
     int yheight;
     int i, y;
+    int max_step[4];
+
+    if (pix_fmt < 0 || pix_fmt >= AV_PIX_FMT_NB)
+        return -1;
 
-    if (pix_fmt < 0 || pix_fmt >= AV_PIX_FMT_NB ||
-        !is_yuv_planar(desc)) return -1;
+    if (!is_yuv_planar(desc)) {
+        if (src)
+            return -1; //TODO: Not yet implemented
+
+        av_image_fill_max_pixsteps(max_step, NULL, desc);
+
+        if (padtop || padleft) {
+            memset(dst->data[0], color[0],
+                    dst->linesize[0] * padtop + (padleft * max_step[0]));
+        }
+
+        if (padleft || padright) {
+            optr = dst->data[0] + dst->linesize[0] * padtop +
+                    (dst->linesize[0] - (padright * max_step[0]));
+            yheight = height - 1 - (padtop + padbottom);
+            for (y = 0; y < yheight; y++) {
+                memset(optr, color[0], (padleft + padright) * max_step[0]);
+                optr += dst->linesize[0];
+            }
+        }
+
+        if (padbottom || padright) {
+            optr = dst->data[0] + dst->linesize[0] * (height - padbottom) -
+                    (padright * max_step[0]);
+            memset(optr, color[0], dst->linesize[0] * padbottom +
+                    (padright * max_step[0]));
+        }
+
+        return 0;
+    }
 
     for (i = 0; i < 3; i++) {
         x_shift = i ? desc->log2_chroma_w : 0;
@@ -333,5 +316,34 @@ int av_picture_pad(AVPicture *dst, const AVPicture *src, int height, int width,
                 (padbottom >> y_shift) + (padright >> x_shift));
         }
     }
+
     return 0;
 }
+
+#ifdef TEST
+
+int main(void){
+    int i;
+    int err=0;
+    int skip = 0;
+
+    for (i=0; i<AV_PIX_FMT_NB*2; i++) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(i);
+        if(!desc || !desc->name) {
+            skip ++;
+            continue;
+        }
+        if (skip) {
+            av_log(NULL, AV_LOG_INFO, "%3d unused pixel format values\n", skip);
+            skip = 0;
+        }
+        av_log(NULL, AV_LOG_INFO, "pix fmt %s yuv_plan:%d avg_bpp:%d\n", desc->name, is_yuv_planar(desc), av_get_padded_bits_per_pixel(desc));
+        if ((!(desc->flags & AV_PIX_FMT_FLAG_ALPHA)) != (desc->nb_components != 2 && desc->nb_components != 4)) {
+            av_log(NULL, AV_LOG_ERROR, "Alpha flag mismatch\n");
+            err = 1;
+        }
+    }
+    return err;
+}
+
+#endif
diff --git a/libavcodec/imgconvert.h b/libavcodec/imgconvert.h
index e52ce0a64e..21c5b879bd 100644
--- a/libavcodec/imgconvert.h
+++ b/libavcodec/imgconvert.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/imx_dump_header_bsf.c b/libavcodec/imx_dump_header_bsf.c
index a79039da33..3a69e98c04 100644
--- a/libavcodec/imx_dump_header_bsf.c
+++ b/libavcodec/imx_dump_header_bsf.c
@@ -2,20 +2,20 @@
  * imx dump header bitstream filter
  * Copyright (c) 2007 Baptiste Coudurier
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -55,7 +55,6 @@ static int imx_dump_header(AVBitStreamFilterContext *bsfc, AVCodecContext *avctx
 }
 
 AVBitStreamFilter ff_imx_dump_header_bsf = {
-    "imxdump",
-    0,
-    imx_dump_header,
+    .name   = "imxdump",
+    .filter = imx_dump_header,
 };
diff --git a/libavcodec/indeo2.c b/libavcodec/indeo2.c
index 7b355f39b9..74135b9af0 100644
--- a/libavcodec/indeo2.c
+++ b/libavcodec/indeo2.c
@@ -2,20 +2,20 @@
  * Intel Indeo 2 codec
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,15 +54,13 @@ static int ir2_decode_plane(Ir2Context *ctx, int width, int height, uint8_t *dst
     int i;
     int j;
     int out = 0;
-    int c;
-    int t;
 
     if (width & 1)
         return AVERROR_INVALIDDATA;
 
     /* first line contain absolute values, other lines contain deltas */
     while (out < width) {
-        c = ir2_get_code(&ctx->gb);
+        int c = ir2_get_code(&ctx->gb);
         if (c >= 0x80) { /* we have a run */
             c -= 0x7F;
             if (out + c*2 > width)
@@ -79,7 +77,7 @@ static int ir2_decode_plane(Ir2Context *ctx, int width, int height, uint8_t *dst
     for (j = 1; j < height; j++) {
         out = 0;
         while (out < width) {
-            c = ir2_get_code(&ctx->gb);
+            int c = ir2_get_code(&ctx->gb);
             if (c >= 0x80) { /* we have a skip */
                 c -= 0x7F;
                 if (out + c*2 > width)
@@ -89,7 +87,7 @@ static int ir2_decode_plane(Ir2Context *ctx, int width, int height, uint8_t *dst
                     out++;
                 }
             } else { /* add two deltas from table */
-                t        = dst[out - pitch] + (table[c * 2] - 128);
+                int t    = dst[out - pitch] + (table[c * 2] - 128);
                 t        = av_clip_uint8(t);
                 dst[out] = t;
                 out++;
@@ -149,10 +147,8 @@ static int ir2_decode_frame(AVCodecContext *avctx,
     AVFrame * const p    = s->picture;
     int start, ret;
 
-    if ((ret = ff_reget_buffer(avctx, p)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, p)) < 0)
         return ret;
-    }
 
     start = 48; /* hardcoded for now */
 
diff --git a/libavcodec/indeo2data.h b/libavcodec/indeo2data.h
index ed8d83c83b..0d6d82f22c 100644
--- a/libavcodec/indeo2data.h
+++ b/libavcodec/indeo2data.h
@@ -2,20 +2,20 @@
  * Intel Indeo 2 codec
  * copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/indeo3.c b/libavcodec/indeo3.c
index 5a0d3a7d4b..3f31946d66 100644
--- a/libavcodec/indeo3.c
+++ b/libavcodec/indeo3.c
@@ -2,20 +2,20 @@
  * Indeo Video v3 compatible decoder
  * Copyright (c) 2009 - 2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
+#include "copy_block.h"
 #include "bytestream.h"
 #include "get_bits.h"
 #include "hpeldsp.h"
@@ -93,7 +94,7 @@ typedef struct Indeo3DecodeContext {
 
     int16_t         width, height;
     uint32_t        frame_num;      ///< current frame number (zero-based)
-    uint32_t        data_size;      ///< size of the frame data in bytes
+    int             data_size;      ///< size of the frame data in bytes
     uint16_t        frame_flags;    ///< frame properties
     uint8_t         cb_offset;      ///< needed for selecting VQ tables
     uint8_t         buf_sel;        ///< active frame buffer: 0 - primary, 1 -secondary
@@ -117,8 +118,8 @@ static uint8_t requant_tab[8][128];
  */
 static av_cold void build_requant_tab(void)
 {
-    static int8_t offsets[8] = { 1, 1, 2, -3, -3, 3, 4, 4 };
-    static int8_t deltas [8] = { 0, 1, 0,  4,  4, 1, 0, 1 };
+    static const int8_t offsets[8] = { 1, 1, 2, -3, -3, 3, 4, 4 };
+    static const int8_t deltas [8] = { 0, 1, 0,  4,  4, 1, 0, 1 };
 
     int i, j, step;
 
@@ -147,15 +148,26 @@ static av_cold void build_requant_tab(void)
 }
 
 
+static av_cold void free_frame_buffers(Indeo3DecodeContext *ctx)
+{
+    int p;
+
+    ctx->width = ctx->height = 0;
+
+    for (p = 0; p < 3; p++) {
+        av_freep(&ctx->planes[p].buffers[0]);
+        av_freep(&ctx->planes[p].buffers[1]);
+        ctx->planes[p].pixels[0] = ctx->planes[p].pixels[1] = 0;
+    }
+}
+
+
 static av_cold int allocate_frame_buffers(Indeo3DecodeContext *ctx,
-                                          AVCodecContext *avctx)
+                                          AVCodecContext *avctx, int luma_width, int luma_height)
 {
-    int p, luma_width, luma_height, chroma_width, chroma_height;
+    int p, chroma_width, chroma_height;
     int luma_pitch, chroma_pitch, luma_size, chroma_size;
 
-    luma_width  = ctx->width;
-    luma_height = ctx->height;
-
     if (luma_width  < 16 || luma_width  > 640 ||
         luma_height < 16 || luma_height > 480 ||
         luma_width  &  3 || luma_height &   3) {
@@ -164,6 +176,9 @@ static av_cold int allocate_frame_buffers(Indeo3DecodeContext *ctx,
         return AVERROR_INVALIDDATA;
     }
 
+    ctx->width  = luma_width ;
+    ctx->height = luma_height;
+
     chroma_width  = FFALIGN(luma_width  >> 2, 4);
     chroma_height = FFALIGN(luma_height >> 2, 4);
 
@@ -187,6 +202,11 @@ static av_cold int allocate_frame_buffers(Indeo3DecodeContext *ctx,
         ctx->planes[p].buffers[0] = av_malloc(!p ? luma_size : chroma_size);
         ctx->planes[p].buffers[1] = av_malloc(!p ? luma_size : chroma_size);
 
+        if (!ctx->planes[p].buffers[0] || !ctx->planes[p].buffers[1]) {
+            free_frame_buffers(ctx);
+            return AVERROR(ENOMEM);
+        }
+
         /* fill the INTRA prediction lines with the middle pixel value = 64 */
         memset(ctx->planes[p].buffers[0], 0x40, ctx->planes[p].pitch);
         memset(ctx->planes[p].buffers[1], 0x40, ctx->planes[p].pitch);
@@ -201,19 +221,6 @@ static av_cold int allocate_frame_buffers(Indeo3DecodeContext *ctx,
     return 0;
 }
 
-
-static av_cold void free_frame_buffers(Indeo3DecodeContext *ctx)
-{
-    int p;
-
-    for (p = 0; p < 3; p++) {
-        av_freep(&ctx->planes[p].buffers[0]);
-        av_freep(&ctx->planes[p].buffers[1]);
-        ctx->planes[p].pixels[0] = ctx->planes[p].pixels[1] = 0;
-    }
-}
-
-
 /**
  *  Copy pixels of the cell(x + mv_x, y + mv_y) from the previous frame into
  *  the cell(x, y) in the current frame.
@@ -230,8 +237,11 @@ static int copy_cell(Indeo3DecodeContext *ctx, Plane *plane, Cell *cell)
     /* setup output and reference pointers */
     offset_dst  = (cell->ypos << 2) * plane->pitch + (cell->xpos << 2);
     dst         = plane->pixels[ctx->buf_sel] + offset_dst;
+    if(cell->mv_ptr){
     mv_y        = cell->mv_ptr[0];
     mv_x        = cell->mv_ptr[1];
+    }else
+        mv_x= mv_y= 0;
 
     /* -1 because there is an extra line on top for prediction */
     if ((cell->ypos << 2) + mv_y < -1 || (cell->xpos << 2) + mv_x < 0 ||
@@ -333,7 +343,7 @@ if (*data_ptr >= last_ptr) \
 
 #define RLE_BLOCK_COPY \
     if (cell->mv_ptr || !skip_flag) \
-        ctx->hdsp.put_pixels_tab[2][0](dst, ref, row_offset, 4 << v_zoom)
+        copy_block4(dst, ref, row_offset, row_offset, 4 << v_zoom)
 
 #define RLE_BLOCK_COPY_8 \
     pix64 = AV_RN64(ref);\
@@ -345,7 +355,7 @@ if (*data_ptr >= last_ptr) \
         fill_64(dst, pix64, 8, row_offset)
 
 #define RLE_LINES_COPY \
-    ctx->hdsp.put_pixels_tab[2][0](dst, ref, row_offset, num_lines << v_zoom)
+    copy_block4(dst, ref, row_offset, row_offset, num_lines << v_zoom)
 
 #define RLE_LINES_COPY_M10 \
     pix64 = AV_RN64(ref);\
@@ -589,6 +599,7 @@ static int decode_cell(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     /* setup output and reference pointers */
     offset = (cell->ypos << 2) * plane->pitch + (cell->xpos << 2);
     block  =  plane->pixels[ctx->buf_sel] + offset;
+
     if (!cell->mv_ptr) {
         /* use previous line as reference for INTRA cells */
         ref_block = block - plane->pitch;
@@ -643,7 +654,7 @@ static int decode_cell(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     /* of the predicted cell in order to avoid overflows. */
     if (vq_index >= 8 && ref_block) {
         for (x = 0; x < cell->width << 2; x++)
-            ref_block[x] = requant_tab[vq_index & 7][ref_block[x]];
+            ref_block[x] = requant_tab[vq_index & 7][ref_block[x] & 127];
     }
 
     error = IV3_NOERR;
@@ -771,7 +782,7 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
             return AVERROR_INVALIDDATA;
     }
 
-    while (1) { /* loop until return */
+    while (get_bits_left(&ctx->gb) >= 2) { /* loop until return */
         RESYNC_BITSTREAM;
         switch (code = get_bits(&ctx->gb, 2)) {
         case H_SPLIT:
@@ -796,6 +807,7 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
                 CHECK_CELL
                 if (!curr_cell.mv_ptr)
                     return AVERROR_INVALIDDATA;
+
                 ret = copy_cell(ctx, plane, &curr_cell);
                 return ret;
             }
@@ -806,6 +818,10 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
                 /* get motion vector index and setup the pointer to the mv set */
                 if (!ctx->need_resync)
                     ctx->next_cell_data = &ctx->gb.buffer[(get_bits_count(&ctx->gb) + 7) >> 3];
+                if (ctx->next_cell_data >= ctx->last_byte) {
+                    av_log(avctx, AV_LOG_ERROR, "motion vector out of array\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 mv_idx = *(ctx->next_cell_data++);
                 if (mv_idx >= ctx->num_vectors) {
                     av_log(avctx, AV_LOG_ERROR, "motion vector index out of range\n");
@@ -832,7 +848,7 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
         }
     }//while
 
-    return 0;
+    return AVERROR_INVALIDDATA;
 }
 
 
@@ -845,13 +861,13 @@ static int decode_plane(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
 
     /* each plane data starts with mc_vector_count field, */
     /* an optional array of motion vectors followed by the vq data */
-    num_vectors = bytestream_get_le32(&data);
+    num_vectors = bytestream_get_le32(&data); data_size -= 4;
     if (num_vectors > 256) {
         av_log(ctx->avctx, AV_LOG_ERROR,
                "Read invalid number of motion vectors %d\n", num_vectors);
         return AVERROR_INVALIDDATA;
     }
-    if (num_vectors * 2 >= data_size)
+    if (num_vectors * 2 > data_size)
         return AVERROR_INVALIDDATA;
 
     ctx->num_vectors = num_vectors;
@@ -862,7 +878,7 @@ static int decode_plane(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     ctx->skip_bits   = 0;
     ctx->need_resync = 0;
 
-    ctx->last_byte = data + data_size - 1;
+    ctx->last_byte = data + data_size;
 
     /* initialize the 1st cell and set its dimensions to whole plane */
     curr_cell.xpos   = curr_cell.ypos = 0;
@@ -883,7 +899,8 @@ static int decode_frame_headers(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     GetByteContext gb;
     const uint8_t   *bs_hdr;
     uint32_t        frame_num, word2, check_sum, data_size;
-    uint32_t        y_offset, u_offset, v_offset, starts[3], ends[3];
+    int             y_offset, u_offset, v_offset;
+    uint32_t        starts[3], ends[3];
     uint16_t        height, width;
     int             i, j;
 
@@ -937,12 +954,8 @@ static int decode_frame_headers(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
                    "Invalid picture dimensions: %d x %d!\n", width, height);
             return AVERROR_INVALIDDATA;
         }
-
-        ctx->width  = width;
-        ctx->height = height;
-
         free_frame_buffers(ctx);
-        if ((res = allocate_frame_buffers(ctx, avctx)) < 0)
+        if ((res = allocate_frame_buffers(ctx, avctx, width, height)) < 0)
              return res;
         if ((res = ff_set_dimensions(avctx, width, height)) < 0)
             return res;
@@ -969,7 +982,8 @@ static int decode_frame_headers(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     ctx->y_data_size = ends[0] - starts[0];
     ctx->v_data_size = ends[1] - starts[1];
     ctx->u_data_size = ends[2] - starts[2];
-    if (FFMAX3(y_offset, v_offset, u_offset) >= ctx->data_size - 16 ||
+    if (FFMIN3(y_offset, v_offset, u_offset) < 0 ||
+        FFMAX3(y_offset, v_offset, u_offset) >= ctx->data_size - 16 ||
         FFMIN3(y_offset, v_offset, u_offset) < gb.buffer - bs_hdr + 16 ||
         FFMIN3(ctx->y_data_size, ctx->v_data_size, ctx->u_data_size) <= 0) {
         av_log(avctx, AV_LOG_ERROR, "One of the y/u/v offsets is invalid\n");
@@ -1040,17 +1054,13 @@ static av_cold int decode_init(AVCodecContext *avctx)
     Indeo3DecodeContext *ctx = avctx->priv_data;
 
     ctx->avctx     = avctx;
-    ctx->width     = avctx->width;
-    ctx->height    = avctx->height;
     avctx->pix_fmt = AV_PIX_FMT_YUV410P;
 
     build_requant_tab();
 
     ff_hpeldsp_init(&ctx->hdsp, avctx->flags);
 
-    allocate_frame_buffers(ctx, avctx);
-
-    return 0;
+    return allocate_frame_buffers(ctx, avctx, avctx->width, avctx->height);
 }
 
 
@@ -1086,6 +1096,9 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     /* use BS_BUFFER flag for buffer switching */
     ctx->buf_sel = (ctx->frame_flags >> BS_BUFFER) & 1;
 
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
+        return res;
+
     /* decode luma plane */
     if ((res = decode_plane(ctx, avctx, ctx->planes, ctx->y_data_ptr, ctx->y_data_size, 40)))
         return res;
@@ -1097,11 +1110,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if ((res = decode_plane(ctx, avctx, &ctx->planes[2], ctx->v_data_ptr, ctx->v_data_size, 10)))
         return res;
 
-    if ((res = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return res;
-    }
-
     output_plane(&ctx->planes[0], ctx->buf_sel,
                  frame->data[0], frame->linesize[0],
                  avctx->height);
diff --git a/libavcodec/indeo3data.h b/libavcodec/indeo3data.h
index 28c9bb6061..fe8f0bae4d 100644
--- a/libavcodec/indeo3data.h
+++ b/libavcodec/indeo3data.h
@@ -2,20 +2,20 @@
  * Indeo Video v3 compatible decoder
  * Copyright (c) 2009 - 2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -238,9 +238,9 @@
  * according with endianness of the host machine.
  */
 #if HAVE_BIGENDIAN
-#define PD(a,b) (((a) << 8) + (b))
+#define PD(a,b) (((a) * (1 << 8)) + (b))
 #else
-#define PD(a,b) (((b) << 8) + (a))
+#define PD(a,b) (((b) * (1 << 8)) + (a))
 #endif
 
 /**
@@ -285,9 +285,9 @@ static const int16_t delta_tab_3_5[79]  = { TAB_3_5 };
  * according with endianness of the host machine.
  */
 #if HAVE_BIGENDIAN
-#define PD(a,b) (((a) << 24) + ((a) << 16) + ((b) << 8) + (b))
+#define PD(a,b) (((a) * (1 << 24)) + ((a) * (1 << 16)) + ((b) * (1 << 8)) + (b))
 #else
-#define PD(a,b) (((b) << 24) + ((b) << 16) + ((a) << 8) + (a))
+#define PD(a,b) (((b) * (1 << 24)) + ((b) * (1 << 16)) + ((a) * (1 << 8)) + (a))
 #endif
 
 /*
diff --git a/libavcodec/indeo4.c b/libavcodec/indeo4.c
index 64ed8cd581..0065b52c75 100644
--- a/libavcodec/indeo4.c
+++ b/libavcodec/indeo4.c
@@ -2,20 +2,20 @@
  * Indeo Video Interactive v4 compatible decoder
  * Copyright (c) 2009-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -184,6 +184,7 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
 
     /* decode subdivision of the planes */
     pic_conf.luma_bands = decode_plane_subdivision(&ctx->gb);
+    pic_conf.chroma_bands = 0;
     if (pic_conf.luma_bands)
         pic_conf.chroma_bands = decode_plane_subdivision(&ctx->gb);
     ctx->is_scalable = pic_conf.luma_bands != 1 || pic_conf.chroma_bands != 1;
@@ -271,6 +272,7 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
 {
     int plane, band_num, indx, transform_id, scan_indx;
     int i;
+    int quant_mat;
 
     plane    = get_bits(&ctx->gb, 2);
     band_num = get_bits(&ctx->gb, 4);
@@ -328,6 +330,10 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
                 return AVERROR_PATCHWELCOME;
             }
 
+            if (transform_id < 10 && band->blk_size < 8) {
+                av_log(avctx, AV_LOG_ERROR, "wrong transform size!\n");
+                return AVERROR_INVALIDDATA;
+            }
 #if IVI4_STREAM_ANALYSER
             if ((transform_id >= 0 && transform_id <= 2) || transform_id == 10)
                 ctx->uses_haar = 1;
@@ -336,13 +342,16 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
             band->inv_transform = transforms[transform_id].inv_trans;
             band->dc_transform  = transforms[transform_id].dc_trans;
             band->is_2d_trans   = transforms[transform_id].is_2d_trans;
+
             if (transform_id < 10)
                 band->transform_size = 8;
             else
                 band->transform_size = 4;
 
-            if (band->blk_size != band->transform_size)
+            if (band->blk_size != band->transform_size) {
+                av_log(avctx, AV_LOG_ERROR, "transform and block size mismatch (%d != %d)\n", band->transform_size, band->blk_size);
                 return AVERROR_INVALIDDATA;
+            }
 
             scan_indx = get_bits(&ctx->gb, 4);
             if (scan_indx == 15) {
@@ -350,25 +359,29 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
                 return AVERROR_INVALIDDATA;
             }
             if (scan_indx > 4 && scan_indx < 10) {
-                if (band->blk_size != 4)
+                if (band->blk_size != 4) {
+                    av_log(avctx, AV_LOG_ERROR, "mismatching scan table!\n");
                     return AVERROR_INVALIDDATA;
-            } else if (band->blk_size != 8)
+                }
+            } else if (band->blk_size != 8) {
+                av_log(avctx, AV_LOG_ERROR, "mismatching scan table!\n");
                 return AVERROR_INVALIDDATA;
+            }
 
             band->scan = scan_index_to_tab[scan_indx];
+            band->scan_size = band->blk_size;
 
-            band->quant_mat = get_bits(&ctx->gb, 5);
-            if (band->quant_mat >= FF_ARRAY_ELEMS(quant_index_to_tab)) {
-
-                if (band->quant_mat == 31)
-                    av_log(avctx, AV_LOG_ERROR,
-                           "Custom quant matrix encountered!\n");
-                else
-                    avpriv_request_sample(avctx, "Quantization matrix %d",
-                                          band->quant_mat);
-                band->quant_mat = -1;
+            quant_mat = get_bits(&ctx->gb, 5);
+            if (quant_mat == 31) {
+                av_log(avctx, AV_LOG_ERROR, "Custom quant matrix encountered!\n");
                 return AVERROR_INVALIDDATA;
             }
+            if (quant_mat >= FF_ARRAY_ELEMS(quant_index_to_tab)) {
+                avpriv_request_sample(avctx, "Quantization matrix %d",
+                                      quant_mat);
+                return AVERROR_INVALIDDATA;
+            }
+            band->quant_mat = quant_mat;
         } else {
             if (old_blk_size != band->blk_size) {
                 av_log(avctx, AV_LOG_ERROR,
@@ -376,10 +389,19 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
                        "inherited\n");
                 return AVERROR_INVALIDDATA;
             }
-            if (band->quant_mat < 0) {
-                av_log(avctx, AV_LOG_ERROR, "Invalid quant_mat inherited\n");
-                return AVERROR_INVALIDDATA;
-            }
+        }
+        if (quant_index_to_tab[band->quant_mat] > 4 && band->blk_size == 4) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid quant matrix for 4x4 block encountered!\n");
+            band->quant_mat = 0;
+            return AVERROR_INVALIDDATA;
+        }
+        if (band->scan_size != band->blk_size) {
+            av_log(avctx, AV_LOG_ERROR, "mismatching scan table!\n");
+            return AVERROR_INVALIDDATA;
+        }
+        if (band->transform_size == 8 && band->blk_size < 8) {
+            av_log(avctx, AV_LOG_ERROR, "mismatching transform_size!\n");
+            return AVERROR_INVALIDDATA;
         }
 
         /* decode block huffman codebook */
@@ -423,6 +445,11 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
 
     align_get_bits(&ctx->gb);
 
+    if (!band->scan) {
+        av_log(avctx, AV_LOG_ERROR, "band->scan not set\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     return 0;
 }
 
@@ -441,7 +468,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                           IVITile *tile, AVCodecContext *avctx)
 {
     int         x, y, mv_x, mv_y, mv_delta, offs, mb_offset, blks_per_mb,
-                mv_scale, mb_type_bits;
+                mv_scale, mb_type_bits, s;
     IVIMbInfo   *mb, *ref_mb;
     int         row_offset = band->mb_size * band->pitch;
 
@@ -456,6 +483,11 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
     mv_scale = (ctx->planes[0].bands[0].mb_size >> 3) - (band->mb_size >> 3);
     mv_x = mv_y = 0;
 
+    if (((tile->width + band->mb_size-1)/band->mb_size) * ((tile->height + band->mb_size-1)/band->mb_size) != tile->num_MBs) {
+        av_log(avctx, AV_LOG_ERROR, "num_MBs mismatch %d %d %d %d\n", tile->width, tile->height, band->mb_size, tile->num_MBs);
+        return -1;
+    }
+
     for (y = tile->ypos; y < tile->ypos + tile->height; y += band->mb_size) {
         mb_offset = offs;
 
@@ -495,8 +527,10 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
             } else {
                 if (band->inherit_mv) {
                     /* copy mb_type from corresponding reference mb */
-                    if (!ref_mb)
+                    if (!ref_mb) {
+                        av_log(avctx, AV_LOG_ERROR, "ref_mb unavailable\n");
                         return AVERROR_INVALIDDATA;
+                    }
                     mb->type = ref_mb->type;
                 } else if (ctx->frame_type == IVI4_FRAMETYPE_INTRA ||
                            ctx->frame_type == IVI4_FRAMETYPE_INTRA1) {
@@ -562,6 +596,15 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                 }
             }
 
+            s= band->is_halfpel;
+            if (mb->type)
+            if ( x +  (mb->mv_x   >>s) +                 (y+               (mb->mv_y   >>s))*band->pitch < 0 ||
+                 x + ((mb->mv_x+s)>>s) + band->mb_size - 1
+                   + (y+band->mb_size - 1 +((mb->mv_y+s)>>s))*band->pitch > band->bufsize -1) {
+                av_log(avctx, AV_LOG_ERROR, "motion vector %d %d outside reference\n", x*s + mb->mv_x, y*s + mb->mv_y);
+                return AVERROR_INVALIDDATA;
+            }
+
             mb++;
             if (ref_mb)
                 ref_mb++;
diff --git a/libavcodec/indeo4data.h b/libavcodec/indeo4data.h
index be7c41356b..cc497c2391 100644
--- a/libavcodec/indeo4data.h
+++ b/libavcodec/indeo4data.h
@@ -2,20 +2,20 @@
  * Indeo Video Interactive 4 compatible decoder
  * Copyright (c) 2009-2010 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -60,7 +60,7 @@ static const uint8_t ivi4_horizontal_scan_4x4[16] = {
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 };
 
-static const uint8_t *scan_index_to_tab[15] = {
+static const uint8_t * const scan_index_to_tab[15] = {
     // for 8x8 transforms
     ff_zigzag_direct,
     ivi4_alternate_scan_8x8,
diff --git a/libavcodec/indeo5.c b/libavcodec/indeo5.c
index bed915370a..5f931c8b98 100644
--- a/libavcodec/indeo5.c
+++ b/libavcodec/indeo5.c
@@ -2,20 +2,20 @@
  * Indeo Video Interactive v5 compatible decoder
  * Copyright (c) 2009 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,7 +58,7 @@ enum {
  */
 static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
 {
-    int             result, i, p, tile_size, pic_size_indx, mb_size, blk_size;
+    int             result, i, p, tile_size, pic_size_indx, mb_size, blk_size, is_scalable;
     int             quant_mat, blk_size_changed = 0;
     IVIBandDesc     *band, *band1, *band2;
     IVIPicConfig    pic_conf;
@@ -80,8 +80,8 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
     /* num_levels * 3 + 1 */
     pic_conf.luma_bands   = get_bits(&ctx->gb, 2) * 3 + 1;
     pic_conf.chroma_bands = get_bits1(&ctx->gb)   * 3 + 1;
-    ctx->is_scalable = pic_conf.luma_bands != 1 || pic_conf.chroma_bands != 1;
-    if (ctx->is_scalable && (pic_conf.luma_bands != 4 || pic_conf.chroma_bands != 1)) {
+    is_scalable = pic_conf.luma_bands != 1 || pic_conf.chroma_bands != 1;
+    if (is_scalable && (pic_conf.luma_bands != 4 || pic_conf.chroma_bands != 1)) {
         av_log(avctx, AV_LOG_ERROR, "Scalability: unsupported subdivision! Luma bands: %d, chroma bands: %d\n",
                pic_conf.luma_bands, pic_conf.chroma_bands);
         return AVERROR_INVALIDDATA;
@@ -119,6 +119,7 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
             return result;
         }
         ctx->pic_conf = pic_conf;
+        ctx->is_scalable = is_scalable;
         blk_size_changed = 1; /* force reallocation of the internal structures */
     }
 
@@ -132,6 +133,11 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
             blk_size = 8 >> get_bits1(&ctx->gb);
             mb_size  = blk_size << !mb_size;
 
+            if (p==0 && blk_size==4) {
+                av_log(avctx, AV_LOG_ERROR, "4x4 luma blocks are unsupported!\n");
+                return AVERROR_PATCHWELCOME;
+            }
+
             blk_size_changed = mb_size != band->mb_size || blk_size != band->blk_size;
             if (blk_size_changed) {
                 band->mb_size  = mb_size;
@@ -184,8 +190,10 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
             band->is_2d_trans = band->inv_transform == ff_ivi_inverse_slant_8x8 ||
                                 band->inv_transform == ff_ivi_inverse_slant_4x4;
 
-            if (band->transform_size != band->blk_size)
+            if (band->transform_size != band->blk_size) {
+                av_log(avctx, AV_LOG_ERROR, "transform and block size mismatch (%d != %d)\n", band->transform_size, band->blk_size);
                 return AVERROR_INVALIDDATA;
+            }
 
             /* select dequant matrix according to plane and band number */
             if (!p) {
@@ -195,6 +203,10 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
             }
 
             if (band->blk_size == 8) {
+                if(quant_mat >= 5){
+                    av_log(avctx, AV_LOG_ERROR, "quant_mat %d too large!\n", quant_mat);
+                    return -1;
+                }
                 band->intra_base  = &ivi5_base_quant_8x8_intra[quant_mat][0];
                 band->inter_base  = &ivi5_base_quant_8x8_inter[quant_mat][0];
                 band->intra_scale = &ivi5_scale_quant_8x8_intra[quant_mat][0];
@@ -231,6 +243,7 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
         band2->inv_transform = band1->inv_transform;
         band2->dc_transform  = band1->dc_transform;
         band2->is_2d_trans   = band1->is_2d_trans;
+        band2->transform_size= band1->transform_size;
     }
 
     /* reallocate internal structures if needed */
@@ -276,14 +289,18 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
  *
  *  @param[in,out]  gb  the GetBit context
  */
-static inline void skip_hdr_extension(GetBitContext *gb)
+static inline int skip_hdr_extension(GetBitContext *gb)
 {
     int i, len;
 
     do {
         len = get_bits(gb, 8);
+        if (8*len > get_bits_left(gb))
+            return AVERROR_INVALIDDATA;
         for (i = 0; i < len; i++) skip_bits(gb, 8);
     } while(len);
+
+    return 0;
 }
 
 
@@ -321,6 +338,12 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
         ctx->gop_invalid = 0;
     }
 
+    if (ctx->frame_type == FRAMETYPE_INTER_SCAL && !ctx->is_scalable) {
+        av_log(avctx, AV_LOG_ERROR, "Scalable inter frame in non scalable stream\n");
+        ctx->frame_type = FRAMETYPE_INTER;
+        return AVERROR_INVALIDDATA;
+    }
+
     if (ctx->frame_type != FRAMETYPE_NULL) {
         ctx->frame_flags = get_bits(&ctx->gb, 8);
 
@@ -431,7 +454,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                           IVITile *tile, AVCodecContext *avctx)
 {
     int         x, y, mv_x, mv_y, mv_delta, offs, mb_offset,
-                mv_scale, blks_per_mb;
+                mv_scale, blks_per_mb, s;
     IVIMbInfo   *mb, *ref_mb;
     int         row_offset = band->mb_size * band->pitch;
 
@@ -477,7 +500,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                 }
 
                 mb->mv_x = mb->mv_y = 0; /* no motion vector coded */
-                if (band->inherit_mv){
+                if (band->inherit_mv && ref_mb){
                     /* motion vector inheritance */
                     if (mv_scale) {
                         mb->mv_x = ivi_scale_mv(ref_mb->mv_x, mv_scale);
@@ -488,7 +511,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                     }
                 }
             } else {
-                if (band->inherit_mv) {
+                if (band->inherit_mv && ref_mb) {
                     mb->type = ref_mb->type; /* copy mb_type from corresponding reference mb */
                 } else if (ctx->frame_type == FRAMETYPE_INTRA) {
                     mb->type = 0; /* mb_type is always INTRA for intra-frames */
@@ -514,7 +537,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                 if (!mb->type) {
                     mb->mv_x = mb->mv_y = 0; /* there is no motion vector in intra-macroblocks */
                 } else {
-                    if (band->inherit_mv){
+                    if (band->inherit_mv && ref_mb){
                         /* motion vector inheritance */
                         if (mv_scale) {
                             mb->mv_x = ivi_scale_mv(ref_mb->mv_x, mv_scale);
@@ -537,6 +560,15 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                 }
             }
 
+            s= band->is_halfpel;
+            if (mb->type)
+            if ( x +  (mb->mv_x   >>s) +                 (y+               (mb->mv_y   >>s))*band->pitch < 0 ||
+                 x + ((mb->mv_x+s)>>s) + band->mb_size - 1
+                   + (y+band->mb_size - 1 +((mb->mv_y+s)>>s))*band->pitch > band->bufsize - 1) {
+                av_log(avctx, AV_LOG_ERROR, "motion vector %d %d outside reference\n", x*s + mb->mv_x, y*s + mb->mv_y);
+                return AVERROR_INVALIDDATA;
+            }
+
             mb++;
             if (ref_mb)
                 ref_mb++;
@@ -647,7 +679,6 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-
 AVCodec ff_indeo5_decoder = {
     .name           = "indeo5",
     .long_name      = NULL_IF_CONFIG_SMALL("Intel Indeo Video Interactive 5"),
diff --git a/libavcodec/indeo5data.h b/libavcodec/indeo5data.h
index f4252b59f6..a6217d0bf8 100644
--- a/libavcodec/indeo5data.h
+++ b/libavcodec/indeo5data.h
@@ -2,20 +2,20 @@
  * Indeo Video Interactive 5 compatible decoder
  * Copyright (c) 2009 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/intelh263dec.c b/libavcodec/intelh263dec.c
index ec2553bdb1..fe8d185ec7 100644
--- a/libavcodec/intelh263dec.c
+++ b/libavcodec/intelh263dec.c
@@ -1,20 +1,20 @@
 /*
  * H.263i decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,8 +39,7 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
     }
     s->picture_number = get_bits(&s->gb, 8); /* picture timestamp */
 
-    if (get_bits1(&s->gb) != 1) {
-        av_log(s->avctx, AV_LOG_ERROR, "Bad marker\n");
+    if (check_marker(&s->gb, "after picture_number") != 1) {
         return -1;      /* marker */
     }
     if (get_bits1(&s->gb) != 0) {
@@ -60,14 +59,14 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
 
     s->pict_type = AV_PICTURE_TYPE_I + get_bits1(&s->gb);
 
-    s->unrestricted_mv = get_bits1(&s->gb);
-    s->h263_long_vectors = s->unrestricted_mv;
+    s->h263_long_vectors = get_bits1(&s->gb);
 
     if (get_bits1(&s->gb) != 0) {
         av_log(s->avctx, AV_LOG_ERROR, "SAC not supported\n");
         return -1;      /* SAC: off */
     }
     s->obmc= get_bits1(&s->gb);
+    s->unrestricted_mv = s->obmc || s->h263_long_vectors;
     s->pb_frame = get_bits1(&s->gb);
 
     if (format < 6) {
@@ -83,7 +82,7 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
         }
         if(get_bits(&s->gb, 2))
             av_log(s->avctx, AV_LOG_ERROR, "Bad value for reserved field\n");
-        s->loop_filter = get_bits1(&s->gb);
+        s->loop_filter = get_bits1(&s->gb) * !s->avctx->lowres;
         if(get_bits1(&s->gb))
             av_log(s->avctx, AV_LOG_ERROR, "Bad value for reserved field\n");
         if(get_bits1(&s->gb))
@@ -96,7 +95,7 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
     if(format == 6){
         int ar = get_bits(&s->gb, 4);
         skip_bits(&s->gb, 9); // display width
-        skip_bits1(&s->gb);
+        check_marker(&s->gb, "in dimensions");
         skip_bits(&s->gb, 9); // display height
         if(ar == 15){
             s->avctx->sample_aspect_ratio.num = get_bits(&s->gb, 8); // aspect ratio - width
@@ -117,9 +116,8 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
     }
 
     /* PEI */
-    while (get_bits1(&s->gb) != 0) {
-        skip_bits(&s->gb, 8);
-    }
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
     s->f_code = 1;
 
     s->y_dc_scale_table=
diff --git a/libavcodec/internal.h b/libavcodec/internal.h
index 500511dada..324f0998af 100644
--- a/libavcodec/internal.h
+++ b/libavcodec/internal.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,16 +54,10 @@
  */
 #define FF_CODEC_CAP_SETS_PKT_DTS           (1 << 2)
 
-#ifdef DEBUG
-#   define ff_dlog(ctx, ...) av_log(ctx, AV_LOG_DEBUG, __VA_ARGS__)
-#else
-#   define ff_dlog(ctx, ...) while(0)
-#endif
-
 #ifdef TRACE
 #   define ff_tlog(ctx, ...) av_log(ctx, AV_LOG_TRACE, __VA_ARGS__)
 #else
-#   define ff_tlog(ctx, ...) while(0)
+#   define ff_tlog(ctx, ...) do {} while(0)
 #endif
 
 
@@ -71,9 +65,17 @@
 #define FF_DEFAULT_QUANT_BIAS 999999
 #endif
 
-#define FF_SANE_NB_CHANNELS 63U
+#define FF_SANE_NB_CHANNELS 64U
+
+#define FF_SIGNBIT(x) ((x) >> CHAR_BIT * sizeof(x) - 1)
 
-#define FF_SIGNBIT(x) (x >> CHAR_BIT * sizeof(x) - 1)
+#if HAVE_AVX
+#   define STRIDE_ALIGN 32
+#elif HAVE_SIMD_ALIGN_16
+#   define STRIDE_ALIGN 16
+#else
+#   define STRIDE_ALIGN 8
+#endif
 
 typedef struct FramePool {
     /**
@@ -137,6 +139,19 @@ typedef struct AVCodecInternal {
     AVPacket *pkt;
 
     /**
+     * temporary buffer used for encoders to store their bitstream
+     */
+    uint8_t *byte_buffer;
+    unsigned int byte_buffer_size;
+
+    void *frame_thread_encoder;
+
+    /**
+     * Number of audio samples to skip at the start of the next decoded frame
+     */
+    int skip_samples;
+
+    /**
      * hwaccel-specific private data
      */
     void *hwaccel_priv_data;
@@ -147,6 +162,8 @@ struct AVCodecDefault {
     const uint8_t *value;
 };
 
+extern const uint8_t ff_log2_run[41];
+
 /**
  * Return the index into tab at which {a,b} match elements {[0],[1]} of tab.
  * If there is no such matching pair then size is returned.
@@ -155,6 +172,18 @@ int ff_match_2uint16(const uint16_t (*tab)[2], int size, int a, int b);
 
 unsigned int avpriv_toupper4(unsigned int x);
 
+/**
+ * does needed setup of pkt_pts/pos and such for (re)get_buffer();
+ */
+int ff_init_buffer_info(AVCodecContext *s, AVFrame *frame);
+
+
+void avpriv_color_frame(AVFrame *frame, const int color[4]);
+
+extern volatile int ff_avcodec_locked;
+int ff_lock_avcodec(AVCodecContext *log_ctx, const AVCodec *codec);
+int ff_unlock_avcodec(const AVCodec *codec);
+
 int avpriv_lock_avformat(void);
 int avpriv_unlock_avformat(void);
 
@@ -172,6 +201,7 @@ int avpriv_unlock_avformat(void);
  * ensure the output packet data is large enough, whether provided by the user
  * or allocated in this function.
  *
+ * @param avctx   the AVCodecContext of the encoder
  * @param avpkt   the AVPacket
  *                If avpkt->data is already set, avpkt->size is checked
  *                to ensure it is large enough.
@@ -179,9 +209,20 @@ int avpriv_unlock_avformat(void);
  *                avpkt->size is set to the specified size.
  *                All other AVPacket fields will be reset with av_init_packet().
  * @param size    the minimum required packet size
- * @return        0 on success, negative error code on failure
+ * @param min_size This is a hint to the allocation algorithm, which indicates
+ *                to what minimal size the caller might later shrink the packet
+ *                to. Encoders often allocate packets which are larger than the
+ *                amount of data that is written into them as the exact amount is
+ *                not known at the time of allocation. min_size represents the
+ *                size a packet might be shrunk to by the caller. Can be set to
+ *                0. setting this roughly correctly allows the allocation code
+ *                to choose between several allocation strategies to improve
+ *                speed slightly.
+ * @return        non negative on success, negative error code on failure
  */
-int ff_alloc_packet(AVPacket *avpkt, int size);
+int ff_alloc_packet2(AVCodecContext *avctx, AVPacket *avpkt, int64_t size, int64_t min_size);
+
+attribute_deprecated int ff_alloc_packet(AVPacket *avpkt, int size);
 
 /**
  * Rescale from sample rate to AVCodecContext.time_base.
@@ -189,6 +230,8 @@ int ff_alloc_packet(AVPacket *avpkt, int size);
 static av_always_inline int64_t ff_samples_to_time_base(AVCodecContext *avctx,
                                                         int64_t samples)
 {
+    if(samples == AV_NOPTS_VALUE)
+        return AV_NOPTS_VALUE;
     return av_rescale_q(samples, (AVRational){ 1, avctx->sample_rate },
                         avctx->time_base);
 }
@@ -206,9 +249,25 @@ int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags);
  */
 int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame);
 
-const uint8_t *avpriv_find_start_code(const uint8_t *restrict p,
+int ff_thread_can_start_frame(AVCodecContext *avctx);
+
+int avpriv_h264_has_num_reorder_frames(AVCodecContext *avctx);
+
+/**
+ * Call avcodec_open2 recursively by decrementing counter, unlocking mutex,
+ * calling the function and then restoring again. Assumes the mutex is
+ * already locked
+ */
+int ff_codec_open2_recursive(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options);
+
+/**
+ * Finalize buf into extradata and set its size appropriately.
+ */
+int avpriv_bprint_to_extradata(AVCodecContext *avctx, struct AVBPrint *buf);
+
+const uint8_t *avpriv_find_start_code(const uint8_t *p,
                                       const uint8_t *end,
-                                      uint32_t *restrict state);
+                                      uint32_t *state);
 
 /**
  * Check that the provided frame dimensions are valid and set them on the codec
@@ -240,4 +299,6 @@ int ff_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt);
  */
 int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame);
 
+int ff_side_data_set_encoder_stats(AVPacket *pkt, int quality, int64_t *error, int error_count, int pict_type);
+
 #endif /* AVCODEC_INTERNAL_H */
diff --git a/libavcodec/interplayvideo.c b/libavcodec/interplayvideo.c
index f0a770a519..a29b5fe8da 100644
--- a/libavcodec/interplayvideo.c
+++ b/libavcodec/interplayvideo.c
@@ -1,21 +1,21 @@
 /*
  * Interplay MVE Video Decoder
- * Copyright (C) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -72,10 +72,10 @@ static int copy_from(IpvideoContext *s, AVFrame *src, AVFrame *dst, int delta_x,
     int motion_offset = current_offset + delta_y * dst->linesize[0]
                        + delta_x * (1 + s->is_16bpp);
     if (motion_offset < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, " Interplay video: motion offset < 0 (%d)\n", motion_offset);
+        av_log(s->avctx, AV_LOG_ERROR, "motion offset < 0 (%d)\n", motion_offset);
         return AVERROR_INVALIDDATA;
     } else if (motion_offset > s->upper_motion_limit_offset) {
-        av_log(s->avctx, AV_LOG_ERROR, " Interplay video: motion offset above limit (%d >= %d)\n",
+        av_log(s->avctx, AV_LOG_ERROR, "motion offset above limit (%d >= %d)\n",
             motion_offset, s->upper_motion_limit_offset);
         return AVERROR_INVALIDDATA;
     }
@@ -118,7 +118,7 @@ static int ipvideo_decode_block_opcode_0x2(IpvideoContext *s, AVFrame *frame)
         y =   8 + ((B - 56) / 29);
     }
 
-    ff_dlog(NULL, "    motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
+    ff_tlog(s->avctx, "motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
     return copy_from(s, s->second_last_frame, frame, x, y);
 }
 
@@ -144,7 +144,7 @@ static int ipvideo_decode_block_opcode_0x3(IpvideoContext *s, AVFrame *frame)
         y = -(  8 + ((B - 56) / 29));
     }
 
-    ff_dlog(NULL, "    motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
+    ff_tlog(s->avctx, "motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
     return copy_from(s, frame, frame, x, y);
 }
 
@@ -165,7 +165,7 @@ static int ipvideo_decode_block_opcode_0x4(IpvideoContext *s, AVFrame *frame)
     x = -8 + BL;
     y = -8 + BH;
 
-    ff_dlog(NULL, "    motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
+    ff_tlog(s->avctx, "motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
     return copy_from(s, s->last_frame, frame, x, y);
 }
 
@@ -178,14 +178,14 @@ static int ipvideo_decode_block_opcode_0x5(IpvideoContext *s, AVFrame *frame)
     x = bytestream2_get_byte(&s->stream_ptr);
     y = bytestream2_get_byte(&s->stream_ptr);
 
-    ff_dlog(NULL, "    motion bytes = %d, %d\n", x, y);
+    ff_tlog(s->avctx, "motion bytes = %d, %d\n", x, y);
     return copy_from(s, s->last_frame, frame, x, y);
 }
 
 static int ipvideo_decode_block_opcode_0x6(IpvideoContext *s, AVFrame *frame)
 {
     /* mystery opcode? skip multiple blocks? */
-    av_log(s->avctx, AV_LOG_ERROR, "  Interplay video: Help! Mystery opcode 0x6 seen\n");
+    av_log(s->avctx, AV_LOG_ERROR, "Help! Mystery opcode 0x6 seen\n");
 
     /* report success */
     return 0;
@@ -197,6 +197,11 @@ static int ipvideo_decode_block_opcode_0x7(IpvideoContext *s, AVFrame *frame)
     unsigned char P[2];
     unsigned int flags;
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 4) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0x7\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* 2-color encoding */
     P[0] = bytestream2_get_byte(&s->stream_ptr);
     P[1] = bytestream2_get_byte(&s->stream_ptr);
@@ -236,6 +241,11 @@ static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s, AVFrame *frame)
     unsigned char P[4];
     unsigned int flags = 0;
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 12) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0x8\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* 2-color encoding for each 4x4 quadrant, or 2-color encoding on
      * either top and bottom or left and right halves */
     P[0] = bytestream2_get_byte(&s->stream_ptr);
@@ -308,6 +318,11 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s, AVFrame *frame)
     int x, y;
     unsigned char P[4];
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 8) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0x9\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* 4-color encoding */
     bytestream2_get_buffer(&s->stream_ptr, P, 4);
 
@@ -374,6 +389,11 @@ static int ipvideo_decode_block_opcode_0xA(IpvideoContext *s, AVFrame *frame)
     unsigned char P[8];
     int flags = 0;
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 16) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0xA\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     bytestream2_get_buffer(&s->stream_ptr, P, 4);
 
     /* 4-color encoding for each 4x4 quadrant, or 4-color encoding on
@@ -467,6 +487,11 @@ static int ipvideo_decode_block_opcode_0xD(IpvideoContext *s, AVFrame *frame)
     int y;
     unsigned char P[2];
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 4) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0xD\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* 4-color block encoding: each 4x4 block is a different color */
     for (y = 0; y < 8; y++) {
         if (!(y & 3)) {
@@ -528,7 +553,7 @@ static int ipvideo_decode_block_opcode_0x6_16(IpvideoContext *s, AVFrame *frame)
     x = bytestream2_get_byte(&s->stream_ptr);
     y = bytestream2_get_byte(&s->stream_ptr);
 
-    ff_dlog(NULL, "    motion bytes = %d, %d\n", x, y);
+    ff_tlog(s->avctx, "motion bytes = %d, %d\n", x, y);
     return copy_from(s, s->second_last_frame, frame, x, y);
 }
 
@@ -903,7 +928,7 @@ static void ipvideo_decode_opcodes(IpvideoContext *s, AVFrame *frame)
         for (x = 0; x < s->avctx->width; x += 8) {
             opcode = get_bits(&gb, 4);
 
-            ff_dlog(s->avctx,
+            ff_tlog(s->avctx,
                     "  block @ (%3d, %3d): encoding 0x%X, data ptr offset %d\n",
                     x, y, opcode, bytestream2_tell(&s->stream_ptr));
 
@@ -917,7 +942,7 @@ static void ipvideo_decode_opcodes(IpvideoContext *s, AVFrame *frame)
                 ret = ipvideo_decode_block16[opcode](s, frame);
             }
             if (ret != 0) {
-                av_log(s->avctx, AV_LOG_ERROR, " Interplay video: decode problem on frame %d, @ block (%d, %d)\n",
+                av_log(s->avctx, AV_LOG_ERROR, "decode problem on frame %d, @ block (%d, %d)\n",
                        s->avctx->frame_number, x, y);
                 return;
             }
@@ -925,7 +950,7 @@ static void ipvideo_decode_opcodes(IpvideoContext *s, AVFrame *frame)
     }
     if (bytestream2_get_bytes_left(&s->stream_ptr) > 1) {
         av_log(s->avctx, AV_LOG_ERROR,
-               "Interplay video: decode finished with %d bytes left over\n",
+               "decode finished with %d bytes left over\n",
                bytestream2_get_bytes_left(&s->stream_ptr));
     }
 }
@@ -970,14 +995,17 @@ static int ipvideo_decode_frame(AVCodecContext *avctx,
     if (buf_size < s->decoding_map_size)
         return buf_size;
 
+    if (av_packet_get_side_data(avpkt, AV_PKT_DATA_PARAM_CHANGE, NULL)) {
+        av_frame_unref(s->last_frame);
+        av_frame_unref(s->second_last_frame);
+    }
+
     s->decoding_map = buf;
     bytestream2_init(&s->stream_ptr, buf + s->decoding_map_size,
                      buf_size - s->decoding_map_size);
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "  Interplay Video: get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if (!s->is_16bpp) {
         const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
diff --git a/libavcodec/intrax8.c b/libavcodec/intrax8.c
index 3cd84dc170..cf01289fdf 100644
--- a/libavcodec/intrax8.c
+++ b/libavcodec/intrax8.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,7 @@
  * @brief IntraX8 (J-Frame) subdecoder, used by WMV2 and VC-1
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "error_resilience.h"
 #include "get_bits.h"
@@ -127,13 +128,13 @@ static inline void x8_select_ac_table(IntraX8Context * const w , int mode){
     MpegEncContext * const s= w->s;
     int table_index;
 
-    assert(mode<4);
+    av_assert2(mode<4);
 
     if( w->j_ac_vlc[mode] ) return;
 
     table_index = get_bits(&s->gb, 3);
     w->j_ac_vlc[mode] = &j_ac_vlc[w->quant<13][mode>>1][table_index];//2 modes use same tables
-    assert(w->j_ac_vlc[mode]);
+    av_assert2(w->j_ac_vlc[mode]);
 }
 
 static inline int x8_get_orient_vlc(IntraX8Context * w){
@@ -144,8 +145,6 @@ static inline int x8_get_orient_vlc(IntraX8Context * w){
         table_index = get_bits(&s->gb, 1+(w->quant<13) );
         w->j_orient_vlc = &j_orient_vlc[w->quant<13][table_index];
     }
-    assert(w->j_orient_vlc);
-    assert(w->j_orient_vlc->table);
 
     return get_vlc2(&s->gb, w->j_orient_vlc->table, OR_VLC_BITS, OR_VLC_MTD);
 }
@@ -267,15 +266,13 @@ static int x8_get_dc_rlf(IntraX8Context * const w,int const mode, int * const le
     MpegEncContext * const s= w->s;
     int i,e,c;
 
-    assert(mode<3);
+    av_assert2(mode<3);
     if( !w->j_dc_vlc[mode] ) {
         int table_index;
         table_index = get_bits(&s->gb, 3);
         //4 modes, same table
         w->j_dc_vlc[mode]= &j_dc_vlc[w->quant<13][table_index];
     }
-    assert(w->j_dc_vlc);
-    assert(w->j_dc_vlc[mode]->table);
 
     i=get_vlc2(&s->gb, w->j_dc_vlc[mode]->table, DC_VLC_BITS, DC_VLC_MTD);
 
@@ -328,7 +325,7 @@ static int x8_setup_spatial_predictor(IntraX8Context * const w, const int chroma
     if(chroma)
         return 0;
 
-    assert(w->orient < 3);
+    av_assert2(w->orient < 3);
     if(range < 2*w->quant){
         if( (w->edges&3) == 0){
             if(w->orient==1) w->orient=11;
@@ -345,8 +342,8 @@ static int x8_setup_spatial_predictor(IntraX8Context * const w, const int chroma
         };
         w->raw_orient=x8_get_orient_vlc(w);
         if(w->raw_orient<0) return -1;
-        assert(w->raw_orient < 12 );
-        assert(w->orient<3);
+        av_assert2(w->raw_orient < 12 );
+        av_assert2(w->orient<3);
         w->orient=prediction_table[w->orient][w->raw_orient];
     }
     return 0;
@@ -441,7 +438,7 @@ lut2[q>12][c]={
 static void x8_ac_compensation(IntraX8Context * const w, int const direction, int const dc_level){
     MpegEncContext * const s= w->s;
     int t;
-#define B(x, y) s->block[0][s->idsp.idct_permutation[(x) + (y) * 8]]
+#define B(x,y)  s->block[0][w->idct_permutation[(x)+(y)*8]]
 #define T(x)  ((x) * dc_level + 0x8000) >> 16;
     switch(direction){
     case 0:
@@ -538,7 +535,7 @@ static int x8_decode_intra_mb(IntraX8Context* const w, const int chroma){
     int use_quant_matrix;
     int sign;
 
-    assert(w->orient<12);
+    av_assert2(w->orient<12);
     s->bdsp.clear_block(s->block[0]);
 
     if(chroma){
@@ -647,7 +644,7 @@ static int x8_decode_intra_mb(IntraX8Context* const w, const int chroma){
                                             s->current_picture.f->linesize[!!chroma] );
     }
     if(!zeros_only)
-        s->idsp.idct_add(s->dest[chroma],
+        w->wdsp.idct_add(s->dest[chroma],
                          s->current_picture.f->linesize[!!chroma],
                          s->block[0]);
 
@@ -696,12 +693,16 @@ av_cold void ff_intrax8_common_init(IntraX8Context * w, MpegEncContext * const s
 
     w->s=s;
     x8_vlc_init();
-    assert(s->mb_width>0);
+    av_assert0(s->mb_width>0);
     w->prediction_table=av_mallocz(s->mb_width*2*2);//two rows, 2 blocks per cannon mb
 
-    ff_init_scantable(s->idsp.idct_permutation, &w->scantable[0], ff_wmv1_scantable[0]);
-    ff_init_scantable(s->idsp.idct_permutation, &w->scantable[1], ff_wmv1_scantable[2]);
-    ff_init_scantable(s->idsp.idct_permutation, &w->scantable[2], ff_wmv1_scantable[3]);
+    ff_wmv2dsp_init(&w->wdsp);
+    ff_init_scantable_permutation(w->idct_permutation,
+                                  w->wdsp.idct_perm);
+
+    ff_init_scantable(w->idct_permutation, &w->scantable[0], ff_wmv1_scantable[0]);
+    ff_init_scantable(w->idct_permutation, &w->scantable[1], ff_wmv1_scantable[2]);
+    ff_init_scantable(w->idct_permutation, &w->scantable[2], ff_wmv1_scantable[3]);
 
     ff_intrax8dsp_init(&w->dsp);
 }
@@ -721,6 +722,7 @@ av_cold void ff_intrax8_common_end(IntraX8Context * w)
  * The parent codec must call ff_mpv_frame_start(), ff_er_frame_start() before calling this function.
  * The parent codec must call ff_er_frame_end(), ff_mpv_frame_end() after calling this function.
  * This function does not use ff_mpv_decode_mb().
+ * lowres decoding is theoretically impossible.
  * @param w pointer to IntraX8Context
  * @param dquant doubled quantizer, it would be odd in case of VC-1 halfpq==1.
  * @param quant_offset offset away from zero
@@ -728,7 +730,6 @@ av_cold void ff_intrax8_common_end(IntraX8Context * w)
 int ff_intrax8_decode_picture(IntraX8Context * const w, int dquant, int quant_offset){
     MpegEncContext * const s= w->s;
     int mb_xy;
-    assert(s);
     w->use_quant_matrix = get_bits1(&s->gb);
 
     w->dquant = dquant;
diff --git a/libavcodec/intrax8.h b/libavcodec/intrax8.h
index 69673171a8..9981785403 100644
--- a/libavcodec/intrax8.h
+++ b/libavcodec/intrax8.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,6 +22,7 @@
 #include "get_bits.h"
 #include "mpegvideo.h"
 #include "intrax8dsp.h"
+#include "wmv2dsp.h"
 
 typedef struct IntraX8Context {
     VLC * j_ac_vlc[4];//they point to the static j_mb_vlc
@@ -32,6 +33,8 @@ typedef struct IntraX8Context {
 //set by ff_intrax8_common_init
     uint8_t * prediction_table;//2*(mb_w*2)
     ScanTable scantable[3];
+    WMV2DSPContext wdsp;
+    uint8_t idct_permutation[64];
 //set by the caller codec
     MpegEncContext * s;
     IntraX8DSPContext dsp;
diff --git a/libavcodec/intrax8dsp.c b/libavcodec/intrax8dsp.c
index 1115945eba..1b34f899d6 100644
--- a/libavcodec/intrax8dsp.c
+++ b/libavcodec/intrax8dsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/intrax8dsp.h b/libavcodec/intrax8dsp.h
index 5c3cc4aa74..1e4a3af881 100644
--- a/libavcodec/intrax8dsp.h
+++ b/libavcodec/intrax8dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/intrax8huf.h b/libavcodec/intrax8huf.h
index 6bf01f388a..375906bab2 100644
--- a/libavcodec/intrax8huf.h
+++ b/libavcodec/intrax8huf.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ituh263dec.c b/libavcodec/ituh263dec.c
index b9189b2e4d..2e449f8eeb 100644
--- a/libavcodec/ituh263dec.c
+++ b/libavcodec/ituh263dec.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2001 Juan J. Sierralta P
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
  * h263 decoder.
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
 #include <limits.h>
 
 #include "libavutil/attributes.h"
@@ -106,11 +107,9 @@ static VLC cbpc_b_vlc;
 /* XXX: find a better solution to handle static init */
 av_cold void ff_h263_decode_init_vlc(void)
 {
-    static int done = 0;
+    static volatile int done = 0;
 
     if (!done) {
-        done = 1;
-
         INIT_VLC_STATIC(&ff_h263_intra_MCBPC_vlc, INTRA_MCBPC_VLC_BITS, 9,
                  ff_h263_intra_MCBPC_bits, 1, 1,
                  ff_h263_intra_MCBPC_code, 1, 1, 72);
@@ -133,6 +132,7 @@ av_cold void ff_h263_decode_init_vlc(void)
         INIT_VLC_STATIC(&cbpc_b_vlc, CBPC_B_VLC_BITS, 4,
                  &ff_cbpc_b_tab[0][1], 2, 1,
                  &ff_cbpc_b_tab[0][0], 2, 1, 8);
+        done = 1;
     }
 }
 
@@ -175,17 +175,17 @@ static int h263_decode_gob_header(MpegEncContext *s)
         return -1;
 
     if(s->h263_slice_structured){
-        if(get_bits1(&s->gb)==0)
+        if(check_marker(&s->gb, "before MBA")==0)
             return -1;
 
         ff_h263_decode_mba(s);
 
         if(s->mb_num > 1583)
-            if(get_bits1(&s->gb)==0)
+            if(check_marker(&s->gb, "after MBA")==0)
                 return -1;
 
         s->qscale = get_bits(&s->gb, 5); /* SQUANT */
-        if(get_bits1(&s->gb)==0)
+        if(check_marker(&s->gb, "after SQUANT")==0)
             return -1;
         skip_bits(&s->gb, 2); /* GFID */
     }else{
@@ -206,27 +206,6 @@ static int h263_decode_gob_header(MpegEncContext *s)
 }
 
 /**
- * Find the next resync_marker.
- * @param p pointer to buffer to scan
- * @param end pointer to the end of the buffer
- * @return pointer to the next resync_marker, or end if none was found
- */
-const uint8_t *ff_h263_find_resync_marker(const uint8_t *restrict p, const uint8_t * restrict end)
-{
-    assert(p < end);
-
-    end-=2;
-    p++;
-    for(;p<end; p+=2){
-        if(!*p){
-            if     (!p[-1] && p[1]) return p - 1;
-            else if(!p[ 1] && p[2]) return p;
-        }
-    }
-    return end+2;
-}
-
-/**
  * Decode the group of blocks / video packet header.
  * @return bit position of the resync_marker, or <0 if none was found
  */
@@ -328,7 +307,7 @@ static int h263p_decode_umotion(MpegEncContext * s, int pred)
    code >>= 1;
 
    code = (sign) ? (pred - code) : (pred + code);
-   ff_dlog(s->avctx,"H.263+ UMV Motion = %d\n", code);
+   ff_tlog(s->avctx,"H.263+ UMV Motion = %d\n", code);
    return code;
 
 }
@@ -350,7 +329,7 @@ static void preview_obmc(MpegEncContext *s){
         s->block_index[i]+= 1;
     s->mb_x++;
 
-    assert(s->pict_type == AV_PICTURE_TYPE_P);
+    av_assert2(s->pict_type == AV_PICTURE_TYPE_P);
 
     do{
         if (get_bits1(&s->gb)) {
@@ -444,7 +423,7 @@ static void h263_decode_dquant(MpegEncContext *s){
 static int h263_decode_block(MpegEncContext * s, int16_t * block,
                              int n, int coded)
 {
-    int code, level, i, j, last, run;
+    int level, i, j, run;
     RLTable *rl = &ff_h263_rl_inter;
     const uint8_t *scan_table;
     GetBitContext gb= s->gb;
@@ -485,7 +464,7 @@ static int h263_decode_block(MpegEncContext * s, int16_t * block,
             level = get_bits(&s->gb, 8);
             if((level&0x7F) == 0){
                 av_log(s->avctx, AV_LOG_ERROR, "illegal dc %d at %d %d\n", level, s->mb_x, s->mb_y);
-                if (s->avctx->err_recognition & AV_EF_BITSTREAM)
+                if (s->avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT))
                     return -1;
             }
             if (level == 255)
@@ -503,39 +482,67 @@ static int h263_decode_block(MpegEncContext * s, int16_t * block,
         return 0;
     }
 retry:
+    {
+    OPEN_READER(re, &s->gb);
+    i--; // offset by -1 to allow direct indexing of scan_table
     for(;;) {
-        code = get_vlc2(&s->gb, rl->vlc.table, TEX_VLC_BITS, 2);
-        if (code < 0){
-            av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n", s->mb_x, s->mb_y);
-            return -1;
-        }
-        if (code == rl->n) {
+        UPDATE_CACHE(re, &s->gb);
+        GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+        if (run == 66) {
+            if (level){
+                CLOSE_READER(re, &s->gb);
+                av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n", s->mb_x, s->mb_y);
+                return -1;
+            }
             /* escape */
             if (CONFIG_FLV_DECODER && s->h263_flv > 1) {
-                ff_flv2_decode_ac_esc(&s->gb, &level, &run, &last);
+                int is11 = SHOW_UBITS(re, &s->gb, 1);
+                SKIP_CACHE(re, &s->gb, 1);
+                run = SHOW_UBITS(re, &s->gb, 7) + 1;
+                if (is11) {
+                    SKIP_COUNTER(re, &s->gb, 1 + 7);
+                    UPDATE_CACHE(re, &s->gb);
+                    level = SHOW_SBITS(re, &s->gb, 11);
+                    SKIP_COUNTER(re, &s->gb, 11);
+                } else {
+                    SKIP_CACHE(re, &s->gb, 7);
+                    level = SHOW_SBITS(re, &s->gb, 7);
+                    SKIP_COUNTER(re, &s->gb, 1 + 7 + 7);
+                }
             } else {
-                last = get_bits1(&s->gb);
-                run = get_bits(&s->gb, 6);
-                level = (int8_t)get_bits(&s->gb, 8);
+                run = SHOW_UBITS(re, &s->gb, 7) + 1;
+                SKIP_CACHE(re, &s->gb, 7);
+                level = (int8_t)SHOW_UBITS(re, &s->gb, 8);
+                SKIP_COUNTER(re, &s->gb, 7 + 8);
                 if(level == -128){
+                    UPDATE_CACHE(re, &s->gb);
                     if (s->codec_id == AV_CODEC_ID_RV10) {
                         /* XXX: should patch encoder too */
-                        level = get_sbits(&s->gb, 12);
+                        level = SHOW_SBITS(re, &s->gb, 12);
+                        SKIP_COUNTER(re, &s->gb, 12);
                     }else{
-                        level = get_bits(&s->gb, 5);
-                        level |= get_sbits(&s->gb, 6)<<5;
+                        level = SHOW_UBITS(re, &s->gb, 5);
+                        SKIP_CACHE(re, &s->gb, 5);
+                        level |= SHOW_SBITS(re, &s->gb, 6)<<5;
+                        SKIP_COUNTER(re, &s->gb, 5 + 6);
                     }
                 }
             }
         } else {
-            run = rl->table_run[code];
-            level = rl->table_level[code];
-            last = code >= rl->last;
-            if (get_bits1(&s->gb))
+            if (SHOW_UBITS(re, &s->gb, 1))
                 level = -level;
+            SKIP_COUNTER(re, &s->gb, 1);
         }
         i += run;
         if (i >= 64){
+            CLOSE_READER(re, &s->gb);
+            // redo update without last flag, revert -1 offset
+            i = i - run + ((run-1)&63) + 1;
+            if (i < 64) {
+                // only last marker, no overrun
+                block[scan_table[i]] = level;
+                break;
+            }
             if(s->alt_inter_vlc && rl == &ff_h263_rl_inter && !s->mb_intra){
                 //Looks like a hack but no, it's the way it is supposed to work ...
                 rl = &ff_rl_intra_aic;
@@ -549,9 +556,7 @@ retry:
         }
         j = scan_table[i];
         block[j] = level;
-        if (last)
-            break;
-        i++;
+    }
     }
 not_coded:
     if (s->mb_intra && s->h263_aic) {
@@ -566,11 +571,13 @@ static int h263_skip_b_part(MpegEncContext *s, int cbp)
 {
     LOCAL_ALIGNED_16(int16_t, dblock, [64]);
     int i, mbi;
+    int bli[6];
 
     /* we have to set s->mb_intra to zero to decode B-part of PB-frame correctly
      * but real value should be restored in order to be used later (in OBMC condition)
      */
     mbi = s->mb_intra;
+    memcpy(bli, s->block_last_index, sizeof(bli));
     s->mb_intra = 0;
     for (i = 0; i < 6; i++) {
         if (h263_decode_block(s, dblock, i, cbp&32) < 0)
@@ -578,6 +585,7 @@ static int h263_skip_b_part(MpegEncContext *s, int cbp)
         cbp+=cbp;
     }
     s->mb_intra = mbi;
+    memcpy(s->block_last_index, bli, sizeof(bli));
     return 0;
 }
 
@@ -607,7 +615,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
     const int xy= s->mb_x + s->mb_y * s->mb_stride;
     int cbpb = 0, pb_mv_count = 0;
 
-    assert(!s->h263_pred);
+    av_assert2(!s->h263_pred);
 
     if (s->pict_type == AV_PICTURE_TYPE_P) {
         do{
@@ -747,15 +755,13 @@ int ff_h263_decode_mb(MpegEncContext *s,
         }else
             cbp=0;
 
-        assert(!s->mb_intra);
+        av_assert2(!s->mb_intra);
 
         if(IS_QUANT(mb_type)){
             h263_decode_dquant(s);
         }
 
         if(IS_DIRECT(mb_type)){
-            if (!s->pp_time)
-                return AVERROR_INVALIDDATA;
             s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
             mb_type |= ff_mpeg4_set_direct_mv(s, 0, 0);
         }else{
@@ -874,6 +880,10 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
 
     align_get_bits(&s->gb);
 
+    if (show_bits(&s->gb, 2) == 2 && s->avctx->frame_number == 0) {
+         av_log(s->avctx, AV_LOG_WARNING, "Header looks like RTP instead of H.263\n");
+    }
+
     startcode= get_bits(&s->gb, 22-8);
 
     for(i= get_bits_left(&s->gb); i>24; i-=8) {
@@ -894,9 +904,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
     s->picture_number= (s->picture_number&~0xFF) + i;
 
     /* PTYPE starts here */
-    if (get_bits1(&s->gb) != 1) {
-        /* marker */
-        av_log(s->avctx, AV_LOG_ERROR, "Bad marker\n");
+    if (check_marker(&s->gb, "in PTYPE") != 1) {
         return -1;
     }
     if (get_bits1(&s->gb) != 0) {
@@ -920,6 +928,8 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
         /* H.263v1 */
         width = ff_h263_format[format][0];
         height = ff_h263_format[format][1];
+        if (!width)
+            return -1;
 
         s->pict_type = AV_PICTURE_TYPE_I + get_bits1(&s->gb);
 
@@ -961,6 +971,8 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
             s->h263_aic = get_bits1(&s->gb); /* Advanced Intra Coding (AIC) */
             s->loop_filter= get_bits1(&s->gb);
             s->unrestricted_mv = s->umvplus || s->obmc || s->loop_filter;
+            if(s->avctx->lowres)
+                s->loop_filter = 0;
 
             s->h263_slice_structured= get_bits1(&s->gb);
             if (get_bits1(&s->gb) != 0) {
@@ -1013,7 +1025,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
                 6-14 - reserved
                 */
                 width = (get_bits(&s->gb, 9) + 1) * 4;
-                skip_bits1(&s->gb);
+                check_marker(&s->gb, "in dimensions");
                 height = get_bits(&s->gb, 9) * 4;
                 ff_dlog(s->avctx, "\nH.263+ Custom picture: %dx%d\n",width,height);
                 if (s->aspect_ratio_info == FF_ASPECT_EXTENDED) {
@@ -1028,6 +1040,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
                 height = ff_h263_format[format][1];
                 s->avctx->sample_aspect_ratio= (AVRational){12,11};
             }
+            s->avctx->sample_aspect_ratio.den <<= s->ehc_mode;
             if ((width == 0) || (height == 0))
                 return -1;
             s->width = width;
@@ -1103,20 +1116,17 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
     }
 
     /* PEI */
-    while (get_bits1(&s->gb) != 0) {
-        skip_bits(&s->gb, 8);
-    }
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     if(s->h263_slice_structured){
-        if (get_bits1(&s->gb) != 1) {
-            av_log(s->avctx, AV_LOG_ERROR, "SEPB1 marker missing\n");
+        if (check_marker(&s->gb, "SEPB1") != 1) {
             return -1;
         }
 
         ff_h263_decode_mba(s);
 
-        if (get_bits1(&s->gb) != 1) {
-            av_log(s->avctx, AV_LOG_ERROR, "SEPB2 marker missing\n");
+        if (check_marker(&s->gb, "SEPB2") != 1) {
             return -1;
         }
     }
@@ -1131,7 +1141,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
     }
 
         ff_h263_show_pict_info(s);
-    if (s->pict_type == AV_PICTURE_TYPE_I && s->codec_tag == AV_RL32("ZYGO")){
+    if (s->pict_type == AV_PICTURE_TYPE_I && s->codec_tag == AV_RL32("ZYGO") && get_bits_left(&s->gb) >= 85 + 13*3*16 + 50){
         int i,j;
         for(i=0; i<85; i++) av_log(s->avctx, AV_LOG_DEBUG, "%d", get_bits1(&s->gb));
         av_log(s->avctx, AV_LOG_DEBUG, "\n");
diff --git a/libavcodec/ituh263enc.c b/libavcodec/ituh263enc.c
index f9986a4ad7..03f4011bfa 100644
--- a/libavcodec/ituh263enc.c
+++ b/libavcodec/ituh263enc.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2001 Juan J. Sierralta P
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -90,7 +90,7 @@ static const uint8_t wrong_run[102] = {
 av_const int ff_h263_aspect_to_info(AVRational aspect){
     int i;
 
-    if(aspect.num==0) aspect= (AVRational){1,1};
+    if(aspect.num==0 || aspect.den==0) aspect= (AVRational){1,1};
 
     for(i=1; i<6; i++){
         if(av_cmp_q(ff_h263_pixel_aspect[i], aspect) == 0){
@@ -228,19 +228,11 @@ void ff_h263_encode_picture_header(MpegEncContext * s, int picture_number)
     if(s->h263_slice_structured){
         put_bits(&s->pb, 1, 1);
 
-        assert(s->mb_x == 0 && s->mb_y == 0);
+        av_assert1(s->mb_x == 0 && s->mb_y == 0);
         ff_h263_encode_mba(s);
 
         put_bits(&s->pb, 1, 1);
     }
-
-    if(s->h263_aic){
-         s->y_dc_scale_table=
-         s->c_dc_scale_table= ff_aic_dc_scale_table;
-    }else{
-        s->y_dc_scale_table=
-        s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
-    }
 }
 
 /**
@@ -270,7 +262,7 @@ void ff_h263_encode_gob_header(MpegEncContext * s, int mb_line)
 }
 
 /**
- * modify qscale so that encoding is acually possible in h263 (limit difference to -2..2)
+ * modify qscale so that encoding is actually possible in h263 (limit difference to -2..2)
  */
 void ff_clean_h263_qscales(MpegEncContext *s){
     int i;
@@ -395,7 +387,7 @@ static void h263_encode_block(MpegEncContext * s, int16_t * block, int n)
                 put_bits(&s->pb, 1, last);
                 put_bits(&s->pb, 6, run);
 
-                assert(slevel != 0);
+                av_assert2(slevel != 0);
 
                 if(level < 128)
                     put_sbits(&s->pb, 8, slevel);
@@ -416,7 +408,7 @@ static void h263_encode_block(MpegEncContext * s, int16_t * block, int n)
 }
 
 /* Encode MV differences on H.263+ with Unrestricted MV mode */
-static void h263p_encode_umotion(MpegEncContext * s, int val)
+static void h263p_encode_umotion(PutBitContext *pb, int val)
 {
     short sval = 0;
     short i = 0;
@@ -426,11 +418,11 @@ static void h263p_encode_umotion(MpegEncContext * s, int val)
     int tcode;
 
     if ( val == 0)
-        put_bits(&s->pb, 1, 1);
+        put_bits(pb, 1, 1);
     else if (val == 1)
-        put_bits(&s->pb, 3, 0);
+        put_bits(pb, 3, 0);
     else if (val == -1)
-        put_bits(&s->pb, 3, 2);
+        put_bits(pb, 3, 2);
     else {
 
         sval = ((val < 0) ? (short)(-val):(short)val);
@@ -449,7 +441,7 @@ static void h263p_encode_umotion(MpegEncContext * s, int val)
             i--;
         }
         code = ((code << 1) | (val < 0)) << 1;
-        put_bits(&s->pb, (2*n_bits)+1, code);
+        put_bits(pb, (2*n_bits)+1, code);
     }
 }
 
@@ -506,8 +498,8 @@ void ff_h263_encode_mb(MpegEncContext * s,
                                                 motion_y - pred_y, 1);
             }
             else {
-                h263p_encode_umotion(s, motion_x - pred_x);
-                h263p_encode_umotion(s, motion_y - pred_y);
+                h263p_encode_umotion(&s->pb, motion_x - pred_x);
+                h263p_encode_umotion(&s->pb, motion_y - pred_y);
                 if (((motion_x - pred_x) == 1) && ((motion_y - pred_y) == 1))
                     /* To prevent Start Code emulation */
                     put_bits(&s->pb,1,1);
@@ -535,8 +527,8 @@ void ff_h263_encode_mb(MpegEncContext * s,
                                                     motion_y - pred_y, 1);
                 }
                 else {
-                    h263p_encode_umotion(s, motion_x - pred_x);
-                    h263p_encode_umotion(s, motion_y - pred_y);
+                    h263p_encode_umotion(&s->pb, motion_x - pred_x);
+                    h263p_encode_umotion(&s->pb, motion_y - pred_y);
                     if (((motion_x - pred_x) == 1) && ((motion_y - pred_y) == 1))
                         /* To prevent Start Code emulation */
                         put_bits(&s->pb,1,1);
@@ -548,7 +540,7 @@ void ff_h263_encode_mb(MpegEncContext * s,
             s->mv_bits+= get_bits_diff(s);
         }
     } else {
-        assert(s->mb_intra);
+        av_assert2(s->mb_intra);
 
         cbp = 0;
         if (s->h263_aic) {
@@ -652,14 +644,14 @@ void ff_h263_encode_mb(MpegEncContext * s,
     }
 }
 
-void ff_h263_encode_motion(MpegEncContext * s, int val, int f_code)
+void ff_h263_encode_motion(PutBitContext *pb, int val, int f_code)
 {
     int range, bit_size, sign, code, bits;
 
     if (val == 0) {
         /* zero vector */
         code = 0;
-        put_bits(&s->pb, ff_mvtab[code][1], ff_mvtab[code][0]);
+        put_bits(pb, ff_mvtab[code][1], ff_mvtab[code][0]);
     } else {
         bit_size = f_code - 1;
         range = 1 << bit_size;
@@ -673,9 +665,9 @@ void ff_h263_encode_motion(MpegEncContext * s, int val, int f_code)
         code = (val >> bit_size) + 1;
         bits = val & (range - 1);
 
-        put_bits(&s->pb, ff_mvtab[code][1] + 1, (ff_mvtab[code][0] << 1) | sign);
+        put_bits(pb, ff_mvtab[code][1] + 1, (ff_mvtab[code][0] << 1) | sign);
         if (bit_size > 0) {
-            put_bits(&s->pb, bit_size, bits);
+            put_bits(pb, bit_size, bits);
         }
     }
 }
@@ -727,8 +719,8 @@ static av_cold void init_uni_h263_rl_tab(RLTable *rl, uint32_t *bits_tab,
 {
     int slevel, run, last;
 
-    assert(MAX_LEVEL >= 64);
-    assert(MAX_RUN   >= 63);
+    av_assert0(MAX_LEVEL >= 64);
+    av_assert0(MAX_RUN   >= 63);
 
     for(slevel=-64; slevel<64; slevel++){
         if(slevel==0) continue;
@@ -817,12 +809,15 @@ av_cold void ff_h263_encode_init(MpegEncContext *s)
             s->min_qcoeff= -127;
             s->max_qcoeff=  127;
         }
-        s->y_dc_scale_table=
-        s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
         break;
     default: //nothing needed - default table already set in mpegvideo.c
         s->min_qcoeff= -127;
         s->max_qcoeff=  127;
+    }
+    if(s->h263_aic){
+         s->y_dc_scale_table=
+         s->c_dc_scale_table= ff_aic_dc_scale_table;
+    }else{
         s->y_dc_scale_table=
         s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
     }
diff --git a/libavcodec/ivi.c b/libavcodec/ivi.c
index 6086e2e681..4525ff494c 100644
--- a/libavcodec/ivi.c
+++ b/libavcodec/ivi.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -87,12 +87,9 @@ static int ivi_mc(IVIBandDesc *band, ivi_mc_func mc, ivi_mc_avg_func mc_avg,
     int ref_size = (mc_type > 1) * band->pitch + (mc_type & 1);
 
     if (mc_type != -1) {
-        if (offs < 0 || ref_offs < 0 || !band->ref_buf)
-            return AVERROR_INVALIDDATA;
-        if (buf_size - min_size < offs)
-            return AVERROR_INVALIDDATA;
-        if (buf_size - min_size - ref_size < ref_offs)
-            return AVERROR_INVALIDDATA;
+        av_assert0(offs >= 0 && ref_offs >= 0 && band->ref_buf);
+        av_assert0(buf_size - min_size >= offs);
+        av_assert0(buf_size - min_size - ref_size >= ref_offs);
     }
 
     if (mc_type2 == -1) {
@@ -136,7 +133,7 @@ static uint16_t inv_bits(uint16_t val, int nbits)
 
 /*
  *  Generate a huffman codebook from the given descriptor
- *  and convert it into the Libav VLC table.
+ *  and convert it into the FFmpeg VLC table.
  *
  *  @param[in]   cb    pointer to codebook descriptor
  *  @param[out]  vlc   where to place the generated VLC table
@@ -250,7 +247,7 @@ int ff_ivi_dec_huff_desc(GetBitContext *gb, int desc_coded, int which_tab,
             new_huff.xbits[i] = get_bits(gb, 4);
 
         /* Have we got the same custom table? Rebuild if not. */
-        if (ivi_huff_desc_cmp(&new_huff, &huff_tab->cust_desc)) {
+        if (ivi_huff_desc_cmp(&new_huff, &huff_tab->cust_desc) || !huff_tab->cust_tab.table) {
             ivi_huff_desc_copy(&huff_tab->cust_desc, &new_huff);
 
             if (huff_tab->cust_tab.table)
@@ -285,6 +282,7 @@ static av_cold void ivi_free_buffers(IVIPlaneDesc *planes)
     int p, b, t;
 
     for (p = 0; p < 3; p++) {
+        if (planes[p].bands)
         for (b = 0; b < planes[p].num_bands; b++) {
             av_freep(&planes[p].bands[b].bufs[0]);
             av_freep(&planes[p].bands[b].bufs[1]);
@@ -327,7 +325,7 @@ av_cold int ff_ivi_init_planes(IVIPlaneDesc *planes, const IVIPicConfig *cfg,
     planes[1].num_bands = planes[2].num_bands = cfg->chroma_bands;
 
     for (p = 0; p < 3; p++) {
-        planes[p].bands = av_mallocz(planes[p].num_bands * sizeof(IVIBandDesc));
+        planes[p].bands = av_mallocz_array(planes[p].num_bands, sizeof(IVIBandDesc));
         if (!planes[p].bands)
             return AVERROR(ENOMEM);
 
@@ -356,6 +354,7 @@ av_cold int ff_ivi_init_planes(IVIPlaneDesc *planes, const IVIPicConfig *cfg,
             band->aheight  = height_aligned;
             band->bufs[0]  = av_mallocz(buf_size);
             band->bufs[1]  = av_mallocz(buf_size);
+            band->bufsize  = buf_size/2;
             if (!band->bufs[0] || !band->bufs[1])
                 return AVERROR(ENOMEM);
 
@@ -397,14 +396,16 @@ static int ivi_init_tiles(IVIBandDesc *band, IVITile *ref_tile,
                                               band->mb_size);
 
             av_freep(&tile->mbs);
-            tile->mbs = av_malloc(tile->num_MBs * sizeof(IVIMbInfo));
+            tile->mbs = av_mallocz_array(tile->num_MBs, sizeof(IVIMbInfo));
             if (!tile->mbs)
                 return AVERROR(ENOMEM);
 
             tile->ref_mbs = 0;
             if (p || b) {
-                if (tile->num_MBs != ref_tile->num_MBs)
+                if (tile->num_MBs != ref_tile->num_MBs) {
+                    av_log(NULL, AV_LOG_DEBUG, "ref_tile mismatch\n");
                     return AVERROR_INVALIDDATA;
+                }
                 tile->ref_mbs = ref_tile->mbs;
                 ref_tile++;
             }
@@ -429,6 +430,8 @@ av_cold int ff_ivi_init_tiles(IVIPlaneDesc *planes,
             t_width  >>= 1;
             t_height >>= 1;
         }
+        if(t_width<=0 || t_height<=0)
+            return AVERROR(EINVAL);
 
         for (b = 0; b < planes[p].num_bands; b++) {
             band = &planes[p].bands[b];
@@ -437,7 +440,7 @@ av_cold int ff_ivi_init_tiles(IVIPlaneDesc *planes,
             band->num_tiles = x_tiles * y_tiles;
 
             av_freep(&band->tiles);
-            band->tiles = av_mallocz(band->num_tiles * sizeof(IVITile));
+            band->tiles = av_mallocz_array(band->num_tiles, sizeof(IVITile));
             if (!band->tiles)
                 return AVERROR(ENOMEM);
 
@@ -486,10 +489,6 @@ static int ivi_dc_transform(IVIBandDesc *band, int *prev_dc, int buf_offs,
     int buf_size = band->pitch * band->aheight - buf_offs;
     int min_size = (blk_size - 1) * band->pitch + blk_size;
 
-    if (!band->dc_transform)
-        return 0;
-
-
     if (min_size > buf_size)
         return AVERROR_INVALIDDATA;
 
@@ -583,6 +582,11 @@ static int ivi_decode_coded_blocks(GetBitContext *gb, IVIBandDesc *band,
         col_flags[0] |= !!*prev_dc;
     }
 
+    if(band->transform_size > band->blk_size){
+        av_log(NULL, AV_LOG_ERROR, "Too large transform\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* apply inverse transform */
     band->inv_transform(trvec, band->buf + offs,
                         band->pitch, col_flags);
@@ -642,7 +646,7 @@ static int ivi_decode_blocks(GetBitContext *gb, IVIBandDesc *band,
 
         quant = band->glob_quant + mb->q_delta;
         if (avctx->codec_id == AV_CODEC_ID_INDEO4)
-            quant = av_clip(quant, 0, 31);
+            quant = av_clip_uintp2(quant, 5);
         else
             quant = av_clip(quant, 0, 23);
 
@@ -805,6 +809,22 @@ static int ivi_process_empty_tile(AVCodecContext *avctx, IVIBandDesc *band,
                     mb->mv_y = ref_mb->mv_y;
                 }
                 need_mc |= mb->mv_x || mb->mv_y; /* tracking non-zero motion vectors */
+                {
+                    int dmv_x, dmv_y, cx, cy;
+
+                    dmv_x = mb->mv_x >> band->is_halfpel;
+                    dmv_y = mb->mv_y >> band->is_halfpel;
+                    cx    = mb->mv_x &  band->is_halfpel;
+                    cy    = mb->mv_y &  band->is_halfpel;
+
+                    if (   mb->xpos + dmv_x < 0
+                        || mb->xpos + dmv_x + band->mb_size + cx > band->pitch
+                        || mb->ypos + dmv_y < 0
+                        || mb->ypos + dmv_y + band->mb_size + cy > band->aheight) {
+                        av_log(avctx, AV_LOG_ERROR, "MV out of bounds\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                }
             }
 
             mb++;
@@ -946,6 +966,10 @@ static int decode_band(IVI45DecContext *ctx,
         idx2 = band->corr[i * 2 + 1];
         FFSWAP(uint8_t, band->rv_map->runtab[idx1], band->rv_map->runtab[idx2]);
         FFSWAP(int16_t, band->rv_map->valtab[idx1], band->rv_map->valtab[idx2]);
+        if (idx1 == band->rv_map->eob_sym || idx2 == band->rv_map->eob_sym)
+            band->rv_map->eob_sym ^= idx1 ^ idx2;
+        if (idx1 == band->rv_map->esc_sym || idx2 == band->rv_map->esc_sym)
+            band->rv_map->esc_sym ^= idx1 ^ idx2;
     }
 
     pos = get_bits_count(&ctx->gb);
@@ -969,7 +993,8 @@ static int decode_band(IVI45DecContext *ctx,
             tile->data_size = ivi_dec_tile_data_size(&ctx->gb);
             if (!tile->data_size) {
                 av_log(avctx, AV_LOG_ERROR, "Tile data size is zero!\n");
-                return AVERROR_INVALIDDATA;
+                result = AVERROR_INVALIDDATA;
+                break;
             }
 
             result = ctx->decode_mb_info(ctx, band, tile, avctx);
@@ -1001,6 +1026,10 @@ static int decode_band(IVI45DecContext *ctx,
         idx2 = band->corr[i*2+1];
         FFSWAP(uint8_t, band->rv_map->runtab[idx1], band->rv_map->runtab[idx2]);
         FFSWAP(int16_t, band->rv_map->valtab[idx1], band->rv_map->valtab[idx2]);
+        if (idx1 == band->rv_map->eob_sym || idx2 == band->rv_map->eob_sym)
+            band->rv_map->eob_sym ^= idx1 ^ idx2;
+        if (idx1 == band->rv_map->esc_sym || idx2 == band->rv_map->esc_sym)
+            band->rv_map->esc_sym ^= idx1 ^ idx2;
     }
 
 #ifdef DEBUG
@@ -1068,6 +1097,7 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     //{ START_TIMER;
 
     if (ctx->is_nonnull_frame(ctx)) {
+        ctx->buf_invalid[ctx->dst_buf] = 1;
         for (p = 0; p < 3; p++) {
             for (b = 0; b < ctx->planes[p].num_bands; b++) {
                 result = decode_band(ctx, &ctx->planes[p].bands[b], avctx);
@@ -1078,6 +1108,7 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 }
             }
         }
+        ctx->buf_invalid[ctx->dst_buf] = 0;
     } else {
         if (ctx->is_scalable)
             return AVERROR_INVALIDDATA;
@@ -1087,17 +1118,20 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 return AVERROR_INVALIDDATA;
         }
     }
+    if (ctx->buf_invalid[ctx->dst_buf])
+        return -1;
 
     //STOP_TIMER("decode_planes"); }
 
+    if (!ctx->is_nonnull_frame(ctx))
+        return buf_size;
+
     result = ff_set_dimensions(avctx, ctx->planes[0].width, ctx->planes[0].height);
     if (result < 0)
         return result;
 
-    if ((result = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((result = ff_get_buffer(avctx, frame, 0)) < 0)
         return result;
-    }
 
     if (ctx->is_scalable) {
         if (ctx->is_indeo4)
@@ -1121,7 +1155,11 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (ctx->is_indeo4 && ctx->frame_type == IVI4_FRAMETYPE_INTRA) {
         int left;
 
-        while (get_bits(&ctx->gb, 8)); // skip version string
+            // skip version string
+        while (get_bits(&ctx->gb, 8)) {
+            if (get_bits_left(&ctx->gb) < 8)
+                return AVERROR_INVALIDDATA;
+        }
         left = get_bits_count(&ctx->gb) & 0x18;
         skip_bits_long(&ctx->gb, 64 - left);
         if (get_bits_left(&ctx->gb) > 18 &&
diff --git a/libavcodec/ivi.h b/libavcodec/ivi.h
index 5dadd9f4a5..abe43462a1 100644
--- a/libavcodec/ivi.h
+++ b/libavcodec/ivi.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -165,6 +165,7 @@ typedef struct IVIBandDesc {
     int             quant_mat;      ///< dequant matrix index
     int             glob_quant;     ///< quant base for this band
     const uint8_t   *scan;          ///< ptr to the scan pattern
+    int             scan_size;      ///< size of the scantable
 
     IVIHuffTab      blk_vlc;        ///< vlc table for decoding block data
 
@@ -264,6 +265,7 @@ typedef struct IVI45DecContext {
     int             (*is_nonnull_frame)(struct IVI45DecContext *ctx);
 
     int gop_invalid;
+    int buf_invalid[4];
 
     int is_indeo4;
 
diff --git a/libavcodec/ivi_dsp.c b/libavcodec/ivi_dsp.c
index 7d1278431b..4b973992e0 100644
--- a/libavcodec/ivi_dsp.c
+++ b/libavcodec/ivi_dsp.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
     int32_t         b0_1, b0_2, b1_1, b1_2, b1_3, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6;
     int32_t         b3_1, b3_2, b3_3, b3_4, b3_5, b3_6, b3_7, b3_8, b3_9;
     int32_t         pitch, back_pitch;
-    const short    *b0_ptr, *b1_ptr, *b2_ptr, *b3_ptr;
+    const short     *b0_ptr, *b1_ptr, *b2_ptr, *b3_ptr;
     const int       num_bands = 4;
 
     /* all bands should have the same pitch */
@@ -54,6 +54,9 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
     b3_ptr = plane->bands[3].buf;
 
     for (y = 0; y < plane->height; y += 2) {
+
+        if (y+2 >= plane->height)
+            pitch= 0;
         /* load storage variables with values */
         if (num_bands > 0) {
             b0_1 = b0_ptr[0];
@@ -83,6 +86,13 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
         }
 
         for (x = 0, indx = 0; x < plane->width; x+=2, indx++) {
+            if (x+2 >= plane->width) {
+                b0_ptr --;
+                b1_ptr --;
+                b2_ptr --;
+                b3_ptr --;
+            }
+
             /* some values calculated in the previous iterations can */
             /* be reused in the next ones, so do appropriate copying */
             b2_1 = b2_2; // b2[x-1,y  ] = b2[x,  y  ]
@@ -170,10 +180,10 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
 
         back_pitch = -pitch;
 
-        b0_ptr += pitch;
-        b1_ptr += pitch;
-        b2_ptr += pitch;
-        b3_ptr += pitch;
+        b0_ptr += pitch + 1;
+        b1_ptr += pitch + 1;
+        b2_ptr += pitch + 1;
+        b3_ptr += pitch + 1;
     }
 }
 
@@ -181,7 +191,7 @@ void ff_ivi_recompose_haar(const IVIPlaneDesc *plane, uint8_t *dst,
                            const int dst_pitch)
 {
     int             x, y, indx, b0, b1, b2, b3, p0, p1, p2, p3;
-    const short    *b0_ptr, *b1_ptr, *b2_ptr, *b3_ptr;
+    const short     *b0_ptr, *b1_ptr, *b2_ptr, *b3_ptr;
     int32_t         pitch;
 
     /* all bands should have the same pitch */
@@ -225,15 +235,15 @@ void ff_ivi_recompose_haar(const IVIPlaneDesc *plane, uint8_t *dst,
 
 /** butterfly operation for the inverse Haar transform */
 #define IVI_HAAR_BFLY(s1, s2, o1, o2, t) \
-    t  = (s1 - s2) >> 1;\
-    o1 = (s1 + s2) >> 1;\
-    o2 = t;\
+    t  = ((s1) - (s2)) >> 1;\
+    o1 = ((s1) + (s2)) >> 1;\
+    o2 = (t);\
 
 /** inverse 8-point Haar transform */
 #define INV_HAAR8(s1, s5, s3, s7, s2, s4, s6, s8,\
                   d1, d2, d3, d4, d5, d6, d7, d8,\
                   t0, t1, t2, t3, t4, t5, t6, t7, t8) {\
-    t1 = s1 << 1; t5 = s5 << 1;\
+    t1 = (s1) << 1; t5 = (s5) << 1;\
     IVI_HAAR_BFLY(t1, t5, t1, t5, t0); IVI_HAAR_BFLY(t1, s3, t1, t3, t0);\
     IVI_HAAR_BFLY(t5, s7, t5, t7, t0); IVI_HAAR_BFLY(t1, s2, t1, t2, t0);\
     IVI_HAAR_BFLY(t3, s4, t3, t4, t0); IVI_HAAR_BFLY(t5, s6, t5, t6, t0);\
@@ -475,21 +485,21 @@ void ff_ivi_dc_haar_2d(const int32_t *in, int16_t *out, uint32_t pitch,
 
 /** butterfly operation for the inverse slant transform */
 #define IVI_SLANT_BFLY(s1, s2, o1, o2, t) \
-    t  = s1 - s2;\
-    o1 = s1 + s2;\
-    o2 = t;\
+    t  = (s1) - (s2);\
+    o1 = (s1) + (s2);\
+    o2 = (t);\
 
 /** This is a reflection a,b = 1/2, 5/4 for the inverse slant transform */
 #define IVI_IREFLECT(s1, s2, o1, o2, t) \
-    t  = ((s1 + s2*2 + 2) >> 2) + s1;\
-    o2 = ((s1*2 - s2 + 2) >> 2) - s2;\
-    o1 = t;\
+    t  = (((s1) + (s2)*2 + 2) >> 2) + (s1);\
+    o2 = (((s1)*2 - (s2) + 2) >> 2) - (s2);\
+    o1 = (t);\
 
 /** This is a reflection a,b = 1/2, 7/8 for the inverse slant transform */
 #define IVI_SLANT_PART4(s1, s2, o1, o2, t) \
-    t  = s2 + ((s1*4  - s2 + 4) >> 3);\
-    o2 = s1 + ((-s1 - s2*4 + 4) >> 3);\
-    o1 = t;\
+    t  = (s2) + (((s1)*4  - (s2) + 4) >> 3);\
+    o2 = (s1) + ((-(s1) - (s2)*4 + 4) >> 3);\
+    o1 = (t);\
 
 /** inverse slant8 transform */
 #define IVI_INV_SLANT8(s1, s4, s8, s5, s2, s6, s3, s7,\
@@ -547,7 +557,7 @@ void ff_ivi_inverse_slant_8x8(const int32_t *in, int16_t *out, uint32_t pitch, c
     }
 #undef COMPENSATE
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     src = tmp;
     for (i = 0; i < 8; i++) {
         if (!src[0] && !src[1] && !src[2] && !src[3] && !src[4] && !src[5] && !src[6] && !src[7]) {
@@ -587,7 +597,7 @@ void ff_ivi_inverse_slant_4x4(const int32_t *in, int16_t *out, uint32_t pitch, c
     }
 #undef COMPENSATE
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     src = tmp;
     for (i = 0; i < 4; i++) {
         if (!src[0] && !src[1] && !src[2] && !src[3]) {
@@ -621,7 +631,7 @@ void ff_ivi_row_slant8(const int32_t *in, int16_t *out, uint32_t pitch, const ui
     int     i;
     int     t0, t1, t2, t3, t4, t5, t6, t7, t8;
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     for (i = 0; i < 8; i++) {
         if (!in[0] && !in[1] && !in[2] && !in[3] && !in[4] && !in[5] && !in[6] && !in[7]) {
             memset(out, 0, 8*sizeof(out[0]));
@@ -663,7 +673,7 @@ void ff_ivi_col_slant8(const int32_t *in, int16_t *out, uint32_t pitch, const ui
     row4 = pitch << 2;
     row8 = pitch << 3;
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     for (i = 0; i < 8; i++) {
         if (flags[i]) {
             IVI_INV_SLANT8(in[0], in[8], in[16], in[24], in[32], in[40], in[48], in[56],
@@ -700,7 +710,7 @@ void ff_ivi_row_slant4(const int32_t *in, int16_t *out, uint32_t pitch, const ui
     int     i;
     int     t0, t1, t2, t3, t4;
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     for (i = 0; i < 4; i++) {
         if (!in[0] && !in[1] && !in[2] && !in[3]) {
             memset(out, 0, 4*sizeof(out[0]));
@@ -722,7 +732,7 @@ void ff_ivi_col_slant4(const int32_t *in, int16_t *out, uint32_t pitch, const ui
 
     row2 = pitch << 1;
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     for (i = 0; i < 4; i++) {
         if (flags[i]) {
             IVI_INV_SLANT4(in[0], in[4], in[8], in[12],
diff --git a/libavcodec/ivi_dsp.h b/libavcodec/ivi_dsp.h
index 45c8a18086..f64c718e72 100644
--- a/libavcodec/ivi_dsp.h
+++ b/libavcodec/ivi_dsp.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,6 +65,10 @@ void ff_ivi_recompose_haar(const IVIPlaneDesc *plane, uint8_t *dst,
  */
 void ff_ivi_inverse_haar_8x8(const int32_t *in, int16_t *out, uint32_t pitch,
                              const uint8_t *flags);
+void ff_ivi_inverse_haar_8x1(const int32_t *in, int16_t *out, uint32_t pitch,
+                             const uint8_t *flags);
+void ff_ivi_inverse_haar_1x8(const int32_t *in, int16_t *out, uint32_t pitch,
+                             const uint8_t *flags);
 
 /**
  *  one-dimensional inverse 8-point Haar transform on rows for Indeo 4
diff --git a/libavcodec/j2kenc.c b/libavcodec/j2kenc.c
new file mode 100644
index 0000000000..152da8bdfd
--- /dev/null
+++ b/libavcodec/j2kenc.c
@@ -0,0 +1,1169 @@
+/*
+ * JPEG2000 image encoder
+ * Copyright (c) 2007 Kamil Nowosad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * JPEG2000 image encoder
+ * @file
+ * @author Kamil Nowosad
+ */
+
+#include <float.h>
+#include "avcodec.h"
+#include "internal.h"
+#include "bytestream.h"
+#include "jpeg2000.h"
+#include "libavutil/common.h"
+#include "libavutil/opt.h"
+
+#define NMSEDEC_BITS 7
+#define NMSEDEC_FRACBITS (NMSEDEC_BITS-1)
+#define WMSEDEC_SHIFT 13 ///< must be >= 13
+#define LAMBDA_SCALE (100000000LL << (WMSEDEC_SHIFT - 13))
+
+#define CODEC_JP2 1
+#define CODEC_J2K 0
+
+static int lut_nmsedec_ref [1<<NMSEDEC_BITS],
+           lut_nmsedec_ref0[1<<NMSEDEC_BITS],
+           lut_nmsedec_sig [1<<NMSEDEC_BITS],
+           lut_nmsedec_sig0[1<<NMSEDEC_BITS];
+
+static const int dwt_norms[2][4][10] = { // [dwt_type][band][rlevel] (multiplied by 10000)
+    {{10000, 19650, 41770,  84030, 169000, 338400,  676900, 1353000, 2706000, 5409000},
+     {20220, 39890, 83550, 170400, 342700, 686300, 1373000, 2746000, 5490000},
+     {20220, 39890, 83550, 170400, 342700, 686300, 1373000, 2746000, 5490000},
+     {20800, 38650, 83070, 171800, 347100, 695900, 1393000, 2786000, 5572000}},
+
+    {{10000, 15000, 27500, 53750, 106800, 213400, 426700, 853300, 1707000, 3413000},
+     {10380, 15920, 29190, 57030, 113300, 226400, 452500, 904800, 1809000},
+     {10380, 15920, 29190, 57030, 113300, 226400, 452500, 904800, 1809000},
+     { 7186,  9218, 15860, 30430,  60190, 120100, 240000, 479700,  959300}}
+};
+
+typedef struct {
+   Jpeg2000Component *comp;
+} Jpeg2000Tile;
+
+typedef struct {
+    AVClass *class;
+    AVCodecContext *avctx;
+    const AVFrame *picture;
+
+    int width, height; ///< image width and height
+    uint8_t cbps[4]; ///< bits per sample in particular components
+    int chroma_shift[2];
+    uint8_t planar;
+    int ncomponents;
+    int tile_width, tile_height; ///< tile size
+    int numXtiles, numYtiles;
+
+    uint8_t *buf_start;
+    uint8_t *buf;
+    uint8_t *buf_end;
+    int bit_index;
+
+    int64_t lambda;
+
+    Jpeg2000CodingStyle codsty;
+    Jpeg2000QuantStyle  qntsty;
+
+    Jpeg2000Tile *tile;
+
+    int format;
+} Jpeg2000EncoderContext;
+
+
+/* debug */
+#if 0
+#undef ifprintf
+#undef printf
+
+static void nspaces(FILE *fd, int n)
+{
+    while(n--) putc(' ', fd);
+}
+
+static void printcomp(Jpeg2000Component *comp)
+{
+    int i;
+    for (i = 0; i < comp->y1 - comp->y0; i++)
+        ff_jpeg2000_printv(comp->i_data + i * (comp->x1 - comp->x0), comp->x1 - comp->x0);
+}
+
+static void dump(Jpeg2000EncoderContext *s, FILE *fd)
+{
+    int tileno, compno, reslevelno, bandno, precno;
+    fprintf(fd, "XSiz = %d, YSiz = %d, tile_width = %d, tile_height = %d\n"
+                "numXtiles = %d, numYtiles = %d, ncomponents = %d\n"
+                "tiles:\n",
+            s->width, s->height, s->tile_width, s->tile_height,
+            s->numXtiles, s->numYtiles, s->ncomponents);
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        Jpeg2000Tile *tile = s->tile + tileno;
+        nspaces(fd, 2);
+        fprintf(fd, "tile %d:\n", tileno);
+        for(compno = 0; compno < s->ncomponents; compno++){
+            Jpeg2000Component *comp = tile->comp + compno;
+            nspaces(fd, 4);
+            fprintf(fd, "component %d:\n", compno);
+            nspaces(fd, 4);
+            fprintf(fd, "x0 = %d, x1 = %d, y0 = %d, y1 = %d\n",
+                        comp->x0, comp->x1, comp->y0, comp->y1);
+            for(reslevelno = 0; reslevelno < s->nreslevels; reslevelno++){
+                Jpeg2000ResLevel *reslevel = comp->reslevel + reslevelno;
+                nspaces(fd, 6);
+                fprintf(fd, "reslevel %d:\n", reslevelno);
+                nspaces(fd, 6);
+                fprintf(fd, "x0 = %d, x1 = %d, y0 = %d, y1 = %d, nbands = %d\n",
+                        reslevel->x0, reslevel->x1, reslevel->y0,
+                        reslevel->y1, reslevel->nbands);
+                for(bandno = 0; bandno < reslevel->nbands; bandno++){
+                    Jpeg2000Band *band = reslevel->band + bandno;
+                    nspaces(fd, 8);
+                    fprintf(fd, "band %d:\n", bandno);
+                    nspaces(fd, 8);
+                    fprintf(fd, "x0 = %d, x1 = %d, y0 = %d, y1 = %d,"
+                                "codeblock_width = %d, codeblock_height = %d cblknx = %d cblkny = %d\n",
+                                band->x0, band->x1,
+                                band->y0, band->y1,
+                                band->codeblock_width, band->codeblock_height,
+                                band->cblknx, band->cblkny);
+                    for (precno = 0; precno < reslevel->num_precincts_x * reslevel->num_precincts_y; precno++){
+                        Jpeg2000Prec *prec = band->prec + precno;
+                        nspaces(fd, 10);
+                        fprintf(fd, "prec %d:\n", precno);
+                        nspaces(fd, 10);
+                        fprintf(fd, "xi0 = %d, xi1 = %d, yi0 = %d, yi1 = %d\n",
+                                     prec->xi0, prec->xi1, prec->yi0, prec->yi1);
+                    }
+                }
+            }
+        }
+    }
+}
+#endif
+
+/* bitstream routines */
+
+/** put n times val bit */
+static void put_bits(Jpeg2000EncoderContext *s, int val, int n) // TODO: optimize
+{
+    while (n-- > 0){
+        if (s->bit_index == 8)
+        {
+            s->bit_index = *s->buf == 0xff;
+            *(++s->buf) = 0;
+        }
+        *s->buf |= val << (7 - s->bit_index++);
+    }
+}
+
+/** put n least significant bits of a number num */
+static void put_num(Jpeg2000EncoderContext *s, int num, int n)
+{
+    while(--n >= 0)
+        put_bits(s, (num >> n) & 1, 1);
+}
+
+/** flush the bitstream */
+static void j2k_flush(Jpeg2000EncoderContext *s)
+{
+    if (s->bit_index){
+        s->bit_index = 0;
+        s->buf++;
+    }
+}
+
+/* tag tree routines */
+
+/** code the value stored in node */
+static void tag_tree_code(Jpeg2000EncoderContext *s, Jpeg2000TgtNode *node, int threshold)
+{
+    Jpeg2000TgtNode *stack[30];
+    int sp = 1, curval = 0;
+    stack[0] = node;
+
+    node = node->parent;
+    while(node){
+        if (node->vis){
+            curval = node->val;
+            break;
+        }
+        node->vis++;
+        stack[sp++] = node;
+        node = node->parent;
+    }
+    while(--sp >= 0){
+        if (stack[sp]->val >= threshold){
+            put_bits(s, 0, threshold - curval);
+            break;
+        }
+        put_bits(s, 0, stack[sp]->val - curval);
+        put_bits(s, 1, 1);
+        curval = stack[sp]->val;
+    }
+}
+
+/** update the value in node */
+static void tag_tree_update(Jpeg2000TgtNode *node)
+{
+    int lev = 0;
+    while (node->parent){
+        if (node->parent->val <= node->val)
+            break;
+        node->parent->val = node->val;
+        node = node->parent;
+        lev++;
+    }
+}
+
+static int put_siz(Jpeg2000EncoderContext *s)
+{
+    int i;
+
+    if (s->buf_end - s->buf < 40 + 3 * s->ncomponents)
+        return -1;
+
+    bytestream_put_be16(&s->buf, JPEG2000_SIZ);
+    bytestream_put_be16(&s->buf, 38 + 3 * s->ncomponents); // Lsiz
+    bytestream_put_be16(&s->buf, 0); // Rsiz
+    bytestream_put_be32(&s->buf, s->width); // width
+    bytestream_put_be32(&s->buf, s->height); // height
+    bytestream_put_be32(&s->buf, 0); // X0Siz
+    bytestream_put_be32(&s->buf, 0); // Y0Siz
+
+    bytestream_put_be32(&s->buf, s->tile_width); // XTSiz
+    bytestream_put_be32(&s->buf, s->tile_height); // YTSiz
+    bytestream_put_be32(&s->buf, 0); // XT0Siz
+    bytestream_put_be32(&s->buf, 0); // YT0Siz
+    bytestream_put_be16(&s->buf, s->ncomponents); // CSiz
+
+    for (i = 0; i < s->ncomponents; i++){ // Ssiz_i XRsiz_i, YRsiz_i
+        bytestream_put_byte(&s->buf, 7);
+        bytestream_put_byte(&s->buf, i?1<<s->chroma_shift[0]:1);
+        bytestream_put_byte(&s->buf, i?1<<s->chroma_shift[1]:1);
+    }
+    return 0;
+}
+
+static int put_cod(Jpeg2000EncoderContext *s)
+{
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+
+    if (s->buf_end - s->buf < 14)
+        return -1;
+
+    bytestream_put_be16(&s->buf, JPEG2000_COD);
+    bytestream_put_be16(&s->buf, 12); // Lcod
+    bytestream_put_byte(&s->buf, 0);  // Scod
+    // SGcod
+    bytestream_put_byte(&s->buf, 0); // progression level
+    bytestream_put_be16(&s->buf, 1); // num of layers
+    if(s->avctx->pix_fmt == AV_PIX_FMT_YUV444P){
+        bytestream_put_byte(&s->buf, 0); // unspecified
+    }else{
+        bytestream_put_byte(&s->buf, 0); // unspecified
+    }
+    // SPcod
+    bytestream_put_byte(&s->buf, codsty->nreslevels - 1); // num of decomp. levels
+    bytestream_put_byte(&s->buf, codsty->log2_cblk_width-2); // cblk width
+    bytestream_put_byte(&s->buf, codsty->log2_cblk_height-2); // cblk height
+    bytestream_put_byte(&s->buf, 0); // cblk style
+    bytestream_put_byte(&s->buf, codsty->transform == FF_DWT53); // transformation
+    return 0;
+}
+
+static int put_qcd(Jpeg2000EncoderContext *s, int compno)
+{
+    int i, size;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+
+    if (qntsty->quantsty == JPEG2000_QSTY_NONE)
+        size = 4 + 3 * (codsty->nreslevels-1);
+    else // QSTY_SE
+        size = 5 + 6 * (codsty->nreslevels-1);
+
+    if (s->buf_end - s->buf < size + 2)
+        return -1;
+
+    bytestream_put_be16(&s->buf, JPEG2000_QCD);
+    bytestream_put_be16(&s->buf, size);  // LQcd
+    bytestream_put_byte(&s->buf, (qntsty->nguardbits << 5) | qntsty->quantsty);  // Sqcd
+    if (qntsty->quantsty == JPEG2000_QSTY_NONE)
+        for (i = 0; i < codsty->nreslevels * 3 - 2; i++)
+            bytestream_put_byte(&s->buf, qntsty->expn[i] << 3);
+    else // QSTY_SE
+        for (i = 0; i < codsty->nreslevels * 3 - 2; i++)
+            bytestream_put_be16(&s->buf, (qntsty->expn[i] << 11) | qntsty->mant[i]);
+    return 0;
+}
+
+static int put_com(Jpeg2000EncoderContext *s, int compno)
+{
+    int size = 4 + strlen(LIBAVCODEC_IDENT);
+
+    if (s->avctx->flags & AV_CODEC_FLAG_BITEXACT)
+        return 0;
+
+    if (s->buf_end - s->buf < size + 2)
+        return -1;
+
+    bytestream_put_be16(&s->buf, JPEG2000_COM);
+    bytestream_put_be16(&s->buf, size);
+    bytestream_put_be16(&s->buf, 1); // General use (ISO/IEC 8859-15 (Latin) values)
+
+    bytestream_put_buffer(&s->buf, LIBAVCODEC_IDENT, strlen(LIBAVCODEC_IDENT));
+
+    return 0;
+}
+
+static uint8_t *put_sot(Jpeg2000EncoderContext *s, int tileno)
+{
+    uint8_t *psotptr;
+
+    if (s->buf_end - s->buf < 12)
+        return NULL;
+
+    bytestream_put_be16(&s->buf, JPEG2000_SOT);
+    bytestream_put_be16(&s->buf, 10); // Lsot
+    bytestream_put_be16(&s->buf, tileno); // Isot
+
+    psotptr = s->buf;
+    bytestream_put_be32(&s->buf, 0); // Psot (filled in later)
+
+    bytestream_put_byte(&s->buf, 0); // TPsot
+    bytestream_put_byte(&s->buf, 1); // TNsot
+    return psotptr;
+}
+
+/**
+ * compute the sizes of tiles, resolution levels, bands, etc.
+ * allocate memory for them
+ * divide the input image into tile-components
+ */
+static int init_tiles(Jpeg2000EncoderContext *s)
+{
+    int tileno, tilex, tiley, compno;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+
+    s->numXtiles = ff_jpeg2000_ceildiv(s->width, s->tile_width);
+    s->numYtiles = ff_jpeg2000_ceildiv(s->height, s->tile_height);
+
+    s->tile = av_malloc_array(s->numXtiles, s->numYtiles * sizeof(Jpeg2000Tile));
+    if (!s->tile)
+        return AVERROR(ENOMEM);
+    for (tileno = 0, tiley = 0; tiley < s->numYtiles; tiley++)
+        for (tilex = 0; tilex < s->numXtiles; tilex++, tileno++){
+            Jpeg2000Tile *tile = s->tile + tileno;
+
+            tile->comp = av_mallocz_array(s->ncomponents, sizeof(Jpeg2000Component));
+            if (!tile->comp)
+                return AVERROR(ENOMEM);
+            for (compno = 0; compno < s->ncomponents; compno++){
+                Jpeg2000Component *comp = tile->comp + compno;
+                int ret, i, j;
+
+                comp->coord[0][0] = comp->coord_o[0][0] = tilex * s->tile_width;
+                comp->coord[0][1] = comp->coord_o[0][1] = FFMIN((tilex+1)*s->tile_width, s->width);
+                comp->coord[1][0] = comp->coord_o[1][0] = tiley * s->tile_height;
+                comp->coord[1][1] = comp->coord_o[1][1] = FFMIN((tiley+1)*s->tile_height, s->height);
+                if (compno > 0)
+                    for (i = 0; i < 2; i++)
+                        for (j = 0; j < 2; j++)
+                            comp->coord[i][j] = comp->coord_o[i][j] = ff_jpeg2000_ceildivpow2(comp->coord[i][j], s->chroma_shift[i]);
+
+                if ((ret = ff_jpeg2000_init_component(comp,
+                                                codsty,
+                                                qntsty,
+                                                s->cbps[compno],
+                                                compno?1<<s->chroma_shift[0]:1,
+                                                compno?1<<s->chroma_shift[1]:1,
+                                                s->avctx
+                                               )) < 0)
+                    return ret;
+            }
+        }
+    return 0;
+}
+
+static void copy_frame(Jpeg2000EncoderContext *s)
+{
+    int tileno, compno, i, y, x;
+    uint8_t *line;
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        Jpeg2000Tile *tile = s->tile + tileno;
+        if (s->planar){
+            for (compno = 0; compno < s->ncomponents; compno++){
+                Jpeg2000Component *comp = tile->comp + compno;
+                int *dst = comp->i_data;
+                line = s->picture->data[compno]
+                       + comp->coord[1][0] * s->picture->linesize[compno]
+                       + comp->coord[0][0];
+                for (y = comp->coord[1][0]; y < comp->coord[1][1]; y++){
+                    uint8_t *ptr = line;
+                    for (x = comp->coord[0][0]; x < comp->coord[0][1]; x++)
+                        *dst++ = *ptr++ - (1 << 7);
+                    line += s->picture->linesize[compno];
+                }
+            }
+        } else{
+            line = s->picture->data[0] + tile->comp[0].coord[1][0] * s->picture->linesize[0]
+                   + tile->comp[0].coord[0][0] * s->ncomponents;
+
+            i = 0;
+            for (y = tile->comp[0].coord[1][0]; y < tile->comp[0].coord[1][1]; y++){
+                uint8_t *ptr = line;
+                for (x = tile->comp[0].coord[0][0]; x < tile->comp[0].coord[0][1]; x++, i++){
+                    for (compno = 0; compno < s->ncomponents; compno++){
+                        tile->comp[compno].i_data[i] = *ptr++  - (1 << 7);
+                    }
+                }
+                line += s->picture->linesize[0];
+            }
+        }
+    }
+}
+
+static void init_quantization(Jpeg2000EncoderContext *s)
+{
+    int compno, reslevelno, bandno;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+
+    for (compno = 0; compno < s->ncomponents; compno++){
+        int gbandno = 0;
+        for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++){
+            int nbands, lev = codsty->nreslevels - reslevelno - 1;
+            nbands = reslevelno ? 3 : 1;
+            for (bandno = 0; bandno < nbands; bandno++, gbandno++){
+                int expn, mant = 0;
+
+                if (codsty->transform == FF_DWT97_INT){
+                    int bandpos = bandno + (reslevelno>0),
+                        ss = 81920000 / dwt_norms[0][bandpos][lev],
+                        log = av_log2(ss);
+                    mant = (11 - log < 0 ? ss >> log - 11 : ss << 11 - log) & 0x7ff;
+                    expn = s->cbps[compno] - log + 13;
+                } else
+                    expn = ((bandno&2)>>1) + (reslevelno>0) + s->cbps[compno];
+
+                qntsty->expn[gbandno] = expn;
+                qntsty->mant[gbandno] = mant;
+            }
+        }
+    }
+}
+
+static void init_luts(void)
+{
+    int i, a,
+        mask = ~((1<<NMSEDEC_FRACBITS)-1);
+
+    for (i = 0; i < (1 << NMSEDEC_BITS); i++){
+        lut_nmsedec_sig[i]  = FFMAX(6*i - (9<<NMSEDEC_FRACBITS-1) << 12-NMSEDEC_FRACBITS, 0);
+        lut_nmsedec_sig0[i] = FFMAX((i*i + (1<<NMSEDEC_FRACBITS-1) & mask) << 1, 0);
+
+        a = (i >> (NMSEDEC_BITS-2)&2) + 1;
+        lut_nmsedec_ref[i]  = FFMAX((-2*i + (1<<NMSEDEC_FRACBITS) + a*i - (a*a<<NMSEDEC_FRACBITS-2))
+                                    << 13-NMSEDEC_FRACBITS, 0);
+        lut_nmsedec_ref0[i] = FFMAX(((i*i + (1-4*i << NMSEDEC_FRACBITS-1) + (1<<2*NMSEDEC_FRACBITS)) & mask)
+                                    << 1, 0);
+    }
+}
+
+/* tier-1 routines */
+static int getnmsedec_sig(int x, int bpno)
+{
+    if (bpno > NMSEDEC_FRACBITS)
+        return lut_nmsedec_sig[(x >> (bpno - NMSEDEC_FRACBITS)) & ((1 << NMSEDEC_BITS) - 1)];
+    return lut_nmsedec_sig0[x & ((1 << NMSEDEC_BITS) - 1)];
+}
+
+static int getnmsedec_ref(int x, int bpno)
+{
+    if (bpno > NMSEDEC_FRACBITS)
+        return lut_nmsedec_ref[(x >> (bpno - NMSEDEC_FRACBITS)) & ((1 << NMSEDEC_BITS) - 1)];
+    return lut_nmsedec_ref0[x & ((1 << NMSEDEC_BITS) - 1)];
+}
+
+static void encode_sigpass(Jpeg2000T1Context *t1, int width, int height, int bandno, int *nmsedec, int bpno)
+{
+    int y0, x, y, mask = 1 << (bpno + NMSEDEC_FRACBITS);
+    for (y0 = 0; y0 < height; y0 += 4)
+        for (x = 0; x < width; x++)
+            for (y = y0; y < height && y < y0+4; y++){
+                if (!(t1->flags[(y+1) * t1->stride + x+1] & JPEG2000_T1_SIG) && (t1->flags[(y+1) * t1->stride + x+1] & JPEG2000_T1_SIG_NB)){
+                    int ctxno = ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1], bandno),
+                        bit = t1->data[(y) * t1->stride + x] & mask ? 1 : 0;
+                    ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, bit);
+                    if (bit){
+                        int xorbit;
+                        int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y+1) * t1->stride + x+1], &xorbit);
+                        ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, (t1->flags[(y+1) * t1->stride + x+1] >> 15) ^ xorbit);
+                        *nmsedec += getnmsedec_sig(t1->data[(y) * t1->stride + x], bpno + NMSEDEC_FRACBITS);
+                        ff_jpeg2000_set_significance(t1, x, y, t1->flags[(y+1) * t1->stride + x+1] >> 15);
+                    }
+                    t1->flags[(y+1) * t1->stride + x+1] |= JPEG2000_T1_VIS;
+                }
+            }
+}
+
+static void encode_refpass(Jpeg2000T1Context *t1, int width, int height, int *nmsedec, int bpno)
+{
+    int y0, x, y, mask = 1 << (bpno + NMSEDEC_FRACBITS);
+    for (y0 = 0; y0 < height; y0 += 4)
+        for (x = 0; x < width; x++)
+            for (y = y0; y < height && y < y0+4; y++)
+                if ((t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS)) == JPEG2000_T1_SIG){
+                    int ctxno = ff_jpeg2000_getrefctxno(t1->flags[(y+1) * t1->stride + x+1]);
+                    *nmsedec += getnmsedec_ref(t1->data[(y) * t1->stride + x], bpno + NMSEDEC_FRACBITS);
+                    ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, t1->data[(y) * t1->stride + x] & mask ? 1:0);
+                    t1->flags[(y+1) * t1->stride + x+1] |= JPEG2000_T1_REF;
+                }
+}
+
+static void encode_clnpass(Jpeg2000T1Context *t1, int width, int height, int bandno, int *nmsedec, int bpno)
+{
+    int y0, x, y, mask = 1 << (bpno + NMSEDEC_FRACBITS);
+    for (y0 = 0; y0 < height; y0 += 4)
+        for (x = 0; x < width; x++){
+            if (y0 + 3 < height && !(
+            (t1->flags[(y0+1) * t1->stride + x+1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+            (t1->flags[(y0+2) * t1->stride + x+1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+            (t1->flags[(y0+3) * t1->stride + x+1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+            (t1->flags[(y0+4) * t1->stride + x+1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG))))
+            {
+                // aggregation mode
+                int rlen;
+                for (rlen = 0; rlen < 4; rlen++)
+                    if (t1->data[(y0+rlen) * t1->stride + x] & mask)
+                        break;
+                ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + MQC_CX_RL, rlen != 4);
+                if (rlen == 4)
+                    continue;
+                ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + MQC_CX_UNI, rlen >> 1);
+                ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + MQC_CX_UNI, rlen & 1);
+                for (y = y0 + rlen; y < y0 + 4; y++){
+                    if (!(t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))){
+                        int ctxno = ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1], bandno);
+                        if (y > y0 + rlen)
+                            ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, t1->data[(y) * t1->stride + x] & mask ? 1:0);
+                        if (t1->data[(y) * t1->stride + x] & mask){ // newly significant
+                            int xorbit;
+                            int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y+1) * t1->stride + x+1], &xorbit);
+                            *nmsedec += getnmsedec_sig(t1->data[(y) * t1->stride + x], bpno + NMSEDEC_FRACBITS);
+                            ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, (t1->flags[(y+1) * t1->stride + x+1] >> 15) ^ xorbit);
+                            ff_jpeg2000_set_significance(t1, x, y, t1->flags[(y+1) * t1->stride + x+1] >> 15);
+                        }
+                    }
+                    t1->flags[(y+1) * t1->stride + x+1] &= ~JPEG2000_T1_VIS;
+                }
+            } else{
+                for (y = y0; y < y0 + 4 && y < height; y++){
+                    if (!(t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))){
+                        int ctxno = ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1], bandno);
+                        ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, t1->data[(y) * t1->stride + x] & mask ? 1:0);
+                        if (t1->data[(y) * t1->stride + x] & mask){ // newly significant
+                            int xorbit;
+                            int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y+1) * t1->stride + x+1], &xorbit);
+                            *nmsedec += getnmsedec_sig(t1->data[(y) * t1->stride + x], bpno + NMSEDEC_FRACBITS);
+                            ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, (t1->flags[(y+1) * t1->stride + x+1] >> 15) ^ xorbit);
+                            ff_jpeg2000_set_significance(t1, x, y, t1->flags[(y+1) * t1->stride + x+1] >> 15);
+                        }
+                    }
+                    t1->flags[(y+1) * t1->stride + x+1] &= ~JPEG2000_T1_VIS;
+                }
+            }
+        }
+}
+
+static void encode_cblk(Jpeg2000EncoderContext *s, Jpeg2000T1Context *t1, Jpeg2000Cblk *cblk, Jpeg2000Tile *tile,
+                        int width, int height, int bandpos, int lev)
+{
+    int pass_t = 2, passno, x, y, max=0, nmsedec, bpno;
+    int64_t wmsedec = 0;
+
+    memset(t1->flags, 0, t1->stride * (height + 2) * sizeof(*t1->flags));
+
+    for (y = 0; y < height; y++){
+        for (x = 0; x < width; x++){
+            if (t1->data[(y) * t1->stride + x] < 0){
+                t1->flags[(y+1) * t1->stride + x+1] |= JPEG2000_T1_SGN;
+                t1->data[(y) * t1->stride + x] = -t1->data[(y) * t1->stride + x];
+            }
+            max = FFMAX(max, t1->data[(y) * t1->stride + x]);
+        }
+    }
+
+    if (max == 0){
+        cblk->nonzerobits = 0;
+        bpno = 0;
+    } else{
+        cblk->nonzerobits = av_log2(max) + 1 - NMSEDEC_FRACBITS;
+        bpno = cblk->nonzerobits - 1;
+    }
+
+    ff_mqc_initenc(&t1->mqc, cblk->data);
+
+    for (passno = 0; bpno >= 0; passno++){
+        nmsedec=0;
+
+        switch(pass_t){
+            case 0: encode_sigpass(t1, width, height, bandpos, &nmsedec, bpno);
+                    break;
+            case 1: encode_refpass(t1, width, height, &nmsedec, bpno);
+                    break;
+            case 2: encode_clnpass(t1, width, height, bandpos, &nmsedec, bpno);
+                    break;
+        }
+
+        cblk->passes[passno].rate = ff_mqc_flush_to(&t1->mqc, cblk->passes[passno].flushed, &cblk->passes[passno].flushed_len);
+        wmsedec += (int64_t)nmsedec << (2*bpno);
+        cblk->passes[passno].disto = wmsedec;
+
+        if (++pass_t == 3){
+            pass_t = 0;
+            bpno--;
+        }
+    }
+    cblk->npasses = passno;
+    cblk->ninclpasses = passno;
+
+    cblk->passes[passno-1].rate = ff_mqc_flush_to(&t1->mqc, cblk->passes[passno-1].flushed, &cblk->passes[passno-1].flushed_len);
+}
+
+/* tier-2 routines: */
+
+static void putnumpasses(Jpeg2000EncoderContext *s, int n)
+{
+    if (n == 1)
+        put_num(s, 0, 1);
+    else if (n == 2)
+        put_num(s, 2, 2);
+    else if (n <= 5)
+        put_num(s, 0xc | (n-3), 4);
+    else if (n <= 36)
+        put_num(s, 0x1e0 | (n-6), 9);
+    else
+        put_num(s, 0xff80 | (n-37), 16);
+}
+
+
+static int encode_packet(Jpeg2000EncoderContext *s, Jpeg2000ResLevel *rlevel, int precno,
+                          uint8_t *expn, int numgbits)
+{
+    int bandno, empty = 1;
+
+    // init bitstream
+    *s->buf = 0;
+    s->bit_index = 0;
+
+    // header
+
+    // is the packet empty?
+    for (bandno = 0; bandno < rlevel->nbands; bandno++){
+        if (rlevel->band[bandno].coord[0][0] < rlevel->band[bandno].coord[0][1]
+        &&  rlevel->band[bandno].coord[1][0] < rlevel->band[bandno].coord[1][1]){
+            empty = 0;
+            break;
+        }
+    }
+
+    put_bits(s, !empty, 1);
+    if (empty){
+        j2k_flush(s);
+        return 0;
+    }
+
+    for (bandno = 0; bandno < rlevel->nbands; bandno++){
+        Jpeg2000Band *band = rlevel->band + bandno;
+        Jpeg2000Prec *prec = band->prec + precno;
+        int yi, xi, pos;
+        int cblknw = prec->nb_codeblocks_width;
+
+        if (band->coord[0][0] == band->coord[0][1]
+        ||  band->coord[1][0] == band->coord[1][1])
+            continue;
+
+        for (pos=0, yi = 0; yi < prec->nb_codeblocks_height; yi++){
+            for (xi = 0; xi < cblknw; xi++, pos++){
+                prec->cblkincl[pos].val = prec->cblk[yi * cblknw + xi].ninclpasses == 0;
+                tag_tree_update(prec->cblkincl + pos);
+                prec->zerobits[pos].val = expn[bandno] + numgbits - 1 - prec->cblk[yi * cblknw + xi].nonzerobits;
+                tag_tree_update(prec->zerobits + pos);
+            }
+        }
+
+        for (pos=0, yi = 0; yi < prec->nb_codeblocks_height; yi++){
+            for (xi = 0; xi < cblknw; xi++, pos++){
+                int pad = 0, llen, length;
+                Jpeg2000Cblk *cblk = prec->cblk + yi * cblknw + xi;
+
+                if (s->buf_end - s->buf < 20) // approximately
+                    return -1;
+
+                // inclusion information
+                tag_tree_code(s, prec->cblkincl + pos, 1);
+                if (!cblk->ninclpasses)
+                    continue;
+                // zerobits information
+                tag_tree_code(s, prec->zerobits + pos, 100);
+                // number of passes
+                putnumpasses(s, cblk->ninclpasses);
+
+                length = cblk->passes[cblk->ninclpasses-1].rate;
+                llen = av_log2(length) - av_log2(cblk->ninclpasses) - 2;
+                if (llen < 0){
+                    pad = -llen;
+                    llen = 0;
+                }
+                // length of code block
+                put_bits(s, 1, llen);
+                put_bits(s, 0, 1);
+                put_num(s, length, av_log2(length)+1+pad);
+            }
+        }
+    }
+    j2k_flush(s);
+    for (bandno = 0; bandno < rlevel->nbands; bandno++){
+        Jpeg2000Band *band = rlevel->band + bandno;
+        Jpeg2000Prec *prec = band->prec + precno;
+        int yi, cblknw = prec->nb_codeblocks_width;
+        for (yi =0; yi < prec->nb_codeblocks_height; yi++){
+            int xi;
+            for (xi = 0; xi < cblknw; xi++){
+                Jpeg2000Cblk *cblk = prec->cblk + yi * cblknw + xi;
+                if (cblk->ninclpasses){
+                    if (s->buf_end - s->buf < cblk->passes[cblk->ninclpasses-1].rate)
+                        return -1;
+                    bytestream_put_buffer(&s->buf, cblk->data,   cblk->passes[cblk->ninclpasses-1].rate
+                                                               - cblk->passes[cblk->ninclpasses-1].flushed_len);
+                    bytestream_put_buffer(&s->buf, cblk->passes[cblk->ninclpasses-1].flushed,
+                                                   cblk->passes[cblk->ninclpasses-1].flushed_len);
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+static int encode_packets(Jpeg2000EncoderContext *s, Jpeg2000Tile *tile, int tileno)
+{
+    int compno, reslevelno, ret;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+
+    av_log(s->avctx, AV_LOG_DEBUG, "tier2\n");
+    // lay-rlevel-comp-pos progression
+    for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++){
+        for (compno = 0; compno < s->ncomponents; compno++){
+            int precno;
+            Jpeg2000ResLevel *reslevel = s->tile[tileno].comp[compno].reslevel + reslevelno;
+            for (precno = 0; precno < reslevel->num_precincts_x * reslevel->num_precincts_y; precno++){
+                if ((ret = encode_packet(s, reslevel, precno, qntsty->expn + (reslevelno ? 3*reslevelno-2 : 0),
+                              qntsty->nguardbits)) < 0)
+                    return ret;
+            }
+        }
+    }
+    av_log(s->avctx, AV_LOG_DEBUG, "after tier2\n");
+    return 0;
+}
+
+static int getcut(Jpeg2000Cblk *cblk, int64_t lambda, int dwt_norm)
+{
+    int passno, res = 0;
+    for (passno = 0; passno < cblk->npasses; passno++){
+        int dr;
+        int64_t dd;
+
+        dr = cblk->passes[passno].rate
+           - (res ? cblk->passes[res-1].rate:0);
+        dd = cblk->passes[passno].disto
+           - (res ? cblk->passes[res-1].disto:0);
+
+        if (((dd * dwt_norm) >> WMSEDEC_SHIFT) * dwt_norm >= dr * lambda)
+            res = passno+1;
+    }
+    return res;
+}
+
+static void truncpasses(Jpeg2000EncoderContext *s, Jpeg2000Tile *tile)
+{
+    int precno, compno, reslevelno, bandno, cblkno, lev;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+
+    for (compno = 0; compno < s->ncomponents; compno++){
+        Jpeg2000Component *comp = tile->comp + compno;
+
+        for (reslevelno = 0, lev = codsty->nreslevels-1; reslevelno < codsty->nreslevels; reslevelno++, lev--){
+            Jpeg2000ResLevel *reslevel = comp->reslevel + reslevelno;
+
+            for (precno = 0; precno < reslevel->num_precincts_x * reslevel->num_precincts_y; precno++){
+                for (bandno = 0; bandno < reslevel->nbands ; bandno++){
+                    int bandpos = bandno + (reslevelno > 0);
+                    Jpeg2000Band *band = reslevel->band + bandno;
+                    Jpeg2000Prec *prec = band->prec + precno;
+
+                    for (cblkno = 0; cblkno < prec->nb_codeblocks_height * prec->nb_codeblocks_width; cblkno++){
+                        Jpeg2000Cblk *cblk = prec->cblk + cblkno;
+
+                        cblk->ninclpasses = getcut(cblk, s->lambda,
+                                (int64_t)dwt_norms[codsty->transform == FF_DWT53][bandpos][lev] * (int64_t)band->i_stepsize >> 15);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static int encode_tile(Jpeg2000EncoderContext *s, Jpeg2000Tile *tile, int tileno)
+{
+    int compno, reslevelno, bandno, ret;
+    Jpeg2000T1Context t1;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    for (compno = 0; compno < s->ncomponents; compno++){
+        Jpeg2000Component *comp = s->tile[tileno].comp + compno;
+
+        t1.stride = (1<<codsty->log2_cblk_width) + 2;
+
+        av_log(s->avctx, AV_LOG_DEBUG,"dwt\n");
+        if ((ret = ff_dwt_encode(&comp->dwt, comp->i_data)) < 0)
+            return ret;
+        av_log(s->avctx, AV_LOG_DEBUG,"after dwt -> tier1\n");
+
+        for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++){
+            Jpeg2000ResLevel *reslevel = comp->reslevel + reslevelno;
+
+            for (bandno = 0; bandno < reslevel->nbands ; bandno++){
+                Jpeg2000Band *band = reslevel->band + bandno;
+                Jpeg2000Prec *prec = band->prec; // we support only 1 precinct per band ATM in the encoder
+                int cblkx, cblky, cblkno=0, xx0, x0, xx1, y0, yy0, yy1, bandpos;
+                yy0 = bandno == 0 ? 0 : comp->reslevel[reslevelno-1].coord[1][1] - comp->reslevel[reslevelno-1].coord[1][0];
+                y0 = yy0;
+                yy1 = FFMIN(ff_jpeg2000_ceildivpow2(band->coord[1][0] + 1, band->log2_cblk_height) << band->log2_cblk_height,
+                            band->coord[1][1]) - band->coord[1][0] + yy0;
+
+                if (band->coord[0][0] == band->coord[0][1] || band->coord[1][0] == band->coord[1][1])
+                    continue;
+
+                bandpos = bandno + (reslevelno > 0);
+
+                for (cblky = 0; cblky < prec->nb_codeblocks_height; cblky++){
+                    if (reslevelno == 0 || bandno == 1)
+                        xx0 = 0;
+                    else
+                        xx0 = comp->reslevel[reslevelno-1].coord[0][1] - comp->reslevel[reslevelno-1].coord[0][0];
+                    x0 = xx0;
+                    xx1 = FFMIN(ff_jpeg2000_ceildivpow2(band->coord[0][0] + 1, band->log2_cblk_width) << band->log2_cblk_width,
+                                band->coord[0][1]) - band->coord[0][0] + xx0;
+
+                    for (cblkx = 0; cblkx < prec->nb_codeblocks_width; cblkx++, cblkno++){
+                        int y, x;
+                        if (codsty->transform == FF_DWT53){
+                            for (y = yy0; y < yy1; y++){
+                                int *ptr = t1.data + (y-yy0)*t1.stride;
+                                for (x = xx0; x < xx1; x++){
+                                    *ptr++ = comp->i_data[(comp->coord[0][1] - comp->coord[0][0]) * y + x] << NMSEDEC_FRACBITS;
+                                }
+                            }
+                        } else{
+                            for (y = yy0; y < yy1; y++){
+                                int *ptr = t1.data + (y-yy0)*t1.stride;
+                                for (x = xx0; x < xx1; x++){
+                                    *ptr = (comp->i_data[(comp->coord[0][1] - comp->coord[0][0]) * y + x]);
+                                    *ptr = (int64_t)*ptr * (int64_t)(16384 * 65536 / band->i_stepsize) >> 15 - NMSEDEC_FRACBITS;
+                                    ptr++;
+                                }
+                            }
+                        }
+                        encode_cblk(s, &t1, prec->cblk + cblkno, tile, xx1 - xx0, yy1 - yy0,
+                                    bandpos, codsty->nreslevels - reslevelno - 1);
+                        xx0 = xx1;
+                        xx1 = FFMIN(xx1 + (1 << band->log2_cblk_width), band->coord[0][1] - band->coord[0][0] + x0);
+                    }
+                    yy0 = yy1;
+                    yy1 = FFMIN(yy1 + (1 << band->log2_cblk_height), band->coord[1][1] - band->coord[1][0] + y0);
+                }
+            }
+        }
+        av_log(s->avctx, AV_LOG_DEBUG, "after tier1\n");
+    }
+
+    av_log(s->avctx, AV_LOG_DEBUG, "rate control\n");
+    truncpasses(s, tile);
+    if ((ret = encode_packets(s, tile, tileno)) < 0)
+        return ret;
+    av_log(s->avctx, AV_LOG_DEBUG, "after rate control\n");
+    return 0;
+}
+
+static void cleanup(Jpeg2000EncoderContext *s)
+{
+    int tileno, compno;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        for (compno = 0; compno < s->ncomponents; compno++){
+            Jpeg2000Component *comp = s->tile[tileno].comp + compno;
+            ff_jpeg2000_cleanup(comp, codsty);
+        }
+        av_freep(&s->tile[tileno].comp);
+    }
+    av_freep(&s->tile);
+}
+
+static void reinit(Jpeg2000EncoderContext *s)
+{
+    int tileno, compno;
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        Jpeg2000Tile *tile = s->tile + tileno;
+        for (compno = 0; compno < s->ncomponents; compno++)
+            ff_jpeg2000_reinit(tile->comp + compno, &s->codsty);
+    }
+}
+
+static void update_size(uint8_t *size, const uint8_t *end)
+{
+    AV_WB32(size, end-size);
+}
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
+{
+    int tileno, ret;
+    Jpeg2000EncoderContext *s = avctx->priv_data;
+    uint8_t *chunkstart, *jp2cstart, *jp2hstart;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width*avctx->height*9 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+
+    // init:
+    s->buf = s->buf_start = pkt->data;
+    s->buf_end = pkt->data + pkt->size;
+
+    s->picture = pict;
+
+    s->lambda = s->picture->quality * LAMBDA_SCALE;
+
+    copy_frame(s);
+    reinit(s);
+
+    if (s->format == CODEC_JP2) {
+        av_assert0(s->buf == pkt->data);
+
+        bytestream_put_be32(&s->buf, 0x0000000C);
+        bytestream_put_be32(&s->buf, 0x6A502020);
+        bytestream_put_be32(&s->buf, 0x0D0A870A);
+
+        chunkstart = s->buf;
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "ftyp", 4);
+        bytestream_put_buffer(&s->buf, "jp2\040\040", 4);
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "jp2\040", 4);
+        update_size(chunkstart, s->buf);
+
+        jp2hstart = s->buf;
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "jp2h", 4);
+
+        chunkstart = s->buf;
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "ihdr", 4);
+        bytestream_put_be32(&s->buf, avctx->height);
+        bytestream_put_be32(&s->buf, avctx->width);
+        bytestream_put_be16(&s->buf, s->ncomponents);
+        bytestream_put_byte(&s->buf, s->cbps[0]);
+        bytestream_put_byte(&s->buf, 7);
+        bytestream_put_byte(&s->buf, 0);
+        bytestream_put_byte(&s->buf, 0);
+        update_size(chunkstart, s->buf);
+
+        chunkstart = s->buf;
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "colr", 4);
+        bytestream_put_byte(&s->buf, 1);
+        bytestream_put_byte(&s->buf, 0);
+        bytestream_put_byte(&s->buf, 0);
+        if (s->ncomponents == 1) {
+            bytestream_put_be32(&s->buf, 17);
+        } else if (avctx->pix_fmt == AV_PIX_FMT_RGB24) {
+            bytestream_put_be32(&s->buf, 16);
+        } else {
+            bytestream_put_be32(&s->buf, 18);
+        }
+        update_size(chunkstart, s->buf);
+        update_size(jp2hstart, s->buf);
+
+        jp2cstart = s->buf;
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "jp2c", 4);
+    }
+
+    if (s->buf_end - s->buf < 2)
+        return -1;
+    bytestream_put_be16(&s->buf, JPEG2000_SOC);
+    if ((ret = put_siz(s)) < 0)
+        return ret;
+    if ((ret = put_cod(s)) < 0)
+        return ret;
+    if ((ret = put_qcd(s, 0)) < 0)
+        return ret;
+    if ((ret = put_com(s, 0)) < 0)
+        return ret;
+
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        uint8_t *psotptr;
+        if (!(psotptr = put_sot(s, tileno)))
+            return -1;
+        if (s->buf_end - s->buf < 2)
+            return -1;
+        bytestream_put_be16(&s->buf, JPEG2000_SOD);
+        if ((ret = encode_tile(s, s->tile + tileno, tileno)) < 0)
+            return ret;
+        bytestream_put_be32(&psotptr, s->buf - psotptr + 6);
+    }
+    if (s->buf_end - s->buf < 2)
+        return -1;
+    bytestream_put_be16(&s->buf, JPEG2000_EOC);
+
+    if (s->format == CODEC_JP2)
+        update_size(jp2cstart, s->buf);
+
+    av_log(s->avctx, AV_LOG_DEBUG, "end\n");
+    pkt->size = s->buf - s->buf_start;
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
+}
+
+static av_cold int j2kenc_init(AVCodecContext *avctx)
+{
+    int i, ret;
+    Jpeg2000EncoderContext *s = avctx->priv_data;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+
+    s->avctx = avctx;
+    av_log(s->avctx, AV_LOG_DEBUG, "init\n");
+
+    // defaults:
+    // TODO: implement setting non-standard precinct size
+    memset(codsty->log2_prec_widths , 15, sizeof(codsty->log2_prec_widths ));
+    memset(codsty->log2_prec_heights, 15, sizeof(codsty->log2_prec_heights));
+    codsty->nreslevels2decode=
+    codsty->nreslevels       = 7;
+    codsty->log2_cblk_width  = 4;
+    codsty->log2_cblk_height = 4;
+    codsty->transform        = avctx->prediction_method ? FF_DWT53 : FF_DWT97_INT;
+
+    qntsty->nguardbits       = 1;
+
+    if ((s->tile_width  & (s->tile_width -1)) ||
+        (s->tile_height & (s->tile_height-1))) {
+        av_log(avctx, AV_LOG_WARNING, "Tile dimension not a power of 2\n");
+    }
+
+    if (codsty->transform == FF_DWT53)
+        qntsty->quantsty = JPEG2000_QSTY_NONE;
+    else
+        qntsty->quantsty = JPEG2000_QSTY_SE;
+
+    s->width = avctx->width;
+    s->height = avctx->height;
+
+    for (i = 0; i < 3; i++)
+        s->cbps[i] = 8;
+
+    if (avctx->pix_fmt == AV_PIX_FMT_RGB24){
+        s->ncomponents = 3;
+    } else if (avctx->pix_fmt == AV_PIX_FMT_GRAY8){
+        s->ncomponents = 1;
+    } else{ // planar YUV
+        s->planar = 1;
+        s->ncomponents = 3;
+        avcodec_get_chroma_sub_sample(avctx->pix_fmt,
+                s->chroma_shift, s->chroma_shift + 1);
+    }
+
+    ff_jpeg2000_init_tier1_luts();
+    ff_mqc_init_context_tables();
+    init_luts();
+
+    init_quantization(s);
+    if ((ret=init_tiles(s)) < 0)
+        return ret;
+
+    av_log(s->avctx, AV_LOG_DEBUG, "after init\n");
+
+    return 0;
+}
+
+static int j2kenc_destroy(AVCodecContext *avctx)
+{
+    Jpeg2000EncoderContext *s = avctx->priv_data;
+
+    cleanup(s);
+    return 0;
+}
+
+// taken from the libopenjpeg wraper so it matches
+
+#define OFFSET(x) offsetof(Jpeg2000EncoderContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "format",        "Codec Format",      OFFSET(format),        AV_OPT_TYPE_INT,   { .i64 = CODEC_JP2   }, CODEC_J2K, CODEC_JP2,   VE, "format"      },
+    { "j2k",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CODEC_J2K   }, 0,         0,           VE, "format"      },
+    { "jp2",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CODEC_JP2   }, 0,         0,           VE, "format"      },
+    { "tile_width",    "Tile Width",        OFFSET(tile_width),    AV_OPT_TYPE_INT,   { .i64 = 256         }, 1,     1<<30,           VE, },
+    { "tile_height",   "Tile Height",       OFFSET(tile_height),   AV_OPT_TYPE_INT,   { .i64 = 256         }, 1,     1<<30,           VE, },
+
+    { NULL }
+};
+
+static const AVClass j2k_class = {
+    .class_name = "jpeg 2000 encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_jpeg2000_encoder = {
+    .name           = "jpeg2000",
+    .long_name      = NULL_IF_CONFIG_SMALL("JPEG 2000"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_JPEG2000,
+    .priv_data_size = sizeof(Jpeg2000EncoderContext),
+    .init           = j2kenc_init,
+    .encode2        = encode_frame,
+    .close          = j2kenc_destroy,
+    .pix_fmts       = (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_YUV444P, AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
+        AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
+        AV_PIX_FMT_NONE
+    },
+    .priv_class     = &j2k_class,
+};
diff --git a/libavcodec/jacosub.h b/libavcodec/jacosub.h
new file mode 100644
index 0000000000..c3665ae3fc
--- /dev/null
+++ b/libavcodec/jacosub.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * JACOsub shared utils
+ */
+
+#ifndef AVCODEC_JACOSUB_H
+#define AVCODEC_JACOSUB_H
+
+#include "libavutil/common.h"
+
+#define JSS_MAX_LINESIZE 512
+
+static av_always_inline int jss_whitespace(char c)
+{
+    return c == ' ' || (c >= '\t' && c <= '\r');
+}
+
+static av_always_inline const char *jss_skip_whitespace(const char *p)
+{
+    while (jss_whitespace(*p))
+        p++;
+    return p;
+}
+
+#endif /* AVCODEC_JACOSUB_H */
diff --git a/libavcodec/jacosubdec.c b/libavcodec/jacosubdec.c
new file mode 100644
index 0000000000..0c97eb86c0
--- /dev/null
+++ b/libavcodec/jacosubdec.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * JACOsub subtitle decoder
+ * @see http://unicorn.us.com/jacosub/jscripts.html
+ */
+
+#include <time.h>
+#include "ass.h"
+#include "jacosub.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+#include "libavutil/time_internal.h"
+
+#undef time
+
+static int insert_text(AVBPrint *dst, const char *in, const char *arg)
+{
+    av_bprintf(dst, "%s", arg);
+    return 0;
+}
+
+static int insert_datetime(AVBPrint *dst, const char *in, const char *arg)
+{
+    char buf[16] = {0};
+    time_t now = time(0);
+    struct tm ltime;
+
+    localtime_r(&now, &ltime);
+    if (strftime(buf, sizeof(buf), arg, &ltime))
+        av_bprintf(dst, "%s", buf);
+    return 0;
+}
+
+static int insert_color(AVBPrint *dst, const char *in, const char *arg)
+{
+    return 1; // skip id
+}
+
+static int insert_font(AVBPrint *dst, const char *in, const char *arg)
+{
+    return 1; // skip id
+}
+
+static const struct {
+    const char *from;
+    const char *arg;
+    int (*func)(AVBPrint *dst, const char *in, const char *arg);
+} ass_codes_map[] = {
+    {"\\~", "~",        insert_text},       // tilde doesn't need escaping
+    {"~",   "{\\h}",    insert_text},       // hard space
+    {"\\n", "\\N",      insert_text},       // newline
+    {"\\D", "%d %b %Y", insert_datetime},   // current date
+    {"\\T", "%H:%M",    insert_datetime},   // current time
+    {"\\N", "{\\r}",    insert_text},       // reset to default style
+    {"\\I", "{\\i1}",   insert_text},       // italic on
+    {"\\i", "{\\i0}",   insert_text},       // italic off
+    {"\\B", "{\\b1}",   insert_text},       // bold on
+    {"\\b", "{\\b0}",   insert_text},       // bold off
+    {"\\U", "{\\u1}",   insert_text},       // underline on
+    {"\\u", "{\\u0}",   insert_text},       // underline off
+    {"\\C", "",         insert_color},      // TODO: color
+    {"\\F", "",         insert_font},       // TODO: font
+};
+
+enum {
+    ALIGN_VB = 1<<0, // vertical bottom, default
+    ALIGN_VM = 1<<1, // vertical middle
+    ALIGN_VT = 1<<2, // vertical top
+    ALIGN_JC = 1<<3, // justify center, default
+    ALIGN_JL = 1<<4, // justify left
+    ALIGN_JR = 1<<5, // justify right
+};
+
+static void jacosub_to_ass(AVCodecContext *avctx, AVBPrint *dst, const char *src)
+{
+    int i, valign = 0, halign = 0;
+    char c = av_toupper(*src);
+    char directives[128] = {0};
+
+    /* extract the optional directives */
+    if ((c >= 'A' && c <= 'Z') || c == '[') {
+        char *p    = directives;
+        char *pend = directives + sizeof(directives) - 1;
+
+        do *p++ = av_toupper(*src++);
+        while (*src && !jss_whitespace(*src) && p < pend);
+        *p = 0;
+        src = jss_skip_whitespace(src);
+    }
+
+    /* handle directives (TODO: handle more of them, and more reliably) */
+    if      (strstr(directives, "VB")) valign = ALIGN_VB;
+    else if (strstr(directives, "VM")) valign = ALIGN_VM;
+    else if (strstr(directives, "VT")) valign = ALIGN_VT;
+    if      (strstr(directives, "JC")) halign = ALIGN_JC;
+    else if (strstr(directives, "JL")) halign = ALIGN_JL;
+    else if (strstr(directives, "JR")) halign = ALIGN_JR;
+    if (valign || halign) {
+        if (!valign) valign = ALIGN_VB;
+        if (!halign) halign = ALIGN_JC;
+        switch (valign | halign) {
+        case ALIGN_VB | ALIGN_JL: av_bprintf(dst, "{\\an1}"); break; // bottom left
+        case ALIGN_VB | ALIGN_JC: av_bprintf(dst, "{\\an2}"); break; // bottom center
+        case ALIGN_VB | ALIGN_JR: av_bprintf(dst, "{\\an3}"); break; // bottom right
+        case ALIGN_VM | ALIGN_JL: av_bprintf(dst, "{\\an4}"); break; // middle left
+        case ALIGN_VM | ALIGN_JC: av_bprintf(dst, "{\\an5}"); break; // middle center
+        case ALIGN_VM | ALIGN_JR: av_bprintf(dst, "{\\an6}"); break; // middle right
+        case ALIGN_VT | ALIGN_JL: av_bprintf(dst, "{\\an7}"); break; // top left
+        case ALIGN_VT | ALIGN_JC: av_bprintf(dst, "{\\an8}"); break; // top center
+        case ALIGN_VT | ALIGN_JR: av_bprintf(dst, "{\\an9}"); break; // top right
+        }
+    }
+
+    /* process timed line */
+    while (*src && *src != '\n') {
+
+        /* text continue on the next line */
+        if (src[0] == '\\' && src[1] == '\n') {
+            src += 2;
+            while (jss_whitespace(*src))
+                src++;
+            continue;
+        }
+
+        /* special character codes */
+        for (i = 0; i < FF_ARRAY_ELEMS(ass_codes_map); i++) {
+            const char *from = ass_codes_map[i].from;
+            const char *arg  = ass_codes_map[i].arg;
+            size_t codemap_len = strlen(from);
+
+            if (!strncmp(src, from, codemap_len)) {
+                src += codemap_len;
+                src += ass_codes_map[i].func(dst, src, arg);
+                break;
+            }
+        }
+
+        /* simple char copy */
+        if (i == FF_ARRAY_ELEMS(ass_codes_map))
+            av_bprintf(dst, "%c", *src++);
+    }
+}
+
+static int jacosub_decode_frame(AVCodecContext *avctx,
+                                void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+
+    if (avpkt->size <= 0)
+        goto end;
+
+    if (*ptr) {
+        AVBPrint buffer;
+
+        // skip timers
+        ptr = jss_skip_whitespace(ptr);
+        ptr = strchr(ptr, ' '); if (!ptr) goto end; ptr++;
+        ptr = strchr(ptr, ' '); if (!ptr) goto end; ptr++;
+
+        av_bprint_init(&buffer, JSS_MAX_LINESIZE, JSS_MAX_LINESIZE);
+        jacosub_to_ass(avctx, &buffer, ptr);
+        ret = ff_ass_add_rect_bprint(sub, &buffer, avpkt->pts, avpkt->duration);
+        av_bprint_finalize(&buffer, NULL);
+        if (ret < 0)
+            return ret;
+    }
+
+end:
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+AVCodec ff_jacosub_decoder = {
+    .name           = "jacosub",
+    .long_name      = NULL_IF_CONFIG_SMALL("JACOsub subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_JACOSUB,
+    .init           = ff_ass_subtitle_header_default,
+    .decode         = jacosub_decode_frame,
+};
diff --git a/libavcodec/jfdctint.c b/libavcodec/jfdctint.c
index ed6b7ffca2..6a39578f88 100644
--- a/libavcodec/jfdctint.c
+++ b/libavcodec/jfdctint.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/jfdctint_template.c b/libavcodec/jfdctint_template.c
index c6a1638107..3ea2f5dadc 100644
--- a/libavcodec/jfdctint_template.c
+++ b/libavcodec/jfdctint_template.c
@@ -216,8 +216,8 @@ static av_always_inline void FUNC(row_fdct)(int16_t *data)
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
 
-    dataptr[0] = (int16_t) ((tmp10 + tmp11) << PASS1_BITS);
-    dataptr[4] = (int16_t) ((tmp10 - tmp11) << PASS1_BITS);
+    dataptr[0] = (int16_t) ((tmp10 + tmp11) * (1 << PASS1_BITS));
+    dataptr[4] = (int16_t) ((tmp10 - tmp11) * (1 << PASS1_BITS));
 
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
     dataptr[2] = (int16_t) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
diff --git a/libavcodec/jpeg2000.c b/libavcodec/jpeg2000.c
index f5a844f26e..d9869fd277 100644
--- a/libavcodec/jpeg2000.c
+++ b/libavcodec/jpeg2000.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/mem.h"
 #include "avcodec.h"
@@ -41,8 +42,7 @@ static int32_t tag_tree_size(uint16_t w, uint16_t h)
     uint32_t res = 0;
     while (w > 1 || h > 1) {
         res += w * h;
-        if (res + 1 >= INT32_MAX)
-            return -1;
+        av_assert0(res + 1 < INT32_MAX);
         w = (w + 1) >> 1;
         h = (h + 1) >> 1;
     }
@@ -56,8 +56,6 @@ static Jpeg2000TgtNode *ff_jpeg2000_tag_tree_init(int w, int h)
     int32_t tt_size;
 
     tt_size = tag_tree_size(w, h);
-    if (tt_size == -1)
-        return NULL;
 
     t = res = av_mallocz_array(tt_size, sizeof(*t));
     if (!res)
@@ -82,6 +80,16 @@ static Jpeg2000TgtNode *ff_jpeg2000_tag_tree_init(int w, int h)
     return res;
 }
 
+static void tag_tree_zero(Jpeg2000TgtNode *t, int w, int h)
+{
+    int i, siz = tag_tree_size(w, h);
+
+    for (i = 0; i < siz; i++) {
+        t[i].val = 0;
+        t[i].vis = 0;
+    }
+}
+
 uint8_t ff_jpeg2000_sigctxno_lut[256][4];
 
 static int getsigctxno(int flag, int bandno)
@@ -96,45 +104,33 @@ static int getsigctxno(int flag, int bandno)
         ((flag & JPEG2000_T1_SIG_NW) ? 1 : 0) +
         ((flag & JPEG2000_T1_SIG_SE) ? 1 : 0) +
         ((flag & JPEG2000_T1_SIG_SW) ? 1 : 0);
+
     if (bandno < 3) {
         if (bandno == 1)
             FFSWAP(int, h, v);
-        if (h == 2)
-            return 8;
+        if (h == 2) return 8;
         if (h == 1) {
-            if (v >= 1)
-                return 7;
-            if (d >= 1)
-                return 6;
+            if (v >= 1) return 7;
+            if (d >= 1) return 6;
             return 5;
         }
-        if (v == 2)
-            return 4;
-        if (v == 1)
-            return 3;
-        if (d >= 2)
-            return 2;
-        if (d == 1)
-            return 1;
+        if (v == 2) return 4;
+        if (v == 1) return 3;
+        if (d >= 2) return 2;
+        if (d == 1) return 1;
     } else {
-        if (d >= 3)
-            return 8;
+        if (d >= 3) return 8;
         if (d == 2) {
-            if (h + v >= 1)
-                return 7;
+            if (h+v >= 1) return 7;
             return 6;
         }
         if (d == 1) {
-            if (h + v >= 2)
-                return 5;
-            if (h + v == 1)
-                return 4;
+            if (h+v >= 2) return 5;
+            if (h+v == 1) return 4;
             return 3;
         }
-        if (h + v >= 2)
-            return 2;
-        if (h + v == 1)
-            return 1;
+        if (h+v >= 2) return 2;
+        if (h+v == 1) return 1;
     }
     return 0;
 }
@@ -175,25 +171,25 @@ void ff_jpeg2000_set_significance(Jpeg2000T1Context *t1, int x, int y,
 {
     x++;
     y++;
-    t1->flags[y][x] |= JPEG2000_T1_SIG;
+    t1->flags[(y) * t1->stride + x] |= JPEG2000_T1_SIG;
     if (negative) {
-        t1->flags[y][x + 1] |= JPEG2000_T1_SIG_W | JPEG2000_T1_SGN_W;
-        t1->flags[y][x - 1] |= JPEG2000_T1_SIG_E | JPEG2000_T1_SGN_E;
-        t1->flags[y + 1][x] |= JPEG2000_T1_SIG_N | JPEG2000_T1_SGN_N;
-        t1->flags[y - 1][x] |= JPEG2000_T1_SIG_S | JPEG2000_T1_SGN_S;
+        t1->flags[(y) * t1->stride + x + 1] |= JPEG2000_T1_SIG_W | JPEG2000_T1_SGN_W;
+        t1->flags[(y) * t1->stride + x - 1] |= JPEG2000_T1_SIG_E | JPEG2000_T1_SGN_E;
+        t1->flags[(y + 1) * t1->stride + x] |= JPEG2000_T1_SIG_N | JPEG2000_T1_SGN_N;
+        t1->flags[(y - 1) * t1->stride + x] |= JPEG2000_T1_SIG_S | JPEG2000_T1_SGN_S;
     } else {
-        t1->flags[y][x + 1] |= JPEG2000_T1_SIG_W;
-        t1->flags[y][x - 1] |= JPEG2000_T1_SIG_E;
-        t1->flags[y + 1][x] |= JPEG2000_T1_SIG_N;
-        t1->flags[y - 1][x] |= JPEG2000_T1_SIG_S;
+        t1->flags[(y) * t1->stride + x + 1] |= JPEG2000_T1_SIG_W;
+        t1->flags[(y) * t1->stride + x - 1] |= JPEG2000_T1_SIG_E;
+        t1->flags[(y + 1) * t1->stride + x] |= JPEG2000_T1_SIG_N;
+        t1->flags[(y - 1) * t1->stride + x] |= JPEG2000_T1_SIG_S;
     }
-    t1->flags[y + 1][x + 1] |= JPEG2000_T1_SIG_NW;
-    t1->flags[y + 1][x - 1] |= JPEG2000_T1_SIG_NE;
-    t1->flags[y - 1][x + 1] |= JPEG2000_T1_SIG_SW;
-    t1->flags[y - 1][x - 1] |= JPEG2000_T1_SIG_SE;
+    t1->flags[(y + 1) * t1->stride + x + 1] |= JPEG2000_T1_SIG_NW;
+    t1->flags[(y + 1) * t1->stride + x - 1] |= JPEG2000_T1_SIG_NE;
+    t1->flags[(y - 1) * t1->stride + x + 1] |= JPEG2000_T1_SIG_SW;
+    t1->flags[(y - 1) * t1->stride + x - 1] |= JPEG2000_T1_SIG_SE;
 }
 
-static const uint8_t lut_gain[2][4] = { { 0, 0, 0, 0 }, { 0, 1, 1, 2 } };
+// static const uint8_t lut_gain[2][4] = { { 0, 0, 0, 0 }, { 0, 1, 1, 2 } }; (unused)
 
 static void init_band_stepsize(AVCodecContext *avctx,
                                Jpeg2000Band *band,
@@ -206,27 +202,23 @@ static void init_band_stepsize(AVCodecContext *avctx,
      * see ISO/IEC 15444-1:2002 E.1 and A.6.4. */
     switch (qntsty->quantsty) {
         uint8_t gain;
-        int numbps;
     case JPEG2000_QSTY_NONE:
         /* TODO: to verify. No quantization in this case */
         band->f_stepsize = 1;
         break;
     case JPEG2000_QSTY_SI:
         /*TODO: Compute formula to implement. */
-        numbps = cbps +
-                 lut_gain[codsty->transform == FF_DWT53][bandno + (reslevelno > 0)];
-        band->f_stepsize = SHL(2048 + qntsty->mant[gbandno],
-                               2 + numbps - qntsty->expn[gbandno]);
-        break;
+//         numbps = cbps +
+//                  lut_gain[codsty->transform == FF_DWT53][bandno + (reslevelno > 0)];
+//         band->f_stepsize = SHL(2048 + qntsty->mant[gbandno],
+//                                2 + numbps - qntsty->expn[gbandno]);
+//         break;
     case JPEG2000_QSTY_SE:
         /* Exponent quantization step.
          * Formula:
          * delta_b = 2 ^ (R_b - expn_b) * (1 + (mant_b / 2 ^ 11))
          * R_b = R_I + log2 (gain_b )
          * see ISO/IEC 15444-1:2002 E.1.1 eqn. E-3 and E-4 */
-        /* TODO/WARN: value of log2 (gain_b ) not taken into account
-         * but it works (compared to OpenJPEG). Why?
-         * Further investigation needed. */
         gain            = cbps;
         band->f_stepsize  = pow(2.0, gain - qntsty->expn[gbandno]);
         band->f_stepsize *= qntsty->mant[gbandno] / 2048.0 + 1.0;
@@ -236,12 +228,29 @@ static void init_band_stepsize(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Unknown quantization format\n");
         break;
     }
+    if (codsty->transform != FF_DWT53) {
+        int lband = 0;
+        switch (bandno + (reslevelno > 0)) {
+            case 1:
+            case 2:
+                band->f_stepsize *= F_LFTG_X * 2;
+                lband = 1;
+                break;
+            case 3:
+                band->f_stepsize *= F_LFTG_X * F_LFTG_X * 4;
+                break;
+        }
+        if (codsty->transform == FF_DWT97) {
+            band->f_stepsize *= pow(F_LFTG_K, 2*(codsty->nreslevels2decode - reslevelno) + lband - 2);
+        }
+    }
+
+    band->i_stepsize = band->f_stepsize * (1 << 15);
+
     /* FIXME: In openjepg code stespize = stepsize * 0.5. Why?
      * If not set output of entropic decoder is not correct. */
     if (!av_codec_is_encoder(avctx->codec))
         band->f_stepsize *= 0.5;
-
-    band->i_stepsize = band->f_stepsize * (1 << 16);
 }
 
 static int init_prec(Jpeg2000Band *band,
@@ -254,37 +263,40 @@ static int init_prec(Jpeg2000Band *band,
     Jpeg2000Prec *prec = band->prec + precno;
     int nb_codeblocks, cblkno;
 
+    prec->decoded_layers = 0;
+
     /* TODO: Explain formula for JPEG200 DCINEMA. */
     /* TODO: Verify with previous count of codeblocks per band */
 
     /* Compute P_x0 */
-    prec->coord[0][0] = (precno % reslevel->num_precincts_x) *
+    prec->coord[0][0] = ((band->coord[0][0] >> log2_band_prec_width) + precno % reslevel->num_precincts_x) *
                         (1 << log2_band_prec_width);
-    prec->coord[0][0] = FFMAX(prec->coord[0][0], band->coord[0][0]);
 
     /* Compute P_y0 */
-    prec->coord[1][0] = (precno / reslevel->num_precincts_x) *
+    prec->coord[1][0] = ((band->coord[1][0] >> log2_band_prec_height) + precno / reslevel->num_precincts_x) *
                         (1 << log2_band_prec_height);
-    prec->coord[1][0] = FFMAX(prec->coord[1][0], band->coord[1][0]);
 
     /* Compute P_x1 */
     prec->coord[0][1] = prec->coord[0][0] +
                         (1 << log2_band_prec_width);
+    prec->coord[0][0] = FFMAX(prec->coord[0][0], band->coord[0][0]);
     prec->coord[0][1] = FFMIN(prec->coord[0][1], band->coord[0][1]);
 
     /* Compute P_y1 */
     prec->coord[1][1] = prec->coord[1][0] +
                         (1 << log2_band_prec_height);
+    prec->coord[1][0] = FFMAX(prec->coord[1][0], band->coord[1][0]);
     prec->coord[1][1] = FFMIN(prec->coord[1][1], band->coord[1][1]);
 
     prec->nb_codeblocks_width =
-        ff_jpeg2000_ceildivpow2(prec->coord[0][1] -
-                                prec->coord[0][0],
-                                band->log2_cblk_width);
+        ff_jpeg2000_ceildivpow2(prec->coord[0][1],
+                                band->log2_cblk_width)
+        - (prec->coord[0][0] >> band->log2_cblk_width);
     prec->nb_codeblocks_height =
-        ff_jpeg2000_ceildivpow2(prec->coord[1][1] -
-                                prec->coord[1][0],
-                                band->log2_cblk_height);
+        ff_jpeg2000_ceildivpow2(prec->coord[1][1],
+                                band->log2_cblk_height)
+        - (prec->coord[1][0] >> band->log2_cblk_height);
+
 
     /* Tag trees initialization */
     prec->cblkincl =
@@ -299,6 +311,10 @@ static int init_prec(Jpeg2000Band *band,
     if (!prec->zerobits)
         return AVERROR(ENOMEM);
 
+    if (prec->nb_codeblocks_width * (uint64_t)prec->nb_codeblocks_height > INT_MAX) {
+        prec->cblk = NULL;
+        return AVERROR(ENOMEM);
+    }
     nb_codeblocks = prec->nb_codeblocks_width * prec->nb_codeblocks_height;
     prec->cblk = av_mallocz_array(nb_codeblocks, sizeof(*prec->cblk));
     if (!prec->cblk)
@@ -309,12 +325,12 @@ static int init_prec(Jpeg2000Band *band,
 
         /* Compute coordinates of codeblocks */
         /* Compute Cx0*/
-        Cx0 = (prec->coord[0][0] >> band->log2_cblk_width) << band->log2_cblk_width;
+        Cx0 = ((prec->coord[0][0]) >> band->log2_cblk_width) << band->log2_cblk_width;
         Cx0 = Cx0 + ((cblkno % prec->nb_codeblocks_width)  << band->log2_cblk_width);
         cblk->coord[0][0] = FFMAX(Cx0, prec->coord[0][0]);
 
         /* Compute Cy0*/
-        Cy0 = (prec->coord[1][0] >> band->log2_cblk_height) << band->log2_cblk_height;
+        Cy0 = ((prec->coord[1][0]) >> band->log2_cblk_height) << band->log2_cblk_height;
         Cy0 = Cy0 + ((cblkno / prec->nb_codeblocks_width)   << band->log2_cblk_height);
         cblk->coord[1][0] = FFMAX(Cy0, prec->coord[1][0]);
 
@@ -342,7 +358,7 @@ static int init_prec(Jpeg2000Band *band,
         cblk->zero      = 0;
         cblk->lblock    = 3;
         cblk->length    = 0;
-        cblk->lengthinc = 0;
+        memset(cblk->lengthinc, 0, sizeof(cblk->lengthinc));
         cblk->npasses   = 0;
     }
 
@@ -366,7 +382,6 @@ static int init_band(AVCodecContext *avctx,
 
     init_band_stepsize(avctx, band, codsty, qntsty, bandno, gbandno, reslevelno, cbps);
 
-
     /* computation of tbx_0, tbx_1, tby_0, tby_1
      * see ISO/IEC 15444-1:2002 B.5 eq. B-15 and tbl B.1
      * codeblock width and height is computed for
@@ -376,7 +391,7 @@ static int init_band(AVCodecContext *avctx,
         for (i = 0; i < 2; i++)
             for (j = 0; j < 2; j++)
                 band->coord[i][j] =
-                    ff_jpeg2000_ceildivpow2(comp->coord_o[i][j] - comp->coord_o[i][0],
+                    ff_jpeg2000_ceildivpow2(comp->coord_o[i][j],
                                             declvl - 1);
         log2_band_prec_width  = reslevel->log2_prec_width;
         log2_band_prec_height = reslevel->log2_prec_height;
@@ -392,8 +407,8 @@ static int init_band(AVCodecContext *avctx,
             for (j = 0; j < 2; j++)
                 /* Formula example for tbx_0 = ceildiv((tcx_0 - 2 ^ (declvl - 1) * x0_b) / declvl) */
                 band->coord[i][j] =
-                    ff_jpeg2000_ceildivpow2(comp->coord_o[i][j] - comp->coord_o[i][0] -
-                                            (((bandno + 1 >> i) & 1) << declvl - 1),
+                    ff_jpeg2000_ceildivpow2(comp->coord_o[i][j] -
+                                            (((bandno + 1 >> i) & 1LL) << declvl - 1),
                                             declvl);
         /* TODO: Manage case of 3 band offsets here or
          * in coding/decoding function? */
@@ -408,11 +423,10 @@ static int init_band(AVCodecContext *avctx,
         log2_band_prec_height = reslevel->log2_prec_height - 1;
     }
 
-    for (j = 0; j < 2; j++)
-        band->coord[0][j] = ff_jpeg2000_ceildiv(band->coord[0][j], dx);
-    for (j = 0; j < 2; j++)
-        band->coord[1][j] = ff_jpeg2000_ceildiv(band->coord[1][j], dy);
-
+    if (reslevel->num_precincts_x * (uint64_t)reslevel->num_precincts_y > INT_MAX) {
+        band->prec = NULL;
+        return AVERROR(ENOMEM);
+    }
     nb_precincts = reslevel->num_precincts_x * reslevel->num_precincts_y;
     band->prec = av_mallocz_array(nb_precincts, sizeof(*band->prec));
     if (!band->prec)
@@ -438,8 +452,8 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
     int reslevelno, bandno, gbandno = 0, ret, i, j;
     uint32_t csize;
 
-    if (!codsty->nreslevels2decode) {
-        av_log(avctx, AV_LOG_ERROR, "nreslevels2decode uninitialized\n");
+    if (codsty->nreslevels2decode <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "nreslevels2decode %d invalid or uninitialized\n", codsty->nreslevels2decode);
         return AVERROR_INVALIDDATA;
     }
 
@@ -452,13 +466,15 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
             (comp->coord[1][1] - comp->coord[1][0]);
 
     if (codsty->transform == FF_DWT97) {
+        csize += AV_INPUT_BUFFER_PADDING_SIZE / sizeof(*comp->f_data);
         comp->i_data = NULL;
-        comp->f_data = av_malloc_array(csize, sizeof(*comp->f_data));
+        comp->f_data = av_mallocz_array(csize, sizeof(*comp->f_data));
         if (!comp->f_data)
             return AVERROR(ENOMEM);
     } else {
+        csize += AV_INPUT_BUFFER_PADDING_SIZE / sizeof(*comp->i_data);
         comp->f_data = NULL;
-        comp->i_data = av_malloc_array(csize, sizeof(*comp->i_data));
+        comp->i_data = av_mallocz_array(csize, sizeof(*comp->i_data));
         if (!comp->i_data)
             return AVERROR(ENOMEM);
     }
@@ -487,7 +503,7 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
         else
             reslevel->nbands = 3;
 
-        /* Number of precincts wich span the tile for resolution level reslevelno
+        /* Number of precincts which span the tile for resolution level reslevelno
          * see B.6 in ISO/IEC 15444-1:2002 eq. B-16
          * num_precincts_x = |- trx_1 / 2 ^ log2_prec_width) -| - (trx_0 / 2 ^ log2_prec_width)
          * num_precincts_y = |- try_1 / 2 ^ log2_prec_width) -| - (try_0 / 2 ^ log2_prec_width)
@@ -526,6 +542,27 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
     return 0;
 }
 
+void ff_jpeg2000_reinit(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty)
+{
+    int reslevelno, bandno, cblkno, precno;
+    for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++) {
+        Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+        for (bandno = 0; bandno < rlevel->nbands; bandno++) {
+            Jpeg2000Band *band = rlevel->band + bandno;
+            for(precno = 0; precno < rlevel->num_precincts_x * rlevel->num_precincts_y; precno++) {
+                Jpeg2000Prec *prec = band->prec + precno;
+                tag_tree_zero(prec->zerobits, prec->nb_codeblocks_width, prec->nb_codeblocks_height);
+                tag_tree_zero(prec->cblkincl, prec->nb_codeblocks_width, prec->nb_codeblocks_height);
+                for (cblkno = 0; cblkno < prec->nb_codeblocks_width * prec->nb_codeblocks_height; cblkno++) {
+                    Jpeg2000Cblk *cblk = prec->cblk + cblkno;
+                    cblk->length = 0;
+                    cblk->lblock = 3;
+                }
+            }
+        }
+    }
+}
+
 void ff_jpeg2000_cleanup(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty)
 {
     int reslevelno, bandno, precno;
@@ -546,16 +583,12 @@ void ff_jpeg2000_cleanup(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty)
 
             band = reslevel->band + bandno;
             for (precno = 0; precno < reslevel->num_precincts_x * reslevel->num_precincts_y; precno++) {
-                Jpeg2000Prec *prec;
-
-                if (!band->prec)
-                    continue;
-
-                prec = band->prec + precno;
-                av_freep(&prec->zerobits);
-                av_freep(&prec->cblkincl);
-                av_freep(&prec->cblk);
-
+                if (band->prec) {
+                    Jpeg2000Prec *prec = band->prec + precno;
+                    av_freep(&prec->zerobits);
+                    av_freep(&prec->cblkincl);
+                    av_freep(&prec->cblk);
+                }
             }
 
             av_freep(&band->prec);
diff --git a/libavcodec/jpeg2000.h b/libavcodec/jpeg2000.h
index b96b7e2291..7a21c4e45f 100644
--- a/libavcodec/jpeg2000.h
+++ b/libavcodec/jpeg2000.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,19 +58,20 @@ enum Jpeg2000Markers {
     JPEG2000_EOC = 0xffd9, // end of codestream
 };
 
+#define JPEG2000_SOP_FIXED_BYTES 0xFF910004
+#define JPEG2000_SOP_BYTE_LENGTH 6
+
 enum Jpeg2000Quantsty { // quantization style
     JPEG2000_QSTY_NONE, // no quantization
     JPEG2000_QSTY_SI,   // scalar derived
     JPEG2000_QSTY_SE    // scalar expounded
 };
 
-#define JPEG2000_MAX_CBLKW 64
-#define JPEG2000_MAX_CBLKH 64
-
-
-#define JPEG2000_MAX_DECLEVELS 32
+#define JPEG2000_MAX_DECLEVELS 33
 #define JPEG2000_MAX_RESLEVELS (JPEG2000_MAX_DECLEVELS + 1)
 
+#define JPEG2000_MAX_PASSES 100
+
 // T1 flags
 // flags determining significance of neighbor coefficients
 #define JPEG2000_T1_SIG_N  0x0001
@@ -118,9 +119,10 @@ enum Jpeg2000Quantsty { // quantization style
 #define JPEG2000_PGOD_CPRL      0x04  // Component-position-resolution level-layer progression
 
 typedef struct Jpeg2000T1Context {
-    int data[JPEG2000_MAX_CBLKW][JPEG2000_MAX_CBLKH];
-    int flags[JPEG2000_MAX_CBLKW + 2][JPEG2000_MAX_CBLKH + 2];
+    int data[6144];
+    uint16_t flags[6156];
     MqcState mqc;
+    int stride;
 } Jpeg2000T1Context;
 
 typedef struct Jpeg2000TgtNode {
@@ -130,8 +132,8 @@ typedef struct Jpeg2000TgtNode {
 } Jpeg2000TgtNode;
 
 typedef struct Jpeg2000CodingStyle {
-    uint8_t nreslevels;       // number of resolution levels
-    uint8_t nreslevels2decode; // number of resolution levels to decode
+    int nreslevels;           // number of resolution levels
+    int nreslevels2decode;    // number of resolution levels to decode
     uint8_t log2_cblk_width,
             log2_cblk_height; // exponent of codeblock size
     uint8_t transform;        // DWT type
@@ -146,20 +148,32 @@ typedef struct Jpeg2000CodingStyle {
 
 typedef struct Jpeg2000QuantStyle {
     uint8_t expn[JPEG2000_MAX_DECLEVELS * 3];  // quantization exponent
-    uint32_t mant[JPEG2000_MAX_DECLEVELS * 3]; // quantization mantissa
+    uint16_t mant[JPEG2000_MAX_DECLEVELS * 3]; // quantization mantissa
     uint8_t quantsty;      // quantization style
     uint8_t nguardbits;    // number of guard bits
 } Jpeg2000QuantStyle;
 
+typedef struct Jpeg2000Pass {
+    uint16_t rate;
+    int64_t disto;
+    uint8_t flushed[4];
+    int flushed_len;
+} Jpeg2000Pass;
+
 typedef struct Jpeg2000Cblk {
     uint8_t npasses;
     uint8_t ninclpasses; // number coding of passes included in codestream
     uint8_t nonzerobits;
     uint16_t length;
-    uint16_t lengthinc;
+    uint16_t lengthinc[JPEG2000_MAX_PASSES];
+    uint8_t nb_lengthinc;
     uint8_t lblock;
     uint8_t zero;
     uint8_t data[8192];
+    int nb_terminations;
+    int nb_terminationsinc;
+    int data_start[JPEG2000_MAX_PASSES];
+    Jpeg2000Pass passes[JPEG2000_MAX_PASSES];
     uint16_t coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
 } Jpeg2000Cblk; // code block
 
@@ -169,6 +183,7 @@ typedef struct Jpeg2000Prec {
     Jpeg2000TgtNode *zerobits;
     Jpeg2000TgtNode *cblkincl;
     Jpeg2000Cblk *cblk;
+    int decoded_layers;
     uint16_t coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
 } Jpeg2000Prec; // precinct
 
@@ -200,7 +215,7 @@ typedef struct Jpeg2000Component {
 /* misc tools */
 static inline int ff_jpeg2000_ceildivpow2(int a, int b)
 {
-    return (a + (1 << b) - 1) >> b;
+    return -(((int64_t)(-a)) >> b);
 }
 
 static inline int ff_jpeg2000_ceildiv(int a, int b)
@@ -252,6 +267,25 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
                                int cbps, int dx, int dy,
                                AVCodecContext *ctx);
 
+void ff_jpeg2000_reinit(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty);
+
 void ff_jpeg2000_cleanup(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty);
 
+static inline int needs_termination(int style, int passno) {
+    if (style & JPEG2000_CBLK_BYPASS) {
+        int type = passno % 3;
+        passno /= 3;
+        if (type == 0 && passno > 2)
+            return 2;
+        if (type == 2 && passno > 2)
+            return 1;
+        if (style & JPEG2000_CBLK_TERMALL) {
+            return passno > 2 ? 2 : 1;
+        }
+    }
+    if (style & JPEG2000_CBLK_TERMALL)
+        return 1;
+    return 0;
+}
+
 #endif /* AVCODEC_JPEG2000_H */
diff --git a/libavcodec/jpeg2000dec.c b/libavcodec/jpeg2000dec.c
index 6f74eefc89..f2441a92ac 100644
--- a/libavcodec/jpeg2000dec.c
+++ b/libavcodec/jpeg2000dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,8 +28,10 @@
 #include <inttypes.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
@@ -40,10 +42,28 @@
 #define JP2_SIG_TYPE    0x6A502020
 #define JP2_SIG_VALUE   0x0D0A870A
 #define JP2_CODESTREAM  0x6A703263
+#define JP2_HEADER      0x6A703268
 
 #define HAD_COC 0x01
 #define HAD_QCC 0x02
 
+#define MAX_POCS 32
+
+typedef struct Jpeg2000POCEntry {
+    uint16_t LYEpoc;
+    uint16_t CSpoc;
+    uint16_t CEpoc;
+    uint8_t RSpoc;
+    uint8_t REpoc;
+    uint8_t Ppoc;
+} Jpeg2000POCEntry;
+
+typedef struct Jpeg2000POC {
+    Jpeg2000POCEntry poc[MAX_POCS];
+    int nb_poc;
+    int is_default;
+} Jpeg2000POC;
+
 typedef struct Jpeg2000TilePart {
     uint8_t tile_index;                 // Tile index who refers the tile-part
     const uint8_t *tp_end;
@@ -57,14 +77,16 @@ typedef struct Jpeg2000Tile {
     uint8_t             properties[4];
     Jpeg2000CodingStyle codsty[4];
     Jpeg2000QuantStyle  qntsty[4];
-    Jpeg2000TilePart    tile_part[3];
+    Jpeg2000POC         poc;
+    Jpeg2000TilePart    tile_part[256];
     uint16_t tp_idx;                    // Tile-part index
+    int coord[2][2];                    // border coordinates {{x0, x1}, {y0, y1}}
 } Jpeg2000Tile;
 
 typedef struct Jpeg2000DecoderContext {
     AVClass         *class;
     AVCodecContext  *avctx;
-    GetByteContext g;
+    GetByteContext  g;
 
     int             width, height;
     int             image_offset_x, image_offset_y;
@@ -75,16 +97,22 @@ typedef struct Jpeg2000DecoderContext {
     int             cdx[4], cdy[4];
     int             precision;
     int             ncomponents;
+    int             colour_space;
+    uint32_t        palette[256];
+    int8_t          pal8;
+    int             cdef[4];
     int             tile_width, tile_height;
     unsigned        numXtiles, numYtiles;
     int             maxtilelen;
 
     Jpeg2000CodingStyle codsty[4];
     Jpeg2000QuantStyle  qntsty[4];
+    Jpeg2000POC         poc;
 
     int             bit_index;
 
-    int16_t         curtileno;
+    int             curtileno;
+
     Jpeg2000Tile    *tile;
     Jpeg2000DSPContext dsp;
 
@@ -99,6 +127,7 @@ typedef struct Jpeg2000DecoderContext {
 static int get_bits(Jpeg2000DecoderContext *s, int n)
 {
     int res = 0;
+
     while (--n >= 0) {
         res <<= 1;
         if (s->bit_index == 0) {
@@ -124,8 +153,10 @@ static int tag_tree_decode(Jpeg2000DecoderContext *s, Jpeg2000TgtNode *node,
     Jpeg2000TgtNode *stack[30];
     int sp = -1, curval = 0;
 
-    if (!node)
+    if (!node) {
+        av_log(s->avctx, AV_LOG_ERROR, "missing node\n");
         return AVERROR_INVALIDDATA;
+    }
 
     while (node && !node->vis) {
         stack[++sp] = node;
@@ -156,15 +187,82 @@ static int tag_tree_decode(Jpeg2000DecoderContext *s, Jpeg2000TgtNode *node,
     return curval;
 }
 
+static int pix_fmt_match(enum AVPixelFormat pix_fmt, int components,
+                         int bpc, uint32_t log2_chroma_wh, int pal8)
+{
+    int match = 1;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+
+    av_assert2(desc);
+
+    if (desc->nb_components != components) {
+        return 0;
+    }
+
+    switch (components) {
+    case 4:
+        match = match && desc->comp[3].depth >= bpc &&
+                         (log2_chroma_wh >> 14 & 3) == 0 &&
+                         (log2_chroma_wh >> 12 & 3) == 0;
+    case 3:
+        match = match && desc->comp[2].depth >= bpc &&
+                         (log2_chroma_wh >> 10 & 3) == desc->log2_chroma_w &&
+                         (log2_chroma_wh >>  8 & 3) == desc->log2_chroma_h;
+    case 2:
+        match = match && desc->comp[1].depth >= bpc &&
+                         (log2_chroma_wh >>  6 & 3) == desc->log2_chroma_w &&
+                         (log2_chroma_wh >>  4 & 3) == desc->log2_chroma_h;
+
+    case 1:
+        match = match && desc->comp[0].depth >= bpc &&
+                         (log2_chroma_wh >>  2 & 3) == 0 &&
+                         (log2_chroma_wh       & 3) == 0 &&
+                         (desc->flags & AV_PIX_FMT_FLAG_PAL) == pal8 * AV_PIX_FMT_FLAG_PAL;
+    }
+    return match;
+}
+
+// pix_fmts with lower bpp have to be listed before
+// similar pix_fmts with higher bpp.
+#define RGB_PIXEL_FORMATS   AV_PIX_FMT_PAL8,AV_PIX_FMT_RGB24,AV_PIX_FMT_RGBA,AV_PIX_FMT_RGB48,AV_PIX_FMT_RGBA64
+#define GRAY_PIXEL_FORMATS  AV_PIX_FMT_GRAY8,AV_PIX_FMT_GRAY8A,AV_PIX_FMT_GRAY16,AV_PIX_FMT_YA16
+#define YUV_PIXEL_FORMATS   AV_PIX_FMT_YUV410P,AV_PIX_FMT_YUV411P,AV_PIX_FMT_YUVA420P, \
+                            AV_PIX_FMT_YUV420P,AV_PIX_FMT_YUV422P,AV_PIX_FMT_YUVA422P, \
+                            AV_PIX_FMT_YUV440P,AV_PIX_FMT_YUV444P,AV_PIX_FMT_YUVA444P, \
+                            AV_PIX_FMT_YUV420P9,AV_PIX_FMT_YUV422P9,AV_PIX_FMT_YUV444P9, \
+                            AV_PIX_FMT_YUVA420P9,AV_PIX_FMT_YUVA422P9,AV_PIX_FMT_YUVA444P9, \
+                            AV_PIX_FMT_YUV420P10,AV_PIX_FMT_YUV422P10,AV_PIX_FMT_YUV444P10, \
+                            AV_PIX_FMT_YUVA420P10,AV_PIX_FMT_YUVA422P10,AV_PIX_FMT_YUVA444P10, \
+                            AV_PIX_FMT_YUV420P12,AV_PIX_FMT_YUV422P12,AV_PIX_FMT_YUV444P12, \
+                            AV_PIX_FMT_YUV420P14,AV_PIX_FMT_YUV422P14,AV_PIX_FMT_YUV444P14, \
+                            AV_PIX_FMT_YUV420P16,AV_PIX_FMT_YUV422P16,AV_PIX_FMT_YUV444P16, \
+                            AV_PIX_FMT_YUVA420P16,AV_PIX_FMT_YUVA422P16,AV_PIX_FMT_YUVA444P16
+#define XYZ_PIXEL_FORMATS   AV_PIX_FMT_XYZ12
+
+static const enum AVPixelFormat rgb_pix_fmts[]  = {RGB_PIXEL_FORMATS};
+static const enum AVPixelFormat gray_pix_fmts[] = {GRAY_PIXEL_FORMATS};
+static const enum AVPixelFormat yuv_pix_fmts[]  = {YUV_PIXEL_FORMATS};
+static const enum AVPixelFormat xyz_pix_fmts[]  = {XYZ_PIXEL_FORMATS,
+                                                   YUV_PIXEL_FORMATS};
+static const enum AVPixelFormat all_pix_fmts[]  = {RGB_PIXEL_FORMATS,
+                                                   GRAY_PIXEL_FORMATS,
+                                                   YUV_PIXEL_FORMATS,
+                                                   XYZ_PIXEL_FORMATS};
+
 /* marker segments */
 /* get sizes and offsets of image, tiles; number of components */
 static int get_siz(Jpeg2000DecoderContext *s)
 {
     int i;
     int ncomponents;
+    uint32_t log2_chroma_wh = 0;
+    const enum AVPixelFormat *possible_fmts = NULL;
+    int possible_fmts_nb = 0;
 
-    if (bytestream2_get_bytes_left(&s->g) < 36)
+    if (bytestream2_get_bytes_left(&s->g) < 36) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for SIZ\n");
         return AVERROR_INVALIDDATA;
+    }
 
     s->avctx->profile = bytestream2_get_be16u(&s->g); // Rsiz
     s->width          = bytestream2_get_be32u(&s->g); // Width
@@ -177,6 +275,11 @@ static int get_siz(Jpeg2000DecoderContext *s)
     s->tile_offset_y  = bytestream2_get_be32u(&s->g); // YT0Siz
     ncomponents       = bytestream2_get_be16u(&s->g); // CSiz
 
+    if (s->image_offset_x || s->image_offset_y) {
+        avpriv_request_sample(s->avctx, "Support for image offsets");
+        return AVERROR_PATCHWELCOME;
+    }
+
     if (ncomponents <= 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Invalid number of components: %d\n",
                s->ncomponents);
@@ -185,21 +288,22 @@ static int get_siz(Jpeg2000DecoderContext *s)
 
     if (ncomponents > 4) {
         avpriv_request_sample(s->avctx, "Support for %d components",
-                              s->ncomponents);
+                              ncomponents);
         return AVERROR_PATCHWELCOME;
     }
 
     s->ncomponents = ncomponents;
 
-    if (s->tile_width <= 0 || s->tile_height <= 0 ||
-        s->tile_width > s->width || s->tile_height > s->height) {
+    if (s->tile_width <= 0 || s->tile_height <= 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Invalid tile dimension %dx%d.\n",
                s->tile_width, s->tile_height);
         return AVERROR_INVALIDDATA;
     }
 
-    if (bytestream2_get_bytes_left(&s->g) < 3 * s->ncomponents)
+    if (bytestream2_get_bytes_left(&s->g) < 3 * s->ncomponents) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for %d components in SIZ\n", s->ncomponents);
         return AVERROR_INVALIDDATA;
+    }
 
     for (i = 0; i < s->ncomponents; i++) { // Ssiz_i XRsiz_i, YRsiz_i
         uint8_t x    = bytestream2_get_byteu(&s->g);
@@ -208,21 +312,22 @@ static int get_siz(Jpeg2000DecoderContext *s)
         s->sgnd[i]   = !!(x & 0x80);
         s->cdx[i]    = bytestream2_get_byteu(&s->g);
         s->cdy[i]    = bytestream2_get_byteu(&s->g);
-
-        if (s->cdx[i] != 1 || s->cdy[i] != 1) {
-            avpriv_request_sample(s->avctx,
-                                  "CDxy values %d %d for component %d",
-                                  s->cdx[i], s->cdy[i], i);
-            if (!s->cdx[i] || !s->cdy[i])
-                return AVERROR_INVALIDDATA;
-            else
-                return AVERROR_PATCHWELCOME;
+        if (   !s->cdx[i] || s->cdx[i] == 3 || s->cdx[i] > 4
+            || !s->cdy[i] || s->cdy[i] == 3 || s->cdy[i] > 4) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid sample separation %d/%d\n", s->cdx[i], s->cdy[i]);
+            return AVERROR_INVALIDDATA;
         }
+        log2_chroma_wh |= s->cdy[i] >> 1 << i * 4 | s->cdx[i] >> 1 << i * 4 + 2;
     }
 
     s->numXtiles = ff_jpeg2000_ceildiv(s->width  - s->tile_offset_x, s->tile_width);
     s->numYtiles = ff_jpeg2000_ceildiv(s->height - s->tile_offset_y, s->tile_height);
 
+    if (s->numXtiles * (uint64_t)s->numYtiles > INT_MAX/sizeof(*s->tile)) {
+        s->numXtiles = s->numYtiles = 0;
+        return AVERROR(EINVAL);
+    }
+
     s->tile = av_mallocz_array(s->numXtiles * s->numYtiles, sizeof(*s->tile));
     if (!s->tile) {
         s->numXtiles = s->numYtiles = 0;
@@ -243,36 +348,74 @@ static int get_siz(Jpeg2000DecoderContext *s)
     s->avctx->height = ff_jpeg2000_ceildivpow2(s->height - s->image_offset_y,
                                                s->reduction_factor);
 
-    switch (s->ncomponents) {
-    case 1:
-        if (s->precision > 8)
-            s->avctx->pix_fmt = AV_PIX_FMT_GRAY16;
-        else
-            s->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
-        break;
-    case 3:
-        switch (s->avctx->profile) {
-        case FF_PROFILE_JPEG2000_DCINEMA_2K:
-        case FF_PROFILE_JPEG2000_DCINEMA_4K:
-            /* XYZ color-space for digital cinema profiles */
-            s->avctx->pix_fmt = AV_PIX_FMT_XYZ12;
+    if (s->avctx->profile == FF_PROFILE_JPEG2000_DCINEMA_2K ||
+        s->avctx->profile == FF_PROFILE_JPEG2000_DCINEMA_4K) {
+        possible_fmts = xyz_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(xyz_pix_fmts);
+    } else {
+        switch (s->colour_space) {
+        case 16:
+            possible_fmts = rgb_pix_fmts;
+            possible_fmts_nb = FF_ARRAY_ELEMS(rgb_pix_fmts);
+            break;
+        case 17:
+            possible_fmts = gray_pix_fmts;
+            possible_fmts_nb = FF_ARRAY_ELEMS(gray_pix_fmts);
+            break;
+        case 18:
+            possible_fmts = yuv_pix_fmts;
+            possible_fmts_nb = FF_ARRAY_ELEMS(yuv_pix_fmts);
             break;
         default:
-            if (s->precision > 8)
-                s->avctx->pix_fmt = AV_PIX_FMT_RGB48;
-            else
-                s->avctx->pix_fmt = AV_PIX_FMT_RGB24;
+            possible_fmts = all_pix_fmts;
+            possible_fmts_nb = FF_ARRAY_ELEMS(all_pix_fmts);
             break;
         }
-        break;
-    case 4:
-        s->avctx->pix_fmt = AV_PIX_FMT_RGBA;
-        break;
-    default:
-        /* pixel format can not be identified */
-        s->avctx->pix_fmt = AV_PIX_FMT_NONE;
-        break;
     }
+    for (i = 0; i < possible_fmts_nb; ++i) {
+        if (pix_fmt_match(possible_fmts[i], ncomponents, s->precision, log2_chroma_wh, s->pal8)) {
+            s->avctx->pix_fmt = possible_fmts[i];
+            break;
+        }
+    }
+
+    if (i == possible_fmts_nb) {
+        if (ncomponents == 4 &&
+            s->cdy[0] == 1 && s->cdx[0] == 1 &&
+            s->cdy[1] == 1 && s->cdx[1] == 1 &&
+            s->cdy[2] == s->cdy[3] && s->cdx[2] == s->cdx[3]) {
+            if (s->precision == 8 && s->cdy[2] == 2 && s->cdx[2] == 2 && !s->pal8) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
+                s->cdef[0] = 0;
+                s->cdef[1] = 1;
+                s->cdef[2] = 2;
+                s->cdef[3] = 3;
+                i = 0;
+            }
+        }
+    }
+
+
+    if (i == possible_fmts_nb) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Unknown pix_fmt, profile: %d, colour_space: %d, "
+               "components: %d, precision: %d\n"
+               "cdx[0]: %d, cdy[0]: %d\n"
+               "cdx[1]: %d, cdy[1]: %d\n"
+               "cdx[2]: %d, cdy[2]: %d\n"
+               "cdx[3]: %d, cdy[3]: %d\n",
+               s->avctx->profile, s->colour_space, ncomponents, s->precision,
+               s->cdx[0],
+               s->cdy[0],
+               ncomponents > 1 ? s->cdx[1] : 0,
+               ncomponents > 1 ? s->cdy[1] : 0,
+               ncomponents > 2 ? s->cdx[2] : 0,
+               ncomponents > 2 ? s->cdy[2] : 0,
+               ncomponents > 3 ? s->cdx[3] : 0,
+               ncomponents > 3 ? s->cdy[3] : 0);
+        return AVERROR_PATCHWELCOME;
+    }
+    s->avctx->bits_per_raw_sample = s->precision;
     return 0;
 }
 
@@ -281,24 +424,34 @@ static int get_cox(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c)
 {
     uint8_t byte;
 
-    if (bytestream2_get_bytes_left(&s->g) < 5)
+    if (bytestream2_get_bytes_left(&s->g) < 5) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for COX\n");
         return AVERROR_INVALIDDATA;
+    }
 
     /*  nreslevels = number of resolution levels
                    = number of decomposition level +1 */
     c->nreslevels = bytestream2_get_byteu(&s->g) + 1;
-
-    if (c->nreslevels > JPEG2000_MAX_RESLEVELS)
+    if (c->nreslevels >= JPEG2000_MAX_RESLEVELS) {
+        av_log(s->avctx, AV_LOG_ERROR, "nreslevels %d is invalid\n", c->nreslevels);
         return AVERROR_INVALIDDATA;
+    }
+
+    if (c->nreslevels <= s->reduction_factor) {
+        /* we are forced to update reduction_factor as its requested value is
+           not compatible with this bitstream, and as we might have used it
+           already in setup earlier we have to fail this frame until
+           reinitialization is implemented */
+        av_log(s->avctx, AV_LOG_ERROR, "reduction_factor too large for this bitstream, max is %d\n", c->nreslevels - 1);
+        s->reduction_factor = c->nreslevels - 1;
+        return AVERROR(EINVAL);
+    }
 
     /* compute number of resolution levels to decode */
-    if (c->nreslevels < s->reduction_factor)
-        c->nreslevels2decode = 1;
-    else
-        c->nreslevels2decode = c->nreslevels - s->reduction_factor;
+    c->nreslevels2decode = c->nreslevels - s->reduction_factor;
 
-    c->log2_cblk_width  = bytestream2_get_byteu(&s->g) + 2; // cblk width
-    c->log2_cblk_height = bytestream2_get_byteu(&s->g) + 2; // cblk height
+    c->log2_cblk_width  = (bytestream2_get_byteu(&s->g) & 15) + 2; // cblk width
+    c->log2_cblk_height = (bytestream2_get_byteu(&s->g) & 15) + 2; // cblk height
 
     if (c->log2_cblk_width > 10 || c->log2_cblk_height > 10 ||
         c->log2_cblk_width + c->log2_cblk_height > 12) {
@@ -308,13 +461,17 @@ static int get_cox(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c)
 
     c->cblk_style = bytestream2_get_byteu(&s->g);
     if (c->cblk_style != 0) { // cblk style
-        avpriv_request_sample(s->avctx, "Support for extra cblk styles");
-        return AVERROR_PATCHWELCOME;
+        av_log(s->avctx, AV_LOG_WARNING, "extra cblk styles %X\n", c->cblk_style);
+        if (c->cblk_style & JPEG2000_CBLK_BYPASS)
+            av_log(s->avctx, AV_LOG_WARNING, "Selective arithmetic coding bypass\n");
     }
     c->transform = bytestream2_get_byteu(&s->g); // DWT transformation type
     /* set integer 9/7 DWT in case of BITEXACT flag */
     if ((s->avctx->flags & AV_CODEC_FLAG_BITEXACT) && (c->transform == FF_DWT97))
         c->transform = FF_DWT97_INT;
+    else if (c->transform == FF_DWT53) {
+        s->avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
+    }
 
     if (c->csty & JPEG2000_CSTY_PREC) {
         int i;
@@ -322,6 +479,13 @@ static int get_cox(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c)
             byte = bytestream2_get_byte(&s->g);
             c->log2_prec_widths[i]  =  byte       & 0x0F;    // precinct PPx
             c->log2_prec_heights[i] = (byte >> 4) & 0x0F;    // precinct PPy
+            if (i)
+                if (c->log2_prec_widths[i] == 0 || c->log2_prec_heights[i] == 0) {
+                    av_log(s->avctx, AV_LOG_ERROR, "PPx %d PPy %d invalid\n",
+                           c->log2_prec_widths[i], c->log2_prec_heights[i]);
+                    c->log2_prec_widths[i] = c->log2_prec_heights[i] = 1;
+                    return AVERROR_INVALIDDATA;
+                }
         }
     } else {
         memset(c->log2_prec_widths , 15, sizeof(c->log2_prec_widths ));
@@ -337,8 +501,10 @@ static int get_cod(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c,
     Jpeg2000CodingStyle tmp;
     int compno, ret;
 
-    if (bytestream2_get_bytes_left(&s->g) < 5)
+    if (bytestream2_get_bytes_left(&s->g) < 5) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for COD\n");
         return AVERROR_INVALIDDATA;
+    }
 
     tmp.csty = bytestream2_get_byteu(&s->g);
 
@@ -371,8 +537,10 @@ static int get_coc(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c,
 {
     int compno, ret;
 
-    if (bytestream2_get_bytes_left(&s->g) < 2)
+    if (bytestream2_get_bytes_left(&s->g) < 2) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for COC\n");
         return AVERROR_INVALIDDATA;
+    }
 
     compno = bytestream2_get_byteu(&s->g);
 
@@ -409,7 +577,7 @@ static int get_qcx(Jpeg2000DecoderContext *s, int n, Jpeg2000QuantStyle *q)
     if (q->quantsty == JPEG2000_QSTY_NONE) {
         n -= 3;
         if (bytestream2_get_bytes_left(&s->g) < n ||
-            n > JPEG2000_MAX_DECLEVELS)
+            n > JPEG2000_MAX_DECLEVELS*3)
             return AVERROR_INVALIDDATA;
         for (i = 0; i < n; i++)
             q->expn[i] = bytestream2_get_byteu(&s->g) >> 3;
@@ -427,7 +595,7 @@ static int get_qcx(Jpeg2000DecoderContext *s, int n, Jpeg2000QuantStyle *q)
     } else {
         n = (n - 3) >> 1;
         if (bytestream2_get_bytes_left(&s->g) < 2 * n ||
-            n > JPEG2000_MAX_DECLEVELS)
+            n > JPEG2000_MAX_DECLEVELS*3)
             return AVERROR_INVALIDDATA;
         for (i = 0; i < n; i++) {
             x          = bytestream2_get_be16u(&s->g);
@@ -445,6 +613,8 @@ static int get_qcd(Jpeg2000DecoderContext *s, int n, Jpeg2000QuantStyle *q,
     Jpeg2000QuantStyle tmp;
     int compno, ret;
 
+    memset(&tmp, 0, sizeof(tmp));
+
     if ((ret = get_qcx(s, n, &tmp)) < 0)
         return ret;
     for (compno = 0; compno < s->ncomponents; compno++)
@@ -476,40 +646,99 @@ static int get_qcc(Jpeg2000DecoderContext *s, int n, Jpeg2000QuantStyle *q,
     return get_qcx(s, n - 1, q + compno);
 }
 
+static int get_poc(Jpeg2000DecoderContext *s, int size, Jpeg2000POC *p)
+{
+    int i;
+    int elem_size = s->ncomponents <= 257 ? 7 : 9;
+    Jpeg2000POC tmp = {{{0}}};
+
+    if (bytestream2_get_bytes_left(&s->g) < 5 || size < 2 + elem_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for POC\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (elem_size > 7) {
+        avpriv_request_sample(s->avctx, "Fat POC not supported");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    tmp.nb_poc = (size - 2) / elem_size;
+    if (tmp.nb_poc > MAX_POCS) {
+        avpriv_request_sample(s->avctx, "Too many POCs (%d)", tmp.nb_poc);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    for (i = 0; i<tmp.nb_poc; i++) {
+        Jpeg2000POCEntry *e = &tmp.poc[i];
+        e->RSpoc  = bytestream2_get_byteu(&s->g);
+        e->CSpoc  = bytestream2_get_byteu(&s->g);
+        e->LYEpoc = bytestream2_get_be16u(&s->g);
+        e->REpoc  = bytestream2_get_byteu(&s->g);
+        e->CEpoc  = bytestream2_get_byteu(&s->g);
+        e->Ppoc   = bytestream2_get_byteu(&s->g);
+        if (!e->CEpoc)
+            e->CEpoc = 256;
+        if (e->CEpoc > s->ncomponents)
+            e->CEpoc = s->ncomponents;
+        if (   e->RSpoc >= e->REpoc || e->REpoc > 33
+            || e->CSpoc >= e->CEpoc || e->CEpoc > s->ncomponents
+            || !e->LYEpoc) {
+            av_log(s->avctx, AV_LOG_ERROR, "POC Entry %d is invalid (%d, %d, %d, %d, %d, %d)\n", i,
+                e->RSpoc, e->CSpoc, e->LYEpoc, e->REpoc, e->CEpoc, e->Ppoc
+            );
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (!p->nb_poc || p->is_default) {
+        *p = tmp;
+    } else {
+        if (p->nb_poc + tmp.nb_poc > MAX_POCS) {
+            av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for POC\n");
+            return AVERROR_INVALIDDATA;
+        }
+        memcpy(p->poc + p->nb_poc, tmp.poc, tmp.nb_poc * sizeof(tmp.poc[0]));
+        p->nb_poc += tmp.nb_poc;
+    }
+
+    p->is_default = 0;
+
+    return 0;
+}
+
+
 /* Get start of tile segment. */
 static int get_sot(Jpeg2000DecoderContext *s, int n)
 {
     Jpeg2000TilePart *tp;
     uint16_t Isot;
     uint32_t Psot;
-    uint8_t TPsot;
+    unsigned TPsot;
 
     if (bytestream2_get_bytes_left(&s->g) < 8)
         return AVERROR_INVALIDDATA;
 
+    s->curtileno = 0;
     Isot = bytestream2_get_be16u(&s->g);        // Isot
     if (Isot >= s->numXtiles * s->numYtiles)
         return AVERROR_INVALIDDATA;
 
-    if (Isot) {
-        avpriv_request_sample(s->avctx, "Support for more than one tile");
-        return AVERROR_PATCHWELCOME;
-    }
+    s->curtileno = Isot;
     Psot  = bytestream2_get_be32u(&s->g);       // Psot
     TPsot = bytestream2_get_byteu(&s->g);       // TPsot
 
     /* Read TNSot but not used */
     bytestream2_get_byteu(&s->g);               // TNsot
 
+    if (!Psot)
+        Psot = bytestream2_get_bytes_left(&s->g) + n + 2;
+
     if (Psot > bytestream2_get_bytes_left(&s->g) + n + 2) {
         av_log(s->avctx, AV_LOG_ERROR, "Psot %"PRIu32" too big\n", Psot);
         return AVERROR_INVALIDDATA;
     }
 
-    if (TPsot >= FF_ARRAY_ELEMS(s->tile[Isot].tile_part)) {
-        avpriv_request_sample(s->avctx, "Support for %"PRIu8" components", TPsot);
-        return AVERROR_PATCHWELCOME;
-    }
+    av_assert0(TPsot < FF_ARRAY_ELEMS(s->tile[Isot].tile_part));
 
     s->tile[Isot].tp_idx = TPsot;
     tp             = s->tile[Isot].tile_part + TPsot;
@@ -522,6 +751,8 @@ static int get_sot(Jpeg2000DecoderContext *s, int n)
         /* copy defaults */
         memcpy(tile->codsty, s->codsty, s->ncomponents * sizeof(Jpeg2000CodingStyle));
         memcpy(tile->qntsty, s->qntsty, s->ncomponents * sizeof(Jpeg2000QuantStyle));
+        memcpy(&tile->poc  , &s->poc  , sizeof(tile->poc));
+        tile->poc.is_default = 1;
     }
 
     return 0;
@@ -569,6 +800,22 @@ static uint8_t get_tlm(Jpeg2000DecoderContext *s, int n)
     return 0;
 }
 
+static uint8_t get_plt(Jpeg2000DecoderContext *s, int n)
+{
+    int i;
+
+    av_log(s->avctx, AV_LOG_DEBUG,
+            "PLT marker at pos 0x%X\n", bytestream2_tell(&s->g) - 4);
+
+    /*Zplt =*/ bytestream2_get_byte(&s->g);
+
+    for (i = 0; i < n - 3; i++) {
+        bytestream2_get_byte(&s->g);
+    }
+
+    return 0;
+}
+
 static int init_tile(Jpeg2000DecoderContext *s, int tileno)
 {
     int compno;
@@ -579,16 +826,27 @@ static int init_tile(Jpeg2000DecoderContext *s, int tileno)
     if (!tile->comp)
         return AVERROR(ENOMEM);
 
+    tile->coord[0][0] = FFMAX(tilex       * s->tile_width  + s->tile_offset_x, s->image_offset_x);
+    tile->coord[0][1] = FFMIN((tilex + 1) * s->tile_width  + s->tile_offset_x, s->width);
+    tile->coord[1][0] = FFMAX(tiley       * s->tile_height + s->tile_offset_y, s->image_offset_y);
+    tile->coord[1][1] = FFMIN((tiley + 1) * s->tile_height + s->tile_offset_y, s->height);
+
     for (compno = 0; compno < s->ncomponents; compno++) {
         Jpeg2000Component *comp = tile->comp + compno;
         Jpeg2000CodingStyle *codsty = tile->codsty + compno;
         Jpeg2000QuantStyle  *qntsty = tile->qntsty + compno;
         int ret; // global bandno
 
-        comp->coord_o[0][0] = FFMAX(tilex       * s->tile_width  + s->tile_offset_x, s->image_offset_x);
-        comp->coord_o[0][1] = FFMIN((tilex + 1) * s->tile_width  + s->tile_offset_x, s->width);
-        comp->coord_o[1][0] = FFMAX(tiley       * s->tile_height + s->tile_offset_y, s->image_offset_y);
-        comp->coord_o[1][1] = FFMIN((tiley + 1) * s->tile_height + s->tile_offset_y, s->height);
+        comp->coord_o[0][0] = tile->coord[0][0];
+        comp->coord_o[0][1] = tile->coord[0][1];
+        comp->coord_o[1][0] = tile->coord[1][0];
+        comp->coord_o[1][1] = tile->coord[1][1];
+        if (compno) {
+            comp->coord_o[0][0] /= s->cdx[compno];
+            comp->coord_o[0][1] /= s->cdx[compno];
+            comp->coord_o[1][0] /= s->cdy[compno];
+            comp->coord_o[1][1] /= s->cdy[compno];
+        }
 
         comp->coord[0][0] = ff_jpeg2000_ceildivpow2(comp->coord_o[0][0], s->reduction_factor);
         comp->coord[0][1] = ff_jpeg2000_ceildivpow2(comp->coord_o[0][1], s->reduction_factor);
@@ -630,12 +888,26 @@ static int getlblockinc(Jpeg2000DecoderContext *s)
     return res;
 }
 
-static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
+static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile, int *tp_index,
                                   Jpeg2000CodingStyle *codsty,
                                   Jpeg2000ResLevel *rlevel, int precno,
                                   int layno, uint8_t *expn, int numgbits)
 {
     int bandno, cblkno, ret, nb_code_blocks;
+    int cwsno;
+
+    if (layno < rlevel->band[0].prec[precno].decoded_layers)
+        return 0;
+    rlevel->band[0].prec[precno].decoded_layers = layno + 1;
+
+    if (bytestream2_get_bytes_left(&s->g) == 0 && s->bit_index == 8) {
+        if (*tp_index < FF_ARRAY_ELEMS(tile->tile_part) - 1) {
+            s->g = tile->tile_part[++(*tp_index)].tpg;
+        }
+    }
+
+    if (bytestream2_peek_be32(&s->g) == JPEG2000_SOP_FIXED_BYTES)
+        bytestream2_skip(&s->g, JPEG2000_SOP_BYTE_LENGTH);
 
     if (!(ret = get_bits(s, 1))) {
         jpeg2000_flush(s);
@@ -677,19 +949,46 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
             }
             if ((newpasses = getnpasses(s)) < 0)
                 return newpasses;
+            av_assert2(newpasses > 0);
+            if (cblk->npasses + newpasses >= JPEG2000_MAX_PASSES) {
+                avpriv_request_sample(s->avctx, "Too many passes");
+                return AVERROR_PATCHWELCOME;
+            }
             if ((llen = getlblockinc(s)) < 0)
                 return llen;
-            cblk->lblock += llen;
-            if ((ret = get_bits(s, av_log2(newpasses) + cblk->lblock)) < 0)
-                return ret;
-            if (ret > sizeof(cblk->data)) {
+            if (cblk->lblock + llen + av_log2(newpasses) > 16) {
                 avpriv_request_sample(s->avctx,
-                                      "Block with lengthinc greater than %zu",
-                                      sizeof(cblk->data));
+                                      "Block with length beyond 16 bits");
                 return AVERROR_PATCHWELCOME;
             }
-            cblk->lengthinc = ret;
-            cblk->npasses  += newpasses;
+
+            cblk->lblock += llen;
+
+            cblk->nb_lengthinc = 0;
+            cblk->nb_terminationsinc = 0;
+            do {
+                int newpasses1 = 0;
+
+                while (newpasses1 < newpasses) {
+                    newpasses1 ++;
+                    if (needs_termination(codsty->cblk_style, cblk->npasses + newpasses1 - 1)) {
+                        cblk->nb_terminationsinc ++;
+                        break;
+                    }
+                }
+
+                if ((ret = get_bits(s, av_log2(newpasses1) + cblk->lblock)) < 0)
+                    return ret;
+                if (ret > sizeof(cblk->data)) {
+                    avpriv_request_sample(s->avctx,
+                                        "Block with lengthinc greater than %"SIZE_SPECIFIER"",
+                                        sizeof(cblk->data));
+                    return AVERROR_PATCHWELCOME;
+                }
+                cblk->lengthinc[cblk->nb_lengthinc++] = ret;
+                cblk->npasses  += newpasses1;
+                newpasses -= newpasses1;
+            } while(newpasses);
         }
     }
     jpeg2000_flush(s);
@@ -698,7 +997,7 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
         if (bytestream2_peek_be16(&s->g) == JPEG2000_EPH)
             bytestream2_skip(&s->g, 2);
         else
-            av_log(s->avctx, AV_LOG_ERROR, "EPH marker not found.\n");
+            av_log(s->avctx, AV_LOG_ERROR, "EPH marker not found. instead %X\n", bytestream2_peek_be32(&s->g));
     }
 
     for (bandno = 0; bandno < rlevel->nbands; bandno++) {
@@ -708,148 +1007,330 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
         nb_code_blocks = prec->nb_codeblocks_height * prec->nb_codeblocks_width;
         for (cblkno = 0; cblkno < nb_code_blocks; cblkno++) {
             Jpeg2000Cblk *cblk = prec->cblk + cblkno;
-            if (bytestream2_get_bytes_left(&s->g) < cblk->lengthinc)
-                return AVERROR_INVALIDDATA;
-            /* Code-block data can be empty. In that case initialize data
-             * with 0xFFFF. */
-            if (cblk->lengthinc > 0) {
-                bytestream2_get_bufferu(&s->g, cblk->data, cblk->lengthinc);
-            } else {
-                cblk->data[0] = 0xFF;
-                cblk->data[1] = 0xFF;
-            }
-            cblk->length   += cblk->lengthinc;
-            cblk->lengthinc = 0;
+            for (cwsno = 0; cwsno < cblk->nb_lengthinc; cwsno ++) {
+                if (   bytestream2_get_bytes_left(&s->g) < cblk->lengthinc[cwsno]
+                    || sizeof(cblk->data) < cblk->length + cblk->lengthinc[cwsno] + 4
+                ) {
+                    av_log(s->avctx, AV_LOG_ERROR,
+                        "Block length %"PRIu16" or lengthinc %d is too large, left %d\n",
+                        cblk->length, cblk->lengthinc[cwsno], bytestream2_get_bytes_left(&s->g));
+                    return AVERROR_INVALIDDATA;
+                }
 
-            if (cblk->length > sizeof(cblk->data)) {
-                av_log(s->avctx, AV_LOG_ERROR,
-                       "Block length %"PRIu16" > data size %zd\n",
-                       cblk->length, sizeof(cblk->data));
-                return AVERROR_INVALIDDATA;
+                bytestream2_get_bufferu(&s->g, cblk->data + cblk->length, cblk->lengthinc[cwsno]);
+                cblk->length   += cblk->lengthinc[cwsno];
+                cblk->lengthinc[cwsno] = 0;
+                if (cblk->nb_terminationsinc) {
+                    cblk->nb_terminationsinc--;
+                    cblk->nb_terminations++;
+                    cblk->data[cblk->length++] = 0xFF;
+                    cblk->data[cblk->length++] = 0xFF;
+                    cblk->data_start[cblk->nb_terminations] = cblk->length;
+                }
             }
         }
     }
     return 0;
 }
 
-static int decode_pgod_lrcp(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
+static int jpeg2000_decode_packets_po_iteration(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile,
+                                             int RSpoc, int CSpoc,
+                                             int LYEpoc, int REpoc, int CEpoc,
+                                             int Ppoc, int *tp_index)
 {
+    int ret = 0;
     int layno, reslevelno, compno, precno, ok_reslevel;
-    int ret;
+    int x, y;
+    int step_x, step_y;
 
-    for (layno = 0; layno < tile->codsty[0].nlayers; layno++) {
+    switch (Ppoc) {
+    case JPEG2000_PGOD_RLCP:
+        av_log(s->avctx, AV_LOG_DEBUG, "Progression order RLCP\n");
         ok_reslevel = 1;
-        for (reslevelno = 0; ok_reslevel; reslevelno++) {
+        for (reslevelno = RSpoc; ok_reslevel && reslevelno < REpoc; reslevelno++) {
             ok_reslevel = 0;
-            for (compno = 0; compno < s->ncomponents; compno++) {
-                Jpeg2000CodingStyle *codsty = tile->codsty + compno;
-                Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
-                if (reslevelno < codsty->nreslevels) {
-                    Jpeg2000ResLevel *rlevel = tile->comp[compno].reslevel +
-                                               reslevelno;
-                    ok_reslevel = 1;
-                    for (precno = 0; precno < rlevel->num_precincts_x * rlevel->num_precincts_y; precno++)
-                        if ((ret = jpeg2000_decode_packet(s,
-                                                          codsty, rlevel,
-                                                          precno, layno,
-                                                          qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
-                                                          qntsty->nguardbits)) < 0)
-                            return ret;
+            for (layno = 0; layno < LYEpoc; layno++) {
+                for (compno = CSpoc; compno < CEpoc; compno++) {
+                    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                    Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+                    if (reslevelno < codsty->nreslevels) {
+                        Jpeg2000ResLevel *rlevel = tile->comp[compno].reslevel +
+                                                reslevelno;
+                        ok_reslevel = 1;
+                        for (precno = 0; precno < rlevel->num_precincts_x * rlevel->num_precincts_y; precno++)
+                            if ((ret = jpeg2000_decode_packet(s, tile, tp_index,
+                                                              codsty, rlevel,
+                                                              precno, layno,
+                                                              qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                              qntsty->nguardbits)) < 0)
+                                return ret;
+                    }
                 }
             }
         }
-    }
-
-    return 0;
-}
-
-static int decode_pgod_cprl(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
-{
-    int layno, reslevelno, compno, precno;
-    int ret, x, y;
-
-    for (compno = 0; compno < s->ncomponents; compno++) {
-        Jpeg2000CodingStyle *codsty = tile->codsty + compno;
-        Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
-
-        /* Set bit stream buffer address according to tile-part.
-         * For DCinema one tile-part per component, so can be
-         * indexed by component. */
-        s->g = tile->tile_part[compno].tpg;
-
-        /* Position loop (y axis)
-         * TODO: Automate computing of step 256.
-         * Fixed here, but to be computed before entering here. */
-        for (y = 0; y < s->height; y += 256) {
-            /* Position loop (y axis)
-             * TODO: automate computing of step 256.
-             * Fixed here, but to be computed before entering here. */
-            for (x = 0; x < s->width; x += 256) {
-                for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++) {
-                    uint16_t prcx, prcy;
-                    uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
-                    Jpeg2000ResLevel *rlevel = tile->comp[compno].reslevel + reslevelno;
-
-                    if (!((y % (1 << (rlevel->log2_prec_height + reducedresno)) == 0) ||
-                          (y == 0))) // TODO: 2nd condition simplified as try0 always =0 for dcinema
-                        continue;
-
-                    if (!((x % (1 << (rlevel->log2_prec_width + reducedresno)) == 0) ||
-                          (x == 0))) // TODO: 2nd condition simplified as try0 always =0 for dcinema
-                        continue;
+        break;
 
-                    // check if a precinct exists
-                    prcx   = ff_jpeg2000_ceildivpow2(x, reducedresno) >> rlevel->log2_prec_width;
-                    prcy   = ff_jpeg2000_ceildivpow2(y, reducedresno) >> rlevel->log2_prec_height;
-                    precno = prcx + rlevel->num_precincts_x * prcy;
-                    for (layno = 0; layno < tile->codsty[0].nlayers; layno++) {
-                        if ((ret = jpeg2000_decode_packet(s, codsty, rlevel,
-                                                          precno, layno,
-                                                          qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
-                                                          qntsty->nguardbits)) < 0)
-                            return ret;
+    case JPEG2000_PGOD_LRCP:
+        av_log(s->avctx, AV_LOG_DEBUG, "Progression order LRCP\n");
+        for (layno = 0; layno < LYEpoc; layno++) {
+            ok_reslevel = 1;
+            for (reslevelno = RSpoc; ok_reslevel && reslevelno < REpoc; reslevelno++) {
+                ok_reslevel = 0;
+                for (compno = CSpoc; compno < CEpoc; compno++) {
+                    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                    Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+                    if (reslevelno < codsty->nreslevels) {
+                        Jpeg2000ResLevel *rlevel = tile->comp[compno].reslevel +
+                                                reslevelno;
+                        ok_reslevel = 1;
+                        for (precno = 0; precno < rlevel->num_precincts_x * rlevel->num_precincts_y; precno++)
+                            if ((ret = jpeg2000_decode_packet(s, tile, tp_index,
+                                                              codsty, rlevel,
+                                                              precno, layno,
+                                                              qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                              qntsty->nguardbits)) < 0)
+                                return ret;
                     }
                 }
             }
         }
-    }
-
-    return 0;
-}
-
-static int jpeg2000_decode_packets(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
-{
-    int ret = 0;
-
-    s->bit_index = 8;
-    switch (tile->codsty[0].prog_order) {
-    case JPEG2000_PGOD_LRCP:
-        ret = decode_pgod_lrcp(s, tile);
         break;
 
     case JPEG2000_PGOD_CPRL:
-        ret = decode_pgod_cprl(s, tile);
-        break;
-
-    case JPEG2000_PGOD_RLCP:
-        avpriv_request_sample(s->avctx, "Progression order RLCP");
-        ret = AVERROR_PATCHWELCOME;
+        av_log(s->avctx, AV_LOG_DEBUG, "Progression order CPRL\n");
+        for (compno = CSpoc; compno < CEpoc; compno++) {
+            Jpeg2000Component *comp     = tile->comp + compno;
+            Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+            Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+            step_x = 32;
+            step_y = 32;
+
+            for (reslevelno = RSpoc; reslevelno < FFMIN(codsty->nreslevels, REpoc); reslevelno++) {
+                uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                step_x = FFMIN(step_x, rlevel->log2_prec_width  + reducedresno);
+                step_y = FFMIN(step_y, rlevel->log2_prec_height + reducedresno);
+            }
+            av_assert0(step_x < 32 && step_y < 32);
+            step_x = 1<<step_x;
+            step_y = 1<<step_y;
+
+            for (y = tile->coord[1][0]; y < tile->coord[1][1]; y = (y/step_y + 1)*step_y) {
+                for (x = tile->coord[0][0]; x < tile->coord[0][1]; x = (x/step_x + 1)*step_x) {
+                    for (reslevelno = RSpoc; reslevelno < FFMIN(codsty->nreslevels, REpoc); reslevelno++) {
+                        unsigned prcx, prcy;
+                        uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                        Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                        int xc = x / s->cdx[compno];
+                        int yc = y / s->cdy[compno];
+
+                        if (yc % (1 << (rlevel->log2_prec_height + reducedresno)) && y != tile->coord[1][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        if (xc % (1 << (rlevel->log2_prec_width + reducedresno)) && x != tile->coord[0][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        // check if a precinct exists
+                        prcx   = ff_jpeg2000_ceildivpow2(xc, reducedresno) >> rlevel->log2_prec_width;
+                        prcy   = ff_jpeg2000_ceildivpow2(yc, reducedresno) >> rlevel->log2_prec_height;
+                        prcx  -= ff_jpeg2000_ceildivpow2(comp->coord_o[0][0], reducedresno) >> rlevel->log2_prec_width;
+                        prcy  -= ff_jpeg2000_ceildivpow2(comp->coord_o[1][0], reducedresno) >> rlevel->log2_prec_height;
+
+                        precno = prcx + rlevel->num_precincts_x * prcy;
+
+                        if (prcx >= rlevel->num_precincts_x || prcy >= rlevel->num_precincts_y) {
+                            av_log(s->avctx, AV_LOG_WARNING, "prc %d %d outside limits %d %d\n",
+                                   prcx, prcy, rlevel->num_precincts_x, rlevel->num_precincts_y);
+                            continue;
+                        }
+
+                        for (layno = 0; layno < LYEpoc; layno++) {
+                            if ((ret = jpeg2000_decode_packet(s, tile, tp_index, codsty, rlevel,
+                                                              precno, layno,
+                                                              qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                              qntsty->nguardbits)) < 0)
+                                return ret;
+                        }
+                    }
+                }
+            }
+        }
         break;
 
     case JPEG2000_PGOD_RPCL:
-        avpriv_request_sample(s->avctx, "Progression order RPCL");
-        ret = AVERROR_PATCHWELCOME;
+        av_log(s->avctx, AV_LOG_WARNING, "Progression order RPCL\n");
+        ok_reslevel = 1;
+        for (reslevelno = RSpoc; ok_reslevel && reslevelno < REpoc; reslevelno++) {
+            ok_reslevel = 0;
+            step_x = 30;
+            step_y = 30;
+            for (compno = CSpoc; compno < CEpoc; compno++) {
+                Jpeg2000Component *comp     = tile->comp + compno;
+                Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+
+                if (reslevelno < codsty->nreslevels) {
+                    uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                    Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                    step_x = FFMIN(step_x, rlevel->log2_prec_width  + reducedresno);
+                    step_y = FFMIN(step_y, rlevel->log2_prec_height + reducedresno);
+                }
+            }
+            step_x = 1<<step_x;
+            step_y = 1<<step_y;
+
+            for (y = tile->coord[1][0]; y < tile->coord[1][1]; y = (y/step_y + 1)*step_y) {
+                for (x = tile->coord[0][0]; x < tile->coord[0][1]; x = (x/step_x + 1)*step_x) {
+                    for (compno = CSpoc; compno < CEpoc; compno++) {
+                        Jpeg2000Component *comp     = tile->comp + compno;
+                        Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                        Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+                        uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                        Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                        unsigned prcx, prcy;
+
+                        int xc = x / s->cdx[compno];
+                        int yc = y / s->cdy[compno];
+
+                        if (reslevelno >= codsty->nreslevels)
+                            continue;
+
+                        if (yc % (1 << (rlevel->log2_prec_height + reducedresno)) && y != tile->coord[1][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        if (xc % (1 << (rlevel->log2_prec_width + reducedresno)) && x != tile->coord[0][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        // check if a precinct exists
+                        prcx   = ff_jpeg2000_ceildivpow2(xc, reducedresno) >> rlevel->log2_prec_width;
+                        prcy   = ff_jpeg2000_ceildivpow2(yc, reducedresno) >> rlevel->log2_prec_height;
+                        prcx  -= ff_jpeg2000_ceildivpow2(comp->coord_o[0][0], reducedresno) >> rlevel->log2_prec_width;
+                        prcy  -= ff_jpeg2000_ceildivpow2(comp->coord_o[1][0], reducedresno) >> rlevel->log2_prec_height;
+
+                        precno = prcx + rlevel->num_precincts_x * prcy;
+
+                        ok_reslevel = 1;
+                        if (prcx >= rlevel->num_precincts_x || prcy >= rlevel->num_precincts_y) {
+                            av_log(s->avctx, AV_LOG_WARNING, "prc %d %d outside limits %d %d\n",
+                                   prcx, prcy, rlevel->num_precincts_x, rlevel->num_precincts_y);
+                            continue;
+                        }
+
+                            for (layno = 0; layno < LYEpoc; layno++) {
+                                if ((ret = jpeg2000_decode_packet(s, tile, tp_index,
+                                                                codsty, rlevel,
+                                                                precno, layno,
+                                                                qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                                qntsty->nguardbits)) < 0)
+                                    return ret;
+                            }
+                    }
+                }
+            }
+        }
         break;
 
     case JPEG2000_PGOD_PCRL:
-        avpriv_request_sample(s->avctx, "Progression order PCRL");
-        ret = AVERROR_PATCHWELCOME;
+        av_log(s->avctx, AV_LOG_WARNING, "Progression order PCRL\n");
+        step_x = 32;
+        step_y = 32;
+        for (compno = CSpoc; compno < CEpoc; compno++) {
+            Jpeg2000Component *comp     = tile->comp + compno;
+            Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+
+            for (reslevelno = RSpoc; reslevelno < FFMIN(codsty->nreslevels, REpoc); reslevelno++) {
+                uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                step_x = FFMIN(step_x, rlevel->log2_prec_width  + reducedresno);
+                step_y = FFMIN(step_y, rlevel->log2_prec_height + reducedresno);
+            }
+        }
+        step_x = 1<<step_x;
+        step_y = 1<<step_y;
+
+        for (y = tile->coord[1][0]; y < tile->coord[1][1]; y = (y/step_y + 1)*step_y) {
+            for (x = tile->coord[0][0]; x < tile->coord[0][1]; x = (x/step_x + 1)*step_x) {
+                for (compno = CSpoc; compno < CEpoc; compno++) {
+                    Jpeg2000Component *comp     = tile->comp + compno;
+                    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                    Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+                    int xc = x / s->cdx[compno];
+                    int yc = y / s->cdy[compno];
+
+                    for (reslevelno = RSpoc; reslevelno < FFMIN(codsty->nreslevels, REpoc); reslevelno++) {
+                        unsigned prcx, prcy;
+                        uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                        Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+
+                        if (yc % (1 << (rlevel->log2_prec_height + reducedresno)) && y != tile->coord[1][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        if (xc % (1 << (rlevel->log2_prec_width + reducedresno)) && x != tile->coord[0][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        // check if a precinct exists
+                        prcx   = ff_jpeg2000_ceildivpow2(xc, reducedresno) >> rlevel->log2_prec_width;
+                        prcy   = ff_jpeg2000_ceildivpow2(yc, reducedresno) >> rlevel->log2_prec_height;
+                        prcx  -= ff_jpeg2000_ceildivpow2(comp->coord_o[0][0], reducedresno) >> rlevel->log2_prec_width;
+                        prcy  -= ff_jpeg2000_ceildivpow2(comp->coord_o[1][0], reducedresno) >> rlevel->log2_prec_height;
+
+                        precno = prcx + rlevel->num_precincts_x * prcy;
+
+                        if (prcx >= rlevel->num_precincts_x || prcy >= rlevel->num_precincts_y) {
+                            av_log(s->avctx, AV_LOG_WARNING, "prc %d %d outside limits %d %d\n",
+                                   prcx, prcy, rlevel->num_precincts_x, rlevel->num_precincts_y);
+                            continue;
+                        }
+
+                        for (layno = 0; layno < LYEpoc; layno++) {
+                            if ((ret = jpeg2000_decode_packet(s, tile, tp_index, codsty, rlevel,
+                                                              precno, layno,
+                                                              qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                              qntsty->nguardbits)) < 0)
+                                return ret;
+                        }
+                    }
+                }
+            }
+        }
         break;
 
     default:
         break;
     }
 
+    return ret;
+}
+
+static int jpeg2000_decode_packets(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
+{
+    int ret = AVERROR_BUG;
+    int i;
+    int tp_index = 0;
+
+    s->bit_index = 8;
+    if (tile->poc.nb_poc) {
+        for (i=0; i<tile->poc.nb_poc; i++) {
+            Jpeg2000POCEntry *e = &tile->poc.poc[i];
+            ret = jpeg2000_decode_packets_po_iteration(s, tile,
+                e->RSpoc, e->CSpoc,
+                FFMIN(e->LYEpoc, tile->codsty[0].nlayers),
+                e->REpoc,
+                FFMIN(e->CEpoc, s->ncomponents),
+                e->Ppoc, &tp_index
+                );
+            if (ret < 0)
+                return ret;
+        }
+    } else {
+        ret = jpeg2000_decode_packets_po_iteration(s, tile,
+            0, 0,
+            tile->codsty[0].nlayers,
+            33,
+            s->ncomponents,
+            tile->codsty[0].prog_order,
+            &tp_index
+        );
+    }
     /* EOC marker reached */
     bytestream2_skip(&s->g, 2);
 
@@ -858,7 +1339,7 @@ static int jpeg2000_decode_packets(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
 
 /* TIER-1 routines */
 static void decode_sigpass(Jpeg2000T1Context *t1, int width, int height,
-                           int bpno, int bandno, int bpass_csty_symbol,
+                           int bpno, int bandno,
                            int vert_causal_ctx_csty_symbol)
 {
     int mask = 3 << (bpno - 1), y0, x, y;
@@ -866,29 +1347,29 @@ static void decode_sigpass(Jpeg2000T1Context *t1, int width, int height,
     for (y0 = 0; y0 < height; y0 += 4)
         for (x = 0; x < width; x++)
             for (y = y0; y < height && y < y0 + 4; y++) {
-                if ((t1->flags[y+1][x+1] & JPEG2000_T1_SIG_NB)
-                && !(t1->flags[y+1][x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))) {
-                    int flags_mask = -1;
-                    if (vert_causal_ctx_csty_symbol && y == y0 + 3)
-                        flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE);
-                    if (ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ff_jpeg2000_getsigctxno(t1->flags[y+1][x+1] & flags_mask, bandno))) {
-                        int xorbit, ctxno = ff_jpeg2000_getsgnctxno(t1->flags[y+1][x+1], &xorbit);
-                        if (bpass_csty_symbol)
-                             t1->data[y][x] = ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ctxno) ? -mask : mask;
+                int flags_mask = -1;
+                if (vert_causal_ctx_csty_symbol && y == y0 + 3)
+                    flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE | JPEG2000_T1_SGN_S);
+                if ((t1->flags[(y+1) * t1->stride + x+1] & JPEG2000_T1_SIG_NB & flags_mask)
+                && !(t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))) {
+                    if (ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1] & flags_mask, bandno))) {
+                        int xorbit, ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y+1) * t1->stride + x+1] & flags_mask, &xorbit);
+                        if (t1->mqc.raw)
+                             t1->data[(y) * t1->stride + x] = ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ctxno) ? -mask : mask;
                         else
-                             t1->data[y][x] = (ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ctxno) ^ xorbit) ?
+                             t1->data[(y) * t1->stride + x] = (ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ctxno) ^ xorbit) ?
                                                -mask : mask;
 
                         ff_jpeg2000_set_significance(t1, x, y,
-                                                     t1->data[y][x] < 0);
+                                                     t1->data[(y) * t1->stride + x] < 0);
                     }
-                    t1->flags[y + 1][x + 1] |= JPEG2000_T1_VIS;
+                    t1->flags[(y + 1) * t1->stride + x + 1] |= JPEG2000_T1_VIS;
                 }
             }
 }
 
 static void decode_refpass(Jpeg2000T1Context *t1, int width, int height,
-                           int bpno)
+                           int bpno, int vert_causal_ctx_csty_symbol)
 {
     int phalf, nhalf;
     int y0, x, y;
@@ -899,13 +1380,15 @@ static void decode_refpass(Jpeg2000T1Context *t1, int width, int height,
     for (y0 = 0; y0 < height; y0 += 4)
         for (x = 0; x < width; x++)
             for (y = y0; y < height && y < y0 + 4; y++)
-                if ((t1->flags[y + 1][x + 1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS)) == JPEG2000_T1_SIG) {
-                    int ctxno = ff_jpeg2000_getrefctxno(t1->flags[y + 1][x + 1]);
+                if ((t1->flags[(y + 1) * t1->stride + x + 1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS)) == JPEG2000_T1_SIG) {
+                    int flags_mask = (vert_causal_ctx_csty_symbol && y == y0 + 3) ?
+                        ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE | JPEG2000_T1_SGN_S) : -1;
+                    int ctxno = ff_jpeg2000_getrefctxno(t1->flags[(y + 1) * t1->stride + x + 1] & flags_mask);
                     int r     = ff_mqc_decode(&t1->mqc,
                                               t1->mqc.cx_states + ctxno)
                                 ? phalf : nhalf;
-                    t1->data[y][x]          += t1->data[y][x] < 0 ? -r : r;
-                    t1->flags[y + 1][x + 1] |= JPEG2000_T1_REF;
+                    t1->data[(y) * t1->stride + x]          += t1->data[(y) * t1->stride + x] < 0 ? -r : r;
+                    t1->flags[(y + 1) * t1->stride + x + 1] |= JPEG2000_T1_REF;
                 }
 }
 
@@ -917,11 +1400,14 @@ static void decode_clnpass(Jpeg2000DecoderContext *s, Jpeg2000T1Context *t1,
 
     for (y0 = 0; y0 < height; y0 += 4) {
         for (x = 0; x < width; x++) {
+            int flags_mask = -1;
+            if (vert_causal_ctx_csty_symbol)
+                flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE | JPEG2000_T1_SGN_S);
             if (y0 + 3 < height &&
-                !((t1->flags[y0 + 1][x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
-                  (t1->flags[y0 + 2][x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
-                  (t1->flags[y0 + 3][x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
-                  (t1->flags[y0 + 4][x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)))) {
+                !((t1->flags[(y0 + 1) * t1->stride + x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+                  (t1->flags[(y0 + 2) * t1->stride + x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+                  (t1->flags[(y0 + 3) * t1->stride + x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+                  (t1->flags[(y0 + 4) * t1->stride + x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG) & flags_mask))) {
                 if (!ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + MQC_CX_RL))
                     continue;
                 runlen = ff_mqc_decode(&t1->mqc,
@@ -936,27 +1422,27 @@ static void decode_clnpass(Jpeg2000DecoderContext *s, Jpeg2000T1Context *t1,
             }
 
             for (y = y0 + runlen; y < y0 + 4 && y < height; y++) {
+                int flags_mask = -1;
+                if (vert_causal_ctx_csty_symbol && y == y0 + 3)
+                    flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE | JPEG2000_T1_SGN_S);
                 if (!dec) {
-                    if (!(t1->flags[y+1][x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))) {
-                        int flags_mask = -1;
-                        if (vert_causal_ctx_csty_symbol && y == y0 + 3)
-                            flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE);
-                        dec = ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ff_jpeg2000_getsigctxno(t1->flags[y+1][x+1] & flags_mask,
+                    if (!(t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))) {
+                        dec = ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1] & flags_mask,
                                                                                              bandno));
                     }
                 }
                 if (dec) {
                     int xorbit;
-                    int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[y + 1][x + 1],
+                    int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y + 1) * t1->stride + x + 1] & flags_mask,
                                                         &xorbit);
-                    t1->data[y][x] = (ff_mqc_decode(&t1->mqc,
+                    t1->data[(y) * t1->stride + x] = (ff_mqc_decode(&t1->mqc,
                                                     t1->mqc.cx_states + ctxno) ^
                                       xorbit)
                                      ? -mask : mask;
-                    ff_jpeg2000_set_significance(t1, x, y, t1->data[y][x] < 0);
+                    ff_jpeg2000_set_significance(t1, x, y, t1->data[(y) * t1->stride + x] < 0);
                 }
                 dec = 0;
-                t1->flags[y + 1][x + 1] &= ~JPEG2000_T1_VIS;
+                t1->flags[(y + 1) * t1->stride + x + 1] &= ~JPEG2000_T1_VIS;
             }
         }
     }
@@ -976,52 +1462,73 @@ static int decode_cblk(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *codsty,
                        Jpeg2000T1Context *t1, Jpeg2000Cblk *cblk,
                        int width, int height, int bandpos)
 {
-    int passno = cblk->npasses, pass_t = 2, bpno = cblk->nonzerobits - 1, y;
-    int clnpass_cnt = 0;
-    int bpass_csty_symbol           = codsty->cblk_style & JPEG2000_CBLK_BYPASS;
+    int passno = cblk->npasses, pass_t = 2, bpno = cblk->nonzerobits - 1;
+    int pass_cnt = 0;
     int vert_causal_ctx_csty_symbol = codsty->cblk_style & JPEG2000_CBLK_VSC;
+    int term_cnt = 0;
+    int coder_type;
 
-    for (y = 0; y < height; y++)
-        memset(t1->data[y], 0, width * sizeof(**t1->data));
+    av_assert0(width <= 1024U && height <= 1024U);
+    av_assert0(width*height <= 4096);
+
+    memset(t1->data, 0, t1->stride * height * sizeof(*t1->data));
 
     /* If code-block contains no compressed data: nothing to do. */
     if (!cblk->length)
         return 0;
-    for (y = 0; y < height + 2; y++)
-        memset(t1->flags[y], 0, (width + 2) * sizeof(**t1->flags));
 
-    ff_mqc_initdec(&t1->mqc, cblk->data);
-    cblk->data[cblk->length]     = 0xff;
-    cblk->data[cblk->length + 1] = 0xff;
+    memset(t1->flags, 0, t1->stride * (height + 2) * sizeof(*t1->flags));
+
+    cblk->data[cblk->length] = 0xff;
+    cblk->data[cblk->length+1] = 0xff;
+    ff_mqc_initdec(&t1->mqc, cblk->data, 0, 1);
 
     while (passno--) {
-        switch (pass_t) {
+        switch(pass_t) {
         case 0:
             decode_sigpass(t1, width, height, bpno + 1, bandpos,
-                           bpass_csty_symbol && (clnpass_cnt >= 4),
                            vert_causal_ctx_csty_symbol);
             break;
         case 1:
-            decode_refpass(t1, width, height, bpno + 1);
-            if (bpass_csty_symbol && clnpass_cnt >= 4)
-                ff_mqc_initdec(&t1->mqc, cblk->data);
+            decode_refpass(t1, width, height, bpno + 1, vert_causal_ctx_csty_symbol);
             break;
         case 2:
+            av_assert2(!t1->mqc.raw);
             decode_clnpass(s, t1, width, height, bpno + 1, bandpos,
                            codsty->cblk_style & JPEG2000_CBLK_SEGSYM,
                            vert_causal_ctx_csty_symbol);
-            clnpass_cnt = clnpass_cnt + 1;
-            if (bpass_csty_symbol && clnpass_cnt >= 4)
-                ff_mqc_initdec(&t1->mqc, cblk->data);
             break;
         }
+        if (codsty->cblk_style & JPEG2000_CBLK_RESET) // XXX no testcase for just this
+            ff_mqc_init_contexts(&t1->mqc);
+
+        if (passno && (coder_type = needs_termination(codsty->cblk_style, pass_cnt))) {
+            if (term_cnt >= cblk->nb_terminations) {
+                av_log(s->avctx, AV_LOG_ERROR, "Missing needed termination \n");
+                return AVERROR_INVALIDDATA;
+            }
+            if (FFABS(cblk->data + cblk->data_start[term_cnt + 1] - 2 - t1->mqc.bp) > 0) {
+                av_log(s->avctx, AV_LOG_WARNING, "Mid mismatch %"PTRDIFF_SPECIFIER" in pass %d of %d\n",
+                    cblk->data + cblk->data_start[term_cnt + 1] - 2 - t1->mqc.bp,
+                    pass_cnt, cblk->npasses);
+            }
+
+            ff_mqc_initdec(&t1->mqc, cblk->data + cblk->data_start[++term_cnt], coder_type == 2, 0);
+        }
 
         pass_t++;
         if (pass_t == 3) {
             bpno--;
             pass_t = 0;
         }
+        pass_cnt ++;
     }
+
+    if (cblk->data + cblk->length - 2*(term_cnt < cblk->nb_terminations) != t1->mqc.bp) {
+        av_log(s->avctx, AV_LOG_WARNING, "End mismatch %"PTRDIFF_SPECIFIER"\n",
+               cblk->data + cblk->length - 2*(term_cnt < cblk->nb_terminations) - t1->mqc.bp);
+    }
+
     return 0;
 }
 
@@ -1040,7 +1547,7 @@ static void dequantization_float(int x, int y, Jpeg2000Cblk *cblk,
     int w = cblk->coord[0][1] - cblk->coord[0][0];
     for (j = 0; j < (cblk->coord[1][1] - cblk->coord[1][0]); ++j) {
         float *datap = &comp->f_data[(comp->coord[0][1] - comp->coord[0][0]) * (y + j) + x];
-        int *src = t1->data[j];
+        int *src = t1->data + j*t1->stride;
         for (i = 0; i < w; ++i)
             datap[i] = src[i] * band->f_stepsize;
     }
@@ -1055,9 +1562,29 @@ static void dequantization_int(int x, int y, Jpeg2000Cblk *cblk,
     int w = cblk->coord[0][1] - cblk->coord[0][0];
     for (j = 0; j < (cblk->coord[1][1] - cblk->coord[1][0]); ++j) {
         int32_t *datap = &comp->i_data[(comp->coord[0][1] - comp->coord[0][0]) * (y + j) + x];
-        int *src = t1->data[j];
+        int *src = t1->data + j*t1->stride;
+        if (band->i_stepsize == 32768) {
+            for (i = 0; i < w; ++i)
+                datap[i] = src[i] / 2;
+        } else {
+            // This should be VERY uncommon
+            for (i = 0; i < w; ++i)
+                datap[i] = (src[i] * (int64_t)band->i_stepsize) / 65536;
+        }
+    }
+}
+
+static void dequantization_int_97(int x, int y, Jpeg2000Cblk *cblk,
+                               Jpeg2000Component *comp,
+                               Jpeg2000T1Context *t1, Jpeg2000Band *band)
+{
+    int i, j;
+    int w = cblk->coord[0][1] - cblk->coord[0][0];
+    for (j = 0; j < (cblk->coord[1][1] - cblk->coord[1][0]); ++j) {
+        int32_t *datap = &comp->i_data[(comp->coord[0][1] - comp->coord[0][0]) * (y + j) + x];
+        int *src = t1->data + j*t1->stride;
         for (i = 0; i < w; ++i)
-            datap[i] = (src[i] * band->i_stepsize + (1 << 15)) >> 16;
+            datap[i] = (src[i] * (int64_t)band->i_stepsize + (1<<15)) >> 16;
     }
 }
 
@@ -1066,6 +1593,17 @@ static inline void mct_decode(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
     int i, csize = 1;
     void *src[3];
 
+    for (i = 1; i < 3; i++) {
+        if (tile->codsty[0].transform != tile->codsty[i].transform) {
+            av_log(s->avctx, AV_LOG_ERROR, "Transforms mismatch, MCT not supported\n");
+            return;
+        }
+        if (memcmp(tile->comp[0].coord, tile->comp[i].coord, sizeof(tile->comp[0].coord))) {
+            av_log(s->avctx, AV_LOG_ERROR, "Coords mismatch, MCT not supported\n");
+            return;
+        }
+    }
+
     for (i = 0; i < 3; i++)
         if (tile->codsty[0].transform == FF_DWT97)
             src[i] = tile->comp[i].f_data;
@@ -1085,18 +1623,21 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
     int compno, reslevelno, bandno;
 
     /* Loop on tile components */
-
     for (compno = 0; compno < s->ncomponents; compno++) {
         Jpeg2000Component *comp     = tile->comp + compno;
         Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+
+        t1.stride = (1<<codsty->log2_cblk_width) + 2;
+
         /* Loop on resolution levels */
         for (reslevelno = 0; reslevelno < codsty->nreslevels2decode; reslevelno++) {
             Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
             /* Loop on bands */
             for (bandno = 0; bandno < rlevel->nbands; bandno++) {
-                uint16_t nb_precincts, precno;
+                int nb_precincts, precno;
                 Jpeg2000Band *band = rlevel->band + bandno;
                 int cblkno = 0, bandpos;
+
                 bandpos = bandno + (reslevelno > 0);
 
                 if (band->coord[0][0] == band->coord[0][1] ||
@@ -1119,11 +1660,13 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
                                     cblk->coord[1][1] - cblk->coord[1][0],
                                     bandpos);
 
-                        x = cblk->coord[0][0];
-                        y = cblk->coord[1][0];
+                        x = cblk->coord[0][0] - band->coord[0][0];
+                        y = cblk->coord[1][0] - band->coord[1][0];
 
                         if (codsty->transform == FF_DWT97)
                             dequantization_float(x, y, cblk, comp, &t1, band);
+                        else if (codsty->transform == FF_DWT97_INT)
+                            dequantization_int_97(x, y, cblk, comp, &t1, band);
                         else
                             dequantization_int(x, y, cblk, comp, &t1, band);
                    } /* end cblk */
@@ -1138,9 +1681,12 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
 
 #define WRITE_FRAME(D, PIXEL)                                                                     \
     static inline void write_frame_ ## D(Jpeg2000DecoderContext * s, Jpeg2000Tile * tile,         \
-                                         AVFrame * picture)                                       \
+                                         AVFrame * picture, int precision)                        \
     {                                                                                             \
-        int linesize = picture->linesize[0] / sizeof(PIXEL);                                      \
+        const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(s->avctx->pix_fmt);               \
+        int planar    = !!(pixdesc->flags & AV_PIX_FMT_FLAG_PLANAR);                              \
+        int pixelsize = planar ? 1 : pixdesc->nb_components;                                      \
+                                                                                                  \
         int compno;                                                                               \
         int x, y;                                                                                 \
                                                                                                   \
@@ -1152,35 +1698,39 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
             int32_t *i_datap = comp->i_data;                                                      \
             int cbps         = s->cbps[compno];                                                   \
             int w            = tile->comp[compno].coord[0][1] - s->image_offset_x;                \
+            int plane        = 0;                                                                 \
                                                                                                   \
-            y    = tile->comp[compno].coord[1][0] - s->image_offset_y;                            \
-            line = (PIXEL *)picture->data[0] + y * linesize;                                      \
-            for (; y < tile->comp[compno].coord[1][1] - s->image_offset_y; y += s->cdy[compno]) { \
+            if (planar)                                                                           \
+                plane = s->cdef[compno] ? s->cdef[compno]-1 : (s->ncomponents-1);                 \
+                                                                                                  \
+            y    = tile->comp[compno].coord[1][0] - s->image_offset_y / s->cdy[compno];           \
+            line = (PIXEL *)picture->data[plane] + y * (picture->linesize[plane] / sizeof(PIXEL));\
+            for (; y < tile->comp[compno].coord[1][1] - s->image_offset_y; y++) {                 \
                 PIXEL *dst;                                                                       \
                                                                                                   \
-                x   = tile->comp[compno].coord[0][0] - s->image_offset_x;                         \
-                dst = line + x * s->ncomponents + compno;                                         \
+                x   = tile->comp[compno].coord[0][0] - s->image_offset_x / s->cdx[compno];        \
+                dst = line + x * pixelsize + compno*!planar;                                      \
                                                                                                   \
                 if (codsty->transform == FF_DWT97) {                                              \
-                    for (; x < w; x += s->cdx[compno]) {                                          \
+                    for (; x < w; x++) {                                                          \
                         int val = lrintf(*datap) + (1 << (cbps - 1));                             \
                         /* DC level shift and clip see ISO 15444-1:2002 G.1.2 */                  \
                         val  = av_clip(val, 0, (1 << cbps) - 1);                                  \
-                        *dst = val << (8 * sizeof(PIXEL) - cbps);                                 \
+                        *dst = val << (precision - cbps);                                         \
                         datap++;                                                                  \
-                        dst += s->ncomponents;                                                    \
+                        dst += pixelsize;                                                         \
                     }                                                                             \
                 } else {                                                                          \
-                    for (; x < w; x += s->cdx[compno]) {                                          \
+                    for (; x < w; x++) {                                                          \
                         int val = *i_datap + (1 << (cbps - 1));                                   \
                         /* DC level shift and clip see ISO 15444-1:2002 G.1.2 */                  \
                         val  = av_clip(val, 0, (1 << cbps) - 1);                                  \
-                        *dst = val << (8 * sizeof(PIXEL) - cbps);                                 \
+                        *dst = val << (precision - cbps);                                         \
                         i_datap++;                                                                \
-                        dst += s->ncomponents;                                                    \
+                        dst += pixelsize;                                                         \
                     }                                                                             \
                 }                                                                                 \
-                line += linesize;                                                                 \
+                line += picture->linesize[plane] / sizeof(PIXEL);                                 \
             }                                                                                     \
         }                                                                                         \
                                                                                                   \
@@ -1194,16 +1744,30 @@ WRITE_FRAME(16, uint16_t)
 static int jpeg2000_decode_tile(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile,
                                 AVFrame *picture)
 {
+    int x;
+
     tile_codeblocks(s, tile);
 
     /* inverse MCT transformation */
     if (tile->codsty[0].mct)
         mct_decode(s, tile);
 
+    if (s->cdef[0] < 0) {
+        for (x = 0; x < s->ncomponents; x++)
+            s->cdef[x] = x + 1;
+        if ((s->ncomponents & 1) == 0)
+            s->cdef[s->ncomponents-1] = 0;
+    }
+
     if (s->precision <= 8) {
-        write_frame_8(s, tile, picture);
+        write_frame_8(s, tile, picture, 8);
     } else {
-        write_frame_16(s, tile, picture);
+        int precision = picture->format == AV_PIX_FMT_XYZ12 ||
+                        picture->format == AV_PIX_FMT_RGB48 ||
+                        picture->format == AV_PIX_FMT_RGBA64 ||
+                        picture->format == AV_PIX_FMT_GRAY16 ? 16 : s->precision;
+
+        write_frame_16(s, tile, picture, precision);
     }
 
     return 0;
@@ -1213,15 +1777,20 @@ static void jpeg2000_dec_cleanup(Jpeg2000DecoderContext *s)
 {
     int tileno, compno;
     for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++) {
-        for (compno = 0; compno < s->ncomponents; compno++) {
-            Jpeg2000Component *comp     = s->tile[tileno].comp   + compno;
-            Jpeg2000CodingStyle *codsty = s->tile[tileno].codsty + compno;
+        if (s->tile[tileno].comp) {
+            for (compno = 0; compno < s->ncomponents; compno++) {
+                Jpeg2000Component *comp     = s->tile[tileno].comp   + compno;
+                Jpeg2000CodingStyle *codsty = s->tile[tileno].codsty + compno;
 
-            ff_jpeg2000_cleanup(comp, codsty);
+                ff_jpeg2000_cleanup(comp, codsty);
+            }
+            av_freep(&s->tile[tileno].comp);
         }
-        av_freep(&s->tile[tileno].comp);
     }
     av_freep(&s->tile);
+    memset(s->codsty, 0, sizeof(s->codsty));
+    memset(s->qntsty, 0, sizeof(s->qntsty));
+    memset(&s->poc  , 0, sizeof(s->poc));
     s->numXtiles = s->numYtiles = 0;
 }
 
@@ -1229,6 +1798,7 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
 {
     Jpeg2000CodingStyle *codsty = s->codsty;
     Jpeg2000QuantStyle *qntsty  = s->qntsty;
+    Jpeg2000POC         *poc    = &s->poc;
     uint8_t *properties         = s->properties;
 
     for (;;) {
@@ -1248,17 +1818,21 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
             Jpeg2000Tile *tile;
             Jpeg2000TilePart *tp;
 
-            if (s->curtileno < 0) {
-                av_log(s->avctx, AV_LOG_ERROR, "Missing SOT\n");
-                return AVERROR_INVALIDDATA;
-            }
             if (!s->tile) {
                 av_log(s->avctx, AV_LOG_ERROR, "Missing SIZ\n");
                 return AVERROR_INVALIDDATA;
             }
+            if (s->curtileno < 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "Missing SOT\n");
+                return AVERROR_INVALIDDATA;
+            }
 
             tile = s->tile + s->curtileno;
             tp = tile->tile_part + tile->tp_idx;
+            if (tp->tp_end < s->g.buffer) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid tpend\n");
+                return AVERROR_INVALIDDATA;
+            }
             bytestream2_init(&tp->tpg, s->g.buffer, tp->tp_end - s->g.buffer);
             bytestream2_skip(&s->g, tp->tp_end - s->g.buffer);
 
@@ -1267,13 +1841,17 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
         if (marker == JPEG2000_EOC)
             break;
 
-        len = bytestream2_get_be16u(&s->g);
-        if (len < 2 || bytestream2_get_bytes_left(&s->g) < len - 2)
+        len = bytestream2_get_be16(&s->g);
+        if (len < 2 || bytestream2_get_bytes_left(&s->g) < len - 2) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid len %d left=%d\n", len, bytestream2_get_bytes_left(&s->g));
             return AVERROR_INVALIDDATA;
+        }
 
         switch (marker) {
         case JPEG2000_SIZ:
             ret = get_siz(s);
+            if (!s->tile)
+                s->numXtiles = s->numYtiles = 0;
             break;
         case JPEG2000_COC:
             ret = get_coc(s, codsty, properties);
@@ -1287,15 +1865,18 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
         case JPEG2000_QCD:
             ret = get_qcd(s, len, qntsty, properties);
             break;
+        case JPEG2000_POC:
+            ret = get_poc(s, len, poc);
+            break;
         case JPEG2000_SOT:
             if (!(ret = get_sot(s, len))) {
+                av_assert1(s->curtileno >= 0);
                 codsty = s->tile[s->curtileno].codsty;
                 qntsty = s->tile[s->curtileno].qntsty;
+                poc    = &s->tile[s->curtileno].poc;
                 properties = s->tile[s->curtileno].properties;
             }
             break;
-        case JPEG2000_PLT:
-            // the PLT marker is ignored
         case JPEG2000_PLM:
             // the PLM marker is ignored
         case JPEG2000_COM:
@@ -1306,6 +1887,10 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
             // Tile-part lengths
             ret = get_tlm(s, len);
             break;
+        case JPEG2000_PLT:
+            // Packet length, tile-part header
+            ret = get_plt(s, len);
+            break;
         default:
             av_log(s->avctx, AV_LOG_ERROR,
                    "unsupported marker 0x%.4"PRIX16" at pos 0x%X\n",
@@ -1332,11 +1917,11 @@ static int jpeg2000_read_bitstream_packets(Jpeg2000DecoderContext *s)
     for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++) {
         Jpeg2000Tile *tile = s->tile + tileno;
 
-        if (ret = init_tile(s, tileno))
+        if ((ret = init_tile(s, tileno)) < 0)
             return ret;
 
         s->g = tile->tile_part[0].tpg;
-        if (ret = jpeg2000_decode_packets(s, tile))
+        if ((ret = jpeg2000_decode_packets(s, tile)) < 0)
             return ret;
     }
 
@@ -1345,26 +1930,101 @@ static int jpeg2000_read_bitstream_packets(Jpeg2000DecoderContext *s)
 
 static int jp2_find_codestream(Jpeg2000DecoderContext *s)
 {
-    uint32_t atom_size, atom;
-    int found_codestream = 0, search_range = 10;
+    uint32_t atom_size, atom, atom_end;
+    int search_range = 10;
 
-    while(!found_codestream && search_range
-          &&
-          bytestream2_get_bytes_left(&s->g) >= 8) {
+    while (search_range
+           &&
+           bytestream2_get_bytes_left(&s->g) >= 8) {
         atom_size = bytestream2_get_be32u(&s->g);
         atom      = bytestream2_get_be32u(&s->g);
-        if (atom == JP2_CODESTREAM) {
-            found_codestream = 1;
+        atom_end  = bytestream2_tell(&s->g) + atom_size - 8;
+
+        if (atom == JP2_CODESTREAM)
+            return 1;
+
+        if (bytestream2_get_bytes_left(&s->g) < atom_size || atom_end < atom_size)
+            return 0;
+
+        if (atom == JP2_HEADER &&
+                   atom_size >= 16) {
+            uint32_t atom2_size, atom2, atom2_end;
+            do {
+                atom2_size = bytestream2_get_be32u(&s->g);
+                atom2      = bytestream2_get_be32u(&s->g);
+                atom2_end  = bytestream2_tell(&s->g) + atom2_size - 8;
+                if (atom2_size < 8 || atom2_end > atom_end || atom2_end < atom2_size)
+                    break;
+                if (atom2 == JP2_CODESTREAM) {
+                    return 1;
+                } else if (atom2 == MKBETAG('c','o','l','r') && atom2_size >= 7) {
+                    int method = bytestream2_get_byteu(&s->g);
+                    bytestream2_skipu(&s->g, 2);
+                    if (method == 1) {
+                        s->colour_space = bytestream2_get_be32u(&s->g);
+                    }
+                } else if (atom2 == MKBETAG('p','c','l','r') && atom2_size >= 6) {
+                    int i, size, colour_count, colour_channels, colour_depth[3];
+                    uint32_t r, g, b;
+                    colour_count = bytestream2_get_be16u(&s->g);
+                    colour_channels = bytestream2_get_byteu(&s->g);
+                    // FIXME: Do not ignore channel_sign
+                    colour_depth[0] = (bytestream2_get_byteu(&s->g) & 0x7f) + 1;
+                    colour_depth[1] = (bytestream2_get_byteu(&s->g) & 0x7f) + 1;
+                    colour_depth[2] = (bytestream2_get_byteu(&s->g) & 0x7f) + 1;
+                    size = (colour_depth[0] + 7 >> 3) * colour_count +
+                           (colour_depth[1] + 7 >> 3) * colour_count +
+                           (colour_depth[2] + 7 >> 3) * colour_count;
+                    if (colour_count > 256   ||
+                        colour_channels != 3 ||
+                        colour_depth[0] > 16 ||
+                        colour_depth[1] > 16 ||
+                        colour_depth[2] > 16 ||
+                        atom2_size < size) {
+                        avpriv_request_sample(s->avctx, "Unknown palette");
+                        bytestream2_seek(&s->g, atom2_end, SEEK_SET);
+                        continue;
+                    }
+                    s->pal8 = 1;
+                    for (i = 0; i < colour_count; i++) {
+                        if (colour_depth[0] <= 8) {
+                            r = bytestream2_get_byteu(&s->g) << 8 - colour_depth[0];
+                            r |= r >> colour_depth[0];
+                        } else {
+                            r = bytestream2_get_be16u(&s->g) >> colour_depth[0] - 8;
+                        }
+                        if (colour_depth[1] <= 8) {
+                            g = bytestream2_get_byteu(&s->g) << 8 - colour_depth[1];
+                            r |= r >> colour_depth[1];
+                        } else {
+                            g = bytestream2_get_be16u(&s->g) >> colour_depth[1] - 8;
+                        }
+                        if (colour_depth[2] <= 8) {
+                            b = bytestream2_get_byteu(&s->g) << 8 - colour_depth[2];
+                            r |= r >> colour_depth[2];
+                        } else {
+                            b = bytestream2_get_be16u(&s->g) >> colour_depth[2] - 8;
+                        }
+                        s->palette[i] = 0xffu << 24 | r << 16 | g << 8 | b;
+                    }
+                } else if (atom2 == MKBETAG('c','d','e','f') && atom2_size >= 2) {
+                    int n = bytestream2_get_be16u(&s->g);
+                    for (; n>0; n--) {
+                        int cn   = bytestream2_get_be16(&s->g);
+                        int av_unused typ  = bytestream2_get_be16(&s->g);
+                        int asoc = bytestream2_get_be16(&s->g);
+                        if (cn < 4 && asoc < 4)
+                            s->cdef[cn] = asoc;
+                    }
+                }
+                bytestream2_seek(&s->g, atom2_end, SEEK_SET);
+            } while (atom_end - atom2_end >= 8);
         } else {
-            if (bytestream2_get_bytes_left(&s->g) < atom_size - 8)
-                return 0;
-            bytestream2_skipu(&s->g, atom_size - 8);
             search_range--;
         }
+        bytestream2_seek(&s->g, atom_end, SEEK_SET);
     }
 
-    if (found_codestream)
-        return 1;
     return 0;
 }
 
@@ -1387,7 +2047,8 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, void *data,
 
     s->avctx     = avctx;
     bytestream2_init(&s->g, avpkt->data, avpkt->size);
-    s->curtileno = 0; // TODO: only one tile in DCI JP2K. to implement for more tiles
+    s->curtileno = -1;
+    memset(s->cdef, -1, sizeof(s->cdef));
 
     if (bytestream2_get_bytes_left(&s->g) < 2) {
         ret = AVERROR_INVALIDDATA;
@@ -1409,6 +2070,9 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, void *data,
         bytestream2_seek(&s->g, 0, SEEK_SET);
     }
 
+    while (bytestream2_get_bytes_left(&s->g) >= 3 && bytestream2_peek_be16(&s->g) != JPEG2000_SOC)
+        bytestream2_skip(&s->g, 1);
+
     if (bytestream2_get_be16u(&s->g) != JPEG2000_SOC) {
         av_log(avctx, AV_LOG_ERROR, "SOC marker not present\n");
         ret = AVERROR_INVALIDDATA;
@@ -1418,15 +2082,14 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, void *data,
         goto end;
 
     /* get picture buffer */
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "ff_thread_get_buffer() failed.\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         goto end;
-    }
     picture->pict_type = AV_PICTURE_TYPE_I;
     picture->key_frame = 1;
 
     if (ret = jpeg2000_read_bitstream_packets(s))
         goto end;
+
     for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++)
         if (ret = jpeg2000_decode_tile(s, s->tile + tileno, picture))
             goto end;
@@ -1435,6 +2098,9 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame = 1;
 
+    if (s->avctx->pix_fmt == AV_PIX_FMT_PAL8)
+        memcpy(picture->data[1], s->palette, 256 * sizeof(uint32_t));
+
     return bytestream2_tell(&s->g);
 
 end:
@@ -1466,7 +2132,7 @@ static const AVProfile profiles[] = {
     { FF_PROFILE_UNKNOWN },
 };
 
-static const AVClass class = {
+static const AVClass jpeg2000_class = {
     .class_name = "jpeg2000",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -1483,6 +2149,7 @@ AVCodec ff_jpeg2000_decoder = {
     .init_static_data = jpeg2000_init_static_data,
     .init             = jpeg2000_decode_init,
     .decode           = jpeg2000_decode_frame,
-    .priv_class       = &class,
+    .priv_class       = &jpeg2000_class,
+    .max_lowres       = 5,
     .profiles         = NULL_IF_CONFIG_SMALL(profiles)
 };
diff --git a/libavcodec/jpeg2000dsp.c b/libavcodec/jpeg2000dsp.c
index 6e04c3a37d..d183cbb87d 100644
--- a/libavcodec/jpeg2000dsp.c
+++ b/libavcodec/jpeg2000dsp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -95,4 +95,7 @@ av_cold void ff_jpeg2000dsp_init(Jpeg2000DSPContext *c)
     c->mct_decode[FF_DWT97]     = ict_float;
     c->mct_decode[FF_DWT53]     = rct_int;
     c->mct_decode[FF_DWT97_INT] = ict_int;
+
+    if (ARCH_X86)
+        ff_jpeg2000dsp_init_x86(c);
 }
diff --git a/libavcodec/jpeg2000dsp.h b/libavcodec/jpeg2000dsp.h
index 45a32c0dd8..1ae5b95d9a 100644
--- a/libavcodec/jpeg2000dsp.h
+++ b/libavcodec/jpeg2000dsp.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,5 +31,6 @@ typedef struct Jpeg2000DSPContext {
 } Jpeg2000DSPContext;
 
 void ff_jpeg2000dsp_init(Jpeg2000DSPContext *c);
+void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c);
 
 #endif /* AVCODEC_JPEG2000DSP_H */
diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
index 6642a53ac8..28ac6c429e 100644
--- a/libavcodec/jpeg2000dwt.c
+++ b/libavcodec/jpeg2000dwt.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  * Discrete wavelet transform
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/mem.h"
 #include "jpeg2000dwt.h"
@@ -36,21 +37,16 @@
 #define F_LFTG_BETA   0.052980118572961f
 #define F_LFTG_GAMMA  0.882911075530934f
 #define F_LFTG_DELTA  0.443506852043971f
-#define F_LFTG_K      1.230174104914001f
-#define F_LFTG_X      1.625732422f
-/* FIXME: Why use 1.625732422 instead of 1/F_LFTG_K?
- * Incorrect value in JPEG2000 norm.
- * see (ISO/IEC 15444:1 (version 2002) F.3.8.2 */
 
 /* Lifting parameters in integer format.
  * Computed as param = (float param) * (1 << 16) */
-#define I_LFTG_ALPHA  103949
-#define I_LFTG_BETA     3472
-#define I_LFTG_GAMMA   57862
-#define I_LFTG_DELTA   29066
-#define I_LFTG_K       80621
-#define I_LFTG_X      106544
-
+#define I_LFTG_ALPHA  103949ll
+#define I_LFTG_BETA     3472ll
+#define I_LFTG_GAMMA   57862ll
+#define I_LFTG_DELTA   29066ll
+#define I_LFTG_K       80621ll
+#define I_LFTG_X       53274ll
+#define I_PRESHIFT 8
 
 static inline void extend53(int *p, int i0, int i1)
 {
@@ -80,18 +76,250 @@ static inline void extend97_int(int32_t *p, int i0, int i1)
     }
 }
 
+static void sd_1d53(int *p, int i0, int i1)
+{
+    int i;
+
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] <<= 1;
+        return;
+    }
+
+    extend53(p, i0, i1);
+
+    for (i = ((i0+1)>>1) - 1; i < (i1+1)>>1; i++)
+        p[2*i+1] -= (p[2*i] + p[2*i+2]) >> 1;
+    for (i = ((i0+1)>>1); i < (i1+1)>>1; i++)
+        p[2*i] += (p[2*i-1] + p[2*i+1] + 2) >> 2;
+}
+
+static void dwt_encode53(DWTContext *s, int *t)
+{
+    int lev,
+        w = s->linelen[s->ndeclevels-1][0];
+    int *line = s->i_linebuf;
+    line += 3;
+
+    for (lev = s->ndeclevels-1; lev >= 0; lev--){
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1],
+            lp;
+        int *l;
+
+        // VER_SD
+        l = line + mv;
+        for (lp = 0; lp < lh; lp++) {
+            int i, j = 0;
+
+            for (i = 0; i < lv; i++)
+                l[i] = t[w*i + lp];
+
+            sd_1d53(line, mv, mv + lv);
+
+            // copy back and deinterleave
+            for (i =   mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+            for (i = 1-mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+        }
+
+        // HOR_SD
+        l = line + mh;
+        for (lp = 0; lp < lv; lp++){
+            int i, j = 0;
+
+            for (i = 0; i < lh; i++)
+                l[i] = t[w*lp + i];
+
+            sd_1d53(line, mh, mh + lh);
+
+            // copy back and deinterleave
+            for (i =   mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+            for (i = 1-mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+        }
+    }
+}
+static void sd_1d97_float(float *p, int i0, int i1)
+{
+    int i;
+
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] *= F_LFTG_X * 2;
+        else
+            p[0] *= F_LFTG_K;
+        return;
+    }
+
+    extend97_float(p, i0, i1);
+    i0++; i1++;
+
+    for (i = (i0>>1) - 2; i < (i1>>1) + 1; i++)
+        p[2*i+1] -= 1.586134 * (p[2*i] + p[2*i+2]);
+    for (i = (i0>>1) - 1; i < (i1>>1) + 1; i++)
+        p[2*i] -= 0.052980 * (p[2*i-1] + p[2*i+1]);
+    for (i = (i0>>1) - 1; i < (i1>>1); i++)
+        p[2*i+1] += 0.882911 * (p[2*i] + p[2*i+2]);
+    for (i = (i0>>1); i < (i1>>1); i++)
+        p[2*i] += 0.443506 * (p[2*i-1] + p[2*i+1]);
+}
+
+static void dwt_encode97_float(DWTContext *s, float *t)
+{
+    int lev,
+        w = s->linelen[s->ndeclevels-1][0];
+    float *line = s->f_linebuf;
+    line += 5;
+
+    for (lev = s->ndeclevels-1; lev >= 0; lev--){
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1],
+            lp;
+        float *l;
+
+        // HOR_SD
+        l = line + mh;
+        for (lp = 0; lp < lv; lp++){
+            int i, j = 0;
+
+            for (i = 0; i < lh; i++)
+                l[i] = t[w*lp + i];
+
+            sd_1d97_float(line, mh, mh + lh);
+
+            // copy back and deinterleave
+            for (i =   mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+            for (i = 1-mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+        }
+
+        // VER_SD
+        l = line + mv;
+        for (lp = 0; lp < lh; lp++) {
+            int i, j = 0;
+
+            for (i = 0; i < lv; i++)
+                l[i] = t[w*i + lp];
+
+            sd_1d97_float(line, mv, mv + lv);
+
+            // copy back and deinterleave
+            for (i =   mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+            for (i = 1-mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+        }
+    }
+}
+
+static void sd_1d97_int(int *p, int i0, int i1)
+{
+    int i;
+
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] = (p[1] * I_LFTG_X + (1<<14)) >> 15;
+        else
+            p[0] = (p[0] * I_LFTG_K + (1<<15)) >> 16;
+        return;
+    }
+
+    extend97_int(p, i0, i1);
+    i0++; i1++;
+
+    for (i = (i0>>1) - 2; i < (i1>>1) + 1; i++)
+        p[2 * i + 1] -= (I_LFTG_ALPHA * (p[2 * i]     + p[2 * i + 2]) + (1 << 15)) >> 16;
+    for (i = (i0>>1) - 1; i < (i1>>1) + 1; i++)
+        p[2 * i]     -= (I_LFTG_BETA  * (p[2 * i - 1] + p[2 * i + 1]) + (1 << 15)) >> 16;
+    for (i = (i0>>1) - 1; i < (i1>>1); i++)
+        p[2 * i + 1] += (I_LFTG_GAMMA * (p[2 * i]     + p[2 * i + 2]) + (1 << 15)) >> 16;
+    for (i = (i0>>1); i < (i1>>1); i++)
+        p[2 * i]     += (I_LFTG_DELTA * (p[2 * i - 1] + p[2 * i + 1]) + (1 << 15)) >> 16;
+}
+
+static void dwt_encode97_int(DWTContext *s, int *t)
+{
+    int lev;
+    int w = s->linelen[s->ndeclevels-1][0];
+    int h = s->linelen[s->ndeclevels-1][1];
+    int i;
+    int *line = s->i_linebuf;
+    line += 5;
+
+    for (i = 0; i < w * h; i++)
+        t[i] <<= I_PRESHIFT;
+
+    for (lev = s->ndeclevels-1; lev >= 0; lev--){
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1],
+            lp;
+        int *l;
+
+        // VER_SD
+        l = line + mv;
+        for (lp = 0; lp < lh; lp++) {
+            int i, j = 0;
+
+            for (i = 0; i < lv; i++)
+                l[i] = t[w*i + lp];
+
+            sd_1d97_int(line, mv, mv + lv);
+
+            // copy back and deinterleave
+            for (i =   mv; i < lv; i+=2, j++)
+                t[w*j + lp] = ((l[i] * I_LFTG_X) + (1 << 15)) >> 16;
+            for (i = 1-mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+        }
+
+        // HOR_SD
+        l = line + mh;
+        for (lp = 0; lp < lv; lp++){
+            int i, j = 0;
+
+            for (i = 0; i < lh; i++)
+                l[i] = t[w*lp + i];
+
+            sd_1d97_int(line, mh, mh + lh);
+
+            // copy back and deinterleave
+            for (i =   mh; i < lh; i+=2, j++)
+                t[w*lp + j] = ((l[i] * I_LFTG_X) + (1 << 15)) >> 16;
+            for (i = 1-mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+        }
+
+    }
+
+    for (i = 0; i < w * h; i++)
+        t[i] = (t[i] + ((1<<I_PRESHIFT)>>1)) >> I_PRESHIFT;
+}
+
 static void sr_1d53(int *p, int i0, int i1)
 {
     int i;
 
-    if (i1 == i0 + 1)
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] >>= 1;
         return;
+    }
 
     extend53(p, i0, i1);
 
-    for (i = i0 / 2; i < i1 / 2 + 1; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1) + 1; i++)
         p[2 * i] -= (p[2 * i - 1] + p[2 * i + 1] + 2) >> 2;
-    for (i = i0 / 2; i < i1 / 2; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1); i++)
         p[2 * i + 1] += (p[2 * i] + p[2 * i + 2]) >> 1;
 }
 
@@ -148,21 +376,26 @@ static void sr_1d97_float(float *p, int i0, int i1)
 {
     int i;
 
-    if (i1 == i0 + 1)
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] *= F_LFTG_K/2;
+        else
+            p[0] *= F_LFTG_X;
         return;
+    }
 
     extend97_float(p, i0, i1);
 
-    for (i = i0 / 2 - 1; i < i1 / 2 + 2; i++)
+    for (i = (i0 >> 1) - 1; i < (i1 >> 1) + 2; i++)
         p[2 * i]     -= F_LFTG_DELTA * (p[2 * i - 1] + p[2 * i + 1]);
     /* step 4 */
-    for (i = i0 / 2 - 1; i < i1 / 2 + 1; i++)
+    for (i = (i0 >> 1) - 1; i < (i1 >> 1) + 1; i++)
         p[2 * i + 1] -= F_LFTG_GAMMA * (p[2 * i]     + p[2 * i + 2]);
     /*step 5*/
-    for (i = i0 / 2; i < i1 / 2 + 1; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1) + 1; i++)
         p[2 * i]     += F_LFTG_BETA  * (p[2 * i - 1] + p[2 * i + 1]);
     /* step 6 */
-    for (i = i0 / 2; i < i1 / 2; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1); i++)
         p[2 * i + 1] += F_LFTG_ALPHA * (p[2 * i]     + p[2 * i + 2]);
 }
 
@@ -188,9 +421,9 @@ static void dwt_decode97_float(DWTContext *s, float *t)
             int i, j = 0;
             // copy with interleaving
             for (i = mh; i < lh; i += 2, j++)
-                l[i] = data[w * lp + j] * F_LFTG_K;
+                l[i] = data[w * lp + j];
             for (i = 1 - mh; i < lh; i += 2, j++)
-                l[i] = data[w * lp + j] * F_LFTG_X;
+                l[i] = data[w * lp + j];
 
             sr_1d97_float(line, mh, mh + lh);
 
@@ -204,9 +437,9 @@ static void dwt_decode97_float(DWTContext *s, float *t)
             int i, j = 0;
             // copy with interleaving
             for (i = mv; i < lv; i += 2, j++)
-                l[i] = data[w * j + lp] * F_LFTG_K;
+                l[i] = data[w * j + lp];
             for (i = 1 - mv; i < lv; i += 2, j++)
-                l[i] = data[w * j + lp] * F_LFTG_X;
+                l[i] = data[w * j + lp];
 
             sr_1d97_float(line, mv, mv + lv);
 
@@ -220,21 +453,26 @@ static void sr_1d97_int(int32_t *p, int i0, int i1)
 {
     int i;
 
-    if (i1 == i0 + 1)
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] = (p[1] * I_LFTG_K + (1<<16)) >> 17;
+        else
+            p[0] = (p[0] * I_LFTG_X + (1<<15)) >> 16;
         return;
+    }
 
     extend97_int(p, i0, i1);
 
-    for (i = i0 / 2 - 1; i < i1 / 2 + 2; i++)
+    for (i = (i0 >> 1) - 1; i < (i1 >> 1) + 2; i++)
         p[2 * i]     -= (I_LFTG_DELTA * (p[2 * i - 1] + p[2 * i + 1]) + (1 << 15)) >> 16;
     /* step 4 */
-    for (i = i0 / 2 - 1; i < i1 / 2 + 1; i++)
+    for (i = (i0 >> 1) - 1; i < (i1 >> 1) + 1; i++)
         p[2 * i + 1] -= (I_LFTG_GAMMA * (p[2 * i]     + p[2 * i + 2]) + (1 << 15)) >> 16;
     /*step 5*/
-    for (i = i0 / 2; i < i1 / 2 + 1; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1) + 1; i++)
         p[2 * i]     += (I_LFTG_BETA  * (p[2 * i - 1] + p[2 * i + 1]) + (1 << 15)) >> 16;
     /* step 6 */
-    for (i = i0 / 2; i < i1 / 2; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1); i++)
         p[2 * i + 1] += (I_LFTG_ALPHA * (p[2 * i]     + p[2 * i + 2]) + (1 << 15)) >> 16;
 }
 
@@ -242,11 +480,16 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
 {
     int lev;
     int w       = s->linelen[s->ndeclevels - 1][0];
+    int h       = s->linelen[s->ndeclevels - 1][1];
+    int i;
     int32_t *line = s->i_linebuf;
     int32_t *data = t;
     /* position at index O of line range [0-5,w+5] cf. extend function */
     line += 5;
 
+    for (i = 0; i < w * h; i++)
+        data[i] <<= I_PRESHIFT;
+
     for (lev = 0; lev < s->ndeclevels; lev++) {
         int lh = s->linelen[lev][0],
             lv = s->linelen[lev][1],
@@ -262,7 +505,7 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
             for (i = mh; i < lh; i += 2, j++)
                 l[i] = ((data[w * lp + j] * I_LFTG_K) + (1 << 15)) >> 16;
             for (i = 1 - mh; i < lh; i += 2, j++)
-                l[i] = ((data[w * lp + j] * I_LFTG_X) + (1 << 15)) >> 16;
+                l[i] = data[w * lp + j];
 
             sr_1d97_int(line, mh, mh + lh);
 
@@ -278,7 +521,7 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
             for (i = mv; i < lv; i += 2, j++)
                 l[i] = ((data[w * j + lp] * I_LFTG_K) + (1 << 15)) >> 16;
             for (i = 1 - mv; i < lv; i += 2, j++)
-                l[i] = ((data[w * j + lp] * I_LFTG_X) + (1 << 15)) >> 16;
+                l[i] = data[w * j + lp];
 
             sr_1d97_int(line, mv, mv + lv);
 
@@ -286,6 +529,9 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
                 data[w * i + lp] = l[i];
         }
     }
+
+    for (i = 0; i < w * h; i++)
+        data[i] = (data[i] + ((1<<I_PRESHIFT)>>1)) >> I_PRESHIFT;
 }
 
 int ff_jpeg2000_dwt_init(DWTContext *s, uint16_t border[2][2],
@@ -312,17 +558,17 @@ int ff_jpeg2000_dwt_init(DWTContext *s, uint16_t border[2][2],
         }
     switch (type) {
     case FF_DWT97:
-        s->f_linebuf = av_malloc((maxlen + 12) * sizeof(*s->f_linebuf));
+        s->f_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->f_linebuf));
         if (!s->f_linebuf)
             return AVERROR(ENOMEM);
         break;
      case FF_DWT97_INT:
-        s->i_linebuf = av_malloc((maxlen + 12) * sizeof(*s->i_linebuf));
+        s->i_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->i_linebuf));
         if (!s->i_linebuf)
             return AVERROR(ENOMEM);
         break;
     case FF_DWT53:
-        s->i_linebuf = av_malloc((maxlen +  6) * sizeof(*s->i_linebuf));
+        s->i_linebuf = av_malloc_array((maxlen +  6), sizeof(*s->i_linebuf));
         if (!s->i_linebuf)
             return AVERROR(ENOMEM);
         break;
@@ -332,6 +578,21 @@ int ff_jpeg2000_dwt_init(DWTContext *s, uint16_t border[2][2],
     return 0;
 }
 
+int ff_dwt_encode(DWTContext *s, void *t)
+{
+    switch(s->type){
+        case FF_DWT97:
+            dwt_encode97_float(s, t); break;
+        case FF_DWT97_INT:
+            dwt_encode97_int(s, t); break;
+        case FF_DWT53:
+            dwt_encode53(s, t); break;
+        default:
+            return -1;
+    }
+    return 0;
+}
+
 int ff_dwt_decode(DWTContext *s, void *t)
 {
     switch (s->type) {
@@ -355,3 +616,125 @@ void ff_dwt_destroy(DWTContext *s)
     av_freep(&s->f_linebuf);
     av_freep(&s->i_linebuf);
 }
+
+#ifdef TEST
+
+#include "libavutil/lfg.h"
+
+#define MAX_W 256
+
+static int test_dwt(int *array, int *ref, uint16_t border[2][2], int decomp_levels, int type, int max_diff) {
+    int ret, j;
+    DWTContext s1={{{0}}}, *s= &s1;
+    int64_t err2 = 0;
+
+    ret = ff_jpeg2000_dwt_init(s,  border, decomp_levels, type);
+    if (ret < 0) {
+        fprintf(stderr, "ff_jpeg2000_dwt_init failed\n");
+        return 1;
+    }
+    ret = ff_dwt_encode(s, array);
+    if (ret < 0) {
+        fprintf(stderr, "ff_dwt_encode failed\n");
+        return 1;
+    }
+    ret = ff_dwt_decode(s, array);
+    if (ret < 0) {
+        fprintf(stderr, "ff_dwt_encode failed\n");
+        return 1;
+    }
+    for (j = 0; j<MAX_W * MAX_W; j++) {
+        if (FFABS(array[j] - ref[j]) > max_diff) {
+            fprintf(stderr, "missmatch at %d (%d != %d) decomp:%d border %d %d %d %d\n",
+                    j, array[j], ref[j],decomp_levels, border[0][0], border[0][1], border[1][0], border[1][1]);
+            return 2;
+        }
+        err2 += (array[j] - ref[j]) * (array[j] - ref[j]);
+        array[j] = ref[j];
+    }
+    ff_dwt_destroy(s);
+
+    printf("%s, decomp:%2d border %3d %3d %3d %3d milli-err2:%9"PRId64"\n",
+           type == FF_DWT53 ? "5/3i" : "9/7i",
+           decomp_levels, border[0][0], border[0][1], border[1][0], border[1][1],
+           1000*err2 / ((border[0][1] - border[0][0])*(border[1][1] - border[1][0])));
+
+    return 0;
+}
+
+static int test_dwtf(float *array, float *ref, uint16_t border[2][2], int decomp_levels, float max_diff) {
+    int ret, j;
+    DWTContext s1={{{0}}}, *s= &s1;
+    double err2 = 0;
+
+    ret = ff_jpeg2000_dwt_init(s,  border, decomp_levels, FF_DWT97);
+    if (ret < 0) {
+        fprintf(stderr, "ff_jpeg2000_dwt_init failed\n");
+        return 1;
+    }
+    ret = ff_dwt_encode(s, array);
+    if (ret < 0) {
+        fprintf(stderr, "ff_dwt_encode failed\n");
+        return 1;
+    }
+    ret = ff_dwt_decode(s, array);
+    if (ret < 0) {
+        fprintf(stderr, "ff_dwt_encode failed\n");
+        return 1;
+    }
+    for (j = 0; j<MAX_W * MAX_W; j++) {
+        if (FFABS(array[j] - ref[j]) > max_diff) {
+            fprintf(stderr, "missmatch at %d (%f != %f) decomp:%d border %d %d %d %d\n",
+                    j, array[j], ref[j],decomp_levels, border[0][0], border[0][1], border[1][0], border[1][1]);
+            return 2;
+        }
+        err2 += (array[j] - ref[j]) * (array[j] - ref[j]);
+        array[j] = ref[j];
+    }
+    ff_dwt_destroy(s);
+
+    printf("9/7f, decomp:%2d border %3d %3d %3d %3d err2:%20.3f\n",
+           decomp_levels, border[0][0], border[0][1], border[1][0], border[1][1],
+           err2 / ((border[0][1] - border[0][0])*(border[1][1] - border[1][0])));
+
+    return 0;
+}
+
+static int array[MAX_W * MAX_W];
+static int ref  [MAX_W * MAX_W];
+static float arrayf[MAX_W * MAX_W];
+static float reff  [MAX_W * MAX_W];
+
+int main(void) {
+    AVLFG prng;
+    int i,j;
+    uint16_t border[2][2];
+    int ret, decomp_levels;
+
+    av_lfg_init(&prng, 1);
+
+    for (i = 0; i<MAX_W * MAX_W; i++)
+        arrayf[i] = reff[i] = array[i] = ref[i] =  av_lfg_get(&prng) % 2048;
+
+    for (i = 0; i < 100; i++) {
+        for (j=0; j<4; j++)
+            border[j>>1][j&1] = av_lfg_get(&prng) % MAX_W;
+        if (border[0][0] >= border[0][1] || border[1][0] >= border[1][1])
+            continue;
+        decomp_levels = av_lfg_get(&prng) % FF_DWT_MAX_DECLVLS;
+
+        ret = test_dwt(array, ref, border, decomp_levels, FF_DWT53, 0);
+        if (ret)
+            return ret;
+        ret = test_dwt(array, ref, border, decomp_levels, FF_DWT97_INT, FFMIN(7+5*decomp_levels, 15+3*decomp_levels));
+        if (ret)
+            return ret;
+        ret = test_dwtf(arrayf, reff, border, decomp_levels, 0.05);
+        if (ret)
+            return ret;
+    }
+
+    return 0;
+}
+
+#endif
diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h
index f08340dbc9..57297d4127 100644
--- a/libavcodec/jpeg2000dwt.h
+++ b/libavcodec/jpeg2000dwt.h
@@ -2,20 +2,20 @@
  * Discrete wavelet transform
  * Copyright (c) 2007 Kamil Nowosad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,8 @@
 #include <stdint.h>
 
 #define FF_DWT_MAX_DECLVLS 32 ///< max number of decomposition levels
+#define F_LFTG_K      1.230174104914001f
+#define F_LFTG_X      0.812893066115961f
 
 enum DWTType {
     FF_DWT97,
@@ -58,6 +60,7 @@ typedef struct DWTContext {
 int ff_jpeg2000_dwt_init(DWTContext *s, uint16_t border[2][2],
                          int decomp_levels, int type);
 
+int ff_dwt_encode(DWTContext *s, void *t);
 int ff_dwt_decode(DWTContext *s, void *t);
 
 void ff_dwt_destroy(DWTContext *s);
diff --git a/libavcodec/jpegls.c b/libavcodec/jpegls.c
index 19d461fa09..7f9fa8d60c 100644
--- a/libavcodec/jpegls.c
+++ b/libavcodec/jpegls.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,10 +39,8 @@ void ff_jpegls_init_state(JLSState *state)
     for (state->qbpp = 0; (1 << state->qbpp) < state->range; state->qbpp++)
         ;
 
-    if (state->bpp < 8)
-        state->limit = 2 * state->bpp - state->qbpp + 16;
-    else
-        state->limit = 4 * state->bpp - state->qbpp;
+    state->bpp   = FFMAX(av_log2(state->maxval) + 1, 2);
+    state->limit = 2*(state->bpp + FFMAX(state->bpp, 8)) - state->qbpp;
 
     for (i = 0; i < 367; i++) {
         state->A[i] = FFMAX(state->range + 32 >> 6, 2);
diff --git a/libavcodec/jpegls.h b/libavcodec/jpegls.h
index eae3943c30..c8997c7861 100644
--- a/libavcodec/jpegls.h
+++ b/libavcodec/jpegls.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,7 @@
 
 #include "libavutil/common.h"
 #include "avcodec.h"
+#include "internal.h"
 
 typedef struct JpeglsContext {
     AVCodecContext *avctx;
@@ -40,11 +41,9 @@ typedef struct JLSState {
     int A[367], B[367], C[365], N[367];
     int limit, reset, bpp, qbpp, maxval, range;
     int near, twonear;
-    int run_index[3];
+    int run_index[4];
 } JLSState;
 
-extern const uint8_t ff_log2_run[32];
-
 /**
  * Calculate initial JPEG-LS parameters
  */
@@ -98,6 +97,8 @@ static inline void ff_jpegls_downscale_state(JLSState *state, int Q)
 static inline int ff_jpegls_update_state_regular(JLSState *state,
                                                  int Q, int err)
 {
+    if(FFABS(err) > 0xFFFF)
+        return -0x10000;
     state->A[Q] += FFABS(err);
     err         *= state->twonear;
     state->B[Q] += err;
diff --git a/libavcodec/jpeglsdec.c b/libavcodec/jpeglsdec.c
index 9f8ccecec6..68151cbbd8 100644
--- a/libavcodec/jpeglsdec.c
+++ b/libavcodec/jpeglsdec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,7 +40,7 @@
  * (or test broken JPEG-LS decoder) and slow down ordinary decoding a bit.
  *
  * There is no Golomb code with length >= 32 bits possible, so check and
- * avoid situation of 32 zeros, Libav Golomb decoder is painfully slow
+ * avoid situation of 32 zeros, FFmpeg Golomb decoder is painfully slow
  * on this errors.
  */
 //#define JLS_BROKEN
@@ -51,27 +51,87 @@
 int ff_jpegls_decode_lse(MJpegDecodeContext *s)
 {
     int id;
+    int tid, wt, maxtab, i, j;
 
-    skip_bits(&s->gb, 16);  /* length: FIXME: verify field validity */
+    int len = get_bits(&s->gb, 16);
     id = get_bits(&s->gb, 8);
 
     switch (id) {
     case 1:
+        if (len < 13)
+            return AVERROR_INVALIDDATA;
+
         s->maxval = get_bits(&s->gb, 16);
         s->t1     = get_bits(&s->gb, 16);
         s->t2     = get_bits(&s->gb, 16);
         s->t3     = get_bits(&s->gb, 16);
         s->reset  = get_bits(&s->gb, 16);
 
+        if(s->avctx->debug & FF_DEBUG_PICT_INFO) {
+            av_log(s->avctx, AV_LOG_DEBUG, "Coding parameters maxval:%d T1:%d T2:%d T3:%d reset:%d\n",
+                   s->maxval, s->t1, s->t2, s->t3, s->reset);
+        }
+
 //        ff_jpegls_reset_coding_parameters(s, 0);
         //FIXME quant table?
         break;
     case 2:
+        s->palette_index = 0;
     case 3:
-        av_log(s->avctx, AV_LOG_ERROR, "palette not supported\n");
-        return AVERROR(ENOSYS);
+        tid= get_bits(&s->gb, 8);
+        wt = get_bits(&s->gb, 8);
+
+        if (len < 5)
+            return AVERROR_INVALIDDATA;
+
+        if (wt < 1 || wt > MAX_COMPONENTS) {
+            avpriv_request_sample(s->avctx, "wt %d", wt);
+            return AVERROR_PATCHWELCOME;
+        }
+
+        if (!s->maxval)
+            maxtab = 255;
+        else if ((5 + wt*(s->maxval+1)) < 65535)
+            maxtab = s->maxval;
+        else
+            maxtab = 65530/wt - 1;
+
+        if(s->avctx->debug & FF_DEBUG_PICT_INFO) {
+            av_log(s->avctx, AV_LOG_DEBUG, "LSE palette %d tid:%d wt:%d maxtab:%d\n", id, tid, wt, maxtab);
+        }
+        if (maxtab >= 256) {
+            avpriv_request_sample(s->avctx, ">8bit palette");
+            return AVERROR_PATCHWELCOME;
+        }
+        maxtab = FFMIN(maxtab, (len - 5) / wt + s->palette_index);
+
+        if (s->palette_index > maxtab)
+            return AVERROR_INVALIDDATA;
+
+        if ((s->avctx->pix_fmt == AV_PIX_FMT_GRAY8 || s->avctx->pix_fmt == AV_PIX_FMT_PAL8) &&
+            (s->picture_ptr->format == AV_PIX_FMT_GRAY8 || s->picture_ptr->format == AV_PIX_FMT_PAL8)) {
+            uint32_t *pal = (uint32_t *)s->picture_ptr->data[1];
+            int shift = 0;
+
+            if (s->avctx->bits_per_raw_sample > 0 && s->avctx->bits_per_raw_sample < 8) {
+                maxtab = FFMIN(maxtab, (1<<s->avctx->bits_per_raw_sample)-1);
+                shift = 8 - s->avctx->bits_per_raw_sample;
+            }
+
+            s->picture_ptr->format =
+            s->avctx->pix_fmt = AV_PIX_FMT_PAL8;
+            for (i=s->palette_index; i<=maxtab; i++) {
+                uint8_t k = i << shift;
+                pal[k] = 0;
+                for (j=0; j<wt; j++) {
+                    pal[k] |= get_bits(&s->gb, 8) << (8*(wt-j-1));
+                }
+            }
+            s->palette_index = i;
+        }
+        break;
     case 4:
-        av_log(s->avctx, AV_LOG_ERROR, "oversize image not supported\n");
+        avpriv_request_sample(s->avctx, "oversize image");
         return AVERROR(ENOSYS);
     default:
         av_log(s->avctx, AV_LOG_ERROR, "invalid id %d\n", id);
@@ -149,6 +209,8 @@ static inline int ls_get_code_runterm(GetBitContext *gb, JLSState *state,
         ret = ret >> 1;
     }
 
+    if(FFABS(ret) > 0xFFFF)
+        return -0x10000;
     /* update state */
     state->A[Q] += FFABS(ret) - RItype;
     ret         *= state->twonear;
@@ -208,11 +270,20 @@ static inline void ls_decode_line(JLSState *state, MJpegDecodeContext *s,
             r = ff_log2_run[state->run_index[comp]];
             if (r)
                 r = get_bits_long(&s->gb, r);
+            if (x + r * stride > w) {
+                r = (w - x) / stride;
+            }
             for (i = 0; i < r; i++) {
                 W(dst, x, Ra);
                 x += stride;
             }
 
+            if (x >= w) {
+                av_log(NULL, AV_LOG_ERROR, "run overflow\n");
+                av_assert0(x <= w);
+                return;
+            }
+
             /* decode run termination value */
             Rb     = R(last, x);
             RItype = (FFABS(Ra - Rb) <= state->near) ? 1 : 0;
@@ -304,21 +375,23 @@ int ff_jpegls_decode_picture(MJpegDecodeContext *s, int near,
     else
         shift = point_transform + (16 - s->bits);
 
-    ff_dlog(s->avctx,
-            "JPEG-LS params: %ix%i NEAR=%i MV=%i T(%i,%i,%i) "
-            "RESET=%i, LIMIT=%i, qbpp=%i, RANGE=%i\n",
-            s->width, s->height, state->near, state->maxval,
-            state->T1, state->T2, state->T3,
-            state->reset, state->limit, state->qbpp, state->range);
-    ff_dlog(s->avctx, "JPEG params: ILV=%i Pt=%i BPP=%i, scan = %i\n",
-            ilv, point_transform, s->bits, s->cur_scan);
+    if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "JPEG-LS params: %ix%i NEAR=%i MV=%i T(%i,%i,%i) "
+               "RESET=%i, LIMIT=%i, qbpp=%i, RANGE=%i\n",
+                s->width, s->height, state->near, state->maxval,
+                state->T1, state->T2, state->T3,
+                state->reset, state->limit, state->qbpp, state->range);
+        av_log(s->avctx, AV_LOG_DEBUG, "JPEG params: ILV=%i Pt=%i BPP=%i, scan = %i\n",
+                ilv, point_transform, s->bits, s->cur_scan);
+    }
     if (ilv == 0) { /* separate planes */
         if (s->cur_scan > s->nb_components) {
             ret = AVERROR_INVALIDDATA;
             goto end;
         }
-        off    = s->cur_scan - 1;
         stride = (s->nb_components > 1) ? 3 : 1;
+        off    = av_clip(s->cur_scan - 1, 0, stride - 1);
         width  = s->width * stride;
         cur   += off;
         for (i = 0; i < s->height; i++) {
@@ -340,12 +413,13 @@ int ff_jpegls_decode_picture(MJpegDecodeContext *s, int near,
     } else if (ilv == 1) { /* line interleaving */
         int j;
         int Rc[3] = { 0, 0, 0 };
+        stride = (s->nb_components > 1) ? 3 : 1;
         memset(cur, 0, s->picture_ptr->linesize[0]);
-        width = s->width * 3;
+        width = s->width * stride;
         for (i = 0; i < s->height; i++) {
-            for (j = 0; j < 3; j++) {
+            for (j = 0; j < stride; j++) {
                 ls_decode_line(state, s, last + j, cur + j,
-                               Rc[j], width, 3, j, 8);
+                               Rc[j], width, stride, j, 8);
                 Rc[j] = last[j];
 
                 if (s->restart_interval && !--s->restart_count) {
@@ -362,6 +436,53 @@ int ff_jpegls_decode_picture(MJpegDecodeContext *s, int near,
         goto end;
     }
 
+    if (s->xfrm && s->nb_components == 3) {
+        int x, w;
+
+        w = s->width * s->nb_components;
+
+        if (s->bits <= 8) {
+            uint8_t *src = s->picture_ptr->data[0];
+
+            for (i = 0; i < s->height; i++) {
+                switch(s->xfrm) {
+                case 1:
+                    for (x = off; x < w; x += 3) {
+                        src[x  ] += src[x+1] + 128;
+                        src[x+2] += src[x+1] + 128;
+                    }
+                    break;
+                case 2:
+                    for (x = off; x < w; x += 3) {
+                        src[x  ] += src[x+1] + 128;
+                        src[x+2] += ((src[x  ] + src[x+1])>>1) + 128;
+                    }
+                    break;
+                case 3:
+                    for (x = off; x < w; x += 3) {
+                        int g = src[x+0] - ((src[x+2]+src[x+1])>>2) + 64;
+                        src[x+0] = src[x+2] + g + 128;
+                        src[x+2] = src[x+1] + g + 128;
+                        src[x+1] = g;
+                    }
+                    break;
+                case 4:
+                    for (x = off; x < w; x += 3) {
+                        int r    = src[x+0] - ((                       359 * (src[x+2]-128) + 490) >> 8);
+                        int g    = src[x+0] - (( 88 * (src[x+1]-128) - 183 * (src[x+2]-128) +  30) >> 8);
+                        int b    = src[x+0] + ((454 * (src[x+1]-128)                        + 574) >> 8);
+                        src[x+0] = av_clip_uint8(r);
+                        src[x+1] = av_clip_uint8(g);
+                        src[x+2] = av_clip_uint8(b);
+                    }
+                    break;
+                }
+                src += s->picture_ptr->linesize[0];
+            }
+        }else
+            avpriv_report_missing_feature(s->avctx, "16bit xfrm");
+    }
+
     if (shift) { /* we need to do point transform or normalize samples */
         int x, w;
 
diff --git a/libavcodec/jpeglsdec.h b/libavcodec/jpeglsdec.h
index d60a87bdd5..0cafaba7a4 100644
--- a/libavcodec/jpeglsdec.h
+++ b/libavcodec/jpeglsdec.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/jpeglsenc.c b/libavcodec/jpeglsenc.c
index 7fb4bde238..1e1a22bed2 100644
--- a/libavcodec/jpeglsenc.c
+++ b/libavcodec/jpeglsenc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 
 #include "avcodec.h"
 #include "get_bits.h"
+#include "put_bits.h"
 #include "golomb.h"
 #include "internal.h"
 #include "mathops.h"
@@ -257,7 +258,7 @@ static int encode_picture_ls(AVCodecContext *avctx, AVPacket *pkt,
     uint8_t *zero = NULL;
     uint8_t *cur  = NULL;
     uint8_t *last = NULL;
-    JLSState *state;
+    JLSState *state = NULL;
     int i, size, ret;
     int comps;
 
@@ -267,11 +268,9 @@ static int encode_picture_ls(AVCodecContext *avctx, AVPacket *pkt,
     else
         comps = 3;
 
-    if ((ret = ff_alloc_packet(pkt, avctx->width * avctx->height * comps * 4 +
-                               AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width  *avctx->height * comps * 4 +
+                                AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     buf2 = av_malloc(pkt->size);
     if (!buf2)
@@ -317,7 +316,7 @@ static int encode_picture_ls(AVCodecContext *avctx, AVPacket *pkt,
 
     ls_store_lse(state, &pb);
 
-    zero = last = av_mallocz(p->linesize[0]);
+    zero = last = av_mallocz(FFABS(p->linesize[0]));
     if (!zero)
         goto memfail;
 
@@ -439,6 +438,7 @@ AVCodec ff_jpegls_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_JPEGLS,
     .init           = encode_init_ls,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .encode2        = encode_picture_ls,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_BGR24, AV_PIX_FMT_RGB24,
diff --git a/libavcodec/jpegtables.c b/libavcodec/jpegtables.c
index ce2bae2454..cbe5523cb4 100644
--- a/libavcodec/jpegtables.c
+++ b/libavcodec/jpegtables.c
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@
  * The spec says that the values given produce "good" quality, and
  * when divided by 2, "very good" quality.
  */
-const unsigned char std_luminance_quant_tbl[64] = {
+static const unsigned char std_luminance_quant_tbl[64] = {
     16,  11,  10,  16,  24,  40,  51,  61,
     12,  12,  14,  19,  26,  58,  60,  55,
     14,  13,  16,  24,  40,  57,  69,  56,
@@ -48,7 +48,7 @@ const unsigned char std_luminance_quant_tbl[64] = {
     49,  64,  78,  87, 103, 121, 120, 101,
     72,  92,  95,  98, 112, 100, 103,  99
 };
-const unsigned char std_chrominance_quant_tbl[64] = {
+static const unsigned char std_chrominance_quant_tbl[64] = {
     17,  18,  24,  47,  99,  99,  99,  99,
     18,  21,  26,  66,  99,  99,  99,  99,
     24,  26,  56,  99,  99,  99,  99,  99,
diff --git a/libavcodec/jpegtables.h b/libavcodec/jpegtables.h
index 1a909bef7a..6833b4b166 100644
--- a/libavcodec/jpegtables.h
+++ b/libavcodec/jpegtables.h
@@ -1,20 +1,20 @@
 /*
  * JPEG-related tables
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/jrevdct.c b/libavcodec/jrevdct.c
index 8261269b25..55a7392e98 100644
--- a/libavcodec/jrevdct.c
+++ b/libavcodec/jrevdct.c
@@ -251,8 +251,8 @@ void ff_j_rev_dct(DCTBLOCK data)
       /* AC terms all zero */
       if (d0) {
           /* Compute a 32 bit value to assign. */
-          int16_t dcval = (int16_t) (d0 << PASS1_BITS);
-          register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
+          int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS));
+          register int v = (dcval & 0xffff) | ((dcval * (1 << 16)) & 0xffff0000);
 
           idataptr[0] = v;
           idataptr[1] = v;
@@ -274,8 +274,8 @@ void ff_j_rev_dct(DCTBLOCK data)
                     tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
                     tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
 
-                    tmp0 = (d0 + d4) << CONST_BITS;
-                    tmp1 = (d0 - d4) << CONST_BITS;
+                    tmp0 = (d0 + d4) * CONST_SCALE;
+                    tmp1 = (d0 - d4) * CONST_SCALE;
 
                     tmp10 = tmp0 + tmp3;
                     tmp13 = tmp0 - tmp3;
@@ -286,8 +286,8 @@ void ff_j_rev_dct(DCTBLOCK data)
                     tmp2 = MULTIPLY(-d6, FIX_1_306562965);
                     tmp3 = MULTIPLY(d6, FIX_0_541196100);
 
-                    tmp0 = (d0 + d4) << CONST_BITS;
-                    tmp1 = (d0 - d4) << CONST_BITS;
+                    tmp0 = (d0 + d4) * CONST_SCALE;
+                    tmp1 = (d0 - d4) * CONST_SCALE;
 
                     tmp10 = tmp0 + tmp3;
                     tmp13 = tmp0 - tmp3;
@@ -300,8 +300,8 @@ void ff_j_rev_dct(DCTBLOCK data)
                     tmp2 = MULTIPLY(d2, FIX_0_541196100);
                     tmp3 = MULTIPLY(d2, FIX_1_306562965);
 
-                    tmp0 = (d0 + d4) << CONST_BITS;
-                    tmp1 = (d0 - d4) << CONST_BITS;
+                    tmp0 = (d0 + d4) * CONST_SCALE;
+                    tmp1 = (d0 - d4) * CONST_SCALE;
 
                     tmp10 = tmp0 + tmp3;
                     tmp13 = tmp0 - tmp3;
@@ -309,8 +309,8 @@ void ff_j_rev_dct(DCTBLOCK data)
                     tmp12 = tmp1 - tmp2;
             } else {
                     /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
-                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
-                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
+                    tmp10 = tmp13 = (d0 + d4) * CONST_SCALE;
+                    tmp11 = tmp12 = (d0 - d4) * CONST_SCALE;
             }
       }
 
@@ -620,8 +620,8 @@ void ff_j_rev_dct(DCTBLOCK data)
                     tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
                     tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
 
-                    tmp0 = (d0 + d4) << CONST_BITS;
-                    tmp1 = (d0 - d4) << CONST_BITS;
+                    tmp0 = (d0 + d4) * CONST_SCALE;
+                    tmp1 = (d0 - d4) * CONST_SCALE;
 
                     tmp10 = tmp0 + tmp3;
                     tmp13 = tmp0 - tmp3;
@@ -632,8 +632,8 @@ void ff_j_rev_dct(DCTBLOCK data)
                     tmp2 = MULTIPLY(-d6, FIX_1_306562965);
                     tmp3 = MULTIPLY(d6, FIX_0_541196100);
 
-                    tmp0 = (d0 + d4) << CONST_BITS;
-                    tmp1 = (d0 - d4) << CONST_BITS;
+                    tmp0 = (d0 + d4) * CONST_SCALE;
+                    tmp1 = (d0 - d4) * CONST_SCALE;
 
                     tmp10 = tmp0 + tmp3;
                     tmp13 = tmp0 - tmp3;
@@ -646,8 +646,8 @@ void ff_j_rev_dct(DCTBLOCK data)
                     tmp2 = MULTIPLY(d2, FIX_0_541196100);
                     tmp3 = MULTIPLY(d2, FIX_1_306562965);
 
-                    tmp0 = (d0 + d4) << CONST_BITS;
-                    tmp1 = (d0 - d4) << CONST_BITS;
+                    tmp0 = (d0 + d4) * CONST_SCALE;
+                    tmp1 = (d0 - d4) * CONST_SCALE;
 
                     tmp10 = tmp0 + tmp3;
                     tmp13 = tmp0 - tmp3;
@@ -655,8 +655,8 @@ void ff_j_rev_dct(DCTBLOCK data)
                     tmp12 = tmp1 - tmp2;
             } else {
                     /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
-                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
-                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
+                    tmp10 = tmp13 = (d0 + d4) * CONST_SCALE;
+                    tmp11 = tmp12 = (d0 - d4) * CONST_SCALE;
             }
     }
 
@@ -943,6 +943,219 @@ void ff_j_rev_dct(DCTBLOCK data)
   }
 }
 
+#undef DCTSIZE
+#define DCTSIZE 4
+#define DCTSTRIDE 8
+
+void ff_j_rev_dct4(DCTBLOCK data)
+{
+  int32_t tmp0, tmp1, tmp2, tmp3;
+  int32_t tmp10, tmp11, tmp12, tmp13;
+  int32_t z1;
+  int32_t d0, d2, d4, d6;
+  register int16_t *dataptr;
+  int rowctr;
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+  data[0] += 4;
+
+  dataptr = data;
+
+  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+    /* Due to quantization, we will usually find that many of the input
+     * coefficients are zero, especially the AC terms.  We can exploit this
+     * by short-circuiting the IDCT calculation for any row in which all
+     * the AC terms are zero.  In that case each output is equal to the
+     * DC coefficient (with scale factor as needed).
+     * With typical images and quantization tables, half or more of the
+     * row DCT calculations can be simplified this way.
+     */
+
+    register int *idataptr = (int*)dataptr;
+
+    d0 = dataptr[0];
+    d2 = dataptr[1];
+    d4 = dataptr[2];
+    d6 = dataptr[3];
+
+    if ((d2 | d4 | d6) == 0) {
+      /* AC terms all zero */
+      if (d0) {
+          /* Compute a 32 bit value to assign. */
+          int16_t dcval = (int16_t) (d0 << PASS1_BITS);
+          register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
+
+          idataptr[0] = v;
+          idataptr[1] = v;
+      }
+
+      dataptr += DCTSTRIDE;     /* advance pointer to next row */
+      continue;
+    }
+
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+    if (d6) {
+            if (d2) {
+                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
+                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
+                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
+                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
+
+                    tmp0 = (d0 + d4) << CONST_BITS;
+                    tmp1 = (d0 - d4) << CONST_BITS;
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            } else {
+                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
+                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
+                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
+
+                    tmp0 = (d0 + d4) << CONST_BITS;
+                    tmp1 = (d0 - d4) << CONST_BITS;
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            }
+    } else {
+            if (d2) {
+                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
+                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
+                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
+
+                    tmp0 = (d0 + d4) << CONST_BITS;
+                    tmp1 = (d0 - d4) << CONST_BITS;
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            } else {
+                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
+                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
+                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
+            }
+      }
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+    dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+    dataptr[2] = (int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSTRIDE;       /* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling. */
+
+  dataptr = data;
+  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+    /* Columns of zeroes can be exploited in the same way as we did with rows.
+     * However, the row calculation has created many nonzero AC terms, so the
+     * simplification applies less often (typically 5% to 10% of the time).
+     * On machines with very fast multiplication, it's possible that the
+     * test takes more time than it's worth.  In that case this section
+     * may be commented out.
+     */
+
+    d0 = dataptr[DCTSTRIDE*0];
+    d2 = dataptr[DCTSTRIDE*1];
+    d4 = dataptr[DCTSTRIDE*2];
+    d6 = dataptr[DCTSTRIDE*3];
+
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+    if (d6) {
+            if (d2) {
+                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
+                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
+                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
+                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
+
+                    tmp0 = (d0 + d4) << CONST_BITS;
+                    tmp1 = (d0 - d4) << CONST_BITS;
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            } else {
+                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
+                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
+                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
+
+                    tmp0 = (d0 + d4) << CONST_BITS;
+                    tmp1 = (d0 - d4) << CONST_BITS;
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            }
+    } else {
+            if (d2) {
+                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
+                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
+                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
+
+                    tmp0 = (d0 + d4) << CONST_BITS;
+                    tmp1 = (d0 - d4) << CONST_BITS;
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            } else {
+                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
+                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
+                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
+            }
+    }
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3);
+    dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3);
+    dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3);
+    dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3);
+
+    dataptr++;                  /* advance pointer to next column */
+  }
+}
+
+void ff_j_rev_dct2(DCTBLOCK data){
+  int d00, d01, d10, d11;
+
+  data[0] += 4;
+  d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE];
+  d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE];
+  d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE];
+  d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE];
+
+  data[0+0*DCTSTRIDE]= (d00 + d10)>>3;
+  data[1+0*DCTSTRIDE]= (d01 + d11)>>3;
+  data[0+1*DCTSTRIDE]= (d00 - d10)>>3;
+  data[1+1*DCTSTRIDE]= (d01 - d11)>>3;
+}
+
+void ff_j_rev_dct1(DCTBLOCK data){
+  data[0] = (data[0] + 4)>>3;
+}
+
+#undef FIX
+#undef CONST_BITS
+
 void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
 {
     ff_j_rev_dct(block);
diff --git a/libavcodec/jvdec.c b/libavcodec/jvdec.c
index c532b75a25..cbe83d3c10 100644
--- a/libavcodec/jvdec.c
+++ b/libavcodec/jvdec.c
@@ -2,20 +2,20 @@
  * Bitmap Brothers JV video decoder
  * Copyright (c) 2011 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -147,24 +147,28 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
     JvContext *s = avctx->priv_data;
-    int buf_size = avpkt->size;
     const uint8_t *buf = avpkt->data;
-    const uint8_t *buf_end = buf + buf_size;
+    const uint8_t *buf_end = buf + avpkt->size;
     int video_size, video_type, i, j, ret;
 
+    if (avpkt->size < 6)
+        return AVERROR_INVALIDDATA;
+
     video_size = AV_RL32(buf);
     video_type = buf[4];
     buf += 5;
 
     if (video_size) {
-        if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return ret;
+        if (video_size < 0 || video_size > avpkt->size - 5) {
+            av_log(avctx, AV_LOG_ERROR, "video size %d invalid\n", video_size);
+            return AVERROR_INVALIDDATA;
         }
+        if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+            return ret;
 
         if (video_type == 0 || video_type == 1) {
             GetBitContext gb;
-            init_get_bits(&gb, buf, 8 * FFMIN(video_size, buf_end - buf));
+            init_get_bits(&gb, buf, 8 * video_size);
 
             for (j = 0; j < avctx->height; j += 8)
                 for (i = 0; i < avctx->width; i += 8)
@@ -174,12 +178,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
             buf += video_size;
         } else if (video_type == 2) {
-            if (buf + 1 <= buf_end) {
-                int v = *buf++;
-                for (j = 0; j < avctx->height; j++)
-                    memset(s->frame->data[0] + j * s->frame->linesize[0],
-                           v, avctx->width);
-            }
+            int v = *buf++;
+            for (j = 0; j < avctx->height; j++)
+                memset(s->frame->data[0] + j * s->frame->linesize[0],
+                       v, avctx->width);
         } else {
             av_log(avctx, AV_LOG_WARNING,
                    "unsupported frame type %i\n", video_type);
@@ -187,9 +189,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         }
     }
 
-    if (buf < buf_end) {
-        for (i = 0; i < AVPALETTE_COUNT && buf + 3 <= buf_end; i++) {
-            s->palette[i] = AV_RB24(buf) << 2;
+    if (buf_end - buf >= AVPALETTE_COUNT * 3) {
+        for (i = 0; i < AVPALETTE_COUNT; i++) {
+            uint32_t pal = AV_RB24(buf);
+            s->palette[i] = 0xFFU << 24 | pal << 2 | ((pal >> 4) & 0x30303);
             buf += 3;
         }
         s->palette_has_changed = 1;
@@ -207,7 +210,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         *got_frame = 1;
     }
 
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int decode_close(AVCodecContext *avctx)
diff --git a/libavcodec/kbdwin.c b/libavcodec/kbdwin.c
index 1b7313d1dc..bf32aeb317 100644
--- a/libavcodec/kbdwin.c
+++ b/libavcodec/kbdwin.c
@@ -1,22 +1,22 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
+#include "libavutil/avassert.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/attributes.h"
 #include "kbdwin.h"
@@ -30,7 +30,7 @@ av_cold void ff_kbd_window_init(float *window, float alpha, int n)
    double local_window[FF_KBD_WINDOW_MAX];
    double alpha2 = (alpha * M_PI / n) * (alpha * M_PI / n);
 
-   assert(n <= FF_KBD_WINDOW_MAX);
+   av_assert0(n <= FF_KBD_WINDOW_MAX);
 
    for (i = 0; i < n; i++) {
        tmp = i * (n - i) * alpha2;
@@ -45,3 +45,13 @@ av_cold void ff_kbd_window_init(float *window, float alpha, int n)
    for (i = 0; i < n; i++)
        window[i] = sqrt(local_window[i] / sum);
 }
+
+av_cold void ff_kbd_window_init_fixed(int32_t *window, float alpha, int n)
+{
+    int i;
+    float local_window[FF_KBD_WINDOW_MAX];
+
+    ff_kbd_window_init(local_window, alpha, n);
+    for (i = 0; i < n; i++)
+        window[i] = (int)floor(2147483647.0 * local_window[i] + 0.5);
+}
diff --git a/libavcodec/kbdwin.h b/libavcodec/kbdwin.h
index 89b569aa7c..4185c4206f 100644
--- a/libavcodec/kbdwin.h
+++ b/libavcodec/kbdwin.h
@@ -1,24 +1,26 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_KBDWIN_H
 #define AVCODEC_KBDWIN_H
 
+#include <stdint.h>
+
 /**
  * Maximum window size for ff_kbd_window_init.
  */
@@ -31,5 +33,6 @@
  * @param   n       size of half window, max FF_KBD_WINDOW_MAX
  */
 void ff_kbd_window_init(float *window, float alpha, int n);
+void ff_kbd_window_init_fixed(int32_t *window, float alpha, int n);
 
 #endif /* AVCODEC_KBDWIN_H */
diff --git a/libavcodec/kgv1dec.c b/libavcodec/kgv1dec.c
index 0bf322eabc..5359411c76 100644
--- a/libavcodec/kgv1dec.c
+++ b/libavcodec/kgv1dec.c
@@ -2,20 +2,20 @@
  * Kega Game Video (KGV1) decoder
  * Copyright (c) 2010 Daniel Verkamp
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,6 @@
 #include "internal.h"
 
 typedef struct KgvContext {
-    AVCodecContext *avctx;
     uint16_t *frame_buffer;
     uint16_t *last_frame_buffer;
 } KgvContext;
@@ -52,7 +51,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     const uint8_t *buf_end = buf + avpkt->size;
     KgvContext * const c = avctx->priv_data;
     int offsets[8];
-    uint16_t *out, *prev;
+    uint8_t *out, *prev;
     int outcnt = 0, maxcnt;
     int w, h, i, res;
 
@@ -83,22 +82,21 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
-    out  = c->frame_buffer;
-    prev = c->last_frame_buffer;
+    out  = (uint8_t*)c->frame_buffer;
+    prev = (uint8_t*)c->last_frame_buffer;
 
     for (i = 0; i < 8; i++)
         offsets[i] = -1;
 
-    while (outcnt < maxcnt && buf_end - 2 > buf) {
+    while (outcnt < maxcnt && buf_end - 2 >= buf) {
         int code = AV_RL16(buf);
         buf += 2;
 
         if (!(code & 0x8000)) {
-            out[outcnt++] = code; // rgb555 pixel coded directly
+            AV_WN16A(&out[2 * outcnt], code); // rgb555 pixel coded directly
+            outcnt++;
         } else {
             int count;
-            int inp_off;
-            uint16_t *inp;
 
             if ((code & 0x6000) == 0x6000) {
                 // copy from previous frame
@@ -116,7 +114,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
                 start = (outcnt + offsets[oidx]) % maxcnt;
 
-                if (maxcnt - start < count)
+                if (maxcnt - start < count || maxcnt - outcnt < count)
                     break;
 
                 if (!prev) {
@@ -125,8 +123,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                     break;
                 }
 
-                inp = prev;
-                inp_off = start;
+                memcpy(out + 2 * outcnt, prev + 2 * start, 2 * count);
             } else {
                 // copy from earlier in this frame
                 int offset = (code & 0x1FFF) + 1;
@@ -141,19 +138,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                     count = 4 + *buf++;
                 }
 
-                if (outcnt < offset)
+                if (outcnt < offset || maxcnt - outcnt < count)
                     break;
 
-                inp = out;
-                inp_off = outcnt - offset;
-            }
-
-            if (maxcnt - outcnt < count)
-                break;
-
-            for (i = inp_off; i < count + inp_off; i++) {
-                out[outcnt++] = inp[i];
+                av_memcpy_backptr(out + 2 * outcnt, 2 * offset, 2 * count);
             }
+            outcnt += count;
         }
     }
 
@@ -172,9 +162,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
-    KgvContext * const c = avctx->priv_data;
-
-    c->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_RGB555;
 
     return 0;
diff --git a/libavcodec/kmvc.c b/libavcodec/kmvc.c
index ca6b79fa7c..7acaba7d21 100644
--- a/libavcodec/kmvc.c
+++ b/libavcodec/kmvc.c
@@ -2,20 +2,20 @@
  * KMVC decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -107,6 +107,10 @@ static int kmvc_decode_intra_8x8(KmvcContext * ctx, int w, int h)
                             val = bytestream2_get_byte(&ctx->g);
                             mx = val & 0xF;
                             my = val >> 4;
+                            if ((l0x-mx) + 320*(l0y-my) < 0 || (l0x-mx) + 320*(l0y-my) > 320*197 - 4) {
+                                av_log(ctx->avctx, AV_LOG_ERROR, "Invalid MV\n");
+                                return AVERROR_INVALIDDATA;
+                            }
                             for (j = 0; j < 16; j++)
                                 BLK(ctx->cur, l0x + (j & 3), l0y + (j >> 2)) =
                                     BLK(ctx->cur, l0x + (j & 3) - mx, l0y + (j >> 2) - my);
@@ -128,6 +132,10 @@ static int kmvc_decode_intra_8x8(KmvcContext * ctx, int w, int h)
                                     val = bytestream2_get_byte(&ctx->g);
                                     mx = val & 0xF;
                                     my = val >> 4;
+                                    if ((l1x-mx) + 320*(l1y-my) < 0 || (l1x-mx) + 320*(l1y-my) > 320*199 - 2) {
+                                        av_log(ctx->avctx, AV_LOG_ERROR, "Invalid MV\n");
+                                        return AVERROR_INVALIDDATA;
+                                    }
                                     BLK(ctx->cur, l1x, l1y) = BLK(ctx->cur, l1x - mx, l1y - my);
                                     BLK(ctx->cur, l1x + 1, l1y) =
                                         BLK(ctx->cur, l1x + 1 - mx, l1y - my);
@@ -199,6 +207,10 @@ static int kmvc_decode_inter_8x8(KmvcContext * ctx, int w, int h)
                             val = bytestream2_get_byte(&ctx->g);
                             mx = (val & 0xF) - 8;
                             my = (val >> 4) - 8;
+                            if ((l0x+mx) + 320*(l0y+my) < 0 || (l0x+mx) + 320*(l0y+my) > 320*197 - 4) {
+                                av_log(ctx->avctx, AV_LOG_ERROR, "Invalid MV\n");
+                                return AVERROR_INVALIDDATA;
+                            }
                             for (j = 0; j < 16; j++)
                                 BLK(ctx->cur, l0x + (j & 3), l0y + (j >> 2)) =
                                     BLK(ctx->prev, l0x + (j & 3) + mx, l0y + (j >> 2) + my);
@@ -220,6 +232,10 @@ static int kmvc_decode_inter_8x8(KmvcContext * ctx, int w, int h)
                                     val = bytestream2_get_byte(&ctx->g);
                                     mx = (val & 0xF) - 8;
                                     my = (val >> 4) - 8;
+                                    if ((l1x+mx) + 320*(l1y+my) < 0 || (l1x+mx) + 320*(l1y+my) > 320*199 - 2) {
+                                        av_log(ctx->avctx, AV_LOG_ERROR, "Invalid MV\n");
+                                        return AVERROR_INVALIDDATA;
+                                    }
                                     BLK(ctx->cur, l1x, l1y) = BLK(ctx->prev, l1x + mx, l1y + my);
                                     BLK(ctx->cur, l1x + 1, l1y) =
                                         BLK(ctx->prev, l1x + 1 + mx, l1y + my);
@@ -256,10 +272,8 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame,
 
     bytestream2_init(&ctx->g, avpkt->data, avpkt->size);
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     header = bytestream2_get_byte(&ctx->g);
 
@@ -267,7 +281,7 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame,
     if (bytestream2_peek_byte(&ctx->g) == 127) {
         bytestream2_skip(&ctx->g, 3);
         for (i = 0; i < 127; i++) {
-            ctx->pal[i + (header & 0x81)] = bytestream2_get_be24(&ctx->g);
+            ctx->pal[i + (header & 0x81)] = 0xFFU << 24 | bytestream2_get_be24(&ctx->g);
             bytestream2_skip(&ctx->g, 1);
         }
         bytestream2_seek(&ctx->g, -127 * 4 - 3, SEEK_CUR);
@@ -285,7 +299,7 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame,
         frame->palette_has_changed = 1;
         // palette starts from index 1 and has 127 entries
         for (i = 1; i <= ctx->palsize; i++) {
-            ctx->pal[i] = bytestream2_get_be24(&ctx->g);
+            ctx->pal[i] = 0xFFU << 24 | bytestream2_get_be24(&ctx->g);
         }
     }
 
@@ -369,7 +383,7 @@ static av_cold int decode_init(AVCodecContext * avctx)
     c->prev = c->frm1;
 
     for (i = 0; i < 256; i++) {
-        c->pal[i] = i * 0x10101;
+        c->pal[i] = 0xFFU << 24 | i * 0x10101;
     }
 
     if (avctx->extradata_size < 12) {
@@ -378,7 +392,8 @@ static av_cold int decode_init(AVCodecContext * avctx)
         c->palsize = 127;
     } else {
         c->palsize = AV_RL16(avctx->extradata + 10);
-        if (c->palsize >= MAX_PALSIZE) {
+        if (c->palsize >= (unsigned)MAX_PALSIZE) {
+            c->palsize = 127;
             av_log(avctx, AV_LOG_ERROR, "KMVC palette too large\n");
             return AVERROR_INVALIDDATA;
         }
diff --git a/libavcodec/lagarith.c b/libavcodec/lagarith.c
index 98765fd491..94d723d319 100644
--- a/libavcodec/lagarith.c
+++ b/libavcodec/lagarith.c
@@ -2,20 +2,20 @@
  * Lagarith lossless decoder
  * Copyright (c) 2009 Nathan Caldwell <saintdev (at) gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -128,7 +128,7 @@ static int lag_decode_prob(GetBitContext *gb, uint32_t *value)
     }
 
     val  = get_bits_long(gb, bits);
-    val |= 1 << bits;
+    val |= 1U << bits;
 
     *value = val - 1;
 
@@ -160,8 +160,8 @@ static int lag_read_prob_header(lag_rac *rac, GetBitContext *gb)
                 av_log(rac->avctx, AV_LOG_ERROR, "Invalid probability run encountered.\n");
                 return -1;
             }
-            if (prob > 257 - i)
-                prob = 257 - i;
+            if (prob > 256 - i)
+                prob = 256 - i;
             for (j = 0; j < prob; j++)
                 rac->prob[++i] = 0;
         }
@@ -177,7 +177,15 @@ static int lag_read_prob_header(lag_rac *rac, GetBitContext *gb)
 
     if (cumul_prob & (cumul_prob - 1)) {
         uint64_t mul = softfloat_reciprocal(cumul_prob);
-        for (i = 1; i < 257; i++) {
+        for (i = 1; i <= 128; i++) {
+            rac->prob[i] = softfloat_mul(rac->prob[i], mul);
+            scaled_cumul_prob += rac->prob[i];
+        }
+        if (scaled_cumul_prob <= 0) {
+            av_log(rac->avctx, AV_LOG_ERROR, "Scaled probabilities invalid\n");
+            return AVERROR_INVALIDDATA;
+        }
+        for (; i < 257; i++) {
             rac->prob[i] = softfloat_mul(rac->prob[i], mul);
             scaled_cumul_prob += rac->prob[i];
         }
@@ -251,11 +259,8 @@ static void lag_pred_line(LagarithContext *l, uint8_t *buf,
     int L, TL;
 
     if (!line) {
-        int i, align_width = (width - 1) & ~31;
         /* Left prediction only for first line */
-        L = l->hdsp.add_hfyu_left_pred(buf + 1, buf + 1, align_width, buf[0]);
-        for (i = align_width + 1; i < width; i++)
-            buf[i] += buf[i - 1];
+        L = l->hdsp.add_hfyu_left_pred(buf, buf, width, 0);
     } else {
         /* Left pixel is actually prev_row[width] */
         L = buf[width - stride - 1];
@@ -281,18 +286,12 @@ static void lag_pred_line_yuy2(LagarithContext *l, uint8_t *buf,
     int L, TL;
 
     if (!line) {
-        int i, align_width;
-        if (is_luma) {
-            buf++;
-            width--;
-        }
-
-        align_width = (width - 1) & ~31;
-        l->hdsp.add_hfyu_left_pred(buf + 1, buf + 1, align_width, buf[0]);
-
-        for (i = align_width + 1; i < width; i++)
-            buf[i] += buf[i - 1];
-
+        L= buf[0];
+        if (is_luma)
+            buf[0] = 0;
+        l->hdsp.add_hfyu_left_pred(buf, buf, width, 0);
+        if (is_luma)
+            buf[0] = L;
         return;
     }
     if (line == 1) {
@@ -371,6 +370,10 @@ static int lag_decode_zero_run_line(LagarithContext *l, uint8_t *dst,
     uint8_t mask2 = -(esc_count < 3);
     uint8_t *end = dst + (width - 2);
 
+    avpriv_request_sample(l->avctx, "zero_run_line");
+
+    memset(dst, 0, width);
+
 output_zeros:
     if (l->zeros_rem) {
         count = FFMIN(l->zeros_rem, width - i);
@@ -388,7 +391,7 @@ output_zeros:
         i = 0;
         while (!zero_run && dst + i < end) {
             i++;
-            if (src + i >= src_end)
+            if (i+2 >= src_end - src)
                 return AVERROR_INVALIDDATA;
             zero_run =
                 !(src[i] | (src[i + 1] & mask1) | (src[i + 2] & mask2));
@@ -408,7 +411,7 @@ output_zeros:
             dst += i;
         }
     }
-    return src_start - src;
+    return  src - src_start;
 }
 
 
@@ -421,22 +424,30 @@ static int lag_decode_arith_plane(LagarithContext *l, uint8_t *dst,
     int read = 0;
     uint32_t length;
     uint32_t offset = 1;
-    int esc_count = src[0];
+    int esc_count;
     GetBitContext gb;
     lag_rac rac;
     const uint8_t *src_end = src + src_size;
+    int ret;
 
     rac.avctx = l->avctx;
     l->zeros = 0;
 
+    if(src_size < 2)
+        return AVERROR_INVALIDDATA;
+
+    esc_count = src[0];
     if (esc_count < 4) {
         length = width * height;
+        if(src_size < 5)
+            return AVERROR_INVALIDDATA;
         if (esc_count && AV_RL32(src + 1) < length) {
             length = AV_RL32(src + 1);
             offset += 4;
         }
 
-        init_get_bits(&gb, src + offset, src_size * 8);
+        if ((ret = init_get_bits8(&gb, src + offset, src_size - offset)) < 0)
+            return ret;
 
         if (lag_read_prob_header(&rac, &gb) < 0)
             return -1;
@@ -453,6 +464,8 @@ static int lag_decode_arith_plane(LagarithContext *l, uint8_t *dst,
                    length);
     } else if (esc_count < 8) {
         esc_count -= 4;
+        src ++;
+        src_size --;
         if (esc_count > 0) {
             /* Zero run coding only, no range coding. */
             for (i = 0; i < height; i++) {
@@ -513,7 +526,7 @@ static int lag_decode_frame(AVCodecContext *avctx,
                             void *data, int *got_frame, AVPacket *avpkt)
 {
     const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
+    unsigned int buf_size = avpkt->size;
     LagarithContext *l = avctx->priv_data;
     ThreadFrame frame = { .f = data };
     AVFrame *const p  = data;
@@ -522,6 +535,7 @@ static int lag_decode_frame(AVCodecContext *avctx,
     uint32_t offs[4];
     uint8_t *srcs[4], *dst;
     int i, j, planes = 3;
+    int ret;
 
     p->key_frame = 1;
 
@@ -533,18 +547,53 @@ static int lag_decode_frame(AVCodecContext *avctx,
     switch (frametype) {
     case FRAME_SOLID_RGBA:
         avctx->pix_fmt = AV_PIX_FMT_RGB32;
+    case FRAME_SOLID_GRAY:
+        if (frametype == FRAME_SOLID_GRAY)
+            if (avctx->bits_per_coded_sample == 24) {
+                avctx->pix_fmt = AV_PIX_FMT_RGB24;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_0RGB32;
+                planes = 4;
+            }
 
-        if (ff_thread_get_buffer(avctx, &frame, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return -1;
-        }
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+            return ret;
 
         dst = p->data[0];
+        if (frametype == FRAME_SOLID_RGBA) {
         for (j = 0; j < avctx->height; j++) {
             for (i = 0; i < avctx->width; i++)
                 AV_WN32(dst + i * 4, offset_gu);
             dst += p->linesize[0];
         }
+        } else {
+            for (j = 0; j < avctx->height; j++) {
+                memset(dst, buf[1], avctx->width * planes);
+                dst += p->linesize[0];
+            }
+        }
+        break;
+    case FRAME_SOLID_COLOR:
+        if (avctx->bits_per_coded_sample == 24) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        } else {
+            avctx->pix_fmt = AV_PIX_FMT_RGB32;
+            offset_gu |= 0xFFU << 24;
+        }
+
+        if ((ret = ff_thread_get_buffer(avctx, &frame,0)) < 0)
+            return ret;
+
+        dst = p->data[0];
+        for (j = 0; j < avctx->height; j++) {
+            for (i = 0; i < avctx->width; i++)
+                if (avctx->bits_per_coded_sample == 24) {
+                    AV_WB24(dst + i * 3, offset_gu);
+                } else {
+                    AV_WN32(dst + i * 4, offset_gu);
+                }
+            dst += p->linesize[0];
+        }
         break;
     case FRAME_ARITH_RGBA:
         avctx->pix_fmt = AV_PIX_FMT_RGB32;
@@ -556,10 +605,8 @@ static int lag_decode_frame(AVCodecContext *avctx,
         if (frametype == FRAME_ARITH_RGB24 || frametype == FRAME_U_RGB24)
             avctx->pix_fmt = AV_PIX_FMT_RGB24;
 
-        if (ff_thread_get_buffer(avctx, &frame, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return -1;
-        }
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+            return ret;
 
         offs[0] = offset_bv;
         offs[1] = offset_gu;
@@ -574,14 +621,13 @@ static int lag_decode_frame(AVCodecContext *avctx,
         }
         for (i = 0; i < planes; i++)
             srcs[i] = l->rgb_planes + (i + 1) * l->rgb_stride * avctx->height - l->rgb_stride;
-        if (offset_ry >= buf_size ||
-            offset_gu >= buf_size ||
-            offset_bv >= buf_size ||
-            (planes == 4 && offs[3] >= buf_size)) {
-            av_log(avctx, AV_LOG_ERROR,
-                    "Invalid frame offsets\n");
-            return AVERROR_INVALIDDATA;
-        }
+        for (i = 0; i < planes; i++)
+            if (buf_size <= offs[i]) {
+                av_log(avctx, AV_LOG_ERROR,
+                        "Invalid frame offsets\n");
+                return AVERROR_INVALIDDATA;
+            }
+
         for (i = 0; i < planes; i++)
             lag_decode_arith_plane(l, srcs[i],
                                    avctx->width, avctx->height,
@@ -615,10 +661,8 @@ static int lag_decode_frame(AVCodecContext *avctx,
     case FRAME_ARITH_YUY2:
         avctx->pix_fmt = AV_PIX_FMT_YUV422P;
 
-        if (ff_thread_get_buffer(avctx, &frame, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return -1;
-        }
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+            return ret;
 
         if (offset_ry >= buf_size ||
             offset_gu >= buf_size ||
@@ -631,19 +675,20 @@ static int lag_decode_frame(AVCodecContext *avctx,
         lag_decode_arith_plane(l, p->data[0], avctx->width, avctx->height,
                                p->linesize[0], buf + offset_ry,
                                buf_size - offset_ry);
-        lag_decode_arith_plane(l, p->data[1], avctx->width / 2,
+        lag_decode_arith_plane(l, p->data[1], (avctx->width + 1) / 2,
                                avctx->height, p->linesize[1],
                                buf + offset_gu, buf_size - offset_gu);
-        lag_decode_arith_plane(l, p->data[2], avctx->width / 2,
+        lag_decode_arith_plane(l, p->data[2], (avctx->width + 1) / 2,
                                avctx->height, p->linesize[2],
                                buf + offset_bv, buf_size - offset_bv);
         break;
     case FRAME_ARITH_YV12:
         avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
-        if (ff_thread_get_buffer(avctx, &frame, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return -1;
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+            return ret;
+        if (buf_size <= offset_ry || buf_size <= offset_gu || buf_size <= offset_bv) {
+            return AVERROR_INVALIDDATA;
         }
 
         if (offset_ry >= buf_size ||
@@ -657,17 +702,17 @@ static int lag_decode_frame(AVCodecContext *avctx,
         lag_decode_arith_plane(l, p->data[0], avctx->width, avctx->height,
                                p->linesize[0], buf + offset_ry,
                                buf_size - offset_ry);
-        lag_decode_arith_plane(l, p->data[2], avctx->width / 2,
-                               avctx->height / 2, p->linesize[2],
+        lag_decode_arith_plane(l, p->data[2], (avctx->width + 1) / 2,
+                               (avctx->height + 1) / 2, p->linesize[2],
                                buf + offset_gu, buf_size - offset_gu);
-        lag_decode_arith_plane(l, p->data[1], avctx->width / 2,
-                               avctx->height / 2, p->linesize[1],
+        lag_decode_arith_plane(l, p->data[1], (avctx->width + 1) / 2,
+                               (avctx->height + 1) / 2, p->linesize[1],
                                buf + offset_bv, buf_size - offset_bv);
         break;
     default:
         av_log(avctx, AV_LOG_ERROR,
                "Unsupported Lagarith frame type: %#"PRIx8"\n", frametype);
-        return -1;
+        return AVERROR_PATCHWELCOME;
     }
 
     *got_frame = 1;
diff --git a/libavcodec/lagarithrac.c b/libavcodec/lagarithrac.c
index edfb18fb74..37ac2cf570 100644
--- a/libavcodec/lagarithrac.c
+++ b/libavcodec/lagarithrac.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Nathan Caldwell <saintdev (at) gmail.com>
  * Copyright (c) 2009 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,19 +41,16 @@ void ff_lag_rac_init(lag_rac *l, GetBitContext *gb, int length)
     left                = get_bits_left(gb) >> 3;
     l->bytestream_start =
     l->bytestream       = gb->buffer + get_bits_count(gb) / 8;
-    l->bytestream_end   = l->bytestream_start + FFMIN(length, left);
+    l->bytestream_end   = l->bytestream_start + left;
 
     l->range        = 0x80;
     l->low          = *l->bytestream >> 1;
-    l->hash_shift   = FFMAX(l->scale - 8, 0);
+    l->hash_shift   = FFMAX((int)l->scale - 10, 0);
 
-    for (i = j = 0; i < 256; i++) {
+    for (i = j = 0; i < 1024; i++) {
         unsigned r = i << l->hash_shift;
         while (l->prob[j + 1] <= r)
             j++;
         l->range_hash[i] = j;
     }
-
-    /* Add conversion factor to hash_shift so we don't have to in lag_get_rac. */
-    l->hash_shift += 23;
 }
diff --git a/libavcodec/lagarithrac.h b/libavcodec/lagarithrac.h
index e4f066e445..dfdfea0db3 100644
--- a/libavcodec/lagarithrac.h
+++ b/libavcodec/lagarithrac.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Nathan Caldwell <saintdev (at) gmail.com>
  * Copyright (c) 2009 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,7 +48,7 @@ typedef struct lag_rac {
     const uint8_t *bytestream_end;    /**< End position of input bytestream. */
 
     uint32_t prob[258];         /**< Table of cumulative probability for each symbol. */
-    uint8_t  range_hash[256];   /**< Hash table mapping upper byte to approximate symbol. */
+    uint8_t  range_hash[1024];   /**< Hash table mapping upper byte to approximate symbol. */
 } lag_rac;
 
 void ff_lag_rac_init(lag_rac *l, GetBitContext *gb, int length);
@@ -72,9 +72,8 @@ static inline void lag_rac_refill(lag_rac *l)
  */
 static inline uint8_t lag_get_rac(lag_rac *l)
 {
-    unsigned range_scaled, low_scaled, div;
+    unsigned range_scaled, low_scaled;
     int val;
-    uint8_t shift;
 
     lag_rac_refill(l);
 
@@ -85,18 +84,9 @@ static inline uint8_t lag_get_rac(lag_rac *l)
         if (l->low < range_scaled * l->prob[1]) {
             val = 0;
         } else {
-            /* FIXME __builtin_clz is ~20% faster here, but not allowed in generic code. */
-            shift = 30 - av_log2(range_scaled);
-            div = ((range_scaled << shift) + (1 << 23) - 1) >> 23;
-            /* low>>24 ensures that any cases too big for exact FASTDIV are
-             * under- rather than over-estimated
-             */
-            low_scaled = FASTDIV(l->low - (l->low >> 24), div);
-            shift -= l->hash_shift;
-            shift &= 31;
-            low_scaled = (low_scaled << shift) | (low_scaled >> (32 - shift));
-            /* low_scaled is now a lower bound of low/range_scaled */
-            val = l->range_hash[(uint8_t) low_scaled];
+            low_scaled = l->low / (range_scaled<<(l->hash_shift));
+
+            val = l->range_hash[low_scaled];
             while (l->low >= range_scaled * l->prob[val + 1])
                 val++;
         }
diff --git a/libavcodec/latm_parser.c b/libavcodec/latm_parser.c
index 6fdb897375..3820f58d69 100644
--- a/libavcodec/latm_parser.c
+++ b/libavcodec/latm_parser.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2008 Paul Kendall <paul@kcbbs.gen.nz>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,6 @@ static int latm_find_frame_end(AVCodecParserContext *s1, const uint8_t *buf,
     pic_found = pc->frame_start_found;
     state     = pc->state;
 
-    i = 0;
     if (!pic_found) {
         for (i = 0; i < buf_size; i++) {
             state = (state<<8) | buf[i];
diff --git a/libavcodec/lcl.h b/libavcodec/lcl.h
index 4e7e170138..b60c0e901a 100644
--- a/libavcodec/lcl.h
+++ b/libavcodec/lcl.h
@@ -2,20 +2,20 @@
  * LCL (LossLess Codec Library) Codec
  * Copyright (c) 2002-2004 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/lcldec.c b/libavcodec/lcldec.c
index ac13a1964d..c04ed5e6ca 100644
--- a/libavcodec/lcldec.c
+++ b/libavcodec/lcldec.c
@@ -2,20 +2,20 @@
  * LCL (LossLess Codec Library) Codec
  * Copyright (c) 2002-2004 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,6 +42,7 @@
 #include <stdlib.h>
 
 #include "libavutil/mem.h"
+#include "libavutil/pixdesc.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
@@ -95,7 +96,13 @@ static unsigned int mszh_decomp(const unsigned char * srcptr, int srclen, unsign
             ofs = FFMIN(ofs, destptr - destptr_bak);
             cnt *= 4;
             cnt = FFMIN(cnt, destptr_end - destptr);
-            av_memcpy_backptr(destptr, ofs, cnt);
+            if (ofs) {
+                av_memcpy_backptr(destptr, ofs, cnt);
+            } else {
+                // Not known what the correct behaviour is, but
+                // this at least avoids uninitialized data.
+                memset(destptr, 0, cnt);
+            }
             destptr += cnt;
         }
         maskbit >>= 1;
@@ -132,7 +139,7 @@ static int zlib_decomp(AVCodecContext *avctx, const uint8_t *src, int src_len, i
         av_log(avctx, AV_LOG_ERROR, "Inflate reset error: %d\n", zret);
         return AVERROR_UNKNOWN;
     }
-    c->zstream.next_in = src;
+    c->zstream.next_in = (uint8_t *)src;
     c->zstream.avail_in = src_len;
     c->zstream.next_out = c->decomp_buf + offset;
     c->zstream.avail_out = c->decomp_size - offset;
@@ -164,7 +171,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     LclDecContext * const c = avctx->priv_data;
     unsigned int pixel_ptr;
     int row, col;
-    unsigned char *encoded, *outptr;
+    unsigned char *encoded = avpkt->data, *outptr;
     uint8_t *y_out, *u_out, *v_out;
     unsigned int width = avctx->width; // Real image width
     unsigned int height = avctx->height; // Real image height
@@ -173,11 +180,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     int uqvq, ret;
     unsigned int mthread_inlen, mthread_outlen;
     unsigned int len = buf_size;
+    int linesize;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     outptr = frame->data[0]; // Output image pointer
 
@@ -186,8 +192,15 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     case AV_CODEC_ID_MSZH:
         switch (c->compression) {
         case COMP_MSZH:
-            if (c->flags & FLAG_MULTITHREAD) {
+            if (c->imgtype == IMGTYPE_RGB24 && len == FFALIGN(width * 3, 4) * height ||
+                c->imgtype == IMGTYPE_YUV111 && len == width * height * 3) {
+                ;
+            } else if (c->flags & FLAG_MULTITHREAD) {
                 mthread_inlen = AV_RL32(buf);
+                if (len < 8) {
+                    av_log(avctx, AV_LOG_ERROR, "len %d is too small\n", len);
+                    return AVERROR_INVALIDDATA;
+                }
                 mthread_inlen = FFMIN(mthread_inlen, len - 8);
                 mthread_outlen = AV_RL32(buf + 4);
                 mthread_outlen = FFMIN(mthread_outlen, c->decomp_size);
@@ -399,10 +412,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
         }
         break;
     case IMGTYPE_RGB24:
+        linesize = len < FFALIGN(3 * width, 4) * height ? 3 * width : FFALIGN(3 * width, 4);
         for (row = height - 1; row >= 0; row--) {
             pixel_ptr = row * frame->linesize[0];
             memcpy(outptr + pixel_ptr, encoded, 3 * width);
-            encoded += 3 * width;
+            encoded += linesize;
         }
         break;
     case IMGTYPE_YUV411:
@@ -471,6 +485,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     unsigned int max_basesize = FFALIGN(avctx->width,  4) *
                                 FFALIGN(avctx->height, 4);
     unsigned int max_decomp_size;
+    int subsample_h, subsample_v;
 
     if (avctx->extradata_size < 8) {
         av_log(avctx, AV_LOG_ERROR, "Extradata size too small.\n");
@@ -496,6 +511,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
         max_decomp_size = max_basesize * 2;
         avctx->pix_fmt = AV_PIX_FMT_YUV422P;
         av_log(avctx, AV_LOG_DEBUG, "Image type is YUV 4:2:2.\n");
+        if (avctx->width % 4) {
+            avpriv_request_sample(avctx, "Unsupported dimensions");
+            return AVERROR_INVALIDDATA;
+        }
         break;
     case IMGTYPE_RGB24:
         c->decomp_size = basesize * 3;
@@ -526,6 +545,12 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
+    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &subsample_h, &subsample_v);
+    if (avctx->width % (1<<subsample_h) || avctx->height % (1<<subsample_v)) {
+        avpriv_request_sample(avctx, "Unsupported dimensions");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* Detect compression method */
     c->compression = (int8_t)avctx->extradata[5];
     switch (avctx->codec_id) {
diff --git a/libavcodec/lclenc.c b/libavcodec/lclenc.c
index a73f6d6fed..1b1e08bd99 100644
--- a/libavcodec/lclenc.c
+++ b/libavcodec/lclenc.c
@@ -2,20 +2,20 @@
  * LCL (LossLess Codec Library) Codec
  * Copyright (c) 2002-2004 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,6 +41,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "lcl.h"
@@ -71,19 +72,15 @@ typedef struct LclEncContext {
  *
  */
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                        const AVFrame *pict, int *got_packet)
+                        const AVFrame *p, int *got_packet)
 {
     LclEncContext *c = avctx->priv_data;
-    const AVFrame * const p = pict;
     int i, ret;
     int zret; // Zlib return code
     int max_size = deflateBound(&c->zstream, avctx->width * avctx->height * 3);
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, max_size)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Error allocating packet of size %d.\n", max_size);
-            return ret;
-    }
+    if ((ret = ff_alloc_packet2(avctx, pkt, max_size, 0)) < 0)
+        return ret;
 
     if(avctx->pix_fmt != AV_PIX_FMT_BGR24){
         av_log(avctx, AV_LOG_ERROR, "Format not supported!\n");
@@ -132,9 +129,9 @@ static av_cold int encode_init(AVCodecContext *avctx)
 
     c->avctx= avctx;
 
-    assert(avctx->width && avctx->height);
+    av_assert0(avctx->width && avctx->height);
 
-    avctx->extradata= av_mallocz(8);
+    avctx->extradata = av_mallocz(8 + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!avctx->extradata)
         return AVERROR(ENOMEM);
 
@@ -145,8 +142,9 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    // Will be user settable someday
-    c->compression = 6;
+    c->compression = avctx->compression_level == FF_COMPRESSION_DEFAULT ?
+                            COMP_ZLIB_NORMAL :
+                            av_clip(avctx->compression_level, 0, 9);
     c->flags = 0;
     c->imgtype = IMGTYPE_RGB24;
     avctx->bits_per_coded_sample= 24;
@@ -197,6 +195,7 @@ AVCodec ff_zlib_encoder = {
     .init           = encode_init,
     .encode2        = encode_frame,
     .close          = encode_end,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_BGR24, AV_PIX_FMT_NONE },
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
                       FF_CODEC_CAP_INIT_CLEANUP,
diff --git a/libavcodec/libaacplus.c b/libavcodec/libaacplus.c
new file mode 100644
index 0000000000..fe380871b1
--- /dev/null
+++ b/libavcodec/libaacplus.c
@@ -0,0 +1,145 @@
+/*
+ * Interface to libaacplus for aac+ (sbr+ps) encoding
+ * Copyright (c) 2010 tipok <piratfm@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Interface to libaacplus for aac+ (sbr+ps) encoding.
+ */
+
+#include <aacplus.h>
+
+#include "avcodec.h"
+#include "internal.h"
+
+typedef struct aacPlusAudioContext {
+    aacplusEncHandle aacplus_handle;
+    unsigned long max_output_bytes;
+    unsigned long samples_input;
+} aacPlusAudioContext;
+
+static av_cold int aacPlus_encode_init(AVCodecContext *avctx)
+{
+    aacPlusAudioContext *s = avctx->priv_data;
+    aacplusEncConfiguration *aacplus_cfg;
+
+    /* number of channels */
+    if (avctx->channels < 1 || avctx->channels > 2) {
+        av_log(avctx, AV_LOG_ERROR, "encoding %d channel(s) is not allowed\n", avctx->channels);
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->profile != FF_PROFILE_AAC_LOW && avctx->profile != FF_PROFILE_UNKNOWN) {
+        av_log(avctx, AV_LOG_ERROR, "invalid AAC profile: %d, only LC supported\n", avctx->profile);
+        return AVERROR(EINVAL);
+    }
+
+    s->aacplus_handle = aacplusEncOpen(avctx->sample_rate, avctx->channels,
+                                       &s->samples_input, &s->max_output_bytes);
+    if (!s->aacplus_handle) {
+        av_log(avctx, AV_LOG_ERROR, "can't open encoder\n");
+        return AVERROR(EINVAL);
+    }
+
+    /* check aacplus version */
+    aacplus_cfg = aacplusEncGetCurrentConfiguration(s->aacplus_handle);
+
+    aacplus_cfg->bitRate = avctx->bit_rate;
+    aacplus_cfg->bandWidth = avctx->cutoff;
+    aacplus_cfg->outputFormat = !(avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER);
+    aacplus_cfg->inputFormat = avctx->sample_fmt == AV_SAMPLE_FMT_FLT ? AACPLUS_INPUT_FLOAT : AACPLUS_INPUT_16BIT;
+    if (!aacplusEncSetConfiguration(s->aacplus_handle, aacplus_cfg)) {
+        av_log(avctx, AV_LOG_ERROR, "libaacplus doesn't support this output format!\n");
+        return AVERROR(EINVAL);
+    }
+
+    avctx->frame_size = s->samples_input / avctx->channels;
+
+    /* Set decoder specific info */
+    avctx->extradata_size = 0;
+    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
+
+        unsigned char *buffer = NULL;
+        unsigned long decoder_specific_info_size;
+
+        if (aacplusEncGetDecoderSpecificInfo(s->aacplus_handle, &buffer,
+                                           &decoder_specific_info_size) == 1) {
+            avctx->extradata = av_malloc(decoder_specific_info_size + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!avctx->extradata) {
+                free(buffer);
+                return AVERROR(ENOMEM);
+            }
+            avctx->extradata_size = decoder_specific_info_size;
+            memcpy(avctx->extradata, buffer, avctx->extradata_size);
+        }
+        free(buffer);
+    }
+    return 0;
+}
+
+static int aacPlus_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                const AVFrame *frame, int *got_packet)
+{
+    aacPlusAudioContext *s = avctx->priv_data;
+    int32_t *input_buffer = (int32_t *)frame->data[0];
+    int ret;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->max_output_bytes, 0)) < 0)
+        return ret;
+
+    pkt->size = aacplusEncEncode(s->aacplus_handle, input_buffer,
+                                 s->samples_input, pkt->data, pkt->size);
+    *got_packet   = 1;
+    pkt->pts      = frame->pts;
+    return 0;
+}
+
+static av_cold int aacPlus_encode_close(AVCodecContext *avctx)
+{
+    aacPlusAudioContext *s = avctx->priv_data;
+
+    av_freep(&avctx->extradata);
+    aacplusEncClose(s->aacplus_handle);
+
+    return 0;
+}
+
+static const AVProfile profiles[] = {
+    { FF_PROFILE_AAC_LOW, "LC" },
+    { FF_PROFILE_UNKNOWN },
+};
+
+AVCodec ff_libaacplus_encoder = {
+    .name           = "libaacplus",
+    .long_name      = NULL_IF_CONFIG_SMALL("libaacplus AAC+ (Advanced Audio Codec with SBR+PS)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_AAC,
+    .priv_data_size = sizeof(aacPlusAudioContext),
+    .init           = aacPlus_encode_init,
+    .encode2        = aacPlus_encode_frame,
+    .close          = aacPlus_encode_close,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
+                                                     AV_SAMPLE_FMT_FLT,
+                                                     AV_SAMPLE_FMT_NONE },
+    .profiles       = profiles,
+    .channel_layouts = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
+                                            AV_CH_LAYOUT_STEREO,
+                                            0 },
+};
diff --git a/libavcodec/libavcodec.v b/libavcodec/libavcodec.v
index bf148075c7..c923cd378f 100644
--- a/libavcodec/libavcodec.v
+++ b/libavcodec/libavcodec.v
@@ -1,4 +1,7 @@
 LIBAVCODEC_$MAJOR {
         global: av*;
+                #deprecated, remove after next bump
+                audio_resample;
+                audio_resample_close;
         local:  *;
 };
diff --git a/libavcodec/libcelt_dec.c b/libavcodec/libcelt_dec.c
new file mode 100644
index 0000000000..878e4cc673
--- /dev/null
+++ b/libavcodec/libcelt_dec.c
@@ -0,0 +1,140 @@
+/*
+ * Xiph CELT decoder using libcelt
+ * Copyright (c) 2011 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <celt/celt.h>
+#include <celt/celt_header.h>
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+
+struct libcelt_context {
+    CELTMode *mode;
+    CELTDecoder *dec;
+    int discard;
+};
+
+static int ff_celt_error_to_averror(int err)
+{
+    switch (err) {
+        case CELT_BAD_ARG:          return AVERROR(EINVAL);
+#ifdef CELT_BUFFER_TOO_SMALL
+        case CELT_BUFFER_TOO_SMALL: return AVERROR(ENOBUFS);
+#endif
+        case CELT_INTERNAL_ERROR:   return AVERROR(EFAULT);
+        case CELT_CORRUPTED_DATA:   return AVERROR_INVALIDDATA;
+        case CELT_UNIMPLEMENTED:    return AVERROR(ENOSYS);
+#ifdef ENOTRECOVERABLE
+        case CELT_INVALID_STATE:    return AVERROR(ENOTRECOVERABLE);
+#endif
+        case CELT_ALLOC_FAIL:       return AVERROR(ENOMEM);
+        default:                    return AVERROR(EINVAL);
+    }
+}
+
+static int ff_celt_bitstream_version_hack(CELTMode *mode)
+{
+    CELTHeader header = { .version_id = 0 };
+    celt_header_init(&header, mode, 960, 2);
+    return header.version_id;
+}
+
+static av_cold int libcelt_dec_init(AVCodecContext *c)
+{
+    struct libcelt_context *celt = c->priv_data;
+    int err;
+
+    if (!c->channels || !c->frame_size ||
+        c->frame_size > INT_MAX / sizeof(int16_t) / c->channels)
+        return AVERROR(EINVAL);
+    celt->mode = celt_mode_create(c->sample_rate, c->frame_size, &err);
+    if (!celt->mode)
+        return ff_celt_error_to_averror(err);
+    celt->dec = celt_decoder_create_custom(celt->mode, c->channels, &err);
+    if (!celt->dec) {
+        celt_mode_destroy(celt->mode);
+        return ff_celt_error_to_averror(err);
+    }
+    if (c->extradata_size >= 4) {
+        celt->discard = AV_RL32(c->extradata);
+        if (celt->discard < 0 || celt->discard >= c->frame_size) {
+            av_log(c, AV_LOG_WARNING,
+                   "Invalid overlap (%d), ignored.\n", celt->discard);
+            celt->discard = 0;
+        }
+    }
+    if (c->extradata_size >= 8) {
+        unsigned version = AV_RL32(c->extradata + 4);
+        unsigned lib_version = ff_celt_bitstream_version_hack(celt->mode);
+        if (version != lib_version)
+            av_log(c, AV_LOG_WARNING,
+                   "CELT bitstream version 0x%x may be "
+                   "improperly decoded by libcelt for version 0x%x.\n",
+                   version, lib_version);
+    }
+    c->sample_fmt = AV_SAMPLE_FMT_S16;
+    return 0;
+}
+
+static av_cold int libcelt_dec_close(AVCodecContext *c)
+{
+    struct libcelt_context *celt = c->priv_data;
+
+    celt_decoder_destroy(celt->dec);
+    celt_mode_destroy(celt->mode);
+    return 0;
+}
+
+static int libcelt_dec_decode(AVCodecContext *c, void *data,
+                              int *got_frame_ptr, AVPacket *pkt)
+{
+    struct libcelt_context *celt = c->priv_data;
+    AVFrame *frame = data;
+    int err;
+    int16_t *pcm;
+
+    frame->nb_samples = c->frame_size;
+    if ((err = ff_get_buffer(c, frame, 0)) < 0)
+        return err;
+    pcm = (int16_t *)frame->data[0];
+    err = celt_decode(celt->dec, pkt->data, pkt->size, pcm, c->frame_size);
+    if (err < 0)
+        return ff_celt_error_to_averror(err);
+    if (celt->discard) {
+        frame->nb_samples -= celt->discard;
+        memmove(pcm, pcm + celt->discard * c->channels,
+                frame->nb_samples * c->channels * sizeof(int16_t));
+        celt->discard = 0;
+    }
+    *got_frame_ptr = 1;
+    return pkt->size;
+}
+
+AVCodec ff_libcelt_decoder = {
+    .name           = "libcelt",
+    .long_name      = NULL_IF_CONFIG_SMALL("Xiph CELT decoder using libcelt"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_CELT,
+    .priv_data_size = sizeof(struct libcelt_context),
+    .init           = libcelt_dec_init,
+    .close          = libcelt_dec_close,
+    .decode         = libcelt_dec_decode,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/libdcadec.c b/libavcodec/libdcadec.c
index b88f80763b..a0e34f9528 100644
--- a/libavcodec/libdcadec.c
+++ b/libavcodec/libdcadec.c
@@ -2,20 +2,20 @@
  * libdcadec decoder wrapper
  * Copyright (C) 2015 Hendrik Leppkes
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@ typedef struct DCADecContext {
     struct dcadec_context *ctx;
     uint8_t *buffer;
     int buffer_size;
+    int downmix_warned;
 } DCADecContext;
 
 static int dcadec_decode_frame(AVCodecContext *avctx, void *data,
@@ -41,6 +42,7 @@ static int dcadec_decode_frame(AVCodecContext *avctx, void *data,
 {
     DCADecContext *s = avctx->priv_data;
     AVFrame *frame = data;
+    av_unused struct dcadec_exss_info *exss;
     int ret, i, k;
     int **samples, nsamples, channel_mask, sample_rate, bits_per_sample, profile;
     uint32_t mrk;
@@ -58,7 +60,10 @@ static int dcadec_decode_frame(AVCodecContext *avctx, void *data,
         if (!s->buffer)
             return AVERROR(ENOMEM);
 
-        if ((ret = ff_dca_convert_bitstream(avpkt->data, avpkt->size, s->buffer, s->buffer_size)) < 0)
+        for (i = 0, ret = AVERROR_INVALIDDATA; i < input_size - 3 && ret < 0; i++)
+            ret = avpriv_dca_convert_bitstream(input + i, input_size - i, s->buffer, s->buffer_size);
+
+        if (ret < 0)
             return ret;
 
         input      = s->buffer;
@@ -67,12 +72,12 @@ static int dcadec_decode_frame(AVCodecContext *avctx, void *data,
 
     if ((ret = dcadec_context_parse(s->ctx, input, input_size)) < 0) {
         av_log(avctx, AV_LOG_ERROR, "dcadec_context_parse() failed: %d (%s)\n", -ret, dcadec_strerror(ret));
-        return AVERROR_UNKNOWN;
+        return AVERROR_EXTERNAL;
     }
     if ((ret = dcadec_context_filter(s->ctx, &samples, &nsamples, &channel_mask,
                                      &sample_rate, &bits_per_sample, &profile)) < 0) {
         av_log(avctx, AV_LOG_ERROR, "dcadec_context_filter() failed: %d (%s)\n", -ret, dcadec_strerror(ret));
-        return AVERROR_UNKNOWN;
+        return AVERROR_EXTERNAL;
     }
 
     avctx->channels       = av_get_channel_layout_nb_channels(channel_mask);
@@ -81,7 +86,7 @@ static int dcadec_decode_frame(AVCodecContext *avctx, void *data,
 
     if (bits_per_sample == 16)
         avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
-    else if (bits_per_sample <= 24)
+    else if (bits_per_sample > 16 && bits_per_sample <= 24)
         avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
     else {
         av_log(avctx, AV_LOG_ERROR, "Unsupported number of bits per sample: %d\n",
@@ -124,6 +129,37 @@ static int dcadec_decode_frame(AVCodecContext *avctx, void *data,
     } else
         avctx->bit_rate = 0;
 
+#if HAVE_STRUCT_DCADEC_EXSS_INFO_MATRIX_ENCODING
+    if (exss = dcadec_context_get_exss_info(s->ctx)) {
+        enum AVMatrixEncoding matrix_encoding = AV_MATRIX_ENCODING_NONE;
+
+        if (!s->downmix_warned) {
+            uint64_t layout = avctx->request_channel_layout;
+
+            if (((layout == AV_CH_LAYOUT_STEREO_DOWNMIX || layout == AV_CH_LAYOUT_STEREO) && !exss->embedded_stereo) ||
+                ( layout == AV_CH_LAYOUT_5POINT1 && !exss->embedded_6ch))
+                av_log(avctx, AV_LOG_WARNING, "%s downmix was requested but no custom coefficients are available, "
+                                              "this may result in clipping\n",
+                                              layout == AV_CH_LAYOUT_5POINT1 ? "5.1" : "Stereo");
+            s->downmix_warned = 1;
+        }
+
+        switch(exss->matrix_encoding) {
+        case DCADEC_MATRIX_ENCODING_SURROUND:
+            matrix_encoding = AV_MATRIX_ENCODING_DOLBY;
+            break;
+        case DCADEC_MATRIX_ENCODING_HEADPHONE:
+            matrix_encoding = AV_MATRIX_ENCODING_DOLBYHEADPHONE;
+            break;
+        }
+        dcadec_context_free_exss_info(exss);
+
+        if (matrix_encoding != AV_MATRIX_ENCODING_NONE &&
+            (ret = ff_side_data_update_matrix_encoding(frame, matrix_encoding)) < 0)
+            return ret;
+    }
+#endif
+
     frame->nb_samples = nsamples;
     if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
@@ -167,8 +203,34 @@ static av_cold int dcadec_close(AVCodecContext *avctx)
 static av_cold int dcadec_init(AVCodecContext *avctx)
 {
     DCADecContext *s = avctx->priv_data;
+    int flags = 0;
+
+    /* Affects only lossy DTS profiles. DTS-HD MA is always bitexact */
+    if (avctx->flags & AV_CODEC_FLAG_BITEXACT)
+        flags |= DCADEC_FLAG_CORE_BIT_EXACT;
+
+    if (avctx->request_channel_layout > 0 && avctx->request_channel_layout != AV_CH_LAYOUT_NATIVE) {
+        switch (avctx->request_channel_layout) {
+        case AV_CH_LAYOUT_STEREO:
+        case AV_CH_LAYOUT_STEREO_DOWNMIX:
+            /* libdcadec ignores the 2ch flag if used alone when no custom downmix coefficients
+               are available, silently outputting a 5.1 downmix if possible instead.
+               Using both the 2ch and 6ch flags together forces a 2ch downmix using default
+               coefficients in such cases. This matches the behavior of the 6ch flag when used
+               alone, where a 5.1 downmix is generated if possible, regardless of custom
+               coefficients being available or not. */
+            flags |= DCADEC_FLAG_KEEP_DMIX_2CH | DCADEC_FLAG_KEEP_DMIX_6CH;
+            break;
+        case AV_CH_LAYOUT_5POINT1:
+            flags |= DCADEC_FLAG_KEEP_DMIX_6CH;
+            break;
+        default:
+            av_log(avctx, AV_LOG_WARNING, "Invalid request_channel_layout\n");
+            break;
+        }
+    }
 
-    s->ctx = dcadec_context_create(0);
+    s->ctx = dcadec_context_create(flags);
     if (!s->ctx)
         return AVERROR(ENOMEM);
 
diff --git a/libavcodec/libfaac.c b/libavcodec/libfaac.c
index 5cdbe27686..98b3ba8183 100644
--- a/libavcodec/libfaac.c
+++ b/libavcodec/libfaac.c
@@ -2,20 +2,20 @@
  * Interface to libfaac for aac encoding
  * Copyright (c) 2002 Gildas Bazin <gbazin@netcourrier.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,7 +41,6 @@ typedef struct FaacAudioContext {
     AudioFrameQueue afq;
 } FaacAudioContext;
 
-
 static av_cold int Faac_encode_close(AVCodecContext *avctx)
 {
     FaacAudioContext *s = avctx->priv_data;
@@ -152,9 +151,20 @@ static av_cold int Faac_encode_init(AVCodecContext *avctx)
     }
 
     if (!faacEncSetConfiguration(s->faac_handle, faac_cfg)) {
-        av_log(avctx, AV_LOG_ERROR, "libfaac doesn't support this output format!\n");
-        ret = AVERROR(EINVAL);
-        goto error;
+        int i;
+        for (i = avctx->bit_rate/1000; i ; i--) {
+            faac_cfg->bitRate = 1000*i / avctx->channels;
+            if (faacEncSetConfiguration(s->faac_handle, faac_cfg))
+                break;
+        }
+        if (!i) {
+            av_log(avctx, AV_LOG_ERROR, "libfaac doesn't support this output format!\n");
+            ret = AVERROR(EINVAL);
+            goto error;
+        } else {
+            avctx->bit_rate = 1000*i;
+            av_log(avctx, AV_LOG_WARNING, "libfaac doesn't support the specified bitrate, using %dkbit/s instead\n", i);
+        }
     }
 
     avctx->initial_padding = FAAC_DELAY_SAMPLES;
@@ -174,10 +184,8 @@ static int Faac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int num_samples  = frame ? frame->nb_samples : 0;
     void *samples    = frame ? frame->data[0]    : NULL;
 
-    if ((ret = ff_alloc_packet(avpkt, (7 + 768) * avctx->channels))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, (7 + 768) * avctx->channels, 0)) < 0)
         return ret;
-    }
 
     bytes_written = faacEncEncode(s->faac_handle, samples,
                                   num_samples * avctx->channels,
diff --git a/libavcodec/libfdk-aacdec.c b/libavcodec/libfdk-aacdec.c
index cdf7a0571f..e5f7c4ebdc 100644
--- a/libavcodec/libfdk-aacdec.c
+++ b/libavcodec/libfdk-aacdec.c
@@ -2,7 +2,7 @@
  * AAC decoder wrapper
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -44,7 +44,7 @@ typedef struct FDKAACDecContext {
     uint8_t *decoder_buffer;
     int decoder_buffer_size;
     uint8_t *anc_buffer;
-    enum ConcealMethod conceal_method;
+    int conceal_method;
     int drc_level;
     int drc_boost;
     int drc_heavy;
@@ -199,8 +199,8 @@ static av_cold int fdk_aac_decode_close(AVCodecContext *avctx)
 
     if (s->handle)
         aacDecoder_Close(s->handle);
-    av_free(s->decoder_buffer);
-    av_free(s->anc_buffer);
+    av_freep(&s->decoder_buffer);
+    av_freep(&s->anc_buffer);
 
     return 0;
 }
@@ -341,10 +341,9 @@ static int fdk_aac_decode_frame(AVCodecContext *avctx, void *data,
         goto end;
     frame->nb_samples = avctx->frame_size;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "ff_get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         goto end;
-    }
+
     memcpy(frame->extended_data[0], s->decoder_buffer,
            avctx->channels * avctx->frame_size *
            av_get_bytes_per_sample(avctx->sample_fmt));
diff --git a/libavcodec/libfdk-aacenc.c b/libavcodec/libfdk-aacenc.c
index 2cea58f152..98a817b537 100644
--- a/libavcodec/libfdk-aacenc.c
+++ b/libavcodec/libfdk-aacenc.c
@@ -2,7 +2,7 @@
  * AAC encoder wrapper
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -215,8 +215,8 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
         }
         if ((err = aacEncoder_SetParam(s->handle, AACENC_BITRATE,
                                        avctx->bit_rate)) != AACENC_OK) {
-            av_log(avctx, AV_LOG_ERROR, "Unable to set the bitrate %d: %s\n",
-                   avctx->bit_rate, aac_get_error(err));
+            av_log(avctx, AV_LOG_ERROR, "Unable to set the bitrate %"PRId64": %s\n",
+                   (int64_t)avctx->bit_rate, aac_get_error(err));
             goto error;
         }
     }
@@ -342,10 +342,8 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     }
 
     /* The maximum packet size is 6144 bits aka 768 bytes per channel. */
-    if ((ret = ff_alloc_packet(avpkt, FFMAX(8192, 768 * avctx->channels)))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, FFMAX(8192, 768 * avctx->channels), 0)) < 0)
         return ret;
-    }
 
     out_ptr                   = avpkt->data;
     out_buffer_size           = avpkt->size;
diff --git a/libavcodec/libgsmdec.c b/libavcodec/libgsmdec.c
index 4c21ff6728..a503215f67 100644
--- a/libavcodec/libgsmdec.c
+++ b/libavcodec/libgsmdec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005 Alban Bedel <albeu@free.fr>
  * Copyright (c) 2006, 2007 Michel Bardiaux <mbardiaux@mediaxim.be>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,8 @@ static av_cold int libgsm_decode_init(AVCodecContext *avctx) {
 
     avctx->channels       = 1;
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
-    avctx->sample_rate    = 8000;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 8000;
     avctx->sample_fmt     = AV_SAMPLE_FMT_S16;
 
     s->state = gsm_create();
@@ -96,10 +97,8 @@ static int libgsm_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = avctx->frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
     for (i = 0; i < avctx->frame_size / GSM_FRAME_SIZE; i++) {
@@ -124,6 +123,7 @@ static void libgsm_flush(AVCodecContext *avctx) {
         gsm_option(s->state, GSM_OPT_WAV49, &one);
 }
 
+#if CONFIG_LIBGSM_DECODER
 AVCodec ff_libgsm_decoder = {
     .name           = "libgsm",
     .long_name      = NULL_IF_CONFIG_SMALL("libgsm GSM"),
@@ -136,7 +136,8 @@ AVCodec ff_libgsm_decoder = {
     .flush          = libgsm_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
-
+#endif
+#if CONFIG_LIBGSM_MS_DECODER
 AVCodec ff_libgsm_ms_decoder = {
     .name           = "libgsm_ms",
     .long_name      = NULL_IF_CONFIG_SMALL("libgsm GSM Microsoft variant"),
@@ -149,3 +150,4 @@ AVCodec ff_libgsm_ms_decoder = {
     .flush          = libgsm_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
diff --git a/libavcodec/libgsmenc.c b/libavcodec/libgsmenc.c
index 8f51321d46..69ce439ec1 100644
--- a/libavcodec/libgsmenc.c
+++ b/libavcodec/libgsmenc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005 Alban Bedel <albeu@free.fr>
  * Copyright (c) 2006, 2007 Michel Bardiaux <mbardiaux@mediaxim.be>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,6 +40,12 @@
 #include "internal.h"
 #include "gsm.h"
 
+static av_cold int libgsm_encode_close(AVCodecContext *avctx) {
+    gsm_destroy(avctx->priv_data);
+    avctx->priv_data = NULL;
+    return 0;
+}
+
 static av_cold int libgsm_encode_init(AVCodecContext *avctx) {
     if (avctx->channels > 1) {
         av_log(avctx, AV_LOG_ERROR, "Mono required for GSM, got %d channels\n",
@@ -56,13 +62,15 @@ static av_cold int libgsm_encode_init(AVCodecContext *avctx) {
     if (avctx->bit_rate != 13000 /* Official */ &&
         avctx->bit_rate != 13200 /* Very common */ &&
         avctx->bit_rate != 0 /* Unknown; a.o. mov does not set bitrate when decoding */ ) {
-        av_log(avctx, AV_LOG_ERROR, "Bitrate 13000bps required for GSM, got %dbps\n",
-               avctx->bit_rate);
+        av_log(avctx, AV_LOG_ERROR, "Bitrate 13000bps required for GSM, got %"PRId64"bps\n",
+               (int64_t)avctx->bit_rate);
         if (avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL)
             return -1;
     }
 
     avctx->priv_data = gsm_create();
+    if (!avctx->priv_data)
+        goto error;
 
     switch(avctx->codec_id) {
     case AV_CODEC_ID_GSM:
@@ -78,12 +86,9 @@ static av_cold int libgsm_encode_init(AVCodecContext *avctx) {
     }
 
     return 0;
-}
-
-static av_cold int libgsm_encode_close(AVCodecContext *avctx) {
-    gsm_destroy(avctx->priv_data);
-    avctx->priv_data = NULL;
-    return 0;
+error:
+    libgsm_encode_close(avctx);
+    return -1;
 }
 
 static int libgsm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
@@ -93,10 +98,8 @@ static int libgsm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     gsm_signal *samples = (gsm_signal *)frame->data[0];
     struct gsm_state *state = avctx->priv_data;
 
-    if ((ret = ff_alloc_packet(avpkt, avctx->block_align))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, avctx->block_align, 0)) < 0)
         return ret;
-    }
 
     switch(avctx->codec_id) {
     case AV_CODEC_ID_GSM:
@@ -112,6 +115,7 @@ static int libgsm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 }
 
 
+#if CONFIG_LIBGSM_ENCODER
 AVCodec ff_libgsm_encoder = {
     .name           = "libgsm",
     .long_name      = NULL_IF_CONFIG_SMALL("libgsm GSM"),
@@ -123,7 +127,8 @@ AVCodec ff_libgsm_encoder = {
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_LIBGSM_MS_ENCODER
 AVCodec ff_libgsm_ms_encoder = {
     .name           = "libgsm_ms",
     .long_name      = NULL_IF_CONFIG_SMALL("libgsm GSM Microsoft variant"),
@@ -135,3 +140,4 @@ AVCodec ff_libgsm_ms_encoder = {
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
 };
+#endif
diff --git a/libavcodec/libilbc.c b/libavcodec/libilbc.c
index c5053f012b..c4c054fa5d 100644
--- a/libavcodec/libilbc.c
+++ b/libavcodec/libilbc.c
@@ -2,20 +2,20 @@
  * iLBC decoder/encoder stub
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -51,7 +51,10 @@ static const AVOption ilbc_dec_options[] = {
 };
 
 static const AVClass ilbc_dec_class = {
-    "libilbc", av_default_item_name, ilbc_dec_options, LIBAVUTIL_VERSION_INT
+    .class_name = "libilbc",
+    .item_name  = av_default_item_name,
+    .option     = ilbc_dec_options,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
 static av_cold int ilbc_decode_init(AVCodecContext *avctx)
@@ -90,13 +93,10 @@ static int ilbc_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     frame->nb_samples = s->decoder.blockl;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
-    WebRtcIlbcfix_DecodeImpl((WebRtc_Word16*) frame->data[0],
-                             (const WebRtc_UWord16*) buf, &s->decoder, 1);
+    WebRtcIlbcfix_DecodeImpl((int16_t *) frame->data[0], (const uint16_t *) buf, &s->decoder, 1);
 
     *got_frame_ptr = 1;
 
@@ -127,7 +127,10 @@ static const AVOption ilbc_enc_options[] = {
 };
 
 static const AVClass ilbc_enc_class = {
-    "libilbc", av_default_item_name, ilbc_enc_options, LIBAVUTIL_VERSION_INT
+    .class_name = "libilbc",
+    .item_name  = av_default_item_name,
+    .option     = ilbc_enc_options,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
 static av_cold int ilbc_encode_init(AVCodecContext *avctx)
@@ -163,12 +166,10 @@ static int ilbc_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     ILBCEncContext *s = avctx->priv_data;
     int ret;
 
-    if ((ret = ff_alloc_packet(avpkt, 50))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 50, 0)) < 0)
         return ret;
-    }
 
-    WebRtcIlbcfix_EncodeImpl((WebRtc_UWord16*) avpkt->data, (const WebRtc_Word16*) frame->data[0], &s->encoder);
+    WebRtcIlbcfix_EncodeImpl((uint16_t *) avpkt->data, (const int16_t *) frame->data[0], &s->encoder);
 
     avpkt->size     = s->encoder.no_of_bytes;
     *got_packet_ptr = 1;
diff --git a/libavcodec/libkvazaar.c b/libavcodec/libkvazaar.c
new file mode 100644
index 0000000000..9fb5be7c79
--- /dev/null
+++ b/libavcodec/libkvazaar.c
@@ -0,0 +1,248 @@
+/*
+ * libkvazaar encoder
+ *
+ * Copyright (c) 2015 Tampere University of Technology
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <kvazaar.h>
+#include <string.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/dict.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+
+typedef struct LibkvazaarContext {
+    const AVClass *class;
+
+    const kvz_api *api;
+    kvz_encoder *encoder;
+    kvz_config *config;
+
+    char *kvz_params;
+} LibkvazaarContext;
+
+static av_cold int libkvazaar_init(AVCodecContext *avctx)
+{
+    int retval = 0;
+    kvz_config *cfg = NULL;
+    kvz_encoder *enc = NULL;
+    const kvz_api *const api = kvz_api_get(8);
+
+    LibkvazaarContext *const ctx = avctx->priv_data;
+
+    // Kvazaar requires width and height to be multiples of eight.
+    if (avctx->width % 8 || avctx->height % 8) {
+        av_log(avctx, AV_LOG_ERROR, "Video dimensions are not a multiple of 8.\n");
+        retval = AVERROR_INVALIDDATA;
+        goto done;
+    }
+
+    cfg = api->config_alloc();
+    if (!cfg) {
+        av_log(avctx, AV_LOG_ERROR, "Could not allocate kvazaar config structure.\n");
+        retval = AVERROR(ENOMEM);
+        goto done;
+    }
+
+    if (!api->config_init(cfg)) {
+        av_log(avctx, AV_LOG_ERROR, "Could not initialize kvazaar config structure.\n");
+        retval = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    cfg->width = avctx->width;
+    cfg->height = avctx->height;
+    cfg->framerate =
+      (double)(avctx->time_base.num * avctx->ticks_per_frame) / avctx->time_base.den;
+    cfg->threads = avctx->thread_count;
+    cfg->target_bitrate = avctx->bit_rate;
+    cfg->vui.sar_width = avctx->sample_aspect_ratio.num;
+    cfg->vui.sar_height = avctx->sample_aspect_ratio.den;
+
+    if (ctx->kvz_params) {
+        AVDictionary *dict = NULL;
+        if (!av_dict_parse_string(&dict, ctx->kvz_params, "=", ",", 0)) {
+            AVDictionaryEntry *entry = NULL;
+            while ((entry = av_dict_get(dict, "", entry, AV_DICT_IGNORE_SUFFIX))) {
+                if (!api->config_parse(cfg, entry->key, entry->value)) {
+                    av_log(avctx, AV_LOG_WARNING,
+                           "Invalid option: %s=%s.\n",
+                           entry->key, entry->value);
+                }
+            }
+            av_dict_free(&dict);
+        }
+    }
+
+    enc = api->encoder_open(cfg);
+    if (!enc) {
+        av_log(avctx, AV_LOG_ERROR, "Could not open kvazaar encoder.\n");
+        retval = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    ctx->api = api;
+    ctx->encoder = enc;
+    ctx->config = cfg;
+    enc = NULL;
+    cfg = NULL;
+
+done:
+    if (cfg) api->config_destroy(cfg);
+    if (enc) api->encoder_close(enc);
+
+    return retval;
+}
+
+static av_cold int libkvazaar_close(AVCodecContext *avctx)
+{
+    LibkvazaarContext *ctx = avctx->priv_data;
+    if (!ctx->api) return 0;
+
+    if (ctx->encoder) {
+        ctx->api->encoder_close(ctx->encoder);
+        ctx->encoder = NULL;
+    }
+
+    if (ctx->config) {
+        ctx->api->config_destroy(ctx->config);
+        ctx->config = NULL;
+    }
+
+    return 0;
+}
+
+static int libkvazaar_encode(AVCodecContext *avctx,
+                             AVPacket *avpkt,
+                             const AVFrame *frame,
+                             int *got_packet_ptr)
+{
+    int retval = 0;
+    kvz_picture *img_in = NULL;
+    kvz_data_chunk *data_out = NULL;
+    uint32_t len_out = 0;
+    LibkvazaarContext *ctx = avctx->priv_data;
+
+    *got_packet_ptr = 0;
+
+    if (frame) {
+        int i = 0;
+
+        av_assert0(frame->width == ctx->config->width);
+        av_assert0(frame->height == ctx->config->height);
+        av_assert0(frame->format == avctx->pix_fmt);
+
+        // Allocate input picture for kvazaar.
+        img_in = ctx->api->picture_alloc(frame->width, frame->height);
+        if (!img_in) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to allocate picture.\n");
+            retval = AVERROR(ENOMEM);
+            goto done;
+        }
+
+        // Copy pixels from frame to img_in.
+        for (i = 0; i < 3; ++i) {
+            uint8_t *dst = img_in->data[i];
+            uint8_t *src = frame->data[i];
+            int width = (i == 0) ? frame->width : (frame->width / 2);
+            int height = (i == 0) ? frame->height : (frame->height / 2);
+            int y = 0;
+            for (y = 0; y < height; ++y) {
+                memcpy(dst, src, width);
+                src += frame->linesize[i];
+                dst += width;
+            }
+        }
+    }
+
+    if (!ctx->api->encoder_encode(ctx->encoder, img_in, &data_out, &len_out, NULL)) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to encode frame.\n");
+        retval = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    if (data_out) {
+        kvz_data_chunk *chunk = NULL;
+        uint64_t written = 0;
+
+        retval = ff_alloc_packet(avpkt, len_out);
+        if (retval < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to allocate output packet.\n");
+            goto done;
+        }
+
+        for (chunk = data_out; chunk != NULL; chunk = chunk->next) {
+            av_assert0(written + chunk->len <= len_out);
+            memcpy(avpkt->data + written, chunk->data, chunk->len);
+            written += chunk->len;
+        }
+        *got_packet_ptr = 1;
+
+        ctx->api->chunk_free(data_out);
+        data_out = NULL;
+    }
+
+done:
+    if (img_in) ctx->api->picture_free(img_in);
+    if (data_out) ctx->api->chunk_free(data_out);
+    return retval;
+}
+
+static const enum AVPixelFormat pix_fmts[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE
+};
+
+static const AVOption options[] = {
+    { "kvazaar-params", "Set kvazaar parameters as a comma-separated list of name=value pairs.",
+      offsetof(LibkvazaarContext, kvz_params), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0,
+      AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
+    { NULL },
+};
+
+static const AVClass class = {
+    .class_name = "libkvazaar",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVCodecDefault defaults[] = {
+    { "b", "0" },
+    { NULL },
+};
+
+AVCodec ff_libkvazaar_encoder = {
+    .name             = "libkvazaar",
+    .long_name        = NULL_IF_CONFIG_SMALL("libkvazaar H.265 / HEVC"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_HEVC,
+    .capabilities     = AV_CODEC_CAP_DELAY,
+    .pix_fmts         = pix_fmts,
+
+    .priv_class       = &class,
+    .priv_data_size   = sizeof(LibkvazaarContext),
+    .defaults         = defaults,
+
+    .init             = libkvazaar_init,
+    .encode2          = libkvazaar_encode,
+    .close            = libkvazaar_close,
+};
diff --git a/libavcodec/libmp3lame.c b/libavcodec/libmp3lame.c
index 966212dcae..873b390c0c 100644
--- a/libavcodec/libmp3lame.c
+++ b/libavcodec/libmp3lame.c
@@ -2,20 +2,20 @@
  * Interface to libmp3lame for mp3 encoding
  * Copyright (c) 2002 Lennert Buytenhek <buytenh@gnu.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@
 #include "mpegaudio.h"
 #include "mpegaudiodecheader.h"
 
-#define BUFFER_SIZE (7200 + 2 * MPA_FRAME_SIZE + MPA_FRAME_SIZE / 4)
+#define BUFFER_SIZE (7200 + 2 * MPA_FRAME_SIZE + MPA_FRAME_SIZE / 4+1000) // FIXME: Buffer size to small? Adding 1000 to make up for it.
 
 typedef struct LAMEContext {
     AVClass *class;
@@ -52,7 +52,7 @@ typedef struct LAMEContext {
     int abr;
     float *samples_flt[2];
     AudioFrameQueue afq;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
 } LAMEContext;
 
 
@@ -79,6 +79,7 @@ static av_cold int mp3lame_encode_close(AVCodecContext *avctx)
     av_freep(&s->samples_flt[0]);
     av_freep(&s->samples_flt[1]);
     av_freep(&s->buffer);
+    av_freep(&s->fdsp);
 
     ff_af_queue_close(&s->afq);
 
@@ -97,6 +98,7 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
     if (!(s->gfp = lame_init()))
         return AVERROR(ENOMEM);
 
+
     lame_set_num_channels(s->gfp, avctx->channels);
     lame_set_mode(s->gfp, avctx->channels > 1 ? s->joint_stereo ? JOINT_STEREO : STEREO : MONO);
 
@@ -105,9 +107,7 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
     lame_set_out_samplerate(s->gfp, avctx->sample_rate);
 
     /* algorithmic quality */
-    if (avctx->compression_level == FF_COMPRESSION_DEFAULT)
-        lame_set_quality(s->gfp, 5);
-    else
+    if (avctx->compression_level != FF_COMPRESSION_DEFAULT)
         lame_set_quality(s->gfp, avctx->compression_level);
 
     /* rate control */
@@ -146,7 +146,7 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
     if (avctx->sample_fmt == AV_SAMPLE_FMT_FLTP) {
         int ch;
         for (ch = 0; ch < avctx->channels; ch++) {
-            s->samples_flt[ch] = av_malloc(avctx->frame_size *
+            s->samples_flt[ch] = av_malloc_array(avctx->frame_size,
                                            sizeof(*s->samples_flt[ch]));
             if (!s->samples_flt[ch]) {
                 ret = AVERROR(ENOMEM);
@@ -159,7 +159,12 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
     if (ret < 0)
         goto error;
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp) {
+        ret = AVERROR(ENOMEM);
+        goto error;
+    }
+
 
     return 0;
 error:
@@ -198,7 +203,7 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                 return AVERROR(EINVAL);
             }
             for (ch = 0; ch < avctx->channels; ch++) {
-                s->fdsp.vector_fmul_scalar(s->samples_flt[ch],
+                s->fdsp->vector_fmul_scalar(s->samples_flt[ch],
                                            (const float *)frame->data[ch],
                                            32768.0f,
                                            FFALIGN(frame->nb_samples, 8));
@@ -208,6 +213,8 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         default:
             return AVERROR_BUG;
         }
+    } else if (!s->afq.frame_alloc) {
+        lame_result = 0;
     } else {
         lame_result = lame_encode_flush(s->gfp, s->buffer + s->buffer_index,
                                         s->buffer_size - s->buffer_index);
@@ -251,10 +258,8 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     ff_dlog(avctx, "in:%d packet-len:%d index:%d\n", avctx->frame_size, len,
             s->buffer_index);
     if (len <= s->buffer_index) {
-        if ((ret = ff_alloc_packet(avpkt, len))) {
-            av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+        if ((ret = ff_alloc_packet2(avctx, avpkt, len, 0)) < 0)
             return ret;
-        }
         memcpy(avpkt->data, s->buffer, len);
         s->buffer_index -= len;
         memmove(s->buffer, s->buffer + len, s->buffer_index);
@@ -272,9 +277,9 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 #define OFFSET(x) offsetof(LAMEContext, x)
 #define AE AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "reservoir", "Use bit reservoir.", OFFSET(reservoir), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, AE },
-    { "joint_stereo", "Use joint stereo.", OFFSET(joint_stereo), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, AE },
-    { "abr", "Use ABR", OFFSET(abr), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, AE },
+    { "reservoir",    "use bit reservoir", OFFSET(reservoir),    AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, AE },
+    { "joint_stereo", "use joint stereo",  OFFSET(joint_stereo), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, AE },
+    { "abr",          "use ABR",           OFFSET(abr),          AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AE },
     { NULL },
 };
 
diff --git a/libavcodec/libopencore-amr.c b/libavcodec/libopencore-amr.c
index 763d80f200..f0e34268f7 100644
--- a/libavcodec/libopencore-amr.c
+++ b/libavcodec/libopencore-amr.c
@@ -1,21 +1,21 @@
 /*
  * AMR Audio decoder stub
- * Copyright (c) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,8 @@ static int amr_decode_fix_avctx(AVCodecContext *avctx)
 {
     const int is_amr_wb = 1 + (avctx->codec_id == AV_CODEC_ID_AMR_WB);
 
-    avctx->sample_rate = 8000 * is_amr_wb;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 8000 * is_amr_wb;
 
     if (avctx->channels > 1) {
         avpriv_report_missing_feature(avctx, "multi-channel AMR");
@@ -103,10 +104,8 @@ static int amr_nb_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 160;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     dec_mode    = (buf[0] >> 3) & 0x000F;
     packet_size = block_size[dec_mode] + 1;
@@ -181,7 +180,7 @@ static const AVOption options[] = {
     { NULL }
 };
 
-static const AVClass class = {
+static const AVClass amrnb_class = {
     "libopencore_amrnb", av_default_item_name, options, LIBAVUTIL_VERSION_INT
 };
 
@@ -189,7 +188,7 @@ static av_cold int amr_nb_encode_init(AVCodecContext *avctx)
 {
     AMRContext *s = avctx->priv_data;
 
-    if (avctx->sample_rate != 8000) {
+    if (avctx->sample_rate != 8000 && avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
         av_log(avctx, AV_LOG_ERROR, "Only 8000Hz sample rate supported\n");
         return AVERROR(ENOSYS);
     }
@@ -237,14 +236,12 @@ static int amr_nb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         s->enc_bitrate = avctx->bit_rate;
     }
 
-    if ((ret = ff_alloc_packet(avpkt, 32))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 32, 0)) < 0)
         return ret;
-    }
 
     if (frame) {
         if (frame->nb_samples < avctx->frame_size) {
-            flush_buf = av_mallocz(avctx->frame_size * sizeof(*flush_buf));
+            flush_buf = av_mallocz_array(avctx->frame_size, sizeof(*flush_buf));
             if (!flush_buf)
                 return AVERROR(ENOMEM);
             memcpy(flush_buf, samples, frame->nb_samples * sizeof(*flush_buf));
@@ -259,7 +256,7 @@ static int amr_nb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     } else {
         if (s->enc_last_frame < 0)
             return 0;
-        flush_buf = av_mallocz(avctx->frame_size * sizeof(*flush_buf));
+        flush_buf = av_mallocz_array(avctx->frame_size, sizeof(*flush_buf));
         if (!flush_buf)
             return AVERROR(ENOMEM);
         samples = flush_buf;
@@ -269,7 +266,7 @@ static int amr_nb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     written = Encoder_Interface_Encode(s->enc_state, s->enc_mode, samples,
                                        avpkt->data, 0);
     ff_dlog(avctx, "amr_nb_encode_frame encoded %u bytes, bitrate %u, first byte was %#02x\n",
-            written, s->enc_mode, frame[0]);
+            written, s->enc_mode, avpkt->data[0]);
 
     /* Get the next frame pts/duration */
     ff_af_queue_remove(&s->afq, avctx->frame_size, &avpkt->pts,
@@ -293,7 +290,7 @@ AVCodec ff_libopencore_amrnb_encoder = {
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &amrnb_class,
 };
 #endif /* CONFIG_LIBOPENCORE_AMRNB_ENCODER */
 
@@ -335,10 +332,8 @@ static int amr_wb_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 320;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     mode        = (buf[0] >> 3) & 0x000F;
     packet_size = block_size[mode];
@@ -348,6 +343,10 @@ static int amr_wb_decode_frame(AVCodecContext *avctx, void *data,
                buf_size, packet_size + 1);
         return AVERROR_INVALIDDATA;
     }
+    if (!packet_size) {
+        av_log(avctx, AV_LOG_ERROR, "amr packet_size invalid\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     D_IF_decode(s->state, buf, (short *)frame->data[0], _good_frame);
 
diff --git a/libavcodec/libopenh264enc.c b/libavcodec/libopenh264enc.c
index 0f18f80386..9603243fe3 100644
--- a/libavcodec/libopenh264enc.c
+++ b/libavcodec/libopenh264enc.c
@@ -2,20 +2,20 @@
  * OpenH264 video encoder
  * Copyright (C) 2014 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -46,12 +46,12 @@ typedef struct SVCContext {
 #define OFFSET(x) offsetof(SVCContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "slice_mode", "Slice mode", OFFSET(slice_mode), AV_OPT_TYPE_INT, { .i64 = SM_AUTO_SLICE }, SM_SINGLE_SLICE, SM_RESERVED, VE, "slice_mode" },
-    { "fixed", "A fixed number of slices", 0, AV_OPT_TYPE_CONST, { .i64 = SM_FIXEDSLCNUM_SLICE }, 0, 0, VE, "slice_mode" },
-    { "rowmb", "One slice per row of macroblocks", 0, AV_OPT_TYPE_CONST, { .i64 = SM_ROWMB_SLICE }, 0, 0, VE, "slice_mode" },
-    { "auto", "Automatic number of slices according to number of threads", 0, AV_OPT_TYPE_CONST, { .i64 = SM_AUTO_SLICE }, 0, 0, VE, "slice_mode" },
-    { "loopfilter", "Enable loop filter", OFFSET(loopfilter), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
-    { "profile", "Set profile restrictions", OFFSET(profile), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
+    { "slice_mode", "set slice mode", OFFSET(slice_mode), AV_OPT_TYPE_INT, { .i64 = SM_AUTO_SLICE }, SM_SINGLE_SLICE, SM_RESERVED, VE, "slice_mode" },
+        { "fixed", "a fixed number of slices", 0, AV_OPT_TYPE_CONST, { .i64 = SM_FIXEDSLCNUM_SLICE }, 0, 0, VE, "slice_mode" },
+        { "rowmb", "one slice per row of macroblocks", 0, AV_OPT_TYPE_CONST, { .i64 = SM_ROWMB_SLICE }, 0, 0, VE, "slice_mode" },
+        { "auto", "automatic number of slices according to number of threads", 0, AV_OPT_TYPE_CONST, { .i64 = SM_AUTO_SLICE }, 0, 0, VE, "slice_mode" },
+    { "loopfilter", "enable loop filter", OFFSET(loopfilter), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
+    { "profile", "set profile restrictions", OFFSET(profile), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
     { NULL }
 };
 
@@ -59,7 +59,7 @@ static const AVClass class = {
     "libopenh264enc", av_default_item_name, options, LIBAVUTIL_VERSION_INT
 };
 
-// Convert libopenh264 log level to equivalent libav log level.
+// Convert libopenh264 log level to equivalent ffmpeg log level.
 static int libopenh264_to_libav_log_level(int libopenh264_log_level)
 {
     if      (libopenh264_log_level >= WELS_LOG_DETAIL)  return AV_LOG_TRACE;
@@ -78,8 +78,8 @@ static int libopenh264_to_libav_log_level(int libopenh264_log_level)
 
 static void libopenh264_trace_callback(void *ctx, int level, char const *msg)
 {
-    // The message will be logged only if the requested EQUIVALENT libav log level is
-    // less than or equal to the current libav log level.
+    // The message will be logged only if the requested EQUIVALENT ffmpeg log level is
+    // less than or equal to the current ffmpeg log level.
     int equiv_libav_log_level = libopenh264_to_libav_log_level(level);
     av_log(ctx, equiv_libav_log_level, "%s\n", msg);
 }
@@ -130,7 +130,7 @@ static av_cold int svc_encode_init(AVCodecContext *avctx)
 
     (*s->encoder)->GetDefaultParams(s->encoder, &param);
 
-    param.fMaxFrameRate              = avctx->time_base.den / avctx->time_base.num;
+    param.fMaxFrameRate              = 1/av_q2d(avctx->time_base);
     param.iPicWidth                  = avctx->width;
     param.iPicHeight                 = avctx->height;
     param.iTargetBitrate             = avctx->bit_rate;
@@ -240,7 +240,7 @@ static int svc_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     }
     av_log(avctx, AV_LOG_DEBUG, "%d slices\n", fbi.sLayerInfo[fbi.iLayerNum - 1].iNalCount);
 
-    if ((ret = ff_alloc_packet(avpkt, size))) {
+    if ((ret = ff_alloc_packet2(avctx, avpkt, size, size))) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
         return ret;
     }
diff --git a/libavcodec/libopenjpegdec.c b/libavcodec/libopenjpegdec.c
index 401ea9b801..8539e2c9bd 100644
--- a/libavcodec/libopenjpegdec.c
+++ b/libavcodec/libopenjpegdec.c
@@ -2,20 +2,20 @@
  * JPEG 2000 decoding support via OpenJPEG
  * Copyright (c) 2009 Jaikrishnan Menon <realityman@gmx.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,6 @@
  */
 
 #define  OPJ_STATIC
-#include <openjpeg.h>
 
 #include "libavutil/common.h"
 #include "libavutil/imgutils.h"
@@ -37,6 +36,12 @@
 #include "internal.h"
 #include "thread.h"
 
+#if HAVE_OPENJPEG_1_5_OPENJPEG_H
+# include <openjpeg-1.5/openjpeg.h>
+#else
+# include <openjpeg.h>
+#endif
+
 #define JP2_SIG_TYPE    0x6A502020
 #define JP2_SIG_VALUE   0x0D0A870A
 
@@ -46,72 +51,87 @@
                            AV_PIX_FMT_RGB48, AV_PIX_FMT_RGBA64
 
 #define GRAY_PIXEL_FORMATS AV_PIX_FMT_GRAY8, AV_PIX_FMT_YA8,                  \
-                           AV_PIX_FMT_GRAY16
-
-#define YUV_PIXEL_FORMATS  AV_PIX_FMT_YUV410P,   AV_PIX_FMT_YUV411P,          \
-                           AV_PIX_FMT_YUVA420P,                               \
-                           AV_PIX_FMT_YUV420P,   AV_PIX_FMT_YUV422P,          \
-                           AV_PIX_FMT_YUV440P,   AV_PIX_FMT_YUV444P,          \
-                           AV_PIX_FMT_YUV420P9,  AV_PIX_FMT_YUV422P9,         \
-                           AV_PIX_FMT_YUV444P9,                               \
-                           AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10,        \
-                           AV_PIX_FMT_YUV444P10,                              \
-                           AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16,        \
-                           AV_PIX_FMT_YUV444P16
+                           AV_PIX_FMT_GRAY16, AV_PIX_FMT_YA16
+
+#define YUV_PIXEL_FORMATS  AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUVA420P, \
+                           AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUVA422P, \
+                           AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUVA444P, \
+                           AV_PIX_FMT_YUV420P9, AV_PIX_FMT_YUV422P9, AV_PIX_FMT_YUV444P9, \
+                           AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA444P9, \
+                           AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10, \
+                           AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA444P10, \
+                           AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12, \
+                           AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV444P14, \
+                           AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16, \
+                           AV_PIX_FMT_YUVA420P16, AV_PIX_FMT_YUVA422P16, AV_PIX_FMT_YUVA444P16
 
 #define XYZ_PIXEL_FORMATS  AV_PIX_FMT_XYZ12
 
-static const enum AVPixelFormat rgb_pix_fmts[] = {
+static const enum AVPixelFormat libopenjpeg_rgb_pix_fmts[]  = {
     RGB_PIXEL_FORMATS
 };
-static const enum AVPixelFormat gray_pix_fmts[] = {
+static const enum AVPixelFormat libopenjpeg_gray_pix_fmts[] = {
     GRAY_PIXEL_FORMATS
 };
-static const enum AVPixelFormat yuv_pix_fmts[] = {
+static const enum AVPixelFormat libopenjpeg_yuv_pix_fmts[]  = {
     YUV_PIXEL_FORMATS
 };
-static const enum AVPixelFormat any_pix_fmts[] = {
+static const enum AVPixelFormat libopenjpeg_all_pix_fmts[]  = {
     RGB_PIXEL_FORMATS, GRAY_PIXEL_FORMATS, YUV_PIXEL_FORMATS, XYZ_PIXEL_FORMATS
 };
 
 typedef struct LibOpenJPEGContext {
     AVClass *class;
     opj_dparameters_t dec_params;
-    int lowres;
+    opj_event_mgr_t event_mgr;
     int lowqual;
 } LibOpenJPEGContext;
 
-static int libopenjpeg_matches_pix_fmt(const opj_image_t *img,
-                                       enum AVPixelFormat pix_fmt)
+static void error_callback(const char *msg, void *data)
+{
+    av_log(data, AV_LOG_ERROR, "%s", msg);
+}
+
+static void warning_callback(const char *msg, void *data)
+{
+    av_log(data, AV_LOG_WARNING, "%s", msg);
+}
+
+static void info_callback(const char *msg, void *data)
+{
+    av_log(data, AV_LOG_DEBUG, "%s", msg);
+}
+
+static inline int libopenjpeg_matches_pix_fmt(const opj_image_t *image, enum AVPixelFormat pix_fmt)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
     int match = 1;
 
-    if (desc->nb_components != img->numcomps) {
+    if (desc->nb_components != image->numcomps) {
         return 0;
     }
 
     switch (desc->nb_components) {
     case 4:
         match = match &&
-                desc->comp[3].depth >= img->comps[3].prec &&
-                1 == img->comps[3].dx &&
-                1 == img->comps[3].dy;
+                desc->comp[3].depth >= image->comps[3].prec &&
+                1 == image->comps[3].dx &&
+                1 == image->comps[3].dy;
     case 3:
         match = match &&
-                desc->comp[2].depth >= img->comps[2].prec &&
-                1 << desc->log2_chroma_w == img->comps[2].dx &&
-                1 << desc->log2_chroma_h == img->comps[2].dy;
+                desc->comp[2].depth >= image->comps[2].prec &&
+                1 << desc->log2_chroma_w == image->comps[2].dx &&
+                1 << desc->log2_chroma_h == image->comps[2].dy;
     case 2:
         match = match &&
-                desc->comp[1].depth >= img->comps[1].prec &&
-                1 << desc->log2_chroma_w == img->comps[1].dx &&
-                1 << desc->log2_chroma_h == img->comps[1].dy;
+                desc->comp[1].depth >= image->comps[1].prec &&
+                1 << desc->log2_chroma_w == image->comps[1].dx &&
+                1 << desc->log2_chroma_h == image->comps[1].dy;
     case 1:
         match = match &&
-                desc->comp[0].depth >= img->comps[0].prec &&
-                1 == img->comps[0].dx &&
-                1 == img->comps[0].dy;
+                desc->comp[0].depth >= image->comps[0].prec &&
+                1 == image->comps[0].dx &&
+                1 == image->comps[0].dy;
     default:
         break;
     }
@@ -119,28 +139,27 @@ static int libopenjpeg_matches_pix_fmt(const opj_image_t *img,
     return match;
 }
 
-static enum AVPixelFormat libopenjpeg_guess_pix_fmt(const opj_image_t *image)
-{
+static inline enum AVPixelFormat libopenjpeg_guess_pix_fmt(const opj_image_t *image) {
     int index;
     const enum AVPixelFormat *possible_fmts = NULL;
     int possible_fmts_nb = 0;
 
     switch (image->color_space) {
     case CLRSPC_SRGB:
-        possible_fmts    = rgb_pix_fmts;
-        possible_fmts_nb = FF_ARRAY_ELEMS(rgb_pix_fmts);
+        possible_fmts    = libopenjpeg_rgb_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(libopenjpeg_rgb_pix_fmts);
         break;
     case CLRSPC_GRAY:
-        possible_fmts    = gray_pix_fmts;
-        possible_fmts_nb = FF_ARRAY_ELEMS(gray_pix_fmts);
+        possible_fmts    = libopenjpeg_gray_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(libopenjpeg_gray_pix_fmts);
         break;
     case CLRSPC_SYCC:
-        possible_fmts    = yuv_pix_fmts;
-        possible_fmts_nb = FF_ARRAY_ELEMS(yuv_pix_fmts);
+        possible_fmts    = libopenjpeg_yuv_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(libopenjpeg_yuv_pix_fmts);
         break;
     default:
-        possible_fmts    = any_pix_fmts;
-        possible_fmts_nb = FF_ARRAY_ELEMS(any_pix_fmts);
+        possible_fmts    = libopenjpeg_all_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(libopenjpeg_all_pix_fmts);
         break;
     }
 
@@ -167,40 +186,37 @@ static inline int libopenjpeg_ispacked(enum AVPixelFormat pix_fmt)
     return 1;
 }
 
-static void libopenjpeg_copy_to_packed8(AVFrame *picture, opj_image_t *image)
-{
+static inline void libopenjpeg_copy_to_packed8(AVFrame *picture, opj_image_t *image) {
     uint8_t *img_ptr;
     int index, x, y, c;
-
     for (y = 0; y < picture->height; y++) {
         index   = y * picture->width;
         img_ptr = picture->data[0] + y * picture->linesize[0];
         for (x = 0; x < picture->width; x++, index++)
             for (c = 0; c < image->numcomps; c++)
-                *img_ptr++ = image->comps[c].data[index];
+                *img_ptr++ = 0x80 * image->comps[c].sgnd + image->comps[c].data[index];
     }
 }
 
-static void libopenjpeg_copy_to_packed16(AVFrame *picture, opj_image_t *image)
-{
+static inline void libopenjpeg_copy_to_packed16(AVFrame *picture, opj_image_t *image) {
     uint16_t *img_ptr;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(picture->format);
     int index, x, y, c;
     int adjust[4];
-
     for (x = 0; x < image->numcomps; x++)
-        adjust[x] = FFMAX(FFMIN(16 - image->comps[x].prec, 8), 0);
+        adjust[x] = FFMAX(FFMIN(desc->comp[x].depth - image->comps[x].prec, 8), 0) + desc->comp[x].shift;
 
     for (y = 0; y < picture->height; y++) {
         index   = y * picture->width;
         img_ptr = (uint16_t *) (picture->data[0] + y * picture->linesize[0]);
         for (x = 0; x < picture->width; x++, index++)
             for (c = 0; c < image->numcomps; c++)
-                *img_ptr++ = image->comps[c].data[index] << adjust[c];
+                *img_ptr++ = (1 << image->comps[c].prec - 1) * image->comps[c].sgnd +
+                             (unsigned)image->comps[c].data[index] << adjust[c];
     }
 }
 
-static void libopenjpeg_copyto8(AVFrame *picture, opj_image_t *image)
-{
+static inline void libopenjpeg_copyto8(AVFrame *picture, opj_image_t *image) {
     int *comp_data;
     uint8_t *img_ptr;
     int index, x, y;
@@ -210,7 +226,7 @@ static void libopenjpeg_copyto8(AVFrame *picture, opj_image_t *image)
         for (y = 0; y < image->comps[index].h; y++) {
             img_ptr = picture->data[index] + y * picture->linesize[index];
             for (x = 0; x < image->comps[index].w; x++) {
-                *img_ptr = (uint8_t) *comp_data;
+                *img_ptr = 0x80 * image->comps[index].sgnd + *comp_data;
                 img_ptr++;
                 comp_data++;
             }
@@ -218,18 +234,22 @@ static void libopenjpeg_copyto8(AVFrame *picture, opj_image_t *image)
     }
 }
 
-static void libopenjpeg_copyto16(AVFrame *p, opj_image_t *image)
-{
+static inline void libopenjpeg_copyto16(AVFrame *picture, opj_image_t *image) {
     int *comp_data;
     uint16_t *img_ptr;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(picture->format);
     int index, x, y;
+    int adjust[4];
+    for (x = 0; x < image->numcomps; x++)
+        adjust[x] = FFMAX(FFMIN(desc->comp[x].depth - image->comps[x].prec, 8), 0) + desc->comp[x].shift;
 
     for (index = 0; index < image->numcomps; index++) {
         comp_data = image->comps[index].data;
         for (y = 0; y < image->comps[index].h; y++) {
-            img_ptr = (uint16_t *)(p->data[index] + y * p->linesize[index]);
+            img_ptr = (uint16_t *)(picture->data[index] + y * picture->linesize[index]);
             for (x = 0; x < image->comps[index].w; x++) {
-                *img_ptr = *comp_data;
+                *img_ptr = (1 << image->comps[index].prec - 1) * image->comps[index].sgnd +
+                           (unsigned)*comp_data << adjust[index];
                 img_ptr++;
                 comp_data++;
             }
@@ -282,10 +302,12 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Error initializing decoder.\n");
         return AVERROR_UNKNOWN;
     }
-    opj_set_event_mgr((opj_common_ptr) dec, NULL, NULL);
-
+    memset(&ctx->event_mgr, 0, sizeof(ctx->event_mgr));
+    ctx->event_mgr.info_handler    = info_callback;
+    ctx->event_mgr.error_handler   = error_callback;
+    ctx->event_mgr.warning_handler = warning_callback;
+    opj_set_event_mgr((opj_common_ptr) dec, &ctx->event_mgr, avctx);
     ctx->dec_params.cp_limit_decoding = LIMIT_TO_MAIN_HEADER;
-    ctx->dec_params.cp_reduce         = ctx->lowres;
     ctx->dec_params.cp_layer          = ctx->lowqual;
     // Tie decoder with decoding parameters
     opj_setup_decoder(dec, &ctx->dec_params);
@@ -311,11 +333,6 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
     width  = image->x1 - image->x0;
     height = image->y1 - image->y0;
 
-    if (ctx->lowres) {
-        width  = (width + (1 << ctx->lowres) - 1) >> ctx->lowres;
-        height = (height + (1 << ctx->lowres) - 1) >> ctx->lowres;
-    }
-
     ret = ff_set_dimensions(avctx, width, height);
     if (ret < 0)
         goto done;
@@ -329,20 +346,17 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
 
     if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
         av_log(avctx, AV_LOG_ERROR, "Unable to determine pixel format\n");
-        ret = AVERROR_INVALIDDATA;
         goto done;
     }
-
     for (i = 0; i < image->numcomps; i++)
         if (image->comps[i].prec > avctx->bits_per_raw_sample)
             avctx->bits_per_raw_sample = image->comps[i].prec;
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "ff_thread_get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         goto done;
-    }
 
     ctx->dec_params.cp_limit_decoding = NO_LIMITATION;
+    ctx->dec_params.cp_reduce = avctx->lowres;
     // Tie decoder with decoding parameters.
     opj_setup_decoder(dec, &ctx->dec_params);
     stream = opj_cio_open((opj_common_ptr) dec, buf, buf_size);
@@ -364,6 +378,15 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
         goto done;
     }
 
+    for (i = 0; i < image->numcomps; i++) {
+        if (!image->comps[i].data) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Image component %d contains no data.\n", i);
+            ret = AVERROR_INVALIDDATA;
+            goto done;
+        }
+    }
+
     desc       = av_pix_fmt_desc_get(avctx->pix_fmt);
     pixel_size = desc->comp[0].step;
     ispacked   = libopenjpeg_ispacked(avctx->pix_fmt);
@@ -410,18 +433,25 @@ done:
     return ret;
 }
 
+static av_cold void libopenjpeg_static_init(AVCodec *codec)
+{
+    const char *version = opj_version();
+    int major, minor;
+
+    if (sscanf(version, "%d.%d", &major, &minor) == 2 && 1000*major + minor <= 1003)
+        codec->capabilities |= AV_CODEC_CAP_EXPERIMENTAL;
+}
+
 #define OFFSET(x) offsetof(LibOpenJPEGContext, x)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 
 static const AVOption options[] = {
     { "lowqual", "Limit the number of layers used for decoding",
         OFFSET(lowqual), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VD },
-    { "lowres",  "Lower the decoding resolution by a power of two",
-        OFFSET(lowres),  AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VD },
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass openjpeg_class = {
     .class_name = "libopenjpeg",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -437,5 +467,7 @@ AVCodec ff_libopenjpeg_decoder = {
     .init           = libopenjpeg_decode_init,
     .decode         = libopenjpeg_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
-    .priv_class     = &class,
+    .max_lowres     = 31,
+    .priv_class     = &openjpeg_class,
+    .init_static_data = libopenjpeg_static_init,
 };
diff --git a/libavcodec/libopenjpegenc.c b/libavcodec/libopenjpegenc.c
index d1af0210e3..0debd24d80 100644
--- a/libavcodec/libopenjpegenc.c
+++ b/libavcodec/libopenjpegenc.c
@@ -1,21 +1,21 @@
 /*
  * JPEG 2000 encoding support via OpenJPEG
- * Copyright (c) 2011 Michael Bradshaw <mbradshaw@sorensonmedia.com>
+ * Copyright (c) 2011 Michael Bradshaw <mjbshaw gmail com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,8 +25,8 @@
  */
 
 #define  OPJ_STATIC
-#include <openjpeg.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
@@ -34,11 +34,16 @@
 #include "avcodec.h"
 #include "internal.h"
 
+#if HAVE_OPENJPEG_1_5_OPENJPEG_H
+# include <openjpeg-1.5/openjpeg.h>
+#else
+# include <openjpeg.h>
+#endif
+
 typedef struct LibOpenJPEGContext {
     AVClass *avclass;
     opj_image_t *image;
     opj_cparameters_t enc_params;
-    opj_cinfo_t *compress;
     opj_event_mgr_t event_mgr;
     int format;
     int profile;
@@ -66,37 +71,79 @@ static void info_callback(const char *msg, void *data)
     av_log(data, AV_LOG_DEBUG, "%s\n", msg);
 }
 
-static opj_image_t *libopenjpeg_create_image(AVCodecContext *avctx,
-                                             opj_cparameters_t *parameters)
+static void cinema_parameters(opj_cparameters_t *p)
+{
+    p->tile_size_on = 0;
+    p->cp_tdx = 1;
+    p->cp_tdy = 1;
+
+    /* Tile part */
+    p->tp_flag = 'C';
+    p->tp_on = 1;
+
+    /* Tile and Image shall be at (0, 0) */
+    p->cp_tx0 = 0;
+    p->cp_ty0 = 0;
+    p->image_offset_x0 = 0;
+    p->image_offset_y0 = 0;
+
+    /* Codeblock size= 32 * 32 */
+    p->cblockw_init = 32;
+    p->cblockh_init = 32;
+    p->csty |= 0x01;
+
+    /* The progression order shall be CPRL */
+    p->prog_order = CPRL;
+
+    /* No ROI */
+    p->roi_compno = -1;
+
+    /* No subsampling */
+    p->subsampling_dx = 1;
+    p->subsampling_dy = 1;
+
+    /* 9-7 transform */
+    p->irreversible = 1;
+
+    p->tcp_mct = 1;
+}
+
+static opj_image_t *mj2_create_image(AVCodecContext *avctx, opj_cparameters_t *parameters)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
-    opj_image_cmptparm_t *cmptparm;
-    OPJ_COLOR_SPACE color_space;
+    opj_image_cmptparm_t cmptparm[4] = {{0}};
     opj_image_t *img;
     int i;
     int sub_dx[4];
     int sub_dy[4];
-    int numcomps = desc->nb_components;
+    int numcomps;
+    OPJ_COLOR_SPACE color_space = CLRSPC_UNKNOWN;
 
-    sub_dx[0] =
-    sub_dx[3] = 1;
-    sub_dy[0] =
-    sub_dy[3] = 1;
-    sub_dx[1] =
-    sub_dx[2] = 1 << desc->log2_chroma_w;
-    sub_dy[1] =
-    sub_dy[2] = 1 << desc->log2_chroma_h;
+    sub_dx[0] = sub_dx[3] = 1;
+    sub_dy[0] = sub_dy[3] = 1;
+    sub_dx[1] = sub_dx[2] = 1 << desc->log2_chroma_w;
+    sub_dy[1] = sub_dy[2] = 1 << desc->log2_chroma_h;
+
+    numcomps = desc->nb_components;
 
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_GRAY8:
-    case AV_PIX_FMT_GRAY16:
     case AV_PIX_FMT_YA8:
+    case AV_PIX_FMT_GRAY16:
+    case AV_PIX_FMT_YA16:
         color_space = CLRSPC_GRAY;
         break;
     case AV_PIX_FMT_RGB24:
     case AV_PIX_FMT_RGBA:
     case AV_PIX_FMT_RGB48:
     case AV_PIX_FMT_RGBA64:
+    case AV_PIX_FMT_GBR24P:
+    case AV_PIX_FMT_GBRP9:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRP14:
+    case AV_PIX_FMT_GBRP16:
+    case AV_PIX_FMT_XYZ12:
         color_space = CLRSPC_SRGB;
         break;
     case AV_PIX_FMT_YUV410P:
@@ -106,15 +153,32 @@ static opj_image_t *libopenjpeg_create_image(AVCodecContext *avctx,
     case AV_PIX_FMT_YUV440P:
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUVA420P:
+    case AV_PIX_FMT_YUVA422P:
+    case AV_PIX_FMT_YUVA444P:
     case AV_PIX_FMT_YUV420P9:
     case AV_PIX_FMT_YUV422P9:
     case AV_PIX_FMT_YUV444P9:
+    case AV_PIX_FMT_YUVA420P9:
+    case AV_PIX_FMT_YUVA422P9:
+    case AV_PIX_FMT_YUVA444P9:
     case AV_PIX_FMT_YUV420P10:
     case AV_PIX_FMT_YUV422P10:
     case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUVA420P10:
+    case AV_PIX_FMT_YUVA422P10:
+    case AV_PIX_FMT_YUVA444P10:
+    case AV_PIX_FMT_YUV420P12:
+    case AV_PIX_FMT_YUV422P12:
+    case AV_PIX_FMT_YUV444P12:
+    case AV_PIX_FMT_YUV420P14:
+    case AV_PIX_FMT_YUV422P14:
+    case AV_PIX_FMT_YUV444P14:
     case AV_PIX_FMT_YUV420P16:
     case AV_PIX_FMT_YUV422P16:
     case AV_PIX_FMT_YUV444P16:
+    case AV_PIX_FMT_YUVA420P16:
+    case AV_PIX_FMT_YUVA422P16:
+    case AV_PIX_FMT_YUVA444P16:
         color_space = CLRSPC_SYCC;
         break;
     default:
@@ -124,24 +188,28 @@ static opj_image_t *libopenjpeg_create_image(AVCodecContext *avctx,
         return NULL;
     }
 
-    cmptparm = av_mallocz(numcomps * sizeof(*cmptparm));
-    if (!cmptparm) {
-        av_log(avctx, AV_LOG_ERROR, "Not enough memory");
-        return NULL;
-    }
-
     for (i = 0; i < numcomps; i++) {
         cmptparm[i].prec = desc->comp[i].depth;
         cmptparm[i].bpp  = desc->comp[i].depth;
         cmptparm[i].sgnd = 0;
-        cmptparm[i].dx   = sub_dx[i];
-        cmptparm[i].dy   = sub_dy[i];
-        cmptparm[i].w    = avctx->width / sub_dx[i];
-        cmptparm[i].h    = avctx->height / sub_dy[i];
+        cmptparm[i].dx = sub_dx[i];
+        cmptparm[i].dy = sub_dy[i];
+        cmptparm[i].w = (avctx->width + sub_dx[i] - 1) / sub_dx[i];
+        cmptparm[i].h = (avctx->height + sub_dy[i] - 1) / sub_dy[i];
     }
 
     img = opj_image_create(numcomps, cmptparm, color_space);
-    av_freep(&cmptparm);
+
+    if (!img)
+        return NULL;
+
+    // x0, y0 is the top left corner of the image
+    // x1, y1 is the width, height of the reference grid
+    img->x0 = 0;
+    img->y0 = 0;
+    img->x1 = (avctx->width  - 1) * parameters->subsampling_dx + 1;
+    img->y1 = (avctx->height - 1) * parameters->subsampling_dy + 1;
+
     return img;
 }
 
@@ -152,155 +220,285 @@ static av_cold int libopenjpeg_encode_init(AVCodecContext *avctx)
 
     opj_set_default_encoder_parameters(&ctx->enc_params);
 
-    ctx->enc_params.cp_rsiz          = ctx->profile;
-    ctx->enc_params.mode             = !!avctx->global_quality;
-    ctx->enc_params.cp_cinema        = ctx->cinema_mode;
-    ctx->enc_params.prog_order       = ctx->prog_order;
-    ctx->enc_params.numresolution    = ctx->numresolution;
-    ctx->enc_params.cp_disto_alloc   = ctx->disto_alloc;
-    ctx->enc_params.cp_fixed_alloc   = ctx->fixed_alloc;
+    ctx->enc_params.cp_rsiz = ctx->profile;
+    ctx->enc_params.mode = !!avctx->global_quality;
+    ctx->enc_params.cp_cinema = ctx->cinema_mode;
+    ctx->enc_params.prog_order = ctx->prog_order;
+    ctx->enc_params.numresolution = ctx->numresolution;
+    ctx->enc_params.cp_disto_alloc = ctx->disto_alloc;
+    ctx->enc_params.cp_fixed_alloc = ctx->fixed_alloc;
     ctx->enc_params.cp_fixed_quality = ctx->fixed_quality;
-    ctx->enc_params.tcp_numlayers    = ctx->numlayers;
-    ctx->enc_params.tcp_rates[0]     = FFMAX(avctx->compression_level, 0) * 2;
+    ctx->enc_params.tcp_numlayers = ctx->numlayers;
+    ctx->enc_params.tcp_rates[0] = FFMAX(avctx->compression_level, 0) * 2;
 
-    ctx->compress = opj_create_compress(ctx->format);
-    if (!ctx->compress) {
-        av_log(avctx, AV_LOG_ERROR, "Error creating the compressor\n");
-        return AVERROR(ENOMEM);
+    if (ctx->cinema_mode > 0) {
+        cinema_parameters(&ctx->enc_params);
     }
 
-    ctx->image = libopenjpeg_create_image(avctx, &ctx->enc_params);
+    ctx->image = mj2_create_image(avctx, &ctx->enc_params);
     if (!ctx->image) {
         av_log(avctx, AV_LOG_ERROR, "Error creating the mj2 image\n");
         err = AVERROR(EINVAL);
         goto fail;
     }
 
-    ctx->event_mgr.info_handler    = info_callback;
-    ctx->event_mgr.error_handler   = error_callback;
-    ctx->event_mgr.warning_handler = warning_callback;
-    opj_set_event_mgr((opj_common_ptr) ctx->compress, &ctx->event_mgr, avctx);
-
     return 0;
 
 fail:
-    av_freep(&ctx->compress);
+    opj_image_destroy(ctx->image);
+    ctx->image = NULL;
     return err;
 }
 
-static void libopenjpeg_copy_packed8(AVCodecContext *avctx,
-                                     const AVFrame *frame, opj_image_t *image)
+static int libopenjpeg_copy_packed8(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
 {
     int compno;
-    int x, y;
-    int image_index, frame_index;
+    int x;
+    int y;
+    int *image_line;
+    int frame_index;
     const int numcomps = image->numcomps;
 
-    for (compno = 0; compno < numcomps; ++compno)
+    for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[0] / numcomps) {
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
         for (y = 0; y < avctx->height; ++y) {
-            image_index = y * avctx->width;
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
             frame_index = y * frame->linesize[0] + compno;
             for (x = 0; x < avctx->width; ++x) {
-                image->comps[compno].data[image_index++] =
-                    frame->data[0][frame_index];
+                image_line[x] = frame->data[0][frame_index];
                 frame_index += numcomps;
             }
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - image->comps[compno].w];
+            }
         }
+    }
+
+    return 1;
 }
 
-static void libopenjpeg_copy_packed16(AVCodecContext *avctx,
-                                      const AVFrame *frame, opj_image_t *image)
+// for XYZ 12 bit
+static int libopenjpeg_copy_packed12(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
 {
     int compno;
     int x, y;
-    int image_index, frame_index;
+    int *image_line;
+    int frame_index;
     const int numcomps  = image->numcomps;
     uint16_t *frame_ptr = (uint16_t *)frame->data[0];
 
-    for (compno = 0; compno < numcomps; ++compno)
+    for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[0] / numcomps) {
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
         for (y = 0; y < avctx->height; ++y) {
-            image_index = y * avctx->width;
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
             frame_index = y * (frame->linesize[0] / 2) + compno;
             for (x = 0; x < avctx->width; ++x) {
-                image->comps[compno].data[image_index++] =
-                    frame_ptr[frame_index];
+                image_line[x] = frame_ptr[frame_index] >> 4;
                 frame_index += numcomps;
             }
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - image->comps[compno].w];
+            }
         }
+    }
+
+    return 1;
 }
 
-static void libopenjpeg_copy_unpacked8(AVCodecContext *avctx,
-                                       const AVFrame *frame, opj_image_t *image)
+static int libopenjpeg_copy_packed16(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
 {
     int compno;
-    int x, y;
-    int width, height;
-    int image_index, frame_index;
+    int x;
+    int y;
+    int *image_line;
+    int frame_index;
     const int numcomps = image->numcomps;
+    uint16_t *frame_ptr = (uint16_t*)frame->data[0];
+
+    for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[0] / numcomps) {
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
+        for (y = 0; y < avctx->height; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            frame_index = y * (frame->linesize[0] / 2) + compno;
+            for (x = 0; x < avctx->width; ++x) {
+                image_line[x] = frame_ptr[frame_index];
+                frame_index += numcomps;
+            }
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - image->comps[compno].w];
+            }
+        }
+    }
+
+    return 1;
+}
+
+static int libopenjpeg_copy_unpacked8(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
+{
+    int compno;
+    int x;
+    int y;
+    int width;
+    int height;
+    int *image_line;
+    int frame_index;
+    const int numcomps = image->numcomps;
+
+    for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[compno]) {
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
 
     for (compno = 0; compno < numcomps; ++compno) {
         width  = avctx->width / image->comps[compno].dx;
         height = avctx->height / image->comps[compno].dy;
         for (y = 0; y < height; ++y) {
-            image_index = y * width;
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
             frame_index = y * frame->linesize[compno];
             for (x = 0; x < width; ++x)
-                image->comps[compno].data[image_index++] =
-                    frame->data[compno][frame_index++];
+                image_line[x] = frame->data[compno][frame_index++];
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - image->comps[compno].w];
+            }
         }
     }
+
+    return 1;
 }
 
-static void libopenjpeg_copy_unpacked16(AVCodecContext *avctx,
-                                        const AVFrame *frame,
-                                        opj_image_t *image)
+static int libopenjpeg_copy_unpacked16(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
 {
     int compno;
-    int x, y;
-    int width, height;
-    int image_index, frame_index;
+    int x;
+    int y;
+    int width;
+    int height;
+    int *image_line;
+    int frame_index;
     const int numcomps = image->numcomps;
     uint16_t *frame_ptr;
 
     for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[compno]) {
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
         width     = avctx->width / image->comps[compno].dx;
         height    = avctx->height / image->comps[compno].dy;
         frame_ptr = (uint16_t *)frame->data[compno];
         for (y = 0; y < height; ++y) {
-            image_index = y * width;
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
             frame_index = y * (frame->linesize[compno] / 2);
             for (x = 0; x < width; ++x)
-                image->comps[compno].data[image_index++] =
-                    frame_ptr[frame_index++];
+                image_line[x] = frame_ptr[frame_index++];
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - image->comps[compno].w];
+            }
         }
     }
+
+    return 1;
 }
 
 static int libopenjpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                                     const AVFrame *frame, int *got_packet)
 {
     LibOpenJPEGContext *ctx = avctx->priv_data;
-    opj_cinfo_t *compress   = ctx->compress;
     opj_image_t *image      = ctx->image;
-    opj_cio_t *stream;
+    opj_cinfo_t *compress   = NULL;
+    opj_cio_t *stream       = NULL;
+    int cpyresult = 0;
     int ret, len;
-
-    // x0, y0 is the top left corner of the image
-    // x1, y1 is the width, height of the reference grid
-    image->x0 = 0;
-    image->y0 = 0;
-    image->x1 = (avctx->width - 1) * ctx->enc_params.subsampling_dx + 1;
-    image->y1 = (avctx->height - 1) * ctx->enc_params.subsampling_dy + 1;
+    AVFrame *gbrframe;
 
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_RGB24:
     case AV_PIX_FMT_RGBA:
     case AV_PIX_FMT_YA8:
-        libopenjpeg_copy_packed8(avctx, frame, image);
+        cpyresult = libopenjpeg_copy_packed8(avctx, frame, image);
+        break;
+    case AV_PIX_FMT_XYZ12:
+        cpyresult = libopenjpeg_copy_packed12(avctx, frame, image);
         break;
     case AV_PIX_FMT_RGB48:
     case AV_PIX_FMT_RGBA64:
-        libopenjpeg_copy_packed16(avctx, frame, image);
+    case AV_PIX_FMT_YA16:
+        cpyresult = libopenjpeg_copy_packed16(avctx, frame, image);
+        break;
+    case AV_PIX_FMT_GBR24P:
+    case AV_PIX_FMT_GBRP9:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRP14:
+    case AV_PIX_FMT_GBRP16:
+        gbrframe = av_frame_clone(frame);
+        if (!gbrframe)
+            return AVERROR(ENOMEM);
+        gbrframe->data[0] = frame->data[2]; // swap to be rgb
+        gbrframe->data[1] = frame->data[0];
+        gbrframe->data[2] = frame->data[1];
+        gbrframe->linesize[0] = frame->linesize[2];
+        gbrframe->linesize[1] = frame->linesize[0];
+        gbrframe->linesize[2] = frame->linesize[1];
+        if (avctx->pix_fmt == AV_PIX_FMT_GBR24P) {
+            cpyresult = libopenjpeg_copy_unpacked8(avctx, gbrframe, image);
+        } else {
+            cpyresult = libopenjpeg_copy_unpacked16(avctx, gbrframe, image);
+        }
+        av_frame_free(&gbrframe);
         break;
     case AV_PIX_FMT_GRAY8:
     case AV_PIX_FMT_YUV410P:
@@ -310,19 +508,36 @@ static int libopenjpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_YUV440P:
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUVA420P:
-        libopenjpeg_copy_unpacked8(avctx, frame, image);
+    case AV_PIX_FMT_YUVA422P:
+    case AV_PIX_FMT_YUVA444P:
+        cpyresult = libopenjpeg_copy_unpacked8(avctx, frame, image);
         break;
     case AV_PIX_FMT_GRAY16:
     case AV_PIX_FMT_YUV420P9:
     case AV_PIX_FMT_YUV422P9:
     case AV_PIX_FMT_YUV444P9:
+    case AV_PIX_FMT_YUVA420P9:
+    case AV_PIX_FMT_YUVA422P9:
+    case AV_PIX_FMT_YUVA444P9:
     case AV_PIX_FMT_YUV444P10:
     case AV_PIX_FMT_YUV422P10:
     case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUVA444P10:
+    case AV_PIX_FMT_YUVA422P10:
+    case AV_PIX_FMT_YUVA420P10:
+    case AV_PIX_FMT_YUV420P12:
+    case AV_PIX_FMT_YUV422P12:
+    case AV_PIX_FMT_YUV444P12:
+    case AV_PIX_FMT_YUV420P14:
+    case AV_PIX_FMT_YUV422P14:
+    case AV_PIX_FMT_YUV444P14:
     case AV_PIX_FMT_YUV444P16:
     case AV_PIX_FMT_YUV422P16:
     case AV_PIX_FMT_YUV420P16:
-        libopenjpeg_copy_unpacked16(avctx, frame, image);
+    case AV_PIX_FMT_YUVA444P16:
+    case AV_PIX_FMT_YUVA422P16:
+    case AV_PIX_FMT_YUVA420P16:
+        cpyresult = libopenjpeg_copy_unpacked16(avctx, frame, image);
         break;
     default:
         av_log(avctx, AV_LOG_ERROR,
@@ -332,29 +547,51 @@ static int libopenjpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         break;
     }
 
+    if (!cpyresult) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Could not copy the frame data to the internal image buffer\n");
+        return -1;
+    }
+
+    compress = opj_create_compress(ctx->format);
+    if (!compress) {
+        av_log(avctx, AV_LOG_ERROR, "Error creating the compressor\n");
+        return AVERROR(ENOMEM);
+    }
+
     opj_setup_encoder(compress, &ctx->enc_params, image);
+
     stream = opj_cio_open((opj_common_ptr) compress, NULL, 0);
     if (!stream) {
         av_log(avctx, AV_LOG_ERROR, "Error creating the cio stream\n");
         return AVERROR(ENOMEM);
     }
 
+    memset(&ctx->event_mgr, 0, sizeof(ctx->event_mgr));
+    ctx->event_mgr.info_handler    = info_callback;
+    ctx->event_mgr.error_handler   = error_callback;
+    ctx->event_mgr.warning_handler = warning_callback;
+    opj_set_event_mgr((opj_common_ptr) compress, &ctx->event_mgr, avctx);
+
     if (!opj_encode(compress, stream, image, NULL)) {
-        opj_cio_close(stream);
         av_log(avctx, AV_LOG_ERROR, "Error during the opj encode\n");
         return -1;
     }
 
     len = cio_tell(stream);
-    if ((ret = ff_alloc_packet(pkt, len)) < 0) {
-        opj_cio_close(stream);
+    if ((ret = ff_alloc_packet2(avctx, pkt, len, 0)) < 0) {
         return ret;
     }
 
     memcpy(pkt->data, stream->buffer, len);
     pkt->flags |= AV_PKT_FLAG_KEY;
     *got_packet = 1;
+
     opj_cio_close(stream);
+    stream = NULL;
+    opj_destroy_compress(compress);
+    compress = NULL;
+
     return 0;
 }
 
@@ -362,8 +599,8 @@ static av_cold int libopenjpeg_encode_close(AVCodecContext *avctx)
 {
     LibOpenJPEGContext *ctx = avctx->priv_data;
 
-    opj_destroy_compress(ctx->compress);
     opj_image_destroy(ctx->image);
+    ctx->image = NULL;
     return 0;
 }
 
@@ -388,15 +625,15 @@ static const AVOption options[] = {
     { "rpcl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = RPCL        }, 0,         0,           VE, "prog_order"  },
     { "pcrl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = PCRL        }, 0,         0,           VE, "prog_order"  },
     { "cprl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CPRL        }, 0,         0,           VE, "prog_order"  },
-    { "numresolution", NULL,                OFFSET(numresolution), AV_OPT_TYPE_INT,   { .i64 = 6           }, 1,         10,          VE },
-    { "numlayers",     NULL,                OFFSET(numlayers),     AV_OPT_TYPE_INT,   { .i64 = 1           }, 1,         10,          VE },
-    { "disto_alloc",   NULL,                OFFSET(disto_alloc),   AV_OPT_TYPE_INT,   { .i64 = 1           }, 0,         1,           VE },
-    { "fixed_alloc",   NULL,                OFFSET(fixed_alloc),   AV_OPT_TYPE_INT,   { .i64 = 0           }, 0,         1,           VE },
-    { "fixed_quality", NULL,                OFFSET(fixed_quality), AV_OPT_TYPE_INT,   { .i64 = 0           }, 0,         1,           VE },
+    { "numresolution", NULL,                OFFSET(numresolution), AV_OPT_TYPE_INT,   { .i64 = 6           }, 1,         INT_MAX,     VE                },
+    { "numlayers",     NULL,                OFFSET(numlayers),     AV_OPT_TYPE_INT,   { .i64 = 1           }, 1,         10,          VE                },
+    { "disto_alloc",   NULL,                OFFSET(disto_alloc),   AV_OPT_TYPE_INT,   { .i64 = 1           }, 0,         1,           VE                },
+    { "fixed_alloc",   NULL,                OFFSET(fixed_alloc),   AV_OPT_TYPE_INT,   { .i64 = 0           }, 0,         1,           VE                },
+    { "fixed_quality", NULL,                OFFSET(fixed_quality), AV_OPT_TYPE_INT,   { .i64 = 0           }, 0,         1,           VE                },
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass openjpeg_class = {
     .class_name = "libopenjpeg",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -412,18 +649,25 @@ AVCodec ff_libopenjpeg_encoder = {
     .init           = libopenjpeg_encode_init,
     .encode2        = libopenjpeg_encode_frame,
     .close          = libopenjpeg_encode_close,
-    .capabilities   = 0,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA, AV_PIX_FMT_RGB48,
-        AV_PIX_FMT_RGBA64,
-        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY16, AV_PIX_FMT_YA8,
+        AV_PIX_FMT_RGBA64, AV_PIX_FMT_GBR24P,
+        AV_PIX_FMT_GBRP9, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, AV_PIX_FMT_GBRP14, AV_PIX_FMT_GBRP16,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_YA8, AV_PIX_FMT_GRAY16, AV_PIX_FMT_YA16,
         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUVA420P,
-        AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
-        AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUV410P,
+        AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUVA422P,
+        AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUVA444P,
         AV_PIX_FMT_YUV420P9, AV_PIX_FMT_YUV422P9, AV_PIX_FMT_YUV444P9,
+        AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA444P9,
         AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
+        AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA444P10,
+        AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12,
+        AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV444P14,
         AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16,
+        AV_PIX_FMT_YUVA420P16, AV_PIX_FMT_YUVA422P16, AV_PIX_FMT_YUVA444P16,
+        AV_PIX_FMT_XYZ12,
         AV_PIX_FMT_NONE
     },
-    .priv_class     = &class,
+    .priv_class     = &openjpeg_class,
 };
diff --git a/libavcodec/libopus.c b/libavcodec/libopus.c
index b511415962..16395c73df 100644
--- a/libavcodec/libopus.c
+++ b/libavcodec/libopus.c
@@ -2,20 +2,20 @@
  * libopus encoder/decoder common code
  * Copyright (c) 2012 Nicolas George
  *
- * This file is part of libav.
+ * This file is part of FFmpeg.
  *
- * libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libopus.h b/libavcodec/libopus.h
index b08257df4a..a8223d1d6f 100644
--- a/libavcodec/libopus.h
+++ b/libavcodec/libopus.h
@@ -2,20 +2,20 @@
  * libopus encoder/decoder common code
  * Copyright (c) 2012 Nicolas George
  *
- * This file is part of libav.
+ * This file is part of FFmpeg.
  *
- * libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libopusdec.c b/libavcodec/libopusdec.c
index 2d31b03967..4cbd14a38a 100644
--- a/libavcodec/libopusdec.c
+++ b/libavcodec/libopusdec.c
@@ -2,20 +2,20 @@
  * Opus decoder using libopus
  * Copyright (c) 2012 Nicolas George
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,10 @@
 
 struct libopus_context {
     OpusMSDecoder *dec;
+    int pre_skip;
+#ifndef OPUS_SET_GAIN
+    union { int i; double d; } gain;
+#endif
 };
 
 #define OPUS_HEAD_SIZE 19
@@ -49,6 +53,7 @@ static av_cold int libopus_decode_init(AVCodecContext *avc)
                           ff_vorbis_channel_layouts[avc->channels - 1];
 
     if (avc->extradata_size >= OPUS_HEAD_SIZE) {
+        opus->pre_skip = AV_RL16(avc->extradata + 10);
         gain_db     = sign_extend(AV_RL16(avc->extradata + 16), 16);
         channel_map = AV_RL8 (avc->extradata + 18);
     }
@@ -73,7 +78,7 @@ static av_cold int libopus_decode_init(AVCodecContext *avc)
         const uint8_t *vorbis_offset = ff_vorbis_channel_layout_offsets[avc->channels - 1];
         int ch;
 
-        /* Remap channels from vorbis order to libav order */
+        /* Remap channels from vorbis order to ffmpeg order */
         for (ch = 0; ch < avc->channels; ch++)
             mapping_arr[ch] = mapping[vorbis_offset[ch]];
         mapping = mapping_arr;
@@ -88,12 +93,23 @@ static av_cold int libopus_decode_init(AVCodecContext *avc)
         return ff_opus_error_to_averror(ret);
     }
 
+#ifdef OPUS_SET_GAIN
     ret = opus_multistream_decoder_ctl(opus->dec, OPUS_SET_GAIN(gain_db));
     if (ret != OPUS_OK)
         av_log(avc, AV_LOG_WARNING, "Failed to set gain: %s\n",
                opus_strerror(ret));
+#else
+    {
+        double gain_lin = pow(10, gain_db / (20.0 * 256));
+        if (avc->sample_fmt == AV_SAMPLE_FMT_FLT)
+            opus->gain.d = gain_lin;
+        else
+            opus->gain.i = FFMIN(gain_lin * 65536, INT_MAX);
+    }
+#endif
 
-    avc->delay = 3840;  /* Decoder delay (in samples) at 48kHz */
+    /* Decoder delay (in samples) at 48kHz */
+    avc->delay = avc->internal->skip_samples = opus->pre_skip;
 
     return 0;
 }
@@ -116,11 +132,8 @@ static int libopus_decode(AVCodecContext *avc, void *data,
     int ret, nb_samples;
 
     frame->nb_samples = MAX_FRAME_SIZE;
-    ret = ff_get_buffer(avc, frame, 0);
-    if (ret < 0) {
-        av_log(avc, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avc, frame, 0)) < 0)
         return ret;
-    }
 
     if (avc->sample_fmt == AV_SAMPLE_FMT_S16)
         nb_samples = opus_multistream_decode(opus->dec, pkt->data, pkt->size,
@@ -137,6 +150,21 @@ static int libopus_decode(AVCodecContext *avc, void *data,
         return ff_opus_error_to_averror(nb_samples);
     }
 
+#ifndef OPUS_SET_GAIN
+    {
+        int i = avc->channels * nb_samples;
+        if (avc->sample_fmt == AV_SAMPLE_FMT_FLT) {
+            float *pcm = (float *)frame->data[0];
+            for (; i > 0; i--, pcm++)
+                *pcm = av_clipf(*pcm * opus->gain.d, -1, 1);
+        } else {
+            int16_t *pcm = (int16_t *)frame->data[0];
+            for (; i > 0; i--, pcm++)
+                *pcm = av_clip_int16(((int64_t)opus->gain.i * *pcm) >> 16);
+        }
+    }
+#endif
+
     frame->nb_samples = nb_samples;
     *got_frame_ptr    = 1;
 
@@ -148,6 +176,9 @@ static void libopus_flush(AVCodecContext *avc)
     struct libopus_context *opus = avc->priv_data;
 
     opus_multistream_decoder_ctl(opus->dec, OPUS_RESET_STATE);
+    /* The stream can have been extracted by a tool that is not Opus-aware.
+       Therefore, any packet can become the first of the stream. */
+    avc->internal->skip_samples = opus->pre_skip;
 }
 
 AVCodec ff_libopus_decoder = {
diff --git a/libavcodec/libopusenc.c b/libavcodec/libopusenc.c
index 08f0f28ac8..8ac02f9520 100644
--- a/libavcodec/libopusenc.c
+++ b/libavcodec/libopusenc.c
@@ -2,20 +2,20 @@
  * Opus encoder using libopus
  * Copyright (c) 2012 Nathan Caldwell
  *
- * This file is part of libav.
+ * This file is part of FFmpeg.
  *
- * libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,8 +65,8 @@ static const uint8_t opus_vorbis_channel_map[8][8] = {
     { 0, 6, 1, 2, 3, 4, 5, 7 },
 };
 
-/* libav to libopus channel order mapping, passed to libopus */
-static const uint8_t libav_libopus_channel_map[8][8] = {
+/* libavcodec to libopus channel order mapping, passed to libopus */
+static const uint8_t libavcodec_libopus_channel_map[8][8] = {
     { 0 },
     { 0, 1 },
     { 0, 1, 2 },
@@ -107,6 +107,13 @@ static int libopus_configure_encoder(AVCodecContext *avctx, OpusMSEncoder *enc,
 {
     int ret;
 
+    if (avctx->global_quality) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Quality-based encoding not supported, "
+               "please specify a bitrate and VBR setting.\n");
+        return AVERROR(EINVAL);
+    }
+
     ret = opus_multistream_encoder_ctl(enc, OPUS_SET_BITRATE(avctx->bit_rate));
     if (ret != OPUS_OK) {
         av_log(avctx, AV_LOG_ERROR,
@@ -149,7 +156,7 @@ static int libopus_configure_encoder(AVCodecContext *avctx, OpusMSEncoder *enc,
     return OPUS_OK;
 }
 
-static int av_cold libopus_encode_init(AVCodecContext *avctx)
+static av_cold int libopus_encode_init(AVCodecContext *avctx)
 {
     LibopusEncContext *opus = avctx->priv_data;
     const uint8_t *channel_mapping;
@@ -159,7 +166,7 @@ static int av_cold libopus_encode_init(AVCodecContext *avctx)
 
     coupled_stream_count = opus_coupled_streams[avctx->channels - 1];
     opus->stream_count   = avctx->channels - coupled_stream_count;
-    channel_mapping      = libav_libopus_channel_map[avctx->channels - 1];
+    channel_mapping      = libavcodec_libopus_channel_map[avctx->channels - 1];
 
     /* FIXME: Opus can handle up to 255 channels. However, the mapping for
      * anything greater than 8 is undefined. */
@@ -173,12 +180,12 @@ static int av_cold libopus_encode_init(AVCodecContext *avctx)
         avctx->bit_rate = 64000 * opus->stream_count +
                           32000 * coupled_stream_count;
         av_log(avctx, AV_LOG_WARNING,
-               "No bit rate set. Defaulting to %d bps.\n", avctx->bit_rate);
+               "No bit rate set. Defaulting to %"PRId64" bps.\n", (int64_t)avctx->bit_rate);
     }
 
     if (avctx->bit_rate < 500 || avctx->bit_rate > 256000 * avctx->channels) {
-        av_log(avctx, AV_LOG_ERROR, "The bit rate %d bps is unsupported. "
-               "Please choose a value between 500 and %d.\n", avctx->bit_rate,
+        av_log(avctx, AV_LOG_ERROR, "The bit rate %"PRId64" bps is unsupported. "
+               "Please choose a value between 500 and %d.\n", (int64_t)avctx->bit_rate,
                256000 * avctx->channels);
         return AVERROR(EINVAL);
     }
@@ -270,7 +277,7 @@ static int av_cold libopus_encode_init(AVCodecContext *avctx)
     }
     avctx->extradata_size = header_size;
 
-    opus->samples = av_mallocz(frame_size * avctx->channels *
+    opus->samples = av_mallocz_array(frame_size, avctx->channels *
                                av_get_bytes_per_sample(avctx->sample_fmt));
     if (!opus->samples) {
         av_log(avctx, AV_LOG_ERROR, "Failed to allocate samples buffer.\n");
@@ -307,6 +314,7 @@ static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt,
                               av_get_bytes_per_sample(avctx->sample_fmt);
     uint8_t *audio;
     int ret;
+    int discard_padding;
 
     if (frame) {
         ret = ff_af_queue_add(&opus->afq, frame);
@@ -318,7 +326,7 @@ static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt,
         } else
             audio = frame->data[0];
     } else {
-        if (!opus->afq.remaining_samples)
+        if (!opus->afq.remaining_samples || (!opus->afq.frame_alloc && !opus->afq.frame_count))
             return 0;
         audio = opus->samples;
         memset(audio, 0, opus->opts.packet_size * sample_size);
@@ -327,10 +335,8 @@ static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt,
     /* Maximum packet size taken from opusenc in opus-tools. 60ms packets
      * consist of 3 frames in one packet. The maximum frame size is 1275
      * bytes along with the largest possible packet header of 7 bytes. */
-    if (ret = ff_alloc_packet(avpkt, (1275 * 3 + 7) * opus->stream_count)) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, (1275 * 3 + 7) * opus->stream_count, 0)) < 0)
         return ret;
-    }
 
     if (avctx->sample_fmt == AV_SAMPLE_FMT_FLT)
         ret = opus_multistream_encode_float(opus->enc, (float *)audio,
@@ -352,12 +358,31 @@ static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt,
     ff_af_queue_remove(&opus->afq, opus->opts.packet_size,
                        &avpkt->pts, &avpkt->duration);
 
+    discard_padding = opus->opts.packet_size - avpkt->duration;
+    // Check if subtraction resulted in an overflow
+    if ((discard_padding < opus->opts.packet_size) != (avpkt->duration > 0)) {
+        av_free_packet(avpkt);
+        av_free(avpkt);
+        return AVERROR(EINVAL);
+    }
+    if (discard_padding > 0) {
+        uint8_t* side_data = av_packet_new_side_data(avpkt,
+                                                     AV_PKT_DATA_SKIP_SAMPLES,
+                                                     10);
+        if(!side_data) {
+            av_free_packet(avpkt);
+            av_free(avpkt);
+            return AVERROR(ENOMEM);
+        }
+        AV_WL32(side_data + 4, discard_padding);
+    }
+
     *got_packet_ptr = 1;
 
     return 0;
 }
 
-static int av_cold libopus_encode_close(AVCodecContext *avctx)
+static av_cold int libopus_encode_close(AVCodecContext *avctx)
 {
     LibopusEncContext *opus = avctx->priv_data;
 
diff --git a/libavcodec/libschroedinger.c b/libavcodec/libschroedinger.c
index 157433ea95..9f0b25c02b 100644
--- a/libavcodec/libschroedinger.c
+++ b/libavcodec/libschroedinger.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -191,9 +191,10 @@ SchroFrame *ff_create_schro_frame(AVCodecContext *avctx,
     uv_height = y_height >> (SCHRO_FRAME_FORMAT_V_SHIFT(schro_frame_fmt));
 
     p_pic = av_mallocz(sizeof(AVPicture));
-    if (!p_pic)
+    if (!p_pic || avpicture_alloc(p_pic, avctx->pix_fmt, y_width, y_height) < 0) {
+        av_free(p_pic);
         return NULL;
-    avpicture_alloc(p_pic, avctx->pix_fmt, y_width, y_height);
+    }
 
     p_frame         = schro_frame_new();
     p_frame->format = schro_frame_fmt;
diff --git a/libavcodec/libschroedinger.h b/libavcodec/libschroedinger.h
index 5481f92ce0..12fe57c7f1 100644
--- a/libavcodec/libschroedinger.h
+++ b/libavcodec/libschroedinger.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libschroedingerdec.c b/libavcodec/libschroedingerdec.c
index d2594c68fe..6ddb811813 100644
--- a/libavcodec/libschroedingerdec.c
+++ b/libavcodec/libschroedingerdec.c
@@ -2,20 +2,20 @@
  * Dirac decoder support via Schroedinger libraries
  * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,10 +37,6 @@
 #include "internal.h"
 #include "libschroedinger.h"
 
-#undef NDEBUG
-#include <assert.h>
-
-
 #include <schroedinger/schro.h>
 #include <schroedinger/schrodebug.h>
 #include <schroedinger/schrovideoformat.h>
@@ -134,7 +130,7 @@ static SchroBuffer *find_next_parse_unit(SchroParseUnitContext *parse_ctx)
 }
 
 /**
-* Returns Libav chroma format.
+* Returns FFmpeg chroma format.
 */
 static enum AVPixelFormat get_chroma_format(SchroChromaFormat schro_pix_fmt)
 {
@@ -179,7 +175,7 @@ static void libschroedinger_handle_first_access_unit(AVCodecContext *avctx)
 
     p_schro_params->format = schro_decoder_get_video_format(decoder);
 
-    /* Tell Libav about sequence details. */
+    /* Tell FFmpeg about sequence details. */
     if (av_image_check_size(p_schro_params->format->width,
                             p_schro_params->format->height, 0, avctx) < 0) {
         av_log(avctx, AV_LOG_ERROR, "invalid dimensions (%dx%d)\n",
@@ -312,10 +308,10 @@ static int libschroedinger_decode_frame(AVCodecContext *avctx,
     framewithpts = ff_schro_queue_pop(&p_schro_params->dec_frame_queue);
 
     if (framewithpts && framewithpts->frame) {
-        if (ff_get_buffer(avctx, avframe, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Unable to allocate buffer\n");
-            return AVERROR(ENOMEM);
-        }
+        int ret;
+
+        if ((ret = ff_get_buffer(avctx, avframe, 0)) < 0)
+            return ret;
 
         memcpy(avframe->data[0],
                framewithpts->frame->components[0].data,
diff --git a/libavcodec/libschroedingerenc.c b/libavcodec/libschroedingerenc.c
index ecf5e37d55..d70552ed56 100644
--- a/libavcodec/libschroedingerenc.c
+++ b/libavcodec/libschroedingerenc.c
@@ -2,20 +2,20 @@
  * Dirac encoder support via Schroedinger libraries
  * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include <schroedinger/schrovideoformat.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "libschroedinger.h"
@@ -378,10 +379,8 @@ static int libschroedinger_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     pkt_size = p_frame_output->size;
     if (last_frame_in_sequence && p_schro_params->enc_buf_size > 0)
         pkt_size += p_schro_params->enc_buf_size;
-    if ((ret = ff_alloc_packet(pkt, pkt_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", pkt_size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, pkt_size, 0)) < 0)
         goto error;
-    }
 
     memcpy(pkt->data, p_frame_output->p_encbuf, p_frame_output->size);
 #if FF_API_CODED_FRAME
diff --git a/libavcodec/libshine.c b/libavcodec/libshine.c
new file mode 100644
index 0000000000..f4cf5981bc
--- /dev/null
+++ b/libavcodec/libshine.c
@@ -0,0 +1,149 @@
+/*
+ * Interface to libshine for mp3 encoding
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <shine/layer3.h>
+
+#include "libavutil/intreadwrite.h"
+#include "audio_frame_queue.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "mpegaudio.h"
+#include "mpegaudiodecheader.h"
+
+#define BUFFER_SIZE (4096 * 20)
+
+typedef struct SHINEContext {
+    shine_config_t  config;
+    shine_t         shine;
+    uint8_t         buffer[BUFFER_SIZE];
+    int             buffer_index;
+    AudioFrameQueue afq;
+} SHINEContext;
+
+static av_cold int libshine_encode_init(AVCodecContext *avctx)
+{
+    SHINEContext *s = avctx->priv_data;
+
+    if (avctx->channels <= 0 || avctx->channels > 2){
+        av_log(avctx, AV_LOG_ERROR, "only mono or stereo is supported\n");
+        return AVERROR(EINVAL);
+    }
+
+    shine_set_config_mpeg_defaults(&s->config.mpeg);
+    if (avctx->bit_rate)
+        s->config.mpeg.bitr = avctx->bit_rate / 1000;
+    s->config.mpeg.mode = avctx->channels == 2 ? STEREO : MONO;
+    s->config.wave.samplerate = avctx->sample_rate;
+    s->config.wave.channels   = avctx->channels == 2 ? PCM_STEREO : PCM_MONO;
+    if (shine_check_config(s->config.wave.samplerate, s->config.mpeg.bitr) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "invalid configuration\n");
+        return AVERROR(EINVAL);
+    }
+    s->shine = shine_initialise(&s->config);
+    if (!s->shine)
+        return AVERROR(ENOMEM);
+    avctx->frame_size = shine_samples_per_pass(s->shine);
+    ff_af_queue_init(avctx, &s->afq);
+    return 0;
+}
+
+static int libshine_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                                 const AVFrame *frame, int *got_packet_ptr)
+{
+    SHINEContext *s = avctx->priv_data;
+    MPADecodeHeader hdr;
+    unsigned char *data;
+    int written;
+    int ret, len;
+
+    if (frame)
+        data = shine_encode_buffer(s->shine, (int16_t **)frame->data, &written);
+    else
+        data = shine_flush(s->shine, &written);
+    if (written < 0)
+        return -1;
+    if (written > 0) {
+        if (s->buffer_index + written > BUFFER_SIZE) {
+            av_log(avctx, AV_LOG_ERROR, "internal buffer too small\n");
+            return AVERROR_BUG;
+        }
+        memcpy(s->buffer + s->buffer_index, data, written);
+        s->buffer_index += written;
+    }
+    if (frame) {
+        if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
+            return ret;
+    }
+
+    if (s->buffer_index < 4 || !s->afq.frame_count)
+        return 0;
+    if (avpriv_mpegaudio_decode_header(&hdr, AV_RB32(s->buffer))) {
+        av_log(avctx, AV_LOG_ERROR, "free format output not supported\n");
+        return -1;
+    }
+
+    len = hdr.frame_size;
+    if (len <= s->buffer_index) {
+        if ((ret = ff_alloc_packet2(avctx, avpkt, len, 0)))
+            return ret;
+        memcpy(avpkt->data, s->buffer, len);
+        s->buffer_index -= len;
+        memmove(s->buffer, s->buffer + len, s->buffer_index);
+
+        ff_af_queue_remove(&s->afq, avctx->frame_size, &avpkt->pts,
+                           &avpkt->duration);
+
+        avpkt->size = len;
+        *got_packet_ptr = 1;
+    }
+    return 0;
+}
+
+static av_cold int libshine_encode_close(AVCodecContext *avctx)
+{
+    SHINEContext *s = avctx->priv_data;
+
+    ff_af_queue_close(&s->afq);
+    shine_close(s->shine);
+    return 0;
+}
+
+static const int libshine_sample_rates[] = {
+    44100, 48000, 32000, 0
+};
+
+AVCodec ff_libshine_encoder = {
+    .name                  = "libshine",
+    .long_name             = NULL_IF_CONFIG_SMALL("libshine MP3 (MPEG audio layer 3)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_MP3,
+    .priv_data_size        = sizeof(SHINEContext),
+    .init                  = libshine_encode_init,
+    .encode2               = libshine_encode_frame,
+    .close                 = libshine_encode_close,
+    .capabilities          = AV_CODEC_CAP_DELAY,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16P,
+                                                            AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = libshine_sample_rates,
+    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
+                                                  AV_CH_LAYOUT_STEREO,
+                                                  0 },
+};
diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c
index 949a9344c1..044883af73 100644
--- a/libavcodec/libspeexdec.c
+++ b/libavcodec/libspeexdec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2008 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@ typedef struct LibSpeexContext {
     SpeexStereoState stereo;
     void *dec_state;
     int frame_size;
+    int pktsize;
 } LibSpeexContext;
 
 
@@ -43,14 +44,30 @@ static av_cold int libspeex_decode_init(AVCodecContext *avctx)
     SpeexHeader *header = NULL;
     int spx_mode;
 
-    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
     if (avctx->extradata && avctx->extradata_size >= 80) {
         header = speex_packet_to_header(avctx->extradata,
                                         avctx->extradata_size);
         if (!header)
             av_log(avctx, AV_LOG_WARNING, "Invalid Speex header\n");
     }
-    if (header) {
+    if (avctx->codec_tag == MKTAG('S', 'P', 'X', 'N')) {
+        int quality;
+        if (!avctx->extradata || avctx->extradata && avctx->extradata_size < 47) {
+            av_log(avctx, AV_LOG_ERROR, "Missing or invalid extradata.\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        quality = avctx->extradata[37];
+        if (quality > 10) {
+            av_log(avctx, AV_LOG_ERROR, "Unsupported quality mode %d.\n", quality);
+            return AVERROR_PATCHWELCOME;
+        }
+
+        s->pktsize = ((const int[]){5,10,15,20,20,28,28,38,38,46,62})[quality];
+
+        spx_mode           = 0;
+    } else if (header) {
+        avctx->sample_rate = header->rate;
         avctx->channels    = header->nb_channels;
         spx_mode           = header->mode;
         speex_header_free(header);
@@ -73,8 +90,9 @@ static av_cold int libspeex_decode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "Unknown Speex mode %d", spx_mode);
         return AVERROR_INVALIDDATA;
     }
-    avctx->sample_rate = 8000 << spx_mode;
     s->frame_size      =  160 << spx_mode;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 8000 << spx_mode;
 
     if (avctx->channels < 1 || avctx->channels > 2) {
         /* libspeex can handle mono or stereo if initialized as stereo */
@@ -113,13 +131,12 @@ static int libspeex_decode_frame(AVCodecContext *avctx, void *data,
     AVFrame *frame     = data;
     int16_t *output;
     int ret, consumed = 0;
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
 
     /* get output buffer */
     frame->nb_samples = s->frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     output = (int16_t *)frame->data[0];
 
     /* if there is not enough data left for the smallest possible frame or the
@@ -133,9 +150,11 @@ static int libspeex_decode_frame(AVCodecContext *avctx, void *data,
             *got_frame_ptr = 0;
             return buf_size;
         }
+        if (s->pktsize && buf_size == 62)
+            buf_size = s->pktsize;
         /* set new buffer */
         speex_bits_read_from(&s->bits, buf, buf_size);
-        consumed = buf_size;
+        consumed = avpkt->size;
     }
 
     /* decode a single frame */
@@ -149,6 +168,8 @@ static int libspeex_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 1;
 
+    if (!avctx->bit_rate)
+        speex_decoder_ctl(s->dec_state, SPEEX_GET_BITRATE, &avctx->bit_rate);
     return consumed;
 }
 
diff --git a/libavcodec/libspeexenc.c b/libavcodec/libspeexenc.c
index 88e5d2fea4..65a84dc56d 100644
--- a/libavcodec/libspeexenc.c
+++ b/libavcodec/libspeexenc.c
@@ -2,20 +2,20 @@
  * Copyright (C) 2009 Justin Ruggles
  * Copyright (c) 2009 Xuggle Incorporated
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -76,7 +76,7 @@
  *     encodes them with just enough bits to reproduce the background noise.
  *
  * Discontinuous Transmission (DTX)
- *     DTX is an addition to VAD/VBR operation, that allows to stop transmitting
+ *     DTX is an addition to VAD/VBR operation, that makes it possible to stop transmitting
  *     completely when the background noise is stationary.
  *     In file-based operation only 5 bits are used for such frames.
  */
@@ -92,6 +92,7 @@
 #include "internal.h"
 #include "audio_frame_queue.h"
 
+/* TODO: Think about converting abr, vad, dtx and such flags to a bit field */
 typedef struct LibSpeexEncContext {
     AVClass *class;             ///< AVClass for private options
     SpeexBits bits;             ///< libspeex bitwriter context
@@ -124,10 +125,10 @@ static av_cold void print_enc_params(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_DEBUG, "  quality: %f\n", s->vbr_quality);
     } else if (s->abr) {
         av_log(avctx, AV_LOG_DEBUG, "rate control: ABR\n");
-        av_log(avctx, AV_LOG_DEBUG, "  bitrate: %d bps\n", avctx->bit_rate);
+        av_log(avctx, AV_LOG_DEBUG, "  bitrate: %"PRId64" bps\n", (int64_t)avctx->bit_rate);
     } else {
         av_log(avctx, AV_LOG_DEBUG, "rate control: CBR\n");
-        av_log(avctx, AV_LOG_DEBUG, "  bitrate: %d bps\n", avctx->bit_rate);
+        av_log(avctx, AV_LOG_DEBUG, "  bitrate: %"PRId64" bps\n", (int64_t)avctx->bit_rate);
     }
     av_log(avctx, AV_LOG_DEBUG, "complexity: %d\n",
            avctx->compression_level);
@@ -293,10 +294,8 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     /* write output if all frames for the packet have been encoded */
     if (s->pkt_frame_count == s->frames_per_packet) {
         s->pkt_frame_count = 0;
-        if ((ret = ff_alloc_packet(avpkt, speex_bits_nbytes(&s->bits)))) {
-            av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+        if ((ret = ff_alloc_packet2(avctx, avpkt, speex_bits_nbytes(&s->bits), 0)) < 0)
             return ret;
-        }
         ret = speex_bits_write(&s->bits, avpkt->data, avpkt->size);
         speex_bits_reset(&s->bits);
 
@@ -335,7 +334,7 @@ static const AVOption options[] = {
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass speex_class = {
     .class_name = "libspeex",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -364,6 +363,6 @@ AVCodec ff_libspeex_encoder = {
                                            AV_CH_LAYOUT_STEREO,
                                            0 },
     .supported_samplerates = (const int[]){ 8000, 16000, 32000, 0 },
-    .priv_class     = &class,
+    .priv_class     = &speex_class,
     .defaults       = defaults,
 };
diff --git a/libavcodec/libstagefright.cpp b/libavcodec/libstagefright.cpp
new file mode 100644
index 0000000000..07cac33bb7
--- /dev/null
+++ b/libavcodec/libstagefright.cpp
@@ -0,0 +1,591 @@
+/*
+ * Interface to the Android Stagefright library for
+ * H/W accelerated H.264 decoding
+ *
+ * Copyright (C) 2011 Mohamed Naufal
+ * Copyright (C) 2011 Martin Storsjö
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <binder/ProcessState.h>
+#include <media/stagefright/MetaData.h>
+#include <media/stagefright/MediaBufferGroup.h>
+#include <media/stagefright/MediaDebug.h>
+#include <media/stagefright/MediaDefs.h>
+#include <media/stagefright/OMXClient.h>
+#include <media/stagefright/OMXCodec.h>
+#include <utils/List.h>
+#include <new>
+#include <map>
+
+extern "C" {
+#include "avcodec.h"
+#include "libavutil/imgutils.h"
+#include "internal.h"
+}
+
+#define OMX_QCOM_COLOR_FormatYVU420SemiPlanar 0x7FA30C00
+
+using namespace android;
+
+struct Frame {
+    status_t status;
+    size_t size;
+    int64_t time;
+    int key;
+    uint8_t *buffer;
+    AVFrame *vframe;
+};
+
+struct TimeStamp {
+    int64_t pts;
+    int64_t reordered_opaque;
+};
+
+class CustomSource;
+
+struct StagefrightContext {
+    AVCodecContext *avctx;
+    AVBitStreamFilterContext *bsfc;
+    uint8_t* orig_extradata;
+    int orig_extradata_size;
+    sp<MediaSource> *source;
+    List<Frame*> *in_queue, *out_queue;
+    pthread_mutex_t in_mutex, out_mutex;
+    pthread_cond_t condition;
+    pthread_t decode_thread_id;
+
+    Frame *end_frame;
+    bool source_done;
+    volatile sig_atomic_t thread_started, thread_exited, stop_decode;
+
+    AVFrame *prev_frame;
+    std::map<int64_t, TimeStamp> *ts_map;
+    int64_t frame_index;
+
+    uint8_t *dummy_buf;
+    int dummy_bufsize;
+
+    OMXClient *client;
+    sp<MediaSource> *decoder;
+    const char *decoder_component;
+};
+
+class CustomSource : public MediaSource {
+public:
+    CustomSource(AVCodecContext *avctx, sp<MetaData> meta) {
+        s = (StagefrightContext*)avctx->priv_data;
+        source_meta = meta;
+        frame_size  = (avctx->width * avctx->height * 3) / 2;
+        buf_group.add_buffer(new MediaBuffer(frame_size));
+    }
+
+    virtual sp<MetaData> getFormat() {
+        return source_meta;
+    }
+
+    virtual status_t start(MetaData *params) {
+        return OK;
+    }
+
+    virtual status_t stop() {
+        return OK;
+    }
+
+    virtual status_t read(MediaBuffer **buffer,
+                          const MediaSource::ReadOptions *options) {
+        Frame *frame;
+        status_t ret;
+
+        if (s->thread_exited)
+            return ERROR_END_OF_STREAM;
+        pthread_mutex_lock(&s->in_mutex);
+
+        while (s->in_queue->empty())
+            pthread_cond_wait(&s->condition, &s->in_mutex);
+
+        frame = *s->in_queue->begin();
+        ret = frame->status;
+
+        if (ret == OK) {
+            ret = buf_group.acquire_buffer(buffer);
+            if (ret == OK) {
+                memcpy((*buffer)->data(), frame->buffer, frame->size);
+                (*buffer)->set_range(0, frame->size);
+                (*buffer)->meta_data()->clear();
+                (*buffer)->meta_data()->setInt32(kKeyIsSyncFrame,frame->key);
+                (*buffer)->meta_data()->setInt64(kKeyTime, frame->time);
+            } else {
+                av_log(s->avctx, AV_LOG_ERROR, "Failed to acquire MediaBuffer\n");
+            }
+            av_freep(&frame->buffer);
+        }
+
+        s->in_queue->erase(s->in_queue->begin());
+        pthread_mutex_unlock(&s->in_mutex);
+
+        av_freep(&frame);
+        return ret;
+    }
+
+private:
+    MediaBufferGroup buf_group;
+    sp<MetaData> source_meta;
+    StagefrightContext *s;
+    int frame_size;
+};
+
+void* decode_thread(void *arg)
+{
+    AVCodecContext *avctx = (AVCodecContext*)arg;
+    StagefrightContext *s = (StagefrightContext*)avctx->priv_data;
+    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+    Frame* frame;
+    MediaBuffer *buffer;
+    int32_t w, h;
+    int decode_done = 0;
+    int ret;
+    int src_linesize[3];
+    const uint8_t *src_data[3];
+    int64_t out_frame_index = 0;
+
+    do {
+        buffer = NULL;
+        frame = (Frame*)av_mallocz(sizeof(Frame));
+        if (!frame) {
+            frame         = s->end_frame;
+            frame->status = AVERROR(ENOMEM);
+            decode_done   = 1;
+            s->end_frame  = NULL;
+            goto push_frame;
+        }
+        frame->status = (*s->decoder)->read(&buffer);
+        if (frame->status == OK) {
+            sp<MetaData> outFormat = (*s->decoder)->getFormat();
+            outFormat->findInt32(kKeyWidth , &w);
+            outFormat->findInt32(kKeyHeight, &h);
+            frame->vframe = av_frame_alloc();
+            if (!frame->vframe) {
+                frame->status = AVERROR(ENOMEM);
+                decode_done   = 1;
+                buffer->release();
+                goto push_frame;
+            }
+            ret = ff_get_buffer(avctx, frame->vframe, AV_GET_BUFFER_FLAG_REF);
+            if (ret < 0) {
+                frame->status = ret;
+                decode_done   = 1;
+                buffer->release();
+                goto push_frame;
+            }
+
+            // The OMX.SEC decoder doesn't signal the modified width/height
+            if (s->decoder_component && !strncmp(s->decoder_component, "OMX.SEC", 7) &&
+                (w & 15 || h & 15)) {
+                if (((w + 15)&~15) * ((h + 15)&~15) * 3/2 == buffer->range_length()) {
+                    w = (w + 15)&~15;
+                    h = (h + 15)&~15;
+                }
+            }
+
+            if (!avctx->width || !avctx->height || avctx->width > w || avctx->height > h) {
+                avctx->width  = w;
+                avctx->height = h;
+            }
+
+            src_linesize[0] = av_image_get_linesize(avctx->pix_fmt, w, 0);
+            src_linesize[1] = av_image_get_linesize(avctx->pix_fmt, w, 1);
+            src_linesize[2] = av_image_get_linesize(avctx->pix_fmt, w, 2);
+
+            src_data[0] = (uint8_t*)buffer->data();
+            src_data[1] = src_data[0] + src_linesize[0] * h;
+            src_data[2] = src_data[1] + src_linesize[1] * -(-h>>pix_desc->log2_chroma_h);
+            av_image_copy(frame->vframe->data, frame->vframe->linesize,
+                          src_data, src_linesize,
+                          avctx->pix_fmt, avctx->width, avctx->height);
+
+            buffer->meta_data()->findInt64(kKeyTime, &out_frame_index);
+            if (out_frame_index && s->ts_map->count(out_frame_index) > 0) {
+                frame->vframe->pts = (*s->ts_map)[out_frame_index].pts;
+                frame->vframe->reordered_opaque = (*s->ts_map)[out_frame_index].reordered_opaque;
+                s->ts_map->erase(out_frame_index);
+            }
+            buffer->release();
+            } else if (frame->status == INFO_FORMAT_CHANGED) {
+                if (buffer)
+                    buffer->release();
+                av_free(frame);
+                continue;
+            } else {
+                decode_done = 1;
+            }
+push_frame:
+        while (true) {
+            pthread_mutex_lock(&s->out_mutex);
+            if (s->out_queue->size() >= 10) {
+                pthread_mutex_unlock(&s->out_mutex);
+                usleep(10000);
+                continue;
+            }
+            break;
+        }
+        s->out_queue->push_back(frame);
+        pthread_mutex_unlock(&s->out_mutex);
+    } while (!decode_done && !s->stop_decode);
+
+    s->thread_exited = true;
+
+    return 0;
+}
+
+static av_cold int Stagefright_init(AVCodecContext *avctx)
+{
+    StagefrightContext *s = (StagefrightContext*)avctx->priv_data;
+    sp<MetaData> meta, outFormat;
+    int32_t colorFormat = 0;
+    int ret;
+
+    if (!avctx->extradata || !avctx->extradata_size || avctx->extradata[0] != 1)
+        return -1;
+
+    s->avctx = avctx;
+    s->bsfc  = av_bitstream_filter_init("h264_mp4toannexb");
+    if (!s->bsfc) {
+        av_log(avctx, AV_LOG_ERROR, "Cannot open the h264_mp4toannexb BSF!\n");
+        return -1;
+    }
+
+    s->orig_extradata_size = avctx->extradata_size;
+    s->orig_extradata = (uint8_t*) av_mallocz(avctx->extradata_size +
+                                              AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!s->orig_extradata) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    memcpy(s->orig_extradata, avctx->extradata, avctx->extradata_size);
+
+    meta = new MetaData;
+    if (!meta) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    meta->setCString(kKeyMIMEType, MEDIA_MIMETYPE_VIDEO_AVC);
+    meta->setInt32(kKeyWidth, avctx->width);
+    meta->setInt32(kKeyHeight, avctx->height);
+    meta->setData(kKeyAVCC, kTypeAVCC, avctx->extradata, avctx->extradata_size);
+
+    android::ProcessState::self()->startThreadPool();
+
+    s->source    = new sp<MediaSource>();
+    *s->source   = new CustomSource(avctx, meta);
+    s->in_queue  = new List<Frame*>;
+    s->out_queue = new List<Frame*>;
+    s->ts_map    = new std::map<int64_t, TimeStamp>;
+    s->client    = new OMXClient;
+    s->end_frame = (Frame*)av_mallocz(sizeof(Frame));
+    if (s->source == NULL || !s->in_queue || !s->out_queue || !s->client ||
+        !s->ts_map || !s->end_frame) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    if (s->client->connect() !=  OK) {
+        av_log(avctx, AV_LOG_ERROR, "Cannot connect OMX client\n");
+        ret = -1;
+        goto fail;
+    }
+
+    s->decoder  = new sp<MediaSource>();
+    *s->decoder = OMXCodec::Create(s->client->interface(), meta,
+                                  false, *s->source, NULL,
+                                  OMXCodec::kClientNeedsFramebuffer);
+    if ((*s->decoder)->start() !=  OK) {
+        av_log(avctx, AV_LOG_ERROR, "Cannot start decoder\n");
+        ret = -1;
+        s->client->disconnect();
+        goto fail;
+    }
+
+    outFormat = (*s->decoder)->getFormat();
+    outFormat->findInt32(kKeyColorFormat, &colorFormat);
+    if (colorFormat == OMX_QCOM_COLOR_FormatYVU420SemiPlanar ||
+        colorFormat == OMX_COLOR_FormatYUV420SemiPlanar)
+        avctx->pix_fmt = AV_PIX_FMT_NV21;
+    else if (colorFormat == OMX_COLOR_FormatYCbYCr)
+        avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+    else if (colorFormat == OMX_COLOR_FormatCbYCrY)
+        avctx->pix_fmt = AV_PIX_FMT_UYVY422;
+    else
+        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+
+    outFormat->findCString(kKeyDecoderComponent, &s->decoder_component);
+    if (s->decoder_component)
+        s->decoder_component = av_strdup(s->decoder_component);
+
+    pthread_mutex_init(&s->in_mutex, NULL);
+    pthread_mutex_init(&s->out_mutex, NULL);
+    pthread_cond_init(&s->condition, NULL);
+    return 0;
+
+fail:
+    av_bitstream_filter_close(s->bsfc);
+    av_freep(&s->orig_extradata);
+    av_freep(&s->end_frame);
+    delete s->in_queue;
+    delete s->out_queue;
+    delete s->ts_map;
+    delete s->client;
+    return ret;
+}
+
+static int Stagefright_decode_frame(AVCodecContext *avctx, void *data,
+                                    int *got_frame, AVPacket *avpkt)
+{
+    StagefrightContext *s = (StagefrightContext*)avctx->priv_data;
+    Frame *frame;
+    status_t status;
+    int orig_size = avpkt->size;
+    AVPacket pkt = *avpkt;
+    AVFrame *ret_frame;
+
+    if (!s->thread_started) {
+        if(pthread_create(&s->decode_thread_id, NULL, &decode_thread, avctx))
+            return AVERROR(ENOMEM);
+        s->thread_started = true;
+    }
+
+    if (avpkt && avpkt->data) {
+        av_bitstream_filter_filter(s->bsfc, avctx, NULL, &pkt.data, &pkt.size,
+                                   avpkt->data, avpkt->size, avpkt->flags & AV_PKT_FLAG_KEY);
+        avpkt = &pkt;
+    }
+
+    if (!s->source_done) {
+        if(!s->dummy_buf) {
+            s->dummy_buf = (uint8_t*)av_malloc(avpkt->size);
+            if (!s->dummy_buf)
+                return AVERROR(ENOMEM);
+            s->dummy_bufsize = avpkt->size;
+            memcpy(s->dummy_buf, avpkt->data, avpkt->size);
+        }
+
+        frame = (Frame*)av_mallocz(sizeof(Frame));
+        if (avpkt->data) {
+            frame->status  = OK;
+            frame->size    = avpkt->size;
+            frame->key     = avpkt->flags & AV_PKT_FLAG_KEY ? 1 : 0;
+            frame->buffer  = (uint8_t*)av_malloc(avpkt->size);
+            if (!frame->buffer) {
+                av_freep(&frame);
+                return AVERROR(ENOMEM);
+            }
+            uint8_t *ptr = avpkt->data;
+            // The OMX.SEC decoder fails without this.
+            if (avpkt->size == orig_size + avctx->extradata_size) {
+                ptr += avctx->extradata_size;
+                frame->size = orig_size;
+            }
+            memcpy(frame->buffer, ptr, orig_size);
+            if (avpkt == &pkt)
+                av_free(avpkt->data);
+
+            frame->time = ++s->frame_index;
+            (*s->ts_map)[s->frame_index].pts = avpkt->pts;
+            (*s->ts_map)[s->frame_index].reordered_opaque = avctx->reordered_opaque;
+        } else {
+            frame->status  = ERROR_END_OF_STREAM;
+            s->source_done = true;
+        }
+
+        while (true) {
+            if (s->thread_exited) {
+                s->source_done = true;
+                break;
+            }
+            pthread_mutex_lock(&s->in_mutex);
+            if (s->in_queue->size() >= 10) {
+                pthread_mutex_unlock(&s->in_mutex);
+                usleep(10000);
+                continue;
+            }
+            s->in_queue->push_back(frame);
+            pthread_cond_signal(&s->condition);
+            pthread_mutex_unlock(&s->in_mutex);
+            break;
+        }
+    }
+    while (true) {
+        pthread_mutex_lock(&s->out_mutex);
+        if (!s->out_queue->empty()) break;
+        pthread_mutex_unlock(&s->out_mutex);
+        if (!s->source_done) {
+            usleep(10000);
+            continue;
+        } else {
+            return orig_size;
+        }
+    }
+
+    frame = *s->out_queue->begin();
+    s->out_queue->erase(s->out_queue->begin());
+    pthread_mutex_unlock(&s->out_mutex);
+
+    ret_frame = frame->vframe;
+    status  = frame->status;
+    av_freep(&frame);
+
+    if (status == ERROR_END_OF_STREAM)
+        return 0;
+    if (status != OK) {
+        if (status == AVERROR(ENOMEM))
+            return status;
+        av_log(avctx, AV_LOG_ERROR, "Decode failed: %x\n", status);
+        return -1;
+    }
+
+    if (s->prev_frame)
+        av_frame_free(&s->prev_frame);
+    s->prev_frame = ret_frame;
+
+    *got_frame = 1;
+    *(AVFrame*)data = *ret_frame;
+    return orig_size;
+}
+
+static av_cold int Stagefright_close(AVCodecContext *avctx)
+{
+    StagefrightContext *s = (StagefrightContext*)avctx->priv_data;
+    Frame *frame;
+
+    if (s->thread_started) {
+        if (!s->thread_exited) {
+            s->stop_decode = 1;
+
+            // Make sure decode_thread() doesn't get stuck
+            pthread_mutex_lock(&s->out_mutex);
+            while (!s->out_queue->empty()) {
+                frame = *s->out_queue->begin();
+                s->out_queue->erase(s->out_queue->begin());
+                if (frame->vframe)
+                    av_frame_free(&frame->vframe);
+                av_freep(&frame);
+            }
+            pthread_mutex_unlock(&s->out_mutex);
+
+            // Feed a dummy frame prior to signalling EOF.
+            // This is required to terminate the decoder(OMX.SEC)
+            // when only one frame is read during stream info detection.
+            if (s->dummy_buf && (frame = (Frame*)av_mallocz(sizeof(Frame)))) {
+                frame->status = OK;
+                frame->size   = s->dummy_bufsize;
+                frame->key    = 1;
+                frame->buffer = s->dummy_buf;
+                pthread_mutex_lock(&s->in_mutex);
+                s->in_queue->push_back(frame);
+                pthread_cond_signal(&s->condition);
+                pthread_mutex_unlock(&s->in_mutex);
+                s->dummy_buf = NULL;
+            }
+
+            pthread_mutex_lock(&s->in_mutex);
+            s->end_frame->status = ERROR_END_OF_STREAM;
+            s->in_queue->push_back(s->end_frame);
+            pthread_cond_signal(&s->condition);
+            pthread_mutex_unlock(&s->in_mutex);
+            s->end_frame = NULL;
+        }
+
+        pthread_join(s->decode_thread_id, NULL);
+
+        if (s->prev_frame)
+            av_frame_free(&s->prev_frame);
+
+        s->thread_started = false;
+    }
+
+    while (!s->in_queue->empty()) {
+        frame = *s->in_queue->begin();
+        s->in_queue->erase(s->in_queue->begin());
+        if (frame->size)
+            av_freep(&frame->buffer);
+        av_freep(&frame);
+    }
+
+    while (!s->out_queue->empty()) {
+        frame = *s->out_queue->begin();
+        s->out_queue->erase(s->out_queue->begin());
+        if (frame->vframe)
+            av_frame_free(&frame->vframe);
+        av_freep(&frame);
+    }
+
+    (*s->decoder)->stop();
+    s->client->disconnect();
+
+    if (s->decoder_component)
+        av_freep(&s->decoder_component);
+    av_freep(&s->dummy_buf);
+    av_freep(&s->end_frame);
+
+    // Reset the extradata back to the original mp4 format, so that
+    // the next invocation (both when decoding and when called from
+    // av_find_stream_info) get the original mp4 format extradata.
+    av_freep(&avctx->extradata);
+    avctx->extradata = s->orig_extradata;
+    avctx->extradata_size = s->orig_extradata_size;
+
+    delete s->in_queue;
+    delete s->out_queue;
+    delete s->ts_map;
+    delete s->client;
+    delete s->decoder;
+    delete s->source;
+
+    pthread_mutex_destroy(&s->in_mutex);
+    pthread_mutex_destroy(&s->out_mutex);
+    pthread_cond_destroy(&s->condition);
+    av_bitstream_filter_close(s->bsfc);
+    return 0;
+}
+
+AVCodec ff_libstagefright_h264_decoder = {
+    "libstagefright_h264",
+    NULL_IF_CONFIG_SMALL("libstagefright H.264"),
+    AVMEDIA_TYPE_VIDEO,
+    AV_CODEC_ID_H264,
+    AV_CODEC_CAP_DELAY,
+    NULL, //supported_framerates
+    NULL, //pix_fmts
+    NULL, //supported_samplerates
+    NULL, //sample_fmts
+    NULL, //channel_layouts
+    0,    //max_lowres
+    NULL, //priv_class
+    NULL, //profiles
+    sizeof(StagefrightContext),
+    NULL, //next
+    NULL, //init_thread_copy
+    NULL, //update_thread_context
+    NULL, //defaults
+    NULL, //init_static_data
+    Stagefright_init,
+    NULL, //encode
+    NULL, //encode2
+    Stagefright_decode_frame,
+    Stagefright_close,
+};
diff --git a/libavcodec/libtheoraenc.c b/libavcodec/libtheoraenc.c
index 9ed9a50493..c581b34e3b 100644
--- a/libavcodec/libtheoraenc.c
+++ b/libavcodec/libtheoraenc.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2006 Paul Richards <paul.richards@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
  * and o_ prefixes on variables which are libogg types.
  */
 
-/* Libav includes */
+/* FFmpeg includes */
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/pixdesc.h"
@@ -96,13 +96,14 @@ static int get_stats(AVCodecContext *avctx, int eos)
     bytes = th_encode_ctl(h->t_state, TH_ENCCTL_2PASS_OUT, &buf, sizeof(buf));
     if (bytes < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error getting first pass stats\n");
-        return -1;
+        return AVERROR_EXTERNAL;
     }
     if (!eos) {
-        h->stats = av_fast_realloc(h->stats, &h->stats_size,
+        void *tmp = av_fast_realloc(h->stats, &h->stats_size,
                                    h->stats_offset + bytes);
-        if (!h->stats)
+        if (!tmp)
             return AVERROR(ENOMEM);
+        h->stats = tmp;
         memcpy(h->stats + h->stats_offset, buf, bytes);
         h->stats_offset += bytes;
     } else {
@@ -117,7 +118,7 @@ static int get_stats(AVCodecContext *avctx, int eos)
     return 0;
 #else
     av_log(avctx, AV_LOG_ERROR, "libtheora too old to support 2pass\n");
-    return -1;
+    return AVERROR(ENOSUP);
 #endif
 }
 
@@ -131,12 +132,14 @@ static int submit_stats(AVCodecContext *avctx)
     if (!h->stats) {
         if (!avctx->stats_in) {
             av_log(avctx, AV_LOG_ERROR, "No statsfile for second pass\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
         h->stats_size = strlen(avctx->stats_in) * 3/4;
         h->stats      = av_malloc(h->stats_size);
-        if (!h->stats)
+        if (!h->stats) {
+            h->stats_size = 0;
             return AVERROR(ENOMEM);
+        }
         h->stats_size = av_base64_decode(h->stats, avctx->stats_in, h->stats_size);
     }
     while (h->stats_size - h->stats_offset > 0) {
@@ -145,7 +148,7 @@ static int submit_stats(AVCodecContext *avctx)
                               h->stats_size - h->stats_offset);
         if (bytes < 0) {
             av_log(avctx, AV_LOG_ERROR, "Error submitting stats\n");
-            return -1;
+            return AVERROR_EXTERNAL;
         }
         if (!bytes)
             return 0;
@@ -154,7 +157,7 @@ static int submit_stats(AVCodecContext *avctx)
     return 0;
 #else
     av_log(avctx, AV_LOG_ERROR, "libtheora too old to support 2pass\n");
-    return -1;
+    return AVERROR(ENOSUP);
 #endif
 }
 
@@ -166,6 +169,7 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     unsigned int offset;
     TheoraContext *h = avc_context->priv_data;
     uint32_t gop_size = avc_context->gop_size;
+    int ret;
 
     /* Set up the theora_info struct */
     th_info_init(&t_info);
@@ -202,17 +206,16 @@ static av_cold int encode_init(AVCodecContext* avc_context)
         t_info.pixel_fmt = TH_PF_444;
     else {
         av_log(avc_context, AV_LOG_ERROR, "Unsupported pix_fmt\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
-    av_pix_fmt_get_chroma_sub_sample(avc_context->pix_fmt,
-                                     &h->uv_hshift, &h->uv_vshift);
+    avcodec_get_chroma_sub_sample(avc_context->pix_fmt, &h->uv_hshift, &h->uv_vshift);
 
     if (avc_context->flags & AV_CODEC_FLAG_QSCALE) {
-        /* to be constant with the libvorbis implementation, clip global_quality to 0 - 10
-           Theora accepts a quality parameter p, which is:
-                * 0 <= p <=63
-                * an int value
-         */
+        /* Clip global_quality in QP units to the [0 - 10] range
+           to be consistent with the libvorbis implementation.
+           Theora accepts a quality parameter which is an int value in
+           the [0 - 63] range.
+        */
         t_info.quality        = av_clipf(avc_context->global_quality / (float)FF_QP2LAMBDA, 0, 10) * 6.3;
         t_info.target_bitrate = 0;
     } else {
@@ -224,7 +227,7 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     h->t_state = th_encode_alloc(&t_info);
     if (!h->t_state) {
         av_log(avc_context, AV_LOG_ERROR, "theora_encode_init failed\n");
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     h->keyframe_mask = (1 << t_info.keyframe_granule_shift) - 1;
@@ -234,16 +237,16 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     if (th_encode_ctl(h->t_state, TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE,
                       &gop_size, sizeof(gop_size))) {
         av_log(avc_context, AV_LOG_ERROR, "Error setting GOP size\n");
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     // need to enable 2 pass (via TH_ENCCTL_2PASS_) before encoding headers
     if (avc_context->flags & AV_CODEC_FLAG_PASS1) {
-        if (get_stats(avc_context, 0))
-            return -1;
+        if ((ret = get_stats(avc_context, 0)) < 0)
+            return ret;
     } else if (avc_context->flags & AV_CODEC_FLAG_PASS2) {
-        if (submit_stats(avc_context))
-            return -1;
+        if ((ret = submit_stats(avc_context)) < 0)
+            return ret;
     }
 
     /*
@@ -259,8 +262,8 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     th_comment_init(&t_comment);
 
     while (th_encode_flushheader(h->t_state, &t_comment, &o_packet))
-        if (concatenate_packet(&offset, avc_context, &o_packet))
-            return -1;
+        if ((ret = concatenate_packet(&offset, avc_context, &o_packet)) < 0)
+            return ret;
 
     th_comment_clear(&t_comment);
 
@@ -279,8 +282,8 @@ static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
     if (!frame) {
         th_encode_packetout(h->t_state, 1, &o_packet);
         if (avc_context->flags & AV_CODEC_FLAG_PASS1)
-            if (get_stats(avc_context, 1))
-                return -1;
+            if ((ret = get_stats(avc_context, 1)) < 0)
+                return ret;
         return 0;
     }
 
@@ -293,8 +296,8 @@ static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
     }
 
     if (avc_context->flags & AV_CODEC_FLAG_PASS2)
-        if (submit_stats(avc_context))
-            return -1;
+        if ((ret = submit_stats(avc_context)) < 0)
+            return ret;
 
     /* Now call into theora_encode_YUVin */
     result = th_encode_ycbcr_in(h->t_state, t_yuv_buffer);
@@ -312,12 +315,12 @@ static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
             break;
         }
         av_log(avc_context, AV_LOG_ERROR, "theora_encode_YUVin failed (%s) [%d]\n", message, result);
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     if (avc_context->flags & AV_CODEC_FLAG_PASS1)
-        if (get_stats(avc_context, 0))
-            return -1;
+        if ((ret = get_stats(avc_context, 0)) < 0)
+            return ret;
 
     /* Pick up returned ogg_packet */
     result = th_encode_packetout(h->t_state, 0, &o_packet);
@@ -330,14 +333,12 @@ static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
         break;
     default:
         av_log(avc_context, AV_LOG_ERROR, "theora_encode_packetout failed [%d]\n", result);
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     /* Copy ogg_packet content out to buffer */
-    if ((ret = ff_alloc_packet(pkt, o_packet.bytes)) < 0) {
-        av_log(avc_context, AV_LOG_ERROR, "Error getting output packet of size %ld.\n", o_packet.bytes);
+    if ((ret = ff_alloc_packet2(avc_context, pkt, o_packet.bytes, 0)) < 0)
         return ret;
-    }
     memcpy(pkt->data, o_packet.packet, o_packet.bytes);
 
     // HACK: assumes no encoder delay, this is true until libtheora becomes
diff --git a/libavcodec/libtwolame.c b/libavcodec/libtwolame.c
index 714c30acde..12d71e7acb 100644
--- a/libavcodec/libtwolame.c
+++ b/libavcodec/libtwolame.c
@@ -2,20 +2,20 @@
  * Interface to libtwolame for mp2 encoding
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,6 +77,10 @@ static av_cold int twolame_encode_init(AVCodecContext *avctx)
     twolame_set_num_channels(s->glopts, avctx->channels);
     twolame_set_in_samplerate(s->glopts, avctx->sample_rate);
     twolame_set_out_samplerate(s->glopts, avctx->sample_rate);
+
+    if (!avctx->bit_rate)
+        avctx->bit_rate = avctx->sample_rate < 28000 ? 160000 : 384000;
+
     if (avctx->flags & AV_CODEC_FLAG_QSCALE || !avctx->bit_rate) {
         twolame_set_VBR(s->glopts, TRUE);
         twolame_set_VBR_level(s->glopts,
@@ -102,7 +106,7 @@ static int twolame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     TWOLAMEContext *s = avctx->priv_data;
     int ret;
 
-    if ((ret = ff_alloc_packet(avpkt, MPA_MAX_CODED_FRAME_SIZE)) < 0)
+    if ((ret = ff_alloc_packet2(avctx, avpkt, MPA_MAX_CODED_FRAME_SIZE, 0)) < 0)
         return ret;
 
     if (frame) {
@@ -190,7 +194,7 @@ static const AVClass twolame_class = {
 };
 
 static const AVCodecDefault twolame_defaults[] = {
-    { "b", "384000" },
+    { "b", "0" },
     { NULL },
 };
 
diff --git a/libavcodec/libutvideo.h b/libavcodec/libutvideo.h
new file mode 100644
index 0000000000..5fb1174c84
--- /dev/null
+++ b/libavcodec/libutvideo.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2011-2012 Derek Buitenhuis
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation;
+ * version 2 of the License.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Known FOURCCs:
+ *     'ULY0' (YCbCr 4:2:0), 'ULY2' (YCbCr 4:2:2), 'ULRG' (RGB), 'ULRA' (RGBA),
+ *     'ULH0' (YCbCr 4:2:0 BT.709), 'ULH2' (YCbCr 4:2:2 BT.709)
+ */
+
+#ifndef AVCODEC_LIBUTVIDEO_H
+#define AVCODEC_LIBUTVIDEO_H
+
+#include <stdlib.h>
+#include <utvideo/utvideo.h>
+#include <utvideo/Codec.h>
+
+/*
+ * Ut Video version 12.0.0 changed the RGB format names and removed
+ * the _WIN names, so if the new names are absent, define them
+ * against the old names so compatibility with pre-v12 versions
+ * is maintained.
+ */
+#if !defined(UTVF_NFCC_BGR_BU)
+#define UTVF_NFCC_BGR_BU UTVF_RGB24_WIN
+#endif
+
+#if !defined(UTVF_NFCC_BGRA_BU)
+#define UTVF_NFCC_BGRA_BU UTVF_RGB32_WIN
+#endif
+
+/*
+ * Ut Video version 13.0.1 introduced new BT.709 variants.
+ * Special-case these and only use them if v13 is detected.
+ */
+#if defined(UTVF_HDYC)
+#define UTV_BT709
+#endif
+
+typedef struct {
+    uint32_t version;
+    uint32_t original_format;
+    uint32_t frameinfo_size;
+    uint32_t flags;
+} UtVideoExtra;
+
+typedef struct {
+    CCodec *codec;
+    unsigned int buf_size;
+    uint8_t *buffer;
+} UtVideoContext;
+
+#endif /* AVCODEC_LIBUTVIDEO_H */
diff --git a/libavcodec/libutvideodec.cpp b/libavcodec/libutvideodec.cpp
new file mode 100644
index 0000000000..47261a6c7f
--- /dev/null
+++ b/libavcodec/libutvideodec.cpp
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2011 Derek Buitenhuis
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation;
+ * version 2 of the License.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Known FOURCCs:
+ *     'ULY0' (YCbCr 4:2:0), 'ULY2' (YCbCr 4:2:2), 'ULRG' (RGB), 'ULRA' (RGBA),
+ *     'ULH0' (YCbCr 4:2:0 BT.709), 'ULH2' (YCbCr 4:2:2 BT.709)
+ */
+
+extern "C" {
+#include "avcodec.h"
+}
+
+#include "libutvideo.h"
+#include "get_bits.h"
+
+static av_cold int utvideo_decode_init(AVCodecContext *avctx)
+{
+    UtVideoContext *utv = (UtVideoContext *)avctx->priv_data;
+    UtVideoExtra info;
+    int format;
+    int begin_ret;
+
+    if (avctx->extradata_size != 16 && avctx->extradata_size != 8 ) {
+        av_log(avctx, AV_LOG_ERROR, "Extradata size (%d) mismatch.\n", avctx->extradata_size);
+        return -1;
+    }
+
+    /* Read extradata */
+    info.version = AV_RL32(avctx->extradata);
+    info.original_format = AV_RL32(avctx->extradata + 4);
+    info.frameinfo_size = AV_RL32(avctx->extradata + 8);
+    info.flags = AV_RL32(avctx->extradata + 12);
+
+    /* Pick format based on FOURCC */
+    switch (avctx->codec_tag) {
+#ifdef UTV_BT709
+    case MKTAG('U', 'L', 'H', '0'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        avctx->colorspace = AVCOL_SPC_BT709;
+        format = UTVF_YV12;
+        break;
+    case MKTAG('U', 'L', 'H', '2'):
+        avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+        avctx->colorspace = AVCOL_SPC_BT709;
+        format = UTVF_YUY2;
+        break;
+#endif
+    case MKTAG('U', 'L', 'Y', '0'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        format = UTVF_YV12;
+        break;
+    case MKTAG('U', 'L', 'Y', '2'):
+        avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+        format = UTVF_YUY2;
+        break;
+    case MKTAG('U', 'L', 'R', 'G'):
+        avctx->pix_fmt = AV_PIX_FMT_BGR24;
+        format = UTVF_NFCC_BGR_BU;
+        break;
+    case MKTAG('U', 'L', 'R', 'A'):
+        avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        format = UTVF_NFCC_BGRA_BU;
+        break;
+#ifdef UTVF_UQY2
+    case MKTAG('U', 'Q', 'Y', '2'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+        format = UTVF_v210;
+        break;
+#endif
+    default:
+        av_log(avctx, AV_LOG_ERROR,
+              "Not a Ut Video FOURCC: %X\n", avctx->codec_tag);
+        return -1;
+    }
+
+    /* Only allocate the buffer once */
+    utv->buf_size = avpicture_get_size(avctx->pix_fmt, avctx->width, avctx->height);
+#ifdef UTVF_UQY2
+    if (format == UTVF_v210)
+        utv->buf_size += avctx->height * ((avctx->width + 47) / 48) * 128; // the linesize used by the decoder, this does not seem to be exported
+#endif
+    utv->buffer = (uint8_t *)av_malloc(utv->buf_size * sizeof(uint8_t));
+
+    if (utv->buffer == NULL) {
+        av_log(avctx, AV_LOG_ERROR, "Unable to allocate output buffer.\n");
+        return -1;
+    }
+
+    /* Allocate the output frame */
+    avctx->coded_frame = av_frame_alloc();
+
+    /* Ut Video only supports 8-bit */
+    avctx->bits_per_raw_sample = 8;
+
+    /* Is it interlaced? */
+    avctx->coded_frame->interlaced_frame = info.flags & 0x800 ? 1 : 0;
+
+    /* Apparently Ut Video doesn't store this info... */
+    avctx->coded_frame->top_field_first = 1;
+
+    /*
+     * Create a Ut Video instance. Since the function wants
+     * an "interface name" string, pass it the name of the lib.
+     */
+    utv->codec = CCodec::CreateInstance(UNFCC(avctx->codec_tag), "libavcodec");
+
+    /* Initialize Decoding */
+    begin_ret = utv->codec->DecodeBegin(format, avctx->width, avctx->height,
+                            CBGROSSWIDTH_WINDOWS, &info, sizeof(UtVideoExtra));
+
+    /* Check to see if the decoder initlized properly */
+    if (begin_ret != 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Could not initialize decoder: %d\n", begin_ret);
+        return -1;
+    }
+
+    return 0;
+}
+
+static int utvideo_decode_frame(AVCodecContext *avctx, void *data,
+                                int *got_frame, AVPacket *avpkt)
+{
+    UtVideoContext *utv = (UtVideoContext *)avctx->priv_data;
+    AVFrame *pic = avctx->coded_frame;
+    int w = avctx->width, h = avctx->height;
+
+    /* Set flags */
+    pic->pict_type = AV_PICTURE_TYPE_I;
+    pic->key_frame = 1;
+
+    /* Decode the frame */
+    utv->codec->DecodeFrame(utv->buffer, avpkt->data, true);
+
+    /* Set the output data depending on the colorspace */
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUV420P:
+        pic->linesize[0] = w;
+        pic->linesize[1] = pic->linesize[2] = w / 2;
+        pic->data[0] = utv->buffer;
+        pic->data[2] = utv->buffer + (w * h);
+        pic->data[1] = pic->data[2] + (w * h / 4);
+        break;
+    case AV_PIX_FMT_YUYV422:
+        pic->linesize[0] = w * 2;
+        pic->data[0] = utv->buffer;
+        break;
+    case AV_PIX_FMT_YUV422P10: {
+        uint16_t *y, *u, *v;
+        int i,j;
+        int linesize = ((w + 47) / 48) * 128;
+
+        pic->linesize[0] = w * 2;
+        pic->linesize[1] =
+        pic->linesize[2] = w;
+        pic->data[0] = utv->buffer + linesize * h;
+        pic->data[1] = pic->data[0] + h*pic->linesize[0];
+        pic->data[2] = pic->data[1] + h*pic->linesize[1];
+        y = (uint16_t*)pic->data[0];
+        u = (uint16_t*)pic->data[1];
+        v = (uint16_t*)pic->data[2];
+        for (j = 0; j < h; j++) {
+            const uint8_t *in = utv->buffer + j * linesize;
+
+            for (i = 0; i + 1 < w; i += 6, in += 4) {
+                unsigned a,b;
+                a = AV_RL32(in);
+                in += 4;
+                b = AV_RL32(in);
+                *u++ = (a    ) & 0x3FF;
+                *y++ = (a>>10) & 0x3FF;
+                *v++ = (a>>20) & 0x3FF;
+                *y++ = (b    ) & 0x3FF;
+
+                if (i + 3 >= w)
+                    break;
+
+                in += 4;
+                a = AV_RL32(in);
+                *u++ = (b>>10) & 0x3FF;
+                *y++ = (b>>20) & 0x3FF;
+                *v++ = (a    ) & 0x3FF;
+                *y++ = (a>>10) & 0x3FF;
+
+                if (i + 5 >= w)
+                    break;
+
+                in += 4;
+                b = AV_RL32(in);
+                *u++ = (a>>20) & 0x3FF;
+                *y++ = (b    ) & 0x3FF;
+                *v++ = (b>>10) & 0x3FF;
+                *y++ = (b>>20) & 0x3FF;
+            }
+        }
+        break;
+    }
+    case AV_PIX_FMT_BGR24:
+    case AV_PIX_FMT_RGB32:
+        /* Make the linesize negative, since Ut Video uses bottom-up BGR */
+        pic->linesize[0] = -1 * w * (avctx->pix_fmt == AV_PIX_FMT_BGR24 ? 3 : 4);
+        pic->data[0] = utv->buffer + utv->buf_size + pic->linesize[0];
+        break;
+    }
+
+    *got_frame = 1;
+    av_frame_move_ref((AVFrame*)data, pic);
+
+    return avpkt->size;
+}
+
+static av_cold int utvideo_decode_close(AVCodecContext *avctx)
+{
+    UtVideoContext *utv = (UtVideoContext *)avctx->priv_data;
+
+    /* Free output */
+    av_frame_free(&avctx->coded_frame);
+    av_freep(&utv->buffer);
+
+    /* Finish decoding and clean up the instance */
+    utv->codec->DecodeEnd();
+    CCodec::DeleteInstance(utv->codec);
+
+    return 0;
+}
+
+AVCodec ff_libutvideo_decoder = {
+    "libutvideo",
+    NULL_IF_CONFIG_SMALL("Ut Video"),
+    AVMEDIA_TYPE_VIDEO,
+    AV_CODEC_ID_UTVIDEO,
+    0,    //capabilities
+    NULL, //supported_framerates
+    NULL, //pix_fmts
+    NULL, //supported_samplerates
+    NULL, //sample_fmts
+    NULL, //channel_layouts
+    0,    //max_lowres
+    NULL, //priv_class
+    NULL, //profiles
+    sizeof(UtVideoContext),
+    NULL, //next
+    NULL, //init_thread_copy
+    NULL, //update_thread_context
+    NULL, //defaults
+    NULL, //init_static_data
+    utvideo_decode_init,
+    NULL, //encode
+    NULL, //encode2
+    utvideo_decode_frame,
+    utvideo_decode_close,
+};
diff --git a/libavcodec/libutvideoenc.cpp b/libavcodec/libutvideoenc.cpp
new file mode 100644
index 0000000000..87462470e2
--- /dev/null
+++ b/libavcodec/libutvideoenc.cpp
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2012 Derek Buitenhuis
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation;
+ * version 2 of the License.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Known FOURCCs:
+ *     'ULY0' (YCbCr 4:2:0), 'ULY2' (YCbCr 4:2:2), 'ULRG' (RGB), 'ULRA' (RGBA),
+ *     'ULH0' (YCbCr 4:2:0 BT.709), 'ULH2' (YCbCr 4:2:2 BT.709)
+ */
+
+extern "C" {
+#include "libavutil/avassert.h"
+#include "avcodec.h"
+#include "internal.h"
+}
+
+#include "libutvideo.h"
+#include "put_bits.h"
+
+static av_cold int utvideo_encode_init(AVCodecContext *avctx)
+{
+    UtVideoContext *utv = (UtVideoContext *)avctx->priv_data;
+    UtVideoExtra *info;
+    uint32_t flags, in_format;
+    int ret;
+
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUV420P:
+        in_format = UTVF_YV12;
+        avctx->bits_per_coded_sample = 12;
+        if (avctx->colorspace == AVCOL_SPC_BT709)
+            avctx->codec_tag = MKTAG('U', 'L', 'H', '0');
+        else
+            avctx->codec_tag = MKTAG('U', 'L', 'Y', '0');
+        break;
+    case AV_PIX_FMT_YUYV422:
+        in_format = UTVF_YUYV;
+        avctx->bits_per_coded_sample = 16;
+        if (avctx->colorspace == AVCOL_SPC_BT709)
+            avctx->codec_tag = MKTAG('U', 'L', 'H', '2');
+        else
+            avctx->codec_tag = MKTAG('U', 'L', 'Y', '2');
+        break;
+    case AV_PIX_FMT_BGR24:
+        in_format = UTVF_NFCC_BGR_BU;
+        avctx->bits_per_coded_sample = 24;
+        avctx->codec_tag = MKTAG('U', 'L', 'R', 'G');
+        break;
+    case AV_PIX_FMT_RGB32:
+        in_format = UTVF_NFCC_BGRA_BU;
+        avctx->bits_per_coded_sample = 32;
+        avctx->codec_tag = MKTAG('U', 'L', 'R', 'A');
+        break;
+    default:
+        return AVERROR(EINVAL);
+    }
+
+    /* Check before we alloc anything */
+    if (avctx->prediction_method != 0 && avctx->prediction_method != 2) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid prediction method.\n");
+        return AVERROR(EINVAL);
+    }
+
+    flags = ((avctx->prediction_method + 1) << 8) | (avctx->thread_count - 1);
+
+    avctx->priv_data = utv;
+
+    /* Alloc extradata buffer */
+    info = (UtVideoExtra *)av_malloc(sizeof(*info));
+
+    if (!info) {
+        av_log(avctx, AV_LOG_ERROR, "Could not allocate extradata buffer.\n");
+        return AVERROR(ENOMEM);
+    }
+
+    /*
+     * We use this buffer to hold the data that Ut Video returns,
+     * since we cannot decode planes separately with it.
+     */
+    ret = avpicture_get_size(avctx->pix_fmt, avctx->width, avctx->height);
+    if (ret < 0) {
+        av_free(info);
+        return ret;
+    }
+    utv->buf_size = ret;
+
+    utv->buffer = (uint8_t *)av_malloc(utv->buf_size);
+
+    if (utv->buffer == NULL) {
+        av_log(avctx, AV_LOG_ERROR, "Could not allocate output buffer.\n");
+        av_free(info);
+        return AVERROR(ENOMEM);
+    }
+
+    /*
+     * Create a Ut Video instance. Since the function wants
+     * an "interface name" string, pass it the name of the lib.
+     */
+    utv->codec = CCodec::CreateInstance(UNFCC(avctx->codec_tag), "libavcodec");
+
+    /* Initialize encoder */
+    utv->codec->EncodeBegin(in_format, avctx->width, avctx->height,
+                            CBGROSSWIDTH_WINDOWS);
+
+    /* Get extradata from encoder */
+    avctx->extradata_size = utv->codec->EncodeGetExtraDataSize();
+    utv->codec->EncodeGetExtraData(info, avctx->extradata_size, in_format,
+                                   avctx->width, avctx->height,
+                                   CBGROSSWIDTH_WINDOWS);
+    avctx->extradata = (uint8_t *)info;
+
+    /* Set flags */
+    utv->codec->SetState(&flags, sizeof(flags));
+
+    return 0;
+}
+
+static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                const AVFrame *pic, int *got_packet)
+{
+    UtVideoContext *utv = (UtVideoContext *)avctx->priv_data;
+    int w = avctx->width, h = avctx->height;
+    int ret, rgb_size, i;
+    bool keyframe;
+    uint8_t *y, *u, *v;
+    uint8_t *dst;
+
+    /* Alloc buffer */
+    if ((ret = ff_alloc_packet2(avctx, pkt, utv->buf_size, 0)) < 0)
+        return ret;
+
+    dst = pkt->data;
+
+    /* Move input if needed data into Ut Video friendly buffer */
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUV420P:
+        y = utv->buffer;
+        u = y + w * h;
+        v = u + w * h / 4;
+        for (i = 0; i < h; i++) {
+            memcpy(y, pic->data[0] + i * pic->linesize[0], w);
+            y += w;
+        }
+        for (i = 0; i < h / 2; i++) {
+            memcpy(u, pic->data[2] + i * pic->linesize[2], w >> 1);
+            memcpy(v, pic->data[1] + i * pic->linesize[1], w >> 1);
+            u += w >> 1;
+            v += w >> 1;
+        }
+        break;
+    case AV_PIX_FMT_YUYV422:
+        for (i = 0; i < h; i++)
+            memcpy(utv->buffer + i * (w << 1),
+                   pic->data[0] + i * pic->linesize[0], w << 1);
+        break;
+    case AV_PIX_FMT_BGR24:
+    case AV_PIX_FMT_RGB32:
+        /* Ut Video takes bottom-up BGR */
+        rgb_size = avctx->pix_fmt == AV_PIX_FMT_BGR24 ? 3 : 4;
+        for (i = 0; i < h; i++)
+            memcpy(utv->buffer + (h - i - 1) * w * rgb_size,
+                   pic->data[0] + i * pic->linesize[0],
+                   w * rgb_size);
+        break;
+    default:
+        return AVERROR(EINVAL);
+    }
+
+    /* Encode frame */
+    pkt->size = utv->codec->EncodeFrame(dst, &keyframe, utv->buffer);
+
+    if (!pkt->size) {
+        av_log(avctx, AV_LOG_ERROR, "EncodeFrame failed!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /*
+     * Ut Video is intra-only and every frame is a keyframe,
+     * and the API always returns true. In case something
+     * durastic changes in the future, such as inter support,
+     * assert that this is true.
+     */
+    av_assert2(keyframe == true);
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int utvideo_encode_close(AVCodecContext *avctx)
+{
+    UtVideoContext *utv = (UtVideoContext *)avctx->priv_data;
+
+    av_freep(&avctx->extradata);
+    av_freep(&utv->buffer);
+
+    utv->codec->EncodeEnd();
+    CCodec::DeleteInstance(utv->codec);
+
+    return 0;
+}
+
+AVCodec ff_libutvideo_encoder = {
+    "libutvideo",
+    NULL_IF_CONFIG_SMALL("Ut Video"),
+    AVMEDIA_TYPE_VIDEO,
+    AV_CODEC_ID_UTVIDEO,
+    AV_CODEC_CAP_AUTO_THREADS | AV_CODEC_CAP_LOSSLESS | AV_CODEC_CAP_INTRA_ONLY,
+    NULL, /* supported_framerates */
+    (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUYV422, AV_PIX_FMT_BGR24,
+        AV_PIX_FMT_RGB32, AV_PIX_FMT_NONE
+    },
+    NULL, /* supported_samplerates */
+    NULL, /* sample_fmts */
+    NULL, /* channel_layouts */
+    0,    /* max_lowres */
+    NULL, /* priv_class */
+    NULL, /* profiles */
+    sizeof(UtVideoContext),
+    NULL, /* next */
+    NULL, /* init_thread_copy */
+    NULL, /* update_thread_context */
+    NULL, /* defaults */
+    NULL, /* init_static_data */
+    utvideo_encode_init,
+    NULL, /* encode */
+    utvideo_encode_frame,
+    NULL, /* decode */
+    utvideo_encode_close,
+    NULL, /* flush */
+};
diff --git a/libavcodec/libvo-aacenc.c b/libavcodec/libvo-aacenc.c
index 876ef4c197..7f21ad2e0d 100644
--- a/libavcodec/libvo-aacenc.c
+++ b/libavcodec/libvo-aacenc.c
@@ -2,20 +2,20 @@
  * AAC encoder wrapper
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,7 +65,7 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
     s->last_frame     = 2;
     ff_af_queue_init(avctx, &s->afq);
 
-    s->end_buffer = av_mallocz(avctx->frame_size * avctx->channels * 2);
+    s->end_buffer = av_mallocz_array(avctx->channels, avctx->frame_size * 2);
     if (!s->end_buffer) {
         ret = AVERROR(ENOMEM);
         goto error;
@@ -153,10 +153,8 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             return ret;
     }
 
-    if ((ret = ff_alloc_packet(avpkt, FFMAX(8192, 768 * avctx->channels)))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, FFMAX(8192, 768 * avctx->channels), 0)) < 0)
         return ret;
-    }
 
     input.Buffer  = samples;
     input.Length  = 2 * avctx->channels * avctx->frame_size;
@@ -179,6 +177,13 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     return 0;
 }
 
+/* duplicated from avpriv_mpeg4audio_sample_rates to avoid shared build
+ * failures */
+static const int mpeg4audio_sample_rates[16] = {
+    96000, 88200, 64000, 48000, 44100, 32000,
+    24000, 22050, 16000, 12000, 11025, 8000, 7350
+};
+
 AVCodec ff_libvo_aacenc_encoder = {
     .name           = "libvo_aacenc",
     .long_name      = NULL_IF_CONFIG_SMALL("Android VisualOn AAC (Advanced Audio Coding)"),
@@ -188,6 +193,7 @@ AVCodec ff_libvo_aacenc_encoder = {
     .init           = aac_encode_init,
     .encode2        = aac_encode_frame,
     .close          = aac_encode_close,
+    .supported_samplerates = mpeg4audio_sample_rates,
     .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
diff --git a/libavcodec/libvo-amrwbenc.c b/libavcodec/libvo-amrwbenc.c
index da3941b312..92fa185081 100644
--- a/libavcodec/libvo-amrwbenc.c
+++ b/libavcodec/libvo-amrwbenc.c
@@ -1,21 +1,21 @@
 /*
  * AMR Audio encoder stub
- * Copyright (c) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,7 +45,7 @@ static const AVOption options[] = {
     { NULL }
 };
 
-static const AVClass class = {
+static const AVClass amrwb_class = {
     "libvo_amrwbenc", av_default_item_name, options, LIBAVUTIL_VERSION_INT
 };
 
@@ -79,7 +79,7 @@ static av_cold int amr_wb_encode_init(AVCodecContext *avctx)
 {
     AMRWBContext *s = avctx->priv_data;
 
-    if (avctx->sample_rate != 16000) {
+    if (avctx->sample_rate != 16000 && avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
         av_log(avctx, AV_LOG_ERROR, "Only 16000Hz sample rate supported\n");
         return AVERROR(ENOSYS);
     }
@@ -115,10 +115,8 @@ static int amr_wb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     const int16_t *samples = (const int16_t *)frame->data[0];
     int size, ret;
 
-    if ((ret = ff_alloc_packet(avpkt, MAX_PACKET_SIZE))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, MAX_PACKET_SIZE, 0)) < 0)
         return ret;
-    }
 
     if (s->last_bitrate != avctx->bit_rate) {
         s->mode         = get_wb_bitrate_mode(avctx->bit_rate, avctx);
@@ -150,5 +148,5 @@ AVCodec ff_libvo_amrwbenc_encoder = {
     .close          = amr_wb_encode_close,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &amrwb_class,
 };
diff --git a/libavcodec/libvorbisdec.c b/libavcodec/libvorbisdec.c
new file mode 100644
index 0000000000..ecf690a553
--- /dev/null
+++ b/libavcodec/libvorbisdec.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2002 Mark Hills <mark@pogo.org.uk>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <vorbis/vorbisenc.h>
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+
+typedef struct OggVorbisDecContext {
+    vorbis_info vi;                     /**< vorbis_info used during init   */
+    vorbis_dsp_state vd;                /**< DSP state used for analysis    */
+    vorbis_block vb;                    /**< vorbis_block used for analysis */
+    vorbis_comment vc;                  /**< VorbisComment info             */
+    ogg_packet op;                      /**< ogg packet                     */
+} OggVorbisDecContext;
+
+static int oggvorbis_decode_close(AVCodecContext *avccontext);
+
+static int oggvorbis_decode_init(AVCodecContext *avccontext) {
+    OggVorbisDecContext *context = avccontext->priv_data ;
+    uint8_t *p= avccontext->extradata;
+    int i, hsizes[3], ret;
+    unsigned char *headers[3], *extradata = avccontext->extradata;
+
+    if(! avccontext->extradata_size || ! p) {
+        av_log(avccontext, AV_LOG_ERROR, "vorbis extradata absent\n");
+        return AVERROR(EINVAL);
+    }
+
+    vorbis_info_init(&context->vi) ;
+    vorbis_comment_init(&context->vc) ;
+
+    if(p[0] == 0 && p[1] == 30) {
+        for(i = 0; i < 3; i++){
+            hsizes[i] = bytestream_get_be16((const uint8_t **)&p);
+            headers[i] = p;
+            p += hsizes[i];
+        }
+    } else if(*p == 2) {
+        unsigned int offset = 1;
+        p++;
+        for(i=0; i<2; i++) {
+            hsizes[i] = 0;
+            while((*p == 0xFF) && (offset < avccontext->extradata_size)) {
+                hsizes[i] += 0xFF;
+                offset++;
+                p++;
+            }
+            if(offset >= avccontext->extradata_size - 1) {
+                av_log(avccontext, AV_LOG_ERROR,
+                       "vorbis header sizes damaged\n");
+                ret = AVERROR_INVALIDDATA;
+                goto error;
+            }
+            hsizes[i] += *p;
+            offset++;
+            p++;
+        }
+        hsizes[2] = avccontext->extradata_size - hsizes[0]-hsizes[1]-offset;
+#if 0
+        av_log(avccontext, AV_LOG_DEBUG,
+               "vorbis header sizes: %d, %d, %d, / extradata_len is %d \n",
+               hsizes[0], hsizes[1], hsizes[2], avccontext->extradata_size);
+#endif
+        headers[0] = extradata + offset;
+        headers[1] = extradata + offset + hsizes[0];
+        headers[2] = extradata + offset + hsizes[0] + hsizes[1];
+    } else {
+        av_log(avccontext, AV_LOG_ERROR,
+               "vorbis initial header len is wrong: %d\n", *p);
+        ret = AVERROR_INVALIDDATA;
+        goto error;
+    }
+
+    for(i=0; i<3; i++){
+        context->op.b_o_s= i==0;
+        context->op.bytes = hsizes[i];
+        context->op.packet = headers[i];
+        if(vorbis_synthesis_headerin(&context->vi, &context->vc, &context->op)<0){
+            av_log(avccontext, AV_LOG_ERROR, "%d. vorbis header damaged\n", i+1);
+            ret = AVERROR_INVALIDDATA;
+            goto error;
+        }
+    }
+
+    avccontext->channels = context->vi.channels;
+    avccontext->sample_rate = context->vi.rate;
+    avccontext->sample_fmt = AV_SAMPLE_FMT_S16;
+    avccontext->time_base= (AVRational){1, avccontext->sample_rate};
+
+    vorbis_synthesis_init(&context->vd, &context->vi);
+    vorbis_block_init(&context->vd, &context->vb);
+
+    return 0 ;
+
+  error:
+    oggvorbis_decode_close(avccontext);
+    return ret;
+}
+
+
+static inline int conv(int samples, float **pcm, char *buf, int channels) {
+    int i, j;
+    ogg_int16_t *ptr, *data = (ogg_int16_t*)buf ;
+    float *mono ;
+
+    for(i = 0 ; i < channels ; i++){
+        ptr = &data[i];
+        mono = pcm[i] ;
+
+        for(j = 0 ; j < samples ; j++) {
+            *ptr = av_clip_int16(mono[j] * 32767.f);
+            ptr += channels;
+        }
+    }
+
+    return 0 ;
+}
+
+static int oggvorbis_decode_frame(AVCodecContext *avccontext, void *data,
+                        int *got_frame_ptr, AVPacket *avpkt)
+{
+    OggVorbisDecContext *context = avccontext->priv_data ;
+    AVFrame *frame = data;
+    float **pcm ;
+    ogg_packet *op= &context->op;
+    int samples, total_samples, total_bytes;
+    int ret;
+    int16_t *output;
+
+    if(!avpkt->size){
+    //FIXME flush
+        return 0;
+    }
+
+    frame->nb_samples = 8192*4;
+    if ((ret = ff_get_buffer(avccontext, frame, 0)) < 0)
+        return ret;
+    output = (int16_t *)frame->data[0];
+
+
+    op->packet = avpkt->data;
+    op->bytes  = avpkt->size;
+
+//    av_log(avccontext, AV_LOG_DEBUG, "%d %d %d %"PRId64" %"PRId64" %d %d\n", op->bytes, op->b_o_s, op->e_o_s, op->granulepos, op->packetno, buf_size, context->vi.rate);
+
+/*    for(i=0; i<op->bytes; i++)
+      av_log(avccontext, AV_LOG_DEBUG, "%02X ", op->packet[i]);
+    av_log(avccontext, AV_LOG_DEBUG, "\n");*/
+
+    if(vorbis_synthesis(&context->vb, op) == 0)
+        vorbis_synthesis_blockin(&context->vd, &context->vb) ;
+
+    total_samples = 0 ;
+    total_bytes = 0 ;
+
+    while((samples = vorbis_synthesis_pcmout(&context->vd, &pcm)) > 0) {
+        conv(samples, pcm, (char*)output + total_bytes, context->vi.channels) ;
+        total_bytes += samples * 2 * context->vi.channels ;
+        total_samples += samples ;
+        vorbis_synthesis_read(&context->vd, samples) ;
+    }
+
+    frame->nb_samples = total_samples;
+    *got_frame_ptr   = total_samples > 0;
+    return avpkt->size;
+}
+
+
+static int oggvorbis_decode_close(AVCodecContext *avccontext) {
+    OggVorbisDecContext *context = avccontext->priv_data ;
+
+    vorbis_block_clear(&context->vb);
+    vorbis_dsp_clear(&context->vd);
+    vorbis_info_clear(&context->vi) ;
+    vorbis_comment_clear(&context->vc) ;
+
+    return 0 ;
+}
+
+
+AVCodec ff_libvorbis_decoder = {
+    .name           = "libvorbis",
+    .long_name      = NULL_IF_CONFIG_SMALL("libvorbis"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_VORBIS,
+    .priv_data_size = sizeof(OggVorbisDecContext),
+    .init           = oggvorbis_decode_init,
+    .decode         = oggvorbis_decode_frame,
+    .close          = oggvorbis_decode_close,
+    .capabilities   = AV_CODEC_CAP_DELAY,
+};
diff --git a/libavcodec/libvorbis.c b/libavcodec/libvorbisenc.c
index 86c1ed6a75..3ca5b55e8e 100644
--- a/libavcodec/libvorbis.c
+++ b/libavcodec/libvorbisenc.c
@@ -1,42 +1,34 @@
 /*
- * copyright (c) 2002 Mark Hills <mark@pogo.org.uk>
+ * Copyright (c) 2002 Mark Hills <mark@pogo.org.uk>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-/**
- * @file
- * Vorbis encoding support via libvorbisenc.
- * @author Mark Hills <mark@pogo.org.uk>
- */
-
 #include <vorbis/vorbisenc.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/fifo.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "audio_frame_queue.h"
-#include "bytestream.h"
 #include "internal.h"
 #include "vorbis.h"
 #include "vorbis_parser.h"
 
-#undef NDEBUG
-#include <assert.h>
 
 /* Number of samples the user should send in each call.
  * This value is used because it is the LCD of all possible frame sizes, so
@@ -47,7 +39,7 @@
 
 #define BUFFER_SIZE (1024 * 64)
 
-typedef struct LibvorbisContext {
+typedef struct LibvorbisEncContext {
     AVClass *av_class;                  /**< class for AVOptions            */
     vorbis_info vi;                     /**< vorbis_info used during init   */
     vorbis_dsp_state vd;                /**< DSP state used for analysis    */
@@ -56,14 +48,13 @@ typedef struct LibvorbisContext {
     int eof;                            /**< end-of-file flag               */
     int dsp_initialized;                /**< vd has been initialized        */
     vorbis_comment vc;                  /**< VorbisComment info             */
-    ogg_packet op;                      /**< ogg packet                     */
     double iblock;                      /**< impulse block bias option      */
     AVVorbisParseContext *vp;           /**< parse context to get durations */
     AudioFrameQueue afq;                /**< frame queue for timestamps     */
-} LibvorbisContext;
+} LibvorbisEncContext;
 
 static const AVOption options[] = {
-    { "iblock", "Sets the impulse block bias", offsetof(LibvorbisContext, iblock), AV_OPT_TYPE_DOUBLE, { .dbl = 0 }, -15, 0, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
+    { "iblock", "Sets the impulse block bias", offsetof(LibvorbisEncContext, iblock), AV_OPT_TYPE_DOUBLE, { .dbl = 0 }, -15, 0, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
     { NULL }
 };
 
@@ -72,8 +63,12 @@ static const AVCodecDefault defaults[] = {
     { NULL },
 };
 
-static const AVClass class = { "libvorbis", av_default_item_name, options, LIBAVUTIL_VERSION_INT };
-
+static const AVClass vorbis_class = {
+    .class_name = "libvorbis",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 static int vorbis_error_to_averror(int ov_err)
 {
@@ -87,7 +82,7 @@ static int vorbis_error_to_averror(int ov_err)
 
 static av_cold int libvorbis_setup(vorbis_info *vi, AVCodecContext *avctx)
 {
-    LibvorbisContext *s = avctx->priv_data;
+    LibvorbisEncContext *s = avctx->priv_data;
     double cfreq;
     int ret;
 
@@ -117,14 +112,14 @@ static av_cold int libvorbis_setup(vorbis_info *vi, AVCodecContext *avctx)
         /* variable bitrate by estimate, disable slow rate management */
         if (minrate == -1 && maxrate == -1)
             if ((ret = vorbis_encode_ctl(vi, OV_ECTL_RATEMANAGE2_SET, NULL)))
-                goto error;
+                goto error; /* should not happen */
     }
 
     /* cutoff frequency */
     if (avctx->cutoff > 0) {
         cfreq = avctx->cutoff / 1000.0;
         if ((ret = vorbis_encode_ctl(vi, OV_ECTL_LOWPASS_SET, &cfreq)))
-            goto error;
+            goto error; /* should not happen */
     }
 
     /* impulse block bias */
@@ -133,6 +128,35 @@ static av_cold int libvorbis_setup(vorbis_info *vi, AVCodecContext *avctx)
             goto error;
     }
 
+    if (avctx->channels == 3 &&
+            avctx->channel_layout != (AV_CH_LAYOUT_STEREO|AV_CH_FRONT_CENTER) ||
+        avctx->channels == 4 &&
+            avctx->channel_layout != AV_CH_LAYOUT_2_2 &&
+            avctx->channel_layout != AV_CH_LAYOUT_QUAD ||
+        avctx->channels == 5 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT0 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT0_BACK ||
+        avctx->channels == 6 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT1 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT1_BACK ||
+        avctx->channels == 7 &&
+            avctx->channel_layout != (AV_CH_LAYOUT_5POINT1|AV_CH_BACK_CENTER) ||
+        avctx->channels == 8 &&
+            avctx->channel_layout != AV_CH_LAYOUT_7POINT1) {
+        if (avctx->channel_layout) {
+            char name[32];
+            av_get_channel_layout_string(name, sizeof(name), avctx->channels,
+                                         avctx->channel_layout);
+            av_log(avctx, AV_LOG_ERROR, "%s not supported by Vorbis: "
+                                             "output stream will have incorrect "
+                                             "channel layout.\n", name);
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "No channel layout specified. The encoder "
+                                               "will use Vorbis channel layout for "
+                                               "%d channels.\n", avctx->channels);
+        }
+    }
+
     if ((ret = vorbis_encode_setup_init(vi)))
         goto error;
 
@@ -149,7 +173,7 @@ static int xiph_len(int l)
 
 static av_cold int libvorbis_encode_close(AVCodecContext *avctx)
 {
-    LibvorbisContext *s = avctx->priv_data;
+    LibvorbisEncContext *s = avctx->priv_data;
 
     /* notify vorbisenc this is EOF */
     if (s->dsp_initialized)
@@ -159,7 +183,7 @@ static av_cold int libvorbis_encode_close(AVCodecContext *avctx)
     vorbis_dsp_clear(&s->vd);
     vorbis_info_clear(&s->vi);
 
-    av_fifo_free(s->pkt_fifo);
+    av_fifo_freep(&s->pkt_fifo);
     ff_af_queue_close(&s->afq);
     av_freep(&avctx->extradata);
 
@@ -170,7 +194,7 @@ static av_cold int libvorbis_encode_close(AVCodecContext *avctx)
 
 static av_cold int libvorbis_encode_init(AVCodecContext *avctx)
 {
-    LibvorbisContext *s = avctx->priv_data;
+    LibvorbisEncContext *s = avctx->priv_data;
     ogg_packet header, header_comm, header_code;
     uint8_t *p;
     unsigned int offset;
@@ -194,7 +218,8 @@ static av_cold int libvorbis_encode_init(AVCodecContext *avctx)
     }
 
     vorbis_comment_init(&s->vc);
-    vorbis_comment_add_tag(&s->vc, "encoder", LIBAVCODEC_IDENT);
+    if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT))
+        vorbis_comment_add_tag(&s->vc, "encoder", LIBAVCODEC_IDENT);
 
     if ((ret = vorbis_analysis_headerout(&s->vd, &s->vc, &header, &header_comm,
                                          &header_code))) {
@@ -221,7 +246,7 @@ static av_cold int libvorbis_encode_init(AVCodecContext *avctx)
     offset += header_comm.bytes;
     memcpy(&p[offset], header_code.packet, header_code.bytes);
     offset += header_code.bytes;
-    assert(offset == avctx->extradata_size);
+    av_assert0(offset == avctx->extradata_size);
 
     s->vp = av_vorbis_parse_init(avctx->extradata, avctx->extradata_size);
     if (!s->vp) {
@@ -249,7 +274,7 @@ error:
 static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                                   const AVFrame *frame, int *got_packet_ptr)
 {
-    LibvorbisContext *s = avctx->priv_data;
+    LibvorbisEncContext *s = avctx->priv_data;
     ogg_packet op;
     int ret, duration;
 
@@ -273,7 +298,7 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
             return ret;
     } else {
-        if (!s->eof)
+        if (!s->eof && s->afq.frame_alloc)
             if ((ret = vorbis_analysis_wrote(&s->vd, 0)) < 0) {
                 av_log(avctx, AV_LOG_ERROR, "error in vorbis_analysis_wrote()\n");
                 return vorbis_error_to_averror(ret);
@@ -291,7 +316,7 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         /* add any available packets to the output packet buffer */
         while ((ret = vorbis_bitrate_flushpacket(&s->vd, &op)) == 1) {
             if (av_fifo_space(s->pkt_fifo) < sizeof(ogg_packet) + op.bytes) {
-                av_log(avctx, AV_LOG_ERROR, "packet buffer is too small");
+                av_log(avctx, AV_LOG_ERROR, "packet buffer is too small\n");
                 return AVERROR_BUG;
             }
             av_fifo_generic_write(s->pkt_fifo, &op, sizeof(ogg_packet), NULL);
@@ -313,10 +338,8 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
     av_fifo_generic_read(s->pkt_fifo, &op, sizeof(ogg_packet), NULL);
 
-    if ((ret = ff_alloc_packet(avpkt, op.bytes))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, op.bytes, 0)) < 0)
         return ret;
-    }
     av_fifo_generic_read(s->pkt_fifo, avpkt->data, op.bytes, NULL);
 
     avpkt->pts = ff_samples_to_time_base(avctx, op.granulepos);
@@ -325,9 +348,12 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     if (duration > 0) {
         /* we do not know encoder delay until we get the first packet from
          * libvorbis, so we have to update the AudioFrameQueue counts */
-        if (!avctx->initial_padding) {
+        if (!avctx->initial_padding && s->afq.frames) {
             avctx->initial_padding    = duration;
-            s->afq.remaining_delay   += duration;
+            av_assert0(!s->afq.remaining_delay);
+            s->afq.frames->duration  += duration;
+            if (s->afq.frames->pts != AV_NOPTS_VALUE)
+                s->afq.frames->pts       -= duration;
             s->afq.remaining_samples += duration;
         }
         ff_af_queue_remove(&s->afq, duration, &avpkt->pts, &avpkt->duration);
@@ -339,16 +365,16 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
 AVCodec ff_libvorbis_encoder = {
     .name           = "libvorbis",
-    .long_name      = NULL_IF_CONFIG_SMALL("libvorbis Vorbis"),
+    .long_name      = NULL_IF_CONFIG_SMALL("libvorbis"),
     .type           = AVMEDIA_TYPE_AUDIO,
     .id             = AV_CODEC_ID_VORBIS,
-    .priv_data_size = sizeof(LibvorbisContext),
+    .priv_data_size = sizeof(LibvorbisEncContext),
     .init           = libvorbis_encode_init,
     .encode2        = libvorbis_encode_frame,
     .close          = libvorbis_encode_close,
-    .capabilities   = AV_CODEC_CAP_DELAY,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &vorbis_class,
     .defaults       = defaults,
 };
diff --git a/libavcodec/libvpx.c b/libavcodec/libvpx.c
index 49f966deba..a60d186f1f 100644
--- a/libavcodec/libvpx.c
+++ b/libavcodec/libvpx.c
@@ -1,27 +1,83 @@
 /*
  * Copyright (c) 2013 Guillaume Martres <smarter@ubuntu.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <vpx/vpx_codec.h>
-
 #include "libvpx.h"
+#include "config.h"
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+#include <vpx/vpx_encoder.h>
+#include <vpx/vp8cx.h>
+#endif
+
+static const enum AVPixelFormat vp9_pix_fmts_def[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE
+};
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+static const enum AVPixelFormat vp9_pix_fmts_highcol[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV422P,
+    AV_PIX_FMT_YUV440P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_NONE
+};
+
+static const enum AVPixelFormat vp9_pix_fmts_highbd[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV422P,
+    AV_PIX_FMT_YUV440P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_YUV420P10LE,
+    AV_PIX_FMT_YUV422P10LE,
+    AV_PIX_FMT_YUV440P10LE,
+    AV_PIX_FMT_YUV444P10LE,
+    AV_PIX_FMT_YUV420P12LE,
+    AV_PIX_FMT_YUV422P12LE,
+    AV_PIX_FMT_YUV440P12LE,
+    AV_PIX_FMT_YUV444P12LE,
+    AV_PIX_FMT_NONE
+};
+#endif
 
+av_cold void ff_vp9_init_static(AVCodec *codec)
+{
+    if (    vpx_codec_version_major() < 1
+        || (vpx_codec_version_major() == 1 && vpx_codec_version_minor() < 3))
+        codec->capabilities |= AV_CODEC_CAP_EXPERIMENTAL;
+    codec->pix_fmts = vp9_pix_fmts_def;
+#if CONFIG_LIBVPX_VP9_ENCODER
+    if (    vpx_codec_version_major() > 1
+        || (vpx_codec_version_major() == 1 && vpx_codec_version_minor() >= 4)) {
+#ifdef VPX_CODEC_CAP_HIGHBITDEPTH
+        vpx_codec_caps_t codec_caps = vpx_codec_get_caps(vpx_codec_vp9_cx());
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH)
+            codec->pix_fmts = vp9_pix_fmts_highbd;
+        else
+#endif
+            codec->pix_fmts = vp9_pix_fmts_highcol;
+    }
+#endif
+}
+#if 0
 enum AVPixelFormat ff_vpx_imgfmt_to_pixfmt(vpx_img_fmt_t img)
 {
     switch (img) {
@@ -77,3 +133,4 @@ vpx_img_fmt_t ff_vpx_pixfmt_to_imgfmt(enum AVPixelFormat pix)
     default:                      return VPX_IMG_FMT_NONE;
     }
 }
+#endif
diff --git a/libavcodec/libvpx.h b/libavcodec/libvpx.h
index b437f37c81..22b697fa01 100644
--- a/libavcodec/libvpx.h
+++ b/libavcodec/libvpx.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013 Guillaume Martres <smarter@ubuntu.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,10 @@
 
 #include "avcodec.h"
 
+void ff_vp9_init_static(AVCodec *codec);
+#if 0
 enum AVPixelFormat ff_vpx_imgfmt_to_pixfmt(vpx_img_fmt_t img);
 vpx_img_fmt_t ff_vpx_pixfmt_to_imgfmt(enum AVPixelFormat pix);
+#endif
 
 #endif /* AVCODEC_LIBVPX_H */
diff --git a/libavcodec/libvpxdec.c b/libavcodec/libvpxdec.c
index 28b7733969..7267590e01 100644
--- a/libavcodec/libvpxdec.c
+++ b/libavcodec/libvpxdec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010, Google, Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,6 +59,108 @@ static av_cold int vpx_init(AVCodecContext *avctx,
     return 0;
 }
 
+// returns 0 on success, AVERROR_INVALIDDATA otherwise
+static int set_pix_fmt(AVCodecContext *avctx, struct vpx_image *img)
+{
+#if VPX_IMAGE_ABI_VERSION >= 3
+    static const enum AVColorSpace colorspaces[8] = {
+        AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
+        AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
+    };
+    avctx->colorspace = colorspaces[img->cs];
+#endif
+    if (avctx->codec_id == AV_CODEC_ID_VP8 && img->fmt != VPX_IMG_FMT_I420)
+        return AVERROR_INVALIDDATA;
+    switch (img->fmt) {
+    case VPX_IMG_FMT_I420:
+        if (avctx->codec_id == AV_CODEC_ID_VP9)
+            avctx->profile = FF_PROFILE_VP9_0;
+        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        return 0;
+#if CONFIG_LIBVPX_VP9_DECODER
+    case VPX_IMG_FMT_I422:
+        avctx->profile = FF_PROFILE_VP9_1;
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        return 0;
+#if VPX_IMAGE_ABI_VERSION >= 3
+    case VPX_IMG_FMT_I440:
+        avctx->profile = FF_PROFILE_VP9_1;
+        avctx->pix_fmt = AV_PIX_FMT_YUV440P;
+        return 0;
+#endif
+    case VPX_IMG_FMT_I444:
+        avctx->profile = FF_PROFILE_VP9_1;
+#if VPX_IMAGE_ABI_VERSION >= 3
+        avctx->pix_fmt = avctx->colorspace == AVCOL_SPC_RGB ?
+                         AV_PIX_FMT_GBRP : AV_PIX_FMT_YUV444P;
+#else
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+#endif
+        return 0;
+#ifdef VPX_IMG_FMT_HIGHBITDEPTH
+    case VPX_IMG_FMT_I42016:
+        avctx->profile = FF_PROFILE_VP9_2;
+        if (img->bit_depth == 10) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P10LE;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P12LE;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+    case VPX_IMG_FMT_I42216:
+        avctx->profile = FF_PROFILE_VP9_3;
+        if (img->bit_depth == 10) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P10LE;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P12LE;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+#if VPX_IMAGE_ABI_VERSION >= 3
+    case VPX_IMG_FMT_I44016:
+        avctx->profile = FF_PROFILE_VP9_3;
+        if (img->bit_depth == 10) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV440P10LE;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV440P12LE;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+#endif
+    case VPX_IMG_FMT_I44416:
+        avctx->profile = FF_PROFILE_VP9_3;
+        if (img->bit_depth == 10) {
+#if VPX_IMAGE_ABI_VERSION >= 3
+            avctx->pix_fmt = avctx->colorspace == AVCOL_SPC_RGB ?
+                             AV_PIX_FMT_GBRP10LE : AV_PIX_FMT_YUV444P10LE;
+#else
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P10LE;
+#endif
+            return 0;
+        } else if (img->bit_depth == 12) {
+#if VPX_IMAGE_ABI_VERSION >= 3
+            avctx->pix_fmt = avctx->colorspace == AVCOL_SPC_RGB ?
+                             AV_PIX_FMT_GBRP12LE : AV_PIX_FMT_YUV444P12LE;
+#else
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P12LE;
+#endif
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+#endif
+#endif
+    default:
+        return AVERROR_INVALIDDATA;
+    }
+}
+
 static int vp8_decode(AVCodecContext *avctx,
                       void *data, int *got_frame, AVPacket *avpkt)
 {
@@ -81,11 +183,15 @@ static int vp8_decode(AVCodecContext *avctx,
     }
 
     if ((img = vpx_codec_get_frame(&ctx->decoder, &iter))) {
-        avctx->pix_fmt = ff_vpx_imgfmt_to_pixfmt(img->fmt);
-        if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
-            av_log(avctx, AV_LOG_ERROR, "Unsupported output colorspace (%d)\n",
-                   img->fmt);
-            return AVERROR_INVALIDDATA;
+        if ((ret = set_pix_fmt(avctx, img)) < 0) {
+#ifdef VPX_IMG_FMT_HIGHBITDEPTH
+            av_log(avctx, AV_LOG_ERROR, "Unsupported output colorspace (%d) / bit_depth (%d)\n",
+                   img->fmt, img->bit_depth);
+#else
+            av_log(avctx, AV_LOG_ERROR, "Unsupported output colorspace (%d) / bit_depth (%d)\n",
+                   img->fmt, 8);
+#endif
+            return ret;
         }
 
         if ((int) img->d_w != avctx->width || (int) img->d_h != avctx->height) {
@@ -97,7 +203,7 @@ static int vp8_decode(AVCodecContext *avctx,
         }
         if ((ret = ff_get_buffer(avctx, picture, 0)) < 0)
             return ret;
-        av_image_copy(picture->data, picture->linesize, img->planes,
+        av_image_copy(picture->data, picture->linesize, (const uint8_t **)img->planes,
                       img->stride, avctx->pix_fmt, img->d_w, img->d_h);
         *got_frame           = 1;
     }
@@ -136,6 +242,14 @@ static av_cold int vp9_init(AVCodecContext *avctx)
     return vpx_init(avctx, &vpx_codec_vp9_dx_algo);
 }
 
+static const AVProfile profiles[] = {
+    { FF_PROFILE_VP9_0, "Profile 0" },
+    { FF_PROFILE_VP9_1, "Profile 1" },
+    { FF_PROFILE_VP9_2, "Profile 2" },
+    { FF_PROFILE_VP9_3, "Profile 3" },
+    { FF_PROFILE_UNKNOWN },
+};
+
 AVCodec ff_libvpx_vp9_decoder = {
     .name           = "libvpx-vp9",
     .long_name      = NULL_IF_CONFIG_SMALL("libvpx VP9"),
@@ -145,6 +259,8 @@ AVCodec ff_libvpx_vp9_decoder = {
     .init           = vp9_init,
     .close          = vp8_free,
     .decode         = vp8_decode,
-    .capabilities   = AV_CODEC_CAP_AUTO_THREADS,
+    .capabilities   = AV_CODEC_CAP_AUTO_THREADS | AV_CODEC_CAP_DR1,
+    .init_static_data = ff_vp9_init_static,
+    .profiles       = NULL_IF_CONFIG_SMALL(profiles),
 };
 #endif /* CONFIG_LIBVPX_VP9_DECODER */
diff --git a/libavcodec/libvpxenc.c b/libavcodec/libvpxenc.c
index 563117fe4c..5f39783087 100644
--- a/libavcodec/libvpxenc.c
+++ b/libavcodec/libvpxenc.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010, Google, Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,9 +30,11 @@
 
 #include "avcodec.h"
 #include "internal.h"
+#include "libavutil/avassert.h"
 #include "libvpx.h"
 #include "libavutil/base64.h"
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/opt.h"
 
@@ -43,11 +45,16 @@
 struct FrameListData {
     void *buf;                       /**< compressed data buffer */
     size_t sz;                       /**< length of compressed data */
+    void *buf_alpha;
+    size_t sz_alpha;
     int64_t pts;                     /**< time stamp to show frame
                                           (in timebase units) */
     unsigned long duration;          /**< duration to show frame
                                           (in timebase units) */
     uint32_t flags;                  /**< flags for this frame */
+    uint64_t sse[4];
+    int have_sse;                    /**< true if we have pending sse[] */
+    uint64_t frame_number;
     struct FrameListData *next;
 };
 
@@ -55,18 +62,44 @@ typedef struct VP8EncoderContext {
     AVClass *class;
     struct vpx_codec_ctx encoder;
     struct vpx_image rawimg;
+    struct vpx_codec_ctx encoder_alpha;
+    struct vpx_image rawimg_alpha;
+    uint8_t is_alpha;
     struct vpx_fixed_buf twopass_stats;
-    unsigned long deadline; //i.e., RT/GOOD/BEST
+    int deadline; //i.e., RT/GOOD/BEST
+    uint64_t sse[4];
+    int have_sse; /**< true if we have pending sse[] */
+    uint64_t frame_number;
     struct FrameListData *coded_frame_list;
+
     int cpu_used;
+    /**
+     * VP8 specific flags, see VP8F_* below.
+     */
+    int flags;
+#define VP8F_ERROR_RESILIENT 0x00000001 ///< Enable measures appropriate for streaming over lossy links
+#define VP8F_AUTO_ALT_REF    0x00000002 ///< Enable automatic alternate reference frame generation
+
     int auto_alt_ref;
+
     int arnr_max_frames;
     int arnr_strength;
     int arnr_type;
+
     int lag_in_frames;
     int error_resilient;
     int crf;
     int static_thresh;
+    int max_intra_rate;
+    int rc_undershoot_pct;
+    int rc_overshoot_pct;
+
+    // VP9-only
+    int lossless;
+    int tile_columns;
+    int tile_rows;
+    int frame_parallel;
+    int aq_mode;
 } VP8Context;
 
 /** String mappings for enum vp8e_enc_control_id */
@@ -88,6 +121,17 @@ static const char *const ctlidstr[] = {
     [VP8E_SET_ARNR_STRENGTH]     = "VP8E_SET_ARNR_STRENGTH",
     [VP8E_SET_ARNR_TYPE]         = "VP8E_SET_ARNR_TYPE",
     [VP8E_SET_CQ_LEVEL]          = "VP8E_SET_CQ_LEVEL",
+    [VP8E_SET_MAX_INTRA_BITRATE_PCT] = "VP8E_SET_MAX_INTRA_BITRATE_PCT",
+#if CONFIG_LIBVPX_VP9_ENCODER
+    [VP9E_SET_LOSSLESS]                = "VP9E_SET_LOSSLESS",
+    [VP9E_SET_TILE_COLUMNS]            = "VP9E_SET_TILE_COLUMNS",
+    [VP9E_SET_TILE_ROWS]               = "VP9E_SET_TILE_ROWS",
+    [VP9E_SET_FRAME_PARALLEL_DECODING] = "VP9E_SET_FRAME_PARALLEL_DECODING",
+    [VP9E_SET_AQ_MODE]                 = "VP9E_SET_AQ_MODE",
+#if VPX_ENCODER_ABI_VERSION > 8
+    [VP9E_SET_COLOR_SPACE]             = "VP9E_SET_COLOR_SPACE",
+#endif
+#endif
 };
 
 static av_cold void log_encoder_error(AVCodecContext *avctx, const char *desc)
@@ -110,19 +154,26 @@ static av_cold void dump_enc_cfg(AVCodecContext *avctx,
     av_log(avctx, level, "vpx_codec_enc_cfg\n");
     av_log(avctx, level, "generic settings\n"
            "  %*s%u\n  %*s%u\n  %*s%u\n  %*s%u\n  %*s%u\n"
+#if CONFIG_LIBVPX_VP9_ENCODER && defined(VPX_IMG_FMT_HIGHBITDEPTH)
+           "  %*s%u\n  %*s%u\n"
+#endif
            "  %*s{%u/%u}\n  %*s%u\n  %*s%d\n  %*s%u\n",
            width, "g_usage:",           cfg->g_usage,
            width, "g_threads:",         cfg->g_threads,
            width, "g_profile:",         cfg->g_profile,
            width, "g_w:",               cfg->g_w,
            width, "g_h:",               cfg->g_h,
+#if CONFIG_LIBVPX_VP9_ENCODER && defined(VPX_IMG_FMT_HIGHBITDEPTH)
+           width, "g_bit_depth:",       cfg->g_bit_depth,
+           width, "g_input_bit_depth:", cfg->g_input_bit_depth,
+#endif
            width, "g_timebase:",        cfg->g_timebase.num, cfg->g_timebase.den,
            width, "g_error_resilient:", cfg->g_error_resilient,
            width, "g_pass:",            cfg->g_pass,
            width, "g_lag_in_frames:",   cfg->g_lag_in_frames);
     av_log(avctx, level, "rate control settings\n"
            "  %*s%u\n  %*s%u\n  %*s%u\n  %*s%u\n"
-           "  %*s%d\n  %*s%p(%zu)\n  %*s%u\n",
+           "  %*s%d\n  %*s%p(%"SIZE_SPECIFIER")\n  %*s%u\n",
            width, "rc_dropframe_thresh:",   cfg->rc_dropframe_thresh,
            width, "rc_resize_allowed:",     cfg->rc_resize_allowed,
            width, "rc_resize_up_thresh:",   cfg->rc_resize_up_thresh,
@@ -169,6 +220,8 @@ static void coded_frame_add(void *list, struct FrameListData *cx_frame)
 static av_cold void free_coded_frame(struct FrameListData *cx_frame)
 {
     av_freep(&cx_frame->buf);
+    if (cx_frame->buf_alpha)
+        av_freep(&cx_frame->buf_alpha);
     av_freep(&cx_frame);
 }
 
@@ -209,27 +262,158 @@ static av_cold int vp8_free(AVCodecContext *avctx)
     VP8Context *ctx = avctx->priv_data;
 
     vpx_codec_destroy(&ctx->encoder);
+    if (ctx->is_alpha)
+        vpx_codec_destroy(&ctx->encoder_alpha);
     av_freep(&ctx->twopass_stats.buf);
     av_freep(&avctx->stats_out);
     free_frame_list(ctx->coded_frame_list);
     return 0;
 }
 
+#if CONFIG_LIBVPX_VP9_ENCODER
+static int set_pix_fmt(AVCodecContext *avctx, vpx_codec_caps_t codec_caps,
+                       struct vpx_codec_enc_cfg *enccfg, vpx_codec_flags_t *flags,
+                       vpx_img_fmt_t *img_fmt)
+{
+#ifdef VPX_IMG_FMT_HIGHBITDEPTH
+    enccfg->g_bit_depth = enccfg->g_input_bit_depth = 8;
+#endif
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUV420P:
+        enccfg->g_profile = 0;
+        *img_fmt = VPX_IMG_FMT_I420;
+        return 0;
+    case AV_PIX_FMT_YUV422P:
+        enccfg->g_profile = 1;
+        *img_fmt = VPX_IMG_FMT_I422;
+        return 0;
+#if VPX_IMAGE_ABI_VERSION >= 3
+    case AV_PIX_FMT_YUV440P:
+        enccfg->g_profile = 1;
+        *img_fmt = VPX_IMG_FMT_I440;
+        return 0;
+#endif
+    case AV_PIX_FMT_YUV444P:
+        enccfg->g_profile = 1;
+        *img_fmt = VPX_IMG_FMT_I444;
+        return 0;
+#ifdef VPX_IMG_FMT_HIGHBITDEPTH
+    case AV_PIX_FMT_YUV420P10LE:
+    case AV_PIX_FMT_YUV420P12LE:
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV420P10LE ? 10 : 12;
+            enccfg->g_profile = 2;
+            *img_fmt = VPX_IMG_FMT_I42016;
+            *flags |= VPX_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+    case AV_PIX_FMT_YUV422P10LE:
+    case AV_PIX_FMT_YUV422P12LE:
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV422P10LE ? 10 : 12;
+            enccfg->g_profile = 3;
+            *img_fmt = VPX_IMG_FMT_I42216;
+            *flags |= VPX_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+#if VPX_IMAGE_ABI_VERSION >= 3
+    case AV_PIX_FMT_YUV440P10LE:
+    case AV_PIX_FMT_YUV440P12LE:
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV440P10LE ? 10 : 12;
+            enccfg->g_profile = 3;
+            *img_fmt = VPX_IMG_FMT_I44016;
+            *flags |= VPX_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+#endif
+    case AV_PIX_FMT_YUV444P10LE:
+    case AV_PIX_FMT_YUV444P12LE:
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV444P10LE ? 10 : 12;
+            enccfg->g_profile = 3;
+            *img_fmt = VPX_IMG_FMT_I44416;
+            *flags |= VPX_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+#endif
+    default:
+        break;
+    }
+    av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format.\n");
+    return AVERROR_INVALIDDATA;
+}
+
+#if VPX_ENCODER_ABI_VERSION > 8
+static void set_colorspace(AVCodecContext *avctx)
+{
+    enum vpx_color_space vpx_cs;
+
+    switch (avctx->colorspace) {
+    case AVCOL_SPC_RGB:         vpx_cs = VPX_CS_SRGB;      break;
+    case AVCOL_SPC_BT709:       vpx_cs = VPX_CS_BT_709;    break;
+    case AVCOL_SPC_UNSPECIFIED: vpx_cs = VPX_CS_UNKNOWN;   break;
+    case AVCOL_SPC_RESERVED:    vpx_cs = VPX_CS_RESERVED;  break;
+    case AVCOL_SPC_BT470BG:     vpx_cs = VPX_CS_BT_601;    break;
+    case AVCOL_SPC_SMPTE170M:   vpx_cs = VPX_CS_SMPTE_170; break;
+    case AVCOL_SPC_SMPTE240M:   vpx_cs = VPX_CS_SMPTE_240; break;
+    case AVCOL_SPC_BT2020_NCL:  vpx_cs = VPX_CS_BT_2020;   break;
+    default:
+        av_log(avctx, AV_LOG_WARNING, "Unsupported colorspace (%d)\n",
+               avctx->colorspace);
+        return;
+    }
+    codecctl_int(avctx, VP9E_SET_COLOR_SPACE, vpx_cs);
+}
+#endif
+#endif
+
 static av_cold int vpx_init(AVCodecContext *avctx,
                             const struct vpx_codec_iface *iface)
 {
     VP8Context *ctx = avctx->priv_data;
     struct vpx_codec_enc_cfg enccfg = { 0 };
+    struct vpx_codec_enc_cfg enccfg_alpha;
+    vpx_codec_flags_t flags = (avctx->flags & AV_CODEC_FLAG_PSNR) ? VPX_CODEC_USE_PSNR : 0;
     int res;
+    vpx_img_fmt_t img_fmt = VPX_IMG_FMT_I420;
+#if CONFIG_LIBVPX_VP9_ENCODER
+    vpx_codec_caps_t codec_caps = vpx_codec_get_caps(iface);
+#endif
 
     av_log(avctx, AV_LOG_INFO, "%s\n", vpx_codec_version_str());
     av_log(avctx, AV_LOG_VERBOSE, "%s\n", vpx_codec_build_config());
 
+    if (avctx->pix_fmt == AV_PIX_FMT_YUVA420P)
+        ctx->is_alpha = 1;
+
     if ((res = vpx_codec_enc_config_default(iface, &enccfg, 0)) != VPX_CODEC_OK) {
         av_log(avctx, AV_LOG_ERROR, "Failed to get config: %s\n",
                vpx_codec_err_to_string(res));
         return AVERROR(EINVAL);
     }
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+    if (avctx->codec_id == AV_CODEC_ID_VP9) {
+        if (set_pix_fmt(avctx, codec_caps, &enccfg, &flags, &img_fmt))
+            return AVERROR(EINVAL);
+    }
+#endif
+
+    if(!avctx->bit_rate)
+        if(avctx->rc_max_rate || avctx->rc_buffer_size || avctx->rc_initial_buffer_occupancy) {
+            av_log( avctx, AV_LOG_ERROR, "Rate control parameters set without a bitrate\n");
+            return AVERROR(EINVAL);
+        }
+
     dump_enc_cfg(avctx, &enccfg);
 
     enccfg.g_w            = avctx->width;
@@ -237,9 +421,7 @@ static av_cold int vpx_init(AVCodecContext *avctx,
     enccfg.g_timebase.num = avctx->time_base.num;
     enccfg.g_timebase.den = avctx->time_base.den;
     enccfg.g_threads      = avctx->thread_count;
-
-    if (ctx->lag_in_frames >= 0)
-        enccfg.g_lag_in_frames = ctx->lag_in_frames;
+    enccfg.g_lag_in_frames= ctx->lag_in_frames;
 
     if (avctx->flags & AV_CODEC_FLAG_PASS1)
         enccfg.g_pass = VPX_RC_FIRST_PASS;
@@ -248,28 +430,64 @@ static av_cold int vpx_init(AVCodecContext *avctx,
     else
         enccfg.g_pass = VPX_RC_ONE_PASS;
 
-    if (!avctx->bit_rate)
-        avctx->bit_rate = enccfg.rc_target_bitrate * 1000;
-    else
+    if (avctx->rc_min_rate == avctx->rc_max_rate &&
+        avctx->rc_min_rate == avctx->bit_rate && avctx->bit_rate) {
+        enccfg.rc_end_usage = VPX_CBR;
+    } else if (ctx->crf >= 0) {
+        enccfg.rc_end_usage = VPX_CQ;
+#if CONFIG_LIBVPX_VP9_ENCODER
+        if (!avctx->bit_rate && avctx->codec_id == AV_CODEC_ID_VP9)
+            enccfg.rc_end_usage = VPX_Q;
+#endif
+    }
+
+    if (avctx->bit_rate) {
         enccfg.rc_target_bitrate = av_rescale_rnd(avctx->bit_rate, 1, 1000,
-                                              AV_ROUND_NEAR_INF);
+                                                  AV_ROUND_NEAR_INF);
+#if CONFIG_LIBVPX_VP9_ENCODER
+    } else if (enccfg.rc_end_usage == VPX_Q) {
+#endif
+    } else {
+        if (enccfg.rc_end_usage == VPX_CQ) {
+            enccfg.rc_target_bitrate = 1000000;
+        } else {
+            avctx->bit_rate = enccfg.rc_target_bitrate * 1000;
+            av_log(avctx, AV_LOG_WARNING,
+                   "Neither bitrate nor constrained quality specified, using default bitrate of %dkbit/sec\n",
+                   enccfg.rc_target_bitrate);
+        }
+    }
 
-    if (ctx->crf)
-        enccfg.rc_end_usage = VPX_CQ;
-    else if (avctx->rc_min_rate == avctx->rc_max_rate &&
-             avctx->rc_min_rate == avctx->bit_rate)
-        enccfg.rc_end_usage = VPX_CBR;
+    if (avctx->codec_id == AV_CODEC_ID_VP9 && ctx->lossless == 1) {
+        enccfg.rc_min_quantizer =
+        enccfg.rc_max_quantizer = 0;
+    } else {
+        if (avctx->qmin >= 0)
+            enccfg.rc_min_quantizer = avctx->qmin;
+        if (avctx->qmax >= 0)
+            enccfg.rc_max_quantizer = avctx->qmax;
+    }
+
+    if (enccfg.rc_end_usage == VPX_CQ
+#if CONFIG_LIBVPX_VP9_ENCODER
+        || enccfg.rc_end_usage == VPX_Q
+#endif
+       ) {
+        if (ctx->crf < enccfg.rc_min_quantizer || ctx->crf > enccfg.rc_max_quantizer) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "CQ level %d must be between minimum and maximum quantizer value (%d-%d)\n",
+                   ctx->crf, enccfg.rc_min_quantizer, enccfg.rc_max_quantizer);
+            return AVERROR(EINVAL);
+        }
+    }
 
-    if (avctx->qmin > 0)
-        enccfg.rc_min_quantizer = avctx->qmin;
-    if (avctx->qmax > 0)
-        enccfg.rc_max_quantizer = avctx->qmax;
     enccfg.rc_dropframe_thresh = avctx->frame_skip_threshold;
 
     //0-100 (0 => CBR, 100 => VBR)
     enccfg.rc_2pass_vbr_bias_pct           = round(avctx->qcompress * 100);
-    enccfg.rc_2pass_vbr_minsection_pct     =
-        avctx->rc_min_rate * 100LL / avctx->bit_rate;
+    if (avctx->bit_rate)
+        enccfg.rc_2pass_vbr_minsection_pct =
+            avctx->rc_min_rate * 100LL / avctx->bit_rate;
     if (avctx->rc_max_rate)
         enccfg.rc_2pass_vbr_maxsection_pct =
             avctx->rc_max_rate * 100LL / avctx->bit_rate;
@@ -281,6 +499,19 @@ static av_cold int vpx_init(AVCodecContext *avctx,
         enccfg.rc_buf_initial_sz =
             avctx->rc_initial_buffer_occupancy * 1000LL / avctx->bit_rate;
     enccfg.rc_buf_optimal_sz     = enccfg.rc_buf_sz * 5 / 6;
+#if FF_API_MPV_OPT
+    FF_DISABLE_DEPRECATION_WARNINGS
+    if (avctx->rc_buffer_aggressivity != 1.0) {
+        av_log(avctx, AV_LOG_WARNING, "The rc_buffer_aggressivity option is "
+               "deprecated, use the undershoot-pct private option instead.\n");
+        enccfg.rc_undershoot_pct = round(avctx->rc_buffer_aggressivity * 100);
+    }
+    FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+    if (ctx->rc_undershoot_pct >= 0)
+        enccfg.rc_undershoot_pct = ctx->rc_undershoot_pct;
+    if (ctx->rc_overshoot_pct >= 0)
+        enccfg.rc_overshoot_pct = ctx->rc_overshoot_pct;
 
     //_enc_init() will balk if kf_min_dist differs from max w/VPX_KF_AUTO
     if (avctx->keyint_min >= 0 && avctx->keyint_min == avctx->gop_size)
@@ -302,8 +533,9 @@ static av_cold int vpx_init(AVCodecContext *avctx,
         ret = av_reallocp(&ctx->twopass_stats.buf, ctx->twopass_stats.sz);
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR,
-                   "Stat buffer alloc (%zu bytes) failed\n",
+                   "Stat buffer alloc (%"SIZE_SPECIFIER" bytes) failed\n",
                    ctx->twopass_stats.sz);
+            ctx->twopass_stats.sz = 0;
             return ret;
         }
         decode_size = av_base64_decode(ctx->twopass_stats.buf, avctx->stats_in,
@@ -322,25 +554,31 @@ static av_cold int vpx_init(AVCodecContext *avctx,
        quality. */
     if (avctx->profile != FF_PROFILE_UNKNOWN)
         enccfg.g_profile = avctx->profile;
-    else if (avctx->pix_fmt == AV_PIX_FMT_YUV420P)
-        avctx->profile = enccfg.g_profile = FF_PROFILE_VP9_0;
-    else
-        avctx->profile = enccfg.g_profile = FF_PROFILE_VP9_1;
 
-    enccfg.g_error_resilient = ctx->error_resilient;
+    enccfg.g_error_resilient = ctx->error_resilient || ctx->flags & VP8F_ERROR_RESILIENT;
 
     dump_enc_cfg(avctx, &enccfg);
     /* Construct Encoder Context */
-    res = vpx_codec_enc_init(&ctx->encoder, iface, &enccfg, 0);
+    res = vpx_codec_enc_init(&ctx->encoder, iface, &enccfg, flags);
     if (res != VPX_CODEC_OK) {
         log_encoder_error(avctx, "Failed to initialize encoder");
         return AVERROR(EINVAL);
     }
 
+    if (ctx->is_alpha) {
+        enccfg_alpha = enccfg;
+        res = vpx_codec_enc_init(&ctx->encoder_alpha, iface, &enccfg_alpha, flags);
+        if (res != VPX_CODEC_OK) {
+            log_encoder_error(avctx, "Failed to initialize alpha encoder");
+            return AVERROR(EINVAL);
+        }
+    }
+
     //codec control failures are currently treated only as warnings
     av_log(avctx, AV_LOG_DEBUG, "vpx_codec_control\n");
-    if (ctx->cpu_used != INT_MIN)
-        codecctl_int(avctx, VP8E_SET_CPUUSED,          ctx->cpu_used);
+    codecctl_int(avctx, VP8E_SET_CPUUSED,          ctx->cpu_used);
+    if (ctx->flags & VP8F_AUTO_ALT_REF)
+        ctx->auto_alt_ref = 1;
     if (ctx->auto_alt_ref >= 0)
         codecctl_int(avctx, VP8E_SET_ENABLEAUTOALTREF, ctx->auto_alt_ref);
     if (ctx->arnr_max_frames >= 0)
@@ -350,7 +588,7 @@ static av_cold int vpx_init(AVCodecContext *avctx,
     if (ctx->arnr_type >= 0)
         codecctl_int(avctx, VP8E_SET_ARNR_TYPE,        ctx->arnr_type);
 
-    if (CONFIG_LIBVPX_VP8_ENCODER && iface == &vpx_codec_vp8_cx_algo) {
+    if (CONFIG_LIBVPX_VP8_ENCODER && avctx->codec_id == AV_CODEC_ID_VP8) {
         codecctl_int(avctx, VP8E_SET_NOISE_SENSITIVITY, avctx->noise_reduction);
         codecctl_int(avctx, VP8E_SET_TOKEN_PARTITIONS,  av_log2(avctx->slices));
     }
@@ -364,23 +602,79 @@ static av_cold int vpx_init(AVCodecContext *avctx,
     FF_ENABLE_DEPRECATION_WARNINGS
 #endif
     codecctl_int(avctx, VP8E_SET_STATIC_THRESHOLD,  ctx->static_thresh);
-    codecctl_int(avctx, VP8E_SET_CQ_LEVEL,          ctx->crf);
+    if (ctx->crf >= 0)
+        codecctl_int(avctx, VP8E_SET_CQ_LEVEL,          ctx->crf);
+    if (ctx->max_intra_rate >= 0)
+        codecctl_int(avctx, VP8E_SET_MAX_INTRA_BITRATE_PCT, ctx->max_intra_rate);
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+    if (avctx->codec_id == AV_CODEC_ID_VP9) {
+        if (ctx->lossless >= 0)
+            codecctl_int(avctx, VP9E_SET_LOSSLESS, ctx->lossless);
+        if (ctx->tile_columns >= 0)
+            codecctl_int(avctx, VP9E_SET_TILE_COLUMNS, ctx->tile_columns);
+        if (ctx->tile_rows >= 0)
+            codecctl_int(avctx, VP9E_SET_TILE_ROWS, ctx->tile_rows);
+        if (ctx->frame_parallel >= 0)
+            codecctl_int(avctx, VP9E_SET_FRAME_PARALLEL_DECODING, ctx->frame_parallel);
+        if (ctx->aq_mode >= 0)
+            codecctl_int(avctx, VP9E_SET_AQ_MODE, ctx->aq_mode);
+#if VPX_ENCODER_ABI_VERSION > 8
+        set_colorspace(avctx);
+#endif
+    }
+#endif
+
+    av_log(avctx, AV_LOG_DEBUG, "Using deadline: %d\n", ctx->deadline);
 
     //provide dummy value to initialize wrapper, values will be updated each _encode()
-    vpx_img_wrap(&ctx->rawimg, ff_vpx_pixfmt_to_imgfmt(avctx->pix_fmt),
-                 avctx->width, avctx->height, 1, (unsigned char *)1);
+    vpx_img_wrap(&ctx->rawimg, img_fmt, avctx->width, avctx->height, 1,
+                 (unsigned char*)1);
+#if CONFIG_LIBVPX_VP9_ENCODER && defined(VPX_IMG_FMT_HIGHBITDEPTH)
+    if (avctx->codec_id == AV_CODEC_ID_VP9 && (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH))
+        ctx->rawimg.bit_depth = enccfg.g_bit_depth;
+#endif
+
+    if (ctx->is_alpha)
+        vpx_img_wrap(&ctx->rawimg_alpha, VPX_IMG_FMT_I420, avctx->width, avctx->height, 1,
+                     (unsigned char*)1);
 
     return 0;
 }
 
 static inline void cx_pktcpy(struct FrameListData *dst,
-                             const struct vpx_codec_cx_pkt *src)
+                             const struct vpx_codec_cx_pkt *src,
+                             const struct vpx_codec_cx_pkt *src_alpha,
+                             VP8Context *ctx)
 {
     dst->pts      = src->data.frame.pts;
     dst->duration = src->data.frame.duration;
     dst->flags    = src->data.frame.flags;
     dst->sz       = src->data.frame.sz;
     dst->buf      = src->data.frame.buf;
+    dst->have_sse = 0;
+    /* For alt-ref frame, don't store PSNR or increment frame_number */
+    if (!(dst->flags & VPX_FRAME_IS_INVISIBLE)) {
+        dst->frame_number = ++ctx->frame_number;
+        dst->have_sse = ctx->have_sse;
+        if (ctx->have_sse) {
+            /* associate last-seen SSE to the frame. */
+            /* Transfers ownership from ctx to dst. */
+            /* WARNING! This makes the assumption that PSNR_PKT comes
+               just before the frame it refers to! */
+            memcpy(dst->sse, ctx->sse, sizeof(dst->sse));
+            ctx->have_sse = 0;
+        }
+    } else {
+        dst->frame_number = -1;   /* sanity marker */
+    }
+    if (src_alpha) {
+        dst->buf_alpha = src_alpha->data.frame.buf;
+        dst->sz_alpha = src_alpha->data.frame.sz;
+    } else {
+        dst->buf_alpha = NULL;
+        dst->sz_alpha = 0;
+    }
 }
 
 /**
@@ -393,8 +687,10 @@ static inline void cx_pktcpy(struct FrameListData *dst,
 static int storeframe(AVCodecContext *avctx, struct FrameListData *cx_frame,
                       AVPacket *pkt)
 {
-    int ret = ff_alloc_packet(pkt, cx_frame->sz);
+    int ret = ff_alloc_packet2(avctx, pkt, cx_frame->sz, 0);
+    uint8_t *side_data;
     if (ret >= 0) {
+        int pict_type;
         memcpy(pkt->data, cx_frame->buf, pkt->size);
         pkt->pts = pkt->dts = cx_frame->pts;
 #if FF_API_CODED_FRAME
@@ -405,22 +701,54 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
         if (!!(cx_frame->flags & VPX_FRAME_IS_KEY)) {
+            pict_type = AV_PICTURE_TYPE_I;
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+            avctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
             pkt->flags |= AV_PKT_FLAG_KEY;
         } else {
+            pict_type = AV_PICTURE_TYPE_P;
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+            avctx->coded_frame->pict_type = pict_type;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+        }
+
+        ff_side_data_set_encoder_stats(pkt, 0, cx_frame->sse + 1,
+                                       cx_frame->have_sse ? 3 : 0, pict_type);
+
+        if (cx_frame->have_sse) {
+            int i;
+            /* Beware of the Y/U/V/all order! */
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+            avctx->coded_frame->error[0] = cx_frame->sse[1];
+            avctx->coded_frame->error[1] = cx_frame->sse[2];
+            avctx->coded_frame->error[2] = cx_frame->sse[3];
+            avctx->coded_frame->error[3] = 0;    // alpha
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
+            for (i = 0; i < 3; ++i) {
+                avctx->error[i] += cx_frame->sse[i + 1];
+            }
+            cx_frame->have_sse = 0;
+        }
+        if (cx_frame->sz_alpha > 0) {
+            side_data = av_packet_new_side_data(pkt,
+                                                AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL,
+                                                cx_frame->sz_alpha + 8);
+            if(!side_data) {
+                av_free_packet(pkt);
+                av_free(pkt);
+                return AVERROR(ENOMEM);
+            }
+            AV_WB64(side_data, 1);
+            memcpy(side_data + 8, cx_frame->buf_alpha, cx_frame->sz_alpha);
         }
     } else {
-        av_log(avctx, AV_LOG_ERROR,
-               "Error getting output packet of size %zu.\n", cx_frame->sz);
         return ret;
     }
     return pkt->size;
@@ -438,7 +766,9 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
 {
     VP8Context *ctx = avctx->priv_data;
     const struct vpx_codec_cx_pkt *pkt;
+    const struct vpx_codec_cx_pkt *pkt_alpha = NULL;
     const void *iter = NULL;
+    const void *iter_alpha = NULL;
     int size = 0;
 
     if (ctx->coded_frame_list) {
@@ -453,7 +783,9 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
 
     /* consume all available output from the encoder before returning. buffers
        are only good through the next vpx_codec call */
-    while ((pkt = vpx_codec_get_cx_data(&ctx->encoder, &iter))) {
+    while ((pkt = vpx_codec_get_cx_data(&ctx->encoder, &iter)) &&
+           (!ctx->is_alpha ||
+            (ctx->is_alpha && (pkt_alpha = vpx_codec_get_cx_data(&ctx->encoder_alpha, &iter_alpha))))) {
         switch (pkt->kind) {
         case VPX_CODEC_CX_FRAME_PKT:
             if (!size) {
@@ -461,8 +793,8 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
 
                 /* avoid storing the frame when the list is empty and we haven't yet
                    provided a frame for output */
-                assert(!ctx->coded_frame_list);
-                cx_pktcpy(&cx_frame, pkt);
+                av_assert0(!ctx->coded_frame_list);
+                cx_pktcpy(&cx_frame, pkt, pkt_alpha, ctx);
                 size = storeframe(avctx, &cx_frame, pkt_out);
                 if (size < 0)
                     return size;
@@ -475,17 +807,28 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
                            "Frame queue element alloc failed\n");
                     return AVERROR(ENOMEM);
                 }
-                cx_pktcpy(cx_frame, pkt);
+                cx_pktcpy(cx_frame, pkt, pkt_alpha, ctx);
                 cx_frame->buf = av_malloc(cx_frame->sz);
 
                 if (!cx_frame->buf) {
                     av_log(avctx, AV_LOG_ERROR,
-                           "Data buffer alloc (%zu bytes) failed\n",
+                           "Data buffer alloc (%"SIZE_SPECIFIER" bytes) failed\n",
                            cx_frame->sz);
                     av_freep(&cx_frame);
                     return AVERROR(ENOMEM);
                 }
                 memcpy(cx_frame->buf, pkt->data.frame.buf, pkt->data.frame.sz);
+                if (ctx->is_alpha) {
+                    cx_frame->buf_alpha = av_malloc(cx_frame->sz_alpha);
+                    if (!cx_frame->buf_alpha) {
+                        av_log(avctx, AV_LOG_ERROR,
+                               "Data buffer alloc (%"SIZE_SPECIFIER" bytes) failed\n",
+                               cx_frame->sz_alpha);
+                        av_free(cx_frame);
+                        return AVERROR(ENOMEM);
+                    }
+                    memcpy(cx_frame->buf_alpha, pkt_alpha->data.frame.buf, pkt_alpha->data.frame.sz);
+                }
                 coded_frame_add(&ctx->coded_frame_list, cx_frame);
             }
             break;
@@ -504,7 +847,14 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
             stats->sz += pkt->data.twopass_stats.sz;
             break;
         }
-        case VPX_CODEC_PSNR_PKT: //FIXME add support for AV_CODEC_FLAG_PSNR
+        case VPX_CODEC_PSNR_PKT:
+            av_assert0(!ctx->have_sse);
+            ctx->sse[0] = pkt->data.psnr.sse[0];
+            ctx->sse[1] = pkt->data.psnr.sse[1];
+            ctx->sse[2] = pkt->data.psnr.sse[2];
+            ctx->sse[3] = pkt->data.psnr.sse[3];
+            ctx->have_sse = 1;
+            break;
         case VPX_CODEC_CUSTOM_PKT:
             //ignore unsupported/unrecognized packet types
             break;
@@ -519,6 +869,7 @@ static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
 {
     VP8Context *ctx = avctx->priv_data;
     struct vpx_image *rawimg = NULL;
+    struct vpx_image *rawimg_alpha = NULL;
     int64_t timestamp = 0;
     int res, coded_size;
     vpx_enc_frame_flags_t flags = 0;
@@ -531,6 +882,25 @@ static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
         rawimg->stride[VPX_PLANE_Y] = frame->linesize[0];
         rawimg->stride[VPX_PLANE_U] = frame->linesize[1];
         rawimg->stride[VPX_PLANE_V] = frame->linesize[2];
+        if (ctx->is_alpha) {
+            uint8_t *u_plane, *v_plane;
+            rawimg_alpha = &ctx->rawimg_alpha;
+            rawimg_alpha->planes[VPX_PLANE_Y] = frame->data[3];
+            u_plane = av_malloc(frame->linesize[1] * frame->height);
+            v_plane = av_malloc(frame->linesize[2] * frame->height);
+            if (!u_plane || !v_plane) {
+                av_free(u_plane);
+                av_free(v_plane);
+                return AVERROR(ENOMEM);
+            }
+            memset(u_plane, 0x80, frame->linesize[1] * frame->height);
+            rawimg_alpha->planes[VPX_PLANE_U] = u_plane;
+            memset(v_plane, 0x80, frame->linesize[2] * frame->height);
+            rawimg_alpha->planes[VPX_PLANE_V] = v_plane;
+            rawimg_alpha->stride[VPX_PLANE_Y] = frame->linesize[0];
+            rawimg_alpha->stride[VPX_PLANE_U] = frame->linesize[1];
+            rawimg_alpha->stride[VPX_PLANE_V] = frame->linesize[2];
+        }
         timestamp                   = frame->pts;
         if (frame->pict_type == AV_PICTURE_TYPE_I)
             flags |= VPX_EFLAG_FORCE_KF;
@@ -542,6 +912,16 @@ static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
         log_encoder_error(avctx, "Error encoding frame");
         return AVERROR_INVALIDDATA;
     }
+
+    if (ctx->is_alpha) {
+        res = vpx_codec_encode(&ctx->encoder_alpha, rawimg_alpha, timestamp,
+                               avctx->ticks_per_frame, flags, ctx->deadline);
+        if (res != VPX_CODEC_OK) {
+            log_encoder_error(avctx, "Error encoding alpha frame");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     coded_size = queue_frames(avctx, pkt);
 
     if (!frame && avctx->flags & AV_CODEC_FLAG_PASS1) {
@@ -557,40 +937,89 @@ static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
                          ctx->twopass_stats.sz);
     }
 
+    if (rawimg_alpha) {
+        av_freep(&rawimg_alpha->planes[VPX_PLANE_U]);
+        av_freep(&rawimg_alpha->planes[VPX_PLANE_V]);
+    }
+
     *got_packet = !!coded_size;
     return 0;
 }
 
 #define OFFSET(x) offsetof(VP8Context, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
-static const AVOption options[] = {
-    { "cpu-used",        "Quality/Speed ratio modifier",           OFFSET(cpu_used),        AV_OPT_TYPE_INT, {.i64 = 1}, INT_MIN, INT_MAX, VE},
-    { "auto-alt-ref",    "Enable use of alternate reference "
-                         "frames (2-pass only)",                   OFFSET(auto_alt_ref),    AV_OPT_TYPE_INT, {.i64 = -1},      -1,      1,       VE},
-    { "lag-in-frames",   "Number of frames to look ahead for "
-                         "alternate reference frame selection",    OFFSET(lag_in_frames),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE},
-    { "arnr-maxframes",  "altref noise reduction max frame count", OFFSET(arnr_max_frames), AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE},
-    { "arnr-strength",   "altref noise reduction filter strength", OFFSET(arnr_strength),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE},
-    { "arnr-type",       "altref noise reduction filter type",     OFFSET(arnr_type),       AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE, "arnr_type"},
-    { "backward",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 0, VE, "arnr_type" },
-    { "forward",         NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 0, VE, "arnr_type" },
-    { "centered",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 0, VE, "arnr_type" },
-    { "deadline",        "Time to spend encoding, in microseconds.", OFFSET(deadline),      AV_OPT_TYPE_INT, {.i64 = VPX_DL_GOOD_QUALITY}, INT_MIN, INT_MAX, VE, "quality"},
-    { "best",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_BEST_QUALITY}, 0, 0, VE, "quality"},
-    { "good",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_GOOD_QUALITY}, 0, 0, VE, "quality"},
-    { "realtime",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_REALTIME},     0, 0, VE, "quality"},
-    { "error-resilient", "Error resilience configuration", OFFSET(error_resilient), AV_OPT_TYPE_FLAGS, {.i64 = 0}, INT_MIN, INT_MAX, VE, "er"},
-#ifdef VPX_ERROR_RESILIENT_DEFAULT
-    { "default",         "Improve resiliency against losses of whole frames", 0, AV_OPT_TYPE_CONST, {.i64 = VPX_ERROR_RESILIENT_DEFAULT}, 0, 0, VE, "er"},
-    { "partitions",      "The frame partitions are independently decodable "
-                         "by the bool decoder, meaning that partitions can be decoded even "
-                         "though earlier partitions have been lost. Note that intra predicition"
-                         " is still done over the partition boundary.",       0, AV_OPT_TYPE_CONST, {.i64 = VPX_ERROR_RESILIENT_PARTITIONS}, 0, 0, VE, "er"},
+
+#ifndef VPX_ERROR_RESILIENT_DEFAULT
+#define VPX_ERROR_RESILIENT_DEFAULT 1
+#define VPX_ERROR_RESILIENT_PARTITIONS 2
 #endif
-    { "crf",              "Select the quality for constant quality mode", offsetof(VP8Context, crf), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 63, VE },
-    { "static-thresh",    "A change threshold on blocks below which they will be skipped by the encoder", OFFSET(static_thresh), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+
+#define COMMON_OPTIONS \
+    { "cpu-used",        "Quality/Speed ratio modifier",           OFFSET(cpu_used),        AV_OPT_TYPE_INT, {.i64 = 1},       -16,     16,      VE}, \
+    { "auto-alt-ref",    "Enable use of alternate reference " \
+                         "frames (2-pass only)",                   OFFSET(auto_alt_ref),    AV_OPT_TYPE_INT, {.i64 = -1},      -1,      1,       VE}, \
+    { "lag-in-frames",   "Number of frames to look ahead for " \
+                         "alternate reference frame selection",    OFFSET(lag_in_frames),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE}, \
+    { "arnr-maxframes",  "altref noise reduction max frame count", OFFSET(arnr_max_frames), AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE}, \
+    { "arnr-strength",   "altref noise reduction filter strength", OFFSET(arnr_strength),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE}, \
+    { "arnr-type",       "altref noise reduction filter type",     OFFSET(arnr_type),       AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE, "arnr_type"}, \
+    { "backward",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 0, VE, "arnr_type" }, \
+    { "forward",         NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 0, VE, "arnr_type" }, \
+    { "centered",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 0, VE, "arnr_type" }, \
+    { "deadline",        "Time to spend encoding, in microseconds.", OFFSET(deadline),      AV_OPT_TYPE_INT, {.i64 = VPX_DL_GOOD_QUALITY}, INT_MIN, INT_MAX, VE, "quality"}, \
+    { "best",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_BEST_QUALITY}, 0, 0, VE, "quality"}, \
+    { "good",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_GOOD_QUALITY}, 0, 0, VE, "quality"}, \
+    { "realtime",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_REALTIME},     0, 0, VE, "quality"}, \
+    { "error-resilient", "Error resilience configuration", OFFSET(error_resilient), AV_OPT_TYPE_FLAGS, {.i64 = 0}, INT_MIN, INT_MAX, VE, "er"}, \
+    { "max-intra-rate",  "Maximum I-frame bitrate (pct) 0=unlimited",  OFFSET(max_intra_rate),  AV_OPT_TYPE_INT,  {.i64 = -1}, -1,      INT_MAX, VE}, \
+    { "default",         "Improve resiliency against losses of whole frames", 0, AV_OPT_TYPE_CONST, {.i64 = VPX_ERROR_RESILIENT_DEFAULT}, 0, 0, VE, "er"}, \
+    { "partitions",      "The frame partitions are independently decodable " \
+                         "by the bool decoder, meaning that partitions can be decoded even " \
+                         "though earlier partitions have been lost. Note that intra predicition" \
+                         " is still done over the partition boundary.",       0, AV_OPT_TYPE_CONST, {.i64 = VPX_ERROR_RESILIENT_PARTITIONS}, 0, 0, VE, "er"}, \
+    { "crf",              "Select the quality for constant quality mode", offsetof(VP8Context, crf), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 63, VE }, \
+    { "static-thresh",    "A change threshold on blocks below which they will be skipped by the encoder", OFFSET(static_thresh), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE }, \
+    { "undershoot-pct",  "Datarate undershoot (min) target (%)", OFFSET(rc_undershoot_pct), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 100, VE }, \
+    { "overshoot-pct",   "Datarate overshoot (max) target (%)", OFFSET(rc_overshoot_pct), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1000, VE }, \
+
+#define LEGACY_OPTIONS \
+    {"speed", "", offsetof(VP8Context, cpu_used), AV_OPT_TYPE_INT, {.i64 = 1}, -16, 16, VE}, \
+    {"quality", "", offsetof(VP8Context, deadline), AV_OPT_TYPE_INT, {.i64 = VPX_DL_GOOD_QUALITY}, INT_MIN, INT_MAX, VE, "quality"}, \
+    {"vp8flags", "", offsetof(VP8Context, flags), AV_OPT_TYPE_FLAGS, {.i64 = 0}, 0, UINT_MAX, VE, "flags"}, \
+    {"error_resilient", "enable error resilience", 0, AV_OPT_TYPE_CONST, {.i64 = VP8F_ERROR_RESILIENT}, INT_MIN, INT_MAX, VE, "flags"}, \
+    {"altref", "enable use of alternate reference frames (VP8/2-pass only)", 0, AV_OPT_TYPE_CONST, {.i64 = VP8F_AUTO_ALT_REF}, INT_MIN, INT_MAX, VE, "flags"}, \
+    {"arnr_max_frames", "altref noise reduction max frame count", offsetof(VP8Context, arnr_max_frames), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 15, VE}, \
+    {"arnr_strength", "altref noise reduction filter strength", offsetof(VP8Context, arnr_strength), AV_OPT_TYPE_INT, {.i64 = 3}, 0, 6, VE}, \
+    {"arnr_type", "altref noise reduction filter type", offsetof(VP8Context, arnr_type), AV_OPT_TYPE_INT, {.i64 = 3}, 1, 3, VE}, \
+    {"rc_lookahead", "Number of frames to look ahead for alternate reference frame selection", offsetof(VP8Context, lag_in_frames), AV_OPT_TYPE_INT, {.i64 = 25}, 0, 25, VE}, \
+
+#if CONFIG_LIBVPX_VP8_ENCODER
+static const AVOption vp8_options[] = {
+    COMMON_OPTIONS
+    LEGACY_OPTIONS
     { NULL }
 };
+#endif
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+static const AVOption vp9_options[] = {
+    COMMON_OPTIONS
+    { "lossless",        "Lossless mode",                               OFFSET(lossless),        AV_OPT_TYPE_INT, {.i64 = -1}, -1, 1, VE},
+    { "tile-columns",    "Number of tile columns to use, log2",         OFFSET(tile_columns),    AV_OPT_TYPE_INT, {.i64 = -1}, -1, 6, VE},
+    { "tile-rows",       "Number of tile rows to use, log2",            OFFSET(tile_rows),       AV_OPT_TYPE_INT, {.i64 = -1}, -1, 2, VE},
+    { "frame-parallel",  "Enable frame parallel decodability features", OFFSET(frame_parallel),  AV_OPT_TYPE_INT, {.i64 = -1}, -1, 1, VE},
+    { "aq-mode",         "adaptive quantization mode",                  OFFSET(aq_mode),         AV_OPT_TYPE_INT, {.i64 = -1}, -1, 3, VE, "aq_mode"},
+    { "none",            "Aq not used",         0, AV_OPT_TYPE_CONST, {.i64 = 0}, 0, 0, VE, "aq_mode" },
+    { "variance",        "Variance based Aq",   0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 0, VE, "aq_mode" },
+    { "complexity",      "Complexity based Aq", 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 0, VE, "aq_mode" },
+    { "cyclic",          "Cyclic Refresh Aq",   0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 0, VE, "aq_mode" },
+    LEGACY_OPTIONS
+    { NULL }
+};
+#endif
+
+#undef COMMON_OPTIONS
+#undef LEGACY_OPTIONS
 
 static const AVCodecDefault defaults[] = {
     { "qmin",             "-1" },
@@ -603,13 +1032,13 @@ static const AVCodecDefault defaults[] = {
 #if CONFIG_LIBVPX_VP8_ENCODER
 static av_cold int vp8_init(AVCodecContext *avctx)
 {
-    return vpx_init(avctx, &vpx_codec_vp8_cx_algo);
+    return vpx_init(avctx, vpx_codec_vp8_cx());
 }
 
 static const AVClass class_vp8 = {
-    .class_name = "libvpx encoder",
+    .class_name = "libvpx-vp8 encoder",
     .item_name  = av_default_item_name,
-    .option     = options,
+    .option     = vp8_options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
@@ -623,7 +1052,7 @@ AVCodec ff_libvpx_vp8_encoder = {
     .encode2        = vp8_encode,
     .close          = vp8_free,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
-    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE },
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUVA420P, AV_PIX_FMT_NONE },
     .priv_class     = &class_vp8,
     .defaults       = defaults,
 };
@@ -632,13 +1061,13 @@ AVCodec ff_libvpx_vp8_encoder = {
 #if CONFIG_LIBVPX_VP9_ENCODER
 static av_cold int vp9_init(AVCodecContext *avctx)
 {
-    return vpx_init(avctx, &vpx_codec_vp9_cx_algo);
+    return vpx_init(avctx, vpx_codec_vp9_cx());
 }
 
 static const AVClass class_vp9 = {
-    .class_name = "libvpx encoder",
+    .class_name = "libvpx-vp9 encoder",
     .item_name  = av_default_item_name,
-    .option     = options,
+    .option     = vp9_options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
@@ -660,17 +1089,9 @@ AVCodec ff_libvpx_vp9_encoder = {
     .encode2        = vp8_encode,
     .close          = vp8_free,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
-    .pix_fmts       = (const enum AVPixelFormat[]) {
-        AV_PIX_FMT_YUV420P,
-#if VPX_IMAGE_ABI_VERSION >= 3
-        AV_PIX_FMT_YUV422P,
-        AV_PIX_FMT_YUV444P,
-        AV_PIX_FMT_YUV440P,
-#endif
-        AV_PIX_FMT_NONE,
-    },
     .profiles       = NULL_IF_CONFIG_SMALL(profiles),
     .priv_class     = &class_vp9,
     .defaults       = defaults,
+    .init_static_data = ff_vp9_init_static,
 };
 #endif /* CONFIG_LIBVPX_VP9_ENCODER */
diff --git a/libavcodec/libwavpackenc.c b/libavcodec/libwavpackenc.c
index 1455d91b21..6d5708985a 100644
--- a/libavcodec/libwavpackenc.c
+++ b/libavcodec/libwavpackenc.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libwebpenc.c b/libavcodec/libwebpenc.c
index ef311b7592..0bcf628e58 100644
--- a/libavcodec/libwebpenc.c
+++ b/libavcodec/libwebpenc.c
@@ -2,213 +2,48 @@
  * WebP encoding support via libwebp
  * Copyright (c) 2013 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * WebP encoder using libwebp
+ * WebP encoder using libwebp (WebPEncode API)
  */
 
-#include <webp/encode.h>
+#include "libwebpenc_common.h"
 
-#include "libavutil/common.h"
-#include "libavutil/frame.h"
-#include "libavutil/imgutils.h"
-#include "libavutil/opt.h"
-#include "avcodec.h"
-#include "internal.h"
-
-typedef struct LibWebPContext {
-    AVClass *class;         // class for AVOptions
-    float quality;          // lossy quality 0 - 100
-    int lossless;           // use lossless encoding
-    int preset;             // configuration preset
-    int chroma_warning;     // chroma linesize mismatch warning has been printed
-    int conversion_warning; // pixel format conversion warning has been printed
-    WebPConfig config;      // libwebp configuration
-} LibWebPContext;
-
-static int libwebp_error_to_averror(int err)
-{
-    switch (err) {
-    case VP8_ENC_ERROR_OUT_OF_MEMORY:
-    case VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY:
-        return AVERROR(ENOMEM);
-    case VP8_ENC_ERROR_NULL_PARAMETER:
-    case VP8_ENC_ERROR_INVALID_CONFIGURATION:
-    case VP8_ENC_ERROR_BAD_DIMENSION:
-        return AVERROR(EINVAL);
-    }
-    return AVERROR_UNKNOWN;
-}
+typedef LibWebPContextCommon LibWebPContext;
 
 static av_cold int libwebp_encode_init(AVCodecContext *avctx)
 {
-    LibWebPContext *s = avctx->priv_data;
-    int ret;
-
-    if (avctx->global_quality < 0)
-        avctx->global_quality = 75 * FF_QP2LAMBDA;
-    s->quality = av_clipf(avctx->global_quality / (float)FF_QP2LAMBDA,
-                          0.0f, 100.0f);
-
-    if (avctx->compression_level < 0 || avctx->compression_level > 6) {
-        av_log(avctx, AV_LOG_WARNING, "invalid compression level: %d\n",
-               avctx->compression_level);
-        avctx->compression_level = av_clip(avctx->compression_level, 0, 6);
-    }
-
-    if (s->preset >= WEBP_PRESET_DEFAULT) {
-        ret = WebPConfigPreset(&s->config, s->preset, s->quality);
-        if (!ret)
-            return AVERROR_UNKNOWN;
-        s->lossless              = s->config.lossless;
-        s->quality               = s->config.quality;
-        avctx->compression_level = s->config.method;
-    } else {
-        ret = WebPConfigInit(&s->config);
-        if (!ret)
-            return AVERROR_UNKNOWN;
-
-        s->config.lossless = s->lossless;
-        s->config.quality  = s->quality;
-        s->config.method   = avctx->compression_level;
-
-        ret = WebPValidateConfig(&s->config);
-        if (!ret)
-            return AVERROR(EINVAL);
-    }
-
-    av_log(avctx, AV_LOG_DEBUG, "%s - quality=%.1f method=%d\n",
-           s->lossless ? "Lossless" : "Lossy", s->quality,
-           avctx->compression_level);
-
-    return 0;
+    return ff_libwebp_encode_init_common(avctx);
 }
 
 static int libwebp_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                                 const AVFrame *frame, int *got_packet)
 {
     LibWebPContext *s  = avctx->priv_data;
-    AVFrame *alt_frame = NULL;
     WebPPicture *pic = NULL;
+    AVFrame *alt_frame = NULL;
     WebPMemoryWriter mw = { 0 };
-    int ret;
 
-    if (avctx->width > WEBP_MAX_DIMENSION || avctx->height > WEBP_MAX_DIMENSION) {
-        av_log(avctx, AV_LOG_ERROR, "Picture size is too large. Max is %dx%d.\n",
-               WEBP_MAX_DIMENSION, WEBP_MAX_DIMENSION);
-        return AVERROR(EINVAL);
-    }
-
-    pic = av_malloc(sizeof(*pic));
-    if (!pic)
-        return AVERROR(ENOMEM);
-
-    ret = WebPPictureInit(pic);
-    if (!ret) {
-        ret = AVERROR_UNKNOWN;
+    int ret = ff_libwebp_get_frame(avctx, s, frame, &alt_frame, &pic);
+    if (ret < 0)
         goto end;
-    }
-    pic->width  = avctx->width;
-    pic->height = avctx->height;
-
-    if (avctx->pix_fmt == AV_PIX_FMT_RGB32) {
-        if (!s->lossless) {
-            /* libwebp will automatically convert RGB input to YUV when
-               encoding lossy. */
-            if (!s->conversion_warning) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "Using libwebp for RGB-to-YUV conversion. You may want "
-                       "to consider passing in YUV instead for lossy "
-                       "encoding.\n");
-                s->conversion_warning = 1;
-            }
-        }
-        pic->use_argb    = 1;
-        pic->argb        = (uint32_t *)frame->data[0];
-        pic->argb_stride = frame->linesize[0] / 4;
-    } else {
-        if (frame->linesize[1] != frame->linesize[2]) {
-            if (!s->chroma_warning) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "Copying frame due to differing chroma linesizes.\n");
-                s->chroma_warning = 1;
-            }
-            alt_frame = av_frame_alloc();
-            if (!alt_frame) {
-                ret = AVERROR(ENOMEM);
-                goto end;
-            }
-            alt_frame->width  = frame->width;
-            alt_frame->height = frame->height;
-            alt_frame->format = frame->format;
-            ret = av_frame_get_buffer(alt_frame, 32);
-            if (ret < 0)
-                goto end;
-            av_frame_copy(alt_frame, frame);
-            frame = alt_frame;
-        }
-        pic->use_argb  = 0;
-        pic->y         = frame->data[0];
-        pic->u         = frame->data[1];
-        pic->v         = frame->data[2];
-        pic->y_stride  = frame->linesize[0];
-        pic->uv_stride = frame->linesize[1];
-        if (avctx->pix_fmt == AV_PIX_FMT_YUVA420P) {
-            pic->colorspace = WEBP_YUV420A;
-            pic->a          = frame->data[3];
-            pic->a_stride   = frame->linesize[3];
-        } else {
-            pic->colorspace = WEBP_YUV420;
-        }
-
-        if (s->lossless) {
-            /* We do not have a way to automatically prioritize RGB over YUV
-               in automatic pixel format conversion based on whether we're
-               encoding lossless or lossy, so we do conversion with libwebp as
-               a convenience. */
-            if (!s->conversion_warning) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "Using libwebp for YUV-to-RGB conversion. You may want "
-                       "to consider passing in RGB instead for lossless "
-                       "encoding.\n");
-                s->conversion_warning = 1;
-            }
-
-#if (WEBP_ENCODER_ABI_VERSION <= 0x201)
-            /* libwebp should do the conversion automatically, but there is a
-               bug that causes it to return an error instead, so a work-around
-               is required.
-               See https://code.google.com/p/webp/issues/detail?id=178 */
-            pic->memory_ = (void*)1;  /* something non-null */
-            ret = WebPPictureYUVAToARGB(pic);
-            if (!ret) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "WebPPictureYUVAToARGB() failed with error: %d\n",
-                       pic->error_code);
-                ret = libwebp_error_to_averror(pic->error_code);
-                goto end;
-            }
-            pic->memory_ = NULL;  /* restore pointer */
-#endif
-        }
-    }
 
     WebPMemoryWriterInit(&mw);
     pic->custom_ptr = &mw;
@@ -218,11 +53,11 @@ static int libwebp_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     if (!ret) {
         av_log(avctx, AV_LOG_ERROR, "WebPEncode() failed with error: %d\n",
                pic->error_code);
-        ret = libwebp_error_to_averror(pic->error_code);
+        ret = ff_libwebp_error_to_averror(pic->error_code);
         goto end;
     }
 
-    ret = ff_alloc_packet(pkt, mw.size);
+    ret = ff_alloc_packet2(avctx, pkt, mw.size, mw.size);
     if (ret < 0)
         goto end;
     memcpy(pkt->data, mw.mem, mw.size);
@@ -243,20 +78,13 @@ end:
     return ret;
 }
 
-#define OFFSET(x) offsetof(LibWebPContext, x)
-#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
-static const AVOption options[] = {
-    { "lossless",   "Use lossless mode",       OFFSET(lossless), AV_OPT_TYPE_INT,   { .i64 =  0 },  0, 1,                           VE           },
-    { "preset",     "Configuration preset",    OFFSET(preset),   AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, WEBP_PRESET_TEXT,            VE, "preset" },
-    { "none",       "do not use a preset",                              0, AV_OPT_TYPE_CONST, { .i64 = -1                  }, 0, 0, VE, "preset" },
-    { "default",    "default preset",                                   0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_DEFAULT }, 0, 0, VE, "preset" },
-    { "picture",    "digital picture, like portrait, inner shot",       0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_PICTURE }, 0, 0, VE, "preset" },
-    { "photo",      "outdoor photograph, with natural lighting",        0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_PHOTO   }, 0, 0, VE, "preset" },
-    { "drawing",    "hand or line drawing, with high-contrast details", 0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_DRAWING }, 0, 0, VE, "preset" },
-    { "icon",       "small-sized colorful images",                      0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_ICON    }, 0, 0, VE, "preset" },
-    { "text",       "text-like",                                        0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_TEXT    }, 0, 0, VE, "preset" },
-    { NULL },
-};
+static int libwebp_encode_close(AVCodecContext *avctx)
+{
+    LibWebPContextCommon *s  = avctx->priv_data;
+    av_frame_free(&s->ref);
+
+    return 0;
+}
 
 static const AVClass class = {
     .class_name = "libwebp",
@@ -265,12 +93,6 @@ static const AVClass class = {
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
-static const AVCodecDefault libwebp_defaults[] = {
-    { "compression_level",  "4"  },
-    { "global_quality",     "-1" },
-    { NULL },
-};
-
 AVCodec ff_libwebp_encoder = {
     .name           = "libwebp",
     .long_name      = NULL_IF_CONFIG_SMALL("libwebp WebP image"),
@@ -279,6 +101,7 @@ AVCodec ff_libwebp_encoder = {
     .priv_data_size = sizeof(LibWebPContext),
     .init           = libwebp_encode_init,
     .encode2        = libwebp_encode_frame,
+    .close          = libwebp_encode_close,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_RGB32,
         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUVA420P,
diff --git a/libavcodec/libwebpenc_animencoder.c b/libavcodec/libwebpenc_animencoder.c
new file mode 100644
index 0000000000..d7437a9a57
--- /dev/null
+++ b/libavcodec/libwebpenc_animencoder.c
@@ -0,0 +1,150 @@
+/*
+ * WebP encoding support via libwebp
+ * Copyright (c) 2015 Urvang Joshi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebP encoder using libwebp (WebPAnimEncoder API)
+ */
+
+#include "config.h"
+#include "libwebpenc_common.h"
+
+#include <webp/mux.h>
+
+typedef struct LibWebPAnimContext {
+    LibWebPContextCommon cc;
+    WebPAnimEncoder *enc;     // the main AnimEncoder object
+    int64_t prev_frame_pts;   // pts of the previously encoded frame.
+    int done;                 // If true, we have assembled the bitstream already
+} LibWebPAnimContext;
+
+static av_cold int libwebp_anim_encode_init(AVCodecContext *avctx)
+{
+    int ret = ff_libwebp_encode_init_common(avctx);
+    if (!ret) {
+        LibWebPAnimContext *s = avctx->priv_data;
+        WebPAnimEncoderOptions enc_options;
+        WebPAnimEncoderOptionsInit(&enc_options);
+        // TODO(urvang): Expose some options on command-line perhaps.
+        s->enc = WebPAnimEncoderNew(avctx->width, avctx->height, &enc_options);
+        if (!s->enc)
+            return AVERROR(EINVAL);
+        s->prev_frame_pts = -1;
+        s->done = 0;
+    }
+    return ret;
+}
+
+static int libwebp_anim_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                     const AVFrame *frame, int *got_packet) {
+    LibWebPAnimContext *s = avctx->priv_data;
+    int ret;
+
+    if (!frame) {
+        if (s->done) {  // Second flush: return empty package to denote finish.
+            *got_packet = 0;
+            return 0;
+        } else {  // First flush: assemble bitstream and return it.
+            WebPData assembled_data = { 0 };
+            ret = WebPAnimEncoderAssemble(s->enc, &assembled_data);
+            if (ret) {
+                ret = ff_alloc_packet2(avctx, pkt, assembled_data.size, assembled_data.size);
+                if (ret < 0)
+                    return ret;
+                memcpy(pkt->data, assembled_data.bytes, assembled_data.size);
+                s->done = 1;
+                pkt->flags |= AV_PKT_FLAG_KEY;
+                pkt->pts = pkt->dts = s->prev_frame_pts + 1;
+                *got_packet = 1;
+                return 0;
+            } else {
+                av_log(s, AV_LOG_ERROR,
+                       "WebPAnimEncoderAssemble() failed with error: %d\n",
+                       VP8_ENC_ERROR_OUT_OF_MEMORY);
+                return AVERROR(ENOMEM);
+            }
+        }
+    } else {
+        int timestamp_ms;
+        WebPPicture *pic = NULL;
+        AVFrame *alt_frame = NULL;
+        ret = ff_libwebp_get_frame(avctx, &s->cc, frame, &alt_frame, &pic);
+        if (ret < 0)
+            goto end;
+
+        timestamp_ms =
+            avctx->time_base.num * frame->pts * 1000 / avctx->time_base.den;
+        ret = WebPAnimEncoderAdd(s->enc, pic, timestamp_ms, &s->cc.config);
+        if (!ret) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Encoding WebP frame failed with error: %d\n",
+                   pic->error_code);
+            ret = ff_libwebp_error_to_averror(pic->error_code);
+            goto end;
+        }
+
+        pkt->pts = pkt->dts = frame->pts;
+        s->prev_frame_pts = frame->pts;  // Save for next frame.
+        ret = 0;
+        *got_packet = 1;
+
+end:
+        WebPPictureFree(pic);
+        av_freep(&pic);
+        av_frame_free(&alt_frame);
+        return ret;
+    }
+}
+
+static int libwebp_anim_encode_close(AVCodecContext *avctx)
+{
+    LibWebPAnimContext *s = avctx->priv_data;
+    av_frame_free(&s->cc.ref);
+    WebPAnimEncoderDelete(s->enc);
+
+    return 0;
+}
+
+static const AVClass class = {
+    .class_name = "libwebp_anim",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_libwebp_anim_encoder = {
+    .name           = "libwebp_anim",
+    .long_name      = NULL_IF_CONFIG_SMALL("libwebp WebP image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_WEBP,
+    .priv_data_size = sizeof(LibWebPAnimContext),
+    .init           = libwebp_anim_encode_init,
+    .encode2        = libwebp_anim_encode_frame,
+    .close          = libwebp_anim_encode_close,
+    .capabilities   = AV_CODEC_CAP_DELAY,
+    .pix_fmts       = (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_RGB32,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUVA420P,
+        AV_PIX_FMT_NONE
+    },
+    .priv_class     = &class,
+    .defaults       = libwebp_defaults,
+};
diff --git a/libavcodec/libwebpenc_common.c b/libavcodec/libwebpenc_common.c
new file mode 100644
index 0000000000..a76b6da5d6
--- /dev/null
+++ b/libavcodec/libwebpenc_common.c
@@ -0,0 +1,254 @@
+/*
+ * WebP encoding support via libwebp
+ * Copyright (c) 2013 Justin Ruggles <justin.ruggles@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebP encoder using libwebp: common structs and methods.
+ */
+
+#include "libwebpenc_common.h"
+
+int ff_libwebp_error_to_averror(int err)
+{
+    switch (err) {
+    case VP8_ENC_ERROR_OUT_OF_MEMORY:
+    case VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY:
+        return AVERROR(ENOMEM);
+    case VP8_ENC_ERROR_NULL_PARAMETER:
+    case VP8_ENC_ERROR_INVALID_CONFIGURATION:
+    case VP8_ENC_ERROR_BAD_DIMENSION:
+        return AVERROR(EINVAL);
+    }
+    return AVERROR_UNKNOWN;
+}
+
+av_cold int ff_libwebp_encode_init_common(AVCodecContext *avctx)
+{
+    LibWebPContextCommon *s = avctx->priv_data;
+    int ret;
+
+    if (avctx->global_quality >= 0)
+        s->quality = av_clipf(avctx->global_quality / (float)FF_QP2LAMBDA,
+                              0.0f, 100.0f);
+
+    if (avctx->compression_level < 0 || avctx->compression_level > 6) {
+        av_log(avctx, AV_LOG_WARNING, "invalid compression level: %d\n",
+               avctx->compression_level);
+        avctx->compression_level = av_clip(avctx->compression_level, 0, 6);
+    }
+
+    if (s->preset >= WEBP_PRESET_DEFAULT) {
+        ret = WebPConfigPreset(&s->config, s->preset, s->quality);
+        if (!ret)
+            return AVERROR_UNKNOWN;
+        s->lossless              = s->config.lossless;
+        s->quality               = s->config.quality;
+        avctx->compression_level = s->config.method;
+    } else {
+        ret = WebPConfigInit(&s->config);
+        if (!ret)
+            return AVERROR_UNKNOWN;
+
+        s->config.lossless = s->lossless;
+        s->config.quality  = s->quality;
+        s->config.method   = avctx->compression_level;
+
+        ret = WebPValidateConfig(&s->config);
+        if (!ret)
+            return AVERROR(EINVAL);
+    }
+
+    av_log(avctx, AV_LOG_DEBUG, "%s - quality=%.1f method=%d\n",
+           s->lossless ? "Lossless" : "Lossy", s->quality,
+           avctx->compression_level);
+
+    return 0;
+}
+
+int ff_libwebp_get_frame(AVCodecContext *avctx, LibWebPContextCommon *s,
+                         const AVFrame *frame, AVFrame **alt_frame_ptr,
+                         WebPPicture **pic_ptr) {
+    int ret;
+    WebPPicture *pic = NULL;
+    AVFrame *alt_frame = NULL;
+
+    if (avctx->width > WEBP_MAX_DIMENSION || avctx->height > WEBP_MAX_DIMENSION) {
+        av_log(avctx, AV_LOG_ERROR, "Picture size is too large. Max is %dx%d.\n",
+               WEBP_MAX_DIMENSION, WEBP_MAX_DIMENSION);
+        return AVERROR(EINVAL);
+    }
+
+    *pic_ptr = av_malloc(sizeof(*pic));
+    pic = *pic_ptr;
+    if (!pic)
+        return AVERROR(ENOMEM);
+
+    ret = WebPPictureInit(pic);
+    if (!ret) {
+        ret = AVERROR_UNKNOWN;
+        goto end;
+    }
+    pic->width  = avctx->width;
+    pic->height = avctx->height;
+
+    if (avctx->pix_fmt == AV_PIX_FMT_RGB32) {
+        if (!s->lossless) {
+            /* libwebp will automatically convert RGB input to YUV when
+               encoding lossy. */
+            if (!s->conversion_warning) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Using libwebp for RGB-to-YUV conversion. You may want "
+                       "to consider passing in YUV instead for lossy "
+                       "encoding.\n");
+                s->conversion_warning = 1;
+            }
+        }
+        pic->use_argb    = 1;
+        pic->argb        = (uint32_t *)frame->data[0];
+        pic->argb_stride = frame->linesize[0] / 4;
+    } else {
+        if (frame->linesize[1] != frame->linesize[2] || s->cr_threshold) {
+            if (!s->chroma_warning && !s->cr_threshold) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Copying frame due to differing chroma linesizes.\n");
+                s->chroma_warning = 1;
+            }
+            *alt_frame_ptr = av_frame_alloc();
+            alt_frame = *alt_frame_ptr;
+            if (!alt_frame) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+            alt_frame->width  = frame->width;
+            alt_frame->height = frame->height;
+            alt_frame->format = frame->format;
+            if (s->cr_threshold)
+                alt_frame->format = AV_PIX_FMT_YUVA420P;
+            ret = av_frame_get_buffer(alt_frame, 32);
+            if (ret < 0)
+                goto end;
+            alt_frame->format = frame->format;
+            av_frame_copy(alt_frame, frame);
+            frame = alt_frame;
+            if (s->cr_threshold) {
+                int x,y, x2, y2, p;
+                int bs = s->cr_size;
+
+                if (!s->ref) {
+                    s->ref = av_frame_clone(frame);
+                    if (!s->ref) {
+                        ret = AVERROR(ENOMEM);
+                        goto end;
+                    }
+                }
+
+                alt_frame->format = AV_PIX_FMT_YUVA420P;
+                for (y = 0; y < frame->height; y+= bs) {
+                    for (x = 0; x < frame->width; x+= bs) {
+                        int skip;
+                        int sse = 0;
+                        for (p = 0; p < 3; p++) {
+                            int bs2 = bs >> !!p;
+                            int w = FF_CEIL_RSHIFT(frame->width , !!p);
+                            int h = FF_CEIL_RSHIFT(frame->height, !!p);
+                            int xs = x >> !!p;
+                            int ys = y >> !!p;
+                            for (y2 = ys; y2 < FFMIN(ys + bs2, h); y2++) {
+                                for (x2 = xs; x2 < FFMIN(xs + bs2, w); x2++) {
+                                    int diff =  frame->data[p][frame->linesize[p] * y2 + x2]
+                                              -s->ref->data[p][frame->linesize[p] * y2 + x2];
+                                    sse += diff*diff;
+                                }
+                            }
+                        }
+                        skip = sse < s->cr_threshold && frame->data[3] != s->ref->data[3];
+                        if (!skip)
+                            for (p = 0; p < 3; p++) {
+                                int bs2 = bs >> !!p;
+                                int w = FF_CEIL_RSHIFT(frame->width , !!p);
+                                int h = FF_CEIL_RSHIFT(frame->height, !!p);
+                                int xs = x >> !!p;
+                                int ys = y >> !!p;
+                                for (y2 = ys; y2 < FFMIN(ys + bs2, h); y2++) {
+                                    memcpy(&s->ref->data[p][frame->linesize[p] * y2 + xs],
+                                            & frame->data[p][frame->linesize[p] * y2 + xs], FFMIN(bs2, w-xs));
+                                }
+                            }
+                        for (y2 = y; y2 < FFMIN(y+bs, frame->height); y2++) {
+                            memset(&frame->data[3][frame->linesize[3] * y2 + x],
+                                    skip ? 0 : 255,
+                                    FFMIN(bs, frame->width-x));
+                        }
+                    }
+                }
+            }
+        }
+
+        pic->use_argb  = 0;
+        pic->y         = frame->data[0];
+        pic->u         = frame->data[1];
+        pic->v         = frame->data[2];
+        pic->y_stride  = frame->linesize[0];
+        pic->uv_stride = frame->linesize[1];
+        if (frame->format == AV_PIX_FMT_YUVA420P) {
+            pic->colorspace = WEBP_YUV420A;
+            pic->a          = frame->data[3];
+            pic->a_stride   = frame->linesize[3];
+            if (alt_frame)
+                WebPCleanupTransparentArea(pic);
+        } else {
+            pic->colorspace = WEBP_YUV420;
+        }
+
+        if (s->lossless) {
+            /* We do not have a way to automatically prioritize RGB over YUV
+               in automatic pixel format conversion based on whether we're
+               encoding lossless or lossy, so we do conversion with libwebp as
+               a convenience. */
+            if (!s->conversion_warning) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Using libwebp for YUV-to-RGB conversion. You may want "
+                       "to consider passing in RGB instead for lossless "
+                       "encoding.\n");
+                s->conversion_warning = 1;
+            }
+
+#if (WEBP_ENCODER_ABI_VERSION <= 0x201)
+            /* libwebp should do the conversion automatically, but there is a
+               bug that causes it to return an error instead, so a work-around
+               is required.
+               See https://code.google.com/p/webp/issues/detail?id=178 */
+            pic->memory_ = (void*)1;  /* something non-null */
+            ret = WebPPictureYUVAToARGB(pic);
+            if (!ret) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "WebPPictureYUVAToARGB() failed with error: %d\n",
+                       pic->error_code);
+                ret = libwebp_error_to_averror(pic->error_code);
+                goto end;
+            }
+            pic->memory_ = NULL;  /* restore pointer */
+#endif
+        }
+    }
+end:
+    return ret;
+}
diff --git a/libavcodec/libwebpenc_common.h b/libavcodec/libwebpenc_common.h
new file mode 100644
index 0000000000..e74e57939e
--- /dev/null
+++ b/libavcodec/libwebpenc_common.h
@@ -0,0 +1,84 @@
+/*
+ * WebP encoding support via libwebp
+ * Copyright (c) 2013 Justin Ruggles <justin.ruggles@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebP encoder using libwebp: common structs and methods.
+ */
+
+#ifndef AVCODEC_LIBWEBPENC_COMMON_H
+#define AVCODEC_LIBWEBPENC_COMMON_H
+
+#include <webp/encode.h>
+
+#include "libavutil/common.h"
+#include "libavutil/frame.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+
+typedef struct LibWebPContextCommon {
+    AVClass *class;         // class for AVOptions
+    float quality;          // lossy quality 0 - 100
+    int lossless;           // use lossless encoding
+    int preset;             // configuration preset
+    int chroma_warning;     // chroma linesize mismatch warning has been printed
+    int conversion_warning; // pixel format conversion warning has been printed
+    WebPConfig config;      // libwebp configuration
+    AVFrame *ref;
+    int cr_size;
+    int cr_threshold;
+} LibWebPContextCommon;
+
+int ff_libwebp_error_to_averror(int err);
+
+av_cold int ff_libwebp_encode_init_common(AVCodecContext *avctx);
+
+int ff_libwebp_get_frame(AVCodecContext *avctx, LibWebPContextCommon *s,
+                         const AVFrame *frame, AVFrame **alt_frame_ptr,
+                         WebPPicture **pic_ptr);
+
+#define OFFSET(x) offsetof(LibWebPContextCommon, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "lossless",   "Use lossless mode",       OFFSET(lossless), AV_OPT_TYPE_INT,   { .i64 =  0 },  0, 1,                           VE           },
+    { "preset",     "Configuration preset",    OFFSET(preset),   AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, WEBP_PRESET_TEXT,            VE, "preset" },
+    { "none",       "do not use a preset",                              0, AV_OPT_TYPE_CONST, { .i64 = -1                  }, 0, 0, VE, "preset" },
+    { "default",    "default preset",                                   0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_DEFAULT }, 0, 0, VE, "preset" },
+    { "picture",    "digital picture, like portrait, inner shot",       0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_PICTURE }, 0, 0, VE, "preset" },
+    { "photo",      "outdoor photograph, with natural lighting",        0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_PHOTO   }, 0, 0, VE, "preset" },
+    { "drawing",    "hand or line drawing, with high-contrast details", 0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_DRAWING }, 0, 0, VE, "preset" },
+    { "icon",       "small-sized colorful images",                      0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_ICON    }, 0, 0, VE, "preset" },
+    { "text",       "text-like",                                        0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_TEXT    }, 0, 0, VE, "preset" },
+    { "cr_threshold","Conditional replenishment threshold",     OFFSET(cr_threshold), AV_OPT_TYPE_INT, { .i64 =  0  },  0, INT_MAX, VE           },
+    { "cr_size"     ,"Conditional replenishment block size",    OFFSET(cr_size)     , AV_OPT_TYPE_INT, { .i64 =  16 },  0, 256,     VE           },
+    { "quality"     ,"Quality",                OFFSET(quality),  AV_OPT_TYPE_FLOAT, { .dbl =  75 }, 0, 100,                         VE           },
+    { NULL },
+};
+
+static const AVCodecDefault libwebp_defaults[] = {
+    { "compression_level",  "4"  },
+    { "global_quality",     "-1" },
+    { NULL },
+};
+
+#endif /* AVCODEC_LIBWEBPENC_COMMON_H */
diff --git a/libavcodec/libx264.c b/libavcodec/libx264.c
index 50de007302..75b5a5feba 100644
--- a/libavcodec/libx264.c
+++ b/libavcodec/libx264.c
@@ -2,28 +2,30 @@
  * H.264 encoding using the x264 library
  * Copyright (C) 2005  Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/eval.h"
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "libavutil/mem.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/stereo3d.h"
+#include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "internal.h"
 
@@ -48,7 +50,10 @@ typedef struct X264Context {
     char *preset;
     char *tune;
     char *profile;
+    char *level;
     int fastfirstpass;
+    char *wpredp;
+    char *x264opts;
     float crf;
     float crf_max;
     int cqp;
@@ -76,8 +81,10 @@ typedef struct X264Context {
     int slice_max_size;
     char *stats;
     int nal_hrd;
+    int avcintra_class;
     int motion_est;
     int forced_idr;
+    int a53_cc;
     char *x264_params;
 } X264Context;
 
@@ -98,7 +105,7 @@ static void X264_log(void *p, int level, const char *fmt, va_list args)
 
 
 static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
-                       x264_nal_t *nals, int nnal)
+                       const x264_nal_t *nals, int nnal)
 {
     X264Context *x4 = ctx->priv_data;
     uint8_t *p;
@@ -110,16 +117,21 @@ static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
     for (i = 0; i < nnal; i++)
         size += nals[i].i_payload;
 
-    if ((ret = ff_alloc_packet(pkt, size)) < 0)
+    if ((ret = ff_alloc_packet2(ctx, pkt, size, 0)) < 0)
         return ret;
 
     p = pkt->data;
 
     /* Write the SEI as part of the first frame. */
     if (x4->sei_size > 0 && nnal > 0) {
+        if (x4->sei_size > size) {
+            av_log(ctx, AV_LOG_ERROR, "Error: nal buffer is too small\n");
+            return -1;
+        }
         memcpy(p, x4->sei, x4->sei_size);
         p += x4->sei_size;
         x4->sei_size = 0;
+        av_freep(&x4->sei);
     }
 
     for (i = 0; i < nnal; i++){
@@ -130,18 +142,39 @@ static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
     return 1;
 }
 
+static int avfmt2_num_planes(int avfmt)
+{
+    switch (avfmt) {
+    case AV_PIX_FMT_YUV420P:
+    case AV_PIX_FMT_YUVJ420P:
+    case AV_PIX_FMT_YUV420P9:
+    case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUV444P:
+        return 3;
+
+    case AV_PIX_FMT_BGR0:
+    case AV_PIX_FMT_BGR24:
+    case AV_PIX_FMT_RGB24:
+        return 1;
+
+    default:
+        return 3;
+    }
+}
+
 static void reconfig_encoder(AVCodecContext *ctx, const AVFrame *frame)
 {
     X264Context *x4 = ctx->priv_data;
     AVFrameSideData *side_data;
 
 
-    if (x4->params.b_tff != frame->top_field_first) {
+  if (x4->avcintra_class < 0) {
+    if (x4->params.b_interlaced && x4->params.b_tff != frame->top_field_first) {
+
         x4->params.b_tff = frame->top_field_first;
         x264_encoder_reconfig(x4->enc, &x4->params);
     }
-    if (x4->params.vui.i_sar_height != ctx->sample_aspect_ratio.den ||
-        x4->params.vui.i_sar_width  != ctx->sample_aspect_ratio.num) {
+    if (x4->params.vui.i_sar_height*ctx->sample_aspect_ratio.num != ctx->sample_aspect_ratio.den * x4->params.vui.i_sar_width) {
         x4->params.vui.i_sar_height = ctx->sample_aspect_ratio.den;
         x4->params.vui.i_sar_width  = ctx->sample_aspect_ratio.num;
         x264_encoder_reconfig(x4->enc, &x4->params);
@@ -168,6 +201,7 @@ static void reconfig_encoder(AVCodecContext *ctx, const AVFrame *frame)
     }
 
     if (x4->params.rc.i_rc_method == X264_RC_CQP &&
+        x4->cqp >= 0 &&
         x4->params.rc.i_qp_constant != x4->cqp) {
         x4->params.rc.i_qp_constant = x4->cqp;
         x264_encoder_reconfig(x4->enc, &x4->params);
@@ -178,6 +212,7 @@ static void reconfig_encoder(AVCodecContext *ctx, const AVFrame *frame)
         x4->params.rc.f_rf_constant_max = x4->crf_max;
         x264_encoder_reconfig(x4->enc, &x4->params);
     }
+  }
 
     side_data = av_frame_get_side_data(frame, AV_FRAME_DATA_STEREO3D);
     if (side_data) {
@@ -221,16 +256,18 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
     X264Context *x4 = ctx->priv_data;
     x264_nal_t *nal;
     int nnal, i, ret;
-    x264_picture_t pic_out;
+    x264_picture_t pic_out = {0};
+    int pict_type;
+    AVFrameSideData *side_data;
 
     x264_picture_init( &x4->pic );
     x4->pic.img.i_csp   = x4->params.i_csp;
     if (x264_bit_depth > 8)
         x4->pic.img.i_csp |= X264_CSP_HIGH_DEPTH;
-    x4->pic.img.i_plane = 3;
+    x4->pic.img.i_plane = avfmt2_num_planes(ctx->pix_fmt);
 
     if (frame) {
-        for (i = 0; i < 3; i++) {
+        for (i = 0; i < x4->pic.img.i_plane; i++) {
             x4->pic.img.plane[i]    = frame->data[i];
             x4->pic.img.i_stride[i] = frame->linesize[i];
         }
@@ -239,8 +276,8 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
 
         switch (frame->pict_type) {
         case AV_PICTURE_TYPE_I:
-            x4->pic.i_type = x4->forced_idr ? X264_TYPE_IDR
-                                            : X264_TYPE_KEYFRAME;
+            x4->pic.i_type = x4->forced_idr >= 0 ? X264_TYPE_IDR
+                                                 : X264_TYPE_KEYFRAME;
             break;
         case AV_PICTURE_TYPE_P:
             x4->pic.i_type = X264_TYPE_P;
@@ -253,10 +290,51 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
             break;
         }
         reconfig_encoder(ctx, frame);
+
+        if (x4->a53_cc) {
+            side_data = av_frame_get_side_data(frame, AV_FRAME_DATA_A53_CC);
+            if (side_data) {
+                x4->pic.extra_sei.payloads = av_mallocz(sizeof(x4->pic.extra_sei.payloads[0]));
+                if (x4->pic.extra_sei.payloads == NULL) {
+                    av_log(ctx, AV_LOG_ERROR, "Not enough memory for closed captions, skipping\n");
+                    goto skip_a53cc;
+                }
+                x4->pic.extra_sei.sei_free = av_free;
+
+                x4->pic.extra_sei.payloads[0].payload_size = side_data->size + 11;
+                x4->pic.extra_sei.payloads[0].payload = av_mallocz(x4->pic.extra_sei.payloads[0].payload_size);
+                if (x4->pic.extra_sei.payloads[0].payload == NULL) {
+                    av_log(ctx, AV_LOG_ERROR, "Not enough memory for closed captions, skipping\n");
+                    av_freep(&x4->pic.extra_sei.payloads);
+                    goto skip_a53cc;
+                }
+                x4->pic.extra_sei.num_payloads = 1;
+                x4->pic.extra_sei.payloads[0].payload_type = 4;
+                memcpy(x4->pic.extra_sei.payloads[0].payload + 10, side_data->data, side_data->size);
+                x4->pic.extra_sei.payloads[0].payload[0] = 181;
+                x4->pic.extra_sei.payloads[0].payload[1] = 0;
+                x4->pic.extra_sei.payloads[0].payload[2] = 49;
+
+                /**
+                 * 'GA94' is standard in North America for ATSC, but hard coding
+                 * this style may not be the right thing to do -- other formats
+                 * do exist. This information is not available in the side_data
+                 * so we are going with this right now.
+                 */
+                AV_WL32(x4->pic.extra_sei.payloads[0].payload + 3,
+                    MKTAG('G', 'A', '9', '4'));
+                x4->pic.extra_sei.payloads[0].payload[7] = 3;
+                x4->pic.extra_sei.payloads[0].payload[8] =
+                    ((side_data->size/3) & 0x1f) | 0x40;
+                x4->pic.extra_sei.payloads[0].payload[9] = 0;
+                x4->pic.extra_sei.payloads[0].payload[side_data->size+10] = 255;
+            }
+        }
     }
+skip_a53cc:
     do {
         if (x264_encoder_encode(x4->enc, &nal, &nnal, frame? &x4->pic: NULL, &pic_out) < 0)
-            return AVERROR_UNKNOWN;
+            return AVERROR_EXTERNAL;
 
         ret = encode_nals(ctx, pkt, nal, nnal);
         if (ret < 0)
@@ -266,31 +344,31 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
     pkt->pts = pic_out.i_pts;
     pkt->dts = pic_out.i_dts;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
+
     switch (pic_out.i_type) {
     case X264_TYPE_IDR:
     case X264_TYPE_I:
-        ctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+        pict_type = AV_PICTURE_TYPE_I;
         break;
     case X264_TYPE_P:
-        ctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+        pict_type = AV_PICTURE_TYPE_P;
         break;
     case X264_TYPE_B:
     case X264_TYPE_BREF:
-        ctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+        pict_type = AV_PICTURE_TYPE_B;
         break;
+    default:
+        pict_type = AV_PICTURE_TYPE_NONE;
     }
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    ctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
     pkt->flags |= AV_PKT_FLAG_KEY*pic_out.b_keyframe;
     if (ret) {
-        uint8_t *sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR,
-                                              sizeof(int));
-        if (!sd)
-            return AVERROR(ENOMEM);
-        *(int *)sd = (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA;
+        ff_side_data_set_encoder_stats(pkt, (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA, NULL, 0, pict_type);
 
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -318,6 +396,20 @@ static av_cold int X264_close(AVCodecContext *avctx)
     return 0;
 }
 
+#define OPT_STR(opt, param)                                                   \
+    do {                                                                      \
+        int ret;                                                              \
+        if (param && (ret = x264_param_parse(&x4->params, opt, param)) < 0) { \
+            if(ret == X264_PARAM_BAD_NAME)                                    \
+                av_log(avctx, AV_LOG_ERROR,                                   \
+                        "bad option '%s': '%s'\n", opt, param);               \
+            else                                                              \
+                av_log(avctx, AV_LOG_ERROR,                                   \
+                        "bad value for '%s': '%s'\n", opt, param);            \
+            return -1;                                                        \
+        }                                                                     \
+    } while (0)
+
 static int convert_pix_fmt(enum AVPixelFormat pix_fmt)
 {
     switch (pix_fmt) {
@@ -332,6 +424,15 @@ static int convert_pix_fmt(enum AVPixelFormat pix_fmt)
     case AV_PIX_FMT_YUVJ444P:
     case AV_PIX_FMT_YUV444P9:
     case AV_PIX_FMT_YUV444P10: return X264_CSP_I444;
+#ifdef X264_CSP_BGR
+    case AV_PIX_FMT_BGR0:
+        return X264_CSP_BGRA;
+    case AV_PIX_FMT_BGR24:
+        return X264_CSP_BGR;
+
+    case AV_PIX_FMT_RGB24:
+        return X264_CSP_RGB;
+#endif
     case AV_PIX_FMT_NV12:      return X264_CSP_NV12;
     case AV_PIX_FMT_NV16:
     case AV_PIX_FMT_NV20:      return X264_CSP_NV16;
@@ -351,21 +452,33 @@ static int convert_pix_fmt(enum AVPixelFormat pix_fmt)
 static av_cold int X264_init(AVCodecContext *avctx)
 {
     X264Context *x4 = avctx->priv_data;
+    int sw,sh;
+
+    if (avctx->global_quality > 0)
+        av_log(avctx, AV_LOG_WARNING, "-qscale is ignored, -crf is recommended.\n");
 
 #if CONFIG_LIBX262_ENCODER
     if (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
         x4->params.b_mpeg2 = 1;
         x264_param_default_mpeg2(&x4->params);
     } else
-#else
-    x264_param_default(&x4->params);
 #endif
+    x264_param_default(&x4->params);
 
     x4->params.b_deblocking_filter         = avctx->flags & AV_CODEC_FLAG_LOOP_FILTER;
 
     if (x4->preset || x4->tune)
         if (x264_param_default_preset(&x4->params, x4->preset, x4->tune) < 0) {
+            int i;
             av_log(avctx, AV_LOG_ERROR, "Error setting preset/tune %s/%s.\n", x4->preset, x4->tune);
+            av_log(avctx, AV_LOG_INFO, "Possible presets:");
+            for (i = 0; x264_preset_names[i]; i++)
+                av_log(avctx, AV_LOG_INFO, " %s", x264_preset_names[i]);
+            av_log(avctx, AV_LOG_INFO, "\n");
+            av_log(avctx, AV_LOG_INFO, "Possible tunes:");
+            for (i = 0; x264_tune_names[i]; i++)
+                av_log(avctx, AV_LOG_INFO, " %s", x264_tune_names[i]);
+            av_log(avctx, AV_LOG_INFO, "\n");
             return AVERROR(EINVAL);
         }
 
@@ -377,6 +490,8 @@ static av_cold int X264_init(AVCodecContext *avctx)
     x4->params.i_log_level          = X264_LOG_DEBUG;
     x4->params.i_csp                = convert_pix_fmt(avctx->pix_fmt);
 
+    OPT_STR("weightp", x4->wpredp);
+
     if (avctx->bit_rate) {
         x4->params.rc.i_bitrate   = avctx->bit_rate / 1000;
         x4->params.rc.i_rc_method = X264_RC_ABR;
@@ -405,10 +520,14 @@ static av_cold int X264_init(AVCodecContext *avctx)
             (float)avctx->rc_initial_buffer_occupancy / avctx->rc_buffer_size;
     }
 
+    OPT_STR("level", x4->level);
+
     if (avctx->i_quant_factor > 0)
         x4->params.rc.f_ip_factor         = 1 / fabs(avctx->i_quant_factor);
-    x4->params.rc.f_pb_factor             = avctx->b_quant_factor;
-    x4->params.analyse.i_chroma_qp_offset = avctx->chromaoffset;
+    if (avctx->b_quant_factor > 0)
+        x4->params.rc.f_pb_factor         = avctx->b_quant_factor;
+    if (avctx->chromaoffset)
+        x4->params.analyse.i_chroma_qp_offset = avctx->chromaoffset;
 
     if (avctx->gop_size >= 0)
         x4->params.i_keyint_max         = avctx->gop_size;
@@ -428,6 +547,28 @@ static av_cold int X264_init(AVCodecContext *avctx)
         x4->params.rc.f_qcompress       = avctx->qcompress; /* 0.0 => cbr, 1.0 => constant qp */
     if (avctx->refs >= 0)
         x4->params.i_frame_reference    = avctx->refs;
+    else if (x4->level) {
+        int i;
+        int mbn = FF_CEIL_RSHIFT(avctx->width, 4) * FF_CEIL_RSHIFT(avctx->height, 4);
+        int level_id = -1;
+        char *tail;
+        int scale = X264_BUILD < 129 ? 384 : 1;
+
+        if (!strcmp(x4->level, "1b")) {
+            level_id = 9;
+        } else if (strlen(x4->level) <= 3){
+            level_id = av_strtod(x4->level, &tail) * 10 + 0.5;
+            if (*tail)
+                level_id = -1;
+        }
+        if (level_id <= 0)
+            av_log(avctx, AV_LOG_WARNING, "Failed to parse level\n");
+
+        for (i = 0; i<x264_levels[i].level_idc; i++)
+            if (x264_levels[i].level_idc == level_id)
+                x4->params.i_frame_reference = av_clip(x264_levels[i].dpb / mbn / scale, 1, x4->params.i_frame_reference);
+    }
+
     if (avctx->trellis >= 0)
         x4->params.analyse.i_trellis    = avctx->trellis;
     if (avctx->me_range >= 0)
@@ -472,6 +613,13 @@ static av_cold int X264_init(AVCodecContext *avctx)
         x4->params.b_bluray_compat = x4->bluray_compat;
         x4->params.b_vfr_input = 0;
     }
+    if (x4->avcintra_class >= 0)
+#if X264_BUILD >= 142
+        x4->params.i_avcintra_class = x4->avcintra_class;
+#else
+        av_log(avctx, AV_LOG_ERROR,
+               "x264 too old for AVC Intra, at least version 142 needed\n");
+#endif
     if (x4->b_bias != INT_MIN)
         x4->params.i_bframe_bias              = x4->b_bias;
     if (x4->b_pyramid >= 0)
@@ -491,10 +639,45 @@ static av_cold int X264_init(AVCodecContext *avctx)
 
     if (x4->slice_max_size >= 0)
         x4->params.i_slice_max_size =  x4->slice_max_size;
+    else {
+        /*
+         * Allow x264 to be instructed through AVCodecContext about the maximum
+         * size of the RTP payload. For example, this enables the production of
+         * payload suitable for the H.264 RTP packetization-mode 0 i.e. single
+         * NAL unit per RTP packet.
+         */
+        if (avctx->rtp_payload_size)
+            x4->params.i_slice_max_size = avctx->rtp_payload_size;
+    }
 
     if (x4->fastfirstpass)
         x264_param_apply_fastfirstpass(&x4->params);
 
+    /* Allow specifying the x264 profile through AVCodecContext. */
+    if (!x4->profile)
+        switch (avctx->profile) {
+        case FF_PROFILE_H264_BASELINE:
+            x4->profile = av_strdup("baseline");
+            break;
+        case FF_PROFILE_H264_HIGH:
+            x4->profile = av_strdup("high");
+            break;
+        case FF_PROFILE_H264_HIGH_10:
+            x4->profile = av_strdup("high10");
+            break;
+        case FF_PROFILE_H264_HIGH_422:
+            x4->profile = av_strdup("high422");
+            break;
+        case FF_PROFILE_H264_HIGH_444:
+            x4->profile = av_strdup("high444");
+            break;
+        case FF_PROFILE_H264_MAIN:
+            x4->profile = av_strdup("main");
+            break;
+        default:
+            break;
+        }
+
     if (x4->nal_hrd >= 0)
         x4->params.i_nal_hrd = x4->nal_hrd;
 
@@ -519,16 +702,24 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     if (x4->profile)
         if (x264_param_apply_profile(&x4->params, x4->profile) < 0) {
+            int i;
             av_log(avctx, AV_LOG_ERROR, "Error setting profile %s.\n", x4->profile);
+            av_log(avctx, AV_LOG_INFO, "Possible profiles:");
+            for (i = 0; x264_profile_names[i]; i++)
+                av_log(avctx, AV_LOG_INFO, " %s", x264_profile_names[i]);
+            av_log(avctx, AV_LOG_INFO, "\n");
             return AVERROR(EINVAL);
         }
 
     x4->params.i_width          = avctx->width;
     x4->params.i_height         = avctx->height;
-    x4->params.vui.i_sar_width  = avctx->sample_aspect_ratio.num;
-    x4->params.vui.i_sar_height = avctx->sample_aspect_ratio.den;
-    x4->params.i_fps_num = x4->params.i_timebase_den = avctx->time_base.den;
-    x4->params.i_fps_den = x4->params.i_timebase_num = avctx->time_base.num;
+    av_reduce(&sw, &sh, avctx->sample_aspect_ratio.num, avctx->sample_aspect_ratio.den, 4096);
+    x4->params.vui.i_sar_width  = sw;
+    x4->params.vui.i_sar_height = sh;
+    x4->params.i_timebase_den = avctx->time_base.den;
+    x4->params.i_timebase_num = avctx->time_base.num;
+    x4->params.i_fps_num = avctx->time_base.den;
+    x4->params.i_fps_den = avctx->time_base.num * avctx->ticks_per_frame;
 
     x4->params.analyse.b_psnr = avctx->flags & AV_CODEC_FLAG_PSNR;
 
@@ -547,14 +738,29 @@ FF_ENABLE_DEPRECATION_WARNINGS
                                  avctx->pix_fmt == AV_PIX_FMT_YUVJ444P ||
                                  avctx->color_range == AVCOL_RANGE_JPEG;
 
-    // x264 validates the values internally
-    x4->params.vui.i_colorprim = avctx->color_primaries;
-    x4->params.vui.i_transfer  = avctx->color_trc;
-    x4->params.vui.i_colmatrix = avctx->colorspace;
+    if (avctx->colorspace != AVCOL_SPC_UNSPECIFIED)
+        x4->params.vui.i_colmatrix = avctx->colorspace;
+    if (avctx->color_primaries != AVCOL_PRI_UNSPECIFIED)
+        x4->params.vui.i_colorprim = avctx->color_primaries;
+    if (avctx->color_trc != AVCOL_TRC_UNSPECIFIED)
+        x4->params.vui.i_transfer  = avctx->color_trc;
 
     if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER)
         x4->params.b_repeat_headers = 0;
 
+    if(x4->x264opts){
+        const char *p= x4->x264opts;
+        while(p){
+            char param[256]={0}, val[256]={0};
+            if(sscanf(p, "%255[^:=]=%255[^:]", param, val) == 1){
+                OPT_STR(param, "1");
+            }else
+                OPT_STR(param, val);
+            p= strchr(p, ':');
+            p+=!!p;
+        }
+    }
+
     if (x4->x264_params) {
         AVDictionary *dict    = NULL;
         AVDictionaryEntry *en = NULL;
@@ -581,7 +787,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     x4->enc = x264_encoder_open(&x4->params);
     if (!x4->enc)
-        return AVERROR_UNKNOWN;
+        return AVERROR_EXTERNAL;
 
     if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
         x264_nal_t *nal;
@@ -639,6 +845,14 @@ static const enum AVPixelFormat pix_fmts_10bit[] = {
     AV_PIX_FMT_NV20,
     AV_PIX_FMT_NONE
 };
+static const enum AVPixelFormat pix_fmts_8bit_rgb[] = {
+#ifdef X264_CSP_BGR
+    AV_PIX_FMT_BGR0,
+    AV_PIX_FMT_BGR24,
+    AV_PIX_FMT_RGB24,
+#endif
+    AV_PIX_FMT_NONE
+};
 
 static av_cold void X264_init_static(AVCodec *codec)
 {
@@ -656,14 +870,22 @@ static const AVOption options[] = {
     { "preset",        "Set the encoding preset (cf. x264 --fullhelp)",   OFFSET(preset),        AV_OPT_TYPE_STRING, { .str = "medium" }, 0, 0, VE},
     { "tune",          "Tune the encoding params (cf. x264 --fullhelp)",  OFFSET(tune),          AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE},
     { "profile",       "Set profile restrictions (cf. x264 --fullhelp) ", OFFSET(profile),       AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE},
-    { "fastfirstpass", "Use fast settings when encoding first pass",      OFFSET(fastfirstpass), AV_OPT_TYPE_INT,    { .i64 = 1 }, 0, 1, VE},
+    { "fastfirstpass", "Use fast settings when encoding first pass",      OFFSET(fastfirstpass), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, VE},
+    {"level", "Specify level (as defined by Annex A)", OFFSET(level), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, VE},
+    {"passlogfile", "Filename for 2 pass stats", OFFSET(stats), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, VE},
+    {"wpredp", "Weighted prediction for P-frames", OFFSET(wpredp), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, VE},
+    {"a53cc",          "Use A53 Closed Captions (if available)",          OFFSET(a53_cc),        AV_OPT_TYPE_BOOL,   {.i64 = 0}, 0, 1, VE},
+    {"x264opts", "x264 options", OFFSET(x264opts), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, VE},
     { "crf",           "Select the quality for constant quality mode",    OFFSET(crf),           AV_OPT_TYPE_FLOAT,  {.dbl = -1 }, -1, FLT_MAX, VE },
     { "crf_max",       "In CRF mode, prevents VBV from lowering quality beyond this point.",OFFSET(crf_max), AV_OPT_TYPE_FLOAT, {.dbl = -1 }, -1, FLT_MAX, VE },
     { "qp",            "Constant quantization parameter rate control method",OFFSET(cqp),        AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, INT_MAX, VE },
     { "aq-mode",       "AQ method",                                       OFFSET(aq_mode),       AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, INT_MAX, VE, "aq_mode"},
     { "none",          NULL,                              0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_NONE},         INT_MIN, INT_MAX, VE, "aq_mode" },
     { "variance",      "Variance AQ (complexity mask)",   0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_VARIANCE},     INT_MIN, INT_MAX, VE, "aq_mode" },
-    { "autovariance",  "Auto-variance AQ (experimental)", 0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_AUTOVARIANCE}, INT_MIN, INT_MAX, VE, "aq_mode" },
+    { "autovariance",  "Auto-variance AQ",                0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_AUTOVARIANCE}, INT_MIN, INT_MAX, VE, "aq_mode" },
+#if X264_BUILD >= 144
+    { "autovariance-biased", "Auto-variance AQ with bias to dark scenes", 0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_AUTOVARIANCE_BIASED}, INT_MIN, INT_MAX, VE, "aq_mode" },
+#endif
     { "aq-strength",   "AQ strength. Reduces blocking and blurring in flat and textured areas.", OFFSET(aq_strength), AV_OPT_TYPE_FLOAT, {.dbl = -1}, -1, FLT_MAX, VE},
     { "psy",           "Use psychovisual optimizations.",                 OFFSET(psy),           AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE },
     { "psy-rd",        "Strength of psychovisual optimization, in <psy-rd>:<psy-trellis> format.", OFFSET(psy_rd), AV_OPT_TYPE_STRING,  {0 }, 0, 0, VE},
@@ -702,13 +924,14 @@ static const AVOption options[] = {
     { "none",          NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_NAL_HRD_NONE}, INT_MIN, INT_MAX, VE, "nal-hrd" },
     { "vbr",           NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_NAL_HRD_VBR},  INT_MIN, INT_MAX, VE, "nal-hrd" },
     { "cbr",           NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_NAL_HRD_CBR},  INT_MIN, INT_MAX, VE, "nal-hrd" },
+    { "avcintra-class","AVC-Intra class 50/100/200",                      OFFSET(avcintra_class),AV_OPT_TYPE_INT,     { .i64 = -1 }, -1, 200   , VE},
     { "motion-est",   "Set motion estimation method",                     OFFSET(motion_est),    AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, X264_ME_TESA, VE, "motion-est"},
     { "dia",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_DIA },  INT_MIN, INT_MAX, VE, "motion-est" },
     { "hex",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_HEX },  INT_MIN, INT_MAX, VE, "motion-est" },
     { "umh",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_UMH },  INT_MIN, INT_MAX, VE, "motion-est" },
     { "esa",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_ESA },  INT_MIN, INT_MAX, VE, "motion-est" },
     { "tesa",          NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_TESA }, INT_MIN, INT_MAX, VE, "motion-est" },
-    { "forced-idr",   "If forwarding iframes, require them to be IDR frames.", OFFSET(forced_idr),  AV_OPT_TYPE_INT,    { .i64 = 0 }, 0, 1, VE },
+    { "forced-idr",   "If forcing keyframes, force them as IDR frames.",                                  OFFSET(forced_idr),  AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE },
     { "x264-params",  "Override the x264 configuration using a :-separated list of key=value parameters", OFFSET(x264_params), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
     { NULL },
 };
@@ -716,13 +939,16 @@ static const AVOption options[] = {
 static const AVCodecDefault x264_defaults[] = {
     { "b",                "0" },
     { "bf",               "-1" },
+    { "flags2",           "0" },
     { "g",                "-1" },
     { "i_qfactor",        "-1" },
+    { "b_qfactor",        "-1" },
     { "qmin",             "-1" },
     { "qmax",             "-1" },
     { "qdiff",            "-1" },
     { "qblur",            "-1" },
     { "qcomp",            "-1" },
+//     { "rc_lookahead",     "-1" },
     { "refs",             "-1" },
     { "sc_threshold",     "-1" },
     { "trellis",          "-1" },
@@ -744,13 +970,20 @@ static const AVCodecDefault x264_defaults[] = {
 };
 
 #if CONFIG_LIBX264_ENCODER
-static const AVClass class = {
+static const AVClass x264_class = {
     .class_name = "libx264",
     .item_name  = av_default_item_name,
     .option     = options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
+static const AVClass rgbclass = {
+    .class_name = "libx264rgb",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_libx264_encoder = {
     .name             = "libx264",
     .long_name        = NULL_IF_CONFIG_SMALL("libx264 H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
@@ -761,12 +994,27 @@ AVCodec ff_libx264_encoder = {
     .encode2          = X264_frame,
     .close            = X264_close,
     .capabilities     = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
-    .priv_class       = &class,
+    .priv_class       = &x264_class,
     .defaults         = x264_defaults,
     .init_static_data = X264_init_static,
     .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE |
                         FF_CODEC_CAP_INIT_CLEANUP,
 };
+
+AVCodec ff_libx264rgb_encoder = {
+    .name           = "libx264rgb",
+    .long_name      = NULL_IF_CONFIG_SMALL("libx264 H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 RGB"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .priv_data_size = sizeof(X264Context),
+    .init           = X264_init,
+    .encode2        = X264_frame,
+    .close          = X264_close,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
+    .priv_class     = &rgbclass,
+    .defaults       = x264_defaults,
+    .pix_fmts       = pix_fmts_8bit_rgb,
+};
 #endif
 
 #if CONFIG_LIBX262_ENCODER
@@ -786,7 +1034,7 @@ AVCodec ff_libx262_encoder = {
     .init             = X264_init,
     .encode2          = X264_frame,
     .close            = X264_close,
-    .capabilities     = CODEC_CAP_DELAY | CODEC_CAP_AUTO_THREADS,
+    .capabilities     = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
     .priv_class       = &X262_class,
     .defaults         = x264_defaults,
     .pix_fmts         = pix_fmts_8bit,
diff --git a/libavcodec/libx265.c b/libavcodec/libx265.c
index f5d3d0f3c0..a1770facb1 100644
--- a/libavcodec/libx265.c
+++ b/libavcodec/libx265.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013-2014 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -272,7 +272,7 @@ static int libx265_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     ret = ctx->api->encoder_encode(ctx->encoder, &nal, &nnal,
                                    pic ? &x265pic : NULL, &x265pic_out);
     if (ret < 0)
-        return AVERROR_UNKNOWN;
+        return AVERROR_EXTERNAL;
 
     if (!nnal)
         return 0;
@@ -338,10 +338,10 @@ static const enum AVPixelFormat x265_csp_twelve[] = {
 
 static av_cold void libx265_encode_init_csp(AVCodec *codec)
 {
-    if (x265_max_bit_depth == 8)
-        codec->pix_fmts = x265_csp_eight;
-    else if (x265_max_bit_depth == 12)
+    if (x265_api_get(10))
         codec->pix_fmts = x265_csp_twelve;
+    else if (x265_api_get(8))
+        codec->pix_fmts = x265_csp_eight;
 }
 
 #define OFFSET(x) offsetof(libx265Context, x)
diff --git a/libavcodec/libxavs.c b/libavcodec/libxavs.c
index 43b0915c52..45b4dcb76f 100644
--- a/libavcodec/libxavs.c
+++ b/libavcodec/libxavs.c
@@ -2,20 +2,20 @@
  * AVS encoding using the xavs library
  * Copyright (C) 2010 Amanda, Y.N. Wu <amanda11192003@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -89,10 +89,8 @@ static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
     for (i = 0; i < nnal; i++)
         size += nals[i].i_payload;
 
-    if ((ret = ff_alloc_packet(pkt, size)) < 0) {
-        av_log(ctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", size);
+    if ((ret = ff_alloc_packet2(ctx, pkt, size, 0)) < 0)
         return ret;
-    }
     p = pkt->data;
 
     /* Write the SEI as part of the first frame. */
@@ -120,7 +118,7 @@ static int XAVS_frame(AVCodecContext *avctx, AVPacket *pkt,
     xavs_nal_t *nal;
     int nnal, i, ret;
     xavs_picture_t pic_out;
-    uint8_t *sd;
+    int pict_type;
 
     x4->pic.img.i_csp   = XAVS_CSP_I420;
     x4->pic.img.i_plane = 3;
@@ -147,7 +145,7 @@ static int XAVS_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     if (!ret) {
         if (!frame && !(x4->end_of_stream)) {
-            if ((ret = ff_alloc_packet(pkt, 4)) < 0)
+            if ((ret = ff_alloc_packet2(avctx, pkt, 4, 0)) < 0)
                 return ret;
 
             pkt->data[0] = 0x0;
@@ -155,7 +153,7 @@ static int XAVS_frame(AVCodecContext *avctx, AVPacket *pkt,
             pkt->data[2] = 0x01;
             pkt->data[3] = 0xb1;
             pkt->dts = 2*x4->pts_buffer[(x4->out_frame_count-1)%(avctx->max_b_frames+1)] -
-                       x4->pts_buffer[(x4->out_frame_count-2)%(avctx->max_b_frames+1)];
+                         x4->pts_buffer[(x4->out_frame_count-2)%(avctx->max_b_frames+1)];
             x4->end_of_stream = END_OF_STREAM;
             *got_packet = 1;
         }
@@ -176,21 +174,24 @@ FF_ENABLE_DEPRECATION_WARNINGS
     } else
         pkt->dts = pkt->pts;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
     switch (pic_out.i_type) {
     case XAVS_TYPE_IDR:
     case XAVS_TYPE_I:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+        pict_type = AV_PICTURE_TYPE_I;
         break;
     case XAVS_TYPE_P:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+        pict_type = AV_PICTURE_TYPE_P;
         break;
     case XAVS_TYPE_B:
     case XAVS_TYPE_BREF:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+        pict_type = AV_PICTURE_TYPE_B;
         break;
+    default:
+        pict_type = AV_PICTURE_TYPE_NONE;
     }
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
@@ -211,10 +212,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR, sizeof(int));
-    if (!sd)
-        return AVERROR(ENOMEM);
-    *(int *)sd = (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA;
+    ff_side_data_set_encoder_stats(pkt, (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA, NULL, 0, pict_type);
 
     x4->out_frame_count++;
     *got_packet = ret;
@@ -226,7 +224,7 @@ static av_cold int XAVS_close(AVCodecContext *avctx)
     XavsContext *x4 = avctx->priv_data;
 
     av_freep(&avctx->extradata);
-    av_free(x4->sei);
+    av_freep(&x4->sei);
     av_freep(&x4->pts_buffer);
 
     if (x4->enc)
@@ -385,12 +383,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (!x4->enc)
         return -1;
 
-    if (!(x4->pts_buffer = av_mallocz((avctx->max_b_frames+1) * sizeof(*x4->pts_buffer))))
+    if (!(x4->pts_buffer = av_mallocz_array((avctx->max_b_frames+1), sizeof(*x4->pts_buffer))))
         return AVERROR(ENOMEM);
 
     /* TAG: Do we have GLOBAL HEADER in AVS */
     /* We Have PPS and SPS in AVS */
-    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
+    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER && 0) {
         xavs_nal_t *nal;
         int nnal, s, i, size;
         uint8_t *p;
@@ -420,10 +418,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #define OFFSET(x) offsetof(XavsContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "crf",           "Select the quality for constant quality mode",    OFFSET(crf),           AV_OPT_TYPE_FLOAT,  {-1 }, -1, FLT_MAX, VE },
+    { "crf",           "Select the quality for constant quality mode",    OFFSET(crf),           AV_OPT_TYPE_FLOAT,  {.dbl = -1 }, -1, FLT_MAX, VE },
     { "qp",            "Constant quantization parameter rate control method",OFFSET(cqp),        AV_OPT_TYPE_INT,    {.i64 = -1 }, -1, INT_MAX, VE },
     { "b-bias",        "Influences how often B-frames are used",          OFFSET(b_bias),        AV_OPT_TYPE_INT,    {.i64 = INT_MIN}, INT_MIN, INT_MAX, VE },
-    { "cplxblur",      "Reduce fluctuations in QP (before curve compression)", OFFSET(cplxblur), AV_OPT_TYPE_FLOAT,  {-1 }, -1, FLT_MAX, VE},
+    { "cplxblur",      "Reduce fluctuations in QP (before curve compression)", OFFSET(cplxblur), AV_OPT_TYPE_FLOAT,  {.dbl = -1 }, -1, FLT_MAX, VE},
     { "direct-pred",   "Direct MV prediction mode",                       OFFSET(direct_pred),   AV_OPT_TYPE_INT,    {.i64 = -1 }, -1, INT_MAX, VE, "direct-pred" },
     { "none",          NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_DIRECT_PRED_NONE },     0, 0, VE, "direct-pred" },
     { "spatial",       NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_DIRECT_PRED_SPATIAL },  0, 0, VE, "direct-pred" },
@@ -443,7 +441,7 @@ static const AVOption options[] = {
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass xavs_class = {
     .class_name = "libxavs",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -466,6 +464,6 @@ AVCodec ff_libxavs_encoder = {
     .close          = XAVS_close,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
     .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &xavs_class,
     .defaults       = xavs_defaults,
 };
diff --git a/libavcodec/libxvid.c b/libavcodec/libxvid.c
index 4b24ca6292..cbb72c2227 100644
--- a/libavcodec/libxvid.c
+++ b/libavcodec/libxvid.c
@@ -2,20 +2,20 @@
  * Interface to xvidcore for mpeg4 encoding
  * Copyright (c) 2004 Adam Thayer <krevnik@comcast.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,10 +27,11 @@
 
 #include <stdio.h>
 #include <string.h>
-#include <unistd.h>
 #include <xvid.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/cpu.h"
+#include "libavutil/file.h"
 #include "libavutil/internal.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mathematics.h"
@@ -41,6 +42,14 @@
 #include "libxvid.h"
 #include "mpegutils.h"
 
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#if HAVE_IO_H
+#include <io.h>
+#endif
+
 /**
  * Buffer management macros.
  */
@@ -53,7 +62,7 @@
  * This stores all the private context for the codec.
  */
 struct xvid_context {
-    AVClass *class;                /**< Handle for Xvid encoder */
+    AVClass *class;
     void *encoder_handle;          /**< Handle for Xvid encoder */
     int xsize;                     /**< Frame x size */
     int ysize;                     /**< Frame y size */
@@ -65,6 +74,7 @@ struct xvid_context {
     char *twopassbuffer;           /**< Character buffer for two-pass */
     char *old_twopassbuffer;       /**< Old character buffer (two-pass) */
     char *twopassfile;             /**< second pass temp file name */
+    int twopassfd;
     unsigned char *intra_matrix;   /**< P-Frame Quant Matrix */
     unsigned char *inter_matrix;   /**< I-Frame Quant Matrix */
     int lumi_aq;                   /**< Lumi masking as an aq method */
@@ -83,6 +93,8 @@ struct xvid_ff_pass1 {
     struct xvid_context *context;   /**< Pointer to private context */
 };
 
+static int xvid_encode_close(AVCodecContext *avctx);
+
 /*
  * Xvid 2-Pass Kludge Section
  *
@@ -113,7 +125,7 @@ static int xvid_ff_2pass_create(xvid_plg_create_t *param, void **handle)
     /* This is because we can safely prevent a buffer overflow */
     log[0] = 0;
     snprintf(log, BUFFER_REMAINING(log),
-             "# avconv 2-pass log file, using xvid codec\n");
+             "# ffmpeg 2-pass log file, using xvid codec\n");
     snprintf(BUFFER_CAT(log), BUFFER_REMAINING(log),
              "# Do not modify. libxvidcore version: %d.%d.%d\n\n",
              XVID_VERSION_MAJOR(XVID_VERSION),
@@ -355,32 +367,34 @@ static void xvid_correct_framerate(AVCodecContext *avctx)
 
 static av_cold int xvid_encode_init(AVCodecContext *avctx)
 {
-    int xerr, i;
+    int xerr, i, ret = -1;
     int xvid_flags = avctx->flags;
     struct xvid_context *x = avctx->priv_data;
     uint16_t *intra, *inter;
     int fd;
 
-    xvid_plugin_single_t single         = { 0 };
-    struct xvid_ff_pass1 rc2pass1       = { 0 };
-    xvid_plugin_2pass2_t rc2pass2       = { 0 };
-    xvid_plugin_lumimasking_t masking_l = { 0 }; /* For lumi masking */
-    xvid_plugin_lumimasking_t masking_v = { 0 }; /* For variance AQ */
-    xvid_plugin_ssim_t ssim             = { 0 };
-    xvid_gbl_init_t xvid_gbl_init       = { 0 };
-    xvid_enc_create_t xvid_enc_create   = { 0 };
-    xvid_enc_plugin_t plugins[7];
-
-    /* Bring in VOP flags from avconv command-line */
-    x->vop_flags = XVID_VOP_HALFPEL; /* Bare minimum quality */
+    xvid_plugin_single_t      single          = { 0 };
+    struct xvid_ff_pass1      rc2pass1        = { 0 };
+    xvid_plugin_2pass2_t      rc2pass2        = { 0 };
+    xvid_plugin_lumimasking_t masking_l       = { 0 }; /* For lumi masking */
+    xvid_plugin_lumimasking_t masking_v       = { 0 }; /* For variance AQ */
+    xvid_plugin_ssim_t        ssim            = { 0 };
+    xvid_gbl_init_t           xvid_gbl_init   = { 0 };
+    xvid_enc_create_t         xvid_enc_create = { 0 };
+    xvid_enc_plugin_t         plugins[4];
+
+    x->twopassfd = -1;
+
+    /* Bring in VOP flags from ffmpeg command-line */
+    x->vop_flags = XVID_VOP_HALFPEL;              /* Bare minimum quality */
     if (xvid_flags & AV_CODEC_FLAG_4MV)
-        x->vop_flags |= XVID_VOP_INTER4V; /* Level 3 */
+        x->vop_flags    |= XVID_VOP_INTER4V;      /* Level 3 */
     if (avctx->trellis)
-        x->vop_flags |= XVID_VOP_TRELLISQUANT; /* Level 5 */
+        x->vop_flags    |= XVID_VOP_TRELLISQUANT; /* Level 5 */
     if (xvid_flags & AV_CODEC_FLAG_AC_PRED)
-        x->vop_flags |= XVID_VOP_HQACPRED; /* Level 6 */
+        x->vop_flags    |= XVID_VOP_HQACPRED;     /* Level 6 */
     if (xvid_flags & AV_CODEC_FLAG_GRAY)
-        x->vop_flags |= XVID_VOP_GREYSCALE;
+        x->vop_flags    |= XVID_VOP_GREYSCALE;
 
     /* Decide which ME quality setting to use */
     x->me_flags = 0;
@@ -442,7 +456,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         break;
     }
 
-    /* Bring in VOL flags from avconv command-line */
+    /* Bring in VOL flags from ffmpeg command-line */
 #if FF_API_GMC
     if (avctx->flags & CODEC_FLAG_GMC)
         x->gmc = 1;
@@ -484,6 +498,18 @@ FF_ENABLE_DEPRECATION_WARNINGS
     xvid_enc_create.num_zones = 0;
 
     xvid_enc_create.num_threads = avctx->thread_count;
+#if (XVID_VERSION <= 0x010303) && (XVID_VERSION >= 0x010300)
+    /* workaround for a bug in libxvidcore */
+    if (avctx->height <= 16) {
+        if (avctx->thread_count < 2) {
+            xvid_enc_create.num_threads = 0;
+        } else {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Too small height for threads > 1.");
+            return AVERROR(EINVAL);
+        }
+    }
+#endif
 
     xvid_enc_create.plugins     = plugins;
     xvid_enc_create.num_plugins = 0;
@@ -513,26 +539,29 @@ FF_ENABLE_DEPRECATION_WARNINGS
         rc2pass2.version = XVID_VERSION;
         rc2pass2.bitrate = avctx->bit_rate;
 
-        fd = ff_tempfile("xvidff.", &x->twopassfile);
+        fd = av_tempfile("xvidff.", &x->twopassfile, 0, avctx);
         if (fd < 0) {
             av_log(avctx, AV_LOG_ERROR, "Xvid: Cannot write 2-pass pipe\n");
             return fd;
         }
+        x->twopassfd = fd;
 
         if (!avctx->stats_in) {
             av_log(avctx, AV_LOG_ERROR,
                    "Xvid: No 2-pass information loaded for second pass\n");
-            return AVERROR_INVALIDDATA;
+            return AVERROR(EINVAL);
         }
 
-        if (strlen(avctx->stats_in) >
-            write(fd, avctx->stats_in, strlen(avctx->stats_in))) {
-            close(fd);
+        ret = write(fd, avctx->stats_in, strlen(avctx->stats_in));
+        if (ret == -1)
+            ret = AVERROR(errno);
+        else if (strlen(avctx->stats_in) > ret) {
             av_log(avctx, AV_LOG_ERROR, "Xvid: Cannot write to 2-pass pipe\n");
-            return AVERROR(EIO);
+            ret = AVERROR(EIO);
         }
+        if (ret < 0)
+            return ret;
 
-        close(fd);
         rc2pass2.filename                          = x->twopassfile;
         plugins[xvid_enc_create.num_plugins].func  = xvid_plugin_2pass2;
         plugins[xvid_enc_create.num_plugins].param = &rc2pass2;
@@ -550,12 +579,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (avctx->lumi_masking != 0.0)
         x->lumi_aq = 1;
 
-    if (x->lumi_aq && x->variance_aq) {
-        x->variance_aq = 0;
-        av_log(avctx, AV_LOG_WARNING,
-               "variance_aq is ignored when lumi_aq is set.\n");
-    }
-
     /* Luminance Masking */
     if (x->lumi_aq) {
         masking_l.method                          = 0;
@@ -576,6 +599,11 @@ FF_ENABLE_DEPRECATION_WARNINGS
         xvid_enc_create.num_plugins++;
     }
 
+    if (x->lumi_aq && x->variance_aq )
+        av_log(avctx, AV_LOG_INFO,
+               "Both lumi_aq and variance_aq are enabled. The resulting quality"
+               "will be the worse one of the two effects made by the AQ.\n");
+
     /* SSIM */
     if (x->ssim) {
         plugins[xvid_enc_create.num_plugins].func  = xvid_plugin_ssim;
@@ -667,11 +695,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (avctx->max_b_frames > 0 && !x->quicktime_format)
         xvid_enc_create.global |= XVID_GLOBAL_PACKED;
 
+    av_assert0(xvid_enc_create.num_plugins + (!!x->ssim) + (!!x->variance_aq) + (!!x->lumi_aq) <= FF_ARRAY_ELEMS(plugins));
+
     /* Create encoder context */
     xerr = xvid_encore(NULL, XVID_ENC_CREATE, &xvid_enc_create, NULL);
     if (xerr) {
         av_log(avctx, AV_LOG_ERROR, "Xvid: Could not create encoder reference\n");
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     x->encoder_handle  = xvid_enc_create.handle;
@@ -691,11 +721,8 @@ static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     xvid_enc_frame_t xvid_enc_frame = { 0 };
     xvid_enc_stats_t xvid_enc_stats = { 0 };
 
-    if (!user_packet &&
-        (ret = av_new_packet(pkt, mb_width * mb_height * MAX_MB_BYTES + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, mb_width*(int64_t)mb_height*MAX_MB_BYTES + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     /* Start setting up the frame */
     xvid_enc_frame.version = XVID_VERSION;
@@ -709,7 +736,7 @@ static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     if (avctx->pix_fmt != AV_PIX_FMT_YUV420P) {
         av_log(avctx, AV_LOG_ERROR,
                "Xvid: Color spaces other than 420P not supported\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     xvid_enc_frame.input.csp = XVID_CSP_PLANAR; /* YUV420P */
@@ -730,11 +757,13 @@ static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                                                   XVID_TYPE_AUTO;
 
     /* Pixel aspect ratio setting */
-    if (avctx->sample_aspect_ratio.num < 1 || avctx->sample_aspect_ratio.num > 255 ||
-        avctx->sample_aspect_ratio.den < 1 || avctx->sample_aspect_ratio.den > 255) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid pixel aspect ratio %i/%i\n",
+    if (avctx->sample_aspect_ratio.num < 0 || avctx->sample_aspect_ratio.num > 255 ||
+        avctx->sample_aspect_ratio.den < 0 || avctx->sample_aspect_ratio.den > 255) {
+        av_log(avctx, AV_LOG_WARNING,
+               "Invalid pixel aspect ratio %i/%i, limit is 255/255 reducing\n",
                avctx->sample_aspect_ratio.num, avctx->sample_aspect_ratio.den);
-        return -1;
+        av_reduce(&avctx->sample_aspect_ratio.num, &avctx->sample_aspect_ratio.den,
+                   avctx->sample_aspect_ratio.num,  avctx->sample_aspect_ratio.den, 255);
     }
     xvid_enc_frame.par        = XVID_PAR_EXT;
     xvid_enc_frame.par_width  = avctx->sample_aspect_ratio.num;
@@ -767,27 +796,28 @@ static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
 
     if (xerr > 0) {
-        uint8_t *sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR,
-                                              sizeof(int));
-        if (!sd)
-            return AVERROR(ENOMEM);
-        *(int *)sd = xvid_enc_stats.quant * FF_QP2LAMBDA;
+        int pict_type;
 
         *got_packet = 1;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-        avctx->coded_frame->quality = xvid_enc_stats.quant * FF_QP2LAMBDA;
         if (xvid_enc_stats.type == XVID_TYPE_PVOP)
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+            pict_type = AV_PICTURE_TYPE_P;
         else if (xvid_enc_stats.type == XVID_TYPE_BVOP)
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+            pict_type = AV_PICTURE_TYPE_B;
         else if (xvid_enc_stats.type == XVID_TYPE_SVOP)
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_S;
+            pict_type = AV_PICTURE_TYPE_S;
         else
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+            pict_type = AV_PICTURE_TYPE_I;
+
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+        avctx->coded_frame->pict_type = pict_type;
+        avctx->coded_frame->quality = xvid_enc_stats.quant * FF_QP2LAMBDA;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
+
+        ff_side_data_set_encoder_stats(pkt, xvid_enc_stats.quant * FF_QP2LAMBDA, NULL, 0, pict_type);
+
         if (xvid_enc_frame.out_flags & XVID_KEYFRAME) {
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -816,7 +846,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             return 0;
         av_log(avctx, AV_LOG_ERROR,
                "Xvid: Encoding Error Occurred: %i\n", xerr);
-        return xerr;
+        return AVERROR_EXTERNAL;
     }
 }
 
@@ -831,12 +861,18 @@ static av_cold int xvid_encode_close(AVCodecContext *avctx)
 
     av_freep(&avctx->extradata);
     if (x->twopassbuffer) {
-        av_free(x->twopassbuffer);
-        av_free(x->old_twopassbuffer);
+        av_freep(&x->twopassbuffer);
+        av_freep(&x->old_twopassbuffer);
+        avctx->stats_out = NULL;
+    }
+    if (x->twopassfd>=0) {
+        unlink(x->twopassfile);
+        close(x->twopassfd);
+        x->twopassfd = -1;
     }
-    av_free(x->twopassfile);
-    av_free(x->intra_matrix);
-    av_free(x->inter_matrix);
+    av_freep(&x->twopassfile);
+    av_freep(&x->intra_matrix);
+    av_freep(&x->inter_matrix);
 
     return 0;
 }
diff --git a/libavcodec/libxvid.h b/libavcodec/libxvid.h
index 15f908f6f5..bffe07d808 100644
--- a/libavcodec/libxvid.h
+++ b/libavcodec/libxvid.h
@@ -1,20 +1,20 @@
 /*
  * copyright (C) 2006 Corey Hickey
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libxvid_rc.c b/libavcodec/libxvid_rc.c
index 26f3c495c1..0fd030c706 100644
--- a/libavcodec/libxvid_rc.c
+++ b/libavcodec/libxvid_rc.c
@@ -3,75 +3,42 @@
  *
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 
-#if !HAVE_MKSTEMP
-#include <fcntl.h>
+#if HAVE_IO_H
+#include <io.h>
 #endif
+
+#if HAVE_UNISTD_H
 #include <unistd.h>
+#endif
+
 #include <xvid.h>
 
 #include "libavutil/attributes.h"
-#include "libavutil/internal.h"
+#include "libavutil/file.h"
 
 #include "avcodec.h"
 #include "libxvid.h"
 #include "mpegvideo.h"
 
-/* Wrapper to work around the lack of mkstemp() on mingw.
- * Also, tries to create file in /tmp first, if possible.
- * *prefix can be a character constant; *filename will be allocated internally.
- * @return file descriptor of opened file (or -1 on error)
- * and opened file name in **filename. */
-int ff_tempfile(const char *prefix, char **filename)
-{
-    int fd = -1;
-#if !HAVE_MKSTEMP
-    *filename = tempnam(".", prefix);
-#else
-    size_t len = strlen(prefix) + 12; /* room for "/tmp/" and "XXXXXX\0" */
-    *filename  = av_malloc(len);
-#endif
-    /* -----common section-----*/
-    if (!(*filename)) {
-        av_log(NULL, AV_LOG_ERROR, "ff_tempfile: Cannot allocate file name\n");
-        return AVERROR(ENOMEM);
-    }
-#if !HAVE_MKSTEMP
-    fd = avpriv_open(*filename, O_RDWR | O_BINARY | O_CREAT, 0444);
-#else
-    snprintf(*filename, len, "/tmp/%sXXXXXX", prefix);
-    fd = mkstemp(*filename);
-    if (fd < 0) {
-        snprintf(*filename, len, "./%sXXXXXX", prefix);
-        fd = mkstemp(*filename);
-    }
-#endif
-    /* -----common section-----*/
-    if (fd < 0) {
-        av_log(NULL, AV_LOG_ERROR, "ff_tempfile: Cannot open temporary file %s\n", *filename);
-        return AVERROR(EIO);
-    }
-    return fd; /* success */
-}
-
 av_cold int ff_xvid_rate_control_init(MpegEncContext *s)
 {
     char *tmp_name;
@@ -79,7 +46,7 @@ av_cold int ff_xvid_rate_control_init(MpegEncContext *s)
     xvid_plg_create_t xvid_plg_create = { 0 };
     xvid_plugin_2pass2_t xvid_2pass2  = { 0 };
 
-    fd = ff_tempfile("xvidrc.", &tmp_name);
+    fd = av_tempfile("xvidrc.", &tmp_name, 0, s->avctx);
     if (fd < 0) {
         av_log(NULL, AV_LOG_ERROR, "Can't create temporary pass2 file.\n");
         return fd;
@@ -100,7 +67,13 @@ av_cold int ff_xvid_rate_control_init(MpegEncContext *s)
                  (rce->i_tex_bits + rce->p_tex_bits + rce->misc_bits + 7) / 8,
                  (rce->header_bits + rce->mv_bits + 7) / 8);
 
-        write(fd, tmp, strlen(tmp));
+        if (write(fd, tmp, strlen(tmp)) < 0) {
+            int ret = AVERROR(errno);
+            av_log(NULL, AV_LOG_ERROR, "Error %s writing 2pass logfile\n", av_err2str(ret));
+            av_free(tmp_name);
+            close(fd);
+            return ret;
+        }
     }
 
     close(fd);
diff --git a/libavcodec/libzvbi-teletextdec.c b/libavcodec/libzvbi-teletextdec.c
new file mode 100644
index 0000000000..3733d885d6
--- /dev/null
+++ b/libavcodec/libzvbi-teletextdec.c
@@ -0,0 +1,572 @@
+/*
+ * Teletext decoding for ffmpeg
+ * Copyright (c) 2005-2010, 2012 Wolfram Gloger
+ * Copyright (c) 2013 Marton Balint
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "libavcodec/ass.h"
+#include "libavutil/opt.h"
+#include "libavutil/bprint.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/log.h"
+
+#include <libzvbi.h>
+
+#define TEXT_MAXSZ    (25 * (56 + 1) * 4 + 2)
+#define VBI_NB_COLORS 40
+#define RGBA(r,g,b,a) (((a) << 24) | ((r) << 16) | ((g) << 8) | (b))
+#define VBI_R(rgba)   (((rgba) >> 0) & 0xFF)
+#define VBI_G(rgba)   (((rgba) >> 8) & 0xFF)
+#define VBI_B(rgba)   (((rgba) >> 16) & 0xFF)
+#define VBI_A(rgba)   (((rgba) >> 24) & 0xFF)
+#define MAX_BUFFERED_PAGES 25
+#define BITMAP_CHAR_WIDTH  12
+#define BITMAP_CHAR_HEIGHT 10
+#define MAX_SLICES 64
+
+typedef struct TeletextPage
+{
+    AVSubtitleRect *sub_rect;
+    int pgno;
+    int subno;
+    int64_t pts;
+} TeletextPage;
+
+typedef struct TeletextContext
+{
+    AVClass        *class;
+    char           *pgno;
+    int             x_offset;
+    int             y_offset;
+    int             format_id; /* 0 = bitmap, 1 = text/ass */
+    int             chop_top;
+    int             sub_duration; /* in msec */
+    int             transparent_bg;
+    int             chop_spaces;
+
+    int             lines_processed;
+    TeletextPage    *pages;
+    int             nb_pages;
+    int64_t         pts;
+    int             handler_ret;
+
+    vbi_decoder *   vbi;
+#ifdef DEBUG
+    vbi_export *    ex;
+#endif
+    vbi_sliced      sliced[MAX_SLICES];
+} TeletextContext;
+
+static int chop_spaces_utf8(const unsigned char* t, int len)
+{
+    t += len;
+    while (len > 0) {
+        if (*--t != ' ' || (len-1 > 0 && *(t-1) & 0x80))
+            break;
+        --len;
+    }
+    return len;
+}
+
+static void subtitle_rect_free(AVSubtitleRect **sub_rect)
+{
+    av_freep(&(*sub_rect)->pict.data[0]);
+    av_freep(&(*sub_rect)->pict.data[1]);
+    av_freep(&(*sub_rect)->ass);
+    av_freep(sub_rect);
+}
+
+static int create_ass_text(TeletextContext *ctx, const char *text, char **ass)
+{
+    int ret;
+    AVBPrint buf, buf2;
+    const int ts_start    = av_rescale_q(ctx->pts,          AV_TIME_BASE_Q,        (AVRational){1, 100});
+    const int ts_duration = av_rescale_q(ctx->sub_duration, (AVRational){1, 1000}, (AVRational){1, 100});
+
+    /* First we escape the plain text into buf. */
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    ff_ass_bprint_text_event(&buf, text, strlen(text), "", 0);
+    av_bprintf(&buf, "\r\n");
+
+    if (!av_bprint_is_complete(&buf)) {
+        av_bprint_finalize(&buf, NULL);
+        return AVERROR(ENOMEM);
+    }
+
+    /* Then we create the ass dialog line in buf2 from the escaped text in buf. */
+    av_bprint_init(&buf2, 0, AV_BPRINT_SIZE_UNLIMITED);
+    ff_ass_bprint_dialog(&buf2, buf.str, ts_start, ts_duration, 0);
+    av_bprint_finalize(&buf, NULL);
+
+    if (!av_bprint_is_complete(&buf2)) {
+        av_bprint_finalize(&buf2, NULL);
+        return AVERROR(ENOMEM);
+    }
+
+    if ((ret = av_bprint_finalize(&buf2, ass)) < 0)
+        return ret;
+
+    return 0;
+}
+
+/* Draw a page as text */
+static int gen_sub_text(TeletextContext *ctx, AVSubtitleRect *sub_rect, vbi_page *page, int chop_top)
+{
+    const char *in;
+    AVBPrint buf;
+    char *vbi_text = av_malloc(TEXT_MAXSZ);
+    int sz;
+
+    if (!vbi_text)
+        return AVERROR(ENOMEM);
+
+    sz = vbi_print_page_region(page, vbi_text, TEXT_MAXSZ-1, "UTF-8",
+                                   /*table mode*/ TRUE, FALSE,
+                                   0,             chop_top,
+                                   page->columns, page->rows-chop_top);
+    if (sz <= 0) {
+        av_log(ctx, AV_LOG_ERROR, "vbi_print error\n");
+        av_free(vbi_text);
+        return AVERROR_EXTERNAL;
+    }
+    vbi_text[sz] = '\0';
+    in  = vbi_text;
+    av_bprint_init(&buf, 0, TEXT_MAXSZ);
+
+    if (ctx->chop_spaces) {
+        for (;;) {
+            int nl, sz;
+
+            // skip leading spaces and newlines
+            in += strspn(in, " \n");
+            // compute end of row
+            for (nl = 0; in[nl]; ++nl)
+                if (in[nl] == '\n' && (nl==0 || !(in[nl-1] & 0x80)))
+                    break;
+            if (!in[nl])
+                break;
+            // skip trailing spaces
+            sz = chop_spaces_utf8(in, nl);
+            av_bprint_append_data(&buf, in, sz);
+            av_bprintf(&buf, "\n");
+            in += nl;
+        }
+    } else {
+        av_bprintf(&buf, "%s\n", vbi_text);
+    }
+    av_free(vbi_text);
+
+    if (!av_bprint_is_complete(&buf)) {
+        av_bprint_finalize(&buf, NULL);
+        return AVERROR(ENOMEM);
+    }
+
+    if (buf.len) {
+        int ret;
+        sub_rect->type = SUBTITLE_ASS;
+        if ((ret = create_ass_text(ctx, buf.str, &sub_rect->ass)) < 0) {
+            av_bprint_finalize(&buf, NULL);
+            return ret;
+        }
+        av_log(ctx, AV_LOG_DEBUG, "subtext:%s:txetbus\n", sub_rect->ass);
+    } else {
+        sub_rect->type = SUBTITLE_NONE;
+    }
+    av_bprint_finalize(&buf, NULL);
+    return 0;
+}
+
+static void fix_transparency(TeletextContext *ctx, AVSubtitleRect *sub_rect, vbi_page *page,
+                             int chop_top, uint8_t transparent_color, int resx, int resy)
+{
+    int iy;
+
+    // Hack for transparency, inspired by VLC code...
+    for (iy = 0; iy < resy; iy++) {
+        uint8_t *pixel = sub_rect->pict.data[0] + iy * sub_rect->pict.linesize[0];
+        vbi_char *vc = page->text + (iy / BITMAP_CHAR_HEIGHT + chop_top) * page->columns;
+        vbi_char *vcnext = vc + page->columns;
+        for (; vc < vcnext; vc++) {
+            uint8_t *pixelnext = pixel + BITMAP_CHAR_WIDTH;
+            switch (vc->opacity) {
+                case VBI_TRANSPARENT_SPACE:
+                    memset(pixel, transparent_color, BITMAP_CHAR_WIDTH);
+                    break;
+                case VBI_OPAQUE:
+                case VBI_SEMI_TRANSPARENT:
+                    if (!ctx->transparent_bg)
+                        break;
+                case VBI_TRANSPARENT_FULL:
+                    for(; pixel < pixelnext; pixel++)
+                        if (*pixel == vc->background)
+                            *pixel = transparent_color;
+                    break;
+            }
+            pixel = pixelnext;
+        }
+    }
+}
+
+/* Draw a page as bitmap */
+static int gen_sub_bitmap(TeletextContext *ctx, AVSubtitleRect *sub_rect, vbi_page *page, int chop_top)
+{
+    int resx = page->columns * BITMAP_CHAR_WIDTH;
+    int resy = (page->rows - chop_top) * BITMAP_CHAR_HEIGHT;
+    uint8_t ci, cmax = 0;
+    int ret;
+    vbi_char *vc = page->text + (chop_top * page->columns);
+    vbi_char *vcend = page->text + (page->rows * page->columns);
+
+    for (; vc < vcend; vc++) {
+        if (vc->opacity != VBI_TRANSPARENT_SPACE) {
+            cmax = VBI_NB_COLORS;
+            break;
+        }
+    }
+
+    if (cmax == 0) {
+        av_log(ctx, AV_LOG_DEBUG, "dropping empty page %3x\n", page->pgno);
+        sub_rect->type = SUBTITLE_NONE;
+        return 0;
+    }
+
+    if ((ret = avpicture_alloc(&sub_rect->pict, AV_PIX_FMT_PAL8, resx, resy)) < 0)
+        return ret;
+    // Yes, we want to allocate the palette on our own because AVSubtitle works this way
+    sub_rect->pict.data[1] = NULL;
+
+    vbi_draw_vt_page_region(page, VBI_PIXFMT_PAL8,
+                            sub_rect->pict.data[0], sub_rect->pict.linesize[0],
+                            0, chop_top, page->columns, page->rows - chop_top,
+                            /*reveal*/ 1, /*flash*/ 1);
+
+    fix_transparency(ctx, sub_rect, page, chop_top, cmax, resx, resy);
+    sub_rect->x = ctx->x_offset;
+    sub_rect->y = ctx->y_offset + chop_top * BITMAP_CHAR_HEIGHT;
+    sub_rect->w = resx;
+    sub_rect->h = resy;
+    sub_rect->nb_colors = (int)cmax + 1;
+    sub_rect->pict.data[1] = av_mallocz(AVPALETTE_SIZE);
+    if (!sub_rect->pict.data[1]) {
+        av_freep(&sub_rect->pict.data[0]);
+        return AVERROR(ENOMEM);
+    }
+    for (ci = 0; ci < cmax; ci++) {
+        int r, g, b, a;
+
+        r = VBI_R(page->color_map[ci]);
+        g = VBI_G(page->color_map[ci]);
+        b = VBI_B(page->color_map[ci]);
+        a = VBI_A(page->color_map[ci]);
+        ((uint32_t *)sub_rect->pict.data[1])[ci] = RGBA(r, g, b, a);
+        ff_dlog(ctx, "palette %0x\n", ((uint32_t *)sub_rect->pict.data[1])[ci]);
+    }
+    ((uint32_t *)sub_rect->pict.data[1])[cmax] = RGBA(0, 0, 0, 0);
+    sub_rect->type = SUBTITLE_BITMAP;
+    return 0;
+}
+
+static void handler(vbi_event *ev, void *user_data)
+{
+    TeletextContext *ctx = user_data;
+    TeletextPage *new_pages;
+    vbi_page page;
+    int res;
+    char pgno_str[12];
+    vbi_subno subno;
+    vbi_page_type vpt;
+    int chop_top;
+    char *lang;
+
+    snprintf(pgno_str, sizeof pgno_str, "%03x", ev->ev.ttx_page.pgno);
+    av_log(ctx, AV_LOG_DEBUG, "decoded page %s.%02x\n",
+           pgno_str, ev->ev.ttx_page.subno & 0xFF);
+
+    if (strcmp(ctx->pgno, "*") && !strstr(ctx->pgno, pgno_str))
+        return;
+    if (ctx->handler_ret < 0)
+        return;
+
+    res = vbi_fetch_vt_page(ctx->vbi, &page,
+                            ev->ev.ttx_page.pgno,
+                            ev->ev.ttx_page.subno,
+                            VBI_WST_LEVEL_3p5, 25, TRUE);
+
+    if (!res)
+        return;
+
+#ifdef DEBUG
+    fprintf(stderr, "\nSaving res=%d dy0=%d dy1=%d...\n",
+            res, page.dirty.y0, page.dirty.y1);
+    fflush(stderr);
+
+    if (!vbi_export_stdio(ctx->ex, stderr, &page))
+        fprintf(stderr, "failed: %s\n", vbi_export_errstr(ctx->ex));
+#endif
+
+    vpt = vbi_classify_page(ctx->vbi, ev->ev.ttx_page.pgno, &subno, &lang);
+    chop_top = ctx->chop_top ||
+        ((page.rows > 1) && (vpt == VBI_SUBTITLE_PAGE));
+
+    av_log(ctx, AV_LOG_DEBUG, "%d x %d page chop:%d\n",
+           page.columns, page.rows, chop_top);
+
+    if (ctx->nb_pages < MAX_BUFFERED_PAGES) {
+        if ((new_pages = av_realloc_array(ctx->pages, ctx->nb_pages + 1, sizeof(TeletextPage)))) {
+            TeletextPage *cur_page = new_pages + ctx->nb_pages;
+            ctx->pages = new_pages;
+            cur_page->sub_rect = av_mallocz(sizeof(*cur_page->sub_rect));
+            cur_page->pts = ctx->pts;
+            cur_page->pgno = ev->ev.ttx_page.pgno;
+            cur_page->subno = ev->ev.ttx_page.subno;
+            if (cur_page->sub_rect) {
+                res = (ctx->format_id == 0) ?
+                    gen_sub_bitmap(ctx, cur_page->sub_rect, &page, chop_top) :
+                    gen_sub_text  (ctx, cur_page->sub_rect, &page, chop_top);
+                if (res < 0) {
+                    av_freep(&cur_page->sub_rect);
+                    ctx->handler_ret = res;
+                } else {
+                    ctx->pages[ctx->nb_pages++] = *cur_page;
+                }
+            } else {
+                ctx->handler_ret = AVERROR(ENOMEM);
+            }
+        } else {
+            ctx->handler_ret = AVERROR(ENOMEM);
+        }
+    } else {
+        //TODO: If multiple packets contain more than one page, pages may got queued up, and this may happen...
+        av_log(ctx, AV_LOG_ERROR, "Buffered too many pages, dropping page %s.\n", pgno_str);
+        ctx->handler_ret = AVERROR(ENOSYS);
+    }
+
+    vbi_unref_page(&page);
+}
+
+static inline int data_identifier_is_teletext(int data_identifier) {
+    /* See EN 301 775 section 4.4.2. */
+    return (data_identifier >= 0x10 && data_identifier <= 0x1F ||
+            data_identifier >= 0x99 && data_identifier <= 0x9B);
+}
+
+static int slice_to_vbi_lines(TeletextContext *ctx, uint8_t* buf, int size)
+{
+    int lines = 0;
+    while (size >= 2 && lines < MAX_SLICES) {
+        int data_unit_id     = buf[0];
+        int data_unit_length = buf[1];
+        if (data_unit_length + 2 > size)
+            return AVERROR_INVALIDDATA;
+        if (data_unit_id == 0x02 || data_unit_id == 0x03) {
+            if (data_unit_length != 0x2c)
+                return AVERROR_INVALIDDATA;
+            else {
+                int line_offset  = buf[2] & 0x1f;
+                int field_parity = buf[2] & 0x20;
+                int i;
+                ctx->sliced[lines].id = VBI_SLICED_TELETEXT_B;
+                ctx->sliced[lines].line = (line_offset > 0 ? (line_offset + (field_parity ? 0 : 313)) : 0);
+                for (i = 0; i < 42; i++)
+                    ctx->sliced[lines].data[i] = vbi_rev8(buf[4 + i]);
+                lines++;
+            }
+        }
+        size -= data_unit_length + 2;
+        buf += data_unit_length + 2;
+    }
+    if (size)
+        av_log(ctx, AV_LOG_WARNING, "%d bytes remained after slicing data\n", size);
+    return lines;
+}
+
+static int teletext_decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *pkt)
+{
+    TeletextContext *ctx = avctx->priv_data;
+    AVSubtitle      *sub = data;
+    int             ret = 0;
+
+    if (!ctx->vbi) {
+        if (!(ctx->vbi = vbi_decoder_new()))
+            return AVERROR(ENOMEM);
+        if (!vbi_event_handler_add(ctx->vbi, VBI_EVENT_TTX_PAGE, handler, ctx)) {
+            vbi_decoder_delete(ctx->vbi);
+            ctx->vbi = NULL;
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    if (avctx->pkt_timebase.den && pkt->pts != AV_NOPTS_VALUE)
+        ctx->pts = av_rescale_q(pkt->pts, avctx->pkt_timebase, AV_TIME_BASE_Q);
+
+    if (pkt->size) {
+        int lines;
+        const int full_pes_size = pkt->size + 45; /* PES header is 45 bytes */
+
+        // We allow unreasonably big packets, even if the standard only allows a max size of 1472
+        if (full_pes_size < 184 || full_pes_size > 65504 || full_pes_size % 184 != 0)
+            return AVERROR_INVALIDDATA;
+
+        ctx->handler_ret = pkt->size;
+
+        if (data_identifier_is_teletext(*pkt->data)) {
+            if ((lines = slice_to_vbi_lines(ctx, pkt->data + 1, pkt->size - 1)) < 0)
+                return lines;
+            ff_dlog(avctx, "ctx=%p buf_size=%d lines=%u pkt_pts=%7.3f\n",
+                    ctx, pkt->size, lines, (double)pkt->pts/90000.0);
+            if (lines > 0) {
+#ifdef DEBUG
+                int i;
+                av_log(avctx, AV_LOG_DEBUG, "line numbers:");
+                for(i = 0; i < lines; i++)
+                    av_log(avctx, AV_LOG_DEBUG, " %d", ctx->sliced[i].line);
+                av_log(avctx, AV_LOG_DEBUG, "\n");
+#endif
+                vbi_decode(ctx->vbi, ctx->sliced, lines, 0.0);
+                ctx->lines_processed += lines;
+            }
+        }
+        ctx->pts = AV_NOPTS_VALUE;
+        ret = ctx->handler_ret;
+    }
+
+    if (ret < 0)
+        return ret;
+
+    // is there a subtitle to pass?
+    if (ctx->nb_pages) {
+        int i;
+        sub->format = ctx->format_id;
+        sub->start_display_time = 0;
+        sub->end_display_time = ctx->sub_duration;
+        sub->num_rects = 0;
+        sub->pts = ctx->pages->pts;
+
+        if (ctx->pages->sub_rect->type != SUBTITLE_NONE) {
+            sub->rects = av_malloc(sizeof(*sub->rects));
+            if (sub->rects) {
+                sub->num_rects = 1;
+                sub->rects[0] = ctx->pages->sub_rect;
+            } else {
+                ret = AVERROR(ENOMEM);
+            }
+        } else {
+            av_log(avctx, AV_LOG_DEBUG, "sending empty sub\n");
+            sub->rects = NULL;
+        }
+        if (!sub->rects) // no rect was passed
+            subtitle_rect_free(&ctx->pages->sub_rect);
+
+        for (i = 0; i < ctx->nb_pages - 1; i++)
+            ctx->pages[i] = ctx->pages[i + 1];
+        ctx->nb_pages--;
+
+        if (ret >= 0)
+            *data_size = 1;
+    } else
+        *data_size = 0;
+
+    return ret;
+}
+
+static int teletext_init_decoder(AVCodecContext *avctx)
+{
+    TeletextContext *ctx = avctx->priv_data;
+    unsigned int maj, min, rev;
+
+    vbi_version(&maj, &min, &rev);
+    if (!(maj > 0 || min > 2 || min == 2 && rev >= 26)) {
+        av_log(avctx, AV_LOG_ERROR, "decoder needs zvbi version >= 0.2.26.\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    if (ctx->format_id == 0) {
+        avctx->width  = 41 * BITMAP_CHAR_WIDTH;
+        avctx->height = 25 * BITMAP_CHAR_HEIGHT;
+    }
+
+    ctx->vbi = NULL;
+    ctx->pts = AV_NOPTS_VALUE;
+
+#ifdef DEBUG
+    {
+        char *t;
+        ctx->ex = vbi_export_new("text", &t);
+    }
+#endif
+    av_log(avctx, AV_LOG_VERBOSE, "page filter: %s\n", ctx->pgno);
+    return (ctx->format_id == 1) ? ff_ass_subtitle_header_default(avctx) : 0;
+}
+
+static int teletext_close_decoder(AVCodecContext *avctx)
+{
+    TeletextContext *ctx = avctx->priv_data;
+
+    ff_dlog(avctx, "lines_total=%u\n", ctx->lines_processed);
+    while (ctx->nb_pages)
+        subtitle_rect_free(&ctx->pages[--ctx->nb_pages].sub_rect);
+    av_freep(&ctx->pages);
+
+    vbi_decoder_delete(ctx->vbi);
+    ctx->vbi = NULL;
+    ctx->pts = AV_NOPTS_VALUE;
+    return 0;
+}
+
+static void teletext_flush(AVCodecContext *avctx)
+{
+    teletext_close_decoder(avctx);
+}
+
+#define OFFSET(x) offsetof(TeletextContext, x)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    {"txt_page",        "list of teletext page numbers to decode, * is all", OFFSET(pgno),           AV_OPT_TYPE_STRING, {.str = "*"},      0, 0,        SD},
+    {"txt_chop_top",    "discards the top teletext line",                    OFFSET(chop_top),       AV_OPT_TYPE_INT,    {.i64 = 1},        0, 1,        SD},
+    {"txt_format",      "format of the subtitles (bitmap or text)",          OFFSET(format_id),      AV_OPT_TYPE_INT,    {.i64 = 0},        0, 1,        SD,  "txt_format"},
+    {"bitmap",          NULL,                                                0,                      AV_OPT_TYPE_CONST,  {.i64 = 0},        0, 0,        SD,  "txt_format"},
+    {"text",            NULL,                                                0,                      AV_OPT_TYPE_CONST,  {.i64 = 1},        0, 0,        SD,  "txt_format"},
+    {"txt_left",        "x offset of generated bitmaps",                     OFFSET(x_offset),       AV_OPT_TYPE_INT,    {.i64 = 0},        0, 65535,    SD},
+    {"txt_top",         "y offset of generated bitmaps",                     OFFSET(y_offset),       AV_OPT_TYPE_INT,    {.i64 = 0},        0, 65535,    SD},
+    {"txt_chop_spaces", "chops leading and trailing spaces from text",       OFFSET(chop_spaces),    AV_OPT_TYPE_INT,    {.i64 = 1},        0, 1,        SD},
+    {"txt_duration",    "display duration of teletext pages in msecs",       OFFSET(sub_duration),   AV_OPT_TYPE_INT,    {.i64 = 30000},    0, 86400000, SD},
+    {"txt_transparent", "force transparent background of the teletext",      OFFSET(transparent_bg), AV_OPT_TYPE_INT,    {.i64 = 0},        0, 1,        SD},
+    { NULL },
+};
+
+static const AVClass teletext_class = {
+    .class_name = "libzvbi_teletextdec",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_libzvbi_teletext_decoder = {
+    .name      = "libzvbi_teletextdec",
+    .long_name = NULL_IF_CONFIG_SMALL("Libzvbi DVB teletext decoder"),
+    .type      = AVMEDIA_TYPE_SUBTITLE,
+    .id        = AV_CODEC_ID_DVB_TELETEXT,
+    .priv_data_size = sizeof(TeletextContext),
+    .init      = teletext_init_decoder,
+    .close     = teletext_close_decoder,
+    .decode    = teletext_decode_frame,
+    .capabilities = AV_CODEC_CAP_DELAY,
+    .flush     = teletext_flush,
+    .priv_class= &teletext_class,
+};
diff --git a/libavcodec/ljpegenc.c b/libavcodec/ljpegenc.c
index 41dba89b7b..7f2dd602db 100644
--- a/libavcodec/ljpegenc.c
+++ b/libavcodec/ljpegenc.c
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,8 +47,8 @@ typedef struct LJpegEncContext {
     ScanTable scantable;
     uint16_t matrix[64];
 
-    int vsample[3];
-    int hsample[3];
+    int vsample[4];
+    int hsample[4];
 
     uint16_t huff_code_dc_luminance[12];
     uint16_t huff_code_dc_chrominance[12];
@@ -67,30 +67,38 @@ static int ljpeg_encode_bgr(AVCodecContext *avctx, PutBitContext *pb,
     const int linesize    = frame->linesize[0];
     uint16_t (*buffer)[4] = s->scratch;
     const int predictor   = avctx->prediction_method+1;
-    int left[3], top[3], topleft[3];
+    int left[4], top[4], topleft[4];
     int x, y, i;
 
-    for (i = 0; i < 3; i++)
+    for (i = 0; i < 4; i++)
         buffer[0][i] = 1 << (9 - 1);
 
     for (y = 0; y < height; y++) {
         const int modified_predictor = y ? predictor : 1;
         uint8_t *ptr = frame->data[0] + (linesize * y);
 
-        if (pb->buf_end - pb->buf - (put_bits_count(pb) >> 3) < width * 3 * 3) {
+        if (pb->buf_end - pb->buf - (put_bits_count(pb) >> 3) < width * 4 * 4) {
             av_log(avctx, AV_LOG_ERROR, "encoded frame too large\n");
             return -1;
         }
 
-        for (i = 0; i < 3; i++)
+        for (i = 0; i < 4; i++)
             top[i]= left[i]= topleft[i]= buffer[0][i];
 
         for (x = 0; x < width; x++) {
-            buffer[x][1] =  ptr[3 * x + 0] -     ptr[3 * x + 1] + 0x100;
-            buffer[x][2] =  ptr[3 * x + 2] -     ptr[3 * x + 1] + 0x100;
-            buffer[x][0] = (ptr[3 * x + 0] + 2 * ptr[3 * x + 1] + ptr[3 * x + 2]) >> 2;
+            if(avctx->pix_fmt == AV_PIX_FMT_BGR24){
+                buffer[x][1] =  ptr[3 * x + 0] -     ptr[3 * x + 1] + 0x100;
+                buffer[x][2] =  ptr[3 * x + 2] -     ptr[3 * x + 1] + 0x100;
+                buffer[x][0] = (ptr[3 * x + 0] + 2 * ptr[3 * x + 1] + ptr[3 * x + 2]) >> 2;
+            }else{
+                buffer[x][1] =  ptr[4 * x + 0] -     ptr[4 * x + 1] + 0x100;
+                buffer[x][2] =  ptr[4 * x + 2] -     ptr[4 * x + 1] + 0x100;
+                buffer[x][0] = (ptr[4 * x + 0] + 2 * ptr[4 * x + 1] + ptr[4 * x + 2]) >> 2;
+                if (avctx->pix_fmt == AV_PIX_FMT_BGRA)
+                    buffer[x][3] =  ptr[4 * x + 3];
+            }
 
-            for (i = 0; i < 3; i++) {
+            for (i = 0; i < 3 + (avctx->pix_fmt == AV_PIX_FMT_BGRA); i++) {
                 int pred, diff;
 
                 PREDICT(pred, topleft[i], top[i], left[i], modified_predictor);
@@ -102,7 +110,7 @@ static int ljpeg_encode_bgr(AVCodecContext *avctx, PutBitContext *pb,
 
                 diff       = ((left[i] - pred + 0x100) & 0x1FF) - 0x100;
 
-                if (i == 0)
+                if (i == 0 || i == 3)
                     ff_mjpeg_encode_dc(pb, diff, s->huff_size_dc_luminance, s->huff_code_dc_luminance); //FIXME ugly
                 else
                     ff_mjpeg_encode_dc(pb, diff, s->huff_size_dc_chrominance, s->huff_code_dc_chrominance);
@@ -213,25 +221,29 @@ static int ljpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int max_pkt_size = AV_INPUT_BUFFER_MIN_SIZE;
     int ret, header_bits;
 
-    if (avctx->pix_fmt == AV_PIX_FMT_BGR24)
-        max_pkt_size += width * height * 3 * 3;
+    if(    avctx->pix_fmt == AV_PIX_FMT_BGR0
+        || avctx->pix_fmt == AV_PIX_FMT_BGR24)
+        max_pkt_size += width * height * 3 * 4;
+    else if(avctx->pix_fmt == AV_PIX_FMT_BGRA)
+        max_pkt_size += width * height * 4 * 4;
     else {
         max_pkt_size += mb_width * mb_height * 3 * 4
                         * s->hsample[0] * s->vsample[0];
     }
-    if ((ret = ff_alloc_packet(pkt, max_pkt_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", max_pkt_size);
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, max_pkt_size, 0)) < 0)
         return ret;
-    }
 
     init_put_bits(&pb, pkt->data, pkt->size);
 
     ff_mjpeg_encode_picture_header(avctx, &pb, &s->scantable,
-                                   s->matrix);
+                                   s->matrix, s->matrix);
 
     header_bits = put_bits_count(&pb);
 
-    if (avctx->pix_fmt == AV_PIX_FMT_BGR24)
+    if(    avctx->pix_fmt == AV_PIX_FMT_BGR0
+        || avctx->pix_fmt == AV_PIX_FMT_BGRA
+        || avctx->pix_fmt == AV_PIX_FMT_BGR24)
         ret = ljpeg_encode_bgr(avctx, &pb, pict);
     else
         ret = ljpeg_encode_yuv(avctx, &pb, pict);
@@ -240,6 +252,7 @@ static int ljpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     emms_c();
 
+    ff_mjpeg_escape_FF(&pb, header_bits >> 3);
     ff_mjpeg_encode_picture_trailer(&pb, header_bits);
 
     flush_put_bits(&pb);
@@ -262,7 +275,6 @@ static av_cold int ljpeg_encode_close(AVCodecContext *avctx)
 static av_cold int ljpeg_encode_init(AVCodecContext *avctx)
 {
     LJpegEncContext *s = avctx->priv_data;
-    int chroma_v_shift, chroma_h_shift;
 
     if ((avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
          avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
@@ -283,26 +295,14 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
     s->scratch = av_malloc_array(avctx->width + 1, sizeof(*s->scratch));
+    if (!s->scratch)
+        goto fail;
 
     ff_idctdsp_init(&s->idsp, avctx);
     ff_init_scantable(s->idsp.idct_permutation, &s->scantable,
                       ff_zigzag_direct);
 
-    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &chroma_h_shift,
-                                     &chroma_v_shift);
-
-    if (avctx->pix_fmt   == AV_PIX_FMT_BGR24) {
-        s->vsample[0] = s->hsample[0] =
-        s->vsample[1] = s->hsample[1] =
-        s->vsample[2] = s->hsample[2] = 1;
-    } else {
-        s->vsample[0] = 2;
-        s->vsample[1] = 2 >> chroma_v_shift;
-        s->vsample[2] = 2 >> chroma_v_shift;
-        s->hsample[0] = 2;
-        s->hsample[1] = 2 >> chroma_h_shift;
-        s->hsample[2] = 2 >> chroma_h_shift;
-    }
+    ff_mjpeg_init_hvsample(avctx, s->hsample, s->vsample);
 
     ff_mjpeg_build_huffman_codes(s->huff_size_dc_luminance,
                                  s->huff_code_dc_luminance,
@@ -314,6 +314,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
                                  avpriv_mjpeg_val_dc);
 
     return 0;
+fail:
+    ljpeg_encode_close(avctx);
+    return AVERROR(ENOMEM);
 }
 
 AVCodec ff_ljpeg_encoder = {
@@ -325,12 +328,10 @@ AVCodec ff_ljpeg_encoder = {
     .init           = ljpeg_encode_init,
     .encode2        = ljpeg_encode_frame,
     .close          = ljpeg_encode_close,
-    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUVJ420P,
-                                                    AV_PIX_FMT_YUVJ422P,
-                                                    AV_PIX_FMT_YUVJ444P,
-                                                    AV_PIX_FMT_BGR24,
-                                                    AV_PIX_FMT_YUV420P,
-                                                    AV_PIX_FMT_YUV422P,
-                                                    AV_PIX_FMT_YUV444P,
-                                                    AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts       = (const enum AVPixelFormat[]){
+        AV_PIX_FMT_BGR24   , AV_PIX_FMT_BGRA    , AV_PIX_FMT_BGR0,
+        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
+        AV_PIX_FMT_YUV420P , AV_PIX_FMT_YUV444P , AV_PIX_FMT_YUV422P,
+        AV_PIX_FMT_NONE},
 };
diff --git a/libavcodec/loco.c b/libavcodec/loco.c
index f25ef61f37..9d0f144451 100644
--- a/libavcodec/loco.c
+++ b/libavcodec/loco.c
@@ -2,20 +2,20 @@
  * LOCO codec
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -46,7 +46,7 @@ enum LOCO_MODE {
 typedef struct LOCOContext {
     AVCodecContext *avctx;
     int lossy;
-    int mode;
+    enum LOCO_MODE mode;
 } LOCOContext;
 
 typedef struct RICEContext {
@@ -130,9 +130,15 @@ static int loco_decode_plane(LOCOContext *l, uint8_t *data, int width, int heigh
 {
     RICEContext rc;
     int val;
+    int ret;
     int i, j;
 
-    init_get_bits(&rc.gb, buf, buf_size*8);
+    if(buf_size<=0)
+        return -1;
+
+    if ((ret = init_get_bits8(&rc.gb, buf, buf_size)) < 0)
+        return ret;
+
     rc.save  = 0;
     rc.run   = 0;
     rc.run2  = 0;
@@ -165,6 +171,23 @@ static int loco_decode_plane(LOCOContext *l, uint8_t *data, int width, int heigh
     return (get_bits_count(&rc.gb) + 7) >> 3;
 }
 
+static void rotate_faulty_loco(uint8_t *data, int width, int height, int stride, int step)
+{
+    int y;
+
+    for (y=1; y<height; y++) {
+        if (width>=y) {
+            memmove(data + y*stride,
+                    data + y*(stride + step),
+                    step*(width-y));
+            if (y+1 < height)
+                memmove(data + y*stride + step*(width-y),
+                        data + (y+1)*stride,
+                        step*y);
+        }
+    }
+}
+
 static int decode_frame(AVCodecContext *avctx,
                         void *data, int *got_frame,
                         AVPacket *avpkt)
@@ -175,88 +198,72 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame * const p     = data;
     int decoded, ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->key_frame = 1;
 
+#define ADVANCE_BY_DECODED do { \
+    if (decoded < 0 || decoded >= buf_size) goto buf_too_small; \
+    buf += decoded; buf_size -= decoded; \
+} while(0)
     switch(l->mode) {
     case LOCO_CYUY2: case LOCO_YUY2: case LOCO_UYVY:
         decoded = loco_decode_plane(l, p->data[0], avctx->width, avctx->height,
                                     p->linesize[0], buf, buf_size, 1);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[1], avctx->width / 2, avctx->height,
                                     p->linesize[1], buf, buf_size, 1);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[2], avctx->width / 2, avctx->height,
                                     p->linesize[2], buf, buf_size, 1);
         break;
     case LOCO_CYV12: case LOCO_YV12:
         decoded = loco_decode_plane(l, p->data[0], avctx->width, avctx->height,
                                     p->linesize[0], buf, buf_size, 1);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[2], avctx->width / 2, avctx->height / 2,
                                     p->linesize[2], buf, buf_size, 1);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[1], avctx->width / 2, avctx->height / 2,
                                     p->linesize[1], buf, buf_size, 1);
         break;
     case LOCO_CRGB: case LOCO_RGB:
         decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1), avctx->width, avctx->height,
                                     -p->linesize[0], buf, buf_size, 3);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 1, avctx->width, avctx->height,
                                     -p->linesize[0], buf, buf_size, 3);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 2, avctx->width, avctx->height,
                                     -p->linesize[0], buf, buf_size, 3);
+        if (avctx->width & 1)
+            rotate_faulty_loco(p->data[0] + p->linesize[0]*(avctx->height-1), avctx->width, avctx->height, -p->linesize[0], 3);
         break;
+    case LOCO_CRGBA:
     case LOCO_RGBA:
-        decoded = loco_decode_plane(l, p->data[0], avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 4);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
-        decoded = loco_decode_plane(l, p->data[0] + 1, avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 4);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
-        decoded = loco_decode_plane(l, p->data[0] + 2, avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 4);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
-        decoded = loco_decode_plane(l, p->data[0] + 3, avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 4);
+        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1), avctx->width, avctx->height,
+                                    -p->linesize[0], buf, buf_size, 4);
+        ADVANCE_BY_DECODED;
+        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 1, avctx->width, avctx->height,
+                                    -p->linesize[0], buf, buf_size, 4);
+        ADVANCE_BY_DECODED;
+        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 2, avctx->width, avctx->height,
+                                    -p->linesize[0], buf, buf_size, 4);
+        ADVANCE_BY_DECODED;
+        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 3, avctx->width, avctx->height,
+                                    -p->linesize[0], buf, buf_size, 4);
         break;
+    default:
+        av_assert0(0);
     }
 
+    if (decoded < 0 || decoded > buf_size)
+        goto buf_too_small;
+    buf_size -= decoded;
+
     *got_frame      = 1;
 
-    return buf_size;
+    return avpkt->size - buf_size;
 buf_too_small:
     av_log(avctx, AV_LOG_ERROR, "Input data too small.\n");
     return AVERROR(EINVAL);
@@ -303,7 +310,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         break;
     case LOCO_CRGBA:
     case LOCO_RGBA:
-        avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        avctx->pix_fmt = AV_PIX_FMT_BGRA;
         break;
     default:
         av_log(avctx, AV_LOG_INFO, "Unknown colorspace, index = %i\n", l->mode);
diff --git a/libavcodec/lossless_audiodsp.c b/libavcodec/lossless_audiodsp.c
new file mode 100644
index 0000000000..32f4c9e856
--- /dev/null
+++ b/libavcodec/lossless_audiodsp.c
@@ -0,0 +1,49 @@
+/*
+ * Monkey's Audio lossless audio decoder
+ * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
+ *  based upon libdemac from Dave Chapman.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "lossless_audiodsp.h"
+
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul)
+{
+    int res = 0;
+
+    while (order--) {
+        res   += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+    return res;
+}
+
+av_cold void ff_llauddsp_init(LLAudDSPContext *c)
+{
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
+
+    if (ARCH_ARM)
+        ff_llauddsp_init_arm(c);
+    if (ARCH_PPC)
+        ff_llauddsp_init_ppc(c);
+    if (ARCH_X86)
+        ff_llauddsp_init_x86(c);
+}
diff --git a/libavcodec/apedsp.h b/libavcodec/lossless_audiodsp.h
index 64e2749679..9ce2e63d45 100644
--- a/libavcodec/apedsp.h
+++ b/libavcodec/lossless_audiodsp.h
@@ -3,42 +3,44 @@
  * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
  *  based upon libdemac from Dave Chapman.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_APEDSP_H
-#define AVCODEC_APEDSP_H
+#ifndef AVCODEC_LLAUDDSP_H
+#define AVCODEC_LLAUDDSP_H
 
 #include <stdint.h>
 
-typedef struct APEDSPContext {
+typedef struct LLAudDSPContext {
     /**
      * Calculate scalar product of v1 and v2,
      * and v1[i] += v3[i] * mul
-     * @param len length of vectors, should be multiple of 16
+     * @param len length of vectors, should be multiple of 16,
+     *            or padd v3 and v1 or v2 with zeros.
      */
     int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
                                             const int16_t *v2,
                                             const int16_t *v3,
                                             int len, int mul);
-} APEDSPContext;
+} LLAudDSPContext;
 
-void ff_apedsp_init_arm(APEDSPContext *c);
-void ff_apedsp_init_ppc(APEDSPContext *c);
-void ff_apedsp_init_x86(APEDSPContext *c);
+void ff_llauddsp_init(LLAudDSPContext *c);
+void ff_llauddsp_init_arm(LLAudDSPContext *c);
+void ff_llauddsp_init_ppc(LLAudDSPContext *c);
+void ff_llauddsp_init_x86(LLAudDSPContext *c);
 
-#endif /* AVCODEC_APEDSP_H */
+#endif /* AVCODEC_LLAUDDSP_H */
diff --git a/libavcodec/lossless_videodsp.c b/libavcodec/lossless_videodsp.c
new file mode 100644
index 0000000000..3491621dc4
--- /dev/null
+++ b/libavcodec/lossless_videodsp.c
@@ -0,0 +1,128 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "avcodec.h"
+#include "lossless_videodsp.h"
+#include "libavcodec/mathops.h"
+
+static void add_int16_c(uint16_t *dst, const uint16_t *src, unsigned mask, int w){
+    long i;
+    unsigned long pw_lsb = (mask >> 1) * 0x0001000100010001ULL;
+    unsigned long pw_msb = pw_lsb +  0x0001000100010001ULL;
+    for (i = 0; i <= w - (int)sizeof(long)/2; i += sizeof(long)/2) {
+        long a = *(long*)(src+i);
+        long b = *(long*)(dst+i);
+        *(long*)(dst+i) = ((a&pw_lsb) + (b&pw_lsb)) ^ ((a^b)&pw_msb);
+    }
+    for(; i<w; i++)
+        dst[i] = (dst[i] + src[i]) & mask;
+}
+
+static void diff_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w){
+    long i;
+#if !HAVE_FAST_UNALIGNED
+    if((long)src2 & (sizeof(long)-1)){
+        for(i=0; i+3<w; i+=4){
+            dst[i+0] = (src1[i+0]-src2[i+0]) & mask;
+            dst[i+1] = (src1[i+1]-src2[i+1]) & mask;
+            dst[i+2] = (src1[i+2]-src2[i+2]) & mask;
+            dst[i+3] = (src1[i+3]-src2[i+3]) & mask;
+        }
+    }else
+#endif
+    {
+        unsigned long pw_lsb = (mask >> 1) * 0x0001000100010001ULL;
+        unsigned long pw_msb = pw_lsb +  0x0001000100010001ULL;
+
+        for (i = 0; i <= w - (int)sizeof(long)/2; i += sizeof(long)/2) {
+            long a = *(long*)(src1+i);
+            long b = *(long*)(src2+i);
+            *(long*)(dst+i) = ((a|pw_msb) - (b&pw_lsb)) ^ ((a^b^pw_msb)&pw_msb);
+        }
+    }
+    for (; i<w; i++)
+        dst[i] = (src1[i] - src2[i]) & mask;
+}
+
+static void add_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top){
+    int i;
+    uint16_t l, lt;
+
+    l  = *left;
+    lt = *left_top;
+
+    for(i=0; i<w; i++){
+        l  = (mid_pred(l, src[i], (l + src[i] - lt) & mask) + diff[i]) & mask;
+        lt = src[i];
+        dst[i] = l;
+    }
+
+    *left     = l;
+    *left_top = lt;
+}
+
+static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top){
+    int i;
+    uint16_t l, lt;
+
+    l  = *left;
+    lt = *left_top;
+
+    for(i=0; i<w; i++){
+        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & mask);
+        lt = src1[i];
+        l  = src2[i];
+        dst[i] = (l - pred) & mask;
+    }
+
+    *left     = l;
+    *left_top = lt;
+}
+
+static int add_hfyu_left_pred_int16_c(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc){
+    int i;
+
+    for(i=0; i<w-1; i++){
+        acc+= src[i];
+        dst[i]= acc & mask;
+        i++;
+        acc+= src[i];
+        dst[i]= acc & mask;
+    }
+
+    for(; i<w; i++){
+        acc+= src[i];
+        dst[i]= acc & mask;
+    }
+
+    return acc;
+}
+
+
+void ff_llviddsp_init(LLVidDSPContext *c, AVCodecContext *avctx)
+{
+    c->add_int16 = add_int16_c;
+    c->diff_int16= diff_int16_c;
+    c->add_hfyu_left_pred_int16   = add_hfyu_left_pred_int16_c;
+    c->add_hfyu_median_pred_int16 = add_hfyu_median_pred_int16_c;
+    c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c;
+
+    if (ARCH_X86)
+        ff_llviddsp_init_x86(c, avctx);
+}
diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_videodsp.h
new file mode 100644
index 0000000000..040902e7ba
--- /dev/null
+++ b/libavcodec/lossless_videodsp.h
@@ -0,0 +1,40 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#ifndef AVCODEC_LOSSLESS_VIDEODSP_H
+#define AVCODEC_LOSSLESS_VIDEODSP_H
+
+#include "avcodec.h"
+#include "libavutil/cpu.h"
+
+typedef struct LLVidDSPContext {
+    void (*add_int16)(uint16_t *dst/*align 16*/, const uint16_t *src/*align 16*/, unsigned mask, int w);
+    void (*diff_int16)(uint16_t *dst/*align 16*/, const uint16_t *src1/*align 16*/, const uint16_t *src2/*align 1*/, unsigned mask, int w);
+
+    void (*sub_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
+    void (*add_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
+    int  (*add_hfyu_left_pred_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned left);
+} LLVidDSPContext;
+
+void ff_llviddsp_init(LLVidDSPContext *llviddsp, AVCodecContext *avctx);
+void ff_llviddsp_init_x86(LLVidDSPContext *llviddsp, AVCodecContext *avctx);
+
+#endif //AVCODEC_LOSSLESS_VIDEODSP_H
diff --git a/libavcodec/lpc.c b/libavcodec/lpc.c
index 1482e57615..3839119cc2 100644
--- a/libavcodec/lpc.c
+++ b/libavcodec/lpc.c
@@ -2,20 +2,20 @@
  * LPC utility code
  * Copyright (c) 2006  Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 
 #define LPC_USE_DOUBLE
 #include "lpc.h"
+#include "libavutil/avassert.h"
 
 
 /**
@@ -36,13 +37,19 @@ static void lpc_apply_welch_window_c(const int32_t *data, int len,
     double w;
     double c;
 
-    /* The optimization in commit fa4ed8c does not support odd len.
-     * If someone wants odd len extend that change. */
-    assert(!(len & 1));
-
     n2 = (len >> 1);
     c = 2.0 / (len - 1.0);
 
+    if (len & 1) {
+        for(i=0; i<n2; i++) {
+            w = c - i - 1.0;
+            w = 1.0 - (w * w);
+            w_data[i] = data[i] * w;
+            w_data[len-1-i] = data[len-1-i] * w;
+        }
+        return;
+    }
+
     w_data+=n2;
       data+=n2;
     for(i=0; i<n2; i++) {
@@ -160,6 +167,28 @@ int ff_lpc_calc_ref_coefs(LPCContext *s,
     return order;
 }
 
+double ff_lpc_calc_ref_coefs_f(LPCContext *s, const float *samples, int len,
+                               int order, double *ref)
+{
+    int i;
+    double signal = 0.0f, avg_err = 0.0f;
+    double autoc[MAX_LPC_ORDER+1] = {0}, error[MAX_LPC_ORDER+1] = {0};
+    const double a = 0.5f, b = 1.0f - a;
+
+    /* Apply windowing */
+    for (i = 0; i < len; i++) {
+        double weight = a - b*cos((2*M_PI*i)/(len - 1));
+        s->windowed_samples[i] = weight*samples[i];
+    }
+
+    s->lpc_compute_autocorr(s->windowed_samples, len, order, autoc);
+    signal = autoc[0];
+    compute_ref_coefs(autoc, order, ref, error);
+    for (i = 0; i < order; i++)
+        avg_err = (avg_err + error[i])/2.0f;
+    return signal/avg_err;
+}
+
 /**
  * Calculate LPC coefficients for multiple orders
  *
@@ -179,8 +208,9 @@ int ff_lpc_calc_coefs(LPCContext *s,
     int i, j, pass = 0;
     int opt_order;
 
-    assert(max_order >= MIN_LPC_ORDER && max_order <= MAX_LPC_ORDER &&
+    av_assert2(max_order >= MIN_LPC_ORDER && max_order <= MAX_LPC_ORDER &&
            lpc_type > FF_LPC_TYPE_FIXED);
+    av_assert0(lpc_type == FF_LPC_TYPE_CHOLESKY || lpc_type == FF_LPC_TYPE_LEVINSON);
 
     /* reinit LPC context if parameters have changed */
     if (blocksize != s->blocksize || max_order != s->max_order ||
@@ -189,6 +219,9 @@ int ff_lpc_calc_coefs(LPCContext *s,
         ff_lpc_init(s, blocksize, max_order, lpc_type);
     }
 
+    if(lpc_passes <= 0)
+        lpc_passes = 2;
+
     if (lpc_type == FF_LPC_TYPE_LEVINSON || (lpc_type == FF_LPC_TYPE_CHOLESKY && lpc_passes > 1)) {
         s->lpc_apply_welch_window(samples, blocksize, s->windowed_samples);
 
@@ -203,7 +236,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
     }
 
     if (lpc_type == FF_LPC_TYPE_CHOLESKY) {
-        LLSModel m[2];
+        LLSModel *m = s->lls_models;
         LOCAL_ALIGNED(32, double, var, [FFALIGN(MAX_LPC_ORDER+1,4)]);
         double av_uninit(weight);
         memset(var, 0, FFALIGN(MAX_LPC_ORDER+1,4)*sizeof(*var));
@@ -244,6 +277,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
         for(i=max_order-1; i>0; i--)
             ref[i] = ref[i-1] - ref[i];
     }
+
     opt_order = max_order;
 
     if(omethod == ORDER_METHOD_EST) {
diff --git a/libavcodec/lpc.h b/libavcodec/lpc.h
index 642854c59a..7e0ee3e172 100644
--- a/libavcodec/lpc.h
+++ b/libavcodec/lpc.h
@@ -2,20 +2,20 @@
  * LPC utility code
  * Copyright (c) 2006  Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,9 @@
 #define AVCODEC_LPC_H
 
 #include <stdint.h>
+#include "libavutil/avassert.h"
+#include "libavutil/lls.h"
+#include "aac_defines.h"
 
 #define ORDER_METHOD_EST     0
 #define ORDER_METHOD_2LEVEL  1
@@ -66,7 +69,7 @@ typedef struct LPCContext {
     /**
      * Perform autocorrelation on input samples with delay of 0 to lag.
      * @param data  input samples.
-     *              constraints: no alignment needed, but must have have at
+     *              constraints: no alignment needed, but must have at
      *              least lag*sizeof(double) valid bytes preceding it, and
      *              size must be at least (len+1)*sizeof(double) if data is
      *              16-byte aligned or (len+2)*sizeof(double) if data is
@@ -78,6 +81,9 @@ typedef struct LPCContext {
      */
     void (*lpc_compute_autocorr)(const double *data, int len, int lag,
                                  double *autoc);
+
+    // TODO: these should be allocated to reduce ABI compatibility issues
+    LLSModel lls_models[2];
 } LPCContext;
 
 
@@ -94,6 +100,9 @@ int ff_lpc_calc_coefs(LPCContext *s,
 int ff_lpc_calc_ref_coefs(LPCContext *s,
                           const int32_t *samples, int order, double *ref);
 
+double ff_lpc_calc_ref_coefs_f(LPCContext *s, const float *samples, int len,
+                               int order, double *ref);
+
 /**
  * Initialize LPCContext.
  */
@@ -106,11 +115,15 @@ void ff_lpc_init_x86(LPCContext *s);
  */
 void ff_lpc_end(LPCContext *s);
 
+#if USE_FIXED
+#define LPC_TYPE int
+#else
 #ifdef LPC_USE_DOUBLE
 #define LPC_TYPE double
 #else
 #define LPC_TYPE float
 #endif
+#endif // USE_FIXED
 
 /**
  * Schur recursion.
@@ -147,7 +160,7 @@ static inline void compute_ref_coefs(const LPC_TYPE *autoc, int max_order,
  * Levinson-Durbin recursion.
  * Produce LPC coefficients from autocorrelation data.
  */
-static inline int compute_lpc_coefs(const LPC_TYPE *autoc, int max_order,
+static inline int AAC_RENAME(compute_lpc_coefs)(const LPC_TYPE *autoc, int max_order,
                                     LPC_TYPE *lpc, int lpc_stride, int fail,
                                     int normalize)
 {
@@ -155,6 +168,8 @@ static inline int compute_lpc_coefs(const LPC_TYPE *autoc, int max_order,
     LPC_TYPE err = 0;
     LPC_TYPE *lpc_last = lpc;
 
+    av_assert2(normalize || !fail);
+
     if (normalize)
         err = *autoc++;
 
@@ -162,14 +177,14 @@ static inline int compute_lpc_coefs(const LPC_TYPE *autoc, int max_order,
         return -1;
 
     for(i=0; i<max_order; i++) {
-        LPC_TYPE r = -autoc[i];
+        LPC_TYPE r = AAC_SRA_R(-autoc[i], 5);
 
         if (normalize) {
             for(j=0; j<i; j++)
                 r -= lpc_last[j] * autoc[i-j-1];
 
             r /= err;
-            err *= 1.0 - (r * r);
+            err *= FIXR(1.0) - (r * r);
         }
 
         lpc[i] = r;
@@ -177,8 +192,8 @@ static inline int compute_lpc_coefs(const LPC_TYPE *autoc, int max_order,
         for(j=0; j < (i+1)>>1; j++) {
             LPC_TYPE f = lpc_last[    j];
             LPC_TYPE b = lpc_last[i-1-j];
-            lpc[    j] = f + r * b;
-            lpc[i-1-j] = b + r * f;
+            lpc[    j] = f + AAC_MUL26(r, b);
+            lpc[i-1-j] = b + AAC_MUL26(r, f);
         }
 
         if (fail && err < 0)
diff --git a/libavcodec/lsp.c b/libavcodec/lsp.c
index 8a05aede62..17f59ea77d 100644
--- a/libavcodec/lsp.c
+++ b/libavcodec/lsp.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2007 Reynaldo H. Verdejo Pinochet (QCELP decoder)
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,8 @@
 #define FRAC_BITS 14
 #include "mathops.h"
 #include "lsp.h"
+#include "libavcodec/mips/lsp_mips.h"
+#include "libavutil/avassert.h"
 
 void ff_acelp_reorder_lsf(int16_t* lsfq, int lsfq_min_distance, int lsfq_min, int lsfq_max, int lp_order)
 {
@@ -73,7 +75,7 @@ static int16_t ff_cos(uint16_t arg)
     uint8_t offset= arg;
     uint8_t ind = arg >> 8;
 
-    assert(arg <= 0x3fff);
+    av_assert2(arg <= 0x3fff);
 
     return tab_cos[ind] + (offset * (tab_cos[ind+1] - tab_cos[ind]) >> 8);
 }
@@ -173,7 +175,11 @@ void ff_acelp_lp_decode(int16_t* lp_1st, int16_t* lp_2nd, const int16_t* lsp_2nd
 
     /* LSP values for first subframe (3.2.5 of G.729, Equation 24)*/
     for(i=0; i<lp_order; i++)
+#ifdef G729_BITEXACT
+        lsp_1st[i] = (lsp_2nd[i] >> 1) + (lsp_prev[i] >> 1);
+#else
         lsp_1st[i] = (lsp_2nd[i] + lsp_prev[i]) >> 1;
+#endif
 
     ff_acelp_lsp2lpc(lp_1st, lsp_1st, lp_order >> 1);
 
@@ -181,6 +187,7 @@ void ff_acelp_lp_decode(int16_t* lp_1st, int16_t* lp_2nd, const int16_t* lsp_2nd
     ff_acelp_lsp2lpc(lp_2nd, lsp_2nd, lp_order >> 1);
 }
 
+#ifndef ff_lsp2polyf
 void ff_lsp2polyf(const double *lsp, double *f, int lp_half_order)
 {
     int i, j;
@@ -197,13 +204,14 @@ void ff_lsp2polyf(const double *lsp, double *f, int lp_half_order)
         f[1] += val;
     }
 }
+#endif /* ff_lsp2polyf */
 
 void ff_acelp_lspd2lpc(const double *lsp, float *lpc, int lp_half_order)
 {
     double pa[MAX_LP_HALF_ORDER+1], qa[MAX_LP_HALF_ORDER+1];
     float *lpc2 = lpc + (lp_half_order << 1) - 1;
 
-    assert(lp_half_order <= MAX_LP_HALF_ORDER);
+    av_assert2(lp_half_order <= MAX_LP_HALF_ORDER);
 
     ff_lsp2polyf(lsp,     pa, lp_half_order);
     ff_lsp2polyf(lsp + 1, qa, lp_half_order);
diff --git a/libavcodec/lsp.h b/libavcodec/lsp.h
index 4b955678ee..46a2d47beb 100644
--- a/libavcodec/lsp.h
+++ b/libavcodec/lsp.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/lzf.c b/libavcodec/lzf.c
index 35b932bd08..409a7ffdd3 100644
--- a/libavcodec/lzf.c
+++ b/libavcodec/lzf.c
@@ -2,20 +2,20 @@
  * lzf decompression algorithm
  * Copyright (c) 2015 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/lzf.h b/libavcodec/lzf.h
index 4951f2511f..0ad73d9f79 100644
--- a/libavcodec/lzf.h
+++ b/libavcodec/lzf.h
@@ -2,20 +2,20 @@
  * lzf decompression algorithm
  * Copyright (c) 2015 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/lzw.c b/libavcodec/lzw.c
index fae5687ab7..6832c122a2 100644
--- a/libavcodec/lzw.c
+++ b/libavcodec/lzw.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -93,12 +93,6 @@ static int lzw_get_code(struct LZWState * s)
     return c & s->curmask;
 }
 
-int ff_lzw_size_read(LZWState *p)
-{
-    struct LZWState *s = p;
-    return bytestream2_tell(&s->gb);
-}
-
 void ff_lzw_decode_tail(LZWState *p)
 {
     struct LZWState *s = (struct LZWState *)p;
diff --git a/libavcodec/lzw.h b/libavcodec/lzw.h
index d925d35e27..4653c1c74f 100644
--- a/libavcodec/lzw.h
+++ b/libavcodec/lzw.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,7 +47,6 @@ void ff_lzw_decode_open(LZWState **p);
 void ff_lzw_decode_close(LZWState **p);
 int ff_lzw_decode_init(LZWState *s, int csize, const uint8_t *buf, int buf_size, int mode);
 int ff_lzw_decode(LZWState *s, uint8_t *buf, int len);
-int ff_lzw_size_read(LZWState *lzw);
 void ff_lzw_decode_tail(LZWState *lzw);
 
 /** LZW encode state */
diff --git a/libavcodec/lzwenc.c b/libavcodec/lzwenc.c
index 7c37bf2094..03080ee587 100644
--- a/libavcodec/lzwenc.c
+++ b/libavcodec/lzwenc.c
@@ -2,20 +2,20 @@
  * LZW encoder
  * Copyright (c) 2007 Bartlomiej Wolowiec
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,7 +77,7 @@ static inline int hash(int head, const int add)
     head ^= (add << LZW_HASH_SHIFT);
     if (head >= LZW_HASH_SIZE)
         head -= LZW_HASH_SIZE;
-    assert(head >= 0 && head < LZW_HASH_SIZE);
+    av_assert2(head >= 0 && head < LZW_HASH_SIZE);
     return head;
 }
 
@@ -112,7 +112,7 @@ static inline int hashOffset(const int head)
  */
 static inline void writeCode(LZWEncodeState * s, int c)
 {
-    assert(0 <= c && c < 1 << s->bits);
+    av_assert2(0 <= c && c < 1 << s->bits);
     s->put_bits(&s->pb, s->bits, c);
 }
 
@@ -208,7 +208,7 @@ void ff_lzw_encode_init(LZWEncodeState *s, uint8_t *outbuf, int outsize,
     s->maxbits = maxbits;
     init_put_bits(&s->pb, outbuf, outsize);
     s->bufsize = outsize;
-    assert(s->maxbits >= 9 && s->maxbits <= LZW_MAXBITS);
+    av_assert0(s->maxbits >= 9 && s->maxbits <= LZW_MAXBITS);
     s->maxcode = 1 << s->maxbits;
     s->output_bytes = 0;
     s->last_code = LZW_PREFIX_EMPTY;
@@ -263,6 +263,9 @@ int ff_lzw_encode_flush(LZWEncodeState *s,
     if (s->last_code != -1)
         writeCode(s, s->last_code);
     writeCode(s, s->end_code);
+    if (s->mode == FF_LZW_GIF)
+        s->put_bits(&s->pb, 1, 0);
+
     lzw_flush_put_bits(&s->pb);
     s->last_code = -1;
 
diff --git a/libavcodec/mace.c b/libavcodec/mace.c
index c6eddc05da..e332a72d6d 100644
--- a/libavcodec/mace.c
+++ b/libavcodec/mace.c
@@ -2,20 +2,20 @@
  * MACE decoder
  * Copyright (c) 2002 Laszlo Torok <torokl@alpha.dfmk.hu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -244,12 +244,17 @@ static int mace_decode_frame(AVCodecContext *avctx, void *data,
     int i, j, k, l, ret;
     int is_mace3 = (avctx->codec_id == AV_CODEC_ID_MACE3);
 
+    if (buf_size % (avctx->channels << is_mace3)) {
+        av_log(avctx, AV_LOG_ERROR, "buffer size %d is odd\n", buf_size);
+        buf_size -= buf_size % (avctx->channels << is_mace3);
+        if (!buf_size)
+            return AVERROR_INVALIDDATA;
+    }
+
     /* get output buffer */
     frame->nb_samples = 3 * (buf_size << (1 - is_mace3)) / avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t **)frame->extended_data;
 
     for(i = 0; i < avctx->channels; i++) {
diff --git a/libavcodec/mathops.c b/libavcodec/mathops.c
new file mode 100644
index 0000000000..31c8e69e61
--- /dev/null
+++ b/libavcodec/mathops.c
@@ -0,0 +1,26 @@
+#include "mathops.h"
+
+#ifdef TEST
+
+#include <stdlib.h>
+
+int main(void)
+{
+    unsigned u;
+
+    for(u=0; u<65536; u++) {
+        unsigned s = u*u;
+        unsigned root = ff_sqrt(s);
+        unsigned root_m1 = ff_sqrt(s-1);
+        if (s && root != u) {
+            fprintf(stderr, "ff_sqrt failed at %u with %u\n", s, root);
+            return 1;
+        }
+        if (u && root_m1 != u - 1) {
+            fprintf(stderr, "ff_sqrt failed at %u with %u\n", s, root);
+            return 1;
+        }
+    }
+    return 0;
+}
+#endif /* TEST */
diff --git a/libavcodec/mathops.h b/libavcodec/mathops.h
index 7af13e19e0..c065018f56 100644
--- a/libavcodec/mathops.h
+++ b/libavcodec/mathops.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2001, 2002 Fabrice Bellard
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #ifndef AVCODEC_MATHOPS_H
@@ -30,7 +30,6 @@
 #define MAX_NEG_CROP 1024
 
 extern const uint32_t ff_inverse[257];
-extern const uint8_t  ff_reverse[256];
 extern const uint8_t ff_sqrt_tab[256];
 extern const uint8_t ff_crop_tab[256 + 2 * MAX_NEG_CROP];
 extern const uint8_t ff_zigzag_direct[64];
@@ -39,8 +38,6 @@ extern const uint8_t ff_zigzag_direct[64];
 #   include "arm/mathops.h"
 #elif ARCH_AVR32
 #   include "avr32/mathops.h"
-#elif ARCH_BFIN
-#   include "bfin/mathops.h"
 #elif ARCH_MIPS
 #   include "mips/mathops.h"
 #elif ARCH_PPC
@@ -123,6 +120,20 @@ static inline av_const int mid_pred(int a, int b, int c)
 }
 #endif
 
+#ifndef median4
+#define median4 median4
+static inline av_const int median4(int a, int b, int c, int d)
+{
+    if (a < b) {
+        if (c < d) return (FFMIN(b, d) + FFMAX(a, c)) / 2;
+        else       return (FFMIN(b, c) + FFMAX(a, d)) / 2;
+    } else {
+        if (c < d) return (FFMIN(a, d) + FFMAX(b, c)) / 2;
+        else       return (FFMIN(a, c) + FFMAX(b, d)) / 2;
+    }
+}
+#endif
+
 #ifndef sign_extend
 static inline av_const int sign_extend(int val, unsigned bits)
 {
@@ -199,6 +210,8 @@ if ((y) < (x)) {\
 #   define FASTDIV(a,b) ((uint32_t)((((uint64_t)a) * ff_inverse[b]) >> 32))
 #endif /* FASTDIV */
 
+#ifndef ff_sqrt
+#define ff_sqrt ff_sqrt
 static inline av_const unsigned int ff_sqrt(unsigned int a)
 {
     unsigned int b;
@@ -218,6 +231,7 @@ static inline av_const unsigned int ff_sqrt(unsigned int a)
 
     return b - (a < b * b);
 }
+#endif
 
 static inline int8_t ff_u8_to_s8(uint8_t a)
 {
diff --git a/libavcodec/mathtables.c b/libavcodec/mathtables.c
index 47695bc343..7b5efb88a6 100644
--- a/libavcodec/mathtables.c
+++ b/libavcodec/mathtables.c
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,25 +71,6 @@ const uint8_t ff_sqrt_tab[256]={
 240,240,241,242,242,243,243,244,244,245,245,246,246,247,247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255,255,255
 };
 
-const uint8_t ff_reverse[256] = {
-0x00,0x80,0x40,0xC0,0x20,0xA0,0x60,0xE0,0x10,0x90,0x50,0xD0,0x30,0xB0,0x70,0xF0,
-0x08,0x88,0x48,0xC8,0x28,0xA8,0x68,0xE8,0x18,0x98,0x58,0xD8,0x38,0xB8,0x78,0xF8,
-0x04,0x84,0x44,0xC4,0x24,0xA4,0x64,0xE4,0x14,0x94,0x54,0xD4,0x34,0xB4,0x74,0xF4,
-0x0C,0x8C,0x4C,0xCC,0x2C,0xAC,0x6C,0xEC,0x1C,0x9C,0x5C,0xDC,0x3C,0xBC,0x7C,0xFC,
-0x02,0x82,0x42,0xC2,0x22,0xA2,0x62,0xE2,0x12,0x92,0x52,0xD2,0x32,0xB2,0x72,0xF2,
-0x0A,0x8A,0x4A,0xCA,0x2A,0xAA,0x6A,0xEA,0x1A,0x9A,0x5A,0xDA,0x3A,0xBA,0x7A,0xFA,
-0x06,0x86,0x46,0xC6,0x26,0xA6,0x66,0xE6,0x16,0x96,0x56,0xD6,0x36,0xB6,0x76,0xF6,
-0x0E,0x8E,0x4E,0xCE,0x2E,0xAE,0x6E,0xEE,0x1E,0x9E,0x5E,0xDE,0x3E,0xBE,0x7E,0xFE,
-0x01,0x81,0x41,0xC1,0x21,0xA1,0x61,0xE1,0x11,0x91,0x51,0xD1,0x31,0xB1,0x71,0xF1,
-0x09,0x89,0x49,0xC9,0x29,0xA9,0x69,0xE9,0x19,0x99,0x59,0xD9,0x39,0xB9,0x79,0xF9,
-0x05,0x85,0x45,0xC5,0x25,0xA5,0x65,0xE5,0x15,0x95,0x55,0xD5,0x35,0xB5,0x75,0xF5,
-0x0D,0x8D,0x4D,0xCD,0x2D,0xAD,0x6D,0xED,0x1D,0x9D,0x5D,0xDD,0x3D,0xBD,0x7D,0xFD,
-0x03,0x83,0x43,0xC3,0x23,0xA3,0x63,0xE3,0x13,0x93,0x53,0xD3,0x33,0xB3,0x73,0xF3,
-0x0B,0x8B,0x4B,0xCB,0x2B,0xAB,0x6B,0xEB,0x1B,0x9B,0x5B,0xDB,0x3B,0xBB,0x7B,0xFB,
-0x07,0x87,0x47,0xC7,0x27,0xA7,0x67,0xE7,0x17,0x97,0x57,0xD7,0x37,0xB7,0x77,0xF7,
-0x0F,0x8F,0x4F,0xCF,0x2F,0xAF,0x6F,0xEF,0x1F,0x9F,0x5F,0xDF,0x3F,0xBF,0x7F,0xFF,
-};
-
 #define times4(x) x, x, x, x
 #define times256(x) times4(times4(times4(times4(times4(x)))))
 
diff --git a/libavcodec/mdct_fixed.c b/libavcodec/mdct_fixed.c
index 9e06861dba..a32cb00ca0 100644
--- a/libavcodec/mdct_fixed.c
+++ b/libavcodec/mdct_fixed.c
@@ -1,22 +1,23 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 0
+#define FFT_FIXED_32 0
 #include "mdct_template.c"
 
 /* same as ff_mdct_calcw_c with double-width unscaled output */
diff --git a/libavcodec/mdct_fixed_32.c b/libavcodec/mdct_fixed_32.c
new file mode 100644
index 0000000000..5a34dfe760
--- /dev/null
+++ b/libavcodec/mdct_fixed_32.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj@mips.com)
+ *           Goran Cordasic   (goran@mips.com)
+ *           Djordje Pesut    (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0
+#define FFT_FIXED_32 1
+#include "mdct_template.c"
diff --git a/libavcodec/mdct_float.c b/libavcodec/mdct_float.c
index a0a62b3001..cff2d211c4 100644
--- a/libavcodec/mdct_float.c
+++ b/libavcodec/mdct_float.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 1
+#define FFT_FIXED_32 0
 #include "mdct_template.c"
diff --git a/libavcodec/mdct_template.c b/libavcodec/mdct_template.c
index bad890ef7f..e7e5f622f1 100644
--- a/libavcodec/mdct_template.c
+++ b/libavcodec/mdct_template.c
@@ -2,20 +2,20 @@
  * MDCT/IMDCT transforms
  * Copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,7 +34,11 @@
 #if FFT_FLOAT
 #   define RSCALE(x) (x)
 #else
+#if FFT_FIXED_32
+#   define RSCALE(x) (((x) + 32) >> 6)
+#else /* FFT_FIXED_32 */
 #   define RSCALE(x) ((x) >> 1)
+#endif /* FFT_FIXED_32 */
 #endif
 
 /**
@@ -56,7 +60,7 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale)
     if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0)
         goto fail;
 
-    s->tcos = av_malloc(n/2 * sizeof(FFTSample));
+    s->tcos = av_malloc_array(n/2, sizeof(FFTSample));
     if (!s->tcos)
         goto fail;
 
@@ -77,8 +81,13 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale)
     scale = sqrt(fabs(scale));
     for(i=0;i<n4;i++) {
         alpha = 2 * M_PI * (i + theta) / n;
+#if FFT_FIXED_32
+        s->tcos[i*tstep] = (FFTSample)floor(-cos(alpha) * 2147483648.0 + 0.5);
+        s->tsin[i*tstep] = (FFTSample)floor(-sin(alpha) * 2147483648.0 + 0.5);
+#else
         s->tcos[i*tstep] = FIX15(-cos(alpha) * scale);
         s->tsin[i*tstep] = FIX15(-sin(alpha) * scale);
+#endif
     }
     return 0;
  fail:
diff --git a/libavcodec/mdec.c b/libavcodec/mdec.c
index 39395e2ea9..1cc4ca4742 100644
--- a/libavcodec/mdec.c
+++ b/libavcodec/mdec.c
@@ -4,20 +4,20 @@
  *
  * based upon code from Sebastian Jedruszkiewicz <elf@frogger.rules.pl>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,7 @@
 
 #include "avcodec.h"
 #include "blockdsp.h"
+#include "bswapdsp.h"
 #include "idctdsp.h"
 #include "mpeg12.h"
 #include "thread.h"
@@ -36,6 +37,7 @@
 typedef struct MDECContext {
     AVCodecContext *avctx;
     BlockDSPContext bdsp;
+    BswapDSPContext bbdsp;
     IDCTDSPContext idsp;
     ThreadFrame frame;
     GetBitContext gb;
@@ -129,7 +131,7 @@ static inline int mdec_decode_block_intra(MDECContext *a, int16_t *block, int n)
 static inline int decode_mb(MDECContext *a, int16_t block[6][64])
 {
     int i, ret;
-    const int block_index[6] = { 5, 4, 0, 1, 2, 3 };
+    static const int block_index[6] = { 5, 4, 0, 1, 2, 3 };
 
     a->bdsp.clear_blocks(block[0]);
 
@@ -171,23 +173,19 @@ static int decode_frame(AVCodecContext *avctx,
     const uint8_t *buf    = avpkt->data;
     int buf_size          = avpkt->size;
     ThreadFrame frame     = { .f = data };
-    int i, ret;
+    int ret;
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
-    }
     frame.f->pict_type = AV_PICTURE_TYPE_I;
     frame.f->key_frame = 1;
 
-    av_fast_malloc(&a->bitstream_buffer, &a->bitstream_buffer_size, buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&a->bitstream_buffer, &a->bitstream_buffer_size, buf_size);
     if (!a->bitstream_buffer)
         return AVERROR(ENOMEM);
-    for (i = 0; i < buf_size; i += 2) {
-        a->bitstream_buffer[i]     = buf[i + 1];
-        a->bitstream_buffer[i + 1] = buf[i];
-    }
-    init_get_bits(&a->gb, a->bitstream_buffer, buf_size * 8);
+    a->bbdsp.bswap16_buf((uint16_t *)a->bitstream_buffer, (uint16_t *)buf, (buf_size + 1) / 2);
+    if ((ret = init_get_bits8(&a->gb, a->bitstream_buffer, buf_size)) < 0)
+        return ret;
 
     /* skip over 4 preamble bytes in stream (typically 0xXX 0xXX 0x00 0x38) */
     skip_bits(&a->gb, 32);
@@ -221,6 +219,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     a->avctx           = avctx;
 
     ff_blockdsp_init(&a->bdsp, avctx);
+    ff_bswapdsp_init(&a->bbdsp);
     ff_idctdsp_init(&a->idsp, avctx);
     ff_mpeg12_init_vlcs();
     ff_init_scantable(a->idsp.idct_permutation, &a->scantable,
@@ -234,6 +233,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+#if HAVE_THREADS
 static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
 {
     MDECContext * const a = avctx->priv_data;
@@ -242,6 +242,7 @@ static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 static av_cold int decode_end(AVCodecContext *avctx)
 {
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index eb98a72f1b..dc76b07ba2 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1,22 +1,27 @@
 /*
- * This file is part of Libav.
+ * DSP utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/internal.h"
 #include "avcodec.h"
 #include "copy_block.h"
 #include "simple_idct.h"
@@ -103,8 +108,8 @@ static int sum_abs_dctelem_c(int16_t *block)
     return sum;
 }
 
-#define avg2(a, b) ((a + b + 1) >> 1)
-#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
+#define avg2(a, b) (((a) + (b) + 1) >> 1)
+#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
 
 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                               ptrdiff_t stride, int h)
@@ -409,6 +414,14 @@ void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type)
         case FF_CMP_NSSE:
             cmp[i] = c->nsse[i];
             break;
+#if CONFIG_DWT
+        case FF_CMP_W53:
+            cmp[i]= c->w53[i];
+            break;
+        case FF_CMP_W97:
+            cmp[i]= c->w97[i];
+            break;
+#endif
         default:
             av_log(NULL, AV_LOG_ERROR,
                    "internal error in cmp function selection\n");
@@ -436,7 +449,7 @@ static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
 {
     int i, temp[64], sum = 0;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     for (i = 0; i < 8; i++) {
         // FIXME: try pointer walks
@@ -488,7 +501,7 @@ static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
 {
     int i, temp[64], sum = 0;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     for (i = 0; i < 8; i++) {
         // FIXME: try pointer walks
@@ -540,7 +553,7 @@ static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
 {
     LOCAL_ALIGNED_16(int16_t, temp, [64]);
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     s->pdsp.diff_pixels(temp, src1, src2, stride);
     s->fdsp.fdct(temp);
@@ -607,7 +620,7 @@ static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
     LOCAL_ALIGNED_16(int16_t, temp, [64]);
     int sum = 0, i;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     s->pdsp.diff_pixels(temp, src1, src2, stride);
     s->fdsp.fdct(temp);
@@ -625,7 +638,7 @@ static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
     int16_t *const bak = temp + 64;
     int sum = 0, i;
 
-    assert(h == 8);
+    av_assert2(h == 8);
     s->mb_intra = 0;
 
     s->pdsp.diff_pixels(temp, src1, src2, stride);
@@ -654,7 +667,7 @@ static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
     const int esc_length = s->ac_esc_length;
     uint8_t *length, *last_length;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     copy_block8(lsrc1, src1, 8, stride, 8);
     copy_block8(lsrc2, src2, 8, stride, 8);
@@ -698,7 +711,7 @@ static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
 
         level = temp[i] + 64;
 
-        assert(level - 64);
+        av_assert2(level - 64);
 
         if ((level & (~127)) == 0) {
             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
@@ -729,7 +742,7 @@ static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
     const int esc_length = s->ac_esc_length;
     uint8_t *length, *last_length;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     s->pdsp.diff_pixels(temp, src1, src2, stride);
 
@@ -770,7 +783,7 @@ static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
 
         level = temp[i] + 64;
 
-        assert(level - 64);
+        av_assert2(level - 64);
 
         if ((level & (~127)) == 0)
             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
@@ -803,20 +816,24 @@ static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
 VSAD_INTRA(8)
 VSAD_INTRA(16)
 
-static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
-                    ptrdiff_t stride, int h)
-{
-    int score = 0, x, y;
-
-    for (y = 1; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
-        s1 += stride;
-        s2 += stride;
-    }
-
-    return score;
+#define VSAD(size)                                                             \
+static int vsad ## size ## _c(MpegEncContext *c,                               \
+                              uint8_t *s1, uint8_t *s2,                        \
+                              ptrdiff_t stride, int h)                               \
+{                                                                              \
+    int score = 0, x, y;                                                       \
+                                                                               \
+    for (y = 1; y < h; y++) {                                                  \
+        for (x = 0; x < size; x++)                                             \
+            score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);   \
+        s1 += stride;                                                          \
+        s2 += stride;                                                          \
+    }                                                                          \
+                                                                               \
+    return score;                                                              \
 }
+VSAD(8)
+VSAD(16)
 
 #define SQ(a) ((a) * (a))
 #define VSSE_INTRA(size)                                                \
@@ -841,20 +858,23 @@ static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
 VSSE_INTRA(8)
 VSSE_INTRA(16)
 
-static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
-                    ptrdiff_t stride, int h)
-{
-    int score = 0, x, y;
-
-    for (y = 1; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
-        s1 += stride;
-        s2 += stride;
-    }
-
-    return score;
+#define VSSE(size)                                                             \
+static int vsse ## size ## _c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,     \
+                              ptrdiff_t stride, int h)                         \
+{                                                                              \
+    int score = 0, x, y;                                                       \
+                                                                               \
+    for (y = 1; y < h; y++) {                                                  \
+        for (x = 0; x < size; x++)                                             \
+            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);      \
+        s1 += stride;                                                          \
+        s2 += stride;                                                          \
+    }                                                                          \
+                                                                               \
+    return score;                                                              \
 }
+VSSE(8)
+VSSE(16)
 
 #define WRAPPER8_16_SQ(name8, name16)                                   \
 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
@@ -892,8 +912,31 @@ av_cold void ff_me_cmp_init_static(void)
         ff_square_tab[i] = (i - 256) * (i - 256);
 }
 
+int ff_check_alignment(void)
+{
+    static int did_fail = 0;
+    LOCAL_ALIGNED_16(int, aligned, [4]);
+
+    if ((intptr_t)aligned & 15) {
+        if (!did_fail) {
+#if HAVE_MMX || HAVE_ALTIVEC
+            av_log(NULL, AV_LOG_ERROR,
+                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
+                "and may be very slow or crash. This is not a bug in libavcodec,\n"
+                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
+                "Do not report crashes to FFmpeg developers.\n");
+#endif
+            did_fail=1;
+        }
+        return -1;
+    }
+    return 0;
+}
+
 av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
 {
+    ff_check_alignment();
+
     c->sum_abs_dctelem = sum_abs_dctelem_c;
 
     /* TODO [0] 16  [1] 8 */
@@ -927,18 +970,27 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
     SET_CMP_FUNC(rd)
     SET_CMP_FUNC(bit)
     c->vsad[0] = vsad16_c;
+    c->vsad[1] = vsad8_c;
     c->vsad[4] = vsad_intra16_c;
     c->vsad[5] = vsad_intra8_c;
     c->vsse[0] = vsse16_c;
+    c->vsse[1] = vsse8_c;
     c->vsse[4] = vsse_intra16_c;
     c->vsse[5] = vsse_intra8_c;
     c->nsse[0] = nsse16_c;
     c->nsse[1] = nsse8_c;
+#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
+    ff_dsputil_init_dwt(c);
+#endif
 
+    if (ARCH_ALPHA)
+        ff_me_cmp_init_alpha(c, avctx);
     if (ARCH_ARM)
         ff_me_cmp_init_arm(c, avctx);
     if (ARCH_PPC)
         ff_me_cmp_init_ppc(c, avctx);
     if (ARCH_X86)
         ff_me_cmp_init_x86(c, avctx);
+    if (ARCH_MIPS)
+        ff_me_cmp_init_mips(c, avctx);
 }
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index 725f9b25ee..a3603ec2c1 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,20 @@
 
 extern uint32_t ff_square_tab[512];
 
+
+/* minimum alignment rules ;)
+ * If you notice errors in the align stuff, need more alignment for some ASM code
+ * for some CPU or need to use a function with less aligned data then send a mail
+ * to the ffmpeg-devel mailing list, ...
+ *
+ * !warning These alignments might not match reality, (missing attribute((align))
+ * stuff somewhere possible).
+ * I (Michael) did not check them, these are just the alignments which I think
+ * could be reached easily ...
+ *
+ * !future video codecs might need functions with less strict alignment
+ */
+
 struct MpegEncContext;
 /* Motion estimation:
  * h is limited to { width / 2, width, 2 * width },
@@ -49,6 +63,8 @@ typedef struct MECmpContext {
     me_cmp_func vsad[6];
     me_cmp_func vsse[6];
     me_cmp_func nsse[6];
+    me_cmp_func w53[6];
+    me_cmp_func w97[6];
     me_cmp_func dct_max[6];
     me_cmp_func dct264_sad[6];
 
@@ -64,11 +80,17 @@ typedef struct MECmpContext {
 
 void ff_me_cmp_init_static(void);
 
+int ff_check_alignment(void);
+
 void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);
 
 void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type);
 
+void ff_dsputil_init_dwt(MECmpContext *c);
+
 #endif /* AVCODEC_ME_CMP_H */
diff --git a/libavcodec/metasound.c b/libavcodec/metasound.c
index dbb2a63e3e..5a7f4c3fa1 100644
--- a/libavcodec/metasound.c
+++ b/libavcodec/metasound.c
@@ -4,20 +4,20 @@
  * based on TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -236,7 +236,7 @@ static int metasound_read_bitstream(AVCodecContext *avctx, TwinVQContext *tctx,
             skip_bits(&gb, 4 - (get_bits_count(&gb) & 3));
     }
 
-    return 0;
+    return (get_bits_count(&gb) + 7) / 8;
 }
 
 typedef struct MetasoundProps {
diff --git a/libavcodec/metasound_data.c b/libavcodec/metasound_data.c
index 8aa53e556c..6d87117406 100644
--- a/libavcodec/metasound_data.c
+++ b/libavcodec/metasound_data.c
@@ -2,20 +2,20 @@
  * MetaSound decoder
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -11208,6 +11208,14 @@ static const int16_t fcb16m[] = {
       -688,   -209,    915,    622,  -1038,   -474,   -343,    -91,
       -173,   -104,    255,     96,   1547,    773,   -625,   2272,
        -90,   -509,   -527,   -247,   -147,   -234,    -45,    166,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
 };
 
 static const int16_t fcb16sl[] = {
@@ -12739,185 +12747,6 @@ static const float lsp16[] = {
     -0.0429, -0.0615, -0.0893, -0.0618, -0.0384, -0.0134, -0.0232, -0.0238,
 };
 
-static const float lsp16s[] = {
-     0.1813,  0.3911,  0.6301,  0.8012,  1.0057,  1.2041,  1.4271,  1.6943,
-     1.9402,  2.1733,  2.3521,  2.4989,  2.5839,  2.6846,  2.7634,  2.8950,
-     0.1311,  0.3183,  0.4659,  0.5601,  0.6658,  0.7828,  1.0065,  1.2717,
-     1.5185,  1.7339,  1.9530,  2.2189,  2.3739,  2.4991,  2.6984,  2.9256,
-     0.1627,  0.4519,  0.6323,  0.7012,  0.7848,  0.9801,  1.1810,  1.3222,
-     1.5413,  1.8129,  1.9338,  2.0809,  2.3180,  2.5189,  2.7066,  2.9514,
-     0.1475,  0.2447,  0.4240,  0.5669,  0.7872,  0.9838,  1.1823,  1.3814,
-     1.5358,  1.6820,  1.8794,  2.1419,  2.4132,  2.6112,  2.7911,  2.9511,
-     0.1224,  0.2876,  0.5013,  0.6985,  0.8902,  1.0901,  1.2835,  1.4768,
-     1.6596,  1.8538,  2.0467,  2.2304,  2.4124,  2.5942,  2.7729,  2.9531,
-     0.1741,  0.3034,  0.4677,  0.5879,  0.7258,  0.9648,  1.1417,  1.3220,
-     1.5081,  1.7151,  1.9212,  2.1286,  2.3208,  2.4938,  2.6765,  2.8891,
-     0.1657,  0.3174,  0.4907,  0.6559,  0.8295,  1.0254,  1.2071,  1.3880,
-     1.5737,  1.7845,  1.9027,  2.1139,  2.3323,  2.5157,  2.7323,  2.9015,
-     0.1592,  0.2758,  0.4417,  0.6315,  0.8257,  0.9873,  1.1277,  1.2830,
-     1.4337,  1.6315,  1.8899,  2.1356,  2.3572,  2.5632,  2.7468,  2.9420,
-     0.1524,  0.4325,  0.5931,  0.7036,  0.7696,  0.8923,  1.1739,  1.4773,
-     1.6609,  1.7911,  1.9666,  2.1972,  2.3754,  2.5045,  2.6613,  2.8882,
-     0.2130,  0.3013,  0.3721,  0.4257,  0.5079,  0.7015,  0.9815,  1.2554,
-     1.4648,  1.6966,  1.9138,  2.1075,  2.3318,  2.5292,  2.7453,  2.9347,
-     0.1142,  0.3748,  0.6205,  0.7642,  0.8121,  0.9022,  0.9843,  1.1558,
-     1.4467,  1.7422,  1.9574,  2.1302,  2.3812,  2.5898,  2.7720,  2.9583,
-     0.1255,  0.2339,  0.3570,  0.5323,  0.7458,  1.0003,  1.1729,  1.3567,
-     1.5217,  1.6977,  1.8924,  2.0942,  2.3145,  2.5408,  2.7553,  2.9337,
-     0.1316,  0.2289,  0.4327,  0.6663,  0.8509,  0.9994,  1.1697,  1.3804,
-     1.5609,  1.6903,  1.8572,  2.1019,  2.3687,  2.5789,  2.7715,  2.9472,
-     0.1502,  0.2546,  0.3883,  0.5333,  0.6976,  0.9163,  1.1071,  1.3364,
-     1.5420,  1.7525,  1.8948,  2.0839,  2.2819,  2.4651,  2.6875,  2.8987,
-     0.1593,  0.3014,  0.4573,  0.6354,  0.8157,  0.9805,  1.1783,  1.3747,
-     1.5678,  1.7326,  1.9286,  2.1340,  2.3253,  2.5280,  2.7180,  2.9298,
-     0.1811,  0.3167,  0.4655,  0.6507,  0.8198,  1.0075,  1.1892,  1.3743,
-     1.5227,  1.7090,  1.8849,  2.0743,  2.2750,  2.4830,  2.6896,  2.8953,
-     0.1846,  0.3577,  0.5315,  0.7290,  0.9176,  1.1016,  1.2654,  1.4525,
-     1.6315,  1.8268,  2.0238,  2.1934,  2.3868,  2.5753,  2.7682,  2.9469,
-     0.0876,  0.1439,  0.2048,  0.3654,  0.6281,  0.8853,  1.0907,  1.2992,
-     1.5227,  1.7373,  1.9395,  2.1419,  2.3488,  2.5486,  2.7466,  2.9348,
-     0.1391,  0.4170,  0.6561,  0.7953,  0.8734,  0.9986,  1.1870,  1.4520,
-     1.6042,  1.7910,  2.0135,  2.1870,  2.3358,  2.5066,  2.7409,  2.9955,
-     0.0804,  0.1355,  0.2599,  0.4998,  0.7408,  0.9474,  1.1276,  1.3428,
-     1.5556,  1.7712,  1.9699,  2.1535,  2.3605,  2.5548,  2.7489,  2.9325,
-     0.1304,  0.3087,  0.4979,  0.6584,  0.8414,  1.0329,  1.2244,  1.4189,
-     1.6118,  1.8200,  1.9985,  2.1893,  2.3915,  2.5794,  2.7647,  2.9344,
-     0.1895,  0.2849,  0.3705,  0.4126,  0.6265,  0.9207,  1.1774,  1.3762,
-     1.5757,  1.7728,  1.9568,  2.1662,  2.3615,  2.5575,  2.7561,  2.9416,
-     0.1800,  0.3078,  0.4805,  0.6796,  0.8503,  1.0046,  1.1703,  1.3269,
-     1.4862,  1.6502,  1.8454,  2.0873,  2.3175,  2.5356,  2.7516,  2.9469,
-     0.1950,  0.3233,  0.4568,  0.5940,  0.7589,  0.9978,  1.1701,  1.3383,
-     1.5017,  1.6565,  1.8243,  2.0605,  2.2938,  2.5147,  2.7419,  2.9396,
-     0.2531,  0.4391,  0.5790,  0.7170,  0.8998,  1.1430,  1.3577,  1.5326,
-     1.6328,  1.7627,  1.9726,  2.1762,  2.3563,  2.5478,  2.7385,  2.9067,
-     0.1805,  0.2788,  0.3591,  0.3881,  0.5441,  0.8055,  1.0766,  1.3165,
-     1.5316,  1.7508,  1.9477,  2.1374,  2.3438,  2.5484,  2.7501,  2.9410,
-     0.2044,  0.3671,  0.5396,  0.7042,  0.8582,  0.9831,  1.1261,  1.3194,
-     1.4769,  1.6979,  1.8717,  2.0463,  2.2620,  2.4739,  2.7054,  2.9208,
-     0.1048,  0.2175,  0.4206,  0.5923,  0.7483,  0.9400,  1.1356,  1.3799,
-     1.5958,  1.7320,  1.8984,  2.1296,  2.3594,  2.5492,  2.7387,  2.9305,
-     0.0842,  0.1729,  0.3951,  0.6447,  0.8688,  1.0605,  1.2472,  1.4330,
-     1.6232,  1.8144,  2.0216,  2.1915,  2.3878,  2.5763,  2.7685,  2.9464,
-     0.1461,  0.2593,  0.4105,  0.5677,  0.7328,  0.8919,  1.0484,  1.2302,
-     1.4386,  1.6635,  1.8873,  2.1024,  2.3116,  2.5268,  2.7273,  2.9269,
-     0.1503,  0.3108,  0.4756,  0.6731,  0.8600,  1.0233,  1.2115,  1.3971,
-     1.5915,  1.7892,  1.9517,  2.1603,  2.3487,  2.5460,  2.7308,  2.8998,
-     0.2163,  0.3669,  0.5125,  0.6709,  0.8143,  0.9930,  1.2095,  1.4205,
-     1.6176,  1.7112,  1.8398,  2.0896,  2.3513,  2.5290,  2.6667,  2.8960,
-     0.2133,  0.4382,  0.6287,  0.8702,  1.1088,  1.3749,  1.6062,  1.7446,
-     1.8333,  1.9122,  1.9614,  2.0669,  2.1789,  2.3449,  2.6038,  2.8849,
-     0.1598,  0.2719,  0.3877,  0.4815,  0.5926,  0.7795,  1.0449,  1.3045,
-     1.5210,  1.7391,  1.9462,  2.1397,  2.3553,  2.5458,  2.7540,  2.9392,
-     0.2918,  0.5607,  0.6801,  0.7404,  0.8285,  0.9431,  1.1579,  1.4080,
-     1.6332,  1.8472,  1.9738,  2.0771,  2.2890,  2.5178,  2.7445,  2.9830,
-     0.1664,  0.2842,  0.3965,  0.5463,  0.8162,  1.0346,  1.1849,  1.3446,
-     1.5122,  1.7563,  1.9960,  2.2002,  2.3796,  2.5689,  2.7712,  2.9550,
-     0.0911,  0.2397,  0.5052,  0.7868,  1.0299,  1.1311,  1.2244,  1.3333,
-     1.4395,  1.6790,  1.9369,  2.1717,  2.3689,  2.5538,  2.7340,  2.9326,
-     0.1647,  0.2931,  0.3836,  0.4978,  0.6255,  0.9243,  1.1339,  1.3001,
-     1.5269,  1.8010,  1.9715,  2.1419,  2.3784,  2.5503,  2.6719,  2.8745,
-     0.2440,  0.3802,  0.4756,  0.6613,  0.8627,  1.0292,  1.2291,  1.4060,
-     1.5198,  1.7354,  1.9044,  2.1010,  2.3147,  2.4996,  2.7171,  2.9041,
-     0.1590,  0.2876,  0.4572,  0.5996,  0.7713,  0.9490,  1.1205,  1.2815,
-     1.4516,  1.6385,  1.8179,  2.0457,  2.2759,  2.4785,  2.6861,  2.9080,
-     0.2297,  0.4309,  0.5712,  0.6717,  0.8138,  1.0463,  1.2492,  1.4560,
-     1.6796,  1.8458,  1.9642,  2.1452,  2.3636,  2.5395,  2.7456,  2.9495,
-     0.2975,  0.4678,  0.4996,  0.5809,  0.6279,  0.6884,  0.8606,  1.1386,
-     1.4412,  1.6876,  1.8760,  2.0932,  2.3178,  2.5166,  2.7345,  2.9280,
-     0.1278,  0.3737,  0.6004,  0.7069,  0.8147,  1.0180,  1.2581,  1.3812,
-     1.4855,  1.7268,  1.9970,  2.1258,  2.2936,  2.5702,  2.7563,  2.8983,
-     0.1314,  0.2508,  0.3999,  0.5680,  0.7424,  0.9367,  1.1286,  1.3175,
-     1.5336,  1.7404,  1.9317,  2.1404,  2.3514,  2.5562,  2.7510,  2.9402,
-     0.1043,  0.2367,  0.4293,  0.6376,  0.8160,  0.9836,  1.1779,  1.3850,
-     1.5835,  1.7875,  1.9765,  2.1593,  2.3654,  2.5577,  2.7465,  2.9398,
-     0.1529,  0.2515,  0.3454,  0.4374,  0.7011,  0.9015,  1.0744,  1.3532,
-     1.5699,  1.7545,  2.0021,  2.1259,  2.2278,  2.4546,  2.7264,  2.9425,
-     0.1429,  0.2808,  0.4395,  0.6334,  0.8069,  0.9705,  1.1520,  1.3250,
-     1.5109,  1.7285,  1.9356,  2.1469,  2.3479,  2.5554,  2.7512,  2.9348,
-     0.1625,  0.3022,  0.4756,  0.6315,  0.8032,  0.9924,  1.1596,  1.3204,
-     1.4994,  1.6929,  1.8955,  2.1090,  2.3025,  2.5018,  2.6908,  2.8980,
-     0.1692,  0.3427,  0.5228,  0.7756,  0.9688,  1.0950,  1.3056,  1.4360,
-     1.5675,  1.8049,  1.9376,  2.1151,  2.3407,  2.5012,  2.7192,  2.9258,
-     0.0474,  0.1251,  0.1939,  0.3841,  0.6501,  0.9231,  1.1153,  1.3240,
-     1.5478,  1.7599,  1.9651,  2.1510,  2.3645,  2.5552,  2.7542,  2.9393,
-     0.2196,  0.4656,  0.7492,  0.9922,  1.1678,  1.2489,  1.3112,  1.3657,
-     1.4223,  1.5302,  1.7212,  1.9996,  2.2523,  2.4844,  2.7036,  2.9145,
-     0.1128,  0.2368,  0.3704,  0.5476,  0.7723,  0.9968,  1.1930,  1.3992,
-     1.6013,  1.7957,  1.9888,  2.1857,  2.3825,  2.5705,  2.7616,  2.9434,
-     0.1341,  0.2768,  0.4510,  0.6359,  0.8332,  1.0335,  1.2004,  1.3952,
-     1.5762,  1.7681,  1.9815,  2.1735,  2.3657,  2.5552,  2.7514,  2.9498,
-     0.1247,  0.2559,  0.3516,  0.4726,  0.6861,  0.9483,  1.1852,  1.3858,
-     1.5851,  1.7815,  1.9778,  2.1737,  2.3729,  2.5664,  2.7620,  2.9429,
-     0.1988,  0.3320,  0.4777,  0.6737,  0.8425,  1.0265,  1.1694,  1.3655,
-     1.5463,  1.7135,  1.9385,  2.1650,  2.3529,  2.5367,  2.7545,  2.9585,
-     0.1376,  0.2620,  0.4273,  0.6169,  0.7755,  0.9441,  1.1169,  1.3157,
-     1.5179,  1.7020,  1.8931,  2.1059,  2.3112,  2.5136,  2.7169,  2.9198,
-     0.2112,  0.4385,  0.6091,  0.7618,  0.9553,  1.1543,  1.3445,  1.5396,
-     1.7153,  1.9192,  2.1263,  2.3593,  2.5958,  2.8171,  2.9394,  3.0409,
-     0.1347,  0.2099,  0.2646,  0.3453,  0.5266,  0.7869,  1.0513,  1.2795,
-     1.4880,  1.7181,  1.9294,  2.1332,  2.3362,  2.5442,  2.7433,  2.9362,
-     0.3141,  0.5935,  0.7517,  0.8313,  0.8568,  0.9570,  1.0250,  1.1275,
-     1.3422,  1.6303,  1.8577,  2.0705,  2.2957,  2.5095,  2.7244,  2.9262,
-     0.0962,  0.2116,  0.3961,  0.5641,  0.7122,  0.8883,  1.1023,  1.3481,
-     1.5623,  1.7554,  1.9618,  2.1675,  2.3706,  2.5556,  2.7430,  2.9337,
-     0.0898,  0.1510,  0.3060,  0.5820,  0.8221,  1.0388,  1.2261,  1.4289,
-     1.6054,  1.8103,  1.9941,  2.1844,  2.3742,  2.5711,  2.7632,  2.9474,
-     0.1326,  0.2316,  0.3761,  0.5177,  0.6782,  0.8761,  1.0952,  1.3175,
-     1.5078,  1.7034,  1.9051,  2.1245,  2.3424,  2.5484,  2.7444,  2.9389,
-     0.1740,  0.3293,  0.5174,  0.6824,  0.8394,  1.0372,  1.2046,  1.3723,
-     1.5656,  1.7444,  1.9442,  2.1386,  2.3139,  2.4960,  2.7071,  2.9297,
-     0.2304,  0.3775,  0.4865,  0.6182,  0.7842,  0.9208,  1.1151,  1.2843,
-     1.4641,  1.6988,  1.9209,  2.1260,  2.3099,  2.5229,  2.7414,  2.9276,
-     0.0094,  0.0261, -0.0037,  0.0041, -0.0092, -0.0044, -0.0232, -0.0073,
-    -0.0047, -0.0021,  0.0250, -0.0580, -0.0140, -0.0342, -0.0586,  0.0020,
-     0.0449,  0.0155, -0.0523, -0.0279,  0.0299, -0.0183, -0.0736, -0.0639,
-    -0.0017,  0.0336,  0.0209,  0.0046,  0.0077, -0.0148, -0.0114, -0.0120,
-     0.0115, -0.0050,  0.0445,  0.0048,  0.0188, -0.0137, -0.0080,  0.0239,
-    -0.0184, -0.0524, -0.0195, -0.0126,  0.0284,  0.0632,  0.0141, -0.0093,
-    -0.0096,  0.0196,  0.0230,  0.0379,  0.0308,  0.0237, -0.0224, -0.0600,
-    -0.0755, -0.1074, -0.0988, -0.0606, -0.1038, -0.1552, -0.1480, -0.0672,
-     0.0504,  0.0676,  0.0336, -0.0042,  0.0729,  0.1013,  0.0868,  0.0846,
-     0.0954,  0.0515, -0.0066, -0.0851, -0.0485,  0.0294,  0.0395,  0.0087,
-     0.0078,  0.0446,  0.0881,  0.0672, -0.0384, -0.0025,  0.0415,  0.0353,
-     0.0080,  0.0052,  0.0190,  0.0182,  0.0069,  0.0168,  0.0374,  0.0037,
-    -0.0292, -0.0429,  0.0302,  0.0681, -0.0233, -0.0238, -0.0003, -0.0043,
-     0.0054, -0.0029, -0.0149,  0.0642,  0.0622,  0.0341, -0.0232, -0.0461,
-    -0.0082, -0.0469, -0.0618, -0.0326, -0.0452, -0.0649, -0.0597, -0.0398,
-    -0.0318, -0.0116,  0.0011,  0.0009, -0.0384, -0.0384, -0.0156, -0.0260,
-    -0.0007,  0.0473,  0.0111, -0.0358, -0.0484, -0.0204, -0.0029, -0.0090,
-    -0.0285, -0.0495, -0.0376,  0.0917,  0.1192,  0.1026,  0.0745,  0.0397,
-     0.0463,  0.0253,  0.0025,  0.0465,  0.0100,  0.0488,  0.0416,  0.0223,
-     0.0263,  0.0072, -0.0053,  0.0595,  0.0060, -0.0518, -0.0316, -0.0043,
-    -0.0133, -0.0233, -0.0075, -0.0251,  0.0277, -0.0067, -0.0136, -0.0004,
-     0.0235,  0.0112, -0.0182, -0.0324, -0.0210, -0.0035, -0.0395, -0.0384,
-     0.0005, -0.0150, -0.0356,  0.0127, -0.0033, -0.0034,  0.0205,  0.0747,
-     0.1138,  0.1015,  0.0995, -0.0161, -0.0045,  0.0129,  0.0472,  0.0575,
-     0.0222,  0.0091,  0.0037, -0.0471,  0.0371,  0.0132,  0.0208,  0.0247,
-     0.0117,  0.0164,  0.0225,  0.0124, -0.0023,  0.0088, -0.0046,  0.0047,
-    -0.0393,  0.0018,  0.0148,  0.0020,  0.0044,  0.0165,  0.0229, -0.0208,
-    -0.0477, -0.0310, -0.0164, -0.0390, -0.0764, -0.0525, -0.0094,  0.0075,
-    -0.0102, -0.0045, -0.0504, -0.0709,  0.0822,  0.0710,  0.0426,  0.0014,
-    -0.0371, -0.0400, -0.0157, -0.0155, -0.0173, -0.0138, -0.0015,  0.0134,
-    -0.0418, -0.0682, -0.0256,  0.0050,  0.0360,  0.0354,  0.0074, -0.0396,
-    -0.0235,  0.0284,  0.0494,  0.0153,  0.0448,  0.0025, -0.0061,  0.0252,
-     0.1000,  0.2260,  0.2158,  0.2116,  0.2198,  0.2055,  0.2110,  0.1873,
-     0.1907,  0.2071,  0.2164,  0.2009,  0.2059,  0.2124,  0.2141,  0.2093,
-     0.0875,  0.0981,  0.1177,  0.1071,  0.1033,  0.1248,  0.1048,  0.1238,
-     0.1166,  0.1008,  0.1062,  0.0992,  0.0994,  0.1067,  0.0999,  0.1187,
-     0.0750,  0.0794,  0.0828,  0.0854,  0.0859,  0.0801,  0.0891,  0.0933,
-     0.0969,  0.0920,  0.0915,  0.0862,  0.0868,  0.0891,  0.0842,  0.0824,
-     0.0625,  0.0930,  0.0815,  0.0853,  0.0898,  0.0828,  0.0822,  0.0910,
-     0.0873,  0.0906,  0.0856,  0.0840,  0.0774,  0.0785,  0.0684,  0.0711,
-     0.3319,  0.4219,  0.4588,  0.4090,  0.4092,  0.4014,  0.3548,  0.3353,
-     0.3708,  0.3352,  0.3720,  0.3538,  0.4084,  0.4289,  0.4060,  0.4210,
-     0.0588,  0.0209, -0.0082, -0.0115, -0.0343, -0.0621, -0.0541, -0.0346,
-    -0.0346, -0.0366, -0.0220, -0.0265, -0.0102,  0.0374,  0.0306,  0.0404,
-     0.0306,  0.0090, -0.0054,  0.0333,  0.0047,  0.0238,  0.0141,  0.0165,
-     0.0306,  0.0420,  0.0159,  0.0124,  0.0414,  0.0158, -0.0237,  0.0141,
-     0.0765,  0.0057, -0.0260, -0.0426, -0.0395, -0.0126, -0.0579, -0.0417,
-    -0.0429, -0.0615, -0.0893, -0.0618, -0.0384, -0.0134, -0.0232, -0.0238,
-};
-
 static const float lsp22[] = {
      0.0664,  0.1875,  0.4300,  0.6730,  0.8793,  1.0640,  1.2563,  1.4433,
      1.6394,  1.8176,  2.0029,  2.1921,  2.3796,  2.5671,  2.7595,  2.9536,
@@ -15117,9 +14946,10 @@ static const uint16_t bark_tab_s16_128[] = {
     2, 2, 2, 3, 3, 5, 7, 12, 25, 67
 };
 
+/* unused
 static const uint16_t bark_tab_s16_64[] = {
     1, 1, 2, 2, 3, 6, 11, 38
-};
+}; */
 
 static const uint16_t bark_tab_l16s_1024[] = {
       9,   9,   8,   9,  10,   9,  10,  10,
diff --git a/libavcodec/metasound_data.h b/libavcodec/metasound_data.h
index 4925516e21..5c334113cc 100644
--- a/libavcodec/metasound_data.h
+++ b/libavcodec/metasound_data.h
@@ -2,20 +2,20 @@
  * MetaSound decoder
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/microdvddec.c b/libavcodec/microdvddec.c
new file mode 100644
index 0000000000..96034a042a
--- /dev/null
+++ b/libavcodec/microdvddec.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * MicroDVD subtitle decoder
+ *
+ * Based on the specifications found here:
+ * https://trac.videolan.org/vlc/ticket/1825#comment:6
+ */
+
+#include "libavutil/avstring.h"
+#include "libavutil/parseutils.h"
+#include "libavutil/bprint.h"
+#include "avcodec.h"
+#include "ass.h"
+
+static int indexof(const char *s, int c)
+{
+    char *f = strchr(s, c);
+    return f ? (f - s) : -1;
+}
+
+struct microdvd_tag {
+    char key;
+    int persistent;
+    uint32_t data1;
+    uint32_t data2;
+    char *data_string;
+    int data_string_len;
+};
+
+#define MICRODVD_PERSISTENT_OFF     0
+#define MICRODVD_PERSISTENT_ON      1
+#define MICRODVD_PERSISTENT_OPENED  2
+
+// Color, Font, Size, cHarset, stYle, Position, cOordinate
+#define MICRODVD_TAGS "cfshyYpo"
+
+static void microdvd_set_tag(struct microdvd_tag *tags, struct microdvd_tag tag)
+{
+    int tag_index = indexof(MICRODVD_TAGS, tag.key);
+
+    if (tag_index < 0)
+        return;
+    memcpy(&tags[tag_index], &tag, sizeof(tag));
+}
+
+// italic, bold, underline, strike-through
+#define MICRODVD_STYLES "ibus"
+
+/* some samples have lines that start with a / indicating non persistent italic
+ * marker */
+static char *check_for_italic_slash_marker(struct microdvd_tag *tags, char *s)
+{
+    if (*s == '/') {
+        struct microdvd_tag tag = tags[indexof(MICRODVD_TAGS, 'y')];
+        tag.key = 'y';
+        tag.data1 |= 1 << 0 /* 'i' position in MICRODVD_STYLES */;
+        microdvd_set_tag(tags, tag);
+        s++;
+    }
+    return s;
+}
+
+static char *microdvd_load_tags(struct microdvd_tag *tags, char *s)
+{
+    s = check_for_italic_slash_marker(tags, s);
+
+    while (*s == '{') {
+        char *start = s;
+        char tag_char = *(s + 1);
+        struct microdvd_tag tag = {0};
+
+        if (!tag_char || *(s + 2) != ':')
+            break;
+        s += 3;
+
+        switch (tag_char) {
+
+        /* Style */
+        case 'Y':
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+        case 'y':
+            while (*s && *s != '}') {
+                int style_index = indexof(MICRODVD_STYLES, *s);
+
+                if (style_index >= 0)
+                    tag.data1 |= (1 << style_index);
+                s++;
+            }
+            if (*s != '}')
+                break;
+            /* We must distinguish persistent and non-persistent styles
+             * to handle this kind of style tags: {y:ib}{Y:us} */
+            tag.key = tag_char;
+            break;
+
+        /* Color */
+        case 'C':
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+        case 'c':
+            while (*s == '$' || *s == '#')
+                s++;
+            tag.data1 = strtol(s, &s, 16) & 0x00ffffff;
+            if (*s != '}')
+                break;
+            tag.key = 'c';
+            break;
+
+        /* Font name */
+        case 'F':
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+        case 'f': {
+            int len = indexof(s, '}');
+            if (len < 0)
+                break;
+            tag.data_string = s;
+            tag.data_string_len = len;
+            s += len;
+            tag.key = 'f';
+            break;
+        }
+
+        /* Font size */
+        case 'S':
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+        case 's':
+            tag.data1 = strtol(s, &s, 10);
+            if (*s != '}')
+                break;
+            tag.key = 's';
+            break;
+
+        /* Charset */
+        case 'H': {
+            //TODO: not yet handled, just parsed.
+            int len = indexof(s, '}');
+            if (len < 0)
+                break;
+            tag.data_string = s;
+            tag.data_string_len = len;
+            s += len;
+            tag.key = 'h';
+            break;
+        }
+
+        /* Position */
+        case 'P':
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+            tag.data1 = (*s++ == '1');
+            if (*s != '}')
+                break;
+            tag.key = 'p';
+            break;
+
+        /* Coordinates */
+        case 'o':
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+            tag.data1 = strtol(s, &s, 10);
+            if (*s != ',')
+                break;
+            s++;
+            tag.data2 = strtol(s, &s, 10);
+            if (*s != '}')
+                break;
+            tag.key = 'o';
+            break;
+
+        default:    /* Unknown tag, we consider it's text */
+            break;
+        }
+
+        if (tag.key == 0)
+            return start;
+
+        microdvd_set_tag(tags, tag);
+        s++;
+    }
+    return check_for_italic_slash_marker(tags, s);
+}
+
+static void microdvd_open_tags(AVBPrint *new_line, struct microdvd_tag *tags)
+{
+    int i, sidx;
+    for (i = 0; i < sizeof(MICRODVD_TAGS) - 1; i++) {
+        if (tags[i].persistent == MICRODVD_PERSISTENT_OPENED)
+            continue;
+        switch (tags[i].key) {
+        case 'Y':
+        case 'y':
+            for (sidx = 0; sidx < sizeof(MICRODVD_STYLES) - 1; sidx++)
+                if (tags[i].data1 & (1 << sidx))
+                    av_bprintf(new_line, "{\\%c1}", MICRODVD_STYLES[sidx]);
+            break;
+
+        case 'c':
+            av_bprintf(new_line, "{\\c&H%06X&}", tags[i].data1);
+            break;
+
+        case 'f':
+            av_bprintf(new_line, "{\\fn%.*s}",
+                       tags[i].data_string_len, tags[i].data_string);
+            break;
+
+        case 's':
+            av_bprintf(new_line, "{\\fs%d}", tags[i].data1);
+            break;
+
+        case 'p':
+            if (tags[i].data1 == 0)
+                av_bprintf(new_line, "{\\an8}");
+            break;
+
+        case 'o':
+            av_bprintf(new_line, "{\\pos(%d,%d)}",
+                       tags[i].data1, tags[i].data2);
+            break;
+        }
+        if (tags[i].persistent == MICRODVD_PERSISTENT_ON)
+            tags[i].persistent = MICRODVD_PERSISTENT_OPENED;
+    }
+}
+
+static void microdvd_close_no_persistent_tags(AVBPrint *new_line,
+                                              struct microdvd_tag *tags)
+{
+    int i, sidx;
+
+    for (i = sizeof(MICRODVD_TAGS) - 2; i >= 0; i--) {
+        if (tags[i].persistent != MICRODVD_PERSISTENT_OFF)
+            continue;
+        switch (tags[i].key) {
+
+        case 'y':
+            for (sidx = sizeof(MICRODVD_STYLES) - 2; sidx >= 0; sidx--)
+                if (tags[i].data1 & (1 << sidx))
+                    av_bprintf(new_line, "{\\%c0}", MICRODVD_STYLES[sidx]);
+            break;
+
+        case 'c':
+            av_bprintf(new_line, "{\\c}");
+            break;
+
+        case 'f':
+            av_bprintf(new_line, "{\\fn}");
+            break;
+
+        case 's':
+            av_bprintf(new_line, "{\\fs}");
+            break;
+        }
+        tags[i].key = 0;
+    }
+}
+
+static int microdvd_decode_frame(AVCodecContext *avctx,
+                                 void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    AVSubtitle *sub = data;
+    AVBPrint new_line;
+    char *line = avpkt->data;
+    char *end = avpkt->data + avpkt->size;
+    struct microdvd_tag tags[sizeof(MICRODVD_TAGS) - 1] = {{0}};
+
+    if (avpkt->size <= 0)
+        return avpkt->size;
+
+    av_bprint_init(&new_line, 0, 2048);
+
+    // subtitle content
+    while (line < end && *line) {
+
+        // parse MicroDVD tags, and open them in ASS
+        line = microdvd_load_tags(tags, line);
+        microdvd_open_tags(&new_line, tags);
+
+        // simple copy until EOL or forced carriage return
+        while (line < end && *line && *line != '|') {
+            av_bprint_chars(&new_line, *line, 1);
+            line++;
+        }
+
+        // line split
+        if (line < end && *line == '|') {
+            microdvd_close_no_persistent_tags(&new_line, tags);
+            av_bprintf(&new_line, "\\N");
+            line++;
+        }
+    }
+    if (new_line.len) {
+        int ret;
+            int64_t start    = avpkt->pts;
+            int64_t duration = avpkt->duration;
+            int ts_start     = av_rescale_q(start,    avctx->time_base, (AVRational){1,100});
+            int ts_duration  = duration != -1 ?
+                av_rescale_q(duration, avctx->time_base, (AVRational){1,100}) : -1;
+
+        ret = ff_ass_add_rect_bprint(sub, &new_line, ts_start, ts_duration);
+        av_bprint_finalize(&new_line, NULL);
+        if (ret < 0)
+            return ret;
+    }
+
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+static int microdvd_init(AVCodecContext *avctx)
+{
+    int i, sidx;
+    AVBPrint font_buf;
+    int font_size    = ASS_DEFAULT_FONT_SIZE;
+    int color        = ASS_DEFAULT_COLOR;
+    int bold         = ASS_DEFAULT_BOLD;
+    int italic       = ASS_DEFAULT_ITALIC;
+    int underline    = ASS_DEFAULT_UNDERLINE;
+    int alignment    = ASS_DEFAULT_ALIGNMENT;
+    struct microdvd_tag tags[sizeof(MICRODVD_TAGS) - 1] = {{0}};
+
+    av_bprint_init(&font_buf, 0, AV_BPRINT_SIZE_AUTOMATIC);
+    av_bprintf(&font_buf, "%s", ASS_DEFAULT_FONT);
+
+    if (avctx->extradata) {
+        microdvd_load_tags(tags, avctx->extradata);
+        for (i = 0; i < sizeof(MICRODVD_TAGS) - 1; i++) {
+            switch (av_tolower(tags[i].key)) {
+            case 'y':
+                for (sidx = 0; sidx < sizeof(MICRODVD_STYLES) - 1; sidx++) {
+                    if (tags[i].data1 & (1 << sidx)) {
+                        switch (MICRODVD_STYLES[sidx]) {
+                        case 'i': italic    = 1; break;
+                        case 'b': bold      = 1; break;
+                        case 'u': underline = 1; break;
+                        }
+                    }
+                }
+                break;
+
+            case 'c': color     = tags[i].data1; break;
+            case 's': font_size = tags[i].data1; break;
+            case 'p': alignment =             8; break;
+
+            case 'f':
+                av_bprint_clear(&font_buf);
+                av_bprintf(&font_buf, "%.*s",
+                           tags[i].data_string_len, tags[i].data_string);
+                break;
+            }
+        }
+    }
+    return ff_ass_subtitle_header(avctx, font_buf.str, font_size, color,
+                                  ASS_DEFAULT_BACK_COLOR, bold, italic,
+                                  underline, alignment);
+}
+
+AVCodec ff_microdvd_decoder = {
+    .name         = "microdvd",
+    .long_name    = NULL_IF_CONFIG_SMALL("MicroDVD subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_MICRODVD,
+    .init         = microdvd_init,
+    .decode       = microdvd_decode_frame,
+};
diff --git a/libavcodec/mimic.c b/libavcodec/mimic.c
index 6c4fc8ac7d..d3f9c0764a 100644
--- a/libavcodec/mimic.c
+++ b/libavcodec/mimic.c
@@ -2,20 +2,20 @@
  * Copyright (C) 2005  Ole André Vadla Ravnås <oleavr@gmail.com>
  * Copyright (C) 2008  Ramiro Polla
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -120,7 +120,8 @@ static av_cold int mimic_decode_end(AVCodecContext *avctx)
     MimicContext *ctx = avctx->priv_data;
     int i;
 
-    av_free(ctx->swap_buf);
+    av_freep(&ctx->swap_buf);
+    ctx->swap_buf_size = 0;
 
     for (i = 0; i < FF_ARRAY_ELEMS(ctx->frames); i++) {
         if (ctx->frames[i].f)
@@ -166,6 +167,7 @@ static av_cold int mimic_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+#if HAVE_THREADS
 static int mimic_decode_update_thread_context(AVCodecContext *avctx, const AVCodecContext *avctx_from)
 {
     MimicContext *dst = avctx->priv_data, *src = avctx_from->priv_data;
@@ -181,7 +183,7 @@ static int mimic_decode_update_thread_context(AVCodecContext *avctx, const AVCod
 
     for (i = 0; i < FF_ARRAY_ELEMS(dst->frames); i++) {
         ff_thread_release_buffer(avctx, &dst->frames[i]);
-        if (src->frames[i].f->data[0]) {
+        if (i != src->next_cur_index && src->frames[i].f->data[0]) {
             ret = ff_thread_ref_frame(&dst->frames[i], &src->frames[i]);
             if (ret < 0)
                 return ret;
@@ -190,6 +192,7 @@ static int mimic_decode_update_thread_context(AVCodecContext *avctx, const AVCod
 
     return 0;
 }
+#endif
 
 static const int8_t vlcdec_lookup[9][64] = {
     {    0, },
@@ -257,7 +260,7 @@ static int vlc_decode_block(MimicContext *ctx, int num_coeffs, int qscale)
 
         value = get_bits(&ctx->gb, num_bits);
 
-        /* Libav's IDCT behaves somewhat different from the original code, so
+        /* FFmpeg's IDCT behaves somewhat different from the original code, so
          * a factor of 4 was added to the input */
 
         coeff = vlcdec_lookup[num_bits][value];
@@ -394,8 +397,8 @@ static int mimic_decode_frame(AVCodecContext *avctx, void *data,
         avctx->height  = height;
         avctx->pix_fmt = AV_PIX_FMT_YUV420P;
         for (i = 0; i < 3; i++) {
-            ctx->num_vblocks[i] = -((-height) >> (3 + !!i));
-            ctx->num_hblocks[i] =     width   >> (3 + !!i);
+            ctx->num_vblocks[i] = FF_CEIL_RSHIFT(height,   3 + !!i);
+            ctx->num_hblocks[i] =                width >> (3 + !!i);
         }
     } else if (width != ctx->avctx->width || height != ctx->avctx->height) {
         avpriv_request_sample(avctx, "Resolution changing");
@@ -411,10 +414,8 @@ static int mimic_decode_frame(AVCodecContext *avctx, void *data,
     ctx->frames[ctx->cur_index].f->pict_type = is_pframe ? AV_PICTURE_TYPE_P :
                                                            AV_PICTURE_TYPE_I;
     if ((res = ff_thread_get_buffer(avctx, &ctx->frames[ctx->cur_index],
-                                    AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+                                    AV_GET_BUFFER_FLAG_REF)) < 0)
         return res;
-    }
 
     ctx->next_prev_index = ctx->cur_index;
     ctx->next_cur_index  = (ctx->cur_index - 1) & 15;
@@ -455,6 +456,7 @@ static int mimic_decode_frame(AVCodecContext *avctx, void *data,
     return buf_size;
 }
 
+#if HAVE_THREADS
 static av_cold int mimic_init_thread_copy(AVCodecContext *avctx)
 {
     MimicContext *ctx = avctx->priv_data;
@@ -470,6 +472,7 @@ static av_cold int mimic_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 AVCodec ff_mimic_decoder = {
     .name                  = "mimic",
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
new file mode 100644
index 0000000000..2e8b1ee479
--- /dev/null
+++ b/libavcodec/mips/Makefile
@@ -0,0 +1,79 @@
+MIPSFPU-OBJS-$(CONFIG_AMRNB_DECODER)      += mips/acelp_filters_mips.o     \
+                                             mips/celp_filters_mips.o      \
+                                             mips/celp_math_mips.o         \
+                                             mips/acelp_vectors_mips.o
+MIPSFPU-OBJS-$(CONFIG_AMRWB_DECODER)      += mips/acelp_filters_mips.o     \
+                                             mips/celp_filters_mips.o      \
+                                             mips/amrwbdec_mips.o          \
+                                             mips/celp_math_mips.o         \
+                                             mips/acelp_vectors_mips.o
+MIPSFPU-OBJS-$(CONFIG_MPEGAUDIODSP)       += mips/mpegaudiodsp_mips_float.o
+MIPSDSPR1-OBJS-$(CONFIG_MPEGAUDIODSP)     += mips/mpegaudiodsp_mips_fixed.o
+MIPSFPU-OBJS-$(CONFIG_FFT)                += mips/fft_mips.o
+MIPSFPU-OBJS-$(CONFIG_FMTCONVERT)         += mips/fmtconvert_mips.o
+OBJS-$(CONFIG_AC3DSP)                     += mips/ac3dsp_mips.o
+OBJS-$(CONFIG_AAC_DECODER)                += mips/aacdec_mips.o            \
+                                             mips/aacsbr_mips.o            \
+                                             mips/sbrdsp_mips.o            \
+                                             mips/aacpsdsp_mips.o
+MIPSDSPR1-OBJS-$(CONFIG_AAC_ENCODER)      += mips/aaccoder_mips.o
+MIPSFPU-OBJS-$(CONFIG_AAC_ENCODER)        += mips/iirfilter_mips.o
+OBJS-$(CONFIG_HEVC_DECODER)               += mips/hevcdsp_init_mips.o      \
+                                             mips/hevcpred_init_mips.o
+OBJS-$(CONFIG_VP9_DECODER)                += mips/vp9dsp_init_mips.o
+OBJS-$(CONFIG_VP8_DECODER)                += mips/vp8dsp_init_mips.o
+OBJS-$(CONFIG_H264DSP)                    += mips/h264dsp_init_mips.o
+OBJS-$(CONFIG_H264QPEL)                   += mips/h264qpel_init_mips.o
+OBJS-$(CONFIG_H264CHROMA)                 += mips/h264chroma_init_mips.o
+OBJS-$(CONFIG_H264PRED)                   += mips/h264pred_init_mips.o
+OBJS-$(CONFIG_H263DSP)                    += mips/h263dsp_init_mips.o
+OBJS-$(CONFIG_QPELDSP)                    += mips/qpeldsp_init_mips.o
+OBJS-$(CONFIG_HPELDSP)                    += mips/hpeldsp_init_mips.o
+OBJS-$(CONFIG_BLOCKDSP)                   += mips/blockdsp_init_mips.o
+OBJS-$(CONFIG_PIXBLOCKDSP)                += mips/pixblockdsp_init_mips.o
+OBJS-$(CONFIG_IDCTDSP)                    += mips/idctdsp_init_mips.o
+OBJS-$(CONFIG_MPEGVIDEO)                  += mips/mpegvideo_init_mips.o
+OBJS-$(CONFIG_MPEGVIDEOENC)               += mips/mpegvideoencdsp_init_mips.o
+OBJS-$(CONFIG_ME_CMP)                     += mips/me_cmp_init_mips.o
+OBJS-$(CONFIG_MPEG4_DECODER)              += mips/xvididct_init_mips.o
+MSA-OBJS-$(CONFIG_HEVC_DECODER)           += mips/hevcdsp_msa.o            \
+                                             mips/hevc_mc_uni_msa.o        \
+                                             mips/hevc_mc_uniw_msa.o       \
+                                             mips/hevc_mc_bi_msa.o         \
+                                             mips/hevc_mc_biw_msa.o        \
+                                             mips/hevc_idct_msa.o          \
+                                             mips/hevc_lpf_sao_msa.o       \
+                                             mips/hevcpred_msa.o
+MSA-OBJS-$(CONFIG_VP9_DECODER)            += mips/vp9_mc_msa.o             \
+                                             mips/vp9_lpf_msa.o            \
+                                             mips/vp9_idct_msa.o           \
+                                             mips/vp9_intra_msa.o
+MSA-OBJS-$(CONFIG_VP8_DECODER)            += mips/vp8_mc_msa.o             \
+                                             mips/vp8_idct_msa.o           \
+                                             mips/vp8_lpf_msa.o
+MSA-OBJS-$(CONFIG_H264DSP)                += mips/h264dsp_msa.o            \
+                                             mips/h264idct_msa.o
+MSA-OBJS-$(CONFIG_H264QPEL)               += mips/h264qpel_msa.o
+MSA-OBJS-$(CONFIG_H264CHROMA)             += mips/h264chroma_msa.o
+MSA-OBJS-$(CONFIG_H264PRED)               += mips/h264pred_msa.o
+MSA-OBJS-$(CONFIG_H263DSP)                += mips/h263dsp_msa.o
+MSA-OBJS-$(CONFIG_QPELDSP)                += mips/qpeldsp_msa.o
+MSA-OBJS-$(CONFIG_HPELDSP)                += mips/hpeldsp_msa.o
+MSA-OBJS-$(CONFIG_BLOCKDSP)               += mips/blockdsp_msa.o
+MSA-OBJS-$(CONFIG_PIXBLOCKDSP)            += mips/pixblockdsp_msa.o
+MSA-OBJS-$(CONFIG_IDCTDSP)                += mips/idctdsp_msa.o           \
+                                             mips/simple_idct_msa.o
+MSA-OBJS-$(CONFIG_MPEGVIDEO)              += mips/mpegvideo_msa.o
+MSA-OBJS-$(CONFIG_MPEGVIDEOENC)           += mips/mpegvideoencdsp_msa.o
+MSA-OBJS-$(CONFIG_ME_CMP)                 += mips/me_cmp_msa.o
+MMI-OBJS                                  += mips/constants.o
+MMI-OBJS-$(CONFIG_H264DSP)                += mips/h264dsp_mmi.o
+MMI-OBJS-$(CONFIG_H264CHROMA)             += mips/h264chroma_mmi.o
+MMI-OBJS-$(CONFIG_H264PRED)               += mips/h264pred_mmi.o
+MMI-OBJS-$(CONFIG_MPEGVIDEO)              += mips/mpegvideo_mmi.o
+MMI-OBJS-$(CONFIG_IDCTDSP)                += mips/idctdsp_mmi.o           \
+                                             mips/simple_idct_mmi.o
+MMI-OBJS-$(CONFIG_MPEG4_DECODER)          += mips/xvid_idct_mmi.o
+MMI-OBJS-$(CONFIG_BLOCKDSP)               += mips/blockdsp_mmi.o
+MMI-OBJS-$(CONFIG_PIXBLOCKDSP)            += mips/pixblockdsp_mmi.o
+MMI-OBJS-$(CONFIG_H264QPEL)               += mips/h264qpel_mmi.o
diff --git a/libavcodec/mips/aaccoder_mips.c b/libavcodec/mips/aaccoder_mips.c
new file mode 100644
index 0000000000..18d3f88743
--- /dev/null
+++ b/libavcodec/mips/aaccoder_mips.c
@@ -0,0 +1,2356 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
+ *          Szabolcs Pal     (sabolc@mips.com)
+ *
+ * AAC coefficients encoder optimized for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aaccoder.c
+ */
+
+#include "libavutil/libm.h"
+
+#include <float.h>
+#include "libavutil/mathematics.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/put_bits.h"
+#include "libavcodec/aac.h"
+#include "libavcodec/aacenc.h"
+#include "libavcodec/aactab.h"
+#include "libavcodec/aacenctab.h"
+
+#if HAVE_INLINE_ASM
+typedef struct BandCodingPath {
+    int prev_idx;
+    float cost;
+    int run;
+} BandCodingPath;
+
+static const uint8_t uquad_sign_bits[81] = {
+    0, 1, 1, 1, 2, 2, 1, 2, 2,
+    1, 2, 2, 2, 3, 3, 2, 3, 3,
+    1, 2, 2, 2, 3, 3, 2, 3, 3,
+    1, 2, 2, 2, 3, 3, 2, 3, 3,
+    2, 3, 3, 3, 4, 4, 3, 4, 4,
+    2, 3, 3, 3, 4, 4, 3, 4, 4,
+    1, 2, 2, 2, 3, 3, 2, 3, 3,
+    2, 3, 3, 3, 4, 4, 3, 4, 4,
+    2, 3, 3, 3, 4, 4, 3, 4, 4
+};
+
+static const uint8_t upair7_sign_bits[64] = {
+    0, 1, 1, 1, 1, 1, 1, 1,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+};
+
+static const uint8_t upair12_sign_bits[169] = {
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+
+static const uint8_t esc_sign_bits[289] = {
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+
+#define ROUND_STANDARD 0.4054f
+#define ROUND_TO_ZERO 0.1054f
+
+static void abs_pow34_v(float *out, const float *in, const int size) {
+#ifndef USE_REALLY_FULL_SEARCH
+    int i;
+    float a, b, c, d;
+    float ax, bx, cx, dx;
+
+    for (i = 0; i < size; i += 4) {
+        a = fabsf(in[i  ]);
+        b = fabsf(in[i+1]);
+        c = fabsf(in[i+2]);
+        d = fabsf(in[i+3]);
+
+        ax = sqrtf(a);
+        bx = sqrtf(b);
+        cx = sqrtf(c);
+        dx = sqrtf(d);
+
+        a = a * ax;
+        b = b * bx;
+        c = c * cx;
+        d = d * dx;
+
+        out[i  ] = sqrtf(a);
+        out[i+1] = sqrtf(b);
+        out[i+2] = sqrtf(c);
+        out[i+3] = sqrtf(d);
+    }
+#endif /* USE_REALLY_FULL_SEARCH */
+}
+
+static float find_max_val(int group_len, int swb_size, const float *scaled) {
+    float maxval = 0.0f;
+    int w2, i;
+    for (w2 = 0; w2 < group_len; w2++) {
+        for (i = 0; i < swb_size; i++) {
+            maxval = FFMAX(maxval, scaled[w2*128+i]);
+        }
+    }
+    return maxval;
+}
+
+static int find_min_book(float maxval, int sf) {
+    float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
+    float Q34 = sqrtf(Q * sqrtf(Q));
+    int qmaxval, cb;
+    if (qmaxval >= (FF_ARRAY_ELEMS(aac_maxval_cb)))
+        cb = 11;
+    else
+        cb = aac_maxval_cb[qmaxval];
+    return cb;
+}
+
+/**
+ * Functions developed from template function and optimized for quantizing and encoding band
+ */
+static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
+                                                     PutBitContext *pb, const float *in, float *out,
+                                                     const float *scaled, int size, int scale_idx,
+                                                     int cb, const float lambda, const float uplim,
+                                                     int *bits, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+
+    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
+
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+    for (i = 0; i < size; i += 4) {
+        int curidx;
+        int *in_int = (int *)&in[i];
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+        const float *vec;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "slt    %[qc1], $zero,  %[qc1]  \n\t"
+            "slt    %[qc2], $zero,  %[qc2]  \n\t"
+            "slt    %[qc3], $zero,  %[qc3]  \n\t"
+            "slt    %[qc4], $zero,  %[qc4]  \n\t"
+            "lw     %[t0],  0(%[in_int])    \n\t"
+            "lw     %[t1],  4(%[in_int])    \n\t"
+            "lw     %[t2],  8(%[in_int])    \n\t"
+            "lw     %[t3],  12(%[in_int])   \n\t"
+            "srl    %[t0],  %[t0],  31      \n\t"
+            "srl    %[t1],  %[t1],  31      \n\t"
+            "srl    %[t2],  %[t2],  31      \n\t"
+            "srl    %[t3],  %[t3],  31      \n\t"
+            "subu   %[t4],  $zero,  %[qc1]  \n\t"
+            "subu   %[t5],  $zero,  %[qc2]  \n\t"
+            "subu   %[t6],  $zero,  %[qc3]  \n\t"
+            "subu   %[t7],  $zero,  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t5],  %[t1]   \n\t"
+            "movn   %[qc3], %[t6],  %[t2]   \n\t"
+            "movn   %[qc4], %[t7],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+        curidx += 40;
+
+        put_bits(pb, p_bits[curidx], p_codes[curidx]);
+
+        if (out) {
+           vec = &p_vec[curidx*4];
+           out[i+0] = vec[0] * IQ;
+           out[i+1] = vec[1] * IQ;
+           out[i+2] = vec[2] * IQ;
+           out[i+3] = vec[3] * IQ;
+        }
+    }
+}
+
+static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
+                                                     PutBitContext *pb, const float *in, float *out,
+                                                     const float *scaled, int size, int scale_idx,
+                                                     int cb, const float lambda, const float uplim,
+                                                     int *bits, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+
+    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
+
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+    for (i = 0; i < size; i += 4) {
+        int curidx, sign, count;
+        int *in_int = (int *)&in[i];
+        uint8_t v_bits;
+        unsigned int v_codes;
+        int t0, t1, t2, t3, t4;
+        const float *vec;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                              \n\t"
+            ".set noreorder                         \n\t"
+
+            "ori    %[t4],      $zero,      2       \n\t"
+            "ori    %[sign],    $zero,      0       \n\t"
+            "slt    %[t0],      %[t4],      %[qc1]  \n\t"
+            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
+            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
+            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
+            "movn   %[qc1],     %[t4],      %[t0]   \n\t"
+            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
+            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
+            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
+            "lw     %[t0],      0(%[in_int])        \n\t"
+            "lw     %[t1],      4(%[in_int])        \n\t"
+            "lw     %[t2],      8(%[in_int])        \n\t"
+            "lw     %[t3],      12(%[in_int])       \n\t"
+            "slt    %[t0],      %[t0],      $zero   \n\t"
+            "movn   %[sign],    %[t0],      %[qc1]  \n\t"
+            "slt    %[t1],      %[t1],      $zero   \n\t"
+            "slt    %[t2],      %[t2],      $zero   \n\t"
+            "slt    %[t3],      %[t3],      $zero   \n\t"
+            "sll    %[t0],      %[sign],    1       \n\t"
+            "or     %[t0],      %[t0],      %[t1]   \n\t"
+            "movn   %[sign],    %[t0],      %[qc2]  \n\t"
+            "slt    %[t4],      $zero,      %[qc1]  \n\t"
+            "slt    %[t1],      $zero,      %[qc2]  \n\t"
+            "slt    %[count],   $zero,      %[qc3]  \n\t"
+            "sll    %[t0],      %[sign],    1       \n\t"
+            "or     %[t0],      %[t0],      %[t2]   \n\t"
+            "movn   %[sign],    %[t0],      %[qc3]  \n\t"
+            "slt    %[t2],      $zero,      %[qc4]  \n\t"
+            "addu   %[count],   %[count],   %[t4]   \n\t"
+            "addu   %[count],   %[count],   %[t1]   \n\t"
+            "sll    %[t0],      %[sign],    1       \n\t"
+            "or     %[t0],      %[t0],      %[t3]   \n\t"
+            "movn   %[sign],    %[t0],      %[qc4]  \n\t"
+            "addu   %[count],   %[count],   %[t2]   \n\t"
+
+            ".set pop                               \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign]"=&r"(sign), [count]"=&r"(count),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+
+        v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
+        v_bits  = p_bits[curidx] + count;
+        put_bits(pb, v_bits, v_codes);
+
+        if (out) {
+           vec = &p_vec[curidx*4];
+           out[i+0] = copysignf(vec[0] * IQ, in[i+0]);
+           out[i+1] = copysignf(vec[1] * IQ, in[i+1]);
+           out[i+2] = copysignf(vec[2] * IQ, in[i+2]);
+           out[i+3] = copysignf(vec[3] * IQ, in[i+3]);
+        }
+    }
+}
+
+static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
+                                                     PutBitContext *pb, const float *in, float *out,
+                                                     const float *scaled, int size, int scale_idx,
+                                                     int cb, const float lambda, const float uplim,
+                                                     int *bits, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+
+    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
+
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int *in_int = (int *)&in[i];
+        uint8_t v_bits;
+        unsigned int v_codes;
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+        const float *vec1, *vec2;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "ori    %[t4],  $zero,  4       \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+            "lw     %[t0],  0(%[in_int])    \n\t"
+            "lw     %[t1],  4(%[in_int])    \n\t"
+            "lw     %[t2],  8(%[in_int])    \n\t"
+            "lw     %[t3],  12(%[in_int])   \n\t"
+            "srl    %[t0],  %[t0],  31      \n\t"
+            "srl    %[t1],  %[t1],  31      \n\t"
+            "srl    %[t2],  %[t2],  31      \n\t"
+            "srl    %[t3],  %[t3],  31      \n\t"
+            "subu   %[t4],  $zero,  %[qc1]  \n\t"
+            "subu   %[t5],  $zero,  %[qc2]  \n\t"
+            "subu   %[t6],  $zero,  %[qc3]  \n\t"
+            "subu   %[t7],  $zero,  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t5],  %[t1]   \n\t"
+            "movn   %[qc3], %[t6],  %[t2]   \n\t"
+            "movn   %[qc4], %[t7],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = 9 * qc1;
+        curidx += qc2 + 40;
+
+        curidx2 = 9 * qc3;
+        curidx2 += qc4 + 40;
+
+        v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
+        v_bits  = p_bits[curidx] + p_bits[curidx2];
+        put_bits(pb, v_bits, v_codes);
+
+        if (out) {
+           vec1 = &p_vec[curidx*2 ];
+           vec2 = &p_vec[curidx2*2];
+           out[i+0] = vec1[0] * IQ;
+           out[i+1] = vec1[1] * IQ;
+           out[i+2] = vec2[0] * IQ;
+           out[i+3] = vec2[1] * IQ;
+        }
+    }
+}
+
+static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
+                                                      PutBitContext *pb, const float *in, float *out,
+                                                      const float *scaled, int size, int scale_idx,
+                                                      int cb, const float lambda, const float uplim,
+                                                      int *bits, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+
+    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
+
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+    for (i = 0; i < size; i += 4) {
+        int curidx1, curidx2, sign1, count1, sign2, count2;
+        int *in_int = (int *)&in[i];
+        uint8_t v_bits;
+        unsigned int v_codes;
+        int t0, t1, t2, t3, t4;
+        const float *vec1, *vec2;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                              \n\t"
+            ".set noreorder                         \n\t"
+
+            "ori    %[t4],      $zero,      7       \n\t"
+            "ori    %[sign1],   $zero,      0       \n\t"
+            "ori    %[sign2],   $zero,      0       \n\t"
+            "slt    %[t0],      %[t4],      %[qc1]  \n\t"
+            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
+            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
+            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
+            "movn   %[qc1],     %[t4],      %[t0]   \n\t"
+            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
+            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
+            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
+            "lw     %[t0],      0(%[in_int])        \n\t"
+            "lw     %[t1],      4(%[in_int])        \n\t"
+            "lw     %[t2],      8(%[in_int])        \n\t"
+            "lw     %[t3],      12(%[in_int])       \n\t"
+            "slt    %[t0],      %[t0],      $zero   \n\t"
+            "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
+            "slt    %[t2],      %[t2],      $zero   \n\t"
+            "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
+            "slt    %[t1],      %[t1],      $zero   \n\t"
+            "sll    %[t0],      %[sign1],   1       \n\t"
+            "or     %[t0],      %[t0],      %[t1]   \n\t"
+            "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
+            "slt    %[t3],      %[t3],      $zero   \n\t"
+            "sll    %[t0],      %[sign2],   1       \n\t"
+            "or     %[t0],      %[t0],      %[t3]   \n\t"
+            "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
+            "slt    %[count1],  $zero,      %[qc1]  \n\t"
+            "slt    %[t1],      $zero,      %[qc2]  \n\t"
+            "slt    %[count2],  $zero,      %[qc3]  \n\t"
+            "slt    %[t2],      $zero,      %[qc4]  \n\t"
+            "addu   %[count1],  %[count1],  %[t1]   \n\t"
+            "addu   %[count2],  %[count2],  %[t2]   \n\t"
+
+            ".set pop                               \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "t0", "t1", "t2", "t3", "t4",
+              "memory"
+        );
+
+        curidx1  = 8 * qc1;
+        curidx1 += qc2;
+
+        v_codes = (p_codes[curidx1] << count1) | sign1;
+        v_bits  = p_bits[curidx1] + count1;
+        put_bits(pb, v_bits, v_codes);
+
+        curidx2  = 8 * qc3;
+        curidx2 += qc4;
+
+        v_codes = (p_codes[curidx2] << count2) | sign2;
+        v_bits  = p_bits[curidx2] + count2;
+        put_bits(pb, v_bits, v_codes);
+
+        if (out) {
+           vec1 = &p_vec[curidx1*2];
+           vec2 = &p_vec[curidx2*2];
+           out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
+           out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
+           out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
+           out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
+        }
+    }
+}
+
+static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
+                                                       PutBitContext *pb, const float *in, float *out,
+                                                       const float *scaled, int size, int scale_idx,
+                                                       int cb, const float lambda, const float uplim,
+                                                       int *bits, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+
+    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+    for (i = 0; i < size; i += 4) {
+        int curidx1, curidx2, sign1, count1, sign2, count2;
+        int *in_int = (int *)&in[i];
+        uint8_t v_bits;
+        unsigned int v_codes;
+        int t0, t1, t2, t3, t4;
+        const float *vec1, *vec2;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                              \n\t"
+            ".set noreorder                         \n\t"
+
+            "ori    %[t4],      $zero,      12      \n\t"
+            "ori    %[sign1],   $zero,      0       \n\t"
+            "ori    %[sign2],   $zero,      0       \n\t"
+            "slt    %[t0],      %[t4],      %[qc1]  \n\t"
+            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
+            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
+            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
+            "movn   %[qc1],     %[t4],      %[t0]   \n\t"
+            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
+            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
+            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
+            "lw     %[t0],      0(%[in_int])        \n\t"
+            "lw     %[t1],      4(%[in_int])        \n\t"
+            "lw     %[t2],      8(%[in_int])        \n\t"
+            "lw     %[t3],      12(%[in_int])       \n\t"
+            "slt    %[t0],      %[t0],      $zero   \n\t"
+            "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
+            "slt    %[t2],      %[t2],      $zero   \n\t"
+            "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
+            "slt    %[t1],      %[t1],      $zero   \n\t"
+            "sll    %[t0],      %[sign1],   1       \n\t"
+            "or     %[t0],      %[t0],      %[t1]   \n\t"
+            "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
+            "slt    %[t3],      %[t3],      $zero   \n\t"
+            "sll    %[t0],      %[sign2],   1       \n\t"
+            "or     %[t0],      %[t0],      %[t3]   \n\t"
+            "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
+            "slt    %[count1],  $zero,      %[qc1]  \n\t"
+            "slt    %[t1],      $zero,      %[qc2]  \n\t"
+            "slt    %[count2],  $zero,      %[qc3]  \n\t"
+            "slt    %[t2],      $zero,      %[qc4]  \n\t"
+            "addu   %[count1],  %[count1],  %[t1]   \n\t"
+            "addu   %[count2],  %[count2],  %[t2]   \n\t"
+
+            ".set pop                               \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx1  = 13 * qc1;
+        curidx1 += qc2;
+
+        v_codes = (p_codes[curidx1] << count1) | sign1;
+        v_bits  = p_bits[curidx1] + count1;
+        put_bits(pb, v_bits, v_codes);
+
+        curidx2  = 13 * qc3;
+        curidx2 += qc4;
+
+        v_codes = (p_codes[curidx2] << count2) | sign2;
+        v_bits  = p_bits[curidx2] + count2;
+        put_bits(pb, v_bits, v_codes);
+
+        if (out) {
+           vec1 = &p_vec[curidx1*2];
+           vec2 = &p_vec[curidx2*2];
+           out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
+           out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
+           out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
+           out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
+        }
+    }
+}
+
+static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
+                                                   PutBitContext *pb, const float *in, float *out,
+                                                   const float *scaled, int size, int scale_idx,
+                                                   int cb, const float lambda, const float uplim,
+                                                   int *bits, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+
+    uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
+    float    *p_vectors = (float*   )ff_aac_codebook_vectors[cb-1];
+
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+
+    if (cb < 11) {
+        for (i = 0; i < size; i += 4) {
+            int curidx, curidx2, sign1, count1, sign2, count2;
+            int *in_int = (int *)&in[i];
+            uint8_t v_bits;
+            unsigned int v_codes;
+            int t0, t1, t2, t3, t4;
+            const float *vec1, *vec2;
+
+            qc1 = scaled[i  ] * Q34 + ROUNDING;
+            qc2 = scaled[i+1] * Q34 + ROUNDING;
+            qc3 = scaled[i+2] * Q34 + ROUNDING;
+            qc4 = scaled[i+3] * Q34 + ROUNDING;
+
+            __asm__ volatile (
+                ".set push                                  \n\t"
+                ".set noreorder                             \n\t"
+
+                "ori        %[t4],      $zero,      16      \n\t"
+                "ori        %[sign1],   $zero,      0       \n\t"
+                "ori        %[sign2],   $zero,      0       \n\t"
+                "slt        %[t0],      %[t4],      %[qc1]  \n\t"
+                "slt        %[t1],      %[t4],      %[qc2]  \n\t"
+                "slt        %[t2],      %[t4],      %[qc3]  \n\t"
+                "slt        %[t3],      %[t4],      %[qc4]  \n\t"
+                "movn       %[qc1],     %[t4],      %[t0]   \n\t"
+                "movn       %[qc2],     %[t4],      %[t1]   \n\t"
+                "movn       %[qc3],     %[t4],      %[t2]   \n\t"
+                "movn       %[qc4],     %[t4],      %[t3]   \n\t"
+                "lw         %[t0],      0(%[in_int])        \n\t"
+                "lw         %[t1],      4(%[in_int])        \n\t"
+                "lw         %[t2],      8(%[in_int])        \n\t"
+                "lw         %[t3],      12(%[in_int])       \n\t"
+                "slt        %[t0],      %[t0],      $zero   \n\t"
+                "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
+                "slt        %[t2],      %[t2],      $zero   \n\t"
+                "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
+                "slt        %[t1],      %[t1],      $zero   \n\t"
+                "sll        %[t0],      %[sign1],   1       \n\t"
+                "or         %[t0],      %[t0],      %[t1]   \n\t"
+                "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
+                "slt        %[t3],      %[t3],      $zero   \n\t"
+                "sll        %[t0],      %[sign2],   1       \n\t"
+                "or         %[t0],      %[t0],      %[t3]   \n\t"
+                "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
+                "slt        %[count1],  $zero,      %[qc1]  \n\t"
+                "slt        %[t1],      $zero,      %[qc2]  \n\t"
+                "slt        %[count2],  $zero,      %[qc3]  \n\t"
+                "slt        %[t2],      $zero,      %[qc4]  \n\t"
+                "addu       %[count1],  %[count1],  %[t1]   \n\t"
+                "addu       %[count2],  %[count2],  %[t2]   \n\t"
+
+                ".set pop                                   \n\t"
+
+                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+                  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+                  [t4]"=&r"(t4)
+                : [in_int]"r"(in_int)
+                : "memory"
+            );
+
+            curidx = 17 * qc1;
+            curidx += qc2;
+            curidx2 = 17 * qc3;
+            curidx2 += qc4;
+
+            v_codes = (p_codes[curidx] << count1) | sign1;
+            v_bits  = p_bits[curidx] + count1;
+            put_bits(pb, v_bits, v_codes);
+
+            v_codes = (p_codes[curidx2] << count2) | sign2;
+            v_bits  = p_bits[curidx2] + count2;
+            put_bits(pb, v_bits, v_codes);
+
+            if (out) {
+               vec1 = &p_vectors[curidx*2 ];
+               vec2 = &p_vectors[curidx2*2];
+               out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
+               out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
+               out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
+               out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
+            }
+        }
+    } else {
+        for (i = 0; i < size; i += 4) {
+            int curidx, curidx2, sign1, count1, sign2, count2;
+            int *in_int = (int *)&in[i];
+            uint8_t v_bits;
+            unsigned int v_codes;
+            int c1, c2, c3, c4;
+            int t0, t1, t2, t3, t4;
+            const float *vec1, *vec2;
+
+            qc1 = scaled[i  ] * Q34 + ROUNDING;
+            qc2 = scaled[i+1] * Q34 + ROUNDING;
+            qc3 = scaled[i+2] * Q34 + ROUNDING;
+            qc4 = scaled[i+3] * Q34 + ROUNDING;
+
+            __asm__ volatile (
+                ".set push                                  \n\t"
+                ".set noreorder                             \n\t"
+
+                "ori        %[t4],      $zero,      16      \n\t"
+                "ori        %[sign1],   $zero,      0       \n\t"
+                "ori        %[sign2],   $zero,      0       \n\t"
+                "shll_s.w   %[c1],      %[qc1],     18      \n\t"
+                "shll_s.w   %[c2],      %[qc2],     18      \n\t"
+                "shll_s.w   %[c3],      %[qc3],     18      \n\t"
+                "shll_s.w   %[c4],      %[qc4],     18      \n\t"
+                "srl        %[c1],      %[c1],      18      \n\t"
+                "srl        %[c2],      %[c2],      18      \n\t"
+                "srl        %[c3],      %[c3],      18      \n\t"
+                "srl        %[c4],      %[c4],      18      \n\t"
+                "slt        %[t0],      %[t4],      %[qc1]  \n\t"
+                "slt        %[t1],      %[t4],      %[qc2]  \n\t"
+                "slt        %[t2],      %[t4],      %[qc3]  \n\t"
+                "slt        %[t3],      %[t4],      %[qc4]  \n\t"
+                "movn       %[qc1],     %[t4],      %[t0]   \n\t"
+                "movn       %[qc2],     %[t4],      %[t1]   \n\t"
+                "movn       %[qc3],     %[t4],      %[t2]   \n\t"
+                "movn       %[qc4],     %[t4],      %[t3]   \n\t"
+                "lw         %[t0],      0(%[in_int])        \n\t"
+                "lw         %[t1],      4(%[in_int])        \n\t"
+                "lw         %[t2],      8(%[in_int])        \n\t"
+                "lw         %[t3],      12(%[in_int])       \n\t"
+                "slt        %[t0],      %[t0],      $zero   \n\t"
+                "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
+                "slt        %[t2],      %[t2],      $zero   \n\t"
+                "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
+                "slt        %[t1],      %[t1],      $zero   \n\t"
+                "sll        %[t0],      %[sign1],   1       \n\t"
+                "or         %[t0],      %[t0],      %[t1]   \n\t"
+                "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
+                "slt        %[t3],      %[t3],      $zero   \n\t"
+                "sll        %[t0],      %[sign2],   1       \n\t"
+                "or         %[t0],      %[t0],      %[t3]   \n\t"
+                "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
+                "slt        %[count1],  $zero,      %[qc1]  \n\t"
+                "slt        %[t1],      $zero,      %[qc2]  \n\t"
+                "slt        %[count2],  $zero,      %[qc3]  \n\t"
+                "slt        %[t2],      $zero,      %[qc4]  \n\t"
+                "addu       %[count1],  %[count1],  %[t1]   \n\t"
+                "addu       %[count2],  %[count2],  %[t2]   \n\t"
+
+                ".set pop                                   \n\t"
+
+                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+                  [c1]"=&r"(c1), [c2]"=&r"(c2),
+                  [c3]"=&r"(c3), [c4]"=&r"(c4),
+                  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+                  [t4]"=&r"(t4)
+                : [in_int]"r"(in_int)
+                : "memory"
+            );
+
+            curidx = 17 * qc1;
+            curidx += qc2;
+
+            curidx2 = 17 * qc3;
+            curidx2 += qc4;
+
+            v_codes = (p_codes[curidx] << count1) | sign1;
+            v_bits  = p_bits[curidx] + count1;
+            put_bits(pb, v_bits, v_codes);
+
+            if (p_vectors[curidx*2  ] == 64.0f) {
+                int len = av_log2(c1);
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
+                put_bits(pb, len * 2 - 3, v_codes);
+            }
+            if (p_vectors[curidx*2+1] == 64.0f) {
+                int len = av_log2(c2);
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
+                put_bits(pb, len*2-3, v_codes);
+            }
+
+            v_codes = (p_codes[curidx2] << count2) | sign2;
+            v_bits  = p_bits[curidx2] + count2;
+            put_bits(pb, v_bits, v_codes);
+
+            if (p_vectors[curidx2*2  ] == 64.0f) {
+                int len = av_log2(c3);
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
+                put_bits(pb, len* 2 - 3, v_codes);
+            }
+            if (p_vectors[curidx2*2+1] == 64.0f) {
+                int len = av_log2(c4);
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
+                put_bits(pb, len * 2 - 3, v_codes);
+            }
+
+            if (out) {
+               vec1 = &p_vectors[curidx*2];
+               vec2 = &p_vectors[curidx2*2];
+               out[i+0] = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
+               out[i+1] = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
+               out[i+2] = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
+               out[i+3] = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
+            }
+        }
+    }
+}
+
+static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
+                                                         PutBitContext *pb, const float *in, float *out,
+                                                         const float *scaled, int size, int scale_idx,
+                                                         int cb, const float lambda, const float uplim,
+                                                         int *bits, const float ROUNDING) {
+    av_assert0(0);
+}
+
+static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
+                                                         PutBitContext *pb, const float *in, float *out,
+                                                         const float *scaled, int size, int scale_idx,
+                                                         int cb, const float lambda, const float uplim,
+                                                         int *bits, const float ROUNDING) {
+    int i;
+    if (bits)
+        *bits = 0;
+    if (out) {
+        for (i = 0; i < size; i += 4) {
+           out[i  ] = 0.0f;
+           out[i+1] = 0.0f;
+           out[i+2] = 0.0f;
+           out[i+3] = 0.0f;
+        }
+    }
+}
+
+static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
+                                                         PutBitContext *pb, const float *in, float *out,
+                                                         const float *scaled, int size, int scale_idx,
+                                                         int cb, const float lambda, const float uplim,
+                                                         int *bits, const float ROUNDING) = {
+    quantize_and_encode_band_cost_ZERO_mips,
+    quantize_and_encode_band_cost_SQUAD_mips,
+    quantize_and_encode_band_cost_SQUAD_mips,
+    quantize_and_encode_band_cost_UQUAD_mips,
+    quantize_and_encode_band_cost_UQUAD_mips,
+    quantize_and_encode_band_cost_SPAIR_mips,
+    quantize_and_encode_band_cost_SPAIR_mips,
+    quantize_and_encode_band_cost_UPAIR7_mips,
+    quantize_and_encode_band_cost_UPAIR7_mips,
+    quantize_and_encode_band_cost_UPAIR12_mips,
+    quantize_and_encode_band_cost_UPAIR12_mips,
+    quantize_and_encode_band_cost_ESC_mips,
+    quantize_and_encode_band_cost_NONE_mips, /* cb 12 doesn't exist */
+    quantize_and_encode_band_cost_ZERO_mips,
+    quantize_and_encode_band_cost_ZERO_mips,
+    quantize_and_encode_band_cost_ZERO_mips,
+};
+
+#define quantize_and_encode_band_cost(                                       \
+                                s, pb, in, out, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, ROUNDING)               \
+    quantize_and_encode_band_cost_arr[cb](                                   \
+                                s, pb, in, out, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, ROUNDING)
+
+static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
+                                          const float *in, float *out, int size, int scale_idx,
+                                          int cb, const float lambda, int rtz)
+{
+    quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
+                                  INFINITY, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
+}
+
+/**
+ * Functions developed from template function and optimized for getting the number of bits
+ */
+static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
+                                        PutBitContext *pb, const float *in,
+                                        const float *scaled, int size, int scale_idx,
+                                        int cb, const float lambda, const float uplim,
+                                        int *bits)
+{
+    return 0;
+}
+
+static float get_band_numbits_NONE_mips(struct AACEncContext *s,
+                                        PutBitContext *pb, const float *in,
+                                        const float *scaled, int size, int scale_idx,
+                                        int cb, const float lambda, const float uplim,
+                                        int *bits)
+{
+    av_assert0(0);
+    return 0;
+}
+
+static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
+                                         PutBitContext *pb, const float *in,
+                                         const float *scaled, int size, int scale_idx,
+                                         int cb, const float lambda, const float uplim,
+                                         int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx;
+        int *in_int = (int *)&in[i];
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "slt    %[qc1], $zero,  %[qc1]  \n\t"
+            "slt    %[qc2], $zero,  %[qc2]  \n\t"
+            "slt    %[qc3], $zero,  %[qc3]  \n\t"
+            "slt    %[qc4], $zero,  %[qc4]  \n\t"
+            "lw     %[t0],  0(%[in_int])    \n\t"
+            "lw     %[t1],  4(%[in_int])    \n\t"
+            "lw     %[t2],  8(%[in_int])    \n\t"
+            "lw     %[t3],  12(%[in_int])   \n\t"
+            "srl    %[t0],  %[t0],  31      \n\t"
+            "srl    %[t1],  %[t1],  31      \n\t"
+            "srl    %[t2],  %[t2],  31      \n\t"
+            "srl    %[t3],  %[t3],  31      \n\t"
+            "subu   %[t4],  $zero,  %[qc1]  \n\t"
+            "subu   %[t5],  $zero,  %[qc2]  \n\t"
+            "subu   %[t6],  $zero,  %[qc3]  \n\t"
+            "subu   %[t7],  $zero,  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t5],  %[t1]   \n\t"
+            "movn   %[qc3], %[t6],  %[t2]   \n\t"
+            "movn   %[qc4], %[t7],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+        curidx += 40;
+
+        curbits += p_bits[curidx];
+    }
+    return curbits;
+}
+
+static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
+                                         PutBitContext *pb, const float *in,
+                                         const float *scaled, int size, int scale_idx,
+                                         int cb, const float lambda, const float uplim,
+                                         int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int curbits = 0;
+    int qc1, qc2, qc3, qc4;
+
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "ori    %[t4],  $zero,  2       \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += uquad_sign_bits[curidx];
+    }
+    return curbits;
+}
+
+static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
+                                         PutBitContext *pb, const float *in,
+                                         const float *scaled, int size, int scale_idx,
+                                         int cb, const float lambda, const float uplim,
+                                         int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int *in_int = (int *)&in[i];
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "ori    %[t4],  $zero,  4       \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+            "lw     %[t0],  0(%[in_int])    \n\t"
+            "lw     %[t1],  4(%[in_int])    \n\t"
+            "lw     %[t2],  8(%[in_int])    \n\t"
+            "lw     %[t3],  12(%[in_int])   \n\t"
+            "srl    %[t0],  %[t0],  31      \n\t"
+            "srl    %[t1],  %[t1],  31      \n\t"
+            "srl    %[t2],  %[t2],  31      \n\t"
+            "srl    %[t3],  %[t3],  31      \n\t"
+            "subu   %[t4],  $zero,  %[qc1]  \n\t"
+            "subu   %[t5],  $zero,  %[qc2]  \n\t"
+            "subu   %[t6],  $zero,  %[qc3]  \n\t"
+            "subu   %[t7],  $zero,  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t5],  %[t1]   \n\t"
+            "movn   %[qc3], %[t6],  %[t2]   \n\t"
+            "movn   %[qc4], %[t7],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx  = 9 * qc1;
+        curidx += qc2 + 40;
+
+        curidx2  = 9 * qc3;
+        curidx2 += qc4 + 40;
+
+        curbits += p_bits[curidx] + p_bits[curidx2];
+    }
+    return curbits;
+}
+
+static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
+                                          PutBitContext *pb, const float *in,
+                                          const float *scaled, int size, int scale_idx,
+                                          int cb, const float lambda, const float uplim,
+                                          int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "ori    %[t4],  $zero,  7       \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+        );
+
+        curidx  = 8 * qc1;
+        curidx += qc2;
+
+        curidx2  = 8 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx] +
+                   upair7_sign_bits[curidx] +
+                   p_bits[curidx2] +
+                   upair7_sign_bits[curidx2];
+    }
+    return curbits;
+}
+
+static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
+                                           PutBitContext *pb, const float *in,
+                                           const float *scaled, int size, int scale_idx,
+                                           int cb, const float lambda, const float uplim,
+                                           int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "ori    %[t4],  $zero,  12      \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+        );
+
+        curidx  = 13 * qc1;
+        curidx += qc2;
+
+        curidx2  = 13 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx] +
+                   p_bits[curidx2] +
+                   upair12_sign_bits[curidx] +
+                   upair12_sign_bits[curidx2];
+    }
+    return curbits;
+}
+
+static float get_band_numbits_ESC_mips(struct AACEncContext *s,
+                                       PutBitContext *pb, const float *in,
+                                       const float *scaled, int size, int scale_idx,
+                                       int cb, const float lambda, const float uplim,
+                                       int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int cond0, cond1, cond2, cond3;
+        int c1, c2, c3, c4;
+        int t4, t5;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "ori        %[t4],      $zero,  15          \n\t"
+            "ori        %[t5],      $zero,  16          \n\t"
+            "shll_s.w   %[c1],      %[qc1], 18          \n\t"
+            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
+            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
+            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
+            "srl        %[c1],      %[c1],  18          \n\t"
+            "srl        %[c2],      %[c2],  18          \n\t"
+            "srl        %[c3],      %[c3],  18          \n\t"
+            "srl        %[c4],      %[c4],  18          \n\t"
+            "slt        %[cond0],   %[t4],  %[qc1]      \n\t"
+            "slt        %[cond1],   %[t4],  %[qc2]      \n\t"
+            "slt        %[cond2],   %[t4],  %[qc3]      \n\t"
+            "slt        %[cond3],   %[t4],  %[qc4]      \n\t"
+            "movn       %[qc1],     %[t5],  %[cond0]    \n\t"
+            "movn       %[qc2],     %[t5],  %[cond1]    \n\t"
+            "movn       %[qc3],     %[t5],  %[cond2]    \n\t"
+            "movn       %[qc4],     %[t5],  %[cond3]    \n\t"
+            "ori        %[t5],      $zero,  31          \n\t"
+            "clz        %[c1],      %[c1]               \n\t"
+            "clz        %[c2],      %[c2]               \n\t"
+            "clz        %[c3],      %[c3]               \n\t"
+            "clz        %[c4],      %[c4]               \n\t"
+            "subu       %[c1],      %[t5],  %[c1]       \n\t"
+            "subu       %[c2],      %[t5],  %[c2]       \n\t"
+            "subu       %[c3],      %[t5],  %[c3]       \n\t"
+            "subu       %[c4],      %[t5],  %[c4]       \n\t"
+            "sll        %[c1],      %[c1],  1           \n\t"
+            "sll        %[c2],      %[c2],  1           \n\t"
+            "sll        %[c3],      %[c3],  1           \n\t"
+            "sll        %[c4],      %[c4],  1           \n\t"
+            "addiu      %[c1],      %[c1],  -3          \n\t"
+            "addiu      %[c2],      %[c2],  -3          \n\t"
+            "addiu      %[c3],      %[c3],  -3          \n\t"
+            "addiu      %[c4],      %[c4],  -3          \n\t"
+            "subu       %[cond0],   $zero,  %[cond0]    \n\t"
+            "subu       %[cond1],   $zero,  %[cond1]    \n\t"
+            "subu       %[cond2],   $zero,  %[cond2]    \n\t"
+            "subu       %[cond3],   $zero,  %[cond3]    \n\t"
+            "and        %[c1],      %[c1],  %[cond0]    \n\t"
+            "and        %[c2],      %[c2],  %[cond1]    \n\t"
+            "and        %[c3],      %[c3],  %[cond2]    \n\t"
+            "and        %[c4],      %[c4],  %[cond3]    \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
+              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
+              [c1]"=&r"(c1), [c2]"=&r"(c2),
+              [c3]"=&r"(c3), [c4]"=&r"(c4),
+              [t4]"=&r"(t4), [t5]"=&r"(t5)
+        );
+
+        curidx = 17 * qc1;
+        curidx += qc2;
+
+        curidx2 = 17 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += esc_sign_bits[curidx];
+        curbits += p_bits[curidx2];
+        curbits += esc_sign_bits[curidx2];
+
+        curbits += c1;
+        curbits += c2;
+        curbits += c3;
+        curbits += c4;
+    }
+    return curbits;
+}
+
+static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
+                                             PutBitContext *pb, const float *in,
+                                             const float *scaled, int size, int scale_idx,
+                                             int cb, const float lambda, const float uplim,
+                                             int *bits) = {
+    get_band_numbits_ZERO_mips,
+    get_band_numbits_SQUAD_mips,
+    get_band_numbits_SQUAD_mips,
+    get_band_numbits_UQUAD_mips,
+    get_band_numbits_UQUAD_mips,
+    get_band_numbits_SPAIR_mips,
+    get_band_numbits_SPAIR_mips,
+    get_band_numbits_UPAIR7_mips,
+    get_band_numbits_UPAIR7_mips,
+    get_band_numbits_UPAIR12_mips,
+    get_band_numbits_UPAIR12_mips,
+    get_band_numbits_ESC_mips,
+    get_band_numbits_NONE_mips, /* cb 12 doesn't exist */
+    get_band_numbits_ZERO_mips,
+    get_band_numbits_ZERO_mips,
+    get_band_numbits_ZERO_mips,
+};
+
+#define get_band_numbits(                                  \
+                                s, pb, in, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits)                    \
+    get_band_numbits_arr[cb](                              \
+                                s, pb, in, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits)
+
+static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
+                                     const float *scaled, int size, int scale_idx,
+                                     int cb, const float lambda, const float uplim,
+                                     int *bits, int rtz)
+{
+    return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
+}
+
+/**
+ * Functions developed from template function and optimized for getting the band cost
+ */
+#if HAVE_MIPSFPU
+static float get_band_cost_ZERO_mips(struct AACEncContext *s,
+                                     PutBitContext *pb, const float *in,
+                                     const float *scaled, int size, int scale_idx,
+                                     int cb, const float lambda, const float uplim,
+                                     int *bits)
+{
+    int i;
+    float cost = 0;
+
+    for (i = 0; i < size; i += 4) {
+        cost += in[i  ] * in[i  ];
+        cost += in[i+1] * in[i+1];
+        cost += in[i+2] * in[i+2];
+        cost += in[i+3] * in[i+3];
+    }
+    if (bits)
+        *bits = 0;
+    return cost * lambda;
+}
+
+static float get_band_cost_NONE_mips(struct AACEncContext *s,
+                                     PutBitContext *pb, const float *in,
+                                     const float *scaled, int size, int scale_idx,
+                                     int cb, const float lambda, const float uplim,
+                                     int *bits)
+{
+    av_assert0(0);
+    return 0;
+}
+
+static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
+                                      PutBitContext *pb, const float *in,
+                                      const float *scaled, int size, int scale_idx,
+                                      int cb, const float lambda, const float uplim,
+                                      int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec;
+        int curidx;
+        int   *in_int = (int   *)&in[i];
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "slt        %[qc1], $zero,  %[qc1]          \n\t"
+            "slt        %[qc2], $zero,  %[qc2]          \n\t"
+            "slt        %[qc3], $zero,  %[qc3]          \n\t"
+            "slt        %[qc4], $zero,  %[qc4]          \n\t"
+            "lw         %[t0],  0(%[in_int])            \n\t"
+            "lw         %[t1],  4(%[in_int])            \n\t"
+            "lw         %[t2],  8(%[in_int])            \n\t"
+            "lw         %[t3],  12(%[in_int])           \n\t"
+            "srl        %[t0],  %[t0],  31              \n\t"
+            "srl        %[t1],  %[t1],  31              \n\t"
+            "srl        %[t2],  %[t2],  31              \n\t"
+            "srl        %[t3],  %[t3],  31              \n\t"
+            "subu       %[t4],  $zero,  %[qc1]          \n\t"
+            "subu       %[t5],  $zero,  %[qc2]          \n\t"
+            "subu       %[t6],  $zero,  %[qc3]          \n\t"
+            "subu       %[t7],  $zero,  %[qc4]          \n\t"
+            "movn       %[qc1], %[t4],  %[t0]           \n\t"
+            "movn       %[qc2], %[t5],  %[t1]           \n\t"
+            "movn       %[qc3], %[t6],  %[t2]           \n\t"
+            "movn       %[qc4], %[t7],  %[t3]           \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+        curidx += 40;
+
+        curbits += p_bits[curidx];
+        vec     = &p_codes[curidx*4];
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "lwc1       $f0,    0(%[in_pos])            \n\t"
+            "lwc1       $f1,    0(%[vec])               \n\t"
+            "lwc1       $f2,    4(%[in_pos])            \n\t"
+            "lwc1       $f3,    4(%[vec])               \n\t"
+            "lwc1       $f4,    8(%[in_pos])            \n\t"
+            "lwc1       $f5,    8(%[vec])               \n\t"
+            "lwc1       $f6,    12(%[in_pos])           \n\t"
+            "lwc1       $f7,    12(%[vec])              \n\t"
+            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
+            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
+            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "$f4", "$f5", "$f6", "$f7",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+
+    if (bits)
+        *bits = curbits;
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
+                                      PutBitContext *pb, const float *in,
+                                      const float *scaled, int size, int scale_idx,
+                                      int cb, const float lambda, const float uplim,
+                                      int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    int curbits = 0;
+    int qc1, qc2, qc3, qc4;
+
+    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float  *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec;
+        int curidx;
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "ori        %[t4],  $zero,  2               \n\t"
+            "slt        %[t0],  %[t4],  %[qc1]          \n\t"
+            "slt        %[t1],  %[t4],  %[qc2]          \n\t"
+            "slt        %[t2],  %[t4],  %[qc3]          \n\t"
+            "slt        %[t3],  %[t4],  %[qc4]          \n\t"
+            "movn       %[qc1], %[t4],  %[t0]           \n\t"
+            "movn       %[qc2], %[t4],  %[t1]           \n\t"
+            "movn       %[qc3], %[t4],  %[t2]           \n\t"
+            "movn       %[qc4], %[t4],  %[t3]           \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += uquad_sign_bits[curidx];
+        vec     = &p_codes[curidx*4];
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "lwc1       %[di0], 0(%[in_pos])            \n\t"
+            "lwc1       %[di1], 4(%[in_pos])            \n\t"
+            "lwc1       %[di2], 8(%[in_pos])            \n\t"
+            "lwc1       %[di3], 12(%[in_pos])           \n\t"
+            "abs.s      %[di0], %[di0]                  \n\t"
+            "abs.s      %[di1], %[di1]                  \n\t"
+            "abs.s      %[di2], %[di2]                  \n\t"
+            "abs.s      %[di3], %[di3]                  \n\t"
+            "lwc1       $f0,    0(%[vec])               \n\t"
+            "lwc1       $f1,    4(%[vec])               \n\t"
+            "lwc1       $f2,    8(%[vec])               \n\t"
+            "lwc1       $f3,    12(%[vec])              \n\t"
+            "nmsub.s    %[di0], %[di0], $f0,    %[IQ]   \n\t"
+            "nmsub.s    %[di1], %[di1], $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di2], %[di2], $f2,    %[IQ]   \n\t"
+            "nmsub.s    %[di3], %[di3], $f3,    %[IQ]   \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+
+    if (bits)
+        *bits = curbits;
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
+                                      PutBitContext *pb, const float *in,
+                                      const float *scaled, int size, int scale_idx,
+                                      int cb, const float lambda, const float uplim,
+                                      int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec, *vec2;
+        int curidx, curidx2;
+        int   *in_int = (int   *)&in[i];
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "ori        %[t4],  $zero,  4               \n\t"
+            "slt        %[t0],  %[t4],  %[qc1]          \n\t"
+            "slt        %[t1],  %[t4],  %[qc2]          \n\t"
+            "slt        %[t2],  %[t4],  %[qc3]          \n\t"
+            "slt        %[t3],  %[t4],  %[qc4]          \n\t"
+            "movn       %[qc1], %[t4],  %[t0]           \n\t"
+            "movn       %[qc2], %[t4],  %[t1]           \n\t"
+            "movn       %[qc3], %[t4],  %[t2]           \n\t"
+            "movn       %[qc4], %[t4],  %[t3]           \n\t"
+            "lw         %[t0],  0(%[in_int])            \n\t"
+            "lw         %[t1],  4(%[in_int])            \n\t"
+            "lw         %[t2],  8(%[in_int])            \n\t"
+            "lw         %[t3],  12(%[in_int])           \n\t"
+            "srl        %[t0],  %[t0],  31              \n\t"
+            "srl        %[t1],  %[t1],  31              \n\t"
+            "srl        %[t2],  %[t2],  31              \n\t"
+            "srl        %[t3],  %[t3],  31              \n\t"
+            "subu       %[t4],  $zero,  %[qc1]          \n\t"
+            "subu       %[t5],  $zero,  %[qc2]          \n\t"
+            "subu       %[t6],  $zero,  %[qc3]          \n\t"
+            "subu       %[t7],  $zero,  %[qc4]          \n\t"
+            "movn       %[qc1], %[t4],  %[t0]           \n\t"
+            "movn       %[qc2], %[t5],  %[t1]           \n\t"
+            "movn       %[qc3], %[t6],  %[t2]           \n\t"
+            "movn       %[qc4], %[t7],  %[t3]           \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = 9 * qc1;
+        curidx += qc2 + 40;
+
+        curidx2 = 9 * qc3;
+        curidx2 += qc4 + 40;
+
+        curbits += p_bits[curidx];
+        curbits += p_bits[curidx2];
+
+        vec     = &p_codes[curidx*2];
+        vec2    = &p_codes[curidx2*2];
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "lwc1       $f0,    0(%[in_pos])            \n\t"
+            "lwc1       $f1,    0(%[vec])               \n\t"
+            "lwc1       $f2,    4(%[in_pos])            \n\t"
+            "lwc1       $f3,    4(%[vec])               \n\t"
+            "lwc1       $f4,    8(%[in_pos])            \n\t"
+            "lwc1       $f5,    0(%[vec2])              \n\t"
+            "lwc1       $f6,    12(%[in_pos])           \n\t"
+            "lwc1       $f7,    4(%[vec2])              \n\t"
+            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
+            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
+            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [vec2]"r"(vec2), [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "$f4", "$f5", "$f6", "$f7",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+
+    if (bits)
+        *bits = curbits;
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
+                                       PutBitContext *pb, const float *in,
+                                       const float *scaled, int size, int scale_idx,
+                                       int cb, const float lambda, const float uplim,
+                                       int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec, *vec2;
+        int curidx, curidx2, sign1, count1, sign2, count2;
+        int   *in_int = (int   *)&in[i];
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                          \n\t"
+            ".set noreorder                                     \n\t"
+
+            "ori        %[t4],      $zero,      7               \n\t"
+            "ori        %[sign1],   $zero,      0               \n\t"
+            "ori        %[sign2],   $zero,      0               \n\t"
+            "slt        %[t0],      %[t4],      %[qc1]          \n\t"
+            "slt        %[t1],      %[t4],      %[qc2]          \n\t"
+            "slt        %[t2],      %[t4],      %[qc3]          \n\t"
+            "slt        %[t3],      %[t4],      %[qc4]          \n\t"
+            "movn       %[qc1],     %[t4],      %[t0]           \n\t"
+            "movn       %[qc2],     %[t4],      %[t1]           \n\t"
+            "movn       %[qc3],     %[t4],      %[t2]           \n\t"
+            "movn       %[qc4],     %[t4],      %[t3]           \n\t"
+            "lw         %[t0],      0(%[in_int])                \n\t"
+            "lw         %[t1],      4(%[in_int])                \n\t"
+            "lw         %[t2],      8(%[in_int])                \n\t"
+            "lw         %[t3],      12(%[in_int])               \n\t"
+            "slt        %[t0],      %[t0],      $zero           \n\t"
+            "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
+            "slt        %[t2],      %[t2],      $zero           \n\t"
+            "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
+            "slt        %[t1],      %[t1],      $zero           \n\t"
+            "sll        %[t0],      %[sign1],   1               \n\t"
+            "or         %[t0],      %[t0],      %[t1]           \n\t"
+            "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
+            "slt        %[t3],      %[t3],      $zero           \n\t"
+            "sll        %[t0],      %[sign2],   1               \n\t"
+            "or         %[t0],      %[t0],      %[t3]           \n\t"
+            "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
+            "slt        %[count1],  $zero,      %[qc1]          \n\t"
+            "slt        %[t1],      $zero,      %[qc2]          \n\t"
+            "slt        %[count2],  $zero,      %[qc3]          \n\t"
+            "slt        %[t2],      $zero,      %[qc4]          \n\t"
+            "addu       %[count1],  %[count1],  %[t1]           \n\t"
+            "addu       %[count2],  %[count2],  %[t2]           \n\t"
+
+            ".set pop                                           \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = 8 * qc1;
+        curidx += qc2;
+
+        curidx2 = 8 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += upair7_sign_bits[curidx];
+        vec     = &p_codes[curidx*2];
+
+        curbits += p_bits[curidx2];
+        curbits += upair7_sign_bits[curidx2];
+        vec2    = &p_codes[curidx2*2];
+
+        __asm__ volatile (
+            ".set push                                          \n\t"
+            ".set noreorder                                     \n\t"
+
+            "lwc1       %[di0],     0(%[in_pos])                \n\t"
+            "lwc1       %[di1],     4(%[in_pos])                \n\t"
+            "lwc1       %[di2],     8(%[in_pos])                \n\t"
+            "lwc1       %[di3],     12(%[in_pos])               \n\t"
+            "abs.s      %[di0],     %[di0]                      \n\t"
+            "abs.s      %[di1],     %[di1]                      \n\t"
+            "abs.s      %[di2],     %[di2]                      \n\t"
+            "abs.s      %[di3],     %[di3]                      \n\t"
+            "lwc1       $f0,        0(%[vec])                   \n\t"
+            "lwc1       $f1,        4(%[vec])                   \n\t"
+            "lwc1       $f2,        0(%[vec2])                  \n\t"
+            "lwc1       $f3,        4(%[vec2])                  \n\t"
+            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
+            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
+            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
+
+            ".set pop                                           \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [vec2]"r"(vec2), [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+
+    if (bits)
+        *bits = curbits;
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
+                                        PutBitContext *pb, const float *in,
+                                        const float *scaled, int size, int scale_idx,
+                                        int cb, const float lambda, const float uplim,
+                                        int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec, *vec2;
+        int curidx, curidx2;
+        int sign1, count1, sign2, count2;
+        int   *in_int = (int   *)&in[i];
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                          \n\t"
+            ".set noreorder                                     \n\t"
+
+            "ori        %[t4],      $zero,      12              \n\t"
+            "ori        %[sign1],   $zero,      0               \n\t"
+            "ori        %[sign2],   $zero,      0               \n\t"
+            "slt        %[t0],      %[t4],      %[qc1]          \n\t"
+            "slt        %[t1],      %[t4],      %[qc2]          \n\t"
+            "slt        %[t2],      %[t4],      %[qc3]          \n\t"
+            "slt        %[t3],      %[t4],      %[qc4]          \n\t"
+            "movn       %[qc1],     %[t4],      %[t0]           \n\t"
+            "movn       %[qc2],     %[t4],      %[t1]           \n\t"
+            "movn       %[qc3],     %[t4],      %[t2]           \n\t"
+            "movn       %[qc4],     %[t4],      %[t3]           \n\t"
+            "lw         %[t0],      0(%[in_int])                \n\t"
+            "lw         %[t1],      4(%[in_int])                \n\t"
+            "lw         %[t2],      8(%[in_int])                \n\t"
+            "lw         %[t3],      12(%[in_int])               \n\t"
+            "slt        %[t0],      %[t0],      $zero           \n\t"
+            "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
+            "slt        %[t2],      %[t2],      $zero           \n\t"
+            "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
+            "slt        %[t1],      %[t1],      $zero           \n\t"
+            "sll        %[t0],      %[sign1],   1               \n\t"
+            "or         %[t0],      %[t0],      %[t1]           \n\t"
+            "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
+            "slt        %[t3],      %[t3],      $zero           \n\t"
+            "sll        %[t0],      %[sign2],   1               \n\t"
+            "or         %[t0],      %[t0],      %[t3]           \n\t"
+            "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
+            "slt        %[count1],  $zero,      %[qc1]          \n\t"
+            "slt        %[t1],      $zero,      %[qc2]          \n\t"
+            "slt        %[count2],  $zero,      %[qc3]          \n\t"
+            "slt        %[t2],      $zero,      %[qc4]          \n\t"
+            "addu       %[count1],  %[count1],  %[t1]           \n\t"
+            "addu       %[count2],  %[count2],  %[t2]           \n\t"
+
+            ".set pop                                           \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = 13 * qc1;
+        curidx += qc2;
+
+        curidx2 = 13 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += p_bits[curidx2];
+        curbits += upair12_sign_bits[curidx];
+        curbits += upair12_sign_bits[curidx2];
+        vec     = &p_codes[curidx*2];
+        vec2    = &p_codes[curidx2*2];
+
+        __asm__ volatile (
+            ".set push                                          \n\t"
+            ".set noreorder                                     \n\t"
+
+            "lwc1       %[di0],     0(%[in_pos])                \n\t"
+            "lwc1       %[di1],     4(%[in_pos])                \n\t"
+            "lwc1       %[di2],     8(%[in_pos])                \n\t"
+            "lwc1       %[di3],     12(%[in_pos])               \n\t"
+            "abs.s      %[di0],     %[di0]                      \n\t"
+            "abs.s      %[di1],     %[di1]                      \n\t"
+            "abs.s      %[di2],     %[di2]                      \n\t"
+            "abs.s      %[di3],     %[di3]                      \n\t"
+            "lwc1       $f0,        0(%[vec])                   \n\t"
+            "lwc1       $f1,        4(%[vec])                   \n\t"
+            "lwc1       $f2,        0(%[vec2])                  \n\t"
+            "lwc1       $f3,        4(%[vec2])                  \n\t"
+            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
+            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
+            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
+
+            ".set pop                                           \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [vec2]"r"(vec2), [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+
+    if (bits)
+        *bits = curbits;
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_ESC_mips(struct AACEncContext *s,
+                                    PutBitContext *pb, const float *in,
+                                    const float *scaled, int size, int scale_idx,
+                                    int cb, const float lambda, const float uplim,
+                                    int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    const float CLIPPED_ESCAPE = 165140.0f * IQ;
+    int i;
+    float cost = 0;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float*  )ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec, *vec2;
+        int curidx, curidx2;
+        float t1, t2, t3, t4;
+        float di1, di2, di3, di4;
+        int cond0, cond1, cond2, cond3;
+        int c1, c2, c3, c4;
+        int t6, t7;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "ori        %[t6],      $zero,  15          \n\t"
+            "ori        %[t7],      $zero,  16          \n\t"
+            "shll_s.w   %[c1],      %[qc1], 18          \n\t"
+            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
+            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
+            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
+            "srl        %[c1],      %[c1],  18          \n\t"
+            "srl        %[c2],      %[c2],  18          \n\t"
+            "srl        %[c3],      %[c3],  18          \n\t"
+            "srl        %[c4],      %[c4],  18          \n\t"
+            "slt        %[cond0],   %[t6],  %[qc1]      \n\t"
+            "slt        %[cond1],   %[t6],  %[qc2]      \n\t"
+            "slt        %[cond2],   %[t6],  %[qc3]      \n\t"
+            "slt        %[cond3],   %[t6],  %[qc4]      \n\t"
+            "movn       %[qc1],     %[t7],  %[cond0]    \n\t"
+            "movn       %[qc2],     %[t7],  %[cond1]    \n\t"
+            "movn       %[qc3],     %[t7],  %[cond2]    \n\t"
+            "movn       %[qc4],     %[t7],  %[cond3]    \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
+              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
+              [c1]"=&r"(c1), [c2]"=&r"(c2),
+              [c3]"=&r"(c3), [c4]"=&r"(c4),
+              [t6]"=&r"(t6), [t7]"=&r"(t7)
+        );
+
+        curidx = 17 * qc1;
+        curidx += qc2;
+
+        curidx2 = 17 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += esc_sign_bits[curidx];
+        vec     = &p_codes[curidx*2];
+
+        curbits += p_bits[curidx2];
+        curbits += esc_sign_bits[curidx2];
+        vec2     = &p_codes[curidx2*2];
+
+        curbits += (av_log2(c1) * 2 - 3) & (-cond0);
+        curbits += (av_log2(c2) * 2 - 3) & (-cond1);
+        curbits += (av_log2(c3) * 2 - 3) & (-cond2);
+        curbits += (av_log2(c4) * 2 - 3) & (-cond3);
+
+        t1 = fabsf(in[i  ]);
+        t2 = fabsf(in[i+1]);
+        t3 = fabsf(in[i+2]);
+        t4 = fabsf(in[i+3]);
+
+        if (cond0) {
+            if (t1 >= CLIPPED_ESCAPE) {
+                di1 = t1 - CLIPPED_ESCAPE;
+            } else {
+                di1 = t1 - c1 * cbrtf(c1) * IQ;
+            }
+        } else
+            di1 = t1 - vec[0] * IQ;
+
+        if (cond1) {
+            if (t2 >= CLIPPED_ESCAPE) {
+                di2 = t2 - CLIPPED_ESCAPE;
+            } else {
+                di2 = t2 - c2 * cbrtf(c2) * IQ;
+            }
+        } else
+            di2 = t2 - vec[1] * IQ;
+
+        if (cond2) {
+            if (t3 >= CLIPPED_ESCAPE) {
+                di3 = t3 - CLIPPED_ESCAPE;
+            } else {
+                di3 = t3 - c3 * cbrtf(c3) * IQ;
+            }
+        } else
+            di3 = t3 - vec2[0] * IQ;
+
+        if (cond3) {
+            if (t4 >= CLIPPED_ESCAPE) {
+                di4 = t4 - CLIPPED_ESCAPE;
+            } else {
+                di4 = t4 - c4 * cbrtf(c4) * IQ;
+            }
+        } else
+            di4 = t4 - vec2[1]*IQ;
+
+        cost += di1 * di1 + di2 * di2
+                + di3 * di3 + di4 * di4;
+    }
+
+    if (bits)
+        *bits = curbits;
+    return cost * lambda + curbits;
+}
+
+static float (*const get_band_cost_arr[])(struct AACEncContext *s,
+                                          PutBitContext *pb, const float *in,
+                                          const float *scaled, int size, int scale_idx,
+                                          int cb, const float lambda, const float uplim,
+                                          int *bits) = {
+    get_band_cost_ZERO_mips,
+    get_band_cost_SQUAD_mips,
+    get_band_cost_SQUAD_mips,
+    get_band_cost_UQUAD_mips,
+    get_band_cost_UQUAD_mips,
+    get_band_cost_SPAIR_mips,
+    get_band_cost_SPAIR_mips,
+    get_band_cost_UPAIR7_mips,
+    get_band_cost_UPAIR7_mips,
+    get_band_cost_UPAIR12_mips,
+    get_band_cost_UPAIR12_mips,
+    get_band_cost_ESC_mips,
+    get_band_cost_NONE_mips, /* cb 12 doesn't exist */
+    get_band_cost_ZERO_mips,
+    get_band_cost_ZERO_mips,
+    get_band_cost_ZERO_mips,
+};
+
+#define get_band_cost(                                  \
+                                s, pb, in, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits)                    \
+    get_band_cost_arr[cb](                              \
+                                s, pb, in, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits)
+
+static float quantize_band_cost(struct AACEncContext *s, const float *in,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, int rtz)
+{
+    return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
+}
+
+#include "libavcodec/aaccoder_twoloop.h"
+
+static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
+{
+    int start = 0, i, w, w2, g;
+    float M[128], S[128];
+    float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
+    const float lambda = s->lambda;
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    if (!cpe->common_window)
+        return;
+    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        start = 0;
+        for (g = 0;  g < sce0->ics.num_swb; g++) {
+            if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
+                float dist1 = 0.0f, dist2 = 0.0f;
+                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                    FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
+                    FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
+                    float minthr = FFMIN(band0->threshold, band1->threshold);
+                    float maxthr = FFMAX(band0->threshold, band1->threshold);
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
+                        M[i  ] = (sce0->coeffs[start+w2*128+i  ]
+                                + sce1->coeffs[start+w2*128+i  ]) * 0.5;
+                        M[i+1] = (sce0->coeffs[start+w2*128+i+1]
+                                + sce1->coeffs[start+w2*128+i+1]) * 0.5;
+                        M[i+2] = (sce0->coeffs[start+w2*128+i+2]
+                                + sce1->coeffs[start+w2*128+i+2]) * 0.5;
+                        M[i+3] = (sce0->coeffs[start+w2*128+i+3]
+                                + sce1->coeffs[start+w2*128+i+3]) * 0.5;
+
+                        S[i  ] =  M[i  ]
+                                - sce1->coeffs[start+w2*128+i  ];
+                        S[i+1] =  M[i+1]
+                                - sce1->coeffs[start+w2*128+i+1];
+                        S[i+2] =  M[i+2]
+                                - sce1->coeffs[start+w2*128+i+2];
+                        S[i+3] =  M[i+3]
+                                - sce1->coeffs[start+w2*128+i+3];
+                   }
+                    abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
+                    dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
+                                                L34,
+                                                sce0->ics.swb_sizes[g],
+                                                sce0->sf_idx[(w+w2)*16+g],
+                                                sce0->band_type[(w+w2)*16+g],
+                                                lambda / band0->threshold, INFINITY, NULL, 0);
+                    dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
+                                                R34,
+                                                sce1->ics.swb_sizes[g],
+                                                sce1->sf_idx[(w+w2)*16+g],
+                                                sce1->band_type[(w+w2)*16+g],
+                                                lambda / band1->threshold, INFINITY, NULL, 0);
+                    dist2 += quantize_band_cost(s, M,
+                                                M34,
+                                                sce0->ics.swb_sizes[g],
+                                                sce0->sf_idx[(w+w2)*16+g],
+                                                sce0->band_type[(w+w2)*16+g],
+                                                lambda / maxthr, INFINITY, NULL, 0);
+                    dist2 += quantize_band_cost(s, S,
+                                                S34,
+                                                sce1->ics.swb_sizes[g],
+                                                sce1->sf_idx[(w+w2)*16+g],
+                                                sce1->band_type[(w+w2)*16+g],
+                                                lambda / minthr, INFINITY, NULL, 0);
+                }
+                cpe->ms_mask[w*16+g] = dist2 < dist1;
+            }
+            start += sce0->ics.swb_sizes[g];
+        }
+    }
+}
+#endif /*HAVE_MIPSFPU */
+
+#include "libavcodec/aaccoder_trellis.h"
+
+#endif /* HAVE_INLINE_ASM */
+
+void ff_aac_coder_init_mips(AACEncContext *c) {
+#if HAVE_INLINE_ASM
+    AACCoefficientsEncoder *e = c->coder;
+    int option = c->options.aac_coder;
+
+    if (option == 2) {
+        e->quantize_and_encode_band = quantize_and_encode_band_mips;
+        e->encode_window_bands_info = codebook_trellis_rate;
+#if HAVE_MIPSFPU
+        e->search_for_quantizers    = search_for_quantizers_twoloop;
+#endif /* HAVE_MIPSFPU */
+    }
+#if HAVE_MIPSFPU
+    e->search_for_ms            = search_for_ms_mips;
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacdec_mips.c b/libavcodec/mips/aacdec_mips.c
new file mode 100644
index 0000000000..253cdeb80b
--- /dev/null
+++ b/libavcodec/mips/aacdec_mips.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacdec.c
+ */
+
+#include "libavcodec/aac.h"
+#include "aacdec_mips.h"
+#include "libavcodec/aactab.h"
+#include "libavcodec/sinewin.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static av_always_inline void float_copy(float *dst, const float *src, int count)
+{
+    // Copy 'count' floats from src to dst
+    const float *loop_end = src + count;
+    int temp[8];
+
+    // count must be a multiple of 8
+    av_assert2(count % 8 == 0);
+
+    // loop unrolled 8 times
+    __asm__ volatile (
+        ".set push                               \n\t"
+        ".set noreorder                          \n\t"
+    "1:                                          \n\t"
+        "lw      %[temp0],    0(%[src])          \n\t"
+        "lw      %[temp1],    4(%[src])          \n\t"
+        "lw      %[temp2],    8(%[src])          \n\t"
+        "lw      %[temp3],    12(%[src])         \n\t"
+        "lw      %[temp4],    16(%[src])         \n\t"
+        "lw      %[temp5],    20(%[src])         \n\t"
+        "lw      %[temp6],    24(%[src])         \n\t"
+        "lw      %[temp7],    28(%[src])         \n\t"
+        PTR_ADDIU "%[src],    %[src],      32    \n\t"
+        "sw      %[temp0],    0(%[dst])          \n\t"
+        "sw      %[temp1],    4(%[dst])          \n\t"
+        "sw      %[temp2],    8(%[dst])          \n\t"
+        "sw      %[temp3],    12(%[dst])         \n\t"
+        "sw      %[temp4],    16(%[dst])         \n\t"
+        "sw      %[temp5],    20(%[dst])         \n\t"
+        "sw      %[temp6],    24(%[dst])         \n\t"
+        "sw      %[temp7],    28(%[dst])         \n\t"
+        "bne     %[src],      %[loop_end], 1b    \n\t"
+        PTR_ADDIU "%[dst],    %[dst],      32    \n\t"
+        ".set pop                                \n\t"
+
+        : [temp0]"=&r"(temp[0]), [temp1]"=&r"(temp[1]),
+          [temp2]"=&r"(temp[2]), [temp3]"=&r"(temp[3]),
+          [temp4]"=&r"(temp[4]), [temp5]"=&r"(temp[5]),
+          [temp6]"=&r"(temp[6]), [temp7]"=&r"(temp[7]),
+          [src]"+r"(src), [dst]"+r"(dst)
+        : [loop_end]"r"(loop_end)
+        : "memory"
+    );
+}
+
+static av_always_inline int lcg_random(unsigned previous_val)
+{
+    union { unsigned u; int s; } v = { previous_val * 1664525u + 1013904223 };
+    return v.s;
+}
+
+static void imdct_and_windowing_mips(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    float *in    = sce->coeffs;
+    float *out   = sce->ret;
+    float *saved = sce->saved;
+    const float *swindow      = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
+    const float *lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024;
+    const float *swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128;
+    float *buf  = ac->buf_mdct;
+    int i;
+
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        for (i = 0; i < 1024; i += 128)
+            ac->mdct_small.imdct_half(&ac->mdct_small, buf + i, in + i);
+    } else
+        ac->mdct.imdct_half(&ac->mdct, buf, in);
+
+    /* window overlapping
+     * NOTE: To simplify the overlapping code, all 'meaningless' short to long
+     * and long to short transitions are considered to be short to short
+     * transitions. This leaves just two cases (long to long and short to short)
+     * with a little special sauce for EIGHT_SHORT_SEQUENCE.
+     */
+    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
+            (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
+        ac->fdsp->vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
+    } else {
+        float_copy(out, saved, 448);
+
+        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+            {
+                float wi;
+                float wj;
+                int i;
+                float temp0, temp1, temp2, temp3;
+                float *dst0 = out + 448 + 0*128;
+                float *dst1 = dst0 + 64 + 63;
+                float *dst2 = saved + 63;
+                float *win0 = (float*)swindow;
+                float *win1 = win0 + 64 + 63;
+                float *win0_prev = (float*)swindow_prev;
+                float *win1_prev = win0_prev + 64 + 63;
+                float *src0_prev = saved + 448;
+                float *src1_prev = buf + 0*128 + 63;
+                float *src0 = buf + 0*128 + 64;
+                float *src1 = buf + 1*128 + 63;
+
+                for(i = 0; i < 64; i++)
+                {
+                    temp0 = src0_prev[0];
+                    temp1 = src1_prev[0];
+                    wi = *win0_prev;
+                    wj = *win1_prev;
+                    temp2 = src0[0];
+                    temp3 = src1[0];
+                    dst0[0] = temp0 * wj - temp1 * wi;
+                    dst1[0] = temp0 * wi + temp1 * wj;
+
+                    wi = *win0;
+                    wj = *win1;
+
+                    temp0 = src0[128];
+                    temp1 = src1[128];
+                    dst0[128] = temp2 * wj - temp3 * wi;
+                    dst1[128] = temp2 * wi + temp3 * wj;
+
+                    temp2 = src0[256];
+                    temp3 = src1[256];
+                    dst0[256] = temp0 * wj - temp1 * wi;
+                    dst1[256] = temp0 * wi + temp1 * wj;
+                    dst0[384] = temp2 * wj - temp3 * wi;
+                    dst1[384] = temp2 * wi + temp3 * wj;
+
+                    temp0 = src0[384];
+                    temp1 = src1[384];
+                    dst0[512] = temp0 * wj - temp1 * wi;
+                    dst2[0] = temp0 * wi + temp1 * wj;
+
+                    src0++;
+                    src1--;
+                    src0_prev++;
+                    src1_prev--;
+                    win0++;
+                    win1--;
+                    win0_prev++;
+                    win1_prev--;
+                    dst0++;
+                    dst1--;
+                    dst2--;
+                }
+            }
+        } else {
+            ac->fdsp->vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
+            float_copy(out + 576, buf + 64, 448);
+        }
+    }
+
+    // buffer update
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        ac->fdsp->vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
+        ac->fdsp->vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
+        ac->fdsp->vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
+        float_copy(saved + 448, buf + 7*128 + 64, 64);
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        float_copy(saved, buf + 512, 448);
+        float_copy(saved + 448, buf + 7*128 + 64, 64);
+    } else { // LONG_STOP or ONLY_LONG
+        float_copy(saved, buf + 512, 512);
+    }
+}
+
+static void apply_ltp_mips(AACContext *ac, SingleChannelElement *sce)
+{
+    const LongTermPrediction *ltp = &sce->ics.ltp;
+    const uint16_t *offsets = sce->ics.swb_offset;
+    int i, sfb;
+    int j, k;
+
+    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
+        float *predTime = sce->ret;
+        float *predFreq = ac->buf_mdct;
+        float *p_predTime;
+        int16_t num_samples = 2048;
+
+        if (ltp->lag < 1024)
+            num_samples = ltp->lag + 1024;
+            j = (2048 - num_samples) >> 2;
+            k = (2048 - num_samples) & 3;
+            p_predTime = &predTime[num_samples];
+
+        for (i = 0; i < num_samples; i++)
+            predTime[i] = sce->ltp_state[i + 2048 - ltp->lag] * ltp->coef;
+        for (i = 0; i < j; i++) {
+
+            /* loop unrolled 4 times */
+            __asm__ volatile (
+                "sw      $0,              0(%[p_predTime])        \n\t"
+                "sw      $0,              4(%[p_predTime])        \n\t"
+                "sw      $0,              8(%[p_predTime])        \n\t"
+                "sw      $0,              12(%[p_predTime])       \n\t"
+                PTR_ADDIU "%[p_predTime], %[p_predTime],     16   \n\t"
+
+                : [p_predTime]"+r"(p_predTime)
+                :
+                : "memory"
+            );
+        }
+        for (i = 0; i < k; i++) {
+
+            __asm__ volatile (
+                "sw      $0,              0(%[p_predTime])        \n\t"
+                PTR_ADDIU "%[p_predTime], %[p_predTime],     4    \n\t"
+
+                : [p_predTime]"+r"(p_predTime)
+                :
+                : "memory"
+            );
+        }
+
+        ac->windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
+
+        if (sce->tns.present)
+            ac->apply_tns(predFreq, &sce->tns, &sce->ics, 0);
+
+        for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++)
+            if (ltp->used[sfb])
+                for (i = offsets[sfb]; i < offsets[sfb + 1]; i++)
+                    sce->coeffs[i] += predFreq[i];
+    }
+}
+
+#if HAVE_MIPSFPU
+static av_always_inline void fmul_and_reverse(float *dst, const float *src0, const float *src1, int count)
+{
+    /* Multiply 'count' floats in src0 by src1 and store the results in dst in reverse */
+    /* This should be equivalent to a normal fmul, followed by reversing dst */
+
+    // count must be a multiple of 4
+    av_assert2(count % 4 == 0);
+
+    // move src0 and src1 to the last element of their arrays
+    src0 += count - 1;
+    src1 += count - 1;
+
+    for (; count > 0; count -= 4){
+        float temp[12];
+
+        /* loop unrolled 4 times */
+        __asm__ volatile (
+            "lwc1    %[temp0],    0(%[ptr2])                \n\t"
+            "lwc1    %[temp1],    -4(%[ptr2])               \n\t"
+            "lwc1    %[temp2],    -8(%[ptr2])               \n\t"
+            "lwc1    %[temp3],    -12(%[ptr2])              \n\t"
+            "lwc1    %[temp4],    0(%[ptr3])                \n\t"
+            "lwc1    %[temp5],    -4(%[ptr3])               \n\t"
+            "lwc1    %[temp6],    -8(%[ptr3])               \n\t"
+            "lwc1    %[temp7],    -12(%[ptr3])              \n\t"
+            "mul.s   %[temp8],    %[temp0],     %[temp4]    \n\t"
+            "mul.s   %[temp9],    %[temp1],     %[temp5]    \n\t"
+            "mul.s   %[temp10],   %[temp2],     %[temp6]    \n\t"
+            "mul.s   %[temp11],   %[temp3],     %[temp7]    \n\t"
+            "swc1    %[temp8],    0(%[ptr1])                \n\t"
+            "swc1    %[temp9],    4(%[ptr1])                \n\t"
+            "swc1    %[temp10],   8(%[ptr1])                \n\t"
+            "swc1    %[temp11],   12(%[ptr1])               \n\t"
+            PTR_ADDIU "%[ptr1],   %[ptr1],      16          \n\t"
+            PTR_ADDIU "%[ptr2],   %[ptr2],      -16         \n\t"
+            PTR_ADDIU "%[ptr3],   %[ptr3],      -16         \n\t"
+
+            : [temp0]"=&f"(temp[0]), [temp1]"=&f"(temp[1]),
+              [temp2]"=&f"(temp[2]), [temp3]"=&f"(temp[3]),
+              [temp4]"=&f"(temp[4]), [temp5]"=&f"(temp[5]),
+              [temp6]"=&f"(temp[6]), [temp7]"=&f"(temp[7]),
+              [temp8]"=&f"(temp[8]), [temp9]"=&f"(temp[9]),
+              [temp10]"=&f"(temp[10]), [temp11]"=&f"(temp[11]),
+              [ptr1]"+r"(dst), [ptr2]"+r"(src0), [ptr3]"+r"(src1)
+            :
+            : "memory"
+        );
+    }
+}
+
+static void update_ltp_mips(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    float *saved     = sce->saved;
+    float *saved_ltp = sce->coeffs;
+    const float *lwindow = ics->use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_sine_1024;
+    const float *swindow = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        float *p_saved_ltp = saved_ltp + 576;
+        float *loop_end1 = p_saved_ltp + 448;
+
+        float_copy(saved_ltp, saved, 512);
+
+        /* loop unrolled 8 times */
+        __asm__ volatile (
+        "1:                                                   \n\t"
+            "sw     $0,              0(%[p_saved_ltp])        \n\t"
+            "sw     $0,              4(%[p_saved_ltp])        \n\t"
+            "sw     $0,              8(%[p_saved_ltp])        \n\t"
+            "sw     $0,              12(%[p_saved_ltp])       \n\t"
+            "sw     $0,              16(%[p_saved_ltp])       \n\t"
+            "sw     $0,              20(%[p_saved_ltp])       \n\t"
+            "sw     $0,              24(%[p_saved_ltp])       \n\t"
+            "sw     $0,              28(%[p_saved_ltp])       \n\t"
+            PTR_ADDIU "%[p_saved_ltp],%[p_saved_ltp],    32   \n\t"
+            "bne    %[p_saved_ltp],  %[loop_end1],       1b   \n\t"
+
+            : [p_saved_ltp]"+r"(p_saved_ltp)
+            : [loop_end1]"r"(loop_end1)
+            : "memory"
+        );
+
+        ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+        fmul_and_reverse(saved_ltp + 512, ac->buf_mdct + 960, swindow, 64);
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        float *buff0 = saved;
+        float *buff1 = saved_ltp;
+        float *loop_end = saved + 448;
+
+        /* loop unrolled 8 times */
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+        "1:                                             \n\t"
+            "lw      %[temp0],    0(%[src])             \n\t"
+            "lw      %[temp1],    4(%[src])             \n\t"
+            "lw      %[temp2],    8(%[src])             \n\t"
+            "lw      %[temp3],    12(%[src])            \n\t"
+            "lw      %[temp4],    16(%[src])            \n\t"
+            "lw      %[temp5],    20(%[src])            \n\t"
+            "lw      %[temp6],    24(%[src])            \n\t"
+            "lw      %[temp7],    28(%[src])            \n\t"
+            PTR_ADDIU "%[src],    %[src],         32    \n\t"
+            "sw      %[temp0],    0(%[dst])             \n\t"
+            "sw      %[temp1],    4(%[dst])             \n\t"
+            "sw      %[temp2],    8(%[dst])             \n\t"
+            "sw      %[temp3],    12(%[dst])            \n\t"
+            "sw      %[temp4],    16(%[dst])            \n\t"
+            "sw      %[temp5],    20(%[dst])            \n\t"
+            "sw      %[temp6],    24(%[dst])            \n\t"
+            "sw      %[temp7],    28(%[dst])            \n\t"
+            "sw      $0,          2304(%[dst])          \n\t"
+            "sw      $0,          2308(%[dst])          \n\t"
+            "sw      $0,          2312(%[dst])          \n\t"
+            "sw      $0,          2316(%[dst])          \n\t"
+            "sw      $0,          2320(%[dst])          \n\t"
+            "sw      $0,          2324(%[dst])          \n\t"
+            "sw      $0,          2328(%[dst])          \n\t"
+            "sw      $0,          2332(%[dst])          \n\t"
+            "bne     %[src],      %[loop_end],    1b    \n\t"
+            PTR_ADDIU "%[dst],    %[dst],         32    \n\t"
+            ".set pop                                   \n\t"
+
+            : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+              [src]"+r"(buff0), [dst]"+r"(buff1)
+            : [loop_end]"r"(loop_end)
+            : "memory"
+        );
+        ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+        fmul_and_reverse(saved_ltp + 512, ac->buf_mdct + 960, swindow, 64);
+    } else { // LONG_STOP or ONLY_LONG
+        ac->fdsp->vector_fmul_reverse(saved_ltp,       ac->buf_mdct + 512,     &lwindow[512],     512);
+        fmul_and_reverse(saved_ltp + 512, ac->buf_mdct + 512, lwindow, 512);
+    }
+
+    float_copy(sce->ltp_state, sce->ltp_state + 1024, 1024);
+    float_copy(sce->ltp_state + 1024, sce->ret, 1024);
+    float_copy(sce->ltp_state + 2048, saved_ltp, 1024);
+}
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_aacdec_init_mips(AACContext *c)
+{
+#if HAVE_INLINE_ASM
+    c->imdct_and_windowing         = imdct_and_windowing_mips;
+    c->apply_ltp                   = apply_ltp_mips;
+#if HAVE_MIPSFPU
+    c->update_ltp                  = update_ltp_mips;
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacdec_mips.h b/libavcodec/mips/aacdec_mips.h
new file mode 100644
index 0000000000..054a9fba62
--- /dev/null
+++ b/libavcodec/mips/aacdec_mips.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * AAC Spectral Band Replication decoding functions optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacdec.c
+ */
+
+#ifndef AVCODEC_MIPS_AACDEC_FLOAT_H
+#define AVCODEC_MIPS_AACDEC_FLOAT_H
+
+#include "libavcodec/aac.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU
+static inline float *VMUL2_mips(float *dst, const float *v, unsigned idx,
+                           const float *scale)
+{
+    float temp0, temp1, temp2;
+    int temp3, temp4;
+    float *ret;
+
+    __asm__ volatile(
+        "andi    %[temp3],  %[idx],       0x0F         \n\t"
+        "andi    %[temp4],  %[idx],       0xF0         \n\t"
+        "sll     %[temp3],  %[temp3],     2            \n\t"
+        "srl     %[temp4],  %[temp4],     2            \n\t"
+        "lwc1    %[temp2],  0(%[scale])                \n\t"
+        "lwxc1   %[temp0],  %[temp3](%[v])             \n\t"
+        "lwxc1   %[temp1],  %[temp4](%[v])             \n\t"
+        "mul.s   %[temp0],  %[temp0],     %[temp2]     \n\t"
+        "mul.s   %[temp1],  %[temp1],     %[temp2]     \n\t"
+        PTR_ADDIU "%[ret],  %[dst],       8            \n\t"
+        "swc1    %[temp0],  0(%[dst])                  \n\t"
+        "swc1    %[temp1],  4(%[dst])                  \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [ret]"=&r"(ret)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst)
+        : "memory"
+    );
+    return ret;
+}
+
+static inline float *VMUL4_mips(float *dst, const float *v, unsigned idx,
+                           const float *scale)
+{
+    int temp0, temp1, temp2, temp3;
+    float temp4, temp5, temp6, temp7, temp8;
+    float *ret;
+
+    __asm__ volatile(
+        "andi    %[temp0],  %[idx],       0x03        \n\t"
+        "andi    %[temp1],  %[idx],       0x0C        \n\t"
+        "andi    %[temp2],  %[idx],       0x30        \n\t"
+        "andi    %[temp3],  %[idx],       0xC0        \n\t"
+        "sll     %[temp0],  %[temp0],     2           \n\t"
+        "srl     %[temp2],  %[temp2],     2           \n\t"
+        "srl     %[temp3],  %[temp3],     4           \n\t"
+        "lwc1    %[temp4],  0(%[scale])               \n\t"
+        "lwxc1   %[temp5],  %[temp0](%[v])            \n\t"
+        "lwxc1   %[temp6],  %[temp1](%[v])            \n\t"
+        "lwxc1   %[temp7],  %[temp2](%[v])            \n\t"
+        "lwxc1   %[temp8],  %[temp3](%[v])            \n\t"
+        "mul.s   %[temp5],  %[temp5],     %[temp4]    \n\t"
+        "mul.s   %[temp6],  %[temp6],     %[temp4]    \n\t"
+        "mul.s   %[temp7],  %[temp7],     %[temp4]    \n\t"
+        "mul.s   %[temp8],  %[temp8],     %[temp4]    \n\t"
+        PTR_ADDIU "%[ret],  %[dst],       16          \n\t"
+        "swc1    %[temp5],  0(%[dst])                 \n\t"
+        "swc1    %[temp6],  4(%[dst])                 \n\t"
+        "swc1    %[temp7],  8(%[dst])                 \n\t"
+        "swc1    %[temp8],  12(%[dst])                \n\t"
+
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+          [temp8]"=&f"(temp8), [ret]"=&r"(ret)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst)
+        : "memory"
+    );
+    return ret;
+}
+
+static inline float *VMUL2S_mips(float *dst, const float *v, unsigned idx,
+                            unsigned sign, const float *scale)
+{
+    int temp0, temp1, temp2, temp3, temp4, temp5;
+    float temp6, temp7, temp8, temp9;
+    float *ret;
+
+    __asm__ volatile(
+        "andi    %[temp0],  %[idx],       0x0F       \n\t"
+        "andi    %[temp1],  %[idx],       0xF0       \n\t"
+        "lw      %[temp4],  0(%[scale])              \n\t"
+        "srl     %[temp2],  %[sign],      1          \n\t"
+        "sll     %[temp3],  %[sign],      31         \n\t"
+        "sll     %[temp2],  %[temp2],     31         \n\t"
+        "sll     %[temp0],  %[temp0],     2          \n\t"
+        "srl     %[temp1],  %[temp1],     2          \n\t"
+        "lwxc1   %[temp8],  %[temp0](%[v])           \n\t"
+        "lwxc1   %[temp9],  %[temp1](%[v])           \n\t"
+        "xor     %[temp5],  %[temp4],     %[temp2]   \n\t"
+        "xor     %[temp4],  %[temp4],     %[temp3]   \n\t"
+        "mtc1    %[temp5],  %[temp6]                 \n\t"
+        "mtc1    %[temp4],  %[temp7]                 \n\t"
+        "mul.s   %[temp8],  %[temp8],     %[temp6]   \n\t"
+        "mul.s   %[temp9],  %[temp9],     %[temp7]   \n\t"
+        PTR_ADDIU "%[ret],  %[dst],       8          \n\t"
+        "swc1    %[temp8],  0(%[dst])                \n\t"
+        "swc1    %[temp9],  4(%[dst])                \n\t"
+
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+          [temp8]"=&f"(temp8), [temp9]"=&f"(temp9),
+          [ret]"=&r"(ret)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst), [sign]"r"(sign)
+        : "memory"
+    );
+    return ret;
+}
+
+static inline float *VMUL4S_mips(float *dst, const float *v, unsigned idx,
+                            unsigned sign, const float *scale)
+{
+    int temp0, temp1, temp2, temp3, temp4;
+    float temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
+    float *ret;
+    unsigned int mask = 1U << 31;
+
+    __asm__ volatile(
+        "lw      %[temp0],   0(%[scale])               \n\t"
+        "andi    %[temp1],  %[idx],       0x03         \n\t"
+        "andi    %[temp2],  %[idx],       0x0C         \n\t"
+        "andi    %[temp3],  %[idx],       0x30         \n\t"
+        "andi    %[temp4],  %[idx],       0xC0         \n\t"
+        "sll     %[temp1],  %[temp1],     2            \n\t"
+        "srl     %[temp3],  %[temp3],     2            \n\t"
+        "srl     %[temp4],  %[temp4],     4            \n\t"
+        "lwxc1   %[temp10],  %[temp1](%[v])            \n\t"
+        "lwxc1   %[temp11],  %[temp2](%[v])            \n\t"
+        "lwxc1   %[temp12],  %[temp3](%[v])            \n\t"
+        "lwxc1   %[temp13],  %[temp4](%[v])            \n\t"
+        "and     %[temp1],   %[sign],      %[mask]     \n\t"
+        "srl     %[temp2],   %[idx],       12          \n\t"
+        "srl     %[temp3],   %[idx],       13          \n\t"
+        "srl     %[temp4],   %[idx],       14          \n\t"
+        "andi    %[temp2],   %[temp2],     1           \n\t"
+        "andi    %[temp3],   %[temp3],     1           \n\t"
+        "andi    %[temp4],   %[temp4],     1           \n\t"
+        "sllv    %[sign],    %[sign],      %[temp2]    \n\t"
+        "xor     %[temp1],   %[temp0],     %[temp1]    \n\t"
+        "and     %[temp2],   %[sign],      %[mask]     \n\t"
+        "mtc1    %[temp1],   %[temp14]                 \n\t"
+        "xor     %[temp2],   %[temp0],     %[temp2]    \n\t"
+        "sllv    %[sign],    %[sign],      %[temp3]    \n\t"
+        "mtc1    %[temp2],   %[temp15]                 \n\t"
+        "and     %[temp3],   %[sign],      %[mask]     \n\t"
+        "sllv    %[sign],    %[sign],      %[temp4]    \n\t"
+        "xor     %[temp3],   %[temp0],     %[temp3]    \n\t"
+        "and     %[temp4],   %[sign],      %[mask]     \n\t"
+        "mtc1    %[temp3],   %[temp16]                 \n\t"
+        "xor     %[temp4],   %[temp0],     %[temp4]    \n\t"
+        "mtc1    %[temp4],   %[temp17]                 \n\t"
+        "mul.s   %[temp10],  %[temp10],    %[temp14]   \n\t"
+        "mul.s   %[temp11],  %[temp11],    %[temp15]   \n\t"
+        "mul.s   %[temp12],  %[temp12],    %[temp16]   \n\t"
+        "mul.s   %[temp13],  %[temp13],    %[temp17]   \n\t"
+        PTR_ADDIU "%[ret],   %[dst],       16          \n\t"
+        "swc1    %[temp10],  0(%[dst])                 \n\t"
+        "swc1    %[temp11],  4(%[dst])                 \n\t"
+        "swc1    %[temp12],  8(%[dst])                 \n\t"
+        "swc1    %[temp13],  12(%[dst])                \n\t"
+
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [temp10]"=&f"(temp10),
+          [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+          [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
+          [temp15]"=&f"(temp15), [temp16]"=&f"(temp16),
+          [temp17]"=&f"(temp17), [ret]"=&r"(ret),
+          [sign]"+r"(sign)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst), [mask]"r"(mask)
+        : "memory"
+    );
+    return ret;
+}
+
+#define VMUL2 VMUL2_mips
+#define VMUL4 VMUL4_mips
+#define VMUL2S VMUL2S_mips
+#define VMUL4S VMUL4S_mips
+#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */
+
+#endif /* AVCODEC_MIPS_AACDEC_FLOAT_H */
diff --git a/libavcodec/mips/aacpsdsp_mips.c b/libavcodec/mips/aacpsdsp_mips.c
new file mode 100644
index 0000000000..695f9ef3c6
--- /dev/null
+++ b/libavcodec/mips/aacpsdsp_mips.c
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacpsdsp.c
+ */
+
+#include "config.h"
+#include "libavcodec/aacpsdsp.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void ps_hybrid_analysis_ileave_mips(float (*out)[32][2], float L[2][38][64],
+                                        int i, int len)
+{
+    int temp0, temp1, temp2, temp3;
+    int temp4, temp5, temp6, temp7;
+    float *out1=&out[i][0][0];
+    float *L1=&L[0][0][i];
+    float *j=out1+ len*2;
+
+    for (; i < 64; i++) {
+
+        /* loop unrolled 8 times */
+        __asm__ volatile (
+        "1:                                          \n\t"
+            "lw      %[temp0],   0(%[L1])            \n\t"
+            "lw      %[temp1],   9728(%[L1])         \n\t"
+            "lw      %[temp2],   256(%[L1])          \n\t"
+            "lw      %[temp3],   9984(%[L1])         \n\t"
+            "lw      %[temp4],   512(%[L1])          \n\t"
+            "lw      %[temp5],   10240(%[L1])        \n\t"
+            "lw      %[temp6],   768(%[L1])          \n\t"
+            "lw      %[temp7],   10496(%[L1])        \n\t"
+            "sw      %[temp0],   0(%[out1])          \n\t"
+            "sw      %[temp1],   4(%[out1])          \n\t"
+            "sw      %[temp2],   8(%[out1])          \n\t"
+            "sw      %[temp3],   12(%[out1])         \n\t"
+            "sw      %[temp4],   16(%[out1])         \n\t"
+            "sw      %[temp5],   20(%[out1])         \n\t"
+            "sw      %[temp6],   24(%[out1])         \n\t"
+            "sw      %[temp7],   28(%[out1])         \n\t"
+            PTR_ADDIU "%[out1],  %[out1],      32    \n\t"
+            PTR_ADDIU "%[L1],    %[L1],        1024  \n\t"
+            "bne     %[out1],    %[j],         1b    \n\t"
+
+            : [out1]"+r"(out1), [L1]"+r"(L1), [j]"+r"(j),
+              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+            : [len]"r"(len)
+            : "memory"
+        );
+        out1-=(len<<1)-64;
+        L1-=(len<<6)-1;
+        j+=len*2;
+    }
+}
+
+static void ps_hybrid_synthesis_deint_mips(float out[2][38][64],
+                                        float (*in)[32][2],
+                                        int i, int len)
+{
+    int n;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    float *out1 = (float*)out + i;
+    float *out2 = (float*)out + 2432 + i;
+    float *in1 = (float*)in + 64 * i;
+    float *in2 = (float*)in + 64 * i + 1;
+
+    for (; i < 64; i++) {
+        for (n = 0; n < 7; n++) {
+
+            /* loop unrolled 8 times */
+            __asm__ volatile (
+                 "lw      %[temp0],   0(%[in1])               \n\t"
+                 "lw      %[temp1],   0(%[in2])               \n\t"
+                 "lw      %[temp2],   8(%[in1])               \n\t"
+                 "lw      %[temp3],   8(%[in2])               \n\t"
+                 "lw      %[temp4],   16(%[in1])              \n\t"
+                 "lw      %[temp5],   16(%[in2])              \n\t"
+                 "lw      %[temp6],   24(%[in1])              \n\t"
+                 "lw      %[temp7],   24(%[in2])              \n\t"
+                 PTR_ADDIU "%[out1],  %[out1],         1024   \n\t"
+                 PTR_ADDIU "%[out2],  %[out2],         1024   \n\t"
+                 PTR_ADDIU "%[in1],   %[in1],          32     \n\t"
+                 PTR_ADDIU "%[in2],   %[in2],          32     \n\t"
+                 "sw      %[temp0],   -1024(%[out1])          \n\t"
+                 "sw      %[temp1],   -1024(%[out2])          \n\t"
+                 "sw      %[temp2],   -768(%[out1])           \n\t"
+                 "sw      %[temp3],   -768(%[out2])           \n\t"
+                 "sw      %[temp4],   -512(%[out1])           \n\t"
+                 "sw      %[temp5],   -512(%[out2])           \n\t"
+                 "sw      %[temp6],   -256(%[out1])           \n\t"
+                 "sw      %[temp7],   -256(%[out2])           \n\t"
+
+                 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                   [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                   [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                   [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                   [out1]"+r"(out1), [out2]"+r"(out2),
+                   [in1]"+r"(in1), [in2]"+r"(in2)
+                 :
+                 : "memory"
+            );
+        }
+        /* loop unrolled 8 times */
+        __asm__ volatile (
+            "lw      %[temp0],   0(%[in1])               \n\t"
+            "lw      %[temp1],   0(%[in2])               \n\t"
+            "lw      %[temp2],   8(%[in1])               \n\t"
+            "lw      %[temp3],   8(%[in2])               \n\t"
+            "lw      %[temp4],   16(%[in1])              \n\t"
+            "lw      %[temp5],   16(%[in2])              \n\t"
+            "lw      %[temp6],   24(%[in1])              \n\t"
+            "lw      %[temp7],   24(%[in2])              \n\t"
+            PTR_ADDIU "%[out1],  %[out1],        -7164   \n\t"
+            PTR_ADDIU "%[out2],  %[out2],        -7164   \n\t"
+            PTR_ADDIU "%[in1],   %[in1],         32      \n\t"
+            PTR_ADDIU "%[in2],   %[in2],         32      \n\t"
+            "sw      %[temp0],   7164(%[out1])           \n\t"
+            "sw      %[temp1],   7164(%[out2])           \n\t"
+            "sw      %[temp2],   7420(%[out1])           \n\t"
+            "sw      %[temp3],   7420(%[out2])           \n\t"
+            "sw      %[temp4],   7676(%[out1])           \n\t"
+            "sw      %[temp5],   7676(%[out2])           \n\t"
+            "sw      %[temp6],   7932(%[out1])           \n\t"
+            "sw      %[temp7],   7932(%[out2])           \n\t"
+
+            : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+              [out1]"+r"(out1), [out2]"+r"(out2),
+              [in1]"+r"(in1), [in2]"+r"(in2)
+            :
+            : "memory"
+        );
+    }
+}
+
+#if HAVE_MIPSFPU
+static void ps_add_squares_mips(float *dst, const float (*src)[2], int n)
+{
+    int i;
+    float temp0, temp1, temp2, temp3, temp4, temp5;
+    float temp6, temp7, temp8, temp9, temp10, temp11;
+    float *src0 = (float*)&src[0][0];
+    float *dst0 = &dst[0];
+
+    for (i = 0; i < 8; i++) {
+        /* loop unrolled 4 times */
+        __asm__ volatile (
+            "lwc1     %[temp0],    0(%[src0])                          \n\t"
+            "lwc1     %[temp1],    4(%[src0])                          \n\t"
+            "lwc1     %[temp2],    8(%[src0])                          \n\t"
+            "lwc1     %[temp3],    12(%[src0])                         \n\t"
+            "lwc1     %[temp4],    16(%[src0])                         \n\t"
+            "lwc1     %[temp5],    20(%[src0])                         \n\t"
+            "lwc1     %[temp6],    24(%[src0])                         \n\t"
+            "lwc1     %[temp7],    28(%[src0])                         \n\t"
+            "lwc1     %[temp8],    0(%[dst0])                          \n\t"
+            "lwc1     %[temp9],    4(%[dst0])                          \n\t"
+            "lwc1     %[temp10],   8(%[dst0])                          \n\t"
+            "lwc1     %[temp11],   12(%[dst0])                         \n\t"
+            "mul.s    %[temp1],    %[temp1],    %[temp1]               \n\t"
+            "mul.s    %[temp3],    %[temp3],    %[temp3]               \n\t"
+            "mul.s    %[temp5],    %[temp5],    %[temp5]               \n\t"
+            "mul.s    %[temp7],    %[temp7],    %[temp7]               \n\t"
+            "madd.s   %[temp0],    %[temp1],    %[temp0],   %[temp0]   \n\t"
+            "madd.s   %[temp2],    %[temp3],    %[temp2],   %[temp2]   \n\t"
+            "madd.s   %[temp4],    %[temp5],    %[temp4],   %[temp4]   \n\t"
+            "madd.s   %[temp6],    %[temp7],    %[temp6],   %[temp6]   \n\t"
+            "add.s    %[temp0],    %[temp8],    %[temp0]               \n\t"
+            "add.s    %[temp2],    %[temp9],    %[temp2]               \n\t"
+            "add.s    %[temp4],    %[temp10],   %[temp4]               \n\t"
+            "add.s    %[temp6],    %[temp11],   %[temp6]               \n\t"
+            "swc1     %[temp0],    0(%[dst0])                          \n\t"
+            "swc1     %[temp2],    4(%[dst0])                          \n\t"
+            "swc1     %[temp4],    8(%[dst0])                          \n\t"
+            "swc1     %[temp6],    12(%[dst0])                         \n\t"
+            PTR_ADDIU "%[dst0],    %[dst0],     16                     \n\t"
+            PTR_ADDIU "%[src0],    %[src0],     32                     \n\t"
+
+            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [dst0]"+r"(dst0), [src0]"+r"(src0),
+              [temp10]"=&f"(temp10), [temp11]"=&f"(temp11)
+            :
+            : "memory"
+        );
+   }
+}
+
+static void ps_mul_pair_single_mips(float (*dst)[2], float (*src0)[2], float *src1,
+                                 int n)
+{
+    float temp0, temp1, temp2;
+    float *p_d, *p_s0, *p_s1, *end;
+    p_d = &dst[0][0];
+    p_s0 = &src0[0][0];
+    p_s1 = &src1[0];
+    end = p_s1 + n;
+
+    __asm__ volatile(
+        ".set push                                      \n\t"
+        ".set noreorder                                 \n\t"
+        "1:                                             \n\t"
+        "lwc1     %[temp2],   0(%[p_s1])                \n\t"
+        "lwc1     %[temp0],   0(%[p_s0])                \n\t"
+        "lwc1     %[temp1],   4(%[p_s0])                \n\t"
+        PTR_ADDIU "%[p_d],    %[p_d],       8           \n\t"
+        "mul.s    %[temp0],   %[temp0],     %[temp2]    \n\t"
+        "mul.s    %[temp1],   %[temp1],     %[temp2]    \n\t"
+        PTR_ADDIU "%[p_s0],   %[p_s0],      8           \n\t"
+        "swc1     %[temp0],   -8(%[p_d])                \n\t"
+        "swc1     %[temp1],   -4(%[p_d])                \n\t"
+        "bne      %[p_s1],    %[end],       1b          \n\t"
+        PTR_ADDIU "%[p_s1],   %[p_s1],      4           \n\t"
+        ".set pop                                       \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [p_d]"+r"(p_d),
+          [p_s0]"+r"(p_s0), [p_s1]"+r"(p_s1)
+        : [end]"r"(end)
+        : "memory"
+    );
+}
+
+static void ps_decorrelate_mips(float (*out)[2], float (*delay)[2],
+                             float (*ap_delay)[PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2],
+                             const float phi_fract[2], const float (*Q_fract)[2],
+                             const float *transient_gain,
+                             float g_decay_slope,
+                             int len)
+{
+    float *p_delay = &delay[0][0];
+    float *p_out = &out[0][0];
+    float *p_ap_delay = &ap_delay[0][0][0];
+    const float *p_t_gain = transient_gain;
+    const float *p_Q_fract = &Q_fract[0][0];
+    float ag0, ag1, ag2;
+    float phi_fract0 = phi_fract[0];
+    float phi_fract1 = phi_fract[1];
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+
+    float *p_delay_end = (p_delay + (len << 1));
+
+    /* merged 2 loops */
+    __asm__ volatile(
+        ".set    push                                                    \n\t"
+        ".set    noreorder                                               \n\t"
+        "li.s    %[ag0],        0.65143905753106                         \n\t"
+        "li.s    %[ag1],        0.56471812200776                         \n\t"
+        "li.s    %[ag2],        0.48954165955695                         \n\t"
+        "mul.s   %[ag0],        %[ag0],        %[g_decay_slope]          \n\t"
+        "mul.s   %[ag1],        %[ag1],        %[g_decay_slope]          \n\t"
+        "mul.s   %[ag2],        %[ag2],        %[g_decay_slope]          \n\t"
+    "1:                                                                  \n\t"
+        "lwc1    %[temp0],      0(%[p_delay])                            \n\t"
+        "lwc1    %[temp1],      4(%[p_delay])                            \n\t"
+        "lwc1    %[temp4],      16(%[p_ap_delay])                        \n\t"
+        "lwc1    %[temp5],      20(%[p_ap_delay])                        \n\t"
+        "mul.s   %[temp3],      %[temp0],      %[phi_fract1]             \n\t"
+        "lwc1    %[temp6],      0(%[p_Q_fract])                          \n\t"
+        "mul.s   %[temp2],      %[temp1],      %[phi_fract1]             \n\t"
+        "lwc1    %[temp7],      4(%[p_Q_fract])                          \n\t"
+        "madd.s  %[temp3],      %[temp3],      %[temp1], %[phi_fract0]   \n\t"
+        "msub.s  %[temp2],      %[temp2],      %[temp0], %[phi_fract0]   \n\t"
+        "mul.s   %[temp8],      %[temp5],      %[temp7]                  \n\t"
+        "mul.s   %[temp9],      %[temp4],      %[temp7]                  \n\t"
+        "lwc1    %[temp7],      12(%[p_Q_fract])                         \n\t"
+        "mul.s   %[temp0],      %[ag0],        %[temp2]                  \n\t"
+        "mul.s   %[temp1],      %[ag0],        %[temp3]                  \n\t"
+        "msub.s  %[temp8],      %[temp8],      %[temp4], %[temp6]        \n\t"
+        "lwc1    %[temp4],      304(%[p_ap_delay])                       \n\t"
+        "madd.s  %[temp9],      %[temp9],      %[temp5], %[temp6]        \n\t"
+        "lwc1    %[temp5],      308(%[p_ap_delay])                       \n\t"
+        "sub.s   %[temp0],      %[temp8],      %[temp0]                  \n\t"
+        "sub.s   %[temp1],      %[temp9],      %[temp1]                  \n\t"
+        "madd.s  %[temp2],      %[temp2],      %[ag0],   %[temp0]        \n\t"
+        "lwc1    %[temp6],      8(%[p_Q_fract])                          \n\t"
+        "madd.s  %[temp3],      %[temp3],      %[ag0],   %[temp1]        \n\t"
+        "mul.s   %[temp8],      %[temp5],      %[temp7]                  \n\t"
+        "mul.s   %[temp9],      %[temp4],      %[temp7]                  \n\t"
+        "lwc1    %[temp7],      20(%[p_Q_fract])                         \n\t"
+        "msub.s  %[temp8],      %[temp8],      %[temp4], %[temp6]        \n\t"
+        "swc1    %[temp2],      40(%[p_ap_delay])                        \n\t"
+        "mul.s   %[temp2],      %[ag1],        %[temp0]                  \n\t"
+        "swc1    %[temp3],      44(%[p_ap_delay])                        \n\t"
+        "mul.s   %[temp3],      %[ag1],        %[temp1]                  \n\t"
+        "lwc1    %[temp4],      592(%[p_ap_delay])                       \n\t"
+        "madd.s  %[temp9],      %[temp9],      %[temp5], %[temp6]        \n\t"
+        "lwc1    %[temp5],      596(%[p_ap_delay])                       \n\t"
+        "sub.s   %[temp2],      %[temp8],      %[temp2]                  \n\t"
+        "sub.s   %[temp3],      %[temp9],      %[temp3]                  \n\t"
+        "lwc1    %[temp6],      16(%[p_Q_fract])                         \n\t"
+        "madd.s  %[temp0],      %[temp0],      %[ag1],   %[temp2]        \n\t"
+        "madd.s  %[temp1],      %[temp1],      %[ag1],   %[temp3]        \n\t"
+        "mul.s   %[temp8],      %[temp5],      %[temp7]                  \n\t"
+        "mul.s   %[temp9],      %[temp4],      %[temp7]                  \n\t"
+        "msub.s  %[temp8],      %[temp8],      %[temp4], %[temp6]        \n\t"
+        "madd.s  %[temp9],      %[temp9],      %[temp5], %[temp6]        \n\t"
+        "swc1    %[temp0],      336(%[p_ap_delay])                       \n\t"
+        "mul.s   %[temp0],      %[ag2],        %[temp2]                  \n\t"
+        "swc1    %[temp1],      340(%[p_ap_delay])                       \n\t"
+        "mul.s   %[temp1],      %[ag2],        %[temp3]                  \n\t"
+        "lwc1    %[temp4],      0(%[p_t_gain])                           \n\t"
+        "sub.s   %[temp0],      %[temp8],      %[temp0]                  \n\t"
+        PTR_ADDIU "%[p_ap_delay], %[p_ap_delay], 8                       \n\t"
+        "sub.s   %[temp1],      %[temp9],      %[temp1]                  \n\t"
+        PTR_ADDIU "%[p_t_gain], %[p_t_gain],   4                         \n\t"
+        "madd.s  %[temp2],      %[temp2],      %[ag2],   %[temp0]        \n\t"
+        PTR_ADDIU "%[p_delay],  %[p_delay],    8                         \n\t"
+        "madd.s  %[temp3],      %[temp3],      %[ag2],   %[temp1]        \n\t"
+        PTR_ADDIU "%[p_out],    %[p_out],      8                         \n\t"
+        "mul.s   %[temp5],      %[temp4],      %[temp0]                  \n\t"
+        "mul.s   %[temp6],      %[temp4],      %[temp1]                  \n\t"
+        "swc1    %[temp2],      624(%[p_ap_delay])                       \n\t"
+        "swc1    %[temp3],      628(%[p_ap_delay])                       \n\t"
+        "swc1    %[temp5],      -8(%[p_out])                             \n\t"
+        "swc1    %[temp6],      -4(%[p_out])                             \n\t"
+        "bne     %[p_delay],    %[p_delay_end],1b                        \n\t"
+        " swc1   %[temp6],      -4(%[p_out])                             \n\t"
+        ".set    pop                                                     \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+          [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+          [temp9]"=&f"(temp9), [p_delay]"+r"(p_delay), [p_ap_delay]"+r"(p_ap_delay),
+          [p_Q_fract]"+r"(p_Q_fract), [p_t_gain]"+r"(p_t_gain), [p_out]"+r"(p_out),
+          [ag0]"=&f"(ag0), [ag1]"=&f"(ag1), [ag2]"=&f"(ag2)
+        : [phi_fract0]"f"(phi_fract0), [phi_fract1]"f"(phi_fract1),
+          [p_delay_end]"r"(p_delay_end), [g_decay_slope]"f"(g_decay_slope)
+        : "memory"
+    );
+}
+
+static void ps_stereo_interpolate_mips(float (*l)[2], float (*r)[2],
+                                    float h[2][4], float h_step[2][4],
+                                    int len)
+{
+    float h0 = h[0][0];
+    float h1 = h[0][1];
+    float h2 = h[0][2];
+    float h3 = h[0][3];
+    float hs0 = h_step[0][0];
+    float hs1 = h_step[0][1];
+    float hs2 = h_step[0][2];
+    float hs3 = h_step[0][3];
+    float temp0, temp1, temp2, temp3;
+    float l_re, l_im, r_re, r_im;
+
+    float *l_end = ((float *)l + (len << 1));
+
+    __asm__ volatile(
+        ".set    push                                     \n\t"
+        ".set    noreorder                                \n\t"
+    "1:                                                   \n\t"
+        "add.s   %[h0],     %[h0],     %[hs0]             \n\t"
+        "lwc1    %[l_re],   0(%[l])                       \n\t"
+        "add.s   %[h1],     %[h1],     %[hs1]             \n\t"
+        "lwc1    %[r_re],   0(%[r])                       \n\t"
+        "add.s   %[h2],     %[h2],     %[hs2]             \n\t"
+        "lwc1    %[l_im],   4(%[l])                       \n\t"
+        "add.s   %[h3],     %[h3],     %[hs3]             \n\t"
+        "lwc1    %[r_im],   4(%[r])                       \n\t"
+        "mul.s   %[temp0],  %[h0],     %[l_re]            \n\t"
+        PTR_ADDIU "%[l],    %[l],      8                  \n\t"
+        "mul.s   %[temp2],  %[h1],     %[l_re]            \n\t"
+        PTR_ADDIU "%[r],    %[r],      8                  \n\t"
+        "madd.s  %[temp0],  %[temp0],  %[h2],   %[r_re]   \n\t"
+        "madd.s  %[temp2],  %[temp2],  %[h3],   %[r_re]   \n\t"
+        "mul.s   %[temp1],  %[h0],     %[l_im]            \n\t"
+        "mul.s   %[temp3],  %[h1],     %[l_im]            \n\t"
+        "madd.s  %[temp1],  %[temp1],  %[h2],   %[r_im]   \n\t"
+        "madd.s  %[temp3],  %[temp3],  %[h3],   %[r_im]   \n\t"
+        "swc1    %[temp0],  -8(%[l])                      \n\t"
+        "swc1    %[temp2],  -8(%[r])                      \n\t"
+        "swc1    %[temp1],  -4(%[l])                      \n\t"
+        "bne     %[l],      %[l_end],  1b                 \n\t"
+        " swc1   %[temp3],  -4(%[r])                      \n\t"
+        ".set    pop                                      \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
+          [h0]"+f"(h0), [h1]"+f"(h1), [h2]"+f"(h2),
+          [h3]"+f"(h3), [l]"+r"(l), [r]"+r"(r),
+          [l_re]"=&f"(l_re), [l_im]"=&f"(l_im),
+          [r_re]"=&f"(r_re), [r_im]"=&f"(r_im)
+        : [hs0]"f"(hs0), [hs1]"f"(hs1), [hs2]"f"(hs2),
+          [hs3]"f"(hs3), [l_end]"r"(l_end)
+        : "memory"
+    );
+}
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_psdsp_init_mips(PSDSPContext *s)
+{
+#if HAVE_INLINE_ASM
+    s->hybrid_analysis_ileave = ps_hybrid_analysis_ileave_mips;
+    s->hybrid_synthesis_deint = ps_hybrid_synthesis_deint_mips;
+#if HAVE_MIPSFPU
+    s->add_squares            = ps_add_squares_mips;
+    s->mul_pair_single        = ps_mul_pair_single_mips;
+    s->decorrelate            = ps_decorrelate_mips;
+    s->stereo_interpolate[0]  = ps_stereo_interpolate_mips;
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacpsy_mips.h b/libavcodec/mips/aacpsy_mips.h
new file mode 100644
index 0000000000..596dcaddd0
--- /dev/null
+++ b/libavcodec/mips/aacpsy_mips.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic   (bojan@mips.com)
+ *
+ * AAC encoder psychoacoustic model routines optimized
+ * for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacpsy.c
+ */
+
+#ifndef AVCODEC_MIPS_AACPSY_MIPS_H
+#define AVCODEC_MIPS_AACPSY_MIPS_H
+
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU && ( PSY_LAME_FIR_LEN == 21 )
+static void calc_thr_3gpp_mips(const FFPsyWindowInfo *wi, const int num_bands,
+                               AacPsyChannel *pch, const uint8_t *band_sizes,
+                               const float *coefs)
+{
+    int i, w, g;
+    int start = 0;
+    for (w = 0; w < wi->num_windows*16; w += 16) {
+        for (g = 0; g < num_bands; g++) {
+            AacPsyBand *band = &pch->band[w+g];
+
+            float form_factor = 0.0f;
+            float Temp;
+            band->energy = 0.0f;
+            for (i = 0; i < band_sizes[g]; i+=4) {
+                float a, b, c, d;
+                float ax, bx, cx, dx;
+                float *cf = (float *)&coefs[start+i];
+
+                __asm__ volatile (
+                    "lwc1   %[a],   0(%[cf])                \n\t"
+                    "lwc1   %[b],   4(%[cf])                \n\t"
+                    "lwc1   %[c],   8(%[cf])                \n\t"
+                    "lwc1   %[d],   12(%[cf])               \n\t"
+                    "abs.s  %[a],   %[a]                    \n\t"
+                    "abs.s  %[b],   %[b]                    \n\t"
+                    "abs.s  %[c],   %[c]                    \n\t"
+                    "abs.s  %[d],   %[d]                    \n\t"
+                    "sqrt.s %[ax],  %[a]                    \n\t"
+                    "sqrt.s %[bx],  %[b]                    \n\t"
+                    "sqrt.s %[cx],  %[c]                    \n\t"
+                    "sqrt.s %[dx],  %[d]                    \n\t"
+                    "madd.s %[e],   %[e],   %[a],   %[a]    \n\t"
+                    "madd.s %[e],   %[e],   %[b],   %[b]    \n\t"
+                    "madd.s %[e],   %[e],   %[c],   %[c]    \n\t"
+                    "madd.s %[e],   %[e],   %[d],   %[d]    \n\t"
+                    "add.s  %[f],   %[f],   %[ax]           \n\t"
+                    "add.s  %[f],   %[f],   %[bx]           \n\t"
+                    "add.s  %[f],   %[f],   %[cx]           \n\t"
+                    "add.s  %[f],   %[f],   %[dx]           \n\t"
+
+                    : [a]"=&f"(a), [b]"=&f"(b),
+                      [c]"=&f"(c), [d]"=&f"(d),
+                      [e]"+f"(band->energy), [f]"+f"(form_factor),
+                      [ax]"=&f"(ax), [bx]"=&f"(bx),
+                      [cx]"=&f"(cx), [dx]"=&f"(dx)
+                    : [cf]"r"(cf)
+                    : "memory"
+                );
+            }
+
+            Temp = sqrtf((float)band_sizes[g] / band->energy);
+            band->thr      = band->energy * 0.001258925f;
+            band->nz_lines = form_factor * sqrtf(Temp);
+            start += band_sizes[g];
+        }
+    }
+}
+
+static void psy_hp_filter_mips(const float *firbuf, float *hpfsmpl, const float * psy_fir_coeffs)
+{
+    float sum1, sum2, sum3, sum4;
+    float *fb = (float*)firbuf;
+    float *fb_end = fb + AAC_BLOCK_SIZE_LONG;
+    float *hp = hpfsmpl;
+
+    float coeff0 = psy_fir_coeffs[1];
+    float coeff1 = psy_fir_coeffs[3];
+    float coeff2 = psy_fir_coeffs[5];
+    float coeff3 = psy_fir_coeffs[7];
+    float coeff4 = psy_fir_coeffs[9];
+
+    __asm__ volatile (
+        ".set push                                          \n\t"
+        ".set noreorder                                     \n\t"
+
+        "li.s   $f12,       32768                           \n\t"
+        "1:                                                 \n\t"
+        "lwc1   $f0,        40(%[fb])                       \n\t"
+        "lwc1   $f1,        4(%[fb])                        \n\t"
+        "lwc1   $f2,        80(%[fb])                       \n\t"
+        "lwc1   $f3,        44(%[fb])                       \n\t"
+        "lwc1   $f4,        8(%[fb])                        \n\t"
+        "madd.s %[sum1],    $f0,        $f1,    %[coeff0]   \n\t"
+        "lwc1   $f5,        84(%[fb])                       \n\t"
+        "lwc1   $f6,        48(%[fb])                       \n\t"
+        "madd.s %[sum2],    $f3,        $f4,    %[coeff0]   \n\t"
+        "lwc1   $f7,        12(%[fb])                       \n\t"
+        "madd.s %[sum1],    %[sum1],    $f2,    %[coeff0]   \n\t"
+        "lwc1   $f8,        88(%[fb])                       \n\t"
+        "lwc1   $f9,        52(%[fb])                       \n\t"
+        "madd.s %[sum2],    %[sum2],    $f5,    %[coeff0]   \n\t"
+        "madd.s %[sum3],    $f6,        $f7,    %[coeff0]   \n\t"
+        "lwc1   $f10,       16(%[fb])                       \n\t"
+        "lwc1   $f11,       92(%[fb])                       \n\t"
+        "madd.s %[sum1],    %[sum1],    $f7,    %[coeff1]   \n\t"
+        "lwc1   $f1,        72(%[fb])                       \n\t"
+        "madd.s %[sum3],    %[sum3],    $f8,    %[coeff0]   \n\t"
+        "madd.s %[sum4],    $f9,        $f10,   %[coeff0]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f10,   %[coeff1]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f1,    %[coeff1]   \n\t"
+        "lwc1   $f4,        76(%[fb])                       \n\t"
+        "lwc1   $f8,        20(%[fb])                       \n\t"
+        "madd.s %[sum4],    %[sum4],    $f11,   %[coeff0]   \n\t"
+        "lwc1   $f11,       24(%[fb])                       \n\t"
+        "madd.s %[sum2],    %[sum2],    $f4,    %[coeff1]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f8,    %[coeff2]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f8,    %[coeff1]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f11,   %[coeff1]   \n\t"
+        "lwc1   $f7,        64(%[fb])                       \n\t"
+        "madd.s %[sum2],    %[sum2],    $f11,   %[coeff2]   \n\t"
+        "lwc1   $f10,       68(%[fb])                       \n\t"
+        "madd.s %[sum3],    %[sum3],    $f2,    %[coeff1]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f5,    %[coeff1]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f7,    %[coeff2]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f10,   %[coeff2]   \n\t"
+        "lwc1   $f2,        28(%[fb])                       \n\t"
+        "lwc1   $f5,        32(%[fb])                       \n\t"
+        "lwc1   $f8,        56(%[fb])                       \n\t"
+        "lwc1   $f11,       60(%[fb])                       \n\t"
+        "madd.s %[sum3],    %[sum3],    $f2,    %[coeff2]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f5,    %[coeff2]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f2,    %[coeff3]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f5,    %[coeff3]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f1,    %[coeff2]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f4,    %[coeff2]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f8,    %[coeff3]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f11,   %[coeff3]   \n\t"
+        "lwc1   $f1,        36(%[fb])                       \n\t"
+        PTR_ADDIU "%[fb],   %[fb],      16                  \n\t"
+        "madd.s %[sum4],    %[sum4],    $f0,    %[coeff3]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f1,    %[coeff3]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f1,    %[coeff4]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f0,    %[coeff4]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f10,   %[coeff3]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f7,    %[coeff3]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f6,    %[coeff4]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f9,    %[coeff4]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f6,    %[coeff4]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f3,    %[coeff4]   \n\t"
+        "mul.s  %[sum1],    %[sum1],    $f12                \n\t"
+        "mul.s  %[sum2],    %[sum2],    $f12                \n\t"
+        "madd.s %[sum4],    %[sum4],    $f11,   %[coeff4]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f8,    %[coeff4]   \n\t"
+        "swc1   %[sum1],    0(%[hp])                        \n\t"
+        "swc1   %[sum2],    4(%[hp])                        \n\t"
+        "mul.s  %[sum4],    %[sum4],    $f12                \n\t"
+        "mul.s  %[sum3],    %[sum3],    $f12                \n\t"
+        "swc1   %[sum4],    12(%[hp])                       \n\t"
+        "swc1   %[sum3],    8(%[hp])                        \n\t"
+        "bne    %[fb],      %[fb_end],  1b                  \n\t"
+        PTR_ADDIU "%[hp],   %[hp],      16                  \n\t"
+
+        ".set pop                                           \n\t"
+
+        : [sum1]"=&f"(sum1), [sum2]"=&f"(sum2),
+          [sum3]"=&f"(sum3), [sum4]"=&f"(sum4),
+          [fb]"+r"(fb), [hp]"+r"(hp)
+        : [coeff0]"f"(coeff0), [coeff1]"f"(coeff1),
+          [coeff2]"f"(coeff2), [coeff3]"f"(coeff3),
+          [coeff4]"f"(coeff4), [fb_end]"r"(fb_end)
+        : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6",
+          "$f7", "$f8", "$f9", "$f10", "$f11", "$f12",
+          "memory"
+    );
+}
+
+#define calc_thr_3gpp calc_thr_3gpp_mips
+#define psy_hp_filter psy_hp_filter_mips
+
+#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */
+#endif /* AVCODEC_MIPS_AACPSY_MIPS_H */
diff --git a/libavcodec/mips/aacsbr_mips.c b/libavcodec/mips/aacsbr_mips.c
new file mode 100644
index 0000000000..e478290e47
--- /dev/null
+++ b/libavcodec/mips/aacsbr_mips.c
@@ -0,0 +1,619 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacsbr.c
+ */
+
+#include "libavcodec/aac.h"
+#include "libavcodec/aacsbr.h"
+#include "libavutil/mips/asmdefs.h"
+
+#define ENVELOPE_ADJUSTMENT_OFFSET 2
+
+#if HAVE_INLINE_ASM
+static int sbr_lf_gen_mips(AACContext *ac, SpectralBandReplication *sbr,
+                      float X_low[32][40][2], const float W[2][32][32][2],
+                      int buf_idx)
+{
+    int i, k;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    float *p_x_low = &X_low[0][8][0];
+    float *p_w = (float*)&W[buf_idx][0][0][0];
+    float *p_x1_low = &X_low[0][0][0];
+    float *p_w1 = (float*)&W[1-buf_idx][24][0][0];
+
+    float *loop_end=p_x1_low + 2560;
+
+    /* loop unrolled 8 times */
+    __asm__ volatile (
+    "1:                                                 \n\t"
+        "sw     $0,            0(%[p_x1_low])           \n\t"
+        "sw     $0,            4(%[p_x1_low])           \n\t"
+        "sw     $0,            8(%[p_x1_low])           \n\t"
+        "sw     $0,            12(%[p_x1_low])          \n\t"
+        "sw     $0,            16(%[p_x1_low])          \n\t"
+        "sw     $0,            20(%[p_x1_low])          \n\t"
+        "sw     $0,            24(%[p_x1_low])          \n\t"
+        "sw     $0,            28(%[p_x1_low])          \n\t"
+        PTR_ADDIU "%[p_x1_low],%[p_x1_low],      32     \n\t"
+        "bne    %[p_x1_low],   %[loop_end],      1b     \n\t"
+        PTR_ADDIU "%[p_x1_low],%[p_x1_low],      -10240 \n\t"
+
+        : [p_x1_low]"+r"(p_x1_low)
+        : [loop_end]"r"(loop_end)
+        : "memory"
+    );
+
+    for (k = 0; k < sbr->kx[1]; k++) {
+        for (i = 0; i < 32; i+=4) {
+            /* loop unrolled 4 times */
+            __asm__ volatile (
+                "lw     %[temp0],   0(%[p_w])               \n\t"
+                "lw     %[temp1],   4(%[p_w])               \n\t"
+                "lw     %[temp2],   256(%[p_w])             \n\t"
+                "lw     %[temp3],   260(%[p_w])             \n\t"
+                "lw     %[temp4],   512(%[p_w])             \n\t"
+                "lw     %[temp5],   516(%[p_w])             \n\t"
+                "lw     %[temp6],   768(%[p_w])             \n\t"
+                "lw     %[temp7],   772(%[p_w])             \n\t"
+                "sw     %[temp0],   0(%[p_x_low])           \n\t"
+                "sw     %[temp1],   4(%[p_x_low])           \n\t"
+                "sw     %[temp2],   8(%[p_x_low])           \n\t"
+                "sw     %[temp3],   12(%[p_x_low])          \n\t"
+                "sw     %[temp4],   16(%[p_x_low])          \n\t"
+                "sw     %[temp5],   20(%[p_x_low])          \n\t"
+                "sw     %[temp6],   24(%[p_x_low])          \n\t"
+                "sw     %[temp7],   28(%[p_x_low])          \n\t"
+                PTR_ADDIU "%[p_x_low], %[p_x_low],  32      \n\t"
+                PTR_ADDIU "%[p_w],     %[p_w],      1024    \n\t"
+
+                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                  [p_w]"+r"(p_w), [p_x_low]"+r"(p_x_low)
+                :
+                : "memory"
+            );
+        }
+        p_x_low += 16;
+        p_w -= 2046;
+    }
+
+    for (k = 0; k < sbr->kx[0]; k++) {
+        for (i = 0; i < 2; i++) {
+
+            /* loop unrolled 4 times */
+            __asm__ volatile (
+                "lw     %[temp0],    0(%[p_w1])             \n\t"
+                "lw     %[temp1],    4(%[p_w1])             \n\t"
+                "lw     %[temp2],    256(%[p_w1])           \n\t"
+                "lw     %[temp3],    260(%[p_w1])           \n\t"
+                "lw     %[temp4],    512(%[p_w1])           \n\t"
+                "lw     %[temp5],    516(%[p_w1])           \n\t"
+                "lw     %[temp6],    768(%[p_w1])           \n\t"
+                "lw     %[temp7],    772(%[p_w1])           \n\t"
+                "sw     %[temp0],    0(%[p_x1_low])         \n\t"
+                "sw     %[temp1],    4(%[p_x1_low])         \n\t"
+                "sw     %[temp2],    8(%[p_x1_low])         \n\t"
+                "sw     %[temp3],    12(%[p_x1_low])        \n\t"
+                "sw     %[temp4],    16(%[p_x1_low])        \n\t"
+                "sw     %[temp5],    20(%[p_x1_low])        \n\t"
+                "sw     %[temp6],    24(%[p_x1_low])        \n\t"
+                "sw     %[temp7],    28(%[p_x1_low])        \n\t"
+                PTR_ADDIU "%[p_x1_low], %[p_x1_low], 32     \n\t"
+                PTR_ADDIU "%[p_w1],     %[p_w1],     1024   \n\t"
+
+                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                  [p_w1]"+r"(p_w1), [p_x1_low]"+r"(p_x1_low)
+                :
+                : "memory"
+            );
+        }
+        p_x1_low += 64;
+        p_w1 -= 510;
+    }
+    return 0;
+}
+
+static int sbr_x_gen_mips(SpectralBandReplication *sbr, float X[2][38][64],
+                     const float Y0[38][64][2], const float Y1[38][64][2],
+                     const float X_low[32][40][2], int ch)
+{
+    int k, i;
+    const int i_f = 32;
+    int temp0, temp1, temp2, temp3;
+    const float *X_low1, *Y01, *Y11;
+    float *x1=&X[0][0][0];
+    float *j=x1+4864;
+    const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);
+
+    /* loop unrolled 8 times */
+    __asm__ volatile (
+    "1:                                       \n\t"
+        "sw     $0,      0(%[x1])             \n\t"
+        "sw     $0,      4(%[x1])             \n\t"
+        "sw     $0,      8(%[x1])             \n\t"
+        "sw     $0,      12(%[x1])            \n\t"
+        "sw     $0,      16(%[x1])            \n\t"
+        "sw     $0,      20(%[x1])            \n\t"
+        "sw     $0,      24(%[x1])            \n\t"
+        "sw     $0,      28(%[x1])            \n\t"
+        PTR_ADDIU "%[x1],%[x1],      32       \n\t"
+        "bne    %[x1],   %[j],       1b       \n\t"
+        PTR_ADDIU "%[x1],%[x1],      -19456   \n\t"
+
+        : [x1]"+r"(x1)
+        : [j]"r"(j)
+        : "memory"
+    );
+
+    if (i_Temp != 0) {
+
+        X_low1=&X_low[0][2][0];
+
+        for (k = 0; k < sbr->kx[0]; k++) {
+
+            __asm__ volatile (
+                "move    %[i],        $zero                  \n\t"
+            "2:                                              \n\t"
+                "lw      %[temp0],    0(%[X_low1])           \n\t"
+                "lw      %[temp1],    4(%[X_low1])           \n\t"
+                "sw      %[temp0],    0(%[x1])               \n\t"
+                "sw      %[temp1],    9728(%[x1])            \n\t"
+                PTR_ADDIU "%[x1],     %[x1],         256     \n\t"
+                PTR_ADDIU "%[X_low1], %[X_low1],     8       \n\t"
+                "addiu   %[i],        %[i],          1       \n\t"
+                "bne     %[i],        %[i_Temp],     2b      \n\t"
+
+                : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
+                  [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+                : [i_Temp]"r"(i_Temp)
+                : "memory"
+            );
+            x1-=(i_Temp<<6)-1;
+            X_low1-=(i_Temp<<1)-80;
+        }
+
+        x1=&X[0][0][k];
+        Y01=(float*)&Y0[32][k][0];
+
+        for (; k < sbr->kx[0] + sbr->m[0]; k++) {
+            __asm__ volatile (
+                "move    %[i],       $zero               \n\t"
+            "3:                                          \n\t"
+                "lw      %[temp0],   0(%[Y01])           \n\t"
+                "lw      %[temp1],   4(%[Y01])           \n\t"
+                "sw      %[temp0],   0(%[x1])            \n\t"
+                "sw      %[temp1],   9728(%[x1])         \n\t"
+                PTR_ADDIU "%[x1],    %[x1],      256     \n\t"
+                PTR_ADDIU "%[Y01],   %[Y01],     512     \n\t"
+                "addiu   %[i],       %[i],       1       \n\t"
+                "bne     %[i],       %[i_Temp],  3b      \n\t"
+
+                : [x1]"+r"(x1), [Y01]"+r"(Y01), [i]"=&r"(i),
+                  [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+                : [i_Temp]"r"(i_Temp)
+                : "memory"
+            );
+            x1 -=(i_Temp<<6)-1;
+            Y01 -=(i_Temp<<7)-2;
+        }
+    }
+
+    x1=&X[0][i_Temp][0];
+    X_low1=&X_low[0][i_Temp+2][0];
+    temp3=38;
+
+    for (k = 0; k < sbr->kx[1]; k++) {
+
+        __asm__ volatile (
+            "move    %[i],       %[i_Temp]              \n\t"
+        "4:                                             \n\t"
+            "lw      %[temp0],   0(%[X_low1])           \n\t"
+            "lw      %[temp1],   4(%[X_low1])           \n\t"
+            "sw      %[temp0],   0(%[x1])               \n\t"
+            "sw      %[temp1],   9728(%[x1])            \n\t"
+            PTR_ADDIU "%[x1],    %[x1],         256     \n\t"
+            PTR_ADDIU "%[X_low1],%[X_low1],     8       \n\t"
+            "addiu   %[i],       %[i],          1       \n\t"
+            "bne     %[i],       %[temp3],      4b      \n\t"
+
+            : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
+              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2)
+            : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3)
+            : "memory"
+        );
+        x1 -= ((38-i_Temp)<<6)-1;
+        X_low1 -= ((38-i_Temp)<<1)- 80;
+    }
+
+    x1=&X[0][i_Temp][k];
+    Y11=&Y1[i_Temp][k][0];
+    temp2=32;
+
+    for (; k < sbr->kx[1] + sbr->m[1]; k++) {
+
+        __asm__ volatile (
+           "move    %[i],       %[i_Temp]               \n\t"
+        "5:                                             \n\t"
+           "lw      %[temp0],   0(%[Y11])               \n\t"
+           "lw      %[temp1],   4(%[Y11])               \n\t"
+           "sw      %[temp0],   0(%[x1])                \n\t"
+           "sw      %[temp1],   9728(%[x1])             \n\t"
+           PTR_ADDIU "%[x1],    %[x1],          256     \n\t"
+           PTR_ADDIU "%[Y11],   %[Y11],         512     \n\t"
+           "addiu   %[i],       %[i],           1       \n\t"
+           "bne     %[i],       %[temp2],       5b      \n\t"
+
+           : [x1]"+r"(x1), [Y11]"+r"(Y11), [i]"=&r"(i),
+             [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+           : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3),
+             [temp2]"r"(temp2)
+           : "memory"
+        );
+
+        x1 -= ((32-i_Temp)<<6)-1;
+        Y11 -= ((32-i_Temp)<<7)-2;
+   }
+      return 0;
+}
+
+#if HAVE_MIPSFPU
+static void sbr_hf_assemble_mips(float Y1[38][64][2],
+                            const float X_high[64][40][2],
+                            SpectralBandReplication *sbr, SBRData *ch_data,
+                            const int e_a[2])
+{
+    int e, i, j, m;
+    const int h_SL = 4 * !sbr->bs_smoothing_mode;
+    const int kx = sbr->kx[1];
+    const int m_max = sbr->m[1];
+    static const float h_smooth[5] = {
+        0.33333333333333,
+        0.30150283239582,
+        0.21816949906249,
+        0.11516383427084,
+        0.03183050093751,
+    };
+
+    float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
+    int indexnoise = ch_data->f_indexnoise;
+    int indexsine  = ch_data->f_indexsine;
+    float *g_temp1, *q_temp1, *pok, *pok1;
+    float temp1, temp2, temp3, temp4;
+    int size = m_max;
+
+    if (sbr->reset) {
+        for (i = 0; i < h_SL; i++) {
+            memcpy(g_temp[i + 2*ch_data->t_env[0]], sbr->gain[0], m_max * sizeof(sbr->gain[0][0]));
+            memcpy(q_temp[i + 2*ch_data->t_env[0]], sbr->q_m[0],  m_max * sizeof(sbr->q_m[0][0]));
+        }
+    } else if (h_SL) {
+        memcpy(g_temp[2*ch_data->t_env[0]], g_temp[2*ch_data->t_env_num_env_old], 4*sizeof(g_temp[0]));
+        memcpy(q_temp[2*ch_data->t_env[0]], q_temp[2*ch_data->t_env_num_env_old], 4*sizeof(q_temp[0]));
+    }
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
+            g_temp1 = g_temp[h_SL + i];
+            pok = sbr->gain[e];
+            q_temp1 = q_temp[h_SL + i];
+            pok1 = sbr->q_m[e];
+
+            /* loop unrolled 4 times */
+            for (j=0; j<(size>>2); j++) {
+                __asm__ volatile (
+                    "lw      %[temp1],   0(%[pok])               \n\t"
+                    "lw      %[temp2],   4(%[pok])               \n\t"
+                    "lw      %[temp3],   8(%[pok])               \n\t"
+                    "lw      %[temp4],   12(%[pok])              \n\t"
+                    "sw      %[temp1],   0(%[g_temp1])           \n\t"
+                    "sw      %[temp2],   4(%[g_temp1])           \n\t"
+                    "sw      %[temp3],   8(%[g_temp1])           \n\t"
+                    "sw      %[temp4],   12(%[g_temp1])          \n\t"
+                    "lw      %[temp1],   0(%[pok1])              \n\t"
+                    "lw      %[temp2],   4(%[pok1])              \n\t"
+                    "lw      %[temp3],   8(%[pok1])              \n\t"
+                    "lw      %[temp4],   12(%[pok1])             \n\t"
+                    "sw      %[temp1],   0(%[q_temp1])           \n\t"
+                    "sw      %[temp2],   4(%[q_temp1])           \n\t"
+                    "sw      %[temp3],   8(%[q_temp1])           \n\t"
+                    "sw      %[temp4],   12(%[q_temp1])          \n\t"
+                    PTR_ADDIU "%[pok],     %[pok],         16    \n\t"
+                    PTR_ADDIU "%[g_temp1], %[g_temp1],     16    \n\t"
+                    PTR_ADDIU "%[pok1],    %[pok1],        16    \n\t"
+                    PTR_ADDIU "%[q_temp1], %[q_temp1],     16    \n\t"
+
+                    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+                      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+                      [pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
+                      [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
+                    :
+                    : "memory"
+                );
+            }
+
+            for (j=0; j<(size&3); j++) {
+                __asm__ volatile (
+                    "lw      %[temp1],   0(%[pok])              \n\t"
+                    "lw      %[temp2],   0(%[pok1])             \n\t"
+                    "sw      %[temp1],   0(%[g_temp1])          \n\t"
+                    "sw      %[temp2],   0(%[q_temp1])          \n\t"
+                    PTR_ADDIU "%[pok],     %[pok],        4     \n\t"
+                    PTR_ADDIU "%[g_temp1], %[g_temp1],    4     \n\t"
+                    PTR_ADDIU "%[pok1],    %[pok1],       4     \n\t"
+                    PTR_ADDIU "%[q_temp1], %[q_temp1],    4     \n\t"
+
+                    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+                      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+                      [pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
+                      [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
+                    :
+                    : "memory"
+                );
+            }
+        }
+    }
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
+            LOCAL_ALIGNED_16(float, g_filt_tab, [48]);
+            LOCAL_ALIGNED_16(float, q_filt_tab, [48]);
+            float *g_filt, *q_filt;
+
+            if (h_SL && e != e_a[0] && e != e_a[1]) {
+                g_filt = g_filt_tab;
+                q_filt = q_filt_tab;
+
+                for (m = 0; m < m_max; m++) {
+                    const int idx1 = i + h_SL;
+                    g_filt[m] = 0.0f;
+                    q_filt[m] = 0.0f;
+
+                    for (j = 0; j <= h_SL; j++) {
+                        g_filt[m] += g_temp[idx1 - j][m] * h_smooth[j];
+                        q_filt[m] += q_temp[idx1 - j][m] * h_smooth[j];
+                    }
+                }
+            } else {
+                g_filt = g_temp[i + h_SL];
+                q_filt = q_temp[i];
+            }
+
+            sbr->dsp.hf_g_filt(Y1[i] + kx, X_high + kx, g_filt, m_max,
+                               i + ENVELOPE_ADJUSTMENT_OFFSET);
+
+            if (e != e_a[0] && e != e_a[1]) {
+                sbr->dsp.hf_apply_noise[indexsine](Y1[i] + kx, sbr->s_m[e],
+                                                   q_filt, indexnoise,
+                                                   kx, m_max);
+            } else {
+                int idx = indexsine&1;
+                int A = (1-((indexsine+(kx & 1))&2));
+                int B = (A^(-idx)) + idx;
+                float *out = &Y1[i][kx][idx];
+                float *in  = sbr->s_m[e];
+                float temp0, temp1, temp2, temp3, temp4, temp5;
+                float A_f = (float)A;
+                float B_f = (float)B;
+
+                for (m = 0; m+1 < m_max; m+=2) {
+
+                    temp2 = out[0];
+                    temp3 = out[2];
+
+                    __asm__ volatile(
+                        "lwc1    %[temp0],  0(%[in])                     \n\t"
+                        "lwc1    %[temp1],  4(%[in])                     \n\t"
+                        "madd.s  %[temp4],  %[temp2],  %[temp0], %[A_f]  \n\t"
+                        "madd.s  %[temp5],  %[temp3],  %[temp1], %[B_f]  \n\t"
+                        "swc1    %[temp4],  0(%[out])                    \n\t"
+                        "swc1    %[temp5],  8(%[out])                    \n\t"
+                        PTR_ADDIU "%[in],   %[in],     8                 \n\t"
+                        PTR_ADDIU "%[out],  %[out],    16                \n\t"
+
+                        : [temp0]"=&f" (temp0), [temp1]"=&f"(temp1),
+                          [temp4]"=&f" (temp4), [temp5]"=&f"(temp5),
+                          [in]"+r"(in), [out]"+r"(out)
+                        : [A_f]"f"(A_f), [B_f]"f"(B_f), [temp2]"f"(temp2),
+                          [temp3]"f"(temp3)
+                        : "memory"
+                    );
+                }
+                if(m_max&1)
+                    out[2*m  ] += in[m  ] * A;
+            }
+            indexnoise = (indexnoise + m_max) & 0x1ff;
+            indexsine = (indexsine + 1) & 3;
+        }
+    }
+    ch_data->f_indexnoise = indexnoise;
+    ch_data->f_indexsine  = indexsine;
+}
+
+static void sbr_hf_inverse_filter_mips(SBRDSPContext *dsp,
+                                  float (*alpha0)[2], float (*alpha1)[2],
+                                  const float X_low[32][40][2], int k0)
+{
+    int k;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, c;
+    float *phi1, *alpha_1, *alpha_0, res1, res2, temp_real, temp_im;
+
+    c = 1.000001f;
+
+    for (k = 0; k < k0; k++) {
+        LOCAL_ALIGNED_16(float, phi, [3], [2][2]);
+        float dk;
+        phi1 = &phi[0][0][0];
+        alpha_1 = &alpha1[k][0];
+        alpha_0 = &alpha0[k][0];
+        dsp->autocorrelate(X_low[k], phi);
+
+        __asm__ volatile (
+            "lwc1    %[temp0],  40(%[phi1])                       \n\t"
+            "lwc1    %[temp1],  16(%[phi1])                       \n\t"
+            "lwc1    %[temp2],  24(%[phi1])                       \n\t"
+            "lwc1    %[temp3],  28(%[phi1])                       \n\t"
+            "mul.s   %[dk],     %[temp0],    %[temp1]             \n\t"
+            "lwc1    %[temp4],  0(%[phi1])                        \n\t"
+            "mul.s   %[res2],   %[temp2],    %[temp2]             \n\t"
+            "lwc1    %[temp5],  4(%[phi1])                        \n\t"
+            "madd.s  %[res2],   %[res2],     %[temp3],  %[temp3]  \n\t"
+            "lwc1    %[temp6],  8(%[phi1])                        \n\t"
+            "div.s   %[res2],   %[res2],     %[c]                 \n\t"
+            "lwc1    %[temp0],  12(%[phi1])                       \n\t"
+            "sub.s   %[dk],     %[dk],       %[res2]              \n\t"
+
+            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [temp6]"=&f"(temp6), [res2]"=&f"(res2), [dk]"=&f"(dk)
+            : [phi1]"r"(phi1), [c]"f"(c)
+            : "memory"
+        );
+
+        if (!dk) {
+            alpha_1[0] = 0;
+            alpha_1[1] = 0;
+        } else {
+            __asm__ volatile (
+                "mul.s   %[temp_real], %[temp4],     %[temp2]            \n\t"
+                "nmsub.s %[temp_real], %[temp_real], %[temp5], %[temp3]  \n\t"
+                "nmsub.s %[temp_real], %[temp_real], %[temp6], %[temp1]  \n\t"
+                "mul.s   %[temp_im],   %[temp4],     %[temp3]            \n\t"
+                "madd.s  %[temp_im],   %[temp_im],   %[temp5], %[temp2]  \n\t"
+                "nmsub.s %[temp_im],   %[temp_im],   %[temp0], %[temp1]  \n\t"
+                "div.s   %[temp_real], %[temp_real], %[dk]               \n\t"
+                "div.s   %[temp_im],   %[temp_im],   %[dk]               \n\t"
+                "swc1    %[temp_real], 0(%[alpha_1])                     \n\t"
+                "swc1    %[temp_im],   4(%[alpha_1])                     \n\t"
+
+                : [temp_real]"=&f" (temp_real), [temp_im]"=&f"(temp_im)
+                : [phi1]"r"(phi1), [temp0]"f"(temp0), [temp1]"f"(temp1),
+                  [temp2]"f"(temp2), [temp3]"f"(temp3), [temp4]"f"(temp4),
+                  [temp5]"f"(temp5), [temp6]"f"(temp6),
+                  [alpha_1]"r"(alpha_1), [dk]"f"(dk)
+                : "memory"
+            );
+        }
+
+        if (!phi1[4]) {
+            alpha_0[0] = 0;
+            alpha_0[1] = 0;
+        } else {
+            __asm__ volatile (
+                "lwc1    %[temp6],     0(%[alpha_1])                     \n\t"
+                "lwc1    %[temp7],     4(%[alpha_1])                     \n\t"
+                "mul.s   %[temp_real], %[temp6],     %[temp2]            \n\t"
+                "add.s   %[temp_real], %[temp_real], %[temp4]            \n\t"
+                "madd.s  %[temp_real], %[temp_real], %[temp7], %[temp3]  \n\t"
+                "mul.s   %[temp_im],   %[temp7],     %[temp2]            \n\t"
+                "add.s   %[temp_im],   %[temp_im],   %[temp5]            \n\t"
+                "nmsub.s %[temp_im],   %[temp_im],   %[temp6], %[temp3]  \n\t"
+                "div.s   %[temp_real], %[temp_real], %[temp1]            \n\t"
+                "div.s   %[temp_im],   %[temp_im],   %[temp1]            \n\t"
+                "neg.s   %[temp_real], %[temp_real]                      \n\t"
+                "neg.s   %[temp_im],   %[temp_im]                        \n\t"
+                "swc1    %[temp_real], 0(%[alpha_0])                     \n\t"
+                "swc1    %[temp_im],   4(%[alpha_0])                     \n\t"
+
+                : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
+                  [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+                  [res1]"=&f"(res1), [res2]"=&f"(res2)
+                : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0),
+                  [temp0]"f"(temp0), [temp1]"f"(temp1), [temp2]"f"(temp2),
+                  [temp3]"f"(temp3), [temp4]"f"(temp4), [temp5]"f"(temp5)
+                : "memory"
+            );
+        }
+
+        __asm__ volatile (
+            "lwc1    %[temp1],      0(%[alpha_1])                           \n\t"
+            "lwc1    %[temp2],      4(%[alpha_1])                           \n\t"
+            "lwc1    %[temp_real],  0(%[alpha_0])                           \n\t"
+            "lwc1    %[temp_im],    4(%[alpha_0])                           \n\t"
+            "mul.s   %[res1],       %[temp1],      %[temp1]                 \n\t"
+            "madd.s  %[res1],       %[res1],       %[temp2],    %[temp2]    \n\t"
+            "mul.s   %[res2],       %[temp_real],  %[temp_real]             \n\t"
+            "madd.s  %[res2],       %[res2],       %[temp_im],  %[temp_im]  \n\t"
+
+            : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
+              [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [res1]"=&f"(res1), [res2]"=&f"(res2)
+            : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0)
+            : "memory"
+        );
+
+        if (res1 >= 16.0f || res2 >= 16.0f) {
+            alpha_1[0] = 0;
+            alpha_1[1] = 0;
+            alpha_0[0] = 0;
+            alpha_0[1] = 0;
+        }
+    }
+}
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_aacsbr_func_ptr_init_mips(AACSBRContext *c)
+{
+#if HAVE_INLINE_ASM
+    c->sbr_lf_gen            = sbr_lf_gen_mips;
+    c->sbr_x_gen             = sbr_x_gen_mips;
+#if HAVE_MIPSFPU
+    c->sbr_hf_inverse_filter = sbr_hf_inverse_filter_mips;
+    c->sbr_hf_assemble       = sbr_hf_assemble_mips;
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacsbr_mips.h b/libavcodec/mips/aacsbr_mips.h
new file mode 100644
index 0000000000..da8389f484
--- /dev/null
+++ b/libavcodec/mips/aacsbr_mips.h
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacsbr.c
+ */
+
+#ifndef AVCODEC_MIPS_AACSBR_FLOAT_H
+#define AVCODEC_MIPS_AACSBR_FLOAT_H
+
+#include "libavcodec/aac.h"
+#include "libavcodec/sbr.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void sbr_qmf_analysis_mips(AVFloatDSPContext *fdsp, FFTContext *mdct,
+                             SBRDSPContext *sbrdsp, const float *in, float *x,
+                             float z[320], float W[2][32][32][2], int buf_idx)
+{
+    int i;
+    float *w0;
+    float *w1;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    w0 = x;
+    w1 = x + 1024;
+    for(i = 0; i < 36; i++)
+    {
+        /* loop unrolled 8 times */
+        __asm__ volatile(
+            "lw      %[temp0],   0(%[w1])         \n\t"
+            "lw      %[temp1],   4(%[w1])         \n\t"
+            "lw      %[temp2],   8(%[w1])         \n\t"
+            "lw      %[temp3],   12(%[w1])        \n\t"
+            "lw      %[temp4],   16(%[w1])        \n\t"
+            "lw      %[temp5],   20(%[w1])        \n\t"
+            "lw      %[temp6],   24(%[w1])        \n\t"
+            "lw      %[temp7],   28(%[w1])        \n\t"
+            "sw      %[temp0],   0(%[w0])         \n\t"
+            "sw      %[temp1],   4(%[w0])         \n\t"
+            "sw      %[temp2],   8(%[w0])         \n\t"
+            "sw      %[temp3],   12(%[w0])        \n\t"
+            "sw      %[temp4],   16(%[w0])        \n\t"
+            "sw      %[temp5],   20(%[w0])        \n\t"
+            "sw      %[temp6],   24(%[w0])        \n\t"
+            "sw      %[temp7],   28(%[w0])        \n\t"
+            PTR_ADDIU " %[w0],      %[w0],     32 \n\t"
+            PTR_ADDIU " %[w1],      %[w1],     32 \n\t"
+
+            : [w0]"+r"(w0), [w1]"+r"(w1),
+              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+            :
+            : "memory"
+        );
+    }
+
+    w0 = x + 288;
+    w1 = (float*)in;
+    for(i = 0; i < 128; i++)
+    {
+        /* loop unrolled 8 times */
+        __asm__ volatile(
+            "lw       %[temp0],    0(%[w1])        \n\t"
+            "lw       %[temp1],    4(%[w1])        \n\t"
+            "lw       %[temp2],    8(%[w1])        \n\t"
+            "lw       %[temp3],    12(%[w1])       \n\t"
+            "lw       %[temp4],    16(%[w1])       \n\t"
+            "lw       %[temp5],    20(%[w1])       \n\t"
+            "lw       %[temp6],    24(%[w1])       \n\t"
+            "lw       %[temp7],    28(%[w1])       \n\t"
+            "sw       %[temp0],    0(%[w0])        \n\t"
+            "sw       %[temp1],    4(%[w0])        \n\t"
+            "sw       %[temp2],    8(%[w0])        \n\t"
+            "sw       %[temp3],    12(%[w0])       \n\t"
+            "sw       %[temp4],    16(%[w0])       \n\t"
+            "sw       %[temp5],    20(%[w0])       \n\t"
+            "sw       %[temp6],    24(%[w0])       \n\t"
+            "sw       %[temp7],    28(%[w0])       \n\t"
+            PTR_ADDIU "  %[w0],       %[w0],    32 \n\t"
+            PTR_ADDIU "  %[w1],       %[w1],    32 \n\t"
+
+            : [w0]"+r"(w0), [w1]"+r"(w1),
+              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+            :
+            : "memory"
+        );
+    }
+
+    for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample frames
+                               // are not supported
+        fdsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320);
+        sbrdsp->sum64x5(z);
+        sbrdsp->qmf_pre_shuffle(z);
+        mdct->imdct_half(mdct, z, z+64);
+        sbrdsp->qmf_post_shuffle(W[buf_idx][i], z);
+        x += 32;
+    }
+}
+
+#if HAVE_MIPSFPU
+static void sbr_qmf_synthesis_mips(FFTContext *mdct,
+                              SBRDSPContext *sbrdsp, AVFloatDSPContext *fdsp,
+                              float *out, float X[2][38][64],
+                              float mdct_buf[2][64],
+                              float *v0, int *v_off, const unsigned int div)
+{
+    int i, n;
+    const float *sbr_qmf_window = div ? sbr_qmf_window_ds : sbr_qmf_window_us;
+    const int step = 128 >> div;
+    float *v;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12, temp13;
+    float temp14, temp15, temp16, temp17, temp18, temp19;
+    float *vv0, *s0, *dst;
+    dst = out;
+
+    for (i = 0; i < 32; i++) {
+        if (*v_off < step) {
+            int saved_samples = (1280 - 128) >> div;
+            memcpy(&v0[SBR_SYNTHESIS_BUF_SIZE - saved_samples], v0, saved_samples * sizeof(float));
+            *v_off = SBR_SYNTHESIS_BUF_SIZE - saved_samples - step;
+        } else {
+            *v_off -= step;
+        }
+        v = v0 + *v_off;
+        if (div) {
+            for (n = 0; n < 32; n++) {
+                X[0][i][   n] = -X[0][i][n];
+                X[0][i][32+n] =  X[1][i][31-n];
+            }
+            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
+            sbrdsp->qmf_deint_neg(v, mdct_buf[0]);
+        } else {
+            sbrdsp->neg_odd_64(X[1][i]);
+            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
+            mdct->imdct_half(mdct, mdct_buf[1], X[1][i]);
+            sbrdsp->qmf_deint_bfly(v, mdct_buf[1], mdct_buf[0]);
+        }
+
+        if(div == 0)
+        {
+            float *v0_end;
+            vv0 = v;
+            v0_end = v + 60;
+            s0 = (float*)sbr_qmf_window;
+
+            /* 10 calls of function vector_fmul_add merged into one loop
+               and loop unrolled 4 times */
+            __asm__ volatile(
+                ".set    push                                           \n\t"
+                ".set    noreorder                                      \n\t"
+                "lwc1    %[temp4],   0(%[v0])                           \n\t"
+                "lwc1    %[temp5],   0(%[s0])                           \n\t"
+                "lwc1    %[temp6],   4(%[v0])                           \n\t"
+                "lwc1    %[temp7],   4(%[s0])                           \n\t"
+                "lwc1    %[temp8],   8(%[v0])                           \n\t"
+                "lwc1    %[temp9],   8(%[s0])                           \n\t"
+                "lwc1    %[temp10],  12(%[v0])                          \n\t"
+                "lwc1    %[temp11],  12(%[s0])                          \n\t"
+                "lwc1    %[temp12],  768(%[v0])                         \n\t"
+                "lwc1    %[temp13],  256(%[s0])                         \n\t"
+                "lwc1    %[temp14],  772(%[v0])                         \n\t"
+                "lwc1    %[temp15],  260(%[s0])                         \n\t"
+                "lwc1    %[temp16],  776(%[v0])                         \n\t"
+                "lwc1    %[temp17],  264(%[s0])                         \n\t"
+                "lwc1    %[temp18],  780(%[v0])                         \n\t"
+                "lwc1    %[temp19],  268(%[s0])                         \n\t"
+            "1:                                                         \n\t"
+                "mul.s   %[temp0],   %[temp4],   %[temp5]               \n\t"
+                "lwc1    %[temp4],   1024(%[v0])                        \n\t"
+                "mul.s   %[temp1],   %[temp6],   %[temp7]               \n\t"
+                "lwc1    %[temp5],   512(%[s0])                         \n\t"
+                "mul.s   %[temp2],   %[temp8],   %[temp9]               \n\t"
+                "lwc1    %[temp6],   1028(%[v0])                        \n\t"
+                "mul.s   %[temp3],   %[temp10],  %[temp11]              \n\t"
+                "lwc1    %[temp7],   516(%[s0])                         \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   1032(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   520(%[s0])                         \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  1036(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  524(%[s0])                         \n\t"
+                "lwc1    %[temp12],  1792(%[v0])                        \n\t"
+                "lwc1    %[temp13],  768(%[s0])                         \n\t"
+                "lwc1    %[temp14],  1796(%[v0])                        \n\t"
+                "lwc1    %[temp15],  772(%[s0])                         \n\t"
+                "lwc1    %[temp16],  1800(%[v0])                        \n\t"
+                "lwc1    %[temp17],  776(%[s0])                         \n\t"
+                "lwc1    %[temp18],  1804(%[v0])                        \n\t"
+                "lwc1    %[temp19],  780(%[s0])                         \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   2048(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   1024(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   2052(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   1028(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   2056(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   1032(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  2060(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  1036(%[s0])                        \n\t"
+                "lwc1    %[temp12],  2816(%[v0])                        \n\t"
+                "lwc1    %[temp13],  1280(%[s0])                        \n\t"
+                "lwc1    %[temp14],  2820(%[v0])                        \n\t"
+                "lwc1    %[temp15],  1284(%[s0])                        \n\t"
+                "lwc1    %[temp16],  2824(%[v0])                        \n\t"
+                "lwc1    %[temp17],  1288(%[s0])                        \n\t"
+                "lwc1    %[temp18],  2828(%[v0])                        \n\t"
+                "lwc1    %[temp19],  1292(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   3072(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   1536(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   3076(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   1540(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   3080(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   1544(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  3084(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  1548(%[s0])                        \n\t"
+                "lwc1    %[temp12],  3840(%[v0])                        \n\t"
+                "lwc1    %[temp13],  1792(%[s0])                        \n\t"
+                "lwc1    %[temp14],  3844(%[v0])                        \n\t"
+                "lwc1    %[temp15],  1796(%[s0])                        \n\t"
+                "lwc1    %[temp16],  3848(%[v0])                        \n\t"
+                "lwc1    %[temp17],  1800(%[s0])                        \n\t"
+                "lwc1    %[temp18],  3852(%[v0])                        \n\t"
+                "lwc1    %[temp19],  1804(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   4096(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   2048(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   4100(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   2052(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   4104(%[v0])                        \n\t"
+                PTR_ADDIU "%[dst],     %[dst],      16                  \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   2056(%[s0])                        \n\t"
+                PTR_ADDIU " %[s0],      %[s0],      16                  \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  4108(%[v0])                        \n\t"
+                PTR_ADDIU " %[v0],      %[v0],      16                  \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  2044(%[s0])                        \n\t"
+                "lwc1    %[temp12],  4848(%[v0])                        \n\t"
+                "lwc1    %[temp13],  2288(%[s0])                        \n\t"
+                "lwc1    %[temp14],  4852(%[v0])                        \n\t"
+                "lwc1    %[temp15],  2292(%[s0])                        \n\t"
+                "lwc1    %[temp16],  4856(%[v0])                        \n\t"
+                "lwc1    %[temp17],  2296(%[s0])                        \n\t"
+                "lwc1    %[temp18],  4860(%[v0])                        \n\t"
+                "lwc1    %[temp19],  2300(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   0(%[v0])                           \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   0(%[s0])                           \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   4(%[v0])                           \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   4(%[s0])                           \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   8(%[v0])                           \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   8(%[s0])                           \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  12(%[v0])                          \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  12(%[s0])                          \n\t"
+                "lwc1    %[temp12],  768(%[v0])                         \n\t"
+                "lwc1    %[temp13],  256(%[s0])                         \n\t"
+                "lwc1    %[temp14],  772(%[v0])                         \n\t"
+                "lwc1    %[temp15],  260(%[s0])                         \n\t"
+                "lwc1    %[temp16],  776(%[v0])                         \n\t"
+                "lwc1    %[temp17],  264(%[s0])                         \n\t"
+                "lwc1    %[temp18],  780(%[v0])                         \n\t"
+                "lwc1    %[temp19],  268(%[s0])                         \n\t"
+                "swc1    %[temp0],   -16(%[dst])                        \n\t"
+                "swc1    %[temp1],   -12(%[dst])                        \n\t"
+                "swc1    %[temp2],   -8(%[dst])                         \n\t"
+                "bne     %[v0],      %[v0_end],  1b                     \n\t"
+                " swc1   %[temp3],   -4(%[dst])                         \n\t"
+                "mul.s   %[temp0],   %[temp4],   %[temp5]               \n\t"
+                "lwc1    %[temp4],   1024(%[v0])                        \n\t"
+                "mul.s   %[temp1],   %[temp6],   %[temp7]               \n\t"
+                "lwc1    %[temp5],   512(%[s0])                         \n\t"
+                "mul.s   %[temp2],   %[temp8],   %[temp9]               \n\t"
+                "lwc1    %[temp6],   1028(%[v0])                        \n\t"
+                "mul.s   %[temp3],   %[temp10],  %[temp11]              \n\t"
+                "lwc1    %[temp7],   516(%[s0])                         \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   1032(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   520(%[s0])                         \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  1036(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  524(%[s0])                         \n\t"
+                "lwc1    %[temp12],  1792(%[v0])                        \n\t"
+                "lwc1    %[temp13],  768(%[s0])                         \n\t"
+                "lwc1    %[temp14],  1796(%[v0])                        \n\t"
+                "lwc1    %[temp15],  772(%[s0])                         \n\t"
+                "lwc1    %[temp16],  1800(%[v0])                        \n\t"
+                "lwc1    %[temp17],  776(%[s0])                         \n\t"
+                "lwc1    %[temp18],  1804(%[v0])                        \n\t"
+                "lwc1    %[temp19],  780(%[s0])                         \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   2048(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   1024(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   2052(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   1028(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   2056(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   1032(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  2060(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  1036(%[s0])                        \n\t"
+                "lwc1    %[temp12],  2816(%[v0])                        \n\t"
+                "lwc1    %[temp13],  1280(%[s0])                        \n\t"
+                "lwc1    %[temp14],  2820(%[v0])                        \n\t"
+                "lwc1    %[temp15],  1284(%[s0])                        \n\t"
+                "lwc1    %[temp16],  2824(%[v0])                        \n\t"
+                "lwc1    %[temp17],  1288(%[s0])                        \n\t"
+                "lwc1    %[temp18],  2828(%[v0])                        \n\t"
+                "lwc1    %[temp19],  1292(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   3072(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   1536(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   3076(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   1540(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   3080(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   1544(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  3084(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  1548(%[s0])                        \n\t"
+                "lwc1    %[temp12],  3840(%[v0])                        \n\t"
+                "lwc1    %[temp13],  1792(%[s0])                        \n\t"
+                "lwc1    %[temp14],  3844(%[v0])                        \n\t"
+                "lwc1    %[temp15],  1796(%[s0])                        \n\t"
+                "lwc1    %[temp16],  3848(%[v0])                        \n\t"
+                "lwc1    %[temp17],  1800(%[s0])                        \n\t"
+                "lwc1    %[temp18],  3852(%[v0])                        \n\t"
+                "lwc1    %[temp19],  1804(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   4096(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   2048(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   4100(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   2052(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   4104(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   2056(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  4108(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  2060(%[s0])                        \n\t"
+                "lwc1    %[temp12],  4864(%[v0])                        \n\t"
+                "lwc1    %[temp13],  2304(%[s0])                        \n\t"
+                "lwc1    %[temp14],  4868(%[v0])                        \n\t"
+                "lwc1    %[temp15],  2308(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp16],  4872(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp17],  2312(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp18],  4876(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp19],  2316(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                PTR_ADDIU "%[dst],     %[dst],     16                   \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "swc1    %[temp0],   -16(%[dst])                        \n\t"
+                "swc1    %[temp1],   -12(%[dst])                        \n\t"
+                "swc1    %[temp2],   -8(%[dst])                         \n\t"
+                "swc1    %[temp3],   -4(%[dst])                         \n\t"
+                ".set    pop                                            \n\t"
+
+                : [dst]"+r"(dst), [v0]"+r"(vv0), [s0]"+r"(s0),
+                  [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+                  [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+                  [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+                  [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
+                  [temp12]"=&f"(temp12), [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
+                  [temp15]"=&f"(temp15), [temp16]"=&f"(temp16), [temp17]"=&f"(temp17),
+                  [temp18]"=&f"(temp18), [temp19]"=&f"(temp19)
+                : [v0_end]"r"(v0_end)
+                : "memory"
+            );
+        }
+        else
+        {
+            fdsp->vector_fmul   (out, v                , sbr_qmf_window                       , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 256 >> div), sbr_qmf_window + (128 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 448 >> div), sbr_qmf_window + (192 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 512 >> div), sbr_qmf_window + (256 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 704 >> div), sbr_qmf_window + (320 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 768 >> div), sbr_qmf_window + (384 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 960 >> div), sbr_qmf_window + (448 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + (1024 >> div), sbr_qmf_window + (512 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + (1216 >> div), sbr_qmf_window + (576 >> div), out   , 64 >> div);
+            out += 64 >> div;
+        }
+    }
+}
+
+#define sbr_qmf_analysis sbr_qmf_analysis_mips
+#define sbr_qmf_synthesis sbr_qmf_synthesis_mips
+
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_MIPS_AACSBR_FLOAT_H */
diff --git a/libavcodec/mips/ac3dsp_mips.c b/libavcodec/mips/ac3dsp_mips.c
new file mode 100644
index 0000000000..01c7de57a0
--- /dev/null
+++ b/libavcodec/mips/ac3dsp_mips.c
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Branimir Vasic (bvasic@mips.com)
+ *           Nedeljko Babic (nbabic@mips.com)
+ *
+ * Various AC-3 DSP Utils optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/ac3dsp.c
+ */
+
+#include "config.h"
+#include "libavcodec/ac3dsp.h"
+#include "libavcodec/ac3.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if HAVE_MIPSDSPR1
+static void ac3_bit_alloc_calc_bap_mips(int16_t *mask, int16_t *psd,
+                                        int start, int end,
+                                        int snr_offset, int floor,
+                                        const uint8_t *bap_tab, uint8_t *bap)
+{
+    int band, band_end, cond;
+    int m, address1, address2;
+    int16_t *psd1, *psd_end;
+    uint8_t *bap1;
+
+    if (snr_offset == -960) {
+        memset(bap, 0, AC3_MAX_COEFS);
+        return;
+    }
+
+    psd1 = &psd[start];
+    bap1 = &bap[start];
+    band = ff_ac3_bin_to_band_tab[start];
+
+    do {
+        m = (FFMAX(mask[band] - snr_offset - floor, 0) & 0x1FE0) + floor;
+        band_end = ff_ac3_band_start_tab[++band];
+        band_end = FFMIN(band_end, end);
+        psd_end = psd + band_end - 1;
+
+        __asm__ volatile (
+            "slt        %[cond],        %[psd1],        %[psd_end]  \n\t"
+            "beqz       %[cond],        1f                          \n\t"
+            "2:                                                     \n\t"
+            "lh         %[address1],    0(%[psd1])                  \n\t"
+            "lh         %[address2],    2(%[psd1])                  \n\t"
+            PTR_ADDIU " %[psd1],        %[psd1],        4           \n\t"
+            "subu       %[address1],    %[address1],    %[m]        \n\t"
+            "sra        %[address1],    %[address1],    5           \n\t"
+            "addiu      %[address1],    %[address1],    -32         \n\t"
+            "shll_s.w   %[address1],    %[address1],    26          \n\t"
+            "subu       %[address2],    %[address2],    %[m]        \n\t"
+            "sra        %[address2],    %[address2],    5           \n\t"
+            "sra        %[address1],    %[address1],    26          \n\t"
+            "addiu      %[address1],    %[address1],    32          \n\t"
+            "lbux       %[address1],    %[address1](%[bap_tab])     \n\t"
+            "addiu      %[address2],    %[address2],    -32         \n\t"
+            "shll_s.w   %[address2],    %[address2],    26          \n\t"
+            "sb         %[address1],    0(%[bap1])                  \n\t"
+            "slt        %[cond],        %[psd1],        %[psd_end]  \n\t"
+            "sra        %[address2],    %[address2],    26          \n\t"
+            "addiu      %[address2],    %[address2],    32          \n\t"
+            "lbux       %[address2],    %[address2](%[bap_tab])     \n\t"
+            "sb         %[address2],    1(%[bap1])                  \n\t"
+            PTR_ADDIU " %[bap1],        %[bap1],        2           \n\t"
+            "bnez       %[cond],        2b                          \n\t"
+            PTR_ADDIU " %[psd_end],     %[psd_end],     2           \n\t"
+            "slt        %[cond],        %[psd1],        %[psd_end]  \n\t"
+            "beqz       %[cond],        3f                          \n\t"
+            "1:                                                     \n\t"
+            "lh         %[address1],    0(%[psd1])                  \n\t"
+            PTR_ADDIU " %[psd1],        %[psd1],        2           \n\t"
+            "subu       %[address1],    %[address1],    %[m]        \n\t"
+            "sra        %[address1],    %[address1],    5           \n\t"
+            "addiu      %[address1],    %[address1],    -32         \n\t"
+            "shll_s.w   %[address1],    %[address1],    26          \n\t"
+            "sra        %[address1],    %[address1],    26          \n\t"
+            "addiu      %[address1],    %[address1],    32          \n\t"
+            "lbux       %[address1],    %[address1](%[bap_tab])     \n\t"
+            "sb         %[address1],    0(%[bap1])                  \n\t"
+            PTR_ADDIU " %[bap1],        %[bap1],        1           \n\t"
+            "3:                                                     \n\t"
+
+            : [address1]"=&r"(address1), [address2]"=&r"(address2),
+              [cond]"=&r"(cond), [bap1]"+r"(bap1),
+              [psd1]"+r"(psd1), [psd_end]"+r"(psd_end)
+            : [m]"r"(m), [bap_tab]"r"(bap_tab)
+            : "memory"
+        );
+    } while (end > band_end);
+}
+
+static void ac3_update_bap_counts_mips(uint16_t mant_cnt[16], uint8_t *bap,
+                                       int len)
+{
+    void *temp0, *temp2, *temp4, *temp5, *temp6, *temp7;
+    int temp1, temp3;
+
+    __asm__ volatile (
+        "andi   %[temp3],   %[len],         3               \n\t"
+        PTR_ADDU "%[temp2], %[bap],         %[len]          \n\t"
+        PTR_ADDU "%[temp4], %[bap],         %[temp3]        \n\t"
+        "beq    %[temp2],   %[temp4],       4f              \n\t"
+        "1:                                                 \n\t"
+        "lbu    %[temp0],   -1(%[temp2])                    \n\t"
+        "lbu    %[temp5],   -2(%[temp2])                    \n\t"
+        "lbu    %[temp6],   -3(%[temp2])                    \n\t"
+        "sll    %[temp0],   %[temp0],       1               \n\t"
+        PTR_ADDU "%[temp0], %[mant_cnt],    %[temp0]        \n\t"
+        "sll    %[temp5],   %[temp5],       1               \n\t"
+        PTR_ADDU "%[temp5], %[mant_cnt],    %[temp5]        \n\t"
+        "lhu    %[temp1],   0(%[temp0])                     \n\t"
+        "sll    %[temp6],   %[temp6],       1               \n\t"
+        PTR_ADDU "%[temp6], %[mant_cnt],    %[temp6]        \n\t"
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp0])                     \n\t"
+        "lhu    %[temp1],   0(%[temp5])                     \n\t"
+        "lbu    %[temp7],   -4(%[temp2])                    \n\t"
+        PTR_ADDIU "%[temp2],%[temp2],       -4              \n\t"
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp5])                     \n\t"
+        "lhu    %[temp1],   0(%[temp6])                     \n\t"
+        "sll    %[temp7],   %[temp7],       1               \n\t"
+        PTR_ADDU "%[temp7], %[mant_cnt],    %[temp7]        \n\t"
+        "addiu  %[temp1],   %[temp1],1                      \n\t"
+        "sh     %[temp1],   0(%[temp6])                     \n\t"
+        "lhu    %[temp1],   0(%[temp7])                     \n\t"
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp7])                     \n\t"
+        "bne    %[temp2],   %[temp4],       1b              \n\t"
+        "4:                                                 \n\t"
+        "beqz   %[temp3],   2f                              \n\t"
+        "3:                                                 \n\t"
+        "addiu  %[temp3],   %[temp3],       -1              \n\t"
+        "lbu    %[temp0],   -1(%[temp2])                    \n\t"
+        PTR_ADDIU "%[temp2],%[temp2],       -1              \n\t"
+        "sll    %[temp0],   %[temp0],       1               \n\t"
+        PTR_ADDU "%[temp0], %[mant_cnt],    %[temp0]        \n\t"
+        "lhu    %[temp1],   0(%[temp0])                     \n\t"
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp0])                     \n\t"
+        "bgtz   %[temp3],   3b                              \n\t"
+        "2:                                                 \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [temp4] "=&r" (temp4), [temp5] "=&r" (temp5),
+          [temp6] "=&r" (temp6), [temp7] "=&r" (temp7)
+        : [len] "r" (len), [bap] "r" (bap),
+          [mant_cnt] "r" (mant_cnt)
+        : "memory"
+    );
+}
+#endif
+
+#if HAVE_MIPSFPU
+static void float_to_fixed24_mips(int32_t *dst, const float *src, unsigned int len)
+{
+    const float scale = 1 << 24;
+    float src0, src1, src2, src3, src4, src5, src6, src7;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    do {
+        __asm__ volatile (
+            "lwc1       %[src0],    0(%[src])               \n\t"
+            "lwc1       %[src1],    4(%[src])               \n\t"
+            "lwc1       %[src2],    8(%[src])               \n\t"
+            "lwc1       %[src3],    12(%[src])              \n\t"
+            "lwc1       %[src4],    16(%[src])              \n\t"
+            "lwc1       %[src5],    20(%[src])              \n\t"
+            "lwc1       %[src6],    24(%[src])              \n\t"
+            "lwc1       %[src7],    28(%[src])              \n\t"
+            "mul.s      %[src0],    %[src0],    %[scale]    \n\t"
+            "mul.s      %[src1],    %[src1],    %[scale]    \n\t"
+            "mul.s      %[src2],    %[src2],    %[scale]    \n\t"
+            "mul.s      %[src3],    %[src3],    %[scale]    \n\t"
+            "mul.s      %[src4],    %[src4],    %[scale]    \n\t"
+            "mul.s      %[src5],    %[src5],    %[scale]    \n\t"
+            "mul.s      %[src6],    %[src6],    %[scale]    \n\t"
+            "mul.s      %[src7],    %[src7],    %[scale]    \n\t"
+            "cvt.w.s    %[src0],    %[src0]                 \n\t"
+            "cvt.w.s    %[src1],    %[src1]                 \n\t"
+            "cvt.w.s    %[src2],    %[src2]                 \n\t"
+            "cvt.w.s    %[src3],    %[src3]                 \n\t"
+            "cvt.w.s    %[src4],    %[src4]                 \n\t"
+            "cvt.w.s    %[src5],    %[src5]                 \n\t"
+            "cvt.w.s    %[src6],    %[src6]                 \n\t"
+            "cvt.w.s    %[src7],    %[src7]                 \n\t"
+            "mfc1       %[temp0],   %[src0]                 \n\t"
+            "mfc1       %[temp1],   %[src1]                 \n\t"
+            "mfc1       %[temp2],   %[src2]                 \n\t"
+            "mfc1       %[temp3],   %[src3]                 \n\t"
+            "mfc1       %[temp4],   %[src4]                 \n\t"
+            "mfc1       %[temp5],   %[src5]                 \n\t"
+            "mfc1       %[temp6],   %[src6]                 \n\t"
+            "mfc1       %[temp7],   %[src7]                 \n\t"
+            "sw         %[temp0],   0(%[dst])               \n\t"
+            "sw         %[temp1],   4(%[dst])               \n\t"
+            "sw         %[temp2],   8(%[dst])               \n\t"
+            "sw         %[temp3],   12(%[dst])              \n\t"
+            "sw         %[temp4],   16(%[dst])              \n\t"
+            "sw         %[temp5],   20(%[dst])              \n\t"
+            "sw         %[temp6],   24(%[dst])              \n\t"
+            "sw         %[temp7],   28(%[dst])              \n\t"
+
+            : [dst] "+r" (dst), [src] "+r" (src),
+              [src0] "=&f" (src0), [src1] "=&f" (src1),
+              [src2] "=&f" (src2), [src3] "=&f" (src3),
+              [src4] "=&f" (src4), [src5] "=&f" (src5),
+              [src6] "=&f" (src6), [src7] "=&f" (src7),
+              [temp0] "=r" (temp0), [temp1] "=r" (temp1),
+              [temp2] "=r" (temp2), [temp3] "=r" (temp3),
+              [temp4] "=r" (temp4), [temp5] "=r" (temp5),
+              [temp6] "=r" (temp6), [temp7] "=r" (temp7)
+            : [scale] "f" (scale)
+            : "memory"
+        );
+        src = src + 8;
+        dst = dst + 8;
+        len -= 8;
+    } while (len > 0);
+}
+
+static void ac3_downmix_mips(float **samples, float (*matrix)[2],
+                          int out_ch, int in_ch, int len)
+{
+    int i, j, i1, i2, i3;
+    float v0, v1, v2, v3;
+    float v4, v5, v6, v7;
+    float samples0, samples1, samples2, samples3, matrix_j, matrix_j2;
+    float *samples_p, *samples_sw, *matrix_p, **samples_x, **samples_end;
+
+    __asm__ volatile(
+        ".set   push                                                \n\t"
+        ".set   noreorder                                           \n\t"
+
+        "li     %[i1],          2                                   \n\t"
+        "sll    %[len],         2                                   \n\t"
+        "move   %[i],           $zero                               \n\t"
+        "sll    %[j],           %[in_ch],             " PTRLOG "    \n\t"
+
+        "bne    %[out_ch],      %[i1],                  3f          \n\t"   // if (out_ch == 2)
+        " li    %[i2],          1                                   \n\t"
+
+        "2:                                                         \n\t"   // start of the for loop (for (i = 0; i < len; i+=4))
+        "move   %[matrix_p],    %[matrix]                           \n\t"
+        "move   %[samples_x],   %[samples]                          \n\t"
+        "mtc1   $zero,          %[v0]                               \n\t"
+        "mtc1   $zero,          %[v1]                               \n\t"
+        "mtc1   $zero,          %[v2]                               \n\t"
+        "mtc1   $zero,          %[v3]                               \n\t"
+        "mtc1   $zero,          %[v4]                               \n\t"
+        "mtc1   $zero,          %[v5]                               \n\t"
+        "mtc1   $zero,          %[v6]                               \n\t"
+        "mtc1   $zero,          %[v7]                               \n\t"
+        "addiu  %[i1],          %[i],                  4            \n\t"
+        "addiu  %[i2],          %[i],                  8            \n\t"
+        PTR_L " %[samples_p],   0(%[samples_x])                     \n\t"
+        "addiu  %[i3],          %[i],                  12           \n\t"
+        PTR_ADDU "%[samples_end],%[samples_x],         %[j]         \n\t"
+        "move   %[samples_sw],  %[samples_p]                        \n\t"
+
+        "1:                                                         \n\t"   // start of the inner for loop (for (j = 0; j < in_ch; j++))
+        "lwc1   %[matrix_j],    0(%[matrix_p])                      \n\t"
+        "lwc1   %[matrix_j2],   4(%[matrix_p])                      \n\t"
+        "lwxc1  %[samples0],    %[i](%[samples_p])                  \n\t"
+        "lwxc1  %[samples1],    %[i1](%[samples_p])                 \n\t"
+        "lwxc1  %[samples2],    %[i2](%[samples_p])                 \n\t"
+        "lwxc1  %[samples3],    %[i3](%[samples_p])                 \n\t"
+        PTR_ADDIU "%[matrix_p], 8                                   \n\t"
+        PTR_ADDIU "%[samples_x]," PTRSIZE "                         \n\t"
+        "madd.s %[v0],          %[v0],  %[samples0],    %[matrix_j] \n\t"
+        "madd.s %[v1],          %[v1],  %[samples1],    %[matrix_j] \n\t"
+        "madd.s %[v2],          %[v2],  %[samples2],    %[matrix_j] \n\t"
+        "madd.s %[v3],          %[v3],  %[samples3],    %[matrix_j] \n\t"
+        "madd.s %[v4],          %[v4],  %[samples0],    %[matrix_j2]\n\t"
+        "madd.s %[v5],          %[v5],  %[samples1],    %[matrix_j2]\n\t"
+        "madd.s %[v6],          %[v6],  %[samples2],    %[matrix_j2]\n\t"
+        "madd.s %[v7],          %[v7],  %[samples3],    %[matrix_j2]\n\t"
+        "bne    %[samples_x],   %[samples_end],         1b          \n\t"
+        PTR_L " %[samples_p],   0(%[samples_x])                     \n\t"
+
+        PTR_L " %[samples_p],  " PTRSIZE "(%[samples])              \n\t"
+        "swxc1  %[v0],          %[i](%[samples_sw])                 \n\t"
+        "swxc1  %[v1],          %[i1](%[samples_sw])                \n\t"
+        "swxc1  %[v2],          %[i2](%[samples_sw])                \n\t"
+        "swxc1  %[v3],          %[i3](%[samples_sw])                \n\t"
+        "swxc1  %[v4],          %[i](%[samples_p])                  \n\t"
+        "addiu  %[i],           16                                  \n\t"
+        "swxc1  %[v5],          %[i1](%[samples_p])                 \n\t"
+        "swxc1  %[v6],          %[i2](%[samples_p])                 \n\t"
+        "bne    %[i],           %[len],                 2b          \n\t"
+        " swxc1 %[v7],          %[i3](%[samples_p])                 \n\t"
+
+        "3:                                                         \n\t"
+        "bne    %[out_ch],      %[i2],                  6f          \n\t"   // if (out_ch == 1)
+        " nop                                                       \n\t"
+
+        "5:                                                         \n\t"   // start of the outer for loop (for (i = 0; i < len; i+=4))
+        "move   %[matrix_p],    %[matrix]                           \n\t"
+        "move   %[samples_x],   %[samples]                          \n\t"
+        "mtc1   $zero,          %[v0]                               \n\t"
+        "mtc1   $zero,          %[v1]                               \n\t"
+        "mtc1   $zero,          %[v2]                               \n\t"
+        "mtc1   $zero,          %[v3]                               \n\t"
+        "addiu  %[i1],          %[i],                  4            \n\t"
+        "addiu  %[i2],          %[i],                  8            \n\t"
+        PTR_L " %[samples_p],   0(%[samples_x])                     \n\t"
+        "addiu  %[i3],          %[i],                  12           \n\t"
+        PTR_ADDU "%[samples_end],%[samples_x],         %[j]         \n\t"
+        "move   %[samples_sw],  %[samples_p]                        \n\t"
+
+        "4:                                                         \n\t"   // start of the inner for loop (for (j = 0; j < in_ch; j++))
+        "lwc1   %[matrix_j],    0(%[matrix_p])                      \n\t"
+        "lwxc1  %[samples0],    %[i](%[samples_p])                  \n\t"
+        "lwxc1  %[samples1],    %[i1](%[samples_p])                 \n\t"
+        "lwxc1  %[samples2],    %[i2](%[samples_p])                 \n\t"
+        "lwxc1  %[samples3],    %[i3](%[samples_p])                 \n\t"
+        PTR_ADDIU "%[matrix_p], 8                                   \n\t"
+        PTR_ADDIU "%[samples_x]," PTRSIZE "                         \n\t"
+        "madd.s %[v0],          %[v0],  %[samples0],    %[matrix_j] \n\t"
+        "madd.s %[v1],          %[v1],  %[samples1],    %[matrix_j] \n\t"
+        "madd.s %[v2],          %[v2],  %[samples2],    %[matrix_j] \n\t"
+        "madd.s %[v3],          %[v3],  %[samples3],    %[matrix_j] \n\t"
+        "bne    %[samples_x],   %[samples_end],         4b          \n\t"
+        PTR_L " %[samples_p],   0(%[samples_x])                     \n\t"
+
+        "swxc1  %[v0],          %[i](%[samples_sw])                 \n\t"
+        "addiu  %[i],           16                                  \n\t"
+        "swxc1  %[v1],          %[i1](%[samples_sw])                \n\t"
+        "swxc1  %[v2],          %[i2](%[samples_sw])                \n\t"
+        "bne    %[i],           %[len],                 5b          \n\t"
+        " swxc1 %[v3],          %[i3](%[samples_sw])                \n\t"
+        "6:                                                         \n\t"
+
+        ".set   pop"
+        :[samples_p]"=&r"(samples_p), [matrix_j]"=&f"(matrix_j), [matrix_j2]"=&f"(matrix_j2),
+         [samples0]"=&f"(samples0), [samples1]"=&f"(samples1),
+         [samples2]"=&f"(samples2), [samples3]"=&f"(samples3),
+         [v0]"=&f"(v0), [v1]"=&f"(v1), [v2]"=&f"(v2), [v3]"=&f"(v3),
+         [v4]"=&f"(v4), [v5]"=&f"(v5), [v6]"=&f"(v6), [v7]"=&f"(v7),
+         [samples_x]"=&r"(samples_x), [matrix_p]"=&r"(matrix_p),
+         [samples_end]"=&r"(samples_end), [samples_sw]"=&r"(samples_sw),
+         [i1]"=&r"(i1), [i2]"=&r"(i2), [i3]"=&r"(i3), [i]"=&r"(i),
+         [j]"=&r"(j), [len]"+r"(len)
+        :[samples]"r"(samples), [matrix]"r"(matrix),
+         [in_ch]"r"(in_ch), [out_ch]"r"(out_ch)
+        :"memory"
+    );
+}
+#endif
+#endif /* HAVE_INLINE_ASM */
+
+void ff_ac3dsp_init_mips(AC3DSPContext *c, int bit_exact) {
+#if HAVE_INLINE_ASM
+#if HAVE_MIPSDSPR1
+    c->bit_alloc_calc_bap = ac3_bit_alloc_calc_bap_mips;
+    c->update_bap_counts  = ac3_update_bap_counts_mips;
+#endif
+#if HAVE_MIPSFPU
+    c->float_to_fixed24 = float_to_fixed24_mips;
+    c->downmix          = ac3_downmix_mips;
+#endif
+#endif
+
+}
diff --git a/libavcodec/mips/acelp_filters_mips.c b/libavcodec/mips/acelp_filters_mips.c
new file mode 100644
index 0000000000..ba789abe3f
--- /dev/null
+++ b/libavcodec/mips/acelp_filters_mips.c
@@ -0,0 +1,217 @@
+ /*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * various filters for ACELP-based codecs optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/acelp_filters.c
+ */
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/acelp_filters.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void ff_acelp_interpolatef_mips(float *out, const float *in,
+                           const float *filter_coeffs, int precision,
+                           int frac_pos, int filter_length, int length)
+{
+    int n, i;
+    int prec = precision * 4;
+    int fc_offset = precision - frac_pos;
+    float in_val_p, in_val_m, fc_val_p, fc_val_m;
+
+    for (n = 0; n < length; n++) {
+        /**
+        * four pointers are defined in order to minimize number of
+        * computations done in inner loop
+        */
+        const float *p_in_p = &in[n];
+        const float *p_in_m = &in[n-1];
+        const float *p_filter_coeffs_p = &filter_coeffs[frac_pos];
+        const float *p_filter_coeffs_m = filter_coeffs + fc_offset;
+        float v = 0;
+
+        for (i = 0; i < filter_length;i++) {
+            __asm__ volatile (
+                "lwc1   %[in_val_p],           0(%[p_in_p])                    \n\t"
+                "lwc1   %[fc_val_p],           0(%[p_filter_coeffs_p])         \n\t"
+                "lwc1   %[in_val_m],           0(%[p_in_m])                    \n\t"
+                "lwc1   %[fc_val_m],           0(%[p_filter_coeffs_m])         \n\t"
+                PTR_ADDIU "%[p_in_p],          %[p_in_p],              4       \n\t"
+                "madd.s %[v],%[v],             %[in_val_p],%[fc_val_p]         \n\t"
+                PTR_ADDIU "%[p_in_m],          %[p_in_m],              -4      \n\t"
+                PTR_ADDU "%[p_filter_coeffs_p],%[p_filter_coeffs_p],   %[prec] \n\t"
+                PTR_ADDU "%[p_filter_coeffs_m],%[p_filter_coeffs_m],   %[prec] \n\t"
+                "madd.s %[v],%[v],%[in_val_m], %[fc_val_m]                     \n\t"
+
+                : [v] "+&f" (v),[p_in_p] "+r" (p_in_p), [p_in_m] "+r" (p_in_m),
+                  [p_filter_coeffs_p] "+r" (p_filter_coeffs_p),
+                  [in_val_p] "=&f" (in_val_p), [in_val_m] "=&f" (in_val_m),
+                  [fc_val_p] "=&f" (fc_val_p), [fc_val_m] "=&f" (fc_val_m),
+                  [p_filter_coeffs_m] "+r" (p_filter_coeffs_m)
+                : [prec] "r" (prec)
+                : "memory"
+            );
+        }
+        out[n] = v;
+    }
+}
+
+static void ff_acelp_apply_order_2_transfer_function_mips(float *out, const float *in,
+                                              const float zero_coeffs[2],
+                                              const float pole_coeffs[2],
+                                              float gain, float mem[2], int n)
+{
+    /**
+    * loop is unrolled eight times
+    */
+
+    __asm__ volatile (
+        "lwc1   $f0,    0(%[mem])                                              \n\t"
+        "blez   %[n],   ff_acelp_apply_order_2_transfer_function_end%=         \n\t"
+        "lwc1   $f1,    4(%[mem])                                              \n\t"
+        "lwc1   $f2,    0(%[pole_coeffs])                                      \n\t"
+        "lwc1   $f3,    4(%[pole_coeffs])                                      \n\t"
+        "lwc1   $f4,    0(%[zero_coeffs])                                      \n\t"
+        "lwc1   $f5,    4(%[zero_coeffs])                                      \n\t"
+
+        "ff_acelp_apply_order_2_transfer_function_madd%=:                      \n\t"
+
+        "lwc1   $f6,    0(%[in])                                               \n\t"
+        "mul.s  $f9,    $f3,      $f1                                          \n\t"
+        "mul.s  $f7,    $f2,      $f0                                          \n\t"
+        "msub.s $f7,    $f7,      %[gain], $f6                                 \n\t"
+        "sub.s  $f7,    $f7,      $f9                                          \n\t"
+        "madd.s $f8,    $f7,      $f4,     $f0                                 \n\t"
+        "madd.s $f8,    $f8,      $f5,     $f1                                 \n\t"
+        "lwc1   $f11,   4(%[in])                                               \n\t"
+        "mul.s  $f12,   $f3,      $f0                                          \n\t"
+        "mul.s  $f13,   $f2,      $f7                                          \n\t"
+        "msub.s $f13,   $f13,     %[gain], $f11                                \n\t"
+        "sub.s  $f13,   $f13,     $f12                                         \n\t"
+        "madd.s $f14,   $f13,     $f4,     $f7                                 \n\t"
+        "madd.s $f14,   $f14,     $f5,     $f0                                 \n\t"
+        "swc1   $f8,    0(%[out])                                              \n\t"
+        "lwc1   $f6,    8(%[in])                                               \n\t"
+        "mul.s  $f9,    $f3,      $f7                                          \n\t"
+        "mul.s  $f15,   $f2,      $f13                                         \n\t"
+        "msub.s $f15,   $f15,     %[gain], $f6                                 \n\t"
+        "sub.s  $f15,   $f15,     $f9                                          \n\t"
+        "madd.s $f8,    $f15,     $f4,     $f13                                \n\t"
+        "madd.s $f8,    $f8,      $f5,     $f7                                 \n\t"
+        "swc1   $f14,   4(%[out])                                              \n\t"
+        "lwc1   $f11,   12(%[in])                                              \n\t"
+        "mul.s  $f12,   $f3,      $f13                                         \n\t"
+        "mul.s  $f16,   $f2,      $f15                                         \n\t"
+        "msub.s $f16,   $f16,     %[gain], $f11                                \n\t"
+        "sub.s  $f16,   $f16,     $f12                                         \n\t"
+        "madd.s $f14,   $f16,     $f4,     $f15                                \n\t"
+        "madd.s $f14,   $f14,     $f5,     $f13                                \n\t"
+        "swc1   $f8,    8(%[out])                                              \n\t"
+        "lwc1   $f6,    16(%[in])                                              \n\t"
+        "mul.s  $f9,    $f3,      $f15                                         \n\t"
+        "mul.s  $f7,    $f2,      $f16                                         \n\t"
+        "msub.s $f7,    $f7,      %[gain], $f6                                 \n\t"
+        "sub.s  $f7,    $f7,      $f9                                          \n\t"
+        "madd.s $f8,    $f7,      $f4,     $f16                                \n\t"
+        "madd.s $f8,    $f8,      $f5,     $f15                                \n\t"
+        "swc1   $f14,   12(%[out])                                             \n\t"
+        "lwc1   $f11,   20(%[in])                                              \n\t"
+        "mul.s  $f12,   $f3,      $f16                                         \n\t"
+        "mul.s  $f13,   $f2,      $f7                                          \n\t"
+        "msub.s $f13,   $f13,     %[gain], $f11                                \n\t"
+        "sub.s  $f13,   $f13,     $f12                                         \n\t"
+        "madd.s $f14,   $f13,     $f4,     $f7                                 \n\t"
+        "madd.s $f14,   $f14,     $f5,     $f16                                \n\t"
+        "swc1   $f8,    16(%[out])                                             \n\t"
+        "lwc1   $f6,    24(%[in])                                              \n\t"
+        "mul.s  $f9,    $f3,      $f7                                          \n\t"
+        "mul.s  $f15,   $f2,      $f13                                         \n\t"
+        "msub.s $f15,   $f15,     %[gain], $f6                                 \n\t"
+        "sub.s  $f1,    $f15,     $f9                                          \n\t"
+        "madd.s $f8,    $f1,      $f4,     $f13                                \n\t"
+        "madd.s $f8,    $f8,      $f5,     $f7                                 \n\t"
+        "swc1   $f14,   20(%[out])                                             \n\t"
+        "lwc1   $f11,   28(%[in])                                              \n\t"
+        "mul.s  $f12,   $f3,      $f13                                         \n\t"
+        "mul.s  $f16,   $f2,      $f1                                          \n\t"
+        "msub.s $f16,   $f16,     %[gain], $f11                                \n\t"
+        "sub.s  $f0,    $f16,     $f12                                         \n\t"
+        "madd.s $f14,   $f0,      $f4,     $f1                                 \n\t"
+        "madd.s $f14,   $f14,     $f5,     $f13                                \n\t"
+        "swc1   $f8,    24(%[out])                                             \n\t"
+        PTR_ADDIU "%[out], 32                                                  \n\t"
+        PTR_ADDIU "%[in],  32                                                  \n\t"
+        "addiu  %[n],   -8                                                     \n\t"
+        "swc1   $f14,   -4(%[out])                                             \n\t"
+        "bnez   %[n],   ff_acelp_apply_order_2_transfer_function_madd%=        \n\t"
+        "swc1   $f1,    4(%[mem])                                              \n\t"
+        "swc1   $f0,    0(%[mem])                                              \n\t"
+
+        "ff_acelp_apply_order_2_transfer_function_end%=:                       \n\t"
+
+         : [out] "+r" (out),
+           [in] "+r" (in), [gain] "+f" (gain),
+           [n] "+r" (n), [mem] "+r" (mem)
+         : [zero_coeffs] "r" (zero_coeffs),
+           [pole_coeffs] "r" (pole_coeffs)
+         : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5",
+           "$f6", "$f7",  "$f8", "$f9", "$f10", "$f11",
+           "$f12", "$f13", "$f14", "$f15", "$f16", "memory"
+    );
+}
+#endif /* HAVE_INLINE_ASM */
+
+void ff_acelp_filter_init_mips(ACELPFContext *c)
+{
+#if HAVE_INLINE_ASM
+    c->acelp_interpolatef                      = ff_acelp_interpolatef_mips;
+    c->acelp_apply_order_2_transfer_function   = ff_acelp_apply_order_2_transfer_function_mips;
+#endif
+}
diff --git a/libavcodec/mips/acelp_vectors_mips.c b/libavcodec/mips/acelp_vectors_mips.c
new file mode 100644
index 0000000000..ad9434866e
--- /dev/null
+++ b/libavcodec/mips/acelp_vectors_mips.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * adaptive and fixed codebook vector operations for ACELP-based codecs
+ * optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/acelp_vectors.c
+ */
+#include "config.h"
+#include "libavcodec/acelp_vectors.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void ff_weighted_vector_sumf_mips(
+                  float *out, const float *in_a, const float *in_b,
+                  float weight_coeff_a, float weight_coeff_b, int length)
+{
+    const float *a_end = in_a + length;
+
+    /* loop unrolled two times */
+    __asm__ volatile (
+        "blez   %[length], ff_weighted_vector_sumf_end%=                     \n\t"
+
+        "ff_weighted_vector_sumf_madd%=:                                     \n\t"
+        "lwc1   $f0,       0(%[in_a])                                        \n\t"
+        "lwc1   $f3,       4(%[in_a])                                        \n\t"
+        "lwc1   $f1,       0(%[in_b])                                        \n\t"
+        "lwc1   $f4,       4(%[in_b])                                        \n\t"
+        "mul.s  $f2,       %[weight_coeff_a], $f0                            \n\t"
+        "mul.s  $f5,       %[weight_coeff_a], $f3                            \n\t"
+        "madd.s $f2,       $f2,               %[weight_coeff_b], $f1         \n\t"
+        "madd.s $f5,       $f5,               %[weight_coeff_b], $f4         \n\t"
+        PTR_ADDIU "%[in_a],8                                                 \n\t"
+        PTR_ADDIU "%[in_b],8                                                 \n\t"
+        "swc1   $f2,       0(%[out])                                         \n\t"
+        "swc1   $f5,       4(%[out])                                         \n\t"
+        PTR_ADDIU "%[out], 8                                                 \n\t"
+        "bne   %[in_a],    %[a_end],          ff_weighted_vector_sumf_madd%= \n\t"
+
+        "ff_weighted_vector_sumf_end%=:                                      \n\t"
+
+        : [out] "+r" (out), [in_a] "+r" (in_a),   [in_b] "+r" (in_b)
+        : [weight_coeff_a] "f" (weight_coeff_a),
+          [weight_coeff_b] "f" (weight_coeff_b),
+          [length] "r" (length), [a_end]"r"(a_end)
+        : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "memory"
+    );
+}
+#endif /* HAVE_INLINE_ASM */
+
+void ff_acelp_vectors_init_mips(ACELPVContext *c)
+{
+#if HAVE_INLINE_ASM
+    c->weighted_vector_sumf = ff_weighted_vector_sumf_mips;
+#endif
+}
diff --git a/libavcodec/mips/amrwbdec_mips.c b/libavcodec/mips/amrwbdec_mips.c
new file mode 100644
index 0000000000..1d6ed2dfca
--- /dev/null
+++ b/libavcodec/mips/amrwbdec_mips.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/amrwbdec.c
+ */
+#include "libavutil/avutil.h"
+#include "libavcodec/amrwbdata.h"
+#include "amrwbdec_mips.h"
+
+#if HAVE_INLINE_ASM
+void hb_fir_filter_mips(float *out, const float fir_coef[HB_FIR_SIZE + 1],
+                          float mem[HB_FIR_SIZE], const float *in)
+{
+    int i;
+    float data[AMRWB_SFR_SIZE_16k + HB_FIR_SIZE]; // past and current samples
+
+    memcpy(data, mem, HB_FIR_SIZE * sizeof(float));
+    memcpy(data + HB_FIR_SIZE, in, AMRWB_SFR_SIZE_16k * sizeof(float));
+
+    for (i = 0; i < AMRWB_SFR_SIZE_16k; i++) {
+        float output;
+        float * p_data = (data+i);
+
+        /**
+        * inner loop is entirely unrolled and instructions are scheduled
+        * to minimize pipeline stall
+        */
+        __asm__ volatile(
+            "mtc1       $zero,     %[output]                      \n\t"
+            "lwc1       $f0,       0(%[p_data])                   \n\t"
+            "lwc1       $f1,       0(%[fir_coef])                 \n\t"
+            "lwc1       $f2,       4(%[p_data])                   \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f3,       4(%[fir_coef])                 \n\t"
+            "lwc1       $f4,       8(%[p_data])                   \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+            "lwc1       $f5,       8(%[fir_coef])                 \n\t"
+
+            "lwc1       $f0,       12(%[p_data])                  \n\t"
+            "lwc1       $f1,       12(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f2,       16(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f3,       16(%[fir_coef])                \n\t"
+            "lwc1       $f4,       20(%[p_data])                  \n\t"
+            "lwc1       $f5,       20(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       24(%[p_data])                  \n\t"
+            "lwc1       $f1,       24(%[fir_coef])                \n\t"
+            "lwc1       $f2,       28(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       28(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       32(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+            "lwc1       $f5,       32(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+
+            "lwc1       $f0,       36(%[p_data])                  \n\t"
+            "lwc1       $f1,       36(%[fir_coef])                \n\t"
+            "lwc1       $f2,       40(%[p_data])                  \n\t"
+            "lwc1       $f3,       40(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       44(%[p_data])                  \n\t"
+            "lwc1       $f5,       44(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       48(%[p_data])                  \n\t"
+            "lwc1       $f1,       48(%[fir_coef])                \n\t"
+            "lwc1       $f2,       52(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       52(%[fir_coef])                \n\t"
+            "lwc1       $f4,       56(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f5,       56(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       60(%[p_data])                  \n\t"
+            "lwc1       $f1,       60(%[fir_coef])                \n\t"
+            "lwc1       $f2,       64(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       64(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       68(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+            "lwc1       $f5,       68(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+
+            "lwc1       $f0,       72(%[p_data])                  \n\t"
+            "lwc1       $f1,       72(%[fir_coef])                \n\t"
+            "lwc1       $f2,       76(%[p_data])                  \n\t"
+            "lwc1       $f3,       76(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       80(%[p_data])                  \n\t"
+            "lwc1       $f5,       80(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       84(%[p_data])                  \n\t"
+            "lwc1       $f1,       84(%[fir_coef])                \n\t"
+            "lwc1       $f2,       88(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       88(%[fir_coef])                \n\t"
+            "lwc1       $f4,       92(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f5,       92(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       96(%[p_data])                  \n\t"
+            "lwc1       $f1,       96(%[fir_coef])                \n\t"
+            "lwc1       $f2,       100(%[p_data])                 \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       100(%[fir_coef])               \n\t"
+            "lwc1       $f4,       104(%[p_data])                 \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f5,       104(%[fir_coef])               \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       108(%[p_data])                 \n\t"
+            "lwc1       $f1,       108(%[fir_coef])               \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f2,       112(%[p_data])                 \n\t"
+            "lwc1       $f3,       112(%[fir_coef])               \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       116(%[p_data])                 \n\t"
+            "lwc1       $f5,       116(%[fir_coef])               \n\t"
+            "lwc1       $f0,       120(%[p_data])                 \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+            "lwc1       $f1,       120(%[fir_coef])               \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+
+            : [output]"=&f"(output)
+            : [fir_coef]"r"(fir_coef), [p_data]"r"(p_data)
+            : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "memory"
+        );
+        out[i] = output;
+    }
+    memcpy(mem, data + AMRWB_SFR_SIZE_16k, HB_FIR_SIZE * sizeof(float));
+}
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/mips/amrwbdec_mips.h b/libavcodec/mips/amrwbdec_mips.h
new file mode 100644
index 0000000000..a469918d2c
--- /dev/null
+++ b/libavcodec/mips/amrwbdec_mips.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/amrwbdec.c
+ */
+#ifndef AVCODEC_AMRWBDEC_MIPS_H
+#define AVCODEC_AMRWBDEC_MIPS_H
+#include "config.h"
+
+#if HAVE_MIPSFPU && HAVE_INLINE_ASM
+void hb_fir_filter_mips(float *out, const float fir_coef[],
+                          float mem[], const float *in);
+#define hb_fir_filter hb_fir_filter_mips
+#endif
+
+#endif /* AVCODEC_AMRWBDEC_MIPS_H  */
diff --git a/libavcodec/mips/blockdsp_init_mips.c b/libavcodec/mips/blockdsp_init_mips.c
new file mode 100644
index 0000000000..30ae95fa10
--- /dev/null
+++ b/libavcodec/mips/blockdsp_init_mips.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (parag.salasakar@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "blockdsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void blockdsp_init_msa(BlockDSPContext *c)
+{
+    c->clear_block = ff_clear_block_msa;
+    c->clear_blocks = ff_clear_blocks_msa;
+
+    c->fill_block_tab[0] = ff_fill_block16_msa;
+    c->fill_block_tab[1] = ff_fill_block8_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void blockdsp_init_mmi(BlockDSPContext *c)
+{
+    c->clear_block = ff_clear_block_mmi;
+    c->clear_blocks = ff_clear_blocks_mmi;
+
+    c->fill_block_tab[0] = ff_fill_block16_mmi;
+    c->fill_block_tab[1] = ff_fill_block8_mmi;
+}
+#endif /* HAVE_MMI */
+
+void ff_blockdsp_init_mips(BlockDSPContext *c)
+{
+#if HAVE_MSA
+    blockdsp_init_msa(c);
+#endif  // #if HAVE_MSA
+#if HAVE_MMI
+    blockdsp_init_mmi(c);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/blockdsp_mips.h b/libavcodec/mips/blockdsp_mips.h
new file mode 100644
index 0000000000..9559d40eaa
--- /dev/null
+++ b/libavcodec/mips/blockdsp_mips.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (parag.salasakar@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_BLOCKDSP_MIPS_H
+#define AVCODEC_MIPS_BLOCKDSP_MIPS_H
+
+#include "../mpegvideo.h"
+
+void ff_fill_block16_msa(uint8_t *src, uint8_t val, int stride, int height);
+void ff_fill_block8_msa(uint8_t *src, uint8_t val, int stride, int height);
+void ff_clear_block_msa(int16_t *block);
+void ff_clear_blocks_msa(int16_t *block);
+
+void ff_fill_block16_mmi(uint8_t *block, uint8_t value, int line_size, int h);
+void ff_fill_block8_mmi(uint8_t *block, uint8_t value, int line_size, int h);
+void ff_clear_block_mmi(int16_t *block);
+void ff_clear_blocks_mmi(int16_t *block);
+
+#endif  // #ifndef AVCODEC_MIPS_BLOCKDSP_MIPS_H
diff --git a/libavcodec/mips/blockdsp_mmi.c b/libavcodec/mips/blockdsp_mmi.c
new file mode 100644
index 0000000000..63eaf69a89
--- /dev/null
+++ b/libavcodec/mips/blockdsp_mmi.c
@@ -0,0 +1,147 @@
+/*
+ * Loongson SIMD optimized blockdsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "blockdsp_mips.h"
+
+void ff_fill_block16_mmi(uint8_t *block, uint8_t value, int line_size, int h)
+{
+    __asm__ volatile (
+        "move $8, %3                \r\n"
+        "move $9, %0                \r\n"
+        "dmtc1 %1, $f2              \r\n"
+        "punpcklbh $f2, $f2, $f2    \r\n"
+        "punpcklbh $f2, $f2, $f2    \r\n"
+        "punpcklbh $f2, $f2, $f2    \r\n"
+        "1:                         \r\n"
+        "gssdlc1 $f2, 7($9)         \r\n"
+        "gssdrc1 $f2, 0($9)         \r\n"
+        "gssdlc1 $f2, 15($9)        \r\n"
+        "gssdrc1 $f2, 8($9)         \r\n"
+        "daddi $8, $8, -1           \r\n"
+        "daddu $9, $9, %2           \r\n"
+        "bnez $8, 1b                \r\n"
+        ::"r"(block),"r"(value),"r"(line_size),"r"(h)
+        : "$8","$9"
+    );
+}
+
+void ff_fill_block8_mmi(uint8_t *block, uint8_t value, int line_size, int h)
+{
+    __asm__ volatile (
+        "move $8, %3                \r\n"
+        "move $9, %0                \r\n"
+        "dmtc1 %1, $f2              \r\n"
+        "punpcklbh $f2, $f2, $f2    \r\n"
+        "punpcklbh $f2, $f2, $f2    \r\n"
+        "punpcklbh $f2, $f2, $f2    \r\n"
+        "1:                         \r\n"
+        "gssdlc1 $f2, 7($9)         \r\n"
+        "gssdrc1 $f2, 0($9)         \r\n"
+        "daddi $8, $8, -1           \r\n"
+        "daddu $9, $9, %2           \r\n"
+        "bnez $8, 1b                \r\n"
+        ::"r"(block),"r"(value),"r"(line_size),"r"(h)
+        : "$8","$9"
+    );
+}
+
+void ff_clear_block_mmi(int16_t *block)
+{
+    __asm__ volatile (
+        "xor $f0, $f0, $f0              \r\n"
+        "xor $f2, $f2, $f2              \r\n"
+        "gssqc1 $f0, $f2,   0(%0)       \r\n"
+        "gssqc1 $f0, $f2,  16(%0)       \r\n"
+        "gssqc1 $f0, $f2,  32(%0)       \r\n"
+        "gssqc1 $f0, $f2,  48(%0)       \r\n"
+        "gssqc1 $f0, $f2,  64(%0)       \r\n"
+        "gssqc1 $f0, $f2,  80(%0)       \r\n"
+        "gssqc1 $f0, $f2,  96(%0)       \r\n"
+        "gssqc1 $f0, $f2, 112(%0)       \r\n"
+        ::"r"(block)
+        : "memory"
+    );
+}
+
+void ff_clear_blocks_mmi(int16_t *block)
+{
+    __asm__ volatile (
+        "xor $f0, $f0, $f0              \r\n"
+        "xor $f2, $f2, $f2              \r\n"
+        "gssqc1 $f0, $f2,   0(%0)       \r\n"
+        "gssqc1 $f0, $f2,  16(%0)       \r\n"
+        "gssqc1 $f0, $f2,  32(%0)       \r\n"
+        "gssqc1 $f0, $f2,  48(%0)       \r\n"
+        "gssqc1 $f0, $f2,  64(%0)       \r\n"
+        "gssqc1 $f0, $f2,  80(%0)       \r\n"
+        "gssqc1 $f0, $f2,  96(%0)       \r\n"
+        "gssqc1 $f0, $f2, 112(%0)       \r\n"
+
+        "gssqc1 $f0, $f2, 128(%0)       \r\n"
+        "gssqc1 $f0, $f2, 144(%0)       \r\n"
+        "gssqc1 $f0, $f2, 160(%0)       \r\n"
+        "gssqc1 $f0, $f2, 176(%0)       \r\n"
+        "gssqc1 $f0, $f2, 192(%0)       \r\n"
+        "gssqc1 $f0, $f2, 208(%0)       \r\n"
+        "gssqc1 $f0, $f2, 224(%0)       \r\n"
+        "gssqc1 $f0, $f2, 240(%0)       \r\n"
+
+        "gssqc1 $f0, $f2, 256(%0)       \r\n"
+        "gssqc1 $f0, $f2, 272(%0)       \r\n"
+        "gssqc1 $f0, $f2, 288(%0)       \r\n"
+        "gssqc1 $f0, $f2, 304(%0)       \r\n"
+        "gssqc1 $f0, $f2, 320(%0)       \r\n"
+        "gssqc1 $f0, $f2, 336(%0)       \r\n"
+        "gssqc1 $f0, $f2, 352(%0)       \r\n"
+        "gssqc1 $f0, $f2, 368(%0)       \r\n"
+
+        "gssqc1 $f0, $f2, 384(%0)       \r\n"
+        "gssqc1 $f0, $f2, 400(%0)       \r\n"
+        "gssqc1 $f0, $f2, 416(%0)       \r\n"
+        "gssqc1 $f0, $f2, 432(%0)       \r\n"
+        "gssqc1 $f0, $f2, 448(%0)       \r\n"
+        "gssqc1 $f0, $f2, 464(%0)       \r\n"
+        "gssqc1 $f0, $f2, 480(%0)       \r\n"
+        "gssqc1 $f0, $f2, 496(%0)       \r\n"
+
+        "gssqc1 $f0, $f2, 512(%0)       \r\n"
+        "gssqc1 $f0, $f2, 528(%0)       \r\n"
+        "gssqc1 $f0, $f2, 544(%0)       \r\n"
+        "gssqc1 $f0, $f2, 560(%0)       \r\n"
+        "gssqc1 $f0, $f2, 576(%0)       \r\n"
+        "gssqc1 $f0, $f2, 592(%0)       \r\n"
+        "gssqc1 $f0, $f2, 608(%0)       \r\n"
+        "gssqc1 $f0, $f2, 624(%0)       \r\n"
+
+        "gssqc1 $f0, $f2, 640(%0)       \r\n"
+        "gssqc1 $f0, $f2, 656(%0)       \r\n"
+        "gssqc1 $f0, $f2, 672(%0)       \r\n"
+        "gssqc1 $f0, $f2, 688(%0)       \r\n"
+        "gssqc1 $f0, $f2, 704(%0)       \r\n"
+        "gssqc1 $f0, $f2, 720(%0)       \r\n"
+        "gssqc1 $f0, $f2, 736(%0)       \r\n"
+        "gssqc1 $f0, $f2, 752(%0)       \r\n"
+        ::"r"(block)
+        : "memory"
+    );
+}
diff --git a/libavcodec/mips/blockdsp_msa.c b/libavcodec/mips/blockdsp_msa.c
new file mode 100644
index 0000000000..32ac858e1d
--- /dev/null
+++ b/libavcodec/mips/blockdsp_msa.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (parag.salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "blockdsp_mips.h"
+
+static void copy_8bit_value_width8_msa(uint8_t *src, uint8_t val,
+                                       int32_t src_stride, int32_t height)
+{
+    int32_t cnt;
+    uint64_t dst0;
+    v16u8 val0;
+
+    val0 = (v16u8) __msa_fill_b(val);
+    dst0 = __msa_copy_u_d((v2i64) val0, 0);
+
+    for (cnt = (height >> 2); cnt--;) {
+        SD4(dst0, dst0, dst0, dst0, src, src_stride);
+        src += (4 * src_stride);
+    }
+}
+
+static void copy_8bit_value_width16_msa(uint8_t *src, uint8_t val,
+                                        int32_t src_stride, int32_t height)
+{
+    int32_t cnt;
+    v16u8 val0;
+
+    val0 = (v16u8) __msa_fill_b(val);
+
+    for (cnt = (height >> 3); cnt--;) {
+        ST_UB8(val0, val0, val0, val0, val0, val0, val0, val0, src, src_stride);
+        src += (8 * src_stride);
+    }
+}
+
+static void memset_zero_16width_msa(uint8_t *src, int32_t stride,
+                                    int32_t height)
+{
+    int8_t cnt;
+    v16u8 zero = { 0 };
+
+    for (cnt = (height / 2); cnt--;) {
+        ST_UB(zero, src);
+        src += stride;
+        ST_UB(zero, src);
+        src += stride;
+    }
+}
+
+void ff_fill_block16_msa(uint8_t *src, uint8_t val, int stride, int height)
+{
+    copy_8bit_value_width16_msa(src, val, stride, height);
+}
+
+void ff_fill_block8_msa(uint8_t *src, uint8_t val, int stride, int height)
+{
+    copy_8bit_value_width8_msa(src, val, stride, height);
+}
+
+void ff_clear_block_msa(int16_t *block)
+{
+    memset_zero_16width_msa((uint8_t *) block, 16, 8);
+}
+
+void ff_clear_blocks_msa(int16_t *block)
+{
+    memset_zero_16width_msa((uint8_t *) block, 16, 8 * 6);
+}
diff --git a/libavcodec/mips/celp_filters_mips.c b/libavcodec/mips/celp_filters_mips.c
new file mode 100644
index 0000000000..88ac45841d
--- /dev/null
+++ b/libavcodec/mips/celp_filters_mips.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * various filters for CELP-based codecs optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/celp_filters.c
+ */
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "libavcodec/celp_filters.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void ff_celp_lp_synthesis_filterf_mips(float *out,
+                                  const float *filter_coeffs,
+                                  const float* in, int buffer_length,
+                                  int filter_length)
+{
+    int i,n;
+
+    float out0, out1, out2, out3;
+    float old_out0, old_out1, old_out2, old_out3;
+    float a,b,c;
+    const float *p_filter_coeffs;
+    float *p_out;
+
+    a = filter_coeffs[0];
+    b = filter_coeffs[1];
+    c = filter_coeffs[2];
+    b -= filter_coeffs[0] * filter_coeffs[0];
+    c -= filter_coeffs[1] * filter_coeffs[0];
+    c -= filter_coeffs[0] * b;
+
+    old_out0 = out[-4];
+    old_out1 = out[-3];
+    old_out2 = out[-2];
+    old_out3 = out[-1];
+    for (n = 0; n <= buffer_length - 4; n+=4) {
+        p_filter_coeffs = filter_coeffs;
+        p_out = out;
+
+        out0 = in[0];
+        out1 = in[1];
+        out2 = in[2];
+        out3 = in[3];
+
+        __asm__ volatile(
+            "lwc1       $f2,     8(%[filter_coeffs])                        \n\t"
+            "lwc1       $f1,     4(%[filter_coeffs])                        \n\t"
+            "lwc1       $f0,     0(%[filter_coeffs])                        \n\t"
+            "nmsub.s    %[out0], %[out0],             $f2, %[old_out1]      \n\t"
+            "nmsub.s    %[out1], %[out1],             $f2, %[old_out2]      \n\t"
+            "nmsub.s    %[out2], %[out2],             $f2, %[old_out3]      \n\t"
+            "lwc1       $f3,     12(%[filter_coeffs])                       \n\t"
+            "nmsub.s    %[out0], %[out0],             $f1, %[old_out2]      \n\t"
+            "nmsub.s    %[out1], %[out1],             $f1, %[old_out3]      \n\t"
+            "nmsub.s    %[out2], %[out2],             $f3, %[old_out2]      \n\t"
+            "nmsub.s    %[out0], %[out0],             $f0, %[old_out3]      \n\t"
+            "nmsub.s    %[out3], %[out3],             $f3, %[old_out3]      \n\t"
+            "nmsub.s    %[out1], %[out1],             $f3, %[old_out1]      \n\t"
+            "nmsub.s    %[out0], %[out0],             $f3, %[old_out0]      \n\t"
+
+            : [out0]"+f"(out0), [out1]"+f"(out1),
+              [out2]"+f"(out2), [out3]"+f"(out3)
+            : [old_out0]"f"(old_out0), [old_out1]"f"(old_out1),
+              [old_out2]"f"(old_out2), [old_out3]"f"(old_out3),
+              [filter_coeffs]"r"(filter_coeffs)
+            : "$f0", "$f1", "$f2", "$f3", "$f4", "memory"
+        );
+
+        for (i = 5; i <= filter_length; i += 2) {
+            __asm__ volatile(
+                "lwc1    %[old_out3], -20(%[p_out])                         \n\t"
+                "lwc1    $f5,         16(%[p_filter_coeffs])                \n\t"
+                PTR_ADDIU "%[p_out],  -8                                    \n\t"
+                PTR_ADDIU "%[p_filter_coeffs], 8                            \n\t"
+                "nmsub.s %[out1],     %[out1],      $f5, %[old_out0]        \n\t"
+                "nmsub.s %[out3],     %[out3],      $f5, %[old_out2]        \n\t"
+                "lwc1    $f4,         12(%[p_filter_coeffs])                \n\t"
+                "lwc1    %[old_out2], -16(%[p_out])                         \n\t"
+                "nmsub.s %[out0],     %[out0],      $f5, %[old_out3]        \n\t"
+                "nmsub.s %[out2],     %[out2],      $f5, %[old_out1]        \n\t"
+                "nmsub.s %[out1],     %[out1],      $f4, %[old_out3]        \n\t"
+                "nmsub.s %[out3],     %[out3],      $f4, %[old_out1]        \n\t"
+                "mov.s   %[old_out1], %[old_out3]                           \n\t"
+                "nmsub.s %[out0],     %[out0],      $f4, %[old_out2]        \n\t"
+                "nmsub.s %[out2],     %[out2],      $f4, %[old_out0]        \n\t"
+
+                : [out0]"+f"(out0), [out1]"+f"(out1),
+                  [out2]"+f"(out2), [out3]"+f"(out3), [old_out0]"+f"(old_out0),
+                  [old_out1]"+f"(old_out1), [old_out2]"+f"(old_out2),
+                  [old_out3]"+f"(old_out3),[p_filter_coeffs]"+r"(p_filter_coeffs),
+                  [p_out]"+r"(p_out)
+                :
+                : "$f4", "$f5", "memory"
+            );
+            FFSWAP(float, old_out0, old_out2);
+        }
+
+        __asm__ volatile(
+            "nmsub.s    %[out3], %[out3], %[a], %[out2]                     \n\t"
+            "nmsub.s    %[out2], %[out2], %[a], %[out1]                     \n\t"
+            "nmsub.s    %[out3], %[out3], %[b], %[out1]                     \n\t"
+            "nmsub.s    %[out1], %[out1], %[a], %[out0]                     \n\t"
+            "nmsub.s    %[out2], %[out2], %[b], %[out0]                     \n\t"
+            "nmsub.s    %[out3], %[out3], %[c], %[out0]                     \n\t"
+
+            : [out0]"+f"(out0), [out1]"+f"(out1),
+              [out2]"+f"(out2), [out3]"+f"(out3)
+            : [a]"f"(a), [b]"f"(b), [c]"f"(c)
+        );
+
+        out[0] = out0;
+        out[1] = out1;
+        out[2] = out2;
+        out[3] = out3;
+
+        old_out0 = out0;
+        old_out1 = out1;
+        old_out2 = out2;
+        old_out3 = out3;
+
+        out += 4;
+        in  += 4;
+    }
+
+    out -= n;
+    in -= n;
+    for (; n < buffer_length; n++) {
+        float out_val, out_val_i, fc_val;
+        p_filter_coeffs = filter_coeffs;
+        p_out = &out[n];
+        out_val = in[n];
+        for (i = 1; i <= filter_length; i++) {
+            __asm__ volatile(
+                "lwc1    %[fc_val],          0(%[p_filter_coeffs])                        \n\t"
+                "lwc1    %[out_val_i],       -4(%[p_out])                                 \n\t"
+                PTR_ADDIU "%[p_filter_coeffs], 4                                          \n\t"
+                PTR_ADDIU "%[p_out],         -4                                           \n\t"
+                "nmsub.s %[out_val],         %[out_val],          %[fc_val], %[out_val_i] \n\t"
+
+                : [fc_val]"=&f"(fc_val), [out_val]"+f"(out_val),
+                  [out_val_i]"=&f"(out_val_i), [p_out]"+r"(p_out),
+                  [p_filter_coeffs]"+r"(p_filter_coeffs)
+                :
+                : "memory"
+            );
+        }
+        out[n] = out_val;
+    }
+}
+
+static void ff_celp_lp_zero_synthesis_filterf_mips(float *out,
+                                       const float *filter_coeffs,
+                                       const float *in, int buffer_length,
+                                       int filter_length)
+{
+    int i,n;
+    float sum_out8, sum_out7, sum_out6, sum_out5, sum_out4, fc_val;
+    float sum_out3, sum_out2, sum_out1;
+    const float *p_filter_coeffs, *p_in;
+
+    for (n = 0; n < buffer_length; n+=8) {
+        p_in = &in[n];
+        p_filter_coeffs = filter_coeffs;
+        sum_out8 = in[n+7];
+        sum_out7 = in[n+6];
+        sum_out6 = in[n+5];
+        sum_out5 = in[n+4];
+        sum_out4 = in[n+3];
+        sum_out3 = in[n+2];
+        sum_out2 = in[n+1];
+        sum_out1 = in[n];
+        i = filter_length;
+
+        /* i is always greater than 0
+        * outer loop is unrolled eight times so there is less memory access
+        * inner loop is unrolled two times
+        */
+        __asm__ volatile(
+            "filt_lp_inner%=:                                               \n\t"
+            "lwc1   %[fc_val],   0(%[p_filter_coeffs])                      \n\t"
+            "lwc1   $f7,         6*4(%[p_in])                               \n\t"
+            "lwc1   $f6,         5*4(%[p_in])                               \n\t"
+            "lwc1   $f5,         4*4(%[p_in])                               \n\t"
+            "lwc1   $f4,         3*4(%[p_in])                               \n\t"
+            "lwc1   $f3,         2*4(%[p_in])                               \n\t"
+            "lwc1   $f2,         4(%[p_in])                                 \n\t"
+            "lwc1   $f1,         0(%[p_in])                                 \n\t"
+            "lwc1   $f0,         -4(%[p_in])                                \n\t"
+            "addiu  %[i],        -2                                         \n\t"
+            "madd.s %[sum_out8], %[sum_out8],          %[fc_val], $f7       \n\t"
+            "madd.s %[sum_out7], %[sum_out7],          %[fc_val], $f6       \n\t"
+            "madd.s %[sum_out6], %[sum_out6],          %[fc_val], $f5       \n\t"
+            "madd.s %[sum_out5], %[sum_out5],          %[fc_val], $f4       \n\t"
+            "madd.s %[sum_out4], %[sum_out4],          %[fc_val], $f3       \n\t"
+            "madd.s %[sum_out3], %[sum_out3],          %[fc_val], $f2       \n\t"
+            "madd.s %[sum_out2], %[sum_out2],          %[fc_val], $f1       \n\t"
+            "madd.s %[sum_out1], %[sum_out1],          %[fc_val], $f0       \n\t"
+            "lwc1   %[fc_val],   4(%[p_filter_coeffs])                      \n\t"
+            "lwc1   $f7,         -8(%[p_in])                                \n\t"
+            PTR_ADDIU "%[p_filter_coeffs], 8                                \n\t"
+            PTR_ADDIU "%[p_in],  -8                                         \n\t"
+            "madd.s %[sum_out8], %[sum_out8],          %[fc_val], $f6       \n\t"
+            "madd.s %[sum_out7], %[sum_out7],          %[fc_val], $f5       \n\t"
+            "madd.s %[sum_out6], %[sum_out6],          %[fc_val], $f4       \n\t"
+            "madd.s %[sum_out5], %[sum_out5],          %[fc_val], $f3       \n\t"
+            "madd.s %[sum_out4], %[sum_out4],          %[fc_val], $f2       \n\t"
+            "madd.s %[sum_out3], %[sum_out3],          %[fc_val], $f1       \n\t"
+            "madd.s %[sum_out2], %[sum_out2],          %[fc_val], $f0       \n\t"
+            "madd.s %[sum_out1], %[sum_out1],          %[fc_val], $f7       \n\t"
+            "bgtz   %[i],        filt_lp_inner%=                            \n\t"
+
+            : [sum_out8]"+f"(sum_out8), [sum_out7]"+f"(sum_out7),
+              [sum_out6]"+f"(sum_out6), [sum_out5]"+f"(sum_out5),
+              [sum_out4]"+f"(sum_out4), [sum_out3]"+f"(sum_out3),
+              [sum_out2]"+f"(sum_out2), [sum_out1]"+f"(sum_out1),
+              [fc_val]"=&f"(fc_val), [p_filter_coeffs]"+r"(p_filter_coeffs),
+              [p_in]"+r"(p_in), [i]"+r"(i)
+            :
+            : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "memory"
+        );
+
+        out[n+7] = sum_out8;
+        out[n+6] = sum_out7;
+        out[n+5] = sum_out6;
+        out[n+4] = sum_out5;
+        out[n+3] = sum_out4;
+        out[n+2] = sum_out3;
+        out[n+1] = sum_out2;
+        out[n] = sum_out1;
+    }
+}
+#endif /* HAVE_INLINE_ASM */
+
+void ff_celp_filter_init_mips(CELPFContext *c)
+{
+#if HAVE_INLINE_ASM
+    c->celp_lp_synthesis_filterf        = ff_celp_lp_synthesis_filterf_mips;
+    c->celp_lp_zero_synthesis_filterf   = ff_celp_lp_zero_synthesis_filterf_mips;
+#endif
+}
diff --git a/libavcodec/mips/celp_math_mips.c b/libavcodec/mips/celp_math_mips.c
new file mode 100644
index 0000000000..008dd80308
--- /dev/null
+++ b/libavcodec/mips/celp_math_mips.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * Math operations optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/celp_math.c
+ */
+#include "config.h"
+#include "libavcodec/celp_math.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static float ff_dot_productf_mips(const float* a, const float* b,
+                                              int length)
+{
+    float sum;
+    const float* a_end = a + length;
+
+    __asm__ volatile (
+        "mtc1   $zero,      %[sum]                              \n\t"
+        "blez   %[length],  ff_dot_productf_end%=               \n\t"
+        "ff_dot_productf_madd%=:                                \n\t"
+        "lwc1   $f2,        0(%[a])                             \n\t"
+        "lwc1   $f1,        0(%[b])                             \n\t"
+        PTR_ADDIU "%[a],    %[a],      4                        \n\t"
+        PTR_ADDIU "%[b],    %[b],      4                        \n\t"
+        "madd.s %[sum],     %[sum],    $f1, $f2                 \n\t"
+        "bne   %[a],        %[a_end],  ff_dot_productf_madd%=   \n\t"
+        "ff_dot_productf_end%=:                                 \n\t"
+
+        : [sum] "=&f" (sum), [a] "+r" (a), [b] "+r" (b)
+        : [a_end]"r"(a_end), [length] "r" (length)
+        : "$f1", "$f2", "memory"
+    );
+    return sum;
+}
+#endif /* HAVE_INLINE_ASM */
+
+void ff_celp_math_init_mips(CELPMContext *c)
+{
+#if HAVE_INLINE_ASM
+    c->dot_productf = ff_dot_productf_mips;
+#endif
+}
diff --git a/libavcodec/mips/compute_antialias_fixed.h b/libavcodec/mips/compute_antialias_fixed.h
new file mode 100644
index 0000000000..a967f67de7
--- /dev/null
+++ b/libavcodec/mips/compute_antialias_fixed.h
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * Compute antialias function optimised for MIPS fixed-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/mpegaudiodec.c
+ */
+
+#ifndef AVCODEC_MIPS_COMPUTE_ANTIALIAS_FIXED_H
+#define AVCODEC_MIPS_COMPUTE_ANTIALIAS_FIXED_H
+
+#if HAVE_INLINE_ASM
+static void compute_antialias_mips_fixed(MPADecodeContext *s,
+                                        GranuleDef *g)
+{
+    int32_t *ptr, *csa;
+    int n, i;
+    int MAX_lo = 0xffffffff;
+
+    /* we antialias only "long" bands */
+    if (g->block_type == 2) {
+        if (!g->switch_point)
+            return;
+        /* XXX: check this for 8000Hz case */
+        n = 1;
+    } else {
+        n = SBLIMIT - 1;
+    }
+
+
+    ptr = g->sb_hybrid + 18;
+
+    for(i = n;i > 0;i--) {
+        int tmp0, tmp1, tmp2, tmp00, tmp11;
+        int temp_reg1, temp_reg2, temp_reg3, temp_reg4, temp_reg5, temp_reg6;
+        csa = &csa_table[0][0];
+
+        /**
+         * instructions are scheduled to minimize pipeline stall.
+         */
+        __asm__ volatile (
+            "lw   %[tmp0],      -1*4(%[ptr])                            \n\t"
+            "lw   %[tmp1],      0*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg1], 0*4(%[csa])                             \n\t"
+            "lw   %[temp_reg2], 2*4(%[csa])                             \n\t"
+            "add  %[tmp2],      %[tmp0],      %[tmp1]                   \n\t"
+            "lw   %[temp_reg3], 3*4(%[csa])                             \n\t"
+            "mult $ac0,         %[tmp2],      %[temp_reg1]              \n\t"
+            "mult $ac1,         %[tmp2],      %[temp_reg1]              \n\t"
+            "lw   %[tmp00],     -2*4(%[ptr])                            \n\t"
+            "lw   %[tmp11],     1*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg4], 4*4(%[csa])                             \n\t"
+            "mtlo %[MAX_lo],    $ac0                                    \n\t"
+            "mtlo $zero,        $ac1                                    \n\t"
+            "msub $ac0,         %[tmp1],      %[temp_reg2]              \n\t"
+            "madd $ac1,         %[tmp0],      %[temp_reg3]              \n\t"
+            "add  %[tmp2],      %[tmp00],     %[tmp11]                  \n\t"
+            "lw   %[temp_reg5], 6*4(%[csa])                             \n\t"
+            "mult $ac2,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mult $ac3,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mfhi %[temp_reg1], $ac0                                    \n\t"
+            "mfhi %[temp_reg2], $ac1                                    \n\t"
+            "lw   %[temp_reg6], 7*4(%[csa])                             \n\t"
+            "mtlo %[MAX_lo],    $ac2                                    \n\t"
+            "msub $ac2,         %[tmp11],     %[temp_reg5]              \n\t"
+            "mtlo $zero,        $ac3                                    \n\t"
+            "madd $ac3,         %[tmp00],     %[temp_reg6]              \n\t"
+            "sll  %[temp_reg1], %[temp_reg1], 2                         \n\t"
+            "sw   %[temp_reg1], -1*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg4], $ac2                                    \n\t"
+            "sll  %[temp_reg2], %[temp_reg2], 2                         \n\t"
+            "mfhi %[temp_reg5], $ac3                                    \n\t"
+            "sw   %[temp_reg2], 0*4(%[ptr])                             \n\t"
+            "lw   %[tmp0],      -3*4(%[ptr])                            \n\t"
+            "lw   %[tmp1],      2*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg1], 8*4(%[csa])                             \n\t"
+            "sll  %[temp_reg4], %[temp_reg4], 2                         \n\t"
+            "add  %[tmp2],      %[tmp0],      %[tmp1]                   \n\t"
+            "sll  %[temp_reg5], %[temp_reg5], 2                         \n\t"
+            "mult $ac0,         %[tmp2],      %[temp_reg1]              \n\t"
+            "mult $ac1,         %[tmp2],      %[temp_reg1]              \n\t"
+            "sw   %[temp_reg4], -2*4(%[ptr])                            \n\t"
+            "sw   %[temp_reg5], 1*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg2], 10*4(%[csa])                            \n\t"
+            "mtlo %[MAX_lo],    $ac0                                    \n\t"
+            "lw   %[temp_reg3], 11*4(%[csa])                            \n\t"
+            "msub $ac0,         %[tmp1],      %[temp_reg2]              \n\t"
+            "mtlo $zero,        $ac1                                    \n\t"
+            "madd $ac1,         %[tmp0],      %[temp_reg3]              \n\t"
+            "lw   %[tmp00],     -4*4(%[ptr])                            \n\t"
+            "lw   %[tmp11],     3*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg1], $ac0                                    \n\t"
+            "lw   %[temp_reg4], 12*4(%[csa])                            \n\t"
+            "mfhi %[temp_reg2], $ac1                                    \n\t"
+            "add  %[tmp2],      %[tmp00],     %[tmp11]                  \n\t"
+            "mult $ac2,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mult $ac3,         %[tmp2],      %[temp_reg4]              \n\t"
+            "lw   %[temp_reg5], 14*4(%[csa])                            \n\t"
+            "lw   %[temp_reg6], 15*4(%[csa])                            \n\t"
+            "sll  %[temp_reg1], %[temp_reg1], 2                         \n\t"
+            "mtlo %[MAX_lo],    $ac2                                    \n\t"
+            "msub $ac2,         %[tmp11],     %[temp_reg5]              \n\t"
+            "mtlo $zero,        $ac3                                    \n\t"
+            "madd $ac3,         %[tmp00],     %[temp_reg6]              \n\t"
+            "sll  %[temp_reg2], %[temp_reg2], 2                         \n\t"
+            "sw   %[temp_reg1], -3*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg4], $ac2                                    \n\t"
+            "sw   %[temp_reg2], 2*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg5], $ac3                                    \n\t"
+            "lw   %[tmp0],      -5*4(%[ptr])                            \n\t"
+            "lw   %[tmp1],      4*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg1], 16*4(%[csa])                            \n\t"
+            "lw   %[temp_reg2], 18*4(%[csa])                            \n\t"
+            "add  %[tmp2],      %[tmp0],      %[tmp1]                   \n\t"
+            "lw   %[temp_reg3], 19*4(%[csa])                            \n\t"
+            "mult $ac0,         %[tmp2],      %[temp_reg1]              \n\t"
+            "mult $ac1,         %[tmp2],      %[temp_reg1]              \n\t"
+            "sll  %[temp_reg4], %[temp_reg4], 2                         \n\t"
+            "sll  %[temp_reg5], %[temp_reg5], 2                         \n\t"
+            "sw   %[temp_reg4], -4*4(%[ptr])                            \n\t"
+            "mtlo %[MAX_lo],    $ac0                                    \n\t"
+            "msub $ac0,         %[tmp1],      %[temp_reg2]              \n\t"
+            "mtlo $zero,        $ac1                                    \n\t"
+            "madd $ac1,         %[tmp0],      %[temp_reg3]              \n\t"
+            "sw   %[temp_reg5], 3*4(%[ptr])                             \n\t"
+            "lw   %[tmp00],     -6*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg1], $ac0                                    \n\t"
+            "lw   %[tmp11],     5*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg2], $ac1                                    \n\t"
+            "lw   %[temp_reg4], 20*4(%[csa])                            \n\t"
+            "add  %[tmp2],      %[tmp00],     %[tmp11]                  \n\t"
+            "lw   %[temp_reg5], 22*4(%[csa])                            \n\t"
+            "mult $ac2,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mult $ac3,         %[tmp2],      %[temp_reg4]              \n\t"
+            "lw   %[temp_reg6], 23*4(%[csa])                            \n\t"
+            "sll  %[temp_reg1], %[temp_reg1], 2                         \n\t"
+            "sll  %[temp_reg2], %[temp_reg2], 2                         \n\t"
+            "mtlo %[MAX_lo],    $ac2                                    \n\t"
+            "msub $ac2,         %[tmp11],     %[temp_reg5]              \n\t"
+            "mtlo $zero,        $ac3                                    \n\t"
+            "madd $ac3,         %[tmp00],     %[temp_reg6]              \n\t"
+            "sw   %[temp_reg1], -5*4(%[ptr])                            \n\t"
+            "sw   %[temp_reg2], 4*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg4], $ac2                                    \n\t"
+            "lw   %[tmp0],      -7*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg5], $ac3                                    \n\t"
+            "lw   %[tmp1],      6*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg1], 24*4(%[csa])                            \n\t"
+            "lw   %[temp_reg2], 26*4(%[csa])                            \n\t"
+            "add  %[tmp2],      %[tmp0],      %[tmp1]                   \n\t"
+            "lw   %[temp_reg3], 27*4(%[csa])                            \n\t"
+            "mult $ac0,         %[tmp2],      %[temp_reg1]              \n\t"
+            "mult $ac1,         %[tmp2],      %[temp_reg1]              \n\t"
+            "sll  %[temp_reg4], %[temp_reg4], 2                         \n\t"
+            "sll  %[temp_reg5], %[temp_reg5], 2                         \n\t"
+            "sw   %[temp_reg4], -6*4(%[ptr])                            \n\t"
+            "mtlo %[MAX_lo],    $ac0                                    \n\t"
+            "msub $ac0,         %[tmp1],      %[temp_reg2]              \n\t"
+            "mtlo $zero,        $ac1                                    \n\t"
+            "madd $ac1,         %[tmp0],      %[temp_reg3]              \n\t"
+            "sw   %[temp_reg5], 5*4(%[ptr])                             \n\t"
+            "lw   %[tmp00],     -8*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg1], $ac0                                    \n\t"
+            "lw   %[tmp11],     7*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg2], $ac1                                    \n\t"
+            "lw   %[temp_reg4], 28*4(%[csa])                            \n\t"
+            "add  %[tmp2],      %[tmp00],     %[tmp11]                  \n\t"
+            "lw   %[temp_reg5], 30*4(%[csa])                            \n\t"
+            "mult $ac2,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mult $ac3,         %[tmp2],      %[temp_reg4]              \n\t"
+            "lw   %[temp_reg6], 31*4(%[csa])                            \n\t"
+            "sll  %[temp_reg1], %[temp_reg1], 2                         \n\t"
+            "sll  %[temp_reg2], %[temp_reg2], 2                         \n\t"
+            "mtlo %[MAX_lo],    $ac2                                    \n\t"
+            "msub $ac2,         %[tmp11],     %[temp_reg5]              \n\t"
+            "mtlo $zero,        $ac3                                    \n\t"
+            "madd $ac3,         %[tmp00],     %[temp_reg6]              \n\t"
+            "sw   %[temp_reg1], -7*4(%[ptr])                            \n\t"
+            "sw   %[temp_reg2], 6*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg4], $ac2                                    \n\t"
+            "mfhi %[temp_reg5], $ac3                                    \n\t"
+            "sll  %[temp_reg4], %[temp_reg4], 2                         \n\t"
+            "sll  %[temp_reg5], %[temp_reg5], 2                         \n\t"
+            "sw   %[temp_reg4], -8*4(%[ptr])                            \n\t"
+            "sw   %[temp_reg5], 7*4(%[ptr])                             \n\t"
+
+            : [tmp0] "=&r" (tmp0), [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+              [tmp00] "=&r" (tmp00), [tmp11] "=&r" (tmp11),
+              [temp_reg1] "=&r" (temp_reg1), [temp_reg2] "=&r" (temp_reg2),
+              [temp_reg3] "=&r" (temp_reg3), [temp_reg4] "=&r" (temp_reg4),
+              [temp_reg5] "=&r" (temp_reg5), [temp_reg6] "=&r" (temp_reg6)
+            : [csa] "r" (csa), [ptr] "r" (ptr),
+              [MAX_lo] "r" (MAX_lo)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+              "$ac3hi", "$ac3lo"
+         );
+
+        ptr += 18;
+    }
+}
+#define compute_antialias compute_antialias_mips_fixed
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_MIPS_COMPUTE_ANTIALIAS_FIXED_H */
diff --git a/libavcodec/mips/compute_antialias_float.h b/libavcodec/mips/compute_antialias_float.h
new file mode 100644
index 0000000000..f6cf46508b
--- /dev/null
+++ b/libavcodec/mips/compute_antialias_float.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * Compute antialias function optimised for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/mpegaudiodec.c
+ */
+
+#ifndef AVCODEC_MIPS_COMPUTE_ANTIALIAS_FLOAT_H
+#define AVCODEC_MIPS_COMPUTE_ANTIALIAS_FLOAT_H
+
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void compute_antialias_mips_float(MPADecodeContext *s,
+                                        GranuleDef *g)
+{
+    float *ptr, *ptr_end;
+    float *csa = &csa_table[0][0];
+    /* temporary variables */
+    float in1, in2, in3, in4, in5, in6, in7, in8;
+    float out1, out2, out3, out4;
+
+    ptr = g->sb_hybrid + 18;
+    /* we antialias only "long" bands */
+    if (g->block_type == 2) {
+        if (!g->switch_point)
+            return;
+        /* XXX: check this for 8000Hz case */
+        ptr_end = ptr + 18;
+    } else {
+        ptr_end = ptr + 558;
+    }
+
+    /**
+    * instructions are scheduled to minimize pipeline stall.
+    */
+
+    __asm__ volatile (
+        "compute_antialias_float_loop%=:                                \t\n"
+        "lwc1    %[in1],  -1*4(%[ptr])                                  \t\n"
+        "lwc1    %[in2],  0(%[csa])                                     \t\n"
+        "lwc1    %[in3],  1*4(%[csa])                                   \t\n"
+        "lwc1    %[in4],  0(%[ptr])                                     \t\n"
+        "lwc1    %[in5],  -2*4(%[ptr])                                  \t\n"
+        "lwc1    %[in6],  4*4(%[csa])                                   \t\n"
+        "mul.s   %[out1], %[in1],  %[in2]                               \t\n"
+        "mul.s   %[out2], %[in1],  %[in3]                               \t\n"
+        "lwc1    %[in7],  5*4(%[csa])                                   \t\n"
+        "lwc1    %[in8],  1*4(%[ptr])                                   \t\n"
+        "nmsub.s %[out1], %[out1], %[in3], %[in4]                       \t\n"
+        "madd.s  %[out2], %[out2], %[in2], %[in4]                       \t\n"
+        "mul.s   %[out3], %[in5],  %[in6]                               \t\n"
+        "mul.s   %[out4], %[in5],  %[in7]                               \t\n"
+        "lwc1    %[in1],  -3*4(%[ptr])                                  \t\n"
+        "swc1    %[out1], -1*4(%[ptr])                                  \t\n"
+        "swc1    %[out2], 0(%[ptr])                                     \t\n"
+        "nmsub.s %[out3], %[out3], %[in7], %[in8]                       \t\n"
+        "madd.s  %[out4], %[out4], %[in6], %[in8]                       \t\n"
+        "lwc1    %[in2],  8*4(%[csa])                                   \t\n"
+        "swc1    %[out3], -2*4(%[ptr])                                  \t\n"
+        "swc1    %[out4], 1*4(%[ptr])                                   \t\n"
+        "lwc1    %[in3],  9*4(%[csa])                                   \t\n"
+        "lwc1    %[in4],  2*4(%[ptr])                                   \t\n"
+        "mul.s   %[out1], %[in1],  %[in2]                               \t\n"
+        "lwc1    %[in5],  -4*4(%[ptr])                                  \t\n"
+        "lwc1    %[in6],  12*4(%[csa])                                  \t\n"
+        "mul.s   %[out2], %[in1],  %[in3]                               \t\n"
+        "lwc1    %[in7],  13*4(%[csa])                                  \t\n"
+        "nmsub.s %[out1], %[out1], %[in3], %[in4]                       \t\n"
+        "lwc1    %[in8],  3*4(%[ptr])                                   \t\n"
+        "mul.s   %[out3], %[in5],  %[in6]                               \t\n"
+        "madd.s  %[out2], %[out2], %[in2], %[in4]                       \t\n"
+        "mul.s   %[out4], %[in5],  %[in7]                               \t\n"
+        "swc1    %[out1], -3*4(%[ptr])                                  \t\n"
+        "lwc1    %[in1],  -5*4(%[ptr])                                  \t\n"
+        "nmsub.s %[out3], %[out3], %[in7], %[in8]                       \t\n"
+        "swc1    %[out2], 2*4(%[ptr])                                   \t\n"
+        "madd.s  %[out4], %[out4], %[in6], %[in8]                       \t\n"
+        "lwc1    %[in2],  16*4(%[csa])                                  \t\n"
+        "lwc1    %[in3],  17*4(%[csa])                                  \t\n"
+        "swc1    %[out3], -4*4(%[ptr])                                  \t\n"
+        "lwc1    %[in4],  4*4(%[ptr])                                   \t\n"
+        "swc1    %[out4], 3*4(%[ptr])                                   \t\n"
+        "mul.s   %[out1], %[in1],  %[in2]                               \t\n"
+        "mul.s   %[out2], %[in1],  %[in3]                               \t\n"
+        "lwc1    %[in5],  -6*4(%[ptr])                                  \t\n"
+        "lwc1    %[in6],  20*4(%[csa])                                  \t\n"
+        "lwc1    %[in7],  21*4(%[csa])                                  \t\n"
+        "nmsub.s %[out1], %[out1], %[in3], %[in4]                       \t\n"
+        "madd.s  %[out2], %[out2], %[in2], %[in4]                       \t\n"
+        "lwc1    %[in8],  5*4(%[ptr])                                   \t\n"
+        "mul.s   %[out3], %[in5],  %[in6]                               \t\n"
+        "mul.s   %[out4], %[in5],  %[in7]                               \t\n"
+        "swc1    %[out1], -5*4(%[ptr])                                  \t\n"
+        "swc1    %[out2], 4*4(%[ptr])                                   \t\n"
+        "lwc1    %[in1],  -7*4(%[ptr])                                  \t\n"
+        "nmsub.s %[out3], %[out3], %[in7], %[in8]                       \t\n"
+        "madd.s  %[out4], %[out4], %[in6], %[in8]                       \t\n"
+        "lwc1    %[in2],  24*4(%[csa])                                  \t\n"
+        "lwc1    %[in3],  25*4(%[csa])                                  \t\n"
+        "lwc1    %[in4],  6*4(%[ptr])                                   \t\n"
+        "swc1    %[out3], -6*4(%[ptr])                                  \t\n"
+        "swc1    %[out4], 5*4(%[ptr])                                   \t\n"
+        "mul.s   %[out1], %[in1],  %[in2]                               \t\n"
+        "lwc1    %[in5],  -8*4(%[ptr])                                  \t\n"
+        "mul.s   %[out2], %[in1],  %[in3]                               \t\n"
+        "lwc1    %[in6],  28*4(%[csa])                                  \t\n"
+        "lwc1    %[in7],  29*4(%[csa])                                  \t\n"
+        "nmsub.s %[out1], %[out1], %[in3], %[in4]                       \t\n"
+        "lwc1    %[in8],  7*4(%[ptr])                                   \t\n"
+        "madd.s  %[out2], %[out2], %[in2], %[in4]                       \t\n"
+        "mul.s   %[out3], %[in5],  %[in6]                               \t\n"
+        "mul.s   %[out4], %[in5],  %[in7]                               \t\n"
+        "swc1    %[out1], -7*4(%[ptr])                                  \t\n"
+        "swc1    %[out2], 6*4(%[ptr])                                   \t\n"
+        PTR_ADDIU "%[ptr],%[ptr],  72                                   \t\n"
+        "nmsub.s %[out3], %[out3], %[in7], %[in8]                       \t\n"
+        "madd.s  %[out4], %[out4], %[in6], %[in8]                       \t\n"
+        "swc1    %[out3], -26*4(%[ptr])                                 \t\n"
+        "swc1    %[out4], -11*4(%[ptr])                                 \t\n"
+        "bne     %[ptr],  %[ptr_end],  compute_antialias_float_loop%=   \t\n"
+
+        : [ptr] "+r" (ptr),
+          [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5), [in6] "=&f" (in6),
+          [in7] "=&f" (in7), [in8] "=&f" (in8),
+          [out1] "=&f" (out1), [out2] "=&f" (out2),
+          [out3] "=&f" (out3), [out4] "=&f" (out4)
+        : [csa] "r" (csa), [ptr_end] "r" (ptr_end)
+        : "memory"
+    );
+}
+#define compute_antialias compute_antialias_mips_float
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_MIPS_COMPUTE_ANTIALIAS_FLOAT_H */
diff --git a/libavcodec/mips/constants.c b/libavcodec/mips/constants.c
new file mode 100644
index 0000000000..f8130d9ee1
--- /dev/null
+++ b/libavcodec/mips/constants.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/mem.h"
+#include "constants.h"
+
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_1) =       {0x0001000100010001ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_3) =       {0x0003000300030003ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_4) =       {0x0004000400040004ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_5) =       {0x0005000500050005ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_8) =       {0x0008000800080008ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_9) =       {0x0009000900090009ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_10) =      {0x000A000A000A000AULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_16) =      {0x0010001000100010ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_18) =      {0x0012001200120012ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) =      {0x0014001400140014ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_28) =      {0x001C001C001C001CULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_32) =      {0x0020002000200020ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) =      {0x0035003500350035ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_64) =      {0x0040004000400040ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) =     {0x0080008000800080ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_512) =     {0x0200020002000200ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_m8tom5) =  {0xFFFBFFFAFFF9FFF8ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_m4tom1) =  {0xFFFFFFFEFFFDFFFCULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_1to4) =    {0x0004000300020001ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_5to8) =    {0x0008000700060005ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_0to3) =    {0x0003000200010000ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_4to7) =    {0x0007000600050004ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_8tob) =    {0x000b000a00090008ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_ctof) =    {0x000f000e000d000cULL};
+
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_1) =       {0x0101010101010101ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_3) =       {0x0303030303030303ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_80) =      {0x8080808080808080ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1) =      {0xA1A1A1A1A1A1A1A1ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_FE) =      {0xFEFEFEFEFEFEFEFEULL};
+
+DECLARE_ALIGNED(8, const uint64_t, ff_rnd) =        {0x0004000400040004ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_rnd2) =       {0x0040004000400040ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_rnd3) =       {0x0020002000200020ULL};
+
+DECLARE_ALIGNED(8, const uint64_t, ff_wm1010) =     {0xFFFF0000FFFF0000ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_d40000) =     {0x0000000000040000ULL};
diff --git a/libavcodec/mips/constants.h b/libavcodec/mips/constants.h
new file mode 100644
index 0000000000..0a4effdaa9
--- /dev/null
+++ b/libavcodec/mips/constants.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_CONSTANTS_H
+#define AVCODEC_MIPS_CONSTANTS_H
+
+#include <stdint.h>
+
+extern const uint64_t ff_pw_1;
+extern const uint64_t ff_pw_3;
+extern const uint64_t ff_pw_4;
+extern const uint64_t ff_pw_5;
+extern const uint64_t ff_pw_8;
+extern const uint64_t ff_pw_9;
+extern const uint64_t ff_pw_10;
+extern const uint64_t ff_pw_16;
+extern const uint64_t ff_pw_18;
+extern const uint64_t ff_pw_20;
+extern const uint64_t ff_pw_28;
+extern const uint64_t ff_pw_32;
+extern const uint64_t ff_pw_53;
+extern const uint64_t ff_pw_64;
+extern const uint64_t ff_pw_128;
+extern const uint64_t ff_pw_512;
+extern const uint64_t ff_pw_m8tom5;
+extern const uint64_t ff_pw_m4tom1;
+extern const uint64_t ff_pw_1to4;
+extern const uint64_t ff_pw_5to8;
+extern const uint64_t ff_pw_0to3;
+extern const uint64_t ff_pw_4to7;
+extern const uint64_t ff_pw_8tob;
+extern const uint64_t ff_pw_ctof;
+
+extern const uint64_t ff_pb_1;
+extern const uint64_t ff_pb_3;
+extern const uint64_t ff_pb_80;
+extern const uint64_t ff_pb_A1;
+extern const uint64_t ff_pb_FE;
+
+extern const uint64_t ff_rnd;
+extern const uint64_t ff_rnd2;
+extern const uint64_t ff_rnd3;
+
+extern const uint64_t ff_wm1010;
+extern const uint64_t ff_d40000;
+
+#endif /* AVCODEC_MIPS_CONSTANTS_H */
diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c
new file mode 100644
index 0000000000..cf008c6561
--- /dev/null
+++ b/libavcodec/mips/fft_mips.c
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
+ * Author:  Zoran Lukic (zoranl@mips.com)
+ *
+ * Optimized MDCT/IMDCT and FFT transforms
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft_table.h"
+#include "libavutil/mips/asmdefs.h"
+
+/**
+ * FFT transform
+ */
+
+#if HAVE_INLINE_ASM
+static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
+{
+    int nbits, i, n, num_transforms, offset, step;
+    int n4, n2, n34;
+    FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    FFTComplex *tmpz;
+    float w_re, w_im;
+    float *w_re_ptr, *w_im_ptr;
+    const int fft_size = (1 << s->nbits);
+    float pom,  pom1,  pom2,  pom3;
+    float temp, temp1, temp3, temp4;
+    FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
+    FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
+
+    num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
+
+    for (n=0; n<num_transforms; n++) {
+        offset = ff_fft_offsets_lut[n] << 2;
+        tmpz = z + offset;
+
+        tmp1 = tmpz[0].re + tmpz[1].re;
+        tmp5 = tmpz[2].re + tmpz[3].re;
+        tmp2 = tmpz[0].im + tmpz[1].im;
+        tmp6 = tmpz[2].im + tmpz[3].im;
+        tmp3 = tmpz[0].re - tmpz[1].re;
+        tmp8 = tmpz[2].im - tmpz[3].im;
+        tmp4 = tmpz[0].im - tmpz[1].im;
+        tmp7 = tmpz[2].re - tmpz[3].re;
+
+        tmpz[0].re = tmp1 + tmp5;
+        tmpz[2].re = tmp1 - tmp5;
+        tmpz[0].im = tmp2 + tmp6;
+        tmpz[2].im = tmp2 - tmp6;
+        tmpz[1].re = tmp3 + tmp8;
+        tmpz[3].re = tmp3 - tmp8;
+        tmpz[1].im = tmp4 - tmp7;
+        tmpz[3].im = tmp4 + tmp7;
+
+    }
+
+    if (fft_size < 8)
+        return;
+
+    num_transforms = (num_transforms >> 1) | 1;
+
+    for (n=0; n<num_transforms; n++) {
+        offset = ff_fft_offsets_lut[n] << 3;
+        tmpz = z + offset;
+
+        __asm__ volatile (
+            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
+            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
+            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
+            "lwc1  %[pom1], 56(%[tmpz])                     \n\t"
+            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
+            "lwc1  %[pom2], 44(%[tmpz])                     \n\t"
+            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
+            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
+            "add.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re + tmpz[5].re;
+            "add.s %[tmp3], %[tmp3],    %[pom1]             \n\t"  // tmp3 = tmpz[6].re + tmpz[7].re;
+            "add.s %[tmp2], %[tmp2],    %[pom2]             \n\t"  // tmp2 = tmpz[4].im + tmpz[5].im;
+            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
+            "add.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im + tmpz[7].im;
+            "add.s %[tmp5], %[tmp1],    %[tmp3]             \n\t"  // tmp5 = tmp1 + tmp3;
+            "sub.s %[tmp7], %[tmp1],    %[tmp3]             \n\t"  // tmp7 = tmp1 - tmp3;
+            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
+            "lwc1  %[pom1], 44(%[tmpz])                     \n\t"
+            "add.s %[tmp6], %[tmp2],    %[tmp4]             \n\t"  // tmp6 = tmp2 + tmp4;
+            "sub.s %[tmp8], %[tmp2],    %[tmp4]             \n\t"  // tmp8 = tmp2 - tmp4;
+            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
+            "lwc1  %[pom2], 56(%[tmpz])                     \n\t"
+            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
+            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
+            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
+            "sub.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re - tmpz[5].re;
+            "lwc1  %[pom],  0(%[tmpz])                      \n\t"
+            "sub.s %[tmp2], %[tmp2],    %[pom1]             \n\t"  // tmp2 = tmpz[4].im - tmpz[5].im;
+            "sub.s %[tmp3], %[tmp3],    %[pom2]             \n\t"  // tmp3 = tmpz[6].re - tmpz[7].re;
+            "lwc1  %[pom2], 4(%[tmpz])                      \n\t"
+            "sub.s %[pom1], %[pom],     %[tmp5]             \n\t"
+            "sub.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im - tmpz[7].im;
+            "add.s %[pom3], %[pom],     %[tmp5]             \n\t"
+            "sub.s %[pom],  %[pom2],    %[tmp6]             \n\t"
+            "add.s %[pom2], %[pom2],    %[tmp6]             \n\t"
+            "swc1  %[pom1], 32(%[tmpz])                     \n\t"  // tmpz[4].re = tmpz[0].re - tmp5;
+            "swc1  %[pom3], 0(%[tmpz])                      \n\t"  // tmpz[0].re = tmpz[0].re + tmp5;
+            "swc1  %[pom],  36(%[tmpz])                     \n\t"  // tmpz[4].im = tmpz[0].im - tmp6;
+            "swc1  %[pom2], 4(%[tmpz])                      \n\t"  // tmpz[0].im = tmpz[0].im + tmp6;
+            "lwc1  %[pom1], 16(%[tmpz])                     \n\t"
+            "lwc1  %[pom3], 20(%[tmpz])                     \n\t"
+            "li.s  %[pom],  0.7071067812                    \n\t"  // float pom = 0.7071067812f;
+            "add.s %[temp1],%[tmp1],    %[tmp2]             \n\t"
+            "sub.s %[temp], %[pom1],    %[tmp8]             \n\t"
+            "add.s %[pom2], %[pom3],    %[tmp7]             \n\t"
+            "sub.s %[temp3],%[tmp3],    %[tmp4]             \n\t"
+            "sub.s %[temp4],%[tmp2],    %[tmp1]             \n\t"
+            "swc1  %[temp], 48(%[tmpz])                     \n\t"  // tmpz[6].re = tmpz[2].re - tmp8;
+            "swc1  %[pom2], 52(%[tmpz])                     \n\t"  // tmpz[6].im = tmpz[2].im + tmp7;
+            "add.s %[pom1], %[pom1],    %[tmp8]             \n\t"
+            "sub.s %[pom3], %[pom3],    %[tmp7]             \n\t"
+            "add.s %[tmp3], %[tmp3],    %[tmp4]             \n\t"
+            "mul.s %[tmp5], %[pom],     %[temp1]            \n\t"  // tmp5 = pom * (tmp1 + tmp2);
+            "mul.s %[tmp7], %[pom],     %[temp3]            \n\t"  // tmp7 = pom * (tmp3 - tmp4);
+            "mul.s %[tmp6], %[pom],     %[temp4]            \n\t"  // tmp6 = pom * (tmp2 - tmp1);
+            "mul.s %[tmp8], %[pom],     %[tmp3]             \n\t"  // tmp8 = pom * (tmp3 + tmp4);
+            "swc1  %[pom1], 16(%[tmpz])                     \n\t"  // tmpz[2].re = tmpz[2].re + tmp8;
+            "swc1  %[pom3], 20(%[tmpz])                     \n\t"  // tmpz[2].im = tmpz[2].im - tmp7;
+            "add.s %[tmp1], %[tmp5],    %[tmp7]             \n\t"  // tmp1 = tmp5 + tmp7;
+            "sub.s %[tmp3], %[tmp5],    %[tmp7]             \n\t"  // tmp3 = tmp5 - tmp7;
+            "add.s %[tmp2], %[tmp6],    %[tmp8]             \n\t"  // tmp2 = tmp6 + tmp8;
+            "sub.s %[tmp4], %[tmp6],    %[tmp8]             \n\t"  // tmp4 = tmp6 - tmp8;
+            "lwc1  %[temp], 8(%[tmpz])                      \n\t"
+            "lwc1  %[temp1],12(%[tmpz])                     \n\t"
+            "lwc1  %[pom],  24(%[tmpz])                     \n\t"
+            "lwc1  %[pom2], 28(%[tmpz])                     \n\t"
+            "sub.s %[temp4],%[temp],    %[tmp1]             \n\t"
+            "sub.s %[temp3],%[temp1],   %[tmp2]             \n\t"
+            "add.s %[temp], %[temp],    %[tmp1]             \n\t"
+            "add.s %[temp1],%[temp1],   %[tmp2]             \n\t"
+            "sub.s %[pom1], %[pom],     %[tmp4]             \n\t"
+            "add.s %[pom3], %[pom2],    %[tmp3]             \n\t"
+            "add.s %[pom],  %[pom],     %[tmp4]             \n\t"
+            "sub.s %[pom2], %[pom2],    %[tmp3]             \n\t"
+            "swc1  %[temp4],40(%[tmpz])                     \n\t"  // tmpz[5].re = tmpz[1].re - tmp1;
+            "swc1  %[temp3],44(%[tmpz])                     \n\t"  // tmpz[5].im = tmpz[1].im - tmp2;
+            "swc1  %[temp], 8(%[tmpz])                      \n\t"  // tmpz[1].re = tmpz[1].re + tmp1;
+            "swc1  %[temp1],12(%[tmpz])                     \n\t"  // tmpz[1].im = tmpz[1].im + tmp2;
+            "swc1  %[pom1], 56(%[tmpz])                     \n\t"  // tmpz[7].re = tmpz[3].re - tmp4;
+            "swc1  %[pom3], 60(%[tmpz])                     \n\t"  // tmpz[7].im = tmpz[3].im + tmp3;
+            "swc1  %[pom],  24(%[tmpz])                     \n\t"  // tmpz[3].re = tmpz[3].re + tmp4;
+            "swc1  %[pom2], 28(%[tmpz])                     \n\t"  // tmpz[3].im = tmpz[3].im - tmp3;
+            : [tmp1]"=&f"(tmp1), [pom]"=&f"(pom),   [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
+              [tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5),  [tmp7]"=&f"(tmp7),
+              [tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
+            : [tmpz]"r"(tmpz)
+            : "memory"
+        );
+    }
+
+    step = 1 << (MAX_LOG2_NFFT - 4);
+    n4 = 4;
+
+    for (nbits=4; nbits<=s->nbits; nbits++) {
+        num_transforms = (num_transforms >> 1) | 1;
+        n2  = 2 * n4;
+        n34 = 3 * n4;
+
+        for (n=0; n<num_transforms; n++) {
+            offset = ff_fft_offsets_lut[n] << nbits;
+            tmpz = z + offset;
+
+            tmpz_n2  = tmpz +  n2;
+            tmpz_n4  = tmpz +  n4;
+            tmpz_n34 = tmpz +  n34;
+
+            __asm__ volatile (
+                "lwc1  %[pom1], 0(%[tmpz_n2])            \n\t"
+                "lwc1  %[pom],  0(%[tmpz_n34])           \n\t"
+                "lwc1  %[pom2], 4(%[tmpz_n2])            \n\t"
+                "lwc1  %[pom3], 4(%[tmpz_n34])           \n\t"
+                "lwc1  %[temp1],0(%[tmpz])               \n\t"
+                "lwc1  %[temp3],4(%[tmpz])               \n\t"
+                "add.s %[tmp5], %[pom1],      %[pom]     \n\t"   //  tmp5 = tmpz[ n2].re + tmpz[n34].re;
+                "sub.s %[tmp1], %[pom1],      %[pom]     \n\t"   //  tmp1 = tmpz[ n2].re - tmpz[n34].re;
+                "add.s %[tmp6], %[pom2],      %[pom3]    \n\t"   //  tmp6 = tmpz[ n2].im + tmpz[n34].im;
+                "sub.s %[tmp2], %[pom2],      %[pom3]    \n\t"   //  tmp2 = tmpz[ n2].im - tmpz[n34].im;
+                "sub.s %[temp], %[temp1],     %[tmp5]    \n\t"
+                "add.s %[temp1],%[temp1],     %[tmp5]    \n\t"
+                "sub.s %[temp4],%[temp3],     %[tmp6]    \n\t"
+                "add.s %[temp3],%[temp3],     %[tmp6]    \n\t"
+                "swc1  %[temp], 0(%[tmpz_n2])            \n\t"   //  tmpz[ n2].re = tmpz[ 0].re - tmp5;
+                "swc1  %[temp1],0(%[tmpz])               \n\t"   //  tmpz[  0].re = tmpz[ 0].re + tmp5;
+                "lwc1  %[pom1], 0(%[tmpz_n4])            \n\t"
+                "swc1  %[temp4],4(%[tmpz_n2])            \n\t"   //  tmpz[ n2].im = tmpz[ 0].im - tmp6;
+                "lwc1  %[temp], 4(%[tmpz_n4])            \n\t"
+                "swc1  %[temp3],4(%[tmpz])               \n\t"   //  tmpz[  0].im = tmpz[ 0].im + tmp6;
+                "sub.s %[pom],  %[pom1],      %[tmp2]    \n\t"
+                "add.s %[pom1], %[pom1],      %[tmp2]    \n\t"
+                "add.s %[temp1],%[temp],      %[tmp1]    \n\t"
+                "sub.s %[temp], %[temp],      %[tmp1]    \n\t"
+                "swc1  %[pom],  0(%[tmpz_n34])           \n\t"   //  tmpz[n34].re = tmpz[n4].re - tmp2;
+                "swc1  %[pom1], 0(%[tmpz_n4])            \n\t"   //  tmpz[ n4].re = tmpz[n4].re + tmp2;
+                "swc1  %[temp1],4(%[tmpz_n34])           \n\t"   //  tmpz[n34].im = tmpz[n4].im + tmp1;
+                "swc1  %[temp], 4(%[tmpz_n4])            \n\t"   //  tmpz[ n4].im = tmpz[n4].im - tmp1;
+                : [tmp5]"=&f"(tmp5),
+                  [tmp1]"=&f"(tmp1), [pom]"=&f"(pom),        [pom1]"=&f"(pom1),        [pom2]"=&f"(pom2),
+                  [tmp2]"=&f"(tmp2), [tmp6]"=&f"(tmp6),          [pom3]"=&f"(pom3),
+                  [temp]"=&f"(temp), [temp1]"=&f"(temp1),     [temp3]"=&f"(temp3),       [temp4]"=&f"(temp4)
+                : [tmpz]"r"(tmpz), [tmpz_n2]"r"(tmpz_n2), [tmpz_n34]"r"(tmpz_n34), [tmpz_n4]"r"(tmpz_n4)
+                : "memory"
+            );
+
+            w_re_ptr = (float*)(ff_cos_65536 + step);
+            w_im_ptr = (float*)(ff_cos_65536 + MAX_FFT_SIZE/4 - step);
+
+            for (i=1; i<n4; i++) {
+                w_re = w_re_ptr[0];
+                w_im = w_im_ptr[0];
+                tmpz_n2_i = tmpz_n2  + i;
+                tmpz_n4_i = tmpz_n4  + i;
+                tmpz_n34_i= tmpz_n34 + i;
+                tmpz_i    = tmpz     + i;
+
+                __asm__ volatile (
+                    "lwc1     %[temp],  0(%[tmpz_n2_i])               \n\t"
+                    "lwc1     %[temp1], 4(%[tmpz_n2_i])               \n\t"
+                    "lwc1     %[pom],   0(%[tmpz_n34_i])              \n\t"
+                    "lwc1     %[pom1],  4(%[tmpz_n34_i])              \n\t"
+                    "mul.s    %[temp3], %[w_im],    %[temp]           \n\t"
+                    "mul.s    %[temp4], %[w_im],    %[temp1]          \n\t"
+                    "mul.s    %[pom2],  %[w_im],    %[pom1]           \n\t"
+                    "mul.s    %[pom3],  %[w_im],    %[pom]            \n\t"
+                    "msub.s   %[tmp2],  %[temp3],   %[w_re], %[temp1] \n\t"  // tmp2 = w_re * tmpz[ n2+i].im - w_im * tmpz[ n2+i].re;
+                    "madd.s   %[tmp1],  %[temp4],   %[w_re], %[temp]  \n\t"  // tmp1 = w_re * tmpz[ n2+i].re + w_im * tmpz[ n2+i].im;
+                    "msub.s   %[tmp3],  %[pom2],    %[w_re], %[pom]   \n\t"  // tmp3 = w_re * tmpz[n34+i].re - w_im * tmpz[n34+i].im;
+                    "madd.s   %[tmp4],  %[pom3],    %[w_re], %[pom1]  \n\t"  // tmp4 = w_re * tmpz[n34+i].im + w_im * tmpz[n34+i].re;
+                    "lwc1     %[temp],  0(%[tmpz_i])                  \n\t"
+                    "lwc1     %[pom],   4(%[tmpz_i])                  \n\t"
+                    "add.s    %[tmp5],  %[tmp1],    %[tmp3]           \n\t"  // tmp5 = tmp1 + tmp3;
+                    "sub.s    %[tmp1],  %[tmp1],    %[tmp3]           \n\t"  // tmp1 = tmp1 - tmp3;
+                    "add.s    %[tmp6],  %[tmp2],    %[tmp4]           \n\t"  // tmp6 = tmp2 + tmp4;
+                    "sub.s    %[tmp2],  %[tmp2],    %[tmp4]           \n\t"  // tmp2 = tmp2 - tmp4;
+                    "sub.s    %[temp1], %[temp],    %[tmp5]           \n\t"
+                    "add.s    %[temp],  %[temp],    %[tmp5]           \n\t"
+                    "sub.s    %[pom1],  %[pom],     %[tmp6]           \n\t"
+                    "add.s    %[pom],   %[pom],     %[tmp6]           \n\t"
+                    "lwc1     %[temp3], 0(%[tmpz_n4_i])               \n\t"
+                    "lwc1     %[pom2],  4(%[tmpz_n4_i])               \n\t"
+                    "swc1     %[temp1], 0(%[tmpz_n2_i])               \n\t"  // tmpz[ n2+i].re = tmpz[   i].re - tmp5;
+                    "swc1     %[temp],  0(%[tmpz_i])                  \n\t"  // tmpz[    i].re = tmpz[   i].re + tmp5;
+                    "swc1     %[pom1],  4(%[tmpz_n2_i])               \n\t"  // tmpz[ n2+i].im = tmpz[   i].im - tmp6;
+                    "swc1     %[pom] ,  4(%[tmpz_i])                  \n\t"  // tmpz[    i].im = tmpz[   i].im + tmp6;
+                    "sub.s    %[temp4], %[temp3],   %[tmp2]           \n\t"
+                    "add.s    %[pom3],  %[pom2],    %[tmp1]           \n\t"
+                    "add.s    %[temp3], %[temp3],   %[tmp2]           \n\t"
+                    "sub.s    %[pom2],  %[pom2],    %[tmp1]           \n\t"
+                    "swc1     %[temp4], 0(%[tmpz_n34_i])              \n\t"  // tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
+                    "swc1     %[pom3],  4(%[tmpz_n34_i])              \n\t"  // tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
+                    "swc1     %[temp3], 0(%[tmpz_n4_i])               \n\t"  // tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
+                    "swc1     %[pom2],  4(%[tmpz_n4_i])               \n\t"  // tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
+                    : [tmp1]"=&f"(tmp1), [tmp2]"=&f" (tmp2), [temp]"=&f"(temp), [tmp3]"=&f"(tmp3),
+                      [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp6]"=&f"(tmp6),
+                      [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+                      [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), [pom3]"=&f"(pom3)
+                    : [w_re]"f"(w_re), [w_im]"f"(w_im),
+                      [tmpz_i]"r"(tmpz_i),[tmpz_n2_i]"r"(tmpz_n2_i),
+                      [tmpz_n34_i]"r"(tmpz_n34_i), [tmpz_n4_i]"r"(tmpz_n4_i)
+                    : "memory"
+                );
+                w_re_ptr += step;
+                w_im_ptr -= step;
+            }
+        }
+        step >>= 1;
+        n4   <<= 1;
+    }
+}
+
+/**
+ * MDCT/IMDCT transforms.
+ */
+
+static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    int k, n8, n4, n2, n, j;
+    const uint16_t *revtab = s->revtab;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    const FFTSample *in1, *in2, *in3, *in4;
+    FFTComplex *z = (FFTComplex *)output;
+
+    int j1;
+    const float *tcos1, *tsin1, *tcos2, *tsin2;
+    float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+        temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+    FFTComplex *z1, *z2;
+
+    n = 1 << s->mdct_bits;
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    /* pre rotation */
+    in1 = input;
+    in2 = input + n2 - 1;
+    in3 = input + 2;
+    in4 = input + n2 - 3;
+
+    tcos1 = tcos;
+    tsin1 = tsin;
+
+    /* n4 = 64 or 128 */
+    for(k = 0; k < n4; k += 2) {
+        j  = revtab[k    ];
+        j1 = revtab[k + 1];
+
+        __asm__ volatile (
+            "lwc1           %[temp1],       0(%[in2])                           \t\n"
+            "lwc1           %[temp2],       0(%[tcos1])                         \t\n"
+            "lwc1           %[temp3],       0(%[tsin1])                         \t\n"
+            "lwc1           %[temp4],       0(%[in1])                           \t\n"
+            "lwc1           %[temp5],       0(%[in4])                           \t\n"
+            "mul.s          %[temp9],       %[temp1],   %[temp2]                \t\n"
+            "mul.s          %[temp10],      %[temp1],   %[temp3]                \t\n"
+            "lwc1           %[temp6],       4(%[tcos1])                         \t\n"
+            "lwc1           %[temp7],       4(%[tsin1])                         \t\n"
+            "nmsub.s        %[temp9],       %[temp9],   %[temp4],   %[temp3]    \t\n"
+            "madd.s         %[temp10],      %[temp10],  %[temp4],   %[temp2]    \t\n"
+            "mul.s          %[temp11],      %[temp5],   %[temp6]                \t\n"
+            "mul.s          %[temp12],      %[temp5],   %[temp7]                \t\n"
+            "lwc1           %[temp8],       0(%[in3])                           \t\n"
+            PTR_ADDIU "     %[tcos1],       %[tcos1],   8                       \t\n"
+            PTR_ADDIU "     %[tsin1],       %[tsin1],   8                       \t\n"
+            PTR_ADDIU "     %[in1],         %[in1],     16                      \t\n"
+            "nmsub.s        %[temp11],      %[temp11],  %[temp8],   %[temp7]    \t\n"
+            "madd.s         %[temp12],      %[temp12],  %[temp8],   %[temp6]    \t\n"
+            PTR_ADDIU "     %[in2],         %[in2],     -16                     \t\n"
+            PTR_ADDIU "     %[in3],         %[in3],     16                      \t\n"
+            PTR_ADDIU "     %[in4],         %[in4],     -16                     \t\n"
+
+            : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+              [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
+              [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+              [tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1),
+              [in1]"+r"(in1), [in2]"+r"(in2),
+              [in3]"+r"(in3), [in4]"+r"(in4)
+            :
+            : "memory"
+        );
+
+        z[j ].re = temp9;
+        z[j ].im = temp10;
+        z[j1].re = temp11;
+        z[j1].im = temp12;
+    }
+
+    s->fft_calc(s, z);
+
+    /* post rotation + reordering */
+    /* n8 = 32 or 64 */
+    for(k = 0; k < n8; k += 2) {
+        tcos1 = &tcos[n8 - k - 2];
+        tsin1 = &tsin[n8 - k - 2];
+        tcos2 = &tcos[n8 + k];
+        tsin2 = &tsin[n8 + k];
+        z1 = &z[n8 - k - 2];
+        z2 = &z[n8 + k    ];
+
+        __asm__ volatile (
+            "lwc1       %[temp1],   12(%[z1])                           \t\n"
+            "lwc1       %[temp2],   4(%[tsin1])                         \t\n"
+            "lwc1       %[temp3],   4(%[tcos1])                         \t\n"
+            "lwc1       %[temp4],   8(%[z1])                            \t\n"
+            "lwc1       %[temp5],   4(%[z1])                            \t\n"
+            "mul.s      %[temp9],   %[temp1],   %[temp2]                \t\n"
+            "mul.s      %[temp10],  %[temp1],   %[temp3]                \t\n"
+            "lwc1       %[temp6],   0(%[tsin1])                         \t\n"
+            "lwc1       %[temp7],   0(%[tcos1])                         \t\n"
+            "nmsub.s    %[temp9],   %[temp9],   %[temp4],   %[temp3]    \t\n"
+            "madd.s     %[temp10],  %[temp10],  %[temp4],   %[temp2]    \t\n"
+            "mul.s      %[temp11],  %[temp5],   %[temp6]                \t\n"
+            "mul.s      %[temp12],  %[temp5],   %[temp7]                \t\n"
+            "lwc1       %[temp8],   0(%[z1])                            \t\n"
+            "lwc1       %[temp1],   4(%[z2])                            \t\n"
+            "lwc1       %[temp2],   0(%[tsin2])                         \t\n"
+            "lwc1       %[temp3],   0(%[tcos2])                         \t\n"
+            "nmsub.s    %[temp11],  %[temp11],  %[temp8],   %[temp7]    \t\n"
+            "madd.s     %[temp12],  %[temp12],  %[temp8],   %[temp6]    \t\n"
+            "mul.s      %[temp13],  %[temp1],   %[temp2]                \t\n"
+            "mul.s      %[temp14],  %[temp1],   %[temp3]                \t\n"
+            "lwc1       %[temp4],   0(%[z2])                            \t\n"
+            "lwc1       %[temp5],   12(%[z2])                           \t\n"
+            "lwc1       %[temp6],   4(%[tsin2])                         \t\n"
+            "lwc1       %[temp7],   4(%[tcos2])                         \t\n"
+            "nmsub.s    %[temp13],  %[temp13],  %[temp4],   %[temp3]    \t\n"
+            "madd.s     %[temp14],  %[temp14],  %[temp4],   %[temp2]    \t\n"
+            "mul.s      %[temp15],  %[temp5],   %[temp6]                \t\n"
+            "mul.s      %[temp16],  %[temp5],   %[temp7]                \t\n"
+            "lwc1       %[temp8],   8(%[z2])                            \t\n"
+            "nmsub.s    %[temp15],  %[temp15],  %[temp8],   %[temp7]    \t\n"
+            "madd.s     %[temp16],  %[temp16],  %[temp8],   %[temp6]    \t\n"
+            : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+              [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
+              [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+              [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
+              [temp15]"=&f"(temp15), [temp16]"=&f"(temp16)
+            : [z1]"r"(z1), [z2]"r"(z2),
+              [tsin1]"r"(tsin1), [tcos1]"r"(tcos1),
+              [tsin2]"r"(tsin2), [tcos2]"r"(tcos2)
+            : "memory"
+        );
+
+        z1[1].re = temp9;
+        z1[1].im = temp14;
+        z2[0].re = temp13;
+        z2[0].im = temp10;
+
+        z1[0].re = temp11;
+        z1[0].im = temp16;
+        z2[1].re = temp15;
+        z2[1].im = temp12;
+    }
+}
+
+/**
+ * Compute inverse MDCT of size N = 2^nbits
+ * @param output N samples
+ * @param input N/2 samples
+ */
+static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    int k;
+    int n = 1 << s->mdct_bits;
+    int n2 = n >> 1;
+    int n4 = n >> 2;
+
+    ff_imdct_half_mips(s, output+n4, input);
+
+    for(k = 0; k < n4; k+=4) {
+        output[k] = -output[n2-k-1];
+        output[k+1] = -output[n2-k-2];
+        output[k+2] = -output[n2-k-3];
+        output[k+3] = -output[n2-k-4];
+
+        output[n-k-1] = output[n2+k];
+        output[n-k-2] = output[n2+k+1];
+        output[n-k-3] = output[n2+k+2];
+        output[n-k-4] = output[n2+k+3];
+    }
+}
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_fft_init_mips(FFTContext *s)
+{
+    int n=0;
+
+    ff_fft_lut_init(ff_fft_offsets_lut, 0, 1 << 16, &n);
+    ff_init_ff_cos_tabs(16);
+
+#if HAVE_INLINE_ASM
+    s->fft_calc     = ff_fft_calc_mips;
+#if CONFIG_MDCT
+    s->imdct_calc   = ff_imdct_calc_mips;
+    s->imdct_half   = ff_imdct_half_mips;
+#endif
+#endif
+}
diff --git a/libavcodec/mips/fmtconvert_mips.c b/libavcodec/mips/fmtconvert_mips.c
new file mode 100644
index 0000000000..990958402c
--- /dev/null
+++ b/libavcodec/mips/fmtconvert_mips.c
@@ -0,0 +1,141 @@
+/*
+ * Format Conversion Utils for MIPS
+ *
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of is
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Zoran Lukic (zoranl@mips.com)
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fmtconvert.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void int32_to_float_fmul_scalar_mips(float *dst, const int *src,
+        float mul, int len)
+{
+    /*
+     * variables used in inline assembler
+     */
+    float temp1, temp3, temp5, temp7, temp9, temp11, temp13, temp15;
+
+    int rpom1, rpom2, rpom11, rpom21, rpom12, rpom22, rpom13, rpom23;
+    const int *src_end = src + len;
+    /*
+     * loop is 8 times unrolled in assembler in order to achieve better performance
+     */
+    __asm__ volatile (
+        "i32tf_lp%=:                                    \n\t"
+        "lw       %[rpom11],     0(%[src])              \n\t"
+        "lw       %[rpom21],     4(%[src])              \n\t"
+        "lw       %[rpom1],      8(%[src])              \n\t"
+        "lw       %[rpom2],      12(%[src])             \n\t"
+        "mtc1     %[rpom11],     %[temp1]               \n\t"
+        "mtc1     %[rpom21],     %[temp3]               \n\t"
+        "mtc1     %[rpom1],      %[temp5]               \n\t"
+        "mtc1     %[rpom2],      %[temp7]               \n\t"
+
+        "lw       %[rpom13],     16(%[src])             \n\t"
+        "lw       %[rpom23],     20(%[src])             \n\t"
+        "lw       %[rpom12],     24(%[src])             \n\t"
+        "lw       %[rpom22],     28(%[src])             \n\t"
+        "mtc1     %[rpom13],     %[temp9]               \n\t"
+        "mtc1     %[rpom23],     %[temp11]              \n\t"
+        "mtc1     %[rpom12],     %[temp13]              \n\t"
+        "mtc1     %[rpom22],     %[temp15]              \n\t"
+
+        PTR_ADDIU "%[src],       32                     \n\t"
+        "cvt.s.w  %[temp1],      %[temp1]               \n\t"
+        "cvt.s.w  %[temp3],      %[temp3]               \n\t"
+        "cvt.s.w  %[temp5],      %[temp5]               \n\t"
+        "cvt.s.w  %[temp7],      %[temp7]               \n\t"
+
+        "cvt.s.w  %[temp9],      %[temp9]               \n\t"
+        "cvt.s.w  %[temp11],     %[temp11]              \n\t"
+        "cvt.s.w  %[temp13],     %[temp13]              \n\t"
+        "cvt.s.w  %[temp15],     %[temp15]              \n\t"
+
+        "mul.s   %[temp1],       %[temp1],    %[mul]    \n\t"
+        "mul.s   %[temp3],       %[temp3],    %[mul]    \n\t"
+        "mul.s   %[temp5],       %[temp5],    %[mul]    \n\t"
+        "mul.s   %[temp7],       %[temp7],    %[mul]    \n\t"
+
+        "mul.s   %[temp9],       %[temp9],    %[mul]    \n\t"
+        "mul.s   %[temp11],      %[temp11],   %[mul]    \n\t"
+        "mul.s   %[temp13],      %[temp13],   %[mul]    \n\t"
+        "mul.s   %[temp15],      %[temp15],   %[mul]    \n\t"
+
+        "swc1    %[temp1],       0(%[dst])              \n\t" /*dst[i] = src[i] * mul;    */
+        "swc1    %[temp3],       4(%[dst])              \n\t" /*dst[i+1] = src[i+1] * mul;*/
+        "swc1    %[temp5],       8(%[dst])              \n\t" /*dst[i+2] = src[i+2] * mul;*/
+        "swc1    %[temp7],       12(%[dst])             \n\t" /*dst[i+3] = src[i+3] * mul;*/
+
+        "swc1    %[temp9],       16(%[dst])             \n\t" /*dst[i+4] = src[i+4] * mul;*/
+        "swc1    %[temp11],      20(%[dst])             \n\t" /*dst[i+5] = src[i+5] * mul;*/
+        "swc1    %[temp13],      24(%[dst])             \n\t" /*dst[i+6] = src[i+6] * mul;*/
+        "swc1    %[temp15],      28(%[dst])             \n\t" /*dst[i+7] = src[i+7] * mul;*/
+        PTR_ADDIU "%[dst],       32                     \n\t"
+        "bne     %[src],        %[src_end], i32tf_lp%=  \n\t"
+        : [temp1]"=&f"(temp1),   [temp11]"=&f"(temp11),
+          [temp13]"=&f"(temp13), [temp15]"=&f"(temp15),
+          [temp3]"=&f"(temp3),   [temp5]"=&f"(temp5),
+          [temp7]"=&f"(temp7),   [temp9]"=&f"(temp9),
+          [rpom1]"=&r"(rpom1),   [rpom2]"=&r"(rpom2),
+          [rpom11]"=&r"(rpom11), [rpom21]"=&r"(rpom21),
+          [rpom12]"=&r"(rpom12), [rpom22]"=&r"(rpom22),
+          [rpom13]"=&r"(rpom13), [rpom23]"=&r"(rpom23),
+          [dst]"+r"(dst),       [src]"+r"(src)
+        : [mul]"f"(mul),        [src_end]"r"(src_end)
+        : "memory"
+    );
+}
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_fmt_convert_init_mips(FmtConvertContext *c)
+{
+#if HAVE_INLINE_ASM
+    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_mips;
+#endif
+}
diff --git a/libavcodec/mips/h263dsp_init_mips.c b/libavcodec/mips/h263dsp_init_mips.c
new file mode 100644
index 0000000000..09bd93707d
--- /dev/null
+++ b/libavcodec/mips/h263dsp_init_mips.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h263dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void h263dsp_init_msa(H263DSPContext *c)
+{
+    c->h263_h_loop_filter = ff_h263_h_loop_filter_msa;
+    c->h263_v_loop_filter = ff_h263_v_loop_filter_msa;
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_h263dsp_init_mips(H263DSPContext *c)
+{
+#if HAVE_MSA
+    h263dsp_init_msa(c);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/h263dsp_mips.h b/libavcodec/mips/h263dsp_mips.h
new file mode 100644
index 0000000000..99a43cd44a
--- /dev/null
+++ b/libavcodec/mips/h263dsp_mips.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_H263DSP_MIPS_H
+#define AVCODEC_MIPS_H263DSP_MIPS_H
+
+#include "libavcodec/mpegvideo.h"
+
+void ff_h263_h_loop_filter_msa(uint8_t *src, int stride, int q_scale);
+void ff_h263_v_loop_filter_msa(uint8_t *src, int stride, int q_scale);
+void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s, int16_t *block,
+                                       int32_t index, int32_t q_scale);
+void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s, int16_t *block,
+                                      int32_t index, int32_t q_scale);
+void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s, int16_t *block,
+                                      int32_t index, int32_t q_scale);
+int ff_pix_sum_msa(uint8_t *pix, int line_size);
+
+#endif  // #ifndef AVCODEC_MIPS_H263DSP_MIPS_H
diff --git a/libavcodec/mips/h263dsp_msa.c b/libavcodec/mips/h263dsp_msa.c
new file mode 100644
index 0000000000..472bcbd70a
--- /dev/null
+++ b/libavcodec/mips/h263dsp_msa.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h263dsp_mips.h"
+
+static const uint8_t h263_loop_filter_strength_msa[32] = {
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 7,
+    7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12
+};
+
+static void h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
+{
+    int32_t strength = h263_loop_filter_strength_msa[qscale];
+    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 temp0, temp1, temp2;
+    v8i16 diff0, diff2, diff4, diff6, diff8;
+    v8i16 d0, a_d0, str_x2, str;
+
+    src -= 2;
+    LD_UB8(src, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in3, in2, in1);
+
+    temp0 = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in1);
+    a_d0 = __msa_hsub_u_h((v16u8) temp0, (v16u8) temp0);
+    temp2 = (v8i16) __msa_ilvr_b((v16i8) in2, (v16i8) in3);
+    temp2 = __msa_hsub_u_h((v16u8) temp2, (v16u8) temp2);
+    temp2 <<= 2;
+    diff0 = a_d0 + temp2;
+    diff2 = -(-diff0 >> 3);
+    str_x2 = __msa_fill_h(-(strength << 1));
+    temp0 = (str_x2 <= diff2);
+    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) temp0, (v16u8) temp0);
+    temp2 = str_x2 - diff2;
+    str = __msa_fill_h(-strength);
+    temp0 = (diff2 < str);
+    diff2 = (v8i16) __msa_bmnz_v((v16u8) diff2, (v16u8) temp2, (v16u8) temp0);
+    diff4 = diff0 >> 3;
+    str_x2 = __msa_fill_h(strength << 1);
+    temp0 = (diff4 <= str_x2);
+    diff4 = (v8i16) __msa_bmz_v((v16u8) diff4, (v16u8) temp0, (v16u8) temp0);
+    temp2 = str_x2 - diff4;
+    str = __msa_fill_h(strength);
+    temp0 = (str < diff4);
+    diff4 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) temp2, (v16u8) temp0);
+    temp0 = __msa_clti_s_h(diff0, 0);
+    d0 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0);
+    diff2 = -diff2 >> 1;
+    diff4 >>= 1;
+    diff8 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0);
+    diff6 = (-a_d0) >> 2;
+    diff6 = -(diff6);
+    temp2 = -diff8;
+    temp0 = (diff6 < temp2);
+    diff6 = (v8i16) __msa_bmnz_v((v16u8) diff6, (v16u8) temp2, (v16u8) temp0);
+    diff2 = a_d0 >> 2;
+    temp0 = (diff2 <= diff8);
+    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) diff8, (v16u8) temp0);
+    temp0 = __msa_clti_s_h(a_d0, 0);
+    diff6 = (v8i16) __msa_bmz_v((v16u8) diff6, (v16u8) diff2, (v16u8) temp0);
+    PCKEV_B2_SH(a_d0, diff6, a_d0, d0, diff6, d0);
+    in0 = (v16u8) ((v16i8) in0 - (v16i8) diff6);
+    in1 = (v16u8) ((v16i8) in1 + (v16i8) diff6);
+    in3 = __msa_xori_b(in3, 128);
+    in3 = (v16u8) __msa_adds_s_b((v16i8) in3, (v16i8) d0);
+    in3 = __msa_xori_b(in3, 128);
+    in2 = __msa_subsus_u_b(in2, (v16i8) d0);
+    ILVR_B2_SH(in3, in0, in1, in2, temp0, temp1);
+    in0 = (v16u8) __msa_ilvr_h(temp1, temp0);
+    in3 = (v16u8) __msa_ilvl_h(temp1, temp0);
+    ST4x4_UB(in0, in0, 0, 1, 2, 3, src, stride);
+    src += 4 * stride;
+    ST4x4_UB(in3, in3, 0, 1, 2, 3, src, stride);
+    src += 4 * stride;
+}
+
+static void h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
+{
+    int32_t strength = h263_loop_filter_strength_msa[qscale];
+    uint64_t res0, res1, res2, res3;
+    v16u8 in0, in1, in2, in3;
+    v8i16 temp0, temp2, diff0, diff2, diff4, diff6, diff8;
+    v8i16 d0, a_d0, str_x2, str;
+
+    src -= 2 * stride;
+    LD_UB4(src, stride, in0, in3, in2, in1);
+    temp0 = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in1);
+    a_d0 = __msa_hsub_u_h((v16u8) temp0, (v16u8) temp0);
+    temp2 = (v8i16) __msa_ilvr_b((v16i8) in2, (v16i8) in3);
+    temp2 = __msa_hsub_u_h((v16u8) temp2, (v16u8) temp2);
+    temp2 <<= 2;
+    diff0 = a_d0 + temp2;
+    diff2 = -(-diff0 >> 3);
+    str_x2 = __msa_fill_h(-(strength << 1));
+    temp0 = (str_x2 <= diff2);
+    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) temp0, (v16u8) temp0);
+    temp2 = str_x2 - diff2;
+    str = __msa_fill_h(-strength);
+    temp0 = (diff2 < str);
+    diff2 = (v8i16) __msa_bmnz_v((v16u8) diff2, (v16u8) temp2, (v16u8) temp0);
+    diff4 = diff0 >> 3;
+    str_x2 = __msa_fill_h(strength << 1);
+    temp0 = (diff4 <= str_x2);
+    diff4 = (v8i16) __msa_bmz_v((v16u8) diff4, (v16u8) temp0, (v16u8) temp0);
+    temp2 = str_x2 - diff4;
+    str = __msa_fill_h(strength);
+    temp0 = (str < diff4);
+    diff4 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) temp2, (v16u8) temp0);
+    temp0 = __msa_clti_s_h(diff0, 0);
+    d0 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0);
+    diff2 = -diff2 >> 1;
+    diff4 >>= 1;
+    diff8 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0);
+    diff6 = (-a_d0) >> 2;
+    diff6 = -(diff6);
+    temp2 = -diff8;
+    temp0 = (diff6 < temp2);
+    diff6 = (v8i16) __msa_bmnz_v((v16u8) diff6, (v16u8) temp2, (v16u8) temp0);
+    diff2 = a_d0 >> 2;
+    temp0 = (diff2 <= diff8);
+    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) diff8, (v16u8) temp0);
+    temp0 = __msa_clti_s_h(a_d0, 0);
+    diff6 = (v8i16) __msa_bmz_v((v16u8) diff6, (v16u8) diff2, (v16u8) temp0);
+    PCKEV_B2_SH(a_d0, diff6, a_d0, d0, diff6, d0);
+    in0 = (v16u8) ((v16i8) in0 - (v16i8) diff6);
+    in1 = (v16u8) ((v16i8) in1 + (v16i8) diff6);
+    in3 = __msa_xori_b(in3, 128);
+    in3 = (v16u8) __msa_adds_s_b((v16i8) in3, (v16i8) d0);
+    in3 = __msa_xori_b(in3, 128);
+    in2 = __msa_subsus_u_b(in2, (v16i8) d0);
+    res0 = __msa_copy_u_d((v2i64) in0, 0);
+    res1 = __msa_copy_u_d((v2i64) in3, 0);
+    res2 = __msa_copy_u_d((v2i64) in2, 0);
+    res3 = __msa_copy_u_d((v2i64) in1, 0);
+    SD4(res0, res1, res2, res3, src, stride);
+}
+
+void ff_h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t q_scale)
+{
+    h263_h_loop_filter_msa(src, stride, q_scale);
+}
+
+void ff_h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t q_scale)
+{
+    h263_v_loop_filter_msa(src, stride, q_scale);
+}
diff --git a/libavcodec/mips/h264chroma_init_mips.c b/libavcodec/mips/h264chroma_init_mips.c
new file mode 100644
index 0000000000..122148dc78
--- /dev/null
+++ b/libavcodec/mips/h264chroma_init_mips.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264chroma_mips.h"
+
+#if HAVE_MSA
+static av_cold void h264chroma_init_msa(H264ChromaContext *c, int bit_depth)
+{
+    const int high_bit_depth = bit_depth > 8;
+
+    if (!high_bit_depth) {
+        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_msa;
+        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_msa;
+        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_msa;
+
+        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_msa;
+        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_msa;
+        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_msa;
+    }
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void h264chroma_init_mmi(H264ChromaContext *c, int bit_depth)
+{
+    int high_bit_depth = bit_depth > 8;
+
+    if (!high_bit_depth) {
+        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmi;
+        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmi;
+        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmi;
+        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmi;
+    }
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth)
+{
+#if HAVE_MSA
+    h264chroma_init_msa(c, bit_depth);
+#endif  // #if HAVE_MSA
+#if HAVE_MMI
+    h264chroma_init_mmi(c, bit_depth);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/h264chroma_mips.h b/libavcodec/mips/h264chroma_mips.h
new file mode 100644
index 0000000000..7a373b8395
--- /dev/null
+++ b/libavcodec/mips/h264chroma_mips.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef H264_CHROMA_MIPS_H
+#define H264_CHROMA_MIPS_H
+
+#include "libavcodec/h264.h"
+void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
+                                int height, int x, int y);
+void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
+                                int height, int x, int y);
+void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
+                                int height, int x, int y);
+void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
+                                int height, int x, int y);
+void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
+                                int height, int x, int y);
+void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
+                                int height, int x, int y);
+
+void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y);
+void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y);
+void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y);
+void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y);
+
+#endif /* H264_CHROMA_MIPS_H */
diff --git a/libavcodec/mips/h264chroma_mmi.c b/libavcodec/mips/h264chroma_mmi.c
new file mode 100644
index 0000000000..ef294762b3
--- /dev/null
+++ b/libavcodec/mips/h264chroma_mmi.c
@@ -0,0 +1,582 @@
+/*
+ * Loongson SIMD optimized h264chroma
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264chroma_mips.h"
+
+void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y)
+{
+    const int A = (8 - x) * (8 - y);
+    const int B = x * (8 - y);
+    const int C = (8 - x) * y;
+    const int D = x * y;
+    const int E = B + C;
+    int i;
+
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);
+
+    if (D) {
+        for (i=0; i<h; i++) {
+            __asm__ volatile (
+                "ldl $2, %2                 \r\n"
+                "ldr $2, %1                 \r\n"
+                "ldl $3, %4                 \r\n"
+                "ldr $3, %3                 \r\n"
+                "ldl $4, %6                 \r\n"
+                "ldr $4, %5                 \r\n"
+                "ldl $5, %8                 \r\n"
+                "ldr $5, %7                 \r\n"
+                "daddiu $6, $0, 32          \r\n"
+                "mtc1 %9, $f6               \r\n"
+                "mtc1 %10, $f8              \r\n"
+                "mtc1 %11, $f10             \r\n"
+                "mtc1 %12, $f12             \r\n"
+                "mtc1 $0, $f20              \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "mtc1 $3, $f4               \r\n"
+                "mtc1 $4, $f16              \r\n"
+                "mtc1 $5, $f18              \r\n"
+                "mtc1 $6, $f14              \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "punpcklbh $f4, $f4, $f20   \r\n"
+                "pshufh $f6, $f6, $f20      \r\n"
+                "pshufh $f8, $f8, $f20      \r\n"
+                "pshufh $f10, $f10, $f20    \r\n"
+                "pshufh $f12, $f12, $f20    \r\n"
+                "pshufh $f14, $f14, $f20    \r\n"
+                "punpcklbh $f16, $f16, $f20 \r\n"
+                "punpcklbh $f18, $f18, $f20 \r\n"
+                "daddiu $6, $0, 6           \r\n"
+                "mtc1 $6, $f22              \r\n"
+                "dsrl32 $2, $2, 0           \r\n"
+                "dsrl32 $3, $3, 0           \r\n"
+                "dsrl32 $4, $4, 0           \r\n"
+                "dsrl32 $5, $5, 0           \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "pmullh $f4, $f4, $f8       \r\n"
+                "pmullh $f16, $f10, $f16    \r\n"
+                "pmullh $f18, $f12, $f18    \r\n"
+                "paddh $f2, $f2, $f14       \r\n"
+                "paddh $f4, $f4, $f16       \r\n"
+                "paddh $f2, $f2, $f18       \r\n"
+                "paddh $f2, $f2, $f4        \r\n"
+                "psrah $f24, $f2, $f22      \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "mtc1 $3, $f4               \r\n"
+                "mtc1 $4, $f16              \r\n"
+                "mtc1 $5, $f18              \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "punpcklbh $f4, $f4, $f20   \r\n"
+                "punpcklbh $f16, $f16, $f20 \r\n"
+                "punpcklbh $f18, $f18, $f20 \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "pmullh $f4, $f4, $f8       \r\n"
+                "pmullh $f16, $f10, $f16    \r\n"
+                "pmullh $f18, $f12, $f18    \r\n"
+                "paddh $f2, $f2, $f14       \r\n"
+                "paddh $f4, $f4, $f16       \r\n"
+                "paddh $f2, $f2, $f18       \r\n"
+                "paddh $f2, $f2, $f4        \r\n"
+                "psrah $f2, $f2, $f22       \r\n"
+                "packushb $f2, $f24, $f2    \r\n"
+                "sdc1 $f2, %0               \r\n"
+                : "=m"(*dst)
+                : "m"(*src),"m"(*(src+7)),"m"(*(src+1)),"m"(*(src+8)),
+                  "m"(*(src+stride)),"m"(*(src+stride+7)),
+                  "m"(*(src+stride+1)),"m"(*(src+stride+8)),
+                  "r"(A),"r"(B),"r"(C),"r"(D)
+                : "$2","$3","$4","$5","$6"
+            );
+
+            dst += stride;
+            src += stride;
+        }
+    } else if (E) {
+        const int step = C ? stride : 1;
+
+        for (i=0; i<h; i++) {
+            __asm__ volatile (
+                "daddiu $6, $0, 32          \r\n"
+                "ldl $2, %2                 \r\n"
+                "ldr $2, %1                 \r\n"
+                "ldl $3, %4                 \r\n"
+                "ldr $3, %3                 \r\n"
+                "mtc1 $6, $f14              \r\n"
+                "mtc1 %5, $f6               \r\n"
+                "mtc1 %6, $f8               \r\n"
+                "mtc1 $0, $f20              \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "mtc1 $3, $f4               \r\n"
+                "daddiu $6, $0, 6           \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "punpcklbh $f4, $f4, $f20   \r\n"
+                "pshufh $f6, $f6, $f20      \r\n"
+                "pshufh $f8, $f8, $f20      \r\n"
+                "pshufh $f14, $f14, $f20    \r\n"
+                "mtc1 $6, $f22              \r\n"
+                "dsrl32 $2, $2, 0           \r\n"
+                "dsrl32 $3, $3, 0           \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "pmullh $f4, $f4, $f8       \r\n"
+                "paddh $f2, $f2, $f14       \r\n"
+                "paddh $f2, $f2, $f4        \r\n"
+                "psrah $f24, $f2, $f22      \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "mtc1 $3, $f4               \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "punpcklbh $f4, $f4, $f20   \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "pmullh $f4, $f4, $f8       \r\n"
+                "paddh $f2, $f2, $f14       \r\n"
+                "paddh $f2, $f2, $f4        \r\n"
+                "psrah $f2, $f2, $f22       \r\n"
+                "packushb $f2, $f24, $f2    \r\n"
+                "sdc1 $f2, %0               \r\n"
+                : "=m"(*dst)
+                : "m"(*(src)),"m"(*(src+7)),
+                  "m"(*(src+step)),"m"(*(src+step+7)),
+                  "r"(A),"r"(E)
+                : "$2","$3","$4","$5","$6"
+            );
+
+            dst += stride;
+            src += stride;
+        }
+    } else {
+        for (i = 0; i < h; i++) {
+            __asm__ volatile (
+                "daddiu $6, $0, 32          \r\n"
+                "ldl $2, %2                 \r\n"
+                "ldr $2, %1                 \r\n"
+                "mtc1 $6, $f14              \r\n"
+                "mtc1 %3, $f6               \r\n"
+                "mtc1 $0, $f20              \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "daddiu $6, $0, 6           \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "pshufh $f6, $f6, $f20      \r\n"
+                "pshufh $f14, $f14, $f20    \r\n"
+                "mtc1 $6, $f22              \r\n"
+                "dsrl32 $2, $2, 0           \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "paddh $f2, $f2, $f14       \r\n"
+                "psrah $f24, $f2, $f22      \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "paddh $f2, $f2, $f14       \r\n"
+                "psrah $f2, $f2, $f22       \r\n"
+                "packushb $f2, $f24, $f2    \r\n"
+                "sdc1 $f2, %0               \r\n"
+                :"=m"(*dst)
+                :"m"(*src),"m"(*(src+7)),"r"(A)
+                :"$2"
+            );
+
+            dst += stride;
+            src += stride;
+        }
+    }
+}
+
+void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y)
+{
+    const int A = (8 - x) * (8 - y);
+    const int B = x * (8 - y);
+    const int C = (8 - x) * y;
+    const int D = x * y;
+    const int E = B + C;
+    int i;
+
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);
+
+    if (D) {
+        for (i=0; i<h; i++) {
+            __asm__ volatile (
+                "ldl $2, %2                 \r\n"
+                "ldr $2, %1                 \r\n"
+                "ldl $3, %4                 \r\n"
+                "ldr $3, %3                 \r\n"
+                "ldl $4, %6                 \r\n"
+                "ldr $4, %5                 \r\n"
+                "ldl $5, %8                 \r\n"
+                "ldr $5, %7                 \r\n"
+                "daddiu $6, $0, 32          \r\n"
+                "mtc1 %9, $f6               \r\n"
+                "mtc1 %10, $f8              \r\n"
+                "mtc1 %11, $f10             \r\n"
+                "mtc1 %12, $f12             \r\n"
+                "mtc1 $0, $f20              \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "mtc1 $3, $f4               \r\n"
+                "mtc1 $4, $f16              \r\n"
+                "mtc1 $5, $f18              \r\n"
+                "mtc1 $6, $f14              \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "punpcklbh $f4, $f4, $f20   \r\n"
+                "pshufh $f6, $f6, $f20      \r\n"
+                "pshufh $f8, $f8, $f20      \r\n"
+                "pshufh $f10, $f10, $f20    \r\n"
+                "pshufh $f12, $f12, $f20    \r\n"
+                "pshufh $f14, $f14, $f20    \r\n"
+                "punpcklbh $f16, $f16, $f20 \r\n"
+                "punpcklbh $f18, $f18, $f20 \r\n"
+                "daddiu $6, $0, 6           \r\n"
+                "mtc1 $6, $f22              \r\n"
+                "dsrl32 $2, $2, 0           \r\n"
+                "dsrl32 $3, $3, 0           \r\n"
+                "dsrl32 $4, $4, 0           \r\n"
+                "dsrl32 $5, $5, 0           \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "pmullh $f4, $f4, $f8       \r\n"
+                "pmullh $f16, $f10, $f16    \r\n"
+                "pmullh $f18, $f12, $f18    \r\n"
+                "paddh $f2, $f2, $f14       \r\n"
+                "paddh $f4, $f4, $f16       \r\n"
+                "paddh $f2, $f2, $f18       \r\n"
+                "paddh $f2, $f2, $f4        \r\n"
+                "psrah $f24, $f2, $f22      \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "mtc1 $3, $f4               \r\n"
+                "mtc1 $4, $f16              \r\n"
+                "mtc1 $5, $f18              \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "punpcklbh $f4, $f4, $f20   \r\n"
+                "punpcklbh $f16, $f16, $f20 \r\n"
+                "punpcklbh $f18, $f18, $f20 \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "pmullh $f4, $f4, $f8       \r\n"
+                "pmullh $f16, $f10, $f16    \r\n"
+                "pmullh $f18, $f12, $f18    \r\n"
+                "paddh $f2, $f2, $f14       \r\n"
+                "paddh $f4, $f4, $f16       \r\n"
+                "paddh $f2, $f2, $f18       \r\n"
+                "paddh $f2, $f2, $f4        \r\n"
+                "psrah $f2, $f2, $f22       \r\n"
+                "packushb $f2, $f24, $f2    \r\n"
+                "ldc1 $f4, %0               \r\n"
+                "pavgb $f2, $f2, $f4        \r\n"
+                "sdc1 $f2, %0               \r\n"
+                : "=m"(*dst)
+                : "m"(*(src)),"m"(*(src+7)),"m"(*(src+1)),"m"(*(src+8)),
+                  "m"(*(src+stride)),"m"(*(src+stride+7)),
+                  "m"(*(src+stride+1)),"m"(*(src+stride+8)),
+                  "r"(A),"r"(B),"r"(C),"r"(D)
+                : "$2","$3","$4","$5","$6"
+            );
+
+            dst += stride;
+            src += stride;
+        }
+    } else {
+        const int step = C ? stride : 1;
+
+        for (i=0; i<h; i++) {
+            __asm__ volatile (
+                "daddiu $6, $0, 32          \r\n"
+                "ldl $2, %2                 \r\n"
+                "ldr $2, %1                 \r\n"
+                "ldl $3, %4                 \r\n"
+                "ldr $3, %3                 \r\n"
+                "mtc1 $6, $f14              \r\n"
+                "mtc1 %5, $f6               \r\n"
+                "mtc1 %6, $f8               \r\n"
+                "mtc1 $0, $f20              \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "mtc1 $3, $f4               \r\n"
+                "daddiu $6, $0, 6           \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "punpcklbh $f4, $f4, $f20   \r\n"
+                "pshufh $f6, $f6, $f20      \r\n"
+                "pshufh $f8, $f8, $f20      \r\n"
+                "pshufh $f14, $f14, $f20    \r\n"
+                "mtc1 $6, $f22              \r\n"
+                "dsrl32 $2, $2, 0           \r\n"
+                "dsrl32 $3, $3, 0           \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "pmullh $f4, $f4, $f8       \r\n"
+                "paddh $f2, $f2, $f14       \r\n"
+                "paddh $f2, $f2, $f4        \r\n"
+                "psrah $f24, $f2, $f22      \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "mtc1 $3, $f4               \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "punpcklbh $f4, $f4, $f20   \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "pmullh $f4, $f4, $f8       \r\n"
+                "paddh $f2, $f2, $f14       \r\n"
+                "paddh $f2, $f2, $f4        \r\n"
+                "psrah $f2, $f2, $f22       \r\n"
+                "packushb $f2, $f24, $f2    \r\n"
+                "ldc1 $f4, %0               \r\n"
+                "pavgb $f2, $f2, $f4        \r\n"
+                "sdc1 $f2, %0               \r\n"
+                : "=m"(*dst)
+                : "m"(*(src)),"m"(*(src+7)),
+                  "m"(*(src+step)),"m"(*(src+step+7)),"r"(A),"r"(E)
+                : "$2","$3","$4","$5","$6"
+            );
+
+            dst += stride;
+            src += stride;
+        }
+    }
+}
+
+void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y)
+{
+    const int A = (8 - x) * (8 - y);
+    const int B = x * (8 - y);
+    const int C = (8 - x) *  y;
+    const int D = x *  y;
+    const int E = B + C;
+    int i;
+
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);
+
+    if (D) {
+        for (i=0; i<h; i++) {
+            __asm__ volatile (
+                "ldl $2, %2                 \r\n"
+                "ldr $2, %1                 \r\n"
+                "ldl $3, %4                 \r\n"
+                "ldr $3, %3                 \r\n"
+                "ldl $4, %6                 \r\n"
+                "ldr $4, %5                 \r\n"
+                "ldl $5, %8                 \r\n"
+                "ldr $5, %7                 \r\n"
+                "daddiu $6, $0, 32          \r\n"
+                "mtc1 %9, $f6               \r\n"
+                "mtc1 %10, $f8              \r\n"
+                "mtc1 %11, $f10             \r\n"
+                "mtc1 %12, $f12             \r\n"
+                "mtc1 $0, $f20              \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "mtc1 $3, $f4               \r\n"
+                "mtc1 $4, $f16              \r\n"
+                "mtc1 $5, $f18              \r\n"
+                "mtc1 $6, $f14              \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "punpcklbh $f4, $f4, $f20   \r\n"
+                "pshufh $f6, $f6, $f20      \r\n"
+                "pshufh $f8, $f8, $f20      \r\n"
+                "pshufh $f10, $f10, $f20    \r\n"
+                "pshufh $f12, $f12, $f20    \r\n"
+                "pshufh $f14, $f14, $f20    \r\n"
+                "punpcklbh $f16, $f16, $f20 \r\n"
+                "punpcklbh $f18, $f18, $f20 \r\n"
+                "daddiu $6, $0, 6           \r\n"
+                "mtc1 $6, $f22              \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "pmullh $f4, $f4, $f8       \r\n"
+                "pmullh $f16, $f10, $f16    \r\n"
+                "pmullh $f18, $f12, $f18    \r\n"
+                "paddh $f2, $f2, $f14       \r\n"
+                "paddh $f4, $f4, $f16       \r\n"
+                "paddh $f2, $f2, $f18       \r\n"
+                "paddh $f2, $f2, $f4        \r\n"
+                "psrah $f2, $f2, $f22       \r\n"
+                "packushb $f2, $f2, $f2     \r\n"
+                "swc1 $f2, %0               \r\n"
+                : "=m"(*dst)
+                : "m"(*(src)),"m"(*(src+7)),"m"(*(src+1)),"m"(*(src+8)),
+                  "m"(*(src+stride)),"m"(*(src+stride+7)),
+                  "m"(*(src+stride+1)),"m"(*(src+stride+8)),
+                  "r"(A),"r"(B),"r"(C),"r"(D)
+                : "$2","$3","$4","$5","$6"
+            );
+
+            dst += stride;
+            src += stride;
+        }
+    } else if (E) {
+        const int step = C ? stride : 1;
+
+        for (i=0; i<h; i++) {
+            __asm__ volatile (
+                "ldl $2, %2                 \r\n"
+                "ldr $2, %1                 \r\n"
+                "ldl $3, %4                 \r\n"
+                "ldr $3, %3                 \r\n"
+                "daddiu $4, $0, 32          \r\n"
+                "mtc1 %5, $f6               \r\n"
+                "mtc1 %6, $f8               \r\n"
+                "mtc1 $0, $f20              \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "mtc1 $3, $f4               \r\n"
+                "mtc1 $4, $f10              \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "punpcklbh $f4, $f4, $f20   \r\n"
+                "pshufh $f6, $f6, $f20      \r\n"
+                "pshufh $f8, $f8, $f20      \r\n"
+                "pshufh $f10, $f10, $f20    \r\n"
+                "daddiu $4, $0, 6           \r\n"
+                "mtc1 $4, $f22              \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "pmullh $f4, $f4, $f8       \r\n"
+                "paddh $f2, $f2, $f10       \r\n"
+                "paddh $f2, $f2, $f4        \r\n"
+                "psrah $f2, $f2, $f22       \r\n"
+                "packushb $f2, $f2, $f20    \r\n"
+                "swc1 $f2, %0               \r\n"
+                : "=m"(*dst)
+                : "m"(*(src)),"m"(*(src+7)),"m"(*(src+step)),
+                  "m"(*(src+step+7)),"r"(A),"r"(E)
+                : "$2","$3","$4","$5","$6"
+            );
+
+            dst += stride;
+            src += stride;
+        }
+    } else {
+        for (i=0; i<h; i++) {
+            __asm__ volatile (
+                "lwl $2, %2                 \r\n"
+                "lwr $2, %1                 \r\n"
+                "sw $2, %0                  \r\n"
+                : "=m"(*dst)
+                : "m"(*src),"m"(*(src+3))
+                : "$2"
+            );
+
+            dst += stride;
+            src += stride;
+        }
+    }
+}
+
+void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y)
+{
+    const int A = (8 - x) *(8 - y);
+    const int B = x * (8 - y);
+    const int C = (8 - x) * y;
+    const int D = x * y;
+    int i;
+
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);
+
+    if (D) {
+        for (i=0; i<h; i++) {
+            __asm__ volatile (
+                "ldl $2, %2                 \r\n"
+                "ldr $2, %1                 \r\n"
+                "ldl $3, %4                 \r\n"
+                "ldr $3, %3                 \r\n"
+                "ldl $4, %6                 \r\n"
+                "ldr $4, %5                 \r\n"
+                "ldl $5, %8                 \r\n"
+                "ldr $5, %7                 \r\n"
+                "daddiu $6, $0, 32          \r\n"
+                "mtc1 %9, $f6               \r\n"
+                "mtc1 %10, $f8              \r\n"
+                "mtc1 %11, $f10             \r\n"
+                "mtc1 %12, $f12             \r\n"
+                "mtc1 $0, $f20              \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "mtc1 $3, $f4               \r\n"
+                "mtc1 $4, $f16              \r\n"
+                "mtc1 $5, $f18              \r\n"
+                "mtc1 $6, $f14              \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "punpcklbh $f4, $f4, $f20   \r\n"
+                "pshufh $f6, $f6, $f20      \r\n"
+                "pshufh $f8, $f8, $f20      \r\n"
+                "pshufh $f10, $f10, $f20    \r\n"
+                "pshufh $f12, $f12, $f20    \r\n"
+                "pshufh $f14, $f14, $f20    \r\n"
+                "punpcklbh $f16, $f16, $f20 \r\n"
+                "punpcklbh $f18, $f18, $f20 \r\n"
+                "daddiu $6, $0, 6           \r\n"
+                "mtc1 $6, $f22              \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "pmullh $f4, $f4, $f8       \r\n"
+                "pmullh $f16, $f10, $f16    \r\n"
+                "pmullh $f18, $f12, $f18    \r\n"
+                "paddh $f2, $f2, $f14       \r\n"
+                "paddh $f4, $f4, $f16       \r\n"
+                "paddh $f2, $f2, $f18       \r\n"
+                "paddh $f2, $f2, $f4        \r\n"
+                "psrah $f2, $f2, $f22       \r\n"
+                "packushb $f2, $f2, $f2     \r\n"
+                "lwc1 $f4, %0               \r\n"
+                "pavgb $f2, $f2, $f4        \r\n"
+                "swc1 $f2, %0               \r\n"
+                : "=m"(*dst)
+                : "m"(*(src)),"m"(*(src+7)),"m"(*(src+1)),"m"(*(src+8)),
+                  "m"(*(src+stride)),"m"(*(src+stride+7)),
+                  "m"(*(src+stride+1)),"m"(*(src+stride+8)),
+                  "r"(A),"r"(B),"r"(C),"r"(D)
+                : "$2","$3","$4","$5","$6"
+            );
+
+            dst += stride;
+            src += stride;
+        }
+    } else {
+        const int E = B + C;
+        const int step = C ? stride : 1;
+
+        for (i=0; i<h; i++) {
+            __asm__ volatile (
+                "ldl $2, %2                 \r\n"
+                "ldr $2, %1                 \r\n"
+                "ldl $3, %4                 \r\n"
+                "ldr $3, %3                 \r\n"
+                "daddiu $4, $0, 32          \r\n"
+                "mtc1 %5, $f6               \r\n"
+                "mtc1 %6, $f8               \r\n"
+                "mtc1 $0, $f20              \r\n"
+                "mtc1 $2, $f2               \r\n"
+                "mtc1 $3, $f4               \r\n"
+                "mtc1 $4, $f10              \r\n"
+                "punpcklbh $f2, $f2, $f20   \r\n"
+                "punpcklbh $f4, $f4, $f20   \r\n"
+                "pshufh $f6, $f6, $f20      \r\n"
+                "pshufh $f8, $f8, $f20      \r\n"
+                "pshufh $f10, $f10, $f20    \r\n"
+                "daddiu $4, $0, 6           \r\n"
+                "mtc1 $4, $f22              \r\n"
+                "pmullh $f2, $f2, $f6       \r\n"
+                "pmullh $f4, $f4, $f8       \r\n"
+                "paddh $f2, $f2, $f10       \r\n"
+                "paddh $f2, $f2, $f4        \r\n"
+                "psrah $f2, $f2, $f22       \r\n"
+                "packushb $f2, $f2, $f20    \r\n"
+                "lwc1 $f4, %0               \r\n"
+                "pavgb $f2, $f2, $f4        \r\n"
+                "swc1 $f2, %0               \r\n"
+                : "=m"(*dst)
+                : "m"(*(src)),"m"(*(src+7)),"m"(*(src+step)),
+                  "m"(*(src+step+7)),"r"(A),"r"(E)
+                : "$2","$3","$4","$5","$6"
+            );
+
+            dst += stride;
+            src += stride;
+        }
+    }
+}
diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
new file mode 100644
index 0000000000..67d0bc12ab
--- /dev/null
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -0,0 +1,2003 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264chroma_mips.h"
+
+static const uint8_t chroma_mask_arr[16 * 5] = {
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
+    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+static void avc_chroma_hz_2x2_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    uint16_t out0, out1;
+    v16i8 src0, src1;
+    v8u16 res_r;
+    v8i16 res;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_SB2(src, src_stride, src0, src1);
+
+    src0 = __msa_vshf_b(mask, src1, src0);
+    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    out0 = __msa_copy_u_h(res, 0);
+    out1 = __msa_copy_u_h(res, 2);
+
+    SH(out0, dst);
+    dst += dst_stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_hz_2x4_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3;
+    v8u16 res_r;
+    v8i16 res;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[64]);
+
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+
+    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
+
+    res_r = __msa_dotp_u_h(src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hz_2x8_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8u16 res_r;
+    v8i16 res;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[64]);
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
+
+    ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
+
+    res_r = __msa_dotp_u_h(src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    res_r = __msa_dotp_u_h(src4, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hz_2w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hz_2x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_hz_2x4_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_hz_2x8_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_hz_4x2_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16i8 src0, src1;
+    v8u16 res_r;
+    v4i32 res;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_SB2(src, src_stride, src0, src1);
+
+    src0 = __msa_vshf_b(mask, src1, src0);
+    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST4x2_UB(res, dst, dst_stride);
+}
+
+static void avc_chroma_hz_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride,
+                                          uint32_t coeff0, uint32_t coeff1,
+                                          int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3;
+    v8u16 res0_r, res1_r;
+    v4i32 res0, res1;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    for (row = (height >> 2); row--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+        DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
+
+        res0_r <<= 3;
+        res1_r <<= 3;
+
+        SRARI_H2_UH(res0_r, res1_r, 6);
+        SAT_UH2_UH(res0_r, res1_r, 7);
+        PCKEV_B2_SW(res0_r, res0_r, res1_r, res1_r, res0, res1);
+
+        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_chroma_hz_4w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hz_4x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+    } else {
+        avc_chroma_hz_4x4multiple_msa(src, src_stride, dst, dst_stride, coeff0,
+                                      coeff1, height);
+    }
+}
+
+static void avc_chroma_hz_8w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, out0, out1;
+    v8u16 res0, res1, res2, res3;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    for (row = height >> 2; row--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
+        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                    coeff_vec, res0, res1, res2, res3);
+        SLLI_4V(res0, res1, res2, res3, 3);
+        SRARI_H4_UH(res0, res1, res2, res3, 6);
+        SAT_UH4_UH(res0, res1, res2, res3, 7);
+        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+
+    if (0 != (height % 4)) {
+        for (row = (height % 4); row--;) {
+            src0 = LD_UB(src);
+            src += src_stride;
+
+            src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+
+            res0 = __msa_dotp_u_h(src0, coeff_vec);
+            res0 <<= 3;
+            res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
+            res0 = __msa_sat_u_h(res0, 7);
+            res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
+
+            ST8x1_UB(res0, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void avc_chroma_vt_2x2_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    uint16_t out0, out1;
+    v16i8 src0, src1, src2;
+    v16u8 tmp0, tmp1;
+    v8i16 res;
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+
+    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    out0 = __msa_copy_u_h(res, 0);
+    out1 = __msa_copy_u_h(res, 2);
+
+    SH(out0, dst);
+    dst += dst_stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_vt_2x4_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 res;
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_vt_2x8_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 res;
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    LD_UB4(src, src_stride, src5, src6, src7, src8);
+
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void avc_chroma_vt_2w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_vt_2x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_vt_2x4_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_vt_2x8_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_vt_4x2_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2;
+    v16u8 tmp0, tmp1;
+    v4i32 res;
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB3(src, src_stride, src0, src1, src2);
+    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST4x2_UB(res, dst, dst_stride);
+}
+
+static void avc_chroma_vt_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride,
+                                          uint32_t coeff0, uint32_t coeff1,
+                                          int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8u16 res0_r, res1_r;
+    v4i32 res0, res1;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (row = (height >> 2); row--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   tmp0, tmp1, tmp2, tmp3);
+        ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+        DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
+
+        res0_r <<= 3;
+        res1_r <<= 3;
+
+        SRARI_H2_UH(res0_r, res1_r, 6);
+        SAT_UH2_UH(res0_r, res1_r, 7);
+        PCKEV_B2_SW(res0_r, res0_r, res1_r, res1_r, res0, res1);
+
+        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4;
+    }
+}
+
+static void avc_chroma_vt_4w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_vt_4x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+    } else {
+        avc_chroma_vt_4x4multiple_msa(src, src_stride, dst, dst_stride, coeff0,
+                                      coeff1, height);
+    }
+}
+
+static void avc_chroma_vt_8w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, src4, out0, out1;
+    v8u16 res0, res1, res2, res3;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (row = height >> 2; row--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   src0, src1, src2, src3);
+        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                    coeff_vec, res0, res1, res2, res3);
+        SLLI_4V(res0, res1, res2, res3, 3);
+        SRARI_H4_UH(res0, res1, res2, res3, 6);
+        SAT_UH4_UH(res0, res1, res2, res3, 7);
+        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        src0 = src4;
+    }
+}
+
+static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    uint16_t out0, out1;
+    v16u8 src0, src1, src2;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v8i16 res_vert;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+
+    LD_UB3(src, src_stride, src0, src1, src2);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    out0 = __msa_copy_u_h(res_vert, 0);
+    out1 = __msa_copy_u_h(res_vert, 1);
+
+    SH(out0, dst);
+    dst += dst_stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v8i16 res;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_2x8_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v8i16 res;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    LD_UB4(src, src_stride, src5, src6, src7, src8);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_2w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coef_hor0, uint32_t coef_hor1,
+                                 uint32_t coef_ver0, uint32_t coef_ver1,
+                                 int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hv_2x2_msa(src, src_stride, dst, dst_stride, coef_hor0,
+                              coef_hor1, coef_ver0, coef_ver1);
+    } else if (4 == height) {
+        avc_chroma_hv_2x4_msa(src, src_stride, dst, dst_stride, coef_hor0,
+                              coef_hor1, coef_ver0, coef_ver1);
+    } else if (8 == height) {
+        avc_chroma_hv_2x8_msa(src, src_stride, dst, dst_stride, coef_hor0,
+                              coef_hor1, coef_ver0, coef_ver1);
+    }
+}
+
+static void avc_chroma_hv_4x2_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 mask;
+    v4i32 res;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+    LD_UB3(src, src_stride, src0, src1, src2);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    ST4x2_UB(res, dst, dst_stride);
+}
+
+static void avc_chroma_hv_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride,
+                                          uint32_t coef_hor0,
+                                          uint32_t coef_hor1,
+                                          uint32_t coef_ver0,
+                                          uint32_t coef_ver1,
+                                          int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, src4;
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+    v4i32 res0, res1;
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (row = (height >> 2); row--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+        VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+        DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+                    coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+                    res_hz3);
+        MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+             coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+             res_vt3);
+        ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+        SRARI_H2_UH(res_vt0, res_vt1, 6);
+        SAT_UH2_UH(res_vt0, res_vt1, 7);
+        PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
+
+        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4;
+    }
+}
+
+static void avc_chroma_hv_4w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coef_hor0, uint32_t coef_hor1,
+                                 uint32_t coef_ver0, uint32_t coef_ver1,
+                                 int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hv_4x2_msa(src, src_stride, dst, dst_stride, coef_hor0,
+                              coef_hor1, coef_ver0, coef_ver1);
+    } else {
+        avc_chroma_hv_4x4multiple_msa(src, src_stride, dst, dst_stride,
+                                      coef_hor0, coef_hor1, coef_ver0,
+                                      coef_ver1, height);
+    }
+}
+
+static void avc_chroma_hv_8w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coef_hor0, uint32_t coef_hor1,
+                                 uint32_t coef_ver0, uint32_t coef_ver1,
+                                 int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, src4, out0, out1;
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+
+    for (row = (height >> 2); row--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+        VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+        DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+                    coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+                    res_hz4);
+        MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+             coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+             res_vt3);
+
+        res_vt0 += (res_hz0 * coeff_vt_vec1);
+        res_vt1 += (res_hz1 * coeff_vt_vec1);
+        res_vt2 += (res_hz2 * coeff_vt_vec1);
+        res_vt3 += (res_hz3 * coeff_vt_vec1);
+
+        SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+        SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+        PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+
+        res_hz0 = res_hz4;
+    }
+}
+
+static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    uint16_t out0, out1;
+    uint32_t load0, load1;
+    v16i8 src0, src1;
+    v16u8 dst_data = { 0 };
+    v8u16 res_r;
+    v16u8 res;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_SB2(src, src_stride, src0, src1);
+
+    load0 = LW(dst);
+    load1 = LW(dst + dst_stride);
+
+    INSERT_W2_UB(load0, load1, dst_data);
+
+    src0 = __msa_vshf_b(mask, src1, src0);
+
+    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    dst_data = __msa_aver_u_b(res, dst_data);
+
+    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
+    out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+
+    SH(out0, dst);
+    dst += dst_stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8u16 res_r;
+    v16i8 res, mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[64]);
+
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+
+    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
+
+    res_r = __msa_dotp_u_h(src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+    ST2x4_UB(dst0, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8u16 res0_r, res1_r;
+    v16u8 res0, res1, mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_UB(&chroma_mask_arr[64]);
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+
+    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
+    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
+    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
+    ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
+    DOTP_UB2_UH(src0, src4, coeff_vec, coeff_vec, res0_r, res1_r);
+
+    res0_r <<= 3;
+    res1_r <<= 3;
+
+    SRARI_H2_UH(res0_r, res1_r, 6);
+    SAT_UH2_UH(res0_r, res1_r, 7);
+    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst4, dst0, dst4);
+
+    ST2x4_UB(dst0, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST2x4_UB(dst4, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coeff0, uint32_t coeff1,
+                                              int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hz_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_hz_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_hz_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    uint32_t load0, load1;
+    v16i8 src0, src1;
+    v16u8 dst_data = { 0 };
+    v8u16 res_r;
+    v16i8 res, mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_SB2(src, src_stride, src0, src1);
+
+    load0 = LW(dst);
+    load1 = LW(dst + dst_stride);
+
+    INSERT_W2_UB(load0, load1, dst_data);
+
+    src0 = __msa_vshf_b(mask, src1, src0);
+
+    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    dst_data = __msa_aver_u_b((v16u8) res, dst_data);
+
+    ST4x2_UB(dst_data, dst, dst_stride);
+}
+
+static void avc_chroma_hz_and_aver_dst_4x4multiple_msa(uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       uint32_t coeff0,
+                                                       uint32_t coeff1,
+                                                       int32_t height)
+{
+    uint32_t load0, load1;
+    uint32_t row;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0 = { 0 };
+    v16u8 dst1 = { 0 };
+    v8u16 res0_r, res1_r;
+    v16u8 res0, res1, mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_UB(&chroma_mask_arr[0]);
+
+    for (row = (height >> 2); row--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        load0 = LW(dst);
+        load1 = LW(dst + dst_stride);
+
+        INSERT_W2_UB(load0, load1, dst0);
+
+        load0 = LW(dst + 2 * dst_stride);
+        load1 = LW(dst + 3 * dst_stride);
+
+        INSERT_W2_UB(load0, load1, dst1);
+
+        VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+        DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
+
+        res0_r <<= 3;
+        res1_r <<= 3;
+
+        SRARI_H2_UH(res0_r, res1_r, 6);
+        SAT_UH2_UH(res0_r, res1_r, 7);
+        PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
+        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
+
+        ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coeff0, uint32_t coeff1,
+                                              int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hz_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    } else {
+        avc_chroma_hz_and_aver_dst_4x4multiple_msa(src, src_stride,
+                                                   dst, dst_stride,
+                                                   coeff0, coeff1, height);
+    }
+}
+
+static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coeff0, uint32_t coeff1,
+                                              int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, out0, out1;
+    v8u16 res0, res1, res2, res3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    for (row = height >> 2; row--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
+        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                    coeff_vec, res0, res1, res2, res3);
+        SLLI_4V(res0, res1, res2, res3, 3);
+        SRARI_H4_UH(res0, res1, res2, res3, 6);
+        SAT_UH4_UH(res0, res1, res2, res3, 7);
+        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+        PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+        AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    uint16_t out0, out1;
+    uint32_t load0, load1;
+    v16i8 src0, src1, src2, tmp0, tmp1, res;
+    v16u8 dst_data = { 0 };
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    load0 = LW(dst);
+    load1 = LW(dst + dst_stride);
+
+    INSERT_W2_UB(load0, load1, dst_data);
+
+    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
+
+    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    dst_data = __msa_aver_u_b((v16u8) res, dst_data);
+    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
+    out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+
+    SH(out0, dst);
+    dst += dst_stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    uint32_t load0, load1;
+    v16i8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8u16 res_r;
+    v8i16 res;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+    v16u8 dst_data = { 0 };
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    load0 = LW(dst);
+    load1 = LW(dst + dst_stride);
+
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);
+
+    load0 = LW(dst + 2 * dst_stride);
+    load1 = LW(dst + 3 * dst_stride);
+
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);
+
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    uint32_t load0, load1, load2, load3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 res;
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+    v16u8 dst_data0 = { 0 };
+    v16u8 dst_data1 = { 0 };
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    LD_SB4(src, src_stride, src5, src6, src7, src8);
+
+    LW4(dst, dst_stride, load0, load1, load2, load3);
+
+    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 0, load0);
+    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 1, load1);
+    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 2, load2);
+    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 3, load3);
+
+    LW4(dst + 4 * dst_stride, dst_stride, load0, load1, load2, load3);
+
+    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 0, load0);
+    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 1, load1);
+    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 2, load2);
+    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 3, load3);
+
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               tmp0, tmp1, tmp2, tmp3);
+
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data0);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+               tmp0, tmp1, tmp2, tmp3);
+
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data1);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coeff0, uint32_t coeff1,
+                                              int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_vt_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_vt_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_vt_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    uint32_t load0, load1;
+    v16i8 src0, src1, src2, tmp0, tmp1;
+    v16u8 dst_data = { 0 };
+    v8u16 res_r;
+    v16u8 res;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+
+    load0 = LW(dst);
+    load1 = LW(dst + dst_stride);
+
+    INSERT_W2_UB(load0, load1, dst_data);
+    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
+
+    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    res = __msa_aver_u_b(res, dst_data);
+
+    ST4x2_UB(res, dst, dst_stride);
+}
+
+static void avc_chroma_vt_and_aver_dst_4x4mul_msa(uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  uint32_t coeff0,
+                                                  uint32_t coeff1,
+                                                  int32_t height)
+{
+    uint32_t load0, load1, row;
+    v16i8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v16u8 dst0 = { 0 };
+    v16u8 dst1 = { 0 };
+    v8u16 res0_r, res1_r;
+    v16u8 res0, res1;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    for (row = (height >> 2); row--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        load0 = LW(dst);
+        load1 = LW(dst + dst_stride);
+
+        INSERT_W2_UB(load0, load1, dst0);
+        load0 = LW(dst + 2 * dst_stride);
+        load1 = LW(dst + 3 * dst_stride);
+        INSERT_W2_UB(load0, load1, dst1);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   tmp0, tmp1, tmp2, tmp3);
+        ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+        DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
+
+        res0_r <<= 3;
+        res1_r <<= 3;
+
+        SRARI_H2_UH(res0_r, res1_r, 6);
+        SAT_UH2_UH(res0_r, res1_r, 7);
+        PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
+        AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+
+        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4;
+    }
+}
+
+static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coeff0, uint32_t coeff1,
+                                              int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_vt_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    } else {
+        avc_chroma_vt_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
+                                              coeff0, coeff1, height);
+    }
+}
+
+static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coeff0, uint32_t coeff1,
+                                              int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 out0, out1;
+    v8u16 res0, res1, res2, res3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (row = height >> 2; row--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   src0, src1, src2, src3);
+        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                    coeff_vec, res0, res1, res2, res3);
+        SLLI_4V(res0, res1, res2, res3, 3);
+        SRARI_H4_UH(res0, res1, res2, res3, 6);
+        SAT_UH4_UH(res0, res1, res2, res3, 7);
+        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+        PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+        AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        src0 = src4;
+    }
+}
+
+static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    uint16_t out0, out1;
+    v16u8 dst0, dst1;
+    v16u8 src0, src1, src2;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 res, mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+
+    LD_UB3(src, src_stride, src0, src1, src2);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+    out0 = __msa_copy_u_h((v8i16) dst0, 0);
+    out1 = __msa_copy_u_h((v8i16) dst0, 1);
+
+    SH(out0, dst);
+    dst += dst_stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 res, mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+    ST2x4_UB(dst0, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 res, mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    LD_UB4(src, src_stride, src5, src6, src7, src8);
+
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+
+    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
+    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
+    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+    ST2x4_UB(dst0, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    dst4 = __msa_aver_u_b((v16u8) res, dst4);
+
+    ST2x4_UB(dst4, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coef_hor0,
+                                              uint32_t coef_hor1,
+                                              uint32_t coef_ver0,
+                                              uint32_t coef_ver1,
+                                              int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hv_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
+                                           coef_hor0, coef_hor1,
+                                           coef_ver0, coef_ver1);
+    } else if (4 == height) {
+        avc_chroma_hv_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
+                                           coef_hor0, coef_hor1,
+                                           coef_ver0, coef_ver1);
+    } else if (8 == height) {
+        avc_chroma_hv_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
+                                           coef_hor0, coef_hor1,
+                                           coef_ver0, coef_ver1);
+    }
+}
+
+static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2;
+    v16u8 dst0, dst1;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 res, mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_UB3(src, src_stride, src0, src1, src2);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+    ST4x2_UB(dst0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_4x4mul_msa(uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  uint32_t coef_hor0,
+                                                  uint32_t coef_hor1,
+                                                  uint32_t coef_ver0,
+                                                  uint32_t coef_ver1,
+                                                  int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+    v16u8 res0, res1;
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (row = (height >> 2); row--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+        VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+        DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+                    coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+                    res_hz3);
+        MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+             coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+             res_vt3);
+        ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+        SRARI_H2_UH(res_vt0, res_vt1, 6);
+        SAT_UH2_UH(res_vt0, res_vt1, 7);
+        PCKEV_B2_UB(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
+
+        dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
+        dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
+
+        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
+
+        ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4;
+    }
+}
+
+static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coef_hor0,
+                                              uint32_t coef_hor1,
+                                              uint32_t coef_ver0,
+                                              uint32_t coef_ver1,
+                                              int32_t height)
+{
+    if (2 == height) {
+        avc_chroma_hv_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
+                                           coef_hor0, coef_hor1,
+                                           coef_ver0, coef_ver1);
+    } else {
+        avc_chroma_hv_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
+                                              coef_hor0, coef_hor1,
+                                              coef_ver0, coef_ver1, height);
+    }
+}
+
+static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coef_hor0,
+                                              uint32_t coef_hor1,
+                                              uint32_t coef_ver0,
+                                              uint32_t coef_ver1,
+                                              int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, src4, out0, out1;
+    v8u16 res_hz0, res_hz1, res_hz2;
+    v8u16 res_hz3, res_hz4;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+
+    for (row = (height >> 2); row--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+        VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+        DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+                    coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+                    res_hz4);
+        MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+             coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+             res_vt3);
+
+        res_vt0 += (res_hz0 * coeff_vt_vec1);
+        res_vt1 += (res_hz1 * coeff_vt_vec1);
+        res_vt2 += (res_hz2 * coeff_vt_vec1);
+        res_vt3 += (res_hz3 * coeff_vt_vec1);
+
+        SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+        SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+
+        PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+        PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+        AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        res_hz0 = res_hz4;
+    }
+}
+
+static void copy_width8_msa(uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        for (cnt = height >> 3; cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 2) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+
+            SD(out0, dst);
+            dst += dst_stride;
+            SD(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void avg_width4_msa(uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint32_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    if (0 == (height % 4)) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                        dst0, dst1, dst2, dst3);
+
+            out0 = __msa_copy_u_w((v4i32) dst0, 0);
+            out1 = __msa_copy_u_w((v4i32) dst1, 0);
+            out2 = __msa_copy_u_w((v4i32) dst2, 0);
+            out3 = __msa_copy_u_w((v4i32) dst3, 0);
+            SW4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == (height % 2)) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+
+            LD_UB2(dst, dst_stride, dst0, dst1);
+
+            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+            out0 = __msa_copy_u_w((v4i32) dst0, 0);
+            out1 = __msa_copy_u_w((v4i32) dst1, 0);
+            SW(out0, dst);
+            dst += dst_stride;
+            SW(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void avg_width8_msa(uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    for (cnt = (height / 4); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+
+        out0 = __msa_copy_u_d((v2i64) dst0, 0);
+        out1 = __msa_copy_u_d((v2i64) dst1, 0);
+        out2 = __msa_copy_u_d((v2i64) dst2, 0);
+        out3 = __msa_copy_u_d((v2i64) dst3, 0);
+        SD4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
+                                int stride, int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if (x && y) {
+        avc_chroma_hv_8w_msa(src, stride, dst,
+                             stride, x, (8 - x), y, (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_8w_msa(src, stride, dst, stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_8w_msa(src, stride, dst, stride, y, (8 - y), height);
+    } else {
+        copy_width8_msa(src, stride, dst, stride, height);
+    }
+}
+
+void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
+                                int stride, int height, int x, int y)
+{
+    int32_t cnt;
+
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if (x && y) {
+        avc_chroma_hv_4w_msa(src, stride, dst,
+                             stride, x, (8 - x), y, (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_4w_msa(src, stride, dst, stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_4w_msa(src, stride, dst, stride, y, (8 - y), height);
+    } else {
+        for (cnt = height; cnt--;) {
+            *((uint32_t *) dst) = *((uint32_t *) src);
+
+            src += stride;
+            dst += stride;
+        }
+    }
+}
+
+void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
+                                int stride, int height, int x, int y)
+{
+    int32_t cnt;
+
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if (x && y) {
+        avc_chroma_hv_2w_msa(src, stride, dst,
+                             stride, x, (8 - x), y, (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_2w_msa(src, stride, dst, stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_2w_msa(src, stride, dst, stride, y, (8 - y), height);
+    } else {
+        for (cnt = height; cnt--;) {
+            *((uint16_t *) dst) = *((uint16_t *) src);
+
+            src += stride;
+            dst += stride;
+        }
+    }
+}
+
+void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
+                                int stride, int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+
+    if (x && y) {
+        avc_chroma_hv_and_aver_dst_8w_msa(src, stride, dst,
+                                          stride, x, (8 - x), y,
+                                          (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_and_aver_dst_8w_msa(src, stride, dst,
+                                          stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_and_aver_dst_8w_msa(src, stride, dst,
+                                          stride, y, (8 - y), height);
+    } else {
+        avg_width8_msa(src, stride, dst, stride, height);
+    }
+}
+
+void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
+                                int stride, int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if (x && y) {
+        avc_chroma_hv_and_aver_dst_4w_msa(src, stride, dst,
+                                          stride, x, (8 - x), y,
+                                          (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_and_aver_dst_4w_msa(src, stride, dst,
+                                          stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_and_aver_dst_4w_msa(src, stride, dst,
+                                          stride, y, (8 - y), height);
+    } else {
+        avg_width4_msa(src, stride, dst, stride, height);
+    }
+}
+
+void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
+                                int stride, int height, int x, int y)
+{
+    int32_t cnt;
+
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if (x && y) {
+        avc_chroma_hv_and_aver_dst_2w_msa(src, stride, dst,
+                                          stride, x, (8 - x), y,
+                                          (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_and_aver_dst_2w_msa(src, stride, dst,
+                                          stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_and_aver_dst_2w_msa(src, stride, dst,
+                                          stride, y, (8 - y), height);
+    } else {
+        for (cnt = height; cnt--;) {
+            dst[0] = (dst[0] + src[0] + 1) >> 1;
+            dst[1] = (dst[1] + src[1] + 1) >> 1;
+
+            src += stride;
+            dst += stride;
+        }
+    }
+}
diff --git a/libavcodec/mips/h264dsp_init_mips.c b/libavcodec/mips/h264dsp_init_mips.c
new file mode 100644
index 0000000000..1fe7f8468c
--- /dev/null
+++ b/libavcodec/mips/h264dsp_init_mips.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void h264dsp_init_msa(H264DSPContext *c,
+                                     const int bit_depth,
+                                     const int chroma_format_idc)
+{
+    if (8 == bit_depth) {
+        c->h264_v_loop_filter_luma = ff_h264_v_lpf_luma_inter_msa;
+        c->h264_h_loop_filter_luma = ff_h264_h_lpf_luma_inter_msa;
+        c->h264_h_loop_filter_luma_mbaff =
+            ff_h264_h_loop_filter_luma_mbaff_msa;
+        c->h264_v_loop_filter_luma_intra = ff_h264_v_lpf_luma_intra_msa;
+        c->h264_h_loop_filter_luma_intra = ff_h264_h_lpf_luma_intra_msa;
+        c->h264_h_loop_filter_luma_mbaff_intra =
+            ff_h264_h_loop_filter_luma_mbaff_intra_msa;
+        c->h264_v_loop_filter_chroma = ff_h264_v_lpf_chroma_inter_msa;
+
+        if (chroma_format_idc <= 1)
+            c->h264_h_loop_filter_chroma = ff_h264_h_lpf_chroma_inter_msa;
+        else
+            c->h264_h_loop_filter_chroma =
+                ff_h264_h_loop_filter_chroma422_msa;
+
+        if (chroma_format_idc > 1)
+            c->h264_h_loop_filter_chroma_mbaff =
+                ff_h264_h_loop_filter_chroma422_mbaff_msa;
+
+        c->h264_v_loop_filter_chroma_intra =
+            ff_h264_v_lpf_chroma_intra_msa;
+
+        if (chroma_format_idc <= 1)
+            c->h264_h_loop_filter_chroma_intra =
+                ff_h264_h_lpf_chroma_intra_msa;
+
+        /* Weighted MC */
+        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_8_msa;
+        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_8_msa;
+        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels4_8_msa;
+
+        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_8_msa;
+        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_msa;
+        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_msa;
+
+        c->h264_idct_add = ff_h264_idct_add_msa;
+        c->h264_idct8_add = ff_h264_idct8_addblk_msa;
+        c->h264_idct_dc_add = ff_h264_idct4x4_addblk_dc_msa;
+        c->h264_idct8_dc_add = ff_h264_idct8_dc_addblk_msa;
+        c->h264_idct_add16 = ff_h264_idct_add16_msa;
+        c->h264_idct8_add4 = ff_h264_idct8_add4_msa;
+
+        if (chroma_format_idc <= 1)
+            c->h264_idct_add8 = ff_h264_idct_add8_msa;
+        else
+            c->h264_idct_add8 = ff_h264_idct_add8_422_msa;
+
+        c->h264_idct_add16intra = ff_h264_idct_add16_intra_msa;
+        c->h264_luma_dc_dequant_idct = ff_h264_deq_idct_luma_dc_msa;
+    }  // if (8 == bit_depth)
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void h264dsp_init_mmi(H264DSPContext * c, const int bit_depth,
+        const int chroma_format_idc)
+{
+    if (bit_depth == 8) {
+        c->h264_add_pixels4_clear = ff_h264_add_pixels4_8_mmi;
+        c->h264_idct_add = ff_h264_idct_add_8_mmi;
+        c->h264_idct8_add = ff_h264_idct8_add_8_mmi;
+        c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmi;
+        c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmi;
+        c->h264_idct_add16 = ff_h264_idct_add16_8_mmi;
+        c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmi;
+        c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmi;
+
+        if (chroma_format_idc <= 1)
+            c->h264_idct_add8 = ff_h264_idct_add8_8_mmi;
+        else
+            c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmi;
+
+        c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_mmi;
+
+        if (chroma_format_idc <= 1)
+            c->h264_chroma_dc_dequant_idct =
+                ff_h264_chroma_dc_dequant_idct_8_mmi;
+        else
+            c->h264_chroma_dc_dequant_idct =
+                ff_h264_chroma422_dc_dequant_idct_8_mmi;
+
+        c->weight_h264_pixels_tab[0] = ff_h264_weight_pixels16_8_mmi;
+        c->weight_h264_pixels_tab[1] = ff_h264_weight_pixels8_8_mmi;
+        c->weight_h264_pixels_tab[2] = ff_h264_weight_pixels4_8_mmi;
+
+        c->biweight_h264_pixels_tab[0] = ff_h264_biweight_pixels16_8_mmi;
+        c->biweight_h264_pixels_tab[1] = ff_h264_biweight_pixels8_8_mmi;
+        c->biweight_h264_pixels_tab[2] = ff_h264_biweight_pixels4_8_mmi;
+
+        c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmi;
+        c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmi;
+
+        if (chroma_format_idc <= 1) {
+            c->h264_h_loop_filter_chroma =
+                ff_deblock_h_chroma_8_mmi;
+            c->h264_h_loop_filter_chroma_intra =
+                ff_deblock_h_chroma_intra_8_mmi;
+        }
+
+        c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_mmi;
+        c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmi;
+        c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmi;
+        c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmi;
+    }
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth,
+                                  const int chroma_format_idc)
+{
+#if HAVE_MSA
+    h264dsp_init_msa(c, bit_depth, chroma_format_idc);
+#endif  // #if HAVE_MSA
+#if HAVE_MMI
+    h264dsp_init_mmi(c, bit_depth, chroma_format_idc);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/h264dsp_mips.h b/libavcodec/mips/h264dsp_mips.h
new file mode 100644
index 0000000000..3fdbf4fb5e
--- /dev/null
+++ b/libavcodec/mips/h264dsp_mips.h
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+                      Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef H264_DSP_MIPS_H
+#define H264_DSP_MIPS_H
+
+#include "libavcodec/h264.h"
+#include "constants.h"
+
+void ff_h264_h_lpf_luma_inter_msa(uint8_t *src, int stride,
+                                  int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_inter_msa(uint8_t *src, int stride,
+                                  int alpha, int beta, int8_t *tc0);
+void ff_h264_h_lpf_chroma_inter_msa(uint8_t *src, int stride,
+                                    int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_chroma_inter_msa(uint8_t *src, int stride,
+                                    int alpha, int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
+                                         int32_t alpha, int32_t beta,
+                                         int8_t *tc0);
+void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride,
+                                               int32_t alpha, int32_t beta,
+                                               int8_t *tc0);
+void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src, int32_t stride,
+                                          int32_t alpha, int32_t beta,
+                                          int8_t *tc0);
+
+void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride);
+void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
+                                   int32_t dst_stride);
+void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
+                                  int32_t de_q_val);
+void ff_h264_idct_add16_msa(uint8_t *dst, const int32_t *blk_offset,
+                            int16_t *block, int32_t stride,
+                            const uint8_t nnzc[15 * 8]);
+void ff_h264_idct_add16_intra_msa(uint8_t *dst, const int32_t *blk_offset,
+                                  int16_t *block, int32_t dst_stride,
+                                  const uint8_t nnzc[15 * 8]);
+void ff_h264_idct_add8_msa(uint8_t **dst, const int32_t *blk_offset,
+                           int16_t *block, int32_t dst_stride,
+                           const uint8_t nnzc[15 * 8]);
+void ff_h264_idct_add8_422_msa(uint8_t **dst, const int32_t *blk_offset,
+                               int16_t *block, int32_t dst_stride,
+                               const uint8_t nnzc[15 * 8]);
+void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride);
+void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
+                                 int32_t dst_stride);
+void ff_h264_idct8_add4_msa(uint8_t *dst, const int *blk_offset,
+                            int16_t *blk, int dst_stride,
+                            const uint8_t nnzc[15 * 8]);
+
+void ff_h264_h_lpf_luma_intra_msa(uint8_t *src, int stride,
+                                  int alpha, int beta);
+void ff_h264_v_lpf_luma_intra_msa(uint8_t *src, int stride,
+                                  int alpha, int beta);
+void ff_h264_h_lpf_chroma_intra_msa(uint8_t *src, int stride,
+                                    int alpha, int beta);
+void ff_h264_v_lpf_chroma_intra_msa(uint8_t *src, int stride,
+                                    int alpha, int beta);
+void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int stride,
+                                                int alpha, int beta);
+
+void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
+                                     int stride, int height, int log2_denom,
+                                     int weightd, int weights, int offset);
+void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
+                                    int stride, int height, int log2_denom,
+                                    int weightd, int weights, int offset);
+void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
+                                    int stride, int height, int log2_denom,
+                                    int weightd, int weights, int offset);
+void ff_weight_h264_pixels16_8_msa(uint8_t *src, int stride, int height,
+                                   int log2_denom, int weight, int offset);
+void ff_weight_h264_pixels8_8_msa(uint8_t *src, int stride, int height,
+                                  int log2_denom, int weight, int offset);
+void ff_weight_h264_pixels4_8_msa(uint8_t *src, int stride, int height,
+                                  int log2_denom, int weight, int offset);
+
+void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+
+void ff_h264_intra_predict_plane_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_predict_dc_4blk_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_predict_hor_dc_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_predict_vert_dc_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_h264_intra_predict_plane_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_vert_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_horiz_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_vert_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_horiz_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_left_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_top_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_128_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_128_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_vp8_pred8x8_127_dc_8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_vp8_pred8x8_129_dc_8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_vp8_pred16x16_127_dc_8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_vp8_pred16x16_129_dc_8_msa(uint8_t *src, ptrdiff_t stride);
+
+void ff_h264_add_pixels4_8_mmi(uint8_t *_dst, int16_t *_src, int stride);
+void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
+        int qmul);
+void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul);
+void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul);
+
+void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride, int height,
+        int log2_denom, int weight, int offset);
+void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src,
+        int stride, int height, int log2_denom, int weightd, int weights,
+        int offset);
+void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height,
+        int log2_denom, int weight, int offset);
+void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src,
+        int stride, int height, int log2_denom, int weightd, int weights,
+        int offset);
+void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height,
+        int log2_denom, int weight, int offset);
+void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src,
+        int stride, int height, int log2_denom, int weightd, int weights,
+        int offset);
+
+void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+
+void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+#endif  // #ifndef H264_DSP_MIPS_H
diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c
new file mode 100644
index 0000000000..14c4a4320e
--- /dev/null
+++ b/libavcodec/mips/h264dsp_mmi.c
@@ -0,0 +1,2500 @@
+/*
+ * Loongson SIMD optimized h264dsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *                    Heiher <r@hev.cc>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/bit_depth_template.c"
+#include "h264dsp_mips.h"
+
+void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
+{
+    __asm__ volatile (
+        "xor $f0, $f0, $f0              \r\n"
+        "ldc1 $f2, 0(%[src])            \r\n"
+        "ldc1 $f4, 8(%[src])            \r\n"
+        "ldc1 $f6, 16(%[src])           \r\n"
+        "ldc1 $f8, 24(%[src])           \r\n"
+        "lwc1 $f10, 0(%[dst0])          \r\n"
+        "lwc1 $f12, 0(%[dst1])          \r\n"
+        "lwc1 $f14, 0(%[dst2])          \r\n"
+        "lwc1 $f16, 0(%[dst3])          \r\n"
+        "punpcklbh $f10, $f10, $f0      \r\n"
+        "punpcklbh $f12, $f12, $f0      \r\n"
+        "punpcklbh $f14, $f14, $f0      \r\n"
+        "punpcklbh $f16, $f16, $f0      \r\n"
+        "paddh $f2, $f2, $f10           \r\n"
+        "paddh $f4, $f4, $f12           \r\n"
+        "paddh $f6, $f6, $f14           \r\n"
+        "paddh $f8, $f8, $f16           \r\n"
+        "packushb $f2, $f2, $f0         \r\n"
+        "packushb $f4, $f4, $f0         \r\n"
+        "packushb $f6, $f6, $f0         \r\n"
+        "packushb $f8, $f8, $f0         \r\n"
+        "swc1 $f2, 0(%[dst0])           \r\n"
+        "swc1 $f4, 0(%[dst1])           \r\n"
+        "swc1 $f6, 0(%[dst2])           \r\n"
+        "swc1 $f8, 0(%[dst3])           \r\n"
+        ::[dst0]"r"(dst),[dst1]"r"(dst+stride),[dst2]"r"(dst+2*stride),
+          [dst3]"r"(dst+3*stride),[src]"r"(src)
+        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16"
+    );
+
+    memset(src, 0, 32);
+}
+
+void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
+{
+    __asm__ volatile (
+        "dli $8, 1                              \r\n"
+        "ldc1 $f0, 0(%[block])                  \r\n"
+        "dmtc1 $8, $f16                         \r\n"
+        "ldc1 $f2, 8(%[block])                  \r\n"
+        "dli $8, 6                              \r\n"
+        "ldc1 $f4, 16(%[block])                 \r\n"
+        "dmtc1 $8, $f18                         \r\n"
+        "psrah $f8, $f2, $f16                   \r\n"
+        "ldc1 $f6, 24(%[block])                 \r\n"
+        "psrah $f10, $f6, $f16                  \r\n"
+        "psubh $f8, $f8, $f6                    \r\n"
+        "paddh $f10, $f10, $f2                  \r\n"
+        "paddh $f20, $f4, $f0                   \r\n"
+        "psubh $f0, $f0, $f4                    \r\n"
+        "paddh $f22, $f10, $f20                 \r\n"
+        "psubh $f4, $f20, $f10                  \r\n"
+        "paddh $f20, $f8, $f0                   \r\n"
+        "psubh $f0, $f0, $f8                    \r\n"
+        "punpckhhw $f2, $f22, $f20              \r\n"
+        "punpcklhw $f10, $f22, $f20             \r\n"
+        "punpckhhw $f8, $f0, $f4                \r\n"
+        "punpcklhw $f0, $f0, $f4                \r\n"
+        "punpckhwd $f4, $f10, $f0               \r\n"
+        "punpcklwd $f10, $f10, $f0              \r\n"
+        "punpcklwd $f20, $f2, $f8               \r\n"
+        "punpckhwd $f0, $f2, $f8                \r\n"
+        "paddh $f10, $f10, %[ff_pw_32]          \r\n"
+        "psrah $f8, $f4, $f16                   \r\n"
+        "psrah $f6, $f0, $f16                   \r\n"
+        "psubh $f8, $f8, $f0                    \r\n"
+        "paddh $f6, $f6, $f4                    \r\n"
+        "paddh $f2, $f20, $f10                  \r\n"
+        "psubh $f10, $f10, $f20                 \r\n"
+        "paddh $f20, $f6, $f2                   \r\n"
+        "psubh $f2, $f2, $f6                    \r\n"
+        "paddh $f22, $f8, $f10                  \r\n"
+        "xor $f14, $f14, $f14                   \r\n"
+        "psubh $f10, $f10, $f8                  \r\n"
+        "sdc1 $f14, 0(%[block])                 \r\n"
+        "sdc1 $f14, 8(%[block])                 \r\n"
+        "sdc1 $f14, 16(%[block])                \r\n"
+        "sdc1 $f14, 24(%[block])                \r\n"
+        "lwc1 $f4, 0(%[dst])                    \r\n"
+        "psrah $f6, $f20, $f18                  \r\n"
+        "gslwxc1 $f0, 0(%[dst], %[stride])      \r\n"
+        "psrah $f8, $f22, $f18                  \r\n"
+        "punpcklbh $f4, $f4, $f14               \r\n"
+        "punpcklbh $f0, $f0, $f14               \r\n"
+        "paddh $f4, $f4, $f6                    \r\n"
+        "paddh $f0, $f0, $f8                    \r\n"
+        "packushb $f4, $f4, $f14                \r\n"
+        "packushb $f0, $f0, $f14                \r\n"
+        "swc1 $f4, 0(%[dst])                    \r\n"
+        "gsswxc1 $f0, 0(%[dst], %[stride])      \r\n"
+        "daddu %[dst], %[dst], %[stride]        \r\n"
+        "daddu %[dst], %[dst], %[stride]        \r\n"
+        "lwc1 $f4, 0(%[dst])                    \r\n"
+        "psrah $f10, $f10, $f18                 \r\n"
+        "gslwxc1 $f0, 0(%[dst], %[stride])      \r\n"
+        "psrah $f2, $f2, $f18                   \r\n"
+        "punpcklbh $f4, $f4, $f14               \r\n"
+        "punpcklbh $f0, $f0, $f14               \r\n"
+        "paddh $f4, $f4, $f10                   \r\n"
+        "paddh $f0, $f0, $f2                    \r\n"
+        "packushb $f4, $f4, $f14                \r\n"
+        "swc1 $f4, 0(%[dst])                    \r\n"
+        "packushb $f0, $f0, $f14                \r\n"
+        "gsswxc1 $f0, 0(%[dst], %[stride])      \r\n"
+        ::[dst]"r"(dst),[block]"r"(block),[stride]"r"((uint64_t)stride),
+          [ff_pw_32]"f"(ff_pw_32)
+        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
+          "$f18","$f20","$f22"
+    );
+
+    memset(block, 0, 32);
+}
+
+void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
+{
+    __asm__ volatile (
+        "lhu $10, 0x0(%[block])                     \r\n"
+        "daddiu $29, $29, -0x20                     \r\n"
+        "daddiu $10, $10, 0x20                      \r\n"
+        "ldc1 $f2, 0x10(%[block])                   \r\n"
+        "sh $10, 0x0(%[block])                      \r\n"
+        "ldc1 $f4, 0x20(%[block])                   \r\n"
+        "dli $10, 0x1                               \r\n"
+        "ldc1 $f6, 0x30(%[block])                   \r\n"
+        "dmtc1 $10, $f16                            \r\n"
+        "ldc1 $f10, 0x50(%[block])                  \r\n"
+        "ldc1 $f12, 0x60(%[block])                  \r\n"
+        "ldc1 $f14, 0x70(%[block])                  \r\n"
+        "mov.d $f0, $f2                             \r\n"
+        "psrah $f2, $f2, $f16                       \r\n"
+        "psrah $f8, $f10, $f16                      \r\n"
+        "paddh $f2, $f2, $f0                        \r\n"
+        "paddh $f8, $f8, $f10                       \r\n"
+        "paddh $f2, $f2, $f10                       \r\n"
+        "paddh $f8, $f8, $f14                       \r\n"
+        "paddh $f2, $f2, $f6                        \r\n"
+        "psubh $f8, $f8, $f0                        \r\n"
+        "psubh $f0, $f0, $f6                        \r\n"
+        "psubh $f10, $f10, $f6                      \r\n"
+        "psrah $f6, $f6, $f16                       \r\n"
+        "paddh $f0, $f0, $f14                       \r\n"
+        "psubh $f10, $f10, $f14                     \r\n"
+        "psrah $f14, $f14, $f16                     \r\n"
+        "psubh $f0, $f0, $f6                        \r\n"
+        "dli $10, 0x2                               \r\n"
+        "psubh $f10, $f10, $f14                     \r\n"
+        "dmtc1 $10, $f18                            \r\n"
+        "mov.d $f14, $f2                            \r\n"
+        "psrah $f2, $f2, $f18                       \r\n"
+        "psrah $f6, $f8, $f18                       \r\n"
+        "paddh $f6, $f6, $f0                        \r\n"
+        "psrah $f0, $f0, $f18                       \r\n"
+        "paddh $f2, $f2, $f10                       \r\n"
+        "psrah $f10, $f10, $f18                     \r\n"
+        "psubh $f0, $f0, $f8                        \r\n"
+        "psubh $f14, $f14, $f10                     \r\n"
+        "mov.d $f10, $f12                           \r\n"
+        "psrah $f12, $f12, $f16                     \r\n"
+        "psrah $f8, $f4, $f16                       \r\n"
+        "paddh $f12, $f12, $f4                      \r\n"
+        "psubh $f8, $f8, $f10                       \r\n"
+        "ldc1 $f4, 0x0(%[block])                    \r\n"
+        "ldc1 $f10, 0x40(%[block])                  \r\n"
+        "paddh $f10, $f10, $f4                      \r\n"
+        "paddh $f4, $f4, $f4                        \r\n"
+        "paddh $f12, $f12, $f10                     \r\n"
+        "psubh $f4, $f4, $f10                       \r\n"
+        "paddh $f10, $f10, $f10                     \r\n"
+        "paddh $f8, $f8, $f4                        \r\n"
+        "psubh $f10, $f10, $f12                     \r\n"
+        "paddh $f4, $f4, $f4                        \r\n"
+        "paddh $f14, $f14, $f12                     \r\n"
+        "psubh $f4, $f4, $f8                        \r\n"
+        "paddh $f12, $f12, $f12                     \r\n"
+        "paddh $f0, $f0, $f8                        \r\n"
+        "psubh $f12, $f12, $f14                     \r\n"
+        "paddh $f8, $f8, $f8                        \r\n"
+        "paddh $f6, $f6, $f4                        \r\n"
+        "psubh $f8, $f8, $f0                        \r\n"
+        "paddh $f4, $f4, $f4                        \r\n"
+        "paddh $f2, $f2, $f10                       \r\n"
+        "psubh $f4, $f4, $f6                        \r\n"
+        "paddh $f10, $f10, $f10                     \r\n"
+        "sdc1 $f12, 0x0(%[block])                   \r\n"
+        "psubh $f10, $f10, $f2                      \r\n"
+        "punpckhhw $f12, $f14, $f0                  \r\n"
+        "punpcklhw $f14, $f14, $f0                  \r\n"
+        "punpckhhw $f0, $f6, $f2                    \r\n"
+        "punpcklhw $f6, $f6, $f2                    \r\n"
+        "punpckhwd $f2, $f14, $f6                   \r\n"
+        "punpcklwd $f14, $f14, $f6                  \r\n"
+        "punpckhwd $f6, $f12, $f0                   \r\n"
+        "punpcklwd $f12, $f12, $f0                  \r\n"
+        "ldc1 $f0, 0x0(%[block])                    \r\n"
+        "sdc1 $f14, 0x0($29)                        \r\n"
+        "sdc1 $f2, 0x10($29)                        \r\n"
+        "dmfc1 $8, $f12                             \r\n"
+        "dmfc1 $11, $f6                             \r\n"
+        "punpckhhw $f6, $f10, $f4                   \r\n"
+        "punpcklhw $f10, $f10, $f4                  \r\n"
+        "punpckhhw $f4, $f8, $f0                    \r\n"
+        "punpcklhw $f8, $f8, $f0                    \r\n"
+        "punpckhwd $f0, $f10, $f8                   \r\n"
+        "punpcklwd $f10, $f10, $f8                  \r\n"
+        "punpckhwd $f8, $f6, $f4                    \r\n"
+        "punpcklwd $f6, $f6, $f4                    \r\n"
+        "sdc1 $f10, 0x8($29)                        \r\n"
+        "sdc1 $f0, 0x18($29)                        \r\n"
+        "dmfc1 $9, $f6                              \r\n"
+        "dmfc1 $12, $f8                             \r\n"
+        "ldc1 $f2, 0x18(%[block])                   \r\n"
+        "ldc1 $f12, 0x28(%[block])                  \r\n"
+        "ldc1 $f4, 0x38(%[block])                   \r\n"
+        "ldc1 $f0, 0x58(%[block])                   \r\n"
+        "ldc1 $f6, 0x68(%[block])                   \r\n"
+        "ldc1 $f8, 0x78(%[block])                   \r\n"
+        "mov.d $f14, $f2                            \r\n"
+        "psrah $f10, $f0, $f16                      \r\n"
+        "psrah $f2, $f2, $f16                       \r\n"
+        "paddh $f10, $f10, $f0                      \r\n"
+        "paddh $f2, $f2, $f14                       \r\n"
+        "paddh $f10, $f10, $f8                      \r\n"
+        "paddh $f2, $f2, $f0                        \r\n"
+        "psubh $f10, $f10, $f14                     \r\n"
+        "paddh $f2, $f2, $f4                        \r\n"
+        "psubh $f14, $f14, $f4                      \r\n"
+        "psubh $f0, $f0, $f4                        \r\n"
+        "psrah $f4, $f4, $f16                       \r\n"
+        "paddh $f14, $f14, $f8                      \r\n"
+        "psubh $f0, $f0, $f8                        \r\n"
+        "psrah $f8, $f8, $f16                       \r\n"
+        "psubh $f14, $f14, $f4                      \r\n"
+        "psubh $f0, $f0, $f8                        \r\n"
+        "mov.d $f8, $f2                             \r\n"
+        "psrah $f4, $f10, $f18                      \r\n"
+        "psrah $f2, $f2, $f18                       \r\n"
+        "paddh $f4, $f4, $f14                       \r\n"
+        "psrah $f14, $f14, $f18                     \r\n"
+        "paddh $f2, $f2, $f0                        \r\n"
+        "psrah $f0, $f0, $f18                       \r\n"
+        "psubh $f14, $f14, $f10                     \r\n"
+        "psubh $f8, $f8, $f0                        \r\n"
+        "mov.d $f0, $f6                             \r\n"
+        "psrah $f6, $f6, $f16                       \r\n"
+        "psrah $f10, $f12, $f16                     \r\n"
+        "paddh $f6, $f6, $f12                       \r\n"
+        "psubh $f10, $f10, $f0                      \r\n"
+        "ldc1 $f12, 0x8(%[block])                   \r\n"
+        "ldc1 $f0, 0x48(%[block])                   \r\n"
+        "paddh $f0, $f0, $f12                       \r\n"
+        "paddh $f12, $f12, $f12                     \r\n"
+        "paddh $f6, $f6, $f0                        \r\n"
+        "psubh $f12, $f12, $f0                      \r\n"
+        "paddh $f0, $f0, $f0                        \r\n"
+        "paddh $f10, $f10, $f12                     \r\n"
+        "psubh $f0, $f0, $f6                        \r\n"
+        "paddh $f12, $f12, $f12                     \r\n"
+        "paddh $f8, $f8, $f6                        \r\n"
+        "psubh $f12, $f12, $f10                     \r\n"
+        "paddh $f6, $f6, $f6                        \r\n"
+        "paddh $f14, $f14, $f10                     \r\n"
+        "psubh $f6, $f6, $f8                        \r\n"
+        "paddh $f10, $f10, $f10                     \r\n"
+        "paddh $f4, $f4, $f12                       \r\n"
+        "psubh $f10, $f10, $f14                     \r\n"
+        "paddh $f12, $f12, $f12                     \r\n"
+        "paddh $f2, $f2, $f0                        \r\n"
+        "psubh $f12, $f12, $f4                      \r\n"
+        "paddh $f0, $f0, $f0                        \r\n"
+        "sdc1 $f6, 0x8(%[block])                    \r\n"
+        "psubh $f0, $f0, $f2                        \r\n"
+        "punpckhhw $f6, $f8, $f14                   \r\n"
+        "punpcklhw $f8, $f8, $f14                   \r\n"
+        "punpckhhw $f14, $f4, $f2                   \r\n"
+        "punpcklhw $f4, $f4, $f2                    \r\n"
+        "punpckhwd $f2, $f8, $f4                    \r\n"
+        "punpcklwd $f8, $f8, $f4                    \r\n"
+        "punpckhwd $f4, $f6, $f14                   \r\n"
+        "punpcklwd $f6, $f6, $f14                   \r\n"
+        "ldc1 $f14, 0x8(%[block])                   \r\n"
+        "dmfc1 $13, $f8                             \r\n"
+        "dmfc1 $15, $f2                             \r\n"
+        "mov.d $f24, $f6                            \r\n"
+        "mov.d $f28, $f4                            \r\n"
+        "punpckhhw $f4, $f0, $f12                   \r\n"
+        "punpcklhw $f0, $f0, $f12                   \r\n"
+        "punpckhhw $f12, $f10, $f14                 \r\n"
+        "punpcklhw $f10, $f10, $f14                 \r\n"
+        "punpckhwd $f14, $f0, $f10                  \r\n"
+        "punpcklwd $f0, $f0, $f10                   \r\n"
+        "punpckhwd $f10, $f4, $f12                  \r\n"
+        "punpcklwd $f4, $f4, $f12                   \r\n"
+        "dmfc1 $14, $f0                             \r\n"
+        "mov.d $f22, $f14                           \r\n"
+        "mov.d $f26, $f4                            \r\n"
+        "mov.d $f30, $f10                           \r\n"
+        "daddiu $10, %[dst], 0x4                    \r\n"
+        "dmtc1 $15, $f14                            \r\n"
+        "dmtc1 $11, $f12                            \r\n"
+        "ldc1 $f2, 0x10($29)                        \r\n"
+        "dmtc1 $8, $f6                              \r\n"
+        "mov.d $f8, $f2                             \r\n"
+        "psrah $f2, $f2, $f16                       \r\n"
+        "psrah $f0, $f14, $f16                      \r\n"
+        "paddh $f2, $f2, $f8                        \r\n"
+        "paddh $f0, $f0, $f14                       \r\n"
+        "paddh $f2, $f2, $f14                       \r\n"
+        "paddh $f0, $f0, $f28                       \r\n"
+        "paddh $f2, $f2, $f12                       \r\n"
+        "psubh $f0, $f0, $f8                        \r\n"
+        "psubh $f8, $f8, $f12                       \r\n"
+        "psubh $f14, $f14, $f12                     \r\n"
+        "psrah $f12, $f12, $f16                     \r\n"
+        "paddh $f8, $f8, $f28                       \r\n"
+        "psubh $f14, $f14, $f28                     \r\n"
+        "psrah $f10, $f28, $f16                     \r\n"
+        "psubh $f8, $f8, $f12                       \r\n"
+        "psubh $f14, $f14, $f10                     \r\n"
+        "mov.d $f10, $f2                            \r\n"
+        "psrah $f2, $f2, $f18                       \r\n"
+        "psrah $f12, $f0, $f18                      \r\n"
+        "paddh $f2, $f2, $f14                       \r\n"
+        "paddh $f12, $f12, $f8                      \r\n"
+        "psrah $f8, $f8, $f18                       \r\n"
+        "psrah $f14, $f14, $f18                     \r\n"
+        "psubh $f8, $f8, $f0                        \r\n"
+        "psubh $f10, $f10, $f14                     \r\n"
+        "mov.d $f14, $f24                           \r\n"
+        "psrah $f4, $f24, $f16                      \r\n"
+        "psrah $f0, $f6, $f16                       \r\n"
+        "paddh $f4, $f4, $f6                        \r\n"
+        "psubh $f0, $f0, $f14                       \r\n"
+        "ldc1 $f6, 0x0($29)                         \r\n"
+        "dmtc1 $13, $f14                            \r\n"
+        "paddh $f14, $f14, $f6                      \r\n"
+        "paddh $f6, $f6, $f6                        \r\n"
+        "paddh $f4, $f4, $f14                       \r\n"
+        "psubh $f6, $f6, $f14                       \r\n"
+        "paddh $f14, $f14, $f14                     \r\n"
+        "paddh $f0, $f0, $f6                        \r\n"
+        "psubh $f14, $f14, $f4                      \r\n"
+        "paddh $f6, $f6, $f6                        \r\n"
+        "paddh $f10, $f10, $f4                      \r\n"
+        "psubh $f6, $f6, $f0                        \r\n"
+        "paddh $f4, $f4, $f4                        \r\n"
+        "paddh $f8, $f8, $f0                        \r\n"
+        "psubh $f4, $f4, $f10                       \r\n"
+        "paddh $f0, $f0, $f0                        \r\n"
+        "paddh $f12, $f12, $f6                      \r\n"
+        "psubh $f0, $f0, $f8                        \r\n"
+        "paddh $f6, $f6, $f6                        \r\n"
+        "paddh $f2, $f2, $f14                       \r\n"
+        "psubh $f6, $f6, $f12                       \r\n"
+        "paddh $f14, $f14, $f14                     \r\n"
+        "sdc1 $f6, 0x0($29)                         \r\n"
+        "psubh $f14, $f14, $f2                      \r\n"
+        "sdc1 $f0, 0x10($29)                        \r\n"
+        "dmfc1 $8, $f4                              \r\n"
+        "xor $f4, $f4, $f4                          \r\n"
+        "sdc1 $f4, 0x0(%[block])                    \r\n"
+        "sdc1 $f4, 0x8(%[block])                    \r\n"
+        "sdc1 $f4, 0x10(%[block])                   \r\n"
+        "sdc1 $f4, 0x18(%[block])                   \r\n"
+        "sdc1 $f4, 0x20(%[block])                   \r\n"
+        "sdc1 $f4, 0x28(%[block])                   \r\n"
+        "sdc1 $f4, 0x30(%[block])                   \r\n"
+        "sdc1 $f4, 0x38(%[block])                   \r\n"
+        "sdc1 $f4, 0x40(%[block])                   \r\n"
+        "sdc1 $f4, 0x48(%[block])                   \r\n"
+        "sdc1 $f4, 0x50(%[block])                   \r\n"
+        "sdc1 $f4, 0x58(%[block])                   \r\n"
+        "sdc1 $f4, 0x60(%[block])                   \r\n"
+        "sdc1 $f4, 0x68(%[block])                   \r\n"
+        "sdc1 $f4, 0x70(%[block])                   \r\n"
+        "sdc1 $f4, 0x78(%[block])                   \r\n"
+        "dli $11, 0x6                               \r\n"
+        "lwc1 $f6, 0x0(%[dst])                      \r\n"
+        "dmtc1 $11, $f20                            \r\n"
+        "gslwxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
+        "psrah $f10, $f10, $f20                     \r\n"
+        "psrah $f8, $f8, $f20                       \r\n"
+        "punpcklbh $f6, $f6, $f4                    \r\n"
+        "punpcklbh $f0, $f0, $f4                    \r\n"
+        "paddh $f6, $f6, $f10                       \r\n"
+        "paddh $f0, $f0, $f8                        \r\n"
+        "packushb $f6, $f6, $f4                     \r\n"
+        "packushb $f0, $f0, $f4                     \r\n"
+        "swc1 $f6, 0x0(%[dst])                      \r\n"
+        "gsswxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
+        "daddu %[dst], %[dst], %[stride]            \r\n"
+        "daddu %[dst], %[dst], %[stride]            \r\n"
+        "lwc1 $f6, 0x0(%[dst])                      \r\n"
+        "gslwxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
+        "psrah $f12, $f12, $f20                     \r\n"
+        "psrah $f2, $f2, $f20                       \r\n"
+        "punpcklbh $f6, $f6, $f4                    \r\n"
+        "punpcklbh $f0, $f0, $f4                    \r\n"
+        "paddh $f6, $f6, $f12                       \r\n"
+        "paddh $f0, $f0, $f2                        \r\n"
+        "packushb $f6, $f6, $f4                     \r\n"
+        "packushb $f0, $f0, $f4                     \r\n"
+        "swc1 $f6, 0x0(%[dst])                      \r\n"
+        "gsswxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
+        "ldc1 $f10, 0x0($29)                        \r\n"
+        "ldc1 $f8, 0x10($29)                        \r\n"
+        "dmtc1 $8, $f12                             \r\n"
+        "daddu %[dst], %[dst], %[stride]            \r\n"
+        "daddu %[dst], %[dst], %[stride]            \r\n"
+        "lwc1 $f6, 0x0(%[dst])                      \r\n"
+        "gslwxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
+        "psrah $f14, $f14, $f20                     \r\n"
+        "psrah $f10, $f10, $f20                     \r\n"
+        "punpcklbh $f6, $f6, $f4                    \r\n"
+        "punpcklbh $f0, $f0, $f4                    \r\n"
+        "paddh $f6, $f6, $f14                       \r\n"
+        "paddh $f0, $f0, $f10                       \r\n"
+        "packushb $f6, $f6, $f4                     \r\n"
+        "packushb $f0, $f0, $f4                     \r\n"
+        "swc1 $f6, 0x0(%[dst])                      \r\n"
+        "gsswxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
+        "daddu %[dst], %[dst], %[stride]            \r\n"
+        "daddu %[dst], %[dst], %[stride]            \r\n"
+        "lwc1 $f6, 0x0(%[dst])                      \r\n"
+        "gslwxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
+        "psrah $f8, $f8, $f20                       \r\n"
+        "psrah $f12, $f12, $f20                     \r\n"
+        "punpcklbh $f6, $f6, $f4                    \r\n"
+        "punpcklbh $f0, $f0, $f4                    \r\n"
+        "paddh $f6, $f6, $f8                        \r\n"
+        "paddh $f0, $f0, $f12                       \r\n"
+        "packushb $f6, $f6, $f4                     \r\n"
+        "packushb $f0, $f0, $f4                     \r\n"
+        "swc1 $f6, 0x0(%[dst])                      \r\n"
+        "gsswxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
+        "dmtc1 $12, $f2                             \r\n"
+        "dmtc1 $9, $f12                             \r\n"
+        "ldc1 $f8, 0x18($29)                        \r\n"
+        "mov.d $f10, $f8                            \r\n"
+        "psrah $f8, $f8, $f16                       \r\n"
+        "psrah $f14, $f22, $f16                     \r\n"
+        "paddh $f14, $f14, $f22                     \r\n"
+        "paddh $f8, $f8, $f10                       \r\n"
+        "paddh $f14, $f14, $f30                     \r\n"
+        "paddh $f8, $f8, $f22                       \r\n"
+        "psubh $f14, $f14, $f10                     \r\n"
+        "paddh $f8, $f8, $f2                        \r\n"
+        "psubh $f10, $f10, $f2                      \r\n"
+        "psubh $f6, $f22, $f2                       \r\n"
+        "psrah $f2, $f2, $f16                       \r\n"
+        "paddh $f10, $f10, $f30                     \r\n"
+        "psubh $f6, $f6, $f30                       \r\n"
+        "psrah $f4, $f30, $f16                      \r\n"
+        "psubh $f10, $f10, $f2                      \r\n"
+        "psubh $f6, $f6, $f4                        \r\n"
+        "mov.d $f4, $f8                             \r\n"
+        "psrah $f8, $f8, $f18                       \r\n"
+        "psrah $f2, $f14, $f18                      \r\n"
+        "paddh $f8, $f8, $f6                        \r\n"
+        "paddh $f2, $f2, $f10                       \r\n"
+        "psrah $f10, $f10, $f18                     \r\n"
+        "psrah $f6, $f6, $f18                       \r\n"
+        "psubh $f10, $f10, $f14                     \r\n"
+        "psubh $f4, $f4, $f6                        \r\n"
+        "mov.d $f6, $f26                            \r\n"
+        "psrah $f0, $f26, $f16                      \r\n"
+        "psrah $f14, $f12, $f16                     \r\n"
+        "paddh $f0, $f0, $f12                       \r\n"
+        "psubh $f14, $f14, $f6                      \r\n"
+        "ldc1 $f12, 0x8($29)                        \r\n"
+        "dmtc1 $14, $f6                             \r\n"
+        "paddh $f6, $f6, $f12                       \r\n"
+        "paddh $f12, $f12, $f12                     \r\n"
+        "paddh $f0, $f0, $f6                        \r\n"
+        "psubh $f12, $f12, $f6                      \r\n"
+        "paddh $f6, $f6, $f6                        \r\n"
+        "paddh $f14, $f14, $f12                     \r\n"
+        "psubh $f6, $f6, $f0                        \r\n"
+        "paddh $f12, $f12, $f12                     \r\n"
+        "paddh $f4, $f4, $f0                        \r\n"
+        "psubh $f12, $f12, $f14                     \r\n"
+        "paddh $f0, $f0, $f0                        \r\n"
+        "paddh $f10, $f10, $f14                     \r\n"
+        "psubh $f0, $f0, $f4                        \r\n"
+        "paddh $f14, $f14, $f14                     \r\n"
+        "paddh $f2, $f2, $f12                       \r\n"
+        "psubh $f14, $f14, $f10                     \r\n"
+        "paddh $f12, $f12, $f12                     \r\n"
+        "paddh $f8, $f8, $f6                        \r\n"
+        "psubh $f12, $f12, $f2                      \r\n"
+        "paddh $f6, $f6, $f6                        \r\n"
+        "sdc1 $f12, 0x8($29)                        \r\n"
+        "psubh $f6, $f6, $f8                        \r\n"
+        "sdc1 $f14, 0x18($29)                       \r\n"
+        "dmfc1 $9, $f0                              \r\n"
+        "xor $f0, $f0, $f0                          \r\n"
+        "lwc1 $f12, 0x0($10)                        \r\n"
+        "gslwxc1 $f14, 0x0($10, %[stride])          \r\n"
+        "psrah $f4, $f4, $f20                       \r\n"
+        "psrah $f10, $f10, $f20                     \r\n"
+        "punpcklbh $f12, $f12, $f0                  \r\n"
+        "punpcklbh $f14, $f14, $f0                  \r\n"
+        "paddh $f12, $f12, $f4                      \r\n"
+        "paddh $f14, $f14, $f10                     \r\n"
+        "packushb $f12, $f12, $f0                   \r\n"
+        "packushb $f14, $f14, $f0                   \r\n"
+        "swc1 $f12, 0x0($10)                        \r\n"
+        "gsswxc1 $f14, 0x0($10, %[stride])          \r\n"
+        "daddu $10, $10, %[stride]                  \r\n"
+        "daddu $10, $10, %[stride]                  \r\n"
+        "lwc1 $f12, 0x0($10)                        \r\n"
+        "gslwxc1 $f14, 0x0($10, %[stride])          \r\n"
+        "psrah $f2, $f2, $f20                       \r\n"
+        "psrah $f8, $f8, $f20                       \r\n"
+        "punpcklbh $f12, $f12, $f0                  \r\n"
+        "punpcklbh $f14, $f14, $f0                  \r\n"
+        "paddh $f12, $f12, $f2                      \r\n"
+        "paddh $f14, $f14, $f8                      \r\n"
+        "packushb $f12, $f12, $f0                   \r\n"
+        "packushb $f14, $f14, $f0                   \r\n"
+        "swc1 $f12, 0x0($10)                        \r\n"
+        "gsswxc1 $f14, 0x0($10, %[stride])          \r\n"
+        "ldc1 $f4, 0x8($29)                         \r\n"
+        "ldc1 $f10, 0x18($29)                       \r\n"
+        "daddu $10, $10, %[stride]                  \r\n"
+        "dmtc1 $9, $f2                              \r\n"
+        "daddu $10, $10, %[stride]                  \r\n"
+        "lwc1 $f12, 0x0($10)                        \r\n"
+        "gslwxc1 $f14, 0x0($10, %[stride])          \r\n"
+        "psrah $f6, $f6, $f20                       \r\n"
+        "psrah $f4, $f4, $f20                       \r\n"
+        "punpcklbh $f12, $f12, $f0                  \r\n"
+        "punpcklbh $f14, $f14, $f0                  \r\n"
+        "paddh $f12, $f12, $f6                      \r\n"
+        "paddh $f14, $f14, $f4                      \r\n"
+        "packushb $f12, $f12, $f0                   \r\n"
+        "packushb $f14, $f14, $f0                   \r\n"
+        "swc1 $f12, 0x0($10)                        \r\n"
+        "gsswxc1 $f14, 0x0($10, %[stride])          \r\n"
+        "daddu $10, $10, %[stride]                  \r\n"
+        "daddu $10, $10, %[stride]                  \r\n"
+        "lwc1 $f12, 0x0($10)                        \r\n"
+        "gslwxc1 $f14, 0x0($10, %[stride])          \r\n"
+        "psrah $f10, $f10, $f20                     \r\n"
+        "psrah $f2, $f2, $f20                       \r\n"
+        "punpcklbh $f12, $f12, $f0                  \r\n"
+        "punpcklbh $f14, $f14, $f0                  \r\n"
+        "paddh $f12, $f12, $f10                     \r\n"
+        "paddh $f14, $f14, $f2                      \r\n"
+        "packushb $f12, $f12, $f0                   \r\n"
+        "packushb $f14, $f14, $f0                   \r\n"
+        "swc1 $f12, 0x0($10)                        \r\n"
+        "gsswxc1 $f14, 0x0($10, %[stride])          \r\n"
+        "daddiu $29, $29, 0x20                      \r\n"
+        ::[dst]"r"(dst),[block]"r"(block),[stride]"r"((uint64_t)stride)
+        :"$8","$9","$10","$11","$12","$13","$14","$15","$29","$f0","$f2","$f4",
+         "$f8","$f10","$f12","$f14","$f16","$f18","$f20","$f22","$f24","$f26",
+         "$f28","$f30"
+    );
+
+    memset(block, 0, 128);
+}
+
+void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
+{
+    __asm__ volatile (
+        "lh $8, 0x0(%[block])                       \r\n"
+        "sd $0, 0x0(%[block])                       \r\n"
+        "daddiu $8, $8, 0x20                        \r\n"
+        "daddu $10, %[stride], %[stride]            \r\n"
+        "dsra $8, $8, 0x6                           \r\n"
+        "xor $f2, $f2, $f2                          \r\n"
+        "mtc1 $8, $f0                               \r\n"
+        "pshufh $f0, $f0, $f2                       \r\n"
+        "daddu $8, $10, %[stride]                   \r\n"
+        "psubh $f2, $f2, $f0                        \r\n"
+        "packushb $f0, $f0, $f0                     \r\n"
+        "packushb $f2, $f2, $f2                     \r\n"
+        "lwc1 $f4, 0x0(%[dst])                      \r\n"
+        "gslwxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
+        "gslwxc1 $f8, 0x0(%[dst], $10)              \r\n"
+        "gslwxc1 $f10, 0x0(%[dst], $8)              \r\n"
+        "paddusb $f4, $f4, $f0                      \r\n"
+        "paddusb $f6, $f6, $f0                      \r\n"
+        "paddusb $f8, $f8, $f0                      \r\n"
+        "paddusb $f10, $f10, $f0                    \r\n"
+        "psubusb $f4, $f4, $f2                      \r\n"
+        "psubusb $f6, $f6, $f2                      \r\n"
+        "psubusb $f8, $f8, $f2                      \r\n"
+        "psubusb $f10, $f10, $f2                    \r\n"
+        "swc1 $f4, 0x0(%[dst])                      \r\n"
+        "gsswxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
+        "gsswxc1 $f8, 0x0(%[dst], $10)              \r\n"
+        "gsswxc1 $f10, 0x0(%[dst], $8)              \r\n"
+        ::[dst]"r"(dst),[block]"r"(block),[stride]"r"((uint64_t)stride)
+        : "$8","$10","$f0","$f2","$f4","$f6","$f8","$f10"
+    );
+}
+
+void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
+{
+    __asm__ volatile (
+        "lh $8, 0x0(%[block])                       \r\n"
+        "sd $0, 0x0(%[block])                       \r\n"
+        "daddiu $8, $8, 0x20                        \r\n"
+        "daddu $10, %[stride], %[stride]            \r\n"
+        "dsra $8, $8, 0x6                           \r\n"
+        "xor $f2, $f2, $f2                          \r\n"
+        "mtc1 $8, $f0                               \r\n"
+        "pshufh $f0, $f0, $f2                       \r\n"
+        "daddu $8, $10, %[stride]                   \r\n"
+        "psubh $f2, $f2, $f0                        \r\n"
+        "packushb $f0, $f0, $f0                     \r\n"
+        "packushb $f2, $f2, $f2                     \r\n"
+        "ldc1 $f4, 0x0(%[dst])                      \r\n"
+        "gsldxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
+        "gsldxc1 $f8, 0x0(%[dst], $10)              \r\n"
+        "gsldxc1 $f10, 0x0(%[dst], $8)              \r\n"
+        "paddusb $f4, $f4, $f0                      \r\n"
+        "paddusb $f6, $f6, $f0                      \r\n"
+        "paddusb $f8, $f8, $f0                      \r\n"
+        "paddusb $f10, $f10, $f0                    \r\n"
+        "psubusb $f4, $f4, $f2                      \r\n"
+        "psubusb $f6, $f6, $f2                      \r\n"
+        "psubusb $f8, $f8, $f2                      \r\n"
+        "psubusb $f10, $f10, $f2                    \r\n"
+        "sdc1 $f4, 0x0(%[dst])                      \r\n"
+        "gssdxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
+        "gssdxc1 $f8, 0x0(%[dst], $10)              \r\n"
+        "daddu $9, $10, $10                         \r\n"
+        "gssdxc1 $f10, 0x0(%[dst], $8)              \r\n"
+        "daddu %[dst], %[dst], $9                   \r\n"
+        "ldc1 $f4, 0x0(%[dst])                      \r\n"
+        "gsldxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
+        "gsldxc1 $f8, 0x0(%[dst], $10)              \r\n"
+        "gsldxc1 $f10, 0x0(%[dst], $8)              \r\n"
+        "paddusb $f4, $f4, $f0                      \r\n"
+        "paddusb $f6, $f6, $f0                      \r\n"
+        "paddusb $f8, $f8, $f0                      \r\n"
+        "paddusb $f10, $f10, $f0                    \r\n"
+        "psubusb $f4, $f4, $f2                      \r\n"
+        "psubusb $f6, $f6, $f2                      \r\n"
+        "psubusb $f8, $f8, $f2                      \r\n"
+        "psubusb $f10, $f10, $f2                    \r\n"
+        "sdc1 $f4, 0x0(%[dst])                      \r\n"
+        "gssdxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
+        "gssdxc1 $f8, 0x0(%[dst], $10)              \r\n"
+        "gssdxc1 $f10, 0x0(%[dst], $8)              \r\n"
+        ::[dst]"r"(dst),[block]"r"(block),[stride]"r"((uint64_t)stride)
+        : "$8","$9","$10","$f0","$f2","$f4","$f6","$f8","$f10"
+    );
+}
+
+void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int i;
+    for(i=0; i<16; i++){
+        int nnz = nnzc[ scan8[i] ];
+        if(nnz){
+            if(nnz==1 && ((int16_t*)block)[i*16])
+                ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16,
+                        stride);
+            else
+                ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16,
+                        stride);
+        }
+    }
+}
+
+void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int i;
+    for(i=0; i<16; i++){
+        if(nnzc[ scan8[i] ])
+            ff_h264_idct_add_8_mmi(dst + block_offset[i], block + i*16, stride);
+        else if(((int16_t*)block)[i*16])
+            ff_h264_idct_dc_add_8_mmi(dst + block_offset[i], block + i*16,
+                    stride);
+    }
+}
+
+void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int i;
+    for(i=0; i<16; i+=4){
+        int nnz = nnzc[ scan8[i] ];
+        if(nnz){
+            if(nnz==1 && ((int16_t*)block)[i*16])
+                ff_h264_idct8_dc_add_8_mmi(dst + block_offset[i],
+                        block + i*16, stride);
+            else
+                ff_h264_idct8_add_8_mmi(dst + block_offset[i], block + i*16,
+                        stride);
+        }
+    }
+}
+
+void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int i, j;
+    for(j=1; j<3; j++){
+        for(i=j*16; i<j*16+4; i++){
+            if(nnzc[ scan8[i] ])
+                ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i],
+                        block + i*16, stride);
+            else if(((int16_t*)block)[i*16])
+                ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i],
+                        block + i*16, stride);
+        }
+    }
+}
+
+void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int i, j;
+
+    for(j=1; j<3; j++){
+        for(i=j*16; i<j*16+4; i++){
+            if(nnzc[ scan8[i] ])
+                ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i],
+                        block + i*16, stride);
+            else if(((int16_t*)block)[i*16])
+                ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i],
+                        block + i*16, stride);
+        }
+    }
+
+    for(j=1; j<3; j++){
+        for(i=j*16+4; i<j*16+8; i++){
+            if(nnzc[ scan8[i+4] ])
+                ff_h264_idct_add_8_mmi(dest[j-1] + block_offset[i+4],
+                        block + i*16, stride);
+            else if(((int16_t*)block)[i*16])
+                ff_h264_idct_dc_add_8_mmi(dest[j-1] + block_offset[i+4],
+                        block + i*16, stride);
+        }
+    }
+}
+
+void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
+        int qmul)
+{
+    __asm__ volatile (
+        ".set noreorder                                 \r\n"
+        "dli $10, 0x8                                   \r\n"
+        "ldc1 $f6, 0x18(%[input])                       \r\n"
+        "dmtc1 $10, $f16                                \r\n"
+        "ldc1 $f4, 0x10(%[input])                       \r\n"
+        "dli $10, 0x20                                  \r\n"
+        "ldc1 $f2, 0x8(%[input])                        \r\n"
+        "dmtc1 $10, $f18                                \r\n"
+        "ldc1 $f0, 0x0(%[input])                        \r\n"
+        "mov.d $f8, $f6                                 \r\n"
+        "paddh $f6, $f6, $f4                            \r\n"
+        "psubh $f4, $f4, $f8                            \r\n"
+        "mov.d $f8, $f2                                 \r\n"
+        "paddh $f2, $f2, $f0                            \r\n"
+        "psubh $f0, $f0, $f8                            \r\n"
+        "mov.d $f8, $f6                                 \r\n"
+        "paddh $f6, $f6, $f2                            \r\n"
+        "psubh $f2, $f2, $f8                            \r\n"
+        "mov.d $f8, $f4                                 \r\n"
+        "paddh $f4, $f4, $f0                            \r\n"
+        "psubh $f0, $f0, $f8                            \r\n"
+        "mov.d $f8, $f6                                 \r\n"
+        "punpcklhw $f6, $f6, $f2                        \r\n"
+        "punpckhhw $f8, $f8, $f2                        \r\n"
+        "punpckhhw $f2, $f0, $f4                        \r\n"
+        "punpcklhw $f0, $f0, $f4                        \r\n"
+        "punpckhwd $f4, $f6, $f0                        \r\n"
+        "punpcklwd $f6, $f6, $f0                        \r\n"
+        "mov.d $f0, $f8                                 \r\n"
+        "punpcklwd $f8, $f8, $f2                        \r\n"
+        "punpckhwd $f0, $f0, $f2                        \r\n"
+        "mov.d $f2, $f0                                 \r\n"
+        "paddh $f0, $f0, $f8                            \r\n"
+        "psubh $f8, $f8, $f2                            \r\n"
+        "mov.d $f2, $f4                                 \r\n"
+        "paddh $f4, $f4, $f6                            \r\n"
+        "psubh $f6, $f6, $f2                            \r\n"
+        "mov.d $f2, $f0                                 \r\n"
+        "paddh $f0, $f0, $f4                            \r\n"
+        "psubh $f4, $f4, $f2                            \r\n"
+        "mov.d $f2, $f8                                 \r\n"
+        "daddiu $10, %[qmul], -0x7fff                   \r\n"
+        "paddh $f8, $f8, $f6                            \r\n"
+        "bgtz $10, 1f                                   \r\n"
+        "psubh $f6, $f6, $f2                            \r\n"
+        "ori $10, $0, 0x80                              \r\n"
+        "dsll $10, $10, 0x10                            \r\n"
+        "punpckhhw $f2, $f0, %[ff_pw_1]                 \r\n"
+        "daddu %[qmul], %[qmul], $10                    \r\n"
+        "punpcklhw $f0, $f0, %[ff_pw_1]                 \r\n"
+        "punpckhhw $f10, $f4, %[ff_pw_1]                \r\n"
+        "punpcklhw $f4, $f4, %[ff_pw_1]                 \r\n"
+        "mtc1 %[qmul], $f14                             \r\n"
+        "punpcklwd $f14, $f14, $f14                     \r\n"
+        "pmaddhw $f0, $f0, $f14                         \r\n"
+        "pmaddhw $f4, $f4, $f14                         \r\n"
+        "pmaddhw $f2, $f2, $f14                         \r\n"
+        "pmaddhw $f10, $f10, $f14                       \r\n"
+        "psraw $f0, $f0, $f16                           \r\n"
+        "psraw $f4, $f4, $f16                           \r\n"
+        "psraw $f2, $f2, $f16                           \r\n"
+        "psraw $f10, $f10, $f16                         \r\n"
+        "packsswh $f0, $f0, $f2                         \r\n"
+        "packsswh $f4, $f4, $f10                        \r\n"
+        "mfc1 $9, $f0                                   \r\n"
+        "dsrl $f0, $f0, $f18                            \r\n"
+        "mfc1 %[input], $f0                             \r\n"
+        "sh $9, 0x0(%[output])                          \r\n"
+        "sh %[input], 0x80(%[output])                   \r\n"
+        "dsrl $9, $9, 0x10                              \r\n"
+        "dsrl %[input], %[input], 0x10                  \r\n"
+        "sh $9, 0x20(%[output])                         \r\n"
+        "sh %[input], 0xa0(%[output])                   \r\n"
+        "mfc1 $9, $f4                                   \r\n"
+        "dsrl $f4, $f4, $f18                            \r\n"
+        "mfc1 %[input], $f4                             \r\n"
+        "sh $9, 0x40(%[output])                         \r\n"
+        "sh %[input], 0xc0(%[output])                   \r\n"
+        "dsrl $9, $9, 0x10                              \r\n"
+        "dsrl %[input], %[input], 0x10                  \r\n"
+        "sh $9, 0x60(%[output])                         \r\n"
+        "sh %[input], 0xe0(%[output])                   \r\n"
+        "punpckhhw $f2, $f6, %[ff_pw_1]                 \r\n"
+        "punpcklhw $f6, $f6, %[ff_pw_1]                 \r\n"
+        "punpckhhw $f10, $f8, %[ff_pw_1]                \r\n"
+        "punpcklhw $f8, $f8, %[ff_pw_1]                 \r\n"
+        "mtc1 %[qmul], $f14                             \r\n"
+        "punpcklwd $f14, $f14, $f14                     \r\n"
+        "pmaddhw $f6, $f6, $f14                         \r\n"
+        "pmaddhw $f8, $f8, $f14                         \r\n"
+        "pmaddhw $f2, $f2, $f14                         \r\n"
+        "pmaddhw $f10, $f10, $f14                       \r\n"
+        "psraw $f6, $f6, $f16                           \r\n"
+        "psraw $f8, $f8, $f16                           \r\n"
+        "psraw $f2, $f2, $f16                           \r\n"
+        "psraw $f10, $f10, $f16                         \r\n"
+        "packsswh $f6, $f6, $f2                         \r\n"
+        "packsswh $f8, $f8, $f10                        \r\n"
+        "mfc1 $9, $f6                                   \r\n"
+        "dsrl $f6, $f6, $f18                            \r\n"
+        "mfc1 %[input], $f6                             \r\n"
+        "sh $9, 0x100(%[output])                        \r\n"
+        "sh %[input], 0x180(%[output])                  \r\n"
+        "dsrl $9, $9, 0x10                              \r\n"
+        "dsrl %[input], %[input], 0x10                  \r\n"
+        "sh $9, 0x120(%[output])                        \r\n"
+        "sh %[input], 0x1a0(%[output])                  \r\n"
+        "mfc1 $9, $f8                                   \r\n"
+        "dsrl $f8, $f8, $f18                            \r\n"
+        "mfc1 %[input], $f8                             \r\n"
+        "sh $9, 0x140(%[output])                        \r\n"
+        "sh %[input], 0x1c0(%[output])                  \r\n"
+        "dsrl $9, $9, 0x10                              \r\n"
+        "dsrl %[input], %[input], 0x10                  \r\n"
+        "sh $9, 0x160(%[output])                        \r\n"
+        "jr $31                                         \r\n"
+        "sh %[input], 0x1e0(%[output])                  \r\n"
+        "1:                                             \r\n"
+        "ori $10, $0, 0x1f                              \r\n"
+        "clz $9, %[qmul]                                \r\n"
+        "ori %[input], $0, 0x7                          \r\n"
+        "dsubu $9, $10, $9                              \r\n"
+        "ori $10, $0, 0x80                              \r\n"
+        "dsll $10, $10, 0x10                            \r\n"
+        "daddu %[qmul], %[qmul], $10                    \r\n"
+        "dsubu $10, $9, %[input]                        \r\n"
+        "movn $9, %[input], $10                         \r\n"
+        "daddiu %[input], %[input], 0x1                 \r\n"
+        "andi $10, $9, 0xff                             \r\n"
+        "dsrlv %[qmul], %[qmul], $10                    \r\n"
+        "dsubu %[input], %[input], $9                   \r\n"
+        "mtc1 %[input], $f12                            \r\n"
+        "punpckhhw $f2, $f0, %[ff_pw_1]                 \r\n"
+        "punpcklhw $f0, $f0, %[ff_pw_1]                 \r\n"
+        "punpckhhw $f10, $f4, %[ff_pw_1]                \r\n"
+        "punpcklhw $f4, $f4, %[ff_pw_1]                 \r\n"
+        "mtc1 %[qmul], $f14                             \r\n"
+        "punpcklwd $f14, $f14, $f14                     \r\n"
+        "pmaddhw $f0, $f0, $f14                         \r\n"
+        "pmaddhw $f4, $f4, $f14                         \r\n"
+        "pmaddhw $f2, $f2, $f14                         \r\n"
+        "pmaddhw $f10, $f10, $f14                       \r\n"
+        "psraw $f0, $f0, $f12                           \r\n"
+        "psraw $f4, $f4, $f12                           \r\n"
+        "psraw $f2, $f2, $f12                           \r\n"
+        "psraw $f10, $f10, $f12                         \r\n"
+        "packsswh $f0, $f0, $f2                         \r\n"
+        "packsswh $f4, $f4, $f10                        \r\n"
+        "mfc1 $9, $f0                                   \r\n"
+        "dsrl $f0, $f0, $f18                            \r\n"
+        "sh $9, 0x0(%[output])                          \r\n"
+        "mfc1 %[input], $f0                             \r\n"
+        "dsrl $9, $9, 0x10                              \r\n"
+        "sh %[input], 0x80(%[output])                   \r\n"
+        "sh $9, 0x20(%[output])                         \r\n"
+        "dsrl %[input], %[input], 0x10                  \r\n"
+        "mfc1 $9, $f4                                   \r\n"
+        "sh %[input], 0xa0(%[output])                   \r\n"
+        "dsrl $f4, $f4, $f18                            \r\n"
+        "sh $9, 0x40(%[output])                         \r\n"
+        "mfc1 %[input], $f4                             \r\n"
+        "dsrl $9, $9, 0x10                              \r\n"
+        "sh %[input], 0xc0(%[output])                   \r\n"
+        "sh $9, 0x60(%[output])                         \r\n"
+        "dsrl %[input], %[input], 0x10                  \r\n"
+        "sh %[input], 0xe0(%[output])                   \r\n"
+        "punpckhhw $f2, $f6, %[ff_pw_1]                 \r\n"
+        "punpcklhw $f6, $f6, %[ff_pw_1]                 \r\n"
+        "punpckhhw $f10, $f8, %[ff_pw_1]                \r\n"
+        "punpcklhw $f8, $f8, %[ff_pw_1]                 \r\n"
+        "mtc1 %[qmul], $f14                             \r\n"
+        "punpcklwd $f14, $f14, $f14                     \r\n"
+        "pmaddhw $f6, $f6, $f14                         \r\n"
+        "pmaddhw $f8, $f8, $f14                         \r\n"
+        "pmaddhw $f2, $f2, $f14                         \r\n"
+        "pmaddhw $f10, $f10, $f14                       \r\n"
+        "psraw $f6, $f6, $f12                           \r\n"
+        "psraw $f8, $f8, $f12                           \r\n"
+        "psraw $f2, $f2, $f12                           \r\n"
+        "psraw $f10, $f10, $f12                         \r\n"
+        "packsswh $f6, $f6, $f2                         \r\n"
+        "packsswh $f8, $f8, $f10                        \r\n"
+        "mfc1 $9, $f6                                   \r\n"
+        "dsrl $f6, $f6, $f18                            \r\n"
+        "mfc1 %[input], $f6                             \r\n"
+        "sh $9, 0x100(%[output])                        \r\n"
+        "sh %[input], 0x180(%[output])                  \r\n"
+        "dsrl $9, $9, 0x10                              \r\n"
+        "dsrl %[input], %[input], 0x10                  \r\n"
+        "sh $9, 0x120(%[output])                        \r\n"
+        "sh %[input], 0x1a0(%[output])                  \r\n"
+        "mfc1 $9, $f8                                   \r\n"
+        "dsrl $f8, $f8, $f18                            \r\n"
+        "mfc1 %[input], $f8                             \r\n"
+        "sh $9, 0x140(%[output])                        \r\n"
+        "sh %[input], 0x1c0(%[output])                  \r\n"
+        "dsrl $9, $9, 0x10                              \r\n"
+        "dsrl %[input], %[input], 0x10                  \r\n"
+        "sh $9, 0x160(%[output])                        \r\n"
+        "sh %[input], 0x1e0(%[output])                  \r\n"
+        ".set reorder                                   \r\n"
+        ::[output]"r"(output),[input]"r"(input),[qmul]"r"((uint64_t)qmul),
+          [ff_pw_1]"f"(ff_pw_1)
+        : "$9","$10","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
+          "$f18"
+    );
+}
+
+void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
+{
+    int temp[8];
+    int t[8];
+
+    temp[0] = block[0] + block[16];
+    temp[1] = block[0] - block[16];
+    temp[2] = block[32] + block[48];
+    temp[3] = block[32] - block[48];
+    temp[4] = block[64] + block[80];
+    temp[5] = block[64] - block[80];
+    temp[6] = block[96] + block[112];
+    temp[7] = block[96] - block[112];
+
+    t[0] = temp[0] + temp[4] + temp[2] + temp[6];
+    t[1] = temp[0] - temp[4] + temp[2] - temp[6];
+    t[2] = temp[0] - temp[4] - temp[2] + temp[6];
+    t[3] = temp[0] + temp[4] - temp[2] - temp[6];
+    t[4] = temp[1] + temp[5] + temp[3] + temp[7];
+    t[5] = temp[1] - temp[5] + temp[3] - temp[7];
+    t[6] = temp[1] - temp[5] - temp[3] + temp[7];
+    t[7] = temp[1] + temp[5] - temp[3] - temp[7];
+
+    block[  0]= (t[0]*qmul + 128) >> 8;
+    block[ 32]= (t[1]*qmul + 128) >> 8;
+    block[ 64]= (t[2]*qmul + 128) >> 8;
+    block[ 96]= (t[3]*qmul + 128) >> 8;
+    block[ 16]= (t[4]*qmul + 128) >> 8;
+    block[ 48]= (t[5]*qmul + 128) >> 8;
+    block[ 80]= (t[6]*qmul + 128) >> 8;
+    block[112]= (t[7]*qmul + 128) >> 8;
+}
+
+void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
+{
+    int a,b,c,d;
+
+    d = block[0] - block[16];
+    a = block[0] + block[16];
+    b = block[32] - block[48];
+    c = block[32] + block[48];
+    block[0] = ((a+c)*qmul) >> 7;
+    block[16]= ((d+b)*qmul) >> 7;
+    block[32]= ((a-c)*qmul) >> 7;
+    block[48]= ((d-b)*qmul) >> 7;
+}
+
+void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride,
+        int height, int log2_denom, int weight, int offset)
+{
+    int y;
+
+    offset <<= log2_denom;
+
+    if (log2_denom)
+        offset += 1 << (log2_denom - 1);
+
+    for (y=0; y<height; y++, block+=stride) {
+        __asm__ volatile (
+            "ldc1 $f2, %0                   \r\n"
+            "ldc1 $f4, %1                   \r\n"
+            "dmtc1 $0, $f20                 \r\n"
+            "mtc1 %2, $f6                   \r\n"
+            "mtc1 %3, $f8                   \r\n"
+            "mtc1 %4, $f10                  \r\n"
+            "pshufh $f6, $f6, $f20          \r\n"
+            "pshufh $f8, $f8, $f20          \r\n"
+            "punpckhbh $f14, $f2, $f20      \r\n"
+            "punpckhbh $f16, $f4, $f20      \r\n"
+            "punpcklbh $f2, $f2, $f20       \r\n"
+            "punpcklbh $f4, $f4, $f20       \r\n"
+            "pmullh $f14, $f14, $f6         \r\n"
+            "pmullh $f16, $f16, $f6         \r\n"
+            "pmullh $f2, $f2, $f6           \r\n"
+            "pmullh $f4, $f4, $f6           \r\n"
+            "paddsh $f14, $f14, $f8         \r\n"
+            "paddsh $f16, $f16, $f8         \r\n"
+            "paddsh $f2, $f2, $f8           \r\n"
+            "paddsh $f4, $f4, $f8           \r\n"
+            "psrah $f14, $f14, $f10         \r\n"
+            "psrah $f16, $f16, $f10         \r\n"
+            "psrah $f2, $f2, $f10           \r\n"
+            "psrah $f4, $f4, $f10           \r\n"
+            "packushb $f2, $f2, $f14        \r\n"
+            "packushb $f4, $f4, $f16        \r\n"
+            "sdc1 $f2, %0                   \r\n"
+            "sdc1 $f4, %1                   \r\n"
+            : "=m"(*block),"=m"(*(block + 8))
+            : "r"(weight),"r"(offset),"r"(log2_denom)
+        );
+    }
+}
+
+void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src,
+        int stride, int height, int log2_denom, int weightd, int weights,
+        int offset)
+{
+    int y;
+
+    offset = ((offset + 1) | 1) << log2_denom;
+
+    for (y=0; y<height; y++, dst+=stride, src+=stride) {
+        __asm__ volatile (
+            "ldc1 $f2, %2                   \r\n"
+            "ldc1 $f4, %3                   \r\n"
+            "dmtc1 $0, $f20                 \r\n"
+            "mtc1 %6, $f6                   \r\n"
+            "mtc1 %7, $f8                   \r\n"
+            "mtc1 %8, $f10                  \r\n"
+            "mtc1 %9, $f12                  \r\n"
+            "pshufh $f6, $f6, $f20          \r\n"
+            "pshufh $f8, $f8, $f20          \r\n"
+            "pshufh $f10, $f10, $f20        \r\n"
+            "punpckhbh $f14, $f2, $f20      \r\n"
+            "punpckhbh $f16, $f4, $f20      \r\n"
+            "punpcklbh $f2, $f2, $f20       \r\n"
+            "punpcklbh $f4, $f4, $f20       \r\n"
+            "pmullh $f14, $f14, $f6         \r\n"
+            "pmullh $f16, $f16, $f8         \r\n"
+            "pmullh $f2, $f2, $f6           \r\n"
+            "pmullh $f4, $f4, $f8           \r\n"
+            "paddsh $f14, $f14, $f10        \r\n"
+            "paddsh $f2, $f2, $f10          \r\n"
+            "paddsh $f14, $f14, $f16        \r\n"
+            "paddsh $f2, $f2, $f4           \r\n"
+            "psrah $f14, $f14, $f12         \r\n"
+            "psrah $f2, $f2, $f12           \r\n"
+            "packushb $f2, $f2, $f14        \r\n"
+            "sdc1 $f2, %0                   \r\n"
+            "ldc1 $f2, %4                   \r\n"
+            "ldc1 $f4, %5                   \r\n"
+            "punpckhbh $f14, $f2, $f20      \r\n"
+            "punpckhbh $f16, $f4, $f20      \r\n"
+            "punpcklbh $f2, $f2, $f20       \r\n"
+            "punpcklbh $f4, $f4, $f20       \r\n"
+            "pmullh $f14, $f14, $f6         \r\n"
+            "pmullh $f16, $f16, $f8         \r\n"
+            "pmullh $f2, $f2, $f6           \r\n"
+            "pmullh $f4, $f4, $f8           \r\n"
+            "paddsh $f14, $f14, $f10        \r\n"
+            "paddsh $f2, $f2, $f10          \r\n"
+            "paddsh $f14, $f14, $f16        \r\n"
+            "paddsh $f2, $f2, $f4           \r\n"
+            "psrah $f14, $f14, $f12         \r\n"
+            "psrah $f2, $f2, $f12           \r\n"
+            "packushb $f2, $f2, $f14        \r\n"
+            "sdc1 $f2, %1                   \r\n"
+            : "=m"(*dst),"=m"(*(dst+8))
+            : "m"(*src),"m"(*dst),"m"(*(src+8)),"m"(*(dst+8)),
+              "r"(weights),"r"(weightd),"r"(offset),"r"(log2_denom+1)
+        );
+    }
+}
+
+void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height,
+        int log2_denom, int weight, int offset)
+{
+    int y;
+
+    offset <<= log2_denom;
+
+    if (log2_denom)
+        offset += 1 << (log2_denom - 1);
+
+    for (y=0; y<height; y++, block+=stride) {
+        __asm__ volatile (
+            "ldc1 $f2, %0                   \r\n"
+            "mtc1 %1, $f6                   \r\n"
+            "mtc1 %2, $f8                   \r\n"
+            "mtc1 %3, $f10                  \r\n"
+            "dmtc1 $0, $f20                 \r\n"
+            "pshufh $f6, $f6, $f20          \r\n"
+            "pshufh $f8, $f8, $f20          \r\n"
+            "punpckhbh $f14, $f2, $f20      \r\n"
+            "punpcklbh $f2, $f2, $f20       \r\n"
+            "pmullh $f14, $f14, $f6         \r\n"
+            "pmullh $f2, $f2, $f6           \r\n"
+            "paddsh $f14, $f14, $f8         \r\n"
+            "paddsh $f2, $f2, $f8           \r\n"
+            "psrah $f14, $f14, $f10         \r\n"
+            "psrah $f2, $f2, $f10           \r\n"
+            "packushb $f2, $f2, $f14        \r\n"
+            "sdc1 $f2, %0                   \r\n"
+            : "=m"(*block)
+            : "r"(weight),"r"(offset),"r"(log2_denom)
+        );
+    }
+}
+
+void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src,
+        int stride, int height, int log2_denom, int weightd, int weights,
+        int offset)
+{
+    int y;
+
+    offset = ((offset + 1) | 1) << log2_denom;
+
+    for (y=0; y<height; y++, dst+=stride, src+=stride) {
+        __asm__ volatile (
+            "ldc1 $f2, %1                   \r\n"
+            "ldc1 $f4, %2                   \r\n"
+            "dmtc1 $0, $f20                 \r\n"
+            "mtc1 %3, $f6                   \r\n"
+            "mtc1 %4, $f8                   \r\n"
+            "mtc1 %5, $f10                  \r\n"
+            "mtc1 %6, $f12                  \r\n"
+            "pshufh $f6, $f6, $f20          \r\n"
+            "pshufh $f8, $f8, $f20          \r\n"
+            "pshufh $f10, $f10, $f20        \r\n"
+            "punpckhbh $f14, $f2, $f20      \r\n"
+            "punpckhbh $f16, $f4, $f20      \r\n"
+            "punpcklbh $f2, $f2, $f20       \r\n"
+            "punpcklbh $f4, $f4, $f20       \r\n"
+            "pmullh $f14, $f14, $f6         \r\n"
+            "pmullh $f16, $f16, $f8         \r\n"
+            "pmullh $f2, $f2, $f6           \r\n"
+            "pmullh $f4, $f4, $f8           \r\n"
+            "paddsh $f14, $f14, $f10        \r\n"
+            "paddsh $f2, $f2, $f10          \r\n"
+            "paddsh $f14, $f14, $f16        \r\n"
+            "paddsh $f2, $f2, $f4           \r\n"
+            "psrah $f14, $f14, $f12         \r\n"
+            "psrah $f2, $f2, $f12           \r\n"
+            "packushb $f2, $f2, $f14        \r\n"
+            "sdc1 $f2, %0                   \r\n"
+            : "=m"(*dst)
+            : "m"(*src),"m"(*dst),"r"(weights),
+              "r"(weightd),"r"(offset),"r"(log2_denom+1)
+        );
+    }
+}
+
+void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height,
+        int log2_denom, int weight, int offset)
+{
+    int y;
+
+    offset <<= log2_denom;
+
+    if (log2_denom)
+        offset += 1 << (log2_denom - 1);
+
+    for (y=0; y<height; y++, block+=stride) {
+        __asm__ volatile (
+            "lwc1 $f2, %0                   \r\n"
+            "mtc1 %1, $f6                   \r\n"
+            "mtc1 %2, $f8                   \r\n"
+            "mtc1 %3, $f10                  \r\n"
+            "dmtc1 $0, $f20                 \r\n"
+            "pshufh $f6, $f6, $f20          \r\n"
+            "pshufh $f8, $f8, $f20          \r\n"
+            "punpcklbh $f2, $f2, $f20       \r\n"
+            "pmullh $f2, $f2, $f6           \r\n"
+            "paddsh $f2, $f2, $f8           \r\n"
+            "psrah $f2, $f2, $f10           \r\n"
+            "packushb $f2, $f2, $f20        \r\n"
+            "swc1 $f2, %0                   \r\n"
+            : "=m"(*block)
+            : "r"(weight),"r"(offset),"r"(log2_denom)
+        );
+    }
+}
+
+void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src,
+        int stride, int height, int log2_denom, int weightd, int weights,
+        int offset)
+{
+    int y;
+
+    offset = ((offset + 1) | 1) << log2_denom;
+
+    for (y=0; y<height; y++, dst+=stride, src+=stride) {
+        __asm__ volatile (
+            "lwc1 $f2, %1                   \r\n"
+            "lwc1 $f4, %2                   \r\n"
+            "dmtc1 $0, $f20                 \r\n"
+            "mtc1 %3, $f6                   \r\n"
+            "mtc1 %4, $f8                   \r\n"
+            "mtc1 %5, $f10                  \r\n"
+            "mtc1 %6, $f12                  \r\n"
+            "pshufh $f6, $f6, $f20          \r\n"
+            "pshufh $f8, $f8, $f20          \r\n"
+            "pshufh $f10, $f10, $f20        \r\n"
+            "punpcklbh $f2, $f2, $f20       \r\n"
+            "punpcklbh $f4, $f4, $f20       \r\n"
+            "pmullh $f2, $f2, $f6           \r\n"
+            "pmullh $f4, $f4, $f8           \r\n"
+            "paddsh $f2, $f2, $f10          \r\n"
+            "paddsh $f2, $f2, $f4           \r\n"
+            "psrah $f2, $f2, $f12           \r\n"
+            "packushb $f2, $f2, $f20        \r\n"
+            "swc1 $f2, %0                   \r\n"
+            : "=m"(*dst)
+            : "m"(*src),"m"(*dst),"r"(weights),
+              "r"(weightd),"r"(offset),"r"(log2_denom+1)
+        );
+    }
+}
+
+static void inline chroma_inter_body_mmi(uint8_t *pix, int stride,
+        int alpha, int beta, int8_t *tc0)
+{
+    __asm__ volatile (
+        "xor $f16, $f16, $f16                           \r\n"
+        "mtc1 %[alpha], $f8                             \r\n"
+        "mtc1 %[beta], $f10                             \r\n"
+        "pshufh $f8, $f8, $f16                          \r\n"
+        "pshufh $f10, $f10, $f16                        \r\n"
+        "packushb $f8, $f8, $f8                         \r\n"
+        "packushb $f10, $f10, $f10                      \r\n"
+        "psubusb $f12, $f4, $f2                         \r\n"
+        "psubusb $f14, $f2, $f4                         \r\n"
+        "or $f14, $f14, $f12                            \r\n"
+        "psubusb $f14, $f14, $f8                        \r\n"
+        "psubusb $f12, $f2, $f0                         \r\n"
+        "psubusb $f8, $f0, $f2                          \r\n"
+        "or $f8, $f8, $f12                              \r\n"
+        "psubusb $f8, $f8, $f10                         \r\n"
+        "or $f14, $f14, $f8                             \r\n"
+        "psubusb $f12, $f4, $f6                         \r\n"
+        "psubusb $f8, $f6, $f4                          \r\n"
+        "or $f8, $f8, $f12                              \r\n"
+        "psubusb $f8, $f8, $f10                         \r\n"
+        "or $f14, $f14, $f8                             \r\n"
+        "xor $f12, $f12, $f12                           \r\n"
+        "pcmpeqb $f14, $f14, $f12                       \r\n"
+        "lwc1 $f12, 0x0(%[tc0])                         \r\n"
+        "punpcklbh $f12, $f12, $f12                     \r\n"
+        "and $f14, $f14, $f12                           \r\n"
+        "pcmpeqb $f8, $f8, $f8                          \r\n"
+        "xor $f10, $f2, $f4                             \r\n"
+        "xor $f6, $f6, $f8                              \r\n"
+        "and $f10, $f10, %[ff_pb_1]                     \r\n"
+        "pavgb $f6, $f6, $f0                            \r\n"
+        "xor $f8, $f8, $f2                              \r\n"
+        "pavgb $f6, $f6, %[ff_pb_3]                     \r\n"
+        "pavgb $f8, $f8, $f4                            \r\n"
+        "pavgb $f6, $f6, $f10                           \r\n"
+        "paddusb $f6, $f6, $f8                          \r\n"
+        "psubusb $f12, %[ff_pb_A1], $f6                 \r\n"
+        "psubusb $f6, $f6, %[ff_pb_A1]                  \r\n"
+        "pminub $f12, $f12, $f14                        \r\n"
+        "pminub $f6, $f6, $f14                          \r\n"
+        "psubusb $f2, $f2, $f12                         \r\n"
+        "psubusb $f4, $f4, $f6                          \r\n"
+        "paddusb $f2, $f2, $f6                          \r\n"
+        "paddusb $f4, $f4, $f12                         \r\n"
+        ::[pix]"r"(pix),[stride]"r"((int64_t)stride),
+          [alpha]"r"((int64_t)alpha),[beta]"r"((int64_t)beta),[tc0]"r"(tc0),
+          [ff_pb_1]"f"(ff_pb_1),[ff_pb_3]"f"(ff_pb_3),[ff_pb_A1]"f"(ff_pb_A1)
+        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16"
+    );
+}
+
+static void inline chroma_intra_body_mmi(uint8_t *pix, int stride,
+        int alpha, int beta)
+{
+    __asm__ volatile (
+        "xor $f16, $f16, $f16                           \r\n"
+        "mtc1 %[alpha], $f8                             \r\n"
+        "mtc1 %[beta], $f10                             \r\n"
+        "pshufh $f8, $f8, $f16                          \r\n"
+        "pshufh $f10, $f10, $f16                        \r\n"
+        "packushb $f8, $f8, $f8                         \r\n"
+        "packushb $f10, $f10, $f10                      \r\n"
+        "psubusb $f12, $f4, $f2                         \r\n"
+        "psubusb $f14, $f2, $f4                         \r\n"
+        "or $f14, $f14, $f12                            \r\n"
+        "psubusb $f14, $f14, $f8                        \r\n"
+        "psubusb $f12, $f2, $f0                         \r\n"
+        "psubusb $f8, $f0, $f2                          \r\n"
+        "or $f8, $f8, $f12                              \r\n"
+        "psubusb $f8, $f8, $f10                         \r\n"
+        "or $f14, $f14, $f8                             \r\n"
+        "psubusb $f12, $f4, $f6                         \r\n"
+        "psubusb $f8, $f6, $f4                          \r\n"
+        "or $f8, $f8, $f12                              \r\n"
+        "psubusb $f8, $f8, $f10                         \r\n"
+        "or $f14, $f14, $f8                             \r\n"
+        "xor $f12, $f12, $f12                           \r\n"
+        "pcmpeqb $f14, $f14, $f12                       \r\n"
+        "mov.d $f10, $f2                                \r\n"
+        "mov.d $f12, $f4                                \r\n"
+        "xor $f8, $f2, $f6                              \r\n"
+        "and $f8, $f8, %[ff_pb_1]                       \r\n"
+        "pavgb $f2, $f2, $f6                            \r\n"
+        "psubusb $f2, $f2, $f8                          \r\n"
+        "pavgb $f2, $f2, $f0                            \r\n"
+        "xor $f8, $f4, $f0                              \r\n"
+        "and $f8, $f8, %[ff_pb_1]                       \r\n"
+        "pavgb $f4, $f4, $f0                            \r\n"
+        "psubusb $f4, $f4, $f8                          \r\n"
+        "pavgb $f4, $f4, $f6                            \r\n"
+        "psubb $f2, $f2, $f10                           \r\n"
+        "psubb $f4, $f4, $f12                           \r\n"
+        "and $f2, $f2, $f14                             \r\n"
+        "and $f4, $f4, $f14                             \r\n"
+        "paddb $f2, $f2, $f10                           \r\n"
+        "paddb $f4, $f4, $f12                           \r\n"
+        ::[pix]"r"(pix),[stride]"r"((int64_t)stride),
+          [alpha]"r"((int64_t)alpha),[beta]"r"((int64_t)beta),
+          [ff_pb_1]"f"(ff_pb_1)
+        : "$f0","$f2","$f4","$f8","$f10","$f12","$f14","$f16"
+    );
+}
+
+void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)
+{
+    __asm__ volatile (
+        "daddu $8, %[stride], %[stride]                 \r\n"
+        "xor $f16, $f16, $f16                           \r\n"
+        "daddu $9, %[stride], $8                        \r\n"
+        "daddiu %[alpha], %[alpha], -0x1                \r\n"
+        "dsubu $9, $0, $9                               \r\n"
+        "daddiu %[beta], %[beta], -0x1                  \r\n"
+        "daddu $9, $9, %[pix]                           \r\n"
+        "ldc1 $f4, 0x0(%[pix])                          \r\n"
+        "gsldxc1 $f0, 0x0($9, %[stride])                \r\n"
+        "gsldxc1 $f2, 0x0($9, $8)                       \r\n"
+        "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
+        "mtc1 %[alpha], $f8                             \r\n"
+        "mtc1 %[beta], $f10                             \r\n"
+        "pshufh $f8, $f8, $f16                          \r\n"
+        "pshufh $f10, $f10, $f16                        \r\n"
+        "packushb $f8, $f8, $f8                         \r\n"
+        "packushb $f10, $f10, $f10                      \r\n"
+        "psubusb $f12, $f4, $f2                         \r\n"
+        "psubusb $f14, $f2, $f4                         \r\n"
+        "or $f14, $f14, $f12                            \r\n"
+        "psubusb $f12, $f2, $f0                         \r\n"
+        "psubusb $f14, $f14, $f8                        \r\n"
+        "psubusb $f8, $f0, $f2                          \r\n"
+        "or $f8, $f8, $f12                              \r\n"
+        "psubusb $f12, $f4, $f6                         \r\n"
+        "psubusb $f8, $f8, $f10                         \r\n"
+        "or $f14, $f14, $f8                             \r\n"
+        "psubusb $f8, $f6, $f4                          \r\n"
+        "or $f8, $f8, $f12                              \r\n"
+        "psubusb $f8, $f8, $f10                         \r\n"
+        "or $f14, $f14, $f8                             \r\n"
+        "pcmpeqb $f14, $f14, $f16                       \r\n"
+        "pcmpeqb $f6, $f6, $f6                          \r\n"
+        "gslwlc1 $f8, 0x3(%[tc0])                       \r\n"
+        "gslwrc1 $f8, 0x0(%[tc0])                       \r\n"
+        "punpcklbh $f8, $f8, $f8                        \r\n"
+        "punpcklbh $f18, $f8, $f8                       \r\n"
+        "pcmpgtb $f8, $f18, $f6                         \r\n"
+        "ldc1 $f6, 0x0($9)                              \r\n"
+        "and $f20, $f8, $f14                            \r\n"
+        "psubusb $f14, $f6, $f2                         \r\n"
+        "psubusb $f12, $f2, $f6                         \r\n"
+        "psubusb $f14, $f14, $f10                       \r\n"
+        "psubusb $f12, $f12, $f10                       \r\n"
+        "pcmpeqb $f12, $f12, $f14                       \r\n"
+        "and $f12, $f12, $f20                           \r\n"
+        "and $f8, $f20, $f18                            \r\n"
+        "psubb $f14, $f8, $f12                          \r\n"
+        "and $f12, $f12, $f8                            \r\n"
+        "pavgb $f8, $f2, $f4                            \r\n"
+        "ldc1 $f22, 0x0($9)                             \r\n"
+        "pavgb $f6, $f6, $f8                            \r\n"
+        "xor $f8, $f8, $f22                             \r\n"
+        "and $f8, $f8, %[ff_pb_1]                       \r\n"
+        "psubusb $f6, $f6, $f8                          \r\n"
+        "psubusb $f8, $f0, $f12                         \r\n"
+        "paddusb $f12, $f12, $f0                        \r\n"
+        "pmaxub $f6, $f6, $f8                           \r\n"
+        "pminub $f6, $f6, $f12                          \r\n"
+        "gssdxc1 $f6, 0x0($9, %[stride])                \r\n"
+        "gsldxc1 $f8, 0x0(%[pix], $8)                   \r\n"
+        "psubusb $f6, $f8, $f4                          \r\n"
+        "psubusb $f12, $f4, $f8                         \r\n"
+        "psubusb $f6, $f6, $f10                         \r\n"
+        "psubusb $f12, $f12, $f10                       \r\n"
+        "pcmpeqb $f12, $f12, $f6                        \r\n"
+        "and $f12, $f12, $f20                           \r\n"
+        "psubb $f14, $f14, $f12                         \r\n"
+        "and $f10, $f18, $f12                           \r\n"
+        "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
+        "pavgb $f12, $f2, $f4                           \r\n"
+        "gsldxc1 $f22, 0x0(%[pix], $8)                  \r\n"
+        "pavgb $f8, $f8, $f12                           \r\n"
+        "xor $f12, $f12, $f22                           \r\n"
+        "and $f12, $f12, %[ff_pb_1]                     \r\n"
+        "psubusb $f8, $f8, $f12                         \r\n"
+        "psubusb $f12, $f6, $f10                        \r\n"
+        "paddusb $f10, $f10, $f6                        \r\n"
+        "pmaxub $f8, $f8, $f12                          \r\n"
+        "pminub $f8, $f8, $f10                          \r\n"
+        "gssdxc1 $f8, 0x0(%[pix], %[stride])            \r\n"
+        "xor $f10, $f2, $f4                             \r\n"
+        "pcmpeqb $f8, $f8, $f8                          \r\n"
+        "and $f10, $f10, %[ff_pb_1]                     \r\n"
+        "xor $f6, $f6, $f8                              \r\n"
+        "xor $f8, $f8, $f2                              \r\n"
+        "pavgb $f6, $f6, $f0                            \r\n"
+        "pavgb $f6, $f6, %[ff_pb_3]                     \r\n"
+        "pavgb $f8, $f8, $f4                            \r\n"
+        "pavgb $f6, $f6, $f10                           \r\n"
+        "paddusb $f6, $f6, $f8                          \r\n"
+        "psubusb $f12, %[ff_pb_A1], $f6                 \r\n"
+        "psubusb $f6, $f6, %[ff_pb_A1]                  \r\n"
+        "pminub $f12, $f12, $f14                        \r\n"
+        "pminub $f6, $f6, $f14                          \r\n"
+        "psubusb $f2, $f2, $f12                         \r\n"
+        "psubusb $f4, $f4, $f6                          \r\n"
+        "paddusb $f2, $f2, $f6                          \r\n"
+        "paddusb $f4, $f4, $f12                         \r\n"
+        "gssdxc1 $f2, 0x0($9, $8)                       \r\n"
+        "sdc1 $f4, 0x0(%[pix])                          \r\n"
+        ::[pix]"r"(pix),[stride]"r"((int64_t)stride),
+          [alpha]"r"((int64_t)alpha),[beta]"r"((int64_t)beta),[tc0]"r"(tc0),
+          [ff_pb_1]"f"(ff_pb_1),[ff_pb_3]"f"(ff_pb_3),[ff_pb_A1]"f"(ff_pb_A1)
+        : "$8","$9","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
+          "$f18","$f20","$f22"
+    );
+}
+
+void ff_deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)
+{
+    uint64_t stack[0xa];
+
+    __asm__ volatile (
+        "ori $8, $0, 0x1                                \r\n"
+        "xor $f30, $f30, $f30                           \r\n"
+        "dmtc1 $8, $f16                                 \r\n"
+        "dsll $8, %[stride], 2                          \r\n"
+        "daddu $10, %[stride], %[stride]                \r\n"
+        "daddiu %[alpha], %[alpha], -0x1                \r\n"
+        "dsll $f20, $f16, $f16                          \r\n"
+        "bltz %[alpha], 1f                              \r\n"
+        "daddu $9, $10, %[stride]                       \r\n"
+        "daddiu %[beta], %[beta], -0x1                  \r\n"
+        "bltz %[beta], 1f                               \r\n"
+        "dsubu $8, $0, $8                               \r\n"
+        "daddu $8, $8, %[pix]                           \r\n"
+        "ldc1 $f4, 0x0(%[pix])                          \r\n"
+        "gsldxc1 $f0, 0x0($8, $10)                      \r\n"
+        "gsldxc1 $f2, 0x0($8, $9)                       \r\n"
+        "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
+        "mtc1 %[alpha], $f8                             \r\n"
+        "mtc1 %[beta], $f10                             \r\n"
+        "pshufh $f8, $f8, $f30                          \r\n"
+        "pshufh $f10, $f10, $f30                        \r\n"
+        "packushb $f8, $f8, $f8                         \r\n"
+        "psubusb $f12, $f4, $f2                         \r\n"
+        "psubusb $f14, $f2, $f4                         \r\n"
+        "packushb $f10, $f10, $f10                      \r\n"
+        "or $f14, $f14, $f12                            \r\n"
+        "sdc1 $f8, 0x10+%[stack]                        \r\n"
+        "psubusb $f14, $f14, $f8                        \r\n"
+        "psubusb $f12, $f2, $f0                         \r\n"
+        "psubusb $f8, $f0, $f2                          \r\n"
+        "or $f8, $f8, $f12                              \r\n"
+        "psubusb $f8, $f8, $f10                         \r\n"
+        "or $f14, $f14, $f8                             \r\n"
+        "psubusb $f12, $f4, $f6                         \r\n"
+        "psubusb $f8, $f6, $f4                          \r\n"
+        "or $f8, $f8, $f12                              \r\n"
+        "psubusb $f8, $f8, $f10                         \r\n"
+        "or $f14, $f14, $f8                             \r\n"
+        "xor $f12, $f12, $f12                           \r\n"
+        "ldc1 $f8, 0x10+%[stack]                        \r\n"
+        "pcmpeqb $f14, $f14, $f12                       \r\n"
+        "sdc1 $f14, 0x20+%[stack]                       \r\n"
+        "pavgb $f8, $f8, $f30                           \r\n"
+        "psubusb $f14, $f4, $f2                         \r\n"
+        "pavgb $f8, $f8, %[ff_pb_1]                     \r\n"
+        "psubusb $f12, $f2, $f4                         \r\n"
+        "psubusb $f14, $f14, $f8                        \r\n"
+        "psubusb $f12, $f12, $f8                        \r\n"
+        "ldc1 $f28, 0x20+%[stack]                       \r\n"
+        "pcmpeqb $f12, $f12, $f14                       \r\n"
+        "and $f12, $f12, $f28                           \r\n"
+        "gsldxc1 $f28, 0x0($8, %[stride])               \r\n"
+        "psubusb $f14, $f28, $f2                        \r\n"
+        "psubusb $f8, $f2, $f28                         \r\n"
+        "psubusb $f14, $f14, $f10                       \r\n"
+        "psubusb $f8, $f8, $f10                         \r\n"
+        "pcmpeqb $f8, $f8, $f14                         \r\n"
+        "and $f8, $f8, $f12                             \r\n"
+        "gsldxc1 $f26, 0x0(%[pix], $10)                 \r\n"
+        "sdc1 $f8, 0x30+%[stack]                        \r\n"
+        "psubusb $f14, $f26, $f4                        \r\n"
+        "psubusb $f8, $f4, $f26                         \r\n"
+        "psubusb $f14, $f14, $f10                       \r\n"
+        "psubusb $f8, $f8, $f10                         \r\n"
+        "pcmpeqb $f8, $f8, $f14                         \r\n"
+        "and $f8, $f8, $f12                             \r\n"
+        "sdc1 $f8, 0x40+%[stack]                        \r\n"
+        "pavgb $f8, $f28, $f0                           \r\n"
+        "pavgb $f10, $f2, $f4                           \r\n"
+        "pavgb $f8, $f8, $f10                           \r\n"
+        "sdc1 $f10, 0x10+%[stack]                       \r\n"
+        "paddb $f12, $f28, $f0                          \r\n"
+        "paddb $f14, $f2, $f4                           \r\n"
+        "paddb $f12, $f12, $f14                         \r\n"
+        "mov.d $f14, $f12                               \r\n"
+        "sdc1 $f12, 0x0+%[stack]                        \r\n"
+        "psrlh $f12, $f12, $f16                         \r\n"
+        "pavgb $f12, $f12, $f30                         \r\n"
+        "xor $f12, $f12, $f8                            \r\n"
+        "and $f12, $f12, %[ff_pb_1]                     \r\n"
+        "psubb $f8, $f8, $f12                           \r\n"
+        "pavgb $f10, $f28, $f6                          \r\n"
+        "psubb $f12, $f28, $f6                          \r\n"
+        "paddb $f14, $f14, $f14                         \r\n"
+        "psubb $f14, $f14, $f12                         \r\n"
+        "and $f12, $f12, %[ff_pb_1]                     \r\n"
+        "psubb $f10, $f10, $f12                         \r\n"
+        "ldc1 $f24, 0x10+%[stack]                       \r\n"
+        "pavgb $f10, $f10, $f0                          \r\n"
+        "psrlh $f14, $f14, $f20                         \r\n"
+        "pavgb $f10, $f10, $f24                         \r\n"
+        "pavgb $f14, $f14, $f30                         \r\n"
+        "xor $f14, $f14, $f10                           \r\n"
+        "and $f14, $f14, %[ff_pb_1]                     \r\n"
+        "psubb $f10, $f10, $f14                         \r\n"
+        "xor $f14, $f2, $f6                             \r\n"
+        "pavgb $f12, $f2, $f6                           \r\n"
+        "and $f14, $f14, %[ff_pb_1]                     \r\n"
+        "psubb $f12, $f12, $f14                         \r\n"
+        "ldc1 $f24, 0x30+%[stack]                       \r\n"
+        "pavgb $f12, $f12, $f0                          \r\n"
+        "ldc1 $f22, 0x20+%[stack]                       \r\n"
+        "xor $f10, $f10, $f12                           \r\n"
+        "xor $f12, $f12, $f2                            \r\n"
+        "and $f10, $f10, $f24                           \r\n"
+        "and $f12, $f12, $f22                           \r\n"
+        "xor $f10, $f10, $f12                           \r\n"
+        "xor $f10, $f10, $f2                            \r\n"
+        "gssdxc1 $f10, 0x0($8, $9)                      \r\n"
+        "ldc1 $f10, 0x0($8)                             \r\n"
+        "paddb $f12, $f28, $f10                         \r\n"
+        "pavgb $f10, $f10, $f28                         \r\n"
+        "ldc1 $f22, 0x0+%[stack]                        \r\n"
+        "pavgb $f10, $f10, $f8                          \r\n"
+        "paddb $f12, $f12, $f12                         \r\n"
+        "paddb $f12, $f12, $f22                         \r\n"
+        "psrlh $f12, $f12, $f20                         \r\n"
+        "pavgb $f12, $f12, $f30                         \r\n"
+        "xor $f12, $f12, $f10                           \r\n"
+        "and $f12, $f12, %[ff_pb_1]                     \r\n"
+        "ldc1 $f22, 0x30+%[stack]                       \r\n"
+        "psubb $f10, $f10, $f12                         \r\n"
+        "xor $f8, $f8, $f0                              \r\n"
+        "xor $f10, $f10, $f28                           \r\n"
+        "and $f8, $f8, $f22                             \r\n"
+        "and $f10, $f10, $f22                           \r\n"
+        "xor $f8, $f8, $f0                              \r\n"
+        "xor $f10, $f10, $f28                           \r\n"
+        "gssdxc1 $f8, 0x0($8, $10)                      \r\n"
+        "gssdxc1 $f10, 0x0($8, %[stride])               \r\n"
+        "pavgb $f8, $f26, $f6                           \r\n"
+        "pavgb $f10, $f4, $f2                           \r\n"
+        "pavgb $f8, $f8, $f10                           \r\n"
+        "sdc1 $f10, 0x10+%[stack]                       \r\n"
+        "paddb $f12, $f26, $f6                          \r\n"
+        "paddb $f14, $f4, $f2                           \r\n"
+        "paddb $f12, $f12, $f14                         \r\n"
+        "mov.d $f14, $f12                               \r\n"
+        "sdc1 $f12, 0x0+%[stack]                        \r\n"
+        "psrlh $f12, $f12, $f16                         \r\n"
+        "pavgb $f12, $f12, $f30                         \r\n"
+        "xor $f12, $f12, $f8                            \r\n"
+        "and $f12, $f12, %[ff_pb_1]                     \r\n"
+        "psubb $f8, $f8, $f12                           \r\n"
+        "pavgb $f10, $f26, $f0                          \r\n"
+        "paddb $f14, $f14, $f14                         \r\n"
+        "psubb $f12, $f26, $f0                          \r\n"
+        "psubb $f14, $f14, $f12                         \r\n"
+        "and $f12, $f12, %[ff_pb_1]                     \r\n"
+        "psubb $f10, $f10, $f12                         \r\n"
+        "ldc1 $f22, 0x10+%[stack]                       \r\n"
+        "pavgb $f10, $f10, $f6                          \r\n"
+        "pavgb $f10, $f10, $f22                         \r\n"
+        "psrlh $f14, $f14, $f20                         \r\n"
+        "pavgb $f14, $f14, $f30                         \r\n"
+        "xor $f14, $f14, $f10                           \r\n"
+        "and $f14, $f14, %[ff_pb_1]                     \r\n"
+        "psubb $f10, $f10, $f14                         \r\n"
+        "xor $f14, $f4, $f0                             \r\n"
+        "pavgb $f12, $f4, $f0                           \r\n"
+        "and $f14, $f14, %[ff_pb_1]                     \r\n"
+        "ldc1 $f22, 0x40+%[stack]                       \r\n"
+        "psubb $f12, $f12, $f14                         \r\n"
+        "ldc1 $f24, 0x20+%[stack]                       \r\n"
+        "pavgb $f12, $f12, $f6                          \r\n"
+        "xor $f10, $f10, $f12                           \r\n"
+        "xor $f12, $f12, $f4                            \r\n"
+        "and $f10, $f10, $f22                           \r\n"
+        "and $f12, $f12, $f24                           \r\n"
+        "xor $f10, $f10, $f12                           \r\n"
+        "xor $f10, $f10, $f4                            \r\n"
+        "sdc1 $f10, 0x0(%[pix])                         \r\n"
+        "gsldxc1 $f10, 0x0(%[pix], $9)                  \r\n"
+        "paddb $f12, $f26, $f10                         \r\n"
+        "pavgb $f10, $f10, $f26                         \r\n"
+        "ldc1 $f22, 0x0+%[stack]                        \r\n"
+        "pavgb $f10, $f10, $f8                          \r\n"
+        "paddb $f12, $f12, $f12                         \r\n"
+        "paddb $f12, $f12, $f22                         \r\n"
+        "psrlh $f12, $f12, $f20                         \r\n"
+        "pavgb $f12, $f12, $f30                         \r\n"
+        "xor $f12, $f12, $f10                           \r\n"
+        "and $f12, $f12, %[ff_pb_1]                     \r\n"
+        "ldc1 $f22, 0x40+%[stack]                       \r\n"
+        "psubb $f10, $f10, $f12                         \r\n"
+        "xor $f8, $f8, $f6                              \r\n"
+        "xor $f10, $f10, $f26                           \r\n"
+        "and $f8, $f8, $f22                             \r\n"
+        "and $f10, $f10, $f22                           \r\n"
+        "xor $f8, $f8, $f6                              \r\n"
+        "xor $f10, $f10, $f26                           \r\n"
+        "gssdxc1 $f8, 0x0(%[pix], %[stride])            \r\n"
+        "gssdxc1 $f10, 0x0(%[pix], $10)                 \r\n"
+        "1:                                             \r\n"
+        ::[pix]"r"(pix),[stride]"r"((int64_t)stride),
+          [alpha]"r"((int64_t)alpha),[beta]"r"((int64_t)beta),
+          [stack]"m"(stack[0]),[ff_pb_1]"f"(ff_pb_1)
+        : "$8","$9","$10","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14",
+          "$f16","$f18","$f20","$f22","$f24","$f26","$f28","$f30"
+    );
+}
+
+void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)
+{
+    __asm__ volatile (
+        "daddiu %[alpha], %[alpha], -0x1                \r\n"
+        "daddiu %[beta], %[beta], -0x1                  \r\n"
+        "or $16, $0, %[pix]                             \r\n"
+        "dsubu $16, $16, %[stride]                      \r\n"
+        "dsubu $16, $16, %[stride]                      \r\n"
+        "ldc1 $f0, 0x0($16)                             \r\n"
+        "gsldxc1 $f2, 0x0($16, %[stride])               \r\n"
+        "ldc1 $f4, 0x0(%[pix])                          \r\n"
+        "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
+        : [pix]"+r"(pix),[stride]"+r"(stride),[alpha]"+r"(alpha),
+          [beta]"+r"(beta)
+        : [tc0]"r"(tc0)
+        : "$16","$f2","$f4"
+    );
+
+    chroma_inter_body_mmi(pix, stride, alpha, beta, tc0);
+
+    __asm__ volatile (
+        "gssdxc1 $f2, 0x0($16, %[stride])               \r\n"
+        "sdc1 $f4, 0x0(%[pix])                          \r\n"
+        ::[pix]"r"(pix),[stride]"r"((int64_t)stride)
+        : "$16","$f2","$f4"
+    );
+}
+
+void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)
+{
+    __asm__ volatile (
+        "daddiu %[alpha], %[alpha], -0x1                \r\n"
+        "daddiu %[beta], %[beta], -0x1                  \r\n"
+        "or $16, $0, %[pix]                             \r\n"
+        "dsubu $16, $16, %[stride]                      \r\n"
+        "dsubu $16, $16, %[stride]                      \r\n"
+        "ldc1 $f0, 0x0($16)                             \r\n"
+        "gsldxc1 $f2, 0x0($16, %[stride])               \r\n"
+        "ldc1 $f4, 0x0(%[pix])                          \r\n"
+        "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
+        : [pix]"+r"(pix),[stride]"+r"(stride),[alpha]"+r"(alpha),
+          [beta]"+r"(beta)
+        ::"$16","$f0","$f2","$f4","$f6"
+    );
+
+    chroma_intra_body_mmi(pix, stride, alpha, beta);
+
+    __asm__ volatile (
+        "gssdxc1 $f2, 0x0($16, %[stride])               \r\n"
+        "sdc1 $f4, 0x0(%[pix])                          \r\n"
+        ::[pix]"r"(pix),[stride]"r"((int64_t)stride)
+        : "$16","$f2","$f4"
+    );
+}
+
+void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)
+{
+    __asm__ volatile (
+        "daddiu %[alpha], %[alpha], -0x1                \r\n"
+        "daddiu %[beta], %[beta], -0x1                  \r\n"
+        "daddu $16, %[stride], %[stride]                \r\n"
+        "daddiu %[pix], %[pix], -0x2                    \r\n"
+        "daddu $17, $16, %[stride]                      \r\n"
+        "daddu $19, $16, $16                            \r\n"
+        "or $18, $0, %[pix]                             \r\n"
+        "daddu %[pix], %[pix], $17                      \r\n"
+        "gslwlc1 $f0, 0x3($18)                          \r\n"
+        "daddu $12, $18, %[stride]                      \r\n"
+        "gslwrc1 $f0, 0x0($18)                          \r\n"
+        "gslwlc1 $f4, 0x3($12)                          \r\n"
+        "daddu $13, $18, $16                            \r\n"
+        "gslwrc1 $f4, 0x0($12)                          \r\n"
+        "gslwlc1 $f2, 0x3($13)                          \r\n"
+        "gslwrc1 $f2, 0x0($13)                          \r\n"
+        "gslwlc1 $f6, 0x3(%[pix])                       \r\n"
+        "gslwrc1 $f6, 0x0(%[pix])                       \r\n"
+        "punpcklbh $f0, $f0, $f4                        \r\n"
+        "punpcklbh $f2, $f2, $f6                        \r\n"
+        "daddu $12, %[pix], %[stride]                   \r\n"
+        "punpckhhw $f4, $f0, $f2                        \r\n"
+        "punpcklhw $f0, $f0, $f2                        \r\n"
+        "gslwlc1 $f8, 0x3($12)                          \r\n"
+        "daddu $13, %[pix], $16                         \r\n"
+        "gslwrc1 $f8, 0x0($12)                          \r\n"
+        "gslwlc1 $f12, 0x3($13)                         \r\n"
+        "daddu $12, %[pix], $17                         \r\n"
+        "gslwrc1 $f12, 0x0($13)                         \r\n"
+        "gslwlc1 $f10, 0x3($12)                         \r\n"
+        "daddu $13, %[pix], $19                         \r\n"
+        "gslwrc1 $f10, 0x0($12)                         \r\n"
+        "gslwlc1 $f14, 0x3($13)                         \r\n"
+        "gslwrc1 $f14, 0x0($13)                         \r\n"
+        "punpcklbh $f8, $f8, $f12                       \r\n"
+        "punpcklbh $f10, $f10, $f14                     \r\n"
+        "mov.d $f12, $f8                                \r\n"
+        "punpcklhw $f8, $f8, $f10                       \r\n"
+        "punpckhhw $f12, $f12, $f10                     \r\n"
+        "punpckhwd $f2, $f0, $f8                        \r\n"
+        "punpckhwd $f6, $f4, $f12                       \r\n"
+        "punpcklwd $f0, $f0, $f8                        \r\n"
+        "punpcklwd $f4, $f4, $f12                       \r\n"
+        "mov.d $f20, $f0                                \r\n"
+        "mov.d $f22, $f6                                \r\n"
+        : [pix]"+r"(pix),[stride]"+r"(stride),[alpha]"+r"(alpha),
+          [beta]"+r"(beta)
+        ::"$12","$13","$16","$17","$18","$19","$f0","$f2","$f4","$f6","$f8",
+          "$f10","$f12","$f14","$f20","$f22"
+    );
+
+    chroma_inter_body_mmi(pix, stride, alpha, beta, tc0);
+
+    __asm__ volatile (
+        "punpckhwd $f8, $f20, $f20                      \r\n"
+        "punpckhwd $f10, $f2, $f2                       \r\n"
+        "punpckhwd $f12, $f4, $f4                       \r\n"
+        "punpcklbh $f0, $f20, $f2                       \r\n"
+        "punpcklbh $f4, $f4, $f22                       \r\n"
+        "punpcklhw $f2, $f0, $f4                        \r\n"
+        "punpckhhw $f0, $f0, $f4                        \r\n"
+        "gsswlc1 $f2, 0x3($18)                          \r\n"
+        "gsswrc1 $f2, 0x0($18)                          \r\n"
+        "daddu $12, $18, %[stride]                      \r\n"
+        "punpckhwd $f2, $f2, $f2                        \r\n"
+        "gsswlc1 $f2, 0x3($12)                          \r\n"
+        "daddu $13, $18, $16                            \r\n"
+        "gsswrc1 $f2, 0x0($12)                          \r\n"
+        "gsswlc1 $f0, 0x3($13)                          \r\n"
+        "gsswrc1 $f0, 0x0($13)                          \r\n"
+        "punpckhwd $f0, $f0, $f0                        \r\n"
+        "punpckhwd $f6, $f22, $f22                      \r\n"
+        "gsswlc1 $f0, 0x3(%[pix])                       \r\n"
+        "gsswrc1 $f0, 0x0(%[pix])                       \r\n"
+        "punpcklbh $f8, $f8, $f10                       \r\n"
+        "punpcklbh $f12, $f12, $f6                      \r\n"
+        "daddu $12, %[pix], %[stride]                   \r\n"
+        "punpcklhw $f10, $f8, $f12                      \r\n"
+        "punpckhhw $f8, $f8, $f12                       \r\n"
+        "gsswlc1 $f10, 0x3($12)                         \r\n"
+        "gsswrc1 $f10, 0x0($12)                         \r\n"
+        "punpckhwd $f10, $f10, $f10                     \r\n"
+        "daddu $12, %[pix], $16                         \r\n"
+        "daddu $13, %[pix], $17                         \r\n"
+        "gsswlc1 $f10, 0x3($12)                         \r\n"
+        "gsswrc1 $f10, 0x0($12)                         \r\n"
+        "gsswlc1 $f8, 0x3($13)                          \r\n"
+        "daddu $12, %[pix], $19                         \r\n"
+        "punpckhwd $f20, $f8, $f8                       \r\n"
+        "gsswrc1 $f8, 0x0($13)                          \r\n"
+        "gsswlc1 $f20, 0x3($12)                         \r\n"
+        "gsswrc1 $f20, 0x0($12)                         \r\n"
+        ::[pix]"r"(pix),[stride]"r"((int64_t)stride)
+        : "$12","$13","$16","$17","$18","$19","$f0","$f2","$f4","$f6","$f8",
+          "$f10","$f12","$f20"
+    );
+}
+
+void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)
+{
+    __asm__ volatile (
+        "daddiu %[alpha], %[alpha], -0x1                \r\n"
+        "daddiu %[beta], %[beta], -0x1                  \r\n"
+        "daddu $16, %[stride], %[stride]                \r\n"
+        "daddiu %[pix], %[pix], -0x2                    \r\n"
+        "daddu $17, $16, %[stride]                      \r\n"
+        "daddu $19, $16, $16                            \r\n"
+        "or $18, $0, %[pix]                             \r\n"
+        "daddu %[pix], %[pix], $17                      \r\n"
+        "gslwlc1 $f0, 0x3($18)                          \r\n"
+        "daddu $12, $18, %[stride]                      \r\n"
+        "gslwrc1 $f0, 0x0($18)                          \r\n"
+        "gslwlc1 $f4, 0x3($12)                          \r\n"
+        "daddu $13, $18, $16                            \r\n"
+        "gslwrc1 $f4, 0x0($12)                          \r\n"
+        "gslwlc1 $f2, 0x3($13)                          \r\n"
+        "gslwrc1 $f2, 0x0($13)                          \r\n"
+        "gslwlc1 $f6, 0x3(%[pix])                       \r\n"
+        "gslwrc1 $f6, 0x0(%[pix])                       \r\n"
+        "punpcklbh $f0, $f0, $f4                        \r\n"
+        "punpcklbh $f2, $f2, $f6                        \r\n"
+        "daddu $12, %[pix], %[stride]                   \r\n"
+        "punpckhhw $f4, $f0, $f2                        \r\n"
+        "punpcklhw $f0, $f0, $f2                        \r\n"
+        "gslwlc1 $f8, 0x3($12)                          \r\n"
+        "daddu $13, %[pix], $16                         \r\n"
+        "gslwrc1 $f8, 0x0($12)                          \r\n"
+        "gslwlc1 $f12, 0x3($13)                         \r\n"
+        "daddu $12, %[pix], $17                         \r\n"
+        "gslwrc1 $f12, 0x0($13)                         \r\n"
+        "gslwlc1 $f10, 0x3($12)                         \r\n"
+        "daddu $13, %[pix], $19                         \r\n"
+        "gslwrc1 $f10, 0x0($12)                         \r\n"
+        "gslwlc1 $f14, 0x3($13)                         \r\n"
+        "gslwrc1 $f14, 0x0($13)                         \r\n"
+        "punpcklbh $f8, $f8, $f12                       \r\n"
+        "punpcklbh $f10, $f10, $f14                     \r\n"
+        "mov.d $f12, $f8                                \r\n"
+        "punpcklhw $f8, $f8, $f10                       \r\n"
+        "punpckhhw $f12, $f12, $f10                     \r\n"
+        "punpckhwd $f2, $f0, $f8                        \r\n"
+        "punpckhwd $f6, $f4, $f12                       \r\n"
+        "punpcklwd $f0, $f0, $f8                        \r\n"
+        "punpcklwd $f4, $f4, $f12                       \r\n"
+        : [pix]"+r"(pix),[stride]"+r"(stride),[alpha]"+r"(alpha),
+          [beta]"+r"(beta)
+        ::"$12","$13","$16","$17","$18","$19","$f0","$f2","$f4","$f6","$f8",
+          "$f10","$f12","$f14","$f20","$f22"
+    );
+
+    chroma_intra_body_mmi(pix, stride, alpha, beta);
+
+    __asm__ volatile (
+        "punpckhwd $f8, $f0, $f0                        \r\n"
+        "punpckhwd $f10, $f2, $f2                       \r\n"
+        "punpckhwd $f12, $f4, $f4                       \r\n"
+        "punpcklbh $f0, $f0, $f2                        \r\n"
+        "punpcklbh $f4, $f4, $f6                        \r\n"
+        "punpcklhw $f2, $f0, $f4                        \r\n"
+        "punpckhhw $f0, $f0, $f4                        \r\n"
+        "gsswlc1 $f2, 0x3($18)                          \r\n"
+        "gsswrc1 $f2, 0x0($18)                          \r\n"
+        "daddu $12, $18, %[stride]                      \r\n"
+        "punpckhwd $f2, $f2, $f2                        \r\n"
+        "gsswlc1 $f2, 0x3($12)                          \r\n"
+        "daddu $13, $18, $16                            \r\n"
+        "gsswrc1 $f2, 0x0($12)                          \r\n"
+        "gsswlc1 $f0, 0x3($13)                          \r\n"
+        "gsswrc1 $f0, 0x0($13)                          \r\n"
+        "punpckhwd $f0, $f0, $f0                        \r\n"
+        "punpckhwd $f6, $f6, $f6                        \r\n"
+        "gsswlc1 $f0, 0x3(%[pix])                       \r\n"
+        "gsswrc1 $f0, 0x0(%[pix])                       \r\n"
+        "punpcklbh $f8, $f8, $f10                       \r\n"
+        "punpcklbh $f12, $f12, $f6                      \r\n"
+        "daddu $12, %[pix], %[stride]                   \r\n"
+        "punpcklhw $f10, $f8, $f12                      \r\n"
+        "punpckhhw $f8, $f8, $f12                       \r\n"
+        "gsswlc1 $f10, 0x3($12)                         \r\n"
+        "gsswrc1 $f10, 0x0($12)                         \r\n"
+        "punpckhwd $f10, $f10, $f10                     \r\n"
+        "daddu $12, %[pix], $16                         \r\n"
+        "daddu $13, %[pix], $17                         \r\n"
+        "gsswlc1 $f10, 0x3($12)                         \r\n"
+        "gsswrc1 $f10, 0x0($12)                         \r\n"
+        "gsswlc1 $f8, 0x3($13)                          \r\n"
+        "daddu $12, %[pix], $19                         \r\n"
+        "punpckhwd $f20, $f8, $f8                       \r\n"
+        "gsswrc1 $f8, 0x0($13)                          \r\n"
+        "gsswlc1 $f20, 0x3($12)                         \r\n"
+        "gsswrc1 $f20, 0x0($12)                         \r\n"
+        ::[pix]"r"(pix),[stride]"r"((int64_t)stride)
+        : "$12","$13","$16","$17","$18","$19","$f0","$f2","$f4","$f6","$f8",
+          "$f10","$f12","$f20"
+    );
+}
+
+void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)
+{
+    if ((tc0[0] & tc0[1]) >= 0)
+        ff_deblock_v8_luma_8_mmi(pix + 0, stride, alpha, beta, tc0);
+    if ((tc0[2] & tc0[3]) >= 0)
+        ff_deblock_v8_luma_8_mmi(pix + 8, stride, alpha, beta, tc0 + 2);
+}
+
+void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)
+{
+    ff_deblock_v8_luma_intra_8_mmi(pix + 0, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmi(pix + 8, stride, alpha, beta);
+}
+
+void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)
+{
+    uint64_t stack[0xd];
+
+    __asm__ volatile (
+        "daddu $15, %[stride], %[stride]                \r\n"
+        "daddiu $8, %[pix], -0x4                        \r\n"
+        "daddu $9, %[stride], $15                       \r\n"
+        "gsldlc1 $f0, 0x7($8)                           \r\n"
+        "gsldrc1 $f0, 0x0($8)                           \r\n"
+        "daddu $12, $8, %[stride]                       \r\n"
+        "daddu $10, $8, $9                              \r\n"
+        "gsldlc1 $f2, 0x7($12)                          \r\n"
+        "daddu $11, $8, $15                             \r\n"
+        "gsldrc1 $f2, 0x0($12)                          \r\n"
+        "gsldlc1 $f4, 0x7($11)                          \r\n"
+        "gsldrc1 $f4, 0x0($11)                          \r\n"
+        "gsldlc1 $f6, 0x7($10)                          \r\n"
+        "daddu $12, $10, %[stride]                      \r\n"
+        "gsldrc1 $f6, 0x0($10)                          \r\n"
+        "gsldlc1 $f8, 0x7($12)                          \r\n"
+        "daddu $11, $10, $15                            \r\n"
+        "gsldrc1 $f8, 0x0($12)                          \r\n"
+        "gsldlc1 $f10, 0x7($11)                         \r\n"
+        "daddu $12, $10, $9                             \r\n"
+        "gsldrc1 $f10, 0x0($11)                         \r\n"
+        "gsldlc1 $f12, 0x7($12)                         \r\n"
+        "gsldrc1 $f12, 0x0($12)                         \r\n"
+        "daddu $14, $15, $15                            \r\n"
+        "punpckhbh $f14, $f0, $f2                       \r\n"
+        "punpcklbh $f0, $f0, $f2                        \r\n"
+        "punpckhbh $f2, $f4, $f6                        \r\n"
+        "punpcklbh $f4, $f4, $f6                        \r\n"
+        "punpckhbh $f6, $f8, $f10                       \r\n"
+        "punpcklbh $f8, $f8, $f10                       \r\n"
+        "daddu $12, $10, $14                            \r\n"
+        "sdc1 $f2, 0x10+%[stack]                        \r\n"
+        "gsldlc1 $f16, 0x7($12)                         \r\n"
+        "gsldrc1 $f16, 0x0($12)                         \r\n"
+        "daddu $13, $14, $14                            \r\n"
+        "punpckhbh $f10, $f12, $f16                     \r\n"
+        "punpcklbh $f12, $f12, $f16                     \r\n"
+        "punpckhhw $f2, $f0, $f4                        \r\n"
+        "punpcklhw $f0, $f0, $f4                        \r\n"
+        "punpckhhw $f4, $f8, $f12                       \r\n"
+        "punpcklhw $f8, $f8, $f12                       \r\n"
+        "ldc1 $f16, 0x10+%[stack]                       \r\n"
+        "punpckhwd $f0, $f0, $f8                        \r\n"
+        "sdc1 $f0, 0x0+%[stack]                         \r\n"
+        "punpckhhw $f12, $f14, $f16                     \r\n"
+        "punpcklhw $f14, $f14, $f16                     \r\n"
+        "punpckhhw $f0, $f6, $f10                       \r\n"
+        "punpcklhw $f6, $f6, $f10                       \r\n"
+        "punpcklwd $f12, $f12, $f0                      \r\n"
+        "punpckhwd $f10, $f14, $f6                      \r\n"
+        "punpcklwd $f14, $f14, $f6                      \r\n"
+        "punpckhwd $f6, $f2, $f4                        \r\n"
+        "punpcklwd $f2, $f2, $f4                        \r\n"
+        "sdc1 $f2, 0x10+%[stack]                        \r\n"
+        "sdc1 $f6, 0x20+%[stack]                        \r\n"
+        "sdc1 $f14, 0x30+%[stack]                       \r\n"
+        "sdc1 $f10, 0x40+%[stack]                       \r\n"
+        "sdc1 $f12, 0x50+%[stack]                       \r\n"
+        "daddu $8, $8, $13                              \r\n"
+        "daddu $10, $10, $13                            \r\n"
+        "gsldlc1 $f0, 0x7($8)                           \r\n"
+        "daddu $12, $8, %[stride]                       \r\n"
+        "gsldrc1 $f0, 0x0($8)                           \r\n"
+        "gsldlc1 $f2, 0x7($12)                          \r\n"
+        "daddu $11, $8, $15                             \r\n"
+        "gsldrc1 $f2, 0x0($12)                          \r\n"
+        "gsldlc1 $f4, 0x7($11)                          \r\n"
+        "gsldrc1 $f4, 0x0($11)                          \r\n"
+        "gsldlc1 $f6, 0x7($10)                          \r\n"
+        "daddu $12, $10, %[stride]                      \r\n"
+        "gsldrc1 $f6, 0x0($10)                          \r\n"
+        "gsldlc1 $f8, 0x7($12)                          \r\n"
+        "daddu $11, $10, $15                            \r\n"
+        "gsldrc1 $f8, 0x0($12)                          \r\n"
+        "gsldlc1 $f10, 0x7($11)                         \r\n"
+        "daddu $12, $10, $9                             \r\n"
+        "gsldrc1 $f10, 0x0($11)                         \r\n"
+        "gsldlc1 $f12, 0x7($12)                         \r\n"
+        "gsldrc1 $f12, 0x0($12)                         \r\n"
+        "punpckhbh $f14, $f0, $f2                       \r\n"
+        "punpcklbh $f0, $f0, $f2                        \r\n"
+        "punpckhbh $f2, $f4, $f6                        \r\n"
+        "punpcklbh $f4, $f4, $f6                        \r\n"
+        "punpckhbh $f6, $f8, $f10                       \r\n"
+        "punpcklbh $f8, $f8, $f10                       \r\n"
+        "daddu $12, $10, $14                            \r\n"
+        "sdc1 $f2, 0x18+%[stack]                        \r\n"
+        "gsldlc1 $f16, 0x7($12)                         \r\n"
+        "gsldrc1 $f16, 0x0($12)                         \r\n"
+        "punpckhhw $f2, $f0, $f4                        \r\n"
+        "punpckhbh $f10, $f12, $f16                     \r\n"
+        "punpcklbh $f12, $f12, $f16                     \r\n"
+        "punpcklhw $f0, $f0, $f4                        \r\n"
+        "punpckhhw $f4, $f8, $f12                       \r\n"
+        "punpcklhw $f8, $f8, $f12                       \r\n"
+        "punpckhwd $f0, $f0, $f8                        \r\n"
+        "ldc1 $f16, 0x18+%[stack]                       \r\n"
+        "sdc1 $f0, 0x8+%[stack]                         \r\n"
+        "punpckhhw $f12, $f14, $f16                     \r\n"
+        "punpcklhw $f14, $f14, $f16                     \r\n"
+        "punpckhhw $f0, $f6, $f10                       \r\n"
+        "punpcklhw $f6, $f6, $f10                       \r\n"
+        "punpckhwd $f10, $f14, $f6                      \r\n"
+        "punpcklwd $f14, $f14, $f6                      \r\n"
+        "punpckhwd $f6, $f2, $f4                        \r\n"
+        "punpcklwd $f2, $f2, $f4                        \r\n"
+        "punpcklwd $f12, $f12, $f0                      \r\n"
+        "sdc1 $f2, 0x18+%[stack]                        \r\n"
+        "sdc1 $f6, 0x28+%[stack]                        \r\n"
+        "sdc1 $f14, 0x38+%[stack]                       \r\n"
+        "sdc1 $f10, 0x48+%[stack]                       \r\n"
+        "sdc1 $f12, 0x58+%[stack]                       \r\n"
+        ::[pix]"r"(pix),[stride]"r"((int64_t)stride),[stack]"m"(stack[0])
+        : "$8","$9","$10","$11","$12","$13","$14","$15","$f0","$f2","$f4",
+          "$f6","$f8","$f10","$f12","$f14","$f16"
+    );
+
+    ff_deblock_v_luma_8_mmi((uint8_t *) &stack[6], 0x10, alpha, beta, tc0);
+
+    __asm__ volatile (
+        "daddu $15, %[stride], %[stride]                \r\n"
+        "daddiu $8, %[pix], -0x2                        \r\n"
+        "daddu $14, $15, $15                            \r\n"
+        "daddu $9, $15, %[stride]                       \r\n"
+        "daddu $13, $14, $14                            \r\n"
+        "daddu $10, $8, $9                              \r\n"
+        "ldc1 $f0, 0x10+%[stack]                        \r\n"
+        "ldc1 $f2, 0x20+%[stack]                        \r\n"
+        "ldc1 $f4, 0x30+%[stack]                        \r\n"
+        "ldc1 $f6, 0x40+%[stack]                        \r\n"
+        "punpckhwd $f8, $f0, $f0                        \r\n"
+        "punpckhwd $f10, $f2, $f2                       \r\n"
+        "punpckhwd $f12, $f4, $f4                       \r\n"
+        "punpcklbh $f0, $f0, $f2                        \r\n"
+        "punpcklbh $f4, $f4, $f6                        \r\n"
+        "punpcklhw $f2, $f0, $f4                        \r\n"
+        "punpckhhw $f0, $f0, $f4                        \r\n"
+        "gsswlc1 $f2, 0x3($8)                           \r\n"
+        "gsswrc1 $f2, 0x0($8)                           \r\n"
+        "daddu $12, $8, %[stride]                       \r\n"
+        "punpckhwd $f2, $f2, $f2                        \r\n"
+        "daddu $11, $8, $15                             \r\n"
+        "gsswlc1 $f2, 0x3($12)                          \r\n"
+        "gsswrc1 $f2, 0x0($12)                          \r\n"
+        "gsswlc1 $f0, 0x3($11)                          \r\n"
+        "gsswrc1 $f0, 0x0($11)                          \r\n"
+        "punpckhwd $f0, $f0, $f0                        \r\n"
+        "punpckhwd $f6, $f6, $f6                        \r\n"
+        "gsswlc1 $f0, 0x3($10)                          \r\n"
+        "gsswrc1 $f0, 0x0($10)                          \r\n"
+        "punpcklbh $f8, $f8, $f10                       \r\n"
+        "punpcklbh $f12, $f12, $f6                      \r\n"
+        "punpcklhw $f10, $f8, $f12                      \r\n"
+        "daddu $12, $10, %[stride]                      \r\n"
+        "punpckhhw $f8, $f8, $f12                       \r\n"
+        "gsswlc1 $f10, 0x3($12)                         \r\n"
+        "gsswrc1 $f10, 0x0($12)                         \r\n"
+        "daddu $12, $10, $15                            \r\n"
+        "punpckhwd $f10, $f10, $f10                     \r\n"
+        "daddu $11, $10, $9                             \r\n"
+        "gsswlc1 $f10, 0x3($12)                         \r\n"
+        "gsswrc1 $f10, 0x0($12)                         \r\n"
+        "gsswlc1 $f8, 0x3($11)                          \r\n"
+        "gsswrc1 $f8, 0x0($11)                          \r\n"
+        "daddu $12, $10, $14                            \r\n"
+        "punpckhwd $f8, $f8, $f8                        \r\n"
+        "daddu $8, $8, $13                              \r\n"
+        "gsswlc1 $f8, 0x3($12)                          \r\n"
+        "gsswrc1 $f8, 0x0($12)                          \r\n"
+        "daddu $10, $10, $13                            \r\n"
+        "ldc1 $f0, 0x18+%[stack]                        \r\n"
+        "ldc1 $f2, 0x28+%[stack]                        \r\n"
+        "ldc1 $f4, 0x38+%[stack]                        \r\n"
+        "ldc1 $f6, 0x48+%[stack]                        \r\n"
+        "daddu $15, %[stride], %[stride]                \r\n"
+        "punpckhwd $f8, $f0, $f0                        \r\n"
+        "daddu $14, $15, $15                            \r\n"
+        "punpckhwd $f10, $f2, $f2                       \r\n"
+        "punpckhwd $f12, $f4, $f4                       \r\n"
+        "punpcklbh $f0, $f0, $f2                        \r\n"
+        "punpcklbh $f4, $f4, $f6                        \r\n"
+        "daddu $12, $8, %[stride]                       \r\n"
+        "punpcklhw $f2, $f0, $f4                        \r\n"
+        "punpckhhw $f0, $f0, $f4                        \r\n"
+        "gsswlc1 $f2, 0x3($8)                           \r\n"
+        "gsswrc1 $f2, 0x0($8)                           \r\n"
+        "punpckhwd $f2, $f2, $f2                        \r\n"
+        "daddu $11, $8, $15                             \r\n"
+        "gsswlc1 $f2, 0x3($12)                          \r\n"
+        "gsswrc1 $f2, 0x0($12)                          \r\n"
+        "gsswlc1 $f0, 0x3($11)                          \r\n"
+        "gsswrc1 $f0, 0x0($11)                          \r\n"
+        "punpckhwd $f0, $f0, $f0                        \r\n"
+        "punpckhwd $f6, $f6, $f6                        \r\n"
+        "gsswlc1 $f0, 0x3($10)                          \r\n"
+        "gsswrc1 $f0, 0x0($10)                          \r\n"
+        "punpcklbh $f8, $f8, $f10                       \r\n"
+        "punpcklbh $f12, $f12, $f6                      \r\n"
+        "daddu $12, $10, %[stride]                      \r\n"
+        "punpcklhw $f10, $f8, $f12                      \r\n"
+        "punpckhhw $f8, $f8, $f12                       \r\n"
+        "gsswlc1 $f10, 0x3($12)                         \r\n"
+        "gsswrc1 $f10, 0x0($12)                         \r\n"
+        "daddu $12, $10, $15                            \r\n"
+        "punpckhwd $f10, $f10, $f10                     \r\n"
+        "daddu $11, $10, $9                             \r\n"
+        "gsswlc1 $f10, 0x3($12)                         \r\n"
+        "gsswrc1 $f10, 0x0($12)                         \r\n"
+        "gsswlc1 $f8, 0x3($11)                          \r\n"
+        "gsswrc1 $f8, 0x0($11)                          \r\n"
+        "daddu $12, $10, $14                            \r\n"
+        "punpckhwd $f8, $f8, $f8                        \r\n"
+        "gsswlc1 $f8, 0x3($12)                          \r\n"
+        "gsswrc1 $f8, 0x0($12)                          \r\n"
+        ::[pix]"r"(pix),[stride]"r"((int64_t)stride),[stack]"m"(stack[0])
+        : "$8","$9","$10","$11","$12","$13","$14","$15","$f0","$f2","$f4",
+          "$f6","$f8","$f10","$f12","$f14","$f16"
+    );
+}
+
+void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)
+{
+    uint64_t ptmp[0x11];
+    uint64_t pdat[4];
+
+    __asm__ volatile (
+        "daddu $12, %[stride], %[stride]                \r\n"
+        "daddiu $10, %[pix], -0x4                       \r\n"
+        "daddu $11, $12, %[stride]                      \r\n"
+        "daddu $13, $12, $12                            \r\n"
+        "daddu $9, $10, $11                             \r\n"
+        "daddu $8, $10, %[stride]                       \r\n"
+        "gsldlc1 $f0, 0x7($10)                          \r\n"
+        "gsldrc1 $f0, 0x0($10)                          \r\n"
+        "daddu $14, $10, $12                            \r\n"
+        "gsldlc1 $f2, 0x7($8)                           \r\n"
+        "gsldrc1 $f2, 0x0($8)                           \r\n"
+        "gsldlc1 $f4, 0x7($14)                          \r\n"
+        "gsldrc1 $f4, 0x0($14)                          \r\n"
+        "daddu $8, $9, %[stride]                        \r\n"
+        "gsldlc1 $f6, 0x7($9)                           \r\n"
+        "gsldrc1 $f6, 0x0($9)                           \r\n"
+        "daddu $14, $9, $12                             \r\n"
+        "gsldlc1 $f8, 0x7($8)                           \r\n"
+        "gsldrc1 $f8, 0x0($8)                           \r\n"
+        "daddu $8, $9, $11                              \r\n"
+        "gsldlc1 $f10, 0x7($14)                         \r\n"
+        "gsldrc1 $f10, 0x0($14)                         \r\n"
+        "gsldlc1 $f12, 0x7($8)                          \r\n"
+        "gsldrc1 $f12, 0x0($8)                          \r\n"
+        "daddu $8, $9, $13                              \r\n"
+        "punpckhbh $f14, $f0, $f2                       \r\n"
+        "punpcklbh $f0, $f0, $f2                        \r\n"
+        "punpckhbh $f2, $f4, $f6                        \r\n"
+        "punpcklbh $f4, $f4, $f6                        \r\n"
+        "punpckhbh $f6, $f8, $f10                       \r\n"
+        "punpcklbh $f8, $f8, $f10                       \r\n"
+        "gsldlc1 $f16, 0x7($8)                          \r\n"
+        "gsldrc1 $f16, 0x0($8)                          \r\n"
+        "punpckhbh $f10, $f12, $f16                     \r\n"
+        "punpcklbh $f12, $f12, $f16                     \r\n"
+        "sdc1 $f6, 0x0+%[ptmp]                          \r\n"
+        "punpckhhw $f6, $f0, $f4                        \r\n"
+        "punpcklhw $f0, $f0, $f4                        \r\n"
+        "punpckhhw $f4, $f8, $f12                       \r\n"
+        "punpcklhw $f8, $f8, $f12                       \r\n"
+        "punpckhhw $f12, $f14, $f2                      \r\n"
+        "punpcklhw $f14, $f14, $f2                      \r\n"
+        "sdc1 $f4, 0x20+%[ptmp]                         \r\n"
+        "ldc1 $f4, 0x0+%[ptmp]                          \r\n"
+        "punpckhhw $f2, $f4, $f10                       \r\n"
+        "punpcklhw $f4, $f4, $f10                       \r\n"
+        "punpckhwd $f10, $f0, $f8                       \r\n"
+        "punpcklwd $f0, $f0, $f8                        \r\n"
+        "punpckhwd $f8, $f14, $f4                       \r\n"
+        "punpcklwd $f14, $f14, $f4                      \r\n"
+        "sdc1 $f0, 0x0+%[ptmp]                          \r\n"
+        "sdc1 $f10, 0x10+%[ptmp]                        \r\n"
+        "sdc1 $f14, 0x40+%[ptmp]                        \r\n"
+        "sdc1 $f8, 0x50+%[ptmp]                         \r\n"
+        "ldc1 $f16, 0x20+%[ptmp]                        \r\n"
+        "punpckhwd $f0, $f6, $f16                       \r\n"
+        "punpcklwd $f6, $f6, $f16                       \r\n"
+        "punpckhwd $f10, $f12, $f2                      \r\n"
+        "punpcklwd $f12, $f12, $f2                      \r\n"
+        "daddu $8, $13, $13                             \r\n"
+        "sdc1 $f6, 0x20+%[ptmp]                         \r\n"
+        "sdc1 $f0, 0x30+%[ptmp]                         \r\n"
+        "sdc1 $f12, 0x60+%[ptmp]                        \r\n"
+        "sdc1 $f10, 0x70+%[ptmp]                        \r\n"
+        "daddu $10, $10, $8                             \r\n"
+        "daddu $9, $9, $8                               \r\n"
+        "daddu $8, $10, %[stride]                       \r\n"
+        "gsldlc1 $f0, 0x7($10)                          \r\n"
+        "gsldrc1 $f0, 0x0($10)                          \r\n"
+        "daddu $14, $10, $12                            \r\n"
+        "gsldlc1 $f2, 0x7($8)                           \r\n"
+        "gsldrc1 $f2, 0x0($8)                           \r\n"
+        "gsldlc1 $f4, 0x7($14)                          \r\n"
+        "gsldrc1 $f4, 0x0($14)                          \r\n"
+        "daddu $8, $9, %[stride]                        \r\n"
+        "gsldlc1 $f6, 0x7($9)                           \r\n"
+        "gsldrc1 $f6, 0x0($9)                           \r\n"
+        "daddu $14, $9, $12                             \r\n"
+        "gsldlc1 $f8, 0x7($8)                           \r\n"
+        "gsldrc1 $f8, 0x0($8)                           \r\n"
+        "daddu $8, $9, $11                              \r\n"
+        "gsldlc1 $f10, 0x7($14)                         \r\n"
+        "gsldrc1 $f10, 0x0($14)                         \r\n"
+        "gsldlc1 $f12, 0x7($8)                          \r\n"
+        "gsldrc1 $f12, 0x0($8)                          \r\n"
+        "daddu $8, $9, $13                              \r\n"
+        "punpckhbh $f14, $f0, $f2                       \r\n"
+        "punpcklbh $f0, $f0, $f2                        \r\n"
+        "punpckhbh $f2, $f4, $f6                        \r\n"
+        "punpcklbh $f4, $f4, $f6                        \r\n"
+        "punpckhbh $f6, $f8, $f10                       \r\n"
+        "punpcklbh $f8, $f8, $f10                       \r\n"
+        "gsldlc1 $f16, 0x7($8)                          \r\n"
+        "gsldrc1 $f16, 0x0($8)                          \r\n"
+        "punpckhbh $f10, $f12, $f16                     \r\n"
+        "punpcklbh $f12, $f12, $f16                     \r\n"
+        "sdc1 $f6, 0x8+%[ptmp]                          \r\n"
+        "punpckhhw $f6, $f0, $f4                        \r\n"
+        "punpcklhw $f0, $f0, $f4                        \r\n"
+        "punpckhhw $f4, $f8, $f12                       \r\n"
+        "punpcklhw $f8, $f8, $f12                       \r\n"
+        "punpckhhw $f12, $f14, $f2                      \r\n"
+        "punpcklhw $f14, $f14, $f2                      \r\n"
+        "sdc1 $f4, 0x28+%[ptmp]                         \r\n"
+        "ldc1 $f4, 0x8+%[ptmp]                          \r\n"
+        "punpckhhw $f2, $f4, $f10                       \r\n"
+        "punpcklhw $f4, $f4, $f10                       \r\n"
+        "punpckhwd $f10, $f0, $f8                       \r\n"
+        "punpcklwd $f0, $f0, $f8                        \r\n"
+        "punpckhwd $f8, $f14, $f4                       \r\n"
+        "punpcklwd $f14, $f14, $f4                      \r\n"
+        "sdc1 $f0, 0x8+%[ptmp]                          \r\n"
+        "sdc1 $f10, 0x18+%[ptmp]                        \r\n"
+        "sdc1 $f14, 0x48+%[ptmp]                        \r\n"
+        "sdc1 $f8, 0x58+%[ptmp]                         \r\n"
+        "ldc1 $f16, 0x28+%[ptmp]                        \r\n"
+        "punpckhwd $f0, $f6, $f16                       \r\n"
+        "punpcklwd $f6, $f6, $f16                       \r\n"
+        "punpckhwd $f10, $f12, $f2                      \r\n"
+        "punpcklwd $f12, $f12, $f2                      \r\n"
+        "sdc1 $f6, 0x28+%[ptmp]                         \r\n"
+        "sdc1 $f0, 0x38+%[ptmp]                         \r\n"
+        "sdc1 $f12, 0x68+%[ptmp]                        \r\n"
+        "sdc1 $f10, 0x78+%[ptmp]                        \r\n"
+        "sd $10, 0x00+%[pdat]                           \r\n"
+        "sd $11, 0x08+%[pdat]                           \r\n"
+        "sd $12, 0x10+%[pdat]                           \r\n"
+        "sd $13, 0x18+%[pdat]                           \r\n"
+        ::[pix]"r"(pix),[stride]"r"((uint64_t)stride),[ptmp]"m"(ptmp[0]),
+          [pdat]"m"(pdat[0])
+        : "$8","$9","$10","$11","$12","$13","$14","$f0","$f2","$f4","$f6",
+          "$f8","$f10","$f12","$f14","$f16"
+    );
+
+    ff_deblock_v_luma_intra_8_mmi((uint8_t *) &ptmp[8], 0x10, alpha, beta);
+
+    __asm__ volatile (
+        "ld $10, 0x00+%[pdat]                           \r\n"
+        "ld $11, 0x08+%[pdat]                           \r\n"
+        "ld $12, 0x10+%[pdat]                           \r\n"
+        "ld $13, 0x18+%[pdat]                           \r\n"
+        "daddu $9, $10, $11                             \r\n"
+        "ldc1 $f0, 0x8+%[ptmp]                          \r\n"
+        "ldc1 $f2, 0x18+%[ptmp]                         \r\n"
+        "ldc1 $f4, 0x28+%[ptmp]                         \r\n"
+        "ldc1 $f6, 0x38+%[ptmp]                         \r\n"
+        "ldc1 $f8, 0x48+%[ptmp]                         \r\n"
+        "ldc1 $f10, 0x58+%[ptmp]                        \r\n"
+        "ldc1 $f12, 0x68+%[ptmp]                        \r\n"
+        "punpckhbh $f14, $f0, $f2                       \r\n"
+        "punpcklbh $f0, $f0, $f2                        \r\n"
+        "punpckhbh $f2, $f4, $f6                        \r\n"
+        "punpcklbh $f4, $f4, $f6                        \r\n"
+        "punpckhbh $f6, $f8, $f10                       \r\n"
+        "punpcklbh $f8, $f8, $f10                       \r\n"
+        "ldc1 $f16, 0x78+%[ptmp]                        \r\n"
+        "punpckhbh $f10, $f12, $f16                     \r\n"
+        "punpcklbh $f12, $f12, $f16                     \r\n"
+        "gssdlc1 $f6, 0x7($10)                          \r\n"
+        "gssdrc1 $f6, 0x0($10)                          \r\n"
+        "daddu $8, $10, $12                             \r\n"
+        "punpckhhw $f6, $f0, $f4                        \r\n"
+        "punpcklhw $f0, $f0, $f4                        \r\n"
+        "punpckhhw $f4, $f8, $f12                       \r\n"
+        "punpcklhw $f8, $f8, $f12                       \r\n"
+        "punpckhhw $f12, $f14, $f2                      \r\n"
+        "punpcklhw $f14, $f14, $f2                      \r\n"
+        "gssdlc1 $f4, 0x7($8)                           \r\n"
+        "gssdrc1 $f4, 0x0($8)                           \r\n"
+        "gsldlc1 $f4, 0x7($10)                          \r\n"
+        "gsldrc1 $f4, 0x0($10)                          \r\n"
+        "punpckhhw $f2, $f4, $f10                       \r\n"
+        "punpcklhw $f4, $f4, $f10                       \r\n"
+        "punpckhwd $f10, $f0, $f8                       \r\n"
+        "punpcklwd $f0, $f0, $f8                        \r\n"
+        "punpckhwd $f8, $f14, $f4                       \r\n"
+        "punpcklwd $f14, $f14, $f4                      \r\n"
+        "daddu $8, $10, %[stride]                       \r\n"
+        "gssdlc1 $f0, 0x7($10)                          \r\n"
+        "gssdrc1 $f0, 0x0($10)                          \r\n"
+        "daddu $14, $9, %[stride]                       \r\n"
+        "gssdlc1 $f10, 0x7($8)                          \r\n"
+        "gssdrc1 $f10, 0x0($8)                          \r\n"
+        "daddu $8, $9, $12                              \r\n"
+        "gssdlc1 $f14, 0x7($14)                         \r\n"
+        "gssdrc1 $f14, 0x0($14)                         \r\n"
+        "daddu $14, $10, $12                            \r\n"
+        "gssdlc1 $f8, 0x7($8)                           \r\n"
+        "gssdrc1 $f8, 0x0($8)                           \r\n"
+        "gsldlc1 $f16, 0x7($14)                         \r\n"
+        "gsldrc1 $f16, 0x0($14)                         \r\n"
+        "daddu $8, $10, $12                             \r\n"
+        "punpckhwd $f0, $f6, $f16                       \r\n"
+        "punpcklwd $f6, $f6, $f16                       \r\n"
+        "punpckhwd $f10, $f12, $f2                      \r\n"
+        "punpcklwd $f12, $f12, $f2                      \r\n"
+        "gssdlc1 $f6, 0x7($8)                           \r\n"
+        "gssdrc1 $f6, 0x0($8)                           \r\n"
+        "daddu $8, $9, $11                              \r\n"
+        "gssdlc1 $f0, 0x7($9)                           \r\n"
+        "gssdrc1 $f0, 0x0($9)                           \r\n"
+        "daddu $14, $9, $13                             \r\n"
+        "gssdlc1 $f12, 0x7($8)                          \r\n"
+        "gssdrc1 $f12, 0x0($8)                          \r\n"
+        "daddu $8, $13, $13                             \r\n"
+        "gssdlc1 $f10, 0x7($14)                         \r\n"
+        "gssdrc1 $f10, 0x0($14)                         \r\n"
+        "dsubu $10, $10, $8                             \r\n"
+        "dsubu $9, $9, $8                               \r\n"
+        "ldc1 $f0, 0x0+%[ptmp]                          \r\n"
+        "ldc1 $f2, 0x10+%[ptmp]                         \r\n"
+        "ldc1 $f4, 0x20+%[ptmp]                         \r\n"
+        "ldc1 $f6, 0x30+%[ptmp]                         \r\n"
+        "ldc1 $f8, 0x40+%[ptmp]                         \r\n"
+        "ldc1 $f10, 0x50+%[ptmp]                        \r\n"
+        "ldc1 $f12, 0x60+%[ptmp]                        \r\n"
+        "punpckhbh $f14, $f0, $f2                       \r\n"
+        "punpcklbh $f0, $f0, $f2                        \r\n"
+        "punpckhbh $f2, $f4, $f6                        \r\n"
+        "punpcklbh $f4, $f4, $f6                        \r\n"
+        "punpckhbh $f6, $f8, $f10                       \r\n"
+        "punpcklbh $f8, $f8, $f10                       \r\n"
+        "ldc1 $f16, 0x70+%[ptmp]                        \r\n"
+        "punpckhbh $f10, $f12, $f16                     \r\n"
+        "punpcklbh $f12, $f12, $f16                     \r\n"
+        "gssdlc1 $f6, 0x7($10)                          \r\n"
+        "gssdrc1 $f6, 0x0($10)                          \r\n"
+        "daddu $8, $10, $12                             \r\n"
+        "punpckhhw $f6, $f0, $f4                        \r\n"
+        "punpcklhw $f0, $f0, $f4                        \r\n"
+        "punpckhhw $f4, $f8, $f12                       \r\n"
+        "punpcklhw $f8, $f8, $f12                       \r\n"
+        "punpckhhw $f12, $f14, $f2                      \r\n"
+        "punpcklhw $f14, $f14, $f2                      \r\n"
+        "gssdlc1 $f4, 0x7($8)                           \r\n"
+        "gssdrc1 $f4, 0x0($8)                           \r\n"
+        "gsldlc1 $f4, 0x7($10)                          \r\n"
+        "gsldrc1 $f4, 0x0($10)                          \r\n"
+        "punpckhhw $f2, $f4, $f10                       \r\n"
+        "punpcklhw $f4, $f4, $f10                       \r\n"
+        "punpckhwd $f10, $f0, $f8                       \r\n"
+        "punpcklwd $f0, $f0, $f8                        \r\n"
+        "punpckhwd $f8, $f14, $f4                       \r\n"
+        "punpcklwd $f14, $f14, $f4                      \r\n"
+        "daddu $8, $10, %[stride]                       \r\n"
+        "gssdlc1 $f0, 0x7($10)                          \r\n"
+        "gssdrc1 $f0, 0x0($10)                          \r\n"
+        "daddu $14, $9, %[stride]                       \r\n"
+        "gssdlc1 $f10, 0x7($8)                          \r\n"
+        "gssdrc1 $f10, 0x0($8)                          \r\n"
+        "daddu $8, $9, $12                              \r\n"
+        "gssdlc1 $f14, 0x7($14)                         \r\n"
+        "gssdrc1 $f14, 0x0($14)                         \r\n"
+        "daddu $14, $10, $12                            \r\n"
+        "gssdlc1 $f8, 0x7($8)                           \r\n"
+        "gssdrc1 $f8, 0x0($8)                           \r\n"
+        "gsldlc1 $f16, 0x7($14)                         \r\n"
+        "gsldrc1 $f16, 0x0($14)                         \r\n"
+        "daddu $8, $10, $12                             \r\n"
+        "punpckhwd $f0, $f6, $f16                       \r\n"
+        "punpcklwd $f6, $f6, $f16                       \r\n"
+        "punpckhwd $f10, $f12, $f2                      \r\n"
+        "punpcklwd $f12, $f12, $f2                      \r\n"
+        "gssdlc1 $f6, 0x7($8)                           \r\n"
+        "gssdrc1 $f6, 0x0($8)                           \r\n"
+        "daddu $8, $9, $11                              \r\n"
+        "gssdlc1 $f0, 0x7($9)                           \r\n"
+        "gssdrc1 $f0, 0x0($9)                           \r\n"
+        "daddu $14, $9, $13                             \r\n"
+        "gssdlc1 $f12, 0x7($8)                          \r\n"
+        "gssdrc1 $f12, 0x0($8)                          \r\n"
+        "gssdlc1 $f10, 0x7($14)                         \r\n"
+        "gssdrc1 $f10, 0x0($14)                         \r\n"
+        ::[pix]"r"(pix),[stride]"r"((uint64_t)stride),[ptmp]"m"(ptmp[0]),
+          [pdat]"m"(pdat[0])
+        : "$8","$9","$10","$11","$12","$13","$14","$f0","$f2","$f4","$f6",
+          "$f8","$f10","$f12","$f14","$f16"
+    );
+}
diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
new file mode 100644
index 0000000000..fce01ac91c
--- /dev/null
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -0,0 +1,2544 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264dsp_mips.h"
+
+static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
+                            int32_t log2_denom, int32_t src_weight,
+                            int32_t offset_in)
+{
+    uint32_t data0, data1;
+    v16u8 zero = { 0 };
+    v16u8 src0, src1;
+    v4i32 res0, res1;
+    v8i16 temp0, temp1, vec0, vec1, wgt, denom, offset;
+    v8u16 out0, out1;
+
+    offset_in <<= (log2_denom);
+
+    if (log2_denom) {
+        offset_in += (1 << (log2_denom - 1));
+    }
+
+    wgt = __msa_fill_h(src_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom);
+
+    data0 = LW(data);
+    data1 = LW(data + stride);
+
+    src0 = (v16u8) __msa_fill_w(data0);
+    src1 = (v16u8) __msa_fill_w(data1);
+
+    ILVR_B2_SH(zero, src0, zero, src1, vec0, vec1);
+    MUL2(wgt, vec0, wgt, vec1, temp0, temp1);
+    ADDS_SH2_SH(temp0, offset, temp1, offset, temp0, temp1);
+    MAXI_SH2_SH(temp0, temp1, 0);
+
+    out0 = (v8u16) __msa_srl_h(temp0, denom);
+    out1 = (v8u16) __msa_srl_h(temp1, denom);
+
+    SAT_UH2_UH(out0, out1, 7);
+    PCKEV_B2_SW(out0, out0, out1, out1, res0, res1);
+
+    data0 = __msa_copy_u_w(res0, 0);
+    data1 = __msa_copy_u_w(res1, 0);
+    SW(data0, data);
+    data += stride;
+    SW(data1, data);
+}
+
+static void avc_wgt_4x4multiple_msa(uint8_t *data, int32_t stride,
+                                    int32_t height, int32_t log2_denom,
+                                    int32_t src_weight, int32_t offset_in)
+{
+    uint8_t cnt;
+    uint32_t data0, data1, data2, data3;
+    v16u8 zero = { 0 };
+    v16u8 src0, src1, src2, src3;
+    v8u16 temp0, temp1, temp2, temp3, wgt;
+    v8i16 denom, offset;
+
+    offset_in <<= (log2_denom);
+
+    if (log2_denom) {
+        offset_in += (1 << (log2_denom - 1));
+    }
+
+    wgt = (v8u16) __msa_fill_h(src_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom);
+
+    for (cnt = height / 4; cnt--;) {
+        LW4(data, stride, data0, data1, data2, data3);
+
+        src0 = (v16u8) __msa_fill_w(data0);
+        src1 = (v16u8) __msa_fill_w(data1);
+        src2 = (v16u8) __msa_fill_w(data2);
+        src3 = (v16u8) __msa_fill_w(data3);
+
+        ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   temp0, temp1, temp2, temp3);
+        MUL4(wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
+             temp0, temp1, temp2, temp3);
+        ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
+                    temp0, temp1, temp2, temp3);
+        MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
+        SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
+        SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
+        PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, data, stride);
+        data += (4 * stride);
+    }
+}
+
+static void avc_wgt_4width_msa(uint8_t *data, int32_t stride,
+                               int32_t height, int32_t log2_denom,
+                               int32_t src_weight, int32_t offset_in)
+{
+    if (2 == height) {
+        avc_wgt_4x2_msa(data, stride, log2_denom, src_weight, offset_in);
+    } else {
+        avc_wgt_4x4multiple_msa(data, stride, height, log2_denom, src_weight,
+                                offset_in);
+    }
+}
+
+static void avc_wgt_8width_msa(uint8_t *data, int32_t stride,
+                               int32_t height, int32_t log2_denom,
+                               int32_t src_weight, int32_t offset_in)
+{
+    uint8_t cnt;
+    v16u8 zero = { 0 };
+    v16u8 src0, src1, src2, src3;
+    v8u16 src0_r, src1_r, src2_r, src3_r;
+    v8u16 temp0, temp1, temp2, temp3;
+    v8u16 wgt, denom, offset;
+    v16i8 out0, out1;
+
+    offset_in <<= (log2_denom);
+
+    if (log2_denom) {
+        offset_in += (1 << (log2_denom - 1));
+    }
+
+    wgt = (v8u16) __msa_fill_h(src_weight);
+    offset = (v8u16) __msa_fill_h(offset_in);
+    denom = (v8u16) __msa_fill_h(log2_denom);
+
+    for (cnt = height / 4; cnt--;) {
+        LD_UB4(data, stride, src0, src1, src2, src3);
+        ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   src0_r, src1_r, src2_r, src3_r);
+        MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r,
+             temp0, temp1, temp2, temp3);
+        ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
+                    temp0, temp1, temp2, temp3);
+        MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
+        SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
+        SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
+        PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);
+        ST8x4_UB(out0, out1, data, stride);
+        data += (4 * stride);
+    }
+}
+
+static void avc_wgt_16width_msa(uint8_t *data, int32_t stride,
+                                int32_t height, int32_t log2_denom,
+                                int32_t src_weight, int32_t offset_in)
+{
+    uint8_t cnt;
+    v16i8 zero = { 0 };
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8u16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
+    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v8u16 wgt, denom, offset;
+
+    offset_in <<= (log2_denom);
+
+    if (log2_denom) {
+        offset_in += (1 << (log2_denom - 1));
+    }
+
+    wgt = (v8u16) __msa_fill_h(src_weight);
+    offset = (v8u16) __msa_fill_h(offset_in);
+    denom = (v8u16) __msa_fill_h(log2_denom);
+
+    for (cnt = height / 4; cnt--;) {
+        LD_UB4(data, stride, src0, src1, src2, src3);
+        ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   src0_r, src1_r, src2_r, src3_r);
+        ILVL_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   src0_l, src1_l, src2_l, src3_l);
+        MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l,
+             temp0, temp1, temp2, temp3);
+        MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l,
+             temp4, temp5, temp6, temp7);
+        ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
+                    temp0, temp1, temp2, temp3);
+        ADDS_SH4_UH(temp4, offset, temp5, offset, temp6, offset, temp7, offset,
+                    temp4, temp5, temp6, temp7);
+        MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
+        MAXI_SH4_UH(temp4, temp5, temp6, temp7, 0);
+        SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
+        SRL_H4_UH(temp4, temp5, temp6, temp7, denom);
+        SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
+        SAT_UH4_UH(temp4, temp5, temp6, temp7, 7);
+        PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+                    dst0, dst1, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, data, stride);
+        data += 4 * stride;
+    }
+}
+
+static void avc_biwgt_4x2_msa(uint8_t *src, int32_t src_stride,
+                              uint8_t *dst, int32_t dst_stride,
+                              int32_t log2_denom, int32_t src_weight,
+                              int32_t dst_weight, int32_t offset_in)
+{
+    uint32_t load0, load1, out0, out1;
+    v16i8 src_wgt, dst_wgt, wgt;
+    v16i8 src0, src1, dst0, dst1;
+    v8i16 temp0, temp1, denom, offset, add_val;
+    int32_t val = 128 * (src_weight + dst_weight);
+
+    offset_in = ((offset_in + 1) | 1) << log2_denom;
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+    add_val = __msa_fill_h(val);
+    offset += add_val;
+
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+    load0 = LW(src);
+    src += src_stride;
+    load1 = LW(src);
+
+    src0 = (v16i8) __msa_fill_w(load0);
+    src1 = (v16i8) __msa_fill_w(load1);
+
+    load0 = LW(dst);
+    load1 = LW(dst + dst_stride);
+
+    dst0 = (v16i8) __msa_fill_w(load0);
+    dst1 = (v16i8) __msa_fill_w(load1);
+
+    XORI_B4_128_SB(src0, src1, dst0, dst1);
+    ILVR_B2_SH(dst0, src0, dst1, src1, temp0, temp1);
+
+    temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
+    temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
+
+    temp0 >>= denom;
+    temp1 >>= denom;
+
+    CLIP_SH2_0_255(temp0, temp1);
+    PCKEV_B2_SB(temp0, temp0, temp1, temp1, dst0, dst1);
+
+    out0 = __msa_copy_u_w((v4i32) dst0, 0);
+    out1 = __msa_copy_u_w((v4i32) dst1, 0);
+    SW(out0, dst);
+    dst += dst_stride;
+    SW(out1, dst);
+}
+
+static void avc_biwgt_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int32_t height, int32_t log2_denom,
+                                      int32_t src_weight, int32_t dst_weight,
+                                      int32_t offset_in)
+{
+    uint8_t cnt;
+    uint32_t load0, load1, load2, load3;
+    v16i8 src_wgt, dst_wgt, wgt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 dst0, dst1, dst2, dst3;
+    v8i16 temp0, temp1, temp2, temp3;
+    v8i16 denom, offset, add_val;
+    int32_t val = 128 * (src_weight + dst_weight);
+
+    offset_in = ((offset_in + 1) | 1) << log2_denom;
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+    add_val = __msa_fill_h(val);
+    offset += add_val;
+
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+    for (cnt = height / 4; cnt--;) {
+        LW4(src, src_stride, load0, load1, load2, load3);
+        src += (4 * src_stride);
+
+        src0 = (v16i8) __msa_fill_w(load0);
+        src1 = (v16i8) __msa_fill_w(load1);
+        src2 = (v16i8) __msa_fill_w(load2);
+        src3 = (v16i8) __msa_fill_w(load3);
+
+        LW4(dst, dst_stride, load0, load1, load2, load3);
+
+        dst0 = (v16i8) __msa_fill_w(load0);
+        dst1 = (v16i8) __msa_fill_w(load1);
+        dst2 = (v16i8) __msa_fill_w(load2);
+        dst3 = (v16i8) __msa_fill_w(load3);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
+        ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
+                   temp0, temp1, temp2, temp3);
+
+        temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
+        temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
+        temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
+        temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);
+
+        SRA_4V(temp0, temp1, temp2, temp3, denom);
+        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+        PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_biwgt_4width_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int32_t height, int32_t log2_denom,
+                                 int32_t src_weight, int32_t dst_weight,
+                                 int32_t offset_in)
+{
+    if (2 == height) {
+        avc_biwgt_4x2_msa(src, src_stride, dst, dst_stride, log2_denom,
+                          src_weight, dst_weight, offset_in);
+    } else {
+        avc_biwgt_4x4multiple_msa(src, src_stride, dst, dst_stride, height,
+                                  log2_denom, src_weight, dst_weight,
+                                  offset_in);
+    }
+}
+
+static void avc_biwgt_8width_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int32_t height, int32_t log2_denom,
+                                 int32_t src_weight, int32_t dst_weight,
+                                 int32_t offset_in)
+{
+    uint8_t cnt;
+    v16i8 src_wgt, dst_wgt, wgt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 dst0, dst1, dst2, dst3;
+    v16i8 out0, out1;
+    v8i16 temp0, temp1, temp2, temp3;
+    v8i16 denom, offset, add_val;
+    int32_t val = 128 * (src_weight + dst_weight);
+
+    offset_in = ((offset_in + 1) | 1) << log2_denom;
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+    add_val = __msa_fill_h(val);
+    offset += add_val;
+
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+    for (cnt = height / 4; cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
+        ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
+                   temp0, temp1, temp2, temp3);
+
+        temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
+        temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
+        temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
+        temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);
+
+        SRA_4V(temp0, temp1, temp2, temp3, denom);
+        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+        PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+
+static void avc_biwgt_16width_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t log2_denom,
+                                  int32_t src_weight, int32_t dst_weight,
+                                  int32_t offset_in)
+{
+    uint8_t cnt;
+    v16i8 src_wgt, dst_wgt, wgt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v8i16 denom, offset, add_val;
+    int32_t val = 128 * (src_weight + dst_weight);
+
+    offset_in = ((offset_in + 1) | 1) << log2_denom;
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+    add_val = __msa_fill_h(val);
+    offset += add_val;
+
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+    for (cnt = height / 4; cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
+        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
+                   vec0, vec2, vec4, vec6);
+        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
+                   vec1, vec3, vec5, vec7);
+
+        temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+        temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+        temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+        temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+        temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
+        temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
+        temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
+        temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
+
+        SRA_4V(temp0, temp1, temp2, temp3, denom);
+        SRA_4V(temp4, temp5, temp6, temp7, denom);
+        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+        CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+        PCKEV_B4_SB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+                    dst0, dst1, dst2, dst3);
+        ST_SB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+
+#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
+                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
+                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
+                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
+{                                                                           \
+    v8i16 threshold;                                                        \
+    v8i16 const3 = __msa_ldi_h(3);                                          \
+                                                                            \
+    threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in);                      \
+    threshold += (p1_or_q1_org_in);                                         \
+                                                                            \
+    (p0_or_q0_out) = threshold << 1;                                        \
+    (p0_or_q0_out) += (p2_or_q2_org_in);                                    \
+    (p0_or_q0_out) += (q1_or_p1_org_in);                                    \
+    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3);                      \
+                                                                            \
+    (p1_or_q1_out) = (p2_or_q2_org_in) + threshold;                         \
+    (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2);                      \
+                                                                            \
+    (p2_or_q2_out) = (p2_or_q2_org_in) * const3;                            \
+    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
+    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
+    (p2_or_q2_out) += threshold;                                            \
+    (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3);                      \
+}
+
+/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
+#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,   \
+                         p1_or_q1_org_in, p0_or_q0_out)      \
+{                                                            \
+    (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in);  \
+    (p0_or_q0_out) += (p1_or_q1_org_in);                     \
+    (p0_or_q0_out) += (p1_or_q1_org_in);                     \
+    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2);       \
+}
+
+#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,    \
+                         p1_or_q1_org_in, p2_or_q2_org_in,    \
+                         negate_tc_in, tc_in, p1_or_q1_out)   \
+{                                                             \
+    v8i16 clip3, temp;                                        \
+                                                              \
+    clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in,   \
+                                   (v8u16) q0_or_p0_org_in);  \
+    temp = p1_or_q1_org_in << 1;                              \
+    clip3 = clip3 - temp;                                     \
+    clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);            \
+    clip3 = CLIP_SH(clip3, negate_tc_in, tc_in);              \
+    p1_or_q1_out = p1_or_q1_org_in + clip3;                   \
+}
+
+#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,          \
+                     p1_or_q1_org_in, q1_or_p1_org_in,          \
+                     negate_threshold_in, threshold_in,         \
+                     p0_or_q0_out, q0_or_p0_out)                \
+{                                                               \
+    v8i16 q0_sub_p0, p1_sub_q1, delta;                          \
+                                                                \
+    q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in;              \
+    p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in;              \
+    q0_sub_p0 <<= 2;                                            \
+    p1_sub_q1 += 4;                                             \
+    delta = q0_sub_p0 + p1_sub_q1;                              \
+    delta >>= 3;                                                \
+                                                                \
+    delta = CLIP_SH(delta, negate_threshold_in, threshold_in);  \
+                                                                \
+    p0_or_q0_out = p0_or_q0_org_in + delta;                     \
+    q0_or_p0_out = q0_or_p0_org_in - delta;                     \
+                                                                \
+    CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out);                 \
+}
+
+#define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)      \
+{                                                                        \
+    uint32_t load0, load1, load2, load3;                                 \
+    v16u8 src0 = { 0 };                                                  \
+    v16u8 src1 = { 0 };                                                  \
+    v16u8 src2 = { 0 };                                                  \
+    v16u8 src3 = { 0 };                                                  \
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                            \
+    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;           \
+    v8i16 tc, q0_sub_p0, p1_sub_q1, delta;                               \
+    v8i16 res0_r, res1_r;                                                \
+    v16i8 zeros = { 0 };                                                 \
+    v16u8 res0, res1;                                                    \
+                                                                         \
+    LW4((src - 2), stride, load0, load1, load2, load3);                  \
+    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);               \
+    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);               \
+    src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2);               \
+    src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3);               \
+                                                                         \
+    TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3);  \
+                                                                         \
+    p0_asub_q0 = __msa_asub_u_b(src2, src1);                             \
+    p1_asub_p0 = __msa_asub_u_b(src1, src0);                             \
+    q1_asub_q0 = __msa_asub_u_b(src2, src3);                             \
+                                                                         \
+    tc = __msa_fill_h(tc_val);                                           \
+                                                                         \
+    is_less_than_alpha = (p0_asub_q0 < alpha);                           \
+    is_less_than_beta = (p1_asub_p0 < beta);                             \
+    is_less_than = is_less_than_alpha & is_less_than_beta;               \
+    is_less_than_beta = (q1_asub_q0 < beta);                             \
+    is_less_than = is_less_than_beta & is_less_than;                     \
+                                                                         \
+    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);            \
+    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);             \
+                                                                         \
+    q0_sub_p0 <<= 2;                                                     \
+    delta = q0_sub_p0 + p1_sub_q1;                                       \
+    delta = __msa_srari_h(delta, 3);                                     \
+                                                                         \
+    delta = CLIP_SH(delta, -tc, tc);                                     \
+                                                                         \
+    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                \
+                                                                         \
+    res0_r += delta;                                                     \
+    res1_r -= delta;                                                     \
+                                                                         \
+    CLIP_SH2_0_255(res0_r, res1_r);                                      \
+    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);             \
+                                                                         \
+    res0 = __msa_bmnz_v(src1, res0, is_less_than);                       \
+    res1 = __msa_bmnz_v(src2, res1, is_less_than);                       \
+                                                                         \
+    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);              \
+}
+
+#define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3)  \
+{                                                            \
+    v16i8 zero_m = { 0 };                                    \
+                                                             \
+    out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);   \
+    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);    \
+    SLDI_B2_0_UB(out1, out2, out2, out3, 2);                 \
+}
+
+#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \
+{                                                                          \
+    uint32_t load0, load1;                                                 \
+    v16u8 src0 = { 0 };                                                    \
+    v16u8 src1 = { 0 };                                                    \
+    v16u8 src2 = { 0 };                                                    \
+    v16u8 src3 = { 0 };                                                    \
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                              \
+    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;             \
+    v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r;                 \
+    v16i8 zeros = { 0 };                                                   \
+    v16u8 res0, res1;                                                      \
+                                                                           \
+    load0 = LW(src - 2);                                                   \
+    load1 = LW(src - 2 + stride);                                          \
+                                                                           \
+    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);                 \
+    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);                 \
+                                                                           \
+    TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3);                 \
+                                                                           \
+    p0_asub_q0 = __msa_asub_u_b(src2, src1);                               \
+    p1_asub_p0 = __msa_asub_u_b(src1, src0);                               \
+    q1_asub_q0 = __msa_asub_u_b(src2, src3);                               \
+                                                                           \
+    tc = __msa_fill_h(tc_val);                                             \
+                                                                           \
+    is_less_than_alpha = (p0_asub_q0 < alpha);                             \
+    is_less_than_beta = (p1_asub_p0 < beta);                               \
+    is_less_than = is_less_than_alpha & is_less_than_beta;                 \
+    is_less_than_beta = (q1_asub_q0 < beta);                               \
+    is_less_than = is_less_than_beta & is_less_than;                       \
+                                                                           \
+    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);              \
+    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);               \
+                                                                           \
+    q0_sub_p0 <<= 2;                                                       \
+    delta = q0_sub_p0 + p1_sub_q1;                                         \
+    delta = __msa_srari_h(delta, 3);                                       \
+    delta = CLIP_SH(delta, -tc, tc);                                       \
+                                                                           \
+    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                  \
+                                                                           \
+    res0_r += delta;                                                       \
+    res1_r -= delta;                                                       \
+                                                                           \
+    CLIP_SH2_0_255(res0_r, res1_r);                                        \
+    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);               \
+                                                                           \
+    res0 = __msa_bmnz_v(src1, res0, is_less_than);                         \
+    res1 = __msa_bmnz_v(src2, res1, is_less_than);                         \
+                                                                           \
+    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);                \
+}
+
+static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
+                                                   uint8_t alpha_in,
+                                                   uint8_t beta_in,
+                                                   uint32_t img_width)
+{
+    v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0;
+    v16u8 alpha, beta;
+    v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta;
+    v16u8 p2, p1, p0, q0, q1, q2;
+    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
+    v8i16 p2_r = { 0 };
+    v8i16 p1_r = { 0 };
+    v8i16 p0_r = { 0 };
+    v8i16 q0_r = { 0 };
+    v8i16 q1_r = { 0 };
+    v8i16 q2_r = { 0 };
+    v8i16 p2_l = { 0 };
+    v8i16 p1_l = { 0 };
+    v8i16 p0_l = { 0 };
+    v8i16 q0_l = { 0 };
+    v8i16 q1_l = { 0 };
+    v8i16 q2_l = { 0 };
+    v16u8 tmp_flag;
+    v16i8 zero = { 0 };
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
+
+    {
+        v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha;
+
+        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+        is_less_than_alpha = (p0_asub_q0 < alpha);
+        is_less_than_beta = (p1_asub_p0 < beta);
+        is_less_than = is_less_than_beta & is_less_than_alpha;
+        is_less_than_beta = (q1_asub_q0 < beta);
+        is_less_than = is_less_than_beta & is_less_than;
+    }
+
+    if (!__msa_test_bz_v(is_less_than)) {
+        q2_org = LD_UB(data + (2 * img_width));
+        p3_org = LD_UB(data - (img_width << 2));
+        p2_org = LD_UB(data - (3 * img_width));
+
+        UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
+        UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
+        UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
+
+        tmp_flag = alpha >> 2;
+        tmp_flag = tmp_flag + 2;
+        tmp_flag = (p0_asub_q0 < tmp_flag);
+
+        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
+        is_less_than_beta = (p2_asub_p0 < beta);
+        is_less_than_beta = is_less_than_beta & tmp_flag;
+        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+        {
+            v8u16 is_less_than_beta_l, is_less_than_beta_r;
+
+            q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
+
+            is_less_than_beta_r =
+                (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
+                v8i16 p3_org_r;
+
+                ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
+                                         p2_r, q1_org_r, p0_r, p1_r, p2_r);
+            }
+
+            q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
+
+            is_less_than_beta_l =
+                (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+
+            if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
+                v8i16 p3_org_l;
+
+                ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
+                                         p2_l, q1_org_l, p0_l, p1_l, p2_l);
+            }
+        }
+        /* combine and store */
+        if (!__msa_test_bz_v(is_less_than_beta)) {
+            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
+
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
+            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
+            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
+
+            ST_UB(p1_org, data - (2 * img_width));
+            ST_UB(p2_org, data - (3 * img_width));
+        }
+        {
+            v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
+
+            negate_is_less_than_beta_r =
+                (v8u16) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_r)) {
+                AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
+            }
+
+            negate_is_less_than_beta_l =
+                (v8u16) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
+            if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_l)) {
+                AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
+            }
+        }
+        /* combine */
+        if (!__msa_test_bz_v(negate_is_less_than_beta)) {
+            p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
+            p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
+        }
+
+        ST_UB(p0_org, data - img_width);
+
+        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
+        q3_org = LD_UB(data + (3 * img_width));
+        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
+        is_less_than_beta = (q2_asub_q0 < beta);
+        is_less_than_beta = is_less_than_beta & tmp_flag;
+        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        {
+            v8u16 is_less_than_beta_l, is_less_than_beta_r;
+            is_less_than_beta_r =
+                (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
+                v8i16 q3_org_r;
+
+                ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
+                                         q2_r, p1_org_r, q0_r, q1_r, q2_r);
+            }
+            is_less_than_beta_l =
+                (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+            if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
+                v8i16 q3_org_l;
+
+                ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
+                                         q2_l, p1_org_l, q0_l, q1_l, q2_l);
+            }
+        }
+
+        /* combine and store */
+        if (!__msa_test_bz_v(is_less_than_beta)) {
+            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
+            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
+            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
+
+            ST_UB(q1_org, data + img_width);
+            ST_UB(q2_org, data + 2 * img_width);
+        }
+        {
+            v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
+            negate_is_less_than_beta_r =
+                (v8u16) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_r)) {
+                AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
+            }
+
+            negate_is_less_than_beta_l =
+                (v8u16) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
+            if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_l)) {
+                AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
+            }
+        }
+        /* combine */
+        if (!__msa_test_bz_v(negate_is_less_than_beta)) {
+            q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
+            q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
+        }
+        ST_UB(q0_org, data);
+    }
+}
+
+static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
+                                                   uint8_t alpha_in,
+                                                   uint8_t beta_in,
+                                                   uint32_t img_width)
+{
+    uint8_t *src;
+    v16u8 alpha, beta, p0_asub_q0;
+    v16u8 is_less_than_alpha, is_less_than;
+    v16u8 is_less_than_beta, negate_is_less_than_beta;
+    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
+    v8i16 p2_r = { 0 };
+    v8i16 p1_r = { 0 };
+    v8i16 p0_r = { 0 };
+    v8i16 q0_r = { 0 };
+    v8i16 q1_r = { 0 };
+    v8i16 q2_r = { 0 };
+    v8i16 p2_l = { 0 };
+    v8i16 p1_l = { 0 };
+    v8i16 p0_l = { 0 };
+    v8i16 q0_l = { 0 };
+    v8i16 q1_l = { 0 };
+    v8i16 q2_l = { 0 };
+    v16i8 zero = { 0 };
+    v16u8 tmp_flag;
+
+    src = data - 4;
+    {
+        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+        v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+
+        LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
+        LD_UB8(src + (8 * img_width), img_width,
+               row8, row9, row10, row11, row12, row13, row14, row15);
+
+        TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
+                            row4, row5, row6, row7,
+                            row8, row9, row10, row11,
+                            row12, row13, row14, row15,
+                            p3_org, p2_org, p1_org, p0_org,
+                            q0_org, q1_org, q2_org, q3_org);
+    }
+    UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
+    UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
+    UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
+    UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
+
+    /*  if ( ((unsigned)ABS(p0-q0) < thresholds->alpha_in) &&
+       ((unsigned)ABS(p1-p0) < thresholds->beta_in)  &&
+       ((unsigned)ABS(q1-q0) < thresholds->beta_in) )   */
+    {
+        v16u8 p1_asub_p0, q1_asub_q0;
+
+        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+        alpha = (v16u8) __msa_fill_b(alpha_in);
+        beta = (v16u8) __msa_fill_b(beta_in);
+
+        is_less_than_alpha = (p0_asub_q0 < alpha);
+        is_less_than_beta = (p1_asub_p0 < beta);
+        is_less_than = is_less_than_beta & is_less_than_alpha;
+        is_less_than_beta = (q1_asub_q0 < beta);
+        is_less_than = is_less_than_beta & is_less_than;
+    }
+
+    if (!__msa_test_bz_v(is_less_than)) {
+        tmp_flag = alpha >> 2;
+        tmp_flag = tmp_flag + 2;
+        tmp_flag = (p0_asub_q0 < tmp_flag);
+
+        {
+            v16u8 p2_asub_p0;
+
+            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
+            is_less_than_beta = (p2_asub_p0 < beta);
+        }
+        is_less_than_beta = tmp_flag & is_less_than_beta;
+        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        /* right */
+        {
+            v16u8 is_less_than_beta_r;
+
+            is_less_than_beta_r =
+                (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v(is_less_than_beta_r)) {
+                v8i16 p3_org_r;
+
+                ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
+                                         p2_r, q1_org_r, p0_r, p1_r, p2_r);
+            }
+        }
+        /* left */
+        {
+            v16u8 is_less_than_beta_l;
+
+            is_less_than_beta_l =
+                (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+            if (!__msa_test_bz_v(is_less_than_beta_l)) {
+                v8i16 p3_org_l;
+
+                ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
+                                         p2_l, q1_org_l, p0_l, p1_l, p2_l);
+            }
+        }
+        /* combine and store */
+        if (!__msa_test_bz_v(is_less_than_beta)) {
+            v16u8 p0, p2, p1;
+
+            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
+            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
+            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
+        }
+        /* right */
+        {
+            v16u8 negate_is_less_than_beta_r;
+
+            negate_is_less_than_beta_r =
+                (v16u8) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
+
+            if (!__msa_test_bz_v(negate_is_less_than_beta_r)) {
+                AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
+            }
+        }
+        /* left */
+        {
+            v16u8 negate_is_less_than_beta_l;
+
+            negate_is_less_than_beta_l =
+                (v16u8) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
+            if (!__msa_test_bz_v(negate_is_less_than_beta_l)) {
+                AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
+            }
+        }
+
+        if (!__msa_test_bz_v(negate_is_less_than_beta)) {
+            v16u8 p0;
+
+            p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
+            p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
+        }
+
+        {
+            v16u8 q2_asub_q0;
+
+            q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
+            is_less_than_beta = (q2_asub_q0 < beta);
+        }
+
+        is_less_than_beta = is_less_than_beta & tmp_flag;
+        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
+
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        /* right */
+        {
+            v16u8 is_less_than_beta_r;
+
+            is_less_than_beta_r =
+                (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v(is_less_than_beta_r)) {
+                v8i16 q3_org_r;
+
+                ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
+                                         q2_r, p1_org_r, q0_r, q1_r, q2_r);
+            }
+        }
+        /* left */
+        {
+            v16u8 is_less_than_beta_l;
+
+            is_less_than_beta_l =
+                (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+            if (!__msa_test_bz_v(is_less_than_beta_l)) {
+                v8i16 q3_org_l;
+
+                ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
+                                         q2_l, p1_org_l, q0_l, q1_l, q2_l);
+            }
+        }
+        /* combine and store */
+        if (!__msa_test_bz_v(is_less_than_beta)) {
+            v16u8 q0, q1, q2;
+
+            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
+            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
+            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
+        }
+
+        /* right */
+        {
+            v16u8 negate_is_less_than_beta_r;
+
+            negate_is_less_than_beta_r =
+                (v16u8) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v(negate_is_less_than_beta_r)) {
+                AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
+            }
+        }
+        /* left */
+        {
+            v16u8 negate_is_less_than_beta_l;
+
+            negate_is_less_than_beta_l =
+                (v16u8) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
+            if (!__msa_test_bz_v(negate_is_less_than_beta_l)) {
+                AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
+            }
+        }
+        if (!__msa_test_bz_v(negate_is_less_than_beta)) {
+            v16u8 q0;
+
+            q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
+            q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
+        }
+    }
+    {
+        v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+        ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
+        ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
+        ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
+
+        ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
+        ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
+
+        src = data - 3;
+        ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
+        ST2x4_UB(tmp2, 0, src + 4, img_width);
+        src += 4 * img_width;
+        ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
+        ST2x4_UB(tmp2, 4, src + 4, img_width);
+        src += 4 * img_width;
+
+        ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
+        ST2x4_UB(tmp5, 0, src + 4, img_width);
+        src += 4 * img_width;
+        ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
+        ST2x4_UB(tmp5, 4, src + 4, img_width);
+    }
+}
+
+static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
+                                                   int32_t alpha_in,
+                                                   int32_t beta_in)
+{
+    uint64_t load0, load1;
+    uint32_t out0, out2;
+    uint16_t out1, out3;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
+    v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
+    v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v16u8 alpha, beta;
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
+    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
+    v16u8 is_less_than_beta1, is_less_than_beta2;
+    v16i8 src0 = { 0 };
+    v16i8 src1 = { 0 };
+    v16i8 src2 = { 0 };
+    v16i8 src3 = { 0 };
+    v16i8 src4 = { 0 };
+    v16i8 src5 = { 0 };
+    v16i8 src6 = { 0 };
+    v16i8 src7 = { 0 };
+    v16i8 zeros = { 0 };
+
+    load0 = LD(src - 4);
+    load1 = LD(src + stride - 4);
+    src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
+    src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
+
+    load0 = LD(src + (2 * stride) - 4);
+    load1 = LD(src + (3 * stride) - 4);
+    src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
+    src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
+
+    load0 = LD(src + (4 * stride) - 4);
+    load1 = LD(src + (5 * stride) - 4);
+    src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
+    src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
+
+    load0 = LD(src + (6 * stride) - 4);
+    load1 = LD(src + (7 * stride) - 4);
+    src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
+    src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
+               src0, src1, src2, src3);
+
+    ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
+    ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
+
+    ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
+    ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
+    SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8);
+
+    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
+    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
+    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+    is_less_than_beta = (p1_asub_p0 < beta);
+    is_less_than = is_less_than_alpha & is_less_than_beta;
+    is_less_than_beta = (q1_asub_q0 < beta);
+    is_less_than = is_less_than & is_less_than_beta;
+
+    alpha >>= 2;
+    alpha += 2;
+
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+
+    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
+    is_less_than_beta1 = (p2_asub_p0 < beta);
+    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
+    is_less_than_beta2 = (q2_asub_q0 < beta);
+
+    ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
+               src0_r, src1_r, src2_r, src3_r);
+    ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
+               src4_r, src5_r, src6_r, src7_r);
+
+    dst2_x_r = src1_r + src2_r + src3_r;
+    dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
+    dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
+    dst1_r = src0_r + src1_r + src2_r + src3_r;
+    dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
+
+    dst0_r = (2 * src6_r) + (3 * src0_r);
+    dst0_r += src1_r + src2_r + src3_r;
+    dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
+    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
+    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
+
+    PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
+    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
+
+    dst3_x_r = src2_r + src3_r + src4_r;
+    dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
+    dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
+    dst4_r = src2_r + src3_r + src4_r + src5_r;
+    dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
+
+    dst5_r = (2 * src7_r) + (3 * src5_r);
+    dst5_r += src4_r + src3_r + src2_r;
+    dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
+    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
+    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
+
+    PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
+    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
+
+    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
+    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
+    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
+    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
+
+    PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
+
+    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
+    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
+    dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
+    dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
+
+    is_less_than = is_less_than_alpha & is_less_than;
+    dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
+    is_less_than_beta1 = is_less_than_beta1 & is_less_than;
+    dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
+
+    dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
+    dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
+    dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
+    is_less_than_beta2 = is_less_than_beta2 & is_less_than;
+    dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
+    dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
+    dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
+
+    ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
+    dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
+    ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
+    ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
+
+    ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
+    SLDI_B2_0_UB(dst0, dst4, dst1, dst5, 8);
+    dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
+    dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
+    SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8);
+
+    out0 = __msa_copy_u_w((v4i32) dst0, 0);
+    out1 = __msa_copy_u_h((v8i16) dst0, 2);
+    out2 = __msa_copy_u_w((v4i32) dst1, 0);
+    out3 = __msa_copy_u_h((v8i16) dst1, 2);
+
+    SW(out0, (src - 3));
+    SH(out1, (src + 1));
+    src += stride;
+    SW(out2, (src - 3));
+    SH(out3, (src + 1));
+    src += stride;
+
+    out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
+    out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
+    out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
+    out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
+
+    SW(out0, (src - 3));
+    SH(out1, (src + 1));
+    src += stride;
+    SW(out2, (src - 3));
+    SH(out3, (src + 1));
+    src += stride;
+
+    out0 = __msa_copy_u_w((v4i32) dst4, 0);
+    out1 = __msa_copy_u_h((v8i16) dst4, 2);
+    out2 = __msa_copy_u_w((v4i32) dst5, 0);
+    out3 = __msa_copy_u_h((v8i16) dst5, 2);
+
+    SW(out0, (src - 3));
+    SH(out1, (src + 1));
+    src += stride;
+    SW(out2, (src - 3));
+    SH(out3, (src + 1));
+    src += stride;
+
+    out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
+    out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
+    out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
+    out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
+
+    SW(out0, (src - 3));
+    SH(out1, (src + 1));
+    src += stride;
+    SW(out2, (src - 3));
+    SH(out3, (src + 1));
+}
+
+static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
+                                                       uint8_t alpha_in,
+                                                       uint8_t beta_in,
+                                                       uint32_t img_width)
+{
+    v16u8 alpha, beta;
+    v16u8 is_less_than;
+    v8i16 p0_or_q0, q0_or_p0;
+    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
+    v16i8 zero = { 0 };
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
+    v16u8 is_less_than_alpha, is_less_than_beta;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
+           p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
+
+    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
+    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
+    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
+
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+    is_less_than_beta = (p1_asub_p0 < beta);
+    is_less_than = is_less_than_beta & is_less_than_alpha;
+    is_less_than_beta = (q1_asub_q0 < beta);
+    is_less_than = is_less_than_beta & is_less_than;
+
+    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
+
+    if (!__msa_test_bz_v(is_less_than)) {
+        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
+                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
+        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
+        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
+        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
+
+        p0_or_q0_org =
+            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
+        q0_or_p0_org =
+            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
+
+        ST_UB(q0_or_p0_org, data_cb_or_cr);
+        ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
+    }
+}
+
+static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
+                                                       uint8_t alpha_in,
+                                                       uint8_t beta_in,
+                                                       uint32_t img_width)
+{
+    v8i16 tmp1;
+    v16u8 alpha, beta, is_less_than;
+    v8i16 p0_or_q0, q0_or_p0;
+    v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
+    v16i8 zero = { 0 };
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
+    v16u8 is_less_than_alpha, is_less_than_beta;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+
+    {
+        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+
+        LD_UB8((data_cb_or_cr - 2), img_width,
+               row0, row1, row2, row3, row4, row5, row6, row7);
+
+        TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                           p1_or_q1_org, p0_or_q0_org,
+                           q0_or_p0_org, q1_or_p1_org);
+    }
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
+    p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
+    q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
+
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+    is_less_than_beta = (p1_asub_p0 < beta);
+    is_less_than = is_less_than_beta & is_less_than_alpha;
+    is_less_than_beta = (q1_asub_q0 < beta);
+    is_less_than = is_less_than_beta & is_less_than;
+    is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
+
+    if (!__msa_test_bz_v(is_less_than)) {
+        ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
+                   zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
+
+        AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
+        AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
+
+        /* convert 16 bit output into 8 bit output */
+        PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
+
+        p0_or_q0_org =
+            __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
+        q0_or_p0_org =
+            __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
+        tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
+
+        data_cb_or_cr -= 1;
+        ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width);
+        data_cb_or_cr += 4 * img_width;
+        ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width);
+    }
+}
+
+static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
+                                                   uint8_t bs0, uint8_t bs1,
+                                                   uint8_t bs2, uint8_t bs3,
+                                                   uint8_t tc0, uint8_t tc1,
+                                                   uint8_t tc2, uint8_t tc3,
+                                                   uint8_t alpha_in,
+                                                   uint8_t beta_in,
+                                                   uint32_t img_width)
+{
+    uint8_t *src;
+    v16u8 beta, tmp_vec, bs = { 0 };
+    v16u8 tc = { 0 };
+    v16u8 is_less_than, is_less_than_beta;
+    v16u8 p1, p0, q0, q1;
+    v8i16 p0_r, q0_r, p1_r = { 0 };
+    v8i16 q1_r = { 0 };
+    v8i16 p0_l, q0_l, p1_l = { 0 };
+    v8i16 q1_l = { 0 };
+    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
+    v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
+    v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
+    v8i16 tc_r, tc_l;
+    v16i8 zero = { 0 };
+    v16u8 is_bs_greater_than0;
+
+    tmp_vec = (v16u8) __msa_fill_b(bs0);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs1);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs2);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs3);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
+
+    if (!__msa_test_bz_v(bs)) {
+        tmp_vec = (v16u8) __msa_fill_b(tc0);
+        tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc1);
+        tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc2);
+        tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc3);
+        tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
+
+        is_bs_greater_than0 = (zero < bs);
+
+        {
+            v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+            v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+
+            src = data;
+            src -= 4;
+
+            LD_UB8(src, img_width,
+                   row0, row1, row2, row3, row4, row5, row6, row7);
+            src += (8 * img_width);
+            LD_UB8(src, img_width,
+                   row8, row9, row10, row11, row12, row13, row14, row15);
+
+            TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                                row8, row9, row10, row11,
+                                row12, row13, row14, row15,
+                                p3_org, p2_org, p1_org, p0_org,
+                                q0_org, q1_org, q2_org, q3_org);
+        }
+        {
+            v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha;
+            v16u8 is_less_than_alpha;
+
+            p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+            p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+            q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+            alpha = (v16u8) __msa_fill_b(alpha_in);
+            beta = (v16u8) __msa_fill_b(beta_in);
+
+            is_less_than_alpha = (p0_asub_q0 < alpha);
+            is_less_than_beta = (p1_asub_p0 < beta);
+            is_less_than = is_less_than_beta & is_less_than_alpha;
+            is_less_than_beta = (q1_asub_q0 < beta);
+            is_less_than = is_less_than_beta & is_less_than;
+            is_less_than = is_less_than & is_bs_greater_than0;
+        }
+        if (!__msa_test_bz_v(is_less_than)) {
+            v16i8 negate_tc, sign_negate_tc;
+            v8i16 negate_tc_r, i16_negatetc_l;
+
+            negate_tc = zero - (v16i8) tc;
+            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
+
+            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
+
+            UNPCK_UB_SH(tc, tc_r, tc_l);
+            UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
+            UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
+            UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
+
+            {
+                v16u8 p2_asub_p0;
+                v16u8 is_less_than_beta_r, is_less_than_beta_l;
+
+                p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
+                is_less_than_beta = (p2_asub_p0 < beta);
+                is_less_than_beta = is_less_than_beta & is_less_than;
+
+                is_less_than_beta_r =
+                    (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+                if (!__msa_test_bz_v(is_less_than_beta_r)) {
+                    p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
+
+                    AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
+                                     negate_tc_r, tc_r, p1_r);
+                }
+
+                is_less_than_beta_l =
+                    (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+                if (!__msa_test_bz_v(is_less_than_beta_l)) {
+                    p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
+
+                    AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
+                                     i16_negatetc_l, tc_l, p1_l);
+                }
+            }
+
+            if (!__msa_test_bz_v(is_less_than_beta)) {
+                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
+                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
+
+                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
+                tc = tc + is_less_than_beta;
+            }
+
+            {
+                v16u8 u8_q2asub_q0;
+                v16u8 is_less_than_beta_l, is_less_than_beta_r;
+
+                u8_q2asub_q0 = __msa_asub_u_b(q2_org, q0_org);
+                is_less_than_beta = (u8_q2asub_q0 < beta);
+                is_less_than_beta = is_less_than_beta & is_less_than;
+
+                q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
+
+                is_less_than_beta_r =
+                    (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+                if (!__msa_test_bz_v(is_less_than_beta_r)) {
+                    q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
+                    AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
+                                     negate_tc_r, tc_r, q1_r);
+                }
+
+                q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
+
+                is_less_than_beta_l =
+                    (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+                if (!__msa_test_bz_v(is_less_than_beta_l)) {
+                    q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
+                    AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
+                                     i16_negatetc_l, tc_l, q1_l);
+                }
+            }
+
+            if (!__msa_test_bz_v(is_less_than_beta)) {
+                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
+                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
+
+                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
+                tc = tc + is_less_than_beta;
+            }
+
+            {
+                v8i16 threshold_r, negate_thresh_r;
+                v8i16 threshold_l, negate_thresh_l;
+                v16i8 negate_thresh, sign_negate_thresh;
+
+                negate_thresh = zero - (v16i8) tc;
+                sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
+
+                ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
+                           threshold_r, negate_thresh_r);
+
+                AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
+                             negate_thresh_r, threshold_r, p0_r, q0_r);
+
+                threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc);
+                negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
+                                                       negate_thresh);
+
+                AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
+                             negate_thresh_l, threshold_l, p0_l, q0_l);
+            }
+
+            PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
+
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
+        }
+        {
+            v16i8 tp0, tp1, tp2, tp3;
+            v8i16 tmp2, tmp5;
+            v4i32 tmp3, tmp4, tmp6, tmp7;
+            uint32_t out0, out2;
+            uint16_t out1, out3;
+
+            src = data - 3;
+
+            ILVRL_B2_SB(p1_org, p2_org, tp0, tp2);
+            ILVRL_B2_SB(q0_org, p0_org, tp1, tp3);
+            ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
+
+            ILVRL_H2_SW(tp1, tp0, tmp3, tmp4);
+            ILVRL_H2_SW(tp3, tp2, tmp6, tmp7);
+
+            out0 = __msa_copy_u_w(tmp3, 0);
+            out1 = __msa_copy_u_h(tmp2, 0);
+            out2 = __msa_copy_u_w(tmp3, 1);
+            out3 = __msa_copy_u_h(tmp2, 1);
+
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp3, 2);
+            out1 = __msa_copy_u_h(tmp2, 2);
+            out2 = __msa_copy_u_w(tmp3, 3);
+            out3 = __msa_copy_u_h(tmp2, 3);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp4, 0);
+            out1 = __msa_copy_u_h(tmp2, 4);
+            out2 = __msa_copy_u_w(tmp4, 1);
+            out3 = __msa_copy_u_h(tmp2, 5);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp4, 2);
+            out1 = __msa_copy_u_h(tmp2, 6);
+            out2 = __msa_copy_u_w(tmp4, 3);
+            out3 = __msa_copy_u_h(tmp2, 7);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp6, 0);
+            out1 = __msa_copy_u_h(tmp5, 0);
+            out2 = __msa_copy_u_w(tmp6, 1);
+            out3 = __msa_copy_u_h(tmp5, 1);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp6, 2);
+            out1 = __msa_copy_u_h(tmp5, 2);
+            out2 = __msa_copy_u_w(tmp6, 3);
+            out3 = __msa_copy_u_h(tmp5, 3);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp7, 0);
+            out1 = __msa_copy_u_h(tmp5, 4);
+            out2 = __msa_copy_u_w(tmp7, 1);
+            out3 = __msa_copy_u_h(tmp5, 5);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp7, 2);
+            out1 = __msa_copy_u_h(tmp5, 6);
+            out2 = __msa_copy_u_w(tmp7, 3);
+            out3 = __msa_copy_u_h(tmp5, 7);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+        }
+    }
+}
+
+static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
+                                                   uint8_t bs0, uint8_t bs1,
+                                                   uint8_t bs2, uint8_t bs3,
+                                                   uint8_t tc0, uint8_t tc1,
+                                                   uint8_t tc2, uint8_t tc3,
+                                                   uint8_t alpha_in,
+                                                   uint8_t beta_in,
+                                                   uint32_t image_width)
+{
+    v16u8 p2_asub_p0, u8_q2asub_q0;
+    v16u8 alpha, beta, is_less_than, is_less_than_beta;
+    v16u8 p1, p0, q0, q1;
+    v8i16 p1_r = { 0 };
+    v8i16 p0_r, q0_r, q1_r = { 0 };
+    v8i16 p1_l = { 0 };
+    v8i16 p0_l, q0_l, q1_l = { 0 };
+    v16u8 p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
+    v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
+    v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
+    v16i8 zero = { 0 };
+    v16u8 tmp_vec;
+    v16u8 bs = { 0 };
+    v16i8 tc = { 0 };
+
+    tmp_vec = (v16u8) __msa_fill_b(bs0);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs1);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs2);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs3);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
+
+    if (!__msa_test_bz_v(bs)) {
+        tmp_vec = (v16u8) __msa_fill_b(tc0);
+        tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc1);
+        tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc2);
+        tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc3);
+        tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
+
+        alpha = (v16u8) __msa_fill_b(alpha_in);
+        beta = (v16u8) __msa_fill_b(beta_in);
+
+        LD_UB5(data - (3 * image_width), image_width,
+               p2_org, p1_org, p0_org, q0_org, q1_org);
+
+        {
+            v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
+            v16u8 is_less_than_alpha, is_bs_greater_than0;
+
+            is_bs_greater_than0 = ((v16u8) zero < bs);
+            p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+            p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+            q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+            is_less_than_alpha = (p0_asub_q0 < alpha);
+            is_less_than_beta = (p1_asub_p0 < beta);
+            is_less_than = is_less_than_beta & is_less_than_alpha;
+            is_less_than_beta = (q1_asub_q0 < beta);
+            is_less_than = is_less_than_beta & is_less_than;
+            is_less_than = is_less_than & is_bs_greater_than0;
+        }
+
+        if (!__msa_test_bz_v(is_less_than)) {
+            v16i8 sign_negate_tc, negate_tc;
+            v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
+
+            q2_org = LD_UB(data + (2 * image_width));
+            negate_tc = zero - tc;
+            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
+
+            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
+
+            UNPCK_UB_SH(tc, tc_r, tc_l);
+            UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
+            UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
+            UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
+
+            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
+            is_less_than_beta = (p2_asub_p0 < beta);
+            is_less_than_beta = is_less_than_beta & is_less_than;
+            {
+                v8u16 is_less_than_beta_r, is_less_than_beta_l;
+
+                is_less_than_beta_r =
+                    (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+                if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
+                    p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
+
+                    AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
+                                     negate_tc_r, tc_r, p1_r);
+                }
+
+                is_less_than_beta_l =
+                    (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+                if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
+                    p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
+
+                    AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
+                                     i16_negatetc_l, tc_l, p1_l);
+                }
+            }
+            if (!__msa_test_bz_v(is_less_than_beta)) {
+                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
+                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
+                ST_UB(p1_org, data - (2 * image_width));
+
+                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
+                tc = tc + (v16i8) is_less_than_beta;
+            }
+
+            u8_q2asub_q0 = __msa_asub_u_b(q2_org, q0_org);
+            is_less_than_beta = (u8_q2asub_q0 < beta);
+            is_less_than_beta = is_less_than_beta & is_less_than;
+
+            {
+                v8u16 is_less_than_beta_r, is_less_than_beta_l;
+                is_less_than_beta_r =
+                    (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+
+                q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
+                if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
+                    q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
+
+                    AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
+                                     negate_tc_r, tc_r, q1_r);
+                }
+                is_less_than_beta_l =
+                    (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+
+                q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
+                if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
+                    q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
+
+                    AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
+                                     i16_negatetc_l, tc_l, q1_l);
+                }
+            }
+            if (!__msa_test_bz_v(is_less_than_beta)) {
+                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
+                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
+                ST_UB(q1_org, data + image_width);
+
+                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
+                tc = tc + (v16i8) is_less_than_beta;
+            }
+            {
+                v16i8 negate_thresh, sign_negate_thresh;
+                v8i16 threshold_r, threshold_l;
+                v8i16 negate_thresh_l, negate_thresh_r;
+
+                negate_thresh = zero - tc;
+                sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
+
+                ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
+                           threshold_r, negate_thresh_r);
+                AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
+                             negate_thresh_r, threshold_r, p0_r, q0_r);
+
+                threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
+                negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
+                                                       negate_thresh);
+                AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
+                             negate_thresh_l, threshold_l, p0_l, q0_l);
+            }
+
+            PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
+
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
+
+            ST_UB(p0_org, (data - image_width));
+            ST_UB(q0_org, data);
+        }
+    }
+}
+
+static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
+                                             int32_t alpha_in, int32_t beta_in,
+                                             int8_t *tc0)
+{
+    uint8_t *data = in;
+    uint32_t out0, out1, out2, out3;
+    uint64_t load;
+    uint32_t tc_val;
+    v16u8 alpha, beta;
+    v16i8 inp0 = { 0 };
+    v16i8 inp1 = { 0 };
+    v16i8 inp2 = { 0 };
+    v16i8 inp3 = { 0 };
+    v16i8 inp4 = { 0 };
+    v16i8 inp5 = { 0 };
+    v16i8 inp6 = { 0 };
+    v16i8 inp7 = { 0 };
+    v16i8 src0, src1, src2, src3;
+    v8i16 src4, src5, src6, src7;
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
+    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
+    v16u8 is_less_than_beta1, is_less_than_beta2;
+    v8i16 tc, tc_orig_r, tc_plus1;
+    v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
+    v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
+    v8u16 src2_r, src3_r;
+    v8i16 p2_r, p1_r, q2_r, q1_r;
+    v16u8 p2, q2, p0, q0;
+    v4i32 dst0, dst1;
+    v16i8 zeros = { 0 };
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    if (tc0[0] < 0) {
+        data += (2 * stride);
+    } else {
+        load = LD(data - 3);
+        inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
+        load = LD(data - 3 + stride);
+        inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
+        data += (2 * stride);
+    }
+
+    if (tc0[1] < 0) {
+        data += (2 * stride);
+    } else {
+        load = LD(data - 3);
+        inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
+        load = LD(data - 3 + stride);
+        inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
+        data += (2 * stride);
+    }
+
+    if (tc0[2] < 0) {
+        data += (2 * stride);
+    } else {
+        load = LD(data - 3);
+        inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
+        load = LD(data - 3 + stride);
+        inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
+        data += (2 * stride);
+    }
+
+    if (tc0[3] < 0) {
+        data += (2 * stride);
+    } else {
+        load = LD(data - 3);
+        inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
+        load = LD(data - 3 + stride);
+        inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
+        data += (2 * stride);
+    }
+
+    ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
+               src0, src1, src2, src3);
+
+    ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
+    ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
+
+    src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
+    src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
+    src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
+    src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
+    src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
+    src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
+
+    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
+    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
+    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
+    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
+    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
+
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+    is_less_than_beta = (p1_asub_p0 < beta);
+    is_less_than = is_less_than_alpha & is_less_than_beta;
+    is_less_than_beta = (q1_asub_q0 < beta);
+    is_less_than = is_less_than_beta & is_less_than;
+
+    is_less_than_beta1 = (p2_asub_p0 < beta);
+    is_less_than_beta2 = (q2_asub_q0 < beta);
+
+    p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
+    p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
+    p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
+
+    ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
+    p2_r += p0_add_q0;
+    p2_r >>= 1;
+    p2_r -= p1_r;
+    ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
+    q2_r += p0_add_q0;
+    q2_r >>= 1;
+    q2_r -= q1_r;
+
+    tc_val = LW(tc0);
+    tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
+    tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
+    is_tc_orig1 = tc_orig;
+    is_tc_orig2 = tc_orig;
+    tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
+    tc = tc_orig_r;
+
+    p2_r = CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
+    q2_r = CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
+
+    p2_r += p1_r;
+    q2_r += q1_r;
+
+    PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
+
+    is_tc_orig1 = (zeros < is_tc_orig1);
+    is_tc_orig2 = is_tc_orig1;
+    is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
+    is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
+    is_tc_orig1 = is_less_than & is_tc_orig1;
+    is_tc_orig2 = is_less_than & is_tc_orig2;
+
+    p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
+    q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
+
+    q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
+    q0_sub_p0 <<= 2;
+    p1_sub_q1 = p1_r - q1_r;
+    q0_sub_p0 += p1_sub_q1;
+    q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
+
+    tc_plus1 = tc + 1;
+    is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
+                                              (v16i8) is_less_than_beta1);
+    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
+    tc_plus1 = tc + 1;
+    is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
+                                              (v16i8) is_less_than_beta2);
+    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
+
+    q0_sub_p0 = CLIP_SH(q0_sub_p0, -tc, tc);
+
+    ILVR_B2_UH(zeros, src2, zeros, src3, src2_r, src3_r);
+    src2_r += q0_sub_p0;
+    src3_r -= q0_sub_p0;
+
+    src2_r = (v8u16) CLIP_SH_0_255(src2_r);
+    src3_r = (v8u16) CLIP_SH_0_255(src3_r);
+
+    PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
+
+    p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
+    q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
+
+    ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
+
+    ILVRL_H2_SW(q2, p2, dst0, dst1);
+
+    data = in;
+
+    out0 = __msa_copy_u_w(dst0, 0);
+    out1 = __msa_copy_u_w(dst0, 1);
+    out2 = __msa_copy_u_w(dst0, 2);
+    out3 = __msa_copy_u_w(dst0, 3);
+
+    if (tc0[0] < 0) {
+        data += (2 * stride);
+    } else {
+        SW(out0, (data - 2));
+        data += stride;
+        SW(out1, (data - 2));
+        data += stride;
+    }
+
+    if (tc0[1] < 0) {
+        data += (2 * stride);
+    } else {
+        SW(out2, (data - 2));
+        data += stride;
+        SW(out3, (data - 2));
+        data += stride;
+    }
+
+    out0 = __msa_copy_u_w(dst1, 0);
+    out1 = __msa_copy_u_w(dst1, 1);
+    out2 = __msa_copy_u_w(dst1, 2);
+    out3 = __msa_copy_u_w(dst1, 3);
+
+    if (tc0[2] < 0) {
+        data += (2 * stride);
+    } else {
+        SW(out0, (data - 2));
+        data += stride;
+        SW(out1, (data - 2));
+        data += stride;
+    }
+
+    if (tc0[3] >= 0) {
+        SW(out2, (data - 2));
+        data += stride;
+        SW(out3, (data - 2));
+    }
+}
+
+static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
+                                                       uint8_t bs0, uint8_t bs1,
+                                                       uint8_t bs2, uint8_t bs3,
+                                                       uint8_t tc0, uint8_t tc1,
+                                                       uint8_t tc2, uint8_t tc3,
+                                                       uint8_t alpha_in,
+                                                       uint8_t beta_in,
+                                                       uint32_t img_width)
+{
+    v16u8 alpha, beta;
+    v8i16 tmp_vec;
+    v8i16 bs = { 0 };
+    v8i16 tc = { 0 };
+    v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
+    v16u8 is_less_than;
+    v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
+    v8i16 p0_r, q0_r;
+    v16u8 p1_org, p0_org, q0_org, q1_org;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+    v16i8 negate_tc, sign_negate_tc;
+    v8i16 tc_r, negate_tc_r;
+    v16i8 zero = { 0 };
+
+    tmp_vec = (v8i16) __msa_fill_b(bs0);
+    bs = __msa_insve_h(bs, 0, tmp_vec);
+    tmp_vec = (v8i16) __msa_fill_b(bs1);
+    bs = __msa_insve_h(bs, 1, tmp_vec);
+    tmp_vec = (v8i16) __msa_fill_b(bs2);
+    bs = __msa_insve_h(bs, 2, tmp_vec);
+    tmp_vec = (v8i16) __msa_fill_b(bs3);
+    bs = __msa_insve_h(bs, 3, tmp_vec);
+
+    if (!__msa_test_bz_v((v16u8) bs)) {
+        tmp_vec = (v8i16) __msa_fill_b(tc0);
+        tc = __msa_insve_h(tc, 0, tmp_vec);
+        tmp_vec = (v8i16) __msa_fill_b(tc1);
+        tc = __msa_insve_h(tc, 1, tmp_vec);
+        tmp_vec = (v8i16) __msa_fill_b(tc2);
+        tc = __msa_insve_h(tc, 2, tmp_vec);
+        tmp_vec = (v8i16) __msa_fill_b(tc3);
+        tc = __msa_insve_h(tc, 3, tmp_vec);
+
+        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
+
+        alpha = (v16u8) __msa_fill_b(alpha_in);
+        beta = (v16u8) __msa_fill_b(beta_in);
+
+        LD_UB4(data - (img_width << 1), img_width,
+               p1_org, p0_org, q0_org, q1_org);
+
+        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+        is_less_than_alpha = (p0_asub_q0 < alpha);
+        is_less_than_beta = (p1_asub_p0 < beta);
+        is_less_than = is_less_than_beta & is_less_than_alpha;
+        is_less_than_beta = (q1_asub_q0 < beta);
+        is_less_than = is_less_than_beta & is_less_than;
+        is_less_than = is_less_than & is_bs_greater_than0;
+
+        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
+
+        if (!__msa_test_bz_v(is_less_than)) {
+            negate_tc = zero - (v16i8) tc;
+            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
+
+            ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
+
+            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
+                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);
+
+            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
+                         tc_r, p0_r, q0_r);
+
+            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
+
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
+
+            ST_UB(q0_org, data);
+            ST_UB(p0_org, (data - img_width));
+        }
+    }
+}
+
+static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
+                                                       uint8_t bs0, uint8_t bs1,
+                                                       uint8_t bs2, uint8_t bs3,
+                                                       uint8_t tc0, uint8_t tc1,
+                                                       uint8_t tc2, uint8_t tc3,
+                                                       uint8_t alpha_in,
+                                                       uint8_t beta_in,
+                                                       uint32_t img_width)
+{
+    uint8_t *src;
+    v16u8 alpha, beta;
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
+    v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
+    v16u8 p0, q0;
+    v8i16 p0_r = { 0 };
+    v8i16 q0_r = { 0 };
+    v16u8 p1_org, p0_org, q0_org, q1_org;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+    v16u8 is_bs_greater_than0;
+    v8i16 tc_r, negate_tc_r;
+    v16i8 negate_tc, sign_negate_tc;
+    v16i8 zero = { 0 };
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v8i16 tmp1, tmp_vec, bs = { 0 };
+    v8i16 tc = { 0 };
+
+    tmp_vec = (v8i16) __msa_fill_b(bs0);
+    bs = __msa_insve_h(bs, 0, tmp_vec);
+    tmp_vec = (v8i16) __msa_fill_b(bs1);
+    bs = __msa_insve_h(bs, 1, tmp_vec);
+    tmp_vec = (v8i16) __msa_fill_b(bs2);
+    bs = __msa_insve_h(bs, 2, tmp_vec);
+    tmp_vec = (v8i16) __msa_fill_b(bs3);
+    bs = __msa_insve_h(bs, 3, tmp_vec);
+
+    if (!__msa_test_bz_v((v16u8) bs)) {
+        tmp_vec = (v8i16) __msa_fill_b(tc0);
+        tc = __msa_insve_h(tc, 0, tmp_vec);
+        tmp_vec = (v8i16) __msa_fill_b(tc1);
+        tc = __msa_insve_h(tc, 1, tmp_vec);
+        tmp_vec = (v8i16) __msa_fill_b(tc2);
+        tc = __msa_insve_h(tc, 2, tmp_vec);
+        tmp_vec = (v8i16) __msa_fill_b(tc3);
+        tc = __msa_insve_h(tc, 3, tmp_vec);
+
+        is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
+
+        LD_UB8((data - 2), img_width,
+               row0, row1, row2, row3, row4, row5, row6, row7);
+
+        TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
+                           row4, row5, row6, row7,
+                           p1_org, p0_org, q0_org, q1_org);
+
+        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+        alpha = (v16u8) __msa_fill_b(alpha_in);
+        beta = (v16u8) __msa_fill_b(beta_in);
+
+        is_less_than_alpha = (p0_asub_q0 < alpha);
+        is_less_than_beta = (p1_asub_p0 < beta);
+        is_less_than = is_less_than_beta & is_less_than_alpha;
+        is_less_than_beta = (q1_asub_q0 < beta);
+        is_less_than = is_less_than_beta & is_less_than;
+        is_less_than = is_bs_greater_than0 & is_less_than;
+
+        is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
+
+        if (!__msa_test_bz_v(is_less_than)) {
+            ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
+                       p1_org_r, p0_org_r, q0_org_r, q1_org_r);
+
+            negate_tc = zero - (v16i8) tc;
+            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
+
+            ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
+
+            AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
+                         tc_r, p0_r, q0_r);
+
+            PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
+
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
+            tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
+            src = data - 1;
+            ST2x4_UB(tmp1, 0, src, img_width);
+            src += 4 * img_width;
+            ST2x4_UB(tmp1, 4, src, img_width);
+        }
+    }
+}
+
+static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
+                                            int32_t alpha_in, int32_t beta_in,
+                                            int8_t *tc0)
+{
+    int32_t col, tc_val;
+    v16u8 alpha, beta, res;
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    for (col = 0; col < 4; col++) {
+        tc_val = (tc0[col] - 1) + 1;
+
+        if (tc_val <= 0) {
+            src += (4 * stride);
+            continue;
+        }
+
+        AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
+        ST2x4_UB(res, 0, (src - 1), stride);
+        src += (4 * stride);
+    }
+}
+
+static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride,
+                                                  int32_t alpha_in,
+                                                  int32_t beta_in,
+                                                  int8_t *tc0)
+{
+    int32_t col, tc_val;
+    int16_t out0, out1;
+    v16u8 alpha, beta, res;
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    for (col = 0; col < 4; col++) {
+        tc_val = (tc0[col] - 1) + 1;
+
+        if (tc_val <= 0) {
+            src += 4 * stride;
+            continue;
+        }
+
+        AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
+
+        out0 = __msa_copy_s_h((v8i16) res, 0);
+        out1 = __msa_copy_s_h((v8i16) res, 1);
+
+        SH(out0, (src - 1));
+        src += stride;
+        SH(out1, (src - 1));
+        src += stride;
+    }
+}
+
+void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, int img_width,
+                                  int alpha, int beta, int8_t *tc)
+{
+    uint8_t bs0 = 1;
+    uint8_t bs1 = 1;
+    uint8_t bs2 = 1;
+    uint8_t bs3 = 1;
+
+    if (tc[0] < 0)
+        bs0 = 0;
+    if (tc[1] < 0)
+        bs1 = 0;
+    if (tc[2] < 0)
+        bs2 = 0;
+    if (tc[3] < 0)
+        bs3 = 0;
+
+    avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
+                                           tc[0], tc[1], tc[2], tc[3],
+                                           alpha, beta, img_width);
+}
+
+void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, int img_width,
+                                  int alpha, int beta, int8_t *tc)
+{
+
+    uint8_t bs0 = 1;
+    uint8_t bs1 = 1;
+    uint8_t bs2 = 1;
+    uint8_t bs3 = 1;
+
+    if (tc[0] < 0)
+        bs0 = 0;
+    if (tc[1] < 0)
+        bs1 = 0;
+    if (tc[2] < 0)
+        bs2 = 0;
+    if (tc[3] < 0)
+        bs3 = 0;
+
+    avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
+                                           tc[0], tc[1], tc[2], tc[3],
+                                           alpha, beta, img_width);
+}
+
+void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, int img_width,
+                                    int alpha, int beta, int8_t *tc)
+{
+    uint8_t bs0 = 1;
+    uint8_t bs1 = 1;
+    uint8_t bs2 = 1;
+    uint8_t bs3 = 1;
+
+    if (tc[0] < 0)
+        bs0 = 0;
+    if (tc[1] < 0)
+        bs1 = 0;
+    if (tc[2] < 0)
+        bs2 = 0;
+    if (tc[3] < 0)
+        bs3 = 0;
+
+    avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
+                                               tc[0], tc[1], tc[2], tc[3],
+                                               alpha, beta, img_width);
+}
+
+void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, int img_width,
+                                    int alpha, int beta, int8_t *tc)
+{
+    uint8_t bs0 = 1;
+    uint8_t bs1 = 1;
+    uint8_t bs2 = 1;
+    uint8_t bs3 = 1;
+
+    if (tc[0] < 0)
+        bs0 = 0;
+    if (tc[1] < 0)
+        bs1 = 0;
+    if (tc[2] < 0)
+        bs2 = 0;
+    if (tc[3] < 0)
+        bs3 = 0;
+
+    avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
+                                               tc[0], tc[1], tc[2], tc[3],
+                                               alpha, beta, img_width);
+}
+
+void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, int img_width,
+                                  int alpha, int beta)
+{
+    avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
+                                           (uint8_t) beta,
+                                           (unsigned int) img_width);
+}
+
+void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, int img_width,
+                                  int alpha, int beta)
+{
+    avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
+                                           (uint8_t) beta,
+                                           (unsigned int) img_width);
+}
+
+void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, int img_width,
+                                    int alpha, int beta)
+{
+    avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
+                                               (uint8_t) beta,
+                                               (unsigned int) img_width);
+}
+
+void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, int img_width,
+                                    int alpha, int beta)
+{
+    avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
+                                               (uint8_t) beta,
+                                               (unsigned int) img_width);
+}
+
+void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
+                                         int32_t ystride,
+                                         int32_t alpha, int32_t beta,
+                                         int8_t *tc0)
+{
+    avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
+}
+
+void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
+                                               int32_t ystride,
+                                               int32_t alpha,
+                                               int32_t beta,
+                                               int8_t *tc0)
+{
+    avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
+}
+
+void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
+                                          int32_t ystride,
+                                          int32_t alpha,
+                                          int32_t beta,
+                                          int8_t *tc0)
+{
+    avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
+}
+
+void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
+                                                int32_t ystride,
+                                                int32_t alpha,
+                                                int32_t beta)
+{
+    avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
+}
+
+void ff_weight_h264_pixels16_8_msa(uint8_t *src, int stride,
+                                   int height, int log2_denom,
+                                   int weight_src, int offset)
+{
+    avc_wgt_16width_msa(src, stride, height, log2_denom, weight_src, offset);
+}
+
+void ff_weight_h264_pixels8_8_msa(uint8_t *src, int stride,
+                                  int height, int log2_denom,
+                                  int weight_src, int offset)
+{
+    avc_wgt_8width_msa(src, stride, height, log2_denom, weight_src, offset);
+}
+
+void ff_weight_h264_pixels4_8_msa(uint8_t *src, int stride,
+                                  int height, int log2_denom,
+                                  int weight_src, int offset)
+{
+    avc_wgt_4width_msa(src, stride, height, log2_denom, weight_src, offset);
+}
+
+void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
+                                     int stride, int height,
+                                     int log2_denom, int weight_dst,
+                                     int weight_src, int offset)
+{
+    avc_biwgt_16width_msa(src, stride, dst, stride, height, log2_denom,
+                          weight_src, weight_dst, offset);
+}
+
+void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
+                                    int stride, int height,
+                                    int log2_denom, int weight_dst,
+                                    int weight_src, int offset)
+{
+    avc_biwgt_8width_msa(src, stride, dst, stride, height, log2_denom,
+                         weight_src, weight_dst, offset);
+}
+
+void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
+                                    int stride, int height,
+                                    int log2_denom, int weight_dst,
+                                    int weight_src, int offset)
+{
+    avc_biwgt_4width_msa(src, stride, dst, stride, height, log2_denom,
+                         weight_src, weight_dst, offset);
+}
diff --git a/libavcodec/mips/h264idct_msa.c b/libavcodec/mips/h264idct_msa.c
new file mode 100644
index 0000000000..fac1e7add4
--- /dev/null
+++ b/libavcodec/mips/h264idct_msa.c
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264dsp_mips.h"
+#include "libavcodec/bit_depth_template.c"
+
+#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)          \
+{                                                                         \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+                                                                          \
+    tmp0_m = in0 + in2;                                                   \
+    tmp1_m = in0 - in2;                                                   \
+    tmp2_m = in1 >> 1;                                                    \
+    tmp2_m = tmp2_m - in3;                                                \
+    tmp3_m = in3 >> 1;                                                    \
+    tmp3_m = in1 + tmp3_m;                                                \
+                                                                          \
+    BUTTERFLY_4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3);  \
+}
+
+static void avc_idct4x4_addblk_msa(uint8_t *dst, int16_t *src,
+                                   int32_t dst_stride)
+{
+    v8i16 src0, src1, src2, src3;
+    v8i16 hres0, hres1, hres2, hres3;
+    v8i16 vres0, vres1, vres2, vres3;
+    v8i16 zeros = { 0 };
+
+    LD4x4_SH(src, src0, src1, src2, src3);
+    AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
+    TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
+    AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
+    SRARI_H4_SH(vres0, vres1, vres2, vres3, 6);
+    ADDBLK_ST4x4_UB(vres0, vres1, vres2, vres3, dst, dst_stride);
+    ST_SH2(zeros, zeros, src, 8);
+}
+
+static void avc_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
+                                      int32_t dst_stride)
+{
+    int16_t dc;
+    uint32_t src0, src1, src2, src3;
+    v16u8 pred = { 0 };
+    v16i8 out;
+    v8i16 input_dc, pred_r, pred_l;
+
+    dc = (src[0] + 32) >> 6;
+    input_dc = __msa_fill_h(dc);
+    src[0] = 0;
+
+    LW4(dst, dst_stride, src0, src1, src2, src3);
+    INSERT_W4_UB(src0, src1, src2, src3, pred);
+    UNPCK_UB_SH(pred, pred_r, pred_l);
+
+    pred_r += input_dc;
+    pred_l += input_dc;
+
+    CLIP_SH2_0_255(pred_r, pred_l);
+    out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
+                                     int32_t de_q_val)
+{
+#define DC_DEST_STRIDE 16
+    int16_t out0, out1, out2, out3;
+    v8i16 src0, src1, src2, src3;
+    v8i16 vec0, vec1, vec2, vec3;
+    v8i16 hres0, hres1, hres2, hres3;
+    v8i16 vres0, vres1, vres2, vres3;
+    v4i32 vres0_r, vres1_r, vres2_r, vres3_r;
+    v4i32 de_q_vec = __msa_fill_w(de_q_val);
+
+    LD4x4_SH(src, src0, src1, src2, src3);
+    TRANSPOSE4x4_SH_SH(src0, src1, src2, src3, src0, src1, src2, src3);
+    BUTTERFLY_4(src0, src2, src3, src1, vec0, vec3, vec2, vec1);
+    BUTTERFLY_4(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
+    TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
+    BUTTERFLY_4(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
+    BUTTERFLY_4(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
+    UNPCK_R_SH_SW(vres0, vres0_r);
+    UNPCK_R_SH_SW(vres1, vres1_r);
+    UNPCK_R_SH_SW(vres2, vres2_r);
+    UNPCK_R_SH_SW(vres3, vres3_r);
+
+    vres0_r *= de_q_vec;
+    vres1_r *= de_q_vec;
+    vres2_r *= de_q_vec;
+    vres3_r *= de_q_vec;
+
+    SRARI_W4_SW(vres0_r, vres1_r, vres2_r, vres3_r, 8);
+    PCKEV_H2_SH(vres1_r, vres0_r, vres3_r, vres2_r, vec0, vec1);
+
+    out0 = __msa_copy_s_h(vec0, 0);
+    out1 = __msa_copy_s_h(vec0, 1);
+    out2 = __msa_copy_s_h(vec0, 2);
+    out3 = __msa_copy_s_h(vec0, 3);
+    SH(out0, dst);
+    SH(out1, (dst + 2 * DC_DEST_STRIDE));
+    SH(out2, (dst + 8 * DC_DEST_STRIDE));
+    SH(out3, (dst + 10 * DC_DEST_STRIDE));
+    dst += DC_DEST_STRIDE;
+
+    out0 = __msa_copy_s_h(vec0, 4);
+    out1 = __msa_copy_s_h(vec0, 5);
+    out2 = __msa_copy_s_h(vec0, 6);
+    out3 = __msa_copy_s_h(vec0, 7);
+    SH(out0, dst);
+    SH(out1, (dst + 2 * DC_DEST_STRIDE));
+    SH(out2, (dst + 8 * DC_DEST_STRIDE));
+    SH(out3, (dst + 10 * DC_DEST_STRIDE));
+    dst += (3 * DC_DEST_STRIDE);
+
+    out0 = __msa_copy_s_h(vec1, 0);
+    out1 = __msa_copy_s_h(vec1, 1);
+    out2 = __msa_copy_s_h(vec1, 2);
+    out3 = __msa_copy_s_h(vec1, 3);
+    SH(out0, dst);
+    SH(out1, (dst + 2 * DC_DEST_STRIDE));
+    SH(out2, (dst + 8 * DC_DEST_STRIDE));
+    SH(out3, (dst + 10 * DC_DEST_STRIDE));
+    dst += DC_DEST_STRIDE;
+
+    out0 = __msa_copy_s_h(vec1, 4);
+    out1 = __msa_copy_s_h(vec1, 5);
+    out2 = __msa_copy_s_h(vec1, 6);
+    out3 = __msa_copy_s_h(vec1, 7);
+    SH(out0, dst);
+    SH(out1, (dst + 2 * DC_DEST_STRIDE));
+    SH(out2, (dst + 8 * DC_DEST_STRIDE));
+    SH(out3, (dst + 10 * DC_DEST_STRIDE));
+
+#undef DC_DEST_STRIDE
+}
+
+static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
+{
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 vec0, vec1, vec2, vec3;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
+    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
+    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
+    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
+    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
+    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 zeros = { 0 };
+
+    src[0] += 32;
+
+    LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+
+    vec0 = src0 + src4;
+    vec1 = src0 - src4;
+    vec2 = src2 >> 1;
+    vec2 = vec2 - src6;
+    vec3 = src6 >> 1;
+    vec3 = src2 + vec3;
+
+    BUTTERFLY_4(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);
+
+    vec0 = src7 >> 1;
+    vec0 = src5 - vec0 - src3 - src7;
+    vec1 = src3 >> 1;
+    vec1 = src1 - vec1 + src7 - src3;
+    vec2 = src5 >> 1;
+    vec2 = vec2 - src1 + src7 + src5;
+    vec3 = src1 >> 1;
+    vec3 = vec3 + src3 + src5 + src1;
+    tmp4 = vec3 >> 2;
+    tmp4 += vec0;
+    tmp5 = vec2 >> 2;
+    tmp5 += vec1;
+    tmp6 = vec1 >> 2;
+    tmp6 -= vec2;
+    tmp7 = vec0 >> 2;
+    tmp7 = vec3 - tmp7;
+
+    BUTTERFLY_8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                res0, res1, res2, res3, res4, res5, res6, res7);
+    TRANSPOSE8x8_SH_SH(res0, res1, res2, res3, res4, res5, res6, res7,
+                       res0, res1, res2, res3, res4, res5, res6, res7);
+    UNPCK_SH_SW(res0, tmp0_r, tmp0_l);
+    UNPCK_SH_SW(res1, tmp1_r, tmp1_l);
+    UNPCK_SH_SW(res2, tmp2_r, tmp2_l);
+    UNPCK_SH_SW(res3, tmp3_r, tmp3_l);
+    UNPCK_SH_SW(res4, tmp4_r, tmp4_l);
+    UNPCK_SH_SW(res5, tmp5_r, tmp5_l);
+    UNPCK_SH_SW(res6, tmp6_r, tmp6_l);
+    UNPCK_SH_SW(res7, tmp7_r, tmp7_l);
+    BUTTERFLY_4(tmp0_r, tmp0_l, tmp4_l, tmp4_r, vec0_r, vec0_l, vec1_l, vec1_r);
+
+    vec2_r = tmp2_r >> 1;
+    vec2_l = tmp2_l >> 1;
+    vec2_r -= tmp6_r;
+    vec2_l -= tmp6_l;
+    vec3_r = tmp6_r >> 1;
+    vec3_l = tmp6_l >> 1;
+    vec3_r += tmp2_r;
+    vec3_l += tmp2_l;
+
+    BUTTERFLY_4(vec0_r, vec1_r, vec2_r, vec3_r, tmp0_r, tmp2_r, tmp4_r, tmp6_r);
+    BUTTERFLY_4(vec0_l, vec1_l, vec2_l, vec3_l, tmp0_l, tmp2_l, tmp4_l, tmp6_l);
+
+    vec0_r = tmp7_r >> 1;
+    vec0_l = tmp7_l >> 1;
+    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
+    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
+    vec1_r = tmp3_r >> 1;
+    vec1_l = tmp3_l >> 1;
+    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
+    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
+    vec2_r = tmp5_r >> 1;
+    vec2_l = tmp5_l >> 1;
+    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
+    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
+    vec3_r = tmp1_r >> 1;
+    vec3_l = tmp1_l >> 1;
+    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
+    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
+    tmp1_r = vec3_r >> 2;
+    tmp1_l = vec3_l >> 2;
+    tmp1_r += vec0_r;
+    tmp1_l += vec0_l;
+    tmp3_r = vec2_r >> 2;
+    tmp3_l = vec2_l >> 2;
+    tmp3_r += vec1_r;
+    tmp3_l += vec1_l;
+    tmp5_r = vec1_r >> 2;
+    tmp5_l = vec1_l >> 2;
+    tmp5_r -= vec2_r;
+    tmp5_l -= vec2_l;
+    tmp7_r = vec0_r >> 2;
+    tmp7_l = vec0_l >> 2;
+    tmp7_r = vec3_r - tmp7_r;
+    tmp7_l = vec3_l - tmp7_l;
+
+    BUTTERFLY_4(tmp0_r, tmp0_l, tmp7_l, tmp7_r, res0_r, res0_l, res7_l, res7_r);
+    BUTTERFLY_4(tmp2_r, tmp2_l, tmp5_l, tmp5_r, res1_r, res1_l, res6_l, res6_r);
+    BUTTERFLY_4(tmp4_r, tmp4_l, tmp3_l, tmp3_r, res2_r, res2_l, res5_l, res5_r);
+    BUTTERFLY_4(tmp6_r, tmp6_l, tmp1_l, tmp1_r, res3_r, res3_l, res4_l, res4_r);
+    SRA_4V(res0_r, res0_l, res1_r, res1_l, 6);
+    SRA_4V(res2_r, res2_l, res3_r, res3_l, 6);
+    SRA_4V(res4_r, res4_l, res5_r, res5_l, 6);
+    SRA_4V(res6_r, res6_l, res7_r, res7_l, 6);
+    PCKEV_H4_SH(res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
+                res0, res1, res2, res3);
+    PCKEV_H4_SH(res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
+                res4, res5, res6, res7);
+    LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
+               tmp4, tmp5, tmp6, tmp7);
+    ADD4(res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
+         res0, res1, res2, res3);
+    ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
+         res4, res5, res6, res7);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    CLIP_SH4_0_255(res4, res5, res6, res7);
+    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
+                dst0, dst1, dst2, dst3);
+    ST8x4_UB(dst0, dst1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x4_UB(dst2, dst3, dst, dst_stride);
+}
+
+static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
+                                    int32_t dst_stride)
+{
+    int32_t dc_val;
+    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v8i16 dc;
+    v16i8 zeros = { 0 };
+
+    dc_val = (src[0] + 32) >> 6;
+    dc = __msa_fill_h(dc_val);
+
+    src[0] = 0;
+
+    LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
+               dst0_r, dst1_r, dst2_r, dst3_r);
+    ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
+               dst4_r, dst5_r, dst6_r, dst7_r);
+    ADD4(dst0_r, dc, dst1_r, dc, dst2_r, dc, dst3_r, dc,
+         dst0_r, dst1_r, dst2_r, dst3_r);
+    ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc,
+         dst4_r, dst5_r, dst6_r, dst7_r);
+    CLIP_SH4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
+    CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
+    PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
+                dst0, dst1, dst2, dst3);
+    ST8x4_UB(dst0, dst1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x4_UB(dst2, dst3, dst, dst_stride);
+}
+
+void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src,
+                          int32_t dst_stride)
+{
+    avc_idct4x4_addblk_msa(dst, src, dst_stride);
+    memset(src, 0, 16 * sizeof(dctcoef));
+}
+
+void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src,
+                              int32_t dst_stride)
+{
+    avc_idct8_addblk_msa(dst, src, dst_stride);
+    memset(src, 0, 64 * sizeof(dctcoef));
+}
+
+void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
+                                   int32_t dst_stride)
+{
+    avc_idct4x4_addblk_dc_msa(dst, src, dst_stride);
+}
+
+void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
+                                 int32_t dst_stride)
+{
+    avc_idct8_dc_addblk_msa(dst, src, dst_stride);
+}
+
+void ff_h264_idct_add16_msa(uint8_t *dst,
+                            const int32_t *blk_offset,
+                            int16_t *block, int32_t dst_stride,
+                            const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 0; i < 16; i++) {
+        int32_t nnz = nzc[scan8[i]];
+
+        if (nnz) {
+            if (nnz == 1 && ((dctcoef *) block)[i * 16])
+                ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[i],
+                                              block + i * 16 * sizeof(pixel),
+                                              dst_stride);
+            else
+                ff_h264_idct_add_msa(dst + blk_offset[i],
+                                     block + i * 16 * sizeof(pixel),
+                                     dst_stride);
+        }
+    }
+}
+
+void ff_h264_idct8_add4_msa(uint8_t *dst, const int32_t *blk_offset,
+                            int16_t *block, int32_t dst_stride,
+                            const uint8_t nzc[15 * 8])
+{
+    int32_t cnt;
+
+    for (cnt = 0; cnt < 16; cnt += 4) {
+        int32_t nnz = nzc[scan8[cnt]];
+
+        if (nnz) {
+            if (nnz == 1 && ((dctcoef *) block)[cnt * 16])
+                ff_h264_idct8_dc_addblk_msa(dst + blk_offset[cnt],
+                                            block + cnt * 16 * sizeof(pixel),
+                                            dst_stride);
+            else
+                ff_h264_idct8_addblk_msa(dst + blk_offset[cnt],
+                                         block + cnt * 16 * sizeof(pixel),
+                                         dst_stride);
+        }
+    }
+}
+
+void ff_h264_idct_add8_msa(uint8_t **dst,
+                           const int32_t *blk_offset,
+                           int16_t *block, int32_t dst_stride,
+                           const uint8_t nzc[15 * 8])
+{
+    int32_t i, j;
+
+    for (j = 1; j < 3; j++) {
+        for (i = (j * 16); i < (j * 16 + 4); i++) {
+            if (nzc[scan8[i]])
+                ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i],
+                                     block + i * 16 * sizeof(pixel),
+                                     dst_stride);
+            else if (((dctcoef *) block)[i * 16])
+                ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i],
+                                              block + i * 16 * sizeof(pixel),
+                                              dst_stride);
+        }
+    }
+}
+
+void ff_h264_idct_add8_422_msa(uint8_t **dst,
+                               const int32_t *blk_offset,
+                               int16_t *block, int32_t dst_stride,
+                               const uint8_t nzc[15 * 8])
+{
+    int32_t i, j;
+
+    for (j = 1; j < 3; j++) {
+        for (i = (j * 16); i < (j * 16 + 4); i++) {
+            if (nzc[scan8[i]])
+                ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i],
+                                     block + i * 16 * sizeof(pixel),
+                                     dst_stride);
+            else if (((dctcoef *) block)[i * 16])
+                ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i],
+                                              block + i * 16 * sizeof(pixel),
+                                              dst_stride);
+        }
+    }
+
+    for (j = 1; j < 3; j++) {
+        for (i = (j * 16 + 4); i < (j * 16 + 8); i++) {
+            if (nzc[scan8[i + 4]])
+                ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i + 4],
+                                     block + i * 16 * sizeof(pixel),
+                                     dst_stride);
+            else if (((dctcoef *) block)[i * 16])
+                ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i + 4],
+                                              block + i * 16 * sizeof(pixel),
+                                              dst_stride);
+        }
+    }
+}
+
+void ff_h264_idct_add16_intra_msa(uint8_t *dst,
+                                  const int32_t *blk_offset,
+                                  int16_t *block,
+                                  int32_t dst_stride,
+                                  const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 0; i < 16; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_msa(dst + blk_offset[i],
+                                 block + i * 16 * sizeof(pixel), dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[i],
+                                          block + i * 16 * sizeof(pixel),
+                                          dst_stride);
+    }
+}
+
+void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
+                                  int32_t de_qval)
+{
+    avc_deq_idct_luma_dc_msa(dst, src, de_qval);
+}
diff --git a/libavcodec/mips/h264pred_init_mips.c b/libavcodec/mips/h264pred_init_mips.c
new file mode 100644
index 0000000000..93a2409a0f
--- /dev/null
+++ b/libavcodec/mips/h264pred_init_mips.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "h264dsp_mips.h"
+#include "h264pred_mips.h"
+
+#if HAVE_MSA
+static av_cold void h264_pred_init_msa(H264PredContext *h, int codec_id,
+                                       const int bit_depth,
+                                       const int chroma_format_idc)
+{
+    if (8 == bit_depth) {
+        if (chroma_format_idc == 1) {
+            h->pred8x8[VERT_PRED8x8] = ff_h264_intra_pred_vert_8x8_msa;
+            h->pred8x8[HOR_PRED8x8] = ff_h264_intra_pred_horiz_8x8_msa;
+        }
+
+        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) {
+            if (chroma_format_idc == 1) {
+                h->pred8x8[PLANE_PRED8x8] = ff_h264_intra_predict_plane_8x8_msa;
+            }
+        }
+        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7
+            && codec_id != AV_CODEC_ID_VP8) {
+            if (chroma_format_idc == 1) {
+                h->pred8x8[DC_PRED8x8] = ff_h264_intra_predict_dc_4blk_8x8_msa;
+                h->pred8x8[LEFT_DC_PRED8x8] =
+                    ff_h264_intra_predict_hor_dc_8x8_msa;
+                h->pred8x8[TOP_DC_PRED8x8] =
+                    ff_h264_intra_predict_vert_dc_8x8_msa;
+                h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] =
+                    ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa;
+                h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] =
+                    ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa;
+                h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] =
+                    ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa;
+                h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] =
+                    ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa;
+            }
+        } else {
+            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+                h->pred8x8[7] = ff_vp8_pred8x8_127_dc_8_msa;
+                h->pred8x8[8] = ff_vp8_pred8x8_129_dc_8_msa;
+            }
+        }
+
+        if (chroma_format_idc == 1) {
+            h->pred8x8[DC_128_PRED8x8] = ff_h264_intra_pred_dc_128_8x8_msa;
+        }
+
+        h->pred16x16[DC_PRED8x8] = ff_h264_intra_pred_dc_16x16_msa;
+        h->pred16x16[VERT_PRED8x8] = ff_h264_intra_pred_vert_16x16_msa;
+        h->pred16x16[HOR_PRED8x8] = ff_h264_intra_pred_horiz_16x16_msa;
+
+        switch (codec_id) {
+        case AV_CODEC_ID_SVQ3:
+            ;
+            break;
+        case AV_CODEC_ID_RV40:
+            ;
+            break;
+        case AV_CODEC_ID_VP7:
+        case AV_CODEC_ID_VP8:
+            h->pred16x16[7] = ff_vp8_pred16x16_127_dc_8_msa;
+            h->pred16x16[8] = ff_vp8_pred16x16_129_dc_8_msa;
+            break;
+        default:
+            h->pred16x16[PLANE_PRED8x8] =
+                ff_h264_intra_predict_plane_16x16_msa;
+            break;
+        }
+
+        h->pred16x16[LEFT_DC_PRED8x8] = ff_h264_intra_pred_dc_left_16x16_msa;
+        h->pred16x16[TOP_DC_PRED8x8] = ff_h264_intra_pred_dc_top_16x16_msa;
+        h->pred16x16[DC_128_PRED8x8] = ff_h264_intra_pred_dc_128_16x16_msa;
+    }
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void h264_pred_init_mmi(H264PredContext *h, int codec_id,
+        const int bit_depth, const int chroma_format_idc)
+{
+    if (bit_depth == 8) {
+        if (chroma_format_idc == 1) {
+            h->pred8x8  [VERT_PRED8x8       ] = ff_pred8x8_vertical_8_mmi;
+            h->pred8x8  [HOR_PRED8x8        ] = ff_pred8x8_horizontal_8_mmi;
+        } else {
+            h->pred8x8  [VERT_PRED8x8       ] = ff_pred8x16_vertical_8_mmi;
+            h->pred8x8  [HOR_PRED8x8        ] = ff_pred8x16_horizontal_8_mmi;
+        }
+
+        h->pred16x16[DC_PRED8x8             ] = ff_pred16x16_dc_8_mmi;
+        h->pred16x16[VERT_PRED8x8           ] = ff_pred16x16_vertical_8_mmi;
+        h->pred16x16[HOR_PRED8x8            ] = ff_pred16x16_horizontal_8_mmi;
+        h->pred8x8l [TOP_DC_PRED            ] = ff_pred8x8l_top_dc_8_mmi;
+        h->pred8x8l [DC_PRED                ] = ff_pred8x8l_dc_8_mmi;
+
+        switch (codec_id) {
+        case AV_CODEC_ID_SVQ3:
+            h->pred16x16[PLANE_PRED8x8      ] = ff_pred16x16_plane_svq3_8_mmi;
+            ;
+            break;
+        case AV_CODEC_ID_RV40:
+            h->pred16x16[PLANE_PRED8x8      ] = ff_pred16x16_plane_rv40_8_mmi;
+            ;
+            break;
+        case AV_CODEC_ID_VP7:
+        case AV_CODEC_ID_VP8:
+            ;
+            break;
+        default:
+            h->pred16x16[PLANE_PRED8x8      ] = ff_pred16x16_plane_h264_8_mmi;
+            break;
+        }
+
+        if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
+            if (chroma_format_idc == 1) {
+                h->pred8x8[TOP_DC_PRED8x8   ] = ff_pred8x8_top_dc_8_mmi;
+                h->pred8x8[DC_PRED8x8       ] = ff_pred8x8_dc_8_mmi;
+            }
+        }
+    }
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_h264_pred_init_mips(H264PredContext *h, int codec_id,
+                                    int bit_depth,
+                                    const int chroma_format_idc)
+{
+#if HAVE_MSA
+    h264_pred_init_msa(h, codec_id, bit_depth, chroma_format_idc);
+#endif  // #if HAVE_MSA
+#if HAVE_MMI
+    h264_pred_init_mmi(h, codec_id, bit_depth, chroma_format_idc);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/h264pred_mips.h b/libavcodec/mips/h264pred_mips.h
new file mode 100644
index 0000000000..d7d12c5a20
--- /dev/null
+++ b/libavcodec/mips/h264pred_mips.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef H264_PRED_MIPS_H
+#define H264_PRED_MIPS_H
+
+#include "constants.h"
+#include "libavcodec/h264pred.h"
+
+void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_dc_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
+        ptrdiff_t stride);
+void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
+        ptrdiff_t stride);
+void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
+        int has_topright, ptrdiff_t stride);
+void ff_pred4x4_dc_8_mmi(uint8_t *src, const uint8_t *topright,
+        ptrdiff_t stride);
+void ff_pred8x8_vertical_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_svq3_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_rv40_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_h264_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_dc_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride);
+
+#endif  /* H264_PRED_MIPS_H */
diff --git a/libavcodec/mips/h264pred_mmi.c b/libavcodec/mips/h264pred_mmi.c
new file mode 100644
index 0000000000..e949d1116a
--- /dev/null
+++ b/libavcodec/mips/h264pred_mmi.c
@@ -0,0 +1,780 @@
+/*
+ * Loongson SIMD optimized h264pred
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264pred_mips.h"
+
+void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    __asm__ volatile (
+        "dli $8, 16                         \r\n"
+        "gsldlc1 $f2, 7(%[srcA])            \r\n"
+        "gsldrc1 $f2, 0(%[srcA])            \r\n"
+        "gsldlc1 $f4, 15(%[srcA])           \r\n"
+        "gsldrc1 $f4, 8(%[srcA])            \r\n"
+        "1:                                 \r\n"
+        "gssdlc1 $f2, 7(%[src])             \r\n"
+        "gssdrc1 $f2, 0(%[src])             \r\n"
+        "gssdlc1 $f4, 15(%[src])            \r\n"
+        "gssdrc1 $f4, 8(%[src])             \r\n"
+        "daddu %[src], %[src], %[stride]    \r\n"
+        "daddi $8, $8, -1                   \r\n"
+        "bnez $8, 1b                        \r\n"
+        : [src]"+&r"(src)
+        : [stride]"r"(stride),[srcA]"r"(src-stride)
+        : "$8","$f2","$f4"
+    );
+}
+
+void ff_pred16x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    __asm__ volatile (
+        "daddiu $2, %[src], -1              \r\n"
+        "daddu $3, %[src], $0               \r\n"
+        "dli $6, 0x10                       \r\n"
+        "1:                                 \r\n"
+        "lbu $4, 0($2)                      \r\n"
+        "dmul $5, $4, %[ff_pb_1]            \r\n"
+        "sdl $5, 7($3)                      \r\n"
+        "sdr $5, 0($3)                      \r\n"
+        "sdl $5, 15($3)                     \r\n"
+        "sdr $5, 8($3)                      \r\n"
+        "daddu $2, %[stride]                \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "daddiu $6, -1                      \r\n"
+        "bnez $6, 1b                        \r\n"
+        ::[src]"r"(src),[stride]"r"(stride),[ff_pb_1]"r"(ff_pb_1)
+        : "$2","$3","$4","$5","$6"
+    );
+}
+
+void ff_pred16x16_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    __asm__ volatile (
+        "daddiu $2, %[src], -1              \r\n"
+        "dli $6, 0x10                       \r\n"
+        "xor $8, $8, $8                     \r\n"
+        "1:                                 \r\n"
+        "lbu $4, 0($2)                      \r\n"
+        "daddu $8, $8, $4                   \r\n"
+        "daddu $2, $2, %[stride]            \r\n"
+        "daddiu $6, $6, -1                  \r\n"
+        "bnez $6, 1b                        \r\n"
+        "dli $6, 0x10                       \r\n"
+        "negu $3, %[stride]                 \r\n"
+        "daddu $2, %[src], $3               \r\n"
+        "2:                                 \r\n"
+        "lbu $4, 0($2)                      \r\n"
+        "daddu $8, $8, $4                   \r\n"
+        "daddiu $2, $2, 1                   \r\n"
+        "daddiu $6, $6, -1                  \r\n"
+        "bnez $6, 2b                        \r\n"
+        "daddiu $8, $8, 0x10                \r\n"
+        "dsra $8, 5                         \r\n"
+        "dmul $5, $8, %[ff_pb_1]            \r\n"
+        "daddu $2, %[src], $0               \r\n"
+        "dli $6, 0x10                       \r\n"
+        "3:                                 \r\n"
+        "sdl $5, 7($2)                      \r\n"
+        "sdr $5, 0($2)                      \r\n"
+        "sdl $5, 15($2)                     \r\n"
+        "sdr $5, 8($2)                      \r\n"
+        "daddu $2, $2, %[stride]            \r\n"
+        "daddiu $6, $6, -1                  \r\n"
+        "bnez $6, 3b                        \r\n"
+        ::[src]"r"(src),[stride]"r"(stride),[ff_pb_1]"r"(ff_pb_1)
+        : "$2","$3","$4","$5","$6","$8"
+    );
+}
+
+void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft,
+        int has_topright, ptrdiff_t stride)
+{
+    uint32_t dc;
+
+    __asm__ volatile (
+        "ldl $8, 7(%[srcA])                 \r\n"
+        "ldr $8, 0(%[srcA])                 \r\n"
+        "ldl $9, 7(%[src0])                 \r\n"
+        "ldr $9, 0(%[src0])                 \r\n"
+        "ldl $10, 7(%[src1])                \r\n"
+        "ldr $10, 0(%[src1])                \r\n"
+        "dmtc1 $8, $f2                      \r\n"
+        "dmtc1 $9, $f4                      \r\n"
+        "dmtc1 $10, $f6                     \r\n"
+        "dmtc1 $0, $f0                      \r\n"
+        "punpcklbh $f8, $f2, $f0            \r\n"
+        "punpckhbh $f10, $f2, $f0           \r\n"
+        "punpcklbh $f12, $f4, $f0           \r\n"
+        "punpckhbh $f14, $f4, $f0           \r\n"
+        "punpcklbh $f16, $f6, $f0           \r\n"
+        "punpckhbh $f18, $f6, $f0           \r\n"
+        "bnez %[has_topleft], 1f            \r\n"
+        "pinsrh_0 $f8, $f8, $f12            \r\n"
+        "1:                                 \r\n"
+        "bnez %[has_topright], 2f           \r\n"
+        "pinsrh_3 $f18, $f18, $f14          \r\n"
+        "2:                                 \r\n"
+        "daddiu $8, $0, 2                   \r\n"
+        "dmtc1 $8, $f20                     \r\n"
+        "pshufh $f22, $f20, $f0             \r\n"
+        "pmullh $f12, $f12, $f22            \r\n"
+        "pmullh $f14, $f14, $f22            \r\n"
+        "paddh $f8, $f8, $f12               \r\n"
+        "paddh $f10, $f10, $f14             \r\n"
+        "paddh $f8, $f8, $f16               \r\n"
+        "paddh $f10, $f10, $f18             \r\n"
+        "paddh $f8, $f8, $f22               \r\n"
+        "paddh $f10, $f10, $f22             \r\n"
+        "psrah $f8, $f8, $f20               \r\n"
+        "psrah $f10, $f10, $f20             \r\n"
+        "packushb $f4, $f8, $f10            \r\n"
+        "biadd $f2, $f4                     \r\n"
+        "mfc1 $9, $f2                       \r\n"
+        "addiu $9, $9, 4                    \r\n"
+        "dsrl $9, $9, 3                     \r\n"
+        "mul %[dc], $9, %[ff_pb_1]          \r\n"
+        : [dc]"=r"(dc)
+        : [srcA]"r"(src-stride-1),[src0]"r"(src-stride),
+          [src1]"r"(src-stride+1),[has_topleft]"r"(has_topleft),
+          [has_topright]"r"(has_topright),[ff_pb_1]"r"(ff_pb_1)
+        : "$8","$9","$10","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
+          "$f18","$f20","$f22"
+    );
+
+    __asm__ volatile (
+        "dli $8, 8                          \r\n"
+        "1:                                 \r\n"
+        "punpcklwd $f2, %[dc], %[dc]        \r\n"
+        "gssdlc1 $f2, 7(%[src])             \r\n"
+        "gssdrc1 $f2, 0(%[src])             \r\n"
+        "daddu %[src], %[src], %[stride]    \r\n"
+        "daddi $8, $8, -1                   \r\n"
+        "bnez $8, 1b                        \r\n"
+        : [src]"+&r"(src)
+        : [dc]"f"(dc),[stride]"r"(stride)
+        : "$8","$f2"
+    );
+}
+
+void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft,
+        int has_topright, ptrdiff_t stride)
+{
+    uint32_t dc, dc1, dc2;
+
+    const int l0 = ((has_topleft ? src[-1+-1*stride] : src[-1+0*stride]) + 2*src[-1+0*stride] + src[-1+1*stride] + 2) >> 2;
+    const int l1 = (src[-1+0*stride] + 2*src[-1+1*stride] + src[-1+2*stride] + 2) >> 2;
+    const int l2 = (src[-1+1*stride] + 2*src[-1+2*stride] + src[-1+3*stride] + 2) >> 2;
+    const int l3 = (src[-1+2*stride] + 2*src[-1+3*stride] + src[-1+4*stride] + 2) >> 2;
+    const int l4 = (src[-1+3*stride] + 2*src[-1+4*stride] + src[-1+5*stride] + 2) >> 2;
+    const int l5 = (src[-1+4*stride] + 2*src[-1+5*stride] + src[-1+6*stride] + 2) >> 2;
+    const int l6 = (src[-1+5*stride] + 2*src[-1+6*stride] + src[-1+7*stride] + 2) >> 2;
+    const int l7 = (src[-1+6*stride] + 2*src[-1+7*stride] + src[-1+7*stride] + 2) >> 2;
+
+    __asm__ volatile (
+        "ldl $8, 7(%[srcA])                 \r\n"
+        "ldr $8, 0(%[srcA])                 \r\n"
+        "ldl $9, 7(%[src0])                 \r\n"
+        "ldr $9, 0(%[src0])                 \r\n"
+        "ldl $10, 7(%[src1])                \r\n"
+        "ldr $10, 0(%[src1])                \r\n"
+        "dmtc1 $8, $f2                      \r\n"
+        "dmtc1 $9, $f4                      \r\n"
+        "dmtc1 $10, $f6                     \r\n"
+        "dmtc1 $0, $f0                      \r\n"
+        "punpcklbh $f8, $f2, $f0            \r\n"
+        "punpckhbh $f10, $f2, $f0           \r\n"
+        "punpcklbh $f12, $f4, $f0           \r\n"
+        "punpckhbh $f14, $f4, $f0           \r\n"
+        "punpcklbh $f16, $f6, $f0           \r\n"
+        "punpckhbh $f18, $f6, $f0           \r\n"
+        "daddiu $8, $0, 3                   \r\n"
+        "dmtc1 $8, $f20                     \r\n"
+        "pshufh $f28, $f10, $f20            \r\n"
+        "pshufh $f30, $f18, $f20            \r\n"
+        "pinsrh_3 $f10, $f10, $f30          \r\n"
+        "pinsrh_3 $f18, $f18, $f28          \r\n"
+        "bnez %[has_topleft], 1f            \r\n"
+        "pinsrh_0 $f8, $f8, $f12            \r\n"
+        "1:                                 \r\n"
+        "bnez %[has_topright], 2f           \r\n"
+        "pshufh $f30, $f14, $f20            \r\n"
+        "pinsrh_3 $f10, $f10, $f30          \r\n"
+        "2:                                 \r\n"
+        "daddiu $8, $0, 2                   \r\n"
+        "dmtc1 $8, $f20                     \r\n"
+        "pshufh $f22, $f20, $f0             \r\n"
+        "pmullh $f12, $f12, $f22            \r\n"
+        "pmullh $f14, $f14, $f22            \r\n"
+        "paddh $f8, $f8, $f12               \r\n"
+        "paddh $f10, $f10, $f14             \r\n"
+        "paddh $f8, $f8, $f16               \r\n"
+        "paddh $f10, $f10, $f18             \r\n"
+        "paddh $f8, $f8, $f22               \r\n"
+        "paddh $f10, $f10, $f22             \r\n"
+        "psrah $f8, $f8, $f20               \r\n"
+        "psrah $f10, $f10, $f20             \r\n"
+        "packushb $f4, $f8, $f10            \r\n"
+        "biadd $f2, $f4                     \r\n"
+        "mfc1 %[dc2], $f2                   \r\n"
+        : [dc2]"=r"(dc2)
+        : [srcA]"r"(src-stride-1),[src0]"r"(src-stride),
+          [src1]"r"(src-stride+1),[has_topleft]"r"(has_topleft),
+          [has_topright]"r"(has_topright)
+        : "$8","$9","$10","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
+          "$f18","$f20","$f22"
+    );
+
+    dc1 = l0+l1+l2+l3+l4+l5+l6+l7;
+    dc = ((dc1+dc2+8)>>4)*0x01010101U;
+
+    __asm__ volatile (
+        "dli $8, 8                          \r\n"
+        "1:                                 \r\n"
+        "punpcklwd $f2, %[dc], %[dc]        \r\n"
+        "gssdlc1 $f2, 7(%[src])             \r\n"
+        "gssdrc1 $f2, 0(%[src])             \r\n"
+        "daddu %[src], %[src], %[stride]    \r\n"
+        "daddi $8, $8, -1                   \r\n"
+        "bnez $8, 1b                        \r\n"
+        : [src]"+&r"(src)
+        : [dc]"f"(dc),[stride]"r"(stride)
+        : "$8","$f2"
+    );
+}
+
+void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
+        int has_topright, ptrdiff_t stride)
+{
+    __asm__ volatile (
+        "ldl $8, 7(%[srcA])                 \r\n"
+        "ldr $8, 0(%[srcA])                 \r\n"
+        "ldl $9, 7(%[src0])                 \r\n"
+        "ldr $9, 0(%[src0])                 \r\n"
+        "ldl $10, 7(%[src1])                \r\n"
+        "ldr $10, 0(%[src1])                \r\n"
+        "dmtc1 $8, $f2                      \r\n"
+        "dmtc1 $9, $f4                      \r\n"
+        "dmtc1 $10, $f6                     \r\n"
+        "dmtc1 $0, $f0                      \r\n"
+        "punpcklbh $f8, $f2, $f0            \r\n"
+        "punpckhbh $f10, $f2, $f0           \r\n"
+        "punpcklbh $f12, $f4, $f0           \r\n"
+        "punpckhbh $f14, $f4, $f0           \r\n"
+        "punpcklbh $f16, $f6, $f0           \r\n"
+        "punpckhbh $f18, $f6, $f0           \r\n"
+        "bnez %[has_topleft], 1f            \r\n"
+        "pinsrh_0 $f8, $f8, $f12            \r\n"
+        "1:                                 \r\n"
+        "bnez %[has_topright], 2f           \r\n"
+        "pinsrh_3 $f18, $f18, $f14          \r\n"
+        "2:                                 \r\n"
+        "daddiu $8, $0, 2                   \r\n"
+        "dmtc1 $8, $f20                     \r\n"
+        "pshufh $f22, $f20, $f0             \r\n"
+        "pmullh $f12, $f12, $f22            \r\n"
+        "pmullh $f14, $f14, $f22            \r\n"
+        "paddh $f8, $f8, $f12               \r\n"
+        "paddh $f10, $f10, $f14             \r\n"
+        "paddh $f8, $f8, $f16               \r\n"
+        "paddh $f10, $f10, $f18             \r\n"
+        "paddh $f8, $f8, $f22               \r\n"
+        "paddh $f10, $f10, $f22             \r\n"
+        "psrah $f8, $f8, $f20               \r\n"
+        "psrah $f10, $f10, $f20             \r\n"
+        "packushb $f4, $f8, $f10            \r\n"
+        "sdc1 $f4, 0(%[src])                \r\n"
+        : [src]"=r"(src)
+        : [srcA]"r"(src-stride-1),[src0]"r"(src-stride),
+          [src1]"r"(src-stride+1),[has_topleft]"r"(has_topleft),
+          [has_topright]"r"(has_topright)
+        : "$8","$9","$10","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
+          "$f18","$f20","$f22"
+    );
+
+    __asm__ volatile (
+        "dli $8, 7                          \r\n"
+        "gsldlc1 $f2, 7(%[src])             \r\n"
+        "gsldrc1 $f2, 0(%[src])             \r\n"
+        "dadd %[src], %[src], %[stride]     \r\n"
+        "1:                                 \r\n"
+        "gssdlc1 $f2, 7(%[src])             \r\n"
+        "gssdrc1 $f2, 0(%[src])             \r\n"
+        "daddu %[src], %[src], %[stride]    \r\n"
+        "daddi $8, $8, -1                   \r\n"
+        "bnez $8, 1b                        \r\n"
+        : [src]"+&r"(src)
+        : [stride]"r"(stride)
+        : "$8","$f2"
+    );
+}
+
+void ff_pred4x4_dc_8_mmi(uint8_t *src, const uint8_t *topright,
+        ptrdiff_t stride)
+{
+    const int dc = (src[-stride] + src[1-stride] + src[2-stride]
+                 + src[3-stride] + src[-1+0*stride] + src[-1+1*stride]
+                 + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
+
+    __asm__ volatile (
+        "daddu $2, %[dc], $0                \r\n"
+        "dmul $3, $2, %[ff_pb_1]            \r\n"
+        "xor $4, $4, $4                     \r\n"
+        "gsswx $3, 0(%[src],$4)             \r\n"
+        "daddu $4, %[stride]                \r\n"
+        "gsswx $3, 0(%[src],$4)             \r\n"
+        "daddu $4, %[stride]                \r\n"
+        "gsswx $3, 0(%[src],$4)             \r\n"
+        "daddu $4, %[stride]                \r\n"
+        "gsswx $3, 0(%[src],$4)             \r\n"
+        ::[src]"r"(src),[stride]"r"(stride),[dc]"r"(dc),[ff_pb_1]"r"(ff_pb_1)
+        : "$2","$3","$4"
+    );
+}
+
+void ff_pred8x8_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    __asm__ volatile (
+        "dsubu $2, %[src], %[stride]        \r\n"
+        "daddu $3, %[src], $0               \r\n"
+        "ldl $4, 7($2)                      \r\n"
+        "ldr $4, 0($2)                      \r\n"
+        "dli $5, 0x8                        \r\n"
+        "1:                                 \r\n"
+        "sdl $4, 7($3)                      \r\n"
+        "sdr $4, 0($3)                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "daddiu $5, -1                      \r\n"
+        "bnez $5, 1b                        \r\n"
+        ::[src]"r"(src),[stride]"r"(stride)
+        : "$2","$3","$4","$5"
+    );
+}
+
+void ff_pred8x8_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    __asm__ volatile (
+        "daddiu $2, %[src], -1              \r\n"
+        "daddu $3, %[src], $0               \r\n"
+        "dli $6, 0x8                        \r\n"
+        "1:                                 \r\n"
+        "lbu $4, 0($2)                      \r\n"
+        "dmul $5, $4, %[ff_pb_1]            \r\n"
+        "sdl $5, 7($3)                      \r\n"
+        "sdr $5, 0($3)                      \r\n"
+        "daddu $2, %[stride]                \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "daddiu $6, -1                      \r\n"
+        "bnez $6, 1b                        \r\n"
+        ::[src]"r"(src),[stride]"r"(stride),[ff_pb_1]"r"(ff_pb_1)
+        : "$2","$3","$4","$5","$6"
+    );
+}
+
+static void ff_pred16x16_plane_compat_8_mmi(uint8_t *src, ptrdiff_t stride,
+        const int svq3, const int rv40)
+{
+    __asm__ volatile (
+        "negu $2, %[stride]                 \r\n"
+        "daddu $3, %[src], $2               \r\n"
+        "xor $f8, $f8, $f8                  \r\n"
+        "gslwlc1 $f0, 2($3)                 \r\n"
+        "gslwrc1 $f0, -1($3)                \r\n"
+        "gslwlc1 $f2, 6($3)                 \r\n"
+        "gslwrc1 $f2, 3($3)                 \r\n"
+        "gslwlc1 $f4, 11($3)                \r\n"
+        "gslwrc1 $f4, 8($3)                 \r\n"
+        "gslwlc1 $f6, 15($3)                \r\n"
+        "gslwrc1 $f6, 12($3)                \r\n"
+        "punpcklbh $f0, $f0, $f8            \r\n"
+        "punpcklbh $f2, $f2, $f8            \r\n"
+        "punpcklbh $f4, $f4, $f8            \r\n"
+        "punpcklbh $f6, $f6, $f8            \r\n"
+        "dmtc1 %[ff_pw_m8tom5], $f20        \r\n"
+        "dmtc1 %[ff_pw_m4tom1], $f22        \r\n"
+        "dmtc1 %[ff_pw_1to4], $f24          \r\n"
+        "dmtc1 %[ff_pw_5to8], $f26          \r\n"
+        "pmullh $f0, $f0, $f20              \r\n"
+        "pmullh $f2, $f2, $f22              \r\n"
+        "pmullh $f4, $f4, $f24              \r\n"
+        "pmullh $f6, $f6, $f26              \r\n"
+        "paddsh $f0, $f0, $f4               \r\n"
+        "paddsh $f2, $f2, $f6               \r\n"
+        "paddsh $f0, $f0, $f2               \r\n"
+        "dli $4, 0xE                        \r\n"
+        "dmtc1 $4, $f28                     \r\n"
+        "pshufh $f2, $f0, $f28              \r\n"
+        "paddsh $f0, $f0, $f2               \r\n"
+        "dli $4, 0x1                        \r\n"
+        "dmtc1 $4, $f30                     \r\n"
+        "pshufh $f2, $f0, $f30              \r\n"
+        "paddsh $f10, $f0, $f2              \r\n"
+        "daddiu $3, %[src], -1              \r\n"
+        "daddu $3, $2                       \r\n"
+        "lbu $4, 0($3)                      \r\n"
+        "lbu $8, 16($3)                     \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $5, 0($3)                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $6, 0($3)                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $7, 0($3)                      \r\n"
+        "dsll $5, 16                        \r\n"
+        "dsll $6, 32                        \r\n"
+        "dsll $7, 48                        \r\n"
+        "or $6, $7                          \r\n"
+        "or $4, $5                          \r\n"
+        "or $4, $6                          \r\n"
+        "dmtc1 $4, $f0                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $4, 0($3)                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $5, 0($3)                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $6, 0($3)                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $7, 0($3)                      \r\n"
+        "dsll $5, 16                        \r\n"
+        "dsll $6, 32                        \r\n"
+        "dsll $7, 48                        \r\n"
+        "or $6, $7                          \r\n"
+        "or $4, $5                          \r\n"
+        "or $4, $6                          \r\n"
+        "dmtc1 $4, $f2                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $4, 0($3)                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $5, 0($3)                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $6, 0($3)                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $7, 0($3)                      \r\n"
+        "dsll $5, 16                        \r\n"
+        "dsll $6, 32                        \r\n"
+        "dsll $7, 48                        \r\n"
+        "or $6, $7                          \r\n"
+        "or $4, $5                          \r\n"
+        "or $4, $6                          \r\n"
+        "dmtc1 $4, $f4                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $4, 0($3)                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $5, 0($3)                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $6, 0($3)                      \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "lbu $7, 0($3)                      \r\n"
+        "daddu $8, $7                       \r\n"
+        "daddiu $8, 1                       \r\n"
+        "dsll $8, 4                         \r\n"
+        "dsll $5, 16                        \r\n"
+        "dsll $6, 32                        \r\n"
+        "dsll $7, 48                        \r\n"
+        "or $6, $7                          \r\n"
+        "or $4, $5                          \r\n"
+        "or $4, $6                          \r\n"
+        "dmtc1 $4, $f6                      \r\n"
+        "pmullh $f0, $f0, $f20              \r\n"
+        "pmullh $f2, $f2, $f22              \r\n"
+        "pmullh $f4, $f4, $f24              \r\n"
+        "pmullh $f6, $f6, $f26              \r\n"
+        "paddsh $f0, $f0, $f4               \r\n"
+        "paddsh $f2, $f2, $f6               \r\n"
+        "paddsh $f0, $f0, $f2               \r\n"
+        "pshufh $f2, $f0, $f28              \r\n"
+        "paddsh $f0, $f0, $f2               \r\n"
+        "pshufh $f2, $f0, $f30              \r\n"
+        "paddsh $f12, $f0, $f2              \r\n"
+        "dmfc1 $2, $f10                     \r\n"
+        "dsll $2, 48                        \r\n"
+        "dsra $2, 48                        \r\n"
+        "dmfc1 $3, $f12                     \r\n"
+        "dsll $3, 48                        \r\n"
+        "dsra $3, 48                        \r\n"
+        "beqz %[svq3], 1f                   \r\n"
+        "dli $4, 4                          \r\n"
+        "ddiv $2, $4                        \r\n"
+        "ddiv $3, $4                        \r\n"
+        "dli $4, 5                          \r\n"
+        "dmul $2, $4                        \r\n"
+        "dmul $3, $4                        \r\n"
+        "dli $4, 16                         \r\n"
+        "ddiv $2, $4                        \r\n"
+        "ddiv $3, $4                        \r\n"
+        "daddu $4, $2, $0                   \r\n"
+        "daddu $2, $3, $0                   \r\n"
+        "daddu $3, $4, $0                   \r\n"
+        "b 2f                               \r\n"
+        "1:                                 \r\n"
+        "beqz %[rv40], 1f                   \r\n"
+        "dsra $4, $2, 2                     \r\n"
+        "daddu $2, $4                       \r\n"
+        "dsra $4, $3, 2                     \r\n"
+        "daddu $3, $4                       \r\n"
+        "dsra $2, 4                         \r\n"
+        "dsra $3, 4                         \r\n"
+        "b 2f                               \r\n"
+        "1:                                 \r\n"
+        "dli $4, 5                          \r\n"
+        "dmul $2, $4                        \r\n"
+        "dmul $3, $4                        \r\n"
+        "daddiu $2, 32                      \r\n"
+        "daddiu $3, 32                      \r\n"
+        "dsra $2, 6                         \r\n"
+        "dsra $3, 6                         \r\n"
+        "2:                                 \r\n"
+        "daddu $5, $2, $3                   \r\n"
+        "dli $4, 7                          \r\n"
+        "dmul $5, $4                        \r\n"
+        "dsubu $8, $5                       \r\n"
+        "dmtc1 $0, $f8                      \r\n"
+        "dmtc1 $2, $f0                      \r\n"
+        "pshufh $f0, $f0, $f8               \r\n"
+        "dmtc1 $3, $f10                     \r\n"
+        "pshufh $f10, $f10, $f8             \r\n"
+        "dmtc1 $8, $f12                     \r\n"
+        "pshufh $f12, $f12, $f8             \r\n"
+        "dli $4, 5                          \r\n"
+        "dmtc1 $4, $f14                     \r\n"
+        "pmullh $f2, %[ff_pw_0to3], $f0     \r\n"
+        "pmullh $f4, %[ff_pw_4to7], $f0     \r\n"
+        "pmullh $f6, %[ff_pw_8tob], $f0     \r\n"
+        "pmullh $f8, %[ff_pw_ctof], $f0     \r\n"
+        "daddu $3, %[src], $0               \r\n"
+        "dli $2, 16                         \r\n"
+        "1:                                 \r\n"
+        "paddsh $f16, $f2, $f12             \r\n"
+        "psrah $f16, $f16, $f14             \r\n"
+        "paddsh $f18, $f4, $f12             \r\n"
+        "psrah $f18, $f18, $f14             \r\n"
+        "packushb $f20, $f16, $f18          \r\n"
+        "gssdlc1 $f20, 7($3)                \r\n"
+        "gssdrc1 $f20, 0($3)                \r\n"
+        "paddsh $f16, $f6, $f12             \r\n"
+        "psrah $f16, $f16, $f14             \r\n"
+        "paddsh $f18, $f8, $f12             \r\n"
+        "psrah $f18, $f18, $f14             \r\n"
+        "packushb $f20, $f16, $f18          \r\n"
+        "gssdlc1 $f20, 15($3)               \r\n"
+        "gssdrc1 $f20, 8($3)                \r\n"
+        "paddsh $f12, $f12, $f10            \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "daddiu $2, -1                      \r\n"
+        "bnez $2, 1b                        \r\n"
+        ::[src]"r"(src),[stride]"r"(stride),[svq3]"r"(svq3),[rv40]"r"(rv40),
+          [ff_pw_m8tom5]"r"(ff_pw_m8tom5),[ff_pw_m4tom1]"r"(ff_pw_m4tom1),
+          [ff_pw_1to4]"r"(ff_pw_1to4),[ff_pw_5to8]"r"(ff_pw_5to8),
+          [ff_pw_0to3]"f"(ff_pw_0to3),[ff_pw_4to7]"f"(ff_pw_4to7),
+          [ff_pw_8tob]"f"(ff_pw_8tob),[ff_pw_ctof]"f"(ff_pw_ctof)
+        : "$2","$3","$4","$5","$6","$7","$8","$f0","$f2","$f4","$f6","$f8",
+          "$f10","$f12","$f14","$f16","$f18","$f20","$f22","$f24","$f26",
+          "$f28","$f30"
+    );
+}
+
+void ff_pred16x16_plane_svq3_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    ff_pred16x16_plane_compat_8_mmi(src, stride, 1, 0);
+}
+
+void ff_pred16x16_plane_rv40_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    ff_pred16x16_plane_compat_8_mmi(src, stride, 0, 1);
+}
+
+void ff_pred16x16_plane_h264_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    ff_pred16x16_plane_compat_8_mmi(src, stride, 0, 0);
+}
+
+void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    __asm__ volatile (
+        "dli $2, 2                          \r\n"
+        "xor $f0, $f0, $f0                  \r\n"
+        "xor $f2, $f2, $f2                  \r\n"
+        "xor $f30, $f30, $f30               \r\n"
+        "negu $3, %[stride]                 \r\n"
+        "daddu $3, $3, %[src]               \r\n"
+        "gsldlc1 $f4, 7($3)                 \r\n"
+        "gsldrc1 $f4, 0($3)                 \r\n"
+        "punpcklbh $f0, $f4, $f30           \r\n"
+        "punpckhbh $f2, $f4, $f30           \r\n"
+        "biadd $f0, $f0                     \r\n"
+        "biadd $f2, $f2                     \r\n"
+        "pshufh $f0, $f0, $f30              \r\n"
+        "pshufh $f2, $f2, $f30              \r\n"
+        "dmtc1 $2, $f4                      \r\n"
+        "pshufh $f4, $f4, $f30              \r\n"
+        "paddush $f0, $f0, $f4              \r\n"
+        "paddush $f2, $f2, $f4              \r\n"
+        "dmtc1 $2, $f4                      \r\n"
+        "psrlh $f0, $f0, $f4                \r\n"
+        "psrlh $f2, $f2, $f4                \r\n"
+        "packushb $f4, $f0, $f2             \r\n"
+        "dli $2, 8                          \r\n"
+        "1:                                 \r\n"
+        "gssdlc1 $f4, 7(%[src])             \r\n"
+        "gssdrc1 $f4, 0(%[src])             \r\n"
+        "daddu %[src], %0, %[stride]        \r\n"
+        "daddiu $2, $2, -1                  \r\n"
+        "bnez $2, 1b                        \r\n"
+        ::[src]"r"(src),[stride]"r"(stride)
+        : "$2","$3","$f0","$f2","$f4","$f30"
+    );
+}
+
+void ff_pred8x8_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    __asm__ volatile (
+        "negu $2, %[stride]                 \r\n"
+        "daddu $2, $2, %[src]               \r\n"
+        "daddiu $5, $2, 4                   \r\n"
+        "lbu $6, 0($2)                      \r\n"
+        "daddu $3, $0, $6                   \r\n"
+        "daddiu $2, 1                       \r\n"
+        "lbu $6, 0($5)                      \r\n"
+        "daddu $4, $0, $6                   \r\n"
+        "daddiu $5, 1                       \r\n"
+        "lbu $6, 0($2)                      \r\n"
+        "daddu $3, $3, $6                   \r\n"
+        "daddiu $2, 1                       \r\n"
+        "lbu $6, 0($5)                      \r\n"
+        "daddu $4, $4, $6                   \r\n"
+        "daddiu $5, 1                       \r\n"
+        "lbu $6, 0($2)                      \r\n"
+        "daddu $3, $3, $6                   \r\n"
+        "daddiu $2, 1                       \r\n"
+        "lbu $6, 0($5)                      \r\n"
+        "daddu $4, $4, $6                   \r\n"
+        "daddiu $5, 1                       \r\n"
+        "lbu $6, 0($2)                      \r\n"
+        "daddu $3, $3, $6                   \r\n"
+        "daddiu $2, 1                       \r\n"
+        "lbu $6, 0($5)                      \r\n"
+        "daddu $4, $4, $6                   \r\n"
+        "daddiu $5, 1                       \r\n"
+        "dli $6, -1                         \r\n"
+        "daddu $6, $6, %[src]               \r\n"
+        "lbu $5, 0($6)                      \r\n"
+        "daddu $7, $0, $5                   \r\n"
+        "daddu $6, $6, %[stride]            \r\n"
+        "lbu $5, 0($6)                      \r\n"
+        "daddu $7, $7, $5                   \r\n"
+        "daddu $6, $6, %[stride]            \r\n"
+        "lbu $5, 0($6)                      \r\n"
+        "daddu $7, $7, $5                   \r\n"
+        "daddu $6, $6, %[stride]            \r\n"
+        "lbu $5, 0($6)                      \r\n"
+        "daddu $7, $7, $5                   \r\n"
+        "daddu $6, $6, %[stride]            \r\n"
+        "lbu $5, 0($6)                      \r\n"
+        "daddu $8, $0, $5                   \r\n"
+        "daddu $6, $6, %[stride]            \r\n"
+        "lbu $5, 0($6)                      \r\n"
+        "daddu $8, $8, $5                   \r\n"
+        "daddu $6, $6, %[stride]            \r\n"
+        "lbu $5, 0($6)                      \r\n"
+        "daddu $8, $8, $5                   \r\n"
+        "daddu $6, $6, %[stride]            \r\n"
+        "lbu $5, 0($6)                      \r\n"
+        "daddu $8, $8, $5                   \r\n"
+        "daddu $3, $3, $7                   \r\n"
+        "daddiu $3, $3, 4                   \r\n"
+        "daddiu $4, $4, 2                   \r\n"
+        "daddiu $5, $8, 2                   \r\n"
+        "daddu $6, $4, $5                   \r\n"
+        "dsrl $3, 3                         \r\n"
+        "dsrl $4, 2                         \r\n"
+        "dsrl $5, 2                         \r\n"
+        "dsrl $6, 3                         \r\n"
+        "xor $f30, $f30, $f30               \r\n"
+        "dmtc1 $3, $f0                      \r\n"
+        "pshufh $f0, $f0, $f30              \r\n"
+        "dmtc1 $4, $f2                      \r\n"
+        "pshufh $f2, $f2, $f30              \r\n"
+        "dmtc1 $5, $f4                      \r\n"
+        "pshufh $f4, $f4, $f30              \r\n"
+        "dmtc1 $6, $f6                      \r\n"
+        "pshufh $f6, $f6, $f30              \r\n"
+        "packushb $f0, $f0, $f2             \r\n"
+        "packushb $f2, $f4, $f6             \r\n"
+        "daddu $2, $0, %[src]               \r\n"
+        "sdc1 $f0, 0($2)                    \r\n"
+        "daddu $2, $2, %[stride]            \r\n"
+        "sdc1 $f0, 0($2)                    \r\n"
+        "daddu $2, $2, %[stride]            \r\n"
+        "sdc1 $f0, 0($2)                    \r\n"
+        "daddu $2, $2, %[stride]            \r\n"
+        "sdc1 $f0, 0($2)                    \r\n"
+        "daddu $2, $2, %[stride]            \r\n"
+        "sdc1 $f2, 0($2)                    \r\n"
+        "daddu $2, $2, %[stride]            \r\n"
+        "sdc1 $f2, 0($2)                    \r\n"
+        "daddu $2, $2, %[stride]            \r\n"
+        "sdc1 $f2, 0($2)                    \r\n"
+        "daddu $2, $2, %[stride]            \r\n"
+        "sdc1 $f2, 0($2)                    \r\n"
+        ::[src]"r"(src),[stride]"r"(stride)
+        : "$2","$3","$4","$5","$6","$7","$8","$f0","$f2","$f4","$f6","$f30"
+    );
+}
+
+void ff_pred8x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    __asm__ volatile (
+        "gsldlc1 $f2, 7(%[srcA])            \r\n"
+        "gsldrc1 $f2, 0(%[srcA])            \r\n"
+        "dli $8, 16                         \r\n"
+        "1:                                 \r\n"
+        "gssdlc1 $f2, 7(%[src])             \r\n"
+        "gssdrc1 $f2, 0(%[src])             \r\n"
+        "daddu %[src], %[src], %[stride]    \r\n"
+        "daddi $8, $8, -1                   \r\n"
+        "bnez $8, 1b                        \r\n"
+        : [src]"+&r"(src)
+        : [stride]"r"(stride),[srcA]"r"(src-stride)
+        : "$8","$f2"
+    );
+}
+
+void ff_pred8x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    __asm__ volatile (
+        "daddiu $2, %[src], -1              \r\n"
+        "daddu $3, %[src], $0               \r\n"
+        "dli $6, 0x10                       \r\n"
+        "1:                                 \r\n"
+        "lbu $4, 0($2)                      \r\n"
+        "dmul $5, $4, %[ff_pb_1]            \r\n"
+        "sdl $5, 7($3)                      \r\n"
+        "sdr $5, 0($3)                      \r\n"
+        "daddu $2, %[stride]                \r\n"
+        "daddu $3, %[stride]                \r\n"
+        "daddiu $6, -1                      \r\n"
+        "bnez $6, 1b                        \r\n"
+        ::[src]"r"(src),[stride]"r"(stride),[ff_pb_1]"r"(ff_pb_1)
+        : "$2","$3","$4","$5","$6"
+    );
+}
diff --git a/libavcodec/mips/h264pred_msa.c b/libavcodec/mips/h264pred_msa.c
new file mode 100644
index 0000000000..cddcd2e878
--- /dev/null
+++ b/libavcodec/mips/h264pred_msa.c
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264dsp_mips.h"
+
+static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride)
+{
+    uint32_t row;
+    uint32_t src_data1, src_data2;
+
+    src_data1 = LW(src);
+    src_data2 = LW(src + 4);
+
+    for (row = 8; row--;) {
+        SW(src_data1, dst);
+        SW(src_data2, (dst + 4));
+        dst += dst_stride;
+    }
+}
+
+static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride)
+{
+    uint32_t row;
+    v16u8 src0;
+
+    src0 = LD_UB(src);
+
+    for (row = 16; row--;) {
+        ST_UB(src0, dst);
+        dst += dst_stride;
+    }
+}
+
+static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride,
+                                        uint8_t *dst, int32_t dst_stride)
+{
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+    out0 = src[0 * src_stride] * 0x0101010101010101;
+    out1 = src[1 * src_stride] * 0x0101010101010101;
+    out2 = src[2 * src_stride] * 0x0101010101010101;
+    out3 = src[3 * src_stride] * 0x0101010101010101;
+    out4 = src[4 * src_stride] * 0x0101010101010101;
+    out5 = src[5 * src_stride] * 0x0101010101010101;
+    out6 = src[6 * src_stride] * 0x0101010101010101;
+    out7 = src[7 * src_stride] * 0x0101010101010101;
+
+    SD4(out0, out1, out2, out3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(out4, out5, out6, out7, dst, dst_stride);
+}
+
+static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    uint32_t row;
+    uint8_t inp0, inp1, inp2, inp3;
+    v16u8 src0, src1, src2, src3;
+
+    for (row = 4; row--;) {
+        inp0 = src[0];
+        src += src_stride;
+        inp1 = src[0];
+        src += src_stride;
+        inp2 = src[0];
+        src += src_stride;
+        inp3 = src[0];
+        src += src_stride;
+
+        src0 = (v16u8) __msa_fill_b(inp0);
+        src1 = (v16u8) __msa_fill_b(inp1);
+        src2 = (v16u8) __msa_fill_b(inp2);
+        src3 = (v16u8) __msa_fill_b(inp3);
+
+        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void intra_predict_dc_8x8_msa(uint8_t *src_top, uint8_t *src_left,
+                                     int32_t src_stride_left,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     uint8_t is_above, uint8_t is_left)
+{
+    uint32_t row;
+    uint32_t out, addition = 0;
+    v16u8 src_above, store;
+    v8u16 sum_above;
+    v4u32 sum_top;
+    v2u64 sum;
+
+    if (is_left && is_above) {
+        src_above = LD_UB(src_top);
+
+        sum_above = __msa_hadd_u_h(src_above, src_above);
+        sum_top = __msa_hadd_u_w(sum_above, sum_above);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        addition = __msa_copy_u_w((v4i32) sum, 0);
+
+        for (row = 0; row < 8; row++) {
+            addition += src_left[row * src_stride_left];
+        }
+
+        addition = (addition + 8) >> 4;
+        store = (v16u8) __msa_fill_b(addition);
+    } else if (is_left) {
+        for (row = 0; row < 8; row++) {
+            addition += src_left[row * src_stride_left];
+        }
+
+        addition = (addition + 4) >> 3;
+        store = (v16u8) __msa_fill_b(addition);
+    } else if (is_above) {
+        src_above = LD_UB(src_top);
+
+        sum_above = __msa_hadd_u_h(src_above, src_above);
+        sum_top = __msa_hadd_u_w(sum_above, sum_above);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        sum = (v2u64) __msa_srari_d((v2i64) sum, 3);
+        store = (v16u8) __msa_splati_b((v16i8) sum, 0);
+    } else {
+        store = (v16u8) __msa_ldi_b(128);
+    }
+
+    out = __msa_copy_u_w((v4i32) store, 0);
+
+    for (row = 8; row--;) {
+        SW(out, dst);
+        SW(out, (dst + 4));
+        dst += dst_stride;
+    }
+}
+
+static void intra_predict_dc_16x16_msa(uint8_t *src_top, uint8_t *src_left,
+                                       int32_t src_stride_left,
+                                       uint8_t *dst, int32_t dst_stride,
+                                       uint8_t is_above, uint8_t is_left)
+{
+    uint32_t row;
+    uint32_t addition = 0;
+    v16u8 src_above, store;
+    v8u16 sum_above;
+    v4u32 sum_top;
+    v2u64 sum;
+
+    if (is_left && is_above) {
+        src_above = LD_UB(src_top);
+
+        sum_above = __msa_hadd_u_h(src_above, src_above);
+        sum_top = __msa_hadd_u_w(sum_above, sum_above);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        addition = __msa_copy_u_w((v4i32) sum, 0);
+
+        for (row = 0; row < 16; row++) {
+            addition += src_left[row * src_stride_left];
+        }
+
+        addition = (addition + 16) >> 5;
+        store = (v16u8) __msa_fill_b(addition);
+    } else if (is_left) {
+        for (row = 0; row < 16; row++) {
+            addition += src_left[row * src_stride_left];
+        }
+
+        addition = (addition + 8) >> 4;
+        store = (v16u8) __msa_fill_b(addition);
+    } else if (is_above) {
+        src_above = LD_UB(src_top);
+
+        sum_above = __msa_hadd_u_h(src_above, src_above);
+        sum_top = __msa_hadd_u_w(sum_above, sum_above);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        sum = (v2u64) __msa_srari_d((v2i64) sum, 4);
+        store = (v16u8) __msa_splati_b((v16i8) sum, 0);
+    } else {
+        store = (v16u8) __msa_ldi_b(128);
+    }
+
+    for (row = 16; row--;) {
+        ST_UB(store, dst);
+        dst += dst_stride;
+    }
+}
+
+#define INTRA_PREDICT_VALDC_8X8_MSA(val)                         \
+static void intra_predict_##val##dc_8x8_msa(uint8_t *dst,        \
+                                            int32_t dst_stride)  \
+{                                                                \
+    uint32_t row, out;                                           \
+    v16i8 store;                                                 \
+                                                                 \
+    store = __msa_ldi_b(val);                                    \
+    out = __msa_copy_u_w((v4i32) store, 0);                      \
+                                                                 \
+    for (row = 8; row--;) {                                      \
+        SW(out, dst);                                            \
+        SW(out, (dst + 4));                                      \
+        dst += dst_stride;                                       \
+    }                                                            \
+}
+
+INTRA_PREDICT_VALDC_8X8_MSA(127);
+INTRA_PREDICT_VALDC_8X8_MSA(129);
+
+#define INTRA_PREDICT_VALDC_16X16_MSA(val)                         \
+static void intra_predict_##val##dc_16x16_msa(uint8_t *dst,        \
+                                              int32_t dst_stride)  \
+{                                                                  \
+    uint32_t row;                                                  \
+    v16u8 store;                                                   \
+                                                                   \
+    store = (v16u8) __msa_ldi_b(val);                              \
+                                                                   \
+    for (row = 16; row--;) {                                       \
+        ST_UB(store, dst);                                         \
+        dst += dst_stride;                                         \
+    }                                                              \
+}
+
+INTRA_PREDICT_VALDC_16X16_MSA(127);
+INTRA_PREDICT_VALDC_16X16_MSA(129);
+
+static void intra_predict_plane_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lpcnt;
+    int32_t res, res0, res1, res2, res3;
+    uint64_t out0, out1;
+    v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
+    v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
+    v4i32 int_multiplier = { 0, 1, 2, 3 };
+    v16u8 src_top;
+    v8i16 vec9, vec10, vec11;
+    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
+    v2i64 sum;
+
+    src_top = LD_UB(src - (stride + 1));
+    src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top);
+
+    vec9 = __msa_hsub_u_h(src_top, src_top);
+    vec9 *= short_multiplier;
+    vec8 = __msa_hadd_s_w(vec9, vec9);
+    sum = __msa_hadd_s_d(vec8, vec8);
+
+    res0 = __msa_copy_s_w((v4i32) sum, 0);
+
+    res1 = (src[4 * stride - 1] - src[2 * stride - 1]) +
+        2 * (src[5 * stride - 1] - src[stride - 1]) +
+        3 * (src[6 * stride - 1] - src[-1]) +
+        4 * (src[7 * stride - 1] - src[-stride - 1]);
+
+    res0 *= 17;
+    res1 *= 17;
+    res0 = (res0 + 16) >> 5;
+    res1 = (res1 + 16) >> 5;
+
+    res3 = 3 * (res0 + res1);
+    res2 = 16 * (src[7 * stride - 1] + src[-stride + 7] + 1);
+    res = res2 - res3;
+
+    vec8 = __msa_fill_w(res0);
+    vec4 = __msa_fill_w(res);
+    vec2 = __msa_fill_w(res1);
+    vec5 = vec8 * int_multiplier;
+    vec3 = vec8 * 4;
+
+    for (lpcnt = 4; lpcnt--;) {
+        vec0 = vec5;
+        vec0 += vec4;
+        vec1 = vec0 + vec3;
+        vec6 = vec5;
+        vec4 += vec2;
+        vec6 += vec4;
+        vec7 = vec6 + vec3;
+
+        SRA_4V(vec0, vec1, vec6, vec7, 5);
+        PCKEV_H2_SH(vec1, vec0, vec7, vec6, vec10, vec11);
+        CLIP_SH2_0_255(vec10, vec11);
+        PCKEV_B2_SH(vec10, vec10, vec11, vec11, vec10, vec11);
+
+        out0 = __msa_copy_s_d((v2i64) vec10, 0);
+        out1 = __msa_copy_s_d((v2i64) vec11, 0);
+        SD(out0, src);
+        src += stride;
+        SD(out1, src);
+        src += stride;
+
+        vec4 += vec2;
+    }
+}
+
+static void intra_predict_plane_16x16_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lpcnt;
+    int32_t res0, res1, res2, res3;
+    uint64_t load0, load1;
+    v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
+    v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v4i32 int_multiplier = { 0, 1, 2, 3 };
+    v16u8 src_top = { 0 };
+    v8i16 vec9, vec10;
+    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;
+
+    load0 = LD(src - (stride + 1));
+    load1 = LD(src - (stride + 1) + 9);
+
+    INSERT_D2_UB(load0, load1, src_top);
+
+    src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top);
+
+    vec9 = __msa_hsub_u_h(src_top, src_top);
+    vec9 *= short_multiplier;
+    vec8 = __msa_hadd_s_w(vec9, vec9);
+    res_add = (v4i32) __msa_hadd_s_d(vec8, vec8);
+
+    res0 = __msa_copy_s_w(res_add, 0) + __msa_copy_s_w(res_add, 2);
+
+    res1 = (src[8 * stride - 1] - src[6 * stride - 1]) +
+        2 * (src[9 * stride - 1] - src[5 * stride - 1]) +
+        3 * (src[10 * stride - 1] - src[4 * stride - 1]) +
+        4 * (src[11 * stride - 1] - src[3 * stride - 1]) +
+        5 * (src[12 * stride - 1] - src[2 * stride - 1]) +
+        6 * (src[13 * stride - 1] - src[stride - 1]) +
+        7 * (src[14 * stride - 1] - src[-1]) +
+        8 * (src[15 * stride - 1] - src[-1 * stride - 1]);
+
+    res0 *= 5;
+    res1 *= 5;
+    res0 = (res0 + 32) >> 6;
+    res1 = (res1 + 32) >> 6;
+
+    res3 = 7 * (res0 + res1);
+    res2 = 16 * (src[15 * stride - 1] + src[-stride + 15] + 1);
+    res2 -= res3;
+
+    vec8 = __msa_fill_w(res0);
+    vec4 = __msa_fill_w(res2);
+    vec5 = __msa_fill_w(res1);
+    vec6 = vec8 * 4;
+    vec7 = vec8 * int_multiplier;
+
+    for (lpcnt = 16; lpcnt--;) {
+        vec0 = vec7;
+        vec0 += vec4;
+        vec1 = vec0 + vec6;
+        vec2 = vec1 + vec6;
+        vec3 = vec2 + vec6;
+
+        SRA_4V(vec0, vec1, vec2, vec3, 5);
+        PCKEV_H2_SH(vec1, vec0, vec3, vec2, vec9, vec10);
+        CLIP_SH2_0_255(vec9, vec10);
+        PCKEV_ST_SB(vec9, vec10, src);
+        src += stride;
+
+        vec4 += vec5;
+    }
+}
+
+static void intra_predict_dc_4blk_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lp_cnt;
+    uint32_t src0, src1, src3, src2 = 0;
+    uint32_t out0, out1, out2, out3;
+    v16u8 src_top;
+    v8u16 add;
+    v4u32 sum;
+
+    src_top = LD_UB(src - stride);
+    add = __msa_hadd_u_h((v16u8) src_top, (v16u8) src_top);
+    sum = __msa_hadd_u_w(add, add);
+    src0 = __msa_copy_u_w((v4i32) sum, 0);
+    src1 = __msa_copy_u_w((v4i32) sum, 1);
+
+    for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) {
+        src0 += src[lp_cnt * stride - 1];
+        src2 += src[(4 + lp_cnt) * stride - 1];
+    }
+
+    src0 = (src0 + 4) >> 3;
+    src3 = (src1 + src2 + 4) >> 3;
+    src1 = (src1 + 2) >> 2;
+    src2 = (src2 + 2) >> 2;
+    out0 = src0 * 0x01010101;
+    out1 = src1 * 0x01010101;
+    out2 = src2 * 0x01010101;
+    out3 = src3 * 0x01010101;
+
+    for (lp_cnt = 4; lp_cnt--;) {
+        SW(out0, src);
+        SW(out1, (src + 4));
+        SW(out2, (src + 4 * stride));
+        SW(out3, (src + 4 * stride + 4));
+        src += stride;
+    }
+}
+
+static void intra_predict_hor_dc_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lp_cnt;
+    uint32_t src0 = 0, src1 = 0;
+    uint64_t out0, out1;
+
+    for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) {
+        src0 += src[lp_cnt * stride - 1];
+        src1 += src[(4 + lp_cnt) * stride - 1];
+    }
+
+    src0 = (src0 + 2) >> 2;
+    src1 = (src1 + 2) >> 2;
+    out0 = src0 * 0x0101010101010101;
+    out1 = src1 * 0x0101010101010101;
+
+    for (lp_cnt = 4; lp_cnt--;) {
+        SD(out0, src);
+        SD(out1, (src + 4 * stride));
+        src += stride;
+    }
+}
+
+static void intra_predict_vert_dc_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lp_cnt;
+    uint32_t out0 = 0, out1 = 0;
+    v16u8 src_top;
+    v8u16 add;
+    v4u32 sum;
+    v4i32 res0, res1;
+
+    src_top = LD_UB(src - stride);
+    add = __msa_hadd_u_h(src_top, src_top);
+    sum = __msa_hadd_u_w(add, add);
+    sum = (v4u32) __msa_srari_w((v4i32) sum, 2);
+    res0 = (v4i32) __msa_splati_b((v16i8) sum, 0);
+    res1 = (v4i32) __msa_splati_b((v16i8) sum, 4);
+    out0 = __msa_copy_u_w(res0, 0);
+    out1 = __msa_copy_u_w(res1, 0);
+
+    for (lp_cnt = 8; lp_cnt--;) {
+        SW(out0, src);
+        SW(out1, src + 4);
+        src += stride;
+    }
+}
+
+static void intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lp_cnt;
+    uint32_t src0, src1, src2 = 0;
+    uint32_t out0, out1, out2;
+    v16u8 src_top;
+    v8u16 add;
+    v4u32 sum;
+
+    src_top = LD_UB(src - stride);
+    add = __msa_hadd_u_h(src_top, src_top);
+    sum = __msa_hadd_u_w(add, add);
+    src0 = __msa_copy_u_w((v4i32) sum, 0);
+    src1 = __msa_copy_u_w((v4i32) sum, 1);
+
+    for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) {
+        src2 += src[lp_cnt * stride - 1];
+    }
+    src2 = (src0 + src2 + 4) >> 3;
+    src0 = (src0 + 2) >> 2;
+    src1 = (src1 + 2) >> 2;
+    out0 = src0 * 0x01010101;
+    out1 = src1 * 0x01010101;
+    out2 = src2 * 0x01010101;
+
+    for (lp_cnt = 4; lp_cnt--;) {
+        SW(out2, src);
+        SW(out1, src + 4);
+        SW(out0, src + stride * 4);
+        SW(out1, src + stride * 4 + 4);
+        src += stride;
+    }
+}
+
+static void intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lp_cnt;
+    uint32_t src0, src1, src2 = 0, src3;
+    uint32_t out0, out1, out2, out3;
+    v16u8 src_top;
+    v8u16 add;
+    v4u32 sum;
+
+    src_top = LD_UB(src - stride);
+    add = __msa_hadd_u_h(src_top, src_top);
+    sum = __msa_hadd_u_w(add, add);
+    src0 = __msa_copy_u_w((v4i32) sum, 0);
+    src1 = __msa_copy_u_w((v4i32) sum, 1);
+
+    for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) {
+        src2 += src[(4 + lp_cnt) * stride - 1];
+    }
+
+    src0 = (src0 + 2) >> 2;
+    src3 = (src1 + src2 + 4) >> 3;
+    src1 = (src1 + 2) >> 2;
+    src2 = (src2 + 2) >> 2;
+
+    out0 = src0 * 0x01010101;
+    out1 = src1 * 0x01010101;
+    out2 = src2 * 0x01010101;
+    out3 = src3 * 0x01010101;
+
+    for (lp_cnt = 4; lp_cnt--;) {
+        SW(out0, src);
+        SW(out1, src + 4);
+        SW(out2, src + stride * 4);
+        SW(out3, src + stride * 4 + 4);
+        src += stride;
+    }
+}
+
+static void intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lp_cnt;
+    uint32_t src0 = 0;
+    uint64_t out0, out1;
+
+    for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) {
+        src0 += src[lp_cnt * stride - 1];
+    }
+
+    src0 = (src0 + 2) >> 2;
+    out0 = src0 * 0x0101010101010101;
+    out1 = 0x8080808080808080;
+
+    for (lp_cnt = 4; lp_cnt--;) {
+        SD(out0, src);
+        SD(out1, src + stride * 4);
+        src += stride;
+    }
+}
+
+static void intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lp_cnt;
+    uint32_t src0 = 0;
+    uint64_t out0, out1;
+
+    for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) {
+        src0 += src[(4 + lp_cnt) * stride - 1];
+    }
+
+    src0 = (src0 + 2) >> 2;
+
+    out0 = 0x8080808080808080;
+    out1 = src0 * 0x0101010101010101;
+
+    for (lp_cnt = 4; lp_cnt--;) {
+        SD(out0, src);
+        SD(out1, src + stride * 4);
+        src += stride;
+    }
+}
+
+void ff_h264_intra_predict_plane_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_plane_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_dc_4blk_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_dc_4blk_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_hor_dc_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_hor_dc_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_vert_dc_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_vert_dc_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    intra_predict_mad_cow_dc_l0t_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    intra_predict_mad_cow_dc_0lt_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    intra_predict_mad_cow_dc_l00_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    intra_predict_mad_cow_dc_0l0_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_plane_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_plane_16x16_msa(src, stride);
+}
+
+void ff_h264_intra_pred_vert_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *dst = src;
+
+    intra_predict_vert_8x8_msa(src - stride, dst, stride);
+}
+
+void ff_h264_intra_pred_horiz_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *dst = src;
+
+    intra_predict_horiz_8x8_msa(src - 1, stride, dst, stride);
+}
+
+void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *src_top = src - stride;
+    uint8_t *src_left = src - 1;
+    uint8_t *dst = src;
+
+    intra_predict_dc_16x16_msa(src_top, src_left, stride, dst, stride, 1, 1);
+}
+
+void ff_h264_intra_pred_vert_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *dst = src;
+
+    intra_predict_vert_16x16_msa(src - stride, dst, stride);
+}
+
+void ff_h264_intra_pred_horiz_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *dst = src;
+
+    intra_predict_horiz_16x16_msa(src - 1, stride, dst, stride);
+}
+
+void ff_h264_intra_pred_dc_left_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *src_top = src - stride;
+    uint8_t *src_left = src - 1;
+    uint8_t *dst = src;
+
+    intra_predict_dc_16x16_msa(src_top, src_left, stride, dst, stride, 0, 1);
+}
+
+void ff_h264_intra_pred_dc_top_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *src_top = src - stride;
+    uint8_t *src_left = src - 1;
+    uint8_t *dst = src;
+
+    intra_predict_dc_16x16_msa(src_top, src_left, stride, dst, stride, 1, 0);
+}
+
+void ff_h264_intra_pred_dc_128_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *src_top = src - stride;
+    uint8_t *src_left = src - 1;
+    uint8_t *dst = src;
+
+    intra_predict_dc_8x8_msa(src_top, src_left, stride, dst, stride, 0, 0);
+}
+
+void ff_h264_intra_pred_dc_128_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t *src_top = src - stride;
+    uint8_t *src_left = src - 1;
+    uint8_t *dst = src;
+
+    intra_predict_dc_16x16_msa(src_top, src_left, stride, dst, stride, 0, 0);
+}
+
+void ff_vp8_pred8x8_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_127dc_8x8_msa(src, stride);
+}
+
+void ff_vp8_pred8x8_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_129dc_8x8_msa(src, stride);
+}
+
+void ff_vp8_pred16x16_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_127dc_16x16_msa(src, stride);
+}
+
+void ff_vp8_pred16x16_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    intra_predict_129dc_16x16_msa(src, stride);
+}
diff --git a/libavcodec/mips/h264qpel_init_mips.c b/libavcodec/mips/h264qpel_init_mips.c
new file mode 100644
index 0000000000..92219f8877
--- /dev/null
+++ b/libavcodec/mips/h264qpel_init_mips.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void h264qpel_init_msa(H264QpelContext *c, int bit_depth)
+{
+    if (8 == bit_depth) {
+        c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_msa;
+        c->put_h264_qpel_pixels_tab[0][1] = ff_put_h264_qpel16_mc10_msa;
+        c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_msa;
+        c->put_h264_qpel_pixels_tab[0][3] = ff_put_h264_qpel16_mc30_msa;
+        c->put_h264_qpel_pixels_tab[0][4] = ff_put_h264_qpel16_mc01_msa;
+        c->put_h264_qpel_pixels_tab[0][5] = ff_put_h264_qpel16_mc11_msa;
+        c->put_h264_qpel_pixels_tab[0][6] = ff_put_h264_qpel16_mc21_msa;
+        c->put_h264_qpel_pixels_tab[0][7] = ff_put_h264_qpel16_mc31_msa;
+        c->put_h264_qpel_pixels_tab[0][8] = ff_put_h264_qpel16_mc02_msa;
+        c->put_h264_qpel_pixels_tab[0][9] = ff_put_h264_qpel16_mc12_msa;
+        c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_msa;
+        c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_msa;
+        c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_msa;
+        c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_msa;
+        c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_msa;
+        c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_msa;
+
+        c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_msa;
+        c->put_h264_qpel_pixels_tab[1][1] = ff_put_h264_qpel8_mc10_msa;
+        c->put_h264_qpel_pixels_tab[1][2] = ff_put_h264_qpel8_mc20_msa;
+        c->put_h264_qpel_pixels_tab[1][3] = ff_put_h264_qpel8_mc30_msa;
+        c->put_h264_qpel_pixels_tab[1][4] = ff_put_h264_qpel8_mc01_msa;
+        c->put_h264_qpel_pixels_tab[1][5] = ff_put_h264_qpel8_mc11_msa;
+        c->put_h264_qpel_pixels_tab[1][6] = ff_put_h264_qpel8_mc21_msa;
+        c->put_h264_qpel_pixels_tab[1][7] = ff_put_h264_qpel8_mc31_msa;
+        c->put_h264_qpel_pixels_tab[1][8] = ff_put_h264_qpel8_mc02_msa;
+        c->put_h264_qpel_pixels_tab[1][9] = ff_put_h264_qpel8_mc12_msa;
+        c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_msa;
+        c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_msa;
+        c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_msa;
+        c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_msa;
+        c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_msa;
+        c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_msa;
+
+        c->put_h264_qpel_pixels_tab[2][1] = ff_put_h264_qpel4_mc10_msa;
+        c->put_h264_qpel_pixels_tab[2][2] = ff_put_h264_qpel4_mc20_msa;
+        c->put_h264_qpel_pixels_tab[2][3] = ff_put_h264_qpel4_mc30_msa;
+        c->put_h264_qpel_pixels_tab[2][4] = ff_put_h264_qpel4_mc01_msa;
+        c->put_h264_qpel_pixels_tab[2][5] = ff_put_h264_qpel4_mc11_msa;
+        c->put_h264_qpel_pixels_tab[2][6] = ff_put_h264_qpel4_mc21_msa;
+        c->put_h264_qpel_pixels_tab[2][7] = ff_put_h264_qpel4_mc31_msa;
+        c->put_h264_qpel_pixels_tab[2][8] = ff_put_h264_qpel4_mc02_msa;
+        c->put_h264_qpel_pixels_tab[2][9] = ff_put_h264_qpel4_mc12_msa;
+        c->put_h264_qpel_pixels_tab[2][10] = ff_put_h264_qpel4_mc22_msa;
+        c->put_h264_qpel_pixels_tab[2][11] = ff_put_h264_qpel4_mc32_msa;
+        c->put_h264_qpel_pixels_tab[2][12] = ff_put_h264_qpel4_mc03_msa;
+        c->put_h264_qpel_pixels_tab[2][13] = ff_put_h264_qpel4_mc13_msa;
+        c->put_h264_qpel_pixels_tab[2][14] = ff_put_h264_qpel4_mc23_msa;
+        c->put_h264_qpel_pixels_tab[2][15] = ff_put_h264_qpel4_mc33_msa;
+
+        c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_msa;
+        c->avg_h264_qpel_pixels_tab[0][1] = ff_avg_h264_qpel16_mc10_msa;
+        c->avg_h264_qpel_pixels_tab[0][2] = ff_avg_h264_qpel16_mc20_msa;
+        c->avg_h264_qpel_pixels_tab[0][3] = ff_avg_h264_qpel16_mc30_msa;
+        c->avg_h264_qpel_pixels_tab[0][4] = ff_avg_h264_qpel16_mc01_msa;
+        c->avg_h264_qpel_pixels_tab[0][5] = ff_avg_h264_qpel16_mc11_msa;
+        c->avg_h264_qpel_pixels_tab[0][6] = ff_avg_h264_qpel16_mc21_msa;
+        c->avg_h264_qpel_pixels_tab[0][7] = ff_avg_h264_qpel16_mc31_msa;
+        c->avg_h264_qpel_pixels_tab[0][8] = ff_avg_h264_qpel16_mc02_msa;
+        c->avg_h264_qpel_pixels_tab[0][9] = ff_avg_h264_qpel16_mc12_msa;
+        c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_msa;
+        c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_msa;
+        c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_msa;
+        c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_msa;
+        c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_msa;
+        c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_msa;
+
+        c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_h264_qpel8_mc00_msa;
+        c->avg_h264_qpel_pixels_tab[1][1] = ff_avg_h264_qpel8_mc10_msa;
+        c->avg_h264_qpel_pixels_tab[1][2] = ff_avg_h264_qpel8_mc20_msa;
+        c->avg_h264_qpel_pixels_tab[1][3] = ff_avg_h264_qpel8_mc30_msa;
+        c->avg_h264_qpel_pixels_tab[1][4] = ff_avg_h264_qpel8_mc01_msa;
+        c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_msa;
+        c->avg_h264_qpel_pixels_tab[1][6] = ff_avg_h264_qpel8_mc21_msa;
+        c->avg_h264_qpel_pixels_tab[1][7] = ff_avg_h264_qpel8_mc31_msa;
+        c->avg_h264_qpel_pixels_tab[1][8] = ff_avg_h264_qpel8_mc02_msa;
+        c->avg_h264_qpel_pixels_tab[1][9] = ff_avg_h264_qpel8_mc12_msa;
+        c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_msa;
+        c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_msa;
+        c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_msa;
+        c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_msa;
+        c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_msa;
+        c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_msa;
+
+        c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_h264_qpel4_mc00_msa;
+        c->avg_h264_qpel_pixels_tab[2][1] = ff_avg_h264_qpel4_mc10_msa;
+        c->avg_h264_qpel_pixels_tab[2][2] = ff_avg_h264_qpel4_mc20_msa;
+        c->avg_h264_qpel_pixels_tab[2][3] = ff_avg_h264_qpel4_mc30_msa;
+        c->avg_h264_qpel_pixels_tab[2][4] = ff_avg_h264_qpel4_mc01_msa;
+        c->avg_h264_qpel_pixels_tab[2][5] = ff_avg_h264_qpel4_mc11_msa;
+        c->avg_h264_qpel_pixels_tab[2][6] = ff_avg_h264_qpel4_mc21_msa;
+        c->avg_h264_qpel_pixels_tab[2][7] = ff_avg_h264_qpel4_mc31_msa;
+        c->avg_h264_qpel_pixels_tab[2][8] = ff_avg_h264_qpel4_mc02_msa;
+        c->avg_h264_qpel_pixels_tab[2][9] = ff_avg_h264_qpel4_mc12_msa;
+        c->avg_h264_qpel_pixels_tab[2][10] = ff_avg_h264_qpel4_mc22_msa;
+        c->avg_h264_qpel_pixels_tab[2][11] = ff_avg_h264_qpel4_mc32_msa;
+        c->avg_h264_qpel_pixels_tab[2][12] = ff_avg_h264_qpel4_mc03_msa;
+        c->avg_h264_qpel_pixels_tab[2][13] = ff_avg_h264_qpel4_mc13_msa;
+        c->avg_h264_qpel_pixels_tab[2][14] = ff_avg_h264_qpel4_mc23_msa;
+        c->avg_h264_qpel_pixels_tab[2][15] = ff_avg_h264_qpel4_mc33_msa;
+    }
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void h264qpel_init_mmi(H264QpelContext *c, int bit_depth)
+{
+    if (8 == bit_depth) {
+        c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_mmi;
+        c->put_h264_qpel_pixels_tab[0][1] = ff_put_h264_qpel16_mc10_mmi;
+        c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_mmi;
+        c->put_h264_qpel_pixels_tab[0][3] = ff_put_h264_qpel16_mc30_mmi;
+        c->put_h264_qpel_pixels_tab[0][4] = ff_put_h264_qpel16_mc01_mmi;
+        c->put_h264_qpel_pixels_tab[0][5] = ff_put_h264_qpel16_mc11_mmi;
+        c->put_h264_qpel_pixels_tab[0][6] = ff_put_h264_qpel16_mc21_mmi;
+        c->put_h264_qpel_pixels_tab[0][7] = ff_put_h264_qpel16_mc31_mmi;
+        c->put_h264_qpel_pixels_tab[0][8] = ff_put_h264_qpel16_mc02_mmi;
+        c->put_h264_qpel_pixels_tab[0][9] = ff_put_h264_qpel16_mc12_mmi;
+        c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_mmi;
+        c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_mmi;
+        c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_mmi;
+        c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_mmi;
+        c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_mmi;
+        c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_mmi;
+
+        c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_mmi;
+        c->put_h264_qpel_pixels_tab[1][1] = ff_put_h264_qpel8_mc10_mmi;
+        c->put_h264_qpel_pixels_tab[1][2] = ff_put_h264_qpel8_mc20_mmi;
+        c->put_h264_qpel_pixels_tab[1][3] = ff_put_h264_qpel8_mc30_mmi;
+        c->put_h264_qpel_pixels_tab[1][4] = ff_put_h264_qpel8_mc01_mmi;
+        c->put_h264_qpel_pixels_tab[1][5] = ff_put_h264_qpel8_mc11_mmi;
+        c->put_h264_qpel_pixels_tab[1][6] = ff_put_h264_qpel8_mc21_mmi;
+        c->put_h264_qpel_pixels_tab[1][7] = ff_put_h264_qpel8_mc31_mmi;
+        c->put_h264_qpel_pixels_tab[1][8] = ff_put_h264_qpel8_mc02_mmi;
+        c->put_h264_qpel_pixels_tab[1][9] = ff_put_h264_qpel8_mc12_mmi;
+        c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_mmi;
+        c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_mmi;
+        c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_mmi;
+        c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_mmi;
+        c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_mmi;
+        c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_mmi;
+
+        c->put_h264_qpel_pixels_tab[2][0] = ff_put_h264_qpel4_mc00_mmi;
+        c->put_h264_qpel_pixels_tab[2][1] = ff_put_h264_qpel4_mc10_mmi;
+        c->put_h264_qpel_pixels_tab[2][2] = ff_put_h264_qpel4_mc20_mmi;
+        c->put_h264_qpel_pixels_tab[2][3] = ff_put_h264_qpel4_mc30_mmi;
+        c->put_h264_qpel_pixels_tab[2][4] = ff_put_h264_qpel4_mc01_mmi;
+        c->put_h264_qpel_pixels_tab[2][5] = ff_put_h264_qpel4_mc11_mmi;
+        c->put_h264_qpel_pixels_tab[2][6] = ff_put_h264_qpel4_mc21_mmi;
+        c->put_h264_qpel_pixels_tab[2][7] = ff_put_h264_qpel4_mc31_mmi;
+        c->put_h264_qpel_pixels_tab[2][8] = ff_put_h264_qpel4_mc02_mmi;
+        c->put_h264_qpel_pixels_tab[2][9] = ff_put_h264_qpel4_mc12_mmi;
+        c->put_h264_qpel_pixels_tab[2][10] = ff_put_h264_qpel4_mc22_mmi;
+        c->put_h264_qpel_pixels_tab[2][11] = ff_put_h264_qpel4_mc32_mmi;
+        c->put_h264_qpel_pixels_tab[2][12] = ff_put_h264_qpel4_mc03_mmi;
+        c->put_h264_qpel_pixels_tab[2][13] = ff_put_h264_qpel4_mc13_mmi;
+        c->put_h264_qpel_pixels_tab[2][14] = ff_put_h264_qpel4_mc23_mmi;
+        c->put_h264_qpel_pixels_tab[2][15] = ff_put_h264_qpel4_mc33_mmi;
+
+        c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_mmi;
+        c->avg_h264_qpel_pixels_tab[0][1] = ff_avg_h264_qpel16_mc10_mmi;
+        c->avg_h264_qpel_pixels_tab[0][2] = ff_avg_h264_qpel16_mc20_mmi;
+        c->avg_h264_qpel_pixels_tab[0][3] = ff_avg_h264_qpel16_mc30_mmi;
+        c->avg_h264_qpel_pixels_tab[0][4] = ff_avg_h264_qpel16_mc01_mmi;
+        c->avg_h264_qpel_pixels_tab[0][5] = ff_avg_h264_qpel16_mc11_mmi;
+        c->avg_h264_qpel_pixels_tab[0][6] = ff_avg_h264_qpel16_mc21_mmi;
+        c->avg_h264_qpel_pixels_tab[0][7] = ff_avg_h264_qpel16_mc31_mmi;
+        c->avg_h264_qpel_pixels_tab[0][8] = ff_avg_h264_qpel16_mc02_mmi;
+        c->avg_h264_qpel_pixels_tab[0][9] = ff_avg_h264_qpel16_mc12_mmi;
+        c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_mmi;
+        c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_mmi;
+        c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_mmi;
+        c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_mmi;
+        c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_mmi;
+        c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_mmi;
+
+        c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_h264_qpel8_mc00_mmi;
+        c->avg_h264_qpel_pixels_tab[1][1] = ff_avg_h264_qpel8_mc10_mmi;
+        c->avg_h264_qpel_pixels_tab[1][2] = ff_avg_h264_qpel8_mc20_mmi;
+        c->avg_h264_qpel_pixels_tab[1][3] = ff_avg_h264_qpel8_mc30_mmi;
+        c->avg_h264_qpel_pixels_tab[1][4] = ff_avg_h264_qpel8_mc01_mmi;
+        c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_mmi;
+        c->avg_h264_qpel_pixels_tab[1][6] = ff_avg_h264_qpel8_mc21_mmi;
+        c->avg_h264_qpel_pixels_tab[1][7] = ff_avg_h264_qpel8_mc31_mmi;
+        c->avg_h264_qpel_pixels_tab[1][8] = ff_avg_h264_qpel8_mc02_mmi;
+        c->avg_h264_qpel_pixels_tab[1][9] = ff_avg_h264_qpel8_mc12_mmi;
+        c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_mmi;
+        c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_mmi;
+        c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_mmi;
+        c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_mmi;
+        c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_mmi;
+        c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_mmi;
+
+        c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_h264_qpel4_mc00_mmi;
+        c->avg_h264_qpel_pixels_tab[2][1] = ff_avg_h264_qpel4_mc10_mmi;
+        c->avg_h264_qpel_pixels_tab[2][2] = ff_avg_h264_qpel4_mc20_mmi;
+        c->avg_h264_qpel_pixels_tab[2][3] = ff_avg_h264_qpel4_mc30_mmi;
+        c->avg_h264_qpel_pixels_tab[2][4] = ff_avg_h264_qpel4_mc01_mmi;
+        c->avg_h264_qpel_pixels_tab[2][5] = ff_avg_h264_qpel4_mc11_mmi;
+        c->avg_h264_qpel_pixels_tab[2][6] = ff_avg_h264_qpel4_mc21_mmi;
+        c->avg_h264_qpel_pixels_tab[2][7] = ff_avg_h264_qpel4_mc31_mmi;
+        c->avg_h264_qpel_pixels_tab[2][8] = ff_avg_h264_qpel4_mc02_mmi;
+        c->avg_h264_qpel_pixels_tab[2][9] = ff_avg_h264_qpel4_mc12_mmi;
+        c->avg_h264_qpel_pixels_tab[2][10] = ff_avg_h264_qpel4_mc22_mmi;
+        c->avg_h264_qpel_pixels_tab[2][11] = ff_avg_h264_qpel4_mc32_mmi;
+        c->avg_h264_qpel_pixels_tab[2][12] = ff_avg_h264_qpel4_mc03_mmi;
+        c->avg_h264_qpel_pixels_tab[2][13] = ff_avg_h264_qpel4_mc13_mmi;
+        c->avg_h264_qpel_pixels_tab[2][14] = ff_avg_h264_qpel4_mc23_mmi;
+        c->avg_h264_qpel_pixels_tab[2][15] = ff_avg_h264_qpel4_mc33_mmi;
+    }
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth)
+{
+#if HAVE_MSA
+    h264qpel_init_msa(c, bit_depth);
+#endif  // #if HAVE_MSA
+#if HAVE_MMI
+    h264qpel_init_mmi(c, bit_depth);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/h264qpel_mmi.c b/libavcodec/mips/h264qpel_mmi.c
new file mode 100644
index 0000000000..e04a2d5936
--- /dev/null
+++ b/libavcodec/mips/h264qpel_mmi.c
@@ -0,0 +1,2637 @@
+/*
+ * Loongson SIMD optimized h264qpel
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_mips.h"
+#include "libavcodec/bit_depth_template.c"
+
+static inline void copy_block4_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride, int h)
+{
+    __asm__ volatile (
+        "1:                                     \r\n"
+        "gslwlc1 $f2, 3(%[src])                 \r\n"
+        "gslwrc1 $f2, 0(%[src])                 \r\n"
+        "gsswlc1 $f2, 3(%[dst])                 \r\n"
+        "gsswrc1 $f2, 0(%[dst])                 \r\n"
+        "dadd %[src], %[src], %[srcStride]      \r\n"
+        "dadd %[dst], %[dst], %[dstStride]      \r\n"
+        "daddi %[h], %[h], -1                   \r\n"
+        "bnez %[h], 1b                          \r\n"
+        : [dst]"+&r"(dst),[src]"+&r"(src)
+        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),[h]"r"(h)
+        : "$f2"
+    );
+}
+
+static inline void copy_block8_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride, int h)
+{
+    __asm__ volatile (
+        "1:                                     \r\n"
+        "gsldlc1 $f2, 7(%[src])                 \r\n"
+        "gsldrc1 $f2, 0(%[src])                 \r\n"
+        "gssdlc1 $f2, 7(%[dst])                 \r\n"
+        "gssdrc1 $f2, 0(%[dst])                 \r\n"
+        "dadd %[src], %[src], %[srcStride]      \r\n"
+        "dadd %[dst], %[dst], %[dstStride]      \r\n"
+        "daddi %[h], %[h], -1                   \r\n"
+        "bnez %[h], 1b                          \r\n"
+        : [dst]"+&r"(dst),[src]"+&r"(src)
+        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),[h]"r"(h)
+        : "$f2"
+    );
+}
+
+static inline void copy_block16_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride, int h)
+{
+    __asm__ volatile (
+        "1:                                     \r\n"
+        "gsldlc1 $f2, 7(%[src])                 \r\n"
+        "gsldrc1 $f2, 0(%[src])                 \r\n"
+        "gsldlc1 $f4, 15(%[src])                \r\n"
+        "gsldrc1 $f4, 8(%[src])                 \r\n"
+        "gssdlc1 $f2, 7(%[dst])                 \r\n"
+        "gssdrc1 $f2, 0(%[dst])                 \r\n"
+        "gssdlc1 $f4, 15(%[dst])                \r\n"
+        "gssdrc1 $f4, 8(%[dst])                 \r\n"
+        "dadd %[src], %[src], %[srcStride]      \r\n"
+        "dadd %[dst], %[dst], %[dstStride]      \r\n"
+        "daddi %[h], %[h], -1                   \r\n"
+        "bnez %[h], 1b                          \r\n"
+        : [dst]"+&r"(dst),[src]"+&r"(src)
+        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),[h]"r"(h)
+        : "$f2","$f4"
+    );
+}
+
+#define op_put(a, b) a = b
+#define op_avg(a, b) a = rnd_avg_pixel4(a, b)
+static inline void put_pixels4_mmi(uint8_t *block, const uint8_t *pixels,
+        ptrdiff_t line_size, int h)
+{
+    __asm__ volatile (
+        "1:                                     \r\n"
+        "gslwlc1 $f2, 3(%[pixels])              \r\n"
+        "gslwrc1 $f2, 0(%[pixels])              \r\n"
+        "gsswlc1 $f2, 3(%[block])               \r\n"
+        "gsswrc1 $f2, 0(%[block])               \r\n"
+        "dadd %[pixels], %[pixels], %[line_size]\r\n"
+        "dadd %[block], %[block], %[line_size]  \r\n"
+        "daddi %[h], %[h], -1                   \r\n"
+        "bnez %[h], 1b                          \r\n"
+        : [block]"+&r"(block),[pixels]"+&r"(pixels)
+        : [line_size]"r"(line_size),[h]"r"(h)
+        : "$f2"
+    );
+}
+
+static inline void put_pixels8_mmi(uint8_t *block, const uint8_t *pixels,
+        ptrdiff_t line_size, int h)
+{
+    __asm__ volatile (
+        "1:                                     \r\n"
+        "gsldlc1 $f2, 7(%[pixels])              \r\n"
+        "gsldrc1 $f2, 0(%[pixels])              \r\n"
+        "gssdlc1 $f2, 7(%[block])               \r\n"
+        "gssdrc1 $f2, 0(%[block])               \r\n"
+        "dadd %[pixels], %[pixels], %[line_size]\r\n"
+        "dadd %[block], %[block], %[line_size]  \r\n"
+        "daddi %[h], %[h], -1                   \r\n"
+        "bnez %[h], 1b                          \r\n"
+        : [block]"+&r"(block),[pixels]"+&r"(pixels)
+        : [line_size]"r"(line_size),[h]"r"(h)
+        : "$f2"
+    );
+}
+
+static inline void put_pixels16_mmi(uint8_t *block, const uint8_t *pixels,
+        ptrdiff_t line_size, int h)
+{
+    __asm__ volatile (
+        "1:                                     \r\n"
+        "gsldlc1 $f2, 7(%[pixels])              \r\n"
+        "gsldrc1 $f2, 0(%[pixels])              \r\n"
+        "gsldlc1 $f4, 15(%[pixels])             \r\n"
+        "gsldrc1 $f4, 8(%[pixels])              \r\n"
+        "gssdlc1 $f2, 7(%[block])               \r\n"
+        "gssdrc1 $f2, 0(%[block])               \r\n"
+        "gssdlc1 $f4, 15(%[block])              \r\n"
+        "gssdrc1 $f4, 8(%[block])               \r\n"
+        "dadd %[pixels], %[pixels], %[line_size]\r\n"
+        "dadd %[block], %[block], %[line_size]  \r\n"
+        "daddi %[h], %[h], -1                   \r\n"
+        "bnez %[h], 1b                          \r\n"
+        : [block]"+&r"(block),[pixels]"+&r"(pixels)
+        : [line_size]"r"(line_size),[h]"r"(h)
+        : "$f2","$f4"
+    );
+}
+
+static inline void avg_pixels4_mmi(uint8_t *block, const uint8_t *pixels,
+        ptrdiff_t line_size, int h)
+{
+    __asm__ volatile (
+        "1:                                     \r\n"
+        "gslwlc1 $f2, 3(%[pixels])              \r\n"
+        "gslwrc1 $f2, 0(%[pixels])              \r\n"
+        "gslwlc1 $f4, 3(%[block])               \r\n"
+        "gslwrc1 $f4, 0(%[block])               \r\n"
+        "pavgb $f2, $f2, $f4                    \r\n"
+        "gsswlc1 $f2, 3(%[block])               \r\n"
+        "gsswrc1 $f2, 0(%[block])               \r\n"
+        "dadd %[pixels], %[pixels], %[line_size]\r\n"
+        "dadd %[block], %[block], %[line_size]  \r\n"
+        "daddi %[h], %[h], -1                   \r\n"
+        "bnez %[h], 1b                          \r\n"
+        : [block]"+&r"(block),[pixels]"+&r"(pixels)
+        : [line_size]"r"(line_size),[h]"r"(h)
+        : "$f2","$f4"
+    );
+}
+
+static inline void avg_pixels8_mmi(uint8_t *block, const uint8_t *pixels,
+        ptrdiff_t line_size, int h)
+{
+    __asm__ volatile (
+        "1:                                     \r\n"
+        "gsldlc1 $f2, 7(%[block])               \r\n"
+        "gsldrc1 $f2, 0(%[block])               \r\n"
+        "gsldlc1 $f4, 7(%[pixels])              \r\n"
+        "gsldrc1 $f4, 0(%[pixels])              \r\n"
+        "pavgb $f2, $f2, $f4                    \r\n"
+        "gssdlc1 $f2, 7(%[block])               \r\n"
+        "gssdrc1 $f2, 0(%[block])               \r\n"
+        "dadd %[pixels], %[pixels], %[line_size]\r\n"
+        "dadd %[block], %[block], %[line_size]  \r\n"
+        "daddi %[h], %[h], -1                   \r\n"
+        "bnez %[h], 1b                          \r\n"
+        : [block]"+&r"(block),[pixels]"+&r"(pixels)
+        : [line_size]"r"(line_size),[h]"r"(h)
+        : "$f2","$f4"
+    );
+}
+
+static inline void avg_pixels16_mmi(uint8_t *block, const uint8_t *pixels,
+        ptrdiff_t line_size, int h)
+{
+    __asm__ volatile (
+        "1:                                     \r\n"
+        "gsldlc1 $f2, 7(%[block])               \r\n"
+        "gsldrc1 $f2, 0(%[block])               \r\n"
+        "gsldlc1 $f4, 15(%[block])              \r\n"
+        "gsldrc1 $f4, 8(%[block])               \r\n"
+        "gsldlc1 $f6, 7(%[pixels])              \r\n"
+        "gsldrc1 $f6, 0(%[pixels])              \r\n"
+        "gsldlc1 $f8, 15(%[pixels])             \r\n"
+        "gsldrc1 $f8, 8(%[pixels])              \r\n"
+        "pavgb $f2, $f2, $f6                    \r\n"
+        "pavgb $f4, $f4, $f8                    \r\n"
+        "gssdlc1 $f2, 7(%[block])               \r\n"
+        "gssdrc1 $f2, 0(%[block])               \r\n"
+        "gssdlc1 $f4, 15(%[block])              \r\n"
+        "gssdrc1 $f4, 8(%[block])               \r\n"
+        "dadd %[pixels], %[pixels], %[line_size]\r\n"
+        "dadd %[block], %[block], %[line_size]  \r\n"
+        "daddi %[h], %[h], -1                   \r\n"
+        "bnez %[h], 1b                          \r\n"
+        : [block]"+&r"(block),[pixels]"+&r"(pixels)
+        : [line_size]"r"(line_size),[h]"r"(h)
+        : "$f2","$f4","$f6","$f8"
+    );
+}
+
+static inline void put_pixels4_l2_mmi(uint8_t *dst, const uint8_t *src1,
+        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+        int h)
+{
+    int i;
+    for (i = 0; i < h; i++) {
+        pixel4 a, b;
+        a = AV_RN4P(&src1[i * src_stride1]);
+        b = AV_RN4P(&src2[i * src_stride2]);
+        op_put(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));
+    }
+}
+
+static inline void put_pixels8_l2_mmi(uint8_t *dst, const uint8_t *src1,
+        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+        int h)
+{
+    int i;
+    for (i = 0; i < h; i++) {
+        pixel4 a, b;
+        a = AV_RN4P(&src1[i * src_stride1]);
+        b = AV_RN4P(&src2[i * src_stride2]);
+        op_put(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));
+        a = AV_RN4P(&src1[i * src_stride1 + 4]);
+        b = AV_RN4P(&src2[i * src_stride2 + 4]);
+        op_put(*((pixel4 *) &dst[i * dst_stride + 4]), rnd_avg_pixel4(a, b));
+    }
+}
+
+static inline void put_pixels16_l2_mmi(uint8_t *dst, const uint8_t *src1,
+        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+        int h)
+{
+    int i;
+    for (i = 0; i < h; i++) {
+        pixel4 a, b;
+        a = AV_RN4P(&src1[i * src_stride1]);
+        b = AV_RN4P(&src2[i * src_stride2]);
+        op_put(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));
+        a = AV_RN4P(&src1[i * src_stride1 + 4]);
+        b = AV_RN4P(&src2[i * src_stride2 + 4]);
+        op_put(*((pixel4 *) &dst[i * dst_stride + 4]), rnd_avg_pixel4(a, b));
+        a = AV_RN4P(&src1[i * src_stride1 + 8]);
+        b = AV_RN4P(&src2[i * src_stride2 + 8]);
+        op_put(*((pixel4 *) &dst[i * dst_stride + 8]), rnd_avg_pixel4(a, b));
+        a = AV_RN4P(&src1[i * src_stride1 + 12]);
+        b = AV_RN4P(&src2[i * src_stride2 + 12]);
+        op_put(*((pixel4 *) &dst[i * dst_stride + 12]), rnd_avg_pixel4(a, b));
+    }
+}
+
+static inline void avg_pixels4_l2_mmi(uint8_t *dst, const uint8_t *src1,
+        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+        int h)
+{
+    int i;
+    for (i = 0; i < h; i++) {
+        pixel4 a, b;
+        a = AV_RN4P(&src1[i * src_stride1]);
+        b = AV_RN4P(&src2[i * src_stride2]);
+        op_avg(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));
+    }
+}
+
+static inline void avg_pixels8_l2_mmi(uint8_t *dst, const uint8_t *src1,
+        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+        int h)
+{
+    int i;
+    for (i = 0; i < h; i++) {
+        pixel4 a, b;
+        a = AV_RN4P(&src1[i * src_stride1]);
+        b = AV_RN4P(&src2[i * src_stride2]);
+        op_avg(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));
+        a = AV_RN4P(&src1[i * src_stride1 + 4]);
+        b = AV_RN4P(&src2[i * src_stride2 + 4]);
+        op_avg(*((pixel4 *) &dst[i * dst_stride + 4]), rnd_avg_pixel4(a, b));
+    }
+}
+
+static inline void avg_pixels16_l2_mmi(uint8_t *dst, const uint8_t *src1,
+        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+        int h)
+{
+    int i;
+    for (i = 0; i < h; i++) {
+        pixel4 a, b;
+        a = AV_RN4P(&src1[i * src_stride1]);
+        b = AV_RN4P(&src2[i * src_stride2]);
+        op_avg(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));
+        a = AV_RN4P(&src1[i * src_stride1 + 4]);
+        b = AV_RN4P(&src2[i * src_stride2 + 4]);
+        op_avg(*((pixel4 *) &dst[i * dst_stride + 4]), rnd_avg_pixel4(a, b));
+        a = AV_RN4P(&src1[i * src_stride1 + 8]);
+        b = AV_RN4P(&src2[i * src_stride2 + 8]);
+        op_avg(*((pixel4 *) &dst[i * dst_stride + 8]), rnd_avg_pixel4(a, b));
+        a = AV_RN4P(&src1[i * src_stride1 + 12]);
+        b = AV_RN4P(&src2[i * src_stride2 + 12]);
+        op_avg(*((pixel4 *) &dst[i * dst_stride + 12]), rnd_avg_pixel4(a, b));
+
+    }
+}
+#undef op_put
+#undef op_avg
+
+#define op2_avg(a, b)  a = (((a)+CLIP(((b) + 512)>>10)+1)>>1)
+#define op2_put(a, b)  a = CLIP(((b) + 512)>>10)
+static void put_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    __asm__ volatile (
+        "xor $f0, $f0, $f0                      \r\n"
+        "dli $8, 4                              \r\n"
+        "1:                                     \r\n"
+        "gslwlc1 $f2, 1(%[src])                 \r\n"
+        "gslwrc1 $f2, -2(%[src])                \r\n"
+        "gslwlc1 $f4, 2(%[src])                 \r\n"
+        "gslwrc1 $f4, -1(%[src])                \r\n"
+        "gslwlc1 $f6, 3(%[src])                 \r\n"
+        "gslwrc1 $f6, 0(%[src])                 \r\n"
+        "gslwlc1 $f8, 4(%[src])                 \r\n"
+        "gslwrc1 $f8, 1(%[src])                 \r\n"
+        "gslwlc1 $f10, 5(%[src])                \r\n"
+        "gslwrc1 $f10, 2(%[src])                \r\n"
+        "gslwlc1 $f12, 6(%[src])                \r\n"
+        "gslwrc1 $f12, 3(%[src])                \r\n"
+        "punpcklbh $f2, $f2, $f0                \r\n"
+        "punpcklbh $f4, $f4, $f0                \r\n"
+        "punpcklbh $f6, $f6, $f0                \r\n"
+        "punpcklbh $f8, $f8, $f0                \r\n"
+        "punpcklbh $f10, $f10, $f0              \r\n"
+        "punpcklbh $f12, $f12, $f0              \r\n"
+        "paddsh $f14, $f6, $f8                  \r\n"
+        "paddsh $f16, $f4, $f10                 \r\n"
+        "paddsh $f18, $f2, $f12                 \r\n"
+        "pmullh $f14, $f14, %[ff_pw_20]         \r\n"
+        "pmullh $f16, $f16, %[ff_pw_5]          \r\n"
+        "psubsh $f14, $f14, $f16                \r\n"
+        "paddsh $f18, $f14, $f18                \r\n"
+        "paddsh $f18, $f18, %[ff_pw_16]         \r\n"
+        "psrah $f18, $f18, %[ff_pw_5]           \r\n"
+        "packushb $f18, $f18, $f0               \r\n"
+        "gsswlc1 $f18, 3(%[dst])                \r\n"
+        "gsswrc1 $f18, 0(%[dst])                \r\n"
+        "dadd %[dst], %[dst], %[dstStride]      \r\n"
+        "dadd %[src], %[src], %[srcStride]      \r\n"
+        "daddi $8, $8, -1                       \r\n"
+        "bnez $8, 1b                            \r\n"
+        : [dst]"+&r"(dst),[src]"+&r"(src)
+        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),
+          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5),[ff_pw_16]"f"(ff_pw_16)
+        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
+          "$f18"
+    );
+}
+
+static void put_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    __asm__ volatile (
+        "xor $f0, $f0, $f0                      \r\n"
+        "dli $8, 8                              \r\n"
+        "1:                                     \r\n"
+        "gsldlc1 $f2, 5(%[src])                 \r\n"
+        "gsldrc1 $f2, -2(%[src])                \r\n"
+        "gsldlc1 $f4, 6(%[src])                 \r\n"
+        "gsldrc1 $f4, -1(%[src])                \r\n"
+        "gsldlc1 $f6, 7(%[src])                 \r\n"
+        "gsldrc1 $f6, 0(%[src])                 \r\n"
+        "gsldlc1 $f8, 8(%[src])                 \r\n"
+        "gsldrc1 $f8, 1(%[src])                 \r\n"
+        "gsldlc1 $f10, 9(%[src])                \r\n"
+        "gsldrc1 $f10, 2(%[src])                \r\n"
+        "gsldlc1 $f12, 10(%[src])               \r\n"
+        "gsldrc1 $f12, 3(%[src])                \r\n"
+        "punpcklbh $f14, $f6, $f0               \r\n"
+        "punpckhbh $f16, $f6, $f0               \r\n"
+        "punpcklbh $f18, $f8, $f0               \r\n"
+        "punpckhbh $f20, $f8, $f0               \r\n"
+        "paddsh $f6, $f14, $f18                 \r\n"
+        "paddsh $f8, $f16, $f20                 \r\n"
+        "pmullh $f6, $f6, %[ff_pw_20]           \r\n"
+        "pmullh $f8, $f8, %[ff_pw_20]           \r\n"
+        "punpcklbh $f14, $f4, $f0               \r\n"
+        "punpckhbh $f16, $f4, $f0               \r\n"
+        "punpcklbh $f18, $f10, $f0              \r\n"
+        "punpckhbh $f20, $f10, $f0              \r\n"
+        "paddsh $f4, $f14, $f18                 \r\n"
+        "paddsh $f10, $f16, $f20                \r\n"
+        "pmullh $f4, $f4, %[ff_pw_5]            \r\n"
+        "pmullh $f10, $f10, %[ff_pw_5]          \r\n"
+        "punpcklbh $f14, $f2, $f0               \r\n"
+        "punpckhbh $f16, $f2, $f0               \r\n"
+        "punpcklbh $f18, $f12, $f0              \r\n"
+        "punpckhbh $f20, $f12, $f0              \r\n"
+        "paddsh $f2, $f14, $f18                 \r\n"
+        "paddsh $f12, $f16, $f20                \r\n"
+        "psubsh $f6, $f6, $f4                   \r\n"
+        "psubsh $f8, $f8, $f10                  \r\n"
+        "paddsh $f6, $f6, $f2                   \r\n"
+        "paddsh $f8, $f8, $f12                  \r\n"
+        "paddsh $f6, $f6, %[ff_pw_16]           \r\n"
+        "paddsh $f8, $f8, %[ff_pw_16]           \r\n"
+        "psrah $f6, $f6, %[ff_pw_5]             \r\n"
+        "psrah $f8, $f8, %[ff_pw_5]             \r\n"
+        "packushb $f18, $f6, $f8                \r\n"
+        "sdc1 $f18, 0(%[dst])                   \r\n"
+        "dadd %[dst], %[dst], %[dstStride]      \r\n"
+        "dadd %[src], %[src], %[srcStride]      \r\n"
+        "daddi $8, $8, -1                       \r\n"
+        "bnez $8, 1b                            \r\n"
+        : [dst]"+&r"(dst),[src]"+&r"(src)
+        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),
+          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5),[ff_pw_16]"f"(ff_pw_16)
+        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
+          "$f18","$f20"
+    );
+}
+
+static void put_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    put_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    put_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+}
+
+static void avg_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    __asm__ volatile (
+        "xor $f0, $f0, $f0                      \r\n"
+        "dli $8, 4                              \r\n"
+        "1:                                     \r\n"
+        "gslwlc1 $f2, 1(%[src])                 \r\n"
+        "gslwrc1 $f2, -2(%[src])                \r\n"
+        "gslwlc1 $f4, 2(%[src])                 \r\n"
+        "gslwrc1 $f4, -1(%[src])                \r\n"
+        "gslwlc1 $f6, 3(%[src])                 \r\n"
+        "gslwrc1 $f6, 0(%[src])                 \r\n"
+        "gslwlc1 $f8, 4(%[src])                 \r\n"
+        "gslwrc1 $f8, 1(%[src])                 \r\n"
+        "gslwlc1 $f10, 5(%[src])                \r\n"
+        "gslwrc1 $f10, 2(%[src])                \r\n"
+        "gslwlc1 $f12, 6(%[src])                \r\n"
+        "gslwrc1 $f12, 3(%[src])                \r\n"
+        "punpcklbh $f2, $f2, $f0                \r\n"
+        "punpcklbh $f4, $f4, $f0                \r\n"
+        "punpcklbh $f6, $f6, $f0                \r\n"
+        "punpcklbh $f8, $f8, $f0                \r\n"
+        "punpcklbh $f10, $f10, $f0              \r\n"
+        "punpcklbh $f12, $f12, $f0              \r\n"
+        "paddsh $f14, $f6, $f8                  \r\n"
+        "paddsh $f16, $f4, $f10                 \r\n"
+        "paddsh $f18, $f2, $f12                 \r\n"
+        "pmullh $f14, $f14, %[ff_pw_20]         \r\n"
+        "pmullh $f16, $f16, %[ff_pw_5]          \r\n"
+        "psubsh $f14, $f14, $f16                \r\n"
+        "paddsh $f18, $f14, $f18                \r\n"
+        "paddsh $f18, $f18, %[ff_pw_16]         \r\n"
+        "psrah $f18, $f18, %[ff_pw_5]           \r\n"
+        "packushb $f18, $f18, $f0               \r\n"
+        "lwc1 $f20, 0(%[dst])                   \r\n"
+        "pavgb $f18, $f18, $f20                 \r\n"
+        "gsswlc1 $f18, 3(%[dst])                \r\n"
+        "gsswrc1 $f18, 0(%[dst])                \r\n"
+        "dadd %[dst], %[dst], %[dstStride]      \r\n"
+        "dadd %[src], %[src], %[srcStride]      \r\n"
+        "daddi $8, $8, -1                       \r\n"
+        "bnez $8, 1b                            \r\n"
+        : [dst]"+&r"(dst),[src]"+&r"(src)
+        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),
+          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5),[ff_pw_16]"f"(ff_pw_16)
+        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
+          "$f18","$f20"
+    );
+}
+
+static void avg_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    __asm__ volatile (
+        "xor $f0, $f0, $f0                      \r\n"
+        "dli $8, 8                              \r\n"
+        "1:                                     \r\n"
+        "gsldlc1 $f2, 5(%[src])                 \r\n"
+        "gsldrc1 $f2, -2(%[src])                \r\n"
+        "gsldlc1 $f4, 6(%[src])                 \r\n"
+        "gsldrc1 $f4, -1(%[src])                \r\n"
+        "gsldlc1 $f6, 7(%[src])                 \r\n"
+        "gsldrc1 $f6, 0(%[src])                 \r\n"
+        "gsldlc1 $f8, 8(%[src])                 \r\n"
+        "gsldrc1 $f8, 1(%[src])                 \r\n"
+        "gsldlc1 $f10, 9(%[src])                \r\n"
+        "gsldrc1 $f10, 2(%[src])                \r\n"
+        "gsldlc1 $f12, 10(%[src])               \r\n"
+        "gsldrc1 $f12, 3(%[src])                \r\n"
+        "punpcklbh $f14, $f6, $f0               \r\n"
+        "punpckhbh $f16, $f6, $f0               \r\n"
+        "punpcklbh $f18, $f8, $f0               \r\n"
+        "punpckhbh $f20, $f8, $f0               \r\n"
+        "paddsh $f6, $f14, $f18                 \r\n"
+        "paddsh $f8, $f16, $f20                 \r\n"
+        "pmullh $f6, $f6, %[ff_pw_20]           \r\n"
+        "pmullh $f8, $f8, %[ff_pw_20]           \r\n"
+        "punpcklbh $f14, $f4, $f0               \r\n"
+        "punpckhbh $f16, $f4, $f0               \r\n"
+        "punpcklbh $f18, $f10, $f0              \r\n"
+        "punpckhbh $f20, $f10, $f0              \r\n"
+        "paddsh $f4, $f14, $f18                 \r\n"
+        "paddsh $f10, $f16, $f20                \r\n"
+        "pmullh $f4, $f4, %[ff_pw_5]            \r\n"
+        "pmullh $f10, $f10, %[ff_pw_5]          \r\n"
+        "punpcklbh $f14, $f2, $f0               \r\n"
+        "punpckhbh $f16, $f2, $f0               \r\n"
+        "punpcklbh $f18, $f12, $f0              \r\n"
+        "punpckhbh $f20, $f12, $f0              \r\n"
+        "paddsh $f2, $f14, $f18                 \r\n"
+        "paddsh $f12, $f16, $f20                \r\n"
+        "psubsh $f6, $f6, $f4                   \r\n"
+        "psubsh $f8, $f8, $f10                  \r\n"
+        "paddsh $f6, $f6, $f2                   \r\n"
+        "paddsh $f8, $f8, $f12                  \r\n"
+        "paddsh $f6, $f6, %[ff_pw_16]           \r\n"
+        "paddsh $f8, $f8, %[ff_pw_16]           \r\n"
+        "psrah $f6, $f6, %[ff_pw_5]             \r\n"
+        "psrah $f8, $f8, %[ff_pw_5]             \r\n"
+        "packushb $f18, $f6, $f8                \r\n"
+        "ldc1 $f20, 0(%[dst])                   \r\n"
+        "pavgb $f18, $f18, $f20                 \r\n"
+        "sdc1 $f18, 0(%[dst])                   \r\n"
+        "dadd %[dst], %[dst], %[dstStride]      \r\n"
+        "dadd %[src], %[src], %[srcStride]      \r\n"
+        "daddi $8, $8, -1                       \r\n"
+        "bnez $8, 1b                            \r\n"
+        : [dst]"+&r"(dst),[src]"+&r"(src)
+        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),
+          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5),[ff_pw_16]"f"(ff_pw_16)
+        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
+          "$f18","$f20"
+    );
+}
+
+static void avg_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    avg_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    avg_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+}
+
+static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    __asm__ volatile (
+        "xor $f0, $f0, $f0                      \r\n"
+        "gslwlc1 $f2, 3(%[srcB])                \r\n"
+        "gslwrc1 $f2, 0(%[srcB])                \r\n"
+        "gslwlc1 $f4, 3(%[srcA])                \r\n"
+        "gslwrc1 $f4, 0(%[srcA])                \r\n"
+        "gslwlc1 $f6, 3(%[src0])                \r\n"
+        "gslwrc1 $f6, 0(%[src0])                \r\n"
+        "gslwlc1 $f8, 3(%[src1])                \r\n"
+        "gslwrc1 $f8, 0(%[src1])                \r\n"
+        "gslwlc1 $f10, 3(%[src2])               \r\n"
+        "gslwrc1 $f10, 0(%[src2])               \r\n"
+        "gslwlc1 $f12, 3(%[src3])               \r\n"
+        "gslwrc1 $f12, 0(%[src3])               \r\n"
+        "gslwlc1 $f14, 3(%[src4])               \r\n"
+        "gslwrc1 $f14, 0(%[src4])               \r\n"
+        "gslwlc1 $f16, 3(%[src5])               \r\n"
+        "gslwrc1 $f16, 0(%[src5])               \r\n"
+        "gslwlc1 $f18, 3(%[src6])               \r\n"
+        "gslwrc1 $f18, 0(%[src6])               \r\n"
+        "punpcklbh $f2, $f2, $f0                \r\n"
+        "punpcklbh $f4, $f4, $f0                \r\n"
+        "punpcklbh $f6, $f6, $f0                \r\n"
+        "punpcklbh $f8, $f8, $f0                \r\n"
+        "punpcklbh $f10, $f10, $f0              \r\n"
+        "punpcklbh $f12, $f12, $f0              \r\n"
+        "punpcklbh $f14, $f14, $f0              \r\n"
+        "punpcklbh $f16, $f16, $f0              \r\n"
+        "punpcklbh $f18, $f18, $f0              \r\n"
+        "paddsh $f20, $f6, $f8                  \r\n"
+        "pmullh $f20, $f20, %[ff_pw_20]         \r\n"
+        "paddsh $f22, $f4, $f10                 \r\n"
+        "pmullh $f22, $f22, %[ff_pw_5]          \r\n"
+        "psubsh $f24, $f20, $f22                \r\n"
+        "paddsh $f24, $f24, $f2                 \r\n"
+        "paddsh $f24, $f24, $f12                \r\n"
+        "paddsh $f20, $f8, $f10                 \r\n"
+        "pmullh $f20, $f20, %[ff_pw_20]         \r\n"
+        "paddsh $f22, $f6, $f12                 \r\n"
+        "pmullh $f22, $f22, %[ff_pw_5]          \r\n"
+        "psubsh $f26, $f20, $f22                \r\n"
+        "paddsh $f26, $f26, $f4                 \r\n"
+        "paddsh $f26, $f26, $f14                \r\n"
+        "paddsh $f20, $f10, $f12                \r\n"
+        "pmullh $f20, $f20, %[ff_pw_20]         \r\n"
+        "paddsh $f22, $f8, $f14                 \r\n"
+        "pmullh $f22, $f22, %[ff_pw_5]          \r\n"
+        "psubsh $f28, $f20, $f22                \r\n"
+        "paddsh $f28, $f28, $f6                 \r\n"
+        "paddsh $f28, $f28, $f16                \r\n"
+        "paddsh $f20, $f12, $f14                \r\n"
+        "pmullh $f20, $f20, %[ff_pw_20]         \r\n"
+        "paddsh $f22, $f10, $f16                \r\n"
+        "pmullh $f22, $f22, %[ff_pw_5]          \r\n"
+        "psubsh $f30, $f20, $f22                \r\n"
+        "paddsh $f30, $f30, $f8                 \r\n"
+        "paddsh $f30, $f30, $f18                \r\n"
+        "paddsh $f24, $f24, %[ff_pw_16]         \r\n"
+        "paddsh $f26, $f26, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "paddsh $f30, $f30, %[ff_pw_16]         \r\n"
+        "psrah $f24, $f24, %[ff_pw_5]           \r\n"
+        "psrah $f26, $f26, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "psrah $f30, $f30, %[ff_pw_5]           \r\n"
+        "packushb $f24, $f24, $f0               \r\n"
+        "packushb $f26, $f26, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "packushb $f30, $f30, $f0               \r\n"
+        "swc1 $f24, 0(%[dst0])                  \r\n"
+        "swc1 $f26, 0(%[dst1])                  \r\n"
+        "swc1 $f28, 0(%[dst2])                  \r\n"
+        "swc1 $f30, 0(%[dst3])                  \r\n"
+        ::[dst0]"r"(dst),               [dst1]"r"(dst+dstStride),
+          [dst2]"r"(dst+2*dstStride),   [dst3]"r"(dst+3*dstStride),
+          [srcB]"r"(src-2*srcStride),   [srcA]"r"(src-srcStride),
+          [src0]"r"(src),               [src1]"r"(src+srcStride),
+          [src2]"r"(src+2*srcStride),   [src3]"r"(src+3*srcStride),
+          [src4]"r"(src+4*srcStride),   [src5]"r"(src+5*srcStride),
+          [src6]"r"(src+6*srcStride),   [ff_pw_20]"f"(ff_pw_20),
+          [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
+        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18",
+          "$f20","$f22","$f24","$f26","$f28","$f30"
+    );
+}
+
+static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    __asm__ volatile (
+        "xor $f0, $f0, $f0                      \r\n"
+        "gsldlc1 $f2, 7(%[srcB])                \r\n"
+        "gsldrc1 $f2, 0(%[srcB])                \r\n"
+        "gsldlc1 $f4, 7(%[srcA])                \r\n"
+        "gsldrc1 $f4, 0(%[srcA])                \r\n"
+        "gsldlc1 $f6, 7(%[src0])                \r\n"
+        "gsldrc1 $f6, 0(%[src0])                \r\n"
+        "gsldlc1 $f8, 7(%[src1])                \r\n"
+        "gsldrc1 $f8, 0(%[src1])                \r\n"
+        "gsldlc1 $f10, 7(%[src2])               \r\n"
+        "gsldrc1 $f10, 0(%[src2])               \r\n"
+        "gsldlc1 $f12, 7(%[src3])               \r\n"
+        "gsldrc1 $f12, 0(%[src3])               \r\n"
+        "gsldlc1 $f14, 7(%[src4])               \r\n"
+        "gsldrc1 $f14, 0(%[src4])               \r\n"
+        "gsldlc1 $f16, 7(%[src5])               \r\n"
+        "gsldrc1 $f16, 0(%[src5])               \r\n"
+        "gsldlc1 $f18, 7(%[src6])               \r\n"
+        "gsldrc1 $f18, 0(%[src6])               \r\n"
+        "gsldlc1 $f20, 7(%[src7])               \r\n"
+        "gsldrc1 $f20, 0(%[src7])               \r\n"
+        "gsldlc1 $f22, 7(%[src8])               \r\n"
+        "gsldrc1 $f22, 0(%[src8])               \r\n"
+        "gsldlc1 $f24, 7(%[src9])               \r\n"
+        "gsldrc1 $f24, 0(%[src9])               \r\n"
+        "gsldlc1 $f26, 7(%[src10])              \r\n"
+        "gsldrc1 $f26, 0(%[src10])              \r\n"
+        "punpcklbh $f1, $f2, $f0                \r\n"
+        "punpckhbh $f2, $f2, $f0                \r\n"
+        "punpcklbh $f3, $f4, $f0                \r\n"
+        "punpckhbh $f4, $f4, $f0                \r\n"
+        "punpcklbh $f5, $f6, $f0                \r\n"
+        "punpckhbh $f6, $f6, $f0                \r\n"
+        "punpcklbh $f7, $f8, $f0                \r\n"
+        "punpckhbh $f8, $f8, $f0                \r\n"
+        "punpcklbh $f9, $f10, $f0               \r\n"
+        "punpckhbh $f10, $f10, $f0              \r\n"
+        "punpcklbh $f11, $f12, $f0              \r\n"
+        "punpckhbh $f12, $f12, $f0              \r\n"
+        "punpcklbh $f13, $f14, $f0              \r\n"
+        "punpckhbh $f14, $f14, $f0              \r\n"
+        "punpcklbh $f15, $f16, $f0              \r\n"
+        "punpckhbh $f16, $f16, $f0              \r\n"
+        "punpcklbh $f17, $f18, $f0              \r\n"
+        "punpckhbh $f18, $f18, $f0              \r\n"
+        "punpcklbh $f19, $f20, $f0              \r\n"
+        "punpckhbh $f20, $f20, $f0              \r\n"
+        "punpcklbh $f21, $f22, $f0              \r\n"
+        "punpckhbh $f22, $f22, $f0              \r\n"
+        "punpcklbh $f23, $f24, $f0              \r\n"
+        "punpckhbh $f24, $f24, $f0              \r\n"
+        "punpcklbh $f25, $f26, $f0              \r\n"
+        "punpckhbh $f26, $f26, $f0              \r\n"
+        "paddsh $f27, $f5, $f7                  \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f6, $f8                  \r\n"//src0+src1
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f3                 \r\n"
+        "psubsh $f28, $f28, $f4                 \r\n"
+        "psubsh $f27, $f27, $f9                 \r\n"
+        "psubsh $f28, $f28, $f10                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f1                 \r\n"
+        "paddsh $f28, $f28, $f2                 \r\n"
+        "paddsh $f27, $f27, $f11                \r\n"
+        "paddsh $f28, $f28, $f12                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f2, $f27, $f28              \r\n"
+        "sdc1 $f2, 0(%[dst0])                   \r\n"
+        "paddsh $f27, $f7, $f9                  \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f8, $f10                 \r\n"//src1+src2
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f5                 \r\n"
+        "psubsh $f28, $f28, $f6                 \r\n"
+        "psubsh $f27, $f27, $f11                \r\n"
+        "psubsh $f28, $f28, $f12                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f3                 \r\n"
+        "paddsh $f28, $f28, $f4                 \r\n"
+        "paddsh $f27, $f27, $f13                \r\n"
+        "paddsh $f28, $f28, $f14                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f4, $f27, $f28              \r\n"
+        "sdc1 $f4, 0(%[dst1])                   \r\n"
+        "paddsh $f27, $f9, $f11                 \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f10, $f12                \r\n"//src2+src3
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f7                 \r\n"
+        "psubsh $f28, $f28, $f8                 \r\n"
+        "psubsh $f27, $f27, $f13                \r\n"
+        "psubsh $f28, $f28, $f14                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f5                 \r\n"
+        "paddsh $f28, $f28, $f6                 \r\n"
+        "paddsh $f27, $f27, $f15                \r\n"
+        "paddsh $f28, $f28, $f16                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f6, $f27, $f28              \r\n"
+        "sdc1 $f6, 0(%[dst2])                   \r\n"
+        "paddsh $f27, $f11, $f13                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f12, $f14                \r\n"//src3+src4
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f9                 \r\n"
+        "psubsh $f28, $f28, $f10                \r\n"
+        "psubsh $f27, $f27, $f15                \r\n"
+        "psubsh $f28, $f28, $f16                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f7                 \r\n"
+        "paddsh $f28, $f28, $f8                 \r\n"
+        "paddsh $f27, $f27, $f17                \r\n"
+        "paddsh $f28, $f28, $f18                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f8, $f27, $f28              \r\n"
+        "sdc1 $f8, 0(%[dst3])                   \r\n"
+        "paddsh $f27, $f13, $f15                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f14, $f16                \r\n"//src4+src5
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f11                \r\n"
+        "psubsh $f28, $f28, $f12                \r\n"
+        "psubsh $f27, $f27, $f17                \r\n"
+        "psubsh $f28, $f28, $f18                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f9                 \r\n"
+        "paddsh $f28, $f28, $f10                \r\n"
+        "paddsh $f27, $f27, $f19                \r\n"
+        "paddsh $f28, $f28, $f20                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f10, $f27, $f28             \r\n"
+        "sdc1 $f10, 0(%[dst4])                  \r\n"
+
+        "paddsh $f27, $f15, $f17                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f16, $f18                \r\n"//src5+src6
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f13                \r\n"
+        "psubsh $f28, $f28, $f14                \r\n"
+        "psubsh $f27, $f27, $f19                \r\n"
+        "psubsh $f28, $f28, $f20                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f11                \r\n"
+        "paddsh $f28, $f28, $f12                \r\n"
+        "paddsh $f27, $f27, $f21                \r\n"
+        "paddsh $f28, $f28, $f22                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f12, $f27, $f28             \r\n"
+        "sdc1 $f12, 0(%[dst5])                  \r\n"
+        "paddsh $f27, $f17, $f19                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f18, $f20                \r\n"//src6+src7
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f15                \r\n"
+        "psubsh $f28, $f28, $f16                \r\n"
+        "psubsh $f27, $f27, $f21                \r\n"
+        "psubsh $f28, $f28, $f22                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f13                \r\n"
+        "paddsh $f28, $f28, $f14                \r\n"
+        "paddsh $f27, $f27, $f23                \r\n"
+        "paddsh $f28, $f28, $f24                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f14, $f27, $f28             \r\n"
+        "sdc1 $f14, 0(%[dst6])                  \r\n"
+        "paddsh $f27, $f19, $f21                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f20, $f22                \r\n"//src7+src8
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f17                \r\n"
+        "psubsh $f28, $f28, $f18                \r\n"
+        "psubsh $f27, $f27, $f23                \r\n"
+        "psubsh $f28, $f28, $f24                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f15                \r\n"
+        "paddsh $f28, $f28, $f16                \r\n"
+        "paddsh $f27, $f27, $f25                \r\n"
+        "paddsh $f28, $f28, $f26                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f16, $f27, $f28             \r\n"
+        "sdc1 $f16, 0(%[dst7])                  \r\n"
+        ::[dst0]"r"(dst),               [dst1]"r"(dst+dstStride),
+          [dst2]"r"(dst+2*dstStride),   [dst3]"r"(dst+3*dstStride),
+          [dst4]"r"(dst+4*dstStride),   [dst5]"r"(dst+5*dstStride),
+          [dst6]"r"(dst+6*dstStride),   [dst7]"r"(dst+7*dstStride),
+          [srcB]"r"(src-2*srcStride),   [srcA]"r"(src-srcStride),
+          [src0]"r"(src),               [src1]"r"(src+srcStride),
+          [src2]"r"(src+2*srcStride),   [src3]"r"(src+3*srcStride),
+          [src4]"r"(src+4*srcStride),   [src5]"r"(src+5*srcStride),
+          [src6]"r"(src+6*srcStride),   [src7]"r"(src+7*srcStride),
+          [src8]"r"(src+8*srcStride),   [src9]"r"(src+9*srcStride),
+          [src10]"r"(src+10*srcStride), [ff_pw_4]"f"(ff_pw_4),
+          [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
+        : "$f0","$f1","$f2","$f3","$f4","$f5","$f6","$f7","$f8","$f9","$f10",
+          "$f11","$f12","$f13","$f14","$f15","$f16","$f17","$f18","$f19",
+          "$f20","$f21","$f22","$f23","$f24","$f25","$f26","$f27","$f28"
+    );
+}
+
+static void put_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    put_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    put_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+}
+
+static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    __asm__ volatile (
+        "xor $f0, $f0, $f0                      \r\n"
+        "gslwlc1 $f2, 3(%[srcB])                \r\n"
+        "gslwrc1 $f2, 0(%[srcB])                \r\n"
+        "gslwlc1 $f4, 3(%[srcA])                \r\n"
+        "gslwrc1 $f4, 0(%[srcA])                \r\n"
+        "gslwlc1 $f6, 3(%[src0])                \r\n"
+        "gslwrc1 $f6, 0(%[src0])                \r\n"
+        "gslwlc1 $f8, 3(%[src1])                \r\n"
+        "gslwrc1 $f8, 0(%[src1])                \r\n"
+        "gslwlc1 $f10, 3(%[src2])               \r\n"
+        "gslwrc1 $f10, 0(%[src2])               \r\n"
+        "gslwlc1 $f12, 3(%[src3])               \r\n"
+        "gslwrc1 $f12, 0(%[src3])               \r\n"
+        "gslwlc1 $f14, 3(%[src4])               \r\n"
+        "gslwrc1 $f14, 0(%[src4])               \r\n"
+        "gslwlc1 $f16, 3(%[src5])               \r\n"
+        "gslwrc1 $f16, 0(%[src5])               \r\n"
+        "gslwlc1 $f18, 3(%[src6])               \r\n"
+        "gslwrc1 $f18, 0(%[src6])               \r\n"
+        "punpcklbh $f2, $f2, $f0                \r\n"
+        "punpcklbh $f4, $f4, $f0                \r\n"
+        "punpcklbh $f6, $f6, $f0                \r\n"
+        "punpcklbh $f8, $f8, $f0                \r\n"
+        "punpcklbh $f10, $f10, $f0              \r\n"
+        "punpcklbh $f12, $f12, $f0              \r\n"
+        "punpcklbh $f14, $f14, $f0              \r\n"
+        "punpcklbh $f16, $f16, $f0              \r\n"
+        "punpcklbh $f18, $f18, $f0              \r\n"
+        "paddsh $f20, $f6, $f8                  \r\n"
+        "pmullh $f20, $f20, %[ff_pw_20]         \r\n"
+        "paddsh $f22, $f4, $f10                 \r\n"
+        "pmullh $f22, $f22, %[ff_pw_5]          \r\n"
+        "psubsh $f24, $f20, $f22                \r\n"
+        "paddsh $f24, $f24, $f2                 \r\n"
+        "paddsh $f24, $f24, $f12                \r\n"
+        "paddsh $f20, $f8, $f10                 \r\n"
+        "pmullh $f20, $f20, %[ff_pw_20]         \r\n"
+        "paddsh $f22, $f6, $f12                 \r\n"
+        "pmullh $f22, $f22, %[ff_pw_5]          \r\n"
+        "psubsh $f26, $f20, $f22                \r\n"
+        "paddsh $f26, $f26, $f4                 \r\n"
+        "paddsh $f26, $f26, $f14                \r\n"
+        "paddsh $f20, $f10, $f12                \r\n"
+        "pmullh $f20, $f20, %[ff_pw_20]         \r\n"
+        "paddsh $f22, $f8, $f14                 \r\n"
+        "pmullh $f22, $f22, %[ff_pw_5]          \r\n"
+        "psubsh $f28, $f20, $f22                \r\n"
+        "paddsh $f28, $f28, $f6                 \r\n"
+        "paddsh $f28, $f28, $f16                \r\n"
+        "paddsh $f20, $f12, $f14                \r\n"
+        "pmullh $f20, $f20, %[ff_pw_20]         \r\n"
+        "paddsh $f22, $f10, $f16                \r\n"
+        "pmullh $f22, $f22, %[ff_pw_5]          \r\n"
+        "psubsh $f30, $f20, $f22                \r\n"
+        "paddsh $f30, $f30, $f8                 \r\n"
+        "paddsh $f30, $f30, $f18                \r\n"
+        "paddsh $f24, $f24, %[ff_pw_16]         \r\n"
+        "paddsh $f26, $f26, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "paddsh $f30, $f30, %[ff_pw_16]         \r\n"
+        "psrah $f24, $f24, %[ff_pw_5]           \r\n"
+        "psrah $f26, $f26, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "psrah $f30, $f30, %[ff_pw_5]           \r\n"
+        "packushb $f24, $f24, $f0               \r\n"
+        "packushb $f26, $f26, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "packushb $f30, $f30, $f0               \r\n"
+        "lwc1 $f2, 0(%[dst0])                   \r\n"
+        "lwc1 $f4, 0(%[dst1])                   \r\n"
+        "lwc1 $f6, 0(%[dst2])                   \r\n"
+        "lwc1 $f8, 0(%[dst3])                   \r\n"
+        "pavgb $f24, $f2, $f24                  \r\n"
+        "pavgb $f26, $f4, $f26                  \r\n"
+        "pavgb $f28, $f6, $f28                  \r\n"
+        "pavgb $f30, $f8, $f30                  \r\n"
+        "swc1 $f24, 0(%[dst0])                  \r\n"
+        "swc1 $f26, 0(%[dst1])                  \r\n"
+        "swc1 $f28, 0(%[dst2])                  \r\n"
+        "swc1 $f30, 0(%[dst3])                  \r\n"
+        ::[dst0]"r"(dst),               [dst1]"r"(dst+dstStride),
+          [dst2]"r"(dst+2*dstStride),   [dst3]"r"(dst+3*dstStride),
+          [srcB]"r"(src-2*srcStride),   [srcA]"r"(src-srcStride),
+          [src0]"r"(src),               [src1]"r"(src+srcStride),
+          [src2]"r"(src+2*srcStride),   [src3]"r"(src+3*srcStride),
+          [src4]"r"(src+4*srcStride),   [src5]"r"(src+5*srcStride),
+          [src6]"r"(src+6*srcStride),   [ff_pw_20]"f"(ff_pw_20),
+          [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
+        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18",
+          "$f20","$f22","$f24","$f26","$f28","$f30"
+    );
+}
+
+static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    __asm__ volatile (
+        "xor $f0, $f0, $f0                      \r\n"
+        "gsldlc1 $f2, 7(%[srcB])                \r\n"
+        "gsldrc1 $f2, 0(%[srcB])                \r\n"
+        "gsldlc1 $f4, 7(%[srcA])                \r\n"
+        "gsldrc1 $f4, 0(%[srcA])                \r\n"
+        "gsldlc1 $f6, 7(%[src0])                \r\n"
+        "gsldrc1 $f6, 0(%[src0])                \r\n"
+        "gsldlc1 $f8, 7(%[src1])                \r\n"
+        "gsldrc1 $f8, 0(%[src1])                \r\n"
+        "gsldlc1 $f10, 7(%[src2])               \r\n"
+        "gsldrc1 $f10, 0(%[src2])               \r\n"
+        "gsldlc1 $f12, 7(%[src3])               \r\n"
+        "gsldrc1 $f12, 0(%[src3])               \r\n"
+        "gsldlc1 $f14, 7(%[src4])               \r\n"
+        "gsldrc1 $f14, 0(%[src4])               \r\n"
+        "gsldlc1 $f16, 7(%[src5])               \r\n"
+        "gsldrc1 $f16, 0(%[src5])               \r\n"
+        "gsldlc1 $f18, 7(%[src6])               \r\n"
+        "gsldrc1 $f18, 0(%[src6])               \r\n"
+        "gsldlc1 $f20, 7(%[src7])               \r\n"
+        "gsldrc1 $f20, 0(%[src7])               \r\n"
+        "gsldlc1 $f22, 7(%[src8])               \r\n"
+        "gsldrc1 $f22, 0(%[src8])               \r\n"
+        "gsldlc1 $f24, 7(%[src9])               \r\n"
+        "gsldrc1 $f24, 0(%[src9])               \r\n"
+        "gsldlc1 $f26, 7(%[src10])              \r\n"
+        "gsldrc1 $f26, 0(%[src10])              \r\n"
+        "punpcklbh $f1, $f2, $f0                \r\n"
+        "punpckhbh $f2, $f2, $f0                \r\n"
+        "punpcklbh $f3, $f4, $f0                \r\n"
+        "punpckhbh $f4, $f4, $f0                \r\n"
+        "punpcklbh $f5, $f6, $f0                \r\n"
+        "punpckhbh $f6, $f6, $f0                \r\n"
+        "punpcklbh $f7, $f8, $f0                \r\n"
+        "punpckhbh $f8, $f8, $f0                \r\n"
+        "punpcklbh $f9, $f10, $f0               \r\n"
+        "punpckhbh $f10, $f10, $f0              \r\n"
+        "punpcklbh $f11, $f12, $f0              \r\n"
+        "punpckhbh $f12, $f12, $f0              \r\n"
+        "punpcklbh $f13, $f14, $f0              \r\n"
+        "punpckhbh $f14, $f14, $f0              \r\n"
+        "punpcklbh $f15, $f16, $f0              \r\n"
+        "punpckhbh $f16, $f16, $f0              \r\n"
+        "punpcklbh $f17, $f18, $f0              \r\n"
+        "punpckhbh $f18, $f18, $f0              \r\n"
+        "punpcklbh $f19, $f20, $f0              \r\n"
+        "punpckhbh $f20, $f20, $f0              \r\n"
+        "punpcklbh $f21, $f22, $f0              \r\n"
+        "punpckhbh $f22, $f22, $f0              \r\n"
+        "punpcklbh $f23, $f24, $f0              \r\n"
+        "punpckhbh $f24, $f24, $f0              \r\n"
+        "punpcklbh $f25, $f26, $f0              \r\n"
+        "punpckhbh $f26, $f26, $f0              \r\n"
+        "paddsh $f27, $f5, $f7                  \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f6, $f8                  \r\n"//src0+src1
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f3                 \r\n"
+        "psubsh $f28, $f28, $f4                 \r\n"
+        "psubsh $f27, $f27, $f9                 \r\n"
+        "psubsh $f28, $f28, $f10                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f1                 \r\n"
+        "paddsh $f28, $f28, $f2                 \r\n"
+        "paddsh $f27, $f27, $f11                \r\n"
+        "paddsh $f28, $f28, $f12                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f2, $f27, $f28              \r\n"
+        "ldc1 $f28, 0(%[dst0])                  \r\n"
+        "pavgb $f2, $f2, $f28                   \r\n"
+        "sdc1 $f2, 0(%[dst0])                   \r\n"
+        "paddsh $f27, $f7, $f9                  \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f8, $f10                 \r\n"//src1+src2
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f5                 \r\n"
+        "psubsh $f28, $f28, $f6                 \r\n"
+        "psubsh $f27, $f27, $f11                \r\n"
+        "psubsh $f28, $f28, $f12                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f3                 \r\n"
+        "paddsh $f28, $f28, $f4                 \r\n"
+        "paddsh $f27, $f27, $f13                \r\n"
+        "paddsh $f28, $f28, $f14                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f4, $f27, $f28              \r\n"
+        "ldc1 $f28, 0(%[dst1])                  \r\n"
+        "pavgb $f4, $f4, $f28                   \r\n"
+        "sdc1 $f4, 0(%[dst1])                   \r\n"
+        "paddsh $f27, $f9, $f11                 \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f10, $f12                \r\n"//src2+src3
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f7                 \r\n"
+        "psubsh $f28, $f28, $f8                 \r\n"
+        "psubsh $f27, $f27, $f13                \r\n"
+        "psubsh $f28, $f28, $f14                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f5                 \r\n"
+        "paddsh $f28, $f28, $f6                 \r\n"
+        "paddsh $f27, $f27, $f15                \r\n"
+        "paddsh $f28, $f28, $f16                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f6, $f27, $f28              \r\n"
+        "ldc1 $f28, 0(%[dst2])                  \r\n"
+        "pavgb $f6, $f6, $f28                   \r\n"
+        "sdc1 $f6, 0(%[dst2])                   \r\n"
+        "paddsh $f27, $f11, $f13                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f12, $f14                \r\n"//src3+src4
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f9                 \r\n"
+        "psubsh $f28, $f28, $f10                \r\n"
+        "psubsh $f27, $f27, $f15                \r\n"
+        "psubsh $f28, $f28, $f16                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f7                 \r\n"
+        "paddsh $f28, $f28, $f8                 \r\n"
+        "paddsh $f27, $f27, $f17                \r\n"
+        "paddsh $f28, $f28, $f18                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f8, $f27, $f28              \r\n"
+        "ldc1 $f28, 0(%[dst3])                  \r\n"
+        "pavgb $f8, $f8, $f28                   \r\n"
+        "sdc1 $f8, 0(%[dst3])                   \r\n"
+        "paddsh $f27, $f13, $f15                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f14, $f16                \r\n"//src4+src5
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f11                \r\n"
+        "psubsh $f28, $f28, $f12                \r\n"
+        "psubsh $f27, $f27, $f17                \r\n"
+        "psubsh $f28, $f28, $f18                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f9                 \r\n"
+        "paddsh $f28, $f28, $f10                \r\n"
+        "paddsh $f27, $f27, $f19                \r\n"
+        "paddsh $f28, $f28, $f20                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f10, $f27, $f28             \r\n"
+        "ldc1 $f28, 0(%[dst4])                  \r\n"
+        "pavgb $f10, $f10, $f28                 \r\n"
+        "sdc1 $f10, 0(%[dst4])                  \r\n"
+        "paddsh $f27, $f15, $f17                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f16, $f18                \r\n"//src5+src6
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f13                \r\n"
+        "psubsh $f28, $f28, $f14                \r\n"
+        "psubsh $f27, $f27, $f19                \r\n"
+        "psubsh $f28, $f28, $f20                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f11                \r\n"
+        "paddsh $f28, $f28, $f12                \r\n"
+        "paddsh $f27, $f27, $f21                \r\n"
+        "paddsh $f28, $f28, $f22                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f12, $f27, $f28             \r\n"
+        "ldc1 $f28, 0(%[dst5])                  \r\n"
+        "pavgb $f12, $f12, $f28                 \r\n"
+        "sdc1 $f12, 0(%[dst5])                  \r\n"
+        "paddsh $f27, $f17, $f19                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f18, $f20                \r\n"//src6+src7
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f15                \r\n"
+        "psubsh $f28, $f28, $f16                \r\n"
+        "psubsh $f27, $f27, $f21                \r\n"
+        "psubsh $f28, $f28, $f22                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f13                \r\n"
+        "paddsh $f28, $f28, $f14                \r\n"
+        "paddsh $f27, $f27, $f23                \r\n"
+        "paddsh $f28, $f28, $f24                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f14, $f27, $f28             \r\n"
+        "ldc1 $f28, 0(%[dst6])                  \r\n"
+        "pavgb $f14, $f14, $f28                 \r\n"
+        "sdc1 $f14, 0(%[dst6])                  \r\n"
+        "paddsh $f27, $f19, $f21                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_4]          \r\n"
+        "paddsh $f28, $f20, $f22                \r\n"//src7+src8
+        "pmullh $f28, $f28, %[ff_pw_4]          \r\n"
+        "psubsh $f27, $f27, $f17                \r\n"
+        "psubsh $f28, $f28, $f18                \r\n"
+        "psubsh $f27, $f27, $f23                \r\n"
+        "psubsh $f28, $f28, $f24                \r\n"
+        "pmullh $f27, $f27, %[ff_pw_5]          \r\n"
+        "pmullh $f28, $f28, %[ff_pw_5]          \r\n"
+        "paddsh $f27, $f27, $f15                \r\n"
+        "paddsh $f28, $f28, $f16                \r\n"
+        "paddsh $f27, $f27, $f25                \r\n"
+        "paddsh $f28, $f28, $f26                \r\n"
+        "paddsh $f27, $f27, %[ff_pw_16]         \r\n"
+        "paddsh $f28, $f28, %[ff_pw_16]         \r\n"
+        "psrah $f27, $f27, %[ff_pw_5]           \r\n"
+        "psrah $f28, $f28, %[ff_pw_5]           \r\n"
+        "packushb $f27, $f27, $f0               \r\n"
+        "packushb $f28, $f28, $f0               \r\n"
+        "punpcklwd $f16, $f27, $f28             \r\n"
+        "ldc1 $f28, 0(%[dst7])                  \r\n"
+        "pavgb $f16, $f16, $f28                 \r\n"
+        "sdc1 $f16, 0(%[dst7])                  \r\n"
+        ::[dst0]"r"(dst),               [dst1]"r"(dst+dstStride),
+          [dst2]"r"(dst+2*dstStride),   [dst3]"r"(dst+3*dstStride),
+          [dst4]"r"(dst+4*dstStride),   [dst5]"r"(dst+5*dstStride),
+          [dst6]"r"(dst+6*dstStride),   [dst7]"r"(dst+7*dstStride),
+          [srcB]"r"(src-2*srcStride),   [srcA]"r"(src-srcStride),
+          [src0]"r"(src),               [src1]"r"(src+srcStride),
+          [src2]"r"(src+2*srcStride),   [src3]"r"(src+3*srcStride),
+          [src4]"r"(src+4*srcStride),   [src5]"r"(src+5*srcStride),
+          [src6]"r"(src+6*srcStride),   [src7]"r"(src+7*srcStride),
+          [src8]"r"(src+8*srcStride),   [src9]"r"(src+9*srcStride),
+          [src10]"r"(src+10*srcStride), [ff_pw_4]"f"(ff_pw_4),
+          [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
+        : "$f0","$f1","$f2","$f3","$f4","$f5","$f6","$f7","$f8","$f9","$f10",
+          "$f11","$f12","$f13","$f14","$f15","$f16","$f17","$f18","$f19",
+          "$f20","$f21","$f22","$f23","$f24","$f25","$f26","$f27","$f28"
+    );
+}
+
+static void avg_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    avg_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    avg_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+}
+
+static void put_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    int i;
+    int16_t _tmp[36];
+    int16_t *tmp = _tmp;
+    src -= 2*srcStride;
+    __asm__ volatile (
+        "xor $f0, $f0, $f0                      \r\n"
+        "dli $8, 9                              \r\n"
+        "1:                                     \r\n"
+        "gslwlc1 $f2, 1(%[src])                 \r\n"
+        "gslwrc1 $f2, -2(%[src])                \r\n"
+        "gslwlc1 $f4, 2(%[src])                 \r\n"
+        "gslwrc1 $f4, -1(%[src])                \r\n"
+        "gslwlc1 $f6, 3(%[src])                 \r\n"
+        "gslwrc1 $f6, 0(%[src])                 \r\n"
+        "gslwlc1 $f8, 4(%[src])                 \r\n"
+        "gslwrc1 $f8, 1(%[src])                 \r\n"
+        "gslwlc1 $f10, 5(%[src])                \r\n"
+        "gslwrc1 $f10, 2(%[src])                \r\n"
+        "gslwlc1 $f12, 6(%[src])                \r\n"
+        "gslwrc1 $f12, 3(%[src])                \r\n"
+        "punpcklbh $f2, $f2, $f0                \r\n"
+        "punpcklbh $f4, $f4, $f0                \r\n"
+        "punpcklbh $f6, $f6, $f0                \r\n"
+        "punpcklbh $f8, $f8, $f0                \r\n"
+        "punpcklbh $f10, $f10, $f0              \r\n"
+        "punpcklbh $f12, $f12, $f0              \r\n"
+        "paddsh $f14, $f6, $f8                  \r\n"
+        "paddsh $f16, $f4, $f10                 \r\n"
+        "paddsh $f18, $f2, $f12                 \r\n"
+        "pmullh $f14, $f14, %[ff_pw_20]         \r\n"
+        "pmullh $f16, $f16, %[ff_pw_5]          \r\n"
+        "psubsh $f14, $f14, $f16                \r\n"
+        "paddsh $f18, $f14, $f18                \r\n"
+        "sdc1 $f18, 0(%[tmp])                   \r\n"
+        "dadd %[tmp], %[tmp], %[tmpStride]      \r\n"
+        "dadd %[src], %[src], %[srcStride]      \r\n"
+        "daddi $8, $8, -1                       \r\n"
+        "bnez $8, 1b                            \r\n"
+        : [tmp]"+&r"(tmp),[src]"+&r"(src)
+        : [tmpStride]"r"(8),[srcStride]"r"(srcStride),
+          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5)
+        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18"
+    );
+
+    tmp -= 28;
+
+    for(i=0; i<4; i++) {
+        const int16_t tmpB= tmp[-8];
+        const int16_t tmpA= tmp[-4];
+        const int16_t tmp0= tmp[ 0];
+        const int16_t tmp1= tmp[ 4];
+        const int16_t tmp2= tmp[ 8];
+        const int16_t tmp3= tmp[12];
+        const int16_t tmp4= tmp[16];
+        const int16_t tmp5= tmp[20];
+        const int16_t tmp6= tmp[24];
+        op2_put(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
+        op2_put(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
+        op2_put(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
+        op2_put(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
+        dst++;
+        tmp++;
+    }
+}
+
+static void put_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    int16_t _tmp[104];
+    int16_t *tmp = _tmp;
+    int i;
+    src -= 2*srcStride;
+
+    __asm__ volatile (
+        "xor $f0, $f0, $f0                      \r\n"
+        "dli $8, 13                             \r\n"
+        "1:                                     \r\n"
+        "gsldlc1 $f2, 5(%[src])                 \r\n"
+        "gsldrc1 $f2, -2(%[src])                \r\n"
+        "gsldlc1 $f4, 6(%[src])                 \r\n"
+        "gsldrc1 $f4, -1(%[src])                \r\n"
+        "gsldlc1 $f6, 7(%[src])                 \r\n"
+        "gsldrc1 $f6, 0(%[src])                 \r\n"
+        "gsldlc1 $f8, 8(%[src])                 \r\n"
+        "gsldrc1 $f8, 1(%[src])                 \r\n"
+        "gsldlc1 $f10, 9(%[src])                \r\n"
+        "gsldrc1 $f10, 2(%[src])                \r\n"
+        "gsldlc1 $f12, 10(%[src])               \r\n"
+        "gsldrc1 $f12, 3(%[src])                \r\n"
+        "punpcklbh $f1, $f2, $f0                \r\n"
+        "punpcklbh $f3, $f4, $f0                \r\n"
+        "punpcklbh $f5, $f6, $f0                \r\n"
+        "punpcklbh $f7, $f8, $f0                \r\n"
+        "punpcklbh $f9, $f10, $f0               \r\n"
+        "punpcklbh $f11, $f12, $f0              \r\n"
+        "punpckhbh $f2, $f2, $f0                \r\n"
+        "punpckhbh $f4, $f4, $f0                \r\n"
+        "punpckhbh $f6, $f6, $f0                \r\n"
+        "punpckhbh $f8, $f8, $f0                \r\n"
+        "punpckhbh $f10, $f10, $f0              \r\n"
+        "punpckhbh $f12, $f12, $f0              \r\n"
+        "paddsh $f13, $f5, $f7                  \r\n"
+        "paddsh $f15, $f3, $f9                 \r\n"
+        "paddsh $f17, $f1, $f11                 \r\n"
+        "pmullh $f13, $f13, %[ff_pw_20]         \r\n"
+        "pmullh $f15, $f15, %[ff_pw_5]          \r\n"
+        "psubsh $f13, $f13, $f15                \r\n"
+        "paddsh $f17, $f13, $f17                \r\n"
+        "paddsh $f14, $f6, $f8                  \r\n"
+        "paddsh $f16, $f4, $f10                 \r\n"
+        "paddsh $f18, $f2, $f12                 \r\n"
+        "pmullh $f14, $f14, %[ff_pw_20]         \r\n"
+        "pmullh $f16, $f16, %[ff_pw_5]          \r\n"
+        "psubsh $f14, $f14, $f16                \r\n"
+        "paddsh $f18, $f14, $f18                \r\n"
+        "sdc1 $f17, 0(%[tmp])                   \r\n"
+        "sdc1 $f18, 8(%[tmp])                   \r\n"
+        "dadd %[tmp], %[tmp], %[tmpStride]      \r\n"
+        "dadd %[src], %[src], %[srcStride]      \r\n"
+        "daddi $8, $8, -1                       \r\n"
+        "bnez $8, 1b                            \r\n"
+        : [tmp]"+&r"(tmp),[src]"+&r"(src)
+        : [tmpStride]"r"(16),[srcStride]"r"(srcStride),
+          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5)
+        : "$8","$f0","$f1","$f2","$f3","$f4","$f5","$f6","$f7","$f8","$f9",
+          "$f10","$f11","$f12","$f13","$f14","$f15","$f16","$f17","$f18"
+    );
+
+    tmp -= 88;
+
+    for(i=0; i<8; i++) {
+        const int tmpB= tmp[-16];
+        const int tmpA= tmp[ -8];
+        const int tmp0= tmp[  0];
+        const int tmp1= tmp[  8];
+        const int tmp2= tmp[ 16];
+        const int tmp3= tmp[ 24];
+        const int tmp4= tmp[ 32];
+        const int tmp5= tmp[ 40];
+        const int tmp6= tmp[ 48];
+        const int tmp7= tmp[ 56];
+        const int tmp8= tmp[ 64];
+        const int tmp9= tmp[ 72];
+        const int tmp10=tmp[ 80];
+        op2_put(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
+        op2_put(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
+        op2_put(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
+        op2_put(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
+        op2_put(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));
+        op2_put(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));
+        op2_put(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));
+        op2_put(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));
+        dst++;
+        tmp++;
+    }
+}
+
+static void put_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    put_h264_qpel8_hv_lowpass_mmi(dst, src, dstStride, srcStride);
+    put_h264_qpel8_hv_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    put_h264_qpel8_hv_lowpass_mmi(dst, src, dstStride, srcStride);
+    put_h264_qpel8_hv_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+}
+
+static void avg_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    int i;
+    int16_t _tmp[36];
+    int16_t *tmp = _tmp;
+    src -= 2*srcStride;
+
+    __asm__ volatile (
+        "xor $f0, $f0, $f0                      \r\n"
+        "dli $8, 9                              \r\n"
+        "1:                                     \r\n"
+        "gslwlc1 $f2, 1(%[src])                 \r\n"
+        "gslwrc1 $f2, -2(%[src])                \r\n"
+        "gslwlc1 $f4, 2(%[src])                 \r\n"
+        "gslwrc1 $f4, -1(%[src])                \r\n"
+        "gslwlc1 $f6, 3(%[src])                 \r\n"
+        "gslwrc1 $f6, 0(%[src])                 \r\n"
+        "gslwlc1 $f8, 4(%[src])                 \r\n"
+        "gslwrc1 $f8, 1(%[src])                 \r\n"
+        "gslwlc1 $f10, 5(%[src])                \r\n"
+        "gslwrc1 $f10, 2(%[src])                \r\n"
+        "gslwlc1 $f12, 6(%[src])                \r\n"
+        "gslwrc1 $f12, 3(%[src])                \r\n"
+        "punpcklbh $f2, $f2, $f0                \r\n"
+        "punpcklbh $f4, $f4, $f0                \r\n"
+        "punpcklbh $f6, $f6, $f0                \r\n"
+        "punpcklbh $f8, $f8, $f0                \r\n"
+        "punpcklbh $f10, $f10, $f0              \r\n"
+        "punpcklbh $f12, $f12, $f0              \r\n"
+        "paddsh $f14, $f6, $f8                  \r\n"
+        "paddsh $f16, $f4, $f10                 \r\n"
+        "paddsh $f18, $f2, $f12                 \r\n"
+        "pmullh $f14, $f14, %[ff_pw_20]         \r\n"
+        "pmullh $f16, $f16, %[ff_pw_5]          \r\n"
+        "psubsh $f14, $f14, $f16                \r\n"
+        "paddsh $f18, $f14, $f18                \r\n"
+        "sdc1 $f18, 0(%[tmp])                   \r\n"
+        "dadd %[tmp], %[tmp], %[tmpStride]      \r\n"
+        "dadd %[src], %[src], %[srcStride]      \r\n"
+        "daddi $8, $8, -1                       \r\n"
+        "bnez $8, 1b                            \r\n"
+        : [tmp]"+&r"(tmp),[src]"+&r"(src)
+        : [tmpStride]"r"(8),[srcStride]"r"(srcStride),
+          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5)
+        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18"
+    );
+
+    tmp -= 28;
+
+    for(i=0; i<4; i++)
+    {
+        const int16_t tmpB= tmp[-8];
+        const int16_t tmpA= tmp[-4];
+        const int16_t tmp0= tmp[ 0];
+        const int16_t tmp1= tmp[ 4];
+        const int16_t tmp2= tmp[ 8];
+        const int16_t tmp3= tmp[12];
+        const int16_t tmp4= tmp[16];
+        const int16_t tmp5= tmp[20];
+        const int16_t tmp6= tmp[24];
+        op2_avg(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
+        op2_avg(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
+        op2_avg(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
+        op2_avg(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
+        dst++;
+        tmp++;
+    }
+}
+
+static void avg_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    int16_t _tmp[104];
+    int16_t *tmp = _tmp;
+    int i;
+    src -= 2*srcStride;
+
+    __asm__ volatile (
+        "xor $f0, $f0, $f0                      \r\n"
+        "dli $8, 13                             \r\n"
+        "1:                                     \r\n"
+        "gsldlc1 $f2, 5(%[src])                 \r\n"
+        "gsldrc1 $f2, -2(%[src])                \r\n"
+        "gsldlc1 $f4, 6(%[src])                 \r\n"
+        "gsldrc1 $f4, -1(%[src])                \r\n"
+        "gsldlc1 $f6, 7(%[src])                 \r\n"
+        "gsldrc1 $f6, 0(%[src])                 \r\n"
+        "gsldlc1 $f8, 8(%[src])                 \r\n"
+        "gsldrc1 $f8, 1(%[src])                 \r\n"
+        "gsldlc1 $f10, 9(%[src])                \r\n"
+        "gsldrc1 $f10, 2(%[src])                \r\n"
+        "gsldlc1 $f12, 10(%[src])               \r\n"
+        "gsldrc1 $f12, 3(%[src])                \r\n"
+        "punpcklbh $f1, $f2, $f0                \r\n"
+        "punpcklbh $f3, $f4, $f0                \r\n"
+        "punpcklbh $f5, $f6, $f0                \r\n"
+        "punpcklbh $f7, $f8, $f0                \r\n"
+        "punpcklbh $f9, $f10, $f0               \r\n"
+        "punpcklbh $f11, $f12, $f0              \r\n"
+        "punpckhbh $f2, $f2, $f0                \r\n"
+        "punpckhbh $f4, $f4, $f0                \r\n"
+        "punpckhbh $f6, $f6, $f0                \r\n"
+        "punpckhbh $f8, $f8, $f0                \r\n"
+        "punpckhbh $f10, $f10, $f0              \r\n"
+        "punpckhbh $f12, $f12, $f0              \r\n"
+        "paddsh $f13, $f5, $f7                  \r\n"
+        "paddsh $f15, $f3, $f9                 \r\n"
+        "paddsh $f17, $f1, $f11                 \r\n"
+        "pmullh $f13, $f13, %[ff_pw_20]         \r\n"
+        "pmullh $f15, $f15, %[ff_pw_5]          \r\n"
+        "psubsh $f13, $f13, $f15                \r\n"
+        "paddsh $f17, $f13, $f17                \r\n"
+        "paddsh $f14, $f6, $f8                  \r\n"
+        "paddsh $f16, $f4, $f10                 \r\n"
+        "paddsh $f18, $f2, $f12                 \r\n"
+        "pmullh $f14, $f14, %[ff_pw_20]         \r\n"
+        "pmullh $f16, $f16, %[ff_pw_5]          \r\n"
+        "psubsh $f14, $f14, $f16                \r\n"
+        "paddsh $f18, $f14, $f18                \r\n"
+
+        "sdc1 $f17, 0(%[tmp])                   \r\n"
+        "sdc1 $f18, 8(%[tmp])                   \r\n"
+        "dadd %[tmp], %[tmp], %[tmpStride]      \r\n"
+        "dadd %[src], %[src], %[srcStride]      \r\n"
+        "daddi $8, $8, -1                       \r\n"
+        "bnez $8, 1b                            \r\n"
+        : [tmp]"+&r"(tmp),[src]"+&r"(src)
+        : [tmpStride]"r"(16),[srcStride]"r"(srcStride),
+          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5)
+        : "$8","$f0","$f1","$f2","$f3","$f4","$f5","$f6","$f7","$f8","$f9",
+          "$f10","$f11","$f12","$f13","$f14","$f15","$f16","$f17","$f18"
+    );
+
+    tmp -= 88;
+
+    for(i=0; i<8; i++) {
+        const int tmpB= tmp[-16];
+        const int tmpA= tmp[ -8];
+        const int tmp0= tmp[  0];
+        const int tmp1= tmp[  8];
+        const int tmp2= tmp[ 16];
+        const int tmp3= tmp[ 24];
+        const int tmp4= tmp[ 32];
+        const int tmp5= tmp[ 40];
+        const int tmp6= tmp[ 48];
+        const int tmp7= tmp[ 56];
+        const int tmp8= tmp[ 64];
+        const int tmp9= tmp[ 72];
+        const int tmp10=tmp[ 80];
+        op2_avg(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
+        op2_avg(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
+        op2_avg(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
+        op2_avg(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
+        op2_avg(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));
+        op2_avg(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));
+        op2_avg(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));
+        op2_avg(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));
+        dst++;
+        tmp++;
+    }
+}
+
+static void avg_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride){
+    avg_h264_qpel8_hv_lowpass_mmi(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_hv_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    avg_h264_qpel8_hv_lowpass_mmi(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_hv_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
+}
+
+//DEF_H264_MC_MMI(put_, 4)
+void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_pixels4_mmi(dst, src, stride, 4);
+}
+
+void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[16];
+    put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride);
+    put_pixels4_l2_mmi(dst, src, half, stride, stride, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_h264_qpel4_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[16];
+    put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride);
+    put_pixels4_l2_mmi(dst, src+1, half, stride, stride, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t half[16];
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4);
+    put_pixels4_l2_mmi(dst, full_mid, half, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(dst, full_mid, stride, 4);
+}
+
+void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t half[16];
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4);
+    put_pixels4_l2_mmi(dst, full_mid+4, half, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    put_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
+    copy_block4_mmi(full, src - stride*2 + 1, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    put_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    put_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
+    copy_block4_mmi(full, src - stride*2 + 1, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    put_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_h264_qpel4_hv_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[16];
+    uint8_t halfHV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    put_pixels4_l2_mmi(dst, halfH, halfHV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[16];
+    uint8_t halfHV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    put_pixels4_l2_mmi(dst, halfH, halfHV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfV[16];
+    uint8_t halfHV[16];
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    put_pixels4_l2_mmi(dst, halfV, halfHV, stride, 4, 4, 4);
+}
+
+void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfV[16];
+    uint8_t halfHV[16];
+    copy_block4_mmi(full, src - stride*2 + 1, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    put_pixels4_l2_mmi(dst, halfV, halfHV, stride, 4, 4, 4);
+}
+
+//DEF_H264_MC_MMI(avg_, 4)
+void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_pixels4_mmi(dst, src, stride, 4);
+}
+
+void ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[16];
+    put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride);
+    avg_pixels4_l2_mmi(dst, src, half, stride, stride, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_h264_qpel4_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[16];
+    put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride);
+    avg_pixels4_l2_mmi(dst, src+1, half, stride, stride, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t half[16];
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4);
+    avg_pixels4_l2_mmi(dst, full_mid, half, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    avg_h264_qpel4_v_lowpass_mmi(dst, full_mid, stride, 4);
+}
+
+void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t half[16];
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4);
+    avg_pixels4_l2_mmi(dst, full_mid+4, half, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    avg_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
+    copy_block4_mmi(full, src - stride*2 + 1, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    avg_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    avg_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfH[16];
+    uint8_t halfV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
+    copy_block4_mmi(full, src - stride*2 + 1, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    avg_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_h264_qpel4_hv_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[16];
+    uint8_t halfHV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    avg_pixels4_l2_mmi(dst, halfH, halfHV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[16];
+    uint8_t halfHV[16];
+    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    avg_pixels4_l2_mmi(dst, halfH, halfHV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfV[16];
+    uint8_t halfHV[16];
+    copy_block4_mmi(full, src - stride*2, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    avg_pixels4_l2_mmi(dst, halfV, halfHV, stride, 4, 4, 4);
+}
+
+void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[36];
+    uint8_t * const full_mid= full + 8;
+    uint8_t halfV[16];
+    uint8_t halfHV[16];
+    copy_block4_mmi(full, src - stride*2 + 1, 4,  stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
+    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
+    avg_pixels4_l2_mmi(dst, halfV, halfHV, stride, 4, 4, 4);
+}
+
+//DEF_H264_MC_MMI(put_, 8)
+void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_pixels8_mmi(dst, src, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[64];
+    put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride);
+    put_pixels8_l2_mmi(dst, src, half, stride, stride, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_h264_qpel8_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[64];
+    put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride);
+    put_pixels8_l2_mmi(dst, src+1, half, stride, stride, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t half[64];
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8);
+    put_pixels8_l2_mmi(dst, full_mid, half, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(dst, full_mid, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t half[64];
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8);
+    put_pixels8_l2_mmi(dst, full_mid+8, half, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    put_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
+    copy_block8_mmi(full, src - stride*2 + 1, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    put_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    put_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
+    copy_block8_mmi(full, src - stride*2 + 1, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    put_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_h264_qpel8_hv_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfHV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
+    put_pixels8_l2_mmi(dst, halfH, halfHV, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfHV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
+    put_pixels8_l2_mmi(dst, halfH, halfHV, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfV[64];
+    uint8_t halfHV[64];
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
+    put_pixels8_l2_mmi(dst, halfV, halfHV, stride, 8, 8, 8);
+}
+
+void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfV[64];
+    uint8_t halfHV[64];
+    copy_block8_mmi(full, src - stride*2 + 1, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
+    put_pixels8_l2_mmi(dst, halfV, halfHV, stride, 8, 8, 8);
+}
+
+//DEF_H264_MC_MMI(avg_, 8)
+void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_pixels8_mmi(dst, src, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[64];
+    put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride);
+    avg_pixels8_l2_mmi(dst, src, half, stride, stride, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_h264_qpel8_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[64];
+    put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride);
+    avg_pixels8_l2_mmi(dst, src+1, half, stride, stride, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t half[64];
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8);
+    avg_pixels8_l2_mmi(dst, full_mid, half, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    avg_h264_qpel8_v_lowpass_mmi(dst, full_mid, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t half[64];
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8);
+    avg_pixels8_l2_mmi(dst, full_mid+8, half, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    avg_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
+    copy_block8_mmi(full, src - stride*2 + 1, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    avg_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    avg_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
+    copy_block8_mmi(full, src - stride*2 + 1, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    avg_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_h264_qpel8_hv_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfHV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
+    avg_pixels8_l2_mmi(dst, halfH, halfHV, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfHV[64];
+    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
+    avg_pixels8_l2_mmi(dst, halfH, halfHV, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfV[64];
+    uint8_t halfHV[64];
+    copy_block8_mmi(full, src - stride*2, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
+    avg_pixels8_l2_mmi(dst, halfV, halfHV, stride, 8, 8, 8);
+}
+
+void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[104];
+    uint8_t * const full_mid= full + 16;
+    uint8_t halfV[64];
+    uint8_t halfHV[64];
+    copy_block8_mmi(full, src - stride*2 + 1, 8,  stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
+    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
+    avg_pixels8_l2_mmi(dst, halfV, halfHV, stride, 8, 8, 8);
+}
+
+//DEF_H264_MC_MMI(put_, 16)
+void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_pixels16_mmi(dst, src, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[256];
+    put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride);
+    put_pixels16_l2_mmi(dst, src, half, stride, stride, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_h264_qpel16_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[256];
+    put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride);
+    put_pixels16_l2_mmi(dst, src+1, half, stride, stride, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t half[256];
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16);
+    put_pixels16_l2_mmi(dst, full_mid, half, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(dst, full_mid, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t half[256];
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16);
+    put_pixels16_l2_mmi(dst, full_mid+16, half, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    put_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
+    copy_block16_mmi(full, src - stride*2 + 1, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    put_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    put_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
+    copy_block16_mmi(full, src - stride*2 + 1, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    put_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    put_h264_qpel16_hv_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[256];
+    uint8_t halfHV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
+    put_pixels16_l2_mmi(dst, halfH, halfHV, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[256];
+    uint8_t halfHV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
+    put_pixels16_l2_mmi(dst, halfH, halfHV, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfV[256];
+    uint8_t halfHV[256];
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
+    put_pixels16_l2_mmi(dst, halfV, halfHV, stride, 16, 16, 16);
+}
+
+void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfV[256];
+    uint8_t halfHV[256];
+    copy_block16_mmi(full, src - stride*2 + 1, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
+    put_pixels16_l2_mmi(dst, halfV, halfHV, stride, 16, 16, 16);
+}
+
+//DEF_H264_MC_MMI(avg_, 16)
+void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_pixels16_mmi(dst, src, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[256];
+    put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride);
+    avg_pixels16_l2_mmi(dst, src, half, stride, stride, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_h264_qpel16_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t half[256];
+    put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride);
+    avg_pixels16_l2_mmi(dst, src+1, half, stride, stride, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t half[256];
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16);
+    avg_pixels16_l2_mmi(dst, full_mid, half, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    avg_h264_qpel16_v_lowpass_mmi(dst, full_mid, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t half[256];
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16);
+    avg_pixels16_l2_mmi(dst, full_mid+16, half, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    avg_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
+    copy_block16_mmi(full, src - stride*2 + 1, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    avg_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    avg_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfH[256];
+    uint8_t halfV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
+    copy_block16_mmi(full, src - stride*2 + 1, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    avg_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    avg_h264_qpel16_hv_lowpass_mmi(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[256];
+    uint8_t halfHV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
+    avg_pixels16_l2_mmi(dst, halfH, halfHV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[256];
+    uint8_t halfHV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
+    avg_pixels16_l2_mmi(dst, halfH, halfHV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfV[256];
+    uint8_t halfHV[256];
+    copy_block16_mmi(full, src - stride*2, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
+    avg_pixels16_l2_mmi(dst, halfV, halfHV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfV[256];
+    uint8_t halfHV[256];
+    copy_block16_mmi(full, src - stride*2 + 1, 16,  stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
+    avg_pixels16_l2_mmi(dst, halfV, halfHV, stride, 16, 16, 16);
+}
+
+#undef op2_avg
+#undef op2_put
diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
new file mode 100644
index 0000000000..c38f1f7a42
--- /dev/null
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -0,0 +1,3600 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264dsp_mips.h"
+
+#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH(in0, in1, in2, in3, in4, in5)    \
+( {                                                                      \
+    v4i32 tmp0_m, tmp1_m;                                                \
+    v8i16 out0_m, out1_m, out2_m, out3_m;                                \
+    v8i16 minus5h_m = __msa_ldi_h(-5);                                   \
+    v8i16 plus20h_m = __msa_ldi_h(20);                                   \
+                                                                         \
+    ILVRL_H2_SW(in5, in0, tmp0_m, tmp1_m);                               \
+                                                                         \
+    tmp0_m = __msa_hadd_s_w((v8i16) tmp0_m, (v8i16) tmp0_m);             \
+    tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m);             \
+                                                                         \
+    ILVRL_H2_SH(in1, in4, out0_m, out1_m);                               \
+    DPADD_SH2_SW(out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m);  \
+    ILVRL_H2_SH(in2, in3, out2_m, out3_m);                               \
+    DPADD_SH2_SW(out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m);  \
+                                                                         \
+    SRARI_W2_SW(tmp0_m, tmp1_m, 10);                                     \
+    SAT_SW2_SW(tmp0_m, tmp1_m, 7);                                       \
+    out0_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp0_m);              \
+                                                                         \
+    out0_m;                                                              \
+} )
+
+#define AVC_HORZ_FILTER_SH(in, mask0, mask1, mask2)     \
+( {                                                     \
+    v8i16 out0_m, out1_m;                               \
+    v16i8 tmp0_m, tmp1_m;                               \
+    v16i8 minus5b = __msa_ldi_b(-5);                    \
+    v16i8 plus20b = __msa_ldi_b(20);                    \
+                                                        \
+    tmp0_m = __msa_vshf_b((v16i8) mask0, in, in);       \
+    out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m);            \
+                                                        \
+    tmp0_m = __msa_vshf_b((v16i8) mask1, in, in);       \
+    out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m);  \
+                                                        \
+    tmp1_m = __msa_vshf_b((v16i8) (mask2), in, in);     \
+    out1_m = __msa_dpadd_s_h(out0_m, plus20b, tmp1_m);  \
+                                                        \
+    out1_m;                                             \
+} )
+
+static const uint8_t luma_mask_arr[16 * 8] = {
+    /* 8 width cases */
+    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
+    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
+    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+
+    /* 4 width cases */
+    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
+    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
+    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
+
+    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
+    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
+};
+
+#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5,  \
+                                        out1, out2)                          \
+{                                                                            \
+    v16i8 tmp0_m, tmp1_m;                                                    \
+    v16i8 minus5b_m = __msa_ldi_b(-5);                                       \
+    v16i8 plus20b_m = __msa_ldi_b(20);                                       \
+                                                                             \
+    ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m);                                 \
+    HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2);                                 \
+    ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m);                                 \
+    DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2);          \
+    ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m);                                 \
+    DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2);          \
+}
+
+#define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)  \
+( {                                                                            \
+    v8i16 tmp1_m;                                                              \
+    v16i8 tmp0_m, tmp2_m;                                                      \
+    v16i8 minus5b_m = __msa_ldi_b(-5);                                         \
+    v16i8 plus20b_m = __msa_ldi_b(20);                                         \
+                                                                               \
+    tmp1_m = (v8i16) __msa_ilvr_b((v16i8) vec5, (v16i8) vec0);                 \
+    tmp1_m = __msa_hadd_s_h((v16i8) tmp1_m, (v16i8) tmp1_m);                   \
+                                                                               \
+    ILVR_B2_SB(vec4, vec1, vec3, vec2, tmp0_m, tmp2_m);                        \
+    DPADD_SB2_SH(tmp0_m, tmp2_m, minus5b_m, plus20b_m, tmp1_m, tmp1_m);        \
+                                                                               \
+    tmp1_m;                                                                    \
+} )
+
+#define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)  \
+( {                                                                            \
+    v4i32 tmp1_m;                                                              \
+    v8i16 tmp2_m, tmp3_m;                                                      \
+    v8i16 minus5h_m = __msa_ldi_h(-5);                                         \
+    v8i16 plus20h_m = __msa_ldi_h(20);                                         \
+                                                                               \
+    tmp1_m = (v4i32) __msa_ilvr_h((v8i16) vec5, (v8i16) vec0);                 \
+    tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m);                   \
+                                                                               \
+    ILVR_H2_SH(vec1, vec4, vec2, vec3, tmp2_m, tmp3_m);                        \
+    DPADD_SH2_SW(tmp2_m, tmp3_m, minus5h_m, plus20h_m, tmp1_m, tmp1_m);        \
+                                                                               \
+    tmp1_m = __msa_srari_w(tmp1_m, 10);                                        \
+    tmp1_m = __msa_sat_s_w(tmp1_m, 7);                                         \
+                                                                               \
+    tmp2_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp1_m);                    \
+                                                                               \
+    tmp2_m;                                                                    \
+} )
+
+#define AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,              \
+                                                    mask0, mask1, mask2)     \
+( {                                                                          \
+    v8i16 hz_out_m;                                                          \
+    v16i8 vec0_m, vec1_m, vec2_m;                                            \
+    v16i8 minus5b_m = __msa_ldi_b(-5);                                       \
+    v16i8 plus20b_m = __msa_ldi_b(20);                                       \
+                                                                             \
+    vec0_m = __msa_vshf_b((v16i8) mask0, (v16i8) src1, (v16i8) src0);        \
+    hz_out_m = __msa_hadd_s_h(vec0_m, vec0_m);                               \
+                                                                             \
+    VSHF_B2_SB(src0, src1, src0, src1, mask1, mask2, vec1_m, vec2_m);        \
+    DPADD_SB2_SH(vec1_m, vec2_m, minus5b_m, plus20b_m, hz_out_m, hz_out_m);  \
+                                                                             \
+    hz_out_m;                                                                \
+} )
+
+static void avc_luma_hz_4w_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 res0, res1;
+    v16u8 out;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+        HADD_SB2_SH(vec0, vec1, res0, res1);
+        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+        DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
+        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+        DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
+        SRARI_H2_SH(res0, res1, 5);
+        SAT_SH2_SH(res0, res1, 7);
+        out = PCKEV_XORI128_UB(res0, res1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_8w_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 res0, res1, res2, res3;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+    v16u8 out0, out1;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                     res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        out0 = PCKEV_XORI128_UB(res0, res1);
+        out1 = PCKEV_XORI128_UB(res2, res3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_16w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, 8, src0, src1);
+        src += src_stride;
+        LD_SB2(src, 8, src2, src3);
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+
+        LD_SB2(src, 8, src4, src5);
+        src += src_stride;
+        LD_SB2(src, 8, src6, src7);
+        src += src_stride;
+
+        XORI_B4_128_SB(src4, src5, src6, src7);
+        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res4, res5, res6, res7);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SRARI_H4_SH(res4, res5, res6, res7, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        SAT_SH4_SH(res4, res5, res6, res7, 7);
+        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
+                    vec0, vec1, vec2, vec3);
+        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
+
+        ST_SB4(vec0, vec1, vec2, vec3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   int32_t height, uint8_t hor_offset)
+{
+    uint8_t slide;
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 res0, res1;
+    v16i8 res, mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    slide = 2 + hor_offset;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+        HADD_SB2_SH(vec0, vec1, res0, res1);
+        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+        DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
+        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+        DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
+        SRARI_H2_SH(res0, res1, 5);
+        SAT_SH2_SH(res0, res1, 7);
+
+        res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
+        src0 = __msa_sld_b(src0, src0, slide);
+        src1 = __msa_sld_b(src1, src1, slide);
+        src2 = __msa_sld_b(src2, src2, slide);
+        src3 = __msa_sld_b(src3, src3, slide);
+        src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
+        src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+        src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
+        res = __msa_aver_s_b(res, src0);
+        res = (v16i8) __msa_xori_b((v16u8) res, 128);
+
+        ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   int32_t height, uint8_t hor_offset)
+{
+    uint8_t slide;
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 tmp0, tmp1;
+    v8i16 res0, res1, res2, res3;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    slide = 2 + hor_offset;
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                     res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+
+        src0 = __msa_sld_b(src0, src0, slide);
+        src1 = __msa_sld_b(src1, src1, slide);
+        src2 = __msa_sld_b(src2, src2, slide);
+        src3 = __msa_sld_b(src3, src3, slide);
+
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
+        PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
+
+        tmp0 = __msa_aver_s_b(tmp0, src0);
+        tmp1 = __msa_aver_s_b(tmp1, src1);
+
+        XORI_B2_128_SB(tmp0, tmp1);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                    uint8_t *dst, int32_t dst_stride,
+                                    int32_t height, uint8_t hor_offset)
+{
+    uint32_t loop_cnt;
+    v16i8 dst0, dst1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0, mask1, mask2, vshf;
+    v8i16 res0, res1, res2, res3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    if (hor_offset) {
+        vshf = LD_SB(&luma_mask_arr[16 + 96]);
+    } else {
+        vshf = LD_SB(&luma_mask_arr[96]);
+    }
+
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        LD_SB2(src, 8, src0, src1);
+        src += src_stride;
+        LD_SB2(src, 8, src2, src3);
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
+
+        dst0 = __msa_aver_s_b(dst0, src0);
+        dst1 = __msa_aver_s_b(dst1, src2);
+
+        XORI_B2_128_SB(dst0, dst1);
+
+        ST_SB2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void avc_luma_vt_4w_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776;
+    v16i8 filt0, filt1, filt2;
+    v8i16 out10, out32;
+    v16u8 out;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_r, src65_r, src76_r, src87_r);
+        ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+        XORI_B2_128_SB(src6554, src8776);
+        out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+        out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+        SRARI_H2_SH(out10, out32, 5);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        src2110 = src6554;
+        src4332 = src8776;
+        src4 = src8;
+    }
+}
+
+static void avc_luma_vt_8w_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+    v16i8 filt0, filt1, filt2;
+    v16u8 out0, out1;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        out0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        out1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src76_r;
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src4 = src10;
+    }
+}
+
+static void avc_luma_vt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int32_t height)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v16u8 res0, res1, res2, res3;
+    v16i8 filt0, filt1, filt2;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_l, src21_l, src32_l, src43_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_r, src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_l, src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, res0, res1, res2, res3);
+        XORI_B4_128_UB(res0, res1, res2, res3);
+
+        ST_UB4(res0, res1, res2, res3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src4 = src8;
+    }
+}
+
+static void avc_luma_vt_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   int32_t height, uint8_t ver_offset)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776;
+    v8i16 out10, out32;
+    v16i8 filt0, filt1, filt2;
+    v16u8 out;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_r, src65_r, src76_r, src87_r);
+        ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+        XORI_B2_128_SB(src6554, src8776);
+        out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+        out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+        SRARI_H2_SH(out10, out32, 5);
+        SAT_SH2_SH(out10, out32, 7);
+
+        out = PCKEV_XORI128_UB(out10, out32);
+
+        if (ver_offset) {
+            src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
+            src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
+        } else {
+            src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+            src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
+        }
+
+        src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
+        out = __msa_aver_u_b(out, (v16u8) src32_r);
+
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src2110 = src6554;
+        src4332 = src8776;
+        src2 = src6;
+        src3 = src7;
+        src4 = src8;
+    }
+}
+
+static void avc_luma_vt_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   int32_t height, uint8_t ver_offset)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+    v16i8 res0, res1;
+    v16i8 filt0, filt1, filt2;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);
+
+        if (ver_offset) {
+            PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);
+        } else {
+            PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);
+        }
+
+        res0 = __msa_aver_s_b(res0, (v16i8) src10_r);
+        res1 = __msa_aver_s_b(res1, (v16i8) src32_r);
+
+        XORI_B2_128_SB(res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        src10_r = src76_r;
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src2 = src8;
+        src3 = src9;
+        src4 = src10;
+    }
+}
+
+static void avc_luma_vt_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                    uint8_t *dst, int32_t dst_stride,
+                                    int32_t height, uint8_t ver_offset)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v16u8 res0, res1, res2, res3;
+    v16i8 filt0, filt1, filt2;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_l, src21_l, src32_l, src43_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_r, src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_l, src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, res0, res1, res2, res3);
+
+        if (ver_offset) {
+            res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
+            res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
+            res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
+            res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
+        } else {
+            res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
+            res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
+            res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
+            res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
+        }
+
+        XORI_B4_128_UB(res0, res1, res2, res3);
+        ST_UB4(res0, res1, res2, res3, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src2 = src6;
+        src3 = src7;
+        src4 = src8;
+    }
+}
+
+static void avc_luma_mid_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v8i16 dst0, dst1, dst2, dst3;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                          mask0, mask1, mask2);
+    hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                          mask0, mask1, mask2);
+
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                              mask0, mask1,
+                                                              mask2);
+        hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                              mask0, mask1,
+                                                              mask2);
+
+        PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
+
+        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
+                                                 hz_out3, hz_out4, hz_out5);
+        dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
+                                                 hz_out4, hz_out5, hz_out6);
+        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
+                                                 hz_out5, hz_out6, hz_out7);
+        dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
+                                                 hz_out6, hz_out7, hz_out8);
+
+        PCKEV_B2_SB(dst1, dst0, dst3, dst2, src0, src1);
+        XORI_B2_128_SB(src0, src1);
+
+        ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+
+        hz_out0 = hz_out4;
+        hz_out1 = hz_out5;
+        hz_out2 = hz_out6;
+        hz_out3 = hz_out7;
+        hz_out4 = hz_out8;
+    }
+}
+
+static void avc_luma_mid_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v8i16 dst0, dst1, dst2, dst3;
+    v16u8 out0, out1;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+        hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+        hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+        hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
+                                               hz_out3, hz_out4, hz_out5);
+        dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
+                                               hz_out4, hz_out5, hz_out6);
+        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
+                                               hz_out5, hz_out6, hz_out7);
+        dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
+                                               hz_out6, hz_out7, hz_out8);
+        out0 = PCKEV_XORI128_UB(dst0, dst1);
+        out1 = PCKEV_XORI128_UB(dst2, dst3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        hz_out3 = hz_out7;
+        hz_out1 = hz_out5;
+        hz_out5 = hz_out4;
+        hz_out4 = hz_out8;
+        hz_out2 = hz_out6;
+        hz_out0 = hz_out5;
+    }
+}
+
+static void avc_luma_mid_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int32_t height)
+{
+    uint32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        avc_luma_mid_8w_msa(src, src_stride, dst, dst_stride, height);
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void avc_luma_midh_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int32_t height, uint8_t horiz_offset)
+{
+    uint32_t row;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
+    v4i32 hz_res0, hz_res1;
+    v8i16 dst0, dst1;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+    v8i16 zeros = { 0 };
+    v16u8 out;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    for (row = (height >> 1); row--;) {
+        LD_SB2(src, src_stride, src5, src6);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src5, src6);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
+                                        vt_res0, vt_res1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
+                                        vt_res2, vt_res3);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
+                   mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
+                   mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+
+        SRARI_W2_SW(hz_res0, hz_res1, 10);
+        SAT_SW2_SW(hz_res0, hz_res1, 7);
+
+        dst0 = __msa_srari_h(shf_vec2, 5);
+        dst1 = __msa_srari_h(shf_vec5, 5);
+
+        SAT_SH2_SH(dst0, dst1, 7);
+
+        if (horiz_offset) {
+            dst0 = __msa_ilvod_h(zeros, dst0);
+            dst1 = __msa_ilvod_h(zeros, dst1);
+        } else {
+            ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
+        }
+
+        hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
+        hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
+        dst0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
+
+        out = PCKEV_XORI128_UB(dst0, dst0);
+        ST4x2_UB(out, dst, dst_stride);
+
+        dst += (2 * dst_stride);
+
+        src0 = src2;
+        src1 = src3;
+        src2 = src4;
+        src3 = src5;
+        src4 = src6;
+    }
+}
+
+static void avc_luma_midh_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int32_t height, uint8_t horiz_offset)
+{
+    uint32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        avc_luma_midh_qrt_4w_msa(src, src_stride, dst, dst_stride, height,
+                                 horiz_offset);
+
+        src += 4;
+        dst += 4;
+    }
+}
+
+static void avc_luma_midh_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int32_t height, uint8_t horiz_offset)
+{
+    uint32_t multiple8_cnt;
+
+    for (multiple8_cnt = 4; multiple8_cnt--;) {
+        avc_luma_midh_qrt_4w_msa(src, src_stride, dst, dst_stride, height,
+                                 horiz_offset);
+
+        src += 4;
+        dst += 4;
+    }
+}
+
+static void avc_luma_midv_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int32_t height, uint8_t ver_offset)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                          mask0, mask1, mask2);
+    hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                          mask0, mask1, mask2);
+
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                              mask0, mask1,
+                                                              mask2);
+        hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                              mask0, mask1,
+                                                              mask2);
+
+        PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
+
+        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
+                                               hz_out3, hz_out4, hz_out5);
+        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
+                                               hz_out4, hz_out5, hz_out6);
+        dst4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
+                                               hz_out5, hz_out6, hz_out7);
+        dst6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
+                                               hz_out6, hz_out7, hz_out8);
+
+        if (ver_offset) {
+            dst1 = __msa_srari_h(hz_out3, 5);
+            dst3 = __msa_srari_h(hz_out4, 5);
+            dst5 = __msa_srari_h(hz_out5, 5);
+            dst7 = __msa_srari_h(hz_out6, 5);
+        } else {
+            dst1 = __msa_srari_h(hz_out2, 5);
+            dst3 = __msa_srari_h(hz_out3, 5);
+            dst5 = __msa_srari_h(hz_out4, 5);
+            dst7 = __msa_srari_h(hz_out5, 5);
+        }
+
+        SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
+
+        dst0 = __msa_aver_s_h(dst0, dst1);
+        dst1 = __msa_aver_s_h(dst2, dst3);
+        dst2 = __msa_aver_s_h(dst4, dst5);
+        dst3 = __msa_aver_s_h(dst6, dst7);
+
+        PCKEV_B2_SB(dst1, dst0, dst3, dst2, src0, src1);
+        XORI_B2_128_SB(src0, src1);
+
+        ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        hz_out0 = hz_out4;
+        hz_out1 = hz_out5;
+        hz_out2 = hz_out6;
+        hz_out3 = hz_out7;
+        hz_out4 = hz_out8;
+    }
+}
+
+static void avc_luma_midv_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int32_t height, uint8_t ver_offset)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 out;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+        hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+        hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+        hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+
+        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
+                                               hz_out3, hz_out4, hz_out5);
+        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
+                                               hz_out4, hz_out5, hz_out6);
+        dst4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
+                                               hz_out5, hz_out6, hz_out7);
+        dst6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
+                                               hz_out6, hz_out7, hz_out8);
+
+        if (ver_offset) {
+            dst1 = __msa_srari_h(hz_out3, 5);
+            dst3 = __msa_srari_h(hz_out4, 5);
+            dst5 = __msa_srari_h(hz_out5, 5);
+            dst7 = __msa_srari_h(hz_out6, 5);
+        } else {
+            dst1 = __msa_srari_h(hz_out2, 5);
+            dst3 = __msa_srari_h(hz_out3, 5);
+            dst5 = __msa_srari_h(hz_out4, 5);
+            dst7 = __msa_srari_h(hz_out5, 5);
+        }
+
+        SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
+
+        dst0 = __msa_aver_s_h(dst0, dst1);
+        dst1 = __msa_aver_s_h(dst2, dst3);
+        dst2 = __msa_aver_s_h(dst4, dst5);
+        dst3 = __msa_aver_s_h(dst6, dst7);
+
+        out = PCKEV_XORI128_UB(dst0, dst0);
+        ST8x1_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(dst1, dst1);
+        ST8x1_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(dst2, dst2);
+        ST8x1_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(dst3, dst3);
+        ST8x1_UB(out, dst);
+        dst += dst_stride;
+
+        hz_out0 = hz_out4;
+        hz_out1 = hz_out5;
+        hz_out2 = hz_out6;
+        hz_out3 = hz_out7;
+        hz_out4 = hz_out8;
+    }
+}
+
+static void avc_luma_midv_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int32_t height, uint8_t vert_offset)
+{
+    uint32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        avc_luma_midv_qrt_8w_msa(src, src_stride, dst, dst_stride, height,
+                                 vert_offset);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y,
+                                   int32_t src_stride, uint8_t *dst,
+                                   int32_t dst_stride, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
+    v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
+    v8i16 out0, out1;
+    v16u8 out;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+
+    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+    src_y += (5 * src_stride);
+
+    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
+    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
+    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
+    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
+
+    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
+        src_x += (4 * src_stride);
+
+        XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+
+        hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0,
+                                                              src_hz1, mask0,
+                                                              mask1, mask2);
+        hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2,
+                                                              src_hz3, mask0,
+                                                              mask1, mask2);
+
+        SRARI_H2_SH(hz_out0, hz_out1, 5);
+        SAT_SH2_SH(hz_out0, hz_out1, 7);
+
+        LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
+        src_y += (4 * src_stride);
+
+        src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
+        src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
+        src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
+        src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
+
+        XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
+
+        /* filter calc */
+        vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1,
+                                                      src_vt2, src_vt3,
+                                                      src_vt4, src_vt5);
+        vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3,
+                                                      src_vt4, src_vt5,
+                                                      src_vt6, src_vt7);
+
+        SRARI_H2_SH(vert_out0, vert_out1, 5);
+        SAT_SH2_SH(vert_out0, vert_out1, 7);
+
+        out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
+        out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
+
+        SAT_SH2_SH(out0, out1, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src_vt3 = src_vt7;
+        src_vt1 = src_vt5;
+        src_vt0 = src_vt4;
+        src_vt4 = src_vt8;
+        src_vt2 = src_vt6;
+    }
+}
+
+static void avc_luma_hv_qrt_8w_msa(const uint8_t *src_x, const uint8_t *src_y,
+                                   int32_t src_stride, uint8_t *dst,
+                                   int32_t dst_stride, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
+    v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
+    v8i16 out0, out1, out2, out3;
+    v16u8 tmp0, tmp1;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+    src_y += (5 * src_stride);
+
+    src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
+    src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
+    src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
+    src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
+
+    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
+        XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+        src_x += (4 * src_stride);
+
+        hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, mask0, mask1, mask2);
+        hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, mask0, mask1, mask2);
+        hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, mask0, mask1, mask2);
+        hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, mask0, mask1, mask2);
+
+        SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
+        SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
+
+        LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
+        src_y += (4 * src_stride);
+
+        src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
+        src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
+        src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
+        src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
+
+        XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
+
+        /* filter calc */
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
+                                        src_vt4, src_vt5, vert_out0, vert_out1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
+                                        src_vt6, src_vt7, vert_out2, vert_out3);
+
+        SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
+        SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
+
+        out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
+        out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
+        out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
+        out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
+
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        src_vt3 = src_vt7;
+        src_vt1 = src_vt5;
+        src_vt5 = src_vt4;
+        src_vt4 = src_vt8;
+        src_vt2 = src_vt6;
+        src_vt0 = src_vt5;
+    }
+}
+
+static void avc_luma_hv_qrt_16w_msa(const uint8_t *src_x, const uint8_t *src_y,
+                                    int32_t src_stride, uint8_t *dst,
+                                    int32_t dst_stride, int32_t height)
+{
+    uint32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        avc_luma_hv_qrt_8w_msa(src_x, src_y, src_stride, dst, dst_stride,
+                               height);
+
+        src_x += 8;
+        src_y += 8;
+        dst += 8;
+    }
+}
+
+static void avc_luma_hz_and_aver_dst_4x4_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{
+    v16i8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3, res;
+    v8i16 res0, res1;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+    HADD_SB2_SH(vec0, vec1, res0, res1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
+    SRARI_H2_SH(res0, res1, 5);
+    SAT_SH2_SH(res0, res1, 7);
+    res = PCKEV_XORI128_UB(res0, res1);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+
+    dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
+    res = __msa_aver_u_b(res, dst0);
+
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void avc_luma_hz_and_aver_dst_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8i16 res0, res1, res2, res3;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                     res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_and_aver_dst_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 mask0, mask1, mask2;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB2(src, 8, src0, src1);
+        src += src_stride;
+        LD_SB2(src, 8, src2, src3);
+        src += src_stride;
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        LD_SB2(src, 8, src4, src5);
+        src += src_stride;
+        LD_SB2(src, 8, src6, src7);
+        src += src_stride;
+        XORI_B4_128_SB(src4, src5, src6, src7);
+        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res4, res5, res6, res7);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SRARI_H4_SH(res4, res5, res6, res7, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        SAT_SH4_SH(res4, res5, res6, res7, 7);
+        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
+                    vec0, vec1, vec2, vec3);
+        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
+        AVER_UB4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                    dst0, dst1, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 uint8_t hor_offset)
+{
+    uint8_t slide;
+    v16i8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 out0, out1;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+    v16u8 res0, res1;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+
+    if (hor_offset) {
+        slide = 3;
+    } else {
+        slide = 2;
+    }
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+    HADD_SB2_SH(vec0, vec1, out0, out1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
+    SRARI_H2_SH(out0, out1, 5);
+    SAT_SH2_SH(out0, out1, 7);
+
+    PCKEV_B2_UB(out0, out0, out1, out1, res0, res1);
+
+    src0 = __msa_sld_b(src0, src0, slide);
+    src1 = __msa_sld_b(src1, src1, slide);
+    src2 = __msa_sld_b(src2, src2, slide);
+    src3 = __msa_sld_b(src3, src3, slide);
+    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
+    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+    res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src0);
+    res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src1);
+
+    XORI_B2_128_UB(res0, res1);
+
+    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
+    dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
+
+    AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
+
+    ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void avc_luma_hz_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 uint8_t hor_offset)
+{
+    uint8_t slide;
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0, mask1, mask2;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v8i16 out0, out1, out2, out3;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+    v16i8 res0, res1, res2, res3;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    if (hor_offset) {
+        slide = 3;
+    } else {
+        slide = 2;
+    }
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        HADD_SB4_SH(vec0, vec1, vec2, vec3, out0, out1, out2, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                     out0, out1, out2, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
+                     plus20b, out0, out1, out2, out3);
+
+        src0 = __msa_sld_b(src0, src0, slide);
+        src1 = __msa_sld_b(src1, src1, slide);
+        src2 = __msa_sld_b(src2, src2, slide);
+        src3 = __msa_sld_b(src3, src3, slide);
+
+        SRARI_H4_SH(out0, out1, out2, out3, 5);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+        PCKEV_B4_SB(out0, out0, out1, out1, out2, out2, out3, out3,
+                    res0, res1, res2, res3);
+
+        res0 = __msa_aver_s_b(res0, src0);
+        res1 = __msa_aver_s_b(res1, src1);
+        res2 = __msa_aver_s_b(res2, src2);
+        res3 = __msa_aver_s_b(res3, src3);
+
+        XORI_B4_128_SB(res0, res1, res2, res3);
+        AVER_ST8x4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
+                      dst, dst_stride);
+
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   uint8_t hor_offset)
+{
+    uint32_t loop_cnt;
+    v16i8 out0, out1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0, mask1, mask2, vshf;
+    v16u8 dst0, dst1;
+    v8i16 res0, res1, res2, res3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    if (hor_offset) {
+        vshf = LD_SB(&luma_mask_arr[16 + 96]);
+    } else {
+        vshf = LD_SB(&luma_mask_arr[96]);
+    }
+
+    for (loop_cnt = 8; loop_cnt--;) {
+        LD_SB2(src, 8, src0, src1);
+        src += src_stride;
+        LD_SB2(src, 8, src2, src3);
+        src += src_stride;
+
+        LD_UB2(dst, dst_stride, dst0, dst1);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
+
+        out0 = __msa_aver_s_b(out0, src0);
+        out1 = __msa_aver_s_b(out1, src2);
+
+        XORI_B2_128_SB(out0, out1);
+        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+        ST_UB2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void avc_luma_vt_and_aver_dst_4x4_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776;
+    v8i16 out10, out32;
+    v16i8 filt0, filt1, filt2;
+    v16u8 res;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+    LD_SB4(src, src_stride, src5, src6, src7, src8);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+               src54_r, src65_r, src76_r, src87_r);
+    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+    XORI_B2_128_SB(src6554, src8776);
+    out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+    out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+    SRARI_H2_SH(out10, out32, 5);
+    SAT_SH2_SH(out10, out32, 7);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    res = PCKEV_XORI128_UB(out10, out32);
+
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+
+    dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
+    dst0 = __msa_aver_u_b(res, dst0);
+
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void avc_luma_vt_and_aver_dst_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 out0, out1, out2, out3;
+    v16i8 filt0, filt1, filt2;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        out0 = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1 = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2 = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3 = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0, out1, out2, out3, 5);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src76_r;
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src4 = src10;
+    }
+}
+
+static void avc_luma_vt_and_aver_dst_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v16i8 filt0, filt1, filt2;
+    v16u8 res0, res1, res2, res3;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_l, src21_l, src32_l, src43_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_r, src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_l, src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, res0, res1, res2, res3);
+        XORI_B4_128_UB(res0, res1, res2, res3);
+        AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
+                    res0, res1, res2, res3);
+        ST_UB4(res0, res1, res2, res3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src4 = src8;
+    }
+}
+
+static void avc_luma_vt_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 uint8_t ver_offset)
+{
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776;
+    v8i16 out10, out32;
+    v16i8 filt0, filt1, filt2;
+    v16u8 res;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+    LD_SB4(src, src_stride, src5, src6, src7, src8);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+               src54_r, src65_r, src76_r, src87_r);
+    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+    XORI_B2_128_SB(src6554, src8776);
+    out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+    out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+    SRARI_H2_SH(out10, out32, 5);
+    SAT_SH2_SH(out10, out32, 7);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    res = PCKEV_XORI128_UB(out10, out32);
+
+    if (ver_offset) {
+        src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
+        src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
+    } else {
+        src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+        src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
+    }
+
+    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
+    res = __msa_aver_u_b(res, (v16u8) src32_r);
+
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+
+    dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
+    dst0 = __msa_aver_u_b(res, dst0);
+
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void avc_luma_vt_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 uint8_t ver_offset)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+    v16i8 res0, res1;
+    v16u8 vec0, vec1;
+    v16i8 filt0, filt1, filt2;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);
+
+        if (ver_offset) {
+            PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);
+        } else {
+            PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);
+        }
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+
+        vec0 = (v16u8) __msa_aver_s_b(res0, src10_r);
+        vec1 = (v16u8) __msa_aver_s_b(res1, src32_r);
+
+        XORI_B2_128_UB(vec0, vec1);
+        AVER_UB2_UB(vec0, dst0, vec1, dst1, vec0, vec1);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src76_r;
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src2 = src8;
+        src3 = src9;
+        src4 = src10;
+    }
+}
+
+static void avc_luma_vt_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   uint8_t ver_offset)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;
+    int16_t filt_const1 = 0x1414;
+    int16_t filt_const2 = 0x1fb;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v16i8 out0, out1, out2, out3;
+    v16i8 filt0, filt1, filt2;
+    v16u8 res0, res1, res2, res3;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_l, src21_l, src32_l, src43_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_r, src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_l, src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_SB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, out0, out1, out2, out3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        if (ver_offset) {
+            res0 = (v16u8) __msa_aver_s_b(out0, src3);
+            res1 = (v16u8) __msa_aver_s_b(out1, src4);
+            res2 = (v16u8) __msa_aver_s_b(out2, src5);
+            res3 = (v16u8) __msa_aver_s_b(out3, src6);
+        } else {
+            res0 = (v16u8) __msa_aver_s_b(out0, src2);
+            res1 = (v16u8) __msa_aver_s_b(out1, src3);
+            res2 = (v16u8) __msa_aver_s_b(out2, src4);
+            res3 = (v16u8) __msa_aver_s_b(out3, src5);
+        }
+
+        XORI_B4_128_UB(res0, res1, res2, res3);
+        AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
+                    dst0, dst1, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src2 = src6;
+        src3 = src7;
+        src4 = src8;
+    }
+}
+
+static void avc_luma_mid_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v8i16 res0, res1, res2, res3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                          mask0, mask1, mask2);
+    hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                          mask0, mask1, mask2);
+
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+
+    hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                          mask0, mask1, mask2);
+    hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                          mask0, mask1, mask2);
+
+    PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
+
+    res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
+                                             hz_out3, hz_out4, hz_out5);
+    res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
+                                             hz_out4, hz_out5, hz_out6);
+    res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
+                                             hz_out5, hz_out6, hz_out7);
+    res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
+                                             hz_out6, hz_out7, hz_out8);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    tmp0 = PCKEV_XORI128_UB(res0, res1);
+    tmp1 = PCKEV_XORI128_UB(res2, res3);
+    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2, tmp3);
+    AVER_UB2_UB(tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
+
+    ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+}
+
+static void avc_luma_mid_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8i16 res0, res1, res2, res3;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+        hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+        hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+        hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+
+        res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
+                                               hz_out3, hz_out4, hz_out5);
+        res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
+                                               hz_out4, hz_out5, hz_out6);
+        res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
+                                               hz_out5, hz_out6, hz_out7);
+        res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
+                                               hz_out6, hz_out7, hz_out8);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        hz_out3 = hz_out7;
+        hz_out1 = hz_out5;
+        hz_out5 = hz_out4;
+        hz_out4 = hz_out8;
+        hz_out2 = hz_out6;
+        hz_out0 = hz_out5;
+    }
+}
+
+static void avc_luma_mid_and_aver_dst_16x16_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                uint8_t *dst,
+                                                int32_t dst_stride)
+{
+    avc_luma_mid_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 16);
+    avc_luma_mid_and_aver_dst_8w_msa(src + 8, src_stride, dst + 8, dst_stride,
+                                     16);
+}
+
+static void avc_luma_midh_qrt_and_aver_dst_4w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int32_t height,
+                                                  uint8_t horiz_offset)
+{
+    uint32_t row;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16u8 dst0, dst1, res;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
+    v4i32 hz_res0, hz_res1;
+    v8i16 res0, res1;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+    v8i16 zeros = { 0 };
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    for (row = (height >> 1); row--;) {
+        LD_SB2(src, src_stride, src5, src6);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src5, src6);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+
+        dst0 = (v16u8) __msa_ilvr_w((v4i32) dst1, (v4i32) dst0);
+
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
+                                        vt_res0, vt_res1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
+                                        vt_res2, vt_res3);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
+                   mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
+                   mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+
+        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+
+        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+
+        SRARI_W2_SW(hz_res0, hz_res1, 10);
+        SAT_SW2_SW(hz_res0, hz_res1, 7);
+
+        res0 = __msa_srari_h(shf_vec2, 5);
+        res1 = __msa_srari_h(shf_vec5, 5);
+
+        SAT_SH2_SH(res0, res1, 7);
+
+        if (horiz_offset) {
+            res0 = __msa_ilvod_h(zeros, res0);
+            res1 = __msa_ilvod_h(zeros, res1);
+        } else {
+            ILVEV_H2_SH(res0, zeros, res1, zeros, res0, res1);
+        }
+        hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) res0);
+        hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) res1);
+        res0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
+
+        res = PCKEV_XORI128_UB(res0, res0);
+
+        dst0 = __msa_aver_u_b(res, dst0);
+
+        ST4x2_UB(dst0, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        src0 = src2;
+        src1 = src3;
+        src2 = src4;
+        src3 = src5;
+        src4 = src6;
+    }
+}
+
+static void avc_luma_midh_qrt_and_aver_dst_8w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int32_t height,
+                                                  uint8_t horiz_offset)
+{
+    uint32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride,
+                                              height, horiz_offset);
+
+        src += 4;
+        dst += 4;
+    }
+}
+
+static void avc_luma_midh_qrt_and_aver_dst_16w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int32_t height,
+                                                   uint8_t horiz_offset)
+{
+    uint32_t multiple8_cnt;
+
+    for (multiple8_cnt = 4; multiple8_cnt--;) {
+        avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride,
+                                              height, horiz_offset);
+
+        src += 4;
+        dst += 4;
+    }
+}
+
+static void avc_luma_midv_qrt_and_aver_dst_4w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int32_t height,
+                                                  uint8_t ver_offset)
+{
+    int32_t loop_cnt;
+    int32_t out0, out1;
+    v16i8 src0, src1, src2, src3, src4;
+    v16u8 dst0, dst1;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6;
+    v8i16 res0, res1, res2, res3;
+    v16u8 vec0, vec1;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                          mask0, mask1, mask2);
+    hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                          mask0, mask1, mask2);
+
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src0, src1);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                              mask0, mask1,
+                                                              mask2);
+        hz_out6 = (v8i16) __msa_pckod_d((v2i64) hz_out5, (v2i64) hz_out5);
+        res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
+                                                 hz_out3, hz_out4, hz_out5);
+        res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
+                                                 hz_out4, hz_out5, hz_out6);
+
+        if (ver_offset) {
+            res1 = __msa_srari_h(hz_out3, 5);
+            res3 = __msa_srari_h(hz_out4, 5);
+        } else {
+            res1 = __msa_srari_h(hz_out2, 5);
+            res3 = __msa_srari_h(hz_out3, 5);
+        }
+
+        SAT_SH2_SH(res1, res3, 7);
+
+        res0 = __msa_aver_s_h(res0, res1);
+        res1 = __msa_aver_s_h(res2, res3);
+
+        vec0 = PCKEV_XORI128_UB(res0, res0);
+        vec1 = PCKEV_XORI128_UB(res1, res1);
+
+        AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
+
+        out0 = __msa_copy_u_w((v4i32) dst0, 0);
+        out1 = __msa_copy_u_w((v4i32) dst1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+
+        hz_out0 = hz_out2;
+        hz_out1 = hz_out3;
+        hz_out2 = hz_out4;
+        hz_out3 = hz_out5;
+        hz_out4 = hz_out6;
+    }
+}
+
+static void avc_luma_midv_qrt_and_aver_dst_8w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int32_t height,
+                                                  uint8_t vert_offset)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v8i16 res0, res1, res2, res3;
+    v8i16 res4, res5, res6, res7;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+        hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+        hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+        hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+
+        res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
+                                               hz_out3, hz_out4, hz_out5);
+        res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
+                                               hz_out4, hz_out5, hz_out6);
+        res4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
+                                               hz_out5, hz_out6, hz_out7);
+        res6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
+                                               hz_out6, hz_out7, hz_out8);
+
+        if (vert_offset) {
+            res1 = __msa_srari_h(hz_out3, 5);
+            res3 = __msa_srari_h(hz_out4, 5);
+            res5 = __msa_srari_h(hz_out5, 5);
+            res7 = __msa_srari_h(hz_out6, 5);
+        } else {
+            res1 = __msa_srari_h(hz_out2, 5);
+            res3 = __msa_srari_h(hz_out3, 5);
+            res5 = __msa_srari_h(hz_out4, 5);
+            res7 = __msa_srari_h(hz_out5, 5);
+        }
+
+        SAT_SH4_SH(res1, res3, res5, res7, 7);
+
+        res0 = __msa_aver_s_h(res0, res1);
+        res1 = __msa_aver_s_h(res2, res3);
+        res2 = __msa_aver_s_h(res4, res5);
+        res3 = __msa_aver_s_h(res6, res7);
+
+        CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out0 = hz_out4;
+        hz_out1 = hz_out5;
+        hz_out2 = hz_out6;
+        hz_out3 = hz_out7;
+        hz_out4 = hz_out8;
+    }
+}
+
+static void avc_luma_midv_qrt_and_aver_dst_16w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int32_t height,
+                                                   uint8_t vert_offset)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        avc_luma_midv_qrt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                              height, vert_offset);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
+                                                 const uint8_t *src_y,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride)
+{
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
+    v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
+    v8i16 res0, res1;
+    v16u8 res;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+    src_y += (5 * src_stride);
+
+    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
+    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
+    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
+    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
+
+    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
+    LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0, src_hz1,
+                                                          mask0, mask1, mask2);
+    hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2, src_hz3,
+                                                          mask0, mask1, mask2);
+    SRARI_H2_SH(hz_out0, hz_out1, 5);
+    SAT_SH2_SH(hz_out0, hz_out1, 7);
+    LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
+
+    src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
+    src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
+    src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
+    src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
+
+    XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
+
+    /* filter calc */
+    vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1, src_vt2,
+                                                  src_vt3, src_vt4, src_vt5);
+    vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3, src_vt4,
+                                                  src_vt5, src_vt6, src_vt7);
+    SRARI_H2_SH(vert_out0, vert_out1, 5);
+    SAT_SH2_SH(vert_out0, vert_out1, 7);
+
+    res1 = __msa_srari_h((hz_out1 + vert_out1), 1);
+    res0 = __msa_srari_h((hz_out0 + vert_out0), 1);
+
+    SAT_SH2_SH(res0, res1, 7);
+    res = PCKEV_XORI128_UB(res0, res1);
+
+    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
+    dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst0 = __msa_aver_u_b(res, dst0);
+
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
+                                                 const uint8_t *src_y,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride)
+{
+    uint32_t loop_cnt;
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3;
+    v16i8 src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
+    v8i16 out0, out1, out2, out3;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+    src_y += (5 * src_stride);
+
+    src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
+    src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
+    src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
+    src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
+
+    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
+        XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+        src_x += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, mask0, mask1, mask2);
+        hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, mask0, mask1, mask2);
+        hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, mask0, mask1, mask2);
+        hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, mask0, mask1, mask2);
+        SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
+        SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
+        LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
+        src_y += (4 * src_stride);
+
+        src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
+        src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
+        src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
+        src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
+
+        XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
+                                        src_vt4, src_vt5, vert_out0, vert_out1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
+                                        src_vt6, src_vt7, vert_out2, vert_out3);
+        SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
+        SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
+
+        out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
+        out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
+        out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
+        out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
+
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src_vt0 = src_vt4;
+        src_vt1 = src_vt5;
+        src_vt2 = src_vt6;
+        src_vt3 = src_vt7;
+        src_vt4 = src_vt8;
+    }
+}
+
+static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
+                                                   const uint8_t *src_y,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    uint32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride,
+                                             dst, dst_stride);
+
+        src_x += 8;
+        src_y += 8;
+        dst += 8;
+    }
+
+    src_x += (8 * src_stride) - 16;
+    src_y += (8 * src_stride) - 16;
+    dst += (8 * dst_stride) - 16;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride,
+                                             dst, dst_stride);
+
+        src_x += 8;
+        src_y += 8;
+        dst += 8;
+    }
+}
+
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        for (cnt = height >> 3; cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 2) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+
+            SD(out0, dst);
+            dst += dst_stride;
+            SD(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t cnt, loop_cnt;
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_UB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst_tmp, dst_stride);
+            dst_tmp += (8 * dst_stride);
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst, dst_stride);
+            dst += (8 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint32_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    if (0 == (height % 4)) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                        dst0, dst1, dst2, dst3);
+
+            out0 = __msa_copy_u_w((v4i32) dst0, 0);
+            out1 = __msa_copy_u_w((v4i32) dst1, 0);
+            out2 = __msa_copy_u_w((v4i32) dst2, 0);
+            out3 = __msa_copy_u_w((v4i32) dst3, 0);
+            SW4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == (height % 2)) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+
+            LD_UB2(dst, dst_stride, dst0, dst1);
+
+            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+            out0 = __msa_copy_u_w((v4i32) dst0, 0);
+            out1 = __msa_copy_u_w((v4i32) dst1, 0);
+            SW(out0, dst);
+            dst += dst_stride;
+            SW(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    for (cnt = (height / 4); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+
+        out0 = __msa_copy_u_d((v2i64) dst0, 0);
+        out1 = __msa_copy_u_d((v2i64) dst1, 0);
+        out2 = __msa_copy_u_d((v2i64) dst2, 0);
+        out3 = __msa_copy_u_d((v2i64) dst3, 0);
+        SD4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    for (cnt = (height / 8); cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                    dst4, dst5, dst6, dst7);
+        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    copy_width16_msa(src, stride, dst, stride, 16);
+}
+
+void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    copy_width8_msa(src, stride, dst, stride, 8);
+}
+
+void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avg_width16_msa(src, stride, dst, stride, 16);
+}
+
+void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avg_width8_msa(src, stride, dst, stride, 8);
+}
+
+void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avg_width4_msa(src, stride, dst, stride, 4);
+}
+
+void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_16w_msa(src - 2, stride, dst, stride, 16, 0);
+}
+
+void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_16w_msa(src - 2, stride, dst, stride, 16, 1);
+}
+
+void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_8w_msa(src - 2, stride, dst, stride, 8, 0);
+}
+
+void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_8w_msa(src - 2, stride, dst, stride, 8, 1);
+}
+
+void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_4w_msa(src - 2, stride, dst, stride, 4, 0);
+}
+
+void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_4w_msa(src - 2, stride, dst, stride, 4, 1);
+}
+
+void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hz_16w_msa(src - 2, stride, dst, stride, 16);
+}
+
+void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hz_8w_msa(src - 2, stride, dst, stride, 8);
+}
+
+void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hz_4w_msa(src - 2, stride, dst, stride, 4);
+}
+
+void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_16w_msa(src - (stride * 2), stride, dst, stride, 16, 0);
+}
+
+void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_16w_msa(src - (stride * 2), stride, dst, stride, 16, 1);
+}
+
+void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_8w_msa(src - (stride * 2), stride, dst, stride, 8, 0);
+}
+
+void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_8w_msa(src - (stride * 2), stride, dst, stride, 8, 1);
+}
+
+void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_4w_msa(src - (stride * 2), stride, dst, stride, 4, 0);
+}
+
+void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_4w_msa(src - (stride * 2), stride, dst, stride, 4, 1);
+}
+
+void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_16w_msa(src - 2,
+                            src - (stride * 2), stride, dst, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_16w_msa(src - 2,
+                            src - (stride * 2) +
+                            sizeof(uint8_t), stride, dst, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_16w_msa(src + stride - 2,
+                            src - (stride * 2), stride, dst, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_16w_msa(src + stride - 2,
+                            src - (stride * 2) +
+                            sizeof(uint8_t), stride, dst, stride, 16);
+}
+
+void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_8w_msa(src - 2, src - (stride * 2), stride, dst, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_8w_msa(src - 2,
+                           src - (stride * 2) +
+                           sizeof(uint8_t), stride, dst, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_8w_msa(src + stride - 2,
+                           src - (stride * 2), stride, dst, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_8w_msa(src + stride - 2,
+                           src - (stride * 2) +
+                           sizeof(uint8_t), stride, dst, stride, 8);
+}
+
+
+void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_4w_msa(src - 2, src - (stride * 2), stride, dst, stride, 4);
+}
+
+void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_4w_msa(src - 2,
+                           src - (stride * 2) +
+                           sizeof(uint8_t), stride, dst, stride, 4);
+}
+
+void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_4w_msa(src + stride - 2,
+                           src - (stride * 2), stride, dst, stride, 4);
+}
+
+void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_4w_msa(src + stride - 2,
+                           src - (stride * 2) +
+                           sizeof(uint8_t), stride, dst, stride, 4);
+}
+
+void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_16w_msa(src - (2 * stride) - 2,
+                              stride, dst, stride, 16, 0);
+}
+
+void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_16w_msa(src - (2 * stride) - 2,
+                              stride, dst, stride, 16, 1);
+}
+
+void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 0);
+}
+
+void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 1);
+}
+
+void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 0);
+}
+
+void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 1);
+}
+
+void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_vt_16w_msa(src - (stride * 2), stride, dst, stride, 16);
+}
+
+void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_vt_8w_msa(src - (stride * 2), stride, dst, stride, 8);
+}
+
+void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_vt_4w_msa(src - (stride * 2), stride, dst, stride, 4);
+}
+
+void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_16w_msa(src - (2 * stride) - 2,
+                              stride, dst, stride, 16, 0);
+}
+
+void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_16w_msa(src - (2 * stride) - 2,
+                              stride, dst, stride, 16, 1);
+}
+
+void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 0);
+}
+
+void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 1);
+}
+
+void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 0);
+}
+
+void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 1);
+}
+
+void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_mid_16w_msa(src - (2 * stride) - 2, stride, dst, stride, 16);
+}
+
+void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_mid_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8);
+}
+
+void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_mid_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4);
+}
+
+void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 0);
+}
+
+void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 1);
+}
+
+void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 0);
+}
+
+void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 1);
+}
+
+void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 0);
+}
+
+void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 1);
+}
+
+void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hz_and_aver_dst_16x16_msa(src - 2, stride, dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hz_and_aver_dst_8x8_msa(src - 2, stride, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hz_and_aver_dst_4x4_msa(src - 2, stride, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2),
+                                           stride, dst, stride, 0);
+}
+
+void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2),
+                                           stride, dst, stride, 1);
+}
+
+void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2),
+                                         stride, dst, stride, 0);
+}
+
+void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2),
+                                         stride, dst, stride, 1);
+}
+
+void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2),
+                                         stride, dst, stride, 0);
+}
+
+void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2),
+                                         stride, dst, stride, 1);
+}
+
+void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
+                                           src - (stride * 2),
+                                           stride, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
+                                           src - (stride * 2) +
+                                           sizeof(uint8_t), stride,
+                                           dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
+                                           src - (stride * 2),
+                                           stride, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
+                                           src - (stride * 2) +
+                                           sizeof(uint8_t), stride,
+                                           dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
+                                         src - (stride * 2),
+                                         stride, dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
+                                         src - (stride * 2) +
+                                         sizeof(uint8_t), stride, dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
+                                         src - (stride * 2),
+                                         stride, dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
+                                         src - (stride * 2) +
+                                         sizeof(uint8_t), stride, dst, stride);
+}
+
+
+void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
+                                         src - (stride * 2),
+                                         stride, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
+                                         src - (stride * 2) +
+                                         sizeof(uint8_t), stride, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
+                                         src - (stride * 2),
+                                         stride, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
+                                         src - (stride * 2) +
+                                         sizeof(uint8_t), stride, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
+                                           stride, dst, stride, 16, 0);
+}
+
+void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
+                                           stride, dst, stride, 16, 1);
+}
+
+void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
+                                          stride, dst, stride, 8, 0);
+}
+
+void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
+                                          stride, dst, stride, 8, 1);
+}
+
+void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
+                                          stride, dst, stride, 4, 0);
+}
+
+void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
+                                          stride, dst, stride, 4, 1);
+}
+
+void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_vt_and_aver_dst_16x16_msa(src - (stride * 2), stride, dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_vt_and_aver_dst_8x8_msa(src - (stride * 2), stride, dst, stride);
+}
+
+void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_vt_and_aver_dst_4x4_msa(src - (stride * 2), stride, dst, stride);
+}
+
+void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
+                                           stride, dst, stride, 16, 0);
+}
+
+void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
+                                           stride, dst, stride, 16, 1);
+}
+
+void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
+                                          stride, dst, stride, 8, 0);
+}
+
+void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
+                                          stride, dst, stride, 8, 1);
+}
+
+void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
+                                          stride, dst, stride, 4, 0);
+}
+
+void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
+                                          stride, dst, stride, 4, 1);
+}
+
+void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avc_luma_mid_and_aver_dst_16x16_msa(src - (2 * stride) - 2,
+                                        stride, dst, stride);
+}
+
+void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_mid_and_aver_dst_8w_msa(src - (2 * stride) - 2,
+                                     stride, dst, stride, 8);
+}
+
+void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avc_luma_mid_and_aver_dst_4x4_msa(src - (2 * stride) - 2,
+                                      stride, dst, stride);
+}
diff --git a/libavcodec/mips/hevc_idct_msa.c b/libavcodec/mips/hevc_idct_msa.c
new file mode 100644
index 0000000000..975d91f8be
--- /dev/null
+++ b/libavcodec/mips/hevc_idct_msa.c
@@ -0,0 +1,939 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+
+static const int16_t gt8x8_cnst[16] = {
+    64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
+};
+
+static const int16_t gt16x16_cnst[64] = {
+    64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
+    64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
+    64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
+    64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
+};
+
+static const int16_t gt32x32_cnst0[256] = {
+    90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
+    90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
+    88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
+    85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
+    82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
+    78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
+    73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
+    67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
+    61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
+    54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
+    46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
+    38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
+    31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
+    22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
+    13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
+    4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
+};
+
+static const int16_t gt32x32_cnst1[64] = {
+    90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
+    80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
+    57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
+    25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
+};
+
+static const int16_t gt32x32_cnst2[16] = {
+    89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
+};
+
+static const int16_t gt32x32_cnst3[16] = {
+    64, 64, 64, 64, 83, 36, -36, -83, 64, -64, -64, 64, 36, -83, 83, -36
+};
+
+#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1,          \
+                         sum0, sum1, sum2, sum3, shift)       \
+{                                                             \
+    v4i32 vec0, vec1, vec2, vec3, vec4, vec5;                 \
+    v4i32 cnst64 = __msa_ldi_w(64);                           \
+    v4i32 cnst83 = __msa_ldi_w(83);                           \
+    v4i32 cnst36 = __msa_ldi_w(36);                           \
+                                                              \
+    DOTP_SH4_SW(in_r0, in_r1, in_l0, in_l1, cnst64, cnst64,   \
+                cnst83, cnst36, vec0, vec2, vec1, vec3);      \
+    DOTP_SH2_SW(in_l0, in_l1, cnst36, cnst83, vec4, vec5);    \
+                                                              \
+    sum0 = vec0 + vec2;                                       \
+    sum1 = vec0 - vec2;                                       \
+    sum3 = sum0;                                              \
+    sum2 = sum1;                                              \
+                                                              \
+    vec1 += vec3;                                             \
+    vec4 -= vec5;                                             \
+                                                              \
+    sum0 += vec1;                                             \
+    sum1 += vec4;                                             \
+    sum2 -= vec4;                                             \
+    sum3 -= vec1;                                             \
+                                                              \
+    SRARI_W4_SW(sum0, sum1, sum2, sum3, shift);               \
+    SAT_SW4_SW(sum0, sum1, sum2, sum3, 15);                   \
+}
+
+#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
+{                                                                        \
+    v8i16 src0_r, src1_r, src2_r, src3_r;                                \
+    v8i16 src0_l, src1_l, src2_l, src3_l;                                \
+    v8i16 filt0, filter0, filter1, filter2, filter3;                     \
+    v4i32 temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r;          \
+    v4i32 temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l;          \
+    v4i32 sum0_r, sum1_r, sum2_r, sum3_r;                                \
+    v4i32 sum0_l, sum1_l, sum2_l, sum3_l;                                \
+                                                                         \
+    ILVR_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7,                   \
+               src0_r, src1_r, src2_r, src3_r);                          \
+    ILVL_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7,                   \
+               src0_l, src1_l, src2_l, src3_l);                          \
+                                                                         \
+    filt0 = LD_SH(filter);                                               \
+    SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);             \
+    DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0,        \
+                filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l);   \
+                                                                         \
+    BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,      \
+                sum1_l, sum1_r);                                         \
+    sum2_r = sum1_r;                                                     \
+    sum2_l = sum1_l;                                                     \
+    sum3_r = sum0_r;                                                     \
+    sum3_l = sum0_l;                                                     \
+                                                                         \
+    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l,  filter2, filter2,       \
+                filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l);   \
+                                                                         \
+    temp2_r += temp3_r;                                                  \
+    temp2_l += temp3_l;                                                  \
+    sum0_r += temp2_r;                                                   \
+    sum0_l += temp2_l;                                                   \
+    sum3_r -= temp2_r;                                                   \
+    sum3_l -= temp2_l;                                                   \
+                                                                         \
+    SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift);                  \
+    SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15);                      \
+    PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in0, in7);               \
+    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l,  filter3, filter3,       \
+                filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l);   \
+                                                                         \
+    temp4_r -= temp5_r;                                                  \
+    temp4_l -= temp5_l;                                                  \
+    sum1_r += temp4_r;                                                   \
+    sum1_l += temp4_l;                                                   \
+    sum2_r -= temp4_r;                                                   \
+    sum2_l -= temp4_l;                                                   \
+                                                                         \
+    SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift);                  \
+    SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15);                      \
+    PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in3, in4);               \
+                                                                         \
+    filt0 = LD_SH(filter + 8);                                           \
+    SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);             \
+    DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l,  filter0, filter0,       \
+                filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l);   \
+                                                                         \
+    BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,      \
+                sum1_l, sum1_r);                                         \
+    sum2_r = sum1_r;                                                     \
+    sum2_l = sum1_l;                                                     \
+    sum3_r = sum0_r;                                                     \
+    sum3_l = sum0_l;                                                     \
+                                                                         \
+    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2,        \
+                filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l);   \
+                                                                         \
+    temp2_r += temp3_r;                                                  \
+    temp2_l += temp3_l;                                                  \
+    sum0_r += temp2_r;                                                   \
+    sum0_l += temp2_l;                                                   \
+    sum3_r -= temp2_r;                                                   \
+    sum3_l -= temp2_l;                                                   \
+                                                                         \
+    SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift);                  \
+    SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15);                      \
+    PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in1, in6);               \
+    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3,        \
+                filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l);   \
+                                                                         \
+    temp4_r -= temp5_r;                                                  \
+    temp4_l -= temp5_l;                                                  \
+    sum1_r -= temp4_r;                                                   \
+    sum1_l -= temp4_l;                                                   \
+    sum2_r += temp4_r;                                                   \
+    sum2_l += temp4_l;                                                   \
+                                                                         \
+    SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift);                  \
+    SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15);                      \
+    PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in2, in5);               \
+}
+
+#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r,                \
+                           src4_r, src5_r, src6_r, src7_r,                \
+                           src0_l, src1_l, src2_l, src3_l,                \
+                           src4_l, src5_l, src6_l, src7_l, shift)         \
+{                                                                         \
+    int16_t *ptr0, *ptr1;                                                 \
+    v8i16 filt0, filt1, dst0, dst1;                                       \
+    v8i16 filter0, filter1, filter2, filter3;                             \
+    v4i32 temp0_r, temp1_r, temp0_l, temp1_l;                             \
+    v4i32 sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l;         \
+    v4i32 sum3_l, res0_r, res1_r, res0_l, res1_l;                         \
+                                                                          \
+    ptr0 = (buf_ptr + 112);                                               \
+    ptr1 = (buf_ptr + 128);                                               \
+    k = -1;                                                               \
+                                                                          \
+    for (j = 0; j < 4; j++)                                               \
+    {                                                                     \
+        LD_SH2(filter, 8, filt0, filt1)                                   \
+        filter += 16;                                                     \
+        SPLATI_W2_SH(filt0, 0, filter0, filter1);                         \
+        SPLATI_W2_SH(filt1, 0, filter2, filter3);                         \
+        DOTP_SH4_SW(src0_r, src0_l, src4_r, src4_l,  filter0, filter0,    \
+                    filter2, filter2, sum0_r, sum0_l, sum2_r, sum2_l);    \
+        DOTP_SH2_SW(src7_r, src7_l, filter2, filter2, sum3_r, sum3_l);    \
+        DPADD_SH4_SW(src1_r, src1_l, src5_r, src5_l,  filter1, filter1,   \
+                     filter3, filter3, sum0_r, sum0_l, sum2_r, sum2_l);   \
+        DPADD_SH2_SW(src6_r, src6_l, filter3, filter3, sum3_r, sum3_l);   \
+                                                                          \
+        sum1_r = sum0_r;                                                  \
+        sum1_l = sum0_l;                                                  \
+                                                                          \
+        SPLATI_W2_SH(filt0, 2, filter0, filter1);                         \
+        SPLATI_W2_SH(filt1, 2, filter2, filter3);                         \
+        DOTP_SH2_SW(src2_r, src2_l, filter0, filter0, temp0_r, temp0_l);  \
+        DPADD_SH2_SW(src6_r, src6_l, filter2, filter2, sum2_r, sum2_l);   \
+        DOTP_SH2_SW(src5_r, src5_l, filter2, filter2, temp1_r, temp1_l);  \
+                                                                          \
+        sum0_r += temp0_r;                                                \
+        sum0_l += temp0_l;                                                \
+        sum1_r -= temp0_r;                                                \
+        sum1_l -= temp0_l;                                                \
+                                                                          \
+        sum3_r = temp1_r - sum3_r;                                        \
+        sum3_l = temp1_l - sum3_l;                                        \
+                                                                          \
+        DOTP_SH2_SW(src3_r, src3_l, filter1, filter1, temp0_r, temp0_l);  \
+        DPADD_SH4_SW(src7_r, src7_l, src4_r, src4_l, filter3, filter3,    \
+                     filter3, filter3, sum2_r, sum2_l, sum3_r, sum3_l);   \
+                                                                          \
+        sum0_r += temp0_r;                                                \
+        sum0_l += temp0_l;                                                \
+        sum1_r -= temp0_r;                                                \
+        sum1_l -= temp0_l;                                                \
+                                                                          \
+        BUTTERFLY_4(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l,       \
+                    res1_l, res1_r);                                      \
+        SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift);               \
+        SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15);                   \
+        PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1);          \
+        ST_SH(dst0, buf_ptr);                                             \
+        ST_SH(dst1, (buf_ptr + ((15 - (j * 2)) * 16)));                   \
+                                                                          \
+        BUTTERFLY_4(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l,       \
+                    res1_l, res1_r);                                      \
+        SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift);               \
+        SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15);                   \
+        PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1);          \
+        ST_SH(dst0, (ptr0 + (((j / 2 + j % 2) * 2 * k) * 16)));           \
+        ST_SH(dst1, (ptr1 - (((j / 2 + j % 2) * 2 * k) * 16)));           \
+                                                                          \
+        k *= -1;                                                          \
+        buf_ptr += 16;                                                    \
+    }                                                                     \
+}
+
+#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx)  \
+{                                                                     \
+    LD_SW2(input + load_idx * 8, 4, tmp0_r, tmp0_l);                  \
+    tmp1_r = sum0_r;                                                  \
+    tmp1_l = sum0_l;                                                  \
+    sum0_r += tmp0_r;                                                 \
+    sum0_l += tmp0_l;                                                 \
+    ST_SW2(sum0_r, sum0_l, (input + load_idx * 8), 4);                \
+    tmp1_r -= tmp0_r;                                                 \
+    tmp1_l -= tmp0_l;                                                 \
+    ST_SW2(tmp1_r, tmp1_l, (input + store_idx * 8), 4);               \
+}
+
+#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1,     \
+                              res0, res1, res2, res3, shift)  \
+{                                                             \
+    v4i32 vec0, vec1, vec2, vec3;                             \
+    v4i32 cnst74 = __msa_ldi_w(74);                           \
+    v4i32 cnst55 = __msa_ldi_w(55);                           \
+    v4i32 cnst29 = __msa_ldi_w(29);                           \
+                                                              \
+    vec0 = in_r0 + in_r1;                                     \
+    vec2 = in_r0 - in_l1;                                     \
+    res0 = vec0 * cnst29;                                     \
+    res1 = vec2 * cnst55;                                     \
+    res2 = in_r0 - in_r1;                                     \
+    vec1 = in_r1 + in_l1;                                     \
+    res2 += in_l1;                                            \
+    vec3 = in_l0 * cnst74;                                    \
+    res3 = vec0 * cnst55;                                     \
+                                                              \
+    res0 += vec1 * cnst55;                                    \
+    res1 -= vec1 * cnst29;                                    \
+    res2 *= cnst74;                                           \
+    res3 += vec2 * cnst29;                                    \
+                                                              \
+    res0 += vec3;                                             \
+    res1 += vec3;                                             \
+    res3 -= vec3;                                             \
+                                                              \
+    SRARI_W4_SW(res0, res1, res2, res3, shift);               \
+    SAT_SW4_SW(res0, res1, res2, res3, 15);                   \
+}
+
+static void hevc_idct_4x4_msa(int16_t *coeffs)
+{
+    v8i16 in0, in1;
+    v4i32 in_r0, in_l0, in_r1, in_l1;
+    v4i32 sum0, sum1, sum2, sum3;
+    v8i16 zeros = { 0 };
+
+    LD_SH2(coeffs, 8, in0, in1);
+    ILVRL_H2_SW(zeros, in0, in_r0, in_l0);
+    ILVRL_H2_SW(zeros, in1, in_r1, in_l1);
+
+    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
+    TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
+    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);
+    TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, sum0, sum1, sum2, sum3);
+    PCKEV_H2_SH(sum1, sum0, sum3, sum2, in0, in1);
+    ST_SH2(in0, in1, coeffs, 8);
+}
+
+static void hevc_idct_8x8_msa(int16_t *coeffs)
+{
+    int16_t *filter = &gt8x8_cnst[0];
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs, 8);
+}
+
+static void hevc_idct_16x16_msa(int16_t *coeffs)
+{
+    int16_t i, j, k;
+    int16_t buf[256];
+    int16_t *buf_ptr = &buf[0];
+    int16_t *src = coeffs;
+    int16_t *filter = &gt16x16_cnst[0];
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+
+    for (i = 2; i--;) {
+        LD_SH16(src, 16, in0, in1, in2, in3, in4, in5, in6, in7,
+                in8, in9, in10, in11, in12, in13, in14, in15);
+
+        ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
+                   src0_r, src1_r, src2_r, src3_r);
+        ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
+                   src4_r, src5_r, src6_r, src7_r);
+        ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
+                   src0_l, src1_l, src2_l, src3_l);
+        ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
+                   src4_l, src5_l, src6_l, src7_l);
+        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+                           src4_l, src5_l, src6_l, src7_l, 7);
+
+        src += 8;
+        buf_ptr = (&buf[0] + 8);
+        filter = &gt16x16_cnst[0];
+    }
+
+    src = &buf[0];
+    buf_ptr = coeffs;
+    filter = &gt16x16_cnst[0];
+
+    for (i = 2; i--;) {
+        LD_SH16(src, 8, in0, in8, in1, in9, in2, in10, in3, in11,
+                in4, in12, in5, in13, in6, in14, in7, in15);
+        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                           in8, in9, in10, in11, in12, in13, in14, in15);
+        ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
+                   src0_r, src1_r, src2_r, src3_r);
+        ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
+                   src4_r, src5_r, src6_r, src7_r);
+        ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
+                   src0_l, src1_l, src2_l, src3_l);
+        ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
+                   src4_l, src5_l, src6_l, src7_l);
+        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+                           src4_l, src5_l, src6_l, src7_l, 12);
+
+        src += 128;
+        buf_ptr = coeffs + 8;
+        filter = &gt16x16_cnst[0];
+    }
+
+    LD_SH8(coeffs, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, coeffs, 16);
+
+    LD_SH8((coeffs + 8), 16, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    LD_SH8((coeffs + 128), 16, in8, in9, in10, in11, in12, in13, in14, in15);
+    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 128), 16);
+    TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 8), 16);
+
+    LD_SH8((coeffs + 136), 16, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 136), 16);
+}
+
+static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch,
+                                      uint8_t round)
+{
+    uint8_t i;
+    int16_t *filter_ptr0 = &gt32x32_cnst0[0];
+    int16_t *filter_ptr1 = &gt32x32_cnst1[0];
+    int16_t *filter_ptr2 = &gt32x32_cnst2[0];
+    int16_t *filter_ptr3 = &gt32x32_cnst3[0];
+    int16_t *src0 = (coeffs + buf_pitch);
+    int16_t *src1 = (coeffs + 2 * buf_pitch);
+    int16_t *src2 = (coeffs + 4 * buf_pitch);
+    int16_t *src3 = (coeffs);
+    int32_t cnst0, cnst1;
+    int32_t tmp_buf[8 * 32];
+    int32_t *tmp_buf_ptr = &tmp_buf[0];
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+    v8i16 filt0, filter0, filter1, filter2, filter3;
+    v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
+
+    /* process coeff 4, 12, 20, 28 */
+    LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3);
+    ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r);
+    ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l);
+
+    /* loop for all columns of constants */
+    for (i = 0; i < 4; i++) {
+        /* processing single column of constants */
+        cnst0 = LW(filter_ptr2);
+        cnst1 = LW(filter_ptr2 + 2);
+
+        filter0 = (v8i16) __msa_fill_w(cnst0);
+        filter1 = (v8i16) __msa_fill_w(cnst1);
+
+        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
+        ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + i * 8), 4);
+
+        filter_ptr2 += 4;
+    }
+
+    /* process coeff 0, 8, 16, 24 */
+    LD_SH2(src3, 16 * buf_pitch, in0, in2);
+    LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in1, in3);
+
+    ILVR_H2_SH(in2, in0, in3, in1, src0_r, src1_r);
+    ILVL_H2_SH(in2, in0, in3, in1, src0_l, src1_l);
+
+    /* loop for all columns of constants */
+    for (i = 0; i < 2; i++) {
+        /* processing first column of filter constants */
+        cnst0 = LW(filter_ptr3);
+        cnst1 = LW(filter_ptr3 + 4);
+
+        filter0 = (v8i16) __msa_fill_w(cnst0);
+        filter1 = (v8i16) __msa_fill_w(cnst1);
+
+        DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0, filter1,
+                    filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+
+        sum1_r = sum0_r;
+        sum1_l = sum0_l;
+        sum0_r += tmp1_r;
+        sum0_l += tmp1_l;
+
+        sum1_r -= tmp1_r;
+        sum1_l -= tmp1_l;
+
+        HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, i, (7 - i));
+        HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, (3 - i), (4 + i));
+
+        filter_ptr3 += 8;
+    }
+
+    /* process coeff 2 6 10 14 18 22 26 30 */
+    LD_SH8(src1, 4 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src0_r, src1_r, src2_r, src3_r);
+    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src0_l, src1_l, src2_l, src3_l);
+
+    /* loop for all columns of constants */
+    for (i = 0; i < 8; i++) {
+        /* processing single column of constants */
+        filt0 = LD_SH(filter_ptr1);
+        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
+        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
+                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
+        DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);
+
+        LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
+        tmp1_r = tmp0_r;
+        tmp1_l = tmp0_l;
+        tmp0_r += sum0_r;
+        tmp0_l += sum0_l;
+        ST_SW2(tmp0_r, tmp0_l, (tmp_buf_ptr + i * 8), 4);
+        tmp1_r -= sum0_r;
+        tmp1_l -= sum0_l;
+        ST_SW2(tmp1_r, tmp1_l, (tmp_buf_ptr + (15 - i) * 8), 4);
+
+        filter_ptr1 += 8;
+    }
+
+    /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
+    LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
+    src0 += 16 * buf_pitch;
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src0_r, src1_r, src2_r, src3_r);
+    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src0_l, src1_l, src2_l, src3_l);
+
+    LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src4_r, src5_r, src6_r, src7_r);
+    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src4_l, src5_l, src6_l, src7_l);
+
+    /* loop for all columns of filter constants */
+    for (i = 0; i < 16; i++) {
+        /* processing single column of constants */
+        filt0 = LD_SH(filter_ptr0);
+        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
+        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
+                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
+        DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);
+
+        tmp1_r = sum0_r;
+        tmp1_l = sum0_l;
+
+        filt0 = LD_SH(filter_ptr0 + 8);
+        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
+        DOTP_SH2_SW(src4_r, src4_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH4_SW(src5_r, src5_l, src6_r, src6_l, filter1, filter1, filter2,
+                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
+        DPADD_SH2_SW(src7_r, src7_l, filter3, filter3, sum0_r, sum0_l);
+
+        sum0_r += tmp1_r;
+        sum0_l += tmp1_l;
+
+        LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
+        tmp1_r = tmp0_r;
+        tmp1_l = tmp0_l;
+        tmp0_r += sum0_r;
+        tmp0_l += sum0_l;
+        sum1_r = __msa_fill_w(round);
+        SRAR_W2_SW(tmp0_r, tmp0_l, sum1_r);
+        SAT_SW2_SW(tmp0_r, tmp0_l, 15);
+        in0 = __msa_pckev_h((v8i16) tmp0_l, (v8i16) tmp0_r);
+        ST_SH(in0, (coeffs + i * buf_pitch));
+        tmp1_r -= sum0_r;
+        tmp1_l -= sum0_l;
+        SRAR_W2_SW(tmp1_r, tmp1_l, sum1_r);
+        SAT_SW2_SW(tmp1_r, tmp1_l, 15);
+        in0 = __msa_pckev_h((v8i16) tmp1_l, (v8i16) tmp1_r);
+        ST_SH(in0, (coeffs + (31 - i) * buf_pitch));
+
+        filter_ptr0 += 16;
+    }
+}
+
+static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
+{
+    uint8_t i;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (i = 0; i < 4; i++) {
+        LD_SH8(coeffs + i * 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, tmp_buf + i * 8 * 8, 8);
+    }
+}
+
+static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
+{
+    uint8_t i;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (i = 0; i < 4; i++) {
+        LD_SH8(tmp_buf + i * 8 * 8, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs + i * 8, 32);
+    }
+}
+
+static void hevc_idct_32x32_msa(int16_t *coeffs)
+{
+    uint8_t row_cnt, col_cnt;
+    int16_t *src = coeffs;
+    int16_t tmp_buf[8 * 32];
+    int16_t *tmp_buf_ptr = &tmp_buf[0];
+    uint8_t round;
+    uint8_t buf_pitch;
+
+    /* column transform */
+    round = 7;
+    buf_pitch = 32;
+    for (col_cnt = 0; col_cnt < 4; col_cnt++) {
+        /* process 8x32 blocks */
+        hevc_idct_8x32_column_msa((coeffs + col_cnt * 8), buf_pitch, round);
+    }
+
+    /* row transform */
+    round = 12;
+    buf_pitch = 8;
+    for (row_cnt = 0; row_cnt < 4; row_cnt++) {
+        /* process 32x8 blocks */
+        src = (coeffs + 32 * 8 * row_cnt);
+
+        hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr);
+        hevc_idct_8x32_column_msa(tmp_buf_ptr, buf_pitch, round);
+        hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src);
+    }
+}
+
+static void hevc_idct_dc_4x4_msa(int16_t *coeffs)
+{
+    int32_t val;
+    v8i16 dst;
+
+    val = (coeffs[0] + 1) >> 1;
+    val = (val + 32) >> 6;
+    dst = __msa_fill_h(val);
+
+    ST_SH2(dst, dst, coeffs, 8);
+}
+
+static void hevc_idct_dc_8x8_msa(int16_t *coeffs)
+{
+    int32_t val;
+    v8i16 dst;
+
+    val = (coeffs[0] + 1) >> 1;
+    val = (val + 32) >> 6;
+    dst = __msa_fill_h(val);
+
+    ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
+}
+
+static void hevc_idct_dc_16x16_msa(int16_t *coeffs)
+{
+    uint8_t loop;
+    int32_t val;
+    v8i16 dst;
+
+    val = (coeffs[0] + 1) >> 1;
+    val = (val + 32) >> 6;
+    dst = __msa_fill_h(val);
+
+    for (loop = 4; loop--;) {
+        ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
+        coeffs += 8 * 8;
+    }
+}
+
+static void hevc_idct_dc_32x32_msa(int16_t *coeffs)
+{
+    uint8_t loop;
+    int32_t val;
+    v8i16 dst;
+
+    val = (coeffs[0] + 1) >> 1;
+    val = (val + 32) >> 6;
+    dst = __msa_fill_h(val);
+
+    for (loop = 16; loop--;) {
+        ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
+        coeffs += 8 * 8;
+    }
+}
+
+static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
+{
+    uint32_t dst0, dst1, dst2, dst3;
+    v8i16 dst_r0, dst_l0, in0, in1;
+    v4i32 dst_vec = { 0 };
+    v16u8 zeros = { 0 };
+
+    LD_SH2(coeffs, 8, in0, in1);
+    LW4(dst, stride, dst0, dst1, dst2, dst3);
+    INSERT_W4_SW(dst0, dst1, dst2, dst3, dst_vec);
+    ILVRL_B2_SH(zeros, dst_vec, dst_r0, dst_l0);
+    ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
+    CLIP_SH2_0_255(dst_r0, dst_l0);
+    dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
+    ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride);
+}
+
+static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
+{
+    uint8_t *temp_dst = dst;
+    uint64_t dst0, dst1, dst2, dst3;
+    v2i64 dst_vec0 = { 0 };
+    v2i64 dst_vec1 = { 0 };
+    v8i16 dst_r0, dst_l0, dst_r1, dst_l1;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16u8 zeros = { 0 };
+
+    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
+    temp_dst += (4 * stride);
+
+    INSERT_D2_SD(dst0, dst1, dst_vec0);
+    INSERT_D2_SD(dst2, dst3, dst_vec1);
+    ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0);
+    ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1);
+    ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3,
+         dst_r0, dst_l0, dst_r1, dst_l1);
+    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
+    ST8x4_UB(dst_r0, dst_r1, dst, stride);
+    dst += (4 * stride);
+
+    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
+    INSERT_D2_SD(dst0, dst1, dst_vec0);
+    INSERT_D2_SD(dst2, dst3, dst_vec1);
+    UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0);
+    UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1);
+    ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7,
+         dst_r0, dst_l0, dst_r1, dst_l1);
+    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
+    ST8x4_UB(dst_r0, dst_r1, dst, stride);
+}
+
+static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
+{
+    uint8_t loop_cnt;
+    uint8_t *temp_dst = dst;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SH4(coeffs, 16, in0, in2, in4, in6);
+        LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
+        coeffs += 64;
+        LD_UB4(temp_dst, stride, dst0, dst1, dst2, dst3);
+        temp_dst += (4 * stride);
+
+        UNPCK_UB_SH(dst0, dst_r0, dst_l0);
+        UNPCK_UB_SH(dst1, dst_r1, dst_l1);
+        UNPCK_UB_SH(dst2, dst_r2, dst_l2);
+        UNPCK_UB_SH(dst3, dst_r3, dst_l3);
+
+        dst_r0 += in0;
+        dst_l0 += in1;
+        dst_r1 += in2;
+        dst_l1 += in3;
+        dst_r2 += in4;
+        dst_l2 += in5;
+        dst_r3 += in6;
+        dst_l3 += in7;
+
+        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
+                    dst_r3, dst0, dst1, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
+{
+    uint8_t loop_cnt;
+    uint8_t *temp_dst = dst;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (loop_cnt = 8; loop_cnt--;) {
+        LD_SH4(coeffs, 32, in0, in2, in4, in6);
+        LD_SH4((coeffs + 8), 32, in1, in3, in5, in7);
+        LD_UB4(temp_dst, stride, dst0, dst1, dst2, dst3);
+
+        UNPCK_UB_SH(dst0, dst_r0, dst_l0);
+        UNPCK_UB_SH(dst1, dst_r1, dst_l1);
+        UNPCK_UB_SH(dst2, dst_r2, dst_l2);
+        UNPCK_UB_SH(dst3, dst_r3, dst_l3);
+
+        dst_r0 += in0;
+        dst_l0 += in1;
+        dst_r1 += in2;
+        dst_l1 += in3;
+        dst_r2 += in4;
+        dst_l2 += in5;
+        dst_r3 += in6;
+        dst_l3 += in7;
+
+        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
+                    dst_r3, dst0, dst1, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
+
+        LD_SH4((coeffs + 16), 32, in0, in2, in4, in6);
+        LD_SH4((coeffs + 24), 32, in1, in3, in5, in7);
+        coeffs += 128;
+        LD_UB4((temp_dst + 16), stride, dst0, dst1, dst2, dst3);
+        temp_dst += (4 * stride);
+
+        UNPCK_UB_SH(dst0, dst_r0, dst_l0);
+        UNPCK_UB_SH(dst1, dst_r1, dst_l1);
+        UNPCK_UB_SH(dst2, dst_r2, dst_l2);
+        UNPCK_UB_SH(dst3, dst_r3, dst_l3);
+
+        dst_r0 += in0;
+        dst_l0 += in1;
+        dst_r1 += in2;
+        dst_l1 += in3;
+        dst_r2 += in4;
+        dst_l2 += in5;
+        dst_r3 += in6;
+        dst_l3 += in7;
+
+        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
+                    dst_r3, dst0, dst1, dst2, dst3);
+
+        ST_UB4(dst0, dst1, dst2, dst3, (dst + 16), stride);
+        dst += (4 * stride);
+    }
+}
+
+static void hevc_idct_luma_4x4_msa(int16_t *coeffs)
+{
+    v8i16 in0, in1, dst0, dst1;
+    v4i32 in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3;
+
+    LD_SH2(coeffs, 8, in0, in1);
+    UNPCK_SH_SW(in0, in_r0, in_l0);
+    UNPCK_SH_SW(in1, in_r1, in_l1);
+    HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
+                          7);
+    TRANSPOSE4x4_SW_SW(res0, res1, res2, res3, in_r0, in_l0, in_r1, in_l1);
+    HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
+                          12);
+    TRANSPOSE4x4_SW_SW(res0, res1, res2, res3, res0, res1, res2, res3);
+    PCKEV_H2_SH(res1, res0, res3, res2, dst0, dst1);
+    ST_SH2(dst0, dst1, coeffs, 8);
+}
+
+void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit)
+{
+    hevc_idct_4x4_msa(coeffs);
+}
+
+void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit)
+{
+    hevc_idct_8x8_msa(coeffs);
+}
+
+void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit)
+{
+    hevc_idct_16x16_msa(coeffs);
+}
+
+void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit)
+{
+    hevc_idct_32x32_msa(coeffs);
+}
+
+void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    hevc_addblk_4x4_msa(coeffs, dst, stride);
+}
+
+void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    hevc_addblk_8x8_msa(coeffs, dst, stride);
+}
+
+void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    hevc_addblk_16x16_msa(coeffs, dst, stride);
+}
+
+void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    hevc_addblk_32x32_msa(coeffs, dst, stride);
+}
+
+void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs)
+{
+    hevc_idct_dc_4x4_msa(coeffs);
+}
+
+void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs)
+{
+    hevc_idct_dc_8x8_msa(coeffs);
+}
+
+void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs)
+{
+    hevc_idct_dc_16x16_msa(coeffs);
+}
+
+void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs)
+{
+    hevc_idct_dc_32x32_msa(coeffs);
+}
+
+void ff_hevc_idct_luma_4x4_msa(int16_t *coeffs)
+{
+    hevc_idct_luma_4x4_msa(coeffs);
+}
diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c
new file mode 100644
index 0000000000..da1db51ef5
--- /dev/null
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -0,0 +1,2088 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+
+static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
+                                         int32_t beta, int32_t *tc,
+                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{
+    uint8_t *p3 = src - (stride << 2);
+    uint8_t *p2 = src - ((stride << 1) + stride);
+    uint8_t *p1 = src - (stride << 1);
+    uint8_t *p0 = src - stride;
+    uint8_t *q0 = src;
+    uint8_t *q1 = src + stride;
+    uint8_t *q2 = src + (stride << 1);
+    uint8_t *q3 = src + (stride << 1) + stride;
+    uint8_t flag0, flag1;
+    int32_t dp00, dq00, dp30, dq30, d00, d30;
+    int32_t dp04, dq04, dp34, dq34, d04, d34;
+    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+    uint64_t dst_val0, dst_val1;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
+    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+    v8u16 temp0, temp1;
+    v8i16 temp2;
+    v8i16 tc_pos, tc_neg;
+    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
+    v16i8 zero = { 0 };
+    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+
+    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
+    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
+    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
+    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
+    d00 = dp00 + dq00;
+    d30 = dp30 + dq30;
+    p_is_pcm0 = p_is_pcm[0];
+    q_is_pcm0 = q_is_pcm[0];
+    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
+    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
+    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
+    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
+    d04 = dp04 + dq04;
+    d34 = dp34 + dq34;
+    p_is_pcm4 = p_is_pcm[1];
+    q_is_pcm4 = q_is_pcm[1];
+
+    if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) {
+        if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) {
+            p3_src = LD_UH(p3);
+            p2_src = LD_UH(p2);
+            p1_src = LD_UH(p1);
+            p0_src = LD_UH(p0);
+            q0_src = LD_UH(q0);
+            q1_src = LD_UH(q1);
+            q2_src = LD_UH(q2);
+            q3_src = LD_UH(q3);
+
+            tc0 = tc[0];
+            beta30 = beta >> 3;
+            beta20 = beta >> 2;
+            tc250 = ((tc0 * 5 + 1) >> 1);
+            tc4 = tc[1];
+            tc254 = ((tc4 * 5 + 1) >> 1);
+
+            flag0 = (abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
+                     abs(p0[0] - q0[0]) < tc250 &&
+                     abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
+                     abs(p0[3] - q0[3]) < tc250 &&
+                     (d00 << 1) < beta20 && (d30 << 1) < beta20);
+            cmp0 = __msa_fill_d(flag0);
+
+            flag1 = (abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
+                     abs(p0[4] - q0[4]) < tc254 &&
+                     abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
+                     abs(p0[7] - q0[7]) < tc254 &&
+                     (d04 << 1) < beta20 && (d34 << 1) < beta20);
+            cmp1 = __msa_fill_d(flag1);
+            cmp2 = __msa_ilvev_d(cmp1, cmp0);
+            cmp2 = __msa_ceqi_d(cmp2, 0);
+
+            ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
+                       zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
+                       p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
+                       q3_src);
+
+            cmp0 = (v2i64) __msa_fill_h(tc0);
+            cmp1 = (v2i64) __msa_fill_h(tc4);
+            tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+            tc_pos <<= 1;
+            tc_neg = -tc_pos;
+
+            temp0 = (p1_src + p0_src + q0_src);
+            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst0 = (v16u8) (temp2 + (v8i16) p2_src);
+
+            temp1 = temp0 + p2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - p1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst1 = (v16u8) (temp2 + (v8i16) p1_src);
+
+            temp1 = (temp0 << 1) + p2_src + q1_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst2 = (v16u8) (temp2 + (v8i16) p0_src);
+
+            cmp0 = __msa_fill_d(p_is_pcm0);
+            cmp1 = __msa_fill_d(p_is_pcm4);
+            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
+            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
+
+            temp0 = (q1_src + p0_src + q0_src);
+
+            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst5 = (v16u8) (temp2 + (v8i16) q2_src);
+
+            temp1 = temp0 + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - q1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst4 = (v16u8) (temp2 + (v8i16) q1_src);
+
+            temp1 = (temp0 << 1) + p1_src + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst3 = (v16u8) (temp2 + (v8i16) q0_src);
+
+            cmp0 = __msa_fill_d(q_is_pcm0);
+            cmp1 = __msa_fill_d(q_is_pcm4);
+            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
+            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
+            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
+
+            tc_pos >>= 1;
+            tc_neg = -tc_pos;
+
+            diff0 = (v8i16) (q0_src - p0_src);
+            diff1 = (v8i16) (q1_src - p1_src);
+            diff0 = (diff0 << 3) + diff0;
+            diff1 = (diff1 << 1) + diff1;
+            delta0 = diff0 - diff1;
+            delta0 = __msa_srari_h(delta0, 4);
+
+            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
+            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
+            abs_delta0 = (v8u16) abs_delta0 < temp1;
+
+            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+
+            temp0 = (v8u16) (delta0 + p0_src);
+            temp0 = (v8u16) CLIP_SH_0_255(temp0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) p_is_pcm_vec);
+
+            temp2 = (v8i16) (q0_src - delta0);
+            temp2 = CLIP_SH_0_255(temp2);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) q_is_pcm_vec);
+
+            tmp = (beta + (beta >> 1)) >> 3;
+            cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
+            cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
+            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
+            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
+            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+            tc_pos >>= 1;
+            tc_neg = -tc_pos;
+
+            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
+            delta1 -= (v8i16) p1_src;
+            delta1 += delta0;
+            delta1 >>= 1;
+            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+            delta1 = (v8i16) p1_src + (v8i16) delta1;
+            delta1 = CLIP_SH_0_255(delta1);
+            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
+                                          (v16u8) p_is_pcm_vec);
+
+            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
+            delta2 = delta2 - (v8i16) q1_src;
+            delta2 = delta2 - delta0;
+            delta2 = delta2 >> 1;
+            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+            delta2 = (v8i16) q1_src + (v8i16) delta2;
+            delta2 = CLIP_SH_0_255(delta2);
+            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
+                                          (v16u8) q_is_pcm_vec);
+
+            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
+                                         (v16u8) abs_delta0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) abs_delta0);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) abs_delta0);
+            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
+                                         (v16u8) abs_delta0);
+
+            dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
+            dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
+            dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
+            dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
+            dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
+            dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
+
+            cmp0 = __msa_fill_d(d00 + d30 >= beta);
+            cmp1 = __msa_fill_d(d04 + d34 >= beta);
+            cmp0 = __msa_ilvev_d(cmp1, cmp0);
+            cmp0 = __msa_ceqi_d(cmp0, 0);
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp0);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp0);
+            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp0);
+            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp0);
+            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp0);
+            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp0);
+
+            PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+            dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+
+            dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
+            dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
+
+            ST8x4_UB(dst0, dst1, p2, stride);
+            p2 += (4 * stride);
+            SD(dst_val0, p2);
+            p2 += stride;
+            SD(dst_val1, p2);
+        }
+    }
+}
+
+static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
+                                         int32_t beta, int32_t *tc,
+                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{
+    uint8_t *p3 = src;
+    uint8_t *p2 = src + 3 * stride;
+    uint8_t *p1 = src + (stride << 2);
+    uint8_t *p0 = src + 7 * stride;
+    uint8_t flag0, flag1;
+    uint16_t tmp0, tmp1;
+    uint32_t tmp2, tmp3;
+    int32_t dp00, dq00, dp30, dq30, d00, d30;
+    int32_t dp04, dq04, dp34, dq34, d04, d34;
+    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+    v8u16 temp0, temp1;
+    v8i16 temp2;
+    v8i16 tc_pos, tc_neg;
+    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
+    v16i8 zero = { 0 };
+    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+
+    dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
+    dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
+    dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
+    dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
+    d00 = dp00 + dq00;
+    d30 = dp30 + dq30;
+    p_is_pcm0 = p_is_pcm[0];
+    q_is_pcm0 = q_is_pcm[0];
+
+    dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
+    dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
+    dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
+    dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
+    d04 = dp04 + dq04;
+    d34 = dp34 + dq34;
+    p_is_pcm4 = p_is_pcm[1];
+    q_is_pcm4 = q_is_pcm[1];
+
+    if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) {
+        if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) {
+            src -= 4;
+            LD_UH8(src, stride,
+                   p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
+                   q3_src);
+
+            tc0 = tc[0];
+            beta30 = beta >> 3;
+            beta20 = beta >> 2;
+            tc250 = ((tc0 * 5 + 1) >> 1);
+
+            tc4 = tc[1];
+            tc254 = ((tc4 * 5 + 1) >> 1);
+
+            TRANSPOSE8x8_UB_UH(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
+                               q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
+                               q0_src, q1_src, q2_src, q3_src);
+
+            flag0 = (abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
+                     abs(p3[-1] - p3[0]) < tc250 &&
+                     abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
+                     abs(p2[-1] - p2[0]) < tc250 &&
+                     (d00 << 1) < beta20 && (d30 << 1) < beta20);
+            cmp0 = __msa_fill_d(flag0);
+
+            flag1 = (abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
+                     abs(p1[-1] - p1[0]) < tc254 &&
+                     abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
+                     abs(p0[-1] - p0[0]) < tc254 &&
+                     (d04 << 1) < beta20 && (d34 << 1) < beta20);
+            cmp1 = __msa_fill_d(flag1);
+            cmp2 = __msa_ilvev_d(cmp1, cmp0);
+            cmp2 = __msa_ceqi_d(cmp2, 0);
+
+            ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
+                       zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
+                       p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
+                       q3_src);
+
+            cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
+            cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
+            tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+            tc_neg = -tc_pos;
+
+            temp0 = (p1_src + p0_src + q0_src);
+
+            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst0 = (v16u8) (temp2 + (v8i16) p2_src);
+
+            temp1 = temp0 + p2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - p1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst1 = (v16u8) (temp2 + (v8i16) p1_src);
+
+            temp1 = (temp0 << 1) + p2_src + q1_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst2 = (v16u8) (temp2 + (v8i16) p0_src);
+
+            cmp0 = __msa_fill_d(p_is_pcm0);
+            cmp1 = __msa_fill_d(p_is_pcm4);
+            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
+            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
+
+            temp0 = (q1_src + p0_src + q0_src);
+            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst5 = (v16u8) (temp2 + (v8i16) q2_src);
+
+            temp1 = temp0 + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - q1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst4 = (v16u8) (temp2 + (v8i16) q1_src);
+
+            temp1 = (temp0 << 1) + p1_src + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst3 = (v16u8) (temp2 + (v8i16) q0_src);
+
+            cmp0 = __msa_fill_d(q_is_pcm0);
+            cmp1 = __msa_fill_d(q_is_pcm4);
+            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
+            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
+            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
+
+            tc_pos >>= 1;
+            tc_neg = -tc_pos;
+
+            diff0 = (v8i16) (q0_src - p0_src);
+            diff1 = (v8i16) (q1_src - p1_src);
+            diff0 = (v8i16) (diff0 << 3) + diff0;
+            diff1 = (v8i16) (diff1 << 1) + diff1;
+            delta0 = diff0 - diff1;
+            delta0 = __msa_srari_h(delta0, 4);
+
+            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
+            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
+            abs_delta0 = (v8u16) abs_delta0 < temp1;
+
+            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+            temp0 = (v8u16) delta0 + p0_src;
+            temp0 = (v8u16) CLIP_SH_0_255(temp0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) p_is_pcm_vec);
+
+            temp2 = (v8i16) q0_src - delta0;
+            temp2 = CLIP_SH_0_255(temp2);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) q_is_pcm_vec);
+
+            tmp = ((beta + (beta >> 1)) >> 3);
+            cmp0 = __msa_fill_d(!p_is_pcm0 && (dp00 + dp30 < tmp));
+            cmp1 = __msa_fill_d(!p_is_pcm4 && (dp04 + dp34 < tmp));
+            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
+            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
+            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+            tc_pos >>= 1;
+            tc_neg = -tc_pos;
+
+            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
+            delta1 -= (v8i16) p1_src;
+            delta1 += delta0;
+            delta1 >>= 1;
+            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+            delta1 = (v8i16) p1_src + (v8i16) delta1;
+            delta1 = CLIP_SH_0_255(delta1);
+            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
+                                          (v16u8) p_is_pcm_vec);
+
+            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
+            delta2 = delta2 - (v8i16) q1_src;
+            delta2 = delta2 - delta0;
+            delta2 = delta2 >> 1;
+            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+            delta2 = (v8i16) q1_src + (v8i16) delta2;
+            delta2 = CLIP_SH_0_255(delta2);
+            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
+                                          (v16u8) q_is_pcm_vec);
+            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
+                                         (v16u8) abs_delta0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) abs_delta0);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) abs_delta0);
+            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
+                                         (v16u8) abs_delta0);
+
+            dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
+            dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
+            dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
+            dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
+            dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
+            dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
+
+            cmp0 = __msa_fill_d(d00 + d30 >= beta);
+            dst7 = (v16u8) __msa_fill_d(d04 + d34 >= beta);
+            cmp0 = __msa_ilvev_d((v2i64) dst7, cmp0);
+            dst6 = (v16u8) __msa_ceqi_d(cmp0, 0);
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, dst6);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, dst6);
+            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, dst6);
+            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, dst6);
+            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, dst6);
+            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, dst6);
+
+            PCKEV_B4_UB(dst0, dst0, dst1, dst1, dst2, dst2, dst3, dst3,
+                        dst0, dst1, dst2, dst3);
+            PCKEV_B2_UB(dst4, dst4, dst5, dst5, dst4, dst5);
+
+            TRANSPOSE8x8_UB_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
+                               dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+            src += 1;
+
+            tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
+            tmp0 = __msa_copy_u_h((v8i16) dst0, 2);
+            tmp3 = __msa_copy_u_w((v4i32) dst1, 0);
+            tmp1 = __msa_copy_u_h((v8i16) dst1, 2);
+            SW(tmp2, src);
+            SH(tmp0, src + 4);
+            src += stride;
+            SW(tmp3, src);
+            SH(tmp1, src + 4);
+            src += stride;
+
+            tmp2 = __msa_copy_u_w((v4i32) dst2, 0);
+            tmp0 = __msa_copy_u_h((v8i16) dst2, 2);
+            tmp3 = __msa_copy_u_w((v4i32) dst3, 0);
+            tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
+            SW(tmp2, src);
+            SH(tmp0, src + 4);
+            src += stride;
+            SW(tmp3, src);
+            SH(tmp1, src + 4);
+            src += stride;
+
+            tmp2 = __msa_copy_u_w((v4i32) dst4, 0);
+            tmp0 = __msa_copy_u_h((v8i16) dst4, 2);
+            tmp3 = __msa_copy_u_w((v4i32) dst5, 0);
+            tmp1 = __msa_copy_u_h((v8i16) dst5, 2);
+            SW(tmp2, src);
+            SH(tmp0, src + 4);
+            src += stride;
+            SW(tmp3, src);
+            SH(tmp1, src + 4);
+            src += stride;
+
+            tmp2 = __msa_copy_u_w((v4i32) dst6, 0);
+            tmp0 = __msa_copy_u_h((v8i16) dst6, 2);
+            tmp3 = __msa_copy_u_w((v4i32) dst7, 0);
+            tmp1 = __msa_copy_u_h((v8i16) dst7, 2);
+            SW(tmp2, src);
+            SH(tmp0, src + 4);
+            src += stride;
+            SW(tmp3, src);
+            SH(tmp1, src + 4);
+        }
+    }
+}
+
+static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
+                                           int32_t *tc, uint8_t *p_is_pcm,
+                                           uint8_t *q_is_pcm)
+{
+    uint8_t *p1_ptr = src - (stride << 1);
+    uint8_t *p0_ptr = src - stride;
+    uint8_t *q0_ptr = src;
+    uint8_t *q1_ptr = src + stride;
+    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+    v8u16 p1, p0, q0, q1;
+    v8i16 tc_pos, tc_neg;
+    v16i8 zero = { 0 };
+    v8i16 temp0, temp1, delta;
+
+    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
+        cmp0 = (v2i64) __msa_fill_h(tc[0]);
+        cmp1 = (v2i64) __msa_fill_h(tc[1]);
+        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+        tc_neg = -tc_pos;
+
+        cmp0 = __msa_fill_d(p_is_pcm[0]);
+        cmp1 = __msa_fill_d(p_is_pcm[1]);
+        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+        cmp0 = __msa_fill_d(q_is_pcm[0]);
+        cmp1 = __msa_fill_d(q_is_pcm[1]);
+        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+        p1 = LD_UH(p1_ptr);
+        p0 = LD_UH(p0_ptr);
+        q0 = LD_UH(q0_ptr);
+        q1 = LD_UH(q1_ptr);
+
+        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);
+
+        temp0 = (v8i16) (q0 - p0);
+        temp1 = (v8i16) (p1 - q1);
+        temp0 <<= 2;
+        temp0 += temp1;
+        delta = __msa_srari_h((v8i16) temp0, 3);
+        delta = CLIP_SH(delta, tc_neg, tc_pos);
+
+        temp0 = (v8i16) ((v8i16) p0 + delta);
+        temp0 = CLIP_SH_0_255(temp0);
+        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
+                                    (v16u8) p_is_pcm_vec);
+
+        temp1 = (v8i16) ((v8i16) q0 - delta);
+        temp1 = CLIP_SH_0_255(temp1);
+        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
+                                    (v16u8) q_is_pcm_vec);
+
+        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
+        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
+        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
+
+        temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
+        ST8x2_UB(temp0, p0_ptr, stride);
+    }
+}
+
+static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
+                                           int32_t *tc, uint8_t *p_is_pcm,
+                                           uint8_t *q_is_pcm)
+{
+    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8u16 p1, p0, q0, q1;
+    v8i16 tc_pos, tc_neg;
+    v16i8 zero = { 0 };
+    v8i16 temp0, temp1, delta;
+
+    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
+        cmp0 = (v2i64) __msa_fill_h(tc[0]);
+        cmp1 = (v2i64) __msa_fill_h(tc[1]);
+        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+        tc_neg = -tc_pos;
+
+        cmp0 = __msa_fill_d(p_is_pcm[0]);
+        cmp1 = __msa_fill_d(p_is_pcm[1]);
+        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+        cmp0 = __msa_fill_d(q_is_pcm[0]);
+        cmp1 = __msa_fill_d(q_is_pcm[1]);
+        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+        src -= 2;
+        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        TRANSPOSE8x4_UB_UH(src0, src1, src2, src3, src4, src5, src6, src7,
+                           p1, p0, q0, q1);
+        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);
+
+        temp0 = (v8i16) (q0 - p0);
+        temp1 = (v8i16) (p1 - q1);
+        temp0 <<= 2;
+        temp0 += temp1;
+        delta = __msa_srari_h((v8i16) temp0, 3);
+        delta = CLIP_SH(delta, tc_neg, tc_pos);
+
+        temp0 = (v8i16) ((v8i16) p0 + delta);
+        temp0 = CLIP_SH_0_255(temp0);
+        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
+                                    (v16u8) p_is_pcm_vec);
+
+        temp1 = (v8i16) ((v8i16) q0 - delta);
+        temp1 = CLIP_SH_0_255(temp1);
+        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
+                                    (v16u8) q_is_pcm_vec);
+
+        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
+        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
+        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
+
+        temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
+
+        src += 1;
+        ST2x4_UB(temp0, 0, src, stride);
+        src += (4 * stride);
+        ST2x4_UB(temp0, 4, src, stride);
+    }
+}
+
+static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
+                                            uint8_t *src, int32_t src_stride,
+                                            int32_t sao_left_class,
+                                            int16_t *sao_offset_val,
+                                            int32_t height)
+{
+    int32_t h_cnt;
+    v16u8 src0, src1, src2, src3;
+    v16i8 src0_r, src1_r;
+    v16i8 offset, offset_val, mask;
+    v16i8 offset0 = { 0 };
+    v16i8 offset1 = { 0 };
+    v16i8 zero = { 0 };
+    v8i16 temp0, temp1, dst0, dst1;
+
+    offset_val = LD_SB(sao_offset_val + 1);
+    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
+
+    offset_val = __msa_pckev_b(offset_val, offset_val);
+    offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
+    offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
+    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
+
+    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
+        SWAP(offset0, offset1);
+    }
+
+    for (h_cnt = height >> 2; h_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);
+
+        src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
+        mask = __msa_srli_b(src0_r, 3);
+        offset = __msa_vshf_b(mask, offset1, offset0);
+
+        UNPCK_SB_SH(offset, temp0, temp1);
+        ILVRL_B2_SH(zero, src0_r, dst0, dst1);
+        ADD2(dst0, temp0, dst1, temp1, dst0, dst1);
+        CLIP_SH2_0_255(dst0, dst1);
+        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
+                                            uint8_t *src, int32_t src_stride,
+                                            int32_t sao_left_class,
+                                            int16_t *sao_offset_val,
+                                            int32_t height)
+{
+    int32_t h_cnt;
+    v16u8 src0, src1, src2, src3;
+    v16i8 src0_r, src1_r, mask0, mask1;
+    v16i8 offset, offset_val;
+    v16i8 offset0 = { 0 };
+    v16i8 offset1 = { 0 };
+    v16i8 zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 temp0, temp1, temp2, temp3;
+
+    offset_val = LD_SB(sao_offset_val + 1);
+    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
+    offset_val = __msa_pckev_b(offset_val, offset_val);
+    offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
+    offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
+    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
+
+    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
+        SWAP(offset0, offset1);
+    }
+
+    for (h_cnt = height >> 2; h_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);
+
+        mask0 = __msa_srli_b(src0_r, 3);
+        mask1 = __msa_srli_b(src1_r, 3);
+
+        offset = __msa_vshf_b(mask0, offset1, offset0);
+        UNPCK_SB_SH(offset, temp0, temp1);
+
+        offset = __msa_vshf_b(mask1, offset1, offset0);
+        UNPCK_SB_SH(offset, temp2, temp3);
+
+        UNPCK_UB_SH(src0_r, dst0, dst1);
+        UNPCK_UB_SH(src1_r, dst2, dst3);
+        ADD4(dst0, temp0, dst1, temp1, dst2, temp2, dst3, temp3,
+             dst0, dst1, dst2, dst3);
+        CLIP_SH4_0_255(dst0, dst1, dst2, dst3);
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst2);
+        ST8x4_UB(dst0, dst2, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,
+                                                int32_t dst_stride,
+                                                uint8_t *src,
+                                                int32_t src_stride,
+                                                int32_t sao_left_class,
+                                                int16_t *sao_offset_val,
+                                                int32_t width, int32_t height)
+{
+    int32_t h_cnt, w_cnt;
+    v16u8 src0, src1, src2, src3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 out0, out1, out2, out3;
+    v16i8 mask0, mask1, mask2, mask3;
+    v16i8 tmp0, tmp1, tmp2, tmp3, offset_val;
+    v16i8 offset0 = { 0 };
+    v16i8 offset1 = { 0 };
+    v16i8 zero = { 0 };
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    offset_val = LD_SB(sao_offset_val + 1);
+    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
+    offset_val = __msa_pckev_b(offset_val, offset_val);
+    offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
+    offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
+    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
+
+    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
+        SWAP(offset0, offset1);
+    }
+
+    for (h_cnt = height >> 2; h_cnt--;) {
+        for (w_cnt = 0; w_cnt < (width >> 4); w_cnt++) {
+            LD_UB4(src + w_cnt * 16, src_stride, src0, src1, src2, src3);
+
+            mask0 = __msa_srli_b((v16i8) src0, 3);
+            mask1 = __msa_srli_b((v16i8) src1, 3);
+            mask2 = __msa_srli_b((v16i8) src2, 3);
+            mask3 = __msa_srli_b((v16i8) src3, 3);
+
+            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1,
+                       tmp0, tmp1);
+            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3,
+                       tmp2, tmp3);
+            UNPCK_SB_SH(tmp0, temp0, temp1);
+            UNPCK_SB_SH(tmp1, temp2, temp3);
+            UNPCK_SB_SH(tmp2, temp4, temp5);
+            UNPCK_SB_SH(tmp3, temp6, temp7);
+            ILVRL_B2_SH(zero, src0, dst0, dst1);
+            ILVRL_B2_SH(zero, src1, dst2, dst3);
+            ILVRL_B2_SH(zero, src2, dst4, dst5);
+            ILVRL_B2_SH(zero, src3, dst6, dst7);
+            ADD4(dst0, temp0, dst1, temp1, dst2, temp2, dst3, temp3,
+                 dst0, dst1, dst2, dst3);
+            ADD4(dst4, temp4, dst5, temp5, dst6, temp6, dst7, temp7,
+                 dst4, dst5, dst6, dst7);
+            CLIP_SH4_0_255(dst0, dst1, dst2, dst3);
+            CLIP_SH4_0_255(dst4, dst5, dst6, dst7);
+            PCKEV_B4_SB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                        out0, out1, out2, out3);
+            ST_SB4(out0, out1, out2, out3, dst + w_cnt * 16, dst_stride);
+        }
+
+        src += src_stride << 2;
+        dst += dst_stride << 2;
+    }
+}
+
+static void hevc_sao_edge_filter_0degree_4width_msa(uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    uint8_t *src,
+                                                    int32_t src_stride,
+                                                    int16_t *sao_offset_val,
+                                                    int32_t height)
+{
+    int32_t h_cnt;
+    uint32_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src_minus11;
+    v16i8 zero = { 0 };
+    v16i8 src_zero0, src_zero1, src_plus10, src_plus11, dst0;
+    v8i16 offset_mask0, offset_mask1;
+    v8i16 sao_offset, src00, src01;
+
+    sao_offset = LD_SH(sao_offset_val);
+    src -= 1;
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src, src_stride, src_minus10, src_minus11);
+        src += (2 * src_stride);
+
+        SLDI_B2_0_SB(src_minus10, src_minus11, src_zero0, src_zero1, 1);
+        SLDI_B2_0_SB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
+        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
+                   src_minus10, src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+                   src_zero1);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+
+        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+        SW(dst_val0, dst);
+        dst += dst_stride;
+        SW(dst_val1, dst);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    uint8_t *src,
+                                                    int32_t src_stride,
+                                                    int16_t *sao_offset_val,
+                                                    int32_t height)
+{
+    uint8_t *src_minus1;
+    int32_t h_cnt;
+    uint64_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 dst0, dst1;
+    v16i8 zero = { 0 };
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src_minus11;
+    v16i8 src_zero0, src_plus10, src_zero1, src_plus11;
+    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;
+
+    sao_offset = LD_SH(sao_offset_val);
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        src_minus1 = src - 1;
+        LD_UB2(src_minus1, src_stride, src_minus10, src_minus11);
+
+        SLDI_B2_0_SB(src_minus10, src_minus11, src_zero0, src_zero1, 1);
+        SLDI_B2_0_SB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
+        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
+                   src_minus10, src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
+                   src_zero0, src_zero1);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
+                    dst0, dst1);
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
+        SD(dst_val0, dst);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        dst += dst_stride;
+        src += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_0degree_16multiple_msa(uint8_t *dst,
+                                                        int32_t dst_stride,
+                                                        uint8_t *src,
+                                                        int32_t src_stride,
+                                                        int16_t *sao_offset_val,
+                                                        int32_t width,
+                                                        int32_t height)
+{
+    uint8_t *dst_ptr, *src_minus1;
+    int32_t h_cnt, v_cnt;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 sao_offset;
+    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+    v16u8 diff_plus13;
+    v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3;
+    v16u8 src_minus10, src_minus11, src_minus12, src_minus13;
+    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+    v16i8 src_zero0, src_zero1, src_zero2, src_zero3;
+    v16i8 src_plus10, src_plus11, src_plus12, src_plus13;
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    sao_offset = LD_SB(sao_offset_val);
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+
+    for (h_cnt = (height >> 2); h_cnt--;) {
+        src_minus1 = src - 1;
+        LD_UB4(src_minus1, src_stride,
+               src_minus10, src_minus11, src_minus12, src_minus13);
+
+        for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
+            src_minus1 += 16;
+            dst_ptr = dst + (v_cnt << 4);
+            LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);
+
+            SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_zero0,
+                       src_zero1, 1);
+            SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_zero2,
+                       src_zero3, 1);
+            SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_plus10,
+                       src_plus11, 2);
+            SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_plus12,
+                       src_plus13, 2);
+
+            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
+            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
+            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
+            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
+            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
+            cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13);
+
+            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
+            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
+            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
+            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
+            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
+            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
+
+            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
+            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
+            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
+            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
+            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
+            cmp_plus13 = ((v16u8) src_plus13 < (v16u8) src_zero3);
+
+            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
+            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
+            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
+            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
+            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
+            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
+
+            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                       offset_mask0, offset_mask0, offset_mask0);
+            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                       offset_mask1, offset_mask1, offset_mask1);
+            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask2,
+                       offset_mask2, offset_mask2, offset_mask2);
+            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask3,
+                       offset_mask3, offset_mask3, offset_mask3);
+
+            UNPCK_UB_SH(src_zero0, src0, src1);
+            UNPCK_SB_SH(offset_mask0, temp0, temp1);
+            UNPCK_UB_SH(src_zero1, src2, src3);
+            UNPCK_SB_SH(offset_mask1, temp2, temp3);
+            UNPCK_UB_SH(src_zero2, src4, src5);
+            UNPCK_SB_SH(offset_mask2, temp4, temp5);
+            UNPCK_UB_SH(src_zero3, src6, src7);
+            UNPCK_SB_SH(offset_mask3, temp6, temp7);
+            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
+                 temp1, temp2, temp3);
+            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
+                 temp5, temp6, temp7);
+            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+                        dst0, dst1, dst2, dst3);
+
+            src_minus10 = src10;
+            ST_UB(dst0, dst_ptr);
+            src_minus11 = src11;
+            ST_UB(dst1, dst_ptr + dst_stride);
+            src_minus12 = src12;
+            ST_UB(dst2, dst_ptr + (dst_stride << 1));
+            src_minus13 = src13;
+            ST_UB(dst3, dst_ptr + (dst_stride * 3));
+        }
+
+        src += (src_stride << 2);
+        dst += (dst_stride << 2);
+    }
+}
+
+static void hevc_sao_edge_filter_90degree_4width_msa(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    int32_t h_cnt;
+    uint32_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 dst0;
+    v16i8 zero = { 0 };
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src_minus11, src10, src11;
+    v16i8 src_zero0, src_zero1;
+    v8i16 sao_offset, src00, src01, offset_mask0, offset_mask1;
+
+    sao_offset = LD_SH(sao_offset_val);
+
+    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src + src_stride, src_stride, src10, src11);
+
+        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
+        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
+        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
+        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+        SW(dst_val0, dst);
+        dst += dst_stride;
+        SW(dst_val1, dst);
+
+        dst += dst_stride;
+        src += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_90degree_8width_msa(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    int32_t h_cnt;
+    uint64_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 zero = { 0 };
+    v16i8 src_zero0, src_zero1, dst0, dst1;
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src_minus11, src10, src11;
+    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;
+
+    sao_offset = LD_SH(sao_offset_val);
+
+    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src + src_stride, src_stride, src10, src11);
+
+        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
+        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
+        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
+        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
+                    dst0, dst1);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
+        SD(dst_val0, dst);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        dst += dst_stride;
+        src += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_90degree_16multiple_msa(uint8_t *dst,
+                                                         int32_t dst_stride,
+                                                         uint8_t *src,
+                                                         int32_t src_stride,
+                                                         int16_t *
+                                                         sao_offset_val,
+                                                         int32_t width,
+                                                         int32_t height)
+{
+    uint8_t *src_orig = src;
+    uint8_t *dst_orig = dst;
+    int32_t h_cnt, v_cnt;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+    v16u8 diff_plus13;
+    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
+    v16u8 src12, dst2, src13, dst3;
+    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    sao_offset = LD_SB(sao_offset_val);
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+
+    for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
+        src = src_orig + (v_cnt << 4);
+        dst = dst_orig + (v_cnt << 4);
+
+        LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
+
+        for (h_cnt = (height >> 2); h_cnt--;) {
+            LD_UB4(src + src_stride, src_stride, src10, src11, src12, src13);
+
+            cmp_minus10 = (src_minus11 == src_minus10);
+            cmp_plus10 = (src_minus11 == src10);
+            cmp_minus11 = (src10 == src_minus11);
+            cmp_plus11 = (src10 == src11);
+            cmp_minus12 = (src11 == src10);
+            cmp_plus12 = (src11 == src12);
+            cmp_minus13 = (src12 == src11);
+            cmp_plus13 = (src12 == src13);
+
+            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
+            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
+            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
+            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
+            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
+            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
+
+            cmp_minus10 = (src_minus10 < src_minus11);
+            cmp_plus10 = (src10 < src_minus11);
+            cmp_minus11 = (src_minus11 < src10);
+            cmp_plus11 = (src11 < src10);
+            cmp_minus12 = (src10 < src11);
+            cmp_plus12 = (src12 < src11);
+            cmp_minus13 = (src11 < src12);
+            cmp_plus13 = (src13 < src12);
+
+            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
+            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
+            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
+            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
+            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
+            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
+
+            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
+            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
+            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
+            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);
+
+            UNPCK_UB_SH(src_minus11, src0, src1);
+            UNPCK_SB_SH(offset_mask0, temp0, temp1);
+            UNPCK_UB_SH(src10, src2, src3);
+            UNPCK_SB_SH(offset_mask1, temp2, temp3);
+            UNPCK_UB_SH(src11, src4, src5);
+            UNPCK_SB_SH(offset_mask2, temp4, temp5);
+            UNPCK_UB_SH(src12, src6, src7);
+            UNPCK_SB_SH(offset_mask3, temp6, temp7);
+            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
+                 temp1, temp2, temp3);
+            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
+                 temp5, temp6, temp7);
+            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+                        dst0, dst1, dst2, dst3);
+
+            src_minus10 = src12;
+            src_minus11 = src13;
+
+            ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+
+            src += (src_stride << 2);
+            dst += (dst_stride << 2);
+        }
+    }
+}
+
+static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    uint8_t *src_orig;
+    int32_t h_cnt;
+    uint32_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 zero = { 0 };
+    v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus11, src10, src11;
+    v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
+    v8i16 sao_offset, src00, src01, offset_mask0, offset_mask1;
+
+    sao_offset = LD_SH(sao_offset_val);
+
+    src_orig = src - 1;
+    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+        SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2);
+
+        ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
+                   src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+                   src_zero1);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+
+        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+        SW(dst_val0, dst);
+        dst += dst_stride;
+        SW(dst_val1, dst);
+
+        dst += dst_stride;
+        src_orig += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    uint8_t *src_orig;
+    int32_t h_cnt;
+    uint64_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 zero = { 0 };
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src10, src_minus11, src11;
+    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0, dst1;
+    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;
+
+    sao_offset = LD_SH(sao_offset_val);
+    src_orig = src - 1;
+
+    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+        SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2);
+
+        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
+                   src_minus10, src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
+                   src_zero0, src_zero1);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
+                    dst0, dst1);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
+        SD(dst_val0, dst);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+
+        dst += dst_stride;
+        src_orig += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_45degree_16multiple_msa(uint8_t *dst,
+                                                         int32_t dst_stride,
+                                                         uint8_t *src,
+                                                         int32_t src_stride,
+                                                         int16_t *
+                                                         sao_offset_val,
+                                                         int32_t width,
+                                                         int32_t height)
+{
+    uint8_t *src_orig = src;
+    uint8_t *dst_orig = dst;
+    int32_t h_cnt, v_cnt;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+    v16u8 diff_plus13, src_minus14, src_plus13;
+    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
+    v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3;
+    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12;
+    v16i8 src_zero3, sao_offset;
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    sao_offset = LD_SB(sao_offset_val);
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+
+    for (h_cnt = (height >> 2); h_cnt--;) {
+        src_orig = src - 1;
+        dst_orig = dst;
+        LD_UB4(src_orig, src_stride,
+               src_minus11, src_minus12, src_minus13, src_minus14);
+
+        for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
+            src_minus10 = LD_UB(src_orig - src_stride);
+            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
+            src_plus13 = LD_UB(src + 1 + (v_cnt << 4) + (src_stride << 2));
+            src_orig += 16;
+
+            SLDI_B2_SB(src10, src11, src_minus11, src_minus12, src_zero0,
+                       src_zero1, 1);
+            SLDI_B2_SB(src12, src13, src_minus13, src_minus14, src_zero2,
+                       src_zero3, 1);
+            SLDI_B2_SB(src11, src12, src_minus12, src_minus13, src_plus10,
+                       src_plus11, 2);
+
+            src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);
+
+            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
+            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
+            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
+            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
+            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
+            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
+
+            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
+            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
+            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
+            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
+            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
+            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
+
+            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
+            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
+            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
+            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
+            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
+            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
+
+            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
+            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
+            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
+            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
+            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
+            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
+
+            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
+            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
+            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
+            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
+
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);
+
+            UNPCK_UB_SH(src_zero0, src0, src1);
+            UNPCK_SB_SH(offset_mask0, temp0, temp1);
+            UNPCK_UB_SH(src_zero1, src2, src3);
+            UNPCK_SB_SH(offset_mask1, temp2, temp3);
+            UNPCK_UB_SH(src_zero2, src4, src5);
+            UNPCK_SB_SH(offset_mask2, temp4, temp5);
+            UNPCK_UB_SH(src_zero3, src6, src7);
+            UNPCK_SB_SH(offset_mask3, temp6, temp7);
+            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
+                 temp1, temp2, temp3);
+            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
+                 temp5, temp6, temp7);
+            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4,
+                        temp7, temp6, dst0, dst1, dst2, dst3);
+
+            src_minus11 = src10;
+            src_minus12 = src11;
+            src_minus13 = src12;
+            src_minus14 = src13;
+
+            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
+            dst_orig += 16;
+        }
+
+        src += (src_stride << 2);
+        dst += (dst_stride << 2);
+    }
+}
+
+static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      uint8_t *src,
+                                                      int32_t src_stride,
+                                                      int16_t *sao_offset_val,
+                                                      int32_t height)
+{
+    uint8_t *src_orig;
+    int32_t h_cnt;
+    uint32_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 zero = { 0 };
+    v16i8 src_zero0, src_zero1, dst0;
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src10, src_minus11, src11;
+    v8i16 offset_mask0, offset_mask1, sao_offset, src00, src01;
+
+    sao_offset = LD_SH(sao_offset_val);
+    src_orig = src - 1;
+
+    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+        SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+
+        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
+                   src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+                   src_zero1);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+
+        SW(dst_val0, dst);
+        dst += dst_stride;
+        SW(dst_val1, dst);
+
+        dst += dst_stride;
+        src_orig += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      uint8_t *src,
+                                                      int32_t src_stride,
+                                                      int16_t *sao_offset_val,
+                                                      int32_t height)
+{
+    uint8_t *src_orig;
+    int32_t h_cnt;
+    uint64_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 zero = { 0 };
+    v16i8 src_zero0, src_zero1, dst0, dst1;
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src10, src_minus11, src11;
+    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;
+
+    sao_offset = LD_SH(sao_offset_val);
+    src_orig = src - 1;
+
+    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+
+        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+        SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
+                   src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+                   src_zero1);
+
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
+                    dst0, dst1);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
+
+        SD(dst_val0, dst);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        dst += dst_stride;
+
+        src_orig += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_135degree_16multiple_msa(uint8_t *dst,
+                                                          int32_t dst_stride,
+                                                          uint8_t *src,
+                                                          int32_t src_stride,
+                                                          int16_t *
+                                                          sao_offset_val,
+                                                          int32_t width,
+                                                          int32_t height)
+{
+    uint8_t *src_orig, *dst_orig;
+    int32_t h_cnt, v_cnt;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
+    v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
+    v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
+    v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
+    v16u8 src_plus10, src_plus11, src_plus12, src_plus13;
+    v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
+    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    sao_offset = LD_SB(sao_offset_val);
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+
+    for (h_cnt = (height >> 2); h_cnt--;) {
+        src_orig = src - 1;
+        dst_orig = dst;
+
+        LD_UB4(src_orig, src_stride,
+               src_minus11, src_plus10, src_plus11, src_plus12);
+
+        for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
+            src_minus10 = LD_UB(src_orig + 2 - src_stride);
+            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
+            src_plus13 = LD_UB(src_orig + (src_stride << 2));
+            src_orig += 16;
+
+            src_zero0 = __msa_sldi_b((v16i8) src10, (v16i8) src_minus11, 1);
+            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+            cmp_plus10 = ((v16u8) src_zero0 == src_plus10);
+
+            src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1);
+            src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10,
+                                               (v16i8) src_minus11, 2);
+            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+            cmp_plus11 = ((v16u8) src_zero1 == src_plus11);
+
+            src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1);
+            src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2);
+            cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12);
+            cmp_plus12 = ((v16u8) src_zero2 == src_plus12);
+
+            src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1);
+            src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2);
+            cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13);
+            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
+
+            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
+            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
+            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
+            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
+            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
+            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
+
+            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+            cmp_plus10 = (src_plus10 < (v16u8) src_zero0);
+            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+            cmp_plus11 = (src_plus11 < (v16u8) src_zero1);
+            cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2);
+            cmp_plus12 = (src_plus12 < (v16u8) src_zero2);
+            cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3);
+            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
+
+            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
+            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
+            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
+            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
+            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
+            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
+
+            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
+            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
+            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
+            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
+
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);
+
+            UNPCK_UB_SH(src_zero0, src0, src1);
+            UNPCK_SB_SH(offset_mask0, temp0, temp1);
+            UNPCK_UB_SH(src_zero1, src2, src3);
+            UNPCK_SB_SH(offset_mask1, temp2, temp3);
+            UNPCK_UB_SH(src_zero2, src4, src5);
+            UNPCK_SB_SH(offset_mask2, temp4, temp5);
+            UNPCK_UB_SH(src_zero3, src6, src7);
+            UNPCK_SB_SH(offset_mask3, temp6, temp7);
+
+            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
+                 temp1, temp2, temp3);
+            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
+                 temp5, temp6, temp7);
+            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+                        dst0, dst1, dst2, dst3);
+
+            src_minus11 = src10;
+            src_plus10 = src11;
+            src_plus11 = src12;
+            src_plus12 = src13;
+
+            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
+            dst_orig += 16;
+        }
+
+        src += (src_stride << 2);
+        dst += (dst_stride << 2);
+    }
+}
+
+void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
+                                      ptrdiff_t src_stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *no_p, uint8_t *no_q)
+{
+    hevc_loopfilter_luma_hor_msa(src, src_stride, beta, tc, no_p, no_q);
+}
+
+void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
+                                      ptrdiff_t src_stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *no_p, uint8_t *no_q)
+{
+    hevc_loopfilter_luma_ver_msa(src, src_stride, beta, tc, no_p, no_q);
+}
+
+void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
+                                        ptrdiff_t src_stride,
+                                        int32_t *tc, uint8_t *no_p,
+                                        uint8_t *no_q)
+{
+    hevc_loopfilter_chroma_hor_msa(src, src_stride, tc, no_p, no_q);
+}
+
+void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
+                                        ptrdiff_t src_stride,
+                                        int32_t *tc, uint8_t *no_p,
+                                        uint8_t *no_q)
+{
+    hevc_loopfilter_chroma_ver_msa(src, src_stride, tc, no_p, no_q);
+}
+
+void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
+                                     ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                     int16_t *sao_offset_val, int sao_left_class,
+                                     int width, int height)
+{
+    if (width >> 4) {
+        hevc_sao_band_filter_16multiple_msa(dst, stride_dst, src, stride_src,
+                                            sao_left_class, sao_offset_val,
+                                            width - (width % 16), height);
+        dst += width - (width % 16);
+        src += width - (width % 16);
+        width %= 16;
+    }
+
+    if (width >> 3) {
+        hevc_sao_band_filter_8width_msa(dst, stride_dst, src, stride_src,
+                                        sao_left_class, sao_offset_val, height);
+        dst += 8;
+        src += 8;
+        width %= 8;
+    }
+
+    if (width) {
+        hevc_sao_band_filter_4width_msa(dst, stride_dst, src, stride_src,
+                                        sao_left_class, sao_offset_val, height);
+    }
+}
+
+void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
+                                   ptrdiff_t stride_dst,
+                                   int16_t *sao_offset_val,
+                                   int eo, int width, int height)
+{
+    ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t);
+
+    switch (eo) {
+    case 0:
+        if (width >> 4) {
+            hevc_sao_edge_filter_0degree_16multiple_msa(dst, stride_dst,
+                                                        src, stride_src,
+                                                        sao_offset_val,
+                                                        width - (width % 16),
+                                                        height);
+            dst += width - (width % 16);
+            src += width - (width % 16);
+            width %= 16;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_0degree_8width_msa(dst, stride_dst,
+                                                    src, stride_src,
+                                                    sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width %= 8;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_0degree_4width_msa(dst, stride_dst,
+                                                    src, stride_src,
+                                                    sao_offset_val, height);
+        }
+        break;
+
+    case 1:
+        if (width >> 4) {
+            hevc_sao_edge_filter_90degree_16multiple_msa(dst, stride_dst,
+                                                         src, stride_src,
+                                                         sao_offset_val,
+                                                         width - (width % 16),
+                                                         height);
+            dst += width - (width % 16);
+            src += width - (width % 16);
+            width %= 16;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_90degree_8width_msa(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width %= 8;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_90degree_4width_msa(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+        }
+        break;
+
+    case 2:
+        if (width >> 4) {
+            hevc_sao_edge_filter_45degree_16multiple_msa(dst, stride_dst,
+                                                         src, stride_src,
+                                                         sao_offset_val,
+                                                         width - (width % 16),
+                                                         height);
+            dst += width - (width % 16);
+            src += width - (width % 16);
+            width %= 16;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_45degree_8width_msa(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width %= 8;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_45degree_4width_msa(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+        }
+        break;
+
+    case 3:
+        if (width >> 4) {
+            hevc_sao_edge_filter_135degree_16multiple_msa(dst, stride_dst,
+                                                          src, stride_src,
+                                                          sao_offset_val,
+                                                          width - (width % 16),
+                                                          height);
+            dst += width - (width % 16);
+            src += width - (width % 16);
+            width %= 16;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_135degree_8width_msa(dst, stride_dst,
+                                                      src, stride_src,
+                                                      sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width %= 8;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_135degree_4width_msa(dst, stride_dst,
+                                                      src, stride_src,
+                                                      sao_offset_val, height);
+        }
+        break;
+    }
+}
diff --git a/libavcodec/mips/hevc_macros_msa.h b/libavcodec/mips/hevc_macros_msa.h
new file mode 100644
index 0000000000..b06c5ad9b9
--- /dev/null
+++ b/libavcodec/mips/hevc_macros_msa.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HEVC_MACROS_MSA_H
+#define AVCODEC_MIPS_HEVC_MACROS_MSA_H
+
+#define HEVC_PCK_SW_SB2(in0, in1, out)                            \
+{                                                                 \
+    v8i16 tmp0_m;                                                 \
+                                                                  \
+    tmp0_m = __msa_pckev_h((v8i16) in0, (v8i16) in1);             \
+    out = (v4i32) __msa_pckev_b((v16i8) tmp0_m, (v16i8) tmp0_m);  \
+}
+
+#define HEVC_PCK_SW_SB4(in0, in1, in2, in3, out)                  \
+{                                                                 \
+    v8i16 tmp0_m, tmp1_m;                                         \
+                                                                  \
+    PCKEV_H2_SH(in0, in1, in2, in3, tmp0_m, tmp1_m);              \
+    out = (v4i32) __msa_pckev_b((v16i8) tmp1_m, (v16i8) tmp0_m);  \
+}
+
+#define HEVC_PCK_SW_SB8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1)  \
+{                                                                            \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
+                                                                             \
+    PCKEV_H4_SH(in0, in1, in2, in3, in4, in5, in6, in7,                      \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                             \
+    PCKEV_B2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);                 \
+}
+
+#define HEVC_PCK_SW_SB12(in0, in1, in2, in3, in4, in5, in6, in7,   \
+                         in8, in9, in10, in11, out0, out1, out2)   \
+{                                                                  \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m;          \
+                                                                   \
+    PCKEV_H4_SH(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
+    PCKEV_H2_SH(in8, in9, in10, in11, tmp4_m, tmp5_m);             \
+    PCKEV_B2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);       \
+    out2 = (v4i32) __msa_pckev_b((v16i8) tmp5_m, (v16i8) tmp4_m);  \
+}
+
+#define HEVC_FILT_8TAP(in0, in1, in2, in3,                       \
+                       filt0, filt1, filt2, filt3)               \
+( {                                                              \
+    v4i32 out_m;                                                 \
+                                                                 \
+    out_m = __msa_dotp_s_w((v8i16) in0, (v8i16) filt0);          \
+    out_m = __msa_dpadd_s_w(out_m, (v8i16) in1, (v8i16) filt1);  \
+    DPADD_SH2_SW(in2, in3, filt2, filt3, out_m, out_m);          \
+    out_m;                                                       \
+} )
+
+#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)           \
+( {                                                      \
+    v4i32 out_m;                                         \
+                                                         \
+    out_m = __msa_dotp_s_w(in0, (v8i16) filt0);          \
+    out_m = __msa_dpadd_s_w(out_m, in1, (v8i16) filt1);  \
+    out_m;                                               \
+} )
+
+#endif  /* AVCODEC_MIPS_HEVC_MACROS_MSA_H */
diff --git a/libavcodec/mips/hevc_mc_bi_msa.c b/libavcodec/mips/hevc_mc_bi_msa.c
new file mode 100644
index 0000000000..8208be327d
--- /dev/null
+++ b/libavcodec/mips/hevc_mc_bi_msa.c
@@ -0,0 +1,4462 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+
+#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
+{                                                                     \
+    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
+    SRARI_H2_SH(out0, out1, rnd_val);                                 \
+    CLIP_SH2_0_255(out0, out1);                                       \
+}
+
+#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                      \
+                          vec0, vec1, vec2, vec3, rnd_val,         \
+                          out0, out1, out2, out3)                  \
+{                                                                  \
+    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
+    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
+}
+
+static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
+                                int32_t src_stride,
+                                int16_t *src1_ptr,
+                                int32_t src2_stride,
+                                uint8_t *dst,
+                                int32_t dst_stride,
+                                int32_t height)
+{
+    v16i8 zero = { 0 };
+
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 dst0, in0, in1;
+
+        LD_SB2(src0_ptr, src_stride, src0, src1);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+
+        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
+        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+
+        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
+        dst0 <<= 6;
+        dst0 += in0;
+        dst0 = __msa_srari_h(dst0, 7);
+        dst0 = CLIP_SH_0_255(dst0);
+
+        dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
+        ST4x2_UB(dst0, dst, dst_stride);
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 dst0, dst1;
+        v8i16 in0, in1, in2, in3;
+
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
+
+        dst0 <<= 6;
+        dst1 <<= 6;
+        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
+
+        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+    } else if (0 == height % 8) {
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 dst0, dst1, dst2, dst3;
+        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src0_ptr, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src0_ptr += (8 * src_stride);
+
+            LD_SH8(src1_ptr, src2_stride,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+            src1_ptr += (8 * src2_stride);
+
+            ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+            ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+
+            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
+                       src0, src1, src2, src3);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              dst0, dst1, dst2, dst3, 7,
+                              dst0, dst1, dst2, dst3);
+
+            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+            ST4x8_UB(dst0, dst1, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+
+static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
+                                int32_t src_stride,
+                                int16_t *src1_ptr,
+                                int32_t src2_stride,
+                                uint8_t *dst,
+                                int32_t dst_stride,
+                                int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src0, src1, src2, src3, src4, src5, src6, src7);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   dst4, dst5, dst6, dst7);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST6x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
+                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
+
+        PCKEV_B2_SH(dst5, dst4, dst7, dst6, dst4, dst5);
+        ST6x4_UB(dst4, dst5, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
+                                int32_t src_stride,
+                                int16_t *src1_ptr,
+                                int32_t src2_stride,
+                                uint8_t *dst,
+                                int32_t dst_stride,
+                                int32_t height)
+{
+    v16i8 zero = { 0 };
+
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 in0, in1;
+        v8i16 dst0, dst1;
+
+        LD_SB2(src0_ptr, src_stride, src0, src1);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
+
+        dst0 <<= 6;
+        dst1 <<= 6;
+        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
+
+        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST8x2_UB(dst0, dst, dst_stride);
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1, in2, in3;
+        v8i16 dst0, dst1, dst2, dst3;
+
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
+    } else if (6 == height) {
+        v16i8 src0, src1, src2, src3, src4, src5;
+        v8i16 in0, in1, in2, in3, in4, in5;
+        v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+
+        LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
+        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+        ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        dst4 <<= 6;
+        dst5 <<= 6;
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST8x2_UB(dst2, dst, dst_stride);
+    } else if (0 == height % 8) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1, in2, in3;
+        v8i16 dst0, dst1, dst2, dst3;
+        uint32_t loop_cnt;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+            src0_ptr += (4 * src_stride);
+            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+            src1_ptr += (4 * src2_stride);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              dst0, dst1, dst2, dst3, 7,
+                              dst0, dst1, dst2, dst3);
+
+            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+            ST8x4_UB(dst0, dst1, dst, dst_stride);
+            dst += (4 * dst_stride);
+
+            LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+            src0_ptr += (4 * src_stride);
+            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+            src1_ptr += (4 * src2_stride);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              dst0, dst1, dst2, dst3, 7,
+                              dst0, dst1, dst2, dst3);
+
+            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+            ST8x4_UB(dst0, dst1, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v16i8 zero = { 0 };
+
+    for (loop_cnt = (16 >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
+        dst4 <<= 6;
+        dst5 <<= 6;
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+        ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_bi_copy_16multx4mult_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          int32_t height,
+                                          int32_t width)
+{
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16i8 zero = { 0 };
+
+    for (cnt = (width >> 4); cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            v16i8 src0, src1, src2, src3;
+            v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+            v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+            v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
+
+            LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, src3);
+            src0_ptr_tmp += (4 * src_stride);
+            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+            LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7);
+            src1_ptr_tmp += (4 * src2_stride);
+
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0_r, dst1_r, dst2_r, dst3_r);
+            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0_l, dst1_l, dst2_l, dst3_l);
+
+            SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+            SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
+            HEVC_BI_RND_CLIP4(in0, in1, in4, in5,
+                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                              dst0_r, dst1_r, dst0_l, dst1_l);
+
+            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            HEVC_BI_RND_CLIP4(in2, in3, in6, in7,
+                              dst2_r, dst3_r, dst2_l, dst3_l, 7,
+                              dst2_r, dst3_r, dst2_l, dst3_l);
+
+            PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
+            ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+        }
+
+        src0_ptr += 16;
+        src1_ptr += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, height, 16);
+}
+
+static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, height, 16);
+
+    hevc_bi_copy_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+                        dst + 16, dst_stride, height);
+}
+
+static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, height, 32);
+}
+
+static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, height, 48);
+}
+
+static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, height, 64);
+}
+
+static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+
+    src0_ptr -= 3;
+
+    /* rearranging filter */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
+               src4, src5, src6, src7);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST4x8_UB(dst0, dst1, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src0_ptr -= 3;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    hevc_hz_bi_8t_8w_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                         dst, dst_stride, filter, height);
+    hevc_hz_bi_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
+                         dst + 8, dst_stride, filter, height);
+}
+
+static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src0_ptr -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, 8, src0, src1);
+        src0_ptr += src_stride;
+        LD_SB2(src0_ptr, 8, src2, src3);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        src1_ptr += src2_stride;
+        LD_SH2(src1_ptr, 8, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t dst_val0;
+    v16i8 src0, src1, tmp0, tmp1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2;
+    v8i16 in0, in1, in2;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src0_ptr = src0_ptr - 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        in2 = LD_SH(src1_ptr + 16);
+        src1_ptr += src2_stride;
+        XORI_B2_128_SB(src0, src1);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+
+        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
+        dst2 = __msa_adds_s_h(dst2, in2);
+        dst2 = __msa_srari_h(dst2, 7);
+        dst2 = CLIP_SH_0_255(dst2);
+
+        PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
+        dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
+        ST_SB(tmp0, dst);
+        SD(dst_val0, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, tmp0, tmp1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src0_ptr -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
+        ST_SB2(tmp0, tmp1, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 tmp0, tmp1, tmp2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src0_ptr -= 3;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        XORI_B2_128_SB(src0, src1);
+        LD_SH2(src1_ptr, 8, in0, in1);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+
+        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
+
+        tmp0 = __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST_SB(tmp0, dst);
+
+        LD_SB2(src0_ptr + 32, 8, src2, src3);
+        XORI_B2_128_SB(src2, src3);
+        src0_ptr += src_stride;
+
+        LD_SH2(src1_ptr + 16, 8, in2, in3);
+
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);
+
+        tmp1 = __msa_pckev_b((v16i8) dst3, (v16i8) dst2);
+        ST_SB(tmp1, dst + 16);
+
+        LD_SH2(src1_ptr + 32, 8, in4, in5);
+        src1_ptr += src2_stride;
+
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+
+        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+
+        tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+        ST_SB(tmp2, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint8_t *src0_ptr_tmp;
+    uint8_t *dst_tmp;
+    int16_t *src1_ptr_tmp;
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    v16i8 src0, src1, src2, tmp0, tmp1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 3;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+
+        for (cnt = 2; cnt--;) {
+            LD_SB2(src0_ptr_tmp, 16, src0, src1);
+            src2 = LD_SB(src0_ptr_tmp + 24);
+            src0_ptr_tmp += 32;
+            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
+            src1_ptr_tmp += 32;
+            XORI_B3_128_SB(src0, src1, src2);
+
+            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst0 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst0, dst0, dst0, dst0);
+            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                       vec0, vec1, vec2, vec3);
+            dst1 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst1, dst1, dst1, dst1);
+            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst2 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst2, dst2, dst2, dst2);
+            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst3 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst3, dst3, dst3, dst3);
+
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              dst0, dst1, dst2, dst3, 7,
+                              dst0, dst1, dst2, dst3);
+
+            PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
+            ST_SB2(tmp0, tmp1, dst_tmp, 16);
+            dst_tmp += 32;
+        }
+
+        src1_ptr += src2_stride;
+        src0_ptr += src_stride;
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src11, src12, src13, src14;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v16i8 src12111110, src14131312;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src2110, src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src7, src8, src9, src10, src11, src12, src13, src14);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+                   src1110_r, src1211_r, src1312_r, src1413_r);
+        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
+                   src1413_r, src1312_r,
+                   src8776, src10998, src12111110, src14131312);
+        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
+
+        dst10 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
+                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
+        dst76 = const_vec;
+        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
+                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst10, dst32, dst54, dst76, 7,
+                          dst10, dst32, dst54, dst76);
+
+        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
+        ST4x8_UB(dst10, dst54, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        src2110 = src10998;
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;
+    }
+}
+
+static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+
+        dst0_r = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3,
+                     dst0_r, dst0_r, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3,
+                     dst1_r, dst1_r, dst1_r, dst1_r);
+        dst2_r = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3,
+                     dst2_r, dst2_r, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3,
+                     dst3_r, dst3_r, dst3_r, dst3_r);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_l, src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
+               src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_l, src87_l, src98_l, src109_l);
+        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
+
+        dst0_r = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3,
+                     dst0_r, dst0_r, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3,
+                     dst1_r, dst1_r, dst1_r, dst1_r);
+        dst2_r = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3,
+                     dst2_r, dst2_r, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3,
+                     dst3_r, dst3_r, dst3_r, dst3_r);
+        dst0_l = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3,
+                     dst0_l, dst0_l, dst0_l, dst0_l);
+        dst1_l = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3,
+                     dst1_l, dst1_l, dst1_l, dst1_l);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
+
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
+        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr,
+                                           int32_t src_stride,
+                                           int16_t *src1_ptr,
+                                           int32_t src2_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride,
+                                           const int8_t *filter,
+                                           int32_t height, int32_t width)
+{
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 dst0_r, dst1_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+
+        LD_SB7(src0_ptr_tmp, src_stride,
+               src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_r, src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_l, src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 1); loop_cnt--;) {
+            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
+            src0_ptr_tmp += (2 * src_stride);
+            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
+            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
+            src1_ptr_tmp += (2 * src2_stride);
+            XORI_B2_128_SB(src7, src8);
+
+            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+            dst0_r = const_vec;
+            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                         filt0, filt1, filt2, filt3,
+                         dst0_r, dst0_r, dst0_r, dst0_r);
+            dst1_r = const_vec;
+            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                         filt0, filt1, filt2, filt3,
+                         dst1_r, dst1_r, dst1_r, dst1_r);
+            dst0_l = const_vec;
+            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
+                         filt0, filt1, filt2, filt3,
+                         dst0_l, dst0_l, dst0_l, dst0_l);
+            dst1_l = const_vec;
+            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
+                         filt0, filt1, filt2, filt3,
+                         dst1_l, dst1_l, dst1_l, dst1_l);
+
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                              dst0_r, dst1_r, dst0_l, dst1_l);
+
+            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            src10_r = src32_r;
+            src32_r = src54_r;
+            src54_r = src76_r;
+            src21_r = src43_r;
+            src43_r = src65_r;
+            src65_r = src87_r;
+            src10_l = src32_l;
+            src32_l = src54_l;
+            src54_l = src76_l;
+            src21_l = src43_l;
+            src43_l = src65_l;
+            src65_l = src87_l;
+            src6 = src8;
+        }
+
+        src0_ptr += 16;
+        src1_ptr += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 16);
+}
+
+static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 16);
+    hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+                         dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 32);
+}
+
+static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 48);
+}
+
+static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 64);
+}
+
+static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
+    v4i32 dst0_r, dst1_r, in0_r, in0_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+    src0_ptr -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    /* row 0 row 1 row 2 row 3 */
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+
+    dst30 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                 dst30, dst30, dst30, dst30);
+    dst41 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                 dst41, dst41, dst41, dst41);
+    dst52 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                 dst52, dst52, dst52, dst52);
+    dst63 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                 dst63, dst63, dst63, dst63);
+
+    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
+               dst10_r, dst21_r, dst32_r);
+    dst43_r = __msa_ilvl_h(dst41, dst30);
+    dst54_r = __msa_ilvl_h(dst52, dst41);
+    dst65_r = __msa_ilvl_h(dst63, dst52);
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src7, src8);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        src1_ptr += (2 * src2_stride);
+
+        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+        XORI_B2_128_SB(src7, src8);
+
+        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst87 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst87, dst87, dst87, dst87);
+        dst76_r = __msa_ilvr_h(dst87, dst66);
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+
+        dst0_r >>= 6;
+        dst1_r >>= 6;
+        UNPCK_SH_SW(in0, in0_r, in0_l);
+        dst0_r = __msa_adds_s_w(dst0_r, in0_r);
+        dst1_r = __msa_adds_s_w(dst1_r, in0_l);
+        SRARI_W2_SW(dst0_r, dst1_r, 7);
+        dst0_r = CLIP_SW_0_255(dst0_r);
+        dst1_r = CLIP_SW_0_255(dst1_r);
+
+        HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
+        ST4x2_UB(dst0_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        dst10_r = dst32_r;
+        dst32_r = dst54_r;
+        dst54_r = dst76_r;
+        dst21_r = dst43_r;
+        dst43_r = dst65_r;
+        dst65_r = dst87_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
+    }
+}
+
+static void hevc_hv_bi_8t_8multx2mult_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter_x,
+                                          const int8_t *filter_y,
+                                          int32_t height, int32_t width)
+{
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1;
+    v4i32 in0_r, in0_l, in1_r, in1_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+
+    src0_ptr -= ((3 * src_stride) + 3);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+
+        LD_SB7(src0_ptr_tmp, src_stride,
+               src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        /* row 0 row 1 row 2 row 3 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+
+        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_r, dst32_r, dst54_r, dst21_r);
+        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_l, dst32_l, dst54_l, dst21_l);
+        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            /* row 7 */
+            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
+            XORI_B2_128_SB(src7, src8);
+            src0_ptr_tmp += 2 * src_stride;
+
+            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
+            src1_ptr_tmp += (2 * src2_stride);
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst7, dst7, dst7, dst7);
+
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst8 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst8, dst8, dst8, dst8);
+
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            UNPCK_SH_SW(in0, in0_r, in0_l);
+            UNPCK_SH_SW(in1, in1_r, in1_l);
+            in0_r = __msa_adds_s_w(in0_r, dst0_r);
+            in0_l = __msa_adds_s_w(in0_l, dst0_l);
+            in1_r = __msa_adds_s_w(in1_r, dst1_r);
+            in1_l = __msa_adds_s_w(in1_l, dst1_l);
+            SRARI_W4_SW(in0_r, in0_l, in1_r, in1_l, 7);
+            in0_r = CLIP_SW_0_255(in0_r);
+            in0_l = CLIP_SW_0_255(in0_l);
+            in1_r = CLIP_SW_0_255(in1_r);
+            in1_l = CLIP_SW_0_255(in1_l);
+
+            HEVC_PCK_SW_SB4(in0_l, in0_r, in1_l, in1_r, dst0_r);
+            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            dst10_r = dst32_r;
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+            dst6 = dst8;
+        }
+
+        src0_ptr += 8;
+        dst += 8;
+        src1_ptr += 8;
+    }
+}
+
+static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 8);
+}
+
+static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 8);
+
+    hevc_hv_bi_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
+                         dst + 8, dst_stride, filter_x, filter_y, height);
+}
+
+static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 16);
+}
+
+static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 24);
+}
+
+static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 32);
+}
+
+static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 48);
+}
+
+static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 64);
+}
+
+static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, dst0, vec0, vec1;
+    v8i16 in0, in1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    v16i8 mask1;
+    v8i16 tmp0;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB2(src0_ptr, src_stride, src0, src1);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+    XORI_B2_128_SB(src0, src1);
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    tmp0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
+
+    tmp0 = __msa_adds_s_h(tmp0, in0);
+    tmp0 = __msa_srari_h(tmp0, 7);
+    tmp0 = CLIP_SH_0_255(tmp0);
+    dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
+
+    ST4x2_UB(dst0, dst, dst_stride);
+}
+
+static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    v16i8 mask1;
+    v8i16 tmp0, tmp1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    tmp0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
+    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+    tmp1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
+    HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
+    dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter,
+                                          int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 dst0, dst1;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    v16i8 mask1, vec0, vec1;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src0, src1, src2, src3, src4, src5, src6, src7);
+        src0_ptr += (8 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+        tmp0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
+        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
+        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp2, tmp2);
+        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp3, tmp3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
+        ST4x8_UB(dst0, dst1, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    if (2 == height) {
+        hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (4 == height) {
+        hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (8 == height || 16 == height) {
+        hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height);
+    }
+}
+
+static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST6x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1;
+    v8i16 in0, in1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB2(src0_ptr, src_stride, src0, src1);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src0, src1);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+    HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
+
+    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+    ST8x2_UB(dst0, dst, dst_stride);
+}
+
+static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    src1_ptr += (4 * src2_stride);
+    LD_SH2(src1_ptr, src2_stride, in4, in5);
+    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                      dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+    HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+
+    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+    dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+    ST8x4_UB(dst0, dst1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2, dst, dst_stride);
+}
+
+static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter,
+                                          int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    if (2 == height) {
+        hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (6 == height) {
+        hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (0 == (height % 4)) {
+        hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height);
+    }
+}
+
+static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask2 = {
+        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+    };
+    v16i8 mask1, mask3;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask3 = mask2 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+        ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
+        LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
+        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
+        src1_ptr += (4 * src2_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
+                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
+
+        PCKEV_B4_SH(dst1, dst0, dst3, dst2,
+                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    dst_tmp = dst + 16;
+    src1_ptr_tmp = src1_ptr + 16;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
+        LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
+        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
+        src1_ptr += (4 * src2_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src4, src5, src4, src5, mask2, mask3, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+        VSHF_B2_SB(src6, src7, src6, src7, mask2, mask3, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
+                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
+
+        PCKEV_B4_SH(dst1, dst0, dst3, dst2,
+                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+        src1_ptr_tmp += (4 * src2_stride);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);
+        dst_tmp += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, 16);
+        dst += dst_stride;
+
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v8i16 dst10;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    dst10 = __msa_adds_s_h(dst10, in0);
+    dst10 = __msa_srari_h(dst10, 7);
+    dst10 = CLIP_SH_0_255(dst10);
+
+    dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
+    ST4x2_UB(dst10, dst, dst_stride);
+}
+
+static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst10, dst32;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
+    XORI_B2_128_SB(src4332, src6554);
+
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    dst32 = const_vec;
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+    HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);
+
+    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
+    ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter,
+                                          int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src6, src7, src8, src9;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
+        src0_ptr += (6 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
+                   src4332, src6554, src8776);
+        XORI_B3_128_SB(src4332, src6554, src8776);
+
+        dst10 = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
+
+        LD_SB2(src0_ptr, src_stride, src9, src2);
+        src0_ptr += (2 * src_stride);
+        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+        dst76 = const_vec;
+        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst10, dst32, dst54, dst76, 7,
+                          dst10, dst32, dst54, dst76);
+
+        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
+        ST4x8_UB(dst10, dst54, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    if (2 == height) {
+        hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (4 == height) {
+        hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else {
+        hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height);
+    }
+}
+
+static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, dst0_r, dst1_r;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
+    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
+
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
+    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+    dst2_r = const_vec;
+    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
+    dst3_r = const_vec;
+    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
+    dst4_r = const_vec;
+    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
+    dst5_r = const_vec;
+    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
+    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                      dst0_r, dst1_r, dst2_r, dst3_r);
+    HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);
+
+    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+    dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter,
+                                          int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    if (2 == height) {
+        hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (6 == height) {
+        hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else {
+        hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height);
+    }
+}
+
+static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332;
+    v8i16 dst0_l, dst1_l, filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (1 * src_stride);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        XORI_B2_128_SB(src3, src4);
+
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
+
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
+
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
+        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    /* 16width */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* 8width */
+    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        /* 8width */
+        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        /* 16width */
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        /* 8width */
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
+        /* 16width */
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst2_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+        /* 8width */
+        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+        /* 16width */
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        /* 8width */
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst2_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    uint8_t *dst_tmp = dst + 16;
+    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    /* 16width */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    /* next 16width */
+    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
+        LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        /* 16width */
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        /* 16width */
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+
+        src10_r = src32_r;
+        src21_r = src43_r;
+        src10_l = src32_l;
+        src21_l = src43_l;
+        src2 = src4;
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* next 16width */
+        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+        /* next 16width */
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
+        dst2_l = const_vec;
+        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
+        dst3_l = const_vec;
+        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
+        /* next 16width */
+        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
+                          dst2_r, dst3_r, dst2_l, dst3_l, 7,
+                          dst2_r, dst3_r, dst2_l, dst3_l);
+
+        PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
+        ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
+        dst_tmp += (2 * dst_stride);
+
+        src76_r = src98_r;
+        src87_r = src109_r;
+        src76_l = src98_l;
+        src87_l = src109_l;
+        src8 = src10;
+    }
+}
+
+static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    v8i16 in0, in1;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst1_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+    XORI_B2_128_SB(src3, src4);
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
+    dst0_r = (v4i32) __msa_adds_s_h((v8i16) dst0_r, in0);
+    dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 7);
+    dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
+
+    dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
+    ST4x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    v8i16 in0, in1, in2, in3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 dst0_r, dst1_r;
+    v4i32 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    XORI_B4_128_SB(src3, src4, src5, src6);
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    tmp0 >>= 6;
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    tmp1 >>= 6;
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+    dst10_r = __msa_ilvr_h(dst5, dst4);
+    tmp2 = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+    tmp2 >>= 6;
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+    dst21_r = __msa_ilvr_h(dst2, dst5);
+    tmp3 = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+    tmp3 >>= 6;
+    PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
+    HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
+
+    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
+    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter_x,
+                                          const int8_t *filter_y,
+                                          int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        /* row 3 */
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        dst32_r = __msa_ilvr_h(dst3, dst2);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        /* row 4 */
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        dst43_r = __msa_ilvr_h(dst4, dst3);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        /* row 5 */
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        dst54_r = __msa_ilvr_h(dst5, dst4);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_r >>= 6;
+        /* row 6 */
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+        dst65_r = __msa_ilvr_h(dst6, dst5);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_r >>= 6;
+        /* row 7 */
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+        dst76_r = __msa_ilvr_h(dst7, dst6);
+        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst4_r >>= 6;
+        /* row 8 */
+        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+        dst8 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+        dst87_r = __msa_ilvr_h(dst8, dst7);
+        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst5_r >>= 6;
+        /* row 9 */
+        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
+        dst9 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
+        dst10_r = __msa_ilvr_h(dst9, dst8);
+        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
+        dst6_r >>= 6;
+        /* row 10 */
+        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        dst21_r = __msa_ilvr_h(dst2, dst9);
+        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
+        dst7_r >>= 6;
+        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
+                    dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+
+        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+        ST4x8_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter_x, filter_y, height);
+    } else if (4 == height) {
+        hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter_x, filter_y, height);
+    } else if (0 == (height % 8)) {
+        hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride,
+                                      filter_x, filter_y, height);
+    }
+}
+
+static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        dst1_l >>= 6;
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+        dst2_r >>= 6;
+        dst2_l >>= 6;
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+        dst3_r >>= 6;
+        dst3_l >>= 6;
+        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                    dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+
+        PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 tmp0, tmp1;
+    v8i16 in0, in1;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src3, src4);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+
+    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
+    HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
+
+    dst0_r = (v4i32) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    src0_ptr += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+    tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+    tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
+
+    LD_SB2(src0_ptr, src_stride, src5, src6);
+    src0_ptr += (2 * src_stride);
+    XORI_B2_128_SB(src5, src6);
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    dst2_l >>= 6;
+    tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
+
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst6 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    dst3_l >>= 6;
+    tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r);
+
+    LD_SB2(src0_ptr, src_stride, src7, src8);
+    XORI_B2_128_SB(src7, src8);
+    /* row 7 */
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    dst7 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+
+    dst4_r >>= 6;
+    dst4_l >>= 6;
+    tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r);
+    /* row 8 */
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+    dst8 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+    dst5_r >>= 6;
+    dst5_l >>= 6;
+    tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r);
+
+    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                      tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+    HEVC_BI_RND_CLIP2(in4, in5, tmp4, tmp5, 7, tmp4, tmp5);
+
+    PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
+    dst2_r = (v4i32) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter_x,
+                                          const int8_t *filter_y,
+                                          int32_t height,
+                                          int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+
+        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
+        src0_ptr_tmp += (3 * src_stride);
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
+            src0_ptr_tmp += (4 * src_stride);
+            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+            src1_ptr_tmp += (4 * src2_stride);
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            dst3 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+            dst4 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+            dst5 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+            dst2_r >>= 6;
+            dst2_l >>= 6;
+
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+            dst2 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+            dst3_r >>= 6;
+            dst3_l >>= 6;
+
+            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              tmp0, tmp1, tmp2, tmp3, 7,
+                              tmp0, tmp1, tmp2, tmp3);
+
+            PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
+            ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+        }
+
+        src0_ptr += 8;
+        dst += 8;
+        src1_ptr += 8;
+    }
+}
+
+static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter_x, filter_y, height);
+    } else if (6 == height) {
+        hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter_x, filter_y, height);
+    } else {
+        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride,
+                                      filter_x, filter_y, height, 8);
+    }
+}
+
+static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 8);
+    hevc_hv_bi_4t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
+                         dst + 8, dst_stride, filter_x, filter_y, height);
+}
+
+static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 16);
+}
+
+static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 24);
+}
+
+static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 32);
+}
+
+#define BI_MC_COPY(WIDTH)                                                 \
+void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
+                                                   ptrdiff_t dst_stride,  \
+                                                   uint8_t *src,          \
+                                                   ptrdiff_t src_stride,  \
+                                                   int16_t *src_16bit,    \
+                                                   int height,            \
+                                                   intptr_t mx,           \
+                                                   intptr_t my,           \
+                                                   int width)             \
+{                                                                         \
+    hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
+                                dst, dst_stride, height);                 \
+}
+
+BI_MC_COPY(4);
+BI_MC_COPY(6);
+BI_MC_COPY(8);
+BI_MC_COPY(12);
+BI_MC_COPY(16);
+BI_MC_COPY(24);
+BI_MC_COPY(32);
+BI_MC_COPY(48);
+BI_MC_COPY(64);
+
+#undef BI_MC_COPY
+
+#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                            \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,            \
+                                                        ptrdiff_t dst_stride,  \
+                                                        uint8_t *src,          \
+                                                        ptrdiff_t src_stride,  \
+                                                        int16_t *src_16bit,    \
+                                                        int height,            \
+                                                        intptr_t mx,           \
+                                                        intptr_t my,           \
+                                                        int width)             \
+{                                                                              \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
+                                                                               \
+    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,       \
+                                             MAX_PB_SIZE, dst, dst_stride,     \
+                                             filter, height);                  \
+}
+
+BI_MC(qpel, h, 4, 8, hz, mx);
+BI_MC(qpel, h, 8, 8, hz, mx);
+BI_MC(qpel, h, 12, 8, hz, mx);
+BI_MC(qpel, h, 16, 8, hz, mx);
+BI_MC(qpel, h, 24, 8, hz, mx);
+BI_MC(qpel, h, 32, 8, hz, mx);
+BI_MC(qpel, h, 48, 8, hz, mx);
+BI_MC(qpel, h, 64, 8, hz, mx);
+
+BI_MC(qpel, v, 4, 8, vt, my);
+BI_MC(qpel, v, 8, 8, vt, my);
+BI_MC(qpel, v, 12, 8, vt, my);
+BI_MC(qpel, v, 16, 8, vt, my);
+BI_MC(qpel, v, 24, 8, vt, my);
+BI_MC(qpel, v, 32, 8, vt, my);
+BI_MC(qpel, v, 48, 8, vt, my);
+BI_MC(qpel, v, 64, 8, vt, my);
+
+BI_MC(epel, h, 4, 4, hz, mx);
+BI_MC(epel, h, 8, 4, hz, mx);
+BI_MC(epel, h, 6, 4, hz, mx);
+BI_MC(epel, h, 12, 4, hz, mx);
+BI_MC(epel, h, 16, 4, hz, mx);
+BI_MC(epel, h, 24, 4, hz, mx);
+BI_MC(epel, h, 32, 4, hz, mx);
+
+BI_MC(epel, v, 4, 4, vt, my);
+BI_MC(epel, v, 8, 4, vt, my);
+BI_MC(epel, v, 6, 4, vt, my);
+BI_MC(epel, v, 12, 4, vt, my);
+BI_MC(epel, v, 16, 4, vt, my);
+BI_MC(epel, v, 24, 4, vt, my);
+BI_MC(epel, v, 32, 4, vt, my);
+
+#undef BI_MC
+
+#define BI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                                   \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,            \
+                                                        ptrdiff_t dst_stride,  \
+                                                        uint8_t *src,          \
+                                                        ptrdiff_t src_stride,  \
+                                                        int16_t *src_16bit,    \
+                                                        int height,            \
+                                                        intptr_t mx,           \
+                                                        intptr_t my,           \
+                                                        int width)             \
+{                                                                              \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                  \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                  \
+                                                                               \
+    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,       \
+                                             MAX_PB_SIZE, dst, dst_stride,     \
+                                             filter_x, filter_y,               \
+                                             height);                          \
+}
+
+BI_MC_HV(qpel, hv, 4, 8, hv);
+BI_MC_HV(qpel, hv, 8, 8, hv);
+BI_MC_HV(qpel, hv, 12, 8, hv);
+BI_MC_HV(qpel, hv, 16, 8, hv);
+BI_MC_HV(qpel, hv, 24, 8, hv);
+BI_MC_HV(qpel, hv, 32, 8, hv);
+BI_MC_HV(qpel, hv, 48, 8, hv);
+BI_MC_HV(qpel, hv, 64, 8, hv);
+
+BI_MC_HV(epel, hv, 4, 4, hv);
+BI_MC_HV(epel, hv, 8, 4, hv);
+BI_MC_HV(epel, hv, 6, 4, hv);
+BI_MC_HV(epel, hv, 12, 4, hv);
+BI_MC_HV(epel, hv, 16, 4, hv);
+BI_MC_HV(epel, hv, 24, 4, hv);
+BI_MC_HV(epel, hv, 32, 4, hv);
+
+#undef BI_MC_HV
diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c
new file mode 100644
index 0000000000..05a28ece44
--- /dev/null
+++ b/libavcodec/mips/hevc_mc_biw_msa.c
@@ -0,0 +1,5572 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+
+#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,  \
+                           out0_r, out1_r, out0_l, out1_l)          \
+{                                                                   \
+    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);               \
+    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);               \
+                                                                    \
+    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);  \
+    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);  \
+    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);  \
+    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);  \
+                                                                    \
+    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                \
+                                                                    \
+    out0_r = CLIP_SW_0_255(out0_r);                                 \
+    out1_r = CLIP_SW_0_255(out1_r);                                 \
+    out0_l = CLIP_SW_0_255(out0_l);                                 \
+    out1_l = CLIP_SW_0_255(out1_l);                                 \
+}
+
+#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,  \
+                           wgt, rnd, offset,                            \
+                           out0_r, out1_r, out2_r, out3_r,              \
+                           out0_l, out1_l, out2_l, out3_l)              \
+{                                                                       \
+    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,          \
+                       out0_r, out1_r, out0_l, out1_l)                  \
+    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset,          \
+                       out2_r, out3_r, out2_l, out3_l)                  \
+}
+
+#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
+{                                                                     \
+    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
+    SRARI_H2_SH(out0, out1, rnd_val);                                 \
+    CLIP_SH2_0_255(out0, out1);                                       \
+}
+
+#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                      \
+                          vec0, vec1, vec2, vec3, rnd_val,         \
+                          out0, out1, out2, out3)                  \
+{                                                                  \
+    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
+    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
+}
+
+static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
+                                   int32_t src_stride,
+                                   int16_t *src1_ptr,
+                                   int32_t src2_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   int32_t height,
+                                   int32_t weight0,
+                                   int32_t weight1,
+                                   int32_t offset0,
+                                   int32_t offset1,
+                                   int32_t rnd_val)
+{
+    int32_t offset, weight;
+    v16i8 zero = { 0 };
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 in0, in1, dst0;
+        v4i32 dst0_r, dst0_l;
+
+        LD_SB2(src0_ptr, src_stride, src0, src1);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
+
+        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
+        dst0 <<= 6;
+
+        ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
+        dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r,
+                                 (v8i16) weight_vec);
+        dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l,
+                                 (v8i16) weight_vec);
+        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+        dst0_r = CLIP_SW_0_255(dst0_r);
+        dst0_l = CLIP_SW_0_255(dst0_l);
+
+        HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
+        ST4x2_UB(dst0_r, dst, dst_stride);
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1, in2, in3;
+        v8i16 dst0, dst1;
+        v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
+        dst0 <<= 6;
+        dst1 <<= 6;
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+    } else if (0 == height % 8) {
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+        v8i16 dst0, dst1, dst2, dst3;
+        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src0_ptr, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src0_ptr += (8 * src_stride);
+            LD_SH8(src1_ptr, src2_stride,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+            src1_ptr += (8 * src2_stride);
+
+            ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+            ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
+                       src0, src1, src2, src3);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                               in0, in1, in2, in3,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+
+static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
+                                   int32_t src_stride,
+                                   int16_t *src1_ptr,
+                                   int32_t src2_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   int32_t height,
+                                   int32_t weight0,
+                                   int32_t weight1,
+                                   int32_t offset0,
+                                   int32_t offset1,
+                                   int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 offset_vec, weight_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
+                                   int32_t src_stride,
+                                   int16_t *src1_ptr,
+                                   int32_t src2_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   int32_t height,
+                                   int32_t weight0,
+                                   int32_t weight1,
+                                   int32_t offset0,
+                                   int32_t offset1,
+                                   int32_t rnd_val)
+{
+    int32_t offset, weight;
+    v16i8 zero = { 0 };
+    v4i32 offset_vec, weight_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 in0, in1, dst0, dst1;
+        v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+
+        LD_SB2(src0_ptr, src_stride, src0, src1);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+
+        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
+
+        dst0 <<= 6;
+        dst1 <<= 6;
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST8x2_UB(dst0_r, dst, dst_stride);
+    } else if (6 == height) {
+        v16i8 src0, src1, src2, src3, src4, src5;
+        v8i16 in0, in1, in2, in3, in4, in5;
+        v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+        v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+        LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
+        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+        ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        dst4 <<= 6;
+        dst5 <<= 6;
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST8x2_UB(dst2_r, dst, dst_stride);
+    } else if (0 == height % 4) {
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1, in2, in3;
+        v8i16 dst0, dst1, dst2, dst3;
+        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+            src0_ptr += (4 * src_stride);
+            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+            src1_ptr += (4 * src2_stride);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                               in0, in1, in2, in3,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 offset_vec, weight_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (loop_cnt = (16 >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
+
+        dst4 <<= 6;
+        dst5 <<= 6;
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_biwgt_copy_16multx4mult_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val,
+                                             int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    int32_t offset, weight;
+    v16i8 zero = { 0 };
+    v4i32 offset_vec, weight_vec, rnd_vec;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            v16i8 src0, src1, src2, src3;
+            v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+            v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+            v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+            v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+
+            LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, src3);
+            src0_ptr_tmp += (4 * src_stride);
+            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+            LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7);
+            src1_ptr_tmp += (4 * src2_stride);
+
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       tmp0, tmp1, tmp2, tmp3);
+            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       tmp4, tmp5, tmp6, tmp7);
+
+            SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
+            SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
+            HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                               in0, in1, in4, in5,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
+                               in2, in3, in6, in7,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+        }
+
+        src0_ptr += 16;
+        src1_ptr += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 16);
+}
+
+static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 16);
+    hevc_biwgt_copy_8w_msa(src0_ptr + 16, src_stride,
+                           src1_ptr + 16, src2_stride,
+                           dst + 16, dst_stride, height, weight0,
+                           weight1, offset0, offset1, rnd_val);
+}
+
+static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 32);
+}
+
+static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 48);
+}
+
+static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 64);
+}
+
+static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1;
+    v8i16 in0, in1, in2, in3;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+
+    src0_ptr -= 3;
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l, dst2_r, dst3_r, dst2_l, dst3_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hz_biwgt_8t_8w_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                            dst, dst_stride, filter, height,
+                            weight0, weight1, offset0, offset1, rnd_val);
+    hevc_hz_biwgt_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
+                            dst + 8, dst_stride, filter, height,
+                            weight0, weight1, offset0, offset1, rnd_val);
+}
+
+static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, 8, src0, src1);
+        src0_ptr += src_stride;
+        LD_SB2(src0_ptr, 8, src2, src3);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        src1_ptr += src2_stride;
+        LD_SH2(src1_ptr, 8, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint64_t dst_val0;
+    int32_t offset, weight;
+    v16i8 src0, src1;
+    v8i16 in0, in1, in2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2;
+    v4i32 dst0_r, dst1_r, dst2_r, dst0_l, dst1_l, dst2_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src0_ptr = src0_ptr - 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        in2 = LD_SH(src1_ptr + 16);
+        src1_ptr += src2_stride;
+        XORI_B2_128_SB(src0, src1);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+
+        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
+        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
+                                 (v8i16) weight_vec);
+        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
+                                 (v8i16) weight_vec);
+        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
+        dst2_r = CLIP_SW_0_255(dst2_r);
+        dst2_l = CLIP_SW_0_255(dst2_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r);
+        dst_val0 = __msa_copy_u_d((v2i64) dst2_r, 0);
+        ST_SW(dst0_r, dst);
+        SD(dst_val0, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    uint64_t dst_val0;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB3(src0_ptr, 16, src0, src1, src2);
+        src3 = LD_SB(src0_ptr + 40);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        in2 = LD_SH(src1_ptr + 16);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+
+        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
+        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
+                                 (v8i16) weight_vec);
+        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
+                                 (v8i16) weight_vec);
+        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
+        dst2_r = CLIP_SW_0_255(dst2_r);
+        dst2_l = CLIP_SW_0_255(dst2_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r);
+        dst_val0 = __msa_copy_u_d((v2i64) dst2_r, 0);
+        ST_SW(dst0_r, dst);
+        SD(dst_val0, dst + 16);
+
+        LD_SH2(src1_ptr + 24, 8, in3, in4);
+        in5 = LD_SH(src1_ptr + 40);
+        src1_ptr += src2_stride;
+
+        HEVC_BIW_RND_CLIP2(dst3, dst4, in3, in4,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst3_r, dst4_r, dst3_l, dst4_l);
+
+        ILVRL_H2_SW(dst5, in5, dst5_r, dst5_l);
+        dst5_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst5_r,
+                                 (v8i16) weight_vec);
+        dst5_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst5_l,
+                                 (v8i16) weight_vec);
+        SRAR_W2_SW(dst5_r, dst5_l, rnd_vec);
+        dst5_r = CLIP_SW_0_255(dst5_r);
+        dst5_l = CLIP_SW_0_255(dst5_l);
+
+        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
+        HEVC_PCK_SW_SB2(dst3_l, dst3_r, dst3_r);
+        dst_val0 = __msa_copy_u_d((v2i64) dst3_r, 0);
+        SD(dst_val0, dst + 24);
+        ST_SW(dst4_r, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint8_t *src0_ptr_tmp;
+    uint8_t *dst_tmp;
+    int16_t *src1_ptr_tmp;
+    uint32_t loop_cnt, cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+
+        for (cnt = 2; cnt--;) {
+            LD_SB2(src0_ptr_tmp, 16, src0, src1);
+            src2 = LD_SB(src0_ptr_tmp + 24);
+            src0_ptr_tmp += 32;
+            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
+            src1_ptr_tmp += 32;
+            XORI_B3_128_SB(src0, src1, src2);
+
+            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst0 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst0, dst0, dst0, dst0);
+            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                       vec0, vec1, vec2, vec3);
+            dst1 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst1, dst1, dst1, dst1);
+            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst2 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst2, dst2, dst2, dst2);
+            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst3 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst3, dst3, dst3, dst3);
+
+            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                               in0, in1, in2, in3,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST_SW2(dst0_r, dst1_r, dst_tmp, 16);
+            dst_tmp += 32;
+        }
+
+        src0_ptr += src_stride;
+        src1_ptr += src2_stride;
+        dst += dst_stride;
+
+    }
+}
+
+static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, src12, src13, src14;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v16i8 src12111110, src14131312;
+    v8i16 dst10, dst32, dst54, dst76;
+    v4i32 dst10_r, dst32_r, dst54_r, dst76_r;
+    v4i32 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (3 * src_stride);
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src2110, src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src7, src8, src9, src10, src11, src12, src13, src14);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+                   src1110_r, src1211_r, src1312_r, src1413_r);
+        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
+                   src1413_r, src1312_r,
+                   src8776, src10998, src12111110, src14131312);
+        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
+
+        dst10 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1,
+                     filt2, filt3, dst10, dst10, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
+                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
+        dst76 = const_vec;
+        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
+                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
+
+        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst10_r, dst32_r, dst54_r, dst76_r,
+                           dst10_l, dst32_l, dst54_l, dst76_l);
+
+        HEVC_PCK_SW_SB8(dst10_l, dst10_r, dst32_l, dst32_r,
+                        dst54_l, dst54_r, dst76_l, dst76_r, dst10_r, dst54_r);
+        ST4x8_UB(dst10_r, dst54_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        src2110 = src10998;
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;
+    }
+}
+
+static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (3 * src_stride);
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+
+        tmp0 = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
+
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 tmp0, tmp1, tmp2;
+    v16i8 src10_l, src32_l, src54_l, src76_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst0_l, dst1_l, dst2_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (3 * src_stride);
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_l, src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
+               src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src7, src8);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
+        XORI_B2_128_SB(src7, src8);
+
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
+
+        tmp0 = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+
+        HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+
+        ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
+        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
+                                 (v8i16) weight_vec);
+        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
+                                 (v8i16) weight_vec);
+        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
+        dst2_r = CLIP_SW_0_255(dst2_r);
+        dst2_l = CLIP_SW_0_255(dst2_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r);
+        ST8x2_UB(dst0_r, dst, dst_stride);
+        ST4x2_UB(dst2_r, dst + 8, dst_stride);
+        dst += (2 * dst_stride);
+
+        src10_r = src32_r;
+        src32_r = src54_r;
+        src54_r = src76_r;
+        src21_r = src43_r;
+        src43_r = src65_r;
+        src65_r = src87_r;
+        src2110 = src4332;
+        src4332 = src6554;
+        src6554 = src8776;
+        src6 = src8;
+    }
+}
+
+static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
+                                              int32_t src_stride,
+                                              int16_t *src1_ptr,
+                                              int32_t src2_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height,
+                                              int32_t weight0,
+                                              int32_t weight1,
+                                              int32_t offset0,
+                                              int32_t offset1,
+                                              int32_t rnd_val,
+                                              int32_t width)
+{
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (3 * src_stride);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+
+        LD_SB7(src0_ptr_tmp, src_stride,
+               src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr_tmp += (7 * src_stride);
+
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_r, src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_l, src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 1); loop_cnt--;) {
+            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
+            src0_ptr_tmp += (2 * src_stride);
+            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
+            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
+            src1_ptr_tmp += (2 * src2_stride);
+
+            XORI_B2_128_SB(src7, src8);
+            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+            tmp0 = const_vec;
+            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                         filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
+            tmp1 = const_vec;
+            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                         filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
+            tmp2 = const_vec;
+            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
+                         filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+            tmp3 = const_vec;
+            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
+                         filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
+
+            HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                               in0, in1, in2, in3,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            src10_r = src32_r;
+            src32_r = src54_r;
+            src54_r = src76_r;
+            src21_r = src43_r;
+            src43_r = src65_r;
+            src65_r = src87_r;
+            src10_l = src32_l;
+            src32_l = src54_l;
+            src54_l = src76_l;
+            src21_l = src43_l;
+            src43_l = src65_l;
+            src65_l = src87_l;
+            src6 = src8;
+        }
+
+        src0_ptr += 16;
+        src1_ptr += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 16);
+}
+
+static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 16);
+    hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride,
+                            src1_ptr + 16, src2_stride,
+                            dst + 16, dst_stride, filter, height,
+                            weight0, weight1, offset0, offset1, rnd_val);
+}
+
+static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 32);
+}
+
+static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 48);
+}
+
+static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 64);
+}
+
+static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
+    v4i32 dst0_r, dst1_r;
+    v4i32 tmp1, tmp2;
+    v4i32 weight_vec0, weight_vec1, offset_vec, rnd_vec;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+    src0_ptr -= ((3 * src_stride) + 3);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec0 = __msa_fill_w(weight0);
+    weight_vec1 = __msa_fill_w(weight1);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+
+    dst30 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                 dst30, dst30, dst30, dst30);
+    dst41 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                 dst41, dst41, dst41, dst41);
+    dst52 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                 dst52, dst52, dst52, dst52);
+    dst63 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                 dst63, dst63, dst63, dst63);
+
+    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
+               dst10_r, dst21_r, dst32_r);
+    dst43_r = __msa_ilvl_h(dst41, dst30);
+    dst54_r = __msa_ilvl_h(dst52, dst41);
+    dst65_r = __msa_ilvl_h(dst63, dst52);
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src7, src8);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        src1_ptr += (2 * src2_stride);
+
+        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+        XORI_B2_128_SB(src7, src8);
+
+        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst87 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst87, dst87, dst87, dst87);
+        dst76_r = __msa_ilvr_h(dst87, dst66);
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+
+        dst0_r >>= 6;
+        dst1_r >>= 6;
+
+        ILVRL_H2_SW(in0, in0, tmp1, tmp2);
+        tmp1 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp1, (v8i16) weight_vec0);
+        tmp2 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp2, (v8i16) weight_vec0);
+        tmp1 += dst0_r * weight_vec1;
+        tmp2 += dst1_r * weight_vec1;
+        SRAR_W2_SW(tmp1, tmp2, rnd_vec);
+        tmp1 = CLIP_SW_0_255(tmp1);
+        tmp2 = CLIP_SW_0_255(tmp2);
+
+        HEVC_PCK_SW_SB2(tmp2, tmp1, tmp1);
+        ST4x2_UB(tmp1, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        dst10_r = dst32_r;
+        dst32_r = dst54_r;
+        dst54_r = dst76_r;
+        dst21_r = dst43_r;
+        dst43_r = dst65_r;
+        dst65_r = dst87_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
+    }
+}
+
+static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter_x,
+                                             const int8_t *filter_y,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val,
+                                             int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    int32_t offset;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v4i32 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+    v4i32 weight_vec0, weight_vec1, offset_vec, rnd_vec;
+
+    src0_ptr -= ((3 * src_stride) + 3);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec0 = __msa_fill_w(weight0);
+    weight_vec1 = __msa_fill_w(weight1);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+
+        LD_SB7(src0_ptr_tmp, src_stride,
+               src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr_tmp += (7 * src_stride);
+
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        /* row 0 row 1 row 2 row 3 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        /* row 4 row 5 row 6 */
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+
+        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_r, dst32_r, dst54_r, dst21_r);
+        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_l, dst32_l, dst54_l, dst21_l);
+        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
+            XORI_B2_128_SB(src7, src8);
+            src0_ptr_tmp += 2 * src_stride;
+
+            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
+            src1_ptr_tmp += (2 * src2_stride);
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst7, dst7, dst7, dst7);
+
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            /* row 8 */
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+
+            dst8 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst8, dst8, dst8, dst8);
+
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            ILVRL_H2_SW(in0, in0, tmp0, tmp1);
+            ILVRL_H2_SW(in1, in1, tmp2, tmp3);
+            tmp0 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp0,
+                                   (v8i16) weight_vec0);
+            tmp1 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp1,
+                                   (v8i16) weight_vec0);
+            tmp2 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp2,
+                                   (v8i16) weight_vec0);
+            tmp3 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp3,
+                                   (v8i16) weight_vec0);
+
+            tmp0 += (dst0_r * weight_vec1);
+            tmp1 += (dst0_l * weight_vec1);
+            tmp2 += (dst1_r * weight_vec1);
+            tmp3 += (dst1_l * weight_vec1);
+
+            SRAR_W4_SW(tmp0, tmp1, tmp2, tmp3, rnd_vec);
+            tmp0 = CLIP_SW_0_255(tmp0);
+            tmp1 = CLIP_SW_0_255(tmp1);
+            tmp2 = CLIP_SW_0_255(tmp2);
+            tmp3 = CLIP_SW_0_255(tmp3);
+            HEVC_PCK_SW_SB4(tmp1, tmp0, tmp3, tmp2, dst0_r);
+            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            dst10_r = dst32_r;
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+            dst6 = dst8;
+        }
+
+        src0_ptr += 8;
+        src1_ptr += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 8);
+}
+
+static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 8);
+    hevc_hv_biwgt_8t_4w_msa(src0_ptr + 8, src_stride,
+                            src1_ptr + 8, src2_stride,
+                            dst + 8, dst_stride, filter_x, filter_y,
+                            height, weight0, weight1, offset0, offset1,
+                            rnd_val);
+}
+
+static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 16);
+}
+
+static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 24);
+}
+
+static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 32);
+}
+
+static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 48);
+}
+
+static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 64);
+}
+
+static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1;
+    v8i16 in0, in1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0;
+    v4i32 dst0_r, dst0_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB2(src0_ptr, src_stride, src0, src1);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+    XORI_B2_128_SB(src0, src1);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+    ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
+    dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
+    dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
+    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+    dst0_r = CLIP_SW_0_255(dst0_r);
+    dst0_l = CLIP_SW_0_255(dst0_l);
+
+    HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
+    ST4x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    v16i8 mask1;
+    v8i16 dst0, dst1;
+    v16i8 vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    /* rearranging filter */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst0_l, dst1_l);
+
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t weight, offset;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src0, src1, src2, src3, src4, src5, src6, src7);
+        src0_ptr += (8 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (4 == height) {
+        hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (0 == (height % 8)) {
+        hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter, height,
+                                         weight0, weight1, offset0, offset1,
+                                         rnd_val);
+    }
+}
+
+static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1;
+    v8i16 in0, in1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB2(src0_ptr, src_stride, src0, src1);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src0, src1);
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst0_l, dst1_l);
+
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t weight, offset;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
+
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    src1_ptr += (4 * src2_stride);
+    LD_SH2(src1_ptr, src2_stride, in4, in5);
+    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+    HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                       in0, in1, in2, in3,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst2_r, dst3_r,
+                       dst0_l, dst1_l, dst2_l, dst3_l);
+    HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst4_r, dst5_r, dst4_l, dst5_l);
+
+    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                     dst2_l, dst2_r, dst3_l, dst3_r,
+                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (6 == height) {
+        hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (0 == (height % 4)) {
+        hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter, height,
+                                         weight0, weight1, offset0, offset1,
+                                         rnd_val);
+    }
+}
+
+static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask2 = {
+        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+    };
+    v16i8 mask1, mask3;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+    mask3 = mask2 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
+        LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
+        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
+        src1_ptr += (4 * src2_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
+                           in4, in5, in6, in7,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    uint8_t *dst_tmp = dst + 16;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src0, src2);
+        LD_SB2(src0_ptr + 16, src_stride, src1, src3);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in2);
+        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
+        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+        /* 8 width */
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST8x2_UB(dst0_r, dst_tmp, dst_stride);
+        dst_tmp += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t weight, offset;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, dst10;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v4i32 dst10_r, dst10_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    src0_ptr += (2 * src_stride);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    src1_ptr += (2 * src2_stride);
+
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+
+    ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
+    dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
+    dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
+    SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
+    dst10_r = CLIP_SW_0_255(dst10_r);
+    dst10_l = CLIP_SW_0_255(dst10_l);
+
+    HEVC_PCK_SW_SB2(dst10_l, dst10_r, dst10_r);
+    ST4x2_UB(dst10_r, dst, dst_stride);
+}
+
+static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t weight, offset;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst10, dst32;
+    v4i32 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+    src0_ptr += (4 * src_stride);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    src1_ptr += (4 * src2_stride);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
+    XORI_B2_128_SB(src4332, src6554);
+
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    dst32 = const_vec;
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+
+    HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst10_r, dst32_r, dst10_l, dst32_l);
+
+    HEVC_PCK_SW_SB4(dst10_l, dst10_r, dst32_l, dst32_r, dst10_r);
+    ST4x4_UB(dst10_r, dst10_r, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t weight, offset;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 dst10, dst32, dst54, dst76;
+    v4i32 dst10_r, dst32_r, dst54_r, dst76_r;
+    v4i32 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
+        src0_ptr += (6 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
+                   src4332, src6554, src8776);
+        XORI_B3_128_SB(src4332, src6554, src8776);
+
+        dst10 = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
+
+        LD_SB2(src0_ptr, src_stride, src9, src2);
+        src0_ptr += (2 * src_stride);
+        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+        dst76 = const_vec;
+        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
+        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst10_r, dst32_r, dst54_r, dst76_r,
+                           dst10_l, dst32_l, dst54_l, dst76_l);
+
+        HEVC_PCK_SW_SB8(dst10_l, dst10_r, dst32_l, dst32_r,
+                        dst54_l, dst54_r, dst76_l, dst76_r, dst10_r, dst54_r);
+        ST4x8_UB(dst10_r, dst54_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (4 == height) {
+        hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (0 == (height % 8)) {
+        hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter, height,
+                                         weight0, weight1, offset0, offset1,
+                                         rnd_val);
+    }
+}
+
+static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+
+        LD_SB2(src0_ptr, src_stride, src1, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src1, src2);
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, tmp0, tmp1;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+    tmp0 = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+    tmp1 = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+    HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst0_l, dst1_l);
+
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
+    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    tmp0 = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+    tmp1 = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+    tmp2 = const_vec;
+    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2);
+    tmp3 = const_vec;
+    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, tmp3, tmp3);
+    tmp4 = const_vec;
+    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4);
+    tmp5 = const_vec;
+    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5);
+    HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                       in0, in1, in2, in3,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst2_r, dst3_r,
+                       dst0_l, dst1_l, dst2_l, dst3_l);
+    HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst4_r, dst5_r, dst4_l, dst5_l);
+
+    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                     dst2_l, dst2_r, dst3_l, dst3_r,
+                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+
+        LD_SB2(src0_ptr, src_stride, src1, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src1, src2);
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else if (6 == height) {
+        hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    } else {
+        hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter, height,
+                                         weight0, weight1, offset0, offset1,
+                                         rnd_val);
+    }
+}
+
+static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+    src0_ptr -= (1 * src_stride);
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        XORI_B2_128_SB(src3, src4);
+
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4);
+
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
+
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3);
+
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        src0_ptr += (2 * src_stride);
+
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    /* 16width */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* 8width */
+    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        /* 8width */
+        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        /* 16width */
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
+        /* 8width */
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
+        /* 16width */
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        /* 8width */
+        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+        /* 16width */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        /* 8width */
+        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst4_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+        /* 8width */
+        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+        /* 16width */
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5);
+        /* 8width */
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3);
+        /* 16width */
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        /* 8width */
+        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+        /* 16width */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+
+        /* 8width */
+        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst4_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint8_t *dst_tmp = dst + 16;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l;
+
+    src0_ptr -= src_stride;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    /* 16width */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* next 16width */
+    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        /* 16width */
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
+        /* 16width */
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        /* 16width */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        src10_r = src32_r;
+        src21_r = src43_r;
+        src10_l = src32_l;
+        src21_l = src43_l;
+        src2 = src4;
+
+        /* next 16width */
+        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
+        LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+        /* next 16width */
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
+        tmp6 = const_vec;
+        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
+        tmp7 = const_vec;
+        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7);
+        /* next 16width */
+        HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
+                           in4, in5, in6, in7,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst6_r, dst7_r,
+                           dst4_l, dst5_l, dst6_l, dst7_l);
+
+        /* next 16width */
+        HEVC_PCK_SW_SB8(dst4_l, dst4_r, dst6_l, dst6_r,
+                        dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r);
+        ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride);
+        dst_tmp += (2 * dst_stride);
+
+        src76_r = src98_r;
+        src87_r = src109_r;
+        src76_l = src98_l;
+        src87_l = src109_l;
+        src8 = src10;
+    }
+}
+
+static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight;
+    v8i16 in0, in1;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst1_r, dst0_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+    XORI_B2_128_SB(src3, src4);
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
+
+    ILVRL_H2_SW(dst1_r, in0, dst0_r, dst0_l);
+    dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
+    dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
+    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+    dst0_r = CLIP_SW_0_255(dst0_r);
+    dst0_l = CLIP_SW_0_255(dst0_l);
+
+    HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
+    ST4x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 tmp0, tmp1;
+    v4i32 dst0_l, dst1_l;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    XORI_B4_128_SB(src3, src4, src5, src6);
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+    dst10_r = __msa_ilvr_h(dst5, dst4);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+    dst21_r = __msa_ilvr_h(dst2, dst5);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
+    HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst0_l, dst1_l);
+
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter_x,
+                                             const int8_t *filter_y,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        /* row 3 */
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        dst32_r = __msa_ilvr_h(dst3, dst2);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        /* row 4 */
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        dst43_r = __msa_ilvr_h(dst4, dst3);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        /* row 5 */
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        dst54_r = __msa_ilvr_h(dst5, dst4);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_r >>= 6;
+        /* row 6 */
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+        dst65_r = __msa_ilvr_h(dst6, dst5);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_r >>= 6;
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+        dst76_r = __msa_ilvr_h(dst7, dst6);
+        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst4_r >>= 6;
+        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+        dst8 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+        dst87_r = __msa_ilvr_h(dst8, dst7);
+        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst5_r >>= 6;
+        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
+        dst9 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
+        dst10_r = __msa_ilvr_h(dst9, dst8);
+        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
+        dst6_r >>= 6;
+        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        dst21_r = __msa_ilvr_h(dst2, dst9);
+        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
+        dst7_r >>= 6;
+        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
+                    dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter_x, filter_y,
+                                 height, weight0, weight1, offset0, offset1,
+                                 rnd_val);
+    } else if (4 == height) {
+        hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter_x, filter_y,
+                                 height, weight0, weight1, offset0, offset1,
+                                 rnd_val);
+    } else if (0 == (height % 8)) {
+        hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter_x, filter_y,
+                                         height, weight0, weight1,
+                                         offset0, offset1, rnd_val);
+    }
+}
+
+static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        dst1_l >>= 6;
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+        dst2_r >>= 6;
+        dst2_l >>= 6;
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+        dst3_r >>= 6;
+        dst3_l >>= 6;
+        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                    dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t weight, offset;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v8i16 in0, in1;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 tmp0, tmp1;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src3, src4);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+    tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+    tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
+
+    HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst0_l, dst1_l);
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    src0_ptr += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+    tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+    tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
+
+    LD_SB2(src0_ptr, src_stride, src5, src6);
+    src0_ptr += (2 * src_stride);
+    XORI_B2_128_SB(src5, src6);
+
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    dst2_l >>= 6;
+    tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
+
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst6 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    dst3_l >>= 6;
+    tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r);
+
+    HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                       in0, in1, in2, in3,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst2_r, dst3_r,
+                       dst0_l, dst1_l, dst2_l, dst3_l);
+
+    HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_SB2(src0_ptr, src_stride, src7, src8);
+    XORI_B2_128_SB(src7, src8);
+
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    dst7 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+    dst4_r >>= 6;
+    dst4_l >>= 6;
+    tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r);
+
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+    dst8 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+    dst5_r >>= 6;
+    dst5_l >>= 6;
+    tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r);
+
+    HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst4_r, dst5_r, dst4_l, dst5_l);
+
+    HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst2_r);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter_x,
+                                             const int8_t *filter_y,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val,
+                                             int32_t width)
+{
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    int32_t offset, weight;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (cnt = width >> 3; cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+
+        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
+        src0_ptr_tmp += (3 * src_stride);
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
+            src0_ptr_tmp += (4 * src_stride);
+            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+            src1_ptr_tmp += (4 * src2_stride);
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            dst3 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+            dst4 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+            dst5 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+            dst2_r >>= 6;
+            dst2_l >>= 6;
+
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+            dst2 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+            dst3_r >>= 6;
+            dst3_l >>= 6;
+
+            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+            HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                               in0, in1, in2, in3,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+        }
+
+        src0_ptr += 8;
+        dst += 8;
+        src1_ptr += 8;
+    }
+}
+
+static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter_x, filter_y,
+                                 height, weight0, weight1, offset0, offset1,
+                                 rnd_val);
+    } else if (6 == height) {
+        hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter_x, filter_y,
+                                 height, weight0, weight1, offset0, offset1,
+                                 rnd_val);
+    } else if (0 == (height % 4)) {
+        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter_x, filter_y,
+                                         height, weight0,
+                                         weight1, offset0, offset1, rnd_val, 8);
+    }
+}
+
+static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride,
+                                     filter_x, filter_y, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 8);
+
+    hevc_hv_biwgt_4t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
+                            dst + 8, dst_stride, filter_x, filter_y,
+                            height, weight0, weight1, offset0,
+                            offset1, rnd_val);
+}
+
+static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride,
+                                     filter_x, filter_y, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 16);
+}
+
+static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride,
+                                     filter_x, filter_y, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 24);
+}
+
+static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride,
+                                     filter_x, filter_y, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 32);
+}
+
+#define BI_W_MC_COPY(WIDTH)                                                  \
+void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,           \
+                                                     ptrdiff_t dst_stride,   \
+                                                     uint8_t *src,           \
+                                                     ptrdiff_t src_stride,   \
+                                                     int16_t *src_16bit,     \
+                                                     int height,             \
+                                                     int denom,              \
+                                                     int weight0,            \
+                                                     int weight1,            \
+                                                     int offset0,            \
+                                                     int offset1,            \
+                                                     intptr_t mx,            \
+                                                     intptr_t my,            \
+                                                     int width)              \
+{                                                                            \
+    int shift = 14 + 1 - 8;                                                  \
+    int log2Wd = denom + shift - 1;                                          \
+                                                                             \
+    hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
+                                   dst, dst_stride, height,                  \
+                                   weight0, weight1, offset0,                \
+                                   offset1, log2Wd);                         \
+}
+
+BI_W_MC_COPY(4);
+BI_W_MC_COPY(6);
+BI_W_MC_COPY(8);
+BI_W_MC_COPY(12);
+BI_W_MC_COPY(16);
+BI_W_MC_COPY(24);
+BI_W_MC_COPY(32);
+BI_W_MC_COPY(48);
+BI_W_MC_COPY(64);
+
+#undef BI_W_MC_COPY
+
+#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
+void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
+                                                          ptrdiff_t            \
+                                                          dst_stride,          \
+                                                          uint8_t *src,        \
+                                                          ptrdiff_t            \
+                                                          src_stride,          \
+                                                          int16_t *src_16bit,  \
+                                                          int height,          \
+                                                          int denom,           \
+                                                          int weight0,         \
+                                                          int weight1,         \
+                                                          int offset0,         \
+                                                          int offset1,         \
+                                                          intptr_t mx,         \
+                                                          intptr_t my,         \
+                                                          int width)           \
+{                                                                              \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
+                                                                               \
+    int shift = 14 + 1 - 8;                                                    \
+    int log2Wd = denom + shift - 1;                                            \
+                                                                               \
+    hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride,               \
+                                                src_16bit, MAX_PB_SIZE,        \
+                                                dst, dst_stride,               \
+                                                filter, height,                \
+                                                weight0, weight1, offset0,     \
+                                                offset1, log2Wd);              \
+}
+
+BI_W_MC(qpel, h, 4, 8, hz, mx);
+BI_W_MC(qpel, h, 8, 8, hz, mx);
+BI_W_MC(qpel, h, 12, 8, hz, mx);
+BI_W_MC(qpel, h, 16, 8, hz, mx);
+BI_W_MC(qpel, h, 24, 8, hz, mx);
+BI_W_MC(qpel, h, 32, 8, hz, mx);
+BI_W_MC(qpel, h, 48, 8, hz, mx);
+BI_W_MC(qpel, h, 64, 8, hz, mx);
+
+BI_W_MC(qpel, v, 4, 8, vt, my);
+BI_W_MC(qpel, v, 8, 8, vt, my);
+BI_W_MC(qpel, v, 12, 8, vt, my);
+BI_W_MC(qpel, v, 16, 8, vt, my);
+BI_W_MC(qpel, v, 24, 8, vt, my);
+BI_W_MC(qpel, v, 32, 8, vt, my);
+BI_W_MC(qpel, v, 48, 8, vt, my);
+BI_W_MC(qpel, v, 64, 8, vt, my);
+
+BI_W_MC(epel, h, 4, 4, hz, mx);
+BI_W_MC(epel, h, 8, 4, hz, mx);
+BI_W_MC(epel, h, 6, 4, hz, mx);
+BI_W_MC(epel, h, 12, 4, hz, mx);
+BI_W_MC(epel, h, 16, 4, hz, mx);
+BI_W_MC(epel, h, 24, 4, hz, mx);
+BI_W_MC(epel, h, 32, 4, hz, mx);
+
+BI_W_MC(epel, v, 4, 4, vt, my);
+BI_W_MC(epel, v, 8, 4, vt, my);
+BI_W_MC(epel, v, 6, 4, vt, my);
+BI_W_MC(epel, v, 12, 4, vt, my);
+BI_W_MC(epel, v, 16, 4, vt, my);
+BI_W_MC(epel, v, 24, 4, vt, my);
+BI_W_MC(epel, v, 32, 4, vt, my);
+
+#undef BI_W_MC
+
+#define BI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                                 \
+void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
+                                                          ptrdiff_t            \
+                                                          dst_stride,          \
+                                                          uint8_t *src,        \
+                                                          ptrdiff_t            \
+                                                          src_stride,          \
+                                                          int16_t *src_16bit,  \
+                                                          int height,          \
+                                                          int denom,           \
+                                                          int weight0,         \
+                                                          int weight1,         \
+                                                          int offset0,         \
+                                                          int offset1,         \
+                                                          intptr_t mx,         \
+                                                          intptr_t my,         \
+                                                          int width)           \
+{                                                                              \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                  \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                  \
+                                                                               \
+    int shift = 14 + 1 - 8;                                                    \
+    int log2Wd = denom + shift - 1;                                            \
+                                                                               \
+    hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride,               \
+                                                src_16bit, MAX_PB_SIZE,        \
+                                                dst, dst_stride,               \
+                                                filter_x, filter_y,            \
+                                                height, weight0, weight1,      \
+                                                offset0, offset1, log2Wd);     \
+}
+
+BI_W_MC_HV(qpel, hv, 4, 8, hv);
+BI_W_MC_HV(qpel, hv, 8, 8, hv);
+BI_W_MC_HV(qpel, hv, 12, 8, hv);
+BI_W_MC_HV(qpel, hv, 16, 8, hv);
+BI_W_MC_HV(qpel, hv, 24, 8, hv);
+BI_W_MC_HV(qpel, hv, 32, 8, hv);
+BI_W_MC_HV(qpel, hv, 48, 8, hv);
+BI_W_MC_HV(qpel, hv, 64, 8, hv);
+
+BI_W_MC_HV(epel, hv, 4, 4, hv);
+BI_W_MC_HV(epel, hv, 8, 4, hv);
+BI_W_MC_HV(epel, hv, 6, 4, hv);
+BI_W_MC_HV(epel, hv, 12, 4, hv);
+BI_W_MC_HV(epel, hv, 16, 4, hv);
+BI_W_MC_HV(epel, hv, 24, 4, hv);
+BI_W_MC_HV(epel, hv, 32, 4, hv);
+
+#undef BI_W_MC_HV
diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c
new file mode 100644
index 0000000000..754fbdbb41
--- /dev/null
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
@@ -0,0 +1,3964 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+
+static void copy_width8_msa(uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        for (cnt = height >> 3; cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 2) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+
+            SD(out0, dst);
+            dst += dst_stride;
+            SD(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void copy_width12_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+    dst += (8 * dst_stride);
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+}
+
+static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t cnt, loop_cnt;
+    uint8_t *src_tmp, *dst_tmp;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_UB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst_tmp, dst_stride);
+            dst_tmp += (8 * dst_stride);
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void copy_width16_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst, dst_stride);
+            dst += (8 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void copy_width24_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    copy_width8_msa(src + 16, src_stride, dst + 16, dst_stride, height);
+}
+
+static void copy_width32_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void copy_width48_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 48);
+}
+
+static void copy_width64_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \
+                            filt0, filt1, filt2, filt3)         \
+( {                                                             \
+    v8i16 tmp0, tmp1;                                           \
+                                                                \
+    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
+    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
+    tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);         \
+    tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);  \
+    tmp0 = __msa_adds_s_h(tmp0, tmp1);                          \
+                                                                \
+    tmp0;                                                       \
+} )
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
+                                   mask0, mask1, mask2, mask3,              \
+                                   filt0, filt1, filt2, filt3,              \
+                                   out0, out1)                              \
+{                                                                           \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m,  vec4_m, vec5_m, vec6_m, vec7_m;  \
+    v8i16 res0_m, res1_m, res2_m, res3_m;                                   \
+                                                                            \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);       \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);              \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);       \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);       \
+    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);              \
+    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);       \
+    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);             \
+    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                \
+}
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, mask2, mask3,                \
+                                   filt0, filt1, filt2, filt3,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                res0_m, res1_m, res2_m, res3_m);                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,   \
+                res4_m, res5_m, res6_m, res7_m);                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \
+                 res0_m, res1_m, res2_m, res3_m);                             \
+    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \
+                 res4_m, res5_m, res6_m, res7_m);                             \
+    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,       \
+                res7_m, out0, out1, out2, out3);                              \
+}
+
+#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)           \
+( {                                                             \
+    v8i16 tmp0;                                                 \
+                                                                \
+    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
+    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
+                                                                \
+    tmp0;                                                       \
+} )
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, filt0, filt1,         \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
+                                                                       \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+}
+
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, filt0, filt1,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+}
+
+static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    SRAR_H2_SH(out0, out1, rnd_vec);
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter, uint8_t rnd_val)
+{
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height, uint8_t rnd_val)
+{
+    if (4 == height) {
+        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else if (8 == height) {
+        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else if (16 == height) {
+        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter,
+                              rnd_val);
+    }
+}
+
+static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+}
+
+static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height,
+                                     uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    if (4 == height) {
+        common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else {
+        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height, rnd_val);
+    }
+}
+
+static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint8_t *src1_ptr, *dst1;
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3;
+    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00, tmp0, tmp1;
+    v8i16 rnd_vec;
+
+    mask00 = LD_UB(&mc_filt_mask_arr[0]);
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    src1_ptr = src - 3;
+    dst1 = dst;
+
+    dst = dst1 + 8;
+    src = src1_ptr + 8;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask00 + 2;
+    mask2 = mask00 + 4;
+    mask3 = mask00 + 6;
+    mask4 = mask0 + 2;
+    mask5 = mask0 + 4;
+    mask6 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 8 width */
+        LD_SB4(src1_ptr, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src1_ptr += (4 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask00, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
+        dst1 += (4 * dst_stride);
+
+        /* 4 width */
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask4, mask5,
+                                   mask6, filt0, filt1, filt2, filt3, out0,
+                                   out1);
+        SRAR_H2_SH(out0, out1, rnd_vec);
+        SAT_SH2_SH(out0, out1, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 8, src_stride, src1, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (2 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v16i8 vec11;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10;
+    v8i16 out11, filt;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 16, src_stride, src1, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (2 * src_stride);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
+        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
+        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
+                    out8, out2, out9);
+        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
+        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
+        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2, out4,
+                    out10, out6, out11);
+        DOTP_SB2_SH(vec1, vec3, filt2, filt2, out5, out7);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
+        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
+        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
+                     out0, out8, out2, out9);
+        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
+        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
+        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
+        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
+                     out4, out10, out6, out11);
+        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out5, out7);
+        ADDS_SH4_SH(out0, out4, out8, out10, out2, out6, out9, out11, out0,
+                    out8, out2, out9);
+        ADDS_SH2_SH(out1, out5, out3, out7, out1, out3);
+        SRAR_H4_SH(out0, out8, out2, out9, rnd_vec);
+        SRAR_H2_SH(out1, out3, rnd_vec);
+        SAT_SH4_SH(out0, out8, out2, out9, 7);
+        SAT_SH2_SH(out1, out3, 7);
+        out = PCKEV_XORI128_UB(out8, out9);
+        ST8x2_UB(out, dst + 16, dst_stride);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
+    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB3(src, 16, src0, src2, src3);
+        src1 = __msa_sldi_b(src2, src0, 8);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
+                   vec0, vec1, vec2);
+        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
+        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
+        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
+        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
+                   vec0, vec1, vec2);
+        DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
+        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
+        out5 = __msa_dpadd_s_h(out5, vec2, filt3);
+        ADDS_SH2_SH(out0, out3, out1, out4, out0, out1);
+        out2 = __msa_adds_s_h(out2, out5);
+        SRAR_H2_SH(out0, out1, rnd_vec);
+        out6 = __msa_srar_h(out2, rnd_vec);
+        SAT_SH3_SH(out0, out1, out6, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+
+        src1 = LD_SB(src + 40);
+        src += src_stride;
+        src1 = (v16i8) __msa_xori_b((v16u8) src1, 128);
+
+        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask4, mask0, mask0,
+                   vec0, vec1, vec2);
+        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
+        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask5, mask1, mask1,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
+        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
+        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask6, mask2, mask2,
+                   vec0, vec1, vec2);
+        DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
+        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask7, mask3, mask3,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
+        out5 = __msa_dpadd_s_h(out5, vec2, filt3);
+        ADDS_SH2_SH(out0, out3, out1, out4, out3, out4);
+        out5 = __msa_adds_s_h(out2, out5);
+        SRAR_H3_SH(out3, out4, out5, rnd_vec);
+        SAT_SH3_SH(out3, out4, out5, 7);
+        out = PCKEV_XORI128_UB(out6, out3);
+        ST_UB(out, dst + 16);
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                                   mask2, mask3, filt0, filt1, filt2, filt3,
+                                   out0, out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+
+        src0 = LD_SB(src + 32);
+        src2 = LD_SB(src + 48);
+        src3 = LD_SB(src + 56);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                                   mask2, mask3, filt0, filt1, filt2, filt3,
+                                   out0, out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst + 32);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 48);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+    v16i8 src10998, filt0, filt1, filt2, filt3;
+    v16u8 out;
+    v8i16 filt, out10, out32;
+    v8i16 rnd_vec;
+
+    src -= (3 * src_stride);
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+               src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+        XORI_B2_128_SB(src8776, src10998);
+        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                    filt1, filt2, filt3);
+        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                    filt1, filt2, filt3);
+        SRAR_H2_SH(out10, out32, rnd_vec);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+    v8i16 rnd_vec;
+
+    src -= (3 * src_stride);
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                     filt1, filt2, filt3);
+        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                     filt1, filt2, filt3);
+        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                     filt1, filt2, filt3);
+        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                     filt1, filt2, filt3);
+        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    int32_t loop_cnt;
+    uint32_t out2, out3;
+    uint64_t out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
+    v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2;
+    v8i16 filt, filt0, filt1, filt2, filt3;
+    v8i16 rnd_vec;
+    v4i32 mask = { 2, 6, 2, 6 };
+
+    src -= (3 * src_stride);
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    /* 4 width */
+    VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
+    VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
+    VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src7, src8);
+        XORI_B2_128_SB(src7, src8);
+        src += (2 * src_stride);
+
+        ILVR_B4_SH(src1, src0, src3, src2, src5, src4, src7, src6,
+                   vec01, vec23, vec45, vec67);
+        tmp0 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
+                                   filt2, filt3);
+        ILVR_B4_SH(src2, src1, src4, src3, src6, src5, src8, src7, vec01, vec23,
+                   vec45, vec67);
+        tmp1 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
+                                   filt2, filt3);
+
+        /* 4 width */
+        VSHF_W2_SB(src6, src7, src7, src8, mask, mask, vec6, vec7);
+        ILVR_B4_SH(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec01, vec23,
+                   vec45, vec67);
+        tmp2 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
+                                   filt2, filt3);
+        SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec);
+        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
+        PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2);
+        XORI_B3_128_SB(res0, res1, res2);
+
+        out0 = __msa_copy_u_d((v2i64) res0, 0);
+        out1 = __msa_copy_u_d((v2i64) res1, 0);
+        out2 = __msa_copy_u_w((v4i32) res2, 0);
+        out3 = __msa_copy_u_w((v4i32) res2, 1);
+        SD(out0, dst);
+        SW(out2, (dst + 8));
+        dst += dst_stride;
+        SD(out1, dst);
+        SW(out3, (dst + 8));
+        dst += dst_stride;
+
+        src0 = src2;
+        src1 = src3;
+        src2 = src4;
+        src3 = src5;
+        src4 = src6;
+        src5 = src7;
+        src6 = src8;
+        vec0 = vec2;
+        vec1 = vec3;
+        vec2 = vec4;
+        vec3 = vec5;
+        vec4 = vec6;
+        vec5 = vec7;
+    }
+}
+
+static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v8i16 rnd_vec;
+
+    src -= (3 * src_stride);
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+               src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                   src87_l, src98_l, src109_l);
+        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                     filt1, filt2, filt3);
+        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                     filt1, filt2, filt3);
+        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                     filt1, filt2, filt3);
+        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                     filt1, filt2, filt3);
+        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                     filt1, filt2, filt3);
+        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                     filt1, filt2, filt3);
+        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                     filt1, filt2, filt3);
+        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                     filt1, filt2, filt3);
+        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+        SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src54_l = src98_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src65_l = src109_l;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter, int32_t height,
+                                      uint8_t rnd_val, int32_t width)
+{
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v8i16 rnd_vec;
+
+    src -= (3 * src_stride);
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+                   src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+                   src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+            XORI_B4_128_SB(src7, src8, src9, src10);
+            src_tmp += (4 * src_stride);
+            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                       src87_r, src98_r, src109_r);
+            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                       src87_l, src98_l, src109_l);
+            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
+                                         filt0, filt1, filt2, filt3);
+            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
+                                         filt0, filt1, filt2, filt3);
+            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
+                                         filt0, filt1, filt2, filt3);
+            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
+                                         filt0, filt1, filt2, filt3);
+            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
+                                         filt0, filt1, filt2, filt3);
+            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
+                                         filt0, filt1, filt2, filt3);
+            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
+                                         filt0, filt1, filt2, filt3);
+            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
+                                         filt0, filt1, filt2, filt3);
+            SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+            SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
+            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                        out3_r, tmp0, tmp1, tmp2, tmp3);
+            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            src10_r = src54_r;
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l;
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height, uint8_t rnd_val)
+{
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              rnd_val, 16);
+
+    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
+                        height, rnd_val);
+}
+
+static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height, uint8_t rnd_val)
+{
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              rnd_val, 32);
+}
+
+static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height, uint8_t rnd_val)
+{
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              rnd_val, 48);
+}
+
+static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height, uint8_t rnd_val)
+{
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              rnd_val, 64);
+}
+
+static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
+    v4i32 dst0_r, dst1_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+    src -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+
+    dst30 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                 dst30, dst30, dst30, dst30);
+    dst41 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                 dst41, dst41, dst41, dst41);
+    dst52 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                 dst52, dst52, dst52, dst52);
+    dst63 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                 dst63, dst63, dst63, dst63);
+
+    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
+               dst10_r, dst21_r, dst32_r);
+    dst43_r = __msa_ilvl_h(dst41, dst30);
+    dst54_r = __msa_ilvl_h(dst52, dst41);
+    dst65_r = __msa_ilvl_h(dst63, dst52);
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        LD_SB2(src, src_stride, src7, src8);
+        src += 2 * src_stride;
+        XORI_B2_128_SB(src7, src8);
+
+        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst87 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst87, dst87, dst87, dst87);
+
+        dst76_r = __msa_ilvr_h(dst87, dst66);
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst87_r = __msa_vshf_h(mask4, dst87, dst87);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+
+        dst0_r >>= 6;
+        dst1_r >>= 6;
+        SRARI_W2_SW(dst0_r, dst1_r, 6);
+        dst0_r = CLIP_SW_0_255(dst0_r);
+        dst1_r = CLIP_SW_0_255(dst1_r);
+
+        HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
+        ST4x2_UB(dst0_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        dst10_r = dst32_r;
+        dst32_r = dst54_r;
+        dst54_r = dst76_r;
+        dst21_r = dst43_r;
+        dst43_r = dst65_r;
+        dst65_r = dst87_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
+    }
+}
+
+static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride,
+                                           const int8_t *filter_x,
+                                           const int8_t *filter_y,
+                                           int32_t height, int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= ((3 * src_stride) + 3);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        /* row 0 row 1 row 2 row 3 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+
+        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_r, dst32_r, dst54_r, dst21_r);
+        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_l, dst32_l, dst54_l, dst21_l);
+        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            LD_SB2(src_tmp, src_stride, src7, src8);
+            XORI_B2_128_SB(src7, src8);
+            src_tmp += 2 * src_stride;
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst7, dst7, dst7, dst7);
+
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst8 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst8, dst8, dst8, dst8);
+
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+            SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+            dst0_r = CLIP_SW_0_255(dst0_r);
+            dst0_l = CLIP_SW_0_255(dst0_l);
+            dst1_r = CLIP_SW_0_255(dst1_r);
+            dst1_l = CLIP_SW_0_255(dst1_l);
+
+            HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            dst10_r = dst32_r;
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+            dst6 = dst8;
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 8);
+}
+
+static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 8);
+
+    hevc_hv_uni_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
+                          filter_x, filter_y, height);
+}
+
+static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 64);
+}
+
+static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
+    v16u8 out;
+    v8i16 filt, res0;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB2(src, src_stride, src0, src1);
+    XORI_B2_128_SB(src0, src1);
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
+    res0 = __msa_srar_h(res0, rnd_vec);
+    res0 = __msa_sat_s_h(res0, 7);
+    out = PCKEV_XORI128_UB(res0, res0);
+    ST4x2_UB(out, dst, dst_stride);
+}
+
+static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1;
+    v16u8 out;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    SRAR_H2_SH(out0, out1, rnd_vec);
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter, uint8_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    if (2 == height) {
+        common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else if (4 == height) {
+        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else if (8 == height) {
+        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else if (16 == height) {
+        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter,
+                              rnd_val);
+    }
+}
+
+static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 out4, out5;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= 1;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+        out4 = PCKEV_XORI128_UB(out0, out1);
+        out5 = PCKEV_XORI128_UB(out2, out3);
+        ST6x4_UB(out4, out5, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height,
+                                     uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, vec0, vec1, vec2, vec3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= 1;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src0, src1);
+        VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
+        VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
+        DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
+        SRAR_H2_SH(vec0, vec1, rnd_vec);
+        SAT_SH2_SH(vec0, vec1, 7);
+        out = PCKEV_XORI128_UB(vec0, vec1);
+        ST8x2_UB(out, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height,
+                                     uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= 1;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    if ((2 == height) || (6 == height)) {
+        common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height, rnd_val);
+    } else {
+        common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height, rnd_val);
+    }
+}
+
+static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v16i8 vec10, vec11;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3, out4, out5;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    mask2 = LD_SB(&mc_filt_mask_arr[32]);
+
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask3 = mask2 + 2;
+
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
+        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
+        DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out2, out3, out4, out5);
+        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
+        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
+                     out2, out3, out4, out5);
+        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SRAR_H2_SH(out4, out5, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH2_SH(out4, out5, 7);
+        tmp0 = PCKEV_XORI128_UB(out2, out3);
+        tmp1 = PCKEV_XORI128_UB(out4, out5);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 out;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= 1;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
+                                   filt1, out4, out5, out6, out7);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SRAR_H4_SH(out4, out5, out6, out7, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint8_t *dst1 = dst + 16;
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
+    v8i16 filt, out0, out1, out2, out3;
+    v16u8 tmp0, tmp1;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= 1;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask00 = mask0 + 8;
+    mask11 = mask0 + 10;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
+                     out0, out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(tmp0, dst);
+        dst += dst_stride;
+        tmp0 = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(tmp0, dst);
+        dst += dst_stride;
+
+        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
+        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
+        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
+                     out0, out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(tmp0, dst);
+        dst += dst_stride;
+        tmp0 = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(tmp0, dst);
+        dst += dst_stride;
+
+        /* 8 width */
+        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
+        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
+
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
+                     out0, out1, out2, out3);
+
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
+        dst1 += (4 * dst_stride);
+    }
+}
+
+static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= 1;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src += src_stride;
+        src4 = LD_SB(src);
+        src6 = LD_SB(src + 16);
+        src7 = LD_SB(src + 24);
+        SLDI_B2_SB(src2, src6, src0, src4, src1, src5, 8);
+        src += src_stride;
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                                   filt0, filt1, out0, out1, out2, out3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                                   filt0, filt1, out4, out5, out6, out7);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SRAR_H4_SH(out4, out5, out6, out7, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
+    v16i8 src2110, src4332, filt0, filt1;
+    v16u8 out;
+    v8i16 filt, out10;
+    v8i16 rnd_vec;
+
+    src -= src_stride;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+    LD_SB2(src, src_stride, src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+    out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
+    out10 = __msa_srar_h(out10, rnd_vec);
+    out10 = __msa_sat_s_h(out10, 7);
+    out = PCKEV_XORI128_UB(out10, out10);
+    ST4x2_UB(out, dst, dst_stride);
+}
+
+static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride,
+                                         const int8_t *filter, int32_t height,
+                                         uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, filt0, filt1;
+    v8i16 filt, out10, out32;
+    v16u8 out;
+    v8i16 rnd_vec;
+
+    src -= src_stride;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB3(src, src_stride, src3, src4, src5);
+        src += (3 * src_stride);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
+
+        src2 = LD_SB(src);
+        src += (src_stride);
+        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
+        SRAR_H2_SH(out10, out32, rnd_vec);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    if (2 == height) {
+        common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else {
+        common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
+                                     height, rnd_val);
+    }
+}
+
+static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1;
+    v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, filt0, filt1;
+    v8i16 rnd_vec;
+
+    src -= src_stride;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
+
+    LD_UB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    vec0 = (v16u8) __msa_xori_b((v16u8) src0, 128);
+    vec1 = (v16u8) __msa_xori_b((v16u8) src1, 128);
+    vec2 = (v16u8) __msa_xori_b((v16u8) src2, 128);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src3, src0, src1, src2);
+        src += (4 * src_stride);
+
+        vec3 = (v16u8) __msa_xori_b((v16u8) src3, 128);
+        ILVR_B2_SH(vec1, vec0, vec3, vec2, vec01, vec23);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec01, vec23, filt0, filt1);
+
+        vec0 = __msa_xori_b((v16u8) src0, 128);
+        ILVR_B2_SH(vec2, vec1, vec0, vec3, vec12, vec30);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec12, vec30, filt0, filt1);
+
+        vec1 = __msa_xori_b((v16u8) src1, 128);
+        vec01 = (v8i16) __msa_ilvr_b((v16i8) vec1, (v16i8) vec0);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec23, vec01, filt0, filt1);
+
+        vec2 = __msa_xori_b((v16u8) src2, 128);
+        vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1);
+        tmp3 = FILT_4TAP_DPADD_S_H(vec30, vec12, filt0, filt1);
+
+        SRAR_H4_SH(tmp0, tmp1, tmp2, tmp3, rnd_vec);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST6x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
+    v16u8 out;
+    v8i16 rnd_vec;
+
+    src -= src_stride;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
+    tmp0 = FILT_4TAP_DPADD_S_H(src01, src23, filt0, filt1);
+    ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
+    tmp1 = FILT_4TAP_DPADD_S_H(src12, src34, filt0, filt1);
+    SRAR_H2_SH(tmp0, tmp1, rnd_vec);
+    SAT_SH2_SH(tmp0, tmp1, 7);
+    out = PCKEV_XORI128_UB(tmp0, tmp1);
+    ST8x2_UB(out, dst, dst_stride);
+}
+
+static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint64_t out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
+    v8i16 filt, filt0, filt1;
+    v8i16 rnd_vec;
+
+    src -= src_stride;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB3(src, src_stride, src3, src4, src5);
+        src += (3 * src_stride);
+
+        XORI_B3_128_SB(src3, src4, src5);
+        ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt0, filt1);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt0, filt1);
+        SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec);
+        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
+        PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
+        XORI_B2_128_SH(tmp0, tmp2);
+
+        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
+        out1 = __msa_copy_u_d((v2i64) tmp0, 1);
+        out2 = __msa_copy_u_d((v2i64) tmp2, 0);
+        SD(out0, dst);
+        dst += dst_stride;
+        SD(out1, dst);
+        dst += dst_stride;
+        SD(out2, dst);
+        dst += dst_stride;
+
+        src2 = src5;
+        vec0 = vec3;
+        vec2 = vec4;
+    }
+}
+
+static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height,
+                                     uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src7, src8, src9, src10;
+    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+    v8i16 rnd_vec;
+
+    src -= src_stride;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
+                   src72_r, src87_r, src98_r, src109_r);
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
+        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src98_r;
+        src21_r = src109_r;
+        src2 = src10;
+    }
+}
+
+static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    if (2 == height) {
+        common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else if (6 == height) {
+        common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else {
+        common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
+                                 filter, height, rnd_val);
+    }
+}
+
+static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16u8 out0, out1;
+    v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, filt, filt0, filt1;
+    v4u32 mask = { 2, 6, 2, 6 };
+    v8i16 rnd_vec;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
+
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    src -= src_stride;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVR_B2_SH(src1, src0, src3, src2, src10, src32);
+        VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
+        VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
+        tmp0 = FILT_4TAP_DPADD_S_H(src10, src32, filt0, filt1);
+        ILVR_B4_SH(src2, src1, src4, src3, src5, src4, src6, src5,
+                   src21, src43, src54, src65);
+        tmp1 = FILT_4TAP_DPADD_S_H(src21, src43, filt0, filt1);
+        tmp2 = FILT_4TAP_DPADD_S_H(src32, src54, filt0, filt1);
+        tmp3 = FILT_4TAP_DPADD_S_H(src43, src65, filt0, filt1);
+        ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211);
+        tmp4 = FILT_4TAP_DPADD_S_H(src87, src109, filt0, filt1);
+        tmp5 = FILT_4TAP_DPADD_S_H(src109, src1211, filt0, filt1);
+        SRAR_H4_SH(tmp0, tmp1, tmp2, tmp3, rnd_vec);
+        SRAR_H2_SH(tmp4, tmp5, rnd_vec);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH2_SH(tmp4, tmp5, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        out0 = PCKEV_XORI128_UB(tmp4, tmp5);
+        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src4;
+        src1 = src5;
+        src2 = src6;
+        vec0 = vec4;
+        vec1 = vec5;
+        src2 = src6;
+    }
+}
+
+static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v8i16 rnd_vec;
+
+    src -= src_stride;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_l, src43_l, src54_l, src65_l);
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
+        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
+        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
+        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+        SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src10_l = src54_l;
+        src21_l = src65_l;
+        src2 = src6;
+    }
+}
+
+static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint64_t out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, filt0, filt1;
+    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+    v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
+    v16u8 out;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
+    v8i16 rnd_vec;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* 16 width */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    /* 8 width */
+    LD_SB3(src + 16, src_stride, src6, src7, src8);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 16 width */
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        /* 8 width */
+        LD_SB2(src + 16, src_stride, src9, src10);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+
+        /* 16 width */
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
+        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
+        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+
+        /* 8 width */
+        out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
+
+        /* 16 + 8 width */
+        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+        SRAR_H2_SH(out0_l, out1_l, rnd_vec);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH2_SH(out0_l, out1_l, 7);
+        out = PCKEV_XORI128_UB(out0_r, out0_l);
+        ST_UB(out, dst);
+        PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
+        XORI_B2_128_SH(out2_r, out3_r);
+        out0 = __msa_copy_u_d((v2i64) out2_r, 0);
+        out1 = __msa_copy_u_d((v2i64) out3_r, 0);
+        SD(out0, dst + 16);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out1_r, out1_l);
+        ST_UB(out, dst);
+        SD(out1, dst + 16);
+        dst += dst_stride;
+
+        /* 16 width */
+        LD_SB2(src, src_stride, src5, src2);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        /* 8 width */
+        LD_SB2(src + 16, src_stride, src11, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+
+        /* 16 width */
+        out0_r = FILT_4TAP_DPADD_S_H(src32_r, src10_r, filt0, filt1);
+        out0_l = FILT_4TAP_DPADD_S_H(src32_l, src10_l, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src43_r, src21_r, filt0, filt1);
+        out1_l = FILT_4TAP_DPADD_S_H(src43_l, src21_l, filt0, filt1);
+
+        /* 8 width */
+        out2_r = FILT_4TAP_DPADD_S_H(src98_r, src76_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src109_r, src87_r, filt0, filt1);
+
+        /* 16 + 8 width */
+        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+        SRAR_H2_SH(out0_l, out1_l, rnd_vec);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH2_SH(out0_l, out1_l, 7);
+        out = PCKEV_XORI128_UB(out0_r, out0_l);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2_r, out2_r);
+        ST8x1_UB(out, dst + 16);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out1_r, out1_l);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out3_r, out3_r);
+        ST8x1_UB(out, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter, int32_t height,
+                                      uint8_t rnd_val, int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *dst_tmp, *src_tmp;
+    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 filt;
+    v16i8 filt0, filt1;
+    v8i16 rnd_vec;
+    v16u8 out;
+
+    src -= src_stride;
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    for (cnt = (width >> 5); cnt--;) {
+        dst_tmp = dst;
+        src_tmp = src;
+
+        /* 16 width */
+        LD_SB3(src_tmp, src_stride, src0, src1, src2);
+        XORI_B3_128_SB(src0, src1, src2);
+
+        ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+        ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+        /* next 16 width */
+        LD_SB3(src_tmp + 16, src_stride, src6, src7, src8);
+        src_tmp += (3 * src_stride);
+
+        XORI_B3_128_SB(src6, src7, src8);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+        for (loop_cnt = (height >> 1); loop_cnt--;) {
+            /* 16 width */
+            LD_SB2(src_tmp, src_stride, src3, src4);
+            XORI_B2_128_SB(src3, src4);
+            ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+            ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+            /* 16 width */
+            out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
+            out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+            out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
+            out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+
+            /* 16 width */
+            SRAR_H4_SH(out0_r, out1_r, out0_l, out1_l, rnd_vec);
+            SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
+            out = PCKEV_XORI128_UB(out0_r, out0_l);
+            ST_UB(out, dst_tmp);
+            out = PCKEV_XORI128_UB(out1_r, out1_l);
+            ST_UB(out, dst_tmp + dst_stride);
+
+            src10_r = src32_r;
+            src21_r = src43_r;
+            src10_l = src32_l;
+            src21_l = src43_l;
+            src2 = src4;
+
+            /* next 16 width */
+            LD_SB2(src_tmp + 16, src_stride, src9, src10);
+            src_tmp += (2 * src_stride);
+            XORI_B2_128_SB(src9, src10);
+            ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+            ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+
+            /* next 16 width */
+            out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
+            out2_l = FILT_4TAP_DPADD_S_H(src76_l, src98_l, filt0, filt1);
+            out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
+            out3_l = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);
+
+            /* next 16 width */
+            SRAR_H4_SH(out2_r, out3_r, out2_l, out3_l, rnd_vec);
+            SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
+            out = PCKEV_XORI128_UB(out2_r, out2_l);
+            ST_UB(out, dst_tmp + 16);
+            out = PCKEV_XORI128_UB(out3_r, out3_l);
+            ST_UB(out, dst_tmp + 16 + dst_stride);
+
+            dst_tmp += 2 * dst_stride;
+
+            src76_r = src98_r;
+            src87_r = src109_r;
+            src76_l = src98_l;
+            src87_l = src109_l;
+            src8 = src10;
+        }
+
+        src += 32;
+        dst += 32;
+    }
+}
+
+static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    common_vt_4t_32w_mult_msa(src, src_stride, dst, dst_stride,
+                              filter, height, rnd_val, 32);
+}
+
+static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst1_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+
+    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
+    dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
+    dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
+    dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
+
+    ST4x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 out0_r, out1_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+    LD_SB4(src, src_stride, src3, src4, src5, src6);
+    XORI_B4_128_SB(src3, src4, src5, src6);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    dst10_r = __msa_ilvr_h(dst5, dst4);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+    dst2_r >>= 6;
+
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+    dst21_r = __msa_ilvr_h(dst2, dst5);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+    dst3_r >>= 6;
+
+    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r);
+    SRARI_H2_SH(out0_r, out1_r, 6);
+    CLIP_SH2_0_255(out0_r, out1_r);
+    out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
+
+    ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride,
+                                           const int8_t *filter_x,
+                                           const int8_t *filter_y,
+                                           int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+
+        /* row 3 */
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        dst32_r = __msa_ilvr_h(dst3, dst2);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_r >>= 6;
+
+        /* row 4 */
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        dst43_r = __msa_ilvr_h(dst4, dst3);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_r >>= 6;
+
+        /* row 5 */
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        dst54_r = __msa_ilvr_h(dst5, dst4);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_r >>= 6;
+
+        /* row 6 */
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+        dst65_r = __msa_ilvr_h(dst6, dst5);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_r >>= 6;
+
+        /* row 7 */
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+        dst76_r = __msa_ilvr_h(dst7, dst6);
+        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst4_r >>= 6;
+
+        /* row 8 */
+        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+        dst8 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+
+        dst87_r = __msa_ilvr_h(dst8, dst7);
+        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst5_r >>= 6;
+
+        /* row 9 */
+        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
+        dst9 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
+
+        dst10_r = __msa_ilvr_h(dst9, dst8);
+        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
+        dst6_r >>= 6;
+
+        /* row 10 */
+        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        dst21_r = __msa_ilvr_h(dst2, dst9);
+        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
+        dst7_r >>= 6;
+
+        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
+                    dst5_r, dst4_r, dst7_r, dst6_r,
+                    out0_r, out1_r, out2_r, out3_r);
+
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+        CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
+
+        PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
+        ST4x8_UB(out0_r, out1_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height);
+    } else if (4 == height) {
+        hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height);
+    } else if (0 == (height % 8)) {
+        hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
+                                       filter_x, filter_y, height);
+    }
+}
+
+static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        /* row 3 */
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        /* row 4 */
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        dst1_l >>= 6;
+
+        /* row 5 */
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+
+        dst2_r >>= 6;
+        dst2_l >>= 6;
+
+        /* row 6 */
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+
+        dst3_r >>= 6;
+        dst3_l >>= 6;
+
+        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                    dst2_l, dst2_r, dst3_l, dst3_r,
+                    out0_r, out1_r, out2_r, out3_r);
+
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+        CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
+
+        PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
+        ST6x4_UB(out0_r, out1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 out0_r, out1_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+
+    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
+    SRARI_H2_SH(out0_r, out1_r, 6);
+    CLIP_SH2_0_255(out0_r, out1_r);
+    out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
+
+    ST8x2_UB(out0_r, dst, dst_stride);
+}
+
+static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);
+
+    XORI_B2_128_SB(src3, src4);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+
+    LD_SB2(src, src_stride, src5, src6);
+    src += (2 * src_stride);
+
+    XORI_B2_128_SB(src5, src6);
+
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    dst2_l >>= 6;
+
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst6 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    dst3_l >>= 6;
+
+    LD_SB2(src, src_stride, src7, src8);
+    src += (2 * src_stride);
+
+    XORI_B2_128_SB(src7, src8);
+
+    /* row 7 */
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    dst7 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+
+    dst4_r >>= 6;
+    dst4_l >>= 6;
+
+    /* row 8 */
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+    dst8 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+    dst5_r >>= 6;
+    dst5_l >>= 6;
+
+    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
+    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+    SRARI_H2_SH(out4_r, out5_r, 6);
+    CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
+    CLIP_SH2_0_255(out4_r, out5_r);
+
+    PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
+    out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r);
+
+    ST8x4_UB(out0_r, out1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(out2_r, dst, dst_stride);
+}
+
+static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       uint8_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height,
+                                       int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB3(src_tmp, src_stride, src0, src1, src2);
+        src_tmp += (3 * src_stride);
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+            src_tmp += (4 * src_stride);
+
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            /* row 3 */
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            dst3 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            /* row 4 */
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+            dst4 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            /* row 5 */
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+            dst5 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+
+            dst2_r >>= 6;
+            dst2_l >>= 6;
+
+            /* row 6 */
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+            dst2 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+
+            dst3_r >>= 6;
+            dst3_l >>= 6;
+
+            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r,
+                        out0_r, out1_r, out2_r, out3_r);
+
+            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+            CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
+
+            PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
+            ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height);
+    } else if (6 == height) {
+        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height);
+    } else if (0 == (height % 4)) {
+        hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 8);
+    }
+}
+
+static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 8);
+
+    hevc_hv_uni_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
+                          filter_x, filter_y, height);
+}
+
+static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 32);
+}
+
+#define UNI_MC_COPY(WIDTH)                                                 \
+void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
+                                                    ptrdiff_t dst_stride,  \
+                                                    uint8_t *src,          \
+                                                    ptrdiff_t src_stride,  \
+                                                    int height,            \
+                                                    intptr_t mx,           \
+                                                    intptr_t my,           \
+                                                    int width)             \
+{                                                                          \
+    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
+}
+
+UNI_MC_COPY(8);
+UNI_MC_COPY(12);
+UNI_MC_COPY(16);
+UNI_MC_COPY(24);
+UNI_MC_COPY(32);
+UNI_MC_COPY(48);
+UNI_MC_COPY(64);
+
+#undef UNI_MC_COPY
+
+#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                           \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,           \
+                                                         ptrdiff_t             \
+                                                         dst_stride,           \
+                                                         uint8_t *src,         \
+                                                         ptrdiff_t             \
+                                                         src_stride,           \
+                                                         int height,           \
+                                                         intptr_t mx,          \
+                                                         intptr_t my,          \
+                                                         int width)            \
+{                                                                              \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
+                                                                               \
+    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
+                                            filter, height, 6);                \
+}
+
+UNI_MC(qpel, h, 4, 8, hz, mx);
+UNI_MC(qpel, h, 8, 8, hz, mx);
+UNI_MC(qpel, h, 12, 8, hz, mx);
+UNI_MC(qpel, h, 16, 8, hz, mx);
+UNI_MC(qpel, h, 24, 8, hz, mx);
+UNI_MC(qpel, h, 32, 8, hz, mx);
+UNI_MC(qpel, h, 48, 8, hz, mx);
+UNI_MC(qpel, h, 64, 8, hz, mx);
+
+UNI_MC(qpel, v, 4, 8, vt, my);
+UNI_MC(qpel, v, 8, 8, vt, my);
+UNI_MC(qpel, v, 12, 8, vt, my);
+UNI_MC(qpel, v, 16, 8, vt, my);
+UNI_MC(qpel, v, 24, 8, vt, my);
+UNI_MC(qpel, v, 32, 8, vt, my);
+UNI_MC(qpel, v, 48, 8, vt, my);
+UNI_MC(qpel, v, 64, 8, vt, my);
+
+UNI_MC(epel, h, 4, 4, hz, mx);
+UNI_MC(epel, h, 6, 4, hz, mx);
+UNI_MC(epel, h, 8, 4, hz, mx);
+UNI_MC(epel, h, 12, 4, hz, mx);
+UNI_MC(epel, h, 16, 4, hz, mx);
+UNI_MC(epel, h, 24, 4, hz, mx);
+UNI_MC(epel, h, 32, 4, hz, mx);
+
+UNI_MC(epel, v, 4, 4, vt, my);
+UNI_MC(epel, v, 6, 4, vt, my);
+UNI_MC(epel, v, 8, 4, vt, my);
+UNI_MC(epel, v, 12, 4, vt, my);
+UNI_MC(epel, v, 16, 4, vt, my);
+UNI_MC(epel, v, 24, 4, vt, my);
+UNI_MC(epel, v, 32, 4, vt, my);
+
+#undef UNI_MC
+
+#define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                           \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,    \
+                                                         ptrdiff_t      \
+                                                         dst_stride,    \
+                                                         uint8_t *src,  \
+                                                         ptrdiff_t      \
+                                                         src_stride,    \
+                                                         int height,    \
+                                                         intptr_t mx,   \
+                                                         intptr_t my,   \
+                                                         int width)     \
+{                                                                       \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];           \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];           \
+                                                                        \
+    hevc_##DIR1##_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,     \
+                                              dst_stride, filter_x,     \
+                                              filter_y, height);        \
+}
+
+UNI_MC_HV(qpel, hv, 4, 8, hv);
+UNI_MC_HV(qpel, hv, 8, 8, hv);
+UNI_MC_HV(qpel, hv, 12, 8, hv);
+UNI_MC_HV(qpel, hv, 16, 8, hv);
+UNI_MC_HV(qpel, hv, 24, 8, hv);
+UNI_MC_HV(qpel, hv, 32, 8, hv);
+UNI_MC_HV(qpel, hv, 48, 8, hv);
+UNI_MC_HV(qpel, hv, 64, 8, hv);
+
+UNI_MC_HV(epel, hv, 4, 4, hv);
+UNI_MC_HV(epel, hv, 6, 4, hv);
+UNI_MC_HV(epel, hv, 8, 4, hv);
+UNI_MC_HV(epel, hv, 12, 4, hv);
+UNI_MC_HV(epel, hv, 16, 4, hv);
+UNI_MC_HV(epel, hv, 24, 4, hv);
+UNI_MC_HV(epel, hv, 32, 4, hv);
+
+#undef UNI_MC_HV
diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c b/libavcodec/mips/hevc_mc_uniw_msa.c
new file mode 100644
index 0000000000..ce10f413ed
--- /dev/null
+++ b/libavcodec/mips/hevc_mc_uniw_msa.c
@@ -0,0 +1,4790 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+
+#define HEVC_HV_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd,       \
+                               out0, out1, out2, out3)                     \
+{                                                                          \
+    MUL4(in0, wgt, in1, wgt, in2, wgt, in3, wgt, out0, out1, out2, out3);  \
+    SRAR_W4_SW(out0, out1, out2, out3, rnd);                               \
+    ADD4(out0, offset, out1, offset, out2, offset, out3, offset,           \
+         out0, out1, out2, out3);                                          \
+    out0 = CLIP_SW_0_255(out0);                                            \
+    out1 = CLIP_SW_0_255(out1);                                            \
+    out2 = CLIP_SW_0_255(out2);                                            \
+    out3 = CLIP_SW_0_255(out3);                                            \
+}
+
+#define HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,              \
+                            out0_r, out1_r, out0_l, out1_l)          \
+{                                                                    \
+    ILVR_H2_SW(in0, in0, in1, in1, out0_r, out1_r);                  \
+    ILVL_H2_SW(in0, in0, in1, in1, out0_l, out1_l);                  \
+    DOTP_SH4_SW(out0_r, out1_r, out0_l, out1_l, wgt, wgt, wgt, wgt,  \
+                out0_r, out1_r, out0_l, out1_l);                     \
+    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                 \
+    ADD4(out0_r, offset, out1_r, offset,                             \
+         out0_l, offset, out1_l, offset,                             \
+         out0_r, out1_r, out0_l, out1_l);                            \
+    out0_r = CLIP_SW_0_255(out0_r);                                  \
+    out1_r = CLIP_SW_0_255(out1_r);                                  \
+    out0_l = CLIP_SW_0_255(out0_l);                                  \
+    out1_l = CLIP_SW_0_255(out1_l);                                  \
+}
+
+#define HEVC_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd,  \
+                            out0_r, out1_r, out2_r, out3_r,        \
+                            out0_l, out1_l, out2_l, out3_l)        \
+{                                                                  \
+    HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,                \
+                        out0_r, out1_r, out0_l, out1_l);           \
+    HEVC_UNIW_RND_CLIP2(in2, in3, wgt, offset, rnd,                \
+                        out2_r, out3_r, out2_l, out3_l);           \
+}
+
+static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
+                                    int32_t src_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight,
+                                    int32_t offset,
+                                    int32_t rnd_val)
+{
+    v16i8 zero = { 0 };
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 dst0;
+        v4i32 dst0_r, dst0_l;
+
+        LD_SB2(src, src_stride, src0, src1);
+        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
+        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
+        dst0 <<= 6;
+
+        ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
+        DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
+        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+        ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
+        dst0_r = CLIP_SW_0_255(dst0_r);
+        dst0_l = CLIP_SW_0_255(dst0_l);
+
+        HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
+        ST4x2_UB(dst0_r, dst, dst_stride);
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 dst0, dst1;
+        v4i32 dst0_r, dst1_r;
+        v4i32 dst0_l, dst1_l;
+
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
+        dst0 <<= 6;
+        dst1 <<= 6;
+
+        HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+    } else if (0 == height % 8) {
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 dst0, dst1, dst2, dst3;
+        v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+        v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
+                       src0, src1, src2, src3);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+
+static void hevc_uniwgt_copy_6w_msa(uint8_t *src,
+                                    int32_t src_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight,
+                                    int32_t offset,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   dst4, dst5, dst6, dst7);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        HEVC_UNIW_RND_CLIP4(dst4, dst5, dst6, dst7,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
+                                    int32_t src_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight,
+                                    int32_t offset,
+                                    int32_t rnd_val)
+{
+    v16i8 zero = { 0 };
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 dst0, dst1;
+        v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+
+        LD_SB2(src, src_stride, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
+
+        dst0 <<= 6;
+        dst1 <<= 6;
+        HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST8x2_UB(dst0_r, dst, dst_stride);
+    } else if (6 == height) {
+        v16i8 src0, src1, src2, src3, src4, src5;
+        v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+        v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+        LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+        ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        dst4 <<= 6;
+        dst5 <<= 6;
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST8x2_UB(dst2_r, dst, dst_stride);
+    } else if (0 == height % 4) {
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3;
+        v8i16 dst0, dst1, dst2, dst3;
+        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void hevc_uniwgt_copy_12w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v16i8 zero = { 0 };
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
+        dst4 <<= 6;
+        dst5 <<= 6;
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_uniwgt_copy_16multx4mult_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val,
+                                              int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v16i8 zero = { 0 };
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (cnt = width >> 4; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src0, src1, src2, src3);
+            src_tmp += (4 * src_stride);
+            ILVR_B2_SH(zero, src0, zero, src1, tmp0, tmp1);
+            ILVL_B2_SH(zero, src0, zero, src1, tmp2, tmp3);
+
+            SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
+            HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            ILVR_B2_SH(zero, src2, zero, src3, tmp0, tmp1);
+            ILVL_B2_SH(zero, src2, zero, src3, tmp2, tmp3);
+
+            SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
+            HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_uniwgt_copy_16w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      height, weight, offset, rnd_val, 16);
+}
+
+static void hevc_uniwgt_copy_24w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      height, weight, offset, rnd_val, 16);
+
+    hevc_uniwgt_copy_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
+                            height, weight, offset, rnd_val);
+}
+
+static void hevc_uniwgt_copy_32w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      height, weight, offset, rnd_val, 32);
+}
+
+static void hevc_uniwgt_copy_48w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      height, weight, offset, rnd_val, 48);
+}
+
+static void hevc_uniwgt_copy_64w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      height, weight, offset, rnd_val, 64);
+}
+
+static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+
+    src -= 3;
+    weight = weight & 0x0000FFFF;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    weight = weight & 0x0000FFFF;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hz_uniwgt_8t_8w_msa(src, src_stride, dst, dst_stride,
+                             filter, height, weight, offset, rnd_val);
+    hevc_hz_uniwgt_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
+                             filter, height, weight, offset, rnd_val);
+}
+
+static void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 8, src_stride, src1, src3);
+        src += (2 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src += src_stride;
+        LD_SB2(src, 16, src2, src3);
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst3_l, dst3_r, dst4_l, dst4_r, dst0_r, dst1_r);
+        HEVC_PCK_SW_SB4(dst2_l, dst2_r, dst5_l, dst5_r, dst2_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst2_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src2 = LD_SB(src + 24);
+        src += src_stride;
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB3(src, 16, src0, src1, src2);
+        src3 = LD_SB(src + 40);
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST_SW2(dst0_r, dst1_r, dst, 16);
+        ST_SW(dst2_r, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (cnt = 2; cnt--;) {
+            LD_SB2(src_tmp, 16, src0, src1);
+            src2 = LD_SB(src_tmp + 24);
+            src_tmp += 32;
+            XORI_B3_128_SB(src0, src1, src2);
+
+            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst0 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst0, dst0, dst0, dst0);
+            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                       vec0, vec1, vec2, vec3);
+            dst1 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst1, dst1, dst1, dst1);
+            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst2 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst2, dst2, dst2, dst2);
+            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst3 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst3, dst3, dst3, dst3);
+
+            HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST_SW2(dst0_r, dst1_r, dst_tmp, 16);
+            dst_tmp += 32;
+        }
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src9, src10, src11, src12, src13, src14;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v16i8 src12111110, src14131312;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    ILVR_D3_SB(src21_r, src10_r, src43_r,
+               src32_r, src65_r, src54_r, src2110, src4332, src6554);
+
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src7, src8, src9, src10, src11, src12, src13, src14);
+        src += (8 * src_stride);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+                   src1110_r, src1211_r, src1312_r, src1413_r);
+        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
+                   src1413_r, src1312_r,
+                   src8776, src10998, src12111110, src14131312);
+        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
+
+        dst10 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1,
+                     filt2, filt3, dst10, dst10, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
+                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
+        dst76 = const_vec;
+        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
+                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
+
+        HEVC_UNIW_RND_CLIP4(dst10, dst32, dst54, dst76,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        src2110 = src10998;
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;
+    }
+}
+
+static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+
+        tmp0 = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
+
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_l, src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
+               src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_l, src87_l, src98_l, src109_l);
+        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
+
+        tmp0 = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
+        tmp4 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3, tmp4, tmp4, tmp4, tmp4);
+        tmp5 = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3, tmp5, tmp5, tmp5, tmp5);
+
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_uniwgt_8t_16multx2mult_msa(uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride,
+                                               const int8_t *filter,
+                                               int32_t height,
+                                               int32_t weight,
+                                               int32_t offset,
+                                               int32_t rnd_val,
+                                               int32_t width)
+{
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    int32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v16i8 src10_l, src32_l, src54_l, src76_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_r, src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_l, src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 1); loop_cnt--;) {
+            LD_SB2(src_tmp, src_stride, src7, src8);
+            src_tmp += (2 * src_stride);
+            XORI_B2_128_SB(src7, src8);
+            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+            tmp0 = const_vec;
+            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                         filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
+            tmp1 = const_vec;
+            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                         filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
+            tmp2 = const_vec;
+            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
+                         filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+            tmp3 = const_vec;
+            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
+                         filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
+
+            HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            src10_r = src32_r;
+            src32_r = src54_r;
+            src54_r = src76_r;
+            src21_r = src43_r;
+            src43_r = src65_r;
+            src65_r = src87_r;
+            src10_l = src32_l;
+            src32_l = src54_l;
+            src54_l = src76_l;
+            src21_l = src43_l;
+            src43_l = src65_l;
+            src65_l = src87_l;
+            src6 = src8;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight,
+                                       offset, rnd_val, 16);
+}
+
+static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight,
+                                       offset, rnd_val, 16);
+
+    hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
+                             filter, height, weight, offset, rnd_val);
+}
+
+static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight,
+                                       offset, rnd_val, 32);
+}
+
+static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight,
+                                       offset, rnd_val, 48);
+}
+
+static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight,
+                                       offset, rnd_val, 64);
+}
+
+static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
+    v4i32 dst0_r, dst1_r, weight_vec, offset_vec, rnd_vec;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+    src -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    /* row 0 row 1 row 2 row 3 */
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+    dst30 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                 dst30, dst30, dst30, dst30);
+    dst41 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                 dst41, dst41, dst41, dst41);
+    dst52 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                 dst52, dst52, dst52, dst52);
+    dst63 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                 dst63, dst63, dst63, dst63);
+
+    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
+               dst10_r, dst21_r, dst32_r);
+
+    dst43_r = __msa_ilvl_h(dst41, dst30);
+    dst54_r = __msa_ilvl_h(dst52, dst41);
+    dst65_r = __msa_ilvl_h(dst63, dst52);
+
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        LD_SB2(src, src_stride, src7, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src7, src8);
+
+        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst87 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst87, dst87, dst87, dst87);
+        dst76_r = __msa_ilvr_h(dst87, dst66);
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+
+        dst0_r >>= 6;
+        dst1_r >>= 6;
+        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
+        SRAR_W2_SW(dst0_r, dst1_r, rnd_vec);
+        ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
+        dst0_r = CLIP_SW_0_255(dst0_r);
+        dst1_r = CLIP_SW_0_255(dst1_r);
+
+        HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
+        ST4x2_UB(dst0_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        dst10_r = dst32_r;
+        dst32_r = dst54_r;
+        dst54_r = dst76_r;
+        dst21_r = dst43_r;
+        dst43_r = dst65_r;
+        dst65_r = dst87_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
+    }
+}
+
+static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter_x,
+                                              const int8_t *filter_y,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val,
+                                              int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= ((3 * src_stride) + 3);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+
+        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_r, dst32_r, dst54_r, dst21_r);
+        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_l, dst32_l, dst54_l, dst21_l);
+        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            LD_SB2(src_tmp, src_stride, src7, src8);
+            src_tmp += 2 * src_stride;
+            XORI_B2_128_SB(src7, src8);
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst7, dst7, dst7, dst7);
+
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            /* row 8 */
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst8 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst8, dst8, dst8, dst8);
+
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
+                                   weight_vec, offset_vec, rnd_vec,
+                                   dst0_r, dst1_r, dst0_l, dst1_l);
+
+            HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            dst10_r = dst32_r;
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+            dst6 = dst8;
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 8);
+}
+
+static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 8);
+    hevc_hv_uniwgt_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
+                             filter_x, filter_y, height, weight, offset,
+                             rnd_val);
+}
+
+static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 16);
+}
+
+static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 24);
+}
+
+static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 32);
+}
+
+static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 48);
+}
+
+static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 64);
+}
+
+static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, vec0, vec1;
+    v16i8 mask1;
+    v8i16 dst0;
+    v4i32 dst0_r, dst0_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    weight = weight & 0x0000FFFF;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB2(src, src_stride, src0, src1);
+    XORI_B2_128_SB(src0, src1);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+    ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
+    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
+    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+    ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
+    dst0_r = CLIP_SW_0_255(dst0_r);
+    dst0_l = CLIP_SW_0_255(dst0_l);
+
+    HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
+    ST4x2_UB(dst0_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+
+    src -= 1;
+
+    /* rearranging filter */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    weight = weight & 0x0000FFFF;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+    HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
+                        dst0_r, dst1_r, dst0_l, dst1_l);
+
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+    } else if (4 == height) {
+        hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+    } else if (8 == height || 16 == height) {
+        hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
+                                          filter, height, weight,
+                                          offset, rnd_val);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v8i16 filt0, filt1, dst0, dst1;
+    v16i8 src0, src1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    mask1 = mask0 + 2;
+
+    LD_SB2(src, src_stride, src0, src1);
+    XORI_B2_128_SB(src0, src1);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+    HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
+                        dst0_r, dst1_r, dst0_l, dst1_l);
+
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    mask1 = mask0 + 2;
+
+    LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
+    LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
+    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                        weight_vec, offset_vec, rnd_vec,
+                        dst0_r, dst1_r, dst2_r, dst3_r,
+                        dst0_l, dst1_l, dst2_l, dst3_l);
+
+    HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                        dst4_r, dst5_r, dst4_l, dst5_l);
+
+    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                     dst2_l, dst2_r, dst3_l, dst3_r,
+                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
+
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_hz_uniwgt_4t_8x4multiple_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+    } else if (6 == height) {
+        hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+    } else {
+        hevc_hz_uniwgt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
+                                          filter, height, weight, offset,
+                                          rnd_val);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+    };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    mask1 = mask0 + 2;
+    mask3 = mask2 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        HEVC_UNIW_RND_CLIP4(dst4, dst5, dst6, dst7,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint8_t *dst_tmp = dst + 16;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        /* 16 width */
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 16, src_stride, src1, src3);
+        src += (2 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* 8 width */
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST8x2_UB(dst0_r, dst_tmp, dst_stride);
+        dst_tmp += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    weight = weight & 0x0000FFFF;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src2 = LD_SB(src + 24);
+        src += src_stride;
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, 16);
+        dst += dst_stride;
+
+        LD_SB2(src, 16, src0, src1);
+        src2 = LD_SB(src + 24);
+        src += src_stride;
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src2110, src4332;
+    v8i16 dst10;
+    v4i32 dst0_r, dst0_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+    LD_SB2(src, src_stride, src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+
+    ILVRL_H2_SW(dst10, dst10, dst0_r, dst0_l);
+    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
+    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+    ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
+    dst0_r = CLIP_SW_0_255(dst0_r);
+    dst0_l = CLIP_SW_0_255(dst0_l);
+
+    HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
+    ST4x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst10, dst32;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    LD_SB4(src, src_stride, src3, src4, src5, src6);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
+    XORI_B2_128_SB(src4332, src6554);
+
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    dst32 = const_vec;
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+    HEVC_UNIW_RND_CLIP2(dst10, dst32, weight_vec, offset_vec, rnd_vec,
+                        dst0_r, dst1_r, dst0_l, dst1_l);
+
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 dst10, dst32, dst54, dst76;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
+        src += (6 * src_stride);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
+                   src4332, src6554, src8776);
+        XORI_B3_128_SB(src4332, src6554, src8776);
+
+        dst10 = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
+
+        LD_SB2(src, src_stride, src9, src2);
+        src += (2 * src_stride);
+        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+        dst76 = const_vec;
+        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
+        HEVC_UNIW_RND_CLIP4(dst10, dst32, dst54, dst76,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+    } else if (4 == height) {
+        hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+    } else if (0 == (height % 8)) {
+        hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
+                                          filter, height, weight, offset,
+                                          rnd_val);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+
+        LD_SB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src1, src2);
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+    tmp0 = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+    tmp1 = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+    HEVC_UNIW_RND_CLIP2(tmp0, tmp1, weight_vec, offset_vec, rnd_vec,
+                        dst0_r, dst1_r, dst0_l, dst1_l);
+
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
+    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    tmp0 = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+    tmp1 = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+    tmp2 = const_vec;
+    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2);
+    tmp3 = const_vec;
+    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, tmp3, tmp3);
+    tmp4 = const_vec;
+    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4);
+    tmp5 = const_vec;
+    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5);
+    HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                        weight_vec, offset_vec, rnd_vec,
+                        dst0_r, dst1_r, dst2_r, dst3_r,
+                        dst0_l, dst1_l, dst2_l, dst3_l);
+    HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
+                        dst4_r, dst5_r, dst4_l, dst5_l);
+
+    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                     dst2_l, dst2_r, dst3_l, dst3_r,
+                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_vt_uniwgt_4t_8x4multiple_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+
+        LD_SB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src1, src2);
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+    } else if (6 == height) {
+        hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+    } else {
+        hevc_vt_uniwgt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
+                                          filter, height, weight, offset,
+                                          rnd_val);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+    src -= (1 * src_stride);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4);
+
+        LD_SB2(src, src_stride, src5, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
+
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5);
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3);
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        LD_SB2(src, src_stride, src5, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3);
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    LD_SB3(src + 16, src_stride, src6, src7, src8);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        LD_SB2(src + 16, src_stride, src9, src10);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
+
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(tmp2, tmp3, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst4_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+
+        LD_SB2(src, src_stride, src5, src2);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+        LD_SB2(src + 16, src_stride, src11, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3);
+
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(tmp2, tmp3, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst4_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    int32_t loop_cnt;
+    uint8_t *dst_tmp = dst + 16;
+    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    LD_SB3(src + 16, src_stride, src6, src7, src8);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
+
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        src10_r = src32_r;
+        src21_r = src43_r;
+        src10_l = src32_l;
+        src21_l = src43_l;
+        src2 = src4;
+
+        LD_SB2(src + 16, src_stride, src9, src10);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
+        tmp6 = const_vec;
+        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
+        tmp7 = const_vec;
+        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7);
+
+        HEVC_UNIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst6_r, dst7_r,
+                            dst4_l, dst5_l, dst6_l, dst7_l);
+
+        HEVC_PCK_SW_SB8(dst4_l, dst4_r, dst6_l, dst6_r,
+                        dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r);
+        ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride);
+        dst_tmp += (2 * dst_stride);
+
+        src76_r = src98_r;
+        src87_r = src109_r;
+        src76_l = src98_l;
+        src87_l = src109_l;
+        src8 = src10;
+    }
+}
+
+static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v4i32 dst0_r, dst1_r;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+
+    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
+    SRAR_W2_SW(dst0_r, dst1_r, rnd_vec);
+    ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
+    dst0_r = CLIP_SW_0_255(dst0_r);
+    dst1_r = CLIP_SW_0_255(dst1_r);
+
+    HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
+    ST4x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    LD_SB4(src, src_stride, src3, src4, src5, src6);
+    XORI_B4_128_SB(src3, src4, src5, src6);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+    dst10_r = __msa_ilvr_h(dst5, dst4);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+    dst2_r >>= 6;
+
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+    dst21_r = __msa_ilvr_h(dst2, dst5);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+    dst3_r >>= 6;
+
+    HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst2_r, dst3_r,
+                           weight_vec, offset_vec, rnd_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r);
+    HEVC_PCK_SW_SB4(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r);
+    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter_x,
+                                              const int8_t *filter_y,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        dst32_r = __msa_ilvr_h(dst3, dst2);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_r >>= 6;
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        dst43_r = __msa_ilvr_h(dst4, dst3);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_r >>= 6;
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        dst54_r = __msa_ilvr_h(dst5, dst4);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_r >>= 6;
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+        dst65_r = __msa_ilvr_h(dst6, dst5);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_r >>= 6;
+
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+        dst76_r = __msa_ilvr_h(dst7, dst6);
+        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst4_r >>= 6;
+
+        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+        dst8 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+        dst87_r = __msa_ilvr_h(dst8, dst7);
+        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst5_r >>= 6;
+
+        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
+        dst9 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
+        dst10_r = __msa_ilvr_h(dst9, dst8);
+        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
+        dst6_r >>= 6;
+
+        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        dst21_r = __msa_ilvr_h(dst2, dst9);
+        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
+        dst7_r >>= 6;
+
+        HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst2_r, dst3_r,
+                               weight_vec, offset_vec, rnd_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r);
+        HEVC_PCK_SW_SB4(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r);
+        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        HEVC_HV_UNIW_RND_CLIP4(dst4_r, dst5_r, dst6_r, dst7_r,
+                               weight_vec, offset_vec, rnd_vec,
+                               dst4_r, dst5_r, dst6_r, dst7_r);
+        HEVC_PCK_SW_SB4(dst5_r, dst4_r, dst7_r, dst6_r, dst0_r);
+        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    if (2 == height) {
+        hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_y, height, weight,
+                                  offset, rnd_val);
+    } else if (4 == height) {
+        hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_y, height, weight,
+                                  offset, rnd_val);
+    } else if (0 == (height % 8)) {
+        hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
+                                          filter_x, filter_y, height, weight,
+                                          offset, rnd_val);
+    }
+}
+
+static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        /* row 3 */
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        /* row 4 */
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        dst1_l >>= 6;
+
+        /* row 5 */
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+        dst2_r >>= 6;
+        dst2_l >>= 6;
+
+        /* row 6 */
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+        dst3_r >>= 6;
+        dst3_l >>= 6;
+
+        HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
+                               weight_vec, offset_vec, rnd_vec,
+                               dst0_r, dst1_r, dst0_l, dst1_l);
+        HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
+                               weight_vec, offset_vec, rnd_vec,
+                               dst2_r, dst3_r, dst2_l, dst3_l);
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+
+    HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
+                           weight_vec, offset_vec, rnd_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST8x2_UB(dst0_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+}
+
+static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+
+    LD_SB2(src, src_stride, src5, src6);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src5, src6);
+
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    dst2_l >>= 6;
+
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst6 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    dst3_l >>= 6;
+
+    LD_SB2(src, src_stride, src7, src8);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src7, src8);
+
+    /* row 7 */
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    dst7 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+
+    dst4_r >>= 6;
+    dst4_l >>= 6;
+
+    /* row 8 */
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+    dst8 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+    dst5_r >>= 6;
+    dst5_l >>= 6;
+
+    HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
+                           weight_vec, offset_vec, rnd_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+    HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
+                           weight_vec, offset_vec, rnd_vec,
+                           dst2_r, dst3_r, dst2_l, dst3_l);
+    HEVC_HV_UNIW_RND_CLIP4(dst4_r, dst5_r, dst4_l, dst5_l,
+                           weight_vec, offset_vec, rnd_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                     dst2_l, dst2_r, dst3_l, dst3_r,
+                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter_x,
+                                              const int8_t *filter_y,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val,
+                                              int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB3(src_tmp, src_stride, src0, src1, src2);
+        src_tmp += (3 * src_stride);
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+            src_tmp += (4 * src_stride);
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            dst3 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+            dst4 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+            dst5 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+            dst2_r >>= 6;
+            dst2_l >>= 6;
+
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+            dst2 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+            dst3_r >>= 6;
+            dst3_l >>= 6;
+
+            HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
+                                   weight_vec, offset_vec, rnd_vec,
+                                   dst0_r, dst1_r, dst0_l, dst1_l);
+            HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
+                                   weight_vec, offset_vec, rnd_vec,
+                                   dst2_r, dst3_r, dst2_l, dst3_l);
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+
+    if (2 == height) {
+        hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_y, height, weight,
+                                  offset, rnd_val);
+    } else if (6 == height) {
+        hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_y, height, weight,
+                                  offset, rnd_val);
+    } else if (0 == (height % 4)) {
+        hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                          filter_x, filter_y, height, weight,
+                                          offset, rnd_val, 8);
+    }
+}
+
+static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 8);
+    hevc_hv_uniwgt_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
+                             filter_x, filter_y, height, weight,
+                             offset, rnd_val);
+}
+
+static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 16);
+}
+
+static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 24);
+}
+
+static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 32);
+}
+
+#define UNIWGT_MC_COPY(WIDTH)                                                \
+void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
+                                                      ptrdiff_t dst_stride,  \
+                                                      uint8_t *src,          \
+                                                      ptrdiff_t src_stride,  \
+                                                      int height,            \
+                                                      int denom,             \
+                                                      int weight,            \
+                                                      int offset,            \
+                                                      intptr_t mx,           \
+                                                      intptr_t my,           \
+                                                      int width)             \
+{                                                                            \
+    int shift = denom + 14 - 8;                                              \
+    hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride,        \
+                                    height, weight, offset, shift);          \
+}
+
+UNIWGT_MC_COPY(4);
+UNIWGT_MC_COPY(6);
+UNIWGT_MC_COPY(8);
+UNIWGT_MC_COPY(12);
+UNIWGT_MC_COPY(16);
+UNIWGT_MC_COPY(24);
+UNIWGT_MC_COPY(32);
+UNIWGT_MC_COPY(48);
+UNIWGT_MC_COPY(64);
+
+#undef UNIWGT_MC_COPY
+
+#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                        \
+void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,        \
+                                                           ptrdiff_t          \
+                                                           dst_stride,        \
+                                                           uint8_t *src,      \
+                                                           ptrdiff_t          \
+                                                           src_stride,        \
+                                                           int height,        \
+                                                           int denom,         \
+                                                           int weight,        \
+                                                           int offset,        \
+                                                           intptr_t mx,       \
+                                                           intptr_t my,       \
+                                                           int width)         \
+{                                                                             \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
+    int shift = denom + 14 - 8;                                               \
+                                                                              \
+    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,        \
+                                                 dst_stride, filter, height,  \
+                                                 weight, offset, shift);      \
+}
+
+UNI_W_MC(qpel, h, 4, 8, hz, mx);
+UNI_W_MC(qpel, h, 8, 8, hz, mx);
+UNI_W_MC(qpel, h, 12, 8, hz, mx);
+UNI_W_MC(qpel, h, 16, 8, hz, mx);
+UNI_W_MC(qpel, h, 24, 8, hz, mx);
+UNI_W_MC(qpel, h, 32, 8, hz, mx);
+UNI_W_MC(qpel, h, 48, 8, hz, mx);
+UNI_W_MC(qpel, h, 64, 8, hz, mx);
+
+UNI_W_MC(qpel, v, 4, 8, vt, my);
+UNI_W_MC(qpel, v, 8, 8, vt, my);
+UNI_W_MC(qpel, v, 12, 8, vt, my);
+UNI_W_MC(qpel, v, 16, 8, vt, my);
+UNI_W_MC(qpel, v, 24, 8, vt, my);
+UNI_W_MC(qpel, v, 32, 8, vt, my);
+UNI_W_MC(qpel, v, 48, 8, vt, my);
+UNI_W_MC(qpel, v, 64, 8, vt, my);
+
+UNI_W_MC(epel, h, 4, 4, hz, mx);
+UNI_W_MC(epel, h, 6, 4, hz, mx);
+UNI_W_MC(epel, h, 8, 4, hz, mx);
+UNI_W_MC(epel, h, 12, 4, hz, mx);
+UNI_W_MC(epel, h, 16, 4, hz, mx);
+UNI_W_MC(epel, h, 24, 4, hz, mx);
+UNI_W_MC(epel, h, 32, 4, hz, mx);
+
+UNI_W_MC(epel, v, 4, 4, vt, my);
+UNI_W_MC(epel, v, 6, 4, vt, my);
+UNI_W_MC(epel, v, 8, 4, vt, my);
+UNI_W_MC(epel, v, 12, 4, vt, my);
+UNI_W_MC(epel, v, 16, 4, vt, my);
+UNI_W_MC(epel, v, 24, 4, vt, my);
+UNI_W_MC(epel, v, 32, 4, vt, my);
+
+#undef UNI_W_MC
+
+#define UNI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                              \
+void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,       \
+                                                           ptrdiff_t         \
+                                                           dst_stride,       \
+                                                           uint8_t *src,     \
+                                                           ptrdiff_t         \
+                                                           src_stride,       \
+                                                           int height,       \
+                                                           int denom,        \
+                                                           int weight,       \
+                                                           int offset,       \
+                                                           intptr_t mx,      \
+                                                           intptr_t my,      \
+                                                           int width)        \
+{                                                                            \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                \
+    int shift = denom + 14 - 8;                                              \
+                                                                             \
+    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,       \
+                                                 dst_stride, filter_x,       \
+                                                 filter_y,  height, weight,  \
+                                                 offset, shift);             \
+}
+
+UNI_W_MC_HV(qpel, hv, 4, 8, hv);
+UNI_W_MC_HV(qpel, hv, 8, 8, hv);
+UNI_W_MC_HV(qpel, hv, 12, 8, hv);
+UNI_W_MC_HV(qpel, hv, 16, 8, hv);
+UNI_W_MC_HV(qpel, hv, 24, 8, hv);
+UNI_W_MC_HV(qpel, hv, 32, 8, hv);
+UNI_W_MC_HV(qpel, hv, 48, 8, hv);
+UNI_W_MC_HV(qpel, hv, 64, 8, hv);
+
+UNI_W_MC_HV(epel, hv, 4, 4, hv);
+UNI_W_MC_HV(epel, hv, 6, 4, hv);
+UNI_W_MC_HV(epel, hv, 8, 4, hv);
+UNI_W_MC_HV(epel, hv, 12, 4, hv);
+UNI_W_MC_HV(epel, hv, 16, 4, hv);
+UNI_W_MC_HV(epel, hv, 24, 4, hv);
+UNI_W_MC_HV(epel, hv, 32, 4, hv);
+
+#undef UNI_W_MC_HV
diff --git a/libavcodec/mips/hevcdsp_init_mips.c b/libavcodec/mips/hevcdsp_init_mips.c
new file mode 100644
index 0000000000..3675b93155
--- /dev/null
+++ b/libavcodec/mips/hevcdsp_init_mips.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/mips/hevcdsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
+                                      const int bit_depth)
+{
+    if (8 == bit_depth) {
+        c->put_hevc_qpel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_msa;
+        c->put_hevc_qpel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_msa;
+        c->put_hevc_qpel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_msa;
+        c->put_hevc_qpel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_msa;
+        c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_msa;
+        c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_msa;
+        c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_msa;
+        c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_msa;
+        c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_msa;
+
+        c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_msa;
+        c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_msa;
+        c->put_hevc_qpel[4][0][1] = ff_hevc_put_hevc_qpel_h12_8_msa;
+        c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_msa;
+        c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_8_msa;
+        c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_msa;
+        c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_msa;
+        c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_msa;
+
+        c->put_hevc_qpel[1][1][0] = ff_hevc_put_hevc_qpel_v4_8_msa;
+        c->put_hevc_qpel[3][1][0] = ff_hevc_put_hevc_qpel_v8_8_msa;
+        c->put_hevc_qpel[4][1][0] = ff_hevc_put_hevc_qpel_v12_8_msa;
+        c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_8_msa;
+        c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_8_msa;
+        c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_msa;
+        c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_msa;
+        c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_msa;
+
+        c->put_hevc_qpel[1][1][1] = ff_hevc_put_hevc_qpel_hv4_8_msa;
+        c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_msa;
+        c->put_hevc_qpel[4][1][1] = ff_hevc_put_hevc_qpel_hv12_8_msa;
+        c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_8_msa;
+        c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_8_msa;
+        c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_8_msa;
+        c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_8_msa;
+        c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_8_msa;
+
+        c->put_hevc_epel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_msa;
+        c->put_hevc_epel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_msa;
+        c->put_hevc_epel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_msa;
+        c->put_hevc_epel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_msa;
+        c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_msa;
+        c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_msa;
+        c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_msa;
+
+        c->put_hevc_epel[1][0][1] = ff_hevc_put_hevc_epel_h4_8_msa;
+        c->put_hevc_epel[2][0][1] = ff_hevc_put_hevc_epel_h6_8_msa;
+        c->put_hevc_epel[3][0][1] = ff_hevc_put_hevc_epel_h8_8_msa;
+        c->put_hevc_epel[4][0][1] = ff_hevc_put_hevc_epel_h12_8_msa;
+        c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_8_msa;
+        c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_8_msa;
+        c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_msa;
+
+        c->put_hevc_epel[1][1][0] = ff_hevc_put_hevc_epel_v4_8_msa;
+        c->put_hevc_epel[2][1][0] = ff_hevc_put_hevc_epel_v6_8_msa;
+        c->put_hevc_epel[3][1][0] = ff_hevc_put_hevc_epel_v8_8_msa;
+        c->put_hevc_epel[4][1][0] = ff_hevc_put_hevc_epel_v12_8_msa;
+        c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_8_msa;
+        c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_8_msa;
+        c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_msa;
+
+        c->put_hevc_epel[1][1][1] = ff_hevc_put_hevc_epel_hv4_8_msa;
+        c->put_hevc_epel[2][1][1] = ff_hevc_put_hevc_epel_hv6_8_msa;
+        c->put_hevc_epel[3][1][1] = ff_hevc_put_hevc_epel_hv8_8_msa;
+        c->put_hevc_epel[4][1][1] = ff_hevc_put_hevc_epel_hv12_8_msa;
+        c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_msa;
+        c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_msa;
+        c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_msa;
+
+        c->put_hevc_qpel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
+        c->put_hevc_qpel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
+        c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
+        c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
+        c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;
+        c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_msa;
+        c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_msa;
+
+        c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_uni_qpel_h4_8_msa;
+        c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_uni_qpel_h8_8_msa;
+        c->put_hevc_qpel_uni[4][0][1] = ff_hevc_put_hevc_uni_qpel_h12_8_msa;
+        c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_8_msa;
+        c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_8_msa;
+        c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_msa;
+        c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_msa;
+        c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_msa;
+
+        c->put_hevc_qpel_uni[1][1][0] = ff_hevc_put_hevc_uni_qpel_v4_8_msa;
+        c->put_hevc_qpel_uni[3][1][0] = ff_hevc_put_hevc_uni_qpel_v8_8_msa;
+        c->put_hevc_qpel_uni[4][1][0] = ff_hevc_put_hevc_uni_qpel_v12_8_msa;
+        c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_8_msa;
+        c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_8_msa;
+        c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_msa;
+        c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_msa;
+        c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_msa;
+
+        c->put_hevc_qpel_uni[1][1][1] = ff_hevc_put_hevc_uni_qpel_hv4_8_msa;
+        c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_msa;
+        c->put_hevc_qpel_uni[4][1][1] = ff_hevc_put_hevc_uni_qpel_hv12_8_msa;
+        c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_msa;
+        c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_msa;
+        c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_msa;
+        c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
+        c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;
+
+        c->put_hevc_epel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
+        c->put_hevc_epel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
+        c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
+        c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
+        c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;
+
+        c->put_hevc_epel_uni[1][0][1] = ff_hevc_put_hevc_uni_epel_h4_8_msa;
+        c->put_hevc_epel_uni[2][0][1] = ff_hevc_put_hevc_uni_epel_h6_8_msa;
+        c->put_hevc_epel_uni[3][0][1] = ff_hevc_put_hevc_uni_epel_h8_8_msa;
+        c->put_hevc_epel_uni[4][0][1] = ff_hevc_put_hevc_uni_epel_h12_8_msa;
+        c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_8_msa;
+        c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_8_msa;
+        c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_msa;
+
+        c->put_hevc_epel_uni[1][1][0] = ff_hevc_put_hevc_uni_epel_v4_8_msa;
+        c->put_hevc_epel_uni[2][1][0] = ff_hevc_put_hevc_uni_epel_v6_8_msa;
+        c->put_hevc_epel_uni[3][1][0] = ff_hevc_put_hevc_uni_epel_v8_8_msa;
+        c->put_hevc_epel_uni[4][1][0] = ff_hevc_put_hevc_uni_epel_v12_8_msa;
+        c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_8_msa;
+        c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_msa;
+        c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_msa;
+
+        c->put_hevc_epel_uni[1][1][1] = ff_hevc_put_hevc_uni_epel_hv4_8_msa;
+        c->put_hevc_epel_uni[2][1][1] = ff_hevc_put_hevc_uni_epel_hv6_8_msa;
+        c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_msa;
+        c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_msa;
+        c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_msa;
+        c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_msa;
+        c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_msa;
+
+        c->put_hevc_qpel_uni_w[1][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
+        c->put_hevc_qpel_uni_w[3][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
+        c->put_hevc_qpel_uni_w[4][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
+        c->put_hevc_qpel_uni_w[5][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
+        c->put_hevc_qpel_uni_w[6][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
+        c->put_hevc_qpel_uni_w[7][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
+        c->put_hevc_qpel_uni_w[8][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels48_8_msa;
+        c->put_hevc_qpel_uni_w[9][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels64_8_msa;
+
+        c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_qpel_h4_8_msa;
+        c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_qpel_h8_8_msa;
+        c->put_hevc_qpel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_qpel_h12_8_msa;
+        c->put_hevc_qpel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_qpel_h16_8_msa;
+        c->put_hevc_qpel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_qpel_h24_8_msa;
+        c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_qpel_h32_8_msa;
+        c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_uni_w_qpel_h48_8_msa;
+        c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_uni_w_qpel_h64_8_msa;
+
+        c->put_hevc_qpel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_qpel_v4_8_msa;
+        c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_qpel_v8_8_msa;
+        c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_qpel_v12_8_msa;
+        c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_qpel_v16_8_msa;
+        c->put_hevc_qpel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_qpel_v24_8_msa;
+        c->put_hevc_qpel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_qpel_v32_8_msa;
+        c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_uni_w_qpel_v48_8_msa;
+        c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_uni_w_qpel_v64_8_msa;
+
+        c->put_hevc_qpel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv4_8_msa;
+        c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_msa;
+        c->put_hevc_qpel_uni_w[4][1][1] =
+            ff_hevc_put_hevc_uni_w_qpel_hv12_8_msa;
+        c->put_hevc_qpel_uni_w[5][1][1] =
+            ff_hevc_put_hevc_uni_w_qpel_hv16_8_msa;
+        c->put_hevc_qpel_uni_w[6][1][1] =
+            ff_hevc_put_hevc_uni_w_qpel_hv24_8_msa;
+        c->put_hevc_qpel_uni_w[7][1][1] =
+            ff_hevc_put_hevc_uni_w_qpel_hv32_8_msa;
+        c->put_hevc_qpel_uni_w[8][1][1] =
+            ff_hevc_put_hevc_uni_w_qpel_hv48_8_msa;
+        c->put_hevc_qpel_uni_w[9][1][1] =
+            ff_hevc_put_hevc_uni_w_qpel_hv64_8_msa;
+
+        c->put_hevc_epel_uni_w[1][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
+        c->put_hevc_epel_uni_w[2][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels6_8_msa;
+        c->put_hevc_epel_uni_w[3][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
+        c->put_hevc_epel_uni_w[4][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
+        c->put_hevc_epel_uni_w[5][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
+        c->put_hevc_epel_uni_w[6][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
+        c->put_hevc_epel_uni_w[7][0][0] =
+            ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
+
+        c->put_hevc_epel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_epel_h4_8_msa;
+        c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_uni_w_epel_h6_8_msa;
+        c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_epel_h8_8_msa;
+        c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_epel_h12_8_msa;
+        c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_epel_h16_8_msa;
+        c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_epel_h24_8_msa;
+        c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_epel_h32_8_msa;
+
+        c->put_hevc_epel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_epel_v4_8_msa;
+        c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_uni_w_epel_v6_8_msa;
+        c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_epel_v8_8_msa;
+        c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_epel_v12_8_msa;
+        c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_epel_v16_8_msa;
+        c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_epel_v24_8_msa;
+        c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_epel_v32_8_msa;
+
+        c->put_hevc_epel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_epel_hv4_8_msa;
+        c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_uni_w_epel_hv6_8_msa;
+        c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_epel_hv8_8_msa;
+        c->put_hevc_epel_uni_w[4][1][1] =
+            ff_hevc_put_hevc_uni_w_epel_hv12_8_msa;
+        c->put_hevc_epel_uni_w[5][1][1] =
+            ff_hevc_put_hevc_uni_w_epel_hv16_8_msa;
+        c->put_hevc_epel_uni_w[6][1][1] =
+            ff_hevc_put_hevc_uni_w_epel_hv24_8_msa;
+        c->put_hevc_epel_uni_w[7][1][1] =
+            ff_hevc_put_hevc_uni_w_epel_hv32_8_msa;
+
+        c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
+        c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
+        c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
+        c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
+        c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
+        c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
+        c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_msa;
+        c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_msa;
+
+        c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_bi_qpel_h4_8_msa;
+        c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_bi_qpel_h8_8_msa;
+        c->put_hevc_qpel_bi[4][0][1] = ff_hevc_put_hevc_bi_qpel_h12_8_msa;
+        c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_8_msa;
+        c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_8_msa;
+        c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_msa;
+        c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_msa;
+        c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_msa;
+
+        c->put_hevc_qpel_bi[1][1][0] = ff_hevc_put_hevc_bi_qpel_v4_8_msa;
+        c->put_hevc_qpel_bi[3][1][0] = ff_hevc_put_hevc_bi_qpel_v8_8_msa;
+        c->put_hevc_qpel_bi[4][1][0] = ff_hevc_put_hevc_bi_qpel_v12_8_msa;
+        c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_8_msa;
+        c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_8_msa;
+        c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_msa;
+        c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_msa;
+        c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_msa;
+
+        c->put_hevc_qpel_bi[1][1][1] = ff_hevc_put_hevc_bi_qpel_hv4_8_msa;
+        c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_bi_qpel_hv8_8_msa;
+        c->put_hevc_qpel_bi[4][1][1] = ff_hevc_put_hevc_bi_qpel_hv12_8_msa;
+        c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_8_msa;
+        c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_8_msa;
+        c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_8_msa;
+        c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_8_msa;
+        c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_8_msa;
+
+        c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
+        c->put_hevc_epel_bi[2][0][0] = ff_hevc_put_hevc_bi_pel_pixels6_8_msa;
+        c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
+        c->put_hevc_epel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
+        c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
+        c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
+        c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
+
+        c->put_hevc_epel_bi[1][0][1] = ff_hevc_put_hevc_bi_epel_h4_8_msa;
+        c->put_hevc_epel_bi[2][0][1] = ff_hevc_put_hevc_bi_epel_h6_8_msa;
+        c->put_hevc_epel_bi[3][0][1] = ff_hevc_put_hevc_bi_epel_h8_8_msa;
+        c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_msa;
+        c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_msa;
+        c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_msa;
+        c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_msa;
+
+        c->put_hevc_epel_bi[1][1][0] = ff_hevc_put_hevc_bi_epel_v4_8_msa;
+        c->put_hevc_epel_bi[2][1][0] = ff_hevc_put_hevc_bi_epel_v6_8_msa;
+        c->put_hevc_epel_bi[3][1][0] = ff_hevc_put_hevc_bi_epel_v8_8_msa;
+        c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_msa;
+        c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_msa;
+        c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_8_msa;
+        c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_msa;
+
+        c->put_hevc_epel_bi[1][1][1] = ff_hevc_put_hevc_bi_epel_hv4_8_msa;
+        c->put_hevc_epel_bi[2][1][1] = ff_hevc_put_hevc_bi_epel_hv6_8_msa;
+        c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_bi_epel_hv8_8_msa;
+        c->put_hevc_epel_bi[4][1][1] = ff_hevc_put_hevc_bi_epel_hv12_8_msa;
+        c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_8_msa;
+        c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_msa;
+        c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_msa;
+
+        c->put_hevc_qpel_bi_w[1][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels4_8_msa;
+        c->put_hevc_qpel_bi_w[3][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels8_8_msa;
+        c->put_hevc_qpel_bi_w[4][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels12_8_msa;
+        c->put_hevc_qpel_bi_w[5][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels16_8_msa;
+        c->put_hevc_qpel_bi_w[6][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels24_8_msa;
+        c->put_hevc_qpel_bi_w[7][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels32_8_msa;
+        c->put_hevc_qpel_bi_w[8][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels48_8_msa;
+        c->put_hevc_qpel_bi_w[9][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels64_8_msa;
+
+        c->put_hevc_qpel_bi_w[1][0][1] = ff_hevc_put_hevc_bi_w_qpel_h4_8_msa;
+        c->put_hevc_qpel_bi_w[3][0][1] = ff_hevc_put_hevc_bi_w_qpel_h8_8_msa;
+        c->put_hevc_qpel_bi_w[4][0][1] = ff_hevc_put_hevc_bi_w_qpel_h12_8_msa;
+        c->put_hevc_qpel_bi_w[5][0][1] = ff_hevc_put_hevc_bi_w_qpel_h16_8_msa;
+        c->put_hevc_qpel_bi_w[6][0][1] = ff_hevc_put_hevc_bi_w_qpel_h24_8_msa;
+        c->put_hevc_qpel_bi_w[7][0][1] = ff_hevc_put_hevc_bi_w_qpel_h32_8_msa;
+        c->put_hevc_qpel_bi_w[8][0][1] = ff_hevc_put_hevc_bi_w_qpel_h48_8_msa;
+        c->put_hevc_qpel_bi_w[9][0][1] = ff_hevc_put_hevc_bi_w_qpel_h64_8_msa;
+
+        c->put_hevc_qpel_bi_w[1][1][0] = ff_hevc_put_hevc_bi_w_qpel_v4_8_msa;
+        c->put_hevc_qpel_bi_w[3][1][0] = ff_hevc_put_hevc_bi_w_qpel_v8_8_msa;
+        c->put_hevc_qpel_bi_w[4][1][0] = ff_hevc_put_hevc_bi_w_qpel_v12_8_msa;
+        c->put_hevc_qpel_bi_w[5][1][0] = ff_hevc_put_hevc_bi_w_qpel_v16_8_msa;
+        c->put_hevc_qpel_bi_w[6][1][0] = ff_hevc_put_hevc_bi_w_qpel_v24_8_msa;
+        c->put_hevc_qpel_bi_w[7][1][0] = ff_hevc_put_hevc_bi_w_qpel_v32_8_msa;
+        c->put_hevc_qpel_bi_w[8][1][0] = ff_hevc_put_hevc_bi_w_qpel_v48_8_msa;
+        c->put_hevc_qpel_bi_w[9][1][0] = ff_hevc_put_hevc_bi_w_qpel_v64_8_msa;
+
+        c->put_hevc_qpel_bi_w[1][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv4_8_msa;
+        c->put_hevc_qpel_bi_w[3][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv8_8_msa;
+        c->put_hevc_qpel_bi_w[4][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv12_8_msa;
+        c->put_hevc_qpel_bi_w[5][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv16_8_msa;
+        c->put_hevc_qpel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv24_8_msa;
+        c->put_hevc_qpel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv32_8_msa;
+        c->put_hevc_qpel_bi_w[8][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv48_8_msa;
+        c->put_hevc_qpel_bi_w[9][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv64_8_msa;
+
+        c->put_hevc_epel_bi_w[1][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels4_8_msa;
+        c->put_hevc_epel_bi_w[2][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels6_8_msa;
+        c->put_hevc_epel_bi_w[3][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels8_8_msa;
+        c->put_hevc_epel_bi_w[4][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels12_8_msa;
+        c->put_hevc_epel_bi_w[5][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels16_8_msa;
+        c->put_hevc_epel_bi_w[6][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels24_8_msa;
+        c->put_hevc_epel_bi_w[7][0][0] =
+            ff_hevc_put_hevc_bi_w_pel_pixels32_8_msa;
+
+        c->put_hevc_epel_bi_w[1][0][1] = ff_hevc_put_hevc_bi_w_epel_h4_8_msa;
+        c->put_hevc_epel_bi_w[2][0][1] = ff_hevc_put_hevc_bi_w_epel_h6_8_msa;
+        c->put_hevc_epel_bi_w[3][0][1] = ff_hevc_put_hevc_bi_w_epel_h8_8_msa;
+        c->put_hevc_epel_bi_w[4][0][1] = ff_hevc_put_hevc_bi_w_epel_h12_8_msa;
+        c->put_hevc_epel_bi_w[5][0][1] = ff_hevc_put_hevc_bi_w_epel_h16_8_msa;
+        c->put_hevc_epel_bi_w[6][0][1] = ff_hevc_put_hevc_bi_w_epel_h24_8_msa;
+        c->put_hevc_epel_bi_w[7][0][1] = ff_hevc_put_hevc_bi_w_epel_h32_8_msa;
+
+        c->put_hevc_epel_bi_w[1][1][0] = ff_hevc_put_hevc_bi_w_epel_v4_8_msa;
+        c->put_hevc_epel_bi_w[2][1][0] = ff_hevc_put_hevc_bi_w_epel_v6_8_msa;
+        c->put_hevc_epel_bi_w[3][1][0] = ff_hevc_put_hevc_bi_w_epel_v8_8_msa;
+        c->put_hevc_epel_bi_w[4][1][0] = ff_hevc_put_hevc_bi_w_epel_v12_8_msa;
+        c->put_hevc_epel_bi_w[5][1][0] = ff_hevc_put_hevc_bi_w_epel_v16_8_msa;
+        c->put_hevc_epel_bi_w[6][1][0] = ff_hevc_put_hevc_bi_w_epel_v24_8_msa;
+        c->put_hevc_epel_bi_w[7][1][0] = ff_hevc_put_hevc_bi_w_epel_v32_8_msa;
+
+        c->put_hevc_epel_bi_w[1][1][1] = ff_hevc_put_hevc_bi_w_epel_hv4_8_msa;
+        c->put_hevc_epel_bi_w[2][1][1] = ff_hevc_put_hevc_bi_w_epel_hv6_8_msa;
+        c->put_hevc_epel_bi_w[3][1][1] = ff_hevc_put_hevc_bi_w_epel_hv8_8_msa;
+        c->put_hevc_epel_bi_w[4][1][1] = ff_hevc_put_hevc_bi_w_epel_hv12_8_msa;
+        c->put_hevc_epel_bi_w[5][1][1] = ff_hevc_put_hevc_bi_w_epel_hv16_8_msa;
+        c->put_hevc_epel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_epel_hv24_8_msa;
+        c->put_hevc_epel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_epel_hv32_8_msa;
+
+        c->sao_band_filter[0] =
+        c->sao_band_filter[1] =
+        c->sao_band_filter[2] =
+        c->sao_band_filter[3] =
+        c->sao_band_filter[4] = ff_hevc_sao_band_filter_0_8_msa;
+
+        c->sao_edge_filter[0] =
+        c->sao_edge_filter[1] =
+        c->sao_edge_filter[2] =
+        c->sao_edge_filter[3] =
+        c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_msa;
+
+        c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_msa;
+        c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_msa;
+
+        c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_msa;
+        c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_msa;
+
+        c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_msa;
+        c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_msa;
+
+        c->hevc_h_loop_filter_chroma_c =
+            ff_hevc_loop_filter_chroma_h_8_msa;
+        c->hevc_v_loop_filter_chroma_c =
+            ff_hevc_loop_filter_chroma_v_8_msa;
+
+        c->idct[0] = ff_hevc_idct_4x4_msa;
+        c->idct[1] = ff_hevc_idct_8x8_msa;
+        c->idct[2] = ff_hevc_idct_16x16_msa;
+        c->idct[3] = ff_hevc_idct_32x32_msa;
+        c->idct_dc[0] = ff_hevc_idct_dc_4x4_msa;
+        c->idct_dc[1] = ff_hevc_idct_dc_8x8_msa;
+        c->idct_dc[2] = ff_hevc_idct_dc_16x16_msa;
+        c->idct_dc[3] = ff_hevc_idct_dc_32x32_msa;
+        c->transform_add[0] = ff_hevc_addblk_4x4_msa;
+        c->transform_add[1] = ff_hevc_addblk_8x8_msa;
+        c->transform_add[2] = ff_hevc_addblk_16x16_msa;
+        c->transform_add[3] = ff_hevc_addblk_32x32_msa;
+        c->idct_4x4_luma = ff_hevc_idct_luma_4x4_msa;
+    }
+}
+#endif  // #if HAVE_MSA
+
+void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth)
+{
+#if HAVE_MSA
+    hevc_dsp_init_msa(c, bit_depth);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/hevcdsp_mips.h b/libavcodec/mips/hevcdsp_mips.h
new file mode 100644
index 0000000000..1573d1cc9d
--- /dev/null
+++ b/libavcodec/mips/hevcdsp_mips.h
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HEVCDSP_MIPS_H
+#define AVCODEC_MIPS_HEVCDSP_MIPS_H
+
+#include "libavcodec/hevcdsp.h"
+
+#define MC(PEL, DIR, WIDTH)                                                 \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,            \
+                                                     uint8_t *src,          \
+                                                     ptrdiff_t src_stride,  \
+                                                     int height,            \
+                                                     intptr_t mx,           \
+                                                     intptr_t my,           \
+                                                     int width)
+
+MC(pel, pixels, 4);
+MC(pel, pixels, 6);
+MC(pel, pixels, 8);
+MC(pel, pixels, 12);
+MC(pel, pixels, 16);
+MC(pel, pixels, 24);
+MC(pel, pixels, 32);
+MC(pel, pixels, 48);
+MC(pel, pixels, 64);
+
+MC(qpel, h, 4);
+MC(qpel, h, 8);
+MC(qpel, h, 12);
+MC(qpel, h, 16);
+MC(qpel, h, 24);
+MC(qpel, h, 32);
+MC(qpel, h, 48);
+MC(qpel, h, 64);
+
+MC(qpel, v, 4);
+MC(qpel, v, 8);
+MC(qpel, v, 12);
+MC(qpel, v, 16);
+MC(qpel, v, 24);
+MC(qpel, v, 32);
+MC(qpel, v, 48);
+MC(qpel, v, 64);
+
+MC(qpel, hv, 4);
+MC(qpel, hv, 8);
+MC(qpel, hv, 12);
+MC(qpel, hv, 16);
+MC(qpel, hv, 24);
+MC(qpel, hv, 32);
+MC(qpel, hv, 48);
+MC(qpel, hv, 64);
+
+MC(epel, h, 4);
+MC(epel, h, 6);
+MC(epel, h, 8);
+MC(epel, h, 12);
+MC(epel, h, 16);
+MC(epel, h, 24);
+MC(epel, h, 32);
+MC(epel, h, 48);
+MC(epel, h, 64);
+
+MC(epel, v, 4);
+MC(epel, v, 6);
+MC(epel, v, 8);
+MC(epel, v, 12);
+MC(epel, v, 16);
+MC(epel, v, 24);
+MC(epel, v, 32);
+MC(epel, v, 48);
+MC(epel, v, 64);
+
+MC(epel, hv, 4);
+MC(epel, hv, 6);
+MC(epel, hv, 8);
+MC(epel, hv, 12);
+MC(epel, hv, 16);
+MC(epel, hv, 24);
+MC(epel, hv, 32);
+MC(epel, hv, 48);
+MC(epel, hv, 64);
+
+#undef MC
+
+#define UNI_MC(PEL, DIR, WIDTH)                                                \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,           \
+                                                         ptrdiff_t dst_stride, \
+                                                         uint8_t *src,         \
+                                                         ptrdiff_t src_stride, \
+                                                         int height,           \
+                                                         intptr_t mx,          \
+                                                         intptr_t my,          \
+                                                         int width)
+
+UNI_MC(pel, pixels, 4);
+UNI_MC(pel, pixels, 6);
+UNI_MC(pel, pixels, 8);
+UNI_MC(pel, pixels, 12);
+UNI_MC(pel, pixels, 16);
+UNI_MC(pel, pixels, 24);
+UNI_MC(pel, pixels, 32);
+UNI_MC(pel, pixels, 48);
+UNI_MC(pel, pixels, 64);
+
+UNI_MC(qpel, h, 4);
+UNI_MC(qpel, h, 8);
+UNI_MC(qpel, h, 12);
+UNI_MC(qpel, h, 16);
+UNI_MC(qpel, h, 24);
+UNI_MC(qpel, h, 32);
+UNI_MC(qpel, h, 48);
+UNI_MC(qpel, h, 64);
+
+UNI_MC(qpel, v, 4);
+UNI_MC(qpel, v, 8);
+UNI_MC(qpel, v, 12);
+UNI_MC(qpel, v, 16);
+UNI_MC(qpel, v, 24);
+UNI_MC(qpel, v, 32);
+UNI_MC(qpel, v, 48);
+UNI_MC(qpel, v, 64);
+
+UNI_MC(qpel, hv, 4);
+UNI_MC(qpel, hv, 8);
+UNI_MC(qpel, hv, 12);
+UNI_MC(qpel, hv, 16);
+UNI_MC(qpel, hv, 24);
+UNI_MC(qpel, hv, 32);
+UNI_MC(qpel, hv, 48);
+UNI_MC(qpel, hv, 64);
+
+UNI_MC(epel, h, 4);
+UNI_MC(epel, h, 6);
+UNI_MC(epel, h, 8);
+UNI_MC(epel, h, 12);
+UNI_MC(epel, h, 16);
+UNI_MC(epel, h, 24);
+UNI_MC(epel, h, 32);
+UNI_MC(epel, h, 48);
+UNI_MC(epel, h, 64);
+
+UNI_MC(epel, v, 4);
+UNI_MC(epel, v, 6);
+UNI_MC(epel, v, 8);
+UNI_MC(epel, v, 12);
+UNI_MC(epel, v, 16);
+UNI_MC(epel, v, 24);
+UNI_MC(epel, v, 32);
+UNI_MC(epel, v, 48);
+UNI_MC(epel, v, 64);
+
+UNI_MC(epel, hv, 4);
+UNI_MC(epel, hv, 6);
+UNI_MC(epel, hv, 8);
+UNI_MC(epel, hv, 12);
+UNI_MC(epel, hv, 16);
+UNI_MC(epel, hv, 24);
+UNI_MC(epel, hv, 32);
+UNI_MC(epel, hv, 48);
+UNI_MC(epel, hv, 64);
+
+#undef UNI_MC
+
+#define UNI_W_MC(PEL, DIR, WIDTH)                                         \
+void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,    \
+                                                           ptrdiff_t      \
+                                                           dst_stride,    \
+                                                           uint8_t *src,  \
+                                                           ptrdiff_t      \
+                                                           src_stride,    \
+                                                           int height,    \
+                                                           int denom,     \
+                                                           int weight,    \
+                                                           int offset,    \
+                                                           intptr_t mx,   \
+                                                           intptr_t my,   \
+                                                           int width)
+
+UNI_W_MC(pel, pixels, 4);
+UNI_W_MC(pel, pixels, 6);
+UNI_W_MC(pel, pixels, 8);
+UNI_W_MC(pel, pixels, 12);
+UNI_W_MC(pel, pixels, 16);
+UNI_W_MC(pel, pixels, 24);
+UNI_W_MC(pel, pixels, 32);
+UNI_W_MC(pel, pixels, 48);
+UNI_W_MC(pel, pixels, 64);
+
+UNI_W_MC(qpel, h, 4);
+UNI_W_MC(qpel, h, 8);
+UNI_W_MC(qpel, h, 12);
+UNI_W_MC(qpel, h, 16);
+UNI_W_MC(qpel, h, 24);
+UNI_W_MC(qpel, h, 32);
+UNI_W_MC(qpel, h, 48);
+UNI_W_MC(qpel, h, 64);
+
+UNI_W_MC(qpel, v, 4);
+UNI_W_MC(qpel, v, 8);
+UNI_W_MC(qpel, v, 12);
+UNI_W_MC(qpel, v, 16);
+UNI_W_MC(qpel, v, 24);
+UNI_W_MC(qpel, v, 32);
+UNI_W_MC(qpel, v, 48);
+UNI_W_MC(qpel, v, 64);
+
+UNI_W_MC(qpel, hv, 4);
+UNI_W_MC(qpel, hv, 8);
+UNI_W_MC(qpel, hv, 12);
+UNI_W_MC(qpel, hv, 16);
+UNI_W_MC(qpel, hv, 24);
+UNI_W_MC(qpel, hv, 32);
+UNI_W_MC(qpel, hv, 48);
+UNI_W_MC(qpel, hv, 64);
+
+UNI_W_MC(epel, h, 4);
+UNI_W_MC(epel, h, 6);
+UNI_W_MC(epel, h, 8);
+UNI_W_MC(epel, h, 12);
+UNI_W_MC(epel, h, 16);
+UNI_W_MC(epel, h, 24);
+UNI_W_MC(epel, h, 32);
+UNI_W_MC(epel, h, 48);
+UNI_W_MC(epel, h, 64);
+
+UNI_W_MC(epel, v, 4);
+UNI_W_MC(epel, v, 6);
+UNI_W_MC(epel, v, 8);
+UNI_W_MC(epel, v, 12);
+UNI_W_MC(epel, v, 16);
+UNI_W_MC(epel, v, 24);
+UNI_W_MC(epel, v, 32);
+UNI_W_MC(epel, v, 48);
+UNI_W_MC(epel, v, 64);
+
+UNI_W_MC(epel, hv, 4);
+UNI_W_MC(epel, hv, 6);
+UNI_W_MC(epel, hv, 8);
+UNI_W_MC(epel, hv, 12);
+UNI_W_MC(epel, hv, 16);
+UNI_W_MC(epel, hv, 24);
+UNI_W_MC(epel, hv, 32);
+UNI_W_MC(epel, hv, 48);
+UNI_W_MC(epel, hv, 64);
+
+#undef UNI_W_MC
+
+#define BI_MC(PEL, DIR, WIDTH)                                                 \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,            \
+                                                        ptrdiff_t dst_stride,  \
+                                                        uint8_t *src,          \
+                                                        ptrdiff_t src_stride,  \
+                                                        int16_t *src_16bit,    \
+                                                        int height,            \
+                                                        intptr_t mx,           \
+                                                        intptr_t my,           \
+                                                        int width)
+
+BI_MC(pel, pixels, 4);
+BI_MC(pel, pixels, 6);
+BI_MC(pel, pixels, 8);
+BI_MC(pel, pixels, 12);
+BI_MC(pel, pixels, 16);
+BI_MC(pel, pixels, 24);
+BI_MC(pel, pixels, 32);
+BI_MC(pel, pixels, 48);
+BI_MC(pel, pixels, 64);
+
+BI_MC(qpel, h, 4);
+BI_MC(qpel, h, 8);
+BI_MC(qpel, h, 12);
+BI_MC(qpel, h, 16);
+BI_MC(qpel, h, 24);
+BI_MC(qpel, h, 32);
+BI_MC(qpel, h, 48);
+BI_MC(qpel, h, 64);
+
+BI_MC(qpel, v, 4);
+BI_MC(qpel, v, 8);
+BI_MC(qpel, v, 12);
+BI_MC(qpel, v, 16);
+BI_MC(qpel, v, 24);
+BI_MC(qpel, v, 32);
+BI_MC(qpel, v, 48);
+BI_MC(qpel, v, 64);
+
+BI_MC(qpel, hv, 4);
+BI_MC(qpel, hv, 8);
+BI_MC(qpel, hv, 12);
+BI_MC(qpel, hv, 16);
+BI_MC(qpel, hv, 24);
+BI_MC(qpel, hv, 32);
+BI_MC(qpel, hv, 48);
+BI_MC(qpel, hv, 64);
+
+BI_MC(epel, h, 4);
+BI_MC(epel, h, 6);
+BI_MC(epel, h, 8);
+BI_MC(epel, h, 12);
+BI_MC(epel, h, 16);
+BI_MC(epel, h, 24);
+BI_MC(epel, h, 32);
+BI_MC(epel, h, 48);
+BI_MC(epel, h, 64);
+
+BI_MC(epel, v, 4);
+BI_MC(epel, v, 6);
+BI_MC(epel, v, 8);
+BI_MC(epel, v, 12);
+BI_MC(epel, v, 16);
+BI_MC(epel, v, 24);
+BI_MC(epel, v, 32);
+BI_MC(epel, v, 48);
+BI_MC(epel, v, 64);
+
+BI_MC(epel, hv, 4);
+BI_MC(epel, hv, 6);
+BI_MC(epel, hv, 8);
+BI_MC(epel, hv, 12);
+BI_MC(epel, hv, 16);
+BI_MC(epel, hv, 24);
+BI_MC(epel, hv, 32);
+BI_MC(epel, hv, 48);
+BI_MC(epel, hv, 64);
+
+#undef BI_MC
+
+#define BI_W_MC(PEL, DIR, WIDTH)                                               \
+void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
+                                                          ptrdiff_t            \
+                                                          dst_stride,          \
+                                                          uint8_t *src,        \
+                                                          ptrdiff_t            \
+                                                          src_stride,          \
+                                                          int16_t *src_16bit,  \
+                                                          int height,          \
+                                                          int denom,           \
+                                                          int weight0,         \
+                                                          int weight1,         \
+                                                          int offset0,         \
+                                                          int offset1,         \
+                                                          intptr_t mx,         \
+                                                          intptr_t my,         \
+                                                          int width)
+
+BI_W_MC(pel, pixels, 4);
+BI_W_MC(pel, pixels, 6);
+BI_W_MC(pel, pixels, 8);
+BI_W_MC(pel, pixels, 12);
+BI_W_MC(pel, pixels, 16);
+BI_W_MC(pel, pixels, 24);
+BI_W_MC(pel, pixels, 32);
+BI_W_MC(pel, pixels, 48);
+BI_W_MC(pel, pixels, 64);
+
+BI_W_MC(qpel, h, 4);
+BI_W_MC(qpel, h, 8);
+BI_W_MC(qpel, h, 12);
+BI_W_MC(qpel, h, 16);
+BI_W_MC(qpel, h, 24);
+BI_W_MC(qpel, h, 32);
+BI_W_MC(qpel, h, 48);
+BI_W_MC(qpel, h, 64);
+
+BI_W_MC(qpel, v, 4);
+BI_W_MC(qpel, v, 8);
+BI_W_MC(qpel, v, 12);
+BI_W_MC(qpel, v, 16);
+BI_W_MC(qpel, v, 24);
+BI_W_MC(qpel, v, 32);
+BI_W_MC(qpel, v, 48);
+BI_W_MC(qpel, v, 64);
+
+BI_W_MC(qpel, hv, 4);
+BI_W_MC(qpel, hv, 8);
+BI_W_MC(qpel, hv, 12);
+BI_W_MC(qpel, hv, 16);
+BI_W_MC(qpel, hv, 24);
+BI_W_MC(qpel, hv, 32);
+BI_W_MC(qpel, hv, 48);
+BI_W_MC(qpel, hv, 64);
+
+BI_W_MC(epel, h, 4);
+BI_W_MC(epel, h, 6);
+BI_W_MC(epel, h, 8);
+BI_W_MC(epel, h, 12);
+BI_W_MC(epel, h, 16);
+BI_W_MC(epel, h, 24);
+BI_W_MC(epel, h, 32);
+BI_W_MC(epel, h, 48);
+BI_W_MC(epel, h, 64);
+
+BI_W_MC(epel, v, 4);
+BI_W_MC(epel, v, 6);
+BI_W_MC(epel, v, 8);
+BI_W_MC(epel, v, 12);
+BI_W_MC(epel, v, 16);
+BI_W_MC(epel, v, 24);
+BI_W_MC(epel, v, 32);
+BI_W_MC(epel, v, 48);
+BI_W_MC(epel, v, 64);
+
+BI_W_MC(epel, hv, 4);
+BI_W_MC(epel, hv, 6);
+BI_W_MC(epel, hv, 8);
+BI_W_MC(epel, hv, 12);
+BI_W_MC(epel, hv, 16);
+BI_W_MC(epel, hv, 24);
+BI_W_MC(epel, hv, 32);
+BI_W_MC(epel, hv, 48);
+BI_W_MC(epel, hv, 64);
+
+#undef BI_W_MC
+
+void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
+                                      ptrdiff_t src_stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *no_p, uint8_t *no_q);
+
+void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
+                                      ptrdiff_t src_stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *no_p, uint8_t *no_q);
+
+void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
+                                        ptrdiff_t src_stride,
+                                        int32_t *tc, uint8_t *no_p,
+                                        uint8_t *no_q);
+
+void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
+                                        ptrdiff_t src_stride,
+                                        int32_t *tc, uint8_t *no_p,
+                                        uint8_t *no_q);
+
+void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
+                                     ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                     int16_t *sao_offset_val, int sao_left_class,
+                                     int width, int height);
+
+void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
+                                   ptrdiff_t stride_dst,
+                                   int16_t *sao_offset_val,
+                                   int eo, int width, int height);
+
+void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs);
+void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs);
+void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs);
+void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs);
+void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *pi16Coeffs,
+                            ptrdiff_t stride);
+void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *pi16Coeffs,
+                            ptrdiff_t stride);
+void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *pi16Coeffs,
+                              ptrdiff_t stride);
+void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *pi16Coeffs,
+                              ptrdiff_t stride);
+void ff_hevc_idct_luma_4x4_msa(int16_t *pi16Coeffs);
+
+#endif  // #ifndef AVCODEC_MIPS_HEVCDSP_MIPS_H
diff --git a/libavcodec/mips/hevcdsp_msa.c b/libavcodec/mips/hevcdsp_msa.c
new file mode 100644
index 0000000000..f2bc748e37
--- /dev/null
+++ b/libavcodec/mips/hevcdsp_msa.c
@@ -0,0 +1,3878 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+
+static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride,
+                             int16_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    v16i8 zero = { 0 };
+
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 in0;
+
+        LD_SB2(src, src_stride, src0, src1);
+
+        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
+        in0 = (v8i16) __msa_ilvr_b(zero, src0);
+        in0 <<= 6;
+        ST8x2_UB(in0, dst, 2 * dst_stride);
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1;
+
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+
+        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
+        in0 <<= 6;
+        in1 <<= 6;
+        ST8x4_UB(in0, in1, dst, 2 * dst_stride);
+    } else if (0 == height % 8) {
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 in0, in1, in2, in3;
+        uint32_t loop_cnt;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
+                       src0, src1, src2, src3);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       in0, in1, in2, in3);
+            SLLI_4V(in0, in1, in2, in3, 6);
+            ST8x8_UB(in0, in1, in2, in3, dst, 2 * dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+
+static void hevc_copy_6w_msa(uint8_t *src, int32_t src_stride,
+                             int16_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0, in1, in2, in3);
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in4, in5, in6, in7);
+        SLLI_4V(in0, in1, in2, in3, 6);
+        SLLI_4V(in4, in5, in6, in7, 6);
+        ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_copy_8w_msa(uint8_t *src, int32_t src_stride,
+                             int16_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    v16i8 zero = { 0 };
+
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 in0, in1;
+
+        LD_SB2(src, src_stride, src0, src1);
+
+        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
+        in0 <<= 6;
+        in1 <<= 6;
+        ST_SH2(in0, in1, dst, dst_stride);
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1, in2, in3;
+
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0, in1, in2, in3);
+        SLLI_4V(in0, in1, in2, in3, 6);
+        ST_SH4(in0, in1, in2, in3, dst, dst_stride);
+    } else if (6 == height) {
+        v16i8 src0, src1, src2, src3, src4, src5;
+        v8i16 in0, in1, in2, in3, in4, in5;
+
+        LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0, in1, in2, in3);
+        ILVR_B2_SH(zero, src4, zero, src5, in4, in5);
+        SLLI_4V(in0, in1, in2, in3, 6);
+        in4 <<= 6;
+        in5 <<= 6;
+        ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride);
+    } else if (0 == height % 8) {
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       in0, in1, in2, in3);
+            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                       in4, in5, in6, in7);
+            SLLI_4V(in0, in1, in2, in3, 6);
+            SLLI_4V(in4, in5, in6, in7, 6);
+            ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+
+static void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_r, in1_r, in2_r, in3_r);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
+        in0 <<= 6;
+        in1 <<= 6;
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
+        dst += (4 * dst_stride);
+
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in0_r, in1_r, in2_r, in3_r);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        ILVL_W2_SB(src5, src4, src7, src6, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
+        in0 <<= 6;
+        in1 <<= 6;
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_copy_16multx8mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       int32_t height,
+                                       int32_t width)
+{
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0_r, in1_r, in2_r, in3_r;
+    v8i16 in0_l, in1_l, in2_l, in3_l;
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       in0_r, in1_r, in2_r, in3_r);
+            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       in0_l, in1_l, in2_l, in3_l);
+            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst_tmp, dst_stride);
+            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst_tmp + 8), dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                       in0_r, in1_r, in2_r, in3_r);
+            ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                       in0_l, in1_l, in2_l, in3_l);
+            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst_tmp, dst_stride);
+            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst_tmp + 8), dst_stride);
+            dst_tmp += (4 * dst_stride);
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_copy_16w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    v16i8 zero = { 0 };
+
+    if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0_r, in1_r, in2_r, in3_r;
+        v8i16 in0_l, in1_l, in2_l, in3_l;
+
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+    } else if (12 == height) {
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v16i8 src8, src9, src10, src11;
+        v8i16 in0_r, in1_r, in2_r, in3_r;
+        v8i16 in0_l, in1_l, in2_l, in3_l;
+
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        LD_SB4(src, src_stride, src8, src9, src10, src11);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+        dst += (4 * dst_stride);
+
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+        dst += (4 * dst_stride);
+
+        ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+    } else if (0 == (height % 8)) {
+        hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride,
+                                   height, 16);
+    }
+}
+
+static void hevc_copy_24w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    hevc_copy_8w_msa(src + 16, src_stride, dst + 16, dst_stride, height);
+}
+
+static void hevc_copy_32w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+}
+
+static void hevc_copy_48w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 48);
+}
+
+static void hevc_copy_64w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+
+static void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_hz_8t_8w_msa(src, src_stride, dst, dst_stride, filter, height);
+    hevc_hz_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, filter, height);
+}
+
+static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst7 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst7, dst7, dst7, dst7);
+
+        ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
+        ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src += src_stride;
+        LD_SB2(src, 16, src2, src3);
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+
+        ST_SH2(dst0, dst1, dst, 8);
+        ST_SH(dst2, dst + 16);
+        dst += dst_stride;
+        ST_SH2(dst3, dst4, dst, 8);
+        ST_SH(dst5, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src2 = LD_SB(src + 24);
+        src += src_stride;
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB3(src, 16, src0, src1, src2);
+        src3 = LD_SB(src + 40);
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+
+        ST_SH6(dst0, dst1, dst2, dst3, dst4, dst5, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB4(src, 16, src0, src1, src2, src3);
+        src4 = LD_SB(src + 56);
+        src += src_stride;
+        XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        ST_SH(dst0, dst);
+
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        ST_SH(dst1, dst + 8);
+
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        ST_SH(dst2, dst + 16);
+
+        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        ST_SH(dst3, dst + 24);
+
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        ST_SH(dst4, dst + 32);
+
+        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        ST_SH(dst5, dst + 40);
+
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+        ST_SH(dst6, dst + 48);
+
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst7 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst7, dst7, dst7, dst7);
+        ST_SH(dst7, dst + 56);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src9, src10, src11, src12, src13, src14;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v16i8 src12111110, src14131312;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src -= (3 * src_stride);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src2110, src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src7, src8, src9, src10, src11, src12, src13, src14);
+        src += (8 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+                   src1110_r, src1211_r, src1312_r, src1413_r);
+        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r,
+                   src1211_r, src1110_r, src1413_r, src1312_r,
+                   src8776, src10998, src12111110, src14131312);
+        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
+
+        dst10 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
+                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
+        dst76 = const_vec;
+        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
+                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
+
+        ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+
+        src2110 = src10998;
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;
+    }
+}
+
+static void hevc_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filter_vec, const_vec;
+    v8i16 filt0, filt1, filt2, filt3;
+
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+
+        dst0_r = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3,
+                     dst0_r, dst0_r, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3,
+                     dst1_r, dst1_r, dst1_r, dst1_r);
+        dst2_r = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3,
+                     dst2_r, dst2_r, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3,
+                     dst3_r, dst3_r, dst3_r, dst3_r);
+
+        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filter_vec, const_vec;
+    v8i16 filt0, filt1, filt2, filt3;
+
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_l, src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
+               src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_l, src87_l, src98_l, src109_l);
+        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
+
+        dst0_r = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3,
+                     dst0_r, dst0_r, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3,
+                     dst1_r, dst1_r, dst1_r, dst1_r);
+        dst2_r = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3,
+                     dst2_r, dst2_r, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3,
+                     dst3_r, dst3_r, dst3_r, dst3_r);
+        dst0_l = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3,
+                     dst0_l, dst0_l, dst0_l, dst0_l);
+        dst1_l = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3,
+                     dst1_l, dst1_l, dst1_l, dst1_l);
+
+        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
+        ST8x4_UB(dst0_l, dst1_l, dst + 8, 2 * dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_8t_16multx4mult_msa(uint8_t *src,
+                                        int32_t src_stride,
+                                        int16_t *dst,
+                                        int32_t dst_stride,
+                                        const int8_t *filter,
+                                        int32_t height,
+                                        int32_t width)
+{
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    int32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
+    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filter_vec, const_vec;
+    v8i16 filt0, filt1, filt2, filt3;
+
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = width >> 4; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_r, src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_l, src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+            src_tmp += (4 * src_stride);
+            XORI_B4_128_SB(src7, src8, src9, src10);
+            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                       src76_r, src87_r, src98_r, src109_r);
+            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                       src76_l, src87_l, src98_l, src109_l);
+
+            dst0_r = const_vec;
+            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                         filt0, filt1, filt2, filt3,
+                         dst0_r, dst0_r, dst0_r, dst0_r);
+            dst1_r = const_vec;
+            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                         filt0, filt1, filt2, filt3,
+                         dst1_r, dst1_r, dst1_r, dst1_r);
+            dst2_r = const_vec;
+            DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                         filt0, filt1, filt2, filt3,
+                         dst2_r, dst2_r, dst2_r, dst2_r);
+            dst3_r = const_vec;
+            DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                         filt0, filt1, filt2, filt3,
+                         dst3_r, dst3_r, dst3_r, dst3_r);
+            dst0_l = const_vec;
+            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
+                         filt0, filt1, filt2, filt3,
+                         dst0_l, dst0_l, dst0_l, dst0_l);
+            dst1_l = const_vec;
+            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
+                         filt0, filt1, filt2, filt3,
+                         dst1_l, dst1_l, dst1_l, dst1_l);
+            dst2_l = const_vec;
+            DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l,
+                         filt0, filt1, filt2, filt3,
+                         dst2_l, dst2_l, dst2_l, dst2_l);
+            dst3_l = const_vec;
+            DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l,
+                         filt0, filt1, filt2, filt3,
+                         dst3_l, dst3_l, dst3_l, dst3_l);
+
+            ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
+            ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            src10_r = src54_r;
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l;
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                filter, height, 16);
+}
+
+static void hevc_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                filter, height, 16);
+    hevc_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
+                      filter, height);
+}
+
+static void hevc_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                filter, height, 32);
+}
+
+static void hevc_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                filter, height, 48);
+}
+
+static void hevc_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                filter, height, 64);
+}
+
+static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
+    v4i32 dst0_r, dst1_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v16i8 mask0 = {
+        0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+    };
+    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+    src -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+    dst30 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                 dst30, dst30, dst30, dst30);
+    dst41 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                 dst41, dst41, dst41, dst41);
+    dst52 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                 dst52, dst52, dst52, dst52);
+    dst63 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                 dst63, dst63, dst63, dst63);
+
+    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
+               dst10_r, dst21_r, dst32_r);
+    dst43_r = __msa_ilvl_h(dst41, dst30);
+    dst54_r = __msa_ilvl_h(dst52, dst41);
+    dst65_r = __msa_ilvl_h(dst63, dst52);
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        LD_SB2(src, src_stride, src7, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src7, src8);
+
+        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst87 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst87, dst87, dst87, dst87);
+        dst76_r = __msa_ilvr_h(dst87, dst66);
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst0_r >>= 6;
+        dst1_r >>= 6;
+
+        dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
+        ST8x2_UB(dst0_r, dst, (2 * dst_stride));
+        dst += (2 * dst_stride);
+
+        dst10_r = dst32_r;
+        dst32_r = dst54_r;
+        dst54_r = dst76_r;
+        dst21_r = dst43_r;
+        dst43_r = dst65_r;
+        dst65_r = dst87_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
+    }
+}
+
+static void hevc_hv_8t_8multx2mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height, int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        /* row 0 row 1 row 2 row 3 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        /* row 4 row 5 row 6 */
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+
+        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_r, dst32_r, dst54_r, dst21_r);
+        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_l, dst32_l, dst54_l, dst21_l);
+        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            LD_SB2(src_tmp, src_stride, src7, src8);
+            XORI_B2_128_SB(src7, src8);
+            src_tmp += 2 * src_stride;
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst7, dst7, dst7, dst7);
+
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+            ST_SW(dst0_r, dst_tmp);
+            dst_tmp += dst_stride;
+
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst8 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst8, dst8, dst8, dst8);
+
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst6 = dst8;
+            dst0_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+            ST_SW(dst0_r, dst_tmp);
+            dst_tmp += dst_stride;
+
+            dst10_r = dst32_r;
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 8);
+}
+
+static void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 8);
+
+    hevc_hv_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
+                      filter_x, filter_y, height);
+}
+
+static void hevc_hv_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 64);
+}
+
+static void hevc_hz_4t_4x2_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1;
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB2(src, src_stride, src0, src1);
+    XORI_B2_128_SB(src0, src1);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+    ST8x2_UB(dst0, dst, 2 * dst_stride);
+}
+
+static void hevc_hz_4t_4x4_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+    ST8x4_UB(dst0, dst1, dst, 2 * dst_stride);
+}
+
+static void hevc_hz_4t_4x8multiple_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter,
+                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_4w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    if (2 == height) {
+        hevc_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (4 == height) {
+        hevc_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (0 == height % 8) {
+        hevc_hz_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+    }
+}
+
+static void hevc_hz_4t_6w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
+    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
+    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
+        dst_val2 = __msa_copy_u_d((v2i64) dst2, 0);
+        dst_val3 = __msa_copy_u_d((v2i64) dst3, 0);
+
+        dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2);
+        dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2);
+        dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2);
+        dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2);
+
+        SD(dst_val0, dst);
+        SW(dst_val_int0, dst + 4);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        SW(dst_val_int1, dst + 4);
+        dst += dst_stride;
+        SD(dst_val2, dst);
+        SW(dst_val_int2, dst + 4);
+        dst += dst_stride;
+        SD(dst_val3, dst);
+        SW(dst_val_int3, dst + 4);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_4t_8x2multiple_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter,
+                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1, dst0, dst1;
+    v16i8 src0, src1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src0, src1);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        ST_SH2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_8x4multiple_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter,
+                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_8w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    if (2 == height || 6 == height) {
+        hevc_hz_4t_8x2multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+    } else {
+        hevc_hz_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+    }
+}
+
+static void hevc_hz_4t_12w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask2 = {
+        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+    };
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask3 = mask2 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        ST8x4_UB(dst4, dst5, dst + 8, 2 * dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_16w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 src4, src5, src6, src7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+        ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
+        ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_24w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    int16_t *dst_tmp = dst + 16;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask00, mask11;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask00 = mask0 + 8;
+    mask11 = mask0 + 10;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 16 width */
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH2(dst0, dst1, dst, 8);
+        dst += dst_stride;
+        ST_SH2(dst2, dst3, dst, 8);
+        dst += dst_stride;
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH2(dst0, dst1, dst, 8);
+        dst += dst_stride;
+        ST_SH2(dst2, dst3, dst, 8);
+        dst += dst_stride;
+
+        /* 8 width */
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
+        dst_tmp += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_32w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, 16, src0, src1);
+        src2 = LD_SB(src + 24);
+        src += src_stride;
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
+        dst += dst_stride;
+
+        LD_SB2(src, 16, src0, src1);
+        src2 = LD_SB(src + 24);
+        src += src_stride;
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_4x2_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src2110, src4332;
+    v8i16 dst10;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+
+    ST8x2_UB(dst10, dst, 2 * dst_stride);
+}
+
+static void hevc_vt_4t_4x4_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst10, dst32;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src2110, src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    dst32 = const_vec;
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+
+    ST8x4_UB(dst10, dst32, dst, 2 * dst_stride);
+}
+
+static void hevc_vt_4t_4x8multiple_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter,
+                                       int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
+        src += (6 * src_stride);
+
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
+                   src4332, src6554, src8776);
+        XORI_B3_128_SB(src4332, src6554, src8776);
+
+        dst10 = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
+
+        LD_SB2(src, src_stride, src9, src2);
+        src += (2 * src_stride);
+        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+        dst76 = const_vec;
+        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
+
+        ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_vt_4t_4w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    if (2 == height) {
+        hevc_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (4 == height) {
+        hevc_vt_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, height);
+    } else if (0 == (height % 8)) {
+        hevc_vt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+    }
+}
+
+static void hevc_vt_4t_6w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    int32_t loop_cnt;
+    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
+    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+        LD_SB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src1, src2);
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
+        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
+        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);
+
+        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
+        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
+        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
+        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);
+
+        SD(dst_val0, dst);
+        SW(dst_val_int0, dst + 4);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        SW(dst_val_int1, dst + 4);
+        dst += dst_stride;
+        SD(dst_val2, dst);
+        SW(dst_val_int2, dst + 4);
+        dst += dst_stride;
+        SD(dst_val3, dst);
+        SW(dst_val_int3, dst + 4);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_8x2_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+}
+
+static void hevc_vt_4t_8x6_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_SB2(src, src_stride, src1, src2);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src1, src2);
+
+    ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+
+    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+}
+
+static void hevc_vt_4t_8x4multiple_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter,
+                                       int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        LD_SB2(src, src_stride, src5, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_4t_8w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    if (2 == height) {
+        hevc_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (6 == height) {
+        hevc_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        hevc_vt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+    }
+}
+
+static void hevc_vt_4t_12w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= (1 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
+
+        LD_SB2(src, src_stride, src5, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);
+
+        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
+        ST8x4_UB(dst0_l, dst1_l, dst + 8, (2 * dst_stride));
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_4t_16w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        ST_SH2(dst0_r, dst0_l, dst, 8);
+        dst += dst_stride;
+        ST_SH2(dst1_r, dst1_l, dst, 8);
+        dst += dst_stride;
+
+        LD_SB2(src, src_stride, src5, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        ST_SH2(dst0_r, dst0_l, dst, 8);
+        dst += dst_stride;
+        ST_SH2(dst1_r, dst1_l, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_24w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    LD_SB3(src + 16, src_stride, src6, src7, src8);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        LD_SB2(src + 16, src_stride, src9, src10);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
+
+        ST_SH2(dst0_r, dst0_l, dst, 8);
+        ST_SH(dst2_r, dst + 16);
+        dst += dst_stride;
+        ST_SH2(dst1_r, dst1_l, dst, 8);
+        ST_SH(dst3_r, dst + 16);
+        dst += dst_stride;
+
+        LD_SB2(src, src_stride, src5, src2);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        LD_SB2(src + 16, src_stride, src11, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
+
+        ST_SH2(dst0_r, dst0_l, dst, 8);
+        ST_SH(dst2_r, dst + 16);
+        dst += dst_stride;
+        ST_SH2(dst1_r, dst1_l, dst, 8);
+        ST_SH(dst3_r, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_32w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    LD_SB3(src + 16, src_stride, src6, src7, src8);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        LD_SB2(src + 16, src_stride, src9, src10);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
+        dst2_l = const_vec;
+        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
+        dst3_l = const_vec;
+        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
+
+        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
+        dst += dst_stride;
+        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
+        dst += dst_stride;
+
+        LD_SB2(src, src_stride, src5, src2);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        LD_SB2(src + 16, src_stride, src11, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+        ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
+        dst2_l = const_vec;
+        DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
+        dst3_l = const_vec;
+        DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l);
+
+        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
+        dst += dst_stride;
+        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hv_4t_4x2_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst1_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+
+    src -= (src_stride + 1);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+
+    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
+    ST8x2_UB(dst0_r, dst, 2 * dst_stride);
+}
+
+static void hevc_hv_4t_4x4_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    LD_SB4(src, src_stride, src3, src4, src5, src6);
+    XORI_B4_128_SB(src3, src4, src5, src6);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    dst10_r = __msa_ilvr_h(dst5, dst4);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+    dst2_r >>= 6;
+
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+    dst21_r = __msa_ilvr_h(dst2, dst5);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+    dst3_r >>= 6;
+
+    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, 2 * dst_stride);
+}
+
+
+static void hevc_hv_4t_4multx8mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+
+    src -= (src_stride + 1);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        dst32_r = __msa_ilvr_h(dst3, dst2);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_r >>= 6;
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        dst43_r = __msa_ilvr_h(dst4, dst3);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_r >>= 6;
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        dst54_r = __msa_ilvr_h(dst5, dst4);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_r >>= 6;
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+        dst65_r = __msa_ilvr_h(dst6, dst5);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_r >>= 6;
+
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+        dst76_r = __msa_ilvr_h(dst7, dst6);
+        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst4_r >>= 6;
+
+        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+        dst8 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+
+        dst87_r = __msa_ilvr_h(dst8, dst7);
+        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst5_r >>= 6;
+
+        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
+        dst9 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
+
+        dst10_r = __msa_ilvr_h(dst9, dst8);
+        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
+        dst6_r >>= 6;
+
+        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        dst21_r = __msa_ilvr_h(dst2, dst9);
+        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
+        dst7_r >>= 6;
+
+        PCKEV_H4_SW(dst1_r, dst0_r, dst3_r, dst2_r,
+                    dst5_r, dst4_r, dst7_r, dst6_r,
+                    dst0_r, dst1_r, dst2_r, dst3_r);
+        ST8x8_UB(dst0_r, dst1_r, dst2_r, dst3_r, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hv_4t_4w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter_x,
+                              const int8_t *filter_y,
+                              int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y);
+    } else if (4 == height) {
+        hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y);
+    } else if (0 == (height % 8)) {
+        hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height);
+    }
+}
+
+static void hevc_hv_4t_6w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter_x,
+                              const int8_t *filter_y,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
+    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+
+    src -= (src_stride + 1);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        dst1_l >>= 6;
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+        dst2_r >>= 6;
+        dst2_l >>= 6;
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+        dst3_r >>= 6;
+        dst3_l >>= 6;
+
+        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
+                    dst2_l, dst2_r, dst3_l, dst3_r,
+                    dst0_r, dst1_r, dst2_r, dst3_r);
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
+        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
+        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);
+
+        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
+        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
+        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
+        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);
+
+        SD(dst_val0, dst);
+        SW(dst_val_int0, dst + 4);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        SW(dst_val_int1, dst + 4);
+        dst += dst_stride;
+        SD(dst_val2, dst);
+        SW(dst_val_int2, dst + 4);
+        dst += dst_stride;
+        SD(dst_val3, dst);
+        SW(dst_val_int3, dst + 4);
+        dst += dst_stride;
+
+    }
+}
+
+static void hevc_hv_4t_8x2_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+
+    PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+}
+
+static void hevc_hv_4t_8x6_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);
+
+    XORI_B2_128_SB(src3, src4);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+
+    LD_SB2(src, src_stride, src5, src6);
+    src += (2 * src_stride);
+
+    XORI_B2_128_SB(src5, src6);
+
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    dst2_l >>= 6;
+
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst6 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    dst3_l >>= 6;
+
+    LD_SB2(src, src_stride, src7, src8);
+
+    XORI_B2_128_SB(src7, src8);
+
+    /* row 7 */
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    dst7 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+    dst4_r >>= 6;
+    dst4_l >>= 6;
+
+    /* row 8 */
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+    dst8 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+    dst5_r >>= 6;
+    dst5_l >>= 6;
+
+    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
+                dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);
+
+    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+    ST_SW2(dst2_r, dst3_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+    ST_SW2(dst4_r, dst5_r, dst, dst_stride);
+}
+
+static void hevc_hv_4t_8multx4mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height,
+                                       int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB3(src_tmp, src_stride, src0, src1, src2);
+        src_tmp += (3 * src_stride);
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+            src_tmp += (4 * src_stride);
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            dst3 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            /* row 4 */
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+            dst4 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            /* row 5 */
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+            dst5 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+
+            dst2_r >>= 6;
+            dst2_l >>= 6;
+
+            /* row 6 */
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+            dst2 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+
+            dst3_r >>= 6;
+            dst3_l >>= 6;
+
+            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r,
+                        dst0_r, dst1_r, dst2_r, dst3_r);
+
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+            ST_SW2(dst2_r, dst3_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_4t_8w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter_x,
+                              const int8_t *filter_y,
+                              int32_t height)
+{
+
+    if (2 == height) {
+        hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y, height);
+    } else if (6 == height) {
+        hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y, height);
+    } else if (0 == (height % 4)) {
+        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 8);
+    }
+}
+
+static void hevc_hv_4t_12w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 8);
+
+    hevc_hv_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
+                      filter_x, filter_y, height);
+
+}
+
+static void hevc_hv_4t_16w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_4t_24w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_4t_32w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 32);
+}
+
+#define MC_COPY(WIDTH)                                                    \
+void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst,             \
+                                                uint8_t *src,             \
+                                                ptrdiff_t src_stride,     \
+                                                int height,               \
+                                                intptr_t mx,              \
+                                                intptr_t my,              \
+                                                int width)                \
+{                                                                         \
+    hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height);  \
+}
+
+MC_COPY(4);
+MC_COPY(6);
+MC_COPY(8);
+MC_COPY(12);
+MC_COPY(16);
+MC_COPY(24);
+MC_COPY(32);
+MC_COPY(48);
+MC_COPY(64);
+
+#undef MC_COPY
+
+#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                            \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,            \
+                                                     uint8_t *src,          \
+                                                     ptrdiff_t src_stride,  \
+                                                     int height,            \
+                                                     intptr_t mx,           \
+                                                     intptr_t my,           \
+                                                     int width)             \
+{                                                                           \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];           \
+                                                                            \
+    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,             \
+                                          MAX_PB_SIZE, filter, height);     \
+}
+
+MC(qpel, h, 4, 8, hz, mx);
+MC(qpel, h, 8, 8, hz, mx);
+MC(qpel, h, 12, 8, hz, mx);
+MC(qpel, h, 16, 8, hz, mx);
+MC(qpel, h, 24, 8, hz, mx);
+MC(qpel, h, 32, 8, hz, mx);
+MC(qpel, h, 48, 8, hz, mx);
+MC(qpel, h, 64, 8, hz, mx);
+
+MC(qpel, v, 4, 8, vt, my);
+MC(qpel, v, 8, 8, vt, my);
+MC(qpel, v, 12, 8, vt, my);
+MC(qpel, v, 16, 8, vt, my);
+MC(qpel, v, 24, 8, vt, my);
+MC(qpel, v, 32, 8, vt, my);
+MC(qpel, v, 48, 8, vt, my);
+MC(qpel, v, 64, 8, vt, my);
+
+MC(epel, h, 4, 4, hz, mx);
+MC(epel, h, 6, 4, hz, mx);
+MC(epel, h, 8, 4, hz, mx);
+MC(epel, h, 12, 4, hz, mx);
+MC(epel, h, 16, 4, hz, mx);
+MC(epel, h, 24, 4, hz, mx);
+MC(epel, h, 32, 4, hz, mx);
+
+MC(epel, v, 4, 4, vt, my);
+MC(epel, v, 6, 4, vt, my);
+MC(epel, v, 8, 4, vt, my);
+MC(epel, v, 12, 4, vt, my);
+MC(epel, v, 16, 4, vt, my);
+MC(epel, v, 24, 4, vt, my);
+MC(epel, v, 32, 4, vt, my);
+
+#undef MC
+
+#define MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                                     \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,              \
+                                                     uint8_t *src,            \
+                                                     ptrdiff_t src_stride,    \
+                                                     int height,              \
+                                                     intptr_t mx,             \
+                                                     intptr_t my,             \
+                                                     int width)               \
+{                                                                             \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                 \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                 \
+                                                                              \
+    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE,  \
+                                          filter_x, filter_y, height);        \
+}
+
+MC_HV(qpel, hv, 4, 8, hv);
+MC_HV(qpel, hv, 8, 8, hv);
+MC_HV(qpel, hv, 12, 8, hv);
+MC_HV(qpel, hv, 16, 8, hv);
+MC_HV(qpel, hv, 24, 8, hv);
+MC_HV(qpel, hv, 32, 8, hv);
+MC_HV(qpel, hv, 48, 8, hv);
+MC_HV(qpel, hv, 64, 8, hv);
+
+MC_HV(epel, hv, 4, 4, hv);
+MC_HV(epel, hv, 6, 4, hv);
+MC_HV(epel, hv, 8, 4, hv);
+MC_HV(epel, hv, 12, 4, hv);
+MC_HV(epel, hv, 16, 4, hv);
+MC_HV(epel, hv, 24, 4, hv);
+MC_HV(epel, hv, 32, 4, hv);
+
+#undef MC_HV
diff --git a/libavcodec/mips/hevcpred_init_mips.c b/libavcodec/mips/hevcpred_init_mips.c
new file mode 100644
index 0000000000..331cfac115
--- /dev/null
+++ b/libavcodec/mips/hevcpred_init_mips.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/hevc.h"
+#include "libavcodec/mips/hevcpred_mips.h"
+
+#if HAVE_MSA
+static av_cold void hevc_pred_init_msa(HEVCPredContext *c, const int bit_depth)
+{
+    if (8 == bit_depth) {
+        c->intra_pred[2] = ff_intra_pred_8_16x16_msa;
+        c->intra_pred[3] = ff_intra_pred_8_32x32_msa;
+        c->pred_planar[0] = ff_hevc_intra_pred_planar_0_msa;
+        c->pred_planar[1] = ff_hevc_intra_pred_planar_1_msa;
+        c->pred_planar[2] = ff_hevc_intra_pred_planar_2_msa;
+        c->pred_planar[3] = ff_hevc_intra_pred_planar_3_msa;
+        c->pred_dc = ff_hevc_intra_pred_dc_msa;
+        c->pred_angular[0] = ff_pred_intra_pred_angular_0_msa;
+        c->pred_angular[1] = ff_pred_intra_pred_angular_1_msa;
+        c->pred_angular[2] = ff_pred_intra_pred_angular_2_msa;
+        c->pred_angular[3] = ff_pred_intra_pred_angular_3_msa;
+    }
+}
+#endif  // #if HAVE_MSA
+
+void ff_hevc_pred_init_mips(HEVCPredContext *c, const int bit_depth)
+{
+#if HAVE_MSA
+    hevc_pred_init_msa(c, bit_depth);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/hevcpred_mips.h b/libavcodec/mips/hevcpred_mips.h
new file mode 100644
index 0000000000..12f57a2a3c
--- /dev/null
+++ b/libavcodec/mips/hevcpred_mips.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HEVCPRED_MIPS_H
+#define AVCODEC_MIPS_HEVCPRED_MIPS_H
+
+#include "libavcodec/hevcdsp.h"
+
+void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
+                               const uint8_t *src_left,
+                               ptrdiff_t stride, int log2, int c_idx);
+
+void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx);
+void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx);
+
+#endif  // #ifndef AVCODEC_MIPS_HEVCPRED_MIPS_H
diff --git a/libavcodec/mips/hevcpred_msa.c b/libavcodec/mips/hevcpred_msa.c
new file mode 100644
index 0000000000..5d9299fb32
--- /dev/null
+++ b/libavcodec/mips/hevcpred_msa.c
@@ -0,0 +1,3084 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/hevc.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "hevcpred_mips.h"
+
+static const int8_t intra_pred_angle_up[17] = {
+    -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
+};
+
+static const int8_t intra_pred_angle_low[16] = {
+    32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
+};
+
+#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,          \
+                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3,  \
+                              res0, res1, mul_val_b0, mul_val_b1, round)       \
+{                                                                              \
+    v8i16 res0_m, res1_m, res2_m, res3_m;                                      \
+                                                                               \
+    MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                 \
+         mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                    \
+                                                                               \
+    res0_m += mul_val_h1 * tmp0;                                               \
+    res1_m += mul_val_h3 * tmp0;                                               \
+    res2_m += mul_val_h1 * tmp0;                                               \
+    res3_m += mul_val_h3 * tmp0;                                               \
+                                                                               \
+    res0_m += mul_val_b0 * src0_r;                                             \
+    res1_m += mul_val_b0 * src0_l;                                             \
+    res2_m += (mul_val_b0 - 1) * src0_r;                                       \
+    res3_m += (mul_val_b0 - 1) * src0_l;                                       \
+                                                                               \
+    res0_m += mul_val_b1 * tmp1;                                               \
+    res1_m += mul_val_b1 * tmp1;                                               \
+    res2_m += (mul_val_b1 + 1) * tmp1;                                         \
+    res3_m += (mul_val_b1 + 1) * tmp1;                                         \
+                                                                               \
+    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                        \
+    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                   \
+}
+
+static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
+                                         const uint8_t *src_left,
+                                         uint8_t *dst, int32_t stride,
+                                         int32_t flag)
+{
+    uint32_t col;
+    uint32_t src_data;
+    v8i16 vec0, vec1, vec2;
+    v16i8 zero = { 0 };
+
+    src_data = LW(src_top);
+    SW4(src_data, src_data, src_data, src_data, dst, stride);
+
+    if (0 == flag) {
+        src_data = LW(src_left);
+
+        vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);
+
+        vec0 = __msa_fill_h(src_left[-1]);
+        vec1 = __msa_fill_h(src_top[0]);
+
+        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
+        vec2 -= vec0;
+        vec2 >>= 1;
+        vec2 += vec1;
+        vec2 = CLIP_SH_0_255(vec2);
+
+        for (col = 0; col < 4; col++) {
+            dst[stride * col] = (uint8_t) vec2[col];
+        }
+    }
+}
+
+static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
+                                         const uint8_t *src_left,
+                                         uint8_t *dst, int32_t stride,
+                                         int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t row;
+    uint16_t val0, val1, val2, val3;
+    uint64_t src_data1;
+    v8i16 vec0, vec1, vec2;
+    v16i8 zero = { 0 };
+
+    src_data1 = LD(src_top);
+
+    for (row = 8; row--;) {
+        SD(src_data1, tmp_dst);
+        tmp_dst += stride;
+    }
+
+    if (0 == flag) {
+        src_data1 = LD(src_left);
+
+        vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);
+
+        vec0 = __msa_fill_h(src_left[-1]);
+        vec1 = __msa_fill_h(src_top[0]);
+
+        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
+        vec2 -= vec0;
+        vec2 >>= 1;
+        vec2 += vec1;
+        vec2 = CLIP_SH_0_255(vec2);
+
+        val0 = vec2[0];
+        val1 = vec2[1];
+        val2 = vec2[2];
+        val3 = vec2[3];
+
+        dst[0] = val0;
+        dst[stride] = val1;
+        dst[2 * stride] = val2;
+        dst[3 * stride] = val3;
+
+        val0 = vec2[4];
+        val1 = vec2[5];
+        val2 = vec2[6];
+        val3 = vec2[7];
+
+        dst[4 * stride] = val0;
+        dst[5 * stride] = val1;
+        dst[6 * stride] = val2;
+        dst[7 * stride] = val3;
+    }
+}
+
+static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
+                                           const uint8_t *src_left,
+                                           uint8_t *dst, int32_t stride,
+                                           int32_t flag)
+{
+    int32_t col;
+    uint8_t *tmp_dst = dst;
+    uint32_t row;
+    v16u8 src;
+    v8i16 vec0, vec1, vec2, vec3;
+
+    src = LD_UB(src_top);
+
+    for (row = 16; row--;) {
+        ST_UB(src, tmp_dst);
+        tmp_dst += stride;
+    }
+
+    if (0 == flag) {
+        src = LD_UB(src_left);
+
+        vec0 = __msa_fill_h(src_left[-1]);
+        vec1 = __msa_fill_h(src_top[0]);
+
+        UNPCK_UB_SH(src, vec2, vec3);
+        SUB2(vec2, vec0, vec3, vec0, vec2, vec3);
+
+        vec2 >>= 1;
+        vec3 >>= 1;
+
+        ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
+        CLIP_SH2_0_255(vec2, vec3);
+
+        src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
+
+        for (col = 0; col < 16; col++) {
+            dst[stride * col] = src[col];
+        }
+    }
+}
+
+static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride,
+                                          int32_t flag)
+{
+    uint32_t val0, val1, val2, val3;
+    v16i8 src0;
+    v8i16 src0_r, src_top_val, src_left_val;
+    v16i8 zero = { 0 };
+
+    val0 = src_left[0] * 0x01010101;
+    val1 = src_left[1] * 0x01010101;
+    val2 = src_left[2] * 0x01010101;
+    val3 = src_left[3] * 0x01010101;
+    SW4(val0, val1, val2, val3, dst, stride);
+
+    if (0 == flag) {
+        val0 = LW(src_top);
+        src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
+        src_top_val = __msa_fill_h(src_top[-1]);
+        src_left_val = __msa_fill_h(src_left[0]);
+
+        src0_r = (v8i16) __msa_ilvr_b(zero, src0);
+
+        src0_r -= src_top_val;
+        src0_r >>= 1;
+        src0_r += src_left_val;
+        src0_r = CLIP_SH_0_255(src0_r);
+        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
+        val0 = __msa_copy_s_w((v4i32) src0, 0);
+        SW(val0, dst);
+    }
+}
+
+static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride,
+                                          int32_t flag)
+{
+    uint64_t val0, val1, val2, val3;
+    v16i8 src0;
+    v8i16 src0_r, src_top_val, src_left_val;
+    v16i8 zero = { 0 };
+
+    val0 = src_left[0] * 0x0101010101010101;
+    val1 = src_left[1] * 0x0101010101010101;
+    val2 = src_left[2] * 0x0101010101010101;
+    val3 = src_left[3] * 0x0101010101010101;
+    SD4(val0, val1, val2, val3, dst, stride);
+
+    val0 = src_left[4] * 0x0101010101010101;
+    val1 = src_left[5] * 0x0101010101010101;
+    val2 = src_left[6] * 0x0101010101010101;
+    val3 = src_left[7] * 0x0101010101010101;
+    SD4(val0, val1, val2, val3, dst + 4 * stride, stride);
+
+    if (0 == flag) {
+        val0 = LD(src_top);
+        src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
+        src_top_val = __msa_fill_h(src_top[-1]);
+        src_left_val = __msa_fill_h(src_left[0]);
+
+        src0_r = (v8i16) __msa_ilvr_b(zero, src0);
+
+        src0_r -= src_top_val;
+        src0_r >>= 1;
+        src0_r += src_left_val;
+        src0_r = CLIP_SH_0_255(src0_r);
+        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
+        val0 = __msa_copy_s_d((v2i64) src0, 0);
+        SD(val0, dst);
+    }
+}
+
+static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t stride,
+                                            int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t row;
+    uint8_t inp0, inp1, inp2, inp3;
+    v16i8 src0, src1, src2, src3;
+    v8i16 src0_r, src0_l, src_left_val, src_top_val;
+
+    src_left_val = __msa_fill_h(src_left[0]);
+
+    for (row = 4; row--;) {
+        inp0 = src_left[0];
+        inp1 = src_left[1];
+        inp2 = src_left[2];
+        inp3 = src_left[3];
+        src_left += 4;
+
+        src0 = __msa_fill_b(inp0);
+        src1 = __msa_fill_b(inp1);
+        src2 = __msa_fill_b(inp2);
+        src3 = __msa_fill_b(inp3);
+
+        ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
+        tmp_dst += (4 * stride);
+    }
+
+    if (0 == flag) {
+        src0 = LD_SB(src_top);
+        src_top_val = __msa_fill_h(src_top[-1]);
+
+        UNPCK_UB_SH(src0, src0_r, src0_l);
+        SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);
+
+        src0_r >>= 1;
+        src0_l >>= 1;
+
+        ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
+        CLIP_SH2_0_255(src0_r, src0_l);
+        src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
+        ST_SB(src0, dst);
+    }
+}
+
+static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t stride)
+{
+    uint32_t row;
+    uint8_t inp0, inp1, inp2, inp3;
+    v16i8 src0, src1, src2, src3;
+
+    for (row = 0; row < 8; row++) {
+        inp0 = src_left[row * 4];
+        inp1 = src_left[row * 4 + 1];
+        inp2 = src_left[row * 4 + 2];
+        inp3 = src_left[row * 4 + 3];
+
+        src0 = __msa_fill_b(inp0);
+        src1 = __msa_fill_b(inp1);
+        src2 = __msa_fill_b(inp2);
+        src3 = __msa_fill_b(inp3);
+
+        ST_SB2(src0, src0, dst, 16);
+        dst += stride;
+        ST_SB2(src1, src1, dst, 16);
+        dst += stride;
+        ST_SB2(src2, src2, dst, 16);
+        dst += stride;
+        ST_SB2(src3, src3, dst, 16);
+        dst += stride;
+    }
+}
+
+static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t stride,
+                                       int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t addition = 0;
+    uint32_t val0, val1, val2;
+    v16i8 src = { 0 };
+    v16u8 store;
+    v16i8 zero = { 0 };
+    v8u16 sum, vec0, vec1;
+
+    val0 = LW(src_top);
+    val1 = LW(src_left);
+    INSERT_W2_SB(val0, val1, src);
+    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
+    sum = (v8u16) __msa_hadd_u_w(sum, sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
+    addition = __msa_copy_u_w((v4i32) sum, 0);
+    store = (v16u8) __msa_fill_b(addition);
+    val0 = __msa_copy_u_w((v4i32) store, 0);
+    SW4(val0, val0, val0, val0, dst, stride)
+
+        if (0 == flag) {
+        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
+
+        vec1 += vec0;
+        vec0 += vec0;
+        vec1 += vec0;
+
+        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
+        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
+        val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
+        store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
+        val0 = __msa_copy_u_w((v4i32) store, 0);
+        SW(val0, tmp_dst);
+
+        val0 = src_left[1];
+        val1 = src_left[2];
+        val2 = src_left[3];
+
+        addition *= 3;
+
+        ADD2(val0, addition, val1, addition, val0, val1);
+        val2 += addition;
+
+        val0 += 2;
+        val1 += 2;
+        val2 += 2;
+        val0 >>= 2;
+        val1 >>= 2;
+        val2 >>= 2;
+
+        tmp_dst[stride * 1] = val0;
+        tmp_dst[stride * 2] = val1;
+        tmp_dst[stride * 3] = val2;
+    }
+}
+
+static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t stride,
+                                       int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t row, col, val;
+    uint32_t addition = 0;
+    uint64_t val0, val1;
+    v16u8 src = { 0 };
+    v16u8 store;
+    v8u16 sum, vec0, vec1;
+    v16i8 zero = { 0 };
+
+    val0 = LD(src_top);
+    val1 = LD(src_left);
+    INSERT_D2_UB(val0, val1, src);
+    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
+    sum = (v8u16) __msa_hadd_u_w(sum, sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
+    addition = __msa_copy_u_w((v4i32) sum, 0);
+    store = (v16u8) __msa_fill_b(addition);
+    val0 = __msa_copy_u_d((v2i64) store, 0);
+
+    for (row = 8; row--;) {
+        SD(val0, dst);
+        dst += stride;
+    }
+
+    if (0 == flag) {
+        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
+
+        vec1 += vec0;
+        vec0 += vec0;
+        vec1 += vec0;
+        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
+        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
+        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
+        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
+        val0 = __msa_copy_u_d((v2i64) store, 0);
+        SD(val0, tmp_dst);
+
+        val0 = LD(src_left);
+        src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
+        vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
+        vec0 = (v8u16) __msa_fill_h(addition);
+        vec0 *= 3;
+        vec1 += vec0;
+        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
+
+        for (col = 1; col < 8; col++) {
+            tmp_dst[stride * col] = vec1[col];
+        }
+    }
+}
+
+static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
+                                         const uint8_t *src_left,
+                                         uint8_t *dst, int32_t stride,
+                                         int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t row, col, val;
+    uint32_t addition = 0;
+    v16u8 src_above1, store, src_left1;
+    v8u16 sum, sum_above, sum_left;
+    v8u16 vec0, vec1, vec2;
+    v16i8 zero = { 0 };
+
+    src_above1 = LD_UB(src_top);
+    src_left1 = LD_UB(src_left);
+
+    HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
+    sum = sum_above + sum_left;
+    sum = (v8u16) __msa_hadd_u_w(sum, sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
+    addition = __msa_copy_u_w((v4i32) sum, 0);
+    store = (v16u8) __msa_fill_b(addition);
+
+    for (row = 16; row--;) {
+        ST_UB(store, dst);
+        dst += stride;
+    }
+
+    if (0 == flag) {
+        vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
+        ILVRL_B2_UH(zero, src_above1, vec1, vec2);
+        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
+        vec0 += vec0;
+        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
+        SRARI_H2_UH(vec1, vec2, 2);
+        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
+        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
+        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
+        ST_UB(store, tmp_dst);
+
+        ILVRL_B2_UH(zero, src_left1, vec1, vec2);
+        vec0 = (v8u16) __msa_fill_h(addition);
+        vec0 *= 3;
+        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
+        SRARI_H2_UH(vec1, vec2, 2);
+        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
+
+        for (col = 1; col < 16; col++) {
+            tmp_dst[stride * col] = store[col];
+        }
+    }
+}
+
+static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
+                                         const uint8_t *src_left,
+                                         uint8_t *dst, int32_t stride)
+{
+    uint32_t row;
+    v16u8 src_above1, src_above2, store, src_left1, src_left2;
+    v8u16 sum_above1, sum_above2;
+    v8u16 sum_left1, sum_left2;
+    v8u16 sum, sum_above, sum_left;
+
+    LD_UB2(src_top, 16, src_above1, src_above2);
+    LD_UB2(src_left, 16, src_left1, src_left2);
+    HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
+    HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
+    sum_above = sum_above1 + sum_above2;
+    sum_left = sum_left1 + sum_left2;
+    sum = sum_above + sum_left;
+    sum = (v8u16) __msa_hadd_u_w(sum, sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
+    store = (v16u8) __msa_splati_b((v16i8) sum, 0);
+
+    for (row = 16; row--;) {
+        ST_UB2(store, store, dst, 16);
+        dst += stride;
+        ST_UB2(store, store, dst, 16);
+        dst += stride;
+    }
+}
+
+static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride)
+{
+    uint32_t src0, src1;
+    v16i8 src_vec0, src_vec1;
+    v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
+    v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+    v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
+    v16i8 zero = { 0 };
+
+    src0 = LW(src_top);
+    src1 = LW(src_left);
+
+    mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);
+
+    src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
+    src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);
+
+    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
+    SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
+
+    tmp0 = __msa_fill_h(src_top[4]);
+    tmp1 = __msa_fill_h(src_left[4]);
+
+    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
+         res0, res1, res2, res3);
+
+    res0 += mul_val1 * tmp0;
+    res1 += mul_val1 * tmp0;
+    res2 += mul_val1 * tmp0;
+    res3 += mul_val1 * tmp0;
+
+    res0 += 3 * src_vec0_r;
+    res1 += 2 * src_vec0_r;
+    res2 += src_vec0_r;
+    res0 += tmp1;
+    res1 += 2 * tmp1;
+    res2 += 3 * tmp1;
+    res3 += 4 * tmp1;
+
+    PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
+    SRARI_H2_SH(res0, res1, 3);
+    src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
+    ST4x4_UB(src_vec0, src_vec0, 0, 1, 2, 3, dst, stride);
+}
+
+static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride)
+{
+    uint64_t src0, src1;
+    v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
+    v8i16 src_vec0_r, src_vec1_r;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v8i16 tmp0, tmp1, tmp2;
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16i8 zero = { 0 };
+
+    src0 = LD(src_top);
+    src1 = LD(src_left);
+
+    src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
+    src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);
+
+    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
+    SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
+    SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);
+
+    tmp0 = __msa_fill_h(src_top[8]);
+    tmp1 = __msa_fill_h(src_left[8]);
+
+    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
+         res0, res1, res2, res3);
+    MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
+         res4, res5, res6, res7);
+
+    tmp2 = mul_val1 * tmp0;
+    res0 += tmp2;
+    res1 += tmp2;
+    res2 += tmp2;
+    res3 += tmp2;
+    res4 += tmp2;
+    res5 += tmp2;
+    res6 += tmp2;
+    res7 += tmp2;
+
+    res0 += 7 * src_vec0_r;
+    res1 += 6 * src_vec0_r;
+    res2 += 5 * src_vec0_r;
+    res3 += 4 * src_vec0_r;
+    res4 += 3 * src_vec0_r;
+    res5 += 2 * src_vec0_r;
+    res6 += src_vec0_r;
+
+    res0 += tmp1;
+    res1 += 2 * tmp1;
+    res2 += 3 * tmp1;
+    res3 += 4 * tmp1;
+    res4 += 5 * tmp1;
+    res5 += 6 * tmp1;
+    res6 += 7 * tmp1;
+    res7 += 8 * tmp1;
+
+    SRARI_H4_SH(res0, res1, res2, res3, 4);
+    SRARI_H4_SH(res4, res5, res6, res7, 4);
+    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
+                src_vec0, src_vec1, src_vec2, src_vec3);
+
+    ST8x8_UB(src_vec0, src_vec1, src_vec2, src_vec3, dst, stride);
+}
+
+static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t stride)
+{
+    v16u8 src0, src1;
+    v8i16 src0_r, src1_r, src0_l, src1_l;
+    v8i16 vec0, vec1;
+    v8i16 res0, res1, tmp0, tmp1;
+    v8i16 mul_val2, mul_val3;
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };
+
+    src0 = LD_UB(src_top);
+    src1 = LD_UB(src_left);
+
+    UNPCK_UB_SH(src0, src0_r, src0_l);
+    UNPCK_UB_SH(src1, src1_r, src1_l);
+
+    mul_val2 = mul_val0 - 8;
+    mul_val3 = mul_val1 + 8;
+
+    tmp0 = __msa_fill_h(src_top[16]);
+    tmp1 = __msa_fill_h(src_left[16]);
+
+    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 15, 1, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 13, 3, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 11, 5, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 9, 7, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 7, 9, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 5, 11, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 3, 13, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 1, 15, 5);
+    ST_SH2(res0, res1, dst, stride);
+}
+
+static void process_intra_upper_16x16_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride,
+                                          uint8_t offset)
+{
+    v16i8 src0, src1;
+    v8i16 src0_r, src1_r, src0_l, src1_l;
+    v8i16 vec0, vec1, res0, res1;
+    v8i16 tmp0, tmp1;
+    v8i16 mul_val2, mul_val3;
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
+
+    tmp0 = __msa_fill_h(src_top[32 - offset]);
+    tmp1 = __msa_fill_h(src_left[32]);
+
+    src0 = LD_SB(src_top);
+    src1 = LD_SB(src_left);
+
+    UNPCK_UB_SH(src0, src0_r, src0_l);
+    UNPCK_UB_SH(src1, src1_r, src1_l);
+
+    mul_val1 += offset;
+    mul_val0 -= offset;
+    mul_val2 = mul_val0 - 8;
+    mul_val3 = mul_val1 + 8;
+
+    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 31, 1, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 29, 3, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 27, 5, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 25, 7, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 23, 9, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 21, 11, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 19, 13, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 17, 15, 6);
+    ST_SH2(res0, res1, dst, stride);
+}
+
+static void process_intra_lower_16x16_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride,
+                                          uint8_t offset)
+{
+    v16i8 src0, src1;
+    v8i16 src0_r, src1_r, src0_l, src1_l;
+    v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
+    v8i16 mul_val2, mul_val3;
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
+
+    tmp0 = __msa_fill_h(src_top[32 - offset]);
+    tmp1 = __msa_fill_h(src_left[16]);
+
+    src0 = LD_SB(src_top);
+    src1 = LD_SB(src_left);
+
+    UNPCK_UB_SH(src0, src0_r, src0_l);
+    UNPCK_UB_SH(src1, src1_r, src1_l);
+
+    mul_val1 += offset;
+    mul_val0 -= offset;
+    mul_val2 = mul_val0 - 8;
+    mul_val3 = mul_val1 + 8;
+
+    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 15, 17, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 13, 19, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 11, 21, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 9, 23, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 7, 25, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 5, 27, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 3, 29, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 1, 31, 6);
+    ST_SH2(res0, res1, dst, stride);
+}
+
+static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t stride)
+{
+    process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
+    process_intra_upper_16x16_msa((src_top + 16), src_left,
+                                  (dst + 16), stride, 16);
+    dst += (16 * stride);
+    src_left += 16;
+
+    process_intra_lower_16x16_msa(src_top, src_left, dst, stride, 0);
+    process_intra_lower_16x16_msa((src_top + 16), src_left,
+                                  (dst + 16), stride, 16);
+}
+
+static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
+                                                     const uint8_t *src_left,
+                                                     uint8_t *dst,
+                                                     int32_t stride,
+                                                     int32_t mode)
+{
+    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 4;
+    const uint8_t *ref;
+    int32_t last;
+    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t angle, angle_loop;
+    int32_t inv_angle_val, offset;
+    uint64_t tmp0;
+    v16i8 top0, top1, top2, top3;
+    v16i8 dst_val0;
+    v16i8 zero = { 0 };
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+
+    angle = intra_pred_angle_up[mode - 18];
+    inv_angle_val = inv_angle[mode - 18];
+    last = (angle) >> 3;
+    angle_loop = angle;
+
+    ref = src_top - 1;
+    if (angle < 0 && last < -1) {
+        inv_angle_val = inv_angle[mode - 18];
+
+        tmp0 = LD(ref);
+        SD(tmp0, ref_tmp);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
+            ref_tmp[h_cnt] = src_left[offset];
+        }
+
+        ref = ref_tmp;
+    }
+
+    idx0 = angle_loop >> 5;
+    fact_val0 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx1 = angle_loop >> 5;
+    fact_val1 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx2 = angle_loop >> 5;
+    fact_val2 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx3 = angle_loop >> 5;
+    fact_val3 = angle_loop & 31;
+
+    top0 = LD_SB(ref + idx0 + 1);
+    top1 = LD_SB(ref + idx1 + 1);
+    top2 = LD_SB(ref + idx2 + 1);
+    top3 = LD_SB(ref + idx3 + 1);
+
+    fact0 = __msa_fill_h(fact_val0);
+    fact1 = __msa_fill_h(32 - fact_val0);
+
+    fact2 = __msa_fill_h(fact_val1);
+    fact3 = __msa_fill_h(32 - fact_val1);
+
+    fact4 = __msa_fill_h(fact_val2);
+    fact5 = __msa_fill_h(32 - fact_val2);
+
+    fact6 = __msa_fill_h(fact_val3);
+    fact7 = __msa_fill_h(32 - fact_val3);
+
+    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
+    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
+    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
+               diff0, diff2, diff4, diff6);
+    SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
+    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
+    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
+    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
+
+    diff1 += diff0 * fact1;
+    diff3 += diff2 * fact3;
+
+    SRARI_H2_SH(diff1, diff3, 5);
+    dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
+    ST4x4_UB(dst_val0, dst_val0, 0, 1, 2, 3, dst, stride);
+}
+
+static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
+                                                     const uint8_t *src_left,
+                                                     uint8_t *dst,
+                                                     int32_t stride,
+                                                     int32_t mode)
+{
+    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 8;
+    const uint8_t *ref;
+    const uint8_t *src_left_tmp = src_left - 1;
+    int32_t last, offset;
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t angle, angle_loop;
+    int32_t inv_angle_val, inv_angle_val_loop;
+    int32_t tmp0, tmp1, tmp2;
+    v16i8 top0, top1, top2, top3;
+    v16u8 dst_val0, dst_val1;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+
+    angle = intra_pred_angle_up[mode - 18];
+    inv_angle_val = inv_angle[mode - 18];
+    last = (angle) >> 2;
+    angle_loop = angle;
+
+    ref = src_top - 1;
+    if (last < -1) {
+        inv_angle_val_loop = inv_angle_val * last;
+
+        tmp0 = LW(ref);
+        tmp1 = LW(ref + 4);
+        tmp2 = LW(ref + 8);
+        SW(tmp0, ref_tmp);
+        SW(tmp1, ref_tmp + 4);
+        SW(tmp2, ref_tmp + 8);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (inv_angle_val_loop + 128) >> 8;
+            ref_tmp[h_cnt] = src_left_tmp[offset];
+            inv_angle_val_loop += inv_angle_val;
+        }
+        ref = ref_tmp;
+    }
+
+    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
+        idx0 = (angle_loop) >> 5;
+        fact_val0 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx1 = (angle_loop) >> 5;
+        fact_val1 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx2 = (angle_loop) >> 5;
+        fact_val2 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx3 = (angle_loop) >> 5;
+        fact_val3 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        top0 = LD_SB(ref + idx0 + 1);
+        top1 = LD_SB(ref + idx1 + 1);
+        top2 = LD_SB(ref + idx2 + 1);
+        top3 = LD_SB(ref + idx3 + 1);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        fact4 = __msa_fill_h(fact_val2);
+        fact5 = __msa_fill_h(32 - fact_val2);
+        fact6 = __msa_fill_h(fact_val3);
+        fact7 = __msa_fill_h(32 - fact_val3);
+
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+
+        SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
+        SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
+        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
+             diff1, diff3, diff5, diff7);
+
+        diff1 += diff0 * fact1;
+        diff3 += diff2 * fact3;
+        diff5 += diff4 * fact5;
+        diff7 += diff6 * fact7;
+
+        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
+        PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
+        ST8x4_UB(dst_val0, dst_val1, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
+                                                      const uint8_t *src_left,
+                                                      uint8_t *dst,
+                                                      int32_t stride,
+                                                      int32_t mode)
+{
+    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t tmp0;
+    int32_t angle, angle_loop, offset;
+    int32_t inv_angle_val, inv_angle_val_loop;
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 16;
+    const uint8_t *ref;
+    const uint8_t *src_left_tmp = src_left - 1;
+    int32_t last;
+    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
+    v16i8 dst0, dst1, dst2, dst3;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
+
+    angle = intra_pred_angle_up[mode - 18];
+    inv_angle_val = inv_angle[mode - 18];
+    last = angle >> 1;
+    angle_loop = angle;
+
+    ref = src_top - 1;
+    if (last < -1) {
+        inv_angle_val_loop = inv_angle_val * last;
+
+        top0 = LD_UB(ref);
+        tmp0 = LW(ref + 16);
+        ST_UB(top0, ref_tmp);
+        SW(tmp0, ref_tmp + 16);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (inv_angle_val_loop + 128) >> 8;
+            ref_tmp[h_cnt] = src_left_tmp[offset];
+            inv_angle_val_loop += inv_angle_val;
+        }
+        ref = ref_tmp;
+    }
+
+    for (v_cnt = 4; v_cnt--;) {
+        idx0 = (angle_loop) >> 5;
+        fact_val0 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx1 = (angle_loop) >> 5;
+        fact_val1 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx2 = (angle_loop) >> 5;
+        fact_val2 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx3 = (angle_loop) >> 5;
+        fact_val3 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        LD_UB2(ref + idx0 + 1, 16, top0, top1);
+        LD_UB2(ref + idx1 + 1, 16, top2, top3);
+        LD_UB2(ref + idx2 + 1, 16, top4, top5);
+        LD_UB2(ref + idx3 + 1, 16, top6, top7);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        fact4 = __msa_fill_h(fact_val2);
+        fact5 = __msa_fill_h(32 - fact_val2);
+        fact6 = __msa_fill_h(fact_val3);
+        fact7 = __msa_fill_h(32 - fact_val3);
+
+        SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
+        SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        UNPCK_UB_SH(top4, diff8, diff9);
+        UNPCK_UB_SH(top5, diff10, diff11);
+        UNPCK_UB_SH(top6, diff12, diff13);
+        UNPCK_UB_SH(top7, diff14, diff15);
+
+        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
+             diff2, diff3, diff6, diff7);
+        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
+             diff10, diff11, diff14, diff15);
+
+        diff2 += diff0 * fact1;
+        diff3 += diff1 * fact1;
+        diff6 += diff4 * fact3;
+        diff7 += diff5 * fact3;
+        diff10 += diff8 * fact5;
+        diff11 += diff9 * fact5;
+        diff14 += diff12 * fact7;
+        diff15 += diff13 * fact7;
+
+        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
+        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
+        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
+                    dst0, dst1, dst2, dst3);
+        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
+                                                      const uint8_t *src_left,
+                                                      uint8_t *dst,
+                                                      int32_t stride,
+                                                      int32_t mode)
+{
+    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp;
+    const uint8_t *ref;
+    const uint8_t *src_left_tmp = src_left - 1;
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t tmp0, tmp1, tmp2, tmp3;
+    int32_t angle, angle_loop;
+    int32_t inv_angle_val, inv_angle_val_loop;
+    int32_t last, offset;
+    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
+    v16i8 dst0, dst1, dst2, dst3;
+    v8i16 fact0, fact1, fact2, fact3;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
+
+    ref_tmp = ref_array + 32;
+
+    angle = intra_pred_angle_up[mode - 18];
+    inv_angle_val = inv_angle[mode - 18];
+    last = angle;
+    angle_loop = angle;
+
+    ref = src_top - 1;
+    if (last < -1) {
+        inv_angle_val_loop = inv_angle_val * last;
+        LD_UB2(ref, 16, top0, top1);
+        tmp0 = ref[32];
+        tmp1 = ref[33];
+        tmp2 = ref[34];
+        tmp3 = ref[35];
+
+        ST_UB2(top0, top1, ref_tmp, 16);
+        ref_tmp[32] = tmp0;
+        ref_tmp[33] = tmp1;
+        ref_tmp[34] = tmp2;
+        ref_tmp[35] = tmp3;
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (inv_angle_val_loop + 128) >> 8;
+            ref_tmp[h_cnt] = src_left_tmp[offset];
+            inv_angle_val_loop += inv_angle_val;
+        }
+
+        ref = ref_tmp;
+    }
+
+    for (v_cnt = 16; v_cnt--;) {
+        idx0 = (angle_loop) >> 5;
+        fact_val0 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx1 = (angle_loop) >> 5;
+        fact_val1 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        top0 = LD_UB(ref + idx0 + 1);
+        top4 = LD_UB(ref + idx1 + 1);
+        top1 = LD_UB(ref + idx0 + 17);
+        top5 = LD_UB(ref + idx1 + 17);
+        top3 = LD_UB(ref + idx0 + 33);
+        top7 = LD_UB(ref + idx1 + 33);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+
+        top2 = top1;
+        top6 = top5;
+
+        SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
+        SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        UNPCK_UB_SH(top4, diff8, diff9);
+        UNPCK_UB_SH(top5, diff10, diff11);
+        UNPCK_UB_SH(top6, diff12, diff13);
+        UNPCK_UB_SH(top7, diff14, diff15);
+
+        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
+             diff2, diff3, diff6, diff7);
+        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
+             diff10, diff11, diff14, diff15);
+
+        diff2 += diff0 * fact1;
+        diff3 += diff1 * fact1;
+        diff6 += diff4 * fact1;
+        diff7 += diff5 * fact1;
+        diff10 += diff8 * fact3;
+        diff11 += diff9 * fact3;
+        diff14 += diff12 * fact3;
+        diff15 += diff13 * fact3;
+
+        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
+        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
+        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
+                    dst0, dst1, dst2, dst3);
+
+        ST_SB2(dst0, dst1, dst, 16);
+        dst += stride;
+        ST_SB2(dst2, dst3, dst, 16);
+        dst += stride;
+    }
+}
+
+static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
+                                                     const uint8_t *src_left,
+                                                     uint8_t *dst,
+                                                     int32_t stride,
+                                                     int32_t mode)
+{
+    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 4;
+    const uint8_t *ref;
+    int32_t last, offset;
+    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t angle, angle_loop, inv_angle_val;
+    uint64_t tmp0;
+    v16i8 dst_val0, dst_val1;
+    v16u8 top0, top1, top2, top3;
+    v16u8 zero = { 0 };
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+
+    angle = intra_pred_angle_low[mode - 2];
+    last = angle >> 3;
+    angle_loop = angle;
+
+    ref = src_left - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 11];
+
+        tmp0 = LD(ref);
+        SD(tmp0, ref_tmp);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
+            ref_tmp[h_cnt] = src_top[offset];
+        }
+
+        ref = ref_tmp;
+    }
+
+    idx0 = angle_loop >> 5;
+    fact_val0 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx1 = angle_loop >> 5;
+    fact_val1 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx2 = angle_loop >> 5;
+    fact_val2 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx3 = angle_loop >> 5;
+    fact_val3 = angle_loop & 31;
+
+    top0 = LD_UB(ref + idx0 + 1);
+    top1 = LD_UB(ref + idx1 + 1);
+    top2 = LD_UB(ref + idx2 + 1);
+    top3 = LD_UB(ref + idx3 + 1);
+
+    fact0 = __msa_fill_h(fact_val0);
+    fact1 = __msa_fill_h(32 - fact_val0);
+    fact2 = __msa_fill_h(fact_val1);
+    fact3 = __msa_fill_h(32 - fact_val1);
+    fact4 = __msa_fill_h(fact_val2);
+    fact5 = __msa_fill_h(32 - fact_val2);
+    fact6 = __msa_fill_h(fact_val3);
+    fact7 = __msa_fill_h(32 - fact_val3);
+
+    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
+    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
+    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
+               diff0, diff2, diff4, diff6);
+    SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
+    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
+    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
+    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
+
+    diff1 += diff0 * fact1;
+    diff3 += diff2 * fact3;
+
+    SRARI_H2_SH(diff1, diff3, 5);
+    PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);
+
+    diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
+    diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);
+
+    diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);
+
+    dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
+    dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
+
+    ST4x2_UB(dst_val0, dst, stride);
+    dst += (2 * stride);
+    ST4x2_UB(dst_val1, dst, stride);
+}
+
+static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
+                                                     const uint8_t *src_left,
+                                                     uint8_t *dst,
+                                                     int32_t stride,
+                                                     int32_t mode)
+{
+    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 8;
+    const uint8_t *ref;
+    const uint8_t *src_top_tmp = src_top - 1;
+    uint8_t *dst_org;
+    int32_t last, offset, tmp0, tmp1, tmp2;
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t angle, angle_loop, inv_angle_val;
+    v16i8 top0, top1, top2, top3;
+    v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+
+    angle = intra_pred_angle_low[mode - 2];
+    last = (angle) >> 2;
+    angle_loop = angle;
+
+    ref = src_left - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 11];
+
+        tmp0 = LW(ref);
+        tmp1 = LW(ref + 4);
+        tmp2 = LW(ref + 8);
+        SW(tmp0, ref_tmp);
+        SW(tmp1, ref_tmp + 4);
+        SW(tmp2, ref_tmp + 8);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (h_cnt * inv_angle_val + 128) >> 8;
+            ref_tmp[h_cnt] = src_top_tmp[offset];
+        }
+
+        ref = ref_tmp;
+    }
+
+    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
+        dst_org = dst;
+
+        idx0 = angle_loop >> 5;
+        fact_val0 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx1 = angle_loop >> 5;
+        fact_val1 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx2 = angle_loop >> 5;
+        fact_val2 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx3 = angle_loop >> 5;
+        fact_val3 = angle_loop & 31;
+        angle_loop += angle;
+
+        top0 = LD_SB(ref + idx0 + 1);
+        top1 = LD_SB(ref + idx1 + 1);
+        top2 = LD_SB(ref + idx2 + 1);
+        top3 = LD_SB(ref + idx3 + 1);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        fact4 = __msa_fill_h(fact_val2);
+        fact5 = __msa_fill_h(32 - fact_val2);
+        fact6 = __msa_fill_h(fact_val3);
+        fact7 = __msa_fill_h(32 - fact_val3);
+
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
+        SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
+        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
+             diff1, diff3, diff5, diff7);
+
+        diff1 += diff0 * fact1;
+        diff3 += diff2 * fact3;
+        diff5 += diff4 * fact5;
+        diff7 += diff6 * fact7;
+
+        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
+        PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
+                    dst_val0, dst_val1, dst_val2, dst_val3);
+        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
+        ILVRL_H2_SH(diff1, diff0, diff3, diff4);
+        ST4x8_UB(diff3, diff4, dst_org, stride);
+        dst += 4;
+    }
+}
+
+static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
+                                                      const uint8_t *src_left,
+                                                      uint8_t *dst,
+                                                      int32_t stride,
+                                                      int32_t mode)
+{
+    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
+    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
+    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
+    int32_t angle, angle_loop, inv_angle_val, offset;
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 16;
+    const uint8_t *ref, *src_top_tmp = src_top - 1;
+    uint8_t *dst_org;
+    int32_t last;
+
+    angle = intra_pred_angle_low[mode - 2];
+    last = (angle) >> 1;
+    angle_loop = angle;
+
+    ref = src_left - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 11];
+
+        top0 = LD_SB(ref);
+        tmp0 = LW(ref + 16);
+        ST_SB(top0, ref_tmp);
+        SW(tmp0, ref_tmp + 16);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (h_cnt * inv_angle_val + 128) >> 8;
+            ref_tmp[h_cnt] = src_top_tmp[offset];
+        }
+
+        ref = ref_tmp;
+    }
+
+    for (v_cnt = 0; v_cnt < 4; v_cnt++) {
+        dst_org = dst;
+
+        idx0 = angle_loop >> 5;
+        fact_val0 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx1 = angle_loop >> 5;
+        fact_val1 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx2 = angle_loop >> 5;
+        fact_val2 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx3 = angle_loop >> 5;
+        fact_val3 = angle_loop & 31;
+        angle_loop += angle;
+
+        LD_SB2(ref + idx0 + 1, 16, top0, top1);
+        LD_SB2(ref + idx1 + 1, 16, top2, top3);
+        LD_SB2(ref + idx2 + 1, 16, top4, top5);
+        LD_SB2(ref + idx3 + 1, 16, top6, top7);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        fact4 = __msa_fill_h(fact_val2);
+        fact5 = __msa_fill_h(32 - fact_val2);
+        fact6 = __msa_fill_h(fact_val3);
+        fact7 = __msa_fill_h(32 - fact_val3);
+
+        SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
+        SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);
+
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        UNPCK_UB_SH(top4, diff8, diff9);
+        UNPCK_UB_SH(top5, diff10, diff11);
+        UNPCK_UB_SH(top6, diff12, diff13);
+        UNPCK_UB_SH(top7, diff14, diff15);
+
+        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
+             diff2, diff3, diff6, diff7);
+        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
+             diff10, diff11, diff14, diff15);
+
+        diff2 += diff0 * fact1;
+        diff3 += diff1 * fact1;
+        diff6 += diff4 * fact3;
+        diff7 += diff5 * fact3;
+        diff10 += diff8 * fact5;
+        diff11 += diff9 * fact5;
+        diff14 += diff12 * fact7;
+        diff15 += diff13 * fact7;
+
+        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
+        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
+        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
+                    dst_val0, dst_val1, dst_val2, dst_val3);
+        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
+        ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
+        ILVRL_H2_SH(diff1, diff0, diff4, diff5);
+        ILVRL_H2_SH(diff3, diff2, diff6, diff7);
+        ST4x8_UB(diff4, diff5, dst_org, stride);
+        dst_org += (8 * stride);
+        ST4x8_UB(diff6, diff7, dst_org, stride);
+        dst += 4;
+    }
+}
+
+static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
+                                                      const uint8_t *src_left,
+                                                      uint8_t *dst,
+                                                      int32_t stride,
+                                                      int32_t mode)
+{
+    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
+    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
+    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
+    v8i16 fact0, fact1, fact2, fact3;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
+    int32_t angle, angle_loop, inv_angle_val, offset;
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 32;
+    const uint8_t *ref, *src_top_tmp = src_top - 1;
+    uint8_t *dst_org;
+    int32_t last;
+
+    angle = intra_pred_angle_low[mode - 2];
+    last = angle;
+    angle_loop = angle;
+
+    ref = src_left - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 11];
+
+        LD_SB2(ref, 16, top0, top1);
+        tmp0 = LW(ref + 32);
+        ST_SB2(top0, top1, ref_tmp, 16);
+        SW(tmp0, ref_tmp + 32);
+
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (h_cnt * inv_angle_val + 128) >> 8;
+            ref_tmp[h_cnt] = src_top_tmp[offset];
+        }
+
+        ref = ref_tmp;
+    }
+
+    for (v_cnt = 0; v_cnt < 16; v_cnt++) {
+        dst_org = dst;
+        idx0 = angle_loop >> 5;
+        fact_val0 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx1 = angle_loop >> 5;
+        fact_val1 = angle_loop & 31;
+        angle_loop += angle;
+
+        top0 = LD_SB(ref + idx0 + 1);
+        top4 = LD_SB(ref + idx1 + 1);
+        top1 = LD_SB(ref + idx0 + 17);
+        top5 = LD_SB(ref + idx1 + 17);
+        top3 = LD_SB(ref + idx0 + 33);
+        top7 = LD_SB(ref + idx1 + 33);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+
+        top2 = top1;
+        top6 = top5;
+
+        SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
+        SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);
+
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        UNPCK_UB_SH(top4, diff8, diff9);
+        UNPCK_UB_SH(top5, diff10, diff11);
+        UNPCK_UB_SH(top6, diff12, diff13);
+        UNPCK_UB_SH(top7, diff14, diff15);
+
+        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
+             diff2, diff3, diff6, diff7);
+        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
+             diff10, diff11, diff14, diff15);
+
+        diff2 += diff0 * fact1;
+        diff3 += diff1 * fact1;
+        diff6 += diff4 * fact1;
+        diff7 += diff5 * fact1;
+        diff10 += diff8 * fact3;
+        diff11 += diff9 * fact3;
+        diff14 += diff12 * fact3;
+        diff15 += diff13 * fact3;
+
+        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
+        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
+        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
+                    dst_val0, dst_val1, dst_val2, dst_val3);
+        ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
+        ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);
+
+        ST2x4_UB(diff0, 0, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff0, 4, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff1, 0, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff1, 4, dst_org, stride);
+        dst_org += (4 * stride);
+
+        ST2x4_UB(diff2, 0, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff2, 4, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff3, 0, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff3, 4, dst_org, stride);
+        dst_org += (4 * stride);
+
+        dst += 2;
+    }
+}
+
+static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride)
+{
+    uint32_t row;
+    v16u8 src1, src2;
+
+    src1 = LD_UB(src);
+    src2 = LD_UB(src + 16);
+
+    for (row = 32; row--;) {
+        ST_UB2(src1, src2, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride)
+{
+    hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
+}
+
+void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride)
+{
+    hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
+}
+
+void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride)
+{
+    hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
+}
+
+void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride)
+{
+    hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
+}
+
+void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
+                               const uint8_t *src_left,
+                               ptrdiff_t stride, int log2, int c_idx)
+{
+    switch (log2) {
+    case 2:
+        hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
+        break;
+
+    case 3:
+        hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
+        break;
+
+    case 4:
+        hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
+        break;
+
+    case 5:
+        hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
+        break;
+    }
+}
+
+void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode)
+{
+    if (mode == 10) {
+        hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (mode == 26) {
+        hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (mode >= 18) {
+        hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
+                                                 dst, stride, mode);
+    } else {
+        hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
+                                                 dst, stride, mode);
+    }
+}
+
+void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode)
+{
+    if (mode == 10) {
+        hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (mode == 26) {
+        hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (mode >= 18) {
+        hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
+                                                 dst, stride, mode);
+    } else {
+        hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
+                                                 dst, stride, mode);
+    }
+}
+
+void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode)
+{
+    if (mode == 10) {
+        hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (mode == 26) {
+        hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (mode >= 18) {
+        hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
+                                                  dst, stride, mode);
+    } else {
+        hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
+                                                  dst, stride, mode);
+    }
+}
+
+void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode)
+{
+    if (mode == 10) {
+        hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
+    } else if (mode == 26) {
+        intra_predict_vert_32x32_msa(src_top, dst, stride);
+    } else if (mode >= 18) {
+        hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
+                                                  dst, stride, mode);
+    } else {
+        hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
+                                                  dst, stride, mode);
+    }
+}
+
+void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
+{
+    v16u8 vec0;
+    HEVCLocalContext *lc = s->HEVClc;
+    int i;
+    int hshift = s->sps->hshift[c_idx];
+    int vshift = s->sps->vshift[c_idx];
+    int size_in_luma_h = 16 << hshift;
+    int size_in_tbs_h = size_in_luma_h >> s->sps->log2_min_tb_size;
+    int size_in_luma_v = 16 << vshift;
+    int size_in_tbs_v = size_in_luma_v >> s->sps->log2_min_tb_size;
+    int x = x0 >> hshift;
+    int y = y0 >> vshift;
+    int x_tb = (x0 >> s->sps->log2_min_tb_size) & s->sps->tb_mask;
+    int y_tb = (y0 >> s->sps->log2_min_tb_size) & s->sps->tb_mask;
+
+    int cur_tb_addr =
+        s->pps->min_tb_addr_zs[(y_tb) * (s->sps->tb_mask + 2) + (x_tb)];
+
+    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
+    uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
+
+    int min_pu_width = s->sps->min_pu_width;
+
+    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
+        lc->tu.intra_pred_mode;
+    uint32_t a;
+    uint8_t left_array[2 * 32 + 1];
+    uint8_t filtered_left_array[2 * 32 + 1];
+    uint8_t top_array[2 * 32 + 1];
+    uint8_t filtered_top_array[2 * 32 + 1];
+
+    uint8_t *left = left_array + 1;
+    uint8_t *top = top_array + 1;
+    uint8_t *filtered_left = filtered_left_array + 1;
+    uint8_t *filtered_top = filtered_top_array + 1;
+    int cand_bottom_left = lc->na.cand_bottom_left
+        && cur_tb_addr >
+        s->pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->sps->tb_mask) *
+                               (s->sps->tb_mask + 2) + (x_tb - 1)];
+    int cand_left = lc->na.cand_left;
+    int cand_up_left = lc->na.cand_up_left;
+    int cand_up = lc->na.cand_up;
+    int cand_up_right = lc->na.cand_up_right
+        && cur_tb_addr >
+        s->pps->min_tb_addr_zs[(y_tb - 1) * (s->sps->tb_mask + 2) +
+                               ((x_tb + size_in_tbs_h) & s->sps->tb_mask)];
+
+    int bottom_left_size =
+        (((y0 + 2 * size_in_luma_v) >
+          (s->sps->height) ? (s->sps->height) : (y0 +
+                                                 2 * size_in_luma_v)) -
+         (y0 + size_in_luma_v)) >> vshift;
+    int top_right_size =
+        (((x0 + 2 * size_in_luma_h) >
+          (s->sps->width) ? (s->sps->width) : (x0 + 2 * size_in_luma_h)) -
+         (x0 + size_in_luma_h)) >> hshift;
+
+    if (s->pps->constrained_intra_pred_flag == 1) {
+        int size_in_luma_pu_v = ((size_in_luma_v) >> s->sps->log2_min_pu_size);
+        int size_in_luma_pu_h = ((size_in_luma_h) >> s->sps->log2_min_pu_size);
+        int on_pu_edge_x = !(x0 & ((1 << s->sps->log2_min_pu_size) - 1));
+        int on_pu_edge_y = !(y0 & ((1 << s->sps->log2_min_pu_size) - 1));
+        if (!size_in_luma_pu_h)
+            size_in_luma_pu_h++;
+        if (cand_bottom_left == 1 && on_pu_edge_x) {
+            int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
+            int y_bottom_pu =
+                ((y0 + size_in_luma_v) >> s->sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_v) >
+                 (s->sps->min_pu_height -
+                  y_bottom_pu) ? (s->sps->min_pu_height -
+                                  y_bottom_pu) : (size_in_luma_pu_v));
+            cand_bottom_left = 0;
+            for (i = 0; i < max; i += 2)
+                cand_bottom_left |=
+                    ((s->ref->tab_mvf[(x_left_pu) +
+                                      (y_bottom_pu +
+                                       i) * min_pu_width]).pred_flag ==
+                     PF_INTRA);
+        }
+        if (cand_left == 1 && on_pu_edge_x) {
+            int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
+            int y_left_pu = ((y0) >> s->sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_v) >
+                 (s->sps->min_pu_height -
+                  y_left_pu) ? (s->sps->min_pu_height -
+                                y_left_pu) : (size_in_luma_pu_v));
+            cand_left = 0;
+            for (i = 0; i < max; i += 2)
+                cand_left |=
+                    ((s->ref->tab_mvf[(x_left_pu) +
+                                      (y_left_pu +
+                                       i) * min_pu_width]).pred_flag ==
+                     PF_INTRA);
+        }
+        if (cand_up_left == 1) {
+            int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
+            int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
+            cand_up_left =
+                (s->ref->tab_mvf[(x_left_pu) +
+                                 (y_top_pu) * min_pu_width]).pred_flag ==
+                PF_INTRA;
+        }
+        if (cand_up == 1 && on_pu_edge_y) {
+            int x_top_pu = ((x0) >> s->sps->log2_min_pu_size);
+            int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_h) >
+                 (s->sps->min_pu_width -
+                  x_top_pu) ? (s->sps->min_pu_width -
+                               x_top_pu) : (size_in_luma_pu_h));
+            cand_up = 0;
+            for (i = 0; i < max; i += 2)
+                cand_up |=
+                    ((s->ref->tab_mvf[(x_top_pu + i) +
+                                      (y_top_pu) *
+                                      min_pu_width]).pred_flag == PF_INTRA);
+        }
+        if (cand_up_right == 1 && on_pu_edge_y) {
+            int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
+            int x_right_pu =
+                ((x0 + size_in_luma_h) >> s->sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_h) >
+                 (s->sps->min_pu_width -
+                  x_right_pu) ? (s->sps->min_pu_width -
+                                 x_right_pu) : (size_in_luma_pu_h));
+            cand_up_right = 0;
+            for (i = 0; i < max; i += 2)
+                cand_up_right |=
+                    ((s->ref->tab_mvf[(x_right_pu + i) +
+                                      (y_top_pu) *
+                                      min_pu_width]).pred_flag == PF_INTRA);
+        }
+
+        vec0 = (v16u8) __msa_ldi_b(128);
+
+        ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+
+        ST_UB4(vec0, vec0, vec0, vec0, top, 16);
+
+        top[-1] = 128;
+    }
+    if (cand_up_left) {
+        left[-1] = src[(-1) + stride * (-1)];
+        top[-1] = left[-1];
+    }
+    if (cand_up) {
+        vec0 = LD_UB(src - stride);
+        ST_UB(vec0, top);
+    }
+    if (cand_up_right) {
+        vec0 = LD_UB(src - stride + 16);
+        ST_UB(vec0, (top + 16));
+
+        do {
+            uint32_t pix =
+                ((src[(16 + top_right_size - 1) + stride * (-1)]) *
+                 0x01010101U);
+            for (i = 0; i < (16 - top_right_size); i += 4)
+                ((((union unaligned_32 *) (top + 16 + top_right_size +
+                                           i))->l) = (pix));
+        } while (0);
+    }
+    if (cand_left)
+        for (i = 0; i < 16; i++)
+            left[i] = src[(-1) + stride * (i)];
+    if (cand_bottom_left) {
+        for (i = 16; i < 16 + bottom_left_size; i++)
+            left[i] = src[(-1) + stride * (i)];
+        do {
+            uint32_t pix =
+                ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
+                 0x01010101U);
+            for (i = 0; i < (16 - bottom_left_size); i += 4)
+                ((((union unaligned_32 *) (left + 16 + bottom_left_size +
+                                           i))->l) = (pix));
+        } while (0);
+    }
+
+    if (s->pps->constrained_intra_pred_flag == 1) {
+        if (cand_bottom_left || cand_left || cand_up_left || cand_up
+            || cand_up_right) {
+            int size_max_x =
+                x0 + ((2 * 16) << hshift) <
+                s->sps->width ? 2 * 16 : (s->sps->width - x0) >> hshift;
+            int size_max_y =
+                y0 + ((2 * 16) << vshift) <
+                s->sps->height ? 2 * 16 : (s->sps->height - y0) >> vshift;
+            int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
+            if (!cand_up_right) {
+                size_max_x = x0 + ((16) << hshift) < s->sps->width ?
+                    16 : (s->sps->width - x0) >> hshift;
+            }
+            if (!cand_bottom_left) {
+                size_max_y = y0 + ((16) << vshift) < s->sps->height ?
+                    16 : (s->sps->height - y0) >> vshift;
+            }
+            if (cand_bottom_left || cand_left || cand_up_left) {
+                while (j > -1
+                       &&
+                       !((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((j) <<
+                                                                     vshift))
+                                                                   >> s->sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                    j--;
+                if (!
+                    ((s->ref->tab_mvf[(((x0 +
+                                         ((-1) << hshift)) >> s->sps->
+                                        log2_min_pu_size)) + (((y0 + ((j)
+                                                                      <<
+                                                                      vshift))
+                                                               >> s->sps->
+                                                               log2_min_pu_size))
+                                      * min_pu_width]).pred_flag == PF_INTRA)) {
+                    j = 0;
+                    while (j < size_max_x
+                           &&
+                           !((s->ref->tab_mvf[(((x0 +
+                                                 ((j) << hshift)) >> s->sps->
+                                                log2_min_pu_size)) + (((y0 +
+                                                                        ((-1) <<
+                                                                         vshift))
+                                                                       >> s->
+                                                                       sps->
+                                                                       log2_min_pu_size))
+                                              * min_pu_width]).pred_flag ==
+                             PF_INTRA))
+                        j++;
+                    for (i = j; i > (j) - (j + 1); i--)
+                        if (!
+                            ((s->ref->tab_mvf[(((x0 +
+                                                 ((i -
+                                                   1) << hshift)) >> s->sps->
+                                                log2_min_pu_size)) + (((y0 +
+                                                                        ((-1) <<
+                                                                         vshift))
+                                                                       >> s->
+                                                                       sps->
+                                                                       log2_min_pu_size))
+                                              * min_pu_width]).pred_flag ==
+                             PF_INTRA))
+                            top[i - 1] = top[i];
+                    left[-1] = top[-1];
+                }
+            } else {
+                j = 0;
+                while (j < size_max_x
+                       &&
+                       !((s->ref->tab_mvf[(((x0 +
+                                             ((j) << hshift)) >> s->sps->
+                                            log2_min_pu_size)) + (((y0 + ((-1)
+                                                                          <<
+                                                                          vshift))
+                                                                   >> s->sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                    j++;
+                if (j > 0)
+                    if (x0 > 0) {
+                        for (i = j; i > (j) - (j + 1); i--)
+                            if (!
+                                ((s->ref->tab_mvf[(((x0 +
+                                                     ((i -
+                                                       1) << hshift)) >>
+                                                    s->sps->log2_min_pu_size))
+                                                  + (((y0 + ((-1)
+                                                             << vshift))
+                                                      >>
+                                                      s->sps->log2_min_pu_size))
+                                                  *
+                                                  min_pu_width]).pred_flag ==
+                                 PF_INTRA))
+                                top[i - 1] = top[i];
+                    } else {
+                        for (i = j; i > (j) - (j); i--)
+                            if (!
+                                ((s->ref->tab_mvf[(((x0 +
+                                                     ((i -
+                                                       1) << hshift)) >>
+                                                    s->sps->log2_min_pu_size))
+                                                  + (((y0 + ((-1)
+                                                             << vshift))
+                                                      >>
+                                                      s->sps->log2_min_pu_size))
+                                                  *
+                                                  min_pu_width]).pred_flag ==
+                                 PF_INTRA))
+                                top[i - 1] = top[i];
+                        top[-1] = top[0];
+                    }
+                left[-1] = top[-1];
+            }
+            left[-1] = top[-1];
+            if (cand_bottom_left || cand_left) {
+                a = ((left[-1]) * 0x01010101U);
+                for (i = 0; i < (0) + (size_max_y); i += 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i) <<
+                                                                     vshift))
+                                                                   >> s->sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i]))->l) = (a));
+                    else
+                        a = ((left[i + 3]) * 0x01010101U);
+            }
+            if (!cand_left) {
+                vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+                ST_UB(vec0, left);
+            }
+            if (!cand_bottom_left) {
+
+                vec0 = (v16u8) __msa_fill_b(left[15]);
+
+                ST_UB(vec0, (left + 16));
+            }
+            if (x0 != 0 && y0 != 0) {
+                a = ((left[size_max_y - 1]) * 0x01010101U);
+                for (i = (size_max_y - 1);
+                     i > (size_max_y - 1) - (size_max_y); i -= 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i -
+                                                                      3) <<
+                                                                     vshift))
+                                                                   >> s->sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
+                    else
+                        a = ((left[i - 3]) * 0x01010101U);
+                if (!
+                    ((s->ref->tab_mvf[(((x0 +
+                                         ((-1) << hshift)) >> s->sps->
+                                        log2_min_pu_size)) + (((y0 + ((-1)
+                                                                      <<
+                                                                      vshift))
+                                                               >> s->sps->
+                                                               log2_min_pu_size))
+                                      * min_pu_width]).pred_flag == PF_INTRA))
+                    left[-1] = left[0];
+            } else if (x0 == 0) {
+                do {
+                    uint32_t pix = ((0) * 0x01010101U);
+                    for (i = 0; i < (size_max_y); i += 4)
+                        ((((union unaligned_32 *) (left + i))->l) = (pix));
+                } while (0);
+            } else {
+                a = ((left[size_max_y - 1]) * 0x01010101U);
+                for (i = (size_max_y - 1);
+                     i > (size_max_y - 1) - (size_max_y); i -= 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i -
+                                                                      3) <<
+                                                                     vshift))
+                                                                   >> s->sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
+                    else
+                        a = ((left[i - 3]) * 0x01010101U);
+            }
+            top[-1] = left[-1];
+            if (y0 != 0) {
+                a = ((left[-1]) * 0x01010101U);
+                for (i = 0; i < (0) + (size_max_x); i += 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((i) << hshift)) >> s->sps->
+                                            log2_min_pu_size)) + (((y0 + ((-1)
+                                                                          <<
+                                                                          vshift))
+                                                                   >> s->sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&top[i]))->l) = (a));
+                    else
+                        a = ((top[i + 3]) * 0x01010101U);
+            }
+        }
+    }
+
+    if (!cand_bottom_left) {
+        if (cand_left) {
+            vec0 = (v16u8) __msa_fill_b(left[15]);
+
+            ST_UB(vec0, (left + 16));
+
+        } else if (cand_up_left) {
+            vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+            ST_UB2(vec0, vec0, left, 16);
+
+            cand_left = 1;
+        } else if (cand_up) {
+            left[-1] = top[0];
+
+            vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+            ST_UB2(vec0, vec0, left, 16);
+
+            cand_up_left = 1;
+            cand_left = 1;
+        } else if (cand_up_right) {
+            vec0 = (v16u8) __msa_fill_b(top[16]);
+
+            ST_UB(vec0, top);
+
+            left[-1] = top[16];
+
+            ST_UB2(vec0, vec0, left, 16);
+
+            cand_up = 1;
+            cand_up_left = 1;
+            cand_left = 1;
+        } else {
+            left[-1] = 128;
+            vec0 = (v16u8) __msa_ldi_b(128);
+
+            ST_UB2(vec0, vec0, top, 16);
+            ST_UB2(vec0, vec0, left, 16);
+        }
+    }
+
+    if (!cand_left) {
+        vec0 = (v16u8) __msa_fill_b(left[16]);
+        ST_UB(vec0, left);
+    }
+    if (!cand_up_left) {
+        left[-1] = left[0];
+    }
+    if (!cand_up) {
+        vec0 = (v16u8) __msa_fill_b(left[-1]);
+        ST_UB(vec0, top);
+    }
+    if (!cand_up_right) {
+        vec0 = (v16u8) __msa_fill_b(top[15]);
+        ST_UB(vec0, (top + 16));
+    }
+
+    top[-1] = left[-1];
+
+
+    if (!s->sps->intra_smoothing_disabled_flag
+        && (c_idx == 0 || s->sps->chroma_format_idc == 3)) {
+        if (mode != INTRA_DC && 16 != 4) {
+            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
+            int min_dist_vert_hor =
+                (((((int) (mode - 26U)) >=
+                   0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
+                 ((((int) (mode - 10U)) >=
+                   0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
+                 ? ((((int) (mode - 10U)) >=
+                     0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
+                 : ((((int) (mode - 26U)) >=
+                     0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
+            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
+                filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
+                filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
+                for (i = 2 * 16 - 2; i >= 0; i--)
+                    filtered_left[i] = (left[i + 1] + 2 * left[i] +
+                                        left[i - 1] + 2) >> 2;
+                filtered_top[-1] =
+                    filtered_left[-1] =
+                    (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
+                for (i = 2 * 16 - 2; i >= 0; i--)
+                    filtered_top[i] = (top[i + 1] + 2 * top[i] +
+                                       top[i - 1] + 2) >> 2;
+                left = filtered_left;
+                top = filtered_top;
+            }
+        }
+    }
+
+    switch (mode) {
+    case INTRA_PLANAR:
+        s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
+                                   (uint8_t *) left, stride);
+        break;
+    case INTRA_DC:
+        s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
+                       (uint8_t *) left, stride, 4, c_idx);
+        break;
+    default:
+        s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
+                                    (uint8_t *) left, stride, c_idx, mode);
+        break;
+    }
+}
+
+void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
+{
+    v16u8 vec0, vec1;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 res0, res1, res2, res3;
+    v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    HEVCLocalContext *lc = s->HEVClc;
+    int i;
+    int hshift = s->sps->hshift[c_idx];
+    int vshift = s->sps->vshift[c_idx];
+    int size_in_luma_h = 32 << hshift;
+    int size_in_tbs_h = size_in_luma_h >> s->sps->log2_min_tb_size;
+    int size_in_luma_v = 32 << vshift;
+    int size_in_tbs_v = size_in_luma_v >> s->sps->log2_min_tb_size;
+    int x = x0 >> hshift;
+    int y = y0 >> vshift;
+    int x_tb = (x0 >> s->sps->log2_min_tb_size) & s->sps->tb_mask;
+    int y_tb = (y0 >> s->sps->log2_min_tb_size) & s->sps->tb_mask;
+
+    int cur_tb_addr =
+        s->pps->min_tb_addr_zs[(y_tb) * (s->sps->tb_mask + 2) + (x_tb)];
+
+    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
+    uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
+
+    int min_pu_width = s->sps->min_pu_width;
+
+    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
+        lc->tu.intra_pred_mode;
+    uint32_t a;
+    uint8_t left_array[2 * 32 + 1];
+    uint8_t filtered_left_array[2 * 32 + 1];
+    uint8_t top_array[2 * 32 + 1];
+    uint8_t filtered_top_array[2 * 32 + 1];
+
+    uint8_t *left = left_array + 1;
+    uint8_t *top = top_array + 1;
+    uint8_t *filtered_left = filtered_left_array + 1;
+    uint8_t *filtered_top = filtered_top_array + 1;
+    int cand_bottom_left = lc->na.cand_bottom_left
+        && cur_tb_addr >
+        s->pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->sps->tb_mask) *
+                               (s->sps->tb_mask + 2) + (x_tb - 1)];
+    int cand_left = lc->na.cand_left;
+    int cand_up_left = lc->na.cand_up_left;
+    int cand_up = lc->na.cand_up;
+    int cand_up_right = lc->na.cand_up_right
+        && cur_tb_addr >
+        s->pps->min_tb_addr_zs[(y_tb - 1) * (s->sps->tb_mask + 2) +
+                               ((x_tb + size_in_tbs_h) & s->sps->tb_mask)];
+
+    int bottom_left_size =
+        (((y0 + 2 * size_in_luma_v) >
+          (s->sps->height) ? (s->sps->height) : (y0 +
+                                                 2 * size_in_luma_v)) -
+         (y0 + size_in_luma_v)) >> vshift;
+    int top_right_size =
+        (((x0 + 2 * size_in_luma_h) >
+          (s->sps->width) ? (s->sps->width) : (x0 + 2 * size_in_luma_h)) -
+         (x0 + size_in_luma_h)) >> hshift;
+
+    if (s->pps->constrained_intra_pred_flag == 1) {
+        int size_in_luma_pu_v = ((size_in_luma_v) >> s->sps->log2_min_pu_size);
+        int size_in_luma_pu_h = ((size_in_luma_h) >> s->sps->log2_min_pu_size);
+        int on_pu_edge_x = !(x0 & ((1 << s->sps->log2_min_pu_size) - 1));
+        int on_pu_edge_y = !(y0 & ((1 << s->sps->log2_min_pu_size) - 1));
+        if (!size_in_luma_pu_h)
+            size_in_luma_pu_h++;
+        if (cand_bottom_left == 1 && on_pu_edge_x) {
+            int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
+            int y_bottom_pu =
+                ((y0 + size_in_luma_v) >> s->sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_v) >
+                 (s->sps->min_pu_height -
+                  y_bottom_pu) ? (s->sps->min_pu_height -
+                                  y_bottom_pu) : (size_in_luma_pu_v));
+            cand_bottom_left = 0;
+            for (i = 0; i < max; i += 2)
+                cand_bottom_left |=
+                    ((s->ref->tab_mvf[(x_left_pu) +
+                                      (y_bottom_pu +
+                                       i) * min_pu_width]).pred_flag ==
+                     PF_INTRA);
+        }
+        if (cand_left == 1 && on_pu_edge_x) {
+            int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
+            int y_left_pu = ((y0) >> s->sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_v) >
+                 (s->sps->min_pu_height -
+                  y_left_pu) ? (s->sps->min_pu_height -
+                                y_left_pu) : (size_in_luma_pu_v));
+            cand_left = 0;
+            for (i = 0; i < max; i += 2)
+                cand_left |=
+                    ((s->ref->tab_mvf[(x_left_pu) +
+                                      (y_left_pu +
+                                       i) * min_pu_width]).pred_flag ==
+                     PF_INTRA);
+        }
+        if (cand_up_left == 1) {
+            int x_left_pu = ((x0 - 1) >> s->sps->log2_min_pu_size);
+            int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
+            cand_up_left =
+                (s->ref->tab_mvf[(x_left_pu) +
+                                 (y_top_pu) * min_pu_width]).pred_flag ==
+                PF_INTRA;
+        }
+        if (cand_up == 1 && on_pu_edge_y) {
+            int x_top_pu = ((x0) >> s->sps->log2_min_pu_size);
+            int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_h) >
+                 (s->sps->min_pu_width -
+                  x_top_pu) ? (s->sps->min_pu_width -
+                               x_top_pu) : (size_in_luma_pu_h));
+            cand_up = 0;
+            for (i = 0; i < max; i += 2)
+                cand_up |=
+                    ((s->ref->tab_mvf[(x_top_pu + i) +
+                                      (y_top_pu) *
+                                      min_pu_width]).pred_flag == PF_INTRA);
+        }
+        if (cand_up_right == 1 && on_pu_edge_y) {
+            int y_top_pu = ((y0 - 1) >> s->sps->log2_min_pu_size);
+            int x_right_pu =
+                ((x0 + size_in_luma_h) >> s->sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_h) >
+                 (s->sps->min_pu_width -
+                  x_right_pu) ? (s->sps->min_pu_width -
+                                 x_right_pu) : (size_in_luma_pu_h));
+            cand_up_right = 0;
+            for (i = 0; i < max; i += 2)
+                cand_up_right |=
+                    ((s->ref->tab_mvf[(x_right_pu + i) +
+                                      (y_top_pu) *
+                                      min_pu_width]).pred_flag == PF_INTRA);
+        }
+        vec0 = (v16u8) __msa_ldi_b(128);
+
+        ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+        ST_UB4(vec0, vec0, vec0, vec0, top, 16);
+
+        top[-1] = 128;
+    }
+    if (cand_up_left) {
+        left[-1] = src[(-1) + stride * (-1)];
+        top[-1] = left[-1];
+    }
+    if (cand_up) {
+        LD_UB2(src - stride, 16, vec0, vec1);
+        ST_UB2(vec0, vec1, top, 16);
+    }
+
+    if (cand_up_right) {
+        LD_UB2(src - stride + 32, 16, vec0, vec1);
+        ST_UB2(vec0, vec1, (top + 32), 16);
+        do {
+            uint32_t pix =
+                ((src[(32 + top_right_size - 1) + stride * (-1)]) *
+                 0x01010101U);
+            for (i = 0; i < (32 - top_right_size); i += 4)
+                ((((union unaligned_32 *) (top + 32 + top_right_size +
+                                           i))->l) = (pix));
+        } while (0);
+    }
+    if (cand_left)
+        for (i = 0; i < 32; i++)
+            left[i] = src[(-1) + stride * (i)];
+    if (cand_bottom_left) {
+        for (i = 32; i < 32 + bottom_left_size; i++)
+            left[i] = src[(-1) + stride * (i)];
+        do {
+            uint32_t pix =
+                ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
+                 0x01010101U);
+            for (i = 0; i < (32 - bottom_left_size); i += 4)
+                ((((union unaligned_32 *) (left + 32 + bottom_left_size +
+                                           i))->l) = (pix));
+        } while (0);
+    }
+
+    if (s->pps->constrained_intra_pred_flag == 1) {
+        if (cand_bottom_left || cand_left || cand_up_left || cand_up
+            || cand_up_right) {
+            int size_max_x =
+                x0 + ((2 * 32) << hshift) <
+                s->sps->width ? 2 * 32 : (s->sps->width - x0) >> hshift;
+            int size_max_y =
+                y0 + ((2 * 32) << vshift) <
+                s->sps->height ? 2 * 32 : (s->sps->height - y0) >> vshift;
+            int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
+            if (!cand_up_right) {
+                size_max_x = x0 + ((32) << hshift) < s->sps->width ?
+                    32 : (s->sps->width - x0) >> hshift;
+            }
+            if (!cand_bottom_left) {
+                size_max_y = y0 + ((32) << vshift) < s->sps->height ?
+                    32 : (s->sps->height - y0) >> vshift;
+            }
+            if (cand_bottom_left || cand_left || cand_up_left) {
+                while (j > -1
+                       &&
+                       !((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((j) <<
+                                                                     vshift))
+                                                                   >> s->sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                    j--;
+                if (!
+                    ((s->ref->tab_mvf[(((x0 +
+                                         ((-1) << hshift)) >> s->sps->
+                                        log2_min_pu_size)) + (((y0 + ((j)
+                                                                      <<
+                                                                      vshift))
+                                                               >> s->sps->
+                                                               log2_min_pu_size))
+                                      * min_pu_width]).pred_flag == PF_INTRA)) {
+                    j = 0;
+                    while (j < size_max_x
+                           &&
+                           !((s->ref->tab_mvf[(((x0 +
+                                                 ((j) << hshift)) >> s->sps->
+                                                log2_min_pu_size)) + (((y0 +
+                                                                        ((-1) <<
+                                                                         vshift))
+                                                                       >> s->
+                                                                       sps->
+                                                                       log2_min_pu_size))
+                                              * min_pu_width]).pred_flag ==
+                             PF_INTRA))
+                        j++;
+                    for (i = j; i > (j) - (j + 1); i--)
+                        if (!
+                            ((s->ref->tab_mvf[(((x0 +
+                                                 ((i -
+                                                   1) << hshift)) >> s->sps->
+                                                log2_min_pu_size)) + (((y0 +
+                                                                        ((-1) <<
+                                                                         vshift))
+                                                                       >> s->
+                                                                       sps->
+                                                                       log2_min_pu_size))
+                                              * min_pu_width]).pred_flag ==
+                             PF_INTRA))
+                            top[i - 1] = top[i];
+                    left[-1] = top[-1];
+                }
+            } else {
+                j = 0;
+                while (j < size_max_x
+                       &&
+                       !((s->ref->tab_mvf[(((x0 +
+                                             ((j) << hshift)) >> s->sps->
+                                            log2_min_pu_size)) + (((y0 + ((-1)
+                                                                          <<
+                                                                          vshift))
+                                                                   >> s->sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                    j++;
+                if (j > 0)
+                    if (x0 > 0) {
+                        for (i = j; i > (j) - (j + 1); i--)
+                            if (!
+                                ((s->ref->tab_mvf[(((x0 +
+                                                     ((i -
+                                                       1) << hshift)) >>
+                                                    s->sps->log2_min_pu_size))
+                                                  + (((y0 + ((-1)
+                                                             << vshift))
+                                                      >>
+                                                      s->sps->log2_min_pu_size))
+                                                  *
+                                                  min_pu_width]).pred_flag ==
+                                 PF_INTRA))
+                                top[i - 1] = top[i];
+                    } else {
+                        for (i = j; i > (j) - (j); i--)
+                            if (!
+                                ((s->ref->tab_mvf[(((x0 +
+                                                     ((i -
+                                                       1) << hshift)) >>
+                                                    s->sps->log2_min_pu_size))
+                                                  + (((y0 + ((-1)
+                                                             << vshift))
+                                                      >>
+                                                      s->sps->log2_min_pu_size))
+                                                  *
+                                                  min_pu_width]).pred_flag ==
+                                 PF_INTRA))
+                                top[i - 1] = top[i];
+                        top[-1] = top[0];
+                    }
+                left[-1] = top[-1];
+            }
+            left[-1] = top[-1];
+            if (cand_bottom_left || cand_left) {
+                a = ((left[-1]) * 0x01010101U);
+                for (i = 0; i < (0) + (size_max_y); i += 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i) <<
+                                                                     vshift))
+                                                                   >> s->sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i]))->l) = (a));
+                    else
+                        a = ((left[i + 3]) * 0x01010101U);
+            }
+            if (!cand_left) {
+                vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+                ST_UB2(vec0, vec0, left, 16);
+            }
+            if (!cand_bottom_left) {
+                vec0 = (v16u8) __msa_fill_b(left[31]);
+
+                ST_UB2(vec0, vec0, (left + 32), 16);
+            }
+            if (x0 != 0 && y0 != 0) {
+                a = ((left[size_max_y - 1]) * 0x01010101U);
+                for (i = (size_max_y - 1);
+                     i > (size_max_y - 1) - (size_max_y); i -= 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i -
+                                                                      3) <<
+                                                                     vshift))
+                                                                   >> s->sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
+                    else
+                        a = ((left[i - 3]) * 0x01010101U);
+                if (!
+                    ((s->ref->tab_mvf[(((x0 +
+                                         ((-1) << hshift)) >> s->sps->
+                                        log2_min_pu_size)) + (((y0 + ((-1)
+                                                                      <<
+                                                                      vshift))
+                                                               >> s->sps->
+                                                               log2_min_pu_size))
+                                      * min_pu_width]).pred_flag == PF_INTRA))
+                    left[-1] = left[0];
+            } else if (x0 == 0) {
+                do {
+                    uint32_t pix = ((0) * 0x01010101U);
+                    for (i = 0; i < (size_max_y); i += 4)
+                        ((((union unaligned_32 *) (left + i))->l) = (pix));
+                } while (0);
+            } else {
+                a = ((left[size_max_y - 1]) * 0x01010101U);
+                for (i = (size_max_y - 1);
+                     i > (size_max_y - 1) - (size_max_y); i -= 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i -
+                                                                      3) <<
+                                                                     vshift))
+                                                                   >> s->sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
+                    else
+                        a = ((left[i - 3]) * 0x01010101U);
+            }
+            top[-1] = left[-1];
+            if (y0 != 0) {
+                a = ((left[-1]) * 0x01010101U);
+                for (i = 0; i < (0) + (size_max_x); i += 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((i) << hshift)) >> s->sps->
+                                            log2_min_pu_size)) + (((y0 + ((-1)
+                                                                          <<
+                                                                          vshift))
+                                                                   >> s->sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&top[i]))->l) = (a));
+                    else
+                        a = ((top[i + 3]) * 0x01010101U);
+            }
+        }
+    }
+
+    if (!cand_bottom_left) {
+        if (cand_left) {
+            vec0 = (v16u8) __msa_fill_b(left[31]);
+
+            ST_UB2(vec0, vec0, (left + 32), 16);
+        } else if (cand_up_left) {
+            vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+
+            cand_left = 1;
+        } else if (cand_up) {
+            left[-1] = top[0];
+
+            vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+
+            cand_up_left = 1;
+            cand_left = 1;
+        } else if (cand_up_right) {
+            vec0 = (v16u8) __msa_fill_b(top[32]);
+
+            ST_UB2(vec0, vec0, top, 16);
+
+            left[-1] = top[32];
+
+            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+
+            cand_up = 1;
+            cand_up_left = 1;
+            cand_left = 1;
+        } else {
+            left[-1] = 128;
+
+            vec0 = (v16u8) __msa_ldi_b(128);
+
+            ST_UB4(vec0, vec0, vec0, vec0, top, 16);
+            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+        }
+    }
+
+    if (!cand_left) {
+        vec0 = (v16u8) __msa_fill_b(left[32]);
+
+        ST_UB2(vec0, vec0, left, 16);
+    }
+    if (!cand_up_left) {
+        left[-1] = left[0];
+    }
+    if (!cand_up) {
+        vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+        ST_UB2(vec0, vec0, top, 16);
+    }
+    if (!cand_up_right) {
+        vec0 = (v16u8) __msa_fill_b(top[31]);
+
+        ST_UB2(vec0, vec0, (top + 32), 16);
+    }
+
+    top[-1] = left[-1];
+
+
+    if (!s->sps->intra_smoothing_disabled_flag
+        && (c_idx == 0 || s->sps->chroma_format_idc == 3)) {
+        if (mode != INTRA_DC && 32 != 4) {
+            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
+            int min_dist_vert_hor =
+                (((((int) (mode - 26U)) >=
+                   0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
+                 ((((int) (mode - 10U)) >=
+                   0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
+                 ? ((((int) (mode - 10U)) >=
+                     0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
+                 : ((((int) (mode - 26U)) >=
+                     0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
+            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
+                int threshold = 1 << (8 - 5);
+                if (s->sps->sps_strong_intra_smoothing_enable_flag
+                    && c_idx == 0
+                    && ((top[-1] + top[63] - 2 * top[31]) >=
+                        0 ? (top[-1] + top[63] -
+                             2 * top[31]) : (-(top[-1] + top[63] -
+                                               2 * top[31]))) < threshold
+                    && ((left[-1] + left[63] - 2 * left[31]) >=
+                        0 ? (left[-1] + left[63] -
+                             2 * left[31]) : (-(left[-1] + left[63] -
+                                                2 * left[31]))) < threshold) {
+
+
+                    filtered_top[-1] = top[-1];
+                    filtered_top[63] = top[63];
+
+
+                    for (i = 0; i < 63; i++) {
+                        filtered_top[i] =
+                            ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
+                    }
+
+                    tmp0 = __msa_fill_h(top[-1]);
+                    tmp1 = __msa_fill_h(top[63]);
+
+                    tmp2 = mul_val0 - 8;
+                    tmp3 = mul_val0 - 16;
+                    tmp4 = mul_val0 - 24;
+                    tmp5 = mul_val1 + 8;
+                    tmp6 = mul_val1 + 16;
+                    tmp7 = mul_val1 + 24;
+
+                    res0 = mul_val0 * tmp0;
+                    res1 = tmp2 * tmp0;
+                    res2 = tmp3 * tmp0;
+                    res3 = tmp4 * tmp0;
+                    res0 += mul_val1 * tmp1;
+                    res1 += tmp5 * tmp1;
+                    res2 += tmp6 * tmp1;
+                    res3 += tmp7 * tmp1;
+
+                    res0 = __msa_srari_h(res0, 6);
+                    res1 = __msa_srari_h(res1, 6);
+                    res2 = __msa_srari_h(res2, 6);
+                    res3 = __msa_srari_h(res3, 6);
+
+                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
+                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
+
+                    ST_UB2(vec0, vec1, filtered_top, 16);
+
+                    res0 = mul_val0 - 32;
+                    tmp2 = mul_val0 - 40;
+                    tmp3 = mul_val0 - 48;
+                    tmp4 = mul_val0 - 56;
+                    res3 = mul_val1 + 32;
+                    tmp5 = mul_val1 + 40;
+                    tmp6 = mul_val1 + 48;
+                    tmp7 = mul_val1 + 56;
+
+                    res0 = res0 * tmp0;
+                    res1 = tmp2 * tmp0;
+                    res2 = tmp3 * tmp0;
+                    res0 += res3 * tmp1;
+                    res3 = tmp4 * tmp0;
+                    res1 += tmp5 * tmp1;
+                    res2 += tmp6 * tmp1;
+                    res3 += tmp7 * tmp1;
+
+                    res0 = __msa_srari_h(res0, 6);
+                    res1 = __msa_srari_h(res1, 6);
+                    res2 = __msa_srari_h(res2, 6);
+                    res3 = __msa_srari_h(res3, 6);
+
+                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
+                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
+
+                    ST_UB2(vec0, vec1, (filtered_top + 32), 16);
+
+                    filtered_top[63] = top[63];
+
+                    tmp0 = __msa_fill_h(left[-1]);
+                    tmp1 = __msa_fill_h(left[63]);
+
+                    tmp2 = mul_val0 - 8;
+                    tmp3 = mul_val0 - 16;
+                    tmp4 = mul_val0 - 24;
+                    tmp5 = mul_val1 + 8;
+                    tmp6 = mul_val1 + 16;
+                    tmp7 = mul_val1 + 24;
+
+                    res0 = mul_val0 * tmp0;
+                    res1 = tmp2 * tmp0;
+                    res2 = tmp3 * tmp0;
+                    res3 = tmp4 * tmp0;
+                    res0 += mul_val1 * tmp1;
+                    res1 += tmp5 * tmp1;
+                    res2 += tmp6 * tmp1;
+                    res3 += tmp7 * tmp1;
+
+                    res0 = __msa_srari_h(res0, 6);
+                    res1 = __msa_srari_h(res1, 6);
+                    res2 = __msa_srari_h(res2, 6);
+                    res3 = __msa_srari_h(res3, 6);
+
+                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
+                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
+
+                    ST_UB2(vec0, vec1, left, 16);
+
+                    res0 = mul_val0 - 32;
+                    tmp2 = mul_val0 - 40;
+                    tmp3 = mul_val0 - 48;
+                    tmp4 = mul_val0 - 56;
+                    res3 = mul_val1 + 32;
+                    tmp5 = mul_val1 + 40;
+                    tmp6 = mul_val1 + 48;
+                    tmp7 = mul_val1 + 56;
+
+                    res0 = res0 * tmp0;
+                    res1 = tmp2 * tmp0;
+                    res2 = tmp3 * tmp0;
+                    res0 += res3 * tmp1;
+                    res3 = tmp4 * tmp0;
+                    res1 += tmp5 * tmp1;
+                    res2 += tmp6 * tmp1;
+                    res3 += tmp7 * tmp1;
+
+                    res0 = __msa_srari_h(res0, 6);
+                    res1 = __msa_srari_h(res1, 6);
+                    res2 = __msa_srari_h(res2, 6);
+                    res3 = __msa_srari_h(res3, 6);
+
+                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
+                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
+
+                    ST_UB2(vec0, vec1, (left + 32), 16);
+
+                    left[63] = tmp1[0];
+
+                    top = filtered_top;
+                } else {
+                    filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
+                    filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
+                    for (i = 2 * 32 - 2; i >= 0; i--)
+                        filtered_left[i] = (left[i + 1] + 2 * left[i] +
+                                            left[i - 1] + 2) >> 2;
+                    filtered_top[-1] =
+                        filtered_left[-1] =
+                        (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
+                    for (i = 2 * 32 - 2; i >= 0; i--)
+                        filtered_top[i] = (top[i + 1] + 2 * top[i] +
+                                           top[i - 1] + 2) >> 2;
+                    left = filtered_left;
+                    top = filtered_top;
+                }
+            }
+        }
+    }
+
+    switch (mode) {
+    case INTRA_PLANAR:
+        s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
+                               (uint8_t *) left, stride);
+        break;
+    case INTRA_DC:
+        s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
+                       (uint8_t *) left, stride, 5, c_idx);
+        break;
+    default:
+        s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
+                                (uint8_t *) left, stride, c_idx, mode);
+        break;
+    }
+}
diff --git a/libavcodec/mips/hpeldsp_init_mips.c b/libavcodec/mips/hpeldsp_init_mips.c
new file mode 100644
index 0000000000..82f2310985
--- /dev/null
+++ b/libavcodec/mips/hpeldsp_init_mips.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "../hpeldsp.h"
+#include "libavcodec/mips/hpeldsp_mips.h"
+
+#if HAVE_MSA
+static void ff_hpeldsp_init_msa(HpelDSPContext *c, int flags)
+{
+    c->put_pixels_tab[0][0] = ff_put_pixels16_msa;
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_msa;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_msa;
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_msa;
+
+    c->put_pixels_tab[1][0] = ff_put_pixels8_msa;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_msa;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_msa;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_msa;
+
+    c->put_pixels_tab[2][1] = ff_put_pixels4_x2_msa;
+    c->put_pixels_tab[2][2] = ff_put_pixels4_y2_msa;
+    c->put_pixels_tab[2][3] = ff_put_pixels4_xy2_msa;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_msa;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_msa;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_msa;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_msa;
+
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_msa;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_msa;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_msa;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_msa;
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_msa;
+    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_msa;
+    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_msa;
+    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_msa;
+
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_msa;
+    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_msa;
+    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_msa;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_msa;
+
+    c->avg_pixels_tab[2][0] = ff_avg_pixels4_msa;
+    c->avg_pixels_tab[2][1] = ff_avg_pixels4_x2_msa;
+    c->avg_pixels_tab[2][2] = ff_avg_pixels4_y2_msa;
+    c->avg_pixels_tab[2][3] = ff_avg_pixels4_xy2_msa;
+}
+#endif  // #if HAVE_MSA
+
+void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags)
+{
+#if HAVE_MSA
+    ff_hpeldsp_init_msa(c, flags);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/hpeldsp_mips.h b/libavcodec/mips/hpeldsp_mips.h
new file mode 100644
index 0000000000..f4ab53eb29
--- /dev/null
+++ b/libavcodec/mips/hpeldsp_mips.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HPELDSP_MIPS_H
+#define AVCODEC_MIPS_HPELDSP_MIPS_H
+
+#include "libavcodec/bit_depth_template.c"
+
+void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+
+#endif  // #ifndef AVCODEC_MIPS_HPELDSP_MIPS_H
diff --git a/libavcodec/mips/hpeldsp_msa.c b/libavcodec/mips/hpeldsp_msa.c
new file mode 100644
index 0000000000..40a0dca0fe
--- /dev/null
+++ b/libavcodec/mips/hpeldsp_msa.c
@@ -0,0 +1,1498 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hpeldsp_mips.h"
+
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \
+{                                                             \
+    v16u8 tmp_m;                                              \
+                                                              \
+    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
+    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \
+    ST_UB(tmp_m, (pdst));                                     \
+}
+
+#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
+{                                                                           \
+    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+    uint8_t *pdst_m = (uint8_t *) (pdst);                                   \
+                                                                            \
+    PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7,                     \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                            \
+    ST_SB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst_m, stride);                 \
+}
+
+#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
+                           pdst, stride)                                \
+{                                                                       \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
+    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
+                                                                        \
+    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \
+    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
+    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \
+    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                           \
+}
+
+static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t out0, out1;
+    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+
+        SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1);
+        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
+
+        out0 = __msa_copy_u_w((v4i32) res0, 0);
+        out1 = __msa_copy_u_w((v4i32) res1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        SLDI_B4_0_SB(src0, src1, src2, src3,
+                     src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
+        AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
+                      src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        LD_UB8((src + 1), src_stride,
+               src8, src9, src10, src11, src12, src13, src14, src15);
+        src += (8 * src_stride);
+
+        AVER_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
+                       dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
+                       dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+    v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+
+    SLDI_B4_0_SB(src0, src1, src2, src3,
+                 src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
+    SLDI_B4_0_SB(src4, src5, src6, src7,
+                 src4_sld1, src5_sld1, src6_sld1, src7_sld1, 1);
+
+    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
+                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST8x4_UB(src4, src4_sld1, src5, src5_sld1,
+                 src6, src6_sld1, src7, src7_sld1, dst, dst_stride);
+}
+
+static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    SLDI_B4_0_SB(src0, src1, src2, src3,
+                 src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
+    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
+                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
+}
+
+static void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 src9, src10, src11, src12, src13, src14, src15;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src8, src9, src10, src11, src12, src13, src14, src15);
+    src += (8 * src_stride);
+
+    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    LD_UB4((src + 1), src_stride, src8, src9, src10, src11);
+    src += (4 * src_stride);
+
+    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_UB4(src, src_stride, src4, src5, src6, src7);
+    LD_UB4((src + 1), src_stride, src12, src13, src14, src15);
+    src += (4 * src_stride);
+
+    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
+                  dst, dst_stride);
+}
+
+static void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 src9, src10, src11, src12, src13, src14, src15;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src8, src9, src10, src11, src12, src13, src14, src15);
+
+    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
+                  dst, dst_stride);
+}
+
+static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t dst0, dst1, out0, out1;
+    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
+    v16u8 tmp0 = { 0 };
+    v16u8 tmp1 = { 0 };
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+
+        SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1);
+
+        dst0 = LW(dst);
+        dst1 = LW(dst + dst_stride);
+        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
+        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
+
+        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
+        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+
+        out0 = __msa_copy_u_w((v4i32) res0, 0);
+        out1 = __msa_copy_u_w((v4i32) res1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        SLDI_B4_0_SB(src0, src1, src2, src3,
+                     src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
+
+        AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1,
+                          src3, src3_sld1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 src9, src10, src11, src12, src13, src14, src15;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        LD_UB8((src + 1), src_stride,
+               src8, src9, src10, src11, src12, src13, src14, src15);
+        src += (8 * src_stride);
+
+        AVER_DST_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
+                           dst, dst_stride);
+        dst += (4 * dst_stride);
+        AVER_DST_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
+                           dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t out0, out1;
+    v16u8 src0, src1, src2, res0, res1;
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+
+        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
+
+        out0 = __msa_copy_u_w((v4i32) res0, 0);
+        out1 = __msa_copy_u_w((v4i32) res1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+
+        src0 = src2;
+    }
+}
+
+static void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4;
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        AVER_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                      dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src4;
+    }
+}
+
+static void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+
+        AVER_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                       dst, dst_stride);
+        dst += (4 * dst_stride);
+        AVER_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
+                       dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src8;
+    }
+}
+
+static void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+
+    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                 dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
+                 dst, dst_stride);
+}
+
+static void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4;
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                 dst, dst_stride);
+}
+
+static void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 src9, src10, src11, src12, src13, src14, src15, src16;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    LD_UB8(src, src_stride,
+           src8, src9, src10, src11, src12, src13, src14, src15);
+    src += (8 * src_stride);
+    src16 = LD_UB(src);
+
+    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST16x4_UB(src12, src13, src13, src14,
+                  src14, src15, src15, src16, dst, dst_stride);
+}
+
+static void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+
+    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                  dst, dst_stride);
+    dst += (4 * dst_stride);
+    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
+                  dst, dst_stride);
+}
+
+static void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t out0, out1, dst0, dst1;
+    v16u8 src0, src1, src2;
+    v16u8 tmp0 = { 0 };
+    v16u8 tmp1 = { 0 };
+    v16u8 res0, res1;
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+        dst0 = LW(dst);
+        dst1 = LW(dst + dst_stride);
+        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
+        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
+        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
+        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+        out0 = __msa_copy_u_w((v4i32) res0, 0);
+        out1 = __msa_copy_u_w((v4i32) res1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+        src0 = src2;
+    }
+}
+
+static void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4;
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        AVER_DST_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                          dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4;
+    }
+}
+
+static void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+        AVER_UB4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
+                    res0, res1, res2, res3);
+        AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
+                    res4, res5, res6, res7);
+
+        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+        AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3,
+                    res0, res1, res2, res3);
+        AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7,
+                    res4, res5, res6, res7);
+        ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        src0 = src8;
+    }
+}
+
+static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t res0, res1;
+    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
+    v16u8 src0_r, src1_r, src2_r, res;
+    v8u16 add0, add1, add2, sum0, sum1;
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+
+        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2,
+                   src0_r, src1_r, src2_r);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+        ADD2(add0, add1, add1, add2, sum0, sum1);
+        SRARI_H2_UH(sum0, sum1, 2);
+        res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0);
+        res0 = __msa_copy_u_w((v4i32) res, 0);
+        res1 = __msa_copy_u_w((v4i32) res, 2);
+        SW(res0, dst);
+        dst += dst_stride;
+        SW(res1, dst);
+        dst += dst_stride;
+
+        src0 = src2;
+    }
+}
+
+static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
+    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
+    v8u16 add0, add1, add2, add3, add4;
+    v8u16 sum0, sum1, sum2, sum3;
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+        SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
+        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
+                   src1_r, src2_r);
+        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+        HADD_UB2_UH(src3_r, src4_r, add3, add4);
+        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
+             sum0, sum1, sum2, sum3);
+        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
+        PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
+        ST8x4_UB(src0, src1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4;
+    }
+}
+
+static void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
+    v8u16 src7_l, src8_l;
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        LD_UB8((src + 1), src_stride,
+               src9, src10, src11, src12, src13, src14, src15, src16);
+        src += (8 * src_stride);
+
+        src8 = LD_UB(src);
+        src17 = LD_UB(src + 1);
+
+        ILVRL_B2_UH(src9, src0, src0_r, src0_l);
+        ILVRL_B2_UH(src10, src1, src1_r, src1_l);
+        ILVRL_B2_UH(src11, src2, src2_r, src2_l);
+        ILVRL_B2_UH(src12, src3, src3_r, src3_l);
+        ILVRL_B2_UH(src13, src4, src4_r, src4_l);
+        ILVRL_B2_UH(src14, src5, src5_r, src5_l);
+        ILVRL_B2_UH(src15, src6, src6_r, src6_l);
+        ILVRL_B2_UH(src16, src7, src7_r, src7_l);
+        ILVRL_B2_UH(src17, src8, src8_r, src8_l);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
+        HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
+        HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
+        HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
+        HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
+        HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
+        ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r,
+             sum0_r, sum1_r, sum2_r, sum3_r);
+        ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r,
+             sum4_r, sum5_r, sum6_r, sum7_r);
+        ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l,
+             sum0_l, sum1_l, sum2_l, sum3_l);
+        ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l,
+             sum4_l, sum5_l, sum6_l, sum7_l);
+        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+        PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r,
+                     sum3_l, sum3_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+        PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r,
+                     sum7_l, sum7_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+    v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1;
+    v8u16 src0_r, src1_r, src2_r, src3_r;
+    v8u16 src4_r, src5_r, src6_r, src7_r, src8_r;
+    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
+    v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
+    v16i8 out0, out1;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+
+    SLDI_B4_0_UB(src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1,
+                 src3_sld1, 1);
+    SLDI_B3_0_UB(src4, src5, src6, src4_sld1, src5_sld1, src6_sld1, 1);
+    SLDI_B2_0_UB(src7, src8, src7_sld1, src8_sld1, 1);
+    ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1,
+               src3, src0_r, src1_r, src2_r, src3_r);
+    ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
+               src5_r, src6_r);
+    ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r);
+    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+    HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
+    HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
+
+    sum0 = add0 + add1 + 1;
+    sum1 = add1 + add2 + 1;
+    sum2 = add2 + add3 + 1;
+    sum3 = add3 + add4 + 1;
+    sum4 = add4 + add5 + 1;
+    sum5 = add5 + add6 + 1;
+    sum6 = add6 + add7 + 1;
+    sum7 = add7 + add8 + 1;
+
+    SRA_4V(sum0, sum1, sum2, sum3, 2);
+    SRA_4V(sum4, sum5, sum6, sum7, 2);
+    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
+    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+}
+
+static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r;
+    v8u16 add0, add1, add2, add3, add4;
+    v8u16 sum0, sum1, sum2, sum3;
+    v16i8 out0, out1;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    src4 = LD_SB(src);
+
+    SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+    SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
+    ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
+               src1_r, src2_r);
+    ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
+    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+    HADD_UB2_UH(src3_r, src4_r, add3, add4);
+
+    sum0 = add0 + add1 + 1;
+    sum1 = add1 + add2 + 1;
+    sum2 = add2 + add3 + 1;
+    sum3 = add3 + add4 + 1;
+
+    SRA_4V(sum0, sum1, sum2, sum3, 2);
+    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
+    v8u16 src7_l, src8_l;
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src9, src10, src11, src12, src13, src14, src15, src16);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+    src17 = LD_UB(src + 1);
+
+    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
+    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
+    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
+    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
+    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
+    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
+    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
+    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
+    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
+
+    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
+    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
+    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
+    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
+    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
+    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
+
+    sum0_r = src0_r + src1_r + 1;
+    sum1_r = src1_r + src2_r + 1;
+    sum2_r = src2_r + src3_r + 1;
+    sum3_r = src3_r + src4_r + 1;
+    sum4_r = src4_r + src5_r + 1;
+    sum5_r = src5_r + src6_r + 1;
+    sum6_r = src6_r + src7_r + 1;
+    sum7_r = src7_r + src8_r + 1;
+    sum0_l = src0_l + src1_l + 1;
+    sum1_l = src1_l + src2_l + 1;
+    sum2_l = src2_l + src3_l + 1;
+    sum3_l = src3_l + src4_l + 1;
+    sum4_l = src4_l + src5_l + 1;
+    sum5_l = src5_l + src6_l + 1;
+    sum6_l = src6_l + src7_l + 1;
+    sum7_l = src7_l + src8_l + 1;
+
+    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
+                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src9, src10, src11, src12, src13, src14, src15, src16);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+    src17 = LD_UB(src + 1);
+
+    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
+                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
+    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
+    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
+    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
+    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
+    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
+    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
+    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
+    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
+
+    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
+    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
+    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
+    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
+    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
+    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
+
+    sum0_r = src0_r + src1_r + 1;
+    sum1_r = src1_r + src2_r + 1;
+    sum2_r = src2_r + src3_r + 1;
+    sum3_r = src3_r + src4_r + 1;
+    sum4_r = src4_r + src5_r + 1;
+    sum5_r = src5_r + src6_r + 1;
+    sum6_r = src6_r + src7_r + 1;
+    sum7_r = src7_r + src8_r + 1;
+    sum0_l = src0_l + src1_l + 1;
+    sum1_l = src1_l + src2_l + 1;
+    sum2_l = src2_l + src3_l + 1;
+    sum3_l = src3_l + src4_l + 1;
+    sum4_l = src4_l + src5_l + 1;
+    sum5_l = src5_l + src6_l + 1;
+    sum6_l = src6_l + src7_l + 1;
+    sum7_l = src7_l + src8_l + 1;
+
+    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
+                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
+                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
+}
+
+static void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
+    v8u16 src7_l, src8_l;
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src9, src10, src11, src12, src13, src14, src15, src16);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+    src17 = LD_UB(src + 1);
+
+    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
+    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
+    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
+    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
+    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
+    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
+    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
+    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
+    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
+
+    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
+    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
+    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
+    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
+    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
+    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
+
+    sum0_r = src0_r + src1_r + 1;
+    sum1_r = src1_r + src2_r + 1;
+    sum2_r = src2_r + src3_r + 1;
+    sum3_r = src3_r + src4_r + 1;
+    sum4_r = src4_r + src5_r + 1;
+    sum5_r = src5_r + src6_r + 1;
+    sum6_r = src6_r + src7_r + 1;
+    sum7_r = src7_r + src8_r + 1;
+    sum0_l = src0_l + src1_l + 1;
+    sum1_l = src1_l + src2_l + 1;
+    sum2_l = src2_l + src3_l + 1;
+    sum3_l = src3_l + src4_l + 1;
+    sum4_l = src4_l + src5_l + 1;
+    sum5_l = src5_l + src6_l + 1;
+    sum6_l = src6_l + src7_l + 1;
+    sum7_l = src7_l + src8_l + 1;
+
+    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
+                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
+                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
+}
+
+static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t out0, out1;
+    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
+    v16u8 src0_r, src1_r, src2_r;
+    v8u16 add0, add1, add2, sum0, sum1;
+    v16u8 dst0, dst1, res0, res1;
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
+                   src1_r, src2_r);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+        ADD2(add0, add1, add1, add2, sum0, sum1);
+        SRARI_H2_UH(sum0, sum1, 2);
+        PCKEV_B2_UB(sum0, sum0, sum1, sum1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+
+        out0 = __msa_copy_u_w((v4i32) res0, 0);
+        out1 = __msa_copy_u_w((v4i32) res1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+
+        src0 = src2;
+    }
+}
+
+static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
+    v8u16 add0, add1, add2, add3, add4;
+    v8u16 sum0, sum1, sum2, sum3;
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+        SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
+        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
+                   src1_r, src2_r);
+        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+        HADD_UB2_UH(src3_r, src4_r, add3, add4);
+        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
+             sum0, sum1, sum2, sum3);
+        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
+        PCKEV_AVG_ST8x4_UB(sum0, dst0, sum1, dst1,
+                           sum2, dst2, sum3, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4;
+    }
+}
+
+static void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 src11, src12, src13, src14, src15, src16, src17;
+    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
+    v16u8 src7_l, src8_l;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
+    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        LD_UB8((src + 1), src_stride,
+               src9, src10, src11, src12, src13, src14, src15, src16);
+        src += (8 * src_stride);
+
+        src8 = LD_UB(src);
+        src17 = LD_UB(src + 1);
+
+        ILVRL_B2_UB(src9, src0, src0_r, src0_l);
+        ILVRL_B2_UB(src10, src1, src1_r, src1_l);
+        ILVRL_B2_UB(src11, src2, src2_r, src2_l);
+        ILVRL_B2_UB(src12, src3, src3_r, src3_l);
+        ILVRL_B2_UB(src13, src4, src4_r, src4_l);
+        ILVRL_B2_UB(src14, src5, src5_r, src5_l);
+        ILVRL_B2_UB(src15, src6, src6_r, src6_l);
+        ILVRL_B2_UB(src16, src7, src7_r, src7_l);
+        ILVRL_B2_UB(src17, src8, src8_r, src8_l);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+        HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
+        HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
+        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
+             sum2_r, sum3_r);
+        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
+             sum6_r, sum7_r);
+        HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2);
+        HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
+        HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
+        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
+             sum2_l, sum3_l);
+        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
+             sum6_l, sum7_l);
+        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+        PCKEV_AVG_ST_UB(sum0_l, sum0_r, dst0, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum1_l, sum1_r, dst1, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum2_l, sum2_r, dst2, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum3_l, sum3_r, dst3, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum4_l, sum4_r, dst4, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum5_l, sum5_r, dst5, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum6_l, sum6_r, dst6, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst);
+        dst += dst_stride;
+    }
+}
+
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        for (cnt = height >> 3; cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 2) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+
+            SD(out0, dst);
+            dst += dst_stride;
+            SD(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t cnt, loop_cnt;
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_UB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst_tmp, dst_stride);
+            dst_tmp += (8 * dst_stride);
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst, dst_stride);
+            dst += (8 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint32_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    if (0 == (height % 4)) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                        dst0, dst1, dst2, dst3);
+
+            out0 = __msa_copy_u_w((v4i32) dst0, 0);
+            out1 = __msa_copy_u_w((v4i32) dst1, 0);
+            out2 = __msa_copy_u_w((v4i32) dst2, 0);
+            out3 = __msa_copy_u_w((v4i32) dst3, 0);
+            SW4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == (height % 2)) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+
+            LD_UB2(dst, dst_stride, dst0, dst1);
+
+            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+            out0 = __msa_copy_u_w((v4i32) dst0, 0);
+            out1 = __msa_copy_u_w((v4i32) dst1, 0);
+            SW(out0, dst);
+            dst += dst_stride;
+            SW(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    for (cnt = (height / 4); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+
+        out0 = __msa_copy_u_d((v2i64) dst0, 0);
+        out1 = __msa_copy_u_d((v2i64) dst1, 0);
+        out2 = __msa_copy_u_d((v2i64) dst2, 0);
+        out3 = __msa_copy_u_d((v2i64) dst3, 0);
+        SD4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    for (cnt = (height / 8); cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                    dst4, dst5, dst6, dst7);
+        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int h)
+{
+    copy_width16_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_hz_bil_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_vt_bil_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h)
+{
+    common_hv_bil_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h)
+{
+    copy_width8_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_hz_bil_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_vt_bil_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_hv_bil_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_hz_bil_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_vt_bil_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_hv_bil_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int h)
+{
+    if (h == 16) {
+        common_hz_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
+    } else if (h == 8) {
+        common_hz_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
+    }
+}
+
+void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int h)
+{
+    if (h == 16) {
+        common_vt_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
+    } else if (h == 8) {
+        common_vt_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
+    }
+}
+
+void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block,
+                                    const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h)
+{
+    if (h == 16) {
+        common_hv_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
+    } else if (h == 8) {
+        common_hv_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
+    }
+}
+
+void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int h)
+{
+    if (h == 8) {
+        common_hz_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
+    } else if (h == 4) {
+        common_hz_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
+    }
+}
+
+void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int h)
+{
+    if (h == 8) {
+        common_vt_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
+    } else if (h == 4) {
+        common_vt_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
+    }
+}
+
+void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int h)
+{
+    if (h == 8) {
+        common_hv_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
+    } else if (h == 4) {
+        common_hv_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
+    }
+}
+
+void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int h)
+{
+    avg_width16_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_hz_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_vt_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h)
+{
+    common_hv_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h)
+{
+    avg_width8_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_hz_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_vt_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_hv_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h)
+{
+    avg_width4_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_hz_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{
+    common_vt_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{
+    common_hv_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
+}
diff --git a/libavcodec/mips/idctdsp_init_mips.c b/libavcodec/mips/idctdsp_init_mips.c
new file mode 100644
index 0000000000..8c26bca538
--- /dev/null
+++ b/libavcodec/mips/idctdsp_init_mips.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void idctdsp_init_msa(IDCTDSPContext *c, AVCodecContext *avctx,
+                                     unsigned high_bit_depth)
+{
+    if ((avctx->lowres != 1) && (avctx->lowres != 2) && (avctx->lowres != 3) &&
+        (avctx->bits_per_raw_sample != 10) &&
+        (avctx->bits_per_raw_sample != 12) &&
+        (avctx->idct_algo == FF_IDCT_AUTO)) {
+                c->idct_put = ff_simple_idct_put_msa;
+                c->idct_add = ff_simple_idct_add_msa;
+                c->idct = ff_simple_idct_msa;
+                c->perm_type = FF_IDCT_PERM_NONE;
+    }
+
+    c->put_pixels_clamped = ff_put_pixels_clamped_msa;
+    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_msa;
+    c->add_pixels_clamped = ff_add_pixels_clamped_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void idctdsp_init_mmi(IDCTDSPContext *c, AVCodecContext *avctx,
+        unsigned high_bit_depth)
+{
+    if ((avctx->lowres != 1) && (avctx->lowres != 2) && (avctx->lowres != 3) &&
+        (avctx->bits_per_raw_sample != 10) &&
+        (avctx->bits_per_raw_sample != 12) &&
+        (avctx->idct_algo == FF_IDCT_AUTO)) {
+                c->idct = ff_simple_idct_mmi;
+                c->perm_type = FF_IDCT_PERM_NONE;
+    }
+
+    c->put_pixels_clamped = ff_put_pixels_clamped_mmi;
+    c->add_pixels_clamped = ff_add_pixels_clamped_mmi;
+    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmi;
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_idctdsp_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+                          unsigned high_bit_depth)
+{
+#if HAVE_MSA
+    idctdsp_init_msa(c, avctx, high_bit_depth);
+#endif  // #if HAVE_MSA
+#if HAVE_MMI
+    idctdsp_init_mmi(c, avctx, high_bit_depth);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/idctdsp_mips.h b/libavcodec/mips/idctdsp_mips.h
new file mode 100644
index 0000000000..19267e6705
--- /dev/null
+++ b/libavcodec/mips/idctdsp_mips.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_IDCTDSP_MIPS_H
+#define AVCODEC_MIPS_IDCTDSP_MIPS_H
+
+#include "../mpegvideo.h"
+
+void ff_put_pixels_clamped_msa(const int16_t *block,
+                               uint8_t *av_restrict pixels,
+                               ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_msa(const int16_t *block,
+                                      uint8_t *av_restrict pixels,
+                                      ptrdiff_t line_size);
+void ff_add_pixels_clamped_msa(const int16_t *block,
+                               uint8_t *av_restrict pixels,
+                               ptrdiff_t line_size);
+void ff_j_rev_dct_msa(int16_t *data);
+void ff_jref_idct_put_msa(uint8_t *dest, int32_t stride, int16_t *block);
+void ff_jref_idct_add_msa(uint8_t *dest, int32_t stride, int16_t *block);
+void ff_simple_idct_msa(int16_t *block);
+void ff_simple_idct_put_msa(uint8_t *dest, int32_t stride_dst, int16_t *block);
+void ff_simple_idct_add_msa(uint8_t *dest, int32_t stride_dst, int16_t *block);
+
+void ff_put_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size);
+void ff_add_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size);
+void ff_simple_idct_mmi(int16_t *block);
+void ff_simple_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block);
+void ff_simple_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block);
+
+#endif  // #ifndef AVCODEC_MIPS_IDCTDSP_MIPS_H
diff --git a/libavcodec/mips/idctdsp_mmi.c b/libavcodec/mips/idctdsp_mmi.c
new file mode 100644
index 0000000000..25476f3c40
--- /dev/null
+++ b/libavcodec/mips/idctdsp_mmi.c
@@ -0,0 +1,190 @@
+/*
+ * Loongson SIMD optimized idctdsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_mips.h"
+#include "constants.h"
+
+void ff_put_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size)
+{
+    const int16_t *p;
+    uint8_t *pix;
+
+    p = block;
+    pix = pixels;
+
+    __asm__ volatile (
+        "ldc1 $f0, 0+%3                 \r\n"
+        "ldc1 $f2, 8+%3                 \r\n"
+        "ldc1 $f4, 16+%3                \r\n"
+        "ldc1 $f6, 24+%3                \r\n"
+        "ldc1 $f8, 32+%3                \r\n"
+        "ldc1 $f10, 40+%3               \r\n"
+        "ldc1 $f12, 48+%3               \r\n"
+        "ldc1 $f14, 56+%3               \r\n"
+        "dadd $10, %0, %1               \r\n"
+        "packushb $f0, $f0, $f2         \r\n"
+        "packushb $f4, $f4, $f6         \r\n"
+        "packushb $f8, $f8, $f10        \r\n"
+        "packushb $f12, $f12, $f14      \r\n"
+        "sdc1 $f0, 0(%0)                \r\n"
+        "sdc1 $f4, 0($10)               \r\n"
+        "gssdxc1 $f8, 0($10, %1)        \r\n"
+        "gssdxc1 $f12, 0(%0, %2)        \r\n"
+        ::"r"(pix),"r"((int)line_size),
+          "r"((int)line_size*3),"m"(*p)
+        : "$10","memory"
+    );
+
+    pix += line_size*4;
+    p += 32;
+
+    __asm__ volatile (
+        "ldc1 $f0, 0+%3                 \r\n"
+        "ldc1 $f2, 8+%3                 \r\n"
+        "ldc1 $f4, 16+%3                \r\n"
+        "ldc1 $f6, 24+%3                \r\n"
+        "ldc1 $f8, 32+%3                \r\n"
+        "ldc1 $f10, 40+%3               \r\n"
+        "ldc1 $f12, 48+%3               \r\n"
+        "ldc1 $f14, 56+%3               \r\n"
+        "dadd $10, %0, %1               \r\n"
+        "packushb $f0, $f0, $f2         \r\n"
+        "packushb $f4, $f4, $f6         \r\n"
+        "packushb $f8, $f8, $f10        \r\n"
+        "packushb $f12, $f12, $f14      \r\n"
+        "sdc1 $f0, 0(%0)                \r\n"
+        "sdc1 $f4, 0($10)               \r\n"
+        "gssdxc1 $f8, 0($10, %1)        \r\n"
+        "gssdxc1 $f12, 0(%0, %2)        \r\n"
+        ::"r"(pix),"r"((int)line_size),
+          "r"((int)line_size*3),"m"(*p)
+        : "$10","memory"
+    );
+}
+
+void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
+    uint8_t *av_restrict pixels, ptrdiff_t line_size)
+{
+    int64_t line_skip = line_size;
+    int64_t line_skip3;
+
+    __asm__ volatile (
+        "dmtc1 %4, $f0                  \n\t"
+        "daddu %1, %3, %3               \n\t"
+        "ldc1 $f2, 0(%2)                \n\t"
+        "ldc1 $f10, 8(%2)               \n\t"
+        "packsshb $f2, $f2, $f10        \n\t"
+        "ldc1 $f4, 16(%2)               \n\t"
+        "ldc1 $f10, 24(%2)              \n\t"
+        "packsshb $f4, $f4, $f10        \n\t"
+        "ldc1 $f6, 32(%2)               \n\t"
+        "ldc1 $f10, 40(%2)              \n\t"
+        "packsshb $f6, $f6, $f10        \n\t"
+        "ldc1 $f8, 48(%2)               \n\t"
+        "ldc1 $f10, 56(%2)              \n\t"
+        "packsshb $f8, $f8, $f10        \n\t"
+        "paddb $f2, $f2, $f0            \n\t"
+        "paddb $f4, $f4, $f0            \n\t"
+        "paddb $f6, $f6, $f0            \n\t"
+        "paddb $f8, $f8, $f0            \n\t"
+        "sdc1 $f2, 0(%0)                \n\t"
+        "gssdxc1 $f4, 0(%0, %3)         \n\t"
+        "gssdxc1 $f6, 0(%0, %1)         \n\t"
+        "daddu %1, %1, %3               \n\t"
+        "gssdxc1 $f8, 0(%0, %1)         \n\t"
+        "daddu $10, %1, %3              \n\t"
+        "daddu %0, %0, $10              \n\t"
+        "ldc1 $f2, 64(%2)               \n\t"
+        "ldc1 $f10, 8+64(%2)            \n\t"
+        "packsshb  $f2, $f2, $f10       \n\t"
+        "ldc1 $f4, 16+64(%2)            \n\t"
+        "ldc1 $f10, 24+64(%2)           \n\t"
+        "packsshb $f4, $f4, $f10        \n\t"
+        "ldc1 $f6, 32+64(%2)            \n\t"
+        "ldc1 $f10, 40+64(%2)           \n\t"
+        "packsshb $f6, $f6, $f10        \n\t"
+        "ldc1 $f8, 48+64(%2)            \n\t"
+        "ldc1 $f10, 56+64(%2)           \n\t"
+        "packsshb $f8, $f8, $f10        \n\t"
+        "paddb $f2, $f2, $f0            \n\t"
+        "paddb $f4, $f4, $f0            \n\t"
+        "paddb $f6, $f6, $f0            \n\t"
+        "paddb $f8, $f8, $f0            \n\t"
+        "sdc1 $f2, 0(%0)                \n\t"
+        "gssdxc1 $f4, 0(%0, %3)         \n\t"
+        "daddu $10, %3, %3              \n\t"
+        "gssdxc1 $f6, 0(%0, $10)        \n\t"
+        "gssdxc1 $f8, 0(%0, %1)         \n\t"
+        : "+&r"(pixels),"=&r"(line_skip3)
+        : "r"(block),"r"(line_skip),"r"(ff_pb_80)
+        : "$10","memory"
+    );
+}
+
+void ff_add_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size)
+{
+    const int16_t *p;
+    uint8_t *pix;
+    int i = 4;
+
+    p = block;
+    pix = pixels;
+
+    __asm__ volatile (
+        "xor $f14, $f14, $f14           \r\n"
+        ::
+    );
+
+    do {
+        __asm__ volatile (
+            "ldc1 $f0, 0+%2             \r\n"
+            "ldc1 $f2, 8+%2             \r\n"
+            "ldc1 $f4, 16+%2            \r\n"
+            "ldc1 $f6, 24+%2            \r\n"
+            "ldc1 $f8, %0               \r\n"
+            "ldc1 $f12, %1              \r\n"
+            "mov.d $f10, $f8            \r\n"
+            "punpcklbh $f8, $f8, $f14   \r\n"
+            "punpckhbh $f10, $f10, $f14 \r\n"
+            "paddsh $f0, $f0, $f8       \r\n"
+            "paddsh $f2, $f2, $f10      \r\n"
+            "mov.d $f10, $f12           \r\n"
+            "punpcklbh $f12, $f12, $f14 \r\n"
+            "punpckhbh $f10, $f10, $f14 \r\n"
+            "paddsh $f4, $f4, $f12      \r\n"
+            "paddsh $f6, $f6, $f10      \r\n"
+            "packushb $f0, $f0, $f2     \r\n"
+            "packushb $f4, $f4, $f6     \r\n"
+            "sdc1 $f0, %0               \r\n"
+            "sdc1 $f4, %1               \r\n"
+            : "+m"(*pix),"+m"(*(pix+line_size))
+            : "m"(*p)
+            : "memory"
+        );
+
+        pix += line_size*2;
+        p += 16;
+    } while (--i);
+}
diff --git a/libavcodec/mips/idctdsp_msa.c b/libavcodec/mips/idctdsp_msa.c
new file mode 100644
index 0000000000..b29e420556
--- /dev/null
+++ b/libavcodec/mips/idctdsp_msa.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "idctdsp_mips.h"
+
+static void put_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
+                                   int32_t stride)
+{
+    uint64_t in0_d, in1_d, in2_d, in3_d, in4_d, in5_d, in6_d, in7_d;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    CLIP_SH4_0_255(in0, in1, in2, in3);
+    CLIP_SH4_0_255(in4, in5, in6, in7);
+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
+
+    in0_d = __msa_copy_u_d((v2i64) in0, 0);
+    in1_d = __msa_copy_u_d((v2i64) in1, 0);
+    in2_d = __msa_copy_u_d((v2i64) in2, 0);
+    in3_d = __msa_copy_u_d((v2i64) in3, 0);
+    in4_d = __msa_copy_u_d((v2i64) in4, 0);
+    in5_d = __msa_copy_u_d((v2i64) in5, 0);
+    in6_d = __msa_copy_u_d((v2i64) in6, 0);
+    in7_d = __msa_copy_u_d((v2i64) in7, 0);
+    SD4(in0_d, in1_d, in2_d, in3_d, pixels, stride);
+    pixels += 4 * stride;
+    SD4(in4_d, in5_d, in6_d, in7_d, pixels, stride);
+}
+
+static void put_signed_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
+                                          int32_t stride)
+{
+    uint64_t in0_d, in1_d, in2_d, in3_d, in4_d, in5_d, in6_d, in7_d;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+    in0 += 128;
+    in1 += 128;
+    in2 += 128;
+    in3 += 128;
+    in4 += 128;
+    in5 += 128;
+    in6 += 128;
+    in7 += 128;
+
+    CLIP_SH4_0_255(in0, in1, in2, in3);
+    CLIP_SH4_0_255(in4, in5, in6, in7);
+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
+
+    in0_d = __msa_copy_u_d((v2i64) in0, 0);
+    in1_d = __msa_copy_u_d((v2i64) in1, 0);
+    in2_d = __msa_copy_u_d((v2i64) in2, 0);
+    in3_d = __msa_copy_u_d((v2i64) in3, 0);
+    in4_d = __msa_copy_u_d((v2i64) in4, 0);
+    in5_d = __msa_copy_u_d((v2i64) in5, 0);
+    in6_d = __msa_copy_u_d((v2i64) in6, 0);
+    in7_d = __msa_copy_u_d((v2i64) in7, 0);
+    SD4(in0_d, in1_d, in2_d, in3_d, pixels, stride);
+    pixels += 4 * stride;
+    SD4(in4_d, in5_d, in6_d, in7_d, pixels, stride);
+}
+
+static void add_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
+                                   int32_t stride)
+{
+    uint64_t in0_d, in1_d, in2_d, in3_d, in4_d, in5_d, in6_d, in7_d;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16u8 pix_in0, pix_in1, pix_in2, pix_in3;
+    v16u8 pix_in4, pix_in5, pix_in6, pix_in7;
+    v8u16 pix0, pix1, pix2, pix3, pix4, pix5, pix6, pix7;
+    v8i16 zero = { 0 };
+
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    LD_UB8(pixels, stride, pix_in0, pix_in1, pix_in2,
+           pix_in3, pix_in4, pix_in5, pix_in6, pix_in7);
+
+    ILVR_B4_UH(zero, pix_in0, zero, pix_in1, zero, pix_in2, zero, pix_in3,
+               pix0, pix1, pix2, pix3);
+    ILVR_B4_UH(zero, pix_in4, zero, pix_in5, zero, pix_in6, zero, pix_in7,
+               pix4, pix5, pix6, pix7);
+
+    in0 += (v8i16) pix0;
+    in1 += (v8i16) pix1;
+    in2 += (v8i16) pix2;
+    in3 += (v8i16) pix3;
+    in4 += (v8i16) pix4;
+    in5 += (v8i16) pix5;
+    in6 += (v8i16) pix6;
+    in7 += (v8i16) pix7;
+
+    CLIP_SH4_0_255(in0, in1, in2, in3);
+    CLIP_SH4_0_255(in4, in5, in6, in7);
+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
+
+    in0_d = __msa_copy_u_d((v2i64) in0, 0);
+    in1_d = __msa_copy_u_d((v2i64) in1, 0);
+    in2_d = __msa_copy_u_d((v2i64) in2, 0);
+    in3_d = __msa_copy_u_d((v2i64) in3, 0);
+    in4_d = __msa_copy_u_d((v2i64) in4, 0);
+    in5_d = __msa_copy_u_d((v2i64) in5, 0);
+    in6_d = __msa_copy_u_d((v2i64) in6, 0);
+    in7_d = __msa_copy_u_d((v2i64) in7, 0);
+    SD4(in0_d, in1_d, in2_d, in3_d, pixels, stride);
+    pixels += 4 * stride;
+    SD4(in4_d, in5_d, in6_d, in7_d, pixels, stride);
+}
+
+void ff_put_pixels_clamped_msa(const int16_t *block,
+                               uint8_t *av_restrict pixels,
+                               ptrdiff_t line_size)
+{
+    put_pixels_clamped_msa(block, pixels, line_size);
+}
+
+void ff_put_signed_pixels_clamped_msa(const int16_t *block,
+                                      uint8_t *av_restrict pixels,
+                                      ptrdiff_t line_size)
+{
+    put_signed_pixels_clamped_msa(block, pixels, line_size);
+}
+
+void ff_add_pixels_clamped_msa(const int16_t *block,
+                               uint8_t *av_restrict pixels,
+                               ptrdiff_t line_size)
+{
+    add_pixels_clamped_msa(block, pixels, line_size);
+}
diff --git a/libavcodec/mips/iirfilter_mips.c b/libavcodec/mips/iirfilter_mips.c
new file mode 100644
index 0000000000..a5646cde8b
--- /dev/null
+++ b/libavcodec/mips/iirfilter_mips.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * IIR filter optimized for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+ /**
+ * @file
+ * Reference: libavcodec/iirfilter.c
+ */
+
+#include "libavcodec/iirfilter.h"
+
+#if HAVE_INLINE_ASM
+typedef struct FFIIRFilterCoeffs {
+    int   order;
+    float gain;
+    int   *cx;
+    float *cy;
+} FFIIRFilterCoeffs;
+
+typedef struct FFIIRFilterState {
+    float x[1];
+} FFIIRFilterState;
+
+static void ff_iir_filter_flt_mips(const struct FFIIRFilterCoeffs *c,
+                                   struct FFIIRFilterState *s, int size,
+                                   const float *src, int sstep, float *dst, int dstep)
+{
+    if (c->order == 2) {
+        int i;
+        const float *src0 = src;
+        float       *dst0 = dst;
+        for (i = 0; i < size; i++) {
+            float in = *src0 * c->gain  + s->x[0] * c->cy[0] + s->x[1] * c->cy[1];
+            *dst0 = s->x[0] + in + s->x[1] * c->cx[1];
+            s->x[0] = s->x[1];
+            s->x[1] = in;
+            src0 += sstep;
+            dst0 += dstep;
+        }
+    } else if (c->order == 4) {
+        int i;
+        const float *src0 = src;
+        float       *dst0 = dst;
+        float four = 4.0;
+        float six  = 6.0;
+        for (i = 0; i < size; i += 4) {
+            float in1, in2, in3, in4;
+            float res1, res2, res3, res4;
+            float *x  = s->x;
+            float *cy = c->cy;
+            float gain = c->gain;
+            float src0_0 = src0[0      ];
+            float src0_1 = src0[sstep  ];
+            float src0_2 = src0[2*sstep];
+            float src0_3 = src0[3*sstep];
+
+            __asm__ volatile (
+                "lwc1   $f0,        0(%[cy])                    \n\t"
+                "lwc1   $f4,        0(%[x])                     \n\t"
+                "lwc1   $f5,        4(%[x])                     \n\t"
+                "lwc1   $f6,        8(%[x])                     \n\t"
+                "lwc1   $f7,        12(%[x])                    \n\t"
+                "mul.s  %[in1],     %[src0_0],  %[gain]         \n\t"
+                "mul.s  %[in2],     %[src0_1],  %[gain]         \n\t"
+                "mul.s  %[in3],     %[src0_2],  %[gain]         \n\t"
+                "mul.s  %[in4],     %[src0_3],  %[gain]         \n\t"
+                "lwc1   $f1,        4(%[cy])                    \n\t"
+                "madd.s %[in1],     %[in1],     $f0,    $f4     \n\t"
+                "madd.s %[in2],     %[in2],     $f0,    $f5     \n\t"
+                "madd.s %[in3],     %[in3],     $f0,    $f6     \n\t"
+                "madd.s %[in4],     %[in4],     $f0,    $f7     \n\t"
+                "lwc1   $f2,        8(%[cy])                    \n\t"
+                "madd.s %[in1],     %[in1],     $f1,    $f5     \n\t"
+                "madd.s %[in2],     %[in2],     $f1,    $f6     \n\t"
+                "madd.s %[in3],     %[in3],     $f1,    $f7     \n\t"
+                "lwc1   $f3,        12(%[cy])                   \n\t"
+                "add.s  $f8,        $f5,        $f7             \n\t"
+                "madd.s %[in1],     %[in1],     $f2,    $f6     \n\t"
+                "madd.s %[in2],     %[in2],     $f2,    $f7     \n\t"
+                "mul.s  $f9,        $f6,        %[six]          \n\t"
+                "mul.s  $f10,       $f7,        %[six]          \n\t"
+                "madd.s %[in1],     %[in1],     $f3,    $f7     \n\t"
+                "madd.s %[in2],     %[in2],     $f3,    %[in1]  \n\t"
+                "madd.s %[in3],     %[in3],     $f2,    %[in1]  \n\t"
+                "madd.s %[in4],     %[in4],     $f1,    %[in1]  \n\t"
+                "add.s  %[res1],    $f4,        %[in1]          \n\t"
+                "swc1   %[in1],     0(%[x])                     \n\t"
+                "add.s  $f0,        $f6,        %[in1]          \n\t"
+                "madd.s %[in3],     %[in3],     $f3,    %[in2]  \n\t"
+                "madd.s %[in4],     %[in4],     $f2,    %[in2]  \n\t"
+                "add.s  %[res2],    $f5,        %[in2]          \n\t"
+                "madd.s %[res1],    %[res1],    $f8,    %[four] \n\t"
+                "add.s  $f8,        $f7,        %[in2]          \n\t"
+                "swc1   %[in2],     4(%[x])                     \n\t"
+                "madd.s %[in4],     %[in4],     $f3,    %[in3]  \n\t"
+                "add.s  %[res3],    $f6,        %[in3]          \n\t"
+                "add.s  %[res1],    %[res1],    $f9             \n\t"
+                "madd.s %[res2],    %[res2],    $f0,    %[four] \n\t"
+                "swc1   %[in3],     8(%[x])                     \n\t"
+                "add.s  %[res4],    $f7,        %[in4]          \n\t"
+                "madd.s %[res3],    %[res3],    $f8,    %[four] \n\t"
+                "swc1   %[in4],     12(%[x])                    \n\t"
+                "add.s  %[res2],    %[res2],    $f10            \n\t"
+                "add.s  $f8,        %[in1],     %[in3]          \n\t"
+                "madd.s %[res3],    %[res3],    %[in1], %[six]  \n\t"
+                "madd.s %[res4],    %[res4],    $f8,    %[four] \n\t"
+                "madd.s %[res4],    %[res4],    %[in2], %[six]  \n\t"
+
+                : [in1]"=&f"(in1), [in2]"=&f"(in2),
+                  [in3]"=&f"(in3), [in4]"=&f"(in4),
+                  [res1]"=&f"(res1), [res2]"=&f"(res2),
+                  [res3]"=&f"(res3), [res4]"=&f"(res4)
+                : [src0_0]"f"(src0_0), [src0_1]"f"(src0_1),
+                  [src0_2]"f"(src0_2), [src0_3]"f"(src0_3),
+                  [gain]"f"(gain), [x]"r"(x), [cy]"r"(cy),
+                  [four]"f"(four), [six]"f"(six)
+                : "$f0", "$f1", "$f2", "$f3",
+                  "$f4", "$f5", "$f6", "$f7",
+                  "$f8", "$f9", "$f10",
+                  "memory"
+            );
+
+            dst0[0      ] = res1;
+            dst0[sstep  ] = res2;
+            dst0[2*sstep] = res3;
+            dst0[3*sstep] = res4;
+
+            src0 += 4*sstep;
+            dst0 += 4*dstep;
+        }
+    } else {
+        int i;
+        const float *src0 = src;
+        float       *dst0 = dst;
+        for (i = 0; i < size; i++) {
+            int j;
+            float in, res;
+            in = *src0 * c->gain;
+            for(j = 0; j < c->order; j++)
+                in += c->cy[j] * s->x[j];
+            res = s->x[0] + in + s->x[c->order >> 1] * c->cx[c->order >> 1];
+            for(j = 1; j < c->order >> 1; j++)
+                res += (s->x[j] + s->x[c->order - j]) * c->cx[j];
+            for(j = 0; j < c->order - 1; j++)
+                s->x[j] = s->x[j + 1];
+            *dst0 = res;
+            s->x[c->order - 1] = in;
+            src0 += sstep;
+            dst0 += dstep;
+        }
+    }
+}
+#endif /* HAVE_INLINE_ASM */
+
+void ff_iir_filter_init_mips(FFIIRFilterContext *f) {
+#if HAVE_INLINE_ASM
+    f->filter_flt = ff_iir_filter_flt_mips;
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/lsp_mips.h b/libavcodec/mips/lsp_mips.h
new file mode 100644
index 0000000000..9d7521865b
--- /dev/null
+++ b/libavcodec/mips/lsp_mips.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * LSP routines for ACELP-based codecs optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/lsp.c
+ */
+#ifndef AVCODEC_LSP_MIPS_H
+#define AVCODEC_LSP_MIPS_H
+
+#if HAVE_MIPSFPU && HAVE_INLINE_ASM
+#include "libavutil/mips/asmdefs.h"
+
+static av_always_inline void ff_lsp2polyf_mips(const double *lsp, double *f, int lp_half_order)
+{
+    int i, j = 0;
+    double * p_fi = f;
+    double * p_f = 0;
+
+    f[0] = 1.0;
+    f[1] = -2 * lsp[0];
+    lsp -= 2;
+
+    for(i=2; i<=lp_half_order; i++)
+    {
+        double tmp, f_j_2, f_j_1, f_j;
+        double val = lsp[2*i];
+
+        __asm__ volatile(
+            "move   %[p_f],     %[p_fi]                         \n\t"
+            "add.d  %[val],     %[val],     %[val]              \n\t"
+            PTR_ADDIU "%[p_fi], 8                               \n\t"
+            "ldc1   %[f_j_1],   0(%[p_f])                       \n\t"
+            "ldc1   %[f_j],     8(%[p_f])                       \n\t"
+            "neg.d  %[val],     %[val]                          \n\t"
+            "add.d  %[tmp],     %[f_j_1],   %[f_j_1]            \n\t"
+            "madd.d %[tmp],     %[tmp],     %[f_j], %[val]      \n\t"
+            "addiu  %[j],       %[i], -2                        \n\t"
+            "ldc1   %[f_j_2],   -8(%[p_f])                      \n\t"
+            "sdc1   %[tmp],     16(%[p_f])                      \n\t"
+            "beqz   %[j],       ff_lsp2polyf_lp_j_end%=         \n\t"
+            "ff_lsp2polyf_lp_j%=:                               \n\t"
+            "add.d  %[tmp],     %[f_j],     %[f_j_2]            \n\t"
+            "madd.d %[tmp],     %[tmp],     %[f_j_1], %[val]    \n\t"
+            "mov.d  %[f_j],     %[f_j_1]                        \n\t"
+            "addiu  %[j],       -1                              \n\t"
+            "mov.d  %[f_j_1],   %[f_j_2]                        \n\t"
+            "ldc1   %[f_j_2],   -16(%[p_f])                     \n\t"
+            "sdc1   %[tmp],     8(%[p_f])                       \n\t"
+            PTR_ADDIU "%[p_f], -8                              \n\t"
+            "bgtz   %[j],       ff_lsp2polyf_lp_j%=             \n\t"
+            "ff_lsp2polyf_lp_j_end%=:                           \n\t"
+
+            : [f_j_2]"=&f"(f_j_2), [f_j_1]"=&f"(f_j_1), [val]"+f"(val),
+              [tmp]"=&f"(tmp), [f_j]"=&f"(f_j), [p_f]"+r"(p_f),
+              [j]"+r"(j), [p_fi]"+r"(p_fi)
+            : [i]"r"(i)
+            : "memory"
+        );
+        f[1] += val;
+    }
+}
+#define ff_lsp2polyf ff_lsp2polyf_mips
+#endif /* HAVE_MIPSFPU && HAVE_INLINE_ASM */
+#endif /* AVCODEC_LSP_MIPS_H */
diff --git a/libavcodec/mips/mathops.h b/libavcodec/mips/mathops.h
index dd80f68072..bb9dc8375a 100644
--- a/libavcodec/mips/mathops.h
+++ b/libavcodec/mips/mathops.h
@@ -1,20 +1,21 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,55 +28,39 @@
 
 #if HAVE_INLINE_ASM
 
-#if HAVE_LOONGSON
+#if HAVE_LOONGSON3
 
-static inline av_const int64_t MAC64(int64_t d, int a, int b)
+#define MULH MULH
+static inline av_const int MULH(int a, int b)
 {
-    int64_t m;
-    __asm__ ("dmult.g %1, %2, %3 \n\t"
-             "daddu   %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b));
-    return d;
-}
-#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
-
-static inline av_const int64_t MLS64(int64_t d, int a, int b)
-{
-    int64_t m;
-    __asm__ ("dmult.g %1, %2, %3 \n\t"
-             "dsubu   %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b));
-    return d;
-}
-#define MLS64(d, a, b) ((d) = MLS64(d, a, b))
-
-#elif ARCH_MIPS64
-
-static inline av_const int64_t MAC64(int64_t d, int a, int b)
-{
-    int64_t m;
-    __asm__ ("dmult %2, %3     \n\t"
-             "mflo  %1         \n\t"
-             "daddu %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b)
+    int c;
+    __asm__ ("dmult %1, %2      \n\t"
+             "mflo %0           \n\t"
+             "dsrl %0, %0, 32   \n\t"
+             : "=r"(c)
+             : "r"(a),"r"(b)
              : "hi", "lo");
-    return d;
+    return c;
 }
-#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
 
-static inline av_const int64_t MLS64(int64_t d, int a, int b)
+#define mid_pred mid_pred
+static inline av_const int mid_pred(int a, int b, int c)
 {
-    int64_t m;
-    __asm__ ("dmult %2, %3     \n\t"
-             "mflo  %1         \n\t"
-             "dsubu %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b)
-             : "hi", "lo");
-    return d;
+    int t = b;
+    __asm__ ("sgt $8, %1, %2    \n\t"
+             "movn %0, %1, $8   \n\t"
+             "movn %1, %2, $8   \n\t"
+             "sgt $8, %1, %3    \n\t"
+             "movz %1, %3, $8   \n\t"
+             "sgt $8, %0, %1    \n\t"
+             "movn %0, %1, $8   \n\t"
+             : "+&r"(t),"+&r"(a)
+             : "r"(b),"r"(c)
+             : "$8");
+    return t;
 }
-#define MLS64(d, a, b) ((d) = MLS64(d, a, b))
 
-#endif
+#endif /* HAVE_LOONGSON3 */
 
 #endif /* HAVE_INLINE_ASM */
 
diff --git a/libavcodec/mips/me_cmp_init_mips.c b/libavcodec/mips/me_cmp_init_mips.c
new file mode 100644
index 0000000000..219a0dc00c
--- /dev/null
+++ b/libavcodec/mips/me_cmp_init_mips.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "me_cmp_mips.h"
+
+#if HAVE_MSA
+static av_cold void me_cmp_msa(MECmpContext *c, AVCodecContext *avctx)
+{
+#if BIT_DEPTH == 8
+    c->pix_abs[0][0] = ff_pix_abs16_msa;
+    c->pix_abs[0][1] = ff_pix_abs16_x2_msa;
+    c->pix_abs[0][2] = ff_pix_abs16_y2_msa;
+    c->pix_abs[0][3] = ff_pix_abs16_xy2_msa;
+    c->pix_abs[1][0] = ff_pix_abs8_msa;
+    c->pix_abs[1][1] = ff_pix_abs8_x2_msa;
+    c->pix_abs[1][2] = ff_pix_abs8_y2_msa;
+    c->pix_abs[1][3] = ff_pix_abs8_xy2_msa;
+
+    c->hadamard8_diff[0] = ff_hadamard8_diff16_msa;
+    c->hadamard8_diff[1] = ff_hadamard8_diff8x8_msa;
+
+    c->hadamard8_diff[4] = ff_hadamard8_intra16_msa;
+    c->hadamard8_diff[5] = ff_hadamard8_intra8x8_msa;
+
+    c->sad[0] = ff_pix_abs16_msa;
+    c->sad[1] = ff_pix_abs8_msa;
+    c->sse[0] = ff_sse16_msa;
+    c->sse[1] = ff_sse8_msa;
+    c->sse[2] = ff_sse4_msa;
+#endif
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx)
+{
+#if HAVE_MSA
+    me_cmp_msa(c, avctx);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/me_cmp_mips.h b/libavcodec/mips/me_cmp_mips.h
new file mode 100644
index 0000000000..e0d0f51af8
--- /dev/null
+++ b/libavcodec/mips/me_cmp_mips.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_ME_CMP_MIPS_H
+#define AVCODEC_MIPS_ME_CMP_MIPS_H
+
+#include "../mpegvideo.h"
+#include "libavcodec/bit_depth_template.c"
+
+int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                             ptrdiff_t stride, int h);
+int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                              ptrdiff_t stride, int h);
+int ff_hadamard8_diff16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                            ptrdiff_t stride, int h);
+int ff_hadamard8_intra16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                             ptrdiff_t stride, int h);
+int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     ptrdiff_t stride, int h);
+int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         ptrdiff_t stride, int h);
+int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    ptrdiff_t stride, int h);
+int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h);
+int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h);
+int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_sse16_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
+                 ptrdiff_t stride, int i32Height);
+int ff_sse8_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
+                ptrdiff_t stride, int i32Height);
+int ff_sse4_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
+                ptrdiff_t stride, int i32Height);
+void ff_add_pixels8_msa(uint8_t *av_restrict pixels, int16_t *block,
+                        ptrdiff_t stride);
+
+#endif  // #ifndef AVCODEC_MIPS_ME_CMP_MIPS_H
diff --git a/libavcodec/mips/me_cmp_msa.c b/libavcodec/mips/me_cmp_msa.c
new file mode 100644
index 0000000000..0e3165cd8f
--- /dev/null
+++ b/libavcodec/mips/me_cmp_msa.c
@@ -0,0 +1,686 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "me_cmp_mips.h"
+
+static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride,
+                               uint8_t *ref, int32_t ref_stride,
+                               int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+        ref += (4 * ref_stride);
+
+        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                    src0, src1, ref0, ref1);
+        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *ref, int32_t ref_stride,
+                                int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, ref0, ref1;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) {
+        LD_UB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+        LD_UB2(ref, ref_stride, ref0, ref1);
+        ref += (2 * ref_stride);
+        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+        LD_UB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+        LD_UB2(ref, ref_stride, ref0, ref1);
+        ref += (2 * ref_stride);
+        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *ref,
+                                                     int32_t ref_stride,
+                                                     int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, comp0, comp1;
+    v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+        ref += (4 * ref_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
+        SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+        SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+        ref += (4 * ref_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
+        SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+        SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *ref,
+                                                      int32_t ref_stride,
+                                                      int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, comp0, comp1;
+    v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
+        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
+        ref += (4 * ref_stride);
+
+        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
+        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
+        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
+        ref += (4 * ref_stride);
+
+        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
+        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *ref,
+                                                    int32_t ref_stride,
+                                                    int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, comp0, comp1;
+    v16u8 ref0, ref1, ref2, ref3, ref4;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
+        ref += (4 * ref_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
+        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
+        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
+        ref += (4 * ref_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
+        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
+        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *ref,
+                                                     int32_t ref_stride,
+                                                     int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, comp0, comp1;
+    v16u8 ref0, ref1, ref2, ref3, ref4;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) {
+        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
+        ref += (5 * ref_stride);
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
+        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+
+        ref4 = ref3;
+
+        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+        ref += (3 * ref_stride);
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
+        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *ref,
+                                                  int32_t ref_stride,
+                                                  int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, temp0, temp1, diff;
+    v16u8 ref0, ref1, ref2, ref3, ref4;
+    v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v8u16 comp0, comp1, comp2, comp3;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) {
+        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
+        ref += (4 * ref_stride);
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+
+        VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp0 += comp1;
+        comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
+        comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);
+
+        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
+        comp2 = __msa_hadd_u_h(temp0, temp0);
+        comp1 += comp2;
+        comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
+        comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
+        comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
+        diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
+        comp3 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp3;
+        comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
+        comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);
+
+        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp3 += comp0;
+        comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
+        comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
+        comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
+        diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
+        sad += __msa_hadd_u_h(diff, diff);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *ref,
+                                                   int32_t ref_stride,
+                                                   int32_t height)
+{
+    int32_t ht_cnt;
+    v16u8 src0, src1, src2, src3, comp, diff;
+    v16u8 temp0, temp1, temp2, temp3;
+    v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
+    v8u16 comp0, comp1, comp2, comp3;
+    v8u16 sad = { 0 };
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
+        LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
+        ref += (5 * ref_stride);
+
+        ILVRL_B2_UB(ref14, ref04, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp0 += comp2;
+        comp1 += comp3;
+        SRARI_H2_UH(comp0, comp1, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+        diff = __msa_asub_u_b(src0, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0;
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src1, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp0 += comp2;
+        comp1 += comp3;
+        SRARI_H2_UH(comp0, comp1, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+        diff = __msa_asub_u_b(src2, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0;
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src3, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
+        LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
+        ref += (3 * ref_stride);
+
+        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp0 += comp2;
+        comp1 += comp3;
+        SRARI_H2_UH(comp0, comp1, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+        diff = __msa_asub_u_b(src0, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0;
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src1, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp0 += comp2;
+        comp1 += comp3;
+        SRARI_H2_UH(comp0, comp1, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+        diff = __msa_asub_u_b(src2, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0;
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src3, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+    }
+
+    return (HADD_UH_U32(sad));
+}
+
+#define CALC_MSE_B(src, ref, var)                                    \
+{                                                                    \
+    v16u8 src_l0_m, src_l1_m;                                        \
+    v8i16 res_l0_m, res_l1_m;                                        \
+                                                                     \
+    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
+    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
+    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
+}
+
+static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride,
+                               uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height)
+{
+    int32_t ht_cnt;
+    uint32_t sse;
+    uint32_t src0, src1, src2, src3;
+    uint32_t ref0, ref1, ref2, ref3;
+    v16u8 src = { 0 };
+    v16u8 ref = { 0 };
+    v4i32 var = { 0 };
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) {
+        LW4(src_ptr, src_stride, src0, src1, src2, src3);
+        src_ptr += (4 * src_stride);
+        LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+        ref_ptr += (4 * ref_stride);
+
+        INSERT_W4_UB(src0, src1, src2, src3, src);
+        INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+        CALC_MSE_B(src, ref, var);
+    }
+
+    sse = HADD_SW_S32(var);
+
+    return sse;
+}
+
+static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride,
+                               uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height)
+{
+    int32_t ht_cnt;
+    uint32_t sse;
+    v16u8 src0, src1, src2, src3;
+    v16u8 ref0, ref1, ref2, ref3;
+    v4i32 var = { 0 };
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) {
+        LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+        src_ptr += (4 * src_stride);
+        LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+        ref_ptr += (4 * ref_stride);
+
+        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                    src0, src1, ref0, ref1);
+        CALC_MSE_B(src0, ref0, var);
+        CALC_MSE_B(src1, ref1, var);
+    }
+
+    sse = HADD_SW_S32(var);
+
+    return sse;
+}
+
+static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride,
+                                uint8_t *ref_ptr, int32_t ref_stride,
+                                int32_t height)
+{
+    int32_t ht_cnt;
+    uint32_t sse;
+    v16u8 src, ref;
+    v4i32 var = { 0 };
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) {
+        src = LD_UB(src_ptr);
+        src_ptr += src_stride;
+        ref = LD_UB(ref_ptr);
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src, ref, var);
+
+        src = LD_UB(src_ptr);
+        src_ptr += src_stride;
+        ref = LD_UB(ref_ptr);
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src, ref, var);
+
+        src = LD_UB(src_ptr);
+        src_ptr += src_stride;
+        ref = LD_UB(ref_ptr);
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src, ref, var);
+
+        src = LD_UB(src_ptr);
+        src_ptr += src_stride;
+        ref = LD_UB(ref_ptr);
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src, ref, var);
+    }
+
+    sse = HADD_SW_S32(var);
+
+    return sse;
+}
+
+static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *ref, int32_t ref_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v8i16 sum = { 0 };
+    v8i16 zero = { 0 };
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+    ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3,
+               src4, ref4, src5, ref5, src6, ref6, src7, ref7,
+               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
+    HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
+    HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
+    TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
+                       diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
+    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
+                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
+    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
+                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
+    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
+                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
+    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
+                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
+    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
+                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
+    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
+                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
+    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
+         diff0, diff1, diff2, diff3);
+    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
+    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
+    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
+    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
+    sum += __msa_add_a_h((v8i16) diff0, zero);
+    sum += __msa_add_a_h((v8i16) diff1, zero);
+    sum += __msa_add_a_h((v8i16) diff2, zero);
+    sum += __msa_add_a_h((v8i16) diff3, zero);
+
+    return (HADD_UH_U32(sum));
+}
+
+static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *ref, int32_t ref_stride)
+{
+    int32_t sum_res = 0;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v8i16 sum = { 0 };
+    v16i8 zero = { 0 };
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7,
+                       src0, src1, src2, src3, src4, src5, src6, src7);
+    ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3,
+               zero, src4, zero, src5, zero, src6, zero, src7,
+               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
+    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
+                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
+    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
+                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
+    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
+                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
+    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
+                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
+    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
+                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
+    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
+                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
+    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
+         diff0, diff1, diff2, diff3);
+    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
+    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
+    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
+    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
+    sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
+    sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
+    sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
+    sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
+    sum_res = (HADD_UH_U32(sum));
+    sum_res -= abs(temp0[0] + temp4[0]);
+
+    return sum_res;
+}
+
+int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                     ptrdiff_t stride, int height)
+{
+    return sad_16width_msa(src, stride, ref, stride, height);
+}
+
+int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                    ptrdiff_t stride, int height)
+{
+    return sad_8width_msa(src, stride, ref, stride, height);
+}
+
+int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h)
+{
+    return sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h)
+{
+    return sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         ptrdiff_t stride, int h)
+{
+    return sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h)
+{
+    return sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h)
+{
+    return sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h)
+{
+    return sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                 ptrdiff_t stride, int height)
+{
+    return sse_16width_msa(src, stride, ref, stride, height);
+}
+
+int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                ptrdiff_t stride, int height)
+{
+    return sse_8width_msa(src, stride, ref, stride, height);
+}
+
+int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                ptrdiff_t stride, int height)
+{
+    return sse_4width_msa(src, stride, ref, stride, height);
+}
+
+int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                             ptrdiff_t stride, int h)
+{
+    return hadamard_diff_8x8_msa(src, stride, dst, stride);
+}
+
+int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                              ptrdiff_t stride, int h)
+{
+    return hadamard_intra_8x8_msa(src, stride, dst, stride);
+}
+
+/* Hadamard Transform functions */
+#define WRAPPER8_16_SQ(name8, name16)                      \
+int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,  \
+           ptrdiff_t stride, int h)                        \
+{                                                          \
+    int score = 0;                                         \
+    score += name8(s, dst, src, stride, 8);                \
+    score += name8(s, dst + 8, src + 8, stride, 8);        \
+    if(h == 16) {                                          \
+        dst += 8 * stride;                                 \
+        src += 8 * stride;                                 \
+        score +=name8(s, dst, src, stride, 8);             \
+        score +=name8(s, dst + 8, src + 8, stride, 8);     \
+    }                                                      \
+    return score;                                          \
+}
+
+WRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa);
+WRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa);
diff --git a/libavcodec/mips/mpegaudiodsp_mips_fixed.c b/libavcodec/mips/mpegaudiodsp_mips_fixed.c
new file mode 100644
index 0000000000..86ea13d8d6
--- /dev/null
+++ b/libavcodec/mips/mpegaudiodsp_mips_fixed.c
@@ -0,0 +1,908 @@
+    /*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * MPEG Audio decoder optimized for MIPS fixed-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/mpegaudiodsp_template.c
+ */
+
+#include <string.h>
+
+#include "libavutil/mips/asmdefs.h"
+#include "libavcodec/mpegaudiodsp.h"
+
+static void ff_mpadsp_apply_window_mips_fixed(int32_t *synth_buf, int32_t *window,
+                               int *dither_state, int16_t *samples, int incr)
+{
+    register const int32_t *w, *w2, *p;
+    int j;
+    int16_t *samples2;
+    int w_asm, p_asm, w_asm1, p_asm1, w_asm2, p_asm2;
+    int w2_asm, w2_asm1, *p_temp1, *p_temp2;
+    int sum1 = 0;
+    int const min_asm = -32768, max_asm = 32767;
+    int temp1, temp2 = 0, temp3 = 0;
+    int64_t sum;
+
+    /* copy to avoid wrap */
+    memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
+    samples2 = samples + 31 * incr;
+    w = window;
+    w2 = window + 31;
+    sum = *dither_state;
+    p = synth_buf + 16;
+    p_temp1 = synth_buf + 16;
+    p_temp2 = synth_buf + 48;
+    temp1 = sum;
+
+    /**
+    * use of round_sample function from the original code is eliminated,
+    * changed with appropriate assembly instructions.
+    */
+    __asm__ volatile (
+         "mthi   $zero                                                    \n\t"
+         "mtlo   %[temp1]                                                 \n\t"
+         "lw     %[w_asm],  0(%[w])                                       \n\t"
+         "lw     %[p_asm],  0(%[p])                                       \n\t"
+         "lw     %[w_asm1], 64*4(%[w])                                    \n\t"
+         "lw     %[p_asm1], 64*4(%[p])                                    \n\t"
+         "lw     %[w_asm2], 128*4(%[w])                                   \n\t"
+         "lw     %[p_asm2], 128*4(%[p])                                   \n\t"
+         "madd   %[w_asm],  %[p_asm]                                      \n\t"
+         "madd   %[w_asm1], %[p_asm1]                                     \n\t"
+         "madd   %[w_asm2], %[p_asm2]                                     \n\t"
+         "lw     %[w_asm],  192*4(%[w])                                   \n\t"
+         "lw     %[p_asm],  192*4(%[p])                                   \n\t"
+         "lw     %[w_asm1], 256*4(%[w])                                   \n\t"
+         "lw     %[p_asm1], 256*4(%[p])                                   \n\t"
+         "lw     %[w_asm2], 320*4(%[w])                                   \n\t"
+         "lw     %[p_asm2], 320*4(%[p])                                   \n\t"
+         "madd   %[w_asm],  %[p_asm]                                      \n\t"
+         "madd   %[w_asm1], %[p_asm1]                                     \n\t"
+         "madd   %[w_asm2], %[p_asm2]                                     \n\t"
+         "lw     %[w_asm],  384*4(%[w])                                   \n\t"
+         "lw     %[p_asm],  384*4(%[p])                                   \n\t"
+         "lw     %[w_asm1], 448*4(%[w])                                   \n\t"
+         "lw     %[p_asm1], 448*4(%[p])                                   \n\t"
+         "lw     %[w_asm2], 32*4(%[w])                                    \n\t"
+         "lw     %[p_asm2], 32*4(%[p])                                    \n\t"
+         "madd   %[w_asm],  %[p_asm]                                      \n\t"
+         "madd   %[w_asm1], %[p_asm1]                                     \n\t"
+         "msub   %[w_asm2], %[p_asm2]                                     \n\t"
+         "lw     %[w_asm],  96*4(%[w])                                    \n\t"
+         "lw     %[p_asm],  96*4(%[p])                                    \n\t"
+         "lw     %[w_asm1], 160*4(%[w])                                   \n\t"
+         "lw     %[p_asm1], 160*4(%[p])                                   \n\t"
+         "lw     %[w_asm2], 224*4(%[w])                                   \n\t"
+         "lw     %[p_asm2], 224*4(%[p])                                   \n\t"
+         "msub   %[w_asm],  %[p_asm]                                      \n\t"
+         "msub   %[w_asm1], %[p_asm1]                                     \n\t"
+         "msub   %[w_asm2], %[p_asm2]                                     \n\t"
+         "lw     %[w_asm],  288*4(%[w])                                   \n\t"
+         "lw     %[p_asm],  288*4(%[p])                                   \n\t"
+         "lw     %[w_asm1], 352*4(%[w])                                   \n\t"
+         "lw     %[p_asm1], 352*4(%[p])                                   \n\t"
+         "msub   %[w_asm],  %[p_asm]                                      \n\t"
+         "lw     %[w_asm],  480*4(%[w])                                   \n\t"
+         "lw     %[p_asm],  480*4(%[p])                                   \n\t"
+         "lw     %[w_asm2], 416*4(%[w])                                   \n\t"
+         "lw     %[p_asm2], 416*4(%[p])                                   \n\t"
+         "msub   %[w_asm],  %[p_asm]                                      \n\t"
+         "msub   %[w_asm1], %[p_asm1]                                     \n\t"
+         "msub   %[w_asm2], %[p_asm2]                                     \n\t"
+
+         /*round_sample function from the original code is eliminated,
+          * changed with appropriate assembly instructions
+          * code example:
+
+         "extr.w  %[sum1],$ac0,24                                       \n\t"
+         "mflo %[temp3],  $ac0                                          \n\t"
+         "and  %[temp1],  %[temp3],  0x00ffffff                         \n\t"
+         "slt  %[temp2],  %[sum1],   %[min_asm]                         \n\t"
+         "movn %[sum1],   %[min_asm],%[temp2]                           \n\t"
+         "slt  %[temp2],  %[max_asm],%[sum1]                            \n\t"
+         "movn %[sum1],   %[max_asm],%[temp2]                           \n\t"
+         "sh   %[sum1],   0(%[samples])                                 \n\t"
+         */
+
+         "extr.w %[sum1],   $ac0,       24                                \n\t"
+         "mflo   %[temp3]                                                 \n\t"
+         PTR_ADDIU "%[w],   %[w],       4                                 \n\t"
+         "and    %[temp1],  %[temp3],   0x00ffffff                        \n\t"
+         "slt    %[temp2],  %[sum1],    %[min_asm]                        \n\t"
+         "movn   %[sum1],   %[min_asm], %[temp2]                          \n\t"
+         "slt    %[temp2],  %[max_asm], %[sum1]                           \n\t"
+         "movn   %[sum1],   %[max_asm], %[temp2]                          \n\t"
+         "sh     %[sum1],   0(%[samples])                                 \n\t"
+
+        : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
+          [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
+          [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2),
+          [sum1] "+r" (sum1), [w] "+r" (w), [temp3] "+r" (temp3)
+        : [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm),
+          [max_asm] "r" (max_asm)
+        : "memory", "hi","lo"
+     );
+
+     samples += incr;
+
+    /* we calculate two samples at the same time to avoid one memory
+       access per two sample */
+
+    for(j = 1; j < 16; j++) {
+        __asm__ volatile (
+             "mthi   $0,         $ac1                                      \n\t"
+             "mtlo   $0,         $ac1                                      \n\t"
+             "mthi   $0                                                    \n\t"
+             "mtlo   %[temp1]                                              \n\t"
+             PTR_ADDIU "%[p_temp1], %[p_temp1],    4                       \n\t"
+             "lw     %[w_asm],   0(%[w])                                   \n\t"
+             "lw     %[p_asm],   0(%[p_temp1])                             \n\t"
+             "lw     %[w2_asm],  0(%[w2])                                  \n\t"
+             "lw     %[w_asm1],  64*4(%[w])                                \n\t"
+             "lw     %[p_asm1],  64*4(%[p_temp1])                          \n\t"
+             "lw     %[w2_asm1], 64*4(%[w2])                               \n\t"
+             "madd   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "madd   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   128*4(%[w])                               \n\t"
+             "lw     %[p_asm],   128*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm],  128*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  192*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  192*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm1], 192*4(%[w2])                              \n\t"
+             "madd   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "madd   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   256*4(%[w])                               \n\t"
+             "lw     %[p_asm],   256*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm],  256*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  320*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  320*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm1], 320*4(%[w2])                              \n\t"
+             "madd   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "madd   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   384*4(%[w])                               \n\t"
+             "lw     %[p_asm],   384*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm],  384*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  448*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  448*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm1], 448*4(%[w2])                              \n\t"
+             "madd   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "madd   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             PTR_ADDIU "%[p_temp2], %[p_temp2],   -4                      \n\t"
+             "lw     %[w_asm],   32*4(%[w])                                \n\t"
+             "lw     %[p_asm],   0(%[p_temp2])                             \n\t"
+             "lw     %[w2_asm],  32*4(%[w2])                               \n\t"
+             "lw     %[w_asm1],  96*4(%[w])                                \n\t"
+             "lw     %[p_asm1],  64*4(%[p_temp2])                          \n\t"
+             "lw     %[w2_asm1], 96*4(%[w2])                               \n\t"
+             "msub   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "msub   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   160*4(%[w])                               \n\t"
+             "lw     %[p_asm],   128*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm],  160*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  224*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  192*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm1], 224*4(%[w2])                              \n\t"
+             "msub   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "msub   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   288*4(%[w])                               \n\t"
+             "lw     %[p_asm],   256*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm],  288*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  352*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  320*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm1], 352*4(%[w2])                              \n\t"
+             "msub   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "msub   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   416*4(%[w])                               \n\t"
+             "lw     %[p_asm],   384*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm],  416*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  480*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  448*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm1], 480*4(%[w2])                              \n\t"
+             "msub   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             PTR_ADDIU "%[w],    %[w],             4                       \n\t"
+             PTR_ADDIU "%[w2],   %[w2],            -4                      \n\t"
+             "mflo   %[temp2]                                              \n\t"
+             "extr.w %[sum1],    $ac0,             24                      \n\t"
+             "li     %[temp3],   1                                         \n\t"
+             "and    %[temp1],   %[temp2],         0x00ffffff              \n\t"
+             "madd   $ac1,       %[temp1],         %[temp3]                \n\t"
+             "slt    %[temp2],   %[sum1],          %[min_asm]              \n\t"
+             "movn   %[sum1],    %[min_asm],       %[temp2]                \n\t"
+             "slt    %[temp2],   %[max_asm],       %[sum1]                 \n\t"
+             "movn   %[sum1],    %[max_asm],       %[temp2]                \n\t"
+             "sh     %[sum1],    0(%[samples])                             \n\t"
+             "mflo   %[temp3],   $ac1                                      \n\t"
+             "extr.w %[sum1],    $ac1,             24                      \n\t"
+             "and    %[temp1],   %[temp3],         0x00ffffff              \n\t"
+             "slt    %[temp2],   %[sum1],          %[min_asm]              \n\t"
+             "movn   %[sum1],    %[min_asm],       %[temp2]                \n\t"
+             "slt    %[temp2],   %[max_asm],       %[sum1]                 \n\t"
+             "movn   %[sum1],    %[max_asm],       %[temp2]                \n\t"
+             "sh     %[sum1],    0(%[samples2])                            \n\t"
+
+            : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
+              [p_asm1] "=&r" (p_asm1), [w2_asm1] "=&r" (w2_asm1),
+              [w2_asm] "=&r" (w2_asm), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
+              [p_temp1] "+r" (p_temp1), [p_temp2] "+r" (p_temp2), [sum1] "+r" (sum1),
+              [w] "+r" (w), [w2] "+r" (w2), [samples] "+r" (samples),
+              [samples2] "+r" (samples2), [temp3] "+r" (temp3)
+            : [min_asm] "r" (min_asm), [max_asm] "r" (max_asm)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo"
+        );
+
+        samples += incr;
+        samples2 -= incr;
+    }
+
+    p = synth_buf + 32;
+
+    __asm__ volatile (
+        "mthi   $0                                                        \n\t"
+        "mtlo   %[temp1]                                                  \n\t"
+        "lw     %[w_asm],  32*4(%[w])                                     \n\t"
+        "lw     %[p_asm],  0(%[p])                                        \n\t"
+        "lw     %[w_asm1], 96*4(%[w])                                     \n\t"
+        "lw     %[p_asm1], 64*4(%[p])                                     \n\t"
+        "lw     %[w_asm2], 160*4(%[w])                                    \n\t"
+        "lw     %[p_asm2], 128*4(%[p])                                    \n\t"
+        "msub   %[w_asm],  %[p_asm]                                       \n\t"
+        "msub   %[w_asm1], %[p_asm1]                                      \n\t"
+        "msub   %[w_asm2], %[p_asm2]                                      \n\t"
+        "lw     %[w_asm],  224*4(%[w])                                    \n\t"
+        "lw     %[p_asm],  192*4(%[p])                                    \n\t"
+        "lw     %[w_asm1], 288*4(%[w])                                    \n\t"
+        "lw     %[p_asm1], 256*4(%[p])                                    \n\t"
+        "lw     %[w_asm2], 352*4(%[w])                                    \n\t"
+        "lw     %[p_asm2], 320*4(%[p])                                    \n\t"
+        "msub   %[w_asm],  %[p_asm]                                       \n\t"
+        "msub   %[w_asm1], %[p_asm1]                                      \n\t"
+        "msub   %[w_asm2], %[p_asm2]                                      \n\t"
+        "lw     %[w_asm],  416*4(%[w])                                    \n\t"
+        "lw     %[p_asm],  384*4(%[p])                                    \n\t"
+        "lw     %[w_asm1], 480*4(%[w])                                    \n\t"
+        "lw     %[p_asm1], 448*4(%[p])                                    \n\t"
+        "msub   %[w_asm],  %[p_asm]                                       \n\t"
+        "msub   %[w_asm1], %[p_asm1]                                      \n\t"
+        "extr.w %[sum1],   $ac0,       24                                 \n\t"
+        "mflo   %[temp2]                                                  \n\t"
+        "and    %[temp1],  %[temp2],   0x00ffffff                         \n\t"
+        "slt    %[temp2],  %[sum1],    %[min_asm]                         \n\t"
+        "movn   %[sum1],   %[min_asm], %[temp2]                           \n\t"
+        "slt    %[temp2],  %[max_asm], %[sum1]                            \n\t"
+        "movn   %[sum1],   %[max_asm], %[temp2]                           \n\t"
+        "sh     %[sum1],   0(%[samples])                                  \n\t"
+
+        : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
+          [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
+          [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2), [sum1] "+r" (sum1)
+        : [w] "r" (w), [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm),
+          [max_asm] "r" (max_asm)
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo"
+     );
+
+    *dither_state= temp1;
+}
+
+static void imdct36_mips_fixed(int *out, int *buf, int *in, int *win)
+{
+    int j;
+    int t0, t1, t2, t3, s0, s1, s2, s3;
+    int tmp[18], *tmp1, *in1;
+    /* temporary variables */
+    int temp_reg1, temp_reg2, temp_reg3, temp_reg4, temp_reg5, temp_reg6;
+    int t4, t5, t6, t8, t7;
+
+   /* values defined in macros and tables are
+    * eliminated - they are directly loaded in appropriate variables
+    */
+    int const C_1  =  4229717092; /* cos(pi*1/18)*2  */
+    int const C_2  =  4035949074; /* cos(pi*2/18)*2  */
+    int const C_3  =  575416510;  /* -cos(pi*3/18)*2 */
+    int const C_3A =  3719550786; /* cos(pi*3/18)*2  */
+    int const C_4  =  1004831466; /* -cos(pi*4/18)*2 */
+    int const C_5  =  1534215534; /* -cos(pi*5/18)*2 */
+    int const C_7  = -1468965330; /* -cos(pi*7/18)*2 */
+    int const C_8  = -745813244;  /* -cos(pi*8/18)*2 */
+
+   /*
+    * instructions of the first two loops are reorganized and loops are unrolled,
+    * in order to eliminate unnecessary readings and writings in array
+    */
+
+    __asm__ volatile (
+        "lw   %[t1], 17*4(%[in])                                         \n\t"
+        "lw   %[t2], 16*4(%[in])                                         \n\t"
+        "lw   %[t3], 15*4(%[in])                                         \n\t"
+        "lw   %[t4], 14*4(%[in])                                         \n\t"
+        "addu %[t1], %[t1],      %[t2]                                   \n\t"
+        "addu %[t2], %[t2],      %[t3]                                   \n\t"
+        "addu %[t3], %[t3],      %[t4]                                   \n\t"
+        "lw   %[t5], 13*4(%[in])                                         \n\t"
+        "addu %[t1], %[t1],      %[t3]                                   \n\t"
+        "sw   %[t2], 16*4(%[in])                                         \n\t"
+        "lw   %[t6], 12*4(%[in])                                         \n\t"
+        "sw   %[t1], 17*4(%[in])                                         \n\t"
+        "addu %[t4], %[t4],      %[t5]                                   \n\t"
+        "addu %[t5], %[t5],      %[t6]                                   \n\t"
+        "lw   %[t7], 11*4(%[in])                                         \n\t"
+        "addu %[t3], %[t3],      %[t5]                                   \n\t"
+        "sw   %[t4], 14*4(%[in])                                         \n\t"
+        "lw   %[t8], 10*4(%[in])                                         \n\t"
+        "sw   %[t3], 15*4(%[in])                                         \n\t"
+        "addu %[t6], %[t6],      %[t7]                                   \n\t"
+        "addu %[t7], %[t7],      %[t8]                                   \n\t"
+        "sw   %[t6], 12*4(%[in])                                         \n\t"
+        "addu %[t5], %[t5],      %[t7]                                   \n\t"
+        "lw   %[t1], 9*4(%[in])                                          \n\t"
+        "lw   %[t2], 8*4(%[in])                                          \n\t"
+        "sw   %[t5], 13*4(%[in])                                         \n\t"
+        "addu %[t8], %[t8],      %[t1]                                   \n\t"
+        "addu %[t1], %[t1],      %[t2]                                   \n\t"
+        "sw   %[t8], 10*4(%[in])                                         \n\t"
+        "addu %[t7], %[t7],      %[t1]                                   \n\t"
+        "lw   %[t3], 7*4(%[in])                                          \n\t"
+        "lw   %[t4], 6*4(%[in])                                          \n\t"
+        "sw   %[t7], 11*4(%[in])                                         \n\t"
+        "addu %[t2], %[t2],      %[t3]                                   \n\t"
+        "addu %[t3], %[t3],      %[t4]                                   \n\t"
+        "sw   %[t2], 8*4(%[in])                                          \n\t"
+        "addu %[t1], %[t1],      %[t3]                                   \n\t"
+        "lw   %[t5], 5*4(%[in])                                          \n\t"
+        "lw   %[t6], 4*4(%[in])                                          \n\t"
+        "sw   %[t1], 9*4(%[in])                                          \n\t"
+        "addu %[t4], %[t4],      %[t5]                                   \n\t"
+        "addu %[t5], %[t5],      %[t6]                                   \n\t"
+        "sw   %[t4], 6*4(%[in])                                          \n\t"
+        "addu %[t3], %[t3],      %[t5]                                   \n\t"
+        "lw   %[t7], 3*4(%[in])                                          \n\t"
+        "lw   %[t8], 2*4(%[in])                                          \n\t"
+        "sw   %[t3], 7*4(%[in])                                          \n\t"
+        "addu %[t6], %[t6],      %[t7]                                   \n\t"
+        "addu %[t7], %[t7],      %[t8]                                   \n\t"
+        "sw   %[t6], 4*4(%[in])                                          \n\t"
+        "addu %[t5], %[t5],      %[t7]                                   \n\t"
+        "lw   %[t1], 1*4(%[in])                                          \n\t"
+        "lw   %[t2], 0*4(%[in])                                          \n\t"
+        "sw   %[t5], 5*4(%[in])                                          \n\t"
+        "addu %[t8], %[t8],      %[t1]                                   \n\t"
+        "addu %[t1], %[t1],      %[t2]                                   \n\t"
+        "sw   %[t8], 2*4(%[in])                                          \n\t"
+        "addu %[t7], %[t7],      %[t1]                                   \n\t"
+        "sw   %[t7], 3*4(%[in])                                          \n\t"
+        "sw   %[t1], 1*4(%[in])                                          \n\t"
+
+        : [in] "+r" (in), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3),
+          [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6),
+          [t7] "=&r" (t7), [t8] "=&r" (t8)
+        :
+        : "memory"
+    );
+
+    for(j = 0; j < 2; j++) {
+
+        tmp1 = tmp + j;
+        in1 = in + j;
+
+         /**
+         *  Original constants are multiplied by two in advanced
+         *  for assembly optimization (e.g. C_2 = 2 * C2).
+         *  That can lead to overflow in operations where they are used.
+         *
+         *  Example of the solution:
+         *
+         *  in original code:
+         *  t0 = ((int64_t)(in1[2*2] + in1[2*4]) * (int64_t)(2*C2))>>32
+         *
+         *  in assembly:
+         *  C_2 = 2 * C2;
+         *   .
+         *   .
+         *  "lw   %[t7],       4*4(%[in1])                               \n\t"
+         *  "lw   %[t8],       8*4(%[in1])                               \n\t"
+         *  "addu %[temp_reg2],%[t7],       %[t8]                        \n\t"
+         *  "multu %[C_2],     %[temp_reg2]                              \n\t"
+         *  "mfhi %[temp_reg1]                                           \n\t"
+         *  "sra  %[temp_reg2],%[temp_reg2],31                           \n\t"
+         *  "move %[t0],       $0                                        \n\t"
+         *  "movn %[t0],       %[C_2],      %[temp_reg2]                 \n\t"
+         *  "sub  %[t0],       %[temp_reg1],%[t0]                        \n\t"
+         */
+
+        __asm__ volatile (
+            "lw    %[t7],        4*4(%[in1])                               \n\t"
+            "lw    %[t8],        8*4(%[in1])                               \n\t"
+            "lw    %[t6],        16*4(%[in1])                              \n\t"
+            "lw    %[t4],        0*4(%[in1])                               \n\t"
+            "addu  %[temp_reg2], %[t7],        %[t8]                       \n\t"
+            "addu  %[t2],        %[t6],        %[t8]                       \n\t"
+            "multu %[C_2],       %[temp_reg2]                              \n\t"
+            "lw    %[t5],        12*4(%[in1])                              \n\t"
+            "sub   %[t2],        %[t2],        %[t7]                       \n\t"
+            "sub   %[t1],        %[t4],        %[t5]                       \n\t"
+            "sra   %[t3],        %[t5],        1                           \n\t"
+            "sra   %[temp_reg1], %[t2],        1                           \n\t"
+            "addu  %[t3],        %[t3],        %[t4]                       \n\t"
+            "sub   %[temp_reg1], %[t1],        %[temp_reg1]                \n\t"
+            "sra   %[temp_reg2], %[temp_reg2], 31                          \n\t"
+            "sw    %[temp_reg1], 6*4(%[tmp1])                              \n\t"
+            "move  %[t0],        $0                                        \n\t"
+            "movn  %[t0],        %[C_2],       %[temp_reg2]                \n\t"
+            "mfhi  %[temp_reg1]                                            \n\t"
+            "addu  %[t1],        %[t1],        %[t2]                       \n\t"
+            "sw    %[t1],        16*4(%[tmp1])                             \n\t"
+            "sub   %[temp_reg4], %[t8],        %[t6]                       \n\t"
+            "add   %[temp_reg2], %[t7],        %[t6]                       \n\t"
+            "mult  $ac1,         %[C_8],       %[temp_reg4]                \n\t"
+            "multu $ac2,         %[C_4],       %[temp_reg2]                \n\t"
+            "sub   %[t0],        %[temp_reg1], %[t0]                       \n\t"
+            "sra   %[temp_reg1], %[temp_reg2], 31                          \n\t"
+            "move  %[t2],        $0                                        \n\t"
+            "movn  %[t2],        %[C_4],       %[temp_reg1]                \n\t"
+            "mfhi  %[t1],        $ac1                                      \n\t"
+            "mfhi  %[temp_reg1], $ac2                                      \n\t"
+            "lw    %[t6],        10*4(%[in1])                              \n\t"
+            "lw    %[t8],        14*4(%[in1])                              \n\t"
+            "lw    %[t7],        2*4(%[in1])                               \n\t"
+            "lw    %[t4],        6*4(%[in1])                               \n\t"
+            "sub   %[temp_reg3], %[t3],        %[t0]                       \n\t"
+            "add   %[temp_reg4], %[t3],        %[t0]                       \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[temp_reg2]                \n\t"
+            "add   %[temp_reg4], %[temp_reg4], %[t1]                       \n\t"
+            "sub   %[t2],        %[temp_reg1], %[t2]                       \n\t"
+            "sw    %[temp_reg4], 2*4(%[tmp1])                              \n\t"
+            "sub   %[temp_reg3], %[temp_reg3], %[t2]                       \n\t"
+            "add   %[temp_reg1], %[t3],        %[t2]                       \n\t"
+            "sw    %[temp_reg3], 10*4(%[tmp1])                             \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[t1]                       \n\t"
+            "addu  %[temp_reg2], %[t6],        %[t8]                       \n\t"
+            "sw    %[temp_reg1], 14*4(%[tmp1])                             \n\t"
+            "sub   %[temp_reg2], %[temp_reg2], %[t7]                       \n\t"
+            "addu  %[temp_reg3], %[t7],        %[t6]                       \n\t"
+            "multu $ac3,         %[C_3],       %[temp_reg2]                \n\t"
+            "multu %[C_1],       %[temp_reg3]                              \n\t"
+            "sra   %[temp_reg1], %[temp_reg2], 31                          \n\t"
+            "move  %[t1],        $0                                        \n\t"
+            "sra   %[temp_reg3], %[temp_reg3], 31                          \n\t"
+            "movn  %[t1],        %[C_3],       %[temp_reg1]                \n\t"
+            "mfhi  %[temp_reg1], $ac3                                      \n\t"
+            "mfhi  %[temp_reg4]                                            \n\t"
+            "move  %[t2],        $0                                        \n\t"
+            "movn  %[t2],        %[C_1],       %[temp_reg3]                \n\t"
+            "sub   %[temp_reg3], %[t6],        %[t8]                       \n\t"
+            "sub   %[t2],        %[temp_reg4], %[t2]                       \n\t"
+            "multu $ac1,         %[C_7],       %[temp_reg3]                \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[temp_reg2]                \n\t"
+            "sra   %[temp_reg4], %[temp_reg3], 31                          \n\t"
+            "sub   %[t1],        %[temp_reg1], %[t1]                       \n\t"
+            "move  %[t3],        $0                                        \n\t"
+            "sw    %[t1],        4*4(%[tmp1])                              \n\t"
+            "movn  %[t3],        %[C_7],       %[temp_reg4]                \n\t"
+            "multu $ac2,         %[C_3A],      %[t4]                       \n\t"
+            "add   %[temp_reg2], %[t7],        %[t8]                       \n\t"
+            "move  %[t1],        $0                                        \n\t"
+            "mfhi  %[temp_reg4], $ac1                                      \n\t"
+            "multu $ac3,%[C_5],  %[temp_reg2]                              \n\t"
+            "move  %[t0],        $0                                        \n\t"
+            "sra   %[temp_reg1], %[temp_reg2], 31                          \n\t"
+            "movn  %[t1],%[C_5], %[temp_reg1]                              \n\t"
+            "sub   %[temp_reg4], %[temp_reg4], %[temp_reg3]                \n\t"
+            "mfhi  %[temp_reg1], $ac3                                      \n\t"
+            "sra   %[temp_reg3], %[t4],        31                          \n\t"
+            "movn  %[t0],        %[C_3A],      %[temp_reg3]                \n\t"
+            "mfhi  %[temp_reg3], $ac2                                      \n\t"
+            "sub   %[t3],        %[temp_reg4], %[t3]                       \n\t"
+            "add   %[temp_reg4], %[t3],        %[t2]                       \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[temp_reg2]                \n\t"
+            "sub   %[t1],        %[temp_reg1], %[t1]                       \n\t"
+            "sub   %[t0],        %[temp_reg3], %[t0]                       \n\t"
+            "add   %[temp_reg1], %[t2],        %[t1]                       \n\t"
+            "add   %[temp_reg4], %[temp_reg4], %[t0]                       \n\t"
+            "sub   %[temp_reg2], %[t3],        %[t1]                       \n\t"
+            "sw    %[temp_reg4], 0*4(%[tmp1])                              \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[t0]                       \n\t"
+            "sub   %[temp_reg2], %[temp_reg2], %[t0]                       \n\t"
+            "sw    %[temp_reg1], 12*4(%[tmp1])                             \n\t"
+            "sw    %[temp_reg2], 8*4(%[tmp1])                              \n\t"
+
+            : [t7] "=&r" (t7), [temp_reg1] "=&r" (temp_reg1),
+              [temp_reg2] "=&r" (temp_reg2), [temp_reg4] "=&r" (temp_reg4),
+              [temp_reg3] "=&r" (temp_reg3), [t8] "=&r" (t8), [t0] "=&r" (t0),
+              [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r"(t6), [t2] "=&r" (t2),
+              [t3] "=&r" (t3), [t1] "=&r" (t1)
+            : [C_2] "r" (C_2), [in1] "r" (in1), [tmp1] "r" (tmp1), [C_8] "r" (C_8),
+              [C_4] "r" (C_4), [C_3] "r" (C_3), [C_1] "r" (C_1), [C_7] "r" (C_7),
+              [C_3A] "r" (C_3A), [C_5] "r" (C_5)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+              "$ac3hi", "$ac3lo"
+         );
+    }
+
+    /**
+    * loop is unrolled four times
+    *
+    * values defined in tables(icos36[] and icos36h[]) are not loaded from
+    * these tables - they are directly loaded in appropriate registers
+    *
+    */
+
+    __asm__ volatile (
+        "lw     %[t2],        1*4(%[tmp])                                  \n\t"
+        "lw     %[t3],        3*4(%[tmp])                                  \n\t"
+        "lw     %[t0],        0*4(%[tmp])                                  \n\t"
+        "lw     %[t1],        2*4(%[tmp])                                  \n\t"
+        "addu   %[temp_reg1], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg2], 0x807D2B1E                                   \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "multu  %[temp_reg2], %[temp_reg1]                                 \n\t"
+        "sra    %[temp_reg1], %[temp_reg1], 31                             \n\t"
+        "movn   %[s1],        %[temp_reg2], %[temp_reg1]                   \n\t"
+        "sub    %[temp_reg3], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg4], 0x2de5151                                    \n\t"
+        "mfhi   %[temp_reg2]                                               \n\t"
+        "addu   %[s0],        %[t1],        %[t0]                          \n\t"
+        "lw     %[temp_reg5], 9*4(%[win])                                  \n\t"
+        "mult   $ac1,         %[temp_reg4], %[temp_reg3]                   \n\t"
+        "lw     %[temp_reg6], 4*9*4(%[buf])                                \n\t"
+        "sub    %[s2],        %[t1],        %[t0]                          \n\t"
+        "lw     %[temp_reg3], 29*4(%[win])                                 \n\t"
+        "subu   %[s1],        %[temp_reg2], %[s1]                          \n\t"
+        "lw     %[temp_reg4], 28*4(%[win])                                 \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "extr.w %[s3],        $ac1,23                                      \n\t"
+        "mult   $ac2,         %[t0],        %[temp_reg3]                   \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "lw     %[temp_reg1], 4*8*4(%[buf])                                \n\t"
+        "mult   %[t1],        %[temp_reg5]                                 \n\t"
+        "lw     %[temp_reg2], 8*4(%[win])                                  \n\t"
+        "mfhi   %[temp_reg3], $ac2                                         \n\t"
+        "mult   $ac3,         %[t0],        %[temp_reg4]                   \n\t"
+        "add    %[t0],        %[s2],        %[s3]                          \n\t"
+        "mfhi   %[temp_reg5]                                               \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg2]                   \n\t"
+        "sub    %[t1],        %[s2],        %[s3]                          \n\t"
+        "sw     %[temp_reg3], 4*9*4(%[buf])                                \n\t"
+        "mfhi   %[temp_reg4], $ac3                                         \n\t"
+        "lw     %[temp_reg3], 37*4(%[win])                                 \n\t"
+        "mfhi   %[temp_reg2], $ac1                                         \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "lw     %[temp_reg6], 17*4(%[win])                                 \n\t"
+        "sw     %[temp_reg5], 32*9*4(%[out])                               \n\t"
+        "sw     %[temp_reg4], 4*8*4(%[buf])                                \n\t"
+        "mult   %[t1],        %[temp_reg6]                                 \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "lw     %[temp_reg2], 0*4(%[win])                                  \n\t"
+        "lw     %[temp_reg5], 4*17*4(%[buf])                               \n\t"
+        "sw     %[temp_reg1], 8*32*4(%[out])                               \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg2]                   \n\t"
+        "lw     %[temp_reg4], 20*4(%[win])                                 \n\t"
+        "lw     %[temp_reg1], 0(%[buf])                                    \n\t"
+        "mult   $ac2,         %[t0],        %[temp_reg3]                   \n\t"
+        "mult   %[t0],        %[temp_reg4]                                 \n\t"
+        "mfhi   %[temp_reg2], $ac1                                         \n\t"
+        "lw     %[t0],        4*4(%[tmp])                                  \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "mfhi   %[temp_reg3], $ac2                                         \n\t"
+        "mfhi   %[temp_reg4]                                               \n\t"
+        "sw     %[temp_reg5], 17*32*4(%[out])                              \n\t"
+        "lw     %[t1],        6*4(%[tmp])                                  \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "lw     %[t2],        5*4(%[tmp])                                  \n\t"
+        "sw     %[temp_reg1], 0*32*4(%[out])                               \n\t"
+        "addu   %[s0],        %[t1],        %[t0]                          \n\t"
+        "sw     %[temp_reg3], 4*17*4(%[buf])                               \n\t"
+        "lw     %[t3],        7*4(%[tmp])                                  \n\t"
+        "sub    %[s2],        %[t1],        %[t0]                          \n\t"
+        "sw     %[temp_reg4], 0(%[buf])                                    \n\t"
+        "addu   %[temp_reg5], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg6], 0x8483EE0C                                   \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "multu  %[temp_reg6], %[temp_reg5]                                 \n\t"
+        "sub    %[temp_reg1], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg2], 0xf746ea                                     \n\t"
+        "sra    %[temp_reg5], %[temp_reg5], 31                             \n\t"
+        "mult   $ac1,         %[temp_reg2], %[temp_reg1]                   \n\t"
+        "movn   %[s1],        %[temp_reg6], %[temp_reg5]                   \n\t"
+        "mfhi   %[temp_reg5]                                               \n\t"
+        "lw     %[temp_reg3], 10*4(%[win])                                 \n\t"
+        "lw     %[temp_reg4], 4*10*4(%[buf])                               \n\t"
+        "extr.w %[s3],        $ac1,         23                             \n\t"
+        "lw     %[temp_reg1], 4*7*4(%[buf])                                \n\t"
+        "lw     %[temp_reg2], 7*4(%[win])                                  \n\t"
+        "lw     %[temp_reg6], 30*4(%[win])                                 \n\t"
+        "subu   %[s1],        %[temp_reg5], %[s1]                          \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg3]                   \n\t"
+        "mult   $ac3,         %[t1],        %[temp_reg2]                   \n\t"
+        "mult   %[t0],        %[temp_reg6]                                 \n\t"
+        "lw     %[temp_reg5], 27*4(%[win])                                 \n\t"
+        "mult   $ac1,         %[t0],        %[temp_reg5]                   \n\t"
+        "mfhi   %[temp_reg3], $ac2                                         \n\t"
+        "mfhi   %[temp_reg2], $ac3                                         \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "add    %[t0],        %[s2],        %[s3]                          \n\t"
+        "sub    %[t1],        %[s2],        %[s3]                          \n\t"
+        "add    %[temp_reg3], %[temp_reg3], %[temp_reg4]                   \n\t"
+        "lw     %[temp_reg4], 16*4(%[win])                                 \n\t"
+        "mfhi   %[temp_reg5], $ac1                                         \n\t"
+        "sw     %[temp_reg3], 32*10*4(%[out])                              \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "lw     %[temp_reg3], 4*16*4(%[buf])                               \n\t"
+        "sw     %[temp_reg6], 4*10*4(%[buf])                               \n\t"
+        "sw     %[temp_reg1], 7*32*4(%[out])                               \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg4]                   \n\t"
+        "sw     %[temp_reg5], 4*7*4(%[buf])                                \n\t"
+        "lw     %[temp_reg6], 1*4(%[win])                                  \n\t"
+        "lw     %[temp_reg5], 4*1*4(%[buf])                                \n\t"
+        "lw     %[temp_reg1], 36*4(%[win])                                 \n\t"
+        "mult   $ac3,         %[t1],        %[temp_reg6]                   \n\t"
+        "lw     %[temp_reg2], 21*4(%[win])                                 \n\t"
+        "mfhi   %[temp_reg4], $ac2                                         \n\t"
+        "mult   %[t0],        %[temp_reg1]                                 \n\t"
+        "mult   $ac1,         %[t0],%[temp_reg2]                           \n\t"
+        "lw     %[t0],        8*4(%[tmp])                                  \n\t"
+        "mfhi   %[temp_reg6], $ac3                                         \n\t"
+        "lw     %[t1],        10*4(%[tmp])                                 \n\t"
+        "lw     %[t3],        11*4(%[tmp])                                 \n\t"
+        "mfhi   %[temp_reg1]                                               \n\t"
+        "add    %[temp_reg3], %[temp_reg3], %[temp_reg4]                   \n\t"
+        "lw     %[t2],        9*4(%[tmp])                                  \n\t"
+        "mfhi   %[temp_reg2], $ac1                                         \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "sw     %[temp_reg3], 16*32*4(%[out])                              \n\t"
+        "sw     %[temp_reg5], 1*32*4(%[out])                               \n\t"
+        "sw     %[temp_reg1], 4*16*4(%[buf])                               \n\t"
+        "addu   %[temp_reg3], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg4], 0x8D3B7CD6                                   \n\t"
+        "sw     %[temp_reg2], 4*1*4(%[buf])                                \n\t"
+        "multu  %[temp_reg4],%[temp_reg3]                                  \n\t"
+        "sra    %[temp_reg3], %[temp_reg3], 31                             \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "movn   %[s1],        %[temp_reg4], %[temp_reg3]                   \n\t"
+        "addu   %[s0],        %[t1],        %[t0]                          \n\t"
+        "mfhi   %[temp_reg3]                                               \n\t"
+        "sub    %[s2],        %[t1],        %[t0]                          \n\t"
+        "sub    %[temp_reg5], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg6], 0x976fd9                                     \n\t"
+        "lw     %[temp_reg2], 11*4(%[win])                                 \n\t"
+        "lw     %[temp_reg1], 4*11*4(%[buf])                               \n\t"
+        "mult   $ac1,         %[temp_reg6], %[temp_reg5]                   \n\t"
+        "subu   %[s1],        %[temp_reg3], %[s1]                          \n\t"
+        "lw     %[temp_reg5], 31*4(%[win])                                 \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg2]                   \n\t"
+        "mult   %[t0],        %[temp_reg5]                                 \n\t"
+        "lw     %[temp_reg4], 6*4(%[win])                                  \n\t"
+        "extr.w %[s3],        $ac1,         23                             \n\t"
+        "lw     %[temp_reg3], 4*6*4(%[buf])                                \n\t"
+        "mfhi   %[temp_reg2], $ac2                                         \n\t"
+        "lw     %[temp_reg6], 26*4(%[win])                                 \n\t"
+        "mfhi   %[temp_reg5]                                               \n\t"
+        "mult   $ac3,         %[t1],        %[temp_reg4]                   \n\t"
+        "mult   $ac1,         %[t0],        %[temp_reg6]                   \n\t"
+        "add    %[t0],        %[s2],        %[s3]                          \n\t"
+        "sub    %[t1],        %[s2],        %[s3]                          \n\t"
+        "add    %[temp_reg2], %[temp_reg2], %[temp_reg1]                   \n\t"
+        "mfhi   %[temp_reg4], $ac3                                         \n\t"
+        "mfhi   %[temp_reg6], $ac1                                         \n\t"
+        "sw     %[temp_reg5], 4*11*4(%[buf])                               \n\t"
+        "sw     %[temp_reg2], 32*11*4(%[out])                              \n\t"
+        "lw     %[temp_reg1], 4*15*4(%[buf])                               \n\t"
+        "add    %[temp_reg3], %[temp_reg3], %[temp_reg4]                   \n\t"
+        "lw     %[temp_reg2], 15*4(%[win])                                 \n\t"
+        "sw     %[temp_reg3], 6*32*4(%[out])                               \n\t"
+        "sw     %[temp_reg6], 4*6*4(%[buf])                                \n\t"
+        "mult   %[t1],        %[temp_reg2]                                 \n\t"
+        "lw     %[temp_reg3], 2*4(%[win])                                  \n\t"
+        "lw     %[temp_reg4], 4*2*4(%[buf])                                \n\t"
+        "lw     %[temp_reg5], 35*4(%[win])                                 \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg3]                   \n\t"
+        "mfhi   %[temp_reg2]                                               \n\t"
+        "lw     %[temp_reg6], 22*4(%[win])                                 \n\t"
+        "mult   $ac2,         %[t0],        %[temp_reg5]                   \n\t"
+        "lw     %[t1],        14*4(%[tmp])                                 \n\t"
+        "mult   $ac3,         %[t0],        %[temp_reg6]                   \n\t"
+        "lw     %[t0],        12*4(%[tmp])                                 \n\t"
+        "mfhi   %[temp_reg3], $ac1                                         \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "mfhi   %[temp_reg5], $ac2                                         \n\t"
+        "sw     %[temp_reg1], 15*32*4(%[out])                              \n\t"
+        "mfhi   %[temp_reg6], $ac3                                         \n\t"
+        "lw     %[t2],        13*4(%[tmp])                                 \n\t"
+        "lw     %[t3],        15*4(%[tmp])                                 \n\t"
+        "add    %[temp_reg4], %[temp_reg4], %[temp_reg3]                   \n\t"
+        "sw     %[temp_reg5], 4*15*4(%[buf])                               \n\t"
+        "addu   %[temp_reg1], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg2], 0x9C42577C                                   \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "multu  %[temp_reg2], %[temp_reg1]                                 \n\t"
+        "sw     %[temp_reg4], 2*32*4(%[out])                               \n\t"
+        "sra    %[temp_reg1], %[temp_reg1], 31                             \n\t"
+        "movn   %[s1],        %[temp_reg2], %[temp_reg1]                   \n\t"
+        "sub    %[temp_reg3], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg4], 0x6f94a2                                     \n\t"
+        "mfhi   %[temp_reg1]                                               \n\t"
+        "addu   %[s0],        %[t1],        %[t0]                          \n\t"
+        "sw     %[temp_reg6], 4*2*4(%[buf])                                \n\t"
+        "mult   $ac1,         %[temp_reg4], %[temp_reg3]                   \n\t"
+        "sub    %[s2],        %[t1],        %[t0]                          \n\t"
+        "lw     %[temp_reg5], 12*4(%[win])                                 \n\t"
+        "lw     %[temp_reg6], 4*12*4(%[buf])                               \n\t"
+        "subu   %[s1],        %[temp_reg1], %[s1]                          \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "lw     %[temp_reg3], 32*4(%[win])                                 \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg5]                   \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "extr.w %[s3],        $ac1,         23                             \n\t"
+        "lw     %[temp_reg2], 5*4(%[win])                                  \n\t"
+        "mult   %[t0],        %[temp_reg3]                                 \n\t"
+        "mfhi   %[temp_reg5], $ac2                                         \n\t"
+        "lw     %[temp_reg4], 25*4(%[win])                                 \n\t"
+        "lw     %[temp_reg1], 4*5*4(%[buf])                                \n\t"
+        "mult   $ac3,         %[t1],        %[temp_reg2]                   \n\t"
+        "mult   $ac1,         %[t0],        %[temp_reg4]                   \n\t"
+        "mfhi   %[temp_reg3]                                               \n\t"
+        "add    %[t0],        %[s2],        %[s3]                          \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "mfhi   %[temp_reg2], $ac3                                         \n\t"
+        "mfhi   %[temp_reg4], $ac1                                         \n\t"
+        "sub    %[t1],        %[s2],        %[s3]                          \n\t"
+        "sw     %[temp_reg5], 32*12*4(%[out])                              \n\t"
+        "sw     %[temp_reg3], 4*12*4(%[buf])                               \n\t"
+        "lw     %[temp_reg6], 14*4(%[win])                                 \n\t"
+        "lw     %[temp_reg5], 4*14*4(%[buf])                               \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "sw     %[temp_reg4], 4*5*4(%[buf])                                \n\t"
+        "sw     %[temp_reg1], 5*32*4(%[out])                               \n\t"
+        "mult   %[t1],        %[temp_reg6]                                 \n\t"
+        "lw     %[temp_reg4], 34*4(%[win])                                 \n\t"
+        "lw     %[temp_reg2], 3*4(%[win])                                  \n\t"
+        "lw     %[temp_reg1], 4*3*4(%[buf])                                \n\t"
+        "mult   $ac2,         %[t0],        %[temp_reg4]                   \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg2]                   \n\t"
+        "lw     %[temp_reg3], 23*4(%[win])                                 \n\t"
+        "lw     %[s0],        16*4(%[tmp])                                 \n\t"
+        "mfhi   %[temp_reg4], $ac2                                         \n\t"
+        "lw     %[t1],        17*4(%[tmp])                                 \n\t"
+        "mult   $ac3,         %[t0],        %[temp_reg3]                   \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "mfhi   %[temp_reg2], $ac1                                         \n\t"
+        "sw     %[temp_reg5], 14*32*4(%[out])                              \n\t"
+        "sw     %[temp_reg4], 4*14*4(%[buf])                               \n\t"
+        "mfhi   %[temp_reg3], $ac3                                         \n\t"
+        "li     %[temp_reg5], 0xB504F334                                   \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "multu  %[temp_reg5], %[t1]                                        \n\t"
+        "lw     %[temp_reg2], 4*13*4(%[buf])                               \n\t"
+        "sw     %[temp_reg1], 3*32*4(%[out])                               \n\t"
+        "sra    %[t1],        %[t1],        31                             \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "movn   %[s1],        %[temp_reg5], %[t1]                          \n\t"
+        "sw     %[temp_reg3], 4*3*4(%[buf])                                \n\t"
+        "lw     %[temp_reg1], 13*4(%[win])                                 \n\t"
+        "lw     %[temp_reg4], 4*4*4(%[buf])                                \n\t"
+        "lw     %[temp_reg3], 4*4(%[win])                                  \n\t"
+        "lw     %[temp_reg5], 33*4(%[win])                                 \n\t"
+        "subu   %[s1],        %[temp_reg6], %[s1]                          \n\t"
+        "lw     %[temp_reg6], 24*4(%[win])                                 \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg1]                   \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg3]                   \n\t"
+        "mult   $ac3,         %[t0],        %[temp_reg5]                   \n\t"
+        "mult   %[t0],        %[temp_reg6]                                 \n\t"
+        "mfhi   %[temp_reg1], $ac1                                         \n\t"
+        "mfhi   %[temp_reg3], $ac2                                         \n\t"
+        "mfhi   %[temp_reg5], $ac3                                         \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "add    %[temp_reg2], %[temp_reg2], %[temp_reg1]                   \n\t"
+        "add    %[temp_reg4], %[temp_reg4], %[temp_reg3]                   \n\t"
+        "sw     %[temp_reg2], 13*32*4(%[out])                              \n\t"
+        "sw     %[temp_reg4], 4*32*4(%[out])                               \n\t"
+        "sw     %[temp_reg5], 4*13*4(%[buf])                               \n\t"
+        "sw     %[temp_reg6], 4*4*4(%[buf])                                \n\t"
+
+        : [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3),
+          [s0] "=&r" (s0), [s2] "=&r" (s2), [temp_reg1] "=&r" (temp_reg1),
+          [temp_reg2] "=&r" (temp_reg2), [s1] "=&r" (s1), [s3] "=&r" (s3),
+          [temp_reg3] "=&r" (temp_reg3), [temp_reg4] "=&r" (temp_reg4),
+          [temp_reg5] "=&r" (temp_reg5), [temp_reg6] "=&r" (temp_reg6),
+          [out] "+r" (out)
+        : [tmp] "r" (tmp), [win] "r" (win), [buf] "r" (buf)
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+          "$ac3hi", "$ac3lo"
+    );
+}
+
+static void ff_imdct36_blocks_mips_fixed(int *out, int *buf, int *in,
+                               int count, int switch_point, int block_type)
+{
+    int j;
+    for (j=0 ; j < count; j++) {
+        /* apply window & overlap with previous buffer */
+
+        /* select window */
+        int win_idx = (switch_point && j < 2) ? 0 : block_type;
+        int *win = ff_mdct_win_fixed[win_idx + (4 & -(j & 1))];
+
+        imdct36_mips_fixed(out, buf, in, win);
+
+        in  += 18;
+        buf += ((j&3) != 3 ? 1 : (72-3));
+        out++;
+    }
+}
+
+void ff_mpadsp_init_mipsdspr1(MPADSPContext *s)
+{
+    s->apply_window_fixed   = ff_mpadsp_apply_window_mips_fixed;
+    s->imdct36_blocks_fixed = ff_imdct36_blocks_mips_fixed;
+}
diff --git a/libavcodec/mips/mpegaudiodsp_mips_float.c b/libavcodec/mips/mpegaudiodsp_mips_float.c
new file mode 100644
index 0000000000..bd36894d31
--- /dev/null
+++ b/libavcodec/mips/mpegaudiodsp_mips_float.c
@@ -0,0 +1,1251 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * MPEG Audio decoder optimized for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/mpegaudiodsp_template.c
+ *            libavcodec/dct32.c
+ */
+
+#include <string.h>
+
+#include "libavutil/mips/asmdefs.h"
+#include "libavcodec/mpegaudiodsp.h"
+
+static void ff_mpadsp_apply_window_mips_float(float *synth_buf, float *window,
+                               int *dither_state, float *samples, int incr)
+{
+    register const float *w, *w2, *p;
+    int j;
+    float *samples2;
+    float sum, sum2;
+    /* temporary variables */
+    int incr1 = incr << 2;
+    int t_sample;
+    float in1, in2, in3, in4, in5, in6, in7, in8;
+    float *p2;
+
+    /* copy to avoid wrap */
+    memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
+
+    /**
+    * instructions are scheduled to minimize pipeline stall.
+    * use of round_sample function from the original code is
+    * changed with appropriate assembly instructions.
+    */
+
+    __asm__ volatile (
+        "lwc1    %[sum],      0(%[dither_state])                            \t\n"
+        "sll     %[t_sample], %[incr1],     5                               \t\n"
+        "sub     %[t_sample], %[t_sample],  %[incr1]                        \n\t"
+        "li      %[j],        4                                             \t\n"
+        "lwc1    %[in1],      0(%[window])                                  \t\n"
+        "lwc1    %[in2],      16*4(%[synth_buf])                            \t\n"
+        "sw      $zero,       0(%[dither_state])                            \t\n"
+        "lwc1    %[in3],      64*4(%[window])                               \t\n"
+        "lwc1    %[in4],      80*4(%[synth_buf])                            \t\n"
+        PTR_ADDU "%[samples2],%[samples],   %[t_sample]                     \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in5],      128*4(%[window])                              \t\n"
+        "lwc1    %[in6],      144*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in7],      192*4(%[window])                              \t\n"
+        "madd.s  %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "lwc1    %[in8],      208*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in1],      256*4(%[window])                              \t\n"
+        "lwc1    %[in2],      272*4(%[synth_buf])                           \t\n"
+        "madd.s  %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      320*4(%[window])                              \t\n"
+        "lwc1    %[in4],      336*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in5],      384*4(%[window])                              \t\n"
+        "madd.s  %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "lwc1    %[in6],      400*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in7],      448*4(%[window])                              \t\n"
+        "lwc1    %[in8],      464*4(%[synth_buf])                           \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in1],      32*4(%[window])                               \t\n"
+        "lwc1    %[in2],      48*4(%[synth_buf])                            \t\n"
+        "madd.s  %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "lwc1    %[in3],      96*4(%[window])                               \t\n"
+        "lwc1    %[in4],      112*4(%[synth_buf])                           \t\n"
+        "madd.s  %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "lwc1    %[in5],      160*4(%[window])                              \t\n"
+        "lwc1    %[in6],      176*4(%[synth_buf])                           \t\n"
+        "madd.s  %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "lwc1    %[in7],      224*4(%[window])                              \t\n"
+        "lwc1    %[in8],      240*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in1],      288*4(%[window])                              \t\n"
+        "lwc1    %[in2],      304*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "lwc1    %[in3],      352*4(%[window])                              \t\n"
+        "lwc1    %[in4],      368*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "lwc1    %[in5],      416*4(%[window])                              \t\n"
+        "lwc1    %[in6],      432*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "lwc1    %[in7],      480*4(%[window])                              \t\n"
+        "lwc1    %[in8],      496*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        PTR_ADDU "%[w],       %[window],    4                               \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        PTR_ADDU "%[w2],      %[window],    124                             \t\n"
+        PTR_ADDIU "%[p],      %[synth_buf], 68                              \t\n"
+        PTR_ADDIU "%[p2],     %[synth_buf], 188                             \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "swc1    %[sum],      0(%[samples])                                 \t\n"
+        PTR_ADDU "%[samples], %[samples],   %[incr1]                        \t\n"
+
+        /* calculate two samples at the same time to avoid one memory
+           access per two sample */
+
+        "ff_mpadsp_apply_window_loop%=:                                     \t\n"
+        "lwc1    %[in1],      0(%[w])                                       \t\n"
+        "lwc1    %[in2],      0(%[p])                                       \t\n"
+        "lwc1    %[in3],      0(%[w2])                                      \t\n"
+        "lwc1    %[in4],      64*4(%[w])                                    \t\n"
+        "lwc1    %[in5],      64*4(%[p])                                    \t\n"
+        "lwc1    %[in6],      64*4(%[w2])                                   \t\n"
+        "mul.s   %[sum],      %[in1],       %[in2]                          \t\n"
+        "mul.s   %[sum2],     %[in2],       %[in3]                          \t\n"
+        "lwc1    %[in1],      128*4(%[w])                                   \t\n"
+        "lwc1    %[in2],      128*4(%[p])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "nmadd.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      128*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      192*4(%[w])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in5],      192*4(%[p])                                   \t\n"
+        "lwc1    %[in6],      192*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      256*4(%[w])                                   \t\n"
+        "lwc1    %[in2],      256*4(%[p])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      256*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      320*4(%[w])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in5],      320*4(%[p])                                   \t\n"
+        "lwc1    %[in6],      320*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      384*4(%[w])                                   \t\n"
+        "lwc1    %[in2],      384*4(%[p])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      384*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      448*4(%[w])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in5],      448*4(%[p])                                   \t\n"
+        "lwc1    %[in6],      448*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "madd.s  %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "lwc1    %[in1],      32*4(%[w])                                    \t\n"
+        "lwc1    %[in2],      0(%[p2])                                      \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      32*4(%[w2])                                   \t\n"
+        "lwc1    %[in4],      96*4(%[w])                                    \t\n"
+        "lwc1    %[in5],      64*4(%[p2])                                   \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in6],      96*4(%[w2])                                   \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      160*4(%[w])                                   \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "lwc1    %[in2],      128*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      160*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      224*4(%[w])                                   \t\n"
+        "lwc1    %[in5],      192*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in6],      224*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      288*4(%[w])                                   \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "lwc1    %[in2],      256*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      288*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      352*4(%[w])                                   \t\n"
+        "lwc1    %[in5],      320*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in6],      352*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      416*4(%[w])                                   \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "lwc1    %[in2],      384*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      416*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      480*4(%[w])                                   \t\n"
+        "lwc1    %[in5],      448*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in6],      480*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        PTR_ADDIU "%[w],      %[w],         4                               \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        PTR_ADDIU "%[w2],     %[w2],        -4                              \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "addu    %[j],        %[j],         4                               \t\n"
+        PTR_ADDIU "%[p],      4                                             \t\n"
+        "swc1    %[sum],      0(%[samples])                                 \t\n"
+        PTR_ADDIU "%[p2],     -4                                            \t\n"
+        "swc1    %[sum2],     0(%[samples2])                                \t\n"
+        PTR_ADDU "%[samples], %[samples],   %[incr1]                        \t\n"
+        PTR_SUBU "%[samples2],%[samples2],  %[incr1]                        \t\n"
+        "bne     %[j],        64,           ff_mpadsp_apply_window_loop%=   \t\n"
+
+        "lwc1    %[in1],      48*4(%[window])                               \t\n"
+        "lwc1    %[in2],      32*4(%[synth_buf])                            \t\n"
+        "lwc1    %[in3],      112*4(%[window])                              \t\n"
+        "lwc1    %[in4],      96*4(%[synth_buf])                            \t\n"
+        "lwc1    %[in5],      176*4(%[window])                              \t\n"
+        "lwc1    %[in6],      160*4(%[synth_buf])                           \t\n"
+        "mul.s   %[sum],      %[in1],       %[in2]                          \t\n"
+        "lwc1    %[in7],      240*4(%[window])                              \t\n"
+        "lwc1    %[in8],      224*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in1],      304*4(%[window])                              \t\n"
+        "nmadd.s %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "lwc1    %[in2],      288*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in3],      368*4(%[window])                              \t\n"
+        "lwc1    %[in4],      352*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "lwc1    %[in5],      432*4(%[window])                              \t\n"
+        "lwc1    %[in6],      416*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in7],      496*4(%[window])                              \t\n"
+        "lwc1    %[in8],      480*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "swc1    %[sum],      0(%[samples])                                 \t\n"
+
+        : [sum] "=&f" (sum), [sum2] "=&f" (sum2),
+          [w2] "=&r" (w2),   [w] "=&r" (w),
+          [p] "=&r" (p), [p2] "=&r" (p2), [j] "=&r" (j),
+          [samples] "+r" (samples), [samples2] "=&r" (samples2),
+          [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5), [in6] "=&f" (in6),
+          [in7] "=&f" (in7), [in8] "=&f" (in8),
+          [t_sample] "=&r" (t_sample)
+        : [synth_buf] "r" (synth_buf), [window] "r" (window),
+          [dither_state] "r" (dither_state), [incr1] "r" (incr1)
+        : "memory"
+    );
+}
+
+static void ff_dct32_mips_float(float *out, const float *tab)
+{
+    float val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7,
+          val8 , val9 , val10, val11, val12, val13, val14, val15,
+          val16, val17, val18, val19, val20, val21, val22, val23,
+          val24, val25, val26, val27, val28, val29, val30, val31;
+    float fTmp1, fTmp2, fTmp3, fTmp4, fTmp5, fTmp6, fTmp7, fTmp8,
+          fTmp9, fTmp10, fTmp11;
+
+    /**
+    * instructions are scheduled to minimize pipeline stall.
+    */
+    __asm__ volatile (
+        "lwc1       %[fTmp1],       0*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       31*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp3],       15*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       16*4(%[tab])                            \n\t"
+        "li.s       %[fTmp7],       0.50241928618815570551                  \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.50060299823519630134                  \n\t"
+        "li.s       %[fTmp11],      10.19000812354805681150                 \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val0],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val15],       %[fTmp5],       %[fTmp6]                \n\t"
+        "lwc1       %[fTmp1],       7*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       24*4(%[tab])                            \n\t"
+        "madd.s     %[val16],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val31],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val15],       %[val15],       %[fTmp7]                \n\t"
+        "lwc1       %[fTmp3],       8*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp4],       23*4(%[tab])                            \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "mul.s      %[val31],       %[val31],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp7],       5.10114861868916385802                  \n\t"
+        "li.s       %[fTmp10],      0.67480834145500574602                  \n\t"
+        "li.s       %[fTmp11],      0.74453627100229844977                  \n\t"
+        "add.s      %[val7],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val8],        %[fTmp5],       %[fTmp6]                \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "li.s       %[fTmp1],       0.50979557910415916894                  \n\t"
+        "sub.s      %[fTmp2],       %[val0],        %[val7]                 \n\t"
+        "mul.s      %[val8],        %[val8],        %[fTmp7]                \n\t"
+        "madd.s     %[val23],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val24],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "add.s      %[val0],        %[val0],        %[val7]                 \n\t"
+        "mul.s      %[val7],        %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp2],       %[val15],       %[val8]                 \n\t"
+        "add.s      %[val8],        %[val15],       %[val8]                 \n\t"
+        "mul.s      %[val24],       %[val24],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp3],       %[val16],       %[val23]                \n\t"
+        "add.s      %[val16],       %[val16],       %[val23]                \n\t"
+        "mul.s      %[val15],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp4],       %[val31],       %[val24]                \n\t"
+        "mul.s      %[val23],       %[fTmp1],       %[fTmp3]                \n\t"
+        "add.s      %[val24],       %[val31],       %[val24]                \n\t"
+        "mul.s      %[val31],       %[fTmp1],       %[fTmp4]                \n\t"
+
+        : [fTmp1]  "=&f" (fTmp1),  [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
+          [fTmp4]  "=&f" (fTmp4),  [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
+          [fTmp7]  "=&f" (fTmp7),  [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
+          [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
+          [val0]  "=f" (val0),  [val7]  "=f" (val7),
+          [val8]  "=f" (val8),  [val15] "=f" (val15),
+          [val16] "=f" (val16), [val23] "=f" (val23),
+          [val24] "=f" (val24), [val31] "=f" (val31)
+        : [tab] "r" (tab)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "lwc1       %[fTmp1],       3*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       28*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp3],       12*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       19*4(%[tab])                            \n\t"
+        "li.s       %[fTmp7],       0.64682178335999012954                  \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.53104259108978417447                  \n\t"
+        "li.s       %[fTmp11],      1.48416461631416627724                  \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val3],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val12],       %[fTmp5],       %[fTmp6]                \n\t"
+        "lwc1       %[fTmp1],       4*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       27*4(%[tab])                            \n\t"
+        "madd.s     %[val19],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val28],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val12],       %[val12],       %[fTmp7]                \n\t"
+        "lwc1       %[fTmp3],       11*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       20*4(%[tab])                            \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "mul.s      %[val28],       %[val28],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "li.s       %[fTmp7],       0.78815462345125022473                  \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.55310389603444452782                  \n\t"
+        "li.s       %[fTmp11],      1.16943993343288495515                  \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val4],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val11],       %[fTmp5],       %[fTmp6]                \n\t"
+        "li.s       %[fTmp1],       2.56291544774150617881                  \n\t"
+        "madd.s     %[val20],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val27],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val11],       %[val11],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp2],       %[val3],        %[val4]                 \n\t"
+        "add.s      %[val3],        %[val3],        %[val4]                 \n\t"
+        "sub.s      %[fTmp4],       %[val19],       %[val20]                \n\t"
+        "mul.s      %[val27],       %[val27],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp3],       %[val12],       %[val11]                \n\t"
+        "mul.s      %[val4],        %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val11],       %[val12],       %[val11]                \n\t"
+        "add.s      %[val19],       %[val19],       %[val20]                \n\t"
+        "mul.s      %[val20],       %[fTmp1],       %[fTmp4]                \n\t"
+        "mul.s      %[val12],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val28],       %[val27]                \n\t"
+        "add.s      %[val27],       %[val28],       %[val27]                \n\t"
+        "mul.s      %[val28],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp1]  "=&f" (fTmp1),  [fTmp2]  "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
+          [fTmp4]  "=&f" (fTmp4),  [fTmp5]  "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
+          [fTmp7]  "=&f" (fTmp7),  [fTmp8]  "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
+          [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
+          [val3]  "=f" (val3),  [val4]  "=f" (val4),
+          [val11] "=f" (val11), [val12] "=f" (val12),
+          [val19] "=f" (val19), [val20] "=f" (val20),
+          [val27] "=f" (val27), [val28] "=f" (val28)
+        : [tab] "r" (tab)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "li.s       %[fTmp1],       0.54119610014619698439                  \n\t"
+        "sub.s      %[fTmp2],       %[val0],        %[val3]                 \n\t"
+        "add.s      %[val0],        %[val0],        %[val3]                 \n\t"
+        "sub.s      %[fTmp3],       %[val7],        %[val4]                 \n\t"
+        "add.s      %[val4],        %[val7],        %[val4]                 \n\t"
+        "sub.s      %[fTmp4],       %[val8],        %[val11]                \n\t"
+        "mul.s      %[val3],        %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val8],        %[val8],        %[val11]                \n\t"
+        "mul.s      %[val7],        %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val15],       %[val12]                \n\t"
+        "mul.s      %[val11],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val12],       %[val15],       %[val12]                \n\t"
+        "mul.s      %[val15],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [val0]  "+f" (val0),   [val3] "+f" (val3),
+          [val4]  "+f" (val4),   [val7] "+f" (val7),
+          [val8]  "+f" (val8),   [val11] "+f" (val11),
+          [val12] "+f" (val12),  [val15] "+f" (val15),
+          [fTmp1] "=f"  (fTmp1), [fTmp2] "=&f" (fTmp2),
+          [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4)
+        :
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val16],       %[val19]                \n\t"
+        "add.s      %[val16],       %[val16],       %[val19]                \n\t"
+        "sub.s      %[fTmp3],       %[val23],       %[val20]                \n\t"
+        "add.s      %[val20],       %[val23],       %[val20]                \n\t"
+        "sub.s      %[fTmp4],       %[val24],       %[val27]                \n\t"
+        "mul.s      %[val19],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val24],       %[val24],       %[val27]                \n\t"
+        "mul.s      %[val23],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val31],       %[val28]                \n\t"
+        "mul.s      %[val27],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val28],       %[val31],       %[val28]                \n\t"
+        "mul.s      %[val31],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val16] "+f" (val16), [val19] "+f" (val19), [val20] "+f" (val20),
+          [val23] "+f" (val23), [val24] "+f" (val24), [val27] "+f" (val27),
+          [val28] "+f" (val28), [val31] "+f" (val31)
+        : [fTmp1] "f" (fTmp1)
+    );
+
+    __asm__ volatile (
+        "lwc1       %[fTmp1],       1*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       30*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp3],       14*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       17*4(%[tab])                            \n\t"
+        "li.s       %[fTmp7],       0.52249861493968888062                  \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.50547095989754365998                  \n\t"
+        "li.s       %[fTmp11],      3.40760841846871878570                  \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val1],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val14],       %[fTmp5],       %[fTmp6]                \n\t"
+        "lwc1       %[fTmp1],       6*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       25*4(%[tab])                            \n\t"
+        "madd.s     %[val17],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val30],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val14],       %[val14],       %[fTmp7]                \n\t"
+        "lwc1       %[fTmp3],       9*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp4],       22*4(%[tab])                            \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "mul.s      %[val30],       %[val30],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp7],       1.72244709823833392782                  \n\t"
+        "li.s       %[fTmp10],      0.62250412303566481615                  \n\t"
+        "li.s       %[fTmp11],      0.83934964541552703873                  \n\t"
+        "add.s      %[val6],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val9],        %[fTmp5],       %[fTmp6]                \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "li.s       %[fTmp1],       0.60134488693504528054                  \n\t"
+        "sub.s      %[fTmp2],       %[val1],        %[val6]                 \n\t"
+        "add.s      %[val1],        %[val1],        %[val6]                 \n\t"
+        "mul.s      %[val9],        %[val9],        %[fTmp7]                \n\t"
+        "madd.s     %[val22],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val25],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val6],        %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp2],       %[val14],       %[val9]                 \n\t"
+        "add.s      %[val9],        %[val14],       %[val9]                 \n\t"
+        "mul.s      %[val25],       %[val25],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp3],       %[val17],       %[val22]                \n\t"
+        "add.s      %[val17],       %[val17],       %[val22]                \n\t"
+        "mul.s      %[val14],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp2],       %[val30],       %[val25]                \n\t"
+        "mul.s      %[val22],       %[fTmp1],       %[fTmp3]                \n\t"
+        "add.s      %[val25],       %[val30],       %[val25]                \n\t"
+        "mul.s      %[val30],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp1]  "=&f" (fTmp1),  [fTmp2]  "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
+          [fTmp4]  "=&f" (fTmp4),  [fTmp5]  "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
+          [fTmp7]  "=&f" (fTmp7),  [fTmp8]  "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
+          [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
+          [val1]  "=f" (val1),  [val6]  "=f" (val6),
+          [val9]  "=f" (val9),  [val14] "=f" (val14),
+          [val17] "=f" (val17), [val22] "=f" (val22),
+          [val25] "=f" (val25), [val30] "=f" (val30)
+        : [tab] "r" (tab)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "lwc1       %[fTmp1],       2*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       29*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp3],       13*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       18*4(%[tab])                            \n\t"
+        "li.s       %[fTmp7],       0.56694403481635770368                  \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.51544730992262454697                  \n\t"
+        "li.s       %[fTmp11],      2.05778100995341155085                  \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val2],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val13],       %[fTmp5],       %[fTmp6]                \n\t"
+        "lwc1       %[fTmp1],       5*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       26*4(%[tab])                            \n\t"
+        "madd.s     %[val18],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val29],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val13],       %[val13],       %[fTmp7]                \n\t"
+        "lwc1       %[fTmp3],       10*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       21*4(%[tab])                            \n\t"
+        "mul.s      %[val29],       %[val29],       %[fTmp7]                \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp7],       1.06067768599034747134                  \n\t"
+        "li.s       %[fTmp10],      0.58293496820613387367                  \n\t"
+        "li.s       %[fTmp11],      0.97256823786196069369                  \n\t"
+        "add.s      %[val5],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val10],       %[fTmp5],       %[fTmp6]                \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "li.s       %[fTmp1],       0.89997622313641570463                  \n\t"
+        "sub.s      %[fTmp2],       %[val2],        %[val5]                 \n\t"
+        "mul.s      %[val10],       %[val10],       %[fTmp7]                \n\t"
+        "madd.s     %[val21],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val26],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "add.s      %[val2],        %[val2],        %[val5]                 \n\t"
+        "mul.s      %[val5],        %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp3],       %[val13],       %[val10]                \n\t"
+        "add.s      %[val10],       %[val13],       %[val10]                \n\t"
+        "mul.s      %[val26],       %[val26],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp4],       %[val18],       %[val21]                \n\t"
+        "add.s      %[val18],       %[val18],       %[val21]                \n\t"
+        "mul.s      %[val13],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val29],       %[val26]                \n\t"
+        "add.s      %[val26],       %[val29],       %[val26]                \n\t"
+        "mul.s      %[val21],       %[fTmp1],       %[fTmp4]                \n\t"
+        "mul.s      %[val29],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp1]  "=&f" (fTmp1),  [fTmp2]  "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
+          [fTmp4]  "=&f" (fTmp4),  [fTmp5]  "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
+          [fTmp7]  "=&f" (fTmp7),  [fTmp8]  "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
+          [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
+          [val2]  "=f" (val2),  [val5]  "=f" (val5),
+          [val10] "=f" (val10), [val13] "=f" (val13),
+          [val18] "=f" (val18), [val21] "=f" (val21),
+          [val26] "=f" (val26), [val29] "=f" (val29)
+        : [tab] "r" (tab)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "li.s       %[fTmp1],       1.30656296487637652785                  \n\t"
+        "sub.s      %[fTmp2],       %[val1],        %[val2]                 \n\t"
+        "add.s      %[val1],        %[val1],        %[val2]                 \n\t"
+        "sub.s      %[fTmp3],       %[val6],        %[val5]                 \n\t"
+        "add.s      %[val5],        %[val6],        %[val5]                 \n\t"
+        "sub.s      %[fTmp4],       %[val9],        %[val10]                \n\t"
+        "mul.s      %[val2],        %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val9],        %[val9],        %[val10]                \n\t"
+        "mul.s      %[val6],        %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val14],       %[val13]                \n\t"
+        "mul.s      %[val10],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val13],       %[val14],       %[val13]                \n\t"
+        "mul.s      %[val14],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp1] "=f"  (fTmp1), [fTmp2] "=&f" (fTmp2),
+          [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val1]  "+f" (val1),  [val2]  "+f" (val2),
+          [val5]  "+f" (val5),  [val6]  "+f" (val6),
+          [val9]  "+f" (val9),  [val10] "+f" (val10),
+          [val13] "+f" (val13), [val14] "+f" (val14)
+        :
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val17],       %[val18]                \n\t"
+        "add.s      %[val17],       %[val17],       %[val18]                \n\t"
+        "sub.s      %[fTmp3],       %[val22],       %[val21]                \n\t"
+        "add.s      %[val21],       %[val22],       %[val21]                \n\t"
+        "sub.s      %[fTmp4],       %[val25],       %[val26]                \n\t"
+        "mul.s      %[val18],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val25],       %[val25],       %[val26]                \n\t"
+        "mul.s      %[val22],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val30],       %[val29]                \n\t"
+        "mul.s      %[val26],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val29],       %[val30],       %[val29]                \n\t"
+        "mul.s      %[val30],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val17] "+f" (val17), [val18] "+f" (val18), [val21] "+f" (val21),
+          [val22] "+f" (val22), [val25] "+f" (val25), [val26] "+f" (val26),
+          [val29] "+f" (val29), [val30] "+f" (val30)
+        : [fTmp1] "f" (fTmp1)
+    );
+
+    __asm__ volatile (
+        "li.s       %[fTmp1],       0.70710678118654752439                  \n\t"
+        "sub.s      %[fTmp2],       %[val0],        %[val1]                 \n\t"
+        "add.s      %[val0],        %[val0],        %[val1]                 \n\t"
+        "sub.s      %[fTmp3],       %[val3],        %[val2]                 \n\t"
+        "add.s      %[val2],        %[val3],        %[val2]                 \n\t"
+        "sub.s      %[fTmp4],       %[val4],        %[val5]                 \n\t"
+        "mul.s      %[val1],        %[fTmp1],       %[fTmp2]                \n\t"
+        "swc1       %[val0],        0(%[out])                               \n\t"
+        "mul.s      %[val3],        %[fTmp3],       %[fTmp1]                \n\t"
+        "add.s      %[val4],        %[val4],        %[val5]                 \n\t"
+        "mul.s      %[val5],        %[fTmp1],       %[fTmp4]                \n\t"
+        "swc1       %[val1],        16*4(%[out])                            \n\t"
+        "sub.s      %[fTmp2],       %[val7],        %[val6]                 \n\t"
+        "add.s      %[val2],        %[val2],        %[val3]                 \n\t"
+        "swc1       %[val3],        24*4(%[out])                            \n\t"
+        "add.s      %[val6],        %[val7],        %[val6]                 \n\t"
+        "mul.s      %[val7],        %[fTmp1],       %[fTmp2]                \n\t"
+        "swc1       %[val2],        8*4(%[out])                             \n\t"
+        "add.s      %[val6],        %[val6],        %[val7]                 \n\t"
+        "swc1       %[val7],        28*4(%[out])                            \n\t"
+        "add.s      %[val4],        %[val4],        %[val6]                 \n\t"
+        "add.s      %[val6],        %[val6],        %[val5]                 \n\t"
+        "add.s      %[val5],        %[val5],        %[val7]                 \n\t"
+        "swc1       %[val4],        4*4(%[out])                             \n\t"
+        "swc1       %[val5],        20*4(%[out])                            \n\t"
+        "swc1       %[val6],        12*4(%[out])                            \n\t"
+
+        : [fTmp1] "=f"  (fTmp1), [fTmp2] "=&f" (fTmp2),
+          [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val0] "+f" (val0), [val1] "+f" (val1),
+          [val2] "+f" (val2), [val3] "+f" (val3),
+          [val4] "+f" (val4), [val5] "+f" (val5),
+          [val6] "+f" (val6), [val7] "+f" (val7)
+        : [out] "r" (out)
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val8],        %[val9]                 \n\t"
+        "add.s      %[val8],        %[val8],        %[val9]                 \n\t"
+        "sub.s      %[fTmp3],       %[val11],       %[val10]                \n\t"
+        "add.s      %[val10],       %[val11],       %[val10]                \n\t"
+        "sub.s      %[fTmp4],       %[val12],       %[val13]                \n\t"
+        "mul.s      %[val9],        %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val12],       %[val12],       %[val13]                \n\t"
+        "mul.s      %[val11],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val15],       %[val14]                \n\t"
+        "mul.s      %[val13],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val14],       %[val15],       %[val14]                \n\t"
+        "add.s      %[val10],       %[val10],       %[val11]                \n\t"
+        "mul.s      %[val15],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val14],       %[val14],       %[val15]                \n\t"
+        "add.s      %[val12],       %[val12],       %[val14]                \n\t"
+        "add.s      %[val14],       %[val14],       %[val13]                \n\t"
+        "add.s      %[val13],       %[val13],       %[val15]                \n\t"
+        "add.s      %[val8],        %[val8],        %[val12]                \n\t"
+        "add.s      %[val12],       %[val12],       %[val10]                \n\t"
+        "add.s      %[val10],       %[val10],       %[val14]                \n\t"
+        "add.s      %[val14],       %[val14],       %[val9]                 \n\t"
+        "add.s      %[val9],        %[val9],        %[val13]                \n\t"
+        "add.s      %[val13],       %[val13],       %[val11]                \n\t"
+        "add.s      %[val11],       %[val11],       %[val15]                \n\t"
+        "swc1       %[val8],         2*4(%[out])                            \n\t"
+        "swc1       %[val9],        18*4(%[out])                            \n\t"
+        "swc1       %[val10],       10*4(%[out])                            \n\t"
+        "swc1       %[val11],       26*4(%[out])                            \n\t"
+        "swc1       %[val12],        6*4(%[out])                            \n\t"
+        "swc1       %[val13],       22*4(%[out])                            \n\t"
+        "swc1       %[val14],       14*4(%[out])                            \n\t"
+        "swc1       %[val15],       30*4(%[out])                            \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val8]  "+f" (val8),  [val9]  "+f" (val9),  [val10] "+f" (val10),
+          [val11] "+f" (val11), [val12] "+f" (val12), [val13] "+f" (val13),
+          [val14] "+f" (val14), [val15] "+f" (val15)
+        : [fTmp1] "f" (fTmp1), [out] "r" (out)
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val16],       %[val17]                \n\t"
+        "add.s      %[val16],       %[val16],       %[val17]                \n\t"
+        "sub.s      %[fTmp3],       %[val19],       %[val18]                \n\t"
+        "add.s      %[val18],       %[val19],       %[val18]                \n\t"
+        "sub.s      %[fTmp4],       %[val20],       %[val21]                \n\t"
+        "mul.s      %[val17],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val20],       %[val20],       %[val21]                \n\t"
+        "mul.s      %[val19],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val23],       %[val22]                \n\t"
+        "mul.s      %[val21],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val22],       %[val23],       %[val22]                \n\t"
+        "add.s      %[val18],       %[val18],       %[val19]                \n\t"
+        "mul.s      %[val23],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val22],       %[val22],       %[val23]                \n\t"
+        "add.s      %[val20],       %[val20],       %[val22]                \n\t"
+        "add.s      %[val22],       %[val22],       %[val21]                \n\t"
+        "add.s      %[val21],       %[val21],       %[val23]                \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val16] "+f" (val16), [val17] "+f" (val17), [val18] "+f" (val18),
+          [val19] "+f" (val19), [val20] "+f" (val20), [val21] "+f" (val21),
+          [val22] "+f" (val22), [val23] "+f" (val23)
+        : [fTmp1] "f" (fTmp1)
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val24],       %[val25]                \n\t"
+        "add.s      %[val24],       %[val24],       %[val25]                \n\t"
+        "sub.s      %[fTmp3],       %[val27],       %[val26]                \n\t"
+        "add.s      %[val26],       %[val27],       %[val26]                \n\t"
+        "sub.s      %[fTmp4],       %[val28],       %[val29]                \n\t"
+        "mul.s      %[val25],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val28],       %[val28],       %[val29]                \n\t"
+        "mul.s      %[val27],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val31],       %[val30]                \n\t"
+        "mul.s      %[val29],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val30],       %[val31],       %[val30]                \n\t"
+        "add.s      %[val26],       %[val26],       %[val27]                \n\t"
+        "mul.s      %[val31],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val30],       %[val30],       %[val31]                \n\t"
+        "add.s      %[val28],       %[val28],       %[val30]                \n\t"
+        "add.s      %[val30],       %[val30],       %[val29]                \n\t"
+        "add.s      %[val29],       %[val29],       %[val31]                \n\t"
+        "add.s      %[val24],       %[val24],       %[val28]                \n\t"
+        "add.s      %[val28],       %[val28],       %[val26]                \n\t"
+        "add.s      %[val26],       %[val26],       %[val30]                \n\t"
+        "add.s      %[val30],       %[val30],       %[val25]                \n\t"
+        "add.s      %[val25],       %[val25],       %[val29]                \n\t"
+        "add.s      %[val29],       %[val29],       %[val27]                \n\t"
+        "add.s      %[val27],       %[val27],       %[val31]                \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val24] "+f" (val24), [val25] "+f" (val25), [val26] "+f" (val26),
+          [val27] "+f" (val27), [val28] "+f" (val28), [val29] "+f" (val29),
+          [val30] "+f" (val30), [val31] "+f" (val31)
+        : [fTmp1] "f" (fTmp1)
+    );
+
+    out[ 1] = val16 + val24;
+    out[17] = val17 + val25;
+    out[ 9] = val18 + val26;
+    out[25] = val19 + val27;
+    out[ 5] = val20 + val28;
+    out[21] = val21 + val29;
+    out[13] = val22 + val30;
+    out[29] = val23 + val31;
+    out[ 3] = val24 + val20;
+    out[19] = val25 + val21;
+    out[11] = val26 + val22;
+    out[27] = val27 + val23;
+    out[ 7] = val28 + val18;
+    out[23] = val29 + val19;
+    out[15] = val30 + val17;
+    out[31] = val31;
+}
+
+static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
+{
+    float t0, t1, t2, t3, s0, s1, s2, s3;
+    float tmp[18];
+    /* temporary variables */
+    float in1, in2, in3, in4, in5, in6;
+    float out1, out2, out3, out4, out5;
+    float c1, c2, c3, c4, c5, c6, c7, c8, c9;
+
+    /**
+    * all loops are unrolled totally, and instructions are scheduled to
+    * minimize pipeline stall. instructions of the first two loops are
+    * reorganized, in order to eliminate unnecessary readings and
+    * writings into array. values defined in macros and tables are
+    * eliminated - they are directly loaded in appropriate variables
+    */
+
+    /* loop 1 and 2 */
+    __asm__ volatile (
+        "lwc1   %[in1],  17*4(%[in])                                    \t\n"
+        "lwc1   %[in2],  16*4(%[in])                                    \t\n"
+        "lwc1   %[in3],  15*4(%[in])                                    \t\n"
+        "lwc1   %[in4],  14*4(%[in])                                    \t\n"
+        "lwc1   %[in5],  13*4(%[in])                                    \t\n"
+        "lwc1   %[in6],  12*4(%[in])                                    \t\n"
+        "add.s  %[out1], %[in1],  %[in2]                                \t\n"
+        "add.s  %[out2], %[in2],  %[in3]                                \t\n"
+        "add.s  %[out3], %[in3],  %[in4]                                \t\n"
+        "add.s  %[out4], %[in4],  %[in5]                                \t\n"
+        "add.s  %[out5], %[in5],  %[in6]                                \t\n"
+        "lwc1   %[in1],  11*4(%[in])                                    \t\n"
+        "swc1   %[out2], 16*4(%[in])                                    \t\n"
+        "add.s  %[out1], %[out1], %[out3]                               \t\n"
+        "swc1   %[out4], 14*4(%[in])                                    \t\n"
+        "add.s  %[out3], %[out3], %[out5]                               \t\n"
+        "lwc1   %[in2],  10*4(%[in])                                    \t\n"
+        "lwc1   %[in3],  9*4(%[in])                                     \t\n"
+        "swc1   %[out1], 17*4(%[in])                                    \t\n"
+        "lwc1   %[in4],  8*4(%[in])                                     \t\n"
+        "swc1   %[out3], 15*4(%[in])                                    \t\n"
+        "add.s  %[out1], %[in6],  %[in1]                                \t\n"
+        "add.s  %[out2], %[in1],  %[in2]                                \t\n"
+        "add.s  %[out3], %[in2],  %[in3]                                \t\n"
+        "add.s  %[out4], %[in3],  %[in4]                                \t\n"
+        "lwc1   %[in5],  7*4(%[in])                                     \t\n"
+        "swc1   %[out1], 12*4(%[in])                                    \t\n"
+        "add.s  %[out5], %[out5], %[out2]                               \t\n"
+        "swc1   %[out3], 10*4(%[in])                                    \t\n"
+        "add.s  %[out2], %[out2], %[out4]                               \t\n"
+        "lwc1   %[in6],  6*4(%[in])                                     \t\n"
+        "lwc1   %[in1],  5*4(%[in])                                     \t\n"
+        "swc1   %[out5], 13*4(%[in])                                    \t\n"
+        "lwc1   %[in2],  4*4(%[in])                                     \t\n"
+        "swc1   %[out2], 11*4(%[in])                                    \t\n"
+        "add.s  %[out5], %[in4],  %[in5]                                \t\n"
+        "add.s  %[out1], %[in5],  %[in6]                                \t\n"
+        "add.s  %[out2], %[in6],  %[in1]                                \t\n"
+        "add.s  %[out3], %[in1],  %[in2]                                \t\n"
+        "lwc1   %[in3],  3*4(%[in])                                     \t\n"
+        "swc1   %[out5], 8*4(%[in])                                     \t\n"
+        "add.s  %[out4], %[out4], %[out1]                               \t\n"
+        "swc1   %[out2], 6*4(%[in])                                     \t\n"
+        "add.s  %[out1], %[out1], %[out3]                               \t\n"
+        "lwc1   %[in4],  2*4(%[in])                                     \t\n"
+        "lwc1   %[in5],  1*4(%[in])                                     \t\n"
+        "swc1   %[out4], 9*4(%[in])                                     \t\n"
+        "lwc1   %[in6],  0(%[in])                                       \t\n"
+        "swc1   %[out1], 7*4(%[in])                                     \t\n"
+        "add.s  %[out4], %[in2],  %[in3]                                \t\n"
+        "add.s  %[out5], %[in3],  %[in4]                                \t\n"
+        "add.s  %[out1], %[in4],  %[in5]                                \t\n"
+        "add.s  %[out2], %[in5],  %[in6]                                \t\n"
+        "swc1   %[out4], 4*4(%[in])                                     \t\n"
+        "add.s  %[out3], %[out3], %[out5]                               \t\n"
+        "swc1   %[out1], 2*4(%[in])                                     \t\n"
+        "add.s  %[out5], %[out5], %[out2]                               \t\n"
+        "swc1   %[out2], 1*4(%[in])                                     \t\n"
+        "swc1   %[out3], 5*4(%[in])                                     \t\n"
+        "swc1   %[out5], 3*4(%[in])                                     \t\n"
+
+        : [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5), [in6] "=&f" (in6),
+          [out1] "=&f" (out1), [out2] "=&f" (out2),
+          [out3] "=&f" (out3), [out4] "=&f" (out4),
+          [out5] "=&f" (out5)
+        : [in] "r" (in)
+        : "memory"
+    );
+
+    /* loop 3 */
+    __asm__ volatile (
+        "li.s    %[c1],   0.5                                           \t\n"
+        "lwc1    %[in1],  8*4(%[in])                                    \t\n"
+        "lwc1    %[in2],  16*4(%[in])                                   \t\n"
+        "lwc1    %[in3],  4*4(%[in])                                    \t\n"
+        "lwc1    %[in4],  0(%[in])                                      \t\n"
+        "lwc1    %[in5],  12*4(%[in])                                   \t\n"
+        "li.s    %[c2],   0.93969262078590838405                        \t\n"
+        "add.s   %[t2],   %[in1],  %[in2]                               \t\n"
+        "add.s   %[t0],   %[in1],  %[in3]                               \t\n"
+        "li.s    %[c3],   -0.76604444311897803520                       \t\n"
+        "madd.s  %[t3],   %[in4],  %[in5], %[c1]                        \t\n"
+        "sub.s   %[t1],   %[in4],  %[in5]                               \t\n"
+        "sub.s   %[t2],   %[t2],   %[in3]                               \t\n"
+        "mul.s   %[t0],   %[t0],   %[c2]                                \t\n"
+        "li.s    %[c4],   -0.17364817766693034885                       \t\n"
+        "li.s    %[c5],   -0.86602540378443864676                       \t\n"
+        "li.s    %[c6],   0.98480775301220805936                        \t\n"
+        "nmsub.s %[out1], %[t1],   %[t2],  %[c1]                        \t\n"
+        "add.s   %[out2], %[t1],   %[t2]                                \t\n"
+        "add.s   %[t2],   %[in2],  %[in3]                               \t\n"
+        "sub.s   %[t1],   %[in1],  %[in2]                               \t\n"
+        "sub.s   %[out3], %[t3],   %[t0]                                \t\n"
+        "swc1    %[out1], 6*4(%[tmp])                                   \t\n"
+        "swc1    %[out2], 16*4(%[tmp])                                  \t\n"
+        "mul.s   %[t2],   %[t2],   %[c3]                                \t\n"
+        "mul.s   %[t1],   %[t1],   %[c4]                                \t\n"
+        "add.s   %[out1], %[t3],   %[t0]                                \t\n"
+        "lwc1    %[in1],  10*4(%[in])                                   \t\n"
+        "lwc1    %[in2],  14*4(%[in])                                   \t\n"
+        "sub.s   %[out3], %[out3], %[t2]                                \t\n"
+        "add.s   %[out2], %[t3],   %[t2]                                \t\n"
+        "add.s   %[out1], %[out1], %[t1]                                \t\n"
+        "lwc1    %[in3],  2*4(%[in])                                    \t\n"
+        "lwc1    %[in4],  6*4(%[in])                                    \t\n"
+        "swc1    %[out3], 10*4(%[tmp])                                  \t\n"
+        "sub.s   %[out2], %[out2], %[t1]                                \t\n"
+        "swc1    %[out1], 2*4(%[tmp])                                   \t\n"
+        "add.s   %[out1], %[in1],  %[in2]                               \t\n"
+        "add.s   %[t2],   %[in1],  %[in3]                               \t\n"
+        "sub.s   %[t3],   %[in1],  %[in2]                               \t\n"
+        "swc1    %[out2], 14*4(%[tmp])                                  \t\n"
+        "li.s    %[c7],   -0.34202014332566873304                       \t\n"
+        "sub.s   %[out1], %[out1], %[in3]                               \t\n"
+        "mul.s   %[t2],   %[t2],   %[c6]                                \t\n"
+        "mul.s   %[t3],   %[t3],   %[c7]                                \t\n"
+        "li.s    %[c8],   0.86602540378443864676                        \t\n"
+        "mul.s   %[t0],   %[in4],  %[c8]                                \t\n"
+        "mul.s   %[out1], %[out1], %[c5]                                \t\n"
+        "add.s   %[t1],   %[in2],  %[in3]                               \t\n"
+        "li.s    %[c9],   -0.64278760968653932632                       \t\n"
+        "add.s   %[out2], %[t2],   %[t3]                                \t\n"
+        "lwc1    %[in1],  9*4(%[in])                                    \t\n"
+        "swc1    %[out1], 4*4(%[tmp])                                   \t\n"
+        "mul.s   %[t1],   %[t1],   %[c9]                                \t\n"
+        "lwc1    %[in2],  17*4(%[in])                                   \t\n"
+        "add.s   %[out2], %[out2], %[t0]                                \t\n"
+        "lwc1    %[in3],  5*4(%[in])                                    \t\n"
+        "lwc1    %[in4],  1*4(%[in])                                    \t\n"
+        "add.s   %[out3], %[t2],   %[t1]                                \t\n"
+        "sub.s   %[out1], %[t3],   %[t1]                                \t\n"
+        "swc1    %[out2], 0(%[tmp])                                     \t\n"
+        "lwc1    %[in5],  13*4(%[in])                                   \t\n"
+        "add.s   %[t2],   %[in1],  %[in2]                               \t\n"
+        "sub.s   %[out3], %[out3], %[t0]                                \t\n"
+        "sub.s   %[out1], %[out1], %[t0]                                \t\n"
+        "add.s   %[t0],   %[in1],  %[in3]                               \t\n"
+        "madd.s  %[t3],   %[in4],  %[in5], %[c1]                        \t\n"
+        "sub.s   %[t2],   %[t2],   %[in3]                               \t\n"
+        "swc1    %[out3], 12*4(%[tmp])                                  \t\n"
+        "swc1    %[out1], 8*4(%[tmp])                                   \t\n"
+        "sub.s   %[t1],   %[in4],  %[in5]                               \t\n"
+        "mul.s   %[t0],   %[t0],   %[c2]                                \t\n"
+        "nmsub.s %[out1], %[t1],   %[t2],  %[c1]                        \t\n"
+        "add.s   %[out2], %[t1],   %[t2]                                \t\n"
+        "add.s   %[t2],   %[in2],  %[in3]                               \t\n"
+        "sub.s   %[t1],   %[in1],  %[in2]                               \t\n"
+        "sub.s   %[out3], %[t3],   %[t0]                                \t\n"
+        "swc1    %[out1], 7*4(%[tmp])                                   \t\n"
+        "swc1    %[out2], 17*4(%[tmp])                                  \t\n"
+        "mul.s   %[t2],   %[t2],   %[c3]                                \t\n"
+        "mul.s   %[t1],   %[t1],   %[c4]                                \t\n"
+        "add.s   %[out1], %[t3],   %[t0]                                \t\n"
+        "lwc1    %[in1],  11*4(%[in])                                   \t\n"
+        "lwc1    %[in2],  15*4(%[in])                                   \t\n"
+        "sub.s   %[out3], %[out3], %[t2]                                \t\n"
+        "add.s   %[out2], %[t3],   %[t2]                                \t\n"
+        "add.s   %[out1], %[out1], %[t1]                                \t\n"
+        "lwc1    %[in3],  3*4(%[in])                                    \t\n"
+        "lwc1    %[in4],  7*4(%[in])                                    \t\n"
+        "swc1    %[out3], 11*4(%[tmp])                                  \t\n"
+        "sub.s   %[out2], %[out2], %[t1]                                \t\n"
+        "swc1    %[out1], 3*4(%[tmp])                                   \t\n"
+        "add.s   %[out3], %[in1],  %[in2]                               \t\n"
+        "add.s   %[t2],   %[in1],  %[in3]                               \t\n"
+        "sub.s   %[t3],   %[in1],  %[in2]                               \t\n"
+        "swc1    %[out2], 15*4(%[tmp])                                  \t\n"
+        "mul.s   %[t0],   %[in4],  %[c8]                                \t\n"
+        "sub.s   %[out3], %[out3], %[in3]                               \t\n"
+        "mul.s   %[t2],   %[t2],   %[c6]                                \t\n"
+        "mul.s   %[t3],   %[t3],   %[c7]                                \t\n"
+        "add.s   %[t1],   %[in2],  %[in3]                               \t\n"
+        "mul.s   %[out3], %[out3], %[c5]                                \t\n"
+        "add.s   %[out1], %[t2],   %[t3]                                \t\n"
+        "mul.s   %[t1],   %[t1],   %[c9]                                \t\n"
+        "swc1    %[out3], 5*4(%[tmp])                                   \t\n"
+        "add.s   %[out1], %[out1], %[t0]                                \t\n"
+        "add.s   %[out2], %[t2],   %[t1]                                \t\n"
+        "sub.s   %[out3], %[t3],   %[t1]                                \t\n"
+        "swc1    %[out1], 1*4(%[tmp])                                   \t\n"
+        "sub.s   %[out2], %[out2], %[t0]                                \t\n"
+        "sub.s   %[out3], %[out3], %[t0]                                \t\n"
+        "swc1    %[out2], 13*4(%[tmp])                                  \t\n"
+        "swc1    %[out3], 9*4(%[tmp])                                   \t\n"
+
+        : [t0] "=&f" (t0), [t1] "=&f" (t1),
+          [t2] "=&f" (t2), [t3] "=&f" (t3),
+          [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5),
+          [out1] "=&f" (out1), [out2] "=&f" (out2),
+          [out3] "=&f" (out3),
+          [c1] "=&f" (c1), [c2] "=&f" (c2),
+          [c3] "=&f" (c3), [c4] "=&f" (c4),
+          [c5] "=&f" (c5), [c6] "=&f" (c6),
+          [c7] "=&f" (c7), [c8] "=&f" (c8),
+          [c9] "=&f" (c9)
+        : [in] "r" (in), [tmp] "r" (tmp)
+        : "memory"
+    );
+
+    /* loop 4 */
+    __asm__ volatile (
+        "lwc1   %[in1],  2*4(%[tmp])                                    \t\n"
+        "lwc1   %[in2],  0(%[tmp])                                      \t\n"
+        "lwc1   %[in3],  3*4(%[tmp])                                    \t\n"
+        "lwc1   %[in4],  1*4(%[tmp])                                    \t\n"
+        "li.s   %[c1],   0.50190991877167369479                         \t\n"
+        "li.s   %[c2],   5.73685662283492756461                         \t\n"
+        "add.s  %[s0],   %[in1], %[in2]                                 \t\n"
+        "sub.s  %[s2],   %[in1], %[in2]                                 \t\n"
+        "add.s  %[s1],   %[in3], %[in4]                                 \t\n"
+        "sub.s  %[s3],   %[in3], %[in4]                                 \t\n"
+        "lwc1   %[in1],  9*4(%[win])                                    \t\n"
+        "lwc1   %[in2],  4*9*4(%[buf])                                  \t\n"
+        "lwc1   %[in3],  8*4(%[win])                                    \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "mul.s  %[s3],   %[s3],  %[c2]                                  \t\n"
+        "lwc1   %[in4],  4*8*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  29*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  28*4(%[win])                                   \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "li.s   %[c1],   0.51763809020504152469                         \t\n"
+        "li.s   %[c2],   1.93185165257813657349                         \t\n"
+        "mul.s  %[out3], %[in5], %[t0]                                  \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out4], %[in6], %[t0]                                  \t\n"
+        "add.s  %[t0],   %[s2],  %[s3]                                  \t\n"
+        "swc1   %[out3], 4*9*4(%[buf])                                  \t\n"
+        "swc1   %[out1], 288*4(%[out])                                  \t\n"
+        "swc1   %[out2], 256*4(%[out])                                  \t\n"
+        "swc1   %[out4], 4*8*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s2],  %[s3]                                  \t\n"
+        "lwc1   %[in1],  17*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*17*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  0(%[win])                                      \t\n"
+        "lwc1   %[in4],  0(%[buf])                                      \t\n"
+        "lwc1   %[in5],  37*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  20*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "lwc1   %[in1],  6*4(%[tmp])                                    \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "swc1   %[out1], 544*4(%[out])                                  \t\n"
+        "lwc1   %[in2],  4*4(%[tmp])                                    \t\n"
+        "swc1   %[out2], 0(%[out])                                      \t\n"
+        "swc1   %[out3], 4*17*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 0(%[buf])                                      \t\n"
+        "lwc1   %[in3],  7*4(%[tmp])                                    \t\n"
+        "add.s  %[s0],   %[in1], %[in2]                                 \t\n"
+        "sub.s  %[s2],   %[in1], %[in2]                                 \t\n"
+        "lwc1   %[in4],  5*4(%[tmp])                                    \t\n"
+        "add.s  %[s1],   %[in3], %[in4]                                 \t\n"
+        "sub.s  %[s3],   %[in3], %[in4]                                 \t\n"
+        "lwc1   %[in1],  10*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*10*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  7*4(%[win])                                    \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "mul.s  %[s3],   %[s3],  %[c2]                                  \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "lwc1   %[in4],  4*7*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  30*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  27*4(%[win])                                   \t\n"
+        "li.s   %[c1],   0.55168895948124587824                         \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "add.s  %[t0],   %[s2],  %[s3]                                  \t\n"
+        "swc1   %[out1], 320*4(%[out])                                  \t\n"
+        "swc1   %[out2], 224*4(%[out])                                  \t\n"
+        "swc1   %[out3], 4*10*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*7*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s2],  %[s3]                                  \t\n"
+        "lwc1   %[in1],  16*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*16*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  1*4(%[win])                                    \t\n"
+        "lwc1   %[in4],  4*1*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  36*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  21*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "lwc1   %[in1],  10*4(%[tmp])                                   \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[in5], %[t0]                                  \t\n"
+        "mul.s  %[out4], %[in6], %[t0]                                  \t\n"
+        "swc1   %[out1], 512*4(%[out])                                  \t\n"
+        "lwc1   %[in2],  8*4(%[tmp])                                    \t\n"
+        "swc1   %[out2], 32*4(%[out])                                   \t\n"
+        "swc1   %[out3], 4*16*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*1*4(%[buf])                                  \t\n"
+        "li.s   %[c2],   1.18310079157624925896                         \t\n"
+        "add.s  %[s0],   %[in1], %[in2]                                 \t\n"
+        "sub.s  %[s2],   %[in1], %[in2]                                 \t\n"
+        "lwc1   %[in3],  11*4(%[tmp])                                   \t\n"
+        "lwc1   %[in4],  9*4(%[tmp])                                    \t\n"
+        "add.s  %[s1],   %[in3], %[in4]                                 \t\n"
+        "sub.s  %[s3],   %[in3], %[in4]                                 \t\n"
+        "lwc1   %[in1],  11*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*11*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  6*4(%[win])                                    \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "mul.s  %[s3],   %[s3],  %[c2]                                  \t\n"
+        "lwc1   %[in4],  4*6*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  31*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  26*4(%[win])                                   \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "add.s  %[t0],   %[s2],  %[s3]                                  \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "swc1   %[out3], 4*11*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*6*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s2],  %[s3]                                  \t\n"
+        "swc1   %[out1], 352*4(%[out])                                  \t\n"
+        "swc1   %[out2], 192*4(%[out])                                  \t\n"
+        "lwc1   %[in1],  15*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*15*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  2*4(%[win])                                    \t\n"
+        "lwc1   %[in4],  4*2*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  35*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  22*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "lwc1   %[in1],  14*4(%[tmp])                                   \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "swc1   %[out1], 480*4(%[out])                                  \t\n"
+        "lwc1   %[in2],  12*4(%[tmp])                                   \t\n"
+        "swc1   %[out2], 64*4(%[out])                                   \t\n"
+        "swc1   %[out3], 4*15*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*2*4(%[buf])                                  \t\n"
+        "lwc1   %[in3],  15*4(%[tmp])                                   \t\n"
+        "add.s  %[s0],   %[in1], %[in2]                                 \t\n"
+        "sub.s  %[s2],   %[in1], %[in2]                                 \t\n"
+        "lwc1   %[in4],  13*4(%[tmp])                                   \t\n"
+        "li.s   %[c1],   0.61038729438072803416                         \t\n"
+        "li.s   %[c2],   0.87172339781054900991                         \t\n"
+        "add.s  %[s1],   %[in3], %[in4]                                 \t\n"
+        "sub.s  %[s3],   %[in3], %[in4]                                 \t\n"
+        "lwc1   %[in1],  12*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*12*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  5*4(%[win])                                    \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "mul.s  %[s3],   %[s3],  %[c2]                                  \t\n"
+        "lwc1   %[in4],  4*5*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  32*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  25*4(%[win])                                   \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "lwc1   %[s0],   16*4(%[tmp])                                   \t\n"
+        "lwc1   %[s1],   17*4(%[tmp])                                   \t\n"
+        "li.s   %[c1],   0.70710678118654752439                         \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "add.s  %[t0],   %[s2],  %[s3]                                  \t\n"
+        "swc1   %[out3], 4*12*4(%[buf])                                 \t\n"
+        "swc1   %[out1], 384*4(%[out])                                  \t\n"
+        "swc1   %[out2], 160*4(%[out])                                  \t\n"
+        "swc1   %[out4], 4*5*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s2],  %[s3]                                  \t\n"
+        "lwc1   %[in1],  14*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*14*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  3*4(%[win])                                    \t\n"
+        "lwc1   %[in4],  4*3*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  34*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  23*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[in5], %[t0]                                  \t\n"
+        "mul.s  %[out4], %[in6], %[t0]                                  \t\n"
+        "swc1   %[out1], 448*4(%[out])                                  \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "swc1   %[out2], 96*4(%[out])                                   \t\n"
+        "swc1   %[out3], 4*14*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*3*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "lwc1   %[in1],  13*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*13*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  4*4(%[win])                                    \t\n"
+        "lwc1   %[in4],  4*4*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  33*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  24*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "swc1   %[out1], 416*4(%[out])                                  \t\n"
+        "swc1   %[out2], 128*4(%[out])                                  \t\n"
+        "swc1   %[out3], 4*13*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*4*4(%[buf])                                  \t\n"
+
+        : [c1] "=&f" (c1), [c2] "=&f" (c2),
+          [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5), [in6] "=&f" (in6),
+          [out1] "=&f" (out1), [out2] "=&f" (out2),
+          [out3] "=&f" (out3), [out4] "=&f" (out4),
+          [t0] "=&f" (t0), [t1] "=&f" (t1),
+          [t2] "=&f" (t2), [t3] "=&f" (t3),
+          [s0] "=&f" (s0), [s1] "=&f" (s1),
+          [s2] "=&f" (s2), [s3] "=&f" (s3)
+        : [tmp] "r" (tmp), [win] "r" (win),
+          [buf] "r" (buf), [out] "r" (out)
+        : "memory"
+    );
+}
+
+static void ff_imdct36_blocks_mips_float(float *out, float *buf, float *in,
+                               int count, int switch_point, int block_type)
+{
+    int j;
+    for (j=0 ; j < count; j++) {
+        /* apply window & overlap with previous buffer */
+
+        /* select window */
+        int win_idx = (switch_point && j < 2) ? 0 : block_type;
+        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];
+
+        imdct36_mips_float(out, buf, in, win);
+
+        in  += 18;
+        buf += ((j&3) != 3 ? 1 : (72-3));
+        out++;
+    }
+}
+
+void ff_mpadsp_init_mipsfpu(MPADSPContext *s)
+{
+    s->apply_window_float   = ff_mpadsp_apply_window_mips_float;
+    s->imdct36_blocks_float = ff_imdct36_blocks_mips_float;
+    s->dct32_float          = ff_dct32_mips_float;
+}
diff --git a/libavcodec/mips/mpegvideo_init_mips.c b/libavcodec/mips/mpegvideo_init_mips.c
new file mode 100644
index 0000000000..1918da5f46
--- /dev/null
+++ b/libavcodec/mips/mpegvideo_init_mips.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h263dsp_mips.h"
+#include "mpegvideo_mips.h"
+
+#if HAVE_MSA
+static av_cold void dct_unquantize_init_msa(MpegEncContext *s)
+{
+    s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_msa;
+    s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_msa;
+    if (!s->q_scale_type)
+        s->dct_unquantize_mpeg2_inter = ff_dct_unquantize_mpeg2_inter_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void dct_unquantize_init_mmi(MpegEncContext *s)
+{
+    s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_mmi;
+    s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_mmi;
+    s->dct_unquantize_mpeg1_intra = ff_dct_unquantize_mpeg1_intra_mmi;
+    s->dct_unquantize_mpeg1_inter = ff_dct_unquantize_mpeg1_inter_mmi;
+
+    if (!(s->avctx->flags & AV_CODEC_FLAG_BITEXACT))
+        if (!s->q_scale_type)
+            s->dct_unquantize_mpeg2_intra = ff_dct_unquantize_mpeg2_intra_mmi;
+
+    s->denoise_dct= ff_denoise_dct_mmi;
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_mpv_common_init_mips(MpegEncContext *s)
+{
+#if HAVE_MSA
+    dct_unquantize_init_msa(s);
+#endif  // #if HAVE_MSA
+#if HAVE_MMI
+    dct_unquantize_init_mmi(s);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/mpegvideo_mips.h b/libavcodec/mips/mpegvideo_mips.h
new file mode 100644
index 0000000000..decacd4c62
--- /dev/null
+++ b/libavcodec/mips/mpegvideo_mips.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef MPEGVIDEO_MIPS_H
+#define MPEGVIDEO_MIPS_H
+
+#include "libavcodec/mpegvideo.h"
+
+void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale);
+void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale);
+void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale);
+void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale);
+void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale);
+void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block);
+
+#endif /* MPEGVIDEO_MIPS_H */
diff --git a/libavcodec/mips/mpegvideo_mmi.c b/libavcodec/mips/mpegvideo_mmi.c
new file mode 100644
index 0000000000..94781e6e5c
--- /dev/null
+++ b/libavcodec/mips/mpegvideo_mmi.c
@@ -0,0 +1,443 @@
+/*
+ * Loongson SIMD optimized mpegvideo
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "mpegvideo_mips.h"
+
+void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{
+    int64_t level, qmul, qadd, nCoeffs;
+
+    qmul = qscale << 1;
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
+
+    if (!s->h263_aic) {
+        if (n<4)
+            level = block[0] * s->y_dc_scale;
+        else
+            level = block[0] * s->c_dc_scale;
+        qadd = (qscale-1) | 1;
+    } else {
+        qadd = 0;
+        level = block[0];
+    }
+
+    if(s->ac_pred)
+        nCoeffs = 63;
+    else
+        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+
+    __asm__ volatile (
+        "xor $f12, $f12, $f12           \r\n"
+        "lwc1 $f12, %1                  \n\r"
+        "xor $f10, $f10, $f10           \r\n"
+        "lwc1 $f10, %2                  \r\n"
+        "xor $f14, $f14, $f14           \r\n"
+        "packsswh $f12, $f12, $f12      \r\n"
+        "packsswh $f12, $f12, $f12      \r\n"
+        "packsswh $f10, $f10, $f10      \r\n"
+        "packsswh $f10, $f10, $f10      \r\n"
+        "psubh $f14, $f14, $f10         \r\n"
+        "xor $f8, $f8, $f8              \r\n"
+        ".p2align 4                     \r\n"
+        "1:                             \r\n"
+        "daddu $8, %0, %3               \r\n"
+        "gsldlc1 $f0, 7($8)             \r\n"
+        "gsldrc1 $f0, 0($8)             \r\n"
+        "gsldlc1 $f2, 15($8)            \r\n"
+        "gsldrc1 $f2, 8($8)             \r\n"
+        "mov.d $f4, $f0                 \r\n"
+        "mov.d $f6, $f2                 \r\n"
+        "pmullh $f0, $f0, $f12          \r\n"
+        "pmullh $f2, $f2, $f12          \r\n"
+        "pcmpgth $f4, $f4, $f8          \r\n"
+        "pcmpgth $f6, $f6, $f8          \r\n"
+        "xor $f0, $f0, $f4              \r\n"
+        "xor $f2, $f2, $f6              \r\n"
+        "paddh $f0, $f0, $f14           \r\n"
+        "paddh $f2, $f2, $f14           \r\n"
+        "xor $f4, $f4, $f0              \r\n"
+        "xor $f6, $f6, $f2              \r\n"
+        "pcmpeqh $f0, $f0, $f14         \r\n"
+        "pcmpeqh $f2, $f2, $f14         \r\n"
+        "pandn $f0, $f0, $f4            \r\n"
+        "pandn $f2, $f2, $f6            \r\n"
+        "gssdlc1 $f0, 7($8)             \r\n"
+        "gssdrc1 $f0, 0($8)             \r\n"
+        "gssdlc1 $f2, 15($8)            \r\n"
+        "gssdrc1 $f2, 8($8)             \r\n"
+        "addi %3, %3, 16                \r\n"
+        "blez %3, 1b                    \r\n"
+        ::"r"(block+nCoeffs),"m"(qmul),"m"(qadd),"r"(2*(-nCoeffs))
+        :"$8","memory"
+    );
+
+    block[0] = level;
+}
+
+void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{
+    int64_t qmul, qadd, nCoeffs;
+
+    qmul = qscale << 1;
+    qadd = (qscale - 1) | 1;
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
+    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+
+    __asm__ volatile (
+        "xor $f12, $f12, $f12           \r\n"
+        "lwc1 $f12, %1                  \r\n"
+        "xor $f10, $f10, $f10           \r\n"
+        "lwc1 $f10, %2                  \r\n"
+        "packsswh $f12, $f12, $f12      \r\n"
+        "packsswh $f12, $f12, $f12      \r\n"
+        "xor $f14, $f14, $f14           \r\n"
+        "packsswh $f10, $f10, $f10      \r\n"
+        "packsswh $f10, $f10, $f10      \r\n"
+        "psubh $f14, $f14, $f10         \r\n"
+        "xor $f8, $f8, $f8              \r\n"
+        ".p2align 4                     \r\n"
+        "1:                             \r\n"
+        "daddu $8, %0, %3               \r\n"
+        "gsldlc1 $f0, 7($8)             \r\n"
+        "gsldrc1 $f0, 0($8)             \r\n"
+        "gsldlc1 $f2, 15($8)            \r\n"
+        "gsldrc1 $f2, 8($8)             \r\n"
+        "mov.d $f4, $f0                 \r\n"
+        "mov.d $f6, $f2                 \r\n"
+        "pmullh $f0, $f0, $f12          \r\n"
+        "pmullh $f2, $f2, $f12          \r\n"
+        "pcmpgth $f4, $f4, $f8          \r\n"
+        "pcmpgth $f6, $f6, $f8          \r\n"
+        "xor $f0, $f0, $f4              \r\n"
+        "xor $f2, $f2, $f6              \r\n"
+        "paddh $f0, $f0, $f14           \r\n"
+        "paddh $f2, $f2, $f14           \r\n"
+        "xor $f4, $f4, $f0              \r\n"
+        "xor $f6, $f6, $f2              \r\n"
+        "pcmpeqh $f0, $f0, $f14         \r\n"
+        "pcmpeqh $f2, $f2, $f14         \r\n"
+        "pandn $f0, $f0, $f4            \r\n"
+        "pandn $f2, $f2, $f6            \r\n"
+        "gssdlc1 $f0, 7($8)             \r\n"
+        "gssdrc1 $f0, 0($8)             \r\n"
+        "gssdlc1 $f2, 15($8)            \r\n"
+        "gssdrc1 $f2, 8($8)             \r\n"
+        "addi %3, %3, 16                \r\n"
+        "blez %3, 1b                    \r\n"
+        ::"r"(block+nCoeffs),"m"(qmul),"m"(qadd),"r"(2*(-nCoeffs))
+        : "$8","memory"
+    );
+}
+
+void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{
+    int64_t nCoeffs;
+    const uint16_t *quant_matrix;
+    int block0;
+
+    av_assert2(s->block_last_index[n]>=0);
+    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
+
+    if (n<4)
+        block0 = block[0] * s->y_dc_scale;
+    else
+        block0 = block[0] * s->c_dc_scale;
+
+    /* XXX: only mpeg1 */
+    quant_matrix = s->intra_matrix;
+
+    __asm__ volatile (
+        "pcmpeqh $f14, $f14, $f14       \r\n"
+        "dli $10, 15                    \r\n"
+        "dmtc1 $10, $f16                \r\n"
+        "xor $f12, $f12, $f12           \r\n"
+        "lwc1 $f12, %2                  \r\n"
+        "psrlh $f14, $f14, $f16         \r\n"
+        "packsswh $f12, $f12, $f12      \r\n"
+        "packsswh $f12, $f12, $f12      \r\n"
+        "or $8, %3, $0                  \r\n"
+        ".p2align 4                     \r\n"
+        "1:                             \r\n"
+        "gsldxc1 $f0, 0($8, %0)         \r\n"
+        "gsldxc1 $f2, 8($8, %0)         \r\n"
+        "mov.d $f16, $f0                \r\n"
+        "mov.d $f18, $f2                \r\n"
+        "gsldxc1 $f8, 0($8, %1)         \r\n"
+        "gsldxc1 $f10, 8($8, %1)        \r\n"
+        "pmullh $f8, $f8, $f12          \r\n"
+        "pmullh $f10, $f10, $f12        \r\n"
+        "xor $f4, $f4, $f4              \r\n"
+        "xor $f6, $f6, $f6              \r\n"
+        "pcmpgth $f4, $f4, $f0          \r\n"
+        "pcmpgth $f6, $f6, $f2          \r\n"
+        "xor $f0, $f0, $f4              \r\n"
+        "xor $f2, $f2, $f6              \r\n"
+        "psubh $f0, $f0, $f4            \r\n"
+        "psubh $f2, $f2, $f6            \r\n"
+        "pmullh $f0, $f0, $f8           \r\n"
+        "pmullh $f2, $f2, $f10          \r\n"
+        "xor $f8, $f8, $f8              \r\n"
+        "xor $f10, $f10, $f10           \r\n"
+        "pcmpeqh $f8, $f8, $f16         \r\n"
+        "pcmpeqh $f10, $f10, $f18       \r\n"
+        "dli $10, 3                     \r\n"
+        "dmtc1 $10, $f16                \r\n"
+        "psrah $f0, $f0, $f16           \r\n"
+        "psrah $f2, $f2, $f16           \r\n"
+        "psubh $f0, $f0, $f14           \r\n"
+        "psubh $f2, $f2, $f14           \r\n"
+        "or $f0, $f0, $f14              \r\n"
+        "or $f2, $f2, $f14              \r\n"
+        "xor $f0, $f0, $f4              \r\n"
+        "xor $f2, $f2, $f6              \r\n"
+        "psubh $f0, $f0, $f4            \r\n"
+        "psubh $f2, $f2, $f6            \r\n"
+        "pandn $f8, $f8, $f0            \r\n"
+        "pandn $f10, $f10, $f2          \r\n"
+        "gssdxc1 $f8, 0($8, %0)         \r\n"
+        "gssdxc1 $f10, 8($8, %0)        \r\n"
+        "addi $8, $8, 16                \r\n"
+        "bltz $8, 1b                    \r\n"
+        ::"r"(block+nCoeffs),"r"(quant_matrix+nCoeffs),"m"(qscale),
+          "g"(-2*nCoeffs)
+        : "$8","$10","memory"
+    );
+
+    block[0] = block0;
+}
+
+void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{
+    int64_t nCoeffs;
+    const uint16_t *quant_matrix;
+
+    av_assert2(s->block_last_index[n] >= 0);
+    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
+    quant_matrix = s->inter_matrix;
+
+    __asm__ volatile (
+        "pcmpeqh $f14, $f14, $f14       \r\n"
+        "dli $10, 15                    \r\n"
+        "dmtc1 $10, $f16                \r\n"
+        "xor $f12, $f12, $f12           \r\n"
+        "lwc1 $f12, %2                  \r\n"
+        "psrlh $f14, $f14, $f16         \r\n"
+        "packsswh $f12, $f12, $f12      \r\n"
+        "packsswh $f12, $f12, $f12      \r\n"
+        "or $8, %3, $0                  \r\n"
+        ".p2align 4                     \r\n"
+        "1:                             \r\n"
+        "gsldxc1 $f0, 0($8, %0)         \r\n"
+        "gsldxc1 $f2, 8($8, %0)         \r\n"
+        "mov.d $f16, $f0                \r\n"
+        "mov.d $f18, $f2                \r\n"
+        "gsldxc1 $f8, 0($8, %1)         \r\n"
+        "gsldxc1 $f10, 8($8, %1)        \r\n"
+        "pmullh $f8, $f8, $f12          \r\n"
+        "pmullh $f10, $f10, $f12        \r\n"
+        "xor $f4, $f4, $f4              \r\n"
+        "xor $f6, $f6, $f6              \r\n"
+        "pcmpgth $f4, $f4, $f0          \r\n"
+        "pcmpgth $f6, $f6, $f2          \r\n"
+        "xor $f0, $f0, $f4              \r\n"
+        "xor $f2, $f2, $f6              \r\n"
+        "psubh $f0, $f0, $f4            \r\n"
+        "psubh $f2, $f2, $f6            \r\n"
+        "paddh $f0, $f0, $f0            \r\n"
+        "paddh $f2, $f2, $f2            \r\n"
+        "paddh $f0, $f0, $f14           \r\n"
+        "paddh $f2, $f2, $f14           \r\n"
+        "pmullh $f0, $f0, $f8           \r\n"
+        "pmullh $f2, $f2, $f10          \r\n"
+        "xor $f8, $f8, $f8              \r\n"
+        "xor $f10, $f10, $f10           \r\n"
+        "pcmpeqh $f8, $f8, $f16         \r\n"
+        "pcmpeqh $f10, $f10, $f18       \r\n"
+        "dli $10, 4                     \r\n"
+        "dmtc1 $10, $f16                \r\n"
+        "psrah $f0, $f0, $f16           \r\n"
+        "psrah $f2, $f2, $f16           \r\n"
+        "psubh $f0, $f0, $f14           \r\n"
+        "psubh $f2, $f2, $f14           \r\n"
+        "or $f0, $f0, $f14              \r\n"
+        "or $f2, $f2, $f14              \r\n"
+        "xor $f0, $f0, $f4              \r\n"
+        "xor $f2, $f2, $f6              \r\n"
+        "psubh $f0, $f0, $f4            \r\n"
+        "psubh $f2, $f2, $f6            \r\n"
+        "pandn $f8, $f8, $f0            \r\n"
+        "pandn $f10, $f10, $f2          \r\n"
+        "gssdxc1 $f8, 0($8, %0)         \r\n"
+        "gssdxc1 $f10, 8($8, %0)        \r\n"
+        "addi $8, $8, 16                \r\n"
+        "bltz $8, 1b                    \r\n"
+        ::"r"(block+nCoeffs),"r"(quant_matrix+nCoeffs),"m"(qscale),
+          "g"(-2*nCoeffs)
+        :"$8","$10","memory"
+    );
+}
+
+void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block)
+{
+    const int intra = s->mb_intra;
+    int *sum = s->dct_error_sum[intra];
+    uint16_t *offset = s->dct_offset[intra];
+
+    s->dct_count[intra]++;
+
+    __asm__ volatile(
+        "xor $f14, $f14, $f14               \r\n"
+        "1:                                 \r\n"
+        "ldc1 $f4, 0(%[block])              \r\n"
+        "xor $f0, $f0, $f0                  \r\n"
+        "ldc1 $f6, 8(%[block])              \r\n"
+        "xor $f2, $f2, $f2                  \r\n"
+        "pcmpgth $f0, $f0, $f4              \r\n"
+        "pcmpgth $f2, $f2, $f6              \r\n"
+        "xor $f4, $f4, $f0                  \r\n"
+        "xor $f6, $f6, $f2                  \r\n"
+        "psubh $f4, $f4, $f0                \r\n"
+        "psubh $f6, $f6, $f2                \r\n"
+        "ldc1 $f12, 0(%[offset])            \r\n"
+        "mov.d $f8, $f4                     \r\n"
+        "psubush $f4, $f4, $f12             \r\n"
+        "ldc1 $f12, 8(%[offset])            \r\n"
+        "mov.d $f10, $f6                    \r\n"
+        "psubush $f6, $f6, $f12             \r\n"
+        "xor $f4, $f4, $f0                  \r\n"
+        "xor $f6, $f6, $f2                  \r\n"
+        "psubh $f4, $f4, $f0                \r\n"
+        "psubh $f6, $f6, $f2                \r\n"
+        "sdc1 $f4, 0(%[block])              \r\n"
+        "sdc1 $f6, 8(%[block])              \r\n"
+        "mov.d $f4, $f8                     \r\n"
+        "mov.d $f6, $f10                    \r\n"
+        "punpcklhw $f8, $f8, $f14           \r\n"
+        "punpckhhw $f4, $f4, $f14           \r\n"
+        "punpcklhw $f10, $f10, $f14         \r\n"
+        "punpckhhw $f6, $f6, $f14           \r\n"
+        "ldc1 $f0, 0(%[sum])                \r\n"
+        "paddw $f8, $f8, $f0                \r\n"
+        "ldc1 $f0, 8(%[sum])                \r\n"
+        "paddw $f4, $f4, $f0                \r\n"
+        "ldc1 $f0, 16(%[sum])               \r\n"
+        "paddw $f10, $f10, $f0              \r\n"
+        "ldc1 $f0, 24(%[sum])               \r\n"
+        "paddw $f6, $f6, $f0                \r\n"
+        "sdc1 $f8, 0(%[sum])                \r\n"
+        "sdc1 $f4, 8(%[sum])                \r\n"
+        "sdc1 $f10, 16(%[sum])              \r\n"
+        "sdc1 $f6, 24(%[sum])               \r\n"
+        "daddiu %[block], %[block], 16      \r\n"
+        "daddiu %[sum], %[sum], 32          \r\n"
+        "daddiu %[offset], %[offset], 16    \r\n"
+        "dsubu $8, %[block1], %[block]      \r\n"
+        "bgtz $8, 1b                        \r\n"
+        : [block]"+r"(block),[sum]"+r"(sum),[offset]"+r"(offset)
+        : [block1]"r"(block+64)
+        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14"
+    );
+}
+
+void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{
+    uint64_t nCoeffs;
+    const uint16_t *quant_matrix;
+    int block0;
+
+    assert(s->block_last_index[n]>=0);
+
+    if (s->alternate_scan)
+        nCoeffs = 63;
+    else
+        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
+
+    if (n < 4)
+        block0 = block[0] * s->y_dc_scale;
+    else
+        block0 = block[0] * s->c_dc_scale;
+
+    quant_matrix = s->intra_matrix;
+
+    __asm__ volatile (
+        "pcmpeqh $f14, $f14, $f14           \r\n"
+        "dli $10, 15                        \r\n"
+        "dmtc1 $10, $f16                    \r\n"
+        "xor $f12, $f12, $f12               \r\n"
+        "lwc1 $f12, %[qscale]               \r\n"
+        "psrlh $f14, $f14, $f16             \r\n"
+        "packsswh $f12, $f12, $f12          \r\n"
+        "packsswh $f12, $f12, $f12          \r\n"
+        "or $8, %[ncoeffs], $0              \r\n"
+        ".p2align 4                         \r\n"
+        "1:                                 \r\n"
+        "gsldxc1 $f0, 0($8, %[block])       \r\n"
+        "gsldxc1 $f2, 8($8, %[block])       \r\n"
+        "mov.d $f16, $f0                    \r\n"
+        "mov.d $f18, $f2                    \r\n"
+        "gsldxc1 $f8, 0($8, %[quant])       \r\n"
+        "gsldxc1 $f10, 0($8, %[quant])      \r\n"
+        "pmullh $f8, $f8, $f12              \r\n"
+        "pmullh $f10, $f10, $f12            \r\n"
+        "xor $f4, $f4, $f4                  \r\n"
+        "xor $f6, $f6, $f6                  \r\n"
+        "pcmpgth $f4, $f4, $f0              \r\n"
+        "pcmpgth $f6, $f6, $f2              \r\n"
+        "xor $f0, $f0, $f4                  \r\n"
+        "xor $f2, $f2, $f6                  \r\n"
+        "psubh $f0, $f0, $f4                \r\n"
+        "psubh $f2, $f2, $f6                \r\n"
+        "pmullh $f0, $f0, $f8               \r\n"
+        "pmullh $f2, $f2, $f10              \r\n"
+        "xor $f8, $f8, $f8                  \r\n"
+        "xor $f10, $f10, $f10               \r\n"
+        "pcmpeqh $f8, $f8, $f16             \r\n"
+        "pcmpeqh $f10 ,$f10, $f18           \r\n"
+        "dli $10, 3                         \r\n"
+        "dmtc1 $10, $f16                    \r\n"
+        "psrah $f0, $f0, $f16               \r\n"
+        "psrah $f2, $f2, $f16               \r\n"
+        "xor $f0, $f0, $f4                  \r\n"
+        "xor $f2, $f2, $f6                  \r\n"
+        "psubh $f0, $f0, $f4                \r\n"
+        "psubh $f2, $f2, $f6                \r\n"
+        "pandn $f8, $f8, $f0                \r\n"
+        "pandn $f10, $f10, $f2              \r\n"
+        "gssdxc1 $f8, 0($8, %[block])       \r\n"
+        "gssdxc1 $f10, 8($8, %[block])      \r\n"
+        "daddiu $8, $8, 16                  \r\n"
+        "blez $8, 1b                        \r\n"
+        ::[block]"r"(block+nCoeffs),[quant]"r"(quant_matrix+nCoeffs),
+          [qscale]"m"(qscale),[ncoeffs]"g"(-2*nCoeffs)
+        : "$8","$10","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
+          "$f18"
+    );
+
+    block[0]= block0;
+}
diff --git a/libavcodec/mips/mpegvideo_msa.c b/libavcodec/mips/mpegvideo_msa.c
new file mode 100644
index 0000000000..aa9ef770eb
--- /dev/null
+++ b/libavcodec/mips/mpegvideo_msa.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h263dsp_mips.h"
+
+static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul,
+                                    int16_t qadd, int8_t n_coeffs,
+                                    uint8_t loop_start)
+{
+    int16_t *block_dup = block;
+    int32_t level, cnt;
+    v8i16 block_vec, qmul_vec, qadd_vec, sub;
+    v8i16 add, mask, mul, zero_mask;
+
+    qmul_vec = __msa_fill_h(qmul);
+    qadd_vec = __msa_fill_h(qadd);
+    for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) {
+        block_vec = LD_SH(block_dup + loop_start);
+        mask = __msa_clti_s_h(block_vec, 0);
+        zero_mask = __msa_ceqi_h(block_vec, 0);
+        mul = block_vec * qmul_vec;
+        sub = mul - qadd_vec;
+        add = mul + qadd_vec;
+        add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8) mask);
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec,
+                                         (v16u8) zero_mask);
+        ST_SH(block_vec, block_dup + loop_start);
+        block_dup += 8;
+    }
+
+    cnt = ((n_coeffs >> 3) * 8) + loop_start;
+
+    for (; cnt <= n_coeffs; cnt++) {
+        level = block[cnt];
+        if (level) {
+            if (level < 0) {
+                level = level * qmul - qadd;
+            } else {
+                level = level * qmul + qadd;
+            }
+            block[cnt] = level;
+        }
+    }
+}
+
+static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block,
+                                              int32_t qscale,
+                                              const int16_t *quant_matrix)
+{
+    int32_t cnt, sum_res = -1;
+    v8i16 block_vec, block_neg, qscale_vec, mask;
+    v8i16 block_org0, block_org1, block_org2, block_org3;
+    v8i16 quant_m0, quant_m1, quant_m2, quant_m3;
+    v8i16 sum, mul, zero_mask;
+    v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l;
+    v4i32 block_l, block_r, sad;
+
+    qscale_vec = __msa_fill_h(qscale);
+    for (cnt = 0; cnt < 2; cnt++) {
+        LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3);
+        LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3);
+        mask = __msa_clti_s_h(block_org0, 0);
+        zero_mask = __msa_ceqi_h(block_org0, 0);
+        block_neg = -block_org0;
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg,
+                                         (v16u8) mask);
+        block_vec <<= 1;
+        block_vec += 1;
+        UNPCK_SH_SW(block_vec, block_r, block_l);
+        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
+        UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l);
+        mul_vec = block_l * qscale_l;
+        mul_vec *= quant_m_l;
+        block_l = mul_vec >> 4;
+        mul_vec = block_r * qscale_r;
+        mul_vec *= quant_m_r;
+        block_r = mul_vec >> 4;
+        mul = (v8i16) __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
+        block_neg = - mul;
+        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
+                                   (v16u8) mask);
+        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0,
+                                   (v16u8) zero_mask);
+        ST_SH(sum, block);
+        block += 8;
+        quant_matrix += 8;
+        sad = __msa_hadd_s_w(sum, sum);
+        sum_res += HADD_SW_S32(sad);
+        mask = __msa_clti_s_h(block_org1, 0);
+        zero_mask = __msa_ceqi_h(block_org1, 0);
+        block_neg = - block_org1;
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg,
+                                         (v16u8) mask);
+        block_vec <<= 1;
+        block_vec += 1;
+        UNPCK_SH_SW(block_vec, block_r, block_l);
+        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
+        UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l);
+        mul_vec = block_l * qscale_l;
+        mul_vec *= quant_m_l;
+        block_l = mul_vec >> 4;
+        mul_vec = block_r * qscale_r;
+        mul_vec *= quant_m_r;
+        block_r = mul_vec >> 4;
+        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
+        block_neg = - mul;
+        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
+                                   (v16u8) mask);
+        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1,
+                                   (v16u8) zero_mask);
+        ST_SH(sum, block);
+
+        block += 8;
+        quant_matrix += 8;
+        sad = __msa_hadd_s_w(sum, sum);
+        sum_res += HADD_SW_S32(sad);
+        mask = __msa_clti_s_h(block_org2, 0);
+        zero_mask = __msa_ceqi_h(block_org2, 0);
+        block_neg = - block_org2;
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg,
+                                         (v16u8) mask);
+        block_vec <<= 1;
+        block_vec += 1;
+        UNPCK_SH_SW(block_vec, block_r, block_l);
+        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
+        UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l);
+        mul_vec = block_l * qscale_l;
+        mul_vec *= quant_m_l;
+        block_l = mul_vec >> 4;
+        mul_vec = block_r * qscale_r;
+        mul_vec *= quant_m_r;
+        block_r = mul_vec >> 4;
+        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
+        block_neg = - mul;
+        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
+                                   (v16u8) mask);
+        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2,
+                                   (v16u8) zero_mask);
+        ST_SH(sum, block);
+
+        block += 8;
+        quant_matrix += 8;
+        sad = __msa_hadd_s_w(sum, sum);
+        sum_res += HADD_SW_S32(sad);
+        mask = __msa_clti_s_h(block_org3, 0);
+        zero_mask = __msa_ceqi_h(block_org3, 0);
+        block_neg = - block_org3;
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg,
+                                         (v16u8) mask);
+        block_vec <<= 1;
+        block_vec += 1;
+        UNPCK_SH_SW(block_vec, block_r, block_l);
+        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
+        UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l);
+        mul_vec = block_l * qscale_l;
+        mul_vec *= quant_m_l;
+        block_l = mul_vec >> 4;
+        mul_vec = block_r * qscale_r;
+        mul_vec *= quant_m_r;
+        block_r = mul_vec >> 4;
+        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
+        block_neg = - mul;
+        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
+                                   (v16u8) mask);
+        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3,
+                                   (v16u8) zero_mask);
+        ST_SH(sum, block);
+
+        block += 8;
+        quant_matrix += 8;
+        sad = __msa_hadd_s_w(sum, sum);
+        sum_res += HADD_SW_S32(sad);
+    }
+
+    return sum_res;
+}
+
+void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
+                                      int16_t *block, int32_t index,
+                                      int32_t qscale)
+{
+    int32_t qmul, qadd;
+    int32_t nCoeffs;
+
+    av_assert2(s->block_last_index[index] >= 0 || s->h263_aic);
+
+    qmul = qscale << 1;
+
+    if (!s->h263_aic) {
+        block[0] *= index < 4 ? s->y_dc_scale : s->c_dc_scale;
+        qadd = (qscale - 1) | 1;
+    } else {
+        qadd = 0;
+    }
+    if (s->ac_pred)
+        nCoeffs = 63;
+    else
+        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];
+
+    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1);
+}
+
+void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
+                                      int16_t *block, int32_t index,
+                                      int32_t qscale)
+{
+    int32_t qmul, qadd;
+    int32_t nCoeffs;
+
+    av_assert2(s->block_last_index[index] >= 0);
+
+    qadd = (qscale - 1) | 1;
+    qmul = qscale << 1;
+
+    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];
+
+    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0);
+}
+
+void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s,
+                                       int16_t *block, int32_t index,
+                                       int32_t qscale)
+{
+    const uint16_t *quant_matrix;
+    int32_t sum = -1;
+
+    quant_matrix = s->inter_matrix;
+
+    sum = mpeg2_dct_unquantize_inter_msa(block, qscale, quant_matrix);
+
+    block[63] ^= sum & 1;
+}
diff --git a/libavcodec/mips/mpegvideoencdsp_init_mips.c b/libavcodec/mips/mpegvideoencdsp_init_mips.c
new file mode 100644
index 0000000000..9bfe94e4cd
--- /dev/null
+++ b/libavcodec/mips/mpegvideoencdsp_init_mips.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/bit_depth_template.c"
+#include "h263dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void mpegvideoencdsp_init_msa(MpegvideoEncDSPContext *c,
+                                             AVCodecContext *avctx)
+{
+#if BIT_DEPTH == 8
+    c->pix_sum = ff_pix_sum_msa;
+#endif
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c,
+                                          AVCodecContext *avctx)
+{
+#if HAVE_MSA
+    mpegvideoencdsp_init_msa(c, avctx);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/mpegvideoencdsp_msa.c b/libavcodec/mips/mpegvideoencdsp_msa.c
new file mode 100644
index 0000000000..46473dafe5
--- /dev/null
+++ b/libavcodec/mips/mpegvideoencdsp_msa.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h263dsp_mips.h"
+#include "libavutil/mips/generic_macros_msa.h"
+
+static int32_t sum_u8src_16width_msa(uint8_t *src, int32_t stride)
+{
+    uint32_t sum = 0;
+    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16u8 in8, in9, in10, in11, in12, in13, in14, in15;
+
+    LD_UB8(src, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+    src += (8 * stride);
+    LD_UB8(src, stride, in8, in9, in10, in11, in12, in13, in14, in15);
+
+    HADD_UB4_UB(in0, in1, in2, in3, in0, in1, in2, in3);
+    HADD_UB4_UB(in4, in5, in6, in7, in4, in5, in6, in7);
+    HADD_UB4_UB(in8, in9, in10, in11, in8, in9, in10, in11);
+    HADD_UB4_UB(in12, in13, in14, in15, in12, in13, in14, in15);
+
+    sum = HADD_UH_U32(in0);
+    sum += HADD_UH_U32(in1);
+    sum += HADD_UH_U32(in2);
+    sum += HADD_UH_U32(in3);
+    sum += HADD_UH_U32(in4);
+    sum += HADD_UH_U32(in5);
+    sum += HADD_UH_U32(in6);
+    sum += HADD_UH_U32(in7);
+    sum += HADD_UH_U32(in8);
+    sum += HADD_UH_U32(in9);
+    sum += HADD_UH_U32(in10);
+    sum += HADD_UH_U32(in11);
+    sum += HADD_UH_U32(in12);
+    sum += HADD_UH_U32(in13);
+    sum += HADD_UH_U32(in14);
+    sum += HADD_UH_U32(in15);
+
+    return sum;
+}
+
+int ff_pix_sum_msa(uint8_t *pix, int line_size)
+{
+    return sum_u8src_16width_msa(pix, line_size);
+}
diff --git a/libavcodec/mips/pixblockdsp_init_mips.c b/libavcodec/mips/pixblockdsp_init_mips.c
new file mode 100644
index 0000000000..1b3741ea76
--- /dev/null
+++ b/libavcodec/mips/pixblockdsp_init_mips.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "pixblockdsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void pixblockdsp_init_msa(PixblockDSPContext *c,
+                                         AVCodecContext *avctx,
+                                         unsigned high_bit_depth)
+{
+    c->diff_pixels = ff_diff_pixels_msa;
+
+    switch (avctx->bits_per_raw_sample) {
+    case 9:
+    case 10:
+    case 12:
+    case 14:
+        c->get_pixels = ff_get_pixels_16_msa;
+        break;
+    default:
+        if (avctx->bits_per_raw_sample <= 8 || avctx->codec_type !=
+            AVMEDIA_TYPE_VIDEO) {
+            c->get_pixels = ff_get_pixels_8_msa;
+        }
+        break;
+    }
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void pixblockdsp_init_mmi(PixblockDSPContext *c,
+        AVCodecContext *avctx, unsigned high_bit_depth)
+{
+    c->diff_pixels = ff_diff_pixels_mmi;
+
+    if (!high_bit_depth || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
+        c->get_pixels = ff_get_pixels_8_mmi;
+    }
+}
+#endif /* HAVE_MMI */
+
+void ff_pixblockdsp_init_mips(PixblockDSPContext *c, AVCodecContext *avctx,
+                              unsigned high_bit_depth)
+{
+#if HAVE_MSA
+    pixblockdsp_init_msa(c, avctx, high_bit_depth);
+#endif  // #if HAVE_MSA
+#if HAVE_MMI
+    pixblockdsp_init_mmi(c, avctx, high_bit_depth);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/pixblockdsp_mips.h b/libavcodec/mips/pixblockdsp_mips.h
new file mode 100644
index 0000000000..7f8cc96683
--- /dev/null
+++ b/libavcodec/mips/pixblockdsp_mips.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H
+#define AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H
+
+#include "../mpegvideo.h"
+
+void ff_diff_pixels_msa(int16_t *av_restrict block, const uint8_t *src1,
+                        const uint8_t *src2, int stride);
+void ff_get_pixels_16_msa(int16_t *restrict dst, const uint8_t *src,
+                          ptrdiff_t stride);
+void ff_get_pixels_8_msa(int16_t *restrict dst, const uint8_t *src,
+                         ptrdiff_t stride);
+
+void ff_get_pixels_8_mmi(int16_t *av_restrict block, const uint8_t *pixels,
+        ptrdiff_t line_size);
+void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1,
+        const uint8_t *src2, int stride);
+
+#endif  // #ifndef AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H
diff --git a/libavcodec/mips/pixblockdsp_mmi.c b/libavcodec/mips/pixblockdsp_mmi.c
new file mode 100644
index 0000000000..30631d8035
--- /dev/null
+++ b/libavcodec/mips/pixblockdsp_mmi.c
@@ -0,0 +1,79 @@
+/*
+ * Loongson SIMD optimized pixblockdsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "pixblockdsp_mips.h"
+
+void ff_get_pixels_8_mmi(int16_t *av_restrict block, const uint8_t *pixels,
+        ptrdiff_t line_size)
+{
+    __asm__ volatile (
+        "move $8, $0                    \n\t"
+        "xor $f0, $f0, $f0              \n\t"
+        "1:                             \n\t"
+        "gsldlc1 $f2, 7(%1)             \n\t"
+        "gsldrc1 $f2, 0(%1)             \n\t"
+        "punpcklbh $f4, $f2, $f0        \n\t"
+        "punpckhbh $f6, $f2, $f0        \n\t"
+        "gssdxc1 $f4, 0(%0, $8)         \n\t"
+        "gssdxc1 $f6, 8(%0, $8)         \n\t"
+        "daddiu $8, $8, 16              \n\t"
+        "daddu %1, %1, %2               \n\t"
+        "daddi %3, %3, -1               \n\t"
+        "bnez %3, 1b                    \n\t"
+        ::"r"((uint8_t *)block),"r"(pixels),"r"(line_size),"r"(8)
+        : "$8","memory"
+    );
+}
+
+void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1,
+        const uint8_t *src2, int stride)
+{
+    __asm__ volatile (
+        "dli $2, 8                     \n\t"
+        "xor $f14, $f14, $f14          \n\t"
+        "1:                            \n\t"
+        "gsldlc1 $f0, 7(%1)            \n\t"
+        "gsldrc1 $f0, 0(%1)            \n\t"
+        "or $f2, $f0, $f0              \n\t"
+        "gsldlc1 $f4, 7(%2)            \n\t"
+        "gsldrc1 $f4, 0(%2)            \n\t"
+        "or $f6, $f4, $f4              \n\t"
+        "punpcklbh $f0, $f0, $f14      \n\t"
+        "punpckhbh $f2, $f2, $f14      \n\t"
+        "punpcklbh $f4, $f4, $f14      \n\t"
+        "punpckhbh $f6, $f6, $f14      \n\t"
+        "psubh $f0, $f0, $f4           \n\t"
+        "psubh $f2, $f2, $f6           \n\t"
+        "gssdlc1 $f0, 7(%0)            \n\t"
+        "gssdrc1 $f0, 0(%0)            \n\t"
+        "gssdlc1 $f2, 15(%0)           \n\t"
+        "gssdrc1 $f2, 8(%0)            \n\t"
+        "daddi %0, %0, 16              \n\t"
+        "daddu %1, %1, %3              \n\t"
+        "daddu %2, %2, %3              \n\t"
+        "daddi $2, $2, -1              \n\t"
+        "bgtz $2, 1b                   \n\t"
+        ::"r"(block),"r"(src1),"r"(src2),"r"(stride)
+        : "$2","memory"
+    );
+}
diff --git a/libavcodec/mips/pixblockdsp_msa.c b/libavcodec/mips/pixblockdsp_msa.c
new file mode 100644
index 0000000000..966e11a7f5
--- /dev/null
+++ b/libavcodec/mips/pixblockdsp_msa.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "pixblockdsp_mips.h"
+
+static void diff_pixels_msa(int16_t *block, const uint8_t *src1,
+                            const uint8_t *src2, int32_t stride)
+{
+    v16u8 in10, in11, in12, in13, in14, in15, in16, in17;
+    v16u8 in20, in21, in22, in23, in24, in25, in26, in27;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+
+    LD_UB8(src1, stride, in10, in11, in12, in13, in14, in15, in16, in17);
+    LD_UB8(src2, stride, in20, in21, in22, in23, in24, in25, in26, in27);
+    ILVR_B4_SH(in10, in20, in11, in21, in12, in22, in13, in23,
+               out0, out1, out2, out3);
+    ILVR_B4_SH(in14, in24, in15, in25, in16, in26, in17, in27,
+               out4, out5, out6, out7);
+    HSUB_UB4_SH(out0, out1, out2, out3, out0, out1, out2, out3);
+    HSUB_UB4_SH(out4, out5, out6, out7, out4, out5, out6, out7);
+    ST_SH8(out0, out1, out2, out3, out4, out5, out6, out7, block, 8);
+}
+
+static void copy_8bit_to_16bit_width8_msa(const uint8_t *src, int32_t src_stride,
+                                          int16_t *dst, int32_t dst_stride,
+                                          int32_t height)
+{
+    uint8_t *dst_ptr;
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3;
+    v16i8 zero = { 0 };
+
+    dst_ptr = (uint8_t *) dst;
+
+    for (cnt = (height >> 2); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        ILVR_B4_UB(zero, src0, zero, src1, zero, src2, zero, src3,
+                   src0, src1, src2, src3);
+
+        ST_UB4(src0, src1, src2, src3, dst_ptr, (dst_stride * 2));
+        dst_ptr += (4 * 2 * dst_stride);
+    }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t cnt, loop_cnt;
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_UB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst_tmp, dst_stride);
+            dst_tmp += (8 * dst_stride);
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst, dst_stride);
+            dst += (8 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+void ff_get_pixels_16_msa(int16_t *av_restrict dest, const uint8_t *src,
+                          ptrdiff_t stride)
+{
+    copy_width16_msa(src, stride, (uint8_t *) dest, 16, 8);
+}
+
+void ff_get_pixels_8_msa(int16_t *av_restrict dest, const uint8_t *src,
+                         ptrdiff_t stride)
+{
+    copy_8bit_to_16bit_width8_msa(src, stride, dest, 8, 8);
+}
+
+void ff_diff_pixels_msa(int16_t *av_restrict block, const uint8_t *src1,
+                        const uint8_t *src2, int stride)
+{
+    diff_pixels_msa(block, src1, src2, stride);
+}
diff --git a/libavcodec/mips/qpeldsp_init_mips.c b/libavcodec/mips/qpeldsp_init_mips.c
new file mode 100644
index 0000000000..140e8f89c9
--- /dev/null
+++ b/libavcodec/mips/qpeldsp_init_mips.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "qpeldsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void qpeldsp_init_msa(QpelDSPContext *c)
+{
+    c->put_qpel_pixels_tab[0][0] = ff_copy_16x16_msa;
+    c->put_qpel_pixels_tab[0][1] = ff_horiz_mc_qpel_aver_src0_16width_msa;
+    c->put_qpel_pixels_tab[0][2] = ff_horiz_mc_qpel_16width_msa;
+    c->put_qpel_pixels_tab[0][3] = ff_horiz_mc_qpel_aver_src1_16width_msa;
+    c->put_qpel_pixels_tab[0][4] = ff_vert_mc_qpel_aver_src0_16x16_msa;
+    c->put_qpel_pixels_tab[0][5] = ff_hv_mc_qpel_aver_hv_src00_16x16_msa;
+    c->put_qpel_pixels_tab[0][6] = ff_hv_mc_qpel_aver_v_src0_16x16_msa;
+    c->put_qpel_pixels_tab[0][7] = ff_hv_mc_qpel_aver_hv_src10_16x16_msa;
+    c->put_qpel_pixels_tab[0][8] = ff_vert_mc_qpel_16x16_msa;
+    c->put_qpel_pixels_tab[0][9] = ff_hv_mc_qpel_aver_h_src0_16x16_msa;
+    c->put_qpel_pixels_tab[0][10] = ff_hv_mc_qpel_16x16_msa;
+    c->put_qpel_pixels_tab[0][11] = ff_hv_mc_qpel_aver_h_src1_16x16_msa;
+    c->put_qpel_pixels_tab[0][12] = ff_vert_mc_qpel_aver_src1_16x16_msa;
+    c->put_qpel_pixels_tab[0][13] = ff_hv_mc_qpel_aver_hv_src01_16x16_msa;
+    c->put_qpel_pixels_tab[0][14] = ff_hv_mc_qpel_aver_v_src1_16x16_msa;
+    c->put_qpel_pixels_tab[0][15] = ff_hv_mc_qpel_aver_hv_src11_16x16_msa;
+
+    c->put_qpel_pixels_tab[1][0] = ff_copy_8x8_msa;
+    c->put_qpel_pixels_tab[1][1] = ff_horiz_mc_qpel_aver_src0_8width_msa;
+    c->put_qpel_pixels_tab[1][2] = ff_horiz_mc_qpel_8width_msa;
+    c->put_qpel_pixels_tab[1][3] = ff_horiz_mc_qpel_aver_src1_8width_msa;
+    c->put_qpel_pixels_tab[1][4] = ff_vert_mc_qpel_aver_src0_8x8_msa;
+    c->put_qpel_pixels_tab[1][5] = ff_hv_mc_qpel_aver_hv_src00_8x8_msa;
+    c->put_qpel_pixels_tab[1][6] = ff_hv_mc_qpel_aver_v_src0_8x8_msa;
+    c->put_qpel_pixels_tab[1][7] = ff_hv_mc_qpel_aver_hv_src10_8x8_msa;
+    c->put_qpel_pixels_tab[1][8] = ff_vert_mc_qpel_8x8_msa;
+    c->put_qpel_pixels_tab[1][9] = ff_hv_mc_qpel_aver_h_src0_8x8_msa;
+    c->put_qpel_pixels_tab[1][10] = ff_hv_mc_qpel_8x8_msa;
+    c->put_qpel_pixels_tab[1][11] = ff_hv_mc_qpel_aver_h_src1_8x8_msa;
+    c->put_qpel_pixels_tab[1][12] = ff_vert_mc_qpel_aver_src1_8x8_msa;
+    c->put_qpel_pixels_tab[1][13] = ff_hv_mc_qpel_aver_hv_src01_8x8_msa;
+    c->put_qpel_pixels_tab[1][14] = ff_hv_mc_qpel_aver_v_src1_8x8_msa;
+    c->put_qpel_pixels_tab[1][15] = ff_hv_mc_qpel_aver_hv_src11_8x8_msa;
+
+    c->put_no_rnd_qpel_pixels_tab[0][0] = ff_copy_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][1] =
+        ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][2] = ff_horiz_mc_qpel_no_rnd_16width_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][3] =
+        ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][4] =
+        ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][5] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][6] =
+        ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][7] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][8] = ff_vert_mc_qpel_no_rnd_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][9] =
+        ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][10] = ff_hv_mc_qpel_no_rnd_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][11] =
+        ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][12] =
+        ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][13] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][14] =
+        ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][15] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa;
+
+    c->put_no_rnd_qpel_pixels_tab[1][0] = ff_copy_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][1] =
+        ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][2] = ff_horiz_mc_qpel_no_rnd_8width_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][3] =
+        ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][4] =
+        ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][5] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][6] =
+        ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][7] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][8] = ff_vert_mc_qpel_no_rnd_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][9] =
+        ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][10] = ff_hv_mc_qpel_no_rnd_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][11] =
+        ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][12] =
+        ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][13] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][14] =
+        ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][15] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa;
+
+    c->avg_qpel_pixels_tab[0][0] = ff_avg_width16_msa;
+    c->avg_qpel_pixels_tab[0][1] =
+        ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa;
+    c->avg_qpel_pixels_tab[0][2] = ff_horiz_mc_qpel_avg_dst_16width_msa;
+    c->avg_qpel_pixels_tab[0][3] =
+        ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa;
+    c->avg_qpel_pixels_tab[0][4] = ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa;
+    c->avg_qpel_pixels_tab[0][5] =
+        ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa;
+    c->avg_qpel_pixels_tab[0][6] = ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa;
+    c->avg_qpel_pixels_tab[0][7] =
+        ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa;
+    c->avg_qpel_pixels_tab[0][8] = ff_vert_mc_qpel_avg_dst_16x16_msa;
+    c->avg_qpel_pixels_tab[0][9] = ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa;
+    c->avg_qpel_pixels_tab[0][10] = ff_hv_mc_qpel_avg_dst_16x16_msa;
+    c->avg_qpel_pixels_tab[0][11] = ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa;
+    c->avg_qpel_pixels_tab[0][12] = ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa;
+    c->avg_qpel_pixels_tab[0][13] =
+        ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa;
+    c->avg_qpel_pixels_tab[0][14] = ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa;
+    c->avg_qpel_pixels_tab[0][15] =
+        ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa;
+
+    c->avg_qpel_pixels_tab[1][0] = ff_avg_width8_msa;
+    c->avg_qpel_pixels_tab[1][1] =
+        ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa;
+    c->avg_qpel_pixels_tab[1][2] = ff_horiz_mc_qpel_avg_dst_8width_msa;
+    c->avg_qpel_pixels_tab[1][3] =
+        ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa;
+    c->avg_qpel_pixels_tab[1][4] = ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa;
+    c->avg_qpel_pixels_tab[1][5] = ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa;
+    c->avg_qpel_pixels_tab[1][6] = ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa;
+    c->avg_qpel_pixels_tab[1][7] = ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa;
+    c->avg_qpel_pixels_tab[1][8] = ff_vert_mc_qpel_avg_dst_8x8_msa;
+    c->avg_qpel_pixels_tab[1][9] = ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa;
+    c->avg_qpel_pixels_tab[1][10] = ff_hv_mc_qpel_avg_dst_8x8_msa;
+    c->avg_qpel_pixels_tab[1][11] = ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa;
+    c->avg_qpel_pixels_tab[1][12] = ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa;
+    c->avg_qpel_pixels_tab[1][13] = ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa;
+    c->avg_qpel_pixels_tab[1][14] = ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa;
+    c->avg_qpel_pixels_tab[1][15] = ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa;
+}
+#endif  // #if HAVE_MSA
+
+void ff_qpeldsp_init_mips(QpelDSPContext *c)
+{
+#if HAVE_MSA
+    qpeldsp_init_msa(c);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/qpeldsp_mips.h b/libavcodec/mips/qpeldsp_mips.h
new file mode 100644
index 0000000000..704d221331
--- /dev/null
+++ b/libavcodec/mips/qpeldsp_mips.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_QPELDSP_MIPS_H
+#define AVCODEC_MIPS_QPELDSP_MIPS_H
+
+#include "../mpegvideo.h"
+
+void ff_copy_8x8_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_copy_16x16_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_width8_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_width16_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dst, const uint8_t *src,
+                                            ptrdiff_t stride);
+void ff_horiz_mc_qpel_8width_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_horiz_mc_qpel_16width_msa(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride);
+void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dst, const uint8_t *src,
+                                            ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dst, const uint8_t *src,
+                                        ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dst,
+                                                    const uint8_t *src,
+                                                    ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dst, const uint8_t *src,
+                                          ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dst,
+                                                    const uint8_t *src,
+                                                    ptrdiff_t stride);
+
+void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_vert_mc_qpel_8x8_msa(uint8_t *dst, const uint8_t *src,
+                             ptrdiff_t stride);
+void ff_vert_mc_qpel_16x16_msa(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride);
+void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                    ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                     ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+
+void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_hv_mc_qpel_16x16_msa(uint8_t *dst, const uint8_t *src,
+                             ptrdiff_t stride);
+void ff_hv_mc_qpel_8x8_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                     ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                    ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+
+#endif  // #ifndef AVCODEC_MIPS_QPELDSP_MIPS_H
diff --git a/libavcodec/mips/qpeldsp_msa.c b/libavcodec/mips/qpeldsp_msa.c
new file mode 100644
index 0000000000..4710b3f732
--- /dev/null
+++ b/libavcodec/mips/qpeldsp_msa.c
@@ -0,0 +1,6518 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "qpeldsp_mips.h"
+
+#define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2)  \
+( {                                                                     \
+    v16u8 out, tmp0, tmp1;                                              \
+    v16u8 data0, data1, data2, data3, data4, data5;                     \
+    v8i16 res_r, res_l;                                                 \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
+                                                                        \
+    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1);         \
+    ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l);                            \
+    data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15);       \
+    data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1);        \
+    HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l);                        \
+    ILVRL_B2_UH(data3, data0, sum1_r, sum1_l);                          \
+    data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14);       \
+    data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2);        \
+    sum0_r *= (v8u16) (coef0);                                          \
+    sum0_l *= (v8u16) (coef0);                                          \
+    ILVRL_B2_UH(data4, data1, sum2_r, sum2_l);                          \
+    data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13);       \
+    data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3);        \
+    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
+    ILVRL_B2_UH(data5, data2, sum3_r, sum3_l);                          \
+    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
+    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
+    res_r = (v8i16) (sum0_r - sum3_r);                                  \
+    res_l = (v8i16) (sum0_l - sum3_l);                                  \
+    SRARI_H2_SH(res_r, res_l, 5);                                       \
+    CLIP_SH2_0_255(res_r, res_l);                                       \
+    out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
+                                                                        \
+    out;                                                                \
+} )
+
+#define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,                       \
+                                      mask0, mask1, mask2, mask3,       \
+                                      coef0, coef1, coef2)              \
+( {                                                                     \
+    v16u8 out;                                                          \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum4_r, sum5_r, sum6_r, sum7_r;                               \
+    v8i16 res0_r, res1_r;                                               \
+                                                                        \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r);   \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r);   \
+    HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r);                        \
+    DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r);          \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r);   \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r);   \
+    DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r);         \
+    DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r);         \
+    res0_r = (v8i16) (sum0_r - sum3_r);                                 \
+    res1_r = (v8i16) (sum4_r - sum7_r);                                 \
+    SRARI_H2_SH(res0_r, res1_r, 5);                                     \
+    CLIP_SH2_0_255(res0_r, res1_r);                                     \
+    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);        \
+                                                                        \
+    out;                                                                \
+} )
+
+#define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,                        \
+                                           mask0, mask1, mask2, mask3,  \
+                                           coef0, coef1, coef2)         \
+( {                                                                     \
+    v16u8 out;                                                          \
+    v8i16 res0_r;                                                       \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+                                                                        \
+    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r);   \
+    sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r);            \
+    sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0);             \
+    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r);   \
+    DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r);         \
+    res0_r = (v8i16) (sum0_r - sum3_r);                                 \
+    res0_r = __msa_srari_h(res0_r, 5);                                  \
+    res0_r = CLIP_SH_0_255(res0_r);                                     \
+    out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);        \
+                                                                        \
+    out;                                                                \
+} )
+
+#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,   \
+                                                    mask2, mask3, coef0,  \
+                                                    coef1, coef2)         \
+( {                                                                       \
+    v16u8 out;                                                            \
+    v8i16 res0_r;                                                         \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                                 \
+                                                                          \
+    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r);     \
+    sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r);              \
+    sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0);               \
+    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r);     \
+    DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r);           \
+    res0_r = (v8i16) (sum0_r - sum3_r);                                   \
+    res0_r += 15;                                                         \
+    res0_r >>= 5;                                                         \
+    res0_r = CLIP_SH_0_255(res0_r);                                       \
+    out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);          \
+                                                                          \
+    out;                                                                  \
+} )
+
+#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,              \
+                                         coef0, coef1, coef2)           \
+( {                                                                     \
+    v16u8 out, tmp0, tmp1;                                              \
+    v16u8 data0, data1, data2, data3, data4, data5;                     \
+    v8i16 res_r, res_l;                                                 \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
+                                                                        \
+    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1);         \
+    ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l);                            \
+    data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15);       \
+    data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1);        \
+    HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l);                        \
+    ILVRL_B2_UH(data3, data0, sum1_r, sum1_l);                          \
+    data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14);       \
+    data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2);        \
+    sum0_r *= (v8u16) (coef0);                                          \
+    sum0_l *= (v8u16) (coef0);                                          \
+    ILVRL_B2_UH(data4, data1, sum2_r, sum2_l);                          \
+    data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13);       \
+    data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3);        \
+    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
+    ILVRL_B2_UH(data5, data2, sum3_r, sum3_l);                          \
+    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
+    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
+    res_r = (v8i16) (sum0_r - sum3_r);                                  \
+    res_l = (v8i16) (sum0_l - sum3_l);                                  \
+    res_r += 15;                                                        \
+    res_l += 15;                                                        \
+    res_r >>= 5;                                                        \
+    res_l >>= 5;                                                        \
+    CLIP_SH2_0_255(res_r, res_l);                                       \
+    out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
+                                                                        \
+    out;                                                                \
+} )
+
+#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1,                  \
+                                               mask0, mask1, mask2, mask3,  \
+                                               coef0, coef1, coef2)         \
+( {                                                                         \
+    v16u8 out;                                                              \
+    v8i16 res0_r, res1_r;                                                   \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                                   \
+    v8u16 sum4_r, sum5_r, sum6_r, sum7_r;                                   \
+                                                                            \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r);       \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r);       \
+    HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r);                            \
+    DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r);              \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r);       \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r);       \
+    DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r);             \
+    DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r);             \
+    res0_r = (v8i16) (sum0_r - sum3_r);                                     \
+    res1_r = (v8i16) (sum4_r - sum7_r);                                     \
+    res0_r += 15;                                                           \
+    res1_r += 15;                                                           \
+    res0_r >>= 5;                                                           \
+    res1_r >>= 5;                                                           \
+    CLIP_SH2_0_255(res0_r, res1_r);                                         \
+    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);            \
+                                                                            \
+    out;                                                                    \
+} )
+
+#define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3,                  \
+                               inp4, inp5, inp6, inp7,                  \
+                               coef0, coef1, coef2)                     \
+( {                                                                     \
+    v16u8 res;                                                          \
+    v8i16 res_r, res_l;                                                 \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
+                                                                        \
+    ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l);                            \
+    ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l);                            \
+    DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l);          \
+    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
+    ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l);                            \
+    ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l);                            \
+    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
+    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
+    res_r = (v8i16) (sum0_r - sum3_r);                                  \
+    res_l = (v8i16) (sum0_l - sum3_l);                                  \
+    SRARI_H2_SH(res_r, res_l, 5);                                       \
+    CLIP_SH2_0_255(res_r, res_l);                                       \
+    res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
+                                                                        \
+    res;                                                                \
+} )
+
+#define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03,        \
+                                     inp04, inp05, inp06, inp07,        \
+                                     inp10, inp11, inp12, inp13,        \
+                                     inp14, inp15, inp16, inp17,        \
+                                     coef0, coef1, coef2)               \
+( {                                                                     \
+    v16u8 res;                                                          \
+    v8i16 val0, val1;                                                   \
+    v8u16 sum00, sum01, sum02, sum03;                                   \
+    v8u16 sum10, sum11, sum12, sum13;                                   \
+                                                                        \
+    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13,  \
+               sum00, sum10, sum03, sum13);                             \
+    DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10);              \
+    HADD_UB2_UH(sum03, sum13, sum03, sum13);                            \
+    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11,  \
+               sum02, sum12, sum01, sum11);                             \
+    DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10);             \
+    DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13);             \
+    val0 = (v8i16) (sum00 - sum03);                                     \
+    val1 = (v8i16) (sum10 - sum13);                                     \
+    SRARI_H2_SH(val0, val1, 5);                                         \
+    CLIP_SH2_0_255(val0, val1);                                         \
+    res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0);            \
+                                                                        \
+    res;                                                                \
+} )
+
+#define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3,         \
+                                        inp4, inp5, inp6, inp7,         \
+                                        coef0, coef1, coef2)            \
+( {                                                                     \
+    v16u8 res;                                                          \
+    v8i16 res_r, res_l;                                                 \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
+                                                                        \
+    ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l);                            \
+    ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l);                            \
+    DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l);          \
+    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
+    ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l);                            \
+    ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l);                            \
+    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
+    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
+    res_r = (v8i16) (sum0_r - sum3_r);                                  \
+    res_l = (v8i16) (sum0_l - sum3_l);                                  \
+    res_r += 15;                                                        \
+    res_l += 15;                                                        \
+    res_r >>= 5;                                                        \
+    res_l >>= 5;                                                        \
+    CLIP_SH2_0_255(res_r, res_l);                                       \
+    res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
+                                                                        \
+    res;                                                                \
+} )
+
+#define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03,  \
+                                              inp04, inp05, inp06, inp07,  \
+                                              inp10, inp11, inp12, inp13,  \
+                                              inp14, inp15, inp16, inp17,  \
+                                              coef0, coef1, coef2)         \
+( {                                                                        \
+    v16u8 res;                                                             \
+    v8i16 val0, val1;                                                      \
+    v8u16 sum00, sum01, sum02, sum03;                                      \
+    v8u16 sum10, sum11, sum12, sum13;                                      \
+                                                                           \
+    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13,     \
+               sum00, sum10, sum03, sum13);                                \
+    DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10);                 \
+    HADD_UB2_UH(sum03, sum13, sum03, sum13);                               \
+    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11,     \
+               sum02, sum12, sum01, sum11);                                \
+    DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10);                \
+    DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13);                \
+    val0 = (v8i16) (sum00 - sum03);                                        \
+    val1 = (v8i16) (sum10 - sum13);                                        \
+    val0 += 15;                                                            \
+    val1 += 15;                                                            \
+    val0 >>= 5;                                                            \
+    val1 >>= 5;                                                            \
+    CLIP_SH2_0_255(val0, val1);                                            \
+    res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0);               \
+                                                                           \
+    res;                                                                   \
+} )
+
+static void horiz_mc_qpel_aver_src0_8width_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride,
+                                               int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_aver_src0_16width_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                uint8_t *dst,
+                                                int32_t dst_stride,
+                                                int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp0, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp2, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp4, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp6, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_8width_msa(const uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_16width_msa(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_aver_src1_8width_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride,
+                                               int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+        SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_aver_src1_16width_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                uint8_t *dst,
+                                                int32_t dst_stride,
+                                                int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp1);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp5);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp7);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_aver_src0_8width_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        res0 = __msa_ave_u_b(inp0, res0);
+        res1 = __msa_ave_u_b(inp2, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_aver_src0_16width_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp0, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp2, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp4, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp6, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_8width_msa(const uint8_t *src,
+                                            int32_t src_stride,
+                                            uint8_t *dst,
+                                            int32_t dst_stride,
+                                            int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_16width_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_aver_src1_8width_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+        SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        res0 = __msa_ave_u_b(inp0, res0);
+        res1 = __msa_ave_u_b(inp2, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_aver_src1_16width_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp1);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp5);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp7);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_avg_dst_aver_src0_8width_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_avg_dst_aver_src0_16width_msa(const uint8_t *src,
+                                                        int32_t src_stride,
+                                                        uint8_t *dst,
+                                                        int32_t dst_stride,
+                                                        int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res0, res1;
+    v16u8 dst0, dst1;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(inp4, res0, inp6, res1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_avg_dst_8width_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_avg_dst_16width_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res0, res1;
+    v16u8 dst0, dst1;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_avg_dst_aver_src1_8width_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 res0, res1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+        SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_avg_dst_aver_src1_16width_msa(const uint8_t *src,
+                                                        int32_t src_stride,
+                                                        uint8_t *dst,
+                                                        int32_t dst_stride,
+                                                        int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res0, res1, dst0, dst1;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(res0, inp1, res1, inp3, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(res0, inp5, res1, inp7, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+
+static void vert_mc_qpel_aver_src0_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp0);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp1);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp2);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    LD_UB2(src, src_stride, inp8, inp9);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp4);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp5);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    LD_UB2(src, src_stride, inp10, inp11);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp6);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp7);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    LD_UB2(src, src_stride, inp12, inp13);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp8);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp9);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    LD_UB2(src, src_stride, inp14, inp15);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp10);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp11);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp12);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp13);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp14);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp15);
+    ST_UB(res0, dst);
+}
+
+static void vert_mc_qpel_8x8_msa(const uint8_t *src,
+                                 int32_t src_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void vert_mc_qpel_16x16_msa(const uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    inp4 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+}
+
+static void vert_mc_qpel_aver_src1_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    inp4 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp1);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp2);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp4);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp5);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp6);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp7);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp8);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp9);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp10);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp11);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp12);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp13);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp14);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp15);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp16);
+    ST_UB(res0, dst);
+}
+
+static void vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                                 inp1, inp2, inp3, inp4,
+                                                 inp1, inp0, inp0, inp1,
+                                                 inp2, inp3, inp4, inp5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                                 inp3, inp4, inp5, inp6,
+                                                 inp3, inp2, inp1, inp0,
+                                                 inp4, inp5, inp6, inp7,
+                                                 const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    res0 = __msa_ave_u_b(res0, tmp0);
+    res1 = __msa_ave_u_b(res1, tmp1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                                 inp5, inp6, inp7, inp8,
+                                                 inp5, inp4, inp3, inp2,
+                                                 inp6, inp7, inp8, inp8,
+                                                 const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                                 inp7, inp8, inp8, inp7,
+                                                 inp7, inp6, inp5, inp4,
+                                                 inp8, inp8, inp7, inp6,
+                                                 const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
+    res0 = __msa_ave_u_b(res0, tmp0);
+    res1 = __msa_ave_u_b(res1, tmp1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
+                                           inp1, inp2, inp3, inp4,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp0);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
+                                           inp2, inp3, inp4, inp5,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp1);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
+                                           inp3, inp4, inp5, inp6,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp2);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
+                                           inp4, inp5, inp6, inp7,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
+                                           inp5, inp6, inp7, inp8,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp4);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
+                                           inp6, inp7, inp8, inp9,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp5);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
+                                           inp7, inp8, inp9, inp10,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp6);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
+                                           inp8, inp9, inp10, inp11,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp7);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
+                                           inp9, inp10, inp11, inp12,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp8);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
+                                           inp10, inp11, inp12, inp13,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp9);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
+                                           inp11, inp12, inp13, inp14,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp10);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
+                                           inp12, inp13, inp14, inp15,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp11);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
+                                           inp13, inp14, inp15, inp16,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp12);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
+                                           inp14, inp15, inp16, inp16,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp13);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
+                                           inp15, inp16, inp16, inp15,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp14);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
+                                           inp16, inp16, inp15, inp14,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp15);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+}
+
+static void vert_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
+                                        int32_t src_stride,
+                                        uint8_t *dst,
+                                        int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                                 inp1, inp2, inp3, inp4,
+                                                 inp1, inp0, inp0, inp1,
+                                                 inp2, inp3, inp4, inp5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                                 inp3, inp4, inp5, inp6,
+                                                 inp3, inp2, inp1, inp0,
+                                                 inp4, inp5, inp6, inp7,
+                                                 const20, const6, const3);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                                 inp5, inp6, inp7, inp8,
+                                                 inp5, inp4, inp3, inp2,
+                                                 inp6, inp7, inp8, inp8,
+                                                 const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                                 inp7, inp8, inp8, inp7,
+                                                 inp7, inp6, inp5, inp4,
+                                                 inp8, inp8, inp7, inp6,
+                                                 const20, const6, const3);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
+                                           inp1, inp2, inp3, inp4,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
+                                           inp2, inp3, inp4, inp5,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
+                                           inp3, inp4, inp5, inp6,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
+                                           inp4, inp5, inp6, inp7,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
+                                           inp5, inp6, inp7, inp8,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
+                                           inp6, inp7, inp8, inp9,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
+                                           inp7, inp8, inp9, inp10,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
+                                           inp8, inp9, inp10, inp11,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
+                                           inp9, inp10, inp11, inp12,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
+                                           inp10, inp11, inp12, inp13,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
+                                           inp11, inp12, inp13, inp14,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
+                                           inp12, inp13, inp14, inp15,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
+                                           inp13, inp14, inp15, inp16,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
+                                           inp14, inp15, inp16, inp16,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
+                                           inp15, inp16, inp16, inp15,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
+                                           inp16, inp16, inp15, inp14,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+}
+
+static void vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                                 inp1, inp2, inp3, inp4,
+                                                 inp1, inp0, inp0, inp1,
+                                                 inp2, inp3, inp4, inp5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                                 inp3, inp4, inp5, inp6,
+                                                 inp3, inp2, inp1, inp0,
+                                                 inp4, inp5, inp6, inp7,
+                                                 const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
+    res0 = __msa_ave_u_b(res0, tmp0);
+    res1 = __msa_ave_u_b(res1, tmp1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                                 inp5, inp6, inp7, inp8,
+                                                 inp5, inp4, inp3, inp2,
+                                                 inp6, inp7, inp8, inp8,
+                                                 const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                                 inp7, inp8, inp8, inp7,
+                                                 inp7, inp6, inp5, inp4,
+                                                 inp8, inp8, inp7, inp6,
+                                                 const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
+    res0 = __msa_ave_u_b(res0, tmp0);
+    res1 = __msa_ave_u_b(res1, tmp1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
+                                           inp1, inp2, inp3, inp4,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp1);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
+                                           inp2, inp3, inp4, inp5,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp2);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
+                                           inp3, inp4, inp5, inp6,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
+                                           inp4, inp5, inp6, inp7,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp4);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
+                                           inp5, inp6, inp7, inp8,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp5);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
+                                           inp6, inp7, inp8, inp9,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp6);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
+                                           inp7, inp8, inp9, inp10,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp7);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
+                                           inp8, inp9, inp10, inp11,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp8);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
+                                           inp9, inp10, inp11, inp12,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp9);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
+                                           inp10, inp11, inp12, inp13,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp10);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
+                                           inp11, inp12, inp13, inp14,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp11);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
+                                           inp12, inp13, inp14, inp15,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp12);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
+                                           inp13, inp14, inp15, inp16,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp13);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
+                                           inp14, inp15, inp16, inp16,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp14);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
+                                           inp15, inp16, inp16, inp15,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp15);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
+                                           inp16, inp16, inp15, inp14,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp16);
+    ST_UB(res0, dst);
+}
+
+static void vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0, res1, dst0, dst1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp0, res1, inp1, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp2, res1, inp3, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp8, inp9);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp4, res1, inp5, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp10, inp11);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp6, res1, inp7, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp12, inp13);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp8, res1, inp9, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp14, inp15);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_8x8_msa(const uint8_t *src,
+                                         int32_t src_stride,
+                                         uint8_t *dst,
+                                         int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0, res1, dst0, dst1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_aver_src1_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0, res1, dst0, dst1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp1, res1, inp2, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp3, res1, inp4, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp5, res1, inp6, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp7, res1, inp8, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp9, res1, inp10, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp16 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp0, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp2, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp4, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp6, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                           const20, const6, const3);
+    res = __msa_ave_u_b(inp0, res);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_ave_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_ave_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                           const20, const6, const3);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                         mask2, mask3, const20,
+                                                         const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_ave_u_b(avg1, res1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp1);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp5);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp7);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                           const20, const6, const3);
+    res = __msa_ave_u_b(inp1, res);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
+                                        int32_t src_stride,
+                                        uint8_t *dst,
+                                        int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                         mask2, mask3, const20,
+                                                         const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_ave_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_ave_u_b(avg0, res0);
+
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_ave_u_b(avg0, res0);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_ave_u_b(avg1, res1);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                         mask2, mask3, const20,
+                                                         const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_ave_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_ave_u_b(avg0, res0);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp0, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp2, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp4, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp6, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
+    res = __msa_aver_u_b(inp0, res);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_aver_hv_src00_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_aver_u_b(avg1, res1);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src,
+                                            int32_t src_stride,
+                                            uint8_t *dst,
+                                            int32_t dst_stride,
+                                            int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_aver_v_src0_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_aver_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 int32_t height)
+{
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp1);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp5);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp7);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
+    res = __msa_aver_u_b(inp1, res);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_aver_hv_src10_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_aver_u_b(avg1, res1);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_16x16_msa(const uint8_t *src,
+                                 int32_t src_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
+    res1 = __msa_aver_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+}
+
+static void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
+    res1 = __msa_aver_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+    avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
+    res0 = __msa_aver_u_b(avg0, res0);
+
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                         mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_aver_u_b(avg0, res0);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
+
+}
+
+static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride,
+                                       uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_aver_u_b(avg0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_aver_u_b(avg1, res1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
+                                        horiz6, horiz7, horiz8, horiz5, horiz4,
+                                        horiz3, horiz2, horiz6, horiz7, horiz8,
+                                        horiz8, const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
+                                        horiz8, horiz8, horiz7, horiz7, horiz6,
+                                        horiz5, horiz4, horiz8, horiz8, horiz7,
+                                        horiz6, const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_aver_u_b(avg0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_aver_u_b(avg1, res1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride)
+{
+    uint8_t buff[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, horiz1,
+                                        horiz2, horiz3, horiz4, horiz1, horiz0,
+                                        horiz0, horiz1, horiz2, horiz3, horiz4,
+                                        horiz5, const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3,
+                                        horiz4, horiz5, horiz6, horiz3, horiz2,
+                                        horiz1, horiz0, horiz4, horiz5, horiz6,
+                                        horiz7, const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
+                                        horiz6, horiz7, horiz8, horiz5, horiz4,
+                                        horiz3, horiz2, horiz6, horiz7, horiz8,
+                                        horiz8, const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
+                                        horiz8, horiz8, horiz7, horiz7, horiz6,
+                                        horiz5, horiz4, horiz8, horiz8, horiz7,
+                                        horiz6, const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void copy_8x8_msa(const uint8_t *src, int32_t src_stride,
+                         uint8_t *dst, int32_t dst_stride)
+{
+    uint64_t src0, src1;
+    int32_t loop_cnt;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        src0 = LD(src);
+        src += src_stride;
+        src1 = LD(src);
+        src += src_stride;
+
+        SD(src0, dst);
+        dst += dst_stride;
+        SD(src1, dst);
+        dst += dst_stride;
+    }
+}
+
+static void copy_16x16_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    LD_UB8(src, src_stride,
+           src8, src9, src10, src11, src12, src13, src14, src15);
+
+    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
+           dst, dst_stride);
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    for (cnt = (height / 4); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+
+        out0 = __msa_copy_u_d((v2i64) dst0, 0);
+        out1 = __msa_copy_u_d((v2i64) dst1, 0);
+        out2 = __msa_copy_u_d((v2i64) dst2, 0);
+        out3 = __msa_copy_u_d((v2i64) dst3, 0);
+        SD4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    for (cnt = (height / 8); cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                    dst4, dst5, dst6, dst7);
+        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+void ff_copy_16x16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
+{
+    copy_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_copy_8x8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
+{
+    copy_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{
+    horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dest,
+                                            const uint8_t *src,
+                                            ptrdiff_t stride)
+{
+    horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_8width_msa(uint8_t *dest, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_16width_msa(uint8_t *dest,
+                                  const uint8_t *src, ptrdiff_t stride)
+{
+    horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{
+    horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dest,
+                                            const uint8_t *src,
+                                            ptrdiff_t stride)
+{
+    horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dest,
+                                        const uint8_t *src, ptrdiff_t stride)
+{
+    horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_avg_width8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
+{
+    avg_width8_msa(src, stride, dest, stride, 8);
+}
+
+void ff_avg_width16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
+{
+    avg_width16_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dest,
+                                                    const uint8_t *src,
+                                                    ptrdiff_t stride)
+{
+    horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dest,
+                                          const uint8_t *src, ptrdiff_t stride)
+{
+    horiz_mc_qpel_avg_dst_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dest,
+                                                    const uint8_t *src,
+                                                    ptrdiff_t stride)
+{
+    horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16);
+}
+
+
+void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
+                             ptrdiff_t stride)
+{
+    vert_mc_qpel_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
+                               ptrdiff_t stride)
+{
+    vert_mc_qpel_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{
+    vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
+                                    const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
+                                      const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{
+    vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{
+    vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
+                                     const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    vert_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{
+    vert_mc_qpel_avg_dst_aver_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(src, stride, dest, stride);
+}
+
+/* HV cases */
+void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src00_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src00_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_v_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_v_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src10_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src10_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_h_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_h_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
+                             ptrdiff_t stride)
+{
+    hv_mc_qpel_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
+                           ptrdiff_t stride)
+{
+    hv_mc_qpel_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_h_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_h_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src01_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src01_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_v_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_v_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src11_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_aver_hv_src11_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
+                                     const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
+                                   const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{
+    hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
+                                    const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
+                                  const uint8_t *src, ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{
+    hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(src, stride, dest, stride);
+}
diff --git a/libavcodec/mips/sbrdsp_mips.c b/libavcodec/mips/sbrdsp_mips.c
new file mode 100644
index 0000000000..c203095548
--- /dev/null
+++ b/libavcodec/mips/sbrdsp_mips.c
@@ -0,0 +1,907 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * AAC Spectral Band Replication decoding functions optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/sbrdsp.c
+ */
+
+#include "config.h"
+#include "libavcodec/sbrdsp.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void sbr_qmf_pre_shuffle_mips(float *z)
+{
+    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6;
+    float *z1 = &z[66];
+    float *z2 = &z[59];
+    float *z3 = &z[2];
+    float *z4 = z1 + 60;
+
+    /* loop unrolled 5 times */
+    __asm__ volatile (
+        "lui    %[Temp6],   0x8000                  \n\t"
+    "1:                                             \n\t"
+        "lw     %[Temp1],   0(%[z2])                \n\t"
+        "lw     %[Temp2],   4(%[z2])                \n\t"
+        "lw     %[Temp3],   8(%[z2])                \n\t"
+        "lw     %[Temp4],   12(%[z2])               \n\t"
+        "lw     %[Temp5],   16(%[z2])               \n\t"
+        "xor    %[Temp1],   %[Temp1],   %[Temp6]    \n\t"
+        "xor    %[Temp2],   %[Temp2],   %[Temp6]    \n\t"
+        "xor    %[Temp3],   %[Temp3],   %[Temp6]    \n\t"
+        "xor    %[Temp4],   %[Temp4],   %[Temp6]    \n\t"
+        "xor    %[Temp5],   %[Temp5],   %[Temp6]    \n\t"
+        PTR_ADDIU "%[z2],   %[z2],      -20         \n\t"
+        "sw     %[Temp1],   32(%[z1])               \n\t"
+        "sw     %[Temp2],   24(%[z1])               \n\t"
+        "sw     %[Temp3],   16(%[z1])               \n\t"
+        "sw     %[Temp4],   8(%[z1])                \n\t"
+        "sw     %[Temp5],   0(%[z1])                \n\t"
+        "lw     %[Temp1],   0(%[z3])                \n\t"
+        "lw     %[Temp2],   4(%[z3])                \n\t"
+        "lw     %[Temp3],   8(%[z3])                \n\t"
+        "lw     %[Temp4],   12(%[z3])               \n\t"
+        "lw     %[Temp5],   16(%[z3])               \n\t"
+        "sw     %[Temp1],   4(%[z1])                \n\t"
+        "sw     %[Temp2],   12(%[z1])               \n\t"
+        "sw     %[Temp3],   20(%[z1])               \n\t"
+        "sw     %[Temp4],   28(%[z1])               \n\t"
+        "sw     %[Temp5],   36(%[z1])               \n\t"
+        PTR_ADDIU "%[z3],   %[z3],      20          \n\t"
+        PTR_ADDIU "%[z1],   %[z1],      40          \n\t"
+        "bne    %[z1],      %[z4],      1b          \n\t"
+        "lw     %[Temp1],   132(%[z])               \n\t"
+        "lw     %[Temp2],   128(%[z])               \n\t"
+        "lw     %[Temp3],   0(%[z])                 \n\t"
+        "lw     %[Temp4],   4(%[z])                 \n\t"
+        "xor    %[Temp1],   %[Temp1],   %[Temp6]    \n\t"
+        "sw     %[Temp1],   504(%[z])               \n\t"
+        "sw     %[Temp2],   508(%[z])               \n\t"
+        "sw     %[Temp3],   256(%[z])               \n\t"
+        "sw     %[Temp4],   260(%[z])               \n\t"
+
+        : [Temp1]"=&r"(Temp1), [Temp2]"=&r"(Temp2),
+          [Temp3]"=&r"(Temp3), [Temp4]"=&r"(Temp4),
+          [Temp5]"=&r"(Temp5), [Temp6]"=&r"(Temp6),
+          [z1]"+r"(z1), [z2]"+r"(z2), [z3]"+r"(z3)
+        : [z4]"r"(z4), [z]"r"(z)
+        : "memory"
+    );
+}
+
+static void sbr_qmf_post_shuffle_mips(float W[32][2], const float *z)
+{
+    int Temp1, Temp2, Temp3, Temp4, Temp5;
+    float *W_ptr = (float *)W;
+    float *z1    = (float *)z;
+    float *z2    = (float *)&z[60];
+    float *z_end = z1 + 32;
+
+     /* loop unrolled 4 times */
+    __asm__ volatile (
+        "lui    %[Temp5],   0x8000                  \n\t"
+    "1:                                             \n\t"
+        "lw     %[Temp1],   0(%[z2])                \n\t"
+        "lw     %[Temp2],   4(%[z2])                \n\t"
+        "lw     %[Temp3],   8(%[z2])                \n\t"
+        "lw     %[Temp4],   12(%[z2])               \n\t"
+        "xor    %[Temp1],   %[Temp1],   %[Temp5]    \n\t"
+        "xor    %[Temp2],   %[Temp2],   %[Temp5]    \n\t"
+        "xor    %[Temp3],   %[Temp3],   %[Temp5]    \n\t"
+        "xor    %[Temp4],   %[Temp4],   %[Temp5]    \n\t"
+        PTR_ADDIU "%[z2],   %[z2],      -16         \n\t"
+        "sw     %[Temp1],   24(%[W_ptr])            \n\t"
+        "sw     %[Temp2],   16(%[W_ptr])            \n\t"
+        "sw     %[Temp3],   8(%[W_ptr])             \n\t"
+        "sw     %[Temp4],   0(%[W_ptr])             \n\t"
+        "lw     %[Temp1],   0(%[z1])                \n\t"
+        "lw     %[Temp2],   4(%[z1])                \n\t"
+        "lw     %[Temp3],   8(%[z1])                \n\t"
+        "lw     %[Temp4],   12(%[z1])               \n\t"
+        "sw     %[Temp1],   4(%[W_ptr])             \n\t"
+        "sw     %[Temp2],   12(%[W_ptr])            \n\t"
+        "sw     %[Temp3],   20(%[W_ptr])            \n\t"
+        "sw     %[Temp4],   28(%[W_ptr])            \n\t"
+        PTR_ADDIU "%[z1],   %[z1],      16          \n\t"
+        PTR_ADDIU "%[W_ptr],%[W_ptr],   32          \n\t"
+        "bne    %[z1],      %[z_end],   1b          \n\t"
+
+        : [Temp1]"=&r"(Temp1), [Temp2]"=&r"(Temp2),
+          [Temp3]"=&r"(Temp3), [Temp4]"=&r"(Temp4),
+          [Temp5]"=&r"(Temp5), [z1]"+r"(z1),
+          [z2]"+r"(z2), [W_ptr]"+r"(W_ptr)
+        : [z_end]"r"(z_end)
+        : "memory"
+    );
+}
+
+#if HAVE_MIPSFPU
+static void sbr_sum64x5_mips(float *z)
+{
+    int k;
+    float *z1;
+    float f1, f2, f3, f4, f5, f6, f7, f8;
+    for (k = 0; k < 64; k += 8) {
+
+        z1 = &z[k];
+
+         /* loop unrolled 8 times */
+        __asm__ volatile (
+            "lwc1   $f0,    0(%[z1])        \n\t"
+            "lwc1   $f1,    256(%[z1])      \n\t"
+            "lwc1   $f2,    4(%[z1])        \n\t"
+            "lwc1   $f3,    260(%[z1])      \n\t"
+            "lwc1   $f4,    8(%[z1])        \n\t"
+            "add.s  %[f1],  $f0,    $f1     \n\t"
+            "lwc1   $f5,    264(%[z1])      \n\t"
+            "add.s  %[f2],  $f2,    $f3     \n\t"
+            "lwc1   $f6,    12(%[z1])       \n\t"
+            "lwc1   $f7,    268(%[z1])      \n\t"
+            "add.s  %[f3],  $f4,    $f5     \n\t"
+            "lwc1   $f8,    16(%[z1])       \n\t"
+            "lwc1   $f9,    272(%[z1])      \n\t"
+            "add.s  %[f4],  $f6,    $f7     \n\t"
+            "lwc1   $f10,   20(%[z1])       \n\t"
+            "lwc1   $f11,   276(%[z1])      \n\t"
+            "add.s  %[f5],  $f8,    $f9     \n\t"
+            "lwc1   $f12,   24(%[z1])       \n\t"
+            "lwc1   $f13,   280(%[z1])      \n\t"
+            "add.s  %[f6],  $f10,   $f11    \n\t"
+            "lwc1   $f14,   28(%[z1])       \n\t"
+            "lwc1   $f15,   284(%[z1])      \n\t"
+            "add.s  %[f7],  $f12,   $f13    \n\t"
+            "lwc1   $f0,    512(%[z1])      \n\t"
+            "lwc1   $f1,    516(%[z1])      \n\t"
+            "add.s  %[f8],  $f14,   $f15    \n\t"
+            "lwc1   $f2,    520(%[z1])      \n\t"
+            "add.s  %[f1],  %[f1],  $f0     \n\t"
+            "add.s  %[f2],  %[f2],  $f1     \n\t"
+            "lwc1   $f3,    524(%[z1])      \n\t"
+            "add.s  %[f3],  %[f3],  $f2     \n\t"
+            "lwc1   $f4,    528(%[z1])      \n\t"
+            "lwc1   $f5,    532(%[z1])      \n\t"
+            "add.s  %[f4],  %[f4],  $f3     \n\t"
+            "lwc1   $f6,    536(%[z1])      \n\t"
+            "add.s  %[f5],  %[f5],  $f4     \n\t"
+            "add.s  %[f6],  %[f6],  $f5     \n\t"
+            "lwc1   $f7,    540(%[z1])      \n\t"
+            "add.s  %[f7],  %[f7],  $f6     \n\t"
+            "lwc1   $f0,    768(%[z1])      \n\t"
+            "lwc1   $f1,    772(%[z1])      \n\t"
+            "add.s  %[f8],  %[f8],  $f7     \n\t"
+            "lwc1   $f2,    776(%[z1])      \n\t"
+            "add.s  %[f1],  %[f1],  $f0     \n\t"
+            "add.s  %[f2],  %[f2],  $f1     \n\t"
+            "lwc1   $f3,    780(%[z1])      \n\t"
+            "add.s  %[f3],  %[f3],  $f2     \n\t"
+            "lwc1   $f4,    784(%[z1])      \n\t"
+            "lwc1   $f5,    788(%[z1])      \n\t"
+            "add.s  %[f4],  %[f4],  $f3     \n\t"
+            "lwc1   $f6,    792(%[z1])      \n\t"
+            "add.s  %[f5],  %[f5],  $f4     \n\t"
+            "add.s  %[f6],  %[f6],  $f5     \n\t"
+            "lwc1   $f7,    796(%[z1])      \n\t"
+            "add.s  %[f7],  %[f7],  $f6     \n\t"
+            "lwc1   $f0,    1024(%[z1])     \n\t"
+            "lwc1   $f1,    1028(%[z1])     \n\t"
+            "add.s  %[f8],  %[f8],  $f7     \n\t"
+            "lwc1   $f2,    1032(%[z1])     \n\t"
+            "add.s  %[f1],  %[f1],  $f0     \n\t"
+            "add.s  %[f2],  %[f2],  $f1     \n\t"
+            "lwc1   $f3,    1036(%[z1])     \n\t"
+            "add.s  %[f3],  %[f3],  $f2     \n\t"
+            "lwc1   $f4,    1040(%[z1])     \n\t"
+            "lwc1   $f5,    1044(%[z1])     \n\t"
+            "add.s  %[f4],  %[f4],  $f3     \n\t"
+            "lwc1   $f6,    1048(%[z1])     \n\t"
+            "add.s  %[f5],  %[f5],  $f4     \n\t"
+            "add.s  %[f6],  %[f6],  $f5     \n\t"
+            "lwc1   $f7,    1052(%[z1])     \n\t"
+            "add.s  %[f7],  %[f7],  $f6     \n\t"
+            "swc1   %[f1],  0(%[z1])        \n\t"
+            "swc1   %[f2],  4(%[z1])        \n\t"
+            "add.s  %[f8],  %[f8],  $f7     \n\t"
+            "swc1   %[f3],  8(%[z1])        \n\t"
+            "swc1   %[f4],  12(%[z1])       \n\t"
+            "swc1   %[f5],  16(%[z1])       \n\t"
+            "swc1   %[f6],  20(%[z1])       \n\t"
+            "swc1   %[f7],  24(%[z1])       \n\t"
+            "swc1   %[f8],  28(%[z1])       \n\t"
+
+            : [f1]"=&f"(f1), [f2]"=&f"(f2), [f3]"=&f"(f3),
+              [f4]"=&f"(f4), [f5]"=&f"(f5), [f6]"=&f"(f6),
+              [f7]"=&f"(f7), [f8]"=&f"(f8)
+            : [z1]"r"(z1)
+            : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5",
+              "$f6", "$f7", "$f8", "$f9", "$f10", "$f11",
+              "$f12", "$f13", "$f14", "$f15",
+              "memory"
+        );
+    }
+}
+
+static float sbr_sum_square_mips(float (*x)[2], int n)
+{
+    float sum0 = 0.0f, sum1 = 0.0f;
+    float *p_x;
+    float temp0, temp1, temp2, temp3;
+    float *loop_end;
+    p_x = &x[0][0];
+    loop_end = p_x + (n >> 1)*4 - 4;
+
+    __asm__ volatile (
+        ".set      push                                             \n\t"
+        ".set      noreorder                                        \n\t"
+        "lwc1      %[temp0],   0(%[p_x])                            \n\t"
+        "lwc1      %[temp1],   4(%[p_x])                            \n\t"
+        "lwc1      %[temp2],   8(%[p_x])                            \n\t"
+        "lwc1      %[temp3],   12(%[p_x])                           \n\t"
+    "1:                                                             \n\t"
+        PTR_ADDIU "%[p_x],     %[p_x],       16                     \n\t"
+        "madd.s    %[sum0],    %[sum0],      %[temp0],   %[temp0]   \n\t"
+        "lwc1      %[temp0],   0(%[p_x])                            \n\t"
+        "madd.s    %[sum1],    %[sum1],      %[temp1],   %[temp1]   \n\t"
+        "lwc1      %[temp1],   4(%[p_x])                            \n\t"
+        "madd.s    %[sum0],    %[sum0],      %[temp2],   %[temp2]   \n\t"
+        "lwc1      %[temp2],   8(%[p_x])                            \n\t"
+        "madd.s    %[sum1],    %[sum1],      %[temp3],   %[temp3]   \n\t"
+        "bne       %[p_x],     %[loop_end],  1b                     \n\t"
+        " lwc1     %[temp3],   12(%[p_x])                           \n\t"
+        "madd.s    %[sum0],    %[sum0],      %[temp0],   %[temp0]   \n\t"
+        "madd.s    %[sum1],    %[sum1],      %[temp1],   %[temp1]   \n\t"
+        "madd.s    %[sum0],    %[sum0],      %[temp2],   %[temp2]   \n\t"
+        "madd.s    %[sum1],    %[sum1],      %[temp3],   %[temp3]   \n\t"
+        ".set      pop                                              \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+          [temp3]"=&f"(temp3), [sum0]"+f"(sum0), [sum1]"+f"(sum1),
+          [p_x]"+r"(p_x)
+        : [loop_end]"r"(loop_end)
+        : "memory"
+    );
+    return sum0 + sum1;
+}
+
+static void sbr_qmf_deint_bfly_mips(float *v, const float *src0, const float *src1)
+{
+    int i;
+    float temp0, temp1, temp2, temp3, temp4, temp5;
+    float temp6, temp7, temp8, temp9, temp10, temp11;
+    float *v0 = v;
+    float *v1 = &v[127];
+    float *psrc0 = (float*)src0;
+    float *psrc1 = (float*)&src1[63];
+
+    for (i = 0; i < 4; i++) {
+
+         /* loop unrolled 16 times */
+        __asm__ volatile(
+            "lwc1       %[temp0],   0(%[src0])             \n\t"
+            "lwc1       %[temp1],   0(%[src1])             \n\t"
+            "lwc1       %[temp3],   4(%[src0])             \n\t"
+            "lwc1       %[temp4],   -4(%[src1])            \n\t"
+            "lwc1       %[temp6],   8(%[src0])             \n\t"
+            "lwc1       %[temp7],   -8(%[src1])            \n\t"
+            "lwc1       %[temp9],   12(%[src0])            \n\t"
+            "lwc1       %[temp10],  -12(%[src1])           \n\t"
+            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
+            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
+            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
+            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
+            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
+            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
+            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
+            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
+            "swc1       %[temp2],   0(%[v1])               \n\t"
+            "swc1       %[temp0],   0(%[v0])               \n\t"
+            "swc1       %[temp5],   -4(%[v1])              \n\t"
+            "swc1       %[temp3],   4(%[v0])               \n\t"
+            "swc1       %[temp8],   -8(%[v1])              \n\t"
+            "swc1       %[temp6],   8(%[v0])               \n\t"
+            "swc1       %[temp11],  -12(%[v1])             \n\t"
+            "swc1       %[temp9],   12(%[v0])              \n\t"
+            "lwc1       %[temp0],   16(%[src0])            \n\t"
+            "lwc1       %[temp1],   -16(%[src1])           \n\t"
+            "lwc1       %[temp3],   20(%[src0])            \n\t"
+            "lwc1       %[temp4],   -20(%[src1])           \n\t"
+            "lwc1       %[temp6],   24(%[src0])            \n\t"
+            "lwc1       %[temp7],   -24(%[src1])           \n\t"
+            "lwc1       %[temp9],   28(%[src0])            \n\t"
+            "lwc1       %[temp10],  -28(%[src1])           \n\t"
+            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
+            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
+            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
+            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
+            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
+            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
+            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
+            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
+            "swc1       %[temp2],   -16(%[v1])             \n\t"
+            "swc1       %[temp0],   16(%[v0])              \n\t"
+            "swc1       %[temp5],   -20(%[v1])             \n\t"
+            "swc1       %[temp3],   20(%[v0])              \n\t"
+            "swc1       %[temp8],   -24(%[v1])             \n\t"
+            "swc1       %[temp6],   24(%[v0])              \n\t"
+            "swc1       %[temp11],  -28(%[v1])             \n\t"
+            "swc1       %[temp9],   28(%[v0])              \n\t"
+            "lwc1       %[temp0],   32(%[src0])            \n\t"
+            "lwc1       %[temp1],   -32(%[src1])           \n\t"
+            "lwc1       %[temp3],   36(%[src0])            \n\t"
+            "lwc1       %[temp4],   -36(%[src1])           \n\t"
+            "lwc1       %[temp6],   40(%[src0])            \n\t"
+            "lwc1       %[temp7],   -40(%[src1])           \n\t"
+            "lwc1       %[temp9],   44(%[src0])            \n\t"
+            "lwc1       %[temp10],  -44(%[src1])           \n\t"
+            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
+            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
+            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
+            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
+            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
+            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
+            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
+            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
+            "swc1       %[temp2],   -32(%[v1])             \n\t"
+            "swc1       %[temp0],   32(%[v0])              \n\t"
+            "swc1       %[temp5],   -36(%[v1])             \n\t"
+            "swc1       %[temp3],   36(%[v0])              \n\t"
+            "swc1       %[temp8],   -40(%[v1])             \n\t"
+            "swc1       %[temp6],   40(%[v0])              \n\t"
+            "swc1       %[temp11],  -44(%[v1])             \n\t"
+            "swc1       %[temp9],   44(%[v0])              \n\t"
+            "lwc1       %[temp0],   48(%[src0])            \n\t"
+            "lwc1       %[temp1],   -48(%[src1])           \n\t"
+            "lwc1       %[temp3],   52(%[src0])            \n\t"
+            "lwc1       %[temp4],   -52(%[src1])           \n\t"
+            "lwc1       %[temp6],   56(%[src0])            \n\t"
+            "lwc1       %[temp7],   -56(%[src1])           \n\t"
+            "lwc1       %[temp9],   60(%[src0])            \n\t"
+            "lwc1       %[temp10],  -60(%[src1])           \n\t"
+            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
+            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
+            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
+            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
+            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
+            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
+            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
+            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
+            "swc1       %[temp2],   -48(%[v1])             \n\t"
+            "swc1       %[temp0],   48(%[v0])              \n\t"
+            "swc1       %[temp5],   -52(%[v1])             \n\t"
+            "swc1       %[temp3],   52(%[v0])              \n\t"
+            "swc1       %[temp8],   -56(%[v1])             \n\t"
+            "swc1       %[temp6],   56(%[v0])              \n\t"
+            "swc1       %[temp11],  -60(%[v1])             \n\t"
+            "swc1       %[temp9],   60(%[v0])              \n\t"
+            PTR_ADDIU " %[src0],    %[src0],    64         \n\t"
+            PTR_ADDIU " %[src1],    %[src1],    -64        \n\t"
+            PTR_ADDIU " %[v0],      %[v0],      64         \n\t"
+            PTR_ADDIU " %[v1],      %[v1],      -64        \n\t"
+
+            : [v0]"+r"(v0), [v1]"+r"(v1), [src0]"+r"(psrc0), [src1]"+r"(psrc1),
+              [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11)
+            :
+            :"memory"
+        );
+    }
+}
+
+static void sbr_autocorrelate_mips(const float x[40][2], float phi[3][2][2])
+{
+    int i;
+    float real_sum_0 = 0.0f;
+    float real_sum_1 = 0.0f;
+    float real_sum_2 = 0.0f;
+    float imag_sum_1 = 0.0f;
+    float imag_sum_2 = 0.0f;
+    float *p_x, *p_phi;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+    float temp7, temp_r, temp_r1, temp_r2, temp_r3, temp_r4;
+    p_x = (float*)&x[0][0];
+    p_phi = &phi[0][0][0];
+
+    __asm__ volatile (
+        "lwc1    %[temp0],      8(%[p_x])                           \n\t"
+        "lwc1    %[temp1],      12(%[p_x])                          \n\t"
+        "lwc1    %[temp2],      16(%[p_x])                          \n\t"
+        "lwc1    %[temp3],      20(%[p_x])                          \n\t"
+        "lwc1    %[temp4],      24(%[p_x])                          \n\t"
+        "lwc1    %[temp5],      28(%[p_x])                          \n\t"
+        "mul.s   %[temp_r],     %[temp1],      %[temp1]             \n\t"
+        "mul.s   %[temp_r1],    %[temp1],      %[temp3]             \n\t"
+        "mul.s   %[temp_r2],    %[temp1],      %[temp2]             \n\t"
+        "mul.s   %[temp_r3],    %[temp1],      %[temp5]             \n\t"
+        "mul.s   %[temp_r4],    %[temp1],      %[temp4]             \n\t"
+        "madd.s  %[temp_r],     %[temp_r],     %[temp0],  %[temp0]  \n\t"
+        "madd.s  %[temp_r1],    %[temp_r1],    %[temp0],  %[temp2]  \n\t"
+        "msub.s  %[temp_r2],    %[temp_r2],    %[temp0],  %[temp3]  \n\t"
+        "madd.s  %[temp_r3],    %[temp_r3],    %[temp0],  %[temp4]  \n\t"
+        "msub.s  %[temp_r4],    %[temp_r4],    %[temp0],  %[temp5]  \n\t"
+        "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
+        "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
+        "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
+        "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
+        "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
+        PTR_ADDIU "%[p_x],      %[p_x],        8                    \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+          [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1),
+          [imag_sum_1]"+f"(imag_sum_1), [real_sum_2]"+f"(real_sum_2),
+          [temp_r]"=&f"(temp_r), [temp_r1]"=&f"(temp_r1), [temp_r2]"=&f"(temp_r2),
+          [temp_r3]"=&f"(temp_r3), [temp_r4]"=&f"(temp_r4),
+          [p_x]"+r"(p_x), [imag_sum_2]"+f"(imag_sum_2)
+        :
+        : "memory"
+    );
+
+    for (i = 0; i < 12; i++) {
+        __asm__ volatile (
+            "lwc1    %[temp0],      8(%[p_x])                           \n\t"
+            "lwc1    %[temp1],      12(%[p_x])                          \n\t"
+            "lwc1    %[temp2],      16(%[p_x])                          \n\t"
+            "lwc1    %[temp3],      20(%[p_x])                          \n\t"
+            "lwc1    %[temp4],      24(%[p_x])                          \n\t"
+            "lwc1    %[temp5],      28(%[p_x])                          \n\t"
+            "mul.s   %[temp_r],     %[temp1],      %[temp1]             \n\t"
+            "mul.s   %[temp_r1],    %[temp1],      %[temp3]             \n\t"
+            "mul.s   %[temp_r2],    %[temp1],      %[temp2]             \n\t"
+            "mul.s   %[temp_r3],    %[temp1],      %[temp5]             \n\t"
+            "mul.s   %[temp_r4],    %[temp1],      %[temp4]             \n\t"
+            "madd.s  %[temp_r],     %[temp_r],     %[temp0],  %[temp0]  \n\t"
+            "madd.s  %[temp_r1],    %[temp_r1],    %[temp0],  %[temp2]  \n\t"
+            "msub.s  %[temp_r2],    %[temp_r2],    %[temp0],  %[temp3]  \n\t"
+            "madd.s  %[temp_r3],    %[temp_r3],    %[temp0],  %[temp4]  \n\t"
+            "msub.s  %[temp_r4],    %[temp_r4],    %[temp0],  %[temp5]  \n\t"
+            "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
+            "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
+            "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
+            "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
+            "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
+            "lwc1    %[temp0],      32(%[p_x])                          \n\t"
+            "lwc1    %[temp1],      36(%[p_x])                          \n\t"
+            "mul.s   %[temp_r],     %[temp3],      %[temp3]             \n\t"
+            "mul.s   %[temp_r1],    %[temp3],      %[temp5]             \n\t"
+            "mul.s   %[temp_r2],    %[temp3],      %[temp4]             \n\t"
+            "mul.s   %[temp_r3],    %[temp3],      %[temp1]             \n\t"
+            "mul.s   %[temp_r4],    %[temp3],      %[temp0]             \n\t"
+            "madd.s  %[temp_r],     %[temp_r],     %[temp2],  %[temp2]  \n\t"
+            "madd.s  %[temp_r1],    %[temp_r1],    %[temp2],  %[temp4]  \n\t"
+            "msub.s  %[temp_r2],    %[temp_r2],    %[temp2],  %[temp5]  \n\t"
+            "madd.s  %[temp_r3],    %[temp_r3],    %[temp2],  %[temp0]  \n\t"
+            "msub.s  %[temp_r4],    %[temp_r4],    %[temp2],  %[temp1]  \n\t"
+            "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
+            "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
+            "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
+            "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
+            "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
+            "lwc1    %[temp2],      40(%[p_x])                          \n\t"
+            "lwc1    %[temp3],      44(%[p_x])                          \n\t"
+            "mul.s   %[temp_r],     %[temp5],      %[temp5]             \n\t"
+            "mul.s   %[temp_r1],    %[temp5],      %[temp1]             \n\t"
+            "mul.s   %[temp_r2],    %[temp5],      %[temp0]             \n\t"
+            "mul.s   %[temp_r3],    %[temp5],      %[temp3]             \n\t"
+            "mul.s   %[temp_r4],    %[temp5],      %[temp2]             \n\t"
+            "madd.s  %[temp_r],     %[temp_r],     %[temp4],  %[temp4]  \n\t"
+            "madd.s  %[temp_r1],    %[temp_r1],    %[temp4],  %[temp0]  \n\t"
+            "msub.s  %[temp_r2],    %[temp_r2],    %[temp4],  %[temp1]  \n\t"
+            "madd.s  %[temp_r3],    %[temp_r3],    %[temp4],  %[temp2]  \n\t"
+            "msub.s  %[temp_r4],    %[temp_r4],    %[temp4],  %[temp3]  \n\t"
+            "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
+            "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
+            "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
+            "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
+            "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
+            PTR_ADDIU "%[p_x],      %[p_x],        24                   \n\t"
+
+            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1),
+              [imag_sum_1]"+f"(imag_sum_1), [real_sum_2]"+f"(real_sum_2),
+              [temp_r]"=&f"(temp_r), [temp_r1]"=&f"(temp_r1),
+              [temp_r2]"=&f"(temp_r2), [temp_r3]"=&f"(temp_r3),
+              [temp_r4]"=&f"(temp_r4), [p_x]"+r"(p_x),
+              [imag_sum_2]"+f"(imag_sum_2)
+            :
+            : "memory"
+        );
+    }
+    __asm__ volatile (
+        "lwc1    %[temp0],    -296(%[p_x])                        \n\t"
+        "lwc1    %[temp1],    -292(%[p_x])                        \n\t"
+        "lwc1    %[temp2],    8(%[p_x])                           \n\t"
+        "lwc1    %[temp3],    12(%[p_x])                          \n\t"
+        "lwc1    %[temp4],    -288(%[p_x])                        \n\t"
+        "lwc1    %[temp5],    -284(%[p_x])                        \n\t"
+        "lwc1    %[temp6],    -280(%[p_x])                        \n\t"
+        "lwc1    %[temp7],    -276(%[p_x])                        \n\t"
+        "madd.s  %[temp_r],   %[real_sum_0], %[temp0],  %[temp0]  \n\t"
+        "madd.s  %[temp_r1],  %[real_sum_0], %[temp2],  %[temp2]  \n\t"
+        "madd.s  %[temp_r2],  %[real_sum_1], %[temp0],  %[temp4]  \n\t"
+        "madd.s  %[temp_r3],  %[imag_sum_1], %[temp0],  %[temp5]  \n\t"
+        "madd.s  %[temp_r],   %[temp_r],     %[temp1],  %[temp1]  \n\t"
+        "madd.s  %[temp_r1],  %[temp_r1],    %[temp3],  %[temp3]  \n\t"
+        "madd.s  %[temp_r2],  %[temp_r2],    %[temp1],  %[temp5]  \n\t"
+        "nmsub.s  %[temp_r3], %[temp_r3],    %[temp1],  %[temp4]  \n\t"
+        "lwc1    %[temp4],    16(%[p_x])                          \n\t"
+        "lwc1    %[temp5],    20(%[p_x])                          \n\t"
+        "swc1    %[temp_r],   40(%[p_phi])                        \n\t"
+        "swc1    %[temp_r1],  16(%[p_phi])                        \n\t"
+        "swc1    %[temp_r2],  24(%[p_phi])                        \n\t"
+        "swc1    %[temp_r3],  28(%[p_phi])                        \n\t"
+        "madd.s  %[temp_r],   %[real_sum_1], %[temp2],  %[temp4]  \n\t"
+        "madd.s  %[temp_r1],  %[imag_sum_1], %[temp2],  %[temp5]  \n\t"
+        "madd.s  %[temp_r2],  %[real_sum_2], %[temp0],  %[temp6]  \n\t"
+        "madd.s  %[temp_r3],  %[imag_sum_2], %[temp0],  %[temp7]  \n\t"
+        "madd.s  %[temp_r],   %[temp_r],     %[temp3],  %[temp5]  \n\t"
+        "nmsub.s %[temp_r1],  %[temp_r1],    %[temp3],  %[temp4]  \n\t"
+        "madd.s  %[temp_r2],  %[temp_r2],    %[temp1],  %[temp7]  \n\t"
+        "nmsub.s %[temp_r3],  %[temp_r3],    %[temp1],  %[temp6]  \n\t"
+        "swc1    %[temp_r],   0(%[p_phi])                         \n\t"
+        "swc1    %[temp_r1],  4(%[p_phi])                         \n\t"
+        "swc1    %[temp_r2],  8(%[p_phi])                         \n\t"
+        "swc1    %[temp_r3],  12(%[p_phi])                        \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+          [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp_r]"=&f"(temp_r),
+          [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1),
+          [real_sum_2]"+f"(real_sum_2), [imag_sum_1]"+f"(imag_sum_1),
+          [temp_r2]"=&f"(temp_r2), [temp_r3]"=&f"(temp_r3),
+          [temp_r1]"=&f"(temp_r1), [p_phi]"+r"(p_phi),
+          [imag_sum_2]"+f"(imag_sum_2)
+        : [p_x]"r"(p_x)
+        : "memory"
+    );
+}
+
+static void sbr_hf_gen_mips(float (*X_high)[2], const float (*X_low)[2],
+                         const float alpha0[2], const float alpha1[2],
+                         float bw, int start, int end)
+{
+    float alpha[4];
+    int i;
+    float *p_x_low = (float*)&X_low[0][0] + 2*start;
+    float *p_x_high = &X_high[0][0] + 2*start;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+    float temp7, temp8, temp9, temp10, temp11, temp12;
+
+    alpha[0] = alpha1[0] * bw * bw;
+    alpha[1] = alpha1[1] * bw * bw;
+    alpha[2] = alpha0[0] * bw;
+    alpha[3] = alpha0[1] * bw;
+
+    for (i = start; i < end; i++) {
+        __asm__ volatile (
+            "lwc1    %[temp0],    -16(%[p_x_low])                        \n\t"
+            "lwc1    %[temp1],    -12(%[p_x_low])                        \n\t"
+            "lwc1    %[temp2],    -8(%[p_x_low])                         \n\t"
+            "lwc1    %[temp3],    -4(%[p_x_low])                         \n\t"
+            "lwc1    %[temp5],    0(%[p_x_low])                          \n\t"
+            "lwc1    %[temp6],    4(%[p_x_low])                          \n\t"
+            "lwc1    %[temp7],    0(%[alpha])                            \n\t"
+            "lwc1    %[temp8],    4(%[alpha])                            \n\t"
+            "lwc1    %[temp9],    8(%[alpha])                            \n\t"
+            "lwc1    %[temp10],   12(%[alpha])                           \n\t"
+            PTR_ADDIU "%[p_x_high], %[p_x_high],   8                     \n\t"
+            PTR_ADDIU "%[p_x_low],  %[p_x_low],    8                     \n\t"
+            "mul.s   %[temp11],   %[temp1],        %[temp8]              \n\t"
+            "msub.s  %[temp11],   %[temp11],       %[temp0],  %[temp7]   \n\t"
+            "madd.s  %[temp11],   %[temp11],       %[temp2],  %[temp9]   \n\t"
+            "nmsub.s %[temp11],   %[temp11],       %[temp3],  %[temp10]  \n\t"
+            "add.s   %[temp11],   %[temp11],       %[temp5]              \n\t"
+            "swc1    %[temp11],   -8(%[p_x_high])                        \n\t"
+            "mul.s   %[temp12],   %[temp1],        %[temp7]              \n\t"
+            "madd.s  %[temp12],   %[temp12],       %[temp0],  %[temp8]   \n\t"
+            "madd.s  %[temp12],   %[temp12],       %[temp3],  %[temp9]   \n\t"
+            "madd.s  %[temp12],   %[temp12],       %[temp2],  %[temp10]  \n\t"
+            "add.s   %[temp12],   %[temp12],       %[temp6]              \n\t"
+            "swc1    %[temp12],   -4(%[p_x_high])                        \n\t"
+
+            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
+              [temp12]"=&f"(temp12), [p_x_high]"+r"(p_x_high),
+              [p_x_low]"+r"(p_x_low)
+            : [alpha]"r"(alpha)
+            : "memory"
+        );
+    }
+}
+
+static void sbr_hf_g_filt_mips(float (*Y)[2], const float (*X_high)[40][2],
+                            const float *g_filt, int m_max, intptr_t ixh)
+{
+    const float *p_x, *p_g, *loop_end;
+    float *p_y;
+    float temp0, temp1, temp2;
+
+    p_g = &g_filt[0];
+    p_y = &Y[0][0];
+    p_x = &X_high[0][ixh][0];
+    loop_end = p_g + m_max;
+
+    __asm__ volatile(
+        ".set    push                                \n\t"
+        ".set    noreorder                           \n\t"
+    "1:                                              \n\t"
+        "lwc1    %[temp0],   0(%[p_g])               \n\t"
+        "lwc1    %[temp1],   0(%[p_x])               \n\t"
+        "lwc1    %[temp2],   4(%[p_x])               \n\t"
+        "mul.s   %[temp1],   %[temp1],     %[temp0]  \n\t"
+        "mul.s   %[temp2],   %[temp2],     %[temp0]  \n\t"
+        PTR_ADDIU "%[p_g],   %[p_g],       4         \n\t"
+        PTR_ADDIU "%[p_x],   %[p_x],       320       \n\t"
+        "swc1    %[temp1],   0(%[p_y])               \n\t"
+        "swc1    %[temp2],   4(%[p_y])               \n\t"
+        "bne     %[p_g],     %[loop_end],  1b        \n\t"
+        PTR_ADDIU "%[p_y],   %[p_y],       8         \n\t"
+        ".set    pop                                 \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [p_x]"+r"(p_x),
+          [p_y]"+r"(p_y), [p_g]"+r"(p_g)
+        : [loop_end]"r"(loop_end)
+        : "memory"
+    );
+}
+
+static void sbr_hf_apply_noise_0_mips(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    int m;
+
+    for (m = 0; m < m_max; m++){
+
+        float *Y1=&Y[m][0];
+        float *ff_table;
+        float y0,y1, temp1, temp2, temp4, temp5;
+        int temp0, temp3;
+        const float *s_m1=&s_m[m];
+        const float *q_filt1= &q_filt[m];
+
+        __asm__ volatile(
+            "lwc1    %[y0],       0(%[Y1])                                    \n\t"
+            "lwc1    %[temp1],    0(%[s_m1])                                  \n\t"
+            "addiu   %[noise],    %[noise],              1                    \n\t"
+            "andi    %[noise],    %[noise],              0x1ff                \n\t"
+            "sll     %[temp0],    %[noise], 3                                 \n\t"
+            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table], %[temp0]             \n\t"
+            "add.s   %[y0],       %[y0],                 %[temp1]             \n\t"
+            "mfc1    %[temp3],    %[temp1]                                    \n\t"
+            "bne     %[temp3],    $0,                    1f                   \n\t"
+            "lwc1    %[y1],       4(%[Y1])                                    \n\t"
+            "lwc1    %[temp2],    0(%[q_filt1])                               \n\t"
+            "lwc1    %[temp4],    0(%[ff_table])                              \n\t"
+            "lwc1    %[temp5],    4(%[ff_table])                              \n\t"
+            "madd.s  %[y0],       %[y0],                 %[temp2],  %[temp4]  \n\t"
+            "madd.s  %[y1],       %[y1],                 %[temp2],  %[temp5]  \n\t"
+            "swc1    %[y1],       4(%[Y1])                                    \n\t"
+        "1:                                                                   \n\t"
+            "swc1    %[y0],       0(%[Y1])                                    \n\t"
+
+            : [ff_table]"=&r"(ff_table), [y0]"=&f"(y0), [y1]"=&f"(y1),
+              [temp0]"=&r"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5)
+            : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [noise]"r"(noise),
+              [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1)
+            : "memory"
+        );
+    }
+}
+
+static void sbr_hf_apply_noise_1_mips(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    float y0,y1,temp1, temp2, temp4, temp5;
+    int temp0, temp3, m;
+    float phi_sign = 1 - 2 * (kx & 1);
+
+    for (m = 0; m < m_max; m++) {
+
+        float *ff_table;
+        float *Y1=&Y[m][0];
+        const float *s_m1=&s_m[m];
+        const float *q_filt1= &q_filt[m];
+
+        __asm__ volatile(
+            "lwc1   %[y1],       4(%[Y1])                                     \n\t"
+            "lwc1   %[temp1],    0(%[s_m1])                                   \n\t"
+            "lw     %[temp3],    0(%[s_m1])                                   \n\t"
+            "addiu  %[noise],    %[noise],               1                    \n\t"
+            "andi   %[noise],    %[noise],               0x1ff                \n\t"
+            "sll    %[temp0],    %[noise],               3                    \n\t"
+            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table],%[temp0]              \n\t"
+            "madd.s %[y1],       %[y1],                 %[temp1], %[phi_sign] \n\t"
+            "bne    %[temp3],    $0,                    1f                    \n\t"
+            "lwc1   %[y0],       0(%[Y1])                                     \n\t"
+            "lwc1   %[temp2],    0(%[q_filt1])                                \n\t"
+            "lwc1   %[temp4],    0(%[ff_table])                               \n\t"
+            "lwc1   %[temp5],    4(%[ff_table])                               \n\t"
+            "madd.s %[y0],       %[y0],                 %[temp2], %[temp4]    \n\t"
+            "madd.s %[y1],       %[y1],                 %[temp2], %[temp5]    \n\t"
+            "swc1   %[y0],       0(%[Y1])                                     \n\t"
+        "1:                                                                   \n\t"
+            "swc1   %[y1],       4(%[Y1])                                     \n\t"
+
+            : [ff_table] "=&r" (ff_table), [y0] "=&f" (y0), [y1] "=&f" (y1),
+              [temp0] "=&r" (temp0), [temp1] "=&f" (temp1), [temp2] "=&f" (temp2),
+              [temp3] "=&r" (temp3), [temp4] "=&f" (temp4), [temp5] "=&f" (temp5)
+            : [ff_sbr_noise_table] "r" (ff_sbr_noise_table), [noise] "r" (noise),
+              [Y1] "r" (Y1), [s_m1] "r" (s_m1), [q_filt1] "r" (q_filt1),
+              [phi_sign] "f" (phi_sign)
+            : "memory"
+        );
+        phi_sign = -phi_sign;
+    }
+}
+
+static void sbr_hf_apply_noise_2_mips(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    int m;
+    float *ff_table;
+    float y0,y1, temp0, temp1, temp2, temp3, temp4, temp5;
+
+    for (m = 0; m < m_max; m++) {
+
+        float *Y1=&Y[m][0];
+        const float *s_m1=&s_m[m];
+        const float *q_filt1= &q_filt[m];
+
+        __asm__ volatile(
+            "lwc1   %[y0],       0(%[Y1])                                  \n\t"
+            "lwc1   %[temp1],    0(%[s_m1])                                \n\t"
+            "addiu  %[noise],    %[noise],              1                  \n\t"
+            "andi   %[noise],    %[noise],              0x1ff              \n\t"
+            "sll    %[temp0],    %[noise],              3                  \n\t"
+            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table],%[temp0]           \n\t"
+            "sub.s  %[y0],       %[y0],                 %[temp1]           \n\t"
+            "mfc1   %[temp3],    %[temp1]                                  \n\t"
+            "bne    %[temp3],    $0,                    1f                 \n\t"
+            "lwc1   %[y1],       4(%[Y1])                                  \n\t"
+            "lwc1   %[temp2],    0(%[q_filt1])                             \n\t"
+            "lwc1   %[temp4],    0(%[ff_table])                            \n\t"
+            "lwc1   %[temp5],    4(%[ff_table])                            \n\t"
+            "madd.s %[y0],       %[y0],                 %[temp2], %[temp4] \n\t"
+            "madd.s %[y1],       %[y1],                 %[temp2], %[temp5] \n\t"
+            "swc1   %[y1],       4(%[Y1])                                  \n\t"
+        "1:                                                                \n\t"
+            "swc1   %[y0],       0(%[Y1])                                  \n\t"
+
+            : [temp0]"=&r"(temp0), [ff_table]"=&r"(ff_table), [y0]"=&f"(y0),
+              [y1]"=&f"(y1), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5)
+            : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [noise]"r"(noise),
+              [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1)
+            : "memory"
+        );
+    }
+}
+
+static void sbr_hf_apply_noise_3_mips(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    float phi_sign = 1 - 2 * (kx & 1);
+    int m;
+
+    for (m = 0; m < m_max; m++) {
+
+        float *Y1=&Y[m][0];
+        float *ff_table;
+        float y0,y1, temp1, temp2, temp4, temp5;
+        int temp0, temp3;
+        const float *s_m1=&s_m[m];
+        const float *q_filt1= &q_filt[m];
+
+        __asm__ volatile(
+            "lwc1    %[y1],       4(%[Y1])                                     \n\t"
+            "lwc1    %[temp1],    0(%[s_m1])                                   \n\t"
+            "addiu   %[noise],    %[noise],              1                     \n\t"
+            "andi    %[noise],    %[noise],              0x1ff                 \n\t"
+            "sll     %[temp0],    %[noise],              3                     \n\t"
+            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table], %[temp0]              \n\t"
+            "nmsub.s %[y1],       %[y1],                 %[temp1], %[phi_sign] \n\t"
+            "mfc1    %[temp3],    %[temp1]                                     \n\t"
+            "bne     %[temp3],    $0,                    1f                    \n\t"
+            "lwc1    %[y0],       0(%[Y1])                                     \n\t"
+            "lwc1    %[temp2],    0(%[q_filt1])                                \n\t"
+            "lwc1    %[temp4],    0(%[ff_table])                               \n\t"
+            "lwc1    %[temp5],    4(%[ff_table])                               \n\t"
+            "madd.s  %[y0],       %[y0],                 %[temp2], %[temp4]    \n\t"
+            "madd.s  %[y1],       %[y1],                 %[temp2], %[temp5]    \n\t"
+            "swc1    %[y0],       0(%[Y1])                                     \n\t"
+            "1:                                                                \n\t"
+            "swc1    %[y1],       4(%[Y1])                                     \n\t"
+
+            : [ff_table]"=&r"(ff_table), [y0]"=&f"(y0), [y1]"=&f"(y1),
+              [temp0]"=&r"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5)
+            : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [noise]"r"(noise),
+              [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1),
+              [phi_sign]"f"(phi_sign)
+            : "memory"
+        );
+       phi_sign = -phi_sign;
+    }
+}
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_sbrdsp_init_mips(SBRDSPContext *s)
+{
+#if HAVE_INLINE_ASM
+    s->qmf_pre_shuffle = sbr_qmf_pre_shuffle_mips;
+    s->qmf_post_shuffle = sbr_qmf_post_shuffle_mips;
+#if HAVE_MIPSFPU
+    s->sum64x5 = sbr_sum64x5_mips;
+    s->sum_square = sbr_sum_square_mips;
+    s->qmf_deint_bfly = sbr_qmf_deint_bfly_mips;
+    s->autocorrelate = sbr_autocorrelate_mips;
+    s->hf_gen = sbr_hf_gen_mips;
+    s->hf_g_filt = sbr_hf_g_filt_mips;
+
+    s->hf_apply_noise[0] = sbr_hf_apply_noise_0_mips;
+    s->hf_apply_noise[1] = sbr_hf_apply_noise_1_mips;
+    s->hf_apply_noise[2] = sbr_hf_apply_noise_2_mips;
+    s->hf_apply_noise[3] = sbr_hf_apply_noise_3_mips;
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/simple_idct_mmi.c b/libavcodec/mips/simple_idct_mmi.c
new file mode 100644
index 0000000000..628e13f7d2
--- /dev/null
+++ b/libavcodec/mips/simple_idct_mmi.c
@@ -0,0 +1,816 @@
+/*
+ * Loongson SIMD optimized simple idct
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_mips.h"
+#include "constants.h"
+
+#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
+#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C6 8867  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C7 4520  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
+    1<<(ROW_SHIFT-1),   0, 1<<(ROW_SHIFT-1),   0,
+    1<<(ROW_SHIFT-1),   1, 1<<(ROW_SHIFT-1),   0,
+                  C4,  C4,               C4,  C4,
+                  C4, -C4,               C4, -C4,
+                  C2,  C6,               C2,  C6,
+                  C6, -C2,               C6, -C2,
+                  C1,  C3,               C1,  C3,
+                  C5,  C7,               C5,  C7,
+                  C3, -C7,               C3, -C7,
+                 -C1, -C5,              -C1, -C5,
+                  C5, -C1,               C5, -C1,
+                  C7,  C3,               C7,  C3,
+                  C7, -C5,               C7, -C5,
+                  C3, -C1,               C3, -C1
+};
+
+void ff_simple_idct_mmi(int16_t *block)
+{
+        DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+        int16_t * const temp= (int16_t*)align_tmp;
+
+        __asm__ volatile (
+#undef  DC_COND_IDCT
+#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift)      \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
+        "ldc1 $f8, %3                   \n\t"                                \
+        "and  $f8, $f8, $f0             \n\t"                                \
+        "or $f8, $f8, $f2               \n\t"                                \
+        "or $f8, $f8, $f4               \n\t"                                \
+        "or $f8, $f8, $f6               \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t"                                \
+        "li $11, " #shift "             \n\t"                                \
+        "mfc1 $10, $f8                  \n\t"                                \
+        "mtc1 $11, $f18                 \n\t"                                \
+        "beqz $10, 1f                   \n\t"                                \
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
+        "ldc1 $f16, " #rarg "           \n\t"                                \
+        "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
+        #rounder " $f8, $f8, $f16       \n\t"                                \
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
+        "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
+        "ldc1 $f10, 56(%2)              \n\t" /* C7     C5      C7      C5 */\
+        "ldc1 $f16, " #rarg "           \n\t"                                \
+        "pmaddhw $f10, $f10, $f6        \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
+        #rounder " $f0, $f0, $f16       \n\t"                                \
+        "paddw $f2, $f2, $f0            \n\t" /* A1             a1         */\
+        "ldc1 $f16, 64(%2)              \n\t"                                \
+        "paddw $f0, $f0, $f0            \n\t"                                \
+        "psubw $f0, $f0, $f2            \n\t" /* A2             a2         */\
+        "pmaddhw $f4, $f4, $f16         \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+        "paddw $f14, $f14, $f10         \n\t" /* B0             b0         */\
+        "ldc1 $f10, 72(%2)              \n\t" /* -C5    -C1     -C5    -C1 */\
+        "pmaddhw $f10, $f10, $f6        \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+        "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
+        "paddw $f10, $f10, $f4          \n\t" /* B1             b1         */\
+        "psraw $f14, $f14, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "mov.d $f4, $f2                 \n\t" /* A1             a1         */\
+        "paddw $f2, $f2, $f10           \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f4, $f4, $f10           \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f2, $f2, $f18           \n\t"                                \
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "packsswh $f14, $f14, $f2       \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0   */\
+        "packsswh $f4, $f4, $f8         \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1   */\
+        "sdc1 $f14, " #dst "            \n\t"                                \
+        "ldc1 $f2, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
+        "sdc1 $f4, 24+" #dst "          \n\t"                                \
+        "pmaddhw $f8, $f8, $f2          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+        "ldc1 $f16, 96(%2)              \n\t"                                \
+        "ldc1 $f14, 88(%2)              \n\t" /* C3     C7      C3      C7 */\
+        "pmaddhw $f2, $f2, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+        "pmaddhw $f14, $f14, $f6        \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
+        "ldc1 $f16, 104(%2)             \n\t"                                \
+        "mov.d $f4, $f0                 \n\t" /* A2             a2         */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+        "paddw $f8, $f8, $f14           \n\t" /* B2             b2         */\
+        "paddw $f4, $f4, $f8            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f0, $f0, $f8            \n\t" /* a2-B2          a2-b2      */\
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "mov.d $f8, $f12                \n\t" /* A3             a3         */\
+        "paddw $f6, $f6, $f2            \n\t" /* B3             b3         */\
+        "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f8, $f8, $f6            \n\t" /* a3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t"                                \
+        "packsswh $f4, $f4, $f12        \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2   */\
+        "sdc1 $f4, 8+" #dst "           \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "packsswh $f8, $f8, $f0         \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3   */\
+        "sdc1 $f8, 16+" #dst "          \n\t"                                \
+        "b 2f                           \n\t"                                \
+        "1:                             \n\t"                                \
+        "li $10, 16                     \n\t"                                \
+        "mtc1 $10, $f16                 \n\t"                                \
+        "psllw $f0, $f0, $f16           \n\t"                                \
+        "ldc1 $f16, %4                  \n\t"                                \
+        "paddw $f0, $f0, $f16           \n\t"                                \
+        "li $10, 13                     \n\t"                                \
+        "mtc1 $10, $f16                 \n\t"                                \
+        "psraw $f0, $f0, $f16           \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t"                                \
+        "sdc1 $f0, " #dst "             \n\t"                                \
+        "sdc1 $f0, 8+" #dst "           \n\t"                                \
+        "sdc1 $f0, 16+" #dst "          \n\t"                                \
+        "sdc1 $f0, 24+" #dst "          \n\t"                                \
+        "2:                             \n\t"
+
+#undef  Z_COND_IDCT
+#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift, bt)   \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
+        "mov.d $f8, $f0                 \n\t"                                \
+        "or $f8, $f8, $f2               \n\t"                                \
+        "or $f8, $f8, $f4               \n\t"                                \
+        "or $f8, $f8, $f6               \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t"                                \
+        "mfc1 $10, $f8                  \n\t"                                \
+        "beqz $10, " #bt "              \n\t"                                \
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
+        "ldc1 $f16, " #rarg "           \n\t"                                \
+        "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
+        #rounder " $f8, $f8, $f16       \n\t"                                \
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
+        "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
+        "ldc1 $f10, 56(%2)              \n\t" /* C7     C5      C7      C5 */\
+        "ldc1 $f16, " #rarg "           \n\t"                                \
+        "pmaddhw $f10, $f10, $f6        \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
+        #rounder " $f0, $f0, $f16       \n\t"                                \
+        "paddw $f2, $f2, $f0            \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f0            \n\t"                                \
+        "ldc1 $f16, 64(%2)              \n\t"                                \
+        "psubw $f0, $f0, $f2            \n\t" /* A2             a2         */\
+        "pmaddhw $f4, $f4, $f16         \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+        "paddw $f14, $f14, $f10         \n\t" /* B0             b0         */\
+        "ldc1 $f10, 72(%2)              \n\t" /* -C5    -C1     -C5    -C1 */\
+        "pmaddhw $f10, $f10, $f6        \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+        "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "li $10, " #shift "             \n\t"                                \
+        "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
+        "mtc1 $10, $f18                 \n\t"                                \
+        "paddw $f10, $f10, $f4          \n\t" /* B1             b1         */\
+        "psraw $f14, $f14, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "mov.d $f4, $f2                 \n\t" /* A1             a1         */\
+        "paddw $f2, $f2, $f10           \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f4, $f4, $f10           \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f2, $f2, $f18           \n\t"                                \
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "packsswh $f14, $f14, $f2       \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0   */\
+        "packsswh $f4, $f4, $f8         \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1   */\
+        "sdc1 $f14, " #dst "            \n\t"                                \
+        "ldc1 $f2, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
+        "sdc1 $f4, 24+" #dst "          \n\t"                                \
+        "pmaddhw $f8, $f8, $f2          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+        "ldc1 $f16, 96(%2)              \n\t"                                \
+        "ldc1 $f14, 88(%2)              \n\t" /* C3     C7      C3      C7 */\
+        "pmaddhw $f2, $f2, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+        "pmaddhw $f14, $f14, $f6        \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
+        "ldc1 $f16, 104(%2)             \n\t"                                \
+        "mov.d $f4, $f0                 \n\t" /* A2             a2         */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+        "paddw $f8, $f8, $f14           \n\t" /* B2             b2         */\
+        "paddw $f4, $f4, $f8            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f0, $f0, $f8            \n\t" /* a2-B2          a2-b2      */\
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "mov.d $f8, $f12                \n\t" /* A3             a3         */\
+        "paddw $f6, $f6, $f2            \n\t" /* B3             b3         */\
+        "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f8, $f8, $f6            \n\t" /* a3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t"                                \
+        "packsswh $f4, $f4, $f12        \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2   */\
+        "sdc1 $f4, 8+" #dst "           \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "packsswh $f8, $f8, $f0         \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3   */\
+        "sdc1 $f8, 16+" #dst "          \n\t"                                \
+
+        //IDCT(       src0,   src4,   src1,   src5,    dst,     rounder, shift)
+        DC_COND_IDCT(0(%0),  8(%0), 16(%0), 24(%0),  0(%1), paddw,8(%2), 11)
+        Z_COND_IDCT(32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddw,(%2), 11, 4f)
+        Z_COND_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddw,(%2), 11, 2f)
+        Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1), paddw,(%2), 11, 1f)
+
+#undef  IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift)                             \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
+        "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
+        "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
+        "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1             a1         */\
+        "psubw $f10, $f10, $f2          \n\t" /* A2             a2         */\
+        "ldc1 $f2, 56(%2)               \n\t" /* C7     C5      C7      C5 */\
+        "ldc1 $f16, 64(%2)              \n\t"                                \
+        "pmaddhw $f2, $f2, $f6          \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
+        "pmaddhw $f4, $f4, $f16         \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+        "li $10, " #shift "             \n\t"                                \
+        "paddw $f14, $f14, $f2          \n\t" /* B0             b0         */\
+        "ldc1 $f2, 72(%2)               \n\t" /* -C5    -C1     -C5    -C1 */\
+        "mtc1 $10, $f18                 \n\t"                                \
+        "pmaddhw $f2, $f2, $f6          \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+        "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
+        "paddw $f2, $f2, $f4            \n\t" /* B1             b1         */\
+        "psraw $f14, $f14, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "mov.d $f4, $f0                 \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f4, $f4, $f2            \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "packsswh $f14, $f14, $f14      \n\t" /* A0+B0          a0+b0      */\
+        "swc1 $f14, " #dst "            \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
+        "swc1 $f0, 16+" #dst "          \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A1-B1          a1-b1      */\
+        "swc1 $f4, 96+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
+        "swc1 $f8, 112+" #dst "         \n\t"                                \
+        "ldc1 $f0, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+        "ldc1 $f16, 96(%2)              \n\t"                                \
+        "ldc1 $f14, 88(%2)              \n\t" /* C3     C7      C3      C7 */\
+        "pmaddhw $f0, $f0, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+        "pmaddhw $f14, $f14, $f6        \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
+        "ldc1 $f16, 104(%2)             \n\t"                                \
+        "mov.d $f4, $f10                \n\t" /* A2             a2         */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+        "paddw $f8, $f8, $f14           \n\t" /* B2             b2         */\
+        "paddw $f4, $f4, $f8            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f10, $f10, $f8          \n\t" /* a2-B2          a2-b2      */\
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "psraw $f10, $f10, $f18         \n\t"                                \
+        "mov.d $f8, $f12                \n\t" /* A3             a3         */\
+        "paddw $f6, $f6, $f0            \n\t" /* B3             b3         */\
+        "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f8, $f8, $f6            \n\t" /* a3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A2+B2          a2+b2      */\
+        "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
+        "swc1 $f4, 32+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A3-B3          a3-b3      */\
+        "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
+        "swc1 $f12, 48+" #dst "         \n\t"                                \
+        "swc1 $f8, 64+" #dst "          \n\t"                                \
+        "swc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "4:                             \n\t"
+        Z_COND_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddw,(%2), 11, 6f)
+        Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 5f)
+
+#undef  IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift)                             \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
+        "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1             a1         */\
+        "psubw $f10, $f10, $f2          \n\t" /* A2             a2         */\
+        "ldc1 $f2, 56(%2)               \n\t" /* C7     C5      C7      C5 */\
+        "li $10, " #shift "             \n\t"                                \
+        "pmaddhw $f2, $f2, $f6          \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
+        "ldc1 $f14, 72(%2)              \n\t" /* -C5    -C1     -C5    -C1 */\
+        "mtc1 $10, $f18                 \n\t"                                \
+        "pmaddhw $f14, $f14, $f6        \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+        "paddw $f2, $f2, $f8            \n\t" /* A0+B0          a0+b0      */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f2            \n\t" /* A0-B0          a0-b0      */\
+        "psraw $f2, $f2, $f18           \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "mov.d $f4, $f0                 \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f14           \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f4, $f4, $f14           \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "packsswh $f2, $f2, $f2         \n\t" /* A0+B0          a0+b0      */\
+        "swc1 $f2, " #dst "             \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
+        "swc1 $f0, 16+" #dst "          \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A1-B1          a1-b1      */\
+        "swc1 $f4, 96+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
+        "swc1 $f8, 112+" #dst "         \n\t"                                \
+        "ldc1 $f2, 88(%2)               \n\t" /* C3     C7      C3      C7 */\
+        "ldc1 $f16, 104(%2)             \n\t"                                \
+        "pmaddhw $f2, $f2, $f6          \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
+        "mov.d $f4, $f10                \n\t" /* A2             a2         */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+        "paddw $f4, $f4, $f2            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f10, $f10, $f2          \n\t" /* a2-B2          a2-b2      */\
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "psraw $f10, $f10, $f18         \n\t"                                \
+        "mov.d $f2, $f12                \n\t" /* A3             a3         */\
+        "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f2, $f2, $f6            \n\t" /* a3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t"                                \
+        "psraw $f2, $f2, $f18           \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A2+B2          a2+b2      */\
+        "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
+        "swc1 $f4, 32+" #dst "          \n\t"                                \
+        "packsswh $f2, $f2, $f2         \n\t" /* A3-B3          a3-b3      */\
+        "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
+        "swc1 $f12, 48+" #dst "         \n\t"                                \
+        "swc1 $f2, 64+" #dst "          \n\t"                                \
+        "swc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "6:                             \n\t"
+        Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 7f)
+
+#undef  IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift)                             \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f2, 56(%2)               \n\t" /* C7     C5      C7      C5 */\
+        "pmaddhw $f2, $f2, $f6          \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
+        "ldc1 $f14, 72(%2)              \n\t" /* -C5    -C1     -C5    -C1 */\
+        "li $10, " #shift "             \n\t"                                \
+        "pmaddhw $f14, $f14, $f6        \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+        "paddw $f2, $f2, $f8            \n\t" /* A0+B0          a0+b0      */\
+        "mtc1 $10, $f18                 \n\t"                                \
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f2            \n\t" /* A0-B0          a0-b0      */\
+        "psraw $f2, $f2, $f18           \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "mov.d $f4, $f0                 \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f14           \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f4, $f4, $f14           \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "packsswh $f2, $f2, $f2         \n\t" /* A0+B0          a0+b0      */\
+        "swc1 $f2, " #dst "             \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
+        "swc1 $f0, 16+" #dst "          \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A1-B1          a1-b1      */\
+        "swc1 $f4, 96+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
+        "swc1 $f8, 112+" #dst "         \n\t"                                \
+        "ldc1 $f2, 88(%2)               \n\t" /* C3     C7      C3      C7 */\
+        "ldc1 $f16, 104(%2)             \n\t"                                \
+        "pmaddhw $f2, $f2, $f6          \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
+        "mov.d $f4, $f10                \n\t" /* A2             a2         */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+        "paddw $f4, $f4, $f2            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f10, $f10, $f2          \n\t" /* a2-B2          a2-b2      */\
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "psraw $f10, $f10, $f18         \n\t"                                \
+        "mov.d $f2, $f12                \n\t" /* A3             a3         */\
+        "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f2, $f2, $f6            \n\t" /* a3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t"                                \
+        "psraw $f2, $f2, $f18           \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A2+B2          a2+b2      */\
+        "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
+        "swc1 $f4, 32+" #dst "          \n\t"                                \
+        "packsswh $f2, $f2, $f2         \n\t" /* A3-B3          a3-b3      */\
+        "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
+        "swc1 $f12, 48+" #dst "         \n\t"                                \
+        "swc1 $f2, 64+" #dst "          \n\t"                                \
+        "swc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "2:                             \n\t"
+        Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 3f)
+
+#undef  IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift)                             \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
+        "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f2, 56(%2)               \n\t" /* C7     C5      C7      C5 */\
+        "pmaddhw $f2, $f2, $f6          \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
+        "ldc1 $f16, 64(%2)              \n\t"                                \
+        "pmaddhw $f4, $f4, $f16         \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+        "paddw $f14, $f14, $f2          \n\t" /* B0             b0         */\
+        "ldc1 $f2, 72(%2)               \n\t" /* -C5    -C1     -C5    -C1 */\
+        "li $10, " #shift "             \n\t"                                \
+        "pmaddhw $f2, $f2, $f6          \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+        "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
+        "mtc1 $10, $f18                 \n\t"                                \
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
+        "paddw $f2, $f2, $f4            \n\t" /* B1             b1         */\
+        "psraw $f14, $f14, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "mov.d $f4, $f0                 \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f4, $f4, $f2            \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "packsswh $f14, $f14, $f14      \n\t" /* A0+B0          a0+b0      */\
+        "swc1 $f14, " #dst "            \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
+        "swc1 $f0, 16+" #dst "          \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A1-B1          a1-b1      */\
+        "swc1 $f4, 96+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
+        "swc1 $f8, 112+" #dst "         \n\t"                                \
+        "ldc1 $f0, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+        "ldc1 $f14, 88(%2)              \n\t" /* C3     C7      C3      C7 */\
+        "ldc1 $f16, 96(%2)              \n\t"                                \
+        "pmaddhw $f0, $f0, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+        "pmaddhw $f14, $f14, $f6        \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
+        "mov.d $f4, $f10                \n\t" /* A2             a2         */\
+        "ldc1 $f16, 104(%2)             \n\t"                                \
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+        "paddw $f8, $f8, $f14           \n\t" /* B2             b2         */\
+        "paddw $f4, $f4, $f8            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f10, $f10, $f8          \n\t" /* a2-B2          a2-b2      */\
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "psraw $f10, $f10, $f18         \n\t"                                \
+        "mov.d $f8, $f12                \n\t" /* A3             a3         */\
+        "paddw $f6, $f6, $f0            \n\t" /* B3             b3         */\
+        "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f8, $f8, $f6            \n\t" /* a3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A2+B2          a2+b2      */\
+        "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
+        "swc1 $f4, 32+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A3-B3          a3-b3      */\
+        "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
+        "swc1 $f12, 48+" #dst "         \n\t"                                \
+        "swc1 $f8, 64+" #dst "          \n\t"                                \
+        "swc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "3:                             \n\t"
+
+#undef  IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift)                             \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
+        "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f6, 64(%2)               \n\t"                                \
+        "pmaddhw $f6, $f6, $f4          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+        "li $10, " #shift "             \n\t"                                \
+        "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
+        "mtc1 $10, $f18                 \n\t"                                \
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
+        "psraw $f14, $f14, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "mov.d $f2, $f0                 \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f6            \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f2, $f2, $f6            \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "psraw $f2, $f2, $f18           \n\t"                                \
+        "packsswh $f14, $f14, $f14      \n\t" /* A0+B0  a0+b0              */\
+        "swc1 $f14, " #dst "            \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t" /* A1+B1  a1+b1              */\
+        "swc1 $f0, 16+" #dst "          \n\t"                                \
+        "packsswh $f2, $f2, $f2         \n\t" /* A1-B1  a1-b1              */\
+        "swc1 $f2, 96+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A0-B0  a0-b0              */\
+        "swc1 $f8, 112+" #dst "         \n\t"                                \
+        "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
+        "ldc1 $f16, 96(%2)              \n\t"                                \
+        "pmaddhw $f8, $f8, $f4          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+        "pmaddhw $f4, $f4, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+        "mov.d $f2, $f10                \n\t" /* A2             a2         */\
+        "paddw $f2, $f2, $f8            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f10, $f10, $f8          \n\t" /* a2-B2          a2-b2      */\
+        "psraw $f2, $f2, $f18           \n\t"                                \
+        "psraw $f10, $f10, $f18         \n\t"                                \
+        "mov.d $f8, $f12                \n\t" /* A3             a3         */\
+        "paddw $f12, $f12, $f4          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f8, $f8, $f4            \n\t" /* a3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "packsswh $f2, $f2, $f2         \n\t" /* A2+B2  a2+b2              */\
+        "packsswh $f12, $f12, $f12      \n\t" /* A3+B3  a3+b3              */\
+        "swc1 $f2, 32+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A3-B3  a3-b3              */\
+        "packsswh $f10, $f10, $f10      \n\t" /* A2-B2  a2-b2              */\
+        "swc1 $f12, 48+" #dst "         \n\t"                                \
+        "swc1 $f8, 64+" #dst "          \n\t"                                \
+        "swc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "5:                             \n\t"
+
+#undef  IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift)                             \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
+        "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1             a1         */\
+        "psubw $f10, $f10, $f2          \n\t" /* A2             a2         */\
+        "ldc1 $f4, 8+" #src0 "          \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f6, 8+" #src4 "          \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f2, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f2, $f2, $f4          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f14, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f4, $f4, $f14         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f14, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "ldc1 $f16, 40(%2)              \n\t"                                \
+        "pmaddhw $f14, $f14, $f6        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "paddw $f14, $f14, $f2          \n\t" /* A0             a0         */\
+        "paddw $f2, $f2, $f2            \n\t" /* 2C0            2c0        */\
+        "psubw $f2, $f2, $f14           \n\t" /* A3             a3         */\
+        "li $10, " #shift "             \n\t"                                \
+        "paddw $f6, $f6, $f4            \n\t" /* A1             a1         */\
+        "mtc1 $10, $f18                 \n\t"                                \
+        "paddw $f4, $f4, $f4            \n\t" /* 2C1            2c1        */\
+        "psubw $f4, $f4, $f6            \n\t" /* A2             a2         */\
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "psraw $f14, $f14, $f18         \n\t"                                \
+        "psraw $f6, $f6, $f18           \n\t"                                \
+        "packsswh $f8, $f8, $f14        \n\t" /* A0             a0         */\
+        "sdc1 $f8, " #dst "             \n\t"                                \
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "packsswh $f0, $f0, $f6         \n\t" /* A1             a1         */\
+        "sdc1 $f0, 16+" #dst "          \n\t"                                \
+        "sdc1 $f0, 96+" #dst "          \n\t"                                \
+        "sdc1 $f8, 112+" #dst "         \n\t"                                \
+        "psraw $f10, $f10, $f18         \n\t"                                \
+        "psraw $f12, $f12, $f18         \n\t"                                \
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "packsswh $f10, $f10, $f4       \n\t" /* A2-B2          a2-b2      */\
+        "sdc1 $f10, 32+" #dst "         \n\t"                                \
+        "psraw $f2, $f2, $f18           \n\t"                                \
+        "packsswh $f12, $f12, $f2       \n\t" /* A3+B3          a3+b3      */\
+        "sdc1 $f12, 48+" #dst "         \n\t"                                \
+        "sdc1 $f12, 64+" #dst "         \n\t"                                \
+        "sdc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "1:                             \n\t"
+
+#undef  IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift)                             \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "li $10, " #shift "             \n\t"                                \
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "mtc1 $10, $f18                 \n\t"                                \
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
+        "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
+        "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
+        "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1             a1         */\
+        "psubw $f10, $f10, $f2          \n\t" /* A2             a2         */\
+        "ldc1 $f2, 64(%2)               \n\t"                                \
+        "pmaddhw $f2, $f2, $f4          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+        "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
+        "psraw $f14, $f14, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "mov.d $f6, $f0                 \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f6, $f6, $f2            \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "psraw $f6, $f6, $f18           \n\t"                                \
+        "packsswh $f14, $f14, $f14      \n\t" /* A0+B0  a0+b0              */\
+        "swc1 $f14, " #dst "            \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t" /* A1+B1  a1+b1              */\
+        "swc1 $f0, 16+" #dst "          \n\t"                                \
+        "packsswh $f6, $f6, $f6         \n\t" /* A1-B1  a1-b1              */\
+        "swc1 $f6, 96+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A0-B0  a0-b0              */\
+        "swc1 $f8, 112+" #dst "         \n\t"                                \
+        "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
+        "ldc1 $f16, 96(%2)              \n\t"                                \
+        "pmaddhw $f8, $f8, $f4          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+        "pmaddhw $f4, $f4, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+        "mov.d $f6, $f10                \n\t" /* A2             a2         */\
+        "paddw $f6, $f6, $f8            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f10, $f10, $f8          \n\t" /* a2-B2          a2-b2      */\
+        "psraw $f6, $f6, $f18           \n\t"                                \
+        "psraw $f10, $f10, $f18         \n\t"                                \
+        "mov.d $f8, $f12                \n\t" /* A3             a3         */\
+        "paddw $f12, $f12, $f4          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f8, $f8, $f4            \n\t" /* a3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t"                                \
+        "packsswh $f6, $f6, $f6         \n\t" /* A2+B2          a2+b2      */\
+        "swc1 $f6, 32+" #dst "          \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
+        "swc1 $f12, 48+" #dst "         \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A3-B3          a3-b3      */\
+        "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
+        "swc1 $f8, 64+" #dst "          \n\t"                                \
+        "swc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "7:                             \n\t"
+
+#undef  IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift)                             \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "li $10, " #shift "             \n\t"                                \
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "mtc1 $10, $f18                 \n\t"                                \
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "ldc1 $f4, 8+" #src0 "          \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f2, $f2, $f4          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f14, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f4, $f4, $f14         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f14, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "psraw $f2, $f2, $f18           \n\t"                                \
+        "packsswh $f8, $f8, $f2         \n\t" /* A0             a0         */\
+        "sdc1 $f8, " #dst "             \n\t"                                \
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "packsswh $f0, $f0, $f4         \n\t" /* A1             a1         */\
+        "sdc1 $f0, 16+" #dst "          \n\t"                                \
+        "sdc1 $f0, 96+" #dst "          \n\t"                                \
+        "sdc1 $f8, 112+" #dst "         \n\t"                                \
+        "sdc1 $f0, 32+" #dst "          \n\t"                                \
+        "sdc1 $f8, 48+" #dst "          \n\t"                                \
+        "sdc1 $f8, 64+" #dst "          \n\t"                                \
+        "sdc1 $f0, 80+" #dst "          \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+
+        "9:                             \n\t"
+        ::"r"(block),"r"(temp),"r"(coeffs),"m"(ff_wm1010),"m"(ff_d40000)
+        : "$10","$11"
+    );
+}
diff --git a/libavcodec/mips/simple_idct_msa.c b/libavcodec/mips/simple_idct_msa.c
new file mode 100644
index 0000000000..bd8b31012f
--- /dev/null
+++ b/libavcodec/mips/simple_idct_msa.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "idctdsp_mips.h"
+
+static void simple_idct_msa(int16_t *block)
+{
+    int32_t const_val;
+    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 w1, w3, w5, w7;
+    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
+    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
+    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
+    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
+    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
+    v4i32 w2, w4, w6;
+    v8i16 select_vec, temp;
+    v8i16 zero = { 0 };
+    v4i32 const_val0 = __msa_ldi_w(1);
+    v4i32 const_val1 = __msa_ldi_w(1);
+
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    const_val0 <<= 10;
+    const_val = 16383 * ((1 << 19) / 16383);
+    const_val1 = __msa_insert_w(const_val0, 0, const_val);
+    const_val1 = __msa_splati_w(const_val1, 0);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
+    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    temp = in0 << 3;
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
+    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
+         temp1_r, temp1_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
+         temp2_r, temp2_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
+         a1_r, a1_l, a2_r, a2_l);
+    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
+         a3_r, a3_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
+                temp2_l, temp2_r, temp3_l, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
+    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
+    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
+    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
+    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
+         temp1_r, temp1_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
+         temp2_r, temp2_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
+         a1_r, a1_l, a2_r, a2_l);
+    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
+         a3_r, a3_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
+                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r,
+           block, 8);
+}
+
+static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
+                                int16_t *block)
+{
+    int32_t const_val;
+    uint64_t tmp0, tmp1, tmp2, tmp3;
+    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 w1, w3, w5, w7;
+    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
+    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
+    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
+    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
+    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
+    v4i32 w2, w4, w6;
+    v8i16 select_vec, temp;
+    v8i16 zero = { 0 };
+    v4i32 const_val0 = __msa_ldi_w(1);
+    v4i32 const_val1 = __msa_ldi_w(1);
+
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    const_val0 <<= 10;
+    const_val = 16383 * ((1 << 19) / 16383);
+    const_val1 = __msa_insert_w(const_val0, 0, const_val);
+    const_val1 = __msa_splati_w(const_val1, 0);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
+    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    temp = in0 << 3;
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
+    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
+    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
+    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
+                temp2_l, temp2_r, temp3_l, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
+    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
+    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
+    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
+    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
+    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
+    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
+                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
+    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
+    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
+    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
+    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
+                temp2_r, temp2_r, temp3_r, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
+    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
+    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
+    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    dst += 4 * dst_stride;
+    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
+    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
+    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
+    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
+    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
+                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
+    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
+    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
+    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
+    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    dst += 4 * dst_stride;
+}
+
+static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
+                                int16_t *block)
+{
+    int32_t const_val;
+    uint64_t tmp0, tmp1, tmp2, tmp3;
+    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 w1, w3, w5, w7;
+    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
+    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
+    v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r;
+    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
+    v4i32 temp4_l, temp5_l, temp6_l, temp7_l, temp8_l;
+    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
+    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
+    v4i32 w2, w4, w6;
+    v8i16 select_vec, temp;
+    v8i16 zero = { 0 };
+    v4i32 const_val0 = __msa_ldi_w(1);
+    v4i32 const_val1 = __msa_ldi_w(1);
+
+    const_val0 <<= 10;
+    const_val = 16383 * ((1 << 19) / 16383);
+    const_val1 = __msa_insert_w(const_val0, 0, const_val);
+    const_val1 = __msa_splati_w(const_val1, 0);
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+
+    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
+    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    UNPCK_SH_SW(in4, temp4_r, temp4_l);
+    UNPCK_SH_SW(in6, temp7_r, temp7_l);
+    ILVRL_H2_SW(in5, in7, temp8_r, temp8_l);
+    temp = in0 << 3;
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l);
+    MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l);
+    MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l);
+    ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l);
+    SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l);
+    SUB2(a2_r, temp4_r, a2_l, temp4_l, a2_r, a2_l);
+    ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l);
+    ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l);
+    SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l);
+    ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l);
+    SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
+                temp2_l, temp2_r, temp3_l, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
+    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
+    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
+    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
+    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
+    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
+    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
+    LD_SH4(dst, dst_stride, in0, in1, in2, in3);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
+                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
+    ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
+               temp0_l, temp1_l, temp2_l, temp3_l);
+    temp0_r = (v4i32) ((v8i16) (temp0_r) + (v8i16) (temp0_l));
+    temp1_r = (v4i32) ((v8i16) (temp1_r) + (v8i16) (temp1_l));
+    temp2_r = (v4i32) ((v8i16) (temp2_r) + (v8i16) (temp2_l));
+    temp3_r = (v4i32) ((v8i16) (temp3_r) + (v8i16) (temp3_l));
+    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
+    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
+    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
+    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
+    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
+                temp2_r, temp2_r, temp3_r, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
+    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
+    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
+    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
+    LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
+               a3_l, a2_l, a1_l, a0_l);
+    a3_r = (v4i32) ((v8i16) (a3_r) + (v8i16) (a3_l));
+    a2_r = (v4i32) ((v8i16) (a2_r) + (v8i16) (a2_l));
+    a1_r = (v4i32) ((v8i16) (a1_r) + (v8i16) (a1_l));
+    a0_r = (v4i32) ((v8i16) (a0_r) + (v8i16) (a0_l));
+    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
+    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
+    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
+    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
+    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
+                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
+    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
+    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
+    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
+    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
+}
+
+void ff_simple_idct_msa(int16_t *block)
+{
+    simple_idct_msa(block);
+}
+
+void ff_simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
+{
+    simple_idct_put_msa(dst, dst_stride, block);
+}
+
+void ff_simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
+{
+    simple_idct_add_msa(dst, dst_stride, block);
+}
diff --git a/libavcodec/mips/vp8_idct_msa.c b/libavcodec/mips/vp8_idct_msa.c
new file mode 100644
index 0000000000..11ac9ff83e
--- /dev/null
+++ b/libavcodec/mips/vp8_idct_msa.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+#include "libavcodec/vp8dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp8dsp_mips.h"
+
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
+
+#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3)    \
+{                                                                    \
+    v4i32 a1_m, b1_m, c1_m, d1_m;                                    \
+    v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                    \
+    v4i32 const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m;                 \
+                                                                     \
+    const_cospi8sqrt2minus1_m = __msa_fill_w(cospi8sqrt2minus1);     \
+    sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);                      \
+    a1_m = in0 + in2;                                                \
+    b1_m = in0 - in2;                                                \
+    c_tmp1_m = ((in1) * sinpi8_sqrt2_m) >> 16;                       \
+    c_tmp2_m = in3 + (((in3) * const_cospi8sqrt2minus1_m) >> 16);    \
+    c1_m = c_tmp1_m - c_tmp2_m;                                      \
+    d_tmp1_m = (in1) + (((in1) * const_cospi8sqrt2minus1_m) >> 16);  \
+    d_tmp2_m = ((in3) * sinpi8_sqrt2_m) >> 16;                       \
+    d1_m = d_tmp1_m + d_tmp2_m;                                      \
+    BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);     \
+}
+
+void ff_vp8_idct_add_msa(uint8_t *dst, int16_t input[16], ptrdiff_t stride)
+{
+    v8i16 input0, input1;
+    v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+    v4i32 res0, res1, res2, res3;
+    v16i8 zero = { 0 };
+    v16i8 pred0, pred1, pred2, pred3, dest0, dest1;
+    v16i8 mask = { 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+    /* load short vector elements of 4x4 block */
+    LD_SH2(input, 8, input0, input1);
+    UNPCK_SH_SW(input0, in0, in1);
+    UNPCK_SH_SW(input1, in2, in3);
+    VP8_IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+    /* transpose the block */
+    TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    VP8_IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+    SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+    /* transpose the block */
+    TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+    LD_SB4(dst, stride, pred0, pred1, pred2, pred3);
+    ILVR_B4_SW(zero, pred0, zero, pred1, zero, pred2, zero, pred3,
+               res0, res1, res2, res3);
+    ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+               res0, res1, res2, res3);
+    ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+    res0 = CLIP_SW_0_255(res0);
+    res1 = CLIP_SW_0_255(res1);
+    res2 = CLIP_SW_0_255(res2);
+    res3 = CLIP_SW_0_255(res3);
+    VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
+    ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
+
+    memset(input, 0, 4 * 4 * sizeof(*input));
+}
+
+void ff_vp8_idct_dc_add_msa(uint8_t *dst, int16_t in_dc[16], ptrdiff_t stride)
+{
+    v8i16 vec;
+    v8i16 res0, res1, res2, res3;
+    v16i8 zero = { 0 };
+    v16i8 pred0, pred1, pred2, pred3, dest0, dest1;
+    v16i8 mask = { 0, 2, 4, 6, 16, 18, 20, 22, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+    vec = __msa_fill_h(in_dc[0]);
+    vec = __msa_srari_h(vec, 3);
+    LD_SB4(dst, stride, pred0, pred1, pred2, pred3);
+    ILVR_B4_SH(zero, pred0, zero, pred1, zero, pred2, zero, pred3,
+               res0, res1, res2, res3);
+    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
+    ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
+
+    in_dc[0] = 0;
+}
+
+void ff_vp8_luma_dc_wht_msa(int16_t block[4][4][16], int16_t input[16])
+{
+    int16_t *mb_dq_coeff = &block[0][0][0];
+    v8i16 input0, input1;
+    v4i32 in0, in1, in2, in3, a1, b1, c1, d1;
+    v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+
+    /* load short vector elements of 4x4 block */
+    LD_SH2(input, 8, input0, input1);
+    UNPCK_SH_SW(input0, in0, in1);
+    UNPCK_SH_SW(input1, in2, in3);
+    BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1);
+    BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2);
+    /* transpose the block */
+    TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1);
+    BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2);
+    ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3);
+    SRA_4V(vt0, vt1, vt2, vt3, 3);
+    mb_dq_coeff[0] = __msa_copy_s_h((v8i16) vt0, 0);
+    mb_dq_coeff[16] = __msa_copy_s_h((v8i16) vt1, 0);
+    mb_dq_coeff[32] = __msa_copy_s_h((v8i16) vt2, 0);
+    mb_dq_coeff[48] = __msa_copy_s_h((v8i16) vt3, 0);
+    mb_dq_coeff[64] = __msa_copy_s_h((v8i16) vt0, 2);
+    mb_dq_coeff[80] = __msa_copy_s_h((v8i16) vt1, 2);
+    mb_dq_coeff[96] = __msa_copy_s_h((v8i16) vt2, 2);
+    mb_dq_coeff[112] = __msa_copy_s_h((v8i16) vt3, 2);
+    mb_dq_coeff[128] = __msa_copy_s_h((v8i16) vt0, 4);
+    mb_dq_coeff[144] = __msa_copy_s_h((v8i16) vt1, 4);
+    mb_dq_coeff[160] = __msa_copy_s_h((v8i16) vt2, 4);
+    mb_dq_coeff[176] = __msa_copy_s_h((v8i16) vt3, 4);
+    mb_dq_coeff[192] = __msa_copy_s_h((v8i16) vt0, 6);
+    mb_dq_coeff[208] = __msa_copy_s_h((v8i16) vt1, 6);
+    mb_dq_coeff[224] = __msa_copy_s_h((v8i16) vt2, 6);
+    mb_dq_coeff[240] = __msa_copy_s_h((v8i16) vt3, 6);
+
+    memset(input, 0, 4 * 4 * sizeof(int16_t));
+}
+
+void ff_vp8_idct_dc_add4y_msa(uint8_t *dst, int16_t block[4][16],
+                              ptrdiff_t stride)
+{
+    ff_vp8_idct_dc_add_msa(dst, &block[0][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + 4, &block[1][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + 8, &block[2][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + 12, &block[3][0], stride);
+}
+
+void ff_vp8_idct_dc_add4uv_msa(uint8_t *dst, int16_t block[4][16],
+                               ptrdiff_t stride)
+{
+    ff_vp8_idct_dc_add_msa(dst, &block[0][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + 4, &block[1][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + stride * 4, &block[2][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + stride * 4 + 4, &block[3][0], stride);
+}
diff --git a/libavcodec/mips/vp8_lpf_msa.c b/libavcodec/mips/vp8_lpf_msa.c
new file mode 100644
index 0000000000..359096174a
--- /dev/null
+++ b/libavcodec/mips/vp8_lpf_msa.c
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp8dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp8dsp_mips.h"
+
+#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask)           \
+{                                                                \
+    v16u8 p1_a_sub_q1, p0_a_sub_q0;                              \
+                                                                 \
+    p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                        \
+    p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                        \
+    p1_a_sub_q1 = (v16u8) __msa_srli_b((v16i8) p1_a_sub_q1, 1);  \
+    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);      \
+    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);             \
+    mask = ((v16u8) mask <= b_limit);                            \
+}
+
+#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out,  \
+                           mask_in, hev_in)                             \
+{                                                                       \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;             \
+                                                                        \
+    p1_m = (v16i8) __msa_xori_b(p1_in_out, 0x80);                       \
+    p0_m = (v16i8) __msa_xori_b(p0_in_out, 0x80);                       \
+    q0_m = (v16i8) __msa_xori_b(q0_in_out, 0x80);                       \
+    q1_m = (v16i8) __msa_xori_b(q1_in_out, 0x80);                       \
+                                                                        \
+    filt = __msa_subs_s_b(p1_m, q1_m);                                  \
+                                                                        \
+    filt = filt & (v16i8) hev_in;                                       \
+                                                                        \
+    q0_sub_p0 = q0_m - p0_m;                                            \
+    filt_sign = __msa_clti_s_b(filt, 0);                                \
+                                                                        \
+    cnst3h = __msa_ldi_h(3);                                            \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0);           \
+    q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h);  \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                     \
+    filt_r += q0_sub_p0_r;                                              \
+    filt_r = __msa_sat_s_h(filt_r, 7);                                  \
+                                                                        \
+    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0);           \
+    q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h);  \
+    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                     \
+    filt_l += q0_sub_p0_l;                                              \
+    filt_l = __msa_sat_s_h(filt_l, 7);                                  \
+                                                                        \
+    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);               \
+    filt = filt & (v16i8) mask_in;                                      \
+                                                                        \
+    cnst4b = __msa_ldi_b(4);                                            \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                               \
+    filt1 >>= 3;                                                        \
+                                                                        \
+    cnst3b = __msa_ldi_b(3);                                            \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                               \
+    filt2 >>= 3;                                                        \
+                                                                        \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                                 \
+    q0_in_out = __msa_xori_b((v16u8) q0_m, 0x80);                       \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                                 \
+    p0_in_out = __msa_xori_b((v16u8) p0_m, 0x80);                       \
+                                                                        \
+    filt = __msa_srari_b(filt1, 1);                                     \
+    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                        \
+    filt = filt & (v16i8) hev_in;                                       \
+                                                                        \
+    q1_m = __msa_subs_s_b(q1_m, filt);                                  \
+    q1_in_out = __msa_xori_b((v16u8) q1_m, 0x80);                       \
+    p1_m = __msa_adds_s_b(p1_m, filt);                                  \
+    p1_in_out = __msa_xori_b((v16u8) p1_m, 0x80);                       \
+}
+
+#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask)           \
+{                                                                   \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign;        \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign;            \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;         \
+                                                                    \
+    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                       \
+    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                       \
+    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                       \
+    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                       \
+                                                                    \
+    filt = __msa_subs_s_b(p1_m, q1_m);                              \
+                                                                    \
+    q0_sub_p0 = q0_m - p0_m;                                        \
+    filt_sign = __msa_clti_s_b(filt, 0);                            \
+                                                                    \
+    cnst3h = __msa_ldi_h(3);                                        \
+    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                  \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_r *= cnst3h;                                          \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                 \
+    filt_r += q0_sub_p0_r;                                          \
+    filt_r = __msa_sat_s_h(filt_r, 7);                              \
+                                                                    \
+    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_l *= cnst3h;                                          \
+    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                 \
+    filt_l += q0_sub_p0_l;                                          \
+    filt_l = __msa_sat_s_h(filt_l, 7);                              \
+                                                                    \
+    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);           \
+    filt = filt & (v16i8) (mask);                                   \
+                                                                    \
+    cnst4b = __msa_ldi_b(4);                                        \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                           \
+    filt1 >>= 3;                                                    \
+                                                                    \
+    cnst3b = __msa_ldi_b(3);                                        \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                           \
+    filt2 >>= 3;                                                    \
+                                                                    \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                             \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                             \
+    q0_in = __msa_xori_b((v16u8) q0_m, 0x80);                       \
+    p0_in = __msa_xori_b((v16u8) p0_m, 0x80);                       \
+}
+
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev)             \
+{                                                                   \
+    v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                       \
+    v16i8 filt, q0_sub_p0, cnst4b, cnst3b;                          \
+    v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign;               \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l;       \
+    v8i16 cnst3h, cnst27h, cnst18h, cnst63h;                        \
+                                                                    \
+    cnst3h = __msa_ldi_h(3);                                        \
+                                                                    \
+    p2_m = (v16i8) __msa_xori_b(p2, 0x80);                          \
+    p1_m = (v16i8) __msa_xori_b(p1, 0x80);                          \
+    p0_m = (v16i8) __msa_xori_b(p0, 0x80);                          \
+    q0_m = (v16i8) __msa_xori_b(q0, 0x80);                          \
+    q1_m = (v16i8) __msa_xori_b(q1, 0x80);                          \
+    q2_m = (v16i8) __msa_xori_b(q2, 0x80);                          \
+                                                                    \
+    filt = __msa_subs_s_b(p1_m, q1_m);                              \
+    q0_sub_p0 = q0_m - p0_m;                                        \
+    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                  \
+    filt_sign = __msa_clti_s_b(filt, 0);                            \
+                                                                    \
+    /* right part */                                                \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_r *= cnst3h;                                          \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                 \
+    filt_r = filt_r + q0_sub_p0_r;                                  \
+    filt_r = __msa_sat_s_h(filt_r, 7);                              \
+                                                                    \
+    /* left part */                                                 \
+    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_l *= cnst3h;                                          \
+    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                 \
+    filt_l = filt_l + q0_sub_p0_l;                                  \
+    filt_l = __msa_sat_s_h(filt_l, 7);                              \
+                                                                    \
+    /* combine left and right part */                               \
+    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);           \
+    filt = filt & (v16i8) mask;                                     \
+    filt2 = filt & (v16i8) hev;                                     \
+                                                                    \
+    /* filt_val &= ~hev */                                          \
+    hev = __msa_xori_b(hev, 0xff);                                  \
+    filt = filt & (v16i8) hev;                                      \
+    cnst4b = __msa_ldi_b(4);                                        \
+    filt1 = __msa_adds_s_b(filt2, cnst4b);                          \
+    filt1 >>= 3;                                                    \
+    cnst3b = __msa_ldi_b(3);                                        \
+    filt2 = __msa_adds_s_b(filt2, cnst3b);                          \
+    filt2 >>= 3;                                                    \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                             \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                             \
+                                                                    \
+    filt_sign = __msa_clti_s_b(filt, 0);                            \
+    ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);                   \
+                                                                    \
+    cnst27h = __msa_ldi_h(27);                                      \
+    cnst63h = __msa_ldi_h(63);                                      \
+                                                                    \
+    /* right part */                                                \
+    u_r = filt_r * cnst27h;                                         \
+    u_r += cnst63h;                                                 \
+    u_r >>= 7;                                                      \
+    u_r = __msa_sat_s_h(u_r, 7);                                    \
+    /* left part */                                                 \
+    u_l = filt_l * cnst27h;                                         \
+    u_l += cnst63h;                                                 \
+    u_l >>= 7;                                                      \
+    u_l = __msa_sat_s_h(u_l, 7);                                    \
+    /* combine left and right part */                               \
+    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                    \
+    q0_m = __msa_subs_s_b(q0_m, u);                                 \
+    q0 = __msa_xori_b((v16u8) q0_m, 0x80);                          \
+    p0_m = __msa_adds_s_b(p0_m, u);                                 \
+    p0 = __msa_xori_b((v16u8) p0_m, 0x80);                          \
+    cnst18h = __msa_ldi_h(18);                                      \
+    u_r = filt_r * cnst18h;                                         \
+    u_r += cnst63h;                                                 \
+    u_r >>= 7;                                                      \
+    u_r = __msa_sat_s_h(u_r, 7);                                    \
+                                                                    \
+    /* left part */                                                 \
+    u_l = filt_l * cnst18h;                                         \
+    u_l += cnst63h;                                                 \
+    u_l >>= 7;                                                      \
+    u_l = __msa_sat_s_h(u_l, 7);                                    \
+    /* combine left and right part */                               \
+    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                    \
+    q1_m = __msa_subs_s_b(q1_m, u);                                 \
+    q1 = __msa_xori_b((v16u8) q1_m, 0x80);                          \
+    p1_m = __msa_adds_s_b(p1_m, u);                                 \
+    p1 = __msa_xori_b((v16u8) p1_m, 0x80);                          \
+    u_r = filt_r << 3;                                              \
+    u_r += filt_r + cnst63h;                                        \
+    u_r >>= 7;                                                      \
+    u_r = __msa_sat_s_h(u_r, 7);                                    \
+                                                                    \
+    /* left part */                                                 \
+    u_l = filt_l << 3;                                              \
+    u_l += filt_l + cnst63h;                                        \
+    u_l >>= 7;                                                      \
+    u_l = __msa_sat_s_h(u_l, 7);                                    \
+    /* combine left and right part */                               \
+    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                    \
+    q2_m = __msa_subs_s_b(q2_m, u);                                 \
+    q2 = __msa_xori_b((v16u8) q2_m, 0x80);                          \
+    p2_m = __msa_adds_s_b(p2_m, u);                                 \
+    p2 = __msa_xori_b((v16u8) p2_m, 0x80);                          \
+}
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
+                     q0_in, q1_in, q2_in, q3_in,                   \
+                     limit_in, b_limit_in, thresh_in,              \
+                     hev_out, mask_out, flat_out)                  \
+{                                                                  \
+    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+                                                                   \
+    /* absolute subtraction of pixel values */                     \
+    p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in));               \
+    p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in));               \
+    p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in));               \
+    q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in));               \
+    q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in));               \
+    q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in));               \
+    p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in));               \
+    p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in));               \
+    /* calculation of hev */                                       \
+    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+    hev_out = (thresh_in) < (v16u8) flat_out;                      \
+    /* calculation of mask */                                      \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+    p1_asub_q1_m >>= 1;                                            \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+    mask_out = (b_limit_in) < p0_asub_q0_m;                        \
+    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+    mask_out = (limit_in) < (v16u8) mask_out;                      \
+    mask_out = __msa_xori_b(mask_out, 0xff);                       \
+}
+
+#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride)  \
+{                                                               \
+    uint16_t tmp0_h;                                            \
+    uint32_t tmp0_w;                                            \
+                                                                \
+    tmp0_w = __msa_copy_u_w((v4i32) in0, in0_idx);              \
+    tmp0_h = __msa_copy_u_h((v8i16) in1, in1_idx);              \
+    SW(tmp0_w, pdst);                                           \
+    SH(tmp0_h, pdst + stride);                                  \
+}
+
+void ff_vp8_v_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
+                                int limit_in, int thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    /* load vector elements */
+    temp_src = src - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    /* store vector elements */
+    temp_src = src - 3 * pitch;
+    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
+    temp_src += (4 * pitch);
+    ST_UB2(q1, q2, temp_src, pitch);
+}
+
+void ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
+                                 int thresh_in)
+{
+    uint8_t *temp_src;
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+
+    temp_src = src_u - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    temp_src = src_v - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+
+    /* rht 8 element of p3 are u pixel and left 8 element of p3 are v pixel */
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    p2_d = __msa_copy_u_d((v2i64) p2, 0);
+    p1_d = __msa_copy_u_d((v2i64) p1, 0);
+    p0_d = __msa_copy_u_d((v2i64) p0, 0);
+    q0_d = __msa_copy_u_d((v2i64) q0, 0);
+    q1_d = __msa_copy_u_d((v2i64) q1, 0);
+    q2_d = __msa_copy_u_d((v2i64) q2, 0);
+    src_u -= (pitch * 3);
+    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
+    src_u += 4 * pitch;
+    SD(q1_d, src_u);
+    src_u += pitch;
+    SD(q2_d, src_u);
+
+    p2_d = __msa_copy_u_d((v2i64) p2, 1);
+    p1_d = __msa_copy_u_d((v2i64) p1, 1);
+    p0_d = __msa_copy_u_d((v2i64) p0, 1);
+    q0_d = __msa_copy_u_d((v2i64) q0, 1);
+    q1_d = __msa_copy_u_d((v2i64) q1, 1);
+    q2_d = __msa_copy_u_d((v2i64) q2, 1);
+    src_v -= (pitch * 3);
+    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
+    src_v += 4 * pitch;
+    SD(q1_d, src_v);
+    src_v += pitch;
+    SD(q2_d, src_v);
+}
+
+void ff_vp8_h_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
+                                int limit_in, int thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    temp_src = src - 4;
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+
+    temp_src = src - 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
+}
+
+void ff_vp8_h_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
+                                 int thresh_in)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+
+    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src_v - 4, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+
+    src_u -= 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);
+
+    src_v -= 3;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
+}
+
+void ff_vp8_v_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
+                                     int b_limit_ptr)
+{
+    v16u8 p1, p0, q1, q0;
+    v16u8 mask, b_limit;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    /* load vector elements */
+    LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1);
+    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+    ST_UB2(p0, q0, (src - pitch), pitch);
+}
+
+void ff_vp8_h_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
+                                     int b_limit_ptr)
+{
+    uint8_t *temp_src;
+    v16u8 p1, p0, q1, q0;
+    v16u8 mask, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    temp_src = src - 2;
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p1, p0, q0, q1);
+    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+    ILVRL_B2_SH(q0, p0, tmp1, tmp0);
+
+    src -= 1;
+    ST2x4_UB(tmp1, 0, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp1, 4, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp0, 0, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp0, 4, src, pitch);
+    src += 4 * pitch;
+}
+
+void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
+                                       ptrdiff_t pitch, int b_limit_in,
+                                       int limit_in, int thresh_in)
+{
+    uint64_t p1_d, p0_d, q0_d, q1_d;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+
+    src_u = src_u - (pitch << 2);
+    LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    src_u += (5 * pitch);
+    src_v = src_v - (pitch << 2);
+    LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+    src_v += (5 * pitch);
+
+    /* right 8 element of p3 are u pixel and
+       left 8 element of p3 are v pixel */
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+    p1_d = __msa_copy_u_d((v2i64) p1, 0);
+    p0_d = __msa_copy_u_d((v2i64) p0, 0);
+    q0_d = __msa_copy_u_d((v2i64) q0, 0);
+    q1_d = __msa_copy_u_d((v2i64) q1, 0);
+    SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch));
+
+    p1_d = __msa_copy_u_d((v2i64) p1, 1);
+    p0_d = __msa_copy_u_d((v2i64) p0, 1);
+    q0_d = __msa_copy_u_d((v2i64) q0, 1);
+    q1_d = __msa_copy_u_d((v2i64) q1, 1);
+    SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch));
+}
+
+void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
+                                       ptrdiff_t pitch, int b_limit_in,
+                                       int limit_in, int thresh_in)
+{
+    uint8_t *temp_src_u, *temp_src_v;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+
+    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src_v - 4, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
+    tmp0 = (v4i32) __msa_ilvl_b((v16i8) p0, (v16i8) p1);
+    tmp1 = (v4i32) __msa_ilvl_b((v16i8) q1, (v16i8) q0);
+    ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
+
+    temp_src_u = src_u - 2;
+    ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
+    temp_src_u += 4 * pitch;
+    ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);
+
+    temp_src_v = src_v - 2;
+    ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
+    temp_src_v += 4 * pitch;
+    ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
+}
+
+void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
+                                      int32_t e, int32_t i, int32_t h)
+{
+    v16u8 mask, hev, flat;
+    v16u8 thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+    /* load vector elements */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    thresh = (v16u8) __msa_fill_b(h);
+    b_limit = (v16u8) __msa_fill_b(e);
+    limit = (v16u8) __msa_fill_b(i);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+
+void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
+                                      int32_t e, int32_t i, int32_t h)
+{
+    v16u8 mask, hev, flat;
+    v16u8 thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src - 4 + (8 * pitch), pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(h);
+    b_limit = (v16u8) __msa_fill_b(e);
+    limit = (v16u8) __msa_fill_b(i);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+
+    src -= 2;
+    ST4x8_UB(tmp2, tmp3, src, pitch);
+    src += (8 * pitch);
+    ST4x8_UB(tmp4, tmp5, src, pitch);
+}
diff --git a/libavcodec/mips/vp8_mc_msa.c b/libavcodec/mips/vp8_mc_msa.c
new file mode 100644
index 0000000000..2bf0abd8c9
--- /dev/null
+++ b/libavcodec/mips/vp8_mc_msa.c
@@ -0,0 +1,2332 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp8dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp8dsp_mips.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static const int8_t subpel_filters_msa[7][8] = {
+    {-6, 123, 12, -1, 0, 0, 0, 0},
+    {2, -11, 108, 36, -8, 1, 0, 0},     /* New 1/4 pel 6 tap filter */
+    {-9, 93, 50, -6, 0, 0, 0, 0},
+    {3, -16, 77, 77, -16, 3, 0, 0},     /* New 1/2 pel 6 tap filter */
+    {-6, 50, 93, -9, 0, 0, 0, 0},
+    {1, -8, 36, 108, -11, 2, 0, 0},     /* New 1/4 pel 6 tap filter */
+    {-1, 12, 123, -6, 0, 0, 0, 0},
+};
+
+static const int8_t bilinear_filters_msa[7][2] = {
+    {112, 16},
+    {96, 32},
+    {80, 48},
+    {64, 64},
+    {48, 80},
+    {32, 96},
+    {16, 112}
+};
+
+#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                 \
+                        filt_h0, filt_h1, filt_h2)                       \
+( {                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m;                                        \
+    v8i16 hz_out_m;                                                      \
+                                                                         \
+    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,  \
+               vec0_m, vec1_m, vec2_m);                                  \
+    hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m,                      \
+                            filt_h0, filt_h1, filt_h2);                  \
+                                                                         \
+    hz_out_m = __msa_srari_h(hz_out_m, 7);                               \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
+                                                                         \
+    hz_out_m;                                                            \
+} )
+
+#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, mask2,                \
+                                   filt0, filt1, filt2,                \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;              \
+                                                                       \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);  \
+    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);            \
+}
+
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, mask2,                       \
+                                   filt0, filt1, filt2,                       \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2,  \
+                 out0, out1, out2, out3);                                     \
+}
+
+#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)           \
+( {                                                             \
+    v8i16 tmp0;                                                 \
+                                                                \
+    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
+    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
+                                                                \
+    tmp0;                                                       \
+} )
+
+#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)    \
+( {                                                                    \
+    v16i8 vec0_m, vec1_m;                                              \
+    v8i16 hz_out_m;                                                    \
+                                                                       \
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m);  \
+    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1);  \
+                                                                       \
+    hz_out_m = __msa_srari_h(hz_out_m, 7);                             \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                             \
+                                                                       \
+    hz_out_m;                                                          \
+} )
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, filt0, filt1,         \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
+                                                                       \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+}
+
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, filt0, filt1,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+}
+
+static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 2;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1);
+    SRARI_H2_SH(out0, out1, 7);
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 2;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    const int8_t *filter = subpel_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+
+    src -= 2;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                              uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 2;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (4 * src_stride);
+
+        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out0, out1, out2, out3);
+        HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out4, out5, out6, out7);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SRARI_H4_SH(out4, out5, out6, out7, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+    v16u8 out;
+    v8i16 filt, out10, out32;
+
+    src -= (2 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+        XORI_B2_128_SB(src6554, src8776);
+        out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+        out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+        SRARI_H2_SH(out10, out32, 7);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2110 = src6554;
+        src4332 = src8776;
+        src4 = src8;
+    }
+}
+
+void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+    v16i8 src109_r, filt0, filt1, filt2;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= (2 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3,
+               src10_r, src32_r, src21_r, src43_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src76_r;
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src4 = src10;
+    }
+}
+
+void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                              uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l, filt0, filt1, filt2;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+
+    src -= (2 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
+               src32_r, src43_r, src21_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
+               src32_l, src43_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
+                   src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
+                              filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
+                              filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
+                              filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
+                              filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
+                              filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
+                              filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
+                              filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
+                              filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src4 = src8;
+    }
+}
+
+void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 tmp0, tmp1;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (2 + 2 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src5, src6);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src5, src6);
+        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
+
+        LD_SB2(src, src_stride, src7, src8);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src7, src8);
+        hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
+
+        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out3 = hz_out7;
+        out0 = out2;
+        out1 = out3;
+    }
+}
+
+void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 mask0, mask1, mask2, vec0, vec1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= (2 + 2 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
+        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
+        tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out4 = hz_out8;
+        out0 = out2;
+        out1 = out7;
+        out3 = out5;
+        out4 = out6;
+    }
+}
+
+
+void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_vp8_epel8_h6v6_msa(dst, dst_stride, src, src_stride, height,
+                                  mx, my);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1;
+    v16u8 out;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    SRARI_H2_SH(out0, out1, 7);
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    const int8_t *filter = subpel_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (16 == height) {
+        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                              uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 out;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= 1;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
+                                   filt1, out4, out5, out6, out7);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SRARI_H4_SH(out4, out5, out6, out7, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, filt0, filt1;
+    v8i16 filt, out10, out32;
+    v16u8 out;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB3(src, src_stride, src3, src4, src5);
+        src += (3 * src_stride);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
+
+        src2 = LD_SB(src);
+        src += (src_stride);
+        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
+        SRARI_H2_SH(out10, out32, 7);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src7, src8, src9, src10;
+    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
+                   src72_r, src87_r, src98_r, src109_r);
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src98_r;
+        src21_r = src109_r;
+        src2 = src10;
+    }
+}
+
+void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                              uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_l, src43_l, src54_l, src65_l);
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
+        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
+        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src10_l = src54_l;
+        src21_l = src65_l;
+        src2 = src6;
+    }
+}
+
+void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+    v16u8 mask0, mask1, out;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (1 + 1 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
+    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B2_128_SB(src3, src4);
+        hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
+        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        XORI_B2_128_SB(src5, src6);
+        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
+        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out1 = hz_out5;
+        vec0 = vec2;
+    }
+}
+
+void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+    v16u8 mask0, mask1, out0, out1;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= (1 + 1 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
+
+        hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);
+
+        hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
+        tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        vec0 = vec4;
+        vec2 = vec1;
+    }
+}
+
+void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_vp8_epel8_h4v4_msa(dst, dst_stride, src, src_stride, height,
+                                  mx, my);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 res0, res1, mask0, mask1, mask2;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (2 + 1 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
+        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
+        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+        XORI_B2_128_UB(res0, res1);
+        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out1 = hz_out5;
+        vec0 = vec2;
+    }
+}
+
+void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+    v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
+    v16u8 out0, out1;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= (2 + src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
+
+        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);
+
+        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
+        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_vp8_epel8_h6v4_msa(dst, dst_stride, src, src_stride, height,
+                                  mx, my);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, mask0, mask1;
+    v16u8 out;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+
+    src -= (1 + 2 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
+        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out3 = hz_out7;
+        out0 = out2;
+        out1 = out3;
+    }
+}
+
+void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, mask0, mask1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 vec0, vec1;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= (1 + 2 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+
+        hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+        out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
+        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
+        out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
+        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
+        tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out4 = hz_out8;
+        out0 = out2;
+        out1 = out6;
+        out3 = out5;
+        out4 = out7;
+    }
+}
+
+void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_vp8_epel8_h4v6_msa(dst, dst_stride, src, src_stride, height,
+                                  mx, my);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, vec0, vec1, res0, res1;
+    v8u16 vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+    SRARI_H2_UH(vec2, vec3, 7);
+    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 vec0, vec1, vec2, vec3, filt0;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16i8 res0, res1, res2, res3;
+    v8u16 vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    const int8_t *filter = bilinear_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+    ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask, out0, out1;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    if (16 == height) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+    }
+}
+
+void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    const int8_t *filter = bilinear_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+}
+
+void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
+                                 int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    loop_cnt = (height >> 2) - 1;
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                out0, out1, out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                out4, out5, out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, 7);
+    SRARI_H4_UH(out4, out5, out6, out7, 7);
+    PCKEV_ST_SB(out0, out1, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out2, out3, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out4, out5, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out6, out7, dst);
+    dst += dst_stride;
+
+    for (; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        PCKEV_ST_SB(out0, out1, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out2, out3, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out4, out5, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out6, out7, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v16u8 filt0;
+    v8i16 filt;
+    v8u16 tmp0, tmp1;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v16u8 filt0;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+
+    src8 = LD_SB(src);
+    src += src_stride;
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src87_r, src76_r, src2110, src4332, src6554, src8776);
+    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+    ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    const int8_t *filter = bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   vec0, vec1, vec2, vec3);
+        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   vec4, vec5, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src8;
+    }
+}
+
+void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    const int8_t *filter = bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+}
+
+void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
+                                 int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+
+        src0 = src4;
+    }
+}
+
+static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
+
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+    v16i8 res0, res1, res2, res3;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
+    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
+    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
+    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+               hz_out3, hz_out5, 8);
+    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
+
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
+                                 int height, int mx, int my)
+{
+    const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    } else if (8 == height) {
+        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride,
+                                          const int8_t *filter_horiz,
+                                          const int8_t *filter_vert,
+                                          int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0;
+    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp3, tmp4, 7);
+        SAT_UH2_UH(tmp3, tmp4, 7);
+        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+        SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
+                                 int height, int mx, int my)
+{
+    const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    } else {
+        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_horiz, filter_vert, height);
+    }
+}
+
+void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                  uint8_t *src, ptrdiff_t src_stride,
+                                  int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = bilinear_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1;
+    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB2(src, 8, src0, src1);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                            uint8_t *src, ptrdiff_t src_stride,
+                            int height, int mx, int my)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 8) {
+        for (cnt = height >> 3; cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t cnt, loop_cnt;
+    uint8_t *src_tmp, *dst_tmp;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_UB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst_tmp, dst_stride);
+            dst_tmp += (8 * dst_stride);
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                            uint8_t *src, ptrdiff_t src_stride,
+                            int height, int mx, int my)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3;
+
+    if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
diff --git a/libavcodec/mips/vp8dsp_init_mips.c b/libavcodec/mips/vp8dsp_init_mips.c
new file mode 100644
index 0000000000..58d1b6ce38
--- /dev/null
+++ b/libavcodec/mips/vp8dsp_init_mips.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * VP8 compatible video decoder
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/vp8dsp.h"
+#include "vp8dsp_mips.h"
+
+#define VP8_MC_MIPS_FUNC(IDX, SIZE)            \
+    dsp->put_vp8_epel_pixels_tab[IDX][0][1] =  \
+        ff_put_vp8_epel##SIZE##_h4_msa;        \
+    dsp->put_vp8_epel_pixels_tab[IDX][0][2] =  \
+        ff_put_vp8_epel##SIZE##_h6_msa;        \
+    dsp->put_vp8_epel_pixels_tab[IDX][1][0] =  \
+        ff_put_vp8_epel##SIZE##_v4_msa;        \
+    dsp->put_vp8_epel_pixels_tab[IDX][1][1] =  \
+        ff_put_vp8_epel##SIZE##_h4v4_msa;      \
+    dsp->put_vp8_epel_pixels_tab[IDX][1][2] =  \
+        ff_put_vp8_epel##SIZE##_h6v4_msa;      \
+    dsp->put_vp8_epel_pixels_tab[IDX][2][0] =  \
+        ff_put_vp8_epel##SIZE##_v6_msa;        \
+    dsp->put_vp8_epel_pixels_tab[IDX][2][1] =  \
+        ff_put_vp8_epel##SIZE##_h4v6_msa;      \
+    dsp->put_vp8_epel_pixels_tab[IDX][2][2] =  \
+        ff_put_vp8_epel##SIZE##_h6v6_msa
+
+#define VP8_BILINEAR_MC_MIPS_FUNC(IDX, SIZE)       \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][0][1] =  \
+        ff_put_vp8_bilinear##SIZE##_h_msa;         \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][0][2] =  \
+        ff_put_vp8_bilinear##SIZE##_h_msa;         \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][1][0] =  \
+        ff_put_vp8_bilinear##SIZE##_v_msa;         \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][1][1] =  \
+        ff_put_vp8_bilinear##SIZE##_hv_msa;        \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][1][2] =  \
+        ff_put_vp8_bilinear##SIZE##_hv_msa;        \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][2][0] =  \
+        ff_put_vp8_bilinear##SIZE##_v_msa;         \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] =  \
+        ff_put_vp8_bilinear##SIZE##_hv_msa;        \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] =  \
+        ff_put_vp8_bilinear##SIZE##_hv_msa
+
+#define VP8_MC_MIPS_COPY(IDX, SIZE)                \
+    dsp->put_vp8_epel_pixels_tab[IDX][0][0] =      \
+        ff_put_vp8_pixels##SIZE##_msa;             \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][0][0] =  \
+        ff_put_vp8_pixels##SIZE##_msa;
+
+#if HAVE_MSA
+static av_cold void vp8dsp_init_msa(VP8DSPContext *dsp)
+{
+    dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_msa;
+    dsp->vp8_idct_add = ff_vp8_idct_add_msa;
+    dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_msa;
+    dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_msa;
+    dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_msa;
+
+    VP8_MC_MIPS_FUNC(0, 16);
+    VP8_MC_MIPS_FUNC(1, 8);
+    VP8_MC_MIPS_FUNC(2, 4);
+
+    VP8_BILINEAR_MC_MIPS_FUNC(0, 16);
+    VP8_BILINEAR_MC_MIPS_FUNC(1, 8);
+    VP8_BILINEAR_MC_MIPS_FUNC(2, 4);
+
+    VP8_MC_MIPS_COPY(0, 16);
+    VP8_MC_MIPS_COPY(1, 8);
+
+    dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_msa;
+    dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_msa;
+    dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_msa;
+    dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_msa;
+
+    dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_msa;
+    dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_msa;
+    dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_msa;
+    dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_msa;
+
+    dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_msa;
+    dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_msa;
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_vp8dsp_init_mips(VP8DSPContext *dsp)
+{
+#if HAVE_MSA
+    vp8dsp_init_msa(dsp);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/vp8dsp_mips.h b/libavcodec/mips/vp8dsp_mips.h
new file mode 100644
index 0000000000..8e715b58be
--- /dev/null
+++ b/libavcodec/mips/vp8dsp_mips.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_VP8DSP_MIPS_H
+#define AVCODEC_MIPS_VP8DSP_MIPS_H
+
+void ff_put_vp8_pixels4_msa(uint8_t *dst, ptrdiff_t dststride,
+                            uint8_t *src, ptrdiff_t srcstride,
+                            int h, int x, int y);
+void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dststride,
+                            uint8_t *src, ptrdiff_t srcstride,
+                            int h, int x, int y);
+void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int x, int y);
+
+void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dststride,
+                              uint8_t *src, ptrdiff_t srcstride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dststride,
+                              uint8_t *src, ptrdiff_t srcstride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                              uint8_t *src, ptrdiff_t srcstride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                              uint8_t *src, ptrdiff_t srcstride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+
+void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+
+void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+
+void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dststride,
+                                 uint8_t *src, ptrdiff_t srcstride,
+                                 int h, int mx, int my);
+void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dststride,
+                                 uint8_t *src, ptrdiff_t srcstride,
+                                 int h, int mx, int my);
+void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dststride,
+                                  uint8_t *src, ptrdiff_t srcstride,
+                                  int h, int mx, int my);
+
+void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dststride,
+                                 uint8_t *src, ptrdiff_t srcstride,
+                                 int h, int mx, int my);
+
+void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dststride,
+                                 uint8_t *src, ptrdiff_t srcstride,
+                                 int h, int mx, int my);
+
+/* loop filter */
+void ff_vp8_h_loop_filter16_inner_msa(uint8_t *dst, ptrdiff_t stride,
+                                      int32_t e, int32_t i, int32_t h);
+void ff_vp8_v_loop_filter16_inner_msa(uint8_t *dst, ptrdiff_t stride,
+                                      int32_t e, int32_t i, int32_t h);
+void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *dst_u, uint8_t *dst_v,
+                                       ptrdiff_t stride,
+                                       int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *dst_u, uint8_t *dst_v,
+                                       ptrdiff_t stride,
+                                       int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_h_loop_filter16_msa(uint8_t *dst, ptrdiff_t stride,
+                                int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_v_loop_filter16_msa(uint8_t *dst, ptrdiff_t stride,
+                                int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_h_loop_filter8uv_msa(uint8_t *dst_u, uint8_t *dst_v,
+                                 ptrdiff_t stride,
+                                 int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_v_loop_filter8uv_msa(uint8_t *dst_u, uint8_t *dst_v,
+                                 ptrdiff_t stride,
+                                 int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_h_loop_filter_simple_msa(uint8_t *dst, ptrdiff_t stride, int flim);
+void ff_vp8_v_loop_filter_simple_msa(uint8_t *dst, ptrdiff_t stride, int flim);
+
+/* Idct functions */
+void ff_vp8_luma_dc_wht_msa(int16_t block[4][4][16], int16_t dc[16]);
+void ff_vp8_idct_add_msa(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add_msa(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4uv_msa(uint8_t *dst, int16_t block[4][16],
+                               ptrdiff_t stride);
+void ff_vp8_idct_dc_add4y_msa(uint8_t *dst, int16_t block[4][16],
+                              ptrdiff_t stride);
+
+#endif  // #ifndef AVCODEC_MIPS_VP8DSP_MIPS_H
diff --git a/libavcodec/mips/vp9_idct_msa.c b/libavcodec/mips/vp9_idct_msa.c
new file mode 100644
index 0000000000..25ea16c72a
--- /dev/null
+++ b/libavcodec/mips/vp9_idct_msa.c
@@ -0,0 +1,2138 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+
+#define VP9_DCT_CONST_BITS   14
+#define ROUND_POWER_OF_TWO(value, n)  (((value) + (1 << ((n) - 1))) >> (n))
+
+static const int32_t cospi_1_64 = 16364;
+static const int32_t cospi_2_64 = 16305;
+static const int32_t cospi_3_64 = 16207;
+static const int32_t cospi_4_64 = 16069;
+static const int32_t cospi_5_64 = 15893;
+static const int32_t cospi_6_64 = 15679;
+static const int32_t cospi_7_64 = 15426;
+static const int32_t cospi_8_64 = 15137;
+static const int32_t cospi_9_64 = 14811;
+static const int32_t cospi_10_64 = 14449;
+static const int32_t cospi_11_64 = 14053;
+static const int32_t cospi_12_64 = 13623;
+static const int32_t cospi_13_64 = 13160;
+static const int32_t cospi_14_64 = 12665;
+static const int32_t cospi_15_64 = 12140;
+static const int32_t cospi_16_64 = 11585;
+static const int32_t cospi_17_64 = 11003;
+static const int32_t cospi_18_64 = 10394;
+static const int32_t cospi_19_64 = 9760;
+static const int32_t cospi_20_64 = 9102;
+static const int32_t cospi_21_64 = 8423;
+static const int32_t cospi_22_64 = 7723;
+static const int32_t cospi_23_64 = 7005;
+static const int32_t cospi_24_64 = 6270;
+static const int32_t cospi_25_64 = 5520;
+static const int32_t cospi_26_64 = 4756;
+static const int32_t cospi_27_64 = 3981;
+static const int32_t cospi_28_64 = 3196;
+static const int32_t cospi_29_64 = 2404;
+static const int32_t cospi_30_64 = 1606;
+static const int32_t cospi_31_64 = 804;
+
+//  16384 * sqrt(2) * sin(kPi/9) * 2 / 3
+static const int32_t sinpi_1_9 = 5283;
+static const int32_t sinpi_2_9 = 9929;
+static const int32_t sinpi_3_9 = 13377;
+static const int32_t sinpi_4_9 = 15212;
+
+#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)  \
+{                                                                  \
+    v8i16 k0_m = __msa_fill_h(cnst0);                              \
+    v4i32 s0_m, s1_m, s2_m, s3_m;                                  \
+                                                                   \
+    s0_m = (v4i32) __msa_fill_h(cnst1);                            \
+    k0_m = __msa_ilvev_h((v8i16) s0_m, k0_m);                      \
+                                                                   \
+    ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m);                        \
+    ILVRL_H2_SW(reg0, reg1, s3_m, s2_m);                           \
+    DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m);               \
+    SRARI_W2_SW(s1_m, s0_m, VP9_DCT_CONST_BITS);                   \
+    out0 = __msa_pckev_h((v8i16) s0_m, (v8i16) s1_m);              \
+                                                                   \
+    DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m);               \
+    SRARI_W2_SW(s1_m, s0_m, VP9_DCT_CONST_BITS);                   \
+    out1 = __msa_pckev_h((v8i16) s0_m, (v8i16) s1_m);              \
+}
+
+#define VP9_DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7,  \
+                                      dst0, dst1, dst2, dst3)              \
+{                                                                          \
+    v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m;                               \
+    v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m;                               \
+                                                                           \
+    DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5,                    \
+                tp0_m, tp2_m, tp3_m, tp4_m);                               \
+    DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7,                    \
+                tp5_m, tp6_m, tp7_m, tp8_m);                               \
+    BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m);   \
+    BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m);   \
+    SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, VP9_DCT_CONST_BITS);           \
+    SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, VP9_DCT_CONST_BITS);           \
+    PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m,    \
+                dst0, dst1, dst2, dst3);                                   \
+}
+
+#define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2)          \
+( {                                                       \
+    v8i16 dst_m;                                          \
+    v4i32 tp0_m, tp1_m;                                   \
+                                                          \
+    DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m);        \
+    SRARI_W2_SW(tp1_m, tp0_m, VP9_DCT_CONST_BITS);        \
+    dst_m = __msa_pckev_h((v8i16) tp1_m, (v8i16) tp0_m);  \
+                                                          \
+    dst_m;                                                \
+} )
+
+#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,                 \
+                  out0, out1, out2, out3, out4, out5, out6, out7)         \
+{                                                                         \
+    v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                    \
+    v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                     \
+    v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64,  \
+        cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 };             \
+    v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64,              \
+        -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 };                  \
+                                                                          \
+    SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                       \
+    cnst2_m = -cnst0_m;                                                   \
+    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
+    SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                       \
+    cnst4_m = -cnst2_m;                                                   \
+    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
+                                                                          \
+    ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                \
+    ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                \
+    VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,    \
+                              cnst1_m, cnst2_m, cnst3_m, in7, in0,        \
+                              in4, in3);                                  \
+                                                                          \
+    SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                       \
+    cnst2_m = -cnst0_m;                                                   \
+    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
+    SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                       \
+    cnst4_m = -cnst2_m;                                                   \
+    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
+                                                                          \
+    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
+    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
+                                                                          \
+    VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,    \
+                              cnst1_m, cnst2_m, cnst3_m, in5, in2,        \
+                              in6, in1);                                  \
+    BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                \
+    out7 = -s0_m;                                                         \
+    out0 = s1_m;                                                          \
+                                                                          \
+    SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5,                                    \
+                 cnst0_m, cnst1_m, cnst2_m, cnst3_m);                     \
+                                                                          \
+    ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);    \
+    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
+    cnst1_m = cnst0_m;                                                    \
+                                                                          \
+    ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                \
+    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
+    VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,    \
+                              cnst2_m, cnst3_m, cnst1_m, out1, out6,      \
+                              s0_m, s1_m);                                \
+                                                                          \
+    SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                       \
+    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
+                                                                          \
+    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
+    ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                              \
+    out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);            \
+    out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);            \
+    out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);            \
+    out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);            \
+                                                                          \
+    out1 = -out1;                                                         \
+    out3 = -out3;                                                         \
+    out5 = -out5;                                                         \
+}
+
+#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1)                        \
+{                                                                         \
+    v4i32 madd0_m, madd1_m, madd2_m, madd3_m;                             \
+    v8i16 madd_s0_m, madd_s1_m;                                           \
+                                                                          \
+    ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m);                            \
+    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m,               \
+                c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m);      \
+    SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, VP9_DCT_CONST_BITS);  \
+    PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1);          \
+}
+
+#define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,       \
+                    out0, out1, out2, out3)                               \
+{                                                                         \
+    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m;                     \
+                                                                          \
+    ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m);                        \
+    ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m);                        \
+    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
+                cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
+                m4_m, m5_m, tmp3_m, tmp2_m);                              \
+    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);          \
+    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1);                  \
+    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
+                cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
+                m4_m, m5_m, tmp3_m, tmp2_m);                              \
+    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);          \
+    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3);                  \
+}
+
+#define VP9_SET_COSPI_PAIR(c0_h, c1_h)   \
+( {                                      \
+    v8i16 out0_m, r0_m, r1_m;            \
+                                         \
+    r0_m = __msa_fill_h(c0_h);           \
+    r1_m = __msa_fill_h(c1_h);           \
+    out0_m = __msa_ilvev_h(r1_m, r0_m);  \
+                                         \
+    out0_m;                              \
+} )
+
+#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)  \
+{                                                                 \
+    uint8_t *dst_m = (uint8_t *) (dst);                           \
+    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                         \
+    v16i8 tmp0_m, tmp1_m;                                         \
+    v16i8 zero_m = { 0 };                                         \
+    v8i16 res0_m, res1_m, res2_m, res3_m;                         \
+                                                                  \
+    LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m);    \
+    ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m,    \
+               zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m);   \
+    ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3,      \
+         res0_m, res1_m, res2_m, res3_m);                         \
+    CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m);               \
+    PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m);  \
+    ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride);                  \
+}
+
+#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3)       \
+{                                                                     \
+    v8i16 c0_m, c1_m, c2_m, c3_m;                                     \
+    v8i16 step0_m, step1_m;                                           \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
+                                                                      \
+    c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
+    c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
+    step0_m = __msa_ilvr_h(in2, in0);                                 \
+    DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m);        \
+                                                                      \
+    c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
+    c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
+    step1_m = __msa_ilvr_h(in3, in1);                                 \
+    DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m);        \
+    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);  \
+                                                                      \
+    PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m);      \
+    SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8);                  \
+    BUTTERFLY_4((v8i16) tmp0_m, (v8i16) tmp1_m,                       \
+                (v8i16) tmp2_m, (v8i16) tmp3_m,                       \
+                out0, out1, out2, out3);                              \
+}
+
+#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3)      \
+{                                                                     \
+    v8i16 res0_m, res1_m, c0_m, c1_m;                                 \
+    v8i16 k1_m, k2_m, k3_m, k4_m;                                     \
+    v8i16 zero_m = { 0 };                                             \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
+    v4i32 int0_m, int1_m, int2_m, int3_m;                             \
+    v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9,                 \
+        sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9,                \
+        -sinpi_4_9 };                                                 \
+                                                                      \
+    SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m);         \
+    ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m);                  \
+    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
+    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m);          \
+    int0_m = tmp2_m + tmp1_m;                                         \
+                                                                      \
+    SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m);                           \
+    ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m);                  \
+    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
+    int1_m = tmp0_m + tmp1_m;                                         \
+                                                                      \
+    c0_m = __msa_splati_h(mask_m, 6);                                 \
+    ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m);                 \
+    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
+    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
+    int2_m = tmp0_m + tmp1_m;                                         \
+                                                                      \
+    c0_m = __msa_splati_h(mask_m, 6);                                 \
+    c0_m = __msa_ilvev_h(c0_m, k1_m);                                 \
+                                                                      \
+    res0_m = __msa_ilvr_h((in1), (in3));                              \
+    tmp0_m = __msa_dotp_s_w(res0_m, c0_m);                            \
+    int3_m = tmp2_m + tmp0_m;                                         \
+                                                                      \
+    res0_m = __msa_ilvr_h((in2), (in3));                              \
+    c1_m = __msa_ilvev_h(k4_m, k3_m);                                 \
+                                                                      \
+    tmp2_m = __msa_dotp_s_w(res0_m, c1_m);                            \
+    res1_m = __msa_ilvr_h((in0), (in2));                              \
+    c1_m = __msa_ilvev_h(k1_m, zero_m);                               \
+                                                                      \
+    tmp3_m = __msa_dotp_s_w(res1_m, c1_m);                            \
+    int3_m += tmp2_m;                                                 \
+    int3_m += tmp3_m;                                                 \
+                                                                      \
+    SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, VP9_DCT_CONST_BITS);  \
+    PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1);          \
+    PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3);          \
+}
+
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,          \
+                           out0, out1, out2, out3, out4, out5, out6, out7)  \
+{                                                                           \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                   \
+    v8i16 zero_m = { 0 };                                                   \
+                                                                            \
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                      \
+               tmp0_n, tmp1_n, tmp2_n, tmp3_n);                             \
+    ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                            \
+    ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                            \
+                                                                            \
+    out0 = (v8i16) __msa_ilvr_d((v2i64) tmp1_m, (v2i64) tmp0_m);            \
+    out1 = (v8i16) __msa_ilvl_d((v2i64) tmp1_m, (v2i64) tmp0_m);            \
+    out2 = (v8i16) __msa_ilvr_d((v2i64) tmp3_m, (v2i64) tmp2_m);            \
+    out3 = (v8i16) __msa_ilvl_d((v2i64) tmp3_m, (v2i64) tmp2_m);            \
+                                                                            \
+    out4 = zero_m;                                                          \
+    out5 = zero_m;                                                          \
+    out6 = zero_m;                                                          \
+    out7 = zero_m;                                                          \
+}
+
+static void vp9_idct4x4_1_add_msa(int16_t *input, uint8_t *dst,
+                                  int32_t dst_stride)
+{
+    int16_t out;
+    v8i16 vec;
+
+    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO(out, 4);
+    vec = __msa_fill_h(out);
+
+    ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride);
+}
+
+static void vp9_idct4x4_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                          int32_t dst_stride)
+{
+    v8i16 in0, in1, in2, in3;
+
+    /* load vector elements of 4x4 block */
+    LD4x4_SH(input, in0, in1, in2, in3);
+    /* rows */
+    VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* columns */
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+    VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* rounding (add 2^3, divide by 2^4) */
+    SRARI_H4_SH(in0, in1, in2, in3, 4);
+    ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
+
+static void vp9_iadst4x4_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v8i16 in0, in1, in2, in3;
+
+    /* load vector elements of 4x4 block */
+    LD4x4_SH(input, in0, in1, in2, in3);
+    /* rows */
+    VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* columns */
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+    VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* rounding (add 2^3, divide by 2^4) */
+    SRARI_H4_SH(in0, in1, in2, in3, 4);
+    ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
+
+static void vp9_iadst_idct_4x4_add_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride, int32_t eob)
+{
+    v8i16 in0, in1, in2, in3;
+
+    /* load vector elements of 4x4 block */
+    LD4x4_SH(input, in0, in1, in2, in3);
+    /* cols */
+    VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* columns */
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+    VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* rounding (add 2^3, divide by 2^4) */
+    SRARI_H4_SH(in0, in1, in2, in3, 4);
+    ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
+
+static void vp9_idct_iadst_4x4_add_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride, int32_t eob)
+{
+    v8i16 in0, in1, in2, in3;
+
+    /* load vector elements of 4x4 block */
+    LD4x4_SH(input, in0, in1, in2, in3);
+    /* cols */
+    VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* columns */
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+    VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* rounding (add 2^3, divide by 2^4) */
+    SRARI_H4_SH(in0, in1, in2, in3, 4);
+    ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
+
+#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)     \
+( {                                                    \
+    v8i16 c0_m, c1_m;                                  \
+                                                       \
+    SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m);  \
+    c0_m = __msa_ilvev_h(c1_m, c0_m);                  \
+                                                       \
+    c0_m;                                              \
+} )
+
+/* multiply and add macro */
+#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,          \
+                 out0, out1, out2, out3)                                  \
+{                                                                         \
+    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+                                                                          \
+    ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m);                        \
+    ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m);                        \
+    DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m,               \
+                cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);      \
+    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);              \
+    DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m,               \
+                cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);      \
+    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3);              \
+}
+
+/* idct 8x8 macro */
+#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,                 \
+                       out0, out1, out2, out3, out4, out5, out6, out7)         \
+{                                                                              \
+    v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m;              \
+    v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m;              \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
+    v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64,        \
+       cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 };                 \
+                                                                               \
+    k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5);                                   \
+    k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0);                                   \
+    k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3);                                   \
+    k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2);                                   \
+    VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5);  \
+    SUB2(in1, in3, in7, in5, res0_m, res1_m);                                  \
+    k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7);                                   \
+    k1_m = __msa_splati_h(mask_m, 4);                                          \
+                                                                               \
+    ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m);                               \
+    DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m,        \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                               \
+    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);           \
+    tp4_m = in1 + in3;                                                         \
+    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m);                 \
+    tp7_m = in7 + in5;                                                         \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                       \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                        \
+    VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m,                       \
+             in0, in4, in2, in6);                                              \
+    BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m);               \
+    BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m,        \
+                out0, out1, out2, out3, out4, out5, out6, out7);               \
+}
+
+#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,              \
+                        out0, out1, out2, out3, out4, out5, out6, out7)      \
+{                                                                            \
+    v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m;                    \
+    v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m;                                \
+    v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1;          \
+    v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64,                  \
+        cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 };  \
+    v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64,                \
+        cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 };    \
+    v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64,                 \
+        -cospi_16_64, 0, 0, 0, 0 };                                          \
+                                                                             \
+    k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2);                                \
+    ILVRL_H2_SH(in1, in0, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r0_m, r1_m, r2_m, r3_m);                                     \
+    k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1);                                \
+    ILVRL_H2_SH(in5, in4, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r4_m, r5_m, r6_m, r7_m);                                     \
+    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m);                     \
+    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m);                         \
+    k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5);                                \
+    ILVRL_H2_SH(in3, in2, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r0_m, r1_m, r2_m, r3_m);                                     \
+    k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4);                                \
+    ILVRL_H2_SH(in7, in6, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r4_m, r5_m, r6_m, r7_m);                                     \
+    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m);                     \
+    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m);                         \
+    ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m);                                     \
+    BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3);        \
+    k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7);                                \
+    ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0);                                   \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r0_m, r1_m, r2_m, r3_m);                                     \
+    k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1);                                \
+    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
+                r4_m, r5_m, r6_m, r7_m);                                     \
+    ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6);                          \
+    SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5);                           \
+    k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3);                                \
+    ILVRL_H2_SH(in4, in3, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                m0_m, m1_m, m2_m, m3_m);                                     \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4);                          \
+    ILVRL_H2_SW(in5, in2, m2_m, m3_m);                                       \
+    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
+                m0_m, m1_m, m2_m, m3_m);                                     \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5);                          \
+                                                                             \
+    out1 = -in1;                                                             \
+    out3 = -in3;                                                             \
+    out5 = -in5;                                                             \
+    out7 = -in7;                                                             \
+}
+
+static void vp9_idct8x8_1_add_msa(int16_t *input, uint8_t *dst,
+                                  int32_t dst_stride)
+{
+    int16_t out;
+    int32_t val;
+    v8i16 vec;
+
+    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
+    val = ROUND_POWER_OF_TWO(out, 5);
+    vec = __msa_fill_h(val);
+
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
+    dst += (4 * dst_stride);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
+}
+
+static void vp9_idct8x8_12_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
+    v4i32 tmp0, tmp1, tmp2, tmp3;
+    v8i16 zero = { 0 };
+
+    /* load vector elements of 8x8 block */
+    LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+    //TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+    /* stage1 */
+    ILVL_H2_SH(in3, in0, in2, in1, s0, s1);
+    k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+    k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+    DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+    SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, VP9_DCT_CONST_BITS);
+    PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+    PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+    BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5);
+
+    /* stage2 */
+    ILVR_H2_SH(in3, in1, in2, in0, s1, s0);
+    k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+    DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+    SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, VP9_DCT_CONST_BITS);
+    PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+    PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+    BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3);
+
+    /* stage3 */
+    s0 = __msa_ilvr_h(s6, s5);
+
+    k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+    DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1);
+    SRARI_W2_SW(tmp0, tmp1, VP9_DCT_CONST_BITS);
+    PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
+
+    /* stage4 */
+    BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+
+    /* final rounding (add 2^4, divide by 2^5) and shift */
+    SRARI_H4_SH(in0, in1, in2, in3, 5);
+    SRARI_H4_SH(in4, in5, in6, in7, 5);
+
+    /* add block and store 8x8 */
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+    dst += (4 * dst_stride);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+static void vp9_idct8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                          int32_t dst_stride)
+{
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    /* load vector elements of 8x8 block */
+    LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    /* 1D idct8x8 */
+    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+    /* columns transform */
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    /* 1D idct8x8 */
+    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+    /* final rounding (add 2^4, divide by 2^5) and shift */
+    SRARI_H4_SH(in0, in1, in2, in3, 5);
+    SRARI_H4_SH(in4, in5, in6, in7, 5);
+    /* add block and store 8x8 */
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+    dst += (4 * dst_stride);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+static void vp9_iadst8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 cnst0, cnst1, cnst2, cnst3, cnst4;
+    v8i16 temp0, temp1, temp2, temp3, s0, s1;
+    v16i8 zero = { 0 };
+
+    /* load vector elements of 8x8 block */
+    LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+    /* 1D adst8x8 */
+    VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+              in0, in1, in2, in3, in4, in5, in6, in7);
+
+    /* columns transform */
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+
+    cnst0 = __msa_fill_h(cospi_2_64);
+    cnst1 = __msa_fill_h(cospi_30_64);
+    cnst2 = -cnst0;
+    ILVEV_H2_SH(cnst0, cnst1, cnst1, cnst2, cnst0, cnst1);
+    cnst2 = __msa_fill_h(cospi_18_64);
+    cnst3 = __msa_fill_h(cospi_14_64);
+    cnst4 = -cnst2;
+    ILVEV_H2_SH(cnst2, cnst3, cnst3, cnst4, cnst2, cnst3);
+
+    ILVRL_H2_SH(in0, in7, temp1, temp0);
+    ILVRL_H2_SH(in4, in3, temp3, temp2);
+    VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst1, cnst2,
+                              cnst3, in7, in0, in4, in3);
+
+    cnst0 = __msa_fill_h(cospi_10_64);
+    cnst1 = __msa_fill_h(cospi_22_64);
+    cnst2 = -cnst0;
+    ILVEV_H2_SH(cnst0, cnst1, cnst1, cnst2, cnst0, cnst1);
+    cnst2 = __msa_fill_h(cospi_26_64);
+    cnst3 = __msa_fill_h(cospi_6_64);
+    cnst4 = -cnst2;
+    ILVEV_H2_SH(cnst2, cnst3, cnst3, cnst4, cnst2, cnst3);
+
+    ILVRL_H2_SH(in2, in5, temp1, temp0);
+    ILVRL_H2_SH(in6, in1, temp3, temp2);
+    VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst1, cnst2,
+                              cnst3, in5, in2, in6, in1);
+    BUTTERFLY_4(in7, in0, in2, in5, s1, s0, in2, in5);
+    out7 = -s0;
+    out0 = s1;
+    SRARI_H2_SH(out0, out7, 5);
+    dst0 = LD_UB(dst + 0 * dst_stride);
+    dst7 = LD_UB(dst + 7 * dst_stride);
+
+    res0 = (v8i16) __msa_ilvr_b(zero, (v16i8) dst0);
+    res0 += out0;
+    res0 = CLIP_SH_0_255(res0);
+    res0 = (v8i16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
+    ST8x1_UB(res0, dst);
+
+    res7 = (v8i16) __msa_ilvr_b(zero, (v16i8) dst7);
+    res7 += out7;
+    res7 = CLIP_SH_0_255(res7);
+    res7 = (v8i16) __msa_pckev_b((v16i8) res7, (v16i8) res7);
+    ST8x1_UB(res7, dst + 7 * dst_stride);
+
+    cnst1 = __msa_fill_h(cospi_24_64);
+    cnst0 = __msa_fill_h(cospi_8_64);
+    cnst3 = -cnst1;
+    cnst2 = -cnst0;
+
+    ILVEV_H2_SH(cnst3, cnst0, cnst1, cnst2, cnst3, cnst2);
+    cnst0 = __msa_ilvev_h(cnst1, cnst0);
+    cnst1 = cnst0;
+
+    ILVRL_H2_SH(in4, in3, temp1, temp0);
+    ILVRL_H2_SH(in6, in1, temp3, temp2);
+    VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst2, cnst3,
+                              cnst1, out1, out6, s0, s1);
+    out1 = -out1;
+    SRARI_H2_SH(out1, out6, 5);
+    dst1 = LD_UB(dst + 1 * dst_stride);
+    dst6 = LD_UB(dst + 6 * dst_stride);
+    ILVR_B2_SH(zero, dst1, zero, dst6, res1, res6);
+    ADD2(res1, out1, res6, out6, res1, res6);
+    CLIP_SH2_0_255(res1, res6);
+    PCKEV_B2_SH(res1, res1, res6, res6, res1, res6);
+    ST8x1_UB(res1, dst + dst_stride);
+    ST8x1_UB(res6, dst + 6 * dst_stride);
+
+    cnst0 = __msa_fill_h(cospi_16_64);
+    cnst1 = -cnst0;
+    cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+    ILVRL_H2_SH(in2, in5, temp1, temp0);
+    ILVRL_H2_SH(s0, s1, temp3, temp2);
+    out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp0, temp1, cnst0);
+    out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp0, temp1, cnst1);
+    out3 = -out3;
+    SRARI_H2_SH(out3, out4, 5);
+    dst3 = LD_UB(dst + 3 * dst_stride);
+    dst4 = LD_UB(dst + 4 * dst_stride);
+    ILVR_B2_SH(zero, dst3, zero, dst4, res3, res4);
+    ADD2(res3, out3, res4, out4, res3, res4);
+    CLIP_SH2_0_255(res3, res4);
+    PCKEV_B2_SH(res3, res3, res4, res4, res3, res4);
+    ST8x1_UB(res3, dst + 3 * dst_stride);
+    ST8x1_UB(res4, dst + 4 * dst_stride);
+
+    out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst0);
+    out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst1);
+    out5 = -out5;
+    SRARI_H2_SH(out2, out5, 5);
+    dst2 = LD_UB(dst + 2 * dst_stride);
+    dst5 = LD_UB(dst + 5 * dst_stride);
+    ILVR_B2_SH(zero, dst2, zero, dst5, res2, res5);
+    ADD2(res2, out2, res5, out5, res2, res5);
+    CLIP_SH2_0_255(res2, res5);
+    PCKEV_B2_SH(res2, res2, res5, res5, res2, res5);
+    ST8x1_UB(res2, dst + 2 * dst_stride);
+    ST8x1_UB(res5, dst + 5 * dst_stride);
+}
+
+static void vp9_iadst_idct_8x8_add_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride, int32_t eob)
+{
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    /* load vector elements of 8x8 block */
+    LD_SH8(input, 8, in1, in6, in3, in4, in5, in2, in7, in0);
+    /* 1D idct8x8 */
+    VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                    in0, in1, in2, in3, in4, in5, in6, in7);
+    /* columns transform */
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    /* 1D idct8x8 */
+    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+    /* final rounding (add 2^4, divide by 2^5) and shift */
+    SRARI_H4_SH(in0, in1, in2, in3, 5);
+    SRARI_H4_SH(in4, in5, in6, in7, 5);
+    /* add block and store 8x8 */
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+    dst += (4 * dst_stride);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+static void vp9_idct_iadst_8x8_add_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride, int32_t eob)
+{
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+    /* load vector elements of 8x8 block */
+    LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+    /* 1D idct8x8 */
+    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+    /* columns transform */
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in1, in6, in3, in4, in5, in2, in7, in0);
+    /* 1D idct8x8 */
+    VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                    in0, in1, in2, in3, in4, in5, in6, in7);
+    /* final rounding (add 2^4, divide by 2^5) and shift */
+    SRARI_H4_SH(in0, in1, in2, in3, 5);
+    SRARI_H4_SH(in4, in5, in6, in7, 5);
+    /* add block and store 8x8 */
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+    dst += (4 * dst_stride);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8,          \
+                         r9, r10, r11, r12, r13, r14, r15,            \
+                         out0, out1, out2, out3, out4, out5,          \
+                         out6, out7, out8, out9, out10, out11,        \
+                         out12, out13, out14, out15)                  \
+{                                                                     \
+    v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m;             \
+    v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m;       \
+    v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m;             \
+    v8i16 h8_m, h9_m, h10_m, h11_m;                                   \
+    v8i16 k0_m, k1_m, k2_m, k3_m;                                     \
+                                                                      \
+    /* stage 1 */                                                     \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);              \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);             \
+    VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m,              \
+                g0_m, g1_m, g2_m, g3_m);                              \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);              \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);             \
+    VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m,             \
+                g4_m, g5_m, g6_m, g7_m);                              \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);               \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);              \
+    VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m,             \
+                g8_m, g9_m, g10_m, g11_m);                            \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);              \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);             \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);               \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);              \
+    VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m,              \
+                g12_m, g13_m, g14_m, g15_m);                          \
+                                                                      \
+    /* stage 2 */                                                     \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);              \
+    VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m,      \
+                h0_m, h1_m, h2_m, h3_m);                              \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);              \
+    k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);             \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);             \
+    VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m,     \
+                h4_m, h5_m, h6_m, h7_m);                              \
+    BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10);    \
+    BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m,    \
+                h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m);    \
+                                                                      \
+    /* stage 3 */                                                     \
+    BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m);  \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);              \
+    VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m,       \
+                out4, out6, out5, out7);                              \
+    VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m,       \
+                out12, out14, out13, out15);                          \
+                                                                      \
+    /* stage 4 */                                                     \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
+    k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);            \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
+    k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);             \
+    VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3);             \
+    VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7);               \
+    VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11);           \
+    VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15);           \
+}
+
+static void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v8i16 loc0, loc1, loc2, loc3;
+    v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+    v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+    v8i16 tmp5, tmp6, tmp7;
+
+    /* load up 8x8 */
+    LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+    input += 8 * 16;
+    /* load bottom 8x8 */
+    LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+    VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+    VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+    BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+    VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+    VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+    VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+    BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+
+    reg0 = reg2 - loc1;
+    reg2 = reg2 + loc1;
+    reg12 = reg14 - loc0;
+    reg14 = reg14 + loc0;
+    reg4 = reg6 - loc3;
+    reg6 = reg6 + loc3;
+    reg8 = reg10 - loc2;
+    reg10 = reg10 + loc2;
+
+    /* stage 2 */
+    VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+    VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+    reg9 = reg1 - loc2;
+    reg1 = reg1 + loc2;
+    reg7 = reg15 - loc3;
+    reg15 = reg15 + loc3;
+
+    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+    VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+    BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+    loc1 = reg15 + reg3;
+    reg3 = reg15 - reg3;
+    loc2 = reg2 + loc1;
+    reg15 = reg2 - loc1;
+
+    loc1 = reg1 + reg13;
+    reg13 = reg1 - reg13;
+    loc0 = reg0 + loc1;
+    loc1 = reg0 - loc1;
+    tmp6 = loc0;
+    tmp7 = loc1;
+    reg0 = loc2;
+
+    VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+    VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5,
+                        reg11);
+
+    loc0 = reg9 + reg5;
+    reg5 = reg9 - reg5;
+    reg2 = reg6 + loc0;
+    reg1 = reg6 - loc0;
+
+    loc0 = reg7 + reg11;
+    reg11 = reg7 - reg11;
+    loc1 = reg4 + loc0;
+    loc2 = reg4 - loc0;
+    tmp5 = loc1;
+
+    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+    BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+    reg10 = loc0;
+    reg11 = loc1;
+
+    VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+    BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+    reg13 = loc2;
+
+    /* Transpose and store the output */
+    reg12 = tmp5;
+    reg14 = tmp6;
+    reg3 = tmp7;
+
+    SRARI_H4_SH(reg0, reg2, reg4, reg6, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
+    dst += (4 * dst_stride);
+    SRARI_H4_SH(reg8, reg10, reg12, reg14, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
+    dst += (4 * dst_stride);
+    SRARI_H4_SH(reg3, reg13, reg11, reg5, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
+    dst += (4 * dst_stride);
+    SRARI_H4_SH(reg7, reg9, reg1, reg15, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
+}
+
+static void vp9_idct16_1d_columns_msa(int16_t *input, int16_t *output)
+{
+    v8i16 loc0, loc1, loc2, loc3;
+    v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+    v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+    v8i16 tmp5, tmp6, tmp7;
+
+    /* load up 8x8 */
+    LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+    input += 8 * 16;
+    /* load bottom 8x8 */
+    LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+    VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+    VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+    BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+    VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+    VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+    VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+    BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+
+    reg0 = reg2 - loc1;
+    reg2 = reg2 + loc1;
+    reg12 = reg14 - loc0;
+    reg14 = reg14 + loc0;
+    reg4 = reg6 - loc3;
+    reg6 = reg6 + loc3;
+    reg8 = reg10 - loc2;
+    reg10 = reg10 + loc2;
+
+    /* stage 2 */
+    VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+    VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+    reg9 = reg1 - loc2;
+    reg1 = reg1 + loc2;
+    reg7 = reg15 - loc3;
+    reg15 = reg15 + loc3;
+
+    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+    VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+    BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+    loc1 = reg15 + reg3;
+    reg3 = reg15 - reg3;
+    loc2 = reg2 + loc1;
+    reg15 = reg2 - loc1;
+
+    loc1 = reg1 + reg13;
+    reg13 = reg1 - reg13;
+    loc0 = reg0 + loc1;
+    loc1 = reg0 - loc1;
+    tmp6 = loc0;
+    tmp7 = loc1;
+    reg0 = loc2;
+
+    VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+    VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5,
+                        reg11);
+
+    loc0 = reg9 + reg5;
+    reg5 = reg9 - reg5;
+    reg2 = reg6 + loc0;
+    reg1 = reg6 - loc0;
+
+    loc0 = reg7 + reg11;
+    reg11 = reg7 - reg11;
+    loc1 = reg4 + loc0;
+    loc2 = reg4 - loc0;
+
+    tmp5 = loc1;
+
+    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+    BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+    reg10 = loc0;
+    reg11 = loc1;
+
+    VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+    BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+    reg13 = loc2;
+
+    /* Transpose and store the output */
+    reg12 = tmp5;
+    reg14 = tmp6;
+    reg3 = tmp7;
+
+    /* transpose block */
+    TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
+                       reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+    ST_SH4(reg0, reg2, reg4, reg6, output, 16);
+    ST_SH4(reg8, reg10, reg12, reg14, (output + 4 * 16), 16);
+
+    /* transpose block */
+    TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
+                       reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+    ST_SH4(reg3, reg13, reg11, reg5, (output + 8), 16);
+    ST_SH4(reg7, reg9, reg1, reg15, (output + 8 + 4 * 16), 16);
+}
+
+static void vp9_idct16x16_1_add_msa(int16_t *input, uint8_t *dst,
+                                    int32_t dst_stride)
+{
+    uint8_t i;
+    int16_t out;
+    v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7;
+    v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+
+    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO(out, 6);
+
+    vec = __msa_fill_h(out);
+
+    for (i = 4; i--;)
+    {
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        UNPCK_UB_SH(dst0, res0, res4);
+        UNPCK_UB_SH(dst1, res1, res5);
+        UNPCK_UB_SH(dst2, res2, res6);
+        UNPCK_UB_SH(dst3, res3, res7);
+        ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2,
+             res3);
+        ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6,
+             res7);
+        CLIP_SH4_0_255(res0, res1, res2, res3);
+        CLIP_SH4_0_255(res4, res5, res6, res7);
+        PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
+                    tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void vp9_idct16x16_10_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out = out_arr;
+
+    /* transform rows */
+    vp9_idct16_1d_columns_msa(input, out);
+
+    /* short case just considers top 4 rows as valid output */
+    out += 4 * 16;
+    for (i = 12; i--;) {
+        __asm__ volatile (
+            "sw     $zero,   0(%[out])     \n\t"
+            "sw     $zero,   4(%[out])     \n\t"
+            "sw     $zero,   8(%[out])     \n\t"
+            "sw     $zero,  12(%[out])     \n\t"
+            "sw     $zero,  16(%[out])     \n\t"
+            "sw     $zero,  20(%[out])     \n\t"
+            "sw     $zero,  24(%[out])     \n\t"
+            "sw     $zero,  28(%[out])     \n\t"
+
+            :
+            : [out] "r" (out)
+        );
+
+        out += 16;
+    }
+
+    out = out_arr;
+
+    /* transform columns */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
+    }
+}
+
+static void vp9_idct16x16_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                            int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out = out_arr;
+
+    /* transform rows */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_idct16_1d_columns_msa((input + (i << 3)), (out + (i << 7)));
+    }
+
+    /* transform columns */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
+    }
+}
+
+static void vp9_iadst16_1d_columns_msa(int16_t *input, int16_t *output)
+{
+    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+    v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+    /* load input data */
+    LD_SH16(input, 16,
+            l0, l1, l2, l3, l4, l5, l6, l7,
+            l8, l9, l10, l11, l12, l13, l14, l15);
+
+    /* ADST in horizontal */
+    VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7,
+                     l8, l9, l10, l11, l12, l13, l14, l15,
+                     r0, r1, r2, r3, r4, r5, r6, r7,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+
+    l1 = -r8;
+    l3 = -r4;
+    l13 = -r13;
+    l15 = -r1;
+
+    TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2,
+                       l0, l1, l2, l3, l4, l5, l6, l7);
+    ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
+    TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15,
+                       l8, l9, l10, l11, l12, l13, l14, l15);
+    ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
+}
+
+static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                              int32_t dst_stride)
+{
+    v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
+    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+    v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
+    v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+    v16i8 zero = { 0 };
+
+    r0 = LD_SH(input + 0 * 16);
+    r3 = LD_SH(input + 3 * 16);
+    r4 = LD_SH(input + 4 * 16);
+    r7 = LD_SH(input + 7 * 16);
+    r8 = LD_SH(input + 8 * 16);
+    r11 = LD_SH(input + 11 * 16);
+    r12 = LD_SH(input + 12 * 16);
+    r15 = LD_SH(input + 15 * 16);
+
+    /* stage 1 */
+    k0 = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
+    VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+    k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
+    VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+    BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
+    k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+    k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
+    VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+    r1 = LD_SH(input + 1 * 16);
+    r2 = LD_SH(input + 2 * 16);
+    r5 = LD_SH(input + 5 * 16);
+    r6 = LD_SH(input + 6 * 16);
+    r9 = LD_SH(input + 9 * 16);
+    r10 = LD_SH(input + 10 * 16);
+    r13 = LD_SH(input + 13 * 16);
+    r14 = LD_SH(input + 14 * 16);
+
+    k0 = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
+    VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
+    k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
+    VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
+    BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
+    BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
+    out1 = -out1;
+    SRARI_H2_SH(out0, out1, 6);
+    dst0 = LD_UB(dst + 0 * dst_stride);
+    dst1 = LD_UB(dst + 15 * dst_stride);
+    ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1);
+    ADD2(res0, out0, res1, out1, res0, res1);
+    CLIP_SH2_0_255(res0, res1);
+    PCKEV_B2_SH(res0, res0, res1, res1, res0, res1);
+    ST8x1_UB(res0, dst);
+    ST8x1_UB(res1, dst + 15 * dst_stride);
+
+    k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+    k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
+    VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+    BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+    out8 = -out8;
+
+    SRARI_H2_SH(out8, out9, 6);
+    dst8 = LD_UB(dst + 1 * dst_stride);
+    dst9 = LD_UB(dst + 14 * dst_stride);
+    ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9);
+    ADD2(res8, out8, res9, out9, res8, res9);
+    CLIP_SH2_0_255(res8, res9);
+    PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
+    ST8x1_UB(res8, dst + dst_stride);
+    ST8x1_UB(res9, dst + 14 * dst_stride);
+
+    k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+    k2 = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
+    VP9_MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
+    out4 = -out4;
+    SRARI_H2_SH(out4, out5, 6);
+    dst4 = LD_UB(dst + 3 * dst_stride);
+    dst5 = LD_UB(dst + 12 * dst_stride);
+    ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5);
+    ADD2(res4, out4, res5, out5, res4, res5);
+    CLIP_SH2_0_255(res4, res5);
+    PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
+    ST8x1_UB(res4, dst + 3 * dst_stride);
+    ST8x1_UB(res5, dst + 12 * dst_stride);
+
+    VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+    out13 = -out13;
+    SRARI_H2_SH(out12, out13, 6);
+    dst12 = LD_UB(dst + 2 * dst_stride);
+    dst13 = LD_UB(dst + 13 * dst_stride);
+    ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13);
+    ADD2(res12, out12, res13, out13, res12, res13);
+    CLIP_SH2_0_255(res12, res13);
+    PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
+    ST8x1_UB(res12, dst + 2 * dst_stride);
+    ST8x1_UB(res13, dst + 13 * dst_stride);
+
+    k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+    k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+    VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7);
+    SRARI_H2_SH(out6, out7, 6);
+    dst6 = LD_UB(dst + 4 * dst_stride);
+    dst7 = LD_UB(dst + 11 * dst_stride);
+    ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7);
+    ADD2(res6, out6, res7, out7, res6, res7);
+    CLIP_SH2_0_255(res6, res7);
+    PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
+    ST8x1_UB(res6, dst + 4 * dst_stride);
+    ST8x1_UB(res7, dst + 11 * dst_stride);
+
+    VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
+    SRARI_H2_SH(out10, out11, 6);
+    dst10 = LD_UB(dst + 6 * dst_stride);
+    dst11 = LD_UB(dst + 9 * dst_stride);
+    ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11);
+    ADD2(res10, out10, res11, out11, res10, res11);
+    CLIP_SH2_0_255(res10, res11);
+    PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
+    ST8x1_UB(res10, dst + 6 * dst_stride);
+    ST8x1_UB(res11, dst + 9 * dst_stride);
+
+    k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+    VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3);
+    SRARI_H2_SH(out2, out3, 6);
+    dst2 = LD_UB(dst + 7 * dst_stride);
+    dst3 = LD_UB(dst + 8 * dst_stride);
+    ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3);
+    ADD2(res2, out2, res3, out3, res2, res3);
+    CLIP_SH2_0_255(res2, res3);
+    PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
+    ST8x1_UB(res2, dst + 7 * dst_stride);
+    ST8x1_UB(res3, dst + 8 * dst_stride);
+
+    VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
+    SRARI_H2_SH(out14, out15, 6);
+    dst14 = LD_UB(dst + 5 * dst_stride);
+    dst15 = LD_UB(dst + 10 * dst_stride);
+    ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15);
+    ADD2(res14, out14, res15, out15, res14, res15);
+    CLIP_SH2_0_255(res14, res15);
+    PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
+    ST8x1_UB(res14, dst + 5 * dst_stride);
+    ST8x1_UB(res15, dst + 10 * dst_stride);
+}
+
+static void vp9_iadst16x16_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out = out_arr;
+    int32_t i;
+
+    /* transform rows */
+    for (i = 0; i < 2; i++) {
+        /* process 16 * 8 block */
+        vp9_iadst16_1d_columns_msa((input + (i << 3)), (out + (i << 7)));
+    }
+
+    /* transform columns */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_iadst16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+                                          dst_stride);
+    }
+}
+
+static void vp9_iadst_idct_16x16_add_msa(int16_t *input, uint8_t *dst,
+                                         int32_t dst_stride, int32_t eob)
+{
+    int32_t i;
+    int16_t out[16 * 16];
+    int16_t *out_ptr = &out[0];
+
+    /* transform rows */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_iadst16_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 7)));
+    }
+
+    /* transform columns */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                         (dst + (i << 3)), dst_stride);
+    }
+}
+
+static void vp9_idct_iadst_16x16_add_msa(int16_t *input, uint8_t *dst,
+                                         int32_t dst_stride, int32_t eob)
+{
+    int32_t i;
+    int16_t out[16 * 16];
+    int16_t *out_ptr = &out[0];
+
+    /* transform rows */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_idct16_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 7)));
+    }
+
+    /* transform columns */
+    for (i = 0; i < 2; i++) {
+        /* process 8 * 16 block */
+        vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                          (dst + (i << 3)), dst_stride);
+    }
+}
+
+static void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf,
+                                               int16_t *tmp_eve_buf,
+                                               int16_t *tmp_odd_buf,
+                                               int16_t *dst)
+{
+    v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+    v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+    /* FINAL BUTTERFLY : Dependency on Even & Odd */
+    vec0 = LD_SH(tmp_odd_buf);
+    vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+    loc0 = LD_SH(tmp_eve_buf);
+    loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+
+    ST_SH((loc0 - vec3), (tmp_buf + 31 * 8));
+    ST_SH((loc1 - vec2), (tmp_buf + 23 * 8));
+    ST_SH((loc2 - vec1), (tmp_buf + 27 * 8));
+    ST_SH((loc3 - vec0), (tmp_buf + 19 * 8));
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+
+    ST_SH((loc0 - vec3), (tmp_buf + 29 * 8));
+    ST_SH((loc1 - vec2), (tmp_buf + 21 * 8));
+    ST_SH((loc2 - vec1), (tmp_buf + 25 * 8));
+    ST_SH((loc3 - vec0), (tmp_buf + 17 * 8));
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+
+    ST_SH((loc0 - vec3), (tmp_buf + 30 * 8));
+    ST_SH((loc1 - vec2), (tmp_buf + 22 * 8));
+    ST_SH((loc2 - vec1), (tmp_buf + 26 * 8));
+    ST_SH((loc3 - vec0), (tmp_buf + 18 * 8));
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+
+    ST_SH((loc0 - vec3), (tmp_buf + 28 * 8));
+    ST_SH((loc1 - vec2), (tmp_buf + 20 * 8));
+    ST_SH((loc2 - vec1), (tmp_buf + 24 * 8));
+    ST_SH((loc3 - vec0), (tmp_buf + 16 * 8));
+
+    /* Transpose : 16 vectors */
+    /* 1st & 2nd 8x8 */
+    TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                       m0, n0, m1, n1, m2, n2, m3, n3);
+    ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
+    ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
+
+    TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                       m4, n4, m5, n5, m6, n6, m7, n7);
+    ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
+    ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
+
+    /* 3rd & 4th 8x8 */
+    LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
+    LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
+    TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                       m0, n0, m1, n1, m2, n2, m3, n3);
+    ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
+    ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
+
+    TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                       m4, n4, m5, n5, m6, n6, m7, n7);
+    ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
+    ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
+}
+
+static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf,
+                                                   int16_t *tmp_eve_buf)
+{
+    v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+    v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+    v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+    /* Even stage 1 */
+    LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+    tmp_buf += (2 * 32);
+
+    VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+    VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+    BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+    VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+    loc1 = vec3;
+    loc0 = vec1;
+
+    VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+    VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+    BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+    BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+    BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+    /* Even stage 2 */
+    /* Load 8 */
+    LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+    VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+    VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+    VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+    vec0 = reg0 + reg4;
+    reg0 = reg0 - reg4;
+    reg4 = reg6 + reg2;
+    reg6 = reg6 - reg2;
+    reg2 = reg1 + reg5;
+    reg1 = reg1 - reg5;
+    reg5 = reg7 + reg3;
+    reg7 = reg7 - reg3;
+    reg3 = vec0;
+
+    vec1 = reg2;
+    reg2 = reg3 + reg4;
+    reg3 = reg3 - reg4;
+    reg4 = reg5 - vec1;
+    reg5 = reg5 + vec1;
+
+    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+    VP9_DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+    vec0 = reg0 - reg6;
+    reg0 = reg0 + reg6;
+    vec1 = reg7 - reg1;
+    reg7 = reg7 + reg1;
+
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+    /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+    /* Store 8 */
+    BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+    ST_SH2(loc1, loc3, tmp_eve_buf, 8);
+    ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8);
+
+    BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+    ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8);
+    ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8);
+
+    /* Store 8 */
+    BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+    ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
+    ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);
+
+    BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+    ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
+    ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
+}
+
+static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
+                                                  int16_t *tmp_odd_buf)
+{
+    v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+    v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+    /* Odd stage 1 */
+    reg0 = LD_SH(tmp_buf + 32);
+    reg1 = LD_SH(tmp_buf + 7 * 32);
+    reg2 = LD_SH(tmp_buf + 9 * 32);
+    reg3 = LD_SH(tmp_buf + 15 * 32);
+    reg4 = LD_SH(tmp_buf + 17 * 32);
+    reg5 = LD_SH(tmp_buf + 23 * 32);
+    reg6 = LD_SH(tmp_buf + 25 * 32);
+    reg7 = LD_SH(tmp_buf + 31 * 32);
+
+    VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+    VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+    VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+    vec0 = reg0 + reg3;
+    reg0 = reg0 - reg3;
+    reg3 = reg7 + reg4;
+    reg7 = reg7 - reg4;
+    reg4 = reg1 + reg2;
+    reg1 = reg1 - reg2;
+    reg2 = reg6 + reg5;
+    reg6 = reg6 - reg5;
+    reg5 = vec0;
+
+    /* 4 Stores */
+    ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
+    ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
+    SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+    ST_SH2(vec0, vec1, tmp_odd_buf, 8);
+
+    /* 4 Stores */
+    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+    VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+    BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+    ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
+    VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+    ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
+
+    /* Odd stage 2 */
+    /* 8 loads */
+    reg0 = LD_SH(tmp_buf + 3 * 32);
+    reg1 = LD_SH(tmp_buf + 5 * 32);
+    reg2 = LD_SH(tmp_buf + 11 * 32);
+    reg3 = LD_SH(tmp_buf + 13 * 32);
+    reg4 = LD_SH(tmp_buf + 19 * 32);
+    reg5 = LD_SH(tmp_buf + 21 * 32);
+    reg6 = LD_SH(tmp_buf + 27 * 32);
+    reg7 = LD_SH(tmp_buf + 29 * 32);
+
+    VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+    VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+    VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+    /* 4 Stores */
+    SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
+         vec0, vec1, vec2, vec3);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+    VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+    BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
+    ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
+    VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+    ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
+
+    /* 4 Stores */
+    ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7,
+         vec0, vec1, vec2, vec3);
+    BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+    ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
+    VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+    ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
+
+    /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+    /* Load 8 & Store 8 */
+    LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
+    LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
+
+    ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+         loc0, loc1, loc2, loc3);
+    ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
+
+    SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+    SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+    ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
+
+    /* Load 8 & Store 8 */
+    LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
+    LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
+
+    ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+         loc0, loc1, loc2, loc3);
+    ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
+
+    SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+    SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+    ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
+}
+
+static void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+                                                 int16_t *tmp_odd_buf,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride)
+{
+    v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+    v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+    /* FINAL BUTTERFLY : Dependency on Even & Odd */
+    vec0 = LD_SH(tmp_odd_buf);
+    vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+    loc0 = LD_SH(tmp_eve_buf);
+    loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+    SRARI_H4_SH(m0, m2, m4, m6, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
+
+    SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
+    SRARI_H4_SH(m0, m2, m4, m6, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride),
+                        m0, m2, m4, m6);
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+    SRARI_H4_SH(m1, m3, m5, m7, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride),
+                        m1, m3, m5, m7);
+
+    SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
+    SRARI_H4_SH(m1, m3, m5, m7, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride),
+                        m1, m3, m5, m7);
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+    SRARI_H4_SH(n0, n2, n4, n6, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride),
+                        n0, n2, n4, n6);
+
+    SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
+    SRARI_H4_SH(n0, n2, n4, n6, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride),
+                        n0, n2, n4, n6);
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+    SRARI_H4_SH(n1, n3, n5, n7, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride),
+                        n1, n3, n5, n7);
+
+    SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
+    SRARI_H4_SH(n1, n3, n5, n7, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride),
+                        n1, n3, n5, n7);
+}
+
+static void vp9_idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+
+    vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+    vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+    vp9_idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0],
+                                         dst, dst_stride);
+}
+
+static void vp9_idct8x32_1d_columns_msa(int16_t *input, int16_t *output,
+                                        int16_t *tmp_buf)
+{
+    int16_t tmp_odd_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t tmp_eve_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+
+    vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+    vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+    vp9_idct_butterfly_transpose_store(tmp_buf, &tmp_eve_buf[0],
+                                       &tmp_odd_buf[0], output);
+}
+
+static void vp9_idct32x32_1_add_msa(int16_t *input, uint8_t *dst,
+                                    int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out;
+    v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec;
+
+    out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO((out * cospi_16_64), VP9_DCT_CONST_BITS);
+    out = ROUND_POWER_OF_TWO(out, 6);
+
+    vec = __msa_fill_h(out);
+
+    for (i = 16; i--;)
+    {
+        LD_UB2(dst, 16, dst0, dst1);
+        LD_UB2(dst + dst_stride, 16, dst2, dst3);
+
+        UNPCK_UB_SH(dst0, res0, res4);
+        UNPCK_UB_SH(dst1, res1, res5);
+        UNPCK_UB_SH(dst2, res2, res6);
+        UNPCK_UB_SH(dst3, res3, res7);
+        ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2,
+             res3);
+        ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6,
+             res7);
+        CLIP_SH4_0_255(res0, res1, res2, res3);
+        CLIP_SH4_0_255(res4, res5, res6, res7);
+        PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
+                    tmp0, tmp1, tmp2, tmp3);
+
+        ST_UB2(tmp0, tmp1, dst, 16);
+        dst += dst_stride;
+        ST_UB2(tmp2, tmp3, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void vp9_idct32x32_34_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out_arr[32 * 32] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out_ptr = out_arr;
+    int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(ALIGNMENT);
+
+    for (i = 32; i--;) {
+        __asm__ volatile (
+            "sw     $zero,       (%[out_ptr])     \n\t"
+            "sw     $zero,      4(%[out_ptr])     \n\t"
+            "sw     $zero,      8(%[out_ptr])     \n\t"
+            "sw     $zero,     12(%[out_ptr])     \n\t"
+            "sw     $zero,     16(%[out_ptr])     \n\t"
+            "sw     $zero,     20(%[out_ptr])     \n\t"
+            "sw     $zero,     24(%[out_ptr])     \n\t"
+            "sw     $zero,     28(%[out_ptr])     \n\t"
+            "sw     $zero,     32(%[out_ptr])     \n\t"
+            "sw     $zero,     36(%[out_ptr])     \n\t"
+            "sw     $zero,     40(%[out_ptr])     \n\t"
+            "sw     $zero,     44(%[out_ptr])     \n\t"
+            "sw     $zero,     48(%[out_ptr])     \n\t"
+            "sw     $zero,     52(%[out_ptr])     \n\t"
+            "sw     $zero,     56(%[out_ptr])     \n\t"
+            "sw     $zero,     60(%[out_ptr])     \n\t"
+
+            :
+            : [out_ptr] "r" (out_ptr)
+        );
+
+        out_ptr += 32;
+    }
+
+    out_ptr = out_arr;
+
+    /* process 8*32 block */
+    vp9_idct8x32_1d_columns_msa(input, out_ptr, &tmp_buf[0]);
+
+    /* transform columns */
+    for (i = 0; i < 4; i++) {
+        /* process 8*32 block */
+        vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                           (dst + (i << 3)), dst_stride);
+    }
+}
+
+static void vp9_idct32x32_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                            int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out_arr[32 * 32] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out_ptr = out_arr;
+    int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(ALIGNMENT);
+
+    /* transform rows */
+    for (i = 0; i < 4; i++) {
+        /* process 8*32 block */
+        vp9_idct8x32_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 8)),
+                                    &tmp_buf[0]);
+    }
+
+    /* transform columns */
+    for (i = 0; i < 4; i++) {
+        /* process 8*32 block */
+        vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                           (dst + (i << 3)), dst_stride);
+    }
+}
+
+void ff_idct_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob)
+{
+    if (eob > 1) {
+        vp9_idct4x4_colcol_addblk_msa(block, dst, stride);
+        memset(block, 0, 4 * 4 * sizeof(*block));
+    }
+    else {
+        vp9_idct4x4_1_add_msa(block, dst, stride);
+        block[0] = 0;
+    }
+}
+
+void ff_idct_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob)
+{
+    if (eob == 1) {
+        vp9_idct8x8_1_add_msa(block, dst, stride);
+        block[0] = 0;
+    }
+    else if (eob <= 12) {
+        vp9_idct8x8_12_colcol_addblk_msa(block, dst, stride);
+        memset(block, 0, 4 * 8 * sizeof(*block));
+    }
+    else {
+        vp9_idct8x8_colcol_addblk_msa(block, dst, stride);
+        memset(block, 0, 8 * 8 * sizeof(*block));
+    }
+}
+
+void ff_idct_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob)
+{
+    int i;
+
+    if (eob == 1) {
+        /* DC only DCT coefficient. */
+        vp9_idct16x16_1_add_msa(block, dst, stride);
+        block[0] = 0;
+    }
+    else if (eob <= 10) {
+        vp9_idct16x16_10_colcol_addblk_msa(block, dst, stride);
+        for (i = 0; i < 4; ++i) {
+            memset(block, 0, 4 * sizeof(*block));
+            block += 16;
+        }
+    }
+    else {
+        vp9_idct16x16_colcol_addblk_msa(block, dst, stride);
+        memset(block, 0, 16 * 16 * sizeof(*block));
+    }
+}
+
+void ff_idct_idct_32x32_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob)
+{
+    int i;
+
+    if (eob == 1) {
+        vp9_idct32x32_1_add_msa(block, dst, stride);
+        block[0] = 0;
+    }
+    else if (eob <= 34) {
+        vp9_idct32x32_34_colcol_addblk_msa(block, dst, stride);
+        for (i = 0; i < 8; ++i) {
+            memset(block, 0, 8 * sizeof(*block));
+            block += 32;
+        }
+    }
+    else {
+        vp9_idct32x32_colcol_addblk_msa(block, dst, stride);
+        memset(block, 0, 32 * 32 * sizeof(*block));
+    }
+}
+
+void ff_iadst_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob)
+{
+    vp9_iadst4x4_colcol_addblk_msa(block, dst, stride);
+    memset(block, 0, 4 * 4 * sizeof(*block));
+}
+
+void ff_iadst_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob)
+{
+    vp9_iadst8x8_colcol_addblk_msa(block, dst, stride);
+    memset(block, 0, 8 * 8 * sizeof(*block));
+}
+
+void ff_iadst_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                  int16_t *block, int eob)
+{
+    vp9_iadst16x16_colcol_addblk_msa(block, dst, stride);
+    memset(block, 0, 16 * 16 * sizeof(*block));
+}
+
+void ff_idct_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob)
+{
+    vp9_idct_iadst_4x4_add_msa(block, dst, stride, eob);
+    memset(block, 0, 4 * 4 * sizeof(*block));
+}
+
+void ff_idct_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob)
+{
+    vp9_idct_iadst_8x8_add_msa(block, dst, stride, eob);
+    memset(block, 0, 8 * 8 * sizeof(*block));
+}
+
+void ff_idct_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                 int16_t *block, int eob)
+{
+    vp9_idct_iadst_16x16_add_msa(block, dst, stride, eob);
+    memset(block, 0, 16 * 16 * sizeof(*block));
+}
+
+void ff_iadst_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob)
+{
+    vp9_iadst_idct_4x4_add_msa(block, dst, stride, eob);
+    memset(block, 0, 4 * 4 * sizeof(*block));
+}
+
+void ff_iadst_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob)
+{
+    vp9_iadst_idct_8x8_add_msa(block, dst, stride, eob);
+    memset(block, 0, 8 * 8 * sizeof(*block));
+}
+
+void ff_iadst_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                 int16_t *block, int eob)
+{
+    vp9_iadst_idct_16x16_add_msa(block, dst, stride, eob);
+    memset(block, 0, 16 * 16 * sizeof(*block));
+}
diff --git a/libavcodec/mips/vp9_intra_msa.c b/libavcodec/mips/vp9_intra_msa.c
new file mode 100644
index 0000000000..54cf0ae94f
--- /dev/null
+++ b/libavcodec/mips/vp9_intra_msa.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+
+#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1)  \
+{                                                \
+    out0 = __msa_subs_u_h(out0, in0);            \
+    out1 = __msa_subs_u_h(out1, in1);            \
+}
+
+void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
+                       const uint8_t *src)
+{
+    uint32_t row;
+    v16u8 src0;
+
+    src0 = LD_UB(src);
+
+    for (row = 16; row--;) {
+        ST_UB(src0, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
+                       const uint8_t *src)
+{
+    uint32_t row;
+    v16u8 src1, src2;
+
+    src1 = LD_UB(src);
+    src2 = LD_UB(src + 16);
+
+    for (row = 32; row--;) {
+        ST_UB2(src1, src2, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
+                      const uint8_t *top)
+{
+    uint32_t row, inp;
+    v16u8 src0, src1, src2, src3;
+
+    src += 12;
+    for (row = 4; row--;) {
+        inp = LW(src);
+        src -= 4;
+
+        src0 = (v16u8) __msa_fill_b(inp >> 24);
+        src1 = (v16u8) __msa_fill_b(inp >> 16);
+        src2 = (v16u8) __msa_fill_b(inp >> 8);
+        src3 = (v16u8) __msa_fill_b(inp);
+
+        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
+                      const uint8_t *top)
+{
+    uint32_t row, inp;
+    v16u8 src0, src1, src2, src3;
+
+    src += 28;
+    for (row = 8; row--;) {
+        inp = LW(src);
+        src -= 4;
+
+        src0 = (v16u8) __msa_fill_b(inp >> 24);
+        src1 = (v16u8) __msa_fill_b(inp >> 16);
+        src2 = (v16u8) __msa_fill_b(inp >> 8);
+        src3 = (v16u8) __msa_fill_b(inp);
+
+        ST_UB2(src0, src0, dst, 16);
+        dst += dst_stride;
+        ST_UB2(src1, src1, dst, 16);
+        dst += dst_stride;
+        ST_UB2(src2, src2, dst, 16);
+        dst += dst_stride;
+        ST_UB2(src3, src3, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
+                   const uint8_t *src_top)
+{
+    uint32_t val0, val1;
+    v16i8 store, src = { 0 };
+    v8u16 sum_h;
+    v4u32 sum_w;
+    v2u64 sum_d;
+
+    val0 = LW(src_top);
+    val1 = LW(src_left);
+    INSERT_W2_SB(val0, val1, src);
+    sum_h = __msa_hadd_u_h((v16u8) src, (v16u8) src);
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);
+    store = __msa_splati_b((v16i8) sum_w, 0);
+    val0 = __msa_copy_u_w((v4i32) store, 0);
+
+    SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+#define INTRA_DC_TL_4x4(dir)                                    \
+void ff_dc_##dir##_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
+                           const uint8_t *left,                 \
+                           const uint8_t *top)                  \
+{                                                               \
+    uint32_t val0;                                              \
+    v16i8 store, data = { 0 };                                  \
+    v8u16 sum_h;                                                \
+    v4u32 sum_w;                                                \
+                                                                \
+    val0 = LW(dir);                                             \
+    data = (v16i8) __msa_insert_w((v4i32) data, 0, val0);       \
+    sum_h = __msa_hadd_u_h((v16u8) data, (v16u8) data);         \
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);                       \
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_w, 2);            \
+    store = __msa_splati_b((v16i8) sum_w, 0);                   \
+    val0 = __msa_copy_u_w((v4i32) store, 0);                    \
+                                                                \
+    SW4(val0, val0, val0, val0, dst, dst_stride);               \
+}
+INTRA_DC_TL_4x4(top);
+INTRA_DC_TL_4x4(left);
+
+void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
+                   const uint8_t *src_top)
+{
+    uint64_t val0, val1;
+    v16i8 store;
+    v16u8 src = { 0 };
+    v8u16 sum_h;
+    v4u32 sum_w;
+    v2u64 sum_d;
+
+    val0 = LD(src_top);
+    val1 = LD(src_left);
+    INSERT_D2_UB(val0, val1, src);
+    sum_h = __msa_hadd_u_h(src, src);
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);
+    store = __msa_splati_b((v16i8) sum_w, 0);
+    val0 = __msa_copy_u_d((v2i64) store, 0);
+
+    SD4(val0, val0, val0, val0, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+#define INTRA_DC_TL_8x8(dir)                                    \
+void ff_dc_##dir##_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
+                           const uint8_t *left,                 \
+                           const uint8_t *top)                  \
+{                                                               \
+    uint64_t val0;                                              \
+    v16i8 store;                                                \
+    v16u8 data = { 0 };                                         \
+    v8u16 sum_h;                                                \
+    v4u32 sum_w;                                                \
+    v2u64 sum_d;                                                \
+                                                                \
+    val0 = LD(dir);                                             \
+    data = (v16u8) __msa_insert_d((v2i64) data, 0, val0);       \
+    sum_h = __msa_hadd_u_h(data, data);                         \
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);                       \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                       \
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);            \
+    store = __msa_splati_b((v16i8) sum_w, 0);                   \
+    val0 = __msa_copy_u_d((v2i64) store, 0);                    \
+                                                                \
+    SD4(val0, val0, val0, val0, dst, dst_stride);               \
+    dst += (4 * dst_stride);                                    \
+    SD4(val0, val0, val0, val0, dst, dst_stride);               \
+}
+
+INTRA_DC_TL_8x8(top);
+INTRA_DC_TL_8x8(left);
+
+void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top)
+{
+    v16u8 top, left, out;
+    v8u16 sum_h, sum_top, sum_left;
+    v4u32 sum_w;
+    v2u64 sum_d;
+
+    top = LD_UB(src_top);
+    left = LD_UB(src_left);
+    HADD_UB2_UH(top, left, sum_top, sum_left);
+    sum_h = sum_top + sum_left;
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);
+    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
+
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+#define INTRA_DC_TL_16x16(dir)                                        \
+void ff_dc_##dir##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,      \
+                             const uint8_t *left,                     \
+                             const uint8_t *top)                      \
+{                                                                     \
+    v16u8 data, out;                                                  \
+    v8u16 sum_h;                                                      \
+    v4u32 sum_w;                                                      \
+    v2u64 sum_d;                                                      \
+                                                                      \
+    data = LD_UB(dir);                                                \
+    sum_h = __msa_hadd_u_h(data, data);                               \
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);                             \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                             \
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);      \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                             \
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);                  \
+    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);                   \
+                                                                      \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
+    dst += (8 * dst_stride);                                          \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
+}
+INTRA_DC_TL_16x16(top);
+INTRA_DC_TL_16x16(left);
+
+void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top)
+{
+    uint32_t row;
+    v16u8 top0, top1, left0, left1, out;
+    v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
+    v4u32 sum_w;
+    v2u64 sum_d;
+
+    LD_UB2(src_top, 16, top0, top1);
+    LD_UB2(src_left, 16, left0, left1);
+    HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
+    HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
+    sum_h = sum_top0 + sum_top1;
+    sum_h += sum_left0 + sum_left1;
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 6);
+    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
+
+    for (row = 16; row--;)
+    {
+        ST_UB2(out, out, dst, 16);
+        dst += dst_stride;
+        ST_UB2(out, out, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+#define INTRA_DC_TL_32x32(dir)                                    \
+void ff_dc_##dir##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
+                             const uint8_t *left,                 \
+                             const uint8_t *top)                  \
+{                                                                 \
+    uint32_t row;                                                 \
+    v16u8 data0, data1, out;                                      \
+    v8u16 sum_h, sum_data0, sum_data1;                            \
+    v4u32 sum_w;                                                  \
+    v2u64 sum_d;                                                  \
+                                                                  \
+    LD_UB2(dir, 16, data0, data1);                                \
+    HADD_UB2_UH(data0, data1, sum_data0, sum_data1);              \
+    sum_h = sum_data0 + sum_data1;                                \
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);                         \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                         \
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);  \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                         \
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);              \
+    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);               \
+                                                                  \
+    for (row = 16; row--;)                                        \
+    {                                                             \
+        ST_UB2(out, out, dst, 16);                                \
+        dst += dst_stride;                                        \
+        ST_UB2(out, out, dst, 16);                                \
+        dst += dst_stride;                                        \
+    }                                                             \
+}
+INTRA_DC_TL_32x32(top);
+INTRA_DC_TL_32x32(left);
+
+#define INTRA_PREDICT_VALDC_16X16_MSA(val)                             \
+void ff_dc_##val##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,       \
+                             const uint8_t *left, const uint8_t *top)  \
+{                                                                      \
+    v16u8 out = (v16u8) __msa_ldi_b(val);                              \
+                                                                       \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);   \
+    dst += (8 * dst_stride);                                           \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);   \
+}
+
+INTRA_PREDICT_VALDC_16X16_MSA(127);
+INTRA_PREDICT_VALDC_16X16_MSA(128);
+INTRA_PREDICT_VALDC_16X16_MSA(129);
+
+#define INTRA_PREDICT_VALDC_32X32_MSA(val)                             \
+void ff_dc_##val##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,       \
+                             const uint8_t *left, const uint8_t *top)  \
+{                                                                      \
+    uint32_t row;                                                      \
+    v16u8 out = (v16u8) __msa_ldi_b(val);                              \
+                                                                       \
+    for (row = 16; row--;)                                             \
+    {                                                                  \
+        ST_UB2(out, out, dst, 16);                                     \
+        dst += dst_stride;                                             \
+        ST_UB2(out, out, dst, 16);                                     \
+        dst += dst_stride;                                             \
+    }                                                                  \
+}
+
+INTRA_PREDICT_VALDC_32X32_MSA(127);
+INTRA_PREDICT_VALDC_32X32_MSA(128);
+INTRA_PREDICT_VALDC_32X32_MSA(129);
+
+void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                   const uint8_t *src_left, const uint8_t *src_top_ptr)
+{
+    uint32_t left;
+    uint8_t top_left = src_top_ptr[-1];
+    v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
+    v16u8 src0, src1, src2, src3;
+    v8u16 src_top_left, vec0, vec1, vec2, vec3;
+
+    src_top_left = (v8u16) __msa_fill_h(top_left);
+    src_top = LD_SB(src_top_ptr);
+    left = LW(src_left);
+    src_left0 = __msa_fill_b(left >> 24);
+    src_left1 = __msa_fill_b(left >> 16);
+    src_left2 = __msa_fill_b(left >> 8);
+    src_left3 = __msa_fill_b(left);
+
+    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+               src_left3, src_top, src0, src1, src2, src3);
+    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+    ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+}
+
+void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                   const uint8_t *src_left, const uint8_t *src_top_ptr)
+{
+    uint8_t top_left = src_top_ptr[-1];
+    uint32_t loop_cnt, left;
+    v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
+    v8u16 src_top_left, vec0, vec1, vec2, vec3;
+    v16u8 src0, src1, src2, src3;
+
+    src_top = LD_SB(src_top_ptr);
+    src_top_left = (v8u16) __msa_fill_h(top_left);
+
+    src_left += 4;
+    for (loop_cnt = 2; loop_cnt--;) {
+        left = LW(src_left);
+        src_left0 = __msa_fill_b(left >> 24);
+        src_left1 = __msa_fill_b(left >> 16);
+        src_left2 = __msa_fill_b(left >> 8);
+        src_left3 = __msa_fill_b(left);
+        src_left -= 4;
+
+        ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+                   src_left3, src_top, src0, src1, src2, src3);
+        HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+        SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top_ptr)
+{
+    uint8_t top_left = src_top_ptr[-1];
+    uint32_t loop_cnt, left;
+    v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
+    v8u16 src_top_left, res_r, res_l;
+
+    src_top = LD_SB(src_top_ptr);
+    src_top_left = (v8u16) __msa_fill_h(top_left);
+
+    src_left += 12;
+    for (loop_cnt = 4; loop_cnt--;) {
+        left = LW(src_left);
+        src_left0 = __msa_fill_b(left >> 24);
+        src_left1 = __msa_fill_b(left >> 16);
+        src_left2 = __msa_fill_b(left >> 8);
+        src_left3 = __msa_fill_b(left);
+        src_left -= 4;
+
+        ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
+        HADD_UB2_UH(res_r, res_l, res_r, res_l);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+
+        SAT_UH2_UH(res_r, res_l, 7);
+        PCKEV_ST_SB(res_r, res_l, dst);
+        dst += dst_stride;
+
+        ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
+        HADD_UB2_UH(res_r, res_l, res_r, res_l);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+        SAT_UH2_UH(res_r, res_l, 7);
+        PCKEV_ST_SB(res_r, res_l, dst);
+        dst += dst_stride;
+
+        ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
+        HADD_UB2_UH(res_r, res_l, res_r, res_l);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+        SAT_UH2_UH(res_r, res_l, 7);
+        PCKEV_ST_SB(res_r, res_l, dst);
+        dst += dst_stride;
+
+        ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
+        HADD_UB2_UH(res_r, res_l, res_r, res_l);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+        SAT_UH2_UH(res_r, res_l, 7);
+        PCKEV_ST_SB(res_r, res_l, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top_ptr)
+{
+    uint8_t top_left = src_top_ptr[-1];
+    uint32_t loop_cnt, left;
+    v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
+    v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
+
+    src_top0 = LD_SB(src_top_ptr);
+    src_top1 = LD_SB(src_top_ptr + 16);
+    src_top_left = (v8u16) __msa_fill_h(top_left);
+
+    src_left += 28;
+    for (loop_cnt = 8; loop_cnt--;) {
+        left = LW(src_left);
+        src_left0 = __msa_fill_b(left >> 24);
+        src_left1 = __msa_fill_b(left >> 16);
+        src_left2 = __msa_fill_b(left >> 8);
+        src_left3 = __msa_fill_b(left);
+        src_left -= 4;
+
+        ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
+        ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
+        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+                    res_l1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+        PCKEV_ST_SB(res_r0, res_l0, dst);
+        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+        dst += dst_stride;
+
+        ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
+        ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
+        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+                    res_l1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+        PCKEV_ST_SB(res_r0, res_l0, dst);
+        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+        dst += dst_stride;
+
+        ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
+        ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
+        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+                    res_l1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+        PCKEV_ST_SB(res_r0, res_l0, dst);
+        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+        dst += dst_stride;
+
+        ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
+        ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
+        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+                    res_l1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+        PCKEV_ST_SB(res_r0, res_l0, dst);
+        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+        dst += dst_stride;
+    }
+}
diff --git a/libavcodec/mips/vp9_lpf_msa.c b/libavcodec/mips/vp9_lpf_msa.c
new file mode 100644
index 0000000000..63e538eb90
--- /dev/null
+++ b/libavcodec/mips/vp9_lpf_msa.c
@@ -0,0 +1,2599 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+
+#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
+                           p1_out, p0_out, q0_out, q1_out)               \
+{                                                                        \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                  \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                            \
+    v8i16 q0_sub_p0_r, filt_r, cnst3h;                                   \
+                                                                         \
+    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                            \
+    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                            \
+    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                            \
+    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                            \
+                                                                         \
+    filt = __msa_subs_s_b(p1_m, q1_m);                                   \
+    filt = filt & (v16i8) hev_in;                                        \
+    q0_sub_p0 = q0_m - p0_m;                                             \
+    filt_sign = __msa_clti_s_b(filt, 0);                                 \
+                                                                         \
+    cnst3h = __msa_ldi_h(3);                                             \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0);            \
+    q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h);   \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                      \
+    filt_r += q0_sub_p0_r;                                               \
+    filt_r = __msa_sat_s_h(filt_r, 7);                                   \
+                                                                         \
+    /* combine left and right part */                                    \
+    filt = __msa_pckev_b((v16i8) filt_r, (v16i8) filt_r);                \
+                                                                         \
+    filt = filt & (v16i8) mask_in;                                       \
+    cnst4b = __msa_ldi_b(4);                                             \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                                \
+    filt1 >>= 3;                                                         \
+                                                                         \
+    cnst3b = __msa_ldi_b(3);                                             \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                                \
+    filt2 >>= 3;                                                         \
+                                                                         \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                                  \
+    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                           \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                                  \
+    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                           \
+                                                                         \
+    filt = __msa_srari_b(filt1, 1);                                      \
+    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                         \
+    filt = filt & (v16i8) hev_in;                                        \
+                                                                         \
+    q1_m = __msa_subs_s_b(q1_m, filt);                                   \
+    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                           \
+    p1_m = __msa_adds_s_b(p1_m, filt);                                   \
+    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                           \
+}
+
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
+                           p1_out, p0_out, q0_out, q1_out)               \
+{                                                                        \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                  \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                            \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;              \
+                                                                         \
+    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                            \
+    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                            \
+    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                            \
+    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                            \
+                                                                         \
+    filt = __msa_subs_s_b(p1_m, q1_m);                                   \
+                                                                         \
+    filt = filt & (v16i8) hev_in;                                        \
+                                                                         \
+    q0_sub_p0 = q0_m - p0_m;                                             \
+    filt_sign = __msa_clti_s_b(filt, 0);                                 \
+                                                                         \
+    cnst3h = __msa_ldi_h(3);                                             \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0);            \
+    q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h);   \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                      \
+    filt_r += q0_sub_p0_r;                                               \
+    filt_r = __msa_sat_s_h(filt_r, 7);                                   \
+                                                                         \
+    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0);            \
+    q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h);   \
+    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                      \
+    filt_l += q0_sub_p0_l;                                               \
+    filt_l = __msa_sat_s_h(filt_l, 7);                                   \
+                                                                         \
+    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);                \
+    filt = filt & (v16i8) mask_in;                                       \
+                                                                         \
+    cnst4b = __msa_ldi_b(4);                                             \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                                \
+    filt1 >>= 3;                                                         \
+                                                                         \
+    cnst3b = __msa_ldi_b(3);                                             \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                                \
+    filt2 >>= 3;                                                         \
+                                                                         \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                                  \
+    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                           \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                                  \
+    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                           \
+                                                                         \
+    filt = __msa_srari_b(filt1, 1);                                      \
+    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                         \
+    filt = filt & (v16i8) hev_in;                                        \
+                                                                         \
+    q1_m = __msa_subs_s_b(q1_m, filt);                                   \
+    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                           \
+    p1_m = __msa_adds_s_b(p1_m, filt);                                   \
+    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                           \
+}
+
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)  \
+{                                                                      \
+    v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;     \
+    v16u8 zero_in = { 0 };                                             \
+                                                                       \
+    tmp = __msa_ori_b(zero_in, 1);                                     \
+    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                        \
+    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                        \
+    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                        \
+    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                        \
+                                                                       \
+    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);             \
+    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                   \
+    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);             \
+    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                   \
+                                                                       \
+    flat_out = (tmp < (v16u8) flat_out);                               \
+    flat_out = __msa_xori_b(flat_out, 0xff);                           \
+    flat_out = flat_out & (mask);                                      \
+}
+
+#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
+                  q5_in, q6_in, q7_in, flat_in, flat2_out)          \
+{                                                                   \
+    v16u8 tmp, zero_in = { 0 };                                     \
+    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;       \
+    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;       \
+                                                                    \
+    tmp = __msa_ori_b(zero_in, 1);                                  \
+    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                     \
+    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                     \
+    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                     \
+    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                     \
+    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                     \
+    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                     \
+    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                     \
+    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                     \
+                                                                    \
+    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);          \
+    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);            \
+    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);              \
+    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);          \
+    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);              \
+    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);          \
+    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);              \
+                                                                    \
+    flat2_out = (tmp < (v16u8) flat2_out);                          \
+    flat2_out = __msa_xori_b(flat2_out, 0xff);                      \
+    flat2_out = flat2_out & flat_in;                                \
+}
+
+#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                \
+                    q0_in, q1_in, q2_in, q3_in,                \
+                    p2_filt8_out, p1_filt8_out, p0_filt8_out,  \
+                    q0_filt8_out, q1_filt8_out, q2_filt8_out)  \
+{                                                              \
+    v8u16 tmp0, tmp1, tmp2;                                    \
+                                                               \
+    tmp2 = p2_in + p1_in + p0_in;                              \
+    tmp0 = p3_in << 1;                                         \
+                                                               \
+    tmp0 = tmp0 + tmp2 + q0_in;                                \
+    tmp1 = tmp0 + p3_in + p2_in;                               \
+    p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+                                                               \
+    tmp1 = tmp0 + p1_in + q1_in;                               \
+    p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+                                                               \
+    tmp1 = q2_in + q1_in + q0_in;                              \
+    tmp2 = tmp2 + tmp1;                                        \
+    tmp0 = tmp2 + (p0_in);                                     \
+    tmp0 = tmp0 + (p3_in);                                     \
+    p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3);     \
+                                                               \
+    tmp0 = q2_in + q3_in;                                      \
+    tmp0 = p0_in + tmp1 + tmp0;                                \
+    tmp1 = q3_in + q3_in;                                      \
+    tmp1 = tmp1 + tmp0;                                        \
+    q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+                                                               \
+    tmp0 = tmp2 + q3_in;                                       \
+    tmp1 = tmp0 + q0_in;                                       \
+    q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+                                                               \
+    tmp1 = tmp0 - p2_in;                                       \
+    tmp0 = q1_in + q3_in;                                      \
+    tmp1 = tmp0 + tmp1;                                        \
+    q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+}
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
+                     q0_in, q1_in, q2_in, q3_in,                   \
+                     limit_in, b_limit_in, thresh_in,              \
+                     hev_out, mask_out, flat_out)                  \
+{                                                                  \
+    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+                                                                   \
+    /* absolute subtraction of pixel values */                     \
+    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
+    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
+    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
+    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
+    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
+    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
+    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
+    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
+                                                                   \
+    /* calculation of hev */                                       \
+    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+    hev_out = thresh_in < (v16u8) flat_out;                        \
+                                                                   \
+    /* calculation of mask */                                      \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+    p1_asub_q1_m >>= 1;                                            \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+                                                                   \
+    mask_out = b_limit_in < p0_asub_q0_m;                          \
+    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+                                                                   \
+    mask_out = limit_in < (v16u8) mask_out;                        \
+    mask_out = __msa_xori_b(mask_out, 0xff);                       \
+}
+
+void ff_loop_filter_v_4_8_msa(uint8_t *src, int32_t pitch,
+                              int32_t b_limit_ptr,
+                              int32_t limit_ptr,
+                              int32_t thresh_ptr)
+{
+    uint64_t p1_d, p0_d, q0_d, q1_d;
+    v16u8 mask, hev, flat, thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+
+    /* load vector elements */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+    p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+    q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+    q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+}
+
+
+void ff_loop_filter_v_44_16_msa(uint8_t *src, int32_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+    /* load vector elements */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
+    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
+
+    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
+    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
+
+    limit0 = (v16u8) __msa_fill_b(limit_ptr);
+    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+                 hev, mask, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+
+    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+
+void ff_loop_filter_v_8_8_msa(uint8_t *src, int32_t pitch,
+                              int32_t b_limit_ptr,
+                              int32_t limit_ptr,
+                              int32_t thresh_ptr)
+{
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    v16u8 mask, hev, flat, thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v8i16 p2_filter8, p1_filter8, p0_filter8;
+    v8i16 q0_filter8, q1_filter8, q2_filter8;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+    v16i8 zero = { 0 };
+
+    /* load vector elements */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+        SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+                   q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+                    p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
+                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
+                    q0_filter8);
+        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
+
+        p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
+        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+        q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
+
+        src -= 3 * pitch;
+
+        SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
+        src += (4 * pitch);
+        SD(q1_d, src);
+        src += pitch;
+        SD(q2_d, src);
+    }
+}
+
+void ff_loop_filter_v_88_16_msa(uint8_t *src, int32_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+
+    /* load vector elements */
+    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+                   q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
+                    q1_filt8_r, q2_filt8_r);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        src -= 3 * pitch;
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+        src += (4 * pitch);
+        ST_UB2(q1_out, q2_out, src, pitch);
+        src += (2 * pitch);
+    }
+}
+
+void ff_loop_filter_v_84_16_msa(uint8_t *src, int32_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v16u8 zero = { 0 };
+
+    /* load vector elements */
+    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+                   q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
+                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
+                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
+                    q1_filt8_r, q2_filt8_r);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        src -= 3 * pitch;
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+        src += (4 * pitch);
+        ST_UB2(q1_out, q2_out, src, pitch);
+        src += (2 * pitch);
+    }
+}
+
+void ff_loop_filter_v_48_16_msa(uint8_t *src, int32_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+
+    /* load vector elements */
+    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+    } else {
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
+                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
+                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
+                    q1_filt8_l, q2_filt8_l);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
+
+        src -= 3 * pitch;
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+        src += (4 * pitch);
+        ST_UB2(q1_out, q2_out, src, pitch);
+        src += (2 * pitch);
+    }
+}
+
+static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
+                                        uint8_t *filter48,
+                                        int32_t b_limit_ptr,
+                                        int32_t limit_ptr,
+                                        int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+
+    /* load vector elements */
+    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+
+        return 1;
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+                   q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                    q2_filt8_r);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+        filter48 += (4 * 16);
+        ST_UB2(q1_out, q2_out, filter48, 16);
+        filter48 += (2 * 16);
+        ST_UB(flat, filter48);
+
+        return 0;
+    }
+}
+
+static void vp9_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48)
+{
+    v16u8 flat, flat2, filter8;
+    v16i8 zero = { 0 };
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
+    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
+    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+    v8i16 l_out, r_out;
+
+    flat = LD_UB(filter48 + 96);
+
+    LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+    LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+    /* if flat2 is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat2)) {
+        LD_UB4(filter48, 16, p2, p1, p0, q0);
+        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+        src -= 3 * pitch;
+        ST_UB4(p2, p1, p0, q0, src, pitch);
+        src += (4 * pitch);
+        ST_UB2(q1, q2, src, pitch);
+    } else {
+        src -= 7 * pitch;
+
+        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+
+        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
+
+        tmp0_r = p7_r_in << 3;
+        tmp0_r -= p7_r_in;
+        tmp0_r += p6_r_in;
+        tmp0_r += q0_r_in;
+        tmp1_r = p6_r_in + p5_r_in;
+        tmp1_r += p4_r_in;
+        tmp1_r += p3_r_in;
+        tmp1_r += p2_r_in;
+        tmp1_r += p1_r_in;
+        tmp1_r += p0_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+                   p5_l_in, p4_l_in);
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+                   p1_l_in, p0_l_in);
+        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
+
+        tmp0_l = p7_l_in << 3;
+        tmp0_l -= p7_l_in;
+        tmp0_l += p6_l_in;
+        tmp0_l += q0_l_in;
+        tmp1_l = p6_l_in + p5_l_in;
+        tmp1_l += p4_l_in;
+        tmp1_l += p3_l_in;
+        tmp1_l += p2_l_in;
+        tmp1_l += p1_l_in;
+        tmp1_l += p0_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
+        ST_UB(p6, src);
+        src += pitch;
+
+        /* p5 */
+        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
+        tmp0_r = p5_r_in - p6_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
+        tmp0_l = p5_l_in - p6_l_in;
+        tmp0_l += q1_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
+        ST_UB(p5, src);
+        src += pitch;
+
+        /* p4 */
+        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
+        tmp0_r = p4_r_in - p5_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
+        tmp0_l = p4_l_in - p5_l_in;
+        tmp0_l += q2_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
+        ST_UB(p4, src);
+        src += pitch;
+
+        /* p3 */
+        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
+        tmp0_r = p3_r_in - p4_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
+        tmp0_l = p3_l_in - p4_l_in;
+        tmp0_l += q3_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
+        ST_UB(p3, src);
+        src += pitch;
+
+        /* p2 */
+        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
+        filter8 = LD_UB(filter48);
+        tmp0_r = p2_r_in - p3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
+        tmp0_l = p2_l_in - p3_l_in;
+        tmp0_l += q4_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* p1 */
+        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
+        filter8 = LD_UB(filter48 + 16);
+        tmp0_r = p1_r_in - p2_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
+        tmp0_l = p1_l_in - p2_l_in;
+        tmp0_l += q5_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* p0 */
+        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
+        filter8 = LD_UB(filter48 + 32);
+        tmp0_r = p0_r_in - p1_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
+        tmp0_l = p0_l_in - p1_l_in;
+        tmp0_l += q6_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* q0 */
+        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
+        filter8 = LD_UB(filter48 + 48);
+        tmp0_r = q7_r_in - p0_r_in;
+        tmp0_r += q0_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
+        tmp0_l = q7_l_in - p0_l_in;
+        tmp0_l += q0_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* q1 */
+        filter8 = LD_UB(filter48 + 64);
+        tmp0_r = q7_r_in - q0_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p6_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q0_l_in;
+        tmp0_l += q1_l_in;
+        tmp0_l -= p6_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* q2 */
+        filter8 = LD_UB(filter48 + 80);
+        tmp0_r = q7_r_in - q1_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p5_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q1_l_in;
+        tmp0_l += q2_l_in;
+        tmp0_l -= p5_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* q3 */
+        tmp0_r = q7_r_in - q2_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p4_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q2_l_in;
+        tmp0_l += q3_l_in;
+        tmp0_l -= p4_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
+        ST_UB(q3, src);
+        src += pitch;
+
+        /* q4 */
+        tmp0_r = q7_r_in - q3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p3_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q3_l_in;
+        tmp0_l += q4_l_in;
+        tmp0_l -= p3_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
+        ST_UB(q4, src);
+        src += pitch;
+
+        /* q5 */
+        tmp0_r = q7_r_in - q4_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p2_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q4_l_in;
+        tmp0_l += q5_l_in;
+        tmp0_l -= p2_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
+        ST_UB(q5, src);
+        src += pitch;
+
+        /* q6 */
+        tmp0_r = q7_r_in - q5_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p1_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q5_l_in;
+        tmp0_l += q6_l_in;
+        tmp0_l -= p1_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
+        ST_UB(q6, src);
+    }
+}
+
+void ff_loop_filter_v_16_16_msa(uint8_t *src, int32_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+    uint8_t early_exit = 0;
+
+    early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
+                                          b_limit_ptr, limit_ptr, thresh_ptr);
+
+    if (0 == early_exit) {
+        vp9_hz_lpf_t16_16w(src, pitch, filter48);
+    }
+}
+
+void ff_loop_filter_v_16_8_msa(uint8_t *src, int32_t pitch,
+                               int32_t b_limit_ptr,
+                               int32_t limit_ptr,
+                               int32_t thresh_ptr)
+{
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    uint64_t dword0, dword1;
+    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 p0_filter16, p1_filter16;
+    v8i16 p2_filter8, p1_filter8, p0_filter8;
+    v8i16 q0_filter8, q1_filter8, q2_filter8;
+    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+    v16i8 zero = { 0 };
+    v8u16 tmp0, tmp1, tmp2;
+
+    /* load vector elements */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+        SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
+    } else {
+        /* convert 8 bit input data into 16 bit */
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
+                   q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
+                   q1_r, q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
+                    p2_filter8, p1_filter8, p0_filter8, q0_filter8,
+                    q1_filter8, q2_filter8);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
+                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
+                    q0_filter8);
+        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
+                    q2_filter8);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
+
+        /* load 16 vector elements */
+        LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
+        LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
+
+        VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+        /* if flat2 is zero for all pixels, then no need to calculate other filter */
+        if (__msa_test_bz_v(flat2)) {
+            p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
+            p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+            p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+            q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+            q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+            q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
+
+            SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
+            SD(q1_d, src + pitch);
+            SD(q2_d, src + 2 * pitch);
+        } else {
+            /* LSB(right) 8 pixel operation */
+            ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
+                       zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
+                       q4_r, q5_r, q6_r, q7_r);
+
+            tmp0 = p7_r << 3;
+            tmp0 -= p7_r;
+            tmp0 += p6_r;
+            tmp0 += q0_r;
+
+            src -= 7 * pitch;
+
+            /* calculation of p6 and p5 */
+            tmp1 = p6_r + p5_r + p4_r + p3_r;
+            tmp1 += (p2_r + p1_r + p0_r);
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp0 = p5_r - p6_r + q1_r - p7_r;
+            tmp1 += tmp0;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of p4 and p3 */
+            tmp0 = p4_r - p5_r + q2_r - p7_r;
+            tmp2 = p3_r - p4_r + q3_r - p7_r;
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of p2 and p1 */
+            tmp0 = p2_r - p3_r + q4_r - p7_r;
+            tmp2 = p1_r - p2_r + q5_r - p7_r;
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of p0 and q0 */
+            tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
+            tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of q1 and q2 */
+            tmp0 = q7_r - q0_r + q1_r - p6_r;
+            tmp2 = q7_r - q1_r + q2_r - p5_r;
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of q3 and q4 */
+            tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
+            tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of q5 and q6 */
+            tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
+            tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+        }
+    }
+}
+
+void ff_loop_filter_h_4_8_msa(uint8_t *src, int32_t pitch,
+                              int32_t b_limit_ptr,
+                              int32_t limit_ptr,
+                              int32_t thresh_ptr)
+{
+    v16u8 mask, hev, flat, limit, thresh, b_limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v8i16 vec0, vec1, vec2, vec3;
+
+    LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
+                       p3, p2, p1, p0, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+    ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+    src -= 2;
+    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+    src += 4 * pitch;
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+}
+
+void ff_loop_filter_h_44_16_msa(uint8_t *src, int32_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 mask, hev, flat;
+    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src - 4 + (8 * pitch), pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
+    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
+
+    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
+    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
+
+    limit0 = (v16u8) __msa_fill_b(limit_ptr);
+    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+                 hev, mask, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+
+    src -= 2;
+
+    ST4x8_UB(tmp2, tmp3, src, pitch);
+    src += (8 * pitch);
+    ST4x8_UB(tmp4, tmp5, src, pitch);
+}
+
+void ff_loop_filter_h_8_8_msa(uint8_t *src, int32_t pitch,
+                              int32_t b_limit_ptr,
+                              int32_t limit_ptr,
+                              int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p1_out, p0_out, q0_out, q1_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v16u8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4;
+
+    /* load vector elements */
+    LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
+                       p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        /* Store 4 pixels p1-_q1 */
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+        src -= 2;
+        ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+        src += 4 * pitch;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
+                    p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
+                    q2_filt8_r);
+
+        /* store pixel values */
+        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        /* Store 6 pixels p2-_q2 */
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
+
+        src -= 3;
+        ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec4, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec4, 4, src + 4, pitch);
+    }
+}
+
+void ff_loop_filter_h_88_16_msa(uint8_t *src, int32_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p1_out, p0_out, q0_out, q1_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    temp_src = src - 4;
+
+    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+    /* transpose 16x8 matrix into 8x16 */
+    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
+                        q3, q2, q1, q0, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+        src -= 2;
+        ST4x8_UB(vec2, vec3, src, pitch);
+        src += 8 * pitch;
+        ST4x8_UB(vec4, vec5, src, pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+
+        /* filter8 */
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                    q2_filt8_r);
+
+        /* store pixel values */
+        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+        ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+        src -= 3;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 4, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 4, src + 4, pitch);
+    }
+}
+
+void ff_loop_filter_h_84_16_msa(uint8_t *src, int32_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p1_out, p0_out, q0_out, q1_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v16u8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    temp_src = src - 4;
+
+    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+    /* transpose 16x8 matrix into 8x16 */
+    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
+                        q3, q2, q1, q0, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+        src -= 2;
+        ST4x8_UB(vec2, vec3, src, pitch);
+        src += 8 * pitch;
+        ST4x8_UB(vec4, vec5, src, pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
+                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
+                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
+                    q1_filt8_r, q2_filt8_r);
+
+        /* store pixel values */
+        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+        ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+        src -= 3;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 4, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 4, src + 4, pitch);
+    }
+}
+
+void ff_loop_filter_h_48_16_msa(uint8_t *src, int32_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p1_out, p0_out, q0_out, q1_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    temp_src = src - 4;
+
+    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+    /* transpose 16x8 matrix into 8x16 */
+    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
+                        q3, q2, q1, q0, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+        src -= 2;
+        ST4x8_UB(vec2, vec3, src, pitch);
+        src += 8 * pitch;
+        ST4x8_UB(vec4, vec5, src, pitch);
+    } else {
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
+                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
+                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
+                    q1_filt8_l, q2_filt8_l);
+
+        /* store pixel values */
+        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
+        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
+        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
+        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
+        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
+        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+        ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+        src -= 3;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 4, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 4, src + 4, pitch);
+    }
+}
+
+static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+                                       uint8_t *output, int32_t out_pitch)
+{
+    v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
+    v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+    LD_UB8(input, in_pitch,
+           p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
+    /* 8x8 transpose */
+    TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
+                       p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
+    /* 8x8 transpose */
+    ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
+    ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
+    ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
+    ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
+    SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
+
+    ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+    output += (8 * out_pitch);
+    ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+                                       uint8_t *output, int32_t out_pitch)
+{
+    v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+    LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+    LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+    TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
+                        q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
+    ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
+}
+
+static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
+                                uint8_t *output, int32_t out_pitch)
+{
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+    v4i32 tmp2, tmp3;
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+    LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    input += (8 * in_pitch);
+    LD_UB8(input, in_pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p7, p6, p5, p4, p3, p2, p1, p0);
+
+    /* transpose 16x8 matrix into 8x16 */
+    /* total 8 intermediate register and 32 instructions */
+    q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
+    q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
+    q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
+    q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
+    q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
+    q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
+    q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
+    q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
+
+    ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
+    tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
+    tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
+
+    ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
+    tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
+    tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);
+
+    ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
+    q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
+    q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
+
+    tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
+    tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
+    q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
+    q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
+
+    ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
+    q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
+    q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
+
+    tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
+    tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
+    q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
+    q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
+
+    ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+    output += (8 * out_pitch);
+    ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
+                                       uint8_t *src_org, int32_t pitch_org,
+                                       int32_t b_limit_ptr,
+                                       int32_t limit_ptr,
+                                       int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v16i8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3;
+
+    /* load vector elements */
+    LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
+        return 1;
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        /* convert 16 bit output data into 8 bit */
+        p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
+        p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
+        p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
+        q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
+        q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
+        q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+        filter48 += (4 * 16);
+        ST_UB2(q1_out, q2_out, filter48, 16);
+        filter48 += (2 * 16);
+        ST_UB(flat, filter48);
+
+        return 0;
+    }
+}
+
+static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+                                 uint8_t *filter48)
+{
+    v16i8 zero = { 0 };
+    v16u8 filter8, flat, flat2;
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
+    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
+    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+    v8u16 tmp0_r, tmp1_r;
+    v8i16 r_out;
+
+    flat = LD_UB(filter48 + 6 * 16);
+
+    LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+    LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+
+    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+    /* if flat2 is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat2)) {
+        v8i16 vec0, vec1, vec2, vec3, vec4;
+
+        LD_UB4(filter48, 16, p2, p1, p0, q0);
+        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
+
+        src_org -= 3;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+        src_org += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+
+        return 1;
+    } else {
+        src -= 7 * 16;
+
+        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
+
+        tmp0_r = p7_r_in << 3;
+        tmp0_r -= p7_r_in;
+        tmp0_r += p6_r_in;
+        tmp0_r += q0_r_in;
+        tmp1_r = p6_r_in + p5_r_in;
+        tmp1_r += p4_r_in;
+        tmp1_r += p3_r_in;
+        tmp1_r += p2_r_in;
+        tmp1_r += p1_r_in;
+        tmp1_r += p0_r_in;
+        tmp1_r += tmp0_r;
+
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
+        ST8x1_UB(p6, src);
+        src += 16;
+
+        /* p5 */
+        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
+        tmp0_r = p5_r_in - p6_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
+        ST8x1_UB(p5, src);
+        src += 16;
+
+        /* p4 */
+        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
+        tmp0_r = p4_r_in - p5_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
+        ST8x1_UB(p4, src);
+        src += 16;
+
+        /* p3 */
+        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
+        tmp0_r = p3_r_in - p4_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
+        ST8x1_UB(p3, src);
+        src += 16;
+
+        /* p2 */
+        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
+        filter8 = LD_UB(filter48);
+        tmp0_r = p2_r_in - p3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* p1 */
+        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
+        filter8 = LD_UB(filter48 + 16);
+        tmp0_r = p1_r_in - p2_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* p0 */
+        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
+        filter8 = LD_UB(filter48 + 32);
+        tmp0_r = p0_r_in - p1_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* q0 */
+        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
+        filter8 = LD_UB(filter48 + 48);
+        tmp0_r = q7_r_in - p0_r_in;
+        tmp0_r += q0_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* q1 */
+        filter8 = LD_UB(filter48 + 64);
+        tmp0_r = q7_r_in - q0_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p6_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* q2 */
+        filter8 = LD_UB(filter48 + 80);
+        tmp0_r = q7_r_in - q1_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p5_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* q3 */
+        tmp0_r = q7_r_in - q2_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p4_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
+        ST8x1_UB(q3, src);
+        src += 16;
+
+        /* q4 */
+        tmp0_r = q7_r_in - q3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p3_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
+        ST8x1_UB(q4, src);
+        src += 16;
+
+        /* q5 */
+        tmp0_r = q7_r_in - q4_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p2_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
+        ST8x1_UB(q5, src);
+        src += 16;
+
+        /* q6 */
+        tmp0_r = q7_r_in - q5_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p1_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
+        ST8x1_UB(q6, src);
+
+        return 0;
+    }
+}
+
+void ff_loop_filter_h_16_8_msa(uint8_t *src, int32_t pitch,
+                               int32_t b_limit_ptr,
+                               int32_t limit_ptr,
+                               int32_t thresh_ptr)
+{
+    uint8_t early_exit = 0;
+    uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
+    uint8_t *filter48 = &transposed_input[16 * 16];
+
+    vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
+
+    early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
+                                         &filter48[0], src, pitch,
+                                         b_limit_ptr, limit_ptr, thresh_ptr);
+
+    if (0 == early_exit) {
+        early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
+                                       &filter48[0]);
+
+        if (0 == early_exit) {
+            vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
+        }
+    }
+}
+
+static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
+                                        uint8_t *src_org, int32_t pitch,
+                                        int32_t b_limit_ptr,
+                                        int32_t limit_ptr,
+                                        int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16i8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+
+    /* load vector elements */
+    LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+        src_org -= 2;
+        ST4x8_UB(vec2, vec3, src_org, pitch);
+        src_org += 8 * pitch;
+        ST4x8_UB(vec4, vec5, src_org, pitch);
+
+        return 1;
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                    q2_filt8_r);
+
+        /* store pixel values */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+        filter48 += (4 * 16);
+        ST_UB2(q1_out, q2_out, filter48, 16);
+        filter48 += (2 * 16);
+        ST_UB(flat, filter48);
+
+        return 0;
+    }
+}
+
+static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+                                  uint8_t *filter48)
+{
+    v16u8 flat, flat2, filter8;
+    v16i8 zero = { 0 };
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
+    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
+    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+    v8i16 l_out, r_out;
+
+    flat = LD_UB(filter48 + 6 * 16);
+
+    LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+    LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+
+    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+    /* if flat2 is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat2)) {
+        v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+        LD_UB4(filter48, 16, p2, p1, p0, q0);
+        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+        ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+        src_org -= 3;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+        src_org += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+        src_org += (4 * pitch);
+        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec5, 0, (src_org + 4), pitch);
+        src_org += (4 * pitch);
+        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec5, 4, (src_org + 4), pitch);
+
+        return 1;
+    } else {
+        src -= 7 * 16;
+
+        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
+
+        tmp0_r = p7_r_in << 3;
+        tmp0_r -= p7_r_in;
+        tmp0_r += p6_r_in;
+        tmp0_r += q0_r_in;
+        tmp1_r = p6_r_in + p5_r_in;
+        tmp1_r += p4_r_in;
+        tmp1_r += p3_r_in;
+        tmp1_r += p2_r_in;
+        tmp1_r += p1_r_in;
+        tmp1_r += p0_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+                   p5_l_in, p4_l_in);
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+                   p1_l_in, p0_l_in);
+        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
+
+        tmp0_l = p7_l_in << 3;
+        tmp0_l -= p7_l_in;
+        tmp0_l += p6_l_in;
+        tmp0_l += q0_l_in;
+        tmp1_l = p6_l_in + p5_l_in;
+        tmp1_l += p4_l_in;
+        tmp1_l += p3_l_in;
+        tmp1_l += p2_l_in;
+        tmp1_l += p1_l_in;
+        tmp1_l += p0_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
+        ST_UB(p6, src);
+        src += 16;
+
+        /* p5 */
+        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
+        tmp0_r = p5_r_in - p6_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
+        tmp0_l = p5_l_in - p6_l_in;
+        tmp0_l += q1_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
+        ST_UB(p5, src);
+        src += 16;
+
+        /* p4 */
+        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
+        tmp0_r = p4_r_in - p5_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
+        tmp0_l = p4_l_in - p5_l_in;
+        tmp0_l += q2_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
+        ST_UB(p4, src);
+        src += 16;
+
+        /* p3 */
+        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
+        tmp0_r = p3_r_in - p4_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
+        tmp0_l = p3_l_in - p4_l_in;
+        tmp0_l += q3_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
+        ST_UB(p3, src);
+        src += 16;
+
+        /* p2 */
+        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
+        filter8 = LD_UB(filter48);
+        tmp0_r = p2_r_in - p3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
+        tmp0_l = p2_l_in - p3_l_in;
+        tmp0_l += q4_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* p1 */
+        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
+        filter8 = LD_UB(filter48 + 16);
+        tmp0_r = p1_r_in - p2_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
+        tmp0_l = p1_l_in - p2_l_in;
+        tmp0_l += q5_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) (tmp1_l), 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* p0 */
+        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
+        filter8 = LD_UB(filter48 + 32);
+        tmp0_r = p0_r_in - p1_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
+        tmp0_l = p0_l_in - p1_l_in;
+        tmp0_l += q6_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* q0 */
+        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
+        filter8 = LD_UB(filter48 + 48);
+        tmp0_r = q7_r_in - p0_r_in;
+        tmp0_r += q0_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
+        tmp0_l = q7_l_in - p0_l_in;
+        tmp0_l += q0_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* q1 */
+        filter8 = LD_UB(filter48 + 64);
+        tmp0_r = q7_r_in - q0_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p6_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q0_l_in;
+        tmp0_l += q1_l_in;
+        tmp0_l -= p6_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* q2 */
+        filter8 = LD_UB(filter48 + 80);
+        tmp0_r = q7_r_in - q1_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p5_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q1_l_in;
+        tmp0_l += q2_l_in;
+        tmp0_l -= p5_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* q3 */
+        tmp0_r = q7_r_in - q2_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p4_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q2_l_in;
+        tmp0_l += q3_l_in;
+        tmp0_l -= p4_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
+        ST_UB(q3, src);
+        src += 16;
+
+        /* q4 */
+        tmp0_r = q7_r_in - q3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p3_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q3_l_in;
+        tmp0_l += q4_l_in;
+        tmp0_l -= p3_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
+        ST_UB(q4, src);
+        src += 16;
+
+        /* q5 */
+        tmp0_r = q7_r_in - q4_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p2_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q4_l_in;
+        tmp0_l += q5_l_in;
+        tmp0_l -= p2_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
+        ST_UB(q5, src);
+        src += 16;
+
+        /* q6 */
+        tmp0_r = q7_r_in - q5_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p1_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q5_l_in;
+        tmp0_l += q6_l_in;
+        tmp0_l -= p1_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
+        ST_UB(q6, src);
+
+        return 0;
+    }
+}
+
+void ff_loop_filter_h_16_16_msa(uint8_t *src, int32_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    uint8_t early_exit = 0;
+    uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
+    uint8_t *filter48 = &transposed_input[16 * 16];
+
+    vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+
+    early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
+                                          &filter48[0], src, pitch,
+                                          b_limit_ptr, limit_ptr, thresh_ptr);
+
+    if (0 == early_exit) {
+        early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
+                                        &filter48[0]);
+
+        if (0 == early_exit) {
+            vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
+        }
+    }
+}
diff --git a/libavcodec/mips/vp9_mc_msa.c b/libavcodec/mips/vp9_mc_msa.c
new file mode 100644
index 0000000000..1671d973a4
--- /dev/null
+++ b/libavcodec/mips/vp9_mc_msa.c
@@ -0,0 +1,4510 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static const int8_t vp9_bilinear_filters_msa[15][2] = {
+    {120, 8},
+    {112, 16},
+    {104, 24},
+    {96, 32},
+    {88, 40},
+    {80, 48},
+    {72, 56},
+    {64, 64},
+    {56, 72},
+    {48, 80},
+    {40, 88},
+    {32, 96},
+    {24, 104},
+    {16, 112},
+    {8, 120}
+};
+
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \
+                            filt0, filt1, filt2, filt3)         \
+( {                                                             \
+    v8i16 tmp0, tmp1;                                           \
+                                                                \
+    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
+    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
+    tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);         \
+    tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);  \
+    tmp0 = __msa_adds_s_h(tmp0, tmp1);                          \
+                                                                \
+    tmp0;                                                       \
+} )
+
+#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,          \
+                        filt_h0, filt_h1, filt_h2, filt_h3)              \
+( {                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
+    v8i16 hz_out_m;                                                      \
+                                                                         \
+    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                   \
+               vec0_m, vec1_m, vec2_m, vec3_m);                          \
+    hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,       \
+                                   filt_h0, filt_h1, filt_h2, filt_h3);  \
+                                                                         \
+    hz_out_m = __msa_srari_h(hz_out_m, 7);                               \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
+                                                                         \
+    hz_out_m;                                                            \
+} )
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
+                                   mask0, mask1, mask2, mask3,              \
+                                   filt0, filt1, filt2, filt3,              \
+                                   out0, out1)                              \
+{                                                                           \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m,  vec4_m, vec5_m, vec6_m, vec7_m;  \
+    v8i16 res0_m, res1_m, res2_m, res3_m;                                   \
+                                                                            \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);       \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);              \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);       \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);       \
+    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);              \
+    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);       \
+    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);             \
+    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                \
+}
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, mask2, mask3,                \
+                                   filt0, filt1, filt2, filt3,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                res0_m, res1_m, res2_m, res3_m);                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,   \
+                res4_m, res5_m, res6_m, res7_m);                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \
+                 res0_m, res1_m, res2_m, res3_m);                             \
+    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \
+                 res4_m, res5_m, res6_m, res7_m);                             \
+    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,       \
+                res7_m, out0, out1, out2, out3);                              \
+}
+
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)  \
+{                                                     \
+    v16u8 tmp_m;                                      \
+                                                      \
+    tmp_m = PCKEV_XORI128_UB(in1, in0);               \
+    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);       \
+    ST_UB(tmp_m, (pdst));                             \
+}
+
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \
+{                                                             \
+    v16u8 tmp_m;                                              \
+                                                              \
+    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
+    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \
+    ST_UB(tmp_m, (pdst));                                     \
+}
+
+#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
+                           pdst, stride)                                \
+{                                                                       \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
+    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
+                                                                        \
+    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \
+    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
+    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \
+    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                           \
+}
+
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    SRARI_H2_SH(out0, out1, 7);
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x16_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter)
+{
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (4 == height) {
+        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (16 == height) {
+        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+}
+
+static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (4 == height) {
+        common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+}
+
+static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 8, src_stride, src1, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (2 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                                   mask2, mask3, filt0, filt1, filt2, filt3,
+                                   out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+
+        src0 = LD_SB(src + 32);
+        src2 = LD_SB(src + 48);
+        src3 = LD_SB(src + 56);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                                   mask2, mask3, filt0, filt1, filt2, filt3,
+                                   out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst + 32);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 48);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+    v16i8 src10998, filt0, filt1, filt2, filt3;
+    v16u8 out;
+    v8i16 filt, out10, out32;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+               src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+        XORI_B2_128_SB(src8776, src10998);
+        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                    filt1, filt2, filt3);
+        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                    filt1, filt2, filt3);
+        SRARI_H2_SH(out10, out32, 7);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                     filt1, filt2, filt3);
+        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                     filt1, filt2, filt3);
+        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                     filt1, filt2, filt3);
+        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                     filt1, filt2, filt3);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+               src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                   src87_l, src98_l, src109_l);
+        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                     filt1, filt2, filt3);
+        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                     filt1, filt2, filt3);
+        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                     filt1, filt2, filt3);
+        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                     filt1, filt2, filt3);
+        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                     filt1, filt2, filt3);
+        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                     filt1, filt2, filt3);
+        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                     filt1, filt2, filt3);
+        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                     filt1, filt2, filt3);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src54_l = src98_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src65_l = src109_l;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter, int32_t height,
+                                      int32_t width)
+{
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+                   src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+                   src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+            XORI_B4_128_SB(src7, src8, src9, src10);
+            src_tmp += (4 * src_stride);
+            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                       src87_r, src98_r, src109_r);
+            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                       src87_l, src98_l, src109_l);
+            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
+                                         filt0, filt1, filt2, filt3);
+            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
+                                         filt0, filt1, filt2, filt3);
+            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
+                                         filt0, filt1, filt2, filt3);
+            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
+                                         filt0, filt1, filt2, filt3);
+            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
+                                         filt0, filt1, filt2, filt3);
+            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
+                                         filt0, filt1, filt2, filt3);
+            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
+                                         filt0, filt1, filt2, filt3);
+            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
+                                         filt0, filt1, filt2, filt3);
+            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                        out3_r, tmp0, tmp1, tmp2, tmp3);
+            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            src10_r = src54_r;
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l;
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              32);
+}
+
+static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              64);
+}
+
+static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (3 + 3 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
+        out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
+        tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out5 = hz_out9;
+        out0 = out2;
+        out1 = out3;
+        out2 = out4;
+    }
+}
+
+static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+    v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= (3 + 3 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
+        tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
+        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0,
+                                   filt_vt1, filt_vt2, filt_vt3);
+
+        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
+        tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out6 = hz_out10;
+        out0 = out2;
+        out1 = out3;
+        out2 = out8;
+        out4 = out6;
+        out5 = out7;
+        out6 = out9;
+    }
+}
+
+static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                                 filter_vert, height);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 4; multiple8_cnt--;) {
+        common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                                 filter_vert, height);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 8; multiple8_cnt--;) {
+        common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                                 filter_vert, height);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 dst0, dst1, dst2, dst3, res2, res3;
+    v16u8 mask0, mask1, mask2, mask3;
+    v8i16 filt, res0, res1;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, res0, res1);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    SRARI_H2_SH(res0, res1, 7);
+    SAT_SH2_SH(res0, res1, 7);
+    PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+    XORI_B2_128_UB(res2, res3);
+    AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filt, vec0, vec1, vec2, vec3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, vec0, vec1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, vec2, vec3);
+    SRARI_H4_SH(vec0, vec1, vec2, vec3, 7);
+    SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                res0, res1, res2, res3);
+    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
+    XORI_B2_128_UB(res0, res2);
+    ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+               dst0, dst2, dst4, dst6);
+    ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
+    AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
+    ST4x8_UB(res0, res2, dst, dst_stride);
+}
+
+static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height)
+{
+    if (4 == height) {
+        common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    } else if (8 == height) {
+        common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    }
+}
+
+static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 8, src_stride, src1, src3);
+        src += (2 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+                   vec12);
+        VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+                   vec13);
+        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+                   vec14);
+        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+                   vec15);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                    vec1, vec2, vec3);
+        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+                    vec9, vec10, vec11);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
+                     vec1, vec2, vec3);
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
+                     vec8, vec9, vec10, vec11);
+        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
+                    out1, out2, out3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
+        dst += dst_stride;
+        PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+                   vec12);
+        VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+                   vec13);
+        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+                   vec14);
+        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+                   vec15);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                    vec1, vec2, vec3);
+        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+                    vec9, vec10, vec11);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
+                     vec1, vec2, vec3);
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
+                     vec8, vec9, vec10, vec11);
+        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
+                    out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        LD_UB2(dst, 16, dst1, dst2);
+        PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
+        PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3;
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        for (cnt = 0; cnt < 2; ++cnt) {
+            src0 = LD_SB(&src[cnt << 5]);
+            src2 = LD_SB(&src[16 + (cnt << 5)]);
+            src3 = LD_SB(&src[24 + (cnt << 5)]);
+            src1 = __msa_sldi_b(src2, src0, 8);
+
+            XORI_B4_128_SB(src0, src1, src2, src3);
+            VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+                       vec12);
+            VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+                       vec13);
+            VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6,
+                       vec10, vec14);
+            VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7,
+                       vec11, vec15);
+            DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                        vec0, vec1, vec2, vec3);
+            DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2,
+                        vec8, vec9, vec10, vec11);
+            DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
+                         vec0, vec1, vec2, vec3);
+            DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
+                         vec8, vec9, vec10, vec11);
+            ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
+                        out1, out2, out3);
+            SRARI_H4_SH(out0, out1, out2, out3, 7);
+            SAT_SH4_SH(out0, out1, out2, out3, 7);
+            LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
+            PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
+            PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
+        }
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 dst0, dst1, dst2, dst3, out;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+    v16i8 src10998, filt0, filt1, filt2, filt3;
+    v8i16 filt, out10, out32;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+               src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+        XORI_B2_128_SB(src8776, src10998);
+        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                    filt1, filt2, filt3);
+        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                    filt1, filt2, filt3);
+        SRARI_H2_SH(out10, out32, 7);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+
+        dst0 = (v16u8) __msa_ilvr_d((v2i64) dst2, (v2i64) dst0);
+        out = __msa_aver_u_b(out, dst0);
+
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                   filt1, filt2, filt3);
+        out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                   filt1, filt2, filt3);
+        out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                   filt1, filt2, filt3);
+        out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                   filt1, filt2, filt3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter,
+                                                   int32_t height,
+                                                   int32_t width)
+{
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+
+    src -= (3 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+                   src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+                   src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+            src_tmp += (4 * src_stride);
+
+            LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
+            XORI_B4_128_SB(src7, src8, src9, src10);
+            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                       src87_r, src98_r, src109_r);
+            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                       src87_l, src98_l, src109_l);
+            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
+                                         filt0, filt1, filt2, filt3);
+            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
+                                         filt0, filt1, filt2, filt3);
+            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
+                                         filt0, filt1, filt2, filt3);
+            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
+                                         filt0, filt1, filt2, filt3);
+            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
+                                         filt0, filt1, filt2, filt3);
+            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
+                                         filt0, filt1, filt2, filt3);
+            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
+                                         filt0, filt1, filt2, filt3);
+            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
+                                         filt0, filt1, filt2, filt3);
+            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                        out3_r, tmp0, tmp1, tmp2, tmp3);
+            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+            AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                        dst0, dst1, dst2, dst3);
+            ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            src10_r = src54_r;
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l;
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{
+    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                           filter, height, 16);
+}
+
+static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{
+    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                           filter, height, 32);
+}
+
+static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{
+    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                           filter, height, 64);
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const int8_t *filter_horiz,
+                                                  const int8_t *filter_vert,
+                                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (3 + 3 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
+        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
+        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
+        res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+        ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+
+        SRARI_H2_SH(res0, res1, 7);
+        SAT_SH2_SH(res0, res1, 7);
+        PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
+        XORI_B2_128_UB(tmp0, tmp1);
+        AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
+        ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out5 = hz_out9;
+        vec0 = vec2;
+        vec1 = vec3;
+        vec2 = vec4;
+    }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const int8_t *filter_horiz,
+                                                  const int8_t *filter_vert,
+                                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+    v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= (3 + 3 * src_stride);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
+        tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
+        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
+        tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out6 = hz_out10;
+        out0 = out2;
+        out1 = out3;
+        out2 = out8;
+        out4 = out6;
+        out5 = out7;
+        out6 = out9;
+    }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert,
+                                                   int32_t height)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                              filter_horiz, filter_vert,
+                                              height);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert,
+                                                   int32_t height)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 4; multiple8_cnt--;) {
+        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                              filter_horiz, filter_vert,
+                                              height);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert,
+                                                   int32_t height)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 8; multiple8_cnt--;) {
+        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                              filter_horiz, filter_vert,
+                                              height);
+
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, vec0, vec1, res0, res1;
+    v8u16 vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+    SRARI_H2_UH(vec2, vec3, 7);
+    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 vec0, vec1, vec2, vec3, filt0;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16i8 res0, res1, res2, res3;
+    v8u16 vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+    ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask, out0, out1;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    if (16 == height) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+    }
+}
+
+void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+}
+
+void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    loop_cnt = (height >> 2) - 1;
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                out0, out1, out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                out4, out5, out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, 7);
+    SRARI_H4_UH(out4, out5, out6, out7, 7);
+    PCKEV_ST_SB(out0, out1, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out2, out3, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out4, out5, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out6, out7, dst);
+    dst += dst_stride;
+
+    for (; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        PCKEV_ST_SB(out0, out1, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out2, out3, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out4, out5, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out6, out7, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+        src4 = LD_SB(src);
+        src6 = LD_SB(src + 16);
+        src7 = LD_SB(src + 24);
+        src5 = __msa_sldi_b(src6, src4, 8);
+        src += src_stride;
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        PCKEV_ST_SB(out0, out1, dst);
+        PCKEV_ST_SB(out2, out3, dst + 16);
+        dst += dst_stride;
+        PCKEV_ST_SB(out4, out5, dst);
+        PCKEV_ST_SB(out6, out7, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src4 = LD_SB(src + 32);
+        src6 = LD_SB(src + 48);
+        src7 = LD_SB(src + 56);
+        SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+        src += src_stride;
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        PCKEV_ST_SB(out0, out1, dst);
+        PCKEV_ST_SB(out2, out3, dst + 16);
+        PCKEV_ST_SB(out4, out5, dst + 32);
+        PCKEV_ST_SB(out6, out7, dst + 48);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v16u8 filt0;
+    v8i16 filt;
+    v8u16 tmp0, tmp1;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v16u8 filt0;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+
+    src8 = LD_SB(src);
+    src += src_stride;
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src87_r, src76_r, src2110, src4332, src6554, src8776);
+    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+    ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   vec0, vec1, vec2, vec3);
+        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   vec4, vec5, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src8;
+    }
+}
+
+void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else {
+        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+}
+
+void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+
+        src0 = src4;
+    }
+}
+
+void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src5 = LD_UB(src + 16);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+        src += (4 * src_stride);
+
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
+
+        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst + 16);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
+
+        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src4;
+        src5 = src9;
+    }
+}
+
+void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_UB4(src, 16, src0, src3, src6, src9);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src1, src2);
+        LD_UB2(src + 16, src_stride, src4, src5);
+        LD_UB2(src + 32, src_stride, src7, src8);
+        LD_UB2(src + 48, src_stride, src10, src11);
+        src += (2 * src_stride);
+
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+        SRARI_H2_UH(tmp4, tmp5, 7);
+        SAT_UH2_UH(tmp4, tmp5, 7);
+        PCKEV_ST_SB(tmp4, tmp5, dst + 16);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+        SRARI_H2_UH(tmp6, tmp7, 7);
+        SAT_UH2_UH(tmp6, tmp7, 7);
+        PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
+
+        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst + 32);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
+
+        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+        SRARI_H2_UH(tmp4, tmp5, 7);
+        SAT_UH2_UH(tmp4, tmp5, 7);
+        PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+        SRARI_H2_UH(tmp6, tmp7, 7);
+        SAT_UH2_UH(tmp6, tmp7, 7);
+        PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
+        dst += (2 * dst_stride);
+
+        src0 = src2;
+        src3 = src5;
+        src6 = src8;
+        src9 = src11;
+    }
+}
+
+static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_horiz, const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
+
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_horiz, const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+    v16i8 res0, res1, res2, res3;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
+    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
+    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
+    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+               hz_out3, hz_out5, 8);
+    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
+
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    } else if (8 == height) {
+        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_horiz, const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_horiz, const int8_t *filter_vert,
+                                   int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0;
+    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp3, tmp4, 7);
+        SAT_UH2_UH(tmp3, tmp4, 7);
+        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+        SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    } else {
+        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_horiz, filter_vert, height);
+    }
+}
+
+void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1;
+    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB2(src, 8, src0, src1);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_put_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 4; multiple8_cnt--;) {
+        ff_put_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
+    v8u16 vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+    SRARI_H2_UH(vec2, vec3, 7);
+    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+    AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8u16 vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+                vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
+                res2, res3);
+    ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
+               dst4, dst6);
+    AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
+                res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    } else if (8 == height) {
+        common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    }
+}
+
+static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                       dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const int8_t *filter,
+                                                  int32_t height)
+{
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                       dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                       dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    if (16 == height) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                    vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                           dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                    vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                           dst, dst_stride);
+    }
+}
+
+void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+
+    if (4 == height) {
+        common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    } else {
+        common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                              filter, height);
+    }
+}
+
+void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+                res2, res3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+                res6, res7);
+    SRARI_H4_UH(res0, res1, res2, res3, 7);
+    SRARI_H4_UH(res4, res5, res6, res7, 7);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+    dst += dst_stride;
+
+    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
+                    res1, res2, res3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
+                    res5, res6, res7);
+        SRARI_H4_UH(res0, res1, res2, res3, 7);
+        SRARI_H4_UH(res4, res5, res6, res7, 7);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+        src4 = LD_SB(src);
+        src6 = LD_SB(src + 16);
+        src7 = LD_SB(src + 24);
+        src5 = __msa_sldi_b(src6, src4, 8);
+        src += src_stride;
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    res0, res1, res2, res3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    res4, res5, res6, res7);
+        SRARI_H4_UH(res0, res1, res2, res3, 7);
+        SRARI_H4_UH(res4, res5, res6, res7, 7);
+        LD_UB2(dst, 16, dst0, dst1);
+        PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+        PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
+        dst += dst_stride;
+        LD_UB2(dst, 16, dst2, dst3);
+        PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+        PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
+        dst += dst_stride;
+    }
+}
+
+void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB4(src, 16, src0, src2, src4, src6);
+        src7 = LD_SB(src + 56);
+        SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+        src += src_stride;
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
+        PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
+        PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
+        PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
+        PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 filt;
+    v8u16 tmp0, tmp1;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    src4 = LD_SB(src);
+    src += src_stride;
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+    dst0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+
+    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    out = __msa_aver_u_b(out, dst0);
+
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16u8 src2110, src4332, src6554, src8776, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src);
+
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
+               dst2, dst3);
+    ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src87_r, src76_r, src2110, src4332, src6554, src8776);
+    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+    AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
+}
+
+void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    } else if (8 == height) {
+        common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    }
+}
+
+static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                       dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const int8_t *filter,
+                                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+        LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   vec0, vec1, vec2, vec3);
+        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   vec4, vec5, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3,
+                           dst4, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3,
+                           dst8, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src8;
+    }
+}
+
+void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+    } else {
+        common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                              filter, height);
+    }
+}
+
+void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+    /* rearranging filter_y */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+        dst += dst_stride;
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
+        dst += dst_stride;
+
+        src0 = src4;
+    }
+}
+
+void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+    /* rearranging filter_y */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_UB2(src, 16, src0, src5);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+        LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
+        src += (4 * src_stride);
+
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
+
+        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
+
+        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src4;
+        src5 = src9;
+    }
+}
+
+void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, src5;
+    v16u8 src6, src7, src8, src9, src10, src11, filt0;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8u16 filt;
+
+    /* rearranging filter_y */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_UB4(src, 16, src0, src3, src6, src9);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src1, src2);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        LD_UB2(src + 16, src_stride, src4, src5);
+        LD_UB2(dst + 16, dst_stride, dst2, dst3);
+        LD_UB2(src + 32, src_stride, src7, src8);
+        LD_UB2(dst + 32, dst_stride, dst4, dst5);
+        LD_UB2(src + 48, src_stride, src10, src11);
+        LD_UB2(dst + 48, dst_stride, dst6, dst7);
+        src += (2 * src_stride);
+
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+        SRARI_H2_UH(tmp4, tmp5, 7);
+        SAT_UH2_UH(tmp4, tmp5, 7);
+        PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+        SRARI_H2_UH(tmp6, tmp7, 7);
+        SAT_UH2_UH(tmp6, tmp7, 7);
+        PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
+
+        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
+
+        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+        SRARI_H2_UH(tmp4, tmp5, 7);
+        SAT_UH2_UH(tmp4, tmp5, 7);
+        PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+        SRARI_H2_UH(tmp6, tmp7, 7);
+        SAT_UH2_UH(tmp6, tmp7, 7);
+        PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
+        dst += (2 * dst_stride);
+
+        src0 = src2;
+        src3 = src5;
+        src6 = src8;
+        src9 = src11;
+    }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1;
+    v16u8 dst0, dst1, dst2, dst3, res0, res1;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
+    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
+    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
+    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+               hz_out3, hz_out5, 8);
+    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
+
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
+               dst4, dst6);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
+                res2, res3);
+    AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
+                res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                               filter_horiz, filter_vert);
+    } else if (8 == height) {
+        common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                               filter_horiz, filter_vert);
+    }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                       dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       const int8_t *filter_horiz,
+                                                       const int8_t *filter_vert,
+                                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3,
+                           dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+
+    if (4 == height) {
+        common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+                                               filter_horiz, filter_vert);
+    } else {
+        common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride,
+                                                   dst, dst_stride,
+                                                   filter_horiz, filter_vert,
+                                                   height);
+    }
+}
+
+void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB2(src, 8, src0, src1);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
+        dst += dst_stride;
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 2; multiple8_cnt--;) {
+        ff_avg_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    int32_t multiple8_cnt;
+
+    for (multiple8_cnt = 4; multiple8_cnt--;) {
+        ff_avg_bilin_16hv_msa(dst, dst_stride, src, src_stride, height, mx, my);
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        for (cnt = height >> 3; cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 2) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+
+            SD(out0, dst);
+            dst += dst_stride;
+            SD(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t cnt, loop_cnt;
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_UB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst_tmp, dst_stride);
+            dst_tmp += (8 * dst_stride);
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst, dst_stride);
+            dst += (8 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint32_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    if (0 == (height % 4)) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                        dst0, dst1, dst2, dst3);
+
+            out0 = __msa_copy_u_w((v4i32) dst0, 0);
+            out1 = __msa_copy_u_w((v4i32) dst1, 0);
+            out2 = __msa_copy_u_w((v4i32) dst2, 0);
+            out3 = __msa_copy_u_w((v4i32) dst3, 0);
+            SW4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == (height % 2)) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+
+            LD_UB2(dst, dst_stride, dst0, dst1);
+
+            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+            out0 = __msa_copy_u_w((v4i32) dst0, 0);
+            out1 = __msa_copy_u_w((v4i32) dst1, 0);
+            SW(out0, dst);
+            dst += dst_stride;
+            SW(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+
+    for (cnt = (height / 4); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+
+        out0 = __msa_copy_u_d((v2i64) dst0, 0);
+        out1 = __msa_copy_u_d((v2i64) dst1, 0);
+        out2 = __msa_copy_u_d((v2i64) dst2, 0);
+        out3 = __msa_copy_u_d((v2i64) dst3, 0);
+        SD4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    for (cnt = (height / 8); cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                    dst4, dst5, dst6, dst7);
+        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint8_t *dst_dup = dst;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+    for (cnt = (height / 8); cnt--;) {
+        LD_UB4(src, src_stride, src0, src2, src4, src6);
+        LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+        LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
+        LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
+        dst_dup += (4 * dst_stride);
+        LD_UB4(src, src_stride, src8, src10, src12, src14);
+        LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
+        src += (4 * src_stride);
+        LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
+        LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
+        dst_dup += (4 * dst_stride);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                    dst4, dst5, dst6, dst7);
+        AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
+                    dst8, dst9, dst10, dst11);
+        AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
+                    dst12, dst13, dst14, dst15);
+
+        ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
+        ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
+        dst += (4 * dst_stride);
+        ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
+        ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint8_t *dst_dup = dst;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+    for (cnt = (height / 4); cnt--;) {
+        LD_UB4(src, 16, src0, src1, src2, src3);
+        src += src_stride;
+        LD_UB4(src, 16, src4, src5, src6, src7);
+        src += src_stride;
+        LD_UB4(src, 16, src8, src9, src10, src11);
+        src += src_stride;
+        LD_UB4(src, 16, src12, src13, src14, src15);
+        src += src_stride;
+
+        LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
+        dst_dup += dst_stride;
+        LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
+        dst_dup += dst_stride;
+        LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
+        dst_dup += dst_stride;
+        LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
+        dst_dup += dst_stride;
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                    dst4, dst5, dst6, dst7);
+        AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
+                    dst8, dst9, dst10, dst11);
+        AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
+                    dst12, dst13, dst14, dst15);
+
+        ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+        dst += dst_stride;
+        ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
+        dst += dst_stride;
+        ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
+        dst += dst_stride;
+        ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static const int8_t vp9_subpel_filters_msa[3][15][8] = {
+    [FILTER_8TAP_REGULAR] = {
+         {0, 1, -5, 126, 8, -3, 1, 0},
+         {-1, 3, -10, 122, 18, -6, 2, 0},
+         {-1, 4, -13, 118, 27, -9, 3, -1},
+         {-1, 4, -16, 112, 37, -11, 4, -1},
+         {-1, 5, -18, 105, 48, -14, 4, -1},
+         {-1, 5, -19, 97, 58, -16, 5, -1},
+         {-1, 6, -19, 88, 68, -18, 5, -1},
+         {-1, 6, -19, 78, 78, -19, 6, -1},
+         {-1, 5, -18, 68, 88, -19, 6, -1},
+         {-1, 5, -16, 58, 97, -19, 5, -1},
+         {-1, 4, -14, 48, 105, -18, 5, -1},
+         {-1, 4, -11, 37, 112, -16, 4, -1},
+         {-1, 3, -9, 27, 118, -13, 4, -1},
+         {0, 2, -6, 18, 122, -10, 3, -1},
+         {0, 1, -3, 8, 126, -5, 1, 0},
+    }, [FILTER_8TAP_SHARP] = {
+        {-1, 3, -7, 127, 8, -3, 1, 0},
+        {-2, 5, -13, 125, 17, -6, 3, -1},
+        {-3, 7, -17, 121, 27, -10, 5, -2},
+        {-4, 9, -20, 115, 37, -13, 6, -2},
+        {-4, 10, -23, 108, 48, -16, 8, -3},
+        {-4, 10, -24, 100, 59, -19, 9, -3},
+        {-4, 11, -24, 90, 70, -21, 10, -4},
+        {-4, 11, -23, 80, 80, -23, 11, -4},
+        {-4, 10, -21, 70, 90, -24, 11, -4},
+        {-3, 9, -19, 59, 100, -24, 10, -4},
+        {-3, 8, -16, 48, 108, -23, 10, -4},
+        {-2, 6, -13, 37, 115, -20, 9, -4},
+        {-2, 5, -10, 27, 121, -17, 7, -3},
+        {-1, 3, -6, 17, 125, -13, 5, -2},
+        {0, 1, -3, 8, 127, -7, 3, -1},
+    }, [FILTER_8TAP_SMOOTH] = {
+        {-3, -1, 32, 64, 38, 1, -3, 0},
+        {-2, -2, 29, 63, 41, 2, -3, 0},
+        {-2, -2, 26, 63, 43, 4, -4, 0},
+        {-2, -3, 24, 62, 46, 5, -4, 0},
+        {-2, -3, 21, 60, 49, 7, -4, 0},
+        {-1, -4, 18, 59, 51, 9, -4, 0},
+        {-1, -4, 16, 57, 53, 12, -4, -1},
+        {-1, -4, 14, 55, 55, 14, -4, -1},
+        {-1, -4, 12, 53, 57, 16, -4, -1},
+        {0, -4, 9, 51, 59, 18, -4, -1},
+        {0, -4, 7, 49, 60, 21, -3, -2},
+        {0, -4, 5, 46, 62, 24, -3, -2},
+        {0, -4, 4, 43, 63, 26, -2, -2},
+        {0, -3, 2, 41, 63, 29, -2, -2},
+        {0, -3, 1, 38, 64, 32, -1, -3},
+    }
+};
+
+#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)                           \
+void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1];             \
+                                                                               \
+    common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h);     \
+}                                                                              \
+                                                                               \
+void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1];             \
+                                                                               \
+    common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h);     \
+}                                                                              \
+                                                                               \
+void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,    \
+                                         const uint8_t *src,                   \
+                                         ptrdiff_t srcstride,                  \
+                                         int h, int mx, int my)                \
+{                                                                              \
+    const uint8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1];           \
+    const uint8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1];           \
+                                                                               \
+    common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride, hfilter,   \
+                                    vfilter, h);                               \
+}                                                                              \
+                                                                               \
+void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1];             \
+                                                                               \
+    common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,               \
+                                            dststride, filter, h);             \
+}                                                                              \
+                                                                               \
+void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1];             \
+                                                                               \
+    common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride,    \
+                                            filter, h);                        \
+}                                                                              \
+                                                                               \
+void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,    \
+                                         const uint8_t *src,                   \
+                                         ptrdiff_t srcstride,                  \
+                                         int h, int mx, int my)                \
+{                                                                              \
+    const uint8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1];           \
+    const uint8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1];           \
+                                                                               \
+    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,          \
+                                                 dststride, hfilter,           \
+                                                 vfilter, h);                  \
+}
+
+#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)                           \
+void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                         const uint8_t *src, ptrdiff_t srcstride,  \
+                         int h, int mx, int my)                    \
+{                                                                  \
+                                                                   \
+    copy_width##SIZE##_msa(src, srcstride, dst, dststride, h);     \
+}                                                                  \
+                                                                   \
+void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                        const uint8_t *src, ptrdiff_t srcstride,   \
+                        int h, int mx, int my)                     \
+{                                                                  \
+                                                                   \
+    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);      \
+}
+
+#define VP9_AVG_MIPS_MSA_FUNC(SIZE)                               \
+void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                        const uint8_t *src, ptrdiff_t srcstride,  \
+                        int h, int mx, int my)                    \
+{                                                                 \
+                                                                  \
+    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);     \
+}
+
+VP9_8TAP_MIPS_MSA_FUNC(64, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(32, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(16, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(8, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(4, regular, FILTER_8TAP_REGULAR);
+
+VP9_8TAP_MIPS_MSA_FUNC(64, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(32, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(16, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(8, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(4, sharp, FILTER_8TAP_SHARP);
+
+VP9_8TAP_MIPS_MSA_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+
+VP9_COPY_AVG_MIPS_MSA_FUNC(64);
+VP9_COPY_AVG_MIPS_MSA_FUNC(32);
+VP9_COPY_AVG_MIPS_MSA_FUNC(16);
+VP9_COPY_AVG_MIPS_MSA_FUNC(8);
+VP9_AVG_MIPS_MSA_FUNC(4);
+
+#undef VP9_8TAP_MIPS_MSA_FUNC
+#undef VP9_COPY_AVG_MIPS_MSA_FUNC
+#undef VP9_AVG_MIPS_MSA_FUNC
diff --git a/libavcodec/mips/vp9dsp_init_mips.c b/libavcodec/mips/vp9dsp_init_mips.c
new file mode 100644
index 0000000000..c8a48908af
--- /dev/null
+++ b/libavcodec/mips/vp9dsp_init_mips.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/common.h"
+#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void vp9dsp_intrapred_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {
+#define init_intra_pred_msa(tx, sz)                             \
+    dsp->intra_pred[tx][VERT_PRED]    = ff_vert_##sz##_msa;     \
+    dsp->intra_pred[tx][HOR_PRED]     = ff_hor_##sz##_msa;      \
+    dsp->intra_pred[tx][DC_PRED]      = ff_dc_##sz##_msa;       \
+    dsp->intra_pred[tx][LEFT_DC_PRED] = ff_dc_left_##sz##_msa;  \
+    dsp->intra_pred[tx][TOP_DC_PRED]  = ff_dc_top_##sz##_msa;   \
+    dsp->intra_pred[tx][DC_128_PRED]  = ff_dc_128_##sz##_msa;   \
+    dsp->intra_pred[tx][DC_127_PRED]  = ff_dc_127_##sz##_msa;   \
+    dsp->intra_pred[tx][DC_129_PRED]  = ff_dc_129_##sz##_msa;   \
+    dsp->intra_pred[tx][TM_VP8_PRED]  = ff_tm_##sz##_msa;       \
+
+    init_intra_pred_msa(TX_16X16, 16x16);
+    init_intra_pred_msa(TX_32X32, 32x32);
+#undef init_intra_pred_msa
+
+#define init_intra_pred_msa(tx, sz)                             \
+    dsp->intra_pred[tx][DC_PRED]      = ff_dc_##sz##_msa;       \
+    dsp->intra_pred[tx][LEFT_DC_PRED] = ff_dc_left_##sz##_msa;  \
+    dsp->intra_pred[tx][TOP_DC_PRED]  = ff_dc_top_##sz##_msa;   \
+    dsp->intra_pred[tx][TM_VP8_PRED]  = ff_tm_##sz##_msa;       \
+
+    init_intra_pred_msa(TX_4X4, 4x4);
+    init_intra_pred_msa(TX_8X8, 8x8);
+#undef init_intra_pred_msa
+    }
+}
+
+static av_cold void vp9dsp_itxfm_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {
+#define init_itxfm(tx, sz)                                         \
+    dsp->itxfm_add[tx][DCT_DCT]   = ff_idct_idct_##sz##_add_msa;   \
+    dsp->itxfm_add[tx][DCT_ADST]  = ff_iadst_idct_##sz##_add_msa;  \
+    dsp->itxfm_add[tx][ADST_DCT]  = ff_idct_iadst_##sz##_add_msa;  \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_iadst_iadst_##sz##_add_msa  \
+
+#define init_idct(tx, nm)                        \
+    dsp->itxfm_add[tx][DCT_DCT]   =              \
+    dsp->itxfm_add[tx][ADST_DCT]  =              \
+    dsp->itxfm_add[tx][DCT_ADST]  =              \
+    dsp->itxfm_add[tx][ADST_ADST] = nm##_add_msa
+
+    init_itxfm(TX_4X4, 4x4);
+    init_itxfm(TX_8X8, 8x8);
+    init_itxfm(TX_16X16, 16x16);
+    init_idct(TX_32X32, ff_idct_idct_32x32);
+#undef init_itxfm
+#undef init_idct
+    }
+}
+
+static av_cold void vp9dsp_mc_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {
+#define init_fpel(idx1, idx2, sz, type)                                    \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = ff_##type##sz##_msa;  \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = ff_##type##sz##_msa;  \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = ff_##type##sz##_msa;  \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_##type##sz##_msa
+
+#define init_copy_avg(idx, sz)    \
+    init_fpel(idx, 0, sz, copy);  \
+    init_fpel(idx, 1, sz, avg)
+
+#define init_avg(idx, sz)  \
+    init_fpel(idx, 1, sz, avg)
+
+    init_copy_avg(0, 64);
+    init_copy_avg(1, 32);
+    init_copy_avg(2, 16);
+    init_copy_avg(3,  8);
+    init_avg(4,  4);
+
+#undef init_copy_avg
+#undef init_avg
+#undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type)  \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] =   \
+        ff_##type##_bilin_##sz##dir##_msa;                   \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_smooth_##sz##dir##_msa;             \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_regular_##sz##dir##_msa;            \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_sharp_##sz##dir##_msa;
+
+#define init_subpel2(idx, idxh, idxv, dir, type)      \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type);  \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type);  \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type);  \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type);  \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
+
+#define init_subpel3(idx, type)         \
+    init_subpel2(idx, 1, 1, hv, type);  \
+    init_subpel2(idx, 0, 1, v, type);   \
+    init_subpel2(idx, 1, 0, h, type)
+
+    init_subpel3(0, put);
+    init_subpel3(1, avg);
+
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
+    }
+}
+
+static av_cold void vp9dsp_loopfilter_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {
+        dsp->loop_filter_8[0][0] = ff_loop_filter_h_4_8_msa;
+        dsp->loop_filter_8[0][1] = ff_loop_filter_v_4_8_msa;
+        dsp->loop_filter_8[1][0] = ff_loop_filter_h_8_8_msa;
+        dsp->loop_filter_8[1][1] = ff_loop_filter_v_8_8_msa;
+        dsp->loop_filter_8[2][0] = ff_loop_filter_h_16_8_msa;
+        dsp->loop_filter_8[2][1] = ff_loop_filter_v_16_8_msa;
+
+        dsp->loop_filter_16[0] = ff_loop_filter_h_16_16_msa;
+        dsp->loop_filter_16[1] = ff_loop_filter_v_16_16_msa;
+
+        dsp->loop_filter_mix2[0][0][0] = ff_loop_filter_h_44_16_msa;
+        dsp->loop_filter_mix2[0][0][1] = ff_loop_filter_v_44_16_msa;
+        dsp->loop_filter_mix2[0][1][0] = ff_loop_filter_h_48_16_msa;
+        dsp->loop_filter_mix2[0][1][1] = ff_loop_filter_v_48_16_msa;
+        dsp->loop_filter_mix2[1][0][0] = ff_loop_filter_h_84_16_msa;
+        dsp->loop_filter_mix2[1][0][1] = ff_loop_filter_v_84_16_msa;
+        dsp->loop_filter_mix2[1][1][0] = ff_loop_filter_h_88_16_msa;
+        dsp->loop_filter_mix2[1][1][1] = ff_loop_filter_v_88_16_msa;
+    }
+}
+
+static av_cold void vp9dsp_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    vp9dsp_intrapred_init_msa(dsp, bpp);
+    vp9dsp_itxfm_init_msa(dsp, bpp);
+    vp9dsp_mc_init_msa(dsp, bpp);
+    vp9dsp_loopfilter_init_msa(dsp, bpp);
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp)
+{
+#if HAVE_MSA
+    vp9dsp_init_msa(dsp, bpp);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/vp9dsp_mips.h b/libavcodec/mips/vp9dsp_mips.h
new file mode 100644
index 0000000000..4d7303888d
--- /dev/null
+++ b/libavcodec/mips/vp9dsp_mips.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_VP9DSP_MIPS_H
+#define AVCODEC_MIPS_VP9DSP_MIPS_H
+
+#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)                         \
+void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,  \
+                                         const uint8_t *src,                 \
+                                         ptrdiff_t srcstride,                \
+                                         int h, int mx, int my);             \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,  \
+                                         const uint8_t *src,                 \
+                                         ptrdiff_t srcstride,                \
+                                         int h, int mx, int my);
+
+#define VP9_BILINEAR_MIPS_MSA_FUNC(SIZE)                                   \
+void ff_put_bilin_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                                const uint8_t *src, ptrdiff_t srcstride,   \
+                                int h, int mx, int my);                    \
+                                                                           \
+void ff_put_bilin_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                                const uint8_t *src, ptrdiff_t srcstride,   \
+                                int h, int mx, int my);                    \
+                                                                           \
+void ff_put_bilin_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                                 const uint8_t *src, ptrdiff_t srcstride,  \
+                                 int h, int mx, int my);                   \
+                                                                           \
+void ff_avg_bilin_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                                const uint8_t *src, ptrdiff_t srcstride,   \
+                                int h, int mx, int my);                    \
+                                                                           \
+void ff_avg_bilin_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                                const uint8_t *src, ptrdiff_t srcstride,   \
+                                int h, int mx, int my);                    \
+                                                                           \
+void ff_avg_bilin_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                                 const uint8_t *src, ptrdiff_t srcstride,  \
+                                 int h, int mx, int my);
+
+#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)                           \
+void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                         const uint8_t *src, ptrdiff_t srcstride,  \
+                         int h, int mx, int my);                   \
+                                                                   \
+void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                        const uint8_t *src, ptrdiff_t srcstride,   \
+                        int h, int mx, int my);
+
+VP9_8TAP_MIPS_MSA_FUNC(64, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(32, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(16, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(8, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(4, regular, FILTER_8TAP_REGULAR);
+
+VP9_8TAP_MIPS_MSA_FUNC(64, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(32, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(16, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(8, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(4, sharp, FILTER_8TAP_SHARP);
+
+VP9_8TAP_MIPS_MSA_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+
+VP9_BILINEAR_MIPS_MSA_FUNC(64);
+VP9_BILINEAR_MIPS_MSA_FUNC(32);
+VP9_BILINEAR_MIPS_MSA_FUNC(16);
+VP9_BILINEAR_MIPS_MSA_FUNC(8);
+VP9_BILINEAR_MIPS_MSA_FUNC(4);
+
+VP9_COPY_AVG_MIPS_MSA_FUNC(64);
+VP9_COPY_AVG_MIPS_MSA_FUNC(32);
+VP9_COPY_AVG_MIPS_MSA_FUNC(16);
+VP9_COPY_AVG_MIPS_MSA_FUNC(8);
+VP9_COPY_AVG_MIPS_MSA_FUNC(4);
+
+#undef VP9_8TAP_MIPS_MSA_FUNC
+#undef VP9_BILINEAR_MIPS_MSA_FUNC
+#undef VP9_COPY_AVG_MIPS_MSA_FUNC
+
+void ff_loop_filter_h_4_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_h_8_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_h_16_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                               int32_t i, int32_t h);
+void ff_loop_filter_v_4_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_v_8_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_v_16_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                               int32_t i, int32_t h);
+void ff_loop_filter_h_44_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_88_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_16_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_44_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_88_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_16_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_48_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_84_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_48_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_84_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_idct_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob);
+void ff_idct_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob);
+void ff_idct_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_idct_idct_32x32_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_iadst_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_iadst_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_iadst_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                  int16_t *block, int eob);
+void ff_iadst_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob);
+void ff_iadst_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob);
+void ff_iadst_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                 int16_t *block, int eob);
+void ff_idct_iadst_4x4_add_msa(uint8_t *pu8Dest, ptrdiff_t stride,
+                               int16_t *block, int eob);
+void ff_idct_iadst_8x8_add_msa(uint8_t *pu8Dest, ptrdiff_t stride,
+                               int16_t *block, int eob);
+void ff_idct_iadst_16x16_add_msa(uint8_t *pu8Dest, ptrdiff_t stride,
+                                 int16_t *block, int eob);
+void ff_iwht_iwht_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob);
+
+void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                      const uint8_t *top);
+void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                      const uint8_t *top);
+void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_dc_left_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                        const uint8_t *top);
+void ff_dc_left_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                        const uint8_t *top);
+void ff_dc_left_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *left, const uint8_t *top);
+void ff_dc_left_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *left, const uint8_t *top);
+void ff_dc_top_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_dc_top_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_dc_top_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_top_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_128_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_128_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_127_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_127_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_129_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_129_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+
+#endif  // #ifndef AVCODEC_MIPS_VP9DSP_MIPS_H
diff --git a/libavcodec/mips/xvid_idct_mmi.c b/libavcodec/mips/xvid_idct_mmi.c
new file mode 100644
index 0000000000..d3f9acb0e2
--- /dev/null
+++ b/libavcodec/mips/xvid_idct_mmi.c
@@ -0,0 +1,253 @@
+/*
+ * Loongson SIMD optimized xvid idct
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_mips.h"
+#include "xvididct_mips.h"
+
+#define BITS_INV_ACC    5                           // 4 or 5 for IEEE
+#define SHIFT_INV_ROW   (16 - BITS_INV_ACC)         //11
+#define SHIFT_INV_COL   (1 + BITS_INV_ACC)          //6
+#define RND_INV_ROW     (1024 * (6 - BITS_INV_ACC))
+#define RND_INV_COL     (16 * (BITS_INV_ACC - 3))
+#define RND_INV_CORR    (RND_INV_COL - 1)
+
+#define BITS_FRW_ACC    3                           // 2 or 3 for accuracy
+#define SHIFT_FRW_COL   BITS_FRW_ACC
+#define SHIFT_FRW_ROW   (BITS_FRW_ACC + 17)
+#define RND_FRW_ROW     (262144*(BITS_FRW_ACC - 1))
+
+DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = {
+     13036, 13036, 13036, 13036,    //  tg * (2<<16) + 0.5
+     27146, 27146, 27146, 27146,    //  tg * (2<<16) + 0.5
+    -21746,-21746,-21746,-21746,    //  tg * (2<<16) + 0.5
+     23170, 23170, 23170, 23170     // cos * (2<<15) + 0.5
+};
+
+DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = {
+    65536,65536,
+     3597, 3597,
+     2260, 2260,
+     1203, 1203,
+        0,    0,
+      120,  120,
+      512,  512,
+      512,  512
+};
+
+DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmi)[32*4] = {
+     16384, 21407, 16384,  8867,    // w05 w04 w01 w00
+     16384,  8867,-16384,-21407,    // w07 w06 w03 w02
+     16384, -8867, 16384,-21407,    // w13 w12 w09 w08
+    -16384, 21407, 16384, -8867,    // w15 w14 w11 w10
+     22725, 19266, 19266, -4520,    // w21 w20 w17 w16
+     12873,  4520,-22725,-12873,    // w23 w22 w19 w18
+     12873,-22725,  4520,-12873,    // w29 w28 w25 w24
+      4520, 19266, 19266,-22725,    // w31 w30 w27 w26
+
+     22725, 29692, 22725, 12299,    // w05 w04 w01 w00
+     22725, 12299,-22725,-29692,    // w07 w06 w03 w02
+     22725,-12299, 22725,-29692,    // w13 w12 w09 w08
+    -22725, 29692, 22725,-12299,    // w15 w14 w11 w10
+     31521, 26722, 26722, -6270,    // w21 w20 w17 w16
+     17855,  6270,-31521,-17855,    // w23 w22 w19 w18
+     17855,-31521,  6270,-17855,    // w29 w28 w25 w24
+      6270, 26722, 26722,-31521,    // w31 w30 w27 w26
+
+     21407, 27969, 21407, 11585,    // w05 w04 w01 w00
+     21407, 11585,-21407,-27969,    // w07 w06 w03 w02
+     21407,-11585, 21407,-27969,    // w13 w12 w09 w08
+    -21407, 27969, 21407,-11585,    // w15 w14 w11 w10
+     29692, 25172, 25172, -5906,    // w21 w20 w17 w16
+     16819,  5906,-29692,-16819,    // w23 w22 w19 w18
+     16819,-29692,  5906,-16819,    // w29 w28 w25 w24
+      5906, 25172, 25172,-29692,    // w31 w30 w27 w26
+
+     19266, 25172, 19266, 10426,    // w05 w04 w01 w00
+     19266, 10426,-19266,-25172,    // w07 w06 w03 w02
+     19266,-10426, 19266,-25172,    // w13 w12 w09 w08
+    -19266, 25172, 19266,-10426,    // w15 w14 w11 w10
+     26722, 22654, 22654, -5315,    // w21 w20 w17 w16
+     15137,  5315,-26722,-15137,    // w23 w22 w19 w18
+     15137,-26722,  5315,-15137,    // w29 w28 w25 w24
+      5315, 22654, 22654,-26722,    // w31 w30 w27 w26
+};
+
+#define DCT_8_INV_ROW_MMI(A1,A2,A3,A4)                                      \
+    "dli $10, 0x88              \n\t"                                       \
+    "ldc1 $f4, "#A1"            \n\t" /* 0; x3 x2 x1 x0                   */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "ldc1 $f10, 8+"#A1"         \n\t" /* 1; x7 x6 x5 x4                   */\
+    "ldc1 $f6, "#A3"            \n\t" /* 3; w05 w04 w01 w00               */\
+    "pshufh $f0, $f4, $f16      \n\t" /* x2 x0 x2 x0                      */\
+    "ldc1 $f8, 8+"#A3"          \n\t" /* 4; w07 w06 w03 w02               */\
+    "ldc1 $f12, 32+"#A3"        \n\t" /* 6; w21 w20 w17 w16               */\
+    "pmaddhw $f6, $f6, $f0      \n\t" /* x2*w05+x0*w04 x2*w01+x0*w00      */\
+    "dli $10, 0xdd              \n\t"                                       \
+    "pshufh $f2, $f10, $f16     \n\t" /* x6 x4 x6 x4                      */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "pmaddhw $f8, $f8, $f2      \n\t" /* x6*w07+x4*w06 x6*w03+x4*w02      */\
+    "ldc1 $f14, 40+"#A3"        \n\t" /* 7; w23 w22 w19 w18               */\
+    "pshufh $f4, $f4, $f16      \n\t" /* x3 x1 x3 x1                      */\
+    "pmaddhw $f12, $f12, $f4    \n\t" /* x3*w21+x1*w20 x3*w17+x1*w16      */\
+    "pshufh $f10, $f10, $f16    \n\t" /* x7 x5 x7 x5                      */\
+    "ldc1 $f18, "#A4"           \n\t"                                       \
+    "pmaddhw $f14, $f14, $f10   \n\t" /* x7*w23+x5*w22 x7*w19+x5*w18      */\
+    "paddw $f6, $f6, $f18       \n\t" /* +%4                              */\
+    "ldc1 $f16, 16+"#A3"        \n\t"                                       \
+    "pmaddhw $f0, $f0, $f16     \n\t" /* x2*w13+x0*w12 x2*w09+x0*w08      */\
+    "ldc1 $f16, 24+"#A3"        \n\t"                                       \
+    "paddw $f6, $f6, $f8        \n\t" /* 4; a1=sum(even1) a0=sum(even0)   */\
+    "pmaddhw $f2, $f2, $f16     \n\t" /* x6*w15+x4*w14 x6*w11+x4*w10      */\
+    "ldc1 $f16, 48+"#A3"        \n\t"                                       \
+    "pmaddhw $f4, $f4, $f16     \n\t" /* x3*w29+x1*w28 x3*w25+x1*w24      */\
+    "ldc1 $f16, 56+"#A3"        \n\t"                                       \
+    "paddw $f12, $f12, $f14     \n\t" /* 7; b1=sum(odd1) b0=sum(odd0)     */\
+    "dli $10, 11                \n\t"                                       \
+    "pmaddhw $f10, $f10, $f16   \n\t" /* x7*w31+x5*w30 x7*w27+x5*w26      */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "psubw $f8, $f6, $f12       \n\t" /* 6; a1-b1 a0-b0                   */\
+    "paddw $f6, $f6, $f12       \n\t" /* a1+b1 a0+b0                      */\
+    "paddw $f0, $f0, $f18       \n\t" /* +%4                              */\
+    "psraw $f6, $f6, $f16       \n\t" /* y1=a1+b1 y0=a0+b0                */\
+    "paddw $f0, $f0, $f2        \n\t" /* 1; a3=sum(even3) a2=sum(even2)   */\
+    "paddw $f4, $f4, $f10       \n\t" /* 5; b3=sum(odd3) b2=sum(odd2)     */\
+    "psraw $f8, $f8, $f16       \n\t" /* y6=a1-b1 y7=a0-b0                */\
+    "psubw $f14, $f0, $f4       \n\t" /* 2; a3-b3 a2-b2                   */\
+    "paddw $f0, $f0, $f4        \n\t" /* a3+b3 a2+b2                      */\
+    "psraw $f0, $f0, $f16       \n\t" /* y3=a3+b3 y2=a2+b2                */\
+    "psraw $f14, $f14, $f16     \n\t" /* y4=a3-b3 y5=a2-b2                */\
+    "dli $10, 0xb1              \n\t"                                       \
+    "packsswh $f6, $f6, $f0     \n\t" /* 0; y3 y2 y1 y0                   */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "packsswh $f14, $f14, $f8   \n\t" /* 4; y6 y7 y4 y5                   */\
+    "sdc1 $f6, "#A2"            \n\t" /* 3; save y3 y2 y1 y0              */\
+    "pshufh $f14, $f14, $f16    \n\t" /* y7 y6 y5 y4                      */\
+    "sdc1 $f14, 8+"#A2"         \n\t" /* 7; save y7 y6 y5 y4              */\
+
+
+#define DCT_8_INV_COL(A1,A2)                                                \
+    "ldc1 $f2, 2*8(%3)          \n\t"                                       \
+    "ldc1 $f6, 16*3+"#A1"       \n\t"                                       \
+    "ldc1 $f10, 16*5+"#A1"      \n\t"                                       \
+    "pmulhh $f0, $f2, $f6       \n\t" /* x3*(tg_3_16-1)                   */\
+    "ldc1 $f4, 0(%3)            \n\t"                                       \
+    "pmulhh $f2, $f2, $f10      \n\t" /* x5*(tg_3_16-1)                   */\
+    "ldc1 $f14, 16*7+"#A1"      \n\t"                                       \
+    "ldc1 $f12, 16*1+"#A1"      \n\t"                                       \
+    "pmulhh $f8, $f4, $f14      \n\t" /* x7*tg_1_16                       */\
+    "paddsh $f0, $f0, $f6       \n\t" /* x3*tg_3_16                       */\
+    "pmulhh $f4, $f4, $f12      \n\t" /* x1*tg_1_16                       */\
+    "paddsh $f2, $f2, $f6       \n\t" /* x3+x5*(tg_3_16-1)                */\
+    "psubsh $f0, $f0, $f10      \n\t" /* x3*tg_3_16-x5 = tm35             */\
+    "ldc1 $f6, 3*8(%3)          \n\t"                                       \
+    "paddsh $f2, $f2, $f10      \n\t" /* x3+x5*tg_3_16 = tp35             */\
+    "paddsh $f8, $f8, $f12      \n\t" /* x1+tg_1_16*x7 = tp17             */\
+    "psubsh $f4, $f4, $f14      \n\t" /* x1*tg_1_16-x7 = tm17             */\
+    "paddsh $f10, $f8, $f2      \n\t" /* tp17+tp35 = b0                   */\
+    "psubsh $f12, $f4, $f0      \n\t" /* tm17-tm35 = b3                   */\
+    "psubsh $f8, $f8, $f2       \n\t" /* tp17-tp35 = t1                   */\
+    "paddsh $f4, $f4, $f0       \n\t" /* tm17+tm35 = t2                   */\
+    "ldc1 $f14, 1*8(%3)         \n\t"                                       \
+    "sdc1 $f10, 3*16+"#A2"      \n\t" /* save b0                          */\
+    "paddsh $f2, $f8, $f4       \n\t" /* t1+t2                            */\
+    "sdc1 $f12, 5*16+"#A2"      \n\t" /* save b3                          */\
+    "psubsh $f8, $f8, $f4       \n\t" /* t1-t2                            */\
+    "ldc1 $f10, 2*16+"#A1"      \n\t"                                       \
+    "ldc1 $f12, 6*16+"#A1"      \n\t"                                       \
+    "pmulhh $f0, $f14, $f10     \n\t" /* x2*tg_2_16                       */\
+    "pmulhh $f14, $f14, $f12    \n\t" /* x6*tg_2_16                       */\
+    "pmulhh $f2, $f2, $f6       \n\t" /* ocos_4_16*(t1+t2) = b1/2         */\
+    "ldc1 $f4, 0*16+"#A1"       \n\t"                                       \
+    "pmulhh $f8, $f8, $f6       \n\t" /* ocos_4_16*(t1-t2) = b2/2         */\
+    "psubsh $f0, $f0, $f12      \n\t" /* t2*tg_2_16-x6 = tm26             */\
+    "ldc1 $f12, 4*16+"#A1"      \n\t"                                       \
+    "paddsh $f14, $f14, $f10    \n\t" /* x2+x6*tg_2_16 = tp26             */\
+    "psubsh $f6, $f4, $f12      \n\t" /* x0-x4 = tm04                     */\
+    "paddsh $f4, $f4, $f12      \n\t" /* x0+x4 = tp04                     */\
+    "paddsh $f10, $f4, $f14     \n\t" /* tp04+tp26 = a0                   */\
+    "psubsh $f12, $f6, $f0      \n\t" /* tm04-tm26 = a2                   */\
+    "psubsh $f4, $f4, $f14      \n\t" /* tp04-tp26 = a3                   */\
+    "paddsh $f6, $f6, $f0       \n\t" /* tm04+tm26 = a1                   */\
+    "paddsh $f2, $f2, $f2       \n\t" /* b1                               */\
+    "paddsh $f8, $f8, $f8       \n\t" /* b2                               */\
+    "psubsh $f14, $f6, $f2      \n\t" /* a1-b1                            */\
+    "dli $10, 6                 \n\t"                                       \
+    "paddsh $f6, $f6, $f2       \n\t" /* a1+b1                            */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "psubsh $f0, $f12, $f8      \n\t" /* a2-b2                            */\
+    "paddsh $f12, $f12, $f8     \n\t" /* a2+b2                            */\
+    "psrah $f6, $f6, $f16       \n\t" /* dst1                             */\
+    "psrah $f12, $f12, $f16     \n\t" /* dst2                             */\
+    "ldc1 $f2, 3*16+"#A2"       \n\t" /* load b0                          */\
+    "psrah $f14, $f14, $f16     \n\t" /* dst6                             */\
+    "psrah $f0, $f0, $f16       \n\t" /* dst5                             */\
+    "sdc1 $f6, 1*16+"#A2"       \n\t"                                       \
+    "psubsh $f8, $f10, $f2      \n\t" /* a0-b0                            */\
+    "paddsh $f10, $f10, $f2     \n\t" /* a0+b0                            */\
+    "sdc1 $f12, 2*16+"#A2"      \n\t"                                       \
+    "ldc1 $f6, 5*16+"#A2"       \n\t" /* load b3                          */\
+    "psrah $f10, $f10, $f16     \n\t" /* dst0                             */\
+    "psrah $f8, $f8, $f16       \n\t" /* dst7                             */\
+    "sdc1 $f0, 5*16+"#A2"       \n\t"                                       \
+    "psubsh $f12, $f4, $f6      \n\t" /* a3-b3                            */\
+    "paddsh $f4, $f4, $f6       \n\t" /* a3+b3                            */\
+    "sdc1 $f14, 6*16+"#A2"      \n\t"                                       \
+    "sdc1 $f10, 0*16+"#A2"      \n\t"                                       \
+    "psrah $f4, $f4, $f16       \n\t" /* dst3                             */\
+    "sdc1 $f8, 7*16+"#A2"       \n\t"                                       \
+    "psrah $f12, $f12, $f16     \n\t" /* dst4                             */\
+    "sdc1 $f4, 3*16+"#A2"       \n\t"                                       \
+    "sdc1 $f12, 4*16+"#A2"      \n\t"                                       \
+
+
+void ff_xvid_idct_mmi(int16_t *block)
+{
+    __asm__ volatile (
+        //# Process each row
+        DCT_8_INV_ROW_MMI(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
+        DCT_8_INV_ROW_MMI(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
+        DCT_8_INV_ROW_MMI(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
+        DCT_8_INV_ROW_MMI(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
+        DCT_8_INV_ROW_MMI(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
+        DCT_8_INV_ROW_MMI(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
+        DCT_8_INV_ROW_MMI(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
+        DCT_8_INV_ROW_MMI(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
+        //# Process the columns (4 at a time)
+        DCT_8_INV_COL(0(%0), 0(%0))
+        DCT_8_INV_COL(8(%0), 8(%0))
+        ::"r"(block),"r"(rounder_0),"r"(tab_i_04_mmi),"r"(tg_1_16)
+        : "$10"
+    );
+}
+
+void ff_xvid_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block)
+{
+    ff_xvid_idct_mmi(block);
+    ff_put_pixels_clamped_mmi(block, dest, line_size);
+}
+
+void ff_xvid_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block)
+{
+    ff_xvid_idct_mmi(block);
+    ff_add_pixels_clamped_mmi(block, dest, line_size);
+}
diff --git a/libavcodec/mips/xvididct_init_mips.c b/libavcodec/mips/xvididct_init_mips.c
new file mode 100644
index 0000000000..c1d82cc30c
--- /dev/null
+++ b/libavcodec/mips/xvididct_init_mips.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xvididct_mips.h"
+
+#if HAVE_MMI
+static av_cold void xvid_idct_init_mmi(IDCTDSPContext *c, AVCodecContext *avctx,
+        unsigned high_bit_depth)
+{
+    if (!high_bit_depth) {
+        if (avctx->idct_algo == FF_IDCT_AUTO ||
+                avctx->idct_algo == FF_IDCT_XVID) {
+            c->idct_put = ff_xvid_idct_put_mmi;
+            c->idct_add = ff_xvid_idct_add_mmi;
+            c->idct = ff_xvid_idct_mmi;
+            c->perm_type = FF_IDCT_PERM_NONE;
+        }
+    }
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_xvid_idct_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+        unsigned high_bit_depth)
+{
+#if HAVE_MMI
+    xvid_idct_init_mmi(c, avctx, high_bit_depth);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/xvididct_mips.h b/libavcodec/mips/xvididct_mips.h
new file mode 100644
index 0000000000..0768aaa26f
--- /dev/null
+++ b/libavcodec/mips/xvididct_mips.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_XVIDIDCT_MIPS_H
+#define AVCODEC_MIPS_XVIDIDCT_MIPS_H
+
+#include "libavcodec/xvididct.h"
+
+void ff_xvid_idct_mmi(int16_t *block);
+void ff_xvid_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block);
+void ff_xvid_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block);
+
+#endif /* AVCODEC_MIPS_XVIDIDCT_MIPS_H */
diff --git a/libavcodec/mjpeg.h b/libavcodec/mjpeg.h
index 1ebe283748..cd5d0af2ac 100644
--- a/libavcodec/mjpeg.h
+++ b/libavcodec/mjpeg.h
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -117,6 +117,7 @@ enum JpegMarker {
 
 #define PREDICT(ret, topleft, top, left, predictor)\
     switch(predictor){\
+        case 0: ret= 0; break;\
         case 1: ret= left; break;\
         case 2: ret= top; break;\
         case 3: ret= topleft; break;\
diff --git a/libavcodec/mjpeg2jpeg_bsf.c b/libavcodec/mjpeg2jpeg_bsf.c
index d27793420e..68640db9e6 100644
--- a/libavcodec/mjpeg2jpeg_bsf.c
+++ b/libavcodec/mjpeg2jpeg_bsf.c
@@ -2,20 +2,20 @@
  * MJPEG/AVI1 to JPEG/JFIF bitstream format filter
  * Copyright (c) 2010 Adrian Daerr and Nicolas George
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mjpeg_parser.c b/libavcodec/mjpeg_parser.c
index ab654614eb..e548b00f40 100644
--- a/libavcodec/mjpeg_parser.c
+++ b/libavcodec/mjpeg_parser.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2003 Alex Beregszaszi
  * Copyright (c) 2003-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,27 +28,44 @@
 
 #include "parser.h"
 
+typedef struct MJPEGParserContext{
+    ParseContext pc;
+    int size;
+}MJPEGParserContext;
 
 /**
  * Find the end of the current frame in the bitstream.
  * @return the position of the first byte of the next frame, or -1
  */
-static int find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size){
+static int find_frame_end(MJPEGParserContext *m, const uint8_t *buf, int buf_size){
+    ParseContext *pc= &m->pc;
     int vop_found, i;
-    uint16_t state;
+    uint32_t state;
 
     vop_found= pc->frame_start_found;
     state= pc->state;
 
     i=0;
     if(!vop_found){
-        for(i=0; i<buf_size; i++){
+        for(i=0; i<buf_size;){
             state= (state<<8) | buf[i];
-            if(state == 0xFFD8){
-                i++;
-                vop_found=1;
-                break;
+            if(state>=0xFFC00000 && state<=0xFFFEFFFF){
+                if(state>=0xFFD80000 && state<=0xFFD8FFFF){
+                    i++;
+                    vop_found=1;
+                    break;
+                }else if(state<0xFFD00000 || state>0xFFD9FFFF){
+                    m->size= (state&0xFFFF)-1;
+                }
             }
+            if(m->size>0){
+                int size= FFMIN(buf_size-i, m->size);
+                i+=size;
+                m->size-=size;
+                state=0;
+                continue;
+            }else
+                i++;
         }
     }
 
@@ -56,13 +73,25 @@ static int find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size){
         /* EOF considered as end of frame */
         if (buf_size == 0)
             return 0;
-        for(; i<buf_size; i++){
+        for(; i<buf_size;){
             state= (state<<8) | buf[i];
-            if(state == 0xFFD8){
-                pc->frame_start_found=0;
-                pc->state=0;
-                return i-1;
+            if(state>=0xFFC00000 && state<=0xFFFEFFFF){
+                if(state>=0xFFD80000 && state<=0xFFD8FFFF){
+                    pc->frame_start_found=0;
+                    pc->state=0;
+                    return i-3;
+                } else if(state<0xFFD00000 || state>0xFFD9FFFF){
+                    m->size= (state&0xFFFF)-1;
+                }
             }
+            if(m->size>0){
+                int size= FFMIN(buf_size-i, m->size);
+                i+=size;
+                m->size-=size;
+                state=0;
+                continue;
+            }else
+                i++;
         }
     }
     pc->frame_start_found= vop_found;
@@ -75,13 +104,14 @@ static int jpeg_parse(AVCodecParserContext *s,
                       const uint8_t **poutbuf, int *poutbuf_size,
                       const uint8_t *buf, int buf_size)
 {
-    ParseContext *pc = s->priv_data;
+    MJPEGParserContext *m = s->priv_data;
+    ParseContext *pc = &m->pc;
     int next;
 
     if(s->flags & PARSER_FLAG_COMPLETE_FRAMES){
         next= buf_size;
     }else{
-        next= find_frame_end(pc, buf, buf_size);
+        next= find_frame_end(m, buf, buf_size);
 
         if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
             *poutbuf = NULL;
@@ -98,7 +128,7 @@ static int jpeg_parse(AVCodecParserContext *s,
 
 AVCodecParser ff_mjpeg_parser = {
     .codec_ids      = { AV_CODEC_ID_MJPEG },
-    .priv_data_size = sizeof(ParseContext),
+    .priv_data_size = sizeof(MJPEGParserContext),
     .parser_parse   = jpeg_parse,
     .parser_close   = ff_parse_close,
 };
diff --git a/libavcodec/mjpega_dump_header_bsf.c b/libavcodec/mjpega_dump_header_bsf.c
index dfb8916dd9..d6d41e6bbb 100644
--- a/libavcodec/mjpega_dump_header_bsf.c
+++ b/libavcodec/mjpega_dump_header_bsf.c
@@ -2,20 +2,20 @@
  * MJPEG A dump header bitstream filter
  * Copyright (c) 2006 Baptiste Coudurier
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -90,7 +90,6 @@ static int mjpega_dump_header(AVBitStreamFilterContext *bsfc, AVCodecContext *av
 }
 
 AVBitStreamFilter ff_mjpega_dump_header_bsf = {
-    "mjpegadump",
-    0,
-    mjpega_dump_header,
+    .name   = "mjpegadump",
+    .filter = mjpega_dump_header,
 };
diff --git a/libavcodec/mjpegbdec.c b/libavcodec/mjpegbdec.c
index 3775aa3fb9..a858707d54 100644
--- a/libavcodec/mjpegbdec.c
+++ b/libavcodec/mjpegbdec.c
@@ -2,20 +2,20 @@
  * Apple MJPEG-B decoder
  * Copyright (c) 2002 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -55,6 +55,7 @@ static int mjpegb_decode_frame(AVCodecContext *avctx,
 
     buf_ptr = buf;
     buf_end = buf + buf_size;
+    s->got_picture = 0;
 
 read_header:
     /* reset on every SOI */
@@ -122,7 +123,7 @@ read_header:
                       8 * FFMIN(field_size, buf_end - buf_ptr - sos_offs));
         s->mjpb_skiptosod = (sod_offs - sos_offs - show_bits(&s->gb, 16));
         s->start_code = SOS;
-        if (ff_mjpeg_decode_sos(s, NULL, NULL) < 0 &&
+        if (ff_mjpeg_decode_sos(s, NULL, 0, NULL) < 0 &&
             (avctx->err_recognition & AV_EF_EXPLODE))
           return AVERROR_INVALIDDATA;
     }
@@ -133,13 +134,17 @@ read_header:
         if (s->bottom_field != s->interlace_polarity && second_field_offs)
         {
             buf_ptr = buf + second_field_offs;
-            second_field_offs = 0;
             goto read_header;
             }
     }
 
     //XXX FIXME factorize, this looks very similar to the EOI code
 
+    if(!s->got_picture) {
+        av_log(avctx, AV_LOG_WARNING, "no picture\n");
+        return buf_size;
+    }
+
     if ((ret = av_frame_ref(data, s->picture_ptr)) < 0)
         return ret;
     *got_frame = 1;
@@ -162,5 +167,6 @@ AVCodec ff_mjpegb_decoder = {
     .close          = ff_mjpeg_decode_end,
     .decode         = mjpegb_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
diff --git a/libavcodec/mjpegdec.c b/libavcodec/mjpegdec.c
index d8fc9defa3..1a86b7bc31 100644
--- a/libavcodec/mjpegdec.c
+++ b/libavcodec/mjpegdec.c
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,12 +30,12 @@
  * MJPEG decoder.
  */
 
-#include <assert.h>
-
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "blockdsp.h"
+#include "copy_block.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "jpegtables.h"
@@ -43,6 +43,9 @@
 #include "mjpegdec.h"
 #include "jpeglsdec.h"
 #include "put_bits.h"
+#include "tiff.h"
+#include "exif.h"
+#include "bytestream.h"
 
 
 static int build_vlc(VLC *vlc, const uint8_t *bits_table,
@@ -54,7 +57,7 @@ static int build_vlc(VLC *vlc, const uint8_t *bits_table,
     uint16_t huff_sym[256];
     int i;
 
-    assert(nb_codes <= 256);
+    av_assert0(nb_codes <= 256);
 
     ff_mjpeg_build_huffman_codes(huff_size, huff_code, bits_table, val_table);
 
@@ -84,6 +87,17 @@ static void build_basic_mjpeg_vlc(MJpegDecodeContext *s)
               avpriv_mjpeg_val_ac_chrominance, 251, 0, 0);
 }
 
+static void parse_avid(MJpegDecodeContext *s, uint8_t *buf, int len)
+{
+    s->buggy_avid = 1;
+    if (len > 14 && buf[12] == 1) /* 1 - NTSC */
+        s->interlace_polarity = 1;
+    if (len > 14 && buf[12] == 2) /* 2 - PAL */
+        s->interlace_polarity = 0;
+    if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(s->avctx, AV_LOG_INFO, "AVID: len:%d %d\n", len, len > 14 ? buf[12] : -1);
+}
+
 av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx)
 {
     MJpegDecodeContext *s = avctx->priv_data;
@@ -105,6 +119,7 @@ av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx)
     s->buffer        = NULL;
     s->start_code    = -1;
     s->first_picture = 1;
+    s->got_picture   = 0;
     s->org_height    = avctx->coded_height;
     avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
     avctx->colorspace = AVCOL_SPC_BT470BG;
@@ -112,19 +127,28 @@ av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx)
     build_basic_mjpeg_vlc(s);
 
     if (s->extern_huff) {
-        int ret;
-        av_log(avctx, AV_LOG_INFO, "mjpeg: using external huffman table\n");
+        av_log(avctx, AV_LOG_INFO, "using external huffman table\n");
         init_get_bits(&s->gb, avctx->extradata, avctx->extradata_size * 8);
-        if ((ret = ff_mjpeg_decode_dht(s))) {
+        if (ff_mjpeg_decode_dht(s)) {
             av_log(avctx, AV_LOG_ERROR,
-                   "mjpeg: error using external huffman table\n");
-            return ret;
+                   "error using external huffman table, switching back to internal\n");
+            build_basic_mjpeg_vlc(s);
         }
     }
     if (avctx->field_order == AV_FIELD_BB) { /* quicktime icefloe 019 */
         s->interlace_polarity = 1;           /* bottom field first */
-        av_log(avctx, AV_LOG_DEBUG, "mjpeg bottom field first\n");
+        av_log(avctx, AV_LOG_DEBUG, "bottom field first\n");
+    } else if (avctx->field_order == AV_FIELD_UNKNOWN) {
+        if (avctx->codec_tag == AV_RL32("MJPG"))
+            s->interlace_polarity = 1;
+    }
+
+    if (   avctx->extradata_size > 8
+        && AV_RL32(avctx->extradata) == 0x2C
+        && AV_RL32(avctx->extradata+4) == 0x18) {
+        parse_avid(s, avctx->extradata, avctx->extradata_size);
     }
+
     if (avctx->codec->id == AV_CODEC_ID_AMV)
         s->flipped = 1;
 
@@ -139,11 +163,16 @@ int ff_mjpeg_decode_dqt(MJpegDecodeContext *s)
 
     len = get_bits(&s->gb, 16) - 2;
 
+    if (8*len > get_bits_left(&s->gb)) {
+        av_log(s->avctx, AV_LOG_ERROR, "dqt: len %d is too large\n", len);
+        return AVERROR_INVALIDDATA;
+    }
+
     while (len >= 65) {
-        /* only 8 bit precision handled */
-        if (get_bits(&s->gb, 4) != 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "dqt: 16bit precision\n");
-            return -1;
+        int pr = get_bits(&s->gb, 4);
+        if (pr > 1) {
+            av_log(s->avctx, AV_LOG_ERROR, "dqt: invalid precision\n");
+            return AVERROR_INVALIDDATA;
         }
         index = get_bits(&s->gb, 4);
         if (index >= 4)
@@ -152,7 +181,7 @@ int ff_mjpeg_decode_dqt(MJpegDecodeContext *s)
         /* read quant table */
         for (i = 0; i < 64; i++) {
             j = s->scantable.permutated[i];
-            s->quant_matrixes[index][j] = get_bits(&s->gb, 8);
+            s->quant_matrixes[index][j] = get_bits(&s->gb, pr ? 16 : 8);
         }
 
         // XXX FIXME finetune, and perhaps add dc too
@@ -160,7 +189,7 @@ int ff_mjpeg_decode_dqt(MJpegDecodeContext *s)
                                  s->quant_matrixes[index][s->scantable.permutated[8]]) >> 1;
         av_log(s->avctx, AV_LOG_DEBUG, "qscale[%d]: %d\n",
                index, s->qscale[index]);
-        len -= 65;
+        len -= 1 + 64 * (1+pr);
     }
     return 0;
 }
@@ -175,6 +204,11 @@ int ff_mjpeg_decode_dht(MJpegDecodeContext *s)
 
     len = get_bits(&s->gb, 16) - 2;
 
+    if (8*len > get_bits_left(&s->gb)) {
+        av_log(s->avctx, AV_LOG_ERROR, "dht: len %d is too large\n", len);
+        return AVERROR_INVALIDDATA;
+    }
+
     while (len > 0) {
         if (len < 17)
             return AVERROR_INVALIDDATA;
@@ -222,21 +256,32 @@ int ff_mjpeg_decode_dht(MJpegDecodeContext *s)
 
 int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
 {
+    int len, nb_components, i, width, height, bits, ret;
+    unsigned pix_fmt_id;
     int h_count[MAX_COMPONENTS] = { 0 };
     int v_count[MAX_COMPONENTS] = { 0 };
-    int len, nb_components, i, width, height, bits, pix_fmt_id, ret;
+
+    s->cur_scan = 0;
+    memset(s->upscale_h, 0, sizeof(s->upscale_h));
+    memset(s->upscale_v, 0, sizeof(s->upscale_v));
 
     /* XXX: verify len field validity */
     len     = get_bits(&s->gb, 16);
+    s->avctx->bits_per_raw_sample =
     bits    = get_bits(&s->gb, 8);
 
+    if (bits > 16 || bits < 1) {
+        av_log(s->avctx, AV_LOG_ERROR, "bits %d is invalid\n", bits);
+        return AVERROR_INVALIDDATA;
+    }
+
     if (s->pegasus_rct)
         bits = 9;
     if (bits == 9 && !s->pegasus_rct)
         s->rct  = 1;    // FIXME ugly
 
-    if (bits != 8 && !s->lossless) {
-        av_log(s->avctx, AV_LOG_ERROR, "only 8 bits/component accepted\n");
+    if(s->lossless && s->avctx->lowres){
+        av_log(s->avctx, AV_LOG_ERROR, "lowres is not possible with lossless jpeg\n");
         return -1;
     }
 
@@ -282,8 +327,10 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
         if (v_count[i] > s->v_max)
             s->v_max = v_count[i];
         s->quant_index[i] = get_bits(&s->gb, 8);
-        if (s->quant_index[i] >= 4)
+        if (s->quant_index[i] >= 4) {
+            av_log(s->avctx, AV_LOG_ERROR, "quant_index is invalid\n");
             return AVERROR_INVALIDDATA;
+        }
         if (!h_count[i] || !v_count[i]) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "Invalid sampling factor in component %d %d:%d\n",
@@ -295,28 +342,35 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
                i, h_count[i], v_count[i],
                s->component_id[i], s->quant_index[i]);
     }
+    if (   nb_components == 4
+        && s->component_id[0] == 'C' - 1
+        && s->component_id[1] == 'M' - 1
+        && s->component_id[2] == 'Y' - 1
+        && s->component_id[3] == 'K' - 1)
+        s->adobe_transform = 0;
 
     if (s->ls && (s->h_max > 1 || s->v_max > 1)) {
         avpriv_report_missing_feature(s->avctx, "Subsampling in JPEG-LS");
         return AVERROR_PATCHWELCOME;
     }
 
-    if (s->v_max == 1 && s->h_max == 1 && s->lossless == 1)
-        s->rgb = 1;
 
     /* if different size, realloc/alloc picture */
     if (width != s->width || height != s->height || bits != s->bits ||
         memcmp(s->h_count, h_count, sizeof(h_count))                ||
         memcmp(s->v_count, v_count, sizeof(v_count))) {
+
         s->width      = width;
         s->height     = height;
         s->bits       = bits;
         memcpy(s->h_count, h_count, sizeof(h_count));
         memcpy(s->v_count, v_count, sizeof(v_count));
         s->interlaced = 0;
+        s->got_picture = 0;
 
         /* test interlaced mode */
         if (s->first_picture   &&
+            (s->multiscope != 2 || s->avctx->time_base.den >= 25 * s->avctx->time_base.num) &&
             s->org_height != 0 &&
             s->height < ((s->org_height * 3) / 4)) {
             s->interlaced                    = 1;
@@ -333,9 +387,18 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
         s->first_picture = 0;
     }
 
-    if (!(s->interlaced && (s->bottom_field == !s->interlace_polarity))) {
+    if (s->got_picture && s->interlaced && (s->bottom_field == !s->interlace_polarity)) {
+        if (s->progressive) {
+            avpriv_request_sample(s->avctx, "progressively coded interlaced picture");
+            return AVERROR_INVALIDDATA;
+        }
+    } else{
+        if (s->v_max == 1 && s->h_max == 1 && s->lossless==1 && (nb_components==3 || nb_components==4))
+            s->rgb = 1;
+        else if (!s->lossless)
+            s->rgb = 0;
     /* XXX: not complete test ! */
-    pix_fmt_id = (s->h_count[0] << 28) | (s->v_count[0] << 24) |
+    pix_fmt_id = ((unsigned)s->h_count[0] << 28) | (s->v_count[0] << 24) |
                  (s->h_count[1] << 20) | (s->v_count[1] << 16) |
                  (s->h_count[2] << 12) | (s->v_count[2] <<  8) |
                  (s->h_count[3] <<  4) |  s->v_count[3];
@@ -347,38 +410,187 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
     if (!(pix_fmt_id & 0x0D0D0D0D))
         pix_fmt_id -= (pix_fmt_id & 0x0F0F0F0F) >> 1;
 
+    for (i = 0; i < 8; i++) {
+        int j = 6 + (i&1) - (i&6);
+        int is = (pix_fmt_id >> (4*i)) & 0xF;
+        int js = (pix_fmt_id >> (4*j)) & 0xF;
+
+        if (is == 1 && js != 2 && (i < 2 || i > 5))
+            js = (pix_fmt_id >> ( 8 + 4*(i&1))) & 0xF;
+        if (is == 1 && js != 2 && (i < 2 || i > 5))
+            js = (pix_fmt_id >> (16 + 4*(i&1))) & 0xF;
+
+        if (is == 1 && js == 2) {
+            if (i & 1) s->upscale_h[j/2] = 1;
+            else       s->upscale_v[j/2] = 1;
+        }
+    }
+
     switch (pix_fmt_id) {
     case 0x11111100:
         if (s->rgb)
-            s->avctx->pix_fmt = AV_PIX_FMT_BGRA;
+            s->avctx->pix_fmt = s->bits <= 9 ? AV_PIX_FMT_BGR24 : AV_PIX_FMT_BGR48;
+        else {
+            if (s->component_id[0] == 'Q' && s->component_id[1] == 'F' && s->component_id[2] == 'A') {
+                s->avctx->pix_fmt = s->bits <= 8 ? AV_PIX_FMT_GBRP : AV_PIX_FMT_GBRP16;
+            } else {
+                if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+                else              s->avctx->pix_fmt = AV_PIX_FMT_YUV444P16;
+            s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+            }
+        }
+        av_assert0(s->nb_components == 3);
+        break;
+    case 0x11111111:
+        if (s->rgb)
+            s->avctx->pix_fmt = s->bits <= 9 ? AV_PIX_FMT_ABGR : AV_PIX_FMT_RGBA64;
         else {
-            s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+            if (s->adobe_transform == 0 && s->bits <= 8) {
+                s->avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+            } else {
+                s->avctx->pix_fmt = s->bits <= 8 ? AV_PIX_FMT_YUVA444P : AV_PIX_FMT_YUVA444P16;
+                s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+            }
+        }
+        av_assert0(s->nb_components == 4);
+        break;
+    case 0x22111122:
+    case 0x22111111:
+        if (s->adobe_transform == 0 && s->bits <= 8) {
+            s->avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+            s->upscale_v[1] = s->upscale_v[2] = 1;
+            s->upscale_h[1] = s->upscale_h[2] = 1;
+        } else if (s->adobe_transform == 2 && s->bits <= 8) {
+            s->avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+            s->upscale_v[1] = s->upscale_v[2] = 1;
+            s->upscale_h[1] = s->upscale_h[2] = 1;
+            s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        } else {
+            if (s->bits <= 8) s->avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
+            else              s->avctx->pix_fmt = AV_PIX_FMT_YUVA420P16;
             s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
         }
-        assert(s->nb_components == 3);
+        av_assert0(s->nb_components == 4);
+        break;
+    case 0x12121100:
+    case 0x22122100:
+    case 0x21211100:
+    case 0x22211200:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+        else
+            goto unk_pixfmt;
+        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        break;
+    case 0x22221100:
+    case 0x22112200:
+    case 0x11222200:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+        else
+            goto unk_pixfmt;
+        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
         break;
     case 0x11000000:
-        s->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+    case 0x13000000:
+    case 0x14000000:
+    case 0x31000000:
+    case 0x33000000:
+    case 0x34000000:
+    case 0x41000000:
+    case 0x43000000:
+    case 0x44000000:
+        if(s->bits <= 8)
+            s->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        else
+            s->avctx->pix_fmt = AV_PIX_FMT_GRAY16;
         break;
     case 0x12111100:
-        s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV440P : AV_PIX_FMT_YUVJ440P;
-        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+    case 0x14121200:
+    case 0x14111100:
+    case 0x22211100:
+    case 0x22112100:
+        if (s->component_id[0] == 'Q' && s->component_id[1] == 'F' && s->component_id[2] == 'A') {
+            if (s->bits <= 8) s->avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            else
+                goto unk_pixfmt;
+            s->upscale_v[0] = s->upscale_v[1] = 1;
+        } else {
+            if (pix_fmt_id == 0x14111100)
+                s->upscale_v[1] = s->upscale_v[2] = 1;
+            if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV440P : AV_PIX_FMT_YUVJ440P;
+            else
+                goto unk_pixfmt;
+            s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        }
         break;
     case 0x21111100:
-        s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV422P : AV_PIX_FMT_YUVJ422P;
+        if (s->component_id[0] == 'Q' && s->component_id[1] == 'F' && s->component_id[2] == 'A') {
+            if (s->bits <= 8) s->avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            else
+                goto unk_pixfmt;
+            s->upscale_h[0] = s->upscale_h[1] = 1;
+        } else {
+            if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV422P : AV_PIX_FMT_YUVJ422P;
+            else              s->avctx->pix_fmt = AV_PIX_FMT_YUV422P16;
+            s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        }
+        break;
+    case 0x31111100:
+        if (s->bits > 8)
+            goto unk_pixfmt;
+        s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        s->upscale_h[1] = s->upscale_h[2] = 2;
+        break;
+    case 0x22121100:
+    case 0x22111200:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV422P : AV_PIX_FMT_YUVJ422P;
+        else
+            goto unk_pixfmt;
         s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
         break;
     case 0x22111100:
-        s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV420P : AV_PIX_FMT_YUVJ420P;
+    case 0x42111100:
+    case 0x24111100:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV420P : AV_PIX_FMT_YUVJ420P;
+        else              s->avctx->pix_fmt = AV_PIX_FMT_YUV420P16;
+        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        if (pix_fmt_id == 0x42111100) {
+            if (s->bits > 8)
+                goto unk_pixfmt;
+            s->upscale_h[1] = s->upscale_h[2] = 1;
+        } else if (pix_fmt_id == 0x24111100) {
+            if (s->bits > 8)
+                goto unk_pixfmt;
+            s->upscale_v[1] = s->upscale_v[2] = 1;
+        }
+        break;
+    case 0x41111100:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV411P : AV_PIX_FMT_YUVJ411P;
+        else
+            goto unk_pixfmt;
         s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
         break;
     default:
-        av_log(s->avctx, AV_LOG_ERROR, "Unhandled pixel format 0x%x\n", pix_fmt_id);
+unk_pixfmt:
+        av_log(s->avctx, AV_LOG_ERROR, "Unhandled pixel format 0x%x bits:%d\n", pix_fmt_id, s->bits);
+        memset(s->upscale_h, 0, sizeof(s->upscale_h));
+        memset(s->upscale_v, 0, sizeof(s->upscale_v));
+        return AVERROR_PATCHWELCOME;
+    }
+    if ((AV_RB32(s->upscale_h) || AV_RB32(s->upscale_v)) && s->avctx->lowres) {
+        av_log(s->avctx, AV_LOG_ERROR, "lowres not supported for weird subsampling\n");
         return AVERROR_PATCHWELCOME;
     }
     if (s->ls) {
-        if (s->nb_components > 1)
+        memset(s->upscale_h, 0, sizeof(s->upscale_h));
+        memset(s->upscale_v, 0, sizeof(s->upscale_v));
+        if (s->nb_components == 3) {
             s->avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        } else if (s->nb_components != 1) {
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported number of components %d\n", s->nb_components);
+            return AVERROR_PATCHWELCOME;
+        } else if (s->palette_index && s->bits <= 8)
+            s->avctx->pix_fmt = AV_PIX_FMT_PAL8;
         else if (s->bits <= 8)
             s->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
         else
@@ -392,15 +604,13 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
     }
 
     av_frame_unref(s->picture_ptr);
-    if (ff_get_buffer(s->avctx, s->picture_ptr, AV_GET_BUFFER_FLAG_REF) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if (ff_get_buffer(s->avctx, s->picture_ptr, AV_GET_BUFFER_FLAG_REF) < 0)
         return -1;
-    }
     s->picture_ptr->pict_type = AV_PICTURE_TYPE_I;
     s->picture_ptr->key_frame = 1;
     s->got_picture            = 1;
 
-    for (i = 0; i < 3; i++)
+    for (i = 0; i < 4; i++)
         s->linesize[i] = s->picture_ptr->linesize[i] << s->interlaced;
 
     ff_dlog(s->avctx, "%d %d %d %d %d %d\n",
@@ -411,6 +621,11 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
         av_log(s->avctx, AV_LOG_DEBUG, "decode_sof0: error, len(%d) mismatch\n", len);
     }
 
+    if (s->rgb && !s->lossless && !s->ls) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported coding and pixel format combination\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
     /* totally blank picture as progressive JPEG will only add details to it */
     if (s->progressive) {
         int bw = (width  + s->h_max * 8 - 1) / (s->h_max * 8);
@@ -419,8 +634,10 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
             int size = bw * bh * s->h_count[i] * s->v_count[i];
             av_freep(&s->blocks[i]);
             av_freep(&s->last_nnz[i]);
-            s->blocks[i]       = av_malloc(size * sizeof(**s->blocks));
-            s->last_nnz[i]     = av_mallocz(size * sizeof(**s->last_nnz));
+            s->blocks[i]       = av_mallocz_array(size, sizeof(**s->blocks));
+            s->last_nnz[i]     = av_mallocz_array(size, sizeof(**s->last_nnz));
+            if (!s->blocks[i] || !s->last_nnz[i])
+                return AVERROR(ENOMEM);
             s->block_stride[i] = bw * s->h_count[i];
         }
         memset(s->coefs_finished, 0, sizeof(s->coefs_finished));
@@ -432,11 +649,11 @@ static inline int mjpeg_decode_dc(MJpegDecodeContext *s, int dc_index)
 {
     int code;
     code = get_vlc2(&s->gb, s->vlcs[0][dc_index].table, 9, 2);
-    if (code < 0) {
+    if (code < 0 || code > 16) {
         av_log(s->avctx, AV_LOG_WARNING,
                "mjpeg_decode_dc: bad vlc: %d:%d (%p)\n",
                0, dc_index, &s->vlcs[0][dc_index]);
-        return 0xffff;
+        return 0xfffff;
     }
 
     if (code)
@@ -453,11 +670,12 @@ static int decode_block(MJpegDecodeContext *s, int16_t *block, int component,
 
     /* DC coef */
     val = mjpeg_decode_dc(s, dc_index);
-    if (val == 0xffff) {
+    if (val == 0xfffff) {
         av_log(s->avctx, AV_LOG_ERROR, "error dc\n");
         return AVERROR_INVALIDDATA;
     }
     val = val * quant_matrix[0] + s->last_dc[component];
+    val = FFMIN(val, 32767);
     s->last_dc[component] = val;
     block[0] = val;
     /* AC coefs */
@@ -501,11 +719,11 @@ static int decode_dc_progressive(MJpegDecodeContext *s, int16_t *block,
     int val;
     s->bdsp.clear_block(block);
     val = mjpeg_decode_dc(s, dc_index);
-    if (val == 0xffff) {
+    if (val == 0xfffff) {
         av_log(s->avctx, AV_LOG_ERROR, "error dc\n");
         return AVERROR_INVALIDDATA;
     }
-    val = (val * quant_matrix[0] << Al) + s->last_dc[component];
+    val = (val * (quant_matrix[0] << Al)) + s->last_dc[component];
     s->last_dc[component] = val;
     block[0] = val;
     return 0;
@@ -548,14 +766,14 @@ static int decode_block_progressive(MJpegDecodeContext *s, int16_t *block,
                 if (i >= se) {
                     if (i == se) {
                         j = s->scantable.permutated[se];
-                        block[j] = level * quant_matrix[j] << Al;
+                        block[j] = level * (quant_matrix[j] << Al);
                         break;
                     }
                     av_log(s->avctx, AV_LOG_ERROR, "error count: %d\n", i);
                     return AVERROR_INVALIDDATA;
                 }
                 j = s->scantable.permutated[i];
-                block[j] = level * quant_matrix[j] << Al;
+                block[j] = level * (quant_matrix[j] << Al);
             } else {
                 if (run == 0xF) {// ZRL - skip 15 coefficients
                     i += 15;
@@ -634,7 +852,7 @@ static int decode_block_refinement(MJpegDecodeContext *s, int16_t *block,
                 ZERO_RUN;
                 j = s->scantable.permutated[i];
                 val--;
-                block[j] = ((quant_matrix[j]^val) - val) << Al;
+                block[j] = ((quant_matrix[j] << Al) ^ val) - val;
                 if (i == se) {
                     if (i > *last_nnz)
                         *last_nnz = i;
@@ -675,46 +893,101 @@ static int decode_block_refinement(MJpegDecodeContext *s, int16_t *block,
 #undef REFINE_BIT
 #undef ZERO_RUN
 
-static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int predictor,
-                                 int point_transform)
+static int handle_rstn(MJpegDecodeContext *s, int nb_components)
+{
+    int i;
+    int reset = 0;
+
+    if (s->restart_interval) {
+        s->restart_count--;
+        if(s->restart_count == 0 && s->avctx->codec_id == AV_CODEC_ID_THP){
+            align_get_bits(&s->gb);
+            for (i = 0; i < nb_components; i++) /* reset dc */
+                s->last_dc[i] = (4 << s->bits);
+        }
+
+        i = 8 + ((-get_bits_count(&s->gb)) & 7);
+        /* skip RSTn */
+        if (s->restart_count == 0) {
+            if(   show_bits(&s->gb, i) == (1 << i) - 1
+               || show_bits(&s->gb, i) == 0xFF) {
+                int pos = get_bits_count(&s->gb);
+                align_get_bits(&s->gb);
+                while (get_bits_left(&s->gb) >= 8 && show_bits(&s->gb, 8) == 0xFF)
+                    skip_bits(&s->gb, 8);
+                if (get_bits_left(&s->gb) >= 8 && (get_bits(&s->gb, 8) & 0xF8) == 0xD0) {
+                    for (i = 0; i < nb_components; i++) /* reset dc */
+                        s->last_dc[i] = (4 << s->bits);
+                    reset = 1;
+                } else
+                    skip_bits_long(&s->gb, pos - get_bits_count(&s->gb));
+            }
+        }
+    }
+    return reset;
+}
+
+static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int nb_components, int predictor, int point_transform)
 {
     int i, mb_x, mb_y;
     uint16_t (*buffer)[4];
-    int left[3], top[3], topleft[3];
+    int left[4], top[4], topleft[4];
     const int linesize = s->linesize[0];
-    const int mask     = (1 << s->bits) - 1;
+    const int mask     = ((1 << s->bits) - 1) << point_transform;
+    int resync_mb_y = 0;
+    int resync_mb_x = 0;
+
+    if (s->nb_components != 3 && s->nb_components != 4)
+        return AVERROR_INVALIDDATA;
+    if (s->v_max != 1 || s->h_max != 1 || !s->lossless)
+        return AVERROR_INVALIDDATA;
+
+
+    s->restart_count = s->restart_interval;
 
     av_fast_malloc(&s->ljpeg_buffer, &s->ljpeg_buffer_size,
                    (unsigned)s->mb_width * 4 * sizeof(s->ljpeg_buffer[0][0]));
     buffer = s->ljpeg_buffer;
 
-    for (i = 0; i < 3; i++)
-        buffer[0][i] = 1 << (s->bits + point_transform - 1);
+    for (i = 0; i < 4; i++)
+        buffer[0][i] = 1 << (s->bits - 1);
 
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
-        const int modified_predictor = mb_y ? predictor : 1;
         uint8_t *ptr = s->picture_ptr->data[0] + (linesize * mb_y);
 
         if (s->interlaced && s->bottom_field)
             ptr += linesize >> 1;
 
-        for (i = 0; i < 3; i++)
+        for (i = 0; i < 4; i++)
             top[i] = left[i] = topleft[i] = buffer[0][i];
 
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-            if (s->restart_interval && !s->restart_count)
+            int modified_predictor = predictor;
+
+            if (s->restart_interval && !s->restart_count){
                 s->restart_count = s->restart_interval;
+                resync_mb_x = mb_x;
+                resync_mb_y = mb_y;
+                for(i=0; i<4; i++)
+                    top[i] = left[i]= topleft[i]= 1 << (s->bits - 1);
+            }
+            if (mb_y == resync_mb_y || mb_y == resync_mb_y+1 && mb_x < resync_mb_x || !mb_x)
+                modified_predictor = 1;
 
-            for (i = 0; i < 3; i++) {
-                int pred;
+            for (i=0;i<nb_components;i++) {
+                int pred, dc;
 
                 topleft[i] = top[i];
                 top[i]     = buffer[mb_x][i];
 
                 PREDICT(pred, topleft[i], top[i], left[i], modified_predictor);
 
+                dc = mjpeg_decode_dc(s, s->dc_index[i]);
+                if(dc == 0xFFFFF)
+                    return -1;
+
                 left[i] = buffer[mb_x][i] =
-                    mask & (pred + (mjpeg_decode_dc(s, s->dc_index[i]) << point_transform));
+                    mask & (pred + (dc << point_transform));
             }
 
             if (s->restart_interval && !--s->restart_count) {
@@ -722,24 +995,54 @@ static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int predictor,
                 skip_bits(&s->gb, 16); /* skip RSTn */
             }
         }
-
-        if (s->rct) {
+        if (s->rct && s->nb_components == 4) {
+            for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                ptr[4*mb_x + 2] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2] - 0x200) >> 2);
+                ptr[4*mb_x + 1] = buffer[mb_x][1] + ptr[4*mb_x + 2];
+                ptr[4*mb_x + 3] = buffer[mb_x][2] + ptr[4*mb_x + 2];
+                ptr[4*mb_x + 0] = buffer[mb_x][3];
+            }
+        } else if (s->nb_components == 4) {
+            for(i=0; i<nb_components; i++) {
+                int c= s->comp_index[i];
+                if (s->bits <= 8) {
+                    for(mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                        ptr[4*mb_x+3-c] = buffer[mb_x][i];
+                    }
+                } else if(s->bits == 9) {
+                    return AVERROR_PATCHWELCOME;
+                } else {
+                    for(mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                        ((uint16_t*)ptr)[4*mb_x+c] = buffer[mb_x][i];
+                    }
+                }
+            }
+        } else if (s->rct) {
             for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-                ptr[4 * mb_x + 1] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2] - 0x200) >> 2);
-                ptr[4 * mb_x + 0] = buffer[mb_x][1] + ptr[4 * mb_x + 1];
-                ptr[4 * mb_x + 2] = buffer[mb_x][2] + ptr[4 * mb_x + 1];
+                ptr[3*mb_x + 1] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2] - 0x200) >> 2);
+                ptr[3*mb_x + 0] = buffer[mb_x][1] + ptr[3*mb_x + 1];
+                ptr[3*mb_x + 2] = buffer[mb_x][2] + ptr[3*mb_x + 1];
             }
         } else if (s->pegasus_rct) {
             for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-                ptr[4 * mb_x + 1] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2]) >> 2);
-                ptr[4 * mb_x + 0] = buffer[mb_x][1] + ptr[4 * mb_x + 1];
-                ptr[4 * mb_x + 2] = buffer[mb_x][2] + ptr[4 * mb_x + 1];
+                ptr[3*mb_x + 1] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2]) >> 2);
+                ptr[3*mb_x + 0] = buffer[mb_x][1] + ptr[3*mb_x + 1];
+                ptr[3*mb_x + 2] = buffer[mb_x][2] + ptr[3*mb_x + 1];
             }
         } else {
-            for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-                ptr[4 * mb_x + 0] = buffer[mb_x][2];
-                ptr[4 * mb_x + 1] = buffer[mb_x][1];
-                ptr[4 * mb_x + 2] = buffer[mb_x][0];
+            for(i=0; i<nb_components; i++) {
+                int c= s->comp_index[i];
+                if (s->bits <= 8) {
+                    for(mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                        ptr[3*mb_x+2-c] = buffer[mb_x][i];
+                    }
+                } else if(s->bits == 9) {
+                    return AVERROR_PATCHWELCOME;
+                } else {
+                    for(mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                        ((uint16_t*)ptr)[3*mb_x+2-c] = buffer[mb_x][i];
+                    }
+                }
             }
         }
     }
@@ -749,48 +1052,88 @@ static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int predictor,
 static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor,
                                  int point_transform, int nb_components)
 {
-    int i, mb_x, mb_y;
+    int i, mb_x, mb_y, mask;
+    int bits= (s->bits+7)&~7;
+    int resync_mb_y = 0;
+    int resync_mb_x = 0;
+
+    point_transform += bits - s->bits;
+    mask = ((1 << s->bits) - 1) << point_transform;
+
+    av_assert0(nb_components>=1 && nb_components<=4);
 
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-            if (s->restart_interval && !s->restart_count)
+            if (s->restart_interval && !s->restart_count){
                 s->restart_count = s->restart_interval;
+                resync_mb_x = mb_x;
+                resync_mb_y = mb_y;
+            }
 
-            if (mb_x == 0 || mb_y == 0 || s->interlaced) {
+            if(!mb_x || mb_y == resync_mb_y || mb_y == resync_mb_y+1 && mb_x < resync_mb_x || s->interlaced){
+                int toprow  = mb_y == resync_mb_y || mb_y == resync_mb_y+1 && mb_x < resync_mb_x;
+                int leftcol = !mb_x || mb_y == resync_mb_y && mb_x == resync_mb_x;
                 for (i = 0; i < nb_components; i++) {
                     uint8_t *ptr;
+                    uint16_t *ptr16;
                     int n, h, v, x, y, c, j, linesize;
-                    n        = s->nb_blocks[i];
-                    c        = s->comp_index[i];
-                    h        = s->h_scount[i];
-                    v        = s->v_scount[i];
-                    x        = 0;
-                    y        = 0;
-                    linesize = s->linesize[c];
-
-                    for (j = 0; j < n; j++) {
-                        int pred;
-                        // FIXME optimize this crap
-                        ptr = s->picture_ptr->data[c] +
-                              (linesize * (v * mb_y + y)) +
-                              (h * mb_x + x);
-                        if (y == 0 && mb_y == 0) {
-                            if (x == 0 && mb_x == 0)
-                                pred = 128 << point_transform;
-                            else
-                                pred = ptr[-1];
-                        } else {
-                            if (x == 0 && mb_x == 0)
-                                pred = ptr[-linesize];
-                            else
-                                PREDICT(pred, ptr[-linesize - 1],
-                                        ptr[-linesize], ptr[-1], predictor);
-                       }
+                    n = s->nb_blocks[i];
+                    c = s->comp_index[i];
+                    h = s->h_scount[i];
+                    v = s->v_scount[i];
+                    x = 0;
+                    y = 0;
+                    linesize= s->linesize[c];
+
+                    if(bits>8) linesize /= 2;
+
+                    for(j=0; j<n; j++) {
+                        int pred, dc;
+
+                        dc = mjpeg_decode_dc(s, s->dc_index[i]);
+                        if(dc == 0xFFFFF)
+                            return -1;
+                        if(bits<=8){
+                        ptr = s->picture_ptr->data[c] + (linesize * (v * mb_y + y)) + (h * mb_x + x); //FIXME optimize this crap
+                        if(y==0 && toprow){
+                            if(x==0 && leftcol){
+                                pred= 1 << (bits - 1);
+                            }else{
+                                pred= ptr[-1];
+                            }
+                        }else{
+                            if(x==0 && leftcol){
+                                pred= ptr[-linesize];
+                            }else{
+                                PREDICT(pred, ptr[-linesize-1], ptr[-linesize], ptr[-1], predictor);
+                            }
+                        }
 
                         if (s->interlaced && s->bottom_field)
                             ptr += linesize >> 1;
-                        *ptr = pred + (mjpeg_decode_dc(s, s->dc_index[i]) << point_transform);
+                        pred &= mask;
+                        *ptr= pred + (dc << point_transform);
+                        }else{
+                            ptr16 = (uint16_t*)(s->picture_ptr->data[c] + 2*(linesize * (v * mb_y + y)) + 2*(h * mb_x + x)); //FIXME optimize this crap
+                            if(y==0 && toprow){
+                                if(x==0 && leftcol){
+                                    pred= 1 << (bits - 1);
+                                }else{
+                                    pred= ptr16[-1];
+                                }
+                            }else{
+                                if(x==0 && leftcol){
+                                    pred= ptr16[-linesize];
+                                }else{
+                                    PREDICT(pred, ptr16[-linesize-1], ptr16[-linesize], ptr16[-1], predictor);
+                                }
+                            }
 
+                            if (s->interlaced && s->bottom_field)
+                                ptr16 += linesize >> 1;
+                            pred &= mask;
+                            *ptr16= pred + (dc << point_transform);
+                        }
                         if (++x == h) {
                             x = 0;
                             y++;
@@ -800,7 +1143,8 @@ static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor,
             } else {
                 for (i = 0; i < nb_components; i++) {
                     uint8_t *ptr;
-                    int n, h, v, x, y, c, j, linesize;
+                    uint16_t *ptr16;
+                    int n, h, v, x, y, c, j, linesize, dc;
                     n        = s->nb_blocks[i];
                     c        = s->comp_index[i];
                     h        = s->h_scount[i];
@@ -809,16 +1153,30 @@ static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor,
                     y        = 0;
                     linesize = s->linesize[c];
 
+                    if(bits>8) linesize /= 2;
+
                     for (j = 0; j < n; j++) {
                         int pred;
 
-                        // FIXME optimize this crap
-                        ptr = s->picture_ptr->data[c] +
+                        dc = mjpeg_decode_dc(s, s->dc_index[i]);
+                        if(dc == 0xFFFFF)
+                            return -1;
+                        if(bits<=8){
+                            ptr = s->picture_ptr->data[c] +
                               (linesize * (v * mb_y + y)) +
-                              (h * mb_x + x);
-                        PREDICT(pred, ptr[-linesize - 1],
-                                ptr[-linesize], ptr[-1], predictor);
-                        *ptr = pred + (mjpeg_decode_dc(s, s->dc_index[i]) << point_transform);
+                              (h * mb_x + x); //FIXME optimize this crap
+                            PREDICT(pred, ptr[-linesize-1], ptr[-linesize], ptr[-1], predictor);
+
+                            pred &= mask;
+                            *ptr = pred + (dc << point_transform);
+                        }else{
+                            ptr16 = (uint16_t*)(s->picture_ptr->data[c] + 2*(linesize * (v * mb_y + y)) + 2*(h * mb_x + x)); //FIXME optimize this crap
+                            PREDICT(pred, ptr16[-linesize-1], ptr16[-linesize], ptr16[-1], predictor);
+
+                            pred &= mask;
+                            *ptr16= pred + (dc << point_transform);
+                        }
+
                         if (++x == h) {
                             x = 0;
                             y++;
@@ -835,18 +1193,58 @@ static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor,
     return 0;
 }
 
+static av_always_inline void mjpeg_copy_block(MJpegDecodeContext *s,
+                                              uint8_t *dst, const uint8_t *src,
+                                              int linesize, int lowres)
+{
+    switch (lowres) {
+    case 0: s->hdsp.put_pixels_tab[1][0](dst, src, linesize, 8);
+        break;
+    case 1: copy_block4(dst, src, linesize, linesize, 4);
+        break;
+    case 2: copy_block2(dst, src, linesize, linesize, 2);
+        break;
+    case 3: *dst = *src;
+        break;
+    }
+}
+
+static void shift_output(MJpegDecodeContext *s, uint8_t *ptr, int linesize)
+{
+    int block_x, block_y;
+    int size = 8 >> s->avctx->lowres;
+    if (s->bits > 8) {
+        for (block_y=0; block_y<size; block_y++)
+            for (block_x=0; block_x<size; block_x++)
+                *(uint16_t*)(ptr + 2*block_x + block_y*linesize) <<= 16 - s->bits;
+    } else {
+        for (block_y=0; block_y<size; block_y++)
+            for (block_x=0; block_x<size; block_x++)
+                *(ptr + block_x + block_y*linesize) <<= 8 - s->bits;
+    }
+}
+
 static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                              int Al, const uint8_t *mb_bitmask,
+                             int mb_bitmask_size,
                              const AVFrame *reference)
 {
     int i, mb_x, mb_y;
     uint8_t *data[MAX_COMPONENTS];
     const uint8_t *reference_data[MAX_COMPONENTS];
     int linesize[MAX_COMPONENTS];
-    GetBitContext mb_bitmask_gb;
+    GetBitContext mb_bitmask_gb = {0}; // initialize to silence gcc warning
+    int bytes_per_pixel = 1 + (s->bits > 8);
 
-    if (mb_bitmask)
+    if (mb_bitmask) {
+        if (mb_bitmask_size != (s->mb_width * s->mb_height + 7)>>3) {
+            av_log(s->avctx, AV_LOG_ERROR, "mb_bitmask_size mismatches\n");
+            return AVERROR_INVALIDDATA;
+        }
         init_get_bits(&mb_bitmask_gb, mb_bitmask, s->mb_width * s->mb_height);
+    }
+
+    s->restart_count = 0;
 
     for (i = 0; i < nb_components; i++) {
         int c   = s->comp_index[i];
@@ -879,27 +1277,36 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                 x = 0;
                 y = 0;
                 for (j = 0; j < n; j++) {
-                    block_offset = ((linesize[c] * (v * mb_y + y) * 8) +
-                                    (h * mb_x + x) * 8);
+                    block_offset = (((linesize[c] * (v * mb_y + y) * 8) +
+                                     (h * mb_x + x) * 8 * bytes_per_pixel) >> s->avctx->lowres);
 
                     if (s->interlaced && s->bottom_field)
                         block_offset += linesize[c] >> 1;
-                    ptr = data[c] + block_offset;
+                    if (   8*(h * mb_x + x) < s->width
+                        && 8*(v * mb_y + y) < s->height) {
+                        ptr = data[c] + block_offset;
+                    } else
+                        ptr = NULL;
                     if (!s->progressive) {
-                        if (copy_mb)
-                            s->hdsp.put_pixels_tab[1][0](ptr,
-                                reference_data[c] + block_offset,
-                                linesize[c], 8);
-                        else {
+                        if (copy_mb) {
+                            if (ptr)
+                                mjpeg_copy_block(s, ptr, reference_data[c] + block_offset,
+                                                linesize[c], s->avctx->lowres);
+
+                        } else {
                             s->bdsp.clear_block(s->block);
                             if (decode_block(s, s->block, i,
                                              s->dc_index[i], s->ac_index[i],
-                                             s->quant_matrixes[s->quant_index[c]]) < 0) {
+                                             s->quant_matrixes[s->quant_sindex[i]]) < 0) {
                                 av_log(s->avctx, AV_LOG_ERROR,
                                        "error y=%d x=%d\n", mb_y, mb_x);
                                 return AVERROR_INVALIDDATA;
                             }
-                            s->idsp.idct_put(ptr, linesize[c], s->block);
+                            if (ptr) {
+                                s->idsp.idct_put(ptr, linesize[c], s->block);
+                                if (s->bits & 7)
+                                    shift_output(s, ptr, linesize[c]);
+                            }
                         }
                     } else {
                         int block_idx  = s->block_stride[c] * (v * mb_y + y) +
@@ -907,9 +1314,9 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                         int16_t *block = s->blocks[c][block_idx];
                         if (Ah)
                             block[0] += get_bits1(&s->gb) *
-                                        s->quant_matrixes[s->quant_index[c]][0] << Al;
+                                        s->quant_matrixes[s->quant_sindex[i]][0] << Al;
                         else if (decode_dc_progressive(s, block, i, s->dc_index[i],
-                                                       s->quant_matrixes[s->quant_index[c]],
+                                                       s->quant_matrixes[s->quant_sindex[i]],
                                                        Al) < 0) {
                             av_log(s->avctx, AV_LOG_ERROR,
                                    "error y=%d x=%d\n", mb_y, mb_x);
@@ -927,74 +1334,52 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                 }
             }
 
-            if (s->restart_interval) {
-                s->restart_count--;
-                i = 8 + ((-get_bits_count(&s->gb)) & 7);
-                /* skip RSTn */
-                if (show_bits(&s->gb, i) == (1 << i) - 1) {
-                    int pos = get_bits_count(&s->gb);
-                    align_get_bits(&s->gb);
-                    while (get_bits_left(&s->gb) >= 8 && show_bits(&s->gb, 8) == 0xFF)
-                        skip_bits(&s->gb, 8);
-                    if ((get_bits(&s->gb, 8) & 0xF8) == 0xD0) {
-                        for (i = 0; i < nb_components; i++) /* reset dc */
-                            s->last_dc[i] = 1024;
-                    } else
-                        skip_bits_long(&s->gb, pos - get_bits_count(&s->gb));
-                }
-            }
+            handle_rstn(s, nb_components);
         }
     }
     return 0;
 }
 
 static int mjpeg_decode_scan_progressive_ac(MJpegDecodeContext *s, int ss,
-                                            int se, int Ah, int Al,
-                                            const uint8_t *mb_bitmask,
-                                            const AVFrame *reference)
+                                            int se, int Ah, int Al)
 {
     int mb_x, mb_y;
     int EOBRUN = 0;
     int c = s->comp_index[0];
     uint8_t *data = s->picture_ptr->data[c];
-    const uint8_t *reference_data = reference ? reference->data[c] : NULL;
     int linesize  = s->linesize[c];
     int last_scan = 0;
-    int16_t *quant_matrix = s->quant_matrixes[s->quant_index[c]];
-    GetBitContext mb_bitmask_gb;
+    int16_t *quant_matrix = s->quant_matrixes[s->quant_sindex[0]];
+    int bytes_per_pixel = 1 + (s->bits > 8);
 
-    if (ss < 0  || ss >= 64 ||
-        se < ss || se >= 64 ||
-        Ah < 0  || Al < 0)
+    av_assert0(ss>=0 && Ah>=0 && Al>=0);
+    if (se < ss || se > 63) {
+        av_log(s->avctx, AV_LOG_ERROR, "SS/SE %d/%d is invalid\n", ss, se);
         return AVERROR_INVALIDDATA;
-
-    if (mb_bitmask)
-        init_get_bits(&mb_bitmask_gb, mb_bitmask, s->mb_width * s->mb_height);
+    }
 
     if (!Al) {
         // s->coefs_finished is a bitmask for coefficients coded
         // ss and se are parameters telling start and end coefficients
-        s->coefs_finished[c] |= (~0ULL >> (63 - (se - ss))) << ss;
+        s->coefs_finished[c] |= (2ULL << se) - (1ULL << ss);
         last_scan = !~s->coefs_finished[c];
     }
 
-    if (s->interlaced && s->bottom_field) {
-        int offset      = linesize >> 1;
-        data           += offset;
-        reference_data += offset;
-    }
+    if (s->interlaced && s->bottom_field)
+        data += linesize >> 1;
+
+    s->restart_count = 0;
 
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
-        int block_offset = mb_y * linesize * 8;
-        uint8_t *ptr     = data + block_offset;
+        uint8_t *ptr     = data + (mb_y * linesize * 8 >> s->avctx->lowres);
         int block_idx    = mb_y * s->block_stride[c];
         int16_t (*block)[64] = &s->blocks[c][block_idx];
         uint8_t *last_nnz    = &s->last_nnz[c][block_idx];
         for (mb_x = 0; mb_x < s->mb_width; mb_x++, block++, last_nnz++) {
-            const int copy_mb = mb_bitmask && !get_bits1(&mb_bitmask_gb);
-
-            if (!copy_mb) {
                 int ret;
+                if (s->restart_interval && !s->restart_count)
+                    s->restart_count = s->restart_interval;
+
                 if (Ah)
                     ret = decode_block_refinement(s, *block, last_nnz, s->ac_index[0],
                                                   quant_matrix, ss, se, Al, &EOBRUN);
@@ -1006,31 +1391,35 @@ static int mjpeg_decode_scan_progressive_ac(MJpegDecodeContext *s, int ss,
                            "error y=%d x=%d\n", mb_y, mb_x);
                     return AVERROR_INVALIDDATA;
                 }
-            }
 
             if (last_scan) {
-                if (copy_mb) {
-                    s->hdsp.put_pixels_tab[1][0](ptr,
-                                                 reference_data + block_offset,
-                                                 linesize, 8);
-                } else {
                     s->idsp.idct_put(ptr, linesize, *block);
-                    ptr += 8;
-                }
+                    if (s->bits & 7)
+                        shift_output(s, ptr, linesize);
+                    ptr += bytes_per_pixel*8 >> s->avctx->lowres;
             }
+            if (handle_rstn(s, 0))
+                EOBRUN = 0;
         }
     }
     return 0;
 }
 
 int ff_mjpeg_decode_sos(MJpegDecodeContext *s, const uint8_t *mb_bitmask,
-                        const AVFrame *reference)
+                        int mb_bitmask_size, const AVFrame *reference)
 {
     int len, nb_components, i, h, v, predictor, point_transform;
     int index, id, ret;
     const int block_size = s->lossless ? 1 : 8;
     int ilv, prev_shift;
 
+    if (!s->got_picture) {
+        av_log(s->avctx, AV_LOG_WARNING,
+                "Can not process SOS before SOF, skipping\n");
+        return -1;
+    }
+
+    av_assert0(s->picture_ptr->data[0]);
     /* XXX: verify len field validity */
     len = get_bits(&s->gb, 16);
     nb_components = get_bits(&s->gb, 8);
@@ -1060,27 +1449,35 @@ int ff_mjpeg_decode_sos(MJpegDecodeContext *s, const uint8_t *mb_bitmask,
             && nb_components == 3 && s->nb_components == 3 && i)
             index = 3 - i;
 
-        s->comp_index[i] = index;
-
+        s->quant_sindex[i] = s->quant_index[index];
         s->nb_blocks[i] = s->h_count[index] * s->v_count[index];
         s->h_scount[i]  = s->h_count[index];
         s->v_scount[i]  = s->v_count[index];
 
+        if(nb_components == 3 && s->nb_components == 3 && s->avctx->pix_fmt == AV_PIX_FMT_GBR24P)
+            index = (i+2)%3;
+        if(nb_components == 1 && s->nb_components == 3 && s->avctx->pix_fmt == AV_PIX_FMT_GBR24P)
+            index = (index+2)%3;
+
+        s->comp_index[i] = index;
+
         s->dc_index[i] = get_bits(&s->gb, 4);
         s->ac_index[i] = get_bits(&s->gb, 4);
 
         if (s->dc_index[i] <  0 || s->ac_index[i] < 0 ||
             s->dc_index[i] >= 4 || s->ac_index[i] >= 4)
             goto out_of_range;
-        if (!s->vlcs[0][s->dc_index[i]].table ||
-            !s->vlcs[1][s->ac_index[i]].table)
+        if (!s->vlcs[0][s->dc_index[i]].table || !(s->progressive ? s->vlcs[2][s->ac_index[0]].table : s->vlcs[1][s->ac_index[i]].table))
             goto out_of_range;
     }
 
     predictor = get_bits(&s->gb, 8);       /* JPEG Ss / lossless JPEG predictor /JPEG-LS NEAR */
     ilv = get_bits(&s->gb, 8);             /* JPEG Se / JPEG-LS ILV */
-    prev_shift      = get_bits(&s->gb, 4); /* Ah */
-    point_transform = get_bits(&s->gb, 4); /* Al */
+    if(s->avctx->codec_tag != AV_RL32("CJPG")){
+        prev_shift      = get_bits(&s->gb, 4); /* Ah */
+        point_transform = get_bits(&s->gb, 4); /* Al */
+    }else
+        prev_shift = point_transform = 0;
 
     if (nb_components > 1) {
         /* interleaved stream */
@@ -1097,10 +1494,10 @@ int ff_mjpeg_decode_sos(MJpegDecodeContext *s, const uint8_t *mb_bitmask,
     }
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
-        av_log(s->avctx, AV_LOG_DEBUG, "%s %s p:%d >>:%d ilv:%d bits:%d %s\n",
+        av_log(s->avctx, AV_LOG_DEBUG, "%s %s p:%d >>:%d ilv:%d bits:%d skip:%d %s comp:%d\n",
                s->lossless ? "lossless" : "sequential DCT", s->rgb ? "RGB" : "",
-               predictor, point_transform, ilv, s->bits,
-               s->pegasus_rct ? "PRCT" : (s->rct ? "RCT" : ""));
+               predictor, point_transform, ilv, s->bits, s->mjpb_skiptosod,
+               s->pegasus_rct ? "PRCT" : (s->rct ? "RCT" : ""), nb_components);
 
 
     /* mjpeg-b can have padding bytes between sos and image data, skip them */
@@ -1109,9 +1506,10 @@ int ff_mjpeg_decode_sos(MJpegDecodeContext *s, const uint8_t *mb_bitmask,
 
 next_field:
     for (i = 0; i < nb_components; i++)
-        s->last_dc[i] = 1024;
+        s->last_dc[i] = (4 << s->bits);
 
     if (s->lossless) {
+        av_assert0(s->picture_ptr == s->picture);
         if (CONFIG_JPEGLS_DECODER && s->ls) {
 //            for () {
 //            reset_ls_coding_parameters(s, 0);
@@ -1121,8 +1519,7 @@ next_field:
                 return ret;
         } else {
             if (s->rgb) {
-                if ((ret = ljpeg_decode_rgb_scan(s, predictor,
-                                                 point_transform)) < 0)
+                if ((ret = ljpeg_decode_rgb_scan(s, nb_components, predictor, point_transform)) < 0)
                     return ret;
             } else {
                 if ((ret = ljpeg_decode_yuv_scan(s, predictor,
@@ -1133,16 +1530,15 @@ next_field:
         }
     } else {
         if (s->progressive && predictor) {
+            av_assert0(s->picture_ptr == s->picture);
             if ((ret = mjpeg_decode_scan_progressive_ac(s, predictor,
                                                         ilv, prev_shift,
-                                                        point_transform,
-                                                        mb_bitmask,
-                                                        reference)) < 0)
+                                                        point_transform)) < 0)
                 return ret;
         } else {
             if ((ret = mjpeg_decode_scan(s, nb_components,
                                          prev_shift, point_transform,
-                                         mb_bitmask, reference)) < 0)
+                                         mb_bitmask, mb_bitmask_size, reference)) < 0)
                 return ret;
         }
     }
@@ -1153,7 +1549,7 @@ next_field:
         GetBitContext bak = s->gb;
         align_get_bits(&bak);
         if (show_bits(&bak, 16) == 0xFFD1) {
-            ff_dlog(s->avctx, "AVRn interlaced picture marker found\n");
+            av_log(s->avctx, AV_LOG_DEBUG, "AVRn interlaced picture marker found\n");
             s->gb = bak;
             skip_bits(&s->gb, 16);
             s->bottom_field ^= 1;
@@ -1186,22 +1582,24 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
     int len, id, i;
 
     len = get_bits(&s->gb, 16);
-    if (len < 5)
+    if (len < 6)
         return AVERROR_INVALIDDATA;
     if (8 * len > get_bits_left(&s->gb))
         return AVERROR_INVALIDDATA;
 
     id   = get_bits_long(&s->gb, 32);
-    id   = av_be2ne32(id);
     len -= 6;
 
-    if (s->avctx->debug & FF_DEBUG_STARTCODE)
-        av_log(s->avctx, AV_LOG_DEBUG, "APPx %8X\n", id);
+    if (s->avctx->debug & FF_DEBUG_STARTCODE) {
+        char id_str[32];
+        av_get_codec_tag_string(id_str, sizeof(id_str), av_bswap32(id));
+        av_log(s->avctx, AV_LOG_DEBUG, "APPx (%s / %8X) len=%d\n", id_str, id, len);
+    }
 
     /* Buggy AVID, it puts EOI only at every 10th frame. */
     /* Also, this fourcc is used by non-avid files too, it holds some
        information, but it's always present in AVID-created files. */
-    if (id == AV_RL32("AVI1")) {
+    if (id == AV_RB32("AVI1")) {
         /* structure:
             4bytes      AVI1
             1bytes      polarity
@@ -1209,12 +1607,9 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
             4bytes      field_size
             4bytes      field_size_less_padding
         */
-        s->buggy_avid = 1;
-        i = get_bits(&s->gb, 8);
-        if (i == 2)
-            s->bottom_field = 1;
-        else if (i == 1)
-            s->bottom_field = 0;
+            s->buggy_avid = 1;
+        i = get_bits(&s->gb, 8); len--;
+        av_log(s->avctx, AV_LOG_DEBUG, "polarity %d\n", i);
 #if 0
         skip_bits(&s->gb, 8);
         skip_bits(&s->gb, 32);
@@ -1226,7 +1621,7 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
 
 //    len -= 2;
 
-    if (id == AV_RL32("JFIF")) {
+    if (id == AV_RB32("JFIF")) {
         int t_w, t_h, v1, v2;
         skip_bits(&s->gb, 8); /* the trailing zero-byte */
         v1 = get_bits(&s->gb, 8);
@@ -1255,48 +1650,147 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
         goto out;
     }
 
-    if (id == AV_RL32("Adob") && (get_bits(&s->gb, 8) == 'e')) {
-        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
-            av_log(s->avctx, AV_LOG_INFO, "mjpeg: Adobe header found\n");
+    if (id == AV_RB32("Adob") && (get_bits(&s->gb, 8) == 'e')) {
         skip_bits(&s->gb, 16); /* version */
         skip_bits(&s->gb, 16); /* flags0 */
         skip_bits(&s->gb, 16); /* flags1 */
-        skip_bits(&s->gb,  8); /* transform */
+        s->adobe_transform = get_bits(&s->gb,  8);
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "mjpeg: Adobe header found, transform=%d\n", s->adobe_transform);
         len -= 7;
         goto out;
     }
 
-    if (id == AV_RL32("LJIF")) {
+    if (id == AV_RB32("LJIF")) {
+        int rgb = s->rgb;
+        int pegasus_rct = s->pegasus_rct;
         if (s->avctx->debug & FF_DEBUG_PICT_INFO)
             av_log(s->avctx, AV_LOG_INFO,
                    "Pegasus lossless jpeg header found\n");
         skip_bits(&s->gb, 16); /* version ? */
-        skip_bits(&s->gb, 16); /* unknwon always 0? */
-        skip_bits(&s->gb, 16); /* unknwon always 0? */
-        skip_bits(&s->gb, 16); /* unknwon always 0? */
-        switch (get_bits(&s->gb, 8)) {
+        skip_bits(&s->gb, 16); /* unknown always 0? */
+        skip_bits(&s->gb, 16); /* unknown always 0? */
+        skip_bits(&s->gb, 16); /* unknown always 0? */
+        switch (i=get_bits(&s->gb, 8)) {
         case 1:
-            s->rgb         = 1;
-            s->pegasus_rct = 0;
+            rgb         = 1;
+            pegasus_rct = 0;
             break;
         case 2:
-            s->rgb         = 1;
-            s->pegasus_rct = 1;
+            rgb         = 1;
+            pegasus_rct = 1;
             break;
         default:
-            av_log(s->avctx, AV_LOG_ERROR, "unknown colorspace\n");
+            av_log(s->avctx, AV_LOG_ERROR, "unknown colorspace %d\n", i);
         }
+
         len -= 9;
+        if (s->got_picture)
+            if (rgb != s->rgb || pegasus_rct != s->pegasus_rct) {
+                av_log(s->avctx, AV_LOG_WARNING, "Mismatching LJIF tag\n");
+                goto out;
+            }
+
+        s->rgb = rgb;
+        s->pegasus_rct = pegasus_rct;
+
+        goto out;
+    }
+    if (id == AV_RL32("colr") && len > 0) {
+        s->colr = get_bits(&s->gb, 8);
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "COLR %d\n", s->colr);
+        len --;
+        goto out;
+    }
+    if (id == AV_RL32("xfrm") && len > 0) {
+        s->xfrm = get_bits(&s->gb, 8);
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "XFRM %d\n", s->xfrm);
+        len --;
+        goto out;
+    }
+
+    /* JPS extension by VRex */
+    if (s->start_code == APP3 && id == AV_RB32("_JPS") && len >= 10) {
+        int flags, layout, type;
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "_JPSJPS_\n");
+
+        skip_bits(&s->gb, 32); len -= 4;  /* JPS_ */
+        skip_bits(&s->gb, 16); len -= 2;  /* block length */
+        skip_bits(&s->gb, 8);             /* reserved */
+        flags  = get_bits(&s->gb, 8);
+        layout = get_bits(&s->gb, 8);
+        type   = get_bits(&s->gb, 8);
+        len -= 4;
+
+        s->stereo3d = av_stereo3d_alloc();
+        if (!s->stereo3d) {
+            goto out;
+        }
+        if (type == 0) {
+            s->stereo3d->type = AV_STEREO3D_2D;
+        } else if (type == 1) {
+            switch (layout) {
+            case 0x01:
+                s->stereo3d->type = AV_STEREO3D_LINES;
+                break;
+            case 0x02:
+                s->stereo3d->type = AV_STEREO3D_SIDEBYSIDE;
+                break;
+            case 0x03:
+                s->stereo3d->type = AV_STEREO3D_TOPBOTTOM;
+                break;
+            }
+            if (!(flags & 0x04)) {
+                s->stereo3d->flags = AV_STEREO3D_FLAG_INVERT;
+            }
+        }
+        goto out;
+    }
+
+    /* EXIF metadata */
+    if (s->start_code == APP1 && id == AV_RB32("Exif") && len >= 2) {
+        GetByteContext gbytes;
+        int ret, le, ifd_offset, bytes_read;
+        const uint8_t *aligned;
+
+        skip_bits(&s->gb, 16); // skip padding
+        len -= 2;
+
+        // init byte wise reading
+        aligned = align_get_bits(&s->gb);
+        bytestream2_init(&gbytes, aligned, len);
+
+        // read TIFF header
+        ret = ff_tdecode_header(&gbytes, &le, &ifd_offset);
+        if (ret) {
+            av_log(s->avctx, AV_LOG_ERROR, "mjpeg: invalid TIFF header in EXIF data\n");
+        } else {
+            bytestream2_seek(&gbytes, ifd_offset, SEEK_SET);
+
+            // read 0th IFD and store the metadata
+            // (return values > 0 indicate the presence of subimage metadata)
+            ret = avpriv_exif_decode_ifd(s->avctx, &gbytes, le, 0, &s->exif_metadata);
+            if (ret < 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "mjpeg: error decoding EXIF data\n");
+            }
+        }
+
+        bytes_read = bytestream2_tell(&gbytes);
+        skip_bits(&s->gb, bytes_read << 3);
+        len -= bytes_read;
+
         goto out;
     }
 
     /* Apple MJPEG-A */
     if ((s->start_code == APP1) && (len > (0x28 - 8))) {
         id   = get_bits_long(&s->gb, 32);
-        id   = av_be2ne32(id);
         len -= 4;
         /* Apple MJPEG-A */
-        if (id == AV_RL32("mjpg")) {
+        if (id == AV_RB32("mjpg")) {
 #if 0
             skip_bits(&s->gb, 32); /* field size */
             skip_bits(&s->gb, 32); /* pad field size */
@@ -1338,16 +1832,18 @@ static int mjpeg_decode_com(MJpegDecodeContext *s)
                 cbuf[i] = 0;
 
             if (s->avctx->debug & FF_DEBUG_PICT_INFO)
-                av_log(s->avctx, AV_LOG_INFO, "mjpeg comment: '%s'\n", cbuf);
+                av_log(s->avctx, AV_LOG_INFO, "comment: '%s'\n", cbuf);
 
             /* buggy avid, it puts EOI only at every 10th frame */
-            if (!strcmp(cbuf, "AVID")) {
-                s->buggy_avid = 1;
+            if (!strncmp(cbuf, "AVID", 4)) {
+                parse_avid(s, cbuf, len);
             } else if (!strcmp(cbuf, "CS=ITU601"))
                 s->cs_itu601 = 1;
-            else if ((len > 20 && !strncmp(cbuf, "Intel(R) JPEG Library", 21)) ||
-                     (len > 19 && !strncmp(cbuf, "Metasoft MJPEG Codec", 20)))
+            else if ((!strncmp(cbuf, "Intel(R) JPEG Library, version 1", 32) && s->avctx->codec_tag) ||
+                     (!strncmp(cbuf, "Metasoft MJPEG Codec", 20)))
                 s->flipped = 1;
+            else if (!strcmp(cbuf, "MULTISCOPE II"))
+                s->multiscope = 2;
 
             av_free(cbuf);
         }
@@ -1363,22 +1859,19 @@ static int find_marker(const uint8_t **pbuf_ptr, const uint8_t *buf_end)
     const uint8_t *buf_ptr;
     unsigned int v, v2;
     int val;
-#ifdef DEBUG
     int skipped = 0;
-#endif
 
     buf_ptr = *pbuf_ptr;
-    while (buf_ptr < buf_end) {
+    while (buf_end - buf_ptr > 1) {
         v  = *buf_ptr++;
         v2 = *buf_ptr;
         if ((v == 0xff) && (v2 >= 0xc0) && (v2 <= 0xfe) && buf_ptr < buf_end) {
             val = *buf_ptr++;
             goto found;
         }
-#ifdef DEBUG
         skipped++;
-#endif
     }
+    buf_ptr = buf_end;
     val = -1;
 found:
     ff_dlog(NULL, "find_marker skipped %d bytes\n", skipped);
@@ -1424,7 +1917,7 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
         memset(s->buffer + *unescaped_buf_size, 0,
                AV_INPUT_BUFFER_PADDING_SIZE);
 
-        av_log(s->avctx, AV_LOG_DEBUG, "escaping removed %td bytes\n",
+        av_log(s->avctx, AV_LOG_DEBUG, "escaping removed %"PTRDIFF_SPECIFIER" bytes\n",
                (buf_end - *buf_ptr) - (dst - s->buffer));
     } else if (start_code == SOS && s->ls) {
         const uint8_t *src = *buf_ptr;
@@ -1433,8 +1926,6 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
         int t = 0, b = 0;
         PutBitContext pb;
 
-        s->cur_scan++;
-
         /* find marker */
         while (src + t < buf_end) {
             uint8_t x = src[t++];
@@ -1442,7 +1933,7 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
                 while ((src + t < buf_end) && x == 0xff)
                     x = src[t++];
                 if (x & 0x80) {
-                    t -= 2;
+                    t -= FFMIN(2, t);
                     break;
                 }
             }
@@ -1456,6 +1947,10 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
             put_bits(&pb, 8, x);
             if (x == 0xFF) {
                 x = src[b++];
+                if (x & 0x80) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Invalid escape sequence\n");
+                    x &= 0x7f;
+                }
                 put_bits(&pb, 7, x);
                 bit_count--;
             }
@@ -1483,11 +1978,17 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     MJpegDecodeContext *s = avctx->priv_data;
     const uint8_t *buf_end, *buf_ptr;
     const uint8_t *unescaped_buf_ptr;
+    int hshift, vshift;
     int unescaped_buf_size;
     int start_code;
+    int i, index;
     int ret = 0;
+    int is16bit;
+
+    av_dict_free(&s->exif_metadata);
+    av_freep(&s->stereo3d);
+    s->adobe_transform = -1;
 
-    s->got_picture = 0; // picture from previous image can not be reused
     buf_ptr = buf;
     buf_end = buf + buf_size;
     while (buf_ptr < buf_end) {
@@ -1497,21 +1998,22 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                                           &unescaped_buf_size);
         /* EOF */
         if (start_code < 0) {
-            goto the_end;
+            break;
         } else if (unescaped_buf_size > INT_MAX / 8) {
             av_log(avctx, AV_LOG_ERROR,
                    "MJPEG packet 0x%x too big (%d/%d), corrupt data?\n",
                    start_code, unescaped_buf_size, buf_size);
             return AVERROR_INVALIDDATA;
         }
-
-        av_log(avctx, AV_LOG_DEBUG, "marker=%x avail_size_in_buf=%td\n",
+        av_log(avctx, AV_LOG_DEBUG, "marker=%x avail_size_in_buf=%"PTRDIFF_SPECIFIER"\n",
                start_code, buf_end - buf_ptr);
 
-        ret = init_get_bits(&s->gb, unescaped_buf_ptr,
-                            unescaped_buf_size * 8);
-        if (ret < 0)
-            return ret;
+        ret = init_get_bits8(&s->gb, unescaped_buf_ptr, unescaped_buf_size);
+
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "invalid buffer\n");
+            goto fail;
+        }
 
         s->start_code = start_code;
         if (s->avctx->debug & FF_DEBUG_STARTCODE)
@@ -1528,6 +2030,8 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         else if (start_code == COM)
             mjpeg_decode_com(s);
 
+        ret = -1;
+
         if (!CONFIG_JPEGLS_DECODER &&
             (start_code == SOF48 || start_code == LSE)) {
             av_log(avctx, AV_LOG_ERROR, "JPEG-LS support not enabled.\n");
@@ -1546,7 +2050,7 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         case DHT:
             if ((ret = ff_mjpeg_decode_dht(s)) < 0) {
                 av_log(avctx, AV_LOG_ERROR, "huffman table decode error\n");
-                return ret;
+                goto fail;
             }
             break;
         case SOF0:
@@ -1555,39 +2059,39 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             s->ls          = 0;
             s->progressive = 0;
             if ((ret = ff_mjpeg_decode_sof(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case SOF2:
             s->lossless    = 0;
             s->ls          = 0;
             s->progressive = 1;
             if ((ret = ff_mjpeg_decode_sof(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case SOF3:
+            s->avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
             s->lossless    = 1;
             s->ls          = 0;
             s->progressive = 0;
             if ((ret = ff_mjpeg_decode_sof(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case SOF48:
+            s->avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
             s->lossless    = 1;
             s->ls          = 1;
             s->progressive = 0;
             if ((ret = ff_mjpeg_decode_sof(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case LSE:
             if (!CONFIG_JPEGLS_DECODER ||
                 (ret = ff_jpegls_decode_lse(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case EOI:
-            s->cur_scan = 0;
-            if ((s->buggy_avid && !s->interlaced) || s->restart_interval)
-                break;
 eoi_parser:
+            s->cur_scan = 0;
             if (!s->got_picture) {
                 av_log(avctx, AV_LOG_WARNING,
                        "Found EOI before any SOF, ignoring\n");
@@ -1597,43 +2101,34 @@ eoi_parser:
                 s->bottom_field ^= 1;
                 /* if not bottom field, do not output image yet */
                 if (s->bottom_field == !s->interlace_polarity)
-                    goto not_the_end;
+                    break;
             }
             if ((ret = av_frame_ref(frame, s->picture_ptr)) < 0)
                 return ret;
-            if (s->flipped) {
-                int i;
-                for (i = 0; frame->data[i]; i++) {
-                    int h = frame->height >> ((i == 1 || i == 2) ?
-                                              s->pix_desc->log2_chroma_h : 0);
-                    frame->data[i] += frame->linesize[i] * (h - 1);
-                    frame->linesize[i] *= -1;
-                }
-            }
             *got_frame = 1;
+            s->got_picture = 0;
+
+            if (!s->lossless) {
+                int qp = FFMAX3(s->qscale[0],
+                                s->qscale[1],
+                                s->qscale[2]);
+                int qpw = (s->width + 15) / 16;
+                AVBufferRef *qp_table_buf = av_buffer_alloc(qpw);
+                if (qp_table_buf) {
+                    memset(qp_table_buf->data, qp, qpw);
+                    av_frame_set_qp_table(data, qp_table_buf, 0, FF_QSCALE_TYPE_MPEG1);
+                }
 
-            if (!s->lossless &&
-                avctx->debug & FF_DEBUG_QP) {
-                av_log(avctx, AV_LOG_DEBUG,
-                       "QP: %d\n", FFMAX3(s->qscale[0],
-                                          s->qscale[1],
-                                          s->qscale[2]));
+                if(avctx->debug & FF_DEBUG_QP)
+                    av_log(avctx, AV_LOG_DEBUG, "QP: %d\n", qp);
             }
 
             goto the_end;
         case SOS:
-            if (!s->got_picture) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "Can not process SOS before SOF, skipping\n");
-                break;
-                }
-            if ((ret = ff_mjpeg_decode_sos(s, NULL, NULL)) < 0 &&
+            s->cur_scan++;
+            if ((ret = ff_mjpeg_decode_sos(s, NULL, 0, NULL)) < 0 &&
                 (avctx->err_recognition & AV_EF_EXPLODE))
-                return ret;
-            /* buggy avid puts EOI every 10-20th frame */
-            /* if restart period is over process EOI */
-            if ((s->buggy_avid && !s->interlaced) || s->restart_interval)
-                goto eoi_parser;
+                goto fail;
             break;
         case DRI:
             mjpeg_decode_dri(s);
@@ -1653,21 +2148,203 @@ eoi_parser:
             break;
         }
 
-not_the_end:
         /* eof process start code */
         buf_ptr += (get_bits_count(&s->gb) + 7) / 8;
         av_log(avctx, AV_LOG_DEBUG,
                "marker parser used %d bytes (%d bits)\n",
                (get_bits_count(&s->gb) + 7) / 8, get_bits_count(&s->gb));
     }
-    if (s->got_picture) {
+    if (s->got_picture && s->cur_scan) {
         av_log(avctx, AV_LOG_WARNING, "EOI missing, emulating\n");
         goto eoi_parser;
     }
     av_log(avctx, AV_LOG_FATAL, "No JPEG data found in image\n");
     return AVERROR_INVALIDDATA;
+fail:
+    s->got_picture = 0;
+    return ret;
 the_end:
-    av_log(avctx, AV_LOG_DEBUG, "mjpeg decode frame unused %td bytes\n",
+
+    is16bit = av_pix_fmt_desc_get(s->avctx->pix_fmt)->comp[0].step > 1;
+
+    if (AV_RB32(s->upscale_h)) {
+        int p;
+        av_assert0(avctx->pix_fmt == AV_PIX_FMT_YUVJ444P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV444P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ440P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV440P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA444P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ420P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV420P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV420P16||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA420P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA420P16||
+                   avctx->pix_fmt == AV_PIX_FMT_GBRP     ||
+                   avctx->pix_fmt == AV_PIX_FMT_GBRAP
+                  );
+        avcodec_get_chroma_sub_sample(s->avctx->pix_fmt, &hshift, &vshift);
+        for (p = 0; p<4; p++) {
+            uint8_t *line = s->picture_ptr->data[p];
+            int w = s->width;
+            int h = s->height;
+            if (!s->upscale_h[p])
+                continue;
+            if (p==1 || p==2) {
+                w = FF_CEIL_RSHIFT(w, hshift);
+                h = FF_CEIL_RSHIFT(h, vshift);
+            }
+            if (s->upscale_v[p])
+                h = (h+1)>>1;
+            av_assert0(w > 0);
+            for (i = 0; i < h; i++) {
+                if (s->upscale_h[p] == 1) {
+                    if (is16bit) ((uint16_t*)line)[w - 1] = ((uint16_t*)line)[(w - 1) / 2];
+                    else                      line[w - 1] = line[(w - 1) / 2];
+                    for (index = w - 2; index > 0; index--) {
+                        if (is16bit)
+                            ((uint16_t*)line)[index] = (((uint16_t*)line)[index / 2] + ((uint16_t*)line)[(index + 1) / 2]) >> 1;
+                        else
+                            line[index] = (line[index / 2] + line[(index + 1) / 2]) >> 1;
+                    }
+                } else if (s->upscale_h[p] == 2) {
+                    if (is16bit) {
+                        ((uint16_t*)line)[w - 1] = ((uint16_t*)line)[(w - 1) / 3];
+                        if (w > 1)
+                            ((uint16_t*)line)[w - 2] = ((uint16_t*)line)[w - 1];
+                    } else {
+                        line[w - 1] = line[(w - 1) / 3];
+                        if (w > 1)
+                            line[w - 2] = line[w - 1];
+                    }
+                    for (index = w - 3; index > 0; index--) {
+                        line[index] = (line[index / 3] + line[(index + 1) / 3] + line[(index + 2) / 3] + 1) / 3;
+                    }
+                }
+                line += s->linesize[p];
+            }
+        }
+    }
+    if (AV_RB32(s->upscale_v)) {
+        int p;
+        av_assert0(avctx->pix_fmt == AV_PIX_FMT_YUVJ444P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV444P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ422P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV422P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ420P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV420P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV440P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ440P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA444P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA420P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA420P16||
+                   avctx->pix_fmt == AV_PIX_FMT_GBRP     ||
+                   avctx->pix_fmt == AV_PIX_FMT_GBRAP
+                   );
+        avcodec_get_chroma_sub_sample(s->avctx->pix_fmt, &hshift, &vshift);
+        for (p = 0; p < 4; p++) {
+            uint8_t *dst;
+            int w = s->width;
+            int h = s->height;
+            if (!s->upscale_v[p])
+                continue;
+            if (p==1 || p==2) {
+                w = FF_CEIL_RSHIFT(w, hshift);
+                h = FF_CEIL_RSHIFT(h, vshift);
+            }
+            dst = &((uint8_t *)s->picture_ptr->data[p])[(h - 1) * s->linesize[p]];
+            for (i = h - 1; i; i--) {
+                uint8_t *src1 = &((uint8_t *)s->picture_ptr->data[p])[i / 2 * s->linesize[p]];
+                uint8_t *src2 = &((uint8_t *)s->picture_ptr->data[p])[(i + 1) / 2 * s->linesize[p]];
+                if (src1 == src2 || i == h - 1) {
+                    memcpy(dst, src1, w);
+                } else {
+                    for (index = 0; index < w; index++)
+                        dst[index] = (src1[index] + src2[index]) >> 1;
+                }
+                dst -= s->linesize[p];
+            }
+        }
+    }
+    if (s->flipped) {
+        int j;
+        avcodec_get_chroma_sub_sample(s->avctx->pix_fmt, &hshift, &vshift);
+        for (index=0; index<4; index++) {
+            uint8_t *dst = s->picture_ptr->data[index];
+            int w = s->picture_ptr->width;
+            int h = s->picture_ptr->height;
+            if(index && index<3){
+                w = FF_CEIL_RSHIFT(w, hshift);
+                h = FF_CEIL_RSHIFT(h, vshift);
+            }
+            if(dst){
+                uint8_t *dst2 = dst + s->picture_ptr->linesize[index]*(h-1);
+                for (i=0; i<h/2; i++) {
+                    for (j=0; j<w; j++)
+                        FFSWAP(int, dst[j], dst2[j]);
+                    dst  += s->picture_ptr->linesize[index];
+                    dst2 -= s->picture_ptr->linesize[index];
+                }
+            }
+        }
+    }
+    if (s->adobe_transform == 0 && s->avctx->pix_fmt == AV_PIX_FMT_GBRAP) {
+        int w = s->picture_ptr->width;
+        int h = s->picture_ptr->height;
+        for (i=0; i<h; i++) {
+            int j;
+            uint8_t *dst[4];
+            for (index=0; index<4; index++) {
+                dst[index] =   s->picture_ptr->data[index]
+                             + s->picture_ptr->linesize[index]*i;
+            }
+            for (j=0; j<w; j++) {
+                int k = dst[3][j];
+                int r = dst[0][j] * k;
+                int g = dst[1][j] * k;
+                int b = dst[2][j] * k;
+                dst[0][j] = g*257 >> 16;
+                dst[1][j] = b*257 >> 16;
+                dst[2][j] = r*257 >> 16;
+                dst[3][j] = 255;
+            }
+        }
+    }
+    if (s->adobe_transform == 2 && s->avctx->pix_fmt == AV_PIX_FMT_YUVA444P) {
+        int w = s->picture_ptr->width;
+        int h = s->picture_ptr->height;
+        for (i=0; i<h; i++) {
+            int j;
+            uint8_t *dst[4];
+            for (index=0; index<4; index++) {
+                dst[index] =   s->picture_ptr->data[index]
+                             + s->picture_ptr->linesize[index]*i;
+            }
+            for (j=0; j<w; j++) {
+                int k = dst[3][j];
+                int r = (255 - dst[0][j]) * k;
+                int g = (128 - dst[1][j]) * k;
+                int b = (128 - dst[2][j]) * k;
+                dst[0][j] = r*257 >> 16;
+                dst[1][j] = (g*257 >> 16) + 128;
+                dst[2][j] = (b*257 >> 16) + 128;
+                dst[3][j] = 255;
+            }
+        }
+    }
+
+    if (s->stereo3d) {
+        AVStereo3D *stereo = av_stereo3d_create_side_data(data);
+        if (stereo) {
+            stereo->type  = s->stereo3d->type;
+            stereo->flags = s->stereo3d->flags;
+        }
+        av_freep(&s->stereo3d);
+    }
+
+    av_dict_copy(avpriv_frame_get_metadatap(data), s->exif_metadata, 0);
+    av_dict_free(&s->exif_metadata);
+
+    av_log(avctx, AV_LOG_DEBUG, "decode frame unused %"PTRDIFF_SPECIFIER" bytes\n",
            buf_end - buf_ptr);
 //  return buf_end - buf_ptr;
     return buf_ptr - buf;
@@ -1678,13 +2355,18 @@ av_cold int ff_mjpeg_decode_end(AVCodecContext *avctx)
     MJpegDecodeContext *s = avctx->priv_data;
     int i, j;
 
+    if (s->interlaced && s->bottom_field == !s->interlace_polarity && s->got_picture && !avctx->frame_number) {
+        av_log(avctx, AV_LOG_INFO, "Single field\n");
+    }
+
     if (s->picture) {
         av_frame_free(&s->picture);
         s->picture_ptr = NULL;
     } else if (s->picture_ptr)
         av_frame_unref(s->picture_ptr);
 
-    av_free(s->buffer);
+    av_freep(&s->buffer);
+    av_freep(&s->stereo3d);
     av_freep(&s->ljpeg_buffer);
     s->ljpeg_buffer_size = 0;
 
@@ -1696,9 +2378,17 @@ av_cold int ff_mjpeg_decode_end(AVCodecContext *avctx)
         av_freep(&s->blocks[i]);
         av_freep(&s->last_nnz[i]);
     }
+    av_dict_free(&s->exif_metadata);
     return 0;
 }
 
+static void decode_flush(AVCodecContext *avctx)
+{
+    MJpegDecodeContext *s = avctx->priv_data;
+    s->got_picture = 0;
+}
+
+#if CONFIG_MJPEG_DECODER
 #define OFFSET(x) offsetof(MJpegDecodeContext, x)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 static const AVOption options[] = {
@@ -1723,11 +2413,14 @@ AVCodec ff_mjpeg_decoder = {
     .init           = ff_mjpeg_decode_init,
     .close          = ff_mjpeg_decode_end,
     .decode         = ff_mjpeg_decode_frame,
+    .flush          = decode_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .priv_class     = &mjpegdec_class,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
-
+#endif
+#if CONFIG_THP_DECODER
 AVCodec ff_thp_decoder = {
     .name           = "thp",
     .long_name      = NULL_IF_CONFIG_SMALL("Nintendo Gamecube THP video"),
@@ -1737,6 +2430,9 @@ AVCodec ff_thp_decoder = {
     .init           = ff_mjpeg_decode_init,
     .close          = ff_mjpeg_decode_end,
     .decode         = ff_mjpeg_decode_frame,
+    .flush          = decode_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
+#endif
diff --git a/libavcodec/mjpegdec.h b/libavcodec/mjpegdec.h
index aa4703a24d..fb811294a1 100644
--- a/libavcodec/mjpegdec.h
+++ b/libavcodec/mjpegdec.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2003 Alex Beregszaszi
  * Copyright (c) 2003-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 
 #include "libavutil/log.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/stereo3d.h"
 
 #include "avcodec.h"
 #include "blockdsp.h"
@@ -61,14 +62,19 @@ typedef struct MJpegDecodeContext {
     int ls;
     int progressive;
     int rgb;
+    uint8_t upscale_h[4];
+    uint8_t upscale_v[4];
     int rct;            /* standard rct */
     int pegasus_rct;    /* pegasus reversible colorspace transform */
     int bits;           /* bits per component */
+    int colr;
+    int xfrm;
+    int adobe_transform;
 
     int maxval;
     int near;         ///< near lossless bound (si 0 for lossless)
     int t1,t2,t3;
-    int reset;        ///< context halfing intervall ?rename
+    int reset;        ///< context halfing interval ?rename
 
     int width, height;
     int mb_width, mb_height;
@@ -83,6 +89,7 @@ typedef struct MJpegDecodeContext {
     int nb_blocks[MAX_COMPONENTS];
     int h_scount[MAX_COMPONENTS];
     int v_scount[MAX_COMPONENTS];
+    int quant_sindex[MAX_COMPONENTS];
     int h_max, v_max; /* maximum h and v counts */
     int quant_index[4];   /* quant table index for each component */
     int last_dc[MAX_COMPONENTS]; /* last DEQUANTIZED dc (XXX: am I right to do that ?) */
@@ -95,6 +102,7 @@ typedef struct MJpegDecodeContext {
     int16_t (*blocks[MAX_COMPONENTS])[64]; ///< intermediate sums (progressive mode)
     uint8_t *last_nnz[MAX_COMPONENTS];
     uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode)
+    int palette_index;
     ScanTable scantable;
     BlockDSPContext bdsp;
     HpelDSPContext hdsp;
@@ -106,6 +114,7 @@ typedef struct MJpegDecodeContext {
     int buggy_avid;
     int cs_itu601;
     int interlace_polarity;
+    int multiscope;
 
     int mjpb_skiptosod;
 
@@ -116,6 +125,9 @@ typedef struct MJpegDecodeContext {
     unsigned int ljpeg_buffer_size;
 
     int extern_huff;
+    AVDictionary *exif_metadata;
+
+    AVStereo3D *stereo3d; ///!< stereoscopic information (cached, since it is read before frame allocation)
 
     const AVPixFmtDescriptor *pix_desc;
 } MJpegDecodeContext;
@@ -129,7 +141,8 @@ int ff_mjpeg_decode_dqt(MJpegDecodeContext *s);
 int ff_mjpeg_decode_dht(MJpegDecodeContext *s);
 int ff_mjpeg_decode_sof(MJpegDecodeContext *s);
 int ff_mjpeg_decode_sos(MJpegDecodeContext *s,
-                        const uint8_t *mb_bitmask, const AVFrame *reference);
+                        const uint8_t *mb_bitmask,int mb_bitmask_size,
+                        const AVFrame *reference);
 int ff_mjpeg_find_marker(MJpegDecodeContext *s,
                          const uint8_t **buf_ptr, const uint8_t *buf_end,
                          const uint8_t **unescaped_buf_ptr, int *unescaped_buf_size);
diff --git a/libavcodec/mjpegenc.c b/libavcodec/mjpegenc.c
index 6724310d98..f15de58c2c 100644
--- a/libavcodec/mjpegenc.c
+++ b/libavcodec/mjpegenc.c
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,8 +30,6 @@
  * MJPEG encoder.
  */
 
-#include <assert.h>
-
 #include "libavutil/pixdesc.h"
 
 #include "avcodec.h"
@@ -41,10 +39,44 @@
 #include "mjpeg.h"
 #include "mjpegenc.h"
 
+static uint8_t uni_ac_vlc_len[64 * 64 * 2];
+static uint8_t uni_chroma_ac_vlc_len[64 * 64 * 2];
+
+static av_cold void init_uni_ac_vlc(const uint8_t huff_size_ac[256], uint8_t *uni_ac_vlc_len)
+{
+    int i;
+
+    for (i = 0; i < 128; i++) {
+        int level = i - 64;
+        int run;
+        if (!level)
+            continue;
+        for (run = 0; run < 64; run++) {
+            int len, code, nbits;
+            int alevel = FFABS(level);
+
+            len = (run >> 4) * huff_size_ac[0xf0];
+
+            nbits= av_log2_16bit(alevel) + 1;
+            code = ((15&run) << 4) | nbits;
+
+            len += huff_size_ac[code] + nbits;
+
+            uni_ac_vlc_len[UNI_AC_ENC_INDEX(run, i)] = len;
+            // We ignore EOB as its just a constant which does not change generally
+        }
+    }
+}
+
 av_cold int ff_mjpeg_encode_init(MpegEncContext *s)
 {
     MJpegContext *m;
 
+    if (s->width > 65500 || s->height > 65500) {
+        av_log(s, AV_LOG_ERROR, "JPEG does not support resolutions above 65500x65500\n");
+        return AVERROR(EINVAL);
+    }
+
     m = av_malloc(sizeof(MJpegContext));
     if (!m)
         return AVERROR(ENOMEM);
@@ -70,13 +102,20 @@ av_cold int ff_mjpeg_encode_init(MpegEncContext *s)
                                  avpriv_mjpeg_bits_ac_chrominance,
                                  avpriv_mjpeg_val_ac_chrominance);
 
+    init_uni_ac_vlc(m->huff_size_ac_luminance,   uni_ac_vlc_len);
+    init_uni_ac_vlc(m->huff_size_ac_chrominance, uni_chroma_ac_vlc_len);
+    s->intra_ac_vlc_length      =
+    s->intra_ac_vlc_last_length = uni_ac_vlc_len;
+    s->intra_chroma_ac_vlc_length      =
+    s->intra_chroma_ac_vlc_last_length = uni_chroma_ac_vlc_len;
+
     s->mjpeg_ctx = m;
     return 0;
 }
 
-void ff_mjpeg_encode_close(MpegEncContext *s)
+av_cold void ff_mjpeg_encode_close(MpegEncContext *s)
 {
-    av_free(s->mjpeg_ctx);
+    av_freep(&s->mjpeg_ctx);
 }
 
 static void encode_block(MpegEncContext *s, int16_t *block, int n)
@@ -122,7 +161,7 @@ static void encode_block(MpegEncContext *s, int16_t *block, int n)
                 mant--;
             }
 
-            nbits= av_log2(val) + 1;
+            nbits= av_log2_16bit(val) + 1;
             code = (run << 4) | nbits;
 
             put_bits(&s->pb, huff_size_ac[code], huff_code_ac[code]);
@@ -137,23 +176,92 @@ static void encode_block(MpegEncContext *s, int16_t *block, int n)
         put_bits(&s->pb, huff_size_ac[0], huff_code_ac[0]);
 }
 
-void ff_mjpeg_encode_mb(MpegEncContext *s, int16_t block[8][64])
+void ff_mjpeg_encode_mb(MpegEncContext *s, int16_t block[12][64])
 {
     int i;
-    for(i=0;i<5;i++) {
-        encode_block(s, block[i], i);
-    }
-    if (s->chroma_format == CHROMA_420) {
+    if (s->chroma_format == CHROMA_444) {
+        encode_block(s, block[0], 0);
+        encode_block(s, block[2], 2);
+        encode_block(s, block[4], 4);
+        encode_block(s, block[8], 8);
         encode_block(s, block[5], 5);
+        encode_block(s, block[9], 9);
+
+        if (16*s->mb_x+8 < s->width) {
+            encode_block(s, block[1], 1);
+            encode_block(s, block[3], 3);
+            encode_block(s, block[6], 6);
+            encode_block(s, block[10], 10);
+            encode_block(s, block[7], 7);
+            encode_block(s, block[11], 11);
+        }
     } else {
-        encode_block(s, block[6], 6);
-        encode_block(s, block[5], 5);
-        encode_block(s, block[7], 7);
+        for(i=0;i<5;i++) {
+            encode_block(s, block[i], i);
+        }
+        if (s->chroma_format == CHROMA_420) {
+            encode_block(s, block[5], 5);
+        } else {
+            encode_block(s, block[6], 6);
+            encode_block(s, block[5], 5);
+            encode_block(s, block[7], 7);
+        }
     }
 
     s->i_tex_bits += get_bits_diff(s);
 }
 
+// maximum over s->mjpeg_vsample[i]
+#define V_MAX 2
+static int amv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
+                              const AVFrame *pic_arg, int *got_packet)
+
+{
+    MpegEncContext *s = avctx->priv_data;
+    AVFrame *pic;
+    int i, ret;
+    int chroma_h_shift, chroma_v_shift;
+
+    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &chroma_h_shift, &chroma_v_shift);
+
+#if FF_API_EMU_EDGE
+    //CODEC_FLAG_EMU_EDGE have to be cleared
+    if(s->avctx->flags & CODEC_FLAG_EMU_EDGE)
+        return AVERROR(EINVAL);
+#endif
+
+    if ((avctx->height & 15) && avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Heights which are not a multiple of 16 might fail with some decoders, "
+               "use vstrict=-1 / -strict -1 to use %d anyway.\n", avctx->height);
+        av_log(avctx, AV_LOG_WARNING, "If you have a device that plays AMV videos, please test if videos "
+               "with such heights work with it and report your findings to ffmpeg-devel@ffmpeg.org\n");
+        return AVERROR_EXPERIMENTAL;
+    }
+
+    pic = av_frame_clone(pic_arg);
+    if (!pic)
+        return AVERROR(ENOMEM);
+    //picture should be flipped upside-down
+    for(i=0; i < 3; i++) {
+        int vsample = i ? 2 >> chroma_v_shift : 2;
+        pic->data[i] += pic->linesize[i] * (vsample * s->height / V_MAX - 1);
+        pic->linesize[i] *= -1;
+    }
+    ret = ff_mpv_encode_picture(avctx, pkt, pic, got_packet);
+    av_frame_free(&pic);
+    return ret;
+}
+
+#if CONFIG_MJPEG_ENCODER
+
+static const AVClass mjpeg_class = {
+    .class_name = "mjpeg encoder",
+    .item_name  = av_default_item_name,
+    .option     = ff_mpv_generic_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_mjpeg_encoder = {
     .name           = "mjpeg",
     .long_name      = NULL_IF_CONFIG_SMALL("MJPEG (Motion JPEG)"),
@@ -163,7 +271,33 @@ AVCodec ff_mjpeg_encoder = {
     .init           = ff_mpv_encode_init,
     .encode2        = ff_mpv_encode_picture,
     .close          = ff_mpv_encode_end,
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts       = (const enum AVPixelFormat[]){
+        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_NONE
+    },
+    .priv_class     = &mjpeg_class,
+};
+#endif
+#if CONFIG_AMV_ENCODER
+static const AVClass amv_class = {
+    .class_name = "amv encoder",
+    .item_name  = av_default_item_name,
+    .option     = ff_mpv_generic_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_amv_encoder = {
+    .name           = "amv",
+    .long_name      = NULL_IF_CONFIG_SMALL("AMV Video"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AMV,
+    .priv_data_size = sizeof(MpegEncContext),
+    .init           = ff_mpv_encode_init,
+    .encode2        = amv_encode_picture,
+    .close          = ff_mpv_encode_end,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_NONE
+        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_NONE
     },
+    .priv_class     = &amv_class,
 };
+#endif
diff --git a/libavcodec/mjpegenc.h b/libavcodec/mjpegenc.h
index aa8697edc8..4d77e166ee 100644
--- a/libavcodec/mjpegenc.h
+++ b/libavcodec/mjpegenc.h
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,6 +59,6 @@ static inline void put_marker(PutBitContext *p, enum JpegMarker code)
 
 int  ff_mjpeg_encode_init(MpegEncContext *s);
 void ff_mjpeg_encode_close(MpegEncContext *s);
-void ff_mjpeg_encode_mb(MpegEncContext *s, int16_t block[8][64]);
+void ff_mjpeg_encode_mb(MpegEncContext *s, int16_t block[12][64]);
 
 #endif /* AVCODEC_MJPEGENC_H */
diff --git a/libavcodec/mjpegenc_common.c b/libavcodec/mjpegenc_common.c
index 54df9878e9..d1d4cfb40a 100644
--- a/libavcodec/mjpegenc_common.c
+++ b/libavcodec/mjpegenc_common.c
@@ -1,20 +1,22 @@
 /*
  * lossless JPEG shared bits
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,20 +56,43 @@ static int put_huffman_table(PutBitContext *p, int table_class, int table_id,
     return n + 17;
 }
 
-static void jpeg_table_header(PutBitContext *p, ScanTable *intra_scantable,
-                              uint16_t intra_matrix[64])
+static void jpeg_table_header(AVCodecContext *avctx, PutBitContext *p,
+                              ScanTable *intra_scantable,
+                              uint16_t luma_intra_matrix[64],
+                              uint16_t chroma_intra_matrix[64],
+                              int hsample[3])
 {
     int i, j, size;
     uint8_t *ptr;
 
+    if (avctx->codec_id != AV_CODEC_ID_LJPEG) {
+        int matrix_count = 1 + !!memcmp(luma_intra_matrix,
+                                        chroma_intra_matrix,
+                                        sizeof(luma_intra_matrix[0]) * 64);
     /* quant matrixes */
     put_marker(p, DQT);
-    put_bits(p, 16, 2 + 1 * (1 + 64));
+    put_bits(p, 16, 2 + matrix_count * (1 + 64));
     put_bits(p, 4, 0); /* 8 bit precision */
     put_bits(p, 4, 0); /* table 0 */
     for(i=0;i<64;i++) {
         j = intra_scantable->permutated[i];
-        put_bits(p, 8, intra_matrix[j]);
+        put_bits(p, 8, luma_intra_matrix[j]);
+    }
+
+        if (matrix_count > 1) {
+            put_bits(p, 4, 0); /* 8 bit precision */
+            put_bits(p, 4, 1); /* table 1 */
+            for(i=0;i<64;i++) {
+                j = intra_scantable->permutated[i];
+                put_bits(p, 8, chroma_intra_matrix[j]);
+            }
+        }
+    }
+
+    if(avctx->active_thread_type & FF_THREAD_SLICE){
+        put_marker(p, DRI);
+        put_bits(p, 16, 4);
+        put_bits(p, 16, (avctx->width-1)/(8*hsample[0]) + 1);
     }
 
     /* huffman table */
@@ -120,9 +145,10 @@ static void jpeg_put_comments(AVCodecContext *avctx, PutBitContext *p)
         AV_WB16(ptr, size);
     }
 
-    if (avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
-        avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
-        avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
+    if (((avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
+          avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
+          avctx->pix_fmt == AV_PIX_FMT_YUV444P) && avctx->color_range != AVCOL_RANGE_JPEG)
+        || avctx->color_range == AVCOL_RANGE_MPEG) {
         put_marker(p, COM);
         flush_put_bits(p);
         ptr = put_bits_ptr(p);
@@ -133,22 +159,23 @@ static void jpeg_put_comments(AVCodecContext *avctx, PutBitContext *p)
     }
 }
 
-void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
-                                    ScanTable *intra_scantable,
-                                    uint16_t intra_matrix[64])
+void ff_mjpeg_init_hvsample(AVCodecContext *avctx, int hsample[4], int vsample[4])
 {
     int chroma_h_shift, chroma_v_shift;
-    const int lossless = avctx->codec_id != AV_CODEC_ID_MJPEG;
-    int hsample[3], vsample[3];
 
     av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &chroma_h_shift,
                                      &chroma_v_shift);
-
     if (avctx->codec->id == AV_CODEC_ID_LJPEG &&
-        avctx->pix_fmt   == AV_PIX_FMT_BGR24) {
+        (   avctx->pix_fmt == AV_PIX_FMT_BGR0
+         || avctx->pix_fmt == AV_PIX_FMT_BGRA
+         || avctx->pix_fmt == AV_PIX_FMT_BGR24)) {
         vsample[0] = hsample[0] =
         vsample[1] = hsample[1] =
-        vsample[2] = hsample[2] = 1;
+        vsample[2] = hsample[2] =
+        vsample[3] = hsample[3] = 1;
+    } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P || avctx->pix_fmt == AV_PIX_FMT_YUVJ444P) {
+        vsample[0] = vsample[1] = vsample[2] = 2;
+        hsample[0] = hsample[1] = hsample[2] = 1;
     } else {
         vsample[0] = 2;
         vsample[1] = 2 >> chroma_v_shift;
@@ -157,27 +184,48 @@ void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
         hsample[1] = 2 >> chroma_h_shift;
         hsample[2] = 2 >> chroma_h_shift;
     }
+}
+
+void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
+                                    ScanTable *intra_scantable,
+                                    uint16_t luma_intra_matrix[64],
+                                    uint16_t chroma_intra_matrix[64])
+{
+    const int lossless = avctx->codec_id != AV_CODEC_ID_MJPEG && avctx->codec_id != AV_CODEC_ID_AMV;
+    int hsample[4], vsample[4];
+    int i;
+    int components = 3 + (avctx->pix_fmt == AV_PIX_FMT_BGRA);
+    int chroma_matrix = !!memcmp(luma_intra_matrix,
+                                 chroma_intra_matrix,
+                                 sizeof(luma_intra_matrix[0])*64);
+
+    ff_mjpeg_init_hvsample(avctx, hsample, vsample);
 
     put_marker(pb, SOI);
 
+    // hack for AMV mjpeg format
+    if(avctx->codec_id == AV_CODEC_ID_AMV) goto end;
+
     jpeg_put_comments(avctx, pb);
 
-    jpeg_table_header(pb, intra_scantable, intra_matrix);
+    jpeg_table_header(avctx, pb, intra_scantable, luma_intra_matrix, chroma_intra_matrix, hsample);
 
     switch (avctx->codec_id) {
     case AV_CODEC_ID_MJPEG:  put_marker(pb, SOF0 ); break;
     case AV_CODEC_ID_LJPEG:  put_marker(pb, SOF3 ); break;
-    default: assert(0);
+    default: av_assert0(0);
     }
 
     put_bits(pb, 16, 17);
-    if (lossless && avctx->pix_fmt == AV_PIX_FMT_BGR24)
+    if (lossless && (  avctx->pix_fmt == AV_PIX_FMT_BGR0
+                    || avctx->pix_fmt == AV_PIX_FMT_BGRA
+                    || avctx->pix_fmt == AV_PIX_FMT_BGR24))
         put_bits(pb, 8, 9); /* 9 bits/component RCT */
     else
         put_bits(pb, 8, 8); /* 8 bits/component */
     put_bits(pb, 16, avctx->height);
     put_bits(pb, 16, avctx->width);
-    put_bits(pb, 8, 3); /* 3 components */
+    put_bits(pb, 8, components); /* 3 or 4 components */
 
     /* Y component */
     put_bits(pb, 8, 1); /* component number */
@@ -189,18 +237,25 @@ void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
     put_bits(pb, 8, 2); /* component number */
     put_bits(pb, 4, hsample[1]); /* H factor */
     put_bits(pb, 4, vsample[1]); /* V factor */
-    put_bits(pb, 8, 0); /* select matrix */
+    put_bits(pb, 8, lossless ? 0 : chroma_matrix); /* select matrix */
 
     /* Cr component */
     put_bits(pb, 8, 3); /* component number */
     put_bits(pb, 4, hsample[2]); /* H factor */
     put_bits(pb, 4, vsample[2]); /* V factor */
-    put_bits(pb, 8, 0); /* select matrix */
+    put_bits(pb, 8, lossless ? 0 : chroma_matrix); /* select matrix */
+
+    if (components == 4) {
+        put_bits(pb, 8, 4); /* component number */
+        put_bits(pb, 4, hsample[3]); /* H factor */
+        put_bits(pb, 4, vsample[3]); /* V factor */
+        put_bits(pb, 8, 0); /* select matrix */
+    }
 
     /* scan header */
     put_marker(pb, SOS);
-    put_bits(pb, 16, 12); /* length */
-    put_bits(pb, 8, 3); /* 3 components */
+    put_bits(pb, 16, 6 + 2*components); /* length */
+    put_bits(pb, 8, components); /* 3 components */
 
     /* Y component */
     put_bits(pb, 8, 1); /* index */
@@ -217,25 +272,49 @@ void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
     put_bits(pb, 4, 1); /* DC huffman table index */
     put_bits(pb, 4, lossless ? 0 : 1); /* AC huffman table index */
 
+    if (components == 4) {
+        /* Alpha component */
+        put_bits(pb, 8, 4); /* index */
+        put_bits(pb, 4, 0); /* DC huffman table index */
+        put_bits(pb, 4, 0); /* AC huffman table index */
+    }
+
     put_bits(pb, 8, lossless ? avctx->prediction_method + 1 : 0); /* Ss (not used) */
 
     switch (avctx->codec_id) {
     case AV_CODEC_ID_MJPEG:  put_bits(pb, 8, 63); break; /* Se (not used) */
     case AV_CODEC_ID_LJPEG:  put_bits(pb, 8,  0); break; /* not used */
-    default: assert(0);
+    default: av_assert0(0);
     }
 
     put_bits(pb, 8, 0); /* Ah/Al (not used) */
+
+end:
+    if (!lossless) {
+        MpegEncContext *s = avctx->priv_data;
+        av_assert0(avctx->codec->priv_data_size == sizeof(MpegEncContext));
+
+        s->esc_pos = put_bits_count(pb) >> 3;
+        for(i=1; i<s->slice_context_count; i++)
+            s->thread_context[i]->esc_pos = 0;
+    }
 }
 
-static void escape_FF(PutBitContext *pb, int start)
+void ff_mjpeg_escape_FF(PutBitContext *pb, int start)
 {
-    int size = put_bits_count(pb) - start * 8;
+    int size;
     int i, ff_count;
     uint8_t *buf = pb->buf + start;
     int align= (-(size_t)(buf))&3;
+    int pad = (-put_bits_count(pb))&7;
+
+    if (pad)
+        put_bits(pb, pad, (1<<pad)-1);
+
+    flush_put_bits(pb);
+    size = put_bits_count(pb) - start * 8;
 
-    assert((size&7) == 0);
+    av_assert1((size&7) == 0);
     size >>= 3;
 
     ff_count=0;
@@ -280,21 +359,35 @@ static void escape_FF(PutBitContext *pb, int start)
     }
 }
 
-void ff_mjpeg_encode_stuffing(PutBitContext * pbc)
+int ff_mjpeg_encode_stuffing(MpegEncContext *s)
 {
-    int length;
-    length= (-put_bits_count(pbc))&7;
-    if(length) put_bits(pbc, length, (1<<length)-1);
+    int i;
+    PutBitContext *pbc = &s->pb;
+    int mb_y = s->mb_y - !s->mb_x;
+
+    int ret = ff_mpv_reallocate_putbitbuffer(s, put_bits_count(&s->pb) / 8 + 100,
+                                                put_bits_count(&s->pb) / 4 + 1000);
+    if (ret < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Buffer reallocation failed\n");
+        goto fail;
+    }
+
+    ff_mjpeg_escape_FF(pbc, s->esc_pos);
+
+    if((s->avctx->active_thread_type & FF_THREAD_SLICE) && mb_y < s->mb_height)
+        put_marker(pbc, RST0 + (mb_y&7));
+    s->esc_pos = put_bits_count(pbc) >> 3;
+fail:
+
+    for(i=0; i<3; i++)
+        s->last_dc[i] = 128 << s->intra_dc_precision;
+
+    return ret;
 }
 
 void ff_mjpeg_encode_picture_trailer(PutBitContext *pb, int header_bits)
 {
-    ff_mjpeg_encode_stuffing(pb);
-    flush_put_bits(pb);
-
-    assert((header_bits & 7) == 0);
-
-    escape_FF(pb, header_bits >> 3);
+    av_assert1((header_bits & 7) == 0);
 
     put_marker(pb, EOI);
 }
diff --git a/libavcodec/mjpegenc_common.h b/libavcodec/mjpegenc_common.h
index b48911e364..924cb36262 100644
--- a/libavcodec/mjpegenc_common.h
+++ b/libavcodec/mjpegenc_common.h
@@ -1,20 +1,20 @@
 /*
  * lossless JPEG shared bits
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,13 +25,18 @@
 
 #include "avcodec.h"
 #include "idctdsp.h"
+#include "mpegvideo.h"
 #include "put_bits.h"
 
 void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
                                     ScanTable *intra_scantable,
-                                    uint16_t intra_matrix[64]);
+                                    uint16_t luma_intra_matrix[64],
+                                    uint16_t chroma_intra_matrix[64]);
 void ff_mjpeg_encode_picture_trailer(PutBitContext *pb, int header_bits);
-void ff_mjpeg_encode_stuffing(PutBitContext *pbc);
+void ff_mjpeg_escape_FF(PutBitContext *pb, int start);
+int ff_mjpeg_encode_stuffing(MpegEncContext *s);
+void ff_mjpeg_init_hvsample(AVCodecContext *avctx, int hsample[4], int vsample[4]);
+
 void ff_mjpeg_encode_dc(PutBitContext *pb, int val,
                         uint8_t *huff_size, uint16_t *huff_code);
 
diff --git a/libavcodec/mlp.c b/libavcodec/mlp.c
index 9615b66ee1..87f7c77139 100644
--- a/libavcodec/mlp.c
+++ b/libavcodec/mlp.c
@@ -2,20 +2,20 @@
  * MLP codec common code
  * Copyright (c) 2007-2008 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mlp.h b/libavcodec/mlp.h
index 8a1584e112..05d8dbaa61 100644
--- a/libavcodec/mlp.h
+++ b/libavcodec/mlp.h
@@ -2,20 +2,20 @@
  * MLP codec common header file
  * Copyright (c) 2007-2008 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mlp_parser.c b/libavcodec/mlp_parser.c
index e8fb4f5f4d..23601c8633 100644
--- a/libavcodec/mlp_parser.c
+++ b/libavcodec/mlp_parser.c
@@ -2,20 +2,20 @@
  * MLP parser
  * Copyright (c) 2007 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,28 +44,28 @@ static const uint8_t mlp_channels[32] = {
     5, 6, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 };
 
-static const uint64_t mlp_layout[32] = {
+const uint64_t ff_mlp_layout[32] = {
     AV_CH_LAYOUT_MONO,
     AV_CH_LAYOUT_STEREO,
     AV_CH_LAYOUT_2_1,
-    AV_CH_LAYOUT_2_2,
+    AV_CH_LAYOUT_QUAD,
     AV_CH_LAYOUT_STEREO|AV_CH_LOW_FREQUENCY,
     AV_CH_LAYOUT_2_1|AV_CH_LOW_FREQUENCY,
-    AV_CH_LAYOUT_2_2|AV_CH_LOW_FREQUENCY,
+    AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY,
     AV_CH_LAYOUT_SURROUND,
     AV_CH_LAYOUT_4POINT0,
-    AV_CH_LAYOUT_5POINT0,
+    AV_CH_LAYOUT_5POINT0_BACK,
     AV_CH_LAYOUT_SURROUND|AV_CH_LOW_FREQUENCY,
     AV_CH_LAYOUT_4POINT0|AV_CH_LOW_FREQUENCY,
-    AV_CH_LAYOUT_5POINT1,
+    AV_CH_LAYOUT_5POINT1_BACK,
     AV_CH_LAYOUT_4POINT0,
-    AV_CH_LAYOUT_5POINT0,
+    AV_CH_LAYOUT_5POINT0_BACK,
     AV_CH_LAYOUT_SURROUND|AV_CH_LOW_FREQUENCY,
     AV_CH_LAYOUT_4POINT0|AV_CH_LOW_FREQUENCY,
-    AV_CH_LAYOUT_5POINT1,
-    AV_CH_LAYOUT_2_2|AV_CH_LOW_FREQUENCY,
-    AV_CH_LAYOUT_5POINT0,
-    AV_CH_LAYOUT_5POINT1,
+    AV_CH_LAYOUT_5POINT1_BACK,
+    AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY,
+    AV_CH_LAYOUT_5POINT0_BACK,
+    AV_CH_LAYOUT_5POINT1_BACK,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
@@ -108,7 +108,7 @@ static int truehd_channels(int chanmap)
     return channels;
 }
 
-static uint64_t truehd_layout(int chanmap)
+uint64_t ff_truehd_layout(int chanmap)
 {
     int i;
     uint64_t layout = 0;
@@ -147,7 +147,7 @@ int ff_mlp_read_major_sync(void *log, MLPHeaderInfo *mh, GetBitContext *gb)
     int ratebits, channel_arrangement, header_size;
     uint16_t checksum;
 
-    assert(get_bits_count(gb) == 0);
+    av_assert1(get_bits_count(gb) == 0);
 
     header_size = ff_mlp_get_major_sync_size(gb->buffer, gb->size_in_bits >> 3);
     if (header_size < 0 || gb->size_in_bits < header_size << 3) {
@@ -177,9 +177,10 @@ int ff_mlp_read_major_sync(void *log, MLPHeaderInfo *mh, GetBitContext *gb)
 
         skip_bits(gb, 11);
 
+        mh->channel_arrangement=
         channel_arrangement    = get_bits(gb, 5);
         mh->channels_mlp       = mlp_channels[channel_arrangement];
-        mh->channel_layout_mlp = mlp_layout[channel_arrangement];
+        mh->channel_layout_mlp = ff_mlp_layout[channel_arrangement];
     } else if (mh->stream_type == 0xba) {
         mh->group1_bits = 24; // TODO: Is this information actually conveyed anywhere?
         mh->group2_bits = 0;
@@ -193,15 +194,16 @@ int ff_mlp_read_major_sync(void *log, MLPHeaderInfo *mh, GetBitContext *gb)
         mh->channel_modifier_thd_stream0 = get_bits(gb, 2);
         mh->channel_modifier_thd_stream1 = get_bits(gb, 2);
 
+        mh->channel_arrangement=
         channel_arrangement            = get_bits(gb, 5);
         mh->channels_thd_stream1       = truehd_channels(channel_arrangement);
-        mh->channel_layout_thd_stream1 = truehd_layout(channel_arrangement);
+        mh->channel_layout_thd_stream1 = ff_truehd_layout(channel_arrangement);
 
         mh->channel_modifier_thd_stream2 = get_bits(gb, 2);
 
         channel_arrangement            = get_bits(gb, 13);
         mh->channels_thd_stream2       = truehd_channels(channel_arrangement);
-        mh->channel_layout_thd_stream2 = truehd_layout(channel_arrangement);
+        mh->channel_layout_thd_stream2 = ff_truehd_layout(channel_arrangement);
     } else
         return AVERROR_INVALIDDATA;
 
@@ -247,6 +249,7 @@ static int mlp_parse(AVCodecParserContext *s,
     int sync_present;
     uint8_t parity_bits;
     int next;
+    int ret;
     int i, p = 0;
 
     *poutbuf_size = 0;
@@ -268,11 +271,15 @@ static int mlp_parse(AVCodecParserContext *s,
         }
 
         if (!mp->in_sync) {
-            ff_combine_frame(&mp->pc, END_NOT_FOUND, &buf, &buf_size);
+            if (ff_combine_frame(&mp->pc, END_NOT_FOUND, &buf, &buf_size) != -1)
+                av_log(avctx, AV_LOG_WARNING, "ff_combine_frame failed\n");
             return buf_size;
         }
 
-        ff_combine_frame(&mp->pc, i - 7, &buf, &buf_size);
+        if ((ret = ff_combine_frame(&mp->pc, i - 7, &buf, &buf_size)) < 0) {
+            av_log(avctx, AV_LOG_WARNING, "ff_combine_frame failed\n");
+            return ret;
+        }
 
         return i - 7;
     }
@@ -286,13 +293,17 @@ static int mlp_parse(AVCodecParserContext *s,
         }
 
         if (mp->pc.index + buf_size < 2) {
-            ff_combine_frame(&mp->pc, END_NOT_FOUND, &buf, &buf_size);
+            if (ff_combine_frame(&mp->pc, END_NOT_FOUND, &buf, &buf_size) != -1)
+                av_log(avctx, AV_LOG_WARNING, "ff_combine_frame failed\n");
             return buf_size;
         }
 
         mp->bytes_left = ((mp->pc.index > 0 ? mp->pc.buffer[0] : buf[0]) << 8)
                        |  (mp->pc.index > 1 ? mp->pc.buffer[1] : buf[1-mp->pc.index]);
         mp->bytes_left = (mp->bytes_left & 0xfff) * 2;
+        if (mp->bytes_left <= 0) { // prevent infinite loop
+            goto lost_sync;
+        }
         mp->bytes_left -= mp->pc.index;
     }
 
@@ -343,6 +354,7 @@ static int mlp_parse(AVCodecParserContext *s,
         avctx->sample_rate = mh.group1_samplerate;
         s->duration = mh.access_unit_size;
 
+        if(!avctx->channels || !avctx->channel_layout) {
         if (mh.stream_type == 0xbb) {
             /* MLP stream */
             if (avctx->request_channel_layout &&
@@ -357,7 +369,7 @@ static int mlp_parse(AVCodecParserContext *s,
             }
         } else { /* mh.stream_type == 0xba */
             /* TrueHD stream */
-                if (avctx->request_channel_layout &&
+            if (avctx->request_channel_layout &&
                     (avctx->request_channel_layout & AV_CH_LAYOUT_STEREO) ==
                     avctx->request_channel_layout &&
                     mh.num_substreams > 1) {
@@ -374,6 +386,7 @@ static int mlp_parse(AVCodecParserContext *s,
                 avctx->channel_layout = mh.channel_layout_thd_stream2;
             }
         }
+        }
 
         if (!mh.is_vbr) /* Stream is CBR */
             avctx->bit_rate = mh.peak_bitrate;
diff --git a/libavcodec/mlp_parser.h b/libavcodec/mlp_parser.h
index 06ab421ba9..0c0109efcc 100644
--- a/libavcodec/mlp_parser.h
+++ b/libavcodec/mlp_parser.h
@@ -2,20 +2,20 @@
  * MLP parser prototypes
  * Copyright (c) 2007 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,6 +40,8 @@ typedef struct MLPHeaderInfo
     int group1_samplerate;                  ///< Sample rate of first substream
     int group2_samplerate;                  ///< Sample rate of second substream (MLP only)
 
+    int channel_arrangement;
+
     int channel_modifier_thd_stream0;       ///< Channel modifier for substream 0 of TrueHD sreams ("2-channel presentation")
     int channel_modifier_thd_stream1;       ///< Channel modifier for substream 1 of TrueHD sreams ("6-channel presentation")
     int channel_modifier_thd_stream2;       ///< Channel modifier for substream 2 of TrueHD sreams ("8-channel presentation")
@@ -62,5 +64,8 @@ typedef struct MLPHeaderInfo
 
 
 int ff_mlp_read_major_sync(void *log, MLPHeaderInfo *mh, GetBitContext *gb);
+uint64_t ff_truehd_layout(int chanmap);
+
+extern const uint64_t ff_mlp_layout[32];
 
 #endif /* AVCODEC_MLP_PARSER_H */
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index 8cfeea6175..c93b058dd7 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -2,20 +2,20 @@
  * MLP decoder
  * Copyright (c) 2007-2008 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,7 +105,7 @@ typedef struct SubStream {
     /// Whether the LSBs of the matrix output are encoded in the bitstream.
     uint8_t     lsb_bypass[MAX_MATRICES];
     /// Matrix coefficients, stored as 2.14 fixed point.
-    int32_t     matrix_coeff[MAX_MATRICES][MAX_CHANNELS];
+    DECLARE_ALIGNED(32, int32_t, matrix_coeff)[MAX_MATRICES][MAX_CHANNELS];
     /// Left shift to apply to noise values in 0x31eb substreams.
     uint8_t     matrix_noise_shift[MAX_MATRICES];
     //@}
@@ -144,6 +144,9 @@ typedef struct MLPDecodeContext {
     /// Index of the last substream to decode - further substreams are skipped.
     uint8_t     max_decoded_substream;
 
+    /// Stream needs channel reordering to comply with FFmpeg's channel order
+    uint8_t     needs_reordering;
+
     /// number of PCM samples contained in each frame
     int         access_unit_size;
     /// next power of two above the number of samples in each frame
@@ -156,7 +159,7 @@ typedef struct MLPDecodeContext {
 
     int8_t      noise_buffer[MAX_BLOCKSIZE_POW2];
     int8_t      bypassed_lsbs[MAX_BLOCKSIZE][MAX_CHANNELS];
-    int32_t     sample_buffer[MAX_BLOCKSIZE][MAX_CHANNELS];
+    DECLARE_ALIGNED(32, int32_t, sample_buffer)[MAX_BLOCKSIZE][MAX_CHANNELS];
 
     MLPDSPContext dsp;
 } MLPDecodeContext;
@@ -380,10 +383,22 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
      * substream is Stereo. Subsequent substreams' layouts are indicated in the
      * major sync. */
     if (m->avctx->codec_id == AV_CODEC_ID_MLP) {
+        if (mh.stream_type != 0xbb) {
+            avpriv_request_sample(m->avctx,
+                        "unexpected stream_type %X in MLP",
+                        mh.stream_type);
+            return AVERROR_PATCHWELCOME;
+        }
         if ((substr = (mh.num_substreams > 1)))
             m->substream[0].ch_layout = AV_CH_LAYOUT_STEREO;
         m->substream[substr].ch_layout = mh.channel_layout_mlp;
     } else {
+        if (mh.stream_type != 0xba) {
+            avpriv_request_sample(m->avctx,
+                        "unexpected stream_type %X in !MLP",
+                        mh.stream_type);
+            return AVERROR_PATCHWELCOME;
+        }
         if ((substr = (mh.num_substreams > 1)))
             m->substream[0].ch_layout = AV_CH_LAYOUT_STEREO;
         if (mh.num_substreams > 2)
@@ -392,8 +407,17 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
             else
                 m->substream[2].ch_layout = mh.channel_layout_thd_stream1;
         m->substream[substr].ch_layout = mh.channel_layout_thd_stream1;
+
+        if (m->avctx->channels<=2 && m->substream[substr].ch_layout == AV_CH_LAYOUT_MONO && m->max_decoded_substream == 1) {
+            av_log(m->avctx, AV_LOG_DEBUG, "Mono stream with 2 substreams, ignoring 2nd\n");
+            m->max_decoded_substream = 0;
+            if (m->avctx->channels==2)
+                m->avctx->channel_layout = AV_CH_LAYOUT_STEREO;
+        }
     }
 
+    m->needs_reordering = mh.channel_arrangement >= 18 && mh.channel_arrangement <= 20;
+
     /* Parse the TrueHD decoder channel modifiers and set each substream's
      * AVMatrixEncoding accordingly.
      *
@@ -479,7 +503,7 @@ static int read_restart_header(MLPDecodeContext *m, GetBitContext *gbp,
     if (max_matrix_channel > std_max_matrix_channel) {
         av_log(m->avctx, AV_LOG_ERROR,
                "Max matrix channel cannot be greater than %d.\n",
-               max_matrix_channel);
+               std_max_matrix_channel);
         return AVERROR_INVALIDDATA;
     }
 
@@ -491,11 +515,11 @@ static int read_restart_header(MLPDecodeContext *m, GetBitContext *gbp,
 
     /* This should happen for TrueHD streams with >6 channels and MLP's noise
      * type. It is not yet known if this is allowed. */
-    if (s->max_channel > MAX_MATRIX_CHANNEL_MLP && !s->noise_type) {
+    if (max_channel > MAX_MATRIX_CHANNEL_MLP && !s->noise_type) {
         avpriv_request_sample(m->avctx,
                               "%d channels (more than the "
                               "maximum supported by the decoder)",
-                              s->max_channel + 2);
+                              max_channel + 2);
         return AVERROR_PATCHWELCOME;
     }
 
@@ -590,6 +614,20 @@ static int read_restart_header(MLPDecodeContext *m, GetBitContext *gbp,
                                                                s->output_shift,
                                                                s->max_matrix_channel,
                                                                m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+
+        if (m->avctx->codec_id == AV_CODEC_ID_MLP && m->needs_reordering) {
+            if (m->avctx->channel_layout == (AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY) ||
+                m->avctx->channel_layout == AV_CH_LAYOUT_5POINT0_BACK) {
+                int i = s->ch_assign[4];
+                s->ch_assign[4] = s->ch_assign[3];
+                s->ch_assign[3] = s->ch_assign[2];
+                s->ch_assign[2] = i;
+            } else if (m->avctx->channel_layout == AV_CH_LAYOUT_5POINT1_BACK) {
+                FFSWAP(int, s->ch_assign[2], s->ch_assign[4]);
+                FFSWAP(int, s->ch_assign[3], s->ch_assign[5]);
+            }
+        }
+
     }
 
     return 0;
@@ -608,7 +646,7 @@ static int read_filter_params(MLPDecodeContext *m, GetBitContext *gbp,
     int i, order;
 
     // Filter is 0 for FIR, 1 for IIR.
-    assert(filter < 2);
+    av_assert0(filter < 2);
 
     if (m->filter_changed[channel][filter]++ > 1) {
         av_log(m->avctx, AV_LOG_ERROR, "Filters may change only once per access unit.\n");
@@ -663,7 +701,7 @@ static int read_filter_params(MLPDecodeContext *m, GetBitContext *gbp,
             /* TODO: Check validity of state data. */
 
             for (i = 0; i < order; i++)
-                fp->state[i] = get_sbits(gbp, state_bits) << state_shift;
+                fp->state[i] = state_bits ? get_sbits(gbp, state_bits) << state_shift : 0;
         }
     }
 
@@ -782,6 +820,7 @@ static int read_channel_params(MLPDecodeContext *m, unsigned int substr,
 
     if (cp->huff_lsbs > 24) {
         av_log(m->avctx, AV_LOG_ERROR, "Invalid huff_lsbs.\n");
+        cp->huff_lsbs = 0;
         return AVERROR_INVALIDDATA;
     }
 
@@ -808,7 +847,7 @@ static int read_decoding_params(MLPDecodeContext *m, GetBitContext *gbp,
         if (get_bits1(gbp)) {
             s->blocksize = get_bits(gbp, 9);
             if (s->blocksize < 8 || s->blocksize > m->access_unit_size) {
-                av_log(m->avctx, AV_LOG_ERROR, "Invalid blocksize.");
+                av_log(m->avctx, AV_LOG_ERROR, "Invalid blocksize.\n");
                 s->blocksize = 0;
                 return AVERROR_INVALIDDATA;
             }
@@ -848,7 +887,7 @@ static int read_decoding_params(MLPDecodeContext *m, GetBitContext *gbp,
     return 0;
 }
 
-#define MSB_MASK(bits)  (-1u << bits)
+#define MSB_MASK(bits)  (-1u << (bits))
 
 /** Generate PCM samples using the prediction filters and residual values
  *  read from the data stream, and update the filter state. */
@@ -986,15 +1025,27 @@ static void fill_noise_buffer(MLPDecodeContext *m, unsigned int substr)
     s->noisegen_seed = seed;
 }
 
+/** Write the audio data into the output buffer. */
 
-/** Apply the channel matrices in turn to reconstruct the original audio
- *  samples. */
-
-static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
+static int output_data(MLPDecodeContext *m, unsigned int substr,
+                       AVFrame *frame, int *got_frame_ptr)
 {
+    AVCodecContext *avctx = m->avctx;
     SubStream *s = &m->substream[substr];
     unsigned int mat;
     unsigned int maxchan;
+    int ret;
+    int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+
+    if (m->avctx->channels != s->max_matrix_channel + 1) {
+        av_log(m->avctx, AV_LOG_ERROR, "channel count mismatch\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (!s->blockpos) {
+        av_log(avctx, AV_LOG_ERROR, "No samples to output.\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     maxchan = s->max_matrix_channel;
     if (!s->noise_type) {
@@ -1004,6 +1055,8 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
         fill_noise_buffer(m, substr);
     }
 
+    /* Apply the channel matrices in turn to reconstruct the original audio
+     * samples. */
     for (mat = 0; mat < s->num_primitive_matrices; mat++) {
         unsigned int dest_ch = s->matrix_out_ch[mat];
         m->dsp.mlp_rematrix_channel(&m->sample_buffer[0][0],
@@ -1018,34 +1071,11 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
                                     m->access_unit_size_pow2,
                                     MSB_MASK(s->quant_step_size[dest_ch]));
     }
-}
-
-/** Write the audio data into the output buffer. */
-
-static int output_data(MLPDecodeContext *m, unsigned int substr,
-                       AVFrame *frame, int *got_frame_ptr)
-{
-    AVCodecContext *avctx = m->avctx;
-    SubStream *s = &m->substream[substr];
-    int ret;
-    int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
-
-    if (m->avctx->channels != s->max_matrix_channel + 1) {
-        av_log(m->avctx, AV_LOG_ERROR, "channel count mismatch\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    if (!s->blockpos) {
-        av_log(avctx, AV_LOG_ERROR, "No samples to output.\n");
-        return AVERROR_INVALIDDATA;
-    }
 
     /* get output buffer */
     frame->nb_samples = s->blockpos;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     s->lossless_check_data = m->dsp.mlp_pack_output(s->lossless_check_data,
                                                     s->blockpos,
                                                     m->sample_buffer,
@@ -1085,7 +1115,7 @@ static int read_access_unit(AVCodecContext *avctx, void* data,
     int ret;
 
     if (buf_size < 4)
-        return 0;
+        return AVERROR_INVALIDDATA;
 
     length = (AV_RB16(buf) & 0xfff) * 2;
 
@@ -1248,8 +1278,6 @@ next_substr:
         buf += substream_data_len[substr];
     }
 
-    rematrix_channels(m, m->max_decoded_substream);
-
     if ((ret = output_data(m, m->max_decoded_substream, data, got_frame_ptr)) < 0)
         return ret;
 
@@ -1264,6 +1292,7 @@ error:
     return AVERROR_INVALIDDATA;
 }
 
+#if CONFIG_MLP_DECODER
 AVCodec ff_mlp_decoder = {
     .name           = "mlp",
     .long_name      = NULL_IF_CONFIG_SMALL("MLP (Meridian Lossless Packing)"),
@@ -1274,7 +1303,7 @@ AVCodec ff_mlp_decoder = {
     .decode         = read_access_unit,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
-
+#endif
 #if CONFIG_TRUEHD_DECODER
 AVCodec ff_truehd_decoder = {
     .name           = "truehd",
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index aded554ea1..3ae8c37708 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2007-2008 Ian Caulfield
  *               2009 Ramiro Polla
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index acd48fc663..a0edeb7762 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -2,20 +2,20 @@
  * MLP codec common header file
  * Copyright (c) 2007-2008 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
index ae9d749fc9..ecac924689 100644
--- a/libavcodec/mmaldec.c
+++ b/libavcodec/mmaldec.c
@@ -2,20 +2,20 @@
  * MMAL Video Decoder
  * Copyright (c) 2015 Rodger Combs
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mmvideo.c b/libavcodec/mmvideo.c
index 0736630df7..04de6bb4c8 100644
--- a/libavcodec/mmvideo.c
+++ b/libavcodec/mmvideo.c
@@ -2,20 +2,20 @@
  * American Laser Games MM Video Decoder
  * Copyright (c) 2006,2008 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,7 +49,7 @@
 typedef struct MmContext {
     AVCodecContext *avctx;
     AVFrame *frame;
-    int palette[AVPALETTE_COUNT];
+    unsigned int palette[AVPALETTE_COUNT];
     GetByteContext gb;
 } MmContext;
 
@@ -75,17 +75,15 @@ static av_cold int mm_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static int mm_decode_pal(MmContext *s)
+static void mm_decode_pal(MmContext *s)
 {
     int i;
 
     bytestream2_skip(&s->gb, 4);
     for (i = 0; i < 128; i++) {
-        s->palette[i] = bytestream2_get_be24(&s->gb);
+        s->palette[i] = 0xFFU << 24 | bytestream2_get_be24(&s->gb);
         s->palette[i+128] = s->palette[i]<<2;
     }
-
-    return 0;
 }
 
 /**
@@ -99,8 +97,7 @@ static int mm_decode_intra(MmContext * s, int half_horiz, int half_vert)
     while (bytestream2_get_bytes_left(&s->gb) > 0) {
         int run_length, color;
 
-        // writes one more line when half_vert is true
-        if (y >= s->avctx->height + !!half_vert)
+        if (y >= s->avctx->height)
             return 0;
 
         color = bytestream2_get_byte(&s->gb);
@@ -114,12 +111,12 @@ static int mm_decode_intra(MmContext * s, int half_horiz, int half_vert)
         if (half_horiz)
             run_length *=2;
 
-        if (s->avctx->width - x < run_length)
+        if (run_length > s->avctx->width - x)
             return AVERROR_INVALIDDATA;
 
         if (color) {
             memset(s->frame->data[0] + y*s->frame->linesize[0] + x, color, run_length);
-            if (half_vert)
+            if (half_vert && y + half_vert < s->avctx->height)
                 memset(s->frame->data[0] + (y+1)*s->frame->linesize[0] + x, color, run_length);
         }
         x+= run_length;
@@ -133,7 +130,7 @@ static int mm_decode_intra(MmContext * s, int half_horiz, int half_vert)
     return 0;
 }
 
-/*
+/**
  * @param half_horiz Half horizontal resolution (0 or 1)
  * @param half_vert Half vertical resolution (0 or 1)
  */
@@ -204,13 +201,11 @@ static int mm_decode_frame(AVCodecContext *avctx,
     buf_size -= MM_PREAMBLE_SIZE;
     bytestream2_init(&s->gb, buf, buf_size);
 
-    if ((res = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((res = ff_reget_buffer(avctx, s->frame)) < 0)
         return res;
-    }
 
     switch(type) {
-    case MM_TYPE_PALETTE   : res = mm_decode_pal(s); return buf_size;
+    case MM_TYPE_PALETTE   : mm_decode_pal(s); return avpkt->size;
     case MM_TYPE_INTRA     : res = mm_decode_intra(s, 0, 0); break;
     case MM_TYPE_INTRA_HH  : res = mm_decode_intra(s, 1, 0); break;
     case MM_TYPE_INTRA_HHV : res = mm_decode_intra(s, 1, 1); break;
@@ -231,7 +226,7 @@ static int mm_decode_frame(AVCodecContext *avctx,
 
     *got_frame      = 1;
 
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int mm_decode_end(AVCodecContext *avctx)
diff --git a/libavcodec/motion-test.c b/libavcodec/motion-test.c
new file mode 100644
index 0000000000..ebcf4aafa4
--- /dev/null
+++ b/libavcodec/motion-test.c
@@ -0,0 +1,152 @@
+/*
+ * (c) 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * motion test.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "config.h"
+#include "me_cmp.h"
+#include "libavutil/internal.h"
+#include "libavutil/lfg.h"
+#include "libavutil/mem.h"
+#include "libavutil/time.h"
+
+#undef printf
+
+#define WIDTH 64
+#define HEIGHT 64
+
+static uint8_t img1[WIDTH * HEIGHT];
+static uint8_t img2[WIDTH * HEIGHT];
+
+static void fill_random(uint8_t *tab, int size)
+{
+    int i;
+    AVLFG prng;
+
+    av_lfg_init(&prng, 1);
+    for(i=0;i<size;i++) {
+        tab[i] = av_lfg_get(&prng) % 256;
+    }
+}
+
+static void help(void)
+{
+    printf("motion-test [-h]\n"
+           "test motion implementations\n");
+}
+
+#define NB_ITS 500
+
+int dummy;
+
+static void test_motion(const char *name,
+                 me_cmp_func test_func, me_cmp_func ref_func)
+{
+    int x, y, d1, d2, it;
+    uint8_t *ptr;
+    int64_t ti;
+    printf("testing '%s'\n", name);
+
+    /* test correctness */
+    for(it=0;it<20;it++) {
+
+        fill_random(img1, WIDTH * HEIGHT);
+        fill_random(img2, WIDTH * HEIGHT);
+
+        for(y=0;y<HEIGHT-17;y++) {
+            for(x=0;x<WIDTH-17;x++) {
+                ptr = img2 + y * WIDTH + x;
+                d1 = test_func(NULL, img1, ptr, WIDTH, 8);
+                d2 = ref_func(NULL, img1, ptr, WIDTH, 8);
+                if (d1 != d2) {
+                    printf("error: mmx=%d c=%d\n", d1, d2);
+                }
+            }
+        }
+    }
+    emms_c();
+
+    /* speed test */
+    ti = av_gettime_relative();
+    d1 = 0;
+    for(it=0;it<NB_ITS;it++) {
+        for(y=0;y<HEIGHT-17;y++) {
+            for(x=0;x<WIDTH-17;x++) {
+                ptr = img2 + y * WIDTH + x;
+                d1 += test_func(NULL, img1, ptr, WIDTH, 8);
+            }
+        }
+    }
+    emms_c();
+    dummy = d1; /* avoid optimization */
+    ti = av_gettime_relative() - ti;
+
+    printf("  %0.0f kop/s\n",
+           (double)NB_ITS * (WIDTH - 16) * (HEIGHT - 16) /
+           (double)(ti / 1000.0));
+}
+
+
+int main(int argc, char **argv)
+{
+    AVCodecContext *ctx;
+    int c;
+    MECmpContext cctx, mmxctx;
+    int flags[2] = { AV_CPU_FLAG_MMX, AV_CPU_FLAG_MMXEXT };
+    int flags_size = HAVE_MMXEXT ? 2 : 1;
+
+    if (argc > 1) {
+        help();
+        return 1;
+    }
+
+    printf("ffmpeg motion test\n");
+
+    ctx = avcodec_alloc_context3(NULL);
+    ctx->flags |= AV_CODEC_FLAG_BITEXACT;
+    av_force_cpu_flags(0);
+    memset(&cctx, 0, sizeof(cctx));
+    ff_me_cmp_init(&cctx, ctx);
+    for (c = 0; c < flags_size; c++) {
+        int x;
+        av_force_cpu_flags(flags[c]);
+        memset(&mmxctx, 0, sizeof(mmxctx));
+        ff_me_cmp_init(&mmxctx, ctx);
+
+        for (x = 0; x < 2; x++) {
+            printf("%s for %dx%d pixels\n", c ? "mmx2" : "mmx",
+                   x ? 8 : 16, x ? 8 : 16);
+            test_motion("mmx",     mmxctx.pix_abs[x][0], cctx.pix_abs[x][0]);
+            test_motion("mmx_x2",  mmxctx.pix_abs[x][1], cctx.pix_abs[x][1]);
+            test_motion("mmx_y2",  mmxctx.pix_abs[x][2], cctx.pix_abs[x][2]);
+            test_motion("mmx_xy2", mmxctx.pix_abs[x][3], cctx.pix_abs[x][3]);
+        }
+    }
+    av_free(ctx);
+
+    return 0;
+}
diff --git a/libavcodec/motion_est.c b/libavcodec/motion_est.c
index 657c28a02d..9f71568efd 100644
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@@ -5,20 +5,20 @@
  *
  * new motion estimation (X1/EPZS) by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,9 +38,6 @@
 #include "mpegutils.h"
 #include "mpegvideo.h"
 
-#undef NDEBUG
-#include <assert.h>
-
 #define P_LEFT P[1]
 #define P_TOP P[2]
 #define P_TOPRIGHT P[3]
@@ -118,7 +115,7 @@ static av_always_inline int cmp_direct_inline(MpegEncContext *s, const int x, co
     uint8_t * const * const src= c->src[src_index];
     int d;
     //FIXME check chroma 4mv, (no crashes ...)
-        assert(x >= c->xmin && hx <= c->xmax<<(qpel+1) && y >= c->ymin && hy <= c->ymax<<(qpel+1));
+        av_assert2(x >= c->xmin && hx <= c->xmax<<(qpel+1) && y >= c->ymin && hy <= c->ymax<<(qpel+1));
         if(x >= c->xmin && hx <= c->xmax<<(qpel+1) && y >= c->ymin && hy <= c->ymax<<(qpel+1)){
             const int time_pp= s->pp_time;
             const int time_pb= s->pb_time;
@@ -160,14 +157,14 @@ static av_always_inline int cmp_direct_inline(MpegEncContext *s, const int x, co
                     c->qpel_avg[1][bxy](c->temp     + 8*stride, ref[8] + (bx>>2) + (by>>2)*stride     + 8*stride, stride);
                     c->qpel_avg[1][bxy](c->temp + 8 + 8*stride, ref[8] + (bx>>2) + (by>>2)*stride + 8 + 8*stride, stride);
                 }else{
-                    assert((fx>>1) + 16*s->mb_x >= -16);
-                    assert((fy>>1) + 16*s->mb_y >= -16);
-                    assert((fx>>1) + 16*s->mb_x <= s->width);
-                    assert((fy>>1) + 16*s->mb_y <= s->height);
-                    assert((bx>>1) + 16*s->mb_x >= -16);
-                    assert((by>>1) + 16*s->mb_y >= -16);
-                    assert((bx>>1) + 16*s->mb_x <= s->width);
-                    assert((by>>1) + 16*s->mb_y <= s->height);
+                    av_assert2((fx>>1) + 16*s->mb_x >= -16);
+                    av_assert2((fy>>1) + 16*s->mb_y >= -16);
+                    av_assert2((fx>>1) + 16*s->mb_x <= s->width);
+                    av_assert2((fy>>1) + 16*s->mb_y <= s->height);
+                    av_assert2((bx>>1) + 16*s->mb_x >= -16);
+                    av_assert2((by>>1) + 16*s->mb_y >= -16);
+                    av_assert2((bx>>1) + 16*s->mb_x <= s->width);
+                    av_assert2((by>>1) + 16*s->mb_y <= s->height);
 
                     c->hpel_put[0][fxy](c->temp, ref[0] + (fx>>1) + (fy>>1)*stride, stride, 16);
                     c->hpel_avg[0][bxy](c->temp, ref[8] + (bx>>1) + (by>>1)*stride, stride, 16);
@@ -186,8 +183,8 @@ static av_always_inline int cmp_inline(MpegEncContext *s, const int x, const int
     const int stride= c->stride;
     const int uvstride= c->uvstride;
     const int dxy= subx + (suby<<(1+qpel)); //FIXME log2_subpel?
-    const int hx= subx + (x<<(1+qpel));
-    const int hy= suby + (y<<(1+qpel));
+    const int hx= subx + x*(1<<(1+qpel));
+    const int hy= suby + y*(1<<(1+qpel));
     uint8_t * const * const ref= c->ref[ref_index];
     uint8_t * const * const src= c->src[src_index];
     int d;
@@ -195,7 +192,13 @@ static av_always_inline int cmp_inline(MpegEncContext *s, const int x, const int
         int uvdxy;              /* no, it might not be used uninitialized */
         if(dxy){
             if(qpel){
-                c->qpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride); //FIXME prototype (add h)
+                if (h << size == 16) {
+                    c->qpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride); //FIXME prototype (add h)
+                } else if (size == 0 && h == 8) {
+                    c->qpel_put[1][dxy](c->temp    , ref[0] + x + y*stride    , stride);
+                    c->qpel_put[1][dxy](c->temp + 8, ref[0] + x + y*stride + 8, stride);
+                } else
+                    av_assert2(0);
                 if(chroma){
                     int cx= hx/2;
                     int cy= hy/2;
@@ -305,12 +308,13 @@ int ff_init_me(MpegEncContext *s){
     int cache_size= FFMIN(ME_MAP_SIZE>>ME_MAP_SHIFT, 1<<ME_MAP_SHIFT);
     int dia_size= FFMAX(FFABS(s->avctx->dia_size)&255, FFABS(s->avctx->pre_dia_size)&255);
 
-    if(FFMIN(s->avctx->dia_size, s->avctx->pre_dia_size) < -ME_MAP_SIZE){
+    if(FFMIN(s->avctx->dia_size, s->avctx->pre_dia_size) < -FFMIN(ME_MAP_SIZE, MAX_SAB_SIZE)){
         av_log(s->avctx, AV_LOG_ERROR, "ME_MAP size is too small for SAB diamond\n");
         return -1;
     }
 
 #if FF_API_MOTION_EST
+    //special case of snow is needed because snow uses its own iterative ME code
 FF_DISABLE_DEPRECATION_WARNINGS
     if (s->motion_est == FF_ME_EPZS) {
         if (s->me_method == ME_ZERO)
@@ -319,7 +323,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
             s->motion_est = FF_ME_EPZS;
         else if (s->me_method == ME_X1)
             s->motion_est = FF_ME_XONE;
-        else {
+        else if (s->avctx->codec_id != AV_CODEC_ID_SNOW) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "me_method is only allowed to be set to zero and epzs; "
                    "for hex,umh,full and others see dia_size\n");
@@ -331,6 +335,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     c->avctx= s->avctx;
 
+    if(s->codec_id == AV_CODEC_ID_H261)
+        c->avctx->me_sub_cmp = c->avctx->me_cmp;
+
     if(cache_size < 2*dia_size && !c->stride){
         av_log(s->avctx, AV_LOG_INFO, "ME_MAP size may be a little small for the selected diamond size\n");
     }
@@ -379,12 +386,14 @@ FF_ENABLE_DEPRECATION_WARNINGS
     /* 8x8 fullpel search would need a 4x4 chroma compare, which we do
      * not have yet, and even if we had, the motion estimation code
      * does not expect it. */
-    if ((c->avctx->me_cmp & FF_CMP_CHROMA) /* && !s->mecc.me_cmp[2] */)
-        s->mecc.me_cmp[2] = zero_cmp;
-    if ((c->avctx->me_sub_cmp & FF_CMP_CHROMA) && !s->mecc.me_sub_cmp[2])
-        s->mecc.me_sub_cmp[2] = zero_cmp;
-    c->hpel_put[2][0]= c->hpel_put[2][1]=
-    c->hpel_put[2][2]= c->hpel_put[2][3]= zero_hpel;
+    if (s->codec_id != AV_CODEC_ID_SNOW) {
+        if ((c->avctx->me_cmp & FF_CMP_CHROMA) /* && !s->mecc.me_cmp[2] */)
+            s->mecc.me_cmp[2] = zero_cmp;
+        if ((c->avctx->me_sub_cmp & FF_CMP_CHROMA) && !s->mecc.me_sub_cmp[2])
+            s->mecc.me_sub_cmp[2] = zero_cmp;
+        c->hpel_put[2][0]= c->hpel_put[2][1]=
+        c->hpel_put[2][2]= c->hpel_put[2][3]= zero_hpel;
+    }
 
     if(s->codec_id == AV_CODEC_ID_H261){
         c->sub_motion_search= no_sub_motion_search;
@@ -410,10 +419,9 @@ static int sad_hpel_motion_search(MpegEncContext * s,
     int mx, my, dminh;
     uint8_t *pix, *ptr;
     int stride= c->stride;
-    const int flags= c->sub_flags;
     LOAD_COMMON
 
-    assert(flags == 0);
+    av_assert2(c->sub_flags == 0);
 
     if(c->skip){
         *mx_ptr = 0;
@@ -433,13 +441,13 @@ static int sad_hpel_motion_search(MpegEncContext * s,
         my > ymin && my < ymax) {
         int dx=0, dy=0;
         int d, pen_x, pen_y;
-        const int index= (my<<ME_MAP_SHIFT) + mx;
+        const int index= my*(1<<ME_MAP_SHIFT) + mx;
         const int t= score_map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)];
         const int l= score_map[(index- 1               )&(ME_MAP_SIZE-1)];
         const int r= score_map[(index+ 1               )&(ME_MAP_SIZE-1)];
         const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)];
-        mx<<=1;
-        my<<=1;
+        mx += mx;
+        my += my;
 
 
         pen_x= pred_x + mx;
@@ -497,8 +505,8 @@ static int sad_hpel_motion_search(MpegEncContext * s,
         my+=dy;
 
     }else{
-        mx<<=1;
-        my<<=1;
+        mx += mx;
+        my += my;
     }
 
     *mx_ptr = mx;
@@ -537,6 +545,7 @@ static inline void get_limits(MpegEncContext *s, int x, int y)
 {
     MotionEstContext * const c= &s->me;
     int range= c->avctx->me_range >> (1 + !!(c->flags&FLAG_QPEL));
+    int max_range = MAX_MV >> (1 + !!(c->flags&FLAG_QPEL));
 /*
     if(c->avctx->me_range) c->range= c->avctx->me_range >> 1;
     else                   c->range= 16;
@@ -544,8 +553,8 @@ static inline void get_limits(MpegEncContext *s, int x, int y)
     if (s->unrestricted_mv) {
         c->xmin = - x - 16;
         c->ymin = - y - 16;
-        c->xmax = - x + s->mb_width *16;
-        c->ymax = - y + s->mb_height*16;
+        c->xmax = - x + s->width;
+        c->ymax = - y + s->height;
     } else if (s->out_format == FMT_H261){
         // Search range of H261 is different from other codec standards
         c->xmin = (x > 15) ? - 15 : 0;
@@ -558,6 +567,8 @@ static inline void get_limits(MpegEncContext *s, int x, int y)
         c->xmax = - x + s->mb_width *16 - 16;
         c->ymax = - y + s->mb_height*16 - 16;
     }
+    if(!range || range > max_range)
+        range = max_range;
     if(range){
         c->xmin = FFMAX(c->xmin,-range);
         c->xmax = FFMIN(c->xmax, range);
@@ -584,10 +595,11 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
     const int h=8;
     int block;
     int P[10][2];
-    int dmin_sum=0, mx4_sum=0, my4_sum=0;
+    int dmin_sum=0, mx4_sum=0, my4_sum=0, i;
     int same=1;
     const int stride= c->stride;
     uint8_t *mv_penalty= c->current_mv_penalty;
+    int saftey_cliping= s->unrestricted_mv && (s->width&15) && (s->height&15);
 
     init_mv4_ref(c);
 
@@ -599,6 +611,11 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
         const int mot_stride = s->b8_stride;
         const int mot_xy = s->block_index[block];
 
+        if(saftey_cliping){
+            c->xmax = - 16*s->mb_x + s->width  - 8*(block &1);
+            c->ymax = - 16*s->mb_y + s->height - 8*(block>>1);
+        }
+
         P_LEFT[0] = s->current_picture.motion_val[0][mot_xy - 1][0];
         P_LEFT[1] = s->current_picture.motion_val[0][mot_xy - 1][1];
 
@@ -626,6 +643,15 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
         }
         P_MV1[0]= mx;
         P_MV1[1]= my;
+        if(saftey_cliping)
+            for(i=1; i<10; i++){
+                if (s->first_slice_line && block<2 && i>1 && i<9)
+                    continue;
+                if (i>4 && i<9)
+                    continue;
+                if(P[i][0] > (c->xmax<<shift)) P[i][0]= (c->xmax<<shift);
+                if(P[i][1] > (c->ymax<<shift)) P[i][1]= (c->ymax<<shift);
+            }
 
         dmin4 = epzs_motion_search4(s, &mx4, &my4, P, block, block, s->p_mv_table, (1<<16)>>shift);
 
@@ -760,8 +786,8 @@ static int interlaced_search(MpegEncContext *s, int ref_index,
             int16_t (*mv_table)[2]= mv_tables[block][field_select];
 
             if(user_field_select){
-                assert(field_select==0 || field_select==1);
-                assert(field_select_tables[block][xy]==0 || field_select_tables[block][xy]==1);
+                av_assert1(field_select==0 || field_select==1);
+                av_assert1(field_select_tables[block][xy]==0 || field_select_tables[block][xy]==1);
                 if(field_select_tables[block][xy] != field_select)
                     continue;
             }
@@ -858,6 +884,10 @@ static inline int get_penalty_factor(int lambda, int lambda2, int type){
         return lambda>>FF_LAMBDA_SHIFT;
     case FF_CMP_DCT:
         return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
+    case FF_CMP_W53:
+        return (4*lambda)>>(FF_LAMBDA_SHIFT);
+    case FF_CMP_W97:
+        return (2*lambda)>>(FF_LAMBDA_SHIFT);
     case FF_CMP_SATD:
     case FF_CMP_DCT264:
         return (2*lambda)>>FF_LAMBDA_SHIFT;
@@ -886,9 +916,9 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
 
     init_ref(c, s->new_picture.f->data, s->last_picture.f->data, NULL, 16*mb_x, 16*mb_y, 0);
 
-    assert(s->quarter_sample==0 || s->quarter_sample==1);
-    assert(s->linesize == c->stride);
-    assert(s->uvlinesize == c->uvstride);
+    av_assert0(s->quarter_sample==0 || s->quarter_sample==1);
+    av_assert0(s->linesize == c->stride);
+    av_assert0(s->uvlinesize == c->uvstride);
 
     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
     c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
@@ -1057,7 +1087,7 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
     const int xy= mb_x + mb_y*s->mb_stride;
     init_ref(c, s->new_picture.f->data, s->last_picture.f->data, NULL, 16*mb_x, 16*mb_y, 0);
 
-    assert(s->quarter_sample==0 || s->quarter_sample==1);
+    av_assert0(s->quarter_sample==0 || s->quarter_sample==1);
 
     c->pre_penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_pre_cmp);
     c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_MV;
@@ -1425,7 +1455,7 @@ static inline int direct_search(MpegEncContext * s, int mb_x, int mb_y)
         if(s->mv_type == MV_TYPE_16X16) break;
     }
 
-    assert(xmax <= 15 && ymax <= 15 && xmin >= -16 && ymin >= -16);
+    av_assert2(xmax <= 15 && ymax <= 15 && xmin >= -16 && ymin >= -16);
 
     if(xmax < 0 || xmin >0 || ymax < 0 || ymin > 0){
         s->b_direct_mv_table[mot_xy][0]= 0;
@@ -1642,12 +1672,12 @@ void ff_fix_long_p_mvs(MpegEncContext * s)
     MotionEstContext * const c= &s->me;
     const int f_code= s->f_code;
     int y, range;
-    assert(s->pict_type==AV_PICTURE_TYPE_P);
+    av_assert0(s->pict_type==AV_PICTURE_TYPE_P);
 
     range = (((s->out_format == FMT_MPEG1 || s->msmpeg4_version) ? 8 : 16) << f_code);
 
-    assert(range <= 16 || !s->msmpeg4_version);
-    assert(range <=256 || !(s->codec_id == AV_CODEC_ID_MPEG2VIDEO && s->avctx->strict_std_compliance >= FF_COMPLIANCE_NORMAL));
+    av_assert0(range <= 16 || !s->msmpeg4_version);
+    av_assert0(range <=256 || !(s->codec_id == AV_CODEC_ID_MPEG2VIDEO && s->avctx->strict_std_compliance >= FF_COMPLIANCE_NORMAL));
 
     if(c->avctx->me_range && range > c->avctx->me_range) range= c->avctx->me_range;
 
diff --git a/libavcodec/motion_est.h b/libavcodec/motion_est.h
index 3b63972d6e..df9e3c7e85 100644
--- a/libavcodec/motion_est.h
+++ b/libavcodec/motion_est.h
@@ -1,20 +1,20 @@
 /*
  * Motion estimation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@
 
 struct MpegEncContext;
 
-#define MAX_MV 2048
+#define MAX_MV 4096
 #define ME_MAP_SIZE 64
 
 #define FF_ME_ZERO 0
@@ -77,8 +77,8 @@ typedef struct MotionEstContext {
     int stride;
     int uvstride;
     /* temp variables for picture complexity calculation */
-    int mc_mb_var_sum_temp;
-    int mb_var_sum_temp;
+    int64_t mc_mb_var_sum_temp;
+    int64_t mb_var_sum_temp;
     int scene_change_score;
 
     op_pixels_func(*hpel_put)[4];
diff --git a/libavcodec/motion_est_template.c b/libavcodec/motion_est_template.c
index 33f3dcb5de..37ff1102ea 100644
--- a/libavcodec/motion_est_template.c
+++ b/libavcodec/motion_est_template.c
@@ -2,20 +2,20 @@
  * Motion estimation
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -91,19 +91,18 @@ static int hpel_motion_search(MpegEncContext * s,
         const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)]
                      + (mv_penalty[bx   - pred_x] + mv_penalty[by+2 - pred_y])*c->penalty_factor;
 
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
         unsigned key;
         unsigned map_generation= c->map_generation;
-#ifndef NDEBUG
-        uint32_t *map= c->map;
-#endif
         key= ((my-1)<<ME_MAP_MV_BITS) + (mx) + map_generation;
-        assert(map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
+        av_assert2(c->map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
         key= ((my+1)<<ME_MAP_MV_BITS) + (mx) + map_generation;
-        assert(map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
+        av_assert2(c->map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
         key= ((my)<<ME_MAP_MV_BITS) + (mx+1) + map_generation;
-        assert(map[(index+1)&(ME_MAP_SIZE-1)] == key);
+        av_assert2(c->map[(index+1)&(ME_MAP_SIZE-1)] == key);
         key= ((my)<<ME_MAP_MV_BITS) + (mx-1) + map_generation;
-        assert(map[(index-1)&(ME_MAP_SIZE-1)] == key);
+        av_assert2(c->map[(index-1)&(ME_MAP_SIZE-1)] == key);
+#endif
         if(t<=b){
             CHECK_HALF_MV(0, 1, mx  ,my-1)
             if(l<=r){
@@ -143,7 +142,7 @@ static int hpel_motion_search(MpegEncContext * s,
             }
             CHECK_HALF_MV(0, 1, mx  , my)
         }
-        assert(bx >= xmin*2 && bx <= xmax*2 && by >= ymin*2 && by <= ymax*2);
+        av_assert2(bx >= xmin*2 && bx <= xmax*2 && by >= ymin*2 && by <= ymax*2);
     }
 
     *mx_ptr = bx;
@@ -181,9 +180,6 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my,
     cmp_sub        = s->mecc.mb_cmp[size];
     chroma_cmp_sub = s->mecc.mb_cmp[size + 1];
 
-//    assert(!c->skip);
-//    assert(c->avctx->me_sub_cmp != c->avctx->mb_cmp);
-
     d= cmp(s, mx>>(qpel+1), my>>(qpel+1), mx&mask, my&mask, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);
     //FIXME check cbp before adding penalty for (0,0) vector
     if(add_rate && (mx || my || size>0))
@@ -310,11 +306,11 @@ static int qpel_motion_search(MpegEncContext * s,
 
             cxy= 2*tl + (cx + cy)/4 - (cx2 + cy2) - 2*c;
 
-            assert(16*cx2 + 4*cx + 32*c == 32*r);
-            assert(16*cx2 - 4*cx + 32*c == 32*l);
-            assert(16*cy2 + 4*cy + 32*c == 32*b);
-            assert(16*cy2 - 4*cy + 32*c == 32*t);
-            assert(16*cxy + 16*cy2 + 16*cx2 - 4*cy - 4*cx + 32*c == 32*tl);
+            av_assert2(16*cx2 + 4*cx + 32*c == 32*r);
+            av_assert2(16*cx2 - 4*cx + 32*c == 32*l);
+            av_assert2(16*cy2 + 4*cy + 32*c == 32*b);
+            av_assert2(16*cy2 - 4*cy + 32*c == 32*t);
+            av_assert2(16*cxy + 16*cy2 + 16*cx2 - 4*cy - 4*cx + 32*c == 32*tl);
 
             for(ny= -3; ny <= 3; ny++){
                 for(nx= -3; nx <= 3; nx++){
@@ -347,7 +343,7 @@ static int qpel_motion_search(MpegEncContext * s,
             CHECK_QUARTER_MV(nx&3, ny&3, nx>>2, ny>>2)
         }
 
-        assert(bx >= xmin*4 && bx <= xmax*4 && by >= ymin*4 && by <= ymax*4);
+        av_assert2(bx >= xmin*4 && bx <= xmax*4 && by >= ymin*4 && by <= ymax*4);
 
         *mx_ptr = bx;
         *my_ptr = by;
@@ -362,17 +358,17 @@ static int qpel_motion_search(MpegEncContext * s,
 
 #define CHECK_MV(x,y)\
 {\
-    const unsigned key = ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
-    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
-    assert((x) >= xmin);\
-    assert((x) <= xmax);\
-    assert((y) >= ymin);\
-    assert((y) <= ymax);\
+    const unsigned key = ((unsigned)(y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((unsigned)(y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    av_assert2((x) >= xmin);\
+    av_assert2((x) <= xmax);\
+    av_assert2((y) >= ymin);\
+    av_assert2((y) <= ymax);\
     if(map[index]!=key){\
         d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
         map[index]= key;\
         score_map[index]= d;\
-        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\
+        d += (mv_penalty[((x)*(1<<shift))-pred_x] + mv_penalty[((y)*(1<<shift))-pred_y])*penalty_factor;\
         COPY3_IF_LT(dmin, d, best[0], x, best[1], y)\
     }\
 }
@@ -388,13 +384,13 @@ static int qpel_motion_search(MpegEncContext * s,
 
 #define CHECK_MV_DIR(x,y,new_dir)\
 {\
-    const unsigned key = ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
-    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    const unsigned key = ((unsigned)(y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((unsigned)(y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
     if(map[index]!=key){\
         d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
         map[index]= key;\
         score_map[index]= d;\
-        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\
+        d += (mv_penalty[(int)((unsigned)(x)<<shift)-pred_x] + mv_penalty[(int)((unsigned)(y)<<shift)-pred_y])*penalty_factor;\
         if(d<dmin){\
             best[0]=x;\
             best[1]=y;\
@@ -405,10 +401,10 @@ static int qpel_motion_search(MpegEncContext * s,
 }
 
 #define check(x,y,S,v)\
-if( (x)<(xmin<<(S)) ) printf("%d %d %d %d %d xmin" #v, xmin, (x), (y), s->mb_x, s->mb_y);\
-if( (x)>(xmax<<(S)) ) printf("%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x, s->mb_y);\
-if( (y)<(ymin<<(S)) ) printf("%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\
-if( (y)>(ymax<<(S)) ) printf("%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\
+if( (x)<(xmin<<(S)) ) av_log(NULL, AV_LOG_ERROR, "%d %d %d %d %d xmin" #v, xmin, (x), (y), s->mb_x, s->mb_y);\
+if( (x)>(xmax<<(S)) ) av_log(NULL, AV_LOG_ERROR, "%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x, s->mb_y);\
+if( (y)<(ymin<<(S)) ) av_log(NULL, AV_LOG_ERROR, "%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\
+if( (y)>(ymax<<(S)) ) av_log(NULL, AV_LOG_ERROR, "%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\
 
 #define LOAD_COMMON2\
     uint32_t *map= c->map;\
@@ -430,8 +426,8 @@ static av_always_inline int small_diamond_search(MpegEncContext * s, int *best,
     chroma_cmpf = s->mecc.me_cmp[size + 1];
 
     { /* ensure that the best point is in the MAP as h/qpel refinement needs it */
-        const unsigned key = (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation;
-        const int index= ((best[1]<<ME_MAP_SHIFT) + best[0])&(ME_MAP_SIZE-1);
+        const unsigned key = ((unsigned)best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation;
+        const int index= (((unsigned)best[1]<<ME_MAP_SHIFT) + best[0])&(ME_MAP_SIZE-1);
         if(map[index]!=key){ //this will be executed only very rarey
             score_map[index]= cmp(s, best[0], best[1], 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);
             map[index]= key;
@@ -693,6 +689,8 @@ static int sab_diamond_search(MpegEncContext * s, int *best, int dmin,
     LOAD_COMMON2
     unsigned map_generation = c->map_generation;
 
+    av_assert1(minima_count <= MAX_SAB_SIZE);
+
     cmpf        = s->mecc.me_cmp[size];
     chroma_cmpf = s->mecc.me_cmp[size + 1];
 
@@ -890,7 +888,7 @@ static av_always_inline int epzs_motion_search_internal(MpegEncContext * s, int
 
     map_generation= update_map_generation(c);
 
-    assert(cmpf);
+    av_assert2(cmpf);
     dmin= cmp(s, 0, 0, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);
     map[0]= map_generation;
     score_map[0]= dmin;
diff --git a/libavcodec/motionpixels.c b/libavcodec/motionpixels.c
index a18541b852..a88b837b3e 100644
--- a/libavcodec/motionpixels.c
+++ b/libavcodec/motionpixels.c
@@ -2,20 +2,20 @@
  * Motion Pixels Video Decoder
  * Copyright (c) 2008 Gregory Montoir (cyx@users.sourceforge.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,13 +69,24 @@ static av_cold int mp_decode_init(AVCodecContext *avctx)
     int w4 = (avctx->width  + 3) & ~3;
     int h4 = (avctx->height + 3) & ~3;
 
+    if(avctx->extradata_size < 2){
+        av_log(avctx, AV_LOG_ERROR, "extradata too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     motionpixels_tableinit();
     mp->avctx = avctx;
     ff_bswapdsp_init(&mp->bdsp);
-    mp->changes_map = av_mallocz(avctx->width * h4);
+    mp->changes_map = av_mallocz_array(avctx->width, h4);
     mp->offset_bits_len = av_log2(avctx->width * avctx->height) + 1;
-    mp->vpt = av_mallocz(avctx->height * sizeof(YuvPixel));
-    mp->hpt = av_mallocz(h4 * w4 / 16 * sizeof(YuvPixel));
+    mp->vpt = av_mallocz_array(avctx->height, sizeof(YuvPixel));
+    mp->hpt = av_mallocz_array(h4 / 4, w4 / 4 * sizeof(YuvPixel));
+    if (!mp->changes_map || !mp->vpt || !mp->hpt) {
+        av_freep(&mp->changes_map);
+        av_freep(&mp->vpt);
+        av_freep(&mp->hpt);
+        return AVERROR(ENOMEM);
+    }
     avctx->pix_fmt = AV_PIX_FMT_RGB555;
 
     mp->frame = av_frame_alloc();
@@ -116,38 +127,48 @@ static void mp_read_changes_map(MotionPixelsContext *mp, GetBitContext *gb, int
     }
 }
 
-static void mp_get_code(MotionPixelsContext *mp, GetBitContext *gb, int size, int code)
+static int mp_get_code(MotionPixelsContext *mp, GetBitContext *gb, int size, int code)
 {
     while (get_bits1(gb)) {
         ++size;
         if (size > mp->max_codes_bits) {
             av_log(mp->avctx, AV_LOG_ERROR, "invalid code size %d/%d\n", size, mp->max_codes_bits);
-            return;
+            return AVERROR_INVALIDDATA;
         }
         code <<= 1;
-        mp_get_code(mp, gb, size, code + 1);
+        if (mp_get_code(mp, gb, size, code + 1) < 0)
+            return AVERROR_INVALIDDATA;
     }
     if (mp->current_codes_count >= MAX_HUFF_CODES) {
         av_log(mp->avctx, AV_LOG_ERROR, "too many codes\n");
-        return;
+        return AVERROR_INVALIDDATA;
     }
+
     mp->codes[mp->current_codes_count  ].code = code;
     mp->codes[mp->current_codes_count++].size = size;
+    return 0;
 }
 
-static void mp_read_codes_table(MotionPixelsContext *mp, GetBitContext *gb)
+static int mp_read_codes_table(MotionPixelsContext *mp, GetBitContext *gb)
 {
     if (mp->codes_count == 1) {
         mp->codes[0].delta = get_bits(gb, 4);
     } else {
         int i;
+        int ret;
 
         mp->max_codes_bits = get_bits(gb, 4);
         for (i = 0; i < mp->codes_count; ++i)
             mp->codes[i].delta = get_bits(gb, 4);
         mp->current_codes_count = 0;
-        mp_get_code(mp, gb, 0, 0);
+        if ((ret = mp_get_code(mp, gb, 0, 0)) < 0)
+            return ret;
+        if (mp->current_codes_count < mp->codes_count) {
+            av_log(mp->avctx, AV_LOG_ERROR, "too few codes\n");
+            return AVERROR_INVALIDDATA;
+        }
    }
+   return 0;
 }
 
 static int mp_gradient(MotionPixelsContext *mp, int component, int v)
@@ -180,7 +201,6 @@ static int mp_get_vlc(MotionPixelsContext *mp, GetBitContext *gb)
     int i;
 
     i = (mp->codes_count == 1) ? 0 : get_vlc2(gb, mp->vlc.table, mp->max_codes_bits, 1);
-    i = FFMIN(i, FF_ARRAY_ELEMS(mp->codes) - 1);
     return mp->codes[i].delta;
 }
 
@@ -236,6 +256,8 @@ static void mp_decode_frame_helper(MotionPixelsContext *mp, GetBitContext *gb)
     YuvPixel p;
     int y, y0;
 
+    av_assert1(mp->changes_map[0]);
+
     for (y = 0; y < mp->avctx->height; ++y) {
         if (mp->changes_map[y * mp->avctx->width] != 0) {
             memset(mp->gradient_scale, 1, sizeof(mp->gradient_scale));
@@ -268,20 +290,17 @@ static int mp_decode_frame(AVCodecContext *avctx,
     GetBitContext gb;
     int i, count1, count2, sz, ret;
 
-    if ((ret = ff_reget_buffer(avctx, mp->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, mp->frame)) < 0)
         return ret;
-    }
 
     /* le32 bitstream msb first */
-    av_fast_malloc(&mp->bswapbuf, &mp->bswapbuf_size, buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&mp->bswapbuf, &mp->bswapbuf_size, buf_size);
     if (!mp->bswapbuf)
         return AVERROR(ENOMEM);
     mp->bdsp.bswap_buf((uint32_t *) mp->bswapbuf, (const uint32_t *) buf,
                        buf_size / 4);
     if (buf_size & 3)
         memcpy(mp->bswapbuf + (buf_size & ~3), buf + (buf_size & ~3), buf_size & 3);
-    memset(mp->bswapbuf + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
     init_get_bits(&gb, mp->bswapbuf, buf_size * 8);
 
     memset(mp->changes_map, 0, avctx->width * avctx->height);
@@ -300,7 +319,8 @@ static int mp_decode_frame(AVCodecContext *avctx,
         *(uint16_t *)mp->frame->data[0] = get_bits(&gb, 15);
         mp->changes_map[0] = 1;
     }
-    mp_read_codes_table(mp, &gb);
+    if (mp_read_codes_table(mp, &gb) < 0)
+        goto end;
 
     sz = get_bits(&gb, 18);
     if (avctx->extradata[0] != 5)
diff --git a/libavcodec/motionpixels_tablegen.c b/libavcodec/motionpixels_tablegen.c
index 410b76fb05..14b7b9bb8c 100644
--- a/libavcodec/motionpixels_tablegen.c
+++ b/libavcodec/motionpixels_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/motionpixels_tablegen.h b/libavcodec/motionpixels_tablegen.h
index e6c32c7dec..4ffe74cbe5 100644
--- a/libavcodec/motionpixels_tablegen.h
+++ b/libavcodec/motionpixels_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,13 +24,14 @@
 #define AVCODEC_MOTIONPIXELS_TABLEGEN_H
 
 #include <stdint.h>
+#include "libavutil/attributes.h"
 
 typedef struct YuvPixel {
     int8_t y, v, u;
 } YuvPixel;
 
 static int mp_yuv_to_rgb(int y, int v, int u, int clip_rgb) {
-    static const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
+    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
     int r, g, b;
 
     r = (1000 * y + 701 * v) / 1000;
@@ -49,7 +50,7 @@ static int mp_yuv_to_rgb(int y, int v, int u, int clip_rgb) {
 #else
 static YuvPixel mp_rgb_yuv_table[1 << 15];
 
-static void mp_set_zero_yuv(YuvPixel *p)
+static av_cold void mp_set_zero_yuv(YuvPixel *p)
 {
     int i, j;
 
@@ -63,7 +64,7 @@ static void mp_set_zero_yuv(YuvPixel *p)
     }
 }
 
-static void mp_build_rgb_yuv_table(YuvPixel *p)
+static av_cold void mp_build_rgb_yuv_table(YuvPixel *p)
 {
     int y, v, u, i;
 
@@ -81,7 +82,7 @@ static void mp_build_rgb_yuv_table(YuvPixel *p)
         mp_set_zero_yuv(p + i * 32);
 }
 
-static void motionpixels_tableinit(void)
+static av_cold void motionpixels_tableinit(void)
 {
     if (!mp_rgb_yuv_table[0].u)
         mp_build_rgb_yuv_table(mp_rgb_yuv_table);
diff --git a/libavcodec/movsub_bsf.c b/libavcodec/movsub_bsf.c
index 0484303840..8ee7a3a42d 100644
--- a/libavcodec/movsub_bsf.c
+++ b/libavcodec/movsub_bsf.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Reimar Döffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,9 +37,8 @@ static int text2movsub(AVBitStreamFilterContext *bsfc, AVCodecContext *avctx, co
 }
 
 AVBitStreamFilter ff_text2movsub_bsf={
-    "text2movsub",
-    0,
-    text2movsub,
+    .name   = "text2movsub",
+    .filter = text2movsub,
 };
 
 static int mov2textsub(AVBitStreamFilterContext *bsfc, AVCodecContext *avctx, const char *args,
@@ -55,7 +54,6 @@ static int mov2textsub(AVBitStreamFilterContext *bsfc, AVCodecContext *avctx, co
 }
 
 AVBitStreamFilter ff_mov2textsub_bsf={
-    "mov2textsub",
-    0,
-    mov2textsub,
+    .name   = "mov2textsub",
+    .filter = mov2textsub,
 };
diff --git a/libavcodec/movtextdec.c b/libavcodec/movtextdec.c
new file mode 100644
index 0000000000..257d5986f5
--- /dev/null
+++ b/libavcodec/movtextdec.c
@@ -0,0 +1,532 @@
+/*
+ * 3GPP TS 26.245 Timed Text decoder
+ * Copyright (c) 2012  Philip Langdale <philipl@overt.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/bprint.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+
+#define STYLE_FLAG_BOLD         (1<<0)
+#define STYLE_FLAG_ITALIC       (1<<1)
+#define STYLE_FLAG_UNDERLINE    (1<<2)
+
+#define BOX_SIZE_INITIAL    40
+
+#define STYL_BOX   (1<<0)
+#define HLIT_BOX   (1<<1)
+#define HCLR_BOX   (1<<2)
+#define TWRP_BOX   (1<<3)
+
+#define BOTTOM_LEFT     1
+#define BOTTOM_CENTER   2
+#define BOTTOM_RIGHT    3
+#define MIDDLE_LEFT     4
+#define MIDDLE_CENTER   5
+#define MIDDLE_RIGHT    6
+#define TOP_LEFT        7
+#define TOP_CENTER      8
+#define TOP_RIGHT       9
+
+typedef struct {
+    char *font;
+    int fontsize;
+    int color;
+    int back_color;
+    int bold;
+    int italic;
+    int underline;
+    int alignment;
+} MovTextDefault;
+
+typedef struct {
+    uint16_t fontID;
+    char *font;
+} FontRecord;
+
+typedef struct {
+    uint16_t style_start;
+    uint16_t style_end;
+    uint8_t style_flag;
+    uint8_t fontsize;
+    uint16_t style_fontID;
+} StyleBox;
+
+typedef struct {
+    uint16_t hlit_start;
+    uint16_t hlit_end;
+} HighlightBox;
+
+typedef struct {
+   uint8_t hlit_color[4];
+} HilightcolorBox;
+
+typedef struct {
+    uint8_t wrap_flag;
+} TextWrapBox;
+
+typedef struct {
+    StyleBox **s;
+    StyleBox *s_temp;
+    HighlightBox h;
+    HilightcolorBox c;
+    FontRecord **ftab;
+    FontRecord *ftab_temp;
+    TextWrapBox w;
+    MovTextDefault d;
+    uint8_t box_flags;
+    uint16_t style_entries, ftab_entries;
+    uint64_t tracksize;
+    int size_var;
+    int count_s, count_f;
+} MovTextContext;
+
+typedef struct {
+    uint32_t type;
+    size_t base_size;
+    int (*decode)(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt);
+} Box;
+
+static void mov_text_cleanup(MovTextContext *m)
+{
+    int i;
+    if (m->box_flags & STYL_BOX) {
+        for(i = 0; i < m->count_s; i++) {
+            av_freep(&m->s[i]);
+        }
+        av_freep(&m->s);
+    }
+}
+
+static void mov_text_cleanup_ftab(MovTextContext *m)
+{
+    int i;
+    if (m->ftab_temp)
+        av_freep(&m->ftab_temp->font);
+    av_freep(&m->ftab_temp);
+    if (m->ftab) {
+        for(i = 0; i < m->count_f; i++) {
+            av_freep(&m->ftab[i]->font);
+            av_freep(&m->ftab[i]);
+        }
+    }
+    av_freep(&m->ftab);
+}
+
+static int mov_text_tx3g(AVCodecContext *avctx, MovTextContext *m)
+{
+    uint8_t *tx3g_ptr = avctx->extradata;
+    int i, box_size, font_length;
+    int8_t v_align, h_align;
+    int style_fontID;
+    StyleBox s_default;
+
+    m->count_f = 0;
+    m->ftab_entries = 0;
+    box_size = BOX_SIZE_INITIAL; /* Size till ftab_entries */
+    if (avctx->extradata_size < box_size)
+        return -1;
+
+    // Display Flags
+    tx3g_ptr += 4;
+    // Alignment
+    h_align = *tx3g_ptr++;
+    v_align = *tx3g_ptr++;
+    if (h_align == 0) {
+        if (v_align == 0)
+            m->d.alignment = TOP_LEFT;
+        if (v_align == 1)
+            m->d.alignment = MIDDLE_LEFT;
+        if (v_align == -1)
+            m->d.alignment = BOTTOM_LEFT;
+    }
+    if (h_align == 1) {
+        if (v_align == 0)
+            m->d.alignment = TOP_CENTER;
+        if (v_align == 1)
+            m->d.alignment = MIDDLE_CENTER;
+        if (v_align == -1)
+            m->d.alignment = BOTTOM_CENTER;
+    }
+    if (h_align == -1) {
+        if (v_align == 0)
+            m->d.alignment = TOP_RIGHT;
+        if (v_align == 1)
+            m->d.alignment = MIDDLE_RIGHT;
+        if (v_align == -1)
+            m->d.alignment = BOTTOM_RIGHT;
+    }
+    // Background Color
+    m->d.back_color = AV_RB24(tx3g_ptr);
+    tx3g_ptr += 4;
+    // BoxRecord
+    tx3g_ptr += 8;
+    // StyleRecord
+    tx3g_ptr += 4;
+    // fontID
+    style_fontID = AV_RB16(tx3g_ptr);
+    tx3g_ptr += 2;
+    // face-style-flags
+    s_default.style_flag = *tx3g_ptr++;
+    m->d.bold = s_default.style_flag & STYLE_FLAG_BOLD;
+    m->d.italic = s_default.style_flag & STYLE_FLAG_ITALIC;
+    m->d.underline = s_default.style_flag & STYLE_FLAG_UNDERLINE;
+    // fontsize
+    m->d.fontsize = *tx3g_ptr++;
+    // Primary color
+    m->d.color = AV_RB24(tx3g_ptr);
+    tx3g_ptr += 4;
+    // FontRecord
+    // FontRecord Size
+    tx3g_ptr += 4;
+    // ftab
+    tx3g_ptr += 4;
+
+    m->ftab_entries = AV_RB16(tx3g_ptr);
+    tx3g_ptr += 2;
+
+    for (i = 0; i < m->ftab_entries; i++) {
+
+        box_size += 3;
+        if (avctx->extradata_size < box_size) {
+            mov_text_cleanup_ftab(m);
+            m->ftab_entries = 0;
+            return -1;
+        }
+        m->ftab_temp = av_mallocz(sizeof(*m->ftab_temp));
+        if (!m->ftab_temp) {
+            mov_text_cleanup_ftab(m);
+            return AVERROR(ENOMEM);
+        }
+        m->ftab_temp->fontID = AV_RB16(tx3g_ptr);
+        tx3g_ptr += 2;
+        font_length = *tx3g_ptr++;
+
+        box_size = box_size + font_length;
+        if (avctx->extradata_size < box_size) {
+            mov_text_cleanup_ftab(m);
+            m->ftab_entries = 0;
+            return -1;
+        }
+        m->ftab_temp->font = av_malloc(font_length + 1);
+        if (!m->ftab_temp->font) {
+            mov_text_cleanup_ftab(m);
+            return AVERROR(ENOMEM);
+        }
+        memcpy(m->ftab_temp->font, tx3g_ptr, font_length);
+        m->ftab_temp->font[font_length] = '\0';
+        av_dynarray_add(&m->ftab, &m->count_f, m->ftab_temp);
+        if (!m->ftab) {
+            mov_text_cleanup_ftab(m);
+            return AVERROR(ENOMEM);
+        }
+        m->ftab_temp = NULL;
+        tx3g_ptr = tx3g_ptr + font_length;
+    }
+    for (i = 0; i < m->ftab_entries; i++) {
+        if (style_fontID == m->ftab[i]->fontID)
+            m->d.font = m->ftab[i]->font;
+    }
+    return 0;
+}
+
+static int decode_twrp(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
+{
+    m->box_flags |= TWRP_BOX;
+    m->w.wrap_flag = *tsmb++;
+    return 0;
+}
+
+static int decode_hlit(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
+{
+    m->box_flags |= HLIT_BOX;
+    m->h.hlit_start = AV_RB16(tsmb);
+    tsmb += 2;
+    m->h.hlit_end = AV_RB16(tsmb);
+    tsmb += 2;
+    return 0;
+}
+
+static int decode_hclr(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
+{
+    m->box_flags |= HCLR_BOX;
+    memcpy(m->c.hlit_color, tsmb, 4);
+    tsmb += 4;
+    return 0;
+}
+
+static int decode_styl(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
+{
+    int i;
+    m->style_entries = AV_RB16(tsmb);
+    tsmb += 2;
+    // A single style record is of length 12 bytes.
+    if (m->tracksize + m->size_var + 2 + m->style_entries * 12 > avpkt->size)
+        return -1;
+
+    m->box_flags |= STYL_BOX;
+    for(i = 0; i < m->style_entries; i++) {
+        m->s_temp = av_malloc(sizeof(*m->s_temp));
+        if (!m->s_temp) {
+            mov_text_cleanup(m);
+            return AVERROR(ENOMEM);
+        }
+        m->s_temp->style_start = AV_RB16(tsmb);
+        tsmb += 2;
+        m->s_temp->style_end = AV_RB16(tsmb);
+        tsmb += 2;
+        m->s_temp->style_fontID = AV_RB16(tsmb);
+        tsmb += 2;
+        m->s_temp->style_flag = AV_RB8(tsmb);
+        tsmb++;
+        m->s_temp->fontsize = AV_RB8(tsmb);
+        av_dynarray_add(&m->s, &m->count_s, m->s_temp);
+        if(!m->s) {
+            mov_text_cleanup(m);
+            return AVERROR(ENOMEM);
+        }
+        tsmb++;
+        // text-color-rgba
+        tsmb += 4;
+    }
+    return 0;
+}
+
+static const Box box_types[] = {
+    { MKBETAG('s','t','y','l'), 2, decode_styl },
+    { MKBETAG('h','l','i','t'), 4, decode_hlit },
+    { MKBETAG('h','c','l','r'), 4, decode_hclr },
+    { MKBETAG('t','w','r','p'), 1, decode_twrp }
+};
+
+const static size_t box_count = FF_ARRAY_ELEMS(box_types);
+
+static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
+                        MovTextContext *m)
+{
+    int i = 0;
+    int j = 0;
+    int text_pos = 0;
+
+    if (text < text_end && m->box_flags & TWRP_BOX) {
+        if (m->w.wrap_flag == 1) {
+            av_bprintf(buf, "{\\q1}"); /* End of line wrap */
+        } else {
+            av_bprintf(buf, "{\\q2}"); /* No wrap */
+        }
+    }
+
+    while (text < text_end) {
+        if (m->box_flags & STYL_BOX) {
+            for (i = 0; i < m->style_entries; i++) {
+                if (m->s[i]->style_flag && text_pos == m->s[i]->style_end) {
+                    av_bprintf(buf, "{\\r}");
+                }
+            }
+            for (i = 0; i < m->style_entries; i++) {
+                if (m->s[i]->style_flag && text_pos == m->s[i]->style_start) {
+                    if (m->s[i]->style_flag & STYLE_FLAG_BOLD)
+                        av_bprintf(buf, "{\\b1}");
+                    if (m->s[i]->style_flag & STYLE_FLAG_ITALIC)
+                        av_bprintf(buf, "{\\i1}");
+                    if (m->s[i]->style_flag & STYLE_FLAG_UNDERLINE)
+                        av_bprintf(buf, "{\\u1}");
+                    av_bprintf(buf, "{\\fs%d}", m->s[i]->fontsize);
+                    for (j = 0; j < m->ftab_entries; j++) {
+                        if (m->s[i]->style_fontID == m->ftab[j]->fontID)
+                            av_bprintf(buf, "{\\fn%s}", m->ftab[j]->font);
+                    }
+                }
+            }
+        }
+        if (m->box_flags & HLIT_BOX) {
+            if (text_pos == m->h.hlit_start) {
+                /* If hclr box is present, set the secondary color to the color
+                 * specified. Otherwise, set primary color to white and secondary
+                 * color to black. These colors will come from TextSampleModifier
+                 * boxes in future and inverse video technique for highlight will
+                 * be implemented.
+                 */
+                if (m->box_flags & HCLR_BOX) {
+                    av_bprintf(buf, "{\\2c&H%02x%02x%02x&}", m->c.hlit_color[2],
+                                m->c.hlit_color[1], m->c.hlit_color[0]);
+                } else {
+                    av_bprintf(buf, "{\\1c&H000000&}{\\2c&HFFFFFF&}");
+                }
+            }
+            if (text_pos == m->h.hlit_end) {
+                if (m->box_flags & HCLR_BOX) {
+                    av_bprintf(buf, "{\\2c&H000000&}");
+                } else {
+                    av_bprintf(buf, "{\\1c&HFFFFFF&}{\\2c&H000000&}");
+                }
+            }
+        }
+
+        switch (*text) {
+        case '\r':
+            break;
+        case '\n':
+            av_bprintf(buf, "\\N");
+            break;
+        default:
+            av_bprint_chars(buf, *text, 1);
+            break;
+        }
+        text++;
+        text_pos++;
+    }
+
+    return 0;
+}
+
+static int mov_text_init(AVCodecContext *avctx) {
+    /*
+     * TODO: Handle the default text style.
+     * NB: Most players ignore styles completely, with the result that
+     * it's very common to find files where the default style is broken
+     * and respecting it results in a worse experience than ignoring it.
+     */
+    int ret;
+    MovTextContext *m = avctx->priv_data;
+    ret = mov_text_tx3g(avctx, m);
+    if (ret == 0) {
+        return ff_ass_subtitle_header(avctx, m->d.font, m->d.fontsize, m->d.color,
+                                m->d.back_color, m->d.bold, m->d.italic,
+                                m->d.underline, m->d.alignment);
+    } else
+        return ff_ass_subtitle_header_default(avctx);
+}
+
+static int mov_text_decode_frame(AVCodecContext *avctx,
+                            void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    AVSubtitle *sub = data;
+    MovTextContext *m = avctx->priv_data;
+    int ret, ts_start, ts_end;
+    AVBPrint buf;
+    char *ptr = avpkt->data;
+    char *end;
+    int text_length, tsmb_type, ret_tsmb;
+    uint64_t tsmb_size;
+    const uint8_t *tsmb;
+
+    if (!ptr || avpkt->size < 2)
+        return AVERROR_INVALIDDATA;
+
+    /*
+     * A packet of size two with value zero is an empty subtitle
+     * used to mark the end of the previous non-empty subtitle.
+     * We can just drop them here as we have duration information
+     * already. If the value is non-zero, then it's technically a
+     * bad packet.
+     */
+    if (avpkt->size == 2)
+        return AV_RB16(ptr) == 0 ? 0 : AVERROR_INVALIDDATA;
+
+    /*
+     * The first two bytes of the packet are the length of the text string
+     * In complex cases, there are style descriptors appended to the string
+     * so we can't just assume the packet size is the string size.
+     */
+    text_length = AV_RB16(ptr);
+    end = ptr + FFMIN(2 + text_length, avpkt->size);
+    ptr += 2;
+
+    ts_start = av_rescale_q(avpkt->pts,
+                            avctx->time_base,
+                            (AVRational){1,100});
+    ts_end   = av_rescale_q(avpkt->pts + avpkt->duration,
+                            avctx->time_base,
+                            (AVRational){1,100});
+
+    tsmb_size = 0;
+    m->tracksize = 2 + text_length;
+    m->style_entries = 0;
+    m->box_flags = 0;
+    m->count_s = 0;
+    // Note that the spec recommends lines be no longer than 2048 characters.
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (text_length + 2 != avpkt->size) {
+        while (m->tracksize + 8 <= avpkt->size) {
+            // A box is a minimum of 8 bytes.
+            tsmb = ptr + m->tracksize - 2;
+            tsmb_size = AV_RB32(tsmb);
+            tsmb += 4;
+            tsmb_type = AV_RB32(tsmb);
+            tsmb += 4;
+
+            if (tsmb_size == 1) {
+                if (m->tracksize + 16 > avpkt->size)
+                    break;
+                tsmb_size = AV_RB64(tsmb);
+                tsmb += 8;
+                m->size_var = 16;
+            } else
+                m->size_var = 8;
+            //size_var is equal to 8 or 16 depending on the size of box
+
+            if (m->tracksize + tsmb_size > avpkt->size)
+                break;
+
+            for (size_t i = 0; i < box_count; i++) {
+                if (tsmb_type == box_types[i].type) {
+                    if (m->tracksize + m->size_var + box_types[i].base_size > avpkt->size)
+                        break;
+                    ret_tsmb = box_types[i].decode(tsmb, m, avpkt);
+                    if (ret_tsmb == -1)
+                        break;
+                }
+            }
+            m->tracksize = m->tracksize + tsmb_size;
+        }
+        text_to_ass(&buf, ptr, end, m);
+        mov_text_cleanup(m);
+    } else
+        text_to_ass(&buf, ptr, end, m);
+
+    ret = ff_ass_add_rect_bprint(sub, &buf, ts_start, ts_end - ts_start);
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+static int mov_text_decode_close(AVCodecContext *avctx)
+{
+    MovTextContext *m = avctx->priv_data;
+    mov_text_cleanup_ftab(m);
+    return 0;
+}
+
+AVCodec ff_movtext_decoder = {
+    .name         = "mov_text",
+    .long_name    = NULL_IF_CONFIG_SMALL("3GPP Timed Text subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_MOV_TEXT,
+    .priv_data_size = sizeof(MovTextContext),
+    .init         = mov_text_init,
+    .decode       = mov_text_decode_frame,
+    .close        = mov_text_decode_close,
+};
diff --git a/libavcodec/movtextenc.c b/libavcodec/movtextenc.c
new file mode 100644
index 0000000000..6d42d5f351
--- /dev/null
+++ b/libavcodec/movtextenc.c
@@ -0,0 +1,395 @@
+/*
+ * 3GPP TS 26.245 Timed Text encoder
+ * Copyright (c) 2012  Philip Langdale <philipl@overt.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdarg.h>
+#include "avcodec.h"
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+#include "libavutil/common.h"
+#include "ass_split.h"
+#include "ass.h"
+
+#define STYLE_FLAG_BOLD         (1<<0)
+#define STYLE_FLAG_ITALIC       (1<<1)
+#define STYLE_FLAG_UNDERLINE    (1<<2)
+#define STYLE_RECORD_SIZE       12
+#define SIZE_ADD                10
+
+#define STYL_BOX   (1<<0)
+#define HLIT_BOX   (1<<1)
+#define HCLR_BOX   (1<<2)
+
+#define av_bprint_append_any(buf, data, size)   av_bprint_append_data(buf, ((const char*)data), size)
+
+typedef struct {
+    uint16_t style_start;
+    uint16_t style_end;
+    uint8_t style_flag;
+} StyleBox;
+
+typedef struct {
+    uint16_t start;
+    uint16_t end;
+} HighlightBox;
+
+typedef struct {
+   uint32_t color;
+} HilightcolorBox;
+
+typedef struct {
+    ASSSplitContext *ass_ctx;
+    AVBPrint buffer;
+    StyleBox **style_attributes;
+    StyleBox *style_attributes_temp;
+    HighlightBox hlit;
+    HilightcolorBox hclr;
+    int count;
+    uint8_t box_flags;
+    uint16_t style_entries;
+    uint16_t style_fontID;
+    uint8_t style_fontsize;
+    uint32_t style_color;
+    uint16_t text_pos;
+} MovTextContext;
+
+typedef struct {
+    uint32_t type;
+    void (*encode)(MovTextContext *s, uint32_t tsmb_type);
+} Box;
+
+static void mov_text_cleanup(MovTextContext *s)
+{
+    int j;
+    if (s->box_flags & STYL_BOX) {
+        for (j = 0; j < s->count; j++) {
+            av_freep(&s->style_attributes[j]);
+        }
+        av_freep(&s->style_attributes);
+    }
+}
+
+static void encode_styl(MovTextContext *s, uint32_t tsmb_type)
+{
+    int j;
+    uint32_t tsmb_size;
+    if (s->box_flags & STYL_BOX) {
+        tsmb_size = s->count * STYLE_RECORD_SIZE + SIZE_ADD;
+        tsmb_size = AV_RB32(&tsmb_size);
+        s->style_entries = AV_RB16(&s->count);
+        s->style_fontID = 0x00 | 0x01<<8;
+        s->style_fontsize = 0x12;
+        s->style_color = MKTAG(0xFF, 0xFF, 0xFF, 0xFF);
+        /*The above three attributes are hard coded for now
+        but will come from ASS style in the future*/
+        av_bprint_append_any(&s->buffer, &tsmb_size, 4);
+        av_bprint_append_any(&s->buffer, &tsmb_type, 4);
+        av_bprint_append_any(&s->buffer, &s->style_entries, 2);
+        for (j = 0; j < s->count; j++) {
+            av_bprint_append_any(&s->buffer, &s->style_attributes[j]->style_start, 2);
+            av_bprint_append_any(&s->buffer, &s->style_attributes[j]->style_end, 2);
+            av_bprint_append_any(&s->buffer, &s->style_fontID, 2);
+            av_bprint_append_any(&s->buffer, &s->style_attributes[j]->style_flag, 1);
+            av_bprint_append_any(&s->buffer, &s->style_fontsize, 1);
+            av_bprint_append_any(&s->buffer, &s->style_color, 4);
+        }
+        mov_text_cleanup(s);
+    }
+}
+
+static void encode_hlit(MovTextContext *s, uint32_t tsmb_type)
+{
+    uint32_t tsmb_size;
+    if (s->box_flags & HLIT_BOX) {
+        tsmb_size = 12;
+        tsmb_size = AV_RB32(&tsmb_size);
+        av_bprint_append_any(&s->buffer, &tsmb_size, 4);
+        av_bprint_append_any(&s->buffer, &tsmb_type, 4);
+        av_bprint_append_any(&s->buffer, &s->hlit.start, 2);
+        av_bprint_append_any(&s->buffer, &s->hlit.end, 2);
+    }
+}
+
+static void encode_hclr(MovTextContext *s, uint32_t tsmb_type)
+{
+    uint32_t tsmb_size;
+    if (s->box_flags & HCLR_BOX) {
+        tsmb_size = 12;
+        tsmb_size = AV_RB32(&tsmb_size);
+        av_bprint_append_any(&s->buffer, &tsmb_size, 4);
+        av_bprint_append_any(&s->buffer, &tsmb_type, 4);
+        av_bprint_append_any(&s->buffer, &s->hclr.color, 4);
+    }
+}
+
+static const Box box_types[] = {
+    { MKTAG('s','t','y','l'), encode_styl },
+    { MKTAG('h','l','i','t'), encode_hlit },
+    { MKTAG('h','c','l','r'), encode_hclr },
+};
+
+const static size_t box_count = FF_ARRAY_ELEMS(box_types);
+
+static av_cold int mov_text_encode_init(AVCodecContext *avctx)
+{
+    /*
+     * For now, we'll use a fixed default style. When we add styling
+     * support, this will be generated from the ASS style.
+     */
+    static const uint8_t text_sample_entry[] = {
+        0x00, 0x00, 0x00, 0x00, // uint32_t displayFlags
+        0x01,                   // int8_t horizontal-justification
+        0xFF,                   // int8_t vertical-justification
+        0x00, 0x00, 0x00, 0x00, // uint8_t background-color-rgba[4]
+        // BoxRecord {
+        0x00, 0x00,             // int16_t top
+        0x00, 0x00,             // int16_t left
+        0x00, 0x00,             // int16_t bottom
+        0x00, 0x00,             // int16_t right
+        // };
+        // StyleRecord {
+        0x00, 0x00,             // uint16_t startChar
+        0x00, 0x00,             // uint16_t endChar
+        0x00, 0x01,             // uint16_t font-ID
+        0x00,                   // uint8_t face-style-flags
+        0x12,                   // uint8_t font-size
+        0xFF, 0xFF, 0xFF, 0xFF, // uint8_t text-color-rgba[4]
+        // };
+        // FontTableBox {
+        0x00, 0x00, 0x00, 0x12, // uint32_t size
+        'f', 't', 'a', 'b',     // uint8_t name[4]
+        0x00, 0x01,             // uint16_t entry-count
+        // FontRecord {
+        0x00, 0x01,             // uint16_t font-ID
+        0x05,                   // uint8_t font-name-length
+        'S', 'e', 'r', 'i', 'f',// uint8_t font[font-name-length]
+        // };
+        // };
+    };
+
+    MovTextContext *s = avctx->priv_data;
+
+    avctx->extradata_size = sizeof text_sample_entry;
+    avctx->extradata = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!avctx->extradata)
+        return AVERROR(ENOMEM);
+
+    av_bprint_init(&s->buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    memcpy(avctx->extradata, text_sample_entry, avctx->extradata_size);
+
+    s->ass_ctx = ff_ass_split(avctx->subtitle_header);
+    return s->ass_ctx ? 0 : AVERROR_INVALIDDATA;
+}
+
+static void mov_text_style_cb(void *priv, const char style, int close)
+{
+    MovTextContext *s = priv;
+    if (!close) {
+        if (!(s->box_flags & STYL_BOX)) {   //first style entry
+
+            s->style_attributes_temp = av_malloc(sizeof(*s->style_attributes_temp));
+
+            if (!s->style_attributes_temp) {
+                av_bprint_clear(&s->buffer);
+                s->box_flags &= ~STYL_BOX;
+                return;
+            }
+
+            s->style_attributes_temp->style_flag = 0;
+            s->style_attributes_temp->style_start = AV_RB16(&s->text_pos);
+        } else {
+            if (s->style_attributes_temp->style_flag) { //break the style record here and start a new one
+                s->style_attributes_temp->style_end = AV_RB16(&s->text_pos);
+                av_dynarray_add(&s->style_attributes, &s->count, s->style_attributes_temp);
+                s->style_attributes_temp = av_malloc(sizeof(*s->style_attributes_temp));
+                if (!s->style_attributes_temp) {
+                    mov_text_cleanup(s);
+                    av_bprint_clear(&s->buffer);
+                    s->box_flags &= ~STYL_BOX;
+                    return;
+                }
+
+                s->style_attributes_temp->style_flag = s->style_attributes[s->count - 1]->style_flag;
+                s->style_attributes_temp->style_start = AV_RB16(&s->text_pos);
+            } else {
+                s->style_attributes_temp->style_flag = 0;
+                s->style_attributes_temp->style_start = AV_RB16(&s->text_pos);
+            }
+        }
+        switch (style){
+        case 'b':
+            s->style_attributes_temp->style_flag |= STYLE_FLAG_BOLD;
+            break;
+        case 'i':
+            s->style_attributes_temp->style_flag |= STYLE_FLAG_ITALIC;
+            break;
+        case 'u':
+            s->style_attributes_temp->style_flag |= STYLE_FLAG_UNDERLINE;
+            break;
+        }
+    } else {
+        s->style_attributes_temp->style_end = AV_RB16(&s->text_pos);
+        av_dynarray_add(&s->style_attributes, &s->count, s->style_attributes_temp);
+
+        s->style_attributes_temp = av_malloc(sizeof(*s->style_attributes_temp));
+
+        if (!s->style_attributes_temp) {
+            mov_text_cleanup(s);
+            av_bprint_clear(&s->buffer);
+            s->box_flags &= ~STYL_BOX;
+            return;
+        }
+
+        s->style_attributes_temp->style_flag = s->style_attributes[s->count - 1]->style_flag;
+        switch (style){
+        case 'b':
+            s->style_attributes_temp->style_flag &= ~STYLE_FLAG_BOLD;
+            break;
+        case 'i':
+            s->style_attributes_temp->style_flag &= ~STYLE_FLAG_ITALIC;
+            break;
+        case 'u':
+            s->style_attributes_temp->style_flag &= ~STYLE_FLAG_UNDERLINE;
+            break;
+        }
+        if (s->style_attributes_temp->style_flag) { //start of new style record
+            s->style_attributes_temp->style_start = AV_RB16(&s->text_pos);
+        }
+    }
+    s->box_flags |= STYL_BOX;
+}
+
+static void mov_text_color_cb(void *priv, unsigned int color, unsigned int color_id)
+{
+    MovTextContext *s = priv;
+    if (color_id == 2) {    //secondary color changes
+        if (s->box_flags & HLIT_BOX) {  //close tag
+            s->hlit.end = AV_RB16(&s->text_pos);
+        } else {
+            s->box_flags |= HCLR_BOX;
+            s->box_flags |= HLIT_BOX;
+            s->hlit.start = AV_RB16(&s->text_pos);
+            s->hclr.color = color | (0xFF << 24);  //set alpha value to FF
+        }
+    }
+    /* If there are more than one secondary color changes in ASS, take start of
+       first section and end of last section. Movtext allows only one
+       highlight box per sample.
+     */
+}
+
+static void mov_text_text_cb(void *priv, const char *text, int len)
+{
+    MovTextContext *s = priv;
+    av_bprint_append_data(&s->buffer, text, len);
+    s->text_pos += len;
+}
+
+static void mov_text_new_line_cb(void *priv, int forced)
+{
+    MovTextContext *s = priv;
+    av_bprint_append_data(&s->buffer, "\n", 1);
+    s->text_pos += 1;
+}
+
+static const ASSCodesCallbacks mov_text_callbacks = {
+    .text     = mov_text_text_cb,
+    .new_line = mov_text_new_line_cb,
+    .style    = mov_text_style_cb,
+    .color    = mov_text_color_cb,
+};
+
+static int mov_text_encode_frame(AVCodecContext *avctx, unsigned char *buf,
+                                 int bufsize, const AVSubtitle *sub)
+{
+    MovTextContext *s = avctx->priv_data;
+    ASSDialog *dialog;
+    int i, num, length;
+    size_t j;
+
+    s->text_pos = 0;
+    s->count = 0;
+    s->box_flags = 0;
+    s->style_entries = 0;
+    for (i = 0; i < sub->num_rects; i++) {
+
+        if (sub->rects[i]->type != SUBTITLE_ASS) {
+            av_log(avctx, AV_LOG_ERROR, "Only SUBTITLE_ASS type supported.\n");
+            return AVERROR(ENOSYS);
+        }
+
+        dialog = ff_ass_split_dialog(s->ass_ctx, sub->rects[i]->ass, 0, &num);
+        for (; dialog && num--; dialog++) {
+            ff_ass_split_override_codes(&mov_text_callbacks, s, dialog->text);
+        }
+
+        for (j = 0; j < box_count; j++) {
+            box_types[j].encode(s, box_types[j].type);
+        }
+    }
+
+    AV_WB16(buf, s->text_pos);
+    buf += 2;
+
+    if (!av_bprint_is_complete(&s->buffer)) {
+        length = AVERROR(ENOMEM);
+        goto exit;
+    }
+
+    if (!s->buffer.len) {
+        length = 0;
+        goto exit;
+    }
+
+    if (s->buffer.len > bufsize - 3) {
+        av_log(avctx, AV_LOG_ERROR, "Buffer too small for ASS event.\n");
+        length = AVERROR(EINVAL);
+        goto exit;
+    }
+
+    memcpy(buf, s->buffer.str, s->buffer.len);
+    length = s->buffer.len + 2;
+
+exit:
+    av_bprint_clear(&s->buffer);
+    return length;
+}
+
+static int mov_text_encode_close(AVCodecContext *avctx)
+{
+    MovTextContext *s = avctx->priv_data;
+    ff_ass_split_free(s->ass_ctx);
+    av_bprint_finalize(&s->buffer, NULL);
+    return 0;
+}
+
+AVCodec ff_movtext_encoder = {
+    .name           = "mov_text",
+    .long_name      = NULL_IF_CONFIG_SMALL("3GPP Timed Text subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_MOV_TEXT,
+    .priv_data_size = sizeof(MovTextContext),
+    .init           = mov_text_encode_init,
+    .encode_sub     = mov_text_encode_frame,
+    .close          = mov_text_encode_close,
+};
diff --git a/libavcodec/mp3_header_decompress_bsf.c b/libavcodec/mp3_header_decompress_bsf.c
new file mode 100644
index 0000000000..95c0b5b769
--- /dev/null
+++ b/libavcodec/mp3_header_decompress_bsf.c
@@ -0,0 +1,97 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "mpegaudiodecheader.h"
+#include "mpegaudiodata.h"
+
+
+static int mp3_header_decompress(AVBitStreamFilterContext *bsfc, AVCodecContext *avctx, const char *args,
+                     uint8_t **poutbuf, int *poutbuf_size,
+                     const uint8_t *buf, int buf_size, int keyframe){
+    uint32_t header;
+    int sample_rate= avctx->sample_rate;
+    int sample_rate_index=0;
+    int lsf, mpeg25, bitrate_index, frame_size;
+
+    header = AV_RB32(buf);
+    if(ff_mpa_check_header(header) >= 0){
+        *poutbuf= (uint8_t *) buf;
+        *poutbuf_size= buf_size;
+
+        return 0;
+    }
+
+    if(avctx->extradata_size != 15 || strcmp(avctx->extradata, "FFCMP3 0.0")){
+        av_log(avctx, AV_LOG_ERROR, "Extradata invalid %d\n", avctx->extradata_size);
+        return -1;
+    }
+
+    header= AV_RB32(avctx->extradata+11) & MP3_MASK;
+
+    lsf     = sample_rate < (24000+32000)/2;
+    mpeg25  = sample_rate < (12000+16000)/2;
+    sample_rate_index= (header>>10)&3;
+    sample_rate= avpriv_mpa_freq_tab[sample_rate_index] >> (lsf + mpeg25); //in case sample rate is a little off
+
+    for(bitrate_index=2; bitrate_index<30; bitrate_index++){
+        frame_size = avpriv_mpa_bitrate_tab[lsf][2][bitrate_index>>1];
+        frame_size = (frame_size * 144000) / (sample_rate << lsf) + (bitrate_index&1);
+        if(frame_size == buf_size + 4)
+            break;
+        if(frame_size == buf_size + 6)
+            break;
+    }
+    if(bitrate_index == 30){
+        av_log(avctx, AV_LOG_ERROR, "Could not find bitrate_index.\n");
+        return -1;
+    }
+
+    header |= (bitrate_index&1)<<9;
+    header |= (bitrate_index>>1)<<12;
+    header |= (frame_size == buf_size + 4)<<16; //FIXME actually set a correct crc instead of 0
+
+    *poutbuf_size= frame_size;
+    *poutbuf= av_malloc(frame_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    memcpy(*poutbuf + frame_size - buf_size, buf, buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+
+    if(avctx->channels==2){
+        uint8_t *p= *poutbuf + frame_size - buf_size;
+        if(lsf){
+            FFSWAP(int, p[1], p[2]);
+            header |= (p[1] & 0xC0)>>2;
+            p[1] &= 0x3F;
+        }else{
+            header |= p[1] & 0x30;
+            p[1] &= 0xCF;
+        }
+    }
+
+    AV_WB32(*poutbuf, header);
+
+    return 1;
+}
+
+AVBitStreamFilter ff_mp3_header_decompress_bsf={
+    .name   = "mp3decomp",
+    .filter = mp3_header_decompress,
+};
diff --git a/libavcodec/mpc.c b/libavcodec/mpc.c
index 763ea2c058..7af30bd0e1 100644
--- a/libavcodec/mpc.c
+++ b/libavcodec/mpc.c
@@ -2,20 +2,20 @@
  * Musepack decoder core
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -74,13 +74,13 @@ void ff_mpc_dequantize_and_synth(MPCContext * c, int maxband, int16_t **out,
         for(ch = 0; ch < 2; ch++){
             if(bands[i].res[ch]){
                 j = 0;
-                mul = mpc_CC[bands[i].res[ch] + 1] * mpc_SCF[bands[i].scf_idx[ch][0]+6];
+                mul = (mpc_CC+1)[bands[i].res[ch]] * mpc_SCF[bands[i].scf_idx[ch][0] & 0xFF];
                 for(; j < 12; j++)
                     c->sb_samples[ch][j][i] = mul * c->Q[ch][j + off];
-                mul = mpc_CC[bands[i].res[ch] + 1] * mpc_SCF[bands[i].scf_idx[ch][1]+6];
+                mul = (mpc_CC+1)[bands[i].res[ch]] * mpc_SCF[bands[i].scf_idx[ch][1] & 0xFF];
                 for(; j < 24; j++)
                     c->sb_samples[ch][j][i] = mul * c->Q[ch][j + off];
-                mul = mpc_CC[bands[i].res[ch] + 1] * mpc_SCF[bands[i].scf_idx[ch][2]+6];
+                mul = (mpc_CC+1)[bands[i].res[ch]] * mpc_SCF[bands[i].scf_idx[ch][2] & 0xFF];
                 for(; j < 36; j++)
                     c->sb_samples[ch][j][i] = mul * c->Q[ch][j + off];
             }
diff --git a/libavcodec/mpc.h b/libavcodec/mpc.h
index cdf49c1a4e..4cb8574897 100644
--- a/libavcodec/mpc.h
+++ b/libavcodec/mpc.h
@@ -2,20 +2,20 @@
  * Musepack decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpc7.c b/libavcodec/mpc7.c
index 28b51922ec..d38b22a2b3 100644
--- a/libavcodec/mpc7.c
+++ b/libavcodec/mpc7.c
@@ -2,20 +2,20 @@
  * Musepack SV7 decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,10 +36,6 @@
 #include "mpc.h"
 #include "mpc7data.h"
 
-#define BANDS            32
-#define SAMPLES_PER_BAND 36
-#define MPC_FRAME_SIZE   (BANDS * SAMPLES_PER_BAND)
-
 static VLC scfi_vlc, dscf_vlc, hdr_vlc, quant_vlc[MPC7_QUANT_VLC_TABLES][2];
 
 static const uint16_t quant_offsets[MPC7_QUANT_VLC_TABLES*2 + 1] =
@@ -190,7 +186,7 @@ static int get_scale_idx(GetBitContext *gb, int ref)
     int t = get_vlc2(gb, dscf_vlc.table, MPC7_DSCF_BITS, 1) - 7;
     if (t == 8)
         return get_bits(gb, 6);
-    return av_clip_uintp2(ref + t, 7);
+    return ref + t;
 }
 
 static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
@@ -226,11 +222,9 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
     buf_size  -= 4;
 
     /* get output buffer */
-    frame->nb_samples = last_frame ? c->lastframelen : MPC_FRAME_SIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    frame->nb_samples = MPC_FRAME_SIZE;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     av_fast_padded_malloc(&c->bits, &c->buf_size, buf_size);
     if (!c->bits)
@@ -246,7 +240,11 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
             int t = 4;
             if(i) t = get_vlc2(&gb, hdr_vlc.table, MPC7_HDR_BITS, 1) - 5;
             if(t == 4) bands[i].res[ch] = get_bits(&gb, 4);
-            else bands[i].res[ch] = av_clip(bands[i-1].res[ch] + t, 0, 17);
+            else bands[i].res[ch] = bands[i-1].res[ch] + t;
+            if (bands[i].res[ch] < -1 || bands[i].res[ch] > 17) {
+                av_log(avctx, AV_LOG_ERROR, "subband index invalid\n");
+                return AVERROR_INVALIDDATA;
+            }
         }
 
         if(bands[i].res[0] || bands[i].res[1]){
@@ -293,6 +291,8 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
             idx_to_quant(c, &gb, bands[i].res[ch], c->Q[ch] + off);
 
     ff_mpc_dequantize_and_synth(c, mb, (int16_t **)frame->extended_data, 2);
+    if(last_frame)
+        frame->nb_samples = c->lastframelen;
 
     bits_used = get_bits_count(&gb);
     bits_avail = buf_size * 8;
diff --git a/libavcodec/mpc7data.h b/libavcodec/mpc7data.h
index f205ffe97f..5609e8fbf3 100644
--- a/libavcodec/mpc7data.h
+++ b/libavcodec/mpc7data.h
@@ -2,20 +2,20 @@
  * Musepack decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpc8.c b/libavcodec/mpc8.c
index 84dbb61f7d..a8feb6c4ce 100644
--- a/libavcodec/mpc8.c
+++ b/libavcodec/mpc8.c
@@ -2,20 +2,20 @@
  * Musepack SV8 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -126,6 +126,10 @@ static av_cold int mpc8_decode_init(AVCodecContext * avctx)
 
     skip_bits(&gb, 3);//sample rate
     c->maxbands = get_bits(&gb, 5) + 1;
+    if (c->maxbands >= BANDS) {
+        av_log(avctx,AV_LOG_ERROR, "maxbands %d too high\n", c->maxbands);
+        return AVERROR_INVALIDDATA;
+    }
     channels = get_bits(&gb, 4) + 1;
     if (channels > 2) {
         avpriv_request_sample(avctx, "Multichannel MPC SV8");
@@ -135,7 +139,8 @@ static av_cold int mpc8_decode_init(AVCodecContext * avctx)
     c->frames = 1 << (get_bits(&gb, 3) * 2);
 
     avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
-    avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
+    avctx->channel_layout = (channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
+    avctx->channels = channels;
 
     if(vlc_initialized) return 0;
     av_log(avctx, AV_LOG_DEBUG, "Initing VLC\n");
@@ -247,10 +252,8 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = MPC_FRAME_SIZE;
-    if ((res = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
-    }
 
     keyframe = c->cur_frame == 0;
 
@@ -267,8 +270,11 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
         maxband = c->last_max_band + get_vlc2(gb, band_vlc.table, MPC8_BANDS_BITS, 2);
         if(maxband > 32) maxband -= 33;
     }
-    if(maxband > c->maxbands + 1)
+
+    if(maxband > c->maxbands + 1) {
+        av_log(avctx, AV_LOG_ERROR, "maxband %d too large\n",maxband);
         return AVERROR_INVALIDDATA;
+    }
     c->last_max_band = maxband;
 
     /* read subband indexes */
diff --git a/libavcodec/mpc8data.h b/libavcodec/mpc8data.h
index 2940b30733..22c2be43bf 100644
--- a/libavcodec/mpc8data.h
+++ b/libavcodec/mpc8data.h
@@ -2,20 +2,20 @@
  * Musepack SV8 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpc8huff.h b/libavcodec/mpc8huff.h
index 6005e214e8..8491037aa4 100644
--- a/libavcodec/mpc8huff.h
+++ b/libavcodec/mpc8huff.h
@@ -2,20 +2,20 @@
  * Musepack SV8 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpcdata.h b/libavcodec/mpcdata.h
index 15724f3b74..64fb4ab326 100644
--- a/libavcodec/mpcdata.h
+++ b/libavcodec/mpcdata.h
@@ -2,20 +2,20 @@
  * Musepack decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,9 +30,7 @@ static const float mpc_CC[18+1] = {
     4.0002, 2.0001, 1.0000
 };
 
-static const float mpc_SCF[128+6] = {
-    920.016296386718750000, 766.355773925781250000, 638.359558105468750000,
-    531.741149902343750000, 442.930114746093750000, 368.952209472656250000,
+static const float mpc_SCF[256] = {
     307.330047607421875000, 255.999984741210937500, 213.243041992187500000, 177.627334594726562500,
     147.960128784179687500, 123.247924804687500000, 102.663139343261718750, 85.516410827636718750,
     71.233520507812500000, 59.336143493652343750, 49.425861358642578125, 41.170787811279296875,
@@ -64,7 +62,39 @@ static const float mpc_SCF[128+6] = {
     0.000000396931966407, 0.000000330636652279, 0.000000275413924555, 0.000000229414467867,
     0.000000191097811353, 0.000000159180785886, 0.000000132594522029, 0.000000110448674207,
     0.000000092001613439, 0.000000076635565449, 0.000000063835940978, 0.000000053174105119,
-    0.000000044293003043, 0.000000036895215771, 0.000000030733001921, 0.000000025599996789
+    0.000000044293003043, 0.000000036895215771, 0.000000030733001921, 0.000000025599996789,
+    0.000000021324305018, 3689522167600.270019531250000000, 3073300627835.926757812500000000, 2560000000000.002929687500000000,
+    2132430501800.519042968750000000, 1776273376956.721923828125000000, 1479601378343.250244140625000000, 1232479339720.794189453125000000,
+    1026631459710.774291992187500000, 855164155779.391845703125000000, 712335206965.024780273437500000, 593361454233.829101562500000000,
+    494258618594.112609863281250000, 411707872682.763122558593750000, 342944697476.612365722656250000, 285666302081.983886718750000000,
+    237954506209.446411132812500000, 198211502766.368713378906250000, 165106349338.563323974609375000, 137530396629.095306396484375000,
+    114560161209.611633300781250000, 95426399240.062576293945312500, 79488345475.196502685546875000, 66212254855.064872741699218750,
+    55153528064.816276550292968750, 45941822471.611343383789062500, 38268649822.956413269042968750, 31877045369.216873168945312500,
+    26552962442.420688629150390625, 22118104306.789615631103515625, 18423953228.829509735107421875, 15346796808.164905548095703125,
+    12783585007.291271209716796875, 10648479137.463939666748046875, 8869977230.669750213623046875, 7388519530.061036109924316406,
+    6154493909.785535812377929688, 5126574428.270387649536132812, 4270337375.232155323028564453, 3557108465.595236301422119141,
+    2963002574.315670013427734375, 2468123854.056322574615478516, 2055899448.676229715347290039, 1712524489.450022459030151367,
+    1426499787.649837732315063477, 1188246741.404872417449951172, 989786560.561257958412170410, 824473067.192597866058349609,
+    686770123.591610312461853027, 572066234.090648531913757324, 476520111.962911486625671387, 396932039.637152194976806641,
+    330636714.243810534477233887, 275413990.026798009872436523, 229414528.498330980539321899, 191097866.455478429794311523,
+    159180827.835415601730346680, 132594551.788319095969200134, 110448697.892960876226425171, 92001629.793398514389991760,
+    76635578.744844585657119751, 63835955.327594503760337830, 53174116.504741288721561432, 44293010.914454914629459381,
+    36895221.676002673804759979, 30733006.278359245508909225, 25600000.000000011175870895, 21324305.018005173653364182,
+    17762733.769567202776670456, 14796013.783432489261031151, 12324793.397207930684089661, 10266314.597107734531164169,
+    8551641.557793911546468735, 7123352.069650243036448956, 5933614.542338287457823753, 4942586.185941123403608799,
+    4117078.726827629376202822, 3429446.974766122177243233, 2856663.020819837693125010, 2379545.062094463035464287,
+    1982115.027663686312735081, 1651063.493385632522404194, 1375303.966290952404960990, 1145601.612096115713939071,
+    954263.992400625254958868, 794883.454751964658498764, 662122.548550648498348892, 551535.280648162588477135,
+    459418.224716113239992410, 382686.498229563992936164, 318770.453692168579436839, 265529.624424206791445613,
+    221181.043067896069260314, 184239.532288295013131574, 153467.968081648985389620, 127835.850072912653558888,
+    106484.791374639346031472, 88699.772306697457679547, 73885.195300610314006917, 61544.939097855312866159,
+    51265.744282703839417081, 42703.373752321524079889, 35571.084655952341563534, 29630.025743156678800005,
+    24681.238540563208516687, 20558.994486762283486314, 17125.244894500214286381, 14264.997876498367986642,
+    11882.467414048716818797, 9897.865605612574654515, 8244.730671925974093028, 6867.701235916098994494,
+    5720.662340906482313585, 4765.201119629112326948, 3969.320396371519564127, 3306.367142438103201130,
+    2754.139900267978191550, 2294.145284983308101801, 1910.978664554782881169, 1591.808278354154936096,
+    1325.945517883190177599, 1104.486978929608085309, 920.016297933984674273, 766.355787448445425980,
+    638.359553275944676898, 531.741165047412550848, 442.930109144548907807, 368.952216760026544762,
 };
 
 #endif /* AVCODEC_MPCDATA_H */
diff --git a/libavcodec/mpeg12.c b/libavcodec/mpeg12.c
index 69c6d0a09a..e8a4048222 100644
--- a/libavcodec/mpeg12.c
+++ b/libavcodec/mpeg12.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,12 @@
  * MPEG-1/2 decoder
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/timecode.h"
+
 #include "internal.h"
 #include "avcodec.h"
 #include "mpegvideo.h"
@@ -34,6 +39,7 @@
 #include "mpeg12data.h"
 #include "mpegvideodata.h"
 #include "bytestream.h"
+#include "vdpau_internal.h"
 #include "thread.h"
 
 uint8_t ff_mpeg12_static_rl_table_store[2][2][2*MAX_RUN + MAX_LEVEL + 3];
@@ -65,21 +71,21 @@ static const uint8_t table_mb_btype[11][2] = {
 #define INIT_2D_VLC_RL(rl, static_size)\
 {\
     static RL_VLC_ELEM rl_vlc_table[static_size];\
-    INIT_VLC_STATIC(&rl.vlc, TEX_VLC_BITS, rl.n + 2,\
-                    &rl.table_vlc[0][1], 4, 2,\
-                    &rl.table_vlc[0][0], 4, 2, static_size);\
-\
     rl.rl_vlc[0] = rl_vlc_table;\
-    init_2d_vlc_rl(&rl);\
+    init_2d_vlc_rl(&rl, static_size);\
 }
 
-static av_cold void init_2d_vlc_rl(RLTable *rl)
+static av_cold void init_2d_vlc_rl(RLTable *rl, unsigned static_size)
 {
     int i;
-
-    for (i = 0; i < rl->vlc.table_size; i++) {
-        int code = rl->vlc.table[i][0];
-        int len  = rl->vlc.table[i][1];
+    VLC_TYPE table[680][2] = {{0}};
+    VLC vlc = { .table = table, .table_allocated = static_size };
+    av_assert0(static_size <= FF_ARRAY_ELEMS(table));
+    init_vlc(&vlc, TEX_VLC_BITS, rl->n + 2, &rl->table_vlc[0][1], 4, 2, &rl->table_vlc[0][0], 4, 2, INIT_VLC_USE_NEW_STATIC);
+
+    for (i = 0; i < vlc.table_size; i++) {
+        int code = vlc.table[i][0];
+        int len  = vlc.table[i][1];
         int level, run;
 
         if (len == 0) { // illegal code
@@ -195,7 +201,7 @@ int ff_mpeg1_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size,
 */
 
     for (i = 0; i < buf_size; i++) {
-        assert(pc->frame_start_found >= 0 && pc->frame_start_found <= 4);
+        av_assert1(pc->frame_start_found >= 0 && pc->frame_start_found <= 4);
         if (pc->frame_start_found & 1) {
             if (state == EXT_START_CODE && (buf[i] & 0xF0) != 0x80)
                 pc->frame_start_found--;
@@ -229,10 +235,11 @@ int ff_mpeg1_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size,
                 }
             }
             if (pc->frame_start_found == 0 && s && state == PICTURE_START_CODE) {
-                ff_fetch_timestamp(s, i - 3, 1);
+                ff_fetch_timestamp(s, i - 3, 1, i > 3);
             }
         }
     }
     pc->state = state;
     return END_NOT_FOUND;
 }
+
diff --git a/libavcodec/mpeg12.h b/libavcodec/mpeg12.h
index fda1a533db..16ca195a47 100644
--- a/libavcodec/mpeg12.h
+++ b/libavcodec/mpeg12.h
@@ -2,20 +2,20 @@
  * MPEG1/2 common code
  * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg12data.c b/libavcodec/mpeg12data.c
index ccc3d2deb9..e8c4a5d3d1 100644
--- a/libavcodec/mpeg12data.c
+++ b/libavcodec/mpeg12data.c
@@ -3,20 +3,20 @@
  * copyright (c) 2000,2001 Fabrice Bellard
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 
 #include "mpeg12data.h"
 
-const uint16_t ff_mpeg1_default_intra_matrix[64] = {
+const uint16_t ff_mpeg1_default_intra_matrix[256] = {
         8, 16, 19, 22, 26, 27, 29, 34,
         16, 16, 22, 24, 27, 29, 34, 37,
         19, 22, 26, 27, 29, 34, 34, 38,
@@ -325,6 +325,72 @@ const AVRational ff_mpeg12_frame_rate_tab[16] = {
     {    0,    0},
 };
 
+const AVRational ff_mpeg2_frame_rate_tab[] = {
+    {      1,     1},
+    {      2,     1},
+    {      3,     1},
+    {      4,     1},
+    {      5,     1},
+    {      6,     1},
+    {      8,     1},
+    {      9,     1},
+    {     10,     1},
+    {     12,     1},
+    {     15,     1},
+    {     16,     1},
+    {     18,     1},
+    {     20,     1},
+    {     24,     1},
+    {     25,     1},
+    {     30,     1},
+    {     32,     1},
+    {     36,     1},
+    {     40,     1},
+    {     45,     1},
+    {     48,     1},
+    {     50,     1},
+    {     60,     1},
+    {     72,     1},
+    {     75,     1},
+    {     80,     1},
+    {     90,     1},
+    {     96,     1},
+    {    100,     1},
+    {    120,     1},
+    {    150,     1},
+    {    180,     1},
+    {    200,     1},
+    {    240,     1},
+    {    750,  1001},
+    {    800,  1001},
+    {    960,  1001},
+    {   1000,  1001},
+    {   1200,  1001},
+    {   1250,  1001},
+    {   1500,  1001},
+    {   1600,  1001},
+    {   1875,  1001},
+    {   2000,  1001},
+    {   2400,  1001},
+    {   2500,  1001},
+    {   3000,  1001},
+    {   3750,  1001},
+    {   4000,  1001},
+    {   4800,  1001},
+    {   5000,  1001},
+    {   6000,  1001},
+    {   7500,  1001},
+    {   8000,  1001},
+    {  10000,  1001},
+    {  12000,  1001},
+    {  15000,  1001},
+    {  20000,  1001},
+    {  24000,  1001},
+    {  30000,  1001},
+    {  60000,  1001},
+    {      0,     0},
+};
+
 const float ff_mpeg1_aspect[16]={
     0.0000,
     1.0000,
diff --git a/libavcodec/mpeg12data.h b/libavcodec/mpeg12data.h
index 633a2915e6..65b94855a8 100644
--- a/libavcodec/mpeg12data.h
+++ b/libavcodec/mpeg12data.h
@@ -3,20 +3,20 @@
  * copyright (c) 2000,2001 Fabrice Bellard
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +32,7 @@
 #include "libavutil/rational.h"
 #include "rl.h"
 
-extern const uint16_t ff_mpeg1_default_intra_matrix[64];
+extern const uint16_t ff_mpeg1_default_intra_matrix[];
 extern const uint16_t ff_mpeg1_default_non_intra_matrix[64];
 
 extern const uint16_t ff_mpeg12_vlc_dc_lum_code[12];
@@ -49,6 +49,7 @@ extern const uint8_t ff_mpeg12_mbPatTable[64][2];
 extern const uint8_t ff_mpeg12_mbMotionVectorTable[17][2];
 
 extern const AVRational ff_mpeg12_frame_rate_tab[];
+extern const AVRational ff_mpeg2_frame_rate_tab[];
 
 extern const float ff_mpeg1_aspect[16];
 extern const AVRational ff_mpeg2_aspect[16];
diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c
index be20414c12..21a347001e 100644
--- a/libavcodec/mpeg12dec.c
+++ b/libavcodec/mpeg12dec.c
@@ -1,22 +1,22 @@
 /*
  * MPEG-1/2 decoder
  * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2013 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,11 @@
  * MPEG-1/2 decoder
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
 #include <inttypes.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 #include "libavutil/stereo3d.h"
 
@@ -44,6 +46,7 @@
 #include "mpegvideodata.h"
 #include "thread.h"
 #include "version.h"
+#include "vdpau_compat.h"
 #include "xvmc_internal.h"
 
 typedef struct Mpeg1Context {
@@ -58,11 +61,11 @@ typedef struct Mpeg1Context {
     uint8_t afd;
     int has_afd;
     int slice_count;
-    int save_aspect_info;
+    AVRational save_aspect;
     int save_width, save_height, save_progressive_seq;
     AVRational frame_rate_ext;  /* MPEG-2 specific framerate modificator */
     int sync;                   /* Did we reach a sync point like a GOP/SEQ/KEYFrame? */
-    int closed_gop;             /* GOP is closed */
+    int tmpgexs;
     int first_slice;
     int extradata_decoded;
 } Mpeg1Context;
@@ -93,13 +96,6 @@ static const uint32_t btype2mb_type[11] = {
     MB_TYPE_QUANT | MB_TYPE_L0L1 | MB_TYPE_CBP,
 };
 
-static const uint8_t non_linear_qscale[32] = {
-     0,  1,  2,  3,  4,  5,   6,   7,
-     8, 10, 12, 14, 16, 18,  20,  22,
-    24, 28, 32, 36, 40, 44,  48,  52,
-    56, 64, 72, 80, 88, 96, 104, 112,
-};
-
 /* as H.263, but only 17 codes */
 static int mpeg_decode_motion(MpegEncContext *s, int fcode, int pred)
 {
@@ -155,19 +151,20 @@ static inline int mpeg1_decode_block_intra(MpegEncContext *s,
     dc += diff;
     s->last_dc[component] = dc;
     block[0] = dc * quant_matrix[0];
-    ff_dlog(s->avctx, "dc=%d diff=%d\n", dc, diff);
+    ff_tlog(s->avctx, "dc=%d diff=%d\n", dc, diff);
     i = 0;
     {
         OPEN_READER(re, &s->gb);
+        UPDATE_CACHE(re, &s->gb);
+        if (((int32_t)GET_CACHE(re, &s->gb)) <= (int32_t)0xBFFFFFFF)
+            goto end;
+
         /* now quantify & encode AC coefficients */
         for (;;) {
-            UPDATE_CACHE(re, &s->gb);
             GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0],
                        TEX_VLC_BITS, 2, 0);
 
-            if (level == 127) {
-                break;
-            } else if (level != 0) {
+            if (level != 0) {
                 i += run;
                 check_scantable_index(s, i);
                 j = scantable[i];
@@ -175,7 +172,7 @@ static inline int mpeg1_decode_block_intra(MpegEncContext *s,
                 level = (level - 1) | 1;
                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) -
                         SHOW_SBITS(re, &s->gb, 1);
-                LAST_SKIP_BITS(re, &s->gb, 1);
+                SKIP_BITS(re, &s->gb, 1);
             } else {
                 /* escape */
                 run = SHOW_UBITS(re, &s->gb, 6) + 1;
@@ -185,10 +182,10 @@ static inline int mpeg1_decode_block_intra(MpegEncContext *s,
                 SKIP_BITS(re, &s->gb, 8);
                 if (level == -128) {
                     level = SHOW_UBITS(re, &s->gb, 8) - 256;
-                    LAST_SKIP_BITS(re, &s->gb, 8);
+                    SKIP_BITS(re, &s->gb, 8);
                 } else if (level == 0) {
                     level = SHOW_UBITS(re, &s->gb, 8);
-                    LAST_SKIP_BITS(re, &s->gb, 8);
+                    SKIP_BITS(re, &s->gb, 8);
                 }
                 i += run;
                 check_scantable_index(s, i);
@@ -205,7 +202,13 @@ static inline int mpeg1_decode_block_intra(MpegEncContext *s,
             }
 
             block[j] = level;
+            if (((int32_t)GET_CACHE(re, &s->gb)) <= (int32_t)0xBFFFFFFF)
+               break;
+
+            UPDATE_CACHE(re, &s->gb);
         }
+end:
+        LAST_SKIP_BITS(re, &s->gb, 2);
         CLOSE_READER(re, &s->gb);
     }
     s->block_last_index[n] = i;
@@ -297,6 +300,11 @@ end:
     return 0;
 }
 
+/**
+ * Note: this function can read out of range and crash for corrupt streams.
+ * Changing this would eat up any speed benefits it has.
+ * Do not use "fast" flag if you need the code to be robust.
+ */
 static inline int mpeg1_fast_decode_block_inter(MpegEncContext *s,
                                                 int16_t *block, int n)
 {
@@ -459,6 +467,11 @@ end:
     return 0;
 }
 
+/**
+ * Note: this function can read out of range and crash for corrupt streams.
+ * Changing this would eat up any speed benefits it has.
+ * Do not use "fast" flag if you need the code to be robust.
+ */
 static inline int mpeg2_fast_decode_block_non_intra(MpegEncContext *s,
                                                     int16_t *block, int n)
 {
@@ -488,7 +501,6 @@ static inline int mpeg2_fast_decode_block_non_intra(MpegEncContext *s,
 
         if (level != 0) {
             i += run;
-            check_scantable_index(s, i);
             j = scantable[i];
             level = ((level * 2 + 1) * qscale) >> 1;
             level = (level ^ SHOW_SBITS(re, &s->gb, 1)) -
@@ -503,7 +515,6 @@ static inline int mpeg2_fast_decode_block_non_intra(MpegEncContext *s,
             SKIP_BITS(re, &s->gb, 12);
 
             i += run;
-            check_scantable_index(s, i);
             j = scantable[i];
             if (level < 0) {
                 level = ((-level * 2 + 1) * qscale) >> 1;
@@ -514,8 +525,9 @@ static inline int mpeg2_fast_decode_block_non_intra(MpegEncContext *s,
         }
 
         block[j] = level;
-        if (((int32_t) GET_CACHE(re, &s->gb)) <= (int32_t) 0xBFFFFFFF)
+        if (((int32_t) GET_CACHE(re, &s->gb)) <= (int32_t) 0xBFFFFFFF || i > 63)
             break;
+
         UPDATE_CACHE(re, &s->gb);
     }
 end:
@@ -551,7 +563,7 @@ static inline int mpeg2_decode_block_intra(MpegEncContext *s,
     dc += diff;
     s->last_dc[component] = dc;
     block[0] = dc << (3 - s->intra_dc_precision);
-    ff_dlog(s->avctx, "dc=%d\n", block[0]);
+    ff_tlog(s->avctx, "dc=%d\n", block[0]);
     mismatch = block[0] ^ 1;
     i = 0;
     if (s->intra_vlc_format)
@@ -606,6 +618,11 @@ static inline int mpeg2_decode_block_intra(MpegEncContext *s,
     return 0;
 }
 
+/**
+ * Note: this function can read out of range and crash for corrupt streams.
+ * Changing this would eat up any speed benefits it has.
+ * Do not use "fast" flag if you need the code to be robust.
+ */
 static inline int mpeg2_fast_decode_block_intra(MpegEncContext *s,
                                                 int16_t *block, int n)
 {
@@ -645,11 +662,10 @@ static inline int mpeg2_fast_decode_block_intra(MpegEncContext *s,
             GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0],
                        TEX_VLC_BITS, 2, 0);
 
-            if (level == 127) {
+            if (level >= 64 || i > 63) {
                 break;
             } else if (level != 0) {
                 i += run;
-                check_scantable_index(s, i);
                 j = scantable[i];
                 level = (level * qscale * quant_matrix[j]) >> 4;
                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) -
@@ -663,7 +679,6 @@ static inline int mpeg2_fast_decode_block_intra(MpegEncContext *s,
                 level = SHOW_SBITS(re, &s->gb, 12);
                 SKIP_BITS(re, &s->gb, 12);
                 i += run;
-                check_scantable_index(s, i);
                 j = scantable[i];
                 if (level < 0) {
                     level = (-level * qscale * quant_matrix[j]) >> 4;
@@ -697,11 +712,12 @@ static inline int get_qscale(MpegEncContext *s)
 {
     int qscale = get_bits(&s->gb, 5);
     if (s->q_scale_type)
-        return non_linear_qscale[qscale];
+        return ff_mpeg2_non_linear_qscale[qscale];
     else
         return qscale << 1;
 }
 
+
 /* motion type (for MPEG-2) */
 #define MT_FIELD 1
 #define MT_FRAME 2
@@ -714,9 +730,9 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
     const int mb_block_count = 4 + (1 << s->chroma_format);
     int ret;
 
-    ff_dlog(s->avctx, "decode_mb: x=%d y=%d\n", s->mb_x, s->mb_y);
+    ff_tlog(s->avctx, "decode_mb: x=%d y=%d\n", s->mb_x, s->mb_y);
 
-    assert(s->mb_skipped == 0);
+    av_assert2(s->mb_skipped == 0);
 
     if (s->mb_skip_run-- != 0) {
         if (s->pict_type == AV_PICTURE_TYPE_P) {
@@ -731,11 +747,12 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
             else
                 // FIXME not sure if this is allowed in MPEG at all
                 mb_type = s->current_picture.mb_type[s->mb_width + (s->mb_y - 1) * s->mb_stride - 1];
-            if (IS_INTRA(mb_type))
+            if (IS_INTRA(mb_type)) {
+                av_log(s->avctx, AV_LOG_ERROR, "skip with previntra\n");
                 return AVERROR_INVALIDDATA;
+            }
             s->current_picture.mb_type[s->mb_x + s->mb_y * s->mb_stride] =
                 mb_type | MB_TYPE_SKIP;
-//            assert(s->current_picture.mb_type[s->mb_x + s->mb_y * s->mb_stride - 1] & (MB_TYPE_16x16 | MB_TYPE_16x8));
 
             if ((s->mv[0][0][0] | s->mv[0][0][1] | s->mv[1][0][0] | s->mv[1][0][1]) == 0)
                 s->mb_skipped = 1;
@@ -778,7 +795,7 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
         mb_type = btype2mb_type[mb_type];
         break;
     }
-    ff_dlog(s->avctx, "mb_type=%x\n", mb_type);
+    ff_tlog(s->avctx, "mb_type=%x\n", mb_type);
 //    motion_type = 0; /* avoid warning */
     if (IS_INTRA(mb_type)) {
         s->bdsp.clear_blocks(s->block[0]);
@@ -809,19 +826,15 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
             s->last_mv[0][1][1] = mpeg_decode_motion(s, s->mpeg_f_code[0][1],
                                                      s->last_mv[0][0][1]);
 
-            skip_bits1(&s->gb); /* marker */
+            check_marker(&s->gb, "after concealment_motion_vectors");
         } else {
             /* reset mv prediction */
             memset(s->last_mv, 0, sizeof(s->last_mv));
         }
         s->mb_intra = 1;
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
         // if 1, we memcpy blocks in xvmcvideo
-        if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration > 1)
+        if ((CONFIG_MPEG1_XVMC_HWACCEL || CONFIG_MPEG2_XVMC_HWACCEL) && s->pack_pblocks)
             ff_xvmc_pack_pblocks(s, -1); // inter are always full blocks
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
 
         if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
             if (s->avctx->flags2 & AV_CODEC_FLAG2_FAST) {
@@ -839,11 +852,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
         }
     } else {
         if (mb_type & MB_TYPE_ZERO_MV) {
-            assert(mb_type & MB_TYPE_CBP);
+            av_assert2(mb_type & MB_TYPE_CBP);
 
             s->mv_dir = MV_DIR_FORWARD;
             if (s->picture_structure == PICT_FRAME) {
-                if (!s->frame_pred_frame_dct)
+                if (s->picture_structure == PICT_FRAME
+                    && !s->frame_pred_frame_dct)
                     s->interlaced_dct = get_bits1(&s->gb);
                 s->mv_type = MV_TYPE_16X16;
             } else {
@@ -862,10 +876,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
             s->mv[0][0][0]      = 0;
             s->mv[0][0][1]      = 0;
         } else {
-            assert(mb_type & MB_TYPE_L0L1);
+            av_assert2(mb_type & MB_TYPE_L0L1);
             // FIXME decide if MBs in field pictures are MB_TYPE_INTERLACED
             /* get additional motion vector type */
-            if (s->frame_pred_frame_dct) {
+            if (s->picture_structure == PICT_FRAME && s->frame_pred_frame_dct) {
                 motion_type = MT_FRAME;
             } else {
                 motion_type = get_bits(&s->gb, 2);
@@ -878,7 +892,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             /* motion vectors */
             s->mv_dir = (mb_type >> 13) & 3;
-            ff_dlog(s->avctx, "motion_type=%d\n", motion_type);
+            ff_tlog(s->avctx, "motion_type=%d\n", motion_type);
             switch (motion_type) {
             case MT_FRAME: /* or MT_16X8 */
                 if (s->picture_structure == PICT_FRAME) {
@@ -935,16 +949,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
                                                          s->last_mv[i][j][0]);
                                 s->last_mv[i][j][0] = val;
                                 s->mv[i][j][0]      = val;
-                                ff_dlog(s->avctx, "fmx=%d\n", val);
+                                ff_tlog(s->avctx, "fmx=%d\n", val);
                                 val = mpeg_decode_motion(s, s->mpeg_f_code[i][1],
                                                          s->last_mv[i][j][1] >> 1);
-                                s->last_mv[i][j][1] = val << 1;
+                                s->last_mv[i][j][1] = 2 * val;
                                 s->mv[i][j][1]      = val;
-                                ff_dlog(s->avctx, "fmy=%d\n", val);
+                                ff_tlog(s->avctx, "fmy=%d\n", val);
                             }
                         }
                     }
                 } else {
+                    av_assert0(!s->progressive_sequence);
                     mb_type |= MB_TYPE_16x16 | MB_TYPE_INTERLACED;
                     for (i = 0; i < 2; i++) {
                         if (USES_LIST(mb_type, i)) {
@@ -961,6 +976,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 }
                 break;
             case MT_DMV:
+                if (s->progressive_sequence){
+                    av_log(s->avctx, AV_LOG_ERROR, "MT_DMV in progressive_sequence\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 s->mv_type = MV_TYPE_DMV;
                 for (i = 0; i < 2; i++) {
                     if (USES_LIST(mb_type, i)) {
@@ -1029,17 +1048,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
             }
             if (cbp <= 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
-                       "invalid cbp at %d %d\n", s->mb_x, s->mb_y);
+                       "invalid cbp %d at %d %d\n", cbp, s->mb_x, s->mb_y);
                 return AVERROR_INVALIDDATA;
             }
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
             // if 1, we memcpy blocks in xvmcvideo
-            if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration > 1)
+            if ((CONFIG_MPEG1_XVMC_HWACCEL || CONFIG_MPEG2_XVMC_HWACCEL) && s->pack_pblocks)
                 ff_xvmc_pack_pblocks(s, cbp);
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
 
             if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
                 if (s->avctx->flags2 & AV_CODEC_FLAG2_FAST) {
@@ -1102,6 +1117,11 @@ static av_cold int mpeg_decode_init(AVCodecContext *avctx)
 
     ff_mpv_decode_defaults(s2);
 
+    if (   avctx->codec_tag != AV_RL32("VCR2")
+        && avctx->codec_tag != AV_RL32("BW10"))
+        avctx->coded_width = avctx->coded_height = 0; // do not trust dimensions from input
+    ff_mpv_decode_init(s2, avctx);
+
     s->mpeg_enc_ctx.avctx  = avctx;
 
     /* we need some permutation to store matrices,
@@ -1110,18 +1130,16 @@ static av_cold int mpeg_decode_init(AVCodecContext *avctx)
     ff_mpeg12_common_init(&s->mpeg_enc_ctx);
     ff_mpeg12_init_vlcs();
 
+    s2->chroma_format              = 1;
     s->mpeg_enc_ctx_allocated      = 0;
     s->mpeg_enc_ctx.picture_number = 0;
     s->repeat_field                = 0;
     s->mpeg_enc_ctx.codec_id       = avctx->codec->id;
     avctx->color_range             = AVCOL_RANGE_MPEG;
-    if (avctx->codec->id == AV_CODEC_ID_MPEG1VIDEO)
-        avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
-    else
-        avctx->chroma_sample_location = AVCHROMA_LOC_LEFT;
     return 0;
 }
 
+#if HAVE_THREADS
 static int mpeg_decode_update_thread_context(AVCodecContext *avctx,
                                              const AVCodecContext *avctx_from)
 {
@@ -1138,17 +1156,15 @@ static int mpeg_decode_update_thread_context(AVCodecContext *avctx,
     if (err)
         return err;
 
-    if (!ctx->mpeg_enc_ctx_allocated) {
-        // copy the whole context after the initial MpegEncContext structure
-        memcpy(ctx, ctx_from, sizeof(*ctx));
-        memset(&ctx->mpeg_enc_ctx, 0, sizeof(ctx->mpeg_enc_ctx));
-    }
+    if (!ctx->mpeg_enc_ctx_allocated)
+        memcpy(s + 1, s1 + 1, sizeof(Mpeg1Context) - sizeof(MpegEncContext));
 
     if (!(s->pict_type == AV_PICTURE_TYPE_B || s->low_delay))
         s->picture_number++;
 
     return 0;
 }
+#endif
 
 static void quant_matrix_rebuild(uint16_t *matrix, const uint8_t *old_perm,
                                  const uint8_t *new_perm)
@@ -1162,15 +1178,30 @@ static void quant_matrix_rebuild(uint16_t *matrix, const uint8_t *old_perm,
         matrix[new_perm[i]] = temp_matrix[old_perm[i]];
 }
 
-#if FF_API_XVMC
-static const enum AVPixelFormat pixfmt_xvmc_mpg2_420[] = {
-    AV_PIX_FMT_XVMC_MPEG2_IDCT,
-    AV_PIX_FMT_XVMC_MPEG2_MC,
+static const enum AVPixelFormat mpeg1_hwaccel_pixfmt_list_420[] = {
+#if CONFIG_MPEG1_XVMC_HWACCEL
+    AV_PIX_FMT_XVMC,
+#endif
+#if CONFIG_MPEG1_VDPAU_DECODER && FF_API_VDPAU
+    AV_PIX_FMT_VDPAU_MPEG1,
+#endif
+#if CONFIG_MPEG1_VDPAU_HWACCEL
+    AV_PIX_FMT_VDPAU,
+#endif
+    AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_NONE
 };
-#endif /* FF_API_XVMC */
 
-static const enum AVPixelFormat mpeg12_hwaccel_pixfmt_list_420[] = {
+static const enum AVPixelFormat mpeg2_hwaccel_pixfmt_list_420[] = {
+#if CONFIG_MPEG2_XVMC_HWACCEL
+    AV_PIX_FMT_XVMC,
+#endif
+#if CONFIG_MPEG_VDPAU_DECODER && FF_API_VDPAU
+    AV_PIX_FMT_VDPAU_MPEG2,
+#endif
+#if CONFIG_MPEG2_VDPAU_HWACCEL
+    AV_PIX_FMT_VDPAU,
+#endif
 #if CONFIG_MPEG2_DXVA2_HWACCEL
     AV_PIX_FMT_DXVA2_VLD,
 #endif
@@ -1178,10 +1209,10 @@ static const enum AVPixelFormat mpeg12_hwaccel_pixfmt_list_420[] = {
     AV_PIX_FMT_D3D11VA_VLD,
 #endif
 #if CONFIG_MPEG2_VAAPI_HWACCEL
-    AV_PIX_FMT_VAAPI_VLD,
+    AV_PIX_FMT_VAAPI,
 #endif
-#if CONFIG_MPEG1_VDPAU_HWACCEL | CONFIG_MPEG2_VDPAU_HWACCEL
-    AV_PIX_FMT_VDPAU,
+#if CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL
+    AV_PIX_FMT_VIDEOTOOLBOX,
 #endif
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_NONE
@@ -1197,27 +1228,55 @@ static const enum AVPixelFormat mpeg12_pixfmt_list_444[] = {
     AV_PIX_FMT_NONE
 };
 
+#if FF_API_VDPAU
+static inline int uses_vdpau(AVCodecContext *avctx) {
+    return avctx->pix_fmt == AV_PIX_FMT_VDPAU_MPEG1 || avctx->pix_fmt == AV_PIX_FMT_VDPAU_MPEG2;
+}
+#endif
+
 static enum AVPixelFormat mpeg_get_pixelformat(AVCodecContext *avctx)
 {
     Mpeg1Context *s1  = avctx->priv_data;
     MpegEncContext *s = &s1->mpeg_enc_ctx;
     const enum AVPixelFormat *pix_fmts;
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    if (avctx->xvmc_acceleration)
-        return ff_get_format(avctx, pixfmt_xvmc_mpg2_420);
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
+    if (CONFIG_GRAY && (avctx->flags & AV_CODEC_FLAG_GRAY))
+        return AV_PIX_FMT_GRAY8;
 
     if (s->chroma_format < 2)
-        pix_fmts = mpeg12_hwaccel_pixfmt_list_420;
+        pix_fmts = avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO ?
+                                mpeg1_hwaccel_pixfmt_list_420 :
+                                mpeg2_hwaccel_pixfmt_list_420;
     else if (s->chroma_format == 2)
         pix_fmts = mpeg12_pixfmt_list_422;
     else
         pix_fmts = mpeg12_pixfmt_list_444;
 
-    return ff_get_format(avctx, pix_fmts);
+    return ff_thread_get_format(avctx, pix_fmts);
+}
+
+static void setup_hwaccel_for_pixfmt(AVCodecContext *avctx)
+{
+    // until then pix_fmt may be changed right after codec init
+    if (avctx->hwaccel
+#if FF_API_VDPAU
+        || uses_vdpau(avctx)
+#endif
+        )
+        if (avctx->idct_algo == FF_IDCT_AUTO)
+            avctx->idct_algo = FF_IDCT_SIMPLE;
+
+    if (avctx->hwaccel && avctx->pix_fmt == AV_PIX_FMT_XVMC) {
+        Mpeg1Context *s1 = avctx->priv_data;
+        MpegEncContext *s = &s1->mpeg_enc_ctx;
+
+        s->pack_pblocks = 1;
+#if FF_API_XVMC
+FF_DISABLE_DEPRECATION_WARNINGS
+        avctx->xvmc_acceleration = 2;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif /* FF_API_XVMC */
+    }
 }
 
 /* Call this function when we know all parameters.
@@ -1229,27 +1288,83 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
     uint8_t old_permutation[64];
     int ret;
 
+    if (avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO) {
+        // MPEG-1 aspect
+        avctx->sample_aspect_ratio = av_d2q(1.0 / ff_mpeg1_aspect[s->aspect_ratio_info], 255);
+    } else { // MPEG-2
+        // MPEG-2 aspect
+        if (s->aspect_ratio_info > 1) {
+            AVRational dar =
+                av_mul_q(av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
+                                  (AVRational) { s1->pan_scan.width,
+                                                 s1->pan_scan.height }),
+                         (AVRational) { s->width, s->height });
+
+            /* We ignore the spec here and guess a bit as reality does not
+             * match the spec, see for example res_change_ffmpeg_aspect.ts
+             * and sequence-display-aspect.mpg.
+             * issue1613, 621, 562 */
+            if ((s1->pan_scan.width == 0) || (s1->pan_scan.height == 0) ||
+                (av_cmp_q(dar, (AVRational) { 4, 3 }) &&
+                 av_cmp_q(dar, (AVRational) { 16, 9 }))) {
+                s->avctx->sample_aspect_ratio =
+                    av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
+                             (AVRational) { s->width, s->height });
+            } else {
+                s->avctx->sample_aspect_ratio =
+                    av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
+                             (AVRational) { s1->pan_scan.width, s1->pan_scan.height });
+// issue1613 4/3 16/9 -> 16/9
+// res_change_ffmpeg_aspect.ts 4/3 225/44 ->4/3
+// widescreen-issue562.mpg 4/3 16/9 -> 16/9
+//                s->avctx->sample_aspect_ratio = av_mul_q(s->avctx->sample_aspect_ratio, (AVRational) {s->width, s->height});
+                ff_dlog(avctx, "aspect A %d/%d\n",
+                        ff_mpeg2_aspect[s->aspect_ratio_info].num,
+                        ff_mpeg2_aspect[s->aspect_ratio_info].den);
+                ff_dlog(avctx, "aspect B %d/%d\n", s->avctx->sample_aspect_ratio.num,
+                        s->avctx->sample_aspect_ratio.den);
+            }
+        } else {
+            s->avctx->sample_aspect_ratio =
+                ff_mpeg2_aspect[s->aspect_ratio_info];
+        }
+    } // MPEG-2
+
+    if (av_image_check_sar(s->width, s->height,
+                           avctx->sample_aspect_ratio) < 0) {
+        av_log(avctx, AV_LOG_WARNING, "ignoring invalid SAR: %u/%u\n",
+                avctx->sample_aspect_ratio.num,
+                avctx->sample_aspect_ratio.den);
+        avctx->sample_aspect_ratio = (AVRational){ 0, 1 };
+    }
+
     if ((s1->mpeg_enc_ctx_allocated == 0)                   ||
         avctx->coded_width       != s->width                ||
         avctx->coded_height      != s->height               ||
         s1->save_width           != s->width                ||
         s1->save_height          != s->height               ||
-        s1->save_aspect_info     != s->aspect_ratio_info    ||
-        s1->save_progressive_seq != s->progressive_sequence ||
+        av_cmp_q(s1->save_aspect, s->avctx->sample_aspect_ratio) ||
+        (s1->save_progressive_seq != s->progressive_sequence && FFALIGN(s->height, 16) != FFALIGN(s->height, 32)) ||
         0) {
         if (s1->mpeg_enc_ctx_allocated) {
             ParseContext pc = s->parse_context;
             s->parse_context.buffer = 0;
             ff_mpv_common_end(s);
             s->parse_context = pc;
+            s1->mpeg_enc_ctx_allocated = 0;
         }
 
         ret = ff_set_dimensions(avctx, s->width, s->height);
         if (ret < 0)
             return ret;
 
-        avctx->bit_rate          = s->bit_rate;
-        s1->save_aspect_info     = s->aspect_ratio_info;
+        if (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO && s->bit_rate) {
+            avctx->rc_max_rate = s->bit_rate;
+        } else if (avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO && s->bit_rate &&
+                   (s->bit_rate != 0x3FFFF*400 || s->vbv_delay != 0xFFFF)) {
+            avctx->bit_rate = s->bit_rate;
+        }
+        s1->save_aspect          = s->avctx->sample_aspect_ratio;
         s1->save_width           = s->width;
         s1->save_height          = s->height;
         s1->save_progressive_seq = s->progressive_sequence;
@@ -1261,66 +1376,28 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
         if (avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO) {
             // MPEG-1 fps
             avctx->framerate = ff_mpeg12_frame_rate_tab[s->frame_rate_index];
-            // MPEG-1 aspect
-            avctx->sample_aspect_ratio = av_d2q(1.0 / ff_mpeg1_aspect[s->aspect_ratio_info], 255);
             avctx->ticks_per_frame     = 1;
+
+            avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
         } else { // MPEG-2
             // MPEG-2 fps
             av_reduce(&s->avctx->framerate.num,
                       &s->avctx->framerate.den,
-                      ff_mpeg12_frame_rate_tab[s->frame_rate_index].num * s1->frame_rate_ext.num * 2,
+                      ff_mpeg12_frame_rate_tab[s->frame_rate_index].num * s1->frame_rate_ext.num,
                       ff_mpeg12_frame_rate_tab[s->frame_rate_index].den * s1->frame_rate_ext.den,
                       1 << 30);
             avctx->ticks_per_frame = 2;
-            // MPEG-2 aspect
-            if (s->aspect_ratio_info > 1) {
-                AVRational dar =
-                    av_mul_q(av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
-                                      (AVRational) { s1->pan_scan.width,
-                                                     s1->pan_scan.height }),
-                             (AVRational) { s->width, s->height });
 
-                /* We ignore the spec here and guess a bit as reality does not
-                 * match the spec, see for example res_change_ffmpeg_aspect.ts
-                 * and sequence-display-aspect.mpg.
-                 * issue1613, 621, 562 */
-                if ((s1->pan_scan.width == 0) || (s1->pan_scan.height == 0) ||
-                    (av_cmp_q(dar, (AVRational) { 4, 3 }) &&
-                     av_cmp_q(dar, (AVRational) { 16, 9 }))) {
-                    s->avctx->sample_aspect_ratio =
-                        av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
-                                 (AVRational) { s->width, s->height });
-                } else {
-                    s->avctx->sample_aspect_ratio =
-                        av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
-                                 (AVRational) { s1->pan_scan.width, s1->pan_scan.height });
-// issue1613 4/3 16/9 -> 16/9
-// res_change_ffmpeg_aspect.ts 4/3 225/44 ->4/3
-// widescreen-issue562.mpg 4/3 16/9 -> 16/9
-//                    s->avctx->sample_aspect_ratio = av_mul_q(s->avctx->sample_aspect_ratio, (AVRational) {s->width, s->height});
-                    ff_dlog(avctx, "A %d/%d\n",
-                            ff_mpeg2_aspect[s->aspect_ratio_info].num,
-                            ff_mpeg2_aspect[s->aspect_ratio_info].den);
-                    ff_dlog(avctx, "B %d/%d\n", s->avctx->sample_aspect_ratio.num,
-                            s->avctx->sample_aspect_ratio.den);
-                }
-            } else {
-                s->avctx->sample_aspect_ratio =
-                    ff_mpeg2_aspect[s->aspect_ratio_info];
+            switch (s->chroma_format) {
+            case 1: avctx->chroma_sample_location = AVCHROMA_LOC_LEFT; break;
+            case 2:
+            case 3: avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT; break;
+            default: av_assert0(0);
             }
         } // MPEG-2
 
-        ff_set_sar(s->avctx, s->avctx->sample_aspect_ratio);
-
         avctx->pix_fmt = mpeg_get_pixelformat(avctx);
-        // until then pix_fmt may be changed right after codec init
-#if FF_API_XVMC
-        if ((avctx->pix_fmt == AV_PIX_FMT_XVMC_MPEG2_IDCT ||
-             avctx->hwaccel) && avctx->idct_algo == FF_IDCT_AUTO)
-#else
-        if (avctx->hwaccel && avctx->idct_algo == FF_IDCT_AUTO)
-#endif /* FF_API_XVMC */
-            avctx->idct_algo = FF_IDCT_SIMPLE;
+        setup_hwaccel_for_pixfmt(avctx);
 
         /* Quantization matrices may need reordering
          * if DCT permutation is changed. */
@@ -1355,20 +1432,23 @@ static int mpeg1_decode_picture(AVCodecContext *avctx, const uint8_t *buf,
         return AVERROR_INVALIDDATA;
 
     vbv_delay = get_bits(&s->gb, 16);
+    s->vbv_delay = vbv_delay;
     if (s->pict_type == AV_PICTURE_TYPE_P ||
         s->pict_type == AV_PICTURE_TYPE_B) {
         s->full_pel[0] = get_bits1(&s->gb);
         f_code = get_bits(&s->gb, 3);
-        if (f_code == 0 && (avctx->err_recognition & AV_EF_BITSTREAM))
+        if (f_code == 0 && (avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT)))
             return AVERROR_INVALIDDATA;
+        f_code += !f_code;
         s->mpeg_f_code[0][0] = f_code;
         s->mpeg_f_code[0][1] = f_code;
     }
     if (s->pict_type == AV_PICTURE_TYPE_B) {
         s->full_pel[1] = get_bits1(&s->gb);
         f_code = get_bits(&s->gb, 3);
-        if (f_code == 0 && (avctx->err_recognition & AV_EF_BITSTREAM))
+        if (f_code == 0 && (avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT)))
             return AVERROR_INVALIDDATA;
+        f_code += !f_code;
         s->mpeg_f_code[1][0] = f_code;
         s->mpeg_f_code[1][1] = f_code;
     }
@@ -1395,13 +1475,19 @@ static void mpeg_decode_sequence_extension(Mpeg1Context *s1)
     s->avctx->level         = get_bits(&s->gb, 4);
     s->progressive_sequence = get_bits1(&s->gb);   /* progressive_sequence */
     s->chroma_format        = get_bits(&s->gb, 2); /* chroma_format 1=420, 2=422, 3=444 */
+
+    if (!s->chroma_format) {
+        s->chroma_format = 1;
+        av_log(s->avctx, AV_LOG_WARNING, "Chroma format invalid\n");
+    }
+
     horiz_size_ext          = get_bits(&s->gb, 2);
     vert_size_ext           = get_bits(&s->gb, 2);
     s->width  |= (horiz_size_ext << 12);
     s->height |= (vert_size_ext  << 12);
     bit_rate_ext = get_bits(&s->gb, 12);  /* XXX: handle it */
-    s->bit_rate += (bit_rate_ext << 18) * 400;
-    skip_bits1(&s->gb); /* marker */
+    s->bit_rate += (bit_rate_ext << 18) * 400LL;
+    check_marker(&s->gb, "after bit rate extension");
     s->avctx->rc_buffer_size += get_bits(&s->gb, 8) * 1024 * 16 << 10;
 
     s->low_delay = get_bits1(&s->gb);
@@ -1416,8 +1502,8 @@ static void mpeg_decode_sequence_extension(Mpeg1Context *s1)
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
         av_log(s->avctx, AV_LOG_DEBUG,
-               "profile: %d, level: %d vbv buffer: %d, bitrate:%d\n",
-               s->avctx->profile, s->avctx->level,
+               "profile: %d, level: %d ps: %d cf:%d vbv buffer: %d, bitrate:%"PRId64"\n",
+               s->avctx->profile, s->avctx->level, s->progressive_sequence, s->chroma_format,
                s->avctx->rc_buffer_size, s->bit_rate);
 }
 
@@ -1492,7 +1578,7 @@ static int load_matrix(MpegEncContext *s, uint16_t matrix0[64],
             return AVERROR_INVALIDDATA;
         }
         if (intra && i == 0 && v != 8) {
-            av_log(s->avctx, AV_LOG_ERROR, "intra matrix invalid, ignoring\n");
+            av_log(s->avctx, AV_LOG_DEBUG, "intra matrix specifies invalid DC quantizer %d, ignoring\n", v);
             v = 8; // needed by pink.mpg / issue1046
         }
         matrix0[j] = v;
@@ -1538,6 +1624,11 @@ static void mpeg_decode_picture_coding_extension(Mpeg1Context *s1)
         s->current_picture.f->pict_type = s->pict_type;
         s->current_picture.f->key_frame = s->pict_type == AV_PICTURE_TYPE_I;
     }
+    s->mpeg_f_code[0][0] += !s->mpeg_f_code[0][0];
+    s->mpeg_f_code[0][1] += !s->mpeg_f_code[0][1];
+    s->mpeg_f_code[1][0] += !s->mpeg_f_code[1][0];
+    s->mpeg_f_code[1][1] += !s->mpeg_f_code[1][1];
+
     s->intra_dc_precision         = get_bits(&s->gb, 2);
     s->picture_structure          = get_bits(&s->gb, 2);
     s->top_field_first            = get_bits1(&s->gb);
@@ -1550,32 +1641,6 @@ static void mpeg_decode_picture_coding_extension(Mpeg1Context *s1)
     s->chroma_420_type            = get_bits1(&s->gb);
     s->progressive_frame          = get_bits1(&s->gb);
 
-    if (s->progressive_sequence && !s->progressive_frame) {
-        s->progressive_frame = 1;
-        av_log(s->avctx, AV_LOG_ERROR,
-               "interlaced frame in progressive sequence, ignoring\n");
-    }
-
-    if (s->picture_structure == 0 ||
-        (s->progressive_frame && s->picture_structure != PICT_FRAME)) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "picture_structure %d invalid, ignoring\n",
-               s->picture_structure);
-        s->picture_structure = PICT_FRAME;
-    }
-
-    if (s->progressive_sequence && !s->frame_pred_frame_dct)
-        av_log(s->avctx, AV_LOG_WARNING, "invalid frame_pred_frame_dct\n");
-
-    if (s->picture_structure == PICT_FRAME) {
-        s->first_field = 0;
-        s->v_edge_pos  = 16 * s->mb_height;
-    } else {
-        s->first_field ^= 1;
-        s->v_edge_pos   = 8 * s->mb_height;
-        memset(s->mbskip_table, 0, s->mb_stride * s->mb_height);
-    }
-
     if (s->alternate_scan) {
         ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_alternate_vertical_scan);
         ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_alternate_vertical_scan);
@@ -1638,6 +1703,7 @@ static int mpeg_field_start(MpegEncContext *s, const uint8_t *buf, int buf_size)
             if (sd)
                 memcpy(sd->data, s1->a53_caption, s1->a53_caption_size);
             av_freep(&s1->a53_caption);
+            avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
         }
 
         if (s1->has_stereo3d) {
@@ -1672,9 +1738,11 @@ static int mpeg_field_start(MpegEncContext *s, const uint8_t *buf, int buf_size)
 
         if (s->avctx->hwaccel &&
             (s->avctx->slice_flags & SLICE_FLAG_ALLOW_FIELD)) {
-            if (s->avctx->hwaccel->end_frame(s->avctx) < 0)
+            if ((ret = s->avctx->hwaccel->end_frame(s->avctx)) < 0) {
                 av_log(avctx, AV_LOG_ERROR,
                        "hardware accelerator failed to decode first field\n");
+                return ret;
+            }
         }
 
         for (i = 0; i < 4; i++) {
@@ -1690,16 +1758,6 @@ static int mpeg_field_start(MpegEncContext *s, const uint8_t *buf, int buf_size)
             return ret;
     }
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-// ff_mpv_frame_start will call this function too,
-// but we need to call it on every field
-    if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration)
-        if (ff_xvmc_field_start(s, avctx) < 0)
-            return -1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
-
     return 0;
 }
 
@@ -1716,15 +1774,18 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
                              const uint8_t **buf, int buf_size)
 {
     AVCodecContext *avctx = s->avctx;
+    const int lowres      = s->avctx->lowres;
     const int field_pic   = s->picture_structure != PICT_FRAME;
     int ret;
 
     s->resync_mb_x =
     s->resync_mb_y = -1;
 
-    assert(mb_y < s->mb_height);
+    av_assert0(mb_y < s->mb_height);
 
     init_get_bits(&s->gb, *buf, buf_size * 8);
+    if (s->codec_id != AV_CODEC_ID_MPEG1VIDEO && s->mb_height > 2800/16)
+        skip_bits(&s->gb, 3);
 
     ff_mpeg1_clean_buffers(s);
     s->interlaced_dct = 0;
@@ -1737,8 +1798,8 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
     }
 
     /* extra slice info */
-    while (get_bits1(&s->gb) != 0)
-        skip_bits(&s->gb, 8);
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     s->mb_x = 0;
 
@@ -1768,7 +1829,7 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
         return AVERROR_INVALIDDATA;
     }
 
-    if (avctx->hwaccel) {
+    if (avctx->hwaccel && avctx->hwaccel->decode_slice) {
         const uint8_t *buf_end, *buf_start = *buf - 4; /* include start_code */
         int start_code = -1;
         buf_end = avpriv_find_start_code(buf_start + 2, *buf + buf_size, &start_code);
@@ -1808,13 +1869,9 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
     }
 
     for (;;) {
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
         // If 1, we memcpy blocks in xvmcvideo.
-        if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration > 1)
+        if ((CONFIG_MPEG1_XVMC_HWACCEL || CONFIG_MPEG2_XVMC_HWACCEL) && s->pack_pblocks)
             ff_xvmc_init_block(s); // set s->block
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
 
         if ((ret = mpeg_decode_mb(s, s->block)) < 0)
             return ret;
@@ -1846,22 +1903,22 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     s->current_picture.motion_val[dir][xy + 1][1] = motion_y;
                     s->current_picture.ref_index [dir][b8_xy]     =
                     s->current_picture.ref_index [dir][b8_xy + 1] = s->field_select[dir][i];
-                    assert(s->field_select[dir][i] == 0 ||
-                           s->field_select[dir][i] == 1);
+                    av_assert2(s->field_select[dir][i] == 0 ||
+                               s->field_select[dir][i] == 1);
                 }
                 xy    += wrap;
                 b8_xy += 2;
             }
         }
 
-        s->dest[0] += 16;
-        s->dest[1] += 16 >> s->chroma_x_shift;
-        s->dest[2] += 16 >> s->chroma_x_shift;
+        s->dest[0] += 16 >> lowres;
+        s->dest[1] +=(16 >> lowres) >> s->chroma_x_shift;
+        s->dest[2] +=(16 >> lowres) >> s->chroma_x_shift;
 
         ff_mpv_decode_mb(s, s->block);
 
         if (++s->mb_x >= s->mb_width) {
-            const int mb_size = 16;
+            const int mb_size = 16 >> s->avctx->lowres;
 
             ff_mpeg_draw_horiz_band(s, mb_size * (s->mb_y >> field_pic), mb_size);
             ff_mpv_report_decode_progress(s);
@@ -1879,15 +1936,35 @@ FF_ENABLE_DEPRECATION_WARNINGS
                              s->progressive_frame == 0
                              /* vbv_delay == 0xBBB || 0xE10 */;
 
+                if (left >= 32 && !is_d10) {
+                    GetBitContext gb = s->gb;
+                    align_get_bits(&gb);
+                    if (show_bits(&gb, 24) == 0x060E2B) {
+                        av_log(avctx, AV_LOG_DEBUG, "Invalid MXF data found in video stream\n");
+                        is_d10 = 1;
+                    }
+                }
+
                 if (left < 0 ||
                     (left && show_bits(&s->gb, FFMIN(left, 23)) && !is_d10) ||
-                    ((avctx->err_recognition & AV_EF_BUFFER) && left > 8)) {
+                    ((avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_AGGRESSIVE)) && left > 8)) {
                     av_log(avctx, AV_LOG_ERROR, "end mismatch left=%d %0X\n",
                            left, show_bits(&s->gb, FFMIN(left, 23)));
                     return AVERROR_INVALIDDATA;
                 } else
                     goto eos;
             }
+            // There are some files out there which are missing the last slice
+            // in cases where the slice is completely outside the visible
+            // area, we detect this here instead of running into the end expecting
+            // more data
+            if (s->mb_y >= ((s->height + 15) >> 4) &&
+                !s->progressive_sequence &&
+                get_bits_left(&s->gb) <= 8 &&
+                get_bits_left(&s->gb) >= 0 &&
+                s->mb_skip_run == -1 &&
+                show_bits(&s->gb, 8) == 0)
+                goto eos;
 
             ff_init_block_index(s);
         }
@@ -1953,8 +2030,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
         }
     }
 eos: // end of slice
+    if (get_bits_left(&s->gb) < 0) {
+        av_log(s, AV_LOG_ERROR, "overread %d\n", -get_bits_left(&s->gb));
+        return AVERROR_INVALIDDATA;
+    }
     *buf += (get_bits_count(&s->gb) - 1) / 8;
-    ff_dlog(s, "y %d %d %d %d\n", s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y);
+    ff_dlog(s, "Slice start:%d %d  end:%d %d\n", s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y);
     return 0;
 }
 
@@ -1994,7 +2075,10 @@ static int slice_decode_thread(AVCodecContext *c, void *arg)
 
         start_code = -1;
         buf        = avpriv_find_start_code(buf, s->gb.buffer_end, &start_code);
-        mb_y       = (start_code - SLICE_MIN_START_CODE) << field_pic;
+        mb_y       = start_code - SLICE_MIN_START_CODE;
+        if (s->codec_id != AV_CODEC_ID_MPEG1VIDEO && s->mb_height > 2800/16)
+            mb_y += (*buf&0xE0)<<2;
+        mb_y <<= field_pic;
         if (s->picture_structure == PICT_BOTTOM_FIELD)
             mb_y++;
         if (mb_y < 0 || mb_y >= s->end_mb_y)
@@ -2015,20 +2099,16 @@ static int slice_end(AVCodecContext *avctx, AVFrame *pict)
         return 0;
 
     if (s->avctx->hwaccel) {
-        if (s->avctx->hwaccel->end_frame(s->avctx) < 0)
+        int ret = s->avctx->hwaccel->end_frame(s->avctx);
+        if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR,
                    "hardware accelerator failed to decode picture\n");
+            return ret;
+        }
     }
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration)
-        ff_xvmc_field_end(s);
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
-
     /* end of slice reached */
-    if (/* s->mb_y << field_pic == s->mb_height && */ !s->first_field) {
+    if (/* s->mb_y << field_pic == s->mb_height && */ !s->first_field && !s1->first_slice) {
         /* end of image */
 
         ff_er_frame_end(&s->er);
@@ -2039,7 +2119,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
             int ret = av_frame_ref(pict, s->current_picture_ptr->f);
             if (ret < 0)
                 return ret;
-            ff_print_debug_info(s, s->current_picture_ptr);
+            ff_print_debug_info(s, s->current_picture_ptr, pict);
+            ff_mpv_export_qp_table(s, pict, s->current_picture_ptr, FF_QSCALE_TYPE_MPEG2);
         } else {
             if (avctx->active_thread_type & FF_THREAD_FRAME)
                 s->picture_number++;
@@ -2049,7 +2130,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 int ret = av_frame_ref(pict, s->last_picture_ptr->f);
                 if (ret < 0)
                     return ret;
-                ff_print_debug_info(s, s->last_picture_ptr);
+                ff_print_debug_info(s, s->last_picture_ptr, pict);
+                ff_mpv_export_qp_table(s, pict, s->last_picture_ptr, FF_QSCALE_TYPE_MPEG2);
             }
         }
 
@@ -2074,28 +2156,25 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
     if (width == 0 || height == 0) {
         av_log(avctx, AV_LOG_WARNING,
                "Invalid horizontal or vertical size value.\n");
-        if (avctx->err_recognition & AV_EF_BITSTREAM)
+        if (avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_COMPLIANT))
             return AVERROR_INVALIDDATA;
     }
     s->aspect_ratio_info = get_bits(&s->gb, 4);
     if (s->aspect_ratio_info == 0) {
         av_log(avctx, AV_LOG_ERROR, "aspect ratio has forbidden 0 value\n");
-        if (avctx->err_recognition & AV_EF_BITSTREAM)
+        if (avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_COMPLIANT))
             return AVERROR_INVALIDDATA;
     }
     s->frame_rate_index = get_bits(&s->gb, 4);
     if (s->frame_rate_index == 0 || s->frame_rate_index > 13) {
         av_log(avctx, AV_LOG_WARNING,
                "frame_rate_index %d is invalid\n", s->frame_rate_index);
-        return AVERROR_INVALIDDATA;
+        s->frame_rate_index = 1;
     }
-    s->bit_rate = get_bits(&s->gb, 18) * 400;
-    if (get_bits1(&s->gb) == 0) { /* marker */
-        av_log(avctx, AV_LOG_ERROR, "Marker in sequence header missing\n");
+    s->bit_rate = get_bits(&s->gb, 18) * 400LL;
+    if (check_marker(&s->gb, "in sequence header") == 0) {
         return AVERROR_INVALIDDATA;
     }
-    s->width  = width;
-    s->height = height;
 
     s->avctx->rc_buffer_size = get_bits(&s->gb, 10) * 1024 * 16;
     skip_bits(&s->gb, 1);
@@ -2127,21 +2206,26 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
+    s->width  = width;
+    s->height = height;
+
     /* We set MPEG-2 parameters so that it emulates MPEG-1. */
     s->progressive_sequence = 1;
     s->progressive_frame    = 1;
     s->picture_structure    = PICT_FRAME;
+    s->first_field          = 0;
     s->frame_pred_frame_dct = 1;
     s->chroma_format        = 1;
     s->codec_id             =
     s->avctx->codec_id      = AV_CODEC_ID_MPEG1VIDEO;
     s->out_format           = FMT_MPEG1;
+    s->swap_uv              = 0; // AFAIK VCR2 does not have SEQ_HEADER
     if (s->avctx->flags & AV_CODEC_FLAG_LOW_DELAY)
         s->low_delay = 1;
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
-        av_log(s->avctx, AV_LOG_DEBUG, "vbv buffer: %d, bitrate:%d\n",
-               s->avctx->rc_buffer_size, s->bit_rate);
+        av_log(s->avctx, AV_LOG_DEBUG, "vbv buffer: %d, bitrate:%"PRId64", aspect_ratio_info: %d \n",
+               s->avctx->rc_buffer_size, s->bit_rate, s->aspect_ratio_info);
 
     return 0;
 }
@@ -2156,6 +2240,7 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
     s->out_format = FMT_MPEG1;
     if (s1->mpeg_enc_ctx_allocated) {
         ff_mpv_common_end(s);
+        s1->mpeg_enc_ctx_allocated = 0;
     }
     s->width            = avctx->coded_width;
     s->height           = avctx->coded_height;
@@ -2163,14 +2248,7 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
     s->low_delay        = 1;
 
     avctx->pix_fmt = mpeg_get_pixelformat(avctx);
-
-#if FF_API_XVMC
-    if ((avctx->pix_fmt == AV_PIX_FMT_XVMC_MPEG2_IDCT || avctx->hwaccel) &&
-        avctx->idct_algo == FF_IDCT_AUTO)
-#else
-    if (avctx->hwaccel && avctx->idct_algo == FF_IDCT_AUTO)
-#endif /* FF_API_XVMC */
-        avctx->idct_algo = FF_IDCT_SIMPLE;
+    setup_hwaccel_for_pixfmt(avctx);
 
     ff_mpv_idct_init(s);
     if ((ret = ff_mpv_common_init(s)) < 0)
@@ -2191,9 +2269,15 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
     s->progressive_sequence  = 1;
     s->progressive_frame     = 1;
     s->picture_structure     = PICT_FRAME;
+    s->first_field           = 0;
     s->frame_pred_frame_dct  = 1;
     s->chroma_format         = 1;
-    s->codec_id              = s->avctx->codec_id = AV_CODEC_ID_MPEG2VIDEO;
+    if (s->codec_tag == AV_RL32("BW10")) {
+        s->codec_id              = s->avctx->codec_id = AV_CODEC_ID_MPEG1VIDEO;
+    } else {
+        s->swap_uv = 1; // in case of xvmc we need to swap uv for each MB
+        s->codec_id              = s->avctx->codec_id = AV_CODEC_ID_MPEG2VIDEO;
+    }
     s1->save_width           = s->width;
     s1->save_height          = s->height;
     s1->save_progressive_seq = s->progressive_sequence;
@@ -2256,9 +2340,25 @@ static int mpeg_decode_a53_cc(AVCodecContext *avctx,
 static void mpeg_decode_user_data(AVCodecContext *avctx,
                                   const uint8_t *p, int buf_size)
 {
+    Mpeg1Context *s = avctx->priv_data;
     const uint8_t *buf_end = p + buf_size;
     Mpeg1Context *s1 = avctx->priv_data;
 
+#if 0
+    int i;
+    for(i=0; !(!p[i-2] && !p[i-1] && p[i]==1) && i<buf_size; i++){
+        av_log(avctx, AV_LOG_ERROR, "%c", p[i]);
+    }
+    av_log(avctx, AV_LOG_ERROR, "\n");
+#endif
+
+    if (buf_size > 29){
+        int i;
+        for(i=0; i<20; i++)
+            if (!memcmp(p+i, "\0TMPGEXS\0", 9)){
+                s->tmpgexs= 1;
+            }
+    }
     /* we parse the DTG active format information */
     if (buf_end - p >= 5 &&
         p[0] == 'D' && p[1] == 'T' && p[2] == 'G' && p[3] == '1') {
@@ -2317,32 +2417,26 @@ static void mpeg_decode_gop(AVCodecContext *avctx,
 {
     Mpeg1Context *s1  = avctx->priv_data;
     MpegEncContext *s = &s1->mpeg_enc_ctx;
-
-    int time_code_hours, time_code_minutes;
-    int time_code_seconds, time_code_pictures;
     int broken_link;
+    int64_t tc;
 
     init_get_bits(&s->gb, buf, buf_size * 8);
 
-    skip_bits1(&s->gb); /* drop_frame_flag */
-
-    time_code_hours   = get_bits(&s->gb, 5);
-    time_code_minutes = get_bits(&s->gb, 6);
-    skip_bits1(&s->gb); // marker bit
-    time_code_seconds  = get_bits(&s->gb, 6);
-    time_code_pictures = get_bits(&s->gb, 6);
+    tc = avctx->timecode_frame_start = get_bits(&s->gb, 25);
 
-    s1->closed_gop = get_bits1(&s->gb);
+    s->closed_gop = get_bits1(&s->gb);
     /* broken_link indicate that after editing the
      * reference frames of the first B-Frames after GOP I-Frame
      * are missing (open gop) */
     broken_link = get_bits1(&s->gb);
 
-    if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+    if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
+        char tcbuf[AV_TIMECODE_STR_SIZE];
+        av_timecode_make_mpeg_tc_string(tcbuf, tc);
         av_log(s->avctx, AV_LOG_DEBUG,
-               "GOP (%2d:%02d:%02d.[%02d]) closed_gop=%d broken_link=%d\n",
-               time_code_hours, time_code_minutes, time_code_seconds,
-               time_code_pictures, s1->closed_gop, broken_link);
+               "GOP (%s) closed_gop=%d broken_link=%d\n",
+               tcbuf, s->closed_gop, broken_link);
+    }
 }
 
 static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
@@ -2354,6 +2448,7 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
     const uint8_t *buf_end = buf + buf_size;
     int ret, input_size;
     int last_code = 0, skip_frame = 0;
+    int picture_start_code_seen = 0;
 
     for (;;) {
         /* find next start code */
@@ -2365,6 +2460,7 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                     (avctx->active_thread_type & FF_THREAD_SLICE) &&
                     !avctx->hwaccel) {
                     int i;
+                    av_assert0(avctx->thread_count > 1);
 
                     avctx->execute(avctx, slice_decode_thread,
                                    &s2->thread_context[0], NULL,
@@ -2373,6 +2469,12 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                         s2->er.error_count += s2->thread_context[i]->er.error_count;
                 }
 
+#if FF_API_VDPAU
+                if ((CONFIG_MPEG_VDPAU_DECODER || CONFIG_MPEG1_VDPAU_DECODER)
+                    && uses_vdpau(avctx))
+                    ff_vdpau_mpeg_picture_complete(s2, buf, buf_size, s->slice_count);
+#endif
+
                 ret = slice_end(avctx, picture);
                 if (ret < 0)
                     return ret;
@@ -2383,13 +2485,17 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                 }
             }
             s2->pict_type = 0;
+
+            if (avctx->err_recognition & AV_EF_EXPLODE && s2->er.error_count)
+                return AVERROR_INVALIDDATA;
+
             return FFMAX(0, buf_ptr - buf - s2->parse_context.last_index);
         }
 
         input_size = buf_end - buf_ptr;
 
         if (avctx->debug & FF_DEBUG_STARTCODE)
-            av_log(avctx, AV_LOG_DEBUG, "%3"PRIX32" at %td left %d\n",
+            av_log(avctx, AV_LOG_DEBUG, "%3"PRIX32" at %"PTRDIFF_SPECIFIER" left %d\n",
                    start_code, buf_ptr - buf, input_size);
 
         /* prepare data for next start code */
@@ -2397,7 +2503,8 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
         case SEQ_START_CODE:
             if (last_code == 0) {
                 mpeg1_decode_sequence(avctx, buf_ptr, input_size);
-                s->sync = 1;
+                if (buf != avctx->extradata)
+                    s->sync = 1;
             } else {
                 av_log(avctx, AV_LOG_ERROR,
                        "ignoring SEQ_START_CODE after %X\n", last_code);
@@ -2407,12 +2514,24 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
             break;
 
         case PICTURE_START_CODE:
+            if (picture_start_code_seen && s2->picture_structure == PICT_FRAME) {
+               /* If it's a frame picture, there can't be more than one picture header.
+                  Yet, it does happen and we need to handle it. */
+               av_log(avctx, AV_LOG_WARNING, "ignoring extra picture following a frame-picture\n");
+               break;
+            }
+            picture_start_code_seen = 1;
+
             if (s2->width <= 0 || s2->height <= 0) {
                 av_log(avctx, AV_LOG_ERROR, "Invalid frame dimensions %dx%d.\n",
                        s2->width, s2->height);
                 return AVERROR_INVALIDDATA;
             }
 
+            if (s->tmpgexs){
+                s2->intra_dc_precision= 3;
+                s2->intra_matrix[0]= 1;
+            }
             if (HAVE_THREADS && (avctx->active_thread_type & FF_THREAD_SLICE) &&
                 !avctx->hwaccel && s->slice_count) {
                 int i;
@@ -2496,14 +2615,50 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
             break;
         default:
             if (start_code >= SLICE_MIN_START_CODE &&
+                start_code <= SLICE_MAX_START_CODE && last_code == PICTURE_START_CODE) {
+                if (s2->progressive_sequence && !s2->progressive_frame) {
+                    s2->progressive_frame = 1;
+                    av_log(s2->avctx, AV_LOG_ERROR,
+                           "interlaced frame in progressive sequence, ignoring\n");
+                }
+
+                if (s2->picture_structure == 0 ||
+                    (s2->progressive_frame && s2->picture_structure != PICT_FRAME)) {
+                    av_log(s2->avctx, AV_LOG_ERROR,
+                           "picture_structure %d invalid, ignoring\n",
+                           s2->picture_structure);
+                    s2->picture_structure = PICT_FRAME;
+                }
+
+                if (s2->progressive_sequence && !s2->frame_pred_frame_dct)
+                    av_log(s2->avctx, AV_LOG_WARNING, "invalid frame_pred_frame_dct\n");
+
+                if (s2->picture_structure == PICT_FRAME) {
+                    s2->first_field = 0;
+                    s2->v_edge_pos  = 16 * s2->mb_height;
+                } else {
+                    s2->first_field ^= 1;
+                    s2->v_edge_pos   = 8 * s2->mb_height;
+                    memset(s2->mbskip_table, 0, s2->mb_stride * s2->mb_height);
+                }
+            }
+            if (start_code >= SLICE_MIN_START_CODE &&
                 start_code <= SLICE_MAX_START_CODE && last_code != 0) {
                 const int field_pic = s2->picture_structure != PICT_FRAME;
-                int mb_y = (start_code - SLICE_MIN_START_CODE) << field_pic;
+                int mb_y = start_code - SLICE_MIN_START_CODE;
                 last_code = SLICE_MIN_START_CODE;
+                if (s2->codec_id != AV_CODEC_ID_MPEG1VIDEO && s2->mb_height > 2800/16)
+                    mb_y += (*buf_ptr&0xE0)<<2;
 
+                mb_y <<= field_pic;
                 if (s2->picture_structure == PICT_BOTTOM_FIELD)
                     mb_y++;
 
+                if (buf_end - buf_ptr < 2) {
+                    av_log(s2->avctx, AV_LOG_ERROR, "slice too small\n");
+                    return AVERROR_INVALIDDATA;
+                }
+
                 if (mb_y >= s2->mb_height) {
                     av_log(s2->avctx, AV_LOG_ERROR,
                            "slice below image (%d >= %d)\n", mb_y, s2->mb_height);
@@ -2514,13 +2669,13 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                     /* Skip B-frames if we do not have reference frames and
                      * GOP is not closed. */
                     if (s2->pict_type == AV_PICTURE_TYPE_B) {
-                        if (!s->closed_gop) {
+                        if (!s2->closed_gop) {
                             skip_frame = 1;
                             break;
                         }
                     }
                 }
-                if (s2->pict_type == AV_PICTURE_TYPE_I)
+                if (s2->pict_type == AV_PICTURE_TYPE_I || (s2->avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL))
                     s->sync = 1;
                 if (!s2->next_picture_ptr) {
                     /* Skip P-frames if we do not have a reference frame or
@@ -2567,12 +2722,20 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                     return AVERROR_INVALIDDATA;
                 }
 
+#if FF_API_VDPAU
+                if (uses_vdpau(avctx)) {
+                    s->slice_count++;
+                    break;
+                }
+#endif
+
                 if (HAVE_THREADS &&
                     (avctx->active_thread_type & FF_THREAD_SLICE) &&
                     !avctx->hwaccel) {
                     int threshold = (s2->mb_height * s->slice_count +
                                      s2->slice_context_count / 2) /
                                     s2->slice_context_count;
+                    av_assert0(avctx->thread_count > 1);
                     if (threshold <= mb_y) {
                         MpegEncContext *thread_context = s2->thread_context[s->slice_count];
 
@@ -2615,11 +2778,11 @@ static int mpeg_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_output, AVPacket *avpkt)
 {
     const uint8_t *buf = avpkt->data;
+    int ret;
     int buf_size = avpkt->size;
     Mpeg1Context *s = avctx->priv_data;
     AVFrame *picture = data;
     MpegEncContext *s2 = &s->mpeg_enc_ctx;
-    ff_dlog(avctx, "fill_buffer\n");
 
     if (buf_size == 0 || (buf_size == 4 && AV_RB32(buf) == SEQ_END_CODE)) {
         /* special case for last picture */
@@ -2644,20 +2807,33 @@ static int mpeg_decode_frame(AVCodecContext *avctx, void *data,
             return buf_size;
     }
 
-    if (s->mpeg_enc_ctx_allocated == 0 && avctx->codec_tag == AV_RL32("VCR2"))
+    s2->codec_tag = avpriv_toupper4(avctx->codec_tag);
+    if (s->mpeg_enc_ctx_allocated == 0 && (   s2->codec_tag == AV_RL32("VCR2")
+                                           || s2->codec_tag == AV_RL32("BW10")
+                                          ))
         vcr2_init_sequence(avctx);
 
     s->slice_count = 0;
 
     if (avctx->extradata && !s->extradata_decoded) {
-        int ret = decode_chunks(avctx, picture, got_output,
-                                avctx->extradata, avctx->extradata_size);
+        ret = decode_chunks(avctx, picture, got_output,
+                            avctx->extradata, avctx->extradata_size);
+        if (*got_output) {
+            av_log(avctx, AV_LOG_ERROR, "picture in extradata\n");
+            *got_output = 0;
+        }
         s->extradata_decoded = 1;
-        if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
+        if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE)) {
+            s2->current_picture_ptr = NULL;
             return ret;
+        }
     }
 
-    return decode_chunks(avctx, picture, got_output, buf, buf_size);
+    ret = decode_chunks(avctx, picture, got_output, buf, buf_size);
+    if (ret<0 || *got_output)
+        s2->current_picture_ptr = NULL;
+
+    return ret;
 }
 
 static void flush(AVCodecContext *avctx)
@@ -2665,7 +2841,6 @@ static void flush(AVCodecContext *avctx)
     Mpeg1Context *s = avctx->priv_data;
 
     s->sync       = 0;
-    s->closed_gop = 0;
 
     ff_mpeg_flush(avctx);
 }
@@ -2705,6 +2880,7 @@ AVCodec ff_mpeg1video_decoder = {
                              AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY |
                              AV_CODEC_CAP_SLICE_THREADS,
     .flush                 = flush,
+    .max_lowres            = 3,
     .update_thread_context = ONLY_IF_THREADS_ENABLED(mpeg_decode_update_thread_context)
 };
 
@@ -2721,11 +2897,28 @@ AVCodec ff_mpeg2video_decoder = {
                       AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY |
                       AV_CODEC_CAP_SLICE_THREADS,
     .flush          = flush,
+    .max_lowres     = 3,
     .profiles       = NULL_IF_CONFIG_SMALL(mpeg2_video_profiles),
 };
 
+//legacy decoder
+AVCodec ff_mpegvideo_decoder = {
+    .name           = "mpegvideo",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-1 video"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .priv_data_size = sizeof(Mpeg1Context),
+    .init           = mpeg_decode_init,
+    .close          = mpeg_decode_end,
+    .decode         = mpeg_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SLICE_THREADS,
+    .flush          = flush,
+    .max_lowres     = 3,
+};
+
 #if FF_API_XVMC
 #if CONFIG_MPEG_XVMC_DECODER
+FF_DISABLE_DEPRECATION_WARNINGS
 static av_cold int mpeg_mc_decode_init(AVCodecContext *avctx)
 {
     if (avctx->active_thread_type & FF_THREAD_SLICE)
@@ -2757,6 +2950,38 @@ AVCodec ff_mpeg_xvmc_decoder = {
                       AV_CODEC_CAP_DELAY,
     .flush          = flush,
 };
-
+FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 #endif /* FF_API_XVMC */
+
+#if CONFIG_MPEG_VDPAU_DECODER && FF_API_VDPAU
+AVCodec ff_mpeg_vdpau_decoder = {
+    .name           = "mpegvideo_vdpau",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-1/2 video (VDPAU acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .priv_data_size = sizeof(Mpeg1Context),
+    .init           = mpeg_decode_init,
+    .close          = mpeg_decode_end,
+    .decode         = mpeg_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_TRUNCATED |
+                      AV_CODEC_CAP_HWACCEL_VDPAU | AV_CODEC_CAP_DELAY,
+    .flush          = flush,
+};
+#endif
+
+#if CONFIG_MPEG1_VDPAU_DECODER && FF_API_VDPAU
+AVCodec ff_mpeg1_vdpau_decoder = {
+    .name           = "mpeg1video_vdpau",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-1 video (VDPAU acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG1VIDEO,
+    .priv_data_size = sizeof(Mpeg1Context),
+    .init           = mpeg_decode_init,
+    .close          = mpeg_decode_end,
+    .decode         = mpeg_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_TRUNCATED |
+                      AV_CODEC_CAP_HWACCEL_VDPAU | AV_CODEC_CAP_DELAY,
+    .flush          = flush,
+};
+#endif
diff --git a/libavcodec/mpeg12enc.c b/libavcodec/mpeg12enc.c
index 61716785a1..b88e2e6b8e 100644
--- a/libavcodec/mpeg12enc.c
+++ b/libavcodec/mpeg12enc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,8 +28,10 @@
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/log.h"
 #include "libavutil/opt.h"
+#include "libavutil/timecode.h"
 #include "libavutil/stereo3d.h"
 
 #include "avcodec.h"
@@ -40,11 +42,6 @@
 #include "mpegutils.h"
 #include "mpegvideo.h"
 
-
-static const uint8_t inv_non_linear_qscale[] = {
-    0, 2, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-};
-
 static const uint8_t svcd_scan_offset_placeholder[] = {
     0x10, 0x0E, 0x00, 0x80, 0x81, 0x00, 0x80,
     0x81, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -86,7 +83,7 @@ static av_cold void init_uni_ac_vlc(RLTable *rl, uint8_t *uni_ac_vlc_len)
                 /* length of VLC and sign */
                 len = rl->table_vlc[code][1] + 1;
             } else {
-                len = rl->table_vlc[111][1] + 6;    /* rl->n */
+                len = rl->table_vlc[111 /* rl->n */][1] + 6;
 
                 if (alevel < 128)
                     len += 8;
@@ -102,26 +99,37 @@ static av_cold void init_uni_ac_vlc(RLTable *rl, uint8_t *uni_ac_vlc_len)
 static int find_frame_rate_index(MpegEncContext *s)
 {
     int i;
-    int64_t dmin = INT64_MAX;
-    int64_t d;
+    AVRational bestq = (AVRational) {0, 0};
+    AVRational ext;
+    AVRational target = av_inv_q(s->avctx->time_base);
 
     for (i = 1; i < 14; i++) {
-        int64_t n0 = 1001LL / ff_mpeg12_frame_rate_tab[i].den *
-                     ff_mpeg12_frame_rate_tab[i].num * s->avctx->time_base.num;
-        int64_t n1 = 1001LL * s->avctx->time_base.den;
-
         if (s->avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL &&
             i >= 9)
             break;
 
-        d = FFABS(n0 - n1);
-        if (d < dmin) {
-            dmin                = d;
-            s->frame_rate_index = i;
+        for (ext.num=1; ext.num <= 4; ext.num++) {
+            for (ext.den=1; ext.den <= 32; ext.den++) {
+                AVRational q = av_mul_q(ext, ff_mpeg12_frame_rate_tab[i]);
+
+                if (s->codec_id != AV_CODEC_ID_MPEG2VIDEO && (ext.den!=1 || ext.num!=1))
+                    continue;
+                if (av_gcd(ext.den, ext.num) != 1)
+                    continue;
+
+                if (    bestq.num==0
+                    || av_nearer_q(target, bestq, q) < 0
+                    || ext.num==1 && ext.den==1 && av_nearer_q(target, bestq, q) == 0) {
+                    bestq               = q;
+                    s->frame_rate_index = i;
+                    s->mpeg2_frame_rate_ext.num = ext.num;
+                    s->mpeg2_frame_rate_ext.den = ext.den;
+                }
+            }
         }
     }
 
-    if (dmin)
+    if (av_cmp_q(target, bestq))
         return -1;
     else
         return 0;
@@ -131,6 +139,9 @@ static av_cold int encode_init(AVCodecContext *avctx)
 {
     MpegEncContext *s = avctx->priv_data;
 
+    if (avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO && avctx->height > 2800)
+        avctx->thread_count = 1;
+
     if (ff_mpv_encode_init(avctx) < 0)
         return -1;
 
@@ -176,12 +187,38 @@ static av_cold int encode_init(AVCodecContext *avctx)
         }
     }
 
+    if ((avctx->width & 0xFFF) == 0 && (avctx->height & 0xFFF) == 1) {
+        av_log(avctx, AV_LOG_ERROR, "Width / Height is invalid for MPEG2\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (s->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
+        if ((avctx->width & 0xFFF) == 0 || (avctx->height & 0xFFF) == 0) {
+            av_log(avctx, AV_LOG_ERROR, "Width or Height are not allowed to be multiples of 4096\n"
+                                        "add '-strict %d' if you want to use them anyway.\n", FF_COMPLIANCE_UNOFFICIAL);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    s->drop_frame_timecode = s->drop_frame_timecode || !!(avctx->flags2 & AV_CODEC_FLAG2_DROP_FRAME_TIMECODE);
+    if (s->drop_frame_timecode)
+        s->tc.flags |= AV_TIMECODE_FLAG_DROPFRAME;
     if (s->drop_frame_timecode && s->frame_rate_index != 4) {
         av_log(avctx, AV_LOG_ERROR,
                "Drop frame time code only allowed with 1001/30000 fps\n");
         return -1;
     }
 
+    if (s->tc_opt_str) {
+        AVRational rate = ff_mpeg12_frame_rate_tab[s->frame_rate_index];
+        int ret = av_timecode_init_from_string(&s->tc, rate, s->tc_opt_str, s);
+        if (ret < 0)
+            return ret;
+        s->drop_frame_timecode = !!(s->tc.flags & AV_TIMECODE_FLAG_DROPFRAME);
+        s->avctx->timecode_frame_start = s->tc.start;
+    } else {
+        s->avctx->timecode_frame_start = 0; // default is -1
+    }
     return 0;
 }
 
@@ -198,11 +235,11 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
     unsigned int vbv_buffer_size, fps, v;
     int i, constraint_parameter_flag;
     uint64_t time_code;
-    float best_aspect_error = 1E10;
-    float aspect_ratio      = av_q2d(s->avctx->sample_aspect_ratio);
+    int64_t best_aspect_error = INT64_MAX;
+    AVRational aspect_ratio = s->avctx->sample_aspect_ratio;
 
-    if (aspect_ratio == 0.0)
-        aspect_ratio = 1.0;             // pixel aspect 1.1 (VGA)
+    if (aspect_ratio.num == 0 || aspect_ratio.den == 0)
+        aspect_ratio = (AVRational){1,1};             // pixel aspect 1.1 (VGA)
 
     if (s->current_picture.f->key_frame) {
         AVRational framerate = ff_mpeg12_frame_rate_tab[s->frame_rate_index];
@@ -210,19 +247,19 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
         /* mpeg1 header repeated every gop */
         put_header(s, SEQ_START_CODE);
 
-        put_sbits(&s->pb, 12, s->width);
-        put_sbits(&s->pb, 12, s->height);
+        put_sbits(&s->pb, 12, s->width  & 0xFFF);
+        put_sbits(&s->pb, 12, s->height & 0xFFF);
 
         for (i = 1; i < 15; i++) {
-            float error = aspect_ratio;
+            int64_t error = aspect_ratio.num * (1LL<<32) / aspect_ratio.den;
             if (s->codec_id == AV_CODEC_ID_MPEG1VIDEO || i <= 1)
-                error -= 1.0 / ff_mpeg1_aspect[i];
+                error -= (1LL<<32) / ff_mpeg1_aspect[i];
             else
-                error -= av_q2d(ff_mpeg2_aspect[i]) * s->height / s->width;
+                error -= (1LL<<32)*ff_mpeg2_aspect[i].num * s->height / s->width / ff_mpeg2_aspect[i].den;
 
             error = FFABS(error);
 
-            if (error < best_aspect_error) {
+            if (error - 2 <= best_aspect_error) {
                 best_aspect_error    = error;
                 s->aspect_ratio_info = i;
             }
@@ -269,6 +306,11 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
         ff_write_quant_matrix(&s->pb, s->avctx->inter_matrix);
 
         if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
+            AVFrameSideData *side_data;
+            int width = s->width;
+            int height = s->height;
+            int use_seq_disp_ext;
+
             put_header(s, EXT_START_CODE);
             put_bits(&s->pb, 4, 1);                 // seq ext
 
@@ -285,20 +327,37 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
             put_bits(&s->pb, 1, 1);                 // marker
             put_bits(&s->pb, 8, vbv_buffer_size >> 10); // vbv buffer ext
             put_bits(&s->pb, 1, s->low_delay);
-            put_bits(&s->pb, 2, 0);                 // frame_rate_ext_n
-            put_bits(&s->pb, 5, 0);                 // frame_rate_ext_d
+            put_bits(&s->pb, 2, s->mpeg2_frame_rate_ext.num-1); // frame_rate_ext_n
+            put_bits(&s->pb, 5, s->mpeg2_frame_rate_ext.den-1); // frame_rate_ext_d
+
+            side_data = av_frame_get_side_data(s->current_picture_ptr->f, AV_FRAME_DATA_PANSCAN);
+            if (side_data) {
+                AVPanScan *pan_scan = (AVPanScan *)side_data->data;
+                if (pan_scan->width && pan_scan->height) {
+                    width = pan_scan->width >> 4;
+                    height = pan_scan->height >> 4;
+                }
+            }
 
-            put_header(s, EXT_START_CODE);
-            put_bits(&s->pb, 4, 2);                         // sequence display extension
-            put_bits(&s->pb, 3, 0);                         // video_format: 0 is components
-            put_bits(&s->pb, 1, 1);                         // colour_description
-            put_bits(&s->pb, 8, s->avctx->color_primaries); // colour_primaries
-            put_bits(&s->pb, 8, s->avctx->color_trc);       // transfer_characteristics
-            put_bits(&s->pb, 8, s->avctx->colorspace);      // matrix_coefficients
-            put_bits(&s->pb, 14, s->width);                 // display_horizontal_size
-            put_bits(&s->pb, 1, 1);                         // marker_bit
-            put_bits(&s->pb, 14, s->height);                // display_vertical_size
-            put_bits(&s->pb, 3, 0);                         // remaining 3 bits are zero padding
+            use_seq_disp_ext = (width != s->width ||
+                                height != s->height ||
+                                s->avctx->color_primaries != AVCOL_PRI_UNSPECIFIED ||
+                                s->avctx->color_trc != AVCOL_TRC_UNSPECIFIED ||
+                                s->avctx->colorspace != AVCOL_SPC_UNSPECIFIED);
+
+            if (s->seq_disp_ext == 1 || (s->seq_disp_ext == -1 && use_seq_disp_ext)) {
+                put_header(s, EXT_START_CODE);
+                put_bits(&s->pb, 4, 2);                         // sequence display extension
+                put_bits(&s->pb, 3, 0);                         // video_format: 0 is components
+                put_bits(&s->pb, 1, 1);                         // colour_description
+                put_bits(&s->pb, 8, s->avctx->color_primaries); // colour_primaries
+                put_bits(&s->pb, 8, s->avctx->color_trc);       // transfer_characteristics
+                put_bits(&s->pb, 8, s->avctx->colorspace);      // matrix_coefficients
+                put_bits(&s->pb, 14, width);                    // display_horizontal_size
+                put_bits(&s->pb, 1, 1);                         // marker_bit
+                put_bits(&s->pb, 14, height);                   // display_vertical_size
+                put_bits(&s->pb, 3, 0);                         // remaining 3 bits are zero padding
+            }
         }
 
         put_header(s, GOP_START_CODE);
@@ -310,21 +369,17 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
                     s->avctx->timecode_frame_start;
 
         s->gop_picture_number = s->current_picture_ptr->f->coded_picture_number;
-        if (s->drop_frame_timecode) {
-            /* only works for NTSC 29.97 */
-            int d = time_code / 17982;
-            int m = time_code % 17982;
-            /* not needed since -2,-1 / 1798 in C returns 0 */
-            // if (m < 2)
-            //     m += 2;
-            time_code += 18 * d + 2 * ((m - 2) / 1798);
-        }
+
+        av_assert0(s->drop_frame_timecode == !!(s->tc.flags & AV_TIMECODE_FLAG_DROPFRAME));
+        if (s->drop_frame_timecode)
+            time_code = av_timecode_adjust_ntsc_framenum2(time_code, fps);
+
         put_bits(&s->pb, 5, (uint32_t)((time_code / (fps * 3600)) % 24));
         put_bits(&s->pb, 6, (uint32_t)((time_code / (fps *   60)) % 60));
         put_bits(&s->pb, 1, 1);
         put_bits(&s->pb, 6, (uint32_t)((time_code / fps) % 60));
         put_bits(&s->pb, 6, (uint32_t)((time_code % fps)));
-        put_bits(&s->pb, 1, !!(s->avctx->flags & AV_CODEC_FLAG_CLOSED_GOP));
+        put_bits(&s->pb, 1, !!(s->avctx->flags & AV_CODEC_FLAG_CLOSED_GOP) || s->intra_only || !s->gop_picture_number);
         put_bits(&s->pb, 1, 0);                     // broken link
     }
 }
@@ -341,17 +396,12 @@ static inline void encode_mb_skip_run(MpegEncContext *s, int run)
 
 static av_always_inline void put_qscale(MpegEncContext *s)
 {
-    if (s->q_scale_type) {
-        assert(s->qscale >= 1 && s->qscale <= 12);
-        put_bits(&s->pb, 5, inv_non_linear_qscale[s->qscale]);
-    } else {
-        put_bits(&s->pb, 5, s->qscale);
-    }
+    put_bits(&s->pb, 5, s->qscale);
 }
 
 void ff_mpeg1_encode_slice_header(MpegEncContext *s)
 {
-    if (s->height > 2800) {
+    if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO && s->height > 2800) {
         put_header(s, SLICE_MIN_START_CODE + (s->mb_y & 127));
         /* slice_vertical_position_extension */
         put_bits(&s->pb, 3, s->mb_y >> 7);
@@ -420,7 +470,7 @@ void ff_mpeg1_encode_picture_header(MpegEncContext *s, int picture_number)
         }
         put_bits(&s->pb, 2, s->intra_dc_precision);
 
-        assert(s->picture_structure == PICT_FRAME);
+        av_assert0(s->picture_structure == PICT_FRAME);
         put_bits(&s->pb, 2, s->picture_structure);
         if (s->progressive_sequence)
             put_bits(&s->pb, 1, 0);             /* no repeat */
@@ -532,7 +582,7 @@ static void mpeg1_encode_motion(MpegEncContext *s, int val, int f_or_b_code)
             sign = 1;
         }
 
-        assert(code > 0 && code <= 16);
+        av_assert2(code > 0 && code <= 16);
 
         put_bits(&s->pb,
                  ff_mpeg12_mbMotionVectorTable[code][1],
@@ -560,12 +610,12 @@ static inline void encode_dc(MpegEncContext *s, int diff, int component)
             put_bits(&s->pb,
                      ff_mpeg12_vlc_dc_lum_bits[index] + index,
                      (ff_mpeg12_vlc_dc_lum_code[index] << index) +
-                     (diff & ((1 << index) - 1)));
+                     av_mod_uintp2(diff, index));
         else
             put_bits(&s->pb,
                      ff_mpeg12_vlc_dc_chroma_bits[index] + index,
                      (ff_mpeg12_vlc_dc_chroma_code[index] << index) +
-                     (diff & ((1 << index) - 1)));
+                     av_mod_uintp2(diff, index));
     } else {
         if (component == 0)
             put_bits(&s->pb,
@@ -675,7 +725,7 @@ static av_always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
 
     if (cbp == 0 && !first_mb && s->mv_type == MV_TYPE_16X16 &&
         (mb_x != s->mb_width - 1 ||
-         (mb_y != s->mb_height - 1 && s->codec_id == AV_CODEC_ID_MPEG1VIDEO)) &&
+         (mb_y != s->end_mb_y - 1 && s->codec_id == AV_CODEC_ID_MPEG1VIDEO)) &&
         ((s->pict_type == AV_PICTURE_TYPE_P && (motion_x | motion_y) == 0) ||
          (s->pict_type == AV_PICTURE_TYPE_B && s->mv_dir == s->last_mv_dir &&
           (((s->mv_dir & MV_DIR_FORWARD)
@@ -697,7 +747,7 @@ static av_always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
         }
     } else {
         if (first_mb) {
-            assert(s->mb_skip_run == 0);
+            av_assert0(s->mb_skip_run == 0);
             encode_mb_skip_run(s, s->mb_x);
         } else {
             encode_mb_skip_run(s, s->mb_skip_run);
@@ -776,7 +826,7 @@ static av_always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
                 s->last_mv[0][1][0] = s->last_mv[0][0][0] = motion_x;
                 s->last_mv[0][1][1] = s->last_mv[0][0][1] = motion_y;
             } else {
-                assert(!s->frame_pred_frame_dct && s->mv_type == MV_TYPE_FIELD);
+                av_assert2(!s->frame_pred_frame_dct && s->mv_type == MV_TYPE_FIELD);
 
                 if (cbp) {
                     if (s->dquant) {
@@ -863,8 +913,8 @@ static av_always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
                     s->b_count++;
                 }
             } else {
-                assert(s->mv_type == MV_TYPE_FIELD);
-                assert(!s->frame_pred_frame_dct);
+                av_assert2(s->mv_type == MV_TYPE_FIELD);
+                av_assert2(!s->frame_pred_frame_dct);
                 if (cbp) {                      // With coded bloc pattern
                     if (s->dquant) {
                         if (s->mv_dir == MV_DIR_FORWARD)
@@ -981,12 +1031,12 @@ av_cold void ff_mpeg1_encode_init(MpegEncContext *s)
 
             bits = ff_mpeg12_vlc_dc_lum_bits[index] + index;
             code = (ff_mpeg12_vlc_dc_lum_code[index] << index) +
-                   (diff & ((1 << index) - 1));
+                    av_mod_uintp2(diff, index);
             mpeg1_lum_dc_uni[i + 255] = bits + (code << 8);
 
             bits = ff_mpeg12_vlc_dc_chroma_bits[index] + index;
             code = (ff_mpeg12_vlc_dc_chroma_code[index] << index) +
-                   (diff & ((1 << index) - 1));
+                    av_mod_uintp2(diff, index);
             mpeg1_chr_dc_uni[i + 255] = bits + (code << 8);
         }
 
@@ -1045,6 +1095,8 @@ av_cold void ff_mpeg1_encode_init(MpegEncContext *s)
 #define OFFSET(x) offsetof(MpegEncContext, x)
 #define VE AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
 #define COMMON_OPTS                                                           \
+    { "gop_timecode",        "MPEG GOP Timecode in hh:mm:ss[:;.]ff format",   \
+      OFFSET(tc_opt_str), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX, VE },\
     { "intra_vlc",           "Use MPEG-2 intra VLC table.",                   \
       OFFSET(intra_vlc_format),    AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE }, \
     { "drop_frame_timecode", "Timecode is in drop frame format.",             \
@@ -1062,6 +1114,10 @@ static const AVOption mpeg2_options[] = {
     COMMON_OPTS
     { "non_linear_quant", "Use nonlinear quantizer.",    OFFSET(q_scale_type),   AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
     { "alternate_scan",   "Enable alternate scantable.", OFFSET(alternate_scan), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "seq_disp_ext",     "Write sequence_display_extension blocks.", OFFSET(seq_disp_ext), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1, VE, "seq_disp_ext" },
+    {     "auto",   NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = -1},  0, 0, VE, "seq_disp_ext" },
+    {     "never",  NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = 0 },  0, 0, VE, "seq_disp_ext" },
+    {     "always", NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = 1 },  0, 0, VE, "seq_disp_ext" },
     FF_MPV_COMMON_OPTS
     { NULL },
 };
@@ -1102,7 +1158,7 @@ AVCodec ff_mpeg2video_encoder = {
     .init                 = encode_init,
     .encode2              = ff_mpv_encode_picture,
     .close                = ff_mpv_encode_end,
-    .supported_framerates = ff_mpeg12_frame_rate_tab + 1,
+    .supported_framerates = ff_mpeg2_frame_rate_tab,
     .pix_fmts             = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV420P,
                                                            AV_PIX_FMT_YUV422P,
                                                            AV_PIX_FMT_NONE },
diff --git a/libavcodec/mpeg12vlc.h b/libavcodec/mpeg12vlc.h
index f15bebf9b5..31888fdd7f 100644
--- a/libavcodec/mpeg12vlc.h
+++ b/libavcodec/mpeg12vlc.h
@@ -3,20 +3,20 @@
  * copyright (c) 2000,2001 Fabrice Bellard
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg4_unpack_bframes_bsf.c b/libavcodec/mpeg4_unpack_bframes_bsf.c
new file mode 100644
index 0000000000..df49d3f407
--- /dev/null
+++ b/libavcodec/mpeg4_unpack_bframes_bsf.c
@@ -0,0 +1,194 @@
+/*
+ * Bitstream filter for unpacking DivX-style packed B-frames in MPEG-4 (divx_packed)
+ * Copyright (c) 2015 Andreas Cadhalpun <Andreas.Cadhalpun@googlemail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "mpeg4video.h"
+
+typedef struct UnpackBFramesBSFContext {
+    uint8_t *b_frame_buf;
+    int      b_frame_buf_size;
+    int      updated_extradata;
+} UnpackBFramesBSFContext;
+
+/* search next start code */
+static unsigned int find_startcode(const uint8_t *buf, int buf_size, int *pos)
+{
+    unsigned int startcode = 0xFF;
+
+    for (; *pos < buf_size;) {
+        startcode = ((startcode << 8) | buf[*pos]) & 0xFFFFFFFF;
+        *pos +=1;
+        if ((startcode & 0xFFFFFF00) != 0x100)
+            continue;  /* no startcode */
+        return startcode;
+    }
+
+    return 0;
+}
+
+/* determine the position of the packed marker in the userdata,
+ * the number of VOPs and the position of the second VOP */
+static void scan_buffer(const uint8_t *buf, int buf_size,
+                        int *pos_p, int *nb_vop, int *pos_vop2) {
+    unsigned int startcode;
+    int pos, i;
+
+    for (pos = 0; pos < buf_size;) {
+        startcode = find_startcode(buf, buf_size, &pos);
+
+        if (startcode == USER_DATA_STARTCODE && pos_p) {
+            /* check if the (DivX) userdata string ends with 'p' (packed) */
+            for (i = 0; i < 255 && pos + i + 1 < buf_size; i++) {
+                if (buf[pos + i] == 'p' && buf[pos + i + 1] == '\0') {
+                    *pos_p = pos + i;
+                    break;
+                }
+            }
+        } else if (startcode == VOP_STARTCODE && nb_vop) {
+            *nb_vop += 1;
+            if (*nb_vop == 2 && pos_vop2) {
+                *pos_vop2 = pos - 4; /* subtract 4 bytes startcode */
+            }
+        }
+    }
+}
+
+/* allocate new buffer and copy size bytes from src */
+static uint8_t *create_new_buffer(const uint8_t *src, int size) {
+    uint8_t *dst = av_malloc(size + AV_INPUT_BUFFER_PADDING_SIZE);
+
+    if (dst) {
+        memcpy(dst, src, size);
+        memset(dst + size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+    }
+
+    return dst;
+}
+
+static int mpeg4_unpack_bframes_filter(AVBitStreamFilterContext *bsfc,
+                                       AVCodecContext *avctx, const char *args,
+                                       uint8_t  **poutbuf, int *poutbuf_size,
+                                       const uint8_t *buf, int      buf_size,
+                                       int keyframe)
+{
+    UnpackBFramesBSFContext *ctx = bsfc->priv_data;
+    int pos_p = -1, nb_vop = 0, pos_vop2 = -1, ret = 0;
+
+    if (avctx->codec_id != AV_CODEC_ID_MPEG4) {
+        av_log(avctx, AV_LOG_ERROR,
+               "The mpeg4_unpack_bframes bitstream filter is only useful for mpeg4.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (!ctx->updated_extradata && avctx->extradata) {
+        int pos_p_ext = -1;
+        scan_buffer(avctx->extradata, avctx->extradata_size, &pos_p_ext, NULL, NULL);
+        if (pos_p_ext >= 0) {
+            av_log(avctx, AV_LOG_DEBUG,
+                   "Updating DivX userdata (remove trailing 'p') in extradata.\n");
+            avctx->extradata[pos_p_ext] = '\0';
+        }
+        ctx->updated_extradata = 1;
+    }
+
+    scan_buffer(buf, buf_size, &pos_p, &nb_vop, &pos_vop2);
+    av_log(avctx, AV_LOG_DEBUG, "Found %d VOP startcode(s) in this packet.\n", nb_vop);
+
+    if (pos_vop2 >= 0) {
+        if (ctx->b_frame_buf) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Missing one N-VOP packet, discarding one B-frame.\n");
+            av_freep(&ctx->b_frame_buf);
+            ctx->b_frame_buf_size = 0;
+        }
+        /* store the packed B-frame in the BSFContext */
+        ctx->b_frame_buf_size = buf_size - pos_vop2;
+        ctx->b_frame_buf      = create_new_buffer(buf + pos_vop2, ctx->b_frame_buf_size);
+        if (!ctx->b_frame_buf) {
+            ctx->b_frame_buf_size = 0;
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    if (nb_vop > 2) {
+        av_log(avctx, AV_LOG_WARNING,
+       "Found %d VOP headers in one packet, only unpacking one.\n", nb_vop);
+    }
+
+    if (nb_vop == 1 && ctx->b_frame_buf) {
+        /* use frame from BSFContext */
+        *poutbuf      = ctx->b_frame_buf;
+        *poutbuf_size = ctx->b_frame_buf_size;
+        /* the output buffer is distinct from the input buffer */
+        ret = 1;
+        if (buf_size <= MAX_NVOP_SIZE) {
+            /* N-VOP */
+            av_log(avctx, AV_LOG_DEBUG, "Skipping N-VOP.\n");
+            ctx->b_frame_buf      = NULL;
+            ctx->b_frame_buf_size = 0;
+        } else {
+            /* copy packet into BSFContext */
+            ctx->b_frame_buf_size = buf_size;
+            ctx->b_frame_buf      = create_new_buffer(buf , buf_size);
+            if (!ctx->b_frame_buf) {
+                ctx->b_frame_buf_size = 0;
+                av_freep(poutbuf);
+                *poutbuf_size = 0;
+                return AVERROR(ENOMEM);
+            }
+        }
+    } else if (nb_vop >= 2) {
+        /* use first frame of the packet */
+        *poutbuf      = (uint8_t *) buf;
+        *poutbuf_size = pos_vop2;
+    } else if (pos_p >= 0) {
+        av_log(avctx, AV_LOG_DEBUG, "Updating DivX userdata (remove trailing 'p').\n");
+        *poutbuf_size = buf_size;
+        *poutbuf      = create_new_buffer(buf, buf_size);
+        if (!*poutbuf) {
+            *poutbuf_size = 0;
+            return AVERROR(ENOMEM);
+        }
+        /* remove 'p' (packed) from the end of the (DivX) userdata string */
+        (*poutbuf)[pos_p] = '\0';
+        /* the output buffer is distinct from the input buffer */
+        ret = 1;
+    } else {
+        /* copy packet */
+        *poutbuf      = (uint8_t *) buf;
+        *poutbuf_size = buf_size;
+    }
+
+    return ret;
+}
+
+static void mpeg4_unpack_bframes_close(AVBitStreamFilterContext *bsfc)
+{
+    UnpackBFramesBSFContext *ctx = bsfc->priv_data;
+    av_freep(&ctx->b_frame_buf);
+}
+
+AVBitStreamFilter ff_mpeg4_unpack_bframes_bsf = {
+    .name           = "mpeg4_unpack_bframes",
+    .priv_data_size = sizeof(UnpackBFramesBSFContext),
+    .filter         = mpeg4_unpack_bframes_filter,
+    .close          = mpeg4_unpack_bframes_close
+};
diff --git a/libavcodec/mpeg4audio.c b/libavcodec/mpeg4audio.c
index 2363cb637d..188d843eee 100644
--- a/libavcodec/mpeg4audio.c
+++ b/libavcodec/mpeg4audio.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2008 Baptiste Coudurier <baptiste.coudurier@free.fr>
  * Copyright (c) 2009 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,6 +52,8 @@ static int parse_config_ALS(GetBitContext *gb, MPEG4AudioConfig *c)
     return 0;
 }
 
+/* XXX: make sure to update the copies in the different encoders if you change
+ * this table */
 const int avpriv_mpeg4audio_sample_rates[16] = {
     96000, 88200, 64000, 48000, 44100, 32000,
     24000, 22050, 16000, 12000, 11025, 8000, 7350
@@ -82,9 +84,13 @@ int avpriv_mpeg4audio_get_config(MPEG4AudioConfig *c, const uint8_t *buf,
     GetBitContext gb;
     int specific_config_bitindex, ret;
 
+    if (bit_size <= 0)
+        return AVERROR_INVALIDDATA;
+
     ret = init_get_bits(&gb, buf, bit_size);
     if (ret < 0)
         return ret;
+
     c->object_type = get_object_type(&gb);
     c->sample_rate = get_sample_rate(&gb, &c->sampling_index);
     c->chan_config = get_bits(&gb, 4);
@@ -125,8 +131,11 @@ int avpriv_mpeg4audio_get_config(MPEG4AudioConfig *c, const uint8_t *buf,
             if (show_bits(&gb, 11) == 0x2b7) { // sync extension
                 get_bits(&gb, 11);
                 c->ext_object_type = get_object_type(&gb);
-                if (c->ext_object_type == AOT_SBR && (c->sbr = get_bits1(&gb)) == 1)
+                if (c->ext_object_type == AOT_SBR && (c->sbr = get_bits1(&gb)) == 1) {
                     c->ext_sample_rate = get_sample_rate(&gb, &c->ext_sampling_index);
+                    if (c->ext_sample_rate == c->sample_rate)
+                        c->sbr = -1;
+                }
                 if (get_bits_left(&gb) > 11 && get_bits(&gb, 11) == 0x548)
                     c->ps = get_bits1(&gb);
                 break;
diff --git a/libavcodec/mpeg4audio.h b/libavcodec/mpeg4audio.h
index 2eef2205bd..8239081747 100644
--- a/libavcodec/mpeg4audio.h
+++ b/libavcodec/mpeg4audio.h
@@ -2,20 +2,20 @@
  * MPEG-4 Audio common header
  * Copyright (c) 2008 Baptiste Coudurier <baptiste.coudurier@free.fr>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -102,7 +102,7 @@ enum AudioObjectType {
     AOT_USAC,                  ///< N                       Unified Speech and Audio Coding
 };
 
-#define MAX_PCE_SIZE 304 ///<Maximum size of a PCE including the 3-bit ID_PCE
+#define MAX_PCE_SIZE 320 ///<Maximum size of a PCE including the 3-bit ID_PCE
                          ///<marker and the comment
 
 int avpriv_copy_pce_data(PutBitContext *pb, GetBitContext *gb);
diff --git a/libavcodec/mpeg4data.h b/libavcodec/mpeg4data.h
index 87bb5391b1..1ac5840e03 100644
--- a/libavcodec/mpeg4data.h
+++ b/libavcodec/mpeg4data.h
@@ -3,20 +3,20 @@
  * H263+ support
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg4video.c b/libavcodec/mpeg4video.c
index e37aafab38..d7c6fe7ecc 100644
--- a/libavcodec/mpeg4video.c
+++ b/libavcodec/mpeg4video.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg4video.h b/libavcodec/mpeg4video.h
index 252029cd3a..49bc13f87a 100644
--- a/libavcodec/mpeg4video.h
+++ b/libavcodec/mpeg4video.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,6 +59,9 @@
 #define VISUAL_OBJ_STARTCODE 0x1B5
 #define VOP_STARTCODE        0x1B6
 
+/* smaller packets likely don't contain a real frame */
+#define MAX_NVOP_SIZE 19
+
 typedef struct Mpeg4DecContext {
     MpegEncContext m;
 
@@ -84,6 +87,7 @@ typedef struct Mpeg4DecContext {
     int enhancement_type;
     int scalability;
     int use_intra_dc_vlc;
+
     /// QP above which the ac VLC should be used for intra dc
     int intra_dc_threshold;
 
@@ -92,6 +96,7 @@ typedef struct Mpeg4DecContext {
     int divx_build;
     int xvid_build;
     int lavc_build;
+
     /// flag for having shown the warning about invalid Divx B-frames
     int showed_packed_warning;
     /** does the stream contain the low_delay flag,
@@ -148,6 +153,8 @@ int ff_mpeg4_decode_partitions(Mpeg4DecContext *ctx);
 int ff_mpeg4_get_video_packet_prefix_length(MpegEncContext *s);
 int ff_mpeg4_decode_video_packet_header(Mpeg4DecContext *ctx);
 void ff_mpeg4_init_direct_mv(MpegEncContext *s);
+void ff_mpeg4videodec_static_init(void);
+int ff_mpeg4_workaround_bugs(AVCodecContext *avctx);
 int ff_mpeg4_frame_end(AVCodecContext *avctx, const uint8_t *buf, int buf_size);
 
 /**
@@ -223,21 +230,21 @@ static inline int ff_mpeg4_pred_dc(MpegEncContext *s, int n, int level,
     } else {
         level += pred;
         ret    = level;
-        if (s->avctx->err_recognition & AV_EF_BITSTREAM) {
+    }
+    level *= scale;
+    if (level & (~2047)) {
+        if (!s->encoding && (s->avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_AGGRESSIVE))) {
             if (level < 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "dc<0 at %dx%d\n", s->mb_x, s->mb_y);
                 return -1;
             }
-            if (level * scale > 2048 + scale) {
+            if (level > 2048 + scale) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "dc overflow at %dx%d\n", s->mb_x, s->mb_y);
                 return -1;
             }
         }
-    }
-    level *= scale;
-    if (level & (~2047)) {
         if (level < 0)
             level = 0;
         else if (!(s->workaround_bugs & FF_BUG_DC_CLIP))
diff --git a/libavcodec/mpeg4video_parser.c b/libavcodec/mpeg4video_parser.c
index 246bb9ca4e..aa5e87a544 100644
--- a/libavcodec/mpeg4video_parser.c
+++ b/libavcodec/mpeg4video_parser.c
@@ -3,23 +3,25 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "internal.h"
 #include "parser.h"
 #include "mpegvideo.h"
@@ -86,6 +88,8 @@ static int mpeg4_decode_header(AVCodecParserContext *s1, AVCodecContext *avctx,
     if (avctx->extradata_size && pc->first_picture) {
         init_get_bits(gb, avctx->extradata, avctx->extradata_size * 8);
         ret = ff_mpeg4_decode_picture_header(dec_ctx, gb);
+        if (ret < -1)
+            av_log(avctx, AV_LOG_WARNING, "Failed to parse extradata\n");
     }
 
     init_get_bits(gb, buf, 8 * buf_size);
@@ -96,6 +100,13 @@ static int mpeg4_decode_header(AVCodecParserContext *s1, AVCodecContext *avctx,
         if (ret < 0)
             return ret;
     }
+    if((s1->flags & PARSER_FLAG_USE_CODEC_TS) && s->avctx->time_base.den>0 && ret>=0){
+        av_assert1(s1->pts == AV_NOPTS_VALUE);
+        av_assert1(s1->dts == AV_NOPTS_VALUE);
+
+        s1->pts = av_rescale_q(s->time, (AVRational){1, s->avctx->time_base.den}, (AVRational){1, 1200000});
+    }
+
     s1->pict_type     = s->pict_type;
     pc->first_picture = 0;
     return ret;
@@ -105,8 +116,12 @@ static av_cold int mpeg4video_parse_init(AVCodecParserContext *s)
 {
     struct Mp4vParseContext *pc = s->priv_data;
 
+    ff_mpeg4videodec_static_init();
+
     pc->first_picture           = 1;
+    pc->dec_ctx.m.quant_precision     = 5;
     pc->dec_ctx.m.slice_context_count = 1;
+    pc->dec_ctx.showed_packed_warning = 1;
     return 0;
 }
 
diff --git a/libavcodec/mpeg4video_parser.h b/libavcodec/mpeg4video_parser.h
index 0f56e7fd9e..50f8b44403 100644
--- a/libavcodec/mpeg4video_parser.h
+++ b/libavcodec/mpeg4video_parser.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
index b09aeb2b74..5c388c4044 100644
--- a/libavcodec/mpeg4videodec.c
+++ b/libavcodec/mpeg4videodec.c
@@ -3,23 +3,27 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
+#include "libavutil/internal.h"
+#include "libavutil/opt.h"
 #include "error_resilience.h"
 #include "idctdsp.h"
 #include "internal.h"
@@ -110,12 +114,13 @@ void ff_mpeg4_pred_ac(MpegEncContext *s, int16_t *block, int n, int dir)
  * check if the next stuff is a resync marker or the end.
  * @return 0 if not
  */
-static inline int mpeg4_is_resync(MpegEncContext *s)
+static inline int mpeg4_is_resync(Mpeg4DecContext *ctx)
 {
+    MpegEncContext *s = &ctx->m;
     int bits_count = get_bits_count(&s->gb);
     int v          = show_bits(&s->gb, 16);
 
-    if (s->workaround_bugs & FF_BUG_NO_PADDING)
+    if (s->workaround_bugs & FF_BUG_NO_PADDING && !ctx->resync_marker)
         return 0;
 
     while (v <= 0xFF) {
@@ -132,10 +137,11 @@ static inline int mpeg4_is_resync(MpegEncContext *s)
         v  |= 0x7F >> (7 - (bits_count & 7));
 
         if (v == 0x7F)
-            return 1;
+            return s->mb_num;
     } else {
         if (v == ff_mpeg4_resync_prefix[bits_count & 7]) {
-            int len;
+            int len, mb_num;
+            int mb_num_bits = av_log2(s->mb_num - 1) + 1;
             GetBitContext gb = s->gb;
 
             skip_bits(&s->gb, 1);
@@ -145,10 +151,14 @@ static inline int mpeg4_is_resync(MpegEncContext *s)
                 if (get_bits1(&s->gb))
                     break;
 
+            mb_num = get_bits(&s->gb, mb_num_bits);
+            if (!mb_num || mb_num > s->mb_num || get_bits_count(&s->gb)+6 > s->gb.size_in_bits)
+                mb_num= -1;
+
             s->gb = gb;
 
             if (len >= ff_mpeg4_get_video_packet_prefix_length(s))
-                return 1;
+                return mb_num;
         }
     }
     return 0;
@@ -181,17 +191,17 @@ static int mpeg4_decode_sprite_trajectory(Mpeg4DecContext *ctx, GetBitContext *g
         int x = 0, y = 0;
 
         length = get_vlc2(gb, sprite_trajectory.table, SPRITE_TRAJ_VLC_BITS, 3);
-        if (length)
+        if (length > 0)
             x = get_xbits(gb, length);
 
         if (!(ctx->divx_version == 500 && ctx->divx_build == 413))
-            skip_bits1(gb);     /* marker bit */
+            check_marker(gb, "before sprite_trajectory");
 
         length = get_vlc2(gb, sprite_trajectory.table, SPRITE_TRAJ_VLC_BITS, 3);
-        if (length)
+        if (length > 0)
             y = get_xbits(gb, length);
 
-        skip_bits1(gb);         /* marker bit */
+        check_marker(gb, "after sprite_trajectory");
         ctx->sprite_traj[i][0] = d[i][0] = x;
         ctx->sprite_traj[i][1] = d[i][1] = y;
     }
@@ -369,6 +379,17 @@ static int mpeg4_decode_sprite_trajectory(Mpeg4DecContext *ctx, GetBitContext *g
     return 0;
 }
 
+static int decode_new_pred(Mpeg4DecContext *ctx, GetBitContext *gb) {
+    int len = FFMIN(ctx->time_increment_bits + 3, 15);
+
+    get_bits(gb, len);
+    if (get_bits1(gb))
+        get_bits(gb, len);
+    check_marker(gb, "after new_pred");
+
+    return 0;
+}
+
 /**
  * Decode the next video packet.
  * @return <0 if something went wrong
@@ -404,19 +425,6 @@ int ff_mpeg4_decode_video_packet_header(Mpeg4DecContext *ctx)
                "illegal mb_num in video packet (%d %d) \n", mb_num, s->mb_num);
         return -1;
     }
-    if (s->pict_type == AV_PICTURE_TYPE_B) {
-        int mb_x = 0, mb_y = 0;
-
-        while (s->next_picture.mbskip_table[s->mb_index2xy[mb_num]]) {
-            if (!mb_x)
-                ff_thread_await_progress(&s->next_picture_ptr->tf, mb_y++, 0);
-            mb_num++;
-            if (++mb_x == s->mb_width)
-                mb_x = 0;
-        }
-        if (mb_num >= s->mb_num)
-            return -1;  // slice contains just skipped MBs (already decoded)
-    }
 
     s->mb_x = mb_num % s->mb_width;
     s->mb_y = mb_num / s->mb_width;
@@ -469,7 +477,8 @@ int ff_mpeg4_decode_video_packet_header(Mpeg4DecContext *ctx)
             }
         }
     }
-    // FIXME new-pred stuff
+    if (ctx->new_pred)
+        decode_new_pred(ctx, &s->gb);
 
     return 0;
 }
@@ -564,7 +573,7 @@ static inline int mpeg4_decode_dc(MpegEncContext *s, int n, int *dir_ptr)
 
         if (code > 8) {
             if (get_bits1(&s->gb) == 0) { /* marker */
-                if (s->avctx->err_recognition & AV_EF_BITSTREAM) {
+                if (s->avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT)) {
                     av_log(s->avctx, AV_LOG_ERROR, "dc marker bit missing\n");
                     return -1;
                 }
@@ -609,7 +618,7 @@ static int mpeg4_decode_partition_a(Mpeg4DecContext *ctx)
                     cbpc = get_vlc2(&s->gb, ff_h263_intra_MCBPC_vlc.table, INTRA_MCBPC_VLC_BITS, 2);
                     if (cbpc < 0) {
                         av_log(s->avctx, AV_LOG_ERROR,
-                               "cbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
+                               "mcbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
                         return -1;
                     }
                 } while (cbpc == 8);
@@ -681,7 +690,7 @@ try_again:
                 cbpc = get_vlc2(&s->gb, ff_h263_inter_MCBPC_vlc.table, INTER_MCBPC_VLC_BITS, 2);
                 if (cbpc < 0) {
                     av_log(s->avctx, AV_LOG_ERROR,
-                           "cbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
+                           "mcbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
                     return -1;
                 }
                 if (cbpc == 20)
@@ -933,7 +942,8 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                                      int n, int coded, int intra, int rvlc)
 {
     MpegEncContext *s = &ctx->m;
-    int level, i, last, run, qmul, qadd, dc_pred_dir;
+    int level, i, last, run, qmul, qadd;
+    int av_uninit(dc_pred_dir);
     RLTable *rl;
     RL_VLC_ELEM *rl_vlc;
     const uint8_t *scan_table;
@@ -1078,7 +1088,8 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                                 if (SHOW_UBITS(re, &s->gb, 1) == 0) {
                                     av_log(s->avctx, AV_LOG_ERROR,
                                            "1. marker bit missing in 3. esc\n");
-                                    return -1;
+                                    if (!(s->avctx->err_recognition & AV_EF_IGNORE_ERR))
+                                        return -1;
                                 }
                                 SKIP_CACHE(re, &s->gb, 1);
 
@@ -1088,19 +1099,42 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                                 if (SHOW_UBITS(re, &s->gb, 1) == 0) {
                                     av_log(s->avctx, AV_LOG_ERROR,
                                            "2. marker bit missing in 3. esc\n");
-                                    return -1;
+                                    if (!(s->avctx->err_recognition & AV_EF_IGNORE_ERR))
+                                        return -1;
                                 }
 
                                 SKIP_COUNTER(re, &s->gb, 1 + 12 + 1);
                             }
 
+#if 0
+                            if (s->error_recognition >= FF_ER_COMPLIANT) {
+                                const int abs_level= FFABS(level);
+                                if (abs_level<=MAX_LEVEL && run<=MAX_RUN) {
+                                    const int run1= run - rl->max_run[last][abs_level] - 1;
+                                    if (abs_level <= rl->max_level[last][run]) {
+                                        av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, vlc encoding possible\n");
+                                        return -1;
+                                    }
+                                    if (s->error_recognition > FF_ER_COMPLIANT) {
+                                        if (abs_level <= rl->max_level[last][run]*2) {
+                                            av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, esc 1 encoding possible\n");
+                                            return -1;
+                                        }
+                                        if (run1 >= 0 && abs_level <= rl->max_level[last][run1]) {
+                                            av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, esc 2 encoding possible\n");
+                                            return -1;
+                                        }
+                                    }
+                                }
+                            }
+#endif
                             if (level > 0)
                                 level = level * qmul + qadd;
                             else
                                 level = level * qmul - qadd;
 
                             if ((unsigned)(level + 2048) > 4095) {
-                                if (s->avctx->err_recognition & AV_EF_BITSTREAM) {
+                                if (s->avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_AGGRESSIVE)) {
                                     if (level > 2560 || level < -2560) {
                                         av_log(s->avctx, AV_LOG_ERROR,
                                                "|level| overflow in 3. esc, qp=%d\n",
@@ -1137,6 +1171,7 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
                 LAST_SKIP_BITS(re, &s->gb, 1);
             }
+            ff_tlog(s->avctx, "dct[%d][%d] = %- 4d end?:%d\n", scan_table[i&63]&7, scan_table[i&63] >> 3, level, i>62);
             if (i > 62) {
                 i -= 192;
                 if (i & (~63)) {
@@ -1245,12 +1280,12 @@ static int mpeg4_decode_partitioned_mb(MpegEncContext *s, int16_t block[6][64])
 
     /* per-MB end of slice check */
     if (--s->mb_num_left <= 0) {
-        if (mpeg4_is_resync(s))
+        if (mpeg4_is_resync(ctx))
             return SLICE_END;
         else
             return SLICE_NOEND;
     } else {
-        if (mpeg4_is_resync(s)) {
+        if (mpeg4_is_resync(ctx)) {
             const int delta = s->mb_x + 1 == s->mb_width ? 2 : 1;
             if (s->cbp_table[xy + delta])
                 return SLICE_END;
@@ -1264,10 +1299,10 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
     Mpeg4DecContext *ctx = (Mpeg4DecContext *)s;
     int cbpc, cbpy, i, cbp, pred_x, pred_y, mx, my, dquant;
     int16_t *mot_val;
-    static int8_t quant_tab[4] = { -1, -2, 1, 2 };
+    static const int8_t quant_tab[4] = { -1, -2, 1, 2 };
     const int xy = s->mb_x + s->mb_y * s->mb_stride;
 
-    assert(s->h263_pred);
+    av_assert2(s->h263_pred);
 
     if (s->pict_type == AV_PICTURE_TYPE_P ||
         s->pict_type == AV_PICTURE_TYPE_S) {
@@ -1303,7 +1338,7 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
             cbpc = get_vlc2(&s->gb, ff_h263_inter_MCBPC_vlc.table, INTER_MCBPC_VLC_BITS, 2);
             if (cbpc < 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
-                       "cbpc damaged at %d %d\n", s->mb_x, s->mb_y);
+                       "mcbpc damaged at %d %d\n", s->mb_x, s->mb_y);
                 return -1;
             }
         } while (cbpc == 20);
@@ -1320,6 +1355,11 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
         else
             s->mcsel = 0;
         cbpy = get_vlc2(&s->gb, ff_h263_cbpy_vlc.table, CBPY_VLC_BITS, 1) ^ 0x0F;
+        if (cbpy < 0) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "P cbpy damaged at %d %d\n", s->mb_x, s->mb_y);
+            return AVERROR_INVALIDDATA;
+        }
 
         cbp = (cbpc & 3) | (cbpy << 2);
         if (dquant)
@@ -1609,20 +1649,23 @@ intra:
 end:
     /* per-MB end of slice check */
     if (s->codec_id == AV_CODEC_ID_MPEG4) {
-        if (mpeg4_is_resync(s)) {
-            const int delta = s->mb_x + 1 == s->mb_width ? 2 : 1;
+        int next = mpeg4_is_resync(ctx);
+        if (next) {
+            if        (s->mb_x + s->mb_y*s->mb_width + 1 >  next && (s->avctx->err_recognition & AV_EF_AGGRESSIVE)) {
+                return -1;
+            } else if (s->mb_x + s->mb_y*s->mb_width + 1 >= next)
+                return SLICE_END;
 
-            if (s->pict_type == AV_PICTURE_TYPE_B &&
-                s->next_picture.mbskip_table[xy + delta]) {
+            if (s->pict_type == AV_PICTURE_TYPE_B) {
+                const int delta= s->mb_x + 1 == s->mb_width ? 2 : 1;
                 ff_thread_await_progress(&s->next_picture_ptr->tf,
                                          (s->mb_x + delta >= s->mb_width)
                                          ? FFMIN(s->mb_y + 1, s->mb_height - 1)
                                          : s->mb_y, 0);
+                if (s->next_picture.mbskip_table[xy + delta])
+                    return SLICE_OK;
             }
 
-            if (s->pict_type == AV_PICTURE_TYPE_B &&
-                s->next_picture.mbskip_table[xy + delta])
-                return SLICE_OK;
             return SLICE_END;
         }
     }
@@ -1633,29 +1676,30 @@ end:
 static int mpeg4_decode_gop_header(MpegEncContext *s, GetBitContext *gb)
 {
     int hours, minutes, seconds;
-    unsigned time_code = show_bits(gb, 18);
-
-    if (time_code & 0x40) {     /* marker_bit */
-        hours   = time_code >> 13;
-        minutes = time_code >> 7 & 0x3f;
-        seconds = time_code & 0x3f;
-        s->time_base = seconds + 60 * (minutes + 60 * hours);
-        skip_bits(gb, 20);      /* time_code, closed_gov, broken_link */
-    } else {
-        av_log(s->avctx, AV_LOG_WARNING, "GOP header missing marker_bit\n");
+
+    if (!show_bits(gb, 23)) {
+        av_log(s->avctx, AV_LOG_WARNING, "GOP header invalid\n");
+        return -1;
     }
 
+    hours   = get_bits(gb, 5);
+    minutes = get_bits(gb, 6);
+    check_marker(gb, "in gop_header");
+    seconds = get_bits(gb, 6);
+
+    s->time_base = seconds + 60*(minutes + 60*hours);
+
+    skip_bits1(gb);
+    skip_bits1(gb);
+
     return 0;
 }
 
 static int mpeg4_decode_profile_level(MpegEncContext *s, GetBitContext *gb)
 {
-    int profile_and_level_indication;
-
-    profile_and_level_indication = get_bits(gb, 8);
 
-    s->avctx->profile = (profile_and_level_indication & 0xf0) >> 4;
-    s->avctx->level   = (profile_and_level_indication & 0x0f);
+    s->avctx->profile = get_bits(gb, 4);
+    s->avctx->level   = get_bits(gb, 4);
 
     // for Simple profile, level 0
     if (s->avctx->profile == 0 && s->avctx->level == 8) {
@@ -1695,20 +1739,20 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         s->low_delay = get_bits1(gb);
         if (get_bits1(gb)) {    /* vbv parameters */
             get_bits(gb, 15);   /* first_half_bitrate */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after first_half_bitrate");
             get_bits(gb, 15);   /* latter_half_bitrate */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after latter_half_bitrate");
             get_bits(gb, 15);   /* first_half_vbv_buffer_size */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after first_half_vbv_buffer_size");
             get_bits(gb, 3);    /* latter_half_vbv_buffer_size */
             get_bits(gb, 11);   /* first_half_vbv_occupancy */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after first_half_vbv_occupancy");
             get_bits(gb, 15);   /* latter_half_vbv_occupancy */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after latter_half_vbv_occupancy");
         }
     } else {
         /* is setting low delay flag only once the smartest thing to do?
-         * low delay detection won't be overriden. */
+         * low delay detection won't be overridden. */
         if (s->picture_number == 0)
             s->low_delay = 0;
     }
@@ -1726,7 +1770,7 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     s->avctx->framerate.num = get_bits(gb, 16);
     if (!s->avctx->framerate.num) {
         av_log(s->avctx, AV_LOG_ERROR, "framerate==0\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     ctx->time_increment_bits = av_log2(s->avctx->framerate.num - 1) + 1;
@@ -1740,15 +1784,17 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     else
         s->avctx->framerate.den = 1;
 
+    s->avctx->time_base = av_inv_q(av_mul_q(s->avctx->framerate, (AVRational){s->avctx->ticks_per_frame, 1}));
+
     ctx->t_frame = 0;
 
     if (ctx->shape != BIN_ONLY_SHAPE) {
         if (ctx->shape == RECT_SHAPE) {
-            skip_bits1(gb);   /* marker */
+            check_marker(gb, "before width");
             width = get_bits(gb, 13);
-            skip_bits1(gb);   /* marker */
+            check_marker(gb, "before height");
             height = get_bits(gb, 13);
-            skip_bits1(gb);   /* marker */
+            check_marker(gb, "after height");
             if (width && height &&  /* they should be non zero but who knows */
                 !(s->width && s->codec_tag == AV_RL32("MP4S"))) {
                 if (s->width && s->height &&
@@ -1776,13 +1822,13 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             ctx->vol_sprite_usage == GMC_SPRITE) {
             if (ctx->vol_sprite_usage == STATIC_SPRITE) {
                 skip_bits(gb, 13); // sprite_width
-                skip_bits1(gb); /* marker */
+                check_marker(gb, "after sprite_width");
                 skip_bits(gb, 13); // sprite_height
-                skip_bits1(gb); /* marker */
+                check_marker(gb, "after sprite_height");
                 skip_bits(gb, 13); // sprite_left
-                skip_bits1(gb); /* marker */
+                check_marker(gb, "after sprite_left");
                 skip_bits(gb, 13); // sprite_top
-                skip_bits1(gb); /* marker */
+                check_marker(gb, "after sprite_top");
             }
             ctx->num_sprite_warping_points = get_bits(gb, 6);
             if (ctx->num_sprite_warping_points > 3) {
@@ -1790,7 +1836,7 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
                        "%d sprite_warping_points\n",
                        ctx->num_sprite_warping_points);
                 ctx->num_sprite_warping_points = 0;
-                return -1;
+                return AVERROR_INVALIDDATA;
             }
             s->sprite_warping_accuracy  = get_bits(gb, 2);
             ctx->sprite_brightness_change = get_bits1(gb);
@@ -1806,6 +1852,9 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             if (s->quant_precision != 5)
                 av_log(s->avctx, AV_LOG_ERROR,
                        "quant precision %d\n", s->quant_precision);
+            if (s->quant_precision<3 || s->quant_precision>9) {
+                s->quant_precision = 5;
+            }
         } else {
             s->quant_precision = 5;
         }
@@ -1881,6 +1930,11 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         else
             s->quarter_sample = 0;
 
+        if (get_bits_left(gb) < 4) {
+            av_log(s->avctx, AV_LOG_ERROR, "VOL Header truncated\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         if (!get_bits1(gb)) {
             int pos               = get_bits_count(gb);
             int estimation_method = get_bits(gb, 2);
@@ -1988,6 +2042,17 @@ no_cplx_est:
         }
     }
 
+    if (s->avctx->debug&FF_DEBUG_PICT_INFO) {
+        av_log(s->avctx, AV_LOG_DEBUG, "tb %d/%d, tincrbits:%d, qp_prec:%d, ps:%d,  %s%s%s%s\n",
+               s->avctx->framerate.den, s->avctx->framerate.num,
+               ctx->time_increment_bits,
+               s->quant_precision,
+               s->progressive_sequence,
+               ctx->scalability ? "scalability " :"" , s->quarter_sample ? "qpel " : "",
+               s->data_partitioning ? "partition " : "", ctx->rvlc ? "rvlc " : ""
+        );
+    }
+
     return 0;
 }
 
@@ -2019,11 +2084,6 @@ static int decode_user_data(Mpeg4DecContext *ctx, GetBitContext *gb)
         ctx->divx_version = ver;
         ctx->divx_build   = build;
         s->divx_packed  = e == 3 && last == 'p';
-        if (s->divx_packed && !ctx->showed_packed_warning) {
-            av_log(s->avctx, AV_LOG_WARNING,
-                   "Invalid and inefficient vfw-avi packed B frames detected\n");
-            ctx->showed_packed_warning = 1;
-        }
     }
 
     /* libavcodec detection */
@@ -2047,6 +2107,14 @@ static int decode_user_data(Mpeg4DecContext *ctx, GetBitContext *gb)
     if (e == 1)
         ctx->xvid_build = build;
 
+    return 0;
+}
+
+int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
+{
+    Mpeg4DecContext *ctx = avctx->priv_data;
+    MpegEncContext *s = &ctx->m;
+
     if (ctx->xvid_build == -1 && ctx->divx_version == -1 && ctx->lavc_build == -1) {
         if (s->codec_tag        == AV_RL32("XVID") ||
             s->codec_tag        == AV_RL32("XVIX") ||
@@ -2066,8 +2134,89 @@ static int decode_user_data(Mpeg4DecContext *ctx, GetBitContext *gb)
         ctx->divx_build   = -1;
     }
 
-    if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0)
-        ff_xvid_idct_init(&s->idsp, s->avctx);
+    if (s->workaround_bugs & FF_BUG_AUTODETECT) {
+        if (s->codec_tag == AV_RL32("XVIX"))
+            s->workaround_bugs |= FF_BUG_XVID_ILACE;
+
+        if (s->codec_tag == AV_RL32("UMP4"))
+            s->workaround_bugs |= FF_BUG_UMP4;
+
+        if (ctx->divx_version >= 500 && ctx->divx_build < 1814)
+            s->workaround_bugs |= FF_BUG_QPEL_CHROMA;
+
+        if (ctx->divx_version > 502 && ctx->divx_build < 1814)
+            s->workaround_bugs |= FF_BUG_QPEL_CHROMA2;
+
+        if (ctx->xvid_build <= 3U)
+            s->padding_bug_score = 256 * 256 * 256 * 64;
+
+        if (ctx->xvid_build <= 1U)
+            s->workaround_bugs |= FF_BUG_QPEL_CHROMA;
+
+        if (ctx->xvid_build <= 12U)
+            s->workaround_bugs |= FF_BUG_EDGE;
+
+        if (ctx->xvid_build <= 32U)
+            s->workaround_bugs |= FF_BUG_DC_CLIP;
+
+#define SET_QPEL_FUNC(postfix1, postfix2)                           \
+    s->qdsp.put_        ## postfix1 = ff_put_        ## postfix2;   \
+    s->qdsp.put_no_rnd_ ## postfix1 = ff_put_no_rnd_ ## postfix2;   \
+    s->qdsp.avg_        ## postfix1 = ff_avg_        ## postfix2;
+
+        if (ctx->lavc_build < 4653U)
+            s->workaround_bugs |= FF_BUG_STD_QPEL;
+
+        if (ctx->lavc_build < 4655U)
+            s->workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE;
+
+        if (ctx->lavc_build < 4670U)
+            s->workaround_bugs |= FF_BUG_EDGE;
+
+        if (ctx->lavc_build <= 4712U)
+            s->workaround_bugs |= FF_BUG_DC_CLIP;
+
+        if (ctx->divx_version >= 0)
+            s->workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE;
+        if (ctx->divx_version == 501 && ctx->divx_build == 20020416)
+            s->padding_bug_score = 256 * 256 * 256 * 64;
+
+        if (ctx->divx_version < 500U)
+            s->workaround_bugs |= FF_BUG_EDGE;
+
+        if (ctx->divx_version >= 0)
+            s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
+    }
+
+    if (s->workaround_bugs & FF_BUG_STD_QPEL) {
+        SET_QPEL_FUNC(qpel_pixels_tab[0][5], qpel16_mc11_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][7], qpel16_mc31_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][9], qpel16_mc12_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_old_c)
+
+        SET_QPEL_FUNC(qpel_pixels_tab[1][5], qpel8_mc11_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][7], qpel8_mc31_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][9], qpel8_mc12_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_old_c)
+    }
+
+    if (avctx->debug & FF_DEBUG_BUGS)
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "bugs: %X lavc_build:%d xvid_build:%d divx_version:%d divx_build:%d %s\n",
+               s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
+               ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
+
+    if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
+        s->codec_id == AV_CODEC_ID_MPEG4 &&
+        avctx->idct_algo == FF_IDCT_AUTO) {
+        avctx->idct_algo = FF_IDCT_XVID;
+        ff_mpv_idct_init(s);
+        return 1;
+    }
 
     return 0;
 }
@@ -2076,6 +2225,7 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
 {
     MpegEncContext *s = &ctx->m;
     int time_incr, time_increment;
+    int64_t pts;
 
     s->pict_type = get_bits(gb, 2) + AV_PICTURE_TYPE_I;        /* pict type: I = 0 , P = 1 */
     if (s->pict_type == AV_PICTURE_TYPE_B && s->low_delay &&
@@ -2098,7 +2248,9 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
 
     if (ctx->time_increment_bits == 0 ||
         !(show_bits(gb, ctx->time_increment_bits + 1) & 1)) {
-        /* Headers seem incomplete; try to guess time_increment_bits. */
+        av_log(s->avctx, AV_LOG_WARNING,
+               "time_increment_bits %d is invalid in relation to the current bitstream, this is likely caused by a missing VOL header\n", ctx->time_increment_bits);
+
         for (ctx->time_increment_bits = 1;
              ctx->time_increment_bits < 16;
              ctx->time_increment_bits++) {
@@ -2110,6 +2262,13 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             } else if ((show_bits(gb, ctx->time_increment_bits + 5) & 0x1F) == 0x18)
                 break;
         }
+
+        av_log(s->avctx, AV_LOG_WARNING,
+               "time_increment_bits set to %d bits, based on bitstream analysis\n", ctx->time_increment_bits);
+        if (s->avctx->framerate.num && 4*s->avctx->framerate.num < 1<<ctx->time_increment_bits) {
+            s->avctx->framerate.num = 1<<ctx->time_increment_bits;
+            s->avctx->time_base = av_inv_q(av_mul_q(s->avctx->framerate, (AVRational){s->avctx->ticks_per_frame, 1}));
+        }
     }
 
     if (IS_3IV1)
@@ -2150,12 +2309,20 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
                             ROUNDED_DIV(s->last_non_b_time - s->pp_time, ctx->t_frame)) * 2;
         s->pb_field_time = (ROUNDED_DIV(s->time, ctx->t_frame) -
                             ROUNDED_DIV(s->last_non_b_time - s->pp_time, ctx->t_frame)) * 2;
-        if (!s->progressive_sequence) {
-            if (s->pp_field_time <= s->pb_field_time || s->pb_field_time <= 1)
+        if (s->pp_field_time <= s->pb_field_time || s->pb_field_time <= 1) {
+            s->pb_field_time = 2;
+            s->pp_field_time = 4;
+            if (!s->progressive_sequence)
                 return FRAME_SKIPPED;
         }
     }
 
+    if (s->avctx->framerate.den)
+        pts = ROUNDED_DIV(s->time, s->avctx->framerate.den);
+    else
+        pts = AV_NOPTS_VALUE;
+    ff_dlog(s->avctx, "MPEG4 PTS: %"PRId64"\n", pts);
+
     check_marker(gb, "before vop_coded");
 
     /* vop coded */
@@ -2164,6 +2331,9 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             av_log(s->avctx, AV_LOG_ERROR, "vop not coded\n");
         return FRAME_SKIPPED;
     }
+    if (ctx->new_pred)
+        decode_new_pred(ctx, gb);
+
     if (ctx->shape != BIN_ONLY_SHAPE &&
                     (s->pict_type == AV_PICTURE_TYPE_P ||
                      (s->pict_type == AV_PICTURE_TYPE_S &&
@@ -2178,11 +2348,11 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     if (ctx->shape != RECT_SHAPE) {
         if (ctx->vol_sprite_usage != 1 || s->pict_type != AV_PICTURE_TYPE_I) {
             skip_bits(gb, 13);  /* width */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after width");
             skip_bits(gb, 13);  /* height */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after height");
             skip_bits(gb, 13);  /* hor_spat_ref */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after hor_spat_ref");
             skip_bits(gb, 13);  /* ver_spat_ref */
         }
         skip_bits1(gb);         /* change_CR_disable */
@@ -2200,6 +2370,10 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         if (s->pict_type == AV_PICTURE_TYPE_B)
             skip_bits_long(gb, ctx->cplx_estimation_trash_b);
 
+        if (get_bits_left(gb) < 3) {
+            av_log(s->avctx, AV_LOG_ERROR, "Header truncated\n");
+            return AVERROR_INVALIDDATA;
+        }
         ctx->intra_dc_threshold = ff_mpeg4_dc_threshold[get_bits(gb, 3)];
         if (!s->progressive_sequence) {
             s->top_field_first = get_bits1(gb);
@@ -2237,7 +2411,7 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         if (s->qscale == 0) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "Error, header damaged or not MPEG4 header (qscale=0)\n");
-            return -1;  // makes no sense to continue, as there is nothing left from the image then
+            return AVERROR_INVALIDDATA;  // makes no sense to continue, as there is nothing left from the image then
         }
 
         if (s->pict_type != AV_PICTURE_TYPE_I) {
@@ -2245,29 +2419,39 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             if (s->f_code == 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "Error, header damaged or not MPEG4 header (f_code=0)\n");
-                return -1;  // makes no sense to continue, as there is nothing left from the image then
+                s->f_code = 1;
+                return AVERROR_INVALIDDATA;  // makes no sense to continue, as there is nothing left from the image then
             }
         } else
             s->f_code = 1;
 
         if (s->pict_type == AV_PICTURE_TYPE_B) {
             s->b_code = get_bits(gb, 3);
+            if (s->b_code == 0) {
+                av_log(s->avctx, AV_LOG_ERROR,
+                       "Error, header damaged or not MPEG4 header (b_code=0)\n");
+                s->b_code=1;
+                return AVERROR_INVALIDDATA; // makes no sense to continue, as the MV decoding will break very quickly
+            }
         } else
             s->b_code = 1;
 
         if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
             av_log(s->avctx, AV_LOG_DEBUG,
-                   "qp:%d fc:%d,%d %s size:%d pro:%d alt:%d top:%d %spel part:%d resync:%d w:%d a:%d rnd:%d vot:%d%s dc:%d ce:%d/%d/%d\n",
+                   "qp:%d fc:%d,%d %s size:%d pro:%d alt:%d top:%d %spel part:%d resync:%d w:%d a:%d rnd:%d vot:%d%s dc:%d ce:%d/%d/%d time:%"PRId64" tincr:%d\n",
                    s->qscale, s->f_code, s->b_code,
                    s->pict_type == AV_PICTURE_TYPE_I ? "I" : (s->pict_type == AV_PICTURE_TYPE_P ? "P" : (s->pict_type == AV_PICTURE_TYPE_B ? "B" : "S")),
-                   gb->size_in_bits, s->progressive_sequence, s->alternate_scan,
+                   gb->size_in_bits,s->progressive_sequence, s->alternate_scan,
                    s->top_field_first, s->quarter_sample ? "q" : "h",
                    s->data_partitioning, ctx->resync_marker,
                    ctx->num_sprite_warping_points, s->sprite_warping_accuracy,
                    1 - s->no_rounding, s->vo_type,
                    ctx->vol_control_parameters ? " VOLC" : " ", ctx->intra_dc_threshold,
                    ctx->cplx_estimation_trash_i, ctx->cplx_estimation_trash_p,
-                   ctx->cplx_estimation_trash_b);
+                   ctx->cplx_estimation_trash_b,
+                   s->time,
+                   time_increment
+                  );
         }
 
         if (!ctx->scalability) {
@@ -2316,6 +2500,7 @@ int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb)
 {
     MpegEncContext *s = &ctx->m;
     unsigned startcode, v;
+    int ret;
 
     /* search next start code */
     align_get_bits(gb);
@@ -2330,8 +2515,8 @@ int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     for (;;) {
         if (get_bits_count(gb) >= gb->size_in_bits) {
             if (gb->size_in_bits == 8 &&
-                (ctx->divx_version >= 0 || ctx->xvid_build >= 0)) {
-                av_log(s->avctx, AV_LOG_WARNING, "frame skip %d\n", gb->size_in_bits);
+                (ctx->divx_version >= 0 || ctx->xvid_build >= 0) || s->codec_tag == AV_RL32("QMP4")) {
+                av_log(s->avctx, AV_LOG_VERBOSE, "frame skip %d\n", gb->size_in_bits);
                 return FRAME_SKIPPED;  // divx bug
             } else
                 return -1;  // end of stream
@@ -2404,8 +2589,8 @@ int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         }
 
         if (startcode >= 0x120 && startcode <= 0x12F) {
-            if (decode_vol_header(ctx, gb) < 0)
-                return -1;
+            if ((ret = decode_vol_header(ctx, gb)) < 0)
+                return ret;
         } else if (startcode == USER_DATA_STARTCODE) {
             decode_user_data(ctx, gb);
         } else if (startcode == GOP_STARTCODE) {
@@ -2425,64 +2610,33 @@ end:
         s->low_delay = 1;
     s->avctx->has_b_frames = !s->low_delay;
 
-    if (s->workaround_bugs & FF_BUG_AUTODETECT) {
-        if (s->codec_tag == AV_RL32("XVIX"))
-            s->workaround_bugs |= FF_BUG_XVID_ILACE;
-
-        if (s->codec_tag == AV_RL32("UMP4"))
-            s->workaround_bugs |= FF_BUG_UMP4;
-
-        if (ctx->divx_version >= 500 && ctx->divx_build < 1814)
-            s->workaround_bugs |= FF_BUG_QPEL_CHROMA;
-
-        if (ctx->divx_version > 502 && ctx->divx_build < 1814)
-            s->workaround_bugs |= FF_BUG_QPEL_CHROMA2;
-
-        if (ctx->xvid_build <= 3U)
-            s->padding_bug_score = 256 * 256 * 256 * 64;
-
-        if (ctx->xvid_build <= 1U)
-            s->workaround_bugs |= FF_BUG_QPEL_CHROMA;
-
-        if (ctx->xvid_build <= 12U)
-            s->workaround_bugs |= FF_BUG_EDGE;
-
-        if (ctx->xvid_build <= 32U)
-            s->workaround_bugs |= FF_BUG_DC_CLIP;
-
-        if (ctx->lavc_build < 4653U)
-            s->workaround_bugs |= FF_BUG_STD_QPEL;
-
-        if (ctx->lavc_build < 4655U)
-            s->workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE;
-
-        if (ctx->lavc_build < 4670U)
-            s->workaround_bugs |= FF_BUG_EDGE;
-
-        if (ctx->lavc_build <= 4712U)
-            s->workaround_bugs |= FF_BUG_DC_CLIP;
-
-        if (ctx->divx_version >= 0)
-            s->workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE;
-
-        if (ctx->divx_version == 501 && ctx->divx_build == 20020416)
-            s->padding_bug_score = 256 * 256 * 256 * 64;
+    return decode_vop_header(ctx, gb);
+}
 
-        if (ctx->divx_version < 500U)
-            s->workaround_bugs |= FF_BUG_EDGE;
+av_cold void ff_mpeg4videodec_static_init(void) {
+    static int done = 0;
 
-        if (ctx->divx_version >= 0)
-            s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
+    if (!done) {
+        ff_rl_init(&ff_mpeg4_rl_intra, ff_mpeg4_static_rl_table_store[0]);
+        ff_rl_init(&ff_rvlc_rl_inter, ff_mpeg4_static_rl_table_store[1]);
+        ff_rl_init(&ff_rvlc_rl_intra, ff_mpeg4_static_rl_table_store[2]);
+        INIT_VLC_RL(ff_mpeg4_rl_intra, 554);
+        INIT_VLC_RL(ff_rvlc_rl_inter, 1072);
+        INIT_VLC_RL(ff_rvlc_rl_intra, 1072);
+        INIT_VLC_STATIC(&dc_lum, DC_VLC_BITS, 10 /* 13 */,
+                        &ff_mpeg4_DCtab_lum[0][1], 2, 1,
+                        &ff_mpeg4_DCtab_lum[0][0], 2, 1, 512);
+        INIT_VLC_STATIC(&dc_chrom, DC_VLC_BITS, 10 /* 13 */,
+                        &ff_mpeg4_DCtab_chrom[0][1], 2, 1,
+                        &ff_mpeg4_DCtab_chrom[0][0], 2, 1, 512);
+        INIT_VLC_STATIC(&sprite_trajectory, SPRITE_TRAJ_VLC_BITS, 15,
+                        &ff_sprite_trajectory_tab[0][1], 4, 2,
+                        &ff_sprite_trajectory_tab[0][0], 4, 2, 128);
+        INIT_VLC_STATIC(&mb_type_b_vlc, MB_TYPE_B_VLC_BITS, 4,
+                        &ff_mb_type_b_tab[0][1], 2, 1,
+                        &ff_mb_type_b_tab[0][0], 2, 1, 16);
+        done = 1;
     }
-
-
-    if (s->avctx->debug & FF_DEBUG_BUGS)
-        av_log(s->avctx, AV_LOG_DEBUG,
-               "bugs: %X lavc_build:%d xvid_build:%d divx_version:%d divx_build:%d %s\n",
-               s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
-               ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
-
-    return decode_vop_header(ctx, gb);
 }
 
 int ff_mpeg4_frame_end(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
@@ -2490,35 +2644,41 @@ int ff_mpeg4_frame_end(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
     Mpeg4DecContext *ctx = avctx->priv_data;
     MpegEncContext    *s = &ctx->m;
 
-    /* divx 5.01+ bistream reorder stuff */
+    /* divx 5.01+ bitstream reorder stuff */
+    /* Since this clobbers the input buffer and hwaccel codecs still need the
+     * data during hwaccel->end_frame we should not do this any earlier */
     if (s->divx_packed) {
-        int current_pos     = get_bits_count(&s->gb) >> 3;
+        int current_pos     = s->gb.buffer == s->bitstream_buffer ? 0 : (get_bits_count(&s->gb) >> 3);
         int startcode_found = 0;
 
-        if (buf_size - current_pos > 5) {
+        if (buf_size - current_pos > 7) {
+
             int i;
-            for (i = current_pos; i < buf_size - 3; i++)
+            for (i = current_pos; i < buf_size - 4; i++)
+
                 if (buf[i]     == 0 &&
                     buf[i + 1] == 0 &&
                     buf[i + 2] == 1 &&
                     buf[i + 3] == 0xB6) {
-                    startcode_found = 1;
+                    startcode_found = !(buf[i + 4] & 0x40);
                     break;
                 }
         }
-        if (s->gb.buffer == s->bitstream_buffer && buf_size > 7 &&
-            ctx->xvid_build >= 0) {       // xvid style
-            startcode_found = 1;
-            current_pos     = 0;
-        }
 
         if (startcode_found) {
-            av_fast_malloc(&s->bitstream_buffer,
+            if (!ctx->showed_packed_warning) {
+                av_log(s->avctx, AV_LOG_INFO, "Video uses a non-standard and "
+                       "wasteful way to store B-frames ('packed B-frames'). "
+                       "Consider using the mpeg4_unpack_bframes bitstream filter without encoding but stream copy to fix it.\n");
+                ctx->showed_packed_warning = 1;
+            }
+            av_fast_padded_malloc(&s->bitstream_buffer,
                            &s->allocated_bitstream_buffer_size,
-                           buf_size - current_pos +
-                           AV_INPUT_BUFFER_PADDING_SIZE);
-            if (!s->bitstream_buffer)
+                           buf_size - current_pos);
+            if (!s->bitstream_buffer) {
+                s->bitstream_buffer_size = 0;
                 return AVERROR(ENOMEM);
+            }
             memcpy(s->bitstream_buffer, buf + current_pos,
                    buf_size - current_pos);
             s->bitstream_buffer_size = buf_size - current_pos;
@@ -2528,6 +2688,7 @@ int ff_mpeg4_frame_end(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
     return 0;
 }
 
+#if HAVE_THREADS
 static int mpeg4_update_thread_context(AVCodecContext *dst,
                                        const AVCodecContext *src)
 {
@@ -2540,22 +2701,20 @@ static int mpeg4_update_thread_context(AVCodecContext *dst,
     if (ret < 0)
         return ret;
 
+    memcpy(((uint8_t*)s) + sizeof(MpegEncContext), ((uint8_t*)s1) + sizeof(MpegEncContext), sizeof(Mpeg4DecContext) - sizeof(MpegEncContext));
+
     if (CONFIG_MPEG4_DECODER && !init && s1->xvid_build >= 0)
         ff_xvid_idct_init(&s->m.idsp, dst);
 
-    s->shape               = s1->shape;
-    s->time_increment_bits = s1->time_increment_bits;
-    s->xvid_build          = s1->xvid_build;
-
     return 0;
 }
+#endif
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     Mpeg4DecContext *ctx = avctx->priv_data;
     MpegEncContext *s = &ctx->m;
     int ret;
-    static int done = 0;
 
     ctx->divx_version =
     ctx->divx_build   =
@@ -2565,31 +2724,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
     if ((ret = ff_h263_decode_init(avctx)) < 0)
         return ret;
 
-    if (!done) {
-        done = 1;
-
-        ff_rl_init(&ff_mpeg4_rl_intra, ff_mpeg4_static_rl_table_store[0]);
-        ff_rl_init(&ff_rvlc_rl_inter, ff_mpeg4_static_rl_table_store[1]);
-        ff_rl_init(&ff_rvlc_rl_intra, ff_mpeg4_static_rl_table_store[2]);
-        INIT_VLC_RL(ff_mpeg4_rl_intra, 554);
-        INIT_VLC_RL(ff_rvlc_rl_inter, 1072);
-        INIT_VLC_RL(ff_rvlc_rl_intra, 1072);
-        INIT_VLC_STATIC(&dc_lum, DC_VLC_BITS, 10 /* 13 */,
-                        &ff_mpeg4_DCtab_lum[0][1], 2, 1,
-                        &ff_mpeg4_DCtab_lum[0][0], 2, 1, 512);
-        INIT_VLC_STATIC(&dc_chrom, DC_VLC_BITS, 10 /* 13 */,
-                        &ff_mpeg4_DCtab_chrom[0][1], 2, 1,
-                        &ff_mpeg4_DCtab_chrom[0][0], 2, 1, 512);
-        INIT_VLC_STATIC(&sprite_trajectory, SPRITE_TRAJ_VLC_BITS, 15,
-                        &ff_sprite_trajectory_tab[0][1], 4, 2,
-                        &ff_sprite_trajectory_tab[0][0], 4, 2, 128);
-        INIT_VLC_STATIC(&mb_type_b_vlc, MB_TYPE_B_VLC_BITS, 4,
-                        &ff_mb_type_b_tab[0][1], 2, 1,
-                        &ff_mb_type_b_tab[0][0], 2, 1, 16);
-    }
+    ff_mpeg4videodec_static_init();
 
     s->h263_pred = 1;
-    s->low_delay = 0; /* default, might be overriden in the vol header during header parsing */
+    s->low_delay = 0; /* default, might be overridden in the vol header during header parsing */
     s->decode_mb = mpeg4_decode_mb;
     ctx->time_increment_bits = 4; /* default value for broken headers */
 
@@ -2616,6 +2754,20 @@ static const AVProfile mpeg4_video_profiles[] = {
     { FF_PROFILE_MPEG4_ADVANCED_SCALABLE_TEXTURE, "Advanced Scalable Texture Profile" },
     { FF_PROFILE_MPEG4_SIMPLE_STUDIO,             "Simple Studio Profile" },
     { FF_PROFILE_MPEG4_ADVANCED_SIMPLE,           "Advanced Simple Profile" },
+    { FF_PROFILE_UNKNOWN },
+};
+
+static const AVOption mpeg4_options[] = {
+    {"quarter_sample", "1/4 subpel MC", offsetof(MpegEncContext, quarter_sample), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, 0},
+    {"divx_packed", "divx style packed b frames", offsetof(MpegEncContext, divx_packed), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, 0},
+    {NULL}
+};
+
+static const AVClass mpeg4_class = {
+    "MPEG4 Video Decoder",
+    av_default_item_name,
+    mpeg4_options,
+    LIBAVUTIL_VERSION_INT,
 };
 
 AVCodec ff_mpeg4_decoder = {
@@ -2631,7 +2783,35 @@ AVCodec ff_mpeg4_decoder = {
                              AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY |
                              AV_CODEC_CAP_FRAME_THREADS,
     .flush                 = ff_mpeg_flush,
+    .max_lowres            = 3,
     .pix_fmts              = ff_h263_hwaccel_pixfmt_list_420,
     .profiles              = NULL_IF_CONFIG_SMALL(mpeg4_video_profiles),
     .update_thread_context = ONLY_IF_THREADS_ENABLED(mpeg4_update_thread_context),
+    .priv_class = &mpeg4_class,
+};
+
+
+#if CONFIG_MPEG4_VDPAU_DECODER && FF_API_VDPAU
+static const AVClass mpeg4_vdpau_class = {
+    "MPEG4 Video VDPAU Decoder",
+    av_default_item_name,
+    mpeg4_options,
+    LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_mpeg4_vdpau_decoder = {
+    .name           = "mpeg4_vdpau",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-4 part 2 (VDPAU)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG4,
+    .priv_data_size = sizeof(Mpeg4DecContext),
+    .init           = decode_init,
+    .close          = ff_h263_decode_end,
+    .decode         = ff_h263_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY |
+                      AV_CODEC_CAP_HWACCEL_VDPAU,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_VDPAU_MPEG4,
+                                                  AV_PIX_FMT_NONE },
+    .priv_class     = &mpeg4_vdpau_class,
 };
+#endif
diff --git a/libavcodec/mpeg4videoenc.c b/libavcodec/mpeg4videoenc.c
index db7d3ad46b..ffa08beb87 100644
--- a/libavcodec/mpeg4videoenc.c
+++ b/libavcodec/mpeg4videoenc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -208,7 +208,7 @@ static inline int decide_ac_pred(MpegEncContext *s, int16_t block[6][64],
 }
 
 /**
- * modify mb_type & qscale so that encoding is acually possible in mpeg4
+ * modify mb_type & qscale so that encoding is actually possible in mpeg4
  */
 void ff_clean_mpeg4_qscales(MpegEncContext *s)
 {
@@ -278,19 +278,19 @@ static inline void mpeg4_encode_dc(PutBitContext *s, int level, int n)
 
     if (n < 4) {
         /* luminance */
-        put_bits(&s->pb, ff_mpeg4_DCtab_lum[size][1], ff_mpeg4_DCtab_lum[size][0]);
+        put_bits(s, ff_mpeg4_DCtab_lum[size][1], ff_mpeg4_DCtab_lum[size][0]);
     } else {
         /* chrominance */
-        put_bits(&s->pb, ff_mpeg4_DCtab_chrom[size][1], ff_mpeg4_DCtab_chrom[size][0]);
+        put_bits(s, ff_mpeg4_DCtab_chrom[size][1], ff_mpeg4_DCtab_chrom[size][0]);
     }
 
     /* encode remaining bits */
     if (size > 0) {
         if (level < 0)
             level = (-level) ^ ((1 << size) - 1);
-        put_bits(&s->pb, size, level);
+        put_bits(s, size, level);
         if (size > 8)
-            put_bits(&s->pb, 1, 1);
+            put_bits(s, 1, 1);
     }
 #endif
 }
@@ -525,9 +525,9 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                     s->last_mv[i][1][1] = 0;
             }
 
-            assert(s->dquant >= -2 && s->dquant <= 2);
-            assert((s->dquant & 1) == 0);
-            assert(mb_type >= 0);
+            av_assert2(s->dquant >= -2 && s->dquant <= 2);
+            av_assert2((s->dquant & 1) == 0);
+            av_assert2(mb_type >= 0);
 
             /* nothing to do if this MB was skipped in the next P Frame */
             if (s->next_picture.mbskip_table[s->mb_y * s->mb_stride + s->mb_x]) {  // FIXME avoid DCT & ...
@@ -547,7 +547,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
 
             if ((cbp | motion_x | motion_y | mb_type) == 0) {
                 /* direct MB with MV={0,0} */
-                assert(s->dquant == 0);
+                av_assert2(s->dquant == 0);
 
                 put_bits(&s->pb, 1, 1); /* mb not coded modb1=1 */
 
@@ -584,12 +584,12 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                 s->misc_bits += get_bits_diff(s);
 
             if (!mb_type) {
-                assert(s->mv_dir & MV_DIRECT);
+                av_assert2(s->mv_dir & MV_DIRECT);
                 ff_h263_encode_motion_vector(s, motion_x, motion_y, 1);
                 s->b_count++;
                 s->f_count++;
             } else {
-                assert(mb_type > 0 && mb_type < 4);
+                av_assert2(mb_type > 0 && mb_type < 4);
                 if (s->mv_type != MV_TYPE_FIELD) {
                     if (s->mv_dir & MV_DIR_FORWARD) {
                         ff_h263_encode_motion_vector(s,
@@ -669,10 +669,6 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
 
                     x = s->mb_x * 16;
                     y = s->mb_y * 16;
-                    if (x + 16 > s->width)
-                        x = s->width - 16;
-                    if (y + 16 > s->height)
-                        y = s->height - 16;
 
                     offset = x + y * s->linesize;
                     p_pic  = s->new_picture.f->data[0] + offset;
@@ -689,7 +685,21 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                         b_pic = pic->f->data[0] + offset;
                         if (!pic->shared)
                             b_pic += INPLACE_OFFSET;
-                        diff = s->mecc.sad[0](NULL, p_pic, b_pic, s->linesize, 16);
+
+                        if (x + 16 > s->width || y + 16 > s->height) {
+                            int x1, y1;
+                            int xe = FFMIN(16, s->width - x);
+                            int ye = FFMIN(16, s->height - y);
+                            diff = 0;
+                            for (y1 = 0; y1 < ye; y1++) {
+                                for (x1 = 0; x1 < xe; x1++) {
+                                    diff += FFABS(p_pic[x1 + y1 * s->linesize] - b_pic[x1 + y1 * s->linesize]);
+                                }
+                            }
+                            diff = diff * 256 / (xe * ye);
+                        } else {
+                            diff = s->mecc.sad[0](NULL, p_pic, b_pic, s->linesize, 16);
+                        }
                         if (diff > s->qscale * 70) {  // FIXME check that 70 is optimal
                             s->mb_skipped = 0;
                             break;
@@ -754,7 +764,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                 if (s->dquant)
                     put_bits(pb2, 2, dquant_code[s->dquant + 2]);
 
-                assert(!s->progressive_sequence);
+                av_assert2(!s->progressive_sequence);
                 if (cbp)
                     put_bits(pb2, 1, s->interlaced_dct);
                 put_bits(pb2, 1, 1);
@@ -778,7 +788,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                                              s->mv[0][1][1] - pred_y,
                                              s->f_code);
             } else {
-                assert(s->mv_type == MV_TYPE_8X8);
+                av_assert2(s->mv_type == MV_TYPE_8X8);
                 put_bits(&s->pb,
                          ff_h263_inter_MCBPC_bits[cbpc + 16],
                          ff_h263_inter_MCBPC_code[cbpc + 16]);
@@ -894,7 +904,7 @@ void ff_set_mpeg4_time(MpegEncContext *s)
         ff_mpeg4_init_direct_mv(s);
     } else {
         s->last_time_base = s->time_base;
-        s->time_base      = s->time / s->avctx->time_base.den;
+        s->time_base      = FFUDIV(s->time, s->avctx->time_base.den);
     }
 }
 
@@ -910,13 +920,12 @@ static void mpeg4_encode_gop_header(MpegEncContext *s)
     if (s->reordered_input_picture[1])
         time = FFMIN(time, s->reordered_input_picture[1]->f->pts);
     time = time * s->avctx->time_base.num;
+    s->last_time_base = FFUDIV(time, s->avctx->time_base.den);
 
-    seconds  = time / s->avctx->time_base.den;
-    minutes  = seconds / 60;
-    seconds %= 60;
-    hours    = minutes / 60;
-    minutes %= 60;
-    hours   %= 24;
+    seconds = FFUDIV(time, s->avctx->time_base.den);
+    minutes = FFUDIV(seconds, 60); seconds = FFUMOD(seconds, 60);
+    hours   = FFUDIV(minutes, 60); minutes = FFUMOD(minutes, 60);
+    hours   = FFUMOD(hours  , 24);
 
     put_bits(&s->pb, 5, hours);
     put_bits(&s->pb, 6, minutes);
@@ -926,8 +935,6 @@ static void mpeg4_encode_gop_header(MpegEncContext *s)
     put_bits(&s->pb, 1, !!(s->avctx->flags & AV_CODEC_FLAG_CLOSED_GOP));
     put_bits(&s->pb, 1, 0);  // broken link == NO
 
-    s->last_time_base = time / s->avctx->time_base.den;
-
     ff_mpeg4_stuffing(&s->pb);
 }
 
@@ -1011,6 +1018,8 @@ static void mpeg4_encode_vol_header(MpegEncContext *s,
 
     put_bits(&s->pb, 4, s->aspect_ratio_info); /* aspect ratio info */
     if (s->aspect_ratio_info == FF_ASPECT_EXTENDED) {
+        av_reduce(&s->avctx->sample_aspect_ratio.num, &s->avctx->sample_aspect_ratio.den,
+                   s->avctx->sample_aspect_ratio.num,  s->avctx->sample_aspect_ratio.den, 255);
         put_bits(&s->pb, 8, s->avctx->sample_aspect_ratio.num);
         put_bits(&s->pb, 8, s->avctx->sample_aspect_ratio.den);
     }
@@ -1099,11 +1108,10 @@ void ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number)
     put_bits(&s->pb, 16, VOP_STARTCODE);    /* vop header */
     put_bits(&s->pb, 2, s->pict_type - 1);  /* pict type: I = 0 , P = 1 */
 
-    assert(s->time >= 0);
-    time_div  = s->time / s->avctx->time_base.den;
-    time_mod  = s->time % s->avctx->time_base.den;
+    time_div  = FFUDIV(s->time, s->avctx->time_base.den);
+    time_mod  = FFUMOD(s->time, s->avctx->time_base.den);
     time_incr = time_div - s->last_time_base;
-    assert(time_incr >= 0);
+    av_assert0(time_incr >= 0);
     while (time_incr--)
         put_bits(&s->pb, 1, 1);
 
@@ -1191,8 +1199,8 @@ static av_cold void init_uni_mpeg4_rl_tab(RLTable *rl, uint32_t *bits_tab,
 {
     int slevel, run, last;
 
-    assert(MAX_LEVEL >= 64);
-    assert(MAX_RUN >= 63);
+    av_assert0(MAX_LEVEL >= 64);
+    av_assert0(MAX_RUN >= 63);
 
     for (slevel = -64; slevel < 64; slevel++) {
         if (slevel == 0)
@@ -1287,6 +1295,11 @@ static av_cold int encode_init(AVCodecContext *avctx)
     int ret;
     static int done = 0;
 
+    if (avctx->width >= (1<<13) || avctx->height >= (1<<13)) {
+        av_log(avctx, AV_LOG_ERROR, "dimensions too large for MPEG-4\n");
+        return AVERROR(EINVAL);
+    }
+
     if ((ret = ff_mpv_encode_init(avctx)) < 0)
         return ret;
 
diff --git a/libavcodec/mpeg_er.c b/libavcodec/mpeg_er.c
index 9410b27616..dd87ae9cc9 100644
--- a/libavcodec/mpeg_er.c
+++ b/libavcodec/mpeg_er.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@ static void set_erpic(ERPicture *dst, Picture *src)
 {
     int i;
 
+    memset(dst, 0, sizeof(*dst));
     if (!src) {
         dst->f  = NULL;
         dst->tf = NULL;
diff --git a/libavcodec/mpeg_er.h b/libavcodec/mpeg_er.h
index ca1ea90165..bb627a4d06 100644
--- a/libavcodec/mpeg_er.h
+++ b/libavcodec/mpeg_er.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudio.c b/libavcodec/mpegaudio.c
index 1a8363540e..cba52992ef 100644
--- a/libavcodec/mpegaudio.c
+++ b/libavcodec/mpegaudio.c
@@ -2,20 +2,20 @@
  * MPEG Audio common code
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h
index b55680100b..1591a170d7 100644
--- a/libavcodec/mpegaudio.h
+++ b/libavcodec/mpegaudio.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,8 +26,8 @@
 #ifndef AVCODEC_MPEGAUDIO_H
 #define AVCODEC_MPEGAUDIO_H
 
-#ifndef CONFIG_FLOAT
-#   define CONFIG_FLOAT 0
+#ifndef USE_FLOATS
+#   define USE_FLOATS 0
 #endif
 
 #include <stdint.h>
@@ -52,11 +52,13 @@
 #define WFRAC_BITS  16   /* fractional bits for window */
 #endif
 
+#define IMDCT_SCALAR 1.759
+
 #define FRAC_ONE    (1 << FRAC_BITS)
 
 #define FIX(a)   ((int)((a) * FRAC_ONE))
 
-#if CONFIG_FLOAT
+#if USE_FLOATS
 #   define INTFLOAT float
 typedef float MPA_INT;
 typedef float OUT_INT;
diff --git a/libavcodec/mpegaudio_parser.c b/libavcodec/mpegaudio_parser.c
index 84c20d882f..7849fed649 100644
--- a/libavcodec/mpegaudio_parser.c
+++ b/libavcodec/mpegaudio_parser.c
@@ -3,27 +3,27 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "parser.h"
 #include "mpegaudiodecheader.h"
 #include "libavutil/common.h"
-
+#include "libavformat/id3v1.h" // for ID3v1_TAG_SIZE
 
 typedef struct MpegAudioParseContext {
     ParseContext pc;
@@ -49,12 +49,14 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
     uint32_t state= pc->state;
     int i;
     int next= END_NOT_FOUND;
+    int flush = !buf_size;
 
     for(i=0; i<buf_size; ){
         if(s->frame_size){
             int inc= FFMIN(buf_size - i, s->frame_size);
             i += inc;
             s->frame_size -= inc;
+            state = 0;
 
             if(!s->frame_size){
                 next= i;
@@ -67,24 +69,26 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
 
                 state= (state<<8) + buf[i++];
 
-                ret = avpriv_mpa_decode_header(avctx, state, &sr, &channels, &frame_size, &bit_rate);
+                ret = avpriv_mpa_decode_header2(state, &sr, &channels, &frame_size, &bit_rate, &codec_id);
                 if (ret < 4) {
                     if (i > 4)
                         s->header_count = -2;
                 } else {
+                    int header_threshold = avctx->codec_id != AV_CODEC_ID_NONE && avctx->codec_id != codec_id;
                     if((state&SAME_HEADER_MASK) != (s->header&SAME_HEADER_MASK) && s->header)
                         s->header_count= -3;
                     s->header= state;
                     s->header_count++;
                     s->frame_size = ret-4;
 
-                    if (s->header_count > 0) {
+                    if (s->header_count > header_threshold) {
                         avctx->sample_rate= sr;
                         avctx->channels   = channels;
                         s1->duration      = frame_size;
+                        avctx->codec_id   = codec_id;
                         if (s->no_bitrate || !avctx->bit_rate) {
                             s->no_bitrate = 1;
-                            avctx->bit_rate += (bit_rate - avctx->bit_rate) / s->header_count;
+                            avctx->bit_rate += (bit_rate - avctx->bit_rate) / (s->header_count - header_threshold);
                         }
                     }
 
@@ -110,6 +114,12 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
         return buf_size;
     }
 
+    if (flush && buf_size >= ID3v1_TAG_SIZE && memcmp(buf, "TAG", 3) == 0) {
+        *poutbuf = NULL;
+        *poutbuf_size = 0;
+        return next;
+    }
+
     *poutbuf = buf;
     *poutbuf_size = buf_size;
     return next;
diff --git a/libavcodec/mpegaudio_tablegen.c b/libavcodec/mpegaudio_tablegen.c
index b4c240bd7c..90c9de430a 100644
--- a/libavcodec/mpegaudio_tablegen.c
+++ b/libavcodec/mpegaudio_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudio_tablegen.h b/libavcodec/mpegaudio_tablegen.h
index 8a3e51ae60..86b2cd32b0 100644
--- a/libavcodec/mpegaudio_tablegen.h
+++ b/libavcodec/mpegaudio_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
 
 #include <stdint.h>
 #include <math.h>
+#include "libavutil/attributes.h"
 
 #define TABLE_4_3_SIZE (8191 + 16)*4
 #if CONFIG_HARDCODED_TABLES
@@ -39,8 +40,9 @@ static float exp_table_float[512];
 static float expval_table_float[512][16];
 
 #define FRAC_BITS 23
+#define IMDCT_SCALAR 1.759
 
-static void mpegaudio_tableinit(void)
+static av_cold void mpegaudio_tableinit(void)
 {
     int i, value, exponent;
     for (i = 1; i < TABLE_4_3_SIZE; i++) {
@@ -48,7 +50,7 @@ static void mpegaudio_tableinit(void)
         double f, fm;
         int e, m;
         /* cbrtf() isn't available on all systems, so we use powf(). */
-        f  = value * powf(value, 1.0 / 3.0) * pow(2, (i & 3) * 0.25);
+        f  = value / IMDCT_SCALAR * pow(value, 1.0 / 3.0) * pow(2, (i & 3) * 0.25);
         fm = frexp(f, &e);
         m  = (uint32_t)(fm * (1LL << 31) + 0.5);
         e += FRAC_BITS - 31 + 5 - 100;
@@ -60,9 +62,9 @@ static void mpegaudio_tableinit(void)
     for (exponent = 0; exponent < 512; exponent++) {
         for (value = 0; value < 16; value++) {
             /* cbrtf() isn't available on all systems, so we use powf(). */
-            double f = (double)value * powf(value, 1.0 / 3.0) * pow(2, (exponent - 400) * 0.25 + FRAC_BITS + 5);
+            double f = (double)value * pow(value, 1.0 / 3.0) * pow(2, (exponent - 400) * 0.25 + FRAC_BITS + 5) / IMDCT_SCALAR;
             /* llrint() isn't always available, so round and cast manually. */
-            expval_table_fixed[exponent][value] = (long long int) (f >= 0 ? floor(f + 0.5) : ceil(f - 0.5));
+            expval_table_fixed[exponent][value] = (long long int) (f < 0xFFFFFFFF ? floor(f + 0.5) : 0xFFFFFFFF);
             expval_table_float[exponent][value] = f;
         }
         exp_table_fixed[exponent] = expval_table_fixed[exponent][1];
diff --git a/libavcodec/mpegaudiodata.c b/libavcodec/mpegaudiodata.c
index 009a02a4e8..0569281109 100644
--- a/libavcodec/mpegaudiodata.c
+++ b/libavcodec/mpegaudiodata.c
@@ -2,20 +2,20 @@
  * MPEG Audio common tables
  * copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,11 +29,11 @@
 
 const uint16_t avpriv_mpa_bitrate_tab[2][3][15] = {
     { {0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448 },
-      {0, 32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384 },
-      {0, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320 } },
-    { {0, 32, 48, 56, 64, 80, 96, 112, 128, 144, 160, 176, 192, 224, 256},
-      {0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160},
-      {0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160}
+      {0, 32, 48, 56,  64,  80,  96, 112, 128, 160, 192, 224, 256, 320, 384 },
+      {0, 32, 40, 48,  56,  64,  80,  96, 112, 128, 160, 192, 224, 256, 320 } },
+    { {0, 32, 48, 56,  64,  80,  96, 112, 128, 144, 160, 176, 192, 224, 256},
+      {0,  8, 16, 24,  32,  40,  48,  56,  64,  80,  96, 112, 128, 144, 160},
+      {0,  8, 16, 24,  32,  40,  48,  56,  64,  80,  96, 112, 128, 144, 160}
     }
 };
 
diff --git a/libavcodec/mpegaudiodata.h b/libavcodec/mpegaudiodata.h
index 2b8ff6587f..29a26588b2 100644
--- a/libavcodec/mpegaudiodata.h
+++ b/libavcodec/mpegaudiodata.h
@@ -2,20 +2,20 @@
  * MPEG Audio common tables
  * copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudiodec_fixed.c b/libavcodec/mpegaudiodec_fixed.c
index 2db1e184df..9421ffbe94 100644
--- a/libavcodec/mpegaudiodec_fixed.c
+++ b/libavcodec/mpegaudiodec_fixed.c
@@ -1,27 +1,27 @@
 /*
  * Fixed-point MPEG audio decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 #include "libavutil/samplefmt.h"
 
-#define CONFIG_FLOAT 0
+#define USE_FLOATS 0
 
 #include "mpegaudio.h"
 
diff --git a/libavcodec/mpegaudiodec_float.c b/libavcodec/mpegaudiodec_float.c
index 7bdfd90967..ddfa5e0daa 100644
--- a/libavcodec/mpegaudiodec_float.c
+++ b/libavcodec/mpegaudiodec_float.c
@@ -2,27 +2,27 @@
  * Float MPEG Audio decoder
  * Copyright (c) 2010 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 #include "libavutil/samplefmt.h"
 
-#define CONFIG_FLOAT 1
+#define USE_FLOATS 1
 
 #include "mpegaudio.h"
 
@@ -46,6 +46,7 @@ AVCodec ff_mp1float_decoder = {
     .id             = AV_CODEC_ID_MP1,
     .priv_data_size = sizeof(MPADecodeContext),
     .init           = decode_init,
+    .close          = decode_close,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
     .flush          = flush,
@@ -63,6 +64,7 @@ AVCodec ff_mp2float_decoder = {
     .priv_data_size = sizeof(MPADecodeContext),
     .init           = decode_init,
     .decode         = decode_frame,
+    .close          = decode_close,
     .capabilities   = AV_CODEC_CAP_DR1,
     .flush          = flush,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
@@ -78,6 +80,7 @@ AVCodec ff_mp3float_decoder = {
     .id             = AV_CODEC_ID_MP3,
     .priv_data_size = sizeof(MPADecodeContext),
     .init           = decode_init,
+    .close          = decode_close,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
     .flush          = flush,
@@ -94,6 +97,7 @@ AVCodec ff_mp3adufloat_decoder = {
     .id             = AV_CODEC_ID_MP3ADU,
     .priv_data_size = sizeof(MPADecodeContext),
     .init           = decode_init,
+    .close          = decode_close,
     .decode         = decode_frame_adu,
     .capabilities   = AV_CODEC_CAP_DR1,
     .flush          = flush,
diff --git a/libavcodec/mpegaudiodec_template.c b/libavcodec/mpegaudiodec_template.c
index ba1f82b99f..0f32ac7b51 100644
--- a/libavcodec/mpegaudiodec_template.c
+++ b/libavcodec/mpegaudiodec_template.c
@@ -2,20 +2,20 @@
  * MPEG Audio decoder
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/float_dsp.h"
+#include "libavutil/libm.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "internal.h"
@@ -84,7 +85,7 @@ typedef struct MPADecodeContext {
     int err_recognition;
     AVCodecContext* avctx;
     MPADSPContext mpadsp;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     AVFrame *frame;
 } MPADecodeContext;
 
@@ -184,6 +185,8 @@ static void compute_band_indexes(MPADecodeContext *s, GranuleDef *g)
 {
     if (g->block_type == 2) {
         if (g->switch_point) {
+            if(s->sample_rate_index == 8)
+                avpriv_request_sample(s->avctx, "switch point in 8khz");
             /* if switched mode, we handle the 36 first samples as
                 long blocks.  For 8000Hz, we handle the 72 first
                 exponents as long blocks */
@@ -213,7 +216,7 @@ static inline int l1_unscale(int n, int mant, int scale_factor)
     shift   = scale_factor_modshift[scale_factor];
     mod     = shift & 3;
     shift >>= 2;
-    val     = MUL64(mant + (-1 << n) + 1, scale_factor_mult[n-1][mod]);
+    val     = MUL64((int)(mant + (-1U << n) + 1), scale_factor_mult[n-1][mod]);
     shift  += n;
     /* NOTE: at this point, 1 <= shift >= 21 + 15 */
     return (int)((val + (1LL << (shift - 1))) >> shift);
@@ -243,7 +246,10 @@ static inline int l3_unscale(int value, int exponent)
     e  = table_4_3_exp  [4 * value + (exponent & 3)];
     m  = table_4_3_value[4 * value + (exponent & 3)];
     e -= exponent >> 2;
-    assert(e >= 1);
+#ifdef DEBUG
+    if(e < 1)
+        av_log(NULL, AV_LOG_WARNING, "l3_unscale: e is %d\n", e);
+#endif
     if (e > 31)
         return 0;
     m = (m + (1 << (e - 1))) >> e;
@@ -307,7 +313,7 @@ static av_cold void decode_init_static(void)
                  INIT_VLC_USE_NEW_STATIC);
         offset += huff_vlc_tables_sizes[i];
     }
-    assert(offset == FF_ARRAY_ELEMS(huff_vlc_tables));
+    av_assert0(offset == FF_ARRAY_ELEMS(huff_vlc_tables));
 
     offset = 0;
     for (i = 0; i < 2; i++) {
@@ -318,7 +324,7 @@ static av_cold void decode_init_static(void)
                  INIT_VLC_USE_NEW_STATIC);
         offset += huff_quad_vlc_tables_sizes[i];
     }
-    assert(offset == FF_ARRAY_ELEMS(huff_quad_vlc_tables));
+    av_assert0(offset == FF_ARRAY_ELEMS(huff_quad_vlc_tables));
 
     for (i = 0; i < 9; i++) {
         k = 0;
@@ -371,7 +377,7 @@ static av_cold void decode_init_static(void)
 
         for (j = 0; j < 2; j++) {
             e = -(j + 1) * ((i + 1) >> 1);
-            f = pow(2.0, e / 4.0);
+            f = exp2(e / 4.0);
             k = i & 1;
             is_table_lsf[j][k ^ 1][i] = FIXR(f);
             is_table_lsf[j][k    ][i] = FIXR(1.0);
@@ -382,11 +388,11 @@ static av_cold void decode_init_static(void)
     }
 
     for (i = 0; i < 8; i++) {
-        float ci, cs, ca;
+        double ci, cs, ca;
         ci = ci_table[i];
         cs = 1.0 / sqrt(1.0 + ci * ci);
         ca = cs * ci;
-#if !CONFIG_FLOAT
+#if !USE_FLOATS
         csa_table[i][0] = FIXHR(cs/4);
         csa_table[i][1] = FIXHR(ca/4);
         csa_table[i][2] = FIXHR(ca/4) + FIXHR(cs/4);
@@ -400,6 +406,16 @@ static av_cold void decode_init_static(void)
     }
 }
 
+#if USE_FLOATS
+static av_cold int decode_close(AVCodecContext * avctx)
+{
+    MPADecodeContext *s = avctx->priv_data;
+    av_freep(&s->fdsp);
+
+    return 0;
+}
+#endif
+
 static av_cold int decode_init(AVCodecContext * avctx)
 {
     static int initialized_tables = 0;
@@ -412,7 +428,12 @@ static av_cold int decode_init(AVCodecContext * avctx)
 
     s->avctx = avctx;
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+#if USE_FLOATS
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
+#endif
+
     ff_mpadsp_init(&s->mpadsp);
 
     if (avctx->request_sample_fmt == OUT_FMT &&
@@ -808,7 +829,7 @@ static void switch_buffer(MPADecodeContext *s, int *pos, int *end_pos,
     if (s->in_gb.buffer && *pos >= s->gb.size_in_bits) {
         s->gb           = s->in_gb;
         s->in_gb.buffer = NULL;
-        assert((get_bits_count(&s->gb) & 7) == 0);
+        av_assert2((get_bits_count(&s->gb) & 7) == 0);
         skip_bits_long(&s->gb, *pos - *end_pos);
         *end_pos2 =
         *end_pos  = *end_pos2 + get_bits_count(&s->gb) - *pos;
@@ -822,7 +843,7 @@ static void switch_buffer(MPADecodeContext *s, int *pos, int *end_pos,
                 v = -v;
             *dst = v;
 */
-#if CONFIG_FLOAT
+#if USE_FLOATS
 #define READ_FLIP_SIGN(dst,src)                     \
     v = AV_RN32A(src) ^ (get_bits1(&s->gb) << 31);  \
     AV_WN32A(dst, v);
@@ -937,7 +958,7 @@ static int huffman_decode(MPADecodeContext *s, GranuleDef *g,
                 s_index -= 4;
                 skip_bits_long(&s->gb, last_pos - pos);
                 av_log(s->avctx, AV_LOG_INFO, "overread, skip %d enddists: %d %d\n", last_pos - pos, end_pos-pos, end_pos2-pos);
-                if(s->err_recognition & AV_EF_BITSTREAM)
+                if(s->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT))
                     s_index=0;
                 break;
             }
@@ -964,10 +985,10 @@ static int huffman_decode(MPADecodeContext *s, GranuleDef *g,
     }
     /* skip extension bits */
     bits_left = end_pos2 - get_bits_count(&s->gb);
-    if (bits_left < 0 && (s->err_recognition & AV_EF_BUFFER)) {
+    if (bits_left < 0 && (s->err_recognition & (AV_EF_BUFFER|AV_EF_COMPLIANT))) {
         av_log(s->avctx, AV_LOG_ERROR, "bits_left=%d\n", bits_left);
         s_index=0;
-    } else if (bits_left > 0 && (s->err_recognition & AV_EF_BUFFER)) {
+    } else if (bits_left > 0 && (s->err_recognition & (AV_EF_BUFFER|AV_EF_AGGRESSIVE))) {
         av_log(s->avctx, AV_LOG_ERROR, "bits_left=%d\n", bits_left);
         s_index = 0;
     }
@@ -1131,8 +1152,8 @@ found2:
         /* ms stereo ONLY */
         /* NOTE: the 1/sqrt(2) normalization factor is included in the
            global gain */
-#if CONFIG_FLOAT
-       s->fdsp.butterflies_float(g0->sb_hybrid, g1->sb_hybrid, 576);
+#if USE_FLOATS
+       s->fdsp->butterflies_float(g0->sb_hybrid, g1->sb_hybrid, 576);
 #else
         tab0 = g0->sb_hybrid;
         tab1 = g1->sb_hybrid;
@@ -1146,7 +1167,18 @@ found2:
     }
 }
 
-#if CONFIG_FLOAT
+#if USE_FLOATS
+#if HAVE_MIPSFPU
+#   include "mips/compute_antialias_float.h"
+#endif /* HAVE_MIPSFPU */
+#else
+#if HAVE_MIPSDSPR1
+#   include "mips/compute_antialias_fixed.h"
+#endif /* HAVE_MIPSDSPR1 */
+#endif /* USE_FLOATS */
+
+#ifndef compute_antialias
+#if USE_FLOATS
 #define AA(j) do {                                                      \
         float tmp0 = ptr[-1-j];                                         \
         float tmp1 = ptr[   j];                                         \
@@ -1192,6 +1224,7 @@ static void compute_antialias(MPADecodeContext *s, GranuleDef *g)
         ptr += 18;
     }
 }
+#endif /* compute_antialias */
 
 static void compute_imdct(MPADecodeContext *s, GranuleDef *g,
                           INTFLOAT *sb_samples, INTFLOAT *mdct_buf)
@@ -1361,9 +1394,8 @@ static int mp_decode_layer3(MPADecodeContext *s)
     if (!s->adu_mode) {
         int skip;
         const uint8_t *ptr = s->gb.buffer + (get_bits_count(&s->gb)>>3);
-        int extrasize = av_clip(get_bits_left(&s->gb) >> 3, 0,
-                                FFMAX(0, LAST_BUF_SIZE - s->last_buf_size));
-        assert((get_bits_count(&s->gb) & 7) == 0);
+        int extrasize = av_clip(get_bits_left(&s->gb) >> 3, 0, EXTRABYTES);
+        av_assert1((get_bits_count(&s->gb) & 7) == 0);
         /* now we get bits from the main_data_begin offset */
         ff_dlog(s->avctx, "seekback:%d, lastbuf:%d\n",
                 main_data_begin, s->last_buf_size);
@@ -1372,7 +1404,7 @@ static int mp_decode_layer3(MPADecodeContext *s)
         s->in_gb = s->gb;
         init_get_bits(&s->gb, s->last_buf, s->last_buf_size*8);
 #if !UNCHECKED_BITSTREAM_READER
-        s->gb.size_in_bits_plus8 += extrasize * 8;
+        s->gb.size_in_bits_plus8 += FFMAX(extrasize, LAST_BUF_SIZE - s->last_buf_size) * 8;
 #endif
         s->last_buf_size <<= 3;
         for (gr = 0; gr < nb_granules && (s->last_buf_size >> 3) < main_data_begin; gr++) {
@@ -1554,9 +1586,6 @@ static int mp_decode_frame(MPADecodeContext *s, OUT_INT **samples,
     default:
         nb_frames = mp_decode_layer3(s);
 
-        if (nb_frames < 0)
-            return nb_frames;
-
         s->last_buf_size=0;
         if (s->in_gb.buffer) {
             align_get_bits(&s->gb);
@@ -1571,7 +1600,7 @@ static int mp_decode_frame(MPADecodeContext *s, OUT_INT **samples,
         }
 
         align_get_bits(&s->gb);
-        assert((get_bits_count(&s->gb) & 7) == 0);
+        av_assert1((get_bits_count(&s->gb) & 7) == 0);
         i = get_bits_left(&s->gb) >> 3;
 
         if (i < 0 || i > BACKSTEP_SIZE || nb_frames < 0) {
@@ -1579,19 +1608,20 @@ static int mp_decode_frame(MPADecodeContext *s, OUT_INT **samples,
                 av_log(s->avctx, AV_LOG_ERROR, "invalid new backstep %d\n", i);
             i = FFMIN(BACKSTEP_SIZE, buf_size - HEADER_SIZE);
         }
-        assert(i <= buf_size - HEADER_SIZE && i >= 0);
+        av_assert1(i <= buf_size - HEADER_SIZE && i >= 0);
         memcpy(s->last_buf + s->last_buf_size, s->gb.buffer + buf_size - HEADER_SIZE - i, i);
         s->last_buf_size += i;
     }
 
+    if(nb_frames < 0)
+        return nb_frames;
+
     /* get output buffer */
     if (!samples) {
-        av_assert0(s->frame != NULL);
+        av_assert0(s->frame);
         s->frame->nb_samples = s->avctx->frame_size;
-        if ((ret = ff_get_buffer(s->avctx, s->frame, 0)) < 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(s->avctx, s->frame, 0)) < 0)
             return ret;
-        }
         samples = (OUT_INT **)s->frame->extended_data;
     }
 
@@ -1627,10 +1657,21 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr,
     uint32_t header;
     int ret;
 
+    int skipped = 0;
+    while(buf_size && !*buf){
+        buf++;
+        buf_size--;
+        skipped++;
+    }
+
     if (buf_size < HEADER_SIZE)
         return AVERROR_INVALIDDATA;
 
     header = AV_RB32(buf);
+    if (header>>8 == AV_RB32("TAG")>>8) {
+        av_log(avctx, AV_LOG_DEBUG, "discarding ID3 tag\n");
+        return buf_size;
+    }
     if (ff_mpa_check_header(header) < 0) {
         av_log(avctx, AV_LOG_ERROR, "Header missing\n");
         return AVERROR_INVALIDDATA;
@@ -1647,6 +1688,14 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr,
     if (!avctx->bit_rate)
         avctx->bit_rate = s->bit_rate;
 
+    if (s->frame_size <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "incomplete frame\n");
+        return AVERROR_INVALIDDATA;
+    } else if (s->frame_size < buf_size) {
+        av_log(avctx, AV_LOG_DEBUG, "incorrect frame size - multiple frames in buffer?\n");
+        buf_size= s->frame_size;
+    }
+
     s->frame = data;
 
     ret = mp_decode_frame(s, NULL, buf, buf_size);
@@ -1667,13 +1716,15 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr,
             return ret;
     }
     s->frame_size = 0;
-    return buf_size;
+    return buf_size + skipped;
 }
 
 static void mp_flush(MPADecodeContext *ctx)
 {
     memset(ctx->synth_buf, 0, sizeof(ctx->synth_buf));
+    memset(ctx->mdct_buf, 0, sizeof(ctx->mdct_buf));
     ctx->last_buf_size = 0;
+    ctx->dither_state = 0;
 }
 
 static void flush(AVCodecContext *avctx)
@@ -1690,6 +1741,7 @@ static int decode_frame_adu(AVCodecContext *avctx, void *data,
     MPADecodeContext *s = avctx->priv_data;
     uint32_t header;
     int len, ret;
+    int av_unused out_size;
 
     len = buf_size;
 
@@ -1784,7 +1836,7 @@ static av_cold int decode_close_mp3on4(AVCodecContext * avctx)
     int i;
 
     for (i = 0; i < s->frames; i++)
-        av_free(s->mp3decctx[i]);
+        av_freep(&s->mp3decctx[i]);
 
     return 0;
 }
@@ -1843,6 +1895,7 @@ static av_cold int decode_init_mp3on4(AVCodecContext * avctx)
         s->mp3decctx[i]->adu_mode = 1;
         s->mp3decctx[i]->avctx = avctx;
         s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp;
+        s->mp3decctx[i]->fdsp = s->mp3decctx[0]->fdsp;
     }
 
     return 0;
@@ -1878,10 +1931,8 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = MPA_FRAME_SIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     out_samples = (OUT_INT **)frame->extended_data;
 
     // Discard too short frames
@@ -1895,7 +1946,7 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
         fsize = AV_RB16(buf) >> 4;
         fsize = FFMIN3(fsize, len, MPA_MAX_CODED_FRAME_SIZE);
         m     = s->mp3decctx[fr];
-        assert(m != NULL);
+        av_assert1(m);
 
         if (fsize < HEADER_SIZE) {
             av_log(avctx, AV_LOG_ERROR, "Frame size smaller than header size\n");
@@ -1903,8 +1954,10 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
         }
         header = (AV_RB32(buf) & 0x000fffff) | s->syncword; // patch header
 
-        if (ff_mpa_check_header(header) < 0) // Bad header, discard block
-            break;
+        if (ff_mpa_check_header(header) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Bad header, discard block\n");
+            return AVERROR_INVALIDDATA;
+        }
 
         avpriv_mpegaudio_decode_header((MPADecodeHeader *)m, header);
 
@@ -1920,8 +1973,13 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
         if (m->nb_channels > 1)
             outptr[1] = out_samples[s->coff[fr] + 1];
 
-        if ((ret = mp_decode_frame(m, outptr, buf, fsize)) < 0)
-            return ret;
+        if ((ret = mp_decode_frame(m, outptr, buf, fsize)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "failed to decode channel %d\n", ch);
+            memset(outptr[0], 0, MPA_FRAME_SIZE*sizeof(OUT_INT));
+            if (m->nb_channels > 1)
+                memset(outptr[1], 0, MPA_FRAME_SIZE*sizeof(OUT_INT));
+            ret = m->nb_channels * MPA_FRAME_SIZE*sizeof(OUT_INT);
+        }
 
         out_size += ret;
         buf      += fsize;
@@ -1929,6 +1987,10 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
 
         avctx->bit_rate += m->bit_rate;
     }
+    if (ch != avctx->channels) {
+        av_log(avctx, AV_LOG_ERROR, "failed to decode all channels\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     /* update codec info */
     avctx->sample_rate = s->mp3decctx[0]->sample_rate;
diff --git a/libavcodec/mpegaudiodecheader.c b/libavcodec/mpegaudiodecheader.c
index 3d840d7ecb..d522c064ac 100644
--- a/libavcodec/mpegaudiodecheader.c
+++ b/libavcodec/mpegaudiodecheader.c
@@ -2,20 +2,20 @@
  * MPEG Audio header decoder
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include "libavutil/common.h"
 
 #include "avcodec.h"
+#include "internal.h"
 #include "mpegaudio.h"
 #include "mpegaudiodata.h"
 #include "mpegaudiodecheader.h"
@@ -112,7 +113,7 @@ int avpriv_mpegaudio_decode_header(MPADecodeHeader *s, uint32_t header)
     return 0;
 }
 
-int avpriv_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bit_rate)
+int avpriv_mpa_decode_header2(uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bit_rate, enum AVCodecID *codec_id)
 {
     MPADecodeHeader s1, *s = &s1;
 
@@ -125,17 +126,17 @@ int avpriv_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_r
 
     switch(s->layer) {
     case 1:
-        avctx->codec_id = AV_CODEC_ID_MP1;
+        *codec_id = AV_CODEC_ID_MP1;
         *frame_size = 384;
         break;
     case 2:
-        avctx->codec_id = AV_CODEC_ID_MP2;
+        *codec_id = AV_CODEC_ID_MP2;
         *frame_size = 1152;
         break;
     default:
     case 3:
-        if (avctx->codec_id != AV_CODEC_ID_MP3ADU)
-            avctx->codec_id = AV_CODEC_ID_MP3;
+        if (*codec_id != AV_CODEC_ID_MP3ADU)
+            *codec_id = AV_CODEC_ID_MP3;
         if (s->lsf)
             *frame_size = 576;
         else
@@ -148,3 +149,8 @@ int avpriv_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_r
     *bit_rate = s->bit_rate;
     return s->frame_size;
 }
+
+int avpriv_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bit_rate)
+{
+    return avpriv_mpa_decode_header2(head, sample_rate, channels, frame_size, bit_rate, &avctx->codec_id);
+}
diff --git a/libavcodec/mpegaudiodecheader.h b/libavcodec/mpegaudiodecheader.h
index 764e8abde4..444b85f265 100644
--- a/libavcodec/mpegaudiodecheader.h
+++ b/libavcodec/mpegaudiodecheader.h
@@ -2,20 +2,20 @@
  * MPEG Audio header decoder
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,6 +56,8 @@ int avpriv_mpegaudio_decode_header(MPADecodeHeader *s, uint32_t header);
    header, otherwise the coded frame size in bytes */
 int avpriv_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bitrate);
 
+int avpriv_mpa_decode_header2(uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bitrate, enum AVCodecID *codec_id);
+
 /* fast header check for resync */
 static inline int ff_mpa_check_header(uint32_t header){
     /* header */
diff --git a/libavcodec/mpegaudiodectab.h b/libavcodec/mpegaudiodectab.h
index 1221657988..accd12b8e2 100644
--- a/libavcodec/mpegaudiodectab.h
+++ b/libavcodec/mpegaudiodectab.h
@@ -2,20 +2,20 @@
  * MPEG Audio decoder
  * copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudiodsp.c b/libavcodec/mpegaudiodsp.c
index 58ea1d1585..5fe3444896 100644
--- a/libavcodec/mpegaudiodsp.c
+++ b/libavcodec/mpegaudiodsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,4 +45,6 @@ av_cold void ff_mpadsp_init(MPADSPContext *s)
     if (ARCH_ARM)     ff_mpadsp_init_arm(s);
     if (ARCH_PPC)     ff_mpadsp_init_ppc(s);
     if (ARCH_X86)     ff_mpadsp_init_x86(s);
+    if (HAVE_MIPSFPU)   ff_mpadsp_init_mipsfpu(s);
+    if (HAVE_MIPSDSPR1) ff_mpadsp_init_mipsdspr1(s);
 }
diff --git a/libavcodec/mpegaudiodsp.h b/libavcodec/mpegaudiodsp.h
index 909c652951..a722a2f36b 100644
--- a/libavcodec/mpegaudiodsp.h
+++ b/libavcodec/mpegaudiodsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,7 @@ typedef struct MPADSPContext {
                                int *dither_state, int16_t *samples, int incr);
     void (*dct32_float)(float *dst, const float *src);
     void (*dct32_fixed)(int *dst, const int *src);
+
     void (*imdct36_blocks_float)(float *out, float *buf, float *in,
                                  int count, int switch_point, int block_type);
     void (*imdct36_blocks_fixed)(int *out, int *buf, int *in,
@@ -58,6 +59,8 @@ void ff_mpadsp_init_aarch64(MPADSPContext *s);
 void ff_mpadsp_init_arm(MPADSPContext *s);
 void ff_mpadsp_init_ppc(MPADSPContext *s);
 void ff_mpadsp_init_x86(MPADSPContext *s);
+void ff_mpadsp_init_mipsfpu(MPADSPContext *s);
+void ff_mpadsp_init_mipsdspr1(MPADSPContext *s);
 
 void ff_mpa_synth_init_float(float *window);
 void ff_mpa_synth_init_fixed(int32_t *window);
diff --git a/libavcodec/mpegaudiodsp_data.c b/libavcodec/mpegaudiodsp_data.c
index 5cf86b8352..4550de9b80 100644
--- a/libavcodec/mpegaudiodsp_data.c
+++ b/libavcodec/mpegaudiodsp_data.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudiodsp_fixed.c b/libavcodec/mpegaudiodsp_fixed.c
index 3c49a568b7..83c9d66095 100644
--- a/libavcodec/mpegaudiodsp_fixed.c
+++ b/libavcodec/mpegaudiodsp_fixed.c
@@ -1,20 +1,20 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#define CONFIG_FLOAT 0
+#define USE_FLOATS 0
 #include "mpegaudiodsp_template.c"
diff --git a/libavcodec/mpegaudiodsp_float.c b/libavcodec/mpegaudiodsp_float.c
index 2d8d53ea26..c45b136089 100644
--- a/libavcodec/mpegaudiodsp_float.c
+++ b/libavcodec/mpegaudiodsp_float.c
@@ -1,20 +1,20 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#define CONFIG_FLOAT 1
+#define USE_FLOATS 1
 #include "mpegaudiodsp_template.c"
diff --git a/libavcodec/mpegaudiodsp_template.c b/libavcodec/mpegaudiodsp_template.c
index 621bbd4eba..62454ca306 100644
--- a/libavcodec/mpegaudiodsp_template.c
+++ b/libavcodec/mpegaudiodsp_template.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "mpegaudiodsp.h"
 #include "mpegaudio.h"
 
-#if CONFIG_FLOAT
+#if USE_FLOATS
 #define RENAME(n) n##_float
 
 static inline float round_sample(float *sum)
@@ -125,7 +125,7 @@ void RENAME(ff_mpadsp_apply_window)(MPA_INT *synth_buf, MPA_INT *window,
     register const MPA_INT *w, *w2, *p;
     int j;
     OUT_INT *samples2;
-#if CONFIG_FLOAT
+#if USE_FLOATS
     float sum, sum2;
 #else
     int64_t sum, sum2;
@@ -200,7 +200,7 @@ av_cold void RENAME(ff_mpa_synth_init)(MPA_INT *window)
     for(i=0;i<257;i++) {
         INTFLOAT v;
         v = ff_mpa_enwindow[i];
-#if CONFIG_FLOAT
+#if USE_FLOATS
         v *= 1.0 / (1LL<<(16 + FRAC_BITS));
 #endif
         window[i] = v;
@@ -243,7 +243,7 @@ av_cold void RENAME(ff_init_mpadsp_tabs)(void)
                 else if (i <  18) d = 1;
             }
             //merge last stage of imdct into the window coefficients
-            d *= 0.5 / cos(M_PI * (2 * i + 19) / 72);
+            d *= 0.5 * IMDCT_SCALAR / cos(M_PI * (2 * i + 19) / 72);
 
             if (j == 2)
                 RENAME(ff_mdct_win)[j][i/3] = FIXHR((d / (1<<5)));
@@ -398,3 +398,4 @@ void RENAME(ff_imdct36_blocks)(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in,
         out++;
     }
 }
+
diff --git a/libavcodec/mpegaudioenc_fixed.c b/libavcodec/mpegaudioenc_fixed.c
new file mode 100644
index 0000000000..022b6fedd3
--- /dev/null
+++ b/libavcodec/mpegaudioenc_fixed.c
@@ -0,0 +1,41 @@
+/*
+ * The simplest mpeg audio layer 2 encoder
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "mpegaudioenc_template.c"
+
+AVCodec ff_mp2fixed_encoder = {
+    .name                  = "mp2fixed",
+    .long_name             = NULL_IF_CONFIG_SMALL("MP2 fixed point (MPEG audio layer 2)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_MP2,
+    .priv_data_size        = sizeof(MpegAudioContext),
+    .init                  = MPA_encode_init,
+    .encode2               = MPA_encode_frame,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
+                                                            AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = (const int[]){
+        44100, 48000,  32000, 22050, 24000, 16000, 0
+    },
+    .channel_layouts       = (const uint64_t[]){ AV_CH_LAYOUT_MONO,
+                                                 AV_CH_LAYOUT_STEREO,
+                                                 0 },
+    .defaults              = mp2_defaults,
+};
diff --git a/libavcodec/mpegaudioenc_float.c b/libavcodec/mpegaudioenc_float.c
new file mode 100644
index 0000000000..4d4ab2d7fa
--- /dev/null
+++ b/libavcodec/mpegaudioenc_float.c
@@ -0,0 +1,42 @@
+/*
+ * The simplest mpeg audio layer 2 encoder
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FLOATS 1
+#include "mpegaudioenc_template.c"
+
+AVCodec ff_mp2_encoder = {
+    .name                  = "mp2",
+    .long_name             = NULL_IF_CONFIG_SMALL("MP2 (MPEG audio layer 2)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_MP2,
+    .priv_data_size        = sizeof(MpegAudioContext),
+    .init                  = MPA_encode_init,
+    .encode2               = MPA_encode_frame,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
+                                                            AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = (const int[]){
+        44100, 48000,  32000, 22050, 24000, 16000, 0
+    },
+    .channel_layouts       = (const uint64_t[]){ AV_CH_LAYOUT_MONO,
+                                                 AV_CH_LAYOUT_STEREO,
+                                                 0 },
+    .defaults              = mp2_defaults,
+};
diff --git a/libavcodec/mpegaudioenc.c b/libavcodec/mpegaudioenc_template.c
index 7b0261f2cc..ce93cc7c6d 100644
--- a/libavcodec/mpegaudioenc.c
+++ b/libavcodec/mpegaudioenc_template.c
@@ -2,20 +2,20 @@
  * The simplest mpeg audio layer 2 encoder
  * Copyright (c) 2000, 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,7 +64,12 @@ typedef struct MpegAudioContext {
     int16_t filter_bank[512];
     int scale_factor_table[64];
     unsigned char scale_diff_table[128];
+#if USE_FLOATS
     float scale_factor_inv_table[64];
+#else
+    int8_t scale_factor_shift[64];
+    unsigned short scale_factor_mult[64];
+#endif
     unsigned short total_quant_bits[17]; /* total number of bits per allocation group */
 } MpegAudioContext;
 
@@ -103,10 +108,15 @@ static av_cold int MPA_encode_init(AVCodecContext *avctx)
     s->freq_index = i;
 
     /* encoding bitrate & frequency */
-    for(i=0;i<15;i++) {
+    for(i=1;i<15;i++) {
         if (avpriv_mpa_bitrate_tab[s->lsf][1][i] == bitrate)
             break;
     }
+    if (i == 15 && !avctx->bit_rate) {
+        i = 14;
+        bitrate = avpriv_mpa_bitrate_tab[s->lsf][1][i];
+        avctx->bit_rate = bitrate * 1000;
+    }
     if (i == 15){
         av_log(avctx, AV_LOG_ERROR, "bitrate %d is not allowed in mp2\n", bitrate);
         return AVERROR(EINVAL);
@@ -149,11 +159,17 @@ static av_cold int MPA_encode_init(AVCodecContext *avctx)
     }
 
     for(i=0;i<64;i++) {
-        v = (int)(pow(2.0, (3 - i) / 3.0) * (1 << 20));
+        v = (int)(exp2((3 - i) / 3.0) * (1 << 20));
         if (v <= 0)
             v = 1;
         s->scale_factor_table[i] = v;
-        s->scale_factor_inv_table[i] = pow(2.0, -(3 - i) / 3.0) / (float)(1 << 20);
+#if USE_FLOATS
+        s->scale_factor_inv_table[i] = exp2(-(3 - i) / 3.0) / (float)(1 << 20);
+#else
+#define P 15
+        s->scale_factor_shift[i] = 21 - P - (i / 3);
+        s->scale_factor_mult[i] = (1 << P) * exp2((i % 3) / 3.0);
+#endif
     }
     for(i=0;i<128;i++) {
         v = i - 64;
@@ -397,7 +413,7 @@ static void compute_scale_factors(MpegAudioContext *s,
             ff_dlog(NULL, "%2d:%d in=%x %x %d\n",
                     j, i, vmax, s->scale_factor_table[index], index);
             /* store the scale factor */
-            assert(index >=0 && index <= 63);
+            av_assert2(index >=0 && index <= 63);
             sf[i] = index;
         }
 
@@ -459,7 +475,7 @@ static void compute_scale_factors(MpegAudioContext *s,
             sf[1] = sf[2] = sf[0];
             break;
         default:
-            assert(0); //cannot happen
+            av_assert2(0); //cannot happen
             code = 0;           /* kill warning */
         }
 
@@ -579,7 +595,7 @@ static void compute_bit_allocation(MpegAudioContext *s,
         }
     }
     *padding = max_frame_size - current_frame_size;
-    assert(*padding >= 0);
+    av_assert0(*padding >= 0);
 }
 
 /*
@@ -668,14 +684,36 @@ static void encode_frame(MpegAudioContext *s,
                         qindex = s->alloc_table[j+b];
                         steps = ff_mpa_quant_steps[qindex];
                         for(m=0;m<3;m++) {
-                            float a;
                             sample = s->sb_samples[ch][k][l + m][i];
                             /* divide by scale factor */
-                            a = (float)sample * s->scale_factor_inv_table[s->scale_factors[ch][i][k]];
-                            q[m] = (int)((a + 1.0) * steps * 0.5);
+#if USE_FLOATS
+                            {
+                                float a;
+                                a = (float)sample * s->scale_factor_inv_table[s->scale_factors[ch][i][k]];
+                                q[m] = (int)((a + 1.0) * steps * 0.5);
+                            }
+#else
+                            {
+                                int q1, e, shift, mult;
+                                e = s->scale_factors[ch][i][k];
+                                shift = s->scale_factor_shift[e];
+                                mult = s->scale_factor_mult[e];
+
+                                /* normalize to P bits */
+                                if (shift < 0)
+                                    q1 = sample << (-shift);
+                                else
+                                    q1 = sample >> shift;
+                                q1 = (q1 * mult) >> P;
+                                q1 += 1 << P;
+                                if (q1 < 0)
+                                    q1 = 0;
+                                q[m] = (q1 * (unsigned)steps) >> (P + 1);
+                            }
+#endif
                             if (q[m] >= steps)
                                 q[m] = steps - 1;
-                            assert(q[m] >= 0 && q[m] < steps);
+                            av_assert2(q[m] >= 0 && q[m] < steps);
                         }
                         bits = ff_mpa_quant_bits[qindex];
                         if (bits < 0) {
@@ -725,10 +763,8 @@ static int MPA_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     }
     compute_bit_allocation(s, smr, bit_alloc, &padding);
 
-    if ((ret = ff_alloc_packet(avpkt, MPA_MAX_CODED_FRAME_SIZE))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, MPA_MAX_CODED_FRAME_SIZE, 0)) < 0)
         return ret;
-    }
 
     init_put_bits(&s->pb, avpkt->data, avpkt->size);
 
@@ -743,25 +779,7 @@ static int MPA_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 }
 
 static const AVCodecDefault mp2_defaults[] = {
-    { "b", "384000" },
+    { "b", "0" },
     { NULL },
 };
 
-AVCodec ff_mp2_encoder = {
-    .name                  = "mp2",
-    .long_name             = NULL_IF_CONFIG_SMALL("MP2 (MPEG audio layer 2)"),
-    .type                  = AVMEDIA_TYPE_AUDIO,
-    .id                    = AV_CODEC_ID_MP2,
-    .priv_data_size        = sizeof(MpegAudioContext),
-    .init                  = MPA_encode_init,
-    .encode2               = MPA_encode_frame,
-    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
-                                                            AV_SAMPLE_FMT_NONE },
-    .supported_samplerates = (const int[]){
-        44100, 48000,  32000, 22050, 24000, 16000, 0
-    },
-    .channel_layouts       = (const uint64_t[]){ AV_CH_LAYOUT_MONO,
-                                                 AV_CH_LAYOUT_STEREO,
-                                                 0 },
-    .defaults              = mp2_defaults,
-};
diff --git a/libavcodec/mpegaudiotab.h b/libavcodec/mpegaudiotab.h
index d30ef1b752..42d42d883d 100644
--- a/libavcodec/mpegaudiotab.h
+++ b/libavcodec/mpegaudiotab.h
@@ -4,20 +4,20 @@
  *
  * Copyright (c) 2000, 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegpicture.c b/libavcodec/mpegpicture.c
index f3a9dd05af..16b8f52fa7 100644
--- a/libavcodec/mpegpicture.c
+++ b/libavcodec/mpegpicture.c
@@ -1,20 +1,20 @@
 /*
  * Mpeg video formats-related picture management functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,17 +56,30 @@ do {\
 int ff_mpeg_framesize_alloc(AVCodecContext *avctx, MotionEstContext *me,
                             ScratchpadContext *sc, int linesize)
 {
-    int alloc_size = FFALIGN(FFABS(linesize) + 32, 32);
+    int alloc_size = FFALIGN(FFABS(linesize) + 64, 32);
+
+    if (avctx->hwaccel
+#if FF_API_CAP_VDPAU
+        || avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU
+#endif
+        )
+        return 0;
+
+    if (linesize < 24) {
+        av_log(avctx, AV_LOG_ERROR, "Image too small, temporary buffers cannot function\n");
+        return AVERROR_PATCHWELCOME;
+    }
 
     // edge emu needs blocksize + filter length - 1
     // (= 17x17 for  halfpel / 21x21 for  h264)
     // VC1 computes luma and chroma simultaneously and needs 19X19 + 9x9
     // at uvlinesize. It supports only YUV420 so 24x24 is enough
     // linesize * interlaced * MBsize
-    FF_ALLOCZ_OR_GOTO(avctx, sc->edge_emu_buffer, alloc_size * 2 * 24,
+    // we also use this buffer for encoding in encode_mb_internal() needig an additional 32 lines
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, sc->edge_emu_buffer, alloc_size, 4 * 68,
                       fail);
 
-    FF_ALLOCZ_OR_GOTO(avctx, me->scratchpad, alloc_size * 2 * 16 * 3,
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, me->scratchpad, alloc_size, 4 * 16 * 2,
                       fail)
     me->temp            = me->scratchpad;
     sc->rd_scratchpad   = me->scratchpad;
@@ -165,8 +178,8 @@ static int alloc_frame_buffer(AVCodecContext *avctx,  Picture *pic,
     return 0;
 }
 
-static int alloc_picture_tables(Picture *pic, int encoding, int out_format,
-                                int mb_stride, int mb_height, int b8_stride)
+static int alloc_picture_tables(AVCodecContext *avctx, Picture *pic, int encoding, int out_format,
+                                int mb_stride, int mb_width, int mb_height, int b8_stride)
 {
     const int big_mb_num    = mb_stride * (mb_height + 1) + 1;
     const int mb_array_size = mb_stride * mb_height;
@@ -189,7 +202,8 @@ static int alloc_picture_tables(Picture *pic, int encoding, int out_format,
             return AVERROR(ENOMEM);
     }
 
-    if (out_format == FMT_H263 || encoding) {
+    if (out_format == FMT_H263 || encoding || avctx->debug_mv ||
+        (avctx->flags2 & AV_CODEC_FLAG2_EXPORT_MVS)) {
         int mv_size        = 2 * (b8_array_size + 4) * sizeof(int16_t);
         int ref_index_size = 4 * mb_array_size;
 
@@ -201,6 +215,9 @@ static int alloc_picture_tables(Picture *pic, int encoding, int out_format,
         }
     }
 
+    pic->alloc_mb_width  = mb_width;
+    pic->alloc_mb_height = mb_height;
+
     return 0;
 }
 
@@ -211,16 +228,21 @@ static int alloc_picture_tables(Picture *pic, int encoding, int out_format,
 int ff_alloc_picture(AVCodecContext *avctx, Picture *pic, MotionEstContext *me,
                      ScratchpadContext *sc, int shared, int encoding,
                      int chroma_x_shift, int chroma_y_shift, int out_format,
-                     int mb_stride, int mb_height, int b8_stride,
+                     int mb_stride, int mb_width, int mb_height, int b8_stride,
                      ptrdiff_t *linesize, ptrdiff_t *uvlinesize)
 {
     int i, ret;
 
+    if (pic->qscale_table_buf)
+        if (   pic->alloc_mb_width  != mb_width
+            || pic->alloc_mb_height != mb_height)
+            ff_free_picture_tables(pic);
+
     if (shared) {
-        assert(pic->f->data[0]);
+        av_assert0(pic->f->data[0]);
         pic->shared = 1;
     } else {
-        assert(!pic->f->buf[0]);
+        av_assert0(!pic->f->buf[0]);
         if (alloc_frame_buffer(avctx, pic, me, sc,
                                chroma_x_shift, chroma_y_shift,
                                *linesize, *uvlinesize) < 0)
@@ -231,8 +253,8 @@ int ff_alloc_picture(AVCodecContext *avctx, Picture *pic, MotionEstContext *me,
     }
 
     if (!pic->qscale_table_buf)
-        ret = alloc_picture_tables(pic, encoding, out_format,
-                                   mb_stride, mb_height, b8_stride);
+        ret = alloc_picture_tables(avctx, pic, encoding, out_format,
+                                   mb_stride, mb_width, mb_height, b8_stride);
     else
         ret = make_tables_writable(pic);
     if (ret < 0)
@@ -268,6 +290,8 @@ fail:
  */
 void ff_mpeg_unref_picture(AVCodecContext *avctx, Picture *pic)
 {
+    int off = offsetof(Picture, mb_mean) + sizeof(pic->mb_mean);
+
     pic->tf.f = pic->f;
     /* WM Image / Screen codecs allocate internal buffers with different
      * dimensions / colorspaces; ignore user-defined callbacks for these. */
@@ -282,6 +306,8 @@ void ff_mpeg_unref_picture(AVCodecContext *avctx, Picture *pic)
 
     if (pic->needs_realloc)
         ff_free_picture_tables(pic);
+
+    memset((uint8_t*)pic + off, 0, sizeof(*pic) - off);
 }
 
 int ff_update_picture_tables(Picture *dst, Picture *src)
@@ -323,6 +349,9 @@ do {                                                                          \
         dst->ref_index[i]  = src->ref_index[i];
     }
 
+    dst->alloc_mb_width  = src->alloc_mb_width;
+    dst->alloc_mb_height = src->alloc_mb_height;
+
     return 0;
 }
 
@@ -376,7 +405,7 @@ static inline int pic_is_unused(Picture *pic)
     return 0;
 }
 
-static int find_unused_picture(Picture *picture, int shared)
+static int find_unused_picture(AVCodecContext *avctx, Picture *picture, int shared)
 {
     int i;
 
@@ -392,12 +421,26 @@ static int find_unused_picture(Picture *picture, int shared)
         }
     }
 
-    return AVERROR_INVALIDDATA;
+    av_log(avctx, AV_LOG_FATAL,
+           "Internal error, picture buffer overflow\n");
+    /* We could return -1, but the codec would crash trying to draw into a
+     * non-existing frame anyway. This is safer than waiting for a random crash.
+     * Also the return of this is never useful, an encoder must only allocate
+     * as much as allowed in the specification. This has no relationship to how
+     * much libavcodec could allocate (and MAX_PICTURE_COUNT is always large
+     * enough for such valid streams).
+     * Plus, a decoder has to check stream validity and remove frames if too
+     * many reference frames are around. Waiting for "OOM" is not correct at
+     * all. Similarly, missing reference frames have to be replaced by
+     * interpolated/MC frames, anything else is a bug in the codec ...
+     */
+    abort();
+    return -1;
 }
 
 int ff_find_unused_picture(AVCodecContext *avctx, Picture *picture, int shared)
 {
-    int ret = find_unused_picture(picture, shared);
+    int ret = find_unused_picture(avctx, picture, shared);
 
     if (ret >= 0 && ret < MAX_PICTURE_COUNT) {
         if (picture[ret].needs_realloc) {
@@ -413,6 +456,9 @@ void ff_free_picture_tables(Picture *pic)
 {
     int i;
 
+    pic->alloc_mb_width  =
+    pic->alloc_mb_height = 0;
+
     av_buffer_unref(&pic->mb_var_buf);
     av_buffer_unref(&pic->mc_mb_var_buf);
     av_buffer_unref(&pic->mb_mean_buf);
diff --git a/libavcodec/mpegpicture.h b/libavcodec/mpegpicture.h
index 115c2883d3..2db3d6733a 100644
--- a/libavcodec/mpegpicture.h
+++ b/libavcodec/mpegpicture.h
@@ -1,20 +1,20 @@
 /*
  * Mpeg video formats-related defines and utility functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@
 #include "motion_est.h"
 #include "thread.h"
 
-#define MAX_PICTURE_COUNT 32
+#define MAX_PICTURE_COUNT 36
 #define EDGE_WIDTH 16
 
 typedef struct ScratchpadContext {
@@ -67,6 +67,9 @@ typedef struct Picture {
     AVBufferRef *mc_mb_var_buf;
     uint16_t *mc_mb_var;        ///< Table for motion compensated MB variances
 
+    int alloc_mb_width;         ///< mb_width used to allocate tables
+    int alloc_mb_height;        ///< mb_height used to allocate tables
+
     AVBufferRef *mb_mean_buf;
     uint8_t *mb_mean;           ///< Table for MB luminance
 
@@ -75,16 +78,16 @@ typedef struct Picture {
 
     int field_picture;          ///< whether or not the picture was encoded in separate fields
 
-    int mb_var_sum;             ///< sum of MB variance for current frame
-    int mc_mb_var_sum;          ///< motion compensated MB variance for current frame
+    int64_t mb_var_sum;         ///< sum of MB variance for current frame
+    int64_t mc_mb_var_sum;      ///< motion compensated MB variance for current frame
 
-    int b_frame_score;          /* */
+    int b_frame_score;
     int needs_realloc;          ///< Picture needs to be reallocated (eg due to a frame size change)
 
     int reference;
     int shared;
 
-    uint64_t encoding_error[4];
+    uint64_t encoding_error[AV_NUM_DATA_POINTERS];
 } Picture;
 
 /**
@@ -94,7 +97,7 @@ typedef struct Picture {
 int ff_alloc_picture(AVCodecContext *avctx, Picture *pic, MotionEstContext *me,
                      ScratchpadContext *sc, int shared, int encoding,
                      int chroma_x_shift, int chroma_y_shift, int out_format,
-                     int mb_stride, int mb_height, int b8_stride,
+                     int mb_stride, int mb_width, int mb_height, int b8_stride,
                      ptrdiff_t *linesize, ptrdiff_t *uvlinesize);
 
 int ff_mpeg_framesize_alloc(AVCodecContext *avctx, MotionEstContext *me,
diff --git a/libavcodec/mpegutils.c b/libavcodec/mpegutils.c
index bc430f0531..62cc36aa6e 100644
--- a/libavcodec/mpegutils.c
+++ b/libavcodec/mpegutils.c
@@ -1,20 +1,20 @@
 /*
  * Mpeg video formats-related defines and utility functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegutils.h b/libavcodec/mpegutils.h
index 60f971222e..9cfadfc4cf 100644
--- a/libavcodec/mpegutils.h
+++ b/libavcodec/mpegutils.h
@@ -1,20 +1,20 @@
 /*
  * Mpeg video formats-related defines and utility functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index c5e8040758..96634ec7b5 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -5,20 +5,20 @@
  *
  * 4MV & hq & B-frame encoding stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,9 +31,11 @@
 #include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
+#include "libavutil/motion_vector.h"
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "blockdsp.h"
+#include "h264chroma.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "mathops.h"
@@ -44,7 +46,6 @@
 #include "mjpegenc.h"
 #include "msmpeg4.h"
 #include "qpeldsp.h"
-#include "xvmc_internal.h"
 #include "thread.h"
 #include "wmv2.h"
 #include <limits.h>
@@ -57,10 +58,7 @@ static void dct_unquantize_mpeg1_intra_c(MpegEncContext *s,
 
     nCoeffs= s->block_last_index[n];
 
-    if (n < 4)
-        block[0] = block[0] * s->y_dc_scale;
-    else
-        block[0] = block[0] * s->c_dc_scale;
+    block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
     /* XXX: only mpeg1 */
     quant_matrix = s->intra_matrix;
     for(i=1;i<=nCoeffs;i++) {
@@ -116,13 +114,13 @@ static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
 
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
+
     if(s->alternate_scan) nCoeffs= 63;
     else nCoeffs= s->block_last_index[n];
 
-    if (n < 4)
-        block[0] = block[0] * s->y_dc_scale;
-    else
-        block[0] = block[0] * s->c_dc_scale;
+    block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
     quant_matrix = s->intra_matrix;
     for(i=1;i<=nCoeffs;i++) {
         int j= s->intra_scantable.permutated[i];
@@ -130,10 +128,10 @@ static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
         if (level) {
             if (level < 0) {
                 level = -level;
-                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = (int)(level * qscale * quant_matrix[j]) >> 4;
                 level = -level;
             } else {
-                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = (int)(level * qscale * quant_matrix[j]) >> 4;
             }
             block[j] = level;
         }
@@ -147,13 +145,14 @@ static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int sum=-1;
 
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
+
     if(s->alternate_scan) nCoeffs= 63;
     else nCoeffs= s->block_last_index[n];
 
-    if (n < 4)
-        block[0] = block[0] * s->y_dc_scale;
-    else
-        block[0] = block[0] * s->c_dc_scale;
+    block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
+    sum += block[0];
     quant_matrix = s->intra_matrix;
     for(i=1;i<=nCoeffs;i++) {
         int j= s->intra_scantable.permutated[i];
@@ -161,10 +160,10 @@ static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
         if (level) {
             if (level < 0) {
                 level = -level;
-                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = (int)(level * qscale * quant_matrix[j]) >> 4;
                 level = -level;
             } else {
-                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = (int)(level * qscale * quant_matrix[j]) >> 4;
             }
             block[j] = level;
             sum+=level;
@@ -180,6 +179,9 @@ static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int sum=-1;
 
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
+
     if(s->alternate_scan) nCoeffs= 63;
     else nCoeffs= s->block_last_index[n];
 
@@ -191,11 +193,11 @@ static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
             if (level < 0) {
                 level = -level;
                 level = (((level << 1) + 1) * qscale *
-                         ((int) (quant_matrix[j]))) >> 4;
+                         ((int) (quant_matrix[j]))) >> 5;
                 level = -level;
             } else {
                 level = (((level << 1) + 1) * qscale *
-                         ((int) (quant_matrix[j]))) >> 4;
+                         ((int) (quant_matrix[j]))) >> 5;
             }
             block[j] = level;
             sum+=level;
@@ -210,15 +212,12 @@ static void dct_unquantize_h263_intra_c(MpegEncContext *s,
     int i, level, qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
     qmul = qscale << 1;
 
     if (!s->h263_aic) {
-        if (n < 4)
-            block[0] = block[0] * s->y_dc_scale;
-        else
-            block[0] = block[0] * s->c_dc_scale;
+        block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
         qadd = (qscale - 1) | 1;
     }else{
         qadd = 0;
@@ -247,7 +246,7 @@ static void dct_unquantize_h263_inter_c(MpegEncContext *s,
     int i, level, qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     qadd = (qscale - 1) | 1;
     qmul = qscale << 1;
@@ -267,14 +266,41 @@ static void dct_unquantize_h263_inter_c(MpegEncContext *s,
     }
 }
 
+
+static void gray16(uint8_t *dst, const uint8_t *src, ptrdiff_t linesize, int h)
+{
+    while(h--)
+        memset(dst + h*linesize, 128, 16);
+}
+
+static void gray8(uint8_t *dst, const uint8_t *src, ptrdiff_t linesize, int h)
+{
+    while(h--)
+        memset(dst + h*linesize, 128, 8);
+}
+
 /* init common dct for both encoder and decoder */
 static av_cold int dct_init(MpegEncContext *s)
 {
     ff_blockdsp_init(&s->bdsp, s->avctx);
+    ff_h264chroma_init(&s->h264chroma, 8); //for lowres
     ff_hpeldsp_init(&s->hdsp, s->avctx->flags);
     ff_mpegvideodsp_init(&s->mdsp);
     ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample);
 
+    if (s->avctx->debug & FF_DEBUG_NOMC) {
+        int i;
+        for (i=0; i<4; i++) {
+            s->hdsp.avg_pixels_tab[0][i] = gray16;
+            s->hdsp.put_pixels_tab[0][i] = gray16;
+            s->hdsp.put_no_rnd_pixels_tab[0][i] = gray16;
+
+            s->hdsp.avg_pixels_tab[1][i] = gray8;
+            s->hdsp.put_pixels_tab[1][i] = gray8;
+            s->hdsp.put_no_rnd_pixels_tab[1][i] = gray8;
+        }
+    }
+
     s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_c;
     s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_c;
     s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_c;
@@ -287,12 +313,16 @@ static av_cold int dct_init(MpegEncContext *s)
     if (HAVE_INTRINSICS_NEON)
         ff_mpv_common_init_neon(s);
 
+    if (ARCH_ALPHA)
+        ff_mpv_common_init_axp(s);
     if (ARCH_ARM)
         ff_mpv_common_init_arm(s);
     if (ARCH_PPC)
         ff_mpv_common_init_ppc(s);
     if (ARCH_X86)
         ff_mpv_common_init_x86(s);
+    if (ARCH_MIPS)
+        ff_mpv_common_init_mips(s);
 
     return 0;
 }
@@ -319,7 +349,7 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared)
 {
     return ff_alloc_picture(s->avctx, pic, &s->me, &s->sc, shared, 0,
                             s->chroma_x_shift, s->chroma_y_shift, s->out_format,
-                            s->mb_stride, s->mb_height, s->b8_stride,
+                            s->mb_stride, s->mb_width, s->mb_height, s->b8_stride,
                             &s->linesize, &s->uvlinesize);
 }
 
@@ -330,6 +360,9 @@ static int init_duplicate_context(MpegEncContext *s)
     int yc_size = y_size + 2 * c_size;
     int i;
 
+    if (s->mb_height & 1)
+        yc_size += 2*s->b8_stride + 2*s->mb_stride;
+
     s->sc.edge_emu_buffer =
     s->me.scratchpad   =
     s->me.temp         =
@@ -355,10 +388,7 @@ static int init_duplicate_context(MpegEncContext *s)
     }
     if (s->avctx->codec_tag == AV_RL32("VCR2")) {
         // exchange uv
-        int16_t (*tmp)[64];
-        tmp           = s->pblocks[4];
-        s->pblocks[4] = s->pblocks[5];
-        s->pblocks[5] = tmp;
+        FFSWAP(void *, s->pblocks[4], s->pblocks[5]);
     }
 
     if (s->out_format == FMT_H263) {
@@ -436,10 +466,7 @@ int ff_update_duplicate_context(MpegEncContext *dst, MpegEncContext *src)
     }
     if (dst->avctx->codec_tag == AV_RL32("VCR2")) {
         // exchange uv
-        int16_t (*tmp)[64];
-        tmp             = dst->pblocks[4];
-        dst->pblocks[4] = dst->pblocks[5];
-        dst->pblocks[5] = tmp;
+        FFSWAP(void *, dst->pblocks[4], dst->pblocks[5]);
     }
     if (!dst->sc.edge_emu_buffer &&
         (ret = ff_mpeg_framesize_alloc(dst->avctx, &dst->me,
@@ -459,9 +486,11 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst,
     int i, ret;
     MpegEncContext *s = dst->priv_data, *s1 = src->priv_data;
 
-    if (dst == src || !s1->context_initialized)
+    if (dst == src)
         return 0;
 
+    av_assert0(s != s1);
+
     // FIXME can parameters change on I-frames?
     // in that case dst may need a reinit
     if (!s->context_initialized) {
@@ -472,18 +501,24 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst,
         s->bitstream_buffer      = NULL;
         s->bitstream_buffer_size = s->allocated_bitstream_buffer_size = 0;
 
-        ff_mpv_idct_init(s);
-        if ((err = ff_mpv_common_init(s)) < 0)
-            return err;
+        if (s1->context_initialized){
+//             s->picture_range_start  += MAX_PICTURE_COUNT;
+//             s->picture_range_end    += MAX_PICTURE_COUNT;
+            ff_mpv_idct_init(s);
+            if((err = ff_mpv_common_init(s)) < 0){
+                memset(s, 0, sizeof(MpegEncContext));
+                s->avctx = dst;
+                return err;
+            }
+        }
     }
 
     if (s->height != s1->height || s->width != s1->width || s->context_reinit) {
-        int err;
         s->context_reinit = 0;
         s->height = s1->height;
         s->width  = s1->width;
-        if ((err = ff_mpv_common_frame_size_change(s)) < 0)
-            return err;
+        if ((ret = ff_mpv_common_frame_size_change(s)) < 0)
+            return ret;
     }
 
     s->avctx->coded_height  = s1->avctx->coded_height;
@@ -494,9 +529,11 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst,
     s->coded_picture_number = s1->coded_picture_number;
     s->picture_number       = s1->picture_number;
 
+    av_assert0(!s->picture || s->picture != s1->picture);
+    if(s->picture)
     for (i = 0; i < MAX_PICTURE_COUNT; i++) {
         ff_mpeg_unref_picture(s->avctx, &s->picture[i]);
-        if (s1->picture[i].f->buf[0] &&
+        if (s1->picture && s1->picture[i].f->buf[0] &&
             (ret = ff_mpeg_ref_picture(s->avctx, &s->picture[i], &s1->picture[i])) < 0)
             return ret;
     }
@@ -504,7 +541,7 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst,
 #define UPDATE_PICTURE(pic)\
 do {\
     ff_mpeg_unref_picture(s->avctx, &s->pic);\
-    if (s1->pic.f->buf[0])\
+    if (s1->pic.f && s1->pic.f->buf[0])\
         ret = ff_mpeg_ref_picture(s->avctx, &s->pic, &s1->pic);\
     else\
         ret = ff_update_picture_tables(&s->pic, &s1->pic);\
@@ -528,6 +565,7 @@ do {\
     // Error/bug resilience
     s->next_p_frame_damaged = s1->next_p_frame_damaged;
     s->workaround_bugs      = s1->workaround_bugs;
+    s->padding_bug_score    = s1->padding_bug_score;
 
     // MPEG4 timing info
     memcpy(&s->last_time_base, &s1->last_time_base,
@@ -544,11 +582,16 @@ do {\
 
     if (s1->bitstream_buffer) {
         if (s1->bitstream_buffer_size +
-            AV_INPUT_BUFFER_PADDING_SIZE > s->allocated_bitstream_buffer_size)
+            AV_INPUT_BUFFER_PADDING_SIZE > s->allocated_bitstream_buffer_size) {
             av_fast_malloc(&s->bitstream_buffer,
                            &s->allocated_bitstream_buffer_size,
                            s1->allocated_bitstream_buffer_size);
-            s->bitstream_buffer_size = s1->bitstream_buffer_size;
+            if (!s->bitstream_buffer) {
+                s->bitstream_buffer_size = 0;
+                return AVERROR(ENOMEM);
+            }
+        }
+        s->bitstream_buffer_size = s1->bitstream_buffer_size;
         memcpy(s->bitstream_buffer, s1->bitstream_buffer,
                s1->bitstream_buffer_size);
         memset(s->bitstream_buffer + s->bitstream_buffer_size, 0,
@@ -567,7 +610,6 @@ do {\
         } else {
             av_log(s->avctx, AV_LOG_ERROR, "Context scratch buffers could not "
                    "be allocated due to unknown size.\n");
-            return AVERROR_BUG;
         }
 
     // MPEG2/interlacing info
@@ -617,6 +659,18 @@ void ff_mpv_decode_defaults(MpegEncContext *s)
     ff_mpv_common_defaults(s);
 }
 
+void ff_mpv_decode_init(MpegEncContext *s, AVCodecContext *avctx)
+{
+    s->avctx           = avctx;
+    s->width           = avctx->coded_width;
+    s->height          = avctx->coded_height;
+    s->codec_id        = avctx->codec->id;
+    s->workaround_bugs = avctx->workaround_bugs;
+
+    /* convert fourcc to upper case */
+    s->codec_tag          = avpriv_toupper4(avctx->codec_tag);
+}
+
 /**
  * Initialize and allocates MpegEncContext fields dependent on the resolution.
  */
@@ -630,7 +684,7 @@ static int init_context_frame(MpegEncContext *s)
     mb_array_size = s->mb_height * s->mb_stride;
     mv_table_size = (s->mb_height + 2) * s->mb_stride + 1;
 
-    /* set default edge pos, will be overriden
+    /* set default edge pos, will be overridden
      * in decode_header if needed */
     s->h_edge_pos = s->mb_width * 16;
     s->v_edge_pos = s->mb_height * 16;
@@ -648,44 +702,35 @@ static int init_context_frame(MpegEncContext *s)
     c_size  = s->mb_stride * (s->mb_height + 1);
     yc_size = y_size + 2   * c_size;
 
-    FF_ALLOCZ_OR_GOTO(s->avctx, s->mb_index2xy, (s->mb_num + 1) * sizeof(int),
-                      fail); // error ressilience code looks cleaner with this
+    if (s->mb_height & 1)
+        yc_size += 2*s->b8_stride + 2*s->mb_stride;
+
+    FF_ALLOCZ_OR_GOTO(s->avctx, s->mb_index2xy, (s->mb_num + 1) * sizeof(int), fail); // error ressilience code looks cleaner with this
     for (y = 0; y < s->mb_height; y++)
         for (x = 0; x < s->mb_width; x++)
             s->mb_index2xy[x + y * s->mb_width] = x + y * s->mb_stride;
 
-    s->mb_index2xy[s->mb_height * s->mb_width] =
-        (s->mb_height - 1) * s->mb_stride + s->mb_width; // FIXME really needed?
+    s->mb_index2xy[s->mb_height * s->mb_width] = (s->mb_height - 1) * s->mb_stride + s->mb_width; // FIXME really needed?
 
     if (s->encoding) {
         /* Allocate MV tables */
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->p_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_forw_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_back_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_bidir_forw_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_bidir_back_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_direct_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->p_mv_table_base,                 mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_forw_mv_table_base,            mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_back_mv_table_base,            mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_bidir_forw_mv_table_base,      mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_bidir_back_mv_table_base,      mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_direct_mv_table_base,          mv_table_size * 2 * sizeof(int16_t), fail)
         s->p_mv_table            = s->p_mv_table_base + s->mb_stride + 1;
         s->b_forw_mv_table       = s->b_forw_mv_table_base + s->mb_stride + 1;
         s->b_back_mv_table       = s->b_back_mv_table_base + s->mb_stride + 1;
-        s->b_bidir_forw_mv_table = s->b_bidir_forw_mv_table_base +
-                                   s->mb_stride + 1;
-        s->b_bidir_back_mv_table = s->b_bidir_back_mv_table_base +
-                                   s->mb_stride + 1;
+        s->b_bidir_forw_mv_table = s->b_bidir_forw_mv_table_base + s->mb_stride + 1;
+        s->b_bidir_back_mv_table = s->b_bidir_back_mv_table_base + s->mb_stride + 1;
         s->b_direct_mv_table     = s->b_direct_mv_table_base + s->mb_stride + 1;
 
         /* Allocate MB type table */
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->mb_type, mb_array_size *
-                          sizeof(uint16_t), fail); // needed for encoding
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->mb_type, mb_array_size * sizeof(uint16_t), fail) // needed for encoding
 
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->lambda_table, mb_array_size *
-                          sizeof(int), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->lambda_table, mb_array_size * sizeof(int), fail)
 
         FF_ALLOC_OR_GOTO(s->avctx, s->cplx_tab,
                          mb_array_size * sizeof(float), fail);
@@ -708,34 +753,27 @@ static int init_context_frame(MpegEncContext *s)
                     s->b_field_mv_table[i][j][k] = s->b_field_mv_table_base[i][j][k] +
                                                    s->mb_stride + 1;
                 }
-                FF_ALLOCZ_OR_GOTO(s->avctx, s->b_field_select_table [i][j],
-                                  mb_array_size * 2 * sizeof(uint8_t), fail);
-                FF_ALLOCZ_OR_GOTO(s->avctx, s->p_field_mv_table_base[i][j],
-                                  mv_table_size * 2 * sizeof(int16_t), fail);
-                s->p_field_mv_table[i][j] = s->p_field_mv_table_base[i][j]
-                                            + s->mb_stride + 1;
+                FF_ALLOCZ_OR_GOTO(s->avctx, s->b_field_select_table [i][j], mb_array_size * 2 * sizeof(uint8_t), fail)
+                FF_ALLOCZ_OR_GOTO(s->avctx, s->p_field_mv_table_base[i][j], mv_table_size * 2 * sizeof(int16_t), fail)
+                s->p_field_mv_table[i][j] = s->p_field_mv_table_base[i][j] + s->mb_stride + 1;
             }
-            FF_ALLOCZ_OR_GOTO(s->avctx, s->p_field_select_table[i],
-                              mb_array_size * 2 * sizeof(uint8_t), fail);
+            FF_ALLOCZ_OR_GOTO(s->avctx, s->p_field_select_table[i], mb_array_size * 2 * sizeof(uint8_t), fail)
         }
     }
     if (s->out_format == FMT_H263) {
         /* cbp values */
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->coded_block_base, y_size, fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->coded_block_base, y_size + (s->mb_height&1)*2*s->b8_stride, fail);
         s->coded_block = s->coded_block_base + s->b8_stride + 1;
 
         /* cbp, ac_pred, pred_dir */
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->cbp_table,
-                          mb_array_size * sizeof(uint8_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->pred_dir_table,
-                          mb_array_size * sizeof(uint8_t), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->cbp_table     , mb_array_size * sizeof(uint8_t), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->pred_dir_table, mb_array_size * sizeof(uint8_t), fail);
     }
 
     if (s->h263_pred || s->h263_plus || !s->encoding) {
         /* dc values */
         // MN: we need these for  error resilience of intra-frames
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->dc_val_base,
-                          yc_size * sizeof(int16_t), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->dc_val_base, yc_size * sizeof(int16_t), fail);
         s->dc_val[0] = s->dc_val_base + s->b8_stride + 1;
         s->dc_val[1] = s->dc_val_base + y_size + s->mb_stride + 1;
         s->dc_val[2] = s->dc_val[1] + c_size;
@@ -756,6 +794,82 @@ fail:
     return AVERROR(ENOMEM);
 }
 
+static void clear_context(MpegEncContext *s)
+{
+    int i, j, k;
+
+    memset(&s->next_picture, 0, sizeof(s->next_picture));
+    memset(&s->last_picture, 0, sizeof(s->last_picture));
+    memset(&s->current_picture, 0, sizeof(s->current_picture));
+    memset(&s->new_picture, 0, sizeof(s->new_picture));
+
+    memset(s->thread_context, 0, sizeof(s->thread_context));
+
+    s->me.map = NULL;
+    s->me.score_map = NULL;
+    s->dct_error_sum = NULL;
+    s->block = NULL;
+    s->blocks = NULL;
+    memset(s->pblocks, 0, sizeof(s->pblocks));
+    s->ac_val_base = NULL;
+    s->ac_val[0] =
+    s->ac_val[1] =
+    s->ac_val[2] =NULL;
+    s->sc.edge_emu_buffer = NULL;
+    s->me.scratchpad = NULL;
+    s->me.temp =
+    s->sc.rd_scratchpad =
+    s->sc.b_scratchpad =
+    s->sc.obmc_scratchpad = NULL;
+
+    s->parse_context.buffer = NULL;
+    s->parse_context.buffer_size = 0;
+    s->bitstream_buffer = NULL;
+    s->allocated_bitstream_buffer_size = 0;
+    s->picture          = NULL;
+    s->mb_type          = NULL;
+    s->p_mv_table_base  = NULL;
+    s->b_forw_mv_table_base = NULL;
+    s->b_back_mv_table_base = NULL;
+    s->b_bidir_forw_mv_table_base = NULL;
+    s->b_bidir_back_mv_table_base = NULL;
+    s->b_direct_mv_table_base = NULL;
+    s->p_mv_table            = NULL;
+    s->b_forw_mv_table       = NULL;
+    s->b_back_mv_table       = NULL;
+    s->b_bidir_forw_mv_table = NULL;
+    s->b_bidir_back_mv_table = NULL;
+    s->b_direct_mv_table     = NULL;
+    for (i = 0; i < 2; i++) {
+        for (j = 0; j < 2; j++) {
+            for (k = 0; k < 2; k++) {
+                s->b_field_mv_table_base[i][j][k] = NULL;
+                s->b_field_mv_table[i][j][k] = NULL;
+            }
+            s->b_field_select_table[i][j] = NULL;
+            s->p_field_mv_table_base[i][j] = NULL;
+            s->p_field_mv_table[i][j] = NULL;
+        }
+        s->p_field_select_table[i] = NULL;
+    }
+
+    s->dc_val_base = NULL;
+    s->coded_block_base = NULL;
+    s->mbintra_table = NULL;
+    s->cbp_table = NULL;
+    s->pred_dir_table = NULL;
+
+    s->mbskip_table = NULL;
+
+    s->er.error_status_table = NULL;
+    s->er.er_temp_buffer = NULL;
+    s->mb_index2xy = NULL;
+    s->lambda_table = NULL;
+
+    s->cplx_tab = NULL;
+    s->bits_tab = NULL;
+}
+
 /**
  * init common structure for both encoder and decoder.
  * this assumes that some variables like width/height are already set
@@ -767,6 +881,8 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
                      s->avctx->active_thread_type & FF_THREAD_SLICE) ?
                     s->avctx->thread_count : 1;
 
+    clear_context(s);
+
     if (s->encoding && s->avctx->slices)
         nb_slices = s->avctx->slices;
 
@@ -799,12 +915,10 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
     dct_init(s);
 
     /* set chroma shifts */
-    av_pix_fmt_get_chroma_sub_sample(s->avctx->pix_fmt,
-                                     &s->chroma_x_shift,
-                                     &s->chroma_y_shift);
+    avcodec_get_chroma_sub_sample(s->avctx->pix_fmt,
+                                  &s->chroma_x_shift,
+                                  &s->chroma_y_shift);
 
-    /* convert fourcc to upper case */
-    s->codec_tag          = avpriv_toupper4(s->avctx->codec_tag);
 
     FF_ALLOCZ_OR_GOTO(s->avctx, s->picture,
                       MAX_PICTURE_COUNT * sizeof(Picture), fail);
@@ -813,10 +927,6 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
         if (!s->picture[i].f)
             goto fail;
     }
-    memset(&s->next_picture, 0, sizeof(s->next_picture));
-    memset(&s->last_picture, 0, sizeof(s->last_picture));
-    memset(&s->current_picture, 0, sizeof(s->current_picture));
-    memset(&s->new_picture, 0, sizeof(s->new_picture));
     s->next_picture.f = av_frame_alloc();
     if (!s->next_picture.f)
         goto fail;
@@ -830,24 +940,23 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
     if (!s->new_picture.f)
         goto fail;
 
-    if (s->width && s->height) {
         if (init_context_frame(s))
             goto fail;
 
         s->parse_context.state = -1;
-    }
 
-    s->context_initialized = 1;
-    s->thread_context[0]   = s;
+        s->context_initialized = 1;
+        memset(s->thread_context, 0, sizeof(s->thread_context));
+        s->thread_context[0]   = s;
 
-    if (s->width && s->height) {
+//     if (s->width && s->height) {
         if (nb_slices > 1) {
-            for (i = 1; i < nb_slices; i++) {
-                s->thread_context[i] = av_malloc(sizeof(MpegEncContext));
-                memcpy(s->thread_context[i], s, sizeof(MpegEncContext));
-            }
-
             for (i = 0; i < nb_slices; i++) {
+                if (i) {
+                    s->thread_context[i] = av_memdup(s, sizeof(MpegEncContext));
+                    if (!s->thread_context[i])
+                        goto fail;
+                }
                 if (init_duplicate_context(s->thread_context[i]) < 0)
                     goto fail;
                     s->thread_context[i]->start_mb_y =
@@ -862,7 +971,7 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
             s->end_mb_y   = s->mb_height;
         }
         s->slice_context_count = nb_slices;
-    }
+//     }
 
     return 0;
  fail:
@@ -917,6 +1026,7 @@ static void free_context_frame(MpegEncContext *s)
     av_freep(&s->er.er_temp_buffer);
     av_freep(&s->mb_index2xy);
     av_freep(&s->lambda_table);
+
     av_freep(&s->cplx_tab);
     av_freep(&s->bits_tab);
 
@@ -927,6 +1037,9 @@ int ff_mpv_common_frame_size_change(MpegEncContext *s)
 {
     int i, err = 0;
 
+    if (!s->context_initialized)
+        return AVERROR(EINVAL);
+
     if (s->slice_context_count > 1) {
         for (i = 0; i < s->slice_context_count; i++) {
             free_duplicate_context(s->thread_context[i]);
@@ -961,17 +1074,20 @@ int ff_mpv_common_frame_size_change(MpegEncContext *s)
     if ((err = init_context_frame(s)))
         goto fail;
 
+    memset(s->thread_context, 0, sizeof(s->thread_context));
     s->thread_context[0]   = s;
 
     if (s->width && s->height) {
         int nb_slices = s->slice_context_count;
         if (nb_slices > 1) {
-            for (i = 1; i < nb_slices; i++) {
-                s->thread_context[i] = av_malloc(sizeof(MpegEncContext));
-                memcpy(s->thread_context[i], s, sizeof(MpegEncContext));
-            }
-
             for (i = 0; i < nb_slices; i++) {
+                if (i) {
+                    s->thread_context[i] = av_memdup(s, sizeof(MpegEncContext));
+                    if (!s->thread_context[i]) {
+                        err = AVERROR(ENOMEM);
+                        goto fail;
+                    }
+                }
                 if ((err = init_duplicate_context(s->thread_context[i])) < 0)
                     goto fail;
                     s->thread_context[i]->start_mb_y =
@@ -980,7 +1096,8 @@ int ff_mpv_common_frame_size_change(MpegEncContext *s)
                         (s->mb_height * (i + 1) + nb_slices / 2) / nb_slices;
             }
         } else {
-            if (init_duplicate_context(s) < 0)
+            err = init_duplicate_context(s);
+            if (err < 0)
                 goto fail;
             s->start_mb_y = 0;
             s->end_mb_y   = s->mb_height;
@@ -999,6 +1116,9 @@ void ff_mpv_common_end(MpegEncContext *s)
 {
     int i;
 
+    if (!s)
+        return ;
+
     if (s->slice_context_count > 1) {
         for (i = 0; i < s->slice_context_count; i++) {
             free_duplicate_context(s->thread_context[i]);
@@ -1045,6 +1165,23 @@ void ff_mpv_common_end(MpegEncContext *s)
     s->linesize = s->uvlinesize = 0;
 }
 
+
+static void gray_frame(AVFrame *frame)
+{
+    int i, h_chroma_shift, v_chroma_shift;
+
+    av_pix_fmt_get_chroma_sub_sample(frame->format, &h_chroma_shift, &v_chroma_shift);
+
+    for(i=0; i<frame->height; i++)
+        memset(frame->data[0] + frame->linesize[0]*i, 0x80, frame->width);
+    for(i=0; i<FF_CEIL_RSHIFT(frame->height, v_chroma_shift); i++) {
+        memset(frame->data[1] + frame->linesize[1]*i,
+               0x80, FF_CEIL_RSHIFT(frame->width, h_chroma_shift));
+        memset(frame->data[2] + frame->linesize[2]*i,
+               0x80, FF_CEIL_RSHIFT(frame->width, h_chroma_shift));
+    }
+}
+
 /**
  * generic function called after decoding
  * the header and before a frame is decoded.
@@ -1055,6 +1192,11 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
     Picture *pic;
     s->mb_skipped = 0;
 
+    if (!ff_thread_can_start_frame(avctx)) {
+        av_log(avctx, AV_LOG_ERROR, "Attempt to start a frame outside SETUP state\n");
+        return -1;
+    }
+
     /* mark & release old frames */
     if (s->pict_type != AV_PICTURE_TYPE_B && s->last_picture_ptr &&
         s->last_picture_ptr != s->next_picture_ptr &&
@@ -1144,11 +1286,14 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         int h_chroma_shift, v_chroma_shift;
         av_pix_fmt_get_chroma_sub_sample(s->avctx->pix_fmt,
                                          &h_chroma_shift, &v_chroma_shift);
-        if (s->pict_type != AV_PICTURE_TYPE_I)
+        if (s->pict_type == AV_PICTURE_TYPE_B && s->next_picture_ptr && s->next_picture_ptr->f->buf[0])
+            av_log(avctx, AV_LOG_DEBUG,
+                   "allocating dummy last picture for B frame\n");
+        else if (s->pict_type != AV_PICTURE_TYPE_I)
             av_log(avctx, AV_LOG_ERROR,
                    "warning: first frame is no keyframe\n");
         else if (s->picture_structure != PICT_FRAME)
-            av_log(avctx, AV_LOG_INFO,
+            av_log(avctx, AV_LOG_DEBUG,
                    "allocate dummy last picture for field based first keyframe\n");
 
         /* Allocate a dummy frame */
@@ -1160,21 +1305,36 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         s->last_picture_ptr = &s->picture[i];
 
         s->last_picture_ptr->reference   = 3;
-        s->last_picture_ptr->f->pict_type = AV_PICTURE_TYPE_I;
+        s->last_picture_ptr->f->key_frame = 0;
+        s->last_picture_ptr->f->pict_type = AV_PICTURE_TYPE_P;
 
         if (alloc_picture(s, s->last_picture_ptr, 0) < 0) {
             s->last_picture_ptr = NULL;
             return -1;
         }
 
-        memset(s->last_picture_ptr->f->data[0], 0,
-               avctx->height * s->last_picture_ptr->f->linesize[0]);
-        memset(s->last_picture_ptr->f->data[1], 0x80,
-               (avctx->height >> v_chroma_shift) *
-               s->last_picture_ptr->f->linesize[1]);
-        memset(s->last_picture_ptr->f->data[2], 0x80,
-               (avctx->height >> v_chroma_shift) *
-               s->last_picture_ptr->f->linesize[2]);
+        if (!avctx->hwaccel
+#if FF_API_CAP_VDPAU
+            && !(avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU)
+#endif
+            ) {
+            for(i=0; i<avctx->height; i++)
+                memset(s->last_picture_ptr->f->data[0] + s->last_picture_ptr->f->linesize[0]*i,
+                       0x80, avctx->width);
+            if (s->last_picture_ptr->f->data[2]) {
+                for(i=0; i<FF_CEIL_RSHIFT(avctx->height, v_chroma_shift); i++) {
+                    memset(s->last_picture_ptr->f->data[1] + s->last_picture_ptr->f->linesize[1]*i,
+                        0x80, FF_CEIL_RSHIFT(avctx->width, h_chroma_shift));
+                    memset(s->last_picture_ptr->f->data[2] + s->last_picture_ptr->f->linesize[2]*i,
+                        0x80, FF_CEIL_RSHIFT(avctx->width, h_chroma_shift));
+                }
+            }
+
+            if(s->codec_id == AV_CODEC_ID_FLV1 || s->codec_id == AV_CODEC_ID_H263){
+                for(i=0; i<avctx->height; i++)
+                memset(s->last_picture_ptr->f->data[0] + s->last_picture_ptr->f->linesize[0]*i, 16, avctx->width);
+            }
+        }
 
         ff_thread_report_progress(&s->last_picture_ptr->tf, INT_MAX, 0);
         ff_thread_report_progress(&s->last_picture_ptr->tf, INT_MAX, 1);
@@ -1190,7 +1350,8 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         s->next_picture_ptr = &s->picture[i];
 
         s->next_picture_ptr->reference   = 3;
-        s->next_picture_ptr->f->pict_type = AV_PICTURE_TYPE_I;
+        s->next_picture_ptr->f->key_frame = 0;
+        s->next_picture_ptr->f->pict_type = AV_PICTURE_TYPE_P;
 
         if (alloc_picture(s, s->next_picture_ptr, 0) < 0) {
             s->next_picture_ptr = NULL;
@@ -1200,6 +1361,10 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         ff_thread_report_progress(&s->next_picture_ptr->tf, INT_MAX, 1);
     }
 
+#if 0 // BUFREF-FIXME
+    memset(s->last_picture.f->data, 0, sizeof(s->last_picture.f->data));
+    memset(s->next_picture.f->data, 0, sizeof(s->next_picture.f->data));
+#endif
     if (s->last_picture_ptr) {
         ff_mpeg_unref_picture(s->avctx, &s->last_picture);
         if (s->last_picture_ptr->f->buf[0] &&
@@ -1215,12 +1380,8 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
             return ret;
     }
 
-    if (s->pict_type != AV_PICTURE_TYPE_I &&
-        !(s->last_picture_ptr && s->last_picture_ptr->f->buf[0])) {
-        av_log(s, AV_LOG_ERROR,
-               "Non-reference picture received and no reference available\n");
-        return AVERROR_INVALIDDATA;
-    }
+    av_assert0(s->pict_type == AV_PICTURE_TYPE_I || (s->last_picture_ptr &&
+                                                 s->last_picture_ptr->f->buf[0]));
 
     if (s->picture_structure!= PICT_FRAME) {
         int i;
@@ -1249,12 +1410,9 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         s->dct_unquantize_inter = s->dct_unquantize_mpeg1_inter;
     }
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration)
-        return ff_xvmc_field_start(s, avctx);
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
+    if (s->avctx->debug & FF_DEBUG_NOMC) {
+        gray_frame(s->current_picture_ptr->f);
+    }
 
     return 0;
 }
@@ -1262,120 +1420,939 @@ FF_ENABLE_DEPRECATION_WARNINGS
 /* called after a frame has been decoded. */
 void ff_mpv_frame_end(MpegEncContext *s)
 {
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    /* redraw edges for the frame if decoding didn't complete */
-    // just to make sure that all data is rendered.
-    if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration) {
-        ff_xvmc_field_end(s);
-    } else
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
-
     emms_c();
 
     if (s->current_picture.reference)
         ff_thread_report_progress(&s->current_picture_ptr->tf, INT_MAX, 0);
 }
 
+
+#if FF_API_VISMV
+static int clip_line(int *sx, int *sy, int *ex, int *ey, int maxx)
+{
+    if(*sx > *ex)
+        return clip_line(ex, ey, sx, sy, maxx);
+
+    if (*sx < 0) {
+        if (*ex < 0)
+            return 1;
+        *sy = *ey + (*sy - *ey) * (int64_t)*ex / (*ex - *sx);
+        *sx = 0;
+    }
+
+    if (*ex > maxx) {
+        if (*sx > maxx)
+            return 1;
+        *ey = *sy + (*ey - *sy) * (int64_t)(maxx - *sx) / (*ex - *sx);
+        *ex = maxx;
+    }
+    return 0;
+}
+
+
+/**
+ * Draw a line from (ex, ey) -> (sx, sy).
+ * @param w width of the image
+ * @param h height of the image
+ * @param stride stride/linesize of the image
+ * @param color color of the arrow
+ */
+static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey,
+                      int w, int h, int stride, int color)
+{
+    int x, y, fr, f;
+
+    if (clip_line(&sx, &sy, &ex, &ey, w - 1))
+        return;
+    if (clip_line(&sy, &sx, &ey, &ex, h - 1))
+        return;
+
+    sx = av_clip(sx, 0, w - 1);
+    sy = av_clip(sy, 0, h - 1);
+    ex = av_clip(ex, 0, w - 1);
+    ey = av_clip(ey, 0, h - 1);
+
+    buf[sy * stride + sx] += color;
+
+    if (FFABS(ex - sx) > FFABS(ey - sy)) {
+        if (sx > ex) {
+            FFSWAP(int, sx, ex);
+            FFSWAP(int, sy, ey);
+        }
+        buf += sx + sy * stride;
+        ex  -= sx;
+        f    = ((ey - sy) << 16) / ex;
+        for (x = 0; x <= ex; x++) {
+            y  = (x * f) >> 16;
+            fr = (x * f) & 0xFFFF;
+            buf[y * stride + x]       += (color * (0x10000 - fr)) >> 16;
+            if(fr) buf[(y + 1) * stride + x] += (color *            fr ) >> 16;
+        }
+    } else {
+        if (sy > ey) {
+            FFSWAP(int, sx, ex);
+            FFSWAP(int, sy, ey);
+        }
+        buf += sx + sy * stride;
+        ey  -= sy;
+        if (ey)
+            f = ((ex - sx) << 16) / ey;
+        else
+            f = 0;
+        for(y= 0; y <= ey; y++){
+            x  = (y*f) >> 16;
+            fr = (y*f) & 0xFFFF;
+            buf[y * stride + x]     += (color * (0x10000 - fr)) >> 16;
+            if(fr) buf[y * stride + x + 1] += (color *            fr ) >> 16;
+        }
+    }
+}
+
+/**
+ * Draw an arrow from (ex, ey) -> (sx, sy).
+ * @param w width of the image
+ * @param h height of the image
+ * @param stride stride/linesize of the image
+ * @param color color of the arrow
+ */
+static void draw_arrow(uint8_t *buf, int sx, int sy, int ex,
+                       int ey, int w, int h, int stride, int color, int tail, int direction)
+{
+    int dx,dy;
+
+    if (direction) {
+        FFSWAP(int, sx, ex);
+        FFSWAP(int, sy, ey);
+    }
+
+    sx = av_clip(sx, -100, w + 100);
+    sy = av_clip(sy, -100, h + 100);
+    ex = av_clip(ex, -100, w + 100);
+    ey = av_clip(ey, -100, h + 100);
+
+    dx = ex - sx;
+    dy = ey - sy;
+
+    if (dx * dx + dy * dy > 3 * 3) {
+        int rx =  dx + dy;
+        int ry = -dx + dy;
+        int length = ff_sqrt((rx * rx + ry * ry) << 8);
+
+        // FIXME subpixel accuracy
+        rx = ROUNDED_DIV(rx * 3 << 4, length);
+        ry = ROUNDED_DIV(ry * 3 << 4, length);
+
+        if (tail) {
+            rx = -rx;
+            ry = -ry;
+        }
+
+        draw_line(buf, sx, sy, sx + rx, sy + ry, w, h, stride, color);
+        draw_line(buf, sx, sy, sx - ry, sy + rx, w, h, stride, color);
+    }
+    draw_line(buf, sx, sy, ex, ey, w, h, stride, color);
+}
+#endif
+
+static int add_mb(AVMotionVector *mb, uint32_t mb_type,
+                  int dst_x, int dst_y,
+                  int src_x, int src_y,
+                  int direction)
+{
+    mb->w = IS_8X8(mb_type) || IS_8X16(mb_type) ? 8 : 16;
+    mb->h = IS_8X8(mb_type) || IS_16X8(mb_type) ? 8 : 16;
+    mb->src_x = src_x;
+    mb->src_y = src_y;
+    mb->dst_x = dst_x;
+    mb->dst_y = dst_y;
+    mb->source = direction ? 1 : -1;
+    mb->flags = 0; // XXX: does mb_type contain extra information that could be exported here?
+    return 1;
+}
+
 /**
  * Print debugging info for the given picture.
  */
-void ff_print_debug_info(MpegEncContext *s, Picture *p)
+void ff_print_debug_info2(AVCodecContext *avctx, AVFrame *pict, uint8_t *mbskip_table,
+                         uint32_t *mbtype_table, int8_t *qscale_table, int16_t (*motion_val[2])[2],
+                         int *low_delay,
+                         int mb_width, int mb_height, int mb_stride, int quarter_sample)
 {
-    AVFrame *pict;
-    if (s->avctx->hwaccel || !p || !p->mb_type)
+    if ((avctx->flags2 & AV_CODEC_FLAG2_EXPORT_MVS) && mbtype_table && motion_val[0]) {
+        const int shift = 1 + quarter_sample;
+        const int mv_sample_log2 = avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_SVQ3 ? 2 : 1;
+        const int mv_stride      = (mb_width << mv_sample_log2) +
+                                   (avctx->codec->id == AV_CODEC_ID_H264 ? 0 : 1);
+        int mb_x, mb_y, mbcount = 0;
+
+        /* size is width * height * 2 * 4 where 2 is for directions and 4 is
+         * for the maximum number of MB (4 MB in case of IS_8x8) */
+        AVMotionVector *mvs = av_malloc_array(mb_width * mb_height, 2 * 4 * sizeof(AVMotionVector));
+        if (!mvs)
+            return;
+
+        for (mb_y = 0; mb_y < mb_height; mb_y++) {
+            for (mb_x = 0; mb_x < mb_width; mb_x++) {
+                int i, direction, mb_type = mbtype_table[mb_x + mb_y * mb_stride];
+                for (direction = 0; direction < 2; direction++) {
+                    if (!USES_LIST(mb_type, direction))
+                        continue;
+                    if (IS_8X8(mb_type)) {
+                        for (i = 0; i < 4; i++) {
+                            int sx = mb_x * 16 + 4 + 8 * (i & 1);
+                            int sy = mb_y * 16 + 4 + 8 * (i >> 1);
+                            int xy = (mb_x * 2 + (i & 1) +
+                                      (mb_y * 2 + (i >> 1)) * mv_stride) << (mv_sample_log2 - 1);
+                            int mx = (motion_val[direction][xy][0] >> shift) + sx;
+                            int my = (motion_val[direction][xy][1] >> shift) + sy;
+                            mbcount += add_mb(mvs + mbcount, mb_type, sx, sy, mx, my, direction);
+                        }
+                    } else if (IS_16X8(mb_type)) {
+                        for (i = 0; i < 2; i++) {
+                            int sx = mb_x * 16 + 8;
+                            int sy = mb_y * 16 + 4 + 8 * i;
+                            int xy = (mb_x * 2 + (mb_y * 2 + i) * mv_stride) << (mv_sample_log2 - 1);
+                            int mx = (motion_val[direction][xy][0] >> shift);
+                            int my = (motion_val[direction][xy][1] >> shift);
+
+                            if (IS_INTERLACED(mb_type))
+                                my *= 2;
+
+                            mbcount += add_mb(mvs + mbcount, mb_type, sx, sy, mx + sx, my + sy, direction);
+                        }
+                    } else if (IS_8X16(mb_type)) {
+                        for (i = 0; i < 2; i++) {
+                            int sx = mb_x * 16 + 4 + 8 * i;
+                            int sy = mb_y * 16 + 8;
+                            int xy = (mb_x * 2 + i + mb_y * 2 * mv_stride) << (mv_sample_log2 - 1);
+                            int mx = motion_val[direction][xy][0] >> shift;
+                            int my = motion_val[direction][xy][1] >> shift;
+
+                            if (IS_INTERLACED(mb_type))
+                                my *= 2;
+
+                            mbcount += add_mb(mvs + mbcount, mb_type, sx, sy, mx + sx, my + sy, direction);
+                        }
+                    } else {
+                          int sx = mb_x * 16 + 8;
+                          int sy = mb_y * 16 + 8;
+                          int xy = (mb_x + mb_y * mv_stride) << mv_sample_log2;
+                          int mx = (motion_val[direction][xy][0]>>shift) + sx;
+                          int my = (motion_val[direction][xy][1]>>shift) + sy;
+                          mbcount += add_mb(mvs + mbcount, mb_type, sx, sy, mx, my, direction);
+                    }
+                }
+            }
+        }
+
+        if (mbcount) {
+            AVFrameSideData *sd;
+
+            av_log(avctx, AV_LOG_DEBUG, "Adding %d MVs info to frame %d\n", mbcount, avctx->frame_number);
+            sd = av_frame_new_side_data(pict, AV_FRAME_DATA_MOTION_VECTORS, mbcount * sizeof(AVMotionVector));
+            if (!sd) {
+                av_freep(&mvs);
+                return;
+            }
+            memcpy(sd->data, mvs, mbcount * sizeof(AVMotionVector));
+        }
+
+        av_freep(&mvs);
+    }
+
+    /* TODO: export all the following to make them accessible for users (and filters) */
+    if (avctx->hwaccel || !mbtype_table
+#if FF_API_CAP_VDPAU
+        || (avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU)
+#endif
+        )
         return;
-    pict = p->f;
 
-    if (s->avctx->debug & (FF_DEBUG_SKIP | FF_DEBUG_QP | FF_DEBUG_MB_TYPE)) {
+
+    if (avctx->debug & (FF_DEBUG_SKIP | FF_DEBUG_QP | FF_DEBUG_MB_TYPE)) {
         int x,y;
 
-        av_log(s->avctx,AV_LOG_DEBUG,"New frame, type: ");
-        switch (pict->pict_type) {
-        case AV_PICTURE_TYPE_I:
-            av_log(s->avctx,AV_LOG_DEBUG,"I\n");
-            break;
-        case AV_PICTURE_TYPE_P:
-            av_log(s->avctx,AV_LOG_DEBUG,"P\n");
-            break;
-        case AV_PICTURE_TYPE_B:
-            av_log(s->avctx,AV_LOG_DEBUG,"B\n");
-            break;
-        case AV_PICTURE_TYPE_S:
-            av_log(s->avctx,AV_LOG_DEBUG,"S\n");
-            break;
-        case AV_PICTURE_TYPE_SI:
-            av_log(s->avctx,AV_LOG_DEBUG,"SI\n");
-            break;
-        case AV_PICTURE_TYPE_SP:
-            av_log(s->avctx,AV_LOG_DEBUG,"SP\n");
-            break;
-        }
-        for (y = 0; y < s->mb_height; y++) {
-            for (x = 0; x < s->mb_width; x++) {
-                if (s->avctx->debug & FF_DEBUG_SKIP) {
-                    int count = s->mbskip_table[x + y * s->mb_stride];
+        av_log(avctx, AV_LOG_DEBUG, "New frame, type: %c\n",
+               av_get_picture_type_char(pict->pict_type));
+        for (y = 0; y < mb_height; y++) {
+            for (x = 0; x < mb_width; x++) {
+                if (avctx->debug & FF_DEBUG_SKIP) {
+                    int count = mbskip_table ? mbskip_table[x + y * mb_stride] : 0;
                     if (count > 9)
                         count = 9;
-                    av_log(s->avctx, AV_LOG_DEBUG, "%1d", count);
+                    av_log(avctx, AV_LOG_DEBUG, "%1d", count);
                 }
-                if (s->avctx->debug & FF_DEBUG_QP) {
-                    av_log(s->avctx, AV_LOG_DEBUG, "%2d",
-                           p->qscale_table[x + y * s->mb_stride]);
+                if (avctx->debug & FF_DEBUG_QP) {
+                    av_log(avctx, AV_LOG_DEBUG, "%2d",
+                           qscale_table[x + y * mb_stride]);
                 }
-                if (s->avctx->debug & FF_DEBUG_MB_TYPE) {
-                    int mb_type = p->mb_type[x + y * s->mb_stride];
+                if (avctx->debug & FF_DEBUG_MB_TYPE) {
+                    int mb_type = mbtype_table[x + y * mb_stride];
                     // Type & MV direction
                     if (IS_PCM(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "P");
+                        av_log(avctx, AV_LOG_DEBUG, "P");
                     else if (IS_INTRA(mb_type) && IS_ACPRED(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "A");
+                        av_log(avctx, AV_LOG_DEBUG, "A");
                     else if (IS_INTRA4x4(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "i");
+                        av_log(avctx, AV_LOG_DEBUG, "i");
                     else if (IS_INTRA16x16(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "I");
+                        av_log(avctx, AV_LOG_DEBUG, "I");
                     else if (IS_DIRECT(mb_type) && IS_SKIP(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "d");
+                        av_log(avctx, AV_LOG_DEBUG, "d");
                     else if (IS_DIRECT(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "D");
+                        av_log(avctx, AV_LOG_DEBUG, "D");
                     else if (IS_GMC(mb_type) && IS_SKIP(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "g");
+                        av_log(avctx, AV_LOG_DEBUG, "g");
                     else if (IS_GMC(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "G");
+                        av_log(avctx, AV_LOG_DEBUG, "G");
                     else if (IS_SKIP(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "S");
+                        av_log(avctx, AV_LOG_DEBUG, "S");
                     else if (!USES_LIST(mb_type, 1))
-                        av_log(s->avctx, AV_LOG_DEBUG, ">");
+                        av_log(avctx, AV_LOG_DEBUG, ">");
                     else if (!USES_LIST(mb_type, 0))
-                        av_log(s->avctx, AV_LOG_DEBUG, "<");
+                        av_log(avctx, AV_LOG_DEBUG, "<");
                     else {
-                        assert(USES_LIST(mb_type, 0) && USES_LIST(mb_type, 1));
-                        av_log(s->avctx, AV_LOG_DEBUG, "X");
+                        av_assert2(USES_LIST(mb_type, 0) && USES_LIST(mb_type, 1));
+                        av_log(avctx, AV_LOG_DEBUG, "X");
                     }
 
                     // segmentation
                     if (IS_8X8(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "+");
+                        av_log(avctx, AV_LOG_DEBUG, "+");
                     else if (IS_16X8(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "-");
+                        av_log(avctx, AV_LOG_DEBUG, "-");
                     else if (IS_8X16(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "|");
+                        av_log(avctx, AV_LOG_DEBUG, "|");
                     else if (IS_INTRA(mb_type) || IS_16X16(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, " ");
+                        av_log(avctx, AV_LOG_DEBUG, " ");
                     else
-                        av_log(s->avctx, AV_LOG_DEBUG, "?");
+                        av_log(avctx, AV_LOG_DEBUG, "?");
 
 
                     if (IS_INTERLACED(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "=");
+                        av_log(avctx, AV_LOG_DEBUG, "=");
                     else
-                        av_log(s->avctx, AV_LOG_DEBUG, " ");
+                        av_log(avctx, AV_LOG_DEBUG, " ");
                 }
             }
-            av_log(s->avctx, AV_LOG_DEBUG, "\n");
+            av_log(avctx, AV_LOG_DEBUG, "\n");
         }
     }
+
+    if ((avctx->debug & (FF_DEBUG_VIS_QP | FF_DEBUG_VIS_MB_TYPE)) ||
+        (avctx->debug_mv)) {
+        int mb_y;
+        int i;
+        int h_chroma_shift, v_chroma_shift, block_height;
+#if FF_API_VISMV
+        const int shift = 1 + quarter_sample;
+        uint8_t *ptr;
+        const int width          = avctx->width;
+        const int height         = avctx->height;
+#endif
+        const int mv_sample_log2 = avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_SVQ3 ? 2 : 1;
+        const int mv_stride      = (mb_width << mv_sample_log2) +
+                                   (avctx->codec->id == AV_CODEC_ID_H264 ? 0 : 1);
+
+        *low_delay = 0; // needed to see the vectors without trashing the buffers
+
+        avcodec_get_chroma_sub_sample(avctx->pix_fmt, &h_chroma_shift, &v_chroma_shift);
+
+        av_frame_make_writable(pict);
+
+        pict->opaque = NULL;
+#if FF_API_VISMV
+        ptr          = pict->data[0];
+#endif
+        block_height = 16 >> v_chroma_shift;
+
+        for (mb_y = 0; mb_y < mb_height; mb_y++) {
+            int mb_x;
+            for (mb_x = 0; mb_x < mb_width; mb_x++) {
+                const int mb_index = mb_x + mb_y * mb_stride;
+#if FF_API_VISMV
+                if ((avctx->debug_mv) && motion_val[0]) {
+                    int type;
+                    for (type = 0; type < 3; type++) {
+                        int direction = 0;
+                        switch (type) {
+                        case 0:
+                            if ((!(avctx->debug_mv & FF_DEBUG_VIS_MV_P_FOR)) ||
+                                (pict->pict_type!= AV_PICTURE_TYPE_P))
+                                continue;
+                            direction = 0;
+                            break;
+                        case 1:
+                            if ((!(avctx->debug_mv & FF_DEBUG_VIS_MV_B_FOR)) ||
+                                (pict->pict_type!= AV_PICTURE_TYPE_B))
+                                continue;
+                            direction = 0;
+                            break;
+                        case 2:
+                            if ((!(avctx->debug_mv & FF_DEBUG_VIS_MV_B_BACK)) ||
+                                (pict->pict_type!= AV_PICTURE_TYPE_B))
+                                continue;
+                            direction = 1;
+                            break;
+                        }
+                        if (!USES_LIST(mbtype_table[mb_index], direction))
+                            continue;
+
+                        if (IS_8X8(mbtype_table[mb_index])) {
+                            int i;
+                            for (i = 0; i < 4; i++) {
+                                int sx = mb_x * 16 + 4 + 8 * (i & 1);
+                                int sy = mb_y * 16 + 4 + 8 * (i >> 1);
+                                int xy = (mb_x * 2 + (i & 1) +
+                                          (mb_y * 2 + (i >> 1)) * mv_stride) << (mv_sample_log2 - 1);
+                                int mx = (motion_val[direction][xy][0] >> shift) + sx;
+                                int my = (motion_val[direction][xy][1] >> shift) + sy;
+                                draw_arrow(ptr, sx, sy, mx, my, width,
+                                           height, pict->linesize[0], 100, 0, direction);
+                            }
+                        } else if (IS_16X8(mbtype_table[mb_index])) {
+                            int i;
+                            for (i = 0; i < 2; i++) {
+                                int sx = mb_x * 16 + 8;
+                                int sy = mb_y * 16 + 4 + 8 * i;
+                                int xy = (mb_x * 2 + (mb_y * 2 + i) * mv_stride) << (mv_sample_log2 - 1);
+                                int mx = (motion_val[direction][xy][0] >> shift);
+                                int my = (motion_val[direction][xy][1] >> shift);
+
+                                if (IS_INTERLACED(mbtype_table[mb_index]))
+                                    my *= 2;
+
+                                draw_arrow(ptr, sx, sy, mx + sx, my + sy, width,
+                                           height, pict->linesize[0], 100, 0, direction);
+                            }
+                        } else if (IS_8X16(mbtype_table[mb_index])) {
+                            int i;
+                            for (i = 0; i < 2; i++) {
+                                int sx = mb_x * 16 + 4 + 8 * i;
+                                int sy = mb_y * 16 + 8;
+                                int xy = (mb_x * 2 + i + mb_y * 2 * mv_stride) << (mv_sample_log2 - 1);
+                                int mx = motion_val[direction][xy][0] >> shift;
+                                int my = motion_val[direction][xy][1] >> shift;
+
+                                if (IS_INTERLACED(mbtype_table[mb_index]))
+                                    my *= 2;
+
+                                draw_arrow(ptr, sx, sy, mx + sx, my + sy, width,
+                                           height, pict->linesize[0], 100, 0, direction);
+                            }
+                        } else {
+                              int sx= mb_x * 16 + 8;
+                              int sy= mb_y * 16 + 8;
+                              int xy= (mb_x + mb_y * mv_stride) << mv_sample_log2;
+                              int mx= (motion_val[direction][xy][0]>>shift) + sx;
+                              int my= (motion_val[direction][xy][1]>>shift) + sy;
+                              draw_arrow(ptr, sx, sy, mx, my, width, height, pict->linesize[0], 100, 0, direction);
+                        }
+                    }
+                }
+#endif
+                if ((avctx->debug & FF_DEBUG_VIS_QP)) {
+                    uint64_t c = (qscale_table[mb_index] * 128 / 31) *
+                                 0x0101010101010101ULL;
+                    int y;
+                    for (y = 0; y < block_height; y++) {
+                        *(uint64_t *)(pict->data[1] + 8 * mb_x +
+                                      (block_height * mb_y + y) *
+                                      pict->linesize[1]) = c;
+                        *(uint64_t *)(pict->data[2] + 8 * mb_x +
+                                      (block_height * mb_y + y) *
+                                      pict->linesize[2]) = c;
+                    }
+                }
+                if ((avctx->debug & FF_DEBUG_VIS_MB_TYPE) &&
+                    motion_val[0]) {
+                    int mb_type = mbtype_table[mb_index];
+                    uint64_t u,v;
+                    int y;
+#define COLOR(theta, r) \
+    u = (int)(128 + r * cos(theta * 3.141592 / 180)); \
+    v = (int)(128 + r * sin(theta * 3.141592 / 180));
+
+
+                    u = v = 128;
+                    if (IS_PCM(mb_type)) {
+                        COLOR(120, 48)
+                    } else if ((IS_INTRA(mb_type) && IS_ACPRED(mb_type)) ||
+                               IS_INTRA16x16(mb_type)) {
+                        COLOR(30, 48)
+                    } else if (IS_INTRA4x4(mb_type)) {
+                        COLOR(90, 48)
+                    } else if (IS_DIRECT(mb_type) && IS_SKIP(mb_type)) {
+                        // COLOR(120, 48)
+                    } else if (IS_DIRECT(mb_type)) {
+                        COLOR(150, 48)
+                    } else if (IS_GMC(mb_type) && IS_SKIP(mb_type)) {
+                        COLOR(170, 48)
+                    } else if (IS_GMC(mb_type)) {
+                        COLOR(190, 48)
+                    } else if (IS_SKIP(mb_type)) {
+                        // COLOR(180, 48)
+                    } else if (!USES_LIST(mb_type, 1)) {
+                        COLOR(240, 48)
+                    } else if (!USES_LIST(mb_type, 0)) {
+                        COLOR(0, 48)
+                    } else {
+                        av_assert2(USES_LIST(mb_type, 0) && USES_LIST(mb_type, 1));
+                        COLOR(300,48)
+                    }
+
+                    u *= 0x0101010101010101ULL;
+                    v *= 0x0101010101010101ULL;
+                    for (y = 0; y < block_height; y++) {
+                        *(uint64_t *)(pict->data[1] + 8 * mb_x +
+                                      (block_height * mb_y + y) * pict->linesize[1]) = u;
+                        *(uint64_t *)(pict->data[2] + 8 * mb_x +
+                                      (block_height * mb_y + y) * pict->linesize[2]) = v;
+                    }
+
+                    // segmentation
+                    if (IS_8X8(mb_type) || IS_16X8(mb_type)) {
+                        *(uint64_t *)(pict->data[0] + 16 * mb_x + 0 +
+                                      (16 * mb_y + 8) * pict->linesize[0]) ^= 0x8080808080808080ULL;
+                        *(uint64_t *)(pict->data[0] + 16 * mb_x + 8 +
+                                      (16 * mb_y + 8) * pict->linesize[0]) ^= 0x8080808080808080ULL;
+                    }
+                    if (IS_8X8(mb_type) || IS_8X16(mb_type)) {
+                        for (y = 0; y < 16; y++)
+                            pict->data[0][16 * mb_x + 8 + (16 * mb_y + y) *
+                                          pict->linesize[0]] ^= 0x80;
+                    }
+                    if (IS_8X8(mb_type) && mv_sample_log2 >= 2) {
+                        int dm = 1 << (mv_sample_log2 - 2);
+                        for (i = 0; i < 4; i++) {
+                            int sx = mb_x * 16 + 8 * (i & 1);
+                            int sy = mb_y * 16 + 8 * (i >> 1);
+                            int xy = (mb_x * 2 + (i & 1) +
+                                     (mb_y * 2 + (i >> 1)) * mv_stride) << (mv_sample_log2 - 1);
+                            // FIXME bidir
+                            int32_t *mv = (int32_t *) &motion_val[0][xy];
+                            if (mv[0] != mv[dm] ||
+                                mv[dm * mv_stride] != mv[dm * (mv_stride + 1)])
+                                for (y = 0; y < 8; y++)
+                                    pict->data[0][sx + 4 + (sy + y) * pict->linesize[0]] ^= 0x80;
+                            if (mv[0] != mv[dm * mv_stride] || mv[dm] != mv[dm * (mv_stride + 1)])
+                                *(uint64_t *)(pict->data[0] + sx + (sy + 4) *
+                                              pict->linesize[0]) ^= 0x8080808080808080ULL;
+                        }
+                    }
+
+                    if (IS_INTERLACED(mb_type) &&
+                        avctx->codec->id == AV_CODEC_ID_H264) {
+                        // hmm
+                    }
+                }
+                if (mbskip_table)
+                    mbskip_table[mb_index] = 0;
+            }
+        }
+    }
+}
+
+void ff_print_debug_info(MpegEncContext *s, Picture *p, AVFrame *pict)
+{
+    ff_print_debug_info2(s->avctx, pict, s->mbskip_table, p->mb_type,
+                         p->qscale_table, p->motion_val, &s->low_delay,
+                         s->mb_width, s->mb_height, s->mb_stride, s->quarter_sample);
+}
+
+int ff_mpv_export_qp_table(MpegEncContext *s, AVFrame *f, Picture *p, int qp_type)
+{
+    AVBufferRef *ref = av_buffer_ref(p->qscale_table_buf);
+    int offset = 2*s->mb_stride + 1;
+    if(!ref)
+        return AVERROR(ENOMEM);
+    av_assert0(ref->size >= offset + s->mb_stride * ((f->height+15)/16));
+    ref->size -= offset;
+    ref->data += offset;
+    return av_frame_set_qp_table(f, ref, s->mb_stride, qp_type);
+}
+
+static inline int hpel_motion_lowres(MpegEncContext *s,
+                                     uint8_t *dest, uint8_t *src,
+                                     int field_based, int field_select,
+                                     int src_x, int src_y,
+                                     int width, int height, ptrdiff_t stride,
+                                     int h_edge_pos, int v_edge_pos,
+                                     int w, int h, h264_chroma_mc_func *pix_op,
+                                     int motion_x, int motion_y)
+{
+    const int lowres   = s->avctx->lowres;
+    const int op_index = FFMIN(lowres, 3);
+    const int s_mask   = (2 << lowres) - 1;
+    int emu = 0;
+    int sx, sy;
+
+    if (s->quarter_sample) {
+        motion_x /= 2;
+        motion_y /= 2;
+    }
+
+    sx = motion_x & s_mask;
+    sy = motion_y & s_mask;
+    src_x += motion_x >> lowres + 1;
+    src_y += motion_y >> lowres + 1;
+
+    src   += src_y * stride + src_x;
+
+    if ((unsigned)src_x > FFMAX( h_edge_pos - (!!sx) - w,                 0) ||
+        (unsigned)src_y > FFMAX((v_edge_pos >> field_based) - (!!sy) - h, 0)) {
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, src,
+                                 s->linesize, s->linesize,
+                                 w + 1, (h + 1) << field_based,
+                                 src_x, src_y   << field_based,
+                                 h_edge_pos, v_edge_pos);
+        src = s->sc.edge_emu_buffer;
+        emu = 1;
+    }
+
+    sx = (sx << 2) >> lowres;
+    sy = (sy << 2) >> lowres;
+    if (field_select)
+        src += s->linesize;
+    pix_op[op_index](dest, src, stride, h, sx, sy);
+    return emu;
+}
+
+/* apply one mpeg motion vector to the three components */
+static av_always_inline void mpeg_motion_lowres(MpegEncContext *s,
+                                                uint8_t *dest_y,
+                                                uint8_t *dest_cb,
+                                                uint8_t *dest_cr,
+                                                int field_based,
+                                                int bottom_field,
+                                                int field_select,
+                                                uint8_t **ref_picture,
+                                                h264_chroma_mc_func *pix_op,
+                                                int motion_x, int motion_y,
+                                                int h, int mb_y)
+{
+    uint8_t *ptr_y, *ptr_cb, *ptr_cr;
+    int mx, my, src_x, src_y, uvsrc_x, uvsrc_y, sx, sy, uvsx, uvsy;
+    ptrdiff_t uvlinesize, linesize;
+    const int lowres     = s->avctx->lowres;
+    const int op_index   = FFMIN(lowres-1+s->chroma_x_shift, 3);
+    const int block_s    = 8>>lowres;
+    const int s_mask     = (2 << lowres) - 1;
+    const int h_edge_pos = s->h_edge_pos >> lowres;
+    const int v_edge_pos = s->v_edge_pos >> lowres;
+    linesize   = s->current_picture.f->linesize[0] << field_based;
+    uvlinesize = s->current_picture.f->linesize[1] << field_based;
+
+    // FIXME obviously not perfect but qpel will not work in lowres anyway
+    if (s->quarter_sample) {
+        motion_x /= 2;
+        motion_y /= 2;
+    }
+
+    if(field_based){
+        motion_y += (bottom_field - field_select)*((1 << lowres)-1);
+    }
+
+    sx = motion_x & s_mask;
+    sy = motion_y & s_mask;
+    src_x = s->mb_x * 2 * block_s + (motion_x >> lowres + 1);
+    src_y = (mb_y * 2 * block_s >> field_based) + (motion_y >> lowres + 1);
+
+    if (s->out_format == FMT_H263) {
+        uvsx    = ((motion_x >> 1) & s_mask) | (sx & 1);
+        uvsy    = ((motion_y >> 1) & s_mask) | (sy & 1);
+        uvsrc_x = src_x >> 1;
+        uvsrc_y = src_y >> 1;
+    } else if (s->out_format == FMT_H261) {
+        // even chroma mv's are full pel in H261
+        mx      = motion_x / 4;
+        my      = motion_y / 4;
+        uvsx    = (2 * mx) & s_mask;
+        uvsy    = (2 * my) & s_mask;
+        uvsrc_x = s->mb_x * block_s + (mx >> lowres);
+        uvsrc_y =    mb_y * block_s + (my >> lowres);
+    } else {
+        if(s->chroma_y_shift){
+            mx      = motion_x / 2;
+            my      = motion_y / 2;
+            uvsx    = mx & s_mask;
+            uvsy    = my & s_mask;
+            uvsrc_x = s->mb_x * block_s                 + (mx >> lowres + 1);
+            uvsrc_y =   (mb_y * block_s >> field_based) + (my >> lowres + 1);
+        } else {
+            if(s->chroma_x_shift){
+            //Chroma422
+                mx = motion_x / 2;
+                uvsx = mx & s_mask;
+                uvsy = motion_y & s_mask;
+                uvsrc_y = src_y;
+                uvsrc_x = s->mb_x*block_s               + (mx >> (lowres+1));
+            } else {
+            //Chroma444
+                uvsx = motion_x & s_mask;
+                uvsy = motion_y & s_mask;
+                uvsrc_x = src_x;
+                uvsrc_y = src_y;
+            }
+        }
+    }
+
+    ptr_y  = ref_picture[0] + src_y   * linesize   + src_x;
+    ptr_cb = ref_picture[1] + uvsrc_y * uvlinesize + uvsrc_x;
+    ptr_cr = ref_picture[2] + uvsrc_y * uvlinesize + uvsrc_x;
+
+    if ((unsigned) src_x > FFMAX( h_edge_pos - (!!sx) - 2 * block_s,       0) || uvsrc_y<0 ||
+        (unsigned) src_y > FFMAX((v_edge_pos >> field_based) - (!!sy) - h, 0)) {
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr_y,
+                                 linesize >> field_based, linesize >> field_based,
+                                 17, 17 + field_based,
+                                src_x, src_y << field_based, h_edge_pos,
+                                v_edge_pos);
+        ptr_y = s->sc.edge_emu_buffer;
+        if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+            uint8_t *ubuf = s->sc.edge_emu_buffer + 18 * s->linesize;
+            uint8_t *vbuf =ubuf + 9 * s->uvlinesize;
+            s->vdsp.emulated_edge_mc(ubuf,  ptr_cb,
+                                     uvlinesize >> field_based, uvlinesize >> field_based,
+                                     9, 9 + field_based,
+                                    uvsrc_x, uvsrc_y << field_based,
+                                    h_edge_pos >> 1, v_edge_pos >> 1);
+            s->vdsp.emulated_edge_mc(vbuf,  ptr_cr,
+                                     uvlinesize >> field_based,uvlinesize >> field_based,
+                                     9, 9 + field_based,
+                                    uvsrc_x, uvsrc_y << field_based,
+                                    h_edge_pos >> 1, v_edge_pos >> 1);
+            ptr_cb = ubuf;
+            ptr_cr = vbuf;
+        }
+    }
+
+    // FIXME use this for field pix too instead of the obnoxious hack which changes picture.f->data
+    if (bottom_field) {
+        dest_y  += s->linesize;
+        dest_cb += s->uvlinesize;
+        dest_cr += s->uvlinesize;
+    }
+
+    if (field_select) {
+        ptr_y   += s->linesize;
+        ptr_cb  += s->uvlinesize;
+        ptr_cr  += s->uvlinesize;
+    }
+
+    sx = (sx << 2) >> lowres;
+    sy = (sy << 2) >> lowres;
+    pix_op[lowres - 1](dest_y, ptr_y, linesize, h, sx, sy);
+
+    if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+        int hc = s->chroma_y_shift ? (h+1-bottom_field)>>1 : h;
+        uvsx = (uvsx << 2) >> lowres;
+        uvsy = (uvsy << 2) >> lowres;
+        if (hc) {
+            pix_op[op_index](dest_cb, ptr_cb, uvlinesize, hc, uvsx, uvsy);
+            pix_op[op_index](dest_cr, ptr_cr, uvlinesize, hc, uvsx, uvsy);
+        }
+    }
+    // FIXME h261 lowres loop filter
+}
+
+static inline void chroma_4mv_motion_lowres(MpegEncContext *s,
+                                            uint8_t *dest_cb, uint8_t *dest_cr,
+                                            uint8_t **ref_picture,
+                                            h264_chroma_mc_func * pix_op,
+                                            int mx, int my)
+{
+    const int lowres     = s->avctx->lowres;
+    const int op_index   = FFMIN(lowres, 3);
+    const int block_s    = 8 >> lowres;
+    const int s_mask     = (2 << lowres) - 1;
+    const int h_edge_pos = s->h_edge_pos >> lowres + 1;
+    const int v_edge_pos = s->v_edge_pos >> lowres + 1;
+    int emu = 0, src_x, src_y, sx, sy;
+    ptrdiff_t offset;
+    uint8_t *ptr;
+
+    if (s->quarter_sample) {
+        mx /= 2;
+        my /= 2;
+    }
+
+    /* In case of 8X8, we construct a single chroma motion vector
+       with a special rounding */
+    mx = ff_h263_round_chroma(mx);
+    my = ff_h263_round_chroma(my);
+
+    sx = mx & s_mask;
+    sy = my & s_mask;
+    src_x = s->mb_x * block_s + (mx >> lowres + 1);
+    src_y = s->mb_y * block_s + (my >> lowres + 1);
+
+    offset = src_y * s->uvlinesize + src_x;
+    ptr = ref_picture[1] + offset;
+    if ((unsigned) src_x > FFMAX(h_edge_pos - (!!sx) - block_s, 0) ||
+        (unsigned) src_y > FFMAX(v_edge_pos - (!!sy) - block_s, 0)) {
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr,
+                                 s->uvlinesize, s->uvlinesize,
+                                 9, 9,
+                                 src_x, src_y, h_edge_pos, v_edge_pos);
+        ptr = s->sc.edge_emu_buffer;
+        emu = 1;
+    }
+    sx = (sx << 2) >> lowres;
+    sy = (sy << 2) >> lowres;
+    pix_op[op_index](dest_cb, ptr, s->uvlinesize, block_s, sx, sy);
+
+    ptr = ref_picture[2] + offset;
+    if (emu) {
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr,
+                                 s->uvlinesize, s->uvlinesize,
+                                 9, 9,
+                                 src_x, src_y, h_edge_pos, v_edge_pos);
+        ptr = s->sc.edge_emu_buffer;
+    }
+    pix_op[op_index](dest_cr, ptr, s->uvlinesize, block_s, sx, sy);
+}
+
+/**
+ * motion compensation of a single macroblock
+ * @param s context
+ * @param dest_y luma destination pointer
+ * @param dest_cb chroma cb/u destination pointer
+ * @param dest_cr chroma cr/v destination pointer
+ * @param dir direction (0->forward, 1->backward)
+ * @param ref_picture array[3] of pointers to the 3 planes of the reference picture
+ * @param pix_op halfpel motion compensation function (average or put normally)
+ * the motion vectors are taken from s->mv and the MV type from s->mv_type
+ */
+static inline void MPV_motion_lowres(MpegEncContext *s,
+                                     uint8_t *dest_y, uint8_t *dest_cb,
+                                     uint8_t *dest_cr,
+                                     int dir, uint8_t **ref_picture,
+                                     h264_chroma_mc_func *pix_op)
+{
+    int mx, my;
+    int mb_x, mb_y, i;
+    const int lowres  = s->avctx->lowres;
+    const int block_s = 8 >>lowres;
+
+    mb_x = s->mb_x;
+    mb_y = s->mb_y;
+
+    switch (s->mv_type) {
+    case MV_TYPE_16X16:
+        mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                           0, 0, 0,
+                           ref_picture, pix_op,
+                           s->mv[dir][0][0], s->mv[dir][0][1],
+                           2 * block_s, mb_y);
+        break;
+    case MV_TYPE_8X8:
+        mx = 0;
+        my = 0;
+        for (i = 0; i < 4; i++) {
+            hpel_motion_lowres(s, dest_y + ((i & 1) + (i >> 1) *
+                               s->linesize) * block_s,
+                               ref_picture[0], 0, 0,
+                               (2 * mb_x + (i & 1)) * block_s,
+                               (2 * mb_y + (i >> 1)) * block_s,
+                               s->width, s->height, s->linesize,
+                               s->h_edge_pos >> lowres, s->v_edge_pos >> lowres,
+                               block_s, block_s, pix_op,
+                               s->mv[dir][i][0], s->mv[dir][i][1]);
+
+            mx += s->mv[dir][i][0];
+            my += s->mv[dir][i][1];
+        }
+
+        if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            chroma_4mv_motion_lowres(s, dest_cb, dest_cr, ref_picture,
+                                     pix_op, mx, my);
+        break;
+    case MV_TYPE_FIELD:
+        if (s->picture_structure == PICT_FRAME) {
+            /* top field */
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                               1, 0, s->field_select[dir][0],
+                               ref_picture, pix_op,
+                               s->mv[dir][0][0], s->mv[dir][0][1],
+                               block_s, mb_y);
+            /* bottom field */
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                               1, 1, s->field_select[dir][1],
+                               ref_picture, pix_op,
+                               s->mv[dir][1][0], s->mv[dir][1][1],
+                               block_s, mb_y);
+        } else {
+            if (s->picture_structure != s->field_select[dir][0] + 1 &&
+                s->pict_type != AV_PICTURE_TYPE_B && !s->first_field) {
+                ref_picture = s->current_picture_ptr->f->data;
+
+            }
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                               0, 0, s->field_select[dir][0],
+                               ref_picture, pix_op,
+                               s->mv[dir][0][0],
+                               s->mv[dir][0][1], 2 * block_s, mb_y >> 1);
+            }
+        break;
+    case MV_TYPE_16X8:
+        for (i = 0; i < 2; i++) {
+            uint8_t **ref2picture;
+
+            if (s->picture_structure == s->field_select[dir][i] + 1 ||
+                s->pict_type == AV_PICTURE_TYPE_B || s->first_field) {
+                ref2picture = ref_picture;
+            } else {
+                ref2picture = s->current_picture_ptr->f->data;
+            }
+
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                               0, 0, s->field_select[dir][i],
+                               ref2picture, pix_op,
+                               s->mv[dir][i][0], s->mv[dir][i][1] +
+                               2 * block_s * i, block_s, mb_y >> 1);
+
+            dest_y  +=  2 * block_s *  s->linesize;
+            dest_cb += (2 * block_s >> s->chroma_y_shift) * s->uvlinesize;
+            dest_cr += (2 * block_s >> s->chroma_y_shift) * s->uvlinesize;
+        }
+        break;
+    case MV_TYPE_DMV:
+        if (s->picture_structure == PICT_FRAME) {
+            for (i = 0; i < 2; i++) {
+                int j;
+                for (j = 0; j < 2; j++) {
+                    mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                                       1, j, j ^ i,
+                                       ref_picture, pix_op,
+                                       s->mv[dir][2 * i + j][0],
+                                       s->mv[dir][2 * i + j][1],
+                                       block_s, mb_y);
+                }
+                pix_op = s->h264chroma.avg_h264_chroma_pixels_tab;
+            }
+        } else {
+            for (i = 0; i < 2; i++) {
+                mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                                   0, 0, s->picture_structure != i + 1,
+                                   ref_picture, pix_op,
+                                   s->mv[dir][2 * i][0],s->mv[dir][2 * i][1],
+                                   2 * block_s, mb_y >> 1);
+
+                // after put we make avg of the same block
+                pix_op = s->h264chroma.avg_h264_chroma_pixels_tab;
+
+                // opposite parity is always in the same
+                // frame if this is second field
+                if (!s->first_field) {
+                    ref_picture = s->current_picture_ptr->f->data;
+                }
+            }
+        }
+        break;
+    default:
+        av_assert2(0);
+    }
 }
 
 /**
@@ -1404,14 +2381,14 @@ static int lowest_referenced_row(MpegEncContext *s, int dir)
     }
 
     for (i = 0; i < mvs; i++) {
-        my = s->mv[dir][i][1]<<qpel_shift;
+        my = s->mv[dir][i][1];
         my_max = FFMAX(my_max, my);
         my_min = FFMIN(my_min, my);
     }
 
-    off = (FFMAX(-my_min, my_max) + 63) >> 6;
+    off = ((FFMAX(-my_min, my_max)<<qpel_shift) + 63) >> 6;
 
-    return FFMIN(FFMAX(s->mb_y + off, 0), s->mb_height-1);
+    return av_clip(s->mb_y + off, 0, s->mb_height - 1);
 unhandled:
     return s->mb_height-1;
 }
@@ -1488,18 +2465,15 @@ void ff_clean_intra_table_entries(MpegEncContext *s)
  */
 static av_always_inline
 void mpv_decode_mb_internal(MpegEncContext *s, int16_t block[12][64],
-                            int is_mpeg12)
+                            int lowres_flag, int is_mpeg12)
 {
     const int mb_xy = s->mb_y * s->mb_stride + s->mb_x;
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    if(CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration){
-        ff_xvmc_decode_mb(s);//xvmc uses pblocks
+    if (CONFIG_XVMC &&
+        s->avctx->hwaccel && s->avctx->hwaccel->decode_mb) {
+        s->avctx->hwaccel->decode_mb(s);//xvmc uses pblocks
         return;
     }
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
 
     if(s->avctx->debug&FF_DEBUG_DCT_COEFF) {
        /* print DCT coefficients */
@@ -1530,7 +2504,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     else if (!is_mpeg12 && (s->h263_pred || s->h263_aic))
         s->mbintra_table[mb_xy]=1;
 
-    if ((s->avctx->flags & AV_CODEC_FLAG_PSNR) ||
+    if ((s->avctx->flags & AV_CODEC_FLAG_PSNR) || s->avctx->frame_skip_threshold || s->avctx->frame_skip_factor ||
         !(s->encoding && (s->intra_only || s->pict_type == AV_PICTURE_TYPE_B) &&
           s->avctx->mb_decision != FF_MB_DECISION_RD)) { // FIXME precalc
         uint8_t *dest_y, *dest_cb, *dest_cr;
@@ -1539,8 +2513,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
         qpel_mc_func (*op_qpix)[16];
         const int linesize   = s->current_picture.f->linesize[0]; //not s->linesize as this would be wrong for field pics
         const int uvlinesize = s->current_picture.f->linesize[1];
-        const int readable= s->pict_type != AV_PICTURE_TYPE_B || s->encoding || s->avctx->draw_horiz_band;
-        const int block_size = 8;
+        const int readable= s->pict_type != AV_PICTURE_TYPE_B || s->encoding || s->avctx->draw_horiz_band || lowres_flag;
+        const int block_size= lowres_flag ? 8>>s->avctx->lowres : 8;
 
         /* avoid copy if macroblock skipped in last frame too */
         /* skip only during decoding as we might trash the buffers during encoding a bit */
@@ -1549,7 +2523,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             if (s->mb_skipped) {
                 s->mb_skipped= 0;
-                assert(s->pict_type!=AV_PICTURE_TYPE_I);
+                av_assert2(s->pict_type!=AV_PICTURE_TYPE_I);
                 *mbskip_ptr = 1;
             } else if(!s->current_picture.reference) {
                 *mbskip_ptr = 1;
@@ -1589,19 +2563,31 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     }
                 }
 
-                op_qpix= s->me.qpel_put;
-                if ((!s->no_rounding) || s->pict_type==AV_PICTURE_TYPE_B){
-                    op_pix = s->hdsp.put_pixels_tab;
+                if(lowres_flag){
+                    h264_chroma_mc_func *op_pix = s->h264chroma.put_h264_chroma_pixels_tab;
+
+                    if (s->mv_dir & MV_DIR_FORWARD) {
+                        MPV_motion_lowres(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.f->data, op_pix);
+                        op_pix = s->h264chroma.avg_h264_chroma_pixels_tab;
+                    }
+                    if (s->mv_dir & MV_DIR_BACKWARD) {
+                        MPV_motion_lowres(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.f->data, op_pix);
+                    }
                 }else{
-                    op_pix = s->hdsp.put_no_rnd_pixels_tab;
-                }
-                if (s->mv_dir & MV_DIR_FORWARD) {
-                    ff_mpv_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.f->data, op_pix, op_qpix);
-                    op_pix = s->hdsp.avg_pixels_tab;
-                    op_qpix= s->me.qpel_avg;
-                }
-                if (s->mv_dir & MV_DIR_BACKWARD) {
-                    ff_mpv_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.f->data, op_pix, op_qpix);
+                    op_qpix = s->me.qpel_put;
+                    if ((!s->no_rounding) || s->pict_type==AV_PICTURE_TYPE_B){
+                        op_pix = s->hdsp.put_pixels_tab;
+                    }else{
+                        op_pix = s->hdsp.put_no_rnd_pixels_tab;
+                    }
+                    if (s->mv_dir & MV_DIR_FORWARD) {
+                        ff_mpv_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.f->data, op_pix, op_qpix);
+                        op_pix = s->hdsp.avg_pixels_tab;
+                        op_qpix= s->me.qpel_avg;
+                    }
+                    if (s->mv_dir & MV_DIR_BACKWARD) {
+                        ff_mpv_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.f->data, op_pix, op_qpix);
+                    }
                 }
             }
 
@@ -1647,17 +2633,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     }else{
                         //chroma422
                         dct_linesize = uvlinesize << s->interlaced_dct;
-                        dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize * 8;
+                        dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize*block_size;
 
                         add_dct(s, block[4], 4, dest_cb, dct_linesize);
                         add_dct(s, block[5], 5, dest_cr, dct_linesize);
                         add_dct(s, block[6], 6, dest_cb+dct_offset, dct_linesize);
                         add_dct(s, block[7], 7, dest_cr+dct_offset, dct_linesize);
                         if(!s->chroma_x_shift){//Chroma444
-                            add_dct(s, block[8], 8, dest_cb+8, dct_linesize);
-                            add_dct(s, block[9], 9, dest_cr+8, dct_linesize);
-                            add_dct(s, block[10], 10, dest_cb+8+dct_offset, dct_linesize);
-                            add_dct(s, block[11], 11, dest_cr+8+dct_offset, dct_linesize);
+                            add_dct(s, block[8], 8, dest_cb+block_size, dct_linesize);
+                            add_dct(s, block[9], 9, dest_cr+block_size, dct_linesize);
+                            add_dct(s, block[10], 10, dest_cb+block_size+dct_offset, dct_linesize);
+                            add_dct(s, block[11], 11, dest_cr+block_size+dct_offset, dct_linesize);
                         }
                     }
                 }//fi gray
@@ -1699,17 +2685,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     }else{
 
                         dct_linesize = uvlinesize << s->interlaced_dct;
-                        dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize * 8;
+                        dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize*block_size;
 
                         s->idsp.idct_put(dest_cb,              dct_linesize, block[4]);
                         s->idsp.idct_put(dest_cr,              dct_linesize, block[5]);
                         s->idsp.idct_put(dest_cb + dct_offset, dct_linesize, block[6]);
                         s->idsp.idct_put(dest_cr + dct_offset, dct_linesize, block[7]);
                         if(!s->chroma_x_shift){//Chroma444
-                            s->idsp.idct_put(dest_cb + 8,              dct_linesize, block[8]);
-                            s->idsp.idct_put(dest_cr + 8,              dct_linesize, block[9]);
-                            s->idsp.idct_put(dest_cb + 8 + dct_offset, dct_linesize, block[10]);
-                            s->idsp.idct_put(dest_cr + 8 + dct_offset, dct_linesize, block[11]);
+                            s->idsp.idct_put(dest_cb + block_size,              dct_linesize, block[8]);
+                            s->idsp.idct_put(dest_cr + block_size,              dct_linesize, block[9]);
+                            s->idsp.idct_put(dest_cb + block_size + dct_offset, dct_linesize, block[10]);
+                            s->idsp.idct_put(dest_cr + block_size + dct_offset, dct_linesize, block[11]);
                         }
                     }
                 }//gray
@@ -1718,8 +2704,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
 skip_idct:
         if(!readable){
             s->hdsp.put_pixels_tab[0][0](s->dest[0], dest_y ,   linesize,16);
-            s->hdsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[1], dest_cb, uvlinesize,16 >> s->chroma_y_shift);
-            s->hdsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[2], dest_cr, uvlinesize,16 >> s->chroma_y_shift);
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+                s->hdsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[1], dest_cb, uvlinesize,16 >> s->chroma_y_shift);
+                s->hdsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[2], dest_cr, uvlinesize,16 >> s->chroma_y_shift);
+            }
         }
     }
 }
@@ -1728,23 +2716,25 @@ void ff_mpv_decode_mb(MpegEncContext *s, int16_t block[12][64])
 {
 #if !CONFIG_SMALL
     if(s->out_format == FMT_MPEG1) {
-        mpv_decode_mb_internal(s, block, 1);
+        if(s->avctx->lowres) mpv_decode_mb_internal(s, block, 1, 1);
+        else                 mpv_decode_mb_internal(s, block, 0, 1);
     } else
 #endif
-        mpv_decode_mb_internal(s, block, 0);
+    if(s->avctx->lowres) mpv_decode_mb_internal(s, block, 1, 0);
+    else                  mpv_decode_mb_internal(s, block, 0, 0);
 }
 
 void ff_mpeg_draw_horiz_band(MpegEncContext *s, int y, int h)
 {
-    ff_draw_horiz_band(s->avctx, s->current_picture.f,
-                       s->last_picture.f, y, h, s->picture_structure,
+    ff_draw_horiz_band(s->avctx, s->current_picture_ptr->f,
+                       s->last_picture_ptr ? s->last_picture_ptr->f : NULL, y, h, s->picture_structure,
                        s->first_field, s->low_delay);
 }
 
 void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename
     const int linesize   = s->current_picture.f->linesize[0]; //not s->linesize as this would be wrong for field pics
     const int uvlinesize = s->current_picture.f->linesize[1];
-    const int mb_size= 4;
+    const int mb_size= 4 - s->avctx->lowres;
 
     s->block_index[0]= s->b8_stride*(s->mb_y*2    ) - 2 + s->mb_x*2;
     s->block_index[1]= s->b8_stride*(s->mb_y*2    ) - 1 + s->mb_x*2;
@@ -1754,9 +2744,9 @@ void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename
     s->block_index[5]= s->mb_stride*(s->mb_y + s->mb_height + 2) + s->b8_stride*s->mb_height*2 + s->mb_x - 1;
     //block_index is not used by mpeg2, so it is not affected by chroma_format
 
-    s->dest[0] = s->current_picture.f->data[0] + ((s->mb_x - 1) <<  mb_size);
-    s->dest[1] = s->current_picture.f->data[1] + ((s->mb_x - 1) << (mb_size - s->chroma_x_shift));
-    s->dest[2] = s->current_picture.f->data[2] + ((s->mb_x - 1) << (mb_size - s->chroma_x_shift));
+    s->dest[0] = s->current_picture.f->data[0] + (int)((s->mb_x - 1U) <<  mb_size);
+    s->dest[1] = s->current_picture.f->data[1] + (int)((s->mb_x - 1U) << (mb_size - s->chroma_x_shift));
+    s->dest[2] = s->current_picture.f->data[2] + (int)((s->mb_x - 1U) << (mb_size - s->chroma_x_shift));
 
     if(!(s->pict_type==AV_PICTURE_TYPE_B && s->avctx->draw_horiz_band && s->picture_structure==PICT_FRAME))
     {
@@ -1768,7 +2758,7 @@ void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename
             s->dest[0] += (s->mb_y>>1) *   linesize << mb_size;
             s->dest[1] += (s->mb_y>>1) * uvlinesize << (mb_size - s->chroma_y_shift);
             s->dest[2] += (s->mb_y>>1) * uvlinesize << (mb_size - s->chroma_y_shift);
-            assert((s->mb_y&1) == (s->picture_structure == PICT_BOTTOM_FIELD));
+            av_assert1((s->mb_y&1) == (s->picture_structure == PICT_BOTTOM_FIELD));
         }
     }
 }
@@ -1789,6 +2779,7 @@ void ff_mpeg_flush(AVCodecContext *avctx){
     ff_mpeg_unref_picture(s->avctx, &s->next_picture);
 
     s->mb_x= s->mb_y= 0;
+    s->closed_gop= 0;
 
     s->parse_context.state= -1;
     s->parse_context.frame_start_found= 0;
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index 52c6f91d20..750388ac25 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,6 +35,7 @@
 #include "error_resilience.h"
 #include "fdctdsp.h"
 #include "get_bits.h"
+#include "h264chroma.h"
 #include "h263dsp.h"
 #include "hpeldsp.h"
 #include "idctdsp.h"
@@ -55,8 +56,9 @@
 #include "videodsp.h"
 
 #include "libavutil/opt.h"
+#include "libavutil/timecode.h"
 
-#define MAX_THREADS 16
+#define MAX_THREADS 32
 
 #define MAX_B_FRAMES 16
 
@@ -95,7 +97,7 @@ typedef struct MpegEncContext {
     int width, height;///< picture size. must be a multiple of 16
     int gop_size;
     int intra_only;   ///< if true, only intra pictures are generated
-    int bit_rate;     ///< wanted bit rate
+    int64_t bit_rate; ///< wanted bit rate
     enum OutputFormat out_format; ///< output format
     int h263_pred;    ///< use mpeg4/h263 ac/dc predictions
     int pb_frame;     ///< PB frame mode (0 = none, 1 = base, 2 = improved)
@@ -186,7 +188,7 @@ typedef struct MpegEncContext {
     uint8_t *coded_block_base;
     uint8_t *coded_block;          ///< used for coded block pattern prediction (msmpeg4v3, wmv1)
     int16_t (*ac_val_base)[16];
-    int16_t (*ac_val[3])[16];      ///< used for for mpeg4 AC prediction, all 3 arrays must be continuous
+    int16_t (*ac_val[3])[16];      ///< used for mpeg4 AC prediction, all 3 arrays must be continuous
     int mb_skipped;                ///< MUST BE SET only during DECODING
     uint8_t *mbskip_table;        /**< used to avoid copy if macroblock skipped (for black regions for example)
                                    and used for b-frame encoding & decoding (contains skip table of next P Frame) */
@@ -203,11 +205,14 @@ typedef struct MpegEncContext {
     int *lambda_table;
     int adaptive_quant;         ///< use adaptive quantization
     int dquant;                 ///< qscale difference to prev qscale
+    int closed_gop;             ///< MPEG1/2 GOP is closed
     int pict_type;              ///< AV_PICTURE_TYPE_I, AV_PICTURE_TYPE_P, AV_PICTURE_TYPE_B, ...
+    int vbv_delay;
     int last_pict_type; //FIXME removes
     int last_non_b_pict_type;   ///< used for mpeg4 gmc b-frames & ratecontrol
     int droppable;
     int frame_rate_index;
+    AVRational mpeg2_frame_rate_ext;
     int last_lambda_for[5];     ///< last lambda for a specific pict type
     int skipdct;                ///< skip dct and code zero residual
 
@@ -217,6 +222,7 @@ typedef struct MpegEncContext {
 
     BlockDSPContext bdsp;
     FDCTDSPContext fdsp;
+    H264ChromaContext h264chroma;
     HpelDSPContext hdsp;
     IDCTDSPContext idsp;
     MECmpContext mecc;
@@ -301,18 +307,22 @@ typedef struct MpegEncContext {
     int ac_esc_length;       ///< num of bits needed to encode the longest esc
     uint8_t *intra_ac_vlc_length;
     uint8_t *intra_ac_vlc_last_length;
+    uint8_t *intra_chroma_ac_vlc_length;
+    uint8_t *intra_chroma_ac_vlc_last_length;
     uint8_t *inter_ac_vlc_length;
     uint8_t *inter_ac_vlc_last_length;
     uint8_t *luma_dc_vlc_length;
 #define UNI_AC_ENC_INDEX(run,level) ((run)*128 + (level))
 
-    int coded_score[8];
+    int coded_score[12];
 
     /** precomputed matrix (combine qscale and DCT renorm) */
     int (*q_intra_matrix)[64];
+    int (*q_chroma_intra_matrix)[64];
     int (*q_inter_matrix)[64];
     /** identical to the above but for MMX & these are not permutated, second 64 entries are bias*/
     uint16_t (*q_intra_matrix16)[2][64];
+    uint16_t (*q_chroma_intra_matrix16)[2][64];
     uint16_t (*q_inter_matrix16)[2][64];
 
     /* noise reduction */
@@ -323,6 +333,7 @@ typedef struct MpegEncContext {
     /* bit rate control */
     int64_t total_bits;
     int frame_bits;                ///< bits used for the current frame
+    int stuffing_bits;             ///< bits used for stuffing
     int next_lambda;               ///< next lambda used for retrying to encode a frame
     RateControlContext rc_context; ///< contains stuff only accessed in ratecontrol.c
 
@@ -354,6 +365,7 @@ typedef struct MpegEncContext {
     int prev_mb_info, last_mb_info;
     uint8_t *mb_info_ptr;
     int mb_info_size;
+    int ehc_mode;
     int rc_strategy;
 
     /* H.263+ specific */
@@ -405,6 +417,7 @@ typedef struct MpegEncContext {
 
     /* MJPEG specific */
     struct MJpegContext *mjpeg_ctx;
+    int esc_pos;
 
     /* MSMPEG4 specific */
     int mv_table_index;
@@ -446,11 +459,13 @@ typedef struct MpegEncContext {
     int q_scale_type;
     int intra_vlc_format;
     int alternate_scan;
+    int seq_disp_ext;
     int repeat_first_field;
     int chroma_420_type;
     int chroma_format;
 #define CHROMA_420 1
 #define CHROMA_422 2
+#define CHROMA_444 3
     int chroma_x_shift;//depend on pix_format, that depend on chroma_format
     int chroma_y_shift;
 
@@ -464,7 +479,12 @@ typedef struct MpegEncContext {
     /* RTP specific */
     int rtp_mode;
 
+    char *tc_opt_str;        ///< timecode option string
+    AVTimecode tc;           ///< timecode context
+
     uint8_t *ptr_lastgob;
+    int swap_uv;             //vcr2 codec is an MPEG-2 variant with U and V swapped
+    int pack_pblocks;        //xvmc needs to keep blocks without gaps.
     int16_t (*pblocks[12])[64];
 
     int16_t (*block)[64]; ///< points to one of the following blocks
@@ -500,7 +520,7 @@ typedef struct MpegEncContext {
 
     /**
      * ratecontrol qmin qmax limiting method
-     * 0-> clipping, 1-> use a nice continuous function to limit qscale wthin qmin/qmax.
+     * 0-> clipping, 1-> use a nice continuous function to limit qscale within qmin/qmax.
      */
     float rc_qsquish;
     float rc_qmod_amp;
@@ -509,6 +529,7 @@ typedef struct MpegEncContext {
     float rc_buffer_aggressivity;
     float border_masking;
     int lmin, lmax;
+    int vbv_ignore_qmax;
 
     char *rc_eq;
 
@@ -535,7 +556,9 @@ typedef struct MpegEncContext {
 #define FF_MPV_FLAG_NAQ          0x0010
 #define FF_MPV_FLAG_MV0          0x0020
 
+#ifndef FF_MPV_OFFSET
 #define FF_MPV_OFFSET(x) offsetof(MpegEncContext, x)
+#endif
 #define FF_MPV_OPT_FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
 #define FF_MPV_COMMON_OPTS \
 { "mpv_flags",      "Flags common for all mpegvideo-based encoders.", FF_MPV_OFFSET(mpv_flags), AV_OPT_TYPE_FLAGS, { .i64 = 0 }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS, "mpv_flags" },\
@@ -583,16 +606,21 @@ extern const AVOption ff_mpv_generic_options[];
  */
 void ff_mpv_common_defaults(MpegEncContext *s);
 
+void ff_dct_encode_init_x86(MpegEncContext *s);
+
 int ff_mpv_common_init(MpegEncContext *s);
 void ff_mpv_common_init_arm(MpegEncContext *s);
+void ff_mpv_common_init_axp(MpegEncContext *s);
 void ff_mpv_common_init_neon(MpegEncContext *s);
 void ff_mpv_common_init_ppc(MpegEncContext *s);
 void ff_mpv_common_init_x86(MpegEncContext *s);
+void ff_mpv_common_init_mips(MpegEncContext *s);
 
 int ff_mpv_common_frame_size_change(MpegEncContext *s);
 void ff_mpv_common_end(MpegEncContext *s);
 
 void ff_mpv_decode_defaults(MpegEncContext *s);
+void ff_mpv_decode_init(MpegEncContext *s, AVCodecContext *avctx);
 void ff_mpv_decode_mb(MpegEncContext *s, int16_t block[12][64]);
 void ff_mpv_report_decode_progress(MpegEncContext *s);
 
@@ -605,11 +633,20 @@ void ff_mpv_encode_init_x86(MpegEncContext *s);
 int ff_mpv_encode_end(AVCodecContext *avctx);
 int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
                           const AVFrame *frame, int *got_packet);
+int ff_mpv_reallocate_putbitbuffer(MpegEncContext *s, size_t threshold, size_t size_increase);
 
 void ff_clean_intra_table_entries(MpegEncContext *s);
 void ff_mpeg_draw_horiz_band(MpegEncContext *s, int y, int h);
 void ff_mpeg_flush(AVCodecContext *avctx);
-void ff_print_debug_info(MpegEncContext *s, Picture *p);
+
+void ff_print_debug_info(MpegEncContext *s, Picture *p, AVFrame *pict);
+void ff_print_debug_info2(AVCodecContext *avctx, AVFrame *pict, uint8_t *mbskip_table,
+                         uint32_t *mbtype_table, int8_t *qscale_table, int16_t (*motion_val[2])[2],
+                         int *low_delay,
+                         int mb_width, int mb_height, int mb_stride, int quarter_sample);
+
+int ff_mpv_export_qp_table(MpegEncContext *s, AVFrame *f, Picture *p, int qp_type);
+
 void ff_write_quant_matrix(PutBitContext *pb, uint16_t *matrix);
 
 int ff_update_duplicate_context(MpegEncContext *dst, MpegEncContext *src);
@@ -617,6 +654,7 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst, const AVCodecContext *src
 void ff_set_qscale(MpegEncContext * s, int qscale);
 
 void ff_mpv_idct_init(MpegEncContext *s);
+int ff_dct_encode_init(MpegEncContext *s);
 void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[2][64],
                        const uint16_t *quant_matrix, int bias, int qmin, int qmax, int intra);
 int ff_dct_quantize_c(MpegEncContext *s, int16_t *block, int n, int qscale, int *overflow);
@@ -631,7 +669,7 @@ void ff_mpv_motion(MpegEncContext *s,
                    qpel_mc_func (*qpix_op)[16]);
 
 static inline void ff_update_block_index(MpegEncContext *s){
-    const int block_size = 8;
+    const int block_size= 8 >> s->avctx->lowres;
 
     s->block_index[0]+=2;
     s->block_index[1]+=2;
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index e5ff3ed70e..8e565c12a7 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -5,23 +5,27 @@
  *
  * 4MV & hq & B-frame encoding stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+/*
+ * non linear quantizers with large QPs and VBV with restrictive qmin fixes sponsored by NOA GmbH
+ */
+
 /**
  * @file
  * The simplest mpeg encoder (well, it was the simplest!).
@@ -61,11 +65,12 @@
 #include "wmv2.h"
 #include "rv10.h"
 #include <limits.h>
+#include "sp5x.h"
 
 #define QUANT_BIAS_SHIFT 8
 
 #define QMAT_SHIFT_MMX 16
-#define QMAT_SHIFT 22
+#define QMAT_SHIFT 21
 
 static int encode_picture(MpegEncContext *s, int picture_number);
 static int dct_quantize_refine(MpegEncContext *s, int16_t *block, int16_t *weight, int16_t *orig, int n, int qscale);
@@ -92,6 +97,11 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64],
 
     for (qscale = qmin; qscale <= qmax; qscale++) {
         int i;
+        int qscale2;
+
+        if (s->q_scale_type) qscale2 = ff_mpeg2_non_linear_qscale[qscale];
+        else                 qscale2 = qscale << 1;
+
         if (fdsp->fdct == ff_jpeg_fdct_islow_8  ||
 #if CONFIG_FAANDCT
             fdsp->fdct == ff_faandct            ||
@@ -99,40 +109,40 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64],
             fdsp->fdct == ff_jpeg_fdct_islow_10) {
             for (i = 0; i < 64; i++) {
                 const int j = s->idsp.idct_permutation[i];
-                int64_t den = (int64_t) qscale * quant_matrix[j];
+                int64_t den = (int64_t) qscale2 * quant_matrix[j];
                 /* 16 <= qscale * quant_matrix[i] <= 7905
                  * Assume x = ff_aanscales[i] * qscale * quant_matrix[i]
                  *             19952 <=              x  <= 249205026
                  * (1 << 36) / 19952 >= (1 << 36) / (x) >= (1 << 36) / 249205026
                  *           3444240 >= (1 << 36) / (x) >= 275 */
 
-                qmat[qscale][i] = (int)((UINT64_C(1) << QMAT_SHIFT) / den);
+                qmat[qscale][i] = (int)((UINT64_C(2) << QMAT_SHIFT) / den);
             }
         } else if (fdsp->fdct == ff_fdct_ifast) {
             for (i = 0; i < 64; i++) {
                 const int j = s->idsp.idct_permutation[i];
-                int64_t den = ff_aanscales[i] * (int64_t) qscale * quant_matrix[j];
+                int64_t den = ff_aanscales[i] * (int64_t) qscale2 * quant_matrix[j];
                 /* 16 <= qscale * quant_matrix[i] <= 7905
                  * Assume x = ff_aanscales[i] * qscale * quant_matrix[i]
                  *             19952 <=              x  <= 249205026
                  * (1 << 36) / 19952 >= (1 << 36) / (x) >= (1 << 36) / 249205026
                  *           3444240 >= (1 << 36) / (x) >= 275 */
 
-                qmat[qscale][i] = (int)((UINT64_C(1) << (QMAT_SHIFT + 14)) / den);
+                qmat[qscale][i] = (int)((UINT64_C(2) << (QMAT_SHIFT + 14)) / den);
             }
         } else {
             for (i = 0; i < 64; i++) {
                 const int j = s->idsp.idct_permutation[i];
-                int64_t den = (int64_t) qscale * quant_matrix[j];
+                int64_t den = (int64_t) qscale2 * quant_matrix[j];
                 /* We can safely suppose that 16 <= quant_matrix[i] <= 255
                  * Assume x = qscale * quant_matrix[i]
                  * So             16 <=              x  <= 7905
                  * so (1 << 19) / 16 >= (1 << 19) / (x) >= (1 << 19) / 7905
                  * so          32768 >= (1 << 19) / (x) >= 67 */
-                qmat[qscale][i] = (int)((UINT64_C(1) << QMAT_SHIFT) / den);
+                qmat[qscale][i] = (int)((UINT64_C(2) << QMAT_SHIFT) / den);
                 //qmat  [qscale][i] = (1 << QMAT_SHIFT_MMX) /
                 //                    (qscale * quant_matrix[i]);
-                qmat16[qscale][0][i] = (1 << QMAT_SHIFT_MMX) / den;
+                qmat16[qscale][0][i] = (2 << QMAT_SHIFT_MMX) / den;
 
                 if (qmat16[qscale][0][i] == 0 ||
                     qmat16[qscale][0][i] == 128 * 256)
@@ -162,9 +172,27 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64],
 
 static inline void update_qscale(MpegEncContext *s)
 {
-    s->qscale = (s->lambda * 139 + FF_LAMBDA_SCALE * 64) >>
-                (FF_LAMBDA_SHIFT + 7);
-    s->qscale = av_clip(s->qscale, s->avctx->qmin, s->avctx->qmax);
+    if (s->q_scale_type == 1 && 0) {
+        int i;
+        int bestdiff=INT_MAX;
+        int best = 1;
+
+        for (i = 0 ; i<FF_ARRAY_ELEMS(ff_mpeg2_non_linear_qscale); i++) {
+            int diff = FFABS((ff_mpeg2_non_linear_qscale[i]<<(FF_LAMBDA_SHIFT + 6)) - (int)s->lambda * 139);
+            if (ff_mpeg2_non_linear_qscale[i] < s->avctx->qmin ||
+                (ff_mpeg2_non_linear_qscale[i] > s->avctx->qmax && !s->vbv_ignore_qmax))
+                continue;
+            if (diff < bestdiff) {
+                bestdiff = diff;
+                best = i;
+            }
+        }
+        s->qscale = best;
+    } else {
+        s->qscale = (s->lambda * 139 + FF_LAMBDA_SCALE * 64) >>
+                    (FF_LAMBDA_SHIFT + 7);
+        s->qscale = av_clip(s->qscale, s->avctx->qmin, s->vbv_ignore_qmax ? 31 : s->avctx->qmax);
+    }
 
     s->lambda2 = (s->lambda * s->lambda + FF_LAMBDA_SCALE / 2) >>
                  FF_LAMBDA_SHIFT;
@@ -237,6 +265,23 @@ static void mpv_encode_defaults(MpegEncContext *s)
     s->picture_in_gop_number = 0;
 }
 
+av_cold int ff_dct_encode_init(MpegEncContext *s) {
+    if (ARCH_X86)
+        ff_dct_encode_init_x86(s);
+
+    if (CONFIG_H263_ENCODER)
+        ff_h263dsp_init(&s->h263dsp);
+    if (!s->dct_quantize)
+        s->dct_quantize = ff_dct_quantize_c;
+    if (!s->denoise_dct)
+        s->denoise_dct  = denoise_dct_c;
+    s->fast_dct_quantize = s->dct_quantize;
+    if (s->avctx->trellis)
+        s->dct_quantize  = dct_quantize_trellis_c;
+
+    return 0;
+}
+
 /* init video encoder */
 av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
 {
@@ -255,18 +300,22 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
         }
         break;
     case AV_CODEC_ID_MJPEG:
+    case AV_CODEC_ID_AMV:
         format_supported = 0;
         /* JPEG color space */
         if (avctx->pix_fmt == AV_PIX_FMT_YUVJ420P ||
             avctx->pix_fmt == AV_PIX_FMT_YUVJ422P ||
+            avctx->pix_fmt == AV_PIX_FMT_YUVJ444P ||
             (avctx->color_range == AVCOL_RANGE_JPEG &&
              (avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
-              avctx->pix_fmt == AV_PIX_FMT_YUV422P)))
+              avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
+              avctx->pix_fmt == AV_PIX_FMT_YUV444P)))
             format_supported = 1;
         /* MPEG color space */
         else if (avctx->strict_std_compliance <= FF_COMPLIANCE_UNOFFICIAL &&
                  (avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
-                  avctx->pix_fmt == AV_PIX_FMT_YUV422P))
+                  avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
+                  avctx->pix_fmt == AV_PIX_FMT_YUV444P))
             format_supported = 1;
 
         if (!format_supported) {
@@ -282,6 +331,10 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
     }
 
     switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUVJ444P:
+    case AV_PIX_FMT_YUV444P:
+        s->chroma_format = CHROMA_444;
+        break;
     case AV_PIX_FMT_YUVJ422P:
     case AV_PIX_FMT_YUV422P:
         s->chroma_format = CHROMA_422;
@@ -298,8 +351,9 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
     s->height   = avctx->height;
     if (avctx->gop_size > 600 &&
         avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Warning keyframe interval too large! reducing it ...\n");
+        av_log(avctx, AV_LOG_WARNING,
+               "keyframe interval too large!, reducing it from %d to %d\n",
+               avctx->gop_size, 600);
         avctx->gop_size = 600;
     }
     s->gop_size     = avctx->gop_size;
@@ -307,6 +361,7 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
     if (avctx->max_b_frames > MAX_B_FRAMES) {
         av_log(avctx, AV_LOG_ERROR, "Too many B-frames requested, maximum "
                "is %d.\n", MAX_B_FRAMES);
+        avctx->max_b_frames = MAX_B_FRAMES;
     }
     s->max_b_frames = avctx->max_b_frames;
     s->codec_id     = avctx->codec->id;
@@ -315,6 +370,24 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
     s->mpeg_quant         = avctx->mpeg_quant;
     s->rtp_mode           = !!avctx->rtp_payload_size;
     s->intra_dc_precision = avctx->intra_dc_precision;
+
+    // workaround some differences between how applications specify dc precision
+    if (s->intra_dc_precision < 0) {
+        s->intra_dc_precision += 8;
+    } else if (s->intra_dc_precision >= 8)
+        s->intra_dc_precision -= 8;
+
+    if (s->intra_dc_precision < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+                "intra dc precision must be positive, note some applications use"
+                " 0 and some 8 as base meaning 8bit, the value must not be smaller than that\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (s->intra_dc_precision > (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO ? 3 : 0)) {
+        av_log(avctx, AV_LOG_ERROR, "intra dc precision too large\n");
+        return AVERROR(EINVAL);
+    }
     s->user_specified_pts = AV_NOPTS_VALUE;
 
     if (s->gop_size <= 1) {
@@ -352,9 +425,33 @@ FF_ENABLE_DEPRECATION_WARNINGS
     s->loop_filter = !!(s->avctx->flags & AV_CODEC_FLAG_LOOP_FILTER);
 
     if (avctx->rc_max_rate && !avctx->rc_buffer_size) {
-        av_log(avctx, AV_LOG_ERROR,
-               "a vbv buffer size is needed, "
-               "for encoding with a maximum bitrate\n");
+        switch(avctx->codec_id) {
+        case AV_CODEC_ID_MPEG1VIDEO:
+        case AV_CODEC_ID_MPEG2VIDEO:
+            avctx->rc_buffer_size = FFMAX(avctx->rc_max_rate, 15000000) * 112LL / 15000000 * 16384;
+            break;
+        case AV_CODEC_ID_MPEG4:
+        case AV_CODEC_ID_MSMPEG4V1:
+        case AV_CODEC_ID_MSMPEG4V2:
+        case AV_CODEC_ID_MSMPEG4V3:
+            if       (avctx->rc_max_rate >= 15000000) {
+                avctx->rc_buffer_size = 320 + (avctx->rc_max_rate - 15000000LL) * (760-320) / (38400000 - 15000000);
+            } else if(avctx->rc_max_rate >=  2000000) {
+                avctx->rc_buffer_size =  80 + (avctx->rc_max_rate -  2000000LL) * (320- 80) / (15000000 -  2000000);
+            } else if(avctx->rc_max_rate >=   384000) {
+                avctx->rc_buffer_size =  40 + (avctx->rc_max_rate -   384000LL) * ( 80- 40) / ( 2000000 -   384000);
+            } else
+                avctx->rc_buffer_size = 40;
+            avctx->rc_buffer_size *= 16384;
+            break;
+        }
+        if (avctx->rc_buffer_size) {
+            av_log(avctx, AV_LOG_INFO, "Automatically choosing VBV buffer size of %d kbyte\n", avctx->rc_buffer_size/8192);
+        }
+    }
+
+    if ((!avctx->rc_max_rate) != (!avctx->rc_buffer_size)) {
+        av_log(avctx, AV_LOG_ERROR, "Either both buffer size and max rate or neither must be specified\n");
         return -1;
     }
 
@@ -369,7 +466,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     if (avctx->rc_max_rate && avctx->rc_max_rate < avctx->bit_rate) {
-        av_log(avctx, AV_LOG_INFO, "bitrate above max bitrate\n");
+        av_log(avctx, AV_LOG_ERROR, "bitrate above max bitrate\n");
         return -1;
     }
 
@@ -390,9 +487,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (!s->fixed_qscale &&
         avctx->bit_rate * av_q2d(avctx->time_base) >
             avctx->bit_rate_tolerance) {
-        av_log(avctx, AV_LOG_ERROR,
-               "bitrate tolerance too small for bitrate\n");
-        return -1;
+        av_log(avctx, AV_LOG_WARNING,
+               "bitrate tolerance %d too small for bitrate %"PRId64", overriding\n", avctx->bit_rate_tolerance, (int64_t)avctx->bit_rate);
+        avctx->bit_rate_tolerance = 5 * avctx->bit_rate * av_q2d(avctx->time_base);
     }
 
     if (s->avctx->rc_max_rate &&
@@ -431,18 +528,74 @@ FF_ENABLE_DEPRECATION_WARNINGS
         av_log(avctx, AV_LOG_ERROR, "b frames not supported by codec\n");
         return -1;
     }
+    if (s->max_b_frames < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "max b frames must be 0 or positive for mpegvideo based encoders\n");
+        return -1;
+    }
 
     if ((s->codec_id == AV_CODEC_ID_MPEG4 ||
          s->codec_id == AV_CODEC_ID_H263  ||
          s->codec_id == AV_CODEC_ID_H263P) &&
         (avctx->sample_aspect_ratio.num > 255 ||
          avctx->sample_aspect_ratio.den > 255)) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Invalid pixel aspect ratio %i/%i, limit is 255/255\n",
+        av_log(avctx, AV_LOG_WARNING,
+               "Invalid pixel aspect ratio %i/%i, limit is 255/255 reducing\n",
                avctx->sample_aspect_ratio.num, avctx->sample_aspect_ratio.den);
+        av_reduce(&avctx->sample_aspect_ratio.num, &avctx->sample_aspect_ratio.den,
+                   avctx->sample_aspect_ratio.num,  avctx->sample_aspect_ratio.den, 255);
+    }
+
+    if ((s->codec_id == AV_CODEC_ID_H263  ||
+         s->codec_id == AV_CODEC_ID_H263P) &&
+        (avctx->width  > 2048 ||
+         avctx->height > 1152 )) {
+        av_log(avctx, AV_LOG_ERROR, "H.263 does not support resolutions above 2048x1152\n");
+        return -1;
+    }
+    if ((s->codec_id == AV_CODEC_ID_H263  ||
+         s->codec_id == AV_CODEC_ID_H263P) &&
+        ((avctx->width &3) ||
+         (avctx->height&3) )) {
+        av_log(avctx, AV_LOG_ERROR, "w/h must be a multiple of 4\n");
         return -1;
     }
 
+    if (s->codec_id == AV_CODEC_ID_MPEG1VIDEO &&
+        (avctx->width  > 4095 ||
+         avctx->height > 4095 )) {
+        av_log(avctx, AV_LOG_ERROR, "MPEG-1 does not support resolutions above 4095x4095\n");
+        return -1;
+    }
+
+    if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO &&
+        (avctx->width  > 16383 ||
+         avctx->height > 16383 )) {
+        av_log(avctx, AV_LOG_ERROR, "MPEG-2 does not support resolutions above 16383x16383\n");
+        return -1;
+    }
+
+    if (s->codec_id == AV_CODEC_ID_RV10 &&
+        (avctx->width &15 ||
+         avctx->height&15 )) {
+        av_log(avctx, AV_LOG_ERROR, "width and height must be a multiple of 16\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (s->codec_id == AV_CODEC_ID_RV20 &&
+        (avctx->width &3 ||
+         avctx->height&3 )) {
+        av_log(avctx, AV_LOG_ERROR, "width and height must be a multiple of 4\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((s->codec_id == AV_CODEC_ID_WMV1 ||
+         s->codec_id == AV_CODEC_ID_WMV2) &&
+         avctx->width & 1) {
+         av_log(avctx, AV_LOG_ERROR, "width must be multiple of 2\n");
+         return -1;
+    }
+
     if ((s->avctx->flags & (AV_CODEC_FLAG_INTERLACED_DCT | AV_CODEC_FLAG_INTERLACED_ME)) &&
         s->codec_id != AV_CODEC_ID_MPEG4 && s->codec_id != AV_CODEC_ID_MPEG2VIDEO) {
         av_log(avctx, AV_LOG_ERROR, "interlacing not supported by codec\n");
@@ -450,7 +603,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     // FIXME mpeg2 uses that too
-    if (s->mpeg_quant && s->codec_id != AV_CODEC_ID_MPEG4) {
+    if (s->mpeg_quant && (   s->codec_id != AV_CODEC_ID_MPEG4
+                          && s->codec_id != AV_CODEC_ID_MPEG2VIDEO)) {
         av_log(avctx, AV_LOG_ERROR,
                "mpeg2 style quantization not supported by codec\n");
         return -1;
@@ -489,9 +643,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     if (s->q_scale_type == 1) {
-        if (avctx->qmax > 12) {
+        if (avctx->qmax > 28) {
             av_log(avctx, AV_LOG_ERROR,
-                   "non linear quant only supports qmax <= 12 currently\n");
+                   "non linear quant only supports qmax <= 28 currently\n");
             return -1;
         }
     }
@@ -500,6 +654,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         s->codec_id != AV_CODEC_ID_MPEG4      &&
         s->codec_id != AV_CODEC_ID_MPEG1VIDEO &&
         s->codec_id != AV_CODEC_ID_MPEG2VIDEO &&
+        s->codec_id != AV_CODEC_ID_MJPEG      &&
         (s->codec_id != AV_CODEC_ID_H263P)) {
         av_log(avctx, AV_LOG_ERROR,
                "multi threaded encoding not supported by codec\n");
@@ -508,14 +663,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     if (s->avctx->thread_count < 1) {
         av_log(avctx, AV_LOG_ERROR,
-               "automatic thread number detection not supported by codec,"
+               "automatic thread number detection not supported by codec, "
                "patch welcome\n");
         return -1;
     }
 
-    if (s->avctx->thread_count > 1)
+    if (s->avctx->slices > 1 || s->avctx->thread_count > 1)
         s->rtp_mode = 1;
 
+    if (s->avctx->thread_count > 1 && s->codec_id == AV_CODEC_ID_H263P)
+        s->h263_slice_structured = 1;
+
     if (!avctx->time_base.den || !avctx->time_base.num) {
         av_log(avctx, AV_LOG_ERROR, "framerate not set\n");
         return -1;
@@ -535,8 +693,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         //return -1;
     }
 
-    if (s->mpeg_quant || s->codec_id == AV_CODEC_ID_MPEG1VIDEO ||
-        s->codec_id == AV_CODEC_ID_MPEG2VIDEO || s->codec_id == AV_CODEC_ID_MJPEG) {
+    if (s->mpeg_quant || s->codec_id == AV_CODEC_ID_MPEG1VIDEO || s->codec_id == AV_CODEC_ID_MPEG2VIDEO || s->codec_id == AV_CODEC_ID_MJPEG || s->codec_id==AV_CODEC_ID_AMV) {
         // (a + x * 3 / 8) / x
         s->intra_quant_bias = 3 << (QUANT_BIAS_SHIFT - 3);
         s->inter_quant_bias = 0;
@@ -546,6 +703,11 @@ FF_ENABLE_DEPRECATION_WARNINGS
         s->inter_quant_bias = -(1 << (QUANT_BIAS_SHIFT - 2));
     }
 
+    if (avctx->qmin > avctx->qmax || avctx->qmin <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "qmin and or qmax are invalid, they must be 0 < min <= max\n");
+        return AVERROR(EINVAL);
+    }
+
 #if FF_API_QUANT_BIAS
 FF_DISABLE_DEPRECATION_WARNINGS
     if (avctx->intra_quant_bias != FF_DEFAULT_QUANT_BIAS)
@@ -555,6 +717,8 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
+    av_log(avctx, AV_LOG_DEBUG, "intra_quant_bias = %d inter_quant_bias = %d\n",s->intra_quant_bias,s->inter_quant_bias);
+
     if (avctx->codec_id == AV_CODEC_ID_MPEG4 &&
         s->avctx->time_base.den > (1 << 16) - 1) {
         av_log(avctx, AV_LOG_ERROR,
@@ -579,6 +743,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         s->rtp_mode   = 1;
         break;
     case AV_CODEC_ID_MJPEG:
+    case AV_CODEC_ID_AMV:
         s->out_format = FMT_MJPEG;
         s->intra_only = 1; /* force intra only for jpeg */
         if (!CONFIG_MJPEG_ENCODER ||
@@ -604,13 +769,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
         break;
     case AV_CODEC_ID_H263:
         if (!CONFIG_H263_ENCODER)
-        return -1;
+            return -1;
         if (ff_match_2uint16(ff_h263_format, FF_ARRAY_ELEMS(ff_h263_format),
                              s->width, s->height) == 8) {
-            av_log(avctx, AV_LOG_INFO,
+            av_log(avctx, AV_LOG_ERROR,
                    "The specified picture size of %dx%d is not valid for "
                    "the H.263 codec.\nValid sizes are 128x96, 176x144, "
-                   "352x288, 704x576, and 1408x1152."
+                   "352x288, 704x576, and 1408x1152. "
                    "Try H.263+.\n", s->width, s->height);
             return -1;
         }
@@ -715,9 +880,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (ff_mpv_common_init(s) < 0)
         return -1;
 
-    if (ARCH_X86)
-        ff_mpv_encode_init_x86(s);
-
     ff_fdctdsp_init(&s->fdsp, avctx);
     ff_me_cmp_init(&s->mecc, avctx);
     ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
@@ -732,8 +894,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
     FF_ALLOCZ_OR_GOTO(s->avctx, s->avctx->stats_out, 256, fail);
 
     FF_ALLOCZ_OR_GOTO(s->avctx, s->q_intra_matrix,   64 * 32 * sizeof(int), fail);
+    FF_ALLOCZ_OR_GOTO(s->avctx, s->q_chroma_intra_matrix, 64 * 32 * sizeof(int), fail);
     FF_ALLOCZ_OR_GOTO(s->avctx, s->q_inter_matrix,   64 * 32 * sizeof(int), fail);
     FF_ALLOCZ_OR_GOTO(s->avctx, s->q_intra_matrix16, 64 * 32 * 2 * sizeof(uint16_t), fail);
+    FF_ALLOCZ_OR_GOTO(s->avctx, s->q_chroma_intra_matrix16, 64 * 32 * 2 * sizeof(uint16_t), fail);
     FF_ALLOCZ_OR_GOTO(s->avctx, s->q_inter_matrix16, 64 * 32 * 2 * sizeof(uint16_t), fail);
     FF_ALLOCZ_OR_GOTO(s->avctx, s->input_picture,
                       MAX_PICTURE_COUNT * sizeof(Picture *), fail);
@@ -745,15 +909,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                           2 * 64 * sizeof(uint16_t), fail);
     }
 
-    if (CONFIG_H263_ENCODER)
-        ff_h263dsp_init(&s->h263dsp);
-    if (!s->dct_quantize)
-        s->dct_quantize = ff_dct_quantize_c;
-    if (!s->denoise_dct)
-        s->denoise_dct  = denoise_dct_c;
-    s->fast_dct_quantize = s->dct_quantize;
-    if (avctx->trellis)
-        s->dct_quantize  = dct_quantize_trellis_c;
+    ff_dct_encode_init(s);
 
     if ((CONFIG_H263P_ENCODER || CONFIG_RV20_ENCODER) && s->modified_quant)
         s->chroma_qscale_table = ff_h263_chroma_qscale_table;
@@ -786,6 +942,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             s->inter_matrix[j] = ff_mpeg1_default_non_intra_matrix[i];
         } else {
             /* mpeg1/2 */
+            s->chroma_intra_matrix[j] =
             s->intra_matrix[j] = ff_mpeg1_default_intra_matrix[i];
             s->inter_matrix[j] = ff_mpeg1_default_non_intra_matrix[i];
         }
@@ -901,6 +1058,10 @@ av_cold int ff_mpv_encode_end(AVCodecContext *avctx)
     av_freep(&s->avctx->stats_out);
     av_freep(&s->ac_stats);
 
+    if(s->q_chroma_intra_matrix   != s->q_intra_matrix  ) av_freep(&s->q_chroma_intra_matrix);
+    if(s->q_chroma_intra_matrix16 != s->q_intra_matrix16) av_freep(&s->q_chroma_intra_matrix16);
+    s->q_chroma_intra_matrix=   NULL;
+    s->q_chroma_intra_matrix16= NULL;
     av_freep(&s->q_intra_matrix);
     av_freep(&s->q_inter_matrix);
     av_freep(&s->q_intra_matrix16);
@@ -953,7 +1114,7 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared)
 {
     return ff_alloc_picture(s->avctx, pic, &s->me, &s->sc, shared, 1,
                             s->chroma_x_shift, s->chroma_y_shift, s->out_format,
-                            s->mb_stride, s->mb_height, s->b8_stride,
+                            s->mb_stride, s->mb_width, s->mb_height, s->b8_stride,
                             &s->linesize, &s->uvlinesize);
 }
 
@@ -972,18 +1133,17 @@ static int load_input_picture(MpegEncContext *s, const AVFrame *pic_arg)
 
         if (pts != AV_NOPTS_VALUE) {
             if (s->user_specified_pts != AV_NOPTS_VALUE) {
-                int64_t time = pts;
                 int64_t last = s->user_specified_pts;
 
-                if (time <= last) {
+                if (pts <= last) {
                     av_log(s->avctx, AV_LOG_ERROR,
-                           "Error, Invalid timestamp=%"PRId64", "
-                           "last=%"PRId64"\n", pts, s->user_specified_pts);
-                    return -1;
+                           "Invalid pts (%"PRId64") <= last (%"PRId64")\n",
+                           pts, last);
+                    return AVERROR(EINVAL);
                 }
 
                 if (!s->low_delay && display_picture_number == 1)
-                    s->dts_delta = time - last;
+                    s->dts_delta = pts - last;
             }
             s->user_specified_pts = pts;
         } else {
@@ -1007,8 +1167,12 @@ static int load_input_picture(MpegEncContext *s, const AVFrame *pic_arg)
             direct = 0;
         if ((s->width & 15) || (s->height & 15))
             direct = 0;
+        if (((intptr_t)(pic_arg->data[0])) & (STRIDE_ALIGN-1))
+            direct = 0;
+        if (s->linesize & (STRIDE_ALIGN-1))
+            direct = 0;
 
-        ff_dlog(s->avctx, "%d %d %td %td\n", pic_arg->linesize[0],
+        ff_dlog(s->avctx, "%d %d %"PTRDIFF_SPECIFIER" %"PTRDIFF_SPECIFIER"\n", pic_arg->linesize[0],
                 pic_arg->linesize[1], s->linesize, s->uvlinesize);
 
         i = ff_find_unused_picture(s->avctx, s->picture, direct);
@@ -1046,6 +1210,12 @@ static int load_input_picture(MpegEncContext *s, const AVFrame *pic_arg)
                     int h = s->height >> v_shift;
                     uint8_t *src = pic_arg->data[i];
                     uint8_t *dst = pic->f->data[i];
+                    int vpad = 16;
+
+                    if (   s->codec_id == AV_CODEC_ID_MPEG2VIDEO
+                        && !s->progressive_sequence
+                        && FFALIGN(s->height, 32) - s->height > 16)
+                        vpad = 32;
 
                     if (!s->avctx->rc_buffer_size)
                         dst += INPLACE_OFFSET;
@@ -1061,11 +1231,11 @@ static int load_input_picture(MpegEncContext *s, const AVFrame *pic_arg)
                             src += src_stride;
                         }
                     }
-                    if ((s->width & 15) || (s->height & 15)) {
+                    if ((s->width & 15) || (s->height & (vpad-1))) {
                         s->mpvencdsp.draw_edges(dst, dst_stride,
                                                 w, h,
                                                 16 >> h_shift,
-                                                16 >> v_shift,
+                                                vpad >> v_shift,
                                                 EDGE_BOTTOM);
                     }
                 }
@@ -1104,19 +1274,23 @@ static int skip_check(MpegEncContext *s, Picture *p, Picture *ref)
                 uint8_t *rptr = ref->f->data[plane] + 8 * (x + y * stride);
                 int v = s->mecc.frame_skip_cmp[1](s, dptr, rptr, stride, 8);
 
-                switch (s->avctx->frame_skip_exp) {
+                switch (FFABS(s->avctx->frame_skip_exp)) {
                 case 0: score    =  FFMAX(score, v);          break;
                 case 1: score   += FFABS(v);                  break;
-                case 2: score   += v * v;                     break;
-                case 3: score64 += FFABS(v * v * (int64_t)v); break;
-                case 4: score64 += v * v * (int64_t)(v * v);  break;
+                case 2: score64 += v * (int64_t)v;                       break;
+                case 3: score64 += FFABS(v * (int64_t)v * v);            break;
+                case 4: score64 += (v * (int64_t)v) * (v * (int64_t)v);  break;
                 }
             }
         }
     }
+    emms_c();
 
     if (score)
         score64 = score;
+    if (s->avctx->frame_skip_exp < 0)
+        score64 = pow(score64 / (double)(s->mb_width * s->mb_height),
+                      -1.0/s->avctx->frame_skip_exp);
 
     if (score64 < s->avctx->frame_skip_threshold)
         return 1;
@@ -1151,7 +1325,7 @@ static int estimate_best_b_count(MpegEncContext *s)
 
     if (!c)
         return AVERROR(ENOMEM);
-    assert(scale >= 0 && scale <= 3);
+    av_assert0(scale >= 0 && scale <= 3);
 
     //emms_c();
     //s->next_picture_ptr->quality;
@@ -1181,29 +1355,31 @@ static int estimate_best_b_count(MpegEncContext *s)
     for (i = 0; i < s->max_b_frames + 2; i++) {
         Picture pre_input, *pre_input_ptr = i ? s->input_picture[i - 1] :
                                                 s->next_picture_ptr;
+        uint8_t *data[4];
 
         if (pre_input_ptr && (!i || s->input_picture[i - 1])) {
             pre_input = *pre_input_ptr;
+            memcpy(data, pre_input_ptr->f->data, sizeof(data));
 
             if (!pre_input.shared && i) {
-                pre_input.f->data[0] += INPLACE_OFFSET;
-                pre_input.f->data[1] += INPLACE_OFFSET;
-                pre_input.f->data[2] += INPLACE_OFFSET;
+                data[0] += INPLACE_OFFSET;
+                data[1] += INPLACE_OFFSET;
+                data[2] += INPLACE_OFFSET;
             }
 
             s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[0],
                                        s->tmp_frames[i]->linesize[0],
-                                       pre_input.f->data[0],
+                                       data[0],
                                        pre_input.f->linesize[0],
                                        c->width, c->height);
             s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[1],
                                        s->tmp_frames[i]->linesize[1],
-                                       pre_input.f->data[1],
+                                       data[1],
                                        pre_input.f->linesize[1],
                                        c->width >> 1, c->height >> 1);
             s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[2],
                                        s->tmp_frames[i]->linesize[2],
-                                       pre_input.f->data[2],
+                                       data[2],
                                        pre_input.f->linesize[2],
                                        c->width >> 1, c->height >> 1);
         }
@@ -1266,6 +1442,19 @@ static int select_input_picture(MpegEncContext *s)
 
     /* set next picture type & ordering */
     if (!s->reordered_input_picture[0] && s->input_picture[0]) {
+        if (s->avctx->frame_skip_threshold || s->avctx->frame_skip_factor) {
+            if (s->picture_in_gop_number < s->gop_size &&
+                s->next_picture_ptr &&
+                skip_check(s, s->input_picture[0], s->next_picture_ptr)) {
+                // FIXME check that te gop check above is +-1 correct
+                av_frame_unref(s->input_picture[0]->f);
+
+                ff_vbv_update(s, 0);
+
+                goto no_output_pic;
+            }
+        }
+
         if (/*s->picture_in_gop_number >= s->gop_size ||*/
             !s->next_picture_ptr || s->intra_only) {
             s->reordered_input_picture[0] = s->input_picture[0];
@@ -1275,19 +1464,6 @@ static int select_input_picture(MpegEncContext *s)
         } else {
             int b_frames;
 
-            if (s->avctx->frame_skip_threshold || s->avctx->frame_skip_factor) {
-                if (s->picture_in_gop_number < s->gop_size &&
-                    skip_check(s, s->input_picture[0], s->next_picture_ptr)) {
-                    // FIXME check that te gop check above is +-1 correct
-                    av_frame_unref(s->input_picture[0]->f);
-
-                    emms_c();
-                    ff_vbv_update(s, 0);
-
-                    goto no_output_pic;
-                }
-            }
-
             if (s->avctx->flags & AV_CODEC_FLAG_PASS2) {
                 for (i = 0; i < s->max_b_frames + 1; i++) {
                     int pict_num = s->input_picture[0]->f->display_picture_number + i;
@@ -1436,25 +1612,26 @@ no_output_pic:
 
 static void frame_end(MpegEncContext *s)
 {
-    int i;
-
     if (s->unrestricted_mv &&
         s->current_picture.reference &&
         !s->intra_only) {
         const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(s->avctx->pix_fmt);
         int hshift = desc->log2_chroma_w;
         int vshift = desc->log2_chroma_h;
-        s->mpvencdsp.draw_edges(s->current_picture.f->data[0], s->linesize,
+        s->mpvencdsp.draw_edges(s->current_picture.f->data[0],
+                                s->current_picture.f->linesize[0],
                                 s->h_edge_pos, s->v_edge_pos,
                                 EDGE_WIDTH, EDGE_WIDTH,
                                 EDGE_TOP | EDGE_BOTTOM);
-        s->mpvencdsp.draw_edges(s->current_picture.f->data[1], s->uvlinesize,
+        s->mpvencdsp.draw_edges(s->current_picture.f->data[1],
+                                s->current_picture.f->linesize[1],
                                 s->h_edge_pos >> hshift,
                                 s->v_edge_pos >> vshift,
                                 EDGE_WIDTH >> hshift,
                                 EDGE_WIDTH >> vshift,
                                 EDGE_TOP | EDGE_BOTTOM);
-        s->mpvencdsp.draw_edges(s->current_picture.f->data[2], s->uvlinesize,
+        s->mpvencdsp.draw_edges(s->current_picture.f->data[2],
+                                s->current_picture.f->linesize[2],
                                 s->h_edge_pos >> hshift,
                                 s->v_edge_pos >> vshift,
                                 EDGE_WIDTH >> hshift,
@@ -1469,14 +1646,6 @@ static void frame_end(MpegEncContext *s)
     if (s->pict_type!= AV_PICTURE_TYPE_B)
         s->last_non_b_pict_type = s->pict_type;
 
-    if (s->encoding) {
-        /* release non-reference frames */
-        for (i = 0; i < MAX_PICTURE_COUNT; i++) {
-            if (!s->picture[i].reference)
-                ff_mpeg_unref_picture(s->avctx, &s->picture[i]);
-        }
-    }
-
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
     av_frame_copy_props(s->avctx->coded_frame, s->current_picture.f);
@@ -1576,7 +1745,7 @@ static int frame_start(MpegEncContext *s)
     }
 
     if (s->dct_error_sum) {
-        assert(s->avctx->noise_reduction && s->encoding);
+        av_assert2(s->avctx->noise_reduction && s->encoding);
         update_noise_reduction(s);
     }
 
@@ -1590,6 +1759,8 @@ int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
     int i, stuffing_count, ret;
     int context_count = s->slice_context_count;
 
+    s->vbv_ignore_qmax = 0;
+
     s->picture_in_gop_number++;
 
     if (load_input_picture(s, pic_arg) < 0)
@@ -1601,9 +1772,11 @@ int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
 
     /* output? */
     if (s->new_picture.f->data[0]) {
-        uint8_t *sd;
-        if (!pkt->data &&
-            (ret = ff_alloc_packet(pkt, s->mb_width*s->mb_height*MAX_MB_BYTES)) < 0)
+        int growing_buffer = context_count == 1 && !pkt->data && !s->data_partitioning;
+        int pkt_size = growing_buffer ? FFMAX(s->mb_width*s->mb_height*64+10000, avctx->internal->byte_buffer_size) - AV_INPUT_BUFFER_PADDING_SIZE
+                                              :
+                                              s->mb_width*s->mb_height*(MAX_MB_BYTES+100)+10000;
+        if ((ret = ff_alloc_packet2(avctx, pkt, pkt_size, 0)) < 0)
             return ret;
         if (s->mb_info) {
             s->mb_info_ptr = av_packet_new_side_data(pkt,
@@ -1628,7 +1801,13 @@ int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
         if (ret < 0)
             return ret;
 vbv_retry:
-        if (encode_picture(s, s->picture_number) < 0)
+        ret = encode_picture(s, s->picture_number);
+        if (growing_buffer) {
+            av_assert0(s->pb.buf == avctx->internal->byte_buffer);
+            pkt->data = s->pb.buf;
+            pkt->size = avctx->internal->byte_buffer_size;
+        }
+        if (ret < 0)
             return -1;
 
         avctx->header_bits = s->header_bits;
@@ -1643,28 +1822,24 @@ vbv_retry:
 
         frame_end(s);
 
-        sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR,
-                                     sizeof(int));
-        if (!sd)
-            return AVERROR(ENOMEM);
-        *(int *)sd = s->current_picture.f->quality;
-
         if (CONFIG_MJPEG_ENCODER && s->out_format == FMT_MJPEG)
             ff_mjpeg_encode_picture_trailer(&s->pb, s->header_bits);
 
         if (avctx->rc_buffer_size) {
             RateControlContext *rcc = &s->rc_context;
-            int max_size = rcc->buffer_index * avctx->rc_max_available_vbv_use;
+            int max_size = FFMAX(rcc->buffer_index * avctx->rc_max_available_vbv_use, rcc->buffer_index - 500);
+            int hq = (s->avctx->mb_decision == FF_MB_DECISION_RD || s->avctx->trellis);
+            int min_step = hq ? 1 : (1<<(FF_LAMBDA_SHIFT + 7))/139;
 
             if (put_bits_count(&s->pb) > max_size &&
                 s->lambda < s->lmax) {
-                s->next_lambda = FFMAX(s->lambda + 1, s->lambda *
+                s->next_lambda = FFMAX(s->lambda + min_step, s->lambda *
                                        (s->qscale + 1) / s->qscale);
                 if (s->adaptive_quant) {
                     int i;
                     for (i = 0; i < s->mb_height * s->mb_stride; i++)
                         s->lambda_table[i] =
-                            FFMAX(s->lambda_table[i] + 1,
+                            FFMAX(s->lambda_table[i] + min_step,
                                   s->lambda_table[i] * (s->qscale + 1) /
                                   s->qscale);
                 }
@@ -1684,10 +1859,12 @@ vbv_retry:
                     PutBitContext *pb = &s->thread_context[i]->pb;
                     init_put_bits(pb, pb->buf, pb->buf_end - pb->buf);
                 }
+                s->vbv_ignore_qmax = 1;
+                av_log(s->avctx, AV_LOG_VERBOSE, "reencoding frame due to VBV\n");
                 goto vbv_retry;
             }
 
-            assert(s->avctx->rc_max_rate);
+            av_assert0(s->avctx->rc_max_rate);
         }
 
         if (s->avctx->flags & AV_CODEC_FLAG_PASS1)
@@ -1697,6 +1874,10 @@ vbv_retry:
             s->current_picture_ptr->encoding_error[i] = s->current_picture.encoding_error[i];
             avctx->error[i] += s->current_picture_ptr->encoding_error[i];
         }
+        ff_side_data_set_encoder_stats(pkt, s->current_picture.f->quality,
+                                       s->current_picture_ptr->encoding_error,
+                                       (s->avctx->flags&AV_CODEC_FLAG_PSNR) ? 4 : 0,
+                                       s->pict_type);
 
         if (s->avctx->flags & AV_CODEC_FLAG_PASS1)
             assert(avctx->header_bits + avctx->mv_bits + avctx->misc_bits +
@@ -1706,6 +1887,7 @@ vbv_retry:
         s->frame_bits  = put_bits_count(&s->pb);
 
         stuffing_count = ff_vbv_update(s, s->frame_bits);
+        s->stuffing_bits = 8*stuffing_count;
         if (stuffing_count) {
             if (s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb) >> 3) <
                     stuffing_count + 50) {
@@ -1760,7 +1942,7 @@ vbv_retry:
 
             vbv_delay = FFMAX(vbv_delay, min_delay);
 
-            assert(vbv_delay < 0xFFFF);
+            av_assert0(vbv_delay < 0xFFFF);
 
             s->vbv_delay_ptr[0] &= 0xF8;
             s->vbv_delay_ptr[0] |= vbv_delay >> 13;
@@ -1788,7 +1970,14 @@ vbv_retry:
     } else {
         s->frame_bits = 0;
     }
-    assert((s->frame_bits & 7) == 0);
+
+    /* release non-reference frames */
+    for (i = 0; i < MAX_PICTURE_COUNT; i++) {
+        if (!s->picture[i].reference)
+            ff_mpeg_unref_picture(s->avctx, &s->picture[i]);
+    }
+
+    av_assert1((s->frame_bits & 7) == 0);
 
     pkt->size = s->frame_bits / 8;
     *got_packet = !!pkt->size;
@@ -1912,15 +2101,17 @@ static void get_visual_weight(int16_t *weight, uint8_t *ptr, int stride)
 static av_always_inline void encode_mb_internal(MpegEncContext *s,
                                                 int motion_x, int motion_y,
                                                 int mb_block_height,
+                                                int mb_block_width,
                                                 int mb_block_count)
 {
-    int16_t weight[8][64];
-    int16_t orig[8][64];
+    int16_t weight[12][64];
+    int16_t orig[12][64];
     const int mb_x = s->mb_x;
     const int mb_y = s->mb_y;
     int i;
-    int skip_dct[8];
+    int skip_dct[12];
     int dct_offset = s->linesize * 8; // default for progressive frames
+    int uv_dct_offset = s->uvlinesize * 8;
     uint8_t *ptr_y, *ptr_cb, *ptr_cr;
     ptrdiff_t wrap_y, wrap_c;
 
@@ -1962,27 +2153,31 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
     ptr_y  = s->new_picture.f->data[0] +
              (mb_y * 16 * wrap_y)              + mb_x * 16;
     ptr_cb = s->new_picture.f->data[1] +
-             (mb_y * mb_block_height * wrap_c) + mb_x * 8;
+             (mb_y * mb_block_height * wrap_c) + mb_x * mb_block_width;
     ptr_cr = s->new_picture.f->data[2] +
-             (mb_y * mb_block_height * wrap_c) + mb_x * 8;
+             (mb_y * mb_block_height * wrap_c) + mb_x * mb_block_width;
 
-    if (mb_x * 16 + 16 > s->width || mb_y * 16 + 16 > s->height) {
-        uint8_t *ebuf = s->sc.edge_emu_buffer + 32;
+    if((mb_x * 16 + 16 > s->width || mb_y * 16 + 16 > s->height) && s->codec_id != AV_CODEC_ID_AMV){
+        uint8_t *ebuf = s->sc.edge_emu_buffer + 36 * wrap_y;
+        int cw = (s->width  + s->chroma_x_shift) >> s->chroma_x_shift;
+        int ch = (s->height + s->chroma_y_shift) >> s->chroma_y_shift;
         s->vdsp.emulated_edge_mc(ebuf, ptr_y,
                                  wrap_y, wrap_y,
                                  16, 16, mb_x * 16, mb_y * 16,
                                  s->width, s->height);
         ptr_y = ebuf;
-        s->vdsp.emulated_edge_mc(ebuf + 18 * wrap_y, ptr_cb,
+        s->vdsp.emulated_edge_mc(ebuf + 16 * wrap_y, ptr_cb,
                                  wrap_c, wrap_c,
-                                 8, mb_block_height, mb_x * 8, mb_y * 8,
-                                 s->width >> 1, s->height >> 1);
-        ptr_cb = ebuf + 18 * wrap_y;
-        s->vdsp.emulated_edge_mc(ebuf + 18 * wrap_y + 8, ptr_cr,
+                                 mb_block_width, mb_block_height,
+                                 mb_x * mb_block_width, mb_y * mb_block_height,
+                                 cw, ch);
+        ptr_cb = ebuf + 16 * wrap_y;
+        s->vdsp.emulated_edge_mc(ebuf + 16 * wrap_y + 16, ptr_cr,
                                  wrap_c, wrap_c,
-                                 8, mb_block_height, mb_x * 8, mb_y * 8,
-                                 s->width >> 1, s->height >> 1);
-        ptr_cr = ebuf + 18 * wrap_y + 8;
+                                 mb_block_width, mb_block_height,
+                                 mb_x * mb_block_width, mb_y * mb_block_height,
+                                 cw, ch);
+        ptr_cr = ebuf + 16 * wrap_y + 16;
     }
 
     if (s->mb_intra) {
@@ -2003,8 +2198,10 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
                     s->interlaced_dct = 1;
 
                     dct_offset = wrap_y;
+                    uv_dct_offset = wrap_c;
                     wrap_y <<= 1;
-                    if (s->chroma_format == CHROMA_422)
+                    if (s->chroma_format == CHROMA_422 ||
+                        s->chroma_format == CHROMA_444)
                         wrap_c <<= 1;
                 }
             }
@@ -2021,11 +2218,16 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
         } else {
             s->pdsp.get_pixels(s->block[4], ptr_cb, wrap_c);
             s->pdsp.get_pixels(s->block[5], ptr_cr, wrap_c);
-            if (!s->chroma_y_shift) { /* 422 */
-                s->pdsp.get_pixels(s->block[6],
-                                   ptr_cb + (dct_offset >> 1), wrap_c);
-                s->pdsp.get_pixels(s->block[7],
-                                   ptr_cr + (dct_offset >> 1), wrap_c);
+            if (!s->chroma_y_shift && s->chroma_x_shift) { /* 422 */
+                s->pdsp.get_pixels(s->block[6], ptr_cb + uv_dct_offset, wrap_c);
+                s->pdsp.get_pixels(s->block[7], ptr_cr + uv_dct_offset, wrap_c);
+            } else if (!s->chroma_y_shift && !s->chroma_x_shift) { /* 444 */
+                s->pdsp.get_pixels(s->block[ 6], ptr_cb + 8, wrap_c);
+                s->pdsp.get_pixels(s->block[ 7], ptr_cr + 8, wrap_c);
+                s->pdsp.get_pixels(s->block[ 8], ptr_cb + uv_dct_offset, wrap_c);
+                s->pdsp.get_pixels(s->block[ 9], ptr_cr + uv_dct_offset, wrap_c);
+                s->pdsp.get_pixels(s->block[10], ptr_cb + uv_dct_offset + 8, wrap_c);
+                s->pdsp.get_pixels(s->block[11], ptr_cr + uv_dct_offset + 8, wrap_c);
             }
         }
     } else {
@@ -2081,6 +2283,7 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
                     s->interlaced_dct = 1;
 
                     dct_offset = wrap_y;
+                    uv_dct_offset = wrap_c;
                     wrap_y <<= 1;
                     if (s->chroma_format == CHROMA_422)
                         wrap_c <<= 1;
@@ -2102,10 +2305,10 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
             s->pdsp.diff_pixels(s->block[4], ptr_cb, dest_cb, wrap_c);
             s->pdsp.diff_pixels(s->block[5], ptr_cr, dest_cr, wrap_c);
             if (!s->chroma_y_shift) { /* 422 */
-                s->pdsp.diff_pixels(s->block[6], ptr_cb + (dct_offset >> 1),
-                                    dest_cb + (dct_offset >> 1), wrap_c);
-                s->pdsp.diff_pixels(s->block[7], ptr_cr + (dct_offset >> 1),
-                                    dest_cr + (dct_offset >> 1), wrap_c);
+                s->pdsp.diff_pixels(s->block[6], ptr_cb + uv_dct_offset,
+                                    dest_cb + uv_dct_offset, wrap_c);
+                s->pdsp.diff_pixels(s->block[7], ptr_cr + uv_dct_offset,
+                                    dest_cr + uv_dct_offset, wrap_c);
             }
         }
         /* pre quantization */
@@ -2127,12 +2330,12 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
             if (s->mecc.sad[1](NULL, ptr_cr, dest_cr, wrap_c, 8) < 20 * s->qscale)
                 skip_dct[5] = 1;
             if (!s->chroma_y_shift) { /* 422 */
-                if (s->mecc.sad[1](NULL, ptr_cb + (dct_offset >> 1),
-                                   dest_cb + (dct_offset >> 1),
+                if (s->mecc.sad[1](NULL, ptr_cb + uv_dct_offset,
+                                   dest_cb + uv_dct_offset,
                                    wrap_c, 8) < 20 * s->qscale)
                     skip_dct[6] = 1;
-                if (s->mecc.sad[1](NULL, ptr_cr + (dct_offset >> 1),
-                                   dest_cr + (dct_offset >> 1),
+                if (s->mecc.sad[1](NULL, ptr_cr + uv_dct_offset,
+                                   dest_cr + uv_dct_offset,
                                    wrap_c, 8) < 20 * s->qscale)
                     skip_dct[7] = 1;
             }
@@ -2154,17 +2357,17 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
             get_visual_weight(weight[5], ptr_cr                , wrap_c);
         if (!s->chroma_y_shift) { /* 422 */
             if (!skip_dct[6])
-                get_visual_weight(weight[6], ptr_cb + (dct_offset >> 1),
+                get_visual_weight(weight[6], ptr_cb + uv_dct_offset,
                                   wrap_c);
             if (!skip_dct[7])
-                get_visual_weight(weight[7], ptr_cr + (dct_offset >> 1),
+                get_visual_weight(weight[7], ptr_cr + uv_dct_offset,
                                   wrap_c);
         }
         memcpy(orig[0], s->block[0], sizeof(int16_t) * 64 * mb_block_count);
     }
 
     /* DCT & quantize */
-    assert(s->out_format != FMT_MJPEG || s->qscale == 8);
+    av_assert2(s->out_format != FMT_MJPEG || s->qscale == 8);
     {
         for (i = 0; i < mb_block_count; i++) {
             if (!skip_dct[i]) {
@@ -2210,6 +2413,12 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
         s->block_last_index[5] = 0;
         s->block[4][0] =
         s->block[5][0] = (1024 + s->c_dc_scale / 2) / s->c_dc_scale;
+        if (!s->chroma_y_shift) { /* 422 / 444 */
+            for (i=6; i<12; i++) {
+                s->block_last_index[i] = 0;
+                s->block[i][0] = s->block[4][0];
+            }
+        }
     }
 
     // non c quantize code returns incorrect block_last_index FIXME
@@ -2260,18 +2469,20 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
             ff_h263_encode_mb(s, s->block, motion_x, motion_y);
         break;
     case AV_CODEC_ID_MJPEG:
+    case AV_CODEC_ID_AMV:
         if (CONFIG_MJPEG_ENCODER)
             ff_mjpeg_encode_mb(s, s->block);
         break;
     default:
-        assert(0);
+        av_assert1(0);
     }
 }
 
 static av_always_inline void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
 {
-    if (s->chroma_format == CHROMA_420) encode_mb_internal(s, motion_x, motion_y,  8, 6);
-    else                                encode_mb_internal(s, motion_x, motion_y, 16, 8);
+    if (s->chroma_format == CHROMA_420) encode_mb_internal(s, motion_x, motion_y,  8, 8, 6);
+    else if (s->chroma_format == CHROMA_422) encode_mb_internal(s, motion_x, motion_y, 16, 8, 8);
+    else encode_mb_internal(s, motion_x, motion_y, 16, 16, 12);
 }
 
 static inline void copy_context_before_encode(MpegEncContext *d, MpegEncContext *s, int type){
@@ -2362,7 +2573,7 @@ static inline void encode_mb_hq(MpegEncContext *s, MpegEncContext *backup, MpegE
         s->dest[0] = s->sc.rd_scratchpad;
         s->dest[1] = s->sc.rd_scratchpad + 16*s->linesize;
         s->dest[2] = s->sc.rd_scratchpad + 16*s->linesize + 8;
-        assert(s->linesize >= 32); //FIXME
+        av_assert0(s->linesize >= 32); //FIXME
     }
 
     encode_mb(s, motion_x, motion_y);
@@ -2408,7 +2619,7 @@ static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, in
         }
     }
 
-    assert(acc>=0);
+    av_assert2(acc>=0);
 
     return acc;
 }
@@ -2458,6 +2669,8 @@ static int pre_estimate_motion_thread(AVCodecContext *c, void *arg){
 static int estimate_motion_thread(AVCodecContext *c, void *arg){
     MpegEncContext *s= *(void**)arg;
 
+    ff_check_alignment();
+
     s->me.dia_size= s->avctx->dia_size;
     s->first_slice_line=1;
     for(s->mb_y= s->start_mb_y; s->mb_y < s->end_mb_y; s->mb_y++) {
@@ -2484,6 +2697,8 @@ static int mb_var_thread(AVCodecContext *c, void *arg){
     MpegEncContext *s= *(void**)arg;
     int mb_x, mb_y;
 
+    ff_check_alignment();
+
     for(mb_y=s->start_mb_y; mb_y < s->end_mb_y; mb_y++) {
         for(mb_x=0; mb_x < s->mb_width; mb_x++) {
             int xx = mb_x * 16;
@@ -2511,7 +2726,7 @@ static void write_slice_end(MpegEncContext *s){
 
         ff_mpeg4_stuffing(&s->pb);
     }else if(CONFIG_MJPEG_ENCODER && s->out_format == FMT_MJPEG){
-        ff_mjpeg_encode_stuffing(&s->pb);
+        ff_mjpeg_encode_stuffing(s);
     }
 
     avpriv_align_put_bits(&s->pb);
@@ -2564,6 +2779,35 @@ static void update_mb_info(MpegEncContext *s, int startcode)
     write_mb_info(s);
 }
 
+int ff_mpv_reallocate_putbitbuffer(MpegEncContext *s, size_t threshold, size_t size_increase)
+{
+    if (   s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < threshold
+        && s->slice_context_count == 1
+        && s->pb.buf == s->avctx->internal->byte_buffer) {
+        int lastgob_pos = s->ptr_lastgob - s->pb.buf;
+        int vbv_pos     = s->vbv_delay_ptr - s->pb.buf;
+
+        uint8_t *new_buffer = NULL;
+        int new_buffer_size = 0;
+
+        av_fast_padded_malloc(&new_buffer, &new_buffer_size,
+                              s->avctx->internal->byte_buffer_size + size_increase);
+        if (!new_buffer)
+            return AVERROR(ENOMEM);
+
+        memcpy(new_buffer, s->avctx->internal->byte_buffer, s->avctx->internal->byte_buffer_size);
+        av_free(s->avctx->internal->byte_buffer);
+        s->avctx->internal->byte_buffer      = new_buffer;
+        s->avctx->internal->byte_buffer_size = new_buffer_size;
+        rebase_put_bits(&s->pb, new_buffer, new_buffer_size);
+        s->ptr_lastgob   = s->pb.buf + lastgob_pos;
+        s->vbv_delay_ptr = s->pb.buf + vbv_pos;
+    }
+    if (s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < threshold)
+        return AVERROR(EINVAL);
+    return 0;
+}
+
 static int encode_thread(AVCodecContext *c, void *arg){
     MpegEncContext *s= *(void**)arg;
     int mb_x, mb_y, pdif = 0;
@@ -2575,6 +2819,8 @@ static int encode_thread(AVCodecContext *c, void *arg){
     uint8_t bit_buf_tex[2][MAX_MB_BYTES];
     PutBitContext pb[2], pb2[2], tex_pb[2];
 
+    ff_check_alignment();
+
     for(i=0; i<2; i++){
         init_put_bits(&pb    [i], bit_buf    [i], MAX_MB_BYTES);
         init_put_bits(&pb2   [i], bit_buf2   [i], MAX_MB_BYTES);
@@ -2598,6 +2844,11 @@ static int encode_thread(AVCodecContext *c, void *arg){
 
         s->current_picture.encoding_error[i] = 0;
     }
+    if(s->codec_id==AV_CODEC_ID_AMV){
+        s->last_dc[0] = 128*8/13;
+        s->last_dc[1] = 128*8/14;
+        s->last_dc[2] = 128*8/14;
+    }
     s->mb_skip_run = 0;
     memset(s->last_mv, 0, sizeof(s->last_mv));
 
@@ -2633,7 +2884,10 @@ static int encode_thread(AVCodecContext *c, void *arg){
 //            int d;
             int dmin= INT_MAX;
             int dir;
+            int size_increase =  s->avctx->internal->byte_buffer_size/4
+                               + s->mb_width*MAX_MB_BYTES;
 
+            ff_mpv_reallocate_putbitbuffer(s, MAX_MB_BYTES, size_increase);
             if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < MAX_MB_BYTES){
                 av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
                 return -1;
@@ -2641,7 +2895,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
             if(s->data_partitioning){
                 if(   s->pb2   .buf_end - s->pb2   .buf - (put_bits_count(&s->    pb2)>>3) < MAX_MB_BYTES
                    || s->tex_pb.buf_end - s->tex_pb.buf - (put_bits_count(&s->tex_pb )>>3) < MAX_MB_BYTES){
-                    av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                    av_log(s->avctx, AV_LOG_ERROR, "encoded partitioned frame too large\n");
                     return -1;
                 }
             }
@@ -2677,6 +2931,9 @@ static int encode_thread(AVCodecContext *c, void *arg){
                 case AV_CODEC_ID_MPEG1VIDEO:
                     if(s->mb_skip_run) is_gob_start=0;
                     break;
+                case AV_CODEC_ID_MJPEG:
+                    if(s->mb_x==0 && s->mb_y!=0) is_gob_start=1;
+                    break;
                 }
 
                 if(is_gob_start){
@@ -2688,7 +2945,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                         }
                     }
 
-                    assert((put_bits_count(&s->pb)&7) == 0);
+                    av_assert2((put_bits_count(&s->pb)&7) == 0);
                     current_packet_size= put_bits_ptr(&s->pb) - s->ptr_lastgob;
 
                     if (s->error_rate && s->resync_mb_x + s->resync_mb_y > 0) {
@@ -2895,8 +3152,9 @@ static int encode_thread(AVCodecContext *c, void *arg){
                         int16_t ac[6][16];
                         const int mvdir= (best_s.mv_dir&MV_DIR_BACKWARD) ? 1 : 0;
                         static const int dquant_tab[4]={-1,1,-2,2};
+                        int storecoefs = s->mb_intra && s->dc_val[0];
 
-                        assert(backup_s.dquant == 0);
+                        av_assert2(backup_s.dquant == 0);
 
                         //FIXME intra
                         s->mv_dir= best_s.mv_dir;
@@ -2914,7 +3172,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                             if(qp < s->avctx->qmin || qp > s->avctx->qmax)
                                 continue;
                             backup_s.dquant= dquant;
-                            if(s->mb_intra && s->dc_val[0]){
+                            if(storecoefs){
                                 for(i=0; i<6; i++){
                                     dc[i]= s->dc_val[0][ s->block_index[i] ];
                                     memcpy(ac[i], s->ac_val[0][s->block_index[i]], sizeof(int16_t)*16);
@@ -2924,7 +3182,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                             encode_mb_hq(s, &backup_s, &best_s, CANDIDATE_MB_TYPE_INTER /* wrong but unused */, pb, pb2, tex_pb,
                                          &dmin, &next_block, s->mv[mvdir][0][0], s->mv[mvdir][0][1]);
                             if(best_s.qscale != qp){
-                                if(s->mb_intra && s->dc_val[0]){
+                                if(storecoefs){
                                     for(i=0; i<6; i++){
                                         s->dc_val[0][ s->block_index[i] ]= dc[i];
                                         memcpy(s->ac_val[0][s->block_index[i]], ac[i], sizeof(int16_t)*16);
@@ -3268,7 +3526,7 @@ static int estimate_qp(MpegEncContext *s, int dry_run){
 
 /* must be called before writing the header */
 static void set_frame_distances(MpegEncContext * s){
-    assert(s->current_picture_ptr->f->pts != AV_NOPTS_VALUE);
+    av_assert1(s->current_picture_ptr->f->pts != AV_NOPTS_VALUE);
     s->time = s->current_picture_ptr->f->pts * s->avctx->time_base.num;
 
     if(s->pict_type==AV_PICTURE_TYPE_B){
@@ -3324,6 +3582,13 @@ static int encode_picture(MpegEncContext *s, int picture_number)
         update_qscale(s);
     }
 
+    if(s->codec_id != AV_CODEC_ID_AMV && s->codec_id != AV_CODEC_ID_MJPEG){
+        if(s->q_chroma_intra_matrix   != s->q_intra_matrix  ) av_freep(&s->q_chroma_intra_matrix);
+        if(s->q_chroma_intra_matrix16 != s->q_intra_matrix16) av_freep(&s->q_chroma_intra_matrix16);
+        s->q_chroma_intra_matrix   = s->q_intra_matrix;
+        s->q_chroma_intra_matrix16 = s->q_intra_matrix16;
+    }
+
     s->mb_intra=0; //for the rate distortion & bit compare functions
     for(i=1; i<context_count; i++){
         ret = ff_update_duplicate_context(s->thread_context[i], s);
@@ -3366,7 +3631,9 @@ static int encode_picture(MpegEncContext *s, int picture_number)
         s->pict_type= AV_PICTURE_TYPE_I;
         for(i=0; i<s->mb_stride*s->mb_height; i++)
             s->mb_type[i]= CANDIDATE_MB_TYPE_INTRA;
-        ff_dlog(s, "Scene change detected, encoding as I Frame %d %d\n",
+        if(s->msmpeg4_version >= 3)
+            s->no_rounding=1;
+        ff_dlog(s, "Scene change detected, encoding as I Frame %"PRId64" %"PRId64"\n",
                 s->current_picture.mb_var_sum, s->current_picture.mc_mb_var_sum);
     }
 
@@ -3433,17 +3700,50 @@ static int encode_picture(MpegEncContext *s, int picture_number)
         s->qscale= 3; //reduce clipping problems
 
     if (s->out_format == FMT_MJPEG) {
+        const uint16_t *  luma_matrix = ff_mpeg1_default_intra_matrix;
+        const uint16_t *chroma_matrix = ff_mpeg1_default_intra_matrix;
+
+        if (s->avctx->intra_matrix) {
+            chroma_matrix =
+            luma_matrix = s->avctx->intra_matrix;
+        }
+        if (s->avctx->chroma_intra_matrix)
+            chroma_matrix = s->avctx->chroma_intra_matrix;
+
         /* for mjpeg, we do include qscale in the matrix */
         for(i=1;i<64;i++){
             int j = s->idsp.idct_permutation[i];
 
-            s->intra_matrix[j] = av_clip_uint8((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3);
+            s->chroma_intra_matrix[j] = av_clip_uint8((chroma_matrix[i] * s->qscale) >> 3);
+            s->       intra_matrix[j] = av_clip_uint8((  luma_matrix[i] * s->qscale) >> 3);
         }
         s->y_dc_scale_table=
         s->c_dc_scale_table= ff_mpeg2_dc_scale_table[s->intra_dc_precision];
+        s->chroma_intra_matrix[0] =
         s->intra_matrix[0] = ff_mpeg2_dc_scale_table[s->intra_dc_precision][8];
         ff_convert_matrix(s, s->q_intra_matrix, s->q_intra_matrix16,
                        s->intra_matrix, s->intra_quant_bias, 8, 8, 1);
+        ff_convert_matrix(s, s->q_chroma_intra_matrix, s->q_chroma_intra_matrix16,
+                       s->chroma_intra_matrix, s->intra_quant_bias, 8, 8, 1);
+        s->qscale= 8;
+    }
+    if(s->codec_id == AV_CODEC_ID_AMV){
+        static const uint8_t y[32]={13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13};
+        static const uint8_t c[32]={14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14};
+        for(i=1;i<64;i++){
+            int j= s->idsp.idct_permutation[ff_zigzag_direct[i]];
+
+            s->intra_matrix[j] = sp5x_quant_table[5*2+0][i];
+            s->chroma_intra_matrix[j] = sp5x_quant_table[5*2+1][i];
+        }
+        s->y_dc_scale_table= y;
+        s->c_dc_scale_table= c;
+        s->intra_matrix[0] = 13;
+        s->chroma_intra_matrix[0] = 14;
+        ff_convert_matrix(s, s->q_intra_matrix, s->q_intra_matrix16,
+                       s->intra_matrix, s->intra_quant_bias, 8, 8, 1);
+        ff_convert_matrix(s, s->q_chroma_intra_matrix, s->q_chroma_intra_matrix16,
+                       s->chroma_intra_matrix, s->intra_quant_bias, 8, 8, 1);
         s->qscale= 8;
     }
 
@@ -3456,12 +3756,13 @@ static int encode_picture(MpegEncContext *s, int picture_number)
     if (s->current_picture.f->key_frame)
         s->picture_in_gop_number=0;
 
+    s->mb_x = s->mb_y = 0;
     s->last_bits= put_bits_count(&s->pb);
     switch(s->out_format) {
     case FMT_MJPEG:
         if (CONFIG_MJPEG_ENCODER)
             ff_mjpeg_encode_picture_header(s->avctx, &s->pb, &s->intra_scantable,
-                                           s->intra_matrix);
+                                           s->intra_matrix, s->chroma_intra_matrix);
         break;
     case FMT_H261:
         if (CONFIG_H261_ENCODER)
@@ -3491,7 +3792,7 @@ static int encode_picture(MpegEncContext *s, int picture_number)
             ff_mpeg1_encode_picture_header(s, picture_number);
         break;
     default:
-        assert(0);
+        av_assert0(0);
     }
     bits= put_bits_count(&s->pb);
     s->header_bits= bits - s->last_bits;
@@ -3501,6 +3802,8 @@ static int encode_picture(MpegEncContext *s, int picture_number)
     }
     s->avctx->execute(s->avctx, encode_thread, &s->thread_context[0], NULL, context_count, sizeof(void*));
     for(i=1; i<context_count; i++){
+        if (s->pb.buf_end == s->thread_context[i]->pb.buf)
+            set_put_bits_buffer_size(&s->pb, FFMIN(s->thread_context[i]->pb.buf_end - s->pb.buf, INT_MAX/8-32));
         merge_context_after_encode(s, s->thread_context[i]);
     }
     emms_c();
@@ -3535,6 +3838,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
                                   int16_t *block, int n,
                                   int qscale, int *overflow){
     const int *qmat;
+    const uint16_t *matrix;
     const uint8_t *scantable= s->intra_scantable.scantable;
     const uint8_t *perm_scantable= s->intra_scantable.permutated;
     int max=0;
@@ -3556,6 +3860,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
     uint8_t * length;
     uint8_t * last_length;
     const int lambda= s->lambda2 >> (FF_LAMBDA_SHIFT - 6);
+    int mpeg2_qscale;
 
     s->fdsp.fdct(block);
 
@@ -3564,6 +3869,9 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
     qmul= qscale*16;
     qadd= ((qscale-1)|1)*8;
 
+    if (s->q_scale_type) mpeg2_qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 mpeg2_qscale = qscale << 1;
+
     if (s->mb_intra) {
         int q;
         if (!s->h263_aic) {
@@ -3582,15 +3890,23 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
         block[0] = (block[0] + (q >> 1)) / q;
         start_i = 1;
         last_non_zero = 0;
-        qmat = s->q_intra_matrix[qscale];
-        if(s->mpeg_quant || s->out_format == FMT_MPEG1)
+        qmat = n < 4 ? s->q_intra_matrix[qscale] : s->q_chroma_intra_matrix[qscale];
+        matrix = n < 4 ? s->intra_matrix : s->chroma_intra_matrix;
+        if(s->mpeg_quant || s->out_format == FMT_MPEG1 || s->out_format == FMT_MJPEG)
             bias= 1<<(QMAT_SHIFT-1);
-        length     = s->intra_ac_vlc_length;
-        last_length= s->intra_ac_vlc_last_length;
+
+        if (n > 3 && s->intra_chroma_ac_vlc_length) {
+            length     = s->intra_chroma_ac_vlc_length;
+            last_length= s->intra_chroma_ac_vlc_last_length;
+        } else {
+            length     = s->intra_ac_vlc_length;
+            last_length= s->intra_ac_vlc_last_length;
+        }
     } else {
         start_i = 0;
         last_non_zero = -1;
         qmat = s->q_inter_matrix[qscale];
+        matrix = s->inter_matrix;
         length     = s->inter_ac_vlc_length;
         last_length= s->inter_ac_vlc_last_length;
     }
@@ -3628,7 +3944,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
 //                coeff[2][k]= -level+2;
             }
             coeff_count[i]= FFMIN(level, 2);
-            assert(coeff_count[i]);
+            av_assert2(coeff_count[i]);
             max |=level;
         }else{
             coeff[0][i]= (level>>31)|1;
@@ -3662,17 +3978,20 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
             const int alevel= FFABS(level);
             int unquant_coeff;
 
-            assert(level);
+            av_assert2(level);
 
-            if(s->out_format == FMT_H263){
+            if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
                 unquant_coeff= alevel*qmul + qadd;
+            } else if(s->out_format == FMT_MJPEG) {
+                j = s->idsp.idct_permutation[scantable[i]];
+                unquant_coeff = alevel * matrix[j] * 8;
             }else{ //MPEG1
                 j = s->idsp.idct_permutation[scantable[i]]; // FIXME: optimize
                 if(s->mb_intra){
-                        unquant_coeff = (int)(  alevel  * qscale * s->intra_matrix[j]) >> 3;
+                        unquant_coeff = (int)(  alevel  * mpeg2_qscale * matrix[j]) >> 4;
                         unquant_coeff =   (unquant_coeff - 1) | 1;
                 }else{
-                        unquant_coeff = (((  alevel  << 1) + 1) * qscale * ((int) s->inter_matrix[j])) >> 4;
+                        unquant_coeff = (((  alevel  << 1) + 1) * mpeg2_qscale * ((int) matrix[j])) >> 5;
                         unquant_coeff =   (unquant_coeff - 1) | 1;
                 }
                 unquant_coeff<<= 3;
@@ -3693,7 +4012,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
                     }
                 }
 
-                if(s->out_format == FMT_H263){
+                if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
                     for(j=survivor_count-1; j>=0; j--){
                         int run= i - survivor[j];
                         int score= distortion + last_length[UNI_AC_ENC_INDEX(run, level)]*lambda;
@@ -3719,7 +4038,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
                     }
                 }
 
-                if(s->out_format == FMT_H263){
+                if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
                   for(j=survivor_count-1; j>=0; j--){
                         int run= i - survivor[j];
                         int score= distortion + score_tab[i-run];
@@ -3752,7 +4071,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
         survivor[ survivor_count++ ]= i+1;
     }
 
-    if(s->out_format != FMT_H263){
+    if(s->out_format != FMT_H263 && s->out_format != FMT_H261){
         last_score= 256*256*256*120;
         for(i= survivor[0]; i<=last_non_zero + 1; i++){
             int score= score_tab[i];
@@ -3785,10 +4104,10 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
             int alevel= FFABS(level);
             int unquant_coeff, score, distortion;
 
-            if(s->out_format == FMT_H263){
+            if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
                     unquant_coeff= (alevel*qmul + qadd)>>3;
             }else{ //MPEG1
-                    unquant_coeff = (((  alevel  << 1) + 1) * qscale * ((int) s->inter_matrix[0])) >> 4;
+                    unquant_coeff = (((  alevel  << 1) + 1) * mpeg2_qscale * ((int) matrix[0])) >> 5;
                     unquant_coeff =   (unquant_coeff - 1) | 1;
             }
             unquant_coeff = (unquant_coeff + 4) >> 3;
@@ -3811,7 +4130,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
     }
 
     i= last_i;
-    assert(last_level);
+    av_assert2(last_level);
 
     block[ perm_scantable[last_non_zero] ]= last_level;
     i -= last_run + 1;
@@ -3895,8 +4214,13 @@ static int messed_sign=0;
         start_i = 1;
 //        if(s->mpeg_quant || s->out_format == FMT_MPEG1)
 //            bias= 1<<(QMAT_SHIFT-1);
-        length     = s->intra_ac_vlc_length;
-        last_length= s->intra_ac_vlc_last_length;
+        if (n > 3 && s->intra_chroma_ac_vlc_length) {
+            length     = s->intra_chroma_ac_vlc_length;
+            last_length= s->intra_chroma_ac_vlc_last_length;
+        } else {
+            length     = s->intra_ac_vlc_length;
+            last_length= s->intra_ac_vlc_last_length;
+        }
     } else {
         dc= 0;
         start_i = 0;
@@ -3927,8 +4251,8 @@ STOP_TIMER("memset rem[]")}
         weight[i] = w;
 //        w=weight[i] = (63*qns + (w/2)) / w;
 
-        assert(w>0);
-        assert(w<(1<<6));
+        av_assert2(w>0);
+        av_assert2(w<(1<<6));
         sum += w*w;
     }
     lambda= sum*(uint64_t)s->lambda2 >> (FF_LAMBDA_SHIFT - 6 + 6 + 6 + 6);
@@ -3994,7 +4318,7 @@ STOP_TIMER("dct")}
             const int level= block[0];
             int change, old_coeff;
 
-            assert(s->mb_intra);
+            av_assert2(s->mb_intra);
 
             old_coeff= q*level;
 
@@ -4038,7 +4362,7 @@ STOP_TIMER("dct")}
             }else{
                 old_coeff=0;
                 run2--;
-                assert(run2>=0 || i >= last_non_zero );
+                av_assert2(run2>=0 || i >= last_non_zero );
             }
 
             for(change=-1; change<=1; change+=2){
@@ -4066,7 +4390,7 @@ STOP_TIMER("dct")}
                                          - last_length[UNI_AC_ENC_INDEX(run, level+64)];
                         }
                     }else{
-                        assert(FFABS(new_level)==1);
+                        av_assert2(FFABS(new_level)==1);
 
                         if(analyze_gradient){
                             int g= d1[ scantable[i] ];
@@ -4099,7 +4423,7 @@ STOP_TIMER("dct")}
                     }
                 }else{
                     new_coeff=0;
-                    assert(FFABS(level)==1);
+                    av_assert2(FFABS(level)==1);
 
                     if(i < last_non_zero){
                         int next_i= i + run2 + 1;
@@ -4128,7 +4452,7 @@ STOP_TIMER("dct")}
                 score *= lambda;
 
                 unquant_change= new_coeff - old_coeff;
-                assert((score < 100*lambda && score > -100*lambda) || lambda==0);
+                av_assert2((score < 100*lambda && score > -100*lambda) || lambda==0);
 
                 score += s->mpvencdsp.try_8x8basis(rem, weight, basis[j],
                                                    unquant_change);
@@ -4160,7 +4484,7 @@ STOP_TIMER("iterative step")}
 
             if(best_coeff > last_non_zero){
                 last_non_zero= best_coeff;
-                assert(block[j]);
+                av_assert2(block[j]);
 #ifdef REFINE_STATS
 after_last++;
 #endif
@@ -4188,7 +4512,7 @@ if(block[j]){
 #ifdef REFINE_STATS
 count++;
 if(256*256*256*64 % count == 0){
-    printf("after_last:%d to_zero:%d from_zero:%d raise:%d lower:%d sign:%d xyp:%d/%d/%d\n", after_last, to_zero, from_zero, raise, lower, messed_sign, s->mb_x, s->mb_y, s->picture_number);
+    av_log(s->avctx, AV_LOG_DEBUG, "after_last:%d to_zero:%d from_zero:%d raise:%d lower:%d sign:%d xyp:%d/%d/%d\n", after_last, to_zero, from_zero, raise, lower, messed_sign, s->mb_x, s->mb_y, s->picture_number);
 }
 #endif
             run=0;
@@ -4287,13 +4611,13 @@ int ff_dct_quantize_c(MpegEncContext *s,
         block[0] = (block[0] + (q >> 1)) / q;
         start_i = 1;
         last_non_zero = 0;
-        qmat = s->q_intra_matrix[qscale];
-        bias= s->intra_quant_bias<<(QMAT_SHIFT - QUANT_BIAS_SHIFT);
+        qmat = n < 4 ? s->q_intra_matrix[qscale] : s->q_chroma_intra_matrix[qscale];
+        bias= s->intra_quant_bias*(1<<(QMAT_SHIFT - QUANT_BIAS_SHIFT));
     } else {
         start_i = 0;
         last_non_zero = -1;
         qmat = s->q_inter_matrix[qscale];
-        bias= s->inter_quant_bias<<(QMAT_SHIFT - QUANT_BIAS_SHIFT);
+        bias= s->inter_quant_bias*(1<<(QMAT_SHIFT - QUANT_BIAS_SHIFT));
     }
     threshold1= (1<<QMAT_SHIFT) - bias - 1;
     threshold2= (threshold1<<1);
diff --git a/libavcodec/mpegvideo_motion.c b/libavcodec/mpegvideo_motion.c
index 05375a1afa..51ba435231 100644
--- a/libavcodec/mpegvideo_motion.c
+++ b/libavcodec/mpegvideo_motion.c
@@ -4,25 +4,26 @@
  *
  * 4MV & hq & B-frame encoding stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/internal.h"
 #include "avcodec.h"
 #include "h261.h"
@@ -178,7 +179,7 @@ static void gmc_motion(MpegEncContext *s,
                 s->sprite_delta[0][0], s->sprite_delta[0][1],
                 s->sprite_delta[1][0], s->sprite_delta[1][1],
                 a + 1, (1 << (2 * a + 1)) - s->no_rounding,
-                s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+                (s->h_edge_pos + 1) >> 1, (s->v_edge_pos + 1) >> 1);
 
     ptr = ref_picture[2];
     s->mdsp.gmc(dest_cr, ptr, uvlinesize, 8,
@@ -186,7 +187,7 @@ static void gmc_motion(MpegEncContext *s,
                 s->sprite_delta[0][0], s->sprite_delta[0][1],
                 s->sprite_delta[1][0], s->sprite_delta[1][1],
                 a + 1, (1 << (2 * a + 1)) - s->no_rounding,
-                s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+                (s->h_edge_pos + 1) >> 1, (s->v_edge_pos + 1) >> 1);
 }
 
 static inline int hpel_motion(MpegEncContext *s,
@@ -210,18 +211,16 @@ static inline int hpel_motion(MpegEncContext *s,
         dxy |= (motion_y & 1) << 1;
     src += src_y * s->linesize + src_x;
 
-    if (s->unrestricted_mv) {
-        if ((unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x & 1) - 8, 0) ||
-            (unsigned)src_y > FFMAX(s->v_edge_pos - (motion_y & 1) - 8, 0)) {
+        if ((unsigned)src_x >= FFMAX(s->h_edge_pos - (motion_x & 1) - 7, 0) ||
+            (unsigned)src_y >= FFMAX(s->v_edge_pos - (motion_y & 1) - 7, 0)) {
             s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, src,
                                      s->linesize, s->linesize,
                                      9, 9,
-                                     src_x, src_y, s->h_edge_pos,
-                                     s->v_edge_pos);
+                                     src_x, src_y,
+                                     s->h_edge_pos, s->v_edge_pos);
             src = s->sc.edge_emu_buffer;
             emu = 1;
         }
-    }
     pix_op[dxy](dest, src, s->linesize, 8);
     return emu;
 }
@@ -308,8 +307,8 @@ void mpeg_motion_internal(MpegEncContext *s,
     ptr_cb = ref_picture[1] + uvsrc_y * uvlinesize + uvsrc_x;
     ptr_cr = ref_picture[2] + uvsrc_y * uvlinesize + uvsrc_x;
 
-    if ((unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x & 1) - 16, 0) ||
-        (unsigned)src_y > FFMAX(v_edge_pos - (motion_y & 1) - h, 0)) {
+    if ((unsigned)src_x >= FFMAX(s->h_edge_pos - (motion_x & 1) - 15   , 0) ||
+        (unsigned)src_y >= FFMAX(   v_edge_pos - (motion_y & 1) - h + 1, 0)) {
         if (is_mpeg12 ||
             s->codec_id == AV_CODEC_ID_MPEG2VIDEO ||
             s->codec_id == AV_CODEC_ID_MPEG1VIDEO) {
@@ -318,26 +317,29 @@ void mpeg_motion_internal(MpegEncContext *s,
                    src_y);
             return;
         }
+        src_y = (unsigned)src_y << field_based;
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr_y,
                                  s->linesize, s->linesize,
                                  17, 17 + field_based,
-                                 src_x, src_y << field_based,
+                                 src_x, src_y,
                                  s->h_edge_pos, s->v_edge_pos);
         ptr_y = s->sc.edge_emu_buffer;
         if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
-            uint8_t *uvbuf = s->sc.edge_emu_buffer + 18 * s->linesize;
-            s->vdsp.emulated_edge_mc(uvbuf, ptr_cb,
+            uint8_t *ubuf = s->sc.edge_emu_buffer + 18 * s->linesize;
+            uint8_t *vbuf = ubuf + 9 * s->uvlinesize;
+            uvsrc_y = (unsigned)uvsrc_y << field_based;
+            s->vdsp.emulated_edge_mc(ubuf, ptr_cb,
                                      s->uvlinesize, s->uvlinesize,
                                      9, 9 + field_based,
-                                     uvsrc_x, uvsrc_y << field_based,
+                                     uvsrc_x, uvsrc_y,
                                      s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-            s->vdsp.emulated_edge_mc(uvbuf + 16, ptr_cr,
+            s->vdsp.emulated_edge_mc(vbuf, ptr_cr,
                                      s->uvlinesize, s->uvlinesize,
                                      9, 9 + field_based,
-                                     uvsrc_x, uvsrc_y << field_based,
+                                     uvsrc_x, uvsrc_y,
                                      s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-            ptr_cb = uvbuf;
-            ptr_cr = uvbuf + 16;
+            ptr_cb = ubuf;
+            ptr_cr = vbuf;
         }
     }
 
@@ -395,7 +397,7 @@ static void mpeg_motion_field(MpegEncContext *s, uint8_t *dest_y,
                               int motion_x, int motion_y, int h, int mb_y)
 {
 #if !CONFIG_SMALL
-    if(s->out_format == FMT_MPEG1)
+    if (s->out_format == FMT_MPEG1)
         mpeg_motion_internal(s, dest_y, dest_cb, dest_cr, 1,
                              bottom_field, field_select, ref_picture, pix_op,
                              motion_x, motion_y, h, 1, mb_y);
@@ -470,7 +472,7 @@ static inline void obmc_motion(MpegEncContext *s,
     int i;
     uint8_t *ptr[5];
 
-    assert(s->quarter_sample == 0);
+    av_assert2(s->quarter_sample == 0);
 
     for (i = 0; i < 5; i++) {
         if (i && mv[i][0] == mv[MID][0] && mv[i][1] == mv[MID][1]) {
@@ -537,8 +539,8 @@ static inline void qpel_motion(MpegEncContext *s,
     ptr_cb = ref_picture[1] + uvsrc_y * uvlinesize + uvsrc_x;
     ptr_cr = ref_picture[2] + uvsrc_y * uvlinesize + uvsrc_x;
 
-    if ((unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x & 3) - 16, 0) ||
-        (unsigned)src_y > FFMAX(v_edge_pos - (motion_y & 3) - h, 0)) {
+    if ((unsigned)src_x >= FFMAX(s->h_edge_pos - (motion_x & 3) - 15   , 0) ||
+        (unsigned)src_y >= FFMAX(   v_edge_pos - (motion_y & 3) - h + 1, 0)) {
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr_y,
                                  s->linesize, s->linesize,
                                  17, 17 + field_based,
@@ -546,19 +548,20 @@ static inline void qpel_motion(MpegEncContext *s,
                                  s->h_edge_pos, s->v_edge_pos);
         ptr_y = s->sc.edge_emu_buffer;
         if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
-            uint8_t *uvbuf = s->sc.edge_emu_buffer + 18 * s->linesize;
-            s->vdsp.emulated_edge_mc(uvbuf, ptr_cb,
+            uint8_t *ubuf = s->sc.edge_emu_buffer + 18 * s->linesize;
+            uint8_t *vbuf = ubuf + 9 * s->uvlinesize;
+            s->vdsp.emulated_edge_mc(ubuf, ptr_cb,
                                      s->uvlinesize, s->uvlinesize,
                                      9, 9 + field_based,
                                      uvsrc_x, uvsrc_y << field_based,
                                      s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-            s->vdsp.emulated_edge_mc(uvbuf + 16, ptr_cr,
+            s->vdsp.emulated_edge_mc(vbuf, ptr_cr,
                                      s->uvlinesize, s->uvlinesize,
                                      9, 9 + field_based,
                                      uvsrc_x, uvsrc_y << field_based,
                                      s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-            ptr_cb = uvbuf;
-            ptr_cr = uvbuf + 16;
+            ptr_cb = ubuf;
+            ptr_cr = vbuf;
         }
     }
 
@@ -620,8 +623,8 @@ static void chroma_4mv_motion(MpegEncContext *s,
 
     offset = src_y * s->uvlinesize + src_x;
     ptr    = ref_picture[1] + offset;
-    if ((unsigned)src_x > FFMAX((s->h_edge_pos >> 1) - (dxy & 1) - 8, 0) ||
-        (unsigned)src_y > FFMAX((s->v_edge_pos >> 1) - (dxy >> 1) - 8, 0)) {
+    if ((unsigned)src_x >= FFMAX((s->h_edge_pos >> 1) - (dxy  & 1) - 7, 0) ||
+        (unsigned)src_y >= FFMAX((s->v_edge_pos >> 1) - (dxy >> 1) - 7, 0)) {
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr,
                                  s->uvlinesize, s->uvlinesize,
                                  9, 9, src_x, src_y,
@@ -672,7 +675,7 @@ static inline void apply_obmc(MpegEncContext *s,
     const int mot_xy     = mb_x * 2 + mb_y * 2 * mot_stride;
     int mx, my, i;
 
-    assert(!s->mb_skipped);
+    av_assert2(!s->mb_skipped);
 
     AV_COPY32(mv_cache[1][1], cur_frame->motion_val[0][mot_xy]);
     AV_COPY32(mv_cache[1][2], cur_frame->motion_val[0][mot_xy + 1]);
@@ -778,8 +781,8 @@ static inline void apply_8x8(MpegEncContext *s,
                 dxy &= ~12;
 
             ptr = ref_picture[0] + (src_y * s->linesize) + (src_x);
-            if ((unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x & 3) - 8, 0) ||
-                (unsigned)src_y > FFMAX(s->v_edge_pos - (motion_y & 3) - 8, 0)) {
+            if ((unsigned)src_x >= FFMAX(s->h_edge_pos - (motion_x & 3) - 7, 0) ||
+                (unsigned)src_y >= FFMAX(s->v_edge_pos - (motion_y & 3) - 7, 0)) {
                 s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr,
                                          s->linesize, s->linesize,
                                          9, 9,
@@ -899,8 +902,8 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
                                   s->mv[dir][1][0], s->mv[dir][1][1], 8, mb_y);
             }
         } else {
-            if (s->picture_structure != s->field_select[dir][0] + 1 &&
-                s->pict_type != AV_PICTURE_TYPE_B && !s->first_field) {
+            if (   s->picture_structure != s->field_select[dir][0] + 1 && s->pict_type != AV_PICTURE_TYPE_B && !s->first_field
+                || !ref_picture[0]) {
                 ref_picture = s->current_picture_ptr->f->data;
             }
 
@@ -914,8 +917,8 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
         for (i = 0; i < 2; i++) {
             uint8_t **ref2picture;
 
-            if (s->picture_structure == s->field_select[dir][i] + 1
-                || s->pict_type == AV_PICTURE_TYPE_B || s->first_field) {
+            if ((s->picture_structure == s->field_select[dir][i] + 1
+                || s->pict_type == AV_PICTURE_TYPE_B || s->first_field) && ref_picture[0]) {
                 ref2picture = ref_picture;
             } else {
                 ref2picture = s->current_picture_ptr->f->data;
@@ -944,6 +947,9 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
                 pix_op = s->hdsp.avg_pixels_tab;
             }
         } else {
+            if (!ref_picture[0]) {
+                ref_picture = s->current_picture_ptr->f->data;
+            }
             for (i = 0; i < 2; i++) {
                 mpeg_motion(s, dest_y, dest_cb, dest_cr,
                             s->picture_structure != i + 1,
@@ -962,7 +968,7 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
             }
         }
         break;
-    default: assert(0);
+    default: av_assert2(0);
     }
 }
 
diff --git a/libavcodec/mpegvideo_parser.c b/libavcodec/mpegvideo_parser.c
index 37808c6053..1f74bfb86b 100644
--- a/libavcodec/mpegvideo_parser.c
+++ b/libavcodec/mpegvideo_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,6 +44,9 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
     int top_field_first, repeat_first_field, progressive_frame;
     int horiz_size_ext, vert_size_ext, bit_rate_ext;
     int did_set_size=0;
+    int set_dim_ret = 0;
+    int bit_rate = 0;
+    int vbv_delay = 0;
     int chroma_format;
     enum AVPixelFormat pix_fmt = AV_PIX_FMT_NONE;
 //FIXME replace the crap with get_bits()
@@ -57,6 +60,8 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
         case PICTURE_START_CODE:
             if (bytes_left >= 2) {
                 s->pict_type = (buf[1] >> 3) & 7;
+                if (bytes_left >= 4)
+                vbv_delay = ((buf[1] & 0x07) << 13) | (buf[2] << 5) | (buf[3]  >> 3);
             }
             break;
         case SEQ_START_CODE:
@@ -64,14 +69,15 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
                 pc->width  = (buf[0] << 4) | (buf[1] >> 4);
                 pc->height = ((buf[1] & 0x0f) << 8) | buf[2];
                 if(!avctx->width || !avctx->height || !avctx->coded_width || !avctx->coded_height){
-                    ff_set_dimensions(avctx, pc->width, pc->height);
+                    set_dim_ret = ff_set_dimensions(avctx, pc->width, pc->height);
                     did_set_size=1;
                 }
                 pix_fmt = AV_PIX_FMT_YUV420P;
                 frame_rate_index = buf[3] & 0xf;
                 pc->frame_rate = avctx->framerate = ff_mpeg12_frame_rate_tab[frame_rate_index];
-                avctx->bit_rate = ((buf[4]<<10) | (buf[5]<<2) | (buf[6]>>6))*400;
+                bit_rate = (buf[4]<<10) | (buf[5]<<2) | (buf[6]>>6);
                 avctx->codec_id = AV_CODEC_ID_MPEG1VIDEO;
+                avctx->ticks_per_frame = 1;
             }
             break;
         case EXT_START_CODE:
@@ -95,14 +101,15 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
                         case 3: pix_fmt = AV_PIX_FMT_YUV444P; break;
                         }
 
-                        pc->width  |=(horiz_size_ext << 12);
-                        pc->height |=( vert_size_ext << 12);
-                        avctx->bit_rate += (bit_rate_ext << 18) * 400;
+                        pc->width  = (pc->width & 0xFFF) | (horiz_size_ext << 12);
+                        pc->height = (pc->height& 0xFFF) | ( vert_size_ext << 12);
+                        bit_rate = (bit_rate&0x3FFFF) | (bit_rate_ext << 18);
                         if(did_set_size)
-                            ff_set_dimensions(avctx, pc->width, pc->height);
-                        avctx->framerate.num = pc->frame_rate.num * (frame_rate_ext_n + 1) * 2;
+                            set_dim_ret = ff_set_dimensions(avctx, pc->width, pc->height);
+                        avctx->framerate.num = pc->frame_rate.num * (frame_rate_ext_n + 1);
                         avctx->framerate.den = pc->frame_rate.den * (frame_rate_ext_d + 1);
                         avctx->codec_id = AV_CODEC_ID_MPEG2VIDEO;
+                        avctx->ticks_per_frame = 2;
                     }
                     break;
                 case 0x8: /* picture coding extension */
@@ -148,6 +155,16 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
         }
     }
  the_end: ;
+    if (set_dim_ret < 0)
+        av_log(avctx, AV_LOG_ERROR, "Failed to set dimensions\n");
+
+    if (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO && bit_rate) {
+        avctx->rc_max_rate = 400LL*bit_rate;
+    }
+    if (bit_rate &&
+        ((avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO && bit_rate != 0x3FFFF) || vbv_delay != 0xFFFF)) {
+        avctx->bit_rate = 400LL*bit_rate;
+    }
 
     if (pix_fmt != AV_PIX_FMT_NONE) {
         s->format = pix_fmt;
@@ -157,7 +174,7 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
 
 #if FF_API_AVCTX_TIMEBASE
     if (avctx->framerate.num)
-        avctx->time_base = av_inv_q(avctx->framerate);
+        avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
 #endif
 }
 
@@ -187,7 +204,7 @@ static int mpegvideo_parse(AVCodecParserContext *s,
        function should be negligible for uncorrupted streams */
     mpegvideo_extract_headers(s, avctx, buf, buf_size);
     ff_dlog(NULL, "pict_type=%d frame_rate=%0.3f repeat_pict=%d\n",
-            s->pict_type, (double)avctx->time_base.den / avctx->time_base.num, s->repeat_pict);
+            s->pict_type, av_q2d(avctx->framerate), s->repeat_pict);
 
     *poutbuf = buf;
     *poutbuf_size = buf_size;
@@ -199,18 +216,28 @@ static int mpegvideo_split(AVCodecContext *avctx,
 {
     int i;
     uint32_t state= -1;
+    int found=0;
 
     for(i=0; i<buf_size; i++){
         state= (state<<8) | buf[i];
-        if(state != 0x1B3 && state != 0x1B5 && state < 0x200 && state >= 0x100)
+        if(state == 0x1B3){
+            found=1;
+        }else if(found && state != 0x1B5 && state < 0x200 && state >= 0x100)
             return i-3;
     }
     return 0;
 }
 
+static int mpegvideo_parse_init(AVCodecParserContext *s)
+{
+    s->pict_type = AV_PICTURE_TYPE_NONE; // first frame might be partial
+    return 0;
+}
+
 AVCodecParser ff_mpegvideo_parser = {
     .codec_ids      = { AV_CODEC_ID_MPEG1VIDEO, AV_CODEC_ID_MPEG2VIDEO },
     .priv_data_size = sizeof(struct MpvParseContext),
+    .parser_init    = mpegvideo_parse_init,
     .parser_parse   = mpegvideo_parse,
     .parser_close   = ff_parse_close,
     .split          = mpegvideo_split,
diff --git a/libavcodec/mpegvideo_xvmc.c b/libavcodec/mpegvideo_xvmc.c
index b7de79c583..b469c4e26f 100644
--- a/libavcodec/mpegvideo_xvmc.c
+++ b/libavcodec/mpegvideo_xvmc.c
@@ -2,20 +2,20 @@
  * XVideo Motion Compensation
  * Copyright (c) 2003 Ivan Kalvachev
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,8 +33,6 @@
 #include "xvmc_internal.h"
 #include "version.h"
 
-#if FF_API_XVMC
-
 /**
  * Initialize the block field of the MpegEncContext pointer passed as
  * parameter after making sure that the data is not corrupted.
@@ -50,6 +48,15 @@ void ff_xvmc_init_block(MpegEncContext *s)
     s->block = (int16_t (*)[64])(render->data_blocks + render->next_free_data_block_num * 64);
 }
 
+static void exchange_uv(MpegEncContext *s)
+{
+    int16_t (*tmp)[64];
+
+    tmp           = s->pblocks[4];
+    s->pblocks[4] = s->pblocks[5];
+    s->pblocks[5] = tmp;
+}
+
 /**
  * Fill individual block pointers, so there are no gaps in the data_block array
  * in case not all blocks in the macroblock are coded.
@@ -67,6 +74,9 @@ void ff_xvmc_pack_pblocks(MpegEncContext *s, int cbp)
             s->pblocks[i] = NULL;
         cbp += cbp;
     }
+    if (s->swap_uv) {
+        exchange_uv(s);
+    }
 }
 
 /**
@@ -74,8 +84,9 @@ void ff_xvmc_pack_pblocks(MpegEncContext *s, int cbp)
  * This function should be called for every new field and/or frame.
  * It should be safe to call the function a few times for the same field.
  */
-int ff_xvmc_field_start(MpegEncContext *s, AVCodecContext *avctx)
+static int ff_xvmc_field_start(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size)
 {
+    struct MpegEncContext *s = avctx->priv_data;
     struct xvmc_pix_fmt *last, *next, *render = (struct xvmc_pix_fmt*)s->current_picture.f->data[2];
     const int mb_block_count = 4 + (1 << s->chroma_format);
 
@@ -142,20 +153,22 @@ return -1;
  * some leftover blocks, for example from error_resilience(), may remain.
  * It should be safe to call the function a few times for the same field.
  */
-void ff_xvmc_field_end(MpegEncContext *s)
+static int ff_xvmc_field_end(AVCodecContext *avctx)
 {
+    struct MpegEncContext *s = avctx->priv_data;
     struct xvmc_pix_fmt *render = (struct xvmc_pix_fmt*)s->current_picture.f->data[2];
     assert(render);
 
     if (render->filled_mv_blocks_num > 0)
         ff_mpeg_draw_horiz_band(s, 0, 0);
+    return 0;
 }
 
 /**
  * Synthesize the data needed by XvMC to render one macroblock of data.
  * Fill all relevant fields, if necessary do IDCT.
  */
-void ff_xvmc_decode_mb(MpegEncContext *s)
+static void ff_xvmc_decode_mb(struct MpegEncContext *s)
 {
     XvMCMacroBlock *mv_block;
     struct xvmc_pix_fmt *render;
@@ -314,7 +327,7 @@ void ff_xvmc_decode_mb(MpegEncContext *s)
                  * slowdown. */
             }
             // copy blocks only if the codec doesn't support pblocks reordering
-            if (s->avctx->xvmc_acceleration == 1) {
+            if (!s->pack_pblocks) {
                 memcpy(&render->data_blocks[render->next_free_data_block_num*64],
                        s->pblocks[i], sizeof(*s->pblocks[i]));
             }
@@ -334,4 +347,30 @@ void ff_xvmc_decode_mb(MpegEncContext *s)
         ff_mpeg_draw_horiz_band(s, 0, 0);
 }
 
-#endif /* FF_API_XVMC */
+#if CONFIG_MPEG1_XVMC_HWACCEL
+AVHWAccel ff_mpeg1_xvmc_hwaccel = {
+    .name           = "mpeg1_xvmc",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG1VIDEO,
+    .pix_fmt        = AV_PIX_FMT_XVMC,
+    .start_frame    = ff_xvmc_field_start,
+    .end_frame      = ff_xvmc_field_end,
+    .decode_slice   = NULL,
+    .decode_mb      = ff_xvmc_decode_mb,
+    .priv_data_size = 0,
+};
+#endif
+
+#if CONFIG_MPEG2_XVMC_HWACCEL
+AVHWAccel ff_mpeg2_xvmc_hwaccel = {
+    .name           = "mpeg2_xvmc",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .pix_fmt        = AV_PIX_FMT_XVMC,
+    .start_frame    = ff_xvmc_field_start,
+    .end_frame      = ff_xvmc_field_end,
+    .decode_slice   = NULL,
+    .decode_mb      = ff_xvmc_decode_mb,
+    .priv_data_size = 0,
+};
+#endif
diff --git a/libavcodec/mpegvideodata.c b/libavcodec/mpegvideodata.c
index f27dd90a93..5f1d8f7c00 100644
--- a/libavcodec/mpegvideodata.c
+++ b/libavcodec/mpegvideodata.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,13 @@ const uint8_t ff_default_chroma_qscale_table[32] = {
     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 };
 
+const uint8_t ff_mpeg2_non_linear_qscale[32] = {
+     0,  1,  2,  3,  4,  5,   6,   7,
+     8, 10, 12, 14, 16, 18,  20,  22,
+    24, 28, 32, 36, 40, 44,  48,  52,
+    56, 64, 72, 80, 88, 96, 104, 112,
+};
+
 const uint8_t ff_mpeg1_dc_scale_table[128] = {
 //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
diff --git a/libavcodec/mpegvideodata.h b/libavcodec/mpegvideodata.h
index d3ace23d56..14f4806d66 100644
--- a/libavcodec/mpegvideodata.h
+++ b/libavcodec/mpegvideodata.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,8 @@ extern const uint8_t ff_alternate_vertical_scan[64];
 extern const uint8_t ff_mpeg1_dc_scale_table[128];
 extern const uint8_t * const ff_mpeg2_dc_scale_table[4];
 
+extern const uint8_t ff_mpeg2_non_linear_qscale[32];
+
 extern const uint8_t ff_default_chroma_qscale_table[32];
 
 #endif /* AVCODEC_MPEGVIDEODATA_H */
diff --git a/libavcodec/mpegvideodsp.c b/libavcodec/mpegvideodsp.c
index 915a844d6b..a58e45ad43 100644
--- a/libavcodec/mpegvideodsp.c
+++ b/libavcodec/mpegvideodsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegvideodsp.h b/libavcodec/mpegvideodsp.h
index b0f45db0bb..293e2548d3 100644
--- a/libavcodec/mpegvideodsp.h
+++ b/libavcodec/mpegvideodsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
index 109bbe5da8..d6caf4ac1c 100644
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,7 @@
 #include <string.h>
 
 #include "config.h"
+#include "libavutil/avassert.h"
 #include "libavutil/attributes.h"
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
@@ -40,7 +41,7 @@ static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
                           (BASIS_SHIFT - RECON_SHIFT));
         int w = weight[i];
         b >>= RECON_SHIFT;
-        assert(-512 < b && b < 512);
+        av_assert2(-512 < b && b < 512);
 
         sum += (w * b) * (w * b) >> 4;
     }
@@ -175,4 +176,6 @@ av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
         ff_mpegvideoencdsp_init_ppc(c, avctx);
     if (ARCH_X86)
         ff_mpegvideoencdsp_init_x86(c, avctx);
+    if (ARCH_MIPS)
+        ff_mpegvideoencdsp_init_mips(c, avctx);
 }
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
index 91a292a296..33f0282fcc 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,5 +52,7 @@ void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
                                  AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
                                  AVCodecContext *avctx);
+void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c,
+                                  AVCodecContext *avctx);
 
 #endif /* AVCODEC_MPEGVIDEOENCDSP_H */
diff --git a/libavcodec/mpl2dec.c b/libavcodec/mpl2dec.c
new file mode 100644
index 0000000000..ca95dc8ae8
--- /dev/null
+++ b/libavcodec/mpl2dec.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * MPL2 subtitles decoder
+ *
+ * @see http://web.archive.org/web/20090328040233/http://napisy.ussbrowarek.org/mpl2-eng.html
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/bprint.h"
+
+static int mpl2_event_to_ass(AVBPrint *buf, const char *p)
+{
+    if (*p == ' ')
+        p++;
+
+    while (*p) {
+        int got_style = 0;
+
+        while (*p && strchr("/\\_", *p)) {
+            if      (*p == '/')  av_bprintf(buf, "{\\i1}");
+            else if (*p == '\\') av_bprintf(buf, "{\\b1}");
+            else if (*p == '_')  av_bprintf(buf, "{\\u1}");
+            got_style = 1;
+            p++;
+        }
+
+        while (*p && *p != '|') {
+            if (*p != '\r' && *p != '\n')
+                av_bprint_chars(buf, *p, 1);
+            p++;
+        }
+
+        if (*p == '|') {
+            if (got_style)
+                av_bprintf(buf, "{\\r}");
+            av_bprintf(buf, "\\N");
+            p++;
+        }
+    }
+
+    return 0;
+}
+
+static int mpl2_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret = 0;
+    AVBPrint buf;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    const int ts_start     = av_rescale_q(avpkt->pts,      avctx->time_base, (AVRational){1,100});
+    const int ts_duration  = avpkt->duration != -1 ?
+                             av_rescale_q(avpkt->duration, avctx->time_base, (AVRational){1,100}) : -1;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (ptr && avpkt->size > 0 && *ptr && !mpl2_event_to_ass(&buf, ptr))
+        ret = ff_ass_add_rect_bprint(sub, &buf, ts_start, ts_duration);
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+AVCodec ff_mpl2_decoder = {
+    .name           = "mpl2",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPL2 subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_MPL2,
+    .decode         = mpl2_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+};
diff --git a/libavcodec/mqc.c b/libavcodec/mqc.c
index 01445817cf..f2d1e3b838 100644
--- a/libavcodec/mqc.c
+++ b/libavcodec/mqc.c
@@ -2,20 +2,20 @@
  * MQ-coder encoder and decoder common functions
  * Copyright (c) 2007 Kamil Nowosad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mqc.h b/libavcodec/mqc.h
index a65433e2b2..39104b1f3d 100644
--- a/libavcodec/mqc.h
+++ b/libavcodec/mqc.h
@@ -2,20 +2,20 @@
  * MQ-coder: structures, common and decoder functions
  * Copyright (c) 2007 Kamil Nowosad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,16 +43,34 @@ typedef struct MqcState {
     unsigned int c;
     unsigned int ct;
     uint8_t cx_states[19];
+    int raw;
 } MqcState;
 
+/* encoder */
+
+/** initialize the encoder */
+void ff_mqc_initenc(MqcState *mqc, uint8_t *bp);
+
+/** code bit d with context cx */
+void ff_mqc_encode(MqcState *mqc, uint8_t *cxstate, int d);
+
+/** number of encoded bytes */
+int ff_mqc_length(MqcState *mqc);
+
+/** flush the encoder [returns number of bytes encoded] */
+int ff_mqc_flush(MqcState *mqc);
+int ff_mqc_flush_to(MqcState *mqc, uint8_t *dst, int *dst_len);
+
 /* decoder */
 
 /**
  * Initialize MQ-decoder.
  * @param mqc   MQ decoder state
  * @param bp    byte poiter
+ * @param raw   raw mode
+ * @param reset reset states
  */
-void ff_mqc_initdec(MqcState *mqc, uint8_t *bp);
+void ff_mqc_initdec(MqcState *mqc, uint8_t *bp, int raw, int reset);
 
 /**
  * MQ decoder.
diff --git a/libavcodec/mqcdec.c b/libavcodec/mqcdec.c
index 889763ac66..34aa51951e 100644
--- a/libavcodec/mqcdec.c
+++ b/libavcodec/mqcdec.c
@@ -2,20 +2,20 @@
  * MQ-coder decoder
  * Copyright (c) 2007 Kamil Nowosad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -68,9 +68,11 @@ static int exchange(MqcState *mqc, uint8_t *cxstate, int lps)
     return d;
 }
 
-void ff_mqc_initdec(MqcState *mqc, uint8_t *bp)
+void ff_mqc_initdec(MqcState *mqc, uint8_t *bp, int raw, int reset)
 {
-    ff_mqc_init_contexts(mqc);
+    mqc->raw = raw;
+    if (reset)
+        ff_mqc_init_contexts(mqc);
     mqc->bp = bp;
     mqc->c  = (*mqc->bp ^ 0xff) << 16;
     bytein(mqc);
@@ -78,8 +80,20 @@ void ff_mqc_initdec(MqcState *mqc, uint8_t *bp)
     mqc->a = 0x8000;
 }
 
+static int mqc_decode_bypass(MqcState *mqc) {
+    int bit = !(mqc->c & 0x40000000);
+    if (!(mqc->c & 0xff)) {
+        mqc->c -= 0x100;
+        bytein(mqc);
+    }
+    mqc->c += mqc->c;
+    return bit;
+}
+
 int ff_mqc_decode(MqcState *mqc, uint8_t *cxstate)
 {
+    if (mqc->raw)
+        return mqc_decode_bypass(mqc);
     mqc->a -= ff_mqc_qe[*cxstate];
     if ((mqc->c >> 16) < mqc->a) {
         if (mqc->a & 0x8000)
diff --git a/libavcodec/mqcenc.c b/libavcodec/mqcenc.c
new file mode 100644
index 0000000000..7c9e1a0df9
--- /dev/null
+++ b/libavcodec/mqcenc.c
@@ -0,0 +1,139 @@
+/*
+ * MQ-coder encoder
+ * Copyright (c) 2007 Kamil Nowosad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * MQ-coder encoder
+ * @file
+ * @author Kamil Nowosad
+ */
+
+#include "libavutil/avassert.h"
+#include "mqc.h"
+
+static void byteout(MqcState *mqc)
+{
+retry:
+    if (*mqc->bp == 0xff){
+        mqc->bp++;
+        *mqc->bp = mqc->c >> 20;
+        mqc->c &= 0xfffff;
+        mqc->ct = 7;
+    } else if ((mqc->c & 0x8000000)){
+        (*mqc->bp)++;
+        mqc->c &= 0x7ffffff;
+        goto retry;
+    } else{
+        mqc->bp++;
+        *mqc->bp = mqc->c >> 19;
+        mqc->c &= 0x7ffff;
+        mqc->ct = 8;
+    }
+}
+
+static void renorme(MqcState *mqc)
+{
+    do{
+        mqc->a += mqc->a;
+        mqc->c += mqc->c;
+        if (!--mqc->ct)
+            byteout(mqc);
+    } while (!(mqc->a & 0x8000));
+}
+
+static void setbits(MqcState *mqc)
+{
+    int tmp = mqc->c + mqc->a;
+    mqc->c |= 0xffff;
+    if (mqc->c >= tmp)
+        mqc->c -= 0x8000;
+}
+
+void ff_mqc_initenc(MqcState *mqc, uint8_t *bp)
+{
+    ff_mqc_init_contexts(mqc);
+    mqc->a = 0x8000;
+    mqc->c = 0;
+    mqc->bp = bp-1;
+    mqc->bpstart = bp;
+    mqc->ct = 12 + (*mqc->bp == 0xff);
+}
+
+void ff_mqc_encode(MqcState *mqc, uint8_t *cxstate, int d)
+{
+    int qe;
+
+    qe = ff_mqc_qe[*cxstate];
+    mqc->a -= qe;
+    if ((*cxstate & 1) == d){
+        if (!(mqc->a & 0x8000)){
+            if (mqc->a < qe)
+                mqc->a = qe;
+            else
+                mqc->c += qe;
+            *cxstate = ff_mqc_nmps[*cxstate];
+            renorme(mqc);
+        } else
+            mqc->c += qe;
+    } else{
+        if (mqc->a < qe)
+            mqc->c += qe;
+        else
+            mqc->a = qe;
+        *cxstate = ff_mqc_nlps[*cxstate];
+        renorme(mqc);
+    }
+}
+
+int ff_mqc_length(MqcState *mqc)
+{
+    return mqc->bp - mqc->bpstart;
+}
+
+int ff_mqc_flush(MqcState *mqc)
+{
+    setbits(mqc);
+    mqc->c = mqc->c << mqc->ct;
+    byteout(mqc);
+    mqc->c = mqc->c << mqc->ct;
+    byteout(mqc);
+    if (*mqc->bp != 0xff)
+        mqc->bp++;
+    return mqc->bp - mqc->bpstart;
+}
+
+int ff_mqc_flush_to(MqcState *mqc, uint8_t *dst, int *dst_len)
+{
+    MqcState mqc2 = *mqc;
+    mqc2.bpstart=
+    mqc2.bp = dst;
+    *mqc2.bp = *mqc->bp;
+    ff_mqc_flush(&mqc2);
+    *dst_len = mqc2.bp - dst;
+    if (mqc->bp < mqc->bpstart) {
+        av_assert1(mqc->bpstart - mqc->bp == 1);
+        av_assert1(*dst_len > 0);
+        av_assert1(mqc->bp[0] == 0 && dst[0] == 0);
+        (*dst_len) --;
+        memmove(dst, dst+1, *dst_len);
+        return mqc->bp - mqc->bpstart + 1 + *dst_len;
+    }
+    return mqc->bp - mqc->bpstart + *dst_len;
+}
diff --git a/libavcodec/msgsmdec.c b/libavcodec/msgsmdec.c
index be5062ad91..4c4ddb46f5 100644
--- a/libavcodec/msgsmdec.c
+++ b/libavcodec/msgsmdec.c
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder, Microsoft variant
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/msgsmdec.h b/libavcodec/msgsmdec.h
index adbda9a9d0..b2a1a627a5 100644
--- a/libavcodec/msgsmdec.h
+++ b/libavcodec/msgsmdec.h
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder, Microsoft variant
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/msmpeg4.c b/libavcodec/msmpeg4.c
index 94bd86e0e3..c9a2108912 100644
--- a/libavcodec/msmpeg4.c
+++ b/libavcodec/msmpeg4.c
@@ -5,20 +5,20 @@
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,6 +37,7 @@
 #include "msmpeg4data.h"
 #include "mpegvideodata.h"
 #include "vc1data.h"
+#include "libavutil/imgutils.h"
 
 /*
  * You can also call this codec : MPEG4 with a twist !
@@ -52,6 +53,9 @@ static av_cold void init_h263_dc_for_msmpeg4(void)
 {
         int level, uni_code, uni_len;
 
+        if(ff_v2_dc_chroma_table[255 + 256][1])
+            return;
+
         for(level=-256; level<256; level++){
             int size, v, l;
             /* find number of bits */
@@ -104,8 +108,6 @@ static av_cold void init_h263_dc_for_msmpeg4(void)
 
 av_cold void ff_msmpeg4_common_init(MpegEncContext *s)
 {
-    static int initialized=0;
-
     switch(s->msmpeg4_version){
     case 1:
     case 2:
@@ -144,11 +146,7 @@ av_cold void ff_msmpeg4_common_init(MpegEncContext *s)
     }
     //Note the default tables are set in common_init in mpegvideo.c
 
-    if(!initialized){
-        initialized=1;
-
-        init_h263_dc_for_msmpeg4();
-    }
+    init_h263_dc_for_msmpeg4();
 }
 
 /* predict coded block */
@@ -178,13 +176,13 @@ int ff_msmpeg4_coded_block_pred(MpegEncContext * s, int n, uint8_t **coded_block
     return pred;
 }
 
-static int get_dc(uint8_t *src, int stride, int scale)
+static int get_dc(uint8_t *src, int stride, int scale, int block_size)
 {
     int y;
     int sum=0;
-    for(y=0; y<8; y++){
+    for(y=0; y<block_size; y++){
         int x;
-        for(x=0; x<8; x++){
+        for(x=0; x<block_size; x++){
             sum+=src[x + y*stride];
         }
     }
@@ -230,13 +228,13 @@ int ff_msmpeg4_pred_dc(MpegEncContext *s, int n,
         "addl %%eax, %2         \n\t"
         "addl %%eax, %1         \n\t"
         "addl %0, %%eax         \n\t"
-        "mull %4                \n\t"
+        "imull %4               \n\t"
         "movl %%edx, %0         \n\t"
         "movl %1, %%eax         \n\t"
-        "mull %4                \n\t"
+        "imull %4               \n\t"
         "movl %%edx, %1         \n\t"
         "movl %2, %%eax         \n\t"
-        "mull %4                \n\t"
+        "imull %4               \n\t"
         "movl %%edx, %2         \n\t"
         : "+b" (a), "+c" (b), "+D" (c)
         : "g" (scale), "S" (ff_inverse[scale])
@@ -276,17 +274,18 @@ int ff_msmpeg4_pred_dc(MpegEncContext *s, int n,
                     *dir_ptr = 0;
                 }
             }else{
+                int bs = 8 >> s->avctx->lowres;
                 if(n<4){
                     wrap= s->linesize;
-                    dest= s->current_picture.f->data[0] + (((n >> 1) + 2*s->mb_y) * 8*  wrap ) + ((n & 1) + 2*s->mb_x) * 8;
+                    dest= s->current_picture.f->data[0] + (((n >> 1) + 2*s->mb_y) * bs*  wrap ) + ((n & 1) + 2*s->mb_x) * bs;
                 }else{
                     wrap= s->uvlinesize;
-                    dest= s->current_picture.f->data[n - 3] + (s->mb_y * 8 * wrap) + s->mb_x * 8;
+                    dest= s->current_picture.f->data[n - 3] + (s->mb_y * bs * wrap) + s->mb_x * bs;
                 }
                 if(s->mb_x==0) a= (1024 + (scale>>1))/scale;
-                else           a= get_dc(dest-8, wrap, scale*8);
+                else           a= get_dc(dest-bs, wrap, scale*8>>(2*s->avctx->lowres), bs);
                 if(s->mb_y==0) c= (1024 + (scale>>1))/scale;
-                else           c= get_dc(dest-8*wrap, wrap, scale*8);
+                else           c= get_dc(dest-bs*wrap, wrap, scale*8>>(2*s->avctx->lowres), bs);
 
                 if (s->h263_aic_dir==0) {
                     pred= a;
diff --git a/libavcodec/msmpeg4.h b/libavcodec/msmpeg4.h
index e57ae6655f..bcdb967401 100644
--- a/libavcodec/msmpeg4.h
+++ b/libavcodec/msmpeg4.h
@@ -2,20 +2,20 @@
  * MSMPEG4 backend for encoder and decoder
  * copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,10 +69,12 @@ int ff_msmpeg4_pred_dc(MpegEncContext *s, int n,
 #define CONFIG_MSMPEG4_DECODER (CONFIG_MSMPEG4V1_DECODER || \
                                 CONFIG_MSMPEG4V2_DECODER || \
                                 CONFIG_MSMPEG4V3_DECODER || \
+                                CONFIG_WMV1_DECODER      || \
                                 CONFIG_WMV2_DECODER      || \
                                 CONFIG_VC1_DECODER)
 #define CONFIG_MSMPEG4_ENCODER (CONFIG_MSMPEG4V2_ENCODER || \
                                 CONFIG_MSMPEG4V3_ENCODER || \
+                                CONFIG_WMV1_ENCODER      || \
                                 CONFIG_WMV2_ENCODER)
 
 #endif /* AVCODEC_MSMPEG4_H */
diff --git a/libavcodec/msmpeg4data.c b/libavcodec/msmpeg4data.c
index cf291afc80..8eb07e95cb 100644
--- a/libavcodec/msmpeg4data.c
+++ b/libavcodec/msmpeg4data.c
@@ -5,20 +5,20 @@
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/msmpeg4data.h b/libavcodec/msmpeg4data.h
index ca2dac14bd..24a10d9f2e 100644
--- a/libavcodec/msmpeg4data.h
+++ b/libavcodec/msmpeg4data.h
@@ -5,20 +5,20 @@
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/msmpeg4dec.c b/libavcodec/msmpeg4dec.c
index 67a8609cdd..aaadd9c073 100644
--- a/libavcodec/msmpeg4dec.c
+++ b/libavcodec/msmpeg4dec.c
@@ -1,24 +1,24 @@
 /*
  * MSMPEG4 backend for encoder and decoder
  * Copyright (c) 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2013 Michael Niedermayer <michaelni@gmx.at>
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include "mpegutils.h"
 #include "mpegvideo.h"
 #include "msmpeg4.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/x86/asm.h"
 #include "h263.h"
 #include "mpeg4video.h"
@@ -104,6 +105,7 @@ static int msmpeg4v2_decode_motion(MpegEncContext * s, int pred, int f_code)
 static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
 {
     int cbp, code, i;
+    uint32_t * const mb_type_ptr = &s->current_picture.mb_type[s->mb_x + s->mb_y*s->mb_stride];
 
     if (s->pict_type == AV_PICTURE_TYPE_P) {
         if (s->use_skip_mb_code) {
@@ -117,6 +119,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
                 s->mv[0][0][0] = 0;
                 s->mv[0][0][1] = 0;
                 s->mb_skipped = 1;
+                *mb_type_ptr = MB_TYPE_SKIP | MB_TYPE_L0 | MB_TYPE_16x16;
                 return 0;
             }
         }
@@ -165,6 +168,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
         s->mv_type = MV_TYPE_16X16;
         s->mv[0][0][0] = mx;
         s->mv[0][0][1] = my;
+        *mb_type_ptr = MB_TYPE_L0 | MB_TYPE_16x16;
     } else {
         if(s->msmpeg4_version==2){
             s->ac_pred = get_bits1(&s->gb);
@@ -174,6 +178,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
             cbp|= get_vlc2(&s->gb, ff_h263_cbpy_vlc.table, CBPY_VLC_BITS, 1)<<2; //FIXME check errors
             if(s->pict_type==AV_PICTURE_TYPE_P) cbp^=0x3C;
         }
+        *mb_type_ptr = MB_TYPE_INTRA;
     }
 
     s->bdsp.clear_blocks(s->block[0]);
@@ -283,18 +288,19 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, int16_t block[6][64])
 av_cold int ff_msmpeg4_decode_init(AVCodecContext *avctx)
 {
     MpegEncContext *s = avctx->priv_data;
-    static int done = 0;
-    int i;
+    static volatile int done = 0;
+    int i, ret;
     MVTable *mv;
 
+    if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
+        return ret;
+
     if (ff_h263_decode_init(avctx) < 0)
         return -1;
 
     ff_msmpeg4_common_init(s);
 
     if (!done) {
-        done = 1;
-
         for(i=0;i<NB_RL_TABLES;i++) {
             ff_rl_init(&ff_rl_table[i], ff_static_rl_table_store[i]);
         }
@@ -364,6 +370,7 @@ av_cold int ff_msmpeg4_decode_init(AVCodecContext *avctx)
         INIT_VLC_STATIC(&ff_inter_intra_vlc, INTER_INTRA_VLC_BITS, 4,
                  &ff_table_inter_intra[0][1], 2, 1,
                  &ff_table_inter_intra[0][0], 2, 1, 8);
+        done = 1;
     }
 
     switch(s->msmpeg4_version){
@@ -534,7 +541,7 @@ int ff_msmpeg4_decode_picture_header(MpegEncContext * s)
             s->no_rounding = 0;
         }
     }
-    ff_dlog(s->avctx, "%d %d %d %d %d\n", s->pict_type, s->bit_rate,
+    ff_dlog(s->avctx, "%d %"PRId64" %d %d %d\n", s->pict_type, s->bit_rate,
             s->inter_intra_pred, s->width, s->height);
 
     s->esc3_level_length= 0;
@@ -581,8 +588,11 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
         } else {
             level = get_vlc2(&s->gb, v2_dc_chroma_vlc.table, DC_VLC_BITS, 3);
         }
-        if (level < 0)
+        if (level < 0) {
+            av_log(s->avctx, AV_LOG_ERROR, "illegal dc vlc\n");
+            *dir_ptr = 0;
             return -1;
+        }
         level-=256;
     }else{  //FIXME optimize use unified tables & index
         if (n < 4) {
@@ -592,6 +602,7 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
         }
         if (level < 0){
             av_log(s->avctx, AV_LOG_ERROR, "illegal dc vlc\n");
+            *dir_ptr = 0;
             return -1;
         }
 
@@ -648,7 +659,6 @@ int ff_msmpeg4_decode_block(MpegEncContext * s, int16_t * block,
         if (level < 0){
             av_log(s->avctx, AV_LOG_ERROR, "dc overflow- block: %d qscale: %d//\n", n, s->qscale);
             if(s->inter_intra_pred) level=0;
-            else                    return -1;
         }
         if (n < 4) {
             rl = &ff_rl_table[s->rl_table_index];
@@ -835,9 +845,10 @@ int ff_msmpeg4_decode_block(MpegEncContext * s, int16_t * block,
             if(i&(~63)){
                 const int left= get_bits_left(&s->gb);
                 if (((i + 192 == 64 && level / qmul == -1) ||
-                     !(s->avctx->err_recognition & AV_EF_BITSTREAM)) &&
+                     !(s->avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT))) &&
                     left >= 0) {
                     av_log(s->avctx, AV_LOG_ERROR, "ignoring overflow at %d %d\n", s->mb_x, s->mb_y);
+                    i = 63;
                     break;
                 }else{
                     av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
@@ -914,6 +925,7 @@ AVCodec ff_msmpeg4v1_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
@@ -930,6 +942,7 @@ AVCodec ff_msmpeg4v2_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
@@ -946,6 +959,7 @@ AVCodec ff_msmpeg4v3_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
@@ -962,6 +976,7 @@ AVCodec ff_wmv1_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
diff --git a/libavcodec/msmpeg4enc.c b/libavcodec/msmpeg4enc.c
index 631c167811..e1ade248a3 100644
--- a/libavcodec/msmpeg4enc.c
+++ b/libavcodec/msmpeg4enc.c
@@ -5,20 +5,20 @@
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,7 +34,6 @@
 #include "libavutil/avutil.h"
 #include "libavutil/mem.h"
 #include "mpegvideo.h"
-#include "msmpeg4.h"
 #include "h263.h"
 #include "internal.h"
 #include "mpeg4video.h"
@@ -160,8 +159,8 @@ av_cold int ff_msmpeg4_encode_init(MpegEncContext *s)
 static void find_best_tables(MpegEncContext * s)
 {
     int i;
-    int best       =-1, best_size       =9999999;
-    int chroma_best=-1, best_chroma_size=9999999;
+    int best        = 0, best_size        = INT_MAX;
+    int chroma_best = 0, best_chroma_size = INT_MAX;
 
     for(i=0; i<3; i++){
         int level;
@@ -241,7 +240,7 @@ void ff_msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
     s->per_mb_rl_table = 0;
     if(s->msmpeg4_version==4)
         s->inter_intra_pred= (s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE && s->pict_type==AV_PICTURE_TYPE_P);
-    ff_dlog(s, "%d %d %d %d %d\n", s->pict_type, s->bit_rate,
+    ff_dlog(s, "%d %"PRId64" %d %d %d\n", s->pict_type, s->bit_rate,
             s->inter_intra_pred, s->width, s->height);
 
     if (s->pict_type == AV_PICTURE_TYPE_I) {
@@ -284,14 +283,15 @@ void ff_msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
 
 void ff_msmpeg4_encode_ext_header(MpegEncContext * s)
 {
-        put_bits(&s->pb, 5, s->avctx->time_base.den / s->avctx->time_base.num); //yes 29.97 -> 29
+        unsigned fps = s->avctx->time_base.den / s->avctx->time_base.num / FFMAX(s->avctx->ticks_per_frame, 1);
+        put_bits(&s->pb, 5, FFMIN(fps, 31)); //yes 29.97 -> 29
 
         put_bits(&s->pb, 11, FFMIN(s->bit_rate/1024, 2047));
 
         if(s->msmpeg4_version>=3)
             put_bits(&s->pb, 1, s->flipflop_rounding);
         else
-            assert(s->flipflop_rounding==0);
+            av_assert0(s->flipflop_rounding==0);
 }
 
 void ff_msmpeg4_encode_motion(MpegEncContext * s,
@@ -504,7 +504,7 @@ void ff_msmpeg4_encode_mb(MpegEncContext * s,
 static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr)
 {
     int sign, code;
-    int pred, extquant;
+    int pred, av_uninit(extquant);
     int extrabits = 0;
 
     int16_t *dc_val;
diff --git a/libavcodec/msrle.c b/libavcodec/msrle.c
index a1adeba949..df9f795b66 100644
--- a/libavcodec/msrle.c
+++ b/libavcodec/msrle.c
@@ -1,21 +1,21 @@
 /*
  * Microsoft RLE video decoder
- * Copyright (C) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,6 +35,7 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "msrledec.h"
+#include "libavutil/imgutils.h"
 
 typedef struct MsrleContext {
     AVCodecContext *avctx;
@@ -50,10 +51,14 @@ typedef struct MsrleContext {
 static av_cold int msrle_decode_init(AVCodecContext *avctx)
 {
     MsrleContext *s = avctx->priv_data;
+    int i;
 
     s->avctx = avctx;
 
     switch (avctx->bits_per_coded_sample) {
+    case 1:
+        avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
+        break;
     case 4:
     case 8:
         avctx->pix_fmt = AV_PIX_FMT_PAL8;
@@ -70,6 +75,10 @@ static av_cold int msrle_decode_init(AVCodecContext *avctx)
     if (!s->frame)
         return AVERROR(ENOMEM);
 
+    if (avctx->extradata_size >= 4)
+        for (i = 0; i < FFMIN(avctx->extradata_size, AVPALETTE_SIZE)/4; i++)
+            s->pal[i] = 0xFFU<<24 | AV_RL32(avctx->extradata+4*i);
+
     return 0;
 }
 
@@ -86,30 +95,30 @@ static int msrle_decode_frame(AVCodecContext *avctx,
     s->buf = buf;
     s->size = buf_size;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
-    if (avctx->bits_per_coded_sample <= 8) {
+    if (avctx->bits_per_coded_sample > 1 && avctx->bits_per_coded_sample <= 8) {
         const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
 
         if (pal) {
             s->frame->palette_has_changed = 1;
             memcpy(s->pal, pal, AVPALETTE_SIZE);
         }
-
         /* make the palette available */
         memcpy(s->frame->data[1], s->pal, AVPALETTE_SIZE);
     }
 
     /* FIXME how to correctly detect RLE ??? */
     if (avctx->height * istride == avpkt->size) { /* assume uncompressed */
-        int linesize = avctx->width * avctx->bits_per_coded_sample / 8;
+        int linesize = av_image_get_linesize(avctx->pix_fmt, avctx->width, 0);
         uint8_t *ptr = s->frame->data[0];
         uint8_t *buf = avpkt->data + (avctx->height-1)*istride;
         int i, j;
 
+        if (linesize < 0)
+            return linesize;
+
         for (i = 0; i < avctx->height; i++) {
             if (avctx->bits_per_coded_sample == 4) {
                 for (j = 0; j < avctx->width - 1; j += 2) {
diff --git a/libavcodec/msrledec.c b/libavcodec/msrledec.c
index 370d9bdfce..3aa5e3ceb9 100644
--- a/libavcodec/msrledec.c
+++ b/libavcodec/msrledec.c
@@ -2,20 +2,20 @@
  * Microsoft RLE decoder
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,17 +36,15 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVPicture *pic,
     unsigned char rle_code;
     unsigned char extra_byte, odd_pixel;
     unsigned char stream_byte;
-    unsigned int pixel_ptr = 0;
-    int row_dec = pic->linesize[0];
-    int row_ptr = (avctx->height - 1) * row_dec;
-    int frame_size = FFABS(row_dec) * avctx->height;
+    int pixel_ptr = 0;
+    int line = avctx->height - 1;
     int i;
 
-    while (row_ptr >= 0) {
+    while (line >= 0 && pixel_ptr <= avctx->width) {
         if (bytestream2_get_bytes_left(gb) <= 0) {
             av_log(avctx, AV_LOG_ERROR,
-                   "MS RLE: bytestream overrun, %d rows left\n",
-                   row_ptr);
+                   "MS RLE: bytestream overrun, %dx%d left\n",
+                   avctx->width - pixel_ptr, line);
             return AVERROR_INVALIDDATA;
         }
         rle_code = stream_byte = bytestream2_get_byteu(gb);
@@ -55,7 +53,7 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVPicture *pic,
             stream_byte = bytestream2_get_byte(gb);
             if (stream_byte == 0) {
                 /* line is done, goto the next one */
-                row_ptr -= row_dec;
+                line--;
                 pixel_ptr = 0;
             } else if (stream_byte == 1) {
                 /* decode is done */
@@ -65,13 +63,13 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVPicture *pic,
                 stream_byte = bytestream2_get_byte(gb);
                 pixel_ptr += stream_byte;
                 stream_byte = bytestream2_get_byte(gb);
-                row_ptr -= stream_byte * row_dec;
+                avpriv_request_sample(avctx, "Unused stream byte %X", stream_byte);
             } else {
                 // copy pixels from encoded stream
                 odd_pixel =  stream_byte & 1;
                 rle_code = (stream_byte + 1) / 2;
                 extra_byte = rle_code & 0x01;
-                if (row_ptr + pixel_ptr + stream_byte > frame_size ||
+                if (pixel_ptr + 2*rle_code - odd_pixel > avctx->width ||
                     bytestream2_get_bytes_left(gb) < rle_code) {
                     av_log(avctx, AV_LOG_ERROR,
                            "MS RLE: frame/stream ptr just went out of bounds (copy)\n");
@@ -82,13 +80,13 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVPicture *pic,
                     if (pixel_ptr >= avctx->width)
                         break;
                     stream_byte = bytestream2_get_byteu(gb);
-                    pic->data[0][row_ptr + pixel_ptr] = stream_byte >> 4;
+                    pic->data[0][line * pic->linesize[0] + pixel_ptr] = stream_byte >> 4;
                     pixel_ptr++;
                     if (i + 1 == rle_code && odd_pixel)
                         break;
                     if (pixel_ptr >= avctx->width)
                         break;
-                    pic->data[0][row_ptr + pixel_ptr] = stream_byte & 0x0F;
+                    pic->data[0][line * pic->linesize[0] + pixel_ptr] = stream_byte & 0x0F;
                     pixel_ptr++;
                 }
 
@@ -98,9 +96,9 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVPicture *pic,
             }
         } else {
             // decode a run of data
-            if (row_ptr + pixel_ptr + stream_byte > frame_size) {
+            if (pixel_ptr + rle_code > avctx->width + 1) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "MS RLE: frame ptr just went out of bounds (run)\n");
+                       "MS RLE: frame ptr just went out of bounds (run) %d %d %d\n", pixel_ptr, rle_code, avctx->width);
                 return AVERROR_INVALIDDATA;
             }
             stream_byte = bytestream2_get_byte(gb);
@@ -108,9 +106,9 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVPicture *pic,
                 if (pixel_ptr >= avctx->width)
                     break;
                 if ((i & 1) == 0)
-                    pic->data[0][row_ptr + pixel_ptr] = stream_byte >> 4;
+                    pic->data[0][line * pic->linesize[0] + pixel_ptr] = stream_byte >> 4;
                 else
-                    pic->data[0][row_ptr + pixel_ptr] = stream_byte & 0x0F;
+                    pic->data[0][line * pic->linesize[0] + pixel_ptr] = stream_byte & 0x0F;
                 pixel_ptr++;
             }
         }
@@ -138,7 +136,8 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVPicture *pic,
     unsigned int width= FFABS(pic->linesize[0]) / (depth >> 3);
 
     output     = pic->data[0] + (avctx->height - 1) * pic->linesize[0];
-    output_end = pic->data[0] +  avctx->height      * pic->linesize[0];
+    output_end = output + FFABS(pic->linesize[0]);
+
     while (bytestream2_get_bytes_left(gb) > 0) {
         p1 = bytestream2_get_byteu(gb);
         if(p1 == 0) { //Escape code
@@ -155,6 +154,7 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVPicture *pic,
                     }
                 }
                 output = pic->data[0] + line * pic->linesize[0];
+                output_end = output + FFABS(pic->linesize[0]);
                 pos = 0;
                 continue;
             } else if(p2 == 1) { //End-of-picture
@@ -169,11 +169,11 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVPicture *pic,
                     return -1;
                 }
                 output = pic->data[0] + line * pic->linesize[0] + pos * (depth >> 3);
+                output_end = pic->data[0] + line * pic->linesize[0] + FFABS(pic->linesize[0]);
                 continue;
             }
             // Copy data
-            if ((pic->linesize[0] > 0 && output + p2 * (depth >> 3) > output_end) ||
-                (pic->linesize[0] < 0 && output + p2 * (depth >> 3) < output_end)) {
+            if (output + p2 * (depth >> 3) > output_end) {
                 bytestream2_skip(gb, 2 * (depth >> 3));
                 continue;
             } else if (bytestream2_get_bytes_left(gb) < p2 * (depth >> 3)) {
@@ -182,9 +182,9 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVPicture *pic,
             }
 
             if ((depth == 8) || (depth == 24)) {
-                for(i = 0; i < p2 * (depth >> 3); i++) {
-                    *output++ = bytestream2_get_byteu(gb);
-                }
+                bytestream2_get_bufferu(gb, output, p2 * (depth >> 3));
+                output += p2 * (depth >> 3);
+
                 // RLE8 copy is actually padded - and runs are not!
                 if(depth == 8 && (p2 & 1)) {
                     bytestream2_skip(gb, 1);
@@ -203,36 +203,39 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVPicture *pic,
             pos += p2;
         } else { //run of pixels
             uint8_t pix[3]; //original pixel
-            switch(depth){
-            case  8: pix[0] = bytestream2_get_byte(gb);
-                     break;
-            case 16: pix16  = bytestream2_get_le16(gb);
-                     break;
-            case 24: pix[0] = bytestream2_get_byte(gb);
-                     pix[1] = bytestream2_get_byte(gb);
-                     pix[2] = bytestream2_get_byte(gb);
-                     break;
-            case 32: pix32  = bytestream2_get_le32(gb);
-                     break;
-            }
-            if ((pic->linesize[0] > 0 && output + p1 * (depth >> 3) > output_end) ||
-                (pic->linesize[0] < 0 && output + p1 * (depth >> 3) < output_end))
+            if (output + p1 * (depth >> 3) > output_end)
                 continue;
-            for(i = 0; i < p1; i++) {
-                switch(depth){
-                case  8: *output++ = pix[0];
-                         break;
-                case 16: *(uint16_t*)output = pix16;
-                         output += 2;
-                         break;
-                case 24: *output++ = pix[0];
-                         *output++ = pix[1];
-                         *output++ = pix[2];
-                         break;
-                case 32: *(uint32_t*)output = pix32;
-                         output += 4;
-                         break;
+
+            switch(depth){
+            case  8:
+                pix[0] = bytestream2_get_byte(gb);
+                memset(output, pix[0], p1);
+                output += p1;
+                break;
+            case 16:
+                pix16  = bytestream2_get_le16(gb);
+                for(i = 0; i < p1; i++) {
+                        *(uint16_t*)output = pix16;
+                        output += 2;
+                }
+                break;
+            case 24:
+                pix[0] = bytestream2_get_byte(gb);
+                pix[1] = bytestream2_get_byte(gb);
+                pix[2] = bytestream2_get_byte(gb);
+                for(i = 0; i < p1; i++) {
+                        *output++ = pix[0];
+                        *output++ = pix[1];
+                        *output++ = pix[2];
+                }
+                break;
+            case 32:
+                pix32  = bytestream2_get_le32(gb);
+                for(i = 0; i < p1; i++) {
+                        *(uint32_t*)output = pix32;
+                        output += 4;
                 }
+                break;
             }
             pos += p1;
         }
diff --git a/libavcodec/msrledec.h b/libavcodec/msrledec.h
index a594de37be..3f666360c5 100644
--- a/libavcodec/msrledec.h
+++ b/libavcodec/msrledec.h
@@ -2,20 +2,20 @@
  * Microsoft RLE decoder
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mss1.c b/libavcodec/mss1.c
index a31af06e73..a579d9d9a4 100644
--- a/libavcodec/mss1.c
+++ b/libavcodec/mss1.c
@@ -2,20 +2,20 @@
  * Microsoft Screen 1 (aka Windows Media Video V7 Screen) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -60,7 +60,7 @@ static void arith_normalise(ArithCoder *c)
     }
 }
 
-ARITH_GET_BIT()
+ARITH_GET_BIT(arith)
 
 static int arith_get_bits(ArithCoder *c, int bits)
 {
@@ -105,7 +105,7 @@ static int arith_get_prob(ArithCoder *c, int16_t *probs)
     return sym;
 }
 
-ARITH_GET_MODEL_SYM()
+ARITH_GET_MODEL_SYM(arith)
 
 static void arith_init(ArithCoder *c, GetBitContext *gb)
 {
@@ -130,7 +130,7 @@ static int decode_pal(MSS12Context *ctx, ArithCoder *acoder)
         r = arith_get_bits(acoder, 8);
         g = arith_get_bits(acoder, 8);
         b = arith_get_bits(acoder, 8);
-        *pal++ = (r << 16) | (g << 8) | b;
+        *pal++ = (0xFFU << 24) | (r << 16) | (g << 8) | b;
     }
 
     return !!ncol;
@@ -139,8 +139,6 @@ static int decode_pal(MSS12Context *ctx, ArithCoder *acoder)
 static int mss1_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                              AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
     MSS1Context *ctx = avctx->priv_data;
     MSS12Context *c = &ctx->ctx;
     GetBitContext gb;
@@ -148,13 +146,13 @@ static int mss1_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int pal_changed = 0;
     int ret;
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
+
     arith_init(&acoder, &gb);
 
-    if ((ret = ff_reget_buffer(avctx, ctx->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, ctx->pic)) < 0)
         return ret;
-    }
 
     c->pal_pic    =  ctx->pic->data[0] + ctx->pic->linesize[0] * (avctx->height - 1);
     c->pal_stride = -ctx->pic->linesize[0];
@@ -184,7 +182,7 @@ static int mss1_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     *got_frame      = 1;
 
     /* always report that the buffer was completely consumed */
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int mss1_decode_init(AVCodecContext *avctx)
@@ -199,6 +197,8 @@ static av_cold int mss1_decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     ret = ff_mss12_decode_init(&c->ctx, 0, &c->sc, NULL);
+    if (ret < 0)
+        av_frame_free(&c->pic);
 
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
diff --git a/libavcodec/mss12.c b/libavcodec/mss12.c
index d4b621fc89..6b58aa2955 100644
--- a/libavcodec/mss12.c
+++ b/libavcodec/mss12.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -450,7 +450,7 @@ static int decode_pivot(SliceContext *sc, ArithCoder *acoder, int base)
         val = acoder->get_number(acoder, (base + 1) / 2 - 2) + 3;
     }
 
-    if (val >= base)
+    if ((unsigned)val >= base)
         return -1;
 
     return inv ? base - val : val;
@@ -588,6 +588,11 @@ av_cold int ff_mss12_decode_init(MSS12Context *c, int version,
                avctx->coded_width, avctx->coded_height);
         return AVERROR_INVALIDDATA;
     }
+    if (avctx->coded_width < 1 || avctx->coded_height < 1) {
+        av_log(avctx, AV_LOG_ERROR, "Frame dimensions %dx%d too small",
+               avctx->coded_width, avctx->coded_height);
+        return AVERROR_INVALIDDATA;
+    }
 
     av_log(avctx, AV_LOG_DEBUG, "Encoder version %"PRIu32".%"PRIu32"\n",
            AV_RB32(avctx->extradata + 4), AV_RB32(avctx->extradata + 8));
@@ -647,11 +652,11 @@ av_cold int ff_mss12_decode_init(MSS12Context *c, int version,
     }
 
     for (i = 0; i < 256; i++)
-        c->pal[i] = AV_RB24(avctx->extradata + 52 +
+        c->pal[i] = 0xFFU << 24 | AV_RB24(avctx->extradata + 52 +
                             (version ? 8 : 0) + i * 3);
 
     c->mask_stride = FFALIGN(avctx->width, 16);
-    c->mask        = av_malloc(c->mask_stride * avctx->height);
+    c->mask        = av_malloc_array(c->mask_stride, avctx->height);
     if (!c->mask) {
         av_log(avctx, AV_LOG_ERROR, "Cannot allocate mask plane\n");
         return AVERROR(ENOMEM);
diff --git a/libavcodec/mss12.h b/libavcodec/mss12.h
index 5b1fee8913..f953167389 100644
--- a/libavcodec/mss12.h
+++ b/libavcodec/mss12.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -99,8 +99,8 @@ int ff_mss12_decode_init(MSS12Context *c, int version,
                          SliceContext *sc1, SliceContext *sc2);
 int ff_mss12_decode_end(MSS12Context *ctx);
 
-#define ARITH_GET_BIT(VERSION)                                          \
-static int arith ## VERSION ## _get_bit(ArithCoder *c)                  \
+#define ARITH_GET_BIT(prefix)                                           \
+static int prefix ## _get_bit(ArithCoder *c)                            \
 {                                                                       \
     int range = c->high - c->low + 1;                                   \
     int bit   = 2 * c->value - c->low >= c->high;                       \
@@ -110,22 +110,22 @@ static int arith ## VERSION ## _get_bit(ArithCoder *c)                  \
     else                                                                \
         c->high = c->low + (range >> 1) - 1;                            \
                                                                         \
-    arith ## VERSION ## _normalise(c);                                  \
+    prefix ## _normalise(c);                                            \
                                                                         \
     return bit;                                                         \
 }
 
-#define ARITH_GET_MODEL_SYM(VERSION)                                    \
-static int arith ## VERSION ## _get_model_sym(ArithCoder *c, Model *m)  \
+#define ARITH_GET_MODEL_SYM(prefix)                                     \
+static int prefix ## _get_model_sym(ArithCoder *c, Model *m)            \
 {                                                                       \
     int idx, val;                                                       \
                                                                         \
-    idx = arith ## VERSION ## _get_prob(c, m->cum_prob);                \
+    idx = prefix ## _get_prob(c, m->cum_prob);                          \
                                                                         \
     val = m->idx2sym[idx];                                              \
     ff_mss12_model_update(m, idx);                                      \
                                                                         \
-    arith ## VERSION ## _normalise(c);                                  \
+    prefix ## _normalise(c);                                            \
                                                                         \
     return val;                                                         \
 }
diff --git a/libavcodec/mss2.c b/libavcodec/mss2.c
index 4a4eaa0b88..74e52af6cd 100644
--- a/libavcodec/mss2.c
+++ b/libavcodec/mss2.c
@@ -1,20 +1,20 @@
 /*
  * Microsoft Screen 2 (aka Windows Media Video V9 Screen) decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,13 +52,13 @@ static void arith2_normalise(ArithCoder *c)
             c->value ^= 0x8000;
             c->low   ^= 0x8000;
         }
-        c->high  = c->high  << 8 & 0xFFFFFF | 0xFF;
-        c->value = c->value << 8 & 0xFFFFFF | bytestream2_get_byte(c->gbc.gB);
-        c->low   = c->low   << 8 & 0xFFFFFF;
+        c->high  = (uint16_t)c->high  << 8  | 0xFF;
+        c->value = (uint16_t)c->value << 8  | bytestream2_get_byte(c->gbc.gB);
+        c->low   = (uint16_t)c->low   << 8;
     }
 }
 
-ARITH_GET_BIT(2)
+ARITH_GET_BIT(arith2)
 
 /* L. Stuiver and A. Moffat: "Piecewise Integer Mapping for Arithmetic Coding."
  * In Proc. 8th Data Compression Conference (DCC '98), pp. 3-12, Mar. 1998 */
@@ -131,7 +131,7 @@ static int arith2_get_prob(ArithCoder *c, int16_t *probs)
     return i;
 }
 
-ARITH_GET_MODEL_SYM(2)
+ARITH_GET_MODEL_SYM(arith2)
 
 static int arith2_get_consumed_bytes(ArithCoder *c)
 {
@@ -318,7 +318,7 @@ static int decode_rle(GetBitContext *gb, uint8_t *pal_dst, int pal_stride,
     if (next_code != 1 << current_length)
         return AVERROR_INVALIDDATA;
 
-    if (i = init_vlc(&vlc, 9, alphabet_size, bits, 1, 1, codes, 4, 4, 0))
+    if ((i = init_vlc(&vlc, 9, alphabet_size, bits, 1, 1, codes, 4, 4, 0)) < 0)
         return i;
 
     /* frame decode */
@@ -381,7 +381,8 @@ static int decode_wmv9(AVCodecContext *avctx, const uint8_t *buf, int buf_size,
 
     ff_mpeg_flush(avctx);
 
-    init_get_bits(&s->gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&s->gb, buf, buf_size)) < 0)
+        return ret;
 
     s->loop_filter = avctx->skip_loop_filter < AVDISCARD_ALL;
 
@@ -424,8 +425,8 @@ static int decode_wmv9(AVCodecContext *avctx, const uint8_t *buf, int buf_size,
 
     if (v->respic == 3) {
         ctx->dsp.upsample_plane(f->data[0], f->linesize[0], w,      h);
-        ctx->dsp.upsample_plane(f->data[1], f->linesize[1], w >> 1, h >> 1);
-        ctx->dsp.upsample_plane(f->data[2], f->linesize[2], w >> 1, h >> 1);
+        ctx->dsp.upsample_plane(f->data[1], f->linesize[1], w+1 >> 1, h+1 >> 1);
+        ctx->dsp.upsample_plane(f->data[2], f->linesize[2], w+1 >> 1, h+1 >> 1);
     } else if (v->respic)
         avpriv_request_sample(v->s.avctx,
                               "Asymmetric WMV9 rectangle subsampling");
@@ -479,7 +480,8 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     av_assert0(AV_INPUT_BUFFER_PADDING_SIZE >=
                ARITH2_PADDING + (MIN_CACHE_BITS + 7) / 8);
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
 
     if (keyframe = get_bits1(&gb))
         skip_bits(&gb, 7);
@@ -595,10 +597,8 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (c->mvX < 0 || c->mvY < 0) {
         FFSWAP(uint8_t *, c->pal_pic, c->last_pal_pic);
 
-        if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
             return ret;
-        }
 
         if (ctx->last_pic->data[0]) {
             av_assert0(frame->linesize[0] == ctx->last_pic->linesize[0]);
@@ -609,10 +609,8 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return AVERROR_INVALIDDATA;
         }
     } else {
-        if ((ret = ff_reget_buffer(avctx, ctx->last_pic)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+        if ((ret = ff_reget_buffer(avctx, ctx->last_pic)) < 0)
             return ret;
-        }
         if ((ret = av_frame_ref(frame, ctx->last_pic)) < 0)
             return ret;
 
@@ -641,7 +639,8 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 ff_mss12_slicecontext_reset(&ctx->sc[1]);
         }
         if (is_rle) {
-            init_get_bits(&gb, buf, buf_size * 8);
+            if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+                return ret;
             if (ret = decode_rle(&gb, c->pal_pic, c->pal_stride,
                                  c->rgb_pic, c->rgb_stride, c->pal, keyframe,
                                  ctx->split_position, 0,
@@ -819,10 +818,11 @@ static av_cold int mss2_decode_init(AVCodecContext *avctx)
     c->avctx = avctx;
     if (ret = ff_mss12_decode_init(c, 1, &ctx->sc[0], &ctx->sc[1]))
         return ret;
+    ctx->last_pic   = av_frame_alloc();
     c->pal_stride   = c->mask_stride;
     c->pal_pic      = av_mallocz(c->pal_stride * avctx->height);
     c->last_pal_pic = av_mallocz(c->pal_stride * avctx->height);
-    if (!c->pal_pic || !c->last_pal_pic) {
+    if (!c->pal_pic || !c->last_pal_pic || !ctx->last_pic) {
         mss2_decode_end(avctx);
         return AVERROR(ENOMEM);
     }
@@ -836,11 +836,6 @@ static av_cold int mss2_decode_init(AVCodecContext *avctx)
     avctx->pix_fmt = c->free_colours == 127 ? AV_PIX_FMT_RGB555
                                             : AV_PIX_FMT_RGB24;
 
-    ctx->last_pic = av_frame_alloc();
-    if (!ctx->last_pic) {
-        mss2_decode_end(avctx);
-        return AVERROR(ENOMEM);
-    }
 
     return 0;
 }
diff --git a/libavcodec/mss2dsp.c b/libavcodec/mss2dsp.c
index aa13577d48..c5fc1f8dcf 100644
--- a/libavcodec/mss2dsp.c
+++ b/libavcodec/mss2dsp.c
@@ -1,20 +1,20 @@
 /*
  * Microsoft Screen 2 (aka Windows Media Video V9 Screen) decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -106,6 +106,9 @@ static void upsample_plane_c(uint8_t *plane, int plane_stride, int w, int h)
     uint8_t *src1, *src2, *dst1, *dst2, *p, a, b;
     int i, j;
 
+    if(!w || !h)
+        return;
+
     w += (w & 1);
     h += (h & 1);
 
diff --git a/libavcodec/mss2dsp.h b/libavcodec/mss2dsp.h
index 61c3a04fcd..7368abb581 100644
--- a/libavcodec/mss2dsp.h
+++ b/libavcodec/mss2dsp.h
@@ -1,20 +1,20 @@
 /*
  * Microsoft Screen 2 (aka Windows Media Video V9 Screen) decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mss3.c b/libavcodec/mss3.c
index c124834036..01941967a5 100644
--- a/libavcodec/mss3.c
+++ b/libavcodec/mss3.c
@@ -2,20 +2,20 @@
  * Microsoft Screen 3 (aka Microsoft ATC Screen) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -296,7 +296,7 @@ static void rac_normalise(RangeCoder *c)
             c->low |= *c->src++;
         } else if (!c->low) {
             c->got_error = 1;
-            return;
+            c->low = 1;
         }
         if (c->range >= RAC_BOTTOM)
             return;
@@ -731,10 +731,8 @@ static int mss3_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return buf_size;
     c->got_error = 0;
 
-    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
         return ret;
-    }
     c->pic->key_frame = keyframe;
     c->pic->pict_type = keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
     if (!bytestream2_get_bytes_left(&gb)) {
@@ -840,6 +838,7 @@ static av_cold int mss3_decode_init(AVCodecContext *avctx)
                                             b_width * b_height);
         if (!c->dct_coder[i].prev_dc) {
             av_log(avctx, AV_LOG_ERROR, "Cannot allocate buffer\n");
+            av_frame_free(&c->pic);
             while (i >= 0) {
                 av_freep(&c->dct_coder[i].prev_dc);
                 i--;
diff --git a/libavcodec/mss34dsp.c b/libavcodec/mss34dsp.c
index 11abb2de11..0397add17d 100644
--- a/libavcodec/mss34dsp.c
+++ b/libavcodec/mss34dsp.c
@@ -2,20 +2,20 @@
  * Common stuff for some Microsoft Screen codecs
  * Copyright (C) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -84,8 +84,8 @@ void ff_mss34_gen_quant_mat(uint16_t *qmat, int quality, int luma)
     blk[6 * step] = (-(t3 + t7) + t8 + tA) >> shift;                \
     blk[7 * step] = (-(t1 + t6) + t9 + tB) >> shift;                \
 
-#define SOP_ROW(a) ((a) << 16) + 0x2000
-#define SOP_COL(a) ((a + 32) << 16)
+#define SOP_ROW(a) (((a) << 16) + 0x2000)
+#define SOP_COL(a) (((a) + 32) << 16)
 
 void ff_mss34_dct_put(uint8_t *dst, int stride, int *block)
 {
diff --git a/libavcodec/mss34dsp.h b/libavcodec/mss34dsp.h
index b2cc5501ec..2f9827d3e5 100644
--- a/libavcodec/mss34dsp.h
+++ b/libavcodec/mss34dsp.h
@@ -2,20 +2,20 @@
  * Common stuff for some Microsoft Screen codecs
  * Copyright (C) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mss4.c b/libavcodec/mss4.c
index a953a5759d..9639fc820e 100644
--- a/libavcodec/mss4.c
+++ b/libavcodec/mss4.c
@@ -2,20 +2,20 @@
  * Microsoft Screen 4 (aka Microsoft Expression Encoder Screen) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -125,7 +125,7 @@ static const uint8_t mss4_vec_entry_vlc_syms[2][9] = {
 #define MAX_ENTRIES  162
 
 typedef struct MSS4Context {
-    AVFrame   *pic;
+    AVFrame    *pic;
 
     VLC        dc_vlc[2], ac_vlc[2];
     VLC        vec_entry_vlc[2];
@@ -363,7 +363,7 @@ static int get_value_cached(GetBitContext *gb, int vec_pos, uint8_t *vec,
     return prev[component];
 }
 
-#define MKVAL(vals)  (vals[0] | (vals[1] << 3) | (vals[2] << 6))
+#define MKVAL(vals)  ((vals)[0] | ((vals)[1] << 3) | ((vals)[2] << 6))
 
 /* Image mode - the hardest to comprehend MSS4 coding mode.
  *
@@ -553,10 +553,8 @@ static int mss4_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
         return ret;
-    }
     c->pic->key_frame = (frame_type == INTRA_FRAME);
     c->pic->pict_type = (frame_type == INTRA_FRAME) ? AV_PICTURE_TYPE_I
                                                    : AV_PICTURE_TYPE_P;
@@ -574,7 +572,8 @@ static int mss4_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             ff_mss34_gen_quant_mat(c->quant_mat[i], quality, !i);
     }
 
-    init_get_bits(&gb, buf + HEADER_SIZE, (buf_size - HEADER_SIZE) * 8);
+    if ((ret = init_get_bits8(&gb, buf + HEADER_SIZE, buf_size - HEADER_SIZE)) < 0)
+        return ret;
 
     mb_width  = FFALIGN(width,  16) >> 4;
     mb_height = FFALIGN(height, 16) >> 4;
@@ -652,7 +651,7 @@ static av_cold int mss4_decode_init(AVCodecContext *avctx)
     }
     for (i = 0; i < 3; i++) {
         c->dc_stride[i] = FFALIGN(avctx->width, 16) >> (2 + !!i);
-        c->prev_dc[i]   = av_malloc(sizeof(**c->prev_dc) * c->dc_stride[i]);
+        c->prev_dc[i]   = av_malloc_array(c->dc_stride[i], sizeof(**c->prev_dc));
         if (!c->prev_dc[i]) {
             av_log(avctx, AV_LOG_ERROR, "Cannot allocate buffer\n");
             mss4_free_vlcs(c);
diff --git a/libavcodec/msvideo1.c b/libavcodec/msvideo1.c
index 62f52b153d..891675fcb0 100644
--- a/libavcodec/msvideo1.c
+++ b/libavcodec/msvideo1.c
@@ -1,21 +1,21 @@
 /*
  * Microsoft Video-1 Decoder
- * Copyright (C) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -67,6 +67,8 @@ static av_cold int msvideo1_decode_init(AVCodecContext *avctx)
     if (s->avctx->bits_per_coded_sample == 8) {
         s->mode_8bit = 1;
         avctx->pix_fmt = AV_PIX_FMT_PAL8;
+        if (avctx->extradata_size >= AVPALETTE_SIZE)
+            memcpy(s->pal, avctx->extradata, AVPALETTE_SIZE);
     } else {
         s->mode_8bit = 0;
         avctx->pix_fmt = AV_PIX_FMT_RGB555;
@@ -300,10 +302,8 @@ static int msvideo1_decode_frame(AVCodecContext *avctx,
     s->buf = buf;
     s->size = buf_size;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     if (s->mode_8bit) {
         const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
diff --git a/libavcodec/msvideo1enc.c b/libavcodec/msvideo1enc.c
new file mode 100644
index 0000000000..b6ae92b2ac
--- /dev/null
+++ b/libavcodec/msvideo1enc.c
@@ -0,0 +1,305 @@
+/*
+ * Microsoft Video-1 Encoder
+ * Copyright (c) 2009 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Microsoft Video-1 encoder
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "bytestream.h"
+#include "libavutil/lfg.h"
+#include "elbg.h"
+#include "libavutil/imgutils.h"
+/**
+ * Encoder context
+ */
+typedef struct Msvideo1EncContext {
+    AVCodecContext *avctx;
+    AVLFG rnd;
+    uint8_t *prev;
+
+    int block[16*3];
+    int block2[16*3];
+    int codebook[8*3];
+    int codebook2[8*3];
+    int output[16*3];
+    int output2[16*3];
+    int avg[3];
+    int bestpos;
+    int keyint;
+} Msvideo1EncContext;
+
+enum MSV1Mode{
+    MODE_SKIP = 0,
+    MODE_FILL,
+    MODE_2COL,
+    MODE_8COL,
+};
+
+#define SKIP_PREFIX 0x8400
+#define SKIPS_MAX 0x03FF
+#define MKRGB555(in, off) (((in)[off] << 10) | ((in)[(off) + 1] << 5) | ((in)[(off) + 2]))
+
+static const int remap[16] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 };
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                               const AVFrame *pict, int *got_packet)
+{
+    Msvideo1EncContext * const c = avctx->priv_data;
+    const AVFrame *p = pict;
+    uint16_t *src;
+    uint8_t *prevptr;
+    uint8_t *dst, *buf;
+    int keyframe = 0;
+    int no_skips = 1;
+    int i, j, k, x, y, ret;
+    int skips = 0;
+    int quality = 24;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width*avctx->height*9 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+    dst= buf= pkt->data;
+
+    if(!c->prev)
+        c->prev = av_malloc(avctx->width * 3 * (avctx->height + 3));
+    prevptr = c->prev + avctx->width * 3 * (FFALIGN(avctx->height, 4) - 1);
+    src = (uint16_t*)(p->data[0] + p->linesize[0]*(FFALIGN(avctx->height, 4) - 1));
+    if(c->keyint >= avctx->keyint_min)
+        keyframe = 1;
+
+
+    for(y = 0; y < avctx->height; y += 4){
+        for(x = 0; x < avctx->width; x += 4){
+            int bestmode = MODE_SKIP;
+            int bestscore = INT_MAX;
+            int flags = 0;
+            int score;
+
+            for(j = 0; j < 4; j++){
+                for(i = 0; i < 4; i++){
+                    uint16_t val = src[x + i - j*p->linesize[0]/2];
+                    for(k = 0; k < 3; k++){
+                        c->block[(i + j*4)*3 + k] =
+                        c->block2[remap[i + j*4]*3 + k] = (val >> (10-k*5)) & 0x1F;
+                    }
+                }
+            }
+            if(!keyframe){
+                bestscore = 0;
+                for(j = 0; j < 4; j++){
+                    for(i = 0; i < 4*3; i++){
+                        int t = prevptr[x*3 + i - j*3*avctx->width] - c->block[i + j*4*3];
+                        bestscore += t*t;
+                    }
+                }
+                bestscore /= quality;
+            }
+            // try to find optimal value to fill whole 4x4 block
+            score = 0;
+            avpriv_init_elbg(c->block, 3, 16, c->avg, 1, 1, c->output, &c->rnd);
+            avpriv_do_elbg  (c->block, 3, 16, c->avg, 1, 1, c->output, &c->rnd);
+            if(c->avg[0] == 1) // red component = 1 will be written as skip code
+                c->avg[0] = 0;
+            for(j = 0; j < 4; j++){
+                for(i = 0; i < 4; i++){
+                    for(k = 0; k < 3; k++){
+                        int t = c->avg[k] - c->block[(i+j*4)*3+k];
+                        score += t*t;
+                    }
+                }
+            }
+            score /= quality;
+            score += 2;
+            if(score < bestscore){
+                bestscore = score;
+                bestmode = MODE_FILL;
+            }
+            // search for optimal filling of 2-color block
+            score = 0;
+            avpriv_init_elbg(c->block, 3, 16, c->codebook, 2, 1, c->output, &c->rnd);
+            avpriv_do_elbg  (c->block, 3, 16, c->codebook, 2, 1, c->output, &c->rnd);
+            // last output value should be always 1, swap codebooks if needed
+            if(!c->output[15]){
+                for(i = 0; i < 3; i++)
+                    FFSWAP(uint8_t, c->codebook[i], c->codebook[i+3]);
+                for(i = 0; i < 16; i++)
+                    c->output[i] ^= 1;
+            }
+            for(j = 0; j < 4; j++){
+                for(i = 0; i < 4; i++){
+                    for(k = 0; k < 3; k++){
+                        int t = c->codebook[c->output[i+j*4]*3 + k] - c->block[i*3+k+j*4*3];
+                        score += t*t;
+                    }
+                }
+            }
+            score /= quality;
+            score += 6;
+            if(score < bestscore){
+                bestscore = score;
+                bestmode = MODE_2COL;
+            }
+            // search for optimal filling of 2-color 2x2 subblocks
+            score = 0;
+            for(i = 0; i < 4; i++){
+                avpriv_init_elbg(c->block2 + i*4*3, 3, 4, c->codebook2 + i*2*3, 2, 1, c->output2 + i*4, &c->rnd);
+                avpriv_do_elbg  (c->block2 + i*4*3, 3, 4, c->codebook2 + i*2*3, 2, 1, c->output2 + i*4, &c->rnd);
+            }
+            // last value should be always 1, swap codebooks if needed
+            if(!c->output2[15]){
+                for(i = 0; i < 3; i++)
+                    FFSWAP(uint8_t, c->codebook2[i+18], c->codebook2[i+21]);
+                for(i = 12; i < 16; i++)
+                    c->output2[i] ^= 1;
+            }
+            for(j = 0; j < 4; j++){
+                for(i = 0; i < 4; i++){
+                    for(k = 0; k < 3; k++){
+                        int t = c->codebook2[(c->output2[remap[i+j*4]] + (i&2) + (j&2)*2)*3+k] - c->block[i*3+k + j*4*3];
+                        score += t*t;
+                    }
+                }
+            }
+            score /= quality;
+            score += 18;
+            if(score < bestscore){
+                bestscore = score;
+                bestmode = MODE_8COL;
+            }
+
+            if(bestmode == MODE_SKIP){
+                skips++;
+                no_skips = 0;
+            }
+            if((bestmode != MODE_SKIP && skips) || skips == SKIPS_MAX){
+                bytestream_put_le16(&dst, skips | SKIP_PREFIX);
+                skips = 0;
+            }
+
+            switch(bestmode){
+            case MODE_FILL:
+                bytestream_put_le16(&dst, MKRGB555(c->avg,0) | 0x8000);
+                for(j = 0; j < 4; j++)
+                    for(i = 0; i < 4; i++)
+                        for(k = 0; k < 3; k++)
+                            prevptr[x*3 + i*3 + k - j*3*avctx->width] = c->avg[k];
+                break;
+            case MODE_2COL:
+                for(j = 0; j < 4; j++){
+                    for(i = 0; i < 4; i++){
+                        flags |= (c->output[i + j*4]^1) << (i + j*4);
+                        for(k = 0; k < 3; k++)
+                            prevptr[x*3 + i*3 + k - j*3*avctx->width] = c->codebook[c->output[i + j*4]*3 + k];
+                    }
+                }
+                bytestream_put_le16(&dst, flags);
+                bytestream_put_le16(&dst, MKRGB555(c->codebook, 0));
+                bytestream_put_le16(&dst, MKRGB555(c->codebook, 3));
+                break;
+            case MODE_8COL:
+                for(j = 0; j < 4; j++){
+                    for(i = 0; i < 4; i++){
+                        flags |= (c->output2[remap[i + j*4]]^1) << (i + j*4);
+                        for(k = 0; k < 3; k++)
+                            prevptr[x*3 + i*3 + k - j*3*avctx->width] = c->codebook2[(c->output2[remap[i+j*4]] + (i&2) + (j&2)*2)*3 + k];
+                    }
+                }
+                bytestream_put_le16(&dst, flags);
+                bytestream_put_le16(&dst, MKRGB555(c->codebook2, 0) | 0x8000);
+                for(i = 3; i < 24; i += 3)
+                    bytestream_put_le16(&dst, MKRGB555(c->codebook2, i));
+                break;
+            }
+        }
+        src     -= p->linesize[0] << 1;
+        prevptr -= avctx->width * 3 * 4;
+    }
+    if(skips)
+        bytestream_put_le16(&dst, skips | SKIP_PREFIX);
+    //EOF
+    bytestream_put_byte(&dst, 0);
+    bytestream_put_byte(&dst, 0);
+
+    if(no_skips)
+        keyframe = 1;
+    if(keyframe)
+        c->keyint = 0;
+    else
+        c->keyint++;
+    if (keyframe) pkt->flags |= AV_PKT_FLAG_KEY;
+    pkt->size = dst - buf;
+    *got_packet = 1;
+
+    return 0;
+}
+
+
+/**
+ * init encoder
+ */
+static av_cold int encode_init(AVCodecContext *avctx)
+{
+    Msvideo1EncContext * const c = avctx->priv_data;
+
+    c->avctx = avctx;
+    if (av_image_check_size(avctx->width, avctx->height, 0, avctx) < 0) {
+        return -1;
+    }
+    if((avctx->width&3) || (avctx->height&3)){
+        av_log(avctx, AV_LOG_ERROR, "width and height must be multiples of 4\n");
+        return -1;
+    }
+
+    avctx->bits_per_coded_sample = 16;
+
+    c->keyint = avctx->keyint_min;
+    av_lfg_init(&c->rnd, 1);
+
+    return 0;
+}
+
+
+
+/**
+ * Uninit encoder
+ */
+static av_cold int encode_end(AVCodecContext *avctx)
+{
+    Msvideo1EncContext * const c = avctx->priv_data;
+
+    av_freep(&c->prev);
+
+    return 0;
+}
+
+AVCodec ff_msvideo1_encoder = {
+    .name           = "msvideo1",
+    .long_name = NULL_IF_CONFIG_SMALL("Microsoft Video-1"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MSVIDEO1,
+    .priv_data_size = sizeof(Msvideo1EncContext),
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .close          = encode_end,
+    .pix_fmts = (const enum AVPixelFormat[]){AV_PIX_FMT_RGB555, AV_PIX_FMT_NONE},
+};
diff --git a/libavcodec/mvcdec.c b/libavcodec/mvcdec.c
index 1546bcce18..74f279a666 100644
--- a/libavcodec/mvcdec.c
+++ b/libavcodec/mvcdec.c
@@ -2,20 +2,20 @@
  * Silicon Graphics Motion Video Compressor 1 & 2 decoder
  * Copyright (c) 2012 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mxpegdec.c b/libavcodec/mxpegdec.c
index a8ef6d0e0d..2e3ebe6e70 100644
--- a/libavcodec/mxpegdec.c
+++ b/libavcodec/mxpegdec.c
@@ -2,20 +2,20 @@
  * MxPEG decoder
  * Copyright (c) 2011 Anatoly Nenashev
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,6 +54,7 @@ static av_cold int mxpeg_decode_end(AVCodecContext *avctx)
     for (i = 0; i < 2; ++i)
         av_frame_free(&s->picture[i]);
 
+    s->bitmask_size = 0;
     av_freep(&s->mxm_bitmask);
     av_freep(&s->completion_bitmask);
 
@@ -105,6 +106,7 @@ static int mxpeg_decode_mxm(MXpegDecodeContext *s,
     }
 
     if (s->bitmask_size != bitmask_size) {
+        s->bitmask_size = 0;
         av_freep(&s->mxm_bitmask);
         s->mxm_bitmask = av_malloc(bitmask_size);
         if (!s->mxm_bitmask) {
@@ -272,11 +274,9 @@ static int mxpeg_decode_frame(AVCodecContext *avctx,
                     }
                     /* use stored SOF data to allocate current picture */
                     av_frame_unref(jpg->picture_ptr);
-                    if (ff_get_buffer(avctx, jpg->picture_ptr,
-                                      AV_GET_BUFFER_FLAG_REF) < 0) {
-                        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-                        return AVERROR(ENOMEM);
-                    }
+                    if ((ret = ff_get_buffer(avctx, jpg->picture_ptr,
+                                             AV_GET_BUFFER_FLAG_REF)) < 0)
+                        return ret;
                     jpg->picture_ptr->pict_type = AV_PICTURE_TYPE_P;
                     jpg->picture_ptr->key_frame = 0;
                     jpg->got_picture = 1;
@@ -292,17 +292,15 @@ static int mxpeg_decode_frame(AVCodecContext *avctx,
 
                     /* allocate dummy reference picture if needed */
                     if (!reference_ptr->data[0] &&
-                        ff_get_buffer(avctx, reference_ptr,
-                                      AV_GET_BUFFER_FLAG_REF) < 0) {
-                        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-                        return AVERROR(ENOMEM);
-                    }
+                        (ret = ff_get_buffer(avctx, reference_ptr,
+                                             AV_GET_BUFFER_FLAG_REF)) < 0)
+                        return ret;
 
-                    ret = ff_mjpeg_decode_sos(jpg, s->mxm_bitmask, reference_ptr);
+                    ret = ff_mjpeg_decode_sos(jpg, s->mxm_bitmask, s->bitmask_size, reference_ptr);
                     if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
                         return ret;
                 } else {
-                    ret = ff_mjpeg_decode_sos(jpg, NULL, NULL);
+                    ret = ff_mjpeg_decode_sos(jpg, NULL, 0, NULL);
                     if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
                         return ret;
                 }
@@ -346,5 +344,6 @@ AVCodec ff_mxpeg_decoder = {
     .close          = mxpeg_decode_end,
     .decode         = mxpeg_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
diff --git a/libavcodec/nellymoserdec.c b/libavcodec/nellymoserdec.c
index 5033282061..5f18593c9c 100644
--- a/libavcodec/nellymoserdec.c
+++ b/libavcodec/nellymoserdec.c
@@ -50,7 +50,7 @@ typedef struct NellyMoserDecodeContext {
     AVLFG           random_state;
     GetBitContext   gb;
     float           scale_bias;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext      imdct_ctx;
     DECLARE_ALIGNED(32, float, imdct_buf)[2][NELLY_BUF_LEN];
     float          *imdct_out;
@@ -105,7 +105,7 @@ static void nelly_decode_block(NellyMoserDecodeContext *s,
                (NELLY_BUF_LEN - NELLY_FILL_LEN) * sizeof(float));
 
         s->imdct_ctx.imdct_half(&s->imdct_ctx, s->imdct_out, aptr);
-        s->fdsp.vector_fmul_window(aptr, s->imdct_prev + NELLY_BUF_LEN / 2,
+        s->fdsp->vector_fmul_window(aptr, s->imdct_prev + NELLY_BUF_LEN / 2,
                                    s->imdct_out, ff_sine_128,
                                    NELLY_BUF_LEN / 2);
         FFSWAP(float *, s->imdct_out, s->imdct_prev);
@@ -121,7 +121,9 @@ static av_cold int decode_init(AVCodecContext * avctx) {
     av_lfg_init(&s->random_state, 0);
     ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
 
     s->scale_bias = 1.0/(32768*8);
     avctx->sample_fmt = AV_SAMPLE_FMT_FLT;
@@ -141,16 +143,19 @@ static int decode_tag(AVCodecContext *avctx, void *data,
 {
     AVFrame *frame     = data;
     const uint8_t *buf = avpkt->data;
+    const uint8_t *side=av_packet_get_side_data(avpkt, 'F', NULL);
     int buf_size = avpkt->size;
     NellyMoserDecodeContext *s = avctx->priv_data;
     int blocks, i, ret;
     float   *samples_flt;
 
     blocks     = buf_size / NELLY_BLOCK_LEN;
+
     if (blocks <= 0) {
         av_log(avctx, AV_LOG_ERROR, "Packet is too small\n");
         return AVERROR_INVALIDDATA;
     }
+
     if (buf_size % NELLY_BLOCK_LEN) {
         av_log(avctx, AV_LOG_WARNING, "Leftover bytes: %d.\n",
                buf_size % NELLY_BLOCK_LEN);
@@ -162,13 +167,13 @@ static int decode_tag(AVCodecContext *avctx, void *data,
      * 22050 Hz - 4
      * 44100 Hz - 8
      */
+    if(side && blocks>1 && avctx->sample_rate%11025==0 && (1<<((side[0]>>2)&3)) == blocks)
+        avctx->sample_rate= 11025*(blocks/2);
 
     /* get output buffer */
     frame->nb_samples = NELLY_SAMPLES * blocks;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples_flt = (float *)frame->data[0];
 
     for (i=0 ; i<blocks ; i++) {
@@ -186,6 +191,7 @@ static av_cold int decode_end(AVCodecContext * avctx) {
     NellyMoserDecodeContext *s = avctx->priv_data;
 
     ff_mdct_end(&s->imdct_ctx);
+    av_freep(&s->fdsp);
 
     return 0;
 }
diff --git a/libavcodec/nellymoserenc.c b/libavcodec/nellymoserenc.c
index 9d120816e0..d7368d07e0 100644
--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@@ -4,20 +4,20 @@
  *
  * Copyright (c) 2008 Bartlomiej Wolowiec
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
  *
  * Generic codec information: libavcodec/nellymoserdec.c
  *
- * Some information also from: http://samples.libav.org/A-codecs/Nelly_Moser/ASAO/ASAO.zip
+ * Some information also from: http://samples.mplayerhq.hu/A-codecs/Nelly_Moser/ASAO/ASAO.zip
  *                             (Copyright Joseph Artsimovich and UAB "DKD")
  *
  * for more information about nellymoser format, visit:
@@ -56,7 +56,7 @@
 typedef struct NellyMoserEncodeContext {
     AVCodecContext  *avctx;
     int             last_frame;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext      mdct_ctx;
     AudioFrameQueue afq;
     DECLARE_ALIGNED(32, float, mdct_out)[NELLY_SAMPLES];
@@ -66,7 +66,7 @@ typedef struct NellyMoserEncodeContext {
     uint8_t         (*path)[OPT_SIZE];
 } NellyMoserEncodeContext;
 
-static float pow_table[POW_TABLE_SIZE];     ///< -pow(2, -i / 2048.0 - 3.0);
+static float pow_table[POW_TABLE_SIZE];     ///< pow(2, -i / 2048.0 - 3.0);
 
 static const uint8_t sf_lut[96] = {
      0,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  4,  4,
@@ -122,12 +122,12 @@ static void apply_mdct(NellyMoserEncodeContext *s)
     float *in1 = s->buf + NELLY_BUF_LEN;
     float *in2 = s->buf + 2 * NELLY_BUF_LEN;
 
-    s->fdsp.vector_fmul        (s->in_buff,                 in0, ff_sine_128, NELLY_BUF_LEN);
-    s->fdsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in1, ff_sine_128, NELLY_BUF_LEN);
+    s->fdsp->vector_fmul        (s->in_buff,                 in0, ff_sine_128, NELLY_BUF_LEN);
+    s->fdsp->vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in1, ff_sine_128, NELLY_BUF_LEN);
     s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out, s->in_buff);
 
-    s->fdsp.vector_fmul        (s->in_buff,                 in1, ff_sine_128, NELLY_BUF_LEN);
-    s->fdsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in2, ff_sine_128, NELLY_BUF_LEN);
+    s->fdsp->vector_fmul        (s->in_buff,                 in1, ff_sine_128, NELLY_BUF_LEN);
+    s->fdsp->vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in2, ff_sine_128, NELLY_BUF_LEN);
     s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->in_buff);
 }
 
@@ -138,10 +138,11 @@ static av_cold int encode_end(AVCodecContext *avctx)
     ff_mdct_end(&s->mdct_ctx);
 
     if (s->avctx->trellis) {
-        av_free(s->opt);
-        av_free(s->path);
+        av_freep(&s->opt);
+        av_freep(&s->path);
     }
     ff_af_queue_close(&s->afq);
+    av_freep(&s->fdsp);
 
     return 0;
 }
@@ -170,12 +171,16 @@ static av_cold int encode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     if ((ret = ff_mdct_init(&s->mdct_ctx, 8, 0, 32768.0)) < 0)
         goto error;
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp) {
+        ret = AVERROR(ENOMEM);
+        goto error;
+    }
 
     /* Generate overlap window */
-    ff_sine_window_init(ff_sine_128, 128);
+    ff_init_ff_sine_windows(7);
     for (i = 0; i < POW_TABLE_SIZE; i++)
-        pow_table[i] = -pow(2, -i / 2048.0 - 3.0 + POW_TABLE_OFFSET);
+        pow_table[i] = pow(2, -i / 2048.0 - 3.0 + POW_TABLE_OFFSET);
 
     if (s->avctx->trellis) {
         s->opt  = av_malloc(NELLY_BANDS * OPT_SIZE * sizeof(float  ));
@@ -231,7 +236,7 @@ static void get_exponent_dynamic(NellyMoserEncodeContext *s, float *cand, int *i
     float  (*opt )[OPT_SIZE] = s->opt ;
     uint8_t(*path)[OPT_SIZE] = s->path;
 
-    for (i = 0; i < OPT_SIZE; i++) {
+    for (i = 0; i < NELLY_BANDS * OPT_SIZE; i++) {
         opt[0][i] = INFINITY;
     }
 
@@ -266,7 +271,7 @@ static void get_exponent_dynamic(NellyMoserEncodeContext *s, float *cand, int *i
                 }
             }
         }
-        assert(c); //FIXME
+        av_assert1(c); //FIXME
     }
 
     best_val = INFINITY;
@@ -303,7 +308,7 @@ static void encode_block(NellyMoserEncodeContext *s, unsigned char *output, int
 
     apply_mdct(s);
 
-    init_put_bits(&pb, output, output_size * 8);
+    init_put_bits(&pb, output, output_size);
 
     i = 0;
     for (band = 0; band < NELLY_BANDS; band++) {
@@ -392,10 +397,8 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         s->last_frame = 1;
     }
 
-    if ((ret = ff_alloc_packet(avpkt, NELLY_BLOCK_LEN))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, NELLY_BLOCK_LEN, 0)) < 0)
         return ret;
-    }
     encode_block(s, avpkt->data, avpkt->size);
 
     /* Get the next frame pts/duration */
diff --git a/libavcodec/neon/mpegvideo.c b/libavcodec/neon/mpegvideo.c
index fe952ae256..a96ae519f5 100644
--- a/libavcodec/neon/mpegvideo.c
+++ b/libavcodec/neon/mpegvideo.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2010 Mans Rullgard
  * Copyright (c) 2014 James Yu <james.yu@linaro.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/noise_bsf.c b/libavcodec/noise_bsf.c
index 81840bd07b..556ad5c549 100644
--- a/libavcodec/noise_bsf.c
+++ b/libavcodec/noise_bsf.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,9 +32,13 @@ static int noise(AVBitStreamFilterContext *bsfc, AVCodecContext *avctx, const ch
     int amount= args ? atoi(args) : (*state % 10001+1);
     int i;
 
+    if(amount <= 0)
+        return AVERROR(EINVAL);
+
     *poutbuf= av_malloc(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!*poutbuf)
         return AVERROR(ENOMEM);
+
     memcpy(*poutbuf, buf, buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
     for(i=0; i<buf_size; i++){
         (*state) += (*poutbuf)[i] + 1;
@@ -45,7 +49,7 @@ static int noise(AVBitStreamFilterContext *bsfc, AVCodecContext *avctx, const ch
 }
 
 AVBitStreamFilter ff_noise_bsf={
-    "noise",
-    sizeof(int),
-    noise,
+    .name           = "noise",
+    .priv_data_size = sizeof(int),
+    .filter         = noise,
 };
diff --git a/libavcodec/nuv.c b/libavcodec/nuv.c
index c56003c3ad..f3270cb19d 100644
--- a/libavcodec/nuv.c
+++ b/libavcodec/nuv.c
@@ -2,25 +2,26 @@
  * NuppelVideo decoder
  * Copyright (c) 2006 Reimar Doeffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <limits.h>
 
 #include "libavutil/bswap.h"
 #include "libavutil/common.h"
@@ -121,23 +122,26 @@ static int codec_reinit(AVCodecContext *avctx, int width, int height,
     if (quality >= 0)
         get_quant_quality(c, quality);
     if (width != c->width || height != c->height) {
-        void *ptr;
+        // also reserve space for a possible additional header
+        int buf_size = height * width * 3 / 2
+                     + FFMAX(AV_LZO_OUTPUT_PADDING, AV_INPUT_BUFFER_PADDING_SIZE)
+                     + RTJPEG_HEADER_SIZE;
+        if (buf_size > INT_MAX/8)
+            return -1;
         if ((ret = av_image_check_size(height, width, 0, avctx)) < 0)
             return ret;
         avctx->width  = c->width  = width;
         avctx->height = c->height = height;
-        ptr = av_fast_realloc(c->decomp_buf, &c->decomp_size,
-                              c->height * c->width * 3 / 2 +
-                              AV_INPUT_BUFFER_PADDING_SIZE +
-                              RTJPEG_HEADER_SIZE);
-        if (!ptr) {
+        av_fast_malloc(&c->decomp_buf, &c->decomp_size,
+                       buf_size);
+        if (!c->decomp_buf) {
             av_log(avctx, AV_LOG_ERROR,
                    "Can't allocate decompression buffer.\n");
             return AVERROR(ENOMEM);
-        } else
-            c->decomp_buf = ptr;
+        }
         ff_rtjpeg_decode_init(&c->rtj, c->width, c->height, c->lq, c->cq);
         av_frame_unref(c->pic);
+        return 1;
     } else if (quality != c->quality)
         ff_rtjpeg_decode_init(&c->rtj, c->width, c->height, c->lq, c->cq);
 
@@ -153,6 +157,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     AVFrame *picture   = data;
     int orig_size      = buf_size;
     int keyframe, ret;
+    int size_change = 0;
     int result, init_frame = !avctx->frame_number;
     enum {
         NUV_UNCOMPRESSED  = '0',
@@ -181,7 +186,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return orig_size;
     }
 
-    if (buf[0] != 'V' || buf_size < 12) {
+    if (buf_size < 12 || buf[0] != 'V') {
         av_log(avctx, AV_LOG_ERROR, "not a nuv video frame\n");
         return AVERROR_INVALIDDATA;
     }
@@ -198,24 +203,32 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         keyframe = 1;
         break;
     }
+retry:
     // skip rest of the frameheader.
     buf       = &buf[12];
     buf_size -= 12;
     if (comptype == NUV_RTJPEG_IN_LZO || comptype == NUV_LZO) {
-        int outlen = c->decomp_size - AV_INPUT_BUFFER_PADDING_SIZE;
+        int outlen = c->decomp_size - FFMAX(AV_INPUT_BUFFER_PADDING_SIZE, AV_LZO_OUTPUT_PADDING);
         int inlen  = buf_size;
         if (av_lzo1x_decode(c->decomp_buf, &outlen, buf, &inlen)) {
             av_log(avctx, AV_LOG_ERROR, "error during lzo decompression\n");
             return AVERROR_INVALIDDATA;
         }
         buf      = c->decomp_buf;
-        buf_size = outlen;
+        buf_size = c->decomp_size - FFMAX(AV_INPUT_BUFFER_PADDING_SIZE, AV_LZO_OUTPUT_PADDING) - outlen;
+        memset(c->decomp_buf + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
     }
     if (c->codec_frameheader) {
         int w, h, q;
-        if (buf_size < RTJPEG_HEADER_SIZE || buf[4] != RTJPEG_HEADER_SIZE ||
-            buf[5] != RTJPEG_FILE_VERSION) {
-            av_log(avctx, AV_LOG_ERROR, "invalid nuv video frame\n");
+        if (buf_size < RTJPEG_HEADER_SIZE) {
+            av_log(avctx, AV_LOG_ERROR, "Too small NUV video frame\n");
+            return AVERROR_INVALIDDATA;
+        }
+        // There seem to exist two variants of this header: one starts with 'V'
+        // and 5 bytes unknown, the other matches current MythTV and is 4 bytes size,
+        // 1 byte header size (== 12), 1 byte version (== 0)
+        if (buf[0] != 'V' && AV_RL16(&buf[4]) != 0x000c) {
+            av_log(avctx, AV_LOG_ERROR, "Unknown secondary frame header (wrong codec_tag?)\n");
             return AVERROR_INVALIDDATA;
         }
         w = AV_RL16(&buf[6]);
@@ -223,22 +236,23 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         q = buf[10];
         if ((result = codec_reinit(avctx, w, h, q)) < 0)
             return result;
-        if (comptype == NUV_RTJPEG_IN_LZO || comptype == NUV_LZO)
-            buf = c->decomp_buf;
+        if (result) {
+            buf = avpkt->data;
+            buf_size = avpkt->size;
+            size_change = 1;
+            goto retry;
+        }
         buf       = &buf[RTJPEG_HEADER_SIZE];
         buf_size -= RTJPEG_HEADER_SIZE;
     }
 
-    if (keyframe) {
+    if (size_change || keyframe) {
         av_frame_unref(c->pic);
         init_frame = 1;
     }
 
-    result = ff_reget_buffer(avctx, c->pic);
-    if (result < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((result = ff_reget_buffer(avctx, c->pic)) < 0)
         return result;
-    }
     if (init_frame) {
         memset(c->pic->data[0], 0,    avctx->height * c->pic->linesize[0]);
         memset(c->pic->data[1], 0x80, avctx->height * c->pic->linesize[1] / 2);
@@ -256,7 +270,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             av_log(avctx, AV_LOG_ERROR, "uncompressed frame too short\n");
             height = buf_size / c->width / 3 * 2;
         }
-        copy_frame(c->pic, buf, c->width, height);
+        if(height > 0)
+            copy_frame(c->pic, buf, c->width, height);
         break;
     }
     case NUV_RTJPEG_IN_LZO:
diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
index 3f19ed0f3a..18bcd96f09 100644
--- a/libavcodec/nvenc.c
+++ b/libavcodec/nvenc.c
@@ -1,1049 +1,1194 @@
 /*
- * NVIDIA NVENC Support
- * Copyright (C) 2015 Luca Barbato
+ * H.264 hardware encoding using nvidia nvenc
+ * Copyright (c) 2014 Timo Rothenpieler <timo@rothenpieler.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "config.h"
-
-#include <cuda.h>
-#include <nvEncodeAPI.h>
-#include <string.h>
-
-#define CUDA_LIBNAME "libcuda.so"
-
-#if HAVE_DLFCN_H
-#include <dlfcn.h>
-
-#define NVENC_LIBNAME "libnvidia-encode.so"
-
-#elif HAVE_WINDOWS_H
+#if defined(_WIN32)
 #include <windows.h>
-
-#if ARCH_X86_64
-#define NVENC_LIBNAME "nvEncodeAPI64.dll"
 #else
-#define NVENC_LIBNAME "nvEncodeAPI.dll"
+#include <dlfcn.h>
 #endif
 
-#define dlopen(filename, flags) LoadLibrary((filename))
-#define dlsym(handle, symbol)   GetProcAddress(handle, symbol)
-#define dlclose(handle)         FreeLibrary(handle)
-#endif
+#include <nvEncodeAPI.h>
 
-#include "libavutil/common.h"
+#include "libavutil/internal.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
+#include "libavutil/opt.h"
 #include "libavutil/mem.h"
 #include "avcodec.h"
 #include "internal.h"
-#include "nvenc.h"
-
-#define NVENC_CAP 0x30
-#define BITSTREAM_BUFFER_SIZE 1024 * 1024
-
-#define LOAD_LIBRARY(l, path)                   \
-    do {                                        \
-        if (!((l) = dlopen(path, RTLD_LAZY))) { \
-            av_log(avctx, AV_LOG_ERROR,         \
-                   "Cannot load %s\n",          \
-                   path);                       \
-            return AVERROR_UNKNOWN;             \
-        }                                       \
-    } while (0)
-
-#define LOAD_SYMBOL(fun, lib, symbol)        \
-    do {                                     \
-        if (!((fun) = dlsym(lib, symbol))) { \
-            av_log(avctx, AV_LOG_ERROR,      \
-                   "Cannot load %s\n",       \
-                   symbol);                  \
-            return AVERROR_UNKNOWN;          \
-        }                                    \
-    } while (0)
-
-static av_cold int nvenc_load_libraries(AVCodecContext *avctx)
-{
-    NVENCContext *ctx         = avctx->priv_data;
-    NVENCLibraryContext *nvel = &ctx->nvel;
-    PNVENCODEAPICREATEINSTANCE nvenc_create_instance;
+#include "thread.h"
 
-    LOAD_LIBRARY(nvel->cuda, CUDA_LIBNAME);
+#if defined(_WIN32)
+#define CUDAAPI __stdcall
+#else
+#define CUDAAPI
+#endif
 
-    LOAD_SYMBOL(nvel->cu_init, nvel->cuda, "cuInit");
-    LOAD_SYMBOL(nvel->cu_device_get_count, nvel->cuda, "cuDeviceGetCount");
-    LOAD_SYMBOL(nvel->cu_device_get, nvel->cuda, "cuDeviceGet");
-    LOAD_SYMBOL(nvel->cu_device_get_name, nvel->cuda, "cuDeviceGetName");
-    LOAD_SYMBOL(nvel->cu_device_compute_capability, nvel->cuda,
-                "cuDeviceComputeCapability");
-    LOAD_SYMBOL(nvel->cu_ctx_create, nvel->cuda, "cuCtxCreate_v2");
-    LOAD_SYMBOL(nvel->cu_ctx_pop_current, nvel->cuda, "cuCtxPopCurrent_v2");
-    LOAD_SYMBOL(nvel->cu_ctx_destroy, nvel->cuda, "cuCtxDestroy_v2");
+#if defined(_WIN32)
+#define LOAD_FUNC(l, s) GetProcAddress(l, s)
+#define DL_CLOSE_FUNC(l) FreeLibrary(l)
+#else
+#define LOAD_FUNC(l, s) dlsym(l, s)
+#define DL_CLOSE_FUNC(l) dlclose(l)
+#endif
 
-    LOAD_LIBRARY(nvel->nvenc, NVENC_LIBNAME);
+typedef enum cudaError_enum {
+    CUDA_SUCCESS = 0
+} CUresult;
+typedef int CUdevice;
+typedef void* CUcontext;
 
-    LOAD_SYMBOL(nvenc_create_instance, nvel->nvenc,
-                "NvEncodeAPICreateInstance");
+typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
+typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count);
+typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal);
+typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev);
+typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev);
+typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev);
+typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx);
+typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx);
 
-    nvel->nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;
+typedef NVENCSTATUS (NVENCAPI* PNVENCODEAPICREATEINSTANCE)(NV_ENCODE_API_FUNCTION_LIST *functionList);
 
-    if ((nvenc_create_instance(&nvel->nvenc_funcs)) != NV_ENC_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR, "Cannot create the NVENC instance");
-        return AVERROR_UNKNOWN;
-    }
+typedef struct NvencInputSurface
+{
+    NV_ENC_INPUT_PTR input_surface;
+    int width;
+    int height;
 
-    return 0;
-}
+    int lockCount;
 
-static int nvenc_open_session(AVCodecContext *avctx)
+    NV_ENC_BUFFER_FORMAT format;
+} NvencInputSurface;
+
+typedef struct NvencOutputSurface
 {
-    NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS params = { 0 };
-    NVENCContext *ctx                           = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv             = &ctx->nvel.nvenc_funcs;
-    int ret;
-
-    params.version    = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
-    params.apiVersion = NVENCAPI_VERSION;
-    params.device     = ctx->cu_context;
-    params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
-
-    ret = nv->nvEncOpenEncodeSessionEx(&params, &ctx->nvenc_ctx);
-    if (ret != NV_ENC_SUCCESS) {
-        ctx->nvenc_ctx = NULL;
-        av_log(avctx, AV_LOG_ERROR,
-               "Cannot open the NVENC Session\n");
-        return AVERROR_UNKNOWN;
-    }
+    NV_ENC_OUTPUT_PTR output_surface;
+    int size;
 
-    return 0;
-}
+    NvencInputSurface* input_surface;
+
+    int busy;
+} NvencOutputSurface;
 
-static int nvenc_check_codec_support(AVCodecContext *avctx)
+typedef struct NvencData
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    int i, ret, count = 0;
-    GUID *guids = NULL;
+    union {
+        int64_t timestamp;
+        NvencOutputSurface *surface;
+    } u;
+} NvencData;
 
-    ret = nv->nvEncGetEncodeGUIDCount(ctx->nvenc_ctx, &count);
+typedef struct NvencDataList
+{
+    NvencData* data;
 
-    if (ret != NV_ENC_SUCCESS || !count)
-        return AVERROR(ENOSYS);
+    uint32_t pos;
+    uint32_t count;
+    uint32_t size;
+} NvencDataList;
 
-    guids = av_malloc(count * sizeof(GUID));
-    if (!guids)
-        return AVERROR(ENOMEM);
+typedef struct NvencDynLoadFunctions
+{
+    PCUINIT cu_init;
+    PCUDEVICEGETCOUNT cu_device_get_count;
+    PCUDEVICEGET cu_device_get;
+    PCUDEVICEGETNAME cu_device_get_name;
+    PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability;
+    PCUCTXCREATE cu_ctx_create;
+    PCUCTXPOPCURRENT cu_ctx_pop_current;
+    PCUCTXDESTROY cu_ctx_destroy;
+
+    NV_ENCODE_API_FUNCTION_LIST nvenc_funcs;
+    int nvenc_device_count;
+    CUdevice nvenc_devices[16];
+
+#if defined(_WIN32)
+    HMODULE cuda_lib;
+    HMODULE nvenc_lib;
+#else
+    void* cuda_lib;
+    void* nvenc_lib;
+#endif
+} NvencDynLoadFunctions;
 
-    ret = nv->nvEncGetEncodeGUIDs(ctx->nvenc_ctx, guids, count, &count);
-    if (ret != NV_ENC_SUCCESS) {
-        ret = AVERROR(ENOSYS);
-        goto fail;
-    }
+typedef struct NvencValuePair
+{
+    const char *str;
+    uint32_t num;
+} NvencValuePair;
 
-    ret = AVERROR(ENOSYS);
-    for (i = 0; i < count; i++) {
-        if (!memcmp(&guids[i], &ctx->params.encodeGUID, sizeof(*guids))) {
-            ret = 0;
-            break;
+typedef struct NvencContext
+{
+    AVClass *avclass;
+
+    NvencDynLoadFunctions nvenc_dload_funcs;
+
+    NV_ENC_INITIALIZE_PARAMS init_encode_params;
+    NV_ENC_CONFIG encode_config;
+    CUcontext cu_context;
+
+    int max_surface_count;
+    NvencInputSurface *input_surfaces;
+    NvencOutputSurface *output_surfaces;
+
+    NvencDataList output_surface_queue;
+    NvencDataList output_surface_ready_queue;
+    NvencDataList timestamp_list;
+    int64_t last_dts;
+
+    void *nvencoder;
+
+    char *preset;
+    char *profile;
+    char *level;
+    char *tier;
+    int cbr;
+    int twopass;
+    int gpu;
+    int buffer_delay;
+} NvencContext;
+
+static const NvencValuePair nvenc_h264_level_pairs[] = {
+    { "auto", NV_ENC_LEVEL_AUTOSELECT },
+    { "1"   , NV_ENC_LEVEL_H264_1     },
+    { "1.0" , NV_ENC_LEVEL_H264_1     },
+    { "1b"  , NV_ENC_LEVEL_H264_1b    },
+    { "1.0b", NV_ENC_LEVEL_H264_1b    },
+    { "1.1" , NV_ENC_LEVEL_H264_11    },
+    { "1.2" , NV_ENC_LEVEL_H264_12    },
+    { "1.3" , NV_ENC_LEVEL_H264_13    },
+    { "2"   , NV_ENC_LEVEL_H264_2     },
+    { "2.0" , NV_ENC_LEVEL_H264_2     },
+    { "2.1" , NV_ENC_LEVEL_H264_21    },
+    { "2.2" , NV_ENC_LEVEL_H264_22    },
+    { "3"   , NV_ENC_LEVEL_H264_3     },
+    { "3.0" , NV_ENC_LEVEL_H264_3     },
+    { "3.1" , NV_ENC_LEVEL_H264_31    },
+    { "3.2" , NV_ENC_LEVEL_H264_32    },
+    { "4"   , NV_ENC_LEVEL_H264_4     },
+    { "4.0" , NV_ENC_LEVEL_H264_4     },
+    { "4.1" , NV_ENC_LEVEL_H264_41    },
+    { "4.2" , NV_ENC_LEVEL_H264_42    },
+    { "5"   , NV_ENC_LEVEL_H264_5     },
+    { "5.0" , NV_ENC_LEVEL_H264_5     },
+    { "5.1" , NV_ENC_LEVEL_H264_51    },
+    { NULL }
+};
+
+static const NvencValuePair nvenc_hevc_level_pairs[] = {
+    { "auto", NV_ENC_LEVEL_AUTOSELECT },
+    { "1"   , NV_ENC_LEVEL_HEVC_1     },
+    { "1.0" , NV_ENC_LEVEL_HEVC_1     },
+    { "2"   , NV_ENC_LEVEL_HEVC_2     },
+    { "2.0" , NV_ENC_LEVEL_HEVC_2     },
+    { "2.1" , NV_ENC_LEVEL_HEVC_21    },
+    { "3"   , NV_ENC_LEVEL_HEVC_3     },
+    { "3.0" , NV_ENC_LEVEL_HEVC_3     },
+    { "3.1" , NV_ENC_LEVEL_HEVC_31    },
+    { "4"   , NV_ENC_LEVEL_HEVC_4     },
+    { "4.0" , NV_ENC_LEVEL_HEVC_4     },
+    { "4.1" , NV_ENC_LEVEL_HEVC_41    },
+    { "5"   , NV_ENC_LEVEL_HEVC_5     },
+    { "5.0" , NV_ENC_LEVEL_HEVC_5     },
+    { "5.1" , NV_ENC_LEVEL_HEVC_51    },
+    { "5.2" , NV_ENC_LEVEL_HEVC_52    },
+    { "6"   , NV_ENC_LEVEL_HEVC_6     },
+    { "6.0" , NV_ENC_LEVEL_HEVC_6     },
+    { "6.1" , NV_ENC_LEVEL_HEVC_61    },
+    { "6.2" , NV_ENC_LEVEL_HEVC_62    },
+    { NULL }
+};
+
+static int input_string_to_uint32(AVCodecContext *avctx, const NvencValuePair *pair, const char *input, uint32_t *output)
+{
+    for (; pair->str; ++pair) {
+        if (!strcmp(input, pair->str)) {
+            *output = pair->num;
+            return 0;
         }
     }
 
-fail:
-    av_free(guids);
-
-    return ret;
+    return AVERROR(EINVAL);
 }
 
-static int nvenc_check_cap(AVCodecContext *avctx, NV_ENC_CAPS cap)
+static NvencData* data_queue_dequeue(NvencDataList* queue)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    NV_ENC_CAPS_PARAM params        = { 0 };
-    int ret, val = 0;
+    uint32_t mask;
+    uint32_t read_pos;
 
-    params.version     = NV_ENC_CAPS_PARAM_VER;
-    params.capsToQuery = cap;
+    av_assert0(queue);
+    av_assert0(queue->size);
+    av_assert0(queue->data);
 
-    ret = nv->nvEncGetEncodeCaps(ctx->nvenc_ctx, ctx->params.encodeGUID, &params, &val);
+    if (!queue->count)
+        return NULL;
 
-    if (ret == NV_ENC_SUCCESS)
-        return val;
-    return 0;
+    /* Size always is a multiple of two */
+    mask = queue->size - 1;
+    read_pos = (queue->pos - queue->count) & mask;
+    queue->count--;
+
+    return &queue->data[read_pos];
 }
 
-static int nvenc_check_capabilities(AVCodecContext *avctx)
+static int data_queue_enqueue(NvencDataList* queue, NvencData *data)
 {
-    int ret;
+    NvencDataList new_queue;
+    NvencData* tmp_data;
+    uint32_t mask;
 
-    ret = nvenc_check_codec_support(avctx);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_VERBOSE, "Codec not supported\n");
-        return ret;
-    }
+    if (!queue->size) {
+        /* size always has to be a multiple of two */
+        queue->size = 4;
+        queue->pos = 0;
+        queue->count = 0;
 
-    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_YUV444_ENCODE);
-    if (avctx->pix_fmt == AV_PIX_FMT_YUV444P && ret <= 0) {
-        av_log(avctx, AV_LOG_VERBOSE, "YUV444P not supported\n");
-        return AVERROR(ENOSYS);
-    }
-
-    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_WIDTH_MAX);
-    if (ret < avctx->width) {
-        av_log(avctx, AV_LOG_VERBOSE, "Width %d exceeds %d\n",
-               avctx->width, ret);
-        return AVERROR(ENOSYS);
-    }
+        queue->data = av_malloc(queue->size * sizeof(*(queue->data)));
 
-    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_HEIGHT_MAX);
-    if (ret < avctx->height) {
-        av_log(avctx, AV_LOG_VERBOSE, "Height %d exceeds %d\n",
-               avctx->height, ret);
-        return AVERROR(ENOSYS);
+        if (!queue->data) {
+            queue->size = 0;
+            return AVERROR(ENOMEM);
+        }
     }
 
-    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_NUM_MAX_BFRAMES);
-    if (ret < avctx->max_b_frames) {
-        av_log(avctx, AV_LOG_VERBOSE, "Max b-frames %d exceed %d\n",
-               avctx->max_b_frames, ret);
+    if (queue->count == queue->size) {
+        new_queue.size = queue->size << 1;
+        new_queue.pos = 0;
+        new_queue.count = 0;
+        new_queue.data = av_malloc(new_queue.size * sizeof(*(queue->data)));
 
-        return AVERROR(ENOSYS);
-    }
+        if (!new_queue.data)
+            return AVERROR(ENOMEM);
 
-    return 0;
-}
+        while (tmp_data = data_queue_dequeue(queue))
+            data_queue_enqueue(&new_queue, tmp_data);
 
-static int nvenc_check_device(AVCodecContext *avctx, int idx)
-{
-    NVENCContext *ctx               = avctx->priv_data;
-    NVENCLibraryContext *nvel       = &ctx->nvel;
-    char name[128]                  = { 0 };
-    int major, minor, ret;
-    CUdevice cu_device;
-    CUcontext dummy;
-    int loglevel = AV_LOG_VERBOSE;
-
-    if (ctx->device == LIST_DEVICES)
-        loglevel = AV_LOG_INFO;
-
-    ret = nvel->cu_device_get(&cu_device, idx);
-    if (ret != CUDA_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Cannot access the CUDA device %d\n",
-               idx);
-        return -1;
+        av_free(queue->data);
+        *queue = new_queue;
     }
 
-    ret = nvel->cu_device_get_name(name, sizeof(name), cu_device);
-    if (ret != CUDA_SUCCESS)
-        return -1;
-
-    ret = nvel->cu_device_compute_capability(&major, &minor, cu_device);
-    if (ret != CUDA_SUCCESS)
-        return -1;
+    mask = queue->size - 1;
 
-    av_log(avctx, loglevel, "Device %d [%s] ", cu_device, name);
+    queue->data[queue->pos] = *data;
+    queue->pos = (queue->pos + 1) & mask;
+    queue->count++;
 
-    if (((major << 4) | minor) < NVENC_CAP)
-        goto fail;
+    return 0;
+}
 
-    ret = nvel->cu_ctx_create(&ctx->cu_context, 0, cu_device);
-    if (ret != CUDA_SUCCESS)
-        goto fail;
+static int out_surf_queue_enqueue(NvencDataList* queue, NvencOutputSurface* surface)
+{
+    NvencData data;
+    data.u.surface = surface;
 
-    ret = nvel->cu_ctx_pop_current(&dummy);
-    if (ret != CUDA_SUCCESS)
-        goto fail2;
+    return data_queue_enqueue(queue, &data);
+}
 
-    if ((ret = nvenc_open_session(avctx)) < 0)
-        goto fail2;
+static NvencOutputSurface* out_surf_queue_dequeue(NvencDataList* queue)
+{
+    NvencData* res = data_queue_dequeue(queue);
 
-    if ((ret = nvenc_check_capabilities(avctx)) < 0)
-        goto fail3;
+    if (!res)
+        return NULL;
 
-    av_log(avctx, loglevel, "supports NVENC\n");
+    return res->u.surface;
+}
 
-    if (ctx->device == cu_device || ctx->device == ANY_DEVICE)
-        return 0;
+static int timestamp_queue_enqueue(NvencDataList* queue, int64_t timestamp)
+{
+    NvencData data;
+    data.u.timestamp = timestamp;
 
-fail3:
-    nvel->nvenc_funcs.nvEncDestroyEncoder(ctx->nvenc_ctx);
-    ctx->nvenc_ctx = NULL;
+    return data_queue_enqueue(queue, &data);
+}
 
-fail2:
-    nvel->cu_ctx_destroy(ctx->cu_context);
-    ctx->cu_context = NULL;
+static int64_t timestamp_queue_dequeue(NvencDataList* queue)
+{
+    NvencData* res = data_queue_dequeue(queue);
 
-fail:
-    if (ret != 0)
-        av_log(avctx, loglevel, "does not support NVENC (major %d minor %d)\n",
-               major, minor);
+    if (!res)
+        return AV_NOPTS_VALUE;
 
-    return AVERROR(ENOSYS);
+    return res->u.timestamp;
 }
 
-static int nvenc_setup_device(AVCodecContext *avctx)
+#define CHECK_LOAD_FUNC(t, f, s) \
+do { \
+    (f) = (t)LOAD_FUNC(dl_fn->cuda_lib, s); \
+    if (!(f)) { \
+        av_log(avctx, AV_LOG_FATAL, "Failed loading %s from CUDA library\n", s); \
+        goto error; \
+    } \
+} while (0)
+
+static av_cold int nvenc_dyload_cuda(AVCodecContext *avctx)
 {
-    NVENCContext *ctx         = avctx->priv_data;
-    NVENCLibraryContext *nvel = &ctx->nvel;
-    int i, nb_devices = 0;
-
-    if ((nvel->cu_init(0)) != CUDA_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Cannot init CUDA\n");
-        return AVERROR_UNKNOWN;
-    }
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
 
-    if ((nvel->cu_device_get_count(&nb_devices)) != CUDA_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Cannot enumerate the CUDA devices\n");
-        return AVERROR_UNKNOWN;
-    }
+    if (dl_fn->cuda_lib)
+        return 1;
 
-    switch (avctx->codec->id) {
-    case AV_CODEC_ID_H264:
-        ctx->params.encodeGUID = NV_ENC_CODEC_H264_GUID;
-        break;
-    case AV_CODEC_ID_HEVC:
-        ctx->params.encodeGUID = NV_ENC_CODEC_HEVC_GUID;
-        break;
-    default:
-        return AVERROR_BUG;
-    }
+#if defined(_WIN32)
+    dl_fn->cuda_lib = LoadLibrary(TEXT("nvcuda.dll"));
+#else
+    dl_fn->cuda_lib = dlopen("libcuda.so", RTLD_LAZY);
+#endif
 
-    for (i = 0; i < nb_devices; ++i) {
-        if ((nvenc_check_device(avctx, i)) >= 0 && ctx->device != LIST_DEVICES)
-            return 0;
+    if (!dl_fn->cuda_lib) {
+        av_log(avctx, AV_LOG_FATAL, "Failed loading CUDA library\n");
+        goto error;
     }
 
-    if (ctx->device == LIST_DEVICES)
-        return AVERROR_EXIT;
+    CHECK_LOAD_FUNC(PCUINIT, dl_fn->cu_init, "cuInit");
+    CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, dl_fn->cu_device_get_count, "cuDeviceGetCount");
+    CHECK_LOAD_FUNC(PCUDEVICEGET, dl_fn->cu_device_get, "cuDeviceGet");
+    CHECK_LOAD_FUNC(PCUDEVICEGETNAME, dl_fn->cu_device_get_name, "cuDeviceGetName");
+    CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, dl_fn->cu_device_compute_capability, "cuDeviceComputeCapability");
+    CHECK_LOAD_FUNC(PCUCTXCREATE, dl_fn->cu_ctx_create, "cuCtxCreate_v2");
+    CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, dl_fn->cu_ctx_pop_current, "cuCtxPopCurrent_v2");
+    CHECK_LOAD_FUNC(PCUCTXDESTROY, dl_fn->cu_ctx_destroy, "cuCtxDestroy_v2");
 
-    return AVERROR(ENOSYS);
-}
+    return 1;
 
-typedef struct GUIDTuple {
-    const GUID guid;
-    int flags;
-} GUIDTuple;
+error:
 
-static int nvec_map_preset(NVENCContext *ctx)
-{
-    GUIDTuple presets[] = {
-        { NV_ENC_PRESET_DEFAULT_GUID },
-        { NV_ENC_PRESET_HP_GUID },
-        { NV_ENC_PRESET_HQ_GUID },
-        { NV_ENC_PRESET_BD_GUID },
-        { NV_ENC_PRESET_LOW_LATENCY_DEFAULT_GUID, NVENC_LOWLATENCY },
-        { NV_ENC_PRESET_LOW_LATENCY_HP_GUID,      NVENC_LOWLATENCY },
-        { NV_ENC_PRESET_LOW_LATENCY_HQ_GUID,      NVENC_LOWLATENCY },
-        { NV_ENC_PRESET_LOSSLESS_DEFAULT_GUID,    NVENC_LOSSLESS },
-        { NV_ENC_PRESET_LOSSLESS_HP_GUID,         NVENC_LOSSLESS },
-        { { 0 } }
-    };
-
-    GUIDTuple *t = &presets[ctx->preset];
-
-    ctx->params.presetGUID = t->guid;
-    ctx->flags             = t->flags;
+    if (dl_fn->cuda_lib)
+        DL_CLOSE_FUNC(dl_fn->cuda_lib);
 
-    return AVERROR(EINVAL);
-}
+    dl_fn->cuda_lib = NULL;
 
-static void set_constqp(AVCodecContext *avctx, NV_ENC_RC_PARAMS *rc)
-{
-    rc->rateControlMode = NV_ENC_PARAMS_RC_CONSTQP;
-    rc->constQP.qpInterB = avctx->global_quality;
-    rc->constQP.qpInterP = avctx->global_quality;
-    rc->constQP.qpIntra  = avctx->global_quality;
+    return 0;
 }
 
-static void set_vbr(AVCodecContext *avctx, NV_ENC_RC_PARAMS *rc)
+static av_cold int check_cuda_errors(AVCodecContext *avctx, CUresult err, const char *func)
 {
-    if (avctx->qmin >= 0) {
-        rc->enableMinQP    = 1;
-        rc->minQP.qpInterB = avctx->qmin;
-        rc->minQP.qpInterP = avctx->qmin;
-        rc->minQP.qpIntra  = avctx->qmin;
-    }
-
-    if (avctx->qmax >= 0) {
-        rc->enableMaxQP = 1;
-        rc->maxQP.qpInterB = avctx->qmax;
-        rc->maxQP.qpInterP = avctx->qmax;
-        rc->maxQP.qpIntra  = avctx->qmax;
+    if (err != CUDA_SUCCESS) {
+        av_log(avctx, AV_LOG_FATAL, ">> %s - failed with error code 0x%x\n", func, err);
+        return 0;
     }
+    return 1;
 }
+#define check_cuda_errors(f) if (!check_cuda_errors(avctx, f, #f)) goto error
 
-static void nvenc_override_rate_control(AVCodecContext *avctx,
-                                        NV_ENC_RC_PARAMS *rc)
+static av_cold int nvenc_check_cuda(AVCodecContext *avctx)
 {
-    NVENCContext *ctx    = avctx->priv_data;
-
-    switch (ctx->rc) {
-    case NV_ENC_PARAMS_RC_CONSTQP:
-        if (avctx->global_quality < 0) {
-            av_log(avctx, AV_LOG_WARNING,
-                   "The constant quality rate-control requires "
-                   "the 'global_quality' option set.\n");
-            return;
-        }
-        set_constqp(avctx, rc);
-        return;
-    case NV_ENC_PARAMS_RC_2_PASS_VBR:
-    case NV_ENC_PARAMS_RC_VBR:
-        if (avctx->qmin < 0 && avctx->qmax < 0) {
-            av_log(avctx, AV_LOG_WARNING,
-                   "The variable bitrate rate-control requires "
-                   "the 'qmin' and/or 'qmax' option set.\n");
-            return;
-        }
-    case NV_ENC_PARAMS_RC_VBR_MINQP:
-        if (avctx->qmin < 0) {
-            av_log(avctx, AV_LOG_WARNING,
-                   "The variable bitrate rate-control requires "
-                   "the 'qmin' option set.\n");
-            return;
-        }
-        set_vbr(avctx, rc);
+    int device_count = 0;
+    CUdevice cu_device = 0;
+    char gpu_name[128];
+    int smminor = 0, smmajor = 0;
+    int i, smver, target_smver;
+
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+
+    switch (avctx->codec->id) {
+    case AV_CODEC_ID_H264:
+        target_smver = avctx->pix_fmt == AV_PIX_FMT_YUV444P ? 0x52 : 0x30;
         break;
-    case NV_ENC_PARAMS_RC_CBR:
+    case AV_CODEC_ID_H265:
+        target_smver = 0x52;
         break;
-    case NV_ENC_PARAMS_RC_2_PASS_QUALITY:
-    case NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP:
-        if (!(ctx->flags & NVENC_LOWLATENCY)) {
-            av_log(avctx, AV_LOG_WARNING,
-                   "The multipass rate-control requires "
-                   "a low-latency preset.\n");
-            return;
-        }
+    default:
+        av_log(avctx, AV_LOG_FATAL, "Unknown codec name\n");
+        goto error;
     }
 
-    rc->rateControlMode = ctx->rc;
-}
+    if (!nvenc_dyload_cuda(avctx))
+        return 0;
 
-static void nvenc_setup_rate_control(AVCodecContext *avctx)
-{
-    NVENCContext *ctx    = avctx->priv_data;
-    NV_ENC_RC_PARAMS *rc = &ctx->config.rcParams;
+    if (dl_fn->nvenc_device_count > 0)
+        return 1;
 
-    if (avctx->bit_rate > 0)
-        rc->averageBitRate = avctx->bit_rate;
+    check_cuda_errors(dl_fn->cu_init(0));
 
-    if (avctx->rc_max_rate > 0)
-        rc->maxBitRate = avctx->rc_max_rate;
+    check_cuda_errors(dl_fn->cu_device_get_count(&device_count));
 
-    if (ctx->rc > 0) {
-        nvenc_override_rate_control(avctx, rc);
-    } else if (avctx->global_quality > 0) {
-        set_constqp(avctx, rc);
-    } else if (avctx->qmin >= 0 && avctx->qmax >= 0) {
-        rc->rateControlMode = NV_ENC_PARAMS_RC_VBR;
-        set_vbr(avctx, rc);
+    if (!device_count) {
+        av_log(avctx, AV_LOG_FATAL, "No CUDA capable devices found\n");
+        goto error;
     }
 
-    if (avctx->rc_buffer_size > 0)
-        rc->vbvBufferSize = avctx->rc_buffer_size;
-
-    if (rc->averageBitRate > 0)
-        avctx->bit_rate = rc->averageBitRate;
-}
-
-static int nvenc_setup_h264_config(AVCodecContext *avctx)
-{
-    NVENCContext *ctx                      = avctx->priv_data;
-    NV_ENC_CONFIG *cc                      = &ctx->config;
-    NV_ENC_CONFIG_H264 *h264               = &cc->encodeCodecConfig.h264Config;
-    NV_ENC_CONFIG_H264_VUI_PARAMETERS *vui = &h264->h264VUIParameters;
+    av_log(avctx, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", device_count);
 
-    vui->colourDescriptionPresentFlag = 1;
-    vui->videoSignalTypePresentFlag   = 1;
+    dl_fn->nvenc_device_count = 0;
 
-    vui->colourMatrix            = avctx->colorspace;
-    vui->colourPrimaries         = avctx->color_primaries;
-    vui->transferCharacteristics = avctx->color_trc;
+    for (i = 0; i < device_count; ++i) {
+        check_cuda_errors(dl_fn->cu_device_get(&cu_device, i));
+        check_cuda_errors(dl_fn->cu_device_get_name(gpu_name, sizeof(gpu_name), cu_device));
+        check_cuda_errors(dl_fn->cu_device_compute_capability(&smmajor, &smminor, cu_device));
 
-    vui->videoFullRangeFlag = avctx->color_range == AVCOL_RANGE_JPEG;
+        smver = (smmajor << 4) | smminor;
 
-    h264->disableSPSPPS = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 1 : 0;
-    h264->repeatSPSPPS  = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 0 : 1;
+        av_log(avctx, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM %d.%d, NVENC %s ]\n", i, gpu_name, smmajor, smminor, (smver >= target_smver) ? "Available" : "Not Available");
 
-    h264->maxNumRefFrames = avctx->refs;
-    h264->idrPeriod       = cc->gopLength;
+        if (smver >= target_smver)
+            dl_fn->nvenc_devices[dl_fn->nvenc_device_count++] = cu_device;
+    }
 
-    if (ctx->profile)
-        avctx->profile = ctx->profile;
+    if (!dl_fn->nvenc_device_count) {
+        av_log(avctx, AV_LOG_FATAL, "No NVENC capable devices found\n");
+        goto error;
+    }
 
-    if (avctx->pix_fmt == AV_PIX_FMT_YUV444P)
-        h264->chromaFormatIDC = 3;
-    else
-        h264->chromaFormatIDC = 1;
+    return 1;
 
-    switch (ctx->profile) {
-    case NV_ENC_H264_PROFILE_BASELINE:
-        cc->profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
-        break;
-    case NV_ENC_H264_PROFILE_MAIN:
-        cc->profileGUID = NV_ENC_H264_PROFILE_MAIN_GUID;
-        break;
-    case NV_ENC_H264_PROFILE_HIGH:
-        cc->profileGUID = NV_ENC_H264_PROFILE_HIGH_GUID;
-        break;
-    case NV_ENC_H264_PROFILE_HIGH_444:
-        cc->profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
-        break;
-    case NV_ENC_H264_PROFILE_CONSTRAINED_HIGH:
-        cc->profileGUID = NV_ENC_H264_PROFILE_CONSTRAINED_HIGH_GUID;
-        break;
-    }
+error:
 
-    h264->level = ctx->level;
+    dl_fn->nvenc_device_count = 0;
 
     return 0;
 }
 
-static int nvenc_setup_hevc_config(AVCodecContext *avctx)
+static av_cold int nvenc_dyload_nvenc(AVCodecContext *avctx)
 {
-    NVENCContext *ctx                      = avctx->priv_data;
-    NV_ENC_CONFIG *cc                      = &ctx->config;
-    NV_ENC_CONFIG_HEVC *hevc               = &cc->encodeCodecConfig.hevcConfig;
+    PNVENCODEAPICREATEINSTANCE nvEncodeAPICreateInstance = 0;
+    NVENCSTATUS nvstatus;
 
-    hevc->disableSPSPPS = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 1 : 0;
-    hevc->repeatSPSPPS  = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 0 : 1;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
 
-    hevc->maxNumRefFramesInDPB = avctx->refs;
-    hevc->idrPeriod            = cc->gopLength;
+    if (!nvenc_check_cuda(avctx))
+        return 0;
 
-    /* No other profile is supported in the current SDK version 5 */
-    cc->profileGUID = NV_ENC_HEVC_PROFILE_MAIN_GUID;
-    avctx->profile  = FF_PROFILE_HEVC_MAIN;
+    if (dl_fn->nvenc_lib)
+        return 1;
 
-    if (ctx->level) {
-        hevc->level = ctx->level;
+#if defined(_WIN32)
+    if (sizeof(void*) == 8) {
+        dl_fn->nvenc_lib = LoadLibrary(TEXT("nvEncodeAPI64.dll"));
     } else {
-        hevc->level = NV_ENC_LEVEL_AUTOSELECT;
+        dl_fn->nvenc_lib = LoadLibrary(TEXT("nvEncodeAPI.dll"));
     }
+#else
+    dl_fn->nvenc_lib = dlopen("libnvidia-encode.so.1", RTLD_LAZY);
+#endif
 
-    if (ctx->tier) {
-        hevc->tier = ctx->tier;
+    if (!dl_fn->nvenc_lib) {
+        av_log(avctx, AV_LOG_FATAL, "Failed loading the nvenc library\n");
+        goto error;
     }
 
-    return 0;
-}
-static int nvenc_setup_codec_config(AVCodecContext *avctx)
-{
-    switch (avctx->codec->id) {
-    case AV_CODEC_ID_H264:
-        return nvenc_setup_h264_config(avctx);
-    case AV_CODEC_ID_HEVC:
-        return nvenc_setup_hevc_config(avctx);
+    nvEncodeAPICreateInstance = (PNVENCODEAPICREATEINSTANCE)LOAD_FUNC(dl_fn->nvenc_lib, "NvEncodeAPICreateInstance");
+
+    if (!nvEncodeAPICreateInstance) {
+        av_log(avctx, AV_LOG_FATAL, "Failed to load nvenc entrypoint\n");
+        goto error;
     }
-    return 0;
-}
 
-static int nvenc_setup_encoder(AVCodecContext *avctx)
-{
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    NV_ENC_PRESET_CONFIG preset_cfg = { 0 };
-    int ret;
-
-    ctx->params.version = NV_ENC_INITIALIZE_PARAMS_VER;
-
-    ctx->params.encodeHeight = avctx->height;
-    ctx->params.encodeWidth  = avctx->width;
-
-    if (avctx->sample_aspect_ratio.num &&
-        avctx->sample_aspect_ratio.den &&
-        (avctx->sample_aspect_ratio.num != 1 ||
-         avctx->sample_aspect_ratio.den != 1)) {
-        av_reduce(&ctx->params.darWidth,
-                  &ctx->params.darHeight,
-                  avctx->width * avctx->sample_aspect_ratio.num,
-                  avctx->height * avctx->sample_aspect_ratio.den,
-                  INT_MAX / 8);
-    } else {
-        ctx->params.darHeight = avctx->height;
-        ctx->params.darWidth  = avctx->width;
+    dl_fn->nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;
+
+    nvstatus = nvEncodeAPICreateInstance(&dl_fn->nvenc_funcs);
+
+    if (nvstatus != NV_ENC_SUCCESS) {
+        av_log(avctx, AV_LOG_FATAL, "Failed to create nvenc instance\n");
+        goto error;
     }
 
-    ctx->params.frameRateNum = avctx->time_base.den;
-    ctx->params.frameRateDen = avctx->time_base.num * avctx->ticks_per_frame;
+    av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n");
 
-    ctx->params.enableEncodeAsync = 0;
-    ctx->params.enablePTD         = 1;
+    return 1;
 
-    ctx->params.encodeConfig = &ctx->config;
+error:
+    if (dl_fn->nvenc_lib)
+        DL_CLOSE_FUNC(dl_fn->nvenc_lib);
 
-    nvec_map_preset(ctx);
+    dl_fn->nvenc_lib = NULL;
 
-    preset_cfg.version           = NV_ENC_PRESET_CONFIG_VER;
-    preset_cfg.presetCfg.version = NV_ENC_CONFIG_VER;
+    return 0;
+}
 
-    ret = nv->nvEncGetEncodePresetConfig(ctx->nvenc_ctx,
-                                         ctx->params.encodeGUID,
-                                         ctx->params.presetGUID,
-                                         &preset_cfg);
-    if (ret != NV_ENC_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Cannot get the preset configuration\n");
-        return AVERROR_UNKNOWN;
-    }
+static av_cold void nvenc_unload_nvenc(AVCodecContext *avctx)
+{
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
 
-    memcpy(&ctx->config, &preset_cfg.presetCfg, sizeof(ctx->config));
+    DL_CLOSE_FUNC(dl_fn->nvenc_lib);
+    dl_fn->nvenc_lib = NULL;
 
-    ctx->config.version = NV_ENC_CONFIG_VER;
+    dl_fn->nvenc_device_count = 0;
 
-    if (avctx->gop_size > 0) {
-        if (avctx->max_b_frames > 0) {
-            ctx->last_dts = -2;
-            /* 0 is intra-only,
-             * 1 is I/P only,
-             * 2 is one B Frame,
-             * 3 two B frames, and so on. */
-            ctx->config.frameIntervalP = avctx->max_b_frames + 1;
-        } else if (avctx->max_b_frames == 0) {
-            ctx->config.frameIntervalP = 1;
-        }
-        ctx->config.gopLength = avctx->gop_size;
-    } else if (avctx->gop_size == 0) {
-        ctx->config.frameIntervalP = 0;
-        ctx->config.gopLength      = 1;
-    }
+    DL_CLOSE_FUNC(dl_fn->cuda_lib);
+    dl_fn->cuda_lib = NULL;
 
-    if (ctx->config.frameIntervalP > 1)
-        avctx->max_b_frames = ctx->config.frameIntervalP - 1;
+    dl_fn->cu_init = NULL;
+    dl_fn->cu_device_get_count = NULL;
+    dl_fn->cu_device_get = NULL;
+    dl_fn->cu_device_get_name = NULL;
+    dl_fn->cu_device_compute_capability = NULL;
+    dl_fn->cu_ctx_create = NULL;
+    dl_fn->cu_ctx_pop_current = NULL;
+    dl_fn->cu_ctx_destroy = NULL;
 
-    nvenc_setup_rate_control(avctx);
+    av_log(avctx, AV_LOG_VERBOSE, "Nvenc unloaded\n");
+}
 
-    if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
-        ctx->config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FIELD;
-    } else {
-        ctx->config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
+static av_cold int nvenc_encode_init(AVCodecContext *avctx)
+{
+    NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS encode_session_params = { 0 };
+    NV_ENC_PRESET_CONFIG preset_config = { 0 };
+    CUcontext cu_context_curr;
+    CUresult cu_res;
+    GUID encoder_preset = NV_ENC_PRESET_HQ_GUID;
+    GUID codec;
+    NVENCSTATUS nv_status = NV_ENC_SUCCESS;
+    int surfaceCount = 0;
+    int i, num_mbs;
+    int isLL = 0;
+    int lossless = 0;
+    int res = 0;
+    int dw, dh;
+    int qp_inter_p;
+
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+
+    if (!nvenc_dyload_nvenc(avctx))
+        return AVERROR_EXTERNAL;
+
+    ctx->last_dts = AV_NOPTS_VALUE;
+
+    ctx->encode_config.version = NV_ENC_CONFIG_VER;
+    ctx->init_encode_params.version = NV_ENC_INITIALIZE_PARAMS_VER;
+    preset_config.version = NV_ENC_PRESET_CONFIG_VER;
+    preset_config.presetCfg.version = NV_ENC_CONFIG_VER;
+    encode_session_params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
+    encode_session_params.apiVersion = NVENCAPI_VERSION;
+
+    if (ctx->gpu >= dl_fn->nvenc_device_count) {
+        av_log(avctx, AV_LOG_FATAL, "Requested GPU %d, but only %d GPUs are available!\n", ctx->gpu, dl_fn->nvenc_device_count);
+        res = AVERROR(EINVAL);
+        goto error;
     }
 
-    if ((ret = nvenc_setup_codec_config(avctx)) < 0)
-        return ret;
+    ctx->cu_context = NULL;
+    cu_res = dl_fn->cu_ctx_create(&ctx->cu_context, 4, dl_fn->nvenc_devices[ctx->gpu]); // CU_CTX_SCHED_BLOCKING_SYNC=4, avoid CPU spins
 
-    ret = nv->nvEncInitializeEncoder(ctx->nvenc_ctx, &ctx->params);
-    if (ret != NV_ENC_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR, "Cannot initialize the decoder");
-        return AVERROR_UNKNOWN;
+    if (cu_res != CUDA_SUCCESS) {
+        av_log(avctx, AV_LOG_FATAL, "Failed creating CUDA context for NVENC: 0x%x\n", (int)cu_res);
+        res = AVERROR_EXTERNAL;
+        goto error;
     }
 
-    return 0;
-}
+    cu_res = dl_fn->cu_ctx_pop_current(&cu_context_curr);
 
-static int nvenc_alloc_surface(AVCodecContext *avctx, int idx)
-{
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    int ret;
-    NV_ENC_CREATE_INPUT_BUFFER in_buffer      = { 0 };
-    NV_ENC_CREATE_BITSTREAM_BUFFER out_buffer = { 0 };
+    if (cu_res != CUDA_SUCCESS) {
+        av_log(avctx, AV_LOG_FATAL, "Failed popping CUDA context: 0x%x\n", (int)cu_res);
+        res = AVERROR_EXTERNAL;
+        goto error;
+    }
 
-    in_buffer.version  = NV_ENC_CREATE_INPUT_BUFFER_VER;
-    out_buffer.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
+    encode_session_params.device = ctx->cu_context;
+    encode_session_params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
 
-    in_buffer.width  = avctx->width;
-    in_buffer.height = avctx->height;
+    nv_status = p_nvenc->nvEncOpenEncodeSessionEx(&encode_session_params, &ctx->nvencoder);
+    if (nv_status != NV_ENC_SUCCESS) {
+        ctx->nvencoder = NULL;
+        av_log(avctx, AV_LOG_FATAL, "OpenEncodeSessionEx failed: 0x%x\n", (int)nv_status);
+        res = AVERROR_EXTERNAL;
+        goto error;
+    }
 
-    in_buffer.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_UNCACHED;
+    if (ctx->preset) {
+        if (!strcmp(ctx->preset, "slow")) {
+            encoder_preset = NV_ENC_PRESET_HQ_GUID;
+            ctx->twopass = 1;
+        } else if (!strcmp(ctx->preset, "medium")) {
+            encoder_preset = NV_ENC_PRESET_HQ_GUID;
+            ctx->twopass = 0;
+        } else if (!strcmp(ctx->preset, "fast")) {
+            encoder_preset = NV_ENC_PRESET_HP_GUID;
+            ctx->twopass = 0;
+        } else if (!strcmp(ctx->preset, "hq")) {
+            encoder_preset = NV_ENC_PRESET_HQ_GUID;
+        } else if (!strcmp(ctx->preset, "hp")) {
+            encoder_preset = NV_ENC_PRESET_HP_GUID;
+        } else if (!strcmp(ctx->preset, "bd")) {
+            encoder_preset = NV_ENC_PRESET_BD_GUID;
+        } else if (!strcmp(ctx->preset, "ll")) {
+            encoder_preset = NV_ENC_PRESET_LOW_LATENCY_DEFAULT_GUID;
+            isLL = 1;
+        } else if (!strcmp(ctx->preset, "llhp")) {
+            encoder_preset = NV_ENC_PRESET_LOW_LATENCY_HP_GUID;
+            isLL = 1;
+        } else if (!strcmp(ctx->preset, "llhq")) {
+            encoder_preset = NV_ENC_PRESET_LOW_LATENCY_HQ_GUID;
+            isLL = 1;
+        } else if (!strcmp(ctx->preset, "lossless")) {
+            encoder_preset = NV_ENC_PRESET_LOSSLESS_DEFAULT_GUID;
+            lossless = 1;
+        } else if (!strcmp(ctx->preset, "losslesshp")) {
+            encoder_preset = NV_ENC_PRESET_LOSSLESS_HP_GUID;
+            lossless = 1;
+        } else if (!strcmp(ctx->preset, "default")) {
+            encoder_preset = NV_ENC_PRESET_DEFAULT_GUID;
+        } else {
+            av_log(avctx, AV_LOG_FATAL, "Preset \"%s\" is unknown! Supported presets: slow, medium, high, hp, hq, bd, ll, llhp, llhq, lossless, losslesshp, default\n", ctx->preset);
+            res = AVERROR(EINVAL);
+            goto error;
+        }
+    }
 
-    switch (avctx->pix_fmt) {
-    case AV_PIX_FMT_YUV420P:
-        in_buffer.bufferFmt = NV_ENC_BUFFER_FORMAT_YV12_PL;
-        break;
-    case AV_PIX_FMT_NV12:
-        in_buffer.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
+    if (ctx->twopass < 0) {
+        ctx->twopass = isLL;
+    }
+
+    switch (avctx->codec->id) {
+    case AV_CODEC_ID_H264:
+        codec = NV_ENC_CODEC_H264_GUID;
         break;
-    case AV_PIX_FMT_YUV444P:
-        in_buffer.bufferFmt = NV_ENC_BUFFER_FORMAT_YUV444_PL;
+    case AV_CODEC_ID_H265:
+        codec = NV_ENC_CODEC_HEVC_GUID;
         break;
     default:
-        return AVERROR_BUG;
+        av_log(avctx, AV_LOG_ERROR, "Unknown codec name\n");
+        res = AVERROR(EINVAL);
+        goto error;
     }
 
-    ret = nv->nvEncCreateInputBuffer(ctx->nvenc_ctx, &in_buffer);
-    if (ret != NV_ENC_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR, "CreateInputBuffer failed\n");
-        return AVERROR_UNKNOWN;
+    nv_status = p_nvenc->nvEncGetEncodePresetConfig(ctx->nvencoder, codec, encoder_preset, &preset_config);
+    if (nv_status != NV_ENC_SUCCESS) {
+        av_log(avctx, AV_LOG_FATAL, "GetEncodePresetConfig failed: 0x%x\n", (int)nv_status);
+        res = AVERROR_EXTERNAL;
+        goto error;
     }
 
-    ctx->in[idx].in        = in_buffer.inputBuffer;
-    ctx->in[idx].format    = in_buffer.bufferFmt;
+    ctx->init_encode_params.encodeGUID = codec;
+    ctx->init_encode_params.encodeHeight = avctx->height;
+    ctx->init_encode_params.encodeWidth = avctx->width;
 
-    /* 1MB is large enough to hold most output frames.
-     * NVENC increases this automaticaly if it's not enough. */
-    out_buffer.size = BITSTREAM_BUFFER_SIZE;
-
-    out_buffer.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_UNCACHED;
+    if (avctx->sample_aspect_ratio.num && avctx->sample_aspect_ratio.den &&
+        (avctx->sample_aspect_ratio.num != 1 || avctx->sample_aspect_ratio.num != 1)) {
+        av_reduce(&dw, &dh,
+                  avctx->width * avctx->sample_aspect_ratio.num,
+                  avctx->height * avctx->sample_aspect_ratio.den,
+                  1024 * 1024);
+        ctx->init_encode_params.darHeight = dh;
+        ctx->init_encode_params.darWidth = dw;
+    } else {
+        ctx->init_encode_params.darHeight = avctx->height;
+        ctx->init_encode_params.darWidth = avctx->width;
+    }
 
-    ret = nv->nvEncCreateBitstreamBuffer(ctx->nvenc_ctx, &out_buffer);
-    if (ret != NV_ENC_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR, "CreateBitstreamBuffer failed\n");
-        return AVERROR_UNKNOWN;
+    // De-compensate for hardware, dubiously, trying to compensate for
+    // playback at 704 pixel width.
+    if (avctx->width == 720 &&
+        (avctx->height == 480 || avctx->height == 576)) {
+        av_reduce(&dw, &dh,
+                  ctx->init_encode_params.darWidth * 44,
+                  ctx->init_encode_params.darHeight * 45,
+                  1024 * 1024);
+        ctx->init_encode_params.darHeight = dh;
+        ctx->init_encode_params.darWidth = dw;
     }
 
-    ctx->out[idx].out  = out_buffer.bitstreamBuffer;
-    ctx->out[idx].busy = 0;
+    ctx->init_encode_params.frameRateNum = avctx->time_base.den;
+    ctx->init_encode_params.frameRateDen = avctx->time_base.num * avctx->ticks_per_frame;
 
-    return 0;
-}
+    num_mbs = ((avctx->width + 15) >> 4) * ((avctx->height + 15) >> 4);
+    ctx->max_surface_count = (num_mbs >= 8160) ? 32 : 48;
 
-static int nvenc_setup_surfaces(AVCodecContext *avctx)
-{
-    NVENCContext *ctx = avctx->priv_data;
-    int i, ret;
+    if (ctx->buffer_delay >= ctx->max_surface_count)
+        ctx->buffer_delay = ctx->max_surface_count - 1;
 
-    ctx->nb_surfaces = FFMAX(4 + avctx->max_b_frames,
-                             ctx->nb_surfaces);
+    ctx->init_encode_params.enableEncodeAsync = 0;
+    ctx->init_encode_params.enablePTD = 1;
 
-    ctx->in = av_mallocz(ctx->nb_surfaces * sizeof(*ctx->in));
-    if (!ctx->in)
-        return AVERROR(ENOMEM);
+    ctx->init_encode_params.presetGUID = encoder_preset;
 
-    ctx->out = av_mallocz(ctx->nb_surfaces * sizeof(*ctx->out));
-    if (!ctx->out)
-        return AVERROR(ENOMEM);
+    ctx->init_encode_params.encodeConfig = &ctx->encode_config;
+    memcpy(&ctx->encode_config, &preset_config.presetCfg, sizeof(ctx->encode_config));
+    ctx->encode_config.version = NV_ENC_CONFIG_VER;
 
-    ctx->timestamps = av_fifo_alloc(ctx->nb_surfaces * sizeof(int64_t));
-    if (!ctx->timestamps)
-        return AVERROR(ENOMEM);
-    ctx->pending = av_fifo_alloc(ctx->nb_surfaces * sizeof(ctx->out));
-    if (!ctx->pending)
-        return AVERROR(ENOMEM);
-    ctx->ready = av_fifo_alloc(ctx->nb_surfaces * sizeof(ctx->out));
-    if (!ctx->ready)
-        return AVERROR(ENOMEM);
-
-    for (i = 0; i < ctx->nb_surfaces; i++) {
-        if ((ret = nvenc_alloc_surface(avctx, i)) < 0)
-            return ret;
+    if (avctx->refs >= 0) {
+        /* 0 means "let the hardware decide" */
+        switch (avctx->codec->id) {
+        case AV_CODEC_ID_H264:
+            ctx->encode_config.encodeCodecConfig.h264Config.maxNumRefFrames = avctx->refs;
+            break;
+        case AV_CODEC_ID_H265:
+            ctx->encode_config.encodeCodecConfig.hevcConfig.maxNumRefFramesInDPB = avctx->refs;
+            break;
+        /* Earlier switch/case will return if unknown codec is passed. */
+        }
     }
 
-    return 0;
-}
+    if (avctx->gop_size > 0) {
+        if (avctx->max_b_frames >= 0) {
+            /* 0 is intra-only, 1 is I/P only, 2 is one B Frame, 3 two B frames, and so on. */
+            ctx->encode_config.frameIntervalP = avctx->max_b_frames + 1;
+        }
 
-#define EXTRADATA_SIZE 512
+        ctx->encode_config.gopLength = avctx->gop_size;
+        switch (avctx->codec->id) {
+        case AV_CODEC_ID_H264:
+            ctx->encode_config.encodeCodecConfig.h264Config.idrPeriod = avctx->gop_size;
+            ctx->encode_config.encodeCodecConfig.h264Config.hierarchicalPFrames = 1;
+            ctx->encode_config.encodeCodecConfig.h264Config.hierarchicalBFrames = 1;
+            break;
+        case AV_CODEC_ID_H265:
+            ctx->encode_config.encodeCodecConfig.hevcConfig.idrPeriod = avctx->gop_size;
+            break;
+        /* Earlier switch/case will return if unknown codec is passed. */
+        }
+    } else if (avctx->gop_size == 0) {
+        ctx->encode_config.frameIntervalP = 0;
+        ctx->encode_config.gopLength = 1;
+        switch (avctx->codec->id) {
+        case AV_CODEC_ID_H264:
+            ctx->encode_config.encodeCodecConfig.h264Config.idrPeriod = 1;
+            break;
+        case AV_CODEC_ID_H265:
+            ctx->encode_config.encodeCodecConfig.hevcConfig.idrPeriod = 1;
+            break;
+        /* Earlier switch/case will return if unknown codec is passed. */
+        }
+    }
 
-static int nvenc_setup_extradata(AVCodecContext *avctx)
-{
-    NVENCContext *ctx                     = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv       = &ctx->nvel.nvenc_funcs;
-    NV_ENC_SEQUENCE_PARAM_PAYLOAD payload = { 0 };
-    int ret;
+    /* when there're b frames, set dts offset */
+    if (ctx->encode_config.frameIntervalP >= 2)
+        ctx->last_dts = -2;
 
-    avctx->extradata = av_mallocz(EXTRADATA_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
-    if (!avctx->extradata)
-        return AVERROR(ENOMEM);
+    if (avctx->bit_rate > 0)
+        ctx->encode_config.rcParams.averageBitRate = avctx->bit_rate;
 
-    payload.version              = NV_ENC_SEQUENCE_PARAM_PAYLOAD_VER;
-    payload.spsppsBuffer         = avctx->extradata;
-    payload.inBufferSize         = EXTRADATA_SIZE;
-    payload.outSPSPPSPayloadSize = &avctx->extradata_size;
+    if (avctx->rc_max_rate > 0)
+        ctx->encode_config.rcParams.maxBitRate = avctx->rc_max_rate;
+
+    if (lossless) {
+        if (avctx->codec->id == AV_CODEC_ID_H264)
+            ctx->encode_config.encodeCodecConfig.h264Config.qpPrimeYZeroTransformBypassFlag = 1;
+
+        ctx->encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_CONSTQP;
+        ctx->encode_config.rcParams.constQP.qpInterB = 0;
+        ctx->encode_config.rcParams.constQP.qpInterP = 0;
+        ctx->encode_config.rcParams.constQP.qpIntra = 0;
+
+        avctx->qmin = -1;
+        avctx->qmax = -1;
+    } else if (ctx->cbr) {
+        if (!ctx->twopass) {
+            ctx->encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_CBR;
+        } else {
+            ctx->encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_2_PASS_QUALITY;
 
-    ret = nv->nvEncGetSequenceParams(ctx->nvenc_ctx, &payload);
-    if (ret != NV_ENC_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR, "Cannot get the extradata\n");
-        return AVERROR_UNKNOWN;
-    }
+            if (avctx->codec->id == AV_CODEC_ID_H264) {
+                ctx->encode_config.encodeCodecConfig.h264Config.adaptiveTransformMode = NV_ENC_H264_ADAPTIVE_TRANSFORM_ENABLE;
+                ctx->encode_config.encodeCodecConfig.h264Config.fmoMode = NV_ENC_H264_FMO_DISABLE;
+            }
+        }
+    } else if (avctx->global_quality > 0) {
+        ctx->encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_CONSTQP;
+        ctx->encode_config.rcParams.constQP.qpInterB = avctx->global_quality;
+        ctx->encode_config.rcParams.constQP.qpInterP = avctx->global_quality;
+        ctx->encode_config.rcParams.constQP.qpIntra = avctx->global_quality;
 
-    return 0;
-}
+        avctx->qmin = -1;
+        avctx->qmax = -1;
+    } else {
+        if (avctx->qmin >= 0 && avctx->qmax >= 0) {
+            ctx->encode_config.rcParams.enableMinQP = 1;
+            ctx->encode_config.rcParams.enableMaxQP = 1;
+
+            ctx->encode_config.rcParams.minQP.qpInterB = avctx->qmin;
+            ctx->encode_config.rcParams.minQP.qpInterP = avctx->qmin;
+            ctx->encode_config.rcParams.minQP.qpIntra = avctx->qmin;
+
+            ctx->encode_config.rcParams.maxQP.qpInterB = avctx->qmax;
+            ctx->encode_config.rcParams.maxQP.qpInterP = avctx->qmax;
+            ctx->encode_config.rcParams.maxQP.qpIntra = avctx->qmax;
+
+            qp_inter_p = (avctx->qmax + 3 * avctx->qmin) / 4; // biased towards Qmin
+
+            if (ctx->twopass) {
+                ctx->encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_2_PASS_VBR;
+                if (avctx->codec->id == AV_CODEC_ID_H264) {
+                    ctx->encode_config.encodeCodecConfig.h264Config.adaptiveTransformMode = NV_ENC_H264_ADAPTIVE_TRANSFORM_ENABLE;
+                    ctx->encode_config.encodeCodecConfig.h264Config.fmoMode = NV_ENC_H264_FMO_DISABLE;
+                }
+            } else {
+                ctx->encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_VBR_MINQP;
+            }
+        } else {
+            qp_inter_p = 26; // default to 26
 
-av_cold int ff_nvenc_encode_close(AVCodecContext *avctx)
-{
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    int i;
+            if (ctx->twopass) {
+                ctx->encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_2_PASS_VBR;
+            } else {
+                ctx->encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_VBR;
+            }
+        }
 
-    av_fifo_free(ctx->timestamps);
-    av_fifo_free(ctx->pending);
-    av_fifo_free(ctx->ready);
+        ctx->encode_config.rcParams.enableInitialRCQP = 1;
+        ctx->encode_config.rcParams.initialRCQP.qpInterP  = qp_inter_p;
 
-    if (ctx->in) {
-        for (i = 0; i < ctx->nb_surfaces; ++i) {
-            nv->nvEncDestroyInputBuffer(ctx->nvenc_ctx, ctx->in[i].in);
-            nv->nvEncDestroyBitstreamBuffer(ctx->nvenc_ctx, ctx->out[i].out);
+        if(avctx->i_quant_factor != 0.0 && avctx->b_quant_factor != 0.0) {
+            ctx->encode_config.rcParams.initialRCQP.qpIntra = qp_inter_p * fabs(avctx->i_quant_factor);
+            ctx->encode_config.rcParams.initialRCQP.qpIntra += qp_inter_p * avctx->i_quant_offset;
+            ctx->encode_config.rcParams.initialRCQP.qpInterB = qp_inter_p * fabs(avctx->b_quant_factor);
+            ctx->encode_config.rcParams.initialRCQP.qpInterB += qp_inter_p * avctx->b_quant_offset;
+        } else {
+            ctx->encode_config.rcParams.initialRCQP.qpIntra = qp_inter_p;
+            ctx->encode_config.rcParams.initialRCQP.qpInterB = qp_inter_p;
         }
     }
 
-    av_freep(&ctx->in);
-    av_freep(&ctx->out);
+    if (avctx->rc_buffer_size > 0)
+        ctx->encode_config.rcParams.vbvBufferSize = avctx->rc_buffer_size;
 
-    if (ctx->nvenc_ctx)
-        nv->nvEncDestroyEncoder(ctx->nvenc_ctx);
+    if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
+        ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FIELD;
+    } else {
+        ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
+    }
 
-    if (ctx->cu_context)
-        ctx->nvel.cu_ctx_destroy(ctx->cu_context);
+    switch (avctx->codec->id) {
+    case AV_CODEC_ID_H264:
+        ctx->encode_config.encodeCodecConfig.h264Config.h264VUIParameters.colourDescriptionPresentFlag = 1;
+        ctx->encode_config.encodeCodecConfig.h264Config.h264VUIParameters.videoSignalTypePresentFlag = 1;
+
+        ctx->encode_config.encodeCodecConfig.h264Config.h264VUIParameters.colourMatrix = avctx->colorspace;
+        ctx->encode_config.encodeCodecConfig.h264Config.h264VUIParameters.colourPrimaries = avctx->color_primaries;
+        ctx->encode_config.encodeCodecConfig.h264Config.h264VUIParameters.transferCharacteristics = avctx->color_trc;
+
+        ctx->encode_config.encodeCodecConfig.h264Config.h264VUIParameters.videoFullRangeFlag = avctx->color_range == AVCOL_RANGE_JPEG;
+
+        ctx->encode_config.encodeCodecConfig.h264Config.disableSPSPPS = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 1 : 0;
+        ctx->encode_config.encodeCodecConfig.h264Config.repeatSPSPPS = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 0 : 1;
+
+        if (!ctx->profile) {
+            switch (avctx->profile) {
+            case FF_PROFILE_H264_HIGH_444_PREDICTIVE:
+                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
+                break;
+            case FF_PROFILE_H264_BASELINE:
+                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
+                break;
+            case FF_PROFILE_H264_MAIN:
+                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_MAIN_GUID;
+                break;
+            case FF_PROFILE_H264_HIGH:
+            case FF_PROFILE_UNKNOWN:
+                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_GUID;
+                break;
+            default:
+                av_log(avctx, AV_LOG_WARNING, "Unsupported profile requested, falling back to high\n");
+                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_GUID;
+                break;
+            }
+        } else {
+            if (!strcmp(ctx->profile, "high")) {
+                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_GUID;
+                avctx->profile = FF_PROFILE_H264_HIGH;
+            } else if (!strcmp(ctx->profile, "main")) {
+                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_MAIN_GUID;
+                avctx->profile = FF_PROFILE_H264_MAIN;
+            } else if (!strcmp(ctx->profile, "baseline")) {
+                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
+                avctx->profile = FF_PROFILE_H264_BASELINE;
+            } else if (!strcmp(ctx->profile, "high444p")) {
+                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
+                avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
+            } else {
+                av_log(avctx, AV_LOG_FATAL, "Profile \"%s\" is unknown! Supported profiles: high, main, baseline\n", ctx->profile);
+                res = AVERROR(EINVAL);
+                goto error;
+            }
+        }
 
-    if (ctx->nvel.nvenc)
-        dlclose(ctx->nvel.nvenc);
+        ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = avctx->profile == FF_PROFILE_H264_HIGH_444_PREDICTIVE ? 3 : 1;
 
-    if (ctx->nvel.cuda)
-        dlclose(ctx->nvel.cuda);
+        if (ctx->level) {
+            res = input_string_to_uint32(avctx, nvenc_h264_level_pairs, ctx->level, &ctx->encode_config.encodeCodecConfig.h264Config.level);
 
-    return 0;
-}
+            if (res) {
+                av_log(avctx, AV_LOG_FATAL, "Level \"%s\" is unknown! Supported levels: auto, 1, 1b, 1.1, 1.2, 1.3, 2, 2.1, 2.2, 3, 3.1, 3.2, 4, 4.1, 4.2, 5, 5.1\n", ctx->level);
+                goto error;
+            }
+        } else {
+            ctx->encode_config.encodeCodecConfig.h264Config.level = NV_ENC_LEVEL_AUTOSELECT;
+        }
 
-av_cold int ff_nvenc_encode_init(AVCodecContext *avctx)
-{
-    int ret;
+        break;
+    case AV_CODEC_ID_H265:
+        ctx->encode_config.encodeCodecConfig.hevcConfig.disableSPSPPS = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 1 : 0;
+        ctx->encode_config.encodeCodecConfig.hevcConfig.repeatSPSPPS = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 0 : 1;
 
-    if ((ret = nvenc_load_libraries(avctx)) < 0)
-        return ret;
+        /* No other profile is supported in the current SDK version 5 */
+        ctx->encode_config.profileGUID = NV_ENC_HEVC_PROFILE_MAIN_GUID;
+        avctx->profile = FF_PROFILE_HEVC_MAIN;
 
-    if ((ret = nvenc_setup_device(avctx)) < 0)
-        return ret;
+        if (ctx->level) {
+            res = input_string_to_uint32(avctx, nvenc_hevc_level_pairs, ctx->level, &ctx->encode_config.encodeCodecConfig.hevcConfig.level);
 
-    if ((ret = nvenc_setup_encoder(avctx)) < 0)
-        return ret;
+            if (res) {
+                av_log(avctx, AV_LOG_FATAL, "Level \"%s\" is unknown! Supported levels: auto, 1, 2, 2.1, 3, 3.1, 4, 4.1, 5, 5.1, 5.2, 6, 6.1, 6.2\n", ctx->level);
+                goto error;
+            }
+        } else {
+            ctx->encode_config.encodeCodecConfig.hevcConfig.level = NV_ENC_LEVEL_AUTOSELECT;
+        }
 
-    if ((ret = nvenc_setup_surfaces(avctx)) < 0)
-        return ret;
+        if (ctx->tier) {
+            if (!strcmp(ctx->tier, "main")) {
+                ctx->encode_config.encodeCodecConfig.hevcConfig.tier = NV_ENC_TIER_HEVC_MAIN;
+            } else if (!strcmp(ctx->tier, "high")) {
+                ctx->encode_config.encodeCodecConfig.hevcConfig.tier = NV_ENC_TIER_HEVC_HIGH;
+            } else {
+                av_log(avctx, AV_LOG_FATAL, "Tier \"%s\" is unknown! Supported tiers: main, high\n", ctx->tier);
+                res = AVERROR(EINVAL);
+                goto error;
+            }
+        }
 
-    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
-        if ((ret = nvenc_setup_extradata(avctx)) < 0)
-            return ret;
+        break;
+    /* Earlier switch/case will return if unknown codec is passed. */
     }
 
-    return 0;
-}
+    nv_status = p_nvenc->nvEncInitializeEncoder(ctx->nvencoder, &ctx->init_encode_params);
+    if (nv_status != NV_ENC_SUCCESS) {
+        av_log(avctx, AV_LOG_FATAL, "InitializeEncoder failed: 0x%x\n", (int)nv_status);
+        res = AVERROR_EXTERNAL;
+        goto error;
+    }
 
-static NVENCInputSurface *get_input_surface(NVENCContext *ctx)
-{
-    int i;
+    ctx->input_surfaces = av_malloc(ctx->max_surface_count * sizeof(*ctx->input_surfaces));
 
-    for (i = 0; i < ctx->nb_surfaces; i++) {
-        if (!ctx->in[i].locked) {
-            ctx->in[i].locked = 1;
-            return &ctx->in[i];
-        }
+    if (!ctx->input_surfaces) {
+        res = AVERROR(ENOMEM);
+        goto error;
     }
 
-    return NULL;
-}
+    ctx->output_surfaces = av_malloc(ctx->max_surface_count * sizeof(*ctx->output_surfaces));
 
-static NVENCOutputSurface *get_output_surface(NVENCContext *ctx)
-{
-    int i;
-
-    for (i = 0; i < ctx->nb_surfaces; i++) {
-        if (!ctx->out[i].busy) {
-            return &ctx->out[i];
-        }
+    if (!ctx->output_surfaces) {
+        res = AVERROR(ENOMEM);
+        goto error;
     }
 
-    return NULL;
-}
+    for (surfaceCount = 0; surfaceCount < ctx->max_surface_count; ++surfaceCount) {
+        NV_ENC_CREATE_INPUT_BUFFER allocSurf = { 0 };
+        NV_ENC_CREATE_BITSTREAM_BUFFER allocOut = { 0 };
+        allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER;
+        allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
 
-static int nvenc_copy_frame(NV_ENC_LOCK_INPUT_BUFFER *in, const AVFrame *frame)
-{
-    uint8_t *buf = in->bufferDataPtr;
-    int off      = frame->height * in->pitch;
+        allocSurf.width = (avctx->width + 31) & ~31;
+        allocSurf.height = (avctx->height + 31) & ~31;
 
-    switch (frame->format) {
-    case AV_PIX_FMT_YUV420P:
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[0], frame->linesize[0],
-                            frame->width, frame->height);
-        buf += off;
+        allocSurf.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
 
-        av_image_copy_plane(buf, in->pitch >> 1,
-                            frame->data[2], frame->linesize[2],
-                            frame->width >> 1, frame->height >> 1);
+        switch (avctx->pix_fmt) {
+        case AV_PIX_FMT_YUV420P:
+            allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YV12_PL;
+            break;
 
-        buf += off >> 2;
+        case AV_PIX_FMT_NV12:
+            allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
+            break;
 
-        av_image_copy_plane(buf, in->pitch >> 1,
-                            frame->data[1], frame->linesize[1],
-                            frame->width >> 1, frame->height >> 1);
-        break;
-    case AV_PIX_FMT_NV12:
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[0], frame->linesize[0],
-                            frame->width, frame->height);
-        buf += off;
-
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[1], frame->linesize[1],
-                            frame->width, frame->height >> 1);
-        break;
-    case AV_PIX_FMT_YUV444P:
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[0], frame->linesize[0],
-                            frame->width, frame->height);
-        buf += off;
-
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[1], frame->linesize[1],
-                            frame->width, frame->height);
-        buf += off;
-
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[2], frame->linesize[2],
-                            frame->width, frame->height);
-        break;
-    default:
-        return AVERROR_BUG;
-    }
+        case AV_PIX_FMT_YUV444P:
+            allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YUV444_PL;
+            break;
 
-    return 0;
-}
+        default:
+            av_log(avctx, AV_LOG_FATAL, "Invalid input pixel format\n");
+            res = AVERROR(EINVAL);
+            goto error;
+        }
 
-static int nvenc_enqueue_frame(AVCodecContext *avctx, const AVFrame *frame,
-                               NVENCInputSurface **in_surf)
-{
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    NV_ENC_LOCK_INPUT_BUFFER params = { 0 };
-    NVENCInputSurface *in           = get_input_surface(ctx);
-    int ret;
+        nv_status = p_nvenc->nvEncCreateInputBuffer(ctx->nvencoder, &allocSurf);
+        if (nv_status != NV_ENC_SUCCESS) {
+            av_log(avctx, AV_LOG_FATAL, "CreateInputBuffer failed\n");
+            res = AVERROR_EXTERNAL;
+            goto error;
+        }
+
+        ctx->input_surfaces[surfaceCount].lockCount = 0;
+        ctx->input_surfaces[surfaceCount].input_surface = allocSurf.inputBuffer;
+        ctx->input_surfaces[surfaceCount].format = allocSurf.bufferFmt;
+        ctx->input_surfaces[surfaceCount].width = allocSurf.width;
+        ctx->input_surfaces[surfaceCount].height = allocSurf.height;
 
-    if (!in)
-        return AVERROR_BUG;
+        /* 1MB is large enough to hold most output frames. NVENC increases this automaticaly if it's not enough. */
+        allocOut.size = 1024 * 1024;
 
-    params.version     = NV_ENC_LOCK_INPUT_BUFFER_VER;
-    params.inputBuffer = in->in;
+        allocOut.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
 
+        nv_status = p_nvenc->nvEncCreateBitstreamBuffer(ctx->nvencoder, &allocOut);
+        if (nv_status != NV_ENC_SUCCESS) {
+            av_log(avctx, AV_LOG_FATAL, "CreateBitstreamBuffer failed\n");
+            ctx->output_surfaces[surfaceCount++].output_surface = NULL;
+            res = AVERROR_EXTERNAL;
+            goto error;
+        }
 
-    ret = nv->nvEncLockInputBuffer(ctx->nvenc_ctx, &params);
-    if (ret != NV_ENC_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR, "Cannot lock the buffer %p.\n",
-               in);
-        return AVERROR_UNKNOWN;
+        ctx->output_surfaces[surfaceCount].output_surface = allocOut.bitstreamBuffer;
+        ctx->output_surfaces[surfaceCount].size = allocOut.size;
+        ctx->output_surfaces[surfaceCount].busy = 0;
     }
 
-    ret = nvenc_copy_frame(&params, frame);
-    if (ret < 0)
-        goto fail;
+    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
+        uint32_t outSize = 0;
+        char tmpHeader[256];
+        NV_ENC_SEQUENCE_PARAM_PAYLOAD payload = { 0 };
+        payload.version = NV_ENC_SEQUENCE_PARAM_PAYLOAD_VER;
+
+        payload.spsppsBuffer = tmpHeader;
+        payload.inBufferSize = sizeof(tmpHeader);
+        payload.outSPSPPSPayloadSize = &outSize;
+
+        nv_status = p_nvenc->nvEncGetSequenceParams(ctx->nvencoder, &payload);
+        if (nv_status != NV_ENC_SUCCESS) {
+            av_log(avctx, AV_LOG_FATAL, "GetSequenceParams failed\n");
+            goto error;
+        }
+
+        avctx->extradata_size = outSize;
+        avctx->extradata = av_mallocz(outSize + AV_INPUT_BUFFER_PADDING_SIZE);
+
+        if (!avctx->extradata) {
+            res = AVERROR(ENOMEM);
+            goto error;
+        }
 
-    ret = nv->nvEncUnlockInputBuffer(ctx->nvenc_ctx, in->in);
-    if (ret != NV_ENC_SUCCESS) {
-        av_log(avctx, AV_LOG_ERROR, "Cannot unlock the buffer %p.\n",
-               in);
-        return AVERROR_UNKNOWN;
+        memcpy(avctx->extradata, tmpHeader, outSize);
     }
 
-    *in_surf = in;
+    if (ctx->encode_config.frameIntervalP > 1)
+        avctx->has_b_frames = 2;
+
+    if (ctx->encode_config.rcParams.averageBitRate > 0)
+        avctx->bit_rate = ctx->encode_config.rcParams.averageBitRate;
 
     return 0;
 
-fail:
-    nv->nvEncUnlockInputBuffer(ctx->nvenc_ctx, in->in);
+error:
 
-    return ret;
-}
+    for (i = 0; i < surfaceCount; ++i) {
+        p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface);
+        if (ctx->output_surfaces[i].output_surface)
+            p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface);
+    }
 
-static void nvenc_codec_specific_pic_params(AVCodecContext *avctx,
-                                            NV_ENC_PIC_PARAMS *params)
-{
-    NVENCContext *ctx = avctx->priv_data;
+    if (ctx->nvencoder)
+        p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
 
-    switch (avctx->codec->id) {
-    case AV_CODEC_ID_H264:
-        params->codecPicParams.h264PicParams.sliceMode =
-            ctx->config.encodeCodecConfig.h264Config.sliceMode;
-        params->codecPicParams.h264PicParams.sliceModeData =
-            ctx->config.encodeCodecConfig.h264Config.sliceModeData;
-        break;
-    case AV_CODEC_ID_HEVC:
-        params->codecPicParams.hevcPicParams.sliceMode =
-            ctx->config.encodeCodecConfig.hevcConfig.sliceMode;
-        params->codecPicParams.hevcPicParams.sliceModeData =
-            ctx->config.encodeCodecConfig.hevcConfig.sliceModeData;
-        break;
-    }
-}
+    if (ctx->cu_context)
+        dl_fn->cu_ctx_destroy(ctx->cu_context);
 
-static inline int nvenc_enqueue_timestamp(AVFifoBuffer *f, int64_t pts)
-{
-    return av_fifo_generic_write(f, &pts, sizeof(pts), NULL);
-}
+    nvenc_unload_nvenc(avctx);
 
-static inline int nvenc_dequeue_timestamp(AVFifoBuffer *f, int64_t *pts)
-{
-    return av_fifo_generic_read(f, pts, sizeof(*pts), NULL);
-}
+    ctx->nvencoder = NULL;
+    ctx->cu_context = NULL;
 
-static inline int nvenc_enqueue_surface(AVFifoBuffer *f,
-                                        NVENCOutputSurface *surf)
-{
-    surf->busy = 1;
-    return av_fifo_generic_write(f, &surf, sizeof(surf), NULL);
+    return res;
 }
 
-static inline int nvenc_dequeue_surface(AVFifoBuffer *f,
-                                        NVENCOutputSurface **surf)
+static av_cold int nvenc_encode_close(AVCodecContext *avctx)
 {
-    return av_fifo_generic_read(f, surf, sizeof(*surf), NULL);
-}
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+    int i;
 
-static int nvenc_set_timestamp(NVENCContext *ctx,
-                               NV_ENC_LOCK_BITSTREAM *params,
-                               AVPacket *pkt)
-{
-    pkt->pts      = params->outputTimeStamp;
-    pkt->duration = params->outputDuration;
+    av_freep(&ctx->timestamp_list.data);
+    av_freep(&ctx->output_surface_ready_queue.data);
+    av_freep(&ctx->output_surface_queue.data);
+
+    for (i = 0; i < ctx->max_surface_count; ++i) {
+        p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface);
+        p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface);
+    }
+    ctx->max_surface_count = 0;
+
+    p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
+    ctx->nvencoder = NULL;
+
+    dl_fn->cu_ctx_destroy(ctx->cu_context);
+    ctx->cu_context = NULL;
+
+    nvenc_unload_nvenc(avctx);
 
-    return nvenc_dequeue_timestamp(ctx->timestamps, &pkt->dts);
+    return 0;
 }
 
-static int nvenc_get_frame(AVCodecContext *avctx, AVPacket *pkt)
+static int process_output_surface(AVCodecContext *avctx, AVPacket *pkt, NvencOutputSurface *tmpoutsurf)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    NV_ENC_LOCK_BITSTREAM params    = { 0 };
-    NVENCOutputSurface *out         = NULL;
-    int ret;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
 
-    ret = nvenc_dequeue_surface(ctx->pending, &out);
-    if (ret)
-        return ret;
+    uint32_t slice_mode_data;
+    uint32_t *slice_offsets;
+    NV_ENC_LOCK_BITSTREAM lock_params = { 0 };
+    NVENCSTATUS nv_status;
+    int res = 0;
+
+    switch (avctx->codec->id) {
+    case AV_CODEC_ID_H264:
+      slice_mode_data = ctx->encode_config.encodeCodecConfig.h264Config.sliceModeData;
+      break;
+    case AV_CODEC_ID_H265:
+      slice_mode_data = ctx->encode_config.encodeCodecConfig.hevcConfig.sliceModeData;
+      break;
+    default:
+      av_log(avctx, AV_LOG_ERROR, "Unknown codec name\n");
+      res = AVERROR(EINVAL);
+      goto error;
+    }
+    slice_offsets = av_mallocz(slice_mode_data * sizeof(*slice_offsets));
 
-    params.version         = NV_ENC_LOCK_BITSTREAM_VER;
-    params.outputBitstream = out->out;
+    if (!slice_offsets)
+        return AVERROR(ENOMEM);
 
-    ret = nv->nvEncLockBitstream(ctx->nvenc_ctx, &params);
-    if (ret < 0)
-        return AVERROR_UNKNOWN;
+    lock_params.version = NV_ENC_LOCK_BITSTREAM_VER;
 
-    ret = ff_alloc_packet(pkt, params.bitstreamSizeInBytes);
-    if (ret < 0)
-        return ret;
+    lock_params.doNotWait = 0;
+    lock_params.outputBitstream = tmpoutsurf->output_surface;
+    lock_params.sliceOffsets = slice_offsets;
 
-    memcpy(pkt->data, params.bitstreamBufferPtr, pkt->size);
+    nv_status = p_nvenc->nvEncLockBitstream(ctx->nvencoder, &lock_params);
+    if (nv_status != NV_ENC_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "Failed locking bitstream buffer\n");
+        res = AVERROR_EXTERNAL;
+        goto error;
+    }
 
-    ret = nv->nvEncUnlockBitstream(ctx->nvenc_ctx, out->out);
-    if (ret < 0)
-        return AVERROR_UNKNOWN;
+    if (res = ff_alloc_packet2(avctx, pkt, lock_params.bitstreamSizeInBytes,0)) {
+        p_nvenc->nvEncUnlockBitstream(ctx->nvencoder, tmpoutsurf->output_surface);
+        goto error;
+    }
 
-    out->busy = out->in->locked = 0;
+    memcpy(pkt->data, lock_params.bitstreamBufferPtr, lock_params.bitstreamSizeInBytes);
 
-    ret = nvenc_set_timestamp(ctx, &params, pkt);
-    if (ret < 0)
-        return ret;
+    nv_status = p_nvenc->nvEncUnlockBitstream(ctx->nvencoder, tmpoutsurf->output_surface);
+    if (nv_status != NV_ENC_SUCCESS)
+        av_log(avctx, AV_LOG_ERROR, "Failed unlocking bitstream buffer, expect the gates of mordor to open\n");
 
-    switch (params.pictureType) {
+    switch (lock_params.pictureType) {
     case NV_ENC_PIC_TYPE_IDR:
         pkt->flags |= AV_PKT_FLAG_KEY;
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
-    case NV_ENC_PIC_TYPE_INTRA_REFRESH:
     case NV_ENC_PIC_TYPE_I:
         avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
         break;
@@ -1056,79 +1201,240 @@ FF_DISABLE_DEPRECATION_WARNINGS
     case NV_ENC_PIC_TYPE_BI:
         avctx->coded_frame->pict_type = AV_PICTURE_TYPE_BI;
         break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unknown picture type encountered, expect the output to be broken.\n");
+        av_log(avctx, AV_LOG_ERROR, "Please report this error and include as much information on how to reproduce it as possible.\n");
+        res = AVERROR_EXTERNAL;
+        goto error;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
     }
 
+    pkt->pts = lock_params.outputTimeStamp;
+    pkt->dts = timestamp_queue_dequeue(&ctx->timestamp_list);
+
+    /* when there're b frame(s), set dts offset */
+    if (ctx->encode_config.frameIntervalP >= 2)
+        pkt->dts -= 1;
+
+    if (pkt->dts > pkt->pts)
+        pkt->dts = pkt->pts;
+
+    if (ctx->last_dts != AV_NOPTS_VALUE && pkt->dts <= ctx->last_dts)
+        pkt->dts = ctx->last_dts + 1;
+
+    ctx->last_dts = pkt->dts;
+
+    av_free(slice_offsets);
+
     return 0;
+
+error:
+
+    av_free(slice_offsets);
+    timestamp_queue_dequeue(&ctx->timestamp_list);
+
+    return res;
 }
 
-int ff_nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                          const AVFrame *frame, int *got_packet)
+static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+    const AVFrame *frame, int *got_packet)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    NV_ENC_PIC_PARAMS params        = { 0 };
-    NVENCInputSurface *in           = NULL;
-    NVENCOutputSurface *out         = NULL;
-    int ret;
+    NVENCSTATUS nv_status;
+    NvencOutputSurface *tmpoutsurf;
+    int res, i = 0;
+
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
 
-    params.version = NV_ENC_PIC_PARAMS_VER;
+    NV_ENC_PIC_PARAMS pic_params = { 0 };
+    pic_params.version = NV_ENC_PIC_PARAMS_VER;
 
     if (frame) {
-        ret = nvenc_enqueue_frame(avctx, frame, &in);
-        if (ret < 0)
-            return ret;
-        out = get_output_surface(ctx);
-        if (!out)
-            return AVERROR_BUG;
-
-        out->in = in;
-
-        params.inputBuffer     = in->in;
-        params.bufferFmt       = in->format;
-        params.inputWidth      = frame->width;
-        params.inputHeight     = frame->height;
-        params.outputBitstream = out->out;
-        params.inputTimeStamp  = frame->pts;
+        NV_ENC_LOCK_INPUT_BUFFER lockBufferParams = { 0 };
+        NvencInputSurface *inSurf = NULL;
+
+        for (i = 0; i < ctx->max_surface_count; ++i) {
+            if (!ctx->input_surfaces[i].lockCount) {
+                inSurf = &ctx->input_surfaces[i];
+                break;
+            }
+        }
+
+        av_assert0(inSurf);
+
+        inSurf->lockCount = 1;
+
+        lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
+        lockBufferParams.inputBuffer = inSurf->input_surface;
+
+        nv_status = p_nvenc->nvEncLockInputBuffer(ctx->nvencoder, &lockBufferParams);
+        if (nv_status != NV_ENC_SUCCESS) {
+            av_log(avctx, AV_LOG_ERROR, "Failed locking nvenc input buffer\n");
+            return 0;
+        }
+
+        if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) {
+            uint8_t *buf = lockBufferParams.bufferDataPtr;
+
+            av_image_copy_plane(buf, lockBufferParams.pitch,
+                frame->data[0], frame->linesize[0],
+                avctx->width, avctx->height);
+
+            buf += inSurf->height * lockBufferParams.pitch;
+
+            av_image_copy_plane(buf, lockBufferParams.pitch >> 1,
+                frame->data[2], frame->linesize[2],
+                avctx->width >> 1, avctx->height >> 1);
+
+            buf += (inSurf->height * lockBufferParams.pitch) >> 2;
+
+            av_image_copy_plane(buf, lockBufferParams.pitch >> 1,
+                frame->data[1], frame->linesize[1],
+                avctx->width >> 1, avctx->height >> 1);
+        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
+            uint8_t *buf = lockBufferParams.bufferDataPtr;
+
+            av_image_copy_plane(buf, lockBufferParams.pitch,
+                frame->data[0], frame->linesize[0],
+                avctx->width, avctx->height);
+
+            buf += inSurf->height * lockBufferParams.pitch;
+
+            av_image_copy_plane(buf, lockBufferParams.pitch,
+                frame->data[1], frame->linesize[1],
+                avctx->width, avctx->height >> 1);
+        } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
+            uint8_t *buf = lockBufferParams.bufferDataPtr;
+
+            av_image_copy_plane(buf, lockBufferParams.pitch,
+                frame->data[0], frame->linesize[0],
+                avctx->width, avctx->height);
+
+            buf += inSurf->height * lockBufferParams.pitch;
+
+            av_image_copy_plane(buf, lockBufferParams.pitch,
+                frame->data[1], frame->linesize[1],
+                avctx->width, avctx->height);
+
+            buf += inSurf->height * lockBufferParams.pitch;
+
+            av_image_copy_plane(buf, lockBufferParams.pitch,
+                frame->data[2], frame->linesize[2],
+                avctx->width, avctx->height);
+        } else {
+            av_log(avctx, AV_LOG_FATAL, "Invalid pixel format!\n");
+            return AVERROR(EINVAL);
+        }
+
+        nv_status = p_nvenc->nvEncUnlockInputBuffer(ctx->nvencoder, inSurf->input_surface);
+        if (nv_status != NV_ENC_SUCCESS) {
+            av_log(avctx, AV_LOG_FATAL, "Failed unlocking input buffer!\n");
+            return AVERROR_EXTERNAL;
+        }
+
+        for (i = 0; i < ctx->max_surface_count; ++i)
+            if (!ctx->output_surfaces[i].busy)
+                break;
+
+        if (i == ctx->max_surface_count) {
+            inSurf->lockCount = 0;
+            av_log(avctx, AV_LOG_FATAL, "No free output surface found!\n");
+            return AVERROR_EXTERNAL;
+        }
+
+        ctx->output_surfaces[i].input_surface = inSurf;
+
+        pic_params.inputBuffer = inSurf->input_surface;
+        pic_params.bufferFmt = inSurf->format;
+        pic_params.inputWidth = avctx->width;
+        pic_params.inputHeight = avctx->height;
+        pic_params.outputBitstream = ctx->output_surfaces[i].output_surface;
+        pic_params.completionEvent = 0;
 
         if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
-            if (frame->top_field_first)
-                params.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_TOP_BOTTOM;
-            else
-                params.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_BOTTOM_TOP;
+            if (frame->top_field_first) {
+                pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_TOP_BOTTOM;
+            } else {
+                pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_BOTTOM_TOP;
+            }
         } else {
-            params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
+            pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
         }
 
-        nvenc_codec_specific_pic_params(avctx, &params);
+        pic_params.encodePicFlags = 0;
+        pic_params.inputTimeStamp = frame->pts;
+        pic_params.inputDuration = 0;
+        switch (avctx->codec->id) {
+        case AV_CODEC_ID_H264:
+          pic_params.codecPicParams.h264PicParams.sliceMode = ctx->encode_config.encodeCodecConfig.h264Config.sliceMode;
+          pic_params.codecPicParams.h264PicParams.sliceModeData = ctx->encode_config.encodeCodecConfig.h264Config.sliceModeData;
+          break;
+        case AV_CODEC_ID_H265:
+          pic_params.codecPicParams.hevcPicParams.sliceMode = ctx->encode_config.encodeCodecConfig.hevcConfig.sliceMode;
+          pic_params.codecPicParams.hevcPicParams.sliceModeData = ctx->encode_config.encodeCodecConfig.hevcConfig.sliceModeData;
+          break;
+        default:
+          av_log(avctx, AV_LOG_ERROR, "Unknown codec name\n");
+          return AVERROR(EINVAL);
+        }
+
+        res = timestamp_queue_enqueue(&ctx->timestamp_list, frame->pts);
 
-        ret = nvenc_enqueue_timestamp(ctx->timestamps, frame->pts);
-        if (ret < 0)
-            return ret;
+        if (res)
+            return res;
     } else {
-        params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
+        pic_params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
     }
 
-    ret = nv->nvEncEncodePicture(ctx->nvenc_ctx, &params);
+    nv_status = p_nvenc->nvEncEncodePicture(ctx->nvencoder, &pic_params);
+
+    if (frame && nv_status == NV_ENC_ERR_NEED_MORE_INPUT) {
+        res = out_surf_queue_enqueue(&ctx->output_surface_queue, &ctx->output_surfaces[i]);
 
-    if (ret != NV_ENC_SUCCESS &&
-        ret != NV_ENC_ERR_NEED_MORE_INPUT) {
+        if (res)
+            return res;
 
-        return AVERROR_UNKNOWN;
+        ctx->output_surfaces[i].busy = 1;
     }
 
-    if (out) {
-        ret = nvenc_enqueue_surface(ctx->pending, out);
-        if (ret < 0)
-            return ret;
+    if (nv_status != NV_ENC_SUCCESS && nv_status != NV_ENC_ERR_NEED_MORE_INPUT) {
+        av_log(avctx, AV_LOG_ERROR, "EncodePicture failed!\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    if (nv_status != NV_ENC_ERR_NEED_MORE_INPUT) {
+        while (ctx->output_surface_queue.count) {
+            tmpoutsurf = out_surf_queue_dequeue(&ctx->output_surface_queue);
+            res = out_surf_queue_enqueue(&ctx->output_surface_ready_queue, tmpoutsurf);
+
+            if (res)
+                return res;
+        }
+
+        if (frame) {
+            res = out_surf_queue_enqueue(&ctx->output_surface_ready_queue, &ctx->output_surfaces[i]);
+
+            if (res)
+                return res;
+
+            ctx->output_surfaces[i].busy = 1;
+        }
     }
 
-    if (ret != NV_ENC_ERR_NEED_MORE_INPUT &&
-        av_fifo_size(ctx->pending)) {
-        ret = nvenc_get_frame(avctx, pkt);
-        if (ret < 0)
-            return ret;
+    if (ctx->output_surface_ready_queue.count && (!frame || ctx->output_surface_ready_queue.count + ctx->output_surface_queue.count >= ctx->buffer_delay)) {
+        tmpoutsurf = out_surf_queue_dequeue(&ctx->output_surface_ready_queue);
+
+        res = process_output_surface(avctx, pkt, tmpoutsurf);
+
+        if (res)
+            return res;
+
+        tmpoutsurf->busy = 0;
+        av_assert0(tmpoutsurf->input_surface->lockCount);
+        tmpoutsurf->input_surface->lockCount--;
+
         *got_packet = 1;
     } else {
         *got_packet = 0;
@@ -1136,3 +1442,107 @@ int ff_nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     return 0;
 }
+
+static const enum AVPixelFormat pix_fmts_nvenc[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_NONE
+};
+
+#define OFFSET(x) offsetof(NvencContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "preset", "Set the encoding preset (one of slow = hq 2pass, medium = hq, fast = hp, hq, hp, bd, ll, llhq, llhp, default)", OFFSET(preset), AV_OPT_TYPE_STRING, { .str = "hq" }, 0, 0, VE },
+    { "profile", "Set the encoding profile (high, main or baseline)", OFFSET(profile), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
+    { "level", "Set the encoding level restriction (auto, 1.0, 1.0b, 1.1, 1.2, ..., 4.2, 5.0, 5.1)", OFFSET(level), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
+    { "tier", "Set the encoding tier (main or high)", OFFSET(tier), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
+    { "cbr", "Use cbr encoding mode", OFFSET(cbr), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "2pass", "Use 2pass encoding mode", OFFSET(twopass), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VE },
+    { "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+    { "delay", "Delays frame output by the given amount of frames.", OFFSET(buffer_delay), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
+    { NULL }
+};
+
+static const AVCodecDefault nvenc_defaults[] = {
+    { "b", "0" },
+    { "qmin", "-1" },
+    { "qmax", "-1" },
+    { "qdiff", "-1" },
+    { "qblur", "-1" },
+    { "qcomp", "-1" },
+    { NULL },
+};
+
+#if CONFIG_NVENC_ENCODER
+static const AVClass nvenc_class = {
+    .class_name = "nvenc",
+    .item_name = av_default_item_name,
+    .option = options,
+    .version = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_nvenc_encoder = {
+    .name = "nvenc",
+    .long_name = NULL_IF_CONFIG_SMALL("NVIDIA NVENC h264 encoder"),
+    .type = AVMEDIA_TYPE_VIDEO,
+    .id = AV_CODEC_ID_H264,
+    .priv_data_size = sizeof(NvencContext),
+    .init = nvenc_encode_init,
+    .encode2 = nvenc_encode_frame,
+    .close = nvenc_encode_close,
+    .capabilities = AV_CODEC_CAP_DELAY,
+    .priv_class = &nvenc_class,
+    .defaults = nvenc_defaults,
+    .pix_fmts = pix_fmts_nvenc,
+};
+#endif
+
+/* Add an alias for nvenc_h264 */
+#if CONFIG_NVENC_H264_ENCODER
+static const AVClass nvenc_h264_class = {
+    .class_name = "nvenc_h264",
+    .item_name = av_default_item_name,
+    .option = options,
+    .version = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_nvenc_h264_encoder = {
+    .name = "nvenc_h264",
+    .long_name = NULL_IF_CONFIG_SMALL("NVIDIA NVENC h264 encoder"),
+    .type = AVMEDIA_TYPE_VIDEO,
+    .id = AV_CODEC_ID_H264,
+    .priv_data_size = sizeof(NvencContext),
+    .init = nvenc_encode_init,
+    .encode2 = nvenc_encode_frame,
+    .close = nvenc_encode_close,
+    .capabilities = AV_CODEC_CAP_DELAY,
+    .priv_class = &nvenc_h264_class,
+    .defaults = nvenc_defaults,
+    .pix_fmts = pix_fmts_nvenc,
+};
+#endif
+
+#if CONFIG_NVENC_HEVC_ENCODER
+static const AVClass nvenc_hevc_class = {
+    .class_name = "nvenc_hevc",
+    .item_name = av_default_item_name,
+    .option = options,
+    .version = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_nvenc_hevc_encoder = {
+    .name = "nvenc_hevc",
+    .long_name = NULL_IF_CONFIG_SMALL("NVIDIA NVENC hevc encoder"),
+    .type = AVMEDIA_TYPE_VIDEO,
+    .id = AV_CODEC_ID_H265,
+    .priv_data_size = sizeof(NvencContext),
+    .init = nvenc_encode_init,
+    .encode2 = nvenc_encode_frame,
+    .close = nvenc_encode_close,
+    .capabilities = AV_CODEC_CAP_DELAY,
+    .priv_class = &nvenc_hevc_class,
+    .defaults = nvenc_defaults,
+    .pix_fmts = pix_fmts_nvenc,
+};
+#endif
diff --git a/libavcodec/nvenc.h b/libavcodec/nvenc.h
deleted file mode 100644
index 8819b3ca32..0000000000
--- a/libavcodec/nvenc.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_NVENC_H
-#define AVCODEC_NVENC_H
-
-#include <cuda.h>
-#include <nvEncodeAPI.h>
-
-#include "libavutil/fifo.h"
-#include "libavutil/opt.h"
-
-#include "avcodec.h"
-
-typedef struct NVENCInputSurface {
-    NV_ENC_INPUT_PTR in;
-    NV_ENC_BUFFER_FORMAT format;
-    int locked;
-} NVENCInputSurface;
-
-typedef struct NVENCOutputSurface {
-    NV_ENC_OUTPUT_PTR out;
-    NVENCInputSurface *in;
-    int busy;
-} NVENCOutputSurface;
-
-typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
-typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count);
-typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal);
-typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx);
-typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx);
-
-typedef NVENCSTATUS (NVENCAPI *PNVENCODEAPICREATEINSTANCE)(NV_ENCODE_API_FUNCTION_LIST *functionList);
-
-typedef struct NVENCLibraryContext
-{
-    void *cuda;
-    void *nvenc;
-
-    PCUINIT cu_init;
-    PCUDEVICEGETCOUNT cu_device_get_count;
-    PCUDEVICEGET cu_device_get;
-    PCUDEVICEGETNAME cu_device_get_name;
-    PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability;
-    PCUCTXCREATE cu_ctx_create;
-    PCUCTXPOPCURRENT cu_ctx_pop_current;
-    PCUCTXDESTROY cu_ctx_destroy;
-
-    NV_ENCODE_API_FUNCTION_LIST nvenc_funcs;
-} NVENCLibraryContext;
-
-enum {
-    PRESET_DEFAULT,
-    PRESET_HP,
-    PRESET_HQ,
-    PRESET_BD ,
-    PRESET_LOW_LATENCY_DEFAULT ,
-    PRESET_LOW_LATENCY_HQ ,
-    PRESET_LOW_LATENCY_HP,
-    PRESET_LOSSLESS_DEFAULT,
-    PRESET_LOSSLESS_HP,
-};
-
-enum {
-    NV_ENC_H264_PROFILE_BASELINE,
-    NV_ENC_H264_PROFILE_MAIN,
-    NV_ENC_H264_PROFILE_HIGH,
-    NV_ENC_H264_PROFILE_HIGH_444,
-    NV_ENC_H264_PROFILE_CONSTRAINED_HIGH,
-};
-
-enum {
-    NVENC_LOWLATENCY = 1,
-    NVENC_LOSSLESS,
-};
-
-enum {
-    LIST_DEVICES = -2,
-    ANY_DEVICE,
-};
-
-typedef struct NVENCContext {
-    AVClass *class;
-    NVENCLibraryContext nvel;
-
-    NV_ENC_INITIALIZE_PARAMS params;
-    NV_ENC_CONFIG config;
-
-    CUcontext cu_context;
-
-    int nb_surfaces;
-    NVENCInputSurface *in;
-    NVENCOutputSurface *out;
-    AVFifoBuffer *timestamps;
-    AVFifoBuffer *pending, *ready;
-
-    int64_t last_dts;
-
-    void *nvenc_ctx;
-
-    int preset;
-    int profile;
-    int level;
-    int tier;
-    int rc;
-    int device;
-    int flags;
-} NVENCContext;
-
-int ff_nvenc_encode_init(AVCodecContext *avctx);
-
-int ff_nvenc_encode_close(AVCodecContext *avctx);
-
-int ff_nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                          const AVFrame *frame, int *got_packet);
-
-#endif /* AVCODEC_NVENC_H */
diff --git a/libavcodec/nvenc_h264.c b/libavcodec/nvenc_h264.c
deleted file mode 100644
index 52a2a6798f..0000000000
--- a/libavcodec/nvenc_h264.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/internal.h"
-#include "libavutil/opt.h"
-
-#include "avcodec.h"
-#include "internal.h"
-
-#include "nvenc.h"
-
-#define OFFSET(x) offsetof(NVENCContext, x)
-#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
-static const AVOption options[] = {
-    { "preset",   "Set the encoding preset",              OFFSET(preset),      AV_OPT_TYPE_INT,    { .i64 = PRESET_HQ }, PRESET_DEFAULT, PRESET_LOSSLESS_HP, VE, "preset" },
-    { "default",    "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_DEFAULT }, 0, 0, VE, "preset" },
-    { "hp",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_HP }, 0, 0, VE, "preset" },
-    { "hq",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_HQ }, 0, 0, VE, "preset" },
-    { "bd",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_BD }, 0, 0, VE, "preset" },
-    { "ll",         "low latency",                        0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_DEFAULT }, 0, 0, VE, "preset" },
-    { "llhq",       "low latency hq",                     0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_HQ }, 0, 0, VE, "preset" },
-    { "llhp",       "low latency hp",                     0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_HP }, 0, 0, VE, "preset" },
-    { "profile",  "Set the encoding profile",             OFFSET(profile),     AV_OPT_TYPE_INT,    { .i64 = NV_ENC_H264_PROFILE_HIGH }, NV_ENC_H264_PROFILE_BASELINE, NV_ENC_H264_PROFILE_CONSTRAINED_HIGH, VE, "profile" },
-    { "baseline", "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_BASELINE },            0, 0, VE, "profile" },
-    { "main",     "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_MAIN },                0, 0, VE, "profile" },
-    { "high",     "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_HIGH },                0, 0, VE, "profile" },
-    { "high_444", "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_HIGH_444 },            0, 0, VE, "profile" },
-    { "constrained_high", "",                             0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_CONSTRAINED_HIGH },    0, 0, VE, "profile" },
-    { "level",    "Set the encoding level restriction",   OFFSET(level),       AV_OPT_TYPE_INT,    { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_H264_51, VE, "level" },
-    { "1.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1 },  0, 0, VE,  "level" },
-    { "1.b",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1b }, 0, 0, VE,  "level" },
-    { "1.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_11 }, 0, 0, VE,  "level" },
-    { "1.2",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_12 }, 0, 0, VE,  "level" },
-    { "1.3",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_13 }, 0, 0, VE,  "level" },
-    { "2.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_2 },  0, 0, VE,  "level" },
-    { "2.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_21 }, 0, 0, VE,  "level" },
-    { "2.2",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_22 }, 0, 0, VE,  "level" },
-    { "3.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_3 },  0, 0, VE,  "level" },
-    { "3.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_31 }, 0, 0, VE,  "level" },
-    { "3.2",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_32 }, 0, 0, VE,  "level" },
-    { "4.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_4 },  0, 0, VE,  "level" },
-    { "4.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_41 }, 0, 0, VE,  "level" },
-    { "4.2",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_42 }, 0, 0, VE,  "level" },
-    { "5.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_5 },  0, 0, VE,  "level" },
-    { "5.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_51 }, 0, 0, VE,  "level" },
-    { "rc",       "Override the preset rate-control",     OFFSET(rc),          AV_OPT_TYPE_INT,    { .i64 = -1 },                   -1, 0, VE },
-    { "constqp",          "Constant QP mode",                                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_CONSTQP },              0, 0, VE, "rc" },
-    { "vbr",              "Variable bitrate mode",                                                       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_VBR },                  0, 0, VE, "rc" },
-    { "cbr",              "Constant bitrate mode",                                                       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_CBR },                  0, 0, VE, "rc" },
-    { "vbr_minqp",        "Variable bitrate mode with MinQP",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_VBR_MINQP },            0, 0, VE, "rc" },
-    { "ll_2pass_quality", "Multi-pass optimized for image quality (only for low-latency presets)",       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_QUALITY },       0, 0, VE, "rc" },
-    { "ll_2pass_size",    "Multi-pass optimized for constant frame size (only for low-latency presets)", 0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP }, 0, 0, VE, "rc" },
-    { "vbr_2pass",        "Multi-pass variable bitrate mode",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_VBR },           0, 0, VE, "rc" },
-    { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 32 },                   0, INT_MAX, VE },
-    { "device",   "Select a specific NVENC device",       OFFSET(device),      AV_OPT_TYPE_INT,    { .i64 = -1 },                   -2, INT_MAX, VE, "device" },
-    { "any",      "Pick the first device available",      0,                   AV_OPT_TYPE_CONST,  { .i64 = ANY_DEVICE },           0, 0, VE, "device" },
-    { "list",     "List the available devices",           0,                   AV_OPT_TYPE_CONST,  { .i64 = LIST_DEVICES },         0, 0, VE, "device" },
-    { NULL }
-};
-
-static const AVClass nvenc_hevc_class = {
-    .class_name = "nvenc_h264",
-    .item_name = av_default_item_name,
-    .option = options,
-    .version = LIBAVUTIL_VERSION_INT,
-};
-
-static const AVCodecDefault defaults[] = {
-    { "b", "0" },
-    { "qmin", "-1" },
-    { "qmax", "-1" },
-    { "qdiff", "-1" },
-    { "qblur", "-1" },
-    { "qcomp", "-1" },
-    { NULL },
-};
-
-AVCodec ff_h264_nvenc_encoder = {
-    .name           = "nvenc_h264",
-    .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC H264 encoder"),
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_H264,
-    .init           = ff_nvenc_encode_init,
-    .encode2        = ff_nvenc_encode_frame,
-    .close          = ff_nvenc_encode_close,
-    .priv_data_size = sizeof(NVENCContext),
-    .priv_class     = &nvenc_hevc_class,
-    .defaults       = defaults,
-    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
-                                                    AV_PIX_FMT_YUV420P,
-                                                    AV_PIX_FMT_YUV444P,
-                                                    AV_PIX_FMT_NONE },
-    .capabilities   = AV_CODEC_CAP_DELAY,
-    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
-};
diff --git a/libavcodec/nvenc_hevc.c b/libavcodec/nvenc_hevc.c
deleted file mode 100644
index 58bde7731e..0000000000
--- a/libavcodec/nvenc_hevc.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/internal.h"
-#include "libavutil/opt.h"
-
-#include "avcodec.h"
-#include "internal.h"
-
-#include "nvenc.h"
-
-#define OFFSET(x) offsetof(NVENCContext, x)
-#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
-static const AVOption options[] = {
-    { "preset",   "Set the encoding preset",              OFFSET(preset),      AV_OPT_TYPE_INT,    { .i64 = PRESET_HQ }, PRESET_DEFAULT, PRESET_LOSSLESS_HP, VE, "preset" },
-    { "default",    "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_DEFAULT }, 0, 0, VE, "preset" },
-    { "hp",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_HP }, 0, 0, VE, "preset" },
-    { "hq",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_HQ }, 0, 0, VE, "preset" },
-    { "bd",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_BD }, 0, 0, VE, "preset" },
-    { "ll",         "low latency",                        0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_DEFAULT }, 0, 0, VE, "preset" },
-    { "llhq",       "low latency hq",                     0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_HQ }, 0, 0, VE, "preset" },
-    { "llhp",       "low latency hp",                     0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_HP }, 0, 0, VE, "preset" },
-    { "lossless",   "lossless",                           0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOSSLESS_DEFAULT }, 0, 0, VE, "preset" },
-    { "losslesshp", "lossless hp",                        0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOSSLESS_HP }, 0, 0, VE, "preset" },
-    { "profile", "Set the encoding profile",             OFFSET(profile),      AV_OPT_TYPE_INT,    { .i64 = FF_PROFILE_HEVC_MAIN }, FF_PROFILE_HEVC_MAIN, FF_PROFILE_HEVC_MAIN, VE, "profile" },
-    { "high",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = FF_PROFILE_HEVC_MAIN }, 0, 0, VE, "profile" },
-    { "level",   "Set the encoding level restriction",   OFFSET(level),        AV_OPT_TYPE_INT,    { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_HEVC_62, VE, "level" },
-    { "1.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_1 },  0, 0, VE,  "level" },
-    { "2.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_2 },  0, 0, VE,  "level" },
-    { "2.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_21 }, 0, 0, VE,  "level" },
-    { "3.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_3 },  0, 0, VE,  "level" },
-    { "3.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_31 }, 0, 0, VE,  "level" },
-    { "4.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_4 },  0, 0, VE,  "level" },
-    { "4.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_41 }, 0, 0, VE,  "level" },
-    { "5.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_5 },  0, 0, VE,  "level" },
-    { "5.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_51 }, 0, 0, VE,  "level" },
-    { "5.2",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_52 }, 0, 0, VE,  "level" },
-    { "6.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_6 },  0, 0, VE,  "level" },
-    { "6.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_61 }, 0, 0, VE,  "level" },
-    { "6.2",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_62 }, 0, 0, VE,  "level" },
-    { "tier",    "Set the encoding tier",                OFFSET(tier),         AV_OPT_TYPE_INT,    { .i64 = NV_ENC_TIER_HEVC_MAIN }, NV_ENC_TIER_HEVC_MAIN, NV_ENC_TIER_HEVC_HIGH, VE, "tier"},
-    { "main",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_TIER_HEVC_MAIN }, 0, 0, VE, "tier" },
-    { "high",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_TIER_HEVC_HIGH }, 0, 0, VE, "tier" },
-    { "rc",      "Override the preset rate-control",     OFFSET(rc),           AV_OPT_TYPE_INT,    { .i64 = -1 },                   -1, 0, VE, "rc" },
-    { "constqp",          "Constant QP mode",                                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_CONSTQP },              0, 0, VE, "rc" },
-    { "vbr",              "Variable bitrate mode",                                                       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_VBR },                  0, 0, VE, "rc" },
-    { "cbr",              "Constant bitrate mode",                                                       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_CBR },                  0, 0, VE, "rc" },
-    { "vbr_minqp",        "Variable bitrate mode with MinQP",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_VBR_MINQP },            0, 0, VE, "rc" },
-    { "ll_2pass_quality", "Multi-pass optimized for image quality (only for low-latency presets)",       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_QUALITY },       0, 0, VE, "rc" },
-    { "ll_2pass_size",    "Multi-pass optimized for constant frame size (only for low-latency presets)", 0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP }, 0, 0, VE, "rc" },
-    { "vbr_2pass",        "Multi-pass variable bitrate mode",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_VBR },           0, 0, VE, "rc" },
-    { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 32 },                   0, INT_MAX, VE },
-    { "device",   "Select a specific NVENC device",       OFFSET(device),      AV_OPT_TYPE_INT,    { .i64 = -1 },                   -2, INT_MAX, VE, "device" },
-    { "any",      "Pick the first device available",      0,                   AV_OPT_TYPE_CONST,  { .i64 = ANY_DEVICE },           0, 0, VE, "device" },
-    { "list",     "List the available devices",           0,                   AV_OPT_TYPE_CONST,  { .i64 = LIST_DEVICES },         0, 0, VE, "device" },
-    { NULL }
-};
-
-static const AVClass nvenc_hevc_class = {
-    .class_name = "nvenc_hevc",
-    .item_name = av_default_item_name,
-    .option = options,
-    .version = LIBAVUTIL_VERSION_INT,
-};
-
-static const AVCodecDefault defaults[] = {
-    { "b", "0" },
-    { "qmin", "-1" },
-    { "qmax", "-1" },
-    { "qdiff", "-1" },
-    { "qblur", "-1" },
-    { "qcomp", "-1" },
-    { NULL },
-};
-
-AVCodec ff_hevc_nvenc_encoder = {
-    .name           = "nvenc_hevc",
-    .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC HEVC encoder"),
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_HEVC,
-    .init           = ff_nvenc_encode_init,
-    .encode2        = ff_nvenc_encode_frame,
-    .close          = ff_nvenc_encode_close,
-    .priv_data_size = sizeof(NVENCContext),
-    .priv_class     = &nvenc_hevc_class,
-    .defaults       = defaults,
-    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
-                                                    AV_PIX_FMT_YUV420P,
-                                                    AV_PIX_FMT_YUV444P,
-                                                    AV_PIX_FMT_NONE },
-    .capabilities   = AV_CODEC_CAP_DELAY,
-    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
-};
diff --git a/libavcodec/on2avc.c b/libavcodec/on2avc.c
index 3f531bc83b..15f4dd1c66 100644
--- a/libavcodec/on2avc.c
+++ b/libavcodec/on2avc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,7 +47,7 @@ enum WindowTypes {
 
 typedef struct On2AVCContext {
     AVCodecContext *avctx;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext mdct, mdct_half, mdct_small;
     FFTContext fft128, fft256, fft512, fft1024;
     void (*wtf)(struct On2AVCContext *ctx, float *out, float *in, int size);
@@ -119,12 +119,12 @@ static int on2avc_decode_band_types(On2AVCContext *c, GetBitContext *gb)
         run_len   = 1;
         do {
             run = get_bits(gb, bits_per_sect);
+            if (run > num_bands - band - run_len) {
+                av_log(c->avctx, AV_LOG_ERROR, "Invalid band type run\n");
+                return AVERROR_INVALIDDATA;
+            }
             run_len += run;
         } while (run == esc_val);
-        if (band + run_len > num_bands) {
-            av_log(c->avctx, AV_LOG_ERROR, "Invalid band type run\n");
-            return AVERROR_INVALIDDATA;
-        }
         for (i = band; i < band + run_len; i++) {
             c->band_type[i]    = band_type;
             c->band_run_end[i] = band + run_len;
@@ -313,7 +313,7 @@ static void zero_head_and_tail(float *src, int len, int order0, int order1)
 }
 
 static void pretwiddle(float *src, float *dst, int dst_len, int tab_step,
-                       int step, int order0, int order1, const double **tabs)
+                       int step, int order0, int order1, const double * const *tabs)
 {
     float *src2, *out;
     const double *tab;
@@ -341,7 +341,7 @@ static void pretwiddle(float *src, float *dst, int dst_len, int tab_step,
 
 static void twiddle(float *src1, float *src2, int src2_len,
                     const double *tab, int tab_len, int step,
-                    int order0, int order1, const double **tabs)
+                    int order0, int order1, const double * const *tabs)
 {
     int steps;
     int mask;
@@ -714,7 +714,7 @@ static int on2avc_reconstruct_stereo(On2AVCContext *c, AVFrame *dst, int offset)
         }
 
         memcpy(out, saved, 448 * sizeof(float));
-        c->fdsp.vector_fmul_window(wout, saved + 448, buf, c->short_win, 64);
+        c->fdsp->vector_fmul_window(wout, saved + 448, buf, c->short_win, 64);
         memcpy(wout + 128,  buf + 64,         448 * sizeof(float));
         memcpy(saved,       buf + 512,        448 * sizeof(float));
         memcpy(saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
@@ -750,20 +750,20 @@ static int on2avc_reconstruct_channel(On2AVCContext *c, int channel,
          c->prev_window_type == WINDOW_TYPE_LONG_STOP) &&
         (c->window_type == WINDOW_TYPE_LONG ||
          c->window_type == WINDOW_TYPE_LONG_START)) {
-        c->fdsp.vector_fmul_window(out, saved, buf, c->long_win, 512);
+        c->fdsp->vector_fmul_window(out, saved, buf, c->long_win, 512);
     } else {
         float *wout = out + 448;
         memcpy(out, saved, 448 * sizeof(float));
 
         if (c->window_type == WINDOW_TYPE_8SHORT) {
-            c->fdsp.vector_fmul_window(wout + 0*128, saved + 448,      buf + 0*128, c->short_win, 64);
-            c->fdsp.vector_fmul_window(wout + 1*128, buf + 0*128 + 64, buf + 1*128, c->short_win, 64);
-            c->fdsp.vector_fmul_window(wout + 2*128, buf + 1*128 + 64, buf + 2*128, c->short_win, 64);
-            c->fdsp.vector_fmul_window(wout + 3*128, buf + 2*128 + 64, buf + 3*128, c->short_win, 64);
-            c->fdsp.vector_fmul_window(temp,         buf + 3*128 + 64, buf + 4*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout + 0*128, saved + 448,      buf + 0*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout + 1*128, buf + 0*128 + 64, buf + 1*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout + 2*128, buf + 1*128 + 64, buf + 2*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout + 3*128, buf + 2*128 + 64, buf + 3*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(temp,         buf + 3*128 + 64, buf + 4*128, c->short_win, 64);
             memcpy(wout + 4*128, temp, 64 * sizeof(float));
         } else {
-            c->fdsp.vector_fmul_window(wout, saved + 448, buf, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout, saved + 448, buf, c->short_win, 64);
             memcpy(wout + 128, buf + 64, 448 * sizeof(float));
         }
     }
@@ -772,9 +772,9 @@ static int on2avc_reconstruct_channel(On2AVCContext *c, int channel,
     switch (c->window_type) {
     case WINDOW_TYPE_8SHORT:
         memcpy(saved,       temp + 64,         64 * sizeof(float));
-        c->fdsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, c->short_win, 64);
-        c->fdsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, c->short_win, 64);
-        c->fdsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, c->short_win, 64);
+        c->fdsp->vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, c->short_win, 64);
+        c->fdsp->vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, c->short_win, 64);
+        c->fdsp->vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, c->short_win, 64);
         memcpy(saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
         break;
     case WINDOW_TYPE_LONG_START:
@@ -795,7 +795,9 @@ static int on2avc_decode_subframe(On2AVCContext *c, const uint8_t *buf,
     GetBitContext gb;
     int i, ret;
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
+
     if (get_bits1(&gb)) {
         av_log(c->avctx, AV_LOG_ERROR, "enh bit set\n");
         return AVERROR_INVALIDDATA;
@@ -846,10 +848,8 @@ static int on2avc_decode_frame(AVCodecContext * avctx, void *data,
     if (c->is_av500) {
         /* get output buffer */
         frame->nb_samples = ON2AVC_SUBFRAME_SIZE;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
             return ret;
-        }
 
         if ((ret = on2avc_decode_subframe(c, buf, buf_size, frame, 0)) < 0)
             return ret;
@@ -872,10 +872,8 @@ static int on2avc_decode_frame(AVCodecContext * avctx, void *data,
 
         /* get output buffer */
         frame->nb_samples = ON2AVC_SUBFRAME_SIZE * num_frames;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
             return ret;
-        }
 
         audio_off = 0;
         bytestream2_init(&gb, buf, buf_size);
@@ -908,6 +906,11 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
     On2AVCContext *c = avctx->priv_data;
     int i;
 
+    if (avctx->channels > 2U) {
+        avpriv_request_sample(avctx, "Decoding more than 2 channels");
+        return AVERROR_PATCHWELCOME;
+    }
+
     c->avctx = avctx;
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLTP;
     avctx->channel_layout = (avctx->channels == 2) ? AV_CH_LAYOUT_STEREO
@@ -918,10 +921,7 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "0x500 version should be mono\n");
         return AVERROR_INVALIDDATA;
     }
-    if (avctx->channels > 2) {
-        av_log(avctx, AV_LOG_ERROR, "Only 1 or 2 channels are supported.\n");
-        return AVERROR(EINVAL);
-    }
+
     if (avctx->channels == 2)
         av_log(avctx, AV_LOG_WARNING,
                "Stereo mode support is not good, patch is welcome\n");
@@ -951,13 +951,14 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
     ff_fft_init(&c->fft256,  7, 0);
     ff_fft_init(&c->fft512,  8, 1);
     ff_fft_init(&c->fft1024, 9, 1);
-    avpriv_float_dsp_init(&c->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    c->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!c->fdsp)
+        return AVERROR(ENOMEM);
 
     if (init_vlc(&c->scale_diff, 9, ON2AVC_SCALE_DIFFS,
                  ff_on2avc_scale_diff_bits,  1, 1,
                  ff_on2avc_scale_diff_codes, 4, 4, 0)) {
-        av_log(avctx, AV_LOG_ERROR, "Cannot init VLC\n");
-        return AVERROR(ENOMEM);
+        goto vlc_fail;
     }
     for (i = 1; i < 9; i++) {
         int idx = i - 1;
@@ -965,9 +966,7 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
                                ff_on2avc_quad_cb_bits[idx],  1, 1,
                                ff_on2avc_quad_cb_codes[idx], 4, 4,
                                ff_on2avc_quad_cb_syms[idx],  2, 2, 0)) {
-            av_log(avctx, AV_LOG_ERROR, "Cannot init VLC\n");
-            on2avc_free_vlcs(c);
-            return AVERROR(ENOMEM);
+            goto vlc_fail;
         }
     }
     for (i = 9; i < 16; i++) {
@@ -976,13 +975,16 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
                                ff_on2avc_pair_cb_bits[idx],  1, 1,
                                ff_on2avc_pair_cb_codes[idx], 2, 2,
                                ff_on2avc_pair_cb_syms[idx],  2, 2, 0)) {
-            av_log(avctx, AV_LOG_ERROR, "Cannot init VLC\n");
-            on2avc_free_vlcs(c);
-            return AVERROR(ENOMEM);
+            goto vlc_fail;
         }
     }
 
     return 0;
+vlc_fail:
+    av_log(avctx, AV_LOG_ERROR, "Cannot init VLC\n");
+    on2avc_free_vlcs(c);
+    av_freep(&c->fdsp);
+    return AVERROR(ENOMEM);
 }
 
 static av_cold int on2avc_decode_close(AVCodecContext *avctx)
@@ -997,6 +999,8 @@ static av_cold int on2avc_decode_close(AVCodecContext *avctx)
     ff_fft_end(&c->fft512);
     ff_fft_end(&c->fft1024);
 
+    av_freep(&c->fdsp);
+
     on2avc_free_vlcs(c);
 
     return 0;
diff --git a/libavcodec/on2avcdata.c b/libavcodec/on2avcdata.c
index d039f23509..abe598350b 100644
--- a/libavcodec/on2avcdata.c
+++ b/libavcodec/on2avcdata.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -7641,11 +7641,11 @@ static const double tabs_4_10[4 * 2][10] = {
      -0.099339873,     -0.041293536,    0.31028851,     0.17727433,    -0.92756648 }
 };
 
-const double *ff_on2avc_tabs_4_10_1[4] = {
+const double * const ff_on2avc_tabs_4_10_1[4] = {
     tabs_4_10[0], tabs_4_10[1], tabs_4_10[2], tabs_4_10[3]
 };
 
-const double *ff_on2avc_tabs_4_10_2[4] = {
+const double * const ff_on2avc_tabs_4_10_2[4] = {
     tabs_4_10[4], tabs_4_10[5], tabs_4_10[6], tabs_4_10[7]
 };
 
@@ -7724,12 +7724,12 @@ static const double tabs_9_20[9 * 2][20] = {
       0.22783, 0.058894795, -0.61350902, 0.69559873, -0.27013783, }
 };
 
-const double* ff_on2avc_tabs_9_20_1[9] = {
+const double* const ff_on2avc_tabs_9_20_1[9] = {
     tabs_9_20[0], tabs_9_20[1], tabs_9_20[2], tabs_9_20[3], tabs_9_20[4],
     tabs_9_20[5], tabs_9_20[6], tabs_9_20[7], tabs_9_20[8]
 };
 
-const double* ff_on2avc_tabs_9_20_2[9] = {
+const double* const ff_on2avc_tabs_9_20_2[9] = {
     tabs_9_20[ 9], tabs_9_20[10], tabs_9_20[11], tabs_9_20[12], tabs_9_20[13],
     tabs_9_20[14], tabs_9_20[15], tabs_9_20[16], tabs_9_20[17]
 };
@@ -7927,7 +7927,7 @@ static const double tabs_19_40[19 * 2][40] = {
       0.019871848, -0.11989559, 0.036659135, 0.26632201, -0.3057397, -0.23220335, 0.68741352, -0.54024027, }
 };
 
-const double* ff_on2avc_tabs_19_40_1[19] = {
+const double* const ff_on2avc_tabs_19_40_1[19] = {
     tabs_19_40[ 0], tabs_19_40[ 1], tabs_19_40[ 2], tabs_19_40[ 3],
     tabs_19_40[ 4], tabs_19_40[ 5], tabs_19_40[ 6], tabs_19_40[ 7],
     tabs_19_40[ 8], tabs_19_40[ 9], tabs_19_40[10], tabs_19_40[11],
@@ -7935,7 +7935,7 @@ const double* ff_on2avc_tabs_19_40_1[19] = {
     tabs_19_40[16], tabs_19_40[17], tabs_19_40[18],
 };
 
-const double* ff_on2avc_tabs_19_40_2[19] = {
+const double* const ff_on2avc_tabs_19_40_2[19] = {
     tabs_19_40[19], tabs_19_40[20], tabs_19_40[21], tabs_19_40[22],
     tabs_19_40[23], tabs_19_40[24], tabs_19_40[25], tabs_19_40[26],
     tabs_19_40[27], tabs_19_40[28], tabs_19_40[29], tabs_19_40[30],
@@ -8826,7 +8826,7 @@ static const double tabs_20_84[20 * 4][84] = {
       0.51434408, -0.41486443, 0.27672635, -0.10432054, },
 };
 
-const double* ff_on2avc_tabs_20_84_1[20] = {
+const double* const ff_on2avc_tabs_20_84_1[20] = {
     tabs_20_84[ 0], tabs_20_84[ 1], tabs_20_84[ 2], tabs_20_84[ 3],
     tabs_20_84[ 4], tabs_20_84[ 5], tabs_20_84[ 6], tabs_20_84[ 7],
     tabs_20_84[ 8], tabs_20_84[ 9], tabs_20_84[10], tabs_20_84[11],
@@ -8834,7 +8834,7 @@ const double* ff_on2avc_tabs_20_84_1[20] = {
     tabs_20_84[16], tabs_20_84[17], tabs_20_84[18], tabs_20_84[19]
 };
 
-const double* ff_on2avc_tabs_20_84_2[20] = {
+const double* const ff_on2avc_tabs_20_84_2[20] = {
     tabs_20_84[20], tabs_20_84[21], tabs_20_84[22], tabs_20_84[23],
     tabs_20_84[24], tabs_20_84[25], tabs_20_84[26], tabs_20_84[27],
     tabs_20_84[28], tabs_20_84[29], tabs_20_84[30], tabs_20_84[31],
@@ -8842,7 +8842,7 @@ const double* ff_on2avc_tabs_20_84_2[20] = {
     tabs_20_84[36], tabs_20_84[37], tabs_20_84[38], tabs_20_84[39]
 };
 
-const double* ff_on2avc_tabs_20_84_3[20] = {
+const double* const ff_on2avc_tabs_20_84_3[20] = {
     tabs_20_84[40], tabs_20_84[41], tabs_20_84[42], tabs_20_84[43],
     tabs_20_84[44], tabs_20_84[45], tabs_20_84[46], tabs_20_84[47],
     tabs_20_84[48], tabs_20_84[49], tabs_20_84[50], tabs_20_84[51],
@@ -8850,7 +8850,7 @@ const double* ff_on2avc_tabs_20_84_3[20] = {
     tabs_20_84[56], tabs_20_84[57], tabs_20_84[58], tabs_20_84[59]
 };
 
-const double* ff_on2avc_tabs_20_84_4[20] = {
+const double* const ff_on2avc_tabs_20_84_4[20] = {
     tabs_20_84[60], tabs_20_84[61], tabs_20_84[62], tabs_20_84[63],
     tabs_20_84[64], tabs_20_84[65], tabs_20_84[66], tabs_20_84[67],
     tabs_20_84[68], tabs_20_84[69], tabs_20_84[70], tabs_20_84[71],
diff --git a/libavcodec/on2avcdata.h b/libavcodec/on2avcdata.h
index 39d29110ec..7f498e58e4 100644
--- a/libavcodec/on2avcdata.h
+++ b/libavcodec/on2avcdata.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,16 +64,16 @@ extern const double ff_on2avc_tab_84_1[];
 extern const double ff_on2avc_tab_84_2[];
 extern const double ff_on2avc_tab_84_3[];
 extern const double ff_on2avc_tab_84_4[];
-extern const double* ff_on2avc_tabs_4_10_1[4];
-extern const double* ff_on2avc_tabs_4_10_2[4];
-extern const double* ff_on2avc_tabs_9_20_1[9];
-extern const double* ff_on2avc_tabs_9_20_2[9];
-extern const double* ff_on2avc_tabs_19_40_1[19];
-extern const double* ff_on2avc_tabs_19_40_2[19];
-extern const double* ff_on2avc_tabs_20_84_1[20];
-extern const double* ff_on2avc_tabs_20_84_2[20];
-extern const double* ff_on2avc_tabs_20_84_3[20];
-extern const double* ff_on2avc_tabs_20_84_4[20];
+extern const double* const ff_on2avc_tabs_4_10_1[4];
+extern const double* const ff_on2avc_tabs_4_10_2[4];
+extern const double* const ff_on2avc_tabs_9_20_1[9];
+extern const double* const ff_on2avc_tabs_9_20_2[9];
+extern const double* const ff_on2avc_tabs_19_40_1[19];
+extern const double* const ff_on2avc_tabs_19_40_2[19];
+extern const double* const ff_on2avc_tabs_20_84_1[20];
+extern const double* const ff_on2avc_tabs_20_84_2[20];
+extern const double* const ff_on2avc_tabs_20_84_3[20];
+extern const double* const ff_on2avc_tabs_20_84_4[20];
 extern const float ff_on2avc_ctab_1[2048];
 extern const float ff_on2avc_ctab_2[2048];
 extern const float ff_on2avc_ctab_3[2048];
diff --git a/libavcodec/options.c b/libavcodec/options.c
index 49c8aea030..ea2563b571 100644
--- a/libavcodec/options.c
+++ b/libavcodec/options.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,6 +70,13 @@ static const AVClass *codec_child_class_next(const AVClass *prev)
     return NULL;
 }
 
+static AVClassCategory get_category(void *ptr)
+{
+    AVCodecContext* avctx = ptr;
+    if(avctx->codec && avctx->codec->decode) return AV_CLASS_CATEGORY_DECODER;
+    else                                     return AV_CLASS_CATEGORY_ENCODER;
+}
+
 static const AVClass av_codec_context_class = {
     .class_name              = "AVCodecContext",
     .item_name               = context_to_name,
@@ -78,20 +85,34 @@ static const AVClass av_codec_context_class = {
     .log_level_offset_offset = offsetof(AVCodecContext, log_level_offset),
     .child_next              = codec_child_next,
     .child_class_next        = codec_child_class_next,
+    .category                = AV_CLASS_CATEGORY_ENCODER,
+    .get_category            = get_category,
 };
 
 int avcodec_get_context_defaults3(AVCodecContext *s, const AVCodec *codec)
 {
+    int flags=0;
     memset(s, 0, sizeof(AVCodecContext));
 
     s->av_class = &av_codec_context_class;
 
     s->codec_type = codec ? codec->type : AVMEDIA_TYPE_UNKNOWN;
-    s->codec      = codec;
-    av_opt_set_defaults(s);
+    if (codec) {
+        s->codec = codec;
+        s->codec_id = codec->id;
+    }
+
+    if(s->codec_type == AVMEDIA_TYPE_AUDIO)
+        flags= AV_OPT_FLAG_AUDIO_PARAM;
+    else if(s->codec_type == AVMEDIA_TYPE_VIDEO)
+        flags= AV_OPT_FLAG_VIDEO_PARAM;
+    else if(s->codec_type == AVMEDIA_TYPE_SUBTITLE)
+        flags= AV_OPT_FLAG_SUBTITLE_PARAM;
+    av_opt_set_defaults2(s, flags, flags);
 
     s->time_base           = (AVRational){0,1};
     s->framerate           = (AVRational){ 0, 1 };
+    s->pkt_timebase        = (AVRational){ 0, 1 };
     s->get_buffer2         = avcodec_default_get_buffer2;
     s->get_format          = avcodec_default_get_format;
     s->execute             = avcodec_default_execute;
@@ -151,6 +172,9 @@ void avcodec_free_context(AVCodecContext **pavctx)
 
     av_freep(&avctx->extradata);
     av_freep(&avctx->subtitle_header);
+    av_freep(&avctx->intra_matrix);
+    av_freep(&avctx->inter_matrix);
+    av_freep(&avctx->rc_override);
 
     av_freep(pavctx);
 }
@@ -166,15 +190,34 @@ int avcodec_copy_context(AVCodecContext *dest, const AVCodecContext *src)
                src, dest);
         return AVERROR(EINVAL);
     }
+
+    av_opt_free(dest);
+    av_freep(&dest->rc_override);
+    av_freep(&dest->intra_matrix);
+    av_freep(&dest->inter_matrix);
+    av_freep(&dest->extradata);
+    av_freep(&dest->subtitle_header);
+
     memcpy(dest, src, sizeof(*dest));
+    av_opt_copy(dest, src);
 
     dest->priv_data       = orig_priv_data;
     dest->codec           = orig_codec;
 
+    if (orig_priv_data && src->codec && src->codec->priv_class &&
+        dest->codec && dest->codec->priv_class)
+        av_opt_copy(orig_priv_data, src->priv_data);
+
+
     /* set values specific to opened codecs back to their default state */
     dest->slice_offset    = NULL;
     dest->hwaccel         = NULL;
     dest->internal        = NULL;
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    dest->coded_frame     = NULL;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
 
     /* reallocate values that should be allocated separately */
     dest->extradata       = NULL;
@@ -182,16 +225,6 @@ int avcodec_copy_context(AVCodecContext *dest, const AVCodecContext *src)
     dest->inter_matrix    = NULL;
     dest->rc_override     = NULL;
     dest->subtitle_header = NULL;
-#if FF_API_MPV_OPT
-    FF_DISABLE_DEPRECATION_WARNINGS
-    dest->rc_eq           = NULL;
-    if (src->rc_eq) {
-        dest->rc_eq = av_strdup(src->rc_eq);
-        if (!dest->rc_eq)
-            return AVERROR(ENOMEM);
-    }
-    FF_ENABLE_DEPRECATION_WARNINGS
-#endif
 
 #define alloc_and_copy_or_fail(obj, size, pad) \
     if (src->obj && size > 0) { \
@@ -204,11 +237,12 @@ int avcodec_copy_context(AVCodecContext *dest, const AVCodecContext *src)
     }
     alloc_and_copy_or_fail(extradata,    src->extradata_size,
                            AV_INPUT_BUFFER_PADDING_SIZE);
+    dest->extradata_size  = src->extradata_size;
     alloc_and_copy_or_fail(intra_matrix, 64 * sizeof(int16_t), 0);
     alloc_and_copy_or_fail(inter_matrix, 64 * sizeof(int16_t), 0);
     alloc_and_copy_or_fail(rc_override,  src->rc_override_count * sizeof(*src->rc_override), 0);
-    alloc_and_copy_or_fail(subtitle_header, src->subtitle_header_size, 0);
-    dest->subtitle_header_size = src->subtitle_header_size;
+    alloc_and_copy_or_fail(subtitle_header, src->subtitle_header_size, 1);
+    av_assert0(dest->subtitle_header_size == src->subtitle_header_size);
 #undef alloc_and_copy_or_fail
 
     return 0;
@@ -218,11 +252,10 @@ fail:
     av_freep(&dest->intra_matrix);
     av_freep(&dest->inter_matrix);
     av_freep(&dest->extradata);
-#if FF_API_MPV_OPT
-    FF_DISABLE_DEPRECATION_WARNINGS
-    av_freep(&dest->rc_eq);
-    FF_ENABLE_DEPRECATION_WARNINGS
-#endif
+    av_freep(&dest->subtitle_header);
+    dest->subtitle_header_size = 0;
+    dest->extradata_size = 0;
+    av_opt_free(dest);
     return AVERROR(ENOMEM);
 }
 
@@ -230,3 +263,224 @@ const AVClass *avcodec_get_class(void)
 {
     return &av_codec_context_class;
 }
+
+#define FOFFSET(x) offsetof(AVFrame,x)
+
+static const AVOption frame_options[]={
+{"best_effort_timestamp", "", FOFFSET(best_effort_timestamp), AV_OPT_TYPE_INT64, {.i64 = AV_NOPTS_VALUE }, INT64_MIN, INT64_MAX, 0},
+{"pkt_pos", "", FOFFSET(pkt_pos), AV_OPT_TYPE_INT64, {.i64 = -1 }, INT64_MIN, INT64_MAX, 0},
+{"pkt_size", "", FOFFSET(pkt_size), AV_OPT_TYPE_INT64, {.i64 = -1 }, INT64_MIN, INT64_MAX, 0},
+{"sample_aspect_ratio", "", FOFFSET(sample_aspect_ratio), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
+{"width", "", FOFFSET(width), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"height", "", FOFFSET(height), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"format", "", FOFFSET(format), AV_OPT_TYPE_INT, {.i64 = -1 }, 0, INT_MAX, 0},
+{"channel_layout", "", FOFFSET(channel_layout), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, INT64_MAX, 0},
+{"sample_rate", "", FOFFSET(sample_rate), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{NULL},
+};
+
+static const AVClass av_frame_class = {
+    .class_name              = "AVFrame",
+    .item_name               = NULL,
+    .option                  = frame_options,
+    .version                 = LIBAVUTIL_VERSION_INT,
+};
+
+const AVClass *avcodec_get_frame_class(void)
+{
+    return &av_frame_class;
+}
+
+#define SROFFSET(x) offsetof(AVSubtitleRect,x)
+
+static const AVOption subtitle_rect_options[]={
+{"x", "", SROFFSET(x), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"y", "", SROFFSET(y), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"w", "", SROFFSET(w), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"h", "", SROFFSET(h), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"type", "", SROFFSET(type), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"flags", "", SROFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = 0}, 0, 1, 0, "flags"},
+{"forced", "", SROFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = 0}, 0, 1, 0},
+{NULL},
+};
+
+static const AVClass av_subtitle_rect_class = {
+    .class_name             = "AVSubtitleRect",
+    .item_name              = NULL,
+    .option                 = subtitle_rect_options,
+    .version                = LIBAVUTIL_VERSION_INT,
+};
+
+const AVClass *avcodec_get_subtitle_rect_class(void)
+{
+    return &av_subtitle_rect_class;
+}
+
+#ifdef TEST
+static int dummy_init(AVCodecContext *ctx)
+{
+    //TODO: this code should set every possible pointer that could be set by codec and is not an option;
+    ctx->extradata_size = 8;
+    ctx->extradata = av_malloc(ctx->extradata_size);
+    return 0;
+}
+
+static int dummy_close(AVCodecContext *ctx)
+{
+    av_freep(&ctx->extradata);
+    ctx->extradata_size = 0;
+    return 0;
+}
+
+static int dummy_encode(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame, int *got_packet)
+{
+    return AVERROR(ENOSYS);
+}
+
+typedef struct Dummy12Context {
+    AVClass  *av_class;
+    int      num;
+    char*    str;
+} Dummy12Context;
+
+typedef struct Dummy3Context {
+    void     *fake_av_class;
+    int      num;
+    char*    str;
+} Dummy3Context;
+
+#define OFFSET(x) offsetof(Dummy12Context, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption dummy_options[] = {
+    { "str", "set str", OFFSET(str), AV_OPT_TYPE_STRING, { .str = "i'm src default value" }, 0, 0, VE},
+    { "num", "set num", OFFSET(num), AV_OPT_TYPE_INT,    { .i64 = 1500100900 },    0, INT_MAX, VE},
+    { NULL },
+};
+
+static const AVClass dummy_v1_class = {
+    .class_name = "dummy_v1_class",
+    .item_name  = av_default_item_name,
+    .option     = dummy_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVClass dummy_v2_class = {
+    .class_name = "dummy_v2_class",
+    .item_name  = av_default_item_name,
+    .option     = dummy_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+/* codec with options */
+static AVCodec dummy_v1_encoder = {
+    .name             = "dummy_v1_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 1,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_class       = &dummy_v1_class,
+    .priv_data_size   = sizeof(Dummy12Context),
+};
+
+/* codec with options, different class */
+static AVCodec dummy_v2_encoder = {
+    .name             = "dummy_v2_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 2,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_class       = &dummy_v2_class,
+    .priv_data_size   = sizeof(Dummy12Context),
+};
+
+/* codec with priv data, but no class */
+static AVCodec dummy_v3_encoder = {
+    .name             = "dummy_v3_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 3,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_data_size   = sizeof(Dummy3Context),
+};
+
+/* codec without priv data */
+static AVCodec dummy_v4_encoder = {
+    .name             = "dummy_v4_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 4,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+};
+
+static void test_copy_print_codec(const AVCodecContext *ctx)
+{
+    printf("%-14s: %dx%d prv: %s",
+           ctx->codec ? ctx->codec->name : "NULL",
+           ctx->width, ctx->height,
+           ctx->priv_data ? "set" : "null");
+    if (ctx->codec && ctx->codec->priv_class && ctx->codec->priv_data_size) {
+        int64_t i64;
+        char *str = NULL;
+        av_opt_get_int(ctx->priv_data, "num", 0, &i64);
+        av_opt_get(ctx->priv_data, "str", 0, (uint8_t**)&str);
+        printf(" opts: %"PRId64" %s", i64, str);
+        av_free(str);
+    }
+    printf("\n");
+}
+
+static void test_copy(const AVCodec *c1, const AVCodec *c2)
+{
+    AVCodecContext *ctx1, *ctx2;
+    printf("%s -> %s\nclosed:\n", c1 ? c1->name : "NULL", c2 ? c2->name : "NULL");
+    ctx1 = avcodec_alloc_context3(c1);
+    ctx2 = avcodec_alloc_context3(c2);
+    ctx1->width = ctx1->height = 128;
+    if (ctx2->codec && ctx2->codec->priv_class && ctx2->codec->priv_data_size) {
+        av_opt_set(ctx2->priv_data, "num", "667", 0);
+        av_opt_set(ctx2->priv_data, "str", "i'm dest value before copy", 0);
+    }
+    avcodec_copy_context(ctx2, ctx1);
+    test_copy_print_codec(ctx1);
+    test_copy_print_codec(ctx2);
+    if (ctx1->codec) {
+        printf("opened:\n");
+        avcodec_open2(ctx1, ctx1->codec, NULL);
+        if (ctx2->codec && ctx2->codec->priv_class && ctx2->codec->priv_data_size) {
+            av_opt_set(ctx2->priv_data, "num", "667", 0);
+            av_opt_set(ctx2->priv_data, "str", "i'm dest value before copy", 0);
+        }
+        avcodec_copy_context(ctx2, ctx1);
+        test_copy_print_codec(ctx1);
+        test_copy_print_codec(ctx2);
+        avcodec_close(ctx1);
+    }
+    avcodec_free_context(&ctx1);
+    avcodec_free_context(&ctx2);
+}
+
+int main(void)
+{
+    AVCodec *dummy_codec[] = {
+        &dummy_v1_encoder,
+        &dummy_v2_encoder,
+        &dummy_v3_encoder,
+        &dummy_v4_encoder,
+        NULL,
+    };
+    int i, j;
+
+    for (i = 0; dummy_codec[i]; i++)
+        avcodec_register(dummy_codec[i]);
+
+    printf("testing avcodec_copy_context()\n");
+    for (i = 0; i < FF_ARRAY_ELEMS(dummy_codec); i++)
+        for (j = 0; j < FF_ARRAY_ELEMS(dummy_codec); j++)
+            test_copy(dummy_codec[i], dummy_codec[j]);
+    return 0;
+}
+#endif
diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
index d30905ebf8..8b32ae239b 100644
--- a/libavcodec/options_table.h
+++ b/libavcodec/options_table.h
@@ -1,19 +1,21 @@
 /*
+ * Copyright (c) 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +29,6 @@
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "version.h"
-#include "config.h"
 
 #define OFFSET(x) offsetof(AVCodecContext,x)
 #define DEFAULT 0 //should be NAN but it does not work as it is not a constant in glibc as required by ANSI/ISO C
@@ -41,12 +42,13 @@
 #define AV_CODEC_DEFAULT_BITRATE 200*1000
 
 static const AVOption avcodec_options[] = {
-{"b", "set bitrate (in bits/s)", OFFSET(bit_rate), AV_OPT_TYPE_INT, {.i64 = AV_CODEC_DEFAULT_BITRATE }, INT_MIN, INT_MAX, V|A|E},
+{"b", "set bitrate (in bits/s)", OFFSET(bit_rate), AV_OPT_TYPE_INT64, {.i64 = AV_CODEC_DEFAULT_BITRATE }, 0, INT_MAX, A|V|E},
+{"ab", "set bitrate (in bits/s)", OFFSET(bit_rate), AV_OPT_TYPE_INT64, {.i64 = 128*1000 }, 0, INT_MAX, A|E},
 {"bt", "Set video bitrate tolerance (in bits/s). In 1-pass mode, bitrate tolerance specifies how far "
        "ratecontrol is willing to deviate from the target average bitrate value. This is not related "
        "to minimum/maximum bitrate. Lowering tolerance too much has an adverse effect on quality.",
        OFFSET(bit_rate_tolerance), AV_OPT_TYPE_INT, {.i64 = AV_CODEC_DEFAULT_BITRATE*20 }, 1, INT_MAX, V|E},
-{"flags", NULL, OFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, UINT_MAX, V|A|E|D, "flags"},
+{"flags", NULL, OFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, UINT_MAX, V|A|S|E|D, "flags"},
 {"unaligned", "allow decoders to produce unaligned output", 0, AV_OPT_TYPE_CONST, { .i64 = AV_CODEC_FLAG_UNALIGNED }, INT_MIN, INT_MAX, V | D, "flags" },
 {"mv4", "use four motion vectors per macroblock (MPEG-4)", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_4MV }, INT_MIN, INT_MAX, V|E, "flags"},
 {"qpel", "use 1/4-pel motion compensation", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_QPEL }, INT_MIN, INT_MAX, V|E, "flags"},
@@ -68,7 +70,7 @@ static const AVOption avcodec_options[] = {
 {"emu_edge", "do not draw edges", 0, AV_OPT_TYPE_CONST, {.i64 = CODEC_FLAG_EMU_EDGE }, INT_MIN, INT_MAX, 0, "flags"},
 #endif
 {"psnr", "error[?] variables will be set during encoding", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_PSNR }, INT_MIN, INT_MAX, V|E, "flags"},
-{"truncated", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_TRUNCATED }, INT_MIN, INT_MAX, 0, "flags"},
+{"truncated", "Input bitstream might be randomly truncated", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_TRUNCATED }, INT_MIN, INT_MAX, V|D, "flags"},
 #if FF_API_NORMALIZE_AQP
 {"naq", "normalize adaptive quantization", 0, AV_OPT_TYPE_CONST, {.i64 = CODEC_FLAG_NORMALIZE_AQP }, INT_MIN, INT_MAX, V|E, "flags"},
 #endif
@@ -82,8 +84,12 @@ static const AVOption avcodec_options[] = {
 {"output_corrupt", "Output even potentially corrupted frames", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_OUTPUT_CORRUPT }, INT_MIN, INT_MAX, V|D, "flags"},
 {"fast", "allow non-spec-compliant speedup tricks", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_FAST }, INT_MIN, INT_MAX, V|E, "flags2"},
 {"noout", "skip bitstream encoding", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_NO_OUTPUT }, INT_MIN, INT_MAX, V|E, "flags2"},
-{"ignorecrop", "ignore cropping information from sps", 1, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_IGNORE_CROP }, INT_MIN, INT_MAX, V|D, "flags2"},
+{"ignorecrop", "ignore cropping information from sps", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_IGNORE_CROP }, INT_MIN, INT_MAX, V|D, "flags2"},
 {"local_header", "place global headers at every keyframe instead of in extradata", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_LOCAL_HEADER }, INT_MIN, INT_MAX, V|E, "flags2"},
+{"chunks", "Frame data might be split into multiple chunks", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_CHUNKS }, INT_MIN, INT_MAX, V|D, "flags2"},
+{"showall", "Show all frames before the first keyframe", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_SHOW_ALL }, INT_MIN, INT_MAX, V|D, "flags2"},
+{"export_mvs", "export motion vectors through frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_EXPORT_MVS}, INT_MIN, INT_MAX, V|D, "flags2"},
+{"skip_manual", "do not skip samples and export skip information as frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_SKIP_MANUAL}, INT_MIN, INT_MAX, V|D, "flags2"},
 #if FF_API_MOTION_EST
 {"me_method", "set motion estimation method", OFFSET(me_method), AV_OPT_TYPE_INT, {.i64 = ME_EPZS }, INT_MIN, INT_MAX, V|E, "me_method"},
 {"zero", "zero motion estimation (fastest)", 0, AV_OPT_TYPE_CONST, {.i64 = ME_ZERO }, INT_MIN, INT_MAX, V|E, "me_method" },
@@ -97,12 +103,12 @@ static const AVOption avcodec_options[] = {
 {"x1", "X1 motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_X1 }, INT_MIN, INT_MAX, V|E, "me_method" },
 {"hex", "hex motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_HEX }, INT_MIN, INT_MAX, V|E, "me_method" },
 {"umh", "umh motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_UMH }, INT_MIN, INT_MAX, V|E, "me_method" },
+{"iter", "iter motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_ITER }, INT_MIN, INT_MAX, V|E, "me_method" },
 #endif
-{"extradata_size", NULL, OFFSET(extradata_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"time_base", NULL, OFFSET(time_base), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, INT_MIN, INT_MAX},
 {"g", "set the group of picture (GOP) size", OFFSET(gop_size), AV_OPT_TYPE_INT, {.i64 = 12 }, INT_MIN, INT_MAX, V|E},
-{"ar", "set audio sampling rate (in Hz)", OFFSET(sample_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|D|E},
-{"ac", "set number of audio channels", OFFSET(channels), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|D|E},
+{"ar", "set audio sampling rate (in Hz)", OFFSET(sample_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, A|D|E},
+{"ac", "set number of audio channels", OFFSET(channels), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, A|D|E},
 {"cutoff", "set cutoff bandwidth", OFFSET(cutoff), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|E},
 {"frame_size", NULL, OFFSET(frame_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|E},
 {"frame_number", NULL, OFFSET(frame_number), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
@@ -112,9 +118,9 @@ static const AVOption avcodec_options[] = {
           OFFSET(qcompress), AV_OPT_TYPE_FLOAT, {.dbl = 0.5 }, -FLT_MAX, FLT_MAX, V|E},
 {"qblur", "video quantizer scale blur (VBR)", OFFSET(qblur), AV_OPT_TYPE_FLOAT, {.dbl = 0.5 }, -1, FLT_MAX, V|E},
 {"qmin", "minimum video quantizer scale (VBR)", OFFSET(qmin), AV_OPT_TYPE_INT, {.i64 = 2 }, -1, 69, V|E},
-{"qmax", "maximum video quantizer scale (VBR)", OFFSET(qmax), AV_OPT_TYPE_INT, {.i64 = 31 }, -1, 69, V|E},
+{"qmax", "maximum video quantizer scale (VBR)", OFFSET(qmax), AV_OPT_TYPE_INT, {.i64 = 31 }, -1, 1024, V|E},
 {"qdiff", "maximum difference between the quantizer scales (VBR)", OFFSET(max_qdiff), AV_OPT_TYPE_INT, {.i64 = 3 }, INT_MIN, INT_MAX, V|E},
-{"bf", "use 'frames' B frames", OFFSET(max_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, -1, INT_MAX, V|E},
+{"bf", "set maximum number of B frames between non-B-frames", OFFSET(max_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, -1, INT_MAX, V|E},
 {"b_qfactor", "QP factor between P- and B-frames", OFFSET(b_quant_factor), AV_OPT_TYPE_FLOAT, {.dbl = 1.25 }, -FLT_MAX, FLT_MAX, V|E},
 #if FF_API_RC_STRATEGY
 {"rc_strategy", "ratecontrol method", OFFSET(rc_strategy), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
@@ -159,11 +165,15 @@ static const AVOption avcodec_options[] = {
 {"unofficial", "allow unofficial extensions", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_UNOFFICIAL }, INT_MIN, INT_MAX, V|D|E, "strict"},
 {"experimental", "allow non-standardized experimental things", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_EXPERIMENTAL }, INT_MIN, INT_MAX, V|D|E, "strict"},
 {"b_qoffset", "QP offset between P- and B-frames", OFFSET(b_quant_offset), AV_OPT_TYPE_FLOAT, {.dbl = 1.25 }, -FLT_MAX, FLT_MAX, V|E},
-{"err_detect", "set error detection flags", OFFSET(err_recognition), AV_OPT_TYPE_FLAGS, {.i64 = 0}, INT_MIN, INT_MAX, A|V|D, "err_detect"},
-{"crccheck", "verify embedded CRCs", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_CRCCHECK }, INT_MIN, INT_MAX, V|D, "err_detect"},
-{"bitstream", "detect bitstream specification deviations", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BITSTREAM }, INT_MIN, INT_MAX, V|D, "err_detect"},
-{"buffer", "detect improper bitstream length", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BUFFER }, INT_MIN, INT_MAX, V|D, "err_detect"},
-{"explode", "abort decoding on minor error detection", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_EXPLODE }, INT_MIN, INT_MAX, V|D, "err_detect"},
+{"err_detect", "set error detection flags", OFFSET(err_recognition), AV_OPT_TYPE_FLAGS, {.i64 = 0 }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"crccheck", "verify embedded CRCs", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_CRCCHECK }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"bitstream", "detect bitstream specification deviations", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BITSTREAM }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"buffer", "detect improper bitstream length", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BUFFER }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"explode", "abort decoding on minor error detection", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_EXPLODE }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"ignore_err", "ignore errors", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_IGNORE_ERR }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"careful",    "consider things that violate the spec, are fast to check and have not been seen in the wild as errors", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_CAREFUL }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"compliant",  "consider all spec non compliancies as errors", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_COMPLIANT }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"aggressive", "consider things that a sane encoder should not do as an error", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_AGGRESSIVE }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
 {"has_b_frames", NULL, OFFSET(has_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"block_align", NULL, OFFSET(block_align), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"mpeg_quant", "use MPEG quantizers instead of H.263", OFFSET(mpeg_quant), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
@@ -176,9 +186,9 @@ static const AVOption avcodec_options[] = {
 #if FF_API_MPV_OPT
 {"rc_eq", "deprecated, use encoder private options instead", OFFSET(rc_eq), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, V|E},
 #endif
-{"maxrate", "Set maximum bitrate tolerance (in bits/s). Requires bufsize to be set.", OFFSET(rc_max_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
-{"minrate", "Set minimum bitrate tolerance (in bits/s). Most useful in setting up a CBR encode. It is of little use otherwise.",
-            OFFSET(rc_min_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
+{"maxrate", "maximum bitrate (in bits/s). Used for VBV together with bufsize.", OFFSET(rc_max_rate), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, 0, INT_MAX, V|A|E},
+{"minrate", "minimum bitrate (in bits/s). Most useful in setting up a CBR encode. It is of little use otherwise.",
+            OFFSET(rc_min_rate), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
 {"bufsize", "set ratecontrol buffer size (in bits)", OFFSET(rc_buffer_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|V|E},
 #if FF_API_MPV_OPT
 {"rc_buf_aggressivity", "deprecated, use encoder private options instead", OFFSET(rc_buffer_aggressivity), AV_OPT_TYPE_FLOAT, {.dbl = 1.0 }, -FLT_MAX, FLT_MAX, V|E},
@@ -221,14 +231,14 @@ static const AVOption avcodec_options[] = {
 {"ipp", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_IPP }, INT_MIN, INT_MAX, V|E|D, "idct"},
 #endif /* FF_API_UNUSED_MEMBERS */
 {"xvid", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
-#if FF_API_IDCT_XVIDMMX
-{"xvidmmx", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
-#endif /* FF_API_IDCT_XVIDMMX */
+{"xvidmmx", "deprecated, for compatibility only", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"faani", "floating point AAN IDCT", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_FAAN }, INT_MIN, INT_MAX, V|D|E, "idct"},
+{"simpleauto", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEAUTO }, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"slice_count", NULL, OFFSET(slice_count), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"ec", "set error concealment strategy", OFFSET(error_concealment), AV_OPT_TYPE_FLAGS, {.i64 = 3 }, INT_MIN, INT_MAX, V|D, "ec"},
 {"guess_mvs", "iterative motion vector (MV) search (slow)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_EC_GUESS_MVS }, INT_MIN, INT_MAX, V|D, "ec"},
 {"deblock", "use strong deblock filter for damaged MBs", 0, AV_OPT_TYPE_CONST, {.i64 = FF_EC_DEBLOCK }, INT_MIN, INT_MAX, V|D, "ec"},
+{"favor_inter", "favor predicting from the previous frame", 0, AV_OPT_TYPE_CONST, {.i64 = FF_EC_FAVOR_INTER }, INT_MIN, INT_MAX, V|D, "ec"},
 {"bits_per_coded_sample", NULL, OFFSET(bits_per_coded_sample), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"pred", "prediction method", OFFSET(prediction_method), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "pred"},
 {"left", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PRED_LEFT }, INT_MIN, INT_MAX, V|E, "pred"},
@@ -245,6 +255,7 @@ static const AVOption avcodec_options[] = {
 {"mv", "motion vector", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_MV }, INT_MIN, INT_MAX, V|D, "debug"},
 #endif
 {"dct_coeff", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_DCT_COEFF }, INT_MIN, INT_MAX, V|D, "debug"},
+{"green_metadata", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_GREEN_MD }, INT_MIN, INT_MAX, V|D, "debug"},
 {"skip", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_SKIP }, INT_MIN, INT_MAX, V|D, "debug"},
 {"startcode", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_STARTCODE }, INT_MIN, INT_MAX, V|D, "debug"},
 #if FF_API_UNUSED_MEMBERS
@@ -253,14 +264,13 @@ static const AVOption avcodec_options[] = {
 {"er", "error recognition", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_ER }, INT_MIN, INT_MAX, V|D, "debug"},
 {"mmco", "memory management control operations (H.264)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_MMCO }, INT_MIN, INT_MAX, V|D, "debug"},
 {"bugs", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_BUGS }, INT_MIN, INT_MAX, V|D, "debug"},
-#if FF_API_DEBUG_MV
 {"vis_qp", "visualize quantization parameter (QP), lower QP are tinted greener", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_QP }, INT_MIN, INT_MAX, V|D, "debug"},
 {"vis_mb_type", "visualize block types", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_MB_TYPE }, INT_MIN, INT_MAX, V|D, "debug"},
-#endif
 {"buffers", "picture buffer allocations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_BUFFERS }, INT_MIN, INT_MAX, V|D, "debug"},
-{"thread_ops", "threading operations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_THREADS }, INT_MIN, INT_MAX, V|D, "debug"},
-#if FF_API_DEBUG_MV
-{"vismv", "visualize motion vectors (MVs)", OFFSET(debug_mv), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|D, "debug_mv"},
+{"thread_ops", "threading operations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_THREADS }, INT_MIN, INT_MAX, V|A|D, "debug"},
+{"nomc", "skip motion compensation", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_NOMC }, INT_MIN, INT_MAX, V|A|D, "debug"},
+#if FF_API_VISMV
+{"vismv", "visualize motion vectors (MVs) (deprecated)", OFFSET(debug_mv), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, INT_MAX, V|D, "debug_mv"},
 {"pf", "forward predicted MVs of P-frames", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_MV_P_FOR }, INT_MIN, INT_MAX, V|D, "debug_mv"},
 {"bf", "forward predicted MVs of B-frames", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_MV_B_FOR }, INT_MIN, INT_MAX, V|D, "debug_mv"},
 {"bb", "backward predicted MVs of B-frames", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_MV_B_BACK }, INT_MIN, INT_MAX, V|D, "debug_mv"},
@@ -284,6 +294,10 @@ static const AVOption avcodec_options[] = {
 {"vsad", "sum of absolute vertical differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_VSAD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"vsse", "sum of squared vertical differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_VSSE }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"nsse", "noise preserving sum of squared differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_NSSE }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+#if CONFIG_SNOW_ENCODER
+{"w53", "5/3 wavelet, only used in snow", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_W53 }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"w97", "9/7 wavelet, only used in snow", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_W97 }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+#endif
 {"dctmax", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_DCTMAX }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"chroma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_CHROMA }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"pre_dia_size", "diamond type & size for motion estimation pre-pass", OFFSET(pre_dia_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
@@ -310,7 +324,7 @@ static const AVOption avcodec_options[] = {
 #if FF_API_XVMC
 {"xvmc_acceleration", NULL, OFFSET(xvmc_acceleration), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 #endif /* FF_API_XVMC */
-{"mbd", "macroblock decision algorithm (high quality mode)", OFFSET(mb_decision), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "mbd"},
+{"mbd", "macroblock decision algorithm (high quality mode)", OFFSET(mb_decision), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, 2, V|E, "mbd"},
 {"simple", "use mbcmp (default)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_SIMPLE }, INT_MIN, INT_MAX, V|E, "mbd"},
 {"bits", "use fewest bits", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_BITS }, INT_MIN, INT_MAX, V|E, "mbd"},
 {"rd", "use best rate distortion", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_RD }, INT_MIN, INT_MAX, V|E, "mbd"},
@@ -328,13 +342,13 @@ static const AVOption avcodec_options[] = {
 #if FF_API_ERROR_RATE
 {"error", NULL, OFFSET(error_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 #endif
-{"threads", NULL, OFFSET(thread_count), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, INT_MAX, V|E|D, "threads"},
+{"threads", "set the number of threads", OFFSET(thread_count), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, INT_MAX, V|A|E|D, "threads"},
 {"auto", "autodetect a suitable number of threads to use", 0, AV_OPT_TYPE_CONST, {.i64 = 0 }, INT_MIN, INT_MAX, V|E|D, "threads"},
 #if FF_API_MPV_OPT
 {"me_threshold", "motion estimation threshold", OFFSET(me_threshold), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 {"mb_threshold", "macroblock threshold", OFFSET(mb_threshold), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 #endif
-{"dc", "intra_dc_precision", OFFSET(intra_dc_precision), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX, V|E},
+{"dc", "intra_dc_precision", OFFSET(intra_dc_precision), AV_OPT_TYPE_INT, {.i64 = 0 }, -8, 16, V|E},
 {"nssew", "nsse weight", OFFSET(nsse_weight), AV_OPT_TYPE_INT, {.i64 = 8 }, INT_MIN, INT_MAX, V|E},
 {"skip_top", "number of macroblock rows at the top which are skipped", OFFSET(skip_top), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|D},
 {"skip_bottom", "number of macroblock rows at the bottom which are skipped", OFFSET(skip_bottom), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|D},
@@ -355,8 +369,13 @@ static const AVOption avcodec_options[] = {
 {"dts_96_24", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_DTS_96_24 }, INT_MIN, INT_MAX, A|E, "profile"},
 {"dts_hd_hra", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_DTS_HD_HRA }, INT_MIN, INT_MAX, A|E, "profile"},
 {"dts_hd_ma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_DTS_HD_MA }, INT_MIN, INT_MAX, A|E, "profile"},
+{"mpeg4_sp",   NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_SIMPLE }, INT_MIN, INT_MAX, V|E, "profile"},
+{"mpeg4_core", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_CORE }, INT_MIN, INT_MAX, V|E, "profile"},
+{"mpeg4_main", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_MAIN }, INT_MIN, INT_MAX, V|E, "profile"},
+{"mpeg4_asp",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_ADVANCED_SIMPLE }, INT_MIN, INT_MAX, V|E, "profile"},
 {"level", NULL, OFFSET(level), AV_OPT_TYPE_INT, {.i64 = FF_LEVEL_UNKNOWN }, INT_MIN, INT_MAX, V|A|E, "level"},
 {"unknown", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LEVEL_UNKNOWN }, INT_MIN, INT_MAX, V|A|E, "level"},
+{"lowres", "decode at 1= 1/2, 2=1/4, 3=1/8 resolutions", OFFSET(lowres), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|A|D},
 {"skip_threshold", "frame skip threshold", OFFSET(frame_skip_threshold), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 {"skip_factor", "frame skip factor", OFFSET(frame_skip_factor), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 {"skip_exp", "frame skip exponent", OFFSET(frame_skip_exp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
@@ -367,18 +386,19 @@ static const AVOption avcodec_options[] = {
 {"mblmin", "minimum macroblock Lagrange factor (VBR)", OFFSET(mb_lmin), AV_OPT_TYPE_INT, {.i64 = FF_QP2LAMBDA * 2 }, 1, FF_LAMBDA_MAX, V|E},
 {"mblmax", "maximum macroblock Lagrange factor (VBR)", OFFSET(mb_lmax), AV_OPT_TYPE_INT, {.i64 = FF_QP2LAMBDA * 31 }, 1, FF_LAMBDA_MAX, V|E},
 {"mepc", "motion estimation bitrate penalty compensation (1.0 = 256)", OFFSET(me_penalty_compensation), AV_OPT_TYPE_INT, {.i64 = 256 }, INT_MIN, INT_MAX, V|E},
-{"skip_loop_filter", NULL, OFFSET(skip_loop_filter), AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"skip_idct"       , NULL, OFFSET(skip_idct)       , AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"skip_frame"      , NULL, OFFSET(skip_frame)      , AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"none"            , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONE    }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"default"         , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"noref"           , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONREF  }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"bidir"           , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_BIDIR   }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"nokey"           , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONKEY  }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"all"             , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_ALL     }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"skip_loop_filter", "skip loop filtering process for the selected frames", OFFSET(skip_loop_filter), AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"skip_idct"       , "skip IDCT/dequantization for the selected frames",    OFFSET(skip_idct),        AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"skip_frame"      , "skip decoding for the selected frames",               OFFSET(skip_frame),       AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"none"            , "discard no frame",                    0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONE    }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"default"         , "discard useless frames",              0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"noref"           , "discard all non-reference frames",    0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONREF  }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"bidir"           , "discard all bidirectional frames",    0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_BIDIR   }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"nokey"           , "discard all frames except keyframes", 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONKEY  }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"nointra"         , "discard all frames except I frames",  0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONINTRA}, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"all"             , "discard all frames",                  0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_ALL     }, INT_MIN, INT_MAX, V|D, "avdiscard"},
 {"bidir_refine", "refine the two motion vectors used in bidirectional macroblocks", OFFSET(bidir_refine), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, 4, V|E},
 {"brd_scale", "downscale frames for dynamic B-frame decision", OFFSET(brd_scale), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, 10, V|E},
-{"keyint_min", "minimum interval between IDR-frames (x264)", OFFSET(keyint_min), AV_OPT_TYPE_INT, {.i64 = 25 }, INT_MIN, INT_MAX, V|E},
+{"keyint_min", "minimum interval between IDR-frames", OFFSET(keyint_min), AV_OPT_TYPE_INT, {.i64 = 25 }, INT_MIN, INT_MAX, V|E},
 {"refs", "reference frames to consider for motion compensation", OFFSET(refs), AV_OPT_TYPE_INT, {.i64 = 1 }, INT_MIN, INT_MAX, V|E},
 {"chromaoffset", "chroma QP offset from luma", OFFSET(chromaoffset), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 {"trellis", "rate-distortion optimal quantization", OFFSET(trellis), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
@@ -390,23 +410,23 @@ static const AVOption avcodec_options[] = {
 {"compression_level", NULL, OFFSET(compression_level), AV_OPT_TYPE_INT, {.i64 = FF_COMPRESSION_DEFAULT }, INT_MIN, INT_MAX, V|A|E},
 {"min_prediction_order", NULL, OFFSET(min_prediction_order), AV_OPT_TYPE_INT, {.i64 = -1 }, INT_MIN, INT_MAX, A|E},
 {"max_prediction_order", NULL, OFFSET(max_prediction_order), AV_OPT_TYPE_INT, {.i64 = -1 }, INT_MIN, INT_MAX, A|E},
-{"timecode_frame_start", "GOP timecode frame start number, in non-drop-frame format", OFFSET(timecode_frame_start), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, INT64_MAX, V|E},
+{"timecode_frame_start", "GOP timecode frame start number, in non-drop-frame format", OFFSET(timecode_frame_start), AV_OPT_TYPE_INT64, {.i64 = -1 }, -1, INT64_MAX, V|E},
 {"bits_per_raw_sample", NULL, OFFSET(bits_per_raw_sample), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"channel_layout", NULL, OFFSET(channel_layout), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, 0, INT64_MAX, A|E|D, "channel_layout"},
 {"request_channel_layout", NULL, OFFSET(request_channel_layout), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, 0, INT64_MAX, A|D, "request_channel_layout"},
-{"rc_max_vbv_use", NULL, OFFSET(rc_max_available_vbv_use), AV_OPT_TYPE_FLOAT, {.dbl = 1.0/3 }, 0.0, FLT_MAX, V|E},
+{"rc_max_vbv_use", NULL, OFFSET(rc_max_available_vbv_use), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, 0.0, FLT_MAX, V|E},
 {"rc_min_vbv_use", NULL, OFFSET(rc_min_vbv_overflow_use),  AV_OPT_TYPE_FLOAT, {.dbl = 3 },     0.0, FLT_MAX, V|E},
 {"ticks_per_frame", NULL, OFFSET(ticks_per_frame), AV_OPT_TYPE_INT, {.i64 = 1 }, 1, INT_MAX, A|V|E|D},
 {"color_primaries", "color primaries", OFFSET(color_primaries), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_UNSPECIFIED }, 1, AVCOL_PRI_NB-1, V|E|D, "color_primaries_type"},
-{"bt709",       "BT.709",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709 },       INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"unspecified", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"bt470m",      "BT.470 M",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470M },      INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"bt470bg",     "BT.470 BG",   0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470BG },     INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"smpte170m",   "SMPTE 170 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE170M },   INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"smpte240m",   "SMPTE 240 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE240M },   INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"film",        "Film",        0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_FILM },        INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"bt2020",      "BT.2020",     0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020 },      INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"smptest428_1", "SMPTE ST 428-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTEST428_1 }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"bt709",       "BT.709",         0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709 },        INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"unspecified", "Unspecified",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_UNSPECIFIED },  INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"bt470m",      "BT.470 M",       0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470M },       INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"bt470bg",     "BT.470 BG",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470BG },      INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"smpte170m",   "SMPTE 170 M",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE170M },    INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"smpte240m",   "SMPTE 240 M",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE240M },    INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"film",        "Film",           0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_FILM },         INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"bt2020",      "BT.2020",        0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020 },       INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"smpte428_1",  "SMPTE ST 428-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTEST428_1 }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
 {"color_trc", "color transfer characteristics", OFFSET(color_trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_UNSPECIFIED }, 1, AVCOL_TRC_NB-1, V|E|D, "color_trc_type"},
 {"bt709",        "BT.709",           0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709 },        INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
 {"unspecified",  "Unspecified",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_UNSPECIFIED },  INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
@@ -422,8 +442,8 @@ static const AVOption avcodec_options[] = {
 {"iec61966_2_1", "IEC 61966-2-1",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_1 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
 {"bt2020_10bit", "BT.2020 - 10 bit", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10 },    INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
 {"bt2020_12bit", "BT.2020 - 12 bit", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_12 },    INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
-{"smptest2084",  "SMPTE ST 2084",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST2084 },  INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
-{"smptest428_1", "SMPTE ST 428-1",   0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST428_1 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
+{"smpte2084",    "SMPTE ST 2084",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST2084 },  INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
+{"smpte428_1",   "SMPTE ST 428-1",   0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST428_1 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
 {"colorspace", "color space", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_UNSPECIFIED }, 0, AVCOL_SPC_NB-1, V|E|D, "colorspace_type"},
 {"rgb",         "RGB",         0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_RGB },         INT_MIN, INT_MAX, V|E|D, "colorspace_type"},
 {"bt709",       "BT.709",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709 },       INT_MIN, INT_MAX, V|E|D, "colorspace_type"},
@@ -448,8 +468,8 @@ static const AVOption avcodec_options[] = {
 {"bottomleft",  "Bottom-left", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_BOTTOMLEFT },  INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"},
 {"bottom",      "Bottom",      0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_BOTTOM },      INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"},
 {"log_level_offset", "set the log level offset", OFFSET(log_level_offset), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX },
-{"slices", "number of slices, used in parallelized encoding", OFFSET(slices), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|E},
-{"thread_type", "select multithreading type", OFFSET(thread_type), AV_OPT_TYPE_FLAGS, {.i64 = FF_THREAD_SLICE|FF_THREAD_FRAME }, 0, INT_MAX, V|E|D, "thread_type"},
+{"slices", "set the number of slices, used in parallelized encoding", OFFSET(slices), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|E},
+{"thread_type", "select multithreading type", OFFSET(thread_type), AV_OPT_TYPE_FLAGS, {.i64 = FF_THREAD_SLICE|FF_THREAD_FRAME }, 0, INT_MAX, V|A|E|D, "thread_type"},
 {"slice", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_THREAD_SLICE }, INT_MIN, INT_MAX, V|E|D, "thread_type"},
 {"frame", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_THREAD_FRAME }, INT_MIN, INT_MAX, V|E|D, "thread_type"},
 {"audio_service_type", "audio service type", OFFSET(audio_service_type), AV_OPT_TYPE_INT, {.i64 = AV_AUDIO_SERVICE_TYPE_MAIN }, 0, AV_AUDIO_SERVICE_TYPE_NB-1, A|E, "audio_service_type"},
@@ -462,21 +482,28 @@ static const AVOption avcodec_options[] = {
 {"em", "Emergency",          0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_EMERGENCY },         INT_MIN, INT_MAX, A|E, "audio_service_type"},
 {"vo", "Voice Over",         0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_VOICE_OVER },        INT_MIN, INT_MAX, A|E, "audio_service_type"},
 {"ka", "Karaoke",            0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE },           INT_MIN, INT_MAX, A|E, "audio_service_type"},
-{"request_sample_fmt", NULL, OFFSET(request_sample_fmt), AV_OPT_TYPE_INT, {.i64 = AV_SAMPLE_FMT_NONE }, AV_SAMPLE_FMT_NONE, AV_SAMPLE_FMT_NB-1, A|D, "request_sample_fmt"},
-{"u8" , "8-bit unsigned integer", 0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_U8  }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"s16", "16-bit signed integer",  0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_S16 }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"s32", "32-bit signed integer",  0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_S32 }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"flt", "32-bit float",           0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_FLT }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"dbl", "64-bit double",          0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_DBL }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"u8p" , "8-bit unsigned integer planar", 0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_U8P  }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"s16p", "16-bit signed integer planar",  0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_S16P }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"s32p", "32-bit signed integer planar",  0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_S32P }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"fltp", "32-bit float planar",           0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_FLTP }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"dblp", "64-bit double planar",          0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_DBLP }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"refcounted_frames", NULL, OFFSET(refcounted_frames), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, A|V|D },
+{"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, INT_MAX, A|D, "request_sample_fmt"},
+{"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
+{"sub_charenc", "set input text subtitles character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
+{"sub_charenc_mode", "set input text subtitles character encoding mode", OFFSET(sub_charenc_mode), AV_OPT_TYPE_FLAGS, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC}, -1, INT_MAX, S|D, "sub_charenc_mode"},
+{"do_nothing",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_DO_NOTHING},  INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
+{"auto",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC},   INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
+{"pre_decoder", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_PRE_DECODER}, INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
+{"refcounted_frames", NULL, OFFSET(refcounted_frames), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, A|V|D },
 #if FF_API_SIDEDATA_ONLY_PKT
-{"side_data_only_packets", NULL, OFFSET(side_data_only_packets), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, A|V|E },
+{"side_data_only_packets", NULL, OFFSET(side_data_only_packets), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, A|V|E },
 #endif
+{"skip_alpha", "Skip processing alpha", OFFSET(skip_alpha), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, V|D },
+{"field_order", "Field order", OFFSET(field_order), AV_OPT_TYPE_INT, {.i64 = AV_FIELD_UNKNOWN }, 0, 5, V|D|E, "field_order" },
+{"progressive", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_PROGRESSIVE }, 0, 0, V|D|E, "field_order" },
+{"tt", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_TT }, 0, 0, V|D|E, "field_order" },
+{"bb", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_BB }, 0, 0, V|D|E, "field_order" },
+{"tb", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_TB }, 0, 0, V|D|E, "field_order" },
+{"bt", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_BT }, 0, 0, V|D|E, "field_order" },
+{"dump_separator", "set information dump field separator", OFFSET(dump_separator), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, A|V|S|D|E},
+{"codec_whitelist", "List of decoders that are allowed to be used", OFFSET(codec_whitelist), AV_OPT_TYPE_STRING, { .str = NULL },  CHAR_MIN, CHAR_MAX, A|V|S|D },
+{"pixel_format", "set pixel format", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64=AV_PIX_FMT_NONE}, -1, INT_MAX, 0 },
+{"video_size", "set video size", OFFSET(width), AV_OPT_TYPE_IMAGE_SIZE, {.str=NULL}, 0, INT_MAX, 0 },
 {NULL},
 };
 
diff --git a/libavcodec/opus.c b/libavcodec/opus.c
index 1bc417b477..6b3d3c310e 100644
--- a/libavcodec/opus.c
+++ b/libavcodec/opus.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/opus.h b/libavcodec/opus.h
index 55c91fa012..3a7ea9f504 100644
--- a/libavcodec/opus.h
+++ b/libavcodec/opus.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@
 #include "libavutil/float_dsp.h"
 #include "libavutil/frame.h"
 
-#include "libavresample/avresample.h"
+#include "libswresample/swresample.h"
 
 #include "avcodec.h"
 #include "get_bits.h"
@@ -57,7 +57,7 @@
 #define SILK_HISTORY                 322
 #define SILK_MAX_LPC                 16
 
-#define ROUND_MULL(a,b,s) (((MUL64(a, b) >> (s - 1)) + 1) >> 1)
+#define ROUND_MULL(a,b,s) (((MUL64(a, b) >> ((s) - 1)) + 1) >> 1)
 #define ROUND_MUL16(a,b)  ((MUL16(a, b) + 16384) >> 15)
 #define opus_ilog(i) (av_log2(i) + !!(i))
 
@@ -104,19 +104,19 @@ typedef struct SilkContext SilkContext;
 typedef struct CeltContext CeltContext;
 
 typedef struct OpusPacket {
-    int packet_size;                /** packet size */
-    int data_size;                  /** size of the useful data -- packet size - padding */
-    int code;                       /** packet code: specifies the frame layout */
-    int stereo;                     /** whether this packet is mono or stereo */
-    int vbr;                        /** vbr flag */
-    int config;                     /** configuration: tells the audio mode,
+    int packet_size;                /**< packet size */
+    int data_size;                  /**< size of the useful data -- packet size - padding */
+    int code;                       /**< packet code: specifies the frame layout */
+    int stereo;                     /**< whether this packet is mono or stereo */
+    int vbr;                        /**< vbr flag */
+    int config;                     /**< configuration: tells the audio mode,
                                      **                bandwidth, and frame duration */
-    int frame_count;                /** frame count */
-    int frame_offset[MAX_FRAMES];   /** frame offsets */
-    int frame_size[MAX_FRAMES];     /** frame sizes */
-    int frame_duration;             /** frame duration, in samples @ 48kHz */
-    enum OpusMode mode;             /** mode */
-    enum OpusBandwidth bandwidth;   /** bandwidth */
+    int frame_count;                /**< frame count */
+    int frame_offset[MAX_FRAMES];   /**< frame offsets */
+    int frame_size[MAX_FRAMES];     /**< frame sizes */
+    int frame_duration;             /**< frame duration, in samples @ 48kHz */
+    enum OpusMode mode;             /**< mode */
+    enum OpusBandwidth bandwidth;   /**< bandwidth */
 } OpusPacket;
 
 typedef struct OpusStreamContext {
@@ -144,7 +144,7 @@ typedef struct OpusStreamContext {
     float *out_dummy;
     int    out_dummy_allocated_size;
 
-    AVAudioResampleContext *avr;
+    SwrContext *swr;
     AVAudioFifo *celt_delay;
     int silk_samplerate;
     /* number of samples we still want to get from the resampler */
@@ -186,7 +186,7 @@ typedef struct OpusContext {
     int             nb_streams;
     int      nb_stereo_streams;
 
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     int16_t gain_i;
     float   gain;
 
@@ -289,7 +289,7 @@ static av_always_inline unsigned int opus_getrawbits(OpusRangeCoder *rc, unsigne
         rc->rb.bytes--;
     }
 
-    value = rc->rb.cacheval & ((1<<count)-1);
+    value = av_mod_uintp2(rc->rb.cacheval, count);
     rc->rb.cacheval    >>= count;
     rc->rb.cachelen     -= count;
     rc->total_read_bits += count;
diff --git a/libavcodec/opus_celt.c b/libavcodec/opus_celt.c
index 07a4f775f9..9911de3848 100644
--- a/libavcodec/opus_celt.c
+++ b/libavcodec/opus_celt.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,7 +62,7 @@ struct CeltContext {
     // constant values that do not change during context lifetime
     AVCodecContext    *avctx;
     IMDCT15Context    *imdct[4];
-    AVFloatDSPContext  dsp;
+    AVFloatDSPContext  *dsp;
     int output_channels;
 
     // values that have inter-frame effect and must be reset on flush
@@ -991,7 +991,7 @@ static inline int celt_pulses2bits(const uint8_t *cache, int pulses)
    return (pulses == 0) ? 0 : cache[pulses] + 1;
 }
 
-static inline void celt_normalize_residual(const int * restrict iy, float * restrict X,
+static inline void celt_normalize_residual(const int * av_restrict iy, float * av_restrict X,
                                            int N, float g)
 {
     int i;
@@ -1295,7 +1295,7 @@ static inline float celt_decode_pulses(OpusRangeCoder *rc, int *y, unsigned int
 {
     unsigned int idx;
 #define CELT_PVQ_U(n, k) (celt_pvq_u_row[FFMIN(n, k)][FFMAX(n, k)])
-#define CELT_PVQ_V(n, k) (CELT_PVQ_U(n, k) + CELT_PVQ_U(n, k + 1))
+#define CELT_PVQ_V(n, k) (CELT_PVQ_U(n, k) + CELT_PVQ_U(n, (k) + 1))
     idx = opus_rc_unimodel(rc, CELT_PVQ_V(N, K));
     return celt_cwrsi(N, K, idx, y);
 }
@@ -1454,7 +1454,7 @@ static unsigned int celt_decode_band(CeltContext *s, OpusRangeCoder *rc,
         if (itheta == 0) {
             imid = 32767;
             iside = 0;
-            fill &= (1 << blocks) - 1;
+            fill = av_mod_uintp2(fill, blocks);
             delta = -16384;
         } else if (itheta == 16384) {
             imid = 0;
@@ -1666,7 +1666,7 @@ static unsigned int celt_decode_band(CeltContext *s, OpusRangeCoder *rc,
             for (j = 0; j < N0; j++)
                 lowband_out[j] = n * X[j];
         }
-        cm &= (1 << blocks) - 1;
+        cm = av_mod_uintp2(cm, blocks);
     }
     return cm;
 }
@@ -2072,7 +2072,7 @@ int ff_celt_decode_frame(CeltContext *s, OpusRangeCoder *rc,
 
     /* stereo -> mono downmix */
     if (s->output_channels < s->coded_channels) {
-        s->dsp.vector_fmac_scalar(s->coeffs[0], s->coeffs[1], 1.0, FFALIGN(frame_size, 16));
+        s->dsp->vector_fmac_scalar(s->coeffs[0], s->coeffs[1], 1.0, FFALIGN(frame_size, 16));
         imdct_scale = 0.5;
     } else if (s->output_channels > s->coded_channels)
         memcpy(s->coeffs[1], s->coeffs[0], frame_size * sizeof(float));
@@ -2098,7 +2098,7 @@ int ff_celt_decode_frame(CeltContext *s, OpusRangeCoder *rc,
 
             imdct->imdct_half(imdct, dst + CELT_OVERLAP / 2, s->coeffs[i] + j,
                               s->blocks, imdct_scale);
-            s->dsp.vector_fmul_window(dst, dst, dst + CELT_OVERLAP / 2,
+            s->dsp->vector_fmul_window(dst, dst, dst + CELT_OVERLAP / 2,
                                       celt_window, CELT_OVERLAP / 2);
         }
 
@@ -2181,6 +2181,7 @@ void ff_celt_free(CeltContext **ps)
     for (i = 0; i < FF_ARRAY_ELEMS(s->imdct); i++)
         ff_imdct15_uninit(&s->imdct[i]);
 
+    av_freep(&s->dsp);
     av_freep(ps);
 }
 
@@ -2208,7 +2209,11 @@ int ff_celt_init(AVCodecContext *avctx, CeltContext **ps, int output_channels)
             goto fail;
     }
 
-    avpriv_float_dsp_init(&s->dsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->dsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->dsp) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
 
     ff_celt_flush(s);
 
diff --git a/libavcodec/opus_parser.c b/libavcodec/opus_parser.c
index d256fbb2d5..c30fd7bbd4 100644
--- a/libavcodec/opus_parser.c
+++ b/libavcodec/opus_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/opus_silk.c b/libavcodec/opus_silk.c
index f881325d7b..841d1ed25c 100644
--- a/libavcodec/opus_silk.c
+++ b/libavcodec/opus_silk.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/opusdec.c b/libavcodec/opusdec.c
index acae6e1f66..f07f502d18 100644
--- a/libavcodec/opusdec.c
+++ b/libavcodec/opusdec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,11 +40,9 @@
 #include "libavutil/channel_layout.h"
 #include "libavutil/opt.h"
 
-#include "libavresample/avresample.h"
+#include "libswresample/swresample.h"
 
 #include "avcodec.h"
-#include "celp_filters.h"
-#include "fft.h"
 #include "get_bits.h"
 #include "internal.h"
 #include "mathops.h"
@@ -114,9 +112,9 @@ static int opus_flush_resample(OpusStreamContext *s, int nb_samples)
 {
     int celt_size = av_audio_fifo_size(s->celt_delay);
     int ret, i;
-
-    ret = avresample_convert(s->avr, (uint8_t**)s->out, s->out_size, nb_samples,
-                             NULL, 0, 0);
+    ret = swr_convert(s->swr,
+                      (uint8_t**)s->out, nb_samples,
+                      NULL, 0);
     if (ret < 0)
         return ret;
     else if (ret != nb_samples) {
@@ -155,19 +153,20 @@ static int opus_flush_resample(OpusStreamContext *s, int nb_samples)
 
 static int opus_init_resample(OpusStreamContext *s)
 {
-    float delay[16] = { 0.0 };
-    uint8_t *delayptr[2] = { (uint8_t*)delay, (uint8_t*)delay };
+    static const float delay[16] = { 0.0 };
+    const uint8_t *delayptr[2] = { (uint8_t*)delay, (uint8_t*)delay };
     int ret;
 
-    av_opt_set_int(s->avr, "in_sample_rate", s->silk_samplerate, 0);
-    ret = avresample_open(s->avr);
+    av_opt_set_int(s->swr, "in_sample_rate", s->silk_samplerate, 0);
+    ret = swr_init(s->swr);
     if (ret < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Error opening the resampler.\n");
         return ret;
     }
 
-    ret = avresample_convert(s->avr, NULL, 0, 0, delayptr, sizeof(delay),
-                             silk_resample_delay[s->packet.bandwidth]);
+    ret = swr_convert(s->swr,
+                      NULL, 0,
+                      delayptr, silk_resample_delay[s->packet.bandwidth]);
     if (ret < 0) {
         av_log(s->avctx, AV_LOG_ERROR,
                "Error feeding initial silence to the resampler.\n");
@@ -218,7 +217,7 @@ static int opus_decode_frame(OpusStreamContext *s, const uint8_t *data, int size
 
     /* decode the silk frame */
     if (s->packet.mode == OPUS_MODE_SILK || s->packet.mode == OPUS_MODE_HYBRID) {
-        if (!avresample_is_open(s->avr)) {
+        if (!swr_is_initialized(s->swr)) {
             ret = opus_init_resample(s);
             if (ret < 0)
                 return ret;
@@ -232,16 +231,14 @@ static int opus_decode_frame(OpusStreamContext *s, const uint8_t *data, int size
             av_log(s->avctx, AV_LOG_ERROR, "Error decoding a SILK frame.\n");
             return samples;
         }
-
-        samples = avresample_convert(s->avr, (uint8_t**)s->out, s->out_size,
-                                     s->packet.frame_duration,
-                                     (uint8_t**)s->silk_output,
-                                     sizeof(s->silk_buf[0]),
-                                     samples);
+        samples = swr_convert(s->swr,
+                              (uint8_t**)s->out, s->packet.frame_duration,
+                              (const uint8_t**)s->silk_output, samples);
         if (samples < 0) {
             av_log(s->avctx, AV_LOG_ERROR, "Error resampling SILK data.\n");
             return samples;
         }
+        av_assert2((samples & 7) == 0);
         s->delayed_samples += s->packet.frame_duration - samples;
     } else
         ff_silk_flush(s->silk);
@@ -379,10 +376,10 @@ static int opus_decode_subpacket(OpusStreamContext *s,
     s->out_size = out_size;
 
     /* check if we need to flush the resampler */
-    if (avresample_is_open(s->avr)) {
+    if (swr_is_initialized(s->swr)) {
         if (buf) {
             int64_t cur_samplerate;
-            av_opt_get_int(s->avr, "in_sample_rate", 0, &cur_samplerate);
+            av_opt_get_int(s->swr, "in_sample_rate", 0, &cur_samplerate);
             flush_needed = (s->packet.mode == OPUS_MODE_CELT) || (cur_samplerate != s->silk_samplerate);
         } else {
             flush_needed = !!s->delayed_samples;
@@ -411,7 +408,7 @@ static int opus_decode_subpacket(OpusStreamContext *s,
             av_log(s->avctx, AV_LOG_ERROR, "Error flushing the resampler.\n");
             return ret;
         }
-        avresample_close(s->avr);
+        swr_close(s->swr);
         output_samples += s->delayed_samples;
         s->delayed_samples = 0;
 
@@ -461,8 +458,11 @@ static int opus_decode_packet(AVCodecContext *avctx, void *data,
 
     /* calculate the number of delayed samples */
     for (i = 0; i < c->nb_streams; i++) {
+        OpusStreamContext *s = &c->streams[i];
+        s->out[0] =
+        s->out[1] = NULL;
         delayed_samples = FFMAX(delayed_samples,
-                                c->streams[i].delayed_samples + av_audio_fifo_size(c->sync_buffers[i]));
+                                s->delayed_samples + av_audio_fifo_size(c->sync_buffers[i]));
     }
 
     /* decode the header of the first sub-packet to find out the sample count */
@@ -487,10 +487,8 @@ static int opus_decode_packet(AVCodecContext *avctx, void *data,
 
     /* setup the data buffers */
     ret = ff_get_buffer(avctx, frame, 0);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if (ret < 0)
         return ret;
-    }
     frame->nb_samples = 0;
 
     memset(c->out, 0, c->nb_streams * 2 * sizeof(*c->out));
@@ -588,7 +586,7 @@ static int opus_decode_packet(AVCodecContext *avctx, void *data,
         }
 
         if (c->gain_i) {
-            c->fdsp.vector_fmul_scalar((float*)frame->extended_data[i],
+            c->fdsp->vector_fmul_scalar((float*)frame->extended_data[i],
                                        (float*)frame->extended_data[i],
                                        c->gain, FFALIGN(decoded_samples, 8));
         }
@@ -613,7 +611,7 @@ static av_cold void opus_decode_flush(AVCodecContext *ctx)
 
         if (s->celt_delay)
             av_audio_fifo_drain(s->celt_delay, av_audio_fifo_size(s->celt_delay));
-        avresample_close(s->avr);
+        swr_close(s->swr);
 
         av_audio_fifo_drain(c->sync_buffers[i], av_audio_fifo_size(c->sync_buffers[i]));
 
@@ -637,7 +635,7 @@ static av_cold int opus_decode_close(AVCodecContext *avctx)
         s->out_dummy_allocated_size = 0;
 
         av_audio_fifo_free(s->celt_delay);
-        avresample_free(&s->avr);
+        swr_free(&s->swr);
     }
 
     av_freep(&c->streams);
@@ -654,6 +652,7 @@ static av_cold int opus_decode_close(AVCodecContext *avctx)
     c->nb_streams = 0;
 
     av_freep(&c->channel_maps);
+    av_freep(&c->fdsp);
 
     return 0;
 }
@@ -666,7 +665,9 @@ static av_cold int opus_decode_init(AVCodecContext *avctx)
     avctx->sample_fmt  = AV_SAMPLE_FMT_FLTP;
     avctx->sample_rate = 48000;
 
-    avpriv_float_dsp_init(&c->fdsp, 0);
+    c->fdsp = avpriv_float_dsp_alloc(0);
+    if (!c->fdsp)
+        return AVERROR(ENOMEM);
 
     /* find out the channel configuration */
     ret = ff_opus_parse_extradata(avctx, c);
@@ -699,18 +700,19 @@ static av_cold int opus_decode_init(AVCodecContext *avctx)
             s->redundancy_output[j] = s->redundancy_buf[j];
         }
 
-        s->fdsp = &c->fdsp;
+        s->fdsp = c->fdsp;
 
-        s->avr = avresample_alloc_context();
-        if (!s->avr)
+        s->swr =swr_alloc();
+        if (!s->swr)
             goto fail;
 
         layout = (s->output_channels == 1) ? AV_CH_LAYOUT_MONO : AV_CH_LAYOUT_STEREO;
-        av_opt_set_int(s->avr, "in_sample_fmt",      avctx->sample_fmt,  0);
-        av_opt_set_int(s->avr, "out_sample_fmt",     avctx->sample_fmt,  0);
-        av_opt_set_int(s->avr, "in_channel_layout",  layout,             0);
-        av_opt_set_int(s->avr, "out_channel_layout", layout,             0);
-        av_opt_set_int(s->avr, "out_sample_rate",    avctx->sample_rate, 0);
+        av_opt_set_int(s->swr, "in_sample_fmt",      avctx->sample_fmt,  0);
+        av_opt_set_int(s->swr, "out_sample_fmt",     avctx->sample_fmt,  0);
+        av_opt_set_int(s->swr, "in_channel_layout",  layout,             0);
+        av_opt_set_int(s->swr, "out_channel_layout", layout,             0);
+        av_opt_set_int(s->swr, "out_sample_rate",    avctx->sample_rate, 0);
+        av_opt_set_int(s->swr, "filter_size",        16,                 0);
 
         ret = ff_silk_init(avctx, &s->silk, s->output_channels);
         if (ret < 0)
diff --git a/libavcodec/paf.h b/libavcodec/paf.h
new file mode 100644
index 0000000000..ce8245f223
--- /dev/null
+++ b/libavcodec/paf.h
@@ -0,0 +1,28 @@
+/*
+ * Packed Animation File decoder/demuxer common code
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PAF_H
+#define AVCODEC_PAF_H
+
+#define PAF_SOUND_SAMPLES     2205
+#define PAF_SOUND_FRAME_SIZE  ((256 + PAF_SOUND_SAMPLES) * 2)
+
+#endif /* AVCODEC_PAF_H */
diff --git a/libavcodec/pafaudio.c b/libavcodec/pafaudio.c
index c83e7f5cfc..12f473ae0a 100644
--- a/libavcodec/pafaudio.c
+++ b/libavcodec/pafaudio.c
@@ -2,20 +2,20 @@
  * Packed Animation File audio decoder
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,9 +24,7 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "mathops.h"
-
-#define PAF_SOUND_SAMPLES     2205
-#define PAF_SOUND_FRAME_SIZE  ((256 + PAF_SOUND_SAMPLES) * 2)
+#include "paf.h"
 
 static av_cold int paf_audio_init(AVCodecContext *avctx)
 {
diff --git a/libavcodec/pafvideo.c b/libavcodec/pafvideo.c
index b77f47e005..cab3129f8f 100644
--- a/libavcodec/pafvideo.c
+++ b/libavcodec/pafvideo.c
@@ -2,20 +2,20 @@
  * Packed Animation File video decoder
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,15 +26,24 @@
 #include "copy_block.h"
 #include "internal.h"
 
+
 static const uint8_t block_sequences[16][8] = {
-    { 0, 0, 0, 0, 0, 0, 0, 0 }, { 2, 0, 0, 0, 0, 0, 0, 0 },
-    { 5, 7, 0, 0, 0, 0, 0, 0 }, { 5, 0, 0, 0, 0, 0, 0, 0 },
-    { 6, 0, 0, 0, 0, 0, 0, 0 }, { 5, 7, 5, 7, 0, 0, 0, 0 },
-    { 5, 7, 5, 0, 0, 0, 0, 0 }, { 5, 7, 6, 0, 0, 0, 0, 0 },
-    { 5, 5, 0, 0, 0, 0, 0, 0 }, { 3, 0, 0, 0, 0, 0, 0, 0 },
-    { 6, 6, 0, 0, 0, 0, 0, 0 }, { 2, 4, 0, 0, 0, 0, 0, 0 },
-    { 2, 4, 5, 7, 0, 0, 0, 0 }, { 2, 4, 5, 0, 0, 0, 0, 0 },
-    { 2, 4, 6, 0, 0, 0, 0, 0 }, { 2, 4, 5, 7, 5, 7, 0, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 2, 0, 0, 0, 0, 0, 0, 0 },
+    { 5, 7, 0, 0, 0, 0, 0, 0 },
+    { 5, 0, 0, 0, 0, 0, 0, 0 },
+    { 6, 0, 0, 0, 0, 0, 0, 0 },
+    { 5, 7, 5, 7, 0, 0, 0, 0 },
+    { 5, 7, 5, 0, 0, 0, 0, 0 },
+    { 5, 7, 6, 0, 0, 0, 0, 0 },
+    { 5, 5, 0, 0, 0, 0, 0, 0 },
+    { 3, 0, 0, 0, 0, 0, 0, 0 },
+    { 6, 6, 0, 0, 0, 0, 0, 0 },
+    { 2, 4, 0, 0, 0, 0, 0, 0 },
+    { 2, 4, 5, 7, 0, 0, 0, 0 },
+    { 2, 4, 5, 0, 0, 0, 0, 0 },
+    { 2, 4, 6, 0, 0, 0, 0, 0 },
+    { 2, 4, 5, 7, 5, 7, 0, 0 },
 };
 
 typedef struct PAFVideoDecContext {
@@ -156,9 +165,11 @@ static int decode_0(PAFVideoDecContext *c, uint8_t *pkt, uint8_t code)
     i = bytestream2_get_byte(&c->gb);
     if (i) {
         if (code & 0x10) {
-            int pos = bytestream2_tell(&c->gb) & 3;
-            if (pos)
-                bytestream2_skip(&c->gb, 4 - pos);
+            int align;
+
+            align = bytestream2_tell(&c->gb) & 3;
+            if (align)
+                bytestream2_skip(&c->gb, 4 - align);
         }
         do {
             int page, val, x, y;
diff --git a/libavcodec/pamenc.c b/libavcodec/pamenc.c
index 0be07e1f1b..50c9fcb404 100644
--- a/libavcodec/pamenc.c
+++ b/libavcodec/pamenc.c
@@ -2,52 +2,39 @@
  * PAM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
-#include "bytestream.h"
 #include "internal.h"
 
 static int pam_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                            const AVFrame *pict, int *got_packet)
+                            const AVFrame *p, int *got_packet)
 {
     uint8_t *bytestream_start, *bytestream, *bytestream_end;
-    const AVFrame * const p = pict;
     int i, h, w, n, linesize, depth, maxval, ret;
     const char *tuple_type;
     uint8_t *ptr;
 
-    if ((ret = ff_alloc_packet(pkt, avpicture_get_size(avctx->pix_fmt,
-                                                       avctx->width,
-                                                       avctx->height) + 200)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "encoded frame too large\n");
-        return ret;
-    }
-
-    bytestream_start =
-    bytestream       = pkt->data;
-    bytestream_end   = pkt->data + pkt->size;
-
     h = avctx->height;
     w = avctx->width;
     switch (avctx->pix_fmt) {
-    case AV_PIX_FMT_MONOWHITE:
-        n          = (w + 7) >> 3;
+    case AV_PIX_FMT_MONOBLACK:
+        n          = w;
         depth      = 1;
         maxval     = 1;
         tuple_type = "BLACKANDWHITE";
@@ -58,21 +45,59 @@ static int pam_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         maxval     = 255;
         tuple_type = "GRAYSCALE";
         break;
+    case AV_PIX_FMT_GRAY16BE:
+        n          = w * 2;
+        depth      = 1;
+        maxval     = 0xFFFF;
+        tuple_type = "GRAYSCALE";
+        break;
+    case AV_PIX_FMT_GRAY8A:
+        n          = w * 2;
+        depth      = 2;
+        maxval     = 255;
+        tuple_type = "GRAYSCALE_ALPHA";
+        break;
+    case AV_PIX_FMT_YA16BE:
+        n          = w * 4;
+        depth      = 2;
+        maxval     = 0xFFFF;
+        tuple_type = "GRAYSCALE_ALPHA";
+        break;
     case AV_PIX_FMT_RGB24:
         n          = w * 3;
         depth      = 3;
         maxval     = 255;
         tuple_type = "RGB";
         break;
-    case AV_PIX_FMT_RGB32:
+    case AV_PIX_FMT_RGBA:
         n          = w * 4;
         depth      = 4;
         maxval     = 255;
         tuple_type = "RGB_ALPHA";
         break;
+    case AV_PIX_FMT_RGB48BE:
+        n          = w * 6;
+        depth      = 3;
+        maxval     = 0xFFFF;
+        tuple_type = "RGB";
+        break;
+    case AV_PIX_FMT_RGBA64BE:
+        n          = w * 8;
+        depth      = 4;
+        maxval     = 0xFFFF;
+        tuple_type = "RGB_ALPHA";
+        break;
     default:
         return -1;
     }
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, n*h + 200, 0)) < 0)
+        return ret;
+
+    bytestream_start =
+    bytestream       = pkt->data;
+    bytestream_end   = pkt->data + pkt->size;
+
     snprintf(bytestream, bytestream_end - bytestream,
              "P7\nWIDTH %d\nHEIGHT %d\nDEPTH %d\nMAXVAL %d\nTUPLTYPE %s\nENDHDR\n",
              w, h, depth, maxval, tuple_type);
@@ -81,16 +106,11 @@ static int pam_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     ptr      = p->data[0];
     linesize = p->linesize[0];
 
-    if (avctx->pix_fmt == AV_PIX_FMT_RGB32) {
+    if (avctx->pix_fmt == AV_PIX_FMT_MONOBLACK){
         int j;
-        unsigned int v;
-
         for (i = 0; i < h; i++) {
-            for (j = 0; j < w; j++) {
-                v = ((uint32_t *)ptr)[j];
-                bytestream_put_be24(&bytestream, v);
-                *bytestream++ = v >> 24;
-            }
+            for (j = 0; j < w; j++)
+                *bytestream++ = ptr[j >> 3] >> (7 - j & 7) & 1;
             ptr += linesize;
         }
     } else {
@@ -127,7 +147,10 @@ AVCodec ff_pam_encoder = {
     .init           = pam_encode_init,
     .encode2        = pam_encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB32, AV_PIX_FMT_GRAY8, AV_PIX_FMT_MONOWHITE,
-        AV_PIX_FMT_NONE
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA,
+        AV_PIX_FMT_RGB48BE, AV_PIX_FMT_RGBA64BE,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A,
+        AV_PIX_FMT_GRAY16BE, AV_PIX_FMT_YA16BE,
+        AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_NONE
     },
 };
diff --git a/libavcodec/parser.c b/libavcodec/parser.c
index 355187ae45..2809158c35 100644
--- a/libavcodec/parser.c
+++ b/libavcodec/parser.c
@@ -3,26 +3,28 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdint.h>
 #include <string.h>
 
+#include "libavutil/avassert.h"
+#include "libavutil/atomic.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 
@@ -41,20 +43,21 @@ AVCodecParser *av_parser_next(const AVCodecParser *p)
 
 void av_register_codec_parser(AVCodecParser *parser)
 {
-    parser->next = av_first_parser;
-    av_first_parser = parser;
+    do {
+        parser->next = av_first_parser;
+    } while (parser->next != avpriv_atomic_ptr_cas((void * volatile *)&av_first_parser, parser->next, parser));
 }
 
 AVCodecParserContext *av_parser_init(int codec_id)
 {
-    AVCodecParserContext *s;
+    AVCodecParserContext *s = NULL;
     AVCodecParser *parser;
     int ret;
 
     if (codec_id == AV_CODEC_ID_NONE)
         return NULL;
 
-    for (parser = av_first_parser; parser != NULL; parser = parser->next) {
+    for (parser = av_first_parser; parser; parser = parser->next) {
         if (parser->codec_ids[0] == codec_id ||
             parser->codec_ids[1] == codec_id ||
             parser->codec_ids[2] == codec_id ||
@@ -67,25 +70,18 @@ AVCodecParserContext *av_parser_init(int codec_id)
 found:
     s = av_mallocz(sizeof(AVCodecParserContext));
     if (!s)
-        return NULL;
+        goto err_out;
     s->parser = parser;
-    if (parser->priv_data_size) {
-        s->priv_data = av_mallocz(parser->priv_data_size);
-        if (!s->priv_data) {
-            av_free(s);
-            return NULL;
-        }
-    }
+    s->priv_data = av_mallocz(parser->priv_data_size);
+    if (!s->priv_data)
+        goto err_out;
+    s->fetch_timestamp=1;
+    s->pict_type = AV_PICTURE_TYPE_I;
     if (parser->parser_init) {
         ret = parser->parser_init(s);
-        if (ret != 0) {
-            av_free(s->priv_data);
-            av_free(s);
-            return NULL;
-        }
+        if (ret != 0)
+            goto err_out;
     }
-    s->fetch_timestamp      = 1;
-    s->pict_type            = AV_PICTURE_TYPE_I;
     s->key_frame            = -1;
 #if FF_API_CONVERGENCE_DURATION
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -98,25 +94,37 @@ FF_ENABLE_DEPRECATION_WARNINGS
     s->format               = -1;
 
     return s;
+
+err_out:
+    if (s)
+        av_freep(&s->priv_data);
+    av_free(s);
+    return NULL;
 }
 
-void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove)
+void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove, int fuzzy)
 {
     int i;
 
-    s->dts    =
-    s->pts    = AV_NOPTS_VALUE;
-    s->pos    = -1;
-    s->offset = 0;
+    if (!fuzzy) {
+        s->dts    =
+        s->pts    = AV_NOPTS_VALUE;
+        s->pos    = -1;
+        s->offset = 0;
+    }
     for (i = 0; i < AV_PARSER_PTS_NB; i++) {
         if (s->cur_offset + off >= s->cur_frame_offset[i] &&
             (s->frame_offset < s->cur_frame_offset[i] ||
-             (!s->frame_offset && !s->next_frame_offset)) &&
-            s->cur_frame_end[i]) {
-            s->dts    = s->cur_frame_dts[i];
-            s->pts    = s->cur_frame_pts[i];
-            s->pos    = s->cur_frame_pos[i];
-            s->offset = s->next_frame_offset - s->cur_frame_offset[i];
+             (!s->frame_offset && !s->next_frame_offset)) && // first field/frame
+            // check disabled since MPEG-TS does not send complete PES packets
+            /*s->next_frame_offset + off <*/  s->cur_frame_end[i]){
+
+            if (!fuzzy || s->cur_frame_dts[i] != AV_NOPTS_VALUE) {
+                s->dts    = s->cur_frame_dts[i];
+                s->pts    = s->cur_frame_pts[i];
+                s->pos    = s->cur_frame_pos[i];
+                s->offset = s->next_frame_offset - s->cur_frame_offset[i];
+            }
             if (remove)
                 s->cur_frame_offset[i] = INT64_MAX;
             if (s->cur_offset + off < s->cur_frame_end[i])
@@ -159,11 +167,12 @@ int av_parser_parse2(AVCodecParserContext *s, AVCodecContext *avctx,
         s->last_pts        = s->pts;
         s->last_dts        = s->dts;
         s->last_pos        = s->pos;
-        ff_fetch_timestamp(s, 0, 0);
+        ff_fetch_timestamp(s, 0, 0, 0);
     }
     /* WARNING: the returned index can be negative */
     index = s->parser->parser_parse(s, avctx, (const uint8_t **) poutbuf,
                                     poutbuf_size, buf, buf_size);
+    av_assert0(index > -0x20000000); // The API does not allow returning AVERROR codes
     /* update the file pointer */
     if (*poutbuf_size) {
         /* fill the data for the current frame */
@@ -219,7 +228,7 @@ void av_parser_close(AVCodecParserContext *s)
     if (s) {
         if (s->parser->parser_close)
             s->parser->parser_close(s);
-        av_free(s->priv_data);
+        av_freep(&s->priv_data);
         av_free(s);
     }
 }
@@ -250,8 +259,11 @@ int ff_combine_frame(ParseContext *pc, int next,
                                            *buf_size + pc->index +
                                            AV_INPUT_BUFFER_PADDING_SIZE);
 
-        if (!new_buffer)
+        if (!new_buffer) {
+            av_log(NULL, AV_LOG_ERROR, "Failed to reallocate parser buffer to %d\n", *buf_size + pc->index + AV_INPUT_BUFFER_PADDING_SIZE);
+            pc->index = 0;
             return AVERROR(ENOMEM);
+        }
         pc->buffer = new_buffer;
         memcpy(&pc->buffer[pc->index], *buf, *buf_size);
         pc->index += *buf_size;
@@ -266,9 +278,12 @@ int ff_combine_frame(ParseContext *pc, int next,
         void *new_buffer = av_fast_realloc(pc->buffer, &pc->buffer_size,
                                            next + pc->index +
                                            AV_INPUT_BUFFER_PADDING_SIZE);
-
-        if (!new_buffer)
+        if (!new_buffer) {
+            av_log(NULL, AV_LOG_ERROR, "Failed to reallocate parser buffer to %d\n", next + pc->index + AV_INPUT_BUFFER_PADDING_SIZE);
+            pc->overread_index =
+            pc->index = 0;
             return AVERROR(ENOMEM);
+        }
         pc->buffer = new_buffer;
         if (next > -AV_INPUT_BUFFER_PADDING_SIZE)
             memcpy(&pc->buffer[pc->index], *buf,
@@ -303,13 +318,14 @@ void ff_parse_close(AVCodecParserContext *s)
 
 int ff_mpeg4video_split(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
 {
-    int i;
     uint32_t state = -1;
+    const uint8_t *ptr = buf, *end = buf + buf_size;
 
-    for (i = 0; i < buf_size; i++) {
-        state = state << 8 | buf[i];
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
         if (state == 0x1B3 || state == 0x1B6)
-            return i - 3;
+            return ptr - 4 - buf;
     }
+
     return 0;
 }
diff --git a/libavcodec/parser.h b/libavcodec/parser.h
index ea1cae28a2..ef35547e9b 100644
--- a/libavcodec/parser.h
+++ b/libavcodec/parser.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -53,7 +53,8 @@ void ff_parse_close(AVCodecParserContext *s);
  * Fetch timestamps for a specific byte within the current access unit.
  * @param off byte position within the access unit
  * @param remove Found timestamps will be removed if set to 1, kept if set to 0.
+ * @param fuzzy Only use found value if it is more informative than what we already have
  */
-void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove);
+void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove, int fuzzy);
 
 #endif /* AVCODEC_PARSER_H */
diff --git a/libavcodec/pcm-bluray.c b/libavcodec/pcm-bluray.c
index 51fcd2d007..22c1c08bcf 100644
--- a/libavcodec/pcm-bluray.c
+++ b/libavcodec/pcm-bluray.c
@@ -2,20 +2,20 @@
  * LPCM codecs for PCM format found in Blu-ray PCM streams
  * Copyright (c) 2009, 2013 Christian Schmidt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -71,13 +71,14 @@ static int pcm_bluray_parse_header(AVCodecContext *avctx,
 
     /* get the sample depth and derive the sample format from it */
     avctx->bits_per_coded_sample = bits_per_samples[header[3] >> 6];
-    if (!avctx->bits_per_coded_sample) {
-        av_log(avctx, AV_LOG_ERROR, "reserved sample depth (0)\n");
+    if (!(avctx->bits_per_coded_sample == 16 || avctx->bits_per_coded_sample == 24)) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported sample depth (%d)\n", avctx->bits_per_coded_sample);
         return AVERROR_INVALIDDATA;
     }
     avctx->sample_fmt = avctx->bits_per_coded_sample == 16 ? AV_SAMPLE_FMT_S16
                                                            : AV_SAMPLE_FMT_S32;
-    avctx->bits_per_raw_sample = avctx->bits_per_coded_sample;
+    if (avctx->sample_fmt == AV_SAMPLE_FMT_S32)
+        avctx->bits_per_raw_sample = avctx->bits_per_coded_sample;
 
     /* get the sample rate. Not all values are used. */
     switch (header[2] & 0x0f) {
@@ -116,9 +117,9 @@ static int pcm_bluray_parse_header(AVCodecContext *avctx,
 
     if (avctx->debug & FF_DEBUG_PICT_INFO)
         ff_dlog(avctx,
-                "pcm_bluray_parse_header: %d channels, %d bits per sample, %d Hz, %d bit/s\n",
+                "pcm_bluray_parse_header: %d channels, %d bits per sample, %d Hz, %"PRId64" bit/s\n",
                 avctx->channels, avctx->bits_per_coded_sample,
-                avctx->sample_rate, avctx->bit_rate);
+                avctx->sample_rate, (int64_t)avctx->bit_rate);
     return 0;
 }
 
@@ -154,10 +155,8 @@ static int pcm_bluray_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = samples;
-    if ((retval = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((retval = ff_get_buffer(avctx, frame, 0)) < 0)
         return retval;
-    }
     dst16 = (int16_t *)frame->data[0];
     dst32 = (int32_t *)frame->data[0];
 
diff --git a/libavcodec/pcm-dvd.c b/libavcodec/pcm-dvd.c
index 744b720c73..a78c05dbe2 100644
--- a/libavcodec/pcm-dvd.c
+++ b/libavcodec/pcm-dvd.c
@@ -2,20 +2,20 @@
  * LPCM codecs for PCM formats found in Video DVD streams
  * Copyright (c) 2013 Christian Schmidt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 typedef struct PCMDVDContext {
     uint32_t last_header;    // Cached header to see if parsing is needed
     int block_size;          // Size of a block of samples in bytes
+    int last_block_size;     // Size of the last block of samples in bytes
     int samples_per_block;   // Number of samples per channel per block
     int groups_per_block;    // Number of 20/24bit sample groups per block
     uint8_t *extra_samples;  // Pointer to leftover samples from a frame
@@ -69,9 +70,10 @@ static int pcm_dvd_parse_header(AVCodecContext *avctx, const uint8_t *header)
     /* early exit if the header didn't change apart from the frame number */
     if (s->last_header == header_int)
         return 0;
+    s->last_header = -1;
 
     if (avctx->debug & FF_DEBUG_PICT_INFO)
-        ff_dlog(avctx, "pcm_dvd_parse_header: header = %02x%02x%02x\n",
+        av_log(avctx, AV_LOG_DEBUG, "pcm_dvd_parse_header: header = %02x%02x%02x\n",
                 header[0], header[1], header[2]);
     /*
      * header[0] emphasis (1), muse(1), reserved(1), frame number(5)
@@ -85,7 +87,9 @@ static int pcm_dvd_parse_header(AVCodecContext *avctx, const uint8_t *header)
     /* get the sample depth and derive the sample format from it */
     avctx->bits_per_coded_sample = 16 + (header[1] >> 6 & 3) * 4;
     if (avctx->bits_per_coded_sample == 28) {
-        av_log(avctx, AV_LOG_ERROR, "PCM DVD unsupported sample depth\n");
+        av_log(avctx, AV_LOG_ERROR,
+               "PCM DVD unsupported sample depth %i\n",
+               avctx->bits_per_coded_sample);
         return AVERROR_INVALIDDATA;
     }
     avctx->sample_fmt = avctx->bits_per_coded_sample == 16 ? AV_SAMPLE_FMT_S16
@@ -136,9 +140,9 @@ static int pcm_dvd_parse_header(AVCodecContext *avctx, const uint8_t *header)
 
     if (avctx->debug & FF_DEBUG_PICT_INFO)
         ff_dlog(avctx,
-                "pcm_dvd_parse_header: %d channels, %d bits per sample, %d Hz, %d bit/s\n",
+                "pcm_dvd_parse_header: %d channels, %d bits per sample, %d Hz, %"PRId64" bit/s\n",
                 avctx->channels, avctx->bits_per_coded_sample,
-                avctx->sample_rate, avctx->bit_rate);
+                avctx->sample_rate, (int64_t)avctx->bit_rate);
 
     s->last_header = header_int;
 
@@ -170,6 +174,17 @@ static void *pcm_dvd_decode_samples(AVCodecContext *avctx, const uint8_t *src,
         return dst16;
     }
     case 20:
+        if (avctx->channels == 1) {
+            do {
+                for (i = 2; i; i--) {
+                    dst32[0] = bytestream2_get_be16u(&gb) << 16;
+                    dst32[1] = bytestream2_get_be16u(&gb) << 16;
+                    t = bytestream2_get_byteu(&gb);
+                    *dst32++ += (t & 0xf0) << 8;
+                    *dst32++ += (t & 0x0f) << 12;
+                }
+            } while (--blocks);
+        } else {
         do {
             for (i = s->groups_per_block; i; i--) {
                 dst32[0] = bytestream2_get_be16u(&gb) << 16;
@@ -184,8 +199,19 @@ static void *pcm_dvd_decode_samples(AVCodecContext *avctx, const uint8_t *src,
                 *dst32++ += (t & 0x0f) << 12;
             }
         } while (--blocks);
+        }
         return dst32;
     case 24:
+        if (avctx->channels == 1) {
+            do {
+                for (i = 2; i; i--) {
+                    dst32[0] = bytestream2_get_be16u(&gb) << 16;
+                    dst32[1] = bytestream2_get_be16u(&gb) << 16;
+                    *dst32++ += bytestream2_get_byteu(&gb) << 8;
+                    *dst32++ += bytestream2_get_byteu(&gb) << 8;
+                }
+            } while (--blocks);
+        } else {
         do {
             for (i = s->groups_per_block; i; i--) {
                 dst32[0] = bytestream2_get_be16u(&gb) << 16;
@@ -198,6 +224,7 @@ static void *pcm_dvd_decode_samples(AVCodecContext *avctx, const uint8_t *src,
                 *dst32++ += bytestream2_get_byteu(&gb) << 8;
             }
         } while (--blocks);
+        }
         return dst32;
     default:
         return NULL;
@@ -222,6 +249,11 @@ static int pcm_dvd_decode_frame(AVCodecContext *avctx, void *data,
 
     if ((retval = pcm_dvd_parse_header(avctx, src)))
         return retval;
+    if (s->last_block_size && s->last_block_size != s->block_size) {
+        av_log(avctx, AV_LOG_WARNING, "block_size has changed %d != %d\n", s->last_block_size, s->block_size);
+        s->extra_sample_count = 0;
+    }
+    s->last_block_size = s->block_size;
     src      += 3;
     buf_size -= 3;
 
@@ -229,10 +261,8 @@ static int pcm_dvd_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = blocks * s->samples_per_block;
-    if ((retval = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((retval = ff_get_buffer(avctx, frame, 0)) < 0)
         return retval;
-    }
     dst = frame->data[0];
 
     /* consume leftover samples from last packet */
diff --git a/libavcodec/pcm.c b/libavcodec/pcm.c
index 959c50b279..2cb5a360d6 100644
--- a/libavcodec/pcm.c
+++ b/libavcodec/pcm.c
@@ -2,20 +2,20 @@
  * PCM codecs
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,13 +69,24 @@ static av_cold int pcm_encode_init(AVCodecContext *avctx)
         bytestream_put_ ## endian(&dst, v);                             \
     }
 
+#define ENCODE_PLANAR(type, endian, dst, n, shift, offset)              \
+    n /= avctx->channels;                                               \
+    for (c = 0; c < avctx->channels; c++) {                             \
+        int i;                                                          \
+        samples_ ## type = (const type *) frame->extended_data[c];      \
+        for (i = n; i > 0; i--) {                                       \
+            register type v = (*samples_ ## type++ >> shift) + offset;  \
+            bytestream_put_ ## endian(&dst, v);                         \
+        }                                                               \
+    }
+
 static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                             const AVFrame *frame, int *got_packet_ptr)
 {
-    int n, sample_size, v, ret;
+    int n, c, sample_size, v, ret;
     const short *samples;
     unsigned char *dst;
-    const uint8_t *srcu8;
+    const uint8_t *samples_uint8_t;
     const int16_t *samples_int16_t;
     const int32_t *samples_int32_t;
     const int64_t *samples_int64_t;
@@ -86,10 +97,8 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     n           = frame->nb_samples * avctx->channels;
     samples     = (const short *)frame->data[0];
 
-    if ((ret = ff_alloc_packet(avpkt, n * sample_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, n * sample_size, n * sample_size)) < 0)
         return ret;
-    }
     dst = avpkt->data;
 
     switch (avctx->codec->id) {
@@ -102,6 +111,9 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_PCM_S24LE:
         ENCODE(int32_t, le24, samples, dst, n, 8, 0)
         break;
+    case AV_CODEC_ID_PCM_S24LE_PLANAR:
+        ENCODE_PLANAR(int32_t, le24, dst, n, 8, 0)
+        break;
     case AV_CODEC_ID_PCM_S24BE:
         ENCODE(int32_t, be24, samples, dst, n, 8, 0)
         break;
@@ -127,11 +139,10 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         ENCODE(uint16_t, be16, samples, dst, n, 0, 0x8000)
         break;
     case AV_CODEC_ID_PCM_S8:
-        srcu8 = frame->data[0];
-        for (; n > 0; n--) {
-            v      = *srcu8++;
-            *dst++ = v - 128;
-        }
+        ENCODE(uint8_t, byte, samples, dst, n, 0, -128)
+        break;
+    case AV_CODEC_ID_PCM_S8_PLANAR:
+        ENCODE_PLANAR(uint8_t, byte, dst, n, 0, -128)
         break;
 #if HAVE_BIGENDIAN
     case AV_CODEC_ID_PCM_F64LE:
@@ -141,9 +152,15 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_PCM_F32LE:
         ENCODE(int32_t, le32, samples, dst, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S32LE_PLANAR:
+        ENCODE_PLANAR(int32_t, le32, dst, n, 0, 0)
+        break;
     case AV_CODEC_ID_PCM_S16LE:
         ENCODE(int16_t, le16, samples, dst, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S16LE_PLANAR:
+        ENCODE_PLANAR(int16_t, le16, dst, n, 0, 0)
+        break;
     case AV_CODEC_ID_PCM_F64BE:
     case AV_CODEC_ID_PCM_F32BE:
     case AV_CODEC_ID_PCM_S32BE:
@@ -159,6 +176,9 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_PCM_S16BE:
         ENCODE(int16_t, be16, samples, dst, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S16BE_PLANAR:
+        ENCODE_PLANAR(int16_t, be16, dst, n, 0, 0)
+        break;
     case AV_CODEC_ID_PCM_F64LE:
     case AV_CODEC_ID_PCM_F32LE:
     case AV_CODEC_ID_PCM_S32LE:
@@ -166,7 +186,18 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 #endif /* HAVE_BIGENDIAN */
     case AV_CODEC_ID_PCM_U8:
         memcpy(dst, samples, n * sample_size);
-        dst += n * sample_size;
+        break;
+#if HAVE_BIGENDIAN
+    case AV_CODEC_ID_PCM_S16BE_PLANAR:
+#else
+    case AV_CODEC_ID_PCM_S16LE_PLANAR:
+    case AV_CODEC_ID_PCM_S32LE_PLANAR:
+#endif /* HAVE_BIGENDIAN */
+        n /= avctx->channels;
+        for (c = 0; c < avctx->channels; c++) {
+            const uint8_t *src = frame->extended_data[c];
+            bytestream_put_buffer(&dst, src, n * sample_size);
+        }
         break;
     case AV_CODEC_ID_PCM_ALAW:
         for (; n > 0; n--) {
@@ -202,7 +233,7 @@ static av_cold int pcm_decode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
-    switch (avctx->codec->id) {
+    switch (avctx->codec_id) {
     case AV_CODEC_ID_PCM_ALAW:
         for (i = 0; i < 256; i++)
             s->table[i] = alaw2linear(i);
@@ -218,7 +249,7 @@ static av_cold int pcm_decode_init(AVCodecContext *avctx)
     avctx->sample_fmt = avctx->codec->sample_fmts[0];
 
     if (avctx->sample_fmt == AV_SAMPLE_FMT_S32)
-        avctx->bits_per_raw_sample = av_get_bits_per_sample(avctx->codec->id);
+        avctx->bits_per_raw_sample = av_get_bits_per_sample(avctx->codec_id);
 
     return 0;
 }
@@ -240,28 +271,17 @@ static av_cold int pcm_decode_init(AVCodecContext *avctx)
         dst += size / 8;                                                \
     }
 
-#if HAVE_BIGENDIAN
-#define DECODE_PLANAR(size, endian, src, dst, n, shift, offset)         \
-    {                                                                   \
-        int n2;                                                         \
-        n /= avctx->channels;                                           \
-        for (c = 0; c < avctx->channels; c++) {                         \
-            samples = frame->extended_data[c];                          \
-            n2 = n;                                                     \
-            DECODE(size, endian, src, samples, n2, 0, 0)                \
-        }                                                               \
-    }
-#else
 #define DECODE_PLANAR(size, endian, src, dst, n, shift, offset)         \
-    {                                                                   \
-        n /= avctx->channels;                                           \
-        for (c = 0; c < avctx->channels; c++) {                         \
-            samples = frame->extended_data[c];                          \
-            memcpy(samples, src, n * size / 8);                         \
-            src += n * size / 8;                                        \
+    n /= avctx->channels;                                               \
+    for (c = 0; c < avctx->channels; c++) {                             \
+        int i;                                                          \
+        dst = frame->extended_data[c];                                \
+        for (i = n; i > 0; i--) {                                       \
+            uint ## size ## _t v = bytestream_get_ ## endian(&src);     \
+            AV_WN ## size ## A(dst, (v - offset) << shift);             \
+            dst += size / 8;                                            \
         }                                                               \
     }
-#endif /* HAVE_BIGENDIAN */
 
 static int pcm_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame_ptr, AVPacket *avpkt)
@@ -289,12 +309,24 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR(EINVAL);
     }
 
+    if (avctx->channels == 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->codec_id != avctx->codec->id) {
+        av_log(avctx, AV_LOG_ERROR, "codec ids mismatch\n");
+        return AVERROR(EINVAL);
+    }
+
     n = avctx->channels * sample_size;
 
     if (n && buf_size % n) {
         if (buf_size < n) {
-            av_log(avctx, AV_LOG_ERROR, "invalid PCM packet\n");
-            return -1;
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid PCM packet, data has size %d but at least a size of %d was expected\n",
+                   buf_size, n);
+            return AVERROR_INVALIDDATA;
         } else
             buf_size -= buf_size % n;
     }
@@ -303,13 +335,11 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = n * samples_per_block / avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = frame->data[0];
 
-    switch (avctx->codec->id) {
+    switch (avctx->codec_id) {
     case AV_CODEC_ID_PCM_U32LE:
         DECODE(32, le32, src, samples, n, 0, 0x80000000)
         break;
@@ -319,6 +349,9 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
     case AV_CODEC_ID_PCM_S24LE:
         DECODE(32, le24, src, samples, n, 8, 0)
         break;
+    case AV_CODEC_ID_PCM_S24LE_PLANAR:
+        DECODE_PLANAR(32, le24, src, samples, n, 8, 0);
+        break;
     case AV_CODEC_ID_PCM_S24BE:
         DECODE(32, be24, src, samples, n, 8, 0)
         break;
@@ -337,18 +370,6 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
             samples += 2;
         }
         break;
-    case AV_CODEC_ID_PCM_S16BE_PLANAR:
-        DECODE_PLANAR(16, be16, src, samples, n, 0, 0);
-        break;
-    case AV_CODEC_ID_PCM_S16LE_PLANAR:
-        DECODE_PLANAR(16, le16, src, samples, n, 0, 0);
-        break;
-    case AV_CODEC_ID_PCM_S24LE_PLANAR:
-        DECODE_PLANAR(32, le24, src, samples, n, 8, 0);
-        break;
-    case AV_CODEC_ID_PCM_S32LE_PLANAR:
-        DECODE_PLANAR(32, le32, src, samples, n, 0, 0);
-        break;
     case AV_CODEC_ID_PCM_U16LE:
         DECODE(16, le16, src, samples, n, 0, 0x8000)
         break;
@@ -359,6 +380,15 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
         for (; n > 0; n--)
             *samples++ = *src++ + 128;
         break;
+    case AV_CODEC_ID_PCM_S8_PLANAR:
+        n /= avctx->channels;
+        for (c = 0; c < avctx->channels; c++) {
+            int i;
+            samples = frame->extended_data[c];
+            for (i = n; i > 0; i--)
+                *samples++ = *src++ + 128;
+        }
+        break;
 #if HAVE_BIGENDIAN
     case AV_CODEC_ID_PCM_F64LE:
         DECODE(64, le64, src, samples, n, 0, 0)
@@ -367,9 +397,15 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
     case AV_CODEC_ID_PCM_F32LE:
         DECODE(32, le32, src, samples, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S32LE_PLANAR:
+        DECODE_PLANAR(32, le32, src, samples, n, 0, 0);
+        break;
     case AV_CODEC_ID_PCM_S16LE:
         DECODE(16, le16, src, samples, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S16LE_PLANAR:
+        DECODE_PLANAR(16, le16, src, samples, n, 0, 0);
+        break;
     case AV_CODEC_ID_PCM_F64BE:
     case AV_CODEC_ID_PCM_F32BE:
     case AV_CODEC_ID_PCM_S32BE:
@@ -385,6 +421,9 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
     case AV_CODEC_ID_PCM_S16BE:
         DECODE(16, be16, src, samples, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S16BE_PLANAR:
+        DECODE_PLANAR(16, be16, src, samples, n, 0, 0);
+        break;
     case AV_CODEC_ID_PCM_F64LE:
     case AV_CODEC_ID_PCM_F32LE:
     case AV_CODEC_ID_PCM_S32LE:
@@ -393,6 +432,18 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
     case AV_CODEC_ID_PCM_U8:
         memcpy(samples, src, n * sample_size);
         break;
+#if HAVE_BIGENDIAN
+    case AV_CODEC_ID_PCM_S16BE_PLANAR:
+#else
+    case AV_CODEC_ID_PCM_S16LE_PLANAR:
+    case AV_CODEC_ID_PCM_S32LE_PLANAR:
+#endif /* HAVE_BIGENDIAN */
+        n /= avctx->channels;
+        for (c = 0; c < avctx->channels; c++) {
+            samples = frame->extended_data[c];
+            bytestream_get_buffer(&src, samples, n * sample_size);
+        }
+        break;
     case AV_CODEC_ID_PCM_ZORK:
         for (; n > 0; n--) {
             int v = *src++;
@@ -489,25 +540,26 @@ AVCodec ff_ ## name_ ## _decoder = {                                        \
     PCM_DECODER(id, sample_fmt_, name, long_name_)
 
 /* Note: Do not forget to add new entries to the Makefile as well. */
-PCM_CODEC  (PCM_ALAW,         AV_SAMPLE_FMT_S16, pcm_alaw,         "PCM A-law");
+PCM_CODEC  (PCM_ALAW,         AV_SAMPLE_FMT_S16, pcm_alaw,         "PCM A-law / G.711 A-law");
 PCM_CODEC  (PCM_F32BE,        AV_SAMPLE_FMT_FLT, pcm_f32be,        "PCM 32-bit floating point big-endian");
 PCM_CODEC  (PCM_F32LE,        AV_SAMPLE_FMT_FLT, pcm_f32le,        "PCM 32-bit floating point little-endian");
 PCM_CODEC  (PCM_F64BE,        AV_SAMPLE_FMT_DBL, pcm_f64be,        "PCM 64-bit floating point big-endian");
 PCM_CODEC  (PCM_F64LE,        AV_SAMPLE_FMT_DBL, pcm_f64le,        "PCM 64-bit floating point little-endian");
-PCM_DECODER(PCM_LXF,          AV_SAMPLE_FMT_S32P, pcm_lxf,          "PCM signed 20-bit little-endian planar");
-PCM_CODEC  (PCM_MULAW,        AV_SAMPLE_FMT_S16, pcm_mulaw,        "PCM mu-law");
+PCM_DECODER(PCM_LXF,          AV_SAMPLE_FMT_S32P,pcm_lxf,          "PCM signed 20-bit little-endian planar");
+PCM_CODEC  (PCM_MULAW,        AV_SAMPLE_FMT_S16, pcm_mulaw,        "PCM mu-law / G.711 mu-law");
 PCM_CODEC  (PCM_S8,           AV_SAMPLE_FMT_U8,  pcm_s8,           "PCM signed 8-bit");
+PCM_CODEC  (PCM_S8_PLANAR,    AV_SAMPLE_FMT_U8P, pcm_s8_planar,    "PCM signed 8-bit planar");
 PCM_CODEC  (PCM_S16BE,        AV_SAMPLE_FMT_S16, pcm_s16be,        "PCM signed 16-bit big-endian");
-PCM_DECODER(PCM_S16BE_PLANAR, AV_SAMPLE_FMT_S16P,pcm_s16be_planar, "PCM signed 16-bit big-endian planar");
+PCM_CODEC  (PCM_S16BE_PLANAR, AV_SAMPLE_FMT_S16P,pcm_s16be_planar, "PCM signed 16-bit big-endian planar");
 PCM_CODEC  (PCM_S16LE,        AV_SAMPLE_FMT_S16, pcm_s16le,        "PCM signed 16-bit little-endian");
-PCM_DECODER(PCM_S16LE_PLANAR, AV_SAMPLE_FMT_S16P, pcm_s16le_planar, "PCM 16-bit little-endian planar");
+PCM_CODEC  (PCM_S16LE_PLANAR, AV_SAMPLE_FMT_S16P,pcm_s16le_planar, "PCM signed 16-bit little-endian planar");
 PCM_CODEC  (PCM_S24BE,        AV_SAMPLE_FMT_S32, pcm_s24be,        "PCM signed 24-bit big-endian");
 PCM_CODEC  (PCM_S24DAUD,      AV_SAMPLE_FMT_S16, pcm_s24daud,      "PCM D-Cinema audio signed 24-bit");
 PCM_CODEC  (PCM_S24LE,        AV_SAMPLE_FMT_S32, pcm_s24le,        "PCM signed 24-bit little-endian");
-PCM_DECODER(PCM_S24LE_PLANAR, AV_SAMPLE_FMT_S32P,pcm_s24le_planar, "PCM signed 24-bit little-endian planar");
+PCM_CODEC  (PCM_S24LE_PLANAR, AV_SAMPLE_FMT_S32P,pcm_s24le_planar, "PCM signed 24-bit little-endian planar");
 PCM_CODEC  (PCM_S32BE,        AV_SAMPLE_FMT_S32, pcm_s32be,        "PCM signed 32-bit big-endian");
 PCM_CODEC  (PCM_S32LE,        AV_SAMPLE_FMT_S32, pcm_s32le,        "PCM signed 32-bit little-endian");
-PCM_DECODER(PCM_S32LE_PLANAR, AV_SAMPLE_FMT_S32P,pcm_s32le_planar, "PCM signed 32-bit little-endian planar");
+PCM_CODEC  (PCM_S32LE_PLANAR, AV_SAMPLE_FMT_S32P,pcm_s32le_planar, "PCM signed 32-bit little-endian planar");
 PCM_CODEC  (PCM_U8,           AV_SAMPLE_FMT_U8,  pcm_u8,           "PCM unsigned 8-bit");
 PCM_CODEC  (PCM_U16BE,        AV_SAMPLE_FMT_S16, pcm_u16be,        "PCM unsigned 16-bit big-endian");
 PCM_CODEC  (PCM_U16LE,        AV_SAMPLE_FMT_S16, pcm_u16le,        "PCM unsigned 16-bit little-endian");
@@ -516,3 +568,4 @@ PCM_CODEC  (PCM_U24LE,        AV_SAMPLE_FMT_S32, pcm_u24le,        "PCM unsigned
 PCM_CODEC  (PCM_U32BE,        AV_SAMPLE_FMT_S32, pcm_u32be,        "PCM unsigned 32-bit big-endian");
 PCM_CODEC  (PCM_U32LE,        AV_SAMPLE_FMT_S32, pcm_u32le,        "PCM unsigned 32-bit little-endian");
 PCM_DECODER(PCM_ZORK,         AV_SAMPLE_FMT_U8,  pcm_zork,         "PCM Zork");
+
diff --git a/libavcodec/pcm_tablegen.c b/libavcodec/pcm_tablegen.c
index 7b4bc8c6c9..bf8e7fb707 100644
--- a/libavcodec/pcm_tablegen.c
+++ b/libavcodec/pcm_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pcm_tablegen.h b/libavcodec/pcm_tablegen.h
index 79d6561646..1387210a58 100644
--- a/libavcodec/pcm_tablegen.h
+++ b/libavcodec/pcm_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pcx.c b/libavcodec/pcx.c
index d9f0d24ab1..1d3ee8d96a 100644
--- a/libavcodec/pcx.c
+++ b/libavcodec/pcx.c
@@ -5,20 +5,20 @@
  * This decoder does not support CGA palettes. I am unable to find samples
  * and Netpbm cannot generate them.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,44 +28,37 @@
 #include "get_bits.h"
 #include "internal.h"
 
-/**
- * @return advanced src pointer
- */
-static const uint8_t *pcx_rle_decode(const uint8_t *src,
-                                     const uint8_t *end,
-                                     uint8_t *dst,
-                                     unsigned int bytes_per_scanline,
-                                     int compressed)
+static void pcx_rle_decode(GetByteContext *gb,
+                           uint8_t *dst,
+                           unsigned int bytes_per_scanline,
+                           int compressed)
 {
     unsigned int i = 0;
     unsigned char run, value;
 
     if (compressed) {
-        while (i < bytes_per_scanline && src < end) {
+        while (i < bytes_per_scanline && bytestream2_get_bytes_left(gb)>0) {
             run   = 1;
-            value = *src++;
-            if (value >= 0xc0 && src < end) {
+            value = bytestream2_get_byte(gb);
+            if (value >= 0xc0 && bytestream2_get_bytes_left(gb)>0) {
                 run   = value & 0x3f;
-                value = *src++;
+                value = bytestream2_get_byte(gb);
             }
             while (i < bytes_per_scanline && run--)
                 dst[i++] = value;
         }
     } else {
-        memcpy(dst, src, bytes_per_scanline);
-        src += bytes_per_scanline;
+        bytestream2_get_buffer(gb, dst, bytes_per_scanline);
     }
-
-    return src;
 }
 
-static void pcx_palette(const uint8_t **src, uint32_t *dst,
-                        unsigned int pallen)
+static void pcx_palette(GetByteContext *gb, uint32_t *dst, int pallen)
 {
-    unsigned int i;
+    int i;
 
+    pallen = FFMIN(pallen, bytestream2_get_bytes_left(gb) / 3);
     for (i = 0; i < pallen; i++)
-        *dst++ = bytestream_get_be24(src);
+        *dst++ = 0xFF000000 | bytestream2_get_be24u(gb);
     if (pallen < 256)
         memset(dst, 0, (256 - pallen) * sizeof(*dst));
 }
@@ -73,28 +66,32 @@ static void pcx_palette(const uint8_t **src, uint32_t *dst,
 static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                             AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
-    int buf_size       = avpkt->size;
-    AVFrame *const p   = data;
+    GetByteContext gb;
+    AVFrame * const p  = data;
     int compressed, xmin, ymin, xmax, ymax;
+    int ret;
     unsigned int w, h, bits_per_pixel, bytes_per_line, nplanes, stride, y, x,
                  bytes_per_scanline;
-    uint8_t *ptr;
-    const uint8_t *buf_end = buf + buf_size;
-    uint8_t const *bufstart = buf;
-    uint8_t *scanline;
-    int ret = -1;
+    uint8_t *ptr, *scanline;
+
+    if (avpkt->size < 128)
+        return AVERROR_INVALIDDATA;
+
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
 
-    if (buf[0] != 0x0a || buf[1] > 5) {
+    if (bytestream2_get_byteu(&gb) != 0x0a || bytestream2_get_byteu(&gb) > 5) {
         av_log(avctx, AV_LOG_ERROR, "this is not PCX encoded data\n");
         return AVERROR_INVALIDDATA;
     }
 
-    compressed = buf[2];
-    xmin       = AV_RL16(buf + 4);
-    ymin       = AV_RL16(buf + 6);
-    xmax       = AV_RL16(buf + 8);
-    ymax       = AV_RL16(buf + 10);
+    compressed                     = bytestream2_get_byteu(&gb);
+    bits_per_pixel                 = bytestream2_get_byteu(&gb);
+    xmin                           = bytestream2_get_le16u(&gb);
+    ymin                           = bytestream2_get_le16u(&gb);
+    xmax                           = bytestream2_get_le16u(&gb);
+    ymax                           = bytestream2_get_le16u(&gb);
+    avctx->sample_aspect_ratio.num = bytestream2_get_le16u(&gb);
+    avctx->sample_aspect_ratio.den = bytestream2_get_le16u(&gb);
 
     if (xmax < xmin || ymax < ymin) {
         av_log(avctx, AV_LOG_ERROR, "invalid image dimensions\n");
@@ -104,13 +101,13 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     w = xmax - xmin + 1;
     h = ymax - ymin + 1;
 
-    bits_per_pixel     = buf[3];
-    bytes_per_line     = AV_RL16(buf + 66);
-    nplanes            = buf[65];
+    bytestream2_skipu(&gb, 49);
+    nplanes            = bytestream2_get_byteu(&gb);
+    bytes_per_line     = bytestream2_get_le16u(&gb);
     bytes_per_scanline = nplanes * bytes_per_line;
 
     if (bytes_per_scanline < (w * bits_per_pixel * nplanes + 7) / 8 ||
-        (!compressed && bytes_per_scanline > buf_size / h)) {
+        (!compressed && bytes_per_scanline > bytestream2_get_bytes_left(&gb) / h)) {
         av_log(avctx, AV_LOG_ERROR, "PCX data is corrupted\n");
         return AVERROR_INVALIDDATA;
     }
@@ -133,29 +130,26 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    buf += 128;
+    bytestream2_skipu(&gb, 60);
 
     if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
 
     ptr    = p->data[0];
     stride = p->linesize[0];
 
-    scanline = av_malloc(bytes_per_scanline);
+    scanline = av_malloc(bytes_per_scanline + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!scanline)
         return AVERROR(ENOMEM);
 
     if (nplanes == 3 && bits_per_pixel == 8) {
         for (y = 0; y < h; y++) {
-            buf = pcx_rle_decode(buf, buf_end,
-                                 scanline, bytes_per_scanline, compressed);
+            pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
 
             for (x = 0; x < w; x++) {
                 ptr[3 * x]     = scanline[x];
@@ -166,39 +160,37 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             ptr += stride;
         }
     } else if (nplanes == 1 && bits_per_pixel == 8) {
-        const uint8_t *palstart = bufstart + buf_size - 769;
+        int palstart = avpkt->size - 769;
 
-        if (buf_size < 769) {
+        if (avpkt->size < 769) {
             av_log(avctx, AV_LOG_ERROR, "File is too short\n");
             ret = avctx->err_recognition & AV_EF_EXPLODE ?
-                  AVERROR_INVALIDDATA : buf_size;
+                  AVERROR_INVALIDDATA : avpkt->size;
             goto end;
         }
 
         for (y = 0; y < h; y++, ptr += stride) {
-            buf = pcx_rle_decode(buf, buf_end,
-                                 scanline, bytes_per_scanline, compressed);
+            pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
             memcpy(ptr, scanline, w);
         }
 
-        if (buf != palstart) {
+        if (bytestream2_tell(&gb) != palstart) {
             av_log(avctx, AV_LOG_WARNING, "image data possibly corrupted\n");
-            buf = palstart;
+            bytestream2_seek(&gb, palstart, SEEK_SET);
         }
-        if (*buf++ != 12) {
+        if (bytestream2_get_byte(&gb) != 12) {
             av_log(avctx, AV_LOG_ERROR, "expected palette after image data\n");
             ret = avctx->err_recognition & AV_EF_EXPLODE ?
-                  AVERROR_INVALIDDATA : buf_size;
+                  AVERROR_INVALIDDATA : avpkt->size;
             goto end;
         }
     } else if (nplanes == 1) {   /* all packed formats, max. 16 colors */
         GetBitContext s;
 
         for (y = 0; y < h; y++) {
-            init_get_bits(&s, scanline, bytes_per_scanline << 3);
+            init_get_bits8(&s, scanline, bytes_per_scanline);
 
-            buf = pcx_rle_decode(buf, buf_end,
-                                 scanline, bytes_per_scanline, compressed);
+            pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
 
             for (x = 0; x < w; x++)
                 ptr[x] = get_bits(&s, bits_per_pixel);
@@ -208,8 +200,7 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         int i;
 
         for (y = 0; y < h; y++) {
-            buf = pcx_rle_decode(buf, buf_end,
-                                 scanline, bytes_per_scanline, compressed);
+            pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
 
             for (x = 0; x < w; x++) {
                 int m = 0x80 >> (x & 7), v = 0;
@@ -223,16 +214,20 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         }
     }
 
+    ret = bytestream2_tell(&gb);
     if (nplanes == 1 && bits_per_pixel == 8) {
-        pcx_palette(&buf, (uint32_t *)p->data[1], 256);
+        pcx_palette(&gb, (uint32_t *)p->data[1], 256);
+        ret += 256 * 3;
+    } else if (bits_per_pixel * nplanes == 1) {
+        AV_WN32A(p->data[1]  , 0xFF000000);
+        AV_WN32A(p->data[1]+4, 0xFFFFFFFF);
     } else if (bits_per_pixel < 8) {
-        const uint8_t *palette = bufstart + 16;
-        pcx_palette(&palette, (uint32_t *)p->data[1], 16);
+        bytestream2_seek(&gb, 16, SEEK_SET);
+        pcx_palette(&gb, (uint32_t *)p->data[1], 16);
     }
 
     *got_frame = 1;
 
-    ret = buf - bufstart;
 end:
     av_free(scanline);
     return ret;
diff --git a/libavcodec/pcxenc.c b/libavcodec/pcxenc.c
index 7fc0d9cc76..f0ffedfa56 100644
--- a/libavcodec/pcxenc.c
+++ b/libavcodec/pcxenc.c
@@ -2,20 +2,20 @@
  * PC Paintbrush PCX (.pcx) image encoder
  * Copyright (c) 2009 Daniel Verkamp <daniel at drv.nu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 
 #include "avcodec.h"
 #include "bytestream.h"
+#include "libavutil/imgutils.h"
 #include "internal.h"
 
 static const uint32_t monoblack_pal[16] = { 0x000000, 0xFFFFFF };
@@ -100,8 +101,9 @@ static int pcx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     const uint8_t *buf_end;
     uint8_t *buf;
 
-    int bpp, nplanes, i, y, line_bytes, written, ret, max_pkt_size;
+    int bpp, nplanes, i, y, line_bytes, written, ret, max_pkt_size, sw, sh;
     const uint32_t *pal = NULL;
+    uint32_t palette256[256];
     const uint8_t *src;
 
     if (avctx->width > 65535 || avctx->height > 65535) {
@@ -119,6 +121,11 @@ static int pcx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_RGB4_BYTE:
     case AV_PIX_FMT_BGR4_BYTE:
     case AV_PIX_FMT_GRAY8:
+        bpp = 8;
+        nplanes = 1;
+        avpriv_set_systematic_pal2(palette256, avctx->pix_fmt);
+        pal = palette256;
+        break;
     case AV_PIX_FMT_PAL8:
         bpp = 8;
         nplanes = 1;
@@ -138,13 +145,16 @@ static int pcx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     line_bytes = (line_bytes + 1) & ~1;
 
     max_pkt_size = 128 + avctx->height * 2 * line_bytes * nplanes + (pal ? 256*3 + 1 : 0);
-    if ((ret = ff_alloc_packet(pkt, max_pkt_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", max_pkt_size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, max_pkt_size, 0)) < 0)
         return ret;
-    }
     buf     = pkt->data;
     buf_end = pkt->data + pkt->size;
 
+    sw = avctx->sample_aspect_ratio.num;
+    sh = avctx->sample_aspect_ratio.den;
+    if (sw > 0xFFFFu || sh > 0xFFFFu)
+        av_reduce(&sw, &sh, sw, sh, 0xFFFFu);
+
     bytestream_put_byte(&buf, 10);                  // manufacturer
     bytestream_put_byte(&buf, 5);                   // version
     bytestream_put_byte(&buf, 1);                   // encoding
@@ -153,8 +163,8 @@ static int pcx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     bytestream_put_le16(&buf, 0);                   // y min
     bytestream_put_le16(&buf, avctx->width - 1);    // x max
     bytestream_put_le16(&buf, avctx->height - 1);   // y max
-    bytestream_put_le16(&buf, 0);                   // horizontal DPI
-    bytestream_put_le16(&buf, 0);                   // vertical DPI
+    bytestream_put_le16(&buf, sw);                  // horizontal DPI
+    bytestream_put_le16(&buf, sh);                  // vertical DPI
     for (i = 0; i < 16; i++)
         bytestream_put_be24(&buf, pal ? pal[i] : 0);// palette (<= 16 color only)
     bytestream_put_byte(&buf, 0);                   // reserved
diff --git a/libavcodec/pel_template.c b/libavcodec/pel_template.c
index b832ae7624..6da7a56b2d 100644
--- a/libavcodec/pel_template.c
+++ b/libavcodec/pel_template.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pgssubdec.c b/libavcodec/pgssubdec.c
index 72142e6f7c..50fc5b6e64 100644
--- a/libavcodec/pgssubdec.c
+++ b/libavcodec/pgssubdec.c
@@ -2,20 +2,20 @@
  * PGS subtitle decoder
  * Copyright (c) 2009 Stephen Backway
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 
 #include "libavutil/colorspace.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 
 #define RGBA(r,g,b,a) (((a) << 24) | ((r) << 16) | ((g) << 8) | (b))
 #define MAX_EPOCH_PALETTES 8   // Max 8 allowed per PGS epoch
@@ -90,9 +91,11 @@ typedef struct PGSSubPalettes {
 } PGSSubPalettes;
 
 typedef struct PGSSubContext {
+    AVClass *class;
     PGSSubPresentation presentation;
     PGSSubPalettes     palettes;
     PGSSubObjects      objects;
+    int forced_subs_only;
 } PGSSubContext;
 
 static void flush_cache(AVCodecContext *avctx)
@@ -133,7 +136,7 @@ static PGSSubPalette * find_palette(int id, PGSSubPalettes *palettes)
 
 static av_cold int init_decoder(AVCodecContext *avctx)
 {
-    avctx->pix_fmt = AV_PIX_FMT_PAL8;
+    avctx->pix_fmt     = AV_PIX_FMT_PAL8;
 
     return 0;
 }
@@ -148,7 +151,7 @@ static av_cold int close_decoder(AVCodecContext *avctx)
 /**
  * Decode the RLE data.
  *
- * The subtitle is stored as an Run Length Encoded image.
+ * The subtitle is stored as a Run Length Encoded image.
  *
  * @param avctx contains the current codec context
  * @param sub pointer to the processed subtitle data
@@ -163,7 +166,7 @@ static int decode_rle(AVCodecContext *avctx, AVSubtitleRect *rect,
 
     rle_bitmap_end = buf + buf_size;
 
-    rect->pict.data[0] = av_malloc(rect->w * rect->h);
+    rect->pict.data[0] = av_malloc_array(rect->w, rect->h);
 
     if (!rect->pict.data[0])
         return AVERROR(ENOMEM);
@@ -295,7 +298,7 @@ static int parse_object_segment(AVCodecContext *avctx,
     object->w = width;
     object->h = height;
 
-    av_fast_malloc(&object->rle, &object->rle_buffer_size, rle_bitmap_len);
+    av_fast_padded_malloc(&object->rle, &object->rle_buffer_size, rle_bitmap_len);
 
     if (!object->rle)
         return AVERROR(ENOMEM);
@@ -378,8 +381,8 @@ static int parse_presentation_segment(AVCodecContext *avctx,
                                       int64_t pts)
 {
     PGSSubContext *ctx = avctx->priv_data;
-
     int i, state, ret;
+    const uint8_t *buf_end = buf + buf_size;
 
     // Video descriptor
     int w = bytestream_get_be16(&buf);
@@ -428,8 +431,16 @@ static int parse_presentation_segment(AVCodecContext *avctx,
         }
     }
 
+
     for (i = 0; i < ctx->presentation.object_count; i++)
     {
+
+        if (buf_end - buf < 8) {
+            av_log(avctx, AV_LOG_ERROR, "Insufficent space for object\n");
+            ctx->presentation.object_count = i;
+            return AVERROR_INVALIDDATA;
+        }
+
         ctx->presentation.objects[i].id = bytestream_get_be16(&buf);
         ctx->presentation.objects[i].window_id = bytestream_get_byte(&buf);
         ctx->presentation.objects[i].composition_flag = bytestream_get_byte(&buf);
@@ -480,11 +491,14 @@ static int display_end_segment(AVCodecContext *avctx, void *data,
 {
     AVSubtitle    *sub = data;
     PGSSubContext *ctx = avctx->priv_data;
+    int64_t pts;
     PGSSubPalette *palette;
     int i, ret;
 
+    pts = ctx->presentation.pts != AV_NOPTS_VALUE ? ctx->presentation.pts : sub->pts;
     memset(sub, 0, sizeof(*sub));
-    sub->pts = ctx->presentation.pts;
+    sub->pts = pts;
+    ctx->presentation.pts = AV_NOPTS_VALUE;
     sub->start_display_time = 0;
     // There is no explicit end time for PGS subtitles.  The end time
     // is defined by the start of the next sub which may contain no
@@ -495,7 +509,7 @@ static int display_end_segment(AVCodecContext *avctx, void *data,
     // Blank if last object_count was 0.
     if (!ctx->presentation.object_count)
         return 1;
-    sub->rects = av_mallocz(sizeof(*sub->rects) * ctx->presentation.object_count);
+    sub->rects = av_mallocz_array(ctx->presentation.object_count, sizeof(*sub->rects));
     if (!sub->rects) {
         return AVERROR(ENOMEM);
     }
@@ -570,6 +584,7 @@ static int display_end_segment(AVCodecContext *avctx, void *data,
             return AVERROR(ENOMEM);
         }
 
+        if (!ctx->forced_subs_only || ctx->presentation.objects[i].composition_flag & 0x40)
         memcpy(sub->rects[i]->pict.data[1], palette->clut, sub->rects[i]->nb_colors * sizeof(uint32_t));
 
     }
@@ -625,7 +640,7 @@ static int decode(AVCodecContext *avctx, void *data, int *data_size,
             ret = parse_object_segment(avctx, buf, segment_length);
             break;
         case PRESENTATION_SEGMENT:
-            ret = parse_presentation_segment(avctx, buf, segment_length, avpkt->pts);
+            ret = parse_presentation_segment(avctx, buf, segment_length, ((AVSubtitle*)(data))->pts);
             break;
         case WINDOW_SEGMENT:
             /*
@@ -657,6 +672,20 @@ static int decode(AVCodecContext *avctx, void *data, int *data_size,
     return buf_size;
 }
 
+#define OFFSET(x) offsetof(PGSSubContext, x)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    {"forced_subs_only", "Only show forced subtitles", OFFSET(forced_subs_only), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, SD},
+    { NULL },
+};
+
+static const AVClass pgsdec_class = {
+    .class_name = "PGS subtitle decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_pgssub_decoder = {
     .name           = "pgssub",
     .long_name      = NULL_IF_CONFIG_SMALL("HDMV Presentation Graphic Stream subtitles"),
@@ -666,4 +695,5 @@ AVCodec ff_pgssub_decoder = {
     .init           = init_decoder,
     .close          = close_decoder,
     .decode         = decode,
+    .priv_class     = &pgsdec_class,
 };
diff --git a/libavcodec/pictordec.c b/libavcodec/pictordec.c
index 9477bc402d..ff6eb7f4fc 100644
--- a/libavcodec/pictordec.c
+++ b/libavcodec/pictordec.c
@@ -2,20 +2,20 @@
  * Pictor/PC Paint decoder
  * Copyright (c) 2010 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,7 +105,7 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame *frame = data;
     uint32_t *palette;
     int bits_per_plane, bpp, etype, esize, npal, pos_after_pal;
-    int i, x, y, plane, tmp, ret;
+    int i, x, y, plane, tmp, ret, val;
 
     bytestream2_init(&s->g, avpkt->data, avpkt->size);
 
@@ -127,7 +127,7 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_PATCHWELCOME;
     }
 
-    if (bytestream2_peek_byte(&s->g) == 0xFF) {
+    if (bytestream2_peek_byte(&s->g) == 0xFF || bpp == 1 || bpp == 4 || bpp == 8) {
         bytestream2_skip(&s->g, 2);
         etype = bytestream2_get_le16(&s->g);
         esize = bytestream2_get_le16(&s->g);
@@ -140,16 +140,16 @@ static int decode_frame(AVCodecContext *avctx,
 
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
+    if (av_image_check_size(s->width, s->height, 0, avctx) < 0)
+        return -1;
     if (s->width != avctx->width && s->height != avctx->height) {
         ret = ff_set_dimensions(avctx, s->width, s->height);
         if (ret < 0)
             return ret;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     memset(frame->data[0], 0, s->height * frame->linesize[0]);
     frame->pict_type           = AV_PICTURE_TYPE_I;
     frame->palette_has_changed = 1;
@@ -165,7 +165,7 @@ static int decode_frame(AVCodecContext *avctx,
         npal = FFMIN(esize, 16);
         for (i = 0; i < npal; i++) {
             int pal_idx = bytestream2_get_byte(&s->g);
-            palette[i]  = ff_cga_palette[FFMIN(pal_idx, 16)];
+            palette[i]  = ff_cga_palette[FFMIN(pal_idx, 15)];
         }
     } else if (etype == 3) {
         npal = FFMIN(esize, 16);
@@ -175,13 +175,15 @@ static int decode_frame(AVCodecContext *avctx,
         }
     } else if (etype == 4 || etype == 5) {
         npal = FFMIN(esize / 3, 256);
-        for (i = 0; i < npal; i++)
+        for (i = 0; i < npal; i++) {
             palette[i] = bytestream2_get_be24(&s->g) << 2;
+            palette[i] |= 0xFFU << 24 | palette[i] >> 6 & 0x30303;
+        }
     } else {
         if (bpp == 1) {
             npal = 2;
-            palette[0] = 0x000000;
-            palette[1] = 0xFFFFFF;
+            palette[0] = 0xFF000000;
+            palette[1] = 0xFFFFFFFF;
         } else if (bpp == 2) {
             npal = 4;
             for (i = 0; i < npal; i++)
@@ -196,10 +198,11 @@ static int decode_frame(AVCodecContext *avctx,
     // skip remaining palette bytes
     bytestream2_seek(&s->g, pos_after_pal, SEEK_SET);
 
-    x = 0;
+    val = 0;
     y = s->height - 1;
-    plane = 0;
     if (bytestream2_get_le16(&s->g)) {
+        x = 0;
+        plane = 0;
         while (bytestream2_get_bytes_left(&s->g) >= 6) {
             int stop_size, marker, t1, t2;
 
@@ -213,7 +216,7 @@ static int decode_frame(AVCodecContext *avctx,
             while (plane < s->nb_planes &&
                    bytestream2_get_bytes_left(&s->g) > stop_size) {
                 int run = 1;
-                int val = bytestream2_get_byte(&s->g);
+                val = bytestream2_get_byte(&s->g);
                 if (val == marker) {
                     run = bytestream2_get_byte(&s->g);
                     if (run == 0)
@@ -232,9 +235,20 @@ static int decode_frame(AVCodecContext *avctx,
                 }
             }
         }
+
+        if (x < avctx->width) {
+            int run = (y + 1) * avctx->width - x;
+            if (bits_per_plane == 8)
+                picmemset_8bpp(s, frame, val, run, &x, &y);
+            else
+                picmemset(s, frame, val, run / (8 / bits_per_plane), &x, &y, &plane, bits_per_plane);
+        }
     } else {
-        avpriv_request_sample(avctx, "Uncompressed image");
-        return avpkt->size;
+        while (y >= 0 && bytestream2_get_bytes_left(&s->g) > 0) {
+            memcpy(frame->data[0] + y * frame->linesize[0], s->g.buffer, FFMIN(avctx->width, bytestream2_get_bytes_left(&s->g)));
+            bytestream2_skip(&s->g, avctx->width);
+            y--;
+        }
     }
 finish:
 
diff --git a/libavcodec/pixblockdsp.c b/libavcodec/pixblockdsp.c
index 71423f9cfc..322e1dd111 100644
--- a/libavcodec/pixblockdsp.c
+++ b/libavcodec/pixblockdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
 #define BIT_DEPTH 8
 #include "pixblockdsp_template.c"
 
-static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
+static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
                           const uint8_t *s2, int stride)
 {
     int i;
@@ -60,17 +60,25 @@ av_cold void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx)
     switch (avctx->bits_per_raw_sample) {
     case 9:
     case 10:
+    case 12:
+    case 14:
         c->get_pixels = get_pixels_16_c;
         break;
     default:
-        c->get_pixels = get_pixels_8_c;
+        if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
+            c->get_pixels = get_pixels_8_c;
+        }
         break;
     }
 
+    if (ARCH_ALPHA)
+        ff_pixblockdsp_init_alpha(c, avctx, high_bit_depth);
     if (ARCH_ARM)
         ff_pixblockdsp_init_arm(c, avctx, high_bit_depth);
     if (ARCH_PPC)
         ff_pixblockdsp_init_ppc(c, avctx, high_bit_depth);
     if (ARCH_X86)
         ff_pixblockdsp_init_x86(c, avctx, high_bit_depth);
+    if (ARCH_MIPS)
+        ff_pixblockdsp_init_mips(c, avctx, high_bit_depth);
 }
diff --git a/libavcodec/pixblockdsp.h b/libavcodec/pixblockdsp.h
index 8094d14b68..79ed86c3a6 100644
--- a/libavcodec/pixblockdsp.h
+++ b/libavcodec/pixblockdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 typedef struct PixblockDSPContext {
     void (*get_pixels)(int16_t *block /* align 16 */,
                        const uint8_t *pixels /* align 8 */,
-                       int line_size);
+                       ptrdiff_t line_size);
     void (*diff_pixels)(int16_t *block /* align 16 */,
                         const uint8_t *s1 /* align 8 */,
                         const uint8_t *s2 /* align 8 */,
@@ -34,11 +34,15 @@ typedef struct PixblockDSPContext {
 } PixblockDSPContext;
 
 void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx);
+void ff_pixblockdsp_init_alpha(PixblockDSPContext *c, AVCodecContext *avctx,
+                               unsigned high_bit_depth);
 void ff_pixblockdsp_init_arm(PixblockDSPContext *c, AVCodecContext *avctx,
                              unsigned high_bit_depth);
 void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, AVCodecContext *avctx,
                              unsigned high_bit_depth);
 void ff_pixblockdsp_init_x86(PixblockDSPContext *c, AVCodecContext *avctx,
                              unsigned high_bit_depth);
+void ff_pixblockdsp_init_mips(PixblockDSPContext *c, AVCodecContext *avctx,
+                              unsigned high_bit_depth);
 
 #endif /* AVCODEC_PIXBLOCKDSP_H */
diff --git a/libavcodec/pixblockdsp_template.c b/libavcodec/pixblockdsp_template.c
index 71d3cf150d..d1e91022a7 100644
--- a/libavcodec/pixblockdsp_template.c
+++ b/libavcodec/pixblockdsp_template.c
@@ -1,25 +1,25 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "bit_depth_template.c"
 
-static void FUNCC(get_pixels)(int16_t *restrict block, const uint8_t *_pixels,
-                              int line_size)
+static void FUNCC(get_pixels)(int16_t *av_restrict block, const uint8_t *_pixels,
+                              ptrdiff_t line_size)
 {
     const pixel *pixels = (const pixel *) _pixels;
     int i;
diff --git a/libavcodec/pixels.h b/libavcodec/pixels.h
index d9d2fde92c..98eacd4df6 100644
--- a/libavcodec/pixels.h
+++ b/libavcodec/pixels.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/png.c b/libavcodec/png.c
index cd75dc1815..ef52b51bd4 100644
--- a/libavcodec/png.c
+++ b/libavcodec/png.c
@@ -2,29 +2,25 @@
  * PNG image format
  * Copyright (c) 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
-#include "bytestream.h"
 #include "png.h"
 
-const uint8_t ff_pngsig[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
-const uint8_t ff_mngsig[8] = { 138, 77, 78, 71, 13, 10, 26, 10 };
-
 /* Mask to determine which y pixels are valid in a pass */
 const uint8_t ff_png_pass_ymask[NB_PASSES] = {
     0x80, 0x80, 0x08, 0x88, 0x22, 0xaa, 0x55,
@@ -40,11 +36,6 @@ static const uint8_t ff_png_pass_xshift[NB_PASSES] = {
     3, 3, 2, 2, 1, 1, 0
 };
 
-/* Mask to determine which pixels are valid in a pass */
-const uint8_t ff_png_pass_mask[NB_PASSES] = {
-    0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff
-};
-
 void *ff_png_zalloc(void *opaque, unsigned int items, unsigned int size)
 {
     return av_mallocz_array(items, size);
diff --git a/libavcodec/png.h b/libavcodec/png.h
index b8c72eebc9..948c2f714f 100644
--- a/libavcodec/png.h
+++ b/libavcodec/png.h
@@ -2,20 +2,20 @@
  * PNG image format
  * Copyright (c) 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,15 +49,12 @@
 
 #define NB_PASSES 7
 
-extern const uint8_t ff_pngsig[8];
-extern const uint8_t ff_mngsig[8];
+#define PNGSIG 0x89504e470d0a1a0a
+#define MNGSIG 0x8a4d4e470d0a1a0a
 
 /* Mask to determine which y pixels are valid in a pass */
 extern const uint8_t ff_png_pass_ymask[NB_PASSES];
 
-/* Mask to determine which pixels are valid in a pass */
-extern const uint8_t ff_png_pass_mask[NB_PASSES];
-
 void *ff_png_zalloc(void *opaque, unsigned int items, unsigned int size);
 
 void ff_png_zfree(void *opaque, void *ptr);
diff --git a/libavcodec/png_parser.c b/libavcodec/png_parser.c
index c66caf31a3..74f2964118 100644
--- a/libavcodec/png_parser.c
+++ b/libavcodec/png_parser.c
@@ -2,20 +2,20 @@
  * PNG parser
  * Copyright (c) 2009 Peter Holik
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,20 +24,14 @@
  * PNG parser
  */
 
-#include "libavutil/intreadwrite.h"
-#include "libavutil/common.h"
-
 #include "parser.h"
-
-#define PNG_SIGNATURE UINT64_C(0x89504e470d0a1a0a)
-#define MNG_SIGNATURE UINT64_C(0x8a4d4e470d0a1a0a)
+#include "png.h"
 
 typedef struct PNGParseContext {
     ParseContext pc;
-
-    int chunk_pos;          ///< position inside current chunk
-    uint32_t chunk_length;  ///< length of the current chunk
-    int remaining_size;     ///< remaining size of the current chunk
+    uint32_t chunk_pos;           ///< position inside current chunk
+    uint32_t chunk_length;        ///< length of the current chunk
+    uint32_t remaining_size;      ///< remaining size of the current chunk
 } PNGParseContext;
 
 static int png_parse(AVCodecParserContext *s, AVCodecContext *avctx,
@@ -48,16 +42,15 @@ static int png_parse(AVCodecParserContext *s, AVCodecContext *avctx,
     int next = END_NOT_FOUND;
     int i = 0;
 
+    s->pict_type = AV_PICTURE_TYPE_NONE;
+
     *poutbuf_size = 0;
-    if (buf_size == 0)
-        return 0;
 
     if (!ppc->pc.frame_start_found) {
         uint64_t state64 = ppc->pc.state64;
         for (; i < buf_size; i++) {
             state64 = (state64 << 8) | buf[i];
-            if (state64 == PNG_SIGNATURE ||
-                state64 == MNG_SIGNATURE) {
+            if (state64 == PNGSIG || state64 == MNGSIG) {
                 i++;
                 ppc->pc.frame_start_found = 1;
                 break;
diff --git a/libavcodec/pngdec.c b/libavcodec/pngdec.c
index 5fa7a2dfdf..d18014142a 100644
--- a/libavcodec/pngdec.c
+++ b/libavcodec/pngdec.c
@@ -2,45 +2,55 @@
  * PNG image format
  * Copyright (c) 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/avstring.h"
+//#define DEBUG
+
+#include "libavutil/avassert.h"
+#include "libavutil/bprint.h"
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
+#include "apng.h"
 #include "png.h"
 #include "pngdsp.h"
-
-/* TODO:
- * - add 2, 4 and 16 bit depth support
- */
+#include "thread.h"
 
 #include <zlib.h>
 
 typedef struct PNGDecContext {
     PNGDSPContext dsp;
+    AVCodecContext *avctx;
 
     GetByteContext gb;
-    AVFrame *prev;
+    ThreadFrame previous_picture;
+    ThreadFrame last_picture;
+    ThreadFrame picture;
 
     int state;
     int width, height;
+    int cur_w, cur_h;
+    int last_w, last_h;
+    int x_offset, y_offset;
+    int last_x_offset, last_y_offset;
+    uint8_t dispose_op, blend_op;
+    uint8_t last_dispose_op;
     int bit_depth;
     int color_type;
     int compression_type;
@@ -49,13 +59,19 @@ typedef struct PNGDecContext {
     int channels;
     int bits_per_pixel;
     int bpp;
+    int has_trns;
+    uint8_t transparent_color_be[6];
 
     uint8_t *image_buf;
     int image_linesize;
     uint32_t palette[256];
     uint8_t *crow_buf;
     uint8_t *last_row;
+    unsigned int last_row_size;
     uint8_t *tmp_row;
+    unsigned int tmp_row_size;
+    uint8_t *buffer;
+    int buffer_size;
     int pass;
     int crow_size; /* compressed row size (include filter type) */
     int row_size; /* decompressed row size */
@@ -64,9 +80,14 @@ typedef struct PNGDecContext {
     z_stream zstream;
 } PNGDecContext;
 
+/* Mask to determine which pixels are valid in a pass */
+static const uint8_t png_pass_mask[NB_PASSES] = {
+    0x01, 0x01, 0x11, 0x11, 0x55, 0x55, 0xff,
+};
+
 /* Mask to determine which y pixels can be written in a pass */
 static const uint8_t png_pass_dsp_ymask[NB_PASSES] = {
-    0xff, 0xff, 0x0f, 0xcc, 0x33, 0xff, 0x55,
+    0xff, 0xff, 0x0f, 0xff, 0x33, 0xff, 0x55,
 };
 
 /* Mask to determine which pixels to overwrite while displaying */
@@ -85,40 +106,55 @@ static void png_put_interlaced_row(uint8_t *dst, int width,
     uint8_t *d;
     const uint8_t *s;
 
-    mask     = ff_png_pass_mask[pass];
+    mask     = png_pass_mask[pass];
     dsp_mask = png_pass_dsp_mask[pass];
 
     switch (bits_per_pixel) {
     case 1:
-        /* we must initialize the line to zero before writing to it */
-        if (pass == 0)
-            memset(dst, 0, (width + 7) >> 3);
         src_x = 0;
         for (x = 0; x < width; x++) {
             j = (x & 7);
             if ((dsp_mask << j) & 0x80) {
                 b = (src[src_x >> 3] >> (7 - (src_x & 7))) & 1;
+                dst[x >> 3] &= 0xFF7F>>j;
                 dst[x >> 3] |= b << (7 - j);
             }
             if ((mask << j) & 0x80)
                 src_x++;
         }
         break;
+    case 2:
+        src_x = 0;
+        for (x = 0; x < width; x++) {
+            int j2 = 2 * (x & 3);
+            j = (x & 7);
+            if ((dsp_mask << j) & 0x80) {
+                b = (src[src_x >> 2] >> (6 - 2*(src_x & 3))) & 3;
+                dst[x >> 2] &= 0xFF3F>>j2;
+                dst[x >> 2] |= b << (6 - j2);
+            }
+            if ((mask << j) & 0x80)
+                src_x++;
+        }
+        break;
+    case 4:
+        src_x = 0;
+        for (x = 0; x < width; x++) {
+            int j2 = 4*(x&1);
+            j = (x & 7);
+            if ((dsp_mask << j) & 0x80) {
+                b = (src[src_x >> 1] >> (4 - 4*(src_x & 1))) & 15;
+                dst[x >> 1] &= 0xFF0F>>j2;
+                dst[x >> 1] |= b << (4 - j2);
+            }
+            if ((mask << j) & 0x80)
+                src_x++;
+        }
+        break;
     default:
         bpp = bits_per_pixel >> 3;
         d   = dst;
         s   = src;
-        if (color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-            for (x = 0; x < width; x++) {
-                j = x & 7;
-                if ((dsp_mask << j) & 0x80) {
-                    *(uint32_t *)d = (s[3] << 24) | (s[0] << 16) | (s[1] << 8) | s[2];
-                }
-                d += bpp;
-                if ((mask << j) & 0x80)
-                    s += bpp;
-            }
-        } else {
             for (x = 0; x < width; x++) {
                 j = x & 7;
                 if ((dsp_mask << j) & 0x80) {
@@ -128,7 +164,6 @@ static void png_put_interlaced_row(uint8_t *dst, int width,
                 if ((mask << j) & 0x80)
                     s += bpp;
             }
-        }
         break;
     }
 }
@@ -170,7 +205,7 @@ void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top,
             b = dst[2];                                                       \
         if (bpp >= 4)                                                         \
             a = dst[3];                                                       \
-        for (; i < size; i += bpp) {                                          \
+        for (; i <= size - bpp; i += bpp) {                                   \
             dst[i + 0] = r = op(r, src[i + 0], last[i + 0]);                  \
             if (bpp == 1)                                                     \
                 continue;                                                     \
@@ -193,12 +228,9 @@ void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top,
         UNROLL1(3, op)                                                        \
     } else if (bpp == 4) {                                                    \
         UNROLL1(4, op)                                                        \
-    } else {                                                                  \
-        for (; i < size; i += bpp) {                                          \
-            int j;                                                            \
-            for (j = 0; j < bpp; j++)                                         \
-                dst[i + j] = op(dst[i + j - bpp], src[i + j], last[i + j]);   \
-        }                                                                     \
+    }                                                                         \
+    for (; i < size; i++) {                                                   \
+        dst[i] = op(dst[i - bpp], src[i], last[i]);                           \
     }
 
 /* NOTE: 'dst' can be equal to 'last' */
@@ -217,12 +249,12 @@ static void png_filter_row(PNGDSPContext *dsp, uint8_t *dst, int filter_type,
         if (bpp == 4) {
             p = *(int *)dst;
             for (; i < size; i += bpp) {
-                int s = *(int *)(src + i);
+                unsigned s = *(int *)(src + i);
                 p = ((s & 0x7f7f7f7f) + (p & 0x7f7f7f7f)) ^ ((s ^ p) & 0x80808080);
                 *(int *)(dst + i) = p;
             }
         } else {
-#define OP_SUB(x, s, l) x + s
+#define OP_SUB(x, s, l) ((x) + (s))
             UNROLL_FILTER(OP_SUB);
         }
         break;
@@ -234,7 +266,7 @@ static void png_filter_row(PNGDSPContext *dsp, uint8_t *dst, int filter_type,
             p      = (last[i] >> 1);
             dst[i] = p + src[i];
         }
-#define OP_AVG(x, s, l) (((x + l) >> 1) + s) & 0xff
+#define OP_AVG(x, s, l) (((((x) + (l)) >> 1) + (s)) & 0xff)
         UNROLL_FILTER(OP_AVG);
         break;
     case PNG_FILTER_VALUE_PAETH:
@@ -245,55 +277,33 @@ static void png_filter_row(PNGDSPContext *dsp, uint8_t *dst, int filter_type,
         if (bpp > 2 && size > 4) {
             /* would write off the end of the array if we let it process
              * the last pixel with bpp=3 */
-            int w = bpp == 4 ? size : size - 3;
-            dsp->add_paeth_prediction(dst + i, src + i, last + i, w - i, bpp);
-            i = w;
+            int w = (bpp & 3) ? size - 3 : size;
+
+            if (w > i) {
+                dsp->add_paeth_prediction(dst + i, src + i, last + i, size - i, bpp);
+                i = w;
+            }
         }
         ff_add_png_paeth_prediction(dst + i, src + i, last + i, size - i, bpp);
         break;
     }
 }
 
-static av_always_inline void convert_to_rgb32_loco(uint8_t *dst,
-                                                   const uint8_t *src,
-                                                   int width, int loco)
-{
-    int j;
-    unsigned int r, g, b, a;
-
-    for (j = 0; j < width; j++) {
-        r = src[0];
-        g = src[1];
-        b = src[2];
-        a = src[3];
-        if (loco) {
-            r = (r + g) & 0xff;
-            b = (b + g) & 0xff;
-        }
-        *(uint32_t *) dst = (a << 24) | (r << 16) | (g << 8) | b;
-        dst += 4;
-        src += 4;
-    }
-}
-
-static void convert_to_rgb32(uint8_t *dst, const uint8_t *src,
-                             int width, int loco)
-{
-    if (loco)
-        convert_to_rgb32_loco(dst, src, width, 1);
-    else
-        convert_to_rgb32_loco(dst, src, width, 0);
+/* This used to be called "deloco" in FFmpeg
+ * and is actually an inverse reversible colorspace transformation */
+#define YUV2RGB(NAME, TYPE) \
+static void deloco_ ## NAME(TYPE *dst, int size, int alpha) \
+{ \
+    int i; \
+    for (i = 0; i < size; i += 3 + alpha) { \
+        int g = dst [i + 1]; \
+        dst[i + 0] += g; \
+        dst[i + 2] += g; \
+    } \
 }
 
-static void deloco_rgb24(uint8_t *dst, int size)
-{
-    int i;
-    for (i = 0; i < size; i += 3) {
-        int g = dst[i + 1];
-        dst[i + 0] += g;
-        dst[i + 2] += g;
-    }
-}
+YUV2RGB(rgb8, uint8_t)
+YUV2RGB(rgb16, uint16_t)
 
 /* process exactly one decompressed row */
 static void png_handle_row(PNGDecContext *s)
@@ -302,39 +312,41 @@ static void png_handle_row(PNGDecContext *s)
     int got_line;
 
     if (!s->interlace_type) {
-        ptr = s->image_buf + s->image_linesize * s->y;
-        /* need to swap bytes correctly for RGB_ALPHA */
-        if (s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-            png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1,
-                           s->last_row, s->row_size, s->bpp);
-            convert_to_rgb32(ptr, s->tmp_row, s->width,
-                             s->filter_type == PNG_FILTER_TYPE_LOCO);
-            FFSWAP(uint8_t *, s->last_row, s->tmp_row);
-        } else {
-            /* in normal case, we avoid one copy */
-            if (s->y == 0)
-                last_row = s->last_row;
-            else
-                last_row = ptr - s->image_linesize;
+        ptr = s->image_buf + s->image_linesize * (s->y + s->y_offset) + s->x_offset * s->bpp;
+        if (s->y == 0)
+            last_row = s->last_row;
+        else
+            last_row = ptr - s->image_linesize;
 
-            png_filter_row(&s->dsp, ptr, s->crow_buf[0], s->crow_buf + 1,
-                           last_row, s->row_size, s->bpp);
-        }
+        png_filter_row(&s->dsp, ptr, s->crow_buf[0], s->crow_buf + 1,
+                       last_row, s->row_size, s->bpp);
         /* loco lags by 1 row so that it doesn't interfere with top prediction */
-        if (s->filter_type == PNG_FILTER_TYPE_LOCO &&
-            s->color_type == PNG_COLOR_TYPE_RGB && s->y > 0)
-            deloco_rgb24(ptr - s->image_linesize, s->row_size);
+        if (s->filter_type == PNG_FILTER_TYPE_LOCO && s->y > 0) {
+            if (s->bit_depth == 16) {
+                deloco_rgb16((uint16_t *)(ptr - s->image_linesize), s->row_size / 2,
+                             s->color_type == PNG_COLOR_TYPE_RGB_ALPHA);
+            } else {
+                deloco_rgb8(ptr - s->image_linesize, s->row_size,
+                            s->color_type == PNG_COLOR_TYPE_RGB_ALPHA);
+            }
+        }
         s->y++;
-        if (s->y == s->height) {
+        if (s->y == s->cur_h) {
             s->state |= PNG_ALLIMAGE;
-            if (s->filter_type == PNG_FILTER_TYPE_LOCO &&
-                s->color_type == PNG_COLOR_TYPE_RGB)
-                deloco_rgb24(ptr, s->row_size);
+            if (s->filter_type == PNG_FILTER_TYPE_LOCO) {
+                if (s->bit_depth == 16) {
+                    deloco_rgb16((uint16_t *)ptr, s->row_size / 2,
+                                 s->color_type == PNG_COLOR_TYPE_RGB_ALPHA);
+                } else {
+                    deloco_rgb8(ptr, s->row_size,
+                                s->color_type == PNG_COLOR_TYPE_RGB_ALPHA);
+                }
+            }
         }
     } else {
         got_line = 0;
         for (;;) {
-            ptr = s->image_buf + s->image_linesize * s->y;
+            ptr = s->image_buf + s->image_linesize * (s->y + s->y_offset) + s->x_offset * s->bpp;
             if ((ff_png_pass_ymask[s->pass] << (s->y & 7)) & 0x80) {
                 /* if we already read one row, it is time to stop to
                  * wait for the next one */
@@ -343,15 +355,16 @@ static void png_handle_row(PNGDecContext *s)
                 png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1,
                                s->last_row, s->pass_row_size, s->bpp);
                 FFSWAP(uint8_t *, s->last_row, s->tmp_row);
+                FFSWAP(unsigned int, s->last_row_size, s->tmp_row_size);
                 got_line = 1;
             }
             if ((png_pass_dsp_ymask[s->pass] << (s->y & 7)) & 0x80) {
-                /* NOTE: RGB32 is handled directly in png_put_interlaced_row */
-                png_put_interlaced_row(ptr, s->width, s->bits_per_pixel, s->pass,
+                png_put_interlaced_row(ptr, s->cur_w, s->bits_per_pixel, s->pass,
                                        s->color_type, s->last_row);
             }
             s->y++;
-            if (s->y == s->height) {
+            if (s->y == s->cur_h) {
+                memset(s->last_row, 0, s->row_size);
                 for (;;) {
                     if (s->pass == NB_PASSES - 1) {
                         s->state |= PNG_ALLIMAGE;
@@ -361,7 +374,7 @@ static void png_handle_row(PNGDecContext *s)
                         s->y = 0;
                         s->pass_row_size = ff_png_pass_row_size(s->pass,
                                                                 s->bits_per_pixel,
-                                                                s->width);
+                                                                s->cur_w);
                         s->crow_size = s->pass_row_size + 1;
                         if (s->pass_row_size != 0)
                             break;
@@ -378,14 +391,15 @@ static int png_decode_idat(PNGDecContext *s, int length)
 {
     int ret;
     s->zstream.avail_in = FFMIN(length, bytestream2_get_bytes_left(&s->gb));
-    s->zstream.next_in  = s->gb.buffer;
+    s->zstream.next_in  = (unsigned char *)s->gb.buffer;
     bytestream2_skip(&s->gb, length);
 
     /* decode one line if possible */
     while (s->zstream.avail_in > 0) {
         ret = inflate(&s->zstream, Z_PARTIAL_FLUSH);
         if (ret != Z_OK && ret != Z_STREAM_END) {
-            return -1;
+            av_log(s->avctx, AV_LOG_ERROR, "inflate returned error %d\n", ret);
+            return AVERROR_EXTERNAL;
         }
         if (s->zstream.avail_out == 0) {
             if (!(s->state & PNG_ALLIMAGE)) {
@@ -403,214 +417,753 @@ static int png_decode_idat(PNGDecContext *s, int length)
     return 0;
 }
 
-static int decode_frame(AVCodecContext *avctx,
-                        void *data, int *got_frame,
-                        AVPacket *avpkt)
+static int decode_zbuf(AVBPrint *bp, const uint8_t *data,
+                       const uint8_t *data_end)
 {
-    PNGDecContext *const s = avctx->priv_data;
-    const uint8_t *buf     = avpkt->data;
-    int buf_size           = avpkt->size;
-    AVFrame *p             = data;
-    uint8_t *crow_buf_base = NULL;
-    uint32_t tag, length;
+    z_stream zstream;
+    unsigned char *buf;
+    unsigned buf_size;
     int ret;
 
-    /* check signature */
-    if (buf_size < 8) {
-        av_log(avctx, AV_LOG_ERROR, "Not enough data %d\n",
-               buf_size);
+    zstream.zalloc = ff_png_zalloc;
+    zstream.zfree  = ff_png_zfree;
+    zstream.opaque = NULL;
+    if (inflateInit(&zstream) != Z_OK)
+        return AVERROR_EXTERNAL;
+    zstream.next_in  = (unsigned char *)data;
+    zstream.avail_in = data_end - data;
+    av_bprint_init(bp, 0, -1);
+
+    while (zstream.avail_in > 0) {
+        av_bprint_get_buffer(bp, 1, &buf, &buf_size);
+        if (!buf_size) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        zstream.next_out  = buf;
+        zstream.avail_out = buf_size;
+        ret = inflate(&zstream, Z_PARTIAL_FLUSH);
+        if (ret != Z_OK && ret != Z_STREAM_END) {
+            ret = AVERROR_EXTERNAL;
+            goto fail;
+        }
+        bp->len += zstream.next_out - buf;
+        if (ret == Z_STREAM_END)
+            break;
+    }
+    inflateEnd(&zstream);
+    bp->str[bp->len] = 0;
+    return 0;
+
+fail:
+    inflateEnd(&zstream);
+    av_bprint_finalize(bp, NULL);
+    return ret;
+}
+
+static uint8_t *iso88591_to_utf8(const uint8_t *in, size_t size_in)
+{
+    size_t extra = 0, i;
+    uint8_t *out, *q;
+
+    for (i = 0; i < size_in; i++)
+        extra += in[i] >= 0x80;
+    if (size_in == SIZE_MAX || extra > SIZE_MAX - size_in - 1)
+        return NULL;
+    q = out = av_malloc(size_in + extra + 1);
+    if (!out)
+        return NULL;
+    for (i = 0; i < size_in; i++) {
+        if (in[i] >= 0x80) {
+            *(q++) = 0xC0 | (in[i] >> 6);
+            *(q++) = 0x80 | (in[i] & 0x3F);
+        } else {
+            *(q++) = in[i];
+        }
+    }
+    *(q++) = 0;
+    return out;
+}
+
+static int decode_text_chunk(PNGDecContext *s, uint32_t length, int compressed,
+                             AVDictionary **dict)
+{
+    int ret, method;
+    const uint8_t *data        = s->gb.buffer;
+    const uint8_t *data_end    = data + length;
+    const uint8_t *keyword     = data;
+    const uint8_t *keyword_end = memchr(keyword, 0, data_end - keyword);
+    uint8_t *kw_utf8 = NULL, *text, *txt_utf8 = NULL;
+    unsigned text_len;
+    AVBPrint bp;
+
+    if (!keyword_end)
         return AVERROR_INVALIDDATA;
+    data = keyword_end + 1;
+
+    if (compressed) {
+        if (data == data_end)
+            return AVERROR_INVALIDDATA;
+        method = *(data++);
+        if (method)
+            return AVERROR_INVALIDDATA;
+        if ((ret = decode_zbuf(&bp, data, data_end)) < 0)
+            return ret;
+        text_len = bp.len;
+        av_bprint_finalize(&bp, (char **)&text);
+        if (!text)
+            return AVERROR(ENOMEM);
+    } else {
+        text = (uint8_t *)data;
+        text_len = data_end - text;
+    }
+
+    kw_utf8  = iso88591_to_utf8(keyword, keyword_end - keyword);
+    txt_utf8 = iso88591_to_utf8(text, text_len);
+    if (text != data)
+        av_free(text);
+    if (!(kw_utf8 && txt_utf8)) {
+        av_free(kw_utf8);
+        av_free(txt_utf8);
+        return AVERROR(ENOMEM);
     }
-    if (memcmp(buf, ff_pngsig, 8) != 0 &&
-        memcmp(buf, ff_mngsig, 8) != 0) {
-        char signature[5 * 8 + 1] = { 0 };
-        int i;
-        for (i = 0; i < 8; i++) {
-            av_strlcatf(signature + i * 5, sizeof(signature) - i * 5,
-                        " 0x%02x", buf[i]);
+
+    av_dict_set(dict, kw_utf8, txt_utf8,
+                AV_DICT_DONT_STRDUP_KEY | AV_DICT_DONT_STRDUP_VAL);
+    return 0;
+}
+
+static int decode_ihdr_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length)
+{
+    if (length != 13)
+        return AVERROR_INVALIDDATA;
+
+    if (s->state & PNG_IDAT) {
+        av_log(avctx, AV_LOG_ERROR, "IHDR after IDAT\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->state & PNG_IHDR) {
+        av_log(avctx, AV_LOG_ERROR, "Multiple IHDR\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->width  = s->cur_w = bytestream2_get_be32(&s->gb);
+    s->height = s->cur_h = bytestream2_get_be32(&s->gb);
+    if (av_image_check_size(s->width, s->height, 0, avctx)) {
+        s->cur_w = s->cur_h = s->width = s->height = 0;
+        av_log(avctx, AV_LOG_ERROR, "Invalid image size\n");
+        return AVERROR_INVALIDDATA;
+    }
+    s->bit_depth        = bytestream2_get_byte(&s->gb);
+    s->color_type       = bytestream2_get_byte(&s->gb);
+    s->compression_type = bytestream2_get_byte(&s->gb);
+    s->filter_type      = bytestream2_get_byte(&s->gb);
+    s->interlace_type   = bytestream2_get_byte(&s->gb);
+    bytestream2_skip(&s->gb, 4); /* crc */
+    s->state |= PNG_IHDR;
+    if (avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(avctx, AV_LOG_DEBUG, "width=%d height=%d depth=%d color_type=%d "
+                "compression_type=%d filter_type=%d interlace_type=%d\n",
+                s->width, s->height, s->bit_depth, s->color_type,
+                s->compression_type, s->filter_type, s->interlace_type);
+
+    return 0;
+}
+
+static int decode_phys_chunk(AVCodecContext *avctx, PNGDecContext *s)
+{
+    if (s->state & PNG_IDAT) {
+        av_log(avctx, AV_LOG_ERROR, "pHYs after IDAT\n");
+        return AVERROR_INVALIDDATA;
+    }
+    avctx->sample_aspect_ratio.num = bytestream2_get_be32(&s->gb);
+    avctx->sample_aspect_ratio.den = bytestream2_get_be32(&s->gb);
+    if (avctx->sample_aspect_ratio.num < 0 || avctx->sample_aspect_ratio.den < 0)
+        avctx->sample_aspect_ratio = (AVRational){ 0, 1 };
+    bytestream2_skip(&s->gb, 1); /* unit specifier */
+    bytestream2_skip(&s->gb, 4); /* crc */
+
+    return 0;
+}
+
+static int decode_idat_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length, AVFrame *p)
+{
+    int ret;
+    size_t byte_depth = s->bit_depth > 8 ? 2 : 1;
+
+    if (!(s->state & PNG_IHDR)) {
+        av_log(avctx, AV_LOG_ERROR, "IDAT without IHDR\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (!(s->state & PNG_IDAT)) {
+        /* init image info */
+        avctx->width  = s->width;
+        avctx->height = s->height;
+
+        s->channels       = ff_png_get_nb_channels(s->color_type);
+        s->bits_per_pixel = s->bit_depth * s->channels;
+        s->bpp            = (s->bits_per_pixel + 7) >> 3;
+        s->row_size       = (s->cur_w * s->bits_per_pixel + 7) >> 3;
+
+        if ((s->bit_depth == 2 || s->bit_depth == 4 || s->bit_depth == 8) &&
+                s->color_type == PNG_COLOR_TYPE_RGB) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        } else if ((s->bit_depth == 2 || s->bit_depth == 4 || s->bit_depth == 8) &&
+                s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
+            avctx->pix_fmt = AV_PIX_FMT_RGBA;
+        } else if ((s->bit_depth == 2 || s->bit_depth == 4 || s->bit_depth == 8) &&
+                s->color_type == PNG_COLOR_TYPE_GRAY) {
+            avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        } else if (s->bit_depth == 16 &&
+                s->color_type == PNG_COLOR_TYPE_GRAY) {
+            avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
+        } else if (s->bit_depth == 16 &&
+                s->color_type == PNG_COLOR_TYPE_RGB) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
+        } else if (s->bit_depth == 16 &&
+                s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
+            avctx->pix_fmt = AV_PIX_FMT_RGBA64BE;
+        } else if ((s->bits_per_pixel == 1 || s->bits_per_pixel == 2 || s->bits_per_pixel == 4 || s->bits_per_pixel == 8) &&
+                s->color_type == PNG_COLOR_TYPE_PALETTE) {
+            avctx->pix_fmt = AV_PIX_FMT_PAL8;
+        } else if (s->bit_depth == 1 && s->bits_per_pixel == 1 && avctx->codec_id != AV_CODEC_ID_APNG) {
+            avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
+        } else if (s->bit_depth == 8 &&
+                s->color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
+            avctx->pix_fmt = AV_PIX_FMT_YA8;
+        } else if (s->bit_depth == 16 &&
+                s->color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
+            avctx->pix_fmt = AV_PIX_FMT_YA16BE;
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "unsupported bit depth %d "
+                    "and color type %d\n",
+                    s->bit_depth, s->color_type);
+            return AVERROR_INVALIDDATA;
+        }
+
+        if (s->has_trns && s->color_type != PNG_COLOR_TYPE_PALETTE) {
+            switch (avctx->pix_fmt) {
+            case AV_PIX_FMT_RGB24:
+                avctx->pix_fmt = AV_PIX_FMT_RGBA;
+                break;
+
+            case AV_PIX_FMT_RGB48BE:
+                avctx->pix_fmt = AV_PIX_FMT_RGBA64BE;
+                break;
+
+            case AV_PIX_FMT_GRAY8:
+                avctx->pix_fmt = AV_PIX_FMT_YA8;
+                break;
+
+            case AV_PIX_FMT_GRAY16BE:
+                avctx->pix_fmt = AV_PIX_FMT_YA16BE;
+                break;
+
+            default:
+                av_assert0(0);
+            }
+
+            s->bpp += byte_depth;
+        }
+
+        if ((ret = ff_thread_get_buffer(avctx, &s->picture, AV_GET_BUFFER_FLAG_REF)) < 0)
+            return ret;
+        if (avctx->codec_id == AV_CODEC_ID_APNG && s->last_dispose_op != APNG_DISPOSE_OP_PREVIOUS) {
+            ff_thread_release_buffer(avctx, &s->previous_picture);
+            if ((ret = ff_thread_get_buffer(avctx, &s->previous_picture, AV_GET_BUFFER_FLAG_REF)) < 0)
+                return ret;
+        }
+        ff_thread_finish_setup(avctx);
+
+        p->pict_type        = AV_PICTURE_TYPE_I;
+        p->key_frame        = 1;
+        p->interlaced_frame = !!s->interlace_type;
+
+        /* compute the compressed row size */
+        if (!s->interlace_type) {
+            s->crow_size = s->row_size + 1;
+        } else {
+            s->pass          = 0;
+            s->pass_row_size = ff_png_pass_row_size(s->pass,
+                    s->bits_per_pixel,
+                    s->cur_w);
+            s->crow_size = s->pass_row_size + 1;
+        }
+        ff_dlog(avctx, "row_size=%d crow_size =%d\n",
+                s->row_size, s->crow_size);
+        s->image_buf      = p->data[0];
+        s->image_linesize = p->linesize[0];
+        /* copy the palette if needed */
+        if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
+            memcpy(p->data[1], s->palette, 256 * sizeof(uint32_t));
+        /* empty row is used if differencing to the first row */
+        av_fast_padded_mallocz(&s->last_row, &s->last_row_size, s->row_size);
+        if (!s->last_row)
+            return AVERROR_INVALIDDATA;
+        if (s->interlace_type ||
+                s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
+            av_fast_padded_malloc(&s->tmp_row, &s->tmp_row_size, s->row_size);
+            if (!s->tmp_row)
+                return AVERROR_INVALIDDATA;
+        }
+        /* compressed row */
+        av_fast_padded_malloc(&s->buffer, &s->buffer_size, s->row_size + 16);
+        if (!s->buffer)
+            return AVERROR(ENOMEM);
+
+        /* we want crow_buf+1 to be 16-byte aligned */
+        s->crow_buf          = s->buffer + 15;
+        s->zstream.avail_out = s->crow_size;
+        s->zstream.next_out  = s->crow_buf;
+    }
+
+    s->state |= PNG_IDAT;
+
+    /* set image to non-transparent bpp while decompressing */
+    if (s->has_trns && s->color_type != PNG_COLOR_TYPE_PALETTE)
+        s->bpp -= byte_depth;
+
+    ret = png_decode_idat(s, length);
+
+    if (s->has_trns && s->color_type != PNG_COLOR_TYPE_PALETTE)
+        s->bpp += byte_depth;
+
+    if (ret < 0)
+        return ret;
+
+    bytestream2_skip(&s->gb, 4); /* crc */
+
+    return 0;
+}
+
+static int decode_plte_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length)
+{
+    int n, i, r, g, b;
+
+    if ((length % 3) != 0 || length > 256 * 3)
+        return AVERROR_INVALIDDATA;
+    /* read the palette */
+    n = length / 3;
+    for (i = 0; i < n; i++) {
+        r = bytestream2_get_byte(&s->gb);
+        g = bytestream2_get_byte(&s->gb);
+        b = bytestream2_get_byte(&s->gb);
+        s->palette[i] = (0xFFU << 24) | (r << 16) | (g << 8) | b;
+    }
+    for (; i < 256; i++)
+        s->palette[i] = (0xFFU << 24);
+    s->state |= PNG_PLTE;
+    bytestream2_skip(&s->gb, 4);     /* crc */
+
+    return 0;
+}
+
+static int decode_trns_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length)
+{
+    int v, i;
+
+    if (s->color_type == PNG_COLOR_TYPE_PALETTE) {
+        if (length > 256 || !(s->state & PNG_PLTE))
+            return AVERROR_INVALIDDATA;
+
+        for (i = 0; i < length; i++) {
+            v = bytestream2_get_byte(&s->gb);
+            s->palette[i] = (s->palette[i] & 0x00ffffff) | (v << 24);
         }
-        av_log(avctx, AV_LOG_ERROR, "Invalid PNG signature %s\n",
-               signature);
+    } else if (s->color_type == PNG_COLOR_TYPE_GRAY || s->color_type == PNG_COLOR_TYPE_RGB) {
+        if ((s->color_type == PNG_COLOR_TYPE_GRAY && length != 2) ||
+            (s->color_type == PNG_COLOR_TYPE_RGB && length != 6))
+            return AVERROR_INVALIDDATA;
+
+        for (i = 0; i < length / 2; i++) {
+            /* only use the least significant bits */
+            v = bytestream2_get_be16(&s->gb) & ((1 << s->bit_depth) - 1);
+
+            if (s->bit_depth > 8)
+                AV_WB16(&s->transparent_color_be[2 * i], v);
+            else
+                s->transparent_color_be[i] = v;
+        }
+    } else {
         return AVERROR_INVALIDDATA;
     }
 
-    bytestream2_init(&s->gb, buf + 8, buf_size - 8);
-    s->y = s->state = 0;
+    bytestream2_skip(&s->gb, 4); /* crc */
+    s->has_trns = 1;
+
+    return 0;
+}
+
+static void handle_small_bpp(PNGDecContext *s, AVFrame *p)
+{
+    if (s->bits_per_pixel == 1 && s->color_type == PNG_COLOR_TYPE_PALETTE) {
+        int i, j, k;
+        uint8_t *pd = p->data[0];
+        for (j = 0; j < s->height; j++) {
+            i = s->width / 8;
+            for (k = 7; k >= 1; k--)
+                if ((s->width&7) >= k)
+                    pd[8*i + k - 1] = (pd[i]>>8-k) & 1;
+            for (i--; i >= 0; i--) {
+                pd[8*i + 7]=  pd[i]     & 1;
+                pd[8*i + 6]= (pd[i]>>1) & 1;
+                pd[8*i + 5]= (pd[i]>>2) & 1;
+                pd[8*i + 4]= (pd[i]>>3) & 1;
+                pd[8*i + 3]= (pd[i]>>4) & 1;
+                pd[8*i + 2]= (pd[i]>>5) & 1;
+                pd[8*i + 1]= (pd[i]>>6) & 1;
+                pd[8*i + 0]=  pd[i]>>7;
+            }
+            pd += s->image_linesize;
+        }
+    } else if (s->bits_per_pixel == 2) {
+        int i, j;
+        uint8_t *pd = p->data[0];
+        for (j = 0; j < s->height; j++) {
+            i = s->width / 4;
+            if (s->color_type == PNG_COLOR_TYPE_PALETTE) {
+                if ((s->width&3) >= 3) pd[4*i + 2]= (pd[i] >> 2) & 3;
+                if ((s->width&3) >= 2) pd[4*i + 1]= (pd[i] >> 4) & 3;
+                if ((s->width&3) >= 1) pd[4*i + 0]=  pd[i] >> 6;
+                for (i--; i >= 0; i--) {
+                    pd[4*i + 3]=  pd[i]     & 3;
+                    pd[4*i + 2]= (pd[i]>>2) & 3;
+                    pd[4*i + 1]= (pd[i]>>4) & 3;
+                    pd[4*i + 0]=  pd[i]>>6;
+                }
+            } else {
+                if ((s->width&3) >= 3) pd[4*i + 2]= ((pd[i]>>2) & 3)*0x55;
+                if ((s->width&3) >= 2) pd[4*i + 1]= ((pd[i]>>4) & 3)*0x55;
+                if ((s->width&3) >= 1) pd[4*i + 0]= ( pd[i]>>6     )*0x55;
+                for (i--; i >= 0; i--) {
+                    pd[4*i + 3]= ( pd[i]     & 3)*0x55;
+                    pd[4*i + 2]= ((pd[i]>>2) & 3)*0x55;
+                    pd[4*i + 1]= ((pd[i]>>4) & 3)*0x55;
+                    pd[4*i + 0]= ( pd[i]>>6     )*0x55;
+                }
+            }
+            pd += s->image_linesize;
+        }
+    } else if (s->bits_per_pixel == 4) {
+        int i, j;
+        uint8_t *pd = p->data[0];
+        for (j = 0; j < s->height; j++) {
+            i = s->width/2;
+            if (s->color_type == PNG_COLOR_TYPE_PALETTE) {
+                if (s->width&1) pd[2*i+0]= pd[i]>>4;
+                for (i--; i >= 0; i--) {
+                    pd[2*i + 1] = pd[i] & 15;
+                    pd[2*i + 0] = pd[i] >> 4;
+                }
+            } else {
+                if (s->width & 1) pd[2*i + 0]= (pd[i] >> 4) * 0x11;
+                for (i--; i >= 0; i--) {
+                    pd[2*i + 1] = (pd[i] & 15) * 0x11;
+                    pd[2*i + 0] = (pd[i] >> 4) * 0x11;
+                }
+            }
+            pd += s->image_linesize;
+        }
+    }
+}
+
+static int decode_fctl_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length)
+{
+    uint32_t sequence_number;
+    int cur_w, cur_h, x_offset, y_offset, dispose_op, blend_op;
+
+    if (length != 26)
+        return AVERROR_INVALIDDATA;
+
+    if (!(s->state & PNG_IHDR)) {
+        av_log(avctx, AV_LOG_ERROR, "fctl before IHDR\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->last_w = s->cur_w;
+    s->last_h = s->cur_h;
+    s->last_x_offset = s->x_offset;
+    s->last_y_offset = s->y_offset;
+    s->last_dispose_op = s->dispose_op;
+
+    sequence_number = bytestream2_get_be32(&s->gb);
+    cur_w           = bytestream2_get_be32(&s->gb);
+    cur_h           = bytestream2_get_be32(&s->gb);
+    x_offset        = bytestream2_get_be32(&s->gb);
+    y_offset        = bytestream2_get_be32(&s->gb);
+    bytestream2_skip(&s->gb, 4); /* delay_num (2), delay_den (2) */
+    dispose_op      = bytestream2_get_byte(&s->gb);
+    blend_op        = bytestream2_get_byte(&s->gb);
+    bytestream2_skip(&s->gb, 4); /* crc */
+
+    if (sequence_number == 0 &&
+        (cur_w != s->width ||
+         cur_h != s->height ||
+         x_offset != 0 ||
+         y_offset != 0) ||
+        cur_w <= 0 || cur_h <= 0 ||
+        x_offset < 0 || y_offset < 0 ||
+        cur_w > s->width - x_offset|| cur_h > s->height - y_offset)
+            return AVERROR_INVALIDDATA;
+
+    if (sequence_number == 0 && dispose_op == APNG_DISPOSE_OP_PREVIOUS) {
+        // No previous frame to revert to for the first frame
+        // Spec says to just treat it as a APNG_DISPOSE_OP_BACKGROUND
+        dispose_op = APNG_DISPOSE_OP_BACKGROUND;
+    }
+
+    if (blend_op == APNG_BLEND_OP_OVER && !s->has_trns && (
+            avctx->pix_fmt == AV_PIX_FMT_RGB24 ||
+            avctx->pix_fmt == AV_PIX_FMT_RGB48BE ||
+            avctx->pix_fmt == AV_PIX_FMT_PAL8 ||
+            avctx->pix_fmt == AV_PIX_FMT_GRAY8 ||
+            avctx->pix_fmt == AV_PIX_FMT_GRAY16BE ||
+            avctx->pix_fmt == AV_PIX_FMT_MONOBLACK
+        )) {
+        // APNG_BLEND_OP_OVER is the same as APNG_BLEND_OP_SOURCE when there is no alpha channel
+        blend_op = APNG_BLEND_OP_SOURCE;
+    }
+
+    s->cur_w      = cur_w;
+    s->cur_h      = cur_h;
+    s->x_offset   = x_offset;
+    s->y_offset   = y_offset;
+    s->dispose_op = dispose_op;
+    s->blend_op   = blend_op;
+
+    return 0;
+}
+
+static void handle_p_frame_png(PNGDecContext *s, AVFrame *p)
+{
+    int i, j;
+    uint8_t *pd      = p->data[0];
+    uint8_t *pd_last = s->last_picture.f->data[0];
+    int ls = FFMIN(av_image_get_linesize(p->format, s->width, 0), s->width * s->bpp);
+
+    ff_thread_await_progress(&s->last_picture, INT_MAX, 0);
+    for (j = 0; j < s->height; j++) {
+        for (i = 0; i < ls; i++)
+            pd[i] += pd_last[i];
+        pd      += s->image_linesize;
+        pd_last += s->image_linesize;
+    }
+}
+
+// divide by 255 and round to nearest
+// apply a fast variant: (X+127)/255 = ((X+127)*257+257)>>16 = ((X+128)*257)>>16
+#define FAST_DIV255(x) ((((x) + 128) * 257) >> 16)
+
+static int handle_p_frame_apng(AVCodecContext *avctx, PNGDecContext *s,
+                               AVFrame *p)
+{
+    size_t x, y;
+    uint8_t *buffer = av_malloc(s->image_linesize * s->height);
+
+    if (!buffer)
+        return AVERROR(ENOMEM);
+
+    if (s->blend_op == APNG_BLEND_OP_OVER &&
+        avctx->pix_fmt != AV_PIX_FMT_RGBA &&
+        avctx->pix_fmt != AV_PIX_FMT_GRAY8A &&
+        avctx->pix_fmt != AV_PIX_FMT_PAL8) {
+        avpriv_request_sample(avctx, "Blending with pixel format %s",
+                              av_get_pix_fmt_name(avctx->pix_fmt));
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Do the disposal operation specified by the last frame on the frame
+    if (s->last_dispose_op != APNG_DISPOSE_OP_PREVIOUS) {
+        ff_thread_await_progress(&s->last_picture, INT_MAX, 0);
+        memcpy(buffer, s->last_picture.f->data[0], s->image_linesize * s->height);
+
+        if (s->last_dispose_op == APNG_DISPOSE_OP_BACKGROUND)
+            for (y = s->last_y_offset; y < s->last_y_offset + s->last_h; ++y)
+                memset(buffer + s->image_linesize * y + s->bpp * s->last_x_offset, 0, s->bpp * s->last_w);
+
+        memcpy(s->previous_picture.f->data[0], buffer, s->image_linesize * s->height);
+        ff_thread_report_progress(&s->previous_picture, INT_MAX, 0);
+    } else {
+        ff_thread_await_progress(&s->previous_picture, INT_MAX, 0);
+        memcpy(buffer, s->previous_picture.f->data[0], s->image_linesize * s->height);
+    }
+
+    // Perform blending
+    if (s->blend_op == APNG_BLEND_OP_SOURCE) {
+        for (y = s->y_offset; y < s->y_offset + s->cur_h; ++y) {
+            size_t row_start = s->image_linesize * y + s->bpp * s->x_offset;
+            memcpy(buffer + row_start, p->data[0] + row_start, s->bpp * s->cur_w);
+        }
+    } else { // APNG_BLEND_OP_OVER
+        for (y = s->y_offset; y < s->y_offset + s->cur_h; ++y) {
+            uint8_t *foreground = p->data[0] + s->image_linesize * y + s->bpp * s->x_offset;
+            uint8_t *background = buffer + s->image_linesize * y + s->bpp * s->x_offset;
+            for (x = s->x_offset; x < s->x_offset + s->cur_w; ++x, foreground += s->bpp, background += s->bpp) {
+                size_t b;
+                uint8_t foreground_alpha, background_alpha, output_alpha;
+                uint8_t output[4];
+
+                // Since we might be blending alpha onto alpha, we use the following equations:
+                // output_alpha = foreground_alpha + (1 - foreground_alpha) * background_alpha
+                // output = (foreground_alpha * foreground + (1 - foreground_alpha) * background_alpha * background) / output_alpha
+
+                switch (avctx->pix_fmt) {
+                case AV_PIX_FMT_RGBA:
+                    foreground_alpha = foreground[3];
+                    background_alpha = background[3];
+                    break;
+
+                case AV_PIX_FMT_GRAY8A:
+                    foreground_alpha = foreground[1];
+                    background_alpha = background[1];
+                    break;
+
+                case AV_PIX_FMT_PAL8:
+                    foreground_alpha = s->palette[foreground[0]] >> 24;
+                    background_alpha = s->palette[background[0]] >> 24;
+                    break;
+                }
+
+                if (foreground_alpha == 0)
+                    continue;
+
+                if (foreground_alpha == 255) {
+                    memcpy(background, foreground, s->bpp);
+                    continue;
+                }
+
+                if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+                    // TODO: Alpha blending with PAL8 will likely need the entire image converted over to RGBA first
+                    avpriv_request_sample(avctx, "Alpha blending palette samples");
+                    background[0] = foreground[0];
+                    continue;
+                }
+
+                output_alpha = foreground_alpha + FAST_DIV255((255 - foreground_alpha) * background_alpha);
+
+                for (b = 0; b < s->bpp - 1; ++b) {
+                    if (output_alpha == 0) {
+                        output[b] = 0;
+                    } else if (background_alpha == 255) {
+                        output[b] = FAST_DIV255(foreground_alpha * foreground[b] + (255 - foreground_alpha) * background[b]);
+                    } else {
+                        output[b] = (255 * foreground_alpha * foreground[b] + (255 - foreground_alpha) * background_alpha * background[b]) / (255 * output_alpha);
+                    }
+                }
+                output[b] = output_alpha;
+                memcpy(background, output, s->bpp);
+            }
+        }
+    }
+
+    // Copy blended buffer into the frame and free
+    memcpy(p->data[0], buffer, s->image_linesize * s->height);
+    av_free(buffer);
+
+    return 0;
+}
+
+static int decode_frame_common(AVCodecContext *avctx, PNGDecContext *s,
+                               AVFrame *p, AVPacket *avpkt)
+{
+    AVDictionary *metadata  = NULL;
+    uint32_t tag, length;
+    int decode_next_dat = 0;
+    int ret;
 
-    /* init the zlib */
-    s->zstream.zalloc = ff_png_zalloc;
-    s->zstream.zfree  = ff_png_zfree;
-    s->zstream.opaque = NULL;
-    ret = inflateInit(&s->zstream);
-    if (ret != Z_OK)
-        return -1;
     for (;;) {
-        if (bytestream2_get_bytes_left(&s->gb) <= 0)
+        length = bytestream2_get_bytes_left(&s->gb);
+        if (length <= 0) {
+            if (CONFIG_APNG_DECODER && avctx->codec_id == AV_CODEC_ID_APNG && length == 0) {
+                if (!(s->state & PNG_IDAT))
+                    return 0;
+                else
+                    goto exit_loop;
+            }
+            av_log(avctx, AV_LOG_ERROR, "%d bytes left\n", length);
+            if (   s->state & PNG_ALLIMAGE
+                && avctx->strict_std_compliance <= FF_COMPLIANCE_NORMAL)
+                goto exit_loop;
+            ret = AVERROR_INVALIDDATA;
             goto fail;
+        }
+
         length = bytestream2_get_be32(&s->gb);
-        if (length > 0x7fffffff)
+        if (length > 0x7fffffff || length > bytestream2_get_bytes_left(&s->gb)) {
+            av_log(avctx, AV_LOG_ERROR, "chunk too big\n");
+            ret = AVERROR_INVALIDDATA;
             goto fail;
+        }
         tag = bytestream2_get_le32(&s->gb);
-        ff_dlog(avctx, "png: tag=%c%c%c%c length=%u\n",
+        if (avctx->debug & FF_DEBUG_STARTCODE)
+            av_log(avctx, AV_LOG_DEBUG, "png: tag=%c%c%c%c length=%u\n",
                 (tag & 0xff),
                 ((tag >> 8) & 0xff),
                 ((tag >> 16) & 0xff),
                 ((tag >> 24) & 0xff), length);
         switch (tag) {
         case MKTAG('I', 'H', 'D', 'R'):
-            if (length != 13)
+            if ((ret = decode_ihdr_chunk(avctx, s, length)) < 0)
                 goto fail;
-            s->width  = bytestream2_get_be32(&s->gb);
-            s->height = bytestream2_get_be32(&s->gb);
-            if (av_image_check_size(s->width, s->height, 0, avctx)) {
-                s->width = s->height = 0;
+            break;
+        case MKTAG('p', 'H', 'Y', 's'):
+            if ((ret = decode_phys_chunk(avctx, s)) < 0)
                 goto fail;
-            }
-            s->bit_depth        = bytestream2_get_byte(&s->gb);
-            s->color_type       = bytestream2_get_byte(&s->gb);
-            s->compression_type = bytestream2_get_byte(&s->gb);
-            s->filter_type      = bytestream2_get_byte(&s->gb);
-            s->interlace_type   = bytestream2_get_byte(&s->gb);
-            bytestream2_skip(&s->gb, 4); /* crc */
-            s->state |= PNG_IHDR;
-            ff_dlog(avctx, "width=%d height=%d depth=%d color_type=%d "
-                           "compression_type=%d filter_type=%d interlace_type=%d\n",
-                    s->width, s->height, s->bit_depth, s->color_type,
-                    s->compression_type, s->filter_type, s->interlace_type);
             break;
-        case MKTAG('I', 'D', 'A', 'T'):
-            if (!(s->state & PNG_IHDR))
+        case MKTAG('f', 'c', 'T', 'L'):
+            if (!CONFIG_APNG_DECODER || avctx->codec_id != AV_CODEC_ID_APNG)
+                goto skip_tag;
+            if ((ret = decode_fctl_chunk(avctx, s, length)) < 0)
+                goto fail;
+            decode_next_dat = 1;
+            break;
+        case MKTAG('f', 'd', 'A', 'T'):
+            if (!CONFIG_APNG_DECODER || avctx->codec_id != AV_CODEC_ID_APNG)
+                goto skip_tag;
+            if (!decode_next_dat) {
+                ret = AVERROR_INVALIDDATA;
                 goto fail;
-            if (!(s->state & PNG_IDAT)) {
-                /* init image info */
-                avctx->width  = s->width;
-                avctx->height = s->height;
-
-                s->channels       = ff_png_get_nb_channels(s->color_type);
-                s->bits_per_pixel = s->bit_depth * s->channels;
-                s->bpp            = (s->bits_per_pixel + 7) >> 3;
-                s->row_size       = (avctx->width * s->bits_per_pixel + 7) >> 3;
-
-                if (s->bit_depth == 8 &&
-                    s->color_type == PNG_COLOR_TYPE_RGB) {
-                    avctx->pix_fmt = AV_PIX_FMT_RGB24;
-                } else if (s->bit_depth == 8 &&
-                           s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                    avctx->pix_fmt = AV_PIX_FMT_RGB32;
-                } else if (s->bit_depth == 8 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY) {
-                    avctx->pix_fmt = AV_PIX_FMT_GRAY8;
-                } else if (s->bit_depth == 16 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY) {
-                    avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
-                } else if (s->bit_depth == 16 &&
-                           s->color_type == PNG_COLOR_TYPE_RGB) {
-                    avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
-                } else if (s->bit_depth == 1 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY) {
-                    avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
-                } else if (s->bit_depth == 8 &&
-                           s->color_type == PNG_COLOR_TYPE_PALETTE) {
-                    avctx->pix_fmt = AV_PIX_FMT_PAL8;
-                } else if (s->bit_depth == 8 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
-                    avctx->pix_fmt = AV_PIX_FMT_YA8;
-                } else if (s->bit_depth == 16 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
-                    avctx->pix_fmt = AV_PIX_FMT_YA16BE;
-                } else {
-                    goto fail;
-                }
-
-                if (ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF) < 0) {
-                    av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-                    goto fail;
-                }
-                p->pict_type        = AV_PICTURE_TYPE_I;
-                p->key_frame        = 1;
-                p->interlaced_frame = !!s->interlace_type;
-
-                /* compute the compressed row size */
-                if (!s->interlace_type) {
-                    s->crow_size = s->row_size + 1;
-                } else {
-                    s->pass          = 0;
-                    s->pass_row_size = ff_png_pass_row_size(s->pass,
-                                                            s->bits_per_pixel,
-                                                            s->width);
-                    s->crow_size = s->pass_row_size + 1;
-                }
-                ff_dlog(avctx, "row_size=%d crow_size =%d\n",
-                        s->row_size, s->crow_size);
-                s->image_buf      = p->data[0];
-                s->image_linesize = p->linesize[0];
-                /* copy the palette if needed */
-                if (s->color_type == PNG_COLOR_TYPE_PALETTE)
-                    memcpy(p->data[1], s->palette, 256 * sizeof(uint32_t));
-                /* empty row is used if differencing to the first row */
-                s->last_row = av_mallocz(s->row_size);
-                if (!s->last_row)
-                    goto fail;
-                if (s->interlace_type ||
-                    s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                    s->tmp_row = av_malloc(s->row_size);
-                    if (!s->tmp_row)
-                        goto fail;
-                }
-                /* compressed row */
-                crow_buf_base = av_malloc(s->row_size + 16);
-                if (!crow_buf_base)
-                    goto fail;
-
-                /* we want crow_buf+1 to be 16-byte aligned */
-                s->crow_buf          = crow_buf_base + 15;
-                s->zstream.avail_out = s->crow_size;
-                s->zstream.next_out  = s->crow_buf;
             }
-            s->state |= PNG_IDAT;
-            if (png_decode_idat(s, length) < 0)
+            bytestream2_get_be32(&s->gb);
+            length -= 4;
+            /* fallthrough */
+        case MKTAG('I', 'D', 'A', 'T'):
+            if (CONFIG_APNG_DECODER && avctx->codec_id == AV_CODEC_ID_APNG && !decode_next_dat)
+                goto skip_tag;
+            if ((ret = decode_idat_chunk(avctx, s, length, p)) < 0)
                 goto fail;
-            bytestream2_skip(&s->gb, 4); /* crc */
             break;
         case MKTAG('P', 'L', 'T', 'E'):
-        {
-            int n, i, r, g, b;
-
-            if ((length % 3) != 0 || length > 256 * 3)
+            if (decode_plte_chunk(avctx, s, length) < 0)
                 goto skip_tag;
-            /* read the palette */
-            n = length / 3;
-            for (i = 0; i < n; i++) {
-                r = bytestream2_get_byte(&s->gb);
-                g = bytestream2_get_byte(&s->gb);
-                b = bytestream2_get_byte(&s->gb);
-                s->palette[i] = (0xff << 24) | (r << 16) | (g << 8) | b;
-            }
-            for (; i < 256; i++)
-                s->palette[i] = (0xff << 24);
-            s->state |= PNG_PLTE;
-            bytestream2_skip(&s->gb, 4);     /* crc */
-        }
-        break;
+            break;
         case MKTAG('t', 'R', 'N', 'S'):
-        {
-            int v, i;
-
-            /* read the transparency. XXX: Only palette mode supported */
-            if (s->color_type != PNG_COLOR_TYPE_PALETTE ||
-                length > 256 ||
-                !(s->state & PNG_PLTE))
+            if (decode_trns_chunk(avctx, s, length) < 0)
                 goto skip_tag;
-            for (i = 0; i < length; i++) {
-                v = bytestream2_get_byte(&s->gb);
-                s->palette[i] = (s->palette[i] & 0x00ffffff) | (v << 24);
-            }
-            bytestream2_skip(&s->gb, 4);     /* crc */
-        }
-        break;
+            break;
+        case MKTAG('t', 'E', 'X', 't'):
+            if (decode_text_chunk(s, length, 0, &metadata) < 0)
+                av_log(avctx, AV_LOG_WARNING, "Broken tEXt chunk\n");
+            bytestream2_skip(&s->gb, length + 4);
+            break;
+        case MKTAG('z', 'T', 'X', 't'):
+            if (decode_text_chunk(s, length, 1, &metadata) < 0)
+                av_log(avctx, AV_LOG_WARNING, "Broken zTXt chunk\n");
+            bytestream2_skip(&s->gb, length + 4);
+            break;
         case MKTAG('I', 'E', 'N', 'D'):
             if (!(s->state & PNG_ALLIMAGE))
+                av_log(avctx, AV_LOG_ERROR, "IEND without all image\n");
+            if (!(s->state & (PNG_ALLIMAGE|PNG_IDAT))) {
+                ret = AVERROR_INVALIDDATA;
                 goto fail;
+            }
             bytestream2_skip(&s->gb, 4); /* crc */
             goto exit_loop;
         default:
@@ -621,40 +1174,221 @@ skip_tag:
         }
     }
 exit_loop:
-    /* handle p-frames only if a predecessor frame is available */
-    if (s->prev->data[0]) {
-        if (!(avpkt->flags & AV_PKT_FLAG_KEY)) {
-            int i, j;
-            uint8_t *pd      = p->data[0];
-            uint8_t *pd_last = s->prev->data[0];
-
-            for (j = 0; j < s->height; j++) {
-                for (i = 0; i < s->width * s->bpp; i++)
-                    pd[i] += pd_last[i];
-                pd      += s->image_linesize;
-                pd_last += s->image_linesize;
+
+    if (s->bits_per_pixel <= 4)
+        handle_small_bpp(s, p);
+
+    /* apply transparency if needed */
+    if (s->has_trns && s->color_type != PNG_COLOR_TYPE_PALETTE) {
+        size_t byte_depth = s->bit_depth > 8 ? 2 : 1;
+        size_t raw_bpp = s->bpp - byte_depth;
+        unsigned x, y;
+
+        for (y = 0; y < s->height; ++y) {
+            uint8_t *row = &s->image_buf[s->image_linesize * y];
+
+            /* since we're updating in-place, we have to go from right to left */
+            for (x = s->width; x > 0; --x) {
+                uint8_t *pixel = &row[s->bpp * (x - 1)];
+                memmove(pixel, &row[raw_bpp * (x - 1)], raw_bpp);
+
+                if (!memcmp(pixel, s->transparent_color_be, raw_bpp)) {
+                    memset(&pixel[raw_bpp], 0, byte_depth);
+                } else {
+                    memset(&pixel[raw_bpp], 0xff, byte_depth);
+                }
             }
         }
     }
 
-    av_frame_unref(s->prev);
-    if ((ret = av_frame_ref(s->prev, p)) < 0)
-        goto fail;
+    /* handle p-frames only if a predecessor frame is available */
+    if (s->last_picture.f->data[0]) {
+        if (   !(avpkt->flags & AV_PKT_FLAG_KEY) && avctx->codec_tag != AV_RL32("MPNG")
+            && s->last_picture.f->width == p->width
+            && s->last_picture.f->height== p->height
+            && s->last_picture.f->format== p->format
+         ) {
+            if (CONFIG_PNG_DECODER && avctx->codec_id != AV_CODEC_ID_APNG)
+                handle_p_frame_png(s, p);
+            else if (CONFIG_APNG_DECODER &&
+                     avctx->codec_id == AV_CODEC_ID_APNG &&
+                     (ret = handle_p_frame_apng(avctx, s, p)) < 0)
+                goto fail;
+        }
+    }
+    ff_thread_report_progress(&s->picture, INT_MAX, 0);
+    ff_thread_report_progress(&s->previous_picture, INT_MAX, 0);
+
+    av_frame_set_metadata(p, metadata);
+    metadata   = NULL;
+    return 0;
+
+fail:
+    av_dict_free(&metadata);
+    ff_thread_report_progress(&s->picture, INT_MAX, 0);
+    ff_thread_report_progress(&s->previous_picture, INT_MAX, 0);
+    return ret;
+}
+
+#if CONFIG_PNG_DECODER
+static int decode_frame_png(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    PNGDecContext *const s = avctx->priv_data;
+    const uint8_t *buf     = avpkt->data;
+    int buf_size           = avpkt->size;
+    AVFrame *p;
+    int64_t sig;
+    int ret;
+
+    ff_thread_release_buffer(avctx, &s->last_picture);
+    FFSWAP(ThreadFrame, s->picture, s->last_picture);
+    p = s->picture.f;
+
+    bytestream2_init(&s->gb, buf, buf_size);
+
+    /* check signature */
+    sig = bytestream2_get_be64(&s->gb);
+    if (sig != PNGSIG &&
+        sig != MNGSIG) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid PNG signature 0x%08"PRIX64".\n", sig);
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->y = s->state = s->has_trns = 0;
+
+    /* init the zlib */
+    s->zstream.zalloc = ff_png_zalloc;
+    s->zstream.zfree  = ff_png_zfree;
+    s->zstream.opaque = NULL;
+    ret = inflateInit(&s->zstream);
+    if (ret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "inflateInit returned error %d\n", ret);
+        return AVERROR_EXTERNAL;
+    }
+
+    if ((ret = decode_frame_common(avctx, s, p, avpkt)) < 0)
+        goto the_end;
+
+    if ((ret = av_frame_ref(data, s->picture.f)) < 0)
+        return ret;
 
     *got_frame = 1;
 
     ret = bytestream2_tell(&s->gb);
 the_end:
     inflateEnd(&s->zstream);
-    av_free(crow_buf_base);
     s->crow_buf = NULL;
-    av_freep(&s->last_row);
-    av_freep(&s->tmp_row);
     return ret;
-fail:
-    ret = -1;
-    goto the_end;
 }
+#endif
+
+#if CONFIG_APNG_DECODER
+static int decode_frame_apng(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    PNGDecContext *const s = avctx->priv_data;
+    int ret;
+    AVFrame *p;
+
+    ff_thread_release_buffer(avctx, &s->last_picture);
+    FFSWAP(ThreadFrame, s->picture, s->last_picture);
+    p = s->picture.f;
+
+    if (!(s->state & PNG_IHDR)) {
+        if (!avctx->extradata_size)
+            return AVERROR_INVALIDDATA;
+
+        /* only init fields, there is no zlib use in extradata */
+        s->zstream.zalloc = ff_png_zalloc;
+        s->zstream.zfree  = ff_png_zfree;
+
+        bytestream2_init(&s->gb, avctx->extradata, avctx->extradata_size);
+        if ((ret = decode_frame_common(avctx, s, p, avpkt)) < 0)
+            goto end;
+    }
+
+    /* reset state for a new frame */
+    if ((ret = inflateInit(&s->zstream)) != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "inflateInit returned error %d\n", ret);
+        ret = AVERROR_EXTERNAL;
+        goto end;
+    }
+    s->y = 0;
+    s->state &= ~(PNG_IDAT | PNG_ALLIMAGE);
+    bytestream2_init(&s->gb, avpkt->data, avpkt->size);
+    if ((ret = decode_frame_common(avctx, s, p, avpkt)) < 0)
+        goto end;
+
+    if (!(s->state & PNG_ALLIMAGE))
+        av_log(avctx, AV_LOG_WARNING, "Frame did not contain a complete image\n");
+    if (!(s->state & (PNG_ALLIMAGE|PNG_IDAT))) {
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+    if ((ret = av_frame_ref(data, s->picture.f)) < 0)
+        goto end;
+
+    *got_frame = 1;
+    ret = bytestream2_tell(&s->gb);
+
+end:
+    inflateEnd(&s->zstream);
+    return ret;
+}
+#endif
+
+#if HAVE_THREADS
+static int update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
+{
+    PNGDecContext *psrc = src->priv_data;
+    PNGDecContext *pdst = dst->priv_data;
+    int ret;
+
+    if (dst == src)
+        return 0;
+
+    ff_thread_release_buffer(dst, &pdst->picture);
+    if (psrc->picture.f->data[0] &&
+        (ret = ff_thread_ref_frame(&pdst->picture, &psrc->picture)) < 0)
+        return ret;
+    if (CONFIG_APNG_DECODER && dst->codec_id == AV_CODEC_ID_APNG) {
+        pdst->width             = psrc->width;
+        pdst->height            = psrc->height;
+        pdst->bit_depth         = psrc->bit_depth;
+        pdst->color_type        = psrc->color_type;
+        pdst->compression_type  = psrc->compression_type;
+        pdst->interlace_type    = psrc->interlace_type;
+        pdst->filter_type       = psrc->filter_type;
+        pdst->cur_w = psrc->cur_w;
+        pdst->cur_h = psrc->cur_h;
+        pdst->x_offset = psrc->x_offset;
+        pdst->y_offset = psrc->y_offset;
+        pdst->has_trns = psrc->has_trns;
+        memcpy(pdst->transparent_color_be, psrc->transparent_color_be, sizeof(pdst->transparent_color_be));
+
+        pdst->dispose_op = psrc->dispose_op;
+
+        memcpy(pdst->palette, psrc->palette, sizeof(pdst->palette));
+
+        pdst->state |= psrc->state & (PNG_IHDR | PNG_PLTE);
+
+        ff_thread_release_buffer(dst, &pdst->last_picture);
+        if (psrc->last_picture.f->data[0] &&
+            (ret = ff_thread_ref_frame(&pdst->last_picture, &psrc->last_picture)) < 0)
+            return ret;
+
+        ff_thread_release_buffer(dst, &pdst->previous_picture);
+        if (psrc->previous_picture.f->data[0] &&
+            (ret = ff_thread_ref_frame(&pdst->previous_picture, &psrc->previous_picture)) < 0)
+            return ret;
+    }
+
+    return 0;
+}
+#endif
 
 static av_cold int png_dec_init(AVCodecContext *avctx)
 {
@@ -662,11 +1396,21 @@ static av_cold int png_dec_init(AVCodecContext *avctx)
 
     avctx->color_range = AVCOL_RANGE_JPEG;
 
-    s->prev = av_frame_alloc();
-    if (!s->prev)
+    s->avctx = avctx;
+    s->previous_picture.f = av_frame_alloc();
+    s->last_picture.f = av_frame_alloc();
+    s->picture.f = av_frame_alloc();
+    if (!s->previous_picture.f || !s->last_picture.f || !s->picture.f) {
+        av_frame_free(&s->previous_picture.f);
+        av_frame_free(&s->last_picture.f);
+        av_frame_free(&s->picture.f);
         return AVERROR(ENOMEM);
+    }
 
-    ff_pngdsp_init(&s->dsp);
+    if (!avctx->internal->is_copy) {
+        avctx->internal->allocate_progress = 1;
+        ff_pngdsp_init(&s->dsp);
+    }
 
     return 0;
 }
@@ -675,11 +1419,39 @@ static av_cold int png_dec_end(AVCodecContext *avctx)
 {
     PNGDecContext *s = avctx->priv_data;
 
-    av_frame_free(&s->prev);
+    ff_thread_release_buffer(avctx, &s->previous_picture);
+    av_frame_free(&s->previous_picture.f);
+    ff_thread_release_buffer(avctx, &s->last_picture);
+    av_frame_free(&s->last_picture.f);
+    ff_thread_release_buffer(avctx, &s->picture);
+    av_frame_free(&s->picture.f);
+    av_freep(&s->buffer);
+    s->buffer_size = 0;
+    av_freep(&s->last_row);
+    s->last_row_size = 0;
+    av_freep(&s->tmp_row);
+    s->tmp_row_size = 0;
 
     return 0;
 }
 
+#if CONFIG_APNG_DECODER
+AVCodec ff_apng_decoder = {
+    .name           = "apng",
+    .long_name      = NULL_IF_CONFIG_SMALL("APNG (Animated Portable Network Graphics) image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_APNG,
+    .priv_data_size = sizeof(PNGDecContext),
+    .init           = png_dec_init,
+    .close          = png_dec_end,
+    .decode         = decode_frame_apng,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(png_dec_init),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(update_thread_context),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/,
+};
+#endif
+
+#if CONFIG_PNG_DECODER
 AVCodec ff_png_decoder = {
     .name           = "png",
     .long_name      = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"),
@@ -688,6 +1460,9 @@ AVCodec ff_png_decoder = {
     .priv_data_size = sizeof(PNGDecContext),
     .init           = png_dec_init,
     .close          = png_dec_end,
-    .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1 /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/,
+    .decode         = decode_frame_png,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(png_dec_init),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(update_thread_context),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/,
 };
+#endif
diff --git a/libavcodec/pngdsp.c b/libavcodec/pngdsp.c
index c0e9402948..d2753163da 100644
--- a/libavcodec/pngdsp.c
+++ b/libavcodec/pngdsp.c
@@ -2,20 +2,20 @@
  * PNG image format
  * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pngdsp.h b/libavcodec/pngdsp.h
index 607fe64fd8..5475d0d943 100644
--- a/libavcodec/pngdsp.h
+++ b/libavcodec/pngdsp.h
@@ -2,20 +2,20 @@
  * PNG image format
  * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,9 @@
 #include <stdint.h>
 
 typedef struct PNGDSPContext {
-    void (*add_bytes_l2)(uint8_t *dst  /* align 16 */,
+    void (*add_bytes_l2)(uint8_t *dst,
                          uint8_t *src1 /* align 16 */,
-                         uint8_t *src2 /* align 16 */, int w);
+                         uint8_t *src2, int w);
 
     /* this might write to dst[w] */
     void (*add_paeth_prediction)(uint8_t *dst, uint8_t *src,
diff --git a/libavcodec/pngenc.c b/libavcodec/pngenc.c
index 6ab4b7f54c..f6ad830cd9 100644
--- a/libavcodec/pngenc.c
+++ b/libavcodec/pngenc.c
@@ -2,37 +2,50 @@
  * PNG image format
  * Copyright (c) 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
+#include "internal.h"
 #include "bytestream.h"
 #include "huffyuvencdsp.h"
 #include "png.h"
+#include "apng.h"
 
-/* TODO:
- * - add 2, 4 and 16 bit depth support
- */
+#include "libavutil/avassert.h"
+#include "libavutil/crc.h"
+#include "libavutil/libm.h"
+#include "libavutil/opt.h"
+#include "libavutil/color_utils.h"
 
 #include <zlib.h>
 
 #define IOBUF_SIZE 4096
 
+typedef struct APNGFctlChunk {
+    uint32_t sequence_number;
+    uint32_t width, height;
+    uint32_t x_offset, y_offset;
+    uint16_t delay_num, delay_den;
+    uint8_t dispose_op, blend_op;
+} APNGFctlChunk;
+
 typedef struct PNGEncContext {
+    AVClass *class;
     HuffYUVEncDSPContext hdsp;
 
     uint8_t *bytestream;
@@ -43,6 +56,23 @@ typedef struct PNGEncContext {
 
     z_stream zstream;
     uint8_t buf[IOBUF_SIZE];
+    int dpi;                     ///< Physical pixel density, in dots per inch, if set
+    int dpm;                     ///< Physical pixel density, in dots per meter, if set
+
+    int is_progressive;
+    int bit_depth;
+    int color_type;
+    int bits_per_pixel;
+
+    // APNG
+    uint32_t palette_checksum;   // Used to ensure a single unique palette
+    uint32_t sequence_number;
+
+    AVFrame *prev_frame;
+    AVFrame *last_frame;
+    APNGFctlChunk last_frame_fctl;
+    uint8_t *last_frame_packet;
+    size_t last_frame_packet_size;
 } PNGEncContext;
 
 static void png_get_interlaced_row(uint8_t *dst, int row_size,
@@ -52,8 +82,9 @@ static void png_get_interlaced_row(uint8_t *dst, int row_size,
     int x, mask, dst_x, j, b, bpp;
     uint8_t *d;
     const uint8_t *s;
+    static const int masks[] = {0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff};
 
-    mask = ff_png_pass_mask[pass];
+    mask = masks[pass];
     switch (bits_per_pixel) {
     case 1:
         memset(dst, 0, row_size);
@@ -111,6 +142,22 @@ static void sub_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top,
     }
 }
 
+static void sub_left_prediction(PNGEncContext *c, uint8_t *dst, const uint8_t *src, int bpp, int size)
+{
+    const uint8_t *src1 = src + bpp;
+    const uint8_t *src2 = src;
+    int x, unaligned_w;
+
+    memcpy(dst, src, bpp);
+    dst += bpp;
+    size -= bpp;
+    unaligned_w = FFMIN(32 - bpp, size);
+    for (x = 0; x < unaligned_w; x++)
+        *dst++ = *src1++ - *src2++;
+    size -= unaligned_w;
+    c->hdsp.diff_bytes(dst, src1, src2, size);
+}
+
 static void png_filter_row(PNGEncContext *c, uint8_t *dst, int filter_type,
                            uint8_t *src, uint8_t *top, int size, int bpp)
 {
@@ -121,8 +168,7 @@ static void png_filter_row(PNGEncContext *c, uint8_t *dst, int filter_type,
         memcpy(dst, src, size);
         break;
     case PNG_FILTER_VALUE_SUB:
-        c->hdsp.diff_bytes(dst, src, src - bpp, size);
-        memcpy(dst, src, bpp);
+        sub_left_prediction(c, dst, src, bpp, size);
         break;
     case PNG_FILTER_VALUE_UP:
         c->hdsp.diff_bytes(dst, src, top, size);
@@ -145,7 +191,7 @@ static uint8_t *png_choose_filter(PNGEncContext *s, uint8_t *dst,
                                   uint8_t *src, uint8_t *top, int size, int bpp)
 {
     int pred = s->filter_type;
-    assert(bpp || !pred);
+    av_assert0(bpp || !pred);
     if (!top && pred)
         pred = PNG_FILTER_VALUE_SUB;
     if (pred == PNG_FILTER_VALUE_MIXED) {
@@ -171,45 +217,56 @@ static uint8_t *png_choose_filter(PNGEncContext *s, uint8_t *dst,
     }
 }
 
-static void convert_from_rgb32(uint8_t *dst, const uint8_t *src, int width)
-{
-    uint8_t *d;
-    int j;
-    unsigned int v;
-
-    d = dst;
-    for (j = 0; j < width; j++) {
-        v    = ((const uint32_t *) src)[j];
-        d[0] = v >> 16;
-        d[1] = v >> 8;
-        d[2] = v;
-        d[3] = v >> 24;
-        d   += 4;
-    }
-}
-
 static void png_write_chunk(uint8_t **f, uint32_t tag,
                             const uint8_t *buf, int length)
 {
-    uint32_t crc;
+    const AVCRC *crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
+    uint32_t crc = ~0U;
     uint8_t tagbuf[4];
 
     bytestream_put_be32(f, length);
-    crc = crc32(0, Z_NULL, 0);
     AV_WL32(tagbuf, tag);
-    crc = crc32(crc, tagbuf, 4);
+    crc = av_crc(crc_table, crc, tagbuf, 4);
     bytestream_put_be32(f, av_bswap32(tag));
     if (length > 0) {
-        crc = crc32(crc, buf, length);
+        crc = av_crc(crc_table, crc, buf, length);
         memcpy(*f, buf, length);
         *f += length;
     }
-    bytestream_put_be32(f, crc);
+    bytestream_put_be32(f, ~crc);
+}
+
+static void png_write_image_data(AVCodecContext *avctx,
+                                 const uint8_t *buf, int length)
+{
+    PNGEncContext *s = avctx->priv_data;
+    const AVCRC *crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
+    uint32_t crc = ~0U;
+
+    if (avctx->codec_id == AV_CODEC_ID_PNG || avctx->frame_number == 0) {
+        png_write_chunk(&s->bytestream, MKTAG('I', 'D', 'A', 'T'), buf, length);
+        return;
+    }
+
+    bytestream_put_be32(&s->bytestream, length + 4);
+
+    bytestream_put_be32(&s->bytestream, MKBETAG('f', 'd', 'A', 'T'));
+    bytestream_put_be32(&s->bytestream, s->sequence_number);
+    crc = av_crc(crc_table, crc, s->bytestream - 8, 8);
+
+    crc = av_crc(crc_table, crc, buf, length);
+    memcpy(s->bytestream, buf, length);
+    s->bytestream += length;
+
+    bytestream_put_be32(&s->bytestream, ~crc);
+
+    ++s->sequence_number;
 }
 
 /* XXX: do filtering */
-static int png_write_row(PNGEncContext *s, const uint8_t *data, int size)
+static int png_write_row(AVCodecContext *avctx, const uint8_t *data, int size)
 {
+    PNGEncContext *s = avctx->priv_data;
     int ret;
 
     s->zstream.avail_in = size;
@@ -220,8 +277,7 @@ static int png_write_row(PNGEncContext *s, const uint8_t *data, int size)
             return -1;
         if (s->zstream.avail_out == 0) {
             if (s->bytestream_end - s->bytestream > IOBUF_SIZE + 100)
-                png_write_chunk(&s->bytestream,
-                                MKTAG('I', 'D', 'A', 'T'), s->buf, IOBUF_SIZE);
+                png_write_image_data(avctx, s->buf, IOBUF_SIZE);
             s->zstream.avail_out = IOBUF_SIZE;
             s->zstream.next_out  = s->buf;
         }
@@ -229,136 +285,112 @@ static int png_write_row(PNGEncContext *s, const uint8_t *data, int size)
     return 0;
 }
 
-static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                        const AVFrame *pict, int *got_packet)
+#define AV_WB32_PNG(buf, n) AV_WB32(buf, lrint((n) * 100000))
+static int png_get_chrm(enum AVColorPrimaries prim,  uint8_t *buf)
 {
-    PNGEncContext *s       = avctx->priv_data;
-    const AVFrame *const p = pict;
-    int bit_depth, color_type, y, len, row_size, ret, is_progressive;
-    int bits_per_pixel, pass_row_size, enc_row_size, max_packet_size;
-    int compression_level;
-    uint8_t *ptr, *top, *crow_buf, *crow;
-    uint8_t *crow_base       = NULL;
-    uint8_t *progressive_buf = NULL;
-    uint8_t *rgba_buf        = NULL;
-    uint8_t *top_buf         = NULL;
-
-    is_progressive = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
-    switch (avctx->pix_fmt) {
-    case AV_PIX_FMT_RGBA64BE:
-        bit_depth = 16;
-        color_type = PNG_COLOR_TYPE_RGB_ALPHA;
-        break;
-    case AV_PIX_FMT_RGB48BE:
-        bit_depth = 16;
-        color_type = PNG_COLOR_TYPE_RGB;
-        break;
-    case AV_PIX_FMT_RGB32:
-        bit_depth  = 8;
-        color_type = PNG_COLOR_TYPE_RGB_ALPHA;
-        break;
-    case AV_PIX_FMT_RGB24:
-        bit_depth  = 8;
-        color_type = PNG_COLOR_TYPE_RGB;
-        break;
-    case AV_PIX_FMT_GRAY16BE:
-        bit_depth  = 16;
-        color_type = PNG_COLOR_TYPE_GRAY;
-        break;
-    case AV_PIX_FMT_GRAY8:
-        bit_depth  = 8;
-        color_type = PNG_COLOR_TYPE_GRAY;
-        break;
-    case AV_PIX_FMT_MONOBLACK:
-        bit_depth  = 1;
-        color_type = PNG_COLOR_TYPE_GRAY;
-        break;
-    case AV_PIX_FMT_PAL8:
-        bit_depth  = 8;
-        color_type = PNG_COLOR_TYPE_PALETTE;
-        break;
-    default:
-        return -1;
+    double rx, ry, gx, gy, bx, by, wx = 0.3127, wy = 0.3290;
+    switch (prim) {
+        case AVCOL_PRI_BT709:
+            rx = 0.640; ry = 0.330;
+            gx = 0.300; gy = 0.600;
+            bx = 0.150; by = 0.060;
+            break;
+        case AVCOL_PRI_BT470M:
+            rx = 0.670; ry = 0.330;
+            gx = 0.210; gy = 0.710;
+            bx = 0.140; by = 0.080;
+            wx = 0.310; wy = 0.316;
+            break;
+        case AVCOL_PRI_BT470BG:
+            rx = 0.640; ry = 0.330;
+            gx = 0.290; gy = 0.600;
+            bx = 0.150; by = 0.060;
+            break;
+        case AVCOL_PRI_SMPTE170M:
+        case AVCOL_PRI_SMPTE240M:
+            rx = 0.630; ry = 0.340;
+            gx = 0.310; gy = 0.595;
+            bx = 0.155; by = 0.070;
+            break;
+        case AVCOL_PRI_BT2020:
+            rx = 0.708; ry = 0.292;
+            gx = 0.170; gy = 0.797;
+            bx = 0.131; by = 0.046;
+            break;
+        default:
+            return 0;
     }
-    bits_per_pixel = ff_png_get_nb_channels(color_type) * bit_depth;
-    row_size       = (avctx->width * bits_per_pixel + 7) >> 3;
 
-    s->zstream.zalloc = ff_png_zalloc;
-    s->zstream.zfree  = ff_png_zfree;
-    s->zstream.opaque = NULL;
-    compression_level = avctx->compression_level == FF_COMPRESSION_DEFAULT
-                      ? Z_DEFAULT_COMPRESSION
-                      : av_clip(avctx->compression_level, 0, 9);
-    ret = deflateInit2(&s->zstream, compression_level,
-                       Z_DEFLATED, 15, 8, Z_DEFAULT_STRATEGY);
-    if (ret != Z_OK)
-        return -1;
+    AV_WB32_PNG(buf     , wx); AV_WB32_PNG(buf + 4 , wy);
+    AV_WB32_PNG(buf + 8 , rx); AV_WB32_PNG(buf + 12, ry);
+    AV_WB32_PNG(buf + 16, gx); AV_WB32_PNG(buf + 20, gy);
+    AV_WB32_PNG(buf + 24, bx); AV_WB32_PNG(buf + 28, by);
+    return 1;
+}
 
-    enc_row_size    = deflateBound(&s->zstream, row_size);
-    max_packet_size = avctx->height * (enc_row_size +
-                                       ((enc_row_size + IOBUF_SIZE - 1) / IOBUF_SIZE) * 12)
-                      + AV_INPUT_BUFFER_MIN_SIZE;
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, max_packet_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate output packet of size %d.\n",
-               max_packet_size);
-        return ret;
-    }
+static int png_get_gama(enum AVColorTransferCharacteristic trc, uint8_t *buf)
+{
+    double gamma = avpriv_get_gamma_from_trc(trc);
+    if (gamma <= 1e-6)
+        return 0;
 
-    s->bytestream_start =
-    s->bytestream       = pkt->data;
-    s->bytestream_end   = pkt->data + pkt->size;
+    AV_WB32_PNG(buf, 1.0 / gamma);
+    return 1;
+}
 
-    crow_base = av_malloc((row_size + 32) << (s->filter_type == PNG_FILTER_VALUE_MIXED));
-    if (!crow_base)
-        goto fail;
-    // pixel data should be aligned, but there's a control byte before it
-    crow_buf = crow_base + 15;
-    if (is_progressive) {
-        progressive_buf = av_malloc(row_size + 1);
-        if (!progressive_buf)
-            goto fail;
-    }
-    if (color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-        rgba_buf = av_malloc(row_size + 1);
-        if (!rgba_buf)
-            goto fail;
-    }
-    if (is_progressive || color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-        top_buf = av_malloc(row_size + 1);
-        if (!top_buf)
-            goto fail;
-    }
+static int encode_headers(AVCodecContext *avctx, const AVFrame *pict)
+{
+    PNGEncContext *s = avctx->priv_data;
 
     /* write png header */
-    memcpy(s->bytestream, ff_pngsig, 8);
-    s->bytestream += 8;
-
     AV_WB32(s->buf, avctx->width);
     AV_WB32(s->buf + 4, avctx->height);
-    s->buf[8]  = bit_depth;
-    s->buf[9]  = color_type;
+    s->buf[8]  = s->bit_depth;
+    s->buf[9]  = s->color_type;
     s->buf[10] = 0; /* compression type */
     s->buf[11] = 0; /* filter type */
-    s->buf[12] = is_progressive; /* interlace type */
-
+    s->buf[12] = s->is_progressive; /* interlace type */
     png_write_chunk(&s->bytestream, MKTAG('I', 'H', 'D', 'R'), s->buf, 13);
 
+    /* write physical information */
+    if (s->dpm) {
+      AV_WB32(s->buf, s->dpm);
+      AV_WB32(s->buf + 4, s->dpm);
+      s->buf[8] = 1; /* unit specifier is meter */
+    } else {
+      AV_WB32(s->buf, avctx->sample_aspect_ratio.num);
+      AV_WB32(s->buf + 4, avctx->sample_aspect_ratio.den);
+      s->buf[8] = 0; /* unit specifier is unknown */
+    }
+    png_write_chunk(&s->bytestream, MKTAG('p', 'H', 'Y', 's'), s->buf, 9);
+
+    /* write colorspace information */
+    if (pict->color_primaries == AVCOL_PRI_BT709 &&
+        pict->color_trc == AVCOL_TRC_IEC61966_2_1) {
+        s->buf[0] = 1; /* rendering intent, relative colorimetric by default */
+        png_write_chunk(&s->bytestream, MKTAG('s', 'R', 'G', 'B'), s->buf, 1);
+    }
+
+    if (png_get_chrm(pict->color_primaries, s->buf))
+        png_write_chunk(&s->bytestream, MKTAG('c', 'H', 'R', 'M'), s->buf, 32);
+    if (png_get_gama(pict->color_trc, s->buf))
+        png_write_chunk(&s->bytestream, MKTAG('g', 'A', 'M', 'A'), s->buf, 4);
+
     /* put the palette if needed */
-    if (color_type == PNG_COLOR_TYPE_PALETTE) {
+    if (s->color_type == PNG_COLOR_TYPE_PALETTE) {
         int has_alpha, alpha, i;
         unsigned int v;
         uint32_t *palette;
-        uint8_t *alpha_ptr;
+        uint8_t *ptr, *alpha_ptr;
 
-        palette   = (uint32_t *)p->data[1];
+        palette   = (uint32_t *)pict->data[1];
         ptr       = s->buf;
         alpha_ptr = s->buf + 256 * 3;
         has_alpha = 0;
         for (i = 0; i < 256; i++) {
             v     = palette[i];
             alpha = v >> 24;
-            if (alpha && alpha != 0xff)
+            if (alpha != 0xff)
                 has_alpha = 1;
             *alpha_ptr++ = alpha;
             bytestream_put_be24(&ptr, v);
@@ -371,48 +403,71 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
     }
 
-    /* now put each row */
+    return 0;
+}
+
+static int encode_frame(AVCodecContext *avctx, const AVFrame *pict)
+{
+    PNGEncContext *s       = avctx->priv_data;
+    const AVFrame *const p = pict;
+    int y, len, ret;
+    int row_size, pass_row_size;
+    uint8_t *ptr, *top, *crow_buf, *crow;
+    uint8_t *crow_base       = NULL;
+    uint8_t *progressive_buf = NULL;
+    uint8_t *top_buf         = NULL;
+
+    row_size = (pict->width * s->bits_per_pixel + 7) >> 3;
+
+    crow_base = av_malloc((row_size + 32) << (s->filter_type == PNG_FILTER_VALUE_MIXED));
+    if (!crow_base) {
+        ret = AVERROR(ENOMEM);
+        goto the_end;
+    }
+    // pixel data should be aligned, but there's a control byte before it
+    crow_buf = crow_base + 15;
+    if (s->is_progressive) {
+        progressive_buf = av_malloc(row_size + 1);
+        top_buf = av_malloc(row_size + 1);
+        if (!progressive_buf || !top_buf) {
+            ret = AVERROR(ENOMEM);
+            goto the_end;
+        }
+    }
+
+    /* put each row */
     s->zstream.avail_out = IOBUF_SIZE;
     s->zstream.next_out  = s->buf;
-    if (is_progressive) {
+    if (s->is_progressive) {
         int pass;
 
         for (pass = 0; pass < NB_PASSES; pass++) {
             /* NOTE: a pass is completely omitted if no pixels would be
              * output */
-            pass_row_size = ff_png_pass_row_size(pass, bits_per_pixel, avctx->width);
+            pass_row_size = ff_png_pass_row_size(pass, s->bits_per_pixel, pict->width);
             if (pass_row_size > 0) {
                 top = NULL;
-                for (y = 0; y < avctx->height; y++)
+                for (y = 0; y < pict->height; y++)
                     if ((ff_png_pass_ymask[pass] << (y & 7)) & 0x80) {
                         ptr = p->data[0] + y * p->linesize[0];
                         FFSWAP(uint8_t *, progressive_buf, top_buf);
-                        if (color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                            convert_from_rgb32(rgba_buf, ptr, avctx->width);
-                            ptr = rgba_buf;
-                        }
                         png_get_interlaced_row(progressive_buf, pass_row_size,
-                                               bits_per_pixel, pass,
-                                               ptr, avctx->width);
+                                               s->bits_per_pixel, pass,
+                                               ptr, pict->width);
                         crow = png_choose_filter(s, crow_buf, progressive_buf,
-                                                 top, pass_row_size, bits_per_pixel >> 3);
-                        png_write_row(s, crow, pass_row_size + 1);
+                                                 top, pass_row_size, s->bits_per_pixel >> 3);
+                        png_write_row(avctx, crow, pass_row_size + 1);
                         top = progressive_buf;
                     }
             }
         }
     } else {
         top = NULL;
-        for (y = 0; y < avctx->height; y++) {
+        for (y = 0; y < pict->height; y++) {
             ptr = p->data[0] + y * p->linesize[0];
-            if (color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                FFSWAP(uint8_t *, rgba_buf, top_buf);
-                convert_from_rgb32(rgba_buf, ptr, avctx->width);
-                ptr = rgba_buf;
-            }
             crow = png_choose_filter(s, crow_buf, ptr, top,
-                                     row_size, bits_per_pixel >> 3);
-            png_write_row(s, crow, row_size + 1);
+                                     row_size, s->bits_per_pixel >> 3);
+            png_write_row(avctx, crow, row_size + 1);
             top = ptr;
         }
     }
@@ -422,38 +477,499 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         if (ret == Z_OK || ret == Z_STREAM_END) {
             len = IOBUF_SIZE - s->zstream.avail_out;
             if (len > 0 && s->bytestream_end - s->bytestream > len + 100) {
-                png_write_chunk(&s->bytestream, MKTAG('I', 'D', 'A', 'T'), s->buf, len);
+                png_write_image_data(avctx, s->buf, len);
             }
             s->zstream.avail_out = IOBUF_SIZE;
             s->zstream.next_out  = s->buf;
             if (ret == Z_STREAM_END)
                 break;
         } else {
-            goto fail;
+            ret = -1;
+            goto the_end;
         }
     }
+
+    ret = 0;
+
+the_end:
+    av_freep(&crow_base);
+    av_freep(&progressive_buf);
+    av_freep(&top_buf);
+    deflateReset(&s->zstream);
+    return ret;
+}
+
+static int encode_png(AVCodecContext *avctx, AVPacket *pkt,
+                      const AVFrame *pict, int *got_packet)
+{
+    PNGEncContext *s = avctx->priv_data;
+    int ret;
+    int enc_row_size;
+    size_t max_packet_size;
+
+    enc_row_size    = deflateBound(&s->zstream, (avctx->width * s->bits_per_pixel + 7) >> 3);
+    max_packet_size =
+        AV_INPUT_BUFFER_MIN_SIZE + // headers
+        avctx->height * (
+            enc_row_size +
+            12 * (((int64_t)enc_row_size + IOBUF_SIZE - 1) / IOBUF_SIZE) // IDAT * ceil(enc_row_size / IOBUF_SIZE)
+        );
+    if (max_packet_size > INT_MAX)
+        return AVERROR(ENOMEM);
+    ret = ff_alloc_packet2(avctx, pkt, max_packet_size, 0);
+    if (ret < 0)
+        return ret;
+
+    s->bytestream_start =
+    s->bytestream       = pkt->data;
+    s->bytestream_end   = pkt->data + pkt->size;
+
+    AV_WB64(s->bytestream, PNGSIG);
+    s->bytestream += 8;
+
+    ret = encode_headers(avctx, pict);
+    if (ret < 0)
+        return ret;
+
+    ret = encode_frame(avctx, pict);
+    if (ret < 0)
+        return ret;
+
     png_write_chunk(&s->bytestream, MKTAG('I', 'E', 'N', 'D'), NULL, 0);
 
-    pkt->size   = s->bytestream - s->bytestream_start;
+    pkt->size = s->bytestream - s->bytestream_start;
     pkt->flags |= AV_PKT_FLAG_KEY;
     *got_packet = 1;
-    ret         = 0;
 
-the_end:
-    av_free(crow_base);
-    av_free(progressive_buf);
-    av_free(rgba_buf);
-    av_free(top_buf);
-    deflateEnd(&s->zstream);
-    return ret;
+    return 0;
+}
+
+static int apng_do_inverse_blend(AVFrame *output, const AVFrame *input,
+                                  APNGFctlChunk *fctl_chunk, uint8_t bpp)
+{
+    // output: background, input: foreground
+    // output the image such that when blended with the background, will produce the foreground
+
+    unsigned int x, y;
+    unsigned int leftmost_x = input->width;
+    unsigned int rightmost_x = 0;
+    unsigned int topmost_y = input->height;
+    unsigned int bottommost_y = 0;
+    const uint8_t *input_data = input->data[0];
+    uint8_t *output_data = output->data[0];
+    ptrdiff_t input_linesize = input->linesize[0];
+    ptrdiff_t output_linesize = output->linesize[0];
+
+    // Find bounding box of changes
+    for (y = 0; y < input->height; ++y) {
+        for (x = 0; x < input->width; ++x) {
+            if (!memcmp(input_data + bpp * x, output_data + bpp * x, bpp))
+                continue;
+
+            if (x < leftmost_x)
+                leftmost_x = x;
+            if (x >= rightmost_x)
+                rightmost_x = x + 1;
+            if (y < topmost_y)
+                topmost_y = y;
+            if (y >= bottommost_y)
+                bottommost_y = y + 1;
+        }
+
+        input_data += input_linesize;
+        output_data += output_linesize;
+    }
+
+    if (leftmost_x == input->width && rightmost_x == 0) {
+        // Empty frame
+        // APNG does not support empty frames, so we make it a 1x1 frame
+        leftmost_x = topmost_y = 0;
+        rightmost_x = bottommost_y = 1;
+    }
+
+    // Do actual inverse blending
+    if (fctl_chunk->blend_op == APNG_BLEND_OP_SOURCE) {
+        output_data = output->data[0];
+        for (y = topmost_y; y < bottommost_y; ++y) {
+            memcpy(output_data,
+                   input->data[0] + input_linesize * y + bpp * leftmost_x,
+                   bpp * (rightmost_x - leftmost_x));
+            output_data += output_linesize;
+        }
+    } else { // APNG_BLEND_OP_OVER
+        size_t transparent_palette_index;
+        uint32_t *palette;
+
+        switch (input->format) {
+        case AV_PIX_FMT_RGBA64BE:
+        case AV_PIX_FMT_YA16BE:
+        case AV_PIX_FMT_RGBA:
+        case AV_PIX_FMT_GRAY8A:
+            break;
+
+        case AV_PIX_FMT_PAL8:
+            palette = (uint32_t*)input->data[1];
+            for (transparent_palette_index = 0; transparent_palette_index < 256; ++transparent_palette_index)
+                if (palette[transparent_palette_index] >> 24 == 0)
+                    break;
+            break;
+
+        default:
+            // No alpha, so blending not possible
+            return -1;
+        }
+
+        for (y = topmost_y; y < bottommost_y; ++y) {
+            uint8_t *foreground = input->data[0] + input_linesize * y + bpp * leftmost_x;
+            uint8_t *background = output->data[0] + output_linesize * y + bpp * leftmost_x;
+            output_data = output->data[0] + output_linesize * (y - topmost_y);
+            for (x = leftmost_x; x < rightmost_x; ++x, foreground += bpp, background += bpp, output_data += bpp) {
+                if (!memcmp(foreground, background, bpp)) {
+                    if (input->format == AV_PIX_FMT_PAL8) {
+                        if (transparent_palette_index == 256) {
+                            // Need fully transparent colour, but none exists
+                            return -1;
+                        }
+
+                        *output_data = transparent_palette_index;
+                    } else {
+                        memset(output_data, 0, bpp);
+                    }
+                    continue;
+                }
+
+                // Check for special alpha values, since full inverse
+                // alpha-on-alpha blending is rarely possible, and when
+                // possible, doesn't compress much better than
+                // APNG_BLEND_OP_SOURCE blending
+                switch (input->format) {
+                case AV_PIX_FMT_RGBA64BE:
+                    if (((uint16_t*)foreground)[3] == 0xffff ||
+                        ((uint16_t*)background)[3] == 0)
+                        break;
+                    return -1;
+
+                case AV_PIX_FMT_YA16BE:
+                    if (((uint16_t*)foreground)[1] == 0xffff ||
+                        ((uint16_t*)background)[1] == 0)
+                        break;
+                    return -1;
+
+                case AV_PIX_FMT_RGBA:
+                    if (foreground[3] == 0xff || background[3] == 0)
+                        break;
+                    return -1;
+
+                case AV_PIX_FMT_GRAY8A:
+                    if (foreground[1] == 0xff || background[1] == 0)
+                        break;
+                    return -1;
+
+                case AV_PIX_FMT_PAL8:
+                    if (palette[*foreground] >> 24 == 0xff ||
+                        palette[*background] >> 24 == 0)
+                        break;
+                    return -1;
+                }
+
+                memmove(output_data, foreground, bpp);
+            }
+        }
+    }
+
+    output->width = rightmost_x - leftmost_x;
+    output->height = bottommost_y - topmost_y;
+    fctl_chunk->width = output->width;
+    fctl_chunk->height = output->height;
+    fctl_chunk->x_offset = leftmost_x;
+    fctl_chunk->y_offset = topmost_y;
+
+    return 0;
+}
+
+static int apng_encode_frame(AVCodecContext *avctx, const AVFrame *pict,
+                             APNGFctlChunk *best_fctl_chunk, APNGFctlChunk *best_last_fctl_chunk)
+{
+    PNGEncContext *s = avctx->priv_data;
+    int ret;
+    unsigned int y;
+    AVFrame* diffFrame;
+    uint8_t bpp = (s->bits_per_pixel + 7) >> 3;
+    uint8_t *original_bytestream, *original_bytestream_end;
+    uint8_t *temp_bytestream = 0, *temp_bytestream_end;
+    uint32_t best_sequence_number;
+    uint8_t *best_bytestream;
+    size_t best_bytestream_size = SIZE_MAX;
+    APNGFctlChunk last_fctl_chunk = *best_last_fctl_chunk;
+    APNGFctlChunk fctl_chunk = *best_fctl_chunk;
+
+    if (avctx->frame_number == 0) {
+        best_fctl_chunk->width = pict->width;
+        best_fctl_chunk->height = pict->height;
+        best_fctl_chunk->x_offset = 0;
+        best_fctl_chunk->y_offset = 0;
+        best_fctl_chunk->blend_op = APNG_BLEND_OP_SOURCE;
+        return encode_frame(avctx, pict);
+    }
+
+    diffFrame = av_frame_alloc();
+    if (!diffFrame)
+        return AVERROR(ENOMEM);
+
+    diffFrame->format = pict->format;
+    diffFrame->width = pict->width;
+    diffFrame->height = pict->height;
+    if ((ret = av_frame_get_buffer(diffFrame, 32)) < 0)
+        goto fail;
+
+    original_bytestream = s->bytestream;
+    original_bytestream_end = s->bytestream_end;
+
+    temp_bytestream = av_malloc(original_bytestream_end - original_bytestream);
+    temp_bytestream_end = temp_bytestream + (original_bytestream_end - original_bytestream);
+    if (!temp_bytestream) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    for (last_fctl_chunk.dispose_op = 0; last_fctl_chunk.dispose_op < 3; ++last_fctl_chunk.dispose_op) {
+        // 0: APNG_DISPOSE_OP_NONE
+        // 1: APNG_DISPOSE_OP_BACKGROUND
+        // 2: APNG_DISPOSE_OP_PREVIOUS
+
+        for (fctl_chunk.blend_op = 0; fctl_chunk.blend_op < 2; ++fctl_chunk.blend_op) {
+            // 0: APNG_BLEND_OP_SOURCE
+            // 1: APNG_BLEND_OP_OVER
+
+            uint32_t original_sequence_number = s->sequence_number, sequence_number;
+            uint8_t *bytestream_start = s->bytestream;
+            size_t bytestream_size;
+
+            // Do disposal
+            if (last_fctl_chunk.dispose_op != APNG_DISPOSE_OP_PREVIOUS) {
+                memcpy(diffFrame->data[0], s->last_frame->data[0],
+                       s->last_frame->linesize[0] * s->last_frame->height);
+
+                if (last_fctl_chunk.dispose_op == APNG_DISPOSE_OP_BACKGROUND) {
+                    for (y = last_fctl_chunk.y_offset; y < last_fctl_chunk.y_offset + last_fctl_chunk.height; ++y) {
+                        size_t row_start = s->last_frame->linesize[0] * y + bpp * last_fctl_chunk.x_offset;
+                        memset(diffFrame->data[0] + row_start, 0, bpp * last_fctl_chunk.width);
+                    }
+                }
+            } else {
+                if (!s->prev_frame)
+                    continue;
+
+                memcpy(diffFrame->data[0], s->prev_frame->data[0],
+                       s->prev_frame->linesize[0] * s->prev_frame->height);
+            }
+
+            // Do inverse blending
+            if (apng_do_inverse_blend(diffFrame, pict, &fctl_chunk, bpp) < 0)
+                continue;
+
+            // Do encoding
+            ret = encode_frame(avctx, diffFrame);
+            sequence_number = s->sequence_number;
+            s->sequence_number = original_sequence_number;
+            bytestream_size = s->bytestream - bytestream_start;
+            s->bytestream = bytestream_start;
+            if (ret < 0)
+                goto fail;
+
+            if (bytestream_size < best_bytestream_size) {
+                *best_fctl_chunk = fctl_chunk;
+                *best_last_fctl_chunk = last_fctl_chunk;
+
+                best_sequence_number = sequence_number;
+                best_bytestream = s->bytestream;
+                best_bytestream_size = bytestream_size;
+
+                if (best_bytestream == original_bytestream) {
+                    s->bytestream = temp_bytestream;
+                    s->bytestream_end = temp_bytestream_end;
+                } else {
+                    s->bytestream = original_bytestream;
+                    s->bytestream_end = original_bytestream_end;
+                }
+            }
+        }
+    }
+
+    s->sequence_number = best_sequence_number;
+    s->bytestream = original_bytestream + best_bytestream_size;
+    s->bytestream_end = original_bytestream_end;
+    if (best_bytestream != original_bytestream)
+        memcpy(original_bytestream, best_bytestream, best_bytestream_size);
+
+    ret = 0;
+
 fail:
-    ret = -1;
-    goto the_end;
+    av_freep(&temp_bytestream);
+    av_frame_free(&diffFrame);
+    return ret;
+}
+
+static int encode_apng(AVCodecContext *avctx, AVPacket *pkt,
+                       const AVFrame *pict, int *got_packet)
+{
+    PNGEncContext *s = avctx->priv_data;
+    int ret;
+    int enc_row_size;
+    size_t max_packet_size;
+    APNGFctlChunk fctl_chunk;
+
+    if (pict && avctx->codec_id == AV_CODEC_ID_APNG && s->color_type == PNG_COLOR_TYPE_PALETTE) {
+        uint32_t checksum = ~av_crc(av_crc_get_table(AV_CRC_32_IEEE_LE), ~0U, pict->data[1], 256 * sizeof(uint32_t));
+
+        if (avctx->frame_number == 0) {
+            s->palette_checksum = checksum;
+        } else if (checksum != s->palette_checksum) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Input contains more than one unique palette. APNG does not support multiple palettes.\n");
+            return -1;
+        }
+    }
+
+    enc_row_size    = deflateBound(&s->zstream, (avctx->width * s->bits_per_pixel + 7) >> 3);
+    max_packet_size =
+        AV_INPUT_BUFFER_MIN_SIZE + // headers
+        avctx->height * (
+            enc_row_size +
+            (4 + 12) * (((int64_t)enc_row_size + IOBUF_SIZE - 1) / IOBUF_SIZE) // fdAT * ceil(enc_row_size / IOBUF_SIZE)
+        );
+    if (max_packet_size > INT_MAX)
+        return AVERROR(ENOMEM);
+
+    if (avctx->frame_number == 0) {
+        s->bytestream = avctx->extradata = av_malloc(FF_MIN_BUFFER_SIZE);
+        if (!avctx->extradata)
+            return AVERROR(ENOMEM);
+
+        ret = encode_headers(avctx, pict);
+        if (ret < 0)
+            return ret;
+
+        avctx->extradata_size = s->bytestream - avctx->extradata;
+
+        s->last_frame_packet = av_malloc(max_packet_size);
+        if (!s->last_frame_packet)
+            return AVERROR(ENOMEM);
+    } else if (s->last_frame) {
+        ret = ff_alloc_packet2(avctx, pkt, max_packet_size, 0);
+        if (ret < 0)
+            return ret;
+
+        memcpy(pkt->data, s->last_frame_packet, s->last_frame_packet_size);
+        pkt->size = s->last_frame_packet_size;
+        pkt->pts = pkt->dts = s->last_frame->pts;
+    }
+
+    if (pict) {
+        s->bytestream_start =
+        s->bytestream       = s->last_frame_packet;
+        s->bytestream_end   = s->bytestream + max_packet_size;
+
+        // We're encoding the frame first, so we have to do a bit of shuffling around
+        // to have the image data write to the correct place in the buffer
+        fctl_chunk.sequence_number = s->sequence_number;
+        ++s->sequence_number;
+        s->bytestream += 26 + 12;
+
+        ret = apng_encode_frame(avctx, pict, &fctl_chunk, &s->last_frame_fctl);
+        if (ret < 0)
+            return ret;
+
+        fctl_chunk.delay_num = 0; // delay filled in during muxing
+        fctl_chunk.delay_den = 0;
+    } else {
+        s->last_frame_fctl.dispose_op = APNG_DISPOSE_OP_NONE;
+    }
+
+    if (s->last_frame) {
+        uint8_t* last_fctl_chunk_start = pkt->data;
+        uint8_t buf[26];
+
+        AV_WB32(buf + 0, s->last_frame_fctl.sequence_number);
+        AV_WB32(buf + 4, s->last_frame_fctl.width);
+        AV_WB32(buf + 8, s->last_frame_fctl.height);
+        AV_WB32(buf + 12, s->last_frame_fctl.x_offset);
+        AV_WB32(buf + 16, s->last_frame_fctl.y_offset);
+        AV_WB16(buf + 20, s->last_frame_fctl.delay_num);
+        AV_WB16(buf + 22, s->last_frame_fctl.delay_den);
+        buf[24] = s->last_frame_fctl.dispose_op;
+        buf[25] = s->last_frame_fctl.blend_op;
+        png_write_chunk(&last_fctl_chunk_start, MKTAG('f', 'c', 'T', 'L'), buf, 26);
+
+        *got_packet = 1;
+    }
+
+    if (pict) {
+        if (!s->last_frame) {
+            s->last_frame = av_frame_alloc();
+            if (!s->last_frame)
+                return AVERROR(ENOMEM);
+        } else if (s->last_frame_fctl.dispose_op != APNG_DISPOSE_OP_PREVIOUS) {
+            if (!s->prev_frame) {
+                s->prev_frame = av_frame_alloc();
+                if (!s->prev_frame)
+                    return AVERROR(ENOMEM);
+
+                s->prev_frame->format = pict->format;
+                s->prev_frame->width = pict->width;
+                s->prev_frame->height = pict->height;
+                if ((ret = av_frame_get_buffer(s->prev_frame, 32)) < 0)
+                    return ret;
+            }
+
+            // Do disposal, but not blending
+            memcpy(s->prev_frame->data[0], s->last_frame->data[0],
+                   s->last_frame->linesize[0] * s->last_frame->height);
+            if (s->last_frame_fctl.dispose_op == APNG_DISPOSE_OP_BACKGROUND) {
+                uint32_t y;
+                uint8_t bpp = (s->bits_per_pixel + 7) >> 3;
+                for (y = s->last_frame_fctl.y_offset; y < s->last_frame_fctl.y_offset + s->last_frame_fctl.height; ++y) {
+                    size_t row_start = s->last_frame->linesize[0] * y + bpp * s->last_frame_fctl.x_offset;
+                    memset(s->prev_frame->data[0] + row_start, 0, bpp * s->last_frame_fctl.width);
+                }
+            }
+        }
+
+        av_frame_unref(s->last_frame);
+        ret = av_frame_ref(s->last_frame, (AVFrame*)pict);
+        if (ret < 0)
+            return ret;
+
+        s->last_frame_fctl = fctl_chunk;
+        s->last_frame_packet_size = s->bytestream - s->bytestream_start;
+    } else {
+        av_frame_free(&s->last_frame);
+    }
+
+    return 0;
 }
 
 static av_cold int png_enc_init(AVCodecContext *avctx)
 {
     PNGEncContext *s = avctx->priv_data;
+    int compression_level;
+
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_RGBA:
+        avctx->bits_per_coded_sample = 32;
+        break;
+    case AV_PIX_FMT_RGB24:
+        avctx->bits_per_coded_sample = 24;
+        break;
+    case AV_PIX_FMT_GRAY8:
+        avctx->bits_per_coded_sample = 0x28;
+        break;
+    case AV_PIX_FMT_MONOBLACK:
+        avctx->bits_per_coded_sample = 1;
+        break;
+    case AV_PIX_FMT_PAL8:
+        avctx->bits_per_coded_sample = 8;
+    }
 
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -470,9 +986,105 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (avctx->pix_fmt == AV_PIX_FMT_MONOBLACK)
         s->filter_type = PNG_FILTER_VALUE_NONE;
 
+    if (s->dpi && s->dpm) {
+      av_log(avctx, AV_LOG_ERROR, "Only one of 'dpi' or 'dpm' options should be set\n");
+      return AVERROR(EINVAL);
+    } else if (s->dpi) {
+      s->dpm = s->dpi * 10000 / 254;
+    }
+
+    s->is_progressive = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_RGBA64BE:
+        s->bit_depth = 16;
+        s->color_type = PNG_COLOR_TYPE_RGB_ALPHA;
+        break;
+    case AV_PIX_FMT_RGB48BE:
+        s->bit_depth = 16;
+        s->color_type = PNG_COLOR_TYPE_RGB;
+        break;
+    case AV_PIX_FMT_RGBA:
+        s->bit_depth  = 8;
+        s->color_type = PNG_COLOR_TYPE_RGB_ALPHA;
+        break;
+    case AV_PIX_FMT_RGB24:
+        s->bit_depth  = 8;
+        s->color_type = PNG_COLOR_TYPE_RGB;
+        break;
+    case AV_PIX_FMT_GRAY16BE:
+        s->bit_depth  = 16;
+        s->color_type = PNG_COLOR_TYPE_GRAY;
+        break;
+    case AV_PIX_FMT_GRAY8:
+        s->bit_depth  = 8;
+        s->color_type = PNG_COLOR_TYPE_GRAY;
+        break;
+    case AV_PIX_FMT_GRAY8A:
+        s->bit_depth = 8;
+        s->color_type = PNG_COLOR_TYPE_GRAY_ALPHA;
+        break;
+    case AV_PIX_FMT_YA16BE:
+        s->bit_depth = 16;
+        s->color_type = PNG_COLOR_TYPE_GRAY_ALPHA;
+        break;
+    case AV_PIX_FMT_MONOBLACK:
+        s->bit_depth  = 1;
+        s->color_type = PNG_COLOR_TYPE_GRAY;
+        break;
+    case AV_PIX_FMT_PAL8:
+        s->bit_depth  = 8;
+        s->color_type = PNG_COLOR_TYPE_PALETTE;
+        break;
+    default:
+        return -1;
+    }
+    s->bits_per_pixel = ff_png_get_nb_channels(s->color_type) * s->bit_depth;
+
+    s->zstream.zalloc = ff_png_zalloc;
+    s->zstream.zfree  = ff_png_zfree;
+    s->zstream.opaque = NULL;
+    compression_level = avctx->compression_level == FF_COMPRESSION_DEFAULT
+                      ? Z_DEFAULT_COMPRESSION
+                      : av_clip(avctx->compression_level, 0, 9);
+    if (deflateInit2(&s->zstream, compression_level, Z_DEFLATED, 15, 8, Z_DEFAULT_STRATEGY) != Z_OK)
+        return -1;
+
+    return 0;
+}
+
+static av_cold int png_enc_close(AVCodecContext *avctx)
+{
+    PNGEncContext *s = avctx->priv_data;
+
+    deflateEnd(&s->zstream);
+    av_frame_free(&s->last_frame);
+    av_frame_free(&s->prev_frame);
+    av_freep(&s->last_frame_packet);
     return 0;
 }
 
+#define OFFSET(x) offsetof(PNGEncContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    {"dpi", "Set image resolution (in dots per inch)",  OFFSET(dpi), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 0x10000, VE},
+    {"dpm", "Set image resolution (in dots per meter)", OFFSET(dpm), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 0x10000, VE},
+    { NULL }
+};
+
+static const AVClass pngenc_class = {
+    .class_name = "PNG encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVClass apngenc_class = {
+    .class_name = "APNG encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_png_encoder = {
     .name           = "png",
     .long_name      = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"),
@@ -480,10 +1092,37 @@ AVCodec ff_png_encoder = {
     .id             = AV_CODEC_ID_PNG,
     .priv_data_size = sizeof(PNGEncContext),
     .init           = png_enc_init,
-    .encode2        = encode_frame,
+    .close          = png_enc_close,
+    .encode2        = encode_png,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts       = (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA,
+        AV_PIX_FMT_RGB48BE, AV_PIX_FMT_RGBA64BE,
+        AV_PIX_FMT_PAL8,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A,
+        AV_PIX_FMT_GRAY16BE, AV_PIX_FMT_YA16BE,
+        AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_NONE
+    },
+    .priv_class     = &pngenc_class,
+};
+
+AVCodec ff_apng_encoder = {
+    .name           = "apng",
+    .long_name      = NULL_IF_CONFIG_SMALL("APNG (Animated Portable Network Graphics) image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_APNG,
+    .priv_data_size = sizeof(PNGEncContext),
+    .init           = png_enc_init,
+    .close          = png_enc_close,
+    .encode2        = encode_apng,
+    .capabilities   = CODEC_CAP_DELAY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
-        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB32, AV_PIX_FMT_PAL8, AV_PIX_FMT_GRAY8,
-        AV_PIX_FMT_RGBA64BE, AV_PIX_FMT_RGB48BE, AV_PIX_FMT_GRAY16BE,
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA,
+        AV_PIX_FMT_RGB48BE, AV_PIX_FMT_RGBA64BE,
+        AV_PIX_FMT_PAL8,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A,
+        AV_PIX_FMT_GRAY16BE, AV_PIX_FMT_YA16BE,
         AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_NONE
     },
+    .priv_class     = &apngenc_class,
 };
diff --git a/libavcodec/pnm.c b/libavcodec/pnm.c
index 1c380b0ff8..1675959fbf 100644
--- a/libavcodec/pnm.c
+++ b/libavcodec/pnm.c
@@ -2,20 +2,20 @@
  * PNM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,12 +37,12 @@ static void pnm_get(PNMContext *sc, char *str, int buf_size)
     int c;
 
     /* skip spaces and comments */
-    for (;;) {
+    while (sc->bytestream < sc->bytestream_end) {
         c = *sc->bytestream++;
         if (c == '#')  {
-            do {
+            while (c != '\n' && sc->bytestream < sc->bytestream_end) {
                 c = *sc->bytestream++;
-            } while (c != '\n' && sc->bytestream < sc->bytestream_end);
+            }
         } else if (!pnm_space(c)) {
             break;
         }
@@ -63,9 +63,9 @@ int ff_pnm_decode_header(AVCodecContext *avctx, PNMContext * const s)
     int h, w, depth, maxval;
 
     pnm_get(s, buf1, sizeof(buf1));
-    s->type= buf1[1]-'0';
     if(buf1[0] != 'P')
         return AVERROR_INVALIDDATA;
+    s->type= buf1[1]-'0';
 
     if (s->type==1 || s->type==4) {
         avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
@@ -107,26 +107,38 @@ int ff_pnm_decode_header(AVCodecContext *avctx, PNMContext * const s)
             }
         }
         /* check that all tags are present */
-        if (w <= 0 || h <= 0 || maxval <= 0 || depth <= 0 || tuple_type[0] == '\0' || av_image_check_size(w, h, 0, avctx))
+        if (w <= 0 || h <= 0 || maxval <= 0 || depth <= 0 || tuple_type[0] == '\0' || av_image_check_size(w, h, 0, avctx) || s->bytestream >= s->bytestream_end)
             return AVERROR_INVALIDDATA;
 
         avctx->width  = w;
         avctx->height = h;
+        s->maxval     = maxval;
         if (depth == 1) {
-            if (maxval == 1)
-                avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
-            else
+            if (maxval == 1) {
+                avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
+            } else if (maxval < 256) {
                 avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_GRAY16;
+            }
+        } else if (depth == 2) {
+            if (maxval < 256) {
+                avctx->pix_fmt = AV_PIX_FMT_GRAY8A;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_YA16;
+            }
         } else if (depth == 3) {
             if (maxval < 256) {
-            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+                avctx->pix_fmt = AV_PIX_FMT_RGB24;
             } else {
-                av_log(avctx, AV_LOG_ERROR, "16-bit components are only supported for grayscale\n");
-                avctx->pix_fmt = AV_PIX_FMT_NONE;
-                return AVERROR_INVALIDDATA;
+                avctx->pix_fmt = AV_PIX_FMT_RGB48;
             }
         } else if (depth == 4) {
-            avctx->pix_fmt = AV_PIX_FMT_RGB32;
+            if (maxval < 256) {
+                avctx->pix_fmt = AV_PIX_FMT_RGBA;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_RGBA64;
+            }
         } else {
             return AVERROR_INVALIDDATA;
         }
@@ -135,14 +147,16 @@ int ff_pnm_decode_header(AVCodecContext *avctx, PNMContext * const s)
         return AVERROR_INVALIDDATA;
     }
     pnm_get(s, buf1, sizeof(buf1));
-    avctx->width = atoi(buf1);
-    if (avctx->width <= 0)
-        return AVERROR_INVALIDDATA;
+    w = atoi(buf1);
     pnm_get(s, buf1, sizeof(buf1));
-    avctx->height = atoi(buf1);
-    if(av_image_check_size(avctx->width, avctx->height, 0, avctx))
+    h = atoi(buf1);
+    if(w <= 0 || h <= 0 || av_image_check_size(w, h, 0, avctx) || s->bytestream >= s->bytestream_end)
         return AVERROR_INVALIDDATA;
-    if (avctx->pix_fmt != AV_PIX_FMT_MONOWHITE) {
+
+    avctx->width  = w;
+    avctx->height = h;
+
+    if (avctx->pix_fmt != AV_PIX_FMT_MONOWHITE && avctx->pix_fmt != AV_PIX_FMT_MONOBLACK) {
         pnm_get(s, buf1, sizeof(buf1));
         s->maxval = atoi(buf1);
         if (s->maxval <= 0) {
@@ -151,17 +165,14 @@ int ff_pnm_decode_header(AVCodecContext *avctx, PNMContext * const s)
         }
         if (s->maxval >= 256) {
             if (avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
-                avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
-                if (s->maxval != 65535)
-                    avctx->pix_fmt = AV_PIX_FMT_GRAY16;
+                avctx->pix_fmt = AV_PIX_FMT_GRAY16;
             } else if (avctx->pix_fmt == AV_PIX_FMT_RGB24) {
-                if (s->maxval > 255)
-                    avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
+                avctx->pix_fmt = AV_PIX_FMT_RGB48;
             } else if (avctx->pix_fmt == AV_PIX_FMT_YUV420P && s->maxval < 65536) {
                 if (s->maxval < 512)
-                    avctx->pix_fmt = AV_PIX_FMT_YUV420P9BE;
+                    avctx->pix_fmt = AV_PIX_FMT_YUV420P9;
                 else if (s->maxval < 1024)
-                    avctx->pix_fmt = AV_PIX_FMT_YUV420P10BE;
+                    avctx->pix_fmt = AV_PIX_FMT_YUV420P10;
                 else
                     avctx->pix_fmt = AV_PIX_FMT_YUV420P16;
             } else {
diff --git a/libavcodec/pnm.h b/libavcodec/pnm.h
index 5fc6513ed7..5bc0aad29f 100644
--- a/libavcodec/pnm.h
+++ b/libavcodec/pnm.h
@@ -2,20 +2,20 @@
  * PNM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pnm_parser.c b/libavcodec/pnm_parser.c
index 175ca36267..a5eb81f5f6 100644
--- a/libavcodec/pnm_parser.c
+++ b/libavcodec/pnm_parser.c
@@ -2,20 +2,20 @@
  * PNM image parser
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pnmdec.c b/libavcodec/pnmdec.c
index d23e2c0d3e..d4261a4530 100644
--- a/libavcodec/pnmdec.c
+++ b/libavcodec/pnmdec.c
@@ -2,29 +2,39 @@
  * PNM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
-#include "bytestream.h"
 #include "internal.h"
 #include "put_bits.h"
 #include "pnm.h"
 
+static void samplecpy(uint8_t *dst, const uint8_t *src, int n, int maxval)
+{
+    if (maxval <= 255) {
+        memcpy(dst, src, n);
+    } else {
+        int i;
+        for (i=0; i<n/2; i++) {
+            ((uint16_t *)dst)[i] = AV_RB16(src+2*i);
+        }
+    }
+}
 
 static int pnm_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame, AVPacket *avpkt)
@@ -33,36 +43,51 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
     int buf_size         = avpkt->size;
     PNMContext * const s = avctx->priv_data;
     AVFrame * const p    = data;
-    int i, j, n, linesize, h, upgrade = 0;
+    int i, j, n, linesize, h, upgrade = 0, is_mono = 0;
     unsigned char *ptr;
     int components, sample_len, ret;
 
     s->bytestream_start =
-    s->bytestream       = buf;
-    s->bytestream_end   = buf + buf_size;
+    s->bytestream       = (uint8_t *)buf;
+    s->bytestream_end   = (uint8_t *)buf + buf_size;
 
     if ((ret = ff_pnm_decode_header(avctx, s)) < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
+    avctx->bits_per_raw_sample = av_log2(s->maxval) + 1;
 
     switch (avctx->pix_fmt) {
     default:
         return AVERROR(EINVAL);
-    case AV_PIX_FMT_RGB48BE:
+    case AV_PIX_FMT_RGBA64:
+        n = avctx->width * 8;
+        components=4;
+        sample_len=16;
+        if (s->maxval < 65535)
+            upgrade = 2;
+        goto do_read;
+    case AV_PIX_FMT_RGB48:
         n = avctx->width * 6;
         components=3;
         sample_len=16;
+        if (s->maxval < 65535)
+            upgrade = 2;
+        goto do_read;
+    case AV_PIX_FMT_RGBA:
+        n = avctx->width * 4;
+        components=4;
+        sample_len=8;
         goto do_read;
     case AV_PIX_FMT_RGB24:
         n = avctx->width * 3;
         components=3;
         sample_len=8;
+        if (s->maxval < 255)
+            upgrade = 1;
         goto do_read;
     case AV_PIX_FMT_GRAY8:
         n = avctx->width;
@@ -71,48 +96,71 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
         if (s->maxval < 255)
             upgrade = 1;
         goto do_read;
-    case AV_PIX_FMT_GRAY16BE:
-    case AV_PIX_FMT_GRAY16LE:
+    case AV_PIX_FMT_GRAY8A:
+        n = avctx->width * 2;
+        components=2;
+        sample_len=8;
+        goto do_read;
+    case AV_PIX_FMT_GRAY16:
         n = avctx->width * 2;
         components=1;
         sample_len=16;
         if (s->maxval < 65535)
             upgrade = 2;
         goto do_read;
+    case AV_PIX_FMT_YA16:
+        n =  avctx->width * 4;
+        components=2;
+        sample_len=16;
+        if (s->maxval < 65535)
+            upgrade = 2;
+        goto do_read;
     case AV_PIX_FMT_MONOWHITE:
     case AV_PIX_FMT_MONOBLACK:
         n = (avctx->width + 7) >> 3;
         components=1;
         sample_len=1;
+        is_mono = 1;
     do_read:
         ptr      = p->data[0];
         linesize = p->linesize[0];
         if (s->bytestream + n * avctx->height > s->bytestream_end)
             return AVERROR_INVALIDDATA;
-        if(s->type < 4){
+        if(s->type < 4 || (is_mono && s->type==7)){
             for (i=0; i<avctx->height; i++) {
                 PutBitContext pb;
                 init_put_bits(&pb, ptr, linesize);
                 for(j=0; j<avctx->width * components; j++){
                     unsigned int c=0;
                     int v=0;
+                    if(s->type < 4)
                     while(s->bytestream < s->bytestream_end && (*s->bytestream < '0' || *s->bytestream > '9' ))
                         s->bytestream++;
                     if(s->bytestream >= s->bytestream_end)
                         return AVERROR_INVALIDDATA;
-                    do{
-                        v= 10*v + c;
-                        c= (*s->bytestream++) - '0';
-                    }while(c <= 9);
-                    put_bits(&pb, sample_len, (((1<<sample_len)-1)*v + (s->maxval>>1))/s->maxval);
+                    if (is_mono) {
+                        /* read a single digit */
+                        v = (*s->bytestream++)&1;
+                    } else {
+                        /* read a sequence of digits */
+                        do {
+                            v = 10*v + c;
+                            c = (*s->bytestream++) - '0';
+                        } while (c <= 9);
+                    }
+                    if (sample_len == 16) {
+                        ((uint16_t*)ptr)[j] = (((1<<sample_len)-1)*v + (s->maxval>>1))/s->maxval;
+                    } else
+                        put_bits(&pb, sample_len, (((1<<sample_len)-1)*v + (s->maxval>>1))/s->maxval);
                 }
-                flush_put_bits(&pb);
+                if (sample_len != 16)
+                    flush_put_bits(&pb);
                 ptr+= linesize;
             }
         }else{
         for (i = 0; i < avctx->height; i++) {
             if (!upgrade)
-                memcpy(ptr, s->bytestream, n);
+                samplecpy(ptr, s->bytestream, n, s->maxval);
             else if (upgrade == 1) {
                 unsigned int j, f = (255 * 128 + s->maxval / 2) / s->maxval;
                 for (j = 0; j < n; j++)
@@ -130,8 +178,8 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
         }
         break;
     case AV_PIX_FMT_YUV420P:
-    case AV_PIX_FMT_YUV420P9BE:
-    case AV_PIX_FMT_YUV420P10BE:
+    case AV_PIX_FMT_YUV420P9:
+    case AV_PIX_FMT_YUV420P10:
         {
             unsigned char *ptr1, *ptr2;
 
@@ -143,7 +191,7 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
             if (s->bytestream + n * avctx->height * 3 / 2 > s->bytestream_end)
                 return AVERROR_INVALIDDATA;
             for (i = 0; i < avctx->height; i++) {
-                memcpy(ptr, s->bytestream, n);
+                samplecpy(ptr, s->bytestream, n, s->maxval);
                 s->bytestream += n;
                 ptr           += linesize;
             }
@@ -152,9 +200,9 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
             n >>= 1;
             h = avctx->height >> 1;
             for (i = 0; i < h; i++) {
-                memcpy(ptr1, s->bytestream, n);
+                samplecpy(ptr1, s->bytestream, n, s->maxval);
                 s->bytestream += n;
-                memcpy(ptr2, s->bytestream, n);
+                samplecpy(ptr2, s->bytestream, n, s->maxval);
                 s->bytestream += n;
                 ptr1 += p->linesize[1];
                 ptr2 += p->linesize[2];
@@ -202,24 +250,6 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
             }
         }
         break;
-    case AV_PIX_FMT_RGB32:
-        ptr      = p->data[0];
-        linesize = p->linesize[0];
-        if (s->bytestream + avctx->width * avctx->height * 4 > s->bytestream_end)
-            return AVERROR_INVALIDDATA;
-        for (i = 0; i < avctx->height; i++) {
-            int j, r, g, b, a;
-
-            for (j = 0; j < avctx->width; j++) {
-                r = *s->bytestream++;
-                g = *s->bytestream++;
-                b = *s->bytestream++;
-                a = *s->bytestream++;
-                ((uint32_t *)ptr)[j] = (a << 24) | (r << 16) | (g << 8) | b;
-            }
-            ptr += linesize;
-        }
-        break;
     }
     *got_frame = 1;
 
diff --git a/libavcodec/pnmenc.c b/libavcodec/pnmenc.c
index 791176a11c..b3eb5d90d3 100644
--- a/libavcodec/pnmenc.c
+++ b/libavcodec/pnmenc.c
@@ -2,42 +2,38 @@
  * PNM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/pixdesc.h"
 #include "avcodec.h"
-#include "bytestream.h"
 #include "internal.h"
 
 static int pnm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                            const AVFrame *pict, int *got_packet)
+                            const AVFrame *p, int *got_packet)
 {
     uint8_t *bytestream, *bytestream_start, *bytestream_end;
-    const AVFrame * const p = pict;
     int i, h, h1, c, n, linesize, ret;
     uint8_t *ptr, *ptr1, *ptr2;
 
-    if ((ret = ff_alloc_packet(pkt, avpicture_get_size(avctx->pix_fmt,
+    if ((ret = ff_alloc_packet2(avctx, pkt, avpicture_get_size(avctx->pix_fmt,
                                                        avctx->width,
-                                                       avctx->height) + 200)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                                                       avctx->height) + 200, 0)) < 0)
         return ret;
-    }
 
     bytestream_start =
     bytestream       = pkt->data;
@@ -67,6 +63,10 @@ static int pnm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         n  = avctx->width * 6;
         break;
     case AV_PIX_FMT_YUV420P:
+        if (avctx->width & 1 || avctx->height & 1) {
+            av_log(avctx, AV_LOG_ERROR, "pgmyuv needs even width and height\n");
+            return AVERROR(EINVAL);
+        }
         c  = '5';
         n  = avctx->width;
         h1 = (h * 3) / 2;
diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile
index 23e59aa3fb..bd4f427895 100644
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -2,10 +2,11 @@
 OBJS-$(CONFIG_AUDIODSP)                += ppc/audiodsp.o
 OBJS-$(CONFIG_BLOCKDSP)                += ppc/blockdsp.o
 OBJS-$(CONFIG_FFT)                     += ppc/fft_init.o                \
-                                          ppc/fft_altivec.o
+                                          ppc/fft_altivec.o             \
+                                          ppc/fft_vsx.o
 OBJS-$(CONFIG_FMTCONVERT)              += ppc/fmtconvert_altivec.o
 OBJS-$(CONFIG_H264CHROMA)              += ppc/h264chroma_init.o
-OBJS-$(CONFIG_H264DSP)                 += ppc/h264dsp.o
+OBJS-$(CONFIG_H264DSP)                 += ppc/h264dsp.o ppc/hpeldsp_altivec.o
 OBJS-$(CONFIG_H264QPEL)                += ppc/h264qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += ppc/hpeldsp_altivec.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += ppc/huffyuvdsp_altivec.o
@@ -22,7 +23,7 @@ OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o
 OBJS-$(CONFIG_VP8DSP)                  += ppc/vp8dsp_altivec.o
 
 # decoders/encoders
-OBJS-$(CONFIG_APE_DECODER)             += ppc/apedsp_altivec.o
+OBJS-$(CONFIG_LLAUDDSP)                += ppc/lossless_audiodsp_altivec.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += ppc/svq1enc_altivec.o
 OBJS-$(CONFIG_VC1_DECODER)             += ppc/vc1dsp_altivec.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += ppc/vorbisdsp_altivec.o
diff --git a/libavcodec/ppc/asm.S b/libavcodec/ppc/asm.S
index 141dee9b78..a3edeed202 100644
--- a/libavcodec/ppc/asm.S
+++ b/libavcodec/ppc/asm.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/audiodsp.c b/libavcodec/ppc/audiodsp.c
index 2a0a6d8ef0..c88c3d9167 100644
--- a/libavcodec/ppc/audiodsp.c
+++ b/libavcodec/ppc/audiodsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/audiodsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                            int order)
@@ -63,7 +63,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
 
 av_cold void ff_audiodsp_init_ppc(AudioDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/blockdsp.c b/libavcodec/ppc/blockdsp.c
index 679bc0454f..45c492ab3b 100644
--- a/libavcodec/ppc/blockdsp.c
+++ b/libavcodec/ppc/blockdsp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -143,27 +143,24 @@ static void clear_block_altivec(int16_t *block)
 }
 #endif /* HAVE_ALTIVEC */
 
-av_cold void ff_blockdsp_init_ppc(BlockDSPContext *c, unsigned high_bit_depth)
+av_cold void ff_blockdsp_init_ppc(BlockDSPContext *c)
 {
     // common optimizations whether AltiVec is available or not
-    if (!high_bit_depth) {
-        switch (check_dcbzl_effect()) {
-        case 32:
-            c->clear_blocks = clear_blocks_dcbz32_ppc;
-            break;
-        case 128:
-            c->clear_blocks = clear_blocks_dcbz128_ppc;
-            break;
-        default:
-            break;
-        }
+    switch (check_dcbzl_effect()) {
+    case 32:
+        c->clear_blocks = clear_blocks_dcbz32_ppc;
+        break;
+    case 128:
+        c->clear_blocks = clear_blocks_dcbz128_ppc;
+        break;
+    default:
+        break;
     }
 
 #if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
-    if (!high_bit_depth)
-        c->clear_block = clear_block_altivec;
+    c->clear_block = clear_block_altivec;
 #endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/dct-test.c b/libavcodec/ppc/dct-test.c
index 2acbe2aeb0..2328516ca4 100644
--- a/libavcodec/ppc/dct-test.c
+++ b/libavcodec/ppc/dct-test.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,7 +21,7 @@
 #include "fdct.h"
 
 static const struct algo fdct_tab_arch[] = {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     { "altivecfdct", ff_fdct_altivec, FF_IDCT_PERM_NONE, AV_CPU_FLAG_ALTIVEC },
 #endif
     { 0 }
diff --git a/libavcodec/ppc/fdct.h b/libavcodec/ppc/fdct.h
index 74710354ef..437f815258 100644
--- a/libavcodec/ppc/fdct.h
+++ b/libavcodec/ppc/fdct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/fdctdsp.c b/libavcodec/ppc/fdctdsp.c
index 89ceb4736f..40f4c6c967 100644
--- a/libavcodec/ppc/fdctdsp.c
+++ b/libavcodec/ppc/fdctdsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2003  James Klicman <james@klicman.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@
 #include "libavcodec/fdctdsp.h"
 #include "fdct.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 #define vs16(v)   ((vector signed short) (v))
 #define vs32(v)     ((vector signed int) (v))
@@ -59,7 +59,7 @@
 #define WA (SQRT_2 * (-C3 - C5))
 #define WB (SQRT_2 *  (C5 - C3))
 
-static vector float fdctconsts[3] = {
+static const vector float fdctconsts[3] = {
     { W0, W1, W2, W3 },
     { W4, W5, W6, W7 },
     { W8, W9, WA, WB }
@@ -196,7 +196,7 @@ static vector float fdctconsts[3] = {
 void ff_fdct_altivec(int16_t *block)
 {
     vector signed short *bp;
-    vector float *cp = fdctconsts;
+    const vector float *cp = fdctconsts;
     vector float b00, b10, b20, b30, b40, b50, b60, b70;
     vector float b01, b11, b21, b31, b41, b51, b61, b71;
     vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
@@ -465,7 +465,7 @@ void ff_fdct_altivec(int16_t *block)
 av_cold void ff_fdctdsp_init_ppc(FDCTDSPContext *c, AVCodecContext *avctx,
                                  unsigned high_bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/fft_altivec.S b/libavcodec/ppc/fft_altivec.S
index c92b30b897..aab669ea45 100644
--- a/libavcodec/ppc/fft_altivec.S
+++ b/libavcodec/ppc/fft_altivec.S
@@ -5,20 +5,20 @@
  * This algorithm (though not any of the implementation details) is
  * based on libdjbfft by D. J. Bernstein.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/fft_init.c b/libavcodec/ppc/fft_init.c
index 9e79f70ed0..675fa33a95 100644
--- a/libavcodec/ppc/fft_init.c
+++ b/libavcodec/ppc/fft_init.c
@@ -3,20 +3,20 @@
  * AltiVec-enabled
  * Copyright (c) 2009 Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,10 +36,14 @@
  * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
  */
 
+#if HAVE_VSX
+#include "fft_vsx.h"
+#else
 void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);
+#endif
 
-#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_GNU_AS && HAVE_ALTIVEC
 static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
     int j, k;
@@ -94,7 +98,11 @@ static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample
         k--;
     } while(k >= 0);
 
+#if HAVE_VSX
+    ff_fft_calc_vsx(s, (FFTComplex*)output);
+#else
     ff_fft_calc_altivec(s, (FFTComplex*)output);
+#endif
 
     /* post rotation + reordering */
     j = -n32;
@@ -143,11 +151,15 @@ static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample
 
 av_cold void ff_fft_init_ppc(FFTContext *s)
 {
-#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_GNU_AS && HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
+#if HAVE_VSX
+    s->fft_calc = ff_fft_calc_interleave_vsx;
+#else
     s->fft_calc   = ff_fft_calc_interleave_altivec;
+#endif
     if (s->mdct_bits >= 5) {
         s->imdct_calc = imdct_calc_altivec;
         s->imdct_half = imdct_half_altivec;
diff --git a/libavcodec/ppc/fft_vsx.c b/libavcodec/ppc/fft_vsx.c
new file mode 100644
index 0000000000..e92975f74e
--- /dev/null
+++ b/libavcodec/ppc/fft_vsx.c
@@ -0,0 +1,227 @@
+/*
+ * FFT  transform, optimized with VSX built-in functions
+ * Copyright (c) 2014 Rong Yan
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft-internal.h"
+#include "fft_vsx.h"
+
+#if HAVE_VSX
+
+static void fft32_vsx_interleave(FFTComplex *z)
+{
+    fft16_vsx_interleave(z);
+    fft8_vsx_interleave(z+16);
+    fft8_vsx_interleave(z+24);
+    pass_vsx_interleave(z,ff_cos_32,4);
+}
+
+static void fft64_vsx_interleave(FFTComplex *z)
+{
+    fft32_vsx_interleave(z);
+    fft16_vsx_interleave(z+32);
+    fft16_vsx_interleave(z+48);
+    pass_vsx_interleave(z,ff_cos_64, 8);
+}
+static void fft128_vsx_interleave(FFTComplex *z)
+{
+    fft64_vsx_interleave(z);
+    fft32_vsx_interleave(z+64);
+    fft32_vsx_interleave(z+96);
+    pass_vsx_interleave(z,ff_cos_128,16);
+}
+static void fft256_vsx_interleave(FFTComplex *z)
+{
+    fft128_vsx_interleave(z);
+    fft64_vsx_interleave(z+128);
+    fft64_vsx_interleave(z+192);
+    pass_vsx_interleave(z,ff_cos_256,32);
+}
+static void fft512_vsx_interleave(FFTComplex *z)
+{
+    fft256_vsx_interleave(z);
+    fft128_vsx_interleave(z+256);
+    fft128_vsx_interleave(z+384);
+    pass_vsx_interleave(z,ff_cos_512,64);
+}
+static void fft1024_vsx_interleave(FFTComplex *z)
+{
+    fft512_vsx_interleave(z);
+    fft256_vsx_interleave(z+512);
+    fft256_vsx_interleave(z+768);
+    pass_vsx_interleave(z,ff_cos_1024,128);
+
+}
+static void fft2048_vsx_interleave(FFTComplex *z)
+{
+    fft1024_vsx_interleave(z);
+    fft512_vsx_interleave(z+1024);
+    fft512_vsx_interleave(z+1536);
+    pass_vsx_interleave(z,ff_cos_2048,256);
+}
+static void fft4096_vsx_interleave(FFTComplex *z)
+{
+    fft2048_vsx_interleave(z);
+    fft1024_vsx_interleave(z+2048);
+    fft1024_vsx_interleave(z+3072);
+    pass_vsx_interleave(z,ff_cos_4096, 512);
+}
+static void fft8192_vsx_interleave(FFTComplex *z)
+{
+    fft4096_vsx_interleave(z);
+    fft2048_vsx_interleave(z+4096);
+    fft2048_vsx_interleave(z+6144);
+    pass_vsx_interleave(z,ff_cos_8192,1024);
+}
+static void fft16384_vsx_interleave(FFTComplex *z)
+{
+    fft8192_vsx_interleave(z);
+    fft4096_vsx_interleave(z+8192);
+    fft4096_vsx_interleave(z+12288);
+    pass_vsx_interleave(z,ff_cos_16384,2048);
+}
+static void fft32768_vsx_interleave(FFTComplex *z)
+{
+    fft16384_vsx_interleave(z);
+    fft8192_vsx_interleave(z+16384);
+    fft8192_vsx_interleave(z+24576);
+    pass_vsx_interleave(z,ff_cos_32768,4096);
+}
+static void fft65536_vsx_interleave(FFTComplex *z)
+{
+    fft32768_vsx_interleave(z);
+    fft16384_vsx_interleave(z+32768);
+    fft16384_vsx_interleave(z+49152);
+    pass_vsx_interleave(z,ff_cos_65536,8192);
+}
+
+static void fft32_vsx(FFTComplex *z)
+{
+    fft16_vsx(z);
+    fft8_vsx(z+16);
+    fft8_vsx(z+24);
+    pass_vsx(z,ff_cos_32,4);
+}
+
+static void fft64_vsx(FFTComplex *z)
+{
+    fft32_vsx(z);
+    fft16_vsx(z+32);
+    fft16_vsx(z+48);
+    pass_vsx(z,ff_cos_64, 8);
+}
+static void fft128_vsx(FFTComplex *z)
+{
+    fft64_vsx(z);
+    fft32_vsx(z+64);
+    fft32_vsx(z+96);
+    pass_vsx(z,ff_cos_128,16);
+}
+static void fft256_vsx(FFTComplex *z)
+{
+    fft128_vsx(z);
+    fft64_vsx(z+128);
+    fft64_vsx(z+192);
+    pass_vsx(z,ff_cos_256,32);
+}
+static void fft512_vsx(FFTComplex *z)
+{
+    fft256_vsx(z);
+    fft128_vsx(z+256);
+    fft128_vsx(z+384);
+    pass_vsx(z,ff_cos_512,64);
+}
+static void fft1024_vsx(FFTComplex *z)
+{
+    fft512_vsx(z);
+    fft256_vsx(z+512);
+    fft256_vsx(z+768);
+    pass_vsx(z,ff_cos_1024,128);
+
+}
+static void fft2048_vsx(FFTComplex *z)
+{
+    fft1024_vsx(z);
+    fft512_vsx(z+1024);
+    fft512_vsx(z+1536);
+    pass_vsx(z,ff_cos_2048,256);
+}
+static void fft4096_vsx(FFTComplex *z)
+{
+    fft2048_vsx(z);
+    fft1024_vsx(z+2048);
+    fft1024_vsx(z+3072);
+    pass_vsx(z,ff_cos_4096, 512);
+}
+static void fft8192_vsx(FFTComplex *z)
+{
+    fft4096_vsx(z);
+    fft2048_vsx(z+4096);
+    fft2048_vsx(z+6144);
+    pass_vsx(z,ff_cos_8192,1024);
+}
+static void fft16384_vsx(FFTComplex *z)
+{
+    fft8192_vsx(z);
+    fft4096_vsx(z+8192);
+    fft4096_vsx(z+12288);
+    pass_vsx(z,ff_cos_16384,2048);
+}
+static void fft32768_vsx(FFTComplex *z)
+{
+    fft16384_vsx(z);
+    fft8192_vsx(z+16384);
+    fft8192_vsx(z+24576);
+    pass_vsx(z,ff_cos_32768,4096);
+}
+static void fft65536_vsx(FFTComplex *z)
+{
+    fft32768_vsx(z);
+    fft16384_vsx(z+32768);
+    fft16384_vsx(z+49152);
+    pass_vsx(z,ff_cos_65536,8192);
+}
+
+static void (* const fft_dispatch_vsx[])(FFTComplex*) = {
+    fft4_vsx, fft8_vsx, fft16_vsx, fft32_vsx, fft64_vsx, fft128_vsx, fft256_vsx, fft512_vsx, fft1024_vsx,
+    fft2048_vsx, fft4096_vsx, fft8192_vsx, fft16384_vsx, fft32768_vsx, fft65536_vsx,
+};
+static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {
+    fft4_vsx_interleave, fft8_vsx_interleave, fft16_vsx_interleave, fft32_vsx_interleave, fft64_vsx_interleave,
+    fft128_vsx_interleave, fft256_vsx_interleave, fft512_vsx_interleave, fft1024_vsx_interleave,
+    fft2048_vsx_interleave, fft4096_vsx_interleave, fft8192_vsx_interleave, fft16384_vsx_interleave, fft32768_vsx_interleave, fft65536_vsx_interleave,
+};
+void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
+{
+     fft_dispatch_vsx_interleave[s->nbits-2](z);
+}
+void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
+{
+     fft_dispatch_vsx[s->nbits-2](z);
+}
+#endif /* HAVE_VSX */
diff --git a/libavcodec/ppc/fft_vsx.h b/libavcodec/ppc/fft_vsx.h
new file mode 100644
index 0000000000..a85475d160
--- /dev/null
+++ b/libavcodec/ppc/fft_vsx.h
@@ -0,0 +1,830 @@
+#ifndef AVCODEC_PPC_FFT_VSX_H
+#define AVCODEC_PPC_FFT_VSX_H
+/*
+ * FFT  transform, optimized with VSX built-in functions
+ * Copyright (c) 2014 Rong Yan  Copyright (c) 2009 Loren Merritt
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft-internal.h"
+
+#if HAVE_VSX
+
+void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);
+
+
+#define byte_2complex (2*sizeof(FFTComplex))
+#define byte_4complex (4*sizeof(FFTComplex))
+#define byte_6complex (6*sizeof(FFTComplex))
+#define byte_8complex (8*sizeof(FFTComplex))
+#define byte_10complex (10*sizeof(FFTComplex))
+#define byte_12complex (12*sizeof(FFTComplex))
+#define byte_14complex (14*sizeof(FFTComplex))
+
+inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
+{
+    int o1 = n<<1;
+    int o2 = n<<2;
+    int o3 = o1+o2;
+    int i1, i2, i3;
+    FFTSample* out = (FFTSample*)z;
+    const FFTSample *wim = wre+o1;
+    vec_f vz0, vzo1, vzo2, vzo3;
+    vec_f x0, x1, x2, x3;
+    vec_f x4, x5, x6, x7;
+    vec_f x8, x9, x10, x11;
+    vec_f x12, x13, x14, x15;
+    vec_f x16, x17, x18, x19;
+    vec_f x20, x21, x22, x23;
+    vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
+    vec_f y0, y1, y2, y3;
+    vec_f y4, y5, y8, y9;
+    vec_f y10, y13, y14, y15;
+    vec_f y16, y17, y18, y19;
+    vec_f y20, y21, y22, y23;
+    vec_f wr1, wi1, wr0, wi0;
+    vec_f wr2, wi2, wr3, wi3;
+    vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;
+
+    n = n-2;
+    i1 = o1*sizeof(FFTComplex);
+    i2 = o2*sizeof(FFTComplex);
+    i3 = o3*sizeof(FFTComplex);
+    vzo2 = vec_ld(i2, &(out[0]));  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
+    vzo2plus1 = vec_ld(i2+16, &(out[0]));
+    vzo3 = vec_ld(i3, &(out[0]));  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
+    vzo3plus1 = vec_ld(i3+16, &(out[0]));
+    vz0 = vec_ld(0, &(out[0]));    // z0.r  z0.i  z1.r  z1.i
+    vz0plus1 = vec_ld(16, &(out[0]));
+    vzo1 = vec_ld(i1, &(out[0]));  // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
+    vzo1plus1 = vec_ld(i1+16, &(out[0]));
+
+    x0 = vec_add(vzo2, vzo3);
+    x1 = vec_sub(vzo2, vzo3);
+    y0 = vec_add(vzo2plus1, vzo3plus1);
+    y1 = vec_sub(vzo2plus1, vzo3plus1);
+
+    wr1 = vec_splats(wre[1]);
+    wi1 = vec_splats(wim[-1]);
+    wi2 = vec_splats(wim[-2]);
+    wi3 = vec_splats(wim[-3]);
+    wr2 = vec_splats(wre[2]);
+    wr3 = vec_splats(wre[3]);
+
+    x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
+    x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
+
+    y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
+    y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
+    y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
+    y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
+
+    ymulwi2 = vec_mul(y4, wi2);
+    ymulwi3 = vec_mul(y5, wi3);
+    x4 = vec_mul(x2, wr1);
+    x5 = vec_mul(x3, wi1);
+    y8 = vec_madd(y2, wr2, ymulwi2);
+    y9 = vec_msub(y2, wr2, ymulwi2);
+    x6 = vec_add(x4, x5);
+    x7 = vec_sub(x4, x5);
+    y13 = vec_madd(y3, wr3, ymulwi3);
+    y14 = vec_msub(y3, wr3, ymulwi3);
+
+    x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
+    y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
+    y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
+
+    x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
+    x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));
+
+    y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
+    y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
+
+    x11 = vec_add(vz0, x9);
+    x12 = vec_sub(vz0, x9);
+    x13 = vec_add(vzo1, x10);
+    x14 = vec_sub(vzo1, x10);
+
+    y18 = vec_add(vz0plus1, y16);
+    y19 = vec_sub(vz0plus1, y16);
+    y20 = vec_add(vzo1plus1, y17);
+    y21 = vec_sub(vzo1plus1, y17);
+
+    x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
+    x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
+    y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
+    y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
+
+
+    vec_st(x11, 0, &(out[0]));
+    vec_st(y18, 16, &(out[0]));
+    vec_st(x15, i1, &(out[0]));
+    vec_st(y22, i1+16, &(out[0]));
+    vec_st(x12, i2, &(out[0]));
+    vec_st(y19, i2+16, &(out[0]));
+    vec_st(x16, i3, &(out[0]));
+    vec_st(y23, i3+16, &(out[0]));
+
+    do {
+        out += 8;
+        wre += 4;
+        wim -= 4;
+        wr0 = vec_splats(wre[0]);
+        wr1 = vec_splats(wre[1]);
+        wi0 = vec_splats(wim[0]);
+        wi1 = vec_splats(wim[-1]);
+
+        wr2 = vec_splats(wre[2]);
+        wr3 = vec_splats(wre[3]);
+        wi2 = vec_splats(wim[-2]);
+        wi3 = vec_splats(wim[-3]);
+
+        vzo2 = vec_ld(i2, &(out[0]));  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
+        vzo2plus1 = vec_ld(i2+16, &(out[0]));
+        vzo3 = vec_ld(i3, &(out[0]));  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
+        vzo3plus1 = vec_ld(i3+16, &(out[0]));
+        vz0 = vec_ld(0, &(out[0]));    // z0.r  z0.i  z1.r  z1.i
+        vz0plus1 = vec_ld(16, &(out[0]));
+        vzo1 = vec_ld(i1, &(out[0])); // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
+        vzo1plus1 = vec_ld(i1+16, &(out[0]));
+
+        x0 = vec_add(vzo2, vzo3);
+        x1 = vec_sub(vzo2, vzo3);
+
+        y0 = vec_add(vzo2plus1, vzo3plus1);
+        y1 = vec_sub(vzo2plus1, vzo3plus1);
+
+        x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
+        x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
+        x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
+        x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
+
+        y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
+        y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
+        xmulwi0 = vec_mul(x4, wi0);
+        xmulwi1 = vec_mul(x5, wi1);
+
+        y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
+        y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
+
+        x8 = vec_madd(x2, wr0, xmulwi0);
+        x9 = vec_msub(x2, wr0, xmulwi0);
+        ymulwi2 = vec_mul(y4, wi2);
+        ymulwi3 = vec_mul(y5, wi3);
+
+        x13 = vec_madd(x3, wr1, xmulwi1);
+        x14 = vec_msub(x3, wr1, xmulwi1);
+
+        y8 = vec_madd(y2, wr2, ymulwi2);
+        y9 = vec_msub(y2, wr2, ymulwi2);
+        y13 = vec_madd(y3, wr3, ymulwi3);
+        y14 = vec_msub(y3, wr3, ymulwi3);
+
+        x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
+        x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));
+
+        y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
+        y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
+
+        x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
+        x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));
+
+        y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
+        y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
+
+        x18 = vec_add(vz0, x16);
+        x19 = vec_sub(vz0, x16);
+        x20 = vec_add(vzo1, x17);
+        x21 = vec_sub(vzo1, x17);
+
+        y18 = vec_add(vz0plus1, y16);
+        y19 = vec_sub(vz0plus1, y16);
+        y20 = vec_add(vzo1plus1, y17);
+        y21 = vec_sub(vzo1plus1, y17);
+
+        x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
+        x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));
+
+        y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
+        y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
+
+        vec_st(x18, 0, &(out[0]));
+        vec_st(y18, 16, &(out[0]));
+        vec_st(x22, i1, &(out[0]));
+        vec_st(y22, i1+16, &(out[0]));
+        vec_st(x19, i2, &(out[0]));
+        vec_st(y19, i2+16, &(out[0]));
+        vec_st(x23, i3, &(out[0]));
+        vec_st(y23, i3+16, &(out[0]));
+    } while (n-=2);
+}
+
+inline static void fft2_vsx_interleave(FFTComplex *z)
+{
+    FFTSample r1, i1;
+
+    r1 = z[0].re - z[1].re;
+    z[0].re += z[1].re;
+    z[1].re = r1;
+
+    i1 = z[0].im - z[1].im;
+    z[0].im += z[1].im;
+    z[1].im = i1;
+ }
+
+inline static void fft4_vsx_interleave(FFTComplex *z)
+{
+    vec_f a, b, c, d;
+    float* out=  (float*)z;
+    a = vec_ld(0, &(out[0]));
+    b = vec_ld(byte_2complex, &(out[0]));
+
+    c = vec_perm(a, b, vcprm(0,1,s2,s1));
+    d = vec_perm(a, b, vcprm(2,3,s0,s3));
+    a = vec_add(c, d);
+    b = vec_sub(c, d);
+
+    c = vec_perm(a, b, vcprm(0,1,s0,s1));
+    d = vec_perm(a, b, vcprm(2,3,s3,s2));
+
+    a = vec_add(c, d);
+    b = vec_sub(c, d);
+    vec_st(a, 0, &(out[0]));
+    vec_st(b, byte_2complex, &(out[0]));
+}
+
+inline static void fft8_vsx_interleave(FFTComplex *z)
+{
+    vec_f vz0, vz1, vz2, vz3;
+    vec_f x0, x1, x2, x3;
+    vec_f x4, x5, x6, x7;
+    vec_f x8, x9, x10, x11;
+    vec_f x12, x13, x14, x15;
+    vec_f x16, x17, x18, x19;
+    vec_f x20, x21, x22, x23;
+    vec_f x24, x25, x26, x27;
+    vec_f x28, x29, x30, x31;
+    vec_f x32, x33, x34;
+
+    float* out=  (float*)z;
+    vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+
+    vz0 = vec_ld(0, &(out[0]));
+    vz1 = vec_ld(byte_2complex, &(out[0]));
+    vz2 = vec_ld(byte_4complex, &(out[0]));
+    vz3 = vec_ld(byte_6complex, &(out[0]));
+
+    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+    x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
+    x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));
+
+    x4 = vec_add(x0, x1);
+    x5 = vec_sub(x0, x1);
+    x6 = vec_add(x2, x3);
+    x7 = vec_sub(x2, x3);
+
+    x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
+    x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
+    x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
+    x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));
+
+    x12 = vec_add(x8, x9);
+    x13 = vec_sub(x8, x9);
+    x14 = vec_add(x10, x11);
+    x15 = vec_sub(x10, x11);
+    x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
+    x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
+    x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
+    x19 = vec_add(x16, x18); // z0.r  z2.r  z0.i  z2.i
+    x20 = vec_sub(x16, x18); // z4.r  z6.r  z4.i  z6.i
+
+    x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
+    x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
+    x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
+    x24 = vec_add(x22, x23);
+    x25 = vec_sub(x22, x23);
+    x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);
+
+    x27 = vec_add(x21, x26); // z1.r  z7.r z1.i z3.i
+    x28 = vec_sub(x21, x26); //z5.r  z3.r z5.i z7.i
+
+    x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r  z0.i  z1.r  z1.i
+    x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r  z2.i  z7.r  z3.i
+    x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r  z4.i  z5.r  z5.i
+    x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r  z6.i  z3.r  z7.i
+    x33 = vec_perm(x30, x32, vcprm(0,1,s2,3));  // z2.r  z2.i  z3.r  z3.i
+    x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r  z6.i  z7.r  z7.i
+
+    vec_st(x29, 0, &(out[0]));
+    vec_st(x33, byte_2complex, &(out[0]));
+    vec_st(x31, byte_4complex, &(out[0]));
+    vec_st(x34, byte_6complex, &(out[0]));
+}
+
+inline static void fft16_vsx_interleave(FFTComplex *z)
+{
+    float* out=  (float*)z;
+    vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+    vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
+    vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
+    vec_f vz0, vz1, vz2, vz3;
+    vec_f vz4, vz5, vz6, vz7;
+    vec_f x0, x1, x2, x3;
+    vec_f x4, x5, x6, x7;
+    vec_f x8, x9, x10, x11;
+    vec_f x12, x13, x14, x15;
+    vec_f x16, x17, x18, x19;
+    vec_f x20, x21, x22, x23;
+    vec_f x24, x25, x26, x27;
+    vec_f x28, x29, x30, x31;
+    vec_f x32, x33, x34, x35;
+    vec_f x36, x37, x38, x39;
+    vec_f x40, x41, x42, x43;
+    vec_f x44, x45, x46, x47;
+    vec_f x48, x49, x50, x51;
+    vec_f x52, x53, x54, x55;
+    vec_f x56, x57, x58, x59;
+    vec_f x60, x61, x62, x63;
+    vec_f x64, x65, x66, x67;
+    vec_f x68, x69, x70, x71;
+    vec_f x72, x73, x74, x75;
+    vec_f x76, x77, x78, x79;
+    vec_f x80, x81, x82, x83;
+    vec_f x84, x85, x86;
+
+    vz0 = vec_ld(0, &(out[0]));
+    vz1 = vec_ld(byte_2complex, &(out[0]));
+    vz2 = vec_ld(byte_4complex, &(out[0]));
+    vz3 = vec_ld(byte_6complex, &(out[0]));
+    vz4 = vec_ld(byte_8complex, &(out[0]));
+    vz5 = vec_ld(byte_10complex, &(out[0]));
+    vz6 = vec_ld(byte_12complex, &(out[0]));
+    vz7 = vec_ld(byte_14complex, &(out[0]));
+
+    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+    x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
+    x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
+
+    x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
+    x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
+    x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
+    x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));
+
+    x8 = vec_add(x0, x1);
+    x9 = vec_sub(x0, x1);
+    x10 = vec_add(x2, x3);
+    x11 = vec_sub(x2, x3);
+
+    x12 = vec_add(x4, x5);
+    x13 = vec_sub(x4, x5);
+    x14 = vec_add(x6, x7);
+    x15 = vec_sub(x6, x7);
+
+    x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
+    x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
+    x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
+    x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
+    x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
+    x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
+    x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
+    x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));
+
+    x24 = vec_add(x16, x17);
+    x25 = vec_sub(x16, x17);
+    x26 = vec_add(x18, x19);
+    x27 = vec_sub(x18, x19);
+    x28 = vec_add(x20, x21);
+    x29 = vec_sub(x20, x21);
+    x30 = vec_add(x22, x23);
+    x31 = vec_sub(x22, x23);
+
+    x32 = vec_add(x24, x26);
+    x33 = vec_sub(x24, x26);
+    x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));
+
+    x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
+    x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
+    x37 = vec_add(x35, x36);
+    x38 = vec_sub(x35, x36);
+    x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));
+
+    x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
+    x41 = vec_perm(x26,  x37, vcprm(2,3,s3,s2));
+    x42 = vec_add(x40, x41);
+    x43 = vec_sub(x40, x41);
+    x44 = vec_mul(x42, vc0);
+    x45 = vec_mul(x43, vc0);
+
+    x46 = vec_add(x34, x39);  // z0.r  z0.i  z4.r  z4.i
+    x47 = vec_sub(x34, x39);  // z8.r  z8.i  z12.r  z12.i
+
+    x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
+    x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
+    x50 = vec_add(x48, x49);
+    x51 = vec_sub(x48, x49);
+    x52 = vec_mul(x50, vc1);
+    x53 = vec_mul(x50, vc2);
+    x54 = vec_mul(x51, vc1);
+    x55 = vec_mul(x51, vc2);
+
+    x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
+    x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
+    x58 = vec_add(x56, x57);
+    x59 = vec_sub(x56, x57);
+
+    x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
+    x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
+    x62 = vec_add(x52, x61);
+    x63 = vec_sub(x52, x61);
+    x64 = vec_add(x60, x53);
+    x65 = vec_sub(x60, x53);
+    x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
+    x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));
+
+    x68 = vec_add(x58, x66); // z1.r    z1.i  z3.r    z3.i
+    x69 = vec_sub(x58, x66); // z9.r    z9.i  z11.r  z11.i
+    x70 = vec_add(x59, x67); // z5.r    z5.i  z15.r  z15.i
+    x71 = vec_sub(x59, x67); // z13.r  z13.i z7.r   z7.i
+
+    x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
+    x73 = vec_add(x25, x72);
+    x74 = vec_sub(x25, x72);
+    x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
+    x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
+    x77 = vec_add(x75, x76); // z2.r   z2.i    z6.r    z6.i
+    x78 = vec_sub(x75, x76); // z10.r  z10.i  z14.r  z14.i
+
+    x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r  z0.i  z1.r  z1.i
+    x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r  z2.i  z3.r  z3.i
+    x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r  z4.i  z5.r  z5.i
+    x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r  z6.i  z7.r  z7.i
+    vec_st(x79, 0, &(out[0]));
+    vec_st(x80, byte_2complex, &(out[0]));
+    vec_st(x81, byte_4complex, &(out[0]));
+    vec_st(x82, byte_6complex, &(out[0]));
+    x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r  z8.i  z9.r  z9.i
+    x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r  z10.i  z11.r  z11.i
+    x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r  z12.i  z13.r  z13.i
+    x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r  z14.i  z15.r  z15.i
+    vec_st(x83, byte_8complex, &(out[0]));
+    vec_st(x84, byte_10complex, &(out[0]));
+    vec_st(x85, byte_12complex, &(out[0]));
+    vec_st(x86, byte_14complex, &(out[0]));
+}
+
+inline static void fft4_vsx(FFTComplex *z)
+{
+    vec_f a, b, c, d;
+    float* out=  (float*)z;
+    a = vec_ld(0, &(out[0]));
+    b = vec_ld(byte_2complex, &(out[0]));
+
+    c = vec_perm(a, b, vcprm(0,1,s2,s1));
+    d = vec_perm(a, b, vcprm(2,3,s0,s3));
+    a = vec_add(c, d);
+    b = vec_sub(c, d);
+
+    c = vec_perm(a,b, vcprm(0,s0,1,s1));
+    d = vec_perm(a, b, vcprm(2,s3,3,s2));
+
+    a = vec_add(c, d);
+    b = vec_sub(c, d);
+
+    c = vec_perm(a, b, vcprm(0,1,s0,s1));
+    d = vec_perm(a, b, vcprm(2,3,s2,s3));
+
+    vec_st(c, 0, &(out[0]));
+    vec_st(d, byte_2complex, &(out[0]));
+    return;
+}
+
+inline static void fft8_vsx(FFTComplex *z)
+{
+    vec_f vz0, vz1, vz2, vz3;
+    vec_f vz4, vz5, vz6, vz7, vz8;
+
+    float* out=  (float*)z;
+    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
+    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
+    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+
+    vz0 = vec_ld(0, &(out[0]));
+    vz1 = vec_ld(byte_2complex, &(out[0]));
+    vz2 = vec_ld(byte_4complex, &(out[0]));
+    vz3 = vec_ld(byte_6complex, &(out[0]));
+
+    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+    vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
+    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+
+    vz2 = vec_add(vz6, vz7);
+    vz3 = vec_sub(vz6, vz7);
+    vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
+
+    vz0 = vec_add(vz4, vz5);
+    vz1 = vec_sub(vz4, vz5);
+
+    vz3 = vec_madd(vz3, vc1, vc0);
+    vz3 = vec_madd(vz8, vc2, vz3);
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+    vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
+    vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
+
+    vz0 = vec_add(vz4, vz5);
+    vz1 = vec_sub(vz4, vz5);
+    vz2 = vec_add(vz6, vz7);
+    vz3 = vec_sub(vz6, vz7);
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+    vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
+    vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
+
+
+    vz2 = vec_sub(vz4, vz6);
+    vz3 = vec_sub(vz5, vz7);
+
+    vz0 = vec_add(vz4, vz6);
+    vz1 = vec_add(vz5, vz7);
+
+    vec_st(vz0, 0, &(out[0]));
+    vec_st(vz1, byte_2complex, &(out[0]));
+    vec_st(vz2, byte_4complex, &(out[0]));
+    vec_st(vz3, byte_6complex, &(out[0]));
+    return;
+}
+
+inline static void fft16_vsx(FFTComplex *z)
+{
+    float* out=  (float*)z;
+    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
+    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
+    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+    vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
+    vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
+    vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};
+
+    vec_f vz0, vz1, vz2, vz3;
+    vec_f vz4, vz5, vz6, vz7;
+    vec_f vz8, vz9, vz10, vz11;
+    vec_f vz12, vz13;
+
+    vz0 = vec_ld(byte_8complex, &(out[0]));
+    vz1 = vec_ld(byte_10complex, &(out[0]));
+    vz2 = vec_ld(byte_12complex, &(out[0]));
+    vz3 = vec_ld(byte_14complex, &(out[0]));
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
+    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));
+
+    vz0 = vec_add(vz4, vz5);
+    vz1= vec_sub(vz4, vz5);
+    vz2 = vec_add(vz6, vz7);
+    vz3 = vec_sub(vz6, vz7);
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+    vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));
+
+    vz0 = vec_add(vz4, vz5);
+    vz1 = vec_sub(vz4, vz5);
+    vz2 = vec_add(vz6, vz7);
+    vz3 = vec_sub(vz6, vz7);
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+
+    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
+    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
+
+    vz0 = vec_ld(0, &(out[0]));
+    vz1 = vec_ld(byte_2complex, &(out[0]));
+    vz2 = vec_ld(byte_4complex, &(out[0]));
+    vz3 = vec_ld(byte_6complex, &(out[0]));
+    vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+    vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
+    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+
+    vz2 = vec_add(vz10, vz11);
+    vz3 = vec_sub(vz10, vz11);
+    vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
+    vz0 = vec_add(vz8, vz9);
+    vz1 = vec_sub(vz8, vz9);
+
+    vz3 = vec_madd(vz3, vc1, vc0);
+    vz3 = vec_madd(vz12, vc2, vz3);
+    vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+    vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+    vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
+    vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
+
+    vz0 = vec_add(vz8, vz9);
+    vz1 = vec_sub(vz8, vz9);
+    vz2 = vec_add(vz10, vz11);
+    vz3 = vec_sub(vz10, vz11);
+
+    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+    vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
+    vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
+
+    vz2 = vec_sub(vz8, vz10);
+    vz3 = vec_sub(vz9, vz11);
+    vz0 = vec_add(vz8, vz10);
+    vz1 = vec_add(vz9, vz11);
+
+    vz8 = vec_madd(vz4, vc3, vc0);
+    vz9 = vec_madd(vz5, vc3, vc0);
+    vz10 = vec_madd(vz6, vc3, vc0);
+    vz11 = vec_madd(vz7, vc3, vc0);
+
+    vz8 = vec_madd(vz5, vc4, vz8);
+    vz9 = vec_madd(vz4, vc5, vz9);
+    vz10 = vec_madd(vz7, vc5, vz10);
+    vz11 = vec_madd(vz6, vc4, vz11);
+
+    vz12 = vec_sub(vz10, vz8);
+    vz10 = vec_add(vz10, vz8);
+
+    vz13 = vec_sub(vz9, vz11);
+    vz11 = vec_add(vz9, vz11);
+
+    vz4 = vec_sub(vz0, vz10);
+    vz0 = vec_add(vz0, vz10);
+
+    vz7= vec_sub(vz3, vz12);
+    vz3= vec_add(vz3, vz12);
+
+    vz5 = vec_sub(vz1, vz11);
+    vz1 = vec_add(vz1, vz11);
+
+    vz6 = vec_sub(vz2, vz13);
+    vz2 = vec_add(vz2, vz13);
+
+    vec_st(vz0, 0, &(out[0]));
+    vec_st(vz1, byte_2complex, &(out[0]));
+    vec_st(vz2, byte_4complex, &(out[0]));
+    vec_st(vz3, byte_6complex, &(out[0]));
+    vec_st(vz4, byte_8complex, &(out[0]));
+    vec_st(vz5, byte_10complex, &(out[0]));
+    vec_st(vz6, byte_12complex, &(out[0]));
+    vec_st(vz7, byte_14complex, &(out[0]));
+    return;
+
+}
+inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n)
+{
+    int o1 = n<<1;
+    int o2 = n<<2;
+    int o3 = o1+o2;
+    int i1, i2, i3;
+    FFTSample* out = (FFTSample*)z;
+    const FFTSample *wim = wre+o1;
+    vec_f v0, v1, v2, v3;
+    vec_f v4, v5, v6, v7;
+    vec_f v8, v9, v10, v11;
+    vec_f v12, v13;
+
+    n = n-2;
+    i1 = o1*sizeof(FFTComplex);
+    i2 = o2*sizeof(FFTComplex);
+    i3 = o3*sizeof(FFTComplex);
+
+    v8 = vec_ld(0, &(wre[0]));
+    v10 = vec_ld(0, &(wim[0]));
+    v9 = vec_ld(0, &(wim[-4]));
+    v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
+
+    v4 = vec_ld(i2, &(out[0]));
+    v5 = vec_ld(i2+16, &(out[0]));
+    v6 = vec_ld(i3, &(out[0]));
+    v7 = vec_ld(i3+16, &(out[0]));
+    v10 = vec_mul(v4, v8); // r2*wre
+    v11 = vec_mul(v5, v8); // i2*wre
+    v12 = vec_mul(v6, v8); // r3*wre
+    v13 = vec_mul(v7, v8); // i3*wre
+
+    v0 = vec_ld(0, &(out[0])); // r0
+    v3 = vec_ld(i1+16, &(out[0])); // i1
+    v10 = vec_madd(v5, v9, v10); // r2*wim
+    v11 = vec_nmsub(v4, v9, v11); // i2*wim
+    v12 = vec_nmsub(v7, v9, v12); // r3*wim
+    v13 = vec_madd(v6, v9, v13); // i3*wim
+
+    v1 = vec_ld(16, &(out[0])); // i0
+    v2 = vec_ld(i1, &(out[0])); // r1
+    v8 = vec_sub(v12, v10);
+    v12 = vec_add(v12, v10);
+    v9 = vec_sub(v11, v13);
+    v13 = vec_add(v11, v13);
+    v4 = vec_sub(v0, v12);
+    v0 = vec_add(v0, v12);
+    v7 = vec_sub(v3, v8);
+    v3 = vec_add(v3, v8);
+
+    vec_st(v0, 0, &(out[0])); // r0
+    vec_st(v3, i1+16, &(out[0])); // i1
+    vec_st(v4, i2, &(out[0])); // r2
+    vec_st(v7, i3+16, &(out[0]));// i3
+
+    v5 = vec_sub(v1, v13);
+    v1 = vec_add(v1, v13);
+    v6 = vec_sub(v2, v9);
+    v2 = vec_add(v2, v9);
+
+    vec_st(v1, 16, &(out[0])); // i0
+    vec_st(v2, i1, &(out[0])); // r1
+    vec_st(v5, i2+16, &(out[0])); // i2
+    vec_st(v6, i3, &(out[0])); // r3
+
+    do {
+        out += 8;
+        wre += 4;
+        wim -= 4;
+
+        v8 = vec_ld(0, &(wre[0]));
+        v10 = vec_ld(0, &(wim[0]));
+        v9 = vec_ld(0, &(wim[-4]));
+        v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
+
+        v4 = vec_ld(i2, &(out[0])); // r2
+        v5 = vec_ld(i2+16, &(out[0])); // i2
+        v6 = vec_ld(i3, &(out[0])); // r3
+        v7 = vec_ld(i3+16, &(out[0]));// i3
+        v10 = vec_mul(v4, v8); // r2*wre
+        v11 = vec_mul(v5, v8); // i2*wre
+        v12 = vec_mul(v6, v8); // r3*wre
+        v13 = vec_mul(v7, v8); // i3*wre
+
+        v0 = vec_ld(0, &(out[0])); // r0
+        v3 = vec_ld(i1+16, &(out[0])); // i1
+        v10 = vec_madd(v5, v9, v10); // r2*wim
+        v11 = vec_nmsub(v4, v9, v11); // i2*wim
+        v12 = vec_nmsub(v7, v9, v12); // r3*wim
+        v13 = vec_madd(v6, v9, v13); // i3*wim
+
+        v1 = vec_ld(16, &(out[0])); // i0
+        v2 = vec_ld(i1, &(out[0])); // r1
+        v8 = vec_sub(v12, v10);
+        v12 = vec_add(v12, v10);
+        v9 = vec_sub(v11, v13);
+        v13 = vec_add(v11, v13);
+        v4 = vec_sub(v0, v12);
+        v0 = vec_add(v0, v12);
+        v7 = vec_sub(v3, v8);
+        v3 = vec_add(v3, v8);
+
+        vec_st(v0, 0, &(out[0])); // r0
+        vec_st(v3, i1+16, &(out[0])); // i1
+        vec_st(v4, i2, &(out[0])); // r2
+        vec_st(v7, i3+16, &(out[0])); // i3
+
+        v5 = vec_sub(v1, v13);
+        v1 = vec_add(v1, v13);
+        v6 = vec_sub(v2, v9);
+        v2 = vec_add(v2, v9);
+
+        vec_st(v1, 16, &(out[0])); // i0
+        vec_st(v2, i1, &(out[0])); // r1
+        vec_st(v5, i2+16, &(out[0])); // i2
+        vec_st(v6, i3, &(out[0])); // r3
+    } while (n-=2);
+}
+
+#endif
+
+#endif /* AVCODEC_PPC_FFT_VSX_H */
diff --git a/libavcodec/ppc/fmtconvert_altivec.c b/libavcodec/ppc/fmtconvert_altivec.c
index 796c4314ea..7323eff0bd 100644
--- a/libavcodec/ppc/fmtconvert_altivec.c
+++ b/libavcodec/ppc/fmtconvert_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/fmtconvert.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 static void int32_to_float_fmul_scalar_altivec(float *dst, const int32_t *src,
                                                float mul, int len)
@@ -57,7 +57,7 @@ static void int32_to_float_fmul_scalar_altivec(float *dst, const int32_t *src,
 av_cold void ff_fmt_convert_init_ppc(FmtConvertContext *c,
                                      AVCodecContext *avctx)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/h264chroma_init.c b/libavcodec/ppc/h264chroma_init.c
index 178f239720..876efeca09 100644
--- a/libavcodec/ppc/h264chroma_init.c
+++ b/libavcodec/ppc/h264chroma_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/h264chroma.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
 
@@ -50,7 +50,7 @@
 
 av_cold void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     const int high_bit_depth = bit_depth > 8;
 
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
diff --git a/libavcodec/ppc/h264chroma_template.c b/libavcodec/ppc/h264chroma_template.c
index 293fef5c90..cb1e0957e1 100644
--- a/libavcodec/ppc/h264chroma_template.c
+++ b/libavcodec/ppc/h264chroma_template.c
@@ -1,30 +1,32 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/mem.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
 
 /* this code assume that stride % 16 == 0 */
 
 #define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
-        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
-        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
+        vsrc2ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc2uc);\
+        vsrc3ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc3uc);\
 \
         psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
         psum = vec_mladd(vB, vsrc1ssH, psum);\
@@ -49,8 +51,8 @@
 
 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
 \
-        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
-        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
+        vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);\
+        vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);\
 \
         psum = vec_mladd(vA, vsrc0ssH, v32ss);\
         psum = vec_mladd(vE, vsrc1ssH, psum);\
@@ -70,6 +72,43 @@
 #define noop(a) a
 #define add28(a) vec_add(v28ss, a)
 
+#if HAVE_BIGENDIAN
+#define GET_VSRC1(vs0, off, b, perm0, s){    \
+    vec_u8 vsrcCuc, vsrcDuc;                 \
+    vsrcCuc = vec_ld(off, s);                \
+    if (loadSecond){                         \
+        vsrcDuc = vec_ld(off + b, s);        \
+    } else                                   \
+        vsrcDuc = vsrcCuc;                   \
+                                             \
+    vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
+}
+#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
+    vec_u8 vsrcCuc, vsrcDuc;                         \
+    vsrcCuc = vec_ld(off, s);                        \
+    if (loadSecond){                                 \
+        vsrcDuc = vec_ld(off + b, s);                \
+    } else                                           \
+        vsrcDuc = vsrcCuc;                           \
+                                                     \
+    vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0);         \
+    if (reallyBadAlign){                             \
+        vs1 = vsrcDuc;                               \
+    } else                                           \
+        vs1 = vec_perm(vsrcCuc, vsrcDuc, perm1);     \
+ }
+
+#else
+
+#define GET_VSRC1(vs0, off, b, perm0, s){            \
+    vs0 = vec_vsx_ld(off, s);                        \
+ }
+#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
+    vs0 = vec_vsx_ld(off, s);                        \
+    vs1 = vec_vsx_ld(off + 1, s);                    \
+ }
+#endif /* HAVE_BIGENDIAN */
+
 #ifdef PREFIX_h264_chroma_mc8_altivec
 static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                     int stride, int h, int x, int y) {
@@ -80,23 +119,27 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                          ((    x) * (    y))};
     register int i;
     vec_u8 fperm;
-    const vec_s32 vABCD = vec_ld(0, ABCD);
-    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
-    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
-    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
-    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
     LOAD_ZERO;
+    const vec_s32 vABCD = vec_ld(0, ABCD);
+    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
+    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
+    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
+    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
     const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
     const vec_u16 v6us = vec_splat_u16(6);
-    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
-    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 
-    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
+    vec_u8 vsrcperm0, vsrcperm1;
     vec_u8 vsrc0uc, vsrc1uc;
     vec_s16 vsrc0ssH, vsrc1ssH;
-    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_u8 vsrc2uc, vsrc3uc;
     vec_s16 vsrc2ssH, vsrc3ssH, psum;
     vec_u8 vdst, ppsum, vfdst, fsum;
+#if HAVE_BIGENDIAN
+    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
+    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
+    vsrcperm0 = vec_lvsl(0, src);
+    vsrcperm1 = vec_lvsl(1, src);
+#endif
 
     if (((unsigned long)dst) % 16 == 0) {
         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
@@ -110,89 +153,28 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                          0x1C, 0x1D, 0x1E, 0x1F};
     }
 
-    vsrcAuc = vec_ld(0, src);
-
-    if (loadSecond)
-        vsrcBuc = vec_ld(16, src);
-    vsrcperm0 = vec_lvsl(0, src);
-    vsrcperm1 = vec_lvsl(1, src);
-
-    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
-    if (reallyBadAlign)
-        vsrc1uc = vsrcBuc;
-    else
-        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
+    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);
 
-    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
-    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
+    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);
+    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);
 
     if (ABCD[3]) {
-        if (!loadSecond) {// -> !reallyBadAlign
-            for (i = 0 ; i < h ; i++) {
-                vsrcCuc = vec_ld(stride + 0, src);
-                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-
-                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
-            }
-        } else {
-            vec_u8 vsrcDuc;
-            for (i = 0 ; i < h ; i++) {
-                vsrcCuc = vec_ld(stride + 0, src);
-                vsrcDuc = vec_ld(stride + 16, src);
-                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                if (reallyBadAlign)
-                    vsrc3uc = vsrcDuc;
-                else
-                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
-            }
+        for (i = 0 ; i < h ; i++) {
+            GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
+            CHROMA_MC8_ALTIVEC_CORE(v32ss, noop);
         }
     } else {
         const vec_s16 vE = vec_add(vB, vC);
         if (ABCD[2]) { // x == 0 B == 0
-            if (!loadSecond) {// -> !reallyBadAlign
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(stride + 0, src);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-
-                    vsrc0uc = vsrc1uc;
-                }
-            } else {
-                vec_u8 vsrcDuc;
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(stride + 0, src);
-                    vsrcDuc = vec_ld(stride + 15, src);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-
-                    vsrc0uc = vsrc1uc;
-                }
+            for (i = 0 ; i < h ; i++) {
+                GET_VSRC1(vsrc1uc, stride, 15, vsrcperm0, src);
+                CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
+                vsrc0uc = vsrc1uc;
             }
         } else { // y == 0 C == 0
-            if (!loadSecond) {// -> !reallyBadAlign
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(0, src);
-                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-                }
-            } else {
-                vec_u8 vsrcDuc;
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(0, src);
-                    vsrcDuc = vec_ld(15, src);
-                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                    if (reallyBadAlign)
-                        vsrc1uc = vsrcDuc;
-                    else
-                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-                }
+            for (i = 0 ; i < h ; i++) {
+               GET_VSRC(vsrc0uc, vsrc1uc, 0, 15, vsrcperm0, vsrcperm1, src);
+               CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
             }
         }
     }
@@ -209,23 +191,27 @@ static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, i
                          ((    x) * (    y))};
     register int i;
     vec_u8 fperm;
-    const vec_s32 vABCD = vec_ld(0, ABCD);
-    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
-    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
-    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
-    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
     LOAD_ZERO;
+    const vec_s32 vABCD = vec_ld(0, ABCD);
+    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
+    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
+    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
+    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
     const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
     const vec_u16 v6us  = vec_splat_u16(6);
-    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
-    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 
-    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
+    vec_u8 vsrcperm0, vsrcperm1;
     vec_u8 vsrc0uc, vsrc1uc;
     vec_s16 vsrc0ssH, vsrc1ssH;
-    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_u8 vsrc2uc, vsrc3uc;
     vec_s16 vsrc2ssH, vsrc3ssH, psum;
     vec_u8 vdst, ppsum, vfdst, fsum;
+#if HAVE_BIGENDIAN
+    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
+    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
+    vsrcperm0 = vec_lvsl(0, src);
+    vsrcperm1 = vec_lvsl(1, src);
+#endif
 
     if (((unsigned long)dst) % 16 == 0) {
         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
@@ -239,47 +225,14 @@ static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, i
                          0x1C, 0x1D, 0x1E, 0x1F};
     }
 
-    vsrcAuc = vec_ld(0, src);
-
-    if (loadSecond)
-        vsrcBuc = vec_ld(16, src);
-    vsrcperm0 = vec_lvsl(0, src);
-    vsrcperm1 = vec_lvsl(1, src);
-
-    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
-    if (reallyBadAlign)
-        vsrc1uc = vsrcBuc;
-    else
-        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
-
-    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
-    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
-
-    if (!loadSecond) {// -> !reallyBadAlign
-        for (i = 0 ; i < h ; i++) {
-
-
-            vsrcCuc = vec_ld(stride + 0, src);
+    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);
 
-            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
+    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc);
+    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc);
 
-            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
-        }
-    } else {
-        vec_u8 vsrcDuc;
-        for (i = 0 ; i < h ; i++) {
-            vsrcCuc = vec_ld(stride + 0, src);
-            vsrcDuc = vec_ld(stride + 16, src);
-
-            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-            if (reallyBadAlign)
-                vsrc3uc = vsrcDuc;
-            else
-                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
-        }
+    for (i = 0 ; i < h ; i++) {
+        GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
+        CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28);
     }
 }
 #endif
diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
index 90afbde118..3822c7f55a 100644
--- a/libavcodec/ppc/h264dsp.c
+++ b/libavcodec/ppc/h264dsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
 #include "libavcodec/h264data.h"
 #include "libavcodec/h264dsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 /****************************************************************************
  * IDCT transform:
@@ -62,10 +62,17 @@
     b2 = vec_mergeh( a1, a3 ); \
     b3 = vec_mergel( a1, a3 )
 
+#if HAVE_BIGENDIAN
+#define vdst_load(d)              \
+    vdst_orig = vec_ld(0, dst);   \
+    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);
+#else
+#define vdst_load(d) vdst = vec_vsx_ld(0, dst)
+#endif
+
 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va)                      \
-    vdst_orig = vec_ld(0, dst);                               \
-    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);          \
-    vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst);         \
+    vdst_load();                                              \
+    vdst_ss = (vec_s16) VEC_MERGEH(zero_u8v, vdst);           \
     va = vec_add(va, vdst_ss);                                \
     va_u8 = vec_packsu(va, zero_s16v);                        \
     va_u32 = vec_splat((vec_u32)va_u8, 0);                  \
@@ -165,26 +172,43 @@ static void h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
     d7 = vec_sub(b0v, b7v); \
 }
 
+#if HAVE_BIGENDIAN
+#define GET_2PERM(ldv, stv, d)  \
+    ldv = vec_lvsl(0, d);       \
+    stv = vec_lvsr(8, d);
+#define dstv_load(d)            \
+    vec_u8 hv = vec_ld( 0, d ); \
+    vec_u8 lv = vec_ld( 7, d);  \
+    vec_u8 dstv   = vec_perm( hv, lv, (vec_u8)perm_ldv );
+#define dest_unligned_store(d)                                 \
+    vec_u8 edgehv;                                             \
+    vec_u8 bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );  \
+    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv );       \
+    lv    = vec_sel( lv, bodyv, edgelv );                      \
+    vec_st( lv, 7, d );                                        \
+    hv    = vec_ld( 0, d );                                    \
+    edgehv = vec_perm( zero_u8v, sel, perm_stv );              \
+    hv    = vec_sel( hv, bodyv, edgehv );                      \
+    vec_st( hv, 0, d );
+#else
+
+#define GET_2PERM(ldv, stv, d) {}
+#define dstv_load(d) vec_u8 dstv = vec_vsx_ld(0, d)
+#define dest_unligned_store(d)\
+    vec_u8 dst8 = vec_perm((vec_u8)idstsum8, dstv, vcprm(2,3,s2,s3));\
+    vec_vsx_st(dst8, 0, d)
+#endif /* HAVE_BIGENDIAN */
+
 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
     /* unaligned load */                                       \
-    vec_u8 hv = vec_ld( 0, dest );                           \
-    vec_u8 lv = vec_ld( 7, dest );                           \
-    vec_u8 dstv   = vec_perm( hv, lv, (vec_u8)perm_ldv );  \
+    dstv_load(dest);                                           \
     vec_s16 idct_sh6 = vec_sra(idctv, sixv);                 \
-    vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv);   \
+    vec_u16 dst16 = (vec_u16)VEC_MERGEH(zero_u8v, dstv);   \
     vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16);  \
     vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum);        \
-    vec_u8 edgehv;                                           \
     /* unaligned store */                                      \
-    vec_u8 bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );\
-    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
-    lv    = vec_sel( lv, bodyv, edgelv );                      \
-    vec_st( lv, 7, dest );                                     \
-    hv    = vec_ld( 0, dest );                                 \
-    edgehv = vec_perm( zero_u8v, sel, perm_stv );              \
-    hv    = vec_sel( hv, bodyv, edgehv );                      \
-    vec_st( hv, 0, dest );                                     \
- }
+    dest_unligned_store(dest);\
+}
 
 static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
 {
@@ -192,8 +216,8 @@ static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
     vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
     vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
 
-    vec_u8 perm_ldv = vec_lvsl(0, dst);
-    vec_u8 perm_stv = vec_lvsr(8, dst);
+    vec_u8 perm_ldv, perm_stv;
+    GET_2PERM(perm_ldv, perm_stv, dst);
 
     const vec_u16 onev = vec_splat_u16(1);
     const vec_u16 twov = vec_splat_u16(2);
@@ -232,32 +256,41 @@ static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
     ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
 }
 
+#if HAVE_BIGENDIAN
+#define DST_LD vec_ld
+#else
+#define DST_LD vec_vsx_ld
+#endif
 static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size)
 {
     vec_s16 dc16;
     vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
+    vec_s32 v_dc32;
     LOAD_ZERO;
     DECLARE_ALIGNED(16, int, dc);
     int i;
 
     dc = (block[0] + 32) >> 6;
     block[0] = 0;
-    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);
+    v_dc32 = vec_lde(0, &dc);
+    dc16 = VEC_SPLAT16((vec_s16)v_dc32, 1);
 
     if (size == 4)
-        dc16 = vec_sld(dc16, zero_s16v, 8);
+        dc16 = VEC_SLD16(dc16, zero_s16v, 8);
     dcplus = vec_packsu(dc16, zero_s16v);
     dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);
 
+#if HAVE_BIGENDIAN
     aligner = vec_lvsr(0, dst);
     dcplus = vec_perm(dcplus, dcplus, aligner);
     dcminus = vec_perm(dcminus, dcminus, aligner);
+#endif
 
     for (i = 0; i < size; i += 4) {
-        v0 = vec_ld(0, dst+0*stride);
-        v1 = vec_ld(0, dst+1*stride);
-        v2 = vec_ld(0, dst+2*stride);
-        v3 = vec_ld(0, dst+3*stride);
+        v0 = DST_LD(0, dst+0*stride);
+        v1 = DST_LD(0, dst+1*stride);
+        v2 = DST_LD(0, dst+2*stride);
+        v3 = DST_LD(0, dst+3*stride);
 
         v0 = vec_adds(v0, dcplus);
         v1 = vec_adds(v1, dcplus);
@@ -269,10 +302,10 @@ static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *bl
         v2 = vec_subs(v2, dcminus);
         v3 = vec_subs(v3, dcminus);
 
-        vec_st(v0, 0, dst+0*stride);
-        vec_st(v1, 0, dst+1*stride);
-        vec_st(v2, 0, dst+2*stride);
-        vec_st(v3, 0, dst+3*stride);
+        VEC_ST(v0, 0, dst+0*stride);
+        VEC_ST(v1, 0, dst+1*stride);
+        VEC_ST(v2, 0, dst+2*stride);
+        VEC_ST(v3, 0, dst+3*stride);
 
         dst += 4*stride;
     }
@@ -633,6 +666,9 @@ void weight_h264_W_altivec(uint8_t *block, int stride, int height,
     temp[2] = offset;
 
     vtemp = (vec_s16)vec_ld(0, temp);
+#if !HAVE_BIGENDIAN
+    vtemp =(vec_s16)vec_perm(vtemp, vtemp, vcswapi2s(0,1,2,3));
+#endif
     vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
     vweight = vec_splat(vtemp, 3);
     voffset = vec_splat(vtemp, 5);
@@ -641,8 +677,8 @@ void weight_h264_W_altivec(uint8_t *block, int stride, int height,
     for (y = 0; y < height; y++) {
         vblock = vec_ld(0, block);
 
-        v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
-        v1 = (vec_s16)vec_mergel(zero_u8v, vblock);
+        v0 = (vec_s16)VEC_MERGEH(zero_u8v, vblock);
+        v1 = (vec_s16)VEC_MERGEL(zero_u8v, vblock);
 
         if (w == 16 || aligned) {
             v0 = vec_mladd(v0, vweight, zero_s16v);
@@ -679,6 +715,9 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
     temp[3] = offset;
 
     vtemp = (vec_s16)vec_ld(0, temp);
+#if !HAVE_BIGENDIAN
+    vtemp =(vec_s16)vec_perm(vtemp, vtemp, vcswapi2s(0,1,2,3));
+#endif
     vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
     vweights = vec_splat(vtemp, 3);
     vweightd = vec_splat(vtemp, 5);
@@ -690,10 +729,10 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
         vdst = vec_ld(0, dst);
         vsrc = vec_ld(0, src);
 
-        v0 = (vec_s16)vec_mergeh(zero_u8v, vdst);
-        v1 = (vec_s16)vec_mergel(zero_u8v, vdst);
-        v2 = (vec_s16)vec_mergeh(zero_u8v, vsrc);
-        v3 = (vec_s16)vec_mergel(zero_u8v, vsrc);
+        v0 = (vec_s16)VEC_MERGEH(zero_u8v, vdst);
+        v1 = (vec_s16)VEC_MERGEL(zero_u8v, vdst);
+        v2 = (vec_s16)VEC_MERGEH(zero_u8v, vsrc);
+        v3 = (vec_s16)VEC_MERGEL(zero_u8v, vsrc);
 
         if (w == 8) {
             if (src_aligned)
@@ -745,7 +784,7 @@ H264_WEIGHT( 8)
 av_cold void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth,
                                  const int chroma_format_idc)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/h264qpel.c b/libavcodec/ppc/h264qpel.c
index 92c86f32fc..575f504d32 100644
--- a/libavcodec/ppc/h264qpel.c
+++ b/libavcodec/ppc/h264qpel.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
 #include "libavcodec/h264qpel.h"
 #include "hpeldsp_altivec.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
@@ -191,86 +191,79 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, cons
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
 }\
 
+#if HAVE_BIGENDIAN
+#define put_unligned_store(s, dest) {    \
+    tmp1 = vec_ld(0, dest);              \
+    mask = vec_lvsl(0, dest);            \
+    tmp2 = vec_ld(15, dest);             \
+    edges = vec_perm(tmp2, tmp1, mask);  \
+    align = vec_lvsr(0, dest);           \
+    tmp2 = vec_perm(s, edges, align);    \
+    tmp1 = vec_perm(edges, s, align);    \
+    vec_st(tmp2, 15, dest);              \
+    vec_st(tmp1, 0 , dest);              \
+ }
+#else
+#define put_unligned_store(s, dest) vec_vsx_st(s, 0, dest);
+#endif /* HAVE_BIGENDIAN */
+
 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                     const uint8_t * src2, int dst_stride,
                                     int src_stride1, int h)
 {
     int i;
-    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
-
+    vec_u8 a, b, d, mask_;
+#if HAVE_BIGENDIAN
+    vec_u8 tmp1, tmp2, mask, edges, align;
     mask_ = vec_lvsl(0, src2);
+#endif
 
     for (i = 0; i < h; i++) {
-
-        tmp1 = vec_ld(i * src_stride1, src1);
-        mask = vec_lvsl(i * src_stride1, src1);
-        tmp2 = vec_ld(i * src_stride1 + 15, src1);
-
-        a = vec_perm(tmp1, tmp2, mask);
-
-        tmp1 = vec_ld(i * 16, src2);
-        tmp2 = vec_ld(i * 16 + 15, src2);
-
-        b = vec_perm(tmp1, tmp2, mask_);
-
-        tmp1 = vec_ld(0, dst);
-        mask = vec_lvsl(0, dst);
-        tmp2 = vec_ld(15, dst);
-
+        a = unaligned_load(i * src_stride1, src1);
+        b = load_with_perm_vec(i * 16, src2, mask_);
         d = vec_avg(a, b);
-
-        edges = vec_perm(tmp2, tmp1, mask);
-
-        align = vec_lvsr(0, dst);
-
-        tmp2 = vec_perm(d, edges, align);
-        tmp1 = vec_perm(edges, d, align);
-
-        vec_st(tmp2, 15, dst);
-        vec_st(tmp1, 0 , dst);
-
+        put_unligned_store(d, dst);
         dst += dst_stride;
     }
 }
 
+#if HAVE_BIGENDIAN
+#define avg_unligned_store(s, dest){            \
+    tmp1 = vec_ld(0, dest);                     \
+    mask = vec_lvsl(0, dest);                   \
+    tmp2 = vec_ld(15, dest);                    \
+    a = vec_avg(vec_perm(tmp1, tmp2, mask), s); \
+    edges = vec_perm(tmp2, tmp1, mask);         \
+    align = vec_lvsr(0, dest);                  \
+    tmp2 = vec_perm(a, edges, align);           \
+    tmp1 = vec_perm(edges, a, align);           \
+    vec_st(tmp2, 15, dest);                     \
+    vec_st(tmp1, 0 , dest);                     \
+ }
+#else
+#define avg_unligned_store(s, dest){            \
+    a = vec_avg(vec_vsx_ld(0, dst), s);         \
+    vec_vsx_st(a, 0, dst);                      \
+ }
+#endif /* HAVE_BIGENDIAN */
+
 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                     const uint8_t * src2, int dst_stride,
                                     int src_stride1, int h)
 {
     int i;
-    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+    vec_u8 a, b, d, mask_;
 
+#if HAVE_BIGENDIAN
+    vec_u8 tmp1, tmp2, mask, edges, align;
     mask_ = vec_lvsl(0, src2);
+#endif
 
     for (i = 0; i < h; i++) {
-
-        tmp1 = vec_ld(i * src_stride1, src1);
-        mask = vec_lvsl(i * src_stride1, src1);
-        tmp2 = vec_ld(i * src_stride1 + 15, src1);
-
-        a = vec_perm(tmp1, tmp2, mask);
-
-        tmp1 = vec_ld(i * 16, src2);
-        tmp2 = vec_ld(i * 16 + 15, src2);
-
-        b = vec_perm(tmp1, tmp2, mask_);
-
-        tmp1 = vec_ld(0, dst);
-        mask = vec_lvsl(0, dst);
-        tmp2 = vec_ld(15, dst);
-
-        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));
-
-        edges = vec_perm(tmp2, tmp1, mask);
-
-        align = vec_lvsr(0, dst);
-
-        tmp2 = vec_perm(d, edges, align);
-        tmp1 = vec_perm(edges, d, align);
-
-        vec_st(tmp2, 15, dst);
-        vec_st(tmp1, 0 , dst);
-
+        a = unaligned_load(i * src_stride1, src1);
+        b = load_with_perm_vec(i * 16, src2, mask_);
+        d = vec_avg(a, b);
+        avg_unligned_store(d, dst);
         dst += dst_stride;
     }
 }
@@ -286,7 +279,7 @@ H264_MC(avg_, 16, altivec)
 
 av_cold void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     const int high_bit_depth = bit_depth > 8;
 
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
diff --git a/libavcodec/ppc/h264qpel_template.c b/libavcodec/ppc/h264qpel_template.c
index fe83146e63..5ff72e32f0 100644
--- a/libavcodec/ppc/h264qpel_template.c
+++ b/libavcodec/ppc/h264qpel_template.c
@@ -1,30 +1,104 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/mem.h"
+#include "config.h"
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 
-#ifdef DEBUG
-#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F));
+#include "libavutil/avassert.h"
+#include "libavutil/mem.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+
+#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)ptr&0x0000000F));
+
+#if HAVE_BIGENDIAN
+#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
+    vec_u8 srcR1 = vec_ld(-2, s);\
+    vec_u8 srcR2 = vec_ld(14, s);\
+    switch (ali) {\
+    default: {\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = vec_perm(srcR1, srcR2, pp0);\
+        srcP1 = vec_perm(srcR1, srcR2, pp1);\
+        srcP2 = vec_perm(srcR1, srcR2, pp2);\
+        srcP3 = vec_perm(srcR1, srcR2, pp3);\
+    } break;\
+    case 11: {\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = vec_perm(srcR1, srcR2, pp0);\
+        srcP1 = vec_perm(srcR1, srcR2, pp1);\
+        srcP2 = vec_perm(srcR1, srcR2, pp2);\
+        srcP3 = srcR2;\
+    } break;\
+    case 12: {\
+        vec_u8 srcR3 = vec_ld(30, s);\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = vec_perm(srcR1, srcR2, pp0);\
+        srcP1 = vec_perm(srcR1, srcR2, pp1);\
+        srcP2 = srcR2;\
+        srcP3 = vec_perm(srcR2, srcR3, pp3);\
+    } break;\
+    case 13: {\
+        vec_u8 srcR3 = vec_ld(30, s);\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = vec_perm(srcR1, srcR2, pp0);\
+        srcP1 = srcR2;\
+        srcP2 = vec_perm(srcR2, srcR3, pp2);\
+        srcP3 = vec_perm(srcR2, srcR3, pp3);\
+    } break;\
+    case 14: {\
+        vec_u8 srcR3 = vec_ld(30, s);\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = srcR2;\
+        srcP1 = vec_perm(srcR2, srcR3, pp1);\
+        srcP2 = vec_perm(srcR2, srcR3, pp2);\
+        srcP3 = vec_perm(srcR2, srcR3, pp3);\
+    } break;\
+    case 15: {\
+        vec_u8 srcR3 = vec_ld(30, s);\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = srcR2;\
+        srcP0 = vec_perm(srcR2, srcR3, pp0);\
+        srcP1 = vec_perm(srcR2, srcR3, pp1);\
+        srcP2 = vec_perm(srcR2, srcR3, pp2);\
+        srcP3 = vec_perm(srcR2, srcR3, pp3);\
+    } break;\
+    }\
+ }
 #else
-#define ASSERT_ALIGNED(ptr) ;
-#endif
+#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
+    srcM2 =  vec_vsx_ld(-2, s);\
+    srcM1 = vec_vsx_ld(-1, s);\
+    srcP0 = vec_vsx_ld(0, s);\
+    srcP1 = vec_vsx_ld(1, s);\
+    srcP2 = vec_vsx_ld(2, s);\
+    srcP3 = vec_vsx_ld(3, s);\
+ }
+#endif /* HAVE_BIGENDIAN */
 
 /* this code assume stride % 16 == 0 */
 #ifdef PREFIX_h264_qpel16_h_lowpass_altivec
@@ -35,12 +109,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
     register int i;
 
     LOAD_ZERO;
-    const vec_u8 permM2 = vec_lvsl(-2, src);
-    const vec_u8 permM1 = vec_lvsl(-1, src);
-    const vec_u8 permP0 = vec_lvsl(+0, src);
-    const vec_u8 permP1 = vec_lvsl(+1, src);
-    const vec_u8 permP2 = vec_lvsl(+2, src);
-    const vec_u8 permP3 = vec_lvsl(+3, src);
+    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
     const vec_s16 v5ss = vec_splat_s16(5);
     const vec_u16 v5us = vec_splat_u16(5);
     const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
@@ -59,79 +128,32 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
 
     vec_u8 sum, fsum;
 
+#if HAVE_BIGENDIAN
+    permM2 = vec_lvsl(-2, src);
+    permM1 = vec_lvsl(-1, src);
+    permP0 = vec_lvsl(+0, src);
+    permP1 = vec_lvsl(+1, src);
+    permP2 = vec_lvsl(+2, src);
+    permP3 = vec_lvsl(+3, src);
+#endif /* HAVE_BIGENDIAN */
+
     for (i = 0 ; i < 16 ; i ++) {
-        vec_u8 srcR1 = vec_ld(-2, src);
-        vec_u8 srcR2 = vec_ld(14, src);
-
-        switch (align) {
-        default: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = vec_perm(srcR1, srcR2, permP3);
-        } break;
-        case 11: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = srcR2;
-        } break;
-        case 12: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = srcR2;
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 13: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = srcR2;
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 14: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = srcR2;
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 15: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = srcR2;
-            srcP0 = vec_perm(srcR2, srcR3, permP0);
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        }
-
-        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
-        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
-        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
-        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
-
-        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
-        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
-        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
-        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
-
-        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
-        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
-        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
-        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
+        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
+
+        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
+        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
+        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
+        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
+
+        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
+        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
+        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
+        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
+
+        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
+        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
+        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
+        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
 
         sum1A = vec_adds(srcP0A, srcP1A);
         sum1B = vec_adds(srcP0B, srcP1B);
@@ -178,7 +200,10 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
     register int i;
 
     LOAD_ZERO;
-    const vec_u8 perm = vec_lvsl(0, src);
+    vec_u8 perm;
+#if HAVE_BIGENDIAN
+    perm = vec_lvsl(0, src);
+#endif
     const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
     const vec_u16 v5us = vec_splat_u16(5);
     const vec_s16 v5ss = vec_splat_s16(5);
@@ -186,52 +211,41 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
 
     const uint8_t *srcbis = src - (srcStride * 2);
 
-    const vec_u8 srcM2a = vec_ld(0, srcbis);
-    const vec_u8 srcM2b = vec_ld(16, srcbis);
-    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcM1b = vec_ld(16, srcbis);
-    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcP0b = vec_ld(16, srcbis);
-    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcP1b = vec_ld(16, srcbis);
-    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcP2b = vec_ld(16, srcbis);
-    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
-    //srcbis += srcStride;
-
-    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
-    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
-    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
-    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
-    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
-    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
-    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
-    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
-    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
-    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
+    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+
+    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
+    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
+    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
+    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
+    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
+    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
+    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
+    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
+    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
+    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
 
     vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
               psumA, psumB, sumA, sumB,
               srcP3ssA, srcP3ssB,
               sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
 
-    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;
+    vec_u8 sum, fsum, srcP3;
 
     for (i = 0 ; i < 16 ; i++) {
-        srcP3a = vec_ld(0, srcbis += srcStride);
-        srcP3b = vec_ld(16, srcbis);
-        srcP3 = vec_perm(srcP3a, srcP3b, perm);
-        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
-        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
-        //srcbis += srcStride;
+        srcP3 = load_with_perm_vec(0, srcbis, perm);
+        srcbis += srcStride;
+
+        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
+        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
 
         sum1A = vec_adds(srcP0ssA, srcP1ssA);
         sum1B = vec_adds(srcP0ssB, srcP1ssB);
@@ -288,12 +302,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
 {
     register int i;
     LOAD_ZERO;
-    const vec_u8 permM2 = vec_lvsl(-2, src);
-    const vec_u8 permM1 = vec_lvsl(-1, src);
-    const vec_u8 permP0 = vec_lvsl(+0, src);
-    const vec_u8 permP1 = vec_lvsl(+1, src);
-    const vec_u8 permP2 = vec_lvsl(+2, src);
-    const vec_u8 permP3 = vec_lvsl(+3, src);
+    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
     const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
     const vec_u32 v10ui = vec_splat_u32(10);
     const vec_s16 v5ss = vec_splat_s16(5);
@@ -325,81 +334,35 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
     vec_u8 fsum, sumv, sum;
     vec_s16 ssume, ssumo;
 
+#if HAVE_BIGENDIAN
+    permM2 = vec_lvsl(-2, src);
+    permM1 = vec_lvsl(-1, src);
+    permP0 = vec_lvsl(+0, src);
+    permP1 = vec_lvsl(+1, src);
+    permP2 = vec_lvsl(+2, src);
+    permP3 = vec_lvsl(+3, src);
+#endif /* HAVE_BIGENDIAN */
+
     src -= (2 * srcStride);
     for (i = 0 ; i < 21 ; i ++) {
         vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
-        vec_u8 srcR1 = vec_ld(-2, src);
-        vec_u8 srcR2 = vec_ld(14, src);
-
-        switch (align) {
-        default: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = vec_perm(srcR1, srcR2, permP3);
-        } break;
-        case 11: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = srcR2;
-        } break;
-        case 12: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = srcR2;
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 13: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = srcR2;
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 14: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = srcR2;
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 15: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = srcR2;
-            srcP0 = vec_perm(srcR2, srcR3, permP0);
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        }
-
-        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
-        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
-        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
-        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
-
-        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
-        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
-        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
-        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
-
-        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
-        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
-        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
-        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
+
+        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
+
+        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
+        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
+        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
+        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
+
+        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
+        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
+        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
+        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
+
+        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
+        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
+        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
+        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
 
         sum1A = vec_adds(srcP0A, srcP1A);
         sum1B = vec_adds(srcP0B, srcP1B);
@@ -448,8 +411,8 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
         const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
         const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
         const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
-        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
-        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
+        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
+        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
 
         tmpbis += tmpStride;
 
@@ -474,10 +437,14 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
         pp2Be = vec_mule(sum2B, v5ss);
         pp2Bo = vec_mulo(sum2B, v5ss);
 
-        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
         pp3Ao = vec_mulo(sum3A, v1ss);
-        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
         pp3Bo = vec_mulo(sum3B, v1ss);
+#if !HAVE_BIGENDIAN
+        sum3A = (vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
+        sum3B = (vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
+#endif
+        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
+        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
 
         pp1cAe = vec_add(pp1Ae, v512si);
         pp1cAo = vec_add(pp1Ao, v512si);
diff --git a/libavcodec/ppc/hpeldsp_altivec.c b/libavcodec/ppc/hpeldsp_altivec.c
index 780240ba20..87a1f05b6a 100644
--- a/libavcodec/ppc/hpeldsp_altivec.c
+++ b/libavcodec/ppc/hpeldsp_altivec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,16 +34,15 @@
 #include "libavcodec/hpeldsp.h"
 #include "hpeldsp_altivec.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 /* next one assumes that ((line_size % 16) == 0) */
 void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
-    register vector unsigned char pixelsv1, pixelsv2;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
+    register vector unsigned char pixelsv1;
+    register vector unsigned char pixelsv1B;
+    register vector unsigned char pixelsv1C;
+    register vector unsigned char pixelsv1D;
 
-    register vector unsigned char perm = vec_lvsl(0, pixels);
     int i;
     register ptrdiff_t line_size_2 = line_size << 1;
     register ptrdiff_t line_size_3 = line_size + line_size_2;
@@ -55,22 +54,14 @@ void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t li
 // -funroll-loops w/ this is bad - 74 cycles again.
 // all this is on a 7450, tuning for the 7450
     for (i = 0; i < h; i += 4) {
-        pixelsv1  = vec_ld( 0, pixels);
-        pixelsv2  = vec_ld(15, pixels);
-        pixelsv1B = vec_ld(line_size, pixels);
-        pixelsv2B = vec_ld(15 + line_size, pixels);
-        pixelsv1C = vec_ld(line_size_2, pixels);
-        pixelsv2C = vec_ld(15 + line_size_2, pixels);
-        pixelsv1D = vec_ld(line_size_3, pixels);
-        pixelsv2D = vec_ld(15 + line_size_3, pixels);
-        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
-               0, (unsigned char*)block);
-        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
-               line_size, (unsigned char*)block);
-        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
-               line_size_2, (unsigned char*)block);
-        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
-               line_size_3, (unsigned char*)block);
+        pixelsv1  = unaligned_load( 0, pixels);
+        pixelsv1B = unaligned_load(line_size, pixels);
+        pixelsv1C = unaligned_load(line_size_2, pixels);
+        pixelsv1D = unaligned_load(line_size_3, pixels);
+        VEC_ST(pixelsv1, 0, (unsigned char*)block);
+        VEC_ST(pixelsv1B, line_size, (unsigned char*)block);
+        VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
+        VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
         pixels+=line_size_4;
         block +=line_size_4;
     }
@@ -80,15 +71,12 @@ void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t li
 #define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
-    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
-    register vector unsigned char perm = vec_lvsl(0, pixels);
-    int i;
+    register vector unsigned char pixelsv, blockv;
 
+    int i;
     for (i = 0; i < h; i++) {
-        pixelsv1 = vec_ld( 0, pixels);
-        pixelsv2 = vec_ld(16,pixels);
         blockv = vec_ld(0, block);
-        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
+        pixelsv = VEC_LD( 0, pixels);
         blockv = vec_avg(blockv,pixelsv);
         vec_st(blockv, 0, (unsigned char*)block);
         pixels+=line_size;
@@ -108,9 +96,7 @@ static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff
        int rightside = ((unsigned long)block & 0x0000000F);
 
        blockv = vec_ld(0, block);
-       pixelsv1 = vec_ld( 0, pixels);
-       pixelsv2 = vec_ld(16, pixels);
-       pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
+       pixelsv = VEC_LD( 0, pixels);
 
        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
@@ -132,21 +118,16 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
-    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned char blockv;
     register vector unsigned short pixelssum1, pixelssum2, temp3;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vctwo);
@@ -155,17 +136,10 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
@@ -191,22 +165,16 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
-    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned char blockv;
     register vector unsigned short pixelssum1, pixelssum2, temp3;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vcone);
@@ -215,17 +183,10 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
@@ -251,24 +212,18 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned char blockv;
     register vector unsigned short temp3, temp4,
         pixelssum1, pixelssum2, pixelssum3, pixelssum4;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv3 = vec_mergel(vczero, pixelsv1);
-    pixelsv4 = vec_mergel(vczero, pixelsv2);
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum3 = vec_add(pixelssum3, vctwo);
@@ -279,20 +234,13 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
     for (i = 0; i < h ; i++) {
         blockv = vec_ld(0, block);
 
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv3 = vec_mergel(vczero, pixelsv1);
-        pixelsv4 = vec_mergel(vczero, pixelsv2);
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
 
+        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                              (vector unsigned short)pixelsv4);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
@@ -319,25 +267,19 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned char blockv;
     register vector unsigned short temp3, temp4,
         pixelssum1, pixelssum2, pixelssum3, pixelssum4;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv3 = vec_mergel(vczero, pixelsv1);
-    pixelsv4 = vec_mergel(vczero, pixelsv2);
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum3 = vec_add(pixelssum3, vcone);
@@ -346,22 +288,13 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
     pixelssum1 = vec_add(pixelssum1, vcone);
 
     for (i = 0; i < h ; i++) {
-        blockv = vec_ld(0, block);
-
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv3 = vec_mergel(vczero, pixelsv1);
-        pixelsv4 = vec_mergel(vczero, pixelsv2);
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
 
+        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                              (vector unsigned short)pixelsv4);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
@@ -376,7 +309,7 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
 
         blockv = vec_packsu(temp3, temp4);
 
-        vec_st(blockv, 0, block);
+        VEC_ST(blockv, 0, block);
 
         block += line_size;
         pixels += line_size;
@@ -388,7 +321,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
-    register vector unsigned char blockv, temp1, temp2, blocktemp;
+    register vector unsigned char blockv, blocktemp;
     register vector unsigned short pixelssum1, pixelssum2, temp3;
 
     register const vector unsigned char vczero = (const vector unsigned char)
@@ -396,16 +329,10 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
     register const vector unsigned short vctwo = (const vector unsigned short)
                                         vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vctwo);
@@ -414,17 +341,11 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
 
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
@@ -449,7 +370,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
 
 av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/hpeldsp_altivec.h b/libavcodec/ppc/hpeldsp_altivec.h
index 98dd80ea1c..590809f539 100644
--- a/libavcodec/ppc/hpeldsp_altivec.h
+++ b/libavcodec/ppc/hpeldsp_altivec.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/huffyuvdsp_altivec.c b/libavcodec/ppc/huffyuvdsp_altivec.c
index 4c16283f07..6701524e4a 100644
--- a/libavcodec/ppc/huffyuvdsp_altivec.c
+++ b/libavcodec/ppc/huffyuvdsp_altivec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,8 +32,8 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/huffyuvdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
-static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
+#if HAVE_ALTIVEC
+static void add_bytes_altivec(uint8_t *dst, uint8_t *src, intptr_t w)
 {
     register int i;
     register vector unsigned char vdst, vsrc;
@@ -53,7 +53,7 @@ static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
 
 av_cold void ff_huffyuvdsp_init_ppc(HuffYUVDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/idctdsp.c b/libavcodec/ppc/idctdsp.c
index c85b58e6ff..80e71fdaf0 100644
--- a/libavcodec/ppc/idctdsp.c
+++ b/libavcodec/ppc/idctdsp.c
@@ -1,28 +1,28 @@
 /*
  * Copyright (c) 2001 Michel Lespinasse
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /* NOTE: This code is based on GPL code from the libmpeg2 project.  The
  * author, Michel Lespinasses, has given explicit permission to release
- * under LGPL as part of Libav.
+ * under LGPL as part of FFmpeg.
  *
- * Libav integration by Dieter Shirley
+ * FFmpeg integration by Dieter Shirley
  *
  * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
  * project.  I've deleted all of the libmpeg2-specific code, renamed the
@@ -43,7 +43,7 @@
 #include "libavutil/ppc/types_altivec.h"
 #include "libavcodec/idctdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 #define IDCT_HALF                                       \
     /* 1st stage */                                     \
@@ -153,6 +153,22 @@ static const vec_s16 constants[5] = {
     { 19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722 }
 };
 
+static void idct_altivec(int16_t *blk)
+{
+    vec_s16 *block = (vec_s16 *) blk;
+
+    IDCT;
+
+    block[0] = vx0;
+    block[1] = vx1;
+    block[2] = vx2;
+    block[3] = vx3;
+    block[4] = vx4;
+    block[5] = vx5;
+    block[6] = vx6;
+    block[7] = vx7;
+}
+
 static void idct_put_altivec(uint8_t *dest, int stride, int16_t *blk)
 {
     vec_s16 *block = (vec_s16 *) blk;
@@ -193,16 +209,26 @@ static void idct_add_altivec(uint8_t *dest, int stride, int16_t *blk)
 
     IDCT;
 
+#if HAVE_BIGENDIAN
     p0    = vec_lvsl(0, dest);
     p1    = vec_lvsl(stride, dest);
     p     = vec_splat_u8(-1);
     perm0 = vec_mergeh(p, p0);
     perm1 = vec_mergeh(p, p1);
+#endif
 
-#define ADD(dest, src, perm)                                \
-    /* *(uint64_t *) &tmp = *(uint64_t *) dest; */          \
+#if HAVE_BIGENDIAN
+#define GET_TMP2(dest, prm)                                 \
     tmp  = vec_ld(0, dest);                                 \
-    tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, perm);    \
+    tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, prm);
+#else
+#define GET_TMP2(dest, prm)                                 \
+    tmp  = vec_vsx_ld(0, dest);                             \
+    tmp2 = (vec_s16) vec_mergeh(tmp, (vec_u8) zero)
+#endif
+
+#define ADD(dest, src, perm)                                \
+    GET_TMP2(dest, perm);                                   \
     tmp3 = vec_adds(tmp2, src);                             \
     tmp  = vec_packsu(tmp3, tmp3);                          \
     vec_ste((vec_u32) tmp, 0, (unsigned int *) dest);       \
@@ -230,13 +256,14 @@ static void idct_add_altivec(uint8_t *dest, int stride, int16_t *blk)
 av_cold void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
                                  unsigned high_bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
-    if (!high_bit_depth) {
-        if ((avctx->idct_algo == FF_IDCT_AUTO) ||
+    if (!high_bit_depth && avctx->lowres == 0) {
+        if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
             (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
+            c->idct      = idct_altivec;
             c->idct_add  = idct_add_altivec;
             c->idct_put  = idct_put_altivec;
             c->perm_type = FF_IDCT_PERM_TRANSPOSE;
diff --git a/libavcodec/ppc/apedsp_altivec.c b/libavcodec/ppc/lossless_audiodsp_altivec.c
index 3b9d045b82..bdec25223d 100644
--- a/libavcodec/ppc/apedsp_altivec.c
+++ b/libavcodec/ppc/lossless_audiodsp_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,9 +27,23 @@
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
 #include "libavutil/ppc/types_altivec.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_BIGENDIAN
+#define GET_T(tt0,tt1,src,a,b){       \
+        a = vec_ld(16, src);          \
+        tt0 = vec_perm(b, a, align);  \
+        b = vec_ld(32, src);          \
+        tt1 = vec_perm(a, b, align);  \
+ }
+#else
+#define GET_T(tt0,tt1,src,a,b){       \
+        tt0 = vec_vsx_ld(0, src);     \
+        tt1 = vec_vsx_ld(16, src);    \
+ }
+#endif
+
+#if HAVE_ALTIVEC
 static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                     const int16_t *v2,
                                                     const int16_t *v3,
@@ -38,26 +52,23 @@ static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
     LOAD_ZERO;
     vec_s16 *pv1 = (vec_s16 *) v1;
     register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
-    register vec_s16 t0, t1, i0, i1, i4;
-    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
+    register vec_s16 t0, t1, i0, i1, i4, i2, i3;
     register vec_s32 res = zero_s32v;
+#if HAVE_BIGENDIAN
     register vec_u8 align = vec_lvsl(0, v2);
+    i2 = vec_ld(0, v2);
+    i3 = vec_ld(0, v3);
+#endif
     int32_t ires;
 
     order >>= 4;
     do {
-        i1     = vec_ld(16, v2);
-        t0     = vec_perm(i2, i1, align);
-        i2     = vec_ld(32, v2);
-        t1     = vec_perm(i1, i2, align);
+        GET_T(t0,t1,v2,i1,i2);
         i0     = pv1[0];
         i1     = pv1[1];
         res    = vec_msum(t0, i0, res);
         res    = vec_msum(t1, i1, res);
-        i4     = vec_ld(16, v3);
-        t0     = vec_perm(i3, i4, align);
-        i3     = vec_ld(32, v3);
-        t1     = vec_perm(i4, i3, align);
+        GET_T(t0,t1,v3,i4,i3);
         pv1[0] = vec_mladd(t0, muls, i0);
         pv1[1] = vec_mladd(t1, muls, i1);
         pv1   += 2;
@@ -71,9 +82,9 @@ static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
 }
 #endif /* HAVE_ALTIVEC */
 
-av_cold void ff_apedsp_init_ppc(APEDSPContext *c)
+av_cold void ff_llauddsp_init_ppc(LLAudDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/mathops.h b/libavcodec/ppc/mathops.h
index 34ddb11800..dbd714fcd4 100644
--- a/libavcodec/ppc/mathops.h
+++ b/libavcodec/ppc/mathops.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2001, 2002 Fabrice Bellard
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/me_cmp.c b/libavcodec/ppc/me_cmp.c
index b074d28b5c..9f75ed256a 100644
--- a/libavcodec/ppc/me_cmp.c
+++ b/libavcodec/ppc/me_cmp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,27 +34,44 @@
 #include "libavcodec/mpegvideo.h"
 #include "libavcodec/me_cmp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
+
+#if HAVE_BIGENDIAN
+#define GET_PERM(per1, per2, pix) {\
+    per1 = vec_lvsl(0, pix);\
+    per2 = vec_add(per1, vec_splat_u8(1));\
+}
+#define LOAD_PIX(v, iv, pix, per1, per2) {\
+    vector unsigned char pix2l  = vec_ld(0,  pix);\
+    vector unsigned char pix2r  = vec_ld(16, pix);\
+    v  = vec_perm(pix2l, pix2r, per1);\
+    iv = vec_perm(pix2l, pix2r, per2);\
+}
+#else
+#define GET_PERM(per1, per2, pix) {}
+#define LOAD_PIX(v, iv, pix, per1, per2) {\
+    v  = vec_vsx_ld(0,  pix);\
+    iv = vec_vsx_ld(1,  pix);\
+}
+#endif
 static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h)
 {
-    int i, s = 0;
+    int i;
+    int __attribute__((aligned(16))) s = 0;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
-    vector unsigned char perm1 = vec_lvsl(0, pix2);
-    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
+    vector unsigned char perm1, perm2, pix2v, pix2iv;
 
+    GET_PERM(perm1, perm2, pix2);
     for (i = 0; i < h; i++) {
         /* Read unaligned pixels into our vectors. The vectors are as follows:
          * pix1v: pix1[0] - pix1[15]
          * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
         vector unsigned char pix1v  = vec_ld(0,  pix1);
-        vector unsigned char pix2l  = vec_ld(0,  pix2);
-        vector unsigned char pix2r  = vec_ld(16, pix2);
-        vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
-        vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
+        LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
 
         /* Calculate the average vector. */
         vector unsigned char avgv = vec_avg(pix2v, pix2iv);
@@ -80,13 +97,14 @@ static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h)
 {
-    int i, s = 0;
+    int i;
+    int  __attribute__((aligned(16))) s = 0;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
-    vector unsigned char perm = vec_lvsl(0, pix2);
     vector unsigned char pix1v, pix3v, avgv, t5;
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
+
     uint8_t *pix3 = pix2 + stride;
 
     /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
@@ -96,19 +114,14 @@ static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
      * Read unaligned pixels into our vectors. The vectors are as follows:
      * pix2v: pix2[0] - pix2[15]
      * Split the pixel vectors into shorts. */
-    vector unsigned char pix2l = vec_ld(0,  pix2);
-    vector unsigned char pix2r = vec_ld(15, pix2);
-    vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm);
+    vector unsigned char pix2v = VEC_LD(0, pix2);
 
     for (i = 0; i < h; i++) {
         /* Read unaligned pixels into our vectors. The vectors are as follows:
          * pix1v: pix1[0] - pix1[15]
          * pix3v: pix3[0] - pix3[15] */
         pix1v = vec_ld(0,  pix1);
-
-        pix2l = vec_ld(0,  pix3);
-        pix2r = vec_ld(15, pix3);
-        pix3v = vec_perm(pix2l, pix2r, perm);
+        pix3v = VEC_LD(0,  pix3);
 
         /* Calculate the average vector. */
         avgv = vec_avg(pix2v, pix3v);
@@ -134,20 +147,21 @@ static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                              ptrdiff_t stride, int h)
 {
-    int i, s = 0;
+    int i;
+    int  __attribute__((aligned(16))) s = 0;
     uint8_t *pix3 = pix2 + stride;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
     const vector unsigned short two =
         (const vector unsigned short) vec_splat_u16(2);
     vector unsigned char avgv, t5;
-    vector unsigned char perm1 = vec_lvsl(0, pix2);
-    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
     vector unsigned char pix1v, pix3v, pix3iv;
     vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
     vector unsigned short avghv, avglv;
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
+    vector unsigned char perm1, perm2, pix2v, pix2iv;
+    GET_PERM(perm1, perm2, pix2);
 
     /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
      * iteration becomes pix2 in the next iteration. We can use this
@@ -156,19 +170,16 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
      * Read unaligned pixels into our vectors. The vectors are as follows:
      * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
      * Split the pixel vectors into shorts. */
-    vector unsigned char pix2l  = vec_ld(0,  pix2);
-    vector unsigned char pix2r  = vec_ld(16, pix2);
-    vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
-    vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
-
+    LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
     vector unsigned short pix2hv  =
-        (vector unsigned short) vec_mergeh(zero, pix2v);
+        (vector unsigned short) VEC_MERGEH(zero, pix2v);
     vector unsigned short pix2lv  =
-        (vector unsigned short) vec_mergel(zero, pix2v);
+        (vector unsigned short) VEC_MERGEL(zero, pix2v);
     vector unsigned short pix2ihv =
-        (vector unsigned short) vec_mergeh(zero, pix2iv);
+        (vector unsigned short) VEC_MERGEH(zero, pix2iv);
     vector unsigned short pix2ilv =
-        (vector unsigned short) vec_mergel(zero, pix2iv);
+        (vector unsigned short) VEC_MERGEL(zero, pix2iv);
+
     vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
     vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
     vector unsigned short t3, t4;
@@ -178,11 +189,7 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
          * pix1v: pix1[0] - pix1[15]
          * pix3v: pix3[0] - pix3[15]      pix3iv: pix3[1] - pix3[16] */
         pix1v  = vec_ld(0, pix1);
-
-        pix2l  = vec_ld(0, pix3);
-        pix2r  = vec_ld(16, pix3);
-        pix3v  = vec_perm(pix2l, pix2r, perm1);
-        pix3iv = vec_perm(pix2l, pix2r, perm2);
+        LOAD_PIX(pix3v, pix3iv, pix3, perm1, perm2);
 
         /* Note that AltiVec does have vec_avg, but this works on vector pairs
          * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
@@ -191,10 +198,10 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
          * vectors of shorts and do the averaging by hand. */
 
         /* Split the pixel vectors into shorts. */
-        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
-        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
-        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
-        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
+        pix3hv  = (vector unsigned short) VEC_MERGEH(zero, pix3v);
+        pix3lv  = (vector unsigned short) VEC_MERGEL(zero, pix3v);
+        pix3ihv = (vector unsigned short) VEC_MERGEH(zero, pix3iv);
+        pix3ilv = (vector unsigned short) VEC_MERGEL(zero, pix3iv);
 
         /* Do the averaging on them. */
         t3 = vec_add(pix3hv, pix3ihv);
@@ -229,19 +236,17 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h)
 {
-    int i, s;
+    int i;
+    int  __attribute__((aligned(16))) s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
-    vector unsigned char perm = vec_lvsl(0, pix2);
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
 
     for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2. */
-        vector unsigned char pix2l = vec_ld(0,  pix2);
-        vector unsigned char pix2r = vec_ld(15, pix2);
-        vector unsigned char t1 = vec_ld(0, pix1);
-        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
+        vector unsigned char t1 =vec_ld(0, pix1);
+        vector unsigned char t2 = VEC_LD(0, pix2);
 
         /* Calculate a sum of abs differences vector. */
         vector unsigned char t3 = vec_max(t1, t2);
@@ -266,14 +271,13 @@ static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
 {
-    int i, s;
+    int i;
+    int  __attribute__((aligned(16))) s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
     const vector unsigned char permclear =
         (vector unsigned char)
         { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
-    vector unsigned char perm1 = vec_lvsl(0, pix1);
-    vector unsigned char perm2 = vec_lvsl(0, pix2);
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
 
@@ -281,14 +285,10 @@ static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
         /* Read potentially unaligned pixels into t1 and t2.
          * Since we're reading 16 pixels, and actually only want 8,
          * mask out the last 8 pixels. The 0s don't change the sum. */
-        vector unsigned char pix1l = vec_ld(0, pix1);
-        vector unsigned char pix1r = vec_ld(7, pix1);
-        vector unsigned char pix2l = vec_ld(0, pix2);
-        vector unsigned char pix2r = vec_ld(7, pix2);
-        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
-                                          permclear);
-        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
-                                          permclear);
+        vector unsigned char pix1l = VEC_LD(0, pix1);
+        vector unsigned char pix2l = VEC_LD(0, pix2);
+        vector unsigned char t1 = vec_and(pix1l, permclear);
+        vector unsigned char t2 = vec_and(pix2l, permclear);
 
         /* Calculate a sum of abs differences vector. */
         vector unsigned char t3 = vec_max(t1, t2);
@@ -315,14 +315,13 @@ static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
 {
-    int i, s;
+    int i;
+    int  __attribute__((aligned(16))) s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
     const vector unsigned char permclear =
         (vector unsigned char)
         { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
-    vector unsigned char perm1 = vec_lvsl(0, pix1);
-    vector unsigned char perm2 = vec_lvsl(0, pix2);
     vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumsqr;
 
@@ -330,14 +329,8 @@ static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
         /* Read potentially unaligned pixels into t1 and t2.
          * Since we're reading 16 pixels, and actually only want 8,
          * mask out the last 8 pixels. The 0s don't change the sum. */
-        vector unsigned char pix1l = vec_ld(0, pix1);
-        vector unsigned char pix1r = vec_ld(7, pix1);
-        vector unsigned char pix2l = vec_ld(0, pix2);
-        vector unsigned char pix2r = vec_ld(7, pix2);
-        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
-                                          permclear);
-        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
-                                          permclear);
+        vector unsigned char t1 = vec_and(VEC_LD(0, pix1), permclear);
+        vector unsigned char t2 = vec_and(VEC_LD(0, pix2), permclear);
 
         /* Since we want to use unsigned chars, we can take advantage
          * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
@@ -367,19 +360,17 @@ static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h)
 {
-    int i, s;
+    int i;
+    int  __attribute__((aligned(16))) s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
-    vector unsigned char perm = vec_lvsl(0, pix2);
     vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumsqr;
 
     for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2. */
-        vector unsigned char pix2l = vec_ld(0,  pix2);
-        vector unsigned char pix2r = vec_ld(15, pix2);
         vector unsigned char t1 = vec_ld(0, pix1);
-        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
+        vector unsigned char t2 = VEC_LD(0, pix2);
 
         /* Since we want to use unsigned chars, we can take advantage
          * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
@@ -399,15 +390,15 @@ static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
     /* Sum up the four partial sums, and put the result into s. */
     sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
     sumsqr = vec_splat(sumsqr, 3);
-    vec_ste(sumsqr, 0, &s);
 
+    vec_ste(sumsqr, 0, &s);
     return s;
 }
 
 static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
                                      uint8_t *src, ptrdiff_t stride, int h)
 {
-    int sum;
+    int __attribute__((aligned(16))) sum;
     register const vector unsigned char vzero =
         (const vector unsigned char) vec_splat_u8(0);
     register vector signed short temp0, temp1, temp2, temp3, temp4,
@@ -432,24 +423,19 @@ static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
             { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
               0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
 
+
 #define ONEITERBUTTERFLY(i, res)                                            \
     {                                                                       \
-        register vector unsigned char src1 = vec_ld(stride * i, src);       \
-        register vector unsigned char src2 = vec_ld(stride * i + 15, src);  \
-        register vector unsigned char srcO =                                \
-            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
-        register vector unsigned char dst1 = vec_ld(stride * i, dst);       \
-        register vector unsigned char dst2 = vec_ld(stride * i + 15, dst);  \
-        register vector unsigned char dstO =                                \
-            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
+        register vector unsigned char srcO =  unaligned_load(stride * i, src);  \
+        register vector unsigned char dstO = unaligned_load(stride * i, dst);\
                                                                             \
         /* Promote the unsigned chars to signed shorts. */                  \
         /* We're in the 8x8 function, we only care for the first 8. */      \
         register vector signed short srcV =                                 \
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                              (vector signed char) srcO);    \
         register vector signed short dstV =                                 \
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                              (vector signed char) dstO);    \
                                                                             \
         /* subtractions inside the first butterfly */                       \
@@ -461,6 +447,7 @@ static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
         register vector signed short op3  = vec_perm(but2, but2, perm3);    \
         res  = vec_mladd(but2, vprod3, op3);                                \
     }
+
         ONEITERBUTTERFLY(0, temp0);
         ONEITERBUTTERFLY(1, temp1);
         ONEITERBUTTERFLY(2, temp2);
@@ -510,13 +497,14 @@ static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
         vsum = vec_sum4s(vec_abs(line7C), vsum);
         vsum = vec_sums(vsum, (vector signed int) vzero);
         vsum = vec_splat(vsum, 3);
+
         vec_ste(vsum, 0, &sum);
     }
     return sum;
 }
 
 /*
- * 16x8 works with 16 elements; it allows to avoid replicating loads, and
+ * 16x8 works with 16 elements; it can avoid replicating loads, and
  * gives the compiler more room for scheduling. It's only used from
  * inside hadamard8_diff16_altivec.
  *
@@ -536,7 +524,7 @@ static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
 static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
                                       uint8_t *src, ptrdiff_t stride, int h)
 {
-    int sum;
+    int __attribute__((aligned(16))) sum;
     register vector signed short
         temp0 __asm__ ("v0"),
         temp1 __asm__ ("v1"),
@@ -584,31 +572,23 @@ static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
 
 #define ONEITERBUTTERFLY(i, res1, res2)                                     \
     {                                                                       \
-        register vector unsigned char src1 __asm__ ("v22") =                \
-            vec_ld(stride * i, src);                                        \
-        register vector unsigned char src2 __asm__ ("v23") =                \
-            vec_ld(stride * i + 16, src);                                   \
         register vector unsigned char srcO __asm__ ("v22") =                \
-            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
-        register vector unsigned char dst1 __asm__ ("v24") =                \
-            vec_ld(stride * i, dst);                                        \
-        register vector unsigned char dst2 __asm__ ("v25") =                \
-            vec_ld(stride * i + 16, dst);                                   \
+            unaligned_load(stride * i, src);                                    \
         register vector unsigned char dstO __asm__ ("v23") =                \
-            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
+            unaligned_load(stride * i, dst);\
                                                                             \
         /* Promote the unsigned chars to signed shorts. */                  \
         register vector signed short srcV __asm__ ("v24") =                 \
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                              (vector signed char) srcO);    \
         register vector signed short dstV __asm__ ("v25") =                 \
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                              (vector signed char) dstO);    \
         register vector signed short srcW __asm__ ("v26") =                 \
-            (vector signed short) vec_mergel((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEL((vector signed char) vzero,    \
                                              (vector signed char) srcO);    \
         register vector signed short dstW __asm__ ("v27") =                 \
-            (vector signed short) vec_mergel((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEL((vector signed char) vzero,    \
                                              (vector signed char) dstO);    \
                                                                             \
         /* subtractions inside the first butterfly */                       \
@@ -639,6 +619,7 @@ static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
         res1 = vec_mladd(but2, vprod3, op3);                                \
         res2 = vec_mladd(but2S, vprod3, op3S);                              \
     }
+
         ONEITERBUTTERFLY(0, temp0, temp0S);
         ONEITERBUTTERFLY(1, temp1, temp1S);
         ONEITERBUTTERFLY(2, temp2, temp2S);
@@ -725,6 +706,7 @@ static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
         vsum = vec_sum4s(vec_abs(line7CS), vsum);
         vsum = vec_sums(vsum, (vector signed int) vzero);
         vsum = vec_splat(vsum, 3);
+
         vec_ste(vsum, 0, &sum);
     }
     return sum;
@@ -746,7 +728,7 @@ static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
 
 av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/mpegaudiodsp_altivec.c b/libavcodec/ppc/mpegaudiodsp_altivec.c
index 21cbcf376f..ddfe5dcbb7 100644
--- a/libavcodec/ppc/mpegaudiodsp_altivec.c
+++ b/libavcodec/ppc/mpegaudiodsp_altivec.c
@@ -2,20 +2,20 @@
  * Altivec optimized MP3 decoding functions
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/mpegaudiodsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
@@ -132,7 +132,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
 
 av_cold void ff_mpadsp_init_ppc(MPADSPContext *s)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/mpegvideo_altivec.c b/libavcodec/ppc/mpegvideo_altivec.c
index 550a03ad41..1b6bda6c36 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -4,20 +4,20 @@
  * dct_unquantize_h263_altivec:
  * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +32,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/mpegvideo.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 /* AltiVec version of dct_unquantize_h263
    this code assumes `block' is 16 bytes-aligned */
@@ -42,8 +42,6 @@ static void dct_unquantize_h263_altivec(MpegEncContext *s,
     int i, level, qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
-
     qadd = (qscale - 1) | 1;
     qmul = qscale << 1;
 
@@ -59,6 +57,7 @@ static void dct_unquantize_h263_altivec(MpegEncContext *s,
         nCoeffs= 63; //does not always use zigzag table
     } else {
         i = 0;
+        av_assert2(s->block_last_index[n]>=0);
         nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
     }
 
@@ -117,7 +116,7 @@ static void dct_unquantize_h263_altivec(MpegEncContext *s,
 
 av_cold void ff_mpv_common_init_ppc(MpegEncContext *s)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/mpegvideodsp.c b/libavcodec/ppc/mpegvideodsp.c
index 0b426e5e2a..7696954335 100644
--- a/libavcodec/ppc/mpegvideodsp.c
+++ b/libavcodec/ppc/mpegvideodsp.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/mpegvideodsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 /* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
  * to preserve proper dst alignment. */
 static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
@@ -66,7 +66,7 @@ static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
                                                    vec_lvsl(0, src));
 
     if (src_really_odd != 0x0000000F)
-        /* If src & 0xF == 0xF, then (src + 1) is properly aligned
+        /* If (src & 0xF) == 0xF, then (src + 1) is properly aligned
          * on the second vector. */
         srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
     else
@@ -88,7 +88,7 @@ static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
         srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
 
         if (src_really_odd != 0x0000000F)
-            /* If src & 0xF == 0xF, then (src + 1) is properly aligned
+            /* If (src & 0xF) == 0xF, then (src + 1) is properly aligned
              * on the second vector. */
             srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
         else
@@ -127,7 +127,7 @@ static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
 
 av_cold void ff_mpegvideodsp_init_ppc(MpegVideoDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     c->gmc1 = gmc1_altivec;
 #endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/mpegvideoencdsp.c b/libavcodec/ppc/mpegvideoencdsp.c
index 7354816208..3e6765ce15 100644
--- a/libavcodec/ppc/mpegvideoencdsp.c
+++ b/libavcodec/ppc/mpegvideoencdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,8 +29,36 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
+#if HAVE_VSX
+static int pix_norm1_altivec(uint8_t *pix, int line_size)
+{
+    int i, s = 0;
+    const vector unsigned int zero =
+        (const vector unsigned int) vec_splat_u32(0);
+    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sum;
+
+    for (i = 0; i < 16; i++) {
+        /* Read the potentially unaligned pixels. */
+        //vector unsigned char pixl = vec_ld(0,  pix);
+        //vector unsigned char pixr = vec_ld(15, pix);
+        //vector unsigned char pixv = vec_perm(pixl, pixr, perm);
+        vector unsigned char pixv = vec_vsx_ld(0,  pix);
+
+        /* Square the values, and add them to our sum. */
+        sv = vec_msum(pixv, pixv, sv);
+
+        pix += line_size;
+    }
+    /* Sum up the four partial sums, and put the result into s. */
+    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
+    sum = vec_splat(sum, 3);
+    vec_ste(sum, 0, &s);
+    return s;
+}
+#else
 static int pix_norm1_altivec(uint8_t *pix, int line_size)
 {
     int i, s = 0;
@@ -58,7 +86,37 @@ static int pix_norm1_altivec(uint8_t *pix, int line_size)
 
     return s;
 }
+#endif /* HAVE_VSX */
+
+#if HAVE_VSX
+static int pix_sum_altivec(uint8_t *pix, int line_size)
+{
+    int i, s;
+    const vector unsigned int zero =
+        (const vector unsigned int) vec_splat_u32(0);
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sumdiffs;
+
+    for (i = 0; i < 16; i++) {
+        /* Read the potentially unaligned 16 pixels into t1. */
+        //vector unsigned char pixl = vec_ld(0,  pix);
+        //vector unsigned char pixr = vec_ld(15, pix);
+        //vector unsigned char t1   = vec_perm(pixl, pixr, perm);
+        vector unsigned char t1   = vec_vsx_ld(0,  pix);
 
+        /* Add each 4 pixel group together and put 4 results into sad. */
+        sad = vec_sum4s(t1, sad);
+
+        pix += line_size;
+    }
+
+    /* Sum up the four partial sums, and put the result into s. */
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
+    sumdiffs = vec_splat(sumdiffs, 3);
+    vec_ste(sumdiffs, 0, &s);
+    return s;
+}
+#else
 static int pix_sum_altivec(uint8_t *pix, int line_size)
 {
     int i, s;
@@ -88,12 +146,14 @@ static int pix_sum_altivec(uint8_t *pix, int line_size)
     return s;
 }
 
+#endif /* HAVE_VSX */
+
 #endif /* HAVE_ALTIVEC */
 
 av_cold void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
                                          AVCodecContext *avctx)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/pixblockdsp.c b/libavcodec/ppc/pixblockdsp.c
index 9cac70e2ef..84aa562bb6 100644
--- a/libavcodec/ppc/pixblockdsp.c
+++ b/libavcodec/ppc/pixblockdsp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,10 +33,38 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/pixblockdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
+#if HAVE_VSX
 static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
-                               int line_size)
+                               ptrdiff_t line_size)
+{
+    int i;
+    vector unsigned char perm =
+        (vector unsigned char) {0x00,0x10, 0x01,0x11,0x02,0x12,0x03,0x13,\
+            0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17};
+    const vector unsigned char zero =
+        (const vector unsigned char) vec_splat_u8(0);
+
+    for (i = 0; i < 8; i++) {
+        /* Read potentially unaligned pixels.
+         * We're reading 16 pixels, and actually only want 8,
+         * but we simply ignore the extras. */
+        vector unsigned char bytes = vec_vsx_ld(0, pixels);
+
+        // Convert the bytes into shorts.
+        //vector signed short shorts = (vector signed short) vec_perm(zero, bytes, perm);
+        vector signed short shorts = (vector signed short) vec_perm(bytes, zero, perm);
+
+        // Save the data to the block, we assume the block is 16-byte aligned.
+        vec_vsx_st(shorts, i * 16, (vector signed short *) block);
+
+        pixels += line_size;
+    }
+}
+#else
+static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
+                               ptrdiff_t line_size)
 {
     int i;
     vec_u8 perm = vec_lvsl(0, pixels);
@@ -60,6 +88,71 @@ static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
     }
 }
 
+#endif /* HAVE_VSX */
+
+#if HAVE_VSX
+static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
+                                const uint8_t *s2, int stride)
+{
+  int i;
+  const vector unsigned char zero =
+    (const vector unsigned char) vec_splat_u8(0);
+  vector signed short shorts1, shorts2;
+
+  for (i = 0; i < 4; i++) {
+    /* Read potentially unaligned pixels.
+     * We're reading 16 pixels, and actually only want 8,
+     * but we simply ignore the extras. */
+    vector unsigned char bytes = vec_vsx_ld(0,  s1);
+
+    // Convert the bytes into shorts.
+    shorts1 = (vector signed short) vec_mergeh(bytes, zero);
+
+    // Do the same for the second block of pixels.
+    bytes =vec_vsx_ld(0,  s2);
+
+    // Convert the bytes into shorts.
+    shorts2 = (vector signed short) vec_mergeh(bytes, zero);
+
+    // Do the subtraction.
+    shorts1 = vec_sub(shorts1, shorts2);
+
+    // Save the data to the block, we assume the block is 16-byte aligned.
+    vec_vsx_st(shorts1, 0, (vector signed short *) block);
+
+    s1    += stride;
+    s2    += stride;
+    block += 8;
+
+    /* The code below is a copy of the code above...
+     * This is a manual unroll. */
+
+    /* Read potentially unaligned pixels.
+     * We're reading 16 pixels, and actually only want 8,
+     * but we simply ignore the extras. */
+    bytes = vec_vsx_ld(0,  s1);
+
+    // Convert the bytes into shorts.
+    shorts1 = (vector signed short) vec_mergeh(bytes, zero);
+
+    // Do the same for the second block of pixels.
+    bytes = vec_vsx_ld(0,  s2);
+
+    // Convert the bytes into shorts.
+    shorts2 = (vector signed short) vec_mergeh(bytes, zero);
+
+    // Do the subtraction.
+    shorts1 = vec_sub(shorts1, shorts2);
+
+    // Save the data to the block, we assume the block is 16-byte aligned.
+    vec_vsx_st(shorts1, 0, (vector signed short *) block);
+
+    s1    += stride;
+    s2    += stride;
+    block += 8;
+  }
+}
+#else
 static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                 const uint8_t *s2, int stride)
 {
@@ -131,11 +224,13 @@ static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
     }
 }
 
+#endif /* HAVE_VSX */
+
 #endif /* HAVE_ALTIVEC */
 
 #if HAVE_VSX
 static void get_pixels_vsx(int16_t *restrict block, const uint8_t *pixels,
-                           int line_size)
+                           ptrdiff_t line_size)
 {
     int i;
     for (i = 0; i < 8; i++) {
@@ -171,7 +266,7 @@ av_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c,
                                      AVCodecContext *avctx,
                                      unsigned high_bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/svq1enc_altivec.c b/libavcodec/ppc/svq1enc_altivec.c
index 222f7c1c37..4e25e253f6 100644
--- a/libavcodec/ppc/svq1enc_altivec.c
+++ b/libavcodec/ppc/svq1enc_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +32,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/svq1enc.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                      int size)
 {
@@ -76,7 +76,7 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
 
 av_cold void ff_svq1enc_init_ppc(SVQ1EncContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/vc1dsp_altivec.c b/libavcodec/ppc/vc1dsp_altivec.c
index caf8721ebf..35bb280842 100644
--- a/libavcodec/ppc/vc1dsp_altivec.c
+++ b/libavcodec/ppc/vc1dsp_altivec.c
@@ -2,20 +2,20 @@
  * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/vc1dsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 // main steps of 8x8 transform
 #define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
@@ -304,16 +304,23 @@ static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, int16_t *block)
     src2 = vec_pack(s2, sA);
     src3 = vec_pack(s3, sB);
 
+#if HAVE_BIGENDIAN
     p0 = vec_lvsl (0, dest);
     p1 = vec_lvsl (stride, dest);
     p = vec_splat_u8 (-1);
     perm0 = vec_mergeh (p, p0);
     perm1 = vec_mergeh (p, p1);
+#define GET_TMP2(dst, p)        \
+    tmp = vec_ld (0, dest);     \
+    tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), p);
+#else
+#define GET_TMP2(dst,p)         \
+    tmp = vec_vsx_ld (0, dst);  \
+    tmp2 = (vector signed short)vec_mergeh (tmp, vec_splat_u8(0));
+#endif
 
 #define ADD(dest,src,perm)                                              \
-    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */                        \
-    tmp = vec_ld (0, dest);                                             \
-    tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm);  \
+    GET_TMP2(dest, perm);                                               \
     tmp3 = vec_adds (tmp2, src);                                        \
     tmp = vec_packsu (tmp3, tmp3);                                      \
     vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest);        \
@@ -344,7 +351,7 @@ static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, int16_t *block)
 
 av_cold void ff_vc1dsp_init_ppc(VC1DSPContext *dsp)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/videodsp_ppc.c b/libavcodec/ppc/videodsp_ppc.c
index b9e003b487..915702252e 100644
--- a/libavcodec/ppc/videodsp_ppc.c
+++ b/libavcodec/ppc/videodsp_ppc.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003-2004 Romain Dolbeau
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/vorbisdsp_altivec.c b/libavcodec/ppc/vorbisdsp_altivec.c
index a7aad86163..d7557c815b 100644
--- a/libavcodec/ppc/vorbisdsp_altivec.c
+++ b/libavcodec/ppc/vorbisdsp_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "libavutil/ppc/cpu.h"
 #include "libavcodec/vorbisdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                             intptr_t blocksize)
 {
@@ -54,7 +54,7 @@ static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
 
 av_cold void ff_vorbisdsp_init_ppc(VorbisDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c
index 68e7102a53..4a367b655e 100644
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2009 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,12 +28,17 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/vp3dsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 static const vec_s16 constants =
     {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785};
+#if HAVE_BIGENDIAN
 static const vec_u8 interleave_high =
     {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
+#else
+static const vec_u8 interleave_high =
+    {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
+#endif
 
 #define IDCT_START \
     vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;\
@@ -156,9 +161,18 @@ static void vp3_idct_add_altivec(uint8_t *dst, int stride, int16_t block[64])
     TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
     IDCT_1D(ADD8, SHIFT4)
 
-#define ADD(a)\
+#if HAVE_BIGENDIAN
+#define GET_VDST16\
     vdst = vec_ld(0, dst);\
-    vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\
+    vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);
+#else
+#define GET_VDST16\
+    vdst = vec_vsx_ld(0,dst);\
+    vdst_16 = (vec_s16)vec_mergeh(vdst, zero_u8v);
+#endif
+
+#define ADD(a)\
+    GET_VDST16;\
     vdst_16 = vec_adds(a, vdst_16);\
     t = vec_packsu(vdst_16, vdst_16);\
     vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
@@ -179,7 +193,7 @@ static void vp3_idct_add_altivec(uint8_t *dst, int stride, int16_t block[64])
 
 av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/vp8dsp_altivec.c b/libavcodec/ppc/vp8dsp_altivec.c
index 869fe670e7..23e4ace7da 100644
--- a/libavcodec/ppc/vp8dsp_altivec.c
+++ b/libavcodec/ppc/vp8dsp_altivec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2010 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@
 #include "libavcodec/vp8dsp.h"
 #include "hpeldsp_altivec.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 #define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
 
 // h subpel filter uses msum to multiply+add 4 pixel taps at once
@@ -59,17 +59,30 @@ static const vec_s8 h_subpel_filters_outer[3] =
     vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
     vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)
 
+#if HAVE_BIGENDIAN
+#define GET_PIXHL(offset)                   \
+    a = vec_ld((offset)-is6tap-1, src);     \
+    b = vec_ld((offset)-is6tap-1+15, src);  \
+    pixh  = vec_perm(a, b, permh##offset);  \
+    pixl  = vec_perm(a, b, perml##offset)
+
+#define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset)
+#else
+#define GET_PIXHL(offset)                   \
+    a = vec_vsx_ld((offset)-is6tap-1, src); \
+    pixh  = vec_perm(a, a, perm_inner);     \
+    pixl  = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4)))
+
+#define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer)
+#endif
+
 #define FILTER_H(dstv, off) \
-    a = vec_ld((off)-is6tap-1,    src); \
-    b = vec_ld((off)-is6tap-1+15, src); \
-\
-    pixh  = vec_perm(a, b, permh##off); \
-    pixl  = vec_perm(a, b, perml##off); \
+    GET_PIXHL(off);                            \
     filth = vec_msum(filter_inner, pixh, c64); \
     filtl = vec_msum(filter_inner, pixl, c64); \
 \
     if (is6tap) { \
-        outer = vec_perm(a, b, perm_6tap##off); \
+        GET_OUTER(off);                                \
         filth = vec_msum(filter_outerh, outer, filth); \
         filtl = vec_msum(filter_outerl, outer, filtl); \
     } \
@@ -84,9 +97,12 @@ void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                  int h, int mx, int w, int is6tap)
 {
     LOAD_H_SUBPEL_FILTER(mx-1);
-    vec_u8 align_vec0, align_vec8, permh0, permh8, filt;
+#if HAVE_BIGENDIAN
+    vec_u8 align_vec0, align_vec8, permh0, permh8;
     vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
-    vec_u8 a, b, pixh, pixl, outer;
+    vec_u8 b;
+#endif
+    vec_u8 filt, a, pixh, pixl, outer;
     vec_s16 f16h, f16l;
     vec_s32 filth, filtl;
 
@@ -97,6 +113,7 @@ void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
     vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
     vec_u16 c7  = vec_splat_u16(7);
 
+#if HAVE_BIGENDIAN
     align_vec0 = vec_lvsl( -is6tap-1, src);
     align_vec8 = vec_lvsl(8-is6tap-1, src);
 
@@ -107,6 +124,7 @@ void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
     perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
     perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
     perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
+#endif
 
     while (h --> 0) {
         FILTER_H(f16h, 0);
@@ -164,6 +182,12 @@ static const vec_u8 v_subpel_filters[7] =
     dstv = vec_adds(dstv, c64); \
     dstv = vec_sra(dstv, c7)
 
+#if HAVE_BIGENDIAN
+#define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm)
+#else
+#define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld(off+8,s))
+#endif
+
 static av_always_inline
 void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                  uint8_t *src, ptrdiff_t src_stride,
@@ -175,6 +199,7 @@ void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
     vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
     vec_u16 c7  = vec_splat_u16(7);
 
+#if HAVE_BIGENDIAN
     // we want pixels 0-7 to be in the even positions and 8-15 in the odd,
     // so combine this permute with the alignment permute vector
     align_vech = vec_lvsl(0, src);
@@ -183,22 +208,23 @@ void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
         perm_vec = vec_mergeh(align_vech, align_vecl);
     else
         perm_vec = vec_mergeh(align_vech, align_vech);
+#endif
 
     if (is6tap)
-        s0 = load_with_perm_vec(-2*src_stride, src, perm_vec);
-    s1 = load_with_perm_vec(-1*src_stride, src, perm_vec);
-    s2 = load_with_perm_vec( 0*src_stride, src, perm_vec);
-    s3 = load_with_perm_vec( 1*src_stride, src, perm_vec);
+        s0 = LOAD_HL(-2*src_stride, src, perm_vec);
+    s1 = LOAD_HL(-1*src_stride, src, perm_vec);
+    s2 = LOAD_HL( 0*src_stride, src, perm_vec);
+    s3 = LOAD_HL( 1*src_stride, src, perm_vec);
     if (is6tap)
-        s4 = load_with_perm_vec( 2*src_stride, src, perm_vec);
+        s4 = LOAD_HL( 2*src_stride, src, perm_vec);
 
     src += (2+is6tap)*src_stride;
 
     while (h --> 0) {
         if (is6tap)
-            s5 = load_with_perm_vec(0, src, perm_vec);
+            s5 = LOAD_HL(0, src, perm_vec);
         else
-            s4 = load_with_perm_vec(0, src, perm_vec);
+            s4 = LOAD_HL(0, src, perm_vec);
 
         FILTER_V(f16h, vec_mule);
 
@@ -272,39 +298,25 @@ EPEL_HV(4,  4,4)
 
 static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
 {
-    register vector unsigned char pixelsv1, pixelsv2;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
-
-    register vector unsigned char perm = vec_lvsl(0, src);
+    register vector unsigned char perm;
     int i;
     register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
     register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
     register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;
 
+#if HAVE_BIGENDIAN
+    perm = vec_lvsl(0, src);
+#endif
 // hand-unrolling the loop by 4 gains about 15%
 // mininum execution time goes from 74 to 60 cycles
 // it's faster than -funroll-loops, but using
 // -funroll-loops w/ this is bad - 74 cycles again.
 // all this is on a 7450, tuning for the 7450
     for (i = 0; i < h; i += 4) {
-        pixelsv1  = vec_ld( 0, src);
-        pixelsv2  = vec_ld(15, src);
-        pixelsv1B = vec_ld(sstride, src);
-        pixelsv2B = vec_ld(15 + sstride, src);
-        pixelsv1C = vec_ld(sstride2, src);
-        pixelsv2C = vec_ld(15 + sstride2, src);
-        pixelsv1D = vec_ld(sstride3, src);
-        pixelsv2D = vec_ld(15 + sstride3, src);
-        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
-               0, (unsigned char*)dst);
-        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
-               dstride, (unsigned char*)dst);
-        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
-               dstride2, (unsigned char*)dst);
-        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
-               dstride3, (unsigned char*)dst);
+        vec_st(load_with_perm_vec(0, src, perm), 0, dst);
+        vec_st(load_with_perm_vec(sstride, src, perm), dstride, dst);
+        vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst);
+        vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst);
         src += sstride4;
         dst += dstride4;
     }
@@ -315,7 +327,7 @@ static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *s
 
 av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/proresdata.c b/libavcodec/proresdata.c
index fcaf32a1af..9849b5cc00 100644
--- a/libavcodec/proresdata.c
+++ b/libavcodec/proresdata.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/proresdata.h b/libavcodec/proresdata.h
index 1e5d05ece6..ee8278d5f4 100644
--- a/libavcodec/proresdata.h
+++ b/libavcodec/proresdata.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/proresdec.h b/libavcodec/proresdec.h
new file mode 100644
index 0000000000..14ede5d16b
--- /dev/null
+++ b/libavcodec/proresdec.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2010-2011 Maxim Poliakovski
+ * Copyright (c) 2010-2011 Elvis Presley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PRORESDEC_H
+#define AVCODEC_PRORESDEC_H
+
+#include "blockdsp.h"
+#include "proresdsp.h"
+
+typedef struct {
+    const uint8_t *data;
+    unsigned mb_x;
+    unsigned mb_y;
+    unsigned mb_count;
+    unsigned data_size;
+    int ret;
+} SliceContext;
+
+typedef struct {
+    BlockDSPContext bdsp;
+    ProresDSPContext prodsp;
+    AVFrame *frame;
+    int frame_type;              ///< 0 = progressive, 1 = tff, 2 = bff
+    uint8_t qmat_luma[64];
+    uint8_t qmat_chroma[64];
+    SliceContext *slices;
+    int slice_count;             ///< number of slices in the current picture
+    unsigned mb_width;           ///< width of the current picture in mb
+    unsigned mb_height;          ///< height of the current picture in mb
+    uint8_t progressive_scan[64];
+    uint8_t interlaced_scan[64];
+    const uint8_t *scan;
+    int first_field;
+    int alpha_info;
+} ProresContext;
+
+#endif /* AVCODEC_PRORESDEC_H */
diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
new file mode 100644
index 0000000000..932f85f73b
--- /dev/null
+++ b/libavcodec/proresdec2.c
@@ -0,0 +1,691 @@
+/*
+ * Copyright (c) 2010-2011 Maxim Poliakovski
+ * Copyright (c) 2010-2011 Elvis Presley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'acpo' (Proxy), 'ap4h' (4444)
+ */
+
+//#define DEBUG
+
+#define LONG_BITSTREAM_READER
+
+#include "libavutil/internal.h"
+#include "avcodec.h"
+#include "get_bits.h"
+#include "idctdsp.h"
+#include "internal.h"
+#include "simple_idct.h"
+#include "proresdec.h"
+#include "proresdata.h"
+
+static void permute(uint8_t *dst, const uint8_t *src, const uint8_t permutation[64])
+{
+    int i;
+    for (i = 0; i < 64; i++)
+        dst[i] = permutation[src[i]];
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    ProresContext *ctx = avctx->priv_data;
+    uint8_t idct_permutation[64];
+
+    avctx->bits_per_raw_sample = 10;
+
+    ff_blockdsp_init(&ctx->bdsp, avctx);
+    ff_proresdsp_init(&ctx->prodsp, avctx);
+
+    ff_init_scantable_permutation(idct_permutation,
+                                  ctx->prodsp.idct_permutation_type);
+
+    permute(ctx->progressive_scan, ff_prores_progressive_scan, idct_permutation);
+    permute(ctx->interlaced_scan, ff_prores_interlaced_scan, idct_permutation);
+
+    return 0;
+}
+
+static int decode_frame_header(ProresContext *ctx, const uint8_t *buf,
+                               const int data_size, AVCodecContext *avctx)
+{
+    int hdr_size, width, height, flags;
+    int version;
+    const uint8_t *ptr;
+
+    hdr_size = AV_RB16(buf);
+    ff_dlog(avctx, "header size %d\n", hdr_size);
+    if (hdr_size > data_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong header size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    version = AV_RB16(buf + 2);
+    ff_dlog(avctx, "%.4s version %d\n", buf+4, version);
+    if (version > 1) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported version: %d\n", version);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    width  = AV_RB16(buf + 8);
+    height = AV_RB16(buf + 10);
+    if (width != avctx->width || height != avctx->height) {
+        av_log(avctx, AV_LOG_ERROR, "picture resolution change: %dx%d -> %dx%d\n",
+               avctx->width, avctx->height, width, height);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    ctx->frame_type = (buf[12] >> 2) & 3;
+    ctx->alpha_info = buf[17] & 0xf;
+
+    if (ctx->alpha_info > 2) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid alpha mode %d\n", ctx->alpha_info);
+        return AVERROR_INVALIDDATA;
+    }
+    if (avctx->skip_alpha) ctx->alpha_info = 0;
+
+    ff_dlog(avctx, "frame type %d\n", ctx->frame_type);
+
+    if (ctx->frame_type == 0) {
+        ctx->scan = ctx->progressive_scan; // permuted
+    } else {
+        ctx->scan = ctx->interlaced_scan; // permuted
+        ctx->frame->interlaced_frame = 1;
+        ctx->frame->top_field_first = ctx->frame_type == 1;
+    }
+
+    if (ctx->alpha_info) {
+        avctx->pix_fmt = (buf[12] & 0xC0) == 0xC0 ? AV_PIX_FMT_YUVA444P10 : AV_PIX_FMT_YUVA422P10;
+    } else {
+        avctx->pix_fmt = (buf[12] & 0xC0) == 0xC0 ? AV_PIX_FMT_YUV444P10 : AV_PIX_FMT_YUV422P10;
+    }
+
+    ptr   = buf + 20;
+    flags = buf[19];
+    ff_dlog(avctx, "flags %x\n", flags);
+
+    if (flags & 2) {
+        if(buf + data_size - ptr < 64) {
+            av_log(avctx, AV_LOG_ERROR, "Header truncated\n");
+            return AVERROR_INVALIDDATA;
+        }
+        permute(ctx->qmat_luma, ctx->prodsp.idct_permutation, ptr);
+        ptr += 64;
+    } else {
+        memset(ctx->qmat_luma, 4, 64);
+    }
+
+    if (flags & 1) {
+        if(buf + data_size - ptr < 64) {
+            av_log(avctx, AV_LOG_ERROR, "Header truncated\n");
+            return AVERROR_INVALIDDATA;
+        }
+        permute(ctx->qmat_chroma, ctx->prodsp.idct_permutation, ptr);
+    } else {
+        memset(ctx->qmat_chroma, 4, 64);
+    }
+
+    return hdr_size;
+}
+
+static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, const int buf_size)
+{
+    ProresContext *ctx = avctx->priv_data;
+    int i, hdr_size, slice_count;
+    unsigned pic_data_size;
+    int log2_slice_mb_width, log2_slice_mb_height;
+    int slice_mb_count, mb_x, mb_y;
+    const uint8_t *data_ptr, *index_ptr;
+
+    hdr_size = buf[0] >> 3;
+    if (hdr_size < 8 || hdr_size > buf_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong picture header size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    pic_data_size = AV_RB32(buf + 1);
+    if (pic_data_size > buf_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong picture data size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    log2_slice_mb_width  = buf[7] >> 4;
+    log2_slice_mb_height = buf[7] & 0xF;
+    if (log2_slice_mb_width > 3 || log2_slice_mb_height) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported slice resolution: %dx%d\n",
+               1 << log2_slice_mb_width, 1 << log2_slice_mb_height);
+        return AVERROR_INVALIDDATA;
+    }
+
+    ctx->mb_width  = (avctx->width  + 15) >> 4;
+    if (ctx->frame_type)
+        ctx->mb_height = (avctx->height + 31) >> 5;
+    else
+        ctx->mb_height = (avctx->height + 15) >> 4;
+
+    slice_count = AV_RB16(buf + 5);
+
+    if (ctx->slice_count != slice_count || !ctx->slices) {
+        av_freep(&ctx->slices);
+        ctx->slice_count = 0;
+        ctx->slices = av_mallocz_array(slice_count, sizeof(*ctx->slices));
+        if (!ctx->slices)
+            return AVERROR(ENOMEM);
+        ctx->slice_count = slice_count;
+    }
+
+    if (!slice_count)
+        return AVERROR(EINVAL);
+
+    if (hdr_size + slice_count*2 > buf_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong slice count\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // parse slice information
+    index_ptr = buf + hdr_size;
+    data_ptr  = index_ptr + slice_count*2;
+
+    slice_mb_count = 1 << log2_slice_mb_width;
+    mb_x = 0;
+    mb_y = 0;
+
+    for (i = 0; i < slice_count; i++) {
+        SliceContext *slice = &ctx->slices[i];
+
+        slice->data = data_ptr;
+        data_ptr += AV_RB16(index_ptr + i*2);
+
+        while (ctx->mb_width - mb_x < slice_mb_count)
+            slice_mb_count >>= 1;
+
+        slice->mb_x = mb_x;
+        slice->mb_y = mb_y;
+        slice->mb_count = slice_mb_count;
+        slice->data_size = data_ptr - slice->data;
+
+        if (slice->data_size < 6) {
+            av_log(avctx, AV_LOG_ERROR, "error, wrong slice data size\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        mb_x += slice_mb_count;
+        if (mb_x == ctx->mb_width) {
+            slice_mb_count = 1 << log2_slice_mb_width;
+            mb_x = 0;
+            mb_y++;
+        }
+        if (data_ptr > buf + buf_size) {
+            av_log(avctx, AV_LOG_ERROR, "error, slice out of bounds\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (mb_x || mb_y != ctx->mb_height) {
+        av_log(avctx, AV_LOG_ERROR, "error wrong mb count y %d h %d\n",
+               mb_y, ctx->mb_height);
+        return AVERROR_INVALIDDATA;
+    }
+
+    return pic_data_size;
+}
+
+#define DECODE_CODEWORD(val, codebook)                                  \
+    do {                                                                \
+        unsigned int rice_order, exp_order, switch_bits;                \
+        unsigned int q, buf, bits;                                      \
+                                                                        \
+        UPDATE_CACHE(re, gb);                                           \
+        buf = GET_CACHE(re, gb);                                        \
+                                                                        \
+        /* number of bits to switch between rice and exp golomb */      \
+        switch_bits =  codebook & 3;                                    \
+        rice_order  =  codebook >> 5;                                   \
+        exp_order   = (codebook >> 2) & 7;                              \
+                                                                        \
+        q = 31 - av_log2(buf);                                          \
+                                                                        \
+        if (q > switch_bits) { /* exp golomb */                         \
+            bits = exp_order - switch_bits + (q<<1);                    \
+            val = SHOW_UBITS(re, gb, bits) - (1 << exp_order) +         \
+                ((switch_bits + 1) << rice_order);                      \
+            SKIP_BITS(re, gb, bits);                                    \
+        } else if (rice_order) {                                        \
+            SKIP_BITS(re, gb, q+1);                                     \
+            val = (q << rice_order) + SHOW_UBITS(re, gb, rice_order);   \
+            SKIP_BITS(re, gb, rice_order);                              \
+        } else {                                                        \
+            val = q;                                                    \
+            SKIP_BITS(re, gb, q+1);                                     \
+        }                                                               \
+    } while (0)
+
+#define TOSIGNED(x) (((x) >> 1) ^ (-((x) & 1)))
+
+#define FIRST_DC_CB 0xB8
+
+static const uint8_t dc_codebook[7] = { 0x04, 0x28, 0x28, 0x4D, 0x4D, 0x70, 0x70};
+
+static av_always_inline void decode_dc_coeffs(GetBitContext *gb, int16_t *out,
+                                              int blocks_per_slice)
+{
+    int16_t prev_dc;
+    int code, i, sign;
+
+    OPEN_READER(re, gb);
+
+    DECODE_CODEWORD(code, FIRST_DC_CB);
+    prev_dc = TOSIGNED(code);
+    out[0] = prev_dc;
+
+    out += 64; // dc coeff for the next block
+
+    code = 5;
+    sign = 0;
+    for (i = 1; i < blocks_per_slice; i++, out += 64) {
+        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)]);
+        if(code) sign ^= -(code & 1);
+        else     sign  = 0;
+        prev_dc += (((code + 1) >> 1) ^ sign) - sign;
+        out[0] = prev_dc;
+    }
+    CLOSE_READER(re, gb);
+}
+
+// adaptive codebook switching lut according to previous run/level values
+static const uint8_t run_to_cb[16] = { 0x06, 0x06, 0x05, 0x05, 0x04, 0x29, 0x29, 0x29, 0x29, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x4C };
+static const uint8_t lev_to_cb[10] = { 0x04, 0x0A, 0x05, 0x06, 0x04, 0x28, 0x28, 0x28, 0x28, 0x4C };
+
+static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
+                                             int16_t *out, int blocks_per_slice)
+{
+    ProresContext *ctx = avctx->priv_data;
+    int block_mask, sign;
+    unsigned pos, run, level;
+    int max_coeffs, i, bits_left;
+    int log2_block_count = av_log2(blocks_per_slice);
+
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);                                           \
+    run   = 4;
+    level = 2;
+
+    max_coeffs = 64 << log2_block_count;
+    block_mask = blocks_per_slice - 1;
+
+    for (pos = block_mask;;) {
+        bits_left = gb->size_in_bits - re_index;
+        if (!bits_left || (bits_left < 32 && !SHOW_UBITS(re, gb, bits_left)))
+            break;
+
+        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)]);
+        pos += run + 1;
+        if (pos >= max_coeffs) {
+            av_log(avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", pos, max_coeffs);
+            return AVERROR_INVALIDDATA;
+        }
+
+        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)]);
+        level += 1;
+
+        i = pos >> log2_block_count;
+
+        sign = SHOW_SBITS(re, gb, 1);
+        SKIP_BITS(re, gb, 1);
+        out[((pos & block_mask) << 6) + ctx->scan[i]] = ((level ^ sign) - sign);
+    }
+
+    CLOSE_READER(re, gb);
+    return 0;
+}
+
+static int decode_slice_luma(AVCodecContext *avctx, SliceContext *slice,
+                             uint16_t *dst, int dst_stride,
+                             const uint8_t *buf, unsigned buf_size,
+                             const int16_t *qmat)
+{
+    ProresContext *ctx = avctx->priv_data;
+    LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]);
+    int16_t *block;
+    GetBitContext gb;
+    int i, blocks_per_slice = slice->mb_count<<2;
+    int ret;
+
+    for (i = 0; i < blocks_per_slice; i++)
+        ctx->bdsp.clear_block(blocks+(i<<6));
+
+    init_get_bits(&gb, buf, buf_size << 3);
+
+    decode_dc_coeffs(&gb, blocks, blocks_per_slice);
+    if ((ret = decode_ac_coeffs(avctx, &gb, blocks, blocks_per_slice)) < 0)
+        return ret;
+
+    block = blocks;
+    for (i = 0; i < slice->mb_count; i++) {
+        ctx->prodsp.idct_put(dst, dst_stride, block+(0<<6), qmat);
+        ctx->prodsp.idct_put(dst             +8, dst_stride, block+(1<<6), qmat);
+        ctx->prodsp.idct_put(dst+4*dst_stride  , dst_stride, block+(2<<6), qmat);
+        ctx->prodsp.idct_put(dst+4*dst_stride+8, dst_stride, block+(3<<6), qmat);
+        block += 4*64;
+        dst += 16;
+    }
+    return 0;
+}
+
+static int decode_slice_chroma(AVCodecContext *avctx, SliceContext *slice,
+                               uint16_t *dst, int dst_stride,
+                               const uint8_t *buf, unsigned buf_size,
+                               const int16_t *qmat, int log2_blocks_per_mb)
+{
+    ProresContext *ctx = avctx->priv_data;
+    LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]);
+    int16_t *block;
+    GetBitContext gb;
+    int i, j, blocks_per_slice = slice->mb_count << log2_blocks_per_mb;
+    int ret;
+
+    for (i = 0; i < blocks_per_slice; i++)
+        ctx->bdsp.clear_block(blocks+(i<<6));
+
+    init_get_bits(&gb, buf, buf_size << 3);
+
+    decode_dc_coeffs(&gb, blocks, blocks_per_slice);
+    if ((ret = decode_ac_coeffs(avctx, &gb, blocks, blocks_per_slice)) < 0)
+        return ret;
+
+    block = blocks;
+    for (i = 0; i < slice->mb_count; i++) {
+        for (j = 0; j < log2_blocks_per_mb; j++) {
+            ctx->prodsp.idct_put(dst,              dst_stride, block+(0<<6), qmat);
+            ctx->prodsp.idct_put(dst+4*dst_stride, dst_stride, block+(1<<6), qmat);
+            block += 2*64;
+            dst += 8;
+        }
+    }
+    return 0;
+}
+
+static void unpack_alpha(GetBitContext *gb, uint16_t *dst, int num_coeffs,
+                         const int num_bits)
+{
+    const int mask = (1 << num_bits) - 1;
+    int i, idx, val, alpha_val;
+
+    idx       = 0;
+    alpha_val = mask;
+    do {
+        do {
+            if (get_bits1(gb)) {
+                val = get_bits(gb, num_bits);
+            } else {
+                int sign;
+                val  = get_bits(gb, num_bits == 16 ? 7 : 4);
+                sign = val & 1;
+                val  = (val + 2) >> 1;
+                if (sign)
+                    val = -val;
+            }
+            alpha_val = (alpha_val + val) & mask;
+            if (num_bits == 16) {
+                dst[idx++] = alpha_val >> 6;
+            } else {
+                dst[idx++] = (alpha_val << 2) | (alpha_val >> 6);
+            }
+            if (idx >= num_coeffs)
+                break;
+        } while (get_bits_left(gb)>0 && get_bits1(gb));
+        val = get_bits(gb, 4);
+        if (!val)
+            val = get_bits(gb, 11);
+        if (idx + val > num_coeffs)
+            val = num_coeffs - idx;
+        if (num_bits == 16) {
+            for (i = 0; i < val; i++)
+                dst[idx++] = alpha_val >> 6;
+        } else {
+            for (i = 0; i < val; i++)
+                dst[idx++] = (alpha_val << 2) | (alpha_val >> 6);
+
+        }
+    } while (idx < num_coeffs);
+}
+
+/**
+ * Decode alpha slice plane.
+ */
+static void decode_slice_alpha(ProresContext *ctx,
+                               uint16_t *dst, int dst_stride,
+                               const uint8_t *buf, int buf_size,
+                               int blocks_per_slice)
+{
+    GetBitContext gb;
+    int i;
+    LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]);
+    int16_t *block;
+
+    for (i = 0; i < blocks_per_slice<<2; i++)
+        ctx->bdsp.clear_block(blocks+(i<<6));
+
+    init_get_bits(&gb, buf, buf_size << 3);
+
+    if (ctx->alpha_info == 2) {
+        unpack_alpha(&gb, blocks, blocks_per_slice * 4 * 64, 16);
+    } else {
+        unpack_alpha(&gb, blocks, blocks_per_slice * 4 * 64, 8);
+    }
+
+    block = blocks;
+    for (i = 0; i < 16; i++) {
+        memcpy(dst, block, 16 * blocks_per_slice * sizeof(*dst));
+        dst   += dst_stride >> 1;
+        block += 16 * blocks_per_slice;
+    }
+}
+
+static int decode_slice_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
+{
+    ProresContext *ctx = avctx->priv_data;
+    SliceContext *slice = &ctx->slices[jobnr];
+    const uint8_t *buf = slice->data;
+    AVFrame *pic = ctx->frame;
+    int i, hdr_size, qscale, log2_chroma_blocks_per_mb;
+    int luma_stride, chroma_stride;
+    int y_data_size, u_data_size, v_data_size, a_data_size;
+    uint8_t *dest_y, *dest_u, *dest_v, *dest_a;
+    int16_t qmat_luma_scaled[64];
+    int16_t qmat_chroma_scaled[64];
+    int mb_x_shift;
+    int ret;
+
+    slice->ret = -1;
+    //av_log(avctx, AV_LOG_INFO, "slice %d mb width %d mb x %d y %d\n",
+    //       jobnr, slice->mb_count, slice->mb_x, slice->mb_y);
+
+    // slice header
+    hdr_size = buf[0] >> 3;
+    qscale = av_clip(buf[1], 1, 224);
+    qscale = qscale > 128 ? qscale - 96 << 2: qscale;
+    y_data_size = AV_RB16(buf + 2);
+    u_data_size = AV_RB16(buf + 4);
+    v_data_size = slice->data_size - y_data_size - u_data_size - hdr_size;
+    if (hdr_size > 7) v_data_size = AV_RB16(buf + 6);
+    a_data_size = slice->data_size - y_data_size - u_data_size -
+                  v_data_size - hdr_size;
+
+    if (y_data_size < 0 || u_data_size < 0 || v_data_size < 0
+        || hdr_size+y_data_size+u_data_size+v_data_size > slice->data_size){
+        av_log(avctx, AV_LOG_ERROR, "invalid plane data size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    buf += hdr_size;
+
+    for (i = 0; i < 64; i++) {
+        qmat_luma_scaled  [i] = ctx->qmat_luma  [i] * qscale;
+        qmat_chroma_scaled[i] = ctx->qmat_chroma[i] * qscale;
+    }
+
+    if (ctx->frame_type == 0) {
+        luma_stride   = pic->linesize[0];
+        chroma_stride = pic->linesize[1];
+    } else {
+        luma_stride   = pic->linesize[0] << 1;
+        chroma_stride = pic->linesize[1] << 1;
+    }
+
+    if (avctx->pix_fmt == AV_PIX_FMT_YUV444P10 || avctx->pix_fmt == AV_PIX_FMT_YUVA444P10) {
+        mb_x_shift = 5;
+        log2_chroma_blocks_per_mb = 2;
+    } else {
+        mb_x_shift = 4;
+        log2_chroma_blocks_per_mb = 1;
+    }
+
+    dest_y = pic->data[0] + (slice->mb_y << 4) * luma_stride + (slice->mb_x << 5);
+    dest_u = pic->data[1] + (slice->mb_y << 4) * chroma_stride + (slice->mb_x << mb_x_shift);
+    dest_v = pic->data[2] + (slice->mb_y << 4) * chroma_stride + (slice->mb_x << mb_x_shift);
+    dest_a = pic->data[3] + (slice->mb_y << 4) * luma_stride + (slice->mb_x << 5);
+
+    if (ctx->frame_type && ctx->first_field ^ ctx->frame->top_field_first) {
+        dest_y += pic->linesize[0];
+        dest_u += pic->linesize[1];
+        dest_v += pic->linesize[2];
+        dest_a += pic->linesize[3];
+    }
+
+    ret = decode_slice_luma(avctx, slice, (uint16_t*)dest_y, luma_stride,
+                            buf, y_data_size, qmat_luma_scaled);
+    if (ret < 0)
+        return ret;
+
+    if (!(avctx->flags & AV_CODEC_FLAG_GRAY)) {
+        ret = decode_slice_chroma(avctx, slice, (uint16_t*)dest_u, chroma_stride,
+                                  buf + y_data_size, u_data_size,
+                                  qmat_chroma_scaled, log2_chroma_blocks_per_mb);
+        if (ret < 0)
+            return ret;
+
+        ret = decode_slice_chroma(avctx, slice, (uint16_t*)dest_v, chroma_stride,
+                                  buf + y_data_size + u_data_size, v_data_size,
+                                  qmat_chroma_scaled, log2_chroma_blocks_per_mb);
+        if (ret < 0)
+            return ret;
+    }
+    /* decode alpha plane if available */
+    if (ctx->alpha_info && pic->data[3] && a_data_size)
+        decode_slice_alpha(ctx, (uint16_t*)dest_a, luma_stride,
+                           buf + y_data_size + u_data_size + v_data_size,
+                           a_data_size, slice->mb_count);
+
+    slice->ret = 0;
+    return 0;
+}
+
+static int decode_picture(AVCodecContext *avctx)
+{
+    ProresContext *ctx = avctx->priv_data;
+    int i;
+
+    avctx->execute2(avctx, decode_slice_thread, NULL, NULL, ctx->slice_count);
+
+    for (i = 0; i < ctx->slice_count; i++)
+        if (ctx->slices[i].ret < 0)
+            return ctx->slices[i].ret;
+
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    ProresContext *ctx = avctx->priv_data;
+    AVFrame *frame = data;
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    int frame_hdr_size, pic_size, ret;
+
+    if (buf_size < 28 || AV_RL32(buf + 4) != AV_RL32("icpf")) {
+        av_log(avctx, AV_LOG_ERROR, "invalid frame header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ctx->frame = frame;
+    ctx->frame->pict_type = AV_PICTURE_TYPE_I;
+    ctx->frame->key_frame = 1;
+    ctx->first_field = 1;
+
+    buf += 8;
+    buf_size -= 8;
+
+    frame_hdr_size = decode_frame_header(ctx, buf, buf_size, avctx);
+    if (frame_hdr_size < 0)
+        return frame_hdr_size;
+
+    buf += frame_hdr_size;
+    buf_size -= frame_hdr_size;
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+ decode_picture:
+    pic_size = decode_picture_header(avctx, buf, buf_size);
+    if (pic_size < 0) {
+        av_log(avctx, AV_LOG_ERROR, "error decoding picture header\n");
+        return pic_size;
+    }
+
+    if ((ret = decode_picture(avctx)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "error decoding picture\n");
+        return ret;
+    }
+
+    buf += pic_size;
+    buf_size -= pic_size;
+
+    if (ctx->frame_type && buf_size > 0 && ctx->first_field) {
+        ctx->first_field = 0;
+        goto decode_picture;
+    }
+
+    *got_frame      = 1;
+
+    return avpkt->size;
+}
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    ProresContext *ctx = avctx->priv_data;
+
+    av_freep(&ctx->slices);
+
+    return 0;
+}
+
+AVCodec ff_prores_decoder = {
+    .name           = "prores",
+    .long_name      = NULL_IF_CONFIG_SMALL("ProRes"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_PRORES,
+    .priv_data_size = sizeof(ProresContext),
+    .init           = decode_init,
+    .close          = decode_close,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS,
+};
diff --git a/libavcodec/proresdec.c b/libavcodec/proresdec_lgpl.c
index 8a53719d9f..9e5674eccc 100644
--- a/libavcodec/proresdec.c
+++ b/libavcodec/proresdec_lgpl.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -86,7 +86,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     ctx->slice_data       = NULL;
 
     avctx->bits_per_raw_sample = PRORES_BITS_PER_SAMPLE;
-    ff_proresdsp_init(&ctx->dsp);
+    ff_proresdsp_init(&ctx->dsp, avctx);
 
     ctx->scantable_type = -1;   // set scantable type to uninitialized
     memset(ctx->qmat_luma, 4, 64);
@@ -140,6 +140,7 @@ static int decode_frame_header(ProresContext *ctx, const uint8_t *buf,
         av_log(avctx, AV_LOG_ERROR, "Invalid alpha mode %d\n", ctx->alpha_info);
         return AVERROR_INVALIDDATA;
     }
+    if (avctx->skip_alpha) ctx->alpha_info = 0;
 
     switch (ctx->chroma_factor) {
     case 2:
@@ -262,7 +263,7 @@ static int decode_picture_header(ProresContext *ctx, const uint8_t *buf,
 
     if (ctx->total_slices != num_slices) {
         av_freep(&ctx->slice_data);
-        ctx->slice_data = av_malloc((num_slices + 1) * sizeof(ctx->slice_data[0]));
+        ctx->slice_data = av_malloc_array(num_slices + 1, sizeof(ctx->slice_data[0]));
         if (!ctx->slice_data)
             return AVERROR(ENOMEM);
         ctx->total_slices = num_slices;
@@ -506,8 +507,9 @@ static void unpack_alpha(GetBitContext *gb, uint16_t *dst, int num_coeffs,
                 dst[idx++] = alpha_val >> 6;
             else
                 dst[idx++] = (alpha_val << 2) | (alpha_val >> 6);
-            if (idx >= num_coeffs - 1)
+            if (idx >= num_coeffs) {
                 break;
+            }
         } while (get_bits1(gb));
         val = get_bits(gb, 4);
         if (!val)
@@ -619,7 +621,7 @@ static int decode_slice(AVCodecContext *avctx, void *tdata)
     coff[2]     = coff[1] + u_data_size;
     v_data_size = hdr_size > 7 ? AV_RB16(buf + 6) : slice_data_size - coff[2];
     coff[3]     = coff[2] + v_data_size;
-    a_data_size = slice_data_size - coff[3];
+    a_data_size = ctx->alpha_info ? slice_data_size - coff[3] : 0;
 
     /* if V or alpha component size is negative that means that previous
        component sizes are too large */
@@ -769,8 +771,8 @@ static av_cold int decode_close(AVCodecContext *avctx)
 }
 
 
-AVCodec ff_prores_decoder = {
-    .name           = "prores",
+AVCodec ff_prores_lgpl_decoder = {
+    .name           = "prores_lgpl",
     .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes (iCodec Pro)"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_PRORES,
diff --git a/libavcodec/proresdsp.c b/libavcodec/proresdsp.c
index 3af2f0b9bb..82d6009b58 100644
--- a/libavcodec/proresdsp.c
+++ b/libavcodec/proresdsp.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,7 @@
 #define CLIP_MIN (1 << (PRORES_BITS_PER_SAMPLE - 8))           ///< minimum value for clipping resulting pixels
 #define CLIP_MAX (1 << PRORES_BITS_PER_SAMPLE) - CLIP_MIN - 1  ///< maximum value for clipping resulting pixels
 
-#define CLIP_AND_BIAS(x) (av_clip((x) + BIAS, CLIP_MIN, CLIP_MAX))
+#define CLIP(x) (av_clip((x), CLIP_MIN, CLIP_MAX))
 
 /**
  * Add bias value, clamp and output pixels of a slice
@@ -44,7 +44,7 @@ static void put_pixels(uint16_t *dst, int stride, const int16_t *in)
         for (x = 0; x < 8; x++) {
             src_offset = (y << 3) + x;
 
-            dst[dst_offset + x] = CLIP_AND_BIAS(in[src_offset]);
+            dst[dst_offset + x] = CLIP(in[src_offset]);
         }
     }
 }
@@ -55,13 +55,13 @@ static void prores_idct_put_c(uint16_t *out, int linesize, int16_t *block, const
     put_pixels(out, linesize >> 1, block);
 }
 
-av_cold void ff_proresdsp_init(ProresDSPContext *dsp)
+av_cold void ff_proresdsp_init(ProresDSPContext *dsp, AVCodecContext *avctx)
 {
     dsp->idct_put = prores_idct_put_c;
     dsp->idct_permutation_type = FF_IDCT_PERM_NONE;
 
     if (ARCH_X86)
-        ff_proresdsp_init_x86(dsp);
+        ff_proresdsp_init_x86(dsp, avctx);
 
     ff_init_scantable_permutation(dsp->idct_permutation,
                                   dsp->idct_permutation_type);
diff --git a/libavcodec/proresdsp.h b/libavcodec/proresdsp.h
index e8a3ea96a7..159862ec3d 100644
--- a/libavcodec/proresdsp.h
+++ b/libavcodec/proresdsp.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 #define AVCODEC_PRORESDSP_H
 
 #include <stdint.h>
+#include "avcodec.h"
 
 #define PRORES_BITS_PER_SAMPLE 10 ///< output precision of prores decoder
 
@@ -33,8 +34,8 @@ typedef struct ProresDSPContext {
     void (* idct_put) (uint16_t *out, int linesize, int16_t *block, const int16_t *qmat);
 } ProresDSPContext;
 
-void ff_proresdsp_init(ProresDSPContext *dsp);
+void ff_proresdsp_init(ProresDSPContext *dsp, AVCodecContext *avctx);
 
-void ff_proresdsp_init_x86(ProresDSPContext *dsp);
+void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx);
 
 #endif /* AVCODEC_PRORESDSP_H */
diff --git a/libavcodec/proresenc_anatoliy.c b/libavcodec/proresenc_anatoliy.c
new file mode 100644
index 0000000000..0516066163
--- /dev/null
+++ b/libavcodec/proresenc_anatoliy.c
@@ -0,0 +1,630 @@
+/*
+ * Apple ProRes encoder
+ *
+ * Copyright (c) 2011 Anatoliy Wasserman
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Apple ProRes encoder (Anatoliy Wasserman version)
+ * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'acpo' (Proxy)
+ */
+
+#include "avcodec.h"
+#include "dct.h"
+#include "internal.h"
+#include "put_bits.h"
+#include "bytestream.h"
+#include "fdctdsp.h"
+
+#define DEFAULT_SLICE_MB_WIDTH 8
+
+#define FF_PROFILE_PRORES_PROXY     0
+#define FF_PROFILE_PRORES_LT        1
+#define FF_PROFILE_PRORES_STANDARD  2
+#define FF_PROFILE_PRORES_HQ        3
+
+static const AVProfile profiles[] = {
+    { FF_PROFILE_PRORES_PROXY,    "apco"},
+    { FF_PROFILE_PRORES_LT,       "apcs"},
+    { FF_PROFILE_PRORES_STANDARD, "apcn"},
+    { FF_PROFILE_PRORES_HQ,       "apch"},
+    { FF_PROFILE_UNKNOWN }
+};
+
+static const int qp_start_table[4] = { 4, 1, 1, 1 };
+static const int qp_end_table[4]   = { 8, 9, 6, 6 };
+static const int bitrate_table[5]  = { 1000, 2100, 3500, 5400 };
+
+static const uint8_t progressive_scan[64] = {
+     0,  1,  8,  9,  2,  3, 10, 11,
+    16, 17, 24, 25, 18, 19, 26, 27,
+     4,  5, 12, 20, 13,  6,  7, 14,
+    21, 28, 29, 22, 15, 23, 30, 31,
+    32, 33, 40, 48, 41, 34, 35, 42,
+    49, 56, 57, 50, 43, 36, 37, 44,
+    51, 58, 59, 52, 45, 38, 39, 46,
+    53, 60, 61, 54, 47, 55, 62, 63
+};
+
+static const uint8_t QMAT_LUMA[4][64] = {
+    {
+         4,  7,  9, 11, 13, 14, 15, 63,
+         7,  7, 11, 12, 14, 15, 63, 63,
+         9, 11, 13, 14, 15, 63, 63, 63,
+        11, 11, 13, 14, 63, 63, 63, 63,
+        11, 13, 14, 63, 63, 63, 63, 63,
+        13, 14, 63, 63, 63, 63, 63, 63,
+        13, 63, 63, 63, 63, 63, 63, 63,
+        63, 63, 63, 63, 63, 63, 63, 63
+    }, {
+         4,  5,  6,  7,  9, 11, 13, 15,
+         5,  5,  7,  8, 11, 13, 15, 17,
+         6,  7,  9, 11, 13, 15, 15, 17,
+         7,  7,  9, 11, 13, 15, 17, 19,
+         7,  9, 11, 13, 14, 16, 19, 23,
+         9, 11, 13, 14, 16, 19, 23, 29,
+         9, 11, 13, 15, 17, 21, 28, 35,
+        11, 13, 16, 17, 21, 28, 35, 41
+    }, {
+         4,  4,  5,  5,  6,  7,  7,  9,
+         4,  4,  5,  6,  7,  7,  9,  9,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  6,  7,  7,  8,  9, 10, 12,
+         6,  7,  7,  8,  9, 10, 12, 15,
+         6,  7,  7,  9, 10, 11, 14, 17,
+         7,  7,  9, 10, 11, 14, 17, 21
+    }, {
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  5,
+         4,  4,  4,  4,  4,  4,  5,  5,
+         4,  4,  4,  4,  4,  5,  5,  6,
+         4,  4,  4,  4,  5,  5,  6,  7,
+         4,  4,  4,  4,  5,  6,  7,  7
+    }
+};
+
+static const uint8_t QMAT_CHROMA[4][64] = {
+    {
+         4,  7,  9, 11, 13, 14, 63, 63,
+         7,  7, 11, 12, 14, 63, 63, 63,
+         9, 11, 13, 14, 63, 63, 63, 63,
+        11, 11, 13, 14, 63, 63, 63, 63,
+        11, 13, 14, 63, 63, 63, 63, 63,
+        13, 14, 63, 63, 63, 63, 63, 63,
+        13, 63, 63, 63, 63, 63, 63, 63,
+        63, 63, 63, 63, 63, 63, 63, 63
+    }, {
+         4,  5,  6,  7,  9, 11, 13, 15,
+         5,  5,  7,  8, 11, 13, 15, 17,
+         6,  7,  9, 11, 13, 15, 15, 17,
+         7,  7,  9, 11, 13, 15, 17, 19,
+         7,  9, 11, 13, 14, 16, 19, 23,
+         9, 11, 13, 14, 16, 19, 23, 29,
+         9, 11, 13, 15, 17, 21, 28, 35,
+        11, 13, 16, 17, 21, 28, 35, 41
+    }, {
+         4,  4,  5,  5,  6,  7,  7,  9,
+         4,  4,  5,  6,  7,  7,  9,  9,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  6,  7,  7,  8,  9, 10, 12,
+         6,  7,  7,  8,  9, 10, 12, 15,
+         6,  7,  7,  9, 10, 11, 14, 17,
+         7,  7,  9, 10, 11, 14, 17, 21
+    }, {
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  5,
+         4,  4,  4,  4,  4,  4,  5,  5,
+         4,  4,  4,  4,  4,  5,  5,  6,
+         4,  4,  4,  4,  5,  5,  6,  7,
+         4,  4,  4,  4,  5,  6,  7,  7
+    }
+};
+
+
+typedef struct {
+    FDCTDSPContext fdsp;
+    uint8_t* fill_y;
+    uint8_t* fill_u;
+    uint8_t* fill_v;
+
+    int qmat_luma[16][64];
+    int qmat_chroma[16][64];
+} ProresContext;
+
+static void encode_codeword(PutBitContext *pb, int val, int codebook)
+{
+    unsigned int rice_order, exp_order, switch_bits, first_exp, exp, zeros;
+
+    /* number of bits to switch between rice and exp golomb */
+    switch_bits = codebook & 3;
+    rice_order  = codebook >> 5;
+    exp_order   = (codebook >> 2) & 7;
+
+    first_exp = ((switch_bits + 1) << rice_order);
+
+    if (val >= first_exp) { /* exp golomb */
+        val -= first_exp;
+        val += (1 << exp_order);
+        exp = av_log2(val);
+        zeros = exp - exp_order + switch_bits + 1;
+        put_bits(pb, zeros, 0);
+        put_bits(pb, exp + 1, val);
+    } else if (rice_order) {
+        put_bits(pb, (val >> rice_order), 0);
+        put_bits(pb, 1, 1);
+        put_sbits(pb, rice_order, val);
+    } else {
+        put_bits(pb, val, 0);
+        put_bits(pb, 1, 1);
+    }
+}
+
+#define QSCALE(qmat,ind,val) ((val) / ((qmat)[ind]))
+#define TO_GOLOMB(val) (((val) << 1) ^ ((val) >> 31))
+#define DIFF_SIGN(val, sign) (((val) >> 31) ^ (sign))
+#define IS_NEGATIVE(val) ((((val) >> 31) ^ -1) + 1)
+#define TO_GOLOMB2(val,sign) ((val)==0 ? 0 : ((val) << 1) + (sign))
+
+static av_always_inline int get_level(int val)
+{
+    int sign = (val >> 31);
+    return (val ^ sign) - sign;
+}
+
+#define FIRST_DC_CB 0xB8
+
+static const uint8_t dc_codebook[7] = { 0x04, 0x28, 0x28, 0x4D, 0x4D, 0x70, 0x70};
+
+static void encode_dc_coeffs(PutBitContext *pb, int16_t *in,
+        int blocks_per_slice, int *qmat)
+{
+    int prev_dc, code;
+    int i, sign, idx;
+    int new_dc, delta, diff_sign, new_code;
+
+    prev_dc = QSCALE(qmat, 0, in[0] - 16384);
+    code = TO_GOLOMB(prev_dc);
+    encode_codeword(pb, code, FIRST_DC_CB);
+
+    code = 5; sign = 0; idx = 64;
+    for (i = 1; i < blocks_per_slice; i++, idx += 64) {
+        new_dc    = QSCALE(qmat, 0, in[idx] - 16384);
+        delta     = new_dc - prev_dc;
+        diff_sign = DIFF_SIGN(delta, sign);
+        new_code  = TO_GOLOMB2(get_level(delta), diff_sign);
+
+        encode_codeword(pb, new_code, dc_codebook[FFMIN(code, 6)]);
+
+        code      = new_code;
+        sign      = delta >> 31;
+        prev_dc   = new_dc;
+    }
+}
+
+static const uint8_t run_to_cb[16] = { 0x06, 0x06, 0x05, 0x05, 0x04, 0x29,
+        0x29, 0x29, 0x29, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x4C };
+static const uint8_t lev_to_cb[10] = { 0x04, 0x0A, 0x05, 0x06, 0x04, 0x28,
+        0x28, 0x28, 0x28, 0x4C };
+
+static void encode_ac_coeffs(AVCodecContext *avctx, PutBitContext *pb,
+        int16_t *in, int blocks_per_slice, int *qmat)
+{
+    int prev_run = 4;
+    int prev_level = 2;
+
+    int run = 0, level, code, i, j;
+    for (i = 1; i < 64; i++) {
+        int indp = progressive_scan[i];
+        for (j = 0; j < blocks_per_slice; j++) {
+            int val = QSCALE(qmat, indp, in[(j << 6) + indp]);
+            if (val) {
+                encode_codeword(pb, run, run_to_cb[FFMIN(prev_run, 15)]);
+
+                prev_run   = run;
+                run        = 0;
+                level      = get_level(val);
+                code       = level - 1;
+
+                encode_codeword(pb, code, lev_to_cb[FFMIN(prev_level, 9)]);
+
+                prev_level = level;
+
+                put_bits(pb, 1, IS_NEGATIVE(val));
+            } else {
+                ++run;
+            }
+        }
+    }
+}
+
+static void get(uint8_t *pixels, int stride, int16_t* block)
+{
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        AV_WN64(block, AV_RN64(pixels));
+        AV_WN64(block+4, AV_RN64(pixels+8));
+        pixels += stride;
+        block += 8;
+    }
+}
+
+static void fdct_get(FDCTDSPContext *fdsp, uint8_t *pixels, int stride, int16_t* block)
+{
+    get(pixels, stride, block);
+    fdsp->fdct(block);
+}
+
+static int encode_slice_plane(AVCodecContext *avctx, int mb_count,
+        uint8_t *src, int src_stride, uint8_t *buf, unsigned buf_size,
+        int *qmat, int chroma)
+{
+    ProresContext* ctx = avctx->priv_data;
+    FDCTDSPContext *fdsp = &ctx->fdsp;
+    LOCAL_ALIGNED(16, int16_t, blocks, [DEFAULT_SLICE_MB_WIDTH << 8]);
+    int16_t *block;
+    int i, blocks_per_slice;
+    PutBitContext pb;
+
+    block = blocks;
+    for (i = 0; i < mb_count; i++) {
+        fdct_get(fdsp, src,                  src_stride, block + (0 << 6));
+        fdct_get(fdsp, src + 8 * src_stride, src_stride, block + ((2 - chroma) << 6));
+        if (!chroma) {
+            fdct_get(fdsp, src + 16,                  src_stride, block + (1 << 6));
+            fdct_get(fdsp, src + 16 + 8 * src_stride, src_stride, block + (3 << 6));
+        }
+
+        block += (256 >> chroma);
+        src   += (32  >> chroma);
+    }
+
+    blocks_per_slice = mb_count << (2 - chroma);
+    init_put_bits(&pb, buf, buf_size);
+
+    encode_dc_coeffs(&pb, blocks, blocks_per_slice, qmat);
+    encode_ac_coeffs(avctx, &pb, blocks, blocks_per_slice, qmat);
+
+    flush_put_bits(&pb);
+    return put_bits_ptr(&pb) - pb.buf;
+}
+
+static av_always_inline unsigned encode_slice_data(AVCodecContext *avctx,
+        uint8_t *dest_y, uint8_t *dest_u, uint8_t *dest_v, int luma_stride,
+        int chroma_stride, unsigned mb_count, uint8_t *buf, unsigned data_size,
+        unsigned* y_data_size, unsigned* u_data_size, unsigned* v_data_size,
+        int qp)
+{
+    ProresContext* ctx = avctx->priv_data;
+
+    *y_data_size = encode_slice_plane(avctx, mb_count, dest_y, luma_stride,
+            buf, data_size, ctx->qmat_luma[qp - 1], 0);
+
+    if (!(avctx->flags & AV_CODEC_FLAG_GRAY)) {
+        *u_data_size = encode_slice_plane(avctx, mb_count, dest_u,
+                chroma_stride, buf + *y_data_size, data_size - *y_data_size,
+                ctx->qmat_chroma[qp - 1], 1);
+
+        *v_data_size = encode_slice_plane(avctx, mb_count, dest_v,
+                chroma_stride, buf + *y_data_size + *u_data_size,
+                data_size - *y_data_size - *u_data_size,
+                ctx->qmat_chroma[qp - 1], 1);
+    }
+
+    return *y_data_size + *u_data_size + *v_data_size;
+}
+
+static void subimage_with_fill(uint16_t *src, unsigned x, unsigned y,
+        unsigned stride, unsigned width, unsigned height, uint16_t *dst,
+        unsigned dst_width, unsigned dst_height)
+{
+
+    int box_width = FFMIN(width - x, dst_width);
+    int box_height = FFMIN(height - y, dst_height);
+    int i, j, src_stride = stride >> 1;
+    uint16_t last_pix, *last_line;
+
+    src += y * src_stride + x;
+    for (i = 0; i < box_height; ++i) {
+        for (j = 0; j < box_width; ++j) {
+            dst[j] = src[j];
+        }
+        last_pix = dst[j - 1];
+        for (; j < dst_width; j++)
+            dst[j] = last_pix;
+        src += src_stride;
+        dst += dst_width;
+    }
+    last_line = dst - dst_width;
+    for (; i < dst_height; i++) {
+        for (j = 0; j < dst_width; ++j) {
+            dst[j] = last_line[j];
+        }
+        dst += dst_width;
+    }
+}
+
+static int encode_slice(AVCodecContext *avctx, const AVFrame *pic, int mb_x,
+        int mb_y, unsigned mb_count, uint8_t *buf, unsigned data_size,
+        int unsafe, int *qp)
+{
+    int luma_stride, chroma_stride;
+    int hdr_size = 6, slice_size;
+    uint8_t *dest_y, *dest_u, *dest_v;
+    unsigned y_data_size = 0, u_data_size = 0, v_data_size = 0;
+    ProresContext* ctx = avctx->priv_data;
+    int tgt_bits   = (mb_count * bitrate_table[avctx->profile]) >> 2;
+    int low_bytes  = (tgt_bits - (tgt_bits >> 3)) >> 3; // 12% bitrate fluctuation
+    int high_bytes = (tgt_bits + (tgt_bits >> 3)) >> 3;
+
+    luma_stride   = pic->linesize[0];
+    chroma_stride = pic->linesize[1];
+
+    dest_y = pic->data[0] + (mb_y << 4) * luma_stride   + (mb_x << 5);
+    dest_u = pic->data[1] + (mb_y << 4) * chroma_stride + (mb_x << 4);
+    dest_v = pic->data[2] + (mb_y << 4) * chroma_stride + (mb_x << 4);
+
+    if (unsafe) {
+
+        subimage_with_fill((uint16_t *) pic->data[0], mb_x << 4, mb_y << 4,
+                luma_stride, avctx->width, avctx->height,
+                (uint16_t *) ctx->fill_y, mb_count << 4, 16);
+        subimage_with_fill((uint16_t *) pic->data[1], mb_x << 3, mb_y << 4,
+                chroma_stride, avctx->width >> 1, avctx->height,
+                (uint16_t *) ctx->fill_u, mb_count << 3, 16);
+        subimage_with_fill((uint16_t *) pic->data[2], mb_x << 3, mb_y << 4,
+                chroma_stride, avctx->width >> 1, avctx->height,
+                (uint16_t *) ctx->fill_v, mb_count << 3, 16);
+
+        encode_slice_data(avctx, ctx->fill_y, ctx->fill_u, ctx->fill_v,
+                mb_count << 5, mb_count << 4, mb_count, buf + hdr_size,
+                data_size - hdr_size, &y_data_size, &u_data_size, &v_data_size,
+                *qp);
+    } else {
+        slice_size = encode_slice_data(avctx, dest_y, dest_u, dest_v,
+                luma_stride, chroma_stride, mb_count, buf + hdr_size,
+                data_size - hdr_size, &y_data_size, &u_data_size, &v_data_size,
+                *qp);
+
+        if (slice_size > high_bytes && *qp < qp_end_table[avctx->profile]) {
+            do {
+                *qp += 1;
+                slice_size = encode_slice_data(avctx, dest_y, dest_u, dest_v,
+                        luma_stride, chroma_stride, mb_count, buf + hdr_size,
+                        data_size - hdr_size, &y_data_size, &u_data_size,
+                        &v_data_size, *qp);
+            } while (slice_size > high_bytes && *qp < qp_end_table[avctx->profile]);
+        } else if (slice_size < low_bytes && *qp
+                > qp_start_table[avctx->profile]) {
+            do {
+                *qp -= 1;
+                slice_size = encode_slice_data(avctx, dest_y, dest_u, dest_v,
+                        luma_stride, chroma_stride, mb_count, buf + hdr_size,
+                        data_size - hdr_size, &y_data_size, &u_data_size,
+                        &v_data_size, *qp);
+            } while (slice_size < low_bytes && *qp > qp_start_table[avctx->profile]);
+        }
+    }
+
+    buf[0] = hdr_size << 3;
+    buf[1] = *qp;
+    AV_WB16(buf + 2, y_data_size);
+    AV_WB16(buf + 4, u_data_size);
+
+    return hdr_size + y_data_size + u_data_size + v_data_size;
+}
+
+static int prores_encode_picture(AVCodecContext *avctx, const AVFrame *pic,
+        uint8_t *buf, const int buf_size)
+{
+    int mb_width = (avctx->width + 15) >> 4;
+    int mb_height = (avctx->height + 15) >> 4;
+    int hdr_size, sl_size, i;
+    int mb_y, sl_data_size, qp;
+    int unsafe_bot, unsafe_right;
+    uint8_t *sl_data, *sl_data_sizes;
+    int slice_per_line = 0, rem = mb_width;
+
+    for (i = av_log2(DEFAULT_SLICE_MB_WIDTH); i >= 0; --i) {
+        slice_per_line += rem >> i;
+        rem &= (1 << i) - 1;
+    }
+
+    qp = qp_start_table[avctx->profile];
+    hdr_size = 8; sl_data_size = buf_size - hdr_size;
+    sl_data_sizes = buf + hdr_size;
+    sl_data = sl_data_sizes + (slice_per_line * mb_height * 2);
+    for (mb_y = 0; mb_y < mb_height; mb_y++) {
+        int mb_x = 0;
+        int slice_mb_count = DEFAULT_SLICE_MB_WIDTH;
+        while (mb_x < mb_width) {
+            while (mb_width - mb_x < slice_mb_count)
+                slice_mb_count >>= 1;
+
+            unsafe_bot = (avctx->height & 0xf) && (mb_y == mb_height - 1);
+            unsafe_right = (avctx->width & 0xf) && (mb_x + slice_mb_count == mb_width);
+
+            sl_size = encode_slice(avctx, pic, mb_x, mb_y, slice_mb_count,
+                    sl_data, sl_data_size, unsafe_bot || unsafe_right, &qp);
+
+            bytestream_put_be16(&sl_data_sizes, sl_size);
+            sl_data           += sl_size;
+            sl_data_size      -= sl_size;
+            mb_x              += slice_mb_count;
+        }
+    }
+
+    buf[0] = hdr_size << 3;
+    AV_WB32(buf + 1, sl_data - buf);
+    AV_WB16(buf + 5, slice_per_line * mb_height);
+    buf[7] = av_log2(DEFAULT_SLICE_MB_WIDTH) << 4;
+
+    return sl_data - buf;
+}
+
+static int prores_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                               const AVFrame *pict, int *got_packet)
+{
+    int header_size = 148;
+    uint8_t *buf;
+    int pic_size, ret;
+    int frame_size = FFALIGN(avctx->width, 16) * FFALIGN(avctx->height, 16)*16 + 500 + AV_INPUT_BUFFER_MIN_SIZE; //FIXME choose tighter limit
+
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, frame_size + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+
+    buf = pkt->data;
+    pic_size = prores_encode_picture(avctx, pict, buf + header_size + 8,
+            pkt->size - header_size - 8);
+
+    bytestream_put_be32(&buf, pic_size + 8 + header_size);
+    bytestream_put_buffer(&buf, "icpf", 4);
+
+    bytestream_put_be16(&buf, header_size);
+    bytestream_put_be16(&buf, 0);
+    bytestream_put_buffer(&buf, "fmpg", 4);
+    bytestream_put_be16(&buf, avctx->width);
+    bytestream_put_be16(&buf, avctx->height);
+    *buf++ = 0x83; // {10}(422){00}{00}(frame){11}
+    *buf++ = 0;
+    *buf++ = 2;
+    *buf++ = 2;
+    *buf++ = 6;
+    *buf++ = 32;
+    *buf++ = 0;
+    *buf++ = 3;
+
+    bytestream_put_buffer(&buf, QMAT_LUMA[avctx->profile],   64);
+    bytestream_put_buffer(&buf, QMAT_CHROMA[avctx->profile], 64);
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    pkt->size = pic_size + 8 + header_size;
+    *got_packet = 1;
+
+    return 0;
+}
+
+static void scale_mat(const uint8_t* src, int* dst, int scale)
+{
+    int i;
+    for (i = 0; i < 64; i++)
+        dst[i] = src[i] * scale;
+}
+
+static av_cold int prores_encode_init(AVCodecContext *avctx)
+{
+    int i;
+    ProresContext* ctx = avctx->priv_data;
+
+    if (avctx->pix_fmt != AV_PIX_FMT_YUV422P10) {
+        av_log(avctx, AV_LOG_ERROR, "need YUV422P10\n");
+        return AVERROR_PATCHWELCOME;
+    }
+    avctx->bits_per_raw_sample = 10;
+
+    if (avctx->width & 0x1) {
+        av_log(avctx, AV_LOG_ERROR,
+                "frame width needs to be multiple of 2\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->width > 65534 || avctx->height > 65535) {
+        av_log(avctx, AV_LOG_ERROR,
+                "The maximum dimensions are 65534x65535\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((avctx->height & 0xf) || (avctx->width & 0xf)) {
+        ctx->fill_y = av_malloc(4 * (DEFAULT_SLICE_MB_WIDTH << 8));
+        if (!ctx->fill_y)
+            return AVERROR(ENOMEM);
+        ctx->fill_u = ctx->fill_y + (DEFAULT_SLICE_MB_WIDTH << 9);
+        ctx->fill_v = ctx->fill_u + (DEFAULT_SLICE_MB_WIDTH << 8);
+    }
+
+    if (avctx->profile == FF_PROFILE_UNKNOWN) {
+        avctx->profile = FF_PROFILE_PRORES_STANDARD;
+        av_log(avctx, AV_LOG_INFO,
+                "encoding with ProRes standard (apcn) profile\n");
+
+    } else if (avctx->profile < FF_PROFILE_PRORES_PROXY
+            || avctx->profile > FF_PROFILE_PRORES_HQ) {
+        av_log(
+                avctx,
+                AV_LOG_ERROR,
+                "unknown profile %d, use [0 - apco, 1 - apcs, 2 - apcn (default), 3 - apch]\n",
+                avctx->profile);
+        return AVERROR(EINVAL);
+    }
+
+    ff_fdctdsp_init(&ctx->fdsp, avctx);
+
+    avctx->codec_tag = AV_RL32((const uint8_t*)profiles[avctx->profile].name);
+
+    for (i = 1; i <= 16; i++) {
+        scale_mat(QMAT_LUMA[avctx->profile]  , ctx->qmat_luma[i - 1]  , i);
+        scale_mat(QMAT_CHROMA[avctx->profile], ctx->qmat_chroma[i - 1], i);
+    }
+
+    return 0;
+}
+
+static av_cold int prores_encode_close(AVCodecContext *avctx)
+{
+    ProresContext* ctx = avctx->priv_data;
+    av_freep(&ctx->fill_y);
+
+    return 0;
+}
+
+AVCodec ff_prores_aw_encoder = {
+    .name           = "prores_aw",
+    .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_PRORES,
+    .priv_data_size = sizeof(ProresContext),
+    .init           = prores_encode_init,
+    .close          = prores_encode_close,
+    .encode2        = prores_encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUV422P10, AV_PIX_FMT_NONE},
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .profiles       = profiles
+};
+
+AVCodec ff_prores_encoder = {
+    .name           = "prores",
+    .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_PRORES,
+    .priv_data_size = sizeof(ProresContext),
+    .init           = prores_encode_init,
+    .close          = prores_encode_close,
+    .encode2        = prores_encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUV422P10, AV_PIX_FMT_NONE},
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .profiles       = profiles
+};
diff --git a/libavcodec/proresenc.c b/libavcodec/proresenc_kostya.c
index 0564b12ae2..3bc1d5d772 100644
--- a/libavcodec/proresenc.c
+++ b/libavcodec/proresenc_kostya.c
@@ -3,20 +3,23 @@
  *
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This encoder appears to be based on Anatoliy Wassermans considering
+ * similarities in the bugs.
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,6 +40,7 @@
 #define MAX_PLANES 4
 
 enum {
+    PRORES_PROFILE_AUTO  = -1,
     PRORES_PROFILE_PROXY = 0,
     PRORES_PROFILE_LT,
     PRORES_PROFILE_STANDARD,
@@ -437,12 +441,11 @@ static int encode_slice_plane(ProresContext *ctx, PutBitContext *pb,
 
 static void put_alpha_diff(PutBitContext *pb, int cur, int prev, int abits)
 {
-    const int mask  = (1 << abits) - 1;
     const int dbits = (abits == 8) ? 4 : 7;
     const int dsize = 1 << dbits - 1;
     int diff = cur - prev;
 
-    diff &= mask;
+    diff = av_mod_uintp2(diff, abits);
     if (diff >= (1 << abits) - dsize)
         diff -= 1 << abits;
     if (diff < -dsize || diff > dsize || !diff) {
@@ -686,12 +689,11 @@ static int estimate_slice_plane(ProresContext *ctx, int *error, int plane,
 
 static int est_alpha_diff(int cur, int prev, int abits)
 {
-    const int mask  = (1 << abits) - 1;
     const int dbits = (abits == 8) ? 4 : 7;
     const int dsize = 1 << dbits - 1;
     int diff = cur - prev;
 
-    diff &= mask;
+    diff = av_mod_uintp2(diff, abits);
     if (diff >= (1 << abits) - dsize)
         diff -= 1 << abits;
     if (diff < -dsize || diff > dsize || !diff)
@@ -935,23 +937,15 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int sizes[4] = { 0 };
     int slice_hdr_size = 2 + 2 * (ctx->num_planes - 1);
     int frame_size, picture_size, slice_size;
-    int pkt_size, ret, max_slice_size = 0;
+    int pkt_size, ret;
+    int max_slice_size = (ctx->frame_size_upper_bound - 200) / (ctx->pictures_per_frame * ctx->slices_per_picture + 1);
     uint8_t frame_flags;
 
     ctx->pic = pic;
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
     pkt_size = ctx->frame_size_upper_bound;
 
-    if ((ret = ff_alloc_packet(pkt, pkt_size + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, pkt_size + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     orig_buf = pkt->data;
 
@@ -1008,7 +1002,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
         // slices
         if (!ctx->force_quant) {
-            ret = avctx->execute2(avctx, find_quant_thread, NULL, NULL,
+            ret = avctx->execute2(avctx, find_quant_thread, (void*)pic, NULL,
                                   ctx->mb_height);
             if (ret)
                 return ret;
@@ -1030,9 +1024,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     uint8_t *start = pkt->data;
                     // Recompute new size according to max_slice_size
                     // and deduce delta
-                    int delta = 200 + ctx->pictures_per_frame *
-                                ctx->slices_per_picture * max_slice_size -
-                                pkt_size;
+                    int delta = 200 + (ctx->pictures_per_frame *
+                                ctx->slices_per_picture + 1) *
+                                max_slice_size - pkt_size;
 
                     delta = FFMAX(delta, 2 * max_slice_size);
                     ctx->frame_size_upper_bound += delta;
@@ -1059,7 +1053,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     slice_hdr        = pkt->data + (slice_hdr        - start);
                     tmp              = pkt->data + (tmp              - start);
                 }
-                init_put_bits(&pb, buf, (pkt_size - (buf - orig_buf)) * 8);
+                init_put_bits(&pb, buf, (pkt_size - (buf - orig_buf)));
                 ret = encode_slice(avctx, pic, &pb, sizes, x, y, q,
                                    mbs_per_slice);
                 if (ret < 0)
@@ -1078,10 +1072,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             }
         }
 
-        if (ctx->pictures_per_frame == 1)
-            picture_size = buf - picture_size_pos - 6;
-        else
-            picture_size = buf - picture_size_pos + 1;
+        picture_size = buf - (picture_size_pos - 1);
         bytestream_put_be32(&picture_size_pos, picture_size);
     }
 
@@ -1103,7 +1094,7 @@ static av_cold int encode_close(AVCodecContext *avctx)
 
     if (ctx->tdata) {
         for (i = 0; i < avctx->thread_count; i++)
-            av_free(ctx->tdata[i].nodes);
+            av_freep(&ctx->tdata[i].nodes);
     }
     av_freep(&ctx->tdata);
     av_freep(&ctx->slice_q);
@@ -1134,6 +1125,12 @@ static av_cold int encode_init(AVCodecContext *avctx)
     int interlaced = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
 
     avctx->bits_per_raw_sample = 10;
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+    avctx->coded_frame->key_frame = 1;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
 
     ctx->fdct      = prores_fdct;
     ctx->scantable = interlaced ? ff_prores_interlaced_scan
@@ -1146,7 +1143,23 @@ static av_cold int encode_init(AVCodecContext *avctx)
                "there should be an integer power of two MBs per slice\n");
         return AVERROR(EINVAL);
     }
+    if (ctx->profile == PRORES_PROFILE_AUTO) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+        ctx->profile = (desc->flags & AV_PIX_FMT_FLAG_ALPHA ||
+                        !(desc->log2_chroma_w + desc->log2_chroma_h))
+                     ? PRORES_PROFILE_4444 : PRORES_PROFILE_HQ;
+        av_log(avctx, AV_LOG_INFO, "Autoselected %s. It can be overridden "
+               "through -profile option.\n", ctx->profile == PRORES_PROFILE_4444
+               ? "4:4:4:4 profile because of the used input colorspace"
+               : "HQ profile to keep best quality");
+    }
     if (av_pix_fmt_desc_get(avctx->pix_fmt)->flags & AV_PIX_FMT_FLAG_ALPHA) {
+        if (ctx->profile != PRORES_PROFILE_4444) {
+            // force alpha and warn
+            av_log(avctx, AV_LOG_WARNING, "Profile selected will not "
+                   "encode alpha. Override with -profile if needed.\n");
+            ctx->alpha_bits = 0;
+        }
         if (ctx->alpha_bits & 7) {
             av_log(avctx, AV_LOG_ERROR, "alpha bits should be 0, 8 or 16\n");
             return AVERROR(EINVAL);
@@ -1248,16 +1261,16 @@ static av_cold int encode_init(AVCodecContext *avctx)
             ctx->bits_per_mb += ls * 4;
     }
 
-    ctx->frame_size_upper_bound = ctx->pictures_per_frame *
-                                  ctx->slices_per_picture *
+    ctx->frame_size_upper_bound = (ctx->pictures_per_frame *
+                                   ctx->slices_per_picture + 1) *
                                   (2 + 2 * ctx->num_planes +
                                    (mps * ctx->bits_per_mb) / 8)
                                   + 200;
 
     if (ctx->alpha_bits) {
          // The alpha plane is run-coded and might exceed the bit budget.
-         ctx->frame_size_upper_bound += ctx->pictures_per_frame *
-                                        ctx->slices_per_picture *
+         ctx->frame_size_upper_bound += (ctx->pictures_per_frame *
+                                         ctx->slices_per_picture + 1) *
          /* num pixels per slice */     (ctx->mbs_per_slice * 256 *
          /* bits per pixel */            (1 + ctx->alpha_bits + 1) + 7 >> 3);
     }
@@ -1281,8 +1294,10 @@ static const AVOption options[] = {
     { "mbs_per_slice", "macroblocks per slice", OFFSET(mbs_per_slice),
         AV_OPT_TYPE_INT, { .i64 = 8 }, 1, MAX_MBS_PER_SLICE, VE },
     { "profile",       NULL, OFFSET(profile), AV_OPT_TYPE_INT,
-        { .i64 = PRORES_PROFILE_STANDARD },
-        PRORES_PROFILE_PROXY, PRORES_PROFILE_4444, VE, "profile" },
+        { .i64 = PRORES_PROFILE_AUTO },
+        PRORES_PROFILE_AUTO, PRORES_PROFILE_4444, VE, "profile" },
+    { "auto",         NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_AUTO },
+        0, 0, VE, "profile" },
     { "proxy",         NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_PROXY },
         0, 0, VE, "profile" },
     { "lt",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_LT },
@@ -1323,8 +1338,8 @@ static const AVClass proresenc_class = {
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
-AVCodec ff_prores_encoder = {
-    .name           = "prores",
+AVCodec ff_prores_ks_encoder = {
+    .name           = "prores_ks",
     .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes (iCodec Pro)"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_PRORES,
diff --git a/libavcodec/psymodel.c b/libavcodec/psymodel.c
index 5179ede083..824eefb79e 100644
--- a/libavcodec/psymodel.c
+++ b/libavcodec/psymodel.c
@@ -2,20 +2,20 @@
  * audio encoder psychoacoustic model
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,10 +35,10 @@ av_cold int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx, int num_lens,
     int i, j, k = 0;
 
     ctx->avctx = avctx;
-    ctx->ch        = av_mallocz(sizeof(ctx->ch[0]) * avctx->channels * 2);
-    ctx->group     = av_mallocz(sizeof(ctx->group[0]) * num_groups);
-    ctx->bands     = av_malloc (sizeof(ctx->bands[0])     * num_lens);
-    ctx->num_bands = av_malloc (sizeof(ctx->num_bands[0]) * num_lens);
+    ctx->ch        = av_mallocz_array(sizeof(ctx->ch[0]), avctx->channels * 2);
+    ctx->group     = av_mallocz_array(sizeof(ctx->group[0]), num_groups);
+    ctx->bands     = av_malloc_array (sizeof(ctx->bands[0]),      num_lens);
+    ctx->num_bands = av_malloc_array (sizeof(ctx->num_bands[0]),  num_lens);
 
     if (!ctx->ch || !ctx->group || !ctx->bands || !ctx->num_bands) {
         ff_psy_end(ctx);
@@ -81,7 +81,7 @@ FFPsyChannelGroup *ff_psy_find_group(FFPsyContext *ctx, int channel)
 
 av_cold void ff_psy_end(FFPsyContext *ctx)
 {
-    if (ctx->model->end)
+    if (ctx->model && ctx->model->end)
         ctx->model->end(ctx);
     av_freep(&ctx->bands);
     av_freep(&ctx->num_bands);
@@ -94,6 +94,7 @@ typedef struct FFPsyPreprocessContext{
     float stereo_att;
     struct FFIIRFilterCoeffs *fcoeffs;
     struct FFIIRFilterState **fstate;
+    struct FFIIRFilterContext fiir;
 }FFPsyPreprocessContext;
 
 #define FILT_ORDER 4
@@ -111,12 +112,15 @@ av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *av
     if (avctx->cutoff > 0)
         cutoff_coeff = 2.0 * avctx->cutoff / avctx->sample_rate;
 
-    if (cutoff_coeff)
+    if (!cutoff_coeff && avctx->codec_id == AV_CODEC_ID_AAC)
+        cutoff_coeff = 2.0 * AAC_CUTOFF(avctx) / avctx->sample_rate;
+
+    if (cutoff_coeff && cutoff_coeff < 0.98)
     ctx->fcoeffs = ff_iir_filter_init_coeffs(avctx, FF_FILTER_TYPE_BUTTERWORTH,
                                              FF_FILTER_MODE_LOWPASS, FILT_ORDER,
                                              cutoff_coeff, 0.0, 0.0);
     if (ctx->fcoeffs) {
-        ctx->fstate = av_mallocz(sizeof(ctx->fstate[0]) * avctx->channels);
+        ctx->fstate = av_mallocz_array(sizeof(ctx->fstate[0]), avctx->channels);
         if (!ctx->fstate) {
             av_free(ctx);
             return NULL;
@@ -124,6 +128,9 @@ av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *av
         for (i = 0; i < avctx->channels; i++)
             ctx->fstate[i] = ff_iir_filter_init_state(FILT_ORDER);
     }
+
+    ff_iir_filter_init(&ctx->fiir);
+
     return ctx;
 }
 
@@ -131,21 +138,22 @@ void ff_psy_preprocess(struct FFPsyPreprocessContext *ctx, float **audio, int ch
 {
     int ch;
     int frame_size = ctx->avctx->frame_size;
+    FFIIRFilterContext *iir = &ctx->fiir;
 
     if (ctx->fstate) {
         for (ch = 0; ch < channels; ch++)
-            ff_iir_filter_flt(ctx->fcoeffs, ctx->fstate[ch], frame_size,
-                              &audio[ch][frame_size], 1, &audio[ch][frame_size], 1);
+            iir->filter_flt(ctx->fcoeffs, ctx->fstate[ch], frame_size,
+                            &audio[ch][frame_size], 1, &audio[ch][frame_size], 1);
     }
 }
 
 av_cold void ff_psy_preprocess_end(struct FFPsyPreprocessContext *ctx)
 {
     int i;
-    ff_iir_filter_free_coeffs(ctx->fcoeffs);
+    ff_iir_filter_free_coeffsp(&ctx->fcoeffs);
     if (ctx->fstate)
         for (i = 0; i < ctx->avctx->channels; i++)
-            ff_iir_filter_free_state(ctx->fstate[i]);
+            ff_iir_filter_free_statep(&ctx->fstate[i]);
     av_freep(&ctx->fstate);
     av_free(ctx);
 }
diff --git a/libavcodec/psymodel.h b/libavcodec/psymodel.h
index 1cc30668e4..a04cc4d226 100644
--- a/libavcodec/psymodel.h
+++ b/libavcodec/psymodel.h
@@ -2,20 +2,20 @@
  * audio encoder psychoacoustic model
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,8 @@
 /** maximum number of channels */
 #define PSY_MAX_CHANS 20
 
+#define AAC_CUTOFF(s) ((s)->bit_rate ? FFMIN3(4000 + (s)->bit_rate/8, 12000 + (s)->bit_rate/32, (s)->sample_rate / 2) : ((s)->sample_rate / 2))
+
 /**
  * single band psychoacoustic information
  */
@@ -36,8 +38,7 @@ typedef struct FFPsyBand {
     int   bits;
     float energy;
     float threshold;
-    float distortion;
-    float perceptual_weight;
+    float spread;    /* Energy spread over the band */
 } FFPsyBand;
 
 /**
@@ -65,6 +66,7 @@ typedef struct FFPsyWindowInfo {
     int window_shape;                 ///< window shape (sine/KBD/whatever)
     int num_windows;                  ///< number of windows in a frame
     int grouping[8];                  ///< window grouping (for e.g. AAC)
+    float clipping[8];                ///< maximum absolute normalized intensity in the given window for clip avoidance
     int *window_sizes;                ///< sequence of window sizes inside one frame (for eg. WMA)
 } FFPsyWindowInfo;
 
@@ -86,6 +88,7 @@ typedef struct FFPsyContext {
     struct {
         int size;                     ///< size of the bitresevoir in bits
         int bits;                     ///< number of bits used in the bitresevoir
+        int alloc;                    ///< number of bits allocated by the psy, or -1 if no allocation was done
     } bitres;
 
     void* model_priv_data;            ///< psychoacoustic model implementation private data
diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c
index 3b3f3ad8b4..572471586d 100644
--- a/libavcodec/pthread.c
+++ b/libavcodec/pthread.c
@@ -6,20 +6,20 @@
  * to Michael Niedermayer <michaelni@gmx.at> for writing initial
  * implementation.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c
index c72da533aa..c89b4ca2e0 100644
--- a/libavcodec/pthread_frame.c
+++ b/libavcodec/pthread_frame.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,8 @@
 #include <pthread.h>
 #elif HAVE_W32THREADS
 #include "compat/w32pthreads.h"
+#elif HAVE_OS2THREADS
+#include "compat/os2threads.h"
 #endif
 
 #include "avcodec.h"
@@ -46,6 +48,7 @@
 #include "libavutil/internal.h"
 #include "libavutil/log.h"
 #include "libavutil/mem.h"
+#include "libavutil/opt.h"
 
 /**
  * Context used by codec threads and stored in their AVCodecInternal thread_ctx.
@@ -77,6 +80,10 @@ typedef struct PerThreadContext {
                                      * Set when the codec calls get_buffer().
                                      * State is returned to STATE_SETTING_UP afterwards.
                                      */
+        STATE_GET_FORMAT,           /**<
+                                     * Set when the codec calls get_format().
+                                     * State is returned to STATE_SETTING_UP afterwards.
+                                     */
         STATE_SETUP_FINISHED        ///< Set after the codec has called ff_thread_finish_setup().
     } state;
 
@@ -90,6 +97,9 @@ typedef struct PerThreadContext {
 
     AVFrame *requested_frame;       ///< AVFrame the codec passed to get_buffer()
     int      requested_flags;       ///< flags passed to get_buffer() for requested_frame
+
+    const enum AVPixelFormat *available_formats; ///< Format array for get_format()
+    enum AVPixelFormat result_format;            ///< get_format() result
 } PerThreadContext;
 
 /**
@@ -112,6 +122,9 @@ typedef struct FrameThreadContext {
     int die;                       ///< Set when threads should exit.
 } FrameThreadContext;
 
+#define THREAD_SAFE_CALLBACKS(avctx) \
+((avctx)->thread_safe_callbacks || (avctx)->get_buffer2 == avcodec_default_get_buffer2)
+
 /**
  * Codec worker thread.
  *
@@ -126,20 +139,16 @@ static attribute_align_arg void *frame_worker_thread(void *arg)
     AVCodecContext *avctx = p->avctx;
     const AVCodec *codec = avctx->codec;
 
+    pthread_mutex_lock(&p->mutex);
     while (1) {
-        if (p->state == STATE_INPUT_READY && !fctx->die) {
-            pthread_mutex_lock(&p->mutex);
             while (p->state == STATE_INPUT_READY && !fctx->die)
                 pthread_cond_wait(&p->input_cond, &p->mutex);
-            pthread_mutex_unlock(&p->mutex);
-        }
 
         if (fctx->die) break;
 
-        if (!codec->update_thread_context && avctx->thread_safe_callbacks)
+        if (!codec->update_thread_context && THREAD_SAFE_CALLBACKS(avctx))
             ff_thread_finish_setup(avctx);
 
-        pthread_mutex_lock(&p->mutex);
         av_frame_unref(p->frame);
         p->got_frame = 0;
         p->result = codec->decode(avctx, p->frame, &p->got_frame, &p->avpkt);
@@ -153,14 +162,21 @@ static attribute_align_arg void *frame_worker_thread(void *arg)
 
         if (p->state == STATE_SETTING_UP) ff_thread_finish_setup(avctx);
 
+        pthread_mutex_lock(&p->progress_mutex);
+#if 0 //BUFREF-FIXME
+        for (i = 0; i < MAX_BUFFERS; i++)
+            if (p->progress_used[i] && (p->got_frame || p->result<0 || avctx->codec_id != AV_CODEC_ID_H264)) {
+                p->progress[i][0] = INT_MAX;
+                p->progress[i][1] = INT_MAX;
+            }
+#endif
         p->state = STATE_INPUT_READY;
 
-        pthread_mutex_lock(&p->progress_mutex);
+        pthread_cond_broadcast(&p->progress_cond);
         pthread_cond_signal(&p->output_cond);
         pthread_mutex_unlock(&p->progress_mutex);
-
-        pthread_mutex_unlock(&p->mutex);
     }
+    pthread_mutex_unlock(&p->mutex);
 
     return NULL;
 }
@@ -211,10 +227,16 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
         dst->hwaccel = src->hwaccel;
         dst->hwaccel_context = src->hwaccel_context;
+
+        dst->channels       = src->channels;
+        dst->sample_rate    = src->sample_rate;
+        dst->sample_fmt     = src->sample_fmt;
+        dst->channel_layout = src->channel_layout;
         dst->internal->hwaccel_priv_data = src->internal->hwaccel_priv_data;
     }
 
     if (for_user) {
+        dst->delay       = src->thread_count - 1;
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
         dst->coded_frame = src->coded_frame;
@@ -245,6 +267,7 @@ static int update_context_from_user(AVCodecContext *dst, AVCodecContext *src)
 
     dst->opaque   = src->opaque;
     dst->debug    = src->debug;
+    dst->debug_mv = src->debug_mv;
 
     dst->slice_flags = src->slice_flags;
     dst->flags2      = src->flags2;
@@ -253,16 +276,14 @@ static int update_context_from_user(AVCodecContext *dst, AVCodecContext *src)
 
     dst->frame_number     = src->frame_number;
     dst->reordered_opaque = src->reordered_opaque;
+    dst->thread_safe_callbacks = src->thread_safe_callbacks;
 
     if (src->slice_count && src->slice_offset) {
         if (dst->slice_count < src->slice_count) {
-            int *tmp = av_realloc(dst->slice_offset, src->slice_count *
-                                  sizeof(*dst->slice_offset));
-            if (!tmp) {
-                av_free(dst->slice_offset);
-                return AVERROR(ENOMEM);
-            }
-            dst->slice_offset = tmp;
+            int err = av_reallocp_array(&dst->slice_offset, src->slice_count,
+                                        sizeof(*dst->slice_offset));
+            if (err < 0)
+                return err;
         }
         memcpy(dst->slice_offset, src->slice_offset,
                src->slice_count * sizeof(*dst->slice_offset));
@@ -283,7 +304,8 @@ static void release_delayed_buffers(PerThreadContext *p)
         pthread_mutex_lock(&fctx->buffer_mutex);
 
         // fix extended data in case the caller screwed it up
-        av_assert0(p->avctx->codec_type == AVMEDIA_TYPE_VIDEO);
+        av_assert0(p->avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
+                   p->avctx->codec_type == AVMEDIA_TYPE_AUDIO);
         f = &p->released_buffers[--p->num_released_buffers];
         f->extended_data = f->data;
         av_frame_unref(f);
@@ -334,15 +356,27 @@ static int submit_packet(PerThreadContext *p, AVPacket *avpkt)
      * and it calls back to the client here.
      */
 
-    if (!p->avctx->thread_safe_callbacks &&
-        p->avctx->get_buffer2 != avcodec_default_get_buffer2) {
+    if (!p->avctx->thread_safe_callbacks && (
+         p->avctx->get_format != avcodec_default_get_format ||
+         p->avctx->get_buffer2 != avcodec_default_get_buffer2)) {
         while (p->state != STATE_SETUP_FINISHED && p->state != STATE_INPUT_READY) {
+            int call_done = 1;
             pthread_mutex_lock(&p->progress_mutex);
             while (p->state == STATE_SETTING_UP)
                 pthread_cond_wait(&p->progress_cond, &p->progress_mutex);
 
-            if (p->state == STATE_GET_BUFFER) {
+            switch (p->state) {
+            case STATE_GET_BUFFER:
                 p->result = ff_get_buffer(p->avctx, p->requested_frame, p->requested_flags);
+                break;
+            case STATE_GET_FORMAT:
+                p->result_format = ff_get_format(p->avctx, p->available_formats);
+                break;
+            default:
+                call_done = 0;
+                break;
+            }
+            if (call_done) {
                 p->state  = STATE_SETTING_UP;
                 pthread_cond_signal(&p->progress_cond);
             }
@@ -379,9 +413,10 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
      * If we're still receiving the initial packets, don't return a frame.
      */
 
-    if (fctx->delaying) {
-        if (fctx->next_decoding >= (avctx->thread_count-1)) fctx->delaying = 0;
+    if (fctx->next_decoding > (avctx->thread_count-1-(avctx->codec_id == AV_CODEC_ID_FFV1)))
+        fctx->delaying = 0;
 
+    if (fctx->delaying) {
         *got_picture_ptr=0;
         if (avpkt->size)
             return avpkt->size;
@@ -408,6 +443,9 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
         *got_picture_ptr = p->got_frame;
         picture->pkt_dts = p->avpkt.dts;
 
+        if (p->result < 0)
+            err = p->result;
+
         /*
          * A later call with avkpt->size == 0 may loop over all threads,
          * including this one, searching for a frame to return before being
@@ -425,6 +463,14 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
 
     fctx->next_finished = finished;
 
+    /*
+     * When no frame was found while flushing, but an error occurred in
+     * any thread, return it instead of 0.
+     * Otherwise the error can get lost.
+     */
+    if (!avpkt->size && !*got_picture_ptr)
+        return err;
+
     /* return the size of the consumed packet if no error occurred */
     return (p->result >= 0) ? avpkt->size : p->result;
 }
@@ -432,7 +478,7 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
 void ff_thread_report_progress(ThreadFrame *f, int n, int field)
 {
     PerThreadContext *p;
-    int *progress = f->progress ? (int*)f->progress->data : NULL;
+    volatile int *progress = f->progress ? (int*)f->progress->data : NULL;
 
     if (!progress || progress[field] >= n) return;
 
@@ -450,7 +496,7 @@ void ff_thread_report_progress(ThreadFrame *f, int n, int field)
 void ff_thread_await_progress(ThreadFrame *f, int n, int field)
 {
     PerThreadContext *p;
-    int *progress = f->progress ? (int*)f->progress->data : NULL;
+    volatile int *progress = f->progress ? (int*)f->progress->data : NULL;
 
     if (!progress || progress[field] >= n) return;
 
@@ -470,6 +516,10 @@ void ff_thread_finish_setup(AVCodecContext *avctx) {
 
     if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return;
 
+    if(p->state == STATE_SETUP_FINISHED){
+        av_log(avctx, AV_LOG_WARNING, "Multiple ff_thread_finish_setup() calls\n");
+    }
+
     pthread_mutex_lock(&p->progress_mutex);
     p->state = STATE_SETUP_FINISHED;
     pthread_cond_broadcast(&p->progress_cond);
@@ -490,6 +540,7 @@ static void park_frame_worker_threads(FrameThreadContext *fctx, int thread_count
                 pthread_cond_wait(&p->output_cond, &p->progress_mutex);
             pthread_mutex_unlock(&p->progress_mutex);
         }
+        p->got_frame = 0;
     }
 }
 
@@ -502,7 +553,11 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
     park_frame_worker_threads(fctx, thread_count);
 
     if (fctx->prev_thread && fctx->prev_thread != fctx->threads)
-        update_context_from_thread(fctx->threads->avctx, fctx->prev_thread->avctx, 0);
+        if (update_context_from_thread(fctx->threads->avctx, fctx->prev_thread->avctx, 0) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Final thread update failed\n");
+            fctx->prev_thread->avctx->internal->is_copy = fctx->threads->avctx->internal->is_copy;
+            fctx->threads->avctx->internal->is_copy = 1;
+        }
 
     fctx->die = 1;
 
@@ -515,12 +570,11 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
 
         if (p->thread_init)
             pthread_join(p->thread, NULL);
+        p->thread_init=0;
 
-        if (codec->close)
+        if (codec->close && p->avctx)
             codec->close(p->avctx);
 
-        avctx->codec = NULL;
-
         release_delayed_buffers(p);
         av_frame_free(&p->frame);
     }
@@ -536,18 +590,23 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
         av_packet_unref(&p->avpkt);
         av_freep(&p->released_buffers);
 
-        if (i) {
+        if (i && p->avctx) {
             av_freep(&p->avctx->priv_data);
             av_freep(&p->avctx->slice_offset);
         }
 
-        av_freep(&p->avctx->internal);
+        if (p->avctx)
+            av_freep(&p->avctx->internal);
         av_freep(&p->avctx);
     }
 
     av_freep(&fctx->threads);
     pthread_mutex_destroy(&fctx->buffer_mutex);
     av_freep(&avctx->internal->thread_ctx);
+
+    if (avctx->priv_data && avctx->codec && avctx->codec->priv_class)
+        av_opt_free(avctx->priv_data);
+    avctx->codec = NULL;
 }
 
 int ff_frame_thread_init(AVCodecContext *avctx)
@@ -564,7 +623,8 @@ int ff_frame_thread_init(AVCodecContext *avctx)
 
     if (!thread_count) {
         int nb_cpus = av_cpu_count();
-        av_log(avctx, AV_LOG_DEBUG, "detected %d logical cores\n", nb_cpus);
+        if ((avctx->debug & (FF_DEBUG_VIS_QP | FF_DEBUG_VIS_MB_TYPE)) || avctx->debug_mv)
+            nb_cpus = 1;
         // use number of cores + 1 as thread count if there is more than one
         if (nb_cpus > 1)
             thread_count = avctx->thread_count = FFMIN(nb_cpus + 1, MAX_AUTO_THREADS);
@@ -581,7 +641,7 @@ int ff_frame_thread_init(AVCodecContext *avctx)
     if (!fctx)
         return AVERROR(ENOMEM);
 
-    fctx->threads = av_mallocz(sizeof(PerThreadContext) * thread_count);
+    fctx->threads = av_mallocz_array(thread_count, sizeof(PerThreadContext));
     if (!fctx->threads) {
         av_freep(&avctx->internal->thread_ctx);
         return AVERROR(ENOMEM);
@@ -619,6 +679,7 @@ int ff_frame_thread_init(AVCodecContext *avctx)
 
         copy->internal = av_malloc(sizeof(AVCodecInternal));
         if (!copy->internal) {
+            copy->priv_data = NULL;
             err = AVERROR(ENOMEM);
             goto error;
         }
@@ -648,8 +709,10 @@ int ff_frame_thread_init(AVCodecContext *avctx)
 
         if (err) goto error;
 
-        if (!pthread_create(&p->thread, NULL, frame_worker_thread, p))
-            p->thread_init = 1;
+        err = AVERROR(pthread_create(&p->thread, NULL, frame_worker_thread, p));
+        p->thread_init= !err;
+        if(!p->thread_init)
+            goto error;
     }
 
     return 0;
@@ -689,18 +752,30 @@ void ff_thread_flush(AVCodecContext *avctx)
     }
 }
 
-int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
+int ff_thread_can_start_frame(AVCodecContext *avctx)
+{
+    PerThreadContext *p = avctx->internal->thread_ctx;
+    if ((avctx->active_thread_type&FF_THREAD_FRAME) && p->state != STATE_SETTING_UP &&
+        (avctx->codec->update_thread_context || !THREAD_SAFE_CALLBACKS(avctx))) {
+        return 0;
+    }
+    return 1;
+}
+
+static int thread_get_buffer_internal(AVCodecContext *avctx, ThreadFrame *f, int flags)
 {
     PerThreadContext *p = avctx->internal->thread_ctx;
     int err;
 
     f->owner = avctx;
 
+    ff_init_buffer_info(avctx, f->f);
+
     if (!(avctx->active_thread_type & FF_THREAD_FRAME))
         return ff_get_buffer(avctx, f->f, flags);
 
     if (p->state != STATE_SETTING_UP &&
-        (avctx->codec->update_thread_context || !avctx->thread_safe_callbacks)) {
+        (avctx->codec->update_thread_context || !THREAD_SAFE_CALLBACKS(avctx))) {
         av_log(avctx, AV_LOG_ERROR, "get_buffer() cannot be called after ff_thread_finish_setup()\n");
         return -1;
     }
@@ -721,11 +796,11 @@ int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
         avctx->get_buffer2 == avcodec_default_get_buffer2) {
         err = ff_get_buffer(avctx, f->f, flags);
     } else {
+        pthread_mutex_lock(&p->progress_mutex);
         p->requested_frame = f->f;
         p->requested_flags = flags;
         p->state = STATE_GET_BUFFER;
-        pthread_mutex_lock(&p->progress_mutex);
-        pthread_cond_signal(&p->progress_cond);
+        pthread_cond_broadcast(&p->progress_cond);
 
         while (p->state != STATE_SETTING_UP)
             pthread_cond_wait(&p->progress_cond, &p->progress_mutex);
@@ -735,9 +810,8 @@ int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
         pthread_mutex_unlock(&p->progress_mutex);
 
     }
-    if (!avctx->thread_safe_callbacks && !avctx->codec->update_thread_context)
+    if (!THREAD_SAFE_CALLBACKS(avctx) && !avctx->codec->update_thread_context)
         ff_thread_finish_setup(avctx);
-
     if (err)
         av_buffer_unref(&f->progress);
 
@@ -746,6 +820,40 @@ int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
     return err;
 }
 
+enum AVPixelFormat ff_thread_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
+{
+    enum AVPixelFormat res;
+    PerThreadContext *p = avctx->internal->thread_ctx;
+    if (!(avctx->active_thread_type & FF_THREAD_FRAME) || avctx->thread_safe_callbacks ||
+        avctx->get_format == avcodec_default_get_format)
+        return ff_get_format(avctx, fmt);
+    if (p->state != STATE_SETTING_UP) {
+        av_log(avctx, AV_LOG_ERROR, "get_format() cannot be called after ff_thread_finish_setup()\n");
+        return -1;
+    }
+    pthread_mutex_lock(&p->progress_mutex);
+    p->available_formats = fmt;
+    p->state = STATE_GET_FORMAT;
+    pthread_cond_broadcast(&p->progress_cond);
+
+    while (p->state != STATE_SETTING_UP)
+        pthread_cond_wait(&p->progress_cond, &p->progress_mutex);
+
+    res = p->result_format;
+
+    pthread_mutex_unlock(&p->progress_mutex);
+
+    return res;
+}
+
+int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
+{
+    int ret = thread_get_buffer_internal(avctx, f, flags);
+    if (ret < 0)
+        av_log(avctx, AV_LOG_ERROR, "thread_get_buffer() failed\n");
+    return ret;
+}
+
 void ff_thread_release_buffer(AVCodecContext *avctx, ThreadFrame *f)
 {
     PerThreadContext *p = avctx->internal->thread_ctx;
diff --git a/libavcodec/pthread_internal.h b/libavcodec/pthread_internal.h
index 5dfd0b2a26..6a2f3781db 100644
--- a/libavcodec/pthread_internal.h
+++ b/libavcodec/pthread_internal.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c
index d7c73f0884..c8e69f0a9a 100644
--- a/libavcodec/pthread_slice.c
+++ b/libavcodec/pthread_slice.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,8 @@
 #include <pthread.h>
 #elif HAVE_W32THREADS
 #include "compat/w32pthreads.h"
+#elif HAVE_OS2THREADS
+#include "compat/os2threads.h"
 #endif
 
 #include "avcodec.h"
@@ -58,6 +60,12 @@ typedef struct SliceThreadContext {
     unsigned current_execute;
     int current_job;
     int done;
+
+    int *entries;
+    int entries_count;
+    int thread_count;
+    pthread_cond_t *progress_cond;
+    pthread_mutex_t *progress_mutex;
 } SliceThreadContext;
 
 static void* attribute_align_arg worker(void *v)
@@ -104,15 +112,27 @@ void ff_slice_thread_free(AVCodecContext *avctx)
     pthread_mutex_lock(&c->current_job_lock);
     c->done = 1;
     pthread_cond_broadcast(&c->current_job_cond);
+    for (i = 0; i < c->thread_count; i++)
+        pthread_cond_broadcast(&c->progress_cond[i]);
     pthread_mutex_unlock(&c->current_job_lock);
 
     for (i=0; i<avctx->thread_count; i++)
          pthread_join(c->workers[i], NULL);
 
+    for (i = 0; i < c->thread_count; i++) {
+        pthread_mutex_destroy(&c->progress_mutex[i]);
+        pthread_cond_destroy(&c->progress_cond[i]);
+    }
+
     pthread_mutex_destroy(&c->current_job_lock);
     pthread_cond_destroy(&c->current_job_cond);
     pthread_cond_destroy(&c->last_job_cond);
-    av_free(c->workers);
+
+    av_freep(&c->entries);
+    av_freep(&c->progress_mutex);
+    av_freep(&c->progress_cond);
+
+    av_freep(&c->workers);
     av_freep(&avctx->internal->thread_ctx);
 }
 
@@ -175,7 +195,8 @@ int ff_slice_thread_init(AVCodecContext *avctx)
 
     if (!thread_count) {
         int nb_cpus = av_cpu_count();
-        av_log(avctx, AV_LOG_DEBUG, "detected %d logical cores\n", nb_cpus);
+        if  (avctx->height)
+            nb_cpus = FFMIN(nb_cpus, (avctx->height+15)/16);
         // use number of cores + 1 as thread count if there is more than one
         if (nb_cpus > 1)
             thread_count = avctx->thread_count = FFMIN(nb_cpus + 1, MAX_AUTO_THREADS);
@@ -192,7 +213,7 @@ int ff_slice_thread_init(AVCodecContext *avctx)
     if (!c)
         return -1;
 
-    c->workers = av_mallocz(sizeof(pthread_t)*thread_count);
+    c->workers = av_mallocz_array(thread_count, sizeof(pthread_t));
     if (!c->workers) {
         av_free(c);
         return -1;
@@ -222,3 +243,65 @@ int ff_slice_thread_init(AVCodecContext *avctx)
     avctx->execute2 = thread_execute2;
     return 0;
 }
+
+void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n)
+{
+    SliceThreadContext *p = avctx->internal->thread_ctx;
+    int *entries = p->entries;
+
+    pthread_mutex_lock(&p->progress_mutex[thread]);
+    entries[field] +=n;
+    pthread_cond_signal(&p->progress_cond[thread]);
+    pthread_mutex_unlock(&p->progress_mutex[thread]);
+}
+
+void ff_thread_await_progress2(AVCodecContext *avctx, int field, int thread, int shift)
+{
+    SliceThreadContext *p  = avctx->internal->thread_ctx;
+    int *entries      = p->entries;
+
+    if (!entries || !field) return;
+
+    thread = thread ? thread - 1 : p->thread_count - 1;
+
+    pthread_mutex_lock(&p->progress_mutex[thread]);
+    while ((entries[field - 1] - entries[field]) < shift){
+        pthread_cond_wait(&p->progress_cond[thread], &p->progress_mutex[thread]);
+    }
+    pthread_mutex_unlock(&p->progress_mutex[thread]);
+}
+
+int ff_alloc_entries(AVCodecContext *avctx, int count)
+{
+    int i;
+
+    if (avctx->active_thread_type & FF_THREAD_SLICE)  {
+        SliceThreadContext *p = avctx->internal->thread_ctx;
+        p->thread_count  = avctx->thread_count;
+        p->entries       = av_mallocz_array(count, sizeof(int));
+
+        p->progress_mutex = av_malloc_array(p->thread_count, sizeof(pthread_mutex_t));
+        p->progress_cond  = av_malloc_array(p->thread_count, sizeof(pthread_cond_t));
+
+        if (!p->entries || !p->progress_mutex || !p->progress_cond) {
+            av_freep(&p->entries);
+            av_freep(&p->progress_mutex);
+            av_freep(&p->progress_cond);
+            return AVERROR(ENOMEM);
+        }
+        p->entries_count  = count;
+
+        for (i = 0; i < p->thread_count; i++) {
+            pthread_mutex_init(&p->progress_mutex[i], NULL);
+            pthread_cond_init(&p->progress_cond[i], NULL);
+        }
+    }
+
+    return 0;
+}
+
+void ff_reset_entries(AVCodecContext *avctx)
+{
+    SliceThreadContext *p = avctx->internal->thread_ctx;
+    memset(p->entries, 0, p->entries_count * sizeof(int));
+}
diff --git a/libavcodec/ptx.c b/libavcodec/ptx.c
index 312850c246..42147f4afc 100644
--- a/libavcodec/ptx.c
+++ b/libavcodec/ptx.c
@@ -2,20 +2,20 @@
  * V.Flash PTX (.ptx) image decoder
  * Copyright (c) 2007 Ivo van Poorten
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -46,7 +46,7 @@ static int ptx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_PATCHWELCOME;
     }
 
-    avctx->pix_fmt = AV_PIX_FMT_RGB555;
+    avctx->pix_fmt = AV_PIX_FMT_BGR555LE;
 
     if (buf_end - buf < offset)
         return AVERROR_INVALIDDATA;
@@ -58,10 +58,8 @@ static int ptx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
 
@@ -69,13 +67,7 @@ static int ptx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     stride = p->linesize[0];
 
     for (y = 0; y < h && buf_end - buf >= w * bytes_per_pixel; y++) {
-#if HAVE_BIGENDIAN
-        unsigned int x;
-        for (x=0; x<w*bytes_per_pixel; x+=bytes_per_pixel)
-            AV_WN16(ptr+x, AV_RL16(buf+x));
-#else
         memcpy(ptr, buf, w*bytes_per_pixel);
-#endif
         ptr += stride;
         buf += w*bytes_per_pixel;
     }
diff --git a/libavcodec/put_bits.h b/libavcodec/put_bits.h
index 17666fac48..5b1bc8b8b7 100644
--- a/libavcodec/put_bits.h
+++ b/libavcodec/put_bits.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,9 @@
 
 #include <stdint.h>
 #include <stddef.h>
-#include <assert.h>
 
 #include "libavutil/intreadwrite.h"
+#include "libavutil/avassert.h"
 
 typedef struct PutBitContext {
     uint32_t bit_buf;
@@ -62,6 +62,24 @@ static inline void init_put_bits(PutBitContext *s, uint8_t *buffer,
 }
 
 /**
+ * Rebase the bit writer onto a reallocated buffer.
+ *
+ * @param buffer the buffer where to put bits
+ * @param buffer_size the size in bytes of buffer,
+ *                    must be larger than the previous size
+ */
+static inline void rebase_put_bits(PutBitContext *s, uint8_t *buffer,
+                                   int buffer_size)
+{
+    av_assert0(8*buffer_size > s->size_in_bits);
+
+    s->buf_end = buffer + buffer_size;
+    s->buf_ptr = buffer + (s->buf_ptr - s->buf);
+    s->buf     = buffer;
+    s->size_in_bits = 8 * buffer_size;
+}
+
+/**
  * @return the total number of bits written to the bitstream.
  */
 static inline int put_bits_count(PutBitContext *s)
@@ -136,7 +154,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
     unsigned int bit_buf;
     int bit_left;
 
-    assert(n <= 31 && value < (1U << n));
+    av_assert2(n <= 31 && value < (1U << n));
 
     bit_buf  = s->bit_buf;
     bit_left = s->bit_left;
@@ -145,9 +163,10 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 #ifdef BITSTREAM_WRITER_LE
     bit_buf |= value << (32 - bit_left);
     if (n >= bit_left) {
+        av_assert2(s->buf_ptr+3<s->buf_end);
         AV_WL32(s->buf_ptr, bit_buf);
         s->buf_ptr += 4;
-        bit_buf     = (bit_left == 32) ? 0 : value >> bit_left;
+        bit_buf     = value >> bit_left;
         bit_left   += 32;
     }
     bit_left -= n;
@@ -158,6 +177,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
     } else {
         bit_buf   <<= bit_left;
         bit_buf    |= value >> (n - bit_left);
+        av_assert2(s->buf_ptr+3<s->buf_end);
         AV_WB32(s->buf_ptr, bit_buf);
         s->buf_ptr += 4;
         bit_left   += 32 - n;
@@ -171,9 +191,9 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 
 static inline void put_sbits(PutBitContext *pb, int n, int32_t value)
 {
-    assert(n >= 0 && n <= 31);
+    av_assert2(n >= 0 && n <= 31);
 
-    put_bits(pb, n, value & ((1 << n) - 1));
+    put_bits(pb, n, av_mod_uintp2(value, n));
 }
 
 /**
@@ -207,8 +227,9 @@ static inline uint8_t *put_bits_ptr(PutBitContext *s)
  */
 static inline void skip_put_bytes(PutBitContext *s, int n)
 {
-    assert((put_bits_count(s) & 7) == 0);
-    assert(s->bit_left == 32);
+    av_assert2((put_bits_count(s) & 7) == 0);
+    av_assert2(s->bit_left == 32);
+    av_assert0(n <= s->buf_end - s->buf_ptr);
     s->buf_ptr += n;
 }
 
@@ -231,7 +252,9 @@ static inline void skip_put_bits(PutBitContext *s, int n)
  */
 static inline void set_put_bits_buffer_size(PutBitContext *s, int size)
 {
+    av_assert0(size <= INT_MAX/8 - 32);
     s->buf_end = s->buf + size;
+    s->size_in_bits = 8*size;
 }
 
 #endif /* AVCODEC_PUT_BITS_H */
diff --git a/libavcodec/qcelpdata.h b/libavcodec/qcelpdata.h
index 319833e904..931c990b09 100644
--- a/libavcodec/qcelpdata.h
+++ b/libavcodec/qcelpdata.h
@@ -2,20 +2,20 @@
  * QCELP decoder
  * Copyright (c) 2007 Reynaldo H. Verdejo Pinochet
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
  * @file
  * Data tables for the QCELP decoder
  * @author Reynaldo H. Verdejo Pinochet
- * @remark Libav merging spearheaded by Kenan Gillet
+ * @remark FFmpeg merging spearheaded by Kenan Gillet
  * @remark Development mentored by Benjamin Larson
  */
 
@@ -66,7 +66,7 @@ typedef struct QCELPFrame {
 } QCELPFrame;
 
 /**
- * pre-calculated table for hammsinc function
+ * Pre-calculated table for hammsinc function.
  * Only half of the table is needed because of symmetry.
  *
  * TIA/EIA/IS-733 2.4.5.2-2/3
@@ -82,7 +82,7 @@ typedef struct QCELPBitmap {
 #define QCELP_OF(variable, bit, len) {offsetof(QCELPFrame, variable), bit, len}
 
 /**
- * bitmap unpacking tables for RATE_FULL
+ * Bitmap unpacking tables for RATE_FULL
  *
  * TIA/EIA/IS-733 Table 2.4.7.1-1
  */
@@ -169,7 +169,7 @@ static const QCELPBitmap qcelp_rate_full_bitmap[] = {
 };
 
 /**
- * bitmap unpacking tables for RATE_HALF
+ * Bitmap unpacking tables for RATE_HALF
  *
  * TIA/EIA/IS-733 Table 2.4.7.2-1
  */
@@ -211,7 +211,7 @@ static const QCELPBitmap qcelp_rate_half_bitmap[] = {
 };
 
 /**
- * bitmap unpacking tables for RATE_QUARTER
+ * Bitmap unpacking tables for RATE_QUARTER
  *
  * TIA/EIA/IS-733 Table 2.4.7.3-1
  */
@@ -232,7 +232,7 @@ static const QCELPBitmap qcelp_rate_quarter_bitmap[] = {
 };
 
 /**
- * bitmap unpacking tables for RATE_OCTAVE
+ * Bitmap unpacking tables for RATE_OCTAVE
  *
  * trick: CBSEED is written into QCELPContext.cbsign[15],
  * which is not used for RATE_OCTAVE.
@@ -257,12 +257,12 @@ static const QCELPBitmap qcelp_rate_octave_bitmap[] = {
     QCELP_OF(lspv   [8], 0, 1), //  8
     QCELP_OF(cbsign[15], 0, 1), //  7
     QCELP_OF(lspv   [9], 0, 1), //  6
-    QCELP_OF(cbgain [0], 0, 2), //  7
+    QCELP_OF(cbgain [0], 0, 2), //  5
     QCELP_OF(reserved,   0, 4)  //  3
 };
 
 /**
- * position of the bitmapping data for each packet type in
+ * Bitmapping data position for each packet type in
  * the QCELPContext
  */
 static const QCELPBitmap * const qcelp_unpacking_bitmaps_per_rate[5] = {
@@ -420,12 +420,12 @@ static const qcelp_vector * const qcelp_lspvq[5] = {
 };
 
 /**
- * the final gain scalefactor before clipping into a usable output float
+ * The final gain scalefactor before clipping into a usable output float
  */
 #define QCELP_SCALE 8192.
 
 /**
- * table for computing Ga (decoded linear codebook gain magnitude)
+ * Table for computing Ga (decoded linear codebook gain magnitude)
  *
  * @note The table could fit in int16_t in x*8 form, but it seems
  *       to be slower on x86
@@ -452,7 +452,7 @@ static const float qcelp_g12ga[61] = {
  1000.000/QCELP_SCALE};
 
 /**
- * circular codebook for rate 1 frames in x*100 form
+ * Circular codebook for rate 1 frames in x*100 form
  *
  * TIA/EIA/IS-733 2.4.6.1-2
  */
@@ -477,7 +477,7 @@ static const int16_t qcelp_rate_full_codebook[128] = {
 #define QCELP_RATE_FULL_CODEBOOK_RATIO .01
 
 /**
- * circular codebook for rate 1/2 frames in x*2 form
+ * Circular codebook for rate 1/2 frames in x*2 form
  *
  * TIA/EIA/IS-733 2.4.6.1-1
  */
@@ -511,7 +511,7 @@ static const int8_t qcelp_rate_half_codebook[128] = {
 #define QCELP_SQRT1887 1.373681186
 
 /**
- * table for impulse response of BPF used to filter
+ * Table for impulse response of BPF used to filter
  * the white excitation for bitrate 1/4 synthesis
  *
  * Only half the tables are needed because of symmetry.
@@ -526,14 +526,14 @@ static const double qcelp_rnd_fir_coefs[11] = {
 
 /**
  * This spread factor is used, for bitrate 1/8 and I_F_Q,
- * to force the LSP frequencies to be at least 80 Hz apart.
+ * to force LSP frequencies to be at least 80 Hz apart.
  *
  * TIA/EIA/IS-733 2.4.3.3.2
  */
 #define QCELP_LSP_SPREAD_FACTOR 0.02
 
 /**
- * predictor coefficient for the conversion of LSP codes
+ * Predictor coefficient for the conversion of LSP codes
  * to LSP frequencies for 1/8 and I_F_Q
  *
  * TIA/EIA/IS-733 2.4.3.2.7-2
@@ -541,7 +541,7 @@ static const double qcelp_rnd_fir_coefs[11] = {
 #define QCELP_LSP_OCTAVE_PREDICTOR 29.0/32
 
 /**
- * initial coefficient to perform bandwidth expansion on LPC
+ * Initial coefficient to perform bandwidth expansion on LPC
  *
  * @note: 0.9883 looks like an approximation of 253/256.
  *
diff --git a/libavcodec/qcelpdec.c b/libavcodec/qcelpdec.c
index 2eeefb8b4a..adb3e82359 100644
--- a/libavcodec/qcelpdec.c
+++ b/libavcodec/qcelpdec.c
@@ -2,20 +2,20 @@
  * QCELP decoder
  * Copyright (c) 2007 Reynaldo H. Verdejo Pinochet
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,12 +23,13 @@
  * @file
  * QCELP decoder
  * @author Reynaldo H. Verdejo Pinochet
- * @remark Libav merging spearheaded by Kenan Gillet
+ * @remark FFmpeg merging spearheaded by Kenan Gillet
  * @remark Development mentored by Benjamin Larson
  */
 
 #include <stddef.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/float_dsp.h"
 #include "avcodec.h"
@@ -40,9 +41,6 @@
 #include "acelp_vectors.h"
 #include "lsp.h"
 
-#undef NDEBUG
-#include <assert.h>
-
 typedef enum {
     I_F_Q = -1,    /**< insufficient frame quality */
     SILENCE,
@@ -135,7 +133,7 @@ static int decode_lspf(QCELPContext *q, float *lspf)
         } else {
             erasure_coeff = QCELP_LSP_OCTAVE_PREDICTOR;
 
-            assert(q->bitrate == I_F_Q);
+            av_assert2(q->bitrate == I_F_Q);
 
             if (q->erasure_count > 1)
                 erasure_coeff *= q->erasure_count < 4 ? 0.9 : 0.7;
@@ -239,7 +237,7 @@ static void decode_gain_and_index(QCELPContext *q, float *gain)
                     av_clip((q->prev_g1[0] + q->prev_g1[1]) / 2 - 5, 0, 54);
             subframes_count = 8;
         } else {
-            assert(q->bitrate == I_F_Q);
+            av_assert2(q->bitrate == I_F_Q);
 
             g1[0] = q->prev_g1[1];
             switch (q->erasure_count) {
@@ -321,7 +319,8 @@ static void compute_svector(QCELPContext *q, const float *gain,
             tmp_gain = gain[i] * QCELP_RATE_FULL_CODEBOOK_RATIO;
             cindex   = -q->frame.cindex[i];
             for (j = 0; j < 10; j++)
-                *cdn_vector++ = tmp_gain * qcelp_rate_full_codebook[cindex++ & 127];
+                *cdn_vector++ = tmp_gain *
+                                qcelp_rate_full_codebook[cindex++ & 127];
         }
         break;
     case RATE_HALF:
@@ -329,7 +328,8 @@ static void compute_svector(QCELPContext *q, const float *gain,
             tmp_gain = gain[i] * QCELP_RATE_HALF_CODEBOOK_RATIO;
             cindex   = -q->frame.cindex[i];
             for (j = 0; j < 40; j++)
-                *cdn_vector++ = tmp_gain * qcelp_rate_half_codebook[cindex++ & 127];
+                *cdn_vector++ = tmp_gain *
+                                qcelp_rate_half_codebook[cindex++ & 127];
         }
         break;
     case RATE_QUARTER:
@@ -374,7 +374,8 @@ static void compute_svector(QCELPContext *q, const float *gain,
         for (i = 0; i < 4; i++) {
             tmp_gain = gain[i] * QCELP_RATE_FULL_CODEBOOK_RATIO;
             for (j = 0; j < 40; j++)
-                *cdn_vector++ = tmp_gain * qcelp_rate_full_codebook[cbseed++ & 127];
+                *cdn_vector++ = tmp_gain *
+                                qcelp_rate_full_codebook[cbseed++ & 127];
         }
         break;
     case SILENCE:
@@ -435,7 +436,8 @@ static const float *do_pitchfilter(float memory[303], const float v_in[160],
             for (v_len = v_in + 40; v_in < v_len; v_in++) {
                 if (pfrac[i]) { // If it is a fractional lag...
                     for (j = 0, *v_out = 0.0; j < 4; j++)
-                        *v_out += qcelp_hammsinc_table[j] * (v_lag[j - 4] + v_lag[3 - j]);
+                        *v_out += qcelp_hammsinc_table[j] *
+                                  (v_lag[j - 4] + v_lag[3 - j]);
                 } else
                     *v_out = *v_lag;
 
@@ -486,7 +488,7 @@ static void apply_pitch_filters(QCELPContext *q, float *cdn_vector)
                   else
                       max_pitch_gain = 0.0;
             } else {
-                assert(q->bitrate == SILENCE);
+                av_assert2(q->bitrate == SILENCE);
                 max_pitch_gain = 1.0;
             }
             for (i = 0; i < 4; i++)
@@ -511,7 +513,8 @@ static void apply_pitch_filters(QCELPContext *q, float *cdn_vector)
 
         apply_gain_ctrl(cdn_vector, v_synthesis_filtered, v_pre_filtered);
     } else {
-        memcpy(q->pitch_synthesis_filter_mem, cdn_vector + 17, 143 * sizeof(float));
+        memcpy(q->pitch_synthesis_filter_mem,
+               cdn_vector + 17, 143 * sizeof(float));
         memcpy(q->pitch_pre_filter_mem, cdn_vector + 17, 143 * sizeof(float));
         memset(q->pitch_gain, 0, sizeof(q->pitch_gain));
         memset(q->pitch_lag,  0, sizeof(q->pitch_lag));
@@ -630,7 +633,7 @@ static qcelp_packet_rate determine_bitrate(AVCodecContext *avctx,
         (*buf)++;
     } else if ((bitrate = buf_size2bitrate(buf_size + 1)) >= 0) {
         av_log(avctx, AV_LOG_WARNING,
-               "Bitrate byte is missing, guessing the bitrate from packet size.\n");
+               "Bitrate byte missing, guessing bitrate from packet size.\n");
     } else
         return I_F_Q;
 
@@ -695,14 +698,12 @@ static int qcelp_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 160;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     outbuffer = (float *)frame->data[0];
 
     if ((q->bitrate = determine_bitrate(avctx, buf_size, &buf)) == I_F_Q) {
-        warn_insufficient_frame_quality(avctx, "bitrate cannot be determined.");
+        warn_insufficient_frame_quality(avctx, "Bitrate cannot be determined.");
         goto erasure;
     }
 
@@ -718,7 +719,8 @@ static int qcelp_decode_frame(AVCodecContext *avctx, void *data,
                                          qcelp_unpacking_bitmaps_lengths[q->bitrate];
         uint8_t *unpacked_data         = (uint8_t *)&q->frame;
 
-        init_get_bits(&q->gb, buf, 8 * buf_size);
+        if ((ret = init_get_bits8(&q->gb, buf, buf_size)) < 0)
+            return ret;
 
         memset(&q->frame, 0, sizeof(QCELPFrame));
 
@@ -770,7 +772,8 @@ erasure:
     formant_mem = q->formant_mem + 10;
     for (i = 0; i < 4; i++) {
         interpolate_lpc(q, quantized_lspf, lpc, i);
-        ff_celp_lp_synthesis_filterf(formant_mem, lpc, outbuffer + i * 40, 40, 10);
+        ff_celp_lp_synthesis_filterf(formant_mem, lpc,
+                                     outbuffer + i * 40, 40, 10);
         formant_mem += 40;
     }
 
diff --git a/libavcodec/qdm2.c b/libavcodec/qdm2.c
index 4736facfdf..0b6dcd6823 100644
--- a/libavcodec/qdm2.c
+++ b/libavcodec/qdm2.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2005 Alex Beregszaszi
  * Copyright (c) 2005 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,13 +44,8 @@
 #include "mpegaudiodsp.h"
 #include "mpegaudio.h"
 
-#include "qdm2data.h"
 #include "qdm2_tablegen.h"
 
-#undef NDEBUG
-#include <assert.h>
-
-
 #define QDM2_LIST_ADD(list, size, packet) \
 do { \
       if (size > 0) { \
@@ -168,7 +163,7 @@ typedef struct QDM2Context {
     /// I/O data
     const uint8_t *compressed_data;
     int compressed_size;
-    float output_buffer[QDM2_MAX_FRAME_SIZE * 2];
+    float output_buffer[QDM2_MAX_FRAME_SIZE * MPA_MAX_CHANNELS * 2];
 
     /// Synthesis filter
     MPADSPContext mpadsp;
@@ -197,173 +192,11 @@ typedef struct QDM2Context {
     int noise_idx; ///< index for dithering noise table
 } QDM2Context;
 
-
-static VLC vlc_tab_level;
-static VLC vlc_tab_diff;
-static VLC vlc_tab_run;
-static VLC fft_level_exp_alt_vlc;
-static VLC fft_level_exp_vlc;
-static VLC fft_stereo_exp_vlc;
-static VLC fft_stereo_phase_vlc;
-static VLC vlc_tab_tone_level_idx_hi1;
-static VLC vlc_tab_tone_level_idx_mid;
-static VLC vlc_tab_tone_level_idx_hi2;
-static VLC vlc_tab_type30;
-static VLC vlc_tab_type34;
-static VLC vlc_tab_fft_tone_offset[5];
-
-static const uint16_t qdm2_vlc_offs[] = {
-    0,260,566,598,894,1166,1230,1294,1678,1950,2214,2278,2310,2570,2834,3124,3448,3838,
-};
-
 static const int switchtable[23] = {
     0, 5, 1, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 4
 };
 
-static av_cold void qdm2_init_vlc(void)
-{
-    static VLC_TYPE qdm2_table[3838][2];
-
-    vlc_tab_level.table           = &qdm2_table[qdm2_vlc_offs[0]];
-    vlc_tab_level.table_allocated = qdm2_vlc_offs[1] - qdm2_vlc_offs[0];
-    init_vlc(&vlc_tab_level, 8, 24,
-             vlc_tab_level_huffbits, 1, 1,
-             vlc_tab_level_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_diff.table           = &qdm2_table[qdm2_vlc_offs[1]];
-    vlc_tab_diff.table_allocated = qdm2_vlc_offs[2] - qdm2_vlc_offs[1];
-    init_vlc(&vlc_tab_diff, 8, 37,
-             vlc_tab_diff_huffbits, 1, 1,
-             vlc_tab_diff_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_run.table           = &qdm2_table[qdm2_vlc_offs[2]];
-    vlc_tab_run.table_allocated = qdm2_vlc_offs[3] - qdm2_vlc_offs[2];
-    init_vlc(&vlc_tab_run, 5, 6,
-             vlc_tab_run_huffbits, 1, 1,
-             vlc_tab_run_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    fft_level_exp_alt_vlc.table           = &qdm2_table[qdm2_vlc_offs[3]];
-    fft_level_exp_alt_vlc.table_allocated = qdm2_vlc_offs[4] -
-                                            qdm2_vlc_offs[3];
-    init_vlc(&fft_level_exp_alt_vlc, 8, 28,
-             fft_level_exp_alt_huffbits, 1, 1,
-             fft_level_exp_alt_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    fft_level_exp_vlc.table           = &qdm2_table[qdm2_vlc_offs[4]];
-    fft_level_exp_vlc.table_allocated = qdm2_vlc_offs[5] - qdm2_vlc_offs[4];
-    init_vlc(&fft_level_exp_vlc, 8, 20,
-             fft_level_exp_huffbits, 1, 1,
-             fft_level_exp_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    fft_stereo_exp_vlc.table           = &qdm2_table[qdm2_vlc_offs[5]];
-    fft_stereo_exp_vlc.table_allocated = qdm2_vlc_offs[6] -
-                                         qdm2_vlc_offs[5];
-    init_vlc(&fft_stereo_exp_vlc, 6, 7,
-             fft_stereo_exp_huffbits, 1, 1,
-             fft_stereo_exp_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    fft_stereo_phase_vlc.table           = &qdm2_table[qdm2_vlc_offs[6]];
-    fft_stereo_phase_vlc.table_allocated = qdm2_vlc_offs[7] -
-                                           qdm2_vlc_offs[6];
-    init_vlc(&fft_stereo_phase_vlc, 6, 9,
-             fft_stereo_phase_huffbits, 1, 1,
-             fft_stereo_phase_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_tone_level_idx_hi1.table =
-        &qdm2_table[qdm2_vlc_offs[7]];
-    vlc_tab_tone_level_idx_hi1.table_allocated = qdm2_vlc_offs[8] -
-                                                 qdm2_vlc_offs[7];
-    init_vlc(&vlc_tab_tone_level_idx_hi1, 8, 20,
-             vlc_tab_tone_level_idx_hi1_huffbits, 1, 1,
-             vlc_tab_tone_level_idx_hi1_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_tone_level_idx_mid.table =
-        &qdm2_table[qdm2_vlc_offs[8]];
-    vlc_tab_tone_level_idx_mid.table_allocated = qdm2_vlc_offs[9] -
-                                                 qdm2_vlc_offs[8];
-    init_vlc(&vlc_tab_tone_level_idx_mid, 8, 24,
-             vlc_tab_tone_level_idx_mid_huffbits, 1, 1,
-             vlc_tab_tone_level_idx_mid_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_tone_level_idx_hi2.table =
-        &qdm2_table[qdm2_vlc_offs[9]];
-    vlc_tab_tone_level_idx_hi2.table_allocated = qdm2_vlc_offs[10] -
-                                                 qdm2_vlc_offs[9];
-    init_vlc(&vlc_tab_tone_level_idx_hi2, 8, 24,
-             vlc_tab_tone_level_idx_hi2_huffbits, 1, 1,
-             vlc_tab_tone_level_idx_hi2_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_type30.table           = &qdm2_table[qdm2_vlc_offs[10]];
-    vlc_tab_type30.table_allocated = qdm2_vlc_offs[11] - qdm2_vlc_offs[10];
-    init_vlc(&vlc_tab_type30, 6, 9,
-             vlc_tab_type30_huffbits, 1, 1,
-             vlc_tab_type30_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_type34.table           = &qdm2_table[qdm2_vlc_offs[11]];
-    vlc_tab_type34.table_allocated = qdm2_vlc_offs[12] - qdm2_vlc_offs[11];
-    init_vlc(&vlc_tab_type34, 5, 10,
-             vlc_tab_type34_huffbits, 1, 1,
-             vlc_tab_type34_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[0].table =
-        &qdm2_table[qdm2_vlc_offs[12]];
-    vlc_tab_fft_tone_offset[0].table_allocated = qdm2_vlc_offs[13] -
-                                                 qdm2_vlc_offs[12];
-    init_vlc(&vlc_tab_fft_tone_offset[0], 8, 23,
-             vlc_tab_fft_tone_offset_0_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_0_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[1].table =
-        &qdm2_table[qdm2_vlc_offs[13]];
-    vlc_tab_fft_tone_offset[1].table_allocated = qdm2_vlc_offs[14] -
-                                                 qdm2_vlc_offs[13];
-    init_vlc(&vlc_tab_fft_tone_offset[1], 8, 28,
-             vlc_tab_fft_tone_offset_1_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_1_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[2].table =
-        &qdm2_table[qdm2_vlc_offs[14]];
-    vlc_tab_fft_tone_offset[2].table_allocated = qdm2_vlc_offs[15] -
-                                                 qdm2_vlc_offs[14];
-    init_vlc(&vlc_tab_fft_tone_offset[2], 8, 32,
-             vlc_tab_fft_tone_offset_2_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_2_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[3].table =
-        &qdm2_table[qdm2_vlc_offs[15]];
-    vlc_tab_fft_tone_offset[3].table_allocated = qdm2_vlc_offs[16] -
-                                                 qdm2_vlc_offs[15];
-    init_vlc(&vlc_tab_fft_tone_offset[3], 8, 35,
-             vlc_tab_fft_tone_offset_3_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_3_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[4].table =
-        &qdm2_table[qdm2_vlc_offs[16]];
-    vlc_tab_fft_tone_offset[4].table_allocated = qdm2_vlc_offs[17] -
-                                                 qdm2_vlc_offs[16];
-    init_vlc(&vlc_tab_fft_tone_offset[4], 8, 38,
-             vlc_tab_fft_tone_offset_4_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_4_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-}
-
-static int qdm2_get_vlc(GetBitContext *gb, VLC *vlc, int flag, int depth)
+static int qdm2_get_vlc(GetBitContext *gb, const VLC *vlc, int flag, int depth)
 {
     int value;
 
@@ -375,7 +208,14 @@ static int qdm2_get_vlc(GetBitContext *gb, VLC *vlc, int flag, int depth)
 
     /* stage-3, optional */
     if (flag) {
-        int tmp = vlc_stage3_values[value];
+        int tmp;
+
+        if (value >= 60) {
+            av_log(NULL, AV_LOG_ERROR, "value %d in qdm2_get_vlc too large\n", value);
+            return 0;
+        }
+
+        tmp= vlc_stage3_values[value];
 
         if ((value & ~3) > 0)
             tmp += get_bits(gb, (value >> 2));
@@ -385,7 +225,7 @@ static int qdm2_get_vlc(GetBitContext *gb, VLC *vlc, int flag, int depth)
     return value;
 }
 
-static int qdm2_get_se_vlc(VLC *vlc, GetBitContext *gb, int depth)
+static int qdm2_get_se_vlc(const VLC *vlc, GetBitContext *gb, int depth)
 {
     int value = qdm2_get_vlc(gb, vlc, 0, depth);
 
@@ -694,7 +534,8 @@ static void fill_coding_method_array(sb_int8_array tone_level_idx,
 
     if (!superblocktype_2_3) {
         /* This case is untested, no samples available */
-        SAMPLES_NEEDED
+        avpriv_request_sample(NULL, "!superblocktype_2_3");
+        return;
         for (ch = 0; ch < nb_channels; ch++)
             for (sb = 0; sb < 30; sb++) {
                 for (j = 1; j < 63; j++) {  // The loop only iterates to 63 so the code doesn't overflow the buffer
@@ -806,7 +647,7 @@ static void fill_coding_method_array(sb_int8_array tone_level_idx,
  * @param sb_min    lower subband processed (sb_min included)
  * @param sb_max    higher subband processed (sb_max excluded)
  */
-static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
+static int synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                                        int length, int sb_min, int sb_max)
 {
     int sb, j, k, n, ch, run, channels;
@@ -814,14 +655,15 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
     int type34_first;
     float type34_div = 0;
     float type34_predictor;
-    float samples[10], sign_bits[16];
+    float samples[10];
+    int sign_bits[16] = {0};
 
     if (length == 0) {
         // If no data use noise
         for (sb=sb_min; sb < sb_max; sb++)
             build_sb_samples_from_noise(q, sb);
 
-        return;
+        return 0;
     }
 
     for (sb = sb_min; sb < sb_max; sb++) {
@@ -845,6 +687,7 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
 
             if (fix_coding_method_array(sb, q->nb_channels,
                                             q->coding_method)) {
+                av_log(NULL, AV_LOG_ERROR, "coding method invalid\n");
                 build_sb_samples_from_noise(q, sb);
                 continue;
             }
@@ -869,6 +712,11 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                                 }
                             } else {
                                 n = get_bits(gb, 8);
+                                if (n >= 243) {
+                                    av_log(NULL, AV_LOG_ERROR, "Invalid 8bit codeword\n");
+                                    return AVERROR_INVALIDDATA;
+                                }
+
                                 for (k = 0; k < 5; k++)
                                     samples[2 * k] = dequant_1bit[joined_stereo][random_dequant_index[n][k]];
                             }
@@ -905,6 +753,11 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                                 }
                             } else {
                                 n = get_bits (gb, 8);
+                                if (n >= 243) {
+                                    av_log(NULL, AV_LOG_ERROR, "Invalid 8bit codeword\n");
+                                    return AVERROR_INVALIDDATA;
+                                }
+
                                 for (k = 0; k < 5; k++)
                                     samples[k] = dequant_1bit[joined_stereo][random_dequant_index[n][k]];
                             }
@@ -918,6 +771,11 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                     case 24:
                         if (get_bits_left(gb) >= 7) {
                             n = get_bits(gb, 7);
+                            if (n >= 125) {
+                                av_log(NULL, AV_LOG_ERROR, "Invalid 7bit codeword\n");
+                                return AVERROR_INVALIDDATA;
+                            }
+
                             for (k = 0; k < 3; k++)
                                 samples[k] = (random_dequant_type24[n][k] - 2.0) * 0.5;
                         } else {
@@ -930,10 +788,11 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                     case 30:
                         if (get_bits_left(gb) >= 4) {
                             unsigned index = qdm2_get_vlc(gb, &vlc_tab_type30, 0, 1);
-                            if (index < FF_ARRAY_ELEMS(type30_dequant)) {
-                                samples[0] = type30_dequant[index];
-                            } else
-                                samples[0] = SB_DITHERING_NOISE(sb,q->noise_idx);
+                            if (index >= FF_ARRAY_ELEMS(type30_dequant)) {
+                                av_log(NULL, AV_LOG_ERROR, "index %d out of type30_dequant array\n", index);
+                                return AVERROR_INVALIDDATA;
+                            }
+                            samples[0] = type30_dequant[index];
                         } else
                             samples[0] = SB_DITHERING_NOISE(sb,q->noise_idx);
 
@@ -949,11 +808,12 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                                 type34_first = 0;
                             } else {
                                 unsigned index = qdm2_get_vlc(gb, &vlc_tab_type34, 0, 1);
-                                if (index < FF_ARRAY_ELEMS(type34_delta)) {
-                                    samples[0] = type34_delta[index] / type34_div + type34_predictor;
-                                    type34_predictor = samples[0];
-                                } else
-                                    samples[0] = SB_DITHERING_NOISE(sb,q->noise_idx);
+                                if (index >= FF_ARRAY_ELEMS(type34_delta)) {
+                                    av_log(NULL, AV_LOG_ERROR, "index %d out of type34_delta array\n", index);
+                                    return AVERROR_INVALIDDATA;
+                                }
+                                samples[0] = type34_delta[index] / type34_div + type34_predictor;
+                                type34_predictor = samples[0];
                             }
                         } else {
                             samples[0] = SB_DITHERING_NOISE(sb,q->noise_idx);
@@ -990,6 +850,7 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
             } // j loop
         } // channel loop
     } // subband loop
+    return 0;
 }
 
 /**
@@ -1002,24 +863,27 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
  * @param quantized_coeffs    pointer to quantized_coeffs[ch][0]
  * @param gb        bitreader context
  */
-static void init_quantized_coeffs_elem0(int8_t *quantized_coeffs,
+static int init_quantized_coeffs_elem0(int8_t *quantized_coeffs,
                                         GetBitContext *gb)
 {
     int i, k, run, level, diff;
 
     if (get_bits_left(gb) < 16)
-        return;
+        return -1;
     level = qdm2_get_vlc(gb, &vlc_tab_level, 0, 2);
 
     quantized_coeffs[0] = level;
 
     for (i = 0; i < 7; ) {
         if (get_bits_left(gb) < 16)
-            break;
+            return -1;
         run = qdm2_get_vlc(gb, &vlc_tab_run, 0, 1) + 1;
 
+        if (i + run >= 8)
+            return -1;
+
         if (get_bits_left(gb) < 16)
-            break;
+            return -1;
         diff = qdm2_get_se_vlc(&vlc_tab_diff, gb, 2);
 
         for (k = 1; k <= run; k++)
@@ -1028,6 +892,7 @@ static void init_quantized_coeffs_elem0(int8_t *quantized_coeffs,
         level += diff;
         i += run;
     }
+    return 0;
 }
 
 /**
@@ -1102,7 +967,7 @@ static void init_tone_level_dequantization(QDM2Context *q, GetBitContext *gb)
  * @param q       context
  * @param node    pointer to node with packet
  */
-static void process_subpacket_9(QDM2Context *q, QDM2SubPNode *node)
+static int process_subpacket_9(QDM2Context *q, QDM2SubPNode *node)
 {
     GetBitContext gb;
     int i, j, k, n, ch, run, level, diff;
@@ -1120,6 +985,9 @@ static void process_subpacket_9(QDM2Context *q, QDM2SubPNode *node)
                 run  = qdm2_get_vlc(&gb, &vlc_tab_run, 0, 1) + 1;
                 diff = qdm2_get_se_vlc(&vlc_tab_diff, &gb, 2);
 
+                if (j + run >= 8)
+                    return -1;
+
                 for (k = 1; k <= run; k++)
                     q->quantized_coeffs[ch][i][j + k] = (level + ((k * diff) / run));
 
@@ -1131,6 +999,8 @@ static void process_subpacket_9(QDM2Context *q, QDM2SubPNode *node)
     for (ch = 0; ch < q->nb_channels; ch++)
         for (i = 0; i < 8; i++)
             q->quantized_coeffs[ch][0][i] = 0;
+
+    return 0;
 }
 
 /**
@@ -1200,7 +1070,7 @@ static void process_subpacket_12(QDM2Context *q, QDM2SubPNode *node)
     synthfilt_build_sb_samples(q, &gb, length, 8, QDM2_SB_USED(q->sub_sampling));
 }
 
-/*
+/**
  * Process new subpackets for synthesis filter
  *
  * @param q       context
@@ -1233,7 +1103,7 @@ static void process_synthesis_subpackets(QDM2Context *q, QDM2SubPNode *list)
         process_subpacket_12(q, NULL);
 }
 
-/*
+/**
  * Decode superblock, fill packet lists.
  *
  * @param q    context
@@ -1393,9 +1263,14 @@ static void qdm2_fft_decode_tones(QDM2Context *q, int duration,
     local_int_10 = 1 << (q->group_order - duration - 1);
     offset       = 1;
 
-    while (1) {
+    while (get_bits_left(gb)>0) {
         if (q->superblocktype_2_3) {
             while ((n = qdm2_get_vlc(gb, &vlc_tab_fft_tone_offset[local_int_8], 1, 2)) < 2) {
+                if (get_bits_left(gb)<0) {
+                    if(local_int_4 < q->group_size)
+                        av_log(NULL, AV_LOG_ERROR, "overread in qdm2_fft_decode_tones()\n");
+                    return;
+                }
                 offset = 1;
                 if (n == 0) {
                     local_int_4  += local_int_10;
@@ -1708,12 +1583,19 @@ static void qdm2_synthesis_filter(QDM2Context *q, int index)
  *
  * @param q    context
  */
-static av_cold void qdm2_init_static_data(AVCodec *codec) {
+static av_cold void qdm2_init_static_data(void) {
+    static int done;
+
+    if(done)
+        return;
+
     qdm2_init_vlc();
     ff_mpa_synth_init_float(ff_mpa_synth_window_float);
     softclip_table_init();
     rnd_table_init();
     init_noise_samples();
+
+    done = 1;
 }
 
 /**
@@ -1726,6 +1608,8 @@ static av_cold int qdm2_decode_init(AVCodecContext *avctx)
     int extradata_size;
     int tmp_val, tmp, size;
 
+    qdm2_init_static_data();
+
     /* extradata parsing
 
     Structure:
@@ -1814,8 +1698,10 @@ static av_cold int qdm2_decode_init(AVCodecContext *avctx)
 
     avctx->channels = s->nb_channels = s->channels = AV_RB32(extradata);
     extradata += 4;
-    if (s->channels <= 0 || s->channels > MPA_MAX_CHANNELS)
+    if (s->channels <= 0 || s->channels > MPA_MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n");
         return AVERROR_INVALIDDATA;
+    }
     avctx->channel_layout = avctx->channels == 2 ? AV_CH_LAYOUT_STEREO :
                                                    AV_CH_LAYOUT_MONO;
 
@@ -1842,6 +1728,7 @@ static av_cold int qdm2_decode_init(AVCodecContext *avctx)
     // something like max decodable tones
     s->group_order = av_log2(s->group_size) + 1;
     s->frame_size = s->group_size / 16; // 16 iterations per super block
+
     if (s->frame_size > QDM2_MAX_FRAME_SIZE)
         return AVERROR_INVALIDDATA;
 
@@ -1864,18 +1751,9 @@ static av_cold int qdm2_decode_init(AVCodecContext *avctx)
     if ((tmp * 2240) < avctx->bit_rate)  tmp_val = 4;
     s->cm_table_select = tmp_val;
 
-    if (s->sub_sampling == 0)
-        tmp = 7999;
-    else
-        tmp = ((-(s->sub_sampling -1)) & 8000) + 20000;
-    /*
-    0: 7999 -> 0
-    1: 20000 -> 2
-    2: 28000 -> 2
-    */
-    if (tmp < 8000)
+    if (avctx->bit_rate <= 8000)
         s->coeff_per_sb_select = 0;
-    else if (tmp <= 16000)
+    else if (avctx->bit_rate < 16000)
         s->coeff_per_sb_select = 1;
     else
         s->coeff_per_sb_select = 2;
@@ -1912,6 +1790,9 @@ static int qdm2_decode(QDM2Context *q, const uint8_t *in, int16_t *out)
     int ch, i;
     const int frame_size = (q->frame_size * q->channels);
 
+    if((unsigned)frame_size > FF_ARRAY_ELEMS(q->output_buffer)/2)
+        return -1;
+
     /* select input buffer */
     q->compressed_data = in;
     q->compressed_size = q->checksum_size;
@@ -1983,10 +1864,8 @@ static int qdm2_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 16 * s->frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     out = (int16_t *)frame->data[0];
 
     for (i = 0; i < 16; i++) {
@@ -2007,7 +1886,6 @@ AVCodec ff_qdm2_decoder = {
     .id               = AV_CODEC_ID_QDM2,
     .priv_data_size   = sizeof(QDM2Context),
     .init             = qdm2_decode_init,
-    .init_static_data = qdm2_init_static_data,
     .close            = qdm2_decode_close,
     .decode           = qdm2_decode_frame,
     .capabilities     = AV_CODEC_CAP_DR1,
diff --git a/libavcodec/qdm2_tablegen.c b/libavcodec/qdm2_tablegen.c
index 59d82df851..e19b49b235 100644
--- a/libavcodec/qdm2_tablegen.c
+++ b/libavcodec/qdm2_tablegen.c
@@ -3,27 +3,27 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdlib.h>
+#include "tableprint_vlc.h"
 #define CONFIG_HARDCODED_TABLES 0
 #include "qdm2_tablegen.h"
-#include "tableprint.h"
 
 int main(void)
 {
@@ -40,5 +40,22 @@ int main(void)
     WRITE_2D_ARRAY("static const", uint8_t, random_dequant_index);
     WRITE_2D_ARRAY("static const", uint8_t, random_dequant_type24);
 
+    qdm2_init_vlc();
+
+    WRITE_2D_ARRAY("static const", VLC_TYPE, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_level, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_diff, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_run, qdm2_table);
+    WRITE_VLC_TYPE("static const", fft_level_exp_alt_vlc, qdm2_table);
+    WRITE_VLC_TYPE("static const", fft_level_exp_vlc, qdm2_table);
+    WRITE_VLC_TYPE("static const", fft_stereo_exp_vlc, qdm2_table);
+    WRITE_VLC_TYPE("static const", fft_stereo_phase_vlc, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_tone_level_idx_hi1, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_tone_level_idx_mid, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_tone_level_idx_hi2, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_type30, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_type34, qdm2_table);
+    WRITE_VLC_ARRAY("static const", vlc_tab_fft_tone_offset, qdm2_table);
+
     return 0;
 }
diff --git a/libavcodec/qdm2_tablegen.h b/libavcodec/qdm2_tablegen.h
index bb73d92531..2331ebfbb2 100644
--- a/libavcodec/qdm2_tablegen.h
+++ b/libavcodec/qdm2_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include <stdint.h>
 #include <math.h>
 #include "libavutil/attributes.h"
+#include "qdm2data.h"
 
 #define SOFTCLIP_THRESHOLD 27600
 #define HARDCLIP_THRESHOLD 35716
@@ -34,10 +35,11 @@
 #define softclip_table_init()
 #define rnd_table_init()
 #define init_noise_samples()
+#define qdm2_init_vlc()
 #include "libavcodec/qdm2_tables.h"
 #else
 static uint16_t softclip_table[HARDCLIP_THRESHOLD - SOFTCLIP_THRESHOLD + 1];
-static float noise_table[4096];
+static float noise_table[4096 + 20];
 static uint8_t random_dequant_index[256][5];
 static uint8_t random_dequant_type24[128][3];
 static float noise_samples[128];
@@ -54,8 +56,7 @@ static av_cold void softclip_table_init(void) {
 // random generated table
 static av_cold void rnd_table_init(void) {
     int i,j;
-    uint32_t ldw,hdw;
-    uint64_t tmp64_1;
+    uint32_t ldw;
     uint64_t random_seed = 0;
     float delta = 1.0 / 16384.0;
     for(i = 0; i < 4096 ;i++) {
@@ -67,22 +68,18 @@ static av_cold void rnd_table_init(void) {
         random_seed = 81;
         ldw = i;
         for (j = 0; j < 5 ;j++) {
-            random_dequant_index[i][j] = (uint8_t)((ldw / random_seed) & 0xFF);
-            ldw = (uint32_t)ldw % (uint32_t)random_seed;
-            tmp64_1 = (random_seed * 0x55555556);
-            hdw = (uint32_t)(tmp64_1 >> 32);
-            random_seed = (uint64_t)(hdw + (ldw >> 31));
+            random_dequant_index[i][j] = ldw / random_seed;
+            ldw %= random_seed;
+            random_seed /= 3;
         }
     }
     for (i = 0; i < 128 ;i++) {
         random_seed = 25;
         ldw = i;
         for (j = 0; j < 3 ;j++) {
-            random_dequant_type24[i][j] = (uint8_t)((ldw / random_seed) & 0xFF);
-            ldw = (uint32_t)ldw % (uint32_t)random_seed;
-            tmp64_1 = (random_seed * 0x66666667);
-            hdw = (uint32_t)(tmp64_1 >> 33);
-            random_seed = hdw + (ldw >> 31);
+            random_dequant_type24[i][j] = ldw / random_seed;
+            ldw %= random_seed;
+            random_seed /= 5;
         }
     }
 }
@@ -97,6 +94,168 @@ static av_cold void init_noise_samples(void) {
         noise_samples[i] = (delta * (float)((random_seed >> 16) & 0x00007fff) - 1.0);
     }
 }
+
+static VLC vlc_tab_level;
+static VLC vlc_tab_diff;
+static VLC vlc_tab_run;
+static VLC fft_level_exp_alt_vlc;
+static VLC fft_level_exp_vlc;
+static VLC fft_stereo_exp_vlc;
+static VLC fft_stereo_phase_vlc;
+static VLC vlc_tab_tone_level_idx_hi1;
+static VLC vlc_tab_tone_level_idx_mid;
+static VLC vlc_tab_tone_level_idx_hi2;
+static VLC vlc_tab_type30;
+static VLC vlc_tab_type34;
+static VLC vlc_tab_fft_tone_offset[5];
+
+static const uint16_t qdm2_vlc_offs[] = {
+    0,260,566,598,894,1166,1230,1294,1678,1950,2214,2278,2310,2570,2834,3124,3448,3838,
+};
+
+static VLC_TYPE qdm2_table[3838][2];
+
+static av_cold void qdm2_init_vlc(void)
+{
+    vlc_tab_level.table           = &qdm2_table[qdm2_vlc_offs[0]];
+    vlc_tab_level.table_allocated = qdm2_vlc_offs[1] - qdm2_vlc_offs[0];
+    init_vlc(&vlc_tab_level, 8, 24,
+             vlc_tab_level_huffbits, 1, 1,
+             vlc_tab_level_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_diff.table           = &qdm2_table[qdm2_vlc_offs[1]];
+    vlc_tab_diff.table_allocated = qdm2_vlc_offs[2] - qdm2_vlc_offs[1];
+    init_vlc(&vlc_tab_diff, 8, 37,
+             vlc_tab_diff_huffbits, 1, 1,
+             vlc_tab_diff_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_run.table           = &qdm2_table[qdm2_vlc_offs[2]];
+    vlc_tab_run.table_allocated = qdm2_vlc_offs[3] - qdm2_vlc_offs[2];
+    init_vlc(&vlc_tab_run, 5, 6,
+             vlc_tab_run_huffbits, 1, 1,
+             vlc_tab_run_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    fft_level_exp_alt_vlc.table           = &qdm2_table[qdm2_vlc_offs[3]];
+    fft_level_exp_alt_vlc.table_allocated = qdm2_vlc_offs[4] -
+                                            qdm2_vlc_offs[3];
+    init_vlc(&fft_level_exp_alt_vlc, 8, 28,
+             fft_level_exp_alt_huffbits, 1, 1,
+             fft_level_exp_alt_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    fft_level_exp_vlc.table           = &qdm2_table[qdm2_vlc_offs[4]];
+    fft_level_exp_vlc.table_allocated = qdm2_vlc_offs[5] - qdm2_vlc_offs[4];
+    init_vlc(&fft_level_exp_vlc, 8, 20,
+             fft_level_exp_huffbits, 1, 1,
+             fft_level_exp_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    fft_stereo_exp_vlc.table           = &qdm2_table[qdm2_vlc_offs[5]];
+    fft_stereo_exp_vlc.table_allocated = qdm2_vlc_offs[6] -
+                                         qdm2_vlc_offs[5];
+    init_vlc(&fft_stereo_exp_vlc, 6, 7,
+             fft_stereo_exp_huffbits, 1, 1,
+             fft_stereo_exp_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    fft_stereo_phase_vlc.table           = &qdm2_table[qdm2_vlc_offs[6]];
+    fft_stereo_phase_vlc.table_allocated = qdm2_vlc_offs[7] -
+                                           qdm2_vlc_offs[6];
+    init_vlc(&fft_stereo_phase_vlc, 6, 9,
+             fft_stereo_phase_huffbits, 1, 1,
+             fft_stereo_phase_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_tone_level_idx_hi1.table =
+        &qdm2_table[qdm2_vlc_offs[7]];
+    vlc_tab_tone_level_idx_hi1.table_allocated = qdm2_vlc_offs[8] -
+                                                 qdm2_vlc_offs[7];
+    init_vlc(&vlc_tab_tone_level_idx_hi1, 8, 20,
+             vlc_tab_tone_level_idx_hi1_huffbits, 1, 1,
+             vlc_tab_tone_level_idx_hi1_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_tone_level_idx_mid.table =
+        &qdm2_table[qdm2_vlc_offs[8]];
+    vlc_tab_tone_level_idx_mid.table_allocated = qdm2_vlc_offs[9] -
+                                                 qdm2_vlc_offs[8];
+    init_vlc(&vlc_tab_tone_level_idx_mid, 8, 24,
+             vlc_tab_tone_level_idx_mid_huffbits, 1, 1,
+             vlc_tab_tone_level_idx_mid_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_tone_level_idx_hi2.table =
+        &qdm2_table[qdm2_vlc_offs[9]];
+    vlc_tab_tone_level_idx_hi2.table_allocated = qdm2_vlc_offs[10] -
+                                                 qdm2_vlc_offs[9];
+    init_vlc(&vlc_tab_tone_level_idx_hi2, 8, 24,
+             vlc_tab_tone_level_idx_hi2_huffbits, 1, 1,
+             vlc_tab_tone_level_idx_hi2_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_type30.table           = &qdm2_table[qdm2_vlc_offs[10]];
+    vlc_tab_type30.table_allocated = qdm2_vlc_offs[11] - qdm2_vlc_offs[10];
+    init_vlc(&vlc_tab_type30, 6, 9,
+             vlc_tab_type30_huffbits, 1, 1,
+             vlc_tab_type30_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_type34.table           = &qdm2_table[qdm2_vlc_offs[11]];
+    vlc_tab_type34.table_allocated = qdm2_vlc_offs[12] - qdm2_vlc_offs[11];
+    init_vlc(&vlc_tab_type34, 5, 10,
+             vlc_tab_type34_huffbits, 1, 1,
+             vlc_tab_type34_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[0].table =
+        &qdm2_table[qdm2_vlc_offs[12]];
+    vlc_tab_fft_tone_offset[0].table_allocated = qdm2_vlc_offs[13] -
+                                                 qdm2_vlc_offs[12];
+    init_vlc(&vlc_tab_fft_tone_offset[0], 8, 23,
+             vlc_tab_fft_tone_offset_0_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_0_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[1].table =
+        &qdm2_table[qdm2_vlc_offs[13]];
+    vlc_tab_fft_tone_offset[1].table_allocated = qdm2_vlc_offs[14] -
+                                                 qdm2_vlc_offs[13];
+    init_vlc(&vlc_tab_fft_tone_offset[1], 8, 28,
+             vlc_tab_fft_tone_offset_1_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_1_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[2].table =
+        &qdm2_table[qdm2_vlc_offs[14]];
+    vlc_tab_fft_tone_offset[2].table_allocated = qdm2_vlc_offs[15] -
+                                                 qdm2_vlc_offs[14];
+    init_vlc(&vlc_tab_fft_tone_offset[2], 8, 32,
+             vlc_tab_fft_tone_offset_2_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_2_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[3].table =
+        &qdm2_table[qdm2_vlc_offs[15]];
+    vlc_tab_fft_tone_offset[3].table_allocated = qdm2_vlc_offs[16] -
+                                                 qdm2_vlc_offs[15];
+    init_vlc(&vlc_tab_fft_tone_offset[3], 8, 35,
+             vlc_tab_fft_tone_offset_3_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_3_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[4].table =
+        &qdm2_table[qdm2_vlc_offs[16]];
+    vlc_tab_fft_tone_offset[4].table_allocated = qdm2_vlc_offs[17] -
+                                                 qdm2_vlc_offs[16];
+    init_vlc(&vlc_tab_fft_tone_offset[4], 8, 38,
+             vlc_tab_fft_tone_offset_4_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_4_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+}
+
 #endif /* CONFIG_HARDCODED_TABLES */
 
 #endif /* AVCODEC_QDM2_TABLEGEN_H */
diff --git a/libavcodec/qdm2data.h b/libavcodec/qdm2data.h
index ad6ea88ff6..355d61387b 100644
--- a/libavcodec/qdm2data.h
+++ b/libavcodec/qdm2data.h
@@ -5,20 +5,20 @@
  * Copyright (c) 2005 Alex Beregszaszi
  * Copyright (c) 2005 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qdrw.c b/libavcodec/qdrw.c
index b7493e4da7..0a31b41660 100644
--- a/libavcodec/qdrw.c
+++ b/libavcodec/qdrw.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2004 Konstantin Shishkov
  * Copyright (c) 2015 Vittorio Giovara
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -95,6 +95,8 @@ static int decode_rle(AVCodecContext *avctx, AVFrame *p, GetByteContext *gbc,
                         pos -= offset;
                         pos++;
                     }
+                    if (pos >= offset)
+                        return AVERROR_INVALIDDATA;
                 }
                 left  -= 2;
             } else { /* copy */
@@ -105,6 +107,8 @@ static int decode_rle(AVCodecContext *avctx, AVFrame *p, GetByteContext *gbc,
                         pos -= offset;
                         pos++;
                     }
+                    if (pos >= offset)
+                        return AVERROR_INVALIDDATA;
                 }
                 left  -= 2 + code;
             }
@@ -114,6 +118,29 @@ static int decode_rle(AVCodecContext *avctx, AVFrame *p, GetByteContext *gbc,
     return 0;
 }
 
+static int check_header(const char *buf, int buf_size)
+{
+    unsigned w, h, v0, v1;
+
+    if (buf_size < 40)
+        return 0;
+
+    w = AV_RB16(buf+6);
+    h = AV_RB16(buf+8);
+    v0 = AV_RB16(buf+10);
+    v1 = AV_RB16(buf+12);
+
+    if (!w || !h)
+        return 0;
+
+    if (v0 == 0x1101)
+        return 1;
+    if (v0 == 0x0011 && v1 == 0x02FF)
+        return 2;
+    return 0;
+}
+
+
 static int decode_frame(AVCodecContext *avctx,
                         void *data, int *got_frame,
                         AVPacket *avpkt)
@@ -122,13 +149,16 @@ static int decode_frame(AVCodecContext *avctx,
     GetByteContext gbc;
     int colors;
     int w, h, ret;
+    int ver;
 
     bytestream2_init(&gbc, avpkt->data, avpkt->size);
-
-    /* PICT images start with a 512 bytes empty header */
-    if (bytestream2_peek_be32(&gbc) == 0)
+    if (   bytestream2_get_bytes_left(&gbc) >= 552
+           &&  check_header(gbc.buffer + 512, bytestream2_get_bytes_left(&gbc) - 512)
+       )
         bytestream2_skip(&gbc, 512);
 
+    ver = check_header(gbc.buffer, bytestream2_get_bytes_left(&gbc));
+
     /* smallest PICT header */
     if (bytestream2_get_bytes_left(&gbc) < 40) {
         av_log(avctx, AV_LOG_ERROR, "Frame is too small %d\n",
@@ -146,12 +176,15 @@ static int decode_frame(AVCodecContext *avctx,
 
     /* version 1 is identified by 0x1101
      * it uses byte-aligned opcodes rather than word-aligned */
-    if (bytestream2_get_be32(&gbc) != 0x001102FF) {
+    if (ver == 1) {
         avpriv_request_sample(avctx, "QuickDraw version 1");
         return AVERROR_PATCHWELCOME;
+    } else if (ver != 2) {
+        avpriv_request_sample(avctx, "QuickDraw version unknown (%X)", bytestream2_get_be32(&gbc));
+        return AVERROR_PATCHWELCOME;
     }
 
-    bytestream2_skip(&gbc, 26);
+    bytestream2_skip(&gbc, 4+26);
 
     while (bytestream2_get_bytes_left(&gbc) >= 4) {
         int bppcnt, bpp;
@@ -191,10 +224,8 @@ static int decode_frame(AVCodecContext *avctx,
                        bytestream2_get_bytes_left(&gbc));
                 return AVERROR_INVALIDDATA;
             }
-            if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+            if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
                 return ret;
-            }
 
             parse_palette(avctx, &gbc, (uint32_t *)p->data[1], colors);
             p->palette_has_changed = 1;
diff --git a/libavcodec/qpeg.c b/libavcodec/qpeg.c
index f549cd59fc..9eaf9b8054 100644
--- a/libavcodec/qpeg.c
+++ b/libavcodec/qpeg.c
@@ -2,20 +2,20 @@
  * QPEG codec
  * Copyright (c) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,8 +30,7 @@
 
 typedef struct QpegContext{
     AVCodecContext *avctx;
-    AVFrame *pic;
-    uint8_t *refdata;
+    AVFrame *pic, *ref;
     uint32_t pal[256];
     GetByteContext buffer;
 } QpegContext;
@@ -111,7 +110,7 @@ static const int qpeg_table_w[16] =
  { 0x00, 0x20, 0x18, 0x08, 0x18, 0x10, 0x20, 0x10, 0x08, 0x10, 0x20, 0x20, 0x08, 0x10, 0x18, 0x04};
 
 /* Decodes delta frames */
-static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
+static void av_noinline qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
                               int stride, int width, int height,
                               int delta, const uint8_t *ctable,
                               uint8_t *refdata)
@@ -121,9 +120,13 @@ static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
     int filled = 0;
     int orig_height;
 
-    /* copy prev frame */
-    for(i = 0; i < height; i++)
-        memcpy(refdata + (i * width), dst + (i * stride), width);
+    if (refdata) {
+        /* copy prev frame */
+        for (i = 0; i < height; i++)
+            memcpy(dst + (i * stride), refdata + (i * stride), width);
+    } else {
+        refdata = dst;
+    }
 
     orig_height = height;
     height--;
@@ -134,7 +137,7 @@ static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
 
         if(delta) {
             /* motion compensation */
-            while((code & 0xF0) == 0xF0) {
+            while(bytestream2_get_bytes_left(&qctx->buffer) > 0 && (code & 0xF0) == 0xF0) {
                 if(delta == 1) {
                     int me_idx;
                     int me_w, me_h, me_x, me_y;
@@ -161,16 +164,16 @@ static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
 
                     /* check motion vector */
                     if ((me_x + filled < 0) || (me_x + me_w + filled > width) ||
-                       (height - me_y - me_h < 0) || (height - me_y > orig_height) ||
+                       (height - me_y - me_h < 0) || (height - me_y >= orig_height) ||
                        (filled + me_w > width) || (height - me_h < 0))
                         av_log(NULL, AV_LOG_ERROR, "Bogus motion vector (%i,%i), block size %ix%i at %i,%i\n",
                                me_x, me_y, me_w, me_h, filled, height);
                     else {
                         /* do motion compensation */
-                        me_plane = refdata + (filled + me_x) + (height - me_y) * width;
+                        me_plane = refdata + (filled + me_x) + (height - me_y) * stride;
                         for(j = 0; j < me_h; j++) {
                             for(i = 0; i < me_w; i++)
-                                dst[filled + i - (j * stride)] = me_plane[i - (j * width)];
+                                dst[filled + i - (j * stride)] = me_plane[i - (j * stride)];
                         }
                     }
                 }
@@ -198,6 +201,9 @@ static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
         } else if(code >= 0xC0) { /* copy code: 0xC0..0xDF */
             code &= 0x1F;
 
+            if(code + 1 > bytestream2_get_bytes_left(&qctx->buffer))
+                break;
+
             for(i = 0; i <= code; i++) {
                 dst[filled++] = bytestream2_get_byte(&qctx->buffer);
                 if(filled >= width) {
@@ -251,6 +257,7 @@ static int decode_frame(AVCodecContext *avctx,
     uint8_t ctable[128];
     QpegContext * const a = avctx->priv_data;
     AVFrame * const p = a->pic;
+    AVFrame * const ref = a->ref;
     uint8_t* outdata;
     int delta, ret;
     const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
@@ -261,10 +268,12 @@ static int decode_frame(AVCodecContext *avctx,
     }
 
     bytestream2_init(&a->buffer, avpkt->data, avpkt->size);
-    if ((ret = ff_reget_buffer(avctx, p)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+
+    av_frame_unref(ref);
+    av_frame_move_ref(ref, p);
+
+    if ((ret = ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
     outdata = p->data[0];
     bytestream2_skip(&a->buffer, 4);
     bytestream2_get_buffer(&a->buffer, ctable, 128);
@@ -274,7 +283,7 @@ static int decode_frame(AVCodecContext *avctx,
     if(delta == 0x10) {
         qpeg_decode_intra(a, outdata, p->linesize[0], avctx->width, avctx->height);
     } else {
-        qpeg_decode_inter(a, outdata, p->linesize[0], avctx->width, avctx->height, delta, ctable, a->refdata);
+        qpeg_decode_inter(a, outdata, p->linesize[0], avctx->width, avctx->height, delta, ctable, ref->data[0]);
     }
 
     /* make the palette available on the way out */
@@ -292,13 +301,25 @@ static int decode_frame(AVCodecContext *avctx,
     return avpkt->size;
 }
 
+static void decode_flush(AVCodecContext *avctx){
+    QpegContext * const a = avctx->priv_data;
+    int i, pal_size;
+    const uint8_t *pal_src;
+
+    pal_size = FFMIN(1024U, avctx->extradata_size);
+    pal_src = avctx->extradata + avctx->extradata_size - pal_size;
+
+    for (i=0; i<pal_size/4; i++)
+        a->pal[i] = 0xFFU<<24 | AV_RL32(pal_src+4*i);
+}
+
 static av_cold int decode_end(AVCodecContext *avctx)
 {
     QpegContext * const a = avctx->priv_data;
 
     av_frame_free(&a->pic);
+    av_frame_free(&a->ref);
 
-    av_free(a->refdata);
     return 0;
 }
 
@@ -307,10 +328,12 @@ static av_cold int decode_init(AVCodecContext *avctx){
 
     a->avctx = avctx;
     avctx->pix_fmt= AV_PIX_FMT_PAL8;
-    a->refdata = av_malloc(avctx->width * avctx->height);
+
+    decode_flush(avctx);
 
     a->pic = av_frame_alloc();
-    if (!a->pic) {
+    a->ref = av_frame_alloc();
+    if (!a->pic || !a->ref) {
         decode_end(avctx);
         return AVERROR(ENOMEM);
     }
@@ -327,5 +350,6 @@ AVCodec ff_qpeg_decoder = {
     .init           = decode_init,
     .close          = decode_end,
     .decode         = decode_frame,
+    .flush          = decode_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/qpel_template.c b/libavcodec/qpel_template.c
index 2106160741..e52a78cf22 100644
--- a/libavcodec/qpel_template.c
+++ b/libavcodec/qpel_template.c
@@ -1,20 +1,22 @@
 /*
  * quarterpel DSP function templates
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qpeldsp.c b/libavcodec/qpeldsp.c
index 1d0422a382..6e52b33657 100644
--- a/libavcodec/qpeldsp.c
+++ b/libavcodec/qpeldsp.c
@@ -1,20 +1,22 @@
 /*
  * quarterpel DSP functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +32,7 @@
 #include "libavutil/attributes.h"
 #include "copy_block.h"
 #include "qpeldsp.h"
+#include "diracdsp.h"
 
 #define BIT_DEPTH 8
 #include "hpel_template.c"
@@ -732,6 +735,51 @@ void ff_put_pixels8_l2_8(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 
 }
 
+#if CONFIG_DIRAC_DECODER
+#define DIRAC_MC(OPNAME)\
+void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+     OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
+    OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
+    OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
+    OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
+}
+DIRAC_MC(put)
+DIRAC_MC(avg)
+#endif
+
 av_cold void ff_qpeldsp_init(QpelDSPContext *c)
 {
 #define dspfunc(PFX, IDX, NUM)                              \
@@ -763,4 +811,6 @@ av_cold void ff_qpeldsp_init(QpelDSPContext *c)
 
     if (ARCH_X86)
         ff_qpeldsp_init_x86(c);
+    if (ARCH_MIPS)
+        ff_qpeldsp_init_mips(c);
 }
diff --git a/libavcodec/qpeldsp.h b/libavcodec/qpeldsp.h
index 4ad141d057..91019eda9c 100644
--- a/libavcodec/qpeldsp.h
+++ b/libavcodec/qpeldsp.h
@@ -1,20 +1,20 @@
 /*
  * quarterpel DSP functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -78,5 +78,6 @@ typedef struct QpelDSPContext {
 void ff_qpeldsp_init(QpelDSPContext *c);
 
 void ff_qpeldsp_init_x86(QpelDSPContext *c);
+void ff_qpeldsp_init_mips(QpelDSPContext *c);
 
 #endif /* AVCODEC_QPELDSP_H */
diff --git a/libavcodec/qsv.c b/libavcodec/qsv.c
index ee6b262f31..4c8e6b01a9 100644
--- a/libavcodec/qsv.c
+++ b/libavcodec/qsv.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV encoder/decoder shared code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -85,8 +85,90 @@ int ff_qsv_error(int mfx_err)
         return AVERROR_UNKNOWN;
     }
 }
+static int ff_qsv_set_display_handle(AVCodecContext *avctx, QSVSession *qs)
+{
+    // this code is only required for Linux.  It searches for a valid
+    // display handle.  First in /dev/dri/renderD then in /dev/dri/card
+#ifdef AVCODEC_QSV_LINUX_SESSION_HANDLE
+    // VAAPI display handle
+    int ret = 0;
+    VADisplay va_dpy = NULL;
+    VAStatus va_res = VA_STATUS_SUCCESS;
+    int major_version = 0, minor_version = 0;
+    int fd = -1;
+    char adapterpath[256];
+    int adapter_num;
+
+    qs->fd_display = -1;
+    qs->va_display = NULL;
+
+    //search for valid graphics device
+    for (adapter_num = 0;adapter_num < 6;adapter_num++) {
+
+        if (adapter_num<3) {
+            snprintf(adapterpath,sizeof(adapterpath),
+                "/dev/dri/renderD%d", adapter_num+128);
+        } else {
+            snprintf(adapterpath,sizeof(adapterpath),
+                "/dev/dri/card%d", adapter_num-3);
+        }
+
+        fd = open(adapterpath, O_RDWR);
+        if (fd < 0) {
+            av_log(avctx, AV_LOG_ERROR,
+                "mfx init: %s fd open failed\n", adapterpath);
+            continue;
+        }
 
-int ff_qsv_init_internal_session(AVCodecContext *avctx, mfxSession *session,
+        va_dpy = vaGetDisplayDRM(fd);
+        if (!va_dpy) {
+            av_log(avctx, AV_LOG_ERROR,
+                "mfx init: %s vaGetDisplayDRM failed\n", adapterpath);
+            close(fd);
+            continue;
+        }
+
+        va_res = vaInitialize(va_dpy, &major_version, &minor_version);
+        if (VA_STATUS_SUCCESS != va_res) {
+            av_log(avctx, AV_LOG_ERROR,
+                "mfx init: %s vaInitialize failed\n", adapterpath);
+            close(fd);
+            fd = -1;
+            continue;
+        } else {
+            av_log(avctx, AV_LOG_VERBOSE,
+            "mfx initialization: %s vaInitialize successful\n",adapterpath);
+            qs->fd_display = fd;
+            qs->va_display = va_dpy;
+            ret = MFXVideoCORE_SetHandle(qs->session,
+                  (mfxHandleType)MFX_HANDLE_VA_DISPLAY, (mfxHDL)va_dpy);
+            if (ret < 0) {
+                av_log(avctx, AV_LOG_ERROR,
+                "Error %d during set display handle\n", ret);
+                return ff_qsv_error(ret);
+            }
+            break;
+        }
+    }
+#endif //AVCODEC_QSV_LINUX_SESSION_HANDLE
+    return 0;
+}
+/**
+ * @brief Initialize a MSDK session
+ *
+ * Media SDK is based on sessions, so this is the prerequisite
+ * initialization for HW acceleration.  For Windows the session is
+ * complete and ready to use, for Linux a display handle is
+ * required.  For releases of Media Server Studio >= 2015 R4 the
+ * render nodes interface is preferred (/dev/dri/renderD).
+ * Using Media Server Studio 2015 R4 or newer is recommended
+ * but the older /dev/dri/card interface is also searched
+ * for broader compatibility.
+ *
+ * @param avctx    ffmpeg metadata for this codec context
+ * @param session  the MSDK session used
+ */
+int ff_qsv_init_internal_session(AVCodecContext *avctx, QSVSession *qs,
                                  const char *load_plugins)
 {
     mfxIMPL impl   = MFX_IMPL_AUTO_ANY;
@@ -95,13 +177,17 @@ int ff_qsv_init_internal_session(AVCodecContext *avctx, mfxSession *session,
     const char *desc;
     int ret;
 
-    ret = MFXInit(impl, &ver, session);
+    ret = MFXInit(impl, &ver, &qs->session);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error initializing an internal MFX session\n");
         return ff_qsv_error(ret);
     }
 
-    MFXQueryIMPL(*session, &impl);
+    ret = ff_qsv_set_display_handle(avctx, qs);
+    if (ret < 0)
+        return ret;
+
+    MFXQueryIMPL(qs->session, &impl);
 
     switch (MFX_IMPL_BASETYPE(impl)) {
     case MFX_IMPL_SOFTWARE:
@@ -141,7 +227,7 @@ int ff_qsv_init_internal_session(AVCodecContext *avctx, mfxSession *session,
 
             }
 
-            ret = MFXVideoUSER_Load(*session, &uid, 1);
+            ret = MFXVideoUSER_Load(qs->session, &uid, 1);
             if (ret < 0) {
                 av_log(avctx, AV_LOG_ERROR, "Could not load the requested plugin: %s\n",
                        plugin);
@@ -162,3 +248,22 @@ load_plugin_fail:
 
     return 0;
 }
+
+int ff_qsv_close_internal_session(QSVSession *qs)
+{
+    if (qs->session) {
+        MFXClose(qs->session);
+        qs->session = NULL;
+    }
+#ifdef AVCODEC_QSV_LINUX_SESSION_HANDLE
+    if (qs->va_display) {
+        vaTerminate(qs->va_display);
+        qs->va_display = NULL;
+    }
+    if (qs->fd_display > 0) {
+        close(qs->fd_display);
+        qs->fd_display = -1;
+    }
+#endif
+    return 0;
+}
diff --git a/libavcodec/qsv.h b/libavcodec/qsv.h
index 922b8582a4..6049629cc9 100644
--- a/libavcodec/qsv.h
+++ b/libavcodec/qsv.h
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV public API
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qsv_api.c b/libavcodec/qsv_api.c
index 234b5961e3..327ff7d813 100644
--- a/libavcodec/qsv_api.c
+++ b/libavcodec/qsv_api.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV public API functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qsv_internal.h b/libavcodec/qsv_internal.h
index 949f6ebf9b..b9ad199e33 100644
--- a/libavcodec/qsv_internal.h
+++ b/libavcodec/qsv_internal.h
@@ -1,32 +1,47 @@
 /*
  * Intel MediaSDK QSV encoder/decoder shared code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_QSV_INTERNAL_H
 #define AVCODEC_QSV_INTERNAL_H
 
+#if CONFIG_VAAPI
+#define AVCODEC_QSV_LINUX_SESSION_HANDLE
+#endif //CONFIG_VAAPI
+
+#ifdef AVCODEC_QSV_LINUX_SESSION_HANDLE
+#include <stdio.h>
+#include <string.h>
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <fcntl.h>
+#include <va/va.h>
+#include <va/va_drm.h>
+#endif
+
 #include <mfx/mfxvideo.h>
 
 #include "libavutil/frame.h"
 
 #define QSV_VERSION_MAJOR 1
-#define QSV_VERSION_MINOR 1
+#define QSV_VERSION_MINOR 9
 
 #define ASYNC_DEPTH_DEFAULT 4       // internal parallelism
 
@@ -45,14 +60,23 @@ typedef struct QSVFrame {
     struct QSVFrame *next;
 } QSVFrame;
 
+typedef struct QSVSession {
+    mfxSession session;
+#ifdef AVCODEC_QSV_LINUX_SESSION_HANDLE
+    int        fd_display;
+    VADisplay  va_display;
+#endif
+} QSVSession;
+
 /**
- * Convert a libmfx error code into a libav error code.
+ * Convert a libmfx error code into a ffmpeg error code.
  */
 int ff_qsv_error(int mfx_err);
 
 int ff_qsv_codec_id_to_mfx(enum AVCodecID codec_id);
 
-int ff_qsv_init_internal_session(AVCodecContext *avctx, mfxSession *session,
+int ff_qsv_init_internal_session(AVCodecContext *avctx, QSVSession *qs,
                                  const char *load_plugins);
+int ff_qsv_close_internal_session(QSVSession *qs);
 
 #endif /* AVCODEC_QSV_INTERNAL_H */
diff --git a/libavcodec/qsvdec.c b/libavcodec/qsvdec.c
index ca6f78110b..9fefc41020 100644
--- a/libavcodec/qsvdec.c
+++ b/libavcodec/qsvdec.c
@@ -4,20 +4,20 @@
  * copyright (c) 2013 Luca Barbato
  * copyright (c) 2015 Anton Khirnov <anton@khirnov.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,73 +49,116 @@ int ff_qsv_map_pixfmt(enum AVPixelFormat format)
     }
 }
 
-static int qsv_init_session(AVCodecContext *avctx, QSVContext *q, mfxSession session)
-{
-    if (!session) {
-        if (!q->internal_session) {
-            int ret = ff_qsv_init_internal_session(avctx, &q->internal_session,
-                                                   q->load_plugins);
-            if (ret < 0)
-                return ret;
-        }
-
-        q->session = q->internal_session;
-    } else {
-        q->session = session;
-    }
-
-    /* make sure the decoder is uninitialized */
-    MFXVideoDECODE_Close(q->session);
-
-    return 0;
-}
-
-static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q, mfxSession session)
+static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q, AVPacket *avpkt)
 {
     mfxVideoParam param = { { 0 } };
+    mfxBitstream bs   = { { { 0 } } };
     int ret;
+    enum AVPixelFormat pix_fmts[3] = { AV_PIX_FMT_QSV,
+                                       AV_PIX_FMT_NV12,
+                                       AV_PIX_FMT_NONE };
 
-    if (!q->async_fifo) {
-        q->async_fifo = av_fifo_alloc((1 + q->async_depth) *
-                                      (sizeof(mfxSyncPoint) + sizeof(QSVFrame*)));
-        if (!q->async_fifo)
-            return AVERROR(ENOMEM);
-    }
+    q->iopattern  = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
+    if (!q->session) {
+        if (avctx->hwaccel_context) {
+            AVQSVContext *qsv = avctx->hwaccel_context;
 
-    ret = qsv_init_session(avctx, q, session);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error initializing an MFX session\n");
-        return ret;
+            q->session        = qsv->session;
+            q->iopattern      = qsv->iopattern;
+            q->ext_buffers    = qsv->ext_buffers;
+            q->nb_ext_buffers = qsv->nb_ext_buffers;
+        }
+        if (!q->session) {
+            ret = ff_qsv_init_internal_session(avctx, &q->internal_qs,
+                                               q->load_plugins);
+            if (ret < 0)
+                return ret;
+
+            q->session = q->internal_qs.session;
+        }
     }
 
+    if (avpkt->size) {
+        bs.Data       = avpkt->data;
+        bs.DataLength = avpkt->size;
+        bs.MaxLength  = bs.DataLength;
+        bs.TimeStamp  = avpkt->pts;
+    } else
+        return AVERROR_INVALIDDATA;
 
     ret = ff_qsv_codec_id_to_mfx(avctx->codec_id);
-    if (ret < 0)
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported codec_id %08x\n", avctx->codec_id);
         return ret;
+    }
 
-    param.mfx.CodecId      = ret;
-    param.mfx.CodecProfile = avctx->profile;
-    param.mfx.CodecLevel   = avctx->level;
-
-    param.mfx.FrameInfo.BitDepthLuma   = 8;
-    param.mfx.FrameInfo.BitDepthChroma = 8;
-    param.mfx.FrameInfo.Shift          = 0;
-    param.mfx.FrameInfo.FourCC         = MFX_FOURCC_NV12;
-    param.mfx.FrameInfo.Width          = avctx->coded_width;
-    param.mfx.FrameInfo.Height         = avctx->coded_height;
-    param.mfx.FrameInfo.ChromaFormat   = MFX_CHROMAFORMAT_YUV420;
+    param.mfx.CodecId = ret;
 
+    ret = MFXVideoDECODE_DecodeHeader(q->session, &bs, &param);
+    if (MFX_ERR_MORE_DATA==ret) {
+        /* this code means that header not found so we return packet size to skip
+           a current packet
+         */
+        return avpkt->size;
+    } else if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Decode header error %d\n", ret);
+        return ff_qsv_error(ret);
+    }
     param.IOPattern   = q->iopattern;
     param.AsyncDepth  = q->async_depth;
     param.ExtParam    = q->ext_buffers;
     param.NumExtParam = q->nb_ext_buffers;
+    param.mfx.FrameInfo.BitDepthLuma   = 8;
+    param.mfx.FrameInfo.BitDepthChroma = 8;
 
     ret = MFXVideoDECODE_Init(q->session, &param);
     if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error initializing the MFX video decoder\n");
+        if (MFX_ERR_INVALID_VIDEO_PARAM==ret) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Error initializing the MFX video decoder, unsupported video\n");
+        } else {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Error initializing the MFX video decoder %d\n", ret);
+        }
         return ff_qsv_error(ret);
     }
 
+    ret = ff_get_format(avctx, pix_fmts);
+    if (ret < 0)
+        return ret;
+
+    avctx->pix_fmt      = ret;
+    avctx->profile      = param.mfx.CodecProfile;
+    avctx->level        = param.mfx.CodecLevel;
+    avctx->coded_width  = param.mfx.FrameInfo.Width;
+    avctx->coded_height = param.mfx.FrameInfo.Height;
+    avctx->width        = param.mfx.FrameInfo.CropW - param.mfx.FrameInfo.CropX;
+    avctx->height       = param.mfx.FrameInfo.CropH - param.mfx.FrameInfo.CropY;
+
+    /* maximum decoder latency should be not exceed max DPB size for h.264 and
+       HEVC which is 16 for both cases.
+       So weare  pre-allocating fifo big enough for 17 elements:
+     */
+    if (!q->async_fifo) {
+        q->async_fifo = av_fifo_alloc((1 + 16) *
+                                      (sizeof(mfxSyncPoint) + sizeof(QSVFrame*)));
+        if (!q->async_fifo)
+            return AVERROR(ENOMEM);
+    }
+
+    if (!q->input_fifo) {
+        q->input_fifo = av_fifo_alloc(1024*16);
+        if (!q->input_fifo)
+            return AVERROR(ENOMEM);
+    }
+
+    if (!q->pkt_fifo) {
+        q->pkt_fifo = av_fifo_alloc( sizeof(AVPacket) * (1 + 16) );
+        if (!q->pkt_fifo)
+            return AVERROR(ENOMEM);
+    }
+    q->engine_ready = 1;
+
     return 0;
 }
 
@@ -211,9 +254,63 @@ static QSVFrame *find_frame(QSVContext *q, mfxFrameSurface1 *surf)
     return NULL;
 }
 
-static int qsv_decode(AVCodecContext *avctx, QSVContext *q,
-                      AVFrame *frame, int *got_frame,
-                      AVPacket *avpkt)
+/*  This function uses for 'smart' releasing of consumed data
+    from the input bitstream fifo.
+    Since the input fifo mapped to mfxBitstream which does not understand
+    a wrapping of data over fifo end, we should also to relocate a possible
+    data rest to fifo begin. If rest of data is absent then we just reset fifo's
+    pointers to initial positions.
+    NOTE the case when fifo does contain unconsumed data is rare and typical
+    amount of such data is 1..4 bytes.
+*/
+static void qsv_fifo_relocate(AVFifoBuffer *f, int bytes_to_free)
+{
+    int data_size;
+    int data_rest = 0;
+
+    av_fifo_drain(f, bytes_to_free);
+
+    data_size = av_fifo_size(f);
+    if (data_size > 0) {
+        if (f->buffer!=f->rptr) {
+            if ( (f->end - f->rptr) < data_size) {
+                data_rest = data_size - (f->end - f->rptr);
+                data_size-=data_rest;
+                memmove(f->buffer+data_size, f->buffer, data_rest);
+            }
+            memmove(f->buffer, f->rptr, data_size);
+            data_size+= data_rest;
+        }
+    }
+    f->rptr = f->buffer;
+    f->wptr = f->buffer + data_size;
+    f->wndx = data_size;
+    f->rndx = 0;
+}
+
+
+static void close_decoder(QSVContext *q)
+{
+    QSVFrame *cur;
+
+    if (q->session)
+        MFXVideoDECODE_Close(q->session);
+
+    cur = q->work_frames;
+    while (cur) {
+        q->work_frames = cur->next;
+        av_frame_free(&cur->frame);
+        av_freep(&cur);
+        cur = q->work_frames;
+    }
+
+    q->engine_ready   = 0;
+    q->reinit_pending = 0;
+}
+
+static int do_qsv_decode(AVCodecContext *avctx, QSVContext *q,
+                  AVFrame *frame, int *got_frame,
+                  AVPacket *avpkt)
 {
     QSVFrame *out_frame;
     mfxFrameSurface1 *insurf;
@@ -221,57 +318,97 @@ static int qsv_decode(AVCodecContext *avctx, QSVContext *q,
     mfxSyncPoint sync;
     mfxBitstream bs = { { { 0 } } };
     int ret;
+    int n_out_frames;
+    int buffered = 0;
+    int flush    = !avpkt->size || q->reinit_pending;
 
-    if (avpkt->size) {
-        bs.Data       = avpkt->data;
-        bs.DataLength = avpkt->size;
+    if (!q->engine_ready) {
+        ret = qsv_decode_init(avctx, q, avpkt);
+        if (ret)
+            return ret;
+    }
+
+    if (!flush) {
+        if (av_fifo_size(q->input_fifo)) {
+            /* we have got rest of previous packet into buffer */
+            if (av_fifo_space(q->input_fifo) < avpkt->size) {
+                ret = av_fifo_grow(q->input_fifo, avpkt->size);
+                if (ret < 0)
+                    return ret;
+            }
+            av_fifo_generic_write(q->input_fifo, avpkt->data, avpkt->size, NULL);
+            bs.Data       = q->input_fifo->rptr;
+            bs.DataLength = av_fifo_size(q->input_fifo);
+            buffered = 1;
+        } else {
+            bs.Data       = avpkt->data;
+            bs.DataLength = avpkt->size;
+        }
         bs.MaxLength  = bs.DataLength;
         bs.TimeStamp  = avpkt->pts;
     }
 
-    do {
+    while (1) {
         ret = get_surface(avctx, q, &insurf);
         if (ret < 0)
             return ret;
+        do {
+            ret = MFXVideoDECODE_DecodeFrameAsync(q->session, flush ? NULL : &bs,
+                                                  insurf, &outsurf, &sync);
+            if (ret != MFX_WRN_DEVICE_BUSY)
+                break;
+            av_usleep(500);
+        } while (1);
+
+        if (MFX_WRN_VIDEO_PARAM_CHANGED==ret) {
+            /* TODO: handle here minor sequence header changing */
+        } else if (MFX_ERR_INCOMPATIBLE_VIDEO_PARAM==ret) {
+            av_fifo_reset(q->input_fifo);
+            flush = q->reinit_pending = 1;
+            continue;
+        }
 
-        ret = MFXVideoDECODE_DecodeFrameAsync(q->session, avpkt->size ? &bs : NULL,
-                                              insurf, &outsurf, &sync);
-        if (ret == MFX_WRN_DEVICE_BUSY)
-            av_usleep(1);
+        if (sync) {
+            QSVFrame *out_frame = find_frame(q, outsurf);
 
-    } while (ret == MFX_WRN_DEVICE_BUSY || ret == MFX_ERR_MORE_SURFACE);
+            if (!out_frame) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "The returned surface does not correspond to any frame\n");
+                return AVERROR_BUG;
+            }
 
-    if (ret != MFX_ERR_NONE &&
-        ret != MFX_ERR_MORE_DATA &&
-        ret != MFX_WRN_VIDEO_PARAM_CHANGED &&
-        ret != MFX_ERR_MORE_SURFACE) {
-        av_log(avctx, AV_LOG_ERROR, "Error during QSV decoding.\n");
-        return ff_qsv_error(ret);
+            out_frame->queued = 1;
+            av_fifo_generic_write(q->async_fifo, &out_frame, sizeof(out_frame), NULL);
+            av_fifo_generic_write(q->async_fifo, &sync,      sizeof(sync),      NULL);
+
+            continue;
+        }
+        if (MFX_ERR_MORE_SURFACE != ret && ret < 0)
+            break;
     }
 
     /* make sure we do not enter an infinite loop if the SDK
      * did not consume any data and did not return anything */
-    if (!sync && !bs.DataOffset) {
+    if (!sync && !bs.DataOffset && !flush) {
         av_log(avctx, AV_LOG_WARNING, "A decode call did not consume any data\n");
         bs.DataOffset = avpkt->size;
     }
 
-    if (sync) {
-        QSVFrame *out_frame = find_frame(q, outsurf);
-
-        if (!out_frame) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "The returned surface does not correspond to any frame\n");
-            return AVERROR_BUG;
-        }
+    if (buffered) {
+        qsv_fifo_relocate(q->input_fifo, bs.DataOffset);
+    } else if (bs.DataOffset!=avpkt->size) {
+        /* some data of packet was not consumed. store it to local buffer */
+        av_fifo_generic_write(q->input_fifo, avpkt->data+bs.DataOffset,
+                              avpkt->size - bs.DataOffset, NULL);
+    }
 
-        out_frame->queued = 1;
-        av_fifo_generic_write(q->async_fifo, &out_frame, sizeof(out_frame), NULL);
-        av_fifo_generic_write(q->async_fifo, &sync,      sizeof(sync),      NULL);
+    if (MFX_ERR_MORE_DATA!=ret && ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Error %d during QSV decoding.\n", ret);
+        return ff_qsv_error(ret);
     }
+    n_out_frames = av_fifo_size(q->async_fifo) / (sizeof(out_frame)+sizeof(sync));
 
-    if (!av_fifo_space(q->async_fifo) ||
-        (!avpkt->size && av_fifo_size(q->async_fifo))) {
+    if (n_out_frames > q->async_depth || (flush && n_out_frames) ) {
         AVFrame *src_frame;
 
         av_fifo_generic_read(q->async_fifo, &out_frame, sizeof(out_frame), NULL);
@@ -302,130 +439,147 @@ static int qsv_decode(AVCodecContext *avctx, QSVContext *q,
         *got_frame = 1;
     }
 
-    return bs.DataOffset;
+    return avpkt->size;
 }
-
-int ff_qsv_decode_close(QSVContext *q)
+/*
+ This function inserts a packet at fifo front.
+*/
+static void qsv_packet_push_front(QSVContext *q, AVPacket *avpkt)
 {
-    QSVFrame *cur = q->work_frames;
+    int fifo_size = av_fifo_size(q->pkt_fifo);
+    if (!fifo_size) {
+    /* easy case fifo is empty */
+        av_fifo_generic_write(q->pkt_fifo, avpkt, sizeof(*avpkt), NULL);
+    } else {
+    /* realloc necessary */
+        AVPacket pkt;
+        AVFifoBuffer *fifo = av_fifo_alloc(fifo_size+av_fifo_space(q->pkt_fifo));
 
-    if (q->session)
-        MFXVideoDECODE_Close(q->session);
+        av_fifo_generic_write(fifo, avpkt, sizeof(*avpkt), NULL);
 
-    while (cur) {
-        q->work_frames = cur->next;
-        av_frame_free(&cur->frame);
-        av_freep(&cur);
-        cur = q->work_frames;
+        while (av_fifo_size(q->pkt_fifo)) {
+            av_fifo_generic_read(q->pkt_fifo, &pkt, sizeof(pkt), NULL);
+            av_fifo_generic_write(fifo,       &pkt, sizeof(pkt), NULL);
+        }
+        av_fifo_free(q->pkt_fifo);
+        q->pkt_fifo = fifo;
     }
+}
+int ff_qsv_decode(AVCodecContext *avctx, QSVContext *q,
+                  AVFrame *frame, int *got_frame,
+                  AVPacket *avpkt)
+{
+    AVPacket pkt_ref = { 0 };
+    int ret = 0;
 
-    av_fifo_free(q->async_fifo);
-    q->async_fifo = NULL;
-
-    av_parser_close(q->parser);
-    avcodec_free_context(&q->avctx_internal);
+    if (q->pkt_fifo && av_fifo_size(q->pkt_fifo) >= sizeof(AVPacket)) {
+        /* we already have got some buffered packets. so add new to tail */
+        ret = av_packet_ref(&pkt_ref, avpkt);
+        if (ret < 0)
+            return ret;
+        av_fifo_generic_write(q->pkt_fifo, &pkt_ref, sizeof(pkt_ref), NULL);
+    }
+    if (q->reinit_pending) {
+        ret = do_qsv_decode(avctx, q, frame, got_frame, avpkt);
 
-    if (q->internal_session)
-        MFXClose(q->internal_session);
+        if (!*got_frame) {
+            /* Flushing complete, no more frames  */
+            close_decoder(q);
+            //return ff_qsv_decode(avctx, q, frame, got_frame, avpkt);
+        }
+    }
+    if (!q->reinit_pending) {
+        if (q->pkt_fifo && av_fifo_size(q->pkt_fifo) >= sizeof(AVPacket)) {
+            /* process buffered packets */
+            while (!*got_frame && av_fifo_size(q->pkt_fifo) >= sizeof(AVPacket)) {
+                av_fifo_generic_read(q->pkt_fifo, &pkt_ref, sizeof(pkt_ref), NULL);
+                ret = do_qsv_decode(avctx, q, frame, got_frame, &pkt_ref);
+                if (q->reinit_pending) {
+                    /*
+                       A rare case: new reinit pending when buffering existing.
+                       We should to return the pkt_ref back to same place of fifo
+                    */
+                    qsv_packet_push_front(q, &pkt_ref);
+                } else {
+                    av_packet_unref(&pkt_ref);
+                }
+           }
+        } else {
+            /* general decoding */
+            ret = do_qsv_decode(avctx, q, frame, got_frame, avpkt);
+            if (q->reinit_pending) {
+                ret = av_packet_ref(&pkt_ref, avpkt);
+                if (ret < 0)
+                    return ret;
+                av_fifo_generic_write(q->pkt_fifo, &pkt_ref, sizeof(pkt_ref), NULL);
+            }
+        }
+    }
 
-    return 0;
+    return ret;
 }
-
-int ff_qsv_process_data(AVCodecContext *avctx, QSVContext *q,
-                        AVFrame *frame, int *got_frame, AVPacket *pkt)
+/*
+ This function resets decoder and corresponded buffers before seek operation
+*/
+void ff_qsv_decode_reset(AVCodecContext *avctx, QSVContext *q)
 {
-    uint8_t *dummy_data;
-    int dummy_size;
-    int ret;
+    QSVFrame *cur;
+    AVPacket pkt;
+    int ret = 0;
+    mfxVideoParam param = { { 0 } };
 
-    if (!q->avctx_internal) {
-        q->avctx_internal = avcodec_alloc_context3(NULL);
-        if (!q->avctx_internal)
-            return AVERROR(ENOMEM);
+    if (q->reinit_pending) {
+        close_decoder(q);
+    } else if (q->engine_ready) {
+        ret = MFXVideoDECODE_GetVideoParam(q->session, &param);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "MFX decode get param error %d\n", ret);
+        }
 
-        if (avctx->extradata) {
-            q->avctx_internal->extradata = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
-            if (!q->avctx_internal->extradata)
-                return AVERROR(ENOMEM);
+        ret = MFXVideoDECODE_Reset(q->session, &param);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "MFX decode reset error %d\n", ret);
+        }
 
-            memcpy(q->avctx_internal->extradata, avctx->extradata,
-                   avctx->extradata_size);
-            q->avctx_internal->extradata_size = avctx->extradata_size;
+        /* Free all frames*/
+        cur = q->work_frames;
+        while (cur) {
+            q->work_frames = cur->next;
+            av_frame_free(&cur->frame);
+            av_freep(&cur);
+            cur = q->work_frames;
         }
+    }
 
-        q->parser = av_parser_init(avctx->codec_id);
-        if (!q->parser)
-            return AVERROR(ENOMEM);
+    /* Reset output surfaces */
+    av_fifo_reset(q->async_fifo);
 
-        q->parser->flags |= PARSER_FLAG_COMPLETE_FRAMES;
-        q->orig_pix_fmt   = AV_PIX_FMT_NONE;
+    /* Reset input packets fifo */
+    while (av_fifo_size(q->pkt_fifo)) {
+        av_fifo_generic_read(q->pkt_fifo, &pkt, sizeof(pkt), NULL);
+        av_packet_unref(&pkt);
     }
 
-    if (!pkt->size)
-        return qsv_decode(avctx, q, frame, got_frame, pkt);
-
-    /* we assume the packets are already split properly and want
-     * just the codec parameters here */
-    av_parser_parse2(q->parser, q->avctx_internal,
-                     &dummy_data, &dummy_size,
-                     pkt->data, pkt->size, pkt->pts, pkt->dts,
-                     pkt->pos);
-
-    /* TODO: flush delayed frames on reinit */
-    if (q->parser->format       != q->orig_pix_fmt    ||
-        q->parser->coded_width  != avctx->coded_width ||
-        q->parser->coded_height != avctx->coded_height) {
-        mfxSession session = NULL;
-
-        enum AVPixelFormat pix_fmts[3] = { AV_PIX_FMT_QSV,
-                                           AV_PIX_FMT_NONE,
-                                           AV_PIX_FMT_NONE };
-        enum AVPixelFormat qsv_format;
-
-        qsv_format = ff_qsv_map_pixfmt(q->parser->format);
-        if (qsv_format < 0) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Only 8-bit YUV420 streams are supported.\n");
-            ret = AVERROR(ENOSYS);
-            goto reinit_fail;
-        }
-
-        q->orig_pix_fmt     = q->parser->format;
-        avctx->pix_fmt      = pix_fmts[1] = qsv_format;
-        avctx->width        = q->parser->width;
-        avctx->height       = q->parser->height;
-        avctx->coded_width  = q->parser->coded_width;
-        avctx->coded_height = q->parser->coded_height;
-        avctx->level        = q->avctx_internal->level;
-        avctx->profile      = q->avctx_internal->profile;
+    /* Reset input bitstream fifo */
+    av_fifo_reset(q->input_fifo);
+}
 
-        ret = ff_get_format(avctx, pix_fmts);
-        if (ret < 0)
-            goto reinit_fail;
+int ff_qsv_decode_close(QSVContext *q)
+{
+    close_decoder(q);
 
-        avctx->pix_fmt = ret;
+    q->session = NULL;
 
-        if (avctx->hwaccel_context) {
-            AVQSVContext *user_ctx = avctx->hwaccel_context;
-            session           = user_ctx->session;
-            q->iopattern      = user_ctx->iopattern;
-            q->ext_buffers    = user_ctx->ext_buffers;
-            q->nb_ext_buffers = user_ctx->nb_ext_buffers;
-        }
+    ff_qsv_close_internal_session(&q->internal_qs);
 
-        ret = qsv_decode_init(avctx, q, session);
-        if (ret < 0)
-            goto reinit_fail;
-    }
+    av_fifo_free(q->async_fifo);
+    q->async_fifo = NULL;
 
-    return qsv_decode(avctx, q, frame, got_frame, pkt);
+    av_fifo_free(q->input_fifo);
+    q->input_fifo = NULL;
 
-reinit_fail:
-    q->orig_pix_fmt = q->parser->format = avctx->pix_fmt = AV_PIX_FMT_NONE;
-    return ret;
-}
+    av_fifo_free(q->pkt_fifo);
+    q->pkt_fifo = NULL;
 
-void ff_qsv_decode_flush(AVCodecContext *avctx, QSVContext *q)
-{
-    q->orig_pix_fmt = AV_PIX_FMT_NONE;
+    return 0;
 }
diff --git a/libavcodec/qsvdec.h b/libavcodec/qsvdec.h
index 698d8c89b4..97a3315b75 100644
--- a/libavcodec/qsvdec.h
+++ b/libavcodec/qsvdec.h
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2013 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,7 +41,7 @@ typedef struct QSVContext {
 
     // the session we allocated internally, in case the caller did not provide
     // one
-    mfxSession internal_session;
+    QSVSession internal_qs;
 
     /**
      * a linked list of frames currently being used by QSV
@@ -49,11 +49,22 @@ typedef struct QSVContext {
     QSVFrame *work_frames;
 
     AVFifoBuffer *async_fifo;
+    AVFifoBuffer *input_fifo;
 
-    // the internal parser and codec context for parsing the data
-    AVCodecParserContext *parser;
-    AVCodecContext *avctx_internal;
-    enum AVPixelFormat orig_pix_fmt;
+    // we should to buffer input packets at some cases
+    // else it is not possible to handle dynamic stream changes correctly
+    // this fifo uses for input packets buffering
+    AVFifoBuffer *pkt_fifo;
+
+    // this flag indicates that header parsed,
+    // decoder instance created and ready to general decoding
+    int engine_ready;
+
+    // we can not just re-init decoder if different sequence header arrived
+    // we should to deliver all buffered frames but we can not decode new packets
+    // this time. So when reinit_pending is non-zero we flushing decoder and
+    // accumulate new arrived packets into pkt_fifo
+    int reinit_pending;
 
     // options set by the caller
     int async_depth;
@@ -67,10 +78,11 @@ typedef struct QSVContext {
 
 int ff_qsv_map_pixfmt(enum AVPixelFormat format);
 
-int ff_qsv_process_data(AVCodecContext *avctx, QSVContext *q,
-                        AVFrame *frame, int *got_frame, AVPacket *pkt);
+int ff_qsv_decode(AVCodecContext *s, QSVContext *q,
+                  AVFrame *frame, int *got_frame,
+                  AVPacket *avpkt);
 
-void ff_qsv_decode_flush(AVCodecContext *avctx, QSVContext *q);
+void ff_qsv_decode_reset(AVCodecContext *avctx, QSVContext *q);
 
 int ff_qsv_decode_close(QSVContext *q);
 
diff --git a/libavcodec/qsvdec_h2645.c b/libavcodec/qsvdec_h2645.c
index 52e22eec68..a396f31e88 100644
--- a/libavcodec/qsvdec_h2645.c
+++ b/libavcodec/qsvdec_h2645.c
@@ -4,20 +4,20 @@
  * copyright (c) 2013 Luca Barbato
  * copyright (c) 2015 Anton Khirnov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,9 +33,7 @@
 
 #include "avcodec.h"
 #include "internal.h"
-#include "qsv_internal.h"
 #include "qsvdec.h"
-#include "qsv.h"
 
 enum LoadPlugin {
     LOAD_PLUGIN_NONE,
@@ -51,37 +49,14 @@ typedef struct QSVH2645Context {
     // the filter for converting to Annex B
     AVBitStreamFilterContext *bsf;
 
-    AVFifoBuffer *packet_fifo;
-
-    AVPacket input_ref;
-    AVPacket pkt_filtered;
-    uint8_t *filtered_data;
 } QSVH2645Context;
 
-static void qsv_clear_buffers(QSVH2645Context *s)
-{
-    AVPacket pkt;
-    while (av_fifo_size(s->packet_fifo) >= sizeof(pkt)) {
-        av_fifo_generic_read(s->packet_fifo, &pkt, sizeof(pkt), NULL);
-        av_packet_unref(&pkt);
-    }
-
-    if (s->filtered_data != s->input_ref.data)
-        av_freep(&s->filtered_data);
-    s->filtered_data = NULL;
-    av_packet_unref(&s->input_ref);
-}
-
 static av_cold int qsv_decode_close(AVCodecContext *avctx)
 {
     QSVH2645Context *s = avctx->priv_data;
 
     ff_qsv_decode_close(&s->qsv);
 
-    qsv_clear_buffers(s);
-
-    av_fifo_free(s->packet_fifo);
-
     av_bitstream_filter_close(s->bsf);
 
     return 0;
@@ -107,12 +82,6 @@ static av_cold int qsv_decode_init(AVCodecContext *avctx)
         }
     }
 
-    s->packet_fifo = av_fifo_alloc(sizeof(AVPacket));
-    if (!s->packet_fifo) {
-        ret = AVERROR(ENOMEM);
-        goto fail;
-    }
-
     if (avctx->codec_id == AV_CODEC_ID_H264)
         s->bsf = av_bitstream_filter_init("h264_mp4toannexb");
     else
@@ -122,8 +91,6 @@ static av_cold int qsv_decode_init(AVCodecContext *avctx)
         goto fail;
     }
 
-    s->qsv.iopattern = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
-
     return 0;
 fail:
     qsv_decode_close(avctx);
@@ -136,69 +103,42 @@ static int qsv_decode_frame(AVCodecContext *avctx, void *data,
     QSVH2645Context *s = avctx->priv_data;
     AVFrame *frame    = data;
     int ret;
+    uint8_t *p_filtered = NULL;
+    int      n_filtered = NULL;
+    AVPacket pkt_filtered = { 0 };
 
-    /* buffer the input packet */
     if (avpkt->size) {
-        AVPacket input_ref = { 0 };
-
-        if (av_fifo_space(s->packet_fifo) < sizeof(input_ref)) {
-            ret = av_fifo_realloc2(s->packet_fifo,
-                                   av_fifo_size(s->packet_fifo) + sizeof(input_ref));
-            if (ret < 0)
-                return ret;
-        }
-
-        ret = av_packet_ref(&input_ref, avpkt);
-        if (ret < 0)
-            return ret;
-        av_fifo_generic_write(s->packet_fifo, &input_ref, sizeof(input_ref), NULL);
-    }
+        if (avpkt->size > 3 && !avpkt->data[0] &&
+            !avpkt->data[1] && !avpkt->data[2] && 1==avpkt->data[3]) {
+            /* we already have annex-b prefix */
+            return ff_qsv_decode(avctx, &s->qsv, frame, got_frame, avpkt);
 
-    /* process buffered data */
-    while (!*got_frame) {
-        /* prepare the input data -- convert to Annex B if needed */
-        if (s->pkt_filtered.size <= 0) {
-            int size;
-
-            /* no more data */
-            if (av_fifo_size(s->packet_fifo) < sizeof(AVPacket))
-                return avpkt->size ? avpkt->size : ff_qsv_process_data(avctx, &s->qsv, frame, got_frame, avpkt);
-
-            if (s->filtered_data != s->input_ref.data)
-                av_freep(&s->filtered_data);
-            s->filtered_data = NULL;
-            av_packet_unref(&s->input_ref);
-
-            av_fifo_generic_read(s->packet_fifo, &s->input_ref, sizeof(s->input_ref), NULL);
-            ret = av_bitstream_filter_filter(s->bsf, avctx, NULL,
-                                             &s->filtered_data, &size,
-                                             s->input_ref.data, s->input_ref.size, 0);
-            if (ret < 0) {
-                s->filtered_data = s->input_ref.data;
-                size             = s->input_ref.size;
+        } else {
+            /* no annex-b prefix. try to restore: */
+            ret = av_bitstream_filter_filter(s->bsf, avctx, "private_spspps_buf",
+                                         &p_filtered, &n_filtered,
+                                         avpkt->data, avpkt->size, 0);
+            if (ret>=0) {
+                pkt_filtered.pts  = avpkt->pts;
+                pkt_filtered.data = p_filtered;
+                pkt_filtered.size = n_filtered;
+
+                ret = ff_qsv_decode(avctx, &s->qsv, frame, got_frame, &pkt_filtered);
+
+                if (p_filtered != avpkt->data)
+                    av_free(p_filtered);
+                return ret > 0 ? avpkt->size : ret;
             }
-            s->pkt_filtered      = s->input_ref;
-            s->pkt_filtered.data = s->filtered_data;
-            s->pkt_filtered.size = size;
         }
-
-        ret = ff_qsv_process_data(avctx, &s->qsv, frame, got_frame, &s->pkt_filtered);
-        if (ret < 0)
-            return ret;
-
-        s->pkt_filtered.size -= ret;
-        s->pkt_filtered.data += ret;
     }
 
-    return avpkt->size;
+    return ff_qsv_decode(avctx, &s->qsv, frame, got_frame, avpkt);
 }
 
 static void qsv_decode_flush(AVCodecContext *avctx)
 {
     QSVH2645Context *s = avctx->priv_data;
-
-    qsv_clear_buffers(s);
-    ff_qsv_decode_flush(avctx, &s->qsv);
+    ff_qsv_decode_reset(avctx, &s->qsv);
 }
 
 #define OFFSET(x) offsetof(QSVH2645Context, x)
@@ -241,7 +181,7 @@ AVCodec ff_hevc_qsv_decoder = {
     .decode         = qsv_decode_frame,
     .flush          = qsv_decode_flush,
     .close          = qsv_decode_close,
-    .capabilities   = CODEC_CAP_DELAY | CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
     .priv_class     = &hevc_class,
     .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
                                                     AV_PIX_FMT_QSV,
diff --git a/libavcodec/qsvdec_mpeg2.c b/libavcodec/qsvdec_mpeg2.c
index 3378d02a23..d9052e0c62 100644
--- a/libavcodec/qsvdec_mpeg2.c
+++ b/libavcodec/qsvdec_mpeg2.c
@@ -1,91 +1,49 @@
 /*
- * Intel MediaSDK QSV based MPEG2 decoder
+ * Intel MediaSDK QSV based MPEG2 video decoder
  *
- * copyright (c) 2015 Anton Khirnov
+ * This file is part of FFmpeg.
  *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-
 #include <stdint.h>
 #include <string.h>
 
-#include <mfx/mfxvideo.h>
-
 #include "libavutil/common.h"
-#include "libavutil/fifo.h"
 #include "libavutil/opt.h"
 
 #include "avcodec.h"
-#include "internal.h"
-#include "qsv_internal.h"
 #include "qsvdec.h"
-#include "qsv.h"
 
 typedef struct QSVMPEG2Context {
     AVClass *class;
     QSVContext qsv;
-
-    AVFifoBuffer *packet_fifo;
-
-    AVPacket input_ref;
 } QSVMPEG2Context;
 
-static void qsv_clear_buffers(QSVMPEG2Context *s)
-{
-    AVPacket pkt;
-    while (av_fifo_size(s->packet_fifo) >= sizeof(pkt)) {
-        av_fifo_generic_read(s->packet_fifo, &pkt, sizeof(pkt), NULL);
-        av_packet_unref(&pkt);
-    }
-
-    av_packet_unref(&s->input_ref);
-}
-
 static av_cold int qsv_decode_close(AVCodecContext *avctx)
 {
     QSVMPEG2Context *s = avctx->priv_data;
 
     ff_qsv_decode_close(&s->qsv);
 
-    qsv_clear_buffers(s);
-
-    av_fifo_free(s->packet_fifo);
-
     return 0;
 }
 
 static av_cold int qsv_decode_init(AVCodecContext *avctx)
 {
-    QSVMPEG2Context *s = avctx->priv_data;
-    int ret;
-
-    s->packet_fifo = av_fifo_alloc(sizeof(AVPacket));
-    if (!s->packet_fifo) {
-        ret = AVERROR(ENOMEM);
-        goto fail;
-    }
-
-    s->qsv.iopattern = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
-
     return 0;
-fail:
-    qsv_decode_close(avctx);
-    return ret;
 }
 
 static int qsv_decode_frame(AVCodecContext *avctx, void *data,
@@ -93,53 +51,14 @@ static int qsv_decode_frame(AVCodecContext *avctx, void *data,
 {
     QSVMPEG2Context *s = avctx->priv_data;
     AVFrame *frame    = data;
-    int ret;
-
-    /* buffer the input packet */
-    if (avpkt->size) {
-        AVPacket input_ref = { 0 };
-
-        if (av_fifo_space(s->packet_fifo) < sizeof(input_ref)) {
-            ret = av_fifo_realloc2(s->packet_fifo,
-                                   av_fifo_size(s->packet_fifo) + sizeof(input_ref));
-            if (ret < 0)
-                return ret;
-        }
-
-        ret = av_packet_ref(&input_ref, avpkt);
-        if (ret < 0)
-            return ret;
-        av_fifo_generic_write(s->packet_fifo, &input_ref, sizeof(input_ref), NULL);
-    }
 
-    /* process buffered data */
-    while (!*got_frame) {
-        if (s->input_ref.size <= 0) {
-            /* no more data */
-            if (av_fifo_size(s->packet_fifo) < sizeof(AVPacket))
-                return avpkt->size ? avpkt->size : ff_qsv_process_data(avctx, &s->qsv, frame, got_frame, avpkt);
-
-            av_packet_unref(&s->input_ref);
-            av_fifo_generic_read(s->packet_fifo, &s->input_ref, sizeof(s->input_ref), NULL);
-        }
-
-        ret = ff_qsv_process_data(avctx, &s->qsv, frame, got_frame, &s->input_ref);
-        if (ret < 0)
-            return ret;
-
-        s->input_ref.size -= ret;
-        s->input_ref.data += ret;
-    }
-
-    return avpkt->size;
+    return ff_qsv_decode(avctx, &s->qsv, frame, got_frame, avpkt);
 }
 
 static void qsv_decode_flush(AVCodecContext *avctx)
 {
     QSVMPEG2Context *s = avctx->priv_data;
-
-    qsv_clear_buffers(s);
-    ff_qsv_decode_flush(avctx, &s->qsv);
+    ff_qsv_decode_reset(avctx, &s->qsv);
 }
 
 AVHWAccel ff_mpeg2_qsv_hwaccel = {
@@ -173,7 +92,7 @@ AVCodec ff_mpeg2_qsv_decoder = {
     .decode         = qsv_decode_frame,
     .flush          = qsv_decode_flush,
     .close          = qsv_decode_close,
-    .capabilities   = CODEC_CAP_DELAY | CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
     .priv_class     = &class,
     .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
                                                     AV_PIX_FMT_QSV,
diff --git a/libavcodec/qsvdec_vc1.c b/libavcodec/qsvdec_vc1.c
new file mode 100644
index 0000000000..fcf101f758
--- /dev/null
+++ b/libavcodec/qsvdec_vc1.c
@@ -0,0 +1,97 @@
+/*
+ * Intel MediaSDK QSV based VC-1 video decoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/fifo.h"
+#include "libavutil/opt.h"
+
+#include "avcodec.h"
+#include "qsvdec.h"
+
+typedef struct QSVVC1Context {
+    AVClass *class;
+    QSVContext qsv;
+} QSVVC1Context;
+
+
+static av_cold int qsv_decode_close(AVCodecContext *avctx)
+{
+    QSVVC1Context *s = avctx->priv_data;
+
+    ff_qsv_decode_close(&s->qsv);
+
+    return 0;
+}
+
+static int qsv_decode_frame(AVCodecContext *avctx, void *data,
+                            int *got_frame, AVPacket *avpkt)
+{
+    QSVVC1Context *s = avctx->priv_data;
+    AVFrame *frame    = data;
+
+    return ff_qsv_decode(avctx, &s->qsv, frame, got_frame, avpkt);
+}
+
+static void qsv_decode_flush(AVCodecContext *avctx)
+{
+    QSVVC1Context *s = avctx->priv_data;
+    ff_qsv_decode_reset(avctx, &s->qsv);
+}
+
+AVHWAccel ff_vc1_qsv_hwaccel = {
+    .name           = "vc1_qsv",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_VC1,
+    .pix_fmt        = AV_PIX_FMT_QSV,
+};
+
+#define OFFSET(x) offsetof(QSVVC1Context, x)
+#define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    { "async_depth", "Internal parallelization depth, the higher the value the higher the latency.", OFFSET(qsv.async_depth), AV_OPT_TYPE_INT, { .i64 = ASYNC_DEPTH_DEFAULT }, 0, INT_MAX, VD },
+    { NULL },
+};
+
+static const AVClass class = {
+    .class_name = "vc1_qsv",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_vc1_qsv_decoder = {
+    .name           = "vc1_qsv",
+    .long_name      = NULL_IF_CONFIG_SMALL("VC-1 video (Intel Quick Sync Video acceleration)"),
+    .priv_data_size = sizeof(QSVVC1Context),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_VC1,
+    .init           = NULL,
+    .decode         = qsv_decode_frame,
+    .flush          = qsv_decode_flush,
+    .close          = qsv_decode_close,
+    .capabilities   = AV_CODEC_CAP_DELAY,
+    .priv_class     = &class,
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
+                                                    AV_PIX_FMT_QSV,
+                                                    AV_PIX_FMT_NONE },
+};
diff --git a/libavcodec/qsvenc.c b/libavcodec/qsvenc.c
index d6a731f416..55140e1135 100644
--- a/libavcodec/qsvenc.c
+++ b/libavcodec/qsvenc.c
@@ -4,20 +4,20 @@
  * copyright (c) 2013 Yukinori Yamazoe
  * copyright (c) 2015 Anton Khirnov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -67,18 +67,30 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
     q->param.mfx.BufferSizeInKB     = 0;
 
     q->param.mfx.FrameInfo.FourCC         = MFX_FOURCC_NV12;
-    q->param.mfx.FrameInfo.Width          = FFALIGN(avctx->width, q->width_align);
-    q->param.mfx.FrameInfo.Height         = FFALIGN(avctx->height, 32);
     q->param.mfx.FrameInfo.CropX          = 0;
     q->param.mfx.FrameInfo.CropY          = 0;
     q->param.mfx.FrameInfo.CropW          = avctx->width;
     q->param.mfx.FrameInfo.CropH          = avctx->height;
     q->param.mfx.FrameInfo.AspectRatioW   = avctx->sample_aspect_ratio.num;
     q->param.mfx.FrameInfo.AspectRatioH   = avctx->sample_aspect_ratio.den;
-    q->param.mfx.FrameInfo.PicStruct      = MFX_PICSTRUCT_PROGRESSIVE;
     q->param.mfx.FrameInfo.ChromaFormat   = MFX_CHROMAFORMAT_YUV420;
     q->param.mfx.FrameInfo.BitDepthLuma   = 8;
     q->param.mfx.FrameInfo.BitDepthChroma = 8;
+    q->param.mfx.FrameInfo.Width          = FFALIGN(avctx->width, q->width_align);
+
+    if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
+       /* A true field layout (TFF or BFF) is not important here,
+          it will specified later during frame encoding. But it is important
+          to specify is frame progressive or not because allowed heigh alignment
+          does depend by this.
+        */
+        q->param.mfx.FrameInfo.PicStruct = MFX_PICSTRUCT_FIELD_TFF;
+        q->height_align = 32;
+    } else {
+        q->param.mfx.FrameInfo.PicStruct = MFX_PICSTRUCT_PROGRESSIVE;
+        q->height_align = 16;
+    }
+   q->param.mfx.FrameInfo.Height    = FFALIGN(avctx->height, q->height_align);
 
     if (avctx->framerate.den > 0 && avctx->framerate.num > 0) {
         q->param.mfx.FrameInfo.FrameRateExtN = avctx->framerate.num;
@@ -95,8 +107,16 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
         q->param.mfx.RateControlMethod = MFX_RATECONTROL_CBR;
         ratecontrol_desc = "constant bitrate (CBR)";
     } else if (!avctx->rc_max_rate) {
-        q->param.mfx.RateControlMethod = MFX_RATECONTROL_AVBR;
-        ratecontrol_desc = "average variable bitrate (AVBR)";
+#if QSV_VERSION_ATLEAST(1,7)
+        if (q->look_ahead) {
+            q->param.mfx.RateControlMethod = MFX_RATECONTROL_LA;
+            ratecontrol_desc = "lookahead (LA)";
+        } else
+#endif
+        {
+            q->param.mfx.RateControlMethod = MFX_RATECONTROL_AVBR;
+            ratecontrol_desc = "average variable bitrate (AVBR)";
+        }
     } else {
         q->param.mfx.RateControlMethod = MFX_RATECONTROL_VBR;
         ratecontrol_desc = "variable bitrate (VBR)";
@@ -109,7 +129,7 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
     case MFX_RATECONTROL_VBR:
         q->param.mfx.InitialDelayInKB = avctx->rc_initial_buffer_occupancy / 1000;
         q->param.mfx.TargetKbps       = avctx->bit_rate / 1000;
-        q->param.mfx.MaxKbps          = avctx->bit_rate / 1000;
+        q->param.mfx.MaxKbps          = avctx->rc_max_rate / 1000;
         break;
     case MFX_RATECONTROL_CQP:
         quant = avctx->global_quality / FF_QP2LAMBDA;
@@ -120,6 +140,9 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
 
         break;
     case MFX_RATECONTROL_AVBR:
+#if QSV_VERSION_ATLEAST(1,7)
+    case MFX_RATECONTROL_LA:
+#endif
         q->param.mfx.TargetKbps  = avctx->bit_rate / 1000;
         q->param.mfx.Convergence = q->avbr_convergence;
         q->param.mfx.Accuracy    = q->avbr_accuracy;
@@ -134,8 +157,27 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
         q->extco.CAVLC                = avctx->coder_type == FF_CODER_TYPE_VLC ?
                                         MFX_CODINGOPTION_ON : MFX_CODINGOPTION_UNKNOWN;
 
+        q->extco.PicTimingSEI         = q->pic_timing_sei ?
+                                        MFX_CODINGOPTION_ON : MFX_CODINGOPTION_UNKNOWN;
+
         q->extparam[0] = (mfxExtBuffer *)&q->extco;
 
+#if QSV_VERSION_ATLEAST(1,6)
+        q->extco2.Header.BufferId      = MFX_EXTBUFF_CODING_OPTION2;
+        q->extco2.Header.BufferSz      = sizeof(q->extco2);
+
+#if QSV_VERSION_ATLEAST(1,7)
+        // valid value range is from 10 to 100 inclusive
+        // to instruct the encoder to use the default value this should be set to zero
+        q->extco2.LookAheadDepth        = q->look_ahead_depth != 0 ? FFMAX(10, q->look_ahead_depth) : 0;
+#endif
+#if QSV_VERSION_ATLEAST(1,8)
+        q->extco2.LookAheadDS           = q->look_ahead_downsampling;
+#endif
+
+        q->extparam[1] = (mfxExtBuffer *)&q->extco2;
+
+#endif
         q->param.ExtParam    = q->extparam;
         q->param.NumExtParam = FF_ARRAY_ELEMS(q->extparam);
     }
@@ -210,18 +252,26 @@ int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q)
     }
 
     if (!q->session) {
-        ret = ff_qsv_init_internal_session(avctx, &q->internal_session,
+        ret = ff_qsv_init_internal_session(avctx, &q->internal_qs,
                                            q->load_plugins);
         if (ret < 0)
             return ret;
 
-        q->session = q->internal_session;
+        q->session = q->internal_qs.session;
     }
 
     ret = init_video_param(avctx, q);
     if (ret < 0)
         return ret;
 
+    ret = MFXVideoENCODE_Query(q->session, &q->param,&q->param);
+    if (MFX_WRN_PARTIAL_ACCELERATION==ret) {
+        av_log(avctx, AV_LOG_WARNING, "Encoder will work with partial HW acceleration\n");
+    } else if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Error %d querying encoder params\n", ret);
+        return ff_qsv_error(ret);
+    }
+
     ret = MFXVideoENCODE_QueryIOSurf(q->session, &q->param, &q->req);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error querying the encoding parameters\n");
@@ -229,7 +279,9 @@ int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q)
     }
 
     ret = MFXVideoENCODE_Init(q->session, &q->param);
-    if (ret < 0) {
+    if (MFX_WRN_PARTIAL_ACCELERATION==ret) {
+        av_log(avctx, AV_LOG_WARNING, "Encoder will work with partial HW acceleration\n");
+    } else if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error initializing the encoder\n");
         return ff_qsv_error(ret);
     }
@@ -311,8 +363,9 @@ static int submit_frame(QSVEncContext *q, const AVFrame *frame,
     }
 
     /* make a copy if the input is not padded as libmfx requires */
-    if (frame->height & 31 || frame->linesize[0] & (q->width_align - 1)) {
-        qf->frame->height = FFALIGN(frame->height, 32);
+    if (     frame->height & (q->height_align - 1) ||
+        frame->linesize[0] & (q->width_align - 1)) {
+        qf->frame->height = FFALIGN(frame->height, q->height_align);
         qf->frame->width  = FFALIGN(frame->width, q->width_align);
 
         ret = ff_get_buffer(q->avctx, qf->frame, AV_GET_BUFFER_FLAG_REF);
@@ -403,19 +456,29 @@ int ff_qsv_encode(AVCodecContext *avctx, QSVEncContext *q,
 
     do {
         ret = MFXVideoENCODE_EncodeFrameAsync(q->session, NULL, surf, bs, &sync);
-        if (ret == MFX_WRN_DEVICE_BUSY)
-            av_usleep(1);
-    } while (ret > 0);
+        if (ret == MFX_WRN_DEVICE_BUSY) {
+            av_usleep(500);
+            continue;
+        }
+        break;
+    } while ( 1 );
 
     if (ret < 0) {
         av_packet_unref(&new_pkt);
         av_freep(&bs);
-        return (ret == MFX_ERR_MORE_DATA) ? 0 : ff_qsv_error(ret);
+        if (ret == MFX_ERR_MORE_DATA)
+            return 0;
+        av_log(avctx, AV_LOG_ERROR, "EncodeFrameAsync returned %d\n", ret);
+        return ff_qsv_error(ret);
     }
 
-    if (ret == MFX_WRN_INCOMPATIBLE_VIDEO_PARAM && frame->interlaced_frame)
-        print_interlace_msg(avctx, q);
-
+    if (ret == MFX_WRN_INCOMPATIBLE_VIDEO_PARAM) {
+        if (frame->interlaced_frame)
+            print_interlace_msg(avctx, q);
+        else
+            av_log(avctx, AV_LOG_WARNING,
+                   "EncodeFrameAsync returned 'incompatible param' code\n");
+    }
     if (sync) {
         av_fifo_generic_write(q->async_fifo, &new_pkt, sizeof(new_pkt), NULL);
         av_fifo_generic_write(q->async_fifo, &sync,    sizeof(sync),    NULL);
@@ -484,10 +547,9 @@ int ff_qsv_enc_close(AVCodecContext *avctx, QSVEncContext *q)
 
     if (q->session)
         MFXVideoENCODE_Close(q->session);
-    if (q->internal_session)
-        MFXClose(q->internal_session);
-    q->session          = NULL;
-    q->internal_session = NULL;
+    q->session = NULL;
+
+    ff_qsv_close_internal_session(&q->internal_qs);
 
     cur = q->work_frames;
     while (cur) {
diff --git a/libavcodec/qsvenc.h b/libavcodec/qsvenc.h
index 7bbeec33aa..2a21f8217d 100644
--- a/libavcodec/qsvenc.h
+++ b/libavcodec/qsvenc.h
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2013 Yukinori Yamazoe
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,16 +40,22 @@ typedef struct QSVEncContext {
     QSVFrame *work_frames;
 
     mfxSession session;
-    mfxSession internal_session;
+    QSVSession internal_qs;
 
     int packet_size;
     int width_align;
+    int height_align;
 
     mfxVideoParam param;
     mfxFrameAllocRequest req;
 
     mfxExtCodingOption  extco;
+#if QSV_VERSION_ATLEAST(1,6)
+    mfxExtCodingOption2 extco2;
+    mfxExtBuffer *extparam[2];
+#else
     mfxExtBuffer *extparam[1];
+#endif
 
     AVFifoBuffer *async_fifo;
 
@@ -60,6 +66,10 @@ typedef struct QSVEncContext {
     int preset;
     int avbr_accuracy;
     int avbr_convergence;
+    int pic_timing_sei;
+    int look_ahead;
+    int look_ahead_depth;
+    int look_ahead_downsampling;
 
     char *load_plugins;
 } QSVEncContext;
diff --git a/libavcodec/qsvenc_h264.c b/libavcodec/qsvenc_h264.c
index 93a3b1a5fc..0e5a26cd1f 100644
--- a/libavcodec/qsvenc_h264.c
+++ b/libavcodec/qsvenc_h264.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2013 Yukinori Yamazoe
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,6 +69,19 @@ static const AVOption options[] = {
     { "idr_interval", "Distance (in I-frames) between IDR frames", OFFSET(qsv.idr_interval), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
     { "avbr_accuracy",    "Accuracy of the AVBR ratecontrol",    OFFSET(qsv.avbr_accuracy),    AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
     { "avbr_convergence", "Convergence of the AVBR ratecontrol", OFFSET(qsv.avbr_convergence), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+    { "pic_timing_sei",    "Insert picture timing SEI with pic_struct_syntax element", OFFSET(qsv.pic_timing_sei), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
+
+#if QSV_VERSION_ATLEAST(1,7)
+    { "look_ahead",       "Use VBR algorithm with look ahead",    OFFSET(qsv.look_ahead),       AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
+    { "look_ahead_depth", "Depth of look ahead in number frames", OFFSET(qsv.look_ahead_depth), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 100, VE },
+#endif
+
+#if QSV_VERSION_ATLEAST(1,8)
+    { "look_ahead_downsampling", NULL, OFFSET(qsv.look_ahead_downsampling), AV_OPT_TYPE_INT, { .i64 = MFX_LOOKAHEAD_DS_UNKNOWN }, MFX_LOOKAHEAD_DS_UNKNOWN, MFX_LOOKAHEAD_DS_2x, VE, "look_ahead_downsampling" },
+    { "unknown"                , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_UNKNOWN }, INT_MIN, INT_MAX,     VE, "look_ahead_downsampling" },
+    { "off"                    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_OFF     }, INT_MIN, INT_MAX,     VE, "look_ahead_downsampling" },
+    { "2x"                     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_2x      }, INT_MIN, INT_MAX,     VE, "look_ahead_downsampling" },
+#endif
 
     { "profile", NULL, OFFSET(qsv.profile), AV_OPT_TYPE_INT, { .i64 = MFX_PROFILE_UNKNOWN }, 0, INT_MAX, VE, "profile" },
     { "unknown" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_PROFILE_UNKNOWN      }, INT_MIN, INT_MAX,     VE, "profile" },
@@ -76,10 +89,14 @@ static const AVOption options[] = {
     { "main"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_PROFILE_AVC_MAIN     }, INT_MIN, INT_MAX,     VE, "profile" },
     { "high"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_PROFILE_AVC_HIGH     }, INT_MIN, INT_MAX,     VE, "profile" },
 
-    { "preset", NULL, OFFSET(qsv.preset), AV_OPT_TYPE_INT, { .i64 = MFX_TARGETUSAGE_BALANCED }, 0, 7,   VE, "preset" },
-    { "fast",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BEST_SPEED  },   INT_MIN, INT_MAX, VE, "preset" },
-    { "medium", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BALANCED  },     INT_MIN, INT_MAX, VE, "preset" },
-    { "slow",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BEST_QUALITY  }, INT_MIN, INT_MAX, VE, "preset" },
+    { "preset", NULL, OFFSET(qsv.preset), AV_OPT_TYPE_INT, { .i64 = MFX_TARGETUSAGE_BALANCED }, MFX_TARGETUSAGE_BEST_QUALITY, MFX_TARGETUSAGE_BEST_SPEED,   VE, "preset" },
+    { "veryfast",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BEST_SPEED  },   INT_MIN, INT_MAX, VE, "preset" },
+    { "faster",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_6  },            INT_MIN, INT_MAX, VE, "preset" },
+    { "fast",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_5  },            INT_MIN, INT_MAX, VE, "preset" },
+    { "medium",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BALANCED  },     INT_MIN, INT_MAX, VE, "preset" },
+    { "slow",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_3  },            INT_MIN, INT_MAX, VE, "preset" },
+    { "slower",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_2  },            INT_MIN, INT_MAX, VE, "preset" },
+    { "veryslow",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BEST_QUALITY  }, INT_MIN, INT_MAX, VE, "preset" },
 
     { NULL },
 };
diff --git a/libavcodec/qsvenc_hevc.c b/libavcodec/qsvenc_hevc.c
index 75777432b8..eb43c491cd 100644
--- a/libavcodec/qsvenc_hevc.c
+++ b/libavcodec/qsvenc_hevc.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV based HEVC encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -68,7 +68,7 @@ static int generate_fake_vps(QSVEncContext *q, AVCodecContext *avctx)
     }
 
     /* parse the SPS */
-    ret = ff_hevc_extract_rbsp(avctx->extradata + 4, avctx->extradata_size - 4, &sps_nal);
+    ret = ff_hevc_extract_rbsp(NULL, avctx->extradata + 4, avctx->extradata_size - 4, &sps_nal);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error unescaping the SPS buffer\n");
         return ret;
@@ -263,7 +263,7 @@ AVCodec ff_hevc_qsv_encoder = {
     .init           = qsv_enc_init,
     .encode2        = qsv_enc_frame,
     .close          = qsv_enc_close,
-    .capabilities   = CODEC_CAP_DELAY,
+    .capabilities   = AV_CODEC_CAP_DELAY,
     .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
                                                     AV_PIX_FMT_QSV,
                                                     AV_PIX_FMT_NONE },
diff --git a/libavcodec/qsvenc_mpeg2.c b/libavcodec/qsvenc_mpeg2.c
index ed7c23e311..2b3b1f74e5 100644
--- a/libavcodec/qsvenc_mpeg2.c
+++ b/libavcodec/qsvenc_mpeg2.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV based MPEG-2 encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -108,7 +108,7 @@ AVCodec ff_mpeg2_qsv_encoder = {
     .init           = qsv_enc_init,
     .encode2        = qsv_enc_frame,
     .close          = qsv_enc_close,
-    .capabilities   = CODEC_CAP_DELAY,
+    .capabilities   = AV_CODEC_CAP_DELAY,
     .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
                                                     AV_PIX_FMT_QSV,
                                                     AV_PIX_FMT_NONE },
diff --git a/libavcodec/qtrle.c b/libavcodec/qtrle.c
index d88144fede..1fcf5b3c79 100644
--- a/libavcodec/qtrle.c
+++ b/libavcodec/qtrle.c
@@ -1,21 +1,21 @@
 /*
  * Quicktime Animation (RLE) Video Decoder
- * Copyright (C) 2004 the ffmpeg project
+ * Copyright (c) 2004 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,17 +59,25 @@ static void qtrle_decode_1bpp(QtrleContext *s, int row_ptr, int lines_to_change)
     int rle_code;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned char pi0, pi1;  /* 2 8-pixel values */
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t pi0, pi1;  /* 2 8-pixel values */
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
     int skip;
+    /* skip & 0x80 appears to mean 'start a new line', which can be interpreted
+     * as 'go to next line' during the decoding of a frame but is 'go to first
+     * line' at the beginning. Since we always interpret it as 'go to next line'
+     * in the decoding loop (which makes code simpler/faster), the first line
+     * would not be counted, so we count one more.
+     * See: https://trac.ffmpeg.org/ticket/226
+     * In the following decoding loop, row_ptr will be the position of the
+     * current row. */
 
     row_ptr  -= row_inc;
     pixel_ptr = row_ptr;
     lines_to_change++;
     while (lines_to_change) {
         skip     =              bytestream2_get_byte(&s->g);
-        rle_code = (signed char)bytestream2_get_byte(&s->g);
+        rle_code = (int8_t)bytestream2_get_byte(&s->g);
         if (rle_code == 0)
             break;
         if(skip & 0x80) {
@@ -80,6 +88,9 @@ static void qtrle_decode_1bpp(QtrleContext *s, int row_ptr, int lines_to_change)
             pixel_ptr += 2 * skip;
         CHECK_PIXEL_PTR(0);  /* make sure pixel_ptr is positive */
 
+        if(rle_code == -1)
+            continue;
+
         if (rle_code < 0) {
             /* decode the run length code */
             rle_code = -rle_code;
@@ -99,8 +110,8 @@ static void qtrle_decode_1bpp(QtrleContext *s, int row_ptr, int lines_to_change)
             rle_code *= 2;
             CHECK_PIXEL_PTR(rle_code);
 
-            while (rle_code--)
-                rgb[pixel_ptr++] = bytestream2_get_byte(&s->g);
+            bytestream2_get_buffer(&s->g, &rgb[pixel_ptr], rle_code);
+            pixel_ptr += rle_code;
         }
     }
 }
@@ -111,8 +122,8 @@ static inline void qtrle_decode_2n4bpp(QtrleContext *s, int row_ptr,
     int rle_code, i;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned char pi[16];  /* 16 palette indices */
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t pi[16];  /* 16 palette indices */
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
     int num_pixels = (bpp == 4) ? 8 : 16;
 
@@ -120,7 +131,7 @@ static inline void qtrle_decode_2n4bpp(QtrleContext *s, int row_ptr,
         pixel_ptr = row_ptr + (num_pixels * (bytestream2_get_byte(&s->g) - 1));
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (num_pixels * (bytestream2_get_byte(&s->g) - 1));
@@ -136,8 +147,8 @@ static inline void qtrle_decode_2n4bpp(QtrleContext *s, int row_ptr,
                 }
                 CHECK_PIXEL_PTR(rle_code * num_pixels);
                 while (rle_code--) {
-                    for (i = 0; i < num_pixels; i++)
-                        rgb[pixel_ptr++] = pi[i];
+                    memcpy(&rgb[pixel_ptr], &pi, num_pixels);
+                    pixel_ptr += num_pixels;
                 }
             } else {
                 /* copy the same pixel directly to output 4 times */
@@ -167,15 +178,15 @@ static void qtrle_decode_8bpp(QtrleContext *s, int row_ptr, int lines_to_change)
     int rle_code;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned char pi1, pi2, pi3, pi4;  /* 4 palette indexes */
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t pi1, pi2, pi3, pi4;  /* 4 palette indexes */
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
 
     while (lines_to_change--) {
         pixel_ptr = row_ptr + (4 * (bytestream2_get_byte(&s->g) - 1));
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (4 * (bytestream2_get_byte(&s->g) - 1));
@@ -203,9 +214,8 @@ static void qtrle_decode_8bpp(QtrleContext *s, int row_ptr, int lines_to_change)
                 rle_code *= 4;
                 CHECK_PIXEL_PTR(rle_code);
 
-                while (rle_code--) {
-                    rgb[pixel_ptr++] = bytestream2_get_byte(&s->g);
-                }
+                bytestream2_get_buffer(&s->g, &rgb[pixel_ptr], rle_code);
+                pixel_ptr += rle_code;
             }
         }
         row_ptr += row_inc;
@@ -217,15 +227,15 @@ static void qtrle_decode_16bpp(QtrleContext *s, int row_ptr, int lines_to_change
     int rle_code;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned short rgb16;
-    unsigned char *rgb = s->frame->data[0];
+    uint16_t rgb16;
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
 
     while (lines_to_change--) {
         pixel_ptr = row_ptr + (bytestream2_get_byte(&s->g) - 1) * 2;
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (bytestream2_get_byte(&s->g) - 1) * 2;
@@ -238,7 +248,7 @@ static void qtrle_decode_16bpp(QtrleContext *s, int row_ptr, int lines_to_change
                 CHECK_PIXEL_PTR(rle_code * 2);
 
                 while (rle_code--) {
-                    *(unsigned short *)(&rgb[pixel_ptr]) = rgb16;
+                    *(uint16_t *)(&rgb[pixel_ptr]) = rgb16;
                     pixel_ptr += 2;
                 }
             } else {
@@ -247,7 +257,7 @@ static void qtrle_decode_16bpp(QtrleContext *s, int row_ptr, int lines_to_change
                 /* copy pixels directly to output */
                 while (rle_code--) {
                     rgb16 = bytestream2_get_be16(&s->g);
-                    *(unsigned short *)(&rgb[pixel_ptr]) = rgb16;
+                    *(uint16_t *)(&rgb[pixel_ptr]) = rgb16;
                     pixel_ptr += 2;
                 }
             }
@@ -261,15 +271,15 @@ static void qtrle_decode_24bpp(QtrleContext *s, int row_ptr, int lines_to_change
     int rle_code;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned char r, g, b;
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t r, g, b;
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
 
     while (lines_to_change--) {
         pixel_ptr = row_ptr + (bytestream2_get_byte(&s->g) - 1) * 3;
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (bytestream2_get_byte(&s->g) - 1) * 3;
@@ -309,14 +319,14 @@ static void qtrle_decode_32bpp(QtrleContext *s, int row_ptr, int lines_to_change
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
     unsigned int argb;
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
 
     while (lines_to_change--) {
         pixel_ptr = row_ptr + (bytestream2_get_byte(&s->g) - 1) * 4;
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (bytestream2_get_byte(&s->g) - 1) * 4;
@@ -403,10 +413,8 @@ static int qtrle_decode_frame(AVCodecContext *avctx,
     int ret;
 
     bytestream2_init(&s->g, avpkt->data, avpkt->size);
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log (s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     /* check if this frame is even supposed to change */
     if (avpkt->size < 8)
@@ -426,6 +434,8 @@ static int qtrle_decode_frame(AVCodecContext *avctx,
         bytestream2_skip(&s->g, 2);
         height     = bytestream2_get_be16(&s->g);
         bytestream2_skip(&s->g, 2);
+        if (height > s->avctx->height - start_line)
+            goto done;
     } else {
         start_line = 0;
         height     = s->avctx->height;
diff --git a/libavcodec/qtrleenc.c b/libavcodec/qtrleenc.c
index 29deb2123e..590c39dcea 100644
--- a/libavcodec/qtrleenc.c
+++ b/libavcodec/qtrleenc.c
@@ -5,20 +5,20 @@
  *
  * This file is based on flashsvenc.c.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,6 +39,7 @@ typedef struct QtrleEncContext {
     int pixel_size;
     AVPicture previous_frame;
     unsigned int max_buf_size;
+    int logical_width;
     /**
      * This array will contain at ith position the value of the best RLE code
      * if the line started at pixel i
@@ -76,13 +77,23 @@ static av_cold int qtrle_encode_end(AVCodecContext *avctx)
 static av_cold int qtrle_encode_init(AVCodecContext *avctx)
 {
     QtrleEncContext *s = avctx->priv_data;
+    int ret;
 
     if (av_image_check_size(avctx->width, avctx->height, 0, avctx) < 0) {
-        return -1;
+        return AVERROR(EINVAL);
     }
     s->avctx=avctx;
+    s->logical_width=avctx->width;
 
     switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_GRAY8:
+        if (avctx->width % 4) {
+            av_log(avctx, AV_LOG_ERROR, "Width not being a multiple of 4 is not supported\n");
+            return AVERROR(EINVAL);
+        }
+        s->logical_width = avctx->width / 4;
+        s->pixel_size = 4;
+        break;
     case AV_PIX_FMT_RGB555BE:
         s->pixel_size = 2;
         break;
@@ -96,24 +107,24 @@ static av_cold int qtrle_encode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "Unsupported colorspace.\n");
         break;
     }
-    avctx->bits_per_coded_sample = s->pixel_size*8;
+    avctx->bits_per_coded_sample = avctx->pix_fmt == AV_PIX_FMT_GRAY8 ? 40 : s->pixel_size*8;
 
-    s->rlecode_table = av_mallocz(s->avctx->width);
-    s->skip_table    = av_mallocz(s->avctx->width);
-    s->length_table  = av_mallocz((s->avctx->width + 1)*sizeof(int));
+    s->rlecode_table = av_mallocz(s->logical_width);
+    s->skip_table    = av_mallocz(s->logical_width);
+    s->length_table  = av_mallocz_array(s->logical_width + 1, sizeof(int));
     if (!s->skip_table || !s->length_table || !s->rlecode_table) {
         av_log(avctx, AV_LOG_ERROR, "Error allocating memory.\n");
-        return -1;
+        return AVERROR(ENOMEM);
     }
-    if (avpicture_alloc(&s->previous_frame, avctx->pix_fmt, avctx->width, avctx->height) < 0) {
+    if ((ret = avpicture_alloc(&s->previous_frame, avctx->pix_fmt, avctx->width, avctx->height)) < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error allocating picture\n");
-        return -1;
+        return ret;
     }
 
-    s->max_buf_size = s->avctx->width*s->avctx->height*s->pixel_size*2 /* image base material */
-                      + 15                                           /* header + footer */
-                      + s->avctx->height*2                           /* skip code+rle end */
-                      + s->avctx->width/MAX_RLE_BULK + 1             /* rle codes */;
+    s->max_buf_size = s->logical_width*s->avctx->height*s->pixel_size*2 /* image base material */
+                      + 15                                            /* header + footer */
+                      + s->avctx->height*2                            /* skip code+rle end */
+                      + s->logical_width/MAX_RLE_BULK + 1             /* rle codes */;
 
     return 0;
 }
@@ -123,26 +134,26 @@ static av_cold int qtrle_encode_init(AVCodecContext *avctx)
  */
 static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, uint8_t **buf)
 {
-    int width=s->avctx->width;
+    int width=s->logical_width;
     int i;
     signed char rlecode;
 
-    /* We will use it to compute the best bulk copy sequence */
-    unsigned int bulkcount;
     /* This will be the number of pixels equal to the preivous frame one's
      * starting from the ith pixel */
     unsigned int skipcount;
     /* This will be the number of consecutive equal pixels in the current
      * frame, starting from the ith one also */
-    unsigned int repeatcount;
+    unsigned int av_uninit(repeatcount);
 
     /* The cost of the three different possibilities */
-    int total_bulk_cost;
     int total_skip_cost;
     int total_repeat_cost;
 
-    int temp_cost;
-    int j;
+    int base_bulk_cost;
+    int lowest_bulk_cost;
+    int lowest_bulk_cost_index;
+    int sec_lowest_bulk_cost;
+    int sec_lowest_bulk_cost_index;
 
     uint8_t *this_line = p->               data[0] + line*p->               linesize[0] +
         (width - 1)*s->pixel_size;
@@ -152,8 +163,57 @@ static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, ui
     s->length_table[width] = 0;
     skipcount = 0;
 
+    /* Initial values */
+    lowest_bulk_cost = INT_MAX / 2;
+    lowest_bulk_cost_index = width;
+    sec_lowest_bulk_cost = INT_MAX / 2;
+    sec_lowest_bulk_cost_index = width;
+
+    base_bulk_cost = 1 + s->pixel_size;
+
     for (i = width - 1; i >= 0; i--) {
 
+        int prev_bulk_cost;
+
+        /* If our lowest bulk cost index is too far away, replace it
+         * with the next lowest bulk cost */
+        if (FFMIN(width, i + MAX_RLE_BULK) < lowest_bulk_cost_index) {
+            lowest_bulk_cost = sec_lowest_bulk_cost;
+            lowest_bulk_cost_index = sec_lowest_bulk_cost_index;
+
+            sec_lowest_bulk_cost = INT_MAX / 2;
+            sec_lowest_bulk_cost_index = width;
+        }
+
+        /* Deal with the first pixel's bulk cost */
+        if (!i) {
+            base_bulk_cost++;
+            lowest_bulk_cost++;
+            sec_lowest_bulk_cost++;
+        }
+
+        /* Look at the bulk cost of the previous loop and see if it is
+         * a new lower bulk cost */
+        prev_bulk_cost = s->length_table[i + 1] + base_bulk_cost;
+        if (prev_bulk_cost <= sec_lowest_bulk_cost) {
+            /* If it's lower than the 2nd lowest, then it may be lower
+             * than the lowest */
+            if (prev_bulk_cost <= lowest_bulk_cost) {
+
+                /* If we have found a new lowest bulk cost,
+                 * then the 2nd lowest bulk cost is now farther than the
+                 * lowest bulk cost, and will never be used */
+                sec_lowest_bulk_cost = INT_MAX / 2;
+
+                lowest_bulk_cost = prev_bulk_cost;
+                lowest_bulk_cost_index = i + 1;
+            } else {
+                /* Then it must be the 2nd lowest bulk cost */
+                sec_lowest_bulk_cost = prev_bulk_cost;
+                sec_lowest_bulk_cost_index = i + 1;
+            }
+        }
+
         if (!s->key_frame && !memcmp(this_line, prev_line, s->pixel_size))
             skipcount = FFMIN(skipcount + 1, MAX_RLE_SKIP);
         else
@@ -189,31 +249,22 @@ static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, ui
         }
         else {
             /* We cannot do neither skip nor repeat
-             * thus we search for the best bulk copy to do */
-
-            int limit = FFMIN(width - i, MAX_RLE_BULK);
+             * thus we use the best bulk copy  */
 
-            temp_cost = 1 + s->pixel_size + !i;
-            total_bulk_cost = INT_MAX;
+            s->length_table[i]  = lowest_bulk_cost;
+            s->rlecode_table[i] = lowest_bulk_cost_index - i;
 
-            for (j = 1; j <= limit; j++) {
-                if (s->length_table[i + j] + temp_cost < total_bulk_cost) {
-                    /* We have found a better bulk copy ... */
-                    total_bulk_cost = s->length_table[i + j] + temp_cost;
-                    bulkcount = j;
-                }
-                temp_cost += s->pixel_size;
-            }
-
-            s->length_table[i]  = total_bulk_cost;
-            s->rlecode_table[i] = bulkcount;
         }
 
+        /* These bulk costs increase every iteration */
+        lowest_bulk_cost += s->pixel_size;
+        sec_lowest_bulk_cost += s->pixel_size;
+
         this_line -= s->pixel_size;
         prev_line -= s->pixel_size;
     }
 
-    /* Good ! Now we have the best sequence for this line, let's ouput it */
+    /* Good ! Now we have the best sequence for this line, let's output it */
 
     /* We do a special case for the first pixel so that we avoid testing it in
      * the whole loop */
@@ -238,12 +289,28 @@ static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, ui
         }
         else if (rlecode > 0) {
             /* bulk copy */
-            bytestream_put_buffer(buf, this_line + i*s->pixel_size, rlecode*s->pixel_size);
+            if (s->avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                int j;
+                // QT grayscale colorspace has 0=white and 255=black, we will
+                // ignore the palette that is included in the AVFrame because
+                // AV_PIX_FMT_GRAY8 has defined color mapping
+                for (j = 0; j < rlecode*s->pixel_size; ++j)
+                    bytestream_put_byte(buf, *(this_line + i*s->pixel_size + j) ^ 0xff);
+            } else {
+                bytestream_put_buffer(buf, this_line + i*s->pixel_size, rlecode*s->pixel_size);
+            }
             i += rlecode;
         }
         else {
             /* repeat the bits */
-            bytestream_put_buffer(buf, this_line + i*s->pixel_size, s->pixel_size);
+            if (s->avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                int j;
+                // QT grayscale colorspace has 0=white and 255=black, ...
+                for (j = 0; j < s->pixel_size; ++j)
+                    bytestream_put_byte(buf, *(this_line + i*s->pixel_size + j) ^ 0xff);
+            } else {
+                bytestream_put_buffer(buf, this_line + i*s->pixel_size, s->pixel_size);
+            }
             i -= rlecode;
         }
     }
@@ -259,7 +326,7 @@ static int encode_frame(QtrleEncContext *s, const AVFrame *p, uint8_t *buf)
     uint8_t *orig_buf = buf;
 
     if (!s->key_frame) {
-        unsigned line_size = s->avctx->width * s->pixel_size;
+        unsigned line_size = s->logical_width * s->pixel_size;
         for (start_line = 0; start_line < s->avctx->height; start_line++)
             if (memcmp(p->data[0] + start_line*p->linesize[0],
                        s->previous_frame.data[0] + start_line*s->previous_frame.linesize[0],
@@ -299,11 +366,8 @@ static int qtrle_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     enum AVPictureType pict_type;
     int ret;
 
-    if ((ret = ff_alloc_packet(pkt, s->max_buf_size)) < 0) {
-        /* Upper bound check for compressed data */
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", s->max_buf_size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->max_buf_size, 0)) < 0)
         return ret;
-    }
 
     if (avctx->gop_size == 0 || (s->avctx->frame_number % avctx->gop_size) == 0) {
         /* I-Frame */
@@ -345,6 +409,6 @@ AVCodec ff_qtrle_encoder = {
     .encode2        = qtrle_encode_frame,
     .close          = qtrle_encode_end,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB555BE, AV_PIX_FMT_ARGB, AV_PIX_FMT_NONE
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB555BE, AV_PIX_FMT_ARGB, AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE
     },
 };
diff --git a/libavcodec/r210dec.c b/libavcodec/r210dec.c
index f168fd3bfd..9c868cd11c 100644
--- a/libavcodec/r210dec.c
+++ b/libavcodec/r210dec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Doeffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,11 @@
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
-    avctx->pix_fmt             = AV_PIX_FMT_RGB48;
+    if ((avctx->codec_tag & 0xFFFFFF) == MKTAG('r', '1', '0', 0)) {
+        avctx->pix_fmt = AV_PIX_FMT_BGR48;
+    } else {
+        avctx->pix_fmt = AV_PIX_FMT_RGB48;
+    }
     avctx->bits_per_raw_sample = 10;
 
     return 0;
@@ -39,8 +43,13 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int h, w, ret;
     AVFrame *pic = data;
     const uint32_t *src = (const uint32_t *)avpkt->data;
-    int aligned_width = FFALIGN(avctx->width, 64);
+    int aligned_width = FFALIGN(avctx->width,
+                                avctx->codec_id == AV_CODEC_ID_R10K ? 1 : 64);
     uint8_t *dst_line;
+    int r10 = (avctx->codec_tag & 0xFFFFFF) == MKTAG('r', '1', '0', 0);
+    int le = avctx->codec_tag == MKTAG('R', '1', '0', 'k') &&
+             avctx->extradata_size >= 12 && !memcmp(&avctx->extradata[4], "DpxE", 4) &&
+             !avctx->extradata[11];
 
     if (avpkt->size < 4 * aligned_width * avctx->height) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
@@ -57,14 +66,19 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     for (h = 0; h < avctx->height; h++) {
         uint16_t *dst = (uint16_t *)dst_line;
         for (w = 0; w < avctx->width; w++) {
-            uint32_t pixel = av_be2ne32(*src++);
+            uint32_t pixel;
             uint16_t r, g, b;
-            if (avctx->codec_id==AV_CODEC_ID_R210) {
+            if (avctx->codec_id == AV_CODEC_ID_AVRP || r10 || le) {
+                pixel = av_le2ne32(*src++);
+            } else {
+                pixel = av_be2ne32(*src++);
+            }
+            if (avctx->codec_id == AV_CODEC_ID_R210 || r10) {
                 b =  pixel <<  6;
                 g = (pixel >>  4) & 0xffc0;
                 r = (pixel >> 14) & 0xffc0;
             } else {
-                b =  pixel <<  4;
+                b = (pixel <<  4) & 0xffc0;
                 g = (pixel >>  6) & 0xffc0;
                 r = (pixel >> 16) & 0xffc0;
             }
@@ -103,3 +117,14 @@ AVCodec ff_r10k_decoder = {
     .capabilities   = AV_CODEC_CAP_DR1,
 };
 #endif
+#if CONFIG_AVRP_DECODER
+AVCodec ff_avrp_decoder = {
+    .name           = "avrp",
+    .long_name      = NULL_IF_CONFIG_SMALL("Avid 1:1 10-bit RGB Packer"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AVRP,
+    .init           = decode_init,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
+#endif
diff --git a/libavcodec/r210enc.c b/libavcodec/r210enc.c
new file mode 100644
index 0000000000..65b3c069fc
--- /dev/null
+++ b/libavcodec/r210enc.c
@@ -0,0 +1,102 @@
+/*
+ * R210 encoder
+ *
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "bytestream.h"
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pic, int *got_packet)
+{
+    int i, j, ret;
+    int aligned_width = FFALIGN(avctx->width,
+                                avctx->codec_id == AV_CODEC_ID_R10K ? 1 : 64);
+    int pad = (aligned_width - avctx->width) * 4;
+    uint8_t *src_line;
+    uint8_t *dst;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, 4 * aligned_width * avctx->height, 0)) < 0)
+        return ret;
+
+    src_line = pic->data[0];
+    dst = pkt->data;
+
+    for (i = 0; i < avctx->height; i++) {
+        uint16_t *src = (uint16_t *)src_line;
+        for (j = 0; j < avctx->width; j++) {
+            uint32_t pixel;
+            uint16_t r = *src++ >> 6;
+            uint16_t g = *src++ >> 6;
+            uint16_t b = *src++ >> 6;
+            if (avctx->codec_id == AV_CODEC_ID_R210)
+                pixel = (r << 20) | (g << 10) | b;
+            else
+                pixel = (r << 22) | (g << 12) | (b << 2);
+            if (avctx->codec_id == AV_CODEC_ID_AVRP)
+                bytestream_put_le32(&dst, pixel);
+            else
+                bytestream_put_be32(&dst, pixel);
+        }
+        memset(dst, 0, pad);
+        dst += pad;
+        src_line += pic->linesize[0];
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+
+#if CONFIG_R210_ENCODER
+AVCodec ff_r210_encoder = {
+    .name           = "r210",
+    .long_name      = NULL_IF_CONFIG_SMALL("Uncompressed RGB 10-bit"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_R210,
+    .encode2        = encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_RGB48, AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif
+#if CONFIG_R10K_ENCODER
+AVCodec ff_r10k_encoder = {
+    .name           = "r10k",
+    .long_name      = NULL_IF_CONFIG_SMALL("AJA Kona 10-bit RGB Codec"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_R10K,
+    .encode2        = encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_RGB48, AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif
+#if CONFIG_AVRP_ENCODER
+AVCodec ff_avrp_encoder = {
+    .name           = "avrp",
+    .long_name      = NULL_IF_CONFIG_SMALL("Avid 1:1 10-bit RGB Packer"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AVRP,
+    .encode2        = encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_RGB48, AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif
diff --git a/libavcodec/ra144.c b/libavcodec/ra144.c
index 355baddf33..696a49e7ab 100644
--- a/libavcodec/ra144.c
+++ b/libavcodec/ra144.c
@@ -1,21 +1,21 @@
 /*
  * Real Audio 1.0 (14.4K)
- * Copyright (c) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -1566,8 +1566,15 @@ int ff_eval_refl(int *refl, const int16_t *coefs, AVCodecContext *avctx)
         if (!b)
             b = -2;
 
-        for (j=0; j <= i; j++)
-            bp1[j] = ((bp2[j] - ((refl[i+1] * bp2[i-j]) >> 12)) * (0x1000000 / b)) >> 12;
+        b = 0x1000000 / b;
+        for (j=0; j <= i; j++) {
+#if CONFIG_FTRAPV
+            int a = bp2[j] - ((refl[i+1] * bp2[i-j]) >> 12);
+            if((int)(a*(unsigned)b) != a*(int64_t)b)
+                return 1;
+#endif
+            bp1[j] = ((bp2[j] - ((refl[i+1] * bp2[i-j]) >> 12)) * b) >> 12;
+        }
 
         if ((unsigned) bp1[i] + 0x1000 > 0x1fff)
             return 1;
@@ -1674,12 +1681,9 @@ unsigned int ff_rescale_rms(unsigned int rms, unsigned int energy)
 }
 
 /** inverse root mean square */
-int ff_irms(const int16_t *data)
+int ff_irms(AudioDSPContext *adsp, const int16_t *data)
 {
-    unsigned int i, sum = 0;
-
-    for (i=0; i < BLOCKSIZE; i++)
-        sum += data[i] * data[i];
+    unsigned int sum = adsp->scalarproduct_int16(data, data, BLOCKSIZE);
 
     if (sum == 0)
         return 0; /* OOPS - division by zero */
@@ -1687,18 +1691,17 @@ int ff_irms(const int16_t *data)
     return 0x20000000 / (ff_t_sqrt(sum) >> 8);
 }
 
-void ff_subblock_synthesis(RA144Context *ractx, const uint16_t *lpc_coefs,
+void ff_subblock_synthesis(RA144Context *ractx, const int16_t *lpc_coefs,
                            int cba_idx, int cb1_idx, int cb2_idx,
                            int gval, int gain)
 {
-    uint16_t buffer_a[BLOCKSIZE];
-    uint16_t *block;
+    int16_t *block;
     int m[3];
 
     if (cba_idx) {
         cba_idx += BLOCKSIZE/2 - 1;
-        ff_copy_and_dup(buffer_a, ractx->adapt_cb, cba_idx);
-        m[0] = (ff_irms(buffer_a) * gval) >> 12;
+        ff_copy_and_dup(ractx->buffer_a, ractx->adapt_cb, cba_idx);
+        m[0] = (ff_irms(&ractx->adsp, ractx->buffer_a) * gval) >> 12;
     } else {
         m[0] = 0;
     }
@@ -1709,7 +1712,7 @@ void ff_subblock_synthesis(RA144Context *ractx, const uint16_t *lpc_coefs,
 
     block = ractx->adapt_cb + BUFFERSIZE - BLOCKSIZE;
 
-    add_wav(block, gain, cba_idx, m, cba_idx? buffer_a: NULL,
+    add_wav(block, gain, cba_idx, m, cba_idx? ractx->buffer_a: NULL,
             ff_cb1_vects[cb1_idx], ff_cb2_vects[cb2_idx]);
 
     memcpy(ractx->curr_sblock, ractx->curr_sblock + BLOCKSIZE,
diff --git a/libavcodec/ra144.h b/libavcodec/ra144.h
index 81d6964abc..df747905b3 100644
--- a/libavcodec/ra144.h
+++ b/libavcodec/ra144.h
@@ -1,21 +1,21 @@
 /*
  * Real Audio 1.0 (14.4K)
- * Copyright (c) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,16 +25,18 @@
 #include <stdint.h>
 #include "lpc.h"
 #include "audio_frame_queue.h"
+#include "audiodsp.h"
 
 #define NBLOCKS         4       ///< number of subblocks within a block
 #define BLOCKSIZE       40      ///< subblock size in 16-bit words
 #define BUFFERSIZE      146     ///< the size of the adaptive codebook
 #define FIXED_CB_SIZE   128     ///< size of fixed codebooks
-#define FRAMESIZE       20      ///< size of encoded frame
+#define FRAME_SIZE      20      ///< size of encoded frame
 #define LPC_ORDER       10      ///< order of LPC filter
 
 typedef struct RA144Context {
     AVCodecContext *avctx;
+    AudioDSPContext adsp;
     LPCContext lpc_ctx;
     AudioFrameQueue afq;
     int last_frame;
@@ -56,7 +58,9 @@ typedef struct RA144Context {
 
     /** Adaptive codebook, its size is two units bigger to avoid a
      *  buffer overflow. */
-    uint16_t adapt_cb[146+2];
+    int16_t adapt_cb[146+2];
+
+    DECLARE_ALIGNED(16, int16_t, buffer_a)[FFALIGN(BLOCKSIZE,16)];
 } RA144Context;
 
 void ff_copy_and_dup(int16_t *target, const int16_t *source, int offset);
@@ -68,8 +72,8 @@ unsigned int ff_rms(const int *data);
 int ff_interp(RA144Context *ractx, int16_t *out, int a, int copyold,
               int energy);
 unsigned int ff_rescale_rms(unsigned int rms, unsigned int energy);
-int ff_irms(const int16_t *data);
-void ff_subblock_synthesis(RA144Context *ractx, const uint16_t *lpc_coefs,
+int ff_irms(AudioDSPContext *adsp, const int16_t *data/*align 16*/);
+void ff_subblock_synthesis(RA144Context *ractx, const int16_t *lpc_coefs,
                            int cba_idx, int cb1_idx, int cb2_idx,
                            int gval, int gain);
 
diff --git a/libavcodec/ra144dec.c b/libavcodec/ra144dec.c
index 289535741f..3eed17c0da 100644
--- a/libavcodec/ra144dec.c
+++ b/libavcodec/ra144dec.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2003 Nick Kurshev
  *     Based on public domain decoder at http://www.honeypot.net/audio
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@ static av_cold int ra144_decode_init(AVCodecContext * avctx)
     RA144Context *ractx = avctx->priv_data;
 
     ractx->avctx = avctx;
+    ff_audiodsp_init(&ractx->adsp);
 
     ractx->lpc_coef[0] = ractx->lpc_tables[0];
     ractx->lpc_coef[1] = ractx->lpc_tables[1];
@@ -45,7 +46,7 @@ static av_cold int ra144_decode_init(AVCodecContext * avctx)
     return 0;
 }
 
-static void do_output_subblock(RA144Context *ractx, const uint16_t  *lpc_coefs,
+static void do_output_subblock(RA144Context *ractx, const int16_t  *lpc_coefs,
                                int gval, GetBitContext *gb)
 {
     int cba_idx = get_bits(gb, 7); // index of the adaptive CB, 0 if none
@@ -66,7 +67,7 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
     int buf_size = avpkt->size;
     static const uint8_t sizes[LPC_ORDER] = {6, 5, 5, 4, 4, 3, 3, 3, 3, 2};
     unsigned int refl_rms[NBLOCKS];           // RMS of the reflection coefficients
-    uint16_t block_coefs[NBLOCKS][LPC_ORDER]; // LPC coefficients of each sub-block
+    int16_t block_coefs[NBLOCKS][LPC_ORDER];  // LPC coefficients of each sub-block
     unsigned int lpc_refl[LPC_ORDER];         // LPC reflection coefficients of the frame
     int i, j;
     int ret;
@@ -76,7 +77,7 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
     RA144Context *ractx = avctx->priv_data;
     GetBitContext gb;
 
-    if (buf_size < FRAMESIZE) {
+    if (buf_size < FRAME_SIZE) {
         av_log(avctx, AV_LOG_ERROR,
                "Frame too small (%d bytes). Truncated file?\n", buf_size);
         *got_frame_ptr = 0;
@@ -85,13 +86,11 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = NBLOCKS * BLOCKSIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
-    init_get_bits(&gb, buf, FRAMESIZE * 8);
+    init_get_bits8(&gb, buf, FRAME_SIZE);
 
     for (i = 0; i < LPC_ORDER; i++)
         lpc_refl[i] = ff_lpc_refl_cb[i][get_bits(&gb, sizes[i])];
@@ -124,7 +123,7 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
 
     *got_frame_ptr = 1;
 
-    return FRAMESIZE;
+    return FRAME_SIZE;
 }
 
 AVCodec ff_ra_144_decoder = {
diff --git a/libavcodec/ra144enc.c b/libavcodec/ra144enc.c
index 1d1498bd32..32755d2204 100644
--- a/libavcodec/ra144enc.c
+++ b/libavcodec/ra144enc.c
@@ -2,20 +2,20 @@
  * Real Audio 1.0 (14.4K) encoder
  * Copyright (c) 2010 Francesco Lavra <francescolavra@interfree.it>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,6 @@
 #include "put_bits.h"
 #include "ra144.h"
 
-
 static av_cold int ra144_encode_close(AVCodecContext *avctx)
 {
     RA144Context *ractx = avctx->priv_data;
@@ -62,6 +61,7 @@ static av_cold int ra144_encode_init(AVCodecContext * avctx)
     ractx->lpc_coef[0] = ractx->lpc_tables[0];
     ractx->lpc_coef[1] = ractx->lpc_tables[1];
     ractx->avctx = avctx;
+    ff_audiodsp_init(&ractx->adsp);
     ret = ff_lpc_init(&ractx->lpc_ctx, avctx->frame_size, LPC_ORDER,
                       FF_LPC_TYPE_LEVINSON);
     if (ret < 0)
@@ -198,8 +198,8 @@ static void create_adapt_vect(float *vect, const int16_t *cb, int lag)
 static int adaptive_cb_search(const int16_t *adapt_cb, float *work,
                               const float *coefs, float *data)
 {
-    int i, best_vect;
-    float score, gain, best_score, best_gain;
+    int i, av_uninit(best_vect);
+    float score, gain, best_score, av_uninit(best_gain);
     float exc[BLOCKSIZE];
 
     gain = best_score = 0;
@@ -335,9 +335,9 @@ static void ra144_encode_subblock(RA144Context *ractx,
     float data[BLOCKSIZE] = { 0 }, work[LPC_ORDER + BLOCKSIZE];
     float coefs[LPC_ORDER];
     float zero[BLOCKSIZE], cba[BLOCKSIZE], cb1[BLOCKSIZE], cb2[BLOCKSIZE];
-    int16_t cba_vect[BLOCKSIZE];
     int cba_idx, cb1_idx, cb2_idx, gain;
-    int i, n, m[3];
+    int i, n;
+    unsigned m[3];
     float g[3];
     float error, best_error;
 
@@ -373,8 +373,8 @@ static void ra144_encode_subblock(RA144Context *ractx,
          */
         memcpy(cba, work + LPC_ORDER, sizeof(cba));
 
-        ff_copy_and_dup(cba_vect, ractx->adapt_cb, cba_idx + BLOCKSIZE / 2 - 1);
-        m[0] = (ff_irms(cba_vect) * rms) >> 12;
+        ff_copy_and_dup(ractx->buffer_a, ractx->adapt_cb, cba_idx + BLOCKSIZE / 2 - 1);
+        m[0] = (ff_irms(&ractx->adsp, ractx->buffer_a) * rms) >> 12;
     }
     fixed_cb_search(work + LPC_ORDER, coefs, data, cba_idx, &cb1_idx, &cb2_idx);
     for (i = 0; i < BLOCKSIZE; i++) {
@@ -447,10 +447,8 @@ static int ra144_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     if (ractx->last_frame)
         return 0;
 
-    if ((ret = ff_alloc_packet(avpkt, FRAMESIZE))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, FRAME_SIZE, 0)) < 0)
         return ret;
-    }
 
     /**
      * Since the LPC coefficients are calculated on a frame centered over the
@@ -538,7 +536,7 @@ static int ra144_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     ff_af_queue_remove(&ractx->afq, avctx->frame_size, &avpkt->pts,
                        &avpkt->duration);
 
-    avpkt->size = FRAMESIZE;
+    avpkt->size = FRAME_SIZE;
     *got_packet_ptr = 1;
     return 0;
 }
@@ -556,4 +554,6 @@ AVCodec ff_ra_144_encoder = {
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = (const int[]){ 8000, 0 },
+    .channel_layouts = (const uint64_t[]) { AV_CH_LAYOUT_MONO, 0 },
 };
diff --git a/libavcodec/ra288.c b/libavcodec/ra288.c
index a306231129..8f5a7f2289 100644
--- a/libavcodec/ra288.c
+++ b/libavcodec/ra288.c
@@ -1,21 +1,21 @@
 /*
  * RealAudio 2.0 (28.8K)
- * Copyright (c) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@
 #define RA288_BLOCKS_PER_FRAME 32
 
 typedef struct RA288Context {
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     DECLARE_ALIGNED(32, float,   sp_lpc)[FFALIGN(36, 16)];   ///< LPC coefficients for speech data (spec: A)
     DECLARE_ALIGNED(32, float, gain_lpc)[FFALIGN(10, 16)];   ///< LPC coefficients for gain        (spec: GB)
 
@@ -59,6 +59,15 @@ typedef struct RA288Context {
     float gain_rec[11];
 } RA288Context;
 
+static av_cold int ra288_decode_close(AVCodecContext *avctx)
+{
+    RA288Context *ractx = avctx->priv_data;
+
+    av_freep(&ractx->fdsp);
+
+    return 0;
+}
+
 static av_cold int ra288_decode_init(AVCodecContext *avctx)
 {
     RA288Context *ractx = avctx->priv_data;
@@ -67,7 +76,14 @@ static av_cold int ra288_decode_init(AVCodecContext *avctx)
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLT;
 
-    avpriv_float_dsp_init(&ractx->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (avctx->block_align <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported block align\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    ractx->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!ractx->fdsp)
+        return AVERROR(ENOMEM);
 
     return 0;
 }
@@ -104,14 +120,14 @@ static void decode(RA288Context *ractx, float gain, int cb_coef)
     for (i=0; i < 5; i++)
         buffer[i] = codetable[cb_coef][i] * sumsum;
 
-    sum = avpriv_scalarproduct_float_c(buffer, buffer, 5) * ((1 << 24) / 5.0);
+    sum = avpriv_scalarproduct_float_c(buffer, buffer, 5);
 
-    sum = FFMAX(sum, 1);
+    sum = FFMAX(sum, 5.0 / (1<<24));
 
     /* shift and store */
     memmove(gain_block, gain_block + 1, 9 * sizeof(*gain_block));
 
-    gain_block[9] = 10 * log10(sum) - 32;
+    gain_block[9] = 10 * log10(sum) + (10*log10(((1<<24)/5.)) - 32);
 
     ff_celp_lp_synthesis_filterf(block, ractx->sp_lpc, buffer, 5, 36);
 }
@@ -139,7 +155,9 @@ static void do_hybrid_window(RA288Context *ractx,
                                             MAX_BACKWARD_FILTER_LEN   +
                                             MAX_BACKWARD_FILTER_NONREC, 16)]);
 
-    ractx->fdsp.vector_fmul(work, window, hist, FFALIGN(order + n + non_rec, 16));
+    av_assert2(order>=0);
+
+    ractx->fdsp->vector_fmul(work, window, hist, FFALIGN(order + n + non_rec, 16));
 
     convolve(buffer1, work + order    , n      , order);
     convolve(buffer2, work + order + n, non_rec, order);
@@ -166,7 +184,7 @@ static void backward_filter(RA288Context *ractx,
     do_hybrid_window(ractx, order, n, non_rec, temp, hist, rec, window);
 
     if (!compute_lpc_coefs(temp, order, lpc, 0, 1, 1))
-        ractx->fdsp.vector_fmul(lpc, lpc, tab, FFALIGN(order, 16));
+        ractx->fdsp->vector_fmul(lpc, lpc, tab, FFALIGN(order, 16));
 
     memmove(hist, hist + n, move_size*sizeof(*hist));
 }
@@ -189,16 +207,16 @@ static int ra288_decode_frame(AVCodecContext * avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
+    ret = init_get_bits8(&gb, buf, avctx->block_align);
+    if (ret < 0)
+        return ret;
+
     /* get output buffer */
     frame->nb_samples = RA288_BLOCK_SIZE * RA288_BLOCKS_PER_FRAME;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     out = (float *)frame->data[0];
 
-    init_get_bits(&gb, buf, avctx->block_align * 8);
-
     for (i=0; i < RA288_BLOCKS_PER_FRAME; i++) {
         float gain = amptable[get_bits(&gb, 3)];
         int cb_coef = get_bits(&gb, 6 + (i&1));
@@ -230,5 +248,6 @@ AVCodec ff_ra_288_decoder = {
     .priv_data_size = sizeof(RA288Context),
     .init           = ra288_decode_init,
     .decode         = ra288_decode_frame,
+    .close          = ra288_decode_close,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/ra288.h b/libavcodec/ra288.h
index 9f4beebbcf..d30e49a0e3 100644
--- a/libavcodec/ra288.h
+++ b/libavcodec/ra288.h
@@ -1,21 +1,21 @@
 /*
  * RealAudio 2.0 (28.8K)
- * Copyright (c) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ralf.c b/libavcodec/ralf.c
index bed5adfe00..8cd9f88dc0 100644
--- a/libavcodec/ralf.c
+++ b/libavcodec/ralf.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -461,10 +461,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     }
 
     frame->nb_samples = ctx->max_frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Me fail get_buffer()? That's unpossible!\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples0 = (int16_t *)frame->data[0];
     samples1 = (int16_t *)frame->data[1];
 
diff --git a/libavcodec/ralfdata.h b/libavcodec/ralfdata.h
index 83eb970f5a..9a84e45a32 100644
--- a/libavcodec/ralfdata.h
+++ b/libavcodec/ralfdata.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rangecoder.c b/libavcodec/rangecoder.c
index af0a8c009e..e4c5763e4d 100644
--- a/libavcodec/rangecoder.c
+++ b/libavcodec/rangecoder.c
@@ -2,20 +2,20 @@
  * Range coder
  * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@
 #include <string.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "rangecoder.h"
 #include "bytestream.h"
@@ -55,7 +56,7 @@ av_cold void ff_init_range_decoder(RangeCoder *c, const uint8_t *buf,
     /* cast to avoid compiler warning */
     ff_init_range_encoder(c, (uint8_t *)buf, buf_size);
 
-    c->low = bytestream_get_be16(&c->bytestream);
+    c->low = bytestream_get_be16((const uint8_t **)&c->bytestream);
 }
 
 void ff_build_rac_states(RangeCoder *c, int factor, int max_p)
@@ -107,8 +108,8 @@ int ff_rac_terminate(RangeCoder *c)
     c->range = 0xFF;
     renorm_encoder(c);
 
-    assert(c->low == 0);
-    assert(c->range >= 0x100);
+    av_assert1(c->low   == 0);
+    av_assert1(c->range >= 0x100);
 
     return c->bytestream - c->bytestream_start;
 }
@@ -119,11 +120,12 @@ int ff_rac_terminate(RangeCoder *c)
 #include "libavutil/lfg.h"
 #include "libavutil/log.h"
 
+static uint8_t b[9 * SIZE];
+static uint8_t r[9 * SIZE];
+
 int main(void)
 {
     RangeCoder c;
-    uint8_t b[9 * SIZE];
-    uint8_t r[9 * SIZE];
     int i;
     uint8_t state[10];
     AVLFG prng;
@@ -131,7 +133,7 @@ int main(void)
     av_lfg_init(&prng, 1);
 
     ff_init_range_encoder(&c, b, SIZE);
-    ff_build_rac_states(&c, 0.05 * (1LL << 32), 128 + 64 + 32 + 16);
+    ff_build_rac_states(&c, (1LL << 32) / 20, 128 + 64 + 32 + 16);
 
     memset(state, 128, sizeof(state));
 
diff --git a/libavcodec/rangecoder.h b/libavcodec/rangecoder.h
index 4c88169790..d36fbd7c54 100644
--- a/libavcodec/rangecoder.h
+++ b/libavcodec/rangecoder.h
@@ -2,20 +2,20 @@
  * Range coder
  * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,9 @@
 #define AVCODEC_RANGECODER_H
 
 #include <stdint.h>
-#include <assert.h>
 
 #include "libavutil/common.h"
+#include "libavutil/avassert.h"
 
 typedef struct RangeCoder {
     int low;
@@ -86,9 +86,9 @@ static inline void put_rac(RangeCoder *c, uint8_t *const state, int bit)
 {
     int range1 = (c->range * (*state)) >> 8;
 
-    assert(*state);
-    assert(range1 < c->range);
-    assert(range1 > 0);
+    av_assert2(*state);
+    av_assert2(range1 < c->range);
+    av_assert2(range1 > 0);
     if (!bit) {
         c->range -= range1;
         *state    = c->zero_state[*state];
diff --git a/libavcodec/ratecontrol.c b/libavcodec/ratecontrol.c
index c0eac6daf4..308e34ee18 100644
--- a/libavcodec/ratecontrol.c
+++ b/libavcodec/ratecontrol.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,9 +35,6 @@
 #include "mpegvideo.h"
 #include "libavutil/eval.h"
 
-#undef NDEBUG // Always check asserts, the speed effect is far too small to disable them.
-#include <assert.h>
-
 #ifndef M_E
 #define M_E 2.718281828
 #endif
@@ -50,7 +47,7 @@ void ff_write_pass1_stats(MpegEncContext *s)
 {
     snprintf(s->avctx->stats_out, 256,
              "in:%d out:%d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d "
-             "fcode:%d bcode:%d mc-var:%d var:%d icount:%d skipcount:%d hbits:%d;\n",
+             "fcode:%d bcode:%d mc-var:%"PRId64" var:%"PRId64" icount:%d skipcount:%d hbits:%d;\n",
              s->current_picture_ptr->f->display_picture_number,
              s->current_picture_ptr->f->coded_picture_number,
              s->pict_type,
@@ -67,6 +64,11 @@ void ff_write_pass1_stats(MpegEncContext *s)
              s->header_bits);
 }
 
+static double get_fps(AVCodecContext *avctx)
+{
+    return 1.0 / av_q2d(avctx->time_base) / FFMAX(avctx->ticks_per_frame, 1);
+}
+
 static inline double qp2bits(RateControlEntry *rce, double qp)
 {
     if (qp <= 0.0) {
@@ -128,6 +130,13 @@ av_cold int ff_rate_control_init(MpegEncContext *s)
     };
     emms_c();
 
+    if (!s->avctx->rc_max_available_vbv_use && s->avctx->rc_buffer_size) {
+        if (s->avctx->rc_max_rate) {
+            s->avctx->rc_max_available_vbv_use = av_clipf(s->avctx->rc_max_rate/(s->avctx->rc_buffer_size*get_fps(s->avctx)), 1.0/3, 1.0);
+        } else
+            s->avctx->rc_max_available_vbv_use = 1.0;
+    }
+
     res = av_expr_parse(&rcc->rc_eq_eval,
                         s->rc_eq ? s->rc_eq : "tex^qComp",
                         const_names, func1_names, func1,
@@ -158,6 +167,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
         rcc->last_qscale_for[i] = FF_QP2LAMBDA * 5;
     }
     rcc->buffer_index = s->avctx->rc_initial_buffer_occupancy;
+    if (!rcc->buffer_index)
+        rcc->buffer_index = s->avctx->rc_buffer_size * 3 / 4;
 
     if (s->avctx->flags & AV_CODEC_FLAG_PASS2) {
         int i;
@@ -171,9 +182,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
         if (i <= 0 || i >= INT_MAX / sizeof(RateControlEntry))
             return -1;
         rcc->entry       = av_mallocz(i * sizeof(RateControlEntry));
-        rcc->num_entries = i;
         if (!rcc->entry)
             return AVERROR(ENOMEM);
+        rcc->num_entries = i;
 
         /* init all to skipped p frames
          * (with b frames we might have a not encoded frame at the end FIXME) */
@@ -201,11 +212,11 @@ FF_ENABLE_DEPRECATION_WARNINGS
             }
             e = sscanf(p, " in:%d ", &picture_number);
 
-            assert(picture_number >= 0);
-            assert(picture_number < rcc->num_entries);
+            av_assert0(picture_number >= 0);
+            av_assert0(picture_number < rcc->num_entries);
             rce = &rcc->entry[picture_number];
 
-            e += sscanf(p, " in:%*d out:%*d type:%d q:%f itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d mc-var:%d var:%d icount:%d skipcount:%d hbits:%d",
+            e += sscanf(p, " in:%*d out:%*d type:%d q:%f itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d mc-var:%"SCNd64" var:%"SCNd64" icount:%d skipcount:%d hbits:%d",
                         &rce->pict_type, &rce->qscale, &rce->i_tex_bits, &rce->p_tex_bits,
                         &rce->mv_bits, &rce->misc_bits,
                         &rce->f_code, &rce->b_code,
@@ -290,7 +301,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 get_qscale(s, &rce, rcc->pass1_wanted_bits / rcc->pass1_rc_eq_output_sum, i);
 
                 // FIXME misbehaves a little for variable fps
-                rcc->pass1_wanted_bits += s->bit_rate / (1 / av_q2d(s->avctx->time_base));
+                rcc->pass1_wanted_bits += s->bit_rate / get_fps(s->avctx);
             }
         }
     }
@@ -315,7 +326,7 @@ av_cold void ff_rate_control_uninit(MpegEncContext *s)
 int ff_vbv_update(MpegEncContext *s, int frame_size)
 {
     RateControlContext *rcc = &s->rc_context;
-    const double fps        = 1 / av_q2d(s->avctx->time_base);
+    const double fps        = get_fps(s->avctx);
     const int buffer_size   = s->avctx->rc_buffer_size;
     const double min_rate   = s->avctx->rc_min_rate / fps;
     const double max_rate   = s->avctx->rc_max_rate / fps;
@@ -329,6 +340,9 @@ int ff_vbv_update(MpegEncContext *s, int frame_size)
         rcc->buffer_index -= frame_size;
         if (rcc->buffer_index < 0) {
             av_log(s->avctx, AV_LOG_ERROR, "rc buffer underflow\n");
+            if (frame_size > max_rate && s->qscale == s->avctx->qmax) {
+                av_log(s->avctx, AV_LOG_ERROR, "max bitrate possibly too small or try trellis with large lmax or increase qmax\n");
+            }
             rcc->buffer_index = 0;
         }
 
@@ -477,7 +491,7 @@ static void get_qminmax(int *qmin_ret, int *qmax_ret, MpegEncContext *s, int pic
     int qmin = s->lmin;
     int qmax = s->lmax;
 
-    assert(qmin <= qmax);
+    av_assert0(qmin <= qmax);
 
     switch (pict_type) {
     case AV_PICTURE_TYPE_B:
@@ -505,7 +519,7 @@ static double modify_qscale(MpegEncContext *s, RateControlEntry *rce,
 {
     RateControlContext *rcc  = &s->rc_context;
     const double buffer_size = s->avctx->rc_buffer_size;
-    const double fps         = 1 / av_q2d(s->avctx->time_base);
+    const double fps         = get_fps(s->avctx);
     const double min_rate    = s->avctx->rc_min_rate / fps;
     const double max_rate    = s->avctx->rc_max_rate / fps;
     const int pict_type      = rce->new_pict_type;
@@ -751,7 +765,7 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
     RateControlEntry local_rce, *rce;
     double bits;
     double rate_factor;
-    int var;
+    int64_t var;
     const int pict_type = s->pict_type;
     Picture * const pic = &s->current_picture;
     emms_c();
@@ -763,19 +777,25 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
 
     get_qminmax(&qmin, &qmax, s, pict_type);
 
-    fps = 1 / av_q2d(s->avctx->time_base);
+    fps = get_fps(s->avctx);
     /* update predictors */
     if (picture_number > 2 && !dry_run) {
-        const int last_var = s->last_pict_type == AV_PICTURE_TYPE_I ? rcc->last_mb_var_sum
-                                                                    : rcc->last_mc_mb_var_sum;
+        const int64_t last_var =
+            s->last_pict_type == AV_PICTURE_TYPE_I ? rcc->last_mb_var_sum
+                                                   : rcc->last_mc_mb_var_sum;
+        av_assert1(s->frame_bits >= s->stuffing_bits);
         update_predictor(&rcc->pred[s->last_pict_type],
                          rcc->last_qscale,
-                         sqrt(last_var), s->frame_bits);
+                         sqrt(last_var),
+                         s->frame_bits - s->stuffing_bits);
     }
 
     if (s->avctx->flags & AV_CODEC_FLAG_PASS2) {
-        assert(picture_number >= 0);
-        assert(picture_number < rcc->num_entries);
+        av_assert0(picture_number >= 0);
+        if (picture_number >= rcc->num_entries) {
+            av_log(s, AV_LOG_ERROR, "Input is longer than 2-pass log file\n");
+            return -1;
+        }
         rce         = &rcc->entry[picture_number];
         wanted_bits = rce->expected_bits;
     } else {
@@ -806,10 +826,10 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
     short_term_q = 0; /* avoid warning */
     if (s->avctx->flags & AV_CODEC_FLAG_PASS2) {
         if (pict_type != AV_PICTURE_TYPE_I)
-            assert(pict_type == rce->new_pict_type);
+            av_assert0(pict_type == rce->new_pict_type);
 
         q = rce->new_qscale / br_compensation;
-        ff_dlog(s, "%f %f %f last:%d var:%d type:%d//\n", q, rce->new_qscale,
+        ff_dlog(s, "%f %f %f last:%d var:%"PRId64" type:%d//\n", q, rce->new_qscale,
                 br_compensation, s->frame_bits, var, pict_type);
     } else {
         rce->pict_type     =
@@ -838,7 +858,6 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
         rcc->mv_bits_sum[pict_type] += rce->mv_bits;
         rcc->frame_count[pict_type]++;
 
-        bits        = rce->i_tex_bits + rce->p_tex_bits;
         rate_factor = rcc->pass1_wanted_bits /
                       rcc->pass1_rc_eq_output_sum * br_compensation;
 
@@ -846,9 +865,9 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
         if (q < 0)
             return -1;
 
-        assert(q > 0.0);
+        av_assert0(q > 0.0);
         q = get_diff_limited_q(s, rce, q);
-        assert(q > 0.0);
+        av_assert0(q > 0.0);
 
         // FIXME type dependent blur like in 2-pass
         if (pict_type == AV_PICTURE_TYPE_P || s->intra_only) {
@@ -859,19 +878,19 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
             rcc->short_term_qcount++;
             q = short_term_q = rcc->short_term_qsum / rcc->short_term_qcount;
         }
-        assert(q > 0.0);
+        av_assert0(q > 0.0);
 
         q = modify_qscale(s, rce, q, picture_number);
 
         rcc->pass1_wanted_bits += s->bit_rate / fps;
 
-        assert(q > 0.0);
+        av_assert0(q > 0.0);
     }
 
     if (s->avctx->debug & FF_DEBUG_RC) {
         av_log(s->avctx, AV_LOG_DEBUG,
                "%c qp:%d<%2.1f<%d %d want:%d total:%d comp:%f st_q:%2.2f "
-               "size:%d var:%d/%d br:%d fps:%d\n",
+               "size:%d var:%"PRId64"/%"PRId64" br:%"PRId64" fps:%d\n",
                av_get_picture_type_char(pict_type),
                qmin, q, qmax, picture_number,
                (int)wanted_bits / 1000, (int)s->total_bits / 1000,
@@ -906,7 +925,7 @@ static int init_pass2(MpegEncContext *s)
     RateControlContext *rcc = &s->rc_context;
     AVCodecContext *a       = s->avctx;
     int i, toobig;
-    double fps             = 1 / av_q2d(s->avctx->time_base);
+    double fps             = get_fps(s->avctx);
     double complexity[5]   = { 0 }; // approximate bits at quant=1
     uint64_t const_bits[5] = { 0 }; // quantizer independent bits
     uint64_t all_const_bits;
@@ -915,7 +934,7 @@ static int init_pass2(MpegEncContext *s)
     double rate_factor          = 0;
     double step;
     const int filter_size = (int)(a->qblur * 4) | 1;
-    double expected_bits;
+    double expected_bits = 0; // init to silence gcc warning
     double *qscale, *blurred_qscale, qscale_sum;
 
     /* find complexity & const_bits & decide the pict_types */
@@ -942,8 +961,8 @@ static int init_pass2(MpegEncContext *s)
         return -1;
     }
 
-    qscale         = av_malloc(sizeof(double) * rcc->num_entries);
-    blurred_qscale = av_malloc(sizeof(double) * rcc->num_entries);
+    qscale         = av_malloc_array(rcc->num_entries, sizeof(double));
+    blurred_qscale = av_malloc_array(rcc->num_entries, sizeof(double));
     if (!qscale || !blurred_qscale) {
         av_free(qscale);
         av_free(blurred_qscale);
@@ -964,9 +983,15 @@ static int init_pass2(MpegEncContext *s)
             qscale[i] = get_qscale(s, &rcc->entry[i], rate_factor, i);
             rcc->last_qscale_for[rce->pict_type] = qscale[i];
         }
-        assert(filter_size % 2 == 1);
+        av_assert0(filter_size % 2 == 1);
 
         /* fixed I/B QP relative to P mode */
+        for (i = FFMAX(0, rcc->num_entries - 300); i < rcc->num_entries; i++) {
+            RateControlEntry *rce = &rcc->entry[i];
+
+            qscale[i] = get_diff_limited_q(s, rce, qscale[i]);
+        }
+
         for (i = rcc->num_entries - 1; i >= 0; i--) {
             RateControlEntry *rce = &rcc->entry[i];
 
@@ -1030,11 +1055,11 @@ static int init_pass2(MpegEncContext *s)
         qscale_sum += av_clip(rcc->entry[i].new_qscale / FF_QP2LAMBDA,
                               s->avctx->qmin, s->avctx->qmax);
     }
-    assert(toobig <= 40);
+    av_assert0(toobig <= 40);
     av_log(s->avctx, AV_LOG_DEBUG,
-           "[lavc rc] requested bitrate: %d bps  expected bitrate: %d bps\n",
+           "[lavc rc] requested bitrate: %"PRId64" bps  expected bitrate: %"PRId64" bps\n",
            s->bit_rate,
-           (int)(expected_bits / ((double)all_available_bits / s->bit_rate)));
+           (int64_t)(expected_bits / ((double)all_available_bits / s->bit_rate)));
     av_log(s->avctx, AV_LOG_DEBUG,
            "[lavc rc] estimated target average qp: %.3f\n",
            (float)qscale_sum / rcc->num_entries);
diff --git a/libavcodec/ratecontrol.h b/libavcodec/ratecontrol.h
index 63ebeb2586..eeb4bb96c0 100644
--- a/libavcodec/ratecontrol.h
+++ b/libavcodec/ratecontrol.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,8 +49,8 @@ typedef struct RateControlEntry{
     uint64_t expected_bits;
     int new_pict_type;
     float new_qscale;
-    int mc_mb_var_sum;
-    int mb_var_sum;
+    int64_t mc_mb_var_sum;
+    int64_t mb_var_sum;
     int i_count;
     int skip_count;
     int f_code;
@@ -71,8 +71,8 @@ typedef struct RateControlContext{
     double pass1_wanted_bits;     ///< bits which should have been outputed by the pass1 code (including complexity init)
     double last_qscale;
     double last_qscale_for[5];    ///< last qscale for a specific pict type, used for max_diff & ipb factor stuff
-    int last_mc_mb_var_sum;
-    int last_mb_var_sum;
+    int64_t last_mc_mb_var_sum;
+    int64_t last_mb_var_sum;
     uint64_t i_cplx_sum[5];
     uint64_t p_cplx_sum[5];
     uint64_t mv_bits_sum[5];
diff --git a/libavcodec/raw.c b/libavcodec/raw.c
index 541ef7aa3a..62ad338b9f 100644
--- a/libavcodec/raw.c
+++ b/libavcodec/raw.c
@@ -2,20 +2,20 @@
  * Raw Video Codec
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,6 +65,7 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_UYVY422, MKTAG('A', 'V', 'u', 'p') },
     { AV_PIX_FMT_UYVY422, MKTAG('V', 'D', 'T', 'Z') }, /* SoftLab-NSK VideoTizer */
     { AV_PIX_FMT_UYVY422, MKTAG('a', 'u', 'v', '2') },
+    { AV_PIX_FMT_UYVY422, MKTAG('c', 'y', 'u', 'v') }, /* CYUV is also Creative YUV */
     { AV_PIX_FMT_UYYVYY411, MKTAG('Y', '4', '1', '1') },
     { AV_PIX_FMT_GRAY8,   MKTAG('G', 'R', 'E', 'Y') },
     { AV_PIX_FMT_NV12,    MKTAG('N', 'V', '1', '2') },
@@ -83,14 +84,18 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_BGR444LE, MKTAG('B', 'G', 'R', 12) },
     { AV_PIX_FMT_RGB444BE, MKTAG(12 , 'B', 'G', 'R') },
     { AV_PIX_FMT_BGR444BE, MKTAG(12 , 'R', 'G', 'B') },
-    { AV_PIX_FMT_RGBA,     MKTAG('R', 'G', 'B', 'A') },
-    { AV_PIX_FMT_BGRA,     MKTAG('B', 'G', 'R', 'A') },
     { AV_PIX_FMT_RGBA64LE, MKTAG('R', 'B', 'A', 64 ) },
     { AV_PIX_FMT_BGRA64LE, MKTAG('B', 'R', 'A', 64 ) },
     { AV_PIX_FMT_RGBA64BE, MKTAG(64 , 'R', 'B', 'A') },
     { AV_PIX_FMT_BGRA64BE, MKTAG(64 , 'B', 'R', 'A') },
+    { AV_PIX_FMT_RGBA,     MKTAG('R', 'G', 'B', 'A') },
+    { AV_PIX_FMT_RGB0,     MKTAG('R', 'G', 'B',  0 ) },
+    { AV_PIX_FMT_BGRA,     MKTAG('B', 'G', 'R', 'A') },
+    { AV_PIX_FMT_BGR0,     MKTAG('B', 'G', 'R',  0 ) },
     { AV_PIX_FMT_ABGR,     MKTAG('A', 'B', 'G', 'R') },
+    { AV_PIX_FMT_0BGR,     MKTAG( 0 , 'B', 'G', 'R') },
     { AV_PIX_FMT_ARGB,     MKTAG('A', 'R', 'G', 'B') },
+    { AV_PIX_FMT_0RGB,     MKTAG( 0 , 'R', 'G', 'B') },
     { AV_PIX_FMT_RGB24,    MKTAG('R', 'G', 'B', 24 ) },
     { AV_PIX_FMT_BGR24,    MKTAG('B', 'G', 'R', 24 ) },
     { AV_PIX_FMT_YUV411P,  MKTAG('4', '1', '1', 'P') },
@@ -120,6 +125,18 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_YUV422P10BE, MKTAG(10 , 10 , '3', 'Y') },
     { AV_PIX_FMT_YUV444P10LE, MKTAG('Y', '3',  0 , 10 ) },
     { AV_PIX_FMT_YUV444P10BE, MKTAG(10 ,  0 , '3', 'Y') },
+    { AV_PIX_FMT_YUV420P12LE, MKTAG('Y', '3', 11 , 12 ) },
+    { AV_PIX_FMT_YUV420P12BE, MKTAG(12 , 11 , '3', 'Y') },
+    { AV_PIX_FMT_YUV422P12LE, MKTAG('Y', '3', 10 , 12 ) },
+    { AV_PIX_FMT_YUV422P12BE, MKTAG(12 , 10 , '3', 'Y') },
+    { AV_PIX_FMT_YUV444P12LE, MKTAG('Y', '3',  0 , 12 ) },
+    { AV_PIX_FMT_YUV444P12BE, MKTAG(12 ,  0 , '3', 'Y') },
+    { AV_PIX_FMT_YUV420P14LE, MKTAG('Y', '3', 11 , 14 ) },
+    { AV_PIX_FMT_YUV420P14BE, MKTAG(14 , 11 , '3', 'Y') },
+    { AV_PIX_FMT_YUV422P14LE, MKTAG('Y', '3', 10 , 14 ) },
+    { AV_PIX_FMT_YUV422P14BE, MKTAG(14 , 10 , '3', 'Y') },
+    { AV_PIX_FMT_YUV444P14LE, MKTAG('Y', '3',  0 , 14 ) },
+    { AV_PIX_FMT_YUV444P14BE, MKTAG(14 ,  0 , '3', 'Y') },
     { AV_PIX_FMT_YUV420P16LE, MKTAG('Y', '3', 11 , 16 ) },
     { AV_PIX_FMT_YUV420P16BE, MKTAG(16 , 11 , '3', 'Y') },
     { AV_PIX_FMT_YUV422P16LE, MKTAG('Y', '3', 10 , 16 ) },
@@ -127,6 +144,8 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_YUV444P16LE, MKTAG('Y', '3',  0 , 16 ) },
     { AV_PIX_FMT_YUV444P16BE, MKTAG(16 ,  0 , '3', 'Y') },
     { AV_PIX_FMT_YUVA420P,    MKTAG('Y', '4', 11 ,  8 ) },
+    { AV_PIX_FMT_YUVA422P,    MKTAG('Y', '4', 10 ,  8 ) },
+    { AV_PIX_FMT_YUVA444P,    MKTAG('Y', '4',  0 ,  8 ) },
     { AV_PIX_FMT_YA8,         MKTAG('Y', '2',  0 ,  8 ) },
 
     { AV_PIX_FMT_YUVA420P9LE,  MKTAG('Y', '4', 11 ,  9 ) },
@@ -148,10 +167,41 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_YUVA444P16LE, MKTAG('Y', '4',  0 , 16 ) },
     { AV_PIX_FMT_YUVA444P16BE, MKTAG(16 ,  0 , '4', 'Y') },
 
+    { AV_PIX_FMT_GBRP,         MKTAG('G', '3', 00 ,  8 ) },
+    { AV_PIX_FMT_GBRP9LE,      MKTAG('G', '3', 00 ,  9 ) },
+    { AV_PIX_FMT_GBRP9BE,      MKTAG( 9 , 00 , '3', 'G') },
+    { AV_PIX_FMT_GBRP10LE,     MKTAG('G', '3', 00 , 10 ) },
+    { AV_PIX_FMT_GBRP10BE,     MKTAG(10 , 00 , '3', 'G') },
+    { AV_PIX_FMT_GBRP12LE,     MKTAG('G', '3', 00 , 12 ) },
+    { AV_PIX_FMT_GBRP12BE,     MKTAG(12 , 00 , '3', 'G') },
+    { AV_PIX_FMT_GBRP14LE,     MKTAG('G', '3', 00 , 14 ) },
+    { AV_PIX_FMT_GBRP14BE,     MKTAG(14 , 00 , '3', 'G') },
+    { AV_PIX_FMT_GBRP16LE,     MKTAG('G', '3', 00 , 16 ) },
+    { AV_PIX_FMT_GBRP16BE,     MKTAG(16 , 00 , '3', 'G') },
+
+    { AV_PIX_FMT_XYZ12LE,      MKTAG('X', 'Y', 'Z' , 36 ) },
+    { AV_PIX_FMT_XYZ12BE,      MKTAG(36 , 'Z' , 'Y', 'X') },
+
+    { AV_PIX_FMT_BAYER_BGGR8,    MKTAG(0xBA, 'B', 'G', 8   ) },
+    { AV_PIX_FMT_BAYER_BGGR16LE, MKTAG(0xBA, 'B', 'G', 16  ) },
+    { AV_PIX_FMT_BAYER_BGGR16BE, MKTAG(16  , 'G', 'B', 0xBA) },
+    { AV_PIX_FMT_BAYER_RGGB8,    MKTAG(0xBA, 'R', 'G', 8   ) },
+    { AV_PIX_FMT_BAYER_RGGB16LE, MKTAG(0xBA, 'R', 'G', 16  ) },
+    { AV_PIX_FMT_BAYER_RGGB16BE, MKTAG(16  , 'G', 'R', 0xBA) },
+    { AV_PIX_FMT_BAYER_GBRG8,    MKTAG(0xBA, 'G', 'B', 8   ) },
+    { AV_PIX_FMT_BAYER_GBRG16LE, MKTAG(0xBA, 'G', 'B', 16  ) },
+    { AV_PIX_FMT_BAYER_GBRG16BE, MKTAG(16,   'B', 'G', 0xBA) },
+    { AV_PIX_FMT_BAYER_GRBG8,    MKTAG(0xBA, 'G', 'R', 8   ) },
+    { AV_PIX_FMT_BAYER_GRBG16LE, MKTAG(0xBA, 'G', 'R', 16  ) },
+    { AV_PIX_FMT_BAYER_GRBG16BE, MKTAG(16,   'R', 'G', 0xBA) },
+
     /* quicktime */
+    { AV_PIX_FMT_YUV420P, MKTAG('R', '4', '2', '0') }, /* Radius DV YUV PAL */
+    { AV_PIX_FMT_YUV411P, MKTAG('R', '4', '1', '1') }, /* Radius DV YUV NTSC */
     { AV_PIX_FMT_UYVY422, MKTAG('2', 'v', 'u', 'y') },
     { AV_PIX_FMT_UYVY422, MKTAG('2', 'V', 'u', 'y') },
     { AV_PIX_FMT_UYVY422, MKTAG('A', 'V', 'U', 'I') }, /* FIXME merge both fields */
+    { AV_PIX_FMT_UYVY422, MKTAG('b', 'x', 'y', 'v') },
     { AV_PIX_FMT_YUYV422, MKTAG('y', 'u', 'v', '2') },
     { AV_PIX_FMT_YUYV422, MKTAG('y', 'u', 'v', 's') },
     { AV_PIX_FMT_YUYV422, MKTAG('D', 'V', 'O', 'O') }, /* Digital Voodoo SD 8 Bit */
@@ -159,8 +209,10 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_RGB565LE,MKTAG('L', '5', '6', '5') },
     { AV_PIX_FMT_RGB565BE,MKTAG('B', '5', '6', '5') },
     { AV_PIX_FMT_BGR24,   MKTAG('2', '4', 'B', 'G') },
+    { AV_PIX_FMT_BGR24,   MKTAG('b', 'x', 'b', 'g') },
     { AV_PIX_FMT_BGRA,    MKTAG('B', 'G', 'R', 'A') },
     { AV_PIX_FMT_RGBA,    MKTAG('R', 'G', 'B', 'A') },
+    { AV_PIX_FMT_RGB24,   MKTAG('b', 'x', 'r', 'g') },
     { AV_PIX_FMT_ABGR,    MKTAG('A', 'B', 'G', 'R') },
     { AV_PIX_FMT_GRAY16BE,MKTAG('b', '1', '6', 'g') },
     { AV_PIX_FMT_RGB48BE, MKTAG('b', '4', '8', 'r') },
@@ -172,6 +224,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_NONE, 0 },
 };
 
+const struct PixelFormatTag *avpriv_get_raw_pix_fmt_tags(void)
+{
+    return ff_raw_pix_fmt_tags;
+}
+
 unsigned int avcodec_pix_fmt_to_codec_tag(enum AVPixelFormat fmt)
 {
     const PixelFormatTag *tags = ff_raw_pix_fmt_tags;
@@ -182,3 +239,28 @@ unsigned int avcodec_pix_fmt_to_codec_tag(enum AVPixelFormat fmt)
     }
     return 0;
 }
+
+const PixelFormatTag avpriv_pix_fmt_bps_avi[] = {
+    { AV_PIX_FMT_MONOWHITE, 1 },
+    { AV_PIX_FMT_PAL8,    2 },
+    { AV_PIX_FMT_PAL8,    4 },
+    { AV_PIX_FMT_PAL8,    8 },
+    { AV_PIX_FMT_RGB444LE, 12 },
+    { AV_PIX_FMT_RGB555LE, 15 },
+    { AV_PIX_FMT_RGB555LE, 16 },
+    { AV_PIX_FMT_BGR24,  24 },
+    { AV_PIX_FMT_BGRA,   32 },
+    { AV_PIX_FMT_NONE,    0 },
+};
+
+const PixelFormatTag avpriv_pix_fmt_bps_mov[] = {
+    { AV_PIX_FMT_MONOWHITE, 1 },
+    { AV_PIX_FMT_PAL8,      2 },
+    { AV_PIX_FMT_PAL8,      4 },
+    { AV_PIX_FMT_PAL8,      8 },
+    { AV_PIX_FMT_RGB555BE, 16 },
+    { AV_PIX_FMT_RGB24,    24 },
+    { AV_PIX_FMT_ARGB,     32 },
+    { AV_PIX_FMT_MONOWHITE,33 },
+    { AV_PIX_FMT_NONE,      0 },
+};
diff --git a/libavcodec/raw.h b/libavcodec/raw.h
index bf66671d16..24bf4cc55a 100644
--- a/libavcodec/raw.h
+++ b/libavcodec/raw.h
@@ -2,20 +2,20 @@
  * Raw Video Codec
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,12 +28,20 @@
 #define AVCODEC_RAW_H
 
 #include "avcodec.h"
+#include "libavutil/internal.h"
 
 typedef struct PixelFormatTag {
     enum AVPixelFormat pix_fmt;
     unsigned int fourcc;
 } PixelFormatTag;
 
-extern const PixelFormatTag ff_raw_pix_fmt_tags[];
+extern const PixelFormatTag ff_raw_pix_fmt_tags[]; // exposed through avpriv_get_raw_pix_fmt_tags()
+
+const struct PixelFormatTag *avpriv_get_raw_pix_fmt_tags(void);
+
+enum AVPixelFormat avpriv_find_pix_fmt(const PixelFormatTag *tags, unsigned int fourcc);
+
+extern av_export const PixelFormatTag avpriv_pix_fmt_bps_avi[];
+extern av_export const PixelFormatTag avpriv_pix_fmt_bps_mov[];
 
 #endif /* AVCODEC_RAW_H */
diff --git a/libavcodec/rawdec.c b/libavcodec/rawdec.c
index 3b69e49e84..d8d77fceed 100644
--- a/libavcodec/rawdec.c
+++ b/libavcodec/rawdec.c
@@ -2,20 +2,20 @@
  * Raw Video Decoder
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,72 +25,61 @@
  */
 
 #include "avcodec.h"
+#include "bswapdsp.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "raw.h"
+#include "libavutil/avassert.h"
 #include "libavutil/buffer.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 
 typedef struct RawVideoContext {
+    AVClass *av_class;
     AVBufferRef *palette;
     int frame_size;  /* size of the frame in bytes */
     int flip;
     int is_2_4_bpp; // 2 or 4 bpp raw in avi/mov
     int is_yuv2;
+    int is_lt_16bpp; // 16bpp pixfmt and bits_per_coded_sample < 16
+    int tff;
+
+    BswapDSPContext bbdsp;
+    void *bitstream_buf;
+    unsigned int bitstream_buf_size;
 } RawVideoContext;
 
-static const PixelFormatTag pix_fmt_bps_avi[] = {
-    { AV_PIX_FMT_PAL8,    4 },
-    { AV_PIX_FMT_PAL8,    8 },
-    { AV_PIX_FMT_RGB444, 12 },
-    { AV_PIX_FMT_RGB555, 15 },
-    { AV_PIX_FMT_RGB555, 16 },
-    { AV_PIX_FMT_BGR24,  24 },
-    { AV_PIX_FMT_RGB32,  32 },
-    { AV_PIX_FMT_NONE,    0 },
+static const AVOption options[]={
+{"top", "top field first", offsetof(RawVideoContext, tff), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 1, AV_OPT_FLAG_DECODING_PARAM|AV_OPT_FLAG_VIDEO_PARAM},
+{NULL}
 };
 
-static const PixelFormatTag pix_fmt_bps_mov[] = {
-    { AV_PIX_FMT_MONOWHITE, 1 },
-    { AV_PIX_FMT_PAL8,      2 },
-    { AV_PIX_FMT_PAL8,      4 },
-    { AV_PIX_FMT_PAL8,      8 },
-    // FIXME swscale does not support 16 bit in .mov, sample 16bit.mov
-    // http://developer.apple.com/documentation/QuickTime/QTFF/QTFFChap3/qtff3.html
-    { AV_PIX_FMT_RGB555BE, 16 },
-    { AV_PIX_FMT_RGB24,    24 },
-    { AV_PIX_FMT_ARGB,     32 },
-    { AV_PIX_FMT_MONOWHITE,33 },
-    { AV_PIX_FMT_NONE,      0 },
+static const AVClass rawdec_class = {
+    .class_name = "rawdec",
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
-static enum AVPixelFormat find_pix_fmt(const PixelFormatTag *tags,
-                                       unsigned int fourcc)
-{
-    while (tags->pix_fmt >= 0) {
-        if (tags->fourcc == fourcc)
-            return tags->pix_fmt;
-        tags++;
-    }
-    return AV_PIX_FMT_YUV420P;
-}
-
 static av_cold int raw_init_decoder(AVCodecContext *avctx)
 {
     RawVideoContext *context = avctx->priv_data;
     const AVPixFmtDescriptor *desc;
 
-    if (avctx->codec_tag == MKTAG('r', 'a', 'w', ' '))
-        avctx->pix_fmt = find_pix_fmt(pix_fmt_bps_mov,
+    ff_bswapdsp_init(&context->bbdsp);
+
+    if (   avctx->codec_tag == MKTAG('r','a','w',' ')
+        || avctx->codec_tag == MKTAG('N','O','1','6'))
+        avctx->pix_fmt = avpriv_find_pix_fmt(avpriv_pix_fmt_bps_mov,
                                       avctx->bits_per_coded_sample);
     else if (avctx->codec_tag == MKTAG('W', 'R', 'A', 'W'))
-        avctx->pix_fmt = find_pix_fmt(pix_fmt_bps_avi,
+        avctx->pix_fmt = avpriv_find_pix_fmt(avpriv_pix_fmt_bps_avi,
                                       avctx->bits_per_coded_sample);
-    else if (avctx->codec_tag)
-        avctx->pix_fmt = find_pix_fmt(ff_raw_pix_fmt_tags, avctx->codec_tag);
+    else if (avctx->codec_tag && (avctx->codec_tag & 0xFFFFFF) != MKTAG('B','I','T', 0))
+        avctx->pix_fmt = avpriv_find_pix_fmt(ff_raw_pix_fmt_tags, avctx->codec_tag);
     else if (avctx->pix_fmt == AV_PIX_FMT_NONE && avctx->bits_per_coded_sample)
-        avctx->pix_fmt = find_pix_fmt(pix_fmt_bps_avi,
+        avctx->pix_fmt = avpriv_find_pix_fmt(avpriv_pix_fmt_bps_avi,
                                       avctx->bits_per_coded_sample);
 
     desc = av_pix_fmt_desc_get(avctx->pix_fmt);
@@ -109,15 +98,9 @@ static av_cold int raw_init_decoder(AVCodecContext *avctx)
             memset(context->palette->data, 0, AVPALETTE_SIZE);
     }
 
-    context->frame_size = avpicture_get_size(avctx->pix_fmt, avctx->width,
-                                             avctx->height);
-    if ((avctx->bits_per_coded_sample == 4 || avctx->bits_per_coded_sample == 2) &&
-        avctx->pix_fmt == AV_PIX_FMT_PAL8 &&
-       (!avctx->codec_tag || avctx->codec_tag == MKTAG('r','a','w',' ')))
-        context->is_2_4_bpp = 1;
-
     if ((avctx->extradata_size >= 9 &&
          !memcmp(avctx->extradata + avctx->extradata_size - 9, "BottomUp", 9)) ||
+        avctx->codec_tag == MKTAG('c','y','u','v') ||
         avctx->codec_tag == MKTAG(3, 0, 0, 0) ||
         avctx->codec_tag == MKTAG('W','R','A','W'))
         context->flip = 1;
@@ -135,6 +118,34 @@ static void flip(AVCodecContext *avctx, AVPicture *picture)
     picture->linesize[0] *= -1;
 }
 
+/*
+ * Scale sample to 16-bit resolution
+ */
+#define SCALE16(x, bits) (((x) << (16 - (bits))) | ((x) >> (2 * (bits) - 16)))
+
+/**
+ * Scale buffer to 16 bits per coded sample resolution
+ */
+#define MKSCALE16(name, r16, w16) \
+static void name(AVCodecContext *avctx, uint8_t * dst, const uint8_t *buf, int buf_size, int packed) \
+{ \
+    int i; \
+    if (!packed) { \
+        for (i = 0; i + 1 < buf_size; i += 2) \
+            w16(dst + i, SCALE16(r16(buf + i), avctx->bits_per_coded_sample)); \
+    } else { \
+        GetBitContext gb; \
+        init_get_bits(&gb, buf, buf_size * 8); \
+        for (i = 0; i < avctx->width * avctx->height; i++) { \
+            int sample = get_bits(&gb, avctx->bits_per_coded_sample); \
+            w16(dst + i*2, SCALE16(sample, avctx->bits_per_coded_sample)); \
+        } \
+   } \
+}
+
+MKSCALE16(scale16be, AV_RB16, AV_WB16)
+MKSCALE16(scale16le, AV_RL16, AV_WL16)
+
 static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
                       AVPacket *avpkt)
 {
@@ -142,12 +153,30 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
     RawVideoContext *context       = avctx->priv_data;
     const uint8_t *buf             = avpkt->data;
     int buf_size                   = avpkt->size;
-    int need_copy                  = !avpkt->buf || context->is_2_4_bpp || context->is_yuv2;
-    int res;
+    int linesize_align             = 4;
+    int res, len;
+    int need_copy;
 
     AVFrame   *frame   = data;
     AVPicture *picture = data;
 
+    if ((avctx->bits_per_coded_sample == 4 || avctx->bits_per_coded_sample == 2) &&
+        avctx->pix_fmt == AV_PIX_FMT_PAL8 &&
+       (!avctx->codec_tag || avctx->codec_tag == MKTAG('r','a','w',' '))) {
+        context->is_2_4_bpp = 1;
+        context->frame_size = avpicture_get_size(avctx->pix_fmt,
+                                                 FFALIGN(avctx->width, 16),
+                                                 avctx->height);
+    } else {
+        context->is_lt_16bpp = av_get_bits_per_pixel(desc) == 16 && avctx->bits_per_coded_sample && avctx->bits_per_coded_sample < 16;
+        context->frame_size = avpicture_get_size(avctx->pix_fmt, avctx->width,
+                                                 avctx->height);
+    }
+    if (context->frame_size < 0)
+        return context->frame_size;
+
+    need_copy = !avpkt->buf || context->is_2_4_bpp || context->is_yuv2 || context->is_lt_16bpp;
+
     frame->pict_type        = AV_PICTURE_TYPE_I;
     frame->key_frame        = 1;
 
@@ -155,12 +184,19 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
     if (res < 0)
         return res;
 
-    if (buf_size < context->frame_size - (avctx->pix_fmt == AV_PIX_FMT_PAL8 ?
-                                          AVPALETTE_SIZE : 0))
-        return -1;
+    av_frame_set_pkt_pos     (frame, avctx->internal->pkt->pos);
+    av_frame_set_pkt_duration(frame, avctx->internal->pkt->duration);
+
+    if (context->tff >= 0) {
+        frame->interlaced_frame = 1;
+        frame->top_field_first  = context->tff;
+    }
+
+    if ((res = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
+        return res;
 
     if (need_copy)
-        frame->buf[0] = av_buffer_alloc(context->frame_size);
+        frame->buf[0] = av_buffer_alloc(FFMAX(context->frame_size, buf_size));
     else
         frame->buf[0] = av_buffer_ref(avpkt->buf);
     if (!frame->buf[0])
@@ -172,21 +208,48 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
         uint8_t *dst = frame->buf[0]->data;
         buf_size = context->frame_size - AVPALETTE_SIZE;
         if (avctx->bits_per_coded_sample == 4) {
-            for (i = 0; 2 * i + 1 < buf_size; i++) {
+            for (i = 0; 2 * i + 1 < buf_size && i<avpkt->size; i++) {
                 dst[2 * i + 0] = buf[i] >> 4;
                 dst[2 * i + 1] = buf[i] & 15;
             }
+            linesize_align = 8;
         } else {
-            for (i = 0; 4 * i + 3 < buf_size; i++) {
+            av_assert0(avctx->bits_per_coded_sample == 2);
+            for (i = 0; 4 * i + 3 < buf_size && i<avpkt->size; i++) {
                 dst[4 * i + 0] = buf[i] >> 6;
                 dst[4 * i + 1] = buf[i] >> 4 & 3;
                 dst[4 * i + 2] = buf[i] >> 2 & 3;
                 dst[4 * i + 3] = buf[i]      & 3;
             }
+            linesize_align = 16;
+        }
+        buf = dst;
+    } else if (context->is_lt_16bpp) {
+        uint8_t *dst = frame->buf[0]->data;
+        int packed = (avctx->codec_tag & 0xFFFFFF) == MKTAG('B','I','T', 0);
+        int swap   =  avctx->codec_tag >> 24;
+
+        if (packed && swap) {
+            av_fast_padded_malloc(&context->bitstream_buf, &context->bitstream_buf_size, buf_size);
+            if (!context->bitstream_buf)
+                return AVERROR(ENOMEM);
+            if (swap == 16)
+                context->bbdsp.bswap16_buf(context->bitstream_buf, (const uint16_t*)buf, buf_size / 2);
+            else if (swap == 32)
+                context->bbdsp.bswap_buf(context->bitstream_buf, (const uint32_t*)buf, buf_size / 4);
+            else
+                return AVERROR_INVALIDDATA;
+            buf = context->bitstream_buf;
         }
+
+        if (desc->flags & AV_PIX_FMT_FLAG_BE)
+            scale16be(avctx, dst, buf, buf_size, packed);
+        else
+            scale16le(avctx, dst, buf, buf_size, packed);
+
         buf = dst;
     } else if (need_copy) {
-        memcpy(frame->buf[0]->data, buf, FFMIN(buf_size, context->frame_size));
+        memcpy(frame->buf[0]->data, buf, buf_size);
         buf = frame->buf[0]->data;
     }
 
@@ -194,9 +257,18 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
         avctx->codec_tag == MKTAG('A', 'V', 'u', 'p'))
         buf += buf_size - context->frame_size;
 
+    len = context->frame_size - (avctx->pix_fmt==AV_PIX_FMT_PAL8 ? AVPALETTE_SIZE : 0);
+    if (buf_size < len && (avctx->codec_tag & 0xFFFFFF) != MKTAG('B','I','T', 0)) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid buffer size, packet size %d < expected frame_size %d\n", buf_size, len);
+        av_buffer_unref(&frame->buf[0]);
+        return AVERROR(EINVAL);
+    }
+
     if ((res = avpicture_fill(picture, buf, avctx->pix_fmt,
-                              avctx->width, avctx->height)) < 0)
+                              avctx->width, avctx->height)) < 0) {
+        av_buffer_unref(&frame->buf[0]);
         return res;
+    }
 
     if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
         const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE,
@@ -205,20 +277,44 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
         if (pal) {
             av_buffer_unref(&context->palette);
             context->palette = av_buffer_alloc(AVPALETTE_SIZE);
-            if (!context->palette)
+            if (!context->palette) {
+                av_buffer_unref(&frame->buf[0]);
                 return AVERROR(ENOMEM);
+            }
             memcpy(context->palette->data, pal, AVPALETTE_SIZE);
             frame->palette_has_changed = 1;
         }
     }
 
+    if ((avctx->pix_fmt==AV_PIX_FMT_BGR24    ||
+        avctx->pix_fmt==AV_PIX_FMT_GRAY8    ||
+        avctx->pix_fmt==AV_PIX_FMT_RGB555LE ||
+        avctx->pix_fmt==AV_PIX_FMT_RGB555BE ||
+        avctx->pix_fmt==AV_PIX_FMT_RGB565LE ||
+        avctx->pix_fmt==AV_PIX_FMT_MONOWHITE ||
+        avctx->pix_fmt==AV_PIX_FMT_PAL8) &&
+        FFALIGN(frame->linesize[0], linesize_align) * avctx->height <= buf_size)
+        frame->linesize[0] = FFALIGN(frame->linesize[0], linesize_align);
+
+    if (avctx->pix_fmt == AV_PIX_FMT_NV12 && avctx->codec_tag == MKTAG('N', 'V', '1', '2') &&
+        FFALIGN(frame->linesize[0], linesize_align) * avctx->height +
+        FFALIGN(frame->linesize[1], linesize_align) * ((avctx->height + 1) / 2) <= buf_size) {
+        int la0 = FFALIGN(frame->linesize[0], linesize_align);
+        frame->data[1] += (la0 - frame->linesize[0]) * avctx->height;
+        frame->linesize[0] = la0;
+        frame->linesize[1] = FFALIGN(frame->linesize[1], linesize_align);
+    }
+
     if ((avctx->pix_fmt == AV_PIX_FMT_PAL8 && buf_size < context->frame_size) ||
         (desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL)) {
         frame->buf[1]  = av_buffer_ref(context->palette);
-        if (!frame->buf[1])
+        if (!frame->buf[1]) {
+            av_buffer_unref(&frame->buf[0]);
             return AVERROR(ENOMEM);
+        }
         frame->data[1] = frame->buf[1]->data;
     }
+
     if (avctx->pix_fmt == AV_PIX_FMT_BGR24 &&
         ((frame->linesize[0] + 3) & ~3) * avctx->height <= buf_size)
         frame->linesize[0] = (frame->linesize[0] + 3) & ~3;
@@ -232,6 +328,11 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
         avctx->codec_tag == MKTAG('Y', 'V', 'U', '9'))
         FFSWAP(uint8_t *, picture->data[1], picture->data[2]);
 
+    if (avctx->codec_tag == AV_RL32("I420") && (avctx->width+1)*(avctx->height+1) * 3/2 == buf_size) {
+        picture->data[1] = picture->data[1] +  (avctx->width+1)*(avctx->height+1) -avctx->width*avctx->height;
+        picture->data[2] = picture->data[2] + ((avctx->width+1)*(avctx->height+1) -avctx->width*avctx->height)*5/4;
+    }
+
     if (avctx->codec_tag == AV_RL32("yuv2") &&
         avctx->pix_fmt   == AV_PIX_FMT_YUYV422) {
         int x, y;
@@ -243,6 +344,12 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
         }
     }
 
+    if (avctx->field_order > AV_FIELD_PROGRESSIVE) { /* we have interlaced material flagged in container */
+        frame->interlaced_frame = 1;
+        if (avctx->field_order == AV_FIELD_TT || avctx->field_order == AV_FIELD_TB)
+            frame->top_field_first = 1;
+    }
+
     *got_frame = 1;
     return buf_size;
 }
@@ -264,4 +371,6 @@ AVCodec ff_rawvideo_decoder = {
     .init           = raw_init_decoder,
     .close          = raw_close_decoder,
     .decode         = raw_decode,
+    .priv_class     = &rawdec_class,
+    .capabilities   = AV_CODEC_CAP_PARAM_CHANGE,
 };
diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
index cc55b3a875..c23225fe60 100644
--- a/libavcodec/rawenc.c
+++ b/libavcodec/rawenc.c
@@ -2,20 +2,20 @@
  * Raw Video Encoder
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,6 @@ static av_cold int raw_encode_init(AVCodecContext *avctx)
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
     avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
     avctx->bits_per_coded_sample = av_get_bits_per_pixel(desc);
@@ -50,21 +49,21 @@ FF_ENABLE_DEPRECATION_WARNINGS
 static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
                       const AVFrame *frame, int *got_packet)
 {
-    int ret = avpicture_get_size(avctx->pix_fmt, avctx->width, avctx->height);
+    int ret = avpicture_get_size(frame->format, frame->width, frame->height);
 
     if (ret < 0)
         return ret;
 
-    if ((ret = ff_alloc_packet(pkt, ret)) < 0)
+    if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
         return ret;
-    if ((ret = avpicture_layout((const AVPicture *)frame, avctx->pix_fmt, avctx->width,
-                                avctx->height, pkt->data, pkt->size)) < 0)
+    if ((ret = avpicture_layout((const AVPicture *)frame, frame->format, frame->width,
+                                frame->height, pkt->data, pkt->size)) < 0)
         return ret;
 
     if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 &&
-       avctx->pix_fmt   == AV_PIX_FMT_YUYV422) {
+       frame->format   == AV_PIX_FMT_YUYV422) {
         int x;
-        for(x = 1; x < avctx->height*avctx->width*2; x += 2)
+        for(x = 1; x < frame->height*frame->width*2; x += 2)
             pkt->data[x] ^= 0x80;
     }
     pkt->flags |= AV_PKT_FLAG_KEY;
diff --git a/libavcodec/rdft.c b/libavcodec/rdft.c
index 19652537f8..c318aa8394 100644
--- a/libavcodec/rdft.c
+++ b/libavcodec/rdft.c
@@ -2,20 +2,20 @@
  * (I)RDFT transforms
  * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include <stdlib.h>
@@ -99,16 +99,17 @@ static void rdft_calc_c(RDFTContext *s, FFTSample *data)
 av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans)
 {
     int n = 1 << nbits;
+    int ret;
 
     s->nbits           = nbits;
     s->inverse         = trans == IDFT_C2R || trans == DFT_C2R;
     s->sign_convention = trans == IDFT_R2C || trans == DFT_C2R ? 1 : -1;
 
     if (nbits < 4 || nbits > 16)
-        return -1;
+        return AVERROR(EINVAL);
 
-    if (ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C) < 0)
-        return -1;
+    if ((ret = ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C)) < 0)
+        return ret;
 
     ff_init_ff_cos_tabs(nbits);
     s->tcos = ff_cos_tabs[nbits];
diff --git a/libavcodec/rdft.h b/libavcodec/rdft.h
index 8ff620fb59..37c40e7c80 100644
--- a/libavcodec/rdft.h
+++ b/libavcodec/rdft.h
@@ -2,24 +2,24 @@
  * (I)RDFT transforms
  * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_RDFT_H
+#if !defined(AVCODEC_RDFT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
 #define AVCODEC_RDFT_H
 
 #include "config.h"
diff --git a/libavcodec/realtextdec.c b/libavcodec/realtextdec.c
new file mode 100644
index 0000000000..870953bf3c
--- /dev/null
+++ b/libavcodec/realtextdec.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * RealText subtitle decoder
+ * @see http://service.real.com/help/library/guides/ProductionGuide/prodguide/htmfiles/realtext.htm
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+
+static int rt_event_to_ass(AVBPrint *buf, const char *p)
+{
+    int prev_chr_is_space = 1;
+
+    while (*p) {
+        if (*p != '<') {
+            if (!av_isspace(*p))
+                av_bprint_chars(buf, *p, 1);
+            else if (!prev_chr_is_space)
+                av_bprint_chars(buf, ' ', 1);
+            prev_chr_is_space = av_isspace(*p);
+        } else {
+            const char *end = strchr(p, '>');
+            if (!end)
+                break;
+            if (!av_strncasecmp(p, "<br/>", 5) ||
+                !av_strncasecmp(p, "<br>",  4)) {
+                av_bprintf(buf, "\\N");
+            }
+            p = end;
+        }
+        p++;
+    }
+    return 0;
+}
+
+static int realtext_decode_frame(AVCodecContext *avctx,
+                                 void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret = 0;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    AVBPrint buf;
+
+    av_bprint_init(&buf, 0, 4096);
+    // note: no need to rescale pts & duration since they are in the same
+    // timebase as ASS (1/100)
+    if (ptr && avpkt->size > 0 && !rt_event_to_ass(&buf, ptr))
+        ret = ff_ass_add_rect_bprint(sub, &buf, avpkt->pts, avpkt->duration);
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+AVCodec ff_realtext_decoder = {
+    .name           = "realtext",
+    .long_name      = NULL_IF_CONFIG_SMALL("RealText subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_REALTEXT,
+    .decode         = realtext_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+};
diff --git a/libavcodec/rectangle.h b/libavcodec/rectangle.h
index 73e8b0abd1..594a760801 100644
--- a/libavcodec/rectangle.h
+++ b/libavcodec/rectangle.h
@@ -2,20 +2,20 @@
  * rectangle filling function
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,9 @@
 #ifndef AVCODEC_RECTANGLE_H
 #define AVCODEC_RECTANGLE_H
 
-#include <assert.h>
 #include "config.h"
 #include "libavutil/common.h"
+#include "libavutil/avassert.h"
 
 /**
  * fill a rectangle.
@@ -40,13 +40,14 @@
  */
 static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
     uint8_t *p= (uint8_t*)vp;
-    assert(size==1 || size==2 || size==4);
-    assert(w<=4);
+    av_assert2(size==1 || size==2 || size==4);
+    av_assert2(w<=4);
 
     w      *= size;
     stride *= size;
 
-    assert((stride&(w-1))==0);
+    av_assert2((((long)vp)&(FFMIN(w, 8<<(HAVE_NEON|ARCH_PPC|HAVE_MMX))-1)) == 0);
+    av_assert2((stride&(w-1))==0);
     if(w==2){
         const uint16_t v= size==4 ? val : val*0x0101;
         *(uint16_t*)(p + 0*stride)= v;
@@ -116,8 +117,8 @@ static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride,
         *(uint32_t*)(p +12+3*stride)= val;
 #endif
     }else
-        assert(0);
-    assert(h==4);
+        av_assert2(0);
+    av_assert2(h==4);
 }
 
 #endif /* AVCODEC_RECTANGLE_H */
diff --git a/libavcodec/remove_extradata_bsf.c b/libavcodec/remove_extradata_bsf.c
index b82c867f43..6bb3576cea 100644
--- a/libavcodec/remove_extradata_bsf.c
+++ b/libavcodec/remove_extradata_bsf.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,6 @@ static int remove_extradata(AVBitStreamFilterContext *bsfc, AVCodecContext *avct
 }
 
 AVBitStreamFilter ff_remove_extradata_bsf={
-    "remove_extra",
-    0,
-    remove_extradata,
+    .name   = "remove_extra",
+    .filter = remove_extradata,
 };
diff --git a/libavcodec/resample.c b/libavcodec/resample.c
new file mode 100644
index 0000000000..0f5ee84942
--- /dev/null
+++ b/libavcodec/resample.c
@@ -0,0 +1,445 @@
+/*
+ * samplerate conversion for both audio and video
+ * Copyright (c) 2000 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * samplerate conversion for both audio and video
+ */
+
+#include <string.h>
+
+#include "avcodec.h"
+#include "audioconvert.h"
+#include "libavutil/opt.h"
+#include "libavutil/mem.h"
+#include "libavutil/samplefmt.h"
+
+#if FF_API_AVCODEC_RESAMPLE
+FF_DISABLE_DEPRECATION_WARNINGS
+
+#define MAX_CHANNELS 8
+
+struct AVResampleContext;
+
+static const char *context_to_name(void *ptr)
+{
+    return "audioresample";
+}
+
+static const AVOption options[] = {{NULL}};
+static const AVClass audioresample_context_class = {
+    "ReSampleContext", context_to_name, options, LIBAVUTIL_VERSION_INT
+};
+
+struct ReSampleContext {
+    struct AVResampleContext *resample_context;
+    short *temp[MAX_CHANNELS];
+    int temp_len;
+    float ratio;
+    /* channel convert */
+    int input_channels, output_channels, filter_channels;
+    AVAudioConvert *convert_ctx[2];
+    enum AVSampleFormat sample_fmt[2]; ///< input and output sample format
+    unsigned sample_size[2];           ///< size of one sample in sample_fmt
+    short *buffer[2];                  ///< buffers used for conversion to S16
+    unsigned buffer_size[2];           ///< sizes of allocated buffers
+};
+
+/* n1: number of samples */
+static void stereo_to_mono(short *output, short *input, int n1)
+{
+    short *p, *q;
+    int n = n1;
+
+    p = input;
+    q = output;
+    while (n >= 4) {
+        q[0] = (p[0] + p[1]) >> 1;
+        q[1] = (p[2] + p[3]) >> 1;
+        q[2] = (p[4] + p[5]) >> 1;
+        q[3] = (p[6] + p[7]) >> 1;
+        q += 4;
+        p += 8;
+        n -= 4;
+    }
+    while (n > 0) {
+        q[0] = (p[0] + p[1]) >> 1;
+        q++;
+        p += 2;
+        n--;
+    }
+}
+
+/* n1: number of samples */
+static void mono_to_stereo(short *output, short *input, int n1)
+{
+    short *p, *q;
+    int n = n1;
+    int v;
+
+    p = input;
+    q = output;
+    while (n >= 4) {
+        v = p[0]; q[0] = v; q[1] = v;
+        v = p[1]; q[2] = v; q[3] = v;
+        v = p[2]; q[4] = v; q[5] = v;
+        v = p[3]; q[6] = v; q[7] = v;
+        q += 8;
+        p += 4;
+        n -= 4;
+    }
+    while (n > 0) {
+        v = p[0]; q[0] = v; q[1] = v;
+        q += 2;
+        p += 1;
+        n--;
+    }
+}
+
+/*
+5.1 to stereo input: [fl, fr, c, lfe, rl, rr]
+- Left = front_left + rear_gain * rear_left + center_gain * center
+- Right = front_right + rear_gain * rear_right + center_gain * center
+Where rear_gain is usually around 0.5-1.0 and
+      center_gain is almost always 0.7 (-3 dB)
+*/
+static void surround_to_stereo(short **output, short *input, int channels, int samples)
+{
+    int i;
+    short l, r;
+
+    for (i = 0; i < samples; i++) {
+        int fl,fr,c,rl,rr;
+        fl = input[0];
+        fr = input[1];
+        c = input[2];
+        // lfe = input[3];
+        rl = input[4];
+        rr = input[5];
+
+        l = av_clip_int16(fl + (0.5 * rl) + (0.7 * c));
+        r = av_clip_int16(fr + (0.5 * rr) + (0.7 * c));
+
+        /* output l & r. */
+        *output[0]++ = l;
+        *output[1]++ = r;
+
+        /* increment input. */
+        input += channels;
+    }
+}
+
+static void deinterleave(short **output, short *input, int channels, int samples)
+{
+    int i, j;
+
+    for (i = 0; i < samples; i++) {
+        for (j = 0; j < channels; j++) {
+            *output[j]++ = *input++;
+        }
+    }
+}
+
+static void interleave(short *output, short **input, int channels, int samples)
+{
+    int i, j;
+
+    for (i = 0; i < samples; i++) {
+        for (j = 0; j < channels; j++) {
+            *output++ = *input[j]++;
+        }
+    }
+}
+
+static void ac3_5p1_mux(short *output, short *input1, short *input2, int n)
+{
+    int i;
+    short l, r;
+
+    for (i = 0; i < n; i++) {
+        l = *input1++;
+        r = *input2++;
+        *output++ = l;                  /* left */
+        *output++ = (l / 2) + (r / 2);  /* center */
+        *output++ = r;                  /* right */
+        *output++ = 0;                  /* left surround */
+        *output++ = 0;                  /* right surroud */
+        *output++ = 0;                  /* low freq */
+    }
+}
+
+#define SUPPORT_RESAMPLE(ch1, ch2, ch3, ch4, ch5, ch6, ch7, ch8) \
+    ch8<<7 | ch7<<6 | ch6<<5 | ch5<<4 | ch4<<3 | ch3<<2 | ch2<<1 | ch1<<0
+
+static const uint8_t supported_resampling[MAX_CHANNELS] = {
+    // output ch:    1  2  3  4  5  6  7  8
+    SUPPORT_RESAMPLE(1, 1, 0, 0, 0, 0, 0, 0), // 1 input channel
+    SUPPORT_RESAMPLE(1, 1, 0, 0, 0, 1, 0, 0), // 2 input channels
+    SUPPORT_RESAMPLE(0, 0, 1, 0, 0, 0, 0, 0), // 3 input channels
+    SUPPORT_RESAMPLE(0, 0, 0, 1, 0, 0, 0, 0), // 4 input channels
+    SUPPORT_RESAMPLE(0, 0, 0, 0, 1, 0, 0, 0), // 5 input channels
+    SUPPORT_RESAMPLE(0, 1, 0, 0, 0, 1, 0, 0), // 6 input channels
+    SUPPORT_RESAMPLE(0, 0, 0, 0, 0, 0, 1, 0), // 7 input channels
+    SUPPORT_RESAMPLE(0, 0, 0, 0, 0, 0, 0, 1), // 8 input channels
+};
+
+ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
+                                        int output_rate, int input_rate,
+                                        enum AVSampleFormat sample_fmt_out,
+                                        enum AVSampleFormat sample_fmt_in,
+                                        int filter_length, int log2_phase_count,
+                                        int linear, double cutoff)
+{
+    ReSampleContext *s;
+
+    if (input_channels > MAX_CHANNELS) {
+        av_log(NULL, AV_LOG_ERROR,
+               "Resampling with input channels greater than %d is unsupported.\n",
+               MAX_CHANNELS);
+        return NULL;
+    }
+    if (!(supported_resampling[input_channels-1] & (1<<(output_channels-1)))) {
+        int i;
+        av_log(NULL, AV_LOG_ERROR, "Unsupported audio resampling. Allowed "
+               "output channels for %d input channel%s", input_channels,
+               input_channels > 1 ? "s:" : ":");
+        for (i = 0; i < MAX_CHANNELS; i++)
+            if (supported_resampling[input_channels-1] & (1<<i))
+                av_log(NULL, AV_LOG_ERROR, " %d", i + 1);
+        av_log(NULL, AV_LOG_ERROR, "\n");
+        return NULL;
+    }
+
+    s = av_mallocz(sizeof(ReSampleContext));
+    if (!s) {
+        av_log(NULL, AV_LOG_ERROR, "Can't allocate memory for resample context.\n");
+        return NULL;
+    }
+
+    s->ratio = (float)output_rate / (float)input_rate;
+
+    s->input_channels = input_channels;
+    s->output_channels = output_channels;
+
+    s->filter_channels = s->input_channels;
+    if (s->output_channels < s->filter_channels)
+        s->filter_channels = s->output_channels;
+
+    s->sample_fmt[0]  = sample_fmt_in;
+    s->sample_fmt[1]  = sample_fmt_out;
+    s->sample_size[0] = av_get_bytes_per_sample(s->sample_fmt[0]);
+    s->sample_size[1] = av_get_bytes_per_sample(s->sample_fmt[1]);
+
+    if (s->sample_fmt[0] != AV_SAMPLE_FMT_S16) {
+        if (!(s->convert_ctx[0] = av_audio_convert_alloc(AV_SAMPLE_FMT_S16, 1,
+                                                         s->sample_fmt[0], 1, NULL, 0))) {
+            av_log(s, AV_LOG_ERROR,
+                   "Cannot convert %s sample format to s16 sample format\n",
+                   av_get_sample_fmt_name(s->sample_fmt[0]));
+            av_free(s);
+            return NULL;
+        }
+    }
+
+    if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) {
+        if (!(s->convert_ctx[1] = av_audio_convert_alloc(s->sample_fmt[1], 1,
+                                                         AV_SAMPLE_FMT_S16, 1, NULL, 0))) {
+            av_log(s, AV_LOG_ERROR,
+                   "Cannot convert s16 sample format to %s sample format\n",
+                   av_get_sample_fmt_name(s->sample_fmt[1]));
+            av_audio_convert_free(s->convert_ctx[0]);
+            av_free(s);
+            return NULL;
+        }
+    }
+
+    s->resample_context = av_resample_init(output_rate, input_rate,
+                                           filter_length, log2_phase_count,
+                                           linear, cutoff);
+
+    *(const AVClass**)s->resample_context = &audioresample_context_class;
+
+    return s;
+}
+
+/* resample audio. 'nb_samples' is the number of input samples */
+/* XXX: optimize it ! */
+int audio_resample(ReSampleContext *s, short *output, short *input, int nb_samples)
+{
+    int i, nb_samples1;
+    short *bufin[MAX_CHANNELS];
+    short *bufout[MAX_CHANNELS];
+    short *buftmp2[MAX_CHANNELS], *buftmp3[MAX_CHANNELS];
+    short *output_bak = NULL;
+    int lenout;
+
+    if (s->input_channels == s->output_channels && s->ratio == 1.0 && 0) {
+        /* nothing to do */
+        memcpy(output, input, nb_samples * s->input_channels * sizeof(short));
+        return nb_samples;
+    }
+
+    if (s->sample_fmt[0] != AV_SAMPLE_FMT_S16) {
+        int istride[1] = { s->sample_size[0] };
+        int ostride[1] = { 2 };
+        const void *ibuf[1] = { input };
+        void       *obuf[1];
+        unsigned input_size = nb_samples * s->input_channels * 2;
+
+        if (!s->buffer_size[0] || s->buffer_size[0] < input_size) {
+            av_free(s->buffer[0]);
+            s->buffer_size[0] = input_size;
+            s->buffer[0] = av_malloc(s->buffer_size[0]);
+            if (!s->buffer[0]) {
+                av_log(s->resample_context, AV_LOG_ERROR, "Could not allocate buffer\n");
+                return 0;
+            }
+        }
+
+        obuf[0] = s->buffer[0];
+
+        if (av_audio_convert(s->convert_ctx[0], obuf, ostride,
+                             ibuf, istride, nb_samples * s->input_channels) < 0) {
+            av_log(s->resample_context, AV_LOG_ERROR,
+                   "Audio sample format conversion failed\n");
+            return 0;
+        }
+
+        input = s->buffer[0];
+    }
+
+    lenout= 2*s->output_channels*nb_samples * s->ratio + 16;
+
+    if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) {
+        int out_size = lenout * av_get_bytes_per_sample(s->sample_fmt[1]) *
+                       s->output_channels;
+        output_bak = output;
+
+        if (!s->buffer_size[1] || s->buffer_size[1] < out_size) {
+            av_free(s->buffer[1]);
+            s->buffer_size[1] = out_size;
+            s->buffer[1] = av_malloc(s->buffer_size[1]);
+            if (!s->buffer[1]) {
+                av_log(s->resample_context, AV_LOG_ERROR, "Could not allocate buffer\n");
+                return 0;
+            }
+        }
+
+        output = s->buffer[1];
+    }
+
+    /* XXX: move those malloc to resample init code */
+    for (i = 0; i < s->filter_channels; i++) {
+        bufin[i] = av_malloc_array((nb_samples + s->temp_len), sizeof(short));
+        bufout[i] = av_malloc_array(lenout, sizeof(short));
+
+        if (!bufin[i] || !bufout[i]) {
+            av_log(s->resample_context, AV_LOG_ERROR, "Could not allocate buffer\n");
+            nb_samples1 = 0;
+            goto fail;
+        }
+
+        memcpy(bufin[i], s->temp[i], s->temp_len * sizeof(short));
+        buftmp2[i] = bufin[i] + s->temp_len;
+    }
+
+    if (s->input_channels == 2 && s->output_channels == 1) {
+        buftmp3[0] = output;
+        stereo_to_mono(buftmp2[0], input, nb_samples);
+    } else if (s->output_channels >= 2 && s->input_channels == 1) {
+        buftmp3[0] = bufout[0];
+        memcpy(buftmp2[0], input, nb_samples * sizeof(short));
+    } else if (s->input_channels == 6 && s->output_channels ==2) {
+        buftmp3[0] = bufout[0];
+        buftmp3[1] = bufout[1];
+        surround_to_stereo(buftmp2, input, s->input_channels, nb_samples);
+    } else if (s->output_channels >= s->input_channels && s->input_channels >= 2) {
+        for (i = 0; i < s->input_channels; i++) {
+            buftmp3[i] = bufout[i];
+        }
+        deinterleave(buftmp2, input, s->input_channels, nb_samples);
+    } else {
+        buftmp3[0] = output;
+        memcpy(buftmp2[0], input, nb_samples * sizeof(short));
+    }
+
+    nb_samples += s->temp_len;
+
+    /* resample each channel */
+    nb_samples1 = 0; /* avoid warning */
+    for (i = 0; i < s->filter_channels; i++) {
+        int consumed;
+        int is_last = i + 1 == s->filter_channels;
+
+        nb_samples1 = av_resample(s->resample_context, buftmp3[i], bufin[i],
+                                  &consumed, nb_samples, lenout, is_last);
+        s->temp_len = nb_samples - consumed;
+        s->temp[i] = av_realloc_array(s->temp[i], s->temp_len, sizeof(short));
+        memcpy(s->temp[i], bufin[i] + consumed, s->temp_len * sizeof(short));
+    }
+
+    if (s->output_channels == 2 && s->input_channels == 1) {
+        mono_to_stereo(output, buftmp3[0], nb_samples1);
+    } else if (s->output_channels == 6 && s->input_channels == 2) {
+        ac3_5p1_mux(output, buftmp3[0], buftmp3[1], nb_samples1);
+    } else if ((s->output_channels == s->input_channels && s->input_channels >= 2) ||
+               (s->output_channels == 2 && s->input_channels == 6)) {
+        interleave(output, buftmp3, s->output_channels, nb_samples1);
+    }
+
+    if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) {
+        int istride[1] = { 2 };
+        int ostride[1] = { s->sample_size[1] };
+        const void *ibuf[1] = { output };
+        void       *obuf[1] = { output_bak };
+
+        if (av_audio_convert(s->convert_ctx[1], obuf, ostride,
+                             ibuf, istride, nb_samples1 * s->output_channels) < 0) {
+            av_log(s->resample_context, AV_LOG_ERROR,
+                   "Audio sample format conversion failed\n");
+            return 0;
+        }
+    }
+
+fail:
+    for (i = 0; i < s->filter_channels; i++) {
+        av_free(bufin[i]);
+        av_free(bufout[i]);
+    }
+
+    return nb_samples1;
+}
+
+void audio_resample_close(ReSampleContext *s)
+{
+    int i;
+    av_resample_close(s->resample_context);
+    for (i = 0; i < s->filter_channels; i++)
+        av_freep(&s->temp[i]);
+    av_freep(&s->buffer[0]);
+    av_freep(&s->buffer[1]);
+    av_audio_convert_free(s->convert_ctx[0]);
+    av_audio_convert_free(s->convert_ctx[1]);
+    av_free(s);
+}
+
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
diff --git a/libavcodec/resample2.c b/libavcodec/resample2.c
new file mode 100644
index 0000000000..cd9fe1ce56
--- /dev/null
+++ b/libavcodec/resample2.c
@@ -0,0 +1,319 @@
+/*
+ * audio resampling
+ * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * audio resampling
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "libavutil/avassert.h"
+#include "avcodec.h"
+#include "libavutil/common.h"
+
+#if FF_API_AVCODEC_RESAMPLE
+
+#ifndef CONFIG_RESAMPLE_HP
+#define FILTER_SHIFT 15
+
+#define FELEM int16_t
+#define FELEM2 int32_t
+#define FELEML int64_t
+#define FELEM_MAX INT16_MAX
+#define FELEM_MIN INT16_MIN
+#define WINDOW_TYPE 9
+#elif !defined(CONFIG_RESAMPLE_AUDIOPHILE_KIDDY_MODE)
+#define FILTER_SHIFT 30
+
+#define FELEM int32_t
+#define FELEM2 int64_t
+#define FELEML int64_t
+#define FELEM_MAX INT32_MAX
+#define FELEM_MIN INT32_MIN
+#define WINDOW_TYPE 12
+#else
+#define FILTER_SHIFT 0
+
+#define FELEM double
+#define FELEM2 double
+#define FELEML double
+#define WINDOW_TYPE 24
+#endif
+
+
+typedef struct AVResampleContext{
+    const AVClass *av_class;
+    FELEM *filter_bank;
+    int filter_length;
+    int ideal_dst_incr;
+    int dst_incr;
+    int index;
+    int frac;
+    int src_incr;
+    int compensation_distance;
+    int phase_shift;
+    int phase_mask;
+    int linear;
+}AVResampleContext;
+
+/**
+ * 0th order modified bessel function of the first kind.
+ */
+static double bessel(double x){
+    double v=1;
+    double lastv=0;
+    double t=1;
+    int i;
+
+    x= x*x/4;
+    for(i=1; v != lastv; i++){
+        lastv=v;
+        t *= x/(i*i);
+        v += t;
+    }
+    return v;
+}
+
+/**
+ * Build a polyphase filterbank.
+ * @param factor resampling factor
+ * @param scale wanted sum of coefficients for each filter
+ * @param type 0->cubic, 1->blackman nuttall windowed sinc, 2..16->kaiser windowed sinc beta=2..16
+ * @return 0 on success, negative on error
+ */
+static int build_filter(FELEM *filter, double factor, int tap_count, int phase_count, int scale, int type){
+    int ph, i;
+    double x, y, w;
+    double *tab = av_malloc_array(tap_count, sizeof(*tab));
+    const int center= (tap_count-1)/2;
+
+    if (!tab)
+        return AVERROR(ENOMEM);
+
+    /* if upsampling, only need to interpolate, no filter */
+    if (factor > 1.0)
+        factor = 1.0;
+
+    for(ph=0;ph<phase_count;ph++) {
+        double norm = 0;
+        for(i=0;i<tap_count;i++) {
+            x = M_PI * ((double)(i - center) - (double)ph / phase_count) * factor;
+            if (x == 0) y = 1.0;
+            else        y = sin(x) / x;
+            switch(type){
+            case 0:{
+                const float d= -0.5; //first order derivative = -0.5
+                x = fabs(((double)(i - center) - (double)ph / phase_count) * factor);
+                if(x<1.0) y= 1 - 3*x*x + 2*x*x*x + d*(            -x*x + x*x*x);
+                else      y=                       d*(-4 + 8*x - 5*x*x + x*x*x);
+                break;}
+            case 1:
+                w = 2.0*x / (factor*tap_count) + M_PI;
+                y *= 0.3635819 - 0.4891775 * cos(w) + 0.1365995 * cos(2*w) - 0.0106411 * cos(3*w);
+                break;
+            default:
+                w = 2.0*x / (factor*tap_count*M_PI);
+                y *= bessel(type*sqrt(FFMAX(1-w*w, 0)));
+                break;
+            }
+
+            tab[i] = y;
+            norm += y;
+        }
+
+        /* normalize so that an uniform color remains the same */
+        for(i=0;i<tap_count;i++) {
+#ifdef CONFIG_RESAMPLE_AUDIOPHILE_KIDDY_MODE
+            filter[ph * tap_count + i] = tab[i] / norm;
+#else
+            filter[ph * tap_count + i] = av_clip(lrintf(tab[i] * scale / norm), FELEM_MIN, FELEM_MAX);
+#endif
+        }
+    }
+#if 0
+    {
+#define LEN 1024
+        int j,k;
+        double sine[LEN + tap_count];
+        double filtered[LEN];
+        double maxff=-2, minff=2, maxsf=-2, minsf=2;
+        for(i=0; i<LEN; i++){
+            double ss=0, sf=0, ff=0;
+            for(j=0; j<LEN+tap_count; j++)
+                sine[j]= cos(i*j*M_PI/LEN);
+            for(j=0; j<LEN; j++){
+                double sum=0;
+                ph=0;
+                for(k=0; k<tap_count; k++)
+                    sum += filter[ph * tap_count + k] * sine[k+j];
+                filtered[j]= sum / (1<<FILTER_SHIFT);
+                ss+= sine[j + center] * sine[j + center];
+                ff+= filtered[j] * filtered[j];
+                sf+= sine[j + center] * filtered[j];
+            }
+            ss= sqrt(2*ss/LEN);
+            ff= sqrt(2*ff/LEN);
+            sf= 2*sf/LEN;
+            maxff= FFMAX(maxff, ff);
+            minff= FFMIN(minff, ff);
+            maxsf= FFMAX(maxsf, sf);
+            minsf= FFMIN(minsf, sf);
+            if(i%11==0){
+                av_log(NULL, AV_LOG_ERROR, "i:%4d ss:%f ff:%13.6e-%13.6e sf:%13.6e-%13.6e\n", i, ss, maxff, minff, maxsf, minsf);
+                minff=minsf= 2;
+                maxff=maxsf= -2;
+            }
+        }
+    }
+#endif
+
+    av_free(tab);
+    return 0;
+}
+
+AVResampleContext *av_resample_init(int out_rate, int in_rate, int filter_size, int phase_shift, int linear, double cutoff){
+    AVResampleContext *c= av_mallocz(sizeof(AVResampleContext));
+    double factor= FFMIN(out_rate * cutoff / in_rate, 1.0);
+    int phase_count= 1<<phase_shift;
+
+    if (!c)
+        return NULL;
+
+    c->phase_shift= phase_shift;
+    c->phase_mask= phase_count-1;
+    c->linear= linear;
+
+    c->filter_length= FFMAX((int)ceil(filter_size/factor), 1);
+    c->filter_bank= av_mallocz_array(c->filter_length, (phase_count+1)*sizeof(FELEM));
+    if (!c->filter_bank)
+        goto error;
+    if (build_filter(c->filter_bank, factor, c->filter_length, phase_count, 1<<FILTER_SHIFT, WINDOW_TYPE))
+        goto error;
+    memcpy(&c->filter_bank[c->filter_length*phase_count+1], c->filter_bank, (c->filter_length-1)*sizeof(FELEM));
+    c->filter_bank[c->filter_length*phase_count]= c->filter_bank[c->filter_length - 1];
+
+    if(!av_reduce(&c->src_incr, &c->dst_incr, out_rate, in_rate * (int64_t)phase_count, INT32_MAX/2))
+        goto error;
+    c->ideal_dst_incr= c->dst_incr;
+
+    c->index= -phase_count*((c->filter_length-1)/2);
+
+    return c;
+error:
+    av_free(c->filter_bank);
+    av_free(c);
+    return NULL;
+}
+
+void av_resample_close(AVResampleContext *c){
+    av_freep(&c->filter_bank);
+    av_freep(&c);
+}
+
+void av_resample_compensate(AVResampleContext *c, int sample_delta, int compensation_distance){
+//    sample_delta += (c->ideal_dst_incr - c->dst_incr)*(int64_t)c->compensation_distance / c->ideal_dst_incr;
+    c->compensation_distance= compensation_distance;
+    c->dst_incr = c->ideal_dst_incr - c->ideal_dst_incr * (int64_t)sample_delta / compensation_distance;
+}
+
+int av_resample(AVResampleContext *c, short *dst, short *src, int *consumed, int src_size, int dst_size, int update_ctx){
+    int dst_index, i;
+    int index= c->index;
+    int frac= c->frac;
+    int dst_incr_frac= c->dst_incr % c->src_incr;
+    int dst_incr=      c->dst_incr / c->src_incr;
+    int compensation_distance= c->compensation_distance;
+
+  if(compensation_distance == 0 && c->filter_length == 1 && c->phase_shift==0){
+        int64_t index2= ((int64_t)index)<<32;
+        int64_t incr= (1LL<<32) * c->dst_incr / c->src_incr;
+        dst_size= FFMIN(dst_size, (src_size-1-index) * (int64_t)c->src_incr / c->dst_incr);
+
+        for(dst_index=0; dst_index < dst_size; dst_index++){
+            dst[dst_index] = src[index2>>32];
+            index2 += incr;
+        }
+        index += dst_index * dst_incr;
+        index += (frac + dst_index * (int64_t)dst_incr_frac) / c->src_incr;
+        frac   = (frac + dst_index * (int64_t)dst_incr_frac) % c->src_incr;
+  }else{
+    for(dst_index=0; dst_index < dst_size; dst_index++){
+        FELEM *filter= c->filter_bank + c->filter_length*(index & c->phase_mask);
+        int sample_index= index >> c->phase_shift;
+        FELEM2 val=0;
+
+        if(sample_index < 0){
+            for(i=0; i<c->filter_length; i++)
+                val += src[FFABS(sample_index + i) % src_size] * filter[i];
+        }else if(sample_index + c->filter_length > src_size){
+            break;
+        }else if(c->linear){
+            FELEM2 v2=0;
+            for(i=0; i<c->filter_length; i++){
+                val += src[sample_index + i] * (FELEM2)filter[i];
+                v2  += src[sample_index + i] * (FELEM2)filter[i + c->filter_length];
+            }
+            val+=(v2-val)*(FELEML)frac / c->src_incr;
+        }else{
+            for(i=0; i<c->filter_length; i++){
+                val += src[sample_index + i] * (FELEM2)filter[i];
+            }
+        }
+
+#ifdef CONFIG_RESAMPLE_AUDIOPHILE_KIDDY_MODE
+        dst[dst_index] = av_clip_int16(lrintf(val));
+#else
+        val = (val + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT;
+        dst[dst_index] = (unsigned)(val + 32768) > 65535 ? (val>>31) ^ 32767 : val;
+#endif
+
+        frac += dst_incr_frac;
+        index += dst_incr;
+        if(frac >= c->src_incr){
+            frac -= c->src_incr;
+            index++;
+        }
+
+        if(dst_index + 1 == compensation_distance){
+            compensation_distance= 0;
+            dst_incr_frac= c->ideal_dst_incr % c->src_incr;
+            dst_incr=      c->ideal_dst_incr / c->src_incr;
+        }
+    }
+  }
+    *consumed= FFMAX(index, 0) >> c->phase_shift;
+    if(index>=0) index &= c->phase_mask;
+
+    if(compensation_distance){
+        compensation_distance -= dst_index;
+        av_assert2(compensation_distance > 0);
+    }
+    if(update_ctx){
+        c->frac= frac;
+        c->index= index;
+        c->dst_incr= dst_incr_frac + c->src_incr*dst_incr;
+        c->compensation_distance= compensation_distance;
+    }
+
+    return dst_index;
+}
+
+#endif
diff --git a/libavcodec/reverse.c b/libavcodec/reverse.c
new file mode 100644
index 0000000000..440badaf34
--- /dev/null
+++ b/libavcodec/reverse.c
@@ -0,0 +1 @@
+#include "libavutil/reverse.c"
diff --git a/libavcodec/rl.c b/libavcodec/rl.c
index 5bc1f91563..b206c6f2ce 100644
--- a/libavcodec/rl.c
+++ b/libavcodec/rl.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -100,9 +100,13 @@ fail:
     return AVERROR(ENOMEM);
 }
 
-av_cold void ff_rl_init_vlc(RLTable *rl)
+av_cold void ff_rl_init_vlc(RLTable *rl, unsigned static_size)
 {
     int i, q;
+    VLC_TYPE table[1500][2] = {{0}};
+    VLC vlc = { .table = table, .table_allocated = static_size };
+    av_assert0(static_size <= FF_ARRAY_ELEMS(table));
+    init_vlc(&vlc, 9, rl->n + 1, &rl->table_vlc[0][1], 4, 2, &rl->table_vlc[0][0], 4, 2, INIT_VLC_USE_NEW_STATIC);
 
     for (q = 0; q < 32; q++) {
         int qmul = q * 2;
@@ -112,9 +116,9 @@ av_cold void ff_rl_init_vlc(RLTable *rl)
             qmul = 1;
             qadd = 0;
         }
-        for (i = 0; i < rl->vlc.table_size; i++) {
-            int code = rl->vlc.table[i][0];
-            int len  = rl->vlc.table[i][1];
+        for (i = 0; i < vlc.table_size; i++) {
+            int code = vlc.table[i][0];
+            int len  = vlc.table[i][1];
             int level, run;
 
             if (len == 0) { // illegal code
diff --git a/libavcodec/rl.h b/libavcodec/rl.h
index fe4efef972..33f4ef88de 100644
--- a/libavcodec/rl.h
+++ b/libavcodec/rl.h
@@ -2,20 +2,20 @@
  * Copyright (c) 2000-2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,7 +44,6 @@ typedef struct RLTable {
     uint8_t *index_run[2];         ///< encoding only
     int8_t *max_level[2];          ///< encoding & decoding
     int8_t *max_run[2];            ///< encoding & decoding
-    VLC vlc;                       ///< decoding only deprecated FIXME remove
     RL_VLC_ELEM *rl_vlc[32];       ///< decoding only
 } RLTable;
 
@@ -54,7 +53,7 @@ typedef struct RLTable {
  *                     the level and run tables, if this is NULL av_malloc() will be used
  */
 int ff_rl_init(RLTable *rl, uint8_t static_store[2][2*MAX_RUN + MAX_LEVEL + 3]);
-void ff_rl_init_vlc(RLTable *rl);
+void ff_rl_init_vlc(RLTable *rl, unsigned static_size);
 
 /**
  * Free the contents of a dynamically allocated table.
@@ -65,15 +64,12 @@ void ff_rl_free(RLTable *rl);
 {\
     int q;\
     static RL_VLC_ELEM rl_vlc_table[32][static_size];\
-    INIT_VLC_STATIC(&rl.vlc, 9, rl.n + 1,\
-             &rl.table_vlc[0][1], 4, 2,\
-             &rl.table_vlc[0][0], 4, 2, static_size);\
 \
     if(!rl.rl_vlc[0]){\
         for(q=0; q<32; q++)\
             rl.rl_vlc[q]= rl_vlc_table[q];\
 \
-        ff_rl_init_vlc(&rl);\
+        ff_rl_init_vlc(&rl, static_size);\
     }\
 }
 
diff --git a/libavcodec/rl2.c b/libavcodec/rl2.c
index ba539e7d3a..6662979c52 100644
--- a/libavcodec/rl2.c
+++ b/libavcodec/rl2.c
@@ -2,20 +2,20 @@
  * RL2 Video Decoder
  * Copyright (C) 2008 Sascha Sommer (saschasommer@freenet.de)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -53,7 +53,7 @@ typedef struct Rl2Context {
  * @param s rl2 context
  * @param in input buffer
  * @param size input buffer size
- * @param out ouput buffer
+ * @param out output buffer
  * @param stride stride of the output buffer
  * @param video_base offset of the rle data inside the frame
  */
@@ -155,7 +155,7 @@ static av_cold int rl2_decode_init(AVCodecContext *avctx)
 
     /** initialize palette */
     for (i = 0; i < AVPALETTE_COUNT; i++)
-        s->palette[i] = AV_RB24(&avctx->extradata[6 + i * 3]);
+        s->palette[i] = 0xFFU << 24 | AV_RB24(&avctx->extradata[6 + i * 3]);
 
     /** decode background frame if present */
     back_size = avctx->extradata_size - EXTRADATA1_SIZE;
@@ -181,10 +181,8 @@ static int rl2_decode_frame(AVCodecContext *avctx,
     int ret, buf_size  = avpkt->size;
     Rl2Context *s = avctx->priv_data;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     /** run length decode */
     rl2_rle_decode(s, buf, buf_size, frame->data[0], frame->linesize[0],
@@ -209,7 +207,7 @@ static av_cold int rl2_decode_end(AVCodecContext *avctx)
 {
     Rl2Context *s = avctx->priv_data;
 
-    av_free(s->back_frame);
+    av_freep(&s->back_frame);
 
     return 0;
 }
diff --git a/libavcodec/rle.c b/libavcodec/rle.c
index cbbde93f56..d2ec68c407 100644
--- a/libavcodec/rle.c
+++ b/libavcodec/rle.c
@@ -2,20 +2,20 @@
  * RLE encoder
  * Copyright (c) 2007 Bobby Bingham
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
diff --git a/libavcodec/rle.h b/libavcodec/rle.h
index 00261d3598..24851321fe 100644
--- a/libavcodec/rle.h
+++ b/libavcodec/rle.h
@@ -1,20 +1,20 @@
 /*
  * RLE encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rnd_avg.h b/libavcodec/rnd_avg.h
index 412cda5203..344775e31f 100644
--- a/libavcodec/rnd_avg.h
+++ b/libavcodec/rnd_avg.h
@@ -1,18 +1,21 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
+ * Copyright (c) 2011 Oskar Arvidsson
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/roqaudioenc.c b/libavcodec/roqaudioenc.c
index f687f5cd9d..5154604be8 100644
--- a/libavcodec/roqaudioenc.c
+++ b/libavcodec/roqaudioenc.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2005 Eric Lasota
  *    Based on RoQ specs (c)2001 Tim Ferguson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -160,10 +160,8 @@ static int roq_dpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     else
         data_size = avctx->channels * avctx->frame_size;
 
-    if ((ret = ff_alloc_packet(avpkt, ROQ_HEADER_SIZE + data_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, ROQ_HEADER_SIZE + data_size, 0)) < 0)
         return ret;
-    }
     out = avpkt->data;
 
     bytestream_put_byte(&out, stereo ? 0x21 : 0x20);
diff --git a/libavcodec/roqvideo.c b/libavcodec/roqvideo.c
index b0fd6ba7f3..8eda93c13c 100644
--- a/libavcodec/roqvideo.c
+++ b/libavcodec/roqvideo.c
@@ -2,20 +2,20 @@
  * Copyright (C) 2003 Mike Melanson
  * Copyright (C) 2003 Dr. Tim Ferguson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/roqvideo.h b/libavcodec/roqvideo.h
index 3f000225e9..3da6eaa991 100644
--- a/libavcodec/roqvideo.h
+++ b/libavcodec/roqvideo.h
@@ -2,20 +2,20 @@
  * Copyright (C) 2003 Mike Melanson
  * Copyright (C) 2003 Dr. Tim Ferguson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,6 +43,7 @@ struct RoqTempData;
 
 typedef struct RoqContext {
 
+    const AVClass *class;
     AVCodecContext *avctx;
     AVFrame *last_frame;
     AVFrame *current_frame;
@@ -69,6 +70,9 @@ typedef struct RoqContext {
     const AVFrame *frame_to_enc;
     uint8_t *out_buf;
     struct RoqTempData *tmpData;
+
+    int quake3_compat; // Quake 3 compatibility option
+
 } RoqContext;
 
 #define RoQ_INFO              0x1001
diff --git a/libavcodec/roqvideodec.c b/libavcodec/roqvideodec.c
index 0ce694388f..68ef160ecf 100644
--- a/libavcodec/roqvideodec.c
+++ b/libavcodec/roqvideodec.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,10 +25,7 @@
  *   http://www.csse.monash.edu.au/~timf/
  */
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
@@ -72,9 +69,19 @@ static void roqvideo_decode_frame(RoqContext *ri)
 
     chunk_start = bytestream2_tell(&ri->gb);
     xpos = ypos = 0;
+
+    if (chunk_size > bytestream2_get_bytes_left(&ri->gb)) {
+        av_log(ri->avctx, AV_LOG_ERROR, "Chunk does not fit in input buffer\n");
+        chunk_size = bytestream2_get_bytes_left(&ri->gb);
+    }
+
     while (bytestream2_tell(&ri->gb) < chunk_start + chunk_size) {
         for (yp = ypos; yp < ypos + 16; yp += 8)
             for (xp = xpos; xp < xpos + 16; xp += 8) {
+                if (bytestream2_tell(&ri->gb) >= chunk_start + chunk_size) {
+                    av_log(ri->avctx, AV_LOG_VERBOSE, "Chunk is too short\n");
+                    return;
+                }
                 if (vqflg_pos < 0) {
                     vqflg = bytestream2_get_le16(&ri->gb);
                     vqflg_pos = 7;
@@ -106,6 +113,10 @@ static void roqvideo_decode_frame(RoqContext *ri)
                         if(k & 0x01) x += 4;
                         if(k & 0x02) y += 4;
 
+                        if (bytestream2_tell(&ri->gb) >= chunk_start + chunk_size) {
+                            av_log(ri->avctx, AV_LOG_VERBOSE, "Chunk is too short\n");
+                            return;
+                        }
                         if (vqflg_pos < 0) {
                             vqflg = bytestream2_get_le16(&ri->gb);
                             vqflg_pos = 7;
@@ -140,7 +151,7 @@ static void roqvideo_decode_frame(RoqContext *ri)
                     }
                     break;
                 default:
-                    av_log(ri->avctx, AV_LOG_ERROR, "Unknown vq code: %d\n", vqid);
+                    av_assert2(0);
             }
         }
 
@@ -178,7 +189,8 @@ static av_cold int roq_decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
     }
 
-    avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+    avctx->pix_fmt = AV_PIX_FMT_YUVJ444P;
+    avctx->color_range = AVCOL_RANGE_JPEG;
 
     return 0;
 }
@@ -193,10 +205,8 @@ static int roq_decode_frame(AVCodecContext *avctx,
     int copy= !s->current_frame->data[0];
     int ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->current_frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "  RoQ: get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->current_frame)) < 0)
         return ret;
-    }
 
     if(copy)
         av_picture_copy((AVPicture*)s->current_frame, (AVPicture*)s->last_frame,
diff --git a/libavcodec/roqvideoenc.c b/libavcodec/roqvideoenc.c
index 32dabae09c..b6ed1f9835 100644
--- a/libavcodec/roqvideoenc.c
+++ b/libavcodec/roqvideoenc.c
@@ -5,27 +5,27 @@
  * Copyright (C) 2004-2007 Eric Lasota
  *    Based on RoQ specs (C) 2001 Tim Ferguson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * id RoQ encoder by Vitor. Based on the Switchblade3 library and the
- * Switchblade3 Libav glue by Eric Lasota.
+ * Switchblade3 FFmpeg glue by Eric Lasota.
  */
 
 /*
@@ -57,6 +57,7 @@
 #include <string.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/opt.h"
 #include "roqvideo.h"
 #include "bytestream.h"
 #include "elbg.h"
@@ -69,7 +70,7 @@
  * Maximum number of generated 4x4 codebooks. Can't be 256 to workaround a
  * Quake 3 bug.
  */
-#define MAX_CBS_4x4 255
+#define MAX_CBS_4x4 256
 
 #define MAX_CBS_2x2 256 ///< Maximum number of 2x2 codebooks.
 
@@ -245,7 +246,7 @@ static int create_cel_evals(RoqContext *enc, RoqTempdata *tempData)
 {
     int n=0, x, y, i;
 
-    tempData->cel_evals = av_malloc(enc->width*enc->height/64 * sizeof(CelEvaluation));
+    tempData->cel_evals = av_malloc_array(enc->width*enc->height/64, sizeof(CelEvaluation));
     if (!tempData->cel_evals)
         return AVERROR(ENOMEM);
 
@@ -541,7 +542,7 @@ static void remap_codebooks(RoqContext *enc, RoqTempdata *tempData)
     int i, j, idx=0;
 
     /* Make remaps for the final codebook usage */
-    for (i=0; i<MAX_CBS_4x4; i++) {
+    for (i=0; i<(enc->quake3_compat ? MAX_CBS_4x4-1 : MAX_CBS_4x4); i++) {
         if (tempData->codebooks.usedCB4[i]) {
             tempData->i2f4[i] = idx;
             tempData->f2i4[idx] = i;
@@ -798,14 +799,14 @@ static int generate_codebook(RoqContext *enc, RoqTempdata *tempdata,
     int i, j, k, ret = 0;
     int c_size = size*size/4;
     int *buf;
-    int *codebook = av_malloc(6*c_size*cbsize*sizeof(int));
+    int *codebook = av_malloc_array(6*c_size, cbsize*sizeof(int));
     int *closest_cb;
 
     if (!codebook)
         return AVERROR(ENOMEM);
 
     if (size == 4) {
-        closest_cb = av_malloc(6*c_size*inputCount*sizeof(int));
+        closest_cb = av_malloc_array(6*c_size, inputCount*sizeof(int));
         if (!closest_cb) {
             ret = AVERROR(ENOMEM);
             goto out;
@@ -813,11 +814,11 @@ static int generate_codebook(RoqContext *enc, RoqTempdata *tempdata,
     } else
         closest_cb = tempdata->closest_cb2;
 
-    ret = ff_init_elbg(points, 6 * c_size, inputCount, codebook,
+    ret = avpriv_init_elbg(points, 6 * c_size, inputCount, codebook,
                        cbsize, 1, closest_cb, &enc->randctx);
     if (ret < 0)
         goto out;
-    ret = ff_do_elbg(points, 6 * c_size, inputCount, codebook,
+    ret = avpriv_do_elbg(points, 6 * c_size, inputCount, codebook,
                      cbsize, 1, closest_cb, &enc->randctx);
     if (ret < 0)
         goto out;
@@ -846,8 +847,8 @@ static int generate_new_codebooks(RoqContext *enc, RoqTempdata *tempData)
     int max = enc->width*enc->height/16;
     uint8_t mb2[3*4];
     roq_cell *results4 = av_malloc(sizeof(roq_cell)*MAX_CBS_4x4*4);
-    uint8_t *yuvClusters=av_malloc(sizeof(int)*max*6*4);
-    int *points = av_malloc(max*6*4*sizeof(int));
+    uint8_t *yuvClusters=av_malloc_array(max, sizeof(int)*6*4);
+    int *points = av_malloc_array(max, 6*4*sizeof(int));
     int bias;
 
     if (!results4 || !yuvClusters || !points) {
@@ -866,12 +867,12 @@ static int generate_new_codebooks(RoqContext *enc, RoqTempdata *tempData)
 
     /* Create 4x4 codebooks */
     if ((ret = generate_codebook(enc, tempData, points, max,
-                                 results4, 4, MAX_CBS_4x4)) < 0)
+                                 results4, 4, (enc->quake3_compat ? MAX_CBS_4x4-1 : MAX_CBS_4x4))) < 0)
         goto out;
 
-    codebooks->numCB4 = MAX_CBS_4x4;
+    codebooks->numCB4 = (enc->quake3_compat ? MAX_CBS_4x4-1 : MAX_CBS_4x4);
 
-    tempData->closest_cb2 = av_malloc(max*4*sizeof(int));
+    tempData->closest_cb2 = av_malloc_array(max, 4*sizeof(int));
     if (!tempData->closest_cb2) {
         ret = AVERROR(ENOMEM);
         goto out;
@@ -932,10 +933,14 @@ static int roq_encode_video(RoqContext *enc)
         gather_data_for_cel(tempData->cel_evals + i, enc, tempData);
 
     /* Quake 3 can't handle chunks bigger than 65535 bytes */
-    if (tempData->mainChunkSize/8 > 65535) {
+    if (tempData->mainChunkSize/8 > 65535 && enc->quake3_compat) {
+        if (enc->lambda > 100000) {
+            av_log(enc->avctx, AV_LOG_ERROR, "Cannot encode video in Quake compatible form\n");
+            return AVERROR(EINVAL);
+        }
         av_log(enc->avctx, AV_LOG_ERROR,
-               "Warning, generated a frame too big (%d > 65535), "
-               "try using a smaller qscale value.\n",
+               "Warning, generated a frame too big for Quake (%d > 65535), "
+               "now switching to a bigger qscale value.\n",
                tempData->mainChunkSize/8);
         enc->lambda *= 1.5;
         tempData->mainChunkSize = 0;
@@ -960,8 +965,8 @@ static int roq_encode_video(RoqContext *enc)
     FFSWAP(motion_vect *, enc->last_motion4, enc->this_motion4);
     FFSWAP(motion_vect *, enc->last_motion8, enc->this_motion8);
 
-    av_free(tempData->cel_evals);
-    av_free(tempData->closest_cb2);
+    av_freep(&tempData->cel_evals);
+    av_freep(&tempData->closest_cb2);
 
     enc->framesSinceKeyframe++;
 
@@ -975,11 +980,11 @@ static av_cold int roq_encode_end(AVCodecContext *avctx)
     av_frame_free(&enc->current_frame);
     av_frame_free(&enc->last_frame);
 
-    av_free(enc->tmpData);
-    av_free(enc->this_motion4);
-    av_free(enc->last_motion4);
-    av_free(enc->this_motion8);
-    av_free(enc->last_motion8);
+    av_freep(&enc->tmpData);
+    av_freep(&enc->this_motion4);
+    av_freep(&enc->last_motion4);
+    av_freep(&enc->this_motion8);
+    av_freep(&enc->last_motion8);
 
     return 0;
 }
@@ -995,11 +1000,16 @@ static av_cold int roq_encode_init(AVCodecContext *avctx)
     enc->framesSinceKeyframe = 0;
     if ((avctx->width & 0xf) || (avctx->height & 0xf)) {
         av_log(avctx, AV_LOG_ERROR, "Dimensions must be divisible by 16\n");
-        return -1;
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->width > 65535 || avctx->height > 65535) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions are max %d\n", enc->quake3_compat ? 32768 : 65535);
+        return AVERROR(EINVAL);
     }
 
     if (((avctx->width)&(avctx->width-1))||((avctx->height)&(avctx->height-1)))
-        av_log(avctx, AV_LOG_ERROR, "Warning: dimensions not power of two\n");
+        av_log(avctx, AV_LOG_ERROR, "Warning: dimensions not power of two, this is not supported by quake\n");
 
     enc->width = avctx->width;
     enc->height = avctx->height;
@@ -1017,16 +1027,22 @@ static av_cold int roq_encode_init(AVCodecContext *avctx)
     enc->tmpData      = av_malloc(sizeof(RoqTempdata));
 
     enc->this_motion4 =
-        av_mallocz((enc->width*enc->height/16)*sizeof(motion_vect));
+        av_mallocz_array((enc->width*enc->height/16), sizeof(motion_vect));
 
     enc->last_motion4 =
-        av_malloc ((enc->width*enc->height/16)*sizeof(motion_vect));
+        av_malloc_array ((enc->width*enc->height/16), sizeof(motion_vect));
 
     enc->this_motion8 =
-        av_mallocz((enc->width*enc->height/64)*sizeof(motion_vect));
+        av_mallocz_array((enc->width*enc->height/64), sizeof(motion_vect));
 
     enc->last_motion8 =
-        av_malloc ((enc->width*enc->height/64)*sizeof(motion_vect));
+        av_malloc_array ((enc->width*enc->height/64), sizeof(motion_vect));
+
+    if (!enc->tmpData || !enc->this_motion4 || !enc->last_motion4 ||
+        !enc->this_motion8 || !enc->last_motion8) {
+        roq_encode_end(avctx);
+        return AVERROR(ENOMEM);
+    }
 
     return 0;
 }
@@ -1074,10 +1090,8 @@ static int roq_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     /* 138 bits max per 8x8 block +
      *     256 codebooks*(6 bytes 2x2 + 4 bytes 4x4) + 8 bytes frame header */
     size = ((enc->width * enc->height / 64) * 138 + 7) / 8 + 256 * (6 + 4) + 8;
-    if ((ret = ff_alloc_packet(pkt, size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet with size %d.\n", size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, size, 0)) < 0)
         return ret;
-    }
     enc->out_buf = pkt->data;
 
     /* Check for I frame */
@@ -1087,11 +1101,9 @@ static int roq_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     if (enc->first_frame) {
         /* Alloc memory for the reconstruction data (we must know the stride
          for that) */
-        if (ff_get_buffer(avctx, enc->current_frame, 0) ||
-            ff_get_buffer(avctx, enc->last_frame, 0)) {
-            av_log(avctx, AV_LOG_ERROR, "  RoQ: get_buffer() failed\n");
-            return -1;
-        }
+        if ((ret = ff_get_buffer(avctx, enc->current_frame, 0)) < 0 ||
+            (ret = ff_get_buffer(avctx, enc->last_frame,    0)) < 0)
+            return ret;
 
         /* Before the first video frame, write a "video info" chunk */
         roq_write_video_info_chunk(enc);
@@ -1112,6 +1124,20 @@ static int roq_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     return 0;
 }
 
+#define OFFSET(x) offsetof(RoqContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "quake3_compat", "Whether to respect known limitations in Quake 3 decoder", OFFSET(quake3_compat), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, VE },
+    { NULL },
+};
+
+static const AVClass roq_class = {
+    .class_name = "RoQ",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_roq_encoder = {
     .name                 = "roqvideo",
     .long_name            = NULL_IF_CONFIG_SMALL("id RoQ video"),
@@ -1121,7 +1147,7 @@ AVCodec ff_roq_encoder = {
     .init                 = roq_encode_init,
     .encode2              = roq_encode_frame,
     .close                = roq_encode_end,
-    .supported_framerates = (const AVRational[]){ {30,1}, {0,0} },
-    .pix_fmts             = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV444P,
+    .pix_fmts             = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUVJ444P,
                                                         AV_PIX_FMT_NONE },
+    .priv_class     = &roq_class,
 };
diff --git a/libavcodec/rpza.c b/libavcodec/rpza.c
index f365a06fb9..5faf562f8a 100644
--- a/libavcodec/rpza.c
+++ b/libavcodec/rpza.c
@@ -1,21 +1,21 @@
 /*
  * Quicktime Video (RPZA) Video Decoder
- * Copyright (C) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -80,7 +80,7 @@ static void rpza_decode_stream(RpzaContext *s)
     uint16_t *pixels = (uint16_t *)s->frame->data[0];
 
     int row_ptr = 0;
-    int pixel_ptr = 0;
+    int pixel_ptr = -4;
     int block_ptr;
     int pixel_x, pixel_y;
     int total_blocks;
@@ -94,8 +94,12 @@ static void rpza_decode_stream(RpzaContext *s)
     chunk_size = bytestream2_get_be32(&s->gb) & 0x00FFFFFF;
 
     /* If length mismatch use size from MOV file and try to decode anyway */
-    if (chunk_size != bytestream2_get_bytes_left(&s->gb) - 4)
-        av_log(s->avctx, AV_LOG_WARNING, "MOV chunk size != encoded chunk size\n");
+    if (chunk_size != bytestream2_get_bytes_left(&s->gb) + 4)
+        av_log(s->avctx, AV_LOG_WARNING,
+               "MOV chunk size %d != encoded chunk size %d\n",
+               chunk_size,
+               bytestream2_get_bytes_left(&s->gb) + 4
+              );
 
     /* Number of 4x4 blocks in frame. */
     total_blocks = ((s->avctx->width + 3) / 4) * ((s->avctx->height + 3) / 4);
@@ -134,6 +138,7 @@ static void rpza_decode_stream(RpzaContext *s)
         case 0xa0:
             colorA = bytestream2_get_be16(&s->gb);
             while (n_blocks--) {
+                ADVANCE_BLOCK()
                 block_ptr = row_ptr + pixel_ptr;
                 for (pixel_y = 0; pixel_y < 4; pixel_y++) {
                     for (pixel_x = 0; pixel_x < 4; pixel_x++){
@@ -142,7 +147,6 @@ static void rpza_decode_stream(RpzaContext *s)
                     }
                     block_ptr += row_inc;
                 }
-                ADVANCE_BLOCK();
             }
             break;
 
@@ -179,6 +183,7 @@ static void rpza_decode_stream(RpzaContext *s)
             if (bytestream2_get_bytes_left(&s->gb) < n_blocks * 4)
                 return;
             while (n_blocks--) {
+                ADVANCE_BLOCK();
                 block_ptr = row_ptr + pixel_ptr;
                 for (pixel_y = 0; pixel_y < 4; pixel_y++) {
                     uint8_t index = bytestream2_get_byteu(&s->gb);
@@ -189,7 +194,6 @@ static void rpza_decode_stream(RpzaContext *s)
                     }
                     block_ptr += row_inc;
                 }
-                ADVANCE_BLOCK();
             }
             break;
 
@@ -197,6 +201,7 @@ static void rpza_decode_stream(RpzaContext *s)
         case 0x00:
             if (bytestream2_get_bytes_left(&s->gb) < 30)
                 return;
+            ADVANCE_BLOCK();
             block_ptr = row_ptr + pixel_ptr;
             for (pixel_y = 0; pixel_y < 4; pixel_y++) {
                 for (pixel_x = 0; pixel_x < 4; pixel_x++){
@@ -208,7 +213,6 @@ static void rpza_decode_stream(RpzaContext *s)
                 }
                 block_ptr += row_inc;
             }
-            ADVANCE_BLOCK();
             break;
 
         /* Unknown opcode */
@@ -244,10 +248,8 @@ static int rpza_decode_frame(AVCodecContext *avctx,
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     rpza_decode_stream(s);
 
diff --git a/libavcodec/rtjpeg.c b/libavcodec/rtjpeg.c
index 67eeff8f4a..8e02bce2e8 100644
--- a/libavcodec/rtjpeg.c
+++ b/libavcodec/rtjpeg.c
@@ -2,20 +2,20 @@
  * RTJpeg decoding functions
  * Copyright (c) 2006 Reimar Doeffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "libavutil/common.h"
diff --git a/libavcodec/rtjpeg.h b/libavcodec/rtjpeg.h
index cd300797c5..d22ff4070e 100644
--- a/libavcodec/rtjpeg.h
+++ b/libavcodec/rtjpeg.h
@@ -2,20 +2,20 @@
  * RTJpeg decoding functions
  * copyright (c) 2006 Reimar Doeffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv10.c b/libavcodec/rv10.c
index 29a46399e2..b56bb5881f 100644
--- a/libavcodec/rv10.c
+++ b/libavcodec/rv10.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -309,7 +309,7 @@ static int rv20_decode_picture_header(RVDecContext *rv)
 {
     MpegEncContext *s = &rv->m;
     int seq, mb_pos, i, ret;
-    int rpr_bits;
+    int rpr_max;
 
     i = get_bits(&s->gb, 2);
     switch (i) {
@@ -330,6 +330,10 @@ static int rv20_decode_picture_header(RVDecContext *rv)
         return AVERROR_INVALIDDATA;
     }
 
+    if (s->low_delay && s->pict_type == AV_PICTURE_TYPE_B) {
+        av_log(s->avctx, AV_LOG_ERROR, "low delay B\n");
+        return -1;
+    }
     if (!s->last_picture_ptr && s->pict_type == AV_PICTURE_TYPE_B) {
         av_log(s->avctx, AV_LOG_ERROR, "early B-frame\n");
         return AVERROR_INVALIDDATA;
@@ -347,17 +351,17 @@ static int rv20_decode_picture_header(RVDecContext *rv)
     }
 
     if (RV_GET_MINOR_VER(rv->sub_id) >= 2)
-        s->loop_filter = get_bits1(&s->gb);
+        s->loop_filter = get_bits1(&s->gb) && !s->avctx->lowres;
 
     if (RV_GET_MINOR_VER(rv->sub_id) <= 1)
         seq = get_bits(&s->gb, 8) << 7;
     else
         seq = get_bits(&s->gb, 13) << 2;
 
-    rpr_bits = s->avctx->extradata[1] & 7;
-    if (rpr_bits) {
+    rpr_max = s->avctx->extradata[1] & 7;
+    if (rpr_max) {
         int f, new_w, new_h;
-        rpr_bits = FFMIN((rpr_bits >> 1) + 1, 3);
+        int rpr_bits = av_log2(rpr_max) + 1;
 
         f = get_bits(&s->gb, rpr_bits);
 
@@ -374,10 +378,21 @@ static int rv20_decode_picture_header(RVDecContext *rv)
             new_h = rv->orig_height;
         }
         if (new_w != s->width || new_h != s->height) {
+            AVRational old_aspect = s->avctx->sample_aspect_ratio;
             av_log(s->avctx, AV_LOG_DEBUG,
                    "attempting to change resolution to %dx%d\n", new_w, new_h);
+            if (av_image_check_size(new_w, new_h, 0, s->avctx) < 0)
+                return AVERROR_INVALIDDATA;
             ff_mpv_common_end(s);
 
+            // attempt to keep aspect during typical resolution switches
+            if (!old_aspect.num)
+                old_aspect = (AVRational){1, 1};
+            if (2 * new_w * s->height == new_h * s->width)
+                s->avctx->sample_aspect_ratio = av_mul_q(old_aspect, (AVRational){2, 1});
+            if (new_w * s->height == 2 * new_h * s->width)
+                s->avctx->sample_aspect_ratio = av_mul_q(old_aspect, (AVRational){1, 2});
+
             ret = ff_set_dimensions(s->avctx, new_w, new_h);
             if (ret < 0)
                 return ret;
@@ -389,9 +404,10 @@ static int rv20_decode_picture_header(RVDecContext *rv)
         }
 
         if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
-            av_log(s->avctx, AV_LOG_DEBUG, "F %d/%d\n", f, rpr_bits);
+            av_log(s->avctx, AV_LOG_DEBUG, "F %d/%d/%d\n", f, rpr_bits, rpr_max);
         }
-    } else if (av_image_check_size(s->width, s->height, 0, s->avctx) < 0)
+    }
+    if (av_image_check_size(s->width, s->height, 0, s->avctx) < 0)
         return AVERROR_INVALIDDATA;
 
     mb_pos = ff_h263_decode_mba(s);
@@ -410,15 +426,17 @@ static int rv20_decode_picture_header(RVDecContext *rv)
         } else {
             s->time    = seq;
             s->pb_time = s->pp_time - (s->last_non_b_time - s->time);
-            if (s->pp_time <= s->pb_time ||
-                s->pp_time <= s->pp_time - s->pb_time || s->pp_time <= 0) {
-                av_log(s->avctx, AV_LOG_DEBUG, "messed up order, possible "
-                       "from seeking? skipping current b frame\n");
-                return FRAME_SKIPPED;
-            }
-            ff_mpeg4_init_direct_mv(s);
         }
     }
+    if (s->pict_type == AV_PICTURE_TYPE_B) {
+        if (s->pp_time <=s->pb_time || s->pp_time <= s->pp_time - s->pb_time || s->pp_time<=0) {
+            av_log(s->avctx, AV_LOG_DEBUG,
+                   "messed up order, possible from seeking? skipping current b frame\n");
+#define ERROR_SKIP_FRAME -123
+            return ERROR_SKIP_FRAME;
+        }
+        ff_mpeg4_init_direct_mv(s);
+    }
 
     s->no_rounding = get_bits1(&s->gb);
 
@@ -430,7 +448,8 @@ static int rv20_decode_picture_header(RVDecContext *rv)
     s->unrestricted_mv = 1;
     s->h263_aic        = s->pict_type == AV_PICTURE_TYPE_I;
     s->modified_quant  = 1;
-    s->loop_filter     = 1;
+    if (!s->avctx->lowres)
+        s->loop_filter = 1;
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
         av_log(s->avctx, AV_LOG_INFO,
@@ -439,7 +458,7 @@ static int rv20_decode_picture_header(RVDecContext *rv)
                s->no_rounding);
     }
 
-    assert(s->pict_type != AV_PICTURE_TYPE_B || !s->low_delay);
+    av_assert0(s->pict_type != AV_PICTURE_TYPE_B || !s->low_delay);
 
     return s->mb_width * s->mb_height - mb_pos;
 }
@@ -460,10 +479,9 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx)
         return ret;
 
     ff_mpv_decode_defaults(s);
+    ff_mpv_decode_init(s, avctx);
 
-    s->avctx       = avctx;
     s->out_format  = FMT_H263;
-    s->codec_id    = avctx->codec_id;
 
     rv->orig_width  =
     s->width        = avctx->coded_width;
@@ -496,8 +514,8 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx)
     }
 
     if (avctx->debug & FF_DEBUG_PICT_INFO) {
-        av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%X\n", rv->sub_id,
-               avctx->extradata_size >= 4 ? ((int *) avctx->extradata)[0] : -1);
+        av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%"PRIX32"\n", rv->sub_id,
+               ((uint32_t *) avctx->extradata)[0]);
     }
 
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
@@ -545,7 +563,8 @@ static int rv10_decode_packet(AVCodecContext *avctx, const uint8_t *buf,
     else
         mb_count = rv20_decode_picture_header(rv);
     if (mb_count < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "HEADER ERROR\n");
+        if (mb_count != ERROR_SKIP_FRAME)
+            av_log(s->avctx, AV_LOG_ERROR, "HEADER ERROR\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -578,6 +597,7 @@ static int rv10_decode_packet(AVCodecContext *avctx, const uint8_t *buf,
         }
     }
 
+
     ff_dlog(avctx, "qscale=%d\n", s->qscale);
 
     /* default quantization values */
@@ -618,7 +638,7 @@ static int rv10_decode_packet(AVCodecContext *avctx, const uint8_t *buf,
     for (s->mb_num_left = mb_count; s->mb_num_left > 0; s->mb_num_left--) {
         int ret;
         ff_update_block_index(s);
-        ff_dlog(avctx, "**mb x=%d y=%d\n", s->mb_x, s->mb_y);
+        ff_tlog(avctx, "**mb x=%d y=%d\n", s->mb_x, s->mb_y);
 
         s->mv_dir  = MV_DIR_FORWARD;
         s->mv_type = MV_TYPE_16X16;
@@ -748,11 +768,13 @@ static int rv10_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
             if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
                 return ret;
-            ff_print_debug_info(s, s->current_picture_ptr);
+            ff_print_debug_info(s, s->current_picture_ptr, pict);
+            ff_mpv_export_qp_table(s, pict, s->current_picture_ptr, FF_QSCALE_TYPE_MPEG1);
         } else if (s->last_picture_ptr) {
             if ((ret = av_frame_ref(pict, s->last_picture_ptr->f)) < 0)
                 return ret;
-            ff_print_debug_info(s, s->last_picture_ptr);
+            ff_print_debug_info(s, s->last_picture_ptr, pict);
+            ff_mpv_export_qp_table(s, pict,s->last_picture_ptr, FF_QSCALE_TYPE_MPEG1);
         }
 
         if (s->last_picture_ptr || s->low_delay) {
@@ -776,6 +798,7 @@ AVCodec ff_rv10_decoder = {
     .close          = rv10_decode_end,
     .decode         = rv10_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
@@ -793,6 +816,7 @@ AVCodec ff_rv20_decoder = {
     .decode         = rv10_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
     .flush          = ff_mpeg_flush,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
diff --git a/libavcodec/rv10.h b/libavcodec/rv10.h
index b44bc1f1c3..364270e76a 100644
--- a/libavcodec/rv10.h
+++ b/libavcodec/rv10.h
@@ -1,20 +1,20 @@
 /*
  * RV10/RV20 decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv10enc.c b/libavcodec/rv10enc.c
index 9f4f7380c3..b17acbc9b5 100644
--- a/libavcodec/rv10enc.c
+++ b/libavcodec/rv10enc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv20enc.c b/libavcodec/rv20enc.c
index 20090b14ef..81fb4fc1ba 100644
--- a/libavcodec/rv20enc.c
+++ b/libavcodec/rv20enc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,12 +43,12 @@ void ff_rv20_encode_picture_header(MpegEncContext *s, int picture_number){
 
     put_bits(&s->pb, 1, s->no_rounding);
 
-    assert(s->f_code == 1);
-    assert(s->unrestricted_mv == 0);
-    assert(s->alt_inter_vlc == 0);
-    assert(s->umvplus == 0);
-    assert(s->modified_quant==1);
-    assert(s->loop_filter==1);
+    av_assert0(s->f_code == 1);
+    av_assert0(s->unrestricted_mv == 0);
+    av_assert0(s->alt_inter_vlc == 0);
+    av_assert0(s->umvplus == 0);
+    av_assert0(s->modified_quant==1);
+    av_assert0(s->loop_filter==1);
 
     s->h263_aic= s->pict_type == AV_PICTURE_TYPE_I;
     if(s->h263_aic){
diff --git a/libavcodec/rv30.c b/libavcodec/rv30.c
index bf22df5f10..3b9868cd5e 100644
--- a/libavcodec/rv30.c
+++ b/libavcodec/rv30.c
@@ -2,20 +2,20 @@
  * RV30 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -51,8 +51,13 @@ static int rv30_parse_slice_header(RV34DecContext *r, GetBitContext *gb, SliceIn
     si->quant = get_bits(gb, 5);
     skip_bits1(gb);
     si->pts = get_bits(gb, 13);
-    rpr = get_bits(gb, r->rpr);
+    rpr = get_bits(gb, av_log2(r->max_rpr) + 1);
     if(rpr){
+        if (rpr > r->max_rpr) {
+            av_log(avctx, AV_LOG_ERROR, "rpr too large\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         if (avctx->extradata_size < rpr * 2 + 8) {
             av_log(avctx, AV_LOG_ERROR,
                    "Insufficient extradata - need at least %d bytes, got %d\n",
@@ -62,6 +67,9 @@ static int rv30_parse_slice_header(RV34DecContext *r, GetBitContext *gb, SliceIn
 
         w = r->s.avctx->extradata[6 + rpr*2] << 2;
         h = r->s.avctx->extradata[7 + rpr*2] << 2;
+    } else {
+        w = r->orig_width;
+        h = r->orig_height;
     }
     si->width  = w;
     si->height = h;
@@ -82,7 +90,7 @@ static int rv30_decode_intra_types(RV34DecContext *r, GetBitContext *gb, int8_t
     for(i = 0; i < 4; i++, dst += r->intra_types_stride - 4){
         for(j = 0; j < 4; j+= 2){
             unsigned code = svq3_get_ue_golomb(gb) << 1;
-            if(code >= 81*2){
+            if (code > 80U*2U) {
                 av_log(r->s.avctx, AV_LOG_ERROR, "Incorrect intra prediction code\n");
                 return -1;
             }
@@ -254,15 +262,22 @@ static av_cold int rv30_decode_init(AVCodecContext *avctx)
     RV34DecContext *r = avctx->priv_data;
     int ret;
 
+    r->orig_width  = avctx->coded_width;
+    r->orig_height = avctx->coded_height;
+
+    if (avctx->extradata_size < 2) {
+        av_log(avctx, AV_LOG_ERROR, "Extradata is too small.\n");
+        return AVERROR(EINVAL);
+    }
     r->rv30 = 1;
     if ((ret = ff_rv34_decode_init(avctx)) < 0)
         return ret;
-    if(avctx->extradata_size < 2){
-        av_log(avctx, AV_LOG_ERROR, "Extradata is too small.\n");
-        return -1;
+
+    r->max_rpr = avctx->extradata[1] & 7;
+    if(avctx->extradata_size < 2*r->max_rpr + 8){
+        av_log(avctx, AV_LOG_WARNING, "Insufficient extradata - need at least %d bytes, got %d\n",
+               2*r->max_rpr + 8, avctx->extradata_size);
     }
-    r->rpr = (avctx->extradata[1] & 7) >> 1;
-    r->rpr = FFMIN(r->rpr + 1, 3);
 
     r->parse_slice_header = rv30_parse_slice_header;
     r->decode_intra_types = rv30_decode_intra_types;
diff --git a/libavcodec/rv30data.h b/libavcodec/rv30data.h
index 5ee304802c..5c4cb9710b 100644
--- a/libavcodec/rv30data.h
+++ b/libavcodec/rv30data.h
@@ -2,20 +2,20 @@
  * RealVideo 3 decoder
  * copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,7 +58,7 @@ static const uint8_t rv30_itype_code[9*9*2] = {
  *
  * This is really a three-dimensional matrix with dimensions
  * [-1..9][-1..9][0..9]. The first and second coordinates are
- * detemined by the top and left neighbors (-1 if unavailable).
+ * determined by the top and left neighbors (-1 if unavailable).
  */
 static const uint8_t rv30_itype_from_context[900] = {
     0, 9, 9, 9, 9, 9, 9, 9, 9,
diff --git a/libavcodec/rv30dsp.c b/libavcodec/rv30dsp.c
index 50f418697b..8b205e09b6 100644
--- a/libavcodec/rv30dsp.c
+++ b/libavcodec/rv30dsp.c
@@ -2,20 +2,20 @@
  * RV30 decoder motion compensation functions
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c
index 6c0a4a3b3b..c2e84a3b57 100644
--- a/libavcodec/rv34.c
+++ b/libavcodec/rv34.c
@@ -2,20 +2,20 @@
  * RV30/40 decoder common data
  * Copyright (c) 2007 Mike Melanson, Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
  * RV30/40 decoder common data
  */
 
+#include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 
 #include "avcodec.h"
@@ -215,7 +216,7 @@ static int rv34_decode_cbp(GetBitContext *gb, RV34VLC *vlc, int table)
 }
 
 /**
- * Get one coefficient value from the bistream and store it.
+ * Get one coefficient value from the bitstream and store it.
  */
 static inline void decode_coeff(int16_t *dst, int coef, int esc, GetBitContext *gb, VLC* vlc, int q)
 {
@@ -510,7 +511,7 @@ static void rv34_pred_mv(RV34DecContext *r, int block_type, int subblock_no, int
     }
 }
 
-#define GET_PTS_DIFF(a, b) ((a - b + 8192) & 0x1FFF)
+#define GET_PTS_DIFF(a, b) (((a) - (b) + 8192) & 0x1FFF)
 
 /**
  * Calculate motion vector component that should be added for direct blocks.
@@ -672,6 +673,7 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type,
     int dxy, mx, my, umx, umy, lx, ly, uvmx, uvmy, src_x, src_y, uvsrc_x, uvsrc_y;
     int mv_pos = s->mb_x * 2 + s->mb_y * 2 * s->b8_stride + mv_off;
     int is16x16 = 1;
+    int emu = 0;
 
     if(thirdpel){
         int chroma_mx, chroma_my;
@@ -723,24 +725,14 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type,
     if(s->h_edge_pos - (width << 3) < 6 || s->v_edge_pos - (height << 3) < 6 ||
        (unsigned)(src_x - !!lx*2) > s->h_edge_pos - !!lx*2 - (width <<3) - 4 ||
        (unsigned)(src_y - !!ly*2) > s->v_edge_pos - !!ly*2 - (height<<3) - 4) {
-        uint8_t *uvbuf = s->sc.edge_emu_buffer + 22 * s->linesize;
-
         srcY -= 2 + 2*s->linesize;
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcY,
                                  s->linesize, s->linesize,
                                  (width << 3) + 6, (height << 3) + 6,
-                            src_x - 2, src_y - 2, s->h_edge_pos, s->v_edge_pos);
+                                 src_x - 2, src_y - 2,
+                                 s->h_edge_pos, s->v_edge_pos);
         srcY = s->sc.edge_emu_buffer + 2 + 2*s->linesize;
-        s->vdsp.emulated_edge_mc(uvbuf, srcU,
-                                 s->uvlinesize,s->uvlinesize,
-                                 (width << 2) + 1, (height << 2) + 1,
-                            uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-        s->vdsp.emulated_edge_mc(uvbuf + 16, srcV,
-                                 s->uvlinesize, s->uvlinesize,
-                                 (width << 2) + 1, (height << 2) + 1,
-                            uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-        srcU = uvbuf;
-        srcV = uvbuf + 16;
+        emu = 1;
     }
     if(!weighted){
         Y = s->dest[0] + xoff      + yoff     *s->linesize;
@@ -763,6 +755,24 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type,
     }
     is16x16 = (block_type != RV34_MB_P_8x8) && (block_type != RV34_MB_P_16x8) && (block_type != RV34_MB_P_8x16);
     qpel_mc[!is16x16][dxy](Y, srcY, s->linesize);
+    if (emu) {
+        uint8_t *uvbuf = s->sc.edge_emu_buffer;
+
+        s->vdsp.emulated_edge_mc(uvbuf, srcU,
+                                 s->uvlinesize, s->uvlinesize,
+                                 (width << 2) + 1, (height << 2) + 1,
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+        srcU = uvbuf;
+        uvbuf += 9*s->uvlinesize;
+
+        s->vdsp.emulated_edge_mc(uvbuf, srcV,
+                                 s->uvlinesize, s->uvlinesize,
+                                 (width << 2) + 1, (height << 2) + 1,
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+        srcV = uvbuf;
+    }
     chroma_mc[2-width]   (U, srcU, s->uvlinesize, height*4, uvmx, uvmy);
     chroma_mc[2-width]   (V, srcV, s->uvlinesize, height*4, uvmx, uvmy);
 }
@@ -1339,7 +1349,7 @@ static int check_slice_end(RV34DecContext *r, MpegEncContext *s)
     if(r->s.mb_skip_run > 1)
         return 0;
     bits = get_bits_left(&s->gb);
-    if(bits < 0 || (bits < 8 && !show_bits(&s->gb, bits)))
+    if(bits <= 0 || (bits < 8 && !show_bits(&s->gb, bits)))
         return 1;
     return 0;
 }
@@ -1361,11 +1371,11 @@ static int rv34_decoder_alloc(RV34DecContext *r)
 {
     r->intra_types_stride = r->s.mb_width * 4 + 4;
 
-    r->cbp_chroma       = av_malloc(r->s.mb_stride * r->s.mb_height *
+    r->cbp_chroma       = av_mallocz(r->s.mb_stride * r->s.mb_height *
                                     sizeof(*r->cbp_chroma));
-    r->cbp_luma         = av_malloc(r->s.mb_stride * r->s.mb_height *
+    r->cbp_luma         = av_mallocz(r->s.mb_stride * r->s.mb_height *
                                     sizeof(*r->cbp_luma));
-    r->deblock_coefs    = av_malloc(r->s.mb_stride * r->s.mb_height *
+    r->deblock_coefs    = av_mallocz(r->s.mb_stride * r->s.mb_height *
                                     sizeof(*r->deblock_coefs));
     r->intra_types_hist = av_malloc(r->intra_types_stride * 4 * 2 *
                                     sizeof(*r->intra_types_hist));
@@ -1410,6 +1420,10 @@ static int rv34_decode_slice(RV34DecContext *r, int end, const uint8_t* buf, int
         av_log(s->avctx, AV_LOG_ERROR, "Slice type mismatch\n");
         return AVERROR_INVALIDDATA;
     }
+    if (s->width != r->si.width || s->height != r->si.height) {
+        av_log(s->avctx, AV_LOG_ERROR, "Size mismatch\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     r->si.end = end;
     s->qscale = r->si.quant;
@@ -1476,14 +1490,9 @@ av_cold int ff_rv34_decode_init(AVCodecContext *avctx)
     int ret;
 
     ff_mpv_decode_defaults(s);
-    s->avctx      = avctx;
+    ff_mpv_decode_init(s, avctx);
     s->out_format = FMT_H263;
-    s->codec_id   = avctx->codec_id;
-
-    s->width  = avctx->width;
-    s->height = avctx->height;
 
-    r->s.avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     avctx->has_b_frames = 1;
     s->low_delay = 0;
@@ -1525,7 +1534,14 @@ int ff_rv34_decode_init_thread_copy(AVCodecContext *avctx)
 
     if (avctx->internal->is_copy) {
         r->tmp_b_block_base = NULL;
+        r->cbp_chroma       = NULL;
+        r->cbp_luma         = NULL;
+        r->deblock_coefs    = NULL;
+        r->intra_types_hist = NULL;
+        r->mb_type          = NULL;
+
         ff_mpv_idct_init(&r->s);
+
         if ((err = ff_mpv_common_init(&r->s)) < 0)
             return err;
         if ((err = rv34_decoder_alloc(r)) < 0) {
@@ -1591,18 +1607,30 @@ static int finish_frame(AVCodecContext *avctx, AVFrame *pict)
     if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
         if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
             return ret;
-        ff_print_debug_info(s, s->current_picture_ptr);
+        ff_print_debug_info(s, s->current_picture_ptr, pict);
+        ff_mpv_export_qp_table(s, pict, s->current_picture_ptr, FF_QSCALE_TYPE_MPEG1);
         got_picture = 1;
     } else if (s->last_picture_ptr) {
         if ((ret = av_frame_ref(pict, s->last_picture_ptr->f)) < 0)
             return ret;
-        ff_print_debug_info(s, s->last_picture_ptr);
+        ff_print_debug_info(s, s->last_picture_ptr, pict);
+        ff_mpv_export_qp_table(s, pict, s->last_picture_ptr, FF_QSCALE_TYPE_MPEG1);
         got_picture = 1;
     }
 
     return got_picture;
 }
 
+static AVRational update_sar(int old_w, int old_h, AVRational sar, int new_w, int new_h)
+{
+    // attempt to keep aspect during typical resolution switches
+    if (!sar.num)
+        sar = (AVRational){1, 1};
+
+    sar = av_mul_q(sar, (AVRational){new_h * old_w, new_w * old_h});
+    return sar;
+}
+
 int ff_rv34_decode_frame(AVCodecContext *avctx,
                             void *data, int *got_picture_ptr,
                             AVPacket *avpkt)
@@ -1617,6 +1645,7 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
     int slice_count;
     const uint8_t *slices_hdr = NULL;
     int last = 0;
+    int faulty_b = 0;
 
     /* no supplementary picture */
     if (buf_size == 0) {
@@ -1654,7 +1683,7 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
         si.type == AV_PICTURE_TYPE_B) {
         av_log(avctx, AV_LOG_ERROR, "Invalid decoder state: B-frame without "
                "reference data.\n");
-        return AVERROR_INVALIDDATA;
+        faulty_b = 1;
     }
     if(   (avctx->skip_frame >= AVDISCARD_NONREF && si.type==AV_PICTURE_TYPE_B)
        || (avctx->skip_frame >= AVDISCARD_NONKEY && si.type!=AV_PICTURE_TYPE_I)
@@ -1663,8 +1692,8 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
 
     /* first slice */
     if (si.start == 0) {
-        if (s->mb_num_left > 0) {
-            av_log(avctx, AV_LOG_ERROR, "New frame but still %d MB left.",
+        if (s->mb_num_left > 0 && s->current_picture_ptr) {
+            av_log(avctx, AV_LOG_ERROR, "New frame but still %d MB left.\n",
                    s->mb_num_left);
             ff_er_frame_end(&s->er);
             ff_mpv_frame_end(s);
@@ -1676,6 +1705,12 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
             av_log(s->avctx, AV_LOG_WARNING, "Changing dimensions to %dx%d\n",
                    si.width, si.height);
 
+            if (av_image_check_size(si.width, si.height, 0, s->avctx))
+                return AVERROR_INVALIDDATA;
+
+            s->avctx->sample_aspect_ratio = update_sar(
+                s->width, s->height, s->avctx->sample_aspect_ratio,
+                si.width, si.height);
             s->width  = si.width;
             s->height = si.height;
 
@@ -1738,6 +1773,8 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
                "multithreading mode (start MB is %d).\n", si.start);
         return AVERROR_INVALIDDATA;
     }
+    if (faulty_b)
+        return AVERROR_INVALIDDATA;
 
     for(i = 0; i < slice_count; i++){
         int offset = get_slice_offset(avctx, slices_hdr, i);
diff --git a/libavcodec/rv34.h b/libavcodec/rv34.h
index c32c089cdd..e2f40c8e5a 100644
--- a/libavcodec/rv34.h
+++ b/libavcodec/rv34.h
@@ -2,20 +2,20 @@
  * RV30/40 decoder common data declarations
  * Copyright (c) 2007 Mike Melanson, Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -102,13 +102,15 @@ typedef struct RV34DecContext{
     int dmv[4][2];           ///< differential motion vectors for the current macroblock
 
     int rv30;                ///< indicates which RV variasnt is currently decoded
-    int rpr;                 ///< one field size in RV30 slice header
+    int max_rpr;
 
     int cur_pts, last_pts, next_pts;
     int scaled_weight;
     int weight1, weight2;    ///< B frame distance fractions (0.14) used in motion compensation
     int mv_weight1, mv_weight2;
 
+    int orig_width, orig_height;
+
     uint16_t *cbp_luma;      ///< CBP values for luma subblocks
     uint8_t  *cbp_chroma;    ///< CBP values for chroma subblocks
     uint16_t *deblock_coefs; ///< deblock coefficients for each macroblock
diff --git a/libavcodec/rv34_parser.c b/libavcodec/rv34_parser.c
index ec6d3a5644..765d390550 100644
--- a/libavcodec/rv34_parser.c
+++ b/libavcodec/rv34_parser.c
@@ -2,20 +2,20 @@
  * RV30/40 parser
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34data.h b/libavcodec/rv34data.h
index 30641249da..4b2701fee6 100644
--- a/libavcodec/rv34data.h
+++ b/libavcodec/rv34data.h
@@ -2,20 +2,20 @@
  * RealVideo 4 decoder
  * copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34dsp.c b/libavcodec/rv34dsp.c
index 7234ee808b..c3f245eb85 100644
--- a/libavcodec/rv34dsp.c
+++ b/libavcodec/rv34dsp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Mike Melanson, Konstantin Shishkov
  * Copyright (c) 2011 Janne Grunau
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h
index 1aa80cf7ef..2e9ec4eee4 100644
--- a/libavcodec/rv34dsp.h
+++ b/libavcodec/rv34dsp.h
@@ -2,20 +2,20 @@
  * RV30/40 decoder motion compensation functions
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34vlc.h b/libavcodec/rv34vlc.h
index f4670c1625..aa29357c78 100644
--- a/libavcodec/rv34vlc.h
+++ b/libavcodec/rv34vlc.h
@@ -2,20 +2,20 @@
  * RealVideo 3/4 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv40.c b/libavcodec/rv40.c
index e6c77e8710..3ff1554d3c 100644
--- a/libavcodec/rv40.c
+++ b/libavcodec/rv40.c
@@ -2,20 +2,20 @@
  * RV40 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -230,8 +230,11 @@ static int rv40_decode_mb_info(RV34DecContext *r)
     int prev_type = 0;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
 
-    if(!r->s.mb_skip_run)
+    if(!r->s.mb_skip_run) {
         r->s.mb_skip_run = svq3_get_ue_golomb(gb) + 1;
+        if(r->s.mb_skip_run > (unsigned)s->mb_num)
+            return -1;
+    }
 
     if(--r->s.mb_skip_run)
          return RV34_MB_SKIP;
@@ -358,7 +361,7 @@ static void rv40_loop_filter(RV34DecContext *r, int row)
     int uvcbp[4][2];
     /**
      * This mask represents the pattern of luma subblocks that should be filtered
-     * in addition to the coded ones because because they lie at the edge of
+     * in addition to the coded ones because they lie at the edge of
      * 8x8 block with different enough motion vectors
      */
     unsigned mvmasks[4];
diff --git a/libavcodec/rv40data.h b/libavcodec/rv40data.h
index 42328af5a8..36f9f919bd 100644
--- a/libavcodec/rv40data.h
+++ b/libavcodec/rv40data.h
@@ -2,20 +2,20 @@
  * RealVideo 4 decoder
  * copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c
index da3efb4442..19b0e93696 100644
--- a/libavcodec/rv40dsp.c
+++ b/libavcodec/rv40dsp.c
@@ -2,20 +2,20 @@
  * RV40 decoder motion compensation functions
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include "pixels.h"
 #include "rnd_avg.h"
 #include "rv34dsp.h"
+#include "libavutil/avassert.h"
 
 #define RV40_LOWPASS(OPNAME, OP) \
 static void OPNAME ## rv40_qpel8_h_lowpass(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride,\
@@ -299,7 +300,7 @@ static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
     int i;\
     int bias = rv40_bias[y>>1][x>>1];\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i = 0; i < h; i++){\
@@ -332,7 +333,7 @@ static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
     int i;\
     int bias = rv40_bias[y>>1][x>>1];\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i = 0; i < h; i++){\
diff --git a/libavcodec/rv40vlc2.h b/libavcodec/rv40vlc2.h
index 2f63fc27e3..15119a145b 100644
--- a/libavcodec/rv40vlc2.h
+++ b/libavcodec/rv40vlc2.h
@@ -2,20 +2,20 @@
  * RealVideo 4 decoder
  * copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/s302m.c b/libavcodec/s302m.c
index 635f697b5c..ccfb5913a0 100644
--- a/libavcodec/s302m.c
+++ b/libavcodec/s302m.c
@@ -3,24 +3,25 @@
  * Copyright (c) 2008 Laurent Aimar <fenrir@videolan.org>
  * Copyright (c) 2009 Baptiste Coudurier <baptiste.coudurier@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
 #include "libavutil/log.h"
 #include "avcodec.h"
 #include "internal.h"
@@ -28,6 +29,11 @@
 
 #define AES3_HEADER_LEN 4
 
+typedef struct S302Context {
+    AVClass *class;
+    int non_pcm_mode;
+} S302Context;
+
 static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf,
                                     int buf_size)
 {
@@ -59,18 +65,26 @@ static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf,
     }
 
     /* Set output properties */
-    avctx->bits_per_coded_sample = bits;
+    avctx->bits_per_raw_sample = bits;
     if (bits > 16)
         avctx->sample_fmt = AV_SAMPLE_FMT_S32;
     else
         avctx->sample_fmt = AV_SAMPLE_FMT_S16;
 
     avctx->channels    = channels;
-    avctx->sample_rate = 48000;
-    avctx->bit_rate    = 48000 * avctx->channels * (avctx->bits_per_coded_sample + 4) +
-                         32 * (48000 / (buf_size * 8 /
-                                        (avctx->channels *
-                                         (avctx->bits_per_coded_sample + 4))));
+    switch(channels) {
+        case 2:
+            avctx->channel_layout = AV_CH_LAYOUT_STEREO;
+            break;
+        case 4:
+            avctx->channel_layout = AV_CH_LAYOUT_QUAD;
+            break;
+        case 6:
+            avctx->channel_layout = AV_CH_LAYOUT_5POINT1_BACK;
+            break;
+        case 8:
+            avctx->channel_layout = AV_CH_LAYOUT_5POINT1_BACK | AV_CH_LAYOUT_STEREO_DOWNMIX;
+    }
 
     return frame_size;
 }
@@ -78,10 +92,13 @@ static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf,
 static int s302m_decode_frame(AVCodecContext *avctx, void *data,
                               int *got_frame_ptr, AVPacket *avpkt)
 {
+    S302Context *s = avctx->priv_data;
     AVFrame *frame     = data;
     const uint8_t *buf = avpkt->data;
     int buf_size       = avpkt->size;
     int block_size, ret;
+    int i;
+    int non_pcm_data_type = -1;
 
     int frame_size = s302m_parse_frame_header(avctx, buf, buf_size);
     if (frame_size < 0)
@@ -91,16 +108,16 @@ static int s302m_decode_frame(AVCodecContext *avctx, void *data,
     buf      += AES3_HEADER_LEN;
 
     /* get output buffer */
-    block_size = (avctx->bits_per_coded_sample + 4) / 4;
+    block_size = (avctx->bits_per_raw_sample + 4) / 4;
     frame->nb_samples = 2 * (buf_size / block_size) / avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
+    avctx->bit_rate = 48000 * avctx->channels * (avctx->bits_per_raw_sample + 4) +
+                      32 * 48000 / frame->nb_samples;
     buf_size = (frame->nb_samples * avctx->channels / 2) * block_size;
 
-    if (avctx->bits_per_coded_sample == 24) {
+    if (avctx->bits_per_raw_sample == 24) {
         uint32_t *o = (uint32_t *)frame->data[0];
         for (; buf_size > 6; buf_size -= 7) {
             *o++ = (ff_reverse[buf[2]]        << 24) |
@@ -112,7 +129,17 @@ static int s302m_decode_frame(AVCodecContext *avctx, void *data,
                    (ff_reverse[buf[3] & 0x0f] <<  4);
             buf += 7;
         }
-    } else if (avctx->bits_per_coded_sample == 20) {
+        o = (uint32_t *)frame->data[0];
+        if (avctx->channels == 2)
+            for (i=0; i<frame->nb_samples * 2 - 6; i+=2) {
+                if (o[i] || o[i+1] || o[i+2] || o[i+3])
+                    break;
+                if (o[i+4] == 0x96F87200U && o[i+5] == 0xA54E1F00) {
+                    non_pcm_data_type = (o[i+6] >> 16) & 0x1F;
+                    break;
+                }
+            }
+    } else if (avctx->bits_per_raw_sample == 20) {
         uint32_t *o = (uint32_t *)frame->data[0];
         for (; buf_size > 5; buf_size -= 6) {
             *o++ = (ff_reverse[buf[2] & 0xf0] << 28) |
@@ -123,6 +150,16 @@ static int s302m_decode_frame(AVCodecContext *avctx, void *data,
                    (ff_reverse[buf[3]]        << 12);
             buf += 6;
         }
+        o = (uint32_t *)frame->data[0];
+        if (avctx->channels == 2)
+            for (i=0; i<frame->nb_samples * 2 - 6; i+=2) {
+                if (o[i] || o[i+1] || o[i+2] || o[i+3])
+                    break;
+                if (o[i+4] == 0x6F872000U && o[i+5] == 0x54E1F000) {
+                    non_pcm_data_type = (o[i+6] >> 16) & 0x1F;
+                    break;
+                }
+            }
     } else {
         uint16_t *o = (uint16_t *)frame->data[0];
         for (; buf_size > 4; buf_size -= 5) {
@@ -133,18 +170,61 @@ static int s302m_decode_frame(AVCodecContext *avctx, void *data,
                    (ff_reverse[buf[2]]        >>  4);
             buf += 5;
         }
+        o = (uint16_t *)frame->data[0];
+        if (avctx->channels == 2)
+            for (i=0; i<frame->nb_samples * 2 - 6; i+=2) {
+                if (o[i] || o[i+1] || o[i+2] || o[i+3])
+                    break;
+                if (o[i+4] == 0xF872U && o[i+5] == 0x4E1F) {
+                    non_pcm_data_type = (o[i+6] & 0x1F);
+                    break;
+                }
+            }
+    }
+
+    if (non_pcm_data_type != -1) {
+        if (s->non_pcm_mode == 3) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "S302 non PCM mode with data type %d not supported\n",
+                   non_pcm_data_type);
+            return AVERROR_PATCHWELCOME;
+        }
+        if (s->non_pcm_mode & 1) {
+            return avpkt->size;
+        }
     }
 
+    avctx->sample_rate = 48000;
+
     *got_frame_ptr = 1;
 
     return avpkt->size;
 }
 
+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_DECODING_PARAM
+static const AVOption s302m_options[] = {
+    {"non_pcm_mode", "Chooses what to do with NON-PCM", offsetof(S302Context, non_pcm_mode), AV_OPT_TYPE_INT, {.i64 = 3}, 0, 3, FLAGS, "non_pcm_mode"},
+    {"copy"        , "Pass NON-PCM through unchanged"     , 0, AV_OPT_TYPE_CONST, {.i64 = 0}, 0, 3, FLAGS, "non_pcm_mode"},
+    {"drop"        , "Drop NON-PCM"                       , 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 3, FLAGS, "non_pcm_mode"},
+    {"decode_copy" , "Decode if possible else passthrough", 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 3, FLAGS, "non_pcm_mode"},
+    {"decode_drop" , "Decode if possible else drop"       , 0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 3, FLAGS, "non_pcm_mode"},
+    {NULL}
+};
+
+static const AVClass s302m_class = {
+    "SMPTE 302M Decoder",
+    av_default_item_name,
+    s302m_options,
+    LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_s302m_decoder = {
     .name           = "s302m",
     .long_name      = NULL_IF_CONFIG_SMALL("SMPTE 302M"),
     .type           = AVMEDIA_TYPE_AUDIO,
     .id             = AV_CODEC_ID_S302M,
+    .priv_data_size = sizeof(S302Context),
     .decode         = s302m_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_class     = &s302m_class,
 };
diff --git a/libavcodec/s302menc.c b/libavcodec/s302menc.c
new file mode 100644
index 0000000000..fbaa845aaa
--- /dev/null
+++ b/libavcodec/s302menc.c
@@ -0,0 +1,178 @@
+/*
+ * SMPTE 302M encoder
+ * Copyright (c) 2010 Google, Inc.
+ * Copyright (c) 2013 Darryl Wallace <wallacdj@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "mathops.h"
+#include "put_bits.h"
+
+#define AES3_HEADER_LEN 4
+
+typedef struct S302MEncContext {
+    uint8_t framing_index; /* Set for even channels on multiple of 192 samples */
+} S302MEncContext;
+
+static av_cold int s302m_encode_init(AVCodecContext *avctx)
+{
+    S302MEncContext *s = avctx->priv_data;
+
+    if (avctx->channels & 1 || avctx->channels > 8) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Encoding %d channel(s) is not allowed. Only 2, 4, 6 and 8 channels are supported.\n",
+               avctx->channels);
+        return AVERROR(EINVAL);
+    }
+
+    switch (avctx->sample_fmt) {
+    case AV_SAMPLE_FMT_S16:
+        avctx->bits_per_raw_sample = 16;
+        break;
+    case AV_SAMPLE_FMT_S32:
+        if (avctx->bits_per_raw_sample > 20) {
+            if (avctx->bits_per_raw_sample > 24)
+                av_log(avctx, AV_LOG_WARNING, "encoding as 24 bits-per-sample\n");
+            avctx->bits_per_raw_sample = 24;
+        } else if (!avctx->bits_per_raw_sample) {
+            avctx->bits_per_raw_sample = 24;
+        } else if (avctx->bits_per_raw_sample <= 20) {
+            avctx->bits_per_raw_sample = 20;
+        }
+    }
+
+    avctx->frame_size = 0;
+    avctx->bit_rate   = 48000 * avctx->channels *
+                       (avctx->bits_per_raw_sample + 4);
+    s->framing_index  = 0;
+
+    return 0;
+}
+
+static int s302m_encode2_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                               const AVFrame *frame, int *got_packet_ptr)
+{
+    S302MEncContext *s = avctx->priv_data;
+    const int buf_size = AES3_HEADER_LEN +
+                        (frame->nb_samples *
+                         avctx->channels *
+                        (avctx->bits_per_raw_sample + 4)) / 8;
+    int ret, c, channels;
+    uint8_t *o;
+    PutBitContext pb;
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, buf_size, 0)) < 0)
+        return ret;
+
+    o = avpkt->data;
+    init_put_bits(&pb, o, buf_size);
+    put_bits(&pb, 16, buf_size - AES3_HEADER_LEN);
+    put_bits(&pb, 2, (avctx->channels - 2) >> 1);   // number of channels
+    put_bits(&pb, 8, 0);                            // channel ID
+    put_bits(&pb, 2, (avctx->bits_per_raw_sample - 16) / 4); // bits per samples (0 = 16bit, 1 = 20bit, 2 = 24bit)
+    put_bits(&pb, 4, 0);                            // alignments
+    flush_put_bits(&pb);
+    o += AES3_HEADER_LEN;
+
+    if (avctx->bits_per_raw_sample == 24) {
+        const uint32_t *samples = (uint32_t *)frame->data[0];
+
+        for (c = 0; c < frame->nb_samples; c++) {
+            uint8_t vucf = s->framing_index == 0 ? 0x10: 0;
+
+            for (channels = 0; channels < avctx->channels; channels += 2) {
+                o[0] = ff_reverse[(samples[0] & 0x0000FF00) >> 8];
+                o[1] = ff_reverse[(samples[0] & 0x00FF0000) >> 16];
+                o[2] = ff_reverse[(samples[0] & 0xFF000000) >> 24];
+                o[3] = ff_reverse[(samples[1] & 0x00000F00) >> 4] | vucf;
+                o[4] = ff_reverse[(samples[1] & 0x000FF000) >> 12];
+                o[5] = ff_reverse[(samples[1] & 0x0FF00000) >> 20];
+                o[6] = ff_reverse[(samples[1] & 0xF0000000) >> 28];
+                o += 7;
+                samples += 2;
+            }
+
+            s->framing_index++;
+            if (s->framing_index >= 192)
+                s->framing_index = 0;
+        }
+    } else if (avctx->bits_per_raw_sample == 20) {
+        const uint32_t *samples = (uint32_t *)frame->data[0];
+
+        for (c = 0; c < frame->nb_samples; c++) {
+            uint8_t vucf = s->framing_index == 0 ? 0x80: 0;
+
+            for (channels = 0; channels < avctx->channels; channels += 2) {
+                o[0] = ff_reverse[ (samples[0] & 0x000FF000) >> 12];
+                o[1] = ff_reverse[ (samples[0] & 0x0FF00000) >> 20];
+                o[2] = ff_reverse[((samples[0] & 0xF0000000) >> 28) | vucf];
+                o[3] = ff_reverse[ (samples[1] & 0x000FF000) >> 12];
+                o[4] = ff_reverse[ (samples[1] & 0x0FF00000) >> 20];
+                o[5] = ff_reverse[ (samples[1] & 0xF0000000) >> 28];
+                o += 6;
+                samples += 2;
+            }
+
+            s->framing_index++;
+            if (s->framing_index >= 192)
+                s->framing_index = 0;
+        }
+    } else if (avctx->bits_per_raw_sample == 16) {
+        const uint16_t *samples = (uint16_t *)frame->data[0];
+
+        for (c = 0; c < frame->nb_samples; c++) {
+            uint8_t vucf = s->framing_index == 0 ? 0x10 : 0;
+
+            for (channels = 0; channels < avctx->channels; channels += 2) {
+                o[0] = ff_reverse[ samples[0] & 0xFF];
+                o[1] = ff_reverse[(samples[0] & 0xFF00) >>  8];
+                o[2] = ff_reverse[(samples[1] & 0x0F)   <<  4] | vucf;
+                o[3] = ff_reverse[(samples[1] & 0x0FF0) >>  4];
+                o[4] = ff_reverse[(samples[1] & 0xF000) >> 12];
+                o += 5;
+                samples += 2;
+
+            }
+
+            s->framing_index++;
+            if (s->framing_index >= 192)
+                s->framing_index = 0;
+        }
+    }
+
+    *got_packet_ptr = 1;
+
+    return 0;
+}
+
+AVCodec ff_s302m_encoder = {
+    .name                  = "s302m",
+    .long_name             = NULL_IF_CONFIG_SMALL("SMPTE 302M"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_S302M,
+    .priv_data_size        = sizeof(S302MEncContext),
+    .init                  = s302m_encode_init,
+    .encode2               = s302m_encode2_frame,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S32,
+                                                            AV_SAMPLE_FMT_S16,
+                                                            AV_SAMPLE_FMT_NONE },
+    .capabilities          = AV_CODEC_CAP_VARIABLE_FRAME_SIZE | AV_CODEC_CAP_EXPERIMENTAL,
+    .supported_samplerates = (const int[]) { 48000, 0 },
+};
diff --git a/libavcodec/samidec.c b/libavcodec/samidec.c
new file mode 100644
index 0000000000..95f35abd42
--- /dev/null
+++ b/libavcodec/samidec.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SAMI subtitle decoder
+ * @see http://msdn.microsoft.com/en-us/library/ms971327.aspx
+ */
+
+#include "ass.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+#include "htmlsubtitles.h"
+
+typedef struct {
+    AVBPrint source;
+    AVBPrint content;
+    AVBPrint encoded_source;
+    AVBPrint encoded_content;
+    AVBPrint full;
+} SAMIContext;
+
+static int sami_paragraph_to_ass(AVCodecContext *avctx, const char *src)
+{
+    SAMIContext *sami = avctx->priv_data;
+    int ret = 0;
+    char *tag = NULL;
+    char *dupsrc = av_strdup(src);
+    char *p = dupsrc;
+    AVBPrint *dst_content = &sami->encoded_content;
+    AVBPrint *dst_source = &sami->encoded_source;
+
+    av_bprint_clear(&sami->encoded_content);
+    av_bprint_clear(&sami->content);
+    av_bprint_clear(&sami->encoded_source);
+    for (;;) {
+        char *saveptr = NULL;
+        int prev_chr_is_space = 0;
+        AVBPrint *dst = &sami->content;
+
+        /* parse & extract paragraph tag */
+        p = av_stristr(p, "<P");
+        if (!p)
+            break;
+        if (p[2] != '>' && !av_isspace(p[2])) { // avoid confusion with tags such as <PRE>
+            p++;
+            continue;
+        }
+        if (dst->len) // add a separator with the previous paragraph if there was one
+            av_bprintf(dst, "\\N");
+        tag = av_strtok(p, ">", &saveptr);
+        if (!tag || !saveptr)
+            break;
+        p = saveptr;
+
+        /* check if the current paragraph is the "source" (speaker name) */
+        if (av_stristr(tag, "ID=Source") || av_stristr(tag, "ID=\"Source\"")) {
+            dst = &sami->source;
+            av_bprint_clear(dst);
+        }
+
+        /* if empty event -> skip subtitle */
+        while (av_isspace(*p))
+            p++;
+        if (!strncmp(p, "&nbsp;", 6)) {
+            ret = -1;
+            goto end;
+        }
+
+        /* extract the text, stripping most of the tags */
+        while (*p) {
+            if (*p == '<') {
+                if (!av_strncasecmp(p, "<P", 2) && (p[2] == '>' || av_isspace(p[2])))
+                    break;
+            }
+            if (!av_strncasecmp(p, "<BR", 3)) {
+                av_bprintf(dst, "\\N");
+                p++;
+                while (*p && *p != '>')
+                    p++;
+                if (!*p)
+                    break;
+                if (*p == '>')
+                    p++;
+                continue;
+            }
+            if (!av_isspace(*p))
+                av_bprint_chars(dst, *p, 1);
+            else if (!prev_chr_is_space)
+                av_bprint_chars(dst, ' ', 1);
+            prev_chr_is_space = av_isspace(*p);
+            p++;
+        }
+    }
+
+    av_bprint_clear(&sami->full);
+    if (sami->source.len) {
+        ff_htmlmarkup_to_ass(avctx, dst_source, sami->source.str);
+        av_bprintf(&sami->full, "{\\i1}%s{\\i0}\\N", sami->encoded_source.str);
+    }
+    ff_htmlmarkup_to_ass(avctx, dst_content, sami->content.str);
+    av_bprintf(&sami->full, "%s", sami->encoded_content.str);
+
+end:
+    av_free(dupsrc);
+    return ret;
+}
+
+static int sami_decode_frame(AVCodecContext *avctx,
+                             void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    SAMIContext *sami = avctx->priv_data;
+
+    if (ptr && avpkt->size > 0 && !sami_paragraph_to_ass(avctx, ptr)) {
+        int ts_start     = av_rescale_q(avpkt->pts, avctx->time_base, (AVRational){1,100});
+        int ts_duration  = avpkt->duration != -1 ?
+                           av_rescale_q(avpkt->duration, avctx->time_base, (AVRational){1,100}) : -1;
+        int ret = ff_ass_add_rect_bprint(sub, &sami->full, ts_start, ts_duration);
+        if (ret < 0)
+            return ret;
+    }
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+static av_cold int sami_init(AVCodecContext *avctx)
+{
+    SAMIContext *sami = avctx->priv_data;
+    av_bprint_init(&sami->source,  0, 2048);
+    av_bprint_init(&sami->content, 0, 2048);
+    av_bprint_init(&sami->encoded_source,  0, 2048);
+    av_bprint_init(&sami->encoded_content, 0, 2048);
+    av_bprint_init(&sami->full,    0, 2048);
+    return ff_ass_subtitle_header_default(avctx);
+}
+
+static av_cold int sami_close(AVCodecContext *avctx)
+{
+    SAMIContext *sami = avctx->priv_data;
+    av_bprint_finalize(&sami->source,  NULL);
+    av_bprint_finalize(&sami->content, NULL);
+    av_bprint_finalize(&sami->encoded_source,  NULL);
+    av_bprint_finalize(&sami->encoded_content, NULL);
+    av_bprint_finalize(&sami->full,    NULL);
+    return 0;
+}
+
+AVCodec ff_sami_decoder = {
+    .name           = "sami",
+    .long_name      = NULL_IF_CONFIG_SMALL("SAMI subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SAMI,
+    .priv_data_size = sizeof(SAMIContext),
+    .init           = sami_init,
+    .close          = sami_close,
+    .decode         = sami_decode_frame,
+};
diff --git a/libavcodec/sanm.c b/libavcodec/sanm.c
index 6436f84895..1aa002b6a5 100644
--- a/libavcodec/sanm.c
+++ b/libavcodec/sanm.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006 Cyril Zorin
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,108 +105,159 @@ static const int8_t motion_vectors[256][2] = {
 };
 
 static const int8_t c37_mv[] = {
-    0,   0,   1,   0,   2,   0,   3,   0,   5,   0,   8,   0,  13,   0,  21,
-    0,  -1,   0,  -2,   0,  -3,   0,  -5,   0,  -8,   0, -13,   0, -17,   0,
-  -21,   0,   0,   1,   1,   1,   2,   1,   3,   1,   5,   1,   8,   1,  13,
-    1,  21,   1,  -1,   1,  -2,   1,  -3,   1,  -5,   1,  -8,   1, -13,   1,
-  -17,   1, -21,   1,   0,   2,   1,   2,   2,   2,   3,   2,   5,   2,   8,
-    2,  13,   2,  21,   2,  -1,   2,  -2,   2,  -3,   2,  -5,   2,  -8,   2,
-  -13,   2, -17,   2, -21,   2,   0,   3,   1,   3,   2,   3,   3,   3,   5,
-    3,   8,   3,  13,   3,  21,   3,  -1,   3,  -2,   3,  -3,   3,  -5,   3,
-   -8,   3, -13,   3, -17,   3, -21,   3,   0,   5,   1,   5,   2,   5,   3,
-    5,   5,   5,   8,   5,  13,   5,  21,   5,  -1,   5,  -2,   5,  -3,   5,
-   -5,   5,  -8,   5, -13,   5, -17,   5, -21,   5,   0,   8,   1,   8,   2,
-    8,   3,   8,   5,   8,   8,   8,  13,   8,  21,   8,  -1,   8,  -2,   8,
-   -3,   8,  -5,   8,  -8,   8, -13,   8, -17,   8, -21,   8,   0,  13,   1,
-   13,   2,  13,   3,  13,   5,  13,   8,  13,  13,  13,  21,  13,  -1,  13,
-   -2,  13,  -3,  13,  -5,  13,  -8,  13, -13,  13, -17,  13, -21,  13,   0,
-   21,   1,  21,   2,  21,   3,  21,   5,  21,   8,  21,  13,  21,  21,  21,
-   -1,  21,  -2,  21,  -3,  21,  -5,  21,  -8,  21, -13,  21, -17,  21, -21,
-   21,   0,  -1,   1,  -1,   2,  -1,   3,  -1,   5,  -1,   8,  -1,  13,  -1,
-   21,  -1,  -1,  -1,  -2,  -1,  -3,  -1,  -5,  -1,  -8,  -1, -13,  -1, -17,
-   -1, -21,  -1,   0,  -2,   1,  -2,   2,  -2,   3,  -2,   5,  -2,   8,  -2,
-   13,  -2,  21,  -2,  -1,  -2,  -2,  -2,  -3,  -2,  -5,  -2,  -8,  -2, -13,
-   -2, -17,  -2, -21,  -2,   0,  -3,   1,  -3,   2,  -3,   3,  -3,   5,  -3,
-    8,  -3,  13,  -3,  21,  -3,  -1,  -3,  -2,  -3,  -3,  -3,  -5,  -3,  -8,
-   -3, -13,  -3, -17,  -3, -21,  -3,   0,  -5,   1,  -5,   2,  -5,   3,  -5,
-    5,  -5,   8,  -5,  13,  -5,  21,  -5,  -1,  -5,  -2,  -5,  -3,  -5,  -5,
-   -5,  -8,  -5, -13,  -5, -17,  -5, -21,  -5,   0,  -8,   1,  -8,   2,  -8,
-    3,  -8,   5,  -8,   8,  -8,  13,  -8,  21,  -8,  -1,  -8,  -2,  -8,  -3,
-   -8,  -5,  -8,  -8,  -8, -13,  -8, -17,  -8, -21,  -8,   0, -13,   1, -13,
-    2, -13,   3, -13,   5, -13,   8, -13,  13, -13,  21, -13,  -1, -13,  -2,
-  -13,  -3, -13,  -5, -13,  -8, -13, -13, -13, -17, -13, -21, -13,   0, -17,
-    1, -17,   2, -17,   3, -17,   5, -17,   8, -17,  13, -17,  21, -17,  -1,
-  -17,  -2, -17,  -3, -17,  -5, -17,  -8, -17, -13, -17, -17, -17, -21, -17,
-    0, -21,   1, -21,   2, -21,   3, -21,   5, -21,   8, -21,  13, -21,  21,
-  -21,  -1, -21,  -2, -21,  -3, -21,  -5, -21,  -8, -21, -13, -21, -17, -21,
-    0,   0,  -8, -29,   8, -29, -18, -25,  17, -25,   0, -23,  -6, -22,   6,
-  -22, -13, -19,  12, -19,   0, -18,  25, -18, -25, -17,  -5, -17,   5, -17,
-  -10, -15,  10, -15,   0, -14,  -4, -13,   4, -13,  19, -13, -19, -12,  -8,
-  -11,  -2, -11,   0, -11,   2, -11,   8, -11, -15, -10,  -4, -10,   4, -10,
-   15, -10,  -6,  -9,  -1,  -9,   1,  -9,   6,  -9, -29,  -8, -11,  -8,  -8,
-   -8,  -3,  -8,   3,  -8,   8,  -8,  11,  -8,  29,  -8,  -5,  -7,  -2,  -7,
-    0,  -7,   2,  -7,   5,  -7, -22,  -6,  -9,  -6,  -6,  -6,  -3,  -6,  -1,
-   -6,   1,  -6,   3,  -6,   6,  -6,   9,  -6,  22,  -6, -17,  -5,  -7,  -5,
-   -4,  -5,  -2,  -5,   0,  -5,   2,  -5,   4,  -5,   7,  -5,  17,  -5, -13,
-   -4, -10,  -4,  -5,  -4,  -3,  -4,  -1,  -4,   0,  -4,   1,  -4,   3,  -4,
-    5,  -4,  10,  -4,  13,  -4,  -8,  -3,  -6,  -3,  -4,  -3,  -3,  -3,  -2,
-   -3,  -1,  -3,   0,  -3,   1,  -3,   2,  -3,   4,  -3,   6,  -3,   8,  -3,
-  -11,  -2,  -7,  -2,  -5,  -2,  -3,  -2,  -2,  -2,  -1,  -2,   0,  -2,   1,
-   -2,   2,  -2,   3,  -2,   5,  -2,   7,  -2,  11,  -2,  -9,  -1,  -6,  -1,
-   -4,  -1,  -3,  -1,  -2,  -1,  -1,  -1,   0,  -1,   1,  -1,   2,  -1,   3,
-   -1,   4,  -1,   6,  -1,   9,  -1, -31,   0, -23,   0, -18,   0, -14,   0,
-  -11,   0,  -7,   0,  -5,   0,  -4,   0,  -3,   0,  -2,   0,  -1,   0,   0,
-  -31,   1,   0,   2,   0,   3,   0,   4,   0,   5,   0,   7,   0,  11,   0,
-   14,   0,  18,   0,  23,   0,  31,   0,  -9,   1,  -6,   1,  -4,   1,  -3,
-    1,  -2,   1,  -1,   1,   0,   1,   1,   1,   2,   1,   3,   1,   4,   1,
-    6,   1,   9,   1, -11,   2,  -7,   2,  -5,   2,  -3,   2,  -2,   2,  -1,
-    2,   0,   2,   1,   2,   2,   2,   3,   2,   5,   2,   7,   2,  11,   2,
-   -8,   3,  -6,   3,  -4,   3,  -2,   3,  -1,   3,   0,   3,   1,   3,   2,
-    3,   3,   3,   4,   3,   6,   3,   8,   3, -13,   4, -10,   4,  -5,   4,
-   -3,   4,  -1,   4,   0,   4,   1,   4,   3,   4,   5,   4,  10,   4,  13,
-    4, -17,   5,  -7,   5,  -4,   5,  -2,   5,   0,   5,   2,   5,   4,   5,
-    7,   5,  17,   5, -22,   6,  -9,   6,  -6,   6,  -3,   6,  -1,   6,   1,
-    6,   3,   6,   6,   6,   9,   6,  22,   6,  -5,   7,  -2,   7,   0,   7,
-    2,   7,   5,   7, -29,   8, -11,   8,  -8,   8,  -3,   8,   3,   8,   8,
-    8,  11,   8,  29,   8,  -6,   9,  -1,   9,   1,   9,   6,   9, -15,  10,
-   -4,  10,   4,  10,  15,  10,  -8,  11,  -2,  11,   0,  11,   2,  11,   8,
-   11,  19,  12, -19,  13,  -4,  13,   4,  13,   0,  14, -10,  15,  10,  15,
-   -5,  17,   5,  17,  25,  17, -25,  18,   0,  18, -12,  19,  13,  19,  -6,
-   22,   6,  22,   0,  23, -17,  25,  18,  25,  -8,  29,   8,  29,   0,  31,
-    0,   0,  -6, -22,   6, -22, -13, -19,  12, -19,   0, -18,  -5, -17,   5,
-  -17, -10, -15,  10, -15,   0, -14,  -4, -13,   4, -13,  19, -13, -19, -12,
-   -8, -11,  -2, -11,   0, -11,   2, -11,   8, -11, -15, -10,  -4, -10,   4,
-  -10,  15, -10,  -6,  -9,  -1,  -9,   1,  -9,   6,  -9, -11,  -8,  -8,  -8,
-   -3,  -8,   0,  -8,   3,  -8,   8,  -8,  11,  -8,  -5,  -7,  -2,  -7,   0,
-   -7,   2,  -7,   5,  -7, -22,  -6,  -9,  -6,  -6,  -6,  -3,  -6,  -1,  -6,
-    1,  -6,   3,  -6,   6,  -6,   9,  -6,  22,  -6, -17,  -5,  -7,  -5,  -4,
-   -5,  -2,  -5,  -1,  -5,   0,  -5,   1,  -5,   2,  -5,   4,  -5,   7,  -5,
-   17,  -5, -13,  -4, -10,  -4,  -5,  -4,  -3,  -4,  -2,  -4,  -1,  -4,   0,
-   -4,   1,  -4,   2,  -4,   3,  -4,   5,  -4,  10,  -4,  13,  -4,  -8,  -3,
-   -6,  -3,  -4,  -3,  -3,  -3,  -2,  -3,  -1,  -3,   0,  -3,   1,  -3,   2,
-   -3,   3,  -3,   4,  -3,   6,  -3,   8,  -3, -11,  -2,  -7,  -2,  -5,  -2,
-   -4,  -2,  -3,  -2,  -2,  -2,  -1,  -2,   0,  -2,   1,  -2,   2,  -2,   3,
-   -2,   4,  -2,   5,  -2,   7,  -2,  11,  -2,  -9,  -1,  -6,  -1,  -5,  -1,
-   -4,  -1,  -3,  -1,  -2,  -1,  -1,  -1,   0,  -1,   1,  -1,   2,  -1,   3,
-   -1,   4,  -1,   5,  -1,   6,  -1,   9,  -1, -23,   0, -18,   0, -14,   0,
-  -11,   0,  -7,   0,  -5,   0,  -4,   0,  -3,   0,  -2,   0,  -1,   0,   0,
-  -23,   1,   0,   2,   0,   3,   0,   4,   0,   5,   0,   7,   0,  11,   0,
-   14,   0,  18,   0,  23,   0,  -9,   1,  -6,   1,  -5,   1,  -4,   1,  -3,
-    1,  -2,   1,  -1,   1,   0,   1,   1,   1,   2,   1,   3,   1,   4,   1,
-    5,   1,   6,   1,   9,   1, -11,   2,  -7,   2,  -5,   2,  -4,   2,  -3,
-    2,  -2,   2,  -1,   2,   0,   2,   1,   2,   2,   2,   3,   2,   4,   2,
-    5,   2,   7,   2,  11,   2,  -8,   3,  -6,   3,  -4,   3,  -3,   3,  -2,
-    3,  -1,   3,   0,   3,   1,   3,   2,   3,   3,   3,   4,   3,   6,   3,
-    8,   3, -13,   4, -10,   4,  -5,   4,  -3,   4,  -2,   4,  -1,   4,   0,
-    4,   1,   4,   2,   4,   3,   4,   5,   4,  10,   4,  13,   4, -17,   5,
-   -7,   5,  -4,   5,  -2,   5,  -1,   5,   0,   5,   1,   5,   2,   5,   4,
-    5,   7,   5,  17,   5, -22,   6,  -9,   6,  -6,   6,  -3,   6,  -1,   6,
-    1,   6,   3,   6,   6,   6,   9,   6,  22,   6,  -5,   7,  -2,   7,   0,
-    7,   2,   7,   5,   7, -11,   8,  -8,   8,  -3,   8,   0,   8,   3,   8,
-    8,   8,  11,   8,  -6,   9,  -1,   9,   1,   9,   6,   9, -15,  10,  -4,
-   10,   4,  10,  15,  10,  -8,  11,  -2,  11,   0,  11,   2,  11,   8,  11,
-   19,  12, -19,  13,  -4,  13,   4,  13,   0,  14, -10,  15,  10,  15,  -5,
-   17,   5,  17,   0,  18, -12,  19,  13,  19,  -6,  22,   6,  22,   0,  23,
+    0,   0,   1,   0,   2,   0,   3,   0,   5,   0,
+    8,   0,  13,   0,  21,   0,  -1,   0,  -2,   0,
+   -3,   0,  -5,   0,  -8,   0, -13,   0, -17,   0,
+  -21,   0,   0,   1,   1,   1,   2,   1,   3,   1,
+    5,   1,   8,   1,  13,   1,  21,   1,  -1,   1,
+   -2,   1,  -3,   1,  -5,   1,  -8,   1, -13,   1,
+  -17,   1, -21,   1,   0,   2,   1,   2,   2,   2,
+    3,   2,   5,   2,   8,   2,  13,   2,  21,   2,
+   -1,   2,  -2,   2,  -3,   2,  -5,   2,  -8,   2,
+  -13,   2, -17,   2, -21,   2,   0,   3,   1,   3,
+    2,   3,   3,   3,   5,   3,   8,   3,  13,   3,
+   21,   3,  -1,   3,  -2,   3,  -3,   3,  -5,   3,
+   -8,   3, -13,   3, -17,   3, -21,   3,   0,   5,
+    1,   5,   2,   5,   3,   5,   5,   5,   8,   5,
+   13,   5,  21,   5,  -1,   5,  -2,   5,  -3,   5,
+   -5,   5,  -8,   5, -13,   5, -17,   5, -21,   5,
+    0,   8,   1,   8,   2,   8,   3,   8,   5,   8,
+    8,   8,  13,   8,  21,   8,  -1,   8,  -2,   8,
+   -3,   8,  -5,   8,  -8,   8, -13,   8, -17,   8,
+  -21,   8,   0,  13,   1,  13,   2,  13,   3,  13,
+    5,  13,   8,  13,  13,  13,  21,  13,  -1,  13,
+   -2,  13,  -3,  13,  -5,  13,  -8,  13, -13,  13,
+  -17,  13, -21,  13,   0,  21,   1,  21,   2,  21,
+    3,  21,   5,  21,   8,  21,  13,  21,  21,  21,
+   -1,  21,  -2,  21,  -3,  21,  -5,  21,  -8,  21,
+  -13,  21, -17,  21, -21,  21,   0,  -1,   1,  -1,
+    2,  -1,   3,  -1,   5,  -1,   8,  -1,  13,  -1,
+   21,  -1,  -1,  -1,  -2,  -1,  -3,  -1,  -5,  -1,
+   -8,  -1, -13,  -1, -17,  -1, -21,  -1,   0,  -2,
+    1,  -2,   2,  -2,   3,  -2,   5,  -2,   8,  -2,
+   13,  -2,  21,  -2,  -1,  -2,  -2,  -2,  -3,  -2,
+   -5,  -2,  -8,  -2, -13,  -2, -17,  -2, -21,  -2,
+    0,  -3,   1,  -3,   2,  -3,   3,  -3,   5,  -3,
+    8,  -3,  13,  -3,  21,  -3,  -1,  -3,  -2,  -3,
+   -3,  -3,  -5,  -3,  -8,  -3, -13,  -3, -17,  -3,
+  -21,  -3,   0,  -5,   1,  -5,   2,  -5,   3,  -5,
+    5,  -5,   8,  -5,  13,  -5,  21,  -5,  -1,  -5,
+   -2,  -5,  -3,  -5,  -5,  -5,  -8,  -5, -13,  -5,
+  -17,  -5, -21,  -5,   0,  -8,   1,  -8,   2,  -8,
+    3,  -8,   5,  -8,   8,  -8,  13,  -8,  21,  -8,
+   -1,  -8,  -2,  -8,  -3,  -8,  -5,  -8,  -8,  -8,
+  -13,  -8, -17,  -8, -21,  -8,   0, -13,   1, -13,
+    2, -13,   3, -13,   5, -13,   8, -13,  13, -13,
+   21, -13,  -1, -13,  -2, -13,  -3, -13,  -5, -13,
+   -8, -13, -13, -13, -17, -13, -21, -13,   0, -17,
+    1, -17,   2, -17,   3, -17,   5, -17,   8, -17,
+   13, -17,  21, -17,  -1, -17,  -2, -17,  -3, -17,
+   -5, -17,  -8, -17, -13, -17, -17, -17, -21, -17,
+    0, -21,   1, -21,   2, -21,   3, -21,   5, -21,
+    8, -21,  13, -21,  21, -21,  -1, -21,  -2, -21,
+   -3, -21,  -5, -21,  -8, -21, -13, -21, -17, -21,
+    0,   0,  -8, -29,   8, -29, -18, -25,  17, -25,
+    0, -23,  -6, -22,   6, -22, -13, -19,  12, -19,
+    0, -18,  25, -18, -25, -17,  -5, -17,   5, -17,
+  -10, -15,  10, -15,   0, -14,  -4, -13,   4, -13,
+   19, -13, -19, -12,  -8, -11,  -2, -11,   0, -11,
+    2, -11,   8, -11, -15, -10,  -4, -10,   4, -10,
+   15, -10,  -6,  -9,  -1,  -9,   1,  -9,   6,  -9,
+  -29,  -8, -11,  -8,  -8,  -8,  -3,  -8,   3,  -8,
+    8,  -8,  11,  -8,  29,  -8,  -5,  -7,  -2,  -7,
+    0,  -7,   2,  -7,   5,  -7, -22,  -6,  -9,  -6,
+   -6,  -6,  -3,  -6,  -1,  -6,   1,  -6,   3,  -6,
+    6,  -6,   9,  -6,  22,  -6, -17,  -5,  -7,  -5,
+   -4,  -5,  -2,  -5,   0,  -5,   2,  -5,   4,  -5,
+    7,  -5,  17,  -5, -13,  -4, -10,  -4,  -5,  -4,
+   -3,  -4,  -1,  -4,   0,  -4,   1,  -4,   3,  -4,
+    5,  -4,  10,  -4,  13,  -4,  -8,  -3,  -6,  -3,
+   -4,  -3,  -3,  -3,  -2,  -3,  -1,  -3,   0,  -3,
+    1,  -3,   2,  -3,   4,  -3,   6,  -3,   8,  -3,
+  -11,  -2,  -7,  -2,  -5,  -2,  -3,  -2,  -2,  -2,
+   -1,  -2,   0,  -2,   1,  -2,   2,  -2,   3,  -2,
+    5,  -2,   7,  -2,  11,  -2,  -9,  -1,  -6,  -1,
+   -4,  -1,  -3,  -1,  -2,  -1,  -1,  -1,   0,  -1,
+    1,  -1,   2,  -1,   3,  -1,   4,  -1,   6,  -1,
+    9,  -1, -31,   0, -23,   0, -18,   0, -14,   0,
+  -11,   0,  -7,   0,  -5,   0,  -4,   0,  -3,   0,
+   -2,   0,  -1,   0,   0, -31,   1,   0,   2,   0,
+    3,   0,   4,   0,   5,   0,   7,   0,  11,   0,
+   14,   0,  18,   0,  23,   0,  31,   0,  -9,   1,
+   -6,   1,  -4,   1,  -3,   1,  -2,   1,  -1,   1,
+    0,   1,   1,   1,   2,   1,   3,   1,   4,   1,
+    6,   1,   9,   1, -11,   2,  -7,   2,  -5,   2,
+   -3,   2,  -2,   2,  -1,   2,   0,   2,   1,   2,
+    2,   2,   3,   2,   5,   2,   7,   2,  11,   2,
+   -8,   3,  -6,   3,  -4,   3,  -2,   3,  -1,   3,
+    0,   3,   1,   3,   2,   3,   3,   3,   4,   3,
+    6,   3,   8,   3, -13,   4, -10,   4,  -5,   4,
+   -3,   4,  -1,   4,   0,   4,   1,   4,   3,   4,
+    5,   4,  10,   4,  13,   4, -17,   5,  -7,   5,
+   -4,   5,  -2,   5,   0,   5,   2,   5,   4,   5,
+    7,   5,  17,   5, -22,   6,  -9,   6,  -6,   6,
+   -3,   6,  -1,   6,   1,   6,   3,   6,   6,   6,
+    9,   6,  22,   6,  -5,   7,  -2,   7,   0,   7,
+    2,   7,   5,   7, -29,   8, -11,   8,  -8,   8,
+   -3,   8,   3,   8,   8,   8,  11,   8,  29,   8,
+   -6,   9,  -1,   9,   1,   9,   6,   9, -15,  10,
+   -4,  10,   4,  10,  15,  10,  -8,  11,  -2,  11,
+    0,  11,   2,  11,   8,  11,  19,  12, -19,  13,
+   -4,  13,   4,  13,   0,  14, -10,  15,  10,  15,
+   -5,  17,   5,  17,  25,  17, -25,  18,   0,  18,
+  -12,  19,  13,  19,  -6,  22,   6,  22,   0,  23,
+  -17,  25,  18,  25,  -8,  29,   8,  29,   0,  31,
+    0,   0,  -6, -22,   6, -22, -13, -19,  12, -19,
+    0, -18,  -5, -17,   5, -17, -10, -15,  10, -15,
+    0, -14,  -4, -13,   4, -13,  19, -13, -19, -12,
+   -8, -11,  -2, -11,   0, -11,   2, -11,   8, -11,
+  -15, -10,  -4, -10,   4, -10,  15, -10,  -6,  -9,
+   -1,  -9,   1,  -9,   6,  -9, -11,  -8,  -8,  -8,
+   -3,  -8,   0,  -8,   3,  -8,   8,  -8,  11,  -8,
+   -5,  -7,  -2,  -7,   0,  -7,   2,  -7,   5,  -7,
+  -22,  -6,  -9,  -6,  -6,  -6,  -3,  -6,  -1,  -6,
+    1,  -6,   3,  -6,   6,  -6,   9,  -6,  22,  -6,
+  -17,  -5,  -7,  -5,  -4,  -5,  -2,  -5,  -1,  -5,
+    0,  -5,   1,  -5,   2,  -5,   4,  -5,   7,  -5,
+   17,  -5, -13,  -4, -10,  -4,  -5,  -4,  -3,  -4,
+   -2,  -4,  -1,  -4,   0,  -4,   1,  -4,   2,  -4,
+    3,  -4,   5,  -4,  10,  -4,  13,  -4,  -8,  -3,
+   -6,  -3,  -4,  -3,  -3,  -3,  -2,  -3,  -1,  -3,
+    0,  -3,   1,  -3,   2,  -3,   3,  -3,   4,  -3,
+    6,  -3,   8,  -3, -11,  -2,  -7,  -2,  -5,  -2,
+   -4,  -2,  -3,  -2,  -2,  -2,  -1,  -2,   0,  -2,
+    1,  -2,   2,  -2,   3,  -2,   4,  -2,   5,  -2,
+    7,  -2,  11,  -2,  -9,  -1,  -6,  -1,  -5,  -1,
+   -4,  -1,  -3,  -1,  -2,  -1,  -1,  -1,   0,  -1,
+    1,  -1,   2,  -1,   3,  -1,   4,  -1,   5,  -1,
+    6,  -1,   9,  -1, -23,   0, -18,   0, -14,   0,
+  -11,   0,  -7,   0,  -5,   0,  -4,   0,  -3,   0,
+   -2,   0,  -1,   0,   0, -23,   1,   0,   2,   0,
+    3,   0,   4,   0,   5,   0,   7,   0,  11,   0,
+   14,   0,  18,   0,  23,   0,  -9,   1,  -6,   1,
+   -5,   1,  -4,   1,  -3,   1,  -2,   1,  -1,   1,
+    0,   1,   1,   1,   2,   1,   3,   1,   4,   1,
+    5,   1,   6,   1,   9,   1, -11,   2,  -7,   2,
+   -5,   2,  -4,   2,  -3,   2,  -2,   2,  -1,   2,
+    0,   2,   1,   2,   2,   2,   3,   2,   4,   2,
+    5,   2,   7,   2,  11,   2,  -8,   3,  -6,   3,
+   -4,   3,  -3,   3,  -2,   3,  -1,   3,   0,   3,
+    1,   3,   2,   3,   3,   3,   4,   3,   6,   3,
+    8,   3, -13,   4, -10,   4,  -5,   4,  -3,   4,
+   -2,   4,  -1,   4,   0,   4,   1,   4,   2,   4,
+    3,   4,   5,   4,  10,   4,  13,   4, -17,   5,
+   -7,   5,  -4,   5,  -2,   5,  -1,   5,   0,   5,
+    1,   5,   2,   5,   4,   5,   7,   5,  17,   5,
+  -22,   6,  -9,   6,  -6,   6,  -3,   6,  -1,   6,
+    1,   6,   3,   6,   6,   6,   9,   6,  22,   6,
+   -5,   7,  -2,   7,   0,   7,   2,   7,   5,   7,
+  -11,   8,  -8,   8,  -3,   8,   0,   8,   3,   8,
+    8,   8,  11,   8,  -6,   9,  -1,   9,   1,   9,
+    6,   9, -15,  10,  -4,  10,   4,  10,  15,  10,
+   -8,  11,  -2,  11,   0,  11,   2,  11,   8,  11,
+   19,  12, -19,  13,  -4,  13,   4,  13,   0,  14,
+  -10,  15,  10,  15,  -5,  17,   5,  17,   0,  18,
+  -12,  19,  13,  19,  -6,  22,   6,  22,   0,  23,
 };
 
 typedef struct SANMVideoContext {
@@ -406,6 +457,7 @@ static void destroy_buffers(SANMVideoContext *ctx)
     ctx->frm0_size =
     ctx->frm1_size =
     ctx->frm2_size = 0;
+    init_sizes(ctx, 0, 0);
 }
 
 static av_cold int init_buffers(SANMVideoContext *ctx)
@@ -460,7 +512,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         }
 
         ctx->subversion = AV_RL16(avctx->extradata);
-        for (i = 0; i < 256; i++)
+        for (i = 0; i < PALETTE_SIZE; i++)
             ctx->pal[i] = 0xFFU << 24 | AV_RL32(avctx->extradata + 2 + i * 4);
     }
 
@@ -1466,7 +1518,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
 
 AVCodec ff_sanm_decoder = {
     .name           = "sanm",
-    .long_name      = NULL_IF_CONFIG_SMALL("LucasArts SANM video"),
+    .long_name      = NULL_IF_CONFIG_SMALL("LucasArts SANM/Smush video"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_SANM,
     .priv_data_size = sizeof(SANMVideoContext),
diff --git a/libavcodec/sbr.h b/libavcodec/sbr.h
index a47ad6eedb..66a88a2898 100644
--- a/libavcodec/sbr.h
+++ b/libavcodec/sbr.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  * Copyright (c) 2010      Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,8 @@
 #include "aacps.h"
 #include "sbrdsp.h"
 
+typedef struct AACContext AACContext;
+
 /**
  * Spectral Band Replication header - spectrum parameters that invoke a reset if they differ from the previous header.
  */
@@ -64,9 +66,9 @@ typedef struct SBRData {
      */
     unsigned           bs_frame_class;
     unsigned           bs_add_harmonic_flag;
-    unsigned           bs_num_env;
+    AAC_SIGNE          bs_num_env;
     uint8_t            bs_freq_res[7];
-    unsigned           bs_num_noise;
+    AAC_SIGNE          bs_num_noise;
     uint8_t            bs_df_env[5];
     uint8_t            bs_df_noise[2];
     uint8_t            bs_invf_mode[2][5];
@@ -78,25 +80,25 @@ typedef struct SBRData {
      * @name State variables
      * @{
      */
-    DECLARE_ALIGNED(32, float, synthesis_filterbank_samples)[SBR_SYNTHESIS_BUF_SIZE];
-    DECLARE_ALIGNED(32, float, analysis_filterbank_samples) [1312];
+    DECLARE_ALIGNED(32, INTFLOAT, synthesis_filterbank_samples)[SBR_SYNTHESIS_BUF_SIZE];
+    DECLARE_ALIGNED(32, INTFLOAT, analysis_filterbank_samples) [1312];
     int                synthesis_filterbank_samples_offset;
     ///l_APrev and l_A
     int                e_a[2];
     ///Chirp factors
-    float              bw_array[5];
+    INTFLOAT              bw_array[5];
     ///QMF values of the original signal
-    float              W[2][32][32][2];
+    INTFLOAT              W[2][32][32][2];
     ///QMF output of the HF adjustor
     int                Ypos;
-    DECLARE_ALIGNED(16, float, Y)[2][38][64][2];
-    DECLARE_ALIGNED(16, float, g_temp)[42][48];
-    float              q_temp[42][48];
+    DECLARE_ALIGNED(16, INTFLOAT, Y)[2][38][64][2];
+    DECLARE_ALIGNED(16, AAC_FLOAT, g_temp)[42][48];
+    AAC_FLOAT          q_temp[42][48];
     uint8_t            s_indexmapped[8][48];
     ///Envelope scalefactors
-    float              env_facs[6][48];
+    AAC_FLOAT          env_facs[6][48];
     ///Noise scalefactors
-    float              noise_facs[3][5];
+    AAC_FLOAT          noise_facs[3][5];
     ///Envelope time borders
     uint8_t            t_env[8];
     ///Envelope time border of the last envelope of the previous frame
@@ -108,12 +110,34 @@ typedef struct SBRData {
     /** @} */
 } SBRData;
 
+typedef struct SpectralBandReplication SpectralBandReplication;
+
+/**
+ * aacsbr functions pointers
+ */
+typedef struct AACSBRContext {
+    int (*sbr_lf_gen)(AACContext *ac, SpectralBandReplication *sbr,
+                      INTFLOAT X_low[32][40][2], const INTFLOAT W[2][32][32][2],
+                      int buf_idx);
+    void (*sbr_hf_assemble)(INTFLOAT Y1[38][64][2],
+                            const INTFLOAT X_high[64][40][2],
+                            SpectralBandReplication *sbr, SBRData *ch_data,
+                            const int e_a[2]);
+    int (*sbr_x_gen)(SpectralBandReplication *sbr, INTFLOAT X[2][38][64],
+                     const INTFLOAT Y0[38][64][2], const INTFLOAT Y1[38][64][2],
+                     const INTFLOAT X_low[32][40][2], int ch);
+    void (*sbr_hf_inverse_filter)(SBRDSPContext *dsp,
+                                  INTFLOAT (*alpha0)[2], INTFLOAT (*alpha1)[2],
+                                  const INTFLOAT X_low[32][40][2], int k0);
+} AACSBRContext;
+
 /**
  * Spectral Band Replication
  */
-typedef struct SpectralBandReplication {
+struct SpectralBandReplication {
     int                sample_rate;
     int                start;
+    int                id_aac;
     int                reset;
     SpectrumParameters spectrum_params;
     int                bs_amp_res_header;
@@ -127,23 +151,23 @@ typedef struct SpectralBandReplication {
     unsigned           bs_smoothing_mode;
     /** @} */
     unsigned           bs_coupling;
-    unsigned           k[5]; ///< k0, k1, k2
+    AAC_SIGNE          k[5]; ///< k0, k1, k2
     ///kx', and kx respectively, kx is the first QMF subband where SBR is used.
     ///kx' is its value from the previous frame
-    unsigned           kx[2];
+    AAC_SIGNE          kx[2];
     ///M' and M respectively, M is the number of QMF subbands that use SBR.
-    unsigned           m[2];
+    AAC_SIGNE          m[2];
     unsigned           kx_and_m_pushed;
     ///The number of frequency bands in f_master
-    unsigned           n_master;
+    AAC_SIGNE          n_master;
     SBRData            data[2];
     PSContext          ps;
     ///N_Low and N_High respectively, the number of frequency bands for low and high resolution
-    unsigned           n[2];
+    AAC_SIGNE          n[2];
     ///Number of noise floor bands
-    unsigned           n_q;
+    AAC_SIGNE          n_q;
     ///Number of limiter bands
-    unsigned           n_lim;
+    AAC_SIGNE          n_lim;
     ///The master QMF frequency grouping
     uint16_t           f_master[49];
     ///Frequency borders for low resolution SBR
@@ -153,37 +177,38 @@ typedef struct SpectralBandReplication {
     ///Frequency borders for noise floors
     uint16_t           f_tablenoise[6];
     ///Frequency borders for the limiter
-    uint16_t           f_tablelim[29];
-    unsigned           num_patches;
+    uint16_t           f_tablelim[30];
+    AAC_SIGNE          num_patches;
     uint8_t            patch_num_subbands[6];
     uint8_t            patch_start_subband[6];
     ///QMF low frequency input to the HF generator
-    DECLARE_ALIGNED(16, float, X_low)[32][40][2];
+    DECLARE_ALIGNED(16, INTFLOAT, X_low)[32][40][2];
     ///QMF output of the HF generator
-    DECLARE_ALIGNED(16, float, X_high)[64][40][2];
+    DECLARE_ALIGNED(16, INTFLOAT, X_high)[64][40][2];
     ///QMF values of the reconstructed signal
-    DECLARE_ALIGNED(16, float, X)[2][2][38][64];
+    DECLARE_ALIGNED(16, INTFLOAT, X)[2][2][38][64];
     ///Zeroth coefficient used to filter the subband signals
-    DECLARE_ALIGNED(16, float, alpha0)[64][2];
+    DECLARE_ALIGNED(16, INTFLOAT, alpha0)[64][2];
     ///First coefficient used to filter the subband signals
-    DECLARE_ALIGNED(16, float, alpha1)[64][2];
+    DECLARE_ALIGNED(16, INTFLOAT, alpha1)[64][2];
     ///Dequantized envelope scalefactors, remapped
-    float              e_origmapped[7][48];
+    AAC_FLOAT          e_origmapped[7][48];
     ///Dequantized noise scalefactors, remapped
-    float              q_mapped[7][48];
+    AAC_FLOAT          q_mapped[7][48];
     ///Sinusoidal presence, remapped
     uint8_t            s_mapped[7][48];
     ///Estimated envelope
-    float              e_curr[7][48];
+    AAC_FLOAT          e_curr[7][48];
     ///Amplitude adjusted noise scalefactors
-    float              q_m[7][48];
+    AAC_FLOAT          q_m[7][48];
     ///Sinusoidal levels
-    float              s_m[7][48];
-    float              gain[7][48];
-    DECLARE_ALIGNED(32, float, qmf_filter_scratch)[5][64];
+    AAC_FLOAT          s_m[7][48];
+    AAC_FLOAT          gain[7][48];
+    DECLARE_ALIGNED(32, INTFLOAT, qmf_filter_scratch)[5][64];
     FFTContext         mdct_ana;
     FFTContext         mdct;
     SBRDSPContext      dsp;
-} SpectralBandReplication;
+    AACSBRContext      c;
+};
 
 #endif /* AVCODEC_SBR_H */
diff --git a/libavcodec/sbrdsp.c b/libavcodec/sbrdsp.c
index b7917dc9af..cc432b6534 100644
--- a/libavcodec/sbrdsp.c
+++ b/libavcodec/sbrdsp.c
@@ -3,37 +3,31 @@
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define USE_FIXED 0
+
+#include "aac.h"
 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/intfloat.h"
 #include "sbrdsp.h"
 
-static void sbr_sum64x5_c(float *z)
-{
-    int k;
-    for (k = 0; k < 64; k++) {
-        float f = z[k] + z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];
-        z[k] = f;
-    }
-}
-
 static float sbr_sum_square_c(float (*x)[2], int n)
 {
     float sum0 = 0.0f, sum1 = 0.0f;
@@ -72,6 +66,7 @@ static void sbr_qmf_pre_shuffle_c(float *z)
         zi[64 + 2 * k + 2].i = zi[63 - k].i ^ (1U << 31);
         zi[64 + 2 * k + 3].i = zi[ k + 2].i;
     }
+
     zi[64 + 2 * 31 + 0].i = zi[64 - 31].i ^ (1U << 31);
     zi[64 + 2 * 31 + 1].i = zi[31 +  1].i;
 }
@@ -100,16 +95,6 @@ static void sbr_qmf_deint_neg_c(float *v, const float *src)
     }
 }
 
-static void sbr_qmf_deint_bfly_c(float *v, const float *src0, const float *src1)
-{
-    int i;
-    for (i = 0; i < 64; i++) {
-        v[      i] = src0[i] - src1[63 - i];
-        v[127 - i] = src0[i] + src1[63 - i];
-    }
-}
-
-
 #if 0
     /* This code is slower because it multiplies memory accesses.
      * It is left for educational purposes and because it may offer
@@ -237,56 +222,4 @@ static av_always_inline void sbr_hf_apply_noise(float (*Y)[2],
     }
 }
 
-static void sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
-                                 const float *q_filt, int noise,
-                                 int kx, int m_max)
-{
-    sbr_hf_apply_noise(Y, s_m, q_filt, noise, 1.0, 0.0, m_max);
-}
-
-static void sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
-                                 const float *q_filt, int noise,
-                                 int kx, int m_max)
-{
-    float phi_sign = 1 - 2 * (kx & 1);
-    sbr_hf_apply_noise(Y, s_m, q_filt, noise, 0.0, phi_sign, m_max);
-}
-
-static void sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
-                                 const float *q_filt, int noise,
-                                 int kx, int m_max)
-{
-    sbr_hf_apply_noise(Y, s_m, q_filt, noise, -1.0, 0.0, m_max);
-}
-
-static void sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
-                                 const float *q_filt, int noise,
-                                 int kx, int m_max)
-{
-    float phi_sign = 1 - 2 * (kx & 1);
-    sbr_hf_apply_noise(Y, s_m, q_filt, noise, 0.0, -phi_sign, m_max);
-}
-
-av_cold void ff_sbrdsp_init(SBRDSPContext *s)
-{
-    s->sum64x5 = sbr_sum64x5_c;
-    s->sum_square = sbr_sum_square_c;
-    s->neg_odd_64 = sbr_neg_odd_64_c;
-    s->qmf_pre_shuffle = sbr_qmf_pre_shuffle_c;
-    s->qmf_post_shuffle = sbr_qmf_post_shuffle_c;
-    s->qmf_deint_neg = sbr_qmf_deint_neg_c;
-    s->qmf_deint_bfly = sbr_qmf_deint_bfly_c;
-    s->autocorrelate = sbr_autocorrelate_c;
-    s->hf_gen = sbr_hf_gen_c;
-    s->hf_g_filt = sbr_hf_g_filt_c;
-
-    s->hf_apply_noise[0] = sbr_hf_apply_noise_0;
-    s->hf_apply_noise[1] = sbr_hf_apply_noise_1;
-    s->hf_apply_noise[2] = sbr_hf_apply_noise_2;
-    s->hf_apply_noise[3] = sbr_hf_apply_noise_3;
-
-    if (ARCH_ARM)
-        ff_sbrdsp_init_arm(s);
-    if (ARCH_X86)
-        ff_sbrdsp_init_x86(s);
-}
+#include "sbrdsp_template.c"
diff --git a/libavcodec/sbrdsp.h b/libavcodec/sbrdsp.h
index 07235c68e6..66852de618 100644
--- a/libavcodec/sbrdsp.h
+++ b/libavcodec/sbrdsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,30 +22,33 @@
 #define AVCODEC_SBRDSP_H
 
 #include <stdint.h>
+#include "aac_defines.h"
+#include "libavutil/softfloat.h"
 
 typedef struct SBRDSPContext {
-    void (*sum64x5)(float *z);
-    float (*sum_square)(float (*x)[2], int n);
-    void (*neg_odd_64)(float *x);
-    void (*qmf_pre_shuffle)(float *z);
-    void (*qmf_post_shuffle)(float W[32][2], const float *z);
-    void (*qmf_deint_neg)(float *v, const float *src);
-    void (*qmf_deint_bfly)(float *v, const float *src0, const float *src1);
-    void (*autocorrelate)(const float x[40][2], float phi[3][2][2]);
-    void (*hf_gen)(float (*X_high)[2], const float (*X_low)[2],
-                   const float alpha0[2], const float alpha1[2],
-                   float bw, int start, int end);
-    void (*hf_g_filt)(float (*Y)[2], const float (*X_high)[40][2],
-                      const float *g_filt, int m_max, intptr_t ixh);
-    void (*hf_apply_noise[4])(float (*Y)[2], const float *s_m,
-                              const float *q_filt, int noise,
+    void (*sum64x5)(INTFLOAT *z);
+    AAC_FLOAT (*sum_square)(INTFLOAT (*x)[2], int n);
+    void (*neg_odd_64)(INTFLOAT *x);
+    void (*qmf_pre_shuffle)(INTFLOAT *z);
+    void (*qmf_post_shuffle)(INTFLOAT W[32][2], const INTFLOAT *z);
+    void (*qmf_deint_neg)(INTFLOAT *v, const INTFLOAT *src);
+    void (*qmf_deint_bfly)(INTFLOAT *v, const INTFLOAT *src0, const INTFLOAT *src1);
+    void (*autocorrelate)(const INTFLOAT x[40][2], AAC_FLOAT phi[3][2][2]);
+    void (*hf_gen)(INTFLOAT (*X_high)[2], const INTFLOAT (*X_low)[2],
+                   const INTFLOAT alpha0[2], const INTFLOAT alpha1[2],
+                   INTFLOAT bw, int start, int end);
+    void (*hf_g_filt)(INTFLOAT (*Y)[2], const INTFLOAT (*X_high)[40][2],
+                      const AAC_FLOAT *g_filt, int m_max, intptr_t ixh);
+    void (*hf_apply_noise[4])(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                              const AAC_FLOAT *q_filt, int noise,
                               int kx, int m_max);
 } SBRDSPContext;
 
-extern const float ff_sbr_noise_table[][2];
+extern const INTFLOAT AAC_RENAME(ff_sbr_noise_table)[][2];
 
-void ff_sbrdsp_init(SBRDSPContext *s);
+void AAC_RENAME(ff_sbrdsp_init)(SBRDSPContext *s);
 void ff_sbrdsp_init_arm(SBRDSPContext *s);
 void ff_sbrdsp_init_x86(SBRDSPContext *s);
+void ff_sbrdsp_init_mips(SBRDSPContext *s);
 
 #endif /* AVCODEC_SBRDSP_H */
diff --git a/libavcodec/sbrdsp_fixed.c b/libavcodec/sbrdsp_fixed.c
new file mode 100644
index 0000000000..5b7b7a6f9b
--- /dev/null
+++ b/libavcodec/sbrdsp_fixed.c
@@ -0,0 +1,286 @@
+/*
+ * AAC Spectral Band Replication decoding functions
+ * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
+ * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Note: Rounding-to-nearest used unless otherwise stated
+ *
+ */
+
+#define USE_FIXED 1
+
+#include "aac.h"
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/intfloat.h"
+#include "sbrdsp.h"
+
+static SoftFloat sbr_sum_square_c(int (*x)[2], int n)
+{
+    SoftFloat ret;
+    int64_t accu = 0;
+    int i, nz, round;
+
+    for (i = 0; i < n; i += 2) {
+        accu += (int64_t)x[i + 0][0] * x[i + 0][0];
+        accu += (int64_t)x[i + 0][1] * x[i + 0][1];
+        accu += (int64_t)x[i + 1][0] * x[i + 1][0];
+        accu += (int64_t)x[i + 1][1] * x[i + 1][1];
+    }
+
+    i = (int)(accu >> 32);
+    if (i == 0) {
+        nz = 1;
+    } else {
+        nz = 0;
+        while (FFABS(i) < 0x40000000) {
+            i <<= 1;
+            nz++;
+        }
+        nz = 32 - nz;
+    }
+
+    round = 1 << (nz-1);
+    i = (int)((accu + round) >> nz);
+    i >>= 1;
+    ret = av_int2sf(i, 15 - nz);
+
+    return ret;
+}
+
+static void sbr_neg_odd_64_c(int *x)
+{
+    int i;
+    for (i = 1; i < 64; i += 2)
+        x[i] = -x[i];
+}
+
+static void sbr_qmf_pre_shuffle_c(int *z)
+{
+    int k;
+    z[64] = z[0];
+    z[65] = z[1];
+    for (k = 1; k < 32; k++) {
+        z[64+2*k  ] = -z[64 - k];
+        z[64+2*k+1] =  z[ k + 1];
+    }
+}
+
+static void sbr_qmf_post_shuffle_c(int W[32][2], const int *z)
+{
+    int k;
+    for (k = 0; k < 32; k++) {
+        W[k][0] = -z[63-k];
+        W[k][1] = z[k];
+    }
+}
+
+static void sbr_qmf_deint_neg_c(int *v, const int *src)
+{
+    int i;
+    for (i = 0; i < 32; i++) {
+        v[     i] = ( src[63 - 2*i    ] + 0x10) >> 5;
+        v[63 - i] = (-src[63 - 2*i - 1] + 0x10) >> 5;
+    }
+}
+
+static av_always_inline SoftFloat autocorr_calc(int64_t accu)
+{
+        int nz, mant, expo, round;
+        int i = (int)(accu >> 32);
+        if (i == 0) {
+            nz = 1;
+        } else {
+            nz = 0;
+            while (FFABS(i) < 0x40000000) {
+                i <<= 1;
+                nz++;
+            }
+            nz = 32-nz;
+        }
+
+        round = 1 << (nz-1);
+        mant = (int)((accu + round) >> nz);
+        mant = (mant + 0x40)>>7;
+        mant <<= 6;
+        expo = nz + 15;
+        return av_int2sf(mant, 30 - expo);
+}
+
+static av_always_inline void autocorrelate(const int x[40][2], SoftFloat phi[3][2][2], int lag)
+{
+    int i;
+    int64_t real_sum, imag_sum;
+    int64_t accu_re = 0, accu_im = 0;
+
+    if (lag) {
+        for (i = 1; i < 38; i++) {
+            accu_re += (int64_t)x[i][0] * x[i+lag][0];
+            accu_re += (int64_t)x[i][1] * x[i+lag][1];
+            accu_im += (int64_t)x[i][0] * x[i+lag][1];
+            accu_im -= (int64_t)x[i][1] * x[i+lag][0];
+        }
+
+        real_sum = accu_re;
+        imag_sum = accu_im;
+
+        accu_re += (int64_t)x[ 0][0] * x[lag][0];
+        accu_re += (int64_t)x[ 0][1] * x[lag][1];
+        accu_im += (int64_t)x[ 0][0] * x[lag][1];
+        accu_im -= (int64_t)x[ 0][1] * x[lag][0];
+
+        phi[2-lag][1][0] = autocorr_calc(accu_re);
+        phi[2-lag][1][1] = autocorr_calc(accu_im);
+
+        if (lag == 1) {
+            accu_re = real_sum;
+            accu_im = imag_sum;
+            accu_re += (int64_t)x[38][0] * x[39][0];
+            accu_re += (int64_t)x[38][1] * x[39][1];
+            accu_im += (int64_t)x[38][0] * x[39][1];
+            accu_im -= (int64_t)x[38][1] * x[39][0];
+
+            phi[0][0][0] = autocorr_calc(accu_re);
+            phi[0][0][1] = autocorr_calc(accu_im);
+        }
+    } else {
+        for (i = 1; i < 38; i++) {
+            accu_re += (int64_t)x[i][0] * x[i][0];
+            accu_re += (int64_t)x[i][1] * x[i][1];
+        }
+        real_sum = accu_re;
+        accu_re += (int64_t)x[ 0][0] * x[ 0][0];
+        accu_re += (int64_t)x[ 0][1] * x[ 0][1];
+
+        phi[2][1][0] = autocorr_calc(accu_re);
+
+        accu_re = real_sum;
+        accu_re += (int64_t)x[38][0] * x[38][0];
+        accu_re += (int64_t)x[38][1] * x[38][1];
+
+        phi[1][0][0] = autocorr_calc(accu_re);
+    }
+}
+
+static void sbr_autocorrelate_c(const int x[40][2], SoftFloat phi[3][2][2])
+{
+    autocorrelate(x, phi, 0);
+    autocorrelate(x, phi, 1);
+    autocorrelate(x, phi, 2);
+}
+
+static void sbr_hf_gen_c(int (*X_high)[2], const int (*X_low)[2],
+                       const int alpha0[2], const int alpha1[2],
+                       int bw, int start, int end)
+{
+    int alpha[4];
+    int i;
+    int64_t accu;
+
+    accu = (int64_t)alpha0[0] * bw;
+    alpha[2] = (int)((accu + 0x40000000) >> 31);
+    accu = (int64_t)alpha0[1] * bw;
+    alpha[3] = (int)((accu + 0x40000000) >> 31);
+    accu = (int64_t)bw * bw;
+    bw = (int)((accu + 0x40000000) >> 31);
+    accu = (int64_t)alpha1[0] * bw;
+    alpha[0] = (int)((accu + 0x40000000) >> 31);
+    accu = (int64_t)alpha1[1] * bw;
+    alpha[1] = (int)((accu + 0x40000000) >> 31);
+
+    for (i = start; i < end; i++) {
+        accu  = (int64_t)X_low[i][0] * 0x20000000;
+        accu += (int64_t)X_low[i - 2][0] * alpha[0];
+        accu -= (int64_t)X_low[i - 2][1] * alpha[1];
+        accu += (int64_t)X_low[i - 1][0] * alpha[2];
+        accu -= (int64_t)X_low[i - 1][1] * alpha[3];
+        X_high[i][0] = (int)((accu + 0x10000000) >> 29);
+
+        accu  = (int64_t)X_low[i][1] * 0x20000000;
+        accu += (int64_t)X_low[i - 2][1] * alpha[0];
+        accu += (int64_t)X_low[i - 2][0] * alpha[1];
+        accu += (int64_t)X_low[i - 1][1] * alpha[2];
+        accu += (int64_t)X_low[i - 1][0] * alpha[3];
+        X_high[i][1] = (int)((accu + 0x10000000) >> 29);
+    }
+}
+
+static void sbr_hf_g_filt_c(int (*Y)[2], const int (*X_high)[40][2],
+                          const SoftFloat *g_filt, int m_max, intptr_t ixh)
+{
+    int m, r;
+    int64_t accu;
+
+    for (m = 0; m < m_max; m++) {
+        r = 1 << (22-g_filt[m].exp);
+        accu = (int64_t)X_high[m][ixh][0] * ((g_filt[m].mant + 0x40)>>7);
+        Y[m][0] = (int)((accu + r) >> (23-g_filt[m].exp));
+
+        accu = (int64_t)X_high[m][ixh][1] * ((g_filt[m].mant + 0x40)>>7);
+        Y[m][1] = (int)((accu + r) >> (23-g_filt[m].exp));
+    }
+}
+
+static av_always_inline void sbr_hf_apply_noise(int (*Y)[2],
+                                                const SoftFloat *s_m,
+                                                const SoftFloat *q_filt,
+                                                int noise,
+                                                int phi_sign0,
+                                                int phi_sign1,
+                                                int m_max)
+{
+    int m;
+
+    for (m = 0; m < m_max; m++) {
+        int y0 = Y[m][0];
+        int y1 = Y[m][1];
+        noise = (noise + 1) & 0x1ff;
+        if (s_m[m].mant) {
+            int shift, round;
+
+            shift = 22 - s_m[m].exp;
+            if (shift < 30) {
+                round = 1 << (shift-1);
+                y0 += (s_m[m].mant * phi_sign0 + round) >> shift;
+                y1 += (s_m[m].mant * phi_sign1 + round) >> shift;
+            }
+        } else {
+            int shift, round, tmp;
+            int64_t accu;
+
+            shift = 22 - q_filt[m].exp;
+            if (shift < 30) {
+                round = 1 << (shift-1);
+
+                accu = (int64_t)q_filt[m].mant * ff_sbr_noise_table_fixed[noise][0];
+                tmp = (int)((accu + 0x40000000) >> 31);
+                y0 += (tmp + round) >> shift;
+
+                accu = (int64_t)q_filt[m].mant * ff_sbr_noise_table_fixed[noise][1];
+                tmp = (int)((accu + 0x40000000) >> 31);
+                y1 += (tmp + round) >> shift;
+            }
+        }
+        Y[m][0] = y0;
+        Y[m][1] = y1;
+        phi_sign1 = -phi_sign1;
+    }
+}
+
+#include "sbrdsp_template.c"
diff --git a/libavcodec/sbrdsp_template.c b/libavcodec/sbrdsp_template.c
new file mode 100644
index 0000000000..b649dfd7ee
--- /dev/null
+++ b/libavcodec/sbrdsp_template.c
@@ -0,0 +1,97 @@
+/*
+ * AAC Spectral Band Replication decoding functions
+ * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
+ * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+static void sbr_sum64x5_c(INTFLOAT *z)
+{
+    int k;
+    for (k = 0; k < 64; k++) {
+        INTFLOAT f = z[k] + z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];
+        z[k] = f;
+    }
+}
+
+static void sbr_qmf_deint_bfly_c(INTFLOAT *v, const INTFLOAT *src0, const INTFLOAT *src1)
+{
+    int i;
+    for (i = 0; i < 64; i++) {
+        v[      i] = AAC_SRA_R((src0[i] - src1[63 - i]), 5);
+        v[127 - i] = AAC_SRA_R((src0[i] + src1[63 - i]), 5);
+    }
+}
+
+static void sbr_hf_apply_noise_0(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                                 const AAC_FLOAT *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    sbr_hf_apply_noise(Y, s_m, q_filt, noise, (INTFLOAT)1.0, (INTFLOAT)0.0, m_max);
+}
+
+static void sbr_hf_apply_noise_1(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                                 const AAC_FLOAT *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    INTFLOAT phi_sign = 1 - 2 * (kx & 1);
+    sbr_hf_apply_noise(Y, s_m, q_filt, noise, (INTFLOAT)0.0, phi_sign, m_max);
+}
+
+static void sbr_hf_apply_noise_2(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                                 const AAC_FLOAT *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    sbr_hf_apply_noise(Y, s_m, q_filt, noise, (INTFLOAT)-1.0, (INTFLOAT)0.0, m_max);
+}
+
+static void sbr_hf_apply_noise_3(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                                 const AAC_FLOAT *q_filt, int noise,
+                                 int kx, int m_max)
+{
+    INTFLOAT phi_sign = 1 - 2 * (kx & 1);
+    sbr_hf_apply_noise(Y, s_m, q_filt, noise, (INTFLOAT)0.0, -phi_sign, m_max);
+}
+
+av_cold void AAC_RENAME(ff_sbrdsp_init)(SBRDSPContext *s)
+{
+    s->sum64x5 = sbr_sum64x5_c;
+    s->sum_square = sbr_sum_square_c;
+    s->neg_odd_64 = sbr_neg_odd_64_c;
+    s->qmf_pre_shuffle = sbr_qmf_pre_shuffle_c;
+    s->qmf_post_shuffle = sbr_qmf_post_shuffle_c;
+    s->qmf_deint_neg = sbr_qmf_deint_neg_c;
+    s->qmf_deint_bfly = sbr_qmf_deint_bfly_c;
+    s->autocorrelate = sbr_autocorrelate_c;
+    s->hf_gen = sbr_hf_gen_c;
+    s->hf_g_filt = sbr_hf_g_filt_c;
+
+    s->hf_apply_noise[0] = sbr_hf_apply_noise_0;
+    s->hf_apply_noise[1] = sbr_hf_apply_noise_1;
+    s->hf_apply_noise[2] = sbr_hf_apply_noise_2;
+    s->hf_apply_noise[3] = sbr_hf_apply_noise_3;
+
+#if !USE_FIXED
+    if (ARCH_ARM)
+        ff_sbrdsp_init_arm(s);
+    if (ARCH_X86)
+        ff_sbrdsp_init_x86(s);
+    if (ARCH_MIPS)
+        ff_sbrdsp_init_mips(s);
+#endif /* !USE_FIXED */
+}
diff --git a/libavcodec/screenpresso.c b/libavcodec/screenpresso.c
index d25d0ebb9d..e3150e9b2b 100644
--- a/libavcodec/screenpresso.c
+++ b/libavcodec/screenpresso.c
@@ -2,20 +2,20 @@
  * Screenpresso decoder
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sgi.h b/libavcodec/sgi.h
index 3c47d3a58b..5ec891e431 100644
--- a/libavcodec/sgi.h
+++ b/libavcodec/sgi.h
@@ -2,20 +2,20 @@
  * SGI image encoder
  * Xiaohui Sun <tjnksxh@hotmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sgidec.c b/libavcodec/sgidec.c
index c827ff5559..02ad1e1165 100644
--- a/libavcodec/sgidec.c
+++ b/libavcodec/sgidec.c
@@ -2,24 +2,25 @@
  * SGI image decoder
  * Todd Kirby <doubleshot@pacbell.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
@@ -41,7 +42,7 @@ typedef struct SgiState {
  * @param out_buf Points to one line after the output buffer.
  * @param len length of out_buf in bytes
  * @param pixelstride pixel stride of input buffer
- * @return size of output in bytes, -1 if buffer overflows
+ * @return size of output in bytes, else return error code.
  */
 static int expand_rle_row8(SgiState *s, uint8_t *out_buf,
                            int len, int pixelstride)
@@ -59,7 +60,7 @@ static int expand_rle_row8(SgiState *s, uint8_t *out_buf,
         }
 
         /* Check for buffer overflow. */
-        if (pixelstride * (count - 1) >= len) {
+        if (out_end - out_buf <= pixelstride * (count - 1)) {
             av_log(s->avctx, AV_LOG_ERROR, "Invalid pixel count.\n");
             return AVERROR_INVALIDDATA;
         }
@@ -97,7 +98,7 @@ static int expand_rle_row16(SgiState *s, uint16_t *out_buf,
             break;
 
         /* Check for buffer overflow. */
-        if (pixelstride * (count - 1) >= len) {
+        if (out_end - out_buf <= pixelstride * (count - 1)) {
             av_log(s->avctx, AV_LOG_ERROR, "Invalid pixel count.\n");
             return AVERROR_INVALIDDATA;
         }
@@ -125,7 +126,7 @@ static int expand_rle_row16(SgiState *s, uint16_t *out_buf,
  * Read a run length encoded SGI image.
  * @param out_buf output buffer
  * @param s the current image state
- * @return 0 if no error, else return error number.
+ * @return 0 if no error, else return error code.
  */
 static int read_rle_sgi(uint8_t *out_buf, SgiState *s)
 {
@@ -144,7 +145,7 @@ static int read_rle_sgi(uint8_t *out_buf, SgiState *s)
     for (z = 0; z < s->depth; z++) {
         dest_row = out_buf;
         for (y = 0; y < s->height; y++) {
-            linesize = s->width * s->depth * s->bytes_per_channel;
+            linesize = s->width * s->depth;
             dest_row -= s->linesize;
             start_offset = bytestream2_get_be32(&g_table);
             bytestream2_seek(&s->g, start_offset, SEEK_SET);
@@ -163,7 +164,7 @@ static int read_rle_sgi(uint8_t *out_buf, SgiState *s)
  * Read an uncompressed SGI image.
  * @param out_buf output buffer
  * @param s the current image state
- * @return 0 if read success, otherwise return -1.
+ * @return 0 if read success, else return error code.
  */
 static int read_uncompressed_sgi(unsigned char *out_buf, SgiState *s)
 {
@@ -215,27 +216,27 @@ static int decode_frame(AVCodecContext *avctx,
     }
 
     /* Test for SGI magic. */
-    if (bytestream2_get_be16(&s->g) != SGI_MAGIC) {
+    if (bytestream2_get_be16u(&s->g) != SGI_MAGIC) {
         av_log(avctx, AV_LOG_ERROR, "bad magic number\n");
         return AVERROR_INVALIDDATA;
     }
 
-    rle                  = bytestream2_get_byte(&s->g);
-    s->bytes_per_channel = bytestream2_get_byte(&s->g);
-    dimension            = bytestream2_get_be16(&s->g);
-    s->width             = bytestream2_get_be16(&s->g);
-    s->height            = bytestream2_get_be16(&s->g);
-    s->depth             = bytestream2_get_be16(&s->g);
+    rle                  = bytestream2_get_byteu(&s->g);
+    s->bytes_per_channel = bytestream2_get_byteu(&s->g);
+    dimension            = bytestream2_get_be16u(&s->g);
+    s->width             = bytestream2_get_be16u(&s->g);
+    s->height            = bytestream2_get_be16u(&s->g);
+    s->depth             = bytestream2_get_be16u(&s->g);
 
     if (s->bytes_per_channel != 1 && s->bytes_per_channel != 2) {
         av_log(avctx, AV_LOG_ERROR, "wrong channel number\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     /* Check for supported image dimensions. */
     if (dimension != 2 && dimension != 3) {
         av_log(avctx, AV_LOG_ERROR, "wrong dimension number\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     if (s->depth == SGI_GRAYSCALE) {
@@ -246,17 +247,15 @@ static int decode_frame(AVCodecContext *avctx,
         avctx->pix_fmt = s->bytes_per_channel == 2 ? AV_PIX_FMT_RGBA64BE : AV_PIX_FMT_RGBA;
     } else {
         av_log(avctx, AV_LOG_ERROR, "wrong picture format\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     ret = ff_set_dimensions(avctx, s->width, s->height);
     if (ret < 0)
         return ret;
 
-    if (ff_get_buffer(avctx, p, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed.\n");
-        return -1;
-    }
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
+        return ret;
 
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
@@ -273,13 +272,11 @@ static int decode_frame(AVCodecContext *avctx,
     } else {
         ret = read_uncompressed_sgi(out_buf, s);
     }
-
-    if (ret == 0) {
-        *got_frame = 1;
-        return avpkt->size;
-    } else {
+    if (ret)
         return ret;
-    }
+
+    *got_frame = 1;
+    return avpkt->size;
 }
 
 static av_cold int sgi_decode_init(AVCodecContext *avctx)
diff --git a/libavcodec/sgienc.c b/libavcodec/sgienc.c
index b88f6c3843..28cec8625f 100644
--- a/libavcodec/sgienc.c
+++ b/libavcodec/sgienc.c
@@ -2,20 +2,20 @@
  * SGI image encoder
  * Todd Kirby <doubleshot@pacbell.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
     if (avctx->width > 65535 || avctx->height > 65535) {
         av_log(avctx, AV_LOG_ERROR,
                "Unsupported resolution %dx%d.\n", avctx->width, avctx->height);
+        av_log(avctx, AV_LOG_ERROR, "SGI does not support resolutions above 65535x65535\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -113,10 +114,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
     else // assume ff_rl_encode() produces at most 2x size of input
         length += tablesize * 2 + depth * height * (2 * width + 1);
 
-    if ((ret = ff_alloc_packet(pkt, bytes_per_channel * length)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", length);
+    if ((ret = ff_alloc_packet2(avctx, pkt, bytes_per_channel * length, 0)) < 0)
         return ret;
-    }
     buf     = pkt->data;
     end_buf = pkt->data + pkt->size;
 
@@ -154,7 +153,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
         /* Make an intermediate consecutive buffer. */
         if (!(encode_buf = av_malloc(width)))
-            return -1;
+            return AVERROR(ENOMEM);
 
         for (z = 0; z < depth; z++) {
             in_buf = p->data[0] + p->linesize[0] * (height - 1) + z;
@@ -184,13 +183,15 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             for (y = 0; y < height; y++) {
                 for (x = 0; x < width * depth; x += depth)
-                    if (bytes_per_channel == 1)
+                    if (bytes_per_channel == 1) {
                         bytestream_put_byte(&buf, in_buf[x]);
-                    else
-                        if (put_be)
+                    } else {
+                        if (put_be) {
                             bytestream_put_be16(&buf, ((uint16_t *)in_buf)[x]);
-                        else
+                        } else {
                             bytestream_put_le16(&buf, ((uint16_t *)in_buf)[x]);
+                        }
+                    }
 
                 in_buf -= p->linesize[0];
             }
diff --git a/libavcodec/sgirledec.c b/libavcodec/sgirledec.c
index 67d79c81e8..e7b281ac31 100644
--- a/libavcodec/sgirledec.c
+++ b/libavcodec/sgirledec.c
@@ -2,20 +2,20 @@
  * Silicon Graphics RLE 8-bit video decoder
  * Copyright (c) 2012 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,9 +49,9 @@ static av_cold int sgirle_decode_init(AVCodecContext *avctx)
  * Convert SGI RBG323 pixel into AV_PIX_FMT_BGR8
  * SGI RGB data is packed as 8bpp, (msb)3R 2B 3G(lsb)
  */
-#define RBG323_TO_BGR8(x) (((x << 3) & 0xC0) |                                \
-                           ((x << 3) & 0x38) |                                \
-                           ((x >> 5) & 7))
+#define RBG323_TO_BGR8(x) ((((x) << 3) & 0xC0) |                                \
+                           (((x) << 3) & 0x38) |                                \
+                           (((x) >> 5) & 7))
 static av_always_inline
 void rbg323_to_bgr8(uint8_t *dst, const uint8_t *src, int size)
 {
@@ -110,8 +110,8 @@ static int decode_sgirle8(AVCodecContext *avctx, uint8_t *dst,
                 v   -= length;
             } while (v > 0);
         } else {
-            av_log(avctx, AV_LOG_ERROR, "Invalid opcode %d.\n", v);
-            return AVERROR_INVALIDDATA;
+            avpriv_request_sample(avctx, "opcode %d", v);
+            return AVERROR_PATCHWELCOME;
         }
     }
     return 0;
diff --git a/libavcodec/sh4/README b/libavcodec/sh4/README
new file mode 100644
index 0000000000..8dd61fe875
--- /dev/null
+++ b/libavcodec/sh4/README
@@ -0,0 +1,6 @@
+SH4 optimizations have been removed in
+commit d6096a67422534918405abb46dafbbac4608cbc3
+The last revission with the optimizations is cbfc9046e1c7e295b74f252902ae6f255eef4e78
+
+If you want to maintain these (or other) SH4 optimizations in ffmpeg, then please
+contact ffmpeg-devel@ffmpeg.org
diff --git a/libavcodec/shorten.c b/libavcodec/shorten.c
index ce98a83dba..0f5be96eb7 100644
--- a/libavcodec/shorten.c
+++ b/libavcodec/shorten.c
@@ -2,20 +2,20 @@
  * Shorten decoder
  * Copyright (c) 2005 Jeff Muizelaar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,8 +50,12 @@
 #define ENERGYSIZE 3
 #define BITSHIFTSIZE 2
 
+#define TYPE_S8    1
+#define TYPE_U8    2
 #define TYPE_S16HL 3
+#define TYPE_U16HL 4
 #define TYPE_S16LH 5
+#define TYPE_U16LH 6
 
 #define NWRAP 3
 #define NSKIPSIZE 1
@@ -112,7 +116,6 @@ static av_cold int shorten_decode_init(AVCodecContext *avctx)
 {
     ShortenContext *s = avctx->priv_data;
     s->avctx          = avctx;
-    avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
 
     return 0;
 }
@@ -126,19 +129,18 @@ static int allocate_buffers(ShortenContext *s)
             av_log(s->avctx, AV_LOG_ERROR, "nmean too large\n");
             return AVERROR_INVALIDDATA;
         }
-        if (s->blocksize + s->nwrap >= UINT_MAX / sizeof(int32_t) ||
-            s->blocksize + s->nwrap <= (unsigned)s->nwrap) {
+        if (s->blocksize + (uint64_t)s->nwrap >= UINT_MAX / sizeof(int32_t)) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "s->blocksize + s->nwrap too large\n");
             return AVERROR_INVALIDDATA;
         }
 
-        if ((err = av_reallocp(&s->offset[chan],
-                               sizeof(int32_t) *
+        if ((err = av_reallocp_array(&s->offset[chan],
+                               sizeof(int32_t),
                                FFMAX(1, s->nmean))) < 0)
             return err;
 
-        if ((err = av_reallocp(&s->decoded_base[chan], (s->blocksize + s->nwrap) *
+        if ((err = av_reallocp_array(&s->decoded_base[chan], (s->blocksize + s->nwrap),
                                sizeof(s->decoded_base[0][0]))) < 0)
             return err;
         for (i = 0; i < s->nwrap; i++)
@@ -146,7 +148,7 @@ static int allocate_buffers(ShortenContext *s)
         s->decoded[chan] = s->decoded_base[chan] + s->nwrap;
     }
 
-    if ((err = av_reallocp(&s->coeffs, s->nwrap * sizeof(*s->coeffs))) < 0)
+    if ((err = av_reallocp_array(&s->coeffs, s->nwrap, sizeof(*s->coeffs))) < 0)
         return err;
 
     return 0;
@@ -175,13 +177,17 @@ static int init_offset(ShortenContext *s)
     int nblock = FFMAX(1, s->nmean);
     /* initialise offset */
     switch (s->internal_ftype) {
+    case TYPE_U8:
+        s->avctx->sample_fmt = AV_SAMPLE_FMT_U8P;
+        mean = 0x80;
+        break;
     case TYPE_S16HL:
     case TYPE_S16LH:
-        mean = 0;
+        s->avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
         break;
     default:
-        av_log(s->avctx, AV_LOG_ERROR, "unknown audio type");
-        return AVERROR_INVALIDDATA;
+        av_log(s->avctx, AV_LOG_ERROR, "unknown audio type\n");
+        return AVERROR_PATCHWELCOME;
     }
 
     for (chan = 0; chan < s->channels; chan++)
@@ -193,7 +199,7 @@ static int init_offset(ShortenContext *s)
 static int decode_wave_header(AVCodecContext *avctx, const uint8_t *header,
                               int header_size)
 {
-    int len;
+    int len, bps;
     short wave_format;
     GetByteContext gb;
 
@@ -214,7 +220,7 @@ static int decode_wave_header(AVCodecContext *avctx, const uint8_t *header,
     while (bytestream2_get_le32(&gb) != MKTAG('f', 'm', 't', ' ')) {
         len = bytestream2_get_le32(&gb);
         bytestream2_skip(&gb, len);
-        if (bytestream2_get_bytes_left(&gb) < 16) {
+        if (len < 0 || bytestream2_get_bytes_left(&gb) < 16) {
             av_log(avctx, AV_LOG_ERROR, "no fmt chunk found\n");
             return AVERROR_INVALIDDATA;
         }
@@ -240,10 +246,11 @@ static int decode_wave_header(AVCodecContext *avctx, const uint8_t *header,
     avctx->sample_rate = bytestream2_get_le32(&gb);
     bytestream2_skip(&gb, 4); // skip bit rate    (represents original uncompressed bit rate)
     bytestream2_skip(&gb, 2); // skip block align (not needed)
-    avctx->bits_per_coded_sample = bytestream2_get_le16(&gb);
+    bps = bytestream2_get_le16(&gb);
+    avctx->bits_per_coded_sample = bps;
 
-    if (avctx->bits_per_coded_sample != 16) {
-        av_log(avctx, AV_LOG_ERROR, "unsupported number of bits per sample\n");
+    if (bps != 16 && bps != 8) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported number of bits per sample: %d\n", bps);
         return AVERROR(ENOSYS);
     }
 
@@ -254,18 +261,6 @@ static int decode_wave_header(AVCodecContext *avctx, const uint8_t *header,
     return 0;
 }
 
-static void output_buffer(int16_t **samples, int nchan, int blocksize,
-                          int32_t **buffer)
-{
-    int i, ch;
-    for (ch = 0; ch < nchan; ch++) {
-        int32_t *in  = buffer[ch];
-        int16_t *out = samples[ch];
-        for (i = 0; i < blocksize; i++)
-            out[i] = av_clip_int16(in[i]);
-    }
-}
-
 static const int fixed_coeffs[][3] = {
     { 0,  0,  0 },
     { 1,  0,  0 },
@@ -282,7 +277,7 @@ static int decode_subframe_lpc(ShortenContext *s, int command, int channel,
     if (command == FN_QLPC) {
         /* read/validate prediction order */
         pred_order = get_ur_golomb_shorten(&s->gb, LPCQSIZE);
-        if (pred_order > s->nwrap) {
+        if ((unsigned)pred_order > s->nwrap) {
             av_log(s->avctx, AV_LOG_ERROR, "invalid pred_order %d\n",
                    pred_order);
             return AVERROR(EINVAL);
@@ -374,6 +369,11 @@ static int read_header(ShortenContext *s)
         s->nmean = get_uint(s, 0);
 
         skip_bytes = get_uint(s, NSKIPSIZE);
+        if ((unsigned)skip_bytes > get_bits_left(&s->gb)/8) {
+            av_log(s->avctx, AV_LOG_ERROR, "invalid skip_bytes: %d\n", skip_bytes);
+            return AVERROR_INVALIDDATA;
+        }
+
         for (i = 0; i < skip_bytes; i++)
             skip_bits(&s->gb, 8);
     }
@@ -429,13 +429,14 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
     /* allocate internal bitstream buffer */
     if (s->max_framesize == 0) {
         void *tmp_ptr;
-        s->max_framesize = 1024; // should hopefully be enough for the first header
+        s->max_framesize = 8192; // should hopefully be enough for the first header
         tmp_ptr = av_fast_realloc(s->bitstream, &s->allocated_bitstream_size,
                                   s->max_framesize + AV_INPUT_BUFFER_PADDING_SIZE);
         if (!tmp_ptr) {
             av_log(avctx, AV_LOG_ERROR, "error allocating bitstream buffer\n");
             return AVERROR(ENOMEM);
         }
+        memset(tmp_ptr, 0, s->allocated_bitstream_size);
         s->bitstream = tmp_ptr;
     }
 
@@ -444,7 +445,7 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
         buf_size       = FFMIN(buf_size, s->max_framesize - s->bitstream_size);
         input_buf_size = buf_size;
 
-        if (s->bitstream_index + s->bitstream_size + buf_size >
+        if (s->bitstream_index + s->bitstream_size + buf_size + AV_INPUT_BUFFER_PADDING_SIZE >
             s->allocated_bitstream_size) {
             memmove(s->bitstream, &s->bitstream[s->bitstream_index],
                     s->bitstream_size);
@@ -465,7 +466,8 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
         }
     }
     /* init and position bitstream reader */
-    init_get_bits(&s->gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&s->gb, buf, buf_size)) < 0)
+        return ret;
     skip_bits(&s->gb, s->bitindex);
 
     /* process header or next subblock */
@@ -508,11 +510,16 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
                 while (len--)
                     get_ur_golomb_shorten(&s->gb, VERBATIM_BYTE_SIZE);
                 break;
-            case FN_BITSHIFT:
-                s->bitshift = get_ur_golomb_shorten(&s->gb, BITSHIFTSIZE);
-                if (s->bitshift < 0)
+            case FN_BITSHIFT: {
+                unsigned bitshift = get_ur_golomb_shorten(&s->gb, BITSHIFTSIZE);
+                if (bitshift > 31) {
+                    av_log(avctx, AV_LOG_ERROR, "bitshift %d is invalid\n",
+                           bitshift);
                     return AVERROR_INVALIDDATA;
+                }
+                s->bitshift = bitshift;
                 break;
+            }
             case FN_BLOCKSIZE: {
                 unsigned blocksize = get_uint(s, av_log2(s->blocksize));
                 if (blocksize > s->blocksize) {
@@ -560,7 +567,7 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
                     sum += s->offset[channel][i];
                 coffset = sum / s->nmean;
                 if (s->version >= 2)
-                    coffset >>= FFMIN(1, s->bitshift);
+                    coffset = s->bitshift == 0 ? coffset : coffset >> s->bitshift - 1 >> 1;
             }
 
             /* decode samples for this channel */
@@ -599,15 +606,30 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
             /* if this is the last channel in the block, output the samples */
             s->cur_chan++;
             if (s->cur_chan == s->channels) {
+                uint8_t *samples_u8;
+                int16_t *samples_s16;
+                int chan;
+
                 /* get output buffer */
                 frame->nb_samples = s->blocksize;
-                if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-                    av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+                if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
                     return ret;
+
+                for (chan = 0; chan < s->channels; chan++) {
+                    samples_u8  = ((uint8_t **)frame->extended_data)[chan];
+                    samples_s16 = ((int16_t **)frame->extended_data)[chan];
+                    for (i = 0; i < s->blocksize; i++) {
+                        switch (s->internal_ftype) {
+                        case TYPE_U8:
+                            *samples_u8++ = av_clip_uint8(s->decoded[chan][i]);
+                            break;
+                        case TYPE_S16HL:
+                        case TYPE_S16LH:
+                            *samples_s16++ = av_clip_int16(s->decoded[chan][i]);
+                            break;
+                        }
+                    }
                 }
-                /* interleave output */
-                output_buffer((int16_t **)frame->extended_data, s->channels,
-                              s->blocksize, s->decoded);
 
                 *got_frame_ptr = 1;
             }
@@ -660,5 +682,6 @@ AVCodec ff_shorten_decoder = {
     .decode         = shorten_decode_frame,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P,
+                                                      AV_SAMPLE_FMT_U8P,
                                                       AV_SAMPLE_FMT_NONE },
 };
diff --git a/libavcodec/simple_idct.c b/libavcodec/simple_idct.c
index f61e9e639d..eeb627999c 100644
--- a/libavcodec/simple_idct.c
+++ b/libavcodec/simple_idct.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,6 +38,10 @@
 #include "simple_idct_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "simple_idct_template.c"
+#undef BIT_DEPTH
+
 /* 2x4x8 idct */
 
 #define CN_SHIFT 12
@@ -228,6 +232,8 @@ void ff_prores_idct(int16_t *block, const int16_t *qmat)
     for (i = 0; i < 8; i++)
         idctRowCondDC_10(block + i*8, 2);
 
-    for (i = 0; i < 8; i++)
+    for (i = 0; i < 8; i++) {
+        block[i] += 8192;
         idctSparseCol_10(block + i);
+    }
 }
diff --git a/libavcodec/simple_idct.h b/libavcodec/simple_idct.h
index 7f14aae5d0..154e297470 100644
--- a/libavcodec/simple_idct.h
+++ b/libavcodec/simple_idct.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,6 +37,11 @@ void ff_simple_idct_8(int16_t *block);
 void ff_simple_idct_put_10(uint8_t *dest, int line_size, int16_t *block);
 void ff_simple_idct_add_10(uint8_t *dest, int line_size, int16_t *block);
 void ff_simple_idct_10(int16_t *block);
+
+void ff_simple_idct_put_12(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct_add_12(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct_12(int16_t *block);
+
 /**
  * Special version of ff_simple_idct_10() which does dequantization
  * and scales by a factor of 2 more between the two IDCTs to account
diff --git a/libavcodec/simple_idct_template.c b/libavcodec/simple_idct_template.c
index b287c4f6bf..789db8d0ac 100644
--- a/libavcodec/simple_idct_template.c
+++ b/libavcodec/simple_idct_template.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,19 +64,33 @@
 #define MUL(a, b)    MUL16(a, b)
 #define MAC(a, b, c) MAC16(a, b, c)
 
-#elif BIT_DEPTH == 10
+#elif BIT_DEPTH == 10 || BIT_DEPTH == 12
 
-#define W1 90901
-#define W2 85627
-#define W3 77062
-#define W4 65535
-#define W5 51491
-#define W6 35468
-#define W7 18081
+#if BIT_DEPTH == 10
+#define W1 (22725*4)  // 90901
+#define W2 (21407*4) //  85627
+#define W3 (19265*4) //  77062
+#define W4 (16384*4) //  65535
+#define W5 (12873*4) //  51491
+#define W6 ( 8867*4) //  35468
+#define W7 ( 4520*4) //  18081
 
 #define ROW_SHIFT 15
 #define COL_SHIFT 20
 #define DC_SHIFT 1
+#else
+#define W1 45451
+#define W2 42813
+#define W3 38531
+#define W4 32767
+#define W5 25746
+#define W6 17734
+#define W7 9041
+
+#define ROW_SHIFT 16
+#define COL_SHIFT 17
+#define DC_SHIFT -1
+#endif
 
 #define MUL(a, b)    ((a) * (b))
 #define MAC(a, b, c) ((a) += (b) * (c))
@@ -95,13 +109,13 @@ static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
 #define ROW0_MASK (0xffffLL << 48 * HAVE_BIGENDIAN)
     if (((((uint64_t *)row)[0] & ~ROW0_MASK) | ((uint64_t *)row)[1]) == 0) {
         uint64_t temp;
-        if (DC_SHIFT - extra_shift > 0) {
-            temp = (row[0] << (DC_SHIFT - extra_shift)) & 0xffff;
+        if (DC_SHIFT - extra_shift >= 0) {
+            temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff;
         } else {
-            temp = (row[0] >> (extra_shift - DC_SHIFT)) & 0xffff;
+            temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff;
         }
-        temp += temp << 16;
-        temp += temp << 32;
+        temp += temp * (1 << 16);
+        temp += temp * ((uint64_t) 1 << 32);
         ((uint64_t *)row)[0] = temp;
         ((uint64_t *)row)[1] = temp;
         return;
@@ -112,19 +126,19 @@ static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
           ((uint32_t*)row)[3] |
           row[1])) {
         uint32_t temp;
-        if (DC_SHIFT - extra_shift > 0) {
-            temp = (row[0] << (DC_SHIFT - extra_shift)) & 0xffff;
+        if (DC_SHIFT - extra_shift >= 0) {
+            temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff;
         } else {
-            temp = (row[0] >> (extra_shift - DC_SHIFT)) & 0xffff;
+            temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff;
         }
-        temp += temp << 16;
+        temp += temp * (1 << 16);
         ((uint32_t*)row)[0]=((uint32_t*)row)[1] =
             ((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp;
         return;
     }
 #endif
 
-    a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
+    a0 = (W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1));
     a1 = a0;
     a2 = a0;
     a3 = a0;
diff --git a/libavcodec/sinewin.c b/libavcodec/sinewin.c
index be38dbc713..4532dc7354 100644
--- a/libavcodec/sinewin.c
+++ b/libavcodec/sinewin.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define USE_FIXED 0
 #include "sinewin.h"
 #include "sinewin_tablegen.h"
diff --git a/libavcodec/sinewin.h b/libavcodec/sinewin.h
index 478036d3c6..27c107ce59 100644
--- a/libavcodec/sinewin.h
+++ b/libavcodec/sinewin.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Robert Swain
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,7 @@
 
 #include "config.h"
 #include "libavutil/mem.h"
+#include "libavcodec/aac_defines.h"
 
 #if CONFIG_HARDCODED_TABLES
 #   define SINETABLE_CONST const
@@ -30,20 +31,24 @@
 #   define SINETABLE_CONST
 #endif
 
+#ifndef USE_FIXED
+#define USE_FIXED 0
+#endif
+
 #define SINETABLE(size) \
-    SINETABLE_CONST DECLARE_ALIGNED(32, float, ff_sine_##size)[size]
+    SINETABLE_CONST DECLARE_ALIGNED(32, INTFLOAT, AAC_RENAME(ff_sine_##size))[size]
 
 /**
  * Generate a sine window.
  * @param   window  pointer to half window
  * @param   n       size of half window
  */
-void ff_sine_window_init(float *window, int n);
+void AAC_RENAME(ff_sine_window_init)(INTFLOAT *window, int n);
 
 /**
  * initialize the specified entry of ff_sine_windows
  */
-void ff_init_ff_sine_windows(int index);
+void AAC_RENAME(ff_init_ff_sine_windows)(int index);
 
 extern SINETABLE(  32);
 extern SINETABLE(  64);
@@ -55,6 +60,6 @@ extern SINETABLE(2048);
 extern SINETABLE(4096);
 extern SINETABLE(8192);
 
-extern SINETABLE_CONST float * const ff_sine_windows[14];
+extern SINETABLE_CONST INTFLOAT * const AAC_RENAME(ff_sine_windows)[14];
 
 #endif /* AVCODEC_SINEWIN_H */
diff --git a/libavcodec/sinewin_fixed.c b/libavcodec/sinewin_fixed.c
new file mode 100644
index 0000000000..27ead29e8e
--- /dev/null
+++ b/libavcodec/sinewin_fixed.c
@@ -0,0 +1,21 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+#include "sinewin.h"
+#include "sinewin_tablegen.h"
diff --git a/libavcodec/sinewin_fixed_tablegen.c b/libavcodec/sinewin_fixed_tablegen.c
new file mode 100644
index 0000000000..977e6f3cbf
--- /dev/null
+++ b/libavcodec/sinewin_fixed_tablegen.c
@@ -0,0 +1,24 @@
+/*
+ * Generate a header file for hardcoded sine windows
+ *
+ * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+#include "sinewin_tablegen_template.c"
diff --git a/libavcodec/sinewin_tablegen.c b/libavcodec/sinewin_tablegen.c
index 90a75c2267..dd602668ee 100644
--- a/libavcodec/sinewin_tablegen.c
+++ b/libavcodec/sinewin_tablegen.c
@@ -3,44 +3,22 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <stdlib.h>
-#define CONFIG_HARDCODED_TABLES 0
-#define SINETABLE_CONST
-#define SINETABLE(size) \
-    float ff_sine_##size[size]
-#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0]))
-#include "sinewin_tablegen.h"
-#include "tableprint.h"
-
-int main(void)
-{
-    int i;
-
-    write_fileheader();
-
-    for (i = 5; i <= 13; i++) {
-        ff_init_ff_sine_windows(i);
-        printf("SINETABLE(%4i) = {\n", 1 << i);
-        write_float_array(ff_sine_windows[i], 1 << i);
-        printf("};\n");
-    }
-
-    return 0;
-}
+#define USE_FIXED 0
+#include "sinewin_tablegen_template.c"
diff --git a/libavcodec/sinewin_tablegen.h b/libavcodec/sinewin_tablegen.h
index 1ee225bf67..4432135f19 100644
--- a/libavcodec/sinewin_tablegen.h
+++ b/libavcodec/sinewin_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 // do not use libavutil/libm.h since this is compiled both
 // for the host and the target and config.h is only valid for the target
 #include <math.h>
+#include "libavcodec/aac_defines.h"
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
 
@@ -41,26 +42,37 @@ SINETABLE(2048);
 SINETABLE(4096);
 SINETABLE(8192);
 #else
+#if USE_FIXED
+#include "libavcodec/sinewin_fixed_tables.h"
+#else
 #include "libavcodec/sinewin_tables.h"
 #endif
+#endif
+
+#if USE_FIXED
+#define SIN_FIX(a) (int)floor((a) * 0x80000000 + 0.5)
+#else
+#define SIN_FIX(a) a
+#endif
 
-SINETABLE_CONST float * const ff_sine_windows[] = {
+SINETABLE_CONST INTFLOAT * const AAC_RENAME(ff_sine_windows)[] = {
     NULL, NULL, NULL, NULL, NULL, // unused
-    ff_sine_32 , ff_sine_64 ,
-    ff_sine_128, ff_sine_256, ff_sine_512, ff_sine_1024, ff_sine_2048, ff_sine_4096, ff_sine_8192
+    AAC_RENAME(ff_sine_32) , AAC_RENAME(ff_sine_64), AAC_RENAME(ff_sine_128),
+    AAC_RENAME(ff_sine_256), AAC_RENAME(ff_sine_512), AAC_RENAME(ff_sine_1024),
+    AAC_RENAME(ff_sine_2048), AAC_RENAME(ff_sine_4096), AAC_RENAME(ff_sine_8192)
 };
 
 // Generate a sine window.
-av_cold void ff_sine_window_init(float *window, int n) {
+av_cold void AAC_RENAME(ff_sine_window_init)(INTFLOAT *window, int n) {
     int i;
     for(i = 0; i < n; i++)
-        window[i] = sinf((i + 0.5) * (M_PI / (2.0 * n)));
+        window[i] = SIN_FIX(sinf((i + 0.5) * (M_PI / (2.0 * n))));
 }
 
-av_cold void ff_init_ff_sine_windows(int index) {
-    assert(index >= 0 && index < FF_ARRAY_ELEMS(ff_sine_windows));
+av_cold void AAC_RENAME(ff_init_ff_sine_windows)(int index) {
+    assert(index >= 0 && index < FF_ARRAY_ELEMS(AAC_RENAME(ff_sine_windows)));
 #if !CONFIG_HARDCODED_TABLES
-    ff_sine_window_init(ff_sine_windows[index], 1 << index);
+    AAC_RENAME(ff_sine_window_init)(AAC_RENAME(ff_sine_windows)[index], 1 << index);
 #endif
 }
 
diff --git a/libavcodec/sinewin_tablegen_template.c b/libavcodec/sinewin_tablegen_template.c
new file mode 100644
index 0000000000..43ce1ba82e
--- /dev/null
+++ b/libavcodec/sinewin_tablegen_template.c
@@ -0,0 +1,54 @@
+/*
+ * Generate a header file for hardcoded sine windows
+ *
+ * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#include "libavcodec/aac_defines.h"
+#define CONFIG_HARDCODED_TABLES 0
+
+#if USE_FIXED
+#define WRITE_FUNC write_int32_t_array
+#else
+#define WRITE_FUNC write_float_array
+#endif
+
+#define SINETABLE_CONST
+#define SINETABLE(size) \
+    INTFLOAT AAC_RENAME(ff_sine_##size)[size]
+#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0]))
+#include "sinewin_tablegen.h"
+#include "tableprint.h"
+
+int main(void)
+{
+    int i;
+
+    write_fileheader();
+
+    for (i = 5; i <= 13; i++) {
+        AAC_RENAME(ff_init_ff_sine_windows)(i);
+        printf("SINETABLE(%4i) = {\n", 1 << i);
+        WRITE_FUNC(AAC_RENAME(ff_sine_windows)[i], 1 << i);
+        printf("};\n");
+    }
+
+    return 0;
+}
diff --git a/libavcodec/sipr.c b/libavcodec/sipr.c
index 3f17686a2a..4602a42484 100644
--- a/libavcodec/sipr.c
+++ b/libavcodec/sipr.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -139,7 +139,7 @@ const float ff_pow_0_5[] = {
     1.0/(1 << 13), 1.0/(1 << 14), 1.0/(1 << 15), 1.0/(1 << 16)
 };
 
-static void dequant(float *out, const int *idx, const float *cbs[])
+static void dequant(float *out, const int *idx, const float * const cbs[])
 {
     int i;
     int stride  = 2;
@@ -493,8 +493,8 @@ static av_cold int sipr_decoder_init(AVCodecContext * avctx)
         else if (avctx->bit_rate > 5750 ) ctx->mode = MODE_6k5;
         else                              ctx->mode = MODE_5k0;
         av_log(avctx, AV_LOG_WARNING,
-               "Invalid block_align: %d. Mode %s guessed based on bitrate: %d\n",
-               avctx->block_align, modes[ctx->mode].mode_name, avctx->bit_rate);
+               "Invalid block_align: %d. Mode %s guessed based on bitrate: %"PRId64"\n",
+               avctx->block_align, modes[ctx->mode].mode_name, (int64_t)avctx->bit_rate);
     }
 
     av_log(avctx, AV_LOG_DEBUG, "Mode: %s\n", modes[ctx->mode].mode_name);
@@ -543,10 +543,8 @@ static int sipr_decode_frame(AVCodecContext *avctx, void *data,
     /* get output buffer */
     frame->nb_samples = mode_par->frames_per_packet * subframe_size *
                         mode_par->subframe_count;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (float *)frame->data[0];
 
     init_get_bits(&gb, buf, mode_par->bits_per_frame);
diff --git a/libavcodec/sipr.h b/libavcodec/sipr.h
index 4cdea67e51..34f7f994cd 100644
--- a/libavcodec/sipr.h
+++ b/libavcodec/sipr.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sipr16k.c b/libavcodec/sipr16k.c
index f7fcb34315..9c8f684003 100644
--- a/libavcodec/sipr16k.c
+++ b/libavcodec/sipr16k.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,6 @@
 #include "libavutil/float_dsp.h"
 #include "libavutil/mathematics.h"
 #include "lsp.h"
-#include "celp_filters.h"
 #include "acelp_vectors.h"
 #include "acelp_pitch_delay.h"
 #include "acelp_filters.h"
@@ -51,7 +50,7 @@ static void lsf2lsp(const float *lsf, double *lsp)
         lsp[i] = cosf(lsf[i]);
 }
 
-static void dequant(float *out, const int *idx, const float *cbs[])
+static void dequant(float *out, const int *idx, const float * const cbs[])
 {
     int i;
 
diff --git a/libavcodec/sipr16kdata.h b/libavcodec/sipr16kdata.h
index ec60c29b51..16a653da94 100644
--- a/libavcodec/sipr16kdata.h
+++ b/libavcodec/sipr16kdata.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -525,7 +525,7 @@ static const float lsf_cb5_16k[128][4] = {
     { 0.124405,  0.009943, -0.148477, -0.205184}
 };
 
-static const float *lsf_codebooks_16k[] = {
+static const float * const lsf_codebooks_16k[] = {
     lsf_cb1_16k[0], lsf_cb2_16k[0], lsf_cb3_16k[0], lsf_cb4_16k[0],
     lsf_cb5_16k[0]
 };
diff --git a/libavcodec/siprdata.h b/libavcodec/siprdata.h
index 92037a4a87..0dbc113fca 100644
--- a/libavcodec/siprdata.h
+++ b/libavcodec/siprdata.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -206,7 +206,7 @@ static const float lsf_cb5[32][2] = {
     { 0.150514,  0.034366}, { 0.186092, -0.069272}
 };
 
-static const float *lsf_codebooks[] = {
+static const float * const lsf_codebooks[] = {
     lsf_cb1[0], lsf_cb2[0], lsf_cb3[0], lsf_cb4[0], lsf_cb5[0]
 };
 
diff --git a/libavcodec/smacker.c b/libavcodec/smacker.c
index 20f93d5c8b..b2fc29b138 100644
--- a/libavcodec/smacker.c
+++ b/libavcodec/smacker.c
@@ -2,20 +2,20 @@
  * Smacker decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -97,10 +97,14 @@ enum SmkBlockTypes {
  */
 static int smacker_decode_tree(GetBitContext *gb, HuffContext *hc, uint32_t prefix, int length)
 {
+    if(length > 32 || length > 3*SMKTREE_BITS) {
+        av_log(NULL, AV_LOG_ERROR, "length too long\n");
+        return AVERROR_INVALIDDATA;
+    }
     if(!get_bits1(gb)){ //Leaf
-        if(hc->current >= 256){
+        if(hc->current >= hc->length){
             av_log(NULL, AV_LOG_ERROR, "Tree size exceeded!\n");
-            return -1;
+            return AVERROR_INVALIDDATA;
         }
         if(length){
             hc->bits[hc->current] = prefix;
@@ -131,14 +135,14 @@ static int smacker_decode_bigtree(GetBitContext *gb, HuffContext *hc, DBCtx *ctx
 {
     if (hc->current + 1 >= hc->length) {
         av_log(NULL, AV_LOG_ERROR, "Tree size exceeded!\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     if(!get_bits1(gb)){ //Leaf
         int val, i1, i2;
         i1 = ctx->v1->table ? get_vlc2(gb, ctx->v1->table, SMKTREE_BITS, 3) : 0;
         i2 = ctx->v2->table ? get_vlc2(gb, ctx->v2->table, SMKTREE_BITS, 3) : 0;
         if (i1 < 0 || i2 < 0)
-            return -1;
+            return AVERROR_INVALIDDATA;
         val = ctx->recode1[i1] | (ctx->recode2[i2] << 8);
         if(val == ctx->escapes[0]) {
             ctx->last[0] = hc->current;
@@ -170,7 +174,7 @@ static int smacker_decode_bigtree(GetBitContext *gb, HuffContext *hc, DBCtx *ctx
 }
 
 /**
- * Store large tree as Libav's vlc codes
+ * Store large tree as FFmpeg's vlc codes
  */
 static int smacker_decode_header_tree(SmackVContext *smk, GetBitContext *gb, int **recodes, int *last, int size)
 {
@@ -184,7 +188,7 @@ static int smacker_decode_header_tree(SmackVContext *smk, GetBitContext *gb, int
 
     if(size >= UINT_MAX>>4){ // (((size + 3) >> 2) + 3) << 2 must not overflow
         av_log(smk->avctx, AV_LOG_ERROR, "size too large\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     tmp1.length = 256;
@@ -207,40 +211,51 @@ static int smacker_decode_header_tree(SmackVContext *smk, GetBitContext *gb, int
     }
 
     if(get_bits1(gb)) {
-        smacker_decode_tree(gb, &tmp1, 0, 0);
-        skip_bits1(gb);
-        res = init_vlc(&vlc[0], SMKTREE_BITS, tmp1.length,
-                    tmp1.lengths, sizeof(int), sizeof(int),
-                    tmp1.bits, sizeof(uint32_t), sizeof(uint32_t), INIT_VLC_LE);
-        if(res < 0) {
-            av_log(smk->avctx, AV_LOG_ERROR, "Cannot build VLC table\n");
+        res = smacker_decode_tree(gb, &tmp1, 0, 0);
+        if (res < 0) {
             err = res;
             goto error;
         }
-    } else {
+        skip_bits1(gb);
+        if(tmp1.current > 1) {
+            res = init_vlc(&vlc[0], SMKTREE_BITS, tmp1.length,
+                        tmp1.lengths, sizeof(int), sizeof(int),
+                        tmp1.bits, sizeof(uint32_t), sizeof(uint32_t), INIT_VLC_LE);
+            if(res < 0) {
+                av_log(smk->avctx, AV_LOG_ERROR, "Cannot build VLC table\n");
+                err = res;
+                goto error;
+            }
+        }
+    }
+    if (!vlc[0].table) {
         av_log(smk->avctx, AV_LOG_ERROR, "Skipping low bytes tree\n");
     }
     if(get_bits1(gb)){
-        smacker_decode_tree(gb, &tmp2, 0, 0);
-        skip_bits1(gb);
-        res = init_vlc(&vlc[1], SMKTREE_BITS, tmp2.length,
-                    tmp2.lengths, sizeof(int), sizeof(int),
-                    tmp2.bits, sizeof(uint32_t), sizeof(uint32_t), INIT_VLC_LE);
-        if(res < 0) {
-            av_log(smk->avctx, AV_LOG_ERROR, "Cannot build VLC table\n");
+        res = smacker_decode_tree(gb, &tmp2, 0, 0);
+        if (res < 0) {
             err = res;
             goto error;
         }
-    } else {
+        skip_bits1(gb);
+        if(tmp2.current > 1) {
+            res = init_vlc(&vlc[1], SMKTREE_BITS, tmp2.length,
+                        tmp2.lengths, sizeof(int), sizeof(int),
+                        tmp2.bits, sizeof(uint32_t), sizeof(uint32_t), INIT_VLC_LE);
+            if(res < 0) {
+                av_log(smk->avctx, AV_LOG_ERROR, "Cannot build VLC table\n");
+                err = res;
+                goto error;
+            }
+        }
+    }
+    if (!vlc[1].table) {
         av_log(smk->avctx, AV_LOG_ERROR, "Skipping high bytes tree\n");
     }
 
-    escapes[0]  = get_bits(gb, 8);
-    escapes[0] |= get_bits(gb, 8) << 8;
-    escapes[1]  = get_bits(gb, 8);
-    escapes[1] |= get_bits(gb, 8) << 8;
-    escapes[2]  = get_bits(gb, 8);
-    escapes[2] |= get_bits(gb, 8) << 8;
+    escapes[0]  = get_bits(gb, 16);
+    escapes[1]  = get_bits(gb, 16);
+    escapes[2]  = get_bits(gb, 16);
 
     last[0] = last[1] = last[2] = -1;
 
@@ -256,7 +271,7 @@ static int smacker_decode_header_tree(SmackVContext *smk, GetBitContext *gb, int
     huff.length = ((size + 3) >> 2) + 4;
     huff.maxlength = 0;
     huff.current = 0;
-    huff.values = av_mallocz(huff.length * sizeof(int));
+    huff.values = av_mallocz_array(huff.length, sizeof(int));
     if (!huff.values) {
         err = AVERROR(ENOMEM);
         goto error;
@@ -294,14 +309,16 @@ error:
 
 static int decode_header_trees(SmackVContext *smk) {
     GetBitContext gb;
-    int mmap_size, mclr_size, full_size, type_size;
+    int mmap_size, mclr_size, full_size, type_size, ret;
 
     mmap_size = AV_RL32(smk->avctx->extradata);
     mclr_size = AV_RL32(smk->avctx->extradata + 4);
     full_size = AV_RL32(smk->avctx->extradata + 8);
     type_size = AV_RL32(smk->avctx->extradata + 12);
 
-    init_get_bits(&gb, smk->avctx->extradata + 16, (smk->avctx->extradata_size - 16) * 8);
+    ret = init_get_bits8(&gb, smk->avctx->extradata + 16, smk->avctx->extradata_size - 16);
+    if (ret < 0)
+        return ret;
 
     if(!get_bits1(&gb)) {
         av_log(smk->avctx, AV_LOG_INFO, "Skipping MMAP tree\n");
@@ -311,8 +328,9 @@ static int decode_header_trees(SmackVContext *smk) {
         smk->mmap_tbl[0] = 0;
         smk->mmap_last[0] = smk->mmap_last[1] = smk->mmap_last[2] = 1;
     } else {
-        if (smacker_decode_header_tree(smk, &gb, &smk->mmap_tbl, smk->mmap_last, mmap_size))
-            return -1;
+        ret = smacker_decode_header_tree(smk, &gb, &smk->mmap_tbl, smk->mmap_last, mmap_size);
+        if (ret < 0)
+            return ret;
     }
     if(!get_bits1(&gb)) {
         av_log(smk->avctx, AV_LOG_INFO, "Skipping MCLR tree\n");
@@ -322,8 +340,9 @@ static int decode_header_trees(SmackVContext *smk) {
         smk->mclr_tbl[0] = 0;
         smk->mclr_last[0] = smk->mclr_last[1] = smk->mclr_last[2] = 1;
     } else {
-        if (smacker_decode_header_tree(smk, &gb, &smk->mclr_tbl, smk->mclr_last, mclr_size))
-            return -1;
+        ret = smacker_decode_header_tree(smk, &gb, &smk->mclr_tbl, smk->mclr_last, mclr_size);
+        if (ret < 0)
+            return ret;
     }
     if(!get_bits1(&gb)) {
         av_log(smk->avctx, AV_LOG_INFO, "Skipping FULL tree\n");
@@ -333,8 +352,9 @@ static int decode_header_trees(SmackVContext *smk) {
         smk->full_tbl[0] = 0;
         smk->full_last[0] = smk->full_last[1] = smk->full_last[2] = 1;
     } else {
-        if (smacker_decode_header_tree(smk, &gb, &smk->full_tbl, smk->full_last, full_size))
-            return -1;
+        ret = smacker_decode_header_tree(smk, &gb, &smk->full_tbl, smk->full_last, full_size);
+        if (ret < 0)
+            return ret;
     }
     if(!get_bits1(&gb)) {
         av_log(smk->avctx, AV_LOG_INFO, "Skipping TYPE tree\n");
@@ -344,8 +364,9 @@ static int decode_header_trees(SmackVContext *smk) {
         smk->type_tbl[0] = 0;
         smk->type_last[0] = smk->type_last[1] = smk->type_last[2] = 1;
     } else {
-        if (smacker_decode_header_tree(smk, &gb, &smk->type_tbl, smk->type_last, type_size))
-            return -1;
+        ret = smacker_decode_header_tree(smk, &gb, &smk->type_tbl, smk->type_last, type_size);
+        if (ret < 0)
+            return ret;
     }
 
     return 0;
@@ -389,12 +410,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int flags;
 
     if (avpkt->size <= 769)
-        return 0;
+        return AVERROR_INVALIDDATA;
 
-    if ((ret = ff_reget_buffer(avctx, smk->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, smk->pic)) < 0)
         return ret;
-    }
 
     /* make the palette available on the way out */
     pal = (uint32_t*)smk->pic->data[1];
@@ -402,25 +421,25 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     flags = bytestream2_get_byteu(&gb2);
     smk->pic->palette_has_changed = flags & 1;
     smk->pic->key_frame = !!(flags & 2);
-    if(smk->pic->key_frame)
+    if (smk->pic->key_frame)
         smk->pic->pict_type = AV_PICTURE_TYPE_I;
     else
         smk->pic->pict_type = AV_PICTURE_TYPE_P;
 
     for(i = 0; i < 256; i++)
-        *pal++ = bytestream2_get_be24u(&gb2);
+        *pal++ = 0xFFU << 24 | bytestream2_get_be24u(&gb2);
 
     last_reset(smk->mmap_tbl, smk->mmap_last);
     last_reset(smk->mclr_tbl, smk->mclr_last);
     last_reset(smk->full_tbl, smk->full_last);
     last_reset(smk->type_tbl, smk->type_last);
-    init_get_bits(&gb, avpkt->data + 769, (avpkt->size - 769) * 8);
+    if ((ret = init_get_bits8(&gb, avpkt->data + 769, avpkt->size - 769)) < 0)
+        return ret;
 
     blk = 0;
     bw = avctx->width >> 2;
     bh = avctx->height >> 2;
     blocks = bw * bh;
-    out = smk->pic->data[0];
     stride = smk->pic->linesize[0];
     while(blk < blocks) {
         int type, run, mode;
@@ -481,7 +500,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                     out += stride;
                     out[0] = out[1] = pix & 0xFF;
                     out[2] = out[3] = pix >> 8;
-                    out += stride;
                     break;
                 case 2:
                     for(i = 0; i < 2; i++) {
@@ -560,6 +578,7 @@ static av_cold int decode_end(AVCodecContext *avctx)
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     SmackVContext * const c = avctx->priv_data;
+    int ret;
 
     c->avctx = avctx;
 
@@ -572,19 +591,20 @@ static av_cold int decode_init(AVCodecContext *avctx)
     /* decode huffman trees from extradata */
     if(avctx->extradata_size < 16){
         av_log(avctx, AV_LOG_ERROR, "Extradata missing!\n");
-        return -1;
+        decode_end(avctx);
+        return AVERROR(EINVAL);
     }
 
-    if (decode_header_trees(c)) {
+    ret = decode_header_trees(c);
+    if (ret < 0) {
         decode_end(avctx);
-        return -1;
+        return ret;
     }
 
     return 0;
 }
 
 
-
 static av_cold int smka_decode_init(AVCodecContext *avctx)
 {
     if (avctx->channels < 1 || avctx->channels > 2) {
@@ -624,7 +644,13 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
 
     unp_size = AV_RL32(buf);
 
-    init_get_bits(&gb, buf + 4, (buf_size - 4) * 8);
+    if (unp_size > (1U<<24)) {
+        av_log(avctx, AV_LOG_ERROR, "packet is too big\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = init_get_bits8(&gb, buf + 4, buf_size - 4)) < 0)
+        return ret;
 
     if(!get_bits1(&gb)){
         av_log(avctx, AV_LOG_INFO, "Sound: no data\n");
@@ -637,17 +663,15 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
         av_log(avctx, AV_LOG_ERROR, "channels mismatch\n");
         return AVERROR(EINVAL);
     }
-    if (bits && avctx->sample_fmt == AV_SAMPLE_FMT_U8) {
+    if (bits == (avctx->sample_fmt == AV_SAMPLE_FMT_U8)) {
         av_log(avctx, AV_LOG_ERROR, "sample format mismatch\n");
         return AVERROR(EINVAL);
     }
 
     /* get output buffer */
     frame->nb_samples = unp_size / (avctx->channels * (bits + 1));
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples  = (int16_t *)frame->data[0];
     samples8 =            frame->data[0];
 
@@ -687,16 +711,26 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
         for(i = 0; i <= stereo; i++)
             *samples++ = pred[i];
         for(; i < unp_size / 2; i++) {
+            if(get_bits_left(&gb)<0)
+                return AVERROR_INVALIDDATA;
             if(i & stereo) {
                 if(vlc[2].table)
                     res = get_vlc2(&gb, vlc[2].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 val  = h[2].values[res];
                 if(vlc[3].table)
                     res = get_vlc2(&gb, vlc[3].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 val |= h[3].values[res] << 8;
                 pred[1] += sign_extend(val, 16);
                 *samples++ = pred[1];
@@ -705,11 +739,19 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
                     res = get_vlc2(&gb, vlc[0].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 val  = h[0].values[res];
                 if(vlc[1].table)
                     res = get_vlc2(&gb, vlc[1].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 val |= h[1].values[res] << 8;
                 pred[0] += sign_extend(val, 16);
                 *samples++ = pred[0];
@@ -721,11 +763,17 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
         for(i = 0; i <= stereo; i++)
             *samples8++ = pred[i];
         for(; i < unp_size; i++) {
+            if(get_bits_left(&gb)<0)
+                return AVERROR_INVALIDDATA;
             if(i & stereo){
                 if(vlc[1].table)
                     res = get_vlc2(&gb, vlc[1].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 pred[1] += sign_extend(h[1].values[res], 8);
                 *samples8++ = pred[1];
             } else {
@@ -733,6 +781,10 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
                     res = get_vlc2(&gb, vlc[0].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 pred[0] += sign_extend(h[0].values[res], 8);
                 *samples8++ = pred[0];
             }
diff --git a/libavcodec/smc.c b/libavcodec/smc.c
index b91e410196..a423c455cc 100644
--- a/libavcodec/smc.c
+++ b/libavcodec/smc.c
@@ -1,21 +1,21 @@
 /*
  * Quicktime Graphics (SMC) Video Decoder
- * Copyright (C) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -84,7 +84,7 @@ static void smc_decode_stream(SmcContext *s)
     int stride = s->frame->linesize[0];
     int i;
     int chunk_size;
-    int buf_size = (int) (s->gb.buffer_end - s->gb.buffer_start);
+    int buf_size = bytestream2_size(&s->gb);
     unsigned char opcode;
     int n_blocks;
     unsigned int color_flags;
@@ -436,10 +436,8 @@ static int smc_decode_frame(AVCodecContext *avctx,
 
     bytestream2_init(&s->gb, buf, buf_size);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     if (pal) {
         s->frame->palette_has_changed = 1;
diff --git a/libavcodec/smvjpegdec.c b/libavcodec/smvjpegdec.c
new file mode 100644
index 0000000000..9c2fb380e1
--- /dev/null
+++ b/libavcodec/smvjpegdec.c
@@ -0,0 +1,220 @@
+/*
+ * SMV JPEG decoder
+ * Copyright (c) 2013 Ash Hughes
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SMV JPEG decoder.
+ */
+
+// #define DEBUG
+#include "avcodec.h"
+#include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
+#include "mjpegdec.h"
+#include "internal.h"
+
+typedef struct SMVJpegDecodeContext {
+    MJpegDecodeContext jpg;
+    AVFrame *picture[2]; /* pictures array */
+    AVCodecContext* avctx;
+    int frames_per_jpeg;
+    int mjpeg_data_size;
+} SMVJpegDecodeContext;
+
+static inline void smv_img_pnt_plane(uint8_t      **dst, uint8_t *src,
+                                     int src_linesize, int height, int nlines)
+{
+    if (!dst || !src)
+        return;
+    src += (nlines) * src_linesize * height;
+    *dst = src;
+}
+
+static inline void smv_img_pnt(uint8_t *dst_data[4], uint8_t *src_data[4],
+                               const int src_linesizes[4],
+                               enum AVPixelFormat pix_fmt, int width, int height,
+                               int nlines)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+    int i, planes_nb = 0;
+
+    if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL)
+        return;
+
+    for (i = 0; i < desc->nb_components; i++)
+        planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
+
+    for (i = 0; i < planes_nb; i++) {
+        int h = height;
+        if (i == 1 || i == 2) {
+            h = FF_CEIL_RSHIFT(height, desc->log2_chroma_h);
+        }
+        smv_img_pnt_plane(&dst_data[i], src_data[i],
+            src_linesizes[i], h, nlines);
+    }
+    if (desc->flags & AV_PIX_FMT_FLAG_PAL ||
+        desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL)
+        dst_data[1] = src_data[1];
+}
+
+static av_cold int smvjpeg_decode_end(AVCodecContext *avctx)
+{
+    SMVJpegDecodeContext *s = avctx->priv_data;
+    MJpegDecodeContext *jpg = &s->jpg;
+    int ret;
+
+    jpg->picture_ptr = NULL;
+    av_frame_free(&s->picture[0]);
+    av_frame_free(&s->picture[1]);
+    ret = avcodec_close(s->avctx);
+    av_freep(&s->avctx);
+    return ret;
+}
+
+static av_cold int smvjpeg_decode_init(AVCodecContext *avctx)
+{
+    SMVJpegDecodeContext *s = avctx->priv_data;
+    AVCodec *codec;
+    AVDictionary *thread_opt = NULL;
+    int ret = 0, r;
+
+    s->frames_per_jpeg = 0;
+
+    s->picture[0] = av_frame_alloc();
+    if (!s->picture[0])
+        return AVERROR(ENOMEM);
+
+    s->picture[1] = av_frame_alloc();
+    if (!s->picture[1]) {
+        av_frame_free(&s->picture[0]);
+        return AVERROR(ENOMEM);
+    }
+
+    s->jpg.picture_ptr      = s->picture[0];
+
+    if (avctx->extradata_size >= 4)
+        s->frames_per_jpeg = AV_RL32(avctx->extradata);
+
+    if (s->frames_per_jpeg <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of frames per jpeg.\n");
+        ret = AVERROR_INVALIDDATA;
+    }
+
+    codec = avcodec_find_decoder(AV_CODEC_ID_MJPEG);
+    if (!codec) {
+        av_log(avctx, AV_LOG_ERROR, "MJPEG codec not found\n");
+        smvjpeg_decode_end(avctx);
+        return AVERROR_DECODER_NOT_FOUND;
+    }
+
+    s->avctx = avcodec_alloc_context3(codec);
+
+    av_dict_set(&thread_opt, "threads", "1", 0);
+    s->avctx->refcounted_frames = 1;
+    s->avctx->flags = avctx->flags;
+    s->avctx->idct_algo = avctx->idct_algo;
+    if ((r = ff_codec_open2_recursive(s->avctx, codec, &thread_opt)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "MJPEG codec failed to open\n");
+        ret = r;
+    }
+    av_dict_free(&thread_opt);
+
+    if (ret < 0)
+        smvjpeg_decode_end(avctx);
+    return ret;
+}
+
+static int smvjpeg_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
+                            AVPacket *avpkt)
+{
+    const AVPixFmtDescriptor *desc;
+    SMVJpegDecodeContext *s = avctx->priv_data;
+    AVFrame* mjpeg_data = s->picture[0];
+    int i, cur_frame = 0, ret = 0;
+
+    cur_frame = avpkt->pts % s->frames_per_jpeg;
+
+    /* Are we at the start of a block? */
+    if (!cur_frame) {
+        av_frame_unref(mjpeg_data);
+        ret = avcodec_decode_video2(s->avctx, mjpeg_data, &s->mjpeg_data_size, avpkt);
+        if (ret < 0) {
+            s->mjpeg_data_size = 0;
+            return ret;
+        }
+    } else if (!s->mjpeg_data_size)
+        return AVERROR(EINVAL);
+
+    desc = av_pix_fmt_desc_get(s->avctx->pix_fmt);
+    av_assert0(desc);
+
+    if (mjpeg_data->height % (s->frames_per_jpeg << desc->log2_chroma_h)) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid height\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /*use the last lot... */
+    *data_size = s->mjpeg_data_size;
+
+    avctx->pix_fmt = s->avctx->pix_fmt;
+
+    /* We shouldn't get here if frames_per_jpeg <= 0 because this was rejected
+       in init */
+    ret = ff_set_dimensions(avctx, mjpeg_data->width, mjpeg_data->height / s->frames_per_jpeg);
+    if (ret < 0) {
+        av_log(s, AV_LOG_ERROR, "Failed to set dimensions\n");
+        return ret;
+    }
+
+    if (*data_size) {
+        s->picture[1]->extended_data = NULL;
+        s->picture[1]->width         = avctx->width;
+        s->picture[1]->height        = avctx->height;
+        s->picture[1]->format        = avctx->pix_fmt;
+        /* ff_init_buffer_info(avctx, &s->picture[1]); */
+        smv_img_pnt(s->picture[1]->data, mjpeg_data->data, mjpeg_data->linesize,
+                    avctx->pix_fmt, avctx->width, avctx->height, cur_frame);
+        for (i = 0; i < AV_NUM_DATA_POINTERS; i++)
+            s->picture[1]->linesize[i] = mjpeg_data->linesize[i];
+
+        ret = av_frame_ref(data, s->picture[1]);
+    }
+
+    return ret;
+}
+
+static const AVClass smvjpegdec_class = {
+    .class_name = "SMVJPEG decoder",
+    .item_name  = av_default_item_name,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_smvjpeg_decoder = {
+    .name           = "smvjpeg",
+    .long_name      = NULL_IF_CONFIG_SMALL("SMV JPEG"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_SMVJPEG,
+    .priv_data_size = sizeof(SMVJpegDecodeContext),
+    .init           = smvjpeg_decode_init,
+    .close          = smvjpeg_decode_end,
+    .decode         = smvjpeg_decode_frame,
+    .priv_class     = &smvjpegdec_class,
+};
diff --git a/libavcodec/snappy.c b/libavcodec/snappy.c
index df6c6b386a..7900b0f978 100644
--- a/libavcodec/snappy.c
+++ b/libavcodec/snappy.c
@@ -2,20 +2,20 @@
  * Snappy decompression algorithm
  * Copyright (c) 2015 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -148,7 +148,7 @@ int ff_snappy_uncompress(GetByteContext *gb, uint8_t *buf, int64_t *size)
         return len;
 
     if (len > *size)
-        return AVERROR_BUG;
+        return AVERROR_BUFFER_TOO_SMALL;
 
     *size = len;
     p     = buf;
diff --git a/libavcodec/snappy.h b/libavcodec/snappy.h
index 8d365c0f8e..a65cb3aa1b 100644
--- a/libavcodec/snappy.h
+++ b/libavcodec/snappy.h
@@ -2,20 +2,20 @@
  * Snappy module
  * Copyright (c) Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/snow.c b/libavcodec/snow.c
new file mode 100644
index 0000000000..fc2e7279f3
--- /dev/null
+++ b/libavcodec/snow.c
@@ -0,0 +1,733 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intmath.h"
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "me_cmp.h"
+#include "snow_dwt.h"
+#include "internal.h"
+#include "snow.h"
+#include "snowdata.h"
+
+#include "rangecoder.h"
+#include "mathops.h"
+#include "h263.h"
+
+
+void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                              int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    int y, x;
+    IDWTELEM * dst;
+    for(y=0; y<b_h; y++){
+        //FIXME ugly misuse of obmc_stride
+        const uint8_t *obmc1= obmc + y*obmc_stride;
+        const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+        const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+        const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+        dst = slice_buffer_get_line(sb, src_y + y);
+        for(x=0; x<b_w; x++){
+            int v=   obmc1[x] * block[3][x + y*src_stride]
+                    +obmc2[x] * block[2][x + y*src_stride]
+                    +obmc3[x] * block[1][x + y*src_stride]
+                    +obmc4[x] * block[0][x + y*src_stride];
+
+            v <<= 8 - LOG2_OBMC_MAX;
+            if(FRAC_BITS != 8){
+                v >>= 8 - FRAC_BITS;
+            }
+            if(add){
+                v += dst[x + src_x];
+                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
+                if(v&(~255)) v= ~(v>>31);
+                dst8[x + y*src_stride] = v;
+            }else{
+                dst[x + src_x] -= v;
+            }
+        }
+    }
+}
+
+int ff_snow_get_buffer(SnowContext *s, AVFrame *frame)
+{
+    int ret, i;
+    int edges_needed = av_codec_is_encoder(s->avctx->codec);
+
+    frame->width  = s->avctx->width ;
+    frame->height = s->avctx->height;
+    if (edges_needed) {
+        frame->width  += 2 * EDGE_WIDTH;
+        frame->height += 2 * EDGE_WIDTH;
+    }
+    if ((ret = ff_get_buffer(s->avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
+        return ret;
+    if (edges_needed) {
+        for (i = 0; frame->data[i]; i++) {
+            int offset = (EDGE_WIDTH >> (i ? s->chroma_v_shift : 0)) *
+                            frame->linesize[i] +
+                            (EDGE_WIDTH >> (i ? s->chroma_h_shift : 0));
+            frame->data[i] += offset;
+        }
+        frame->width  = s->avctx->width;
+        frame->height = s->avctx->height;
+    }
+
+    return 0;
+}
+
+void ff_snow_reset_contexts(SnowContext *s){ //FIXME better initial contexts
+    int plane_index, level, orientation;
+
+    for(plane_index=0; plane_index<3; plane_index++){
+        for(level=0; level<MAX_DECOMPOSITIONS; level++){
+            for(orientation=level ? 1:0; orientation<4; orientation++){
+                memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
+            }
+        }
+    }
+    memset(s->header_state, MID_STATE, sizeof(s->header_state));
+    memset(s->block_state, MID_STATE, sizeof(s->block_state));
+}
+
+int ff_snow_alloc_blocks(SnowContext *s){
+    int w= FF_CEIL_RSHIFT(s->avctx->width,  LOG2_MB_SIZE);
+    int h= FF_CEIL_RSHIFT(s->avctx->height, LOG2_MB_SIZE);
+
+    s->b_width = w;
+    s->b_height= h;
+
+    av_free(s->block);
+    s->block= av_mallocz_array(w * h,  sizeof(BlockNode) << (s->block_max_depth*2));
+    if (!s->block)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold void init_qexp(void){
+    int i;
+    double v=128;
+
+    for(i=0; i<QROOT; i++){
+        ff_qexp[i]= lrintf(v);
+        v *= pow(2, 1.0 / QROOT);
+    }
+}
+static void mc_block(Plane *p, uint8_t *dst, const uint8_t *src, int stride, int b_w, int b_h, int dx, int dy){
+    static const uint8_t weight[64]={
+    8,7,6,5,4,3,2,1,
+    7,7,0,0,0,0,0,1,
+    6,0,6,0,0,0,2,0,
+    5,0,0,5,0,3,0,0,
+    4,0,0,0,4,0,0,0,
+    3,0,0,5,0,3,0,0,
+    2,0,6,0,0,0,2,0,
+    1,7,0,0,0,0,0,1,
+    };
+
+    static const uint8_t brane[256]={
+    0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x11,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
+    0x04,0x05,0xcc,0xcc,0xcc,0xcc,0xcc,0x41,0x15,0x16,0xcc,0xcc,0xcc,0xcc,0xcc,0x52,
+    0x04,0xcc,0x05,0xcc,0xcc,0xcc,0x41,0xcc,0x15,0xcc,0x16,0xcc,0xcc,0xcc,0x52,0xcc,
+    0x04,0xcc,0xcc,0x05,0xcc,0x41,0xcc,0xcc,0x15,0xcc,0xcc,0x16,0xcc,0x52,0xcc,0xcc,
+    0x04,0xcc,0xcc,0xcc,0x41,0xcc,0xcc,0xcc,0x15,0xcc,0xcc,0xcc,0x16,0xcc,0xcc,0xcc,
+    0x04,0xcc,0xcc,0x41,0xcc,0x05,0xcc,0xcc,0x15,0xcc,0xcc,0x52,0xcc,0x16,0xcc,0xcc,
+    0x04,0xcc,0x41,0xcc,0xcc,0xcc,0x05,0xcc,0x15,0xcc,0x52,0xcc,0xcc,0xcc,0x16,0xcc,
+    0x04,0x41,0xcc,0xcc,0xcc,0xcc,0xcc,0x05,0x15,0x52,0xcc,0xcc,0xcc,0xcc,0xcc,0x16,
+    0x44,0x45,0x45,0x45,0x45,0x45,0x45,0x45,0x55,0x56,0x56,0x56,0x56,0x56,0x56,0x56,
+    0x48,0x49,0xcc,0xcc,0xcc,0xcc,0xcc,0x85,0x59,0x5A,0xcc,0xcc,0xcc,0xcc,0xcc,0x96,
+    0x48,0xcc,0x49,0xcc,0xcc,0xcc,0x85,0xcc,0x59,0xcc,0x5A,0xcc,0xcc,0xcc,0x96,0xcc,
+    0x48,0xcc,0xcc,0x49,0xcc,0x85,0xcc,0xcc,0x59,0xcc,0xcc,0x5A,0xcc,0x96,0xcc,0xcc,
+    0x48,0xcc,0xcc,0xcc,0x49,0xcc,0xcc,0xcc,0x59,0xcc,0xcc,0xcc,0x96,0xcc,0xcc,0xcc,
+    0x48,0xcc,0xcc,0x85,0xcc,0x49,0xcc,0xcc,0x59,0xcc,0xcc,0x96,0xcc,0x5A,0xcc,0xcc,
+    0x48,0xcc,0x85,0xcc,0xcc,0xcc,0x49,0xcc,0x59,0xcc,0x96,0xcc,0xcc,0xcc,0x5A,0xcc,
+    0x48,0x85,0xcc,0xcc,0xcc,0xcc,0xcc,0x49,0x59,0x96,0xcc,0xcc,0xcc,0xcc,0xcc,0x5A,
+    };
+
+    static const uint8_t needs[16]={
+    0,1,0,0,
+    2,4,2,0,
+    0,1,0,0,
+    15
+    };
+
+    int x, y, b, r, l;
+    int16_t tmpIt   [64*(32+HTAPS_MAX)];
+    uint8_t tmp2t[3][64*(32+HTAPS_MAX)];
+    int16_t *tmpI= tmpIt;
+    uint8_t *tmp2= tmp2t[0];
+    const uint8_t *hpel[11];
+    av_assert2(dx<16 && dy<16);
+    r= brane[dx + 16*dy]&15;
+    l= brane[dx + 16*dy]>>4;
+
+    b= needs[l] | needs[r];
+    if(p && !p->diag_mc)
+        b= 15;
+
+    if(b&5){
+        for(y=0; y < b_h+HTAPS_MAX-1; y++){
+            for(x=0; x < b_w; x++){
+                int a_1=src[x + HTAPS_MAX/2-4];
+                int a0= src[x + HTAPS_MAX/2-3];
+                int a1= src[x + HTAPS_MAX/2-2];
+                int a2= src[x + HTAPS_MAX/2-1];
+                int a3= src[x + HTAPS_MAX/2+0];
+                int a4= src[x + HTAPS_MAX/2+1];
+                int a5= src[x + HTAPS_MAX/2+2];
+                int a6= src[x + HTAPS_MAX/2+3];
+                int am=0;
+                if(!p || p->fast_mc){
+                    am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+                    tmpI[x]= am;
+                    am= (am+16)>>5;
+                }else{
+                    am= p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6);
+                    tmpI[x]= am;
+                    am= (am+32)>>6;
+                }
+
+                if(am&(~255)) am= ~(am>>31);
+                tmp2[x]= am;
+            }
+            tmpI+= 64;
+            tmp2+= 64;
+            src += stride;
+        }
+        src -= stride*y;
+    }
+    src += HTAPS_MAX/2 - 1;
+    tmp2= tmp2t[1];
+
+    if(b&2){
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w+1; x++){
+                int a_1=src[x + (HTAPS_MAX/2-4)*stride];
+                int a0= src[x + (HTAPS_MAX/2-3)*stride];
+                int a1= src[x + (HTAPS_MAX/2-2)*stride];
+                int a2= src[x + (HTAPS_MAX/2-1)*stride];
+                int a3= src[x + (HTAPS_MAX/2+0)*stride];
+                int a4= src[x + (HTAPS_MAX/2+1)*stride];
+                int a5= src[x + (HTAPS_MAX/2+2)*stride];
+                int a6= src[x + (HTAPS_MAX/2+3)*stride];
+                int am=0;
+                if(!p || p->fast_mc)
+                    am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 16)>>5;
+                else
+                    am= (p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6) + 32)>>6;
+
+                if(am&(~255)) am= ~(am>>31);
+                tmp2[x]= am;
+            }
+            src += stride;
+            tmp2+= 64;
+        }
+        src -= stride*y;
+    }
+    src += stride*(HTAPS_MAX/2 - 1);
+    tmp2= tmp2t[2];
+    tmpI= tmpIt;
+    if(b&4){
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                int a_1=tmpI[x + (HTAPS_MAX/2-4)*64];
+                int a0= tmpI[x + (HTAPS_MAX/2-3)*64];
+                int a1= tmpI[x + (HTAPS_MAX/2-2)*64];
+                int a2= tmpI[x + (HTAPS_MAX/2-1)*64];
+                int a3= tmpI[x + (HTAPS_MAX/2+0)*64];
+                int a4= tmpI[x + (HTAPS_MAX/2+1)*64];
+                int a5= tmpI[x + (HTAPS_MAX/2+2)*64];
+                int a6= tmpI[x + (HTAPS_MAX/2+3)*64];
+                int am=0;
+                if(!p || p->fast_mc)
+                    am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 512)>>10;
+                else
+                    am= (p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6) + 2048)>>12;
+                if(am&(~255)) am= ~(am>>31);
+                tmp2[x]= am;
+            }
+            tmpI+= 64;
+            tmp2+= 64;
+        }
+    }
+
+    hpel[ 0]= src;
+    hpel[ 1]= tmp2t[0] + 64*(HTAPS_MAX/2-1);
+    hpel[ 2]= src + 1;
+
+    hpel[ 4]= tmp2t[1];
+    hpel[ 5]= tmp2t[2];
+    hpel[ 6]= tmp2t[1] + 1;
+
+    hpel[ 8]= src + stride;
+    hpel[ 9]= hpel[1] + 64;
+    hpel[10]= hpel[8] + 1;
+
+#define MC_STRIDE(x) (needs[x] ? 64 : stride)
+
+    if(b==15){
+        int dxy = dx / 8 + dy / 8 * 4;
+        const uint8_t *src1 = hpel[dxy    ];
+        const uint8_t *src2 = hpel[dxy + 1];
+        const uint8_t *src3 = hpel[dxy + 4];
+        const uint8_t *src4 = hpel[dxy + 5];
+        int stride1 = MC_STRIDE(dxy);
+        int stride2 = MC_STRIDE(dxy + 1);
+        int stride3 = MC_STRIDE(dxy + 4);
+        int stride4 = MC_STRIDE(dxy + 5);
+        dx&=7;
+        dy&=7;
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                dst[x]= ((8-dx)*(8-dy)*src1[x] + dx*(8-dy)*src2[x]+
+                         (8-dx)*   dy *src3[x] + dx*   dy *src4[x]+32)>>6;
+            }
+            src1+=stride1;
+            src2+=stride2;
+            src3+=stride3;
+            src4+=stride4;
+            dst +=stride;
+        }
+    }else{
+        const uint8_t *src1= hpel[l];
+        const uint8_t *src2= hpel[r];
+        int stride1 = MC_STRIDE(l);
+        int stride2 = MC_STRIDE(r);
+        int a= weight[((dx&7) + (8*(dy&7)))];
+        int b= 8-a;
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                dst[x]= (a*src1[x] + b*src2[x] + 4)>>3;
+            }
+            src1+=stride1;
+            src2+=stride2;
+            dst +=stride;
+        }
+    }
+}
+
+void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, ptrdiff_t stride, int sx, int sy, int b_w, int b_h, const BlockNode *block, int plane_index, int w, int h){
+    if(block->type & BLOCK_INTRA){
+        int x, y;
+        const unsigned color  = block->color[plane_index];
+        const unsigned color4 = color*0x01010101;
+        if(b_w==32){
+            for(y=0; y < b_h; y++){
+                *(uint32_t*)&dst[0 + y*stride]= color4;
+                *(uint32_t*)&dst[4 + y*stride]= color4;
+                *(uint32_t*)&dst[8 + y*stride]= color4;
+                *(uint32_t*)&dst[12+ y*stride]= color4;
+                *(uint32_t*)&dst[16+ y*stride]= color4;
+                *(uint32_t*)&dst[20+ y*stride]= color4;
+                *(uint32_t*)&dst[24+ y*stride]= color4;
+                *(uint32_t*)&dst[28+ y*stride]= color4;
+            }
+        }else if(b_w==16){
+            for(y=0; y < b_h; y++){
+                *(uint32_t*)&dst[0 + y*stride]= color4;
+                *(uint32_t*)&dst[4 + y*stride]= color4;
+                *(uint32_t*)&dst[8 + y*stride]= color4;
+                *(uint32_t*)&dst[12+ y*stride]= color4;
+            }
+        }else if(b_w==8){
+            for(y=0; y < b_h; y++){
+                *(uint32_t*)&dst[0 + y*stride]= color4;
+                *(uint32_t*)&dst[4 + y*stride]= color4;
+            }
+        }else if(b_w==4){
+            for(y=0; y < b_h; y++){
+                *(uint32_t*)&dst[0 + y*stride]= color4;
+            }
+        }else{
+            for(y=0; y < b_h; y++){
+                for(x=0; x < b_w; x++){
+                    dst[x + y*stride]= color;
+                }
+            }
+        }
+    }else{
+        uint8_t *src= s->last_picture[block->ref]->data[plane_index];
+        const int scale= plane_index ?  (2*s->mv_scale)>>s->chroma_h_shift : 2*s->mv_scale;
+        int mx= block->mx*scale;
+        int my= block->my*scale;
+        const int dx= mx&15;
+        const int dy= my&15;
+        const int tab_index= 3 - (b_w>>2) + (b_w>>4);
+        sx += (mx>>4) - (HTAPS_MAX/2-1);
+        sy += (my>>4) - (HTAPS_MAX/2-1);
+        src += sx + sy*stride;
+        if(   (unsigned)sx >= FFMAX(w - b_w - (HTAPS_MAX-2), 0)
+           || (unsigned)sy >= FFMAX(h - b_h - (HTAPS_MAX-2), 0)){
+            s->vdsp.emulated_edge_mc(tmp + MB_SIZE, src,
+                                     stride, stride,
+                                     b_w+HTAPS_MAX-1, b_h+HTAPS_MAX-1,
+                                     sx, sy, w, h);
+            src= tmp + MB_SIZE;
+        }
+
+        av_assert2(s->chroma_h_shift == s->chroma_v_shift); // only one mv_scale
+
+        av_assert2((tab_index>=0 && tab_index<4) || b_w==32);
+        if(    (dx&3) || (dy&3)
+            || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h)
+            || (b_w&(b_w-1))
+            || b_w == 1
+            || b_h == 1
+            || !s->plane[plane_index].fast_mc )
+            mc_block(&s->plane[plane_index], dst, src, stride, b_w, b_h, dx, dy);
+        else if(b_w==32){
+            int y;
+            for(y=0; y<b_h; y+=16){
+                s->h264qpel.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 3 + (y+3)*stride,stride);
+                s->h264qpel.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 19 + (y+3)*stride,stride);
+            }
+        }else if(b_w==b_h)
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 3 + 3*stride,stride);
+        else if(b_w==2*b_h){
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 3       + 3*stride,stride);
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 3 + b_h + 3*stride,stride);
+        }else{
+            av_assert2(2*b_w==b_h);
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 3 + 3*stride           ,stride);
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 3 + 3*stride+b_w*stride,stride);
+        }
+    }
+}
+
+#define mca(dx,dy,b_w)\
+static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h){\
+    av_assert2(h==b_w);\
+    mc_block(NULL, dst, src-(HTAPS_MAX/2-1)-(HTAPS_MAX/2-1)*stride, stride, b_w, b_w, dx, dy);\
+}
+
+mca( 0, 0,16)
+mca( 8, 0,16)
+mca( 0, 8,16)
+mca( 8, 8,16)
+mca( 0, 0,8)
+mca( 8, 0,8)
+mca( 0, 8,8)
+mca( 8, 8,8)
+
+av_cold int ff_snow_common_init(AVCodecContext *avctx){
+    SnowContext *s = avctx->priv_data;
+    int width, height;
+    int i, j;
+
+    s->avctx= avctx;
+    s->max_ref_frames=1; //just make sure it's not an invalid value in case of no initial keyframe
+    s->spatial_decomposition_count = 1;
+
+    ff_me_cmp_init(&s->mecc, avctx);
+    ff_hpeldsp_init(&s->hdsp, avctx->flags);
+    ff_videodsp_init(&s->vdsp, 8);
+    ff_dwt_init(&s->dwt);
+    ff_h264qpel_init(&s->h264qpel, 8);
+
+#define mcf(dx,dy)\
+    s->qdsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
+    s->qdsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
+        s->h264qpel.put_h264_qpel_pixels_tab[0][dy+dx/4];\
+    s->qdsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
+    s->qdsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
+        s->h264qpel.put_h264_qpel_pixels_tab[1][dy+dx/4];
+
+    mcf( 0, 0)
+    mcf( 4, 0)
+    mcf( 8, 0)
+    mcf(12, 0)
+    mcf( 0, 4)
+    mcf( 4, 4)
+    mcf( 8, 4)
+    mcf(12, 4)
+    mcf( 0, 8)
+    mcf( 4, 8)
+    mcf( 8, 8)
+    mcf(12, 8)
+    mcf( 0,12)
+    mcf( 4,12)
+    mcf( 8,12)
+    mcf(12,12)
+
+#define mcfh(dx,dy)\
+    s->hdsp.put_pixels_tab       [0][dy/4+dx/8]=\
+    s->hdsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
+        mc_block_hpel ## dx ## dy ## 16;\
+    s->hdsp.put_pixels_tab       [1][dy/4+dx/8]=\
+    s->hdsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
+        mc_block_hpel ## dx ## dy ## 8;
+
+    mcfh(0, 0)
+    mcfh(8, 0)
+    mcfh(0, 8)
+    mcfh(8, 8)
+
+    init_qexp();
+
+//    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
+
+    width= s->avctx->width;
+    height= s->avctx->height;
+
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->spatial_idwt_buffer, width, height * sizeof(IDWTELEM), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->spatial_dwt_buffer,  width, height * sizeof(DWTELEM),  fail); //FIXME this does not belong here
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->temp_dwt_buffer,     width, sizeof(DWTELEM),  fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->temp_idwt_buffer,    width, sizeof(IDWTELEM), fail);
+    FF_ALLOC_ARRAY_OR_GOTO(avctx,  s->run_buffer,          ((width + 1) >> 1), ((height + 1) >> 1) * sizeof(*s->run_buffer), fail);
+
+    for(i=0; i<MAX_REF_FRAMES; i++) {
+        for(j=0; j<MAX_REF_FRAMES; j++)
+            ff_scale_mv_ref[i][j] = 256*(i+1)/(j+1);
+        s->last_picture[i] = av_frame_alloc();
+        if (!s->last_picture[i])
+            goto fail;
+    }
+
+    s->mconly_picture = av_frame_alloc();
+    s->current_picture = av_frame_alloc();
+    if (!s->mconly_picture || !s->current_picture)
+        goto fail;
+
+    return 0;
+fail:
+    return AVERROR(ENOMEM);
+}
+
+int ff_snow_common_init_after_header(AVCodecContext *avctx) {
+    SnowContext *s = avctx->priv_data;
+    int plane_index, level, orientation;
+    int ret, emu_buf_size;
+
+    if(!s->scratchbuf) {
+        if ((ret = ff_get_buffer(s->avctx, s->mconly_picture,
+                                 AV_GET_BUFFER_FLAG_REF)) < 0)
+            return ret;
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->scratchbuf, FFMAX(s->mconly_picture->linesize[0], 2*avctx->width+256), 7*MB_SIZE, fail);
+        emu_buf_size = FFMAX(s->mconly_picture->linesize[0], 2*avctx->width+256) * (2 * MB_SIZE + HTAPS_MAX - 1);
+        FF_ALLOC_OR_GOTO(avctx, s->emu_edge_buffer, emu_buf_size, fail);
+    }
+
+    if(s->mconly_picture->format != avctx->pix_fmt) {
+        av_log(avctx, AV_LOG_ERROR, "pixel format changed\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        int w= s->avctx->width;
+        int h= s->avctx->height;
+
+        if(plane_index){
+            w = FF_CEIL_RSHIFT(w, s->chroma_h_shift);
+            h = FF_CEIL_RSHIFT(h, s->chroma_v_shift);
+        }
+        s->plane[plane_index].width = w;
+        s->plane[plane_index].height= h;
+
+        for(level=s->spatial_decomposition_count-1; level>=0; level--){
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                SubBand *b= &s->plane[plane_index].band[level][orientation];
+
+                b->buf= s->spatial_dwt_buffer;
+                b->level= level;
+                b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
+                b->width = (w + !(orientation&1))>>1;
+                b->height= (h + !(orientation>1))>>1;
+
+                b->stride_line = 1 << (s->spatial_decomposition_count - level);
+                b->buf_x_offset = 0;
+                b->buf_y_offset = 0;
+
+                if(orientation&1){
+                    b->buf += (w+1)>>1;
+                    b->buf_x_offset = (w+1)>>1;
+                }
+                if(orientation>1){
+                    b->buf += b->stride>>1;
+                    b->buf_y_offset = b->stride_line >> 1;
+                }
+                b->ibuf= s->spatial_idwt_buffer + (b->buf - s->spatial_dwt_buffer);
+
+                if(level)
+                    b->parent= &s->plane[plane_index].band[level-1][orientation];
+                //FIXME avoid this realloc
+                av_freep(&b->x_coeff);
+                b->x_coeff=av_mallocz_array(((b->width+1) * b->height+1), sizeof(x_and_coeff));
+                if (!b->x_coeff)
+                    goto fail;
+            }
+            w= (w+1)>>1;
+            h= (h+1)>>1;
+        }
+    }
+
+    return 0;
+fail:
+    return AVERROR(ENOMEM);
+}
+
+#define USE_HALFPEL_PLANE 0
+
+static int halfpel_interpol(SnowContext *s, uint8_t *halfpel[4][4], AVFrame *frame){
+    int p,x,y;
+
+    for(p=0; p < s->nb_planes; p++){
+        int is_chroma= !!p;
+        int w= is_chroma ? FF_CEIL_RSHIFT(s->avctx->width,  s->chroma_h_shift) : s->avctx->width;
+        int h= is_chroma ? FF_CEIL_RSHIFT(s->avctx->height, s->chroma_v_shift) : s->avctx->height;
+        int ls= frame->linesize[p];
+        uint8_t *src= frame->data[p];
+
+        halfpel[1][p] = av_malloc_array(ls, (h + 2 * EDGE_WIDTH));
+        halfpel[2][p] = av_malloc_array(ls, (h + 2 * EDGE_WIDTH));
+        halfpel[3][p] = av_malloc_array(ls, (h + 2 * EDGE_WIDTH));
+        if (!halfpel[1][p] || !halfpel[2][p] || !halfpel[3][p]) {
+            av_freep(&halfpel[1][p]);
+            av_freep(&halfpel[2][p]);
+            av_freep(&halfpel[3][p]);
+            return AVERROR(ENOMEM);
+        }
+        halfpel[1][p] += EDGE_WIDTH * (1 + ls);
+        halfpel[2][p] += EDGE_WIDTH * (1 + ls);
+        halfpel[3][p] += EDGE_WIDTH * (1 + ls);
+
+        halfpel[0][p]= src;
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= y*ls + x;
+
+                halfpel[1][p][i]= (20*(src[i] + src[i+1]) - 5*(src[i-1] + src[i+2]) + (src[i-2] + src[i+3]) + 16 )>>5;
+            }
+        }
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= y*ls + x;
+
+                halfpel[2][p][i]= (20*(src[i] + src[i+ls]) - 5*(src[i-ls] + src[i+2*ls]) + (src[i-2*ls] + src[i+3*ls]) + 16 )>>5;
+            }
+        }
+        src= halfpel[1][p];
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= y*ls + x;
+
+                halfpel[3][p][i]= (20*(src[i] + src[i+ls]) - 5*(src[i-ls] + src[i+2*ls]) + (src[i-2*ls] + src[i+3*ls]) + 16 )>>5;
+            }
+        }
+
+//FIXME border!
+    }
+    return 0;
+}
+
+void ff_snow_release_buffer(AVCodecContext *avctx)
+{
+    SnowContext *s = avctx->priv_data;
+    int i;
+
+    if(s->last_picture[s->max_ref_frames-1]->data[0]){
+        av_frame_unref(s->last_picture[s->max_ref_frames-1]);
+        for(i=0; i<9; i++)
+            if(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3]) {
+                av_free(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3] - EDGE_WIDTH*(1+s->current_picture->linesize[i%3]));
+                s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3] = NULL;
+            }
+    }
+}
+
+int ff_snow_frame_start(SnowContext *s){
+   AVFrame *tmp;
+   int i, ret;
+
+    ff_snow_release_buffer(s->avctx);
+
+    tmp= s->last_picture[s->max_ref_frames-1];
+    for(i=s->max_ref_frames-1; i>0; i--)
+        s->last_picture[i] = s->last_picture[i-1];
+    memmove(s->halfpel_plane+1, s->halfpel_plane, (s->max_ref_frames-1)*sizeof(void*)*4*4);
+    if(USE_HALFPEL_PLANE && s->current_picture->data[0]) {
+        if((ret = halfpel_interpol(s, s->halfpel_plane[0], s->current_picture)) < 0)
+            return ret;
+    }
+    s->last_picture[0] = s->current_picture;
+    s->current_picture = tmp;
+
+    if(s->keyframe){
+        s->ref_frames= 0;
+    }else{
+        int i;
+        for(i=0; i<s->max_ref_frames && s->last_picture[i]->data[0]; i++)
+            if(i && s->last_picture[i-1]->key_frame)
+                break;
+        s->ref_frames= i;
+        if(s->ref_frames==0){
+            av_log(s->avctx,AV_LOG_ERROR, "No reference frames\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+    if ((ret = ff_snow_get_buffer(s, s->current_picture)) < 0)
+        return ret;
+
+    s->current_picture->key_frame= s->keyframe;
+
+    return 0;
+}
+
+av_cold void ff_snow_common_end(SnowContext *s)
+{
+    int plane_index, level, orientation, i;
+
+    av_freep(&s->spatial_dwt_buffer);
+    av_freep(&s->temp_dwt_buffer);
+    av_freep(&s->spatial_idwt_buffer);
+    av_freep(&s->temp_idwt_buffer);
+    av_freep(&s->run_buffer);
+
+    s->m.me.temp= NULL;
+    av_freep(&s->m.me.scratchpad);
+    av_freep(&s->m.me.map);
+    av_freep(&s->m.me.score_map);
+    av_freep(&s->m.sc.obmc_scratchpad);
+
+    av_freep(&s->block);
+    av_freep(&s->scratchbuf);
+    av_freep(&s->emu_edge_buffer);
+
+    for(i=0; i<MAX_REF_FRAMES; i++){
+        av_freep(&s->ref_mvs[i]);
+        av_freep(&s->ref_scores[i]);
+        if(s->last_picture[i] && s->last_picture[i]->data[0]) {
+            av_assert0(s->last_picture[i]->data[0] != s->current_picture->data[0]);
+        }
+        av_frame_free(&s->last_picture[i]);
+    }
+
+    for(plane_index=0; plane_index < MAX_PLANES; plane_index++){
+        for(level=MAX_DECOMPOSITIONS-1; level>=0; level--){
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                SubBand *b= &s->plane[plane_index].band[level][orientation];
+
+                av_freep(&b->x_coeff);
+            }
+        }
+    }
+    av_frame_free(&s->mconly_picture);
+    av_frame_free(&s->current_picture);
+}
diff --git a/libavcodec/snow.h b/libavcodec/snow.h
new file mode 100644
index 0000000000..f2587fd6ef
--- /dev/null
+++ b/libavcodec/snow.h
@@ -0,0 +1,729 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SNOW_H
+#define AVCODEC_SNOW_H
+
+#include "libavutil/motion_vector.h"
+
+#include "hpeldsp.h"
+#include "me_cmp.h"
+#include "qpeldsp.h"
+#include "snow_dwt.h"
+
+#include "rangecoder.h"
+#include "mathops.h"
+
+#define FF_MPV_OFFSET(x) (offsetof(MpegEncContext, x) + offsetof(SnowContext, m))
+#include "mpegvideo.h"
+#include "h264qpel.h"
+
+#define MID_STATE 128
+
+#define MAX_PLANES 4
+#define QSHIFT 5
+#define QROOT (1<<QSHIFT)
+#define LOSSLESS_QLOG -128
+#define FRAC_BITS 4
+#define MAX_REF_FRAMES 8
+
+#define LOG2_OBMC_MAX 8
+#define OBMC_MAX (1<<(LOG2_OBMC_MAX))
+typedef struct BlockNode{
+    int16_t mx;
+    int16_t my;
+    uint8_t ref;
+    uint8_t color[3];
+    uint8_t type;
+//#define TYPE_SPLIT    1
+#define BLOCK_INTRA   1
+#define BLOCK_OPT     2
+//#define TYPE_NOCOLOR  4
+    uint8_t level; //FIXME merge into type?
+}BlockNode;
+
+static const BlockNode null_block= { //FIXME add border maybe
+    .color= {128,128,128},
+    .mx= 0,
+    .my= 0,
+    .ref= 0,
+    .type= 0,
+    .level= 0,
+};
+
+#define LOG2_MB_SIZE 4
+#define MB_SIZE (1<<LOG2_MB_SIZE)
+#define ENCODER_EXTRA_BITS 4
+#define HTAPS_MAX 8
+
+typedef struct x_and_coeff{
+    int16_t x;
+    uint16_t coeff;
+} x_and_coeff;
+
+typedef struct SubBand{
+    int level;
+    int stride;
+    int width;
+    int height;
+    int qlog;        ///< log(qscale)/log[2^(1/6)]
+    DWTELEM *buf;
+    IDWTELEM *ibuf;
+    int buf_x_offset;
+    int buf_y_offset;
+    int stride_line; ///< Stride measured in lines, not pixels.
+    x_and_coeff * x_coeff;
+    struct SubBand *parent;
+    uint8_t state[/*7*2*/ 7 + 512][32];
+}SubBand;
+
+typedef struct Plane{
+    int width;
+    int height;
+    SubBand band[MAX_DECOMPOSITIONS][4];
+
+    int htaps;
+    int8_t hcoeff[HTAPS_MAX/2];
+    int diag_mc;
+    int fast_mc;
+
+    int last_htaps;
+    int8_t last_hcoeff[HTAPS_MAX/2];
+    int last_diag_mc;
+}Plane;
+
+typedef struct SnowContext{
+    AVClass *class;
+    AVCodecContext *avctx;
+    RangeCoder c;
+    MECmpContext mecc;
+    HpelDSPContext hdsp;
+    QpelDSPContext qdsp;
+    VideoDSPContext vdsp;
+    H264QpelContext h264qpel;
+    MpegvideoEncDSPContext mpvencdsp;
+    SnowDWTContext dwt;
+    AVFrame *input_picture;              ///< new_picture with the internal linesizes
+    AVFrame *current_picture;
+    AVFrame *last_picture[MAX_REF_FRAMES];
+    uint8_t *halfpel_plane[MAX_REF_FRAMES][4][4];
+    AVFrame *mconly_picture;
+//     uint8_t q_context[16];
+    uint8_t header_state[32];
+    uint8_t block_state[128 + 32*128];
+    int keyframe;
+    int always_reset;
+    int version;
+    int spatial_decomposition_type;
+    int last_spatial_decomposition_type;
+    int temporal_decomposition_type;
+    int spatial_decomposition_count;
+    int last_spatial_decomposition_count;
+    int temporal_decomposition_count;
+    int max_ref_frames;
+    int ref_frames;
+    int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
+    uint32_t *ref_scores[MAX_REF_FRAMES];
+    DWTELEM *spatial_dwt_buffer;
+    DWTELEM *temp_dwt_buffer;
+    IDWTELEM *spatial_idwt_buffer;
+    IDWTELEM *temp_idwt_buffer;
+    int *run_buffer;
+    int colorspace_type;
+    int chroma_h_shift;
+    int chroma_v_shift;
+    int spatial_scalability;
+    int qlog;
+    int last_qlog;
+    int lambda;
+    int lambda2;
+    int pass1_rc;
+    int mv_scale;
+    int last_mv_scale;
+    int qbias;
+    int last_qbias;
+#define QBIAS_SHIFT 3
+    int b_width;
+    int b_height;
+    int block_max_depth;
+    int last_block_max_depth;
+    int nb_planes;
+    Plane plane[MAX_PLANES];
+    BlockNode *block;
+#define ME_CACHE_SIZE 1024
+    unsigned me_cache[ME_CACHE_SIZE];
+    unsigned me_cache_generation;
+    slice_buffer sb;
+    int memc_only;
+    int no_bitstream;
+    int intra_penalty;
+    int motion_est;
+    int iterative_dia_size;
+
+    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to eventually make the motion estimation independent of MpegEncContext, so this will be removed then (FIXME/XXX)
+
+    uint8_t *scratchbuf;
+    uint8_t *emu_edge_buffer;
+
+    AVMotionVector *avmv;
+    int avmv_index;
+    uint64_t encoding_error[AV_NUM_DATA_POINTERS];
+}SnowContext;
+
+/* Tables */
+extern const uint8_t * const ff_obmc_tab[4];
+extern uint8_t ff_qexp[QROOT];
+extern int ff_scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
+
+/* C bits used by mmx/sse2/altivec */
+
+static av_always_inline void snow_interleave_line_header(int * i, int width, IDWTELEM * low, IDWTELEM * high){
+    (*i) = (width) - 2;
+
+    if (width & 1){
+        low[(*i)+1] = low[((*i)+1)>>1];
+        (*i)--;
+    }
+}
+
+static av_always_inline void snow_interleave_line_footer(int * i, IDWTELEM * low, IDWTELEM * high){
+    for (; (*i)>=0; (*i)-=2){
+        low[(*i)+1] = high[(*i)>>1];
+        low[*i] = low[(*i)>>1];
+    }
+}
+
+static av_always_inline void snow_horizontal_compose_lift_lead_out(int i, IDWTELEM * dst, IDWTELEM * src, IDWTELEM * ref, int width, int w, int lift_high, int mul, int add, int shift){
+    for(; i<w; i++){
+        dst[i] = src[i] - ((mul * (ref[i] + ref[i + 1]) + add) >> shift);
+    }
+
+    if((width^lift_high)&1){
+        dst[w] = src[w] - ((mul * 2 * ref[w] + add) >> shift);
+    }
+}
+
+static av_always_inline void snow_horizontal_compose_liftS_lead_out(int i, IDWTELEM * dst, IDWTELEM * src, IDWTELEM * ref, int width, int w){
+        for(; i<w; i++){
+            dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO + 4 * src[i]) >> W_BS);
+        }
+
+        if(width&1){
+            dst[w] = src[w] + ((2 * ref[w] + W_BO + 4 * src[w]) >> W_BS);
+        }
+}
+
+/* common code */
+
+int ff_snow_common_init(AVCodecContext *avctx);
+int ff_snow_common_init_after_header(AVCodecContext *avctx);
+void ff_snow_common_end(SnowContext *s);
+void ff_snow_release_buffer(AVCodecContext *avctx);
+void ff_snow_reset_contexts(SnowContext *s);
+int ff_snow_alloc_blocks(SnowContext *s);
+int ff_snow_frame_start(SnowContext *s);
+void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, ptrdiff_t stride,
+                     int sx, int sy, int b_w, int b_h, const BlockNode *block,
+                     int plane_index, int w, int h);
+int ff_snow_get_buffer(SnowContext *s, AVFrame *frame);
+/* common inline functions */
+//XXX doublecheck all of them should stay inlined
+
+static inline void snow_set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
+    const int w= s->b_width << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    const int block_w= 1<<rem_depth;
+    BlockNode block;
+    int i,j;
+
+    block.color[0]= l;
+    block.color[1]= cb;
+    block.color[2]= cr;
+    block.mx= mx;
+    block.my= my;
+    block.ref= ref;
+    block.type= type;
+    block.level= level;
+
+    for(j=0; j<block_w; j++){
+        for(i=0; i<block_w; i++){
+            s->block[index + i + j*w]= block;
+        }
+    }
+}
+
+static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
+                           const BlockNode *left, const BlockNode *top, const BlockNode *tr){
+    if(s->ref_frames == 1){
+        *mx = mid_pred(left->mx, top->mx, tr->mx);
+        *my = mid_pred(left->my, top->my, tr->my);
+    }else{
+        const int *scale = ff_scale_mv_ref[ref];
+        *mx = mid_pred((left->mx * scale[left->ref] + 128) >>8,
+                       (top ->mx * scale[top ->ref] + 128) >>8,
+                       (tr  ->mx * scale[tr  ->ref] + 128) >>8);
+        *my = mid_pred((left->my * scale[left->ref] + 128) >>8,
+                       (top ->my * scale[top ->ref] + 128) >>8,
+                       (tr  ->my * scale[tr  ->ref] + 128) >>8);
+    }
+}
+
+static av_always_inline int same_block(BlockNode *a, BlockNode *b){
+    if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
+        return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
+    }else{
+        return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
+    }
+}
+
+//FIXME name cleanup (b_w, block_w, b_width stuff)
+//XXX should we really inline it?
+static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, IDWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
+    const int b_width = s->b_width  << s->block_max_depth;
+    const int b_height= s->b_height << s->block_max_depth;
+    const int b_stride= b_width;
+    BlockNode *lt= &s->block[b_x + b_y*b_stride];
+    BlockNode *rt= lt+1;
+    BlockNode *lb= lt+b_stride;
+    BlockNode *rb= lb+1;
+    uint8_t *block[4];
+    // When src_stride is large enough, it is possible to interleave the blocks.
+    // Otherwise the blocks are written sequentially in the tmp buffer.
+    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
+    uint8_t *tmp = s->scratchbuf;
+    uint8_t *ptmp;
+    int x,y;
+
+    if(b_x<0){
+        lt= rt;
+        lb= rb;
+    }else if(b_x + 1 >= b_width){
+        rt= lt;
+        rb= lb;
+    }
+    if(b_y<0){
+        lt= lb;
+        rt= rb;
+    }else if(b_y + 1 >= b_height){
+        lb= lt;
+        rb= rt;
+    }
+
+    if(src_x<0){ //FIXME merge with prev & always round internal width up to *16
+        obmc -= src_x;
+        b_w += src_x;
+        if(!sliced && !offset_dst)
+            dst -= src_x;
+        src_x=0;
+    }
+    if(src_x + b_w > w){
+        b_w = w - src_x;
+    }
+    if(src_y<0){
+        obmc -= src_y*obmc_stride;
+        b_h += src_y;
+        if(!sliced && !offset_dst)
+            dst -= src_y*dst_stride;
+        src_y=0;
+    }
+    if(src_y + b_h> h){
+        b_h = h - src_y;
+    }
+
+    if(b_w<=0 || b_h<=0) return;
+
+    if(!sliced && offset_dst)
+        dst += src_x + src_y*dst_stride;
+    dst8+= src_x + src_y*src_stride;
+//    src += src_x + src_y*src_stride;
+
+    ptmp= tmp + 3*tmp_step;
+    block[0]= ptmp;
+    ptmp+=tmp_step;
+    ff_snow_pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
+
+    if(same_block(lt, rt)){
+        block[1]= block[0];
+    }else{
+        block[1]= ptmp;
+        ptmp+=tmp_step;
+        ff_snow_pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
+    }
+
+    if(same_block(lt, lb)){
+        block[2]= block[0];
+    }else if(same_block(rt, lb)){
+        block[2]= block[1];
+    }else{
+        block[2]= ptmp;
+        ptmp+=tmp_step;
+        ff_snow_pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
+    }
+
+    if(same_block(lt, rb) ){
+        block[3]= block[0];
+    }else if(same_block(rt, rb)){
+        block[3]= block[1];
+    }else if(same_block(lb, rb)){
+        block[3]= block[2];
+    }else{
+        block[3]= ptmp;
+        ff_snow_pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
+    }
+    if(sliced){
+        s->dwt.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    }else{
+        for(y=0; y<b_h; y++){
+            //FIXME ugly misuse of obmc_stride
+            const uint8_t *obmc1= obmc + y*obmc_stride;
+            const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+            const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+            const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+            for(x=0; x<b_w; x++){
+                int v=   obmc1[x] * block[3][x + y*src_stride]
+                        +obmc2[x] * block[2][x + y*src_stride]
+                        +obmc3[x] * block[1][x + y*src_stride]
+                        +obmc4[x] * block[0][x + y*src_stride];
+
+                v <<= 8 - LOG2_OBMC_MAX;
+                if(FRAC_BITS != 8){
+                    v >>= 8 - FRAC_BITS;
+                }
+                if(add){
+                    v += dst[x + y*dst_stride];
+                    v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
+                    if(v&(~255)) v= ~(v>>31);
+                    dst8[x + y*src_stride] = v;
+                }else{
+                    dst[x + y*dst_stride] -= v;
+                }
+            }
+        }
+    }
+}
+
+static av_always_inline void predict_slice(SnowContext *s, IDWTELEM *buf, int plane_index, int add, int mb_y){
+    Plane *p= &s->plane[plane_index];
+    const int mb_w= s->b_width  << s->block_max_depth;
+    const int mb_h= s->b_height << s->block_max_depth;
+    int x, y, mb_x;
+    int block_size = MB_SIZE >> s->block_max_depth;
+    int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *dst8= s->current_picture->data[plane_index];
+    int w= p->width;
+    int h= p->height;
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); // obmc params assume squares
+    if(s->keyframe || (s->avctx->debug&512)){
+        if(mb_y==mb_h)
+            return;
+
+        if(add){
+            for(y=block_h*mb_y; y<FFMIN(h,block_h*(mb_y+1)); y++){
+                for(x=0; x<w; x++){
+                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
+                    v >>= FRAC_BITS;
+                    if(v&(~255)) v= ~(v>>31);
+                    dst8[x + y*ref_stride]= v;
+                }
+            }
+        }else{
+            for(y=block_h*mb_y; y<FFMIN(h,block_h*(mb_y+1)); y++){
+                for(x=0; x<w; x++){
+                    buf[x + y*w]-= 128<<FRAC_BITS;
+                }
+            }
+        }
+
+        return;
+    }
+
+    for(mb_x=0; mb_x<=mb_w; mb_x++){
+        add_yblock(s, 0, NULL, buf, dst8, obmc,
+                   block_w*mb_x - block_w/2,
+                   block_h*mb_y - block_h/2,
+                   block_w, block_h,
+                   w, h,
+                   w, ref_stride, obmc_stride,
+                   mb_x - 1, mb_y - 1,
+                   add, 1, plane_index);
+    }
+}
+
+static av_always_inline void predict_plane(SnowContext *s, IDWTELEM *buf, int plane_index, int add){
+    const int mb_h= s->b_height << s->block_max_depth;
+    int mb_y;
+    for(mb_y=0; mb_y<=mb_h; mb_y++)
+        predict_slice(s, buf, plane_index, add, mb_y);
+}
+
+static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
+    const int w= s->b_width << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    const int block_w= 1<<rem_depth;
+    const int block_h= 1<<rem_depth; //FIXME "w!=h"
+    BlockNode block;
+    int i,j;
+
+    block.color[0]= l;
+    block.color[1]= cb;
+    block.color[2]= cr;
+    block.mx= mx;
+    block.my= my;
+    block.ref= ref;
+    block.type= type;
+    block.level= level;
+
+    for(j=0; j<block_h; j++){
+        for(i=0; i<block_w; i++){
+            s->block[index + i + j*w]= block;
+        }
+    }
+}
+
+static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
+    SnowContext *s = c->avctx->priv_data;
+    const int offset[3]= {
+          y*c->  stride + x,
+        ((y*c->uvstride + x)>>s->chroma_h_shift),
+        ((y*c->uvstride + x)>>s->chroma_h_shift),
+    };
+    int i;
+    for(i=0; i<3; i++){
+        c->src[0][i]= src [i];
+        c->ref[0][i]= ref [i] + offset[i];
+    }
+    av_assert2(!ref_index);
+}
+
+
+/* bitstream functions */
+
+extern const int8_t ff_quant3bA[256];
+
+#define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
+
+static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
+    int i;
+
+    if(v){
+        const int a= FFABS(v);
+        const int e= av_log2(a);
+        const int el= FFMIN(e, 10);
+        put_rac(c, state+0, 0);
+
+        for(i=0; i<el; i++){
+            put_rac(c, state+1+i, 1);  //1..10
+        }
+        for(; i<e; i++){
+            put_rac(c, state+1+9, 1);  //1..10
+        }
+        put_rac(c, state+1+FFMIN(i,9), 0);
+
+        for(i=e-1; i>=el; i--){
+            put_rac(c, state+22+9, (a>>i)&1); //22..31
+        }
+        for(; i>=0; i--){
+            put_rac(c, state+22+i, (a>>i)&1); //22..31
+        }
+
+        if(is_signed)
+            put_rac(c, state+11 + el, v < 0); //11..21
+    }else{
+        put_rac(c, state+0, 1);
+    }
+}
+
+static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
+    if(get_rac(c, state+0))
+        return 0;
+    else{
+        int i, e, a;
+        e= 0;
+        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
+            e++;
+            if (e > 31)
+                return AVERROR_INVALIDDATA;
+        }
+
+        a= 1;
+        for(i=e-1; i>=0; i--){
+            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
+        }
+
+        e= -(is_signed && get_rac(c, state+11 + FFMIN(e,10))); //11..21
+        return (a^e)-e;
+    }
+}
+
+static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
+    int i;
+    int r= log2>=0 ? 1<<log2 : 1;
+
+    av_assert2(v>=0);
+    av_assert2(log2>=-4);
+
+    while(v >= r){
+        put_rac(c, state+4+log2, 1);
+        v -= r;
+        log2++;
+        if(log2>0) r+=r;
+    }
+    put_rac(c, state+4+log2, 0);
+
+    for(i=log2-1; i>=0; i--){
+        put_rac(c, state+31-i, (v>>i)&1);
+    }
+}
+
+static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
+    int i;
+    int r= log2>=0 ? 1<<log2 : 1;
+    int v=0;
+
+    av_assert2(log2>=-4);
+
+    while(log2<28 && get_rac(c, state+4+log2)){
+        v+= r;
+        log2++;
+        if(log2>0) r+=r;
+    }
+
+    for(i=log2-1; i>=0; i--){
+        v+= get_rac(c, state+31-i)<<i;
+    }
+
+    return v;
+}
+
+static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
+    const int w= b->width;
+    const int h= b->height;
+    int x,y;
+
+    int run, runs;
+    x_and_coeff *xc= b->x_coeff;
+    x_and_coeff *prev_xc= NULL;
+    x_and_coeff *prev2_xc= xc;
+    x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
+    x_and_coeff *prev_parent_xc= parent_xc;
+
+    runs= get_symbol2(&s->c, b->state[30], 0);
+    if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
+    else           run= INT_MAX;
+
+    for(y=0; y<h; y++){
+        int v=0;
+        int lt=0, t=0, rt=0;
+
+        if(y && prev_xc->x == 0){
+            rt= prev_xc->coeff;
+        }
+        for(x=0; x<w; x++){
+            int p=0;
+            const int l= v;
+
+            lt= t; t= rt;
+
+            if(y){
+                if(prev_xc->x <= x)
+                    prev_xc++;
+                if(prev_xc->x == x + 1)
+                    rt= prev_xc->coeff;
+                else
+                    rt=0;
+            }
+            if(parent_xc){
+                if(x>>1 > parent_xc->x){
+                    parent_xc++;
+                }
+                if(x>>1 == parent_xc->x){
+                    p= parent_xc->coeff;
+                }
+            }
+            if(/*ll|*/l|lt|t|rt|p){
+                int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
+
+                v=get_rac(&s->c, &b->state[0][context]);
+                if(v){
+                    v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
+                    v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + ff_quant3bA[l&0xFF] + 3*ff_quant3bA[t&0xFF]]);
+                    if ((uint16_t)v != v) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Coefficient damaged\n");
+                        v = 1;
+                    }
+                    xc->x=x;
+                    (xc++)->coeff= v;
+                }
+            }else{
+                if(!run){
+                    if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
+                    else           run= INT_MAX;
+                    v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
+                    v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
+                    if ((uint16_t)v != v) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Coefficient damaged\n");
+                        v = 1;
+                    }
+
+                    xc->x=x;
+                    (xc++)->coeff= v;
+                }else{
+                    int max_run;
+                    run--;
+                    v=0;
+                    av_assert2(run >= 0);
+                    if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
+                    else  max_run= FFMIN(run, w-x-1);
+                    if(parent_xc)
+                        max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
+                    av_assert2(max_run >= 0 && max_run <= run);
+
+                    x+= max_run;
+                    run-= max_run;
+                }
+            }
+        }
+        (xc++)->x= w+1; //end marker
+        prev_xc= prev2_xc;
+        prev2_xc= xc;
+
+        if(parent_xc){
+            if(y&1){
+                while(parent_xc->x != parent->width+1)
+                    parent_xc++;
+                parent_xc++;
+                prev_parent_xc= parent_xc;
+            }else{
+                parent_xc= prev_parent_xc;
+            }
+        }
+    }
+
+    (xc++)->x= w+1; //end marker
+}
+
+#endif /* AVCODEC_SNOW_H */
diff --git a/libavcodec/snow_dwt.c b/libavcodec/snow_dwt.c
new file mode 100644
index 0000000000..25681e7edd
--- /dev/null
+++ b/libavcodec/snow_dwt.c
@@ -0,0 +1,860 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2008 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "me_cmp.h"
+#include "snow_dwt.h"
+
+int ff_slice_buffer_init(slice_buffer *buf, int line_count,
+                         int max_allocated_lines, int line_width,
+                         IDWTELEM *base_buffer)
+{
+    int i;
+
+    buf->base_buffer = base_buffer;
+    buf->line_count  = line_count;
+    buf->line_width  = line_width;
+    buf->data_count  = max_allocated_lines;
+    buf->line        = av_mallocz_array(line_count, sizeof(IDWTELEM *));
+    if (!buf->line)
+        return AVERROR(ENOMEM);
+    buf->data_stack  = av_malloc_array(max_allocated_lines, sizeof(IDWTELEM *));
+    if (!buf->data_stack) {
+        av_freep(&buf->line);
+        return AVERROR(ENOMEM);
+    }
+
+    for (i = 0; i < max_allocated_lines; i++) {
+        buf->data_stack[i] = av_malloc_array(line_width, sizeof(IDWTELEM));
+        if (!buf->data_stack[i]) {
+            for (i--; i >=0; i--)
+                av_freep(&buf->data_stack[i]);
+            av_freep(&buf->data_stack);
+            av_freep(&buf->line);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    buf->data_stack_top = max_allocated_lines - 1;
+    return 0;
+}
+
+IDWTELEM *ff_slice_buffer_load_line(slice_buffer *buf, int line)
+{
+    IDWTELEM *buffer;
+
+    av_assert0(buf->data_stack_top >= 0);
+//  av_assert1(!buf->line[line]);
+    if (buf->line[line])
+        return buf->line[line];
+
+    buffer = buf->data_stack[buf->data_stack_top];
+    buf->data_stack_top--;
+    buf->line[line] = buffer;
+
+    return buffer;
+}
+
+void ff_slice_buffer_release(slice_buffer *buf, int line)
+{
+    IDWTELEM *buffer;
+
+    av_assert1(line >= 0 && line < buf->line_count);
+    av_assert1(buf->line[line]);
+
+    buffer = buf->line[line];
+    buf->data_stack_top++;
+    buf->data_stack[buf->data_stack_top] = buffer;
+    buf->line[line]                      = NULL;
+}
+
+void ff_slice_buffer_flush(slice_buffer *buf)
+{
+    int i;
+
+    if (!buf->line)
+        return;
+
+    for (i = 0; i < buf->line_count; i++)
+        if (buf->line[i])
+            ff_slice_buffer_release(buf, i);
+}
+
+void ff_slice_buffer_destroy(slice_buffer *buf)
+{
+    int i;
+    ff_slice_buffer_flush(buf);
+
+    if (buf->data_stack)
+        for (i = buf->data_count - 1; i >= 0; i--)
+            av_freep(&buf->data_stack[i]);
+    av_freep(&buf->data_stack);
+    av_freep(&buf->line);
+}
+
+static av_always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref,
+                                  int dst_step, int src_step, int ref_step,
+                                  int width, int mul, int add, int shift,
+                                  int highpass, int inverse)
+{
+    const int mirror_left  = !highpass;
+    const int mirror_right = (width & 1) ^ highpass;
+    const int w            = (width >> 1) - 1 + (highpass & width);
+    int i;
+
+#define LIFT(src, ref, inv) ((src) + ((inv) ? -(ref) : +(ref)))
+    if (mirror_left) {
+        dst[0] = LIFT(src[0], ((mul * 2 * ref[0] + add) >> shift), inverse);
+        dst   += dst_step;
+        src   += src_step;
+    }
+
+    for (i = 0; i < w; i++)
+        dst[i * dst_step] = LIFT(src[i * src_step],
+                                 ((mul * (ref[i * ref_step] +
+                                          ref[(i + 1) * ref_step]) +
+                                   add) >> shift),
+                                 inverse);
+
+    if (mirror_right)
+        dst[w * dst_step] = LIFT(src[w * src_step],
+                                 ((mul * 2 * ref[w * ref_step] + add) >> shift),
+                                 inverse);
+}
+
+static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref,
+                                   int dst_step, int src_step, int ref_step,
+                                   int width, int mul, int add, int shift,
+                                   int highpass, int inverse)
+{
+    const int mirror_left  = !highpass;
+    const int mirror_right = (width & 1) ^ highpass;
+    const int w            = (width >> 1) - 1 + (highpass & width);
+    int i;
+
+    av_assert1(shift == 4);
+#define LIFTS(src, ref, inv)                                            \
+    ((inv) ? (src) + (((ref) + 4 * (src)) >> shift)                     \
+           : -((-16 * (src) + (ref) + add /                             \
+                4 + 1 + (5 << 25)) / (5 * 4) - (1 << 23)))
+    if (mirror_left) {
+        dst[0] = LIFTS(src[0], mul * 2 * ref[0] + add, inverse);
+        dst   += dst_step;
+        src   += src_step;
+    }
+
+    for (i = 0; i < w; i++)
+        dst[i * dst_step] = LIFTS(src[i * src_step],
+                                  mul * (ref[i * ref_step] +
+                                         ref[(i + 1) * ref_step]) + add,
+                                  inverse);
+
+    if (mirror_right)
+        dst[w * dst_step] = LIFTS(src[w * src_step],
+                                  mul * 2 * ref[w * ref_step] + add,
+                                  inverse);
+}
+
+static void horizontal_decompose53i(DWTELEM *b, DWTELEM *temp, int width)
+{
+    const int width2 = width >> 1;
+    int x;
+    const int w2 = (width + 1) >> 1;
+
+    for (x = 0; x < width2; x++) {
+        temp[x]      = b[2 * x];
+        temp[x + w2] = b[2 * x + 1];
+    }
+    if (width & 1)
+        temp[x] = b[2 * x];
+    lift(b + w2, temp + w2, temp,   1, 1, 1, width, -1, 0, 1, 1, 0);
+    lift(b,      temp,      b + w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
+}
+
+static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (b0[i] + b2[i]) >> 1;
+}
+
+static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (b0[i] + b2[i] + 2) >> 2;
+}
+
+static void spatial_decompose53i(DWTELEM *buffer, DWTELEM *temp,
+                                 int width, int height, int stride)
+{
+    int y;
+    DWTELEM *b0 = buffer + avpriv_mirror(-2 - 1, height - 1) * stride;
+    DWTELEM *b1 = buffer + avpriv_mirror(-2,     height - 1) * stride;
+
+    for (y = -2; y < height; y += 2) {
+        DWTELEM *b2 = buffer + avpriv_mirror(y + 1, height - 1) * stride;
+        DWTELEM *b3 = buffer + avpriv_mirror(y + 2, height - 1) * stride;
+
+        if (y + 1 < (unsigned)height)
+            horizontal_decompose53i(b2, temp, width);
+        if (y + 2 < (unsigned)height)
+            horizontal_decompose53i(b3, temp, width);
+
+        if (y + 1 < (unsigned)height)
+            vertical_decompose53iH0(b1, b2, b3, width);
+        if (y + 0 < (unsigned)height)
+            vertical_decompose53iL0(b0, b1, b2, width);
+
+        b0 = b2;
+        b1 = b3;
+    }
+}
+
+static void horizontal_decompose97i(DWTELEM *b, DWTELEM *temp, int width)
+{
+    const int w2 = (width + 1) >> 1;
+
+    lift(temp + w2, b + 1, b,         1, 2, 2, width, W_AM, W_AO, W_AS, 1, 1);
+    liftS(temp,     b,     temp + w2, 1, 2, 1, width, W_BM, W_BO, W_BS, 0, 0);
+    lift(b + w2, temp + w2, temp,     1, 1, 1, width, W_CM, W_CO, W_CS, 1, 0);
+    lift(b,      temp,      b + w2,   1, 1, 1, width, W_DM, W_DO, W_DS, 0, 0);
+}
+
+static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
+}
+
+static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (W_CM * (b0[i] + b2[i]) + W_CO) >> W_CS;
+}
+
+static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] = (16 * 4 * b1[i] - 4 * (b0[i] + b2[i]) + W_BO * 5 + (5 << 27)) /
+                (5 * 16) - (1 << 23);
+}
+
+static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (W_DM * (b0[i] + b2[i]) + W_DO) >> W_DS;
+}
+
+static void spatial_decompose97i(DWTELEM *buffer, DWTELEM *temp,
+                                 int width, int height, int stride)
+{
+    int y;
+    DWTELEM *b0 = buffer + avpriv_mirror(-4 - 1, height - 1) * stride;
+    DWTELEM *b1 = buffer + avpriv_mirror(-4,     height - 1) * stride;
+    DWTELEM *b2 = buffer + avpriv_mirror(-4 + 1, height - 1) * stride;
+    DWTELEM *b3 = buffer + avpriv_mirror(-4 + 2, height - 1) * stride;
+
+    for (y = -4; y < height; y += 2) {
+        DWTELEM *b4 = buffer + avpriv_mirror(y + 3, height - 1) * stride;
+        DWTELEM *b5 = buffer + avpriv_mirror(y + 4, height - 1) * stride;
+
+        if (y + 3 < (unsigned)height)
+            horizontal_decompose97i(b4, temp, width);
+        if (y + 4 < (unsigned)height)
+            horizontal_decompose97i(b5, temp, width);
+
+        if (y + 3 < (unsigned)height)
+            vertical_decompose97iH0(b3, b4, b5, width);
+        if (y + 2 < (unsigned)height)
+            vertical_decompose97iL0(b2, b3, b4, width);
+        if (y + 1 < (unsigned)height)
+            vertical_decompose97iH1(b1, b2, b3, width);
+        if (y + 0 < (unsigned)height)
+            vertical_decompose97iL1(b0, b1, b2, width);
+
+        b0 = b2;
+        b1 = b3;
+        b2 = b4;
+        b3 = b5;
+    }
+}
+
+void ff_spatial_dwt(DWTELEM *buffer, DWTELEM *temp, int width, int height,
+                    int stride, int type, int decomposition_count)
+{
+    int level;
+
+    for (level = 0; level < decomposition_count; level++) {
+        switch (type) {
+        case DWT_97:
+            spatial_decompose97i(buffer, temp,
+                                 width >> level, height >> level,
+                                 stride << level);
+            break;
+        case DWT_53:
+            spatial_decompose53i(buffer, temp,
+                                 width >> level, height >> level,
+                                 stride << level);
+            break;
+        }
+    }
+}
+
+static void horizontal_compose53i(IDWTELEM *b, IDWTELEM *temp, int width)
+{
+    const int width2 = width >> 1;
+    const int w2     = (width + 1) >> 1;
+    int x;
+
+    for (x = 0; x < width2; x++) {
+        temp[2 * x]     = b[x];
+        temp[2 * x + 1] = b[x + w2];
+    }
+    if (width & 1)
+        temp[2 * x] = b[x];
+
+    b[0] = temp[0] - ((temp[1] + 1) >> 1);
+    for (x = 2; x < width - 1; x += 2) {
+        b[x]     = temp[x]     - ((temp[x - 1] + temp[x + 1] + 2) >> 2);
+        b[x - 1] = temp[x - 1] + ((b[x - 2]    + b[x]        + 1) >> 1);
+    }
+    if (width & 1) {
+        b[x]     = temp[x]     - ((temp[x - 1]     + 1) >> 1);
+        b[x - 1] = temp[x - 1] + ((b[x - 2] + b[x] + 1) >> 1);
+    } else
+        b[x - 1] = temp[x - 1] + b[x - 2];
+}
+
+static void vertical_compose53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (b0[i] + b2[i]) >> 1;
+}
+
+static void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (b0[i] + b2[i] + 2) >> 2;
+}
+
+static void spatial_compose53i_buffered_init(DWTCompose *cs, slice_buffer *sb,
+                                             int height, int stride_line)
+{
+    cs->b0 = slice_buffer_get_line(sb,
+                                   avpriv_mirror(-1 - 1, height - 1) * stride_line);
+    cs->b1 = slice_buffer_get_line(sb, avpriv_mirror(-1, height - 1) * stride_line);
+    cs->y  = -1;
+}
+
+static void spatial_compose53i_init(DWTCompose *cs, IDWTELEM *buffer,
+                                    int height, int stride)
+{
+    cs->b0 = buffer + avpriv_mirror(-1 - 1, height - 1) * stride;
+    cs->b1 = buffer + avpriv_mirror(-1,     height - 1) * stride;
+    cs->y  = -1;
+}
+
+static void spatial_compose53i_dy_buffered(DWTCompose *cs, slice_buffer *sb,
+                                           IDWTELEM *temp,
+                                           int width, int height,
+                                           int stride_line)
+{
+    int y = cs->y;
+
+    IDWTELEM *b0 = cs->b0;
+    IDWTELEM *b1 = cs->b1;
+    IDWTELEM *b2 = slice_buffer_get_line(sb,
+                                         avpriv_mirror(y + 1, height - 1) *
+                                         stride_line);
+    IDWTELEM *b3 = slice_buffer_get_line(sb,
+                                         avpriv_mirror(y + 2, height - 1) *
+                                         stride_line);
+
+    if (y + 1 < (unsigned)height && y < (unsigned)height) {
+        int x;
+
+        for (x = 0; x < width; x++) {
+            b2[x] -= (b1[x] + b3[x] + 2) >> 2;
+            b1[x] += (b0[x] + b2[x])     >> 1;
+        }
+    } else {
+        if (y + 1 < (unsigned)height)
+            vertical_compose53iL0(b1, b2, b3, width);
+        if (y + 0 < (unsigned)height)
+            vertical_compose53iH0(b0, b1, b2, width);
+    }
+
+    if (y - 1 < (unsigned)height)
+        horizontal_compose53i(b0, temp, width);
+    if (y + 0 < (unsigned)height)
+        horizontal_compose53i(b1, temp, width);
+
+    cs->b0  = b2;
+    cs->b1  = b3;
+    cs->y  += 2;
+}
+
+static void spatial_compose53i_dy(DWTCompose *cs, IDWTELEM *buffer,
+                                  IDWTELEM *temp, int width, int height,
+                                  int stride)
+{
+    int y        = cs->y;
+    IDWTELEM *b0 = cs->b0;
+    IDWTELEM *b1 = cs->b1;
+    IDWTELEM *b2 = buffer + avpriv_mirror(y + 1, height - 1) * stride;
+    IDWTELEM *b3 = buffer + avpriv_mirror(y + 2, height - 1) * stride;
+
+    if (y + 1 < (unsigned)height)
+        vertical_compose53iL0(b1, b2, b3, width);
+    if (y + 0 < (unsigned)height)
+        vertical_compose53iH0(b0, b1, b2, width);
+
+    if (y - 1 < (unsigned)height)
+        horizontal_compose53i(b0, temp, width);
+    if (y + 0 < (unsigned)height)
+        horizontal_compose53i(b1, temp, width);
+
+    cs->b0  = b2;
+    cs->b1  = b3;
+    cs->y  += 2;
+}
+
+void ff_snow_horizontal_compose97i(IDWTELEM *b, IDWTELEM *temp, int width)
+{
+    const int w2 = (width + 1) >> 1;
+    int x;
+
+    temp[0] = b[0] - ((3 * b[w2] + 2) >> 2);
+    for (x = 1; x < (width >> 1); x++) {
+        temp[2 * x]     = b[x] - ((3 * (b[x + w2 - 1] + b[x + w2]) + 4) >> 3);
+        temp[2 * x - 1] = b[x + w2 - 1] - temp[2 * x - 2] - temp[2 * x];
+    }
+    if (width & 1) {
+        temp[2 * x]     = b[x] - ((3 * b[x + w2 - 1] + 2) >> 2);
+        temp[2 * x - 1] = b[x + w2 - 1] - temp[2 * x - 2] - temp[2 * x];
+    } else
+        temp[2 * x - 1] = b[x + w2 - 1] - 2 * temp[2 * x - 2];
+
+    b[0] = temp[0] + ((2 * temp[0] + temp[1] + 4) >> 3);
+    for (x = 2; x < width - 1; x += 2) {
+        b[x]     = temp[x] + ((4 * temp[x] + temp[x - 1] + temp[x + 1] + 8) >> 4);
+        b[x - 1] = temp[x - 1] + ((3 * (b[x - 2] + b[x])) >> 1);
+    }
+    if (width & 1) {
+        b[x]     = temp[x] + ((2 * temp[x] + temp[x - 1] + 4) >> 3);
+        b[x - 1] = temp[x - 1] + ((3 * (b[x - 2] + b[x])) >> 1);
+    } else
+        b[x - 1] = temp[x - 1] + 3 * b[x - 2];
+}
+
+static void vertical_compose97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
+}
+
+static void vertical_compose97iH1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (W_CM * (b0[i] + b2[i]) + W_CO) >> W_CS;
+}
+
+static void vertical_compose97iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (W_BM * (b0[i] + b2[i]) + 4 * b1[i] + W_BO) >> W_BS;
+}
+
+static void vertical_compose97iL1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (W_DM * (b0[i] + b2[i]) + W_DO) >> W_DS;
+}
+
+void ff_snow_vertical_compose97i(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                 IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5,
+                                 int width)
+{
+    int i;
+
+    for (i = 0; i < width; i++) {
+        b4[i] -= (W_DM * (b3[i] + b5[i]) + W_DO) >> W_DS;
+        b3[i] -= (W_CM * (b2[i] + b4[i]) + W_CO) >> W_CS;
+        b2[i] += (W_BM * (b1[i] + b3[i]) + 4 * b2[i] + W_BO) >> W_BS;
+        b1[i] += (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
+    }
+}
+
+static void spatial_compose97i_buffered_init(DWTCompose *cs, slice_buffer *sb,
+                                             int height, int stride_line)
+{
+    cs->b0 = slice_buffer_get_line(sb, avpriv_mirror(-3 - 1, height - 1) * stride_line);
+    cs->b1 = slice_buffer_get_line(sb, avpriv_mirror(-3,     height - 1) * stride_line);
+    cs->b2 = slice_buffer_get_line(sb, avpriv_mirror(-3 + 1, height - 1) * stride_line);
+    cs->b3 = slice_buffer_get_line(sb, avpriv_mirror(-3 + 2, height - 1) * stride_line);
+    cs->y  = -3;
+}
+
+static void spatial_compose97i_init(DWTCompose *cs, IDWTELEM *buffer, int height,
+                                    int stride)
+{
+    cs->b0 = buffer + avpriv_mirror(-3 - 1, height - 1) * stride;
+    cs->b1 = buffer + avpriv_mirror(-3,     height - 1) * stride;
+    cs->b2 = buffer + avpriv_mirror(-3 + 1, height - 1) * stride;
+    cs->b3 = buffer + avpriv_mirror(-3 + 2, height - 1) * stride;
+    cs->y  = -3;
+}
+
+static void spatial_compose97i_dy_buffered(SnowDWTContext *dsp, DWTCompose *cs,
+                                           slice_buffer * sb, IDWTELEM *temp,
+                                           int width, int height,
+                                           int stride_line)
+{
+    int y = cs->y;
+
+    IDWTELEM *b0 = cs->b0;
+    IDWTELEM *b1 = cs->b1;
+    IDWTELEM *b2 = cs->b2;
+    IDWTELEM *b3 = cs->b3;
+    IDWTELEM *b4 = slice_buffer_get_line(sb,
+                                         avpriv_mirror(y + 3, height - 1) *
+                                         stride_line);
+    IDWTELEM *b5 = slice_buffer_get_line(sb,
+                                         avpriv_mirror(y + 4, height - 1) *
+                                         stride_line);
+
+    if (y > 0 && y + 4 < height) {
+        dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
+    } else {
+        if (y + 3 < (unsigned)height)
+            vertical_compose97iL1(b3, b4, b5, width);
+        if (y + 2 < (unsigned)height)
+            vertical_compose97iH1(b2, b3, b4, width);
+        if (y + 1 < (unsigned)height)
+            vertical_compose97iL0(b1, b2, b3, width);
+        if (y + 0 < (unsigned)height)
+            vertical_compose97iH0(b0, b1, b2, width);
+    }
+
+    if (y - 1 < (unsigned)height)
+        dsp->horizontal_compose97i(b0, temp, width);
+    if (y + 0 < (unsigned)height)
+        dsp->horizontal_compose97i(b1, temp, width);
+
+    cs->b0  = b2;
+    cs->b1  = b3;
+    cs->b2  = b4;
+    cs->b3  = b5;
+    cs->y  += 2;
+}
+
+static void spatial_compose97i_dy(DWTCompose *cs, IDWTELEM *buffer,
+                                  IDWTELEM *temp, int width, int height,
+                                  int stride)
+{
+    int y        = cs->y;
+    IDWTELEM *b0 = cs->b0;
+    IDWTELEM *b1 = cs->b1;
+    IDWTELEM *b2 = cs->b2;
+    IDWTELEM *b3 = cs->b3;
+    IDWTELEM *b4 = buffer + avpriv_mirror(y + 3, height - 1) * stride;
+    IDWTELEM *b5 = buffer + avpriv_mirror(y + 4, height - 1) * stride;
+
+    if (y + 3 < (unsigned)height)
+        vertical_compose97iL1(b3, b4, b5, width);
+    if (y + 2 < (unsigned)height)
+        vertical_compose97iH1(b2, b3, b4, width);
+    if (y + 1 < (unsigned)height)
+        vertical_compose97iL0(b1, b2, b3, width);
+    if (y + 0 < (unsigned)height)
+        vertical_compose97iH0(b0, b1, b2, width);
+
+    if (y - 1 < (unsigned)height)
+        ff_snow_horizontal_compose97i(b0, temp, width);
+    if (y + 0 < (unsigned)height)
+        ff_snow_horizontal_compose97i(b1, temp, width);
+
+    cs->b0  = b2;
+    cs->b1  = b3;
+    cs->b2  = b4;
+    cs->b3  = b5;
+    cs->y  += 2;
+}
+
+void ff_spatial_idwt_buffered_init(DWTCompose *cs, slice_buffer *sb, int width,
+                                   int height, int stride_line, int type,
+                                   int decomposition_count)
+{
+    int level;
+    for (level = decomposition_count - 1; level >= 0; level--) {
+        switch (type) {
+        case DWT_97:
+            spatial_compose97i_buffered_init(cs + level, sb, height >> level,
+                                             stride_line << level);
+            break;
+        case DWT_53:
+            spatial_compose53i_buffered_init(cs + level, sb, height >> level,
+                                             stride_line << level);
+            break;
+        }
+    }
+}
+
+void ff_spatial_idwt_buffered_slice(SnowDWTContext *dsp, DWTCompose *cs,
+                                    slice_buffer *slice_buf, IDWTELEM *temp,
+                                    int width, int height, int stride_line,
+                                    int type, int decomposition_count, int y)
+{
+    const int support = type == 1 ? 3 : 5;
+    int level;
+    if (type == 2)
+        return;
+
+    for (level = decomposition_count - 1; level >= 0; level--)
+        while (cs[level].y <= FFMIN((y >> level) + support, height >> level)) {
+            switch (type) {
+            case DWT_97:
+                spatial_compose97i_dy_buffered(dsp, cs + level, slice_buf, temp,
+                                               width >> level,
+                                               height >> level,
+                                               stride_line << level);
+                break;
+            case DWT_53:
+                spatial_compose53i_dy_buffered(cs + level, slice_buf, temp,
+                                               width >> level,
+                                               height >> level,
+                                               stride_line << level);
+                break;
+            }
+        }
+}
+
+static void spatial_idwt_init(DWTCompose *cs, IDWTELEM *buffer, int width,
+                                 int height, int stride, int type,
+                                 int decomposition_count)
+{
+    int level;
+    for (level = decomposition_count - 1; level >= 0; level--) {
+        switch (type) {
+        case DWT_97:
+            spatial_compose97i_init(cs + level, buffer, height >> level,
+                                    stride << level);
+            break;
+        case DWT_53:
+            spatial_compose53i_init(cs + level, buffer, height >> level,
+                                    stride << level);
+            break;
+        }
+    }
+}
+
+static void spatial_idwt_slice(DWTCompose *cs, IDWTELEM *buffer,
+                                  IDWTELEM *temp, int width, int height,
+                                  int stride, int type,
+                                  int decomposition_count, int y)
+{
+    const int support = type == 1 ? 3 : 5;
+    int level;
+    if (type == 2)
+        return;
+
+    for (level = decomposition_count - 1; level >= 0; level--)
+        while (cs[level].y <= FFMIN((y >> level) + support, height >> level)) {
+            switch (type) {
+            case DWT_97:
+                spatial_compose97i_dy(cs + level, buffer, temp, width >> level,
+                                      height >> level, stride << level);
+                break;
+            case DWT_53:
+                spatial_compose53i_dy(cs + level, buffer, temp, width >> level,
+                                      height >> level, stride << level);
+                break;
+            }
+        }
+}
+
+void ff_spatial_idwt(IDWTELEM *buffer, IDWTELEM *temp, int width, int height,
+                     int stride, int type, int decomposition_count)
+{
+    DWTCompose cs[MAX_DECOMPOSITIONS];
+    int y;
+    spatial_idwt_init(cs, buffer, width, height, stride, type,
+                         decomposition_count);
+    for (y = 0; y < height; y += 4)
+        spatial_idwt_slice(cs, buffer, temp, width, height, stride, type,
+                              decomposition_count, y);
+}
+
+static inline int w_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size,
+                      int w, int h, int type)
+{
+    int s, i, j;
+    const int dec_count = w == 8 ? 3 : 4;
+    int tmp[32 * 32], tmp2[32];
+    int level, ori;
+    static const int scale[2][2][4][4] = {
+        {
+            { // 9/7 8x8 dec=3
+                { 268, 239, 239, 213 },
+                { 0,   224, 224, 152 },
+                { 0,   135, 135, 110 },
+            },
+            { // 9/7 16x16 or 32x32 dec=4
+                { 344, 310, 310, 280 },
+                { 0,   320, 320, 228 },
+                { 0,   175, 175, 136 },
+                { 0,   129, 129, 102 },
+            }
+        },
+        {
+            { // 5/3 8x8 dec=3
+                { 275, 245, 245, 218 },
+                { 0,   230, 230, 156 },
+                { 0,   138, 138, 113 },
+            },
+            { // 5/3 16x16 or 32x32 dec=4
+                { 352, 317, 317, 286 },
+                { 0,   328, 328, 233 },
+                { 0,   180, 180, 140 },
+                { 0,   132, 132, 105 },
+            }
+        }
+    };
+
+    for (i = 0; i < h; i++) {
+        for (j = 0; j < w; j += 4) {
+            tmp[32 * i + j + 0] = (pix1[j + 0] - pix2[j + 0]) << 4;
+            tmp[32 * i + j + 1] = (pix1[j + 1] - pix2[j + 1]) << 4;
+            tmp[32 * i + j + 2] = (pix1[j + 2] - pix2[j + 2]) << 4;
+            tmp[32 * i + j + 3] = (pix1[j + 3] - pix2[j + 3]) << 4;
+        }
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+
+    ff_spatial_dwt(tmp, tmp2, w, h, 32, type, dec_count);
+
+    s = 0;
+    av_assert1(w == h);
+    for (level = 0; level < dec_count; level++)
+        for (ori = level ? 1 : 0; ori < 4; ori++) {
+            int size   = w >> (dec_count - level);
+            int sx     = (ori & 1) ? size : 0;
+            int stride = 32 << (dec_count - level);
+            int sy     = (ori & 2) ? stride >> 1 : 0;
+
+            for (i = 0; i < size; i++)
+                for (j = 0; j < size; j++) {
+                    int v = tmp[sx + sy + i * stride + j] *
+                            scale[type][dec_count - 3][level][ori];
+                    s += FFABS(v);
+                }
+        }
+    av_assert1(s >= 0);
+    return s >> 9;
+}
+
+static int w53_8_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 8, h, 1);
+}
+
+static int w97_8_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 8, h, 0);
+}
+
+static int w53_16_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 16, h, 1);
+}
+
+static int w97_16_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 16, h, 0);
+}
+
+int ff_w53_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 32, h, 1);
+}
+
+int ff_w97_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 32, h, 0);
+}
+
+av_cold void ff_dsputil_init_dwt(MECmpContext *c)
+{
+    c->w53[0] = w53_16_c;
+    c->w53[1] = w53_8_c;
+    c->w97[0] = w97_16_c;
+    c->w97[1] = w97_8_c;
+}
+
+av_cold void ff_dwt_init(SnowDWTContext *c)
+{
+    c->vertical_compose97i   = ff_snow_vertical_compose97i;
+    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
+    c->inner_add_yblock      = ff_snow_inner_add_yblock;
+
+    if (HAVE_MMX)
+        ff_dwt_init_x86(c);
+}
+
+
diff --git a/libavcodec/snow_dwt.h b/libavcodec/snow_dwt.h
new file mode 100644
index 0000000000..e2d7528056
--- /dev/null
+++ b/libavcodec/snow_dwt.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SNOW_DWT_H
+#define AVCODEC_SNOW_DWT_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef int DWTELEM;
+typedef short IDWTELEM;
+
+#define MAX_DECOMPOSITIONS 8
+
+typedef struct DWTCompose {
+    IDWTELEM *b0;
+    IDWTELEM *b1;
+    IDWTELEM *b2;
+    IDWTELEM *b3;
+    int y;
+} DWTCompose;
+
+/** Used to minimize the amount of memory used in order to
+ *  optimize cache performance. **/
+typedef struct slice_buffer_s {
+    IDWTELEM **line;   ///< For use by idwt and predict_slices.
+    IDWTELEM **data_stack;   ///< Used for internal purposes.
+    int data_stack_top;
+    int line_count;
+    int line_width;
+    int data_count;
+    IDWTELEM *base_buffer;  ///< Buffer that this structure is caching.
+} slice_buffer;
+
+struct SnowDWTContext;
+
+typedef struct SnowDWTContext {
+    void (*vertical_compose97i)(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5,
+                                int width);
+    void (*horizontal_compose97i)(IDWTELEM *b, IDWTELEM *temp, int width);
+    void (*inner_add_yblock)(const uint8_t *obmc, const int obmc_stride,
+                             uint8_t **block, int b_w, int b_h, int src_x,
+                             int src_y, int src_stride, slice_buffer *sb,
+                             int add, uint8_t *dst8);
+} SnowDWTContext;
+
+
+#define DWT_97 0
+#define DWT_53 1
+
+#define liftS lift
+#define W_AM 3
+#define W_AO 0
+#define W_AS 1
+
+#undef liftS
+#define W_BM 1
+#define W_BO 8
+#define W_BS 4
+
+#define W_CM 1
+#define W_CO 0
+#define W_CS 0
+
+#define W_DM 3
+#define W_DO 4
+#define W_DS 3
+
+#define slice_buffer_get_line(slice_buf, line_num)                          \
+    ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num]              \
+                                 : ff_slice_buffer_load_line((slice_buf),   \
+                                                             (line_num)))
+
+int ff_slice_buffer_init(slice_buffer *buf, int line_count,
+                         int max_allocated_lines, int line_width,
+                         IDWTELEM *base_buffer);
+void ff_slice_buffer_release(slice_buffer *buf, int line);
+void ff_slice_buffer_flush(slice_buffer *buf);
+void ff_slice_buffer_destroy(slice_buffer *buf);
+IDWTELEM *ff_slice_buffer_load_line(slice_buffer *buf, int line);
+
+void ff_snow_vertical_compose97i(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                 IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5,
+                                 int width);
+void ff_snow_horizontal_compose97i(IDWTELEM *b, IDWTELEM *temp, int width);
+void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride,
+                              uint8_t **block, int b_w, int b_h, int src_x,
+                              int src_y, int src_stride, slice_buffer *sb,
+                              int add, uint8_t *dst8);
+
+int ff_w53_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h);
+int ff_w97_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h);
+
+void ff_spatial_dwt(int *buffer, int *temp, int width, int height, int stride,
+                    int type, int decomposition_count);
+
+void ff_spatial_idwt_buffered_init(DWTCompose *cs, slice_buffer *sb, int width,
+                                   int height, int stride_line, int type,
+                                   int decomposition_count);
+void ff_spatial_idwt_buffered_slice(SnowDWTContext *dsp, DWTCompose *cs,
+                                    slice_buffer *slice_buf, IDWTELEM *temp,
+                                    int width, int height, int stride_line,
+                                    int type, int decomposition_count, int y);
+void ff_spatial_idwt(IDWTELEM *buffer, IDWTELEM *temp, int width, int height,
+                     int stride, int type, int decomposition_count);
+
+void ff_dwt_init(SnowDWTContext *c);
+void ff_dwt_init_x86(SnowDWTContext *c);
+
+#endif /* AVCODEC_DWT_H */
diff --git a/libavcodec/snowdata.h b/libavcodec/snowdata.h
new file mode 100644
index 0000000000..490fdf8bd6
--- /dev/null
+++ b/libavcodec/snowdata.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SNOWDATA_H
+#define AVCODEC_SNOWDATA_H
+
+#include "snow.h"
+
+static const uint8_t obmc32[1024]={
+  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
+  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
+  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
+  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
+  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
+  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
+  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
+  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
+  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
+  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
+  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
+  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
+  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
+  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
+  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
+  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
+  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
+  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
+  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
+  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
+  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
+  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
+  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
+  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
+  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
+  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
+  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
+  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
+  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
+  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
+  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
+  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
+ //error:0.000020
+};
+static const uint8_t obmc16[256]={
+  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
+  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
+  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
+  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
+  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
+  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
+  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
+  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
+  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
+//error:0.000015
+};
+
+//linear *64
+static const uint8_t obmc8[64]={
+  4, 12, 20, 28, 28, 20, 12,  4,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+ 20, 60,100,140,140,100, 60, 20,
+ 28, 84,140,196,196,140, 84, 28,
+ 28, 84,140,196,196,140, 84, 28,
+ 20, 60,100,140,140,100, 60, 20,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+  4, 12, 20, 28, 28, 20, 12,  4,
+//error:0.000000
+};
+
+//linear *64
+static const uint8_t obmc4[16]={
+ 16, 48, 48, 16,
+ 48,144,144, 48,
+ 48,144,144, 48,
+ 16, 48, 48, 16,
+//error:0.000000
+};
+
+const int8_t ff_quant3bA[256]={
+ 0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+};
+
+const uint8_t * const ff_obmc_tab[4]= {
+    obmc32, obmc16, obmc8, obmc4
+};
+
+/* runtime generated tables */
+uint8_t ff_qexp[QROOT];
+int ff_scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
+
+
+#endif /* AVCODEC_SNOW_H */
diff --git a/libavcodec/snowdec.c b/libavcodec/snowdec.c
new file mode 100644
index 0000000000..1b288dd813
--- /dev/null
+++ b/libavcodec/snowdec.c
@@ -0,0 +1,648 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intmath.h"
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "snow_dwt.h"
+#include "internal.h"
+#include "snow.h"
+
+#include "rangecoder.h"
+#include "mathops.h"
+
+#include "mpegvideo.h"
+#include "h263.h"
+
+static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, IDWTELEM * old_buffer, int plane_index, int add, int mb_y){
+    Plane *p= &s->plane[plane_index];
+    const int mb_w= s->b_width  << s->block_max_depth;
+    const int mb_h= s->b_height << s->block_max_depth;
+    int x, y, mb_x;
+    int block_size = MB_SIZE >> s->block_max_depth;
+    int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *dst8= s->current_picture->data[plane_index];
+    int w= p->width;
+    int h= p->height;
+
+    if(s->keyframe || (s->avctx->debug&512)){
+        if(mb_y==mb_h)
+            return;
+
+        if(add){
+            for(y=block_h*mb_y; y<FFMIN(h,block_h*(mb_y+1)); y++){
+//                DWTELEM * line = slice_buffer_get_line(sb, y);
+                IDWTELEM * line = sb->line[y];
+                for(x=0; x<w; x++){
+//                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
+                    int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
+                    v >>= FRAC_BITS;
+                    if(v&(~255)) v= ~(v>>31);
+                    dst8[x + y*ref_stride]= v;
+                }
+            }
+        }else{
+            for(y=block_h*mb_y; y<FFMIN(h,block_h*(mb_y+1)); y++){
+//                DWTELEM * line = slice_buffer_get_line(sb, y);
+                IDWTELEM * line = sb->line[y];
+                for(x=0; x<w; x++){
+                    line[x] -= 128 << FRAC_BITS;
+//                    buf[x + y*w]-= 128<<FRAC_BITS;
+                }
+            }
+        }
+
+        return;
+    }
+
+    for(mb_x=0; mb_x<=mb_w; mb_x++){
+        add_yblock(s, 1, sb, old_buffer, dst8, obmc,
+                   block_w*mb_x - block_w/2,
+                   block_h*mb_y - block_h/2,
+                   block_w, block_h,
+                   w, h,
+                   w, ref_stride, obmc_stride,
+                   mb_x - 1, mb_y - 1,
+                   add, 0, plane_index);
+    }
+
+    if(s->avmv && mb_y < mb_h && plane_index == 0)
+        for(mb_x=0; mb_x<mb_w; mb_x++){
+            AVMotionVector *avmv = s->avmv + s->avmv_index;
+            const int b_width = s->b_width  << s->block_max_depth;
+            const int b_stride= b_width;
+            BlockNode *bn= &s->block[mb_x + mb_y*b_stride];
+
+            if (bn->type)
+                continue;
+
+            s->avmv_index++;
+
+            avmv->w = block_w;
+            avmv->h = block_h;
+            avmv->dst_x = block_w*mb_x - block_w/2;
+            avmv->dst_y = block_h*mb_y - block_h/2;
+            avmv->src_x = avmv->dst_x + (bn->mx * s->mv_scale)/8;
+            avmv->src_y = avmv->dst_y + (bn->my * s->mv_scale)/8;
+            avmv->source= -1 - bn->ref;
+            avmv->flags = 0;
+        }
+}
+
+static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
+    const int w= b->width;
+    int y;
+    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
+    int qmul= ff_qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
+    int new_index = 0;
+
+    if(b->ibuf == s->spatial_idwt_buffer || s->qlog == LOSSLESS_QLOG){
+        qadd= 0;
+        qmul= 1<<QEXPSHIFT;
+    }
+
+    /* If we are on the second or later slice, restore our index. */
+    if (start_y != 0)
+        new_index = save_state[0];
+
+
+    for(y=start_y; y<h; y++){
+        int x = 0;
+        int v;
+        IDWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
+        memset(line, 0, b->width*sizeof(IDWTELEM));
+        v = b->x_coeff[new_index].coeff;
+        x = b->x_coeff[new_index++].x;
+        while(x < w){
+            register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
+            register int u= -(v&1);
+            line[x] = (t^u) - u;
+
+            v = b->x_coeff[new_index].coeff;
+            x = b->x_coeff[new_index++].x;
+        }
+    }
+
+    /* Save our variables for the next slice. */
+    save_state[0] = new_index;
+
+    return;
+}
+
+static int decode_q_branch(SnowContext *s, int level, int x, int y){
+    const int w= s->b_width << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    int trx= (x+1)<<rem_depth;
+    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
+    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
+    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
+    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
+    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
+    int res;
+
+    if(s->keyframe){
+        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
+        return 0;
+    }
+
+    if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
+        int type, mx, my;
+        int l = left->color[0];
+        int cb= left->color[1];
+        int cr= left->color[2];
+        unsigned ref = 0;
+        int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
+        int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
+        int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
+
+        type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
+
+        if(type){
+            pred_mv(s, &mx, &my, 0, left, top, tr);
+            l += get_symbol(&s->c, &s->block_state[32], 1);
+            if (s->nb_planes > 2) {
+                cb+= get_symbol(&s->c, &s->block_state[64], 1);
+                cr+= get_symbol(&s->c, &s->block_state[96], 1);
+            }
+        }else{
+            if(s->ref_frames > 1)
+                ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
+            if (ref >= s->ref_frames) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid ref\n");
+                return AVERROR_INVALIDDATA;
+            }
+            pred_mv(s, &mx, &my, ref, left, top, tr);
+            mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
+            my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
+        }
+        set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
+    }else{
+        if ((res = decode_q_branch(s, level+1, 2*x+0, 2*y+0)) < 0 ||
+            (res = decode_q_branch(s, level+1, 2*x+1, 2*y+0)) < 0 ||
+            (res = decode_q_branch(s, level+1, 2*x+0, 2*y+1)) < 0 ||
+            (res = decode_q_branch(s, level+1, 2*x+1, 2*y+1)) < 0)
+            return res;
+    }
+    return 0;
+}
+
+static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int start_y, int end_y){
+    const int w= b->width;
+    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
+    const int qmul= ff_qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
+    int x,y;
+
+    if(s->qlog == LOSSLESS_QLOG) return;
+
+    for(y=start_y; y<end_y; y++){
+//        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
+        IDWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+        for(x=0; x<w; x++){
+            int i= line[x];
+            if(i<0){
+                line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
+            }else if(i>0){
+                line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
+            }
+        }
+    }
+}
+
+static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
+    const int w= b->width;
+    int x,y;
+
+    IDWTELEM * line=0; // silence silly "could be used without having been initialized" warning
+    IDWTELEM * prev;
+
+    if (start_y != 0)
+        line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+
+    for(y=start_y; y<end_y; y++){
+        prev = line;
+//        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
+        line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+        for(x=0; x<w; x++){
+            if(x){
+                if(use_median){
+                    if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
+                    else  line[x] += line[x - 1];
+                }else{
+                    if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
+                    else  line[x] += line[x - 1];
+                }
+            }else{
+                if(y) line[x] += prev[x];
+            }
+        }
+    }
+}
+
+static void decode_qlogs(SnowContext *s){
+    int plane_index, level, orientation;
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        for(level=0; level<s->spatial_decomposition_count; level++){
+            for(orientation=level ? 1:0; orientation<4; orientation++){
+                int q;
+                if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
+                else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
+                else                    q= get_symbol(&s->c, s->header_state, 1);
+                s->plane[plane_index].band[level][orientation].qlog= q;
+            }
+        }
+    }
+}
+
+#define GET_S(dst, check) \
+    tmp= get_symbol(&s->c, s->header_state, 0);\
+    if(!(check)){\
+        av_log(s->avctx, AV_LOG_ERROR, "Error " #dst " is %d\n", tmp);\
+        return AVERROR_INVALIDDATA;\
+    }\
+    dst= tmp;
+
+static int decode_header(SnowContext *s){
+    int plane_index, tmp;
+    uint8_t kstate[32];
+
+    memset(kstate, MID_STATE, sizeof(kstate));
+
+    s->keyframe= get_rac(&s->c, kstate);
+    if(s->keyframe || s->always_reset){
+        ff_snow_reset_contexts(s);
+        s->spatial_decomposition_type=
+        s->qlog=
+        s->qbias=
+        s->mv_scale=
+        s->block_max_depth= 0;
+    }
+    if(s->keyframe){
+        GET_S(s->version, tmp <= 0U)
+        s->always_reset= get_rac(&s->c, s->header_state);
+        s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
+        s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
+        GET_S(s->spatial_decomposition_count, 0 < tmp && tmp <= MAX_DECOMPOSITIONS)
+        s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
+        if (s->colorspace_type == 1) {
+            s->avctx->pix_fmt= AV_PIX_FMT_GRAY8;
+            s->nb_planes = 1;
+        } else if(s->colorspace_type == 0) {
+            s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
+            s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
+
+            if(s->chroma_h_shift == 1 && s->chroma_v_shift==1){
+                s->avctx->pix_fmt= AV_PIX_FMT_YUV420P;
+            }else if(s->chroma_h_shift == 0 && s->chroma_v_shift==0){
+                s->avctx->pix_fmt= AV_PIX_FMT_YUV444P;
+            }else if(s->chroma_h_shift == 2 && s->chroma_v_shift==2){
+                s->avctx->pix_fmt= AV_PIX_FMT_YUV410P;
+            } else {
+                av_log(s, AV_LOG_ERROR, "unsupported color subsample mode %d %d\n", s->chroma_h_shift, s->chroma_v_shift);
+                s->chroma_h_shift = s->chroma_v_shift = 1;
+                s->avctx->pix_fmt= AV_PIX_FMT_YUV420P;
+                return AVERROR_INVALIDDATA;
+            }
+            s->nb_planes = 3;
+        } else {
+            av_log(s, AV_LOG_ERROR, "unsupported color space\n");
+            s->chroma_h_shift = s->chroma_v_shift = 1;
+            s->avctx->pix_fmt= AV_PIX_FMT_YUV420P;
+            return AVERROR_INVALIDDATA;
+        }
+
+
+        s->spatial_scalability= get_rac(&s->c, s->header_state);
+//        s->rate_scalability= get_rac(&s->c, s->header_state);
+        GET_S(s->max_ref_frames, tmp < (unsigned)MAX_REF_FRAMES)
+        s->max_ref_frames++;
+
+        decode_qlogs(s);
+    }
+
+    if(!s->keyframe){
+        if(get_rac(&s->c, s->header_state)){
+            for(plane_index=0; plane_index<FFMIN(s->nb_planes, 2); plane_index++){
+                int htaps, i, sum=0;
+                Plane *p= &s->plane[plane_index];
+                p->diag_mc= get_rac(&s->c, s->header_state);
+                htaps= get_symbol(&s->c, s->header_state, 0)*2 + 2;
+                if((unsigned)htaps > HTAPS_MAX || htaps==0)
+                    return AVERROR_INVALIDDATA;
+                p->htaps= htaps;
+                for(i= htaps/2; i; i--){
+                    p->hcoeff[i]= get_symbol(&s->c, s->header_state, 0) * (1-2*(i&1));
+                    sum += p->hcoeff[i];
+                }
+                p->hcoeff[0]= 32-sum;
+            }
+            s->plane[2].diag_mc= s->plane[1].diag_mc;
+            s->plane[2].htaps  = s->plane[1].htaps;
+            memcpy(s->plane[2].hcoeff, s->plane[1].hcoeff, sizeof(s->plane[1].hcoeff));
+        }
+        if(get_rac(&s->c, s->header_state)){
+            GET_S(s->spatial_decomposition_count, 0 < tmp && tmp <= MAX_DECOMPOSITIONS)
+            decode_qlogs(s);
+        }
+    }
+
+    s->spatial_decomposition_type+= get_symbol(&s->c, s->header_state, 1);
+    if(s->spatial_decomposition_type > 1U){
+        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported\n", s->spatial_decomposition_type);
+        return AVERROR_INVALIDDATA;
+    }
+    if(FFMIN(s->avctx-> width>>s->chroma_h_shift,
+             s->avctx->height>>s->chroma_v_shift) >> (s->spatial_decomposition_count-1) <= 1){
+        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_count %d too large for size\n", s->spatial_decomposition_count);
+        return AVERROR_INVALIDDATA;
+    }
+
+
+    s->qlog           += get_symbol(&s->c, s->header_state, 1);
+    s->mv_scale       += get_symbol(&s->c, s->header_state, 1);
+    s->qbias          += get_symbol(&s->c, s->header_state, 1);
+    s->block_max_depth+= get_symbol(&s->c, s->header_state, 1);
+    if(s->block_max_depth > 1 || s->block_max_depth < 0){
+        av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large\n", s->block_max_depth);
+        s->block_max_depth= 0;
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    int ret;
+
+    if ((ret = ff_snow_common_init(avctx)) < 0) {
+        return ret;
+    }
+
+    return 0;
+}
+
+static int decode_blocks(SnowContext *s){
+    int x, y;
+    int w= s->b_width;
+    int h= s->b_height;
+    int res;
+
+    for(y=0; y<h; y++){
+        for(x=0; x<w; x++){
+            if ((res = decode_q_branch(s, 0, x, y)) < 0)
+                return res;
+        }
+    }
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    SnowContext *s = avctx->priv_data;
+    RangeCoder * const c= &s->c;
+    int bytes_read;
+    AVFrame *picture = data;
+    int level, orientation, plane_index;
+    int res;
+
+    ff_init_range_decoder(c, buf, buf_size);
+    ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
+
+    s->current_picture->pict_type= AV_PICTURE_TYPE_I; //FIXME I vs. P
+    if ((res = decode_header(s)) < 0)
+        return res;
+    if ((res=ff_snow_common_init_after_header(avctx)) < 0)
+        return res;
+
+    // realloc slice buffer for the case that spatial_decomposition_count changed
+    ff_slice_buffer_destroy(&s->sb);
+    if ((res = ff_slice_buffer_init(&s->sb, s->plane[0].height,
+                                    (MB_SIZE >> s->block_max_depth) +
+                                    s->spatial_decomposition_count * 11 + 1,
+                                    s->plane[0].width,
+                                    s->spatial_idwt_buffer)) < 0)
+        return res;
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        Plane *p= &s->plane[plane_index];
+        p->fast_mc= p->diag_mc && p->htaps==6 && p->hcoeff[0]==40
+                                              && p->hcoeff[1]==-10
+                                              && p->hcoeff[2]==2;
+    }
+
+    ff_snow_alloc_blocks(s);
+
+    if((res = ff_snow_frame_start(s)) < 0)
+        return res;
+
+    s->current_picture->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+
+    //keyframe flag duplication mess FIXME
+    if(avctx->debug&FF_DEBUG_PICT_INFO)
+        av_log(avctx, AV_LOG_ERROR,
+               "keyframe:%d qlog:%d qbias: %d mvscale: %d "
+               "decomposition_type:%d decomposition_count:%d\n",
+               s->keyframe, s->qlog, s->qbias, s->mv_scale,
+               s->spatial_decomposition_type,
+               s->spatial_decomposition_count
+              );
+
+    av_assert0(!s->avmv);
+    if (s->avctx->flags2 & AV_CODEC_FLAG2_EXPORT_MVS) {
+        s->avmv = av_malloc_array(s->b_width * s->b_height, sizeof(AVMotionVector) << (s->block_max_depth*2));
+    }
+    s->avmv_index = 0;
+
+    if ((res = decode_blocks(s)) < 0)
+        return res;
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        Plane *p= &s->plane[plane_index];
+        int w= p->width;
+        int h= p->height;
+        int x, y;
+        int decode_state[MAX_DECOMPOSITIONS][4][1]; /* Stored state info for unpack_coeffs. 1 variable per instance. */
+
+        if(s->avctx->debug&2048){
+            memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
+            predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
+
+            for(y=0; y<h; y++){
+                for(x=0; x<w; x++){
+                    int v= s->current_picture->data[plane_index][y*s->current_picture->linesize[plane_index] + x];
+                    s->mconly_picture->data[plane_index][y*s->mconly_picture->linesize[plane_index] + x]= v;
+                }
+            }
+        }
+
+        {
+        for(level=0; level<s->spatial_decomposition_count; level++){
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                SubBand *b= &p->band[level][orientation];
+                unpack_coeffs(s, b, b->parent, orientation);
+            }
+        }
+        }
+
+        {
+        const int mb_h= s->b_height << s->block_max_depth;
+        const int block_size = MB_SIZE >> s->block_max_depth;
+        const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+        int mb_y;
+        DWTCompose cs[MAX_DECOMPOSITIONS];
+        int yd=0, yq=0;
+        int y;
+        int end_y;
+
+        ff_spatial_idwt_buffered_init(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count);
+        for(mb_y=0; mb_y<=mb_h; mb_y++){
+
+            int slice_starty = block_h*mb_y;
+            int slice_h = block_h*(mb_y+1);
+
+            if (!(s->keyframe || s->avctx->debug&512)){
+                slice_starty = FFMAX(0, slice_starty - (block_h >> 1));
+                slice_h -= (block_h >> 1);
+            }
+
+            for(level=0; level<s->spatial_decomposition_count; level++){
+                for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                    SubBand *b= &p->band[level][orientation];
+                    int start_y;
+                    int end_y;
+                    int our_mb_start = mb_y;
+                    int our_mb_end = (mb_y + 1);
+                    const int extra= 3;
+                    start_y = (mb_y ? ((block_h * our_mb_start) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra: 0);
+                    end_y = (((block_h * our_mb_end) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra);
+                    if (!(s->keyframe || s->avctx->debug&512)){
+                        start_y = FFMAX(0, start_y - (block_h >> (1+s->spatial_decomposition_count - level)));
+                        end_y = FFMAX(0, end_y - (block_h >> (1+s->spatial_decomposition_count - level)));
+                    }
+                    start_y = FFMIN(b->height, start_y);
+                    end_y = FFMIN(b->height, end_y);
+
+                    if (start_y != end_y){
+                        if (orientation == 0){
+                            SubBand * correlate_band = &p->band[0][0];
+                            int correlate_end_y = FFMIN(b->height, end_y + 1);
+                            int correlate_start_y = FFMIN(b->height, (start_y ? start_y + 1 : 0));
+                            decode_subband_slice_buffered(s, correlate_band, &s->sb, correlate_start_y, correlate_end_y, decode_state[0][0]);
+                            correlate_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, 1, 0, correlate_start_y, correlate_end_y);
+                            dequantize_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, start_y, end_y);
+                        }
+                        else
+                            decode_subband_slice_buffered(s, b, &s->sb, start_y, end_y, decode_state[level][orientation]);
+                    }
+                }
+            }
+
+            for(; yd<slice_h; yd+=4){
+                ff_spatial_idwt_buffered_slice(&s->dwt, cs, &s->sb, s->temp_idwt_buffer, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
+            }
+
+            if(s->qlog == LOSSLESS_QLOG){
+                for(; yq<slice_h && yq<h; yq++){
+                    IDWTELEM * line = slice_buffer_get_line(&s->sb, yq);
+                    for(x=0; x<w; x++){
+                        line[x] <<= FRAC_BITS;
+                    }
+                }
+            }
+
+            predict_slice_buffered(s, &s->sb, s->spatial_idwt_buffer, plane_index, 1, mb_y);
+
+            y = FFMIN(p->height, slice_starty);
+            end_y = FFMIN(p->height, slice_h);
+            while(y < end_y)
+                ff_slice_buffer_release(&s->sb, y++);
+        }
+
+        ff_slice_buffer_flush(&s->sb);
+        }
+
+    }
+
+    emms_c();
+
+    ff_snow_release_buffer(avctx);
+
+    if(!(s->avctx->debug&2048))
+        res = av_frame_ref(picture, s->current_picture);
+    else
+        res = av_frame_ref(picture, s->mconly_picture);
+    if (res >= 0 && s->avmv_index) {
+        AVFrameSideData *sd;
+
+        sd = av_frame_new_side_data(picture, AV_FRAME_DATA_MOTION_VECTORS, s->avmv_index * sizeof(AVMotionVector));
+        if (!sd)
+            return AVERROR(ENOMEM);
+        memcpy(sd->data, s->avmv, s->avmv_index * sizeof(AVMotionVector));
+    }
+
+    av_freep(&s->avmv);
+
+    if (res < 0)
+        return res;
+
+    *got_frame = 1;
+
+    bytes_read= c->bytestream - c->bytestream_start;
+    if(bytes_read ==0) av_log(s->avctx, AV_LOG_ERROR, "error at end of frame\n"); //FIXME
+
+    return bytes_read;
+}
+
+static av_cold int decode_end(AVCodecContext *avctx)
+{
+    SnowContext *s = avctx->priv_data;
+
+    ff_slice_buffer_destroy(&s->sb);
+
+    ff_snow_common_end(s);
+
+    return 0;
+}
+
+AVCodec ff_snow_decoder = {
+    .name           = "snow",
+    .long_name      = NULL_IF_CONFIG_SMALL("Snow"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_SNOW,
+    .priv_data_size = sizeof(SnowContext),
+    .init           = decode_init,
+    .close          = decode_end,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
+                      FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/snowenc.c b/libavcodec/snowenc.c
new file mode 100644
index 0000000000..7e8269cd92
--- /dev/null
+++ b/libavcodec/snowenc.c
@@ -0,0 +1,2067 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intmath.h"
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "snow_dwt.h"
+#include "snow.h"
+
+#include "rangecoder.h"
+#include "mathops.h"
+
+#include "mpegvideo.h"
+#include "h263.h"
+
+#define FF_ME_ITER 50
+
+static av_cold int encode_init(AVCodecContext *avctx)
+{
+    SnowContext *s = avctx->priv_data;
+    int plane_index, ret;
+    int i;
+
+    if(avctx->prediction_method == DWT_97
+       && (avctx->flags & AV_CODEC_FLAG_QSCALE)
+       && avctx->global_quality == 0){
+        av_log(avctx, AV_LOG_ERROR, "The 9/7 wavelet is incompatible with lossless mode.\n");
+        return -1;
+    }
+#if FF_API_MOTION_EST
+FF_DISABLE_DEPRECATION_WARNINGS
+    if (avctx->me_method == ME_ITER)
+        s->motion_est = FF_ME_ITER;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
+
+    s->mv_scale       = (avctx->flags & AV_CODEC_FLAG_QPEL) ? 2 : 4;
+    s->block_max_depth= (avctx->flags & AV_CODEC_FLAG_4MV ) ? 1 : 0;
+
+    for(plane_index=0; plane_index<3; plane_index++){
+        s->plane[plane_index].diag_mc= 1;
+        s->plane[plane_index].htaps= 6;
+        s->plane[plane_index].hcoeff[0]=  40;
+        s->plane[plane_index].hcoeff[1]= -10;
+        s->plane[plane_index].hcoeff[2]=   2;
+        s->plane[plane_index].fast_mc= 1;
+    }
+
+    if ((ret = ff_snow_common_init(avctx)) < 0) {
+        return ret;
+    }
+    ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
+
+    ff_snow_alloc_blocks(s);
+
+    s->version=0;
+
+    s->m.avctx   = avctx;
+    s->m.bit_rate= avctx->bit_rate;
+
+    s->m.me.temp      =
+    s->m.me.scratchpad= av_mallocz_array((avctx->width+64), 2*16*2*sizeof(uint8_t));
+    s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
+    s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
+    s->m.sc.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
+    if (!s->m.me.scratchpad || !s->m.me.map || !s->m.me.score_map || !s->m.sc.obmc_scratchpad)
+        return AVERROR(ENOMEM);
+
+    ff_h263_encode_init(&s->m); //mv_penalty
+
+    s->max_ref_frames = av_clip(avctx->refs, 1, MAX_REF_FRAMES);
+
+    if(avctx->flags&AV_CODEC_FLAG_PASS1){
+        if(!avctx->stats_out)
+            avctx->stats_out = av_mallocz(256);
+
+        if (!avctx->stats_out)
+            return AVERROR(ENOMEM);
+    }
+    if((avctx->flags&AV_CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
+        if(ff_rate_control_init(&s->m) < 0)
+            return -1;
+    }
+    s->pass1_rc= !(avctx->flags & (AV_CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
+
+    switch(avctx->pix_fmt){
+    case AV_PIX_FMT_YUV444P:
+//    case AV_PIX_FMT_YUV422P:
+    case AV_PIX_FMT_YUV420P:
+//    case AV_PIX_FMT_YUV411P:
+    case AV_PIX_FMT_YUV410P:
+        s->nb_planes = 3;
+        s->colorspace_type= 0;
+        break;
+    case AV_PIX_FMT_GRAY8:
+        s->nb_planes = 1;
+        s->colorspace_type = 1;
+        break;
+/*    case AV_PIX_FMT_RGB32:
+        s->colorspace= 1;
+        break;*/
+    default:
+        av_log(avctx, AV_LOG_ERROR, "pixel format not supported\n");
+        return -1;
+    }
+    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
+
+    ff_set_cmp(&s->mecc, s->mecc.me_cmp, s->avctx->me_cmp);
+    ff_set_cmp(&s->mecc, s->mecc.me_sub_cmp, s->avctx->me_sub_cmp);
+
+    s->input_picture = av_frame_alloc();
+    if (!s->input_picture)
+        return AVERROR(ENOMEM);
+
+    if ((ret = ff_snow_get_buffer(s, s->input_picture)) < 0)
+        return ret;
+
+    if(s->motion_est == FF_ME_ITER){
+        int size= s->b_width * s->b_height << 2*s->block_max_depth;
+        for(i=0; i<s->max_ref_frames; i++){
+            s->ref_mvs[i]= av_mallocz_array(size, sizeof(int16_t[2]));
+            s->ref_scores[i]= av_mallocz_array(size, sizeof(uint32_t));
+            if (!s->ref_mvs[i] || !s->ref_scores[i])
+                return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
+//near copy & paste from dsputil, FIXME
+static int pix_sum(uint8_t * pix, int line_size, int w, int h)
+{
+    int s, i, j;
+
+    s = 0;
+    for (i = 0; i < h; i++) {
+        for (j = 0; j < w; j++) {
+            s += pix[0];
+            pix ++;
+        }
+        pix += line_size - w;
+    }
+    return s;
+}
+
+//near copy & paste from dsputil, FIXME
+static int pix_norm1(uint8_t * pix, int line_size, int w)
+{
+    int s, i, j;
+    uint32_t *sq = ff_square_tab + 256;
+
+    s = 0;
+    for (i = 0; i < w; i++) {
+        for (j = 0; j < w; j ++) {
+            s += sq[pix[0]];
+            pix ++;
+        }
+        pix += line_size - w;
+    }
+    return s;
+}
+
+static inline int get_penalty_factor(int lambda, int lambda2, int type){
+    switch(type&0xFF){
+    default:
+    case FF_CMP_SAD:
+        return lambda>>FF_LAMBDA_SHIFT;
+    case FF_CMP_DCT:
+        return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
+    case FF_CMP_W53:
+        return (4*lambda)>>(FF_LAMBDA_SHIFT);
+    case FF_CMP_W97:
+        return (2*lambda)>>(FF_LAMBDA_SHIFT);
+    case FF_CMP_SATD:
+    case FF_CMP_DCT264:
+        return (2*lambda)>>FF_LAMBDA_SHIFT;
+    case FF_CMP_RD:
+    case FF_CMP_PSNR:
+    case FF_CMP_SSE:
+    case FF_CMP_NSSE:
+        return lambda2>>FF_LAMBDA_SHIFT;
+    case FF_CMP_BIT:
+        return 1;
+    }
+}
+
+//FIXME copy&paste
+#define P_LEFT P[1]
+#define P_TOP P[2]
+#define P_TOPRIGHT P[3]
+#define P_MEDIAN P[4]
+#define P_MV1 P[9]
+#define FLAG_QPEL   1 //must be 1
+
+static int encode_q_branch(SnowContext *s, int level, int x, int y){
+    uint8_t p_buffer[1024];
+    uint8_t i_buffer[1024];
+    uint8_t p_state[sizeof(s->block_state)];
+    uint8_t i_state[sizeof(s->block_state)];
+    RangeCoder pc, ic;
+    uint8_t *pbbak= s->c.bytestream;
+    uint8_t *pbbak_start= s->c.bytestream_start;
+    int score, score2, iscore, i_len, p_len, block_s, sum, base_bits;
+    const int w= s->b_width  << s->block_max_depth;
+    const int h= s->b_height << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    const int block_w= 1<<(LOG2_MB_SIZE - level);
+    int trx= (x+1)<<rem_depth;
+    int try= (y+1)<<rem_depth;
+    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
+    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
+    const BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
+    const BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
+    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
+    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
+    int pl = left->color[0];
+    int pcb= left->color[1];
+    int pcr= left->color[2];
+    int pmx, pmy;
+    int mx=0, my=0;
+    int l,cr,cb;
+    const int stride= s->current_picture->linesize[0];
+    const int uvstride= s->current_picture->linesize[1];
+    uint8_t *current_data[3]= { s->input_picture->data[0] + (x + y*  stride)*block_w,
+                                s->input_picture->data[1] + ((x*block_w)>>s->chroma_h_shift) + ((y*uvstride*block_w)>>s->chroma_v_shift),
+                                s->input_picture->data[2] + ((x*block_w)>>s->chroma_h_shift) + ((y*uvstride*block_w)>>s->chroma_v_shift)};
+    int P[10][2];
+    int16_t last_mv[3][2];
+    int qpel= !!(s->avctx->flags & AV_CODEC_FLAG_QPEL); //unused
+    const int shift= 1+qpel;
+    MotionEstContext *c= &s->m.me;
+    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
+    int mx_context= av_log2(2*FFABS(left->mx - top->mx));
+    int my_context= av_log2(2*FFABS(left->my - top->my));
+    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
+    int ref, best_ref, ref_score, ref_mx, ref_my;
+
+    av_assert0(sizeof(s->block_state) >= 256);
+    if(s->keyframe){
+        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
+        return 0;
+    }
+
+//    clip predictors / edge ?
+
+    P_LEFT[0]= left->mx;
+    P_LEFT[1]= left->my;
+    P_TOP [0]= top->mx;
+    P_TOP [1]= top->my;
+    P_TOPRIGHT[0]= tr->mx;
+    P_TOPRIGHT[1]= tr->my;
+
+    last_mv[0][0]= s->block[index].mx;
+    last_mv[0][1]= s->block[index].my;
+    last_mv[1][0]= right->mx;
+    last_mv[1][1]= right->my;
+    last_mv[2][0]= bottom->mx;
+    last_mv[2][1]= bottom->my;
+
+    s->m.mb_stride=2;
+    s->m.mb_x=
+    s->m.mb_y= 0;
+    c->skip= 0;
+
+    av_assert1(c->  stride ==   stride);
+    av_assert1(c->uvstride == uvstride);
+
+    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
+    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
+    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
+    c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
+
+    c->xmin = - x*block_w - 16+3;
+    c->ymin = - y*block_w - 16+3;
+    c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-3;
+    c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-3;
+
+    if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
+    if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
+    if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
+    if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
+    if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
+    if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
+    if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
+
+    P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+    P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
+
+    if (!y) {
+        c->pred_x= P_LEFT[0];
+        c->pred_y= P_LEFT[1];
+    } else {
+        c->pred_x = P_MEDIAN[0];
+        c->pred_y = P_MEDIAN[1];
+    }
+
+    score= INT_MAX;
+    best_ref= 0;
+    for(ref=0; ref<s->ref_frames; ref++){
+        init_ref(c, current_data, s->last_picture[ref]->data, NULL, block_w*x, block_w*y, 0);
+
+        ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
+                                         (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
+
+        av_assert2(ref_mx >= c->xmin);
+        av_assert2(ref_mx <= c->xmax);
+        av_assert2(ref_my >= c->ymin);
+        av_assert2(ref_my <= c->ymax);
+
+        ref_score= c->sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
+        ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
+        ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
+        if(s->ref_mvs[ref]){
+            s->ref_mvs[ref][index][0]= ref_mx;
+            s->ref_mvs[ref][index][1]= ref_my;
+            s->ref_scores[ref][index]= ref_score;
+        }
+        if(score > ref_score){
+            score= ref_score;
+            best_ref= ref;
+            mx= ref_mx;
+            my= ref_my;
+        }
+    }
+    //FIXME if mb_cmp != SSE then intra cannot be compared currently and mb_penalty vs. lambda2
+
+  //  subpel search
+    base_bits= get_rac_count(&s->c) - 8*(s->c.bytestream - s->c.bytestream_start);
+    pc= s->c;
+    pc.bytestream_start=
+    pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
+    memcpy(p_state, s->block_state, sizeof(s->block_state));
+
+    if(level!=s->block_max_depth)
+        put_rac(&pc, &p_state[4 + s_context], 1);
+    put_rac(&pc, &p_state[1 + left->type + top->type], 0);
+    if(s->ref_frames > 1)
+        put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
+    pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
+    put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
+    put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
+    p_len= pc.bytestream - pc.bytestream_start;
+    score += (s->lambda2*(get_rac_count(&pc)-base_bits))>>FF_LAMBDA_SHIFT;
+
+    block_s= block_w*block_w;
+    sum = pix_sum(current_data[0], stride, block_w, block_w);
+    l= (sum + block_s/2)/block_s;
+    iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
+
+    if (s->nb_planes > 2) {
+        block_s= block_w*block_w>>(s->chroma_h_shift + s->chroma_v_shift);
+        sum = pix_sum(current_data[1], uvstride, block_w>>s->chroma_h_shift, block_w>>s->chroma_v_shift);
+        cb= (sum + block_s/2)/block_s;
+    //    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
+        sum = pix_sum(current_data[2], uvstride, block_w>>s->chroma_h_shift, block_w>>s->chroma_v_shift);
+        cr= (sum + block_s/2)/block_s;
+    //    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
+    }else
+        cb = cr = 0;
+
+    ic= s->c;
+    ic.bytestream_start=
+    ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
+    memcpy(i_state, s->block_state, sizeof(s->block_state));
+    if(level!=s->block_max_depth)
+        put_rac(&ic, &i_state[4 + s_context], 1);
+    put_rac(&ic, &i_state[1 + left->type + top->type], 1);
+    put_symbol(&ic, &i_state[32],  l-pl , 1);
+    if (s->nb_planes > 2) {
+        put_symbol(&ic, &i_state[64], cb-pcb, 1);
+        put_symbol(&ic, &i_state[96], cr-pcr, 1);
+    }
+    i_len= ic.bytestream - ic.bytestream_start;
+    iscore += (s->lambda2*(get_rac_count(&ic)-base_bits))>>FF_LAMBDA_SHIFT;
+
+    av_assert1(iscore < 255*255*256 + s->lambda2*10);
+    av_assert1(iscore >= 0);
+    av_assert1(l>=0 && l<=255);
+    av_assert1(pl>=0 && pl<=255);
+
+    if(level==0){
+        int varc= iscore >> 8;
+        int vard= score >> 8;
+        if (vard <= 64 || vard < varc)
+            c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
+        else
+            c->scene_change_score+= s->m.qscale;
+    }
+
+    if(level!=s->block_max_depth){
+        put_rac(&s->c, &s->block_state[4 + s_context], 0);
+        score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
+        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
+        score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
+        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
+        score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
+
+        if(score2 < score && score2 < iscore)
+            return score2;
+    }
+
+    if(iscore < score){
+        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
+        memcpy(pbbak, i_buffer, i_len);
+        s->c= ic;
+        s->c.bytestream_start= pbbak_start;
+        s->c.bytestream= pbbak + i_len;
+        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
+        memcpy(s->block_state, i_state, sizeof(s->block_state));
+        return iscore;
+    }else{
+        memcpy(pbbak, p_buffer, p_len);
+        s->c= pc;
+        s->c.bytestream_start= pbbak_start;
+        s->c.bytestream= pbbak + p_len;
+        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
+        memcpy(s->block_state, p_state, sizeof(s->block_state));
+        return score;
+    }
+}
+
+static void encode_q_branch2(SnowContext *s, int level, int x, int y){
+    const int w= s->b_width  << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    int trx= (x+1)<<rem_depth;
+    BlockNode *b= &s->block[index];
+    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
+    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
+    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
+    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
+    int pl = left->color[0];
+    int pcb= left->color[1];
+    int pcr= left->color[2];
+    int pmx, pmy;
+    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
+    int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
+    int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
+    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
+
+    if(s->keyframe){
+        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
+        return;
+    }
+
+    if(level!=s->block_max_depth){
+        if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
+            put_rac(&s->c, &s->block_state[4 + s_context], 1);
+        }else{
+            put_rac(&s->c, &s->block_state[4 + s_context], 0);
+            encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
+            encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
+            encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
+            encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
+            return;
+        }
+    }
+    if(b->type & BLOCK_INTRA){
+        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
+        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
+        put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
+        if (s->nb_planes > 2) {
+            put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
+            put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
+        }
+        set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
+    }else{
+        pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
+        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
+        if(s->ref_frames > 1)
+            put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
+        put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
+        put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
+        set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
+    }
+}
+
+static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
+    int i, x2, y2;
+    Plane *p= &s->plane[plane_index];
+    const int block_size = MB_SIZE >> s->block_max_depth;
+    const int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    const int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *src= s-> input_picture->data[plane_index];
+    IDWTELEM *dst= (IDWTELEM*)s->m.sc.obmc_scratchpad + plane_index*block_size*block_size*4; //FIXME change to unsigned
+    const int b_stride = s->b_width << s->block_max_depth;
+    const int w= p->width;
+    const int h= p->height;
+    int index= mb_x + mb_y*b_stride;
+    BlockNode *b= &s->block[index];
+    BlockNode backup= *b;
+    int ab=0;
+    int aa=0;
+
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); //obmc stuff above
+
+    b->type|= BLOCK_INTRA;
+    b->color[plane_index]= 0;
+    memset(dst, 0, obmc_stride*obmc_stride*sizeof(IDWTELEM));
+
+    for(i=0; i<4; i++){
+        int mb_x2= mb_x + (i &1) - 1;
+        int mb_y2= mb_y + (i>>1) - 1;
+        int x= block_w*mb_x2 + block_w/2;
+        int y= block_h*mb_y2 + block_h/2;
+
+        add_yblock(s, 0, NULL, dst + (i&1)*block_w + (i>>1)*obmc_stride*block_h, NULL, obmc,
+                    x, y, block_w, block_h, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
+
+        for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_h); y2++){
+            for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
+                int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_h*mb_y - block_h/2))*obmc_stride;
+                int obmc_v= obmc[index];
+                int d;
+                if(y<0) obmc_v += obmc[index + block_h*obmc_stride];
+                if(x<0) obmc_v += obmc[index + block_w];
+                if(y+block_h>h) obmc_v += obmc[index - block_h*obmc_stride];
+                if(x+block_w>w) obmc_v += obmc[index - block_w];
+                //FIXME precalculate this or simplify it somehow else
+
+                d = -dst[index] + (1<<(FRAC_BITS-1));
+                dst[index] = d;
+                ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
+                aa += obmc_v * obmc_v; //FIXME precalculate this
+            }
+        }
+    }
+    *b= backup;
+
+    return av_clip_uint8( ROUNDED_DIV(ab<<LOG2_OBMC_MAX, aa) ); //FIXME we should not need clipping
+}
+
+static inline int get_block_bits(SnowContext *s, int x, int y, int w){
+    const int b_stride = s->b_width << s->block_max_depth;
+    const int b_height = s->b_height<< s->block_max_depth;
+    int index= x + y*b_stride;
+    const BlockNode *b     = &s->block[index];
+    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
+    const BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
+    const BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
+    const BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
+    int dmx, dmy;
+//  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
+//  int my_context= av_log2(2*FFABS(left->my - top->my));
+
+    if(x<0 || x>=b_stride || y>=b_height)
+        return 0;
+/*
+1            0      0
+01X          1-2    1
+001XX        3-6    2-3
+0001XXX      7-14   4-7
+00001XXXX   15-30   8-15
+*/
+//FIXME try accurate rate
+//FIXME intra and inter predictors if surrounding blocks are not the same type
+    if(b->type & BLOCK_INTRA){
+        return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
+                   + av_log2(2*FFABS(left->color[1] - b->color[1]))
+                   + av_log2(2*FFABS(left->color[2] - b->color[2])));
+    }else{
+        pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
+        dmx-= b->mx;
+        dmy-= b->my;
+        return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
+                    + av_log2(2*FFABS(dmy))
+                    + av_log2(2*b->ref));
+    }
+}
+
+static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, uint8_t (*obmc_edged)[MB_SIZE * 2]){
+    Plane *p= &s->plane[plane_index];
+    const int block_size = MB_SIZE >> s->block_max_depth;
+    const int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    const int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *dst= s->current_picture->data[plane_index];
+    uint8_t *src= s->  input_picture->data[plane_index];
+    IDWTELEM *pred= (IDWTELEM*)s->m.sc.obmc_scratchpad + plane_index*block_size*block_size*4;
+    uint8_t *cur = s->scratchbuf;
+    uint8_t *tmp = s->emu_edge_buffer;
+    const int b_stride = s->b_width << s->block_max_depth;
+    const int b_height = s->b_height<< s->block_max_depth;
+    const int w= p->width;
+    const int h= p->height;
+    int distortion;
+    int rate= 0;
+    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
+    int sx= block_w*mb_x - block_w/2;
+    int sy= block_h*mb_y - block_h/2;
+    int x0= FFMAX(0,-sx);
+    int y0= FFMAX(0,-sy);
+    int x1= FFMIN(block_w*2, w-sx);
+    int y1= FFMIN(block_h*2, h-sy);
+    int i,x,y;
+
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); //obmc and square assumtions below chckinhg only block_w
+
+    ff_snow_pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_h*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
+
+    for(y=y0; y<y1; y++){
+        const uint8_t *obmc1= obmc_edged[y];
+        const IDWTELEM *pred1 = pred + y*obmc_stride;
+        uint8_t *cur1 = cur + y*ref_stride;
+        uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
+        for(x=x0; x<x1; x++){
+#if FRAC_BITS >= LOG2_OBMC_MAX
+            int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
+#else
+            int v = (cur1[x] * obmc1[x] + (1<<(LOG2_OBMC_MAX - FRAC_BITS-1))) >> (LOG2_OBMC_MAX - FRAC_BITS);
+#endif
+            v = (v + pred1[x]) >> FRAC_BITS;
+            if(v&(~255)) v= ~(v>>31);
+            dst1[x] = v;
+        }
+    }
+
+    /* copy the regions where obmc[] = (uint8_t)256 */
+    if(LOG2_OBMC_MAX == 8
+        && (mb_x == 0 || mb_x == b_stride-1)
+        && (mb_y == 0 || mb_y == b_height-1)){
+        if(mb_x == 0)
+            x1 = block_w;
+        else
+            x0 = block_w;
+        if(mb_y == 0)
+            y1 = block_h;
+        else
+            y0 = block_h;
+        for(y=y0; y<y1; y++)
+            memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
+    }
+
+    if(block_w==16){
+        /* FIXME rearrange dsputil to fit 32x32 cmp functions */
+        /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
+        /* FIXME cmps overlap but do not cover the wavelet's whole support.
+         * So improving the score of one block is not strictly guaranteed
+         * to improve the score of the whole frame, thus iterative motion
+         * estimation does not always converge. */
+        if(s->avctx->me_cmp == FF_CMP_W97)
+            distortion = ff_w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
+        else if(s->avctx->me_cmp == FF_CMP_W53)
+            distortion = ff_w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
+        else{
+            distortion = 0;
+            for(i=0; i<4; i++){
+                int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
+                distortion += s->mecc.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
+            }
+        }
+    }else{
+        av_assert2(block_w==8);
+        distortion = s->mecc.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
+    }
+
+    if(plane_index==0){
+        for(i=0; i<4; i++){
+/* ..RRr
+ * .RXx.
+ * rxx..
+ */
+            rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
+        }
+        if(mb_x == b_stride-2)
+            rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
+    }
+    return distortion + rate*penalty_factor;
+}
+
+static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
+    int i, y2;
+    Plane *p= &s->plane[plane_index];
+    const int block_size = MB_SIZE >> s->block_max_depth;
+    const int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    const int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *dst= s->current_picture->data[plane_index];
+    uint8_t *src= s-> input_picture->data[plane_index];
+    //FIXME zero_dst is const but add_yblock changes dst if add is 0 (this is never the case for dst=zero_dst
+    // const has only been removed from zero_dst to suppress a warning
+    static IDWTELEM zero_dst[4096]; //FIXME
+    const int b_stride = s->b_width << s->block_max_depth;
+    const int w= p->width;
+    const int h= p->height;
+    int distortion= 0;
+    int rate= 0;
+    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
+
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); //obmc and square assumtions below
+
+    for(i=0; i<9; i++){
+        int mb_x2= mb_x + (i%3) - 1;
+        int mb_y2= mb_y + (i/3) - 1;
+        int x= block_w*mb_x2 + block_w/2;
+        int y= block_h*mb_y2 + block_h/2;
+
+        add_yblock(s, 0, NULL, zero_dst, dst, obmc,
+                   x, y, block_w, block_h, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
+
+        //FIXME find a cleaner/simpler way to skip the outside stuff
+        for(y2= y; y2<0; y2++)
+            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
+        for(y2= h; y2<y+block_h; y2++)
+            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
+        if(x<0){
+            for(y2= y; y2<y+block_h; y2++)
+                memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
+        }
+        if(x+block_w > w){
+            for(y2= y; y2<y+block_h; y2++)
+                memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
+        }
+
+        av_assert1(block_w== 8 || block_w==16);
+        distortion += s->mecc.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_h);
+    }
+
+    if(plane_index==0){
+        BlockNode *b= &s->block[mb_x+mb_y*b_stride];
+        int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
+
+/* ..RRRr
+ * .RXXx.
+ * .RXXx.
+ * rxxx.
+ */
+        if(merged)
+            rate = get_block_bits(s, mb_x, mb_y, 2);
+        for(i=merged?4:0; i<9; i++){
+            static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
+            rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
+        }
+    }
+    return distortion + rate*penalty_factor;
+}
+
+static int encode_subband_c0run(SnowContext *s, SubBand *b, const IDWTELEM *src, const IDWTELEM *parent, int stride, int orientation){
+    const int w= b->width;
+    const int h= b->height;
+    int x, y;
+
+    if(1){
+        int run=0;
+        int *runs = s->run_buffer;
+        int run_index=0;
+        int max_index;
+
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int v, p=0;
+                int /*ll=0, */l=0, lt=0, t=0, rt=0;
+                v= src[x + y*stride];
+
+                if(y){
+                    t= src[x + (y-1)*stride];
+                    if(x){
+                        lt= src[x - 1 + (y-1)*stride];
+                    }
+                    if(x + 1 < w){
+                        rt= src[x + 1 + (y-1)*stride];
+                    }
+                }
+                if(x){
+                    l= src[x - 1 + y*stride];
+                    /*if(x > 1){
+                        if(orientation==1) ll= src[y + (x-2)*stride];
+                        else               ll= src[x - 2 + y*stride];
+                    }*/
+                }
+                if(parent){
+                    int px= x>>1;
+                    int py= y>>1;
+                    if(px<b->parent->width && py<b->parent->height)
+                        p= parent[px + py*2*stride];
+                }
+                if(!(/*ll|*/l|lt|t|rt|p)){
+                    if(v){
+                        runs[run_index++]= run;
+                        run=0;
+                    }else{
+                        run++;
+                    }
+                }
+            }
+        }
+        max_index= run_index;
+        runs[run_index++]= run;
+        run_index=0;
+        run= runs[run_index++];
+
+        put_symbol2(&s->c, b->state[30], max_index, 0);
+        if(run_index <= max_index)
+            put_symbol2(&s->c, b->state[1], run, 3);
+
+        for(y=0; y<h; y++){
+            if(s->c.bytestream_end - s->c.bytestream < w*40){
+                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                return -1;
+            }
+            for(x=0; x<w; x++){
+                int v, p=0;
+                int /*ll=0, */l=0, lt=0, t=0, rt=0;
+                v= src[x + y*stride];
+
+                if(y){
+                    t= src[x + (y-1)*stride];
+                    if(x){
+                        lt= src[x - 1 + (y-1)*stride];
+                    }
+                    if(x + 1 < w){
+                        rt= src[x + 1 + (y-1)*stride];
+                    }
+                }
+                if(x){
+                    l= src[x - 1 + y*stride];
+                    /*if(x > 1){
+                        if(orientation==1) ll= src[y + (x-2)*stride];
+                        else               ll= src[x - 2 + y*stride];
+                    }*/
+                }
+                if(parent){
+                    int px= x>>1;
+                    int py= y>>1;
+                    if(px<b->parent->width && py<b->parent->height)
+                        p= parent[px + py*2*stride];
+                }
+                if(/*ll|*/l|lt|t|rt|p){
+                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
+
+                    put_rac(&s->c, &b->state[0][context], !!v);
+                }else{
+                    if(!run){
+                        run= runs[run_index++];
+
+                        if(run_index <= max_index)
+                            put_symbol2(&s->c, b->state[1], run, 3);
+                        av_assert2(v);
+                    }else{
+                        run--;
+                        av_assert2(!v);
+                    }
+                }
+                if(v){
+                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
+                    int l2= 2*FFABS(l) + (l<0);
+                    int t2= 2*FFABS(t) + (t<0);
+
+                    put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
+                    put_rac(&s->c, &b->state[0][16 + 1 + 3 + ff_quant3bA[l2&0xFF] + 3*ff_quant3bA[t2&0xFF]], v<0);
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+static int encode_subband(SnowContext *s, SubBand *b, const IDWTELEM *src, const IDWTELEM *parent, int stride, int orientation){
+//    encode_subband_qtree(s, b, src, parent, stride, orientation);
+//    encode_subband_z0run(s, b, src, parent, stride, orientation);
+    return encode_subband_c0run(s, b, src, parent, stride, orientation);
+//    encode_subband_dzr(s, b, src, parent, stride, orientation);
+}
+
+static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, uint8_t (*obmc_edged)[MB_SIZE * 2], int *best_rd){
+    const int b_stride= s->b_width << s->block_max_depth;
+    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
+    BlockNode backup= *block;
+    unsigned value;
+    int rd, index;
+
+    av_assert2(mb_x>=0 && mb_y>=0);
+    av_assert2(mb_x<b_stride);
+
+    if(intra){
+        block->color[0] = p[0];
+        block->color[1] = p[1];
+        block->color[2] = p[2];
+        block->type |= BLOCK_INTRA;
+    }else{
+        index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
+        value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
+        if(s->me_cache[index] == value)
+            return 0;
+        s->me_cache[index]= value;
+
+        block->mx= p[0];
+        block->my= p[1];
+        block->type &= ~BLOCK_INTRA;
+    }
+
+    rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged) + s->intra_penalty * !!intra;
+
+//FIXME chroma
+    if(rd < *best_rd){
+        *best_rd= rd;
+        return 1;
+    }else{
+        *block= backup;
+        return 0;
+    }
+}
+
+/* special case for int[2] args we discard afterwards,
+ * fixes compilation problem with gcc 2.95 */
+static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, uint8_t (*obmc_edged)[MB_SIZE * 2], int *best_rd){
+    int p[2] = {p0, p1};
+    return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
+}
+
+static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
+    const int b_stride= s->b_width << s->block_max_depth;
+    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
+    BlockNode backup[4];
+    unsigned value;
+    int rd, index;
+
+    /* We don't initialize backup[] during variable declaration, because
+     * that fails to compile on MSVC: "cannot convert from 'BlockNode' to
+     * 'int16_t'". */
+    backup[0] = block[0];
+    backup[1] = block[1];
+    backup[2] = block[b_stride];
+    backup[3] = block[b_stride + 1];
+
+    av_assert2(mb_x>=0 && mb_y>=0);
+    av_assert2(mb_x<b_stride);
+    av_assert2(((mb_x|mb_y)&1) == 0);
+
+    index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
+    value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
+    if(s->me_cache[index] == value)
+        return 0;
+    s->me_cache[index]= value;
+
+    block->mx= p0;
+    block->my= p1;
+    block->ref= ref;
+    block->type &= ~BLOCK_INTRA;
+    block[1]= block[b_stride]= block[b_stride+1]= *block;
+
+    rd= get_4block_rd(s, mb_x, mb_y, 0);
+
+//FIXME chroma
+    if(rd < *best_rd){
+        *best_rd= rd;
+        return 1;
+    }else{
+        block[0]= backup[0];
+        block[1]= backup[1];
+        block[b_stride]= backup[2];
+        block[b_stride+1]= backup[3];
+        return 0;
+    }
+}
+
+static void iterative_me(SnowContext *s){
+    int pass, mb_x, mb_y;
+    const int b_width = s->b_width  << s->block_max_depth;
+    const int b_height= s->b_height << s->block_max_depth;
+    const int b_stride= b_width;
+    int color[3];
+
+    {
+        RangeCoder r = s->c;
+        uint8_t state[sizeof(s->block_state)];
+        memcpy(state, s->block_state, sizeof(s->block_state));
+        for(mb_y= 0; mb_y<s->b_height; mb_y++)
+            for(mb_x= 0; mb_x<s->b_width; mb_x++)
+                encode_q_branch(s, 0, mb_x, mb_y);
+        s->c = r;
+        memcpy(s->block_state, state, sizeof(s->block_state));
+    }
+
+    for(pass=0; pass<25; pass++){
+        int change= 0;
+
+        for(mb_y= 0; mb_y<b_height; mb_y++){
+            for(mb_x= 0; mb_x<b_width; mb_x++){
+                int dia_change, i, j, ref;
+                int best_rd= INT_MAX, ref_rd;
+                BlockNode backup, ref_b;
+                const int index= mb_x + mb_y * b_stride;
+                BlockNode *block= &s->block[index];
+                BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
+                BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
+                BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
+                BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
+                BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
+                BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
+                BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
+                BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
+                const int b_w= (MB_SIZE >> s->block_max_depth);
+                uint8_t obmc_edged[MB_SIZE * 2][MB_SIZE * 2];
+
+                if(pass && (block->type & BLOCK_OPT))
+                    continue;
+                block->type |= BLOCK_OPT;
+
+                backup= *block;
+
+                if(!s->me_cache_generation)
+                    memset(s->me_cache, 0, sizeof(s->me_cache));
+                s->me_cache_generation += 1<<22;
+
+                //FIXME precalculate
+                {
+                    int x, y;
+                    for (y = 0; y < b_w * 2; y++)
+                        memcpy(obmc_edged[y], ff_obmc_tab[s->block_max_depth] + y * b_w * 2, b_w * 2);
+                    if(mb_x==0)
+                        for(y=0; y<b_w*2; y++)
+                            memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
+                    if(mb_x==b_stride-1)
+                        for(y=0; y<b_w*2; y++)
+                            memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
+                    if(mb_y==0){
+                        for(x=0; x<b_w*2; x++)
+                            obmc_edged[0][x] += obmc_edged[b_w-1][x];
+                        for(y=1; y<b_w; y++)
+                            memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
+                    }
+                    if(mb_y==b_height-1){
+                        for(x=0; x<b_w*2; x++)
+                            obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
+                        for(y=b_w; y<b_w*2-1; y++)
+                            memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
+                    }
+                }
+
+                //skip stuff outside the picture
+                if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1){
+                    uint8_t *src= s->  input_picture->data[0];
+                    uint8_t *dst= s->current_picture->data[0];
+                    const int stride= s->current_picture->linesize[0];
+                    const int block_w= MB_SIZE >> s->block_max_depth;
+                    const int block_h= MB_SIZE >> s->block_max_depth;
+                    const int sx= block_w*mb_x - block_w/2;
+                    const int sy= block_h*mb_y - block_h/2;
+                    const int w= s->plane[0].width;
+                    const int h= s->plane[0].height;
+                    int y;
+
+                    for(y=sy; y<0; y++)
+                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
+                    for(y=h; y<sy+block_h*2; y++)
+                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
+                    if(sx<0){
+                        for(y=sy; y<sy+block_h*2; y++)
+                            memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
+                    }
+                    if(sx+block_w*2 > w){
+                        for(y=sy; y<sy+block_h*2; y++)
+                            memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
+                    }
+                }
+
+                // intra(black) = neighbors' contribution to the current block
+                for(i=0; i < s->nb_planes; i++)
+                    color[i]= get_dc(s, mb_x, mb_y, i);
+
+                // get previous score (cannot be cached due to OBMC)
+                if(pass > 0 && (block->type&BLOCK_INTRA)){
+                    int color0[3]= {block->color[0], block->color[1], block->color[2]};
+                    check_block(s, mb_x, mb_y, color0, 1, obmc_edged, &best_rd);
+                }else
+                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, obmc_edged, &best_rd);
+
+                ref_b= *block;
+                ref_rd= best_rd;
+                for(ref=0; ref < s->ref_frames; ref++){
+                    int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
+                    if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
+                        continue;
+                    block->ref= ref;
+                    best_rd= INT_MAX;
+
+                    check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], obmc_edged, &best_rd);
+                    check_block_inter(s, mb_x, mb_y, 0, 0, obmc_edged, &best_rd);
+                    if(tb)
+                        check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], obmc_edged, &best_rd);
+                    if(lb)
+                        check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], obmc_edged, &best_rd);
+                    if(rb)
+                        check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], obmc_edged, &best_rd);
+                    if(bb)
+                        check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], obmc_edged, &best_rd);
+
+                    /* fullpel ME */
+                    //FIXME avoid subpel interpolation / round to nearest integer
+                    do{
+                        int newx = block->mx;
+                        int newy = block->my;
+                        int dia_size = s->iterative_dia_size ? s->iterative_dia_size : FFMAX(s->avctx->dia_size, 1);
+                        dia_change=0;
+                        for(i=0; i < dia_size; i++){
+                            for(j=0; j<i; j++){
+                                dia_change |= check_block_inter(s, mb_x, mb_y, newx+4*(i-j), newy+(4*j), obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, newx-4*(i-j), newy-(4*j), obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, newx-(4*j), newy+4*(i-j), obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, newx+(4*j), newy-4*(i-j), obmc_edged, &best_rd);
+                            }
+                        }
+                    }while(dia_change);
+                    /* subpel ME */
+                    do{
+                        static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
+                        dia_change=0;
+                        for(i=0; i<8; i++)
+                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], obmc_edged, &best_rd);
+                    }while(dia_change);
+                    //FIXME or try the standard 2 pass qpel or similar
+
+                    mvr[0][0]= block->mx;
+                    mvr[0][1]= block->my;
+                    if(ref_rd > best_rd){
+                        ref_rd= best_rd;
+                        ref_b= *block;
+                    }
+                }
+                best_rd= ref_rd;
+                *block= ref_b;
+                check_block(s, mb_x, mb_y, color, 1, obmc_edged, &best_rd);
+                //FIXME RD style color selection
+                if(!same_block(block, &backup)){
+                    if(tb ) tb ->type &= ~BLOCK_OPT;
+                    if(lb ) lb ->type &= ~BLOCK_OPT;
+                    if(rb ) rb ->type &= ~BLOCK_OPT;
+                    if(bb ) bb ->type &= ~BLOCK_OPT;
+                    if(tlb) tlb->type &= ~BLOCK_OPT;
+                    if(trb) trb->type &= ~BLOCK_OPT;
+                    if(blb) blb->type &= ~BLOCK_OPT;
+                    if(brb) brb->type &= ~BLOCK_OPT;
+                    change ++;
+                }
+            }
+        }
+        av_log(s->avctx, AV_LOG_DEBUG, "pass:%d changed:%d\n", pass, change);
+        if(!change)
+            break;
+    }
+
+    if(s->block_max_depth == 1){
+        int change= 0;
+        for(mb_y= 0; mb_y<b_height; mb_y+=2){
+            for(mb_x= 0; mb_x<b_width; mb_x+=2){
+                int i;
+                int best_rd, init_rd;
+                const int index= mb_x + mb_y * b_stride;
+                BlockNode *b[4];
+
+                b[0]= &s->block[index];
+                b[1]= b[0]+1;
+                b[2]= b[0]+b_stride;
+                b[3]= b[2]+1;
+                if(same_block(b[0], b[1]) &&
+                   same_block(b[0], b[2]) &&
+                   same_block(b[0], b[3]))
+                    continue;
+
+                if(!s->me_cache_generation)
+                    memset(s->me_cache, 0, sizeof(s->me_cache));
+                s->me_cache_generation += 1<<22;
+
+                init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
+
+                //FIXME more multiref search?
+                check_4block_inter(s, mb_x, mb_y,
+                                   (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
+                                   (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
+
+                for(i=0; i<4; i++)
+                    if(!(b[i]->type&BLOCK_INTRA))
+                        check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
+
+                if(init_rd != best_rd)
+                    change++;
+            }
+        }
+        av_log(s->avctx, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
+    }
+}
+
+static void encode_blocks(SnowContext *s, int search){
+    int x, y;
+    int w= s->b_width;
+    int h= s->b_height;
+
+    if(s->motion_est == FF_ME_ITER && !s->keyframe && search)
+        iterative_me(s);
+
+    for(y=0; y<h; y++){
+        if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
+            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+            return;
+        }
+        for(x=0; x<w; x++){
+            if(s->motion_est == FF_ME_ITER || !search)
+                encode_q_branch2(s, 0, x, y);
+            else
+                encode_q_branch (s, 0, x, y);
+        }
+    }
+}
+
+static void quantize(SnowContext *s, SubBand *b, IDWTELEM *dst, DWTELEM *src, int stride, int bias){
+    const int w= b->width;
+    const int h= b->height;
+    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
+    const int qmul= ff_qexp[qlog&(QROOT-1)]<<((qlog>>QSHIFT) + ENCODER_EXTRA_BITS);
+    int x,y, thres1, thres2;
+
+    if(s->qlog == LOSSLESS_QLOG){
+        for(y=0; y<h; y++)
+            for(x=0; x<w; x++)
+                dst[x + y*stride]= src[x + y*stride];
+        return;
+    }
+
+    bias= bias ? 0 : (3*qmul)>>3;
+    thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
+    thres2= 2*thres1;
+
+    if(!bias){
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= src[x + y*stride];
+
+                if((unsigned)(i+thres1) > thres2){
+                    if(i>=0){
+                        i<<= QEXPSHIFT;
+                        i/= qmul; //FIXME optimize
+                        dst[x + y*stride]=  i;
+                    }else{
+                        i= -i;
+                        i<<= QEXPSHIFT;
+                        i/= qmul; //FIXME optimize
+                        dst[x + y*stride]= -i;
+                    }
+                }else
+                    dst[x + y*stride]= 0;
+            }
+        }
+    }else{
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= src[x + y*stride];
+
+                if((unsigned)(i+thres1) > thres2){
+                    if(i>=0){
+                        i<<= QEXPSHIFT;
+                        i= (i + bias) / qmul; //FIXME optimize
+                        dst[x + y*stride]=  i;
+                    }else{
+                        i= -i;
+                        i<<= QEXPSHIFT;
+                        i= (i + bias) / qmul; //FIXME optimize
+                        dst[x + y*stride]= -i;
+                    }
+                }else
+                    dst[x + y*stride]= 0;
+            }
+        }
+    }
+}
+
+static void dequantize(SnowContext *s, SubBand *b, IDWTELEM *src, int stride){
+    const int w= b->width;
+    const int h= b->height;
+    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
+    const int qmul= ff_qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
+    int x,y;
+
+    if(s->qlog == LOSSLESS_QLOG) return;
+
+    for(y=0; y<h; y++){
+        for(x=0; x<w; x++){
+            int i= src[x + y*stride];
+            if(i<0){
+                src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
+            }else if(i>0){
+                src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
+            }
+        }
+    }
+}
+
+static void decorrelate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
+    const int w= b->width;
+    const int h= b->height;
+    int x,y;
+
+    for(y=h-1; y>=0; y--){
+        for(x=w-1; x>=0; x--){
+            int i= x + y*stride;
+
+            if(x){
+                if(use_median){
+                    if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
+                    else  src[i] -= src[i - 1];
+                }else{
+                    if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
+                    else  src[i] -= src[i - 1];
+                }
+            }else{
+                if(y) src[i] -= src[i - stride];
+            }
+        }
+    }
+}
+
+static void correlate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
+    const int w= b->width;
+    const int h= b->height;
+    int x,y;
+
+    for(y=0; y<h; y++){
+        for(x=0; x<w; x++){
+            int i= x + y*stride;
+
+            if(x){
+                if(use_median){
+                    if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
+                    else  src[i] += src[i - 1];
+                }else{
+                    if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
+                    else  src[i] += src[i - 1];
+                }
+            }else{
+                if(y) src[i] += src[i - stride];
+            }
+        }
+    }
+}
+
+static void encode_qlogs(SnowContext *s){
+    int plane_index, level, orientation;
+
+    for(plane_index=0; plane_index<FFMIN(s->nb_planes, 2); plane_index++){
+        for(level=0; level<s->spatial_decomposition_count; level++){
+            for(orientation=level ? 1:0; orientation<4; orientation++){
+                if(orientation==2) continue;
+                put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
+            }
+        }
+    }
+}
+
+static void encode_header(SnowContext *s){
+    int plane_index, i;
+    uint8_t kstate[32];
+
+    memset(kstate, MID_STATE, sizeof(kstate));
+
+    put_rac(&s->c, kstate, s->keyframe);
+    if(s->keyframe || s->always_reset){
+        ff_snow_reset_contexts(s);
+        s->last_spatial_decomposition_type=
+        s->last_qlog=
+        s->last_qbias=
+        s->last_mv_scale=
+        s->last_block_max_depth= 0;
+        for(plane_index=0; plane_index<2; plane_index++){
+            Plane *p= &s->plane[plane_index];
+            p->last_htaps=0;
+            p->last_diag_mc=0;
+            memset(p->last_hcoeff, 0, sizeof(p->last_hcoeff));
+        }
+    }
+    if(s->keyframe){
+        put_symbol(&s->c, s->header_state, s->version, 0);
+        put_rac(&s->c, s->header_state, s->always_reset);
+        put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
+        put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
+        put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
+        put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
+        if (s->nb_planes > 2) {
+            put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
+            put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
+        }
+        put_rac(&s->c, s->header_state, s->spatial_scalability);
+//        put_rac(&s->c, s->header_state, s->rate_scalability);
+        put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
+
+        encode_qlogs(s);
+    }
+
+    if(!s->keyframe){
+        int update_mc=0;
+        for(plane_index=0; plane_index<FFMIN(s->nb_planes, 2); plane_index++){
+            Plane *p= &s->plane[plane_index];
+            update_mc |= p->last_htaps   != p->htaps;
+            update_mc |= p->last_diag_mc != p->diag_mc;
+            update_mc |= !!memcmp(p->last_hcoeff, p->hcoeff, sizeof(p->hcoeff));
+        }
+        put_rac(&s->c, s->header_state, update_mc);
+        if(update_mc){
+            for(plane_index=0; plane_index<FFMIN(s->nb_planes, 2); plane_index++){
+                Plane *p= &s->plane[plane_index];
+                put_rac(&s->c, s->header_state, p->diag_mc);
+                put_symbol(&s->c, s->header_state, p->htaps/2-1, 0);
+                for(i= p->htaps/2; i; i--)
+                    put_symbol(&s->c, s->header_state, FFABS(p->hcoeff[i]), 0);
+            }
+        }
+        if(s->last_spatial_decomposition_count != s->spatial_decomposition_count){
+            put_rac(&s->c, s->header_state, 1);
+            put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
+            encode_qlogs(s);
+        }else
+            put_rac(&s->c, s->header_state, 0);
+    }
+
+    put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1);
+    put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
+    put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
+    put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
+    put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
+
+}
+
+static void update_last_header_values(SnowContext *s){
+    int plane_index;
+
+    if(!s->keyframe){
+        for(plane_index=0; plane_index<2; plane_index++){
+            Plane *p= &s->plane[plane_index];
+            p->last_diag_mc= p->diag_mc;
+            p->last_htaps  = p->htaps;
+            memcpy(p->last_hcoeff, p->hcoeff, sizeof(p->hcoeff));
+        }
+    }
+
+    s->last_spatial_decomposition_type  = s->spatial_decomposition_type;
+    s->last_qlog                        = s->qlog;
+    s->last_qbias                       = s->qbias;
+    s->last_mv_scale                    = s->mv_scale;
+    s->last_block_max_depth             = s->block_max_depth;
+    s->last_spatial_decomposition_count = s->spatial_decomposition_count;
+}
+
+static int qscale2qlog(int qscale){
+    return rint(QROOT*log2(qscale / (float)FF_QP2LAMBDA))
+           + 61*QROOT/8; ///< 64 > 60
+}
+
+static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
+{
+    /* Estimate the frame's complexity as a sum of weighted dwt coefficients.
+     * FIXME we know exact mv bits at this point,
+     * but ratecontrol isn't set up to include them. */
+    uint32_t coef_sum= 0;
+    int level, orientation, delta_qlog;
+
+    for(level=0; level<s->spatial_decomposition_count; level++){
+        for(orientation=level ? 1 : 0; orientation<4; orientation++){
+            SubBand *b= &s->plane[0].band[level][orientation];
+            IDWTELEM *buf= b->ibuf;
+            const int w= b->width;
+            const int h= b->height;
+            const int stride= b->stride;
+            const int qlog= av_clip(2*QROOT + b->qlog, 0, QROOT*16);
+            const int qmul= ff_qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+            const int qdiv= (1<<16)/qmul;
+            int x, y;
+            //FIXME this is ugly
+            for(y=0; y<h; y++)
+                for(x=0; x<w; x++)
+                    buf[x+y*stride]= b->buf[x+y*stride];
+            if(orientation==0)
+                decorrelate(s, b, buf, stride, 1, 0);
+            for(y=0; y<h; y++)
+                for(x=0; x<w; x++)
+                    coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
+        }
+    }
+
+    /* ugly, ratecontrol just takes a sqrt again */
+    av_assert0(coef_sum < INT_MAX);
+    coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
+
+    if(pict->pict_type == AV_PICTURE_TYPE_I){
+        s->m.current_picture.mb_var_sum= coef_sum;
+        s->m.current_picture.mc_mb_var_sum= 0;
+    }else{
+        s->m.current_picture.mc_mb_var_sum= coef_sum;
+        s->m.current_picture.mb_var_sum= 0;
+    }
+
+    pict->quality= ff_rate_estimate_qscale(&s->m, 1);
+    if (pict->quality < 0)
+        return INT_MIN;
+    s->lambda= pict->quality * 3/2;
+    delta_qlog= qscale2qlog(pict->quality) - s->qlog;
+    s->qlog+= delta_qlog;
+    return delta_qlog;
+}
+
+static void calculate_visual_weight(SnowContext *s, Plane *p){
+    int width = p->width;
+    int height= p->height;
+    int level, orientation, x, y;
+
+    for(level=0; level<s->spatial_decomposition_count; level++){
+        for(orientation=level ? 1 : 0; orientation<4; orientation++){
+            SubBand *b= &p->band[level][orientation];
+            IDWTELEM *ibuf= b->ibuf;
+            int64_t error=0;
+
+            memset(s->spatial_idwt_buffer, 0, sizeof(*s->spatial_idwt_buffer)*width*height);
+            ibuf[b->width/2 + b->height/2*b->stride]= 256*16;
+            ff_spatial_idwt(s->spatial_idwt_buffer, s->temp_idwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
+            for(y=0; y<height; y++){
+                for(x=0; x<width; x++){
+                    int64_t d= s->spatial_idwt_buffer[x + y*width]*16;
+                    error += d*d;
+                }
+            }
+
+            b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
+        }
+    }
+}
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
+{
+    SnowContext *s = avctx->priv_data;
+    RangeCoder * const c= &s->c;
+    AVFrame *pic;
+    const int width= s->avctx->width;
+    const int height= s->avctx->height;
+    int level, orientation, plane_index, i, y, ret;
+    uint8_t rc_header_bak[sizeof(s->header_state)];
+    uint8_t rc_block_bak[sizeof(s->block_state)];
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->b_width*s->b_height*MB_SIZE*MB_SIZE*3 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+
+    ff_init_range_encoder(c, pkt->data, pkt->size);
+    ff_build_rac_states(c, (1LL<<32)/20, 256-8);
+
+    for(i=0; i < s->nb_planes; i++){
+        int hshift= i ? s->chroma_h_shift : 0;
+        int vshift= i ? s->chroma_v_shift : 0;
+        for(y=0; y<FF_CEIL_RSHIFT(height, vshift); y++)
+            memcpy(&s->input_picture->data[i][y * s->input_picture->linesize[i]],
+                   &pict->data[i][y * pict->linesize[i]],
+                   FF_CEIL_RSHIFT(width, hshift));
+        s->mpvencdsp.draw_edges(s->input_picture->data[i], s->input_picture->linesize[i],
+                                FF_CEIL_RSHIFT(width, hshift), FF_CEIL_RSHIFT(height, vshift),
+                                EDGE_WIDTH >> hshift, EDGE_WIDTH >> vshift,
+                                EDGE_TOP | EDGE_BOTTOM);
+
+    }
+    emms_c();
+    pic = s->input_picture;
+    pic->pict_type = pict->pict_type;
+    pic->quality = pict->quality;
+
+    s->m.picture_number= avctx->frame_number;
+    if(avctx->flags&AV_CODEC_FLAG_PASS2){
+        s->m.pict_type = pic->pict_type = s->m.rc_context.entry[avctx->frame_number].new_pict_type;
+        s->keyframe = pic->pict_type == AV_PICTURE_TYPE_I;
+        if(!(avctx->flags&AV_CODEC_FLAG_QSCALE)) {
+            pic->quality = ff_rate_estimate_qscale(&s->m, 0);
+            if (pic->quality < 0)
+                return -1;
+        }
+    }else{
+        s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
+        s->m.pict_type = pic->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+    }
+
+    if(s->pass1_rc && avctx->frame_number == 0)
+        pic->quality = 2*FF_QP2LAMBDA;
+    if (pic->quality) {
+        s->qlog   = qscale2qlog(pic->quality);
+        s->lambda = pic->quality * 3/2;
+    }
+    if (s->qlog < 0 || (!pic->quality && (avctx->flags & AV_CODEC_FLAG_QSCALE))) {
+        s->qlog= LOSSLESS_QLOG;
+        s->lambda = 0;
+    }//else keep previous frame's qlog until after motion estimation
+
+    if (s->current_picture->data[0]
+#if FF_API_EMU_EDGE
+        && !(s->avctx->flags&CODEC_FLAG_EMU_EDGE)
+#endif
+        ) {
+        int w = s->avctx->width;
+        int h = s->avctx->height;
+
+        s->mpvencdsp.draw_edges(s->current_picture->data[0],
+                                s->current_picture->linesize[0], w   , h   ,
+                                EDGE_WIDTH  , EDGE_WIDTH  , EDGE_TOP | EDGE_BOTTOM);
+        if (s->current_picture->data[2]) {
+            s->mpvencdsp.draw_edges(s->current_picture->data[1],
+                                    s->current_picture->linesize[1], w>>s->chroma_h_shift, h>>s->chroma_v_shift,
+                                    EDGE_WIDTH>>s->chroma_h_shift, EDGE_WIDTH>>s->chroma_v_shift, EDGE_TOP | EDGE_BOTTOM);
+            s->mpvencdsp.draw_edges(s->current_picture->data[2],
+                                    s->current_picture->linesize[2], w>>s->chroma_h_shift, h>>s->chroma_v_shift,
+                                    EDGE_WIDTH>>s->chroma_h_shift, EDGE_WIDTH>>s->chroma_v_shift, EDGE_TOP | EDGE_BOTTOM);
+        }
+    }
+
+    ff_snow_frame_start(s);
+    av_frame_unref(avctx->coded_frame);
+    ret = av_frame_ref(avctx->coded_frame, s->current_picture);
+    if (ret < 0)
+        return ret;
+
+    s->m.current_picture_ptr= &s->m.current_picture;
+    s->m.current_picture.f = s->current_picture;
+    s->m.current_picture.f->pts = pict->pts;
+    if(pic->pict_type == AV_PICTURE_TYPE_P){
+        int block_width = (width +15)>>4;
+        int block_height= (height+15)>>4;
+        int stride= s->current_picture->linesize[0];
+
+        av_assert0(s->current_picture->data[0]);
+        av_assert0(s->last_picture[0]->data[0]);
+
+        s->m.avctx= s->avctx;
+        s->m.   last_picture.f = s->last_picture[0];
+        s->m.    new_picture.f = s->input_picture;
+        s->m.   last_picture_ptr= &s->m.   last_picture;
+        s->m.linesize = stride;
+        s->m.uvlinesize= s->current_picture->linesize[1];
+        s->m.width = width;
+        s->m.height= height;
+        s->m.mb_width = block_width;
+        s->m.mb_height= block_height;
+        s->m.mb_stride=   s->m.mb_width+1;
+        s->m.b8_stride= 2*s->m.mb_width+1;
+        s->m.f_code=1;
+        s->m.pict_type = pic->pict_type;
+#if FF_API_MOTION_EST
+        s->m.me_method= s->avctx->me_method;
+#endif
+        s->m.motion_est= s->motion_est;
+        s->m.me.scene_change_score=0;
+        s->m.me.dia_size = avctx->dia_size;
+        s->m.quarter_sample= (s->avctx->flags & AV_CODEC_FLAG_QPEL)!=0;
+        s->m.out_format= FMT_H263;
+        s->m.unrestricted_mv= 1;
+
+        s->m.lambda = s->lambda;
+        s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
+        s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
+
+        s->m.mecc= s->mecc; //move
+        s->m.qdsp= s->qdsp; //move
+        s->m.hdsp = s->hdsp;
+        ff_init_me(&s->m);
+        s->hdsp = s->m.hdsp;
+        s->mecc= s->m.mecc;
+    }
+
+    if(s->pass1_rc){
+        memcpy(rc_header_bak, s->header_state, sizeof(s->header_state));
+        memcpy(rc_block_bak, s->block_state, sizeof(s->block_state));
+    }
+
+redo_frame:
+
+    s->spatial_decomposition_count= 5;
+
+    while(   !(width >>(s->chroma_h_shift + s->spatial_decomposition_count))
+          || !(height>>(s->chroma_v_shift + s->spatial_decomposition_count)))
+        s->spatial_decomposition_count--;
+
+    if (s->spatial_decomposition_count <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Resolution too low\n");
+        return AVERROR(EINVAL);
+    }
+
+    s->m.pict_type = pic->pict_type;
+    s->qbias = pic->pict_type == AV_PICTURE_TYPE_P ? 2 : 0;
+
+    ff_snow_common_init_after_header(avctx);
+
+    if(s->last_spatial_decomposition_count != s->spatial_decomposition_count){
+        for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+            calculate_visual_weight(s, &s->plane[plane_index]);
+        }
+    }
+
+    encode_header(s);
+    s->m.misc_bits = 8*(s->c.bytestream - s->c.bytestream_start);
+    encode_blocks(s, 1);
+    s->m.mv_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits;
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        Plane *p= &s->plane[plane_index];
+        int w= p->width;
+        int h= p->height;
+        int x, y;
+//        int bits= put_bits_count(&s->c.pb);
+
+        if (!s->memc_only) {
+            //FIXME optimize
+            if(pict->data[plane_index]) //FIXME gray hack
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->spatial_idwt_buffer[y*w + x]= pict->data[plane_index][y*pict->linesize[plane_index] + x]<<FRAC_BITS;
+                    }
+                }
+            predict_plane(s, s->spatial_idwt_buffer, plane_index, 0);
+
+            if(   plane_index==0
+               && pic->pict_type == AV_PICTURE_TYPE_P
+               && !(avctx->flags&AV_CODEC_FLAG_PASS2)
+               && s->m.me.scene_change_score > s->avctx->scenechange_threshold){
+                ff_init_range_encoder(c, pkt->data, pkt->size);
+                ff_build_rac_states(c, (1LL<<32)/20, 256-8);
+                pic->pict_type= AV_PICTURE_TYPE_I;
+                s->keyframe=1;
+                s->current_picture->key_frame=1;
+                goto redo_frame;
+            }
+
+            if(s->qlog == LOSSLESS_QLOG){
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->spatial_dwt_buffer[y*w + x]= (s->spatial_idwt_buffer[y*w + x] + (1<<(FRAC_BITS-1))-1)>>FRAC_BITS;
+                    }
+                }
+            }else{
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->spatial_dwt_buffer[y*w + x]=s->spatial_idwt_buffer[y*w + x]<<ENCODER_EXTRA_BITS;
+                    }
+                }
+            }
+
+            ff_spatial_dwt(s->spatial_dwt_buffer, s->temp_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
+
+            if(s->pass1_rc && plane_index==0){
+                int delta_qlog = ratecontrol_1pass(s, pic);
+                if (delta_qlog <= INT_MIN)
+                    return -1;
+                if(delta_qlog){
+                    //reordering qlog in the bitstream would eliminate this reset
+                    ff_init_range_encoder(c, pkt->data, pkt->size);
+                    memcpy(s->header_state, rc_header_bak, sizeof(s->header_state));
+                    memcpy(s->block_state, rc_block_bak, sizeof(s->block_state));
+                    encode_header(s);
+                    encode_blocks(s, 0);
+                }
+            }
+
+            for(level=0; level<s->spatial_decomposition_count; level++){
+                for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                    SubBand *b= &p->band[level][orientation];
+
+                    quantize(s, b, b->ibuf, b->buf, b->stride, s->qbias);
+                    if(orientation==0)
+                        decorrelate(s, b, b->ibuf, b->stride, pic->pict_type == AV_PICTURE_TYPE_P, 0);
+                    if (!s->no_bitstream)
+                    encode_subband(s, b, b->ibuf, b->parent ? b->parent->ibuf : NULL, b->stride, orientation);
+                    av_assert0(b->parent==NULL || b->parent->stride == b->stride*2);
+                    if(orientation==0)
+                        correlate(s, b, b->ibuf, b->stride, 1, 0);
+                }
+            }
+
+            for(level=0; level<s->spatial_decomposition_count; level++){
+                for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                    SubBand *b= &p->band[level][orientation];
+
+                    dequantize(s, b, b->ibuf, b->stride);
+                }
+            }
+
+            ff_spatial_idwt(s->spatial_idwt_buffer, s->temp_idwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
+            if(s->qlog == LOSSLESS_QLOG){
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->spatial_idwt_buffer[y*w + x]<<=FRAC_BITS;
+                    }
+                }
+            }
+            predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
+        }else{
+            //ME/MC only
+            if(pic->pict_type == AV_PICTURE_TYPE_I){
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->current_picture->data[plane_index][y*s->current_picture->linesize[plane_index] + x]=
+                            pict->data[plane_index][y*pict->linesize[plane_index] + x];
+                    }
+                }
+            }else{
+                memset(s->spatial_idwt_buffer, 0, sizeof(IDWTELEM)*w*h);
+                predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
+            }
+        }
+        if(s->avctx->flags&AV_CODEC_FLAG_PSNR){
+            int64_t error= 0;
+
+            if(pict->data[plane_index]) //FIXME gray hack
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        int d= s->current_picture->data[plane_index][y*s->current_picture->linesize[plane_index] + x] - pict->data[plane_index][y*pict->linesize[plane_index] + x];
+                        error += d*d;
+                    }
+                }
+            s->avctx->error[plane_index] += error;
+            s->encoding_error[plane_index] = error;
+        }
+
+    }
+
+    update_last_header_values(s);
+
+    ff_snow_release_buffer(avctx);
+
+    s->current_picture->coded_picture_number = avctx->frame_number;
+    s->current_picture->pict_type = pic->pict_type;
+    s->current_picture->quality = pic->quality;
+    s->m.frame_bits = 8*(s->c.bytestream - s->c.bytestream_start);
+    s->m.p_tex_bits = s->m.frame_bits - s->m.misc_bits - s->m.mv_bits;
+    s->m.current_picture.f->display_picture_number =
+    s->m.current_picture.f->coded_picture_number   = avctx->frame_number;
+    s->m.current_picture.f->quality                = pic->quality;
+    s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
+    if(s->pass1_rc)
+        if (ff_rate_estimate_qscale(&s->m, 0) < 0)
+            return -1;
+    if(avctx->flags&AV_CODEC_FLAG_PASS1)
+        ff_write_pass1_stats(&s->m);
+    s->m.last_pict_type = s->m.pict_type;
+    avctx->frame_bits = s->m.frame_bits;
+    avctx->mv_bits = s->m.mv_bits;
+    avctx->misc_bits = s->m.misc_bits;
+    avctx->p_tex_bits = s->m.p_tex_bits;
+
+    emms_c();
+
+    ff_side_data_set_encoder_stats(pkt, s->current_picture->quality,
+                                   s->encoding_error,
+                                   (s->avctx->flags&AV_CODEC_FLAG_PSNR) ? 4 : 0,
+                                   s->current_picture->pict_type);
+
+#if FF_API_ERROR_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    memcpy(s->current_picture->error, s->encoding_error, sizeof(s->encoding_error));
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    pkt->size = ff_rac_terminate(c);
+    if (s->current_picture->key_frame)
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
+}
+
+static av_cold int encode_end(AVCodecContext *avctx)
+{
+    SnowContext *s = avctx->priv_data;
+
+    ff_snow_common_end(s);
+    ff_rate_control_uninit(&s->m);
+    av_frame_free(&s->input_picture);
+    av_freep(&avctx->stats_out);
+
+    return 0;
+}
+
+#define OFFSET(x) offsetof(SnowContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    FF_MPV_COMMON_OPTS
+    { "iter",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_ME_ITER }, 0, 0, FF_MPV_OPT_FLAGS, "motion_est" },
+    { "memc_only",      "Only do ME/MC (I frames -> ref, P frame -> ME+MC).",   OFFSET(memc_only), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "no_bitstream",   "Skip final bitstream writeout.",                    OFFSET(no_bitstream), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "intra_penalty",  "Penalty for intra blocks in block decission",      OFFSET(intra_penalty), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+    { "iterative_dia_size",  "Dia size for the iterative ME",          OFFSET(iterative_dia_size), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+    { NULL },
+};
+
+static const AVClass snowenc_class = {
+    .class_name = "snow encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_snow_encoder = {
+    .name           = "snow",
+    .long_name      = NULL_IF_CONFIG_SMALL("Snow"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_SNOW,
+    .priv_data_size = sizeof(SnowContext),
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .close          = encode_end,
+    .pix_fmts       = (const enum AVPixelFormat[]){
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_NONE
+    },
+    .priv_class     = &snowenc_class,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
+                      FF_CODEC_CAP_INIT_CLEANUP,
+};
+
+
+#ifdef TEST
+#undef malloc
+#undef free
+#undef printf
+
+#include "libavutil/lfg.h"
+#include "libavutil/mathematics.h"
+
+int main(void){
+#define width  256
+#define height 256
+    int buffer[2][width*height];
+    SnowContext s;
+    int i;
+    AVLFG prng;
+    s.spatial_decomposition_count=6;
+    s.spatial_decomposition_type=1;
+
+    s.temp_dwt_buffer  = av_mallocz_array(width, sizeof(DWTELEM));
+    s.temp_idwt_buffer = av_mallocz_array(width, sizeof(IDWTELEM));
+
+    if (!s.temp_dwt_buffer || !s.temp_idwt_buffer) {
+        fprintf(stderr, "Failed to allocate memory\n");
+        return 1;
+    }
+
+    av_lfg_init(&prng, 1);
+
+    printf("testing 5/3 DWT\n");
+    for(i=0; i<width*height; i++)
+        buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
+
+    ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+    ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+
+    for(i=0; i<width*height; i++)
+        if(buffer[0][i]!= buffer[1][i]) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
+
+    printf("testing 9/7 DWT\n");
+    s.spatial_decomposition_type=0;
+    for(i=0; i<width*height; i++)
+        buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
+
+    ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+    ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+
+    for(i=0; i<width*height; i++)
+        if(FFABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
+
+    {
+    int level, orientation, x, y;
+    int64_t errors[8][4];
+    int64_t g=0;
+
+        memset(errors, 0, sizeof(errors));
+        s.spatial_decomposition_count=3;
+        s.spatial_decomposition_type=0;
+        for(level=0; level<s.spatial_decomposition_count; level++){
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                int w= width  >> (s.spatial_decomposition_count-level);
+                int h= height >> (s.spatial_decomposition_count-level);
+                int stride= width  << (s.spatial_decomposition_count-level);
+                DWTELEM *buf= buffer[0];
+                int64_t error=0;
+
+                if(orientation&1) buf+=w;
+                if(orientation>1) buf+=stride>>1;
+
+                memset(buffer[0], 0, sizeof(int)*width*height);
+                buf[w/2 + h/2*stride]= 256*256;
+                ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+                for(y=0; y<height; y++){
+                    for(x=0; x<width; x++){
+                        int64_t d= buffer[0][x + y*width];
+                        error += d*d;
+                        if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9 && level==2) printf("%8"PRId64" ", d);
+                    }
+                    if(FFABS(height/2-y)<9 && level==2) printf("\n");
+                }
+                error= (int)(sqrt(error)+0.5);
+                errors[level][orientation]= error;
+                if(g) g=av_gcd(g, error);
+                else g= error;
+            }
+        }
+        printf("static int const visual_weight[][4]={\n");
+        for(level=0; level<s.spatial_decomposition_count; level++){
+            printf("  {");
+            for(orientation=0; orientation<4; orientation++){
+                printf("%8"PRId64",", errors[level][orientation]/g);
+            }
+            printf("},\n");
+        }
+        printf("};\n");
+        {
+            int level=2;
+            int w= width  >> (s.spatial_decomposition_count-level);
+            //int h= height >> (s.spatial_decomposition_count-level);
+            int stride= width  << (s.spatial_decomposition_count-level);
+            DWTELEM *buf= buffer[0];
+            int64_t error=0;
+
+            buf+=w;
+            buf+=stride>>1;
+
+            memset(buffer[0], 0, sizeof(int)*width*height);
+            for(y=0; y<height; y++){
+                for(x=0; x<width; x++){
+                    int tab[4]={0,2,3,1};
+                    buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
+                }
+            }
+            ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+            for(y=0; y<height; y++){
+                for(x=0; x<width; x++){
+                    int64_t d= buffer[0][x + y*width];
+                    error += d*d;
+                    if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9) printf("%8"PRId64" ", d);
+                }
+                if(FFABS(height/2-y)<9) printf("\n");
+            }
+        }
+
+    }
+    return 0;
+}
+#endif /* TEST */
diff --git a/libavcodec/sonic.c b/libavcodec/sonic.c
new file mode 100644
index 0000000000..4ec7d89fde
--- /dev/null
+++ b/libavcodec/sonic.c
@@ -0,0 +1,1119 @@
+/*
+ * Simple free lossless/lossy audio codec
+ * Copyright (c) 2004 Alex Beregszaszi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "avcodec.h"
+#include "get_bits.h"
+#include "golomb.h"
+#include "internal.h"
+#include "rangecoder.h"
+
+
+/**
+ * @file
+ * Simple free lossless/lossy audio codec
+ * Based on Paul Francis Harrison's Bonk (http://www.logarithmic.net/pfh/bonk)
+ * Written and designed by Alex Beregszaszi
+ *
+ * TODO:
+ *  - CABAC put/get_symbol
+ *  - independent quantizer for channels
+ *  - >2 channels support
+ *  - more decorrelation types
+ *  - more tap_quant tests
+ *  - selectable intlist writers/readers (bonk-style, golomb, cabac)
+ */
+
+#define MAX_CHANNELS 2
+
+#define MID_SIDE 0
+#define LEFT_SIDE 1
+#define RIGHT_SIDE 2
+
+typedef struct SonicContext {
+    int version;
+    int minor_version;
+    int lossless, decorrelation;
+
+    int num_taps, downsampling;
+    double quantization;
+
+    int channels, samplerate, block_align, frame_size;
+
+    int *tap_quant;
+    int *int_samples;
+    int *coded_samples[MAX_CHANNELS];
+
+    // for encoding
+    int *tail;
+    int tail_size;
+    int *window;
+    int window_size;
+
+    // for decoding
+    int *predictor_k;
+    int *predictor_state[MAX_CHANNELS];
+} SonicContext;
+
+#define LATTICE_SHIFT   10
+#define SAMPLE_SHIFT    4
+#define LATTICE_FACTOR  (1 << LATTICE_SHIFT)
+#define SAMPLE_FACTOR   (1 << SAMPLE_SHIFT)
+
+#define BASE_QUANT      0.6
+#define RATE_VARIATION  3.0
+
+static inline int shift(int a,int b)
+{
+    return (a+(1<<(b-1))) >> b;
+}
+
+static inline int shift_down(int a,int b)
+{
+    return (a>>b)+(a<0);
+}
+
+static av_always_inline av_flatten void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed, uint64_t rc_stat[256][2], uint64_t rc_stat2[32][2]){
+    int i;
+
+#define put_rac(C,S,B) \
+do{\
+    if(rc_stat){\
+        rc_stat[*(S)][B]++;\
+        rc_stat2[(S)-state][B]++;\
+    }\
+    put_rac(C,S,B);\
+}while(0)
+
+    if(v){
+        const int a= FFABS(v);
+        const int e= av_log2(a);
+        put_rac(c, state+0, 0);
+        if(e<=9){
+            for(i=0; i<e; i++){
+                put_rac(c, state+1+i, 1);  //1..10
+            }
+            put_rac(c, state+1+i, 0);
+
+            for(i=e-1; i>=0; i--){
+                put_rac(c, state+22+i, (a>>i)&1); //22..31
+            }
+
+            if(is_signed)
+                put_rac(c, state+11 + e, v < 0); //11..21
+        }else{
+            for(i=0; i<e; i++){
+                put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
+            }
+            put_rac(c, state+1+9, 0);
+
+            for(i=e-1; i>=0; i--){
+                put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
+            }
+
+            if(is_signed)
+                put_rac(c, state+11 + 10, v < 0); //11..21
+        }
+    }else{
+        put_rac(c, state+0, 1);
+    }
+#undef put_rac
+}
+
+static inline av_flatten int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
+    if(get_rac(c, state+0))
+        return 0;
+    else{
+        int i, e, a;
+        e= 0;
+        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
+            e++;
+        }
+
+        a= 1;
+        for(i=e-1; i>=0; i--){
+            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
+        }
+
+        e= -(is_signed && get_rac(c, state+11 + FFMIN(e, 10))); //11..21
+        return (a^e)-e;
+    }
+}
+
+#if 1
+static inline int intlist_write(RangeCoder *c, uint8_t *state, int *buf, int entries, int base_2_part)
+{
+    int i;
+
+    for (i = 0; i < entries; i++)
+        put_symbol(c, state, buf[i], 1, NULL, NULL);
+
+    return 1;
+}
+
+static inline int intlist_read(RangeCoder *c, uint8_t *state, int *buf, int entries, int base_2_part)
+{
+    int i;
+
+    for (i = 0; i < entries; i++)
+        buf[i] = get_symbol(c, state, 1);
+
+    return 1;
+}
+#elif 1
+static inline int intlist_write(PutBitContext *pb, int *buf, int entries, int base_2_part)
+{
+    int i;
+
+    for (i = 0; i < entries; i++)
+        set_se_golomb(pb, buf[i]);
+
+    return 1;
+}
+
+static inline int intlist_read(GetBitContext *gb, int *buf, int entries, int base_2_part)
+{
+    int i;
+
+    for (i = 0; i < entries; i++)
+        buf[i] = get_se_golomb(gb);
+
+    return 1;
+}
+
+#else
+
+#define ADAPT_LEVEL 8
+
+static int bits_to_store(uint64_t x)
+{
+    int res = 0;
+
+    while(x)
+    {
+        res++;
+        x >>= 1;
+    }
+    return res;
+}
+
+static void write_uint_max(PutBitContext *pb, unsigned int value, unsigned int max)
+{
+    int i, bits;
+
+    if (!max)
+        return;
+
+    bits = bits_to_store(max);
+
+    for (i = 0; i < bits-1; i++)
+        put_bits(pb, 1, value & (1 << i));
+
+    if ( (value | (1 << (bits-1))) <= max)
+        put_bits(pb, 1, value & (1 << (bits-1)));
+}
+
+static unsigned int read_uint_max(GetBitContext *gb, int max)
+{
+    int i, bits, value = 0;
+
+    if (!max)
+        return 0;
+
+    bits = bits_to_store(max);
+
+    for (i = 0; i < bits-1; i++)
+        if (get_bits1(gb))
+            value += 1 << i;
+
+    if ( (value | (1<<(bits-1))) <= max)
+        if (get_bits1(gb))
+            value += 1 << (bits-1);
+
+    return value;
+}
+
+static int intlist_write(PutBitContext *pb, int *buf, int entries, int base_2_part)
+{
+    int i, j, x = 0, low_bits = 0, max = 0;
+    int step = 256, pos = 0, dominant = 0, any = 0;
+    int *copy, *bits;
+
+    copy = av_calloc(entries, sizeof(*copy));
+    if (!copy)
+        return AVERROR(ENOMEM);
+
+    if (base_2_part)
+    {
+        int energy = 0;
+
+        for (i = 0; i < entries; i++)
+            energy += abs(buf[i]);
+
+        low_bits = bits_to_store(energy / (entries * 2));
+        if (low_bits > 15)
+            low_bits = 15;
+
+        put_bits(pb, 4, low_bits);
+    }
+
+    for (i = 0; i < entries; i++)
+    {
+        put_bits(pb, low_bits, abs(buf[i]));
+        copy[i] = abs(buf[i]) >> low_bits;
+        if (copy[i] > max)
+            max = abs(copy[i]);
+    }
+
+    bits = av_calloc(entries*max, sizeof(*bits));
+    if (!bits)
+    {
+        av_free(copy);
+        return AVERROR(ENOMEM);
+    }
+
+    for (i = 0; i <= max; i++)
+    {
+        for (j = 0; j < entries; j++)
+            if (copy[j] >= i)
+                bits[x++] = copy[j] > i;
+    }
+
+    // store bitstream
+    while (pos < x)
+    {
+        int steplet = step >> 8;
+
+        if (pos + steplet > x)
+            steplet = x - pos;
+
+        for (i = 0; i < steplet; i++)
+            if (bits[i+pos] != dominant)
+                any = 1;
+
+        put_bits(pb, 1, any);
+
+        if (!any)
+        {
+            pos += steplet;
+            step += step / ADAPT_LEVEL;
+        }
+        else
+        {
+            int interloper = 0;
+
+            while (((pos + interloper) < x) && (bits[pos + interloper] == dominant))
+                interloper++;
+
+            // note change
+            write_uint_max(pb, interloper, (step >> 8) - 1);
+
+            pos += interloper + 1;
+            step -= step / ADAPT_LEVEL;
+        }
+
+        if (step < 256)
+        {
+            step = 65536 / step;
+            dominant = !dominant;
+        }
+    }
+
+    // store signs
+    for (i = 0; i < entries; i++)
+        if (buf[i])
+            put_bits(pb, 1, buf[i] < 0);
+
+    av_free(bits);
+    av_free(copy);
+
+    return 0;
+}
+
+static int intlist_read(GetBitContext *gb, int *buf, int entries, int base_2_part)
+{
+    int i, low_bits = 0, x = 0;
+    int n_zeros = 0, step = 256, dominant = 0;
+    int pos = 0, level = 0;
+    int *bits = av_calloc(entries, sizeof(*bits));
+
+    if (!bits)
+        return AVERROR(ENOMEM);
+
+    if (base_2_part)
+    {
+        low_bits = get_bits(gb, 4);
+
+        if (low_bits)
+            for (i = 0; i < entries; i++)
+                buf[i] = get_bits(gb, low_bits);
+    }
+
+//    av_log(NULL, AV_LOG_INFO, "entries: %d, low bits: %d\n", entries, low_bits);
+
+    while (n_zeros < entries)
+    {
+        int steplet = step >> 8;
+
+        if (!get_bits1(gb))
+        {
+            for (i = 0; i < steplet; i++)
+                bits[x++] = dominant;
+
+            if (!dominant)
+                n_zeros += steplet;
+
+            step += step / ADAPT_LEVEL;
+        }
+        else
+        {
+            int actual_run = read_uint_max(gb, steplet-1);
+
+//            av_log(NULL, AV_LOG_INFO, "actual run: %d\n", actual_run);
+
+            for (i = 0; i < actual_run; i++)
+                bits[x++] = dominant;
+
+            bits[x++] = !dominant;
+
+            if (!dominant)
+                n_zeros += actual_run;
+            else
+                n_zeros++;
+
+            step -= step / ADAPT_LEVEL;
+        }
+
+        if (step < 256)
+        {
+            step = 65536 / step;
+            dominant = !dominant;
+        }
+    }
+
+    // reconstruct unsigned values
+    n_zeros = 0;
+    for (i = 0; n_zeros < entries; i++)
+    {
+        while(1)
+        {
+            if (pos >= entries)
+            {
+                pos = 0;
+                level += 1 << low_bits;
+            }
+
+            if (buf[pos] >= level)
+                break;
+
+            pos++;
+        }
+
+        if (bits[i])
+            buf[pos] += 1 << low_bits;
+        else
+            n_zeros++;
+
+        pos++;
+    }
+    av_free(bits);
+
+    // read signs
+    for (i = 0; i < entries; i++)
+        if (buf[i] && get_bits1(gb))
+            buf[i] = -buf[i];
+
+//    av_log(NULL, AV_LOG_INFO, "zeros: %d pos: %d\n", n_zeros, pos);
+
+    return 0;
+}
+#endif
+
+static void predictor_init_state(int *k, int *state, int order)
+{
+    int i;
+
+    for (i = order-2; i >= 0; i--)
+    {
+        int j, p, x = state[i];
+
+        for (j = 0, p = i+1; p < order; j++,p++)
+            {
+            int tmp = x + shift_down(k[j] * state[p], LATTICE_SHIFT);
+            state[p] += shift_down(k[j]*x, LATTICE_SHIFT);
+            x = tmp;
+        }
+    }
+}
+
+static int predictor_calc_error(int *k, int *state, int order, int error)
+{
+    int i, x = error - shift_down(k[order-1] * state[order-1], LATTICE_SHIFT);
+
+#if 1
+    int *k_ptr = &(k[order-2]),
+        *state_ptr = &(state[order-2]);
+    for (i = order-2; i >= 0; i--, k_ptr--, state_ptr--)
+    {
+        int k_value = *k_ptr, state_value = *state_ptr;
+        x -= shift_down(k_value * state_value, LATTICE_SHIFT);
+        state_ptr[1] = state_value + shift_down(k_value * x, LATTICE_SHIFT);
+    }
+#else
+    for (i = order-2; i >= 0; i--)
+    {
+        x -= shift_down(k[i] * state[i], LATTICE_SHIFT);
+        state[i+1] = state[i] + shift_down(k[i] * x, LATTICE_SHIFT);
+    }
+#endif
+
+    // don't drift too far, to avoid overflows
+    if (x >  (SAMPLE_FACTOR<<16)) x =  (SAMPLE_FACTOR<<16);
+    if (x < -(SAMPLE_FACTOR<<16)) x = -(SAMPLE_FACTOR<<16);
+
+    state[0] = x;
+
+    return x;
+}
+
+#if CONFIG_SONIC_ENCODER || CONFIG_SONIC_LS_ENCODER
+// Heavily modified Levinson-Durbin algorithm which
+// copes better with quantization, and calculates the
+// actual whitened result as it goes.
+
+static int modified_levinson_durbin(int *window, int window_entries,
+        int *out, int out_entries, int channels, int *tap_quant)
+{
+    int i;
+    int *state = av_calloc(window_entries, sizeof(*state));
+
+    if (!state)
+        return AVERROR(ENOMEM);
+
+    memcpy(state, window, 4* window_entries);
+
+    for (i = 0; i < out_entries; i++)
+    {
+        int step = (i+1)*channels, k, j;
+        double xx = 0.0, xy = 0.0;
+#if 1
+        int *x_ptr = &(window[step]);
+        int *state_ptr = &(state[0]);
+        j = window_entries - step;
+        for (;j>0;j--,x_ptr++,state_ptr++)
+        {
+            double x_value = *x_ptr;
+            double state_value = *state_ptr;
+            xx += state_value*state_value;
+            xy += x_value*state_value;
+        }
+#else
+        for (j = 0; j <= (window_entries - step); j++);
+        {
+            double stepval = window[step+j];
+            double stateval = window[j];
+//            xx += (double)window[j]*(double)window[j];
+//            xy += (double)window[step+j]*(double)window[j];
+            xx += stateval*stateval;
+            xy += stepval*stateval;
+        }
+#endif
+        if (xx == 0.0)
+            k = 0;
+        else
+            k = (int)(floor(-xy/xx * (double)LATTICE_FACTOR / (double)(tap_quant[i]) + 0.5));
+
+        if (k > (LATTICE_FACTOR/tap_quant[i]))
+            k = LATTICE_FACTOR/tap_quant[i];
+        if (-k > (LATTICE_FACTOR/tap_quant[i]))
+            k = -(LATTICE_FACTOR/tap_quant[i]);
+
+        out[i] = k;
+        k *= tap_quant[i];
+
+#if 1
+        x_ptr = &(window[step]);
+        state_ptr = &(state[0]);
+        j = window_entries - step;
+        for (;j>0;j--,x_ptr++,state_ptr++)
+        {
+            int x_value = *x_ptr;
+            int state_value = *state_ptr;
+            *x_ptr = x_value + shift_down(k*state_value,LATTICE_SHIFT);
+            *state_ptr = state_value + shift_down(k*x_value, LATTICE_SHIFT);
+        }
+#else
+        for (j=0; j <= (window_entries - step); j++)
+        {
+            int stepval = window[step+j];
+            int stateval=state[j];
+            window[step+j] += shift_down(k * stateval, LATTICE_SHIFT);
+            state[j] += shift_down(k * stepval, LATTICE_SHIFT);
+        }
+#endif
+    }
+
+    av_free(state);
+    return 0;
+}
+
+static inline int code_samplerate(int samplerate)
+{
+    switch (samplerate)
+    {
+        case 44100: return 0;
+        case 22050: return 1;
+        case 11025: return 2;
+        case 96000: return 3;
+        case 48000: return 4;
+        case 32000: return 5;
+        case 24000: return 6;
+        case 16000: return 7;
+        case 8000: return 8;
+    }
+    return AVERROR(EINVAL);
+}
+
+static av_cold int sonic_encode_init(AVCodecContext *avctx)
+{
+    SonicContext *s = avctx->priv_data;
+    PutBitContext pb;
+    int i;
+
+    s->version = 2;
+
+    if (avctx->channels > MAX_CHANNELS)
+    {
+        av_log(avctx, AV_LOG_ERROR, "Only mono and stereo streams are supported by now\n");
+        return AVERROR(EINVAL); /* only stereo or mono for now */
+    }
+
+    if (avctx->channels == 2)
+        s->decorrelation = MID_SIDE;
+    else
+        s->decorrelation = 3;
+
+    if (avctx->codec->id == AV_CODEC_ID_SONIC_LS)
+    {
+        s->lossless = 1;
+        s->num_taps = 32;
+        s->downsampling = 1;
+        s->quantization = 0.0;
+    }
+    else
+    {
+        s->num_taps = 128;
+        s->downsampling = 2;
+        s->quantization = 1.0;
+    }
+
+    // max tap 2048
+    if (s->num_taps < 32 || s->num_taps > 1024 || s->num_taps % 32) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of taps\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // generate taps
+    s->tap_quant = av_calloc(s->num_taps, sizeof(*s->tap_quant));
+    if (!s->tap_quant)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < s->num_taps; i++)
+        s->tap_quant[i] = ff_sqrt(i+1);
+
+    s->channels = avctx->channels;
+    s->samplerate = avctx->sample_rate;
+
+    s->block_align = 2048LL*s->samplerate/(44100*s->downsampling);
+    s->frame_size = s->channels*s->block_align*s->downsampling;
+
+    s->tail_size = s->num_taps*s->channels;
+    s->tail = av_calloc(s->tail_size, sizeof(*s->tail));
+    if (!s->tail)
+        return AVERROR(ENOMEM);
+
+    s->predictor_k = av_calloc(s->num_taps, sizeof(*s->predictor_k) );
+    if (!s->predictor_k)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < s->channels; i++)
+    {
+        s->coded_samples[i] = av_calloc(s->block_align, sizeof(**s->coded_samples));
+        if (!s->coded_samples[i])
+            return AVERROR(ENOMEM);
+    }
+
+    s->int_samples = av_calloc(s->frame_size, sizeof(*s->int_samples));
+
+    s->window_size = ((2*s->tail_size)+s->frame_size);
+    s->window = av_calloc(s->window_size, sizeof(*s->window));
+    if (!s->window || !s->int_samples)
+        return AVERROR(ENOMEM);
+
+    avctx->extradata = av_mallocz(16);
+    if (!avctx->extradata)
+        return AVERROR(ENOMEM);
+    init_put_bits(&pb, avctx->extradata, 16*8);
+
+    put_bits(&pb, 2, s->version); // version
+    if (s->version >= 1)
+    {
+        if (s->version >= 2) {
+            put_bits(&pb, 8, s->version);
+            put_bits(&pb, 8, s->minor_version);
+        }
+        put_bits(&pb, 2, s->channels);
+        put_bits(&pb, 4, code_samplerate(s->samplerate));
+    }
+    put_bits(&pb, 1, s->lossless);
+    if (!s->lossless)
+        put_bits(&pb, 3, SAMPLE_SHIFT); // XXX FIXME: sample precision
+    put_bits(&pb, 2, s->decorrelation);
+    put_bits(&pb, 2, s->downsampling);
+    put_bits(&pb, 5, (s->num_taps >> 5)-1); // 32..1024
+    put_bits(&pb, 1, 0); // XXX FIXME: no custom tap quant table
+
+    flush_put_bits(&pb);
+    avctx->extradata_size = put_bits_count(&pb)/8;
+
+    av_log(avctx, AV_LOG_INFO, "Sonic: ver: %d.%d ls: %d dr: %d taps: %d block: %d frame: %d downsamp: %d\n",
+        s->version, s->minor_version, s->lossless, s->decorrelation, s->num_taps, s->block_align, s->frame_size, s->downsampling);
+
+    avctx->frame_size = s->block_align*s->downsampling;
+
+    return 0;
+}
+
+static av_cold int sonic_encode_close(AVCodecContext *avctx)
+{
+    SonicContext *s = avctx->priv_data;
+    int i;
+
+    for (i = 0; i < s->channels; i++)
+        av_freep(&s->coded_samples[i]);
+
+    av_freep(&s->predictor_k);
+    av_freep(&s->tail);
+    av_freep(&s->tap_quant);
+    av_freep(&s->window);
+    av_freep(&s->int_samples);
+
+    return 0;
+}
+
+static int sonic_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                              const AVFrame *frame, int *got_packet_ptr)
+{
+    SonicContext *s = avctx->priv_data;
+    RangeCoder c;
+    int i, j, ch, quant = 0, x = 0;
+    int ret;
+    const short *samples = (const int16_t*)frame->data[0];
+    uint8_t state[32];
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, s->frame_size * 5 + 1000, 0)) < 0)
+        return ret;
+
+    ff_init_range_encoder(&c, avpkt->data, avpkt->size);
+    ff_build_rac_states(&c, 0.05*(1LL<<32), 256-8);
+    memset(state, 128, sizeof(state));
+
+    // short -> internal
+    for (i = 0; i < s->frame_size; i++)
+        s->int_samples[i] = samples[i];
+
+    if (!s->lossless)
+        for (i = 0; i < s->frame_size; i++)
+            s->int_samples[i] = s->int_samples[i] << SAMPLE_SHIFT;
+
+    switch(s->decorrelation)
+    {
+        case MID_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+            {
+                s->int_samples[i] += s->int_samples[i+1];
+                s->int_samples[i+1] -= shift(s->int_samples[i], 1);
+            }
+            break;
+        case LEFT_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+                s->int_samples[i+1] -= s->int_samples[i];
+            break;
+        case RIGHT_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+                s->int_samples[i] -= s->int_samples[i+1];
+            break;
+    }
+
+    memset(s->window, 0, 4* s->window_size);
+
+    for (i = 0; i < s->tail_size; i++)
+        s->window[x++] = s->tail[i];
+
+    for (i = 0; i < s->frame_size; i++)
+        s->window[x++] = s->int_samples[i];
+
+    for (i = 0; i < s->tail_size; i++)
+        s->window[x++] = 0;
+
+    for (i = 0; i < s->tail_size; i++)
+        s->tail[i] = s->int_samples[s->frame_size - s->tail_size + i];
+
+    // generate taps
+    ret = modified_levinson_durbin(s->window, s->window_size,
+                s->predictor_k, s->num_taps, s->channels, s->tap_quant);
+    if (ret < 0)
+        return ret;
+
+    if ((ret = intlist_write(&c, state, s->predictor_k, s->num_taps, 0)) < 0)
+        return ret;
+
+    for (ch = 0; ch < s->channels; ch++)
+    {
+        x = s->tail_size+ch;
+        for (i = 0; i < s->block_align; i++)
+        {
+            int sum = 0;
+            for (j = 0; j < s->downsampling; j++, x += s->channels)
+                sum += s->window[x];
+            s->coded_samples[ch][i] = sum;
+        }
+    }
+
+    // simple rate control code
+    if (!s->lossless)
+    {
+        double energy1 = 0.0, energy2 = 0.0;
+        for (ch = 0; ch < s->channels; ch++)
+        {
+            for (i = 0; i < s->block_align; i++)
+            {
+                double sample = s->coded_samples[ch][i];
+                energy2 += sample*sample;
+                energy1 += fabs(sample);
+            }
+        }
+
+        energy2 = sqrt(energy2/(s->channels*s->block_align));
+        energy1 = M_SQRT2*energy1/(s->channels*s->block_align);
+
+        // increase bitrate when samples are like a gaussian distribution
+        // reduce bitrate when samples are like a two-tailed exponential distribution
+
+        if (energy2 > energy1)
+            energy2 += (energy2-energy1)*RATE_VARIATION;
+
+        quant = (int)(BASE_QUANT*s->quantization*energy2/SAMPLE_FACTOR);
+//        av_log(avctx, AV_LOG_DEBUG, "quant: %d energy: %f / %f\n", quant, energy1, energy2);
+
+        quant = av_clip(quant, 1, 65534);
+
+        put_symbol(&c, state, quant, 0, NULL, NULL);
+
+        quant *= SAMPLE_FACTOR;
+    }
+
+    // write out coded samples
+    for (ch = 0; ch < s->channels; ch++)
+    {
+        if (!s->lossless)
+            for (i = 0; i < s->block_align; i++)
+                s->coded_samples[ch][i] = ROUNDED_DIV(s->coded_samples[ch][i], quant);
+
+        if ((ret = intlist_write(&c, state, s->coded_samples[ch], s->block_align, 1)) < 0)
+            return ret;
+    }
+
+//    av_log(avctx, AV_LOG_DEBUG, "used bytes: %d\n", (put_bits_count(&pb)+7)/8);
+
+    avpkt->size = ff_rac_terminate(&c);
+    *got_packet_ptr = 1;
+    return 0;
+
+}
+#endif /* CONFIG_SONIC_ENCODER || CONFIG_SONIC_LS_ENCODER */
+
+#if CONFIG_SONIC_DECODER
+static const int samplerate_table[] =
+    { 44100, 22050, 11025, 96000, 48000, 32000, 24000, 16000, 8000 };
+
+static av_cold int sonic_decode_init(AVCodecContext *avctx)
+{
+    SonicContext *s = avctx->priv_data;
+    GetBitContext gb;
+    int i;
+    int ret;
+
+    s->channels = avctx->channels;
+    s->samplerate = avctx->sample_rate;
+
+    if (!avctx->extradata)
+    {
+        av_log(avctx, AV_LOG_ERROR, "No mandatory headers present\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ret = init_get_bits8(&gb, avctx->extradata, avctx->extradata_size);
+    if (ret < 0)
+        return ret;
+
+    s->version = get_bits(&gb, 2);
+    if (s->version >= 2) {
+        s->version       = get_bits(&gb, 8);
+        s->minor_version = get_bits(&gb, 8);
+    }
+    if (s->version != 2)
+    {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported Sonic version, please report\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->version >= 1)
+    {
+        int sample_rate_index;
+        s->channels = get_bits(&gb, 2);
+        sample_rate_index = get_bits(&gb, 4);
+        if (sample_rate_index >= FF_ARRAY_ELEMS(samplerate_table)) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid sample_rate_index %d\n", sample_rate_index);
+            return AVERROR_INVALIDDATA;
+        }
+        s->samplerate = samplerate_table[sample_rate_index];
+        av_log(avctx, AV_LOG_INFO, "Sonicv2 chans: %d samprate: %d\n",
+            s->channels, s->samplerate);
+    }
+
+    if (s->channels > MAX_CHANNELS || s->channels < 1)
+    {
+        av_log(avctx, AV_LOG_ERROR, "Only mono and stereo streams are supported by now\n");
+        return AVERROR_INVALIDDATA;
+    }
+    avctx->channels = s->channels;
+
+    s->lossless = get_bits1(&gb);
+    if (!s->lossless)
+        skip_bits(&gb, 3); // XXX FIXME
+    s->decorrelation = get_bits(&gb, 2);
+    if (s->decorrelation != 3 && s->channels != 2) {
+        av_log(avctx, AV_LOG_ERROR, "invalid decorrelation %d\n", s->decorrelation);
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->downsampling = get_bits(&gb, 2);
+    if (!s->downsampling) {
+        av_log(avctx, AV_LOG_ERROR, "invalid downsampling value\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->num_taps = (get_bits(&gb, 5)+1)<<5;
+    if (get_bits1(&gb)) // XXX FIXME
+        av_log(avctx, AV_LOG_INFO, "Custom quant table\n");
+
+    s->block_align = 2048LL*s->samplerate/(44100*s->downsampling);
+    s->frame_size = s->channels*s->block_align*s->downsampling;
+//    avctx->frame_size = s->block_align;
+
+    av_log(avctx, AV_LOG_INFO, "Sonic: ver: %d.%d ls: %d dr: %d taps: %d block: %d frame: %d downsamp: %d\n",
+        s->version, s->minor_version, s->lossless, s->decorrelation, s->num_taps, s->block_align, s->frame_size, s->downsampling);
+
+    // generate taps
+    s->tap_quant = av_calloc(s->num_taps, sizeof(*s->tap_quant));
+    if (!s->tap_quant)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < s->num_taps; i++)
+        s->tap_quant[i] = ff_sqrt(i+1);
+
+    s->predictor_k = av_calloc(s->num_taps, sizeof(*s->predictor_k));
+
+    for (i = 0; i < s->channels; i++)
+    {
+        s->predictor_state[i] = av_calloc(s->num_taps, sizeof(**s->predictor_state));
+        if (!s->predictor_state[i])
+            return AVERROR(ENOMEM);
+    }
+
+    for (i = 0; i < s->channels; i++)
+    {
+        s->coded_samples[i] = av_calloc(s->block_align, sizeof(**s->coded_samples));
+        if (!s->coded_samples[i])
+            return AVERROR(ENOMEM);
+    }
+    s->int_samples = av_calloc(s->frame_size, sizeof(*s->int_samples));
+    if (!s->int_samples)
+        return AVERROR(ENOMEM);
+
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+    return 0;
+}
+
+static av_cold int sonic_decode_close(AVCodecContext *avctx)
+{
+    SonicContext *s = avctx->priv_data;
+    int i;
+
+    av_freep(&s->int_samples);
+    av_freep(&s->tap_quant);
+    av_freep(&s->predictor_k);
+
+    for (i = 0; i < s->channels; i++)
+    {
+        av_freep(&s->predictor_state[i]);
+        av_freep(&s->coded_samples[i]);
+    }
+
+    return 0;
+}
+
+static int sonic_decode_frame(AVCodecContext *avctx,
+                            void *data, int *got_frame_ptr,
+                            AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    SonicContext *s = avctx->priv_data;
+    RangeCoder c;
+    uint8_t state[32];
+    int i, quant, ch, j, ret;
+    int16_t *samples;
+    AVFrame *frame = data;
+
+    if (buf_size == 0) return 0;
+
+    frame->nb_samples = s->frame_size / avctx->channels;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    samples = (int16_t *)frame->data[0];
+
+//    av_log(NULL, AV_LOG_INFO, "buf_size: %d\n", buf_size);
+
+    memset(state, 128, sizeof(state));
+    ff_init_range_decoder(&c, buf, buf_size);
+    ff_build_rac_states(&c, 0.05*(1LL<<32), 256-8);
+
+    intlist_read(&c, state, s->predictor_k, s->num_taps, 0);
+
+    // dequantize
+    for (i = 0; i < s->num_taps; i++)
+        s->predictor_k[i] *= s->tap_quant[i];
+
+    if (s->lossless)
+        quant = 1;
+    else
+        quant = get_symbol(&c, state, 0) * SAMPLE_FACTOR;
+
+//    av_log(NULL, AV_LOG_INFO, "quant: %d\n", quant);
+
+    for (ch = 0; ch < s->channels; ch++)
+    {
+        int x = ch;
+
+        predictor_init_state(s->predictor_k, s->predictor_state[ch], s->num_taps);
+
+        intlist_read(&c, state, s->coded_samples[ch], s->block_align, 1);
+
+        for (i = 0; i < s->block_align; i++)
+        {
+            for (j = 0; j < s->downsampling - 1; j++)
+            {
+                s->int_samples[x] = predictor_calc_error(s->predictor_k, s->predictor_state[ch], s->num_taps, 0);
+                x += s->channels;
+            }
+
+            s->int_samples[x] = predictor_calc_error(s->predictor_k, s->predictor_state[ch], s->num_taps, s->coded_samples[ch][i] * quant);
+            x += s->channels;
+        }
+
+        for (i = 0; i < s->num_taps; i++)
+            s->predictor_state[ch][i] = s->int_samples[s->frame_size - s->channels + ch - i*s->channels];
+    }
+
+    switch(s->decorrelation)
+    {
+        case MID_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+            {
+                s->int_samples[i+1] += shift(s->int_samples[i], 1);
+                s->int_samples[i] -= s->int_samples[i+1];
+            }
+            break;
+        case LEFT_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+                s->int_samples[i+1] += s->int_samples[i];
+            break;
+        case RIGHT_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+                s->int_samples[i] += s->int_samples[i+1];
+            break;
+    }
+
+    if (!s->lossless)
+        for (i = 0; i < s->frame_size; i++)
+            s->int_samples[i] = shift(s->int_samples[i], SAMPLE_SHIFT);
+
+    // internal -> short
+    for (i = 0; i < s->frame_size; i++)
+        samples[i] = av_clip_int16(s->int_samples[i]);
+
+    *got_frame_ptr = 1;
+
+    return buf_size;
+}
+
+AVCodec ff_sonic_decoder = {
+    .name           = "sonic",
+    .long_name      = NULL_IF_CONFIG_SMALL("Sonic"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_SONIC,
+    .priv_data_size = sizeof(SonicContext),
+    .init           = sonic_decode_init,
+    .close          = sonic_decode_close,
+    .decode         = sonic_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_EXPERIMENTAL,
+};
+#endif /* CONFIG_SONIC_DECODER */
+
+#if CONFIG_SONIC_ENCODER
+AVCodec ff_sonic_encoder = {
+    .name           = "sonic",
+    .long_name      = NULL_IF_CONFIG_SMALL("Sonic"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_SONIC,
+    .priv_data_size = sizeof(SonicContext),
+    .init           = sonic_encode_init,
+    .encode2        = sonic_encode_frame,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_EXPERIMENTAL,
+    .close          = sonic_encode_close,
+};
+#endif
+
+#if CONFIG_SONIC_LS_ENCODER
+AVCodec ff_sonic_ls_encoder = {
+    .name           = "sonicls",
+    .long_name      = NULL_IF_CONFIG_SMALL("Sonic lossless"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_SONIC_LS,
+    .priv_data_size = sizeof(SonicContext),
+    .init           = sonic_encode_init,
+    .encode2        = sonic_encode_frame,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_EXPERIMENTAL,
+    .close          = sonic_encode_close,
+};
+#endif
diff --git a/libavcodec/sp5x.h b/libavcodec/sp5x.h
index 1577302e3c..ecd6e8d649 100644
--- a/libavcodec/sp5x.h
+++ b/libavcodec/sp5x.h
@@ -1,21 +1,21 @@
 /*
  * Sunplus JPEG tables
- * Copyright (c) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sp5xdec.c b/libavcodec/sp5xdec.c
index 7f57b6357e..815f9ad50e 100644
--- a/libavcodec/sp5xdec.c
+++ b/libavcodec/sp5xdec.c
@@ -2,20 +2,20 @@
  * Sunplus JPEG decoder (SP5X)
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -73,7 +73,7 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
         for (i = 2; i < buf_size-2 && j < buf_size+1024-2; i++)
             recoded[j++] = buf[i];
     else
-    for (i = 14; i < buf_size && j < buf_size+1024-2; i++)
+    for (i = 14; i < buf_size && j < buf_size+1024-3; i++)
     {
         recoded[j++] = buf[i];
         if (buf[i] == 0xff)
@@ -91,9 +91,10 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
 
     av_free(recoded);
 
-    return i;
+    return i < 0 ? i : avpkt->size;
 }
 
+#if CONFIG_SP5X_DECODER
 AVCodec ff_sp5x_decoder = {
     .name           = "sp5x",
     .long_name      = NULL_IF_CONFIG_SMALL("Sunplus JPEG (SP5X)"),
@@ -104,9 +105,11 @@ AVCodec ff_sp5x_decoder = {
     .close          = ff_mjpeg_decode_end,
     .decode         = sp5x_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
-
+#endif
+#if CONFIG_AMV_DECODER
 AVCodec ff_amv_decoder = {
     .name           = "amv",
     .long_name      = NULL_IF_CONFIG_SMALL("AMV Video"),
@@ -116,6 +119,8 @@ AVCodec ff_amv_decoder = {
     .init           = ff_mjpeg_decode_init,
     .close          = ff_mjpeg_decode_end,
     .decode         = sp5x_decode_frame,
+    .max_lowres     = 3,
     .capabilities   = AV_CODEC_CAP_DR1,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
+#endif
diff --git a/libavcodec/sparc/README b/libavcodec/sparc/README
new file mode 100644
index 0000000000..f9f2349cd4
--- /dev/null
+++ b/libavcodec/sparc/README
@@ -0,0 +1,6 @@
+SPARC optimizations have been removed in
+commit b4dd424d96f09f9bafb88e47f37df65dc4529143
+The last revission with the optimizations is fb1b70c1ed50951c5fc1a309c3c446b2eaaf564b
+
+If you want to maintain these (or other) SPARC optimizations in ffmpeg, then please
+contact ffmpeg-devel@ffmpeg.org
diff --git a/libavcodec/srtdec.c b/libavcodec/srtdec.c
index 3bee3c726f..dd6bc9d104 100644
--- a/libavcodec/srtdec.c
+++ b/libavcodec/srtdec.c
@@ -2,241 +2,115 @@
  * SubRip subtitle decoder
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/avstring.h"
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/parseutils.h"
 #include "avcodec.h"
 #include "ass.h"
+#include "htmlsubtitles.h"
 
-static int html_color_parse(AVCodecContext *avctx, const char *str)
+static void srt_to_ass(AVCodecContext *avctx, AVBPrint *dst,
+                       const char *in, int x1, int y1, int x2, int y2)
 {
-    uint8_t rgba[4];
-    if (av_parse_color(rgba, str, strcspn(str, "\" >"), avctx) < 0)
-        return -1;
-    return rgba[0] | rgba[1] << 8 | rgba[2] << 16;
-}
-
-enum {
-    PARAM_UNKNOWN = -1,
-    PARAM_SIZE,
-    PARAM_COLOR,
-    PARAM_FACE,
-    PARAM_NUMBER
-};
-
-typedef struct SrtStack {
-    char tag[128];
-    char param[PARAM_NUMBER][128];
-} SrtStack;
-
-static const char *srt_to_ass(AVCodecContext *avctx, char *out, char *out_end,
-                              const char *in, int x1, int y1, int x2, int y2)
-{
-    char c, *param, buffer[128], tmp[128];
-    int len, tag_close, sptr = 1, line_start = 1, an = 0, end = 0;
-    SrtStack stack[16];
-
-    stack[0].tag[0] = 0;
-    strcpy(stack[0].param[PARAM_SIZE],  "{\\fs}");
-    strcpy(stack[0].param[PARAM_COLOR], "{\\c}");
-    strcpy(stack[0].param[PARAM_FACE],  "{\\fn}");
-
     if (x1 >= 0 && y1 >= 0) {
-        if (x2 >= 0 && y2 >= 0 && (x2 != x1 || y2 != y1))
-            out += snprintf(out, out_end-out,
-                            "{\\an1}{\\move(%d,%d,%d,%d)}", x1, y1, x2, y2);
-        else
-            out += snprintf(out, out_end-out, "{\\an1}{\\pos(%d,%d)}", x1, y1);
-    }
-
-    for (; out < out_end && !end && *in; in++) {
-        switch (*in) {
-        case '\r':
-            break;
-        case '\n':
-            if (line_start) {
-                end = 1;
-                break;
-            }
-            while (out[-1] == ' ')
-                out--;
-            out += snprintf(out, out_end-out, "\\N");
-            line_start = 1;
-            break;
-        case ' ':
-            if (!line_start)
-                *out++ = *in;
-            break;
-        case '{':    /* skip all {\xxx} substrings except for {\an%d}
-                        and all microdvd like styles such as {Y:xxx} */
-            an += sscanf(in, "{\\an%*1u}%c", &c) == 1;
-            if ((an != 1 && sscanf(in, "{\\%*[^}]}%n%c", &len, &c) > 0) ||
-                sscanf(in, "{%*1[CcFfoPSsYy]:%*[^}]}%n%c", &len, &c) > 0) {
-                in += len - 1;
-            } else
-                *out++ = *in;
-            break;
-        case '<':
-            tag_close = in[1] == '/';
-            if (sscanf(in+tag_close+1, "%127[^>]>%n%c", buffer, &len,&c) >= 2) {
-                if ((param = strchr(buffer, ' ')))
-                    *param++ = 0;
-                if ((!tag_close && sptr < FF_ARRAY_ELEMS(stack)) ||
-                    ( tag_close && sptr > 0 && !strcmp(stack[sptr-1].tag, buffer))) {
-                    int i, j, unknown = 0;
-                    in += len + tag_close;
-                    if (!tag_close)
-                        memset(stack+sptr, 0, sizeof(*stack));
-                    if (!strcmp(buffer, "font")) {
-                        if (tag_close) {
-                            for (i=PARAM_NUMBER-1; i>=0; i--)
-                                if (stack[sptr-1].param[i][0])
-                                    for (j=sptr-2; j>=0; j--)
-                                        if (stack[j].param[i][0]) {
-                                            out += snprintf(out, out_end-out,
-                                                            "%s", stack[j].param[i]);
-                                            break;
-                                        }
-                        } else {
-                            while (param) {
-                                if (!strncmp(param, "size=", 5)) {
-                                    unsigned font_size;
-                                    param += 5 + (param[5] == '"');
-                                    if (sscanf(param, "%u", &font_size) == 1) {
-                                        snprintf(stack[sptr].param[PARAM_SIZE],
-                                             sizeof(stack[0].param[PARAM_SIZE]),
-                                             "{\\fs%u}", font_size);
-                                    }
-                                } else if (!strncmp(param, "color=", 6)) {
-                                    param += 6 + (param[6] == '"');
-                                    snprintf(stack[sptr].param[PARAM_COLOR],
-                                         sizeof(stack[0].param[PARAM_COLOR]),
-                                         "{\\c&H%X&}",
-                                         html_color_parse(avctx, param));
-                                } else if (!strncmp(param, "face=", 5)) {
-                                    param += 5 + (param[5] == '"');
-                                    len = strcspn(param,
-                                                  param[-1] == '"' ? "\"" :" ");
-                                    av_strlcpy(tmp, param,
-                                               FFMIN(sizeof(tmp), len+1));
-                                    param += len;
-                                    snprintf(stack[sptr].param[PARAM_FACE],
-                                             sizeof(stack[0].param[PARAM_FACE]),
-                                             "{\\fn%s}", tmp);
-                                }
-                                if ((param = strchr(param, ' ')))
-                                    param++;
-                            }
-                            for (i=0; i<PARAM_NUMBER; i++)
-                                if (stack[sptr].param[i][0])
-                                    out += snprintf(out, out_end-out,
-                                                    "%s", stack[sptr].param[i]);
-                        }
-                    } else if (!buffer[1] && strspn(buffer, "bisu") == 1) {
-                        out += snprintf(out, out_end-out,
-                                        "{\\%c%d}", buffer[0], !tag_close);
-                    } else {
-                        unknown = 1;
-                        snprintf(tmp, sizeof(tmp), "</%s>", buffer);
-                    }
-                    if (tag_close) {
-                        sptr--;
-                    } else if (unknown && !strstr(in, tmp)) {
-                        in -= len + tag_close;
-                        *out++ = *in;
-                    } else
-                        av_strlcpy(stack[sptr++].tag, buffer,
-                                   sizeof(stack[0].tag));
-                    break;
-                }
-            }
-        default:
-            *out++ = *in;
-            break;
+        /* XXX: here we rescale coordinate assuming they are in DVD resolution
+         * (720x480) since we don't have anything better */
+
+        if (x2 >= 0 && y2 >= 0 && (x2 != x1 || y2 != y1) && x2 >= x1 && y2 >= y1) {
+            /* text rectangle defined, write the text at the center of the rectangle */
+            const int cx = x1 + (x2 - x1)/2;
+            const int cy = y1 + (y2 - y1)/2;
+            const int scaled_x = cx * ASS_DEFAULT_PLAYRESX / 720;
+            const int scaled_y = cy * ASS_DEFAULT_PLAYRESY / 480;
+            av_bprintf(dst, "{\\an5}{\\pos(%d,%d)}", scaled_x, scaled_y);
+        } else {
+            /* only the top left corner, assume the text starts in that corner */
+            const int scaled_x = x1 * ASS_DEFAULT_PLAYRESX / 720;
+            const int scaled_y = y1 * ASS_DEFAULT_PLAYRESY / 480;
+            av_bprintf(dst, "{\\an1}{\\pos(%d,%d)}", scaled_x, scaled_y);
         }
-        if (*in != ' ' && *in != '\r' && *in != '\n')
-            line_start = 0;
     }
 
-    out = FFMIN(out, out_end-3);
-    while (!strncmp(out-2, "\\N", 2))
-        out -= 2;
-    while (out[-1] == ' ')
-        out--;
-    out += snprintf(out, out_end-out, "\r\n");
-    return in;
-}
-
-static const char *read_ts(const char *buf, int *ts_start, int *ts_end,
-                           int *x1, int *y1, int *x2, int *y2)
-{
-    int i, hs, ms, ss, he, me, se;
-
-    for (i=0; i<2; i++) {
-        /* try to read timestamps in either the first or second line */
-        int c = sscanf(buf, "%d:%2d:%2d%*1[,.]%3d --> %d:%2d:%2d%*1[,.]%3d"
-                       "%*[ ]X1:%u X2:%u Y1:%u Y2:%u",
-                       &hs, &ms, &ss, ts_start, &he, &me, &se, ts_end,
-                       x1, x2, y1, y2);
-        buf += strcspn(buf, "\n") + 1;
-        if (c >= 8) {
-            *ts_start = 100*(ss + 60*(ms + 60*hs)) + *ts_start/10;
-            *ts_end   = 100*(se + 60*(me + 60*he)) + *ts_end  /10;
-            return buf;
-        }
-    }
-    return NULL;
+    ff_htmlmarkup_to_ass(avctx, dst, in);
 }
 
 static int srt_decode_frame(AVCodecContext *avctx,
                             void *data, int *got_sub_ptr, AVPacket *avpkt)
 {
     AVSubtitle *sub = data;
+    AVBPrint buffer;
     int ts_start, ts_end, x1 = -1, y1 = -1, x2 = -1, y2 = -1;
-    char buffer[2048];
-    const char *ptr = avpkt->data;
-    const char *end = avpkt->data + avpkt->size;
+    int size, ret;
+    const uint8_t *p = av_packet_get_side_data(avpkt, AV_PKT_DATA_SUBTITLE_POSITION, &size);
+
+    if (p && size == 16) {
+        x1 = AV_RL32(p     );
+        y1 = AV_RL32(p +  4);
+        x2 = AV_RL32(p +  8);
+        y2 = AV_RL32(p + 12);
+    }
 
     if (avpkt->size <= 0)
         return avpkt->size;
 
-    ff_ass_init(sub);
+    av_bprint_init(&buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
 
-    while (ptr < end && *ptr) {
-        ptr = read_ts(ptr, &ts_start, &ts_end, &x1, &y1, &x2, &y2);
-        if (!ptr)
-            break;
-        ptr = srt_to_ass(avctx, buffer, buffer+sizeof(buffer), ptr,
-                         x1, y1, x2, y2);
-        ff_ass_add_rect(sub, buffer, ts_start, ts_end, 0);
-    }
+        // TODO: reindent
+            // Do final divide-by-10 outside rescale to force rounding down.
+            ts_start = av_rescale_q(avpkt->pts,
+                                    avctx->time_base,
+                                    (AVRational){1,100});
+            ts_end   = av_rescale_q(avpkt->pts + avpkt->duration,
+                                    avctx->time_base,
+                                    (AVRational){1,100});
+
+    srt_to_ass(avctx, &buffer, avpkt->data, x1, y1, x2, y2);
+    ret = ff_ass_add_rect_bprint(sub, &buffer, ts_start, ts_end-ts_start);
+    av_bprint_finalize(&buffer, NULL);
+    if (ret < 0)
+        return ret;
 
     *got_sub_ptr = sub->num_rects > 0;
     return avpkt->size;
 }
 
+#if CONFIG_SRT_DECODER
+/* deprecated decoder */
 AVCodec ff_srt_decoder = {
     .name         = "srt",
     .long_name    = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
     .type         = AVMEDIA_TYPE_SUBTITLE,
-    .id           = AV_CODEC_ID_SRT,
+    .id           = AV_CODEC_ID_SUBRIP,
+    .init         = ff_ass_subtitle_header_default,
+    .decode       = srt_decode_frame,
+};
+#endif
+
+#if CONFIG_SUBRIP_DECODER
+AVCodec ff_subrip_decoder = {
+    .name         = "subrip",
+    .long_name    = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_SUBRIP,
     .init         = ff_ass_subtitle_header_default,
     .decode       = srt_decode_frame,
 };
+#endif
diff --git a/libavcodec/srtenc.c b/libavcodec/srtenc.c
new file mode 100644
index 0000000000..328797089c
--- /dev/null
+++ b/libavcodec/srtenc.c
@@ -0,0 +1,295 @@
+/*
+ * SubRip subtitle encoder
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdarg.h>
+#include "avcodec.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+#include "ass_split.h"
+#include "ass.h"
+
+
+#define SRT_STACK_SIZE 64
+
+typedef struct {
+    AVCodecContext *avctx;
+    ASSSplitContext *ass_ctx;
+    AVBPrint buffer;
+    char stack[SRT_STACK_SIZE];
+    int stack_ptr;
+    int alignment_applied;
+} SRTContext;
+
+
+#ifdef __GNUC__
+__attribute__ ((__format__ (__printf__, 2, 3)))
+#endif
+static void srt_print(SRTContext *s, const char *str, ...)
+{
+    va_list vargs;
+    va_start(vargs, str);
+    av_vbprintf(&s->buffer, str, vargs);
+    va_end(vargs);
+}
+
+static int srt_stack_push(SRTContext *s, const char c)
+{
+    if (s->stack_ptr >= SRT_STACK_SIZE)
+        return -1;
+    s->stack[s->stack_ptr++] = c;
+    return 0;
+}
+
+static char srt_stack_pop(SRTContext *s)
+{
+    if (s->stack_ptr <= 0)
+        return 0;
+    return s->stack[--s->stack_ptr];
+}
+
+static int srt_stack_find(SRTContext *s, const char c)
+{
+    int i;
+    for (i = s->stack_ptr-1; i >= 0; i--)
+        if (s->stack[i] == c)
+            break;
+    return i;
+}
+
+static void srt_close_tag(SRTContext *s, char tag)
+{
+    srt_print(s, "</%c%s>", tag, tag == 'f' ? "ont" : "");
+}
+
+static void srt_stack_push_pop(SRTContext *s, const char c, int close)
+{
+    if (close) {
+        int i = c ? srt_stack_find(s, c) : 0;
+        if (i < 0)
+            return;
+        while (s->stack_ptr != i)
+            srt_close_tag(s, srt_stack_pop(s));
+    } else if (srt_stack_push(s, c) < 0)
+        av_log(s->avctx, AV_LOG_ERROR, "tag stack overflow\n");
+}
+
+static void srt_style_apply(SRTContext *s, const char *style)
+{
+    ASSStyle *st = ff_ass_style_get(s->ass_ctx, style);
+    if (st) {
+        int c = st->primary_color & 0xFFFFFF;
+        if (st->font_name && strcmp(st->font_name, ASS_DEFAULT_FONT) ||
+            st->font_size != ASS_DEFAULT_FONT_SIZE ||
+            c != ASS_DEFAULT_COLOR) {
+            srt_print(s, "<font");
+            if (st->font_name && strcmp(st->font_name, ASS_DEFAULT_FONT))
+                srt_print(s, " face=\"%s\"", st->font_name);
+            if (st->font_size != ASS_DEFAULT_FONT_SIZE)
+                srt_print(s, " size=\"%d\"", st->font_size);
+            if (c != ASS_DEFAULT_COLOR)
+                srt_print(s, " color=\"#%06x\"",
+                          (c & 0xFF0000) >> 16 | c & 0xFF00 | (c & 0xFF) << 16);
+            srt_print(s, ">");
+            srt_stack_push(s, 'f');
+        }
+        if (st->bold != ASS_DEFAULT_BOLD) {
+            srt_print(s, "<b>");
+            srt_stack_push(s, 'b');
+        }
+        if (st->italic != ASS_DEFAULT_ITALIC) {
+            srt_print(s, "<i>");
+            srt_stack_push(s, 'i');
+        }
+        if (st->underline != ASS_DEFAULT_UNDERLINE) {
+            srt_print(s, "<u>");
+            srt_stack_push(s, 'u');
+        }
+        if (st->alignment != ASS_DEFAULT_ALIGNMENT) {
+            srt_print(s, "{\\an%d}", st->alignment);
+            s->alignment_applied = 1;
+        }
+    }
+}
+
+
+static av_cold int srt_encode_init(AVCodecContext *avctx)
+{
+    SRTContext *s = avctx->priv_data;
+    s->avctx = avctx;
+    s->ass_ctx = ff_ass_split(avctx->subtitle_header);
+    av_bprint_init(&s->buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
+    return s->ass_ctx ? 0 : AVERROR_INVALIDDATA;
+}
+
+static void srt_text_cb(void *priv, const char *text, int len)
+{
+    SRTContext *s = priv;
+    av_bprint_append_data(&s->buffer, text, len);
+}
+
+static void srt_new_line_cb(void *priv, int forced)
+{
+    srt_print(priv, "\r\n");
+}
+
+static void srt_style_cb(void *priv, char style, int close)
+{
+    srt_stack_push_pop(priv, style, close);
+    if (!close)
+        srt_print(priv, "<%c>", style);
+}
+
+static void srt_color_cb(void *priv, unsigned int color, unsigned int color_id)
+{
+    if (color_id > 1)
+        return;
+    srt_stack_push_pop(priv, 'f', color == 0xFFFFFFFF);
+    if (color != 0xFFFFFFFF)
+        srt_print(priv, "<font color=\"#%06x\">",
+              (color & 0xFF0000) >> 16 | color & 0xFF00 | (color & 0xFF) << 16);
+}
+
+static void srt_font_name_cb(void *priv, const char *name)
+{
+    srt_stack_push_pop(priv, 'f', !name);
+    if (name)
+        srt_print(priv, "<font face=\"%s\">", name);
+}
+
+static void srt_font_size_cb(void *priv, int size)
+{
+    srt_stack_push_pop(priv, 'f', size < 0);
+    if (size >= 0)
+        srt_print(priv, "<font size=\"%d\">", size);
+}
+
+static void srt_alignment_cb(void *priv, int alignment)
+{
+    SRTContext *s = priv;
+    if (!s->alignment_applied && alignment >= 0) {
+        srt_print(s, "{\\an%d}", alignment);
+        s->alignment_applied = 1;
+    }
+}
+
+static void srt_cancel_overrides_cb(void *priv, const char *style)
+{
+    srt_stack_push_pop(priv, 0, 1);
+    srt_style_apply(priv, style);
+}
+
+static void srt_move_cb(void *priv, int x1, int y1, int x2, int y2,
+                        int t1, int t2)
+{
+    // TODO: add a AV_PKT_DATA_SUBTITLE_POSITION side data when a new subtitles
+    // encoding API passing the AVPacket is available.
+}
+
+static void srt_end_cb(void *priv)
+{
+    srt_stack_push_pop(priv, 0, 1);
+}
+
+static const ASSCodesCallbacks srt_callbacks = {
+    .text             = srt_text_cb,
+    .new_line         = srt_new_line_cb,
+    .style            = srt_style_cb,
+    .color            = srt_color_cb,
+    .font_name        = srt_font_name_cb,
+    .font_size        = srt_font_size_cb,
+    .alignment        = srt_alignment_cb,
+    .cancel_overrides = srt_cancel_overrides_cb,
+    .move             = srt_move_cb,
+    .end              = srt_end_cb,
+};
+
+static int srt_encode_frame(AVCodecContext *avctx,
+                            unsigned char *buf, int bufsize, const AVSubtitle *sub)
+{
+    SRTContext *s = avctx->priv_data;
+    ASSDialog *dialog;
+    int i, num;
+
+    av_bprint_clear(&s->buffer);
+
+    for (i=0; i<sub->num_rects; i++) {
+
+        if (sub->rects[i]->type != SUBTITLE_ASS) {
+            av_log(avctx, AV_LOG_ERROR, "Only SUBTITLE_ASS type supported.\n");
+            return AVERROR(ENOSYS);
+        }
+
+        dialog = ff_ass_split_dialog(s->ass_ctx, sub->rects[i]->ass, 0, &num);
+        for (; dialog && num--; dialog++) {
+            s->alignment_applied = 0;
+            srt_style_apply(s, dialog->style);
+            ff_ass_split_override_codes(&srt_callbacks, s, dialog->text);
+        }
+    }
+
+    if (!av_bprint_is_complete(&s->buffer))
+        return AVERROR(ENOMEM);
+    if (!s->buffer.len)
+        return 0;
+
+    if (s->buffer.len > bufsize) {
+        av_log(avctx, AV_LOG_ERROR, "Buffer too small for ASS event.\n");
+        return -1;
+    }
+    memcpy(buf, s->buffer.str, s->buffer.len);
+
+    return s->buffer.len;
+}
+
+static int srt_encode_close(AVCodecContext *avctx)
+{
+    SRTContext *s = avctx->priv_data;
+    ff_ass_split_free(s->ass_ctx);
+    av_bprint_finalize(&s->buffer, NULL);
+    return 0;
+}
+
+#if CONFIG_SRT_ENCODER
+/* deprecated encoder */
+AVCodec ff_srt_encoder = {
+    .name           = "srt",
+    .long_name      = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SUBRIP,
+    .priv_data_size = sizeof(SRTContext),
+    .init           = srt_encode_init,
+    .encode_sub     = srt_encode_frame,
+    .close          = srt_encode_close,
+};
+#endif
+
+#if CONFIG_SUBRIP_ENCODER
+AVCodec ff_subrip_encoder = {
+    .name           = "subrip",
+    .long_name      = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SUBRIP,
+    .priv_data_size = sizeof(SRTContext),
+    .init           = srt_encode_init,
+    .encode_sub     = srt_encode_frame,
+    .close          = srt_encode_close,
+};
+#endif
diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
index 5c1ec84c21..9efdffe8c6 100644
--- a/libavcodec/startcode.c
+++ b/libavcodec/startcode.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/startcode.h b/libavcodec/startcode.h
index f38ce54d6d..cfa02b0860 100644
--- a/libavcodec/startcode.h
+++ b/libavcodec/startcode.h
@@ -1,21 +1,27 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+/**
+ * @file
+ * Accelerated start code search function for start codes common to
+ * MPEG-1/2/4 video, VC-1, H.264/5
+ */
+
 #ifndef AVCODEC_STARTCODE_H
 #define AVCODEC_STARTCODE_H
 
diff --git a/libavcodec/subviewerdec.c b/libavcodec/subviewerdec.c
new file mode 100644
index 0000000000..a008828540
--- /dev/null
+++ b/libavcodec/subviewerdec.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SubViewer subtitle decoder
+ * @see https://en.wikipedia.org/wiki/SubViewer
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/bprint.h"
+
+static int subviewer_event_to_ass(AVBPrint *buf, const char *p)
+{
+    while (*p) {
+        if (!strncmp(p, "[br]", 4)) {
+            av_bprintf(buf, "\\N");
+            p += 4;
+        } else {
+            if (p[0] == '\n' && p[1])
+                av_bprintf(buf, "\\N");
+            else if (*p != '\n' && *p != '\r')
+                av_bprint_chars(buf, *p, 1);
+            p++;
+        }
+    }
+
+    return 0;
+}
+
+static int subviewer_decode_frame(AVCodecContext *avctx,
+                                  void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret = 0;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    AVBPrint buf;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    // note: no need to rescale pts & duration since they are in the same
+    // timebase as ASS (1/100)
+    if (ptr && avpkt->size > 0 && !subviewer_event_to_ass(&buf, ptr))
+        ret = ff_ass_add_rect_bprint(sub, &buf, avpkt->pts, avpkt->duration);
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+AVCodec ff_subviewer_decoder = {
+    .name           = "subviewer",
+    .long_name      = NULL_IF_CONFIG_SMALL("SubViewer subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SUBVIEWER,
+    .decode         = subviewer_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+};
diff --git a/libavcodec/sunrast.c b/libavcodec/sunrast.c
index 6a928bb2cf..25e11f6cd2 100644
--- a/libavcodec/sunrast.c
+++ b/libavcodec/sunrast.c
@@ -2,20 +2,20 @@
  * Sun Rasterfile (.sun/.ras/im{1,8,24}/.sunras) image decoder
  * Copyright (c) 2007, 2008 Ivo van Poorten
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
     const uint8_t *buf_end   = avpkt->data + avpkt->size;
     AVFrame * const p        = data;
     unsigned int w, h, depth, type, maptype, maplength, stride, x, y, len, alen;
-    uint8_t *ptr;
+    uint8_t *ptr, *ptr2 = NULL;
     const uint8_t *bufstart = buf;
     int ret;
 
@@ -53,7 +53,7 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
     maplength = AV_RB32(buf + 28);
     buf      += 32;
 
-    if (type == RT_FORMAT_TIFF || type == RT_FORMAT_IFF || type == RT_EXPERIMENTAL) {
+    if (type == RT_EXPERIMENTAL) {
         avpriv_request_sample(avctx, "TIFF/IFF/EXPERIMENTAL (compression) type");
         return AVERROR_PATCHWELCOME;
     }
@@ -70,10 +70,17 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
+    if (type == RT_FORMAT_TIFF || type == RT_FORMAT_IFF) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported (compression) type\n");
+        return -1;
+    }
 
     switch (depth) {
         case 1:
-            avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
+            avctx->pix_fmt = maplength ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_MONOWHITE;
+            break;
+        case 4:
+            avctx->pix_fmt = maplength ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_NONE;
             break;
         case 8:
             avctx->pix_fmt = maplength ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_GRAY8;
@@ -81,6 +88,9 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
         case 24:
             avctx->pix_fmt = (type == RT_FORMAT_RGB) ? AV_PIX_FMT_RGB24 : AV_PIX_FMT_BGR24;
             break;
+        case 32:
+            avctx->pix_fmt = (type == RT_FORMAT_RGB) ? AV_PIX_FMT_0RGB : AV_PIX_FMT_0BGR;
+            break;
         default:
             av_log(avctx, AV_LOG_ERROR, "invalid depth\n");
             return AVERROR_INVALIDDATA;
@@ -90,17 +100,15 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
     if (ret < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
 
     if (buf_end - buf < maplength)
         return AVERROR_INVALIDDATA;
 
-    if (depth != 8 && maplength) {
+    if (depth > 8 && maplength) {
         av_log(avctx, AV_LOG_WARNING, "useless colormap found or file is corrupted, trying to recover\n");
 
     } else if (maplength) {
@@ -113,13 +121,20 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
 
         ptr = p->data[1];
         for (x = 0; x < len; x++, ptr += 4)
-            *(uint32_t *)ptr = (buf[x] << 16) + (buf[len + x] << 8) + buf[len + len + x];
+            *(uint32_t *)ptr = (0xFFU<<24) + (buf[x]<<16) + (buf[len+x]<<8) + buf[len+len+x];
     }
 
     buf += maplength;
 
+    if (maplength && depth < 8) {
+        ptr = ptr2 = av_malloc_array((w + 15), h);
+        if (!ptr)
+            return AVERROR(ENOMEM);
+        stride = (w + 15 >> 3) * depth;
+    } else {
     ptr    = p->data[0];
     stride = p->linesize[0];
+    }
 
     /* scanlines are aligned on 16 bit boundaries */
     len  = (depth * w + 7) >> 3;
@@ -160,6 +175,30 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
             buf += alen;
         }
     }
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8 && depth < 8) {
+        uint8_t *ptr_free = ptr2;
+        ptr = p->data[0];
+        for (y=0; y<h; y++) {
+            for (x = 0; x < (w + 7 >> 3) * depth; x++) {
+                if (depth == 1) {
+                    ptr[8*x]   = ptr2[x] >> 7;
+                    ptr[8*x+1] = ptr2[x] >> 6 & 1;
+                    ptr[8*x+2] = ptr2[x] >> 5 & 1;
+                    ptr[8*x+3] = ptr2[x] >> 4 & 1;
+                    ptr[8*x+4] = ptr2[x] >> 3 & 1;
+                    ptr[8*x+5] = ptr2[x] >> 2 & 1;
+                    ptr[8*x+6] = ptr2[x] >> 1 & 1;
+                    ptr[8*x+7] = ptr2[x]      & 1;
+                } else {
+                    ptr[2*x]   = ptr2[x] >> 4;
+                    ptr[2*x+1] = ptr2[x] & 0xF;
+                }
+            }
+            ptr  += p->linesize[0];
+            ptr2 += (w + 15 >> 3) * depth;
+        }
+        av_freep(&ptr_free);
+    }
 
     *got_frame = 1;
 
diff --git a/libavcodec/sunrast.h b/libavcodec/sunrast.h
index d9fe307b67..d162e63b45 100644
--- a/libavcodec/sunrast.h
+++ b/libavcodec/sunrast.h
@@ -2,20 +2,20 @@
  * Sun Rasterfile Image Format
  * Copyright (c) 2007, 2008 Ivo van Poorten
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sunrastenc.c b/libavcodec/sunrastenc.c
index ddf624eee2..d83a42dd0f 100644
--- a/libavcodec/sunrastenc.c
+++ b/libavcodec/sunrastenc.c
@@ -2,20 +2,20 @@
  * Sun Rasterfile (.sun/.ras/im{1,8,24}/.sunras) image encoder
  * Copyright (c) 2012 Aneesh Dogra (lionaneesh) <lionaneesh@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -55,7 +55,7 @@ static void sunrast_image_write_image(AVCodecContext *avctx,
 {
     SUNRASTContext *s = avctx->priv_data;
     const uint8_t *ptr;
-    int len, alen, x;
+    int len, alen, x, y;
 
     if (s->maplength) {     // palettized
         PutByteContext pb_r, pb_g;
@@ -82,33 +82,29 @@ static void sunrast_image_write_image(AVCodecContext *avctx,
      if (s->type == RT_BYTE_ENCODED) {
         uint8_t value, value2;
         int run;
-        const uint8_t *start = linesize < 0 ? pixels + (avctx->height - 1) * linesize
-                                            : pixels;
-        const uint8_t *end   = linesize < 0 ? pixels - linesize
-                                            : pixels + avctx->height * linesize;
 
         ptr = pixels;
 
-#define GET_VALUE ptr >= end || ptr < start ? 0 : x >= len ? ptr[len-1] : ptr[x]
+#define GET_VALUE y >= avctx->height ? 0 : x >= len ? ptr[len-1] : ptr[x]
 
-        x = 0;
+        x = 0, y = 0;
         value2 = GET_VALUE;
-        while (ptr < end && ptr >= start) {
+        while (y < avctx->height) {
             run = 1;
             value = value2;
             x++;
             if (x >= alen) {
                 x = 0;
-                ptr += linesize;
+                ptr += linesize, y++;
             }
 
             value2 = GET_VALUE;
-            while (value2 == value && run < 256 && ptr < end && ptr >= start) {
+            while (value2 == value && run < 256 && y < avctx->height) {
                 x++;
                 run++;
                 if (x >= alen) {
                     x = 0;
-                    ptr += linesize;
+                    ptr += linesize, y++;
                 }
                 value2 = GET_VALUE;
             }
@@ -127,7 +123,6 @@ static void sunrast_image_write_image(AVCodecContext *avctx,
         // update data length for header
         s->length = bytestream2_tell_p(&s->p) - 32 - s->maplength;
     } else {
-        int y;
         for (y = 0; y < avctx->height; y++) {
             bytestream2_put_buffer(&s->p, ptr, len);
             if (len < alen)
@@ -153,12 +148,6 @@ static av_cold int sunrast_encode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->key_frame = 1;
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
     s->maptype                    = RMT_NONE;
     s->maplength                  = 0;
 
@@ -192,7 +181,7 @@ static int sunrast_encode_frame(AVCodecContext *avctx,  AVPacket *avpkt,
     SUNRASTContext *s = avctx->priv_data;
     int ret;
 
-    if ((ret = ff_alloc_packet(avpkt, s->size)) < 0)
+    if ((ret = ff_alloc_packet2(avctx, avpkt, s->size, 0)) < 0)
         return ret;
 
     bytestream2_init_writer(&s->p, avpkt->data, avpkt->size);
diff --git a/libavcodec/svq1.c b/libavcodec/svq1.c
index 545df80974..909b0bf5cc 100644
--- a/libavcodec/svq1.c
+++ b/libavcodec/svq1.c
@@ -3,25 +3,25 @@
  * ported to MPlayer by Arpi <arpi@thot.banki.hu>
  * ported to libavcodec by Nick Kurshev <nickols_k@mail.ru>
  *
- * Copyright (C) 2002 the xine project
- * Copyright (C) 2002 the ffmpeg project
+ * Copyright (c) 2002 The Xine Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
  * SVQ1 Encoder (c) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq1.h b/libavcodec/svq1.h
index 70b5c37be2..88d61d1e39 100644
--- a/libavcodec/svq1.h
+++ b/libavcodec/svq1.h
@@ -3,25 +3,25 @@
  * ported to MPlayer by Arpi <arpi@thot.banki.hu>
  * ported to libavcodec by Nick Kurshev <nickols_k@mail.ru>
  *
- * Copyright (C) 2002 the xine project
- * Copyright (C) 2002 the ffmpeg project
+ * Copyright (c) 2002 The Xine Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
  * SVQ1 Encoder (c) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq13.c b/libavcodec/svq13.c
index e0d21544d9..b821a446d6 100644
--- a/libavcodec/svq13.c
+++ b/libavcodec/svq13.c
@@ -1,20 +1,20 @@
 /*
  * SVQ1/SVQ3 decoder common code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq1_cb.h b/libavcodec/svq1_cb.h
index e22cd60e23..41485d2af4 100644
--- a/libavcodec/svq1_cb.h
+++ b/libavcodec/svq1_cb.h
@@ -3,23 +3,23 @@
  * ported to MPlayer by Arpi <arpi@thot.banki.hu>
  * ported to libavcodec by Nick Kurshev <nickols_k@mail.ru>
  *
- * Copyright (C) 2002 the xine project
- * Copyright (C) 2002 the ffmpeg project
+ * Copyright (c) 2002 The Xine Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq1_vlc.h b/libavcodec/svq1_vlc.h
index 834279dac4..f5d298dfa1 100644
--- a/libavcodec/svq1_vlc.h
+++ b/libavcodec/svq1_vlc.h
@@ -1,20 +1,20 @@
 /*
- * copyright (C) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq1dec.c b/libavcodec/svq1dec.c
index 4f84395891..dca99fae4c 100644
--- a/libavcodec/svq1dec.c
+++ b/libavcodec/svq1dec.c
@@ -3,25 +3,25 @@
  * ported to MPlayer by Arpi <arpi@thot.banki.hu>
  * ported to libavcodec by Nick Kurshev <nickols_k@mail.ru>
  *
- * Copyright (C) 2002 the xine project
- * Copyright (C) 2002 the ffmpeg project
+ * Copyright (c) 2002 The Xine Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
  * SVQ1 Encoder (c) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,9 +40,6 @@
 #include "mathops.h"
 #include "svq1.h"
 
-#undef NDEBUG
-#include <assert.h>
-
 static VLC svq1_block_type;
 static VLC svq1_motion_component;
 static VLC svq1_intra_multistage[6];
@@ -114,12 +111,11 @@ static const uint8_t string_table[256] = {
                 break;                                                  \
         }                                                               \
         /* divide block if next bit set */                              \
-        if (get_bits1(bitbuf) == 0)                                     \
+        if (!get_bits1(bitbuf))                                         \
             break;                                                      \
         /* add child nodes */                                           \
         list[n++] = list[i];                                            \
-        list[n++] = list[i] +                                           \
-                    (((level & 1) ? pitch : 1) << (level / 2 + 1));     \
+        list[n++] = list[i] + (((level & 1) ? pitch : 1) << ((level >> 1) + 1));\
     }
 
 #define SVQ1_ADD_CODEBOOK()                                             \
@@ -155,7 +151,7 @@ static const uint8_t string_table[256] = {
                       16 * j) << (level + 1);                           \
     }                                                                   \
     mean -= stages * 128;                                               \
-    n4    = mean + (mean >> 31) << 16 | (mean & 0xFFFF);
+    n4    = (mean << 16) + mean;
 
 static int svq1_decode_block_intra(GetBitContext *bitbuf, uint8_t *pixels,
                                    int pitch)
@@ -166,7 +162,8 @@ static int svq1_decode_block_intra(GetBitContext *bitbuf, uint8_t *pixels,
     const uint32_t *codebook;
     int entries[6];
     int i, j, m, n;
-    int mean, stages;
+    int stages;
+    unsigned mean;
     unsigned x, y, width, height, level;
     uint32_t n1, n2, n3, n4;
 
@@ -191,12 +188,13 @@ static int svq1_decode_block_intra(GetBitContext *bitbuf, uint8_t *pixels,
             continue;   /* skip vector */
         }
 
-        if ((stages > 0 && level >= 4) || stages < 0) {
+        if ((stages > 0 && level >= 4)) {
             ff_dlog(NULL,
                     "Error (svq1_decode_block_intra): invalid vector: stages=%i level=%i\n",
                     stages, level);
             return AVERROR_INVALIDDATA;  /* invalid vector */
         }
+        av_assert0(stages >= 0);
 
         mean = get_vlc2(bitbuf, svq1_intra_mean.table, 8, 3);
 
@@ -231,7 +229,8 @@ static int svq1_decode_block_non_intra(GetBitContext *bitbuf, uint8_t *pixels,
     const uint32_t *codebook;
     int entries[6];
     int i, j, m, n;
-    int mean, stages;
+    int stages;
+    unsigned mean;
     int x, y, width, height, level;
     uint32_t n1, n2, n3, n4;
 
@@ -253,12 +252,13 @@ static int svq1_decode_block_non_intra(GetBitContext *bitbuf, uint8_t *pixels,
         if (stages == -1)
             continue;           /* skip vector */
 
-        if ((stages > 0 && level >= 4) || stages < 0) {
+        if ((stages > 0 && level >= 4)) {
             ff_dlog(NULL,
                     "Error (svq1_decode_block_non_intra): invalid vector: stages=%i level=%i\n",
                     stages, level);
             return AVERROR_INVALIDDATA;  /* invalid vector */
         }
+        av_assert0(stages >= 0);
 
         mean = get_vlc2(bitbuf, svq1_inter_mean.table, 9, 3) - 256;
 
@@ -345,8 +345,7 @@ static int svq1_motion_inter_block(HpelDSPContext *hdsp, GetBitContext *bitbuf,
     }
 
     result = svq1_decode_motion_vector(bitbuf, &mv, pmv);
-
-    if (result != 0)
+    if (result)
         return result;
 
     motion[0].x         =
@@ -389,8 +388,7 @@ static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, GetBitContext *bitbu
     }
 
     result = svq1_decode_motion_vector(bitbuf, &mv, pmv);
-
-    if (result != 0)
+    if (result)
         return result;
 
     /* predict and decode motion vector (1) */
@@ -402,8 +400,7 @@ static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, GetBitContext *bitbu
         pmv[1] = &motion[(x / 8) + 3];
     }
     result = svq1_decode_motion_vector(bitbuf, &motion[0], pmv);
-
-    if (result != 0)
+    if (result)
         return result;
 
     /* predict and decode motion vector (2) */
@@ -411,8 +408,7 @@ static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, GetBitContext *bitbu
     pmv[2] = &motion[(x / 8) + 1];
 
     result = svq1_decode_motion_vector(bitbuf, &motion[(x / 8) + 2], pmv);
-
-    if (result != 0)
+    if (result)
         return result;
 
     /* predict and decode motion vector (3) */
@@ -420,8 +416,7 @@ static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, GetBitContext *bitbu
     pmv[3] = &motion[(x / 8) + 3];
 
     result = svq1_decode_motion_vector(bitbuf, pmv[3], pmv);
-
-    if (result != 0)
+    if (result)
         return result;
 
     /* form predictions */
@@ -505,7 +500,7 @@ static int svq1_decode_delta_block(AVCodecContext *avctx, HpelDSPContext *hdsp,
     return result;
 }
 
-static void svq1_parse_string(GetBitContext *bitbuf, uint8_t *out)
+static void svq1_parse_string(GetBitContext *bitbuf, uint8_t out[257])
 {
     uint8_t seed;
     int i;
@@ -517,6 +512,7 @@ static void svq1_parse_string(GetBitContext *bitbuf, uint8_t *out)
         out[i] = get_bits(bitbuf, 8) ^ seed;
         seed   = string_table[out[i] ^ seed];
     }
+    out[i] = 0;
 }
 
 static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
@@ -524,6 +520,8 @@ static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
     SVQ1Context *s = avctx->priv_data;
     GetBitContext *bitbuf = &s->gb;
     int frame_size_code;
+    int width  = s->width;
+    int height = s->height;
 
     skip_bits(bitbuf, 8); /* temporal_reference */
 
@@ -557,12 +555,12 @@ static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
         }
 
         if ((s->frame_code ^ 0x10) >= 0x50) {
-            uint8_t msg[256];
+            uint8_t msg[257];
 
             svq1_parse_string(bitbuf, msg);
 
             av_log(avctx, AV_LOG_INFO,
-                   "embedded message: \"%s\"\n", (char *)msg);
+                   "embedded message:\n%s\n", ((char *)msg) + 1);
         }
 
         skip_bits(bitbuf, 2);
@@ -574,20 +572,20 @@ static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
 
         if (frame_size_code == 7) {
             /* load width, height (12 bits each) */
-            s->width  = get_bits(bitbuf, 12);
-            s->height = get_bits(bitbuf, 12);
+            width  = get_bits(bitbuf, 12);
+            height = get_bits(bitbuf, 12);
 
-            if (!s->width || !s->height)
+            if (!width || !height)
                 return AVERROR_INVALIDDATA;
         } else {
             /* get width, height from table */
-            s->width  = ff_svq1_frame_size_table[frame_size_code][0];
-            s->height = ff_svq1_frame_size_table[frame_size_code][1];
+            width  = ff_svq1_frame_size_table[frame_size_code][0];
+            height = ff_svq1_frame_size_table[frame_size_code][1];
         }
     }
 
     /* unknown fields */
-    if (get_bits1(bitbuf) == 1) {
+    if (get_bits1(bitbuf)) {
         skip_bits1(bitbuf);    /* use packet checksum if (1) */
         skip_bits1(bitbuf);    /* component checksums after image data if (1) */
 
@@ -595,16 +593,18 @@ static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
             return AVERROR_INVALIDDATA;
     }
 
-    if (get_bits1(bitbuf) == 1) {
+    if (get_bits1(bitbuf)) {
         skip_bits1(bitbuf);
         skip_bits(bitbuf, 4);
         skip_bits1(bitbuf);
         skip_bits(bitbuf, 2);
 
-        while (get_bits1(bitbuf) == 1)
-            skip_bits(bitbuf, 8);
+        if (skip_1stop_8data_bits(bitbuf) < 0)
+            return AVERROR_INVALIDDATA;
     }
 
+    s->width  = width;
+    s->height = height;
     return 0;
 }
 
@@ -618,9 +618,12 @@ static int svq1_decode_frame(AVCodecContext *avctx, void *data,
     uint8_t *current;
     int result, i, x, y, width, height;
     svq1_pmv *pmv;
+    int ret;
 
     /* initialize bit buffer */
-    init_get_bits(&s->gb, buf, buf_size * 8);
+    ret = init_get_bits8(&s->gb, buf, buf_size);
+    if (ret < 0)
+        return ret;
 
     /* decode frame header */
     s->frame_code = get_bits(&s->gb, 22);
@@ -655,7 +658,6 @@ static int svq1_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     result = svq1_decode_frame_header(avctx, cur);
-
     if (result != 0) {
         ff_dlog(avctx, "Error in svq1_decode_frame_header %i\n", result);
         return result;
@@ -700,8 +702,8 @@ static int svq1_decode_frame(AVCodecContext *avctx, void *data,
                 for (x = 0; x < width; x += 16) {
                     result = svq1_decode_block_intra(&s->gb, &current[x],
                                                      linesize);
-                    if (result != 0) {
-                        av_log(avctx, AV_LOG_INFO,
+                    if (result) {
+                        av_log(avctx, AV_LOG_ERROR,
                                "Error in svq1_decode_block %i (keyframe)\n",
                                result);
                         goto err;
@@ -819,6 +821,7 @@ static av_cold int svq1_decode_end(AVCodecContext *avctx)
 
     av_frame_free(&s->prev);
     av_freep(&s->pkt_swapped);
+    s->pkt_swapped_allocated = 0;
 
     return 0;
 }
diff --git a/libavcodec/svq1enc.c b/libavcodec/svq1enc.c
index a869c2478b..1e1745e7b1 100644
--- a/libavcodec/svq1enc.c
+++ b/libavcodec/svq1enc.c
@@ -2,20 +2,20 @@
  * SVQ1 Encoder
  * Copyright (C) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,9 +36,8 @@
 #include "svq1.h"
 #include "svq1enc.h"
 #include "svq1enc_cb.h"
+#include "libavutil/avassert.h"
 
-#undef NDEBUG
-#include <assert.h>
 
 static void svq1_write_header(SVQ1EncContext *s, int frame_type)
 {
@@ -59,7 +58,7 @@ static void svq1_write_header(SVQ1EncContext *s, int frame_type)
         /* output 5 unknown bits (2 + 2 + 1) */
         put_bits(&s->pb, 5, 2); /* 2 needed by quicktime decoder */
 
-        i = ff_match_2uint16(ff_svq1_frame_size_table,
+        i = ff_match_2uint16((void*)ff_svq1_frame_size_table,
                              FF_ARRAY_ELEMS(ff_svq1_frame_size_table),
                              s->frame_width, s->frame_height);
         put_bits(&s->pb, 3, i);
@@ -78,7 +77,7 @@ static void svq1_write_header(SVQ1EncContext *s, int frame_type)
 #define THRESHOLD_MULTIPLIER 0.6
 
 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
-                               int size)
+                               intptr_t size)
 {
     int score = 0, i;
 
@@ -97,7 +96,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
     int w            = 2 << (level + 2 >> 1);
     int h            = 2 << (level + 1 >> 1);
     int size         = w * h;
-    int16_t block[7][256];
+    int16_t (*block)[256] = s->encoded_block_levels[level];
     const int8_t *codebook_sum, *codebook;
     const uint16_t(*mean_vlc)[2];
     const uint8_t(*multistage_vlc)[2];
@@ -153,7 +152,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
                 score  = sqr - (diff * (int64_t)diff >> (level + 3)); // FIXME: 64bit slooow
                 if (score < best_vector_score) {
                     int mean = diff + (size >> 1) >> (level + 3);
-                    assert(mean > -300 && mean < 300);
+                    av_assert2(mean > -300 && mean < 300);
                     mean               = av_clip(mean, intra ? 0 : -256, 255);
                     best_vector_score  = score;
                     best_vector[stage] = i;
@@ -161,7 +160,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
                     best_vector_mean   = mean;
                 }
             }
-            assert(best_vector_mean != -999);
+            av_assert0(best_vector_mean != -999);
             vector = codebook + stage * size * 16 + best_vector[stage] * size;
             for (j = 0; j < size; j++)
                 block[stage + 1][j] = block[stage][j] - vector[j];
@@ -205,10 +204,10 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
         put_bits(&s->reorder_pb[level], 1, split);
 
     if (!split) {
-        assert(best_mean >= 0 && best_mean < 256 || !intra);
-        assert(best_mean >= -256 && best_mean < 256);
-        assert(best_count >= 0 && best_count < 7);
-        assert(level < 4 || best_count == 0);
+        av_assert1(best_mean >= 0 && best_mean < 256 || !intra);
+        av_assert1(best_mean >= -256 && best_mean < 256);
+        av_assert1(best_count >= 0 && best_count < 7);
+        av_assert1(level < 4 || best_count == 0);
 
         /* output the encoding */
         put_bits(&s->reorder_pb[level],
@@ -218,7 +217,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
                  mean_vlc[best_mean][0]);
 
         for (i = 0; i < best_count; i++) {
-            assert(best_vector[i] >= 0 && best_vector[i] < 16);
+            av_assert2(best_vector[i] >= 0 && best_vector[i] < 16);
             put_bits(&s->reorder_pb[level], 4, best_vector[i]);
         }
 
@@ -232,6 +231,15 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
     return best_score;
 }
 
+static void init_block_index(MpegEncContext *s){
+    s->block_index[0]= s->b8_stride*(s->mb_y*2    )     + s->mb_x*2;
+    s->block_index[1]= s->b8_stride*(s->mb_y*2    ) + 1 + s->mb_x*2;
+    s->block_index[2]= s->b8_stride*(s->mb_y*2 + 1)     + s->mb_x*2;
+    s->block_index[3]= s->b8_stride*(s->mb_y*2 + 1) + 1 + s->mb_x*2;
+    s->block_index[4]= s->mb_stride*(s->mb_y + 1)                + s->b8_stride*s->mb_height*2 + s->mb_x;
+    s->block_index[5]= s->mb_stride*(s->mb_y + s->mb_height + 2) + s->b8_stride*s->mb_height*2 + s->mb_x;
+}
+
 static int svq1_encode_plane(SVQ1EncContext *s, int plane,
                              unsigned char *src_plane,
                              unsigned char *ref_plane,
@@ -243,7 +251,7 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
     int block_width, block_height;
     int level;
     int threshold[6];
-    uint8_t *src     = s->scratchbuf + stride * 16;
+    uint8_t *src     = s->scratchbuf + stride * 32;
     const int lambda = (s->quality * s->quality) >>
                        (2 * FF_LAMBDA_SHIFT);
 
@@ -340,8 +348,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             for (x = 0; x < block_width; x++) {
                 s->m.mb_x = x;
-                ff_init_block_index(&s->m);
-                ff_update_block_index(&s->m);
+                init_block_index(&s->m);
 
                 ff_estimate_p_frame_motion(&s->m, x, y);
             }
@@ -366,8 +373,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
         s->m.mb_y = y;
         for (x = 0; x < block_width; x++) {
-            uint8_t reorder_buffer[3][6][7 * 32];
-            int count[3][6];
+            uint8_t reorder_buffer[2][6][7 * 32];
+            int count[2][6];
             int offset       = y * 16 * stride + x * 16;
             uint8_t *decoded = decoded_plane + offset;
             uint8_t *ref     = ref_plane + offset;
@@ -381,8 +388,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             }
 
             s->m.mb_x = x;
-            ff_init_block_index(&s->m);
-            ff_update_block_index(&s->m);
+            init_block_index(&s->m);
 
             if (s->pict_type == AV_PICTURE_TYPE_I ||
                 (s->m.mb_type[x + y * s->m.mb_stride] &
@@ -423,23 +429,23 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     s->m.pb = s->reorder_pb[5];
                     mx      = motion_ptr[0];
                     my      = motion_ptr[1];
-                    assert(mx     >= -32 && mx     <= 31);
-                    assert(my     >= -32 && my     <= 31);
-                    assert(pred_x >= -32 && pred_x <= 31);
-                    assert(pred_y >= -32 && pred_y <= 31);
-                    ff_h263_encode_motion(&s->m, mx - pred_x, 1);
-                    ff_h263_encode_motion(&s->m, my - pred_y, 1);
+                    av_assert1(mx     >= -32 && mx     <= 31);
+                    av_assert1(my     >= -32 && my     <= 31);
+                    av_assert1(pred_x >= -32 && pred_x <= 31);
+                    av_assert1(pred_y >= -32 && pred_y <= 31);
+                    ff_h263_encode_motion(&s->m.pb, mx - pred_x, 1);
+                    ff_h263_encode_motion(&s->m.pb, my - pred_y, 1);
                     s->reorder_pb[5] = s->m.pb;
                     score[1]        += lambda * put_bits_count(&s->reorder_pb[5]);
 
                     dxy = (mx & 1) + 2 * (my & 1);
 
-                    s->hdsp.put_pixels_tab[0][dxy](temp + 16,
+                    s->hdsp.put_pixels_tab[0][dxy](temp + 16*stride,
                                                    ref + (mx >> 1) +
                                                    stride * (my >> 1),
                                                    stride, 16);
 
-                    score[1] += encode_block(s, src + 16 * x, temp + 16,
+                    score[1] += encode_block(s, src + 16 * x, temp + 16*stride,
                                              decoded, stride, 5, 64, lambda, 0);
                     best      = score[1] <= score[0];
 
@@ -450,8 +456,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     if (score[2] < score[best] && mx == 0 && my == 0) {
                         best = 2;
                         s->hdsp.put_pixels_tab[0][0](decoded, ref, stride, 16);
-                        for (i = 0; i < 6; i++)
-                            count[2][i] = 0;
                         put_bits(&s->pb, vlc[1], vlc[0]);
                     }
                 }
@@ -475,6 +479,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             s->rd_total += score[best];
 
+            if (best != 2)
             for (i = 5; i >= 0; i--)
                 avpriv_copy_bits(&s->pb, reorder_buffer[best][i],
                                  count[best][i]);
@@ -521,6 +526,11 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx)
     SVQ1EncContext *const s = avctx->priv_data;
     int ret;
 
+    if (avctx->width >= 4096 || avctx->height >= 4096) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions too large, maximum is 4095x4095\n");
+        return AVERROR(EINVAL);
+    }
+
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
     ff_me_cmp_init(&s->mecc, avctx);
     ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx);
@@ -582,14 +592,10 @@ static int svq1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 {
     SVQ1EncContext *const s = avctx->priv_data;
     int i, ret;
-    uint8_t *sd;
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, s->y_block_width * s->y_block_height *
-                             MAX_MB_BYTES * 3 + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->y_block_width * s->y_block_height *
+                             MAX_MB_BYTES*3 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     if (avctx->pix_fmt != AV_PIX_FMT_YUV410P) {
         av_log(avctx, AV_LOG_ERROR, "unsupported pixel format\n");
@@ -597,9 +603,9 @@ static int svq1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
 
     if (!s->current_picture->data[0]) {
-        ret = ff_get_buffer(avctx, s->current_picture, 0);
-        if (ret < 0)
+        if ((ret = ff_get_buffer(avctx, s->current_picture, 0)) < 0) {
             return ret;
+        }
     }
     if (!s->last_picture->data[0]) {
         ret = ff_get_buffer(avctx, s->last_picture, 0);
@@ -607,7 +613,7 @@ static int svq1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             return ret;
     }
     if (!s->scratchbuf) {
-        s->scratchbuf = av_malloc(s->current_picture->linesize[0] * 16 * 2);
+        s->scratchbuf = av_malloc_array(s->current_picture->linesize[0], 16 * 3);
         if (!s->scratchbuf)
             return AVERROR(ENOMEM);
     }
@@ -629,10 +635,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR, sizeof(int));
-    if (!sd)
-        return AVERROR(ENOMEM);
-    *(int *)sd = pict->quality;
+    ff_side_data_set_encoder_stats(pkt, pict->quality, NULL, 0, s->pict_type);
 
     svq1_write_header(s, s->pict_type);
     for (i = 0; i < 3; i++)
diff --git a/libavcodec/svq1enc.h b/libavcodec/svq1enc.h
index 62e8bb2699..37f05a0460 100644
--- a/libavcodec/svq1enc.h
+++ b/libavcodec/svq1enc.h
@@ -1,20 +1,20 @@
 /*
  * SVQ1 encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -63,6 +63,8 @@ typedef struct SVQ1EncContext {
     int c_block_width;
     int c_block_height;
 
+    DECLARE_ALIGNED(16, int16_t, encoded_block_levels)[6][7][256];
+
     uint16_t *mb_type;
     uint32_t *dummy;
     int16_t (*motion_val8[3])[2];
@@ -75,7 +77,7 @@ typedef struct SVQ1EncContext {
     int motion_est;
 
     int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
-                             int size);
+                             intptr_t size);
 } SVQ1EncContext;
 
 void ff_svq1enc_init_ppc(SVQ1EncContext *c);
diff --git a/libavcodec/svq1enc_cb.h b/libavcodec/svq1enc_cb.h
index a5cd179442..1edb4ecf99 100644
--- a/libavcodec/svq1enc_cb.h
+++ b/libavcodec/svq1enc_cb.h
@@ -2,20 +2,20 @@
  * SVQ1 Encoder
  * Copyright (C) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c
index 80bc46abdb..57205c6ad1 100644
--- a/libavcodec/svq3.c
+++ b/libavcodec/svq3.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2003 The Libav Project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,7 +37,7 @@
  *
  * You will know you have these parameters passed correctly when the decoder
  * correctly decodes this file:
- *  http://samples.libav.org/V-codecs/SVQ3/Vertical400kbit.sorenson3.mov
+ *  http://samples.mplayerhq.hu/V-codecs/SVQ3/Vertical400kbit.sorenson3.mov
  */
 
 #include <inttypes.h>
@@ -55,6 +55,7 @@
 #include "hpeldsp.h"
 #include "rectangle.h"
 #include "tpeldsp.h"
+#include "vdpau_internal.h"
 
 #if CONFIG_ZLIB
 #include <zlib.h>
@@ -77,9 +78,11 @@ typedef struct SVQ3Context {
     H264Picture *last_pic;
     int halfpel_flag;
     int thirdpel_flag;
-    int unknown_flag;
+    int has_watermark;
     int next_slice_index;
     uint32_t watermark_key;
+    uint8_t *buf;
+    int buf_size;
     int adaptive_quant;
     int next_p_frame_damaged;
     int h_edge_pos;
@@ -159,6 +162,8 @@ static const uint32_t svq3_dequant_coeff[32] = {
     61694, 68745, 77615, 89113, 100253, 109366, 126635, 141533
 };
 
+static int svq3_decode_end(AVCodecContext *avctx);
+
 void ff_svq3_luma_dc_dequant_idct_c(int16_t *output, int16_t *input, int qp)
 {
     const int qmul = svq3_dequant_coeff[qp];
@@ -240,14 +245,17 @@ static inline int svq3_decode_block(GetBitContext *gb, int16_t *block,
     static const uint8_t *const scan_patterns[4] =
     { luma_dc_zigzag_scan, zigzag_scan, svq3_scan, chroma_dc_scan };
 
-    int run, level, limit;
+    int run, level, sign, limit;
     unsigned vlc;
     const int intra           = 3 * type >> 2;
     const uint8_t *const scan = scan_patterns[type];
 
     for (limit = (16 >> intra); index < 16; index = limit, limit += 8) {
         for (; (vlc = svq3_get_ue_golomb(gb)) != 0; index++) {
-            int sign = (vlc & 1) ? 0 : -1;
+            if ((int32_t)vlc < 0)
+                return -1;
+
+            sign     = (vlc & 1) ? 0 : -1;
             vlc      = vlc + 1 >> 1;
 
             if (type == 3) {
@@ -262,20 +270,19 @@ static inline int svq3_decode_block(GetBitContext *gb, int16_t *block,
                     level = (vlc + 9 >> 2) - run;
                 }
             } else {
-                if (vlc < 16) {
+                if (vlc < 16U) {
                     run   = svq3_dct_tables[intra][vlc].run;
                     level = svq3_dct_tables[intra][vlc].level;
                 } else if (intra) {
                     run   = vlc & 0x7;
-                    level = (vlc >> 3) +
-                            ((run == 0) ? 8 : ((run < 2) ? 2 : ((run < 5) ? 0 : -1)));
+                    level = (vlc >> 3) + ((run == 0) ? 8 : ((run < 2) ? 2 : ((run < 5) ? 0 : -1)));
                 } else {
                     run   = vlc & 0xF;
-                    level = (vlc >> 4) +
-                            ((run == 0) ? 4 : ((run < 3) ? 2 : ((run < 10) ? 1 : 0)));
+                    level = (vlc >> 4) + ((run == 0) ? 4 : ((run < 3) ? 2 : ((run < 10) ? 1 : 0)));
                 }
             }
 
+
             if ((index += run) >= limit)
                 return -1;
 
@@ -620,7 +627,7 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
             for (i = 0; i < 16; i += 2) {
                 vlc = svq3_get_ue_golomb(&h->gb);
 
-                if (vlc >= 25) {
+                if (vlc >= 25U) {
                     av_log(h->avctx, AV_LOG_ERROR,
                            "luma prediction:%"PRIu32"\n", vlc);
                     return -1;
@@ -690,7 +697,7 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
 
     if (!IS_INTRA16x16(mb_type) &&
         (!IS_SKIP(mb_type) || h->pict_type == AV_PICTURE_TYPE_B)) {
-        if ((vlc = svq3_get_ue_golomb(&h->gb)) >= 48) {
+        if ((vlc = svq3_get_ue_golomb(&h->gb)) >= 48U){
             av_log(h->avctx, AV_LOG_ERROR, "cbp_vlc=%"PRIu32"\n", vlc);
             return -1;
         }
@@ -807,8 +814,8 @@ static int svq3_decode_slice_header(AVCodecContext *avctx)
                     header ^ s->watermark_key);
         }
         if (length > 0) {
-            memcpy((uint8_t *) &h->gb.buffer[get_bits_count(&h->gb) >> 3],
-                   &h->gb.buffer[h->gb.size_in_bits >> 3], length - 1);
+            memmove((uint8_t *) &h->gb.buffer[get_bits_count(&h->gb) >> 3],
+                    &h->gb.buffer[h->gb.size_in_bits >> 3], length - 1);
         }
         skip_bits_long(&h->gb, 0);
     }
@@ -836,14 +843,14 @@ static int svq3_decode_slice_header(AVCodecContext *avctx)
     /* unknown fields */
     skip_bits1(&h->gb);
 
-    if (s->unknown_flag)
+    if (s->has_watermark)
         skip_bits1(&h->gb);
 
     skip_bits1(&h->gb);
     skip_bits(&h->gb, 2);
 
-    while (get_bits1(&h->gb))
-        skip_bits(&h->gb, 8);
+    if (skip_1stop_8data_bits(&h->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     /* reset intra predictors and invalidate motion vector references */
     if (sl->mb_x > 0) {
@@ -873,15 +880,14 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     unsigned char *extradata_end;
     unsigned int size;
     int marker_found = 0;
+    int ret;
 
     s->cur_pic  = av_mallocz(sizeof(*s->cur_pic));
     s->last_pic = av_mallocz(sizeof(*s->last_pic));
     s->next_pic = av_mallocz(sizeof(*s->next_pic));
     if (!s->next_pic || !s->last_pic || !s->cur_pic) {
-        av_freep(&s->cur_pic);
-        av_freep(&s->last_pic);
-        av_freep(&s->next_pic);
-        return AVERROR(ENOMEM);
+        ret = AVERROR(ENOMEM);
+        goto fail;
     }
 
     s->cur_pic->f  = av_frame_alloc();
@@ -890,21 +896,23 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     if (!s->cur_pic->f || !s->last_pic->f || !s->next_pic->f)
         return AVERROR(ENOMEM);
 
-    if (ff_h264_decode_init(avctx) < 0)
-        return -1;
+    if ((ret = ff_h264_decode_init(avctx)) < 0)
+        goto fail;
 
     // we will overwrite it later during decoding
     av_frame_free(&h->cur_pic.f);
 
+    av_frame_free(&h->last_pic_for_ec.f);
+
     ff_h264dsp_init(&h->h264dsp, 8, 1);
-    ff_h264chroma_init(&h->h264chroma, 8);
-    ff_h264qpel_init(&h->h264qpel, 8);
+    av_assert0(h->sps.bit_depth_chroma == 0);
     ff_h264_pred_init(&h->hpc, AV_CODEC_ID_SVQ3, 8, 1);
     ff_videodsp_init(&h->vdsp, 8);
 
     memset(h->pps.scaling_matrix4, 16, 6 * 16 * sizeof(uint8_t));
     memset(h->pps.scaling_matrix8, 16, 2 * 64 * sizeof(uint8_t));
 
+    avctx->bits_per_raw_sample = 8;
     h->sps.bit_depth_luma = 8;
     h->chroma_format_idc = 1;
 
@@ -915,6 +923,7 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
 
     h->flags           = avctx->flags;
     sl->is_complex     = 1;
+    h->sps.chroma_format_idc = 1;
     h->picture_structure = PICT_FRAME;
     avctx->pix_fmt     = AV_PIX_FMT_YUVJ420P;
     avctx->color_range = AVCOL_RANGE_JPEG;
@@ -924,7 +933,7 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
 
     s->halfpel_flag  = 1;
     s->thirdpel_flag = 1;
-    s->unknown_flag  = 0;
+    s->has_watermark = 0;
 
     /* prowl for the "SEQH" marker in the extradata */
     extradata     = (unsigned char *)avctx->extradata;
@@ -943,10 +952,13 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     if (marker_found) {
         GetBitContext gb;
         int frame_size_code;
+        int unk0, unk1, unk2, unk3, unk4;
 
         size = AV_RB32(&extradata[4]);
-        if (size > extradata_end - extradata - 8)
-            return AVERROR_INVALIDDATA;
+        if (size > extradata_end - extradata - 8) {
+            ret = AVERROR_INVALIDDATA;
+            goto fail;
+        }
         init_get_bits(&gb, extradata + 8, size * 8);
 
         /* 'frame size code' and optional 'width, height' */
@@ -990,22 +1002,27 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
         s->thirdpel_flag = get_bits1(&gb);
 
         /* unknown fields */
-        skip_bits1(&gb);
-        skip_bits1(&gb);
-        skip_bits1(&gb);
-        skip_bits1(&gb);
+        unk0 = get_bits1(&gb);
+        unk1 = get_bits1(&gb);
+        unk2 = get_bits1(&gb);
+        unk3 = get_bits1(&gb);
 
         h->low_delay = get_bits1(&gb);
 
         /* unknown field */
-        skip_bits1(&gb);
+        unk4 = get_bits1(&gb);
+
+        av_log(avctx, AV_LOG_DEBUG, "Unknown fields %d %d %d %d %d\n",
+               unk0, unk1, unk2, unk3, unk4);
 
-        while (get_bits1(&gb))
-            skip_bits(&gb, 8);
+        if (skip_1stop_8data_bits(&gb) < 0) {
+            ret = AVERROR_INVALIDDATA;
+            goto fail;
+        }
 
-        s->unknown_flag  = get_bits1(&gb);
+        s->has_watermark  = get_bits1(&gb);
         avctx->has_b_frames = !h->low_delay;
-        if (s->unknown_flag) {
+        if (s->has_watermark) {
 #if CONFIG_ZLIB
             unsigned watermark_width  = svq3_get_ue_golomb(&gb);
             unsigned watermark_height = svq3_get_ue_golomb(&gb);
@@ -1018,11 +1035,17 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
             int offset                = get_bits_count(&gb) + 7 >> 3;
             uint8_t *buf;
 
-            if (watermark_height > 0 &&
-                (uint64_t)watermark_width * 4 > UINT_MAX / watermark_height)
-                return -1;
+            if (watermark_height <= 0 ||
+                (uint64_t)watermark_width * 4 > UINT_MAX / watermark_height) {
+                ret = -1;
+                goto fail;
+            }
 
             buf = av_malloc(buf_len);
+            if (!buf) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
             av_log(avctx, AV_LOG_DEBUG, "watermark size: %ux%u\n",
                    watermark_width, watermark_height);
             av_log(avctx, AV_LOG_DEBUG,
@@ -1033,7 +1056,8 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
                 av_log(avctx, AV_LOG_ERROR,
                        "could not uncompress watermark logo\n");
                 av_free(buf);
-                return -1;
+                ret = -1;
+                goto fail;
             }
             s->watermark_key = ff_svq1_packet_checksum(buf, buf_len, 0);
             s->watermark_key = s->watermark_key << 16 | s->watermark_key;
@@ -1043,7 +1067,8 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
 #else
             av_log(avctx, AV_LOG_ERROR,
                    "this svq3 file contains watermark which need zlib support compiled in\n");
-            return -1;
+            ret = -1;
+            goto fail;
 #endif
         }
     }
@@ -1058,12 +1083,15 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     s->h_edge_pos = h->mb_width * 16;
     s->v_edge_pos = h->mb_height * 16;
 
-    if (ff_h264_alloc_tables(h) < 0) {
+    if ((ret = ff_h264_alloc_tables(h)) < 0) {
         av_log(avctx, AV_LOG_ERROR, "svq3 memory allocation failed\n");
-        return AVERROR(ENOMEM);
+        goto fail;
     }
 
     return 0;
+fail:
+    svq3_decode_end(avctx);
+    return ret;
 }
 
 static void free_picture(AVCodecContext *avctx, H264Picture *pic)
@@ -1117,7 +1145,7 @@ static int get_buffer(AVCodecContext *avctx, H264Picture *pic)
         goto fail;
 
     if (!sl->edge_emu_buffer) {
-        sl->edge_emu_buffer = av_mallocz(pic->f->linesize[0] * 17);
+        sl->edge_emu_buffer = av_mallocz_array(pic->f->linesize[0], 17);
         if (!sl->edge_emu_buffer)
             return AVERROR(ENOMEM);
     }
@@ -1134,11 +1162,12 @@ fail:
 static int svq3_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame, AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
     SVQ3Context *s     = avctx->priv_data;
     H264Context *h     = &s->h;
     H264SliceContext *sl = &h->slice_ctx[0];
     int buf_size       = avpkt->size;
+    int left;
+    uint8_t *buf;
     int ret, m, i;
 
     /* special case for last picture */
@@ -1153,10 +1182,20 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
         return 0;
     }
 
-    init_get_bits(&h->gb, buf, 8 * buf_size);
-
     sl->mb_x = sl->mb_y = sl->mb_xy = 0;
 
+    if (s->watermark_key) {
+        av_fast_padded_malloc(&s->buf, &s->buf_size, buf_size);
+        if (!s->buf)
+            return AVERROR(ENOMEM);
+        memcpy(s->buf, avpkt->data, buf_size);
+        buf = s->buf;
+    } else {
+        buf = avpkt->data;
+    }
+
+    init_get_bits(&h->gb, buf, 8 * buf_size);
+
     if (svq3_decode_slice_header(avctx))
         return -1;
 
@@ -1192,6 +1231,7 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
     if (h->pict_type != AV_PICTURE_TYPE_I) {
         if (!s->last_pic->f->data[0]) {
             av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n");
+            av_frame_unref(s->last_pic->f);
             ret = get_buffer(avctx, s->last_pic);
             if (ret < 0)
                 return ret;
@@ -1204,6 +1244,7 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
 
         if (h->pict_type == AV_PICTURE_TYPE_B && !s->next_pic->f->data[0]) {
             av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n");
+            av_frame_unref(s->next_pic->f);
             ret = get_buffer(avctx, s->next_pic);
             if (ret < 0)
                 return ret;
@@ -1293,7 +1334,7 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
                 return -1;
             }
 
-            if (mb_type != 0)
+            if (mb_type != 0 || sl->cbp)
                 ff_h264_hl_decode_mb(h, &h->slice_ctx[0]);
 
             if (h->pict_type != AV_PICTURE_TYPE_B && !h->low_delay)
@@ -1307,6 +1348,18 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
                            h->low_delay);
     }
 
+    left = buf_size*8 - get_bits_count(&h->gb);
+
+    if (sl->mb_y != h->mb_height || sl->mb_x != h->mb_width) {
+        av_log(avctx, AV_LOG_INFO, "frame num %d incomplete pic x %d y %d left %d\n", avctx->frame_number, sl->mb_y, sl->mb_x, left);
+        //av_hex_dump(stderr, buf+buf_size-8, 8);
+    }
+
+    if (left < 0) {
+        av_log(avctx, AV_LOG_ERROR, "frame num %d left %d\n", avctx->frame_number, left);
+        return -1;
+    }
+
     if (h->pict_type == AV_PICTURE_TYPE_B || h->low_delay)
         ret = av_frame_ref(data, s->cur_pic->f);
     else if (s->last_pic->f->data[0])
@@ -1346,6 +1399,9 @@ static av_cold int svq3_decode_end(AVCodecContext *avctx)
 
     ff_h264_free_context(h);
 
+    av_freep(&s->buf);
+    s->buf_size = 0;
+
     return 0;
 }
 
diff --git a/libavcodec/svq3.h b/libavcodec/svq3.h
index a20e620f8e..5007a8c1c0 100644
--- a/libavcodec/svq3.h
+++ b/libavcodec/svq3.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/synth_filter.c b/libavcodec/synth_filter.c
index d0ace4040e..d49ffe642d 100644
--- a/libavcodec/synth_filter.c
+++ b/libavcodec/synth_filter.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/synth_filter.h b/libavcodec/synth_filter.h
index f842c701e3..b63fd779b5 100644
--- a/libavcodec/synth_filter.h
+++ b/libavcodec/synth_filter.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/tableprint.h b/libavcodec/tableprint.h
index daa89fe99b..6f61c7124b 100644
--- a/libavcodec/tableprint.h
+++ b/libavcodec/tableprint.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,6 +64,7 @@ void write_int8_t_array     (const int8_t   *, int);
 void write_uint8_t_array    (const uint8_t  *, int);
 void write_uint16_t_array   (const uint16_t *, int);
 void write_uint32_t_array   (const uint32_t *, int);
+void write_int32_t_array    (const int32_t  *, int);
 void write_float_array      (const float    *, int);
 void write_int8_t_2d_array  (const void *, int, int);
 void write_uint8_t_2d_array (const void *, int, int);
@@ -81,6 +82,16 @@ void write_float_2d_array   (const void *, int, int);
 #define FMT "zu"
 #endif
 
+#define WRITE_ARRAY_ALIGNED(prefix, align, type, name)  \
+    do {                                                \
+        const size_t array_size = FF_ARRAY_ELEMS(name); \
+        printf(prefix" DECLARE_ALIGNED("#align", "      \
+               #type", "#name")[%"FMT"] = {\n",         \
+               array_size);                             \
+        write_##type##_array(name, array_size);         \
+        printf("};\n");                                 \
+    } while(0)
+
 #define WRITE_ARRAY(prefix, type, name)                 \
     do {                                                \
         const size_t array_size = FF_ARRAY_ELEMS(name); \
@@ -104,7 +115,9 @@ void write_float_2d_array   (const void *, int, int);
 WRITE_1D_FUNC(int8_t,   "%3"PRIi8, 15)
 WRITE_1D_FUNC(uint8_t,  "0x%02"PRIx8, 15)
 WRITE_1D_FUNC(uint16_t, "0x%08"PRIx16, 7)
+WRITE_1D_FUNC(int16_t,  "%5"PRIi16, 7)
 WRITE_1D_FUNC(uint32_t, "0x%08"PRIx32, 7)
+WRITE_1D_FUNC(int32_t,  "0x%08"PRIx32, 7)
 WRITE_1D_FUNC(float,    "%.18e", 3)
 
 WRITE_2D_FUNC(int8_t)
diff --git a/libavcodec/tableprint_vlc.h b/libavcodec/tableprint_vlc.h
new file mode 100644
index 0000000000..675251a836
--- /dev/null
+++ b/libavcodec/tableprint_vlc.h
@@ -0,0 +1,82 @@
+/*
+ * Helpers for generating hard-coded VLC tables
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_TABLEPRINT_VLC_H
+#define AVCODEC_TABLEPRINT_VLC_H
+
+#define FFMPEG_CONFIG_H
+#define AVUTIL_LOG_H
+#define av_log(a, ...) while(0)
+#define ff_dlog(a, ...) while(0)
+#define AVUTIL_MEM_H
+#define av_malloc(s) NULL
+#define av_malloc_array(a, b) NULL
+#define av_realloc_f(p, o, n) NULL
+#define av_free(p) while(0)
+#define av_freep(p) while(0)
+#define AVCODEC_AVCODEC_H
+#define AVCODEC_INTERNAL_H
+#include "tableprint.h"
+#include "get_bits.h"
+#include "mathtables.c"
+#include "libavutil/reverse.c"
+#include "bitstream.c"
+
+#define REPLACE_DEFINE2(type) write_##type##_array
+#define REPLACE_DEFINE(type) REPLACE_DEFINE2(type)
+static void write_VLC_TYPE_array(const VLC_TYPE *p, int s) {
+    REPLACE_DEFINE(VLC_TYPE)(p, s);
+}
+
+WRITE_2D_FUNC(VLC_TYPE)
+
+static void write_vlc_type(const VLC *vlc, VLC_TYPE (*base_table)[2], const char *base_table_name)
+{
+    printf("    .bits = %i,\n", vlc->bits);
+    // Unfortunately need to cast away const currently
+    printf("    .table = (VLC_TYPE (*)[2])(%s + 0x%x),\n", base_table_name, (int)(vlc->table - base_table));
+    printf("    .table_size = 0x%x,\n", vlc->table_size);
+    printf("    .table_allocated = 0x%x,\n", vlc->table_allocated);
+}
+
+#define WRITE_VLC_TYPE(prefix, name, base_table)        \
+    do {                                                \
+        printf(prefix" VLC "#name" = {\n");             \
+        write_vlc_type(&name, base_table, #base_table); \
+        printf("};\n");                                 \
+    } while(0)
+
+#define WRITE_VLC_ARRAY(prefix, name, base_table)       \
+    do {                                                \
+        int i;                                          \
+        const size_t array_size = FF_ARRAY_ELEMS(name); \
+        printf(prefix" VLC "#name"[%"FMT"] = {{\n",     \
+               array_size);                             \
+        for (i = 0; i < array_size; i++) {              \
+            write_vlc_type(name + i,                    \
+                           base_table, #base_table);    \
+            if (i != array_size - 1) printf("}, {\n");  \
+        }                                               \
+        printf("}};\n");                                \
+    } while(0)
+
+#endif /* AVCODEC_TABLEPRINT_VLC_H */
diff --git a/libavcodec/tak.c b/libavcodec/tak.c
index 867a84bc71..ed41ca8a2e 100644
--- a/libavcodec/tak.c
+++ b/libavcodec/tak.c
@@ -2,28 +2,49 @@
  * TAK common code
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/bswap.h"
 #include "libavutil/crc.h"
 #include "libavutil/intreadwrite.h"
 #include "tak.h"
 
+static const int64_t tak_channel_layouts[] = {
+    0,
+    AV_CH_FRONT_LEFT,
+    AV_CH_FRONT_RIGHT,
+    AV_CH_FRONT_CENTER,
+    AV_CH_LOW_FREQUENCY,
+    AV_CH_BACK_LEFT,
+    AV_CH_BACK_RIGHT,
+    AV_CH_FRONT_LEFT_OF_CENTER,
+    AV_CH_FRONT_RIGHT_OF_CENTER,
+    AV_CH_BACK_CENTER,
+    AV_CH_SIDE_LEFT,
+    AV_CH_SIDE_RIGHT,
+    AV_CH_TOP_CENTER,
+    AV_CH_TOP_FRONT_LEFT,
+    AV_CH_TOP_FRONT_CENTER,
+    AV_CH_TOP_FRONT_RIGHT,
+    AV_CH_TOP_BACK_LEFT,
+    AV_CH_TOP_BACK_CENTER,
+    AV_CH_TOP_BACK_RIGHT,
+};
+
 static const uint16_t frame_duration_type_quants[] = {
     3, 4, 6, 8, 4096, 8192, 16384, 512, 1024, 2048,
 };
@@ -51,22 +72,6 @@ static int tak_get_nb_samples(int sample_rate, enum TAKFrameSizeType type)
     return nb_samples;
 }
 
-static int crc_init = 0;
-#if CONFIG_SMALL
-#define CRC_TABLE_SIZE 257
-#else
-#define CRC_TABLE_SIZE 1024
-#endif
-static AVCRC crc_24[CRC_TABLE_SIZE];
-
-av_cold void ff_tak_init_crc(void)
-{
-    if (!crc_init) {
-        av_crc_init(crc_24, 0, 24, 0x864CFBU, sizeof(crc_24));
-        crc_init = 1;
-    }
-}
-
 int ff_tak_check_crc(const uint8_t *buf, unsigned int buf_size)
 {
     uint32_t crc, CRC;
@@ -75,8 +80,8 @@ int ff_tak_check_crc(const uint8_t *buf, unsigned int buf_size)
         return AVERROR_INVALIDDATA;
     buf_size -= 3;
 
-    CRC = av_bswap32(AV_RL24(buf + buf_size)) >> 8;
-    crc = av_crc(crc_24, 0xCE04B7U, buf, buf_size);
+    CRC = AV_RB24(buf + buf_size);
+    crc = av_crc(av_crc_get_table(AV_CRC_24_IEEE), 0xCE04B7U, buf, buf_size);
     if (CRC != crc)
         return AVERROR_INVALIDDATA;
 
@@ -108,8 +113,8 @@ void avpriv_tak_parse_streaminfo(GetBitContext *gb, TAKStreamInfo *s)
             for (i = 0; i < s->channels; i++) {
                 int value = get_bits(gb, TAK_FORMAT_CH_LAYOUT_BITS);
 
-                if (value > 0 && value <= 18)
-                    channel_mask |= 1 << (value - 1);
+                if (value < FF_ARRAY_ELEMS(tak_channel_layouts))
+                    channel_mask |= tak_channel_layouts[value];
             }
         }
     }
@@ -144,6 +149,9 @@ int ff_tak_decode_frame_header(AVCodecContext *avctx, GetBitContext *gb,
         align_get_bits(gb);
     }
 
+    if (ti->flags & TAK_FRAME_FLAG_HAS_METADATA)
+        return AVERROR_INVALIDDATA;
+
     skip_bits(gb, 24);
 
     return 0;
diff --git a/libavcodec/tak.h b/libavcodec/tak.h
index fa91149793..e8e2dacf7c 100644
--- a/libavcodec/tak.h
+++ b/libavcodec/tak.h
@@ -2,20 +2,20 @@
  * TAK decoder/demuxer common code
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -99,7 +99,7 @@
 
 enum TAKCodecType {
     TAK_CODEC_MONO_STEREO  = 2,
-    TAK_CODEC_MULTICHANNEL = 4
+    TAK_CODEC_MULTICHANNEL = 4,
 };
 
 enum TAKMetaDataType {
@@ -140,8 +140,6 @@ typedef struct TAKStreamInfo {
     int64_t           samples;
 } TAKStreamInfo;
 
-void ff_tak_init_crc(void);
-
 int ff_tak_check_crc(const uint8_t *buf, unsigned int buf_size);
 
 /**
@@ -162,5 +160,4 @@ void avpriv_tak_parse_streaminfo(GetBitContext *gb, TAKStreamInfo *s);
  */
 int ff_tak_decode_frame_header(AVCodecContext *avctx, GetBitContext *gb,
                                TAKStreamInfo *s, int log_level_offset);
-
 #endif /* AVCODEC_TAK_H */
diff --git a/libavcodec/tak_parser.c b/libavcodec/tak_parser.c
index 295df2418a..4f2149ae11 100644
--- a/libavcodec/tak_parser.c
+++ b/libavcodec/tak_parser.c
@@ -2,20 +2,20 @@
  * TAK parser
  * Copyright (c) 2012 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,12 +33,6 @@ typedef struct TAKParseContext {
     int           index;
 } TAKParseContext;
 
-static av_cold int tak_init(AVCodecParserContext *s)
-{
-    ff_tak_init_crc();
-    return 0;
-}
-
 static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
                      const uint8_t **poutbuf, int *poutbuf_size,
                      const uint8_t *buf, int buf_size)
@@ -49,10 +43,12 @@ static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
     GetBitContext gb;
     int consumed = 0;
     int needed   = buf_size ? TAK_MAX_FRAME_HEADER_BYTES : 8;
+    int ret;
 
     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
         TAKStreamInfo ti;
-        init_get_bits(&gb, buf, buf_size);
+        if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+            return ret;
         if (!ff_tak_decode_frame_header(avctx, &gb, &ti, 127))
             s->duration = t->ti.last_frame_samples ? t->ti.last_frame_samples
                                                    : t->ti.frame_samples;
@@ -67,23 +63,23 @@ static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
                                            buf_size);
             const uint8_t *tmp_buf = buf;
 
-            ff_combine_frame(pc, END_NOT_FOUND, &tmp_buf, &tmp_buf_size);
+            if (ff_combine_frame(pc, END_NOT_FOUND, &tmp_buf, &tmp_buf_size) != -1)
+                return AVERROR(ENOMEM);
             consumed += tmp_buf_size;
             buf      += tmp_buf_size;
             buf_size -= tmp_buf_size;
         }
 
-        for (; t->index + needed <= pc->index; t->index++)
-            if (pc->buffer[t->index]     == 0xFF &&
-                pc->buffer[t->index + 1] == 0xA0) {
+        for (; t->index + needed <= pc->index; t->index++) {
+            if (pc->buffer[ t->index     ] == 0xFF &&
+                pc->buffer[ t->index + 1 ] == 0xA0) {
                 TAKStreamInfo ti;
 
-                init_get_bits(&gb, pc->buffer + t->index,
-                              8 * (pc->index - t->index));
+                if ((ret = init_get_bits8(&gb, pc->buffer + t->index,
+                                          pc->index - t->index)) < 0)
+                    return ret;
                 if (!ff_tak_decode_frame_header(avctx, &gb,
-                                                pc->frame_start_found ? &ti
-                                                                      : &t->ti,
-                                                127) &&
+                        pc->frame_start_found ? &ti : &t->ti, 127) &&
                     !ff_tak_check_crc(pc->buffer + t->index,
                                       get_bits_count(&gb) / 8)) {
                     if (!pc->frame_start_found) {
@@ -91,6 +87,7 @@ static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
                         s->duration           = t->ti.last_frame_samples ?
                                                 t->ti.last_frame_samples :
                                                 t->ti.frame_samples;
+                        s->key_frame          = !!(t->ti.flags & TAK_FRAME_FLAG_HAS_INFO);
                     } else {
                         pc->frame_start_found = 0;
                         next                  = t->index - pc->index;
@@ -99,9 +96,10 @@ static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
                     }
                 }
             }
+        }
     }
-
 found:
+
     if (consumed && !buf_size && next == END_NOT_FOUND ||
         ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
         *poutbuf      = NULL;
@@ -122,7 +120,6 @@ found:
 AVCodecParser ff_tak_parser = {
     .codec_ids      = { AV_CODEC_ID_TAK },
     .priv_data_size = sizeof(TAKParseContext),
-    .parser_init    = tak_init,
     .parser_parse   = tak_parse,
     .parser_close   = ff_parse_close,
 };
diff --git a/libavcodec/takdec.c b/libavcodec/takdec.c
index c84cca448d..5395596cfc 100644
--- a/libavcodec/takdec.c
+++ b/libavcodec/takdec.c
@@ -2,20 +2,20 @@
  * TAK decoder
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,42 +29,47 @@
 #include "libavutil/samplefmt.h"
 #include "tak.h"
 #include "audiodsp.h"
+#include "thread.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "unary.h"
 
-#define MAX_SUBFRAMES     8                         // max number of subframes per channel
+#define MAX_SUBFRAMES     8                         ///< max number of subframes per channel
 #define MAX_PREDICTORS  256
 
 typedef struct MCDParam {
-    int8_t present;                                 // decorrelation parameter availability for this channel
-    int8_t index;                                   // index into array of decorrelation types
+    int8_t present;                                 ///< decorrelation parameter availability for this channel
+    int8_t index;                                   ///< index into array of decorrelation types
     int8_t chan1;
     int8_t chan2;
 } MCDParam;
 
 typedef struct TAKDecContext {
-    AVCodecContext *avctx;                          // parent AVCodecContext
+    AVCodecContext *avctx;                          ///< parent AVCodecContext
     AudioDSPContext adsp;
     TAKStreamInfo   ti;
-    GetBitContext   gb;                             // bitstream reader initialized to start at the current frame
+    GetBitContext   gb;                             ///< bitstream reader initialized to start at the current frame
 
     int             uval;
-    int             nb_samples;                     // number of samples in the current frame
+    int             nb_samples;                     ///< number of samples in the current frame
     uint8_t        *decode_buffer;
     unsigned int    decode_buffer_size;
-    int32_t        *decoded[TAK_MAX_CHANNELS];      // decoded samples for each channel
+    int32_t        *decoded[TAK_MAX_CHANNELS];      ///< decoded samples for each channel
 
     int8_t          lpc_mode[TAK_MAX_CHANNELS];
-    int8_t          sample_shift[TAK_MAX_CHANNELS]; // shift applied to every sample in the channel
+    int8_t          sample_shift[TAK_MAX_CHANNELS]; ///< shift applied to every sample in the channel
+    int16_t         predictors[MAX_PREDICTORS];
+    int             nb_subframes;                   ///< number of subframes in the current frame
+    int16_t         subframe_len[MAX_SUBFRAMES];    ///< subframe length in samples
     int             subframe_scale;
 
-    int8_t          dmode;                          // channel decorrelation type in the current frame
+    int8_t          dmode;                          ///< channel decorrelation type in the current frame
 
-    MCDParam        mcdparams[TAK_MAX_CHANNELS];    // multichannel decorrelation parameters
+    MCDParam        mcdparams[TAK_MAX_CHANNELS];    ///< multichannel decorrelation parameters
 
-    int16_t        *residues;
-    unsigned int    residues_buf_size;
+    int8_t          coding_mode[128];
+    DECLARE_ALIGNED(16, int16_t, filter)[MAX_PREDICTORS];
+    DECLARE_ALIGNED(16, int16_t, residues)[544];
 } TAKDecContext;
 
 static const int8_t mc_dmodes[] = { 1, 3, 4, 6, };
@@ -132,14 +137,9 @@ static const struct CParam {
     { 0x1A, 0x1800000, 0x1800000, 0x6800000, 0xC000000 },
 };
 
-static av_cold void tak_init_static_data(AVCodec *codec)
-{
-    ff_tak_init_crc();
-}
-
 static int set_bps_params(AVCodecContext *avctx)
 {
-    switch (avctx->bits_per_coded_sample) {
+    switch (avctx->bits_per_raw_sample) {
     case 8:
         avctx->sample_fmt = AV_SAMPLE_FMT_U8P;
         break;
@@ -150,11 +150,10 @@ static int set_bps_params(AVCodecContext *avctx)
         avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
         break;
     default:
-        av_log(avctx, AV_LOG_ERROR, "unsupported bits per sample: %d\n",
-               avctx->bits_per_coded_sample);
+        av_log(avctx, AV_LOG_ERROR, "invalid/unsupported bits per sample: %d\n",
+               avctx->bits_per_raw_sample);
         return AVERROR_INVALIDDATA;
     }
-    avctx->bits_per_raw_sample = avctx->bits_per_coded_sample;
 
     return 0;
 }
@@ -175,6 +174,7 @@ static av_cold int tak_decode_init(AVCodecContext *avctx)
     ff_audiodsp_init(&s->adsp);
 
     s->avctx = avctx;
+    avctx->bits_per_raw_sample = avctx->bits_per_coded_sample;
 
     set_sample_rate_params(avctx);
 
@@ -236,10 +236,10 @@ static void decode_lpc(int32_t *coeffs, int mode, int length)
     }
 }
 
-static int decode_segment(GetBitContext *gb, int mode, int32_t *decoded,
-                          int len)
+static int decode_segment(TAKDecContext *s, int8_t mode, int32_t *decoded, int len)
 {
     struct CParam code;
+    GetBitContext *gb = &s->gb;
     int i;
 
     if (!mode) {
@@ -290,7 +290,6 @@ static int decode_residues(TAKDecContext *s, int32_t *decoded, int length)
 
     if (get_bits1(gb)) {
         int wlength, rval;
-        int coding_mode[128];
 
         wlength = length / s->uval;
 
@@ -304,7 +303,7 @@ static int decode_residues(TAKDecContext *s, int32_t *decoded, int length)
         if (wlength <= 1 || wlength > 128)
             return AVERROR_INVALIDDATA;
 
-        coding_mode[0] = mode = get_bits(gb, 6);
+        s->coding_mode[0] = mode = get_bits(gb, 6);
 
         for (i = 1; i < wlength; i++) {
             int c = get_unary(gb, 1, 6);
@@ -328,14 +327,14 @@ static int decode_residues(TAKDecContext *s, int32_t *decoded, int length)
                 mode--;
                 break;
             }
-            coding_mode[i] = mode;
+            s->coding_mode[i] = mode;
         }
 
         i = 0;
         while (i < wlength) {
             int len = 0;
 
-            mode = coding_mode[i];
+            mode = s->coding_mode[i];
             do {
                 if (i >= wlength - 1)
                     len += rval;
@@ -345,15 +344,15 @@ static int decode_residues(TAKDecContext *s, int32_t *decoded, int length)
 
                 if (i == wlength)
                     break;
-            } while (coding_mode[i] == mode);
+            } while (s->coding_mode[i] == mode);
 
-            if ((ret = decode_segment(gb, mode, decoded, len)) < 0)
+            if ((ret = decode_segment(s, mode, decoded, len)) < 0)
                 return ret;
             decoded += len;
         }
     } else {
         mode = get_bits(gb, 6);
-        if ((ret = decode_segment(gb, mode, decoded, length)) < 0)
+        if ((ret = decode_segment(s, mode, decoded, length)) < 0)
             return ret;
     }
 
@@ -368,62 +367,13 @@ static int get_bits_esc4(GetBitContext *gb)
         return 0;
 }
 
-static void decode_filter_coeffs(TAKDecContext *s, int filter_order, int size,
-                                 int filter_quant, int16_t *filter)
-{
-    GetBitContext *gb = &s->gb;
-    int i, j, a, b;
-    int filter_tmp[MAX_PREDICTORS];
-    int16_t predictors[MAX_PREDICTORS];
-
-    predictors[0] = get_sbits(gb, 10);
-    predictors[1] = get_sbits(gb, 10);
-    predictors[2] = get_sbits(gb, size) << (10 - size);
-    predictors[3] = get_sbits(gb, size) << (10 - size);
-    if (filter_order > 4) {
-        int av_uninit(code_size);
-        int code_size_base = size - get_bits1(gb);
-
-        for (i = 4; i < filter_order; i++) {
-            if (!(i & 3))
-                code_size = code_size_base - get_bits(gb, 2);
-            predictors[i] = get_sbits(gb, code_size) << (10 - size);
-        }
-    }
-
-    filter_tmp[0] = predictors[0] << 6;
-    for (i = 1; i < filter_order; i++) {
-        int *p1 = &filter_tmp[0];
-        int *p2 = &filter_tmp[i - 1];
-
-        for (j = 0; j < (i + 1) / 2; j++) {
-            int tmp = *p1 + (predictors[i] * *p2 + 256 >> 9);
-            *p2     = *p2 + (predictors[i] * *p1 + 256 >> 9);
-            *p1     = tmp;
-            p1++;
-            p2--;
-        }
-
-        filter_tmp[i] = predictors[i] << 6;
-    }
-
-    a = 1 << (32 - (15 - filter_quant));
-    b = 1 << ((15 - filter_quant) - 1);
-    for (i = 0, j = filter_order - 1; i < filter_order / 2; i++, j--) {
-        filter[j] = a - ((filter_tmp[i] + b) >> (15 - filter_quant));
-        filter[i] = a - ((filter_tmp[j] + b) >> (15 - filter_quant));
-    }
-}
-
 static int decode_subframe(TAKDecContext *s, int32_t *decoded,
                            int subframe_size, int prev_subframe_size)
 {
-    LOCAL_ALIGNED_16(int16_t, filter, [MAX_PREDICTORS]);
     GetBitContext *gb = &s->gb;
-    int i, ret;
+    int x, y, i, j, ret = 0;
     int dshift, size, filter_quant, filter_order;
-
-    memset(filter, 0, MAX_PREDICTORS * sizeof(*filter));
+    int tfilter[MAX_PREDICTORS];
 
     if (!get_bits1(gb))
         return decode_residues(s, decoded, subframe_size);
@@ -466,30 +416,74 @@ static int decode_subframe(TAKDecContext *s, int32_t *decoded,
             return AVERROR_INVALIDDATA;
     }
 
-    decode_filter_coeffs(s, filter_order, size, filter_quant, filter);
+    s->predictors[0] = get_sbits(gb, 10);
+    s->predictors[1] = get_sbits(gb, 10);
+    s->predictors[2] = get_sbits(gb, size) << (10 - size);
+    s->predictors[3] = get_sbits(gb, size) << (10 - size);
+    if (filter_order > 4) {
+        int tmp = size - get_bits1(gb);
+
+        for (i = 4; i < filter_order; i++) {
+            if (!(i & 3))
+                x = tmp - get_bits(gb, 2);
+            s->predictors[i] = get_sbits(gb, x) << (10 - size);
+        }
+    }
+
+    tfilter[0] = s->predictors[0] << 6;
+    for (i = 1; i < filter_order; i++) {
+        int32_t *p1 = &tfilter[0];
+        int32_t *p2 = &tfilter[i - 1];
+
+        for (j = 0; j < (i + 1) / 2; j++) {
+            x     = *p1 + (s->predictors[i] * *p2 + 256 >> 9);
+            *p2  += s->predictors[i] * *p1 + 256 >> 9;
+            *p1++ = x;
+            p2--;
+        }
+
+        tfilter[i] = s->predictors[i] << 6;
+    }
+
+    x = 1 << (32 - (15 - filter_quant));
+    y = 1 << ((15 - filter_quant) - 1);
+    for (i = 0, j = filter_order - 1; i < filter_order / 2; i++, j--) {
+        s->filter[j] = x - ((tfilter[i] + y) >> (15 - filter_quant));
+        s->filter[i] = x - ((tfilter[j] + y) >> (15 - filter_quant));
+    }
 
     if ((ret = decode_residues(s, &decoded[filter_order],
                                subframe_size - filter_order)) < 0)
         return ret;
 
-    av_fast_malloc(&s->residues, &s->residues_buf_size,
-                   FFALIGN(subframe_size + 16, 16) * sizeof(*s->residues));
-    if (!s->residues)
-        return AVERROR(ENOMEM);
-    memset(s->residues, 0, s->residues_buf_size);
-
     for (i = 0; i < filter_order; i++)
         s->residues[i] = *decoded++ >> dshift;
 
-    for (i = 0; i < subframe_size - filter_order; i++) {
-        int v = 1 << (filter_quant - 1);
-
-        v += s->adsp.scalarproduct_int16(&s->residues[i], filter,
-                                         FFALIGN(filter_order, 16));
+    y    = FF_ARRAY_ELEMS(s->residues) - filter_order;
+    x    = subframe_size - filter_order;
+    while (x > 0) {
+        int tmp = FFMIN(y, x);
+
+        for (i = 0; i < tmp; i++) {
+            int v = 1 << (filter_quant - 1);
+
+            if (filter_order & -16)
+                v += s->adsp.scalarproduct_int16(&s->residues[i], s->filter,
+                                                 filter_order & -16);
+            for (j = filter_order & -16; j < filter_order; j += 4) {
+                v += s->residues[i + j + 3] * s->filter[j + 3] +
+                     s->residues[i + j + 2] * s->filter[j + 2] +
+                     s->residues[i + j + 1] * s->filter[j + 1] +
+                     s->residues[i + j    ] * s->filter[j    ];
+            }
+            v = (av_clip_intp2(v >> filter_quant, 13) << dshift) - *decoded;
+            *decoded++ = v;
+            s->residues[filter_order + i] = v >> dshift;
+        }
 
-        v = (av_clip_intp2(v >> filter_quant, 13) << dshift) - *decoded;
-        *decoded++ = v;
-        s->residues[filter_order + i] = v >> dshift;
+        x -= tmp;
+        if (x > 0)
+            memcpy(s->residues, &s->residues[y], 2 * filter_order);
     }
 
     emms_c();
@@ -503,50 +497,42 @@ static int decode_channel(TAKDecContext *s, int chan)
     GetBitContext *gb     = &s->gb;
     int32_t *decoded      = s->decoded[chan];
     int left              = s->nb_samples - 1;
-    int i, prev, ret, nb_subframes;
-    int subframe_len[MAX_SUBFRAMES];
+    int i = 0, ret, prev = 0;
 
     s->sample_shift[chan] = get_bits_esc4(gb);
-    if (s->sample_shift[chan] >= avctx->bits_per_coded_sample)
+    if (s->sample_shift[chan] >= avctx->bits_per_raw_sample)
         return AVERROR_INVALIDDATA;
 
-    /* NOTE: TAK 2.2.0 appears to set the sample value to 0 if
-     *       bits_per_coded_sample - sample_shift is 1, but this produces
-     *       non-bit-exact output. Reading the 1 bit using get_sbits() instead
-     *       of skipping it produces bit-exact output. This has been reported
-     *       to the TAK author. */
-    *decoded++        = get_sbits(gb,
-                                  avctx->bits_per_coded_sample -
-                                  s->sample_shift[chan]);
+    *decoded++ = get_sbits(gb, avctx->bits_per_raw_sample - s->sample_shift[chan]);
     s->lpc_mode[chan] = get_bits(gb, 2);
-    nb_subframes      = get_bits(gb, 3) + 1;
+    s->nb_subframes   = get_bits(gb, 3) + 1;
 
-    i = 0;
-    if (nb_subframes > 1) {
-        if (get_bits_left(gb) < (nb_subframes - 1) * 6)
+    if (s->nb_subframes > 1) {
+        if (get_bits_left(gb) < (s->nb_subframes - 1) * 6)
             return AVERROR_INVALIDDATA;
 
-        prev = 0;
-        for (; i < nb_subframes - 1; i++) {
-            int subframe_end = get_bits(gb, 6) * s->subframe_scale;
-            if (subframe_end <= prev)
+        for (; i < s->nb_subframes - 1; i++) {
+            int v = get_bits(gb, 6);
+
+            s->subframe_len[i] = (v - prev) * s->subframe_scale;
+            if (s->subframe_len[i] <= 0)
                 return AVERROR_INVALIDDATA;
-            subframe_len[i] = subframe_end - prev;
-            left           -= subframe_len[i];
-            prev            = subframe_end;
+
+            left -= s->subframe_len[i];
+            prev  = v;
         }
 
         if (left <= 0)
             return AVERROR_INVALIDDATA;
     }
-    subframe_len[i] = left;
+    s->subframe_len[i] = left;
 
     prev = 0;
-    for (i = 0; i < nb_subframes; i++) {
-        if ((ret = decode_subframe(s, decoded, subframe_len[i], prev)) < 0)
+    for (i = 0; i < s->nb_subframes; i++) {
+        if ((ret = decode_subframe(s, decoded, s->subframe_len[i], prev)) < 0)
             return ret;
-        decoded += subframe_len[i];
-        prev     = subframe_len[i];
+        decoded += s->subframe_len[i];
+        prev     = s->subframe_len[i];
     }
 
     return 0;
@@ -599,11 +585,8 @@ static int decorrelate(TAKDecContext *s, int c1, int c2, int length)
     case 6:
         FFSWAP(int32_t*, p1, p2);
     case 7: {
-        LOCAL_ALIGNED_16(int16_t, filter, [MAX_PREDICTORS]);
         int length2, order_half, filter_order, dval1, dval2;
-        int av_uninit(code_size);
-
-        memset(filter, 0, MAX_PREDICTORS * sizeof(*filter));
+        int tmp, x, code_size;
 
         if (length < 256)
             return AVERROR_INVALIDDATA;
@@ -616,7 +599,7 @@ static int decorrelate(TAKDecContext *s, int c1, int c2, int length)
         for (i = 0; i < filter_order; i++) {
             if (!(i & 3))
                 code_size = 14 - get_bits(gb, 3);
-            filter[i] = get_sbits(gb, code_size);
+            s->filter[i] = get_sbits(gb, code_size);
         }
 
         order_half = filter_order / 2;
@@ -640,24 +623,40 @@ static int decorrelate(TAKDecContext *s, int c1, int c2, int length)
             }
         }
 
-        av_fast_malloc(&s->residues, &s->residues_buf_size,
-                       FFALIGN(length + 16, 16) * sizeof(*s->residues));
-        if (!s->residues)
-            return AVERROR(ENOMEM);
-        memset(s->residues, 0, s->residues_buf_size);
 
-        for (i = 0; i < length; i++)
-            s->residues[i] = p2[i] >> dshift;
+        for (i = 0; i < filter_order; i++)
+            s->residues[i] = *p2++ >> dshift;
 
         p1 += order_half;
+        x = FF_ARRAY_ELEMS(s->residues) - filter_order;
+        for (; length2 > 0; length2 -= tmp) {
+            tmp = FFMIN(length2, x);
+
+            for (i = 0; i < tmp; i++)
+                s->residues[filter_order + i] = *p2++ >> dshift;
+
+            for (i = 0; i < tmp; i++) {
+                int v = 1 << 9;
+
+                if (filter_order == 16) {
+                    v += s->adsp.scalarproduct_int16(&s->residues[i], s->filter,
+                                                     filter_order);
+                } else {
+                    v += s->residues[i + 7] * s->filter[7] +
+                         s->residues[i + 6] * s->filter[6] +
+                         s->residues[i + 5] * s->filter[5] +
+                         s->residues[i + 4] * s->filter[4] +
+                         s->residues[i + 3] * s->filter[3] +
+                         s->residues[i + 2] * s->filter[2] +
+                         s->residues[i + 1] * s->filter[1] +
+                         s->residues[i    ] * s->filter[0];
+                }
 
-        for (i = 0; i < length2; i++) {
-            int v = 1 << 9;
-
-            v += s->adsp.scalarproduct_int16(&s->residues[i], filter,
-                                             FFALIGN(filter_order, 16));
+                v = (av_clip_intp2(v >> 10, 13) << dshift) - *p1;
+                *p1++ = v;
+            }
 
-            p1[i] = (av_clip_intp2(v >> 10, 13) << dshift) - p1[i];
+            memcpy(s->residues, &s->residues[tmp], 2 * filter_order);
         }
 
         emms_c();
@@ -673,24 +672,21 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
 {
     TAKDecContext *s  = avctx->priv_data;
     AVFrame *frame    = data;
+    ThreadFrame tframe = { .f = data };
     GetBitContext *gb = &s->gb;
     int chan, i, ret, hsize;
 
     if (pkt->size < TAK_MIN_FRAME_HEADER_BYTES)
         return AVERROR_INVALIDDATA;
 
-    init_get_bits(gb, pkt->data, pkt->size * 8);
+    if ((ret = init_get_bits8(gb, pkt->data, pkt->size)) < 0)
+        return ret;
 
     if ((ret = ff_tak_decode_frame_header(avctx, gb, &s->ti, 0)) < 0)
         return ret;
 
-    if (s->ti.flags & TAK_FRAME_FLAG_HAS_METADATA) {
-        avpriv_request_sample(avctx, "Frame metadata");
-        return AVERROR_PATCHWELCOME;
-    }
-
     hsize = get_bits_count(gb) / 8;
-    if (avctx->err_recognition & AV_EF_CRCCHECK) {
+    if (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_COMPLIANT)) {
         if (ff_tak_check_crc(pkt->data, hsize)) {
             av_log(avctx, AV_LOG_ERROR, "CRC error\n");
             if (avctx->err_recognition & AV_EF_EXPLODE)
@@ -724,11 +720,9 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
-    if (s->ti.bps != avctx->bits_per_coded_sample) {
-        avctx->bits_per_coded_sample = s->ti.bps;
-        if ((ret = set_bps_params(avctx)) < 0)
-            return ret;
-    }
+    avctx->bits_per_raw_sample = s->ti.bps;
+    if ((ret = set_bps_params(avctx)) < 0)
+        return ret;
     if (s->ti.sample_rate != avctx->sample_rate) {
         avctx->sample_rate = s->ti.sample_rate;
         set_sample_rate_params(avctx);
@@ -741,10 +735,11 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
                                              : s->ti.frame_samples;
 
     frame->nb_samples = s->nb_samples;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+    if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
         return ret;
+    ff_thread_finish_setup(avctx);
 
-    if (avctx->bits_per_coded_sample <= 16) {
+    if (avctx->bits_per_raw_sample <= 16) {
         int buf_size = av_samples_get_buffer_size(NULL, avctx->channels,
                                                   s->nb_samples,
                                                   AV_SAMPLE_FMT_S32P, 0);
@@ -767,7 +762,7 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
         for (chan = 0; chan < avctx->channels; chan++) {
             int32_t *decoded = s->decoded[chan];
             for (i = 0; i < s->nb_samples; i++)
-                decoded[i] = get_sbits(gb, avctx->bits_per_coded_sample);
+                decoded[i] = get_sbits(gb, avctx->bits_per_raw_sample);
         }
     } else {
         if (s->ti.codec == TAK_CODEC_MONO_STEREO) {
@@ -776,9 +771,9 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
                     return ret;
 
             if (avctx->channels == 2) {
-                if (get_bits1(gb)) {
-                    // some kind of subframe length, but it seems to be unused
-                    skip_bits(gb, 6);
+                s->nb_subframes = get_bits(gb, 1) + 1;
+                if (s->nb_subframes > 1) {
+                    s->subframe_len[1] = get_bits(gb, 6);
                 }
 
                 s->dmode = get_bits(gb, 3);
@@ -806,6 +801,12 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
                     if (s->mcdparams[i].present) {
                         s->mcdparams[i].index = get_bits(gb, 2);
                         s->mcdparams[i].chan2 = get_bits(gb, 4);
+                        if (s->mcdparams[i].chan2 >= avctx->channels) {
+                            av_log(avctx, AV_LOG_ERROR,
+                                   "invalid channel 2 (%d) for %d channel(s)\n",
+                                   s->mcdparams[i].chan2, avctx->channels);
+                            return AVERROR_INVALIDDATA;
+                        }
                         if (s->mcdparams[i].index == 1) {
                             if ((nbit == s->mcdparams[i].chan2) ||
                                 (ch_mask & 1 << s->mcdparams[i].chan2))
@@ -866,7 +867,7 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
     else if (get_bits_left(gb) > 0)
         av_log(avctx, AV_LOG_DEBUG, "underread\n");
 
-    if (avctx->err_recognition & AV_EF_CRCCHECK) {
+    if (avctx->err_recognition & (AV_EF_CRCCHECK | AV_EF_COMPLIANT)) {
         if (ff_tak_check_crc(pkt->data + hsize,
                              get_bits_count(gb) / 8 - hsize)) {
             av_log(avctx, AV_LOG_ERROR, "CRC error\n");
@@ -907,12 +908,32 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
     return pkt->size;
 }
 
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    TAKDecContext *s = avctx->priv_data;
+    s->avctx = avctx;
+    return 0;
+}
+
+static int update_thread_context(AVCodecContext *dst,
+                                 const AVCodecContext *src)
+{
+    TAKDecContext *tsrc = src->priv_data;
+    TAKDecContext *tdst = dst->priv_data;
+
+    if (dst == src)
+        return 0;
+    memcpy(&tdst->ti, &tsrc->ti, sizeof(TAKStreamInfo));
+    return 0;
+}
+#endif
+
 static av_cold int tak_decode_close(AVCodecContext *avctx)
 {
     TAKDecContext *s = avctx->priv_data;
 
     av_freep(&s->decode_buffer);
-    av_freep(&s->residues);
 
     return 0;
 }
@@ -924,10 +945,11 @@ AVCodec ff_tak_decoder = {
     .id               = AV_CODEC_ID_TAK,
     .priv_data_size   = sizeof(TAKDecContext),
     .init             = tak_decode_init,
-    .init_static_data = tak_init_static_data,
     .close            = tak_decode_close,
     .decode           = tak_decode_frame,
-    .capabilities     = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(update_thread_context),
+    .capabilities     = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
     .sample_fmts      = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_U8P,
                                                         AV_SAMPLE_FMT_S16P,
                                                         AV_SAMPLE_FMT_S32P,
diff --git a/libavcodec/targa.c b/libavcodec/targa.c
index ef8565f451..215c0f51f6 100644
--- a/libavcodec/targa.c
+++ b/libavcodec/targa.c
@@ -2,20 +2,20 @@
  * Targa (.tga) image decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,22 +28,37 @@
 
 typedef struct TargaContext {
     GetByteContext gb;
-
-    int color_type;
-    int compression_type;
 } TargaContext;
 
+static uint8_t *advance_line(uint8_t *start, uint8_t *line,
+                             int stride, int *y, int h, int interleave)
+{
+    *y += interleave;
+
+    if (*y < h) {
+        return line + interleave * stride;
+    } else {
+        *y = (*y + 1) & (interleave - 1);
+        if (*y && *y < h) {
+            return start + *y * stride;
+        } else {
+            return NULL;
+        }
+    }
+}
+
 static int targa_decode_rle(AVCodecContext *avctx, TargaContext *s,
-                            uint8_t *dst, int w, int h, int stride, int bpp)
+                            uint8_t *start, int w, int h, int stride,
+                            int bpp, int interleave)
 {
     int x, y;
     int depth = (bpp + 1) >> 3;
     int type, count;
-    int diff;
+    uint8_t *line = start;
+    uint8_t *dst  = line;
 
-    diff = stride - w * depth;
-    x = y = 0;
-    while (y < h) {
+    x = y = count = 0;
+    while (dst) {
         if (bytestream2_get_bytes_left(&s->gb) <= 0) {
             av_log(avctx, AV_LOG_ERROR,
                    "Ran ouf of data before end-of-image\n");
@@ -52,12 +67,6 @@ static int targa_decode_rle(AVCodecContext *avctx, TargaContext *s,
         type  = bytestream2_get_byteu(&s->gb);
         count = (type & 0x7F) + 1;
         type &= 0x80;
-        if (x + count > w && x + count + 1 > (h - y) * w) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Packet went out of bounds: position (%i,%i) size %i\n",
-                   x, y, count);
-            return AVERROR_INVALIDDATA;
-        }
         if (!type) {
             do {
                 int n  = FFMIN(count, w - x);
@@ -67,10 +76,9 @@ static int targa_decode_rle(AVCodecContext *avctx, TargaContext *s,
                 x     += n;
                 if (x == w) {
                     x    = 0;
-                    y++;
-                    dst += diff;
+                    dst = line = advance_line(start, line, stride, &y, h, interleave);
                 }
-            } while (count > 0);
+            } while (dst && count > 0);
         } else {
             uint8_t tmp[4];
             bytestream2_get_buffer(&s->gb, tmp, depth);
@@ -84,12 +92,17 @@ static int targa_decode_rle(AVCodecContext *avctx, TargaContext *s,
                 } while (--n);
                 if (x == w) {
                     x    = 0;
-                    y++;
-                    dst += diff;
+                    dst = line = advance_line(start, line, stride, &y, h, interleave);
                 }
-            } while (count > 0);
+            } while (dst && count > 0);
         }
     }
+
+    if (count) {
+        av_log(avctx, AV_LOG_ERROR, "Packet went out of bounds\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     return 0;
 }
 
@@ -101,14 +114,15 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame * const p = data;
     uint8_t *dst;
     int stride;
-    int idlen, compr, y, w, h, bpp, flags, ret;
+    int idlen, pal, compr, y, w, h, bpp, flags, ret;
     int first_clr, colors, csize;
+    int interleave;
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
     /* parse image header */
     idlen     = bytestream2_get_byte(&s->gb);
-    bytestream2_skip(&s->gb, 1); /* pal */
+    pal       = bytestream2_get_byte(&s->gb);
     compr     = bytestream2_get_byte(&s->gb);
     first_clr = bytestream2_get_le16(&s->gb);
     colors    = bytestream2_get_le16(&s->gb);
@@ -117,17 +131,29 @@ static int decode_frame(AVCodecContext *avctx,
     w         = bytestream2_get_le16(&s->gb);
     h         = bytestream2_get_le16(&s->gb);
     bpp       = bytestream2_get_byte(&s->gb);
+
+    if (bytestream2_get_bytes_left(&s->gb) <= idlen) {
+        av_log(avctx, AV_LOG_ERROR,
+                "Not enough data to read header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     flags     = bytestream2_get_byte(&s->gb);
+
+    if (!pal && (first_clr || colors || csize)) {
+        av_log(avctx, AV_LOG_WARNING, "File without colormap has colormap information set.\n");
+        // specification says we should ignore those value in this case
+        first_clr = colors = csize = 0;
+    }
+
     // skip identifier if any
     bytestream2_skip(&s->gb, idlen);
 
-    switch(bpp){
+    switch (bpp) {
     case 8:
         avctx->pix_fmt = ((compr & (~TGA_RLE)) == TGA_BW) ? AV_PIX_FMT_GRAY8 : AV_PIX_FMT_PAL8;
         break;
     case 15:
-        avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
-        break;
     case 16:
         avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
         break;
@@ -142,28 +168,34 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
+    if (colors && (colors + first_clr) > 256) {
+        av_log(avctx, AV_LOG_ERROR, "Incorrect palette: %i colors with offset %i\n", colors, first_clr);
+        return AVERROR_INVALIDDATA;
+    }
+
     if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0){
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
-    if(flags & 0x20){
+    p->pict_type = AV_PICTURE_TYPE_I;
+
+    if (flags & TGA_TOPTOBOTTOM) {
         dst = p->data[0];
         stride = p->linesize[0];
-    }else{ //image is upside-down
+    } else { //image is upside-down
         dst = p->data[0] + p->linesize[0] * (h - 1);
         stride = -p->linesize[0];
     }
 
-    if(colors){
+    interleave = flags & TGA_INTERLEAVE2 ? 2 :
+                 flags & TGA_INTERLEAVE4 ? 4 : 1;
+
+    if (colors) {
         int pal_size, pal_sample_size;
-        if((colors + first_clr) > 256){
-            av_log(avctx, AV_LOG_ERROR, "Incorrect palette: %i colors with offset %i\n", colors, first_clr);
-            return AVERROR_INVALIDDATA;
-        }
+
         switch (csize) {
+        case 32: pal_sample_size = 4; break;
         case 24: pal_sample_size = 3; break;
         case 16:
         case 15: pal_sample_size = 2; break;
@@ -172,9 +204,9 @@ static int decode_frame(AVCodecContext *avctx,
             return AVERROR_INVALIDDATA;
         }
         pal_size = colors * pal_sample_size;
-        if(avctx->pix_fmt != AV_PIX_FMT_PAL8)//should not occur but skip palette anyway
+        if (avctx->pix_fmt != AV_PIX_FMT_PAL8) //should not occur but skip palette anyway
             bytestream2_skip(&s->gb, pal_size);
-        else{
+        else {
             int t;
             uint32_t *pal = ((uint32_t *)p->data[1]) + first_clr;
 
@@ -184,10 +216,14 @@ static int decode_frame(AVCodecContext *avctx,
                 return AVERROR_INVALIDDATA;
             }
             switch (pal_sample_size) {
+            case 4:
+                for (t = 0; t < colors; t++)
+                    *pal++ = bytestream2_get_le32u(&s->gb);
+                break;
             case 3:
                 /* RGB24 */
                 for (t = 0; t < colors; t++)
-                    *pal++ = bytestream2_get_le24u(&s->gb);
+                    *pal++ = (0xffU<<24) | bytestream2_get_le24u(&s->gb);
                 break;
             case 2:
                 /* RGB555 */
@@ -198,30 +234,59 @@ static int decode_frame(AVCodecContext *avctx,
                         ((v & 0x001F) <<  3);
                     /* left bit replication */
                     v |= (v & 0xE0E0E0U) >> 5;
-                    *pal++ = v;
+                    *pal++ = (0xffU<<24) | v;
                 }
                 break;
             }
             p->palette_has_changed = 1;
         }
     }
+
     if ((compr & (~TGA_RLE)) == TGA_NODATA) {
         memset(p->data[0], 0, p->linesize[0] * h);
     } else {
-        if(compr & TGA_RLE){
-            int res = targa_decode_rle(avctx, s, dst, w, h, stride, bpp);
+        if (compr & TGA_RLE) {
+            int res = targa_decode_rle(avctx, s, dst, w, h, stride, bpp, interleave);
             if (res < 0)
                 return res;
         } else {
             size_t img_size = w * ((bpp + 1) >> 3);
+            uint8_t *line;
             if (bytestream2_get_bytes_left(&s->gb) < img_size * h) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Not enough data available for image\n");
                 return AVERROR_INVALIDDATA;
             }
-            for (y = 0; y < h; y++) {
-                bytestream2_get_bufferu(&s->gb, dst, img_size);
-                dst += stride;
+
+            line = dst;
+            y = 0;
+            do {
+                bytestream2_get_buffer(&s->gb, line, img_size);
+                line = advance_line(dst, line, stride, &y, h, interleave);
+            } while (line);
+        }
+    }
+
+    if (flags & TGA_RIGHTTOLEFT) { // right-to-left, needs horizontal flip
+        int x;
+        for (y = 0; y < h; y++) {
+            void *line = &p->data[0][y * p->linesize[0]];
+            for (x = 0; x < w >> 1; x++) {
+                switch (bpp) {
+                case 32:
+                    FFSWAP(uint32_t, ((uint32_t *)line)[x], ((uint32_t *)line)[w - x - 1]);
+                    break;
+                case 24:
+                    FFSWAP(uint8_t, ((uint8_t *)line)[3 * x    ], ((uint8_t *)line)[3 * w - 3 * x - 3]);
+                    FFSWAP(uint8_t, ((uint8_t *)line)[3 * x + 1], ((uint8_t *)line)[3 * w - 3 * x - 2]);
+                    FFSWAP(uint8_t, ((uint8_t *)line)[3 * x + 2], ((uint8_t *)line)[3 * w - 3 * x - 1]);
+                    break;
+                case 16:
+                    FFSWAP(uint16_t, ((uint16_t *)line)[x], ((uint16_t *)line)[w - x - 1]);
+                    break;
+                case 8:
+                    FFSWAP(uint8_t, ((uint8_t *)line)[x], ((uint8_t *)line)[w - x - 1]);
+                }
             }
         }
     }
diff --git a/libavcodec/targa.h b/libavcodec/targa.h
index f4ef5537b1..c2f522465b 100644
--- a/libavcodec/targa.h
+++ b/libavcodec/targa.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,4 +38,11 @@ enum TargaCompr {
     TGA_RLE    = 8, // flag pointing that data is RLE-coded
 };
 
+enum TargaFlags {
+    TGA_RIGHTTOLEFT = 0x10, // right-to-left (flipped horizontally)
+    TGA_TOPTOBOTTOM = 0x20, // top-to-bottom (NOT flipped vertically)
+    TGA_INTERLEAVE2 = 0x40, // 2-way interleave, odd then even lines
+    TGA_INTERLEAVE4 = 0x80, // 4-way interleave
+};
+
 #endif /* AVCODEC_TARGA_H */
diff --git a/libavcodec/targa_y216dec.c b/libavcodec/targa_y216dec.c
new file mode 100644
index 0000000000..21b3d35d67
--- /dev/null
+++ b/libavcodec/targa_y216dec.c
@@ -0,0 +1,83 @@
+/*
+ * Pinnacle TARGA CineWave YUV16 decoder
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int y216_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt             = AV_PIX_FMT_YUV422P16;
+    avctx->bits_per_raw_sample = 14;
+
+    return 0;
+}
+
+static int y216_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *pic = data;
+    const uint16_t *src = (uint16_t *)avpkt->data;
+    uint16_t *y, *u, *v, aligned_width = FFALIGN(avctx->width, 4);
+    int i, j, ret;
+
+    if (avpkt->size < 4 * avctx->height * aligned_width) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    y = (uint16_t *)pic->data[0];
+    u = (uint16_t *)pic->data[1];
+    v = (uint16_t *)pic->data[2];
+
+    for (i = 0; i < avctx->height; i++) {
+        for (j = 0; j < avctx->width >> 1; j++) {
+            u[    j    ] = src[4 * j    ] << 2 | src[4 * j    ] >> 14;
+            y[2 * j    ] = src[4 * j + 1] << 2 | src[4 * j + 1] >> 14;
+            v[    j    ] = src[4 * j + 2] << 2 | src[4 * j + 2] >> 14;
+            y[2 * j + 1] = src[4 * j + 3] << 2 | src[4 * j + 3] >> 14;
+        }
+
+        y += pic->linesize[0] >> 1;
+        u += pic->linesize[1] >> 1;
+        v += pic->linesize[2] >> 1;
+        src += aligned_width << 1;
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_targa_y216_decoder = {
+    .name         = "targa_y216",
+    .long_name    = NULL_IF_CONFIG_SMALL("Pinnacle TARGA CineWave YUV16"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_TARGA_Y216,
+    .init         = y216_decode_init,
+    .decode       = y216_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/targaenc.c b/libavcodec/targaenc.c
index a85400f958..de8163a5f0 100644
--- a/libavcodec/targaenc.c
+++ b/libavcodec/targaenc.c
@@ -2,20 +2,20 @@
  * Targa (.tga) image encoder
  * Copyright (c) 2007 Bobby Bingham
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,7 +77,7 @@ static int targa_encode_normal(uint8_t *outbuf, const AVFrame *pic, int bpp, int
 static int targa_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                               const AVFrame *p, int *got_packet)
 {
-    int bpp, picsize, datasize = -1, ret;
+    int bpp, picsize, datasize = -1, ret, i;
     uint8_t *out;
 
     if(avctx->width > 0xffff || avctx->height > 0xffff) {
@@ -85,10 +85,8 @@ static int targa_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         return AVERROR(EINVAL);
     }
     picsize = avpicture_get_size(avctx->pix_fmt, avctx->width, avctx->height);
-    if ((ret = ff_alloc_packet(pkt, picsize + 45)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "encoded frame too large\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, picsize + 45, 0)) < 0)
         return ret;
-    }
 
     /* zero out the header and only set applicable fields */
     memset(pkt->data, 0, 12);
@@ -97,13 +95,39 @@ static int targa_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     /* image descriptor byte: origin is always top-left, bits 0-3 specify alpha */
     pkt->data[17] = 0x20 | (avctx->pix_fmt == AV_PIX_FMT_BGRA ? 8 : 0);
 
+    out = pkt->data + 18;  /* skip past the header we write */
+
+    avctx->bits_per_coded_sample = av_get_bits_per_pixel(av_pix_fmt_desc_get(avctx->pix_fmt));
     switch(avctx->pix_fmt) {
+    case AV_PIX_FMT_PAL8: {
+        int pal_bpp = 24; /* Only write 32bit palette if there is transparency information */
+        for (i = 0; i < 256; i++)
+            if (AV_RN32(p->data[1] + 4 * i) >> 24 != 0xFF) {
+                pal_bpp = 32;
+                break;
+            }
+        pkt->data[1]  = 1;          /* palette present */
+        pkt->data[2]  = TGA_PAL;    /* uncompressed palettised image */
+        pkt->data[6]  = 1;          /* palette contains 256 entries */
+        pkt->data[7]  = pal_bpp;    /* palette contains pal_bpp bit entries */
+        pkt->data[16] = 8;          /* bpp */
+        for (i = 0; i < 256; i++)
+            if (pal_bpp == 32) {
+                AV_WL32(pkt->data + 18 + 4 * i, *(uint32_t *)(p->data[1] + i * 4));
+            } else {
+            AV_WL24(pkt->data + 18 + 3 * i, *(uint32_t *)(p->data[1] + i * 4));
+            }
+        out += 32 * pal_bpp;        /* skip past the palette we just output */
+        break;
+        }
     case AV_PIX_FMT_GRAY8:
         pkt->data[2]  = TGA_BW;     /* uncompressed grayscale image */
+        avctx->bits_per_coded_sample = 0x28;
         pkt->data[16] = 8;          /* bpp */
         break;
     case AV_PIX_FMT_RGB555LE:
-        pkt->data[2]  = TGA_RGB;    /* uncompresses true-color image */
+        pkt->data[2]  = TGA_RGB;    /* uncompressed true-color image */
+        avctx->bits_per_coded_sample =
         pkt->data[16] = 16;         /* bpp */
         break;
     case AV_PIX_FMT_BGR24:
@@ -121,15 +145,13 @@ static int targa_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
     bpp = pkt->data[16] >> 3;
 
-    out = pkt->data + 18;  /* skip past the header we just output */
-
     /* try RLE compression */
     if (avctx->coder_type != FF_CODER_TYPE_RAW)
         datasize = targa_encode_rle(out, picsize, p, bpp, avctx->width, avctx->height);
 
     /* if that worked well, mark the picture as RLE compressed */
     if(datasize >= 0)
-        pkt->data[2] |= 8;
+        pkt->data[2] |= TGA_RLE;
 
     /* if RLE didn't make it smaller, go back to no compression */
     else datasize = targa_encode_normal(out, p, bpp, avctx->width, avctx->height);
@@ -168,7 +190,7 @@ AVCodec ff_targa_encoder = {
     .init           = targa_encode_init,
     .encode2        = targa_encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_BGR24, AV_PIX_FMT_BGRA, AV_PIX_FMT_RGB555LE, AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_BGR24, AV_PIX_FMT_BGRA, AV_PIX_FMT_RGB555LE, AV_PIX_FMT_GRAY8, AV_PIX_FMT_PAL8,
         AV_PIX_FMT_NONE
     },
 };
diff --git a/libavcodec/tdsc.c b/libavcodec/tdsc.c
index 9e1ef2de87..63cd44341d 100644
--- a/libavcodec/tdsc.c
+++ b/libavcodec/tdsc.c
@@ -2,20 +2,20 @@
  * TDSC decoder
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -124,8 +124,8 @@ static av_cold int tdsc_init(AVCodecContext *avctx)
     ctx->jpeg_avctx->flags = avctx->flags;
     ctx->jpeg_avctx->flags2 = avctx->flags2;
     ctx->jpeg_avctx->dct_algo = avctx->dct_algo;
-    ctx->jpeg_avctx->idct_algo = avctx->idct_algo;;
-    ret = avcodec_open2(ctx->jpeg_avctx, codec, NULL);
+    ctx->jpeg_avctx->idct_algo = avctx->idct_algo;
+    ret = ff_codec_open2_recursive(ctx->jpeg_avctx, codec, NULL);
     if (ret < 0)
         return ret;
 
diff --git a/libavcodec/textdec.c b/libavcodec/textdec.c
new file mode 100644
index 0000000000..a6c8722c1d
--- /dev/null
+++ b/libavcodec/textdec.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Raw subtitles decoder
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/bprint.h"
+#include "libavutil/opt.h"
+
+typedef struct {
+    AVClass *class;
+    const char *linebreaks;
+    int keep_ass_markup;
+} TextContext;
+
+#define OFFSET(x) offsetof(TextContext, x)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    { "keep_ass_markup", "Set if ASS tags must be escaped", OFFSET(keep_ass_markup), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, .flags=SD },
+    { NULL }
+};
+
+static int text_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret = 0;
+    AVBPrint buf;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    const TextContext *text = avctx->priv_data;
+    const int ts_start     = av_rescale_q(avpkt->pts,      avctx->time_base, (AVRational){1,100});
+    const int ts_duration  = avpkt->duration != -1 ?
+                             av_rescale_q(avpkt->duration, avctx->time_base, (AVRational){1,100}) : -1;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (ptr && avpkt->size > 0 && *ptr) {
+        ff_ass_bprint_text_event(&buf, ptr, avpkt->size, text->linebreaks, text->keep_ass_markup);
+        ret = ff_ass_add_rect_bprint(sub, &buf, ts_start, ts_duration);
+    }
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+#define DECLARE_CLASS(decname) static const AVClass decname ## _decoder_class = {   \
+    .class_name = #decname " decoder",      \
+    .item_name  = av_default_item_name,     \
+    .option     = decname ## _options,      \
+    .version    = LIBAVUTIL_VERSION_INT,    \
+}
+
+#if CONFIG_TEXT_DECODER
+#define text_options options
+DECLARE_CLASS(text);
+
+AVCodec ff_text_decoder = {
+    .name           = "text",
+    .long_name      = NULL_IF_CONFIG_SMALL("Raw text subtitle"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_TEXT,
+    .decode         = text_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+    .priv_class     = &text_decoder_class,
+};
+#endif
+
+#if CONFIG_VPLAYER_DECODER || CONFIG_PJS_DECODER || CONFIG_SUBVIEWER1_DECODER || CONFIG_STL_DECODER
+
+static int linebreak_init(AVCodecContext *avctx)
+{
+    TextContext *text = avctx->priv_data;
+    text->linebreaks = "|";
+    return ff_ass_subtitle_header_default(avctx);
+}
+
+#if CONFIG_VPLAYER_DECODER
+#define vplayer_options options
+DECLARE_CLASS(vplayer);
+
+AVCodec ff_vplayer_decoder = {
+    .name           = "vplayer",
+    .long_name      = NULL_IF_CONFIG_SMALL("VPlayer subtitle"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_VPLAYER,
+    .decode         = text_decode_frame,
+    .init           = linebreak_init,
+    .priv_class     = &vplayer_decoder_class,
+};
+#endif
+
+#if CONFIG_STL_DECODER
+#define stl_options options
+DECLARE_CLASS(stl);
+
+AVCodec ff_stl_decoder = {
+    .name           = "stl",
+    .long_name      = NULL_IF_CONFIG_SMALL("Spruce subtitle format"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_STL,
+    .decode         = text_decode_frame,
+    .init           = linebreak_init,
+    .priv_class     = &stl_decoder_class,
+};
+#endif
+
+#if CONFIG_PJS_DECODER
+#define pjs_options options
+DECLARE_CLASS(pjs);
+
+AVCodec ff_pjs_decoder = {
+    .name           = "pjs",
+    .long_name      = NULL_IF_CONFIG_SMALL("PJS subtitle"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_PJS,
+    .decode         = text_decode_frame,
+    .init           = linebreak_init,
+    .priv_class     = &pjs_decoder_class,
+};
+#endif
+
+#if CONFIG_SUBVIEWER1_DECODER
+#define subviewer1_options options
+DECLARE_CLASS(subviewer1);
+
+AVCodec ff_subviewer1_decoder = {
+    .name           = "subviewer1",
+    .long_name      = NULL_IF_CONFIG_SMALL("SubViewer1 subtitle"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SUBVIEWER1,
+    .decode         = text_decode_frame,
+    .init           = linebreak_init,
+    .priv_class     = &subviewer1_decoder_class,
+};
+#endif
+
+#endif /* text subtitles with '|' line break */
diff --git a/libavcodec/texturedsp.c b/libavcodec/texturedsp.c
index 78eb6fa0ce..19aa353821 100644
--- a/libavcodec/texturedsp.c
+++ b/libavcodec/texturedsp.c
@@ -31,7 +31,7 @@
 
 #include "texturedsp.h"
 
-#define RGBA(r, g, b, a) (r) | ((g) << 8) | ((b) << 16) | ((a) << 24)
+#define RGBA(r, g, b, a) ((r) | ((g) << 8) | ((b) << 16) | ((a) << 24))
 
 static av_always_inline void extract_color(uint32_t colors[4],
                                            uint16_t color0,
@@ -425,7 +425,7 @@ static inline void rgtc_block_internal(uint8_t *dst, ptrdiff_t stride,
             int i = indices[x + y * 4];
             /* Interval expansion from [-1 1] or [0 1] to [0 255]. */
             int c = color_tab[i];
-            uint32_t pixel = RGBA(c, c, c, 255);
+            uint32_t pixel = RGBA(c, c, c, 255U);
             AV_WL32(dst + x * 4 + y * stride, pixel);
         }
     }
diff --git a/libavcodec/texturedsp.h b/libavcodec/texturedsp.h
index fcbe7a4d26..26f3b6473f 100644
--- a/libavcodec/texturedsp.h
+++ b/libavcodec/texturedsp.h
@@ -2,20 +2,20 @@
  * Texture block module
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/texturedspenc.c b/libavcodec/texturedspenc.c
index 2036998c81..5bd07771e7 100644
--- a/libavcodec/texturedspenc.c
+++ b/libavcodec/texturedspenc.c
@@ -3,7 +3,7 @@
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  * Based on public domain code by Fabian Giesen, Sean Barrett and Yann Collet.
  *
- * This file is part of Libav
+ * This file is part of FFmpeg
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -140,14 +140,14 @@ const static uint8_t match6[256][2] = {
 };
 
 /* Multiplication over 8 bit emulation */
-#define mul8(a, b) (a * b + 128 + ((a * b + 128) >> 8)) >> 8
+#define mul8(a, b) (((a) * (b) + 128 + (((a) * (b) + 128) >> 8)) >> 8)
 
 /* Conversion from rgb24 to rgb565 */
 #define rgb2rgb565(r, g, b) \
-    (mul8(r, 31) << 11) | (mul8(g, 63) << 5) | (mul8(b, 31) << 0)
+    ((mul8(r, 31) << 11) | (mul8(g, 63) << 5) | (mul8(b, 31) << 0))
 
 /* Linear interpolation at 1/3 point between a and b */
-#define lerp13(a, b) (2 * a + b) / 3
+#define lerp13(a, b) ((2 * (a) + (b)) / 3)
 
 /* Linear interpolation on an RGB pixel */
 static inline void lerp13rgb(uint8_t *out, uint8_t *p1, uint8_t *p2)
diff --git a/libavcodec/thread.h b/libavcodec/thread.h
index 864e67eb98..c848d7ae8b 100644
--- a/libavcodec/thread.h
+++ b/libavcodec/thread.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Alexander Strange <astrange@ithinksw.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -98,6 +98,16 @@ void ff_thread_report_progress(ThreadFrame *f, int progress, int field);
 void ff_thread_await_progress(ThreadFrame *f, int progress, int field);
 
 /**
+ * Wrapper around get_format() for frame-multithreaded codecs.
+ * Call this function instead of avctx->get_format().
+ * Cannot be called after the codec has called ff_thread_finish_setup().
+ *
+ * @param avctx The current context.
+ * @param fmt The list of available formats.
+ */
+enum AVPixelFormat ff_thread_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt);
+
+/**
  * Wrapper around get_buffer() for frame-multithreaded codecs.
  * Call this function instead of ff_get_buffer(f).
  * Cannot be called after the codec has called ff_thread_finish_setup().
@@ -125,4 +135,9 @@ int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src);
 int ff_thread_init(AVCodecContext *s);
 void ff_thread_free(AVCodecContext *s);
 
+int ff_alloc_entries(AVCodecContext *avctx, int count);
+void ff_reset_entries(AVCodecContext *avctx);
+void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n);
+void ff_thread_await_progress2(AVCodecContext *avctx,  int field, int thread, int shift);
+
 #endif /* AVCODEC_THREAD_H */
diff --git a/libavcodec/tiertexseqv.c b/libavcodec/tiertexseqv.c
index 626324a170..df12ee3809 100644
--- a/libavcodec/tiertexseqv.c
+++ b/libavcodec/tiertexseqv.c
@@ -2,20 +2,20 @@
  * Tiertex Limited SEQ Video Decoder
  * Copyright (c) 2006 Gregory Montoir (cyx@users.sourceforge.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -179,7 +179,7 @@ static int seqvideo_decode(SeqVideoContext *seq, const unsigned char *data, int
         for (i = 0; i < 256; i++) {
             for (j = 0; j < 3; j++, data++)
                 c[j] = (*data << 2) | (*data >> 4);
-            palette[i] = AV_RB24(c);
+            palette[i] = 0xFFU << 24 | AV_RB24(c);
         }
         seq->frame->palette_has_changed = 1;
     }
@@ -234,10 +234,8 @@ static int seqvideo_decode_frame(AVCodecContext *avctx,
 
     SeqVideoContext *seq = avctx->priv_data;
 
-    if ((ret = ff_reget_buffer(avctx, seq->frame)) < 0) {
-        av_log(seq->avctx, AV_LOG_ERROR, "tiertexseqvideo: reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, seq->frame)) < 0)
         return ret;
-    }
 
     if (seqvideo_decode(seq, buf, buf_size))
         return AVERROR_INVALIDDATA;
diff --git a/libavcodec/tiff.c b/libavcodec/tiff.c
index 97b9d6fb01..d5c6398a17 100644
--- a/libavcodec/tiff.c
+++ b/libavcodec/tiff.c
@@ -1,21 +1,20 @@
 /*
- * TIFF image decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,8 +28,13 @@
 #if CONFIG_ZLIB
 #include <zlib.h>
 #endif
+#if CONFIG_LZMA
+#define LZMA_API_STATIC
+#include <lzma.h>
+#endif
 
 #include "libavutil/attributes.h"
+#include "libavutil/avstring.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
@@ -40,6 +44,8 @@
 #include "lzw.h"
 #include "mathops.h"
 #include "tiff.h"
+#include "tiff_data.h"
+#include "thread.h"
 
 typedef struct TiffContext {
     AVCodecContext *avctx;
@@ -53,33 +59,259 @@ typedef struct TiffContext {
     enum TiffCompr compr;
     enum TiffPhotometric photometric;
     int planar;
+    int subsampling[2];
     int fax_opts;
     int predictor;
     int fill_order;
+    uint32_t res[4];
 
     int strips, rps, sstype;
     int sot;
     int stripsizesoff, stripsize, stripoff, strippos;
     LZWState *lzw;
+
+    uint8_t *deinvert_buf;
+    int deinvert_buf_size;
+    uint8_t *yuv_line;
+    unsigned int yuv_line_size;
+
+    int geotag_count;
+    TiffGeoTag *geotags;
 } TiffContext;
 
-static unsigned tget_short(GetByteContext *gb, int le)
+static void free_geotags(TiffContext *const s)
+{
+    int i;
+    for (i = 0; i < s->geotag_count; i++) {
+        if (s->geotags[i].val)
+            av_freep(&s->geotags[i].val);
+    }
+    av_freep(&s->geotags);
+    s->geotag_count = 0;
+}
+
+#define RET_GEOKEY(TYPE, array, element)\
+    if (key >= TIFF_##TYPE##_KEY_ID_OFFSET &&\
+        key - TIFF_##TYPE##_KEY_ID_OFFSET < FF_ARRAY_ELEMS(ff_tiff_##array##_name_type_map))\
+        return ff_tiff_##array##_name_type_map[key - TIFF_##TYPE##_KEY_ID_OFFSET].element;
+
+static const char *get_geokey_name(int key)
+{
+    RET_GEOKEY(VERT, vert, name);
+    RET_GEOKEY(PROJ, proj, name);
+    RET_GEOKEY(GEOG, geog, name);
+    RET_GEOKEY(CONF, conf, name);
+
+    return NULL;
+}
+
+static int get_geokey_type(int key)
+{
+    RET_GEOKEY(VERT, vert, type);
+    RET_GEOKEY(PROJ, proj, type);
+    RET_GEOKEY(GEOG, geog, type);
+    RET_GEOKEY(CONF, conf, type);
+
+    return AVERROR_INVALIDDATA;
+}
+
+static int cmp_id_key(const void *id, const void *k)
 {
-    return le ? bytestream2_get_le16(gb) : bytestream2_get_be16(gb);
+    return *(const int*)id - ((const TiffGeoTagKeyName*)k)->key;
 }
 
-static unsigned tget_long(GetByteContext *gb, int le)
+static const char *search_keyval(const TiffGeoTagKeyName *keys, int n, int id)
 {
-    return le ? bytestream2_get_le32(gb) : bytestream2_get_be32(gb);
+    TiffGeoTagKeyName *r = bsearch(&id, keys, n, sizeof(keys[0]), cmp_id_key);
+    if(r)
+        return r->name;
+
+    return NULL;
 }
 
-static unsigned tget(GetByteContext *gb, int type, int le)
+static char *get_geokey_val(int key, int val)
 {
-    switch (type) {
-    case TIFF_BYTE:  return bytestream2_get_byte(gb);
-    case TIFF_SHORT: return tget_short(gb, le);
-    case TIFF_LONG:  return tget_long(gb, le);
-    default:         return UINT_MAX;
+    char *ap;
+
+    if (val == TIFF_GEO_KEY_UNDEFINED)
+        return av_strdup("undefined");
+    if (val == TIFF_GEO_KEY_USER_DEFINED)
+        return av_strdup("User-Defined");
+
+#define RET_GEOKEY_VAL(TYPE, array)\
+    if (val >= TIFF_##TYPE##_OFFSET &&\
+        val - TIFF_##TYPE##_OFFSET < FF_ARRAY_ELEMS(ff_tiff_##array##_codes))\
+        return av_strdup(ff_tiff_##array##_codes[val - TIFF_##TYPE##_OFFSET]);
+
+    switch (key) {
+    case TIFF_GT_MODEL_TYPE_GEOKEY:
+        RET_GEOKEY_VAL(GT_MODEL_TYPE, gt_model_type);
+        break;
+    case TIFF_GT_RASTER_TYPE_GEOKEY:
+        RET_GEOKEY_VAL(GT_RASTER_TYPE, gt_raster_type);
+        break;
+    case TIFF_GEOG_LINEAR_UNITS_GEOKEY:
+    case TIFF_PROJ_LINEAR_UNITS_GEOKEY:
+    case TIFF_VERTICAL_UNITS_GEOKEY:
+        RET_GEOKEY_VAL(LINEAR_UNIT, linear_unit);
+        break;
+    case TIFF_GEOG_ANGULAR_UNITS_GEOKEY:
+    case TIFF_GEOG_AZIMUTH_UNITS_GEOKEY:
+        RET_GEOKEY_VAL(ANGULAR_UNIT, angular_unit);
+        break;
+    case TIFF_GEOGRAPHIC_TYPE_GEOKEY:
+        RET_GEOKEY_VAL(GCS_TYPE, gcs_type);
+        RET_GEOKEY_VAL(GCSE_TYPE, gcse_type);
+        break;
+    case TIFF_GEOG_GEODETIC_DATUM_GEOKEY:
+        RET_GEOKEY_VAL(GEODETIC_DATUM, geodetic_datum);
+        RET_GEOKEY_VAL(GEODETIC_DATUM_E, geodetic_datum_e);
+        break;
+    case TIFF_GEOG_ELLIPSOID_GEOKEY:
+        RET_GEOKEY_VAL(ELLIPSOID, ellipsoid);
+        break;
+    case TIFF_GEOG_PRIME_MERIDIAN_GEOKEY:
+        RET_GEOKEY_VAL(PRIME_MERIDIAN, prime_meridian);
+        break;
+    case TIFF_PROJECTED_CS_TYPE_GEOKEY:
+        ap = av_strdup(search_keyval(ff_tiff_proj_cs_type_codes, FF_ARRAY_ELEMS(ff_tiff_proj_cs_type_codes), val));
+        if(ap) return ap;
+        break;
+    case TIFF_PROJECTION_GEOKEY:
+        ap = av_strdup(search_keyval(ff_tiff_projection_codes, FF_ARRAY_ELEMS(ff_tiff_projection_codes), val));
+        if(ap) return ap;
+        break;
+    case TIFF_PROJ_COORD_TRANS_GEOKEY:
+        RET_GEOKEY_VAL(COORD_TRANS, coord_trans);
+        break;
+    case TIFF_VERTICAL_CS_TYPE_GEOKEY:
+        RET_GEOKEY_VAL(VERT_CS, vert_cs);
+        RET_GEOKEY_VAL(ORTHO_VERT_CS, ortho_vert_cs);
+        break;
+
+    }
+
+    ap = av_malloc(14);
+    if (ap)
+        snprintf(ap, 14, "Unknown-%d", val);
+    return ap;
+}
+
+static char *doubles2str(double *dp, int count, const char *sep)
+{
+    int i;
+    char *ap, *ap0;
+    uint64_t component_len;
+    if (!sep) sep = ", ";
+    component_len = 24LL + strlen(sep);
+    if (count >= (INT_MAX - 1)/component_len)
+        return NULL;
+    ap = av_malloc(component_len * count + 1);
+    if (!ap)
+        return NULL;
+    ap0   = ap;
+    ap[0] = '\0';
+    for (i = 0; i < count; i++) {
+        unsigned l = snprintf(ap, component_len, "%.15g%s", dp[i], sep);
+        if(l >= component_len) {
+            av_free(ap0);
+            return NULL;
+        }
+        ap += l;
+    }
+    ap0[strlen(ap0) - strlen(sep)] = '\0';
+    return ap0;
+}
+
+static int add_metadata(int count, int type,
+                        const char *name, const char *sep, TiffContext *s, AVFrame *frame)
+{
+    switch(type) {
+    case TIFF_DOUBLE: return ff_tadd_doubles_metadata(count, name, sep, &s->gb, s->le, avpriv_frame_get_metadatap(frame));
+    case TIFF_SHORT : return ff_tadd_shorts_metadata(count, name, sep, &s->gb, s->le, 0, avpriv_frame_get_metadatap(frame));
+    case TIFF_STRING: return ff_tadd_string_metadata(count, name, &s->gb, s->le, avpriv_frame_get_metadatap(frame));
+    default         : return AVERROR_INVALIDDATA;
+    };
+}
+
+static void av_always_inline horizontal_fill(unsigned int bpp, uint8_t* dst,
+                                             int usePtr, const uint8_t *src,
+                                             uint8_t c, int width, int offset)
+{
+    switch (bpp) {
+    case 1:
+        while (--width >= 0) {
+            dst[(width+offset)*8+7] = (usePtr ? src[width] : c)      & 0x1;
+            dst[(width+offset)*8+6] = (usePtr ? src[width] : c) >> 1 & 0x1;
+            dst[(width+offset)*8+5] = (usePtr ? src[width] : c) >> 2 & 0x1;
+            dst[(width+offset)*8+4] = (usePtr ? src[width] : c) >> 3 & 0x1;
+            dst[(width+offset)*8+3] = (usePtr ? src[width] : c) >> 4 & 0x1;
+            dst[(width+offset)*8+2] = (usePtr ? src[width] : c) >> 5 & 0x1;
+            dst[(width+offset)*8+1] = (usePtr ? src[width] : c) >> 6 & 0x1;
+            dst[(width+offset)*8+0] = (usePtr ? src[width] : c) >> 7;
+        }
+        break;
+    case 2:
+        while (--width >= 0) {
+            dst[(width+offset)*4+3] = (usePtr ? src[width] : c) & 0x3;
+            dst[(width+offset)*4+2] = (usePtr ? src[width] : c) >> 2 & 0x3;
+            dst[(width+offset)*4+1] = (usePtr ? src[width] : c) >> 4 & 0x3;
+            dst[(width+offset)*4+0] = (usePtr ? src[width] : c) >> 6;
+        }
+        break;
+    case 4:
+        while (--width >= 0) {
+            dst[(width+offset)*2+1] = (usePtr ? src[width] : c) & 0xF;
+            dst[(width+offset)*2+0] = (usePtr ? src[width] : c) >> 4;
+        }
+        break;
+    default:
+        if (usePtr) {
+            memcpy(dst + offset, src, width);
+        } else {
+            memset(dst + offset, c, width);
+        }
+    }
+}
+
+static int deinvert_buffer(TiffContext *s, const uint8_t *src, int size)
+{
+    int i;
+
+    av_fast_padded_malloc(&s->deinvert_buf, &s->deinvert_buf_size, size);
+    if (!s->deinvert_buf)
+        return AVERROR(ENOMEM);
+    for (i = 0; i < size; i++)
+        s->deinvert_buf[i] = ff_reverse[src[i]];
+
+    return 0;
+}
+
+static void unpack_yuv(TiffContext *s, AVFrame *p,
+                       const uint8_t *src, int lnum)
+{
+    int i, j, k;
+    int w       = (s->width - 1) / s->subsampling[0] + 1;
+    uint8_t *pu = &p->data[1][lnum / s->subsampling[1] * p->linesize[1]];
+    uint8_t *pv = &p->data[2][lnum / s->subsampling[1] * p->linesize[2]];
+    if (s->width % s->subsampling[0] || s->height % s->subsampling[1]) {
+        for (i = 0; i < w; i++) {
+            for (j = 0; j < s->subsampling[1]; j++)
+                for (k = 0; k < s->subsampling[0]; k++)
+                    p->data[0][FFMIN(lnum + j, s->height-1) * p->linesize[0] +
+                               FFMIN(i * s->subsampling[0] + k, s->width-1)] = *src++;
+            *pu++ = *src++;
+            *pv++ = *src++;
+        }
+    }else{
+        for (i = 0; i < w; i++) {
+            for (j = 0; j < s->subsampling[1]; j++)
+                for (k = 0; k < s->subsampling[0]; k++)
+                    p->data[0][(lnum + j) * p->linesize[0] +
+                               i * s->subsampling[0] + k] = *src++;
+            *pu++ = *src++;
+            *pv++ = *src++;
+        }
     }
 }
 
@@ -90,7 +322,7 @@ static int tiff_uncompress(uint8_t *dst, unsigned long *len, const uint8_t *src,
     z_stream zstream = { 0 };
     int zret;
 
-    zstream.next_in   = src;
+    zstream.next_in   = (uint8_t *)src;
     zstream.avail_in  = size;
     zstream.next_out  = dst;
     zstream.avail_out = *len;
@@ -105,9 +337,9 @@ static int tiff_uncompress(uint8_t *dst, unsigned long *len, const uint8_t *src,
     return zret == Z_STREAM_END ? Z_OK : zret;
 }
 
-static int tiff_unpack_zlib(TiffContext *s, uint8_t *dst, int stride,
-                            const uint8_t *src, int size,
-                            int width, int lines)
+static int tiff_unpack_zlib(TiffContext *s, AVFrame *p, uint8_t *dst, int stride,
+                            const uint8_t *src, int size, int width, int lines,
+                            int strip_start, int is_yuv)
 {
     uint8_t *zbuf;
     unsigned long outlen;
@@ -116,6 +348,13 @@ static int tiff_unpack_zlib(TiffContext *s, uint8_t *dst, int stride,
     zbuf   = av_malloc(outlen);
     if (!zbuf)
         return AVERROR(ENOMEM);
+    if (s->fill_order) {
+        if ((ret = deinvert_buffer(s, src, size)) < 0) {
+            av_free(zbuf);
+            return ret;
+        }
+        src = s->deinvert_buf;
+    }
     ret = tiff_uncompress(zbuf, &outlen, src, size);
     if (ret != Z_OK) {
         av_log(s->avctx, AV_LOG_ERROR,
@@ -126,7 +365,15 @@ static int tiff_unpack_zlib(TiffContext *s, uint8_t *dst, int stride,
     }
     src = zbuf;
     for (line = 0; line < lines; line++) {
-        memcpy(dst, src, width);
+        if (s->bpp < 8 && s->avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+            horizontal_fill(s->bpp, dst, 1, src, 0, width, 0);
+        } else {
+            memcpy(dst, src, width);
+        }
+        if (is_yuv) {
+            unpack_yuv(s, p, dst, strip_start + line);
+            line += s->subsampling[1] - 1;
+        }
         dst += stride;
         src += width;
     }
@@ -135,11 +382,76 @@ static int tiff_unpack_zlib(TiffContext *s, uint8_t *dst, int stride,
 }
 #endif
 
+#if CONFIG_LZMA
+static int tiff_uncompress_lzma(uint8_t *dst, uint64_t *len, const uint8_t *src,
+                                int size)
+{
+    lzma_stream stream = LZMA_STREAM_INIT;
+    lzma_ret ret;
+
+    stream.next_in   = (uint8_t *)src;
+    stream.avail_in  = size;
+    stream.next_out  = dst;
+    stream.avail_out = *len;
+    ret              = lzma_stream_decoder(&stream, UINT64_MAX, 0);
+    if (ret != LZMA_OK) {
+        av_log(NULL, AV_LOG_ERROR, "LZMA init error: %d\n", ret);
+        return ret;
+    }
+    ret = lzma_code(&stream, LZMA_RUN);
+    lzma_end(&stream);
+    *len = stream.total_out;
+    return ret == LZMA_STREAM_END ? LZMA_OK : ret;
+}
+
+static int tiff_unpack_lzma(TiffContext *s, AVFrame *p, uint8_t *dst, int stride,
+                            const uint8_t *src, int size, int width, int lines,
+                            int strip_start, int is_yuv)
+{
+    uint64_t outlen = width * lines;
+    int ret, line;
+    uint8_t *buf = av_malloc(outlen);
+    if (!buf)
+        return AVERROR(ENOMEM);
+    if (s->fill_order) {
+        if ((ret = deinvert_buffer(s, src, size)) < 0) {
+            av_free(buf);
+            return ret;
+        }
+        src = s->deinvert_buf;
+    }
+    ret = tiff_uncompress_lzma(buf, &outlen, src, size);
+    if (ret != LZMA_OK) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Uncompressing failed (%"PRIu64" of %"PRIu64") with error %d\n", outlen,
+               (uint64_t)width * lines, ret);
+        av_free(buf);
+        return AVERROR_UNKNOWN;
+    }
+    src = buf;
+    for (line = 0; line < lines; line++) {
+        if (s->bpp < 8 && s->avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+            horizontal_fill(s->bpp, dst, 1, src, 0, width, 0);
+        } else {
+            memcpy(dst, src, width);
+        }
+        if (is_yuv) {
+            unpack_yuv(s, p, dst, strip_start + line);
+            line += s->subsampling[1] - 1;
+        }
+        dst += stride;
+        src += width;
+    }
+    av_free(buf);
+    return 0;
+}
+#endif
 
 static int tiff_unpack_fax(TiffContext *s, uint8_t *dst, int stride,
-                           const uint8_t *src, int size, int lines)
+                           const uint8_t *src, int size, int width, int lines)
 {
     int i, ret = 0;
+    int line;
     uint8_t *src2 = av_malloc((unsigned)size +
                               AV_INPUT_BUFFER_PADDING_SIZE);
 
@@ -148,11 +460,7 @@ static int tiff_unpack_fax(TiffContext *s, uint8_t *dst, int stride,
                "Error allocating temporary buffer\n");
         return AVERROR(ENOMEM);
     }
-    if (s->fax_opts & 2) {
-        avpriv_request_sample(s->avctx, "Uncompressed fax mode");
-        av_free(src2);
-        return AVERROR_PATCHWELCOME;
-    }
+
     if (!s->fill_order) {
         memcpy(src2, src, size);
     } else {
@@ -162,16 +470,26 @@ static int tiff_unpack_fax(TiffContext *s, uint8_t *dst, int stride,
     memset(src2 + size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
     ret = ff_ccitt_unpack(s->avctx, src2, size, dst, lines, stride,
                           s->compr, s->fax_opts);
+    if (s->bpp < 8 && s->avctx->pix_fmt == AV_PIX_FMT_PAL8)
+        for (line = 0; line < lines; line++) {
+            horizontal_fill(s->bpp, dst, 1, dst, 0, width, 0);
+            dst += stride;
+        }
     av_free(src2);
     return ret;
 }
 
-static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
-                             const uint8_t *src, int size, int lines)
+static int tiff_unpack_strip(TiffContext *s, AVFrame *p, uint8_t *dst, int stride,
+                             const uint8_t *src, int size, int strip_start, int lines)
 {
     PutByteContext pb;
     int c, line, pixels, code, ret;
+    const uint8_t *ssrc = src;
     int width = ((s->width * s->bpp) + 7) >> 3;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(p->format);
+    int is_yuv = !(desc->flags & AV_PIX_FMT_FLAG_RGB) &&
+                 (desc->flags & AV_PIX_FMT_FLAG_PLANAR) &&
+                 desc->nb_components >= 3;
 
     if (s->planar)
         width /= s->bppcount;
@@ -179,9 +497,27 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
     if (size <= 0)
         return AVERROR_INVALIDDATA;
 
+    if (is_yuv) {
+        int bytes_per_row = (((s->width - 1) / s->subsampling[0] + 1) * s->bpp *
+                            s->subsampling[0] * s->subsampling[1] + 7) >> 3;
+        av_fast_padded_malloc(&s->yuv_line, &s->yuv_line_size, bytes_per_row);
+        if (s->yuv_line == NULL) {
+            av_log(s->avctx, AV_LOG_ERROR, "Not enough memory\n");
+            return AVERROR(ENOMEM);
+        }
+        dst = s->yuv_line;
+        stride = 0;
+
+        width = (s->width - 1) / s->subsampling[0] + 1;
+        width = width * s->subsampling[0] * s->subsampling[1] + 2*width;
+        av_assert0(width <= bytes_per_row);
+        av_assert0(s->bpp == 24);
+    }
+
     if (s->compr == TIFF_DEFLATE || s->compr == TIFF_ADOBE_DEFLATE) {
 #if CONFIG_ZLIB
-        return tiff_unpack_zlib(s, dst, stride, src, size, width, lines);
+        return tiff_unpack_zlib(s, p, dst, stride, src, size, width, lines,
+                                strip_start, is_yuv);
 #else
         av_log(s->avctx, AV_LOG_ERROR,
                "zlib support not enabled, "
@@ -189,7 +525,25 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
         return AVERROR(ENOSYS);
 #endif
     }
+    if (s->compr == TIFF_LZMA) {
+#if CONFIG_LZMA
+        return tiff_unpack_lzma(s, p, dst, stride, src, size, width, lines,
+                                strip_start, is_yuv);
+#else
+        av_log(s->avctx, AV_LOG_ERROR,
+               "LZMA support not enabled\n");
+        return AVERROR(ENOSYS);
+#endif
+    }
     if (s->compr == TIFF_LZW) {
+        if (s->fill_order) {
+            if ((ret = deinvert_buffer(s, src, size)) < 0)
+                return ret;
+            ssrc = src = s->deinvert_buf;
+        }
+        if (size > 1 && !src[0] && (src[1]&1)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Old style LZW is unsupported\n");
+        }
         if ((ret = ff_lzw_decode_init(s->lzw, 8, src, size, FF_LZW_TIFF)) < 0) {
             av_log(s->avctx, AV_LOG_ERROR, "Error initializing LZW decoder\n");
             return ret;
@@ -201,6 +555,12 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
                        pixels, width);
                 return AVERROR_INVALIDDATA;
             }
+            if (s->bpp < 8 && s->avctx->pix_fmt == AV_PIX_FMT_PAL8)
+                horizontal_fill(s->bpp, dst, 1, dst, 0, width, 0);
+            if (is_yuv) {
+                unpack_yuv(s, p, dst, strip_start + line);
+                line += s->subsampling[1] - 1;
+            }
             dst += stride;
         }
         return 0;
@@ -208,49 +568,91 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
     if (s->compr == TIFF_CCITT_RLE ||
         s->compr == TIFF_G3        ||
         s->compr == TIFF_G4) {
-        return tiff_unpack_fax(s, dst, stride, src, size, lines);
+        if (is_yuv)
+            return AVERROR_INVALIDDATA;
+
+        return tiff_unpack_fax(s, dst, stride, src, size, width, lines);
     }
 
     bytestream2_init(&s->gb, src, size);
-    bytestream2_init_writer(&pb, dst, stride * lines);
+    bytestream2_init_writer(&pb, dst, is_yuv ? s->yuv_line_size : (stride * lines));
 
     for (line = 0; line < lines; line++) {
+        if (src - ssrc > size) {
+            av_log(s->avctx, AV_LOG_ERROR, "Source data overread\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         if (bytestream2_get_bytes_left(&s->gb) == 0 || bytestream2_get_eof(&pb))
             break;
         bytestream2_seek_p(&pb, stride * line, SEEK_SET);
         switch (s->compr) {
         case TIFF_RAW:
+            if (ssrc + size - src < width)
+                return AVERROR_INVALIDDATA;
+
             if (!s->fill_order) {
-                bytestream2_copy_buffer(&pb, &s->gb, width);
+                horizontal_fill(s->bpp * (s->avctx->pix_fmt == AV_PIX_FMT_PAL8),
+                                dst, 1, src, 0, width, 0);
             } else {
                 int i;
                 for (i = 0; i < width; i++)
-                    bytestream2_put_byte(&pb, ff_reverse[bytestream2_get_byte(&s->gb)]);
+                    dst[i] = ff_reverse[src[i]];
             }
+            src += width;
             break;
         case TIFF_PACKBITS:
             for (pixels = 0; pixels < width;) {
-                code = ff_u8_to_s8(bytestream2_get_byte(&s->gb));
+                if (ssrc + size - src < 2) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Read went out of bounds\n");
+                    return AVERROR_INVALIDDATA;
+                }
+                code = s->fill_order ? (int8_t) ff_reverse[*src++]: (int8_t) *src++;
                 if (code >= 0) {
                     code++;
-                    bytestream2_copy_buffer(&pb, &s->gb, code);
+                    if (pixels + code > width ||
+                        ssrc + size - src < code) {
+                        av_log(s->avctx, AV_LOG_ERROR,
+                               "Copy went out of bounds\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    horizontal_fill(s->bpp * (s->avctx->pix_fmt == AV_PIX_FMT_PAL8),
+                                    dst, 1, src, 0, code, pixels);
+                    src    += code;
                     pixels += code;
                 } else if (code != -128) { // -127..-1
                     code = (-code) + 1;
-                    c    = bytestream2_get_byte(&s->gb);
-                    bytestream2_set_buffer(&pb, c, code);
+                    if (pixels + code > width) {
+                        av_log(s->avctx, AV_LOG_ERROR,
+                               "Run went out of bounds\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    c = *src++;
+                    horizontal_fill(s->bpp * (s->avctx->pix_fmt == AV_PIX_FMT_PAL8),
+                                    dst, 0, NULL, c, code, pixels);
                     pixels += code;
                 }
             }
+            if (s->fill_order) {
+                int i;
+                for (i = 0; i < width; i++)
+                    dst[i] = ff_reverse[dst[i]];
+            }
             break;
         }
+        if (is_yuv) {
+            unpack_yuv(s, p, dst, strip_start + line);
+            line += s->subsampling[1] - 1;
+        }
+        dst += stride;
     }
     return 0;
 }
 
-static int init_image(TiffContext *s, AVFrame *frame)
+static int init_image(TiffContext *s, ThreadFrame *frame)
 {
     int ret;
+    int create_gray_palette = 0;
 
     // make sure there is no aliasing in the following switch
     if (s->bpp >= 100 || s->bppcount >= 10) {
@@ -262,13 +664,40 @@ static int init_image(TiffContext *s, AVFrame *frame)
 
     switch (s->planar * 1000 + s->bpp * 10 + s->bppcount) {
     case 11:
-        s->avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
+        if (!s->palette_is_set) {
+            s->avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
+            break;
+        }
+    case 21:
+    case 41:
+        s->avctx->pix_fmt = AV_PIX_FMT_PAL8;
+        if (!s->palette_is_set) {
+            create_gray_palette = 1;
+        }
         break;
     case 81:
         s->avctx->pix_fmt = s->palette_is_set ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_GRAY8;
         break;
     case 243:
-        s->avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        if (s->photometric == TIFF_PHOTOMETRIC_YCBCR) {
+            if (s->subsampling[0] == 1 && s->subsampling[1] == 1) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+            } else if (s->subsampling[0] == 2 && s->subsampling[1] == 1) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+            } else if (s->subsampling[0] == 4 && s->subsampling[1] == 1) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV411P;
+            } else if (s->subsampling[0] == 1 && s->subsampling[1] == 2) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV440P;
+            } else if (s->subsampling[0] == 2 && s->subsampling[1] == 2) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+            } else if (s->subsampling[0] == 4 && s->subsampling[1] == 4) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV410P;
+            } else {
+                av_log(s->avctx, AV_LOG_ERROR, "Unsupported YCbCr subsampling\n");
+                return AVERROR_PATCHWELCOME;
+            }
+        } else
+            s->avctx->pix_fmt = AV_PIX_FMT_RGB24;
         break;
     case 161:
         s->avctx->pix_fmt = s->le ? AV_PIX_FMT_GRAY16LE : AV_PIX_FMT_GRAY16BE;
@@ -283,10 +712,10 @@ static int init_image(TiffContext *s, AVFrame *frame)
         s->avctx->pix_fmt = AV_PIX_FMT_RGBA;
         break;
     case 483:
-        s->avctx->pix_fmt = s->le ? AV_PIX_FMT_RGB48LE : AV_PIX_FMT_RGB48BE;
+        s->avctx->pix_fmt = s->le ? AV_PIX_FMT_RGB48LE  : AV_PIX_FMT_RGB48BE;
         break;
     case 644:
-        s->avctx->pix_fmt = s->le ? AV_PIX_FMT_RGBA64LE : AV_PIX_FMT_RGBA64BE;
+        s->avctx->pix_fmt = s->le ? AV_PIX_FMT_RGBA64LE  : AV_PIX_FMT_RGBA64BE;
         break;
     case 1243:
         s->avctx->pix_fmt = AV_PIX_FMT_GBRP;
@@ -306,64 +735,80 @@ static int init_image(TiffContext *s, AVFrame *frame)
                s->bpp, s->bppcount);
         return AVERROR_INVALIDDATA;
     }
+
+    if (s->photometric == TIFF_PHOTOMETRIC_YCBCR) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(s->avctx->pix_fmt);
+        if((desc->flags & AV_PIX_FMT_FLAG_RGB) ||
+           !(desc->flags & AV_PIX_FMT_FLAG_PLANAR) ||
+           desc->nb_components < 3) {
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported YCbCr variant\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     if (s->width != s->avctx->width || s->height != s->avctx->height) {
         ret = ff_set_dimensions(s->avctx, s->width, s->height);
         if (ret < 0)
             return ret;
     }
-    if ((ret = ff_get_buffer(s->avctx, frame, 0)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(s->avctx, frame, 0)) < 0)
         return ret;
-    }
     if (s->avctx->pix_fmt == AV_PIX_FMT_PAL8) {
-        memcpy(frame->data[1], s->palette, sizeof(s->palette));
+        if (!create_gray_palette)
+            memcpy(frame->f->data[1], s->palette, sizeof(s->palette));
+        else {
+            /* make default grayscale pal */
+            int i;
+            uint32_t *pal = (uint32_t *)frame->f->data[1];
+            for (i = 0; i < 1<<s->bpp; i++)
+                pal[i] = 0xFFU << 24 | i * 255 / ((1<<s->bpp) - 1) * 0x010101;
+        }
     }
     return 0;
 }
 
-static int tiff_decode_tag(TiffContext *s)
+static void set_sar(TiffContext *s, unsigned tag, unsigned num, unsigned den)
 {
-    unsigned tag, type, count, off, value = 0;
+    int offset = tag == TIFF_YRES ? 2 : 0;
+    s->res[offset++] = num;
+    s->res[offset]   = den;
+    if (s->res[0] && s->res[1] && s->res[2] && s->res[3])
+        av_reduce(&s->avctx->sample_aspect_ratio.num, &s->avctx->sample_aspect_ratio.den,
+                  s->res[2] * (uint64_t)s->res[1], s->res[0] * (uint64_t)s->res[3], INT32_MAX);
+}
+
+static int tiff_decode_tag(TiffContext *s, AVFrame *frame)
+{
+    unsigned tag, type, count, off, value = 0, value2 = 0;
     int i, start;
+    int pos;
+    int ret;
+    double *dp;
 
-    if (bytestream2_get_bytes_left(&s->gb) < 12)
-        return AVERROR_INVALIDDATA;
-    tag   = tget_short(&s->gb, s->le);
-    type  = tget_short(&s->gb, s->le);
-    count = tget_long(&s->gb, s->le);
-    off   = tget_long(&s->gb, s->le);
-    start = bytestream2_tell(&s->gb);
-
-    if (type == 0 || type >= FF_ARRAY_ELEMS(type_sizes)) {
-        av_log(s->avctx, AV_LOG_DEBUG, "Unknown tiff type (%u) encountered\n",
-               type);
-        return 0;
+    ret = ff_tread_tag(&s->gb, s->le, &tag, &type, &count, &start);
+    if (ret < 0) {
+        goto end;
     }
 
+    off = bytestream2_tell(&s->gb);
     if (count == 1) {
         switch (type) {
         case TIFF_BYTE:
         case TIFF_SHORT:
-            bytestream2_seek(&s->gb, -4, SEEK_CUR);
-            value = tget(&s->gb, type, s->le);
-            break;
         case TIFF_LONG:
-            value = off;
+            value = ff_tget(&s->gb, type, s->le);
+            break;
+        case TIFF_RATIONAL:
+            value  = ff_tget(&s->gb, TIFF_LONG, s->le);
+            value2 = ff_tget(&s->gb, TIFF_LONG, s->le);
             break;
         case TIFF_STRING:
             if (count <= 4) {
-                bytestream2_seek(&s->gb, -4, SEEK_CUR);
                 break;
             }
         default:
             value = UINT_MAX;
-            bytestream2_seek(&s->gb, off, SEEK_SET);
         }
-    } else {
-        if (count <= 4 && type_sizes[type] * count <= 4)
-            bytestream2_seek(&s->gb, -4, SEEK_CUR);
-        else
-            bytestream2_seek(&s->gb, off, SEEK_SET);
     }
 
     switch (tag) {
@@ -374,26 +819,25 @@ static int tiff_decode_tag(TiffContext *s)
         s->height = value;
         break;
     case TIFF_BPP:
-        s->bppcount = count;
-        if (count > 4) {
+        if (count > 4U) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "This format is not supported (bpp=%d, %d components)\n",
-                   s->bpp, count);
+                   value, count);
             return AVERROR_INVALIDDATA;
         }
+        s->bppcount = count;
         if (count == 1)
             s->bpp = value;
         else {
             switch (type) {
             case TIFF_BYTE:
-                s->bpp = (off & 0xFF) + ((off >> 8) & 0xFF) +
-                         ((off >> 16) & 0xFF) + ((off >> 24) & 0xFF);
-                break;
             case TIFF_SHORT:
             case TIFF_LONG:
                 s->bpp = 0;
+                if (bytestream2_get_bytes_left(&s->gb) < type_sizes[type] * count)
+                    return AVERROR_INVALIDDATA;
                 for (i = 0; i < count; i++)
-                    s->bpp += tget(&s->gb, type, s->le);
+                    s->bpp += ff_tget(&s->gb, type, s->le);
                 break;
             default:
                 s->bpp = -1;
@@ -406,6 +850,11 @@ static int tiff_decode_tag(TiffContext *s)
                    "Samples per pixel requires a single value, many provided\n");
             return AVERROR_INVALIDDATA;
         }
+        if (value > 4U) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "Samples per pixel %d is too large\n", value);
+            return AVERROR_INVALIDDATA;
+        }
         if (s->bppcount == 1)
             s->bpp *= value;
         s->bppcount = value;
@@ -436,8 +885,12 @@ static int tiff_decode_tag(TiffContext *s)
             avpriv_report_missing_feature(s->avctx, "JPEG compression");
             return AVERROR_PATCHWELCOME;
         case TIFF_LZMA:
-            avpriv_report_missing_feature(s->avctx, "LZMA compression");
-            return AVERROR_PATCHWELCOME;
+#if CONFIG_LZMA
+            break;
+#else
+            av_log(s->avctx, AV_LOG_ERROR, "LZMA not compiled in\n");
+            return AVERROR(ENOSYS);
+#endif
         default:
             av_log(s->avctx, AV_LOG_ERROR, "Unknown compression method %i\n",
                    s->compr);
@@ -471,6 +924,17 @@ static int tiff_decode_tag(TiffContext *s)
         s->strips = count;
         s->sstype = type;
         break;
+    case TIFF_XRES:
+    case TIFF_YRES:
+        set_sar(s, tag, value, value2);
+        break;
+    case TIFF_TILE_BYTE_COUNTS:
+    case TIFF_TILE_LENGTH:
+    case TIFF_TILE_OFFSETS:
+    case TIFF_TILE_WIDTH:
+        av_log(s->avctx, AV_LOG_ERROR, "Tiled images are not supported\n");
+        return AVERROR_PATCHWELCOME;
+        break;
     case TIFF_PREDICTOR:
         s->predictor = value;
         break;
@@ -480,11 +944,11 @@ static int tiff_decode_tag(TiffContext *s)
         case TIFF_PHOTOMETRIC_BLACK_IS_ZERO:
         case TIFF_PHOTOMETRIC_RGB:
         case TIFF_PHOTOMETRIC_PALETTE:
+        case TIFF_PHOTOMETRIC_YCBCR:
             s->photometric = value;
             break;
         case TIFF_PHOTOMETRIC_ALPHA_MASK:
         case TIFF_PHOTOMETRIC_SEPARATED:
-        case TIFF_PHOTOMETRIC_YCBCR:
         case TIFF_PHOTOMETRIC_CIE_LAB:
         case TIFF_PHOTOMETRIC_ICC_LAB:
         case TIFF_PHOTOMETRIC_ITU_LAB:
@@ -516,15 +980,17 @@ static int tiff_decode_tag(TiffContext *s)
         if (count / 3 > 256 ||
             bytestream2_get_bytes_left(&s->gb) < count / 3 * off * 3)
             return AVERROR_INVALIDDATA;
+
         pal_gb[0] = pal_gb[1] = pal_gb[2] = s->gb;
         bytestream2_skip(&pal_gb[1], count / 3 * off);
         bytestream2_skip(&pal_gb[2], count / 3 * off * 2);
+
         off = (type_sizes[type] - 1) << 3;
         for (i = 0; i < count / 3; i++) {
             uint32_t p = 0xFF000000;
-            p |= (tget(&pal_gb[0], type, s->le) >> off) << 16;
-            p |= (tget(&pal_gb[1], type, s->le) >> off) << 8;
-            p |=  tget(&pal_gb[2], type, s->le) >> off;
+            p |= (ff_tget(&pal_gb[0], type, s->le) >> off) << 16;
+            p |= (ff_tget(&pal_gb[1], type, s->le) >> off) << 8;
+            p |=  ff_tget(&pal_gb[2], type, s->le) >> off;
             s->palette[i] = p;
         }
         s->palette_is_set = 1;
@@ -533,6 +999,14 @@ static int tiff_decode_tag(TiffContext *s)
     case TIFF_PLANAR:
         s->planar = value == 2;
         break;
+    case TIFF_YCBCR_SUBSAMPLING:
+        if (count != 2) {
+            av_log(s->avctx, AV_LOG_ERROR, "subsample count invalid\n");
+            return AVERROR_INVALIDDATA;
+        }
+        for (i = 0; i < count; i++)
+            s->subsampling[i] = ff_tget(&s->gb, type, s->le);
+        break;
     case TIFF_T4OPTIONS:
         if (s->compr == TIFF_G3)
             s->fax_opts = value;
@@ -541,6 +1015,137 @@ static int tiff_decode_tag(TiffContext *s)
         if (s->compr == TIFF_G4)
             s->fax_opts = value;
         break;
+#define ADD_METADATA(count, name, sep)\
+    if ((ret = add_metadata(count, type, name, sep, s, frame)) < 0) {\
+        av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");\
+        goto end;\
+    }
+    case TIFF_MODEL_PIXEL_SCALE:
+        ADD_METADATA(count, "ModelPixelScaleTag", NULL);
+        break;
+    case TIFF_MODEL_TRANSFORMATION:
+        ADD_METADATA(count, "ModelTransformationTag", NULL);
+        break;
+    case TIFF_MODEL_TIEPOINT:
+        ADD_METADATA(count, "ModelTiepointTag", NULL);
+        break;
+    case TIFF_GEO_KEY_DIRECTORY:
+        ADD_METADATA(1, "GeoTIFF_Version", NULL);
+        ADD_METADATA(2, "GeoTIFF_Key_Revision", ".");
+        s->geotag_count   = ff_tget_short(&s->gb, s->le);
+        if (s->geotag_count > count / 4 - 1) {
+            s->geotag_count = count / 4 - 1;
+            av_log(s->avctx, AV_LOG_WARNING, "GeoTIFF key directory buffer shorter than specified\n");
+        }
+        if (bytestream2_get_bytes_left(&s->gb) < s->geotag_count * sizeof(int16_t) * 4) {
+            s->geotag_count = 0;
+            return -1;
+        }
+        s->geotags = av_mallocz_array(s->geotag_count, sizeof(TiffGeoTag));
+        if (!s->geotags) {
+            av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+            s->geotag_count = 0;
+            goto end;
+        }
+        for (i = 0; i < s->geotag_count; i++) {
+            s->geotags[i].key    = ff_tget_short(&s->gb, s->le);
+            s->geotags[i].type   = ff_tget_short(&s->gb, s->le);
+            s->geotags[i].count  = ff_tget_short(&s->gb, s->le);
+
+            if (!s->geotags[i].type)
+                s->geotags[i].val  = get_geokey_val(s->geotags[i].key, ff_tget_short(&s->gb, s->le));
+            else
+                s->geotags[i].offset = ff_tget_short(&s->gb, s->le);
+        }
+        break;
+    case TIFF_GEO_DOUBLE_PARAMS:
+        if (count >= INT_MAX / sizeof(int64_t))
+            return AVERROR_INVALIDDATA;
+        if (bytestream2_get_bytes_left(&s->gb) < count * sizeof(int64_t))
+            return AVERROR_INVALIDDATA;
+        dp = av_malloc_array(count, sizeof(double));
+        if (!dp) {
+            av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+            goto end;
+        }
+        for (i = 0; i < count; i++)
+            dp[i] = ff_tget_double(&s->gb, s->le);
+        for (i = 0; i < s->geotag_count; i++) {
+            if (s->geotags[i].type == TIFF_GEO_DOUBLE_PARAMS) {
+                if (s->geotags[i].count == 0
+                    || s->geotags[i].offset + s->geotags[i].count > count) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Invalid GeoTIFF key %d\n", s->geotags[i].key);
+                } else {
+                    char *ap = doubles2str(&dp[s->geotags[i].offset], s->geotags[i].count, ", ");
+                    if (!ap) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+                        av_freep(&dp);
+                        return AVERROR(ENOMEM);
+                    }
+                    s->geotags[i].val = ap;
+                }
+            }
+        }
+        av_freep(&dp);
+        break;
+    case TIFF_GEO_ASCII_PARAMS:
+        pos = bytestream2_tell(&s->gb);
+        for (i = 0; i < s->geotag_count; i++) {
+            if (s->geotags[i].type == TIFF_GEO_ASCII_PARAMS) {
+                if (s->geotags[i].count == 0
+                    || s->geotags[i].offset +  s->geotags[i].count > count) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Invalid GeoTIFF key %d\n", s->geotags[i].key);
+                } else {
+                    char *ap;
+
+                    bytestream2_seek(&s->gb, pos + s->geotags[i].offset, SEEK_SET);
+                    if (bytestream2_get_bytes_left(&s->gb) < s->geotags[i].count)
+                        return AVERROR_INVALIDDATA;
+                    ap = av_malloc(s->geotags[i].count);
+                    if (!ap) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+                        return AVERROR(ENOMEM);
+                    }
+                    bytestream2_get_bufferu(&s->gb, ap, s->geotags[i].count);
+                    ap[s->geotags[i].count - 1] = '\0'; //replace the "|" delimiter with a 0 byte
+                    s->geotags[i].val = ap;
+                }
+            }
+        }
+        break;
+    case TIFF_ARTIST:
+        ADD_METADATA(count, "artist", NULL);
+        break;
+    case TIFF_COPYRIGHT:
+        ADD_METADATA(count, "copyright", NULL);
+        break;
+    case TIFF_DATE:
+        ADD_METADATA(count, "date", NULL);
+        break;
+    case TIFF_DOCUMENT_NAME:
+        ADD_METADATA(count, "document_name", NULL);
+        break;
+    case TIFF_HOST_COMPUTER:
+        ADD_METADATA(count, "computer", NULL);
+        break;
+    case TIFF_IMAGE_DESCRIPTION:
+        ADD_METADATA(count, "description", NULL);
+        break;
+    case TIFF_MAKE:
+        ADD_METADATA(count, "make", NULL);
+        break;
+    case TIFF_MODEL:
+        ADD_METADATA(count, "model", NULL);
+        break;
+    case TIFF_PAGE_NAME:
+        ADD_METADATA(count, "page_name", NULL);
+        break;
+    case TIFF_PAGE_NUMBER:
+        ADD_METADATA(count, "page_number", " / ");
+        break;
+    case TIFF_SOFTWARE_NAME:
+        ADD_METADATA(count, "software", NULL);
+        break;
     default:
         if (s->avctx->err_recognition & AV_EF_EXPLODE) {
             av_log(s->avctx, AV_LOG_ERROR,
@@ -549,6 +1154,14 @@ static int tiff_decode_tag(TiffContext *s)
             return AVERROR_INVALIDDATA;
         }
     }
+end:
+    if (s->bpp > 64U) {
+        av_log(s->avctx, AV_LOG_ERROR,
+                "This format is not supported (bpp=%d, %d components)\n",
+                s->bpp, count);
+        s->bpp = 0;
+        return AVERROR_INVALIDDATA;
+    }
     bytestream2_seek(&s->gb, start, SEEK_SET);
     return 0;
 }
@@ -558,8 +1171,9 @@ static int decode_frame(AVCodecContext *avctx,
 {
     TiffContext *const s = avctx->priv_data;
     AVFrame *const p = data;
+    ThreadFrame frame = { .f = data };
     unsigned off;
-    int id, le, ret, plane, planes;
+    int le, ret, plane, planes;
     int i, j, entries, stride;
     unsigned soff, ssize;
     uint8_t *dst;
@@ -569,48 +1183,56 @@ static int decode_frame(AVCodecContext *avctx,
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
     // parse image header
-    if (avpkt->size < 8)
-        return AVERROR_INVALIDDATA;
-    id = bytestream2_get_le16(&s->gb);
-    if (id == 0x4949)
-        le = 1;
-    else if (id == 0x4D4D)
-        le = 0;
-    else {
-        av_log(avctx, AV_LOG_ERROR, "TIFF header not found\n");
+    if ((ret = ff_tdecode_header(&s->gb, &le, &off))) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid TIFF header\n");
+        return ret;
+    } else if (off >= UINT_MAX - 14 || avpkt->size < off + 14) {
+        av_log(avctx, AV_LOG_ERROR, "IFD offset is greater than image size\n");
         return AVERROR_INVALIDDATA;
     }
     s->le          = le;
+    // TIFF_BPP is not a required tag and defaults to 1
+    s->bppcount    = s->bpp = 1;
     s->photometric = TIFF_PHOTOMETRIC_NONE;
     s->compr       = TIFF_RAW;
     s->fill_order  = 0;
-    // As TIFF 6.0 specification puts it "An arbitrary but carefully chosen number
-    // that further identifies the file as a TIFF file"
-    if (tget_short(&s->gb, le) != 42) {
-        av_log(avctx, AV_LOG_ERROR,
-               "The answer to life, universe and everything is not correct!\n");
-        return AVERROR_INVALIDDATA;
-    }
+    free_geotags(s);
+
     // Reset these offsets so we can tell if they were set this frame
     s->stripsizesoff = s->strippos = 0;
     /* parse image file directory */
-    off = tget_long(&s->gb, le);
-    if (off >= UINT_MAX - 14 || avpkt->size < off + 14) {
-        av_log(avctx, AV_LOG_ERROR, "IFD offset is greater than image size\n");
-        return AVERROR_INVALIDDATA;
-    }
     bytestream2_seek(&s->gb, off, SEEK_SET);
-    entries = tget_short(&s->gb, le);
+    entries = ff_tget_short(&s->gb, le);
+    if (bytestream2_get_bytes_left(&s->gb) < entries * 12)
+        return AVERROR_INVALIDDATA;
     for (i = 0; i < entries; i++) {
-        if ((ret = tiff_decode_tag(s)) < 0)
+        if ((ret = tiff_decode_tag(s, p)) < 0)
+            return ret;
+    }
+
+    for (i = 0; i<s->geotag_count; i++) {
+        const char *keyname = get_geokey_name(s->geotags[i].key);
+        if (!keyname) {
+            av_log(avctx, AV_LOG_WARNING, "Unknown or unsupported GeoTIFF key %d\n", s->geotags[i].key);
+            continue;
+        }
+        if (get_geokey_type(s->geotags[i].key) != s->geotags[i].type) {
+            av_log(avctx, AV_LOG_WARNING, "Type of GeoTIFF key %d is wrong\n", s->geotags[i].key);
+            continue;
+        }
+        ret = av_dict_set(avpriv_frame_get_metadatap(p), keyname, s->geotags[i].val, 0);
+        if (ret<0) {
+            av_log(avctx, AV_LOG_ERROR, "Writing metadata with key '%s' failed\n", keyname);
             return ret;
+        }
     }
+
     if (!s->strippos && !s->stripoff) {
         av_log(avctx, AV_LOG_ERROR, "Image data is missing\n");
         return AVERROR_INVALIDDATA;
     }
     /* now we have the data and may start decoding */
-    if ((ret = init_image(s, p)) < 0)
+    if ((ret = init_image(s, &frame)) < 0)
         return ret;
 
     if (s->strips == 1 && !s->stripsize) {
@@ -619,30 +1241,35 @@ static int decode_frame(AVCodecContext *avctx,
     }
 
     if (s->stripsizesoff) {
-        if (s->stripsizesoff >= avpkt->size)
+        if (s->stripsizesoff >= (unsigned)avpkt->size)
             return AVERROR_INVALIDDATA;
         bytestream2_init(&stripsizes, avpkt->data + s->stripsizesoff,
                          avpkt->size - s->stripsizesoff);
     }
     if (s->strippos) {
-        if (s->strippos >= avpkt->size)
+        if (s->strippos >= (unsigned)avpkt->size)
             return AVERROR_INVALIDDATA;
         bytestream2_init(&stripdata, avpkt->data + s->strippos,
                          avpkt->size - s->strippos);
     }
 
+    if (s->rps <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "rps %d invalid\n", s->rps);
+        return AVERROR_INVALIDDATA;
+    }
+
     planes = s->planar ? s->bppcount : 1;
     for (plane = 0; plane < planes; plane++) {
         stride = p->linesize[plane];
         dst = p->data[plane];
         for (i = 0; i < s->height; i += s->rps) {
             if (s->stripsizesoff)
-                ssize = tget(&stripsizes, s->sstype, le);
+                ssize = ff_tget(&stripsizes, s->sstype, le);
             else
                 ssize = s->stripsize;
 
             if (s->strippos)
-                soff = tget(&stripdata, s->sot, le);
+                soff = ff_tget(&stripdata, s->sot, le);
             else
                 soff = s->stripoff;
 
@@ -650,7 +1277,7 @@ static int decode_frame(AVCodecContext *avctx,
                 av_log(avctx, AV_LOG_ERROR, "Invalid strip size/offset\n");
                 return AVERROR_INVALIDDATA;
             }
-            if ((ret = tiff_unpack_strip(s, dst, stride, avpkt->data + soff, ssize,
+            if ((ret = tiff_unpack_strip(s, p, dst, stride, avpkt->data + soff, ssize, i,
                                          FFMIN(s->rps, s->height - i))) < 0) {
                 if (avctx->err_recognition & AV_EF_EXPLODE)
                     return ret;
@@ -659,18 +1286,32 @@ static int decode_frame(AVCodecContext *avctx,
             dst += s->rps * stride;
         }
         if (s->predictor == 2) {
+            if (s->photometric == TIFF_PHOTOMETRIC_YCBCR) {
+                av_log(s->avctx, AV_LOG_ERROR, "predictor == 2 with YUV is unsupported");
+                return AVERROR_PATCHWELCOME;
+            }
             dst   = p->data[plane];
             soff  = s->bpp >> 3;
+            if (s->planar)
+                soff  = FFMAX(soff / s->bppcount, 1);
             ssize = s->width * soff;
             if (s->avctx->pix_fmt == AV_PIX_FMT_RGB48LE ||
-                s->avctx->pix_fmt == AV_PIX_FMT_RGBA64LE) {
+                s->avctx->pix_fmt == AV_PIX_FMT_RGBA64LE ||
+                s->avctx->pix_fmt == AV_PIX_FMT_GRAY16LE ||
+                s->avctx->pix_fmt == AV_PIX_FMT_YA16LE ||
+                s->avctx->pix_fmt == AV_PIX_FMT_GBRP16LE ||
+                s->avctx->pix_fmt == AV_PIX_FMT_GBRAP16LE) {
                 for (i = 0; i < s->height; i++) {
                     for (j = soff; j < ssize; j += 2)
                         AV_WL16(dst + j, AV_RL16(dst + j) + AV_RL16(dst + j - soff));
                     dst += stride;
                 }
             } else if (s->avctx->pix_fmt == AV_PIX_FMT_RGB48BE ||
-                       s->avctx->pix_fmt == AV_PIX_FMT_RGBA64BE) {
+                       s->avctx->pix_fmt == AV_PIX_FMT_RGBA64BE ||
+                       s->avctx->pix_fmt == AV_PIX_FMT_GRAY16BE ||
+                       s->avctx->pix_fmt == AV_PIX_FMT_YA16BE ||
+                       s->avctx->pix_fmt == AV_PIX_FMT_GBRP16BE ||
+                       s->avctx->pix_fmt == AV_PIX_FMT_GBRAP16BE) {
                 for (i = 0; i < s->height; i++) {
                     for (j = soff; j < ssize; j += 2)
                         AV_WB16(dst + j, AV_RB16(dst + j) + AV_RB16(dst + j - soff));
@@ -689,7 +1330,7 @@ static int decode_frame(AVCodecContext *avctx,
             dst = p->data[plane];
             for (i = 0; i < s->height; i++) {
                 for (j = 0; j < stride; j++)
-                    dst[j] = 255 - dst[j];
+                    dst[j] = (s->avctx->pix_fmt == AV_PIX_FMT_PAL8 ? (1<<s->bpp) - 1 : 255) - dst[j];
                 dst += stride;
             }
         }
@@ -713,6 +1354,8 @@ static av_cold int tiff_init(AVCodecContext *avctx)
 
     s->width  = 0;
     s->height = 0;
+    s->subsampling[0] =
+    s->subsampling[1] = 1;
     s->avctx  = avctx;
     ff_lzw_decode_open(&s->lzw);
     ff_ccitt_unpack_init();
@@ -724,7 +1367,10 @@ static av_cold int tiff_end(AVCodecContext *avctx)
 {
     TiffContext *const s = avctx->priv_data;
 
+    free_geotags(s);
+
     ff_lzw_decode_close(&s->lzw);
+    av_freep(&s->deinvert_buf);
     return 0;
 }
 
@@ -737,5 +1383,6 @@ AVCodec ff_tiff_decoder = {
     .init           = tiff_init,
     .close          = tiff_end,
     .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(tiff_init),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
 };
diff --git a/libavcodec/tiff.h b/libavcodec/tiff.h
index 68ac695937..3f692afa00 100644
--- a/libavcodec/tiff.h
+++ b/libavcodec/tiff.h
@@ -1,27 +1,29 @@
 /*
- * TIFF tables
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * TIFF tables
+ *
+ * For more information about the TIFF format, check the official docs at:
+ * http://partners.adobe.com/public/developer/tiff/index.html
  * @author Konstantin Shishkov
  */
 
@@ -29,6 +31,7 @@
 #define AVCODEC_TIFF_H
 
 #include <stdint.h>
+#include "tiff_common.h"
 
 /** abridged list of TIFF tags */
 enum TiffTags {
@@ -39,6 +42,10 @@ enum TiffTags {
     TIFF_COMPR,
     TIFF_PHOTOMETRIC        = 0x106,
     TIFF_FILL_ORDER         = 0x10A,
+    TIFF_DOCUMENT_NAME      = 0x10D,
+    TIFF_IMAGE_DESCRIPTION  = 0x10E,
+    TIFF_MAKE               = 0x10F,
+    TIFF_MODEL              = 0x110,
     TIFF_STRIP_OFFS         = 0x111,
     TIFF_SAMPLES_PER_PIXEL  = 0x115,
     TIFF_ROWSPERSTRIP       = 0x116,
@@ -46,18 +53,35 @@ enum TiffTags {
     TIFF_XRES               = 0x11A,
     TIFF_YRES               = 0x11B,
     TIFF_PLANAR             = 0x11C,
+    TIFF_PAGE_NAME          = 0x11D,
     TIFF_XPOS               = 0x11E,
     TIFF_YPOS               = 0x11F,
     TIFF_T4OPTIONS          = 0x124,
     TIFF_T6OPTIONS,
     TIFF_RES_UNIT           = 0x128,
+    TIFF_PAGE_NUMBER        = 0x129,
     TIFF_SOFTWARE_NAME      = 0x131,
+    TIFF_DATE               = 0x132,
+    TIFF_ARTIST             = 0x13B,
+    TIFF_HOST_COMPUTER      = 0x13C,
     TIFF_PREDICTOR          = 0x13D,
     TIFF_PAL                = 0x140,
+    TIFF_TILE_WIDTH         = 0x142,
+    TIFF_TILE_LENGTH        = 0x143,
+    TIFF_TILE_OFFSETS       = 0x144,
+    TIFF_TILE_BYTE_COUNTS   = 0x145,
+    TIFF_EXTRASAMPLES       = 0x152,
     TIFF_YCBCR_COEFFICIENTS = 0x211,
     TIFF_YCBCR_SUBSAMPLING  = 0x212,
     TIFF_YCBCR_POSITIONING  = 0x213,
     TIFF_REFERENCE_BW       = 0x214,
+    TIFF_COPYRIGHT          = 0x8298,
+    TIFF_MODEL_TIEPOINT     = 0x8482,
+    TIFF_MODEL_PIXEL_SCALE  = 0x830E,
+    TIFF_MODEL_TRANSFORMATION= 0x8480,
+    TIFF_GEO_KEY_DIRECTORY  = 0x87AF,
+    TIFF_GEO_DOUBLE_PARAMS  = 0x87B0,
+    TIFF_GEO_ASCII_PARAMS   = 0x87B1
 };
 
 /** list of TIFF compression types */
@@ -75,12 +99,52 @@ enum TiffCompr {
     TIFF_LZMA     = 0x886D,
 };
 
-enum TiffTypes {
-    TIFF_BYTE = 1,
-    TIFF_STRING,
-    TIFF_SHORT,
-    TIFF_LONG,
-    TIFF_RATIONAL,
+enum TiffGeoTagKey {
+    TIFF_GT_MODEL_TYPE_GEOKEY                = 1024,
+    TIFF_GT_RASTER_TYPE_GEOKEY               = 1025,
+    TIFF_GT_CITATION_GEOKEY                  = 1026,
+    TIFF_GEOGRAPHIC_TYPE_GEOKEY              = 2048,
+    TIFF_GEOG_CITATION_GEOKEY                = 2049,
+    TIFF_GEOG_GEODETIC_DATUM_GEOKEY          = 2050,
+    TIFF_GEOG_PRIME_MERIDIAN_GEOKEY          = 2051,
+    TIFF_GEOG_LINEAR_UNITS_GEOKEY            = 2052,
+    TIFF_GEOG_LINEAR_UNIT_SIZE_GEOKEY        = 2053,
+    TIFF_GEOG_ANGULAR_UNITS_GEOKEY           = 2054,
+    TIFF_GEOG_ANGULAR_UNIT_SIZE_GEOKEY       = 2055,
+    TIFF_GEOG_ELLIPSOID_GEOKEY               = 2056,
+    TIFF_GEOG_SEMI_MAJOR_AXIS_GEOKEY         = 2057,
+    TIFF_GEOG_SEMI_MINOR_AXIS_GEOKEY         = 2058,
+    TIFF_GEOG_INV_FLATTENING_GEOKEY          = 2059,
+    TIFF_GEOG_AZIMUTH_UNITS_GEOKEY           = 2060,
+    TIFF_GEOG_PRIME_MERIDIAN_LONG_GEOKEY     = 2061,
+    TIFF_PROJECTED_CS_TYPE_GEOKEY            = 3072,
+    TIFF_PCS_CITATION_GEOKEY                 = 3073,
+    TIFF_PROJECTION_GEOKEY                   = 3074,
+    TIFF_PROJ_COORD_TRANS_GEOKEY             = 3075,
+    TIFF_PROJ_LINEAR_UNITS_GEOKEY            = 3076,
+    TIFF_PROJ_LINEAR_UNIT_SIZE_GEOKEY        = 3077,
+    TIFF_PROJ_STD_PARALLEL1_GEOKEY           = 3078,
+    TIFF_PROJ_STD_PARALLEL2_GEOKEY           = 3079,
+    TIFF_PROJ_NAT_ORIGIN_LONG_GEOKEY         = 3080,
+    TIFF_PROJ_NAT_ORIGIN_LAT_GEOKEY          = 3081,
+    TIFF_PROJ_FALSE_EASTING_GEOKEY           = 3082,
+    TIFF_PROJ_FALSE_NORTHING_GEOKEY          = 3083,
+    TIFF_PROJ_FALSE_ORIGIN_LONG_GEOKEY       = 3084,
+    TIFF_PROJ_FALSE_ORIGIN_LAT_GEOKEY        = 3085,
+    TIFF_PROJ_FALSE_ORIGIN_EASTING_GEOKEY    = 3086,
+    TIFF_PROJ_FALSE_ORIGIN_NORTHING_GEOKEY   = 3087,
+    TIFF_PROJ_CENTER_LONG_GEOKEY             = 3088,
+    TIFF_PROJ_CENTER_LAT_GEOKEY              = 3089,
+    TIFF_PROJ_CENTER_EASTING_GEOKEY          = 3090,
+    TIFF_PROJ_CENTER_NORTHING_GEOKEY         = 3091,
+    TIFF_PROJ_SCALE_AT_NAT_ORIGIN_GEOKEY     = 3092,
+    TIFF_PROJ_SCALE_AT_CENTER_GEOKEY         = 3093,
+    TIFF_PROJ_AZIMUTH_ANGLE_GEOKEY           = 3094,
+    TIFF_PROJ_STRAIGHT_VERT_POLE_LONG_GEOKEY = 3095,
+    TIFF_VERTICAL_CS_TYPE_GEOKEY             = 4096,
+    TIFF_VERTICAL_CITATION_GEOKEY            = 4097,
+    TIFF_VERTICAL_DATUM_GEOKEY               = 4098,
+    TIFF_VERTICAL_UNITS_GEOKEY               = 4099
 };
 
 enum TiffPhotometric {
@@ -101,9 +165,28 @@ enum TiffPhotometric {
     TIFF_PHOTOMETRIC_LINEAR_RAW = 34892, /* Linear Raw (DNG) */
 };
 
-/** sizes of various TIFF field types (string size = 100)*/
-static const uint8_t type_sizes[6] = {
-    0, 1, 100, 2, 4, 8
+enum TiffGeoTagType {
+    GEOTIFF_SHORT  = 0,
+    GEOTIFF_DOUBLE = 34736,
+    GEOTIFF_STRING = 34737
 };
 
+typedef struct TiffGeoTag {
+    enum TiffGeoTagKey key;
+    enum TiffTags type;
+    int count;
+    int offset;
+    char *val;
+} TiffGeoTag;
+
+typedef struct TiffGeoTagKeyName {
+    const enum TiffGeoTagKey key;
+    const char *const name;
+} TiffGeoTagKeyName;
+
+typedef struct TiffGeoTagNameType {
+    const char *const name;
+    const enum TiffGeoTagType type;
+} TiffGeoTagNameType;
+
 #endif /* AVCODEC_TIFF_H */
diff --git a/libavcodec/tiff_common.c b/libavcodec/tiff_common.c
new file mode 100644
index 0000000000..35119af558
--- /dev/null
+++ b/libavcodec/tiff_common.c
@@ -0,0 +1,313 @@
+/*
+ * TIFF Common Routines
+ * Copyright (c) 2013 Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * TIFF Common Routines
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ */
+
+#include "tiff_common.h"
+
+
+int ff_tis_ifd(unsigned tag)
+{
+    int i;
+    for (i = 0; i < FF_ARRAY_ELEMS(ifd_tags); i++) {
+        if (ifd_tags[i] == tag) {
+            return i + 1;
+        }
+    }
+    return 0;
+}
+
+
+unsigned ff_tget_short(GetByteContext *gb, int le)
+{
+    return le ? bytestream2_get_le16(gb) : bytestream2_get_be16(gb);
+}
+
+
+unsigned ff_tget_long(GetByteContext *gb, int le)
+{
+    return le ? bytestream2_get_le32(gb) : bytestream2_get_be32(gb);
+}
+
+
+double ff_tget_double(GetByteContext *gb, int le)
+{
+    av_alias64 i = { .u64 = le ? bytestream2_get_le64(gb) : bytestream2_get_be64(gb)};
+    return i.f64;
+}
+
+
+unsigned ff_tget(GetByteContext *gb, int type, int le)
+{
+    switch (type) {
+    case TIFF_BYTE:  return bytestream2_get_byte(gb);
+    case TIFF_SHORT: return ff_tget_short(gb, le);
+    case TIFF_LONG:  return ff_tget_long(gb, le);
+    default:         return UINT_MAX;
+    }
+}
+
+static const char *auto_sep(int count, const char *sep, int i, int columns)
+{
+    if (sep)
+        return i ? sep : "";
+    if (i && i%columns) {
+        return ", ";
+    } else
+        return columns < count ? "\n" : "";
+}
+
+int ff_tadd_rational_metadata(int count, const char *name, const char *sep,
+                              GetByteContext *gb, int le, AVDictionary **metadata)
+{
+    AVBPrint bp;
+    char *ap;
+    int32_t nom, denom;
+    int i;
+
+    if (count >= INT_MAX / sizeof(int64_t) || count <= 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int64_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (i = 0; i < count; i++) {
+        nom   = ff_tget_long(gb, le);
+        denom = ff_tget_long(gb, le);
+        av_bprintf(&bp, "%s%7i:%-7i", auto_sep(count, sep, i, 4), nom, denom);
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap))) {
+        return i;
+    }
+    if (!ap) {
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+
+    return 0;
+}
+
+
+int ff_tadd_long_metadata(int count, const char *name, const char *sep,
+                          GetByteContext *gb, int le, AVDictionary **metadata)
+{
+    AVBPrint bp;
+    char *ap;
+    int i;
+
+    if (count >= INT_MAX / sizeof(int32_t) || count <= 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int32_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (i = 0; i < count; i++) {
+        av_bprintf(&bp, "%s%7i", auto_sep(count, sep, i, 8), ff_tget_long(gb, le));
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap))) {
+        return i;
+    }
+    if (!ap) {
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+
+    return 0;
+}
+
+
+int ff_tadd_doubles_metadata(int count, const char *name, const char *sep,
+                             GetByteContext *gb, int le, AVDictionary **metadata)
+{
+    AVBPrint bp;
+    char *ap;
+    int i;
+
+    if (count >= INT_MAX / sizeof(int64_t) || count <= 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int64_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, 100 * count);
+
+    for (i = 0; i < count; i++) {
+        av_bprintf(&bp, "%s%.15g", auto_sep(count, sep, i, 4), ff_tget_double(gb, le));
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap))) {
+        return i;
+    }
+    if (!ap) {
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+
+    return 0;
+}
+
+
+int ff_tadd_shorts_metadata(int count, const char *name, const char *sep,
+                            GetByteContext *gb, int le, int is_signed, AVDictionary **metadata)
+{
+    AVBPrint bp;
+    char *ap;
+    int i;
+
+    if (count >= INT_MAX / sizeof(int16_t) || count <= 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int16_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (i = 0; i < count; i++) {
+        int v = is_signed ? (int16_t)ff_tget_short(gb, le) :  ff_tget_short(gb, le);
+        av_bprintf(&bp, "%s%5i", auto_sep(count, sep, i, 8), v);
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap))) {
+        return i;
+    }
+    if (!ap) {
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+
+    return 0;
+}
+
+
+int ff_tadd_bytes_metadata(int count, const char *name, const char *sep,
+                           GetByteContext *gb, int le, int is_signed, AVDictionary **metadata)
+{
+    AVBPrint bp;
+    char *ap;
+    int i;
+
+    if (count >= INT_MAX / sizeof(int8_t) || count < 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int8_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (i = 0; i < count; i++) {
+        int v = is_signed ? (int8_t)bytestream2_get_byte(gb) :  bytestream2_get_byte(gb);
+        av_bprintf(&bp, "%s%3i", auto_sep(count, sep, i, 16), v);
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap))) {
+        return i;
+    }
+    if (!ap) {
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+
+    return 0;
+}
+
+int ff_tadd_string_metadata(int count, const char *name,
+                            GetByteContext *gb, int le, AVDictionary **metadata)
+{
+    char *value;
+
+    if (bytestream2_get_bytes_left(gb) < count || count < 0)
+        return AVERROR_INVALIDDATA;
+
+    value = av_malloc(count + 1);
+    if (!value)
+        return AVERROR(ENOMEM);
+
+    bytestream2_get_bufferu(gb, value, count);
+    value[count] = 0;
+
+    av_dict_set(metadata, name, value, AV_DICT_DONT_STRDUP_VAL);
+    return 0;
+}
+
+
+int ff_tdecode_header(GetByteContext *gb, int *le, int *ifd_offset)
+{
+    if (bytestream2_get_bytes_left(gb) < 8) {
+        return AVERROR_INVALIDDATA;
+    }
+
+    *le = bytestream2_get_le16u(gb);
+    if (*le == AV_RB16("II")) {
+        *le = 1;
+    } else if (*le == AV_RB16("MM")) {
+        *le = 0;
+    } else {
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (ff_tget_short(gb, *le) != 42) {
+        return AVERROR_INVALIDDATA;
+    }
+
+    *ifd_offset = ff_tget_long(gb, *le);
+
+    return 0;
+}
+
+
+int ff_tread_tag(GetByteContext *gb, int le, unsigned *tag, unsigned *type,
+                 unsigned *count, int *next)
+{
+    int ifd_tag;
+    int valid_type;
+
+    *tag    = ff_tget_short(gb, le);
+    *type   = ff_tget_short(gb, le);
+    *count  = ff_tget_long (gb, le);
+
+    ifd_tag    = ff_tis_ifd(*tag);
+    valid_type = *type != 0 && *type < FF_ARRAY_ELEMS(type_sizes);
+
+    *next = bytestream2_tell(gb) + 4;
+
+    // check for valid type
+    if (!valid_type) {
+        return AVERROR_INVALIDDATA;
+    }
+
+    // seek to offset if this is an IFD-tag or
+    // if count values do not fit into the offset value
+    if (ifd_tag || (*count > 4 || !(type_sizes[*type] * (*count) <= 4 || *type == TIFF_STRING))) {
+        bytestream2_seek(gb, ff_tget_long (gb, le), SEEK_SET);
+    }
+
+    return 0;
+}
diff --git a/libavcodec/tiff_common.h b/libavcodec/tiff_common.h
new file mode 100644
index 0000000000..03558c31a3
--- /dev/null
+++ b/libavcodec/tiff_common.h
@@ -0,0 +1,152 @@
+/*
+ * TIFF Common Routines
+ * Copyright (c) 2013 Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * TIFF Common Routines
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ */
+
+#ifndef AVCODEC_TIFF_COMMON_H
+#define AVCODEC_TIFF_COMMON_H
+
+#include "avcodec.h"
+#include "tiff.h"
+#include "bytestream.h"
+#include "libavutil/bprint.h"
+
+/** data type identifiers for TIFF tags */
+enum TiffTypes {
+    TIFF_BYTE = 1,
+    TIFF_STRING,
+    TIFF_SHORT,
+    TIFF_LONG,
+    TIFF_RATIONAL,
+    TIFF_SBYTE,
+    TIFF_UNDEFINED,
+    TIFF_SSHORT,
+    TIFF_SLONG,
+    TIFF_SRATIONAL,
+    TIFF_FLOAT,
+    TIFF_DOUBLE,
+    TIFF_IFD
+};
+
+/** sizes of various TIFF field types (string size = 100)*/
+static const uint8_t type_sizes[14] = {
+    0, 1, 100, 2, 4, 8, 1, 1, 2, 4, 8, 4, 8, 4
+};
+
+static const uint16_t ifd_tags[] = {
+    0x8769, // EXIF IFD
+    0x8825, // GPS IFD
+    0xA005  // Interoperability IFD
+};
+
+
+/** Returns a value > 0 if the tag is a known IFD-tag.
+ *  The return value is the array index + 1 within ifd_tags[].
+ */
+int ff_tis_ifd(unsigned tag);
+
+/** Reads a short from the bytestream using given endianness. */
+unsigned ff_tget_short(GetByteContext *gb, int le);
+
+/** Reads a long from the bytestream using given endianness. */
+unsigned ff_tget_long(GetByteContext *gb, int le);
+
+/** Reads a double from the bytestream using given endianness. */
+double   ff_tget_double(GetByteContext *gb, int le);
+
+/** Reads a byte from the bytestream using given endianness. */
+unsigned ff_tget(GetByteContext *gb, int type, int le);
+
+/** Returns an allocated string containing count
+ *  rational values using the given separator.
+ */
+char *ff_trationals2str(int *rp, int count, const char *sep);
+
+/** Returns an allocated string containing count
+ *  long values using the given separator.
+ */
+char *ff_tlongs2str(int32_t *lp, int count, const char *sep);
+
+/** Returns an allocated string containing count
+ *  double values using the given separator.
+ */
+char *ff_tdoubles2str(double *dp, int count, const char *sep);
+
+/** Returns an allocated string containing count
+ *  short values using the given separator.
+ */
+char *ff_tshorts2str(int16_t *sp, int count, const char *sep);
+
+/** Adds count rationals converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_rational_metadata(int count, const char *name, const char *sep,
+                              GetByteContext *gb, int le, AVDictionary **metadata);
+
+/** Adds count longs converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_long_metadata(int count, const char *name, const char *sep,
+                          GetByteContext *gb, int le, AVDictionary **metadata);
+
+/** Adds count doubles converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_doubles_metadata(int count, const char *name, const char *sep,
+                             GetByteContext *gb, int le, AVDictionary **metadata);
+
+/** Adds count shorts converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_shorts_metadata(int count, const char *name, const char *sep,
+                            GetByteContext *gb, int le, int is_signed, AVDictionary **metadata);
+
+/** Adds count bytes converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_bytes_metadata(int count, const char *name, const char *sep,
+                           GetByteContext *gb, int le, int is_signed, AVDictionary **metadata);
+
+/** Adds a string of count characters
+ *  into the metadata dictionary.
+ */
+int ff_tadd_string_metadata(int count, const char *name,
+                            GetByteContext *gb, int le, AVDictionary **metadata);
+
+/** Decodes a TIFF header from the input bytestream
+ *  and sets the endianness in *le and the offset to
+ *  the first IFD in *ifd_offset accordingly.
+ */
+int ff_tdecode_header(GetByteContext *gb, int *le, int *ifd_offset);
+
+/** Reads the first 3 fields of a TIFF tag, which are
+ *  the tag id, the tag type and the count of values for that tag.
+ *  Afterwards the bytestream is located at the first value to read and
+ *  *next holds the bytestream offset of the following tag.
+ */
+int ff_tread_tag(GetByteContext *gb, int le, unsigned *tag, unsigned *type,
+                 unsigned *count, int *next);
+
+#endif /* AVCODEC_TIFF_COMMON_H */
diff --git a/libavcodec/tiff_data.c b/libavcodec/tiff_data.c
new file mode 100644
index 0000000000..88c2256813
--- /dev/null
+++ b/libavcodec/tiff_data.c
@@ -0,0 +1,1870 @@
+/*
+ * TIFF data tables
+ * Copyright (c) 2011 Thomas Kuehnel
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * TIFF data tables
+ * @author Thomas Kuehnel
+ * @see GeoTIFF specification at
+ * http://www.remotesensing.org/geotiff/spec/geotiffhome.html
+ */
+
+#include "tiff_data.h"
+
+const TiffGeoTagNameType ff_tiff_conf_name_type_map[] = {
+    {"GTModelTypeGeoKey",              GEOTIFF_SHORT },
+    {"GTRasterTypeGeoKey",             GEOTIFF_SHORT },
+    {"GTCitationGeoKey",               GEOTIFF_STRING}
+};
+
+const TiffGeoTagNameType ff_tiff_geog_name_type_map[] = {
+    {"GeographicTypeGeoKey",           GEOTIFF_SHORT },
+    {"GeogCitationGeoKey",             GEOTIFF_STRING},
+    {"GeogGeodeticDatumGeoKey",        GEOTIFF_SHORT },
+    {"GeogPrimeMeridianGeoKey",        GEOTIFF_SHORT },
+    {"GeogLinearUnitsGeoKey",          GEOTIFF_SHORT },
+    {"GeogLinearUnitSizeGeoKey",       GEOTIFF_DOUBLE},
+    {"GeogAngularUnitsGeoKey",         GEOTIFF_SHORT },
+    {"GeogAngularUnitSizeGeoKey",      GEOTIFF_DOUBLE},
+    {"GeogEllipsoidGeoKey",            GEOTIFF_SHORT },
+    {"GeogSemiMajorAxisGeoKey",        GEOTIFF_DOUBLE},
+    {"GeogSemiMinorAxisGeoKey",        GEOTIFF_DOUBLE},
+    {"GeogInvFlatteningGeoKey",        GEOTIFF_DOUBLE},
+    {"GeogAzimuthUnitsGeoKey",         GEOTIFF_SHORT },
+    {"GeogPrimeMeridianLongGeoKey",    GEOTIFF_DOUBLE}
+};
+
+const TiffGeoTagNameType ff_tiff_proj_name_type_map[] = {
+    {"ProjectedCSTypeGeoKey",          GEOTIFF_SHORT },
+    {"PCSCitationGeoKey",              GEOTIFF_STRING},
+    {"ProjectionGeoKey",               GEOTIFF_SHORT },
+    {"ProjCoordTransGeoKey",           GEOTIFF_SHORT },
+    {"ProjLinearUnitsGeoKey",          GEOTIFF_SHORT },
+    {"ProjLinearUnitSizeGeoKey",       GEOTIFF_DOUBLE},
+    {"ProjStdParallel1GeoKey",         GEOTIFF_DOUBLE},
+    {"ProjStdParallel2GeoKey",         GEOTIFF_DOUBLE},
+    {"ProjNatOriginLongGeoKey",        GEOTIFF_DOUBLE},
+    {"ProjNatOriginLatGeoKey",         GEOTIFF_DOUBLE},
+    {"ProjFalseEastingGeoKey",         GEOTIFF_DOUBLE},
+    {"ProjFalseNorthingGeoKey",        GEOTIFF_DOUBLE},
+    {"ProjFalseOriginLongGeoKey",      GEOTIFF_DOUBLE},
+    {"ProjFalseOriginLatGeoKey",       GEOTIFF_DOUBLE},
+    {"ProjFalseOriginEastingGeoKey",   GEOTIFF_DOUBLE},
+    {"ProjFalseOriginNorthingGeoKey",  GEOTIFF_DOUBLE},
+    {"ProjCenterLongGeoKey",           GEOTIFF_DOUBLE},
+    {"ProjCenterLatGeoKey",            GEOTIFF_DOUBLE},
+    {"ProjCenterEastingGeoKey",        GEOTIFF_DOUBLE},
+    {"ProjCenterNorthingGeoKey",       GEOTIFF_DOUBLE},
+    {"ProjScaleAtNatOriginGeoKey",     GEOTIFF_DOUBLE},
+    {"ProjScaleAtCenterGeoKey",        GEOTIFF_DOUBLE},
+    {"ProjAzimuthAngleGeoKey",         GEOTIFF_DOUBLE},
+    {"ProjStraightVertPoleLongGeoKey", GEOTIFF_DOUBLE}
+};
+
+const TiffGeoTagNameType ff_tiff_vert_name_type_map[] = {
+    {"VerticalCSTypeGeoKey",           GEOTIFF_SHORT },
+    {"VerticalCitationGeoKey",         GEOTIFF_STRING},
+    {"VerticalDatumGeoKey",            GEOTIFF_SHORT },
+    {"VerticalUnitsGeoKey",            GEOTIFF_SHORT }
+};
+
+const char *const ff_tiff_gt_model_type_codes[] = {
+    "ModelTypeProjected",
+    "ModelTypeGeographic",
+    "ModelTypeGeocentric"
+};
+
+const char *const ff_tiff_gt_raster_type_codes[] = {
+    "RasterPixelIsArea",
+    "RasterPixelIsPoint"
+};
+
+const char *const ff_tiff_linear_unit_codes[] = {
+    "Linear_Meter",
+    "Linear_Foot",
+    "Linear_Foot_US_Survey",
+    "Linear_Foot_Modified_American",
+    "Linear_Foot_Clarke",
+    "Linear_Foot_Indian",
+    "Linear_Link",
+    "Linear_Link_Benoit",
+    "Linear_Link_Sears",
+    "Linear_Chain_Benoit",
+    "Linear_Chain_Sears",
+    "Linear_Yard_Sears",
+    "Linear_Yard_Indian",
+    "Linear_Fathom",
+    "Linear_Mile_International_Nautical"
+};
+
+const char *const ff_tiff_angular_unit_codes[] = {
+    "Angular_Radian",
+    "Angular_Degree",
+    "Angular_Arc_Minute",
+    "Angular_Arc_Second",
+    "Angular_Grad",
+    "Angular_Gon",
+    "Angular_DMS",
+    "Angular_DMS_Hemisphere"
+};
+
+const char *const ff_tiff_gcs_type_codes[] = {
+    "GCS_Adindan",
+    "GCS_AGD66",
+    "GCS_AGD84",
+    "GCS_Ain_el_Abd",
+    "GCS_Afgooye",
+    "GCS_Agadez",
+    "GCS_Lisbon",
+    "GCS_Aratu",
+    "GCS_Arc_1950",
+    "GCS_Arc_1960",
+    "GCS_Batavia",
+    "GCS_Barbados",
+    "GCS_Beduaram",
+    "GCS_Beijing_1954",
+    "GCS_Belge_1950",
+    "GCS_Bermuda_1957",
+    "GCS_Bern_1898",
+    "GCS_Bogota",
+    "GCS_Bukit_Rimpah",
+    "GCS_Camacupa",
+    "GCS_Campo_Inchauspe",
+    "GCS_Cape",
+    "GCS_Carthage",
+    "GCS_Chua",
+    "GCS_Corrego_Alegre",
+    "GCS_Cote_d_Ivoire",
+    "GCS_Deir_ez_Zor",
+    "GCS_Douala",
+    "GCS_Egypt_1907",
+    "GCS_ED50",
+    "GCS_ED87",
+    "GCS_Fahud",
+    "GCS_Gandajika_1970",
+    "GCS_Garoua",
+    "GCS_Guyane_Francaise",
+    "GCS_Hu_Tzu_Shan",
+    "GCS_HD72",
+    "GCS_ID74",
+    "GCS_Indian_1954",
+    "GCS_Indian_1975",
+    "GCS_Jamaica_1875",
+    "GCS_JAD69",
+    "GCS_Kalianpur",
+    "GCS_Kandawala",
+    "GCS_Kertau",
+    "GCS_KOC",
+    "GCS_La_Canoa",
+    "GCS_PSAD56",
+    "GCS_Lake",
+    "GCS_Leigon",
+    "GCS_Liberia_1964",
+    "GCS_Lome",
+    "GCS_Luzon_1911",
+    "GCS_Hito_XVIII_1963",
+    "GCS_Herat_North",
+    "GCS_Mahe_1971",
+    "GCS_Makassar",
+    "GCS_EUREF89",
+    "GCS_Malongo_1987",
+    "GCS_Manoca",
+    "GCS_Merchich",
+    "GCS_Massawa",
+    "GCS_Minna",
+    "GCS_Mhast",
+    "GCS_Monte_Mario",
+    "GCS_M_poraloko",
+    "GCS_NAD27",
+    "GCS_NAD_Michigan",
+    "GCS_NAD83",
+    "GCS_Nahrwan_1967",
+    "GCS_Naparima_1972",
+    "GCS_GD49",
+    "GCS_NGO_1948",
+    "GCS_Datum_73",
+    "GCS_NTF",
+    "GCS_NSWC_9Z_2",
+    "GCS_OSGB_1936",
+    "GCS_OSGB70",
+    "GCS_OS_SN80",
+    "GCS_Padang",
+    "GCS_Palestine_1923",
+    "GCS_Pointe_Noire",
+    "GCS_GDA94",
+    "GCS_Pulkovo_1942",
+    "GCS_Qatar",
+    "GCS_Qatar_1948",
+    "GCS_Qornoq",
+    "GCS_Loma_Quintana",
+    "GCS_Amersfoort",
+    "GCS_RT38",
+    "GCS_SAD69",
+    "GCS_Sapper_Hill_1943",
+    "GCS_Schwarzeck",
+    "GCS_Segora",
+    "GCS_Serindung",
+    "GCS_Sudan",
+    "GCS_Tananarive",
+    "GCS_Timbalai_1948",
+    "GCS_TM65",
+    "GCS_TM75",
+    "GCS_Tokyo",
+    "GCS_Trinidad_1903",
+    "GCS_TC_1948",
+    "GCS_Voirol_1875",
+    "GCS_Voirol_Unifie",
+    "GCS_Bern_1938",
+    "GCS_Nord_Sahara_1959",
+    "GCS_Stockholm_1938",
+    "GCS_Yacare",
+    "GCS_Yoff",
+    "GCS_Zanderij",
+    "GCS_MGI",
+    "GCS_Belge_1972",
+    "GCS_DHDN",
+    "GCS_Conakry_1905",
+    "GCS_WGS_72",
+    "GCS_WGS_72BE",
+    "GCS_WGS_84",
+    "GCS_Bern_1898_Bern",
+    "GCS_Bogota_Bogota",
+    "GCS_Lisbon_Lisbon",
+    "GCS_Makassar_Jakarta",
+    "GCS_MGI_Ferro",
+    "GCS_Monte_Mario_Rome",
+    "GCS_NTF_Paris",
+    "GCS_Padang_Jakarta",
+    "GCS_Belge_1950_Brussels",
+    "GCS_Tananarive_Paris",
+    "GCS_Voirol_1875_Paris",
+    "GCS_Voirol_Unifie_Paris",
+    "GCS_Batavia_Jakarta",
+    "GCS_ATF_Paris",
+    "GCS_NDG_Paris"
+};
+
+const char *const ff_tiff_gcse_type_codes[] = {
+    "GCSE_Airy1830",
+    "GCSE_AiryModified1849",
+    "GCSE_AustralianNationalSpheroid",
+    "GCSE_Bessel1841",
+    "GCSE_BesselModified",
+    "GCSE_BesselNamibia",
+    "GCSE_Clarke1858",
+    "GCSE_Clarke1866",
+    "GCSE_Clarke1866Michigan",
+    "GCSE_Clarke1880_Benoit",
+    "GCSE_Clarke1880_IGN",
+    "GCSE_Clarke1880_RGS",
+    "GCSE_Clarke1880_Arc",
+    "GCSE_Clarke1880_SGA1922",
+    "GCSE_Everest1830_1937Adjustment",
+    "GCSE_Everest1830_1967Definition",
+    "GCSE_Everest1830_1975Definition",
+    "GCSE_Everest1830Modified",
+    "GCSE_GRS1980",
+    "GCSE_Helmert1906",
+    "GCSE_IndonesianNationalSpheroid",
+    "GCSE_International1924",
+    "GCSE_International1967",
+    "GCSE_Krassowsky1940",
+    "GCSE_NWL9D",
+    "GCSE_NWL10D",
+    "GCSE_Plessis1817",
+    "GCSE_Struve1860",
+    "GCSE_WarOffice",
+    "GCSE_WGS84",
+    "GCSE_GEM10C",
+    "GCSE_OSU86F",
+    "GCSE_OSU91A",
+    "GCSE_Clarke1880",
+    "GCSE_Sphere"
+};
+
+const char *const ff_tiff_geodetic_datum_codes[] = {
+    "Datum_Adindan",
+    "Datum_Australian_Geodetic_Datum_1966",
+    "Datum_Australian_Geodetic_Datum_1984",
+    "Datum_Ain_el_Abd_1970",
+    "Datum_Afgooye",
+    "Datum_Agadez",
+    "Datum_Lisbon",
+    "Datum_Aratu",
+    "Datum_Arc_1950",
+    "Datum_Arc_1960",
+    "Datum_Batavia",
+    "Datum_Barbados",
+    "Datum_Beduaram",
+    "Datum_Beijing_1954",
+    "Datum_Reseau_National_Belge_1950",
+    "Datum_Bermuda_1957",
+    "Datum_Bern_1898",
+    "Datum_Bogota",
+    "Datum_Bukit_Rimpah",
+    "Datum_Camacupa",
+    "Datum_Campo_Inchauspe",
+    "Datum_Cape",
+    "Datum_Carthage",
+    "Datum_Chua",
+    "Datum_Corrego_Alegre",
+    "Datum_Cote_d_Ivoire",
+    "Datum_Deir_ez_Zor",
+    "Datum_Douala",
+    "Datum_Egypt_1907",
+    "Datum_European_Datum_1950",
+    "Datum_European_Datum_1987",
+    "Datum_Fahud",
+    "Datum_Gandajika_1970",
+    "Datum_Garoua",
+    "Datum_Guyane_Francaise",
+    "Datum_Hu_Tzu_Shan",
+    "Datum_Hungarian_Datum_1972",
+    "Datum_Indonesian_Datum_1974",
+    "Datum_Indian_1954",
+    "Datum_Indian_1975",
+    "Datum_Jamaica_1875",
+    "Datum_Jamaica_1969",
+    "Datum_Kalianpur",
+    "Datum_Kandawala",
+    "Datum_Kertau",
+    "Datum_Kuwait_Oil_Company",
+    "Datum_La_Canoa",
+    "Datum_Provisional_S_American_Datum_1956",
+    "Datum_Lake",
+    "Datum_Leigon",
+    "Datum_Liberia_1964",
+    "Datum_Lome",
+    "Datum_Luzon_1911",
+    "Datum_Hito_XVIII_1963",
+    "Datum_Herat_North",
+    "Datum_Mahe_1971",
+    "Datum_Makassar",
+    "Datum_European_Reference_System_1989",
+    "Datum_Malongo_1987",
+    "Datum_Manoca",
+    "Datum_Merchich",
+    "Datum_Massawa",
+    "Datum_Minna",
+    "Datum_Mhast",
+    "Datum_Monte_Mario",
+    "Datum_M_poraloko",
+    "Datum_North_American_Datum_1927",
+    "Datum_NAD_Michigan",
+    "Datum_North_American_Datum_1983",
+    "Datum_Nahrwan_1967",
+    "Datum_Naparima_1972",
+    "Datum_New_Zealand_Geodetic_Datum_1949",
+    "Datum_NGO_1948",
+    "Datum_Datum_73",
+    "Datum_Nouvelle_Triangulation_Francaise",
+    "Datum_NSWC_9Z_2",
+    "Datum_OSGB_1936",
+    "Datum_OSGB_1970_SN",
+    "Datum_OS_SN_1980",
+    "Datum_Padang_1884",
+    "Datum_Palestine_1923",
+    "Datum_Pointe_Noire",
+    "Datum_Geocentric_Datum_of_Australia_1994",
+    "Datum_Pulkovo_1942",
+    "Datum_Qatar",
+    "Datum_Qatar_1948",
+    "Datum_Qornoq",
+    "Datum_Loma_Quintana",
+    "Datum_Amersfoort",
+    "Datum_RT38",
+    "Datum_South_American_Datum_1969",
+    "Datum_Sapper_Hill_1943",
+    "Datum_Schwarzeck",
+    "Datum_Segora",
+    "Datum_Serindung",
+    "Datum_Sudan",
+    "Datum_Tananarive_1925",
+    "Datum_Timbalai_1948",
+    "Datum_TM65",
+    "Datum_TM75",
+    "Datum_Tokyo",
+    "Datum_Trinidad_1903",
+    "Datum_Trucial_Coast_1948",
+    "Datum_Voirol_1875",
+    "Datum_Voirol_Unifie_1960",
+    "Datum_Bern_1938",
+    "Datum_Nord_Sahara_1959",
+    "Datum_Stockholm_1938",
+    "Datum_Yacare",
+    "Datum_Yoff",
+    "Datum_Zanderij",
+    "Datum_Militar_Geographische_Institut",
+    "Datum_Reseau_National_Belge_1972",
+    "Datum_Deutsche_Hauptdreiecksnetz",
+    "Datum_Conakry_1905",
+    "Datum_WGS72",
+    "Datum_WGS72_Transit_Broadcast_Ephemeris",
+    "Datum_WGS84",
+    "Datum_Ancienne_Triangulation_Francaise",
+    "Datum_Nord_de_Guerre"
+};
+
+const char *const ff_tiff_geodetic_datum_e_codes[] = {
+    "DatumE_Airy1830",
+    "DatumE_AiryModified1849",
+    "DatumE_AustralianNationalSpheroid",
+    "DatumE_Bessel1841",
+    "DatumE_BesselModified",
+    "DatumE_BesselNamibia",
+    "DatumE_Clarke1858",
+    "DatumE_Clarke1866",
+    "DatumE_Clarke1866Michigan",
+    "DatumE_Clarke1880_Benoit",
+    "DatumE_Clarke1880_IGN",
+    "DatumE_Clarke1880_RGS",
+    "DatumE_Clarke1880_Arc",
+    "DatumE_Clarke1880_SGA1922",
+    "DatumE_Everest1830_1937Adjustment",
+    "DatumE_Everest1830_1967Definition",
+    "DatumE_Everest1830_1975Definition",
+    "DatumE_Everest1830Modified",
+    "DatumE_GRS1980",
+    "DatumE_Helmert1906",
+    "DatumE_IndonesianNationalSpheroid",
+    "DatumE_International1924",
+    "DatumE_International1967",
+    "DatumE_Krassowsky1960",
+    "DatumE_NWL9D",
+    "DatumE_NWL10D",
+    "DatumE_Plessis1817",
+    "DatumE_Struve1860",
+    "DatumE_WarOffice",
+    "DatumE_WGS84",
+    "DatumE_GEM10C",
+    "DatumE_OSU86F",
+    "DatumE_OSU91A",
+    "DatumE_Clarke1880",
+    "DatumE_Sphere"
+};
+
+const char *const ff_tiff_ellipsoid_codes[] = {
+    "Ellipse_Airy_1830",
+    "Ellipse_Airy_Modified_1849",
+    "Ellipse_Australian_National_Spheroid",
+    "Ellipse_Bessel_1841",
+    "Ellipse_Bessel_Modified",
+    "Ellipse_Bessel_Namibia",
+    "Ellipse_Clarke_1858",
+    "Ellipse_Clarke_1866",
+    "Ellipse_Clarke_1866_Michigan",
+    "Ellipse_Clarke_1880_Benoit",
+    "Ellipse_Clarke_1880_IGN",
+    "Ellipse_Clarke_1880_RGS",
+    "Ellipse_Clarke_1880_Arc",
+    "Ellipse_Clarke_1880_SGA_1922",
+    "Ellipse_Everest_1830_1937_Adjustment",
+    "Ellipse_Everest_1830_1967_Definition",
+    "Ellipse_Everest_1830_1975_Definition",
+    "Ellipse_Everest_1830_Modified",
+    "Ellipse_GRS_1980",
+    "Ellipse_Helmert_1906",
+    "Ellipse_Indonesian_National_Spheroid",
+    "Ellipse_International_1924",
+    "Ellipse_International_1967",
+    "Ellipse_Krassowsky_1940",
+    "Ellipse_NWL_9D",
+    "Ellipse_NWL_10D",
+    "Ellipse_Plessis_1817",
+    "Ellipse_Struve_1860",
+    "Ellipse_War_Office",
+    "Ellipse_WGS_84",
+    "Ellipse_GEM_10C",
+    "Ellipse_OSU86F",
+    "Ellipse_OSU91A",
+    "Ellipse_Clarke_1880",
+    "Ellipse_Sphere"
+};
+
+const char *const ff_tiff_prime_meridian_codes[] = {
+    "PM_Greenwich",
+    "PM_Lisbon",
+    "PM_Paris",
+    "PM_Bogota",
+    "PM_Madrid",
+    "PM_Rome",
+    "PM_Bern",
+    "PM_Jakarta",
+    "PM_Ferro",
+    "PM_Brussels",
+    "PM_Stockholm"
+};
+
+const TiffGeoTagKeyName ff_tiff_proj_cs_type_codes[] = {
+    {20137, "PCS_Adindan_UTM_zone_37N"},
+    {20138, "PCS_Adindan_UTM_zone_38N"},
+    {20248, "PCS_AGD66_AMG_zone_48"},
+    {20249, "PCS_AGD66_AMG_zone_49"},
+    {20250, "PCS_AGD66_AMG_zone_50"},
+    {20251, "PCS_AGD66_AMG_zone_51"},
+    {20252, "PCS_AGD66_AMG_zone_52"},
+    {20253, "PCS_AGD66_AMG_zone_53"},
+    {20254, "PCS_AGD66_AMG_zone_54"},
+    {20255, "PCS_AGD66_AMG_zone_55"},
+    {20256, "PCS_AGD66_AMG_zone_56"},
+    {20257, "PCS_AGD66_AMG_zone_57"},
+    {20258, "PCS_AGD66_AMG_zone_58"},
+    {20348, "PCS_AGD84_AMG_zone_48"},
+    {20349, "PCS_AGD84_AMG_zone_49"},
+    {20350, "PCS_AGD84_AMG_zone_50"},
+    {20351, "PCS_AGD84_AMG_zone_51"},
+    {20352, "PCS_AGD84_AMG_zone_52"},
+    {20353, "PCS_AGD84_AMG_zone_53"},
+    {20354, "PCS_AGD84_AMG_zone_54"},
+    {20355, "PCS_AGD84_AMG_zone_55"},
+    {20356, "PCS_AGD84_AMG_zone_56"},
+    {20357, "PCS_AGD84_AMG_zone_57"},
+    {20358, "PCS_AGD84_AMG_zone_58"},
+    {20437, "PCS_Ain_el_Abd_UTM_zone_37N"},
+    {20438, "PCS_Ain_el_Abd_UTM_zone_38N"},
+    {20439, "PCS_Ain_el_Abd_UTM_zone_39N"},
+    {20499, "PCS_Ain_el_Abd_Bahrain_Grid"},
+    {20538, "PCS_Afgooye_UTM_zone_38N"},
+    {20539, "PCS_Afgooye_UTM_zone_39N"},
+    {20700, "PCS_Lisbon_Portugese_Grid"},
+    {20822, "PCS_Aratu_UTM_zone_22S"},
+    {20823, "PCS_Aratu_UTM_zone_23S"},
+    {20824, "PCS_Aratu_UTM_zone_24S"},
+    {20973, "PCS_Arc_1950_Lo13"},
+    {20975, "PCS_Arc_1950_Lo15"},
+    {20977, "PCS_Arc_1950_Lo17"},
+    {20979, "PCS_Arc_1950_Lo19"},
+    {20981, "PCS_Arc_1950_Lo21"},
+    {20983, "PCS_Arc_1950_Lo23"},
+    {20985, "PCS_Arc_1950_Lo25"},
+    {20987, "PCS_Arc_1950_Lo27"},
+    {20989, "PCS_Arc_1950_Lo29"},
+    {20991, "PCS_Arc_1950_Lo31"},
+    {20993, "PCS_Arc_1950_Lo33"},
+    {20995, "PCS_Arc_1950_Lo35"},
+    {21100, "PCS_Batavia_NEIEZ"},
+    {21148, "PCS_Batavia_UTM_zone_48S"},
+    {21149, "PCS_Batavia_UTM_zone_49S"},
+    {21150, "PCS_Batavia_UTM_zone_50S"},
+    {21413, "PCS_Beijing_Gauss_zone_13"},
+    {21414, "PCS_Beijing_Gauss_zone_14"},
+    {21415, "PCS_Beijing_Gauss_zone_15"},
+    {21416, "PCS_Beijing_Gauss_zone_16"},
+    {21417, "PCS_Beijing_Gauss_zone_17"},
+    {21418, "PCS_Beijing_Gauss_zone_18"},
+    {21419, "PCS_Beijing_Gauss_zone_19"},
+    {21420, "PCS_Beijing_Gauss_zone_20"},
+    {21421, "PCS_Beijing_Gauss_zone_21"},
+    {21422, "PCS_Beijing_Gauss_zone_22"},
+    {21423, "PCS_Beijing_Gauss_zone_23"},
+    {21473, "PCS_Beijing_Gauss_13N"},
+    {21474, "PCS_Beijing_Gauss_14N"},
+    {21475, "PCS_Beijing_Gauss_15N"},
+    {21476, "PCS_Beijing_Gauss_16N"},
+    {21477, "PCS_Beijing_Gauss_17N"},
+    {21478, "PCS_Beijing_Gauss_18N"},
+    {21479, "PCS_Beijing_Gauss_19N"},
+    {21480, "PCS_Beijing_Gauss_20N"},
+    {21481, "PCS_Beijing_Gauss_21N"},
+    {21482, "PCS_Beijing_Gauss_22N"},
+    {21483, "PCS_Beijing_Gauss_23N"},
+    {21500, "PCS_Belge_Lambert_50"},
+    {21790, "PCS_Bern_1898_Swiss_Old"},
+    {21817, "PCS_Bogota_UTM_zone_17N"},
+    {21818, "PCS_Bogota_UTM_zone_18N"},
+    {21891, "PCS_Bogota_Colombia_3W"},
+    {21892, "PCS_Bogota_Colombia_Bogota"},
+    {21893, "PCS_Bogota_Colombia_3E"},
+    {21894, "PCS_Bogota_Colombia_6E"},
+    {22032, "PCS_Camacupa_UTM_32S"},
+    {22033, "PCS_Camacupa_UTM_33S"},
+    {22191, "PCS_C_Inchauspe_Argentina_1"},
+    {22192, "PCS_C_Inchauspe_Argentina_2"},
+    {22193, "PCS_C_Inchauspe_Argentina_3"},
+    {22194, "PCS_C_Inchauspe_Argentina_4"},
+    {22195, "PCS_C_Inchauspe_Argentina_5"},
+    {22196, "PCS_C_Inchauspe_Argentina_6"},
+    {22197, "PCS_C_Inchauspe_Argentina_7"},
+    {22332, "PCS_Carthage_UTM_zone_32N"},
+    {22391, "PCS_Carthage_Nord_Tunisie"},
+    {22392, "PCS_Carthage_Sud_Tunisie"},
+    {22523, "PCS_Corrego_Alegre_UTM_23S"},
+    {22524, "PCS_Corrego_Alegre_UTM_24S"},
+    {22832, "PCS_Douala_UTM_zone_32N"},
+    {22992, "PCS_Egypt_1907_Red_Belt"},
+    {22993, "PCS_Egypt_1907_Purple_Belt"},
+    {22994, "PCS_Egypt_1907_Ext_Purple"},
+    {23028, "PCS_ED50_UTM_zone_28N"},
+    {23029, "PCS_ED50_UTM_zone_29N"},
+    {23030, "PCS_ED50_UTM_zone_30N"},
+    {23031, "PCS_ED50_UTM_zone_31N"},
+    {23032, "PCS_ED50_UTM_zone_32N"},
+    {23033, "PCS_ED50_UTM_zone_33N"},
+    {23034, "PCS_ED50_UTM_zone_34N"},
+    {23035, "PCS_ED50_UTM_zone_35N"},
+    {23036, "PCS_ED50_UTM_zone_36N"},
+    {23037, "PCS_ED50_UTM_zone_37N"},
+    {23038, "PCS_ED50_UTM_zone_38N"},
+    {23239, "PCS_Fahud_UTM_zone_39N"},
+    {23240, "PCS_Fahud_UTM_zone_40N"},
+    {23433, "PCS_Garoua_UTM_zone_33N"},
+    {23846, "PCS_ID74_UTM_zone_46N"},
+    {23847, "PCS_ID74_UTM_zone_47N"},
+    {23848, "PCS_ID74_UTM_zone_48N"},
+    {23849, "PCS_ID74_UTM_zone_49N"},
+    {23850, "PCS_ID74_UTM_zone_50N"},
+    {23851, "PCS_ID74_UTM_zone_51N"},
+    {23852, "PCS_ID74_UTM_zone_52N"},
+    {23853, "PCS_ID74_UTM_zone_53N"},
+    {23886, "PCS_ID74_UTM_zone_46S"},
+    {23887, "PCS_ID74_UTM_zone_47S"},
+    {23888, "PCS_ID74_UTM_zone_48S"},
+    {23889, "PCS_ID74_UTM_zone_49S"},
+    {23890, "PCS_ID74_UTM_zone_50S"},
+    {23891, "PCS_ID74_UTM_zone_51S"},
+    {23892, "PCS_ID74_UTM_zone_52S"},
+    {23893, "PCS_ID74_UTM_zone_53S"},
+    {23894, "PCS_ID74_UTM_zone_54S"},
+    {23947, "PCS_Indian_1954_UTM_47N"},
+    {23948, "PCS_Indian_1954_UTM_48N"},
+    {24047, "PCS_Indian_1975_UTM_47N"},
+    {24048, "PCS_Indian_1975_UTM_48N"},
+    {24100, "PCS_Jamaica_1875_Old_Grid"},
+    {24200, "PCS_JAD69_Jamaica_Grid"},
+    {24370, "PCS_Kalianpur_India_0"},
+    {24371, "PCS_Kalianpur_India_I"},
+    {24372, "PCS_Kalianpur_India_IIa"},
+    {24373, "PCS_Kalianpur_India_IIIa"},
+    {24374, "PCS_Kalianpur_India_IVa"},
+    {24382, "PCS_Kalianpur_India_IIb"},
+    {24383, "PCS_Kalianpur_India_IIIb"},
+    {24384, "PCS_Kalianpur_India_IVb"},
+    {24500, "PCS_Kertau_Singapore_Grid"},
+    {24547, "PCS_Kertau_UTM_zone_47N"},
+    {24548, "PCS_Kertau_UTM_zone_48N"},
+    {24720, "PCS_La_Canoa_UTM_zone_20N"},
+    {24721, "PCS_La_Canoa_UTM_zone_21N"},
+    {24818, "PCS_PSAD56_UTM_zone_18N"},
+    {24819, "PCS_PSAD56_UTM_zone_19N"},
+    {24820, "PCS_PSAD56_UTM_zone_20N"},
+    {24821, "PCS_PSAD56_UTM_zone_21N"},
+    {24877, "PCS_PSAD56_UTM_zone_17S"},
+    {24878, "PCS_PSAD56_UTM_zone_18S"},
+    {24879, "PCS_PSAD56_UTM_zone_19S"},
+    {24880, "PCS_PSAD56_UTM_zone_20S"},
+    {24891, "PCS_PSAD56_Peru_west_zone"},
+    {24892, "PCS_PSAD56_Peru_central"},
+    {24893, "PCS_PSAD56_Peru_east_zone"},
+    {25000, "PCS_Leigon_Ghana_Grid"},
+    {25231, "PCS_Lome_UTM_zone_31N"},
+    {25391, "PCS_Luzon_Philippines_I"},
+    {25392, "PCS_Luzon_Philippines_II"},
+    {25393, "PCS_Luzon_Philippines_III"},
+    {25394, "PCS_Luzon_Philippines_IV"},
+    {25395, "PCS_Luzon_Philippines_V"},
+    {25700, "PCS_Makassar_NEIEZ"},
+    {25932, "PCS_Malongo_1987_UTM_32S"},
+    {26191, "PCS_Merchich_Nord_Maroc"},
+    {26192, "PCS_Merchich_Sud_Maroc"},
+    {26193, "PCS_Merchich_Sahara"},
+    {26237, "PCS_Massawa_UTM_zone_37N"},
+    {26331, "PCS_Minna_UTM_zone_31N"},
+    {26332, "PCS_Minna_UTM_zone_32N"},
+    {26391, "PCS_Minna_Nigeria_West"},
+    {26392, "PCS_Minna_Nigeria_Mid_Belt"},
+    {26393, "PCS_Minna_Nigeria_East"},
+    {26432, "PCS_Mhast_UTM_zone_32S"},
+    {26591, "PCS_Monte_Mario_Italy_1"},
+    {26592, "PCS_Monte_Mario_Italy_2"},
+    {26632, "PCS_M_poraloko_UTM_32N"},
+    {26692, "PCS_M_poraloko_UTM_32S"},
+    {26703, "PCS_NAD27_UTM_zone_3N"},
+    {26704, "PCS_NAD27_UTM_zone_4N"},
+    {26705, "PCS_NAD27_UTM_zone_5N"},
+    {26706, "PCS_NAD27_UTM_zone_6N"},
+    {26707, "PCS_NAD27_UTM_zone_7N"},
+    {26708, "PCS_NAD27_UTM_zone_8N"},
+    {26709, "PCS_NAD27_UTM_zone_9N"},
+    {26710, "PCS_NAD27_UTM_zone_10N"},
+    {26711, "PCS_NAD27_UTM_zone_11N"},
+    {26712, "PCS_NAD27_UTM_zone_12N"},
+    {26713, "PCS_NAD27_UTM_zone_13N"},
+    {26714, "PCS_NAD27_UTM_zone_14N"},
+    {26715, "PCS_NAD27_UTM_zone_15N"},
+    {26716, "PCS_NAD27_UTM_zone_16N"},
+    {26717, "PCS_NAD27_UTM_zone_17N"},
+    {26718, "PCS_NAD27_UTM_zone_18N"},
+    {26719, "PCS_NAD27_UTM_zone_19N"},
+    {26720, "PCS_NAD27_UTM_zone_20N"},
+    {26721, "PCS_NAD27_UTM_zone_21N"},
+    {26722, "PCS_NAD27_UTM_zone_22N"},
+    {26729, "PCS_NAD27_Alabama_East"},
+    {26730, "PCS_NAD27_Alabama_West"},
+    {26731, "PCS_NAD27_Alaska_zone_1"},
+    {26732, "PCS_NAD27_Alaska_zone_2"},
+    {26733, "PCS_NAD27_Alaska_zone_3"},
+    {26734, "PCS_NAD27_Alaska_zone_4"},
+    {26735, "PCS_NAD27_Alaska_zone_5"},
+    {26736, "PCS_NAD27_Alaska_zone_6"},
+    {26737, "PCS_NAD27_Alaska_zone_7"},
+    {26738, "PCS_NAD27_Alaska_zone_8"},
+    {26739, "PCS_NAD27_Alaska_zone_9"},
+    {26740, "PCS_NAD27_Alaska_zone_10"},
+    {26741, "PCS_NAD27_California_I"},
+    {26742, "PCS_NAD27_California_II"},
+    {26743, "PCS_NAD27_California_III"},
+    {26744, "PCS_NAD27_California_IV"},
+    {26745, "PCS_NAD27_California_V"},
+    {26746, "PCS_NAD27_California_VI"},
+    {26747, "PCS_NAD27_California_VII"},
+    {26748, "PCS_NAD27_Arizona_East"},
+    {26749, "PCS_NAD27_Arizona_Central"},
+    {26750, "PCS_NAD27_Arizona_West"},
+    {26751, "PCS_NAD27_Arkansas_North"},
+    {26752, "PCS_NAD27_Arkansas_South"},
+    {26753, "PCS_NAD27_Colorado_North"},
+    {26754, "PCS_NAD27_Colorado_Central"},
+    {26755, "PCS_NAD27_Colorado_South"},
+    {26756, "PCS_NAD27_Connecticut"},
+    {26757, "PCS_NAD27_Delaware"},
+    {26758, "PCS_NAD27_Florida_East"},
+    {26759, "PCS_NAD27_Florida_West"},
+    {26760, "PCS_NAD27_Florida_North"},
+    {26761, "PCS_NAD27_Hawaii_zone_1"},
+    {26762, "PCS_NAD27_Hawaii_zone_2"},
+    {26763, "PCS_NAD27_Hawaii_zone_3"},
+    {26764, "PCS_NAD27_Hawaii_zone_4"},
+    {26765, "PCS_NAD27_Hawaii_zone_5"},
+    {26766, "PCS_NAD27_Georgia_East"},
+    {26767, "PCS_NAD27_Georgia_West"},
+    {26768, "PCS_NAD27_Idaho_East"},
+    {26769, "PCS_NAD27_Idaho_Central"},
+    {26770, "PCS_NAD27_Idaho_West"},
+    {26771, "PCS_NAD27_Illinois_East"},
+    {26772, "PCS_NAD27_Illinois_West"},
+    {26773, "PCS_NAD27_Indiana_East"},
+    {26774, "PCS_NAD27_BLM_14N_feet"},
+    {26774, "PCS_NAD27_Indiana_West"},
+    {26775, "PCS_NAD27_BLM_15N_feet"},
+    {26775, "PCS_NAD27_Iowa_North"},
+    {26776, "PCS_NAD27_BLM_16N_feet"},
+    {26776, "PCS_NAD27_Iowa_South"},
+    {26777, "PCS_NAD27_BLM_17N_feet"},
+    {26777, "PCS_NAD27_Kansas_North"},
+    {26778, "PCS_NAD27_Kansas_South"},
+    {26779, "PCS_NAD27_Kentucky_North"},
+    {26780, "PCS_NAD27_Kentucky_South"},
+    {26781, "PCS_NAD27_Louisiana_North"},
+    {26782, "PCS_NAD27_Louisiana_South"},
+    {26783, "PCS_NAD27_Maine_East"},
+    {26784, "PCS_NAD27_Maine_West"},
+    {26785, "PCS_NAD27_Maryland"},
+    {26786, "PCS_NAD27_Massachusetts"},
+    {26787, "PCS_NAD27_Massachusetts_Is"},
+    {26788, "PCS_NAD27_Michigan_North"},
+    {26789, "PCS_NAD27_Michigan_Central"},
+    {26790, "PCS_NAD27_Michigan_South"},
+    {26791, "PCS_NAD27_Minnesota_North"},
+    {26792, "PCS_NAD27_Minnesota_Cent"},
+    {26793, "PCS_NAD27_Minnesota_South"},
+    {26794, "PCS_NAD27_Mississippi_East"},
+    {26795, "PCS_NAD27_Mississippi_West"},
+    {26796, "PCS_NAD27_Missouri_East"},
+    {26797, "PCS_NAD27_Missouri_Central"},
+    {26798, "PCS_NAD27_Missouri_West"},
+    {26801, "PCS_NAD_Michigan_Michigan_East"},
+    {26802, "PCS_NAD_Michigan_Michigan_Old_Central"},
+    {26803, "PCS_NAD_Michigan_Michigan_West"},
+    {26903, "PCS_NAD83_UTM_zone_3N"},
+    {26904, "PCS_NAD83_UTM_zone_4N"},
+    {26905, "PCS_NAD83_UTM_zone_5N"},
+    {26906, "PCS_NAD83_UTM_zone_6N"},
+    {26907, "PCS_NAD83_UTM_zone_7N"},
+    {26908, "PCS_NAD83_UTM_zone_8N"},
+    {26909, "PCS_NAD83_UTM_zone_9N"},
+    {26910, "PCS_NAD83_UTM_zone_10N"},
+    {26911, "PCS_NAD83_UTM_zone_11N"},
+    {26912, "PCS_NAD83_UTM_zone_12N"},
+    {26913, "PCS_NAD83_UTM_zone_13N"},
+    {26914, "PCS_NAD83_UTM_zone_14N"},
+    {26915, "PCS_NAD83_UTM_zone_15N"},
+    {26916, "PCS_NAD83_UTM_zone_16N"},
+    {26917, "PCS_NAD83_UTM_zone_17N"},
+    {26918, "PCS_NAD83_UTM_zone_18N"},
+    {26919, "PCS_NAD83_UTM_zone_19N"},
+    {26920, "PCS_NAD83_UTM_zone_20N"},
+    {26921, "PCS_NAD83_UTM_zone_21N"},
+    {26922, "PCS_NAD83_UTM_zone_22N"},
+    {26923, "PCS_NAD83_UTM_zone_23N"},
+    {26929, "PCS_NAD83_Alabama_East"},
+    {26930, "PCS_NAD83_Alabama_West"},
+    {26931, "PCS_NAD83_Alaska_zone_1"},
+    {26932, "PCS_NAD83_Alaska_zone_2"},
+    {26933, "PCS_NAD83_Alaska_zone_3"},
+    {26934, "PCS_NAD83_Alaska_zone_4"},
+    {26935, "PCS_NAD83_Alaska_zone_5"},
+    {26936, "PCS_NAD83_Alaska_zone_6"},
+    {26937, "PCS_NAD83_Alaska_zone_7"},
+    {26938, "PCS_NAD83_Alaska_zone_8"},
+    {26939, "PCS_NAD83_Alaska_zone_9"},
+    {26940, "PCS_NAD83_Alaska_zone_10"},
+    {26941, "PCS_NAD83_California_1"},
+    {26942, "PCS_NAD83_California_2"},
+    {26943, "PCS_NAD83_California_3"},
+    {26944, "PCS_NAD83_California_4"},
+    {26945, "PCS_NAD83_California_5"},
+    {26946, "PCS_NAD83_California_6"},
+    {26948, "PCS_NAD83_Arizona_East"},
+    {26949, "PCS_NAD83_Arizona_Central"},
+    {26950, "PCS_NAD83_Arizona_West"},
+    {26951, "PCS_NAD83_Arkansas_North"},
+    {26952, "PCS_NAD83_Arkansas_South"},
+    {26953, "PCS_NAD83_Colorado_North"},
+    {26954, "PCS_NAD83_Colorado_Central"},
+    {26955, "PCS_NAD83_Colorado_South"},
+    {26956, "PCS_NAD83_Connecticut"},
+    {26957, "PCS_NAD83_Delaware"},
+    {26958, "PCS_NAD83_Florida_East"},
+    {26959, "PCS_NAD83_Florida_West"},
+    {26960, "PCS_NAD83_Florida_North"},
+    {26961, "PCS_NAD83_Hawaii_zone_1"},
+    {26962, "PCS_NAD83_Hawaii_zone_2"},
+    {26963, "PCS_NAD83_Hawaii_zone_3"},
+    {26964, "PCS_NAD83_Hawaii_zone_4"},
+    {26965, "PCS_NAD83_Hawaii_zone_5"},
+    {26966, "PCS_NAD83_Georgia_East"},
+    {26967, "PCS_NAD83_Georgia_West"},
+    {26968, "PCS_NAD83_Idaho_East"},
+    {26969, "PCS_NAD83_Idaho_Central"},
+    {26970, "PCS_NAD83_Idaho_West"},
+    {26971, "PCS_NAD83_Illinois_East"},
+    {26972, "PCS_NAD83_Illinois_West"},
+    {26973, "PCS_NAD83_Indiana_East"},
+    {26974, "PCS_NAD83_Indiana_West"},
+    {26975, "PCS_NAD83_Iowa_North"},
+    {26976, "PCS_NAD83_Iowa_South"},
+    {26977, "PCS_NAD83_Kansas_North"},
+    {26978, "PCS_NAD83_Kansas_South"},
+    {26979, "PCS_NAD83_Kentucky_North"},
+    {26980, "PCS_NAD83_Kentucky_South"},
+    {26981, "PCS_NAD83_Louisiana_North"},
+    {26982, "PCS_NAD83_Louisiana_South"},
+    {26983, "PCS_NAD83_Maine_East"},
+    {26984, "PCS_NAD83_Maine_West"},
+    {26985, "PCS_NAD83_Maryland"},
+    {26986, "PCS_NAD83_Massachusetts"},
+    {26987, "PCS_NAD83_Massachusetts_Is"},
+    {26988, "PCS_NAD83_Michigan_North"},
+    {26989, "PCS_NAD83_Michigan_Central"},
+    {26990, "PCS_NAD83_Michigan_South"},
+    {26991, "PCS_NAD83_Minnesota_North"},
+    {26992, "PCS_NAD83_Minnesota_Cent"},
+    {26993, "PCS_NAD83_Minnesota_South"},
+    {26994, "PCS_NAD83_Mississippi_East"},
+    {26995, "PCS_NAD83_Mississippi_West"},
+    {26996, "PCS_NAD83_Missouri_East"},
+    {26997, "PCS_NAD83_Missouri_Central"},
+    {26998, "PCS_NAD83_Missouri_West"},
+    {27038, "PCS_Nahrwan_1967_UTM_38N"},
+    {27039, "PCS_Nahrwan_1967_UTM_39N"},
+    {27040, "PCS_Nahrwan_1967_UTM_40N"},
+    {27120, "PCS_Naparima_UTM_20N"},
+    {27200, "PCS_GD49_NZ_Map_Grid"},
+    {27291, "PCS_GD49_North_Island_Grid"},
+    {27292, "PCS_GD49_South_Island_Grid"},
+    {27429, "PCS_Datum_73_UTM_zone_29N"},
+    {27500, "PCS_ATF_Nord_de_Guerre"},
+    {27581, "PCS_NTF_France_I"},
+    {27582, "PCS_NTF_France_II"},
+    {27583, "PCS_NTF_France_III"},
+    {27591, "PCS_NTF_Nord_France"},
+    {27592, "PCS_NTF_Centre_France"},
+    {27593, "PCS_NTF_Sud_France"},
+    {27700, "PCS_British_National_Grid"},
+    {28232, "PCS_Point_Noire_UTM_32S"},
+    {28348, "PCS_GDA94_MGA_zone_48"},
+    {28349, "PCS_GDA94_MGA_zone_49"},
+    {28350, "PCS_GDA94_MGA_zone_50"},
+    {28351, "PCS_GDA94_MGA_zone_51"},
+    {28352, "PCS_GDA94_MGA_zone_52"},
+    {28353, "PCS_GDA94_MGA_zone_53"},
+    {28354, "PCS_GDA94_MGA_zone_54"},
+    {28355, "PCS_GDA94_MGA_zone_55"},
+    {28356, "PCS_GDA94_MGA_zone_56"},
+    {28357, "PCS_GDA94_MGA_zone_57"},
+    {28358, "PCS_GDA94_MGA_zone_58"},
+    {28404, "PCS_Pulkovo_Gauss_zone_4"},
+    {28405, "PCS_Pulkovo_Gauss_zone_5"},
+    {28406, "PCS_Pulkovo_Gauss_zone_6"},
+    {28407, "PCS_Pulkovo_Gauss_zone_7"},
+    {28408, "PCS_Pulkovo_Gauss_zone_8"},
+    {28409, "PCS_Pulkovo_Gauss_zone_9"},
+    {28410, "PCS_Pulkovo_Gauss_zone_10"},
+    {28411, "PCS_Pulkovo_Gauss_zone_11"},
+    {28412, "PCS_Pulkovo_Gauss_zone_12"},
+    {28413, "PCS_Pulkovo_Gauss_zone_13"},
+    {28414, "PCS_Pulkovo_Gauss_zone_14"},
+    {28415, "PCS_Pulkovo_Gauss_zone_15"},
+    {28416, "PCS_Pulkovo_Gauss_zone_16"},
+    {28417, "PCS_Pulkovo_Gauss_zone_17"},
+    {28418, "PCS_Pulkovo_Gauss_zone_18"},
+    {28419, "PCS_Pulkovo_Gauss_zone_19"},
+    {28420, "PCS_Pulkovo_Gauss_zone_20"},
+    {28421, "PCS_Pulkovo_Gauss_zone_21"},
+    {28422, "PCS_Pulkovo_Gauss_zone_22"},
+    {28423, "PCS_Pulkovo_Gauss_zone_23"},
+    {28424, "PCS_Pulkovo_Gauss_zone_24"},
+    {28425, "PCS_Pulkovo_Gauss_zone_25"},
+    {28426, "PCS_Pulkovo_Gauss_zone_26"},
+    {28427, "PCS_Pulkovo_Gauss_zone_27"},
+    {28428, "PCS_Pulkovo_Gauss_zone_28"},
+    {28429, "PCS_Pulkovo_Gauss_zone_29"},
+    {28430, "PCS_Pulkovo_Gauss_zone_30"},
+    {28431, "PCS_Pulkovo_Gauss_zone_31"},
+    {28432, "PCS_Pulkovo_Gauss_zone_32"},
+    {28464, "PCS_Pulkovo_Gauss_4N"},
+    {28465, "PCS_Pulkovo_Gauss_5N"},
+    {28466, "PCS_Pulkovo_Gauss_6N"},
+    {28467, "PCS_Pulkovo_Gauss_7N"},
+    {28468, "PCS_Pulkovo_Gauss_8N"},
+    {28469, "PCS_Pulkovo_Gauss_9N"},
+    {28470, "PCS_Pulkovo_Gauss_10N"},
+    {28471, "PCS_Pulkovo_Gauss_11N"},
+    {28472, "PCS_Pulkovo_Gauss_12N"},
+    {28473, "PCS_Pulkovo_Gauss_13N"},
+    {28474, "PCS_Pulkovo_Gauss_14N"},
+    {28475, "PCS_Pulkovo_Gauss_15N"},
+    {28476, "PCS_Pulkovo_Gauss_16N"},
+    {28477, "PCS_Pulkovo_Gauss_17N"},
+    {28478, "PCS_Pulkovo_Gauss_18N"},
+    {28479, "PCS_Pulkovo_Gauss_19N"},
+    {28480, "PCS_Pulkovo_Gauss_20N"},
+    {28481, "PCS_Pulkovo_Gauss_21N"},
+    {28482, "PCS_Pulkovo_Gauss_22N"},
+    {28483, "PCS_Pulkovo_Gauss_23N"},
+    {28484, "PCS_Pulkovo_Gauss_24N"},
+    {28485, "PCS_Pulkovo_Gauss_25N"},
+    {28486, "PCS_Pulkovo_Gauss_26N"},
+    {28487, "PCS_Pulkovo_Gauss_27N"},
+    {28488, "PCS_Pulkovo_Gauss_28N"},
+    {28489, "PCS_Pulkovo_Gauss_29N"},
+    {28490, "PCS_Pulkovo_Gauss_30N"},
+    {28491, "PCS_Pulkovo_Gauss_31N"},
+    {28492, "PCS_Pulkovo_Gauss_32N"},
+    {28600, "PCS_Qatar_National_Grid"},
+    {28991, "PCS_RD_Netherlands_Old"},
+    {28992, "PCS_RD_Netherlands_New"},
+    {29118, "PCS_SAD69_UTM_zone_18N"},
+    {29119, "PCS_SAD69_UTM_zone_19N"},
+    {29120, "PCS_SAD69_UTM_zone_20N"},
+    {29121, "PCS_SAD69_UTM_zone_21N"},
+    {29122, "PCS_SAD69_UTM_zone_22N"},
+    {29177, "PCS_SAD69_UTM_zone_17S"},
+    {29178, "PCS_SAD69_UTM_zone_18S"},
+    {29179, "PCS_SAD69_UTM_zone_19S"},
+    {29180, "PCS_SAD69_UTM_zone_20S"},
+    {29181, "PCS_SAD69_UTM_zone_21S"},
+    {29182, "PCS_SAD69_UTM_zone_22S"},
+    {29183, "PCS_SAD69_UTM_zone_23S"},
+    {29184, "PCS_SAD69_UTM_zone_24S"},
+    {29185, "PCS_SAD69_UTM_zone_25S"},
+    {29220, "PCS_Sapper_Hill_UTM_20S"},
+    {29221, "PCS_Sapper_Hill_UTM_21S"},
+    {29333, "PCS_Schwarzeck_UTM_33S"},
+    {29635, "PCS_Sudan_UTM_zone_35N"},
+    {29636, "PCS_Sudan_UTM_zone_36N"},
+    {29700, "PCS_Tananarive_Laborde"},
+    {29738, "PCS_Tananarive_UTM_38S"},
+    {29739, "PCS_Tananarive_UTM_39S"},
+    {29800, "PCS_Timbalai_1948_Borneo"},
+    {29849, "PCS_Timbalai_1948_UTM_49N"},
+    {29850, "PCS_Timbalai_1948_UTM_50N"},
+    {29900, "PCS_TM65_Irish_Nat_Grid"},
+    {30200, "PCS_Trinidad_1903_Trinidad"},
+    {30339, "PCS_TC_1948_UTM_zone_39N"},
+    {30340, "PCS_TC_1948_UTM_zone_40N"},
+    {30491, "PCS_Voirol_N_Algerie_ancien"},
+    {30492, "PCS_Voirol_S_Algerie_ancien"},
+    {30591, "PCS_Voirol_Unifie_N_Algerie"},
+    {30592, "PCS_Voirol_Unifie_S_Algerie"},
+    {30600, "PCS_Bern_1938_Swiss_New"},
+    {30729, "PCS_Nord_Sahara_UTM_29N"},
+    {30730, "PCS_Nord_Sahara_UTM_30N"},
+    {30731, "PCS_Nord_Sahara_UTM_31N"},
+    {30732, "PCS_Nord_Sahara_UTM_32N"},
+    {31028, "PCS_Yoff_UTM_zone_28N"},
+    {31121, "PCS_Zanderij_UTM_zone_21N"},
+    {31291, "PCS_MGI_Austria_West"},
+    {31292, "PCS_MGI_Austria_Central"},
+    {31293, "PCS_MGI_Austria_East"},
+    {31300, "PCS_Belge_Lambert_72"},
+    {31491, "PCS_DHDN_Germany_zone_1"},
+    {31492, "PCS_DHDN_Germany_zone_2"},
+    {31493, "PCS_DHDN_Germany_zone_3"},
+    {31494, "PCS_DHDN_Germany_zone_4"},
+    {31495, "PCS_DHDN_Germany_zone_5"},
+    {32001, "PCS_NAD27_Montana_North"},
+    {32002, "PCS_NAD27_Montana_Central"},
+    {32003, "PCS_NAD27_Montana_South"},
+    {32005, "PCS_NAD27_Nebraska_North"},
+    {32006, "PCS_NAD27_Nebraska_South"},
+    {32007, "PCS_NAD27_Nevada_East"},
+    {32008, "PCS_NAD27_Nevada_Central"},
+    {32009, "PCS_NAD27_Nevada_West"},
+    {32010, "PCS_NAD27_New_Hampshire"},
+    {32011, "PCS_NAD27_New_Jersey"},
+    {32012, "PCS_NAD27_New_Mexico_East"},
+    {32013, "PCS_NAD27_New_Mexico_Cent"},
+    {32014, "PCS_NAD27_New_Mexico_West"},
+    {32015, "PCS_NAD27_New_York_East"},
+    {32016, "PCS_NAD27_New_York_Central"},
+    {32017, "PCS_NAD27_New_York_West"},
+    {32018, "PCS_NAD27_New_York_Long_Is"},
+    {32019, "PCS_NAD27_North_Carolina"},
+    {32020, "PCS_NAD27_North_Dakota_N"},
+    {32021, "PCS_NAD27_North_Dakota_S"},
+    {32022, "PCS_NAD27_Ohio_North"},
+    {32023, "PCS_NAD27_Ohio_South"},
+    {32024, "PCS_NAD27_Oklahoma_North"},
+    {32025, "PCS_NAD27_Oklahoma_South"},
+    {32026, "PCS_NAD27_Oregon_North"},
+    {32027, "PCS_NAD27_Oregon_South"},
+    {32028, "PCS_NAD27_Pennsylvania_N"},
+    {32029, "PCS_NAD27_Pennsylvania_S"},
+    {32030, "PCS_NAD27_Rhode_Island"},
+    {32031, "PCS_NAD27_South_Carolina_N"},
+    {32033, "PCS_NAD27_South_Carolina_S"},
+    {32034, "PCS_NAD27_South_Dakota_N"},
+    {32035, "PCS_NAD27_South_Dakota_S"},
+    {32036, "PCS_NAD27_Tennessee"},
+    {32037, "PCS_NAD27_Texas_North"},
+    {32038, "PCS_NAD27_Texas_North_Cen"},
+    {32039, "PCS_NAD27_Texas_Central"},
+    {32040, "PCS_NAD27_Texas_South_Cen"},
+    {32041, "PCS_NAD27_Texas_South"},
+    {32042, "PCS_NAD27_Utah_North"},
+    {32043, "PCS_NAD27_Utah_Central"},
+    {32044, "PCS_NAD27_Utah_South"},
+    {32045, "PCS_NAD27_Vermont"},
+    {32046, "PCS_NAD27_Virginia_North"},
+    {32047, "PCS_NAD27_Virginia_South"},
+    {32048, "PCS_NAD27_Washington_North"},
+    {32049, "PCS_NAD27_Washington_South"},
+    {32050, "PCS_NAD27_West_Virginia_N"},
+    {32051, "PCS_NAD27_West_Virginia_S"},
+    {32052, "PCS_NAD27_Wisconsin_North"},
+    {32053, "PCS_NAD27_Wisconsin_Cen"},
+    {32054, "PCS_NAD27_Wisconsin_South"},
+    {32055, "PCS_NAD27_Wyoming_East"},
+    {32056, "PCS_NAD27_Wyoming_E_Cen"},
+    {32057, "PCS_NAD27_Wyoming_W_Cen"},
+    {32058, "PCS_NAD27_Wyoming_West"},
+    {32059, "PCS_NAD27_Puerto_Rico"},
+    {32060, "PCS_NAD27_St_Croix"},
+    {32100, "PCS_NAD83_Montana"},
+    {32104, "PCS_NAD83_Nebraska"},
+    {32107, "PCS_NAD83_Nevada_East"},
+    {32108, "PCS_NAD83_Nevada_Central"},
+    {32109, "PCS_NAD83_Nevada_West"},
+    {32110, "PCS_NAD83_New_Hampshire"},
+    {32111, "PCS_NAD83_New_Jersey"},
+    {32112, "PCS_NAD83_New_Mexico_East"},
+    {32113, "PCS_NAD83_New_Mexico_Cent"},
+    {32114, "PCS_NAD83_New_Mexico_West"},
+    {32115, "PCS_NAD83_New_York_East"},
+    {32116, "PCS_NAD83_New_York_Central"},
+    {32117, "PCS_NAD83_New_York_West"},
+    {32118, "PCS_NAD83_New_York_Long_Is"},
+    {32119, "PCS_NAD83_North_Carolina"},
+    {32120, "PCS_NAD83_North_Dakota_N"},
+    {32121, "PCS_NAD83_North_Dakota_S"},
+    {32122, "PCS_NAD83_Ohio_North"},
+    {32123, "PCS_NAD83_Ohio_South"},
+    {32124, "PCS_NAD83_Oklahoma_North"},
+    {32125, "PCS_NAD83_Oklahoma_South"},
+    {32126, "PCS_NAD83_Oregon_North"},
+    {32127, "PCS_NAD83_Oregon_South"},
+    {32128, "PCS_NAD83_Pennsylvania_N"},
+    {32129, "PCS_NAD83_Pennsylvania_S"},
+    {32130, "PCS_NAD83_Rhode_Island"},
+    {32133, "PCS_NAD83_South_Carolina"},
+    {32134, "PCS_NAD83_South_Dakota_N"},
+    {32135, "PCS_NAD83_South_Dakota_S"},
+    {32136, "PCS_NAD83_Tennessee"},
+    {32137, "PCS_NAD83_Texas_North"},
+    {32138, "PCS_NAD83_Texas_North_Cen"},
+    {32139, "PCS_NAD83_Texas_Central"},
+    {32140, "PCS_NAD83_Texas_South_Cen"},
+    {32141, "PCS_NAD83_Texas_South"},
+    {32142, "PCS_NAD83_Utah_North"},
+    {32143, "PCS_NAD83_Utah_Central"},
+    {32144, "PCS_NAD83_Utah_South"},
+    {32145, "PCS_NAD83_Vermont"},
+    {32146, "PCS_NAD83_Virginia_North"},
+    {32147, "PCS_NAD83_Virginia_South"},
+    {32148, "PCS_NAD83_Washington_North"},
+    {32149, "PCS_NAD83_Washington_South"},
+    {32150, "PCS_NAD83_West_Virginia_N"},
+    {32151, "PCS_NAD83_West_Virginia_S"},
+    {32152, "PCS_NAD83_Wisconsin_North"},
+    {32153, "PCS_NAD83_Wisconsin_Cen"},
+    {32154, "PCS_NAD83_Wisconsin_South"},
+    {32155, "PCS_NAD83_Wyoming_East"},
+    {32156, "PCS_NAD83_Wyoming_E_Cen"},
+    {32157, "PCS_NAD83_Wyoming_W_Cen"},
+    {32158, "PCS_NAD83_Wyoming_West"},
+    {32161, "PCS_NAD83_Puerto_Rico_Virgin_Is"},
+    {32201, "PCS_WGS72_UTM_zone_1N"},
+    {32202, "PCS_WGS72_UTM_zone_2N"},
+    {32203, "PCS_WGS72_UTM_zone_3N"},
+    {32204, "PCS_WGS72_UTM_zone_4N"},
+    {32205, "PCS_WGS72_UTM_zone_5N"},
+    {32206, "PCS_WGS72_UTM_zone_6N"},
+    {32207, "PCS_WGS72_UTM_zone_7N"},
+    {32208, "PCS_WGS72_UTM_zone_8N"},
+    {32209, "PCS_WGS72_UTM_zone_9N"},
+    {32210, "PCS_WGS72_UTM_zone_10N"},
+    {32211, "PCS_WGS72_UTM_zone_11N"},
+    {32212, "PCS_WGS72_UTM_zone_12N"},
+    {32213, "PCS_WGS72_UTM_zone_13N"},
+    {32214, "PCS_WGS72_UTM_zone_14N"},
+    {32215, "PCS_WGS72_UTM_zone_15N"},
+    {32216, "PCS_WGS72_UTM_zone_16N"},
+    {32217, "PCS_WGS72_UTM_zone_17N"},
+    {32218, "PCS_WGS72_UTM_zone_18N"},
+    {32219, "PCS_WGS72_UTM_zone_19N"},
+    {32220, "PCS_WGS72_UTM_zone_20N"},
+    {32221, "PCS_WGS72_UTM_zone_21N"},
+    {32222, "PCS_WGS72_UTM_zone_22N"},
+    {32223, "PCS_WGS72_UTM_zone_23N"},
+    {32224, "PCS_WGS72_UTM_zone_24N"},
+    {32225, "PCS_WGS72_UTM_zone_25N"},
+    {32226, "PCS_WGS72_UTM_zone_26N"},
+    {32227, "PCS_WGS72_UTM_zone_27N"},
+    {32228, "PCS_WGS72_UTM_zone_28N"},
+    {32229, "PCS_WGS72_UTM_zone_29N"},
+    {32230, "PCS_WGS72_UTM_zone_30N"},
+    {32231, "PCS_WGS72_UTM_zone_31N"},
+    {32232, "PCS_WGS72_UTM_zone_32N"},
+    {32233, "PCS_WGS72_UTM_zone_33N"},
+    {32234, "PCS_WGS72_UTM_zone_34N"},
+    {32235, "PCS_WGS72_UTM_zone_35N"},
+    {32236, "PCS_WGS72_UTM_zone_36N"},
+    {32237, "PCS_WGS72_UTM_zone_37N"},
+    {32238, "PCS_WGS72_UTM_zone_38N"},
+    {32239, "PCS_WGS72_UTM_zone_39N"},
+    {32240, "PCS_WGS72_UTM_zone_40N"},
+    {32241, "PCS_WGS72_UTM_zone_41N"},
+    {32242, "PCS_WGS72_UTM_zone_42N"},
+    {32243, "PCS_WGS72_UTM_zone_43N"},
+    {32244, "PCS_WGS72_UTM_zone_44N"},
+    {32245, "PCS_WGS72_UTM_zone_45N"},
+    {32246, "PCS_WGS72_UTM_zone_46N"},
+    {32247, "PCS_WGS72_UTM_zone_47N"},
+    {32248, "PCS_WGS72_UTM_zone_48N"},
+    {32249, "PCS_WGS72_UTM_zone_49N"},
+    {32250, "PCS_WGS72_UTM_zone_50N"},
+    {32251, "PCS_WGS72_UTM_zone_51N"},
+    {32252, "PCS_WGS72_UTM_zone_52N"},
+    {32253, "PCS_WGS72_UTM_zone_53N"},
+    {32254, "PCS_WGS72_UTM_zone_54N"},
+    {32255, "PCS_WGS72_UTM_zone_55N"},
+    {32256, "PCS_WGS72_UTM_zone_56N"},
+    {32257, "PCS_WGS72_UTM_zone_57N"},
+    {32258, "PCS_WGS72_UTM_zone_58N"},
+    {32259, "PCS_WGS72_UTM_zone_59N"},
+    {32260, "PCS_WGS72_UTM_zone_60N"},
+    {32301, "PCS_WGS72_UTM_zone_1S"},
+    {32302, "PCS_WGS72_UTM_zone_2S"},
+    {32303, "PCS_WGS72_UTM_zone_3S"},
+    {32304, "PCS_WGS72_UTM_zone_4S"},
+    {32305, "PCS_WGS72_UTM_zone_5S"},
+    {32306, "PCS_WGS72_UTM_zone_6S"},
+    {32307, "PCS_WGS72_UTM_zone_7S"},
+    {32308, "PCS_WGS72_UTM_zone_8S"},
+    {32309, "PCS_WGS72_UTM_zone_9S"},
+    {32310, "PCS_WGS72_UTM_zone_10S"},
+    {32311, "PCS_WGS72_UTM_zone_11S"},
+    {32312, "PCS_WGS72_UTM_zone_12S"},
+    {32313, "PCS_WGS72_UTM_zone_13S"},
+    {32314, "PCS_WGS72_UTM_zone_14S"},
+    {32315, "PCS_WGS72_UTM_zone_15S"},
+    {32316, "PCS_WGS72_UTM_zone_16S"},
+    {32317, "PCS_WGS72_UTM_zone_17S"},
+    {32318, "PCS_WGS72_UTM_zone_18S"},
+    {32319, "PCS_WGS72_UTM_zone_19S"},
+    {32320, "PCS_WGS72_UTM_zone_20S"},
+    {32321, "PCS_WGS72_UTM_zone_21S"},
+    {32322, "PCS_WGS72_UTM_zone_22S"},
+    {32323, "PCS_WGS72_UTM_zone_23S"},
+    {32324, "PCS_WGS72_UTM_zone_24S"},
+    {32325, "PCS_WGS72_UTM_zone_25S"},
+    {32326, "PCS_WGS72_UTM_zone_26S"},
+    {32327, "PCS_WGS72_UTM_zone_27S"},
+    {32328, "PCS_WGS72_UTM_zone_28S"},
+    {32329, "PCS_WGS72_UTM_zone_29S"},
+    {32330, "PCS_WGS72_UTM_zone_30S"},
+    {32331, "PCS_WGS72_UTM_zone_31S"},
+    {32332, "PCS_WGS72_UTM_zone_32S"},
+    {32333, "PCS_WGS72_UTM_zone_33S"},
+    {32334, "PCS_WGS72_UTM_zone_34S"},
+    {32335, "PCS_WGS72_UTM_zone_35S"},
+    {32336, "PCS_WGS72_UTM_zone_36S"},
+    {32337, "PCS_WGS72_UTM_zone_37S"},
+    {32338, "PCS_WGS72_UTM_zone_38S"},
+    {32339, "PCS_WGS72_UTM_zone_39S"},
+    {32340, "PCS_WGS72_UTM_zone_40S"},
+    {32341, "PCS_WGS72_UTM_zone_41S"},
+    {32342, "PCS_WGS72_UTM_zone_42S"},
+    {32343, "PCS_WGS72_UTM_zone_43S"},
+    {32344, "PCS_WGS72_UTM_zone_44S"},
+    {32345, "PCS_WGS72_UTM_zone_45S"},
+    {32346, "PCS_WGS72_UTM_zone_46S"},
+    {32347, "PCS_WGS72_UTM_zone_47S"},
+    {32348, "PCS_WGS72_UTM_zone_48S"},
+    {32349, "PCS_WGS72_UTM_zone_49S"},
+    {32350, "PCS_WGS72_UTM_zone_50S"},
+    {32351, "PCS_WGS72_UTM_zone_51S"},
+    {32352, "PCS_WGS72_UTM_zone_52S"},
+    {32353, "PCS_WGS72_UTM_zone_53S"},
+    {32354, "PCS_WGS72_UTM_zone_54S"},
+    {32355, "PCS_WGS72_UTM_zone_55S"},
+    {32356, "PCS_WGS72_UTM_zone_56S"},
+    {32357, "PCS_WGS72_UTM_zone_57S"},
+    {32358, "PCS_WGS72_UTM_zone_58S"},
+    {32359, "PCS_WGS72_UTM_zone_59S"},
+    {32360, "PCS_WGS72_UTM_zone_60S"},
+    {32401, "PCS_WGS72BE_UTM_zone_1N"},
+    {32402, "PCS_WGS72BE_UTM_zone_2N"},
+    {32403, "PCS_WGS72BE_UTM_zone_3N"},
+    {32404, "PCS_WGS72BE_UTM_zone_4N"},
+    {32405, "PCS_WGS72BE_UTM_zone_5N"},
+    {32406, "PCS_WGS72BE_UTM_zone_6N"},
+    {32407, "PCS_WGS72BE_UTM_zone_7N"},
+    {32408, "PCS_WGS72BE_UTM_zone_8N"},
+    {32409, "PCS_WGS72BE_UTM_zone_9N"},
+    {32410, "PCS_WGS72BE_UTM_zone_10N"},
+    {32411, "PCS_WGS72BE_UTM_zone_11N"},
+    {32412, "PCS_WGS72BE_UTM_zone_12N"},
+    {32413, "PCS_WGS72BE_UTM_zone_13N"},
+    {32414, "PCS_WGS72BE_UTM_zone_14N"},
+    {32415, "PCS_WGS72BE_UTM_zone_15N"},
+    {32416, "PCS_WGS72BE_UTM_zone_16N"},
+    {32417, "PCS_WGS72BE_UTM_zone_17N"},
+    {32418, "PCS_WGS72BE_UTM_zone_18N"},
+    {32419, "PCS_WGS72BE_UTM_zone_19N"},
+    {32420, "PCS_WGS72BE_UTM_zone_20N"},
+    {32421, "PCS_WGS72BE_UTM_zone_21N"},
+    {32422, "PCS_WGS72BE_UTM_zone_22N"},
+    {32423, "PCS_WGS72BE_UTM_zone_23N"},
+    {32424, "PCS_WGS72BE_UTM_zone_24N"},
+    {32425, "PCS_WGS72BE_UTM_zone_25N"},
+    {32426, "PCS_WGS72BE_UTM_zone_26N"},
+    {32427, "PCS_WGS72BE_UTM_zone_27N"},
+    {32428, "PCS_WGS72BE_UTM_zone_28N"},
+    {32429, "PCS_WGS72BE_UTM_zone_29N"},
+    {32430, "PCS_WGS72BE_UTM_zone_30N"},
+    {32431, "PCS_WGS72BE_UTM_zone_31N"},
+    {32432, "PCS_WGS72BE_UTM_zone_32N"},
+    {32433, "PCS_WGS72BE_UTM_zone_33N"},
+    {32434, "PCS_WGS72BE_UTM_zone_34N"},
+    {32435, "PCS_WGS72BE_UTM_zone_35N"},
+    {32436, "PCS_WGS72BE_UTM_zone_36N"},
+    {32437, "PCS_WGS72BE_UTM_zone_37N"},
+    {32438, "PCS_WGS72BE_UTM_zone_38N"},
+    {32439, "PCS_WGS72BE_UTM_zone_39N"},
+    {32440, "PCS_WGS72BE_UTM_zone_40N"},
+    {32441, "PCS_WGS72BE_UTM_zone_41N"},
+    {32442, "PCS_WGS72BE_UTM_zone_42N"},
+    {32443, "PCS_WGS72BE_UTM_zone_43N"},
+    {32444, "PCS_WGS72BE_UTM_zone_44N"},
+    {32445, "PCS_WGS72BE_UTM_zone_45N"},
+    {32446, "PCS_WGS72BE_UTM_zone_46N"},
+    {32447, "PCS_WGS72BE_UTM_zone_47N"},
+    {32448, "PCS_WGS72BE_UTM_zone_48N"},
+    {32449, "PCS_WGS72BE_UTM_zone_49N"},
+    {32450, "PCS_WGS72BE_UTM_zone_50N"},
+    {32451, "PCS_WGS72BE_UTM_zone_51N"},
+    {32452, "PCS_WGS72BE_UTM_zone_52N"},
+    {32453, "PCS_WGS72BE_UTM_zone_53N"},
+    {32454, "PCS_WGS72BE_UTM_zone_54N"},
+    {32455, "PCS_WGS72BE_UTM_zone_55N"},
+    {32456, "PCS_WGS72BE_UTM_zone_56N"},
+    {32457, "PCS_WGS72BE_UTM_zone_57N"},
+    {32458, "PCS_WGS72BE_UTM_zone_58N"},
+    {32459, "PCS_WGS72BE_UTM_zone_59N"},
+    {32460, "PCS_WGS72BE_UTM_zone_60N"},
+    {32501, "PCS_WGS72BE_UTM_zone_1S"},
+    {32502, "PCS_WGS72BE_UTM_zone_2S"},
+    {32503, "PCS_WGS72BE_UTM_zone_3S"},
+    {32504, "PCS_WGS72BE_UTM_zone_4S"},
+    {32505, "PCS_WGS72BE_UTM_zone_5S"},
+    {32506, "PCS_WGS72BE_UTM_zone_6S"},
+    {32507, "PCS_WGS72BE_UTM_zone_7S"},
+    {32508, "PCS_WGS72BE_UTM_zone_8S"},
+    {32509, "PCS_WGS72BE_UTM_zone_9S"},
+    {32510, "PCS_WGS72BE_UTM_zone_10S"},
+    {32511, "PCS_WGS72BE_UTM_zone_11S"},
+    {32512, "PCS_WGS72BE_UTM_zone_12S"},
+    {32513, "PCS_WGS72BE_UTM_zone_13S"},
+    {32514, "PCS_WGS72BE_UTM_zone_14S"},
+    {32515, "PCS_WGS72BE_UTM_zone_15S"},
+    {32516, "PCS_WGS72BE_UTM_zone_16S"},
+    {32517, "PCS_WGS72BE_UTM_zone_17S"},
+    {32518, "PCS_WGS72BE_UTM_zone_18S"},
+    {32519, "PCS_WGS72BE_UTM_zone_19S"},
+    {32520, "PCS_WGS72BE_UTM_zone_20S"},
+    {32521, "PCS_WGS72BE_UTM_zone_21S"},
+    {32522, "PCS_WGS72BE_UTM_zone_22S"},
+    {32523, "PCS_WGS72BE_UTM_zone_23S"},
+    {32524, "PCS_WGS72BE_UTM_zone_24S"},
+    {32525, "PCS_WGS72BE_UTM_zone_25S"},
+    {32526, "PCS_WGS72BE_UTM_zone_26S"},
+    {32527, "PCS_WGS72BE_UTM_zone_27S"},
+    {32528, "PCS_WGS72BE_UTM_zone_28S"},
+    {32529, "PCS_WGS72BE_UTM_zone_29S"},
+    {32530, "PCS_WGS72BE_UTM_zone_30S"},
+    {32531, "PCS_WGS72BE_UTM_zone_31S"},
+    {32532, "PCS_WGS72BE_UTM_zone_32S"},
+    {32533, "PCS_WGS72BE_UTM_zone_33S"},
+    {32534, "PCS_WGS72BE_UTM_zone_34S"},
+    {32535, "PCS_WGS72BE_UTM_zone_35S"},
+    {32536, "PCS_WGS72BE_UTM_zone_36S"},
+    {32537, "PCS_WGS72BE_UTM_zone_37S"},
+    {32538, "PCS_WGS72BE_UTM_zone_38S"},
+    {32539, "PCS_WGS72BE_UTM_zone_39S"},
+    {32540, "PCS_WGS72BE_UTM_zone_40S"},
+    {32541, "PCS_WGS72BE_UTM_zone_41S"},
+    {32542, "PCS_WGS72BE_UTM_zone_42S"},
+    {32543, "PCS_WGS72BE_UTM_zone_43S"},
+    {32544, "PCS_WGS72BE_UTM_zone_44S"},
+    {32545, "PCS_WGS72BE_UTM_zone_45S"},
+    {32546, "PCS_WGS72BE_UTM_zone_46S"},
+    {32547, "PCS_WGS72BE_UTM_zone_47S"},
+    {32548, "PCS_WGS72BE_UTM_zone_48S"},
+    {32549, "PCS_WGS72BE_UTM_zone_49S"},
+    {32550, "PCS_WGS72BE_UTM_zone_50S"},
+    {32551, "PCS_WGS72BE_UTM_zone_51S"},
+    {32552, "PCS_WGS72BE_UTM_zone_52S"},
+    {32553, "PCS_WGS72BE_UTM_zone_53S"},
+    {32554, "PCS_WGS72BE_UTM_zone_54S"},
+    {32555, "PCS_WGS72BE_UTM_zone_55S"},
+    {32556, "PCS_WGS72BE_UTM_zone_56S"},
+    {32557, "PCS_WGS72BE_UTM_zone_57S"},
+    {32558, "PCS_WGS72BE_UTM_zone_58S"},
+    {32559, "PCS_WGS72BE_UTM_zone_59S"},
+    {32560, "PCS_WGS72BE_UTM_zone_60S"},
+    {32601, "PCS_WGS84_UTM_zone_1N"},
+    {32602, "PCS_WGS84_UTM_zone_2N"},
+    {32603, "PCS_WGS84_UTM_zone_3N"},
+    {32604, "PCS_WGS84_UTM_zone_4N"},
+    {32605, "PCS_WGS84_UTM_zone_5N"},
+    {32606, "PCS_WGS84_UTM_zone_6N"},
+    {32607, "PCS_WGS84_UTM_zone_7N"},
+    {32608, "PCS_WGS84_UTM_zone_8N"},
+    {32609, "PCS_WGS84_UTM_zone_9N"},
+    {32610, "PCS_WGS84_UTM_zone_10N"},
+    {32611, "PCS_WGS84_UTM_zone_11N"},
+    {32612, "PCS_WGS84_UTM_zone_12N"},
+    {32613, "PCS_WGS84_UTM_zone_13N"},
+    {32614, "PCS_WGS84_UTM_zone_14N"},
+    {32615, "PCS_WGS84_UTM_zone_15N"},
+    {32616, "PCS_WGS84_UTM_zone_16N"},
+    {32617, "PCS_WGS84_UTM_zone_17N"},
+    {32618, "PCS_WGS84_UTM_zone_18N"},
+    {32619, "PCS_WGS84_UTM_zone_19N"},
+    {32620, "PCS_WGS84_UTM_zone_20N"},
+    {32621, "PCS_WGS84_UTM_zone_21N"},
+    {32622, "PCS_WGS84_UTM_zone_22N"},
+    {32623, "PCS_WGS84_UTM_zone_23N"},
+    {32624, "PCS_WGS84_UTM_zone_24N"},
+    {32625, "PCS_WGS84_UTM_zone_25N"},
+    {32626, "PCS_WGS84_UTM_zone_26N"},
+    {32627, "PCS_WGS84_UTM_zone_27N"},
+    {32628, "PCS_WGS84_UTM_zone_28N"},
+    {32629, "PCS_WGS84_UTM_zone_29N"},
+    {32630, "PCS_WGS84_UTM_zone_30N"},
+    {32631, "PCS_WGS84_UTM_zone_31N"},
+    {32632, "PCS_WGS84_UTM_zone_32N"},
+    {32633, "PCS_WGS84_UTM_zone_33N"},
+    {32634, "PCS_WGS84_UTM_zone_34N"},
+    {32635, "PCS_WGS84_UTM_zone_35N"},
+    {32636, "PCS_WGS84_UTM_zone_36N"},
+    {32637, "PCS_WGS84_UTM_zone_37N"},
+    {32638, "PCS_WGS84_UTM_zone_38N"},
+    {32639, "PCS_WGS84_UTM_zone_39N"},
+    {32640, "PCS_WGS84_UTM_zone_40N"},
+    {32641, "PCS_WGS84_UTM_zone_41N"},
+    {32642, "PCS_WGS84_UTM_zone_42N"},
+    {32643, "PCS_WGS84_UTM_zone_43N"},
+    {32644, "PCS_WGS84_UTM_zone_44N"},
+    {32645, "PCS_WGS84_UTM_zone_45N"},
+    {32646, "PCS_WGS84_UTM_zone_46N"},
+    {32647, "PCS_WGS84_UTM_zone_47N"},
+    {32648, "PCS_WGS84_UTM_zone_48N"},
+    {32649, "PCS_WGS84_UTM_zone_49N"},
+    {32650, "PCS_WGS84_UTM_zone_50N"},
+    {32651, "PCS_WGS84_UTM_zone_51N"},
+    {32652, "PCS_WGS84_UTM_zone_52N"},
+    {32653, "PCS_WGS84_UTM_zone_53N"},
+    {32654, "PCS_WGS84_UTM_zone_54N"},
+    {32655, "PCS_WGS84_UTM_zone_55N"},
+    {32656, "PCS_WGS84_UTM_zone_56N"},
+    {32657, "PCS_WGS84_UTM_zone_57N"},
+    {32658, "PCS_WGS84_UTM_zone_58N"},
+    {32659, "PCS_WGS84_UTM_zone_59N"},
+    {32660, "PCS_WGS84_UTM_zone_60N"},
+    {32701, "PCS_WGS84_UTM_zone_1S"},
+    {32702, "PCS_WGS84_UTM_zone_2S"},
+    {32703, "PCS_WGS84_UTM_zone_3S"},
+    {32704, "PCS_WGS84_UTM_zone_4S"},
+    {32705, "PCS_WGS84_UTM_zone_5S"},
+    {32706, "PCS_WGS84_UTM_zone_6S"},
+    {32707, "PCS_WGS84_UTM_zone_7S"},
+    {32708, "PCS_WGS84_UTM_zone_8S"},
+    {32709, "PCS_WGS84_UTM_zone_9S"},
+    {32710, "PCS_WGS84_UTM_zone_10S"},
+    {32711, "PCS_WGS84_UTM_zone_11S"},
+    {32712, "PCS_WGS84_UTM_zone_12S"},
+    {32713, "PCS_WGS84_UTM_zone_13S"},
+    {32714, "PCS_WGS84_UTM_zone_14S"},
+    {32715, "PCS_WGS84_UTM_zone_15S"},
+    {32716, "PCS_WGS84_UTM_zone_16S"},
+    {32717, "PCS_WGS84_UTM_zone_17S"},
+    {32718, "PCS_WGS84_UTM_zone_18S"},
+    {32719, "PCS_WGS84_UTM_zone_19S"},
+    {32720, "PCS_WGS84_UTM_zone_20S"},
+    {32721, "PCS_WGS84_UTM_zone_21S"},
+    {32722, "PCS_WGS84_UTM_zone_22S"},
+    {32723, "PCS_WGS84_UTM_zone_23S"},
+    {32724, "PCS_WGS84_UTM_zone_24S"},
+    {32725, "PCS_WGS84_UTM_zone_25S"},
+    {32726, "PCS_WGS84_UTM_zone_26S"},
+    {32727, "PCS_WGS84_UTM_zone_27S"},
+    {32728, "PCS_WGS84_UTM_zone_28S"},
+    {32729, "PCS_WGS84_UTM_zone_29S"},
+    {32730, "PCS_WGS84_UTM_zone_30S"},
+    {32731, "PCS_WGS84_UTM_zone_31S"},
+    {32732, "PCS_WGS84_UTM_zone_32S"},
+    {32733, "PCS_WGS84_UTM_zone_33S"},
+    {32734, "PCS_WGS84_UTM_zone_34S"},
+    {32735, "PCS_WGS84_UTM_zone_35S"},
+    {32736, "PCS_WGS84_UTM_zone_36S"},
+    {32737, "PCS_WGS84_UTM_zone_37S"},
+    {32738, "PCS_WGS84_UTM_zone_38S"},
+    {32739, "PCS_WGS84_UTM_zone_39S"},
+    {32740, "PCS_WGS84_UTM_zone_40S"},
+    {32741, "PCS_WGS84_UTM_zone_41S"},
+    {32742, "PCS_WGS84_UTM_zone_42S"},
+    {32743, "PCS_WGS84_UTM_zone_43S"},
+    {32744, "PCS_WGS84_UTM_zone_44S"},
+    {32745, "PCS_WGS84_UTM_zone_45S"},
+    {32746, "PCS_WGS84_UTM_zone_46S"},
+    {32747, "PCS_WGS84_UTM_zone_47S"},
+    {32748, "PCS_WGS84_UTM_zone_48S"},
+    {32749, "PCS_WGS84_UTM_zone_49S"},
+    {32750, "PCS_WGS84_UTM_zone_50S"},
+    {32751, "PCS_WGS84_UTM_zone_51S"},
+    {32752, "PCS_WGS84_UTM_zone_52S"},
+    {32753, "PCS_WGS84_UTM_zone_53S"},
+    {32754, "PCS_WGS84_UTM_zone_54S"},
+    {32755, "PCS_WGS84_UTM_zone_55S"},
+    {32756, "PCS_WGS84_UTM_zone_56S"},
+    {32757, "PCS_WGS84_UTM_zone_57S"},
+    {32758, "PCS_WGS84_UTM_zone_58S"},
+    {32759, "PCS_WGS84_UTM_zone_59S"},
+    {32760, "PCS_WGS84_UTM_zone_60S"}
+};
+
+const TiffGeoTagKeyName ff_tiff_projection_codes[] = {
+    {10101, "Proj_Alabama_CS27_East"},
+    {10102, "Proj_Alabama_CS27_West"},
+    {10131, "Proj_Alabama_CS83_East"},
+    {10132, "Proj_Alabama_CS83_West"},
+    {10201, "Proj_Arizona_Coordinate_System_east"},
+    {10202, "Proj_Arizona_Coordinate_System_Central"},
+    {10203, "Proj_Arizona_Coordinate_System_west"},
+    {10231, "Proj_Arizona_CS83_east"},
+    {10232, "Proj_Arizona_CS83_Central"},
+    {10233, "Proj_Arizona_CS83_west"},
+    {10301, "Proj_Arkansas_CS27_North"},
+    {10302, "Proj_Arkansas_CS27_South"},
+    {10331, "Proj_Arkansas_CS83_North"},
+    {10332, "Proj_Arkansas_CS83_South"},
+    {10401, "Proj_California_CS27_I"},
+    {10402, "Proj_California_CS27_II"},
+    {10403, "Proj_California_CS27_III"},
+    {10404, "Proj_California_CS27_IV"},
+    {10405, "Proj_California_CS27_V"},
+    {10406, "Proj_California_CS27_VI"},
+    {10407, "Proj_California_CS27_VII"},
+    {10431, "Proj_California_CS83_1"},
+    {10432, "Proj_California_CS83_2"},
+    {10433, "Proj_California_CS83_3"},
+    {10434, "Proj_California_CS83_4"},
+    {10435, "Proj_California_CS83_5"},
+    {10436, "Proj_California_CS83_6"},
+    {10501, "Proj_Colorado_CS27_North"},
+    {10502, "Proj_Colorado_CS27_Central"},
+    {10503, "Proj_Colorado_CS27_South"},
+    {10531, "Proj_Colorado_CS83_North"},
+    {10532, "Proj_Colorado_CS83_Central"},
+    {10533, "Proj_Colorado_CS83_South"},
+    {10600, "Proj_Connecticut_CS27"},
+    {10630, "Proj_Connecticut_CS83"},
+    {10700, "Proj_Delaware_CS27"},
+    {10730, "Proj_Delaware_CS83"},
+    {10901, "Proj_Florida_CS27_East"},
+    {10902, "Proj_Florida_CS27_West"},
+    {10903, "Proj_Florida_CS27_North"},
+    {10931, "Proj_Florida_CS83_East"},
+    {10932, "Proj_Florida_CS83_West"},
+    {10933, "Proj_Florida_CS83_North"},
+    {11001, "Proj_Georgia_CS27_East"},
+    {11002, "Proj_Georgia_CS27_West"},
+    {11031, "Proj_Georgia_CS83_East"},
+    {11032, "Proj_Georgia_CS83_West"},
+    {11101, "Proj_Idaho_CS27_East"},
+    {11102, "Proj_Idaho_CS27_Central"},
+    {11103, "Proj_Idaho_CS27_West"},
+    {11131, "Proj_Idaho_CS83_East"},
+    {11132, "Proj_Idaho_CS83_Central"},
+    {11133, "Proj_Idaho_CS83_West"},
+    {11201, "Proj_Illinois_CS27_East"},
+    {11202, "Proj_Illinois_CS27_West"},
+    {11231, "Proj_Illinois_CS83_East"},
+    {11232, "Proj_Illinois_CS83_West"},
+    {11301, "Proj_Indiana_CS27_East"},
+    {11302, "Proj_Indiana_CS27_West"},
+    {11331, "Proj_Indiana_CS83_East"},
+    {11332, "Proj_Indiana_CS83_West"},
+    {11401, "Proj_Iowa_CS27_North"},
+    {11402, "Proj_Iowa_CS27_South"},
+    {11431, "Proj_Iowa_CS83_North"},
+    {11432, "Proj_Iowa_CS83_South"},
+    {11501, "Proj_Kansas_CS27_North"},
+    {11502, "Proj_Kansas_CS27_South"},
+    {11531, "Proj_Kansas_CS83_North"},
+    {11532, "Proj_Kansas_CS83_South"},
+    {11601, "Proj_Kentucky_CS27_North"},
+    {11602, "Proj_Kentucky_CS27_South"},
+    {11631, "Proj_Kentucky_CS83_North"},
+    {11632, "Proj_Kentucky_CS83_South"},
+    {11701, "Proj_Louisiana_CS27_North"},
+    {11702, "Proj_Louisiana_CS27_South"},
+    {11731, "Proj_Louisiana_CS83_North"},
+    {11732, "Proj_Louisiana_CS83_South"},
+    {11801, "Proj_Maine_CS27_East"},
+    {11802, "Proj_Maine_CS27_West"},
+    {11831, "Proj_Maine_CS83_East"},
+    {11832, "Proj_Maine_CS83_West"},
+    {11900, "Proj_Maryland_CS27"},
+    {11930, "Proj_Maryland_CS83"},
+    {12001, "Proj_Massachusetts_CS27_Mainland"},
+    {12002, "Proj_Massachusetts_CS27_Island"},
+    {12031, "Proj_Massachusetts_CS83_Mainland"},
+    {12032, "Proj_Massachusetts_CS83_Island"},
+    {12101, "Proj_Michigan_State_Plane_East"},
+    {12102, "Proj_Michigan_State_Plane_Old_Central"},
+    {12103, "Proj_Michigan_State_Plane_West"},
+    {12111, "Proj_Michigan_CS27_North"},
+    {12112, "Proj_Michigan_CS27_Central"},
+    {12113, "Proj_Michigan_CS27_South"},
+    {12141, "Proj_Michigan_CS83_North"},
+    {12142, "Proj_Michigan_CS83_Central"},
+    {12143, "Proj_Michigan_CS83_South"},
+    {12201, "Proj_Minnesota_CS27_North"},
+    {12202, "Proj_Minnesota_CS27_Central"},
+    {12203, "Proj_Minnesota_CS27_South"},
+    {12231, "Proj_Minnesota_CS83_North"},
+    {12232, "Proj_Minnesota_CS83_Central"},
+    {12233, "Proj_Minnesota_CS83_South"},
+    {12301, "Proj_Mississippi_CS27_East"},
+    {12302, "Proj_Mississippi_CS27_West"},
+    {12331, "Proj_Mississippi_CS83_East"},
+    {12332, "Proj_Mississippi_CS83_West"},
+    {12401, "Proj_Missouri_CS27_East"},
+    {12402, "Proj_Missouri_CS27_Central"},
+    {12403, "Proj_Missouri_CS27_West"},
+    {12431, "Proj_Missouri_CS83_East"},
+    {12432, "Proj_Missouri_CS83_Central"},
+    {12433, "Proj_Missouri_CS83_West"},
+    {12501, "Proj_Montana_CS27_North"},
+    {12502, "Proj_Montana_CS27_Central"},
+    {12503, "Proj_Montana_CS27_South"},
+    {12530, "Proj_Montana_CS83"},
+    {12601, "Proj_Nebraska_CS27_North"},
+    {12602, "Proj_Nebraska_CS27_South"},
+    {12630, "Proj_Nebraska_CS83"},
+    {12701, "Proj_Nevada_CS27_East"},
+    {12702, "Proj_Nevada_CS27_Central"},
+    {12703, "Proj_Nevada_CS27_West"},
+    {12731, "Proj_Nevada_CS83_East"},
+    {12732, "Proj_Nevada_CS83_Central"},
+    {12733, "Proj_Nevada_CS83_West"},
+    {12800, "Proj_New_Hampshire_CS27"},
+    {12830, "Proj_New_Hampshire_CS83"},
+    {12900, "Proj_New_Jersey_CS27"},
+    {12930, "Proj_New_Jersey_CS83"},
+    {13001, "Proj_New_Mexico_CS27_East"},
+    {13002, "Proj_New_Mexico_CS27_Central"},
+    {13003, "Proj_New_Mexico_CS27_West"},
+    {13031, "Proj_New_Mexico_CS83_East"},
+    {13032, "Proj_New_Mexico_CS83_Central"},
+    {13033, "Proj_New_Mexico_CS83_West"},
+    {13101, "Proj_New_York_CS27_East"},
+    {13102, "Proj_New_York_CS27_Central"},
+    {13103, "Proj_New_York_CS27_West"},
+    {13104, "Proj_New_York_CS27_Long_Island"},
+    {13131, "Proj_New_York_CS83_East"},
+    {13132, "Proj_New_York_CS83_Central"},
+    {13133, "Proj_New_York_CS83_West"},
+    {13134, "Proj_New_York_CS83_Long_Island"},
+    {13200, "Proj_North_Carolina_CS27"},
+    {13230, "Proj_North_Carolina_CS83"},
+    {13301, "Proj_North_Dakota_CS27_North"},
+    {13302, "Proj_North_Dakota_CS27_South"},
+    {13331, "Proj_North_Dakota_CS83_North"},
+    {13332, "Proj_North_Dakota_CS83_South"},
+    {13401, "Proj_Ohio_CS27_North"},
+    {13402, "Proj_Ohio_CS27_South"},
+    {13431, "Proj_Ohio_CS83_North"},
+    {13432, "Proj_Ohio_CS83_South"},
+    {13501, "Proj_Oklahoma_CS27_North"},
+    {13502, "Proj_Oklahoma_CS27_South"},
+    {13531, "Proj_Oklahoma_CS83_North"},
+    {13532, "Proj_Oklahoma_CS83_South"},
+    {13601, "Proj_Oregon_CS27_North"},
+    {13602, "Proj_Oregon_CS27_South"},
+    {13631, "Proj_Oregon_CS83_North"},
+    {13632, "Proj_Oregon_CS83_South"},
+    {13701, "Proj_Pennsylvania_CS27_North"},
+    {13702, "Proj_Pennsylvania_CS27_South"},
+    {13731, "Proj_Pennsylvania_CS83_North"},
+    {13732, "Proj_Pennsylvania_CS83_South"},
+    {13800, "Proj_Rhode_Island_CS27"},
+    {13830, "Proj_Rhode_Island_CS83"},
+    {13901, "Proj_South_Carolina_CS27_North"},
+    {13902, "Proj_South_Carolina_CS27_South"},
+    {13930, "Proj_South_Carolina_CS83"},
+    {14001, "Proj_South_Dakota_CS27_North"},
+    {14002, "Proj_South_Dakota_CS27_South"},
+    {14031, "Proj_South_Dakota_CS83_North"},
+    {14032, "Proj_South_Dakota_CS83_South"},
+    {14100, "Proj_Tennessee_CS27"},
+    {14130, "Proj_Tennessee_CS83"},
+    {14201, "Proj_Texas_CS27_North"},
+    {14202, "Proj_Texas_CS27_North_Central"},
+    {14203, "Proj_Texas_CS27_Central"},
+    {14204, "Proj_Texas_CS27_South_Central"},
+    {14205, "Proj_Texas_CS27_South"},
+    {14231, "Proj_Texas_CS83_North"},
+    {14232, "Proj_Texas_CS83_North_Central"},
+    {14233, "Proj_Texas_CS83_Central"},
+    {14234, "Proj_Texas_CS83_South_Central"},
+    {14235, "Proj_Texas_CS83_South"},
+    {14301, "Proj_Utah_CS27_North"},
+    {14302, "Proj_Utah_CS27_Central"},
+    {14303, "Proj_Utah_CS27_South"},
+    {14331, "Proj_Utah_CS83_North"},
+    {14332, "Proj_Utah_CS83_Central"},
+    {14333, "Proj_Utah_CS83_South"},
+    {14400, "Proj_Vermont_CS27"},
+    {14430, "Proj_Vermont_CS83"},
+    {14501, "Proj_Virginia_CS27_North"},
+    {14502, "Proj_Virginia_CS27_South"},
+    {14531, "Proj_Virginia_CS83_North"},
+    {14532, "Proj_Virginia_CS83_South"},
+    {14601, "Proj_Washington_CS27_North"},
+    {14602, "Proj_Washington_CS27_South"},
+    {14631, "Proj_Washington_CS83_North"},
+    {14632, "Proj_Washington_CS83_South"},
+    {14701, "Proj_West_Virginia_CS27_North"},
+    {14702, "Proj_West_Virginia_CS27_South"},
+    {14731, "Proj_West_Virginia_CS83_North"},
+    {14732, "Proj_West_Virginia_CS83_South"},
+    {14801, "Proj_Wisconsin_CS27_North"},
+    {14802, "Proj_Wisconsin_CS27_Central"},
+    {14803, "Proj_Wisconsin_CS27_South"},
+    {14831, "Proj_Wisconsin_CS83_North"},
+    {14832, "Proj_Wisconsin_CS83_Central"},
+    {14833, "Proj_Wisconsin_CS83_South"},
+    {14901, "Proj_Wyoming_CS27_East"},
+    {14902, "Proj_Wyoming_CS27_East_Central"},
+    {14903, "Proj_Wyoming_CS27_West_Central"},
+    {14904, "Proj_Wyoming_CS27_West"},
+    {14931, "Proj_Wyoming_CS83_East"},
+    {14932, "Proj_Wyoming_CS83_East_Central"},
+    {14933, "Proj_Wyoming_CS83_West_Central"},
+    {14934, "Proj_Wyoming_CS83_West"},
+    {15001, "Proj_Alaska_CS27_1"},
+    {15002, "Proj_Alaska_CS27_2"},
+    {15003, "Proj_Alaska_CS27_3"},
+    {15004, "Proj_Alaska_CS27_4"},
+    {15005, "Proj_Alaska_CS27_5"},
+    {15006, "Proj_Alaska_CS27_6"},
+    {15007, "Proj_Alaska_CS27_7"},
+    {15008, "Proj_Alaska_CS27_8"},
+    {15009, "Proj_Alaska_CS27_9"},
+    {15010, "Proj_Alaska_CS27_10"},
+    {15031, "Proj_Alaska_CS83_1"},
+    {15032, "Proj_Alaska_CS83_2"},
+    {15033, "Proj_Alaska_CS83_3"},
+    {15034, "Proj_Alaska_CS83_4"},
+    {15035, "Proj_Alaska_CS83_5"},
+    {15036, "Proj_Alaska_CS83_6"},
+    {15037, "Proj_Alaska_CS83_7"},
+    {15038, "Proj_Alaska_CS83_8"},
+    {15039, "Proj_Alaska_CS83_9"},
+    {15040, "Proj_Alaska_CS83_10"},
+    {15101, "Proj_Hawaii_CS27_1"},
+    {15102, "Proj_Hawaii_CS27_2"},
+    {15103, "Proj_Hawaii_CS27_3"},
+    {15104, "Proj_Hawaii_CS27_4"},
+    {15105, "Proj_Hawaii_CS27_5"},
+    {15131, "Proj_Hawaii_CS83_1"},
+    {15132, "Proj_Hawaii_CS83_2"},
+    {15133, "Proj_Hawaii_CS83_3"},
+    {15134, "Proj_Hawaii_CS83_4"},
+    {15135, "Proj_Hawaii_CS83_5"},
+    {15201, "Proj_Puerto_Rico_CS27"},
+    {15202, "Proj_St_Croix"},
+    {15230, "Proj_Puerto_Rico_Virgin_Is"},
+    {15914, "Proj_BLM_14N_feet"},
+    {15915, "Proj_BLM_15N_feet"},
+    {15916, "Proj_BLM_16N_feet"},
+    {15917, "Proj_BLM_17N_feet"},
+    {17348, "Proj_Map_Grid_of_Australia_48"},
+    {17349, "Proj_Map_Grid_of_Australia_49"},
+    {17350, "Proj_Map_Grid_of_Australia_50"},
+    {17351, "Proj_Map_Grid_of_Australia_51"},
+    {17352, "Proj_Map_Grid_of_Australia_52"},
+    {17353, "Proj_Map_Grid_of_Australia_53"},
+    {17354, "Proj_Map_Grid_of_Australia_54"},
+    {17355, "Proj_Map_Grid_of_Australia_55"},
+    {17356, "Proj_Map_Grid_of_Australia_56"},
+    {17357, "Proj_Map_Grid_of_Australia_57"},
+    {17358, "Proj_Map_Grid_of_Australia_58"},
+    {17448, "Proj_Australian_Map_Grid_48"},
+    {17449, "Proj_Australian_Map_Grid_49"},
+    {17450, "Proj_Australian_Map_Grid_50"},
+    {17451, "Proj_Australian_Map_Grid_51"},
+    {17452, "Proj_Australian_Map_Grid_52"},
+    {17453, "Proj_Australian_Map_Grid_53"},
+    {17454, "Proj_Australian_Map_Grid_54"},
+    {17455, "Proj_Australian_Map_Grid_55"},
+    {17456, "Proj_Australian_Map_Grid_56"},
+    {17457, "Proj_Australian_Map_Grid_57"},
+    {17458, "Proj_Australian_Map_Grid_58"},
+    {18031, "Proj_Argentina_1"},
+    {18032, "Proj_Argentina_2"},
+    {18033, "Proj_Argentina_3"},
+    {18034, "Proj_Argentina_4"},
+    {18035, "Proj_Argentina_5"},
+    {18036, "Proj_Argentina_6"},
+    {18037, "Proj_Argentina_7"},
+    {18051, "Proj_Colombia_3W"},
+    {18052, "Proj_Colombia_Bogota"},
+    {18053, "Proj_Colombia_3E"},
+    {18054, "Proj_Colombia_6E"},
+    {18072, "Proj_Egypt_Red_Belt"},
+    {18073, "Proj_Egypt_Purple_Belt"},
+    {18074, "Proj_Extended_Purple_Belt"},
+    {18141, "Proj_New_Zealand_North_Island_Nat_Grid"},
+    {18142, "Proj_New_Zealand_South_Island_Nat_Grid"},
+    {19900, "Proj_Bahrain_Grid"},
+    {19905, "Proj_Netherlands_E_Indies_Equatorial"},
+    {19912, "Proj_RSO_Borneo"}
+};
+
+const char *const ff_tiff_coord_trans_codes[] = {
+    "CT_TransverseMercator",
+    "CT_TransvMercator_Modified_Alaska",
+    "CT_ObliqueMercator",
+    "CT_ObliqueMercator_Laborde",
+    "CT_ObliqueMercator_Rosenmund",
+    "CT_ObliqueMercator_Spherical",
+    "CT_Mercator",
+    "CT_LambertConfConic_2SP",
+    "CT_LambertConfConic_Helmert",
+    "CT_LambertAzimEqualArea",
+    "CT_AlbersEqualArea",
+    "CT_AzimuthalEquidistant",
+    "CT_EquidistantConic",
+    "CT_Stereographic",
+    "CT_PolarStereographic",
+    "CT_ObliqueStereographic",
+    "CT_Equirectangular",
+    "CT_CassiniSoldner",
+    "CT_Gnomonic",
+    "CT_MillerCylindrical",
+    "CT_Orthographic",
+    "CT_Polyconic",
+    "CT_Robinson",
+    "CT_Sinusoidal",
+    "CT_VanDerGrinten",
+    "CT_NewZealandMapGrid",
+    "CT_TransvMercator_SouthOriented"
+};
+
+const char *const ff_tiff_vert_cs_codes[] = {
+    "VertCS_Airy_1830_ellipsoid",
+    "VertCS_Airy_Modified_1849_ellipsoid",
+    "VertCS_ANS_ellipsoid",
+    "VertCS_Bessel_1841_ellipsoid",
+    "VertCS_Bessel_Modified_ellipsoid",
+    "VertCS_Bessel_Namibia_ellipsoid",
+    "VertCS_Clarke_1858_ellipsoid",
+    "VertCS_Clarke_1866_ellipsoid",
+    "VertCS_Clarke_1880_Benoit_ellipsoid",
+    "VertCS_Clarke_1880_IGN_ellipsoid",
+    "VertCS_Clarke_1880_RGS_ellipsoid",
+    "VertCS_Clarke_1880_Arc_ellipsoid",
+    "VertCS_Clarke_1880_SGA_1922_ellipsoid",
+    "VertCS_Everest_1830_1937_Adjustment_ellipsoid",
+    "VertCS_Everest_1830_1967_Definition_ellipsoid",
+    "VertCS_Everest_1830_1975_Definition_ellipsoid",
+    "VertCS_Everest_1830_Modified_ellipsoid",
+    "VertCS_GRS_1980_ellipsoid",
+    "VertCS_Helmert_1906_ellipsoid",
+    "VertCS_INS_ellipsoid",
+    "VertCS_International_1924_ellipsoid",
+    "VertCS_International_1967_ellipsoid",
+    "VertCS_Krassowsky_1940_ellipsoid",
+    "VertCS_NWL_9D_ellipsoid",
+    "VertCS_NWL_10D_ellipsoid",
+    "VertCS_Plessis_1817_ellipsoid",
+    "VertCS_Struve_1860_ellipsoid",
+    "VertCS_War_Office_ellipsoid",
+    "VertCS_WGS_84_ellipsoid",
+    "VertCS_GEM_10C_ellipsoid",
+    "VertCS_OSU86F_ellipsoid",
+    "VertCS_OSU91A_ellipsoid"
+};
+
+const char *const ff_tiff_ortho_vert_cs_codes[] = {
+    "VertCS_Newlyn",
+    "VertCS_North_American_Vertical_Datum_1929",
+    "VertCS_North_American_Vertical_Datum_1988",
+    "VertCS_Yellow_Sea_1956",
+    "VertCS_Baltic_Sea",
+    "VertCS_Caspian_Sea"
+};
diff --git a/libavcodec/tiff_data.h b/libavcodec/tiff_data.h
new file mode 100644
index 0000000000..57515f9030
--- /dev/null
+++ b/libavcodec/tiff_data.h
@@ -0,0 +1,92 @@
+/*
+ * TIFF data tables
+ * Copyright (c) 2011 Thomas Kuehnel
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * TIFF data tables
+ * @author Thomas Kuehnel
+ * @see GeoTIFF specification at
+ * http://www.remotesensing.org/geotiff/spec/geotiffhome.html
+ */
+
+#ifndef AVCODEC_TIFF_DATA_H
+#define AVCODEC_TIFF_DATA_H
+
+#include "tiff.h"
+
+#define TIFF_CONF_KEY_ID_OFFSET 1024
+extern const TiffGeoTagNameType ff_tiff_conf_name_type_map[3];
+
+#define TIFF_GEOG_KEY_ID_OFFSET 2048
+extern const TiffGeoTagNameType ff_tiff_geog_name_type_map[14];
+
+#define TIFF_PROJ_KEY_ID_OFFSET 3072
+extern const TiffGeoTagNameType ff_tiff_proj_name_type_map[24];
+
+#define TIFF_VERT_KEY_ID_OFFSET 4096
+extern const TiffGeoTagNameType ff_tiff_vert_name_type_map[4];
+
+#define TIFF_GEO_KEY_UNDEFINED    0
+#define TIFF_GEO_KEY_USER_DEFINED 32767
+
+#define TIFF_GT_MODEL_TYPE_OFFSET 1
+extern const char *const ff_tiff_gt_model_type_codes[3];
+
+#define TIFF_GT_RASTER_TYPE_OFFSET 1
+extern const char *const ff_tiff_gt_raster_type_codes[2];
+
+#define TIFF_LINEAR_UNIT_OFFSET 9001
+extern const char *const ff_tiff_linear_unit_codes[15];
+
+#define TIFF_ANGULAR_UNIT_OFFSET 9101
+extern const char *const ff_tiff_angular_unit_codes[8];
+
+#define TIFF_GCS_TYPE_OFFSET 4201
+extern const char *const ff_tiff_gcs_type_codes[133];
+
+#define TIFF_GCSE_TYPE_OFFSET 4001
+extern const char *const ff_tiff_gcse_type_codes[35];
+
+#define TIFF_GEODETIC_DATUM_OFFSET 6201
+extern const char *const ff_tiff_geodetic_datum_codes[120];
+
+#define TIFF_GEODETIC_DATUM_E_OFFSET 6001
+extern const char *const ff_tiff_geodetic_datum_e_codes[35];
+
+#define TIFF_ELLIPSOID_OFFSET 7001
+extern const char *const ff_tiff_ellipsoid_codes[35];
+
+#define TIFF_PRIME_MERIDIAN_OFFSET 8901
+extern const char *const ff_tiff_prime_meridian_codes[11];
+
+extern const TiffGeoTagKeyName ff_tiff_proj_cs_type_codes[978];
+
+extern const TiffGeoTagKeyName ff_tiff_projection_codes[298];
+
+#define TIFF_COORD_TRANS_OFFSET 1
+extern const char *const ff_tiff_coord_trans_codes[27];
+
+#define TIFF_VERT_CS_OFFSET 5001
+extern const char *const ff_tiff_vert_cs_codes[32];
+
+#define TIFF_ORTHO_VERT_CS_OFFSET 5101
+extern const char *const ff_tiff_ortho_vert_cs_codes[6];
+#endif
diff --git a/libavcodec/tiffenc.c b/libavcodec/tiffenc.c
index 7c23ee2db8..3d37d2e084 100644
--- a/libavcodec/tiffenc.c
+++ b/libavcodec/tiffenc.c
@@ -2,20 +2,20 @@
  * TIFF image encoder
  * Copyright (c) 2007 Bartlomiej Wolowiec
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,11 +30,13 @@
 #include <zlib.h>
 #endif
 
+#include "libavutil/imgutils.h"
 #include "libavutil/log.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
 #include "avcodec.h"
 #include "bytestream.h"
+#include "internal.h"
 #include "lzw.h"
 #include "put_bits.h"
 #include "rle.h"
@@ -43,8 +45,8 @@
 #define TIFF_MAX_ENTRY 32
 
 /** sizes of various TIFF field types (string size = 1)*/
-static const uint8_t type_sizes2[6] = {
-    0, 1, 1, 2, 4, 8
+static const uint8_t type_sizes2[14] = {
+    0, 1, 1, 2, 4, 8, 1, 1, 2, 4, 8, 4, 8, 4
 };
 
 typedef struct TiffEncoderContext {
@@ -58,6 +60,12 @@ typedef struct TiffEncoderContext {
     int bpp_tab_size;                       ///< bpp_tab size
     enum TiffPhotometric photometric_interpretation;  ///< photometric interpretation
     int strips;                             ///< number of strips
+    uint32_t *strip_sizes;
+    unsigned int strip_sizes_size;
+    uint32_t *strip_offsets;
+    unsigned int strip_offsets_size;
+    uint8_t *yuv_line;
+    unsigned int yuv_line_size;
     int rps;                                ///< row per strip
     uint8_t entries[TIFF_MAX_ENTRY * 12];   ///< entries in header
     int num_entries;                        ///< number of entries
@@ -66,10 +74,12 @@ typedef struct TiffEncoderContext {
     int buf_size;                           ///< buffer size
     uint16_t subsampling[2];                ///< YUV subsampling factors
     struct LZWEncodeState *lzws;            ///< LZW encode state
+    uint32_t dpi;                           ///< image resolution in DPI
 } TiffEncoderContext;
 
 /**
- * Check free space in buffer
+ * Check free space in buffer.
+ *
  * @param s Tiff context
  * @param need Needed bytes
  * @return 0 - ok, 1 - no free space
@@ -85,13 +95,13 @@ static inline int check_size(TiffEncoderContext *s, uint64_t need)
 }
 
 /**
- * Put n values to buffer
+ * Put n values to buffer.
  *
- * @param p Pointer to pointer to output buffer
- * @param n Number of values
- * @param val Pointer to values
- * @param type Type of values
- * @param flip =0 - normal copy, >0 - flip
+ * @param p pointer to pointer to output buffer
+ * @param n number of values
+ * @param val pointer to values
+ * @param type type of values
+ * @param flip = 0 - normal copy, >0 - flip
  */
 static void tnput(uint8_t **p, int n, const uint8_t *val, enum TiffTypes type,
                   int flip)
@@ -106,28 +116,29 @@ static void tnput(uint8_t **p, int n, const uint8_t *val, enum TiffTypes type,
 
 /**
  * Add entry to directory in tiff header.
+ *
  * @param s Tiff context
- * @param tag Tag that identifies the entry
- * @param type Entry type
- * @param count The number of values
- * @param ptr_val Pointer to values
+ * @param tag tag that identifies the entry
+ * @param type entry type
+ * @param count the number of values
+ * @param ptr_val pointer to values
  */
 static int add_entry(TiffEncoderContext *s, enum TiffTags tag,
                      enum TiffTypes type, int count, const void *ptr_val)
 {
     uint8_t *entries_ptr = s->entries + 12 * s->num_entries;
 
-    assert(s->num_entries < TIFF_MAX_ENTRY);
+    av_assert0(s->num_entries < TIFF_MAX_ENTRY);
 
     bytestream_put_le16(&entries_ptr, tag);
     bytestream_put_le16(&entries_ptr, type);
     bytestream_put_le32(&entries_ptr, count);
 
-    if (type_sizes[type] * count <= 4) {
+    if (type_sizes[type] * (int64_t)count <= 4) {
         tnput(&entries_ptr, count, ptr_val, type, 0);
     } else {
         bytestream_put_le32(&entries_ptr, *s->buf - s->buf_start);
-        if (check_size(s, count * type_sizes2[type]))
+        if (check_size(s, count * (int64_t)type_sizes2[type]))
             return AVERROR_INVALIDDATA;
         tnput(s->buf, count, ptr_val, type, 0);
     }
@@ -146,14 +157,14 @@ static int add_entry1(TiffEncoderContext *s,
 }
 
 /**
- * Encode one strip in tiff file
+ * Encode one strip in tiff file.
  *
  * @param s Tiff context
- * @param src Input buffer
- * @param dst Output buffer
- * @param n Size of input buffer
- * @param compr Compression method
- * @return Number of output bytes. If an output error is encountered, a negative
+ * @param src input buffer
+ * @param dst output buffer
+ * @param n size of input buffer
+ * @param compr compression method
+ * @return number of output bytes. If an output error is encountered, a negative
  * value corresponding to an AVERROR error code is returned.
  */
 static int encode_strip(TiffEncoderContext *s, const int8_t *src,
@@ -167,7 +178,7 @@ static int encode_strip(TiffEncoderContext *s, const int8_t *src,
         unsigned long zlen = s->buf_size - (*s->buf - s->buf_start);
         if (compress(dst, &zlen, src, n) != Z_OK) {
             av_log(s->avctx, AV_LOG_ERROR, "Compressing failed\n");
-            return AVERROR_UNKNOWN;
+            return AVERROR_EXTERNAL;
         }
         return zlen;
     }
@@ -183,6 +194,8 @@ static int encode_strip(TiffEncoderContext *s, const int8_t *src,
     case TIFF_LZW:
         return ff_lzw_encode(s->lzws, src, n);
     default:
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported compression method: %d\n",
+               compr);
         return AVERROR(EINVAL);
     }
 }
@@ -194,13 +207,24 @@ static void pack_yuv(TiffEncoderContext *s, const AVFrame *p,
     int w       = (s->width - 1) / s->subsampling[0] + 1;
     uint8_t *pu = &p->data[1][lnum / s->subsampling[1] * p->linesize[1]];
     uint8_t *pv = &p->data[2][lnum / s->subsampling[1] * p->linesize[2]];
-    for (i = 0; i < w; i++) {
-        for (j = 0; j < s->subsampling[1]; j++)
-            for (k = 0; k < s->subsampling[0]; k++)
-                *dst++ = p->data[0][(lnum + j) * p->linesize[0] +
-                                    i * s->subsampling[0] + k];
-        *dst++ = *pu++;
-        *dst++ = *pv++;
+    if (s->width % s->subsampling[0] || s->height % s->subsampling[1]) {
+        for (i = 0; i < w; i++) {
+            for (j = 0; j < s->subsampling[1]; j++)
+                for (k = 0; k < s->subsampling[0]; k++)
+                    *dst++ = p->data[0][FFMIN(lnum + j, s->height-1) * p->linesize[0] +
+                                        FFMIN(i * s->subsampling[0] + k, s->width-1)];
+            *dst++ = *pu++;
+            *dst++ = *pv++;
+        }
+    }else{
+        for (i = 0; i < w; i++) {
+            for (j = 0; j < s->subsampling[1]; j++)
+                for (k = 0; k < s->subsampling[0]; k++)
+                    *dst++ = p->data[0][(lnum + j) * p->linesize[0] +
+                                        i * s->subsampling[0] + k];
+            *dst++ = *pu++;
+            *dst++ = *pv++;
+        }
     }
 }
 
@@ -209,84 +233,75 @@ static void pack_yuv(TiffEncoderContext *s, const AVFrame *p,
         ret = add_entry(s, tag, type, count, ptr_val);  \
         if (ret < 0)                                    \
             goto fail;                                  \
-    } while(0);
+    } while (0)
 
 #define ADD_ENTRY1(s, tag, type, val)           \
     do {                                        \
         ret = add_entry1(s, tag, type, val);    \
         if (ret < 0)                            \
             goto fail;                          \
-    } while(0);
+    } while (0)
 
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *pict, int *got_packet)
 {
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
     TiffEncoderContext *s = avctx->priv_data;
     const AVFrame *const p = pict;
     int i;
     uint8_t *ptr;
     uint8_t *offset;
     uint32_t strips;
-    uint32_t *strip_sizes   = NULL;
-    uint32_t *strip_offsets = NULL;
     int bytes_per_row;
-    uint32_t res[2]    = { 72, 1 };     // image resolution (72/1)
-    uint16_t bpp_tab[] = { 8, 8, 8, 8 };
+    uint32_t res[2] = { s->dpi, 1 };    // image resolution (72/1)
+    uint16_t bpp_tab[4];
     int ret = 0;
-    int is_yuv = 0;
-    uint8_t *yuv_line = NULL;
+    int is_yuv = 0, alpha = 0;
     int shift_h, shift_v;
     int packet_size;
-    const AVPixFmtDescriptor *pfd;
-
-    s->avctx = avctx;
 
     s->width          = avctx->width;
     s->height         = avctx->height;
     s->subsampling[0] = 1;
     s->subsampling[1] = 1;
 
+    avctx->bits_per_coded_sample =
+    s->bpp          = av_get_bits_per_pixel(desc);
+    s->bpp_tab_size = desc->nb_components;
+
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_RGBA64LE:
-    case AV_PIX_FMT_RGB48LE:
-    case AV_PIX_FMT_GRAY16LE:
     case AV_PIX_FMT_RGBA:
+        alpha = 1;
+    case AV_PIX_FMT_RGB48LE:
     case AV_PIX_FMT_RGB24:
-    case AV_PIX_FMT_GRAY8:
-    case AV_PIX_FMT_PAL8:
-        pfd    = av_pix_fmt_desc_get(avctx->pix_fmt);
-        s->bpp = av_get_bits_per_pixel(pfd);
-        if (pfd->flags & AV_PIX_FMT_FLAG_PAL)
-            s->photometric_interpretation = TIFF_PHOTOMETRIC_PALETTE;
-        else if (pfd->flags & AV_PIX_FMT_FLAG_RGB)
-            s->photometric_interpretation = TIFF_PHOTOMETRIC_RGB;
-        else
-            s->photometric_interpretation = TIFF_PHOTOMETRIC_BLACK_IS_ZERO;
-        s->bpp_tab_size = pfd->nb_components;
-        for (i = 0; i < s->bpp_tab_size; i++)
-            bpp_tab[i] = s->bpp / s->bpp_tab_size;
+        s->photometric_interpretation = TIFF_PHOTOMETRIC_RGB;
         break;
+    case AV_PIX_FMT_GRAY8:
+        avctx->bits_per_coded_sample = 0x28;
+    case AV_PIX_FMT_GRAY8A:
+    case AV_PIX_FMT_YA16LE:
+        alpha = avctx->pix_fmt == AV_PIX_FMT_GRAY8A || avctx->pix_fmt == AV_PIX_FMT_YA16LE;
+    case AV_PIX_FMT_GRAY16LE:
     case AV_PIX_FMT_MONOBLACK:
-        s->bpp                        = 1;
         s->photometric_interpretation = TIFF_PHOTOMETRIC_BLACK_IS_ZERO;
-        s->bpp_tab_size               = 0;
+        break;
+    case AV_PIX_FMT_PAL8:
+        s->photometric_interpretation = TIFF_PHOTOMETRIC_PALETTE;
         break;
     case AV_PIX_FMT_MONOWHITE:
-        s->bpp                        = 1;
         s->photometric_interpretation = TIFF_PHOTOMETRIC_WHITE_IS_ZERO;
-        s->bpp_tab_size               = 0;
         break;
     case AV_PIX_FMT_YUV420P:
     case AV_PIX_FMT_YUV422P:
+    case AV_PIX_FMT_YUV440P:
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUV410P:
     case AV_PIX_FMT_YUV411P:
         av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &shift_h, &shift_v);
         s->photometric_interpretation = TIFF_PHOTOMETRIC_YCBCR;
-        s->bpp                        = 8 + (16 >> (shift_h + shift_v));
         s->subsampling[0]             = 1 << shift_h;
         s->subsampling[1]             = 1 << shift_v;
-        s->bpp_tab_size               = 3;
         is_yuv                        = 1;
         break;
     default:
@@ -295,6 +310,9 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         return AVERROR(EINVAL);
     }
 
+    for (i = 0; i < s->bpp_tab_size; i++)
+        bpp_tab[i] = desc->comp[i].depth;
+
     if (s->compr == TIFF_DEFLATE       ||
         s->compr == TIFF_ADOBE_DEFLATE ||
         s->compr == TIFF_LZW)
@@ -308,14 +326,13 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     strips = (s->height - 1) / s->rps + 1;
 
-    packet_size = avctx->height * ((avctx->width * s->bpp + 7) >> 3) * 2 +
+    bytes_per_row = (((s->width - 1) / s->subsampling[0] + 1) * s->bpp *
+                     s->subsampling[0] * s->subsampling[1] + 7) >> 3;
+    packet_size = avctx->height * bytes_per_row * 2 +
                   avctx->height * 4 + AV_INPUT_BUFFER_MIN_SIZE;
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, packet_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, packet_size, 0)) < 0)
         return ret;
-    }
     ptr          = pkt->data;
     s->buf_start = pkt->data;
     s->buf       = &ptr;
@@ -333,18 +350,21 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     offset = ptr;
     bytestream_put_le32(&ptr, 0);
 
-    strip_sizes   = av_mallocz_array(strips, sizeof(*strip_sizes));
-    strip_offsets = av_mallocz_array(strips, sizeof(*strip_offsets));
-    if (!strip_sizes || !strip_offsets) {
+    if (strips > INT_MAX / FFMAX(sizeof(s->strip_sizes[0]), sizeof(s->strip_offsets[0]))) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    av_fast_padded_mallocz(&s->strip_sizes  , &s->strip_sizes_size  , sizeof(s->strip_sizes  [0]) * strips);
+    av_fast_padded_mallocz(&s->strip_offsets, &s->strip_offsets_size, sizeof(s->strip_offsets[0]) * strips);
+
+    if (!s->strip_sizes || !s->strip_offsets) {
         ret = AVERROR(ENOMEM);
         goto fail;
     }
 
-    bytes_per_row = (((s->width - 1) / s->subsampling[0] + 1) * s->bpp *
-                     s->subsampling[0] * s->subsampling[1] + 7) >> 3;
     if (is_yuv) {
-        yuv_line = av_malloc(bytes_per_row);
-        if (!yuv_line) {
+        av_fast_padded_malloc(&s->yuv_line, &s->yuv_line_size, bytes_per_row);
+        if (s->yuv_line == NULL) {
             av_log(s->avctx, AV_LOG_ERROR, "Not enough memory\n");
             ret = AVERROR(ENOMEM);
             goto fail;
@@ -363,12 +383,12 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             ret = AVERROR(ENOMEM);
             goto fail;
         }
-        strip_offsets[0] = ptr - pkt->data;
+        s->strip_offsets[0] = ptr - pkt->data;
         zn               = 0;
         for (j = 0; j < s->rps; j++) {
             if (is_yuv) {
-                pack_yuv(s, p, yuv_line, j);
-                memcpy(zbuf + zn, yuv_line, bytes_per_row);
+                pack_yuv(s, p, s->yuv_line, j);
+                memcpy(zbuf + zn, s->yuv_line, bytes_per_row);
                 j += s->subsampling[1] - 1;
             } else
                 memcpy(zbuf + j * bytes_per_row,
@@ -382,9 +402,10 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             goto fail;
         }
         ptr           += ret;
-        strip_sizes[0] = ptr - pkt->data - strip_offsets[0];
+        s->strip_sizes[0] = ptr - pkt->data - s->strip_offsets[0];
     } else
 #endif
+    {
     if (s->compr == TIFF_LZW) {
         s->lzws = av_malloc(ff_lzw_encode_state_size);
         if (!s->lzws) {
@@ -393,17 +414,17 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
     }
     for (i = 0; i < s->height; i++) {
-        if (strip_sizes[i / s->rps] == 0) {
+        if (s->strip_sizes[i / s->rps] == 0) {
             if (s->compr == TIFF_LZW) {
                 ff_lzw_encode_init(s->lzws, ptr,
                                    s->buf_size - (*s->buf - s->buf_start),
                                    12, FF_LZW_TIFF, put_bits);
             }
-            strip_offsets[i / s->rps] = ptr - pkt->data;
+            s->strip_offsets[i / s->rps] = ptr - pkt->data;
         }
         if (is_yuv) {
-            pack_yuv(s, p, yuv_line, i);
-            ret = encode_strip(s, yuv_line, ptr, bytes_per_row, s->compr);
+            pack_yuv(s, p, s->yuv_line, i);
+            ret = encode_strip(s, s->yuv_line, ptr, bytes_per_row, s->compr);
             i  += s->subsampling[1] - 1;
         } else
             ret = encode_strip(s, p->data[0] + i * p->linesize[0],
@@ -412,17 +433,18 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             av_log(s->avctx, AV_LOG_ERROR, "Encode strip failed\n");
             goto fail;
         }
-        strip_sizes[i / s->rps] += ret;
+        s->strip_sizes[i / s->rps] += ret;
         ptr                     += ret;
         if (s->compr == TIFF_LZW &&
             (i == s->height - 1 || i % s->rps == s->rps - 1)) {
             ret = ff_lzw_encode_flush(s->lzws, flush_put_bits);
-            strip_sizes[(i / s->rps)] += ret;
-            ptr                       += ret;
+            s->strip_sizes[(i / s->rps)] += ret;
+            ptr                          += ret;
         }
     }
     if (s->compr == TIFF_LZW)
-        av_free(s->lzws);
+        av_freep(&s->lzws);
+    }
 
     s->num_entries = 0;
 
@@ -435,14 +457,21 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     ADD_ENTRY1(s, TIFF_COMPR,       TIFF_SHORT, s->compr);
     ADD_ENTRY1(s, TIFF_PHOTOMETRIC, TIFF_SHORT, s->photometric_interpretation);
-    ADD_ENTRY(s,  TIFF_STRIP_OFFS,  TIFF_LONG,  strips, strip_offsets);
+    ADD_ENTRY(s,  TIFF_STRIP_OFFS,  TIFF_LONG,  strips, s->strip_offsets);
 
     if (s->bpp_tab_size)
         ADD_ENTRY1(s, TIFF_SAMPLES_PER_PIXEL, TIFF_SHORT, s->bpp_tab_size);
 
     ADD_ENTRY1(s, TIFF_ROWSPERSTRIP, TIFF_LONG,     s->rps);
-    ADD_ENTRY(s,  TIFF_STRIP_SIZE,   TIFF_LONG,     strips, strip_sizes);
+    ADD_ENTRY(s,  TIFF_STRIP_SIZE,   TIFF_LONG,     strips, s->strip_sizes);
     ADD_ENTRY(s,  TIFF_XRES,         TIFF_RATIONAL, 1,      res);
+    if (avctx->sample_aspect_ratio.num > 0 &&
+        avctx->sample_aspect_ratio.den > 0) {
+        AVRational y = av_mul_q(av_make_q(s->dpi, 1),
+                                avctx->sample_aspect_ratio);
+        res[0] = y.num;
+        res[1] = y.den;
+    }
     ADD_ENTRY(s,  TIFF_YRES,         TIFF_RATIONAL, 1,      res);
     ADD_ENTRY1(s, TIFF_RES_UNIT,     TIFF_SHORT,    2);
 
@@ -460,10 +489,14 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
         ADD_ENTRY(s, TIFF_PAL, TIFF_SHORT, 256 * 3, pal);
     }
+    if (alpha)
+        ADD_ENTRY1(s,TIFF_EXTRASAMPLES,      TIFF_SHORT,            2);
     if (is_yuv) {
         /** according to CCIR Recommendation 601.1 */
         uint32_t refbw[12] = { 15, 1, 235, 1, 128, 1, 240, 1, 128, 1, 240, 1 };
         ADD_ENTRY(s, TIFF_YCBCR_SUBSAMPLING, TIFF_SHORT,    2, s->subsampling);
+        if (avctx->chroma_sample_location == AVCHROMA_LOC_TOPLEFT)
+            ADD_ENTRY1(s, TIFF_YCBCR_POSITIONING, TIFF_SHORT, 2);
         ADD_ENTRY(s, TIFF_REFERENCE_BW,      TIFF_RATIONAL, 6, refbw);
     }
     // write offset to dir
@@ -482,20 +515,30 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     *got_packet = 1;
 
 fail:
-    av_free(strip_sizes);
-    av_free(strip_offsets);
-    av_free(yuv_line);
-    return ret;
+    return ret < 0 ? ret : 0;
 }
 
 static av_cold int encode_init(AVCodecContext *avctx)
 {
+    TiffEncoderContext *s = avctx->priv_data;
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
     avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
     avctx->coded_frame->key_frame = 1;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
+    s->avctx = avctx;
+
+    return 0;
+}
+
+static av_cold int encode_close(AVCodecContext *avctx)
+{
+    TiffEncoderContext *s = avctx->priv_data;
+
+    av_freep(&s->strip_sizes);
+    av_freep(&s->strip_offsets);
+    av_freep(&s->yuv_line);
 
     return 0;
 }
@@ -503,6 +546,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #define OFFSET(x) offsetof(TiffEncoderContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
+    {"dpi", "set the image resolution (in dpi)", OFFSET(dpi), AV_OPT_TYPE_INT, {.i64 = 72}, 1, 0x10000, AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_ENCODING_PARAM},
     { "compression_algo", NULL, OFFSET(compr), AV_OPT_TYPE_INT,   { .i64 = TIFF_PACKBITS }, TIFF_RAW, TIFF_DEFLATE, VE, "compression_algo" },
     { "packbits",         NULL, 0,             AV_OPT_TYPE_CONST, { .i64 = TIFF_PACKBITS }, 0,        0,            VE, "compression_algo" },
     { "raw",              NULL, 0,             AV_OPT_TYPE_CONST, { .i64 = TIFF_RAW      }, 0,        0,            VE, "compression_algo" },
@@ -527,13 +571,15 @@ AVCodec ff_tiff_encoder = {
     .id             = AV_CODEC_ID_TIFF,
     .priv_data_size = sizeof(TiffEncoderContext),
     .init           = encode_init,
+    .close          = encode_close,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .encode2        = encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB48LE, AV_PIX_FMT_PAL8,
         AV_PIX_FMT_RGBA, AV_PIX_FMT_RGBA64LE,
-        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY16LE,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A, AV_PIX_FMT_GRAY16LE, AV_PIX_FMT_YA16LE,
         AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_MONOWHITE,
-        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
         AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
         AV_PIX_FMT_NONE
     },
diff --git a/libavcodec/tmv.c b/libavcodec/tmv.c
index a9fcdf3b28..b738fcb103 100644
--- a/libavcodec/tmv.c
+++ b/libavcodec/tmv.c
@@ -2,20 +2,20 @@
  * 8088flex TMV video decoder
  * Copyright (c) 2009 Daniel Verkamp <daniel at drv.nu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "libavutil/internal.h"
+#include "libavutil/xga_font_data.h"
 
 #include "cga_data.h"
 
@@ -45,10 +46,8 @@ static int tmv_decode_frame(AVCodecContext *avctx, void *data,
     unsigned x, y, fg, bg, c;
     int ret;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if (avpkt->size < 2*char_rows*char_cols) {
         av_log(avctx, AV_LOG_ERROR,
@@ -63,6 +62,7 @@ static int tmv_decode_frame(AVCodecContext *avctx, void *data,
 
     frame->palette_has_changed = 1;
     memcpy(frame->data[1], ff_cga_palette, 16 * 4);
+    memset(frame->data[1] + 16 * 4, 0, AVPALETTE_SIZE - 16 * 4);
 
     for (y = 0; y < char_rows; y++) {
         for (x = 0; x < char_cols; x++) {
@@ -70,7 +70,7 @@ static int tmv_decode_frame(AVCodecContext *avctx, void *data,
             bg = *src  >> 4;
             fg = *src++ & 0xF;
             ff_draw_pc_font(dst + x * 8, frame->linesize[0],
-                            ff_cga_font, 8, c, fg, bg);
+                            avpriv_cga_font, 8, c, fg, bg);
         }
         dst += frame->linesize[0] * 8;
     }
diff --git a/libavcodec/tpeldsp.c b/libavcodec/tpeldsp.c
index 7ea1da4f7e..cc4fed328d 100644
--- a/libavcodec/tpeldsp.c
+++ b/libavcodec/tpeldsp.c
@@ -1,20 +1,20 @@
 /*
  * thirdpel DSP functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/tpeldsp.h b/libavcodec/tpeldsp.h
index 9c67d60850..3732f179f4 100644
--- a/libavcodec/tpeldsp.h
+++ b/libavcodec/tpeldsp.h
@@ -1,20 +1,20 @@
 /*
  * thirdpel DSP functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/truemotion1.c b/libavcodec/truemotion1.c
index 3eab33aa29..b8d0de4686 100644
--- a/libavcodec/truemotion1.c
+++ b/libavcodec/truemotion1.c
@@ -2,20 +2,20 @@
  * Duck TrueMotion 1.0 Decoder
  * Copyright (C) 2003 Alex Beregszaszi & Mike Melanson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -215,7 +215,7 @@ static int make_cdt16_entry(int p1, int p2, int16_t *cdt)
     b = cdt[p2];
     r = cdt[p1] << 11;
     lo = b + r;
-    return (lo + (lo << 16)) << 1;
+    return (lo + (lo * (1 << 16))) * 2;
 }
 
 static int make_ydt24_entry(int p1, int p2, int16_t *ydt)
@@ -224,7 +224,7 @@ static int make_ydt24_entry(int p1, int p2, int16_t *ydt)
 
     lo = ydt[p1];
     hi = ydt[p2];
-    return (lo + (hi << 8) + (hi << 16)) << 1;
+    return (lo + (hi * (1 << 8)) + (hi * (1 << 16))) * 2;
 }
 
 static int make_cdt24_entry(int p1, int p2, int16_t *cdt)
@@ -232,8 +232,8 @@ static int make_cdt24_entry(int p1, int p2, int16_t *cdt)
     int r, b;
 
     b = cdt[p2];
-    r = cdt[p1]<<16;
-    return (b+r) << 1;
+    r = cdt[p1] * (1 << 16);
+    return (b+r) * 2;
 }
 
 static void gen_vector_table15(TrueMotion1Context *s, const uint8_t *sel_vector_table)
@@ -396,12 +396,16 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
     }
 
     if (compression_types[header.compression].algorithm == ALGO_RGB24H) {
-        new_pix_fmt = AV_PIX_FMT_RGB32;
+        new_pix_fmt = AV_PIX_FMT_0RGB32;
         width_shift = 1;
     } else
         new_pix_fmt = AV_PIX_FMT_RGB555; // RGB565 is supported as well
 
     s->w >>= width_shift;
+    if (s->w & 1) {
+        avpriv_request_sample(s->avctx, "Frame with odd width");
+        return AVERROR_PATCHWELCOME;
+    }
 
     if (s->w != s->avctx->width || s->h != s->avctx->height ||
         new_pix_fmt != s->avctx->pix_fmt) {
@@ -415,6 +419,8 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
         ff_set_sar(s->avctx, s->avctx->sample_aspect_ratio);
 
         av_fast_malloc(&s->vert_pred, &s->vert_pred_size, s->avctx->width * sizeof(unsigned int));
+        if (!s->vert_pred)
+            return AVERROR(ENOMEM);
     }
 
     /* There is 1 change bit per 4 pixels, so each change byte represents
@@ -483,6 +489,8 @@ static av_cold int truemotion1_decode_init(AVCodecContext *avctx)
     /* there is a vertical predictor for each pixel in a line; each vertical
      * predictor is 0 to start with */
     av_fast_malloc(&s->vert_pred, &s->vert_pred_size, s->avctx->width * sizeof(unsigned int));
+    if (!s->vert_pred)
+        return AVERROR(ENOMEM);
 
     return 0;
 }
@@ -871,10 +879,8 @@ static int truemotion1_decode_frame(AVCodecContext *avctx,
     if ((ret = truemotion1_decode_header(s)) < 0)
         return ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     if (compression_types[s->compression].algorithm == ALGO_RGB24H) {
         truemotion1_decode_24bit(s);
@@ -896,7 +902,7 @@ static av_cold int truemotion1_decode_end(AVCodecContext *avctx)
     TrueMotion1Context *s = avctx->priv_data;
 
     av_frame_free(&s->frame);
-    av_free(s->vert_pred);
+    av_freep(&s->vert_pred);
 
     return 0;
 }
diff --git a/libavcodec/truemotion1data.h b/libavcodec/truemotion1data.h
index e950450fcd..3e581434e4 100644
--- a/libavcodec/truemotion1data.h
+++ b/libavcodec/truemotion1data.h
@@ -6,20 +6,20 @@
  * the GNU LGPL using the common understanding that data tables necessary
  * for decoding algorithms are not necessarily copyrightable.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #ifndef AVCODEC_TRUEMOTION1DATA_H
diff --git a/libavcodec/truemotion2.c b/libavcodec/truemotion2.c
index 84e8e2a9b3..75373a095b 100644
--- a/libavcodec/truemotion2.c
+++ b/libavcodec/truemotion2.c
@@ -2,20 +2,20 @@
  * Duck/ON2 TrueMotion 2 Decoder
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,6 +65,9 @@ typedef struct TM2Context {
     GetBitContext gb;
     BswapDSPContext bdsp;
 
+    uint8_t *buffer;
+    int buffer_size;
+
     /* TM2 streams */
     int *tokens[TM2_NUM_STREAMS];
     int tok_lens[TM2_NUM_STREAMS];
@@ -87,7 +90,7 @@ typedef struct TM2Context {
 * Huffman codes for each of streams
 */
 typedef struct TM2Codes {
-    VLC vlc; ///< table for Libav bitstream reader
+    VLC vlc; ///< table for FFmpeg bitstream reader
     int bits;
     int *recode; ///< table for converting from code indexes to values
     int length;
@@ -168,9 +171,10 @@ static int tm2_build_huff_table(TM2Context *ctx, TM2Codes *code)
 
     /* allocate space for codes - it is exactly ceil(nodes / 2) entries */
     huff.max_num = (huff.nodes + 1) >> 1;
-    huff.nums    = av_mallocz(huff.max_num * sizeof(int));
-    huff.bits    = av_mallocz(huff.max_num * sizeof(uint32_t));
-    huff.lens    = av_mallocz(huff.max_num * sizeof(int));
+    huff.nums    = av_calloc(huff.max_num, sizeof(int));
+    huff.bits    = av_calloc(huff.max_num, sizeof(uint32_t));
+    huff.lens    = av_calloc(huff.max_num, sizeof(int));
+
     if (!huff.nums || !huff.bits || !huff.lens) {
         res = AVERROR(ENOMEM);
         goto out;
@@ -196,7 +200,7 @@ static int tm2_build_huff_table(TM2Context *ctx, TM2Codes *code)
         else {
             code->bits = huff.max_bits;
             code->length = huff.max_num;
-            code->recode = av_malloc(code->length * sizeof(int));
+            code->recode = av_malloc_array(code->length, sizeof(int));
             if (!code->recode) {
                 res = AVERROR(ENOMEM);
                 goto out;
@@ -226,6 +230,8 @@ static inline int tm2_get_token(GetBitContext *gb, TM2Codes *code)
 {
     int val;
     val = get_vlc2(gb, code->vlc.table, code->bits, 1);
+    if(val<0)
+        return -1;
     return code->recode[val];
 }
 
@@ -257,7 +263,8 @@ static int tm2_read_deltas(TM2Context *ctx, int stream_id)
     d  = get_bits(&ctx->gb, 9);
     mb = get_bits(&ctx->gb, 5);
 
-    if ((d < 1) || (d > TM2_DELTAS) || (mb < 1) || (mb > 32)) {
+    av_assert2(mb < 32);
+    if ((d < 1) || (d > TM2_DELTAS) || (mb < 1)) {
         av_log(ctx->avctx, AV_LOG_ERROR, "Incorrect delta table: %i deltas x %i bits\n", d, mb);
         return AVERROR_INVALIDDATA;
     }
@@ -283,6 +290,11 @@ static int tm2_read_stream(TM2Context *ctx, const uint8_t *buf, int stream_id, i
     TM2Codes codes;
     GetByteContext gb;
 
+    if (buf_size < 4) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "not enough space for len left\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* get stream length in dwords */
     bytestream2_init(&gb, buf, buf_size);
     len  = bytestream2_get_be32(&gb);
@@ -291,8 +303,8 @@ static int tm2_read_stream(TM2Context *ctx, const uint8_t *buf, int stream_id, i
     if (len == 0)
         return 4;
 
-    if (len >= INT_MAX/4-1 || len < 0 || len > buf_size) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "Error, invalid stream size.\n");
+    if (len >= INT_MAX/4-1 || len < 0 || skip > buf_size) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "invalid stream size\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -335,7 +347,11 @@ static int tm2_read_stream(TM2Context *ctx, const uint8_t *buf, int stream_id, i
         tm2_free_codes(&codes);
         return AVERROR_INVALIDDATA;
     }
-    ctx->tokens[stream_id]   = av_realloc(ctx->tokens[stream_id], toks * sizeof(int));
+    ret = av_reallocp_array(&ctx->tokens[stream_id], toks, sizeof(int));
+    if (ret < 0) {
+        ctx->tok_lens[stream_id] = 0;
+        return ret;
+    }
     ctx->tok_lens[stream_id] = toks;
     len = bytestream2_get_be32(&gb);
     if (len > 0) {
@@ -349,7 +365,7 @@ static int tm2_read_stream(TM2Context *ctx, const uint8_t *buf, int stream_id, i
                 return AVERROR_INVALIDDATA;
             }
             ctx->tokens[stream_id][i] = tm2_get_token(&ctx->gb, &codes);
-            if (stream_id <= TM2_MOT && ctx->tokens[stream_id][i] >= TM2_DELTAS) {
+            if (stream_id <= TM2_MOT && ctx->tokens[stream_id][i] >= TM2_DELTAS || ctx->tokens[stream_id][i]<0) {
                 av_log(ctx->avctx, AV_LOG_ERROR, "Invalid delta token index %d for type %d, n=%d\n",
                        ctx->tokens[stream_id][i], stream_id, i);
                 return AVERROR_INVALIDDATA;
@@ -376,8 +392,13 @@ static inline int GET_TOK(TM2Context *ctx,int type)
         av_log(ctx->avctx, AV_LOG_ERROR, "Read token from stream %i out of bounds (%i>=%i)\n", type, ctx->tok_ptrs[type], ctx->tok_lens[type]);
         return 0;
     }
-    if (type <= TM2_MOT)
+    if (type <= TM2_MOT) {
+        if (ctx->tokens[type][ctx->tok_ptrs[type]] >= TM2_DELTAS) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "token %d is too large\n", ctx->tokens[type][ctx->tok_ptrs[type]]);
+            return 0;
+        }
         return ctx->deltas[type][ctx->tokens[type][ctx->tok_ptrs[type]++]];
+    }
     return ctx->tokens[type][ctx->tok_ptrs[type]++];
 }
 
@@ -684,6 +705,11 @@ static inline void tm2_motion_block(TM2Context *ctx, AVFrame *pic, int bx, int b
     mx = av_clip(mx, -(bx * 4 + 4), ctx->avctx->width  - bx * 4);
     my = av_clip(my, -(by * 4 + 4), ctx->avctx->height - by * 4);
 
+    if (4*bx+mx<0 || 4*by+my<0 || 4*bx+mx+4 > ctx->avctx->width || 4*by+my+4 > ctx->avctx->height) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "MV out of picture\n");
+        return;
+    }
+
     Yo += my * oYstride + mx;
     Uo += (my >> 1) * oUstride + (mx >> 1);
     Vo += (my >> 1) * oVstride + (mx >> 1);
@@ -854,37 +880,34 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame * const p    = l->pic;
     int offset           = TM2_HEADER_SIZE;
     int i, t, ret;
-    uint8_t *swbuf;
 
-    swbuf = av_malloc(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-    if (!swbuf) {
+    av_fast_padded_malloc(&l->buffer, &l->buffer_size, buf_size);
+    if (!l->buffer) {
         av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer\n");
         return AVERROR(ENOMEM);
     }
 
-    if ((ret = ff_reget_buffer(avctx, p)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        av_free(swbuf);
+    if ((ret = ff_reget_buffer(avctx, p)) < 0)
         return ret;
-    }
 
-    l->bdsp.bswap_buf((uint32_t *) swbuf, (const uint32_t *) buf,
+    l->bdsp.bswap_buf((uint32_t *) l->buffer, (const uint32_t *) buf,
                       buf_size >> 2);
 
-    if ((ret = tm2_read_header(l, swbuf)) < 0) {
-        av_free(swbuf);
+    if ((ret = tm2_read_header(l, l->buffer)) < 0) {
         return ret;
     }
 
     for (i = 0; i < TM2_NUM_STREAMS; i++) {
         if (offset >= buf_size) {
-            av_free(swbuf);
+            av_log(avctx, AV_LOG_ERROR, "no space for tm2_read_stream\n");
             return AVERROR_INVALIDDATA;
         }
-        t = tm2_read_stream(l, swbuf + offset, tm2_stream_order[i],
+
+        t = tm2_read_stream(l, l->buffer + offset, tm2_stream_order[i],
                             buf_size - offset);
         if (t < 0) {
-            av_free(swbuf);
+            int j = tm2_stream_order[i];
+            memset(l->tokens[j], 0, sizeof(**l->tokens) * l->tok_lens[j]);
             return t;
         }
         offset += t;
@@ -898,7 +921,6 @@ static int decode_frame(AVCodecContext *avctx,
     l->cur = !l->cur;
     *got_frame      = 1;
     ret = av_frame_ref(data, l->pic);
-    av_free(swbuf);
 
     return (ret < 0) ? ret : buf_size;
 }
@@ -922,8 +944,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     ff_bswapdsp_init(&l->bdsp);
 
-    l->last  = av_malloc(4 * sizeof(*l->last)  * (w >> 2));
-    l->clast = av_malloc(4 * sizeof(*l->clast) * (w >> 2));
+    l->last  = av_malloc_array(w >> 2, 4 * sizeof(*l->last) );
+    l->clast = av_malloc_array(w >> 2, 4 * sizeof(*l->clast));
 
     for (i = 0; i < TM2_NUM_STREAMS; i++) {
         l->tokens[i] = NULL;
@@ -932,15 +954,15 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     w += 8;
     h += 8;
-    l->Y1_base = av_malloc(sizeof(*l->Y1_base) * w * h);
-    l->Y2_base = av_malloc(sizeof(*l->Y2_base) * w * h);
+    l->Y1_base = av_calloc(w * h, sizeof(*l->Y1_base));
+    l->Y2_base = av_calloc(w * h, sizeof(*l->Y2_base));
     l->y_stride = w;
     w = (w + 1) >> 1;
     h = (h + 1) >> 1;
-    l->U1_base = av_malloc(sizeof(*l->U1_base) * w * h);
-    l->V1_base = av_malloc(sizeof(*l->V1_base) * w * h);
-    l->U2_base = av_malloc(sizeof(*l->U2_base) * w * h);
-    l->V2_base = av_malloc(sizeof(*l->V1_base) * w * h);
+    l->U1_base = av_calloc(w * h, sizeof(*l->U1_base));
+    l->V1_base = av_calloc(w * h, sizeof(*l->V1_base));
+    l->U2_base = av_calloc(w * h, sizeof(*l->U2_base));
+    l->V2_base = av_calloc(w * h, sizeof(*l->V1_base));
     l->uv_stride = w;
     l->cur = 0;
     if (!l->Y1_base || !l->Y2_base || !l->U1_base ||
@@ -954,6 +976,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         av_freep(&l->V2_base);
         av_freep(&l->last);
         av_freep(&l->clast);
+        av_frame_free(&l->pic);
         return AVERROR(ENOMEM);
     }
     l->Y1 = l->Y1_base + l->y_stride  * 4 + 4;
@@ -974,15 +997,17 @@ static av_cold int decode_end(AVCodecContext *avctx)
     av_free(l->last);
     av_free(l->clast);
     for (i = 0; i < TM2_NUM_STREAMS; i++)
-        av_free(l->tokens[i]);
+        av_freep(&l->tokens[i]);
     if (l->Y1) {
-        av_free(l->Y1_base);
-        av_free(l->U1_base);
-        av_free(l->V1_base);
-        av_free(l->Y2_base);
-        av_free(l->U2_base);
-        av_free(l->V2_base);
+        av_freep(&l->Y1_base);
+        av_freep(&l->U1_base);
+        av_freep(&l->V1_base);
+        av_freep(&l->Y2_base);
+        av_freep(&l->U2_base);
+        av_freep(&l->V2_base);
     }
+    av_freep(&l->buffer);
+    l->buffer_size = 0;
 
     av_frame_free(&l->pic);
 
diff --git a/libavcodec/truespeech.c b/libavcodec/truespeech.c
index b2195baef4..d4ddfcbf9c 100644
--- a/libavcodec/truespeech.c
+++ b/libavcodec/truespeech.c
@@ -2,20 +2,20 @@
  * DSP Group TrueSpeech compatible decoder
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -325,10 +325,8 @@ static int truespeech_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = iterations * 240;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
     memset(samples, 0, iterations * 240 * sizeof(*samples));
diff --git a/libavcodec/truespeech_data.h b/libavcodec/truespeech_data.h
index 6e9806a0b5..73ebda5e85 100644
--- a/libavcodec/truespeech_data.h
+++ b/libavcodec/truespeech_data.h
@@ -2,20 +2,20 @@
  * DSP Group TrueSpeech compatible decoder
  * copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/tscc.c b/libavcodec/tscc.c
index 98b5f7161b..6b5e275ace 100644
--- a/libavcodec/tscc.c
+++ b/libavcodec/tscc.c
@@ -2,20 +2,20 @@
  * TechSmith Camtasia decoder
  * Copyright (c) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,6 +47,7 @@
 typedef struct TsccContext {
 
     AVCodecContext *avctx;
+    AVFrame *frame;
 
     // Bits per pixel
     int bpp;
@@ -67,13 +68,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     CamtasiaContext * const c = avctx->priv_data;
-    AVFrame *frame = data;
+    AVFrame *frame = c->frame;
     int ret;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0){
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, frame)) < 0)
         return ret;
-    }
 
     ret = inflateReset(&c->zstream);
     if (ret != Z_OK) {
@@ -109,6 +108,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         memcpy(frame->data[1], c->pal, AVPALETTE_SIZE);
     }
 
+    if ((ret = av_frame_ref(data, frame)) < 0)
+        return ret;
     *got_frame      = 1;
 
     /* always report that the buffer was completely consumed */
@@ -132,9 +133,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
     case 24:
              avctx->pix_fmt = AV_PIX_FMT_BGR24;
              break;
-    case 32: avctx->pix_fmt = AV_PIX_FMT_RGB32; break;
+    case 32: avctx->pix_fmt = AV_PIX_FMT_0RGB32; break;
     default: av_log(avctx, AV_LOG_ERROR, "Camtasia error: unknown depth %i bpp\n", avctx->bits_per_coded_sample);
-             return AVERROR_INVALIDDATA;
+             return AVERROR_PATCHWELCOME;
     }
     c->bpp = avctx->bits_per_coded_sample;
     // buffer size for RLE 'best' case when 2-byte code precedes each pixel and there may be padding after it too
@@ -157,6 +158,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_UNKNOWN;
     }
 
+    c->frame = av_frame_alloc();
+
     return 0;
 }
 
@@ -165,6 +168,7 @@ static av_cold int decode_end(AVCodecContext *avctx)
     CamtasiaContext * const c = avctx->priv_data;
 
     av_freep(&c->decomp_buf);
+    av_frame_free(&c->frame);
 
     inflateEnd(&c->zstream);
 
diff --git a/libavcodec/tscc2.c b/libavcodec/tscc2.c
index d83ccd9d11..9bb7ab3ba3 100644
--- a/libavcodec/tscc2.c
+++ b/libavcodec/tscc2.c
@@ -2,20 +2,20 @@
  * TechSmith Screen Codec 2 (aka Dora) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -90,14 +90,14 @@ static av_cold int init_vlcs(TSCC2Context *c)
     return 0;
 }
 
-#define DEQUANT(val, q) ((q * val + 0x80) >> 8)
+#define DEQUANT(val, q) (((q) * (val) + 0x80) >> 8)
 #define DCT1D(d0, d1, d2, d3, s0, s1, s2, s3, OP) \
     OP(d0, 5 * ((s0) + (s1) + (s2)) + 2 * (s3));  \
     OP(d1, 5 * ((s0) - (s2) - (s3)) + 2 * (s1));  \
     OP(d2, 5 * ((s0) - (s2) + (s3)) - 2 * (s1));  \
     OP(d3, 5 * ((s0) - (s1) + (s2)) - 2 * (s3));  \
 
-#define COL_OP(a, b)  a = b
+#define COL_OP(a, b)  a = (b)
 #define ROW_OP(a, b)  a = ((b) + 0x20) >> 6
 
 static void tscc2_idct4_put(int *in, int q[3], uint8_t *dst, int stride)
@@ -194,7 +194,8 @@ static int tscc2_decode_slice(TSCC2Context *c, int mb_y,
     int i, mb_x, q, ret;
     int off;
 
-    init_get_bits(&c->gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&c->gb, buf, buf_size)) < 0)
+        return ret;
 
     for (mb_x = 0; mb_x < c->mb_width; mb_x++) {
         q = c->slice_quants[mb_x + c->mb_width * mb_y];
@@ -234,7 +235,6 @@ static int tscc2_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
         return ret;
     }
 
diff --git a/libavcodec/tscc2data.h b/libavcodec/tscc2data.h
index 70a06e5e0a..4586da77a5 100644
--- a/libavcodec/tscc2data.h
+++ b/libavcodec/tscc2data.h
@@ -2,20 +2,20 @@
  * TechSmith Screen Codec 2 (aka Dora) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/tta.c b/libavcodec/tta.c
index 7399827f2f..1e2e9c4ef4 100644
--- a/libavcodec/tta.c
+++ b/libavcodec/tta.c
@@ -2,20 +2,20 @@
  * TTA (The Lossless True Audio) decoder
  * Copyright (c) 2006 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,35 +29,23 @@
 
 #define BITSTREAM_READER_LE
 #include <limits.h>
+#include "ttadata.h"
+#include "ttadsp.h"
 #include "avcodec.h"
 #include "get_bits.h"
+#include "thread.h"
+#include "unary.h"
 #include "internal.h"
 #include "libavutil/crc.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
 
 #define FORMAT_SIMPLE    1
 #define FORMAT_ENCRYPTED 2
 
-#define MAX_ORDER 16
-typedef struct TTAFilter {
-    int32_t shift, round, error;
-    int32_t qm[MAX_ORDER];
-    int32_t dx[MAX_ORDER];
-    int32_t dl[MAX_ORDER];
-} TTAFilter;
-
-typedef struct TTARice {
-    uint32_t k0, k1, sum0, sum1;
-} TTARice;
-
-typedef struct TTAChannel {
-    int32_t predictor;
-    TTAFilter filter;
-    TTARice rice;
-} TTAChannel;
-
 typedef struct TTAContext {
+    AVClass *class;
     AVCodecContext *avctx;
-    GetBitContext gb;
     const AVCRC *crc_table;
 
     int format, channels, bps;
@@ -66,128 +54,65 @@ typedef struct TTAContext {
 
     int32_t *decode_buffer;
 
+    uint8_t crc_pass[8];
+    uint8_t *pass;
     TTAChannel *ch_ctx;
+    TTADSPContext dsp;
 } TTAContext;
 
-static const uint32_t shift_1[] = {
-    0x00000001, 0x00000002, 0x00000004, 0x00000008,
-    0x00000010, 0x00000020, 0x00000040, 0x00000080,
-    0x00000100, 0x00000200, 0x00000400, 0x00000800,
-    0x00001000, 0x00002000, 0x00004000, 0x00008000,
-    0x00010000, 0x00020000, 0x00040000, 0x00080000,
-    0x00100000, 0x00200000, 0x00400000, 0x00800000,
-    0x01000000, 0x02000000, 0x04000000, 0x08000000,
-    0x10000000, 0x20000000, 0x40000000, 0x80000000,
-    0x80000000, 0x80000000, 0x80000000, 0x80000000,
-    0x80000000, 0x80000000, 0x80000000, 0x80000000
-};
-
-static const uint32_t * const shift_16 = shift_1 + 4;
-
-static const int32_t ttafilter_configs[4] = {
-    10,
-    9,
-    10,
-    12
+static const int64_t tta_channel_layouts[7] = {
+    AV_CH_LAYOUT_STEREO,
+    AV_CH_LAYOUT_STEREO|AV_CH_LOW_FREQUENCY,
+    AV_CH_LAYOUT_QUAD,
+    0,
+    AV_CH_LAYOUT_5POINT1_BACK,
+    AV_CH_LAYOUT_5POINT1_BACK|AV_CH_BACK_CENTER,
+    AV_CH_LAYOUT_7POINT1_WIDE
 };
 
-static void ttafilter_init(TTAFilter *c, int32_t shift) {
-    memset(c, 0, sizeof(TTAFilter));
-    c->shift = shift;
-   c->round = shift_1[shift-1];
-//    c->round = 1 << (shift - 1);
-}
-
-// FIXME: copy paste from original
-static inline void memshl(register int32_t *a, register int32_t *b) {
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a = *b;
-}
-
-static inline void ttafilter_process(TTAFilter *c, int32_t *in)
+static int tta_check_crc(TTAContext *s, const uint8_t *buf, int buf_size)
 {
-    register int32_t *dl = c->dl, *qm = c->qm, *dx = c->dx, sum = c->round;
-
-    if (!c->error) {
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        dx += 8;
-    } else if(c->error < 0) {
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-    } else {
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-    }
-
-    *(dx-0) = ((*(dl-1) >> 30) | 1) << 2;
-    *(dx-1) = ((*(dl-2) >> 30) | 1) << 1;
-    *(dx-2) = ((*(dl-3) >> 30) | 1) << 1;
-    *(dx-3) = ((*(dl-4) >> 30) | 1);
-
-    c->error = *in;
-    *in += (sum >> c->shift);
-    *dl = *in;
-
-    *(dl-1) = *dl - *(dl-1);
-    *(dl-2) = *(dl-1) - *(dl-2);
-    *(dl-3) = *(dl-2) - *(dl-3);
+    uint32_t crc, CRC;
 
-    memshl(c->dl, c->dl + 1);
-    memshl(c->dx, c->dx + 1);
-}
+    CRC = AV_RL32(buf + buf_size);
+    crc = av_crc(s->crc_table, 0xFFFFFFFFU, buf, buf_size);
+    if (CRC != (crc ^ 0xFFFFFFFFU)) {
+        av_log(s->avctx, AV_LOG_ERROR, "CRC error\n");
+        return AVERROR_INVALIDDATA;
+    }
 
-static void rice_init(TTARice *c, uint32_t k0, uint32_t k1)
-{
-    c->k0 = k0;
-    c->k1 = k1;
-    c->sum0 = shift_16[k0];
-    c->sum1 = shift_16[k1];
+    return 0;
 }
 
-static int tta_get_unary(GetBitContext *gb)
+static uint64_t tta_check_crc64(uint8_t *pass)
 {
-    int ret = 0;
+    uint64_t crc = UINT64_MAX, poly = 0x42F0E1EBA9EA3693U;
+    uint8_t *end = pass + strlen(pass);
+    int i;
+
+    while (pass < end) {
+        crc ^= (uint64_t)*pass++ << 56;
+        for (i = 0; i < 8; i++)
+            crc = (crc << 1) ^ (poly & (((int64_t) crc) >> 63));
+    }
 
-    // count ones
-    while (get_bits_left(gb) > 0 && get_bits1(gb))
-        ret++;
-    return ret;
+    return crc ^ UINT64_MAX;
 }
 
-static int tta_check_crc(TTAContext *s, const uint8_t *buf, int buf_size)
+static int allocate_buffers(AVCodecContext *avctx)
 {
-    uint32_t crc, CRC;
+    TTAContext *s = avctx->priv_data;
 
-    CRC = AV_RL32(buf + buf_size);
-    crc = av_crc(s->crc_table, 0xFFFFFFFFU, buf, buf_size);
-    if (CRC != (crc ^ 0xFFFFFFFFU)) {
-        av_log(s->avctx, AV_LOG_ERROR, "CRC error\n");
-        return AVERROR_INVALIDDATA;
+    if (s->bps < 3) {
+        s->decode_buffer = av_mallocz_array(sizeof(int32_t)*s->frame_length, s->channels);
+        if (!s->decode_buffer)
+            return AVERROR(ENOMEM);
+    } else
+        s->decode_buffer = NULL;
+    s->ch_ctx = av_malloc_array(avctx->channels, sizeof(*s->ch_ctx));
+    if (!s->ch_ctx) {
+        av_freep(&s->decode_buffer);
+        return AVERROR(ENOMEM);
     }
 
     return 0;
@@ -196,58 +121,63 @@ static int tta_check_crc(TTAContext *s, const uint8_t *buf, int buf_size)
 static av_cold int tta_decode_init(AVCodecContext * avctx)
 {
     TTAContext *s = avctx->priv_data;
+    GetBitContext gb;
     int total_frames;
+    int ret;
 
     s->avctx = avctx;
 
-    // 30bytes includes a seektable with one frame
-    if (avctx->extradata_size < 30)
-        return -1;
+    // 30bytes includes TTA1 header
+    if (avctx->extradata_size < 22)
+        return AVERROR_INVALIDDATA;
 
-    init_get_bits(&s->gb, avctx->extradata, avctx->extradata_size * 8);
-    if (show_bits_long(&s->gb, 32) == AV_RL32("TTA1"))
-    {
-        if (avctx->err_recognition & AV_EF_CRCCHECK) {
-            s->crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
-            tta_check_crc(s, avctx->extradata, 18);
-        }
+    s->crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
+    ret = init_get_bits8(&gb, avctx->extradata, avctx->extradata_size);
+    if (ret < 0)
+        return ret;
 
+    if (show_bits_long(&gb, 32) == AV_RL32("TTA1")) {
         /* signature */
-        skip_bits_long(&s->gb, 32);
+        skip_bits_long(&gb, 32);
 
-        s->format = get_bits(&s->gb, 16);
+        s->format = get_bits(&gb, 16);
         if (s->format > 2) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid format\n");
-            return -1;
+            av_log(avctx, AV_LOG_ERROR, "Invalid format\n");
+            return AVERROR_INVALIDDATA;
         }
         if (s->format == FORMAT_ENCRYPTED) {
-            avpriv_report_missing_feature(s->avctx, "Encrypted TTA");
-            return AVERROR_PATCHWELCOME;
+            if (!s->pass) {
+                av_log(avctx, AV_LOG_ERROR, "Missing password for encrypted stream. Please use the -password option\n");
+                return AVERROR(EINVAL);
+            }
+            AV_WL64(s->crc_pass, tta_check_crc64(s->pass));
         }
-        avctx->channels = s->channels = get_bits(&s->gb, 16);
-        avctx->bits_per_coded_sample = get_bits(&s->gb, 16);
-        s->bps = (avctx->bits_per_coded_sample + 7) / 8;
-        avctx->sample_rate = get_bits_long(&s->gb, 32);
-        s->data_length = get_bits_long(&s->gb, 32);
-        skip_bits_long(&s->gb, 32); // CRC32 of header
+        avctx->channels = s->channels = get_bits(&gb, 16);
+        if (s->channels > 1 && s->channels < 9)
+            avctx->channel_layout = tta_channel_layouts[s->channels-2];
+        avctx->bits_per_raw_sample = get_bits(&gb, 16);
+        s->bps = (avctx->bits_per_raw_sample + 7) / 8;
+        avctx->sample_rate = get_bits_long(&gb, 32);
+        s->data_length = get_bits_long(&gb, 32);
+        skip_bits_long(&gb, 32); // CRC32 of header
 
         if (s->channels == 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid number of channels\n");
+            av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n");
             return AVERROR_INVALIDDATA;
         } else if (avctx->sample_rate == 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid samplerate\n");
+            av_log(avctx, AV_LOG_ERROR, "Invalid samplerate\n");
             return AVERROR_INVALIDDATA;
         }
 
         switch(s->bps) {
+        case 1: avctx->sample_fmt = AV_SAMPLE_FMT_U8; break;
         case 2:
             avctx->sample_fmt = AV_SAMPLE_FMT_S16;
-            avctx->bits_per_raw_sample = 16;
             break;
         case 3:
             avctx->sample_fmt = AV_SAMPLE_FMT_S32;
-            avctx->bits_per_raw_sample = 24;
             break;
+        //case 4: avctx->sample_fmt = AV_SAMPLE_FMT_S32; break;
         default:
             av_log(avctx, AV_LOG_ERROR, "Invalid/unsupported sample format.\n");
             return AVERROR_INVALIDDATA;
@@ -264,54 +194,35 @@ static av_cold int tta_decode_init(AVCodecContext * avctx)
         total_frames = s->data_length / s->frame_length +
                        (s->last_frame_length ? 1 : 0);
 
-        av_log(s->avctx, AV_LOG_DEBUG, "format: %d chans: %d bps: %d rate: %d block: %d\n",
+        av_log(avctx, AV_LOG_DEBUG, "format: %d chans: %d bps: %d rate: %d block: %d\n",
             s->format, avctx->channels, avctx->bits_per_coded_sample, avctx->sample_rate,
             avctx->block_align);
-        av_log(s->avctx, AV_LOG_DEBUG, "data_length: %d frame_length: %d last: %d total: %d\n",
+        av_log(avctx, AV_LOG_DEBUG, "data_length: %d frame_length: %d last: %d total: %d\n",
             s->data_length, s->frame_length, s->last_frame_length, total_frames);
 
-        // FIXME: seek table
-        if (avctx->extradata_size <= 26 || total_frames > INT_MAX / 4 ||
-            avctx->extradata_size - 26 < total_frames * 4)
-            av_log(avctx, AV_LOG_WARNING, "Seek table missing or too small\n");
-        else if (avctx->err_recognition & AV_EF_CRCCHECK) {
-            int ret = tta_check_crc(s, avctx->extradata + 22, total_frames * 4);
-            if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE)
-                return AVERROR_INVALIDDATA;
-        }
-        skip_bits_long(&s->gb, 32 * total_frames);
-        skip_bits_long(&s->gb, 32); // CRC32 of seektable
-
         if(s->frame_length >= UINT_MAX / (s->channels * sizeof(int32_t))){
             av_log(avctx, AV_LOG_ERROR, "frame_length too large\n");
-            return -1;
-        }
-
-        if (s->bps == 2) {
-            s->decode_buffer = av_mallocz(sizeof(int32_t)*s->frame_length*s->channels);
-            if (!s->decode_buffer)
-                return AVERROR(ENOMEM);
-        }
-        s->ch_ctx = av_malloc(avctx->channels * sizeof(*s->ch_ctx));
-        if (!s->ch_ctx) {
-            av_freep(&s->decode_buffer);
-            return AVERROR(ENOMEM);
+            return AVERROR_INVALIDDATA;
         }
     } else {
         av_log(avctx, AV_LOG_ERROR, "Wrong extradata present\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
-    return 0;
+    ff_ttadsp_init(&s->dsp);
+
+    return allocate_buffers(avctx);
 }
 
 static int tta_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame_ptr, AVPacket *avpkt)
 {
     AVFrame *frame     = data;
+    ThreadFrame tframe = { .f = data };
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     TTAContext *s = avctx->priv_data;
+    GetBitContext gb;
     int i, ret;
     int cur_chan = 0, framelen = s->frame_length;
     int32_t *p;
@@ -322,14 +233,13 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
             return AVERROR_INVALIDDATA;
     }
 
-    init_get_bits(&s->gb, buf, buf_size*8);
+    if ((ret = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
 
     /* get output buffer */
     frame->nb_samples = framelen;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
         return ret;
-    }
 
     // decode directly to output buffer for 24-bit sample format
     if (s->bps == 3)
@@ -337,9 +247,15 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
 
     // init per channel states
     for (i = 0; i < s->channels; i++) {
+        TTAFilter *filter = &s->ch_ctx[i].filter;
         s->ch_ctx[i].predictor = 0;
-        ttafilter_init(&s->ch_ctx[i].filter, ttafilter_configs[s->bps-1]);
-        rice_init(&s->ch_ctx[i].rice, 10, 10);
+        ff_tta_filter_init(filter, ff_tta_filter_configs[s->bps-1]);
+        if (s->format == FORMAT_ENCRYPTED) {
+            int i;
+            for (i = 0; i < 8; i++)
+                filter->qm[i] = sign_extend(s->crc_pass[i], 8);
+        }
+        ff_tta_rice_init(&s->ch_ctx[i].rice, 10, 10);
     }
 
     i = 0;
@@ -350,7 +266,7 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
         uint32_t unary, depth, k;
         int32_t value;
 
-        unary = tta_get_unary(&s->gb);
+        unary = get_unary(&gb, 0, get_bits_left(&gb));
 
         if (unary == 0) {
             depth = 0;
@@ -361,7 +277,7 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
             unary--;
         }
 
-        if (get_bits_left(&s->gb) < k) {
+        if (get_bits_left(&gb) < k) {
             ret = AVERROR_INVALIDDATA;
             goto error;
         }
@@ -371,7 +287,7 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
                 ret = AVERROR_INVALIDDATA;
                 goto error;
             }
-            value = (unary << k) + get_bits(&s->gb, k);
+            value = (unary << k) + get_bits(&gb, k);
         } else
             value = unary;
 
@@ -379,16 +295,16 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
         switch (depth) {
         case 1:
             rice->sum1 += value - (rice->sum1 >> 4);
-            if (rice->k1 > 0 && rice->sum1 < shift_16[rice->k1])
+            if (rice->k1 > 0 && rice->sum1 < ff_tta_shift_16[rice->k1])
                 rice->k1--;
-            else if(rice->sum1 > shift_16[rice->k1 + 1])
+            else if(rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
                 rice->k1++;
-            value += shift_1[rice->k0];
+            value += ff_tta_shift_1[rice->k0];
         default:
             rice->sum0 += value - (rice->sum0 >> 4);
-            if (rice->k0 > 0 && rice->sum0 < shift_16[rice->k0])
+            if (rice->k0 > 0 && rice->sum0 < ff_tta_shift_16[rice->k0])
                 rice->k0--;
-            else if(rice->sum0 > shift_16[rice->k0 + 1])
+            else if(rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
                 rice->k0++;
         }
 
@@ -396,10 +312,11 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
         *p = 1 + ((value >> 1) ^ ((value & 1) - 1));
 
         // run hybrid filter
-        ttafilter_process(filter, p);
+        s->dsp.ttafilter_process_dec(filter->qm, filter->dx, filter->dl, &filter->error, p,
+                                     filter->shift, filter->round);
 
         // fixed order prediction
-#define PRED(x, k) (int32_t)((((uint64_t)x << k) - x) >> k)
+#define PRED(x, k) (int32_t)((((uint64_t)(x) << (k)) - (x)) >> (k))
         switch (s->bps) {
         case 1: *p += PRED(*predictor, 4); break;
         case 2:
@@ -421,32 +338,43 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
             cur_chan = 0;
             i++;
             // check for last frame
-            if (i == s->last_frame_length && get_bits_left(&s->gb) / 8 == 4) {
+            if (i == s->last_frame_length && get_bits_left(&gb) / 8 == 4) {
                 frame->nb_samples = framelen = s->last_frame_length;
                 break;
             }
         }
     }
 
-    align_get_bits(&s->gb);
-    if (get_bits_left(&s->gb) < 32) {
+    align_get_bits(&gb);
+    if (get_bits_left(&gb) < 32) {
         ret = AVERROR_INVALIDDATA;
         goto error;
     }
-    skip_bits_long(&s->gb, 32); // frame crc
+    skip_bits_long(&gb, 32); // frame crc
 
     // convert to output buffer
-    if (s->bps == 2) {
+    switch (s->bps) {
+    case 1: {
+        uint8_t *samples = (uint8_t *)frame->data[0];
+        for (p = s->decode_buffer; p < s->decode_buffer + (framelen * s->channels); p++)
+            *samples++ = *p + 0x80;
+        break;
+        }
+    case 2: {
         int16_t *samples = (int16_t *)frame->data[0];
         for (p = s->decode_buffer; p < s->decode_buffer + (framelen * s->channels); p++)
             *samples++ = *p;
-    } else {
+        break;
+        }
+    case 3: {
         // shift samples for 24-bit sample format
         int32_t *samples = (int32_t *)frame->data[0];
         for (i = 0; i < framelen * s->channels; i++)
             *samples++ <<= 8;
         // reset decode buffer
         s->decode_buffer = NULL;
+        break;
+        }
     }
 
     *got_frame_ptr = 1;
@@ -459,15 +387,38 @@ error:
     return ret;
 }
 
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    TTAContext *s = avctx->priv_data;
+    s->avctx = avctx;
+    return allocate_buffers(avctx);
+}
+
 static av_cold int tta_decode_close(AVCodecContext *avctx) {
     TTAContext *s = avctx->priv_data;
 
-    av_free(s->decode_buffer);
+    if (s->bps < 3)
+        av_freep(&s->decode_buffer);
+    s->decode_buffer = NULL;
     av_freep(&s->ch_ctx);
 
     return 0;
 }
 
+#define OFFSET(x) offsetof(TTAContext, x)
+#define DEC (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM)
+static const AVOption options[] = {
+    { "password", "Set decoding password", OFFSET(pass), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, DEC },
+    { NULL },
+};
+
+static const AVClass tta_decoder_class = {
+    .class_name = "TTA Decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_tta_decoder = {
     .name           = "tta",
     .long_name      = NULL_IF_CONFIG_SMALL("TTA (True Audio)"),
@@ -477,5 +428,7 @@ AVCodec ff_tta_decoder = {
     .init           = tta_decode_init,
     .close          = tta_decode_close,
     .decode         = tta_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .priv_class     = &tta_decoder_class,
 };
diff --git a/libavcodec/ttadata.c b/libavcodec/ttadata.c
new file mode 100644
index 0000000000..bf793a4cc8
--- /dev/null
+++ b/libavcodec/ttadata.c
@@ -0,0 +1,52 @@
+/*
+ * TTA (The Lossless True Audio) data
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "ttadata.h"
+
+const uint32_t ff_tta_shift_1[] = {
+    0x00000001, 0x00000002, 0x00000004, 0x00000008,
+    0x00000010, 0x00000020, 0x00000040, 0x00000080,
+    0x00000100, 0x00000200, 0x00000400, 0x00000800,
+    0x00001000, 0x00002000, 0x00004000, 0x00008000,
+    0x00010000, 0x00020000, 0x00040000, 0x00080000,
+    0x00100000, 0x00200000, 0x00400000, 0x00800000,
+    0x01000000, 0x02000000, 0x04000000, 0x08000000,
+    0x10000000, 0x20000000, 0x40000000, 0x80000000,
+    0x80000000, 0x80000000, 0x80000000, 0x80000000,
+    0x80000000, 0x80000000, 0x80000000, 0x80000000
+};
+
+const uint32_t * const ff_tta_shift_16 = ff_tta_shift_1 + 4;
+
+const uint8_t ff_tta_filter_configs[] = { 10, 9, 10, 12 };
+
+void ff_tta_rice_init(TTARice *c, uint32_t k0, uint32_t k1)
+{
+    c->k0 = k0;
+    c->k1 = k1;
+    c->sum0 = ff_tta_shift_16[k0];
+    c->sum1 = ff_tta_shift_16[k1];
+}
+
+void ff_tta_filter_init(TTAFilter *c, int32_t shift) {
+    memset(c, 0, sizeof(TTAFilter));
+    c->shift = shift;
+    c->round = ff_tta_shift_1[shift-1];
+}
diff --git a/libavcodec/ttadata.h b/libavcodec/ttadata.h
new file mode 100644
index 0000000000..48c4cd0354
--- /dev/null
+++ b/libavcodec/ttadata.h
@@ -0,0 +1,50 @@
+/*
+ * TTA (The Lossless True Audio) data
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_TTADATA_H
+#define AVCODEC_TTADATA_H
+
+#include "internal.h"
+
+#define MAX_ORDER 16
+typedef struct TTAFilter {
+    int32_t shift, round, error;
+    int32_t qm[MAX_ORDER];
+    int32_t dx[MAX_ORDER];
+    int32_t dl[MAX_ORDER];
+} TTAFilter;
+
+typedef struct TTARice {
+    uint32_t k0, k1, sum0, sum1;
+} TTARice;
+
+typedef struct TTAChannel {
+    int32_t predictor;
+    TTAFilter filter;
+    TTARice rice;
+} TTAChannel;
+
+extern const uint32_t ff_tta_shift_1[];
+extern const uint32_t * const ff_tta_shift_16;
+extern const uint8_t ff_tta_filter_configs[];
+
+void ff_tta_rice_init(TTARice *c, uint32_t k0, uint32_t k1);
+void ff_tta_filter_init(TTAFilter *c, int32_t shift);
+#endif /* AVCODEC_TTADATA_H */
diff --git a/libavcodec/ttadsp.c b/libavcodec/ttadsp.c
new file mode 100644
index 0000000000..30b7ab9eb7
--- /dev/null
+++ b/libavcodec/ttadsp.c
@@ -0,0 +1,57 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "ttadsp.h"
+
+static void ttafilter_process_dec_c(int32_t *qm, int32_t *dx, int32_t *dl,
+                                    int32_t *error, int32_t *in, int32_t shift,
+                                    int32_t round) {
+    if (*error < 0) {
+        qm[0] -= dx[0]; qm[1] -= dx[1]; qm[2] -= dx[2]; qm[3] -= dx[3];
+        qm[4] -= dx[4]; qm[5] -= dx[5]; qm[6] -= dx[6]; qm[7] -= dx[7];
+    } else if (*error > 0) {
+        qm[0] += dx[0]; qm[1] += dx[1]; qm[2] += dx[2]; qm[3] += dx[3];
+        qm[4] += dx[4]; qm[5] += dx[5]; qm[6] += dx[6]; qm[7] += dx[7];
+    }
+
+    round +=    dl[0] * qm[0] + dl[1] * qm[1] + dl[2] * qm[2] + dl[3] * qm[3] +
+                dl[4] * qm[4] + dl[5] * qm[5] + dl[6] * qm[6] + dl[7] * qm[7];
+
+    dx[0] = dx[1]; dx[1] = dx[2]; dx[2] = dx[3]; dx[3] = dx[4];
+    dl[0] = dl[1]; dl[1] = dl[2]; dl[2] = dl[3]; dl[3] = dl[4];
+
+    dx[4] = ((dl[4] >> 30) | 1);
+    dx[5] = ((dl[5] >> 30) | 2) & ~1;
+    dx[6] = ((dl[6] >> 30) | 2) & ~1;
+    dx[7] = ((dl[7] >> 30) | 4) & ~3;
+
+    *error = *in;
+    *in += (round >> shift);
+
+    dl[4] = -dl[5]; dl[5] = -dl[6];
+    dl[6] = *in - dl[7]; dl[7] = *in;
+    dl[5] += dl[6]; dl[4] += dl[5];
+}
+
+av_cold void ff_ttadsp_init(TTADSPContext *c)
+{
+    c->ttafilter_process_dec = ttafilter_process_dec_c;
+
+    if (ARCH_X86)
+        ff_ttadsp_init_x86(c);
+}
diff --git a/libavcodec/ttadsp.h b/libavcodec/ttadsp.h
new file mode 100644
index 0000000000..56930f1c85
--- /dev/null
+++ b/libavcodec/ttadsp.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_TTADSP_H
+#define AVCODEC_TTADSP_H
+
+#include <stdint.h>
+#include "ttadata.h"
+
+typedef struct TTADSPContext {
+    void (*ttafilter_process_dec)(int32_t *qm, int32_t *dx, int32_t *dl,
+                                  int32_t *error, int32_t *in, int32_t shift,
+                                  int32_t round);
+} TTADSPContext;
+
+void ff_ttadsp_init(TTADSPContext *c);
+void ff_ttadsp_init_x86(TTADSPContext *c);
+
+#endif /* AVCODEC_TTADSP_H */
diff --git a/libavcodec/ttaenc.c b/libavcodec/ttaenc.c
new file mode 100644
index 0000000000..0df1fcb6aa
--- /dev/null
+++ b/libavcodec/ttaenc.c
@@ -0,0 +1,232 @@
+/*
+ * TTA (The Lossless True Audio) encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BITSTREAM_WRITER_LE
+#include "ttadata.h"
+#include "avcodec.h"
+#include "put_bits.h"
+#include "internal.h"
+#include "libavutil/crc.h"
+
+typedef struct TTAEncContext {
+    const AVCRC *crc_table;
+    int bps;
+    TTAChannel *ch_ctx;
+} TTAEncContext;
+
+static av_cold int tta_encode_init(AVCodecContext *avctx)
+{
+    TTAEncContext *s = avctx->priv_data;
+
+    s->crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
+
+    switch (avctx->sample_fmt) {
+    case AV_SAMPLE_FMT_U8:
+        avctx->bits_per_raw_sample = 8;
+        break;
+    case AV_SAMPLE_FMT_S16:
+        avctx->bits_per_raw_sample = 16;
+        break;
+    case AV_SAMPLE_FMT_S32:
+        if (avctx->bits_per_raw_sample > 24)
+            av_log(avctx, AV_LOG_WARNING, "encoding as 24 bits-per-sample\n");
+        avctx->bits_per_raw_sample = 24;
+    }
+
+    s->bps = avctx->bits_per_raw_sample >> 3;
+    avctx->frame_size = 256 * avctx->sample_rate / 245;
+
+    s->ch_ctx = av_malloc_array(avctx->channels, sizeof(*s->ch_ctx));
+    if (!s->ch_ctx)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static inline void ttafilter_process(TTAFilter *c, int32_t *in)
+{
+    register int32_t *dl = c->dl, *qm = c->qm, *dx = c->dx, sum = c->round;
+
+    if (c->error < 0) {
+        qm[0] -= dx[0]; qm[1] -= dx[1]; qm[2] -= dx[2]; qm[3] -= dx[3];
+        qm[4] -= dx[4]; qm[5] -= dx[5]; qm[6] -= dx[6]; qm[7] -= dx[7];
+    } else if (c->error > 0) {
+        qm[0] += dx[0]; qm[1] += dx[1]; qm[2] += dx[2]; qm[3] += dx[3];
+        qm[4] += dx[4]; qm[5] += dx[5]; qm[6] += dx[6]; qm[7] += dx[7];
+    }
+
+    sum += dl[0] * qm[0] + dl[1] * qm[1] + dl[2] * qm[2] + dl[3] * qm[3] +
+           dl[4] * qm[4] + dl[5] * qm[5] + dl[6] * qm[6] + dl[7] * qm[7];
+
+    dx[0] = dx[1]; dx[1] = dx[2]; dx[2] = dx[3]; dx[3] = dx[4];
+    dl[0] = dl[1]; dl[1] = dl[2]; dl[2] = dl[3]; dl[3] = dl[4];
+
+    dx[4] = ((dl[4] >> 30) | 1);
+    dx[5] = ((dl[5] >> 30) | 2) & ~1;
+    dx[6] = ((dl[6] >> 30) | 2) & ~1;
+    dx[7] = ((dl[7] >> 30) | 4) & ~3;
+
+    dl[4] = -dl[5]; dl[5] = -dl[6];
+    dl[6] = *in - dl[7]; dl[7] = *in;
+    dl[5] += dl[6]; dl[4] += dl[5];
+
+    *in -= (sum >> c->shift);
+    c->error = *in;
+}
+
+static int32_t get_sample(const AVFrame *frame, int sample,
+                          enum AVSampleFormat format)
+{
+    int32_t ret;
+
+    if (format == AV_SAMPLE_FMT_U8) {
+        ret = frame->data[0][sample] - 0x80;
+    } else if (format == AV_SAMPLE_FMT_S16) {
+        const int16_t *ptr = (const int16_t *)frame->data[0];
+        ret = ptr[sample];
+    } else {
+        const int32_t *ptr = (const int32_t *)frame->data[0];
+        ret = ptr[sample] >> 8;
+    }
+
+    return ret;
+}
+
+static int tta_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                            const AVFrame *frame, int *got_packet_ptr)
+{
+    TTAEncContext *s = avctx->priv_data;
+    PutBitContext pb;
+    int ret, i, out_bytes, cur_chan = 0, res = 0, samples = 0;
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, frame->nb_samples * 2 * avctx->channels * s->bps, 0)) < 0)
+        return ret;
+    init_put_bits(&pb, avpkt->data, avpkt->size);
+
+    // init per channel states
+    for (i = 0; i < avctx->channels; i++) {
+        s->ch_ctx[i].predictor = 0;
+        ff_tta_filter_init(&s->ch_ctx[i].filter, ff_tta_filter_configs[s->bps - 1]);
+        ff_tta_rice_init(&s->ch_ctx[i].rice, 10, 10);
+    }
+
+    for (i = 0; i < frame->nb_samples * avctx->channels; i++) {
+        TTAChannel *c = &s->ch_ctx[cur_chan];
+        TTAFilter *filter = &c->filter;
+        TTARice *rice = &c->rice;
+        uint32_t k, unary, outval;
+        int32_t value, temp;
+
+        value = get_sample(frame, samples++, avctx->sample_fmt);
+
+        if (avctx->channels > 1) {
+            if (cur_chan < avctx->channels - 1)
+                value  = res = get_sample(frame, samples, avctx->sample_fmt) - value;
+            else
+                value -= res / 2;
+        }
+
+        temp = value;
+#define PRED(x, k) (int32_t)((((uint64_t)(x) << (k)) - (x)) >> (k))
+        switch (s->bps) {
+        case 1: value -= PRED(c->predictor, 4); break;
+        case 2:
+        case 3: value -= PRED(c->predictor, 5); break;
+        }
+        c->predictor = temp;
+
+        ttafilter_process(filter, &value);
+        outval = (value > 0) ? (value << 1) - 1: -value << 1;
+
+        k = rice->k0;
+
+        rice->sum0 += outval - (rice->sum0 >> 4);
+        if (rice->k0 > 0 && rice->sum0 < ff_tta_shift_16[rice->k0])
+            rice->k0--;
+        else if (rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
+            rice->k0++;
+
+        if (outval >= ff_tta_shift_1[k]) {
+            outval -= ff_tta_shift_1[k];
+            k = rice->k1;
+
+            rice->sum1 += outval - (rice->sum1 >> 4);
+            if (rice->k1 > 0 && rice->sum1 < ff_tta_shift_16[rice->k1])
+                rice->k1--;
+            else if (rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
+                rice->k1++;
+
+            unary = 1 + (outval >> k);
+            do {
+                if (unary > 31) {
+                    put_bits(&pb, 31, 0x7FFFFFFF);
+                    unary -= 31;
+                } else {
+                    put_bits(&pb, unary, (1 << unary) - 1);
+                    unary = 0;
+                }
+            } while (unary);
+        }
+
+        put_bits(&pb, 1, 0);
+
+        if (k)
+            put_bits(&pb, k, outval & (ff_tta_shift_1[k] - 1));
+
+        if (cur_chan < avctx->channels - 1)
+            cur_chan++;
+        else
+            cur_chan = 0;
+    }
+
+    flush_put_bits(&pb);
+    out_bytes = put_bits_count(&pb) >> 3;
+    put_bits32(&pb, av_crc(s->crc_table, UINT32_MAX, avpkt->data, out_bytes) ^ UINT32_MAX);
+    flush_put_bits(&pb);
+
+    avpkt->pts      = frame->pts;
+    avpkt->size     = out_bytes + 4;
+    avpkt->duration = ff_samples_to_time_base(avctx, frame->nb_samples);
+    *got_packet_ptr = 1;
+    return 0;
+}
+
+static av_cold int tta_encode_close(AVCodecContext *avctx)
+{
+    TTAEncContext *s = avctx->priv_data;
+    av_freep(&s->ch_ctx);
+    return 0;
+}
+
+AVCodec ff_tta_encoder = {
+    .name           = "tta",
+    .long_name      = NULL_IF_CONFIG_SMALL("TTA (True Audio)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_TTA,
+    .priv_data_size = sizeof(TTAEncContext),
+    .init           = tta_encode_init,
+    .close          = tta_encode_close,
+    .encode2        = tta_encode_frame,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_LOSSLESS,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_U8,
+                                                     AV_SAMPLE_FMT_S16,
+                                                     AV_SAMPLE_FMT_S32,
+                                                     AV_SAMPLE_FMT_NONE },
+};
diff --git a/libavcodec/twinvq.c b/libavcodec/twinvq.c
index 940def45f5..7b2e19e536 100644
--- a/libavcodec/twinvq.c
+++ b/libavcodec/twinvq.c
@@ -2,20 +2,20 @@
  * TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -358,7 +358,7 @@ static void imdct_and_window(TwinVQContext *tctx, enum TwinVQFrameType ftype,
 
         mdct->imdct_half(mdct, buf1 + bsize * j, in + bsize * j);
 
-        tctx->fdsp.vector_fmul_window(out2, prev_buf + (bsize - wsize) / 2,
+        tctx->fdsp->vector_fmul_window(out2, prev_buf + (bsize - wsize) / 2,
                                       buf1 + bsize * j,
                                       ff_sine_windows[av_log2(wsize)],
                                       wsize / 2);
@@ -405,7 +405,7 @@ static void imdct_output(TwinVQContext *tctx, enum TwinVQFrameType ftype,
                size1 * sizeof(*out2));
         memcpy(out2 + size1, &tctx->curr_frame[2 * mtab->size],
                size2 * sizeof(*out2));
-        tctx->fdsp.butterflies_float(out1, out2, mtab->size);
+        tctx->fdsp->butterflies_float(out1, out2, mtab->size);
     }
 }
 
@@ -446,7 +446,7 @@ static void read_and_decode_spectrum(TwinVQContext *tctx, float *out,
                                bits->bark_use_hist[i][j], i,
                                tctx->tmp_buf, gain[sub * i + j], ftype);
 
-            tctx->fdsp.vector_fmul(chunk + block_size * j,
+            tctx->fdsp->vector_fmul(chunk + block_size * j,
                                    chunk + block_size * j,
                                    tctx->tmp_buf, block_size);
         }
@@ -461,7 +461,7 @@ static void read_and_decode_spectrum(TwinVQContext *tctx, float *out,
         dec_lpc_spectrum_inv(tctx, lsp, ftype, tctx->tmp_buf);
 
         for (j = 0; j < mtab->fmode[ftype].sub; j++) {
-            tctx->fdsp.vector_fmul(chunk, chunk, tctx->tmp_buf, block_size);
+            tctx->fdsp->vector_fmul(chunk, chunk, tctx->tmp_buf, block_size);
             chunk += block_size;
         }
     }
@@ -487,10 +487,8 @@ int ff_twinvq_decode_frame(AVCodecContext *avctx, void *data,
     /* get output buffer */
     if (tctx->discarded_packets >= 2) {
         frame->nb_samples = mtab->size * tctx->frames_per_packet;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
             return ret;
-        }
         out = (float **)frame->extended_data;
     }
 
@@ -548,24 +546,24 @@ static av_cold int init_mdct_win(TwinVQContext *tctx)
             return ret;
     }
 
-    FF_ALLOC_OR_GOTO(tctx->avctx, tctx->tmp_buf,
-                     mtab->size * sizeof(*tctx->tmp_buf), alloc_fail);
+    FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->tmp_buf,
+                     mtab->size, sizeof(*tctx->tmp_buf), alloc_fail);
 
-    FF_ALLOC_OR_GOTO(tctx->avctx, tctx->spectrum,
-                     2 * mtab->size * channels * sizeof(*tctx->spectrum),
+    FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->spectrum,
+                     2 * mtab->size, channels * sizeof(*tctx->spectrum),
                      alloc_fail);
-    FF_ALLOC_OR_GOTO(tctx->avctx, tctx->curr_frame,
-                     2 * mtab->size * channels * sizeof(*tctx->curr_frame),
+    FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->curr_frame,
+                     2 * mtab->size, channels * sizeof(*tctx->curr_frame),
                      alloc_fail);
-    FF_ALLOC_OR_GOTO(tctx->avctx, tctx->prev_frame,
-                     2 * mtab->size * channels * sizeof(*tctx->prev_frame),
+    FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->prev_frame,
+                     2 * mtab->size, channels * sizeof(*tctx->prev_frame),
                      alloc_fail);
 
     for (i = 0; i < 3; i++) {
         int m       = 4 * mtab->size / mtab->fmode[i].sub;
         double freq = 2 * M_PI / m;
-        FF_ALLOC_OR_GOTO(tctx->avctx, tctx->cos_tabs[i],
-                         (m / 4) * sizeof(*tctx->cos_tabs[i]), alloc_fail);
+        FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->cos_tabs[i],
+                         (m / 4), sizeof(*tctx->cos_tabs[i]), alloc_fail);
 
         for (j = 0; j <= m / 8; j++)
             tctx->cos_tabs[i][j] = cos((2 * j + 1) * freq);
@@ -757,13 +755,14 @@ av_cold int ff_twinvq_decode_close(AVCodecContext *avctx)
 
     for (i = 0; i < 3; i++) {
         ff_mdct_end(&tctx->mdct_ctx[i]);
-        av_free(tctx->cos_tabs[i]);
+        av_freep(&tctx->cos_tabs[i]);
     }
 
-    av_free(tctx->curr_frame);
-    av_free(tctx->spectrum);
-    av_free(tctx->prev_frame);
-    av_free(tctx->tmp_buf);
+    av_freep(&tctx->curr_frame);
+    av_freep(&tctx->spectrum);
+    av_freep(&tctx->prev_frame);
+    av_freep(&tctx->tmp_buf);
+    av_freep(&tctx->fdsp);
 
     return 0;
 }
@@ -790,7 +789,11 @@ av_cold int ff_twinvq_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    avpriv_float_dsp_init(&tctx->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    tctx->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!tctx->fdsp) {
+        ff_twinvq_decode_close(avctx);
+        return AVERROR(ENOMEM);
+    }
     if ((ret = init_mdct_win(tctx))) {
         av_log(avctx, AV_LOG_ERROR, "Error initializing MDCT\n");
         ff_twinvq_decode_close(avctx);
diff --git a/libavcodec/twinvq.h b/libavcodec/twinvq.h
index 3dc0ab46f8..228a4894e2 100644
--- a/libavcodec/twinvq.h
+++ b/libavcodec/twinvq.h
@@ -2,20 +2,20 @@
  * TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -136,7 +136,7 @@ typedef struct TwinVQModeTab {
 
 typedef struct TwinVQContext {
     AVCodecContext *avctx;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext mdct_ctx[3];
 
     const TwinVQModeTab *mtab;
diff --git a/libavcodec/twinvq_data.h b/libavcodec/twinvq_data.h
index 01a54a5ea8..375acc245a 100644
--- a/libavcodec/twinvq_data.h
+++ b/libavcodec/twinvq_data.h
@@ -2,20 +2,20 @@
  * TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/twinvqdec.c b/libavcodec/twinvqdec.c
index b340510192..5f4dd350a0 100644
--- a/libavcodec/twinvqdec.c
+++ b/libavcodec/twinvqdec.c
@@ -2,20 +2,20 @@
  * TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -256,9 +256,10 @@ static int twinvq_read_bitstream(AVCodecContext *avctx, TwinVQContext *tctx,
     int channels              = tctx->avctx->channels;
     int sub;
     GetBitContext gb;
-    int i, j, k;
+    int i, j, k, ret;
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
     skip_bits(&gb, get_bits(&gb, 8));
 
     bits->window_type = get_bits(&gb, TWINVQ_WINDOW_TYPE_BITS);
@@ -312,7 +313,7 @@ static int twinvq_read_bitstream(AVCodecContext *avctx, TwinVQContext *tctx,
         }
     }
 
-    return 0;
+    return (get_bits_count(&gb) + 7) / 8;
 }
 
 static av_cold int twinvq_decode_init(AVCodecContext *avctx)
diff --git a/libavcodec/txd.c b/libavcodec/txd.c
index 463223c963..0c95faaf89 100644
--- a/libavcodec/txd.c
+++ b/libavcodec/txd.c
@@ -4,27 +4,27 @@
  *
  * See also: http://wiki.multimedia.cx/index.php?title=TXD
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
-#include "avcodec.h"
 #include "bytestream.h"
+#include "avcodec.h"
 #include "internal.h"
 #include "texturedsp.h"
 
@@ -76,10 +76,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     avctx->coded_width  = FFALIGN(w, 4);
     avctx->coded_height = FFALIGN(h, 4);
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
 
@@ -92,6 +90,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             v = bytestream2_get_be32(&gb);
             pal[y] = (v >> 8) + (v << 24);
         }
+        if (bytestream2_get_bytes_left(&gb) < w * h)
+            return AVERROR_INVALIDDATA;
         bytestream2_skip(&gb, 4);
         for (y=0; y<h; y++) {
             bytestream2_get_buffer(&gb, ptr, w);
@@ -104,6 +104,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             if (!(flags & 1))
                 goto unsupported;
         case TXD_DXT1:
+            if (bytestream2_get_bytes_left(&gb) < FF_CEIL_RSHIFT(w, 2) * FF_CEIL_RSHIFT(h, 2) * 8)
+                return AVERROR_INVALIDDATA;
             for (j = 0; j < avctx->height; j += 4) {
                 for (i = 0; i < avctx->width; i += 4) {
                     uint8_t *p = ptr + i * 4 + j * stride;
@@ -113,6 +115,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             }
             break;
         case TXD_DXT3:
+            if (bytestream2_get_bytes_left(&gb) < FF_CEIL_RSHIFT(w, 2) * FF_CEIL_RSHIFT(h, 2) * 16)
+                return AVERROR_INVALIDDATA;
             for (j = 0; j < avctx->height; j += 4) {
                 for (i = 0; i < avctx->width; i += 4) {
                     uint8_t *p = ptr + i * 4 + j * stride;
@@ -128,6 +132,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         switch (d3d_format) {
         case 0x15:
         case 0x16:
+            if (bytestream2_get_bytes_left(&gb) < h * w * 4)
+                return AVERROR_INVALIDDATA;
             for (y=0; y<h; y++) {
                 bytestream2_get_buffer(&gb, ptr, w * 4);
                 ptr += stride;
diff --git a/libavcodec/ulti.c b/libavcodec/ulti.c
index 46aa27d5d0..e6f4374981 100644
--- a/libavcodec/ulti.c
+++ b/libavcodec/ulti.c
@@ -2,20 +2,20 @@
  * IBM Ultimotion Video Decoder
  * Copyright (C) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -227,10 +227,8 @@ static int ulti_decode_frame(AVCodecContext *avctx,
     int skip;
     int tmp;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     bytestream2_init(&s->gb, buf, buf_size);
 
diff --git a/libavcodec/ulti_cb.h b/libavcodec/ulti_cb.h
index 0bd83ffd37..7061d839a8 100644
--- a/libavcodec/ulti_cb.h
+++ b/libavcodec/ulti_cb.h
@@ -2,20 +2,20 @@
  * IBM Ultimotion Video Decoder
  * copyright (C) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/unary.h b/libavcodec/unary.h
index d14929f797..908dc93507 100644
--- a/libavcodec/unary.h
+++ b/libavcodec/unary.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index 1721c09129..7bfd7608c3 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,14 +26,17 @@
  */
 
 #include "config.h"
+#include "libavutil/atomic.h"
 #include "libavutil/attributes.h"
 #include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/crc.h"
 #include "libavutil/frame.h"
 #include "libavutil/internal.h"
 #include "libavutil/mathematics.h"
+#include "libavutil/mem_internal.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/samplefmt.h"
@@ -43,34 +46,110 @@
 #include "me_cmp.h"
 #include "mpegvideo.h"
 #include "thread.h"
+#include "frame_thread_encoder.h"
 #include "internal.h"
+#include "raw.h"
 #include "bytestream.h"
 #include "version.h"
 #include <stdlib.h>
 #include <stdarg.h>
 #include <limits.h>
 #include <float.h>
+#if CONFIG_ICONV
+# include <iconv.h>
+#endif
+
+#if HAVE_PTHREADS
+#include <pthread.h>
+#elif HAVE_W32THREADS
+#include "compat/w32pthreads.h"
+#elif HAVE_OS2THREADS
+#include "compat/os2threads.h"
+#endif
+
+#include "libavutil/ffversion.h"
+const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
+
+#if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
+static int default_lockmgr_cb(void **arg, enum AVLockOp op)
+{
+    void * volatile * mutex = arg;
+    int err;
+
+    switch (op) {
+    case AV_LOCK_CREATE:
+        return 0;
+    case AV_LOCK_OBTAIN:
+        if (!*mutex) {
+            pthread_mutex_t *tmp = av_malloc(sizeof(pthread_mutex_t));
+            if (!tmp)
+                return AVERROR(ENOMEM);
+            if ((err = pthread_mutex_init(tmp, NULL))) {
+                av_free(tmp);
+                return AVERROR(err);
+            }
+            if (avpriv_atomic_ptr_cas(mutex, NULL, tmp)) {
+                pthread_mutex_destroy(tmp);
+                av_free(tmp);
+            }
+        }
+
+        if ((err = pthread_mutex_lock(*mutex)))
+            return AVERROR(err);
+
+        return 0;
+    case AV_LOCK_RELEASE:
+        if ((err = pthread_mutex_unlock(*mutex)))
+            return AVERROR(err);
+
+        return 0;
+    case AV_LOCK_DESTROY:
+        if (*mutex)
+            pthread_mutex_destroy(*mutex);
+        av_free(*mutex);
+        avpriv_atomic_ptr_cas(mutex, *mutex, NULL);
+        return 0;
+    }
+    return 1;
+}
+static int (*lockmgr_cb)(void **mutex, enum AVLockOp op) = default_lockmgr_cb;
+#else
+static int (*lockmgr_cb)(void **mutex, enum AVLockOp op) = NULL;
+#endif
 
+
+volatile int ff_avcodec_locked;
 static int volatile entangled_thread_counter = 0;
-static int (*lockmgr_cb)(void **mutex, enum AVLockOp op);
 static void *codec_mutex;
 static void *avformat_mutex;
 
 void av_fast_padded_malloc(void *ptr, unsigned int *size, size_t min_size)
 {
-    void **p = ptr;
+    uint8_t **p = ptr;
     if (min_size > SIZE_MAX - AV_INPUT_BUFFER_PADDING_SIZE) {
         av_freep(p);
         *size = 0;
         return;
     }
-    av_fast_malloc(p, size, min_size + AV_INPUT_BUFFER_PADDING_SIZE);
-    if (*size)
-        memset((uint8_t *)*p + min_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!ff_fast_malloc(p, size, min_size + AV_INPUT_BUFFER_PADDING_SIZE, 1))
+        memset(*p + min_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+}
+
+void av_fast_padded_mallocz(void *ptr, unsigned int *size, size_t min_size)
+{
+    uint8_t **p = ptr;
+    if (min_size > SIZE_MAX - AV_INPUT_BUFFER_PADDING_SIZE) {
+        av_freep(p);
+        *size = 0;
+        return;
+    }
+    if (!ff_fast_malloc(p, size, min_size + AV_INPUT_BUFFER_PADDING_SIZE, 1))
+        memset(*p, 0, min_size + AV_INPUT_BUFFER_PADDING_SIZE);
 }
 
 /* encoder management */
 static AVCodec *first_avcodec = NULL;
+static AVCodec **last_avcodec = &first_avcodec;
 
 AVCodec *av_codec_next(const AVCodec *c)
 {
@@ -106,12 +185,13 @@ av_cold void avcodec_register(AVCodec *codec)
 {
     AVCodec **p;
     avcodec_init();
-    p = &first_avcodec;
-    while (*p)
-        p = &(*p)->next;
-    *p          = codec;
+    p = last_avcodec;
     codec->next = NULL;
 
+    while(*p || avpriv_atomic_ptr_cas((void * volatile *)p, NULL, codec))
+        p = &(*p)->next;
+    last_avcodec = &codec->next;
+
     if (codec->init_static_data)
         codec->init_static_data(codec);
 }
@@ -126,7 +206,10 @@ unsigned avcodec_get_edge_width(void)
 #if FF_API_SET_DIMENSIONS
 void avcodec_set_dimensions(AVCodecContext *s, int width, int height)
 {
-    ff_set_dimensions(s, width, height);
+    int ret = ff_set_dimensions(s, width, height);
+    if (ret < 0) {
+        av_log(s, AV_LOG_WARNING, "Failed to set dimensions %d %d\n", width, height);
+    }
 }
 #endif
 
@@ -136,8 +219,11 @@ int ff_set_dimensions(AVCodecContext *s, int width, int height)
 
     if (ret < 0)
         width = height = 0;
-    s->width  = s->coded_width  = width;
-    s->height = s->coded_height = height;
+
+    s->coded_width  = width;
+    s->coded_height = height;
+    s->width        = FF_CEIL_RSHIFT(width,  s->lowres);
+    s->height       = FF_CEIL_RSHIFT(height, s->lowres);
 
     return ret;
 }
@@ -177,18 +263,18 @@ int ff_side_data_update_matrix_encoding(AVFrame *frame,
     return 0;
 }
 
-#if HAVE_SIMD_ALIGN_16
-#   define STRIDE_ALIGN 16
-#else
-#   define STRIDE_ALIGN 8
-#endif
-
 void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
                                int linesize_align[AV_NUM_DATA_POINTERS])
 {
     int i;
     int w_align = 1;
     int h_align = 1;
+    AVPixFmtDescriptor const *desc = av_pix_fmt_desc_get(s->pix_fmt);
+
+    if (desc) {
+        w_align = 1 << desc->log2_chroma_w;
+        h_align = 1 << desc->log2_chroma_h;
+    }
 
     switch (s->pix_fmt) {
     case AV_PIX_FMT_YUV420P:
@@ -214,47 +300,97 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
     case AV_PIX_FMT_YUV420P9BE:
     case AV_PIX_FMT_YUV420P10LE:
     case AV_PIX_FMT_YUV420P10BE:
+    case AV_PIX_FMT_YUV420P12LE:
+    case AV_PIX_FMT_YUV420P12BE:
+    case AV_PIX_FMT_YUV420P14LE:
+    case AV_PIX_FMT_YUV420P14BE:
+    case AV_PIX_FMT_YUV420P16LE:
+    case AV_PIX_FMT_YUV420P16BE:
+    case AV_PIX_FMT_YUVA420P9LE:
+    case AV_PIX_FMT_YUVA420P9BE:
+    case AV_PIX_FMT_YUVA420P10LE:
+    case AV_PIX_FMT_YUVA420P10BE:
+    case AV_PIX_FMT_YUVA420P16LE:
+    case AV_PIX_FMT_YUVA420P16BE:
     case AV_PIX_FMT_YUV422P9LE:
     case AV_PIX_FMT_YUV422P9BE:
     case AV_PIX_FMT_YUV422P10LE:
     case AV_PIX_FMT_YUV422P10BE:
+    case AV_PIX_FMT_YUV422P12LE:
+    case AV_PIX_FMT_YUV422P12BE:
+    case AV_PIX_FMT_YUV422P14LE:
+    case AV_PIX_FMT_YUV422P14BE:
+    case AV_PIX_FMT_YUV422P16LE:
+    case AV_PIX_FMT_YUV422P16BE:
+    case AV_PIX_FMT_YUVA422P9LE:
+    case AV_PIX_FMT_YUVA422P9BE:
     case AV_PIX_FMT_YUVA422P10LE:
     case AV_PIX_FMT_YUVA422P10BE:
+    case AV_PIX_FMT_YUVA422P16LE:
+    case AV_PIX_FMT_YUVA422P16BE:
+    case AV_PIX_FMT_YUV440P10LE:
+    case AV_PIX_FMT_YUV440P10BE:
+    case AV_PIX_FMT_YUV440P12LE:
+    case AV_PIX_FMT_YUV440P12BE:
     case AV_PIX_FMT_YUV444P9LE:
     case AV_PIX_FMT_YUV444P9BE:
     case AV_PIX_FMT_YUV444P10LE:
     case AV_PIX_FMT_YUV444P10BE:
+    case AV_PIX_FMT_YUV444P12LE:
+    case AV_PIX_FMT_YUV444P12BE:
+    case AV_PIX_FMT_YUV444P14LE:
+    case AV_PIX_FMT_YUV444P14BE:
+    case AV_PIX_FMT_YUV444P16LE:
+    case AV_PIX_FMT_YUV444P16BE:
+    case AV_PIX_FMT_YUVA444P9LE:
+    case AV_PIX_FMT_YUVA444P9BE:
     case AV_PIX_FMT_YUVA444P10LE:
     case AV_PIX_FMT_YUVA444P10BE:
+    case AV_PIX_FMT_YUVA444P16LE:
+    case AV_PIX_FMT_YUVA444P16BE:
     case AV_PIX_FMT_GBRP9LE:
     case AV_PIX_FMT_GBRP9BE:
     case AV_PIX_FMT_GBRP10LE:
     case AV_PIX_FMT_GBRP10BE:
+    case AV_PIX_FMT_GBRP12LE:
+    case AV_PIX_FMT_GBRP12BE:
+    case AV_PIX_FMT_GBRP14LE:
+    case AV_PIX_FMT_GBRP14BE:
+    case AV_PIX_FMT_GBRP16LE:
+    case AV_PIX_FMT_GBRP16BE:
         w_align = 16; //FIXME assume 16 pixel per macroblock
         h_align = 16 * 2; // interlaced needs 2 macroblocks height
         break;
     case AV_PIX_FMT_YUV411P:
+    case AV_PIX_FMT_YUVJ411P:
     case AV_PIX_FMT_UYYVYY411:
         w_align = 32;
-        h_align = 8;
+        h_align = 16 * 2;
         break;
     case AV_PIX_FMT_YUV410P:
         if (s->codec_id == AV_CODEC_ID_SVQ1) {
             w_align = 64;
             h_align = 64;
         }
+        break;
     case AV_PIX_FMT_RGB555:
         if (s->codec_id == AV_CODEC_ID_RPZA) {
             w_align = 4;
             h_align = 4;
         }
+        break;
     case AV_PIX_FMT_PAL8:
     case AV_PIX_FMT_BGR8:
     case AV_PIX_FMT_RGB8:
-        if (s->codec_id == AV_CODEC_ID_SMC) {
+        if (s->codec_id == AV_CODEC_ID_SMC ||
+            s->codec_id == AV_CODEC_ID_CINEPAK) {
             w_align = 4;
             h_align = 4;
         }
+        if (s->codec_id == AV_CODEC_ID_JV) {
+            w_align = 8;
+            h_align = 8;
+        }
         break;
     case AV_PIX_FMT_BGR24:
         if ((s->codec_id == AV_CODEC_ID_MSZH) ||
@@ -263,18 +399,34 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
             h_align = 4;
         }
         break;
+    case AV_PIX_FMT_RGB24:
+        if (s->codec_id == AV_CODEC_ID_CINEPAK) {
+            w_align = 4;
+            h_align = 4;
+        }
+        break;
     default:
-        w_align = 1;
-        h_align = 1;
         break;
     }
 
+    if (s->codec_id == AV_CODEC_ID_IFF_ILBM || s->codec_id == AV_CODEC_ID_IFF_BYTERUN1) {
+        w_align = FFMAX(w_align, 8);
+    }
+
     *width  = FFALIGN(*width, w_align);
     *height = FFALIGN(*height, h_align);
-    if (s->codec_id == AV_CODEC_ID_H264)
+    if (s->codec_id == AV_CODEC_ID_H264 || s->lowres) {
         // some of the optimized chroma MC reads one line too much
+        // which is also done in mpeg decoders with lowres > 0
         *height += 2;
 
+        // H.264 uses edge emulation for out of frame motion vectors, for this
+        // it requires a temporary area large enough to hold a 21x21 block,
+        // increasing witdth ensure that the temporary area is large enough,
+        // the next rounded up width is 32
+        *width = FFMAX(*width, 32);
+    }
+
     for (i = 0; i < 4; i++)
         linesize_align[i] = STRIDE_ALIGN;
 }
@@ -294,6 +446,29 @@ void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height)
     *width              = FFALIGN(*width, align);
 }
 
+int avcodec_enum_to_chroma_pos(int *xpos, int *ypos, enum AVChromaLocation pos)
+{
+    if (pos <= AVCHROMA_LOC_UNSPECIFIED || pos >= AVCHROMA_LOC_NB)
+        return AVERROR(EINVAL);
+    pos--;
+
+    *xpos = (pos&1) * 128;
+    *ypos = ((pos>>1)^(pos<4)) * 128;
+
+    return 0;
+}
+
+enum AVChromaLocation avcodec_chroma_pos_to_enum(int xpos, int ypos)
+{
+    int pos, xout, yout;
+
+    for (pos = AVCHROMA_LOC_UNSPECIFIED + 1; pos < AVCHROMA_LOC_NB; pos++) {
+        if (avcodec_enum_to_chroma_pos(&xout, &yout, pos) == 0 && xout == xpos && yout == ypos)
+            return pos;
+    }
+    return AVCHROMA_LOC_UNSPECIFIED;
+}
+
 int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
                              enum AVSampleFormat sample_fmt, const uint8_t *buf,
                              int buf_size, int align)
@@ -308,7 +483,7 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
 
     planar = av_sample_fmt_is_planar(sample_fmt);
     if (planar && nb_channels > AV_NUM_DATA_POINTERS) {
-        if (!(frame->extended_data = av_mallocz(nb_channels *
+        if (!(frame->extended_data = av_mallocz_array(nb_channels,
                                                 sizeof(*frame->extended_data))))
             return AVERROR(ENOMEM);
     } else {
@@ -316,10 +491,10 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
     }
 
     if ((ret = av_samples_fill_arrays(frame->extended_data, &frame->linesize[0],
-                                      buf, nb_channels, frame->nb_samples,
+                                      (uint8_t *)(intptr_t)buf, nb_channels, frame->nb_samples,
                                       sample_fmt, align)) < 0) {
         if (frame->extended_data != frame->data)
-            av_free(frame->extended_data);
+            av_freep(&frame->extended_data);
         return ret;
     }
     if (frame->extended_data != frame->data) {
@@ -374,7 +549,10 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
             av_buffer_pool_uninit(&pool->pools[i]);
             pool->linesize[i] = picture.linesize[i];
             if (size[i]) {
-                pool->pools[i] = av_buffer_pool_init(size[i] + 16, NULL);
+                pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+                                                     CONFIG_MEMORY_POISONING ?
+                                                        NULL :
+                                                        av_buffer_allocz);
                 if (!pool->pools[i]) {
                     ret = AVERROR(ENOMEM);
                     goto fail;
@@ -388,7 +566,7 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
         break;
         }
     case AVMEDIA_TYPE_AUDIO: {
-        int ch     = av_get_channel_layout_nb_channels(frame->channel_layout);
+        int ch     = av_frame_get_channels(frame); //av_get_channel_layout_nb_channels(frame->channel_layout);
         int planar = av_sample_fmt_is_planar(frame->format);
         int planes = planar ? ch : 1;
 
@@ -435,17 +613,19 @@ static int audio_get_buffer(AVCodecContext *avctx, AVFrame *frame)
     frame->linesize[0] = pool->linesize[0];
 
     if (planes > AV_NUM_DATA_POINTERS) {
-        frame->extended_data = av_mallocz(planes * sizeof(*frame->extended_data));
+        frame->extended_data = av_mallocz_array(planes, sizeof(*frame->extended_data));
         frame->nb_extended_buf = planes - AV_NUM_DATA_POINTERS;
-        frame->extended_buf  = av_mallocz(frame->nb_extended_buf *
+        frame->extended_buf  = av_mallocz_array(frame->nb_extended_buf,
                                           sizeof(*frame->extended_buf));
         if (!frame->extended_data || !frame->extended_buf) {
             av_freep(&frame->extended_data);
             av_freep(&frame->extended_buf);
             return AVERROR(ENOMEM);
         }
-    } else
+    } else {
         frame->extended_data = frame->data;
+        av_assert0(frame->nb_extended_buf == 0);
+    }
 
     for (i = 0; i < FFMIN(planes, AV_NUM_DATA_POINTERS); i++) {
         frame->buf[i] = av_buffer_pool_get(pool->pools[0]);
@@ -507,6 +687,29 @@ fail:
     return AVERROR(ENOMEM);
 }
 
+void avpriv_color_frame(AVFrame *frame, const int c[4])
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+    int p, y, x;
+
+    av_assert0(desc->flags & AV_PIX_FMT_FLAG_PLANAR);
+
+    for (p = 0; p<desc->nb_components; p++) {
+        uint8_t *dst = frame->data[p];
+        int is_chroma = p == 1 || p == 2;
+        int bytes  = is_chroma ? FF_CEIL_RSHIFT(frame->width,  desc->log2_chroma_w) : frame->width;
+        int height = is_chroma ? FF_CEIL_RSHIFT(frame->height, desc->log2_chroma_h) : frame->height;
+        for (y = 0; y < height; y++) {
+            if (desc->comp[0].depth >= 9) {
+                for (x = 0; x<bytes; x++)
+                    ((uint16_t*)dst)[x] = c[p];
+            }else
+                memset(dst, c[p], bytes);
+            dst += frame->linesize[p];
+        }
+    }
+}
+
 int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags)
 {
     int ret;
@@ -524,11 +727,23 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags
     }
 }
 
-int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame)
+static int add_metadata_from_side_data(AVPacket *avpkt, AVFrame *frame)
+{
+    int size;
+    const uint8_t *side_metadata;
+
+    AVDictionary **frame_md = avpriv_frame_get_metadatap(frame);
+
+    side_metadata = av_packet_get_side_data(avpkt,
+                                            AV_PKT_DATA_STRINGS_METADATA, &size);
+    return av_packet_unpack_dictionary(side_metadata, size, frame_md);
+}
+
+int ff_init_buffer_info(AVCodecContext *avctx, AVFrame *frame)
 {
     AVPacket *pkt = avctx->internal->pkt;
     int i;
-    struct {
+    static const struct {
         enum AVPacketSideDataType packet;
         enum AVFrameSideDataType frame;
     } sd[] = {
@@ -538,56 +753,53 @@ int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame)
         { AV_PKT_DATA_AUDIO_SERVICE_TYPE, AV_FRAME_DATA_AUDIO_SERVICE_TYPE },
     };
 
-    frame->color_primaries = avctx->color_primaries;
-    frame->color_trc       = avctx->color_trc;
-    frame->colorspace      = avctx->colorspace;
-    frame->color_range     = avctx->color_range;
-    frame->chroma_location = avctx->chroma_sample_location;
-
-    frame->reordered_opaque = avctx->reordered_opaque;
-    if (!pkt) {
-        frame->pkt_pts = AV_NOPTS_VALUE;
-        return 0;
-    }
-
-    frame->pkt_pts = pkt->pts;
-
-    for (i = 0; i < FF_ARRAY_ELEMS(sd); i++) {
-        int size;
-        uint8_t *packet_sd = av_packet_get_side_data(pkt, sd[i].packet, &size);
-        if (packet_sd) {
-            AVFrameSideData *frame_sd = av_frame_new_side_data(frame,
-                                                               sd[i].frame,
-                                                               size);
-            if (!frame_sd)
-                return AVERROR(ENOMEM);
-
-            memcpy(frame_sd->data, packet_sd, size);
+    if (pkt) {
+        frame->pkt_pts = pkt->pts;
+        av_frame_set_pkt_pos     (frame, pkt->pos);
+        av_frame_set_pkt_duration(frame, pkt->duration);
+        av_frame_set_pkt_size    (frame, pkt->size);
+
+        for (i = 0; i < FF_ARRAY_ELEMS(sd); i++) {
+            int size;
+            uint8_t *packet_sd = av_packet_get_side_data(pkt, sd[i].packet, &size);
+            if (packet_sd) {
+                AVFrameSideData *frame_sd = av_frame_new_side_data(frame,
+                                                                   sd[i].frame,
+                                                                   size);
+                if (!frame_sd)
+                    return AVERROR(ENOMEM);
+
+                memcpy(frame_sd->data, packet_sd, size);
+            }
         }
+        add_metadata_from_side_data(pkt, frame);
+    } else {
+        frame->pkt_pts = AV_NOPTS_VALUE;
+        av_frame_set_pkt_pos     (frame, -1);
+        av_frame_set_pkt_duration(frame, 0);
+        av_frame_set_pkt_size    (frame, -1);
     }
+    frame->reordered_opaque = avctx->reordered_opaque;
 
-    return 0;
-}
-
-int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
-{
-    const AVHWAccel *hwaccel = avctx->hwaccel;
-    int override_dimensions = 1;
-    int ret;
-
-    switch (avctx->codec_type) {
+    if (frame->color_primaries == AVCOL_PRI_UNSPECIFIED)
+        frame->color_primaries = avctx->color_primaries;
+    if (frame->color_trc == AVCOL_TRC_UNSPECIFIED)
+        frame->color_trc = avctx->color_trc;
+    if (av_frame_get_colorspace(frame) == AVCOL_SPC_UNSPECIFIED)
+        av_frame_set_colorspace(frame, avctx->colorspace);
+    if (av_frame_get_color_range(frame) == AVCOL_RANGE_UNSPECIFIED)
+        av_frame_set_color_range(frame, avctx->color_range);
+    if (frame->chroma_location == AVCHROMA_LOC_UNSPECIFIED)
+        frame->chroma_location = avctx->chroma_sample_location;
+
+    switch (avctx->codec->type) {
     case AVMEDIA_TYPE_VIDEO:
-        if (frame->width <= 0 || frame->height <= 0) {
-            frame->width  = FFMAX(avctx->width, avctx->coded_width);
-            frame->height = FFMAX(avctx->height, avctx->coded_height);
-            override_dimensions = 0;
-        }
-        if (frame->format < 0)
-            frame->format              = avctx->pix_fmt;
+        frame->format              = avctx->pix_fmt;
         if (!frame->sample_aspect_ratio.num)
             frame->sample_aspect_ratio = avctx->sample_aspect_ratio;
 
-        if (av_image_check_sar(frame->width, frame->height,
+        if (frame->width && frame->height &&
+            av_image_check_sar(frame->width, frame->height,
                                frame->sample_aspect_ratio) < 0) {
             av_log(avctx, AV_LOG_WARNING, "ignoring invalid SAR: %u/%u\n",
                    frame->sample_aspect_ratio.num,
@@ -595,8 +807,6 @@ int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
             frame->sample_aspect_ratio = (AVRational){ 0, 1 };
         }
 
-        if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
-            return ret;
         break;
     case AVMEDIA_TYPE_AUDIO:
         if (!frame->sample_rate)
@@ -619,16 +829,38 @@ int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
                            avctx->channels);
                     return AVERROR(ENOSYS);
                 }
-
-                frame->channel_layout = av_get_default_channel_layout(avctx->channels);
-                if (!frame->channel_layout)
-                    frame->channel_layout = (1ULL << avctx->channels) - 1;
             }
         }
+        av_frame_set_channels(frame, avctx->channels);
         break;
-    default: return AVERROR(EINVAL);
     }
+    return 0;
+}
+
+int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame)
+{
+    return ff_init_buffer_info(avctx, frame);
+}
 
+static int get_buffer_internal(AVCodecContext *avctx, AVFrame *frame, int flags)
+{
+    const AVHWAccel *hwaccel = avctx->hwaccel;
+    int override_dimensions = 1;
+    int ret;
+
+    if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
+        if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0 || avctx->pix_fmt<0) {
+            av_log(avctx, AV_LOG_ERROR, "video_get_buffer: image parameters invalid\n");
+            return AVERROR(EINVAL);
+        }
+    }
+    if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
+        if (frame->width <= 0 || frame->height <= 0) {
+            frame->width  = FFMAX(avctx->width,  FF_CEIL_RSHIFT(avctx->coded_width,  avctx->lowres));
+            frame->height = FFMAX(avctx->height, FF_CEIL_RSHIFT(avctx->coded_height, avctx->lowres));
+            override_dimensions = 0;
+        }
+    }
     ret = ff_decode_frame_props(avctx, frame);
     if (ret < 0)
         return ret;
@@ -652,13 +884,29 @@ end:
     return ret;
 }
 
-int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame)
+int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
+{
+    int ret = get_buffer_internal(avctx, frame, flags);
+    if (ret < 0)
+        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    return ret;
+}
+
+static int reget_buffer_internal(AVCodecContext *avctx, AVFrame *frame)
 {
     AVFrame *tmp;
     int ret;
 
     av_assert0(avctx->codec_type == AVMEDIA_TYPE_VIDEO);
 
+    if (frame->data[0] && (frame->width != avctx->width || frame->height != avctx->height || frame->format != avctx->pix_fmt)) {
+        av_log(avctx, AV_LOG_WARNING, "Picture changed from size:%dx%d fmt:%s to size:%dx%d fmt:%s in reget buffer()\n",
+               frame->width, frame->height, av_get_pix_fmt_name(frame->format), avctx->width, avctx->height, av_get_pix_fmt_name(avctx->pix_fmt));
+        av_frame_unref(frame);
+    }
+
+    ff_init_buffer_info(avctx, frame);
+
     if (!frame->data[0])
         return ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF);
 
@@ -683,6 +931,14 @@ int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame)
     return 0;
 }
 
+int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame)
+{
+    int ret = reget_buffer_internal(avctx, frame);
+    if (ret < 0)
+        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    return ret;
+}
+
 int avcodec_default_execute(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2), void *arg, int *ret, int count, int size)
 {
     int i;
@@ -707,6 +963,17 @@ int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2,
     return 0;
 }
 
+enum AVPixelFormat avpriv_find_pix_fmt(const PixelFormatTag *tags,
+                                       unsigned int fourcc)
+{
+    while (tags->pix_fmt >= 0) {
+        if (tags->fourcc == fourcc)
+            return tags->pix_fmt;
+        tags++;
+    }
+    return AV_PIX_FMT_NONE;
+}
+
 static int is_hwaccel_pix_fmt(enum AVPixelFormat pix_fmt)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
@@ -746,6 +1013,13 @@ static int setup_hwaccel(AVCodecContext *avctx,
         return AVERROR(ENOENT);
     }
 
+    if (hwa->capabilities & HWACCEL_CODEC_CAP_EXPERIMENTAL &&
+        avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
+        av_log(avctx, AV_LOG_WARNING, "Ignoring experimental hwaccel: %s\n",
+               hwa->name);
+        return AVERROR_PATCHWELCOME;
+    }
+
     if (hwa->priv_data_size) {
         avctx->internal->hwaccel_priv_data = av_mallocz(hwa->priv_data_size);
         if (!avctx->internal->hwaccel_priv_data)
@@ -801,6 +1075,10 @@ int ff_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
 
         if (!(desc->flags & AV_PIX_FMT_FLAG_HWACCEL))
             break;
+#if FF_API_CAP_VDPAU
+        if (avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU)
+            break;
+#endif
 
         if (!setup_hwaccel(avctx, ret, desc->name))
             break;
@@ -818,6 +1096,63 @@ int ff_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
     return ret;
 }
 
+MAKE_ACCESSORS(AVCodecContext, codec, AVRational, pkt_timebase)
+MAKE_ACCESSORS(AVCodecContext, codec, const AVCodecDescriptor *, codec_descriptor)
+MAKE_ACCESSORS(AVCodecContext, codec, int, lowres)
+MAKE_ACCESSORS(AVCodecContext, codec, int, seek_preroll)
+MAKE_ACCESSORS(AVCodecContext, codec, uint16_t*, chroma_intra_matrix)
+
+unsigned av_codec_get_codec_properties(const AVCodecContext *codec)
+{
+    return codec->properties;
+}
+
+int av_codec_get_max_lowres(const AVCodec *codec)
+{
+    return codec->max_lowres;
+}
+
+static void get_subtitle_defaults(AVSubtitle *sub)
+{
+    memset(sub, 0, sizeof(*sub));
+    sub->pts = AV_NOPTS_VALUE;
+}
+
+static int64_t get_bit_rate(AVCodecContext *ctx)
+{
+    int64_t bit_rate;
+    int bits_per_sample;
+
+    switch (ctx->codec_type) {
+    case AVMEDIA_TYPE_VIDEO:
+    case AVMEDIA_TYPE_DATA:
+    case AVMEDIA_TYPE_SUBTITLE:
+    case AVMEDIA_TYPE_ATTACHMENT:
+        bit_rate = ctx->bit_rate;
+        break;
+    case AVMEDIA_TYPE_AUDIO:
+        bits_per_sample = av_get_bits_per_sample(ctx->codec_id);
+        bit_rate = bits_per_sample ? ctx->sample_rate * ctx->channels * bits_per_sample : ctx->bit_rate;
+        break;
+    default:
+        bit_rate = 0;
+        break;
+    }
+    return bit_rate;
+}
+
+int attribute_align_arg ff_codec_open2_recursive(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options)
+{
+    int ret = 0;
+
+    ff_unlock_avcodec(codec);
+
+    ret = avcodec_open2(avctx, codec, options);
+
+    ff_lock_avcodec(avctx, codec);
+    return ret;
+}
+
 int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options)
 {
     int ret = 0;
@@ -827,12 +1162,12 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
         return 0;
 
     if ((!codec && !avctx->codec)) {
-        av_log(avctx, AV_LOG_ERROR, "No codec provided to avcodec_open2().\n");
+        av_log(avctx, AV_LOG_ERROR, "No codec provided to avcodec_open2()\n");
         return AVERROR(EINVAL);
     }
     if ((codec && avctx->codec && codec != avctx->codec)) {
         av_log(avctx, AV_LOG_ERROR, "This AVCodecContext was allocated for %s, "
-                                    "but %s passed to avcodec_open2().\n", avctx->codec->name, codec->name);
+                                    "but %s passed to avcodec_open2()\n", avctx->codec->name, codec->name);
         return AVERROR(EINVAL);
     }
     if (!codec)
@@ -844,22 +1179,9 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
     if (options)
         av_dict_copy(&tmp, *options, 0);
 
-    /* If there is a user-supplied mutex locking routine, call it. */
-    if (lockmgr_cb) {
-        if ((*lockmgr_cb)(&codec_mutex, AV_LOCK_OBTAIN))
-            return -1;
-    }
-
-    entangled_thread_counter++;
-    if (entangled_thread_counter != 1 &&
-        !(codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE)) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Insufficient thread locking. At least %d threads are "
-               "calling avcodec_open2() at the same time right now.\n",
-               entangled_thread_counter);
-        ret = -1;
-        goto end;
-    }
+    ret = ff_lock_avcodec(avctx, codec);
+    if (ret < 0)
+        return ret;
 
     avctx->internal = av_mallocz(sizeof(AVCodecInternal));
     if (!avctx->internal) {
@@ -899,17 +1221,27 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
     if ((ret = av_opt_set_dict(avctx, &tmp)) < 0)
         goto free_and_end;
 
-    if (avctx->coded_width && avctx->coded_height && !avctx->width && !avctx->height)
+    if (avctx->codec_whitelist && av_match_list(codec->name, avctx->codec_whitelist, ',') <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Codec (%s) not on whitelist\n", codec->name);
+        ret = AVERROR(EINVAL);
+        goto free_and_end;
+    }
+
+    // only call ff_set_dimensions() for non H.264/VP6F/DXV codecs so as not to overwrite previously setup dimensions
+    if (!(avctx->coded_width && avctx->coded_height && avctx->width && avctx->height &&
+          (avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_VP6F || avctx->codec_id == AV_CODEC_ID_DXV))) {
+    if (avctx->coded_width && avctx->coded_height)
         ret = ff_set_dimensions(avctx, avctx->coded_width, avctx->coded_height);
     else if (avctx->width && avctx->height)
         ret = ff_set_dimensions(avctx, avctx->width, avctx->height);
     if (ret < 0)
         goto free_and_end;
+    }
 
     if ((avctx->coded_width || avctx->coded_height || avctx->width || avctx->height)
         && (  av_image_check_size(avctx->coded_width, avctx->coded_height, 0, avctx) < 0
            || av_image_check_size(avctx->width,       avctx->height,       0, avctx) < 0)) {
-        av_log(avctx, AV_LOG_WARNING, "ignoring invalid width/height values\n");
+        av_log(avctx, AV_LOG_WARNING, "Ignoring invalid width/height values\n");
         ff_set_dimensions(avctx, 0, 0);
     }
 
@@ -941,14 +1273,25 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
     }
     if (avctx->codec_id != codec->id || (avctx->codec_type != codec->type
                                          && avctx->codec_type != AVMEDIA_TYPE_ATTACHMENT)) {
-        av_log(avctx, AV_LOG_ERROR, "codec type or id mismatches\n");
+        av_log(avctx, AV_LOG_ERROR, "Codec type or id mismatches\n");
         ret = AVERROR(EINVAL);
         goto free_and_end;
     }
     avctx->frame_number = 0;
+    avctx->codec_descriptor = avcodec_descriptor_get(avctx->codec_id);
 
     if ((avctx->codec->capabilities & AV_CODEC_CAP_EXPERIMENTAL) &&
         avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
+        const char *codec_string = av_codec_is_encoder(codec) ? "encoder" : "decoder";
+        AVCodec *codec2;
+        av_log(avctx, AV_LOG_ERROR,
+               "The %s '%s' is experimental but experimental codecs are not enabled, "
+               "add '-strict %d' if you want to use it.\n",
+               codec_string, codec->name, FF_COMPLIANCE_EXPERIMENTAL);
+        codec2 = av_codec_is_encoder(codec) ? avcodec_find_encoder(codec->id) : avcodec_find_decoder(codec->id);
+        if (!(codec2->capabilities & AV_CODEC_CAP_EXPERIMENTAL))
+            av_log(avctx, AV_LOG_ERROR, "Alternatively use the non experimental %s '%s'.\n",
+                codec_string, codec2->name);
         ret = AVERROR_EXPERIMENTAL;
         goto free_and_end;
     }
@@ -959,7 +1302,19 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
         avctx->time_base.den = avctx->sample_rate;
     }
 
-    if (HAVE_THREADS) {
+    if (!HAVE_THREADS)
+        av_log(avctx, AV_LOG_WARNING, "Warning: not compiled with thread support, using thread emulation\n");
+
+    if (CONFIG_FRAME_THREAD_ENCODER) {
+        ff_unlock_avcodec(codec); //we will instanciate a few encoders thus kick the counter to prevent false detection of a problem
+        ret = ff_frame_thread_encoder_init(avctx, options ? *options : NULL);
+        ff_lock_avcodec(avctx, codec);
+        if (ret < 0)
+            goto free_and_end;
+    }
+
+    if (HAVE_THREADS
+        && !(avctx->internal->frame_thread_encoder && (avctx->active_thread_type&FF_THREAD_FRAME))) {
         ret = ff_thread_init(avctx);
         if (ret < 0) {
             goto free_and_end;
@@ -968,6 +1323,19 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
     if (!HAVE_THREADS && !(codec->capabilities & AV_CODEC_CAP_AUTO_THREADS))
         avctx->thread_count = 1;
 
+    if (avctx->codec->max_lowres < avctx->lowres || avctx->lowres < 0) {
+        av_log(avctx, AV_LOG_ERROR, "The maximum value for lowres supported by the decoder is %d\n",
+               avctx->codec->max_lowres);
+        ret = AVERROR(EINVAL);
+        goto free_and_end;
+    }
+
+#if FF_API_VISMV
+    if (avctx->debug_mv)
+        av_log(avctx, AV_LOG_WARNING, "The 'vismv' option is deprecated, "
+               "see the codecview filter instead.\n");
+#endif
+
     if (av_codec_is_encoder(avctx->codec)) {
         int i;
 #if FF_API_CODED_FRAME
@@ -991,7 +1359,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 }
             }
             if (avctx->codec->sample_fmts[i] == AV_SAMPLE_FMT_NONE) {
-                av_log(avctx, AV_LOG_ERROR, "Specified sample_fmt is not supported.\n");
+                char buf[128];
+                snprintf(buf, sizeof(buf), "%d", avctx->sample_fmt);
+                av_log(avctx, AV_LOG_ERROR, "Specified sample format %s is invalid or not supported\n",
+                       (char *)av_x_if_null(av_get_sample_fmt_name(avctx->sample_fmt), buf));
                 ret = AVERROR(EINVAL);
                 goto free_and_end;
             }
@@ -1000,12 +1371,18 @@ FF_ENABLE_DEPRECATION_WARNINGS
             for (i = 0; avctx->codec->pix_fmts[i] != AV_PIX_FMT_NONE; i++)
                 if (avctx->pix_fmt == avctx->codec->pix_fmts[i])
                     break;
-            if (avctx->codec->pix_fmts[i] == AV_PIX_FMT_NONE) {
-                av_log(avctx, AV_LOG_ERROR, "Specified pix_fmt is not supported\n");
+            if (avctx->codec->pix_fmts[i] == AV_PIX_FMT_NONE
+                && !((avctx->codec_id == AV_CODEC_ID_MJPEG || avctx->codec_id == AV_CODEC_ID_LJPEG)
+                     && avctx->strict_std_compliance <= FF_COMPLIANCE_UNOFFICIAL)) {
+                char buf[128];
+                snprintf(buf, sizeof(buf), "%d", avctx->pix_fmt);
+                av_log(avctx, AV_LOG_ERROR, "Specified pixel format %s is invalid or not supported\n",
+                       (char *)av_x_if_null(av_get_pix_fmt_name(avctx->pix_fmt), buf));
                 ret = AVERROR(EINVAL);
                 goto free_and_end;
             }
             if (avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ420P ||
+                avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ411P ||
                 avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ422P ||
                 avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ440P ||
                 avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ444P)
@@ -1016,60 +1393,110 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 if (avctx->sample_rate == avctx->codec->supported_samplerates[i])
                     break;
             if (avctx->codec->supported_samplerates[i] == 0) {
-                av_log(avctx, AV_LOG_ERROR, "Specified sample_rate is not supported\n");
+                av_log(avctx, AV_LOG_ERROR, "Specified sample rate %d is not supported\n",
+                       avctx->sample_rate);
                 ret = AVERROR(EINVAL);
                 goto free_and_end;
             }
         }
+        if (avctx->sample_rate < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Specified sample rate %d is not supported\n",
+                    avctx->sample_rate);
+            ret = AVERROR(EINVAL);
+            goto free_and_end;
+        }
         if (avctx->codec->channel_layouts) {
             if (!avctx->channel_layout) {
-                av_log(avctx, AV_LOG_WARNING, "channel_layout not specified\n");
+                av_log(avctx, AV_LOG_WARNING, "Channel layout not specified\n");
             } else {
                 for (i = 0; avctx->codec->channel_layouts[i] != 0; i++)
                     if (avctx->channel_layout == avctx->codec->channel_layouts[i])
                         break;
                 if (avctx->codec->channel_layouts[i] == 0) {
-                    av_log(avctx, AV_LOG_ERROR, "Specified channel_layout is not supported\n");
+                    char buf[512];
+                    av_get_channel_layout_string(buf, sizeof(buf), -1, avctx->channel_layout);
+                    av_log(avctx, AV_LOG_ERROR, "Specified channel layout '%s' is not supported\n", buf);
                     ret = AVERROR(EINVAL);
                     goto free_and_end;
                 }
             }
         }
         if (avctx->channel_layout && avctx->channels) {
-            if (av_get_channel_layout_nb_channels(avctx->channel_layout) != avctx->channels) {
-                av_log(avctx, AV_LOG_ERROR, "channel layout does not match number of channels\n");
+            int channels = av_get_channel_layout_nb_channels(avctx->channel_layout);
+            if (channels != avctx->channels) {
+                char buf[512];
+                av_get_channel_layout_string(buf, sizeof(buf), -1, avctx->channel_layout);
+                av_log(avctx, AV_LOG_ERROR,
+                       "Channel layout '%s' with %d channels does not match number of specified channels %d\n",
+                       buf, channels, avctx->channels);
                 ret = AVERROR(EINVAL);
                 goto free_and_end;
             }
         } else if (avctx->channel_layout) {
             avctx->channels = av_get_channel_layout_nb_channels(avctx->channel_layout);
         }
+        if (avctx->channels < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Specified number of channels %d is not supported\n",
+                    avctx->channels);
+            ret = AVERROR(EINVAL);
+            goto free_and_end;
+        }
+        if(avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
+            if (avctx->width <= 0 || avctx->height <= 0) {
+                av_log(avctx, AV_LOG_ERROR, "dimensions not set\n");
+                ret = AVERROR(EINVAL);
+                goto free_and_end;
+            }
+        }
+        if (   (avctx->codec_type == AVMEDIA_TYPE_VIDEO || avctx->codec_type == AVMEDIA_TYPE_AUDIO)
+            && avctx->bit_rate>0 && avctx->bit_rate<1000) {
+            av_log(avctx, AV_LOG_WARNING, "Bitrate %"PRId64" is extremely low, maybe you mean %"PRId64"k\n", (int64_t)avctx->bit_rate, (int64_t)avctx->bit_rate);
+        }
 
         if (!avctx->rc_initial_buffer_occupancy)
             avctx->rc_initial_buffer_occupancy = avctx->rc_buffer_size * 3 / 4;
     }
 
-    if (avctx->codec->init && !(avctx->active_thread_type & FF_THREAD_FRAME)) {
+    avctx->pts_correction_num_faulty_pts =
+    avctx->pts_correction_num_faulty_dts = 0;
+    avctx->pts_correction_last_pts =
+    avctx->pts_correction_last_dts = INT64_MIN;
+
+    if (   !CONFIG_GRAY && avctx->flags & AV_CODEC_FLAG_GRAY
+        && avctx->codec_descriptor->type == AVMEDIA_TYPE_VIDEO)
+        av_log(avctx, AV_LOG_WARNING,
+               "gray decoding requested but not enabled at configuration time\n");
+
+    if (   avctx->codec->init && (!(avctx->active_thread_type&FF_THREAD_FRAME)
+        || avctx->internal->frame_thread_encoder)) {
         ret = avctx->codec->init(avctx);
         if (ret < 0) {
             goto free_and_end;
         }
     }
 
+    ret=0;
+
 #if FF_API_AUDIOENC_DELAY
     if (av_codec_is_encoder(avctx->codec))
         avctx->delay = avctx->initial_padding;
 #endif
 
     if (av_codec_is_decoder(avctx->codec)) {
+        if (!avctx->bit_rate)
+            avctx->bit_rate = get_bit_rate(avctx);
         /* validate channel layout from the decoder */
         if (avctx->channel_layout) {
             int channels = av_get_channel_layout_nb_channels(avctx->channel_layout);
             if (!avctx->channels)
                 avctx->channels = channels;
             else if (channels != avctx->channels) {
+                char buf[512];
+                av_get_channel_layout_string(buf, sizeof(buf), -1, avctx->channel_layout);
                 av_log(avctx, AV_LOG_WARNING,
-                       "channel layout does not match number of channels\n");
+                       "Channel layout '%s' with %d channels does not match specified number of channels %d: "
+                       "ignoring specified channel layout\n",
+                       buf, channels, avctx->channels);
                 avctx->channel_layout = 0;
             }
         }
@@ -1078,19 +1505,55 @@ FF_ENABLE_DEPRECATION_WARNINGS
             ret = AVERROR(EINVAL);
             goto free_and_end;
         }
+        if (avctx->sub_charenc) {
+            if (avctx->codec_type != AVMEDIA_TYPE_SUBTITLE) {
+                av_log(avctx, AV_LOG_ERROR, "Character encoding is only "
+                       "supported with subtitles codecs\n");
+                ret = AVERROR(EINVAL);
+                goto free_and_end;
+            } else if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB) {
+                av_log(avctx, AV_LOG_WARNING, "Codec '%s' is bitmap-based, "
+                       "subtitles character encoding will be ignored\n",
+                       avctx->codec_descriptor->name);
+                avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
+            } else {
+                /* input character encoding is set for a text based subtitle
+                 * codec at this point */
+                if (avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_AUTOMATIC)
+                    avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_PRE_DECODER;
+
+                if (avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_PRE_DECODER) {
+#if CONFIG_ICONV
+                    iconv_t cd = iconv_open("UTF-8", avctx->sub_charenc);
+                    if (cd == (iconv_t)-1) {
+                        ret = AVERROR(errno);
+                        av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context "
+                               "with input character encoding \"%s\"\n", avctx->sub_charenc);
+                        goto free_and_end;
+                    }
+                    iconv_close(cd);
+#else
+                    av_log(avctx, AV_LOG_ERROR, "Character encoding subtitles "
+                           "conversion needs a libavcodec built with iconv support "
+                           "for this codec\n");
+                    ret = AVERROR(ENOSYS);
+                    goto free_and_end;
+#endif
+                }
+            }
+        }
 
 #if FF_API_AVCTX_TIMEBASE
         if (avctx->framerate.num > 0 && avctx->framerate.den > 0)
-            avctx->time_base = av_inv_q(avctx->framerate);
+            avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
 #endif
     }
-end:
-    entangled_thread_counter--;
-
-    /* Release any user-supplied mutex. */
-    if (lockmgr_cb) {
-        (*lockmgr_cb)(&codec_mutex, AV_LOCK_RELEASE);
+    if (codec->priv_data_size > 0 && avctx->priv_data && codec->priv_class) {
+        av_assert0(*(const AVClass **)avctx->priv_data == codec->priv_class);
     }
+
+end:
+    ff_unlock_avcodec(codec);
     if (options) {
         av_dict_free(options);
         *options = tmp;
@@ -1102,7 +1565,7 @@ free_and_end:
         (avctx->codec->caps_internal & FF_CODEC_CAP_INIT_CLEANUP))
         avctx->codec->close(avctx);
 
-    if (avctx->priv_data && avctx->codec && avctx->codec->priv_class)
+    if (codec->priv_class && codec->priv_data_size)
         av_opt_free(avctx->priv_data);
     av_opt_free(avctx);
 
@@ -1123,26 +1586,52 @@ FF_ENABLE_DEPRECATION_WARNINGS
     goto end;
 }
 
-int ff_alloc_packet(AVPacket *avpkt, int size)
+int ff_alloc_packet2(AVCodecContext *avctx, AVPacket *avpkt, int64_t size, int64_t min_size)
 {
-    if (size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE)
+    if (avpkt->size < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid negative user packet size %d\n", avpkt->size);
         return AVERROR(EINVAL);
+    }
+    if (size < 0 || size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid minimum required packet size %"PRId64" (max allowed is %d)\n",
+               size, INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE);
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx && 2*min_size < size) { // FIXME The factor needs to be finetuned
+        av_assert0(!avpkt->data || avpkt->data != avctx->internal->byte_buffer);
+        if (!avpkt->data || avpkt->size < size) {
+            av_fast_padded_malloc(&avctx->internal->byte_buffer, &avctx->internal->byte_buffer_size, size);
+            avpkt->data = avctx->internal->byte_buffer;
+            avpkt->size = avctx->internal->byte_buffer_size;
+        }
+    }
 
     if (avpkt->data) {
         AVBufferRef *buf = avpkt->buf;
 
-        if (avpkt->size < size)
+        if (avpkt->size < size) {
+            av_log(avctx, AV_LOG_ERROR, "User packet is too small (%d < %"PRId64")\n", avpkt->size, size);
             return AVERROR(EINVAL);
+        }
 
         av_init_packet(avpkt);
         avpkt->buf      = buf;
         avpkt->size     = size;
         return 0;
     } else {
-        return av_new_packet(avpkt, size);
+        int ret = av_new_packet(avpkt, size);
+        if (ret < 0)
+            av_log(avctx, AV_LOG_ERROR, "Failed to allocate packet of size %"PRId64"\n", size);
+        return ret;
     }
 }
 
+int ff_alloc_packet(AVPacket *avpkt, int size)
+{
+    return ff_alloc_packet2(NULL, avpkt, size, 0);
+}
+
 /**
  * Pad last frame with silence.
  */
@@ -1156,6 +1645,7 @@ static int pad_last_frame(AVCodecContext *s, AVFrame **dst, const AVFrame *src)
 
     frame->format         = src->format;
     frame->channel_layout = src->channel_layout;
+    av_frame_set_channels(frame, av_frame_get_channels(src));
     frame->nb_samples     = s->frame_size;
     ret = av_frame_get_buffer(frame, 32);
     if (ret < 0)
@@ -1187,10 +1677,11 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
                                               const AVFrame *frame,
                                               int *got_packet_ptr)
 {
-    AVFrame tmp;
+    AVFrame *extended_frame = NULL;
     AVFrame *padded_frame = NULL;
     int ret;
-    int user_packet = !!avpkt->data;
+    AVPacket user_pkt = *avpkt;
+    int needs_realloc = !user_pkt.data;
 
     *got_packet_ptr = 0;
 
@@ -1211,9 +1702,13 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
         }
         av_log(avctx, AV_LOG_WARNING, "extended_data is not set.\n");
 
-        tmp = *frame;
-        tmp.extended_data = tmp.data;
-        frame = &tmp;
+        extended_frame = av_frame_alloc();
+        if (!extended_frame)
+            return AVERROR(ENOMEM);
+
+        memcpy(extended_frame, frame, sizeof(AVFrame));
+        extended_frame->extended_data = extended_frame->data;
+        frame = extended_frame;
     }
 
     /* extract audio service type metadata */
@@ -1226,26 +1721,32 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
     /* check for valid frame size */
     if (frame) {
         if (avctx->codec->capabilities & AV_CODEC_CAP_SMALL_LAST_FRAME) {
-            if (frame->nb_samples > avctx->frame_size)
-                return AVERROR(EINVAL);
+            if (frame->nb_samples > avctx->frame_size) {
+                av_log(avctx, AV_LOG_ERROR, "more samples than frame size (avcodec_encode_audio2)\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
         } else if (!(avctx->codec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE)) {
             if (frame->nb_samples < avctx->frame_size &&
                 !avctx->internal->last_audio_frame) {
                 ret = pad_last_frame(avctx, &padded_frame, frame);
                 if (ret < 0)
-                    return ret;
+                    goto end;
 
                 frame = padded_frame;
                 avctx->internal->last_audio_frame = 1;
             }
 
             if (frame->nb_samples != avctx->frame_size) {
+                av_log(avctx, AV_LOG_ERROR, "nb_samples (%d) != frame_size (%d) (avcodec_encode_audio2)\n", frame->nb_samples, avctx->frame_size);
                 ret = AVERROR(EINVAL);
                 goto end;
             }
         }
     }
 
+    av_assert0(avctx->codec->encode2);
+
     ret = avctx->codec->encode2(avctx, avpkt, frame, got_packet_ptr);
     if (!ret) {
         if (*got_packet_ptr) {
@@ -1260,9 +1761,29 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
         } else {
             avpkt->size = 0;
         }
+    }
+    if (avpkt->data && avpkt->data == avctx->internal->byte_buffer) {
+        needs_realloc = 0;
+        if (user_pkt.data) {
+            if (user_pkt.size >= avpkt->size) {
+                memcpy(user_pkt.data, avpkt->data, avpkt->size);
+            } else {
+                av_log(avctx, AV_LOG_ERROR, "Provided packet is too small, needs to be %d\n", avpkt->size);
+                avpkt->size = user_pkt.size;
+                ret = -1;
+            }
+            avpkt->buf      = user_pkt.buf;
+            avpkt->data     = user_pkt.data;
+        } else {
+            if (av_dup_packet(avpkt) < 0) {
+                ret = AVERROR(ENOMEM);
+            }
+        }
+    }
 
-        if (!user_packet && avpkt->size) {
-            ret = av_buffer_realloc(&avpkt->buf, avpkt->size);
+    if (!ret) {
+        if (needs_realloc && avpkt->data) {
+            ret = av_buffer_realloc(&avpkt->buf, avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
             if (ret >= 0)
                 avpkt->data = avpkt->buf->data;
         }
@@ -1283,6 +1804,7 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
 
 end:
     av_frame_free(&padded_frame);
+    av_free(extended_frame);
 
 #if FF_API_AUDIOENC_DELAY
     avctx->delay = avctx->initial_padding;
@@ -1297,10 +1819,18 @@ int attribute_align_arg avcodec_encode_video2(AVCodecContext *avctx,
                                               int *got_packet_ptr)
 {
     int ret;
-    int user_packet = !!avpkt->data;
+    AVPacket user_pkt = *avpkt;
+    int needs_realloc = !user_pkt.data;
 
     *got_packet_ptr = 0;
 
+    if(CONFIG_FRAME_THREAD_ENCODER &&
+       avctx->internal->frame_thread_encoder && (avctx->active_thread_type&FF_THREAD_FRAME))
+        return ff_thread_video_encode_frame(avctx, avpkt, frame, got_packet_ptr);
+
+    if ((avctx->flags&AV_CODEC_FLAG_PASS1) && avctx->stats_out)
+        avctx->stats_out[0] = '\0';
+
     if (!(avctx->codec->capabilities & AV_CODEC_CAP_DELAY) && !frame) {
         av_free_packet(avpkt);
         av_init_packet(avpkt);
@@ -1311,17 +1841,43 @@ int attribute_align_arg avcodec_encode_video2(AVCodecContext *avctx,
     if (av_image_check_size(avctx->width, avctx->height, 0, avctx))
         return AVERROR(EINVAL);
 
+    if (frame && frame->format == AV_PIX_FMT_NONE)
+        av_log(avctx, AV_LOG_WARNING, "AVFrame.format is not set\n");
+    if (frame && (frame->width == 0 || frame->height == 0))
+        av_log(avctx, AV_LOG_WARNING, "AVFrame.width or height is not set\n");
+
     av_assert0(avctx->codec->encode2);
 
     ret = avctx->codec->encode2(avctx, avpkt, frame, got_packet_ptr);
+    av_assert0(ret <= 0);
+
+    if (avpkt->data && avpkt->data == avctx->internal->byte_buffer) {
+        needs_realloc = 0;
+        if (user_pkt.data) {
+            if (user_pkt.size >= avpkt->size) {
+                memcpy(user_pkt.data, avpkt->data, avpkt->size);
+            } else {
+                av_log(avctx, AV_LOG_ERROR, "Provided packet is too small, needs to be %d\n", avpkt->size);
+                avpkt->size = user_pkt.size;
+                ret = -1;
+            }
+            avpkt->buf      = user_pkt.buf;
+            avpkt->data     = user_pkt.data;
+        } else {
+            if (av_dup_packet(avpkt) < 0) {
+                ret = AVERROR(ENOMEM);
+            }
+        }
+    }
+
     if (!ret) {
         if (!*got_packet_ptr)
             avpkt->size = 0;
         else if (!(avctx->codec->capabilities & AV_CODEC_CAP_DELAY))
             avpkt->pts = avpkt->dts = frame->pts;
 
-        if (!user_packet && avpkt->size) {
-            ret = av_buffer_realloc(&avpkt->buf, avpkt->size);
+        if (needs_realloc && avpkt->data) {
+            ret = av_buffer_realloc(&avpkt->buf, avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
             if (ret >= 0)
                 avpkt->data = avpkt->buf->data;
         }
@@ -1344,18 +1900,54 @@ int avcodec_encode_subtitle(AVCodecContext *avctx, uint8_t *buf, int buf_size,
         av_log(avctx, AV_LOG_ERROR, "start_display_time must be 0.\n");
         return -1;
     }
-    if (sub->num_rects == 0 || !sub->rects)
-        return -1;
+
     ret = avctx->codec->encode_sub(avctx, buf, buf_size, sub);
     avctx->frame_number++;
     return ret;
 }
 
+/**
+ * Attempt to guess proper monotonic timestamps for decoded video frames
+ * which might have incorrect times. Input timestamps may wrap around, in
+ * which case the output will as well.
+ *
+ * @param pts the pts field of the decoded AVPacket, as passed through
+ * AVFrame.pkt_pts
+ * @param dts the dts field of the decoded AVPacket
+ * @return one of the input values, may be AV_NOPTS_VALUE
+ */
+static int64_t guess_correct_pts(AVCodecContext *ctx,
+                                 int64_t reordered_pts, int64_t dts)
+{
+    int64_t pts = AV_NOPTS_VALUE;
+
+    if (dts != AV_NOPTS_VALUE) {
+        ctx->pts_correction_num_faulty_dts += dts <= ctx->pts_correction_last_dts;
+        ctx->pts_correction_last_dts = dts;
+    } else if (reordered_pts != AV_NOPTS_VALUE)
+        ctx->pts_correction_last_dts = reordered_pts;
+
+    if (reordered_pts != AV_NOPTS_VALUE) {
+        ctx->pts_correction_num_faulty_pts += reordered_pts <= ctx->pts_correction_last_pts;
+        ctx->pts_correction_last_pts = reordered_pts;
+    } else if(dts != AV_NOPTS_VALUE)
+        ctx->pts_correction_last_pts = dts;
+
+    if ((ctx->pts_correction_num_faulty_pts<=ctx->pts_correction_num_faulty_dts || dts == AV_NOPTS_VALUE)
+       && reordered_pts != AV_NOPTS_VALUE)
+        pts = reordered_pts;
+    else
+        pts = dts;
+
+    return pts;
+}
+
 static int apply_param_change(AVCodecContext *avctx, AVPacket *avpkt)
 {
     int size = 0, ret;
     const uint8_t *data;
     uint32_t flags;
+    int64_t val;
 
     data = av_packet_get_side_data(avpkt, AV_PKT_DATA_PARAM_CHANGE, &size);
     if (!data)
@@ -1376,7 +1968,12 @@ static int apply_param_change(AVCodecContext *avctx, AVPacket *avpkt)
     if (flags & AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_COUNT) {
         if (size < 4)
             goto fail;
-        avctx->channels = bytestream_get_le32(&data);
+        val = bytestream_get_le32(&data);
+        if (val <= 0 || val > INT_MAX) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid channel count");
+            return AVERROR_INVALIDDATA;
+        }
+        avctx->channels = val;
         size -= 4;
     }
     if (flags & AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_LAYOUT) {
@@ -1388,7 +1985,12 @@ static int apply_param_change(AVCodecContext *avctx, AVPacket *avpkt)
     if (flags & AV_SIDE_DATA_PARAM_CHANGE_SAMPLE_RATE) {
         if (size < 4)
             goto fail;
-        avctx->sample_rate = bytestream_get_le32(&data);
+        val = bytestream_get_le32(&data);
+        if (val <= 0 || val > INT_MAX) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid sample rate");
+            return AVERROR_INVALIDDATA;
+        }
+        avctx->sample_rate = val;
         size -= 4;
     }
     if (flags & AV_SIDE_DATA_PARAM_CHANGE_DIMENSIONS) {
@@ -1426,7 +2028,7 @@ static int unrefcount_frame(AVCodecInternal *avci, AVFrame *frame)
     memcpy(frame->data,     avci->to_free->data,     sizeof(frame->data));
     memcpy(frame->linesize, avci->to_free->linesize, sizeof(frame->linesize));
     if (avci->to_free->extended_data != avci->to_free->data) {
-        int planes = av_get_channel_layout_nb_channels(avci->to_free->channel_layout);
+        int planes = av_frame_get_channels(avci->to_free);
         int size   = planes * sizeof(*frame->extended_data);
 
         if (!size) {
@@ -1449,52 +2051,76 @@ static int unrefcount_frame(AVCodecInternal *avci, AVFrame *frame)
     frame->height         = avci->to_free->height;
     frame->channel_layout = avci->to_free->channel_layout;
     frame->nb_samples     = avci->to_free->nb_samples;
+    av_frame_set_channels(frame, av_frame_get_channels(avci->to_free));
 
     return 0;
 }
 
 int attribute_align_arg avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
                                               int *got_picture_ptr,
-                                              AVPacket *avpkt)
+                                              const AVPacket *avpkt)
 {
     AVCodecInternal *avci = avctx->internal;
     int ret;
+    // copy to ensure we do not change avpkt
+    AVPacket tmp = *avpkt;
+
+    if (!avctx->codec)
+        return AVERROR(EINVAL);
+    if (avctx->codec->type != AVMEDIA_TYPE_VIDEO) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid media type for video\n");
+        return AVERROR(EINVAL);
+    }
 
     *got_picture_ptr = 0;
     if ((avctx->coded_width || avctx->coded_height) && av_image_check_size(avctx->coded_width, avctx->coded_height, 0, avctx))
-        return -1;
-
-    avctx->internal->pkt = avpkt;
-    ret = apply_param_change(avctx, avpkt);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error applying parameter changes.\n");
-        if (avctx->err_recognition & AV_EF_EXPLODE)
-            return ret;
-    }
+        return AVERROR(EINVAL);
 
     av_frame_unref(picture);
 
     if ((avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || avpkt->size ||
         (avctx->active_thread_type & FF_THREAD_FRAME)) {
+        int did_split = av_packet_split_side_data(&tmp);
+        ret = apply_param_change(avctx, &tmp);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Error applying parameter changes.\n");
+            if (avctx->err_recognition & AV_EF_EXPLODE)
+                goto fail;
+        }
+
+        avctx->internal->pkt = &tmp;
         if (HAVE_THREADS && avctx->active_thread_type & FF_THREAD_FRAME)
             ret = ff_thread_decode_frame(avctx, picture, got_picture_ptr,
-                                         avpkt);
+                                         &tmp);
         else {
             ret = avctx->codec->decode(avctx, picture, got_picture_ptr,
-                                       avpkt);
+                                       &tmp);
             if (!(avctx->codec->caps_internal & FF_CODEC_CAP_SETS_PKT_DTS))
                 picture->pkt_dts = avpkt->dts;
+
+            if(!avctx->has_b_frames){
+                av_frame_set_pkt_pos(picture, avpkt->pos);
+            }
+            //FIXME these should be under if(!avctx->has_b_frames)
             /* get_buffer is supposed to set frame parameters */
             if (!(avctx->codec->capabilities & AV_CODEC_CAP_DR1)) {
-                picture->sample_aspect_ratio = avctx->sample_aspect_ratio;
-                picture->width               = avctx->width;
-                picture->height              = avctx->height;
-                picture->format              = avctx->pix_fmt;
+                if (!picture->sample_aspect_ratio.num)    picture->sample_aspect_ratio = avctx->sample_aspect_ratio;
+                if (!picture->width)                      picture->width               = avctx->width;
+                if (!picture->height)                     picture->height              = avctx->height;
+                if (picture->format == AV_PIX_FMT_NONE)   picture->format              = avctx->pix_fmt;
             }
         }
 
+fail:
         emms_c(); //needed to avoid an emms_c() call before every return;
 
+        avctx->internal->pkt = NULL;
+        if (did_split) {
+            av_packet_free_side_data(&tmp);
+            if(ret == tmp.size)
+                ret = avpkt->size;
+        }
+
         if (*got_picture_ptr) {
             if (!avctx->refcounted_frames) {
                 int err = unrefcount_frame(avci, picture);
@@ -1503,14 +2129,22 @@ int attribute_align_arg avcodec_decode_video2(AVCodecContext *avctx, AVFrame *pi
             }
 
             avctx->frame_number++;
+            av_frame_set_best_effort_timestamp(picture,
+                                               guess_correct_pts(avctx,
+                                                                 picture->pkt_pts,
+                                                                 picture->pkt_dts));
         } else
             av_frame_unref(picture);
     } else
         ret = 0;
 
+    /* many decoders assign whole AVFrames, thus overwriting extended_data;
+     * make sure it's set correctly */
+    av_assert0(!picture->extended_data || picture->extended_data == picture->data);
+
 #if FF_API_AVCTX_TIMEBASE
     if (avctx->framerate.num > 0 && avctx->framerate.den > 0)
-        avctx->time_base = av_inv_q(avctx->framerate);
+        avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
 #endif
 
     return ret;
@@ -1519,37 +2153,144 @@ int attribute_align_arg avcodec_decode_video2(AVCodecContext *avctx, AVFrame *pi
 int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
                                               AVFrame *frame,
                                               int *got_frame_ptr,
-                                              AVPacket *avpkt)
+                                              const AVPacket *avpkt)
 {
     AVCodecInternal *avci = avctx->internal;
     int ret = 0;
 
     *got_frame_ptr = 0;
 
-    avctx->internal->pkt = avpkt;
-
     if (!avpkt->data && avpkt->size) {
         av_log(avctx, AV_LOG_ERROR, "invalid packet: NULL data, size != 0\n");
         return AVERROR(EINVAL);
     }
-
-    ret = apply_param_change(avctx, avpkt);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error applying parameter changes.\n");
-        if (avctx->err_recognition & AV_EF_EXPLODE)
-            return ret;
+    if (!avctx->codec)
+        return AVERROR(EINVAL);
+    if (avctx->codec->type != AVMEDIA_TYPE_AUDIO) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid media type for audio\n");
+        return AVERROR(EINVAL);
     }
 
     av_frame_unref(frame);
 
-    if ((avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || avpkt->size) {
-        ret = avctx->codec->decode(avctx, frame, got_frame_ptr, avpkt);
+    if ((avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || avpkt->size || (avctx->active_thread_type & FF_THREAD_FRAME)) {
+        uint8_t *side;
+        int side_size;
+        uint32_t discard_padding = 0;
+        uint8_t skip_reason = 0;
+        uint8_t discard_reason = 0;
+        // copy to ensure we do not change avpkt
+        AVPacket tmp = *avpkt;
+        int did_split = av_packet_split_side_data(&tmp);
+        ret = apply_param_change(avctx, &tmp);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Error applying parameter changes.\n");
+            if (avctx->err_recognition & AV_EF_EXPLODE)
+                goto fail;
+        }
+
+        avctx->internal->pkt = &tmp;
+        if (HAVE_THREADS && avctx->active_thread_type & FF_THREAD_FRAME)
+            ret = ff_thread_decode_frame(avctx, frame, got_frame_ptr, &tmp);
+        else {
+            ret = avctx->codec->decode(avctx, frame, got_frame_ptr, &tmp);
+            av_assert0(ret <= tmp.size);
+            frame->pkt_dts = avpkt->dts;
+        }
         if (ret >= 0 && *got_frame_ptr) {
             avctx->frame_number++;
-            frame->pkt_dts = avpkt->dts;
+            av_frame_set_best_effort_timestamp(frame,
+                                               guess_correct_pts(avctx,
+                                                                 frame->pkt_pts,
+                                                                 frame->pkt_dts));
             if (frame->format == AV_SAMPLE_FMT_NONE)
                 frame->format = avctx->sample_fmt;
+            if (!frame->channel_layout)
+                frame->channel_layout = avctx->channel_layout;
+            if (!av_frame_get_channels(frame))
+                av_frame_set_channels(frame, avctx->channels);
+            if (!frame->sample_rate)
+                frame->sample_rate = avctx->sample_rate;
+        }
+
+        side= av_packet_get_side_data(avctx->internal->pkt, AV_PKT_DATA_SKIP_SAMPLES, &side_size);
+        if(side && side_size>=10) {
+            avctx->internal->skip_samples = AV_RL32(side);
+            discard_padding = AV_RL32(side + 4);
+            av_log(avctx, AV_LOG_DEBUG, "skip %d / discard %d samples due to side data\n",
+                   avctx->internal->skip_samples, (int)discard_padding);
+            skip_reason = AV_RL8(side + 8);
+            discard_reason = AV_RL8(side + 9);
+        }
+        if (avctx->internal->skip_samples && *got_frame_ptr &&
+            !(avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL)) {
+            if(frame->nb_samples <= avctx->internal->skip_samples){
+                *got_frame_ptr = 0;
+                avctx->internal->skip_samples -= frame->nb_samples;
+                av_log(avctx, AV_LOG_DEBUG, "skip whole frame, skip left: %d\n",
+                       avctx->internal->skip_samples);
+            } else {
+                av_samples_copy(frame->extended_data, frame->extended_data, 0, avctx->internal->skip_samples,
+                                frame->nb_samples - avctx->internal->skip_samples, avctx->channels, frame->format);
+                if(avctx->pkt_timebase.num && avctx->sample_rate) {
+                    int64_t diff_ts = av_rescale_q(avctx->internal->skip_samples,
+                                                   (AVRational){1, avctx->sample_rate},
+                                                   avctx->pkt_timebase);
+                    if(frame->pkt_pts!=AV_NOPTS_VALUE)
+                        frame->pkt_pts += diff_ts;
+                    if(frame->pkt_dts!=AV_NOPTS_VALUE)
+                        frame->pkt_dts += diff_ts;
+                    if (av_frame_get_pkt_duration(frame) >= diff_ts)
+                        av_frame_set_pkt_duration(frame, av_frame_get_pkt_duration(frame) - diff_ts);
+                } else {
+                    av_log(avctx, AV_LOG_WARNING, "Could not update timestamps for skipped samples.\n");
+                }
+                av_log(avctx, AV_LOG_DEBUG, "skip %d/%d samples\n",
+                       avctx->internal->skip_samples, frame->nb_samples);
+                frame->nb_samples -= avctx->internal->skip_samples;
+                avctx->internal->skip_samples = 0;
+            }
+        }
+
+        if (discard_padding > 0 && discard_padding <= frame->nb_samples && *got_frame_ptr &&
+            !(avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL)) {
+            if (discard_padding == frame->nb_samples) {
+                *got_frame_ptr = 0;
+            } else {
+                if(avctx->pkt_timebase.num && avctx->sample_rate) {
+                    int64_t diff_ts = av_rescale_q(frame->nb_samples - discard_padding,
+                                                   (AVRational){1, avctx->sample_rate},
+                                                   avctx->pkt_timebase);
+                    if (av_frame_get_pkt_duration(frame) >= diff_ts)
+                        av_frame_set_pkt_duration(frame, av_frame_get_pkt_duration(frame) - diff_ts);
+                } else {
+                    av_log(avctx, AV_LOG_WARNING, "Could not update timestamps for discarded samples.\n");
+                }
+                av_log(avctx, AV_LOG_DEBUG, "discard %d/%d samples\n",
+                       (int)discard_padding, frame->nb_samples);
+                frame->nb_samples -= discard_padding;
+            }
+        }
+
+        if ((avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL) && *got_frame_ptr) {
+            AVFrameSideData *fside = av_frame_new_side_data(frame, AV_FRAME_DATA_SKIP_SAMPLES, 10);
+            if (fside) {
+                AV_WL32(fside->data, avctx->internal->skip_samples);
+                AV_WL32(fside->data + 4, discard_padding);
+                AV_WL8(fside->data + 8, skip_reason);
+                AV_WL8(fside->data + 9, discard_reason);
+                avctx->internal->skip_samples = 0;
+            }
+        }
+fail:
+        avctx->internal->pkt = NULL;
+        if (did_split) {
+            av_packet_free_side_data(&tmp);
+            if(ret == tmp.size)
+                ret = avpkt->size;
+        }
 
+        if (ret >= 0 && *got_frame_ptr) {
             if (!avctx->refcounted_frames) {
                 int err = unrefcount_frame(avci, frame);
                 if (err < 0)
@@ -1559,21 +2300,178 @@ int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
             av_frame_unref(frame);
     }
 
+    return ret;
+}
 
+#define UTF8_MAX_BYTES 4 /* 5 and 6 bytes sequences should not be used */
+static int recode_subtitle(AVCodecContext *avctx,
+                           AVPacket *outpkt, const AVPacket *inpkt)
+{
+#if CONFIG_ICONV
+    iconv_t cd = (iconv_t)-1;
+    int ret = 0;
+    char *inb, *outb;
+    size_t inl, outl;
+    AVPacket tmp;
+#endif
+
+    if (avctx->sub_charenc_mode != FF_SUB_CHARENC_MODE_PRE_DECODER || inpkt->size == 0)
+        return 0;
+
+#if CONFIG_ICONV
+    cd = iconv_open("UTF-8", avctx->sub_charenc);
+    av_assert0(cd != (iconv_t)-1);
+
+    inb = inpkt->data;
+    inl = inpkt->size;
+
+    if (inl >= INT_MAX / UTF8_MAX_BYTES - AV_INPUT_BUFFER_PADDING_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "Subtitles packet is too big for recoding\n");
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+
+    ret = av_new_packet(&tmp, inl * UTF8_MAX_BYTES);
+    if (ret < 0)
+        goto end;
+    outpkt->buf  = tmp.buf;
+    outpkt->data = tmp.data;
+    outpkt->size = tmp.size;
+    outb = outpkt->data;
+    outl = outpkt->size;
+
+    if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 ||
+        iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1 ||
+        outl >= outpkt->size || inl != 0) {
+        ret = FFMIN(AVERROR(errno), -1);
+        av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
+               "from %s to UTF-8\n", inpkt->data, avctx->sub_charenc);
+        av_free_packet(&tmp);
+        goto end;
+    }
+    outpkt->size -= outl;
+    memset(outpkt->data + outpkt->size, 0, outl);
+
+end:
+    if (cd != (iconv_t)-1)
+        iconv_close(cd);
     return ret;
+#else
+    av_log(avctx, AV_LOG_ERROR, "requesting subtitles recoding without iconv");
+    return AVERROR(EINVAL);
+#endif
+}
+
+static int utf8_check(const uint8_t *str)
+{
+    const uint8_t *byte;
+    uint32_t codepoint, min;
+
+    while (*str) {
+        byte = str;
+        GET_UTF8(codepoint, *(byte++), return 0;);
+        min = byte - str == 1 ? 0 : byte - str == 2 ? 0x80 :
+              1 << (5 * (byte - str) - 4);
+        if (codepoint < min || codepoint >= 0x110000 ||
+            codepoint == 0xFFFE /* BOM */ ||
+            codepoint >= 0xD800 && codepoint <= 0xDFFF /* surrogates */)
+            return 0;
+        str = byte;
+    }
+    return 1;
 }
 
 int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
                              int *got_sub_ptr,
                              AVPacket *avpkt)
 {
-    int ret;
+    int i, ret = 0;
+
+    if (!avpkt->data && avpkt->size) {
+        av_log(avctx, AV_LOG_ERROR, "invalid packet: NULL data, size != 0\n");
+        return AVERROR(EINVAL);
+    }
+    if (!avctx->codec)
+        return AVERROR(EINVAL);
+    if (avctx->codec->type != AVMEDIA_TYPE_SUBTITLE) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid media type for subtitles\n");
+        return AVERROR(EINVAL);
+    }
 
-    avctx->internal->pkt = avpkt;
     *got_sub_ptr = 0;
-    ret = avctx->codec->decode(avctx, sub, got_sub_ptr, avpkt);
-    if (*got_sub_ptr)
-        avctx->frame_number++;
+    get_subtitle_defaults(sub);
+
+    if ((avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || avpkt->size) {
+        AVPacket pkt_recoded;
+        AVPacket tmp = *avpkt;
+        int did_split = av_packet_split_side_data(&tmp);
+        //apply_param_change(avctx, &tmp);
+
+        if (did_split) {
+            /* FFMIN() prevents overflow in case the packet wasn't allocated with
+             * proper padding.
+             * If the side data is smaller than the buffer padding size, the
+             * remaining bytes should have already been filled with zeros by the
+             * original packet allocation anyway. */
+            memset(tmp.data + tmp.size, 0,
+                   FFMIN(avpkt->size - tmp.size, AV_INPUT_BUFFER_PADDING_SIZE));
+        }
+
+        pkt_recoded = tmp;
+        ret = recode_subtitle(avctx, &pkt_recoded, &tmp);
+        if (ret < 0) {
+            *got_sub_ptr = 0;
+        } else {
+            avctx->internal->pkt = &pkt_recoded;
+
+            if (avctx->pkt_timebase.den && avpkt->pts != AV_NOPTS_VALUE)
+                sub->pts = av_rescale_q(avpkt->pts,
+                                        avctx->pkt_timebase, AV_TIME_BASE_Q);
+            ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &pkt_recoded);
+            av_assert1((ret >= 0) >= !!*got_sub_ptr &&
+                       !!*got_sub_ptr >= !!sub->num_rects);
+
+            if (sub->num_rects && !sub->end_display_time && avpkt->duration &&
+                avctx->pkt_timebase.num) {
+                AVRational ms = { 1, 1000 };
+                sub->end_display_time = av_rescale_q(avpkt->duration,
+                                                     avctx->pkt_timebase, ms);
+            }
+
+            for (i = 0; i < sub->num_rects; i++) {
+                if (sub->rects[i]->ass && !utf8_check(sub->rects[i]->ass)) {
+                    av_log(avctx, AV_LOG_ERROR,
+                           "Invalid UTF-8 in decoded subtitles text; "
+                           "maybe missing -sub_charenc option\n");
+                    avsubtitle_free(sub);
+                    return AVERROR_INVALIDDATA;
+                }
+            }
+
+            if (tmp.data != pkt_recoded.data) { // did we recode?
+                /* prevent from destroying side data from original packet */
+                pkt_recoded.side_data = NULL;
+                pkt_recoded.side_data_elems = 0;
+
+                av_free_packet(&pkt_recoded);
+            }
+            if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB)
+                sub->format = 0;
+            else if (avctx->codec_descriptor->props & AV_CODEC_PROP_TEXT_SUB)
+                sub->format = 1;
+            avctx->internal->pkt = NULL;
+        }
+
+        if (did_split) {
+            av_packet_free_side_data(&tmp);
+            if(ret == tmp.size)
+                ret = avpkt->size;
+        }
+
+        if (*got_sub_ptr)
+            avctx->frame_number++;
+    }
+
     return ret;
 }
 
@@ -1598,13 +2496,22 @@ void avsubtitle_free(AVSubtitle *sub)
 
 av_cold int avcodec_close(AVCodecContext *avctx)
 {
+    if (!avctx)
+        return 0;
+
     if (avcodec_is_open(avctx)) {
         FramePool *pool = avctx->internal->pool;
         int i;
+        if (CONFIG_FRAME_THREAD_ENCODER &&
+            avctx->internal->frame_thread_encoder && avctx->thread_count > 1) {
+            ff_frame_thread_encoder_free(avctx);
+        }
         if (HAVE_THREADS && avctx->internal->thread_ctx)
             ff_thread_free(avctx);
         if (avctx->codec && avctx->codec->close)
             avctx->codec->close(avctx);
+        avctx->internal->byte_buffer_size = 0;
+        av_freep(&avctx->internal->byte_buffer);
         av_frame_free(&avctx->internal->to_free);
         for (i = 0; i < FF_ARRAY_ELEMS(pool->pools); i++)
             av_buffer_pool_uninit(&pool->pools[i]);
@@ -1635,10 +2542,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
+static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id)
+{
+    switch(id){
+        //This is for future deprecatec codec ids, its empty since
+        //last major bump but will fill up again over time, please don't remove it
+        default                                         : return id;
+    }
+}
+
 static AVCodec *find_encdec(enum AVCodecID id, int encoder)
 {
     AVCodec *p, *experimental = NULL;
     p = first_avcodec;
+    id= remap_deprecated_codec_id(id);
     while (p) {
         if ((encoder ? av_codec_is_encoder(p) : av_codec_is_decoder(p)) &&
             p->id == id) {
@@ -1690,27 +2607,24 @@ AVCodec *avcodec_find_decoder_by_name(const char *name)
     return NULL;
 }
 
-static int get_bit_rate(AVCodecContext *ctx)
+const char *avcodec_get_name(enum AVCodecID id)
 {
-    int bit_rate;
-    int bits_per_sample;
+    const AVCodecDescriptor *cd;
+    AVCodec *codec;
 
-    switch (ctx->codec_type) {
-    case AVMEDIA_TYPE_VIDEO:
-    case AVMEDIA_TYPE_DATA:
-    case AVMEDIA_TYPE_SUBTITLE:
-    case AVMEDIA_TYPE_ATTACHMENT:
-        bit_rate = ctx->bit_rate;
-        break;
-    case AVMEDIA_TYPE_AUDIO:
-        bits_per_sample = av_get_bits_per_sample(ctx->codec_id);
-        bit_rate = bits_per_sample ? ctx->sample_rate * ctx->channels * bits_per_sample : ctx->bit_rate;
-        break;
-    default:
-        bit_rate = 0;
-        break;
-    }
-    return bit_rate;
+    if (id == AV_CODEC_ID_NONE)
+        return "none";
+    cd = avcodec_descriptor_get(id);
+    if (cd)
+        return cd->name;
+    av_log(NULL, AV_LOG_WARNING, "Codec 0x%x is not in the full list.\n", id);
+    codec = avcodec_find_decoder(id);
+    if (codec)
+        return codec->name;
+    codec = avcodec_find_encoder(id);
+    if (codec)
+        return codec->name;
+    return "unknown_codec";
 }
 
 size_t av_get_codec_tag_string(char *buf, size_t buf_size, unsigned int codec_tag)
@@ -1720,7 +2634,7 @@ size_t av_get_codec_tag_string(char *buf, size_t buf_size, unsigned int codec_ta
 #define TAG_PRINT(x)                                              \
     (((x) >= '0' && (x) <= '9') ||                                \
      ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z') ||  \
-     ((x) == '.' || (x) == ' '))
+     ((x) == '.' || (x) == ' ' || (x) == '-' || (x) == '_'))
 
     for (i = 0; i < 4; i++) {
         len = snprintf(buf, buf_size,
@@ -1735,75 +2649,97 @@ size_t av_get_codec_tag_string(char *buf, size_t buf_size, unsigned int codec_ta
 
 void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
 {
+    const char *codec_type;
     const char *codec_name;
     const char *profile = NULL;
     const AVCodec *p;
-    char buf1[32];
-    int bitrate;
+    int64_t bitrate;
     int new_line = 0;
     AVRational display_aspect_ratio;
+    const char *separator = enc->dump_separator ? (const char *)enc->dump_separator : ", ";
 
-    if (enc->codec)
-        p = enc->codec;
-    else if (encode)
-        p = avcodec_find_encoder(enc->codec_id);
-    else
-        p = avcodec_find_decoder(enc->codec_id);
-
-    if (p) {
-        codec_name = p->name;
-        profile = av_get_profile_name(p, enc->profile);
-    } else if (enc->codec_id == AV_CODEC_ID_MPEG2TS) {
-        /* fake mpeg2 transport stream codec (currently not
-         * registered) */
-        codec_name = "mpeg2ts";
-    } else {
-        /* output avi tags */
+    if (!buf || buf_size <= 0)
+        return;
+    codec_type = av_get_media_type_string(enc->codec_type);
+    codec_name = avcodec_get_name(enc->codec_id);
+    if (enc->profile != FF_PROFILE_UNKNOWN) {
+        if (enc->codec)
+            p = enc->codec;
+        else
+            p = encode ? avcodec_find_encoder(enc->codec_id) :
+                        avcodec_find_decoder(enc->codec_id);
+        if (p)
+            profile = av_get_profile_name(p, enc->profile);
+    }
+
+    snprintf(buf, buf_size, "%s: %s", codec_type ? codec_type : "unknown",
+             codec_name);
+    buf[0] ^= 'a' ^ 'A'; /* first letter in uppercase */
+
+    if (enc->codec && strcmp(enc->codec->name, codec_name))
+        snprintf(buf + strlen(buf), buf_size - strlen(buf), " (%s)", enc->codec->name);
+
+    if (profile)
+        snprintf(buf + strlen(buf), buf_size - strlen(buf), " (%s)", profile);
+    if (   enc->codec_type == AVMEDIA_TYPE_VIDEO
+        && av_log_get_level() >= AV_LOG_VERBOSE
+        && enc->refs)
+        snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                 ", %d reference frame%s",
+                 enc->refs, enc->refs > 1 ? "s" : "");
+
+    if (enc->codec_tag) {
         char tag_buf[32];
         av_get_codec_tag_string(tag_buf, sizeof(tag_buf), enc->codec_tag);
-        snprintf(buf1, sizeof(buf1), "%s / 0x%04X", tag_buf, enc->codec_tag);
-        codec_name = buf1;
+        snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                 " (%s / 0x%04X)", tag_buf, enc->codec_tag);
     }
 
     switch (enc->codec_type) {
     case AVMEDIA_TYPE_VIDEO:
-        snprintf(buf, buf_size,
-                 "Video: %s%s",
-                 codec_name, enc->mb_decision ? " (hq)" : "");
-        if (profile)
-            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     " (%s)", profile);
-        if (enc->codec_tag) {
-            char tag_buf[32];
-            av_get_codec_tag_string(tag_buf, sizeof(tag_buf), enc->codec_tag);
-            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     " [%s / 0x%04X]", tag_buf, enc->codec_tag);
-        }
+        {
+            char detail[256] = "(";
 
-        av_strlcat(buf, "\n      ", buf_size);
-        snprintf(buf + strlen(buf), buf_size - strlen(buf),
+            av_strlcat(buf, separator, buf_size);
+
+            snprintf(buf + strlen(buf), buf_size - strlen(buf),
                  "%s", enc->pix_fmt == AV_PIX_FMT_NONE ? "none" :
                      av_get_pix_fmt_name(enc->pix_fmt));
+            if (enc->bits_per_raw_sample && enc->pix_fmt != AV_PIX_FMT_NONE &&
+                enc->bits_per_raw_sample < av_pix_fmt_desc_get(enc->pix_fmt)->comp[0].depth)
+                av_strlcatf(detail, sizeof(detail), "%d bpc, ", enc->bits_per_raw_sample);
+            if (enc->color_range != AVCOL_RANGE_UNSPECIFIED)
+                av_strlcatf(detail, sizeof(detail), "%s, ",
+                            av_color_range_name(enc->color_range));
+
+            if (enc->colorspace != AVCOL_SPC_UNSPECIFIED ||
+                enc->color_primaries != AVCOL_PRI_UNSPECIFIED ||
+                enc->color_trc != AVCOL_TRC_UNSPECIFIED) {
+                if (enc->colorspace != (int)enc->color_primaries ||
+                    enc->colorspace != (int)enc->color_trc) {
+                    new_line = 1;
+                    av_strlcatf(detail, sizeof(detail), "%s/%s/%s, ",
+                                av_color_space_name(enc->colorspace),
+                                av_color_primaries_name(enc->color_primaries),
+                                av_color_transfer_name(enc->color_trc));
+                } else
+                    av_strlcatf(detail, sizeof(detail), "%s, ",
+                                av_get_colorspace_name(enc->colorspace));
+            }
 
-        if (enc->color_range != AVCOL_RANGE_UNSPECIFIED)
-            snprintf(buf + strlen(buf), buf_size - strlen(buf), ", %s",
-                     av_color_range_name(enc->color_range));
-        if (enc->colorspace != AVCOL_SPC_UNSPECIFIED ||
-            enc->color_primaries != AVCOL_PRI_UNSPECIFIED ||
-            enc->color_trc != AVCOL_TRC_UNSPECIFIED) {
-            new_line = 1;
-            snprintf(buf + strlen(buf), buf_size - strlen(buf), ", %s/%s/%s",
-                     av_color_space_name(enc->colorspace),
-                     av_color_primaries_name(enc->color_primaries),
-                     av_color_transfer_name(enc->color_trc));
-        }
-        if (av_log_get_level() >= AV_LOG_DEBUG &&
-            enc->chroma_sample_location != AVCHROMA_LOC_UNSPECIFIED)
-            snprintf(buf + strlen(buf), buf_size - strlen(buf), ", %s",
-                     av_chroma_location_name(enc->chroma_sample_location));
+            if (av_log_get_level() >= AV_LOG_DEBUG &&
+                enc->chroma_sample_location != AVCHROMA_LOC_UNSPECIFIED)
+                av_strlcatf(detail, sizeof(detail), "%s, ",
+                            av_chroma_location_name(enc->chroma_sample_location));
+
+            if (strlen(detail) > 1) {
+                detail[strlen(detail) - 2] = 0;
+                av_strlcatf(buf, buf_size, "%s)", detail);
+            }
+        }
 
         if (enc->width) {
-            av_strlcat(buf, new_line ? "\n      " : ", ", buf_size);
+            av_strlcat(buf, new_line ? separator : ", ", buf_size);
 
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      "%dx%d",
@@ -1821,7 +2757,7 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
                           enc->height * enc->sample_aspect_ratio.den,
                           1024 * 1024);
                 snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                         " [PAR %d:%d DAR %d:%d]",
+                         " [SAR %d:%d DAR %d:%d]",
                          enc->sample_aspect_ratio.num, enc->sample_aspect_ratio.den,
                          display_aspect_ratio.num, display_aspect_ratio.den);
             }
@@ -1835,23 +2771,18 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
         if (encode) {
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      ", q=%d-%d", enc->qmin, enc->qmax);
+        } else {
+            if (enc->properties & FF_CODEC_PROPERTY_CLOSED_CAPTIONS)
+                snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                         ", Closed Captions");
+            if (enc->properties & FF_CODEC_PROPERTY_LOSSLESS)
+                snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                         ", lossless");
         }
         break;
     case AVMEDIA_TYPE_AUDIO:
-        snprintf(buf, buf_size,
-                 "Audio: %s",
-                 codec_name);
-        if (profile)
-            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     " (%s)", profile);
-        if (enc->codec_tag) {
-            char tag_buf[32];
-            av_get_codec_tag_string(tag_buf, sizeof(tag_buf), enc->codec_tag);
-            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     " [%s / 0x%04X]", tag_buf, enc->codec_tag);
-        }
+        av_strlcat(buf, separator, buf_size);
 
-        av_strlcat(buf, "\n      ", buf_size);
         if (enc->sample_rate) {
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      "%d Hz, ", enc->sample_rate);
@@ -1861,18 +2792,26 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      ", %s", av_get_sample_fmt_name(enc->sample_fmt));
         }
+        if (   enc->bits_per_raw_sample > 0
+            && enc->bits_per_raw_sample != av_get_bytes_per_sample(enc->sample_fmt) * 8)
+            snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                     " (%d bit)", enc->bits_per_raw_sample);
         break;
     case AVMEDIA_TYPE_DATA:
-        snprintf(buf, buf_size, "Data: %s", codec_name);
+        if (av_log_get_level() >= AV_LOG_DEBUG) {
+            int g = av_gcd(enc->time_base.num, enc->time_base.den);
+            if (g)
+                snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                         ", %d/%d",
+                         enc->time_base.num / g, enc->time_base.den / g);
+        }
         break;
     case AVMEDIA_TYPE_SUBTITLE:
-        snprintf(buf, buf_size, "Subtitle: %s", codec_name);
-        break;
-    case AVMEDIA_TYPE_ATTACHMENT:
-        snprintf(buf, buf_size, "Attachment: %s", codec_name);
+        if (enc->width)
+            snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                     ", %dx%d", enc->width, enc->height);
         break;
     default:
-        snprintf(buf, buf_size, "Invalid Codec type %d", enc->codec_type);
         return;
     }
     if (encode) {
@@ -1886,7 +2825,10 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
     bitrate = get_bit_rate(enc);
     if (bitrate != 0) {
         snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                 ", %d kb/s", bitrate / 1000);
+                 ", %"PRId64" kb/s", bitrate / 1000);
+    } else if (enc->rc_max_rate > 0) {
+        snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                 ", max. %"PRId64" kb/s", (int64_t)enc->rc_max_rate / 1000);
     }
 }
 
@@ -1905,18 +2847,25 @@ const char *av_get_profile_name(const AVCodec *codec, int profile)
 
 unsigned avcodec_version(void)
 {
+//    av_assert0(AV_CODEC_ID_V410==164);
+    av_assert0(AV_CODEC_ID_PCM_S8_PLANAR==65563);
+    av_assert0(AV_CODEC_ID_ADPCM_G722==69660);
+//     av_assert0(AV_CODEC_ID_BMV_AUDIO==86071);
+    av_assert0(AV_CODEC_ID_SRT==94216);
+    av_assert0(LIBAVCODEC_VERSION_MICRO >= 100);
+
     return LIBAVCODEC_VERSION_INT;
 }
 
 const char *avcodec_configuration(void)
 {
-    return LIBAV_CONFIGURATION;
+    return FFMPEG_CONFIGURATION;
 }
 
 const char *avcodec_license(void)
 {
 #define LICENSE_PREFIX "libavcodec license: "
-    return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1;
+    return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 }
 
 void avcodec_flush_buffers(AVCodecContext *avctx)
@@ -1926,6 +2875,9 @@ void avcodec_flush_buffers(AVCodecContext *avctx)
     else if (avctx->codec->flush)
         avctx->codec->flush(avctx);
 
+    avctx->pts_correction_last_pts =
+    avctx->pts_correction_last_dts = INT64_MIN;
+
     if (!avctx->refcounted_frames)
         av_frame_unref(avctx->internal->to_free);
 }
@@ -1933,16 +2885,24 @@ void avcodec_flush_buffers(AVCodecContext *avctx)
 int av_get_exact_bits_per_sample(enum AVCodecID codec_id)
 {
     switch (codec_id) {
+    case AV_CODEC_ID_8SVX_EXP:
+    case AV_CODEC_ID_8SVX_FIB:
     case AV_CODEC_ID_ADPCM_CT:
     case AV_CODEC_ID_ADPCM_IMA_APC:
     case AV_CODEC_ID_ADPCM_IMA_EA_SEAD:
+    case AV_CODEC_ID_ADPCM_IMA_OKI:
     case AV_CODEC_ID_ADPCM_IMA_WS:
     case AV_CODEC_ID_ADPCM_G722:
     case AV_CODEC_ID_ADPCM_YAMAHA:
         return 4;
+    case AV_CODEC_ID_DSD_LSBF:
+    case AV_CODEC_ID_DSD_MSBF:
+    case AV_CODEC_ID_DSD_LSBF_PLANAR:
+    case AV_CODEC_ID_DSD_MSBF_PLANAR:
     case AV_CODEC_ID_PCM_ALAW:
     case AV_CODEC_ID_PCM_MULAW:
     case AV_CODEC_ID_PCM_S8:
+    case AV_CODEC_ID_PCM_S8_PLANAR:
     case AV_CODEC_ID_PCM_U8:
     case AV_CODEC_ID_PCM_ZORK:
         return 8;
@@ -1976,6 +2936,27 @@ int av_get_exact_bits_per_sample(enum AVCodecID codec_id)
     }
 }
 
+enum AVCodecID av_get_pcm_codec(enum AVSampleFormat fmt, int be)
+{
+    static const enum AVCodecID map[AV_SAMPLE_FMT_NB][2] = {
+        [AV_SAMPLE_FMT_U8  ] = { AV_CODEC_ID_PCM_U8,    AV_CODEC_ID_PCM_U8    },
+        [AV_SAMPLE_FMT_S16 ] = { AV_CODEC_ID_PCM_S16LE, AV_CODEC_ID_PCM_S16BE },
+        [AV_SAMPLE_FMT_S32 ] = { AV_CODEC_ID_PCM_S32LE, AV_CODEC_ID_PCM_S32BE },
+        [AV_SAMPLE_FMT_FLT ] = { AV_CODEC_ID_PCM_F32LE, AV_CODEC_ID_PCM_F32BE },
+        [AV_SAMPLE_FMT_DBL ] = { AV_CODEC_ID_PCM_F64LE, AV_CODEC_ID_PCM_F64BE },
+        [AV_SAMPLE_FMT_U8P ] = { AV_CODEC_ID_PCM_U8,    AV_CODEC_ID_PCM_U8    },
+        [AV_SAMPLE_FMT_S16P] = { AV_CODEC_ID_PCM_S16LE, AV_CODEC_ID_PCM_S16BE },
+        [AV_SAMPLE_FMT_S32P] = { AV_CODEC_ID_PCM_S32LE, AV_CODEC_ID_PCM_S32BE },
+        [AV_SAMPLE_FMT_FLTP] = { AV_CODEC_ID_PCM_F32LE, AV_CODEC_ID_PCM_F32BE },
+        [AV_SAMPLE_FMT_DBLP] = { AV_CODEC_ID_PCM_F64LE, AV_CODEC_ID_PCM_F64BE },
+    };
+    if (fmt < 0 || fmt >= AV_SAMPLE_FMT_NB)
+        return AV_CODEC_ID_NONE;
+    if (be < 0 || be > 1)
+        be = AV_NE(1, 0);
+    return map[fmt][be];
+}
+
 int av_get_bits_per_sample(enum AVCodecID codec_id)
 {
     switch (codec_id) {
@@ -2006,8 +2987,8 @@ int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes)
     bps = av_get_exact_bits_per_sample(avctx->codec_id);
 
     /* codecs with an exact constant bits per sample */
-    if (bps > 0 && ch > 0 && frame_bytes > 0)
-        return (frame_bytes * 8) / (bps * ch);
+    if (bps > 0 && ch > 0 && frame_bytes > 0 && ch < 32768 && bps < 32768)
+        return (frame_bytes * 8LL) / (bps * ch);
     bps = avctx->bits_per_coded_sample;
 
     /* codecs with a fixed packet duration */
@@ -2016,16 +2997,16 @@ int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes)
     case AV_CODEC_ID_ADPCM_IMA_QT: return   64;
     case AV_CODEC_ID_ADPCM_EA_XAS: return  128;
     case AV_CODEC_ID_AMR_NB:
+    case AV_CODEC_ID_EVRC:
     case AV_CODEC_ID_GSM:
     case AV_CODEC_ID_QCELP:
-    case AV_CODEC_ID_RA_144:
     case AV_CODEC_ID_RA_288:       return  160;
-    case AV_CODEC_ID_IMC:          return  256;
     case AV_CODEC_ID_AMR_WB:
     case AV_CODEC_ID_GSM_MS:       return  320;
     case AV_CODEC_ID_MP1:          return  384;
     case AV_CODEC_ID_ATRAC1:       return  512;
     case AV_CODEC_ID_ATRAC3:       return 1024;
+    case AV_CODEC_ID_ATRAC3P:      return 2048;
     case AV_CODEC_ID_MP2:
     case AV_CODEC_ID_MUSEPACK7:    return 1152;
     case AV_CODEC_ID_AC3:          return 1536;
@@ -2066,6 +3047,10 @@ int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes)
             return 240 * (frame_bytes / 32);
         if (id == AV_CODEC_ID_NELLYMOSER)
             return 256 * (frame_bytes / 64);
+        if (id == AV_CODEC_ID_RA_144)
+            return 160 * (frame_bytes / 20);
+        if (id == AV_CODEC_ID_G723_1)
+            return 240 * (frame_bytes / 24);
 
         if (bps > 0) {
             /* calc from frame_bytes and bits_per_coded_sample */
@@ -2076,6 +3061,10 @@ int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes)
         if (ch > 0) {
             /* calc from frame_bytes and channels */
             switch (id) {
+            case AV_CODEC_ID_ADPCM_AFC:
+                return frame_bytes / (9 * ch) * 16;
+            case AV_CODEC_ID_ADPCM_DTK:
+                return frame_bytes / (16 * ch) * 28;
             case AV_CODEC_ID_ADPCM_4XM:
             case AV_CODEC_ID_ADPCM_IMA_ISS:
                 return (frame_bytes - 4 * ch) * 2 / ch;
@@ -2083,6 +3072,11 @@ int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes)
                 return (frame_bytes - 4) * 2 / ch;
             case AV_CODEC_ID_ADPCM_IMA_AMV:
                 return (frame_bytes - 8) * 2 / ch;
+            case AV_CODEC_ID_ADPCM_THP:
+            case AV_CODEC_ID_ADPCM_THP_LE:
+                if (avctx->extradata)
+                    return frame_bytes * 14 / (8 * ch);
+                break;
             case AV_CODEC_ID_ADPCM_XA:
                 return (frame_bytes / 128) * 224 / ch;
             case AV_CODEC_ID_INTERPLAY_DPCM:
@@ -2097,6 +3091,9 @@ int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes)
                 return 6 * frame_bytes / ch;
             case AV_CODEC_ID_PCM_LXF:
                 return 2 * (frame_bytes / (5 * ch));
+            case AV_CODEC_ID_IAC:
+            case AV_CODEC_ID_IMC:
+                return 4 * frame_bytes / ch;
             }
 
             if (tag) {
@@ -2114,11 +3111,15 @@ int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes)
                 int blocks = frame_bytes / ba;
                 switch (avctx->codec_id) {
                 case AV_CODEC_ID_ADPCM_IMA_WAV:
-                    return blocks * (1 + (ba - 4 * ch) / (4 * ch) * 8);
+                    if (bps < 2 || bps > 5)
+                        return 0;
+                    return blocks * (1 + (ba - 4 * ch) / (bps * ch) * 8);
                 case AV_CODEC_ID_ADPCM_IMA_DK3:
                     return blocks * (((ba - 16) * 2 / 3 * 4) / ch);
                 case AV_CODEC_ID_ADPCM_IMA_DK4:
                     return blocks * (1 + (ba - 4 * ch) * 2 / ch);
+                case AV_CODEC_ID_ADPCM_IMA_RAD:
+                    return blocks * ((ba - 4 * ch) * 2 / ch);
                 case AV_CODEC_ID_ADPCM_MS:
                     return blocks * (2 + (ba - 7 * ch) * 2 / ch);
                 }
@@ -2128,8 +3129,12 @@ int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes)
                 /* calc from frame_bytes, channels, and bits_per_coded_sample */
                 switch (avctx->codec_id) {
                 case AV_CODEC_ID_PCM_DVD:
+                    if(bps<4)
+                        return 0;
                     return 2 * (frame_bytes / ((bps * 2 / 8) * ch));
                 case AV_CODEC_ID_PCM_BLURAY:
+                    if(bps<4)
+                        return 0;
                     return frame_bytes / ((FFALIGN(ch, 2) * bps) / 8);
                 case AV_CODEC_ID_S302M:
                     return 2 * (frame_bytes / ((bps + 4) / 4)) / ch;
@@ -2138,6 +3143,17 @@ int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes)
         }
     }
 
+    /* Fall back on using frame_size */
+    if (avctx->frame_size > 1 && frame_bytes)
+        return avctx->frame_size;
+
+    //For WMA we currently have no other means to calculate duration thus we
+    //do it here by assuming CBR, which is true for all known cases.
+    if (avctx->bit_rate>0 && frame_bytes>0 && avctx->sample_rate>0 && avctx->block_align>1) {
+        if (avctx->codec_id == AV_CODEC_ID_WMAV1 || avctx->codec_id == AV_CODEC_ID_WMAV2)
+            return  (frame_bytes * 8LL * avctx->sample_rate) / avctx->bit_rate;
+    }
+
     return 0;
 }
 
@@ -2174,7 +3190,7 @@ int ff_match_2uint16(const uint16_t(*tab)[2], int size, int a, int b)
 FF_DISABLE_DEPRECATION_WARNINGS
 void av_log_missing_feature(void *avc, const char *feature, int want_sample)
 {
-    av_log(avc, AV_LOG_WARNING, "%s is not implemented. Update your Libav "
+    av_log(avc, AV_LOG_WARNING, "%s is not implemented. Update your FFmpeg "
             "version to the newest one from Git. If the problem still "
             "occurs, it means that your file has a feature which has not "
             "been implemented.\n", feature);
@@ -2191,8 +3207,8 @@ void av_log_ask_for_sample(void *avc, const char *msg, ...)
     if (msg)
         av_vlog(avc, AV_LOG_WARNING, msg, argument_list);
     av_log(avc, AV_LOG_WARNING, "If you want to help, upload a sample "
-            "of this file to ftp://upload.libav.org/incoming/ "
-            "and contact the libav-devel mailing list.\n");
+            "of this file to ftp://upload.ffmpeg.org/incoming/ "
+            "and contact the ffmpeg-devel mailing list. (ffmpeg-devel@ffmpeg.org)\n");
 
     va_end(argument_list);
 }
@@ -2200,14 +3216,15 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif /* FF_API_MISSING_SAMPLE */
 
 static AVHWAccel *first_hwaccel = NULL;
+static AVHWAccel **last_hwaccel = &first_hwaccel;
 
 void av_register_hwaccel(AVHWAccel *hwaccel)
 {
-    AVHWAccel **p = &first_hwaccel;
-    while (*p)
-        p = &(*p)->next;
-    *p = hwaccel;
+    AVHWAccel **p = last_hwaccel;
     hwaccel->next = NULL;
+    while(*p || avpriv_atomic_ptr_cas((void * volatile *)p, NULL, hwaccel))
+        p = &(*p)->next;
+    last_hwaccel = &hwaccel->next;
 }
 
 AVHWAccel *av_hwaccel_next(const AVHWAccel *hwaccel)
@@ -2247,6 +3264,48 @@ int av_lockmgr_register(int (*cb)(void **mutex, enum AVLockOp op))
     return 0;
 }
 
+int ff_lock_avcodec(AVCodecContext *log_ctx, const AVCodec *codec)
+{
+    if (codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE)
+        return 0;
+
+    if (lockmgr_cb) {
+        if ((*lockmgr_cb)(&codec_mutex, AV_LOCK_OBTAIN))
+            return -1;
+    }
+
+    if (avpriv_atomic_int_add_and_fetch(&entangled_thread_counter, 1) != 1) {
+        av_log(log_ctx, AV_LOG_ERROR,
+               "Insufficient thread locking. At least %d threads are "
+               "calling avcodec_open2() at the same time right now.\n",
+               entangled_thread_counter);
+        if (!lockmgr_cb)
+            av_log(log_ctx, AV_LOG_ERROR, "No lock manager is set, please see av_lockmgr_register()\n");
+        ff_avcodec_locked = 1;
+        ff_unlock_avcodec(codec);
+        return AVERROR(EINVAL);
+    }
+    av_assert0(!ff_avcodec_locked);
+    ff_avcodec_locked = 1;
+    return 0;
+}
+
+int ff_unlock_avcodec(const AVCodec *codec)
+{
+    if (codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE)
+        return 0;
+
+    av_assert0(ff_avcodec_locked);
+    ff_avcodec_locked = 0;
+    avpriv_atomic_int_add_and_fetch(&entangled_thread_counter, -1);
+    if (lockmgr_cb) {
+        if ((*lockmgr_cb)(&codec_mutex, AV_LOCK_RELEASE))
+            return -1;
+    }
+
+    return 0;
+}
+
 int avpriv_lock_avformat(void)
 {
     if (lockmgr_cb) {
@@ -2270,7 +3329,7 @@ unsigned int avpriv_toupper4(unsigned int x)
     return av_toupper(x & 0xFF) +
           (av_toupper((x >>  8) & 0xFF) << 8)  +
           (av_toupper((x >> 16) & 0xFF) << 16) +
-          (av_toupper((x >> 24) & 0xFF) << 24);
+((unsigned)av_toupper((x >> 24) & 0xFF) << 24);
 }
 
 int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src)
@@ -2283,6 +3342,8 @@ int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src)
     if (ret < 0)
         return ret;
 
+    av_assert0(!dst->progress);
+
     if (src->progress &&
         !(dst->progress = av_buffer_ref(src->progress))) {
         ff_thread_release_buffer(dst->owner, dst);
@@ -2294,6 +3355,11 @@ int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src)
 
 #if !HAVE_THREADS
 
+enum AVPixelFormat ff_thread_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
+{
+    return ff_get_format(avctx, fmt);
+}
+
 int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
 {
     f->owner = avctx;
@@ -2318,6 +3384,28 @@ void ff_thread_await_progress(ThreadFrame *f, int progress, int field)
 {
 }
 
+int ff_thread_can_start_frame(AVCodecContext *avctx)
+{
+    return 1;
+}
+
+int ff_alloc_entries(AVCodecContext *avctx, int count)
+{
+    return 0;
+}
+
+void ff_reset_entries(AVCodecContext *avctx)
+{
+}
+
+void ff_thread_await_progress2(AVCodecContext *avctx, int field, int thread, int shift)
+{
+}
+
+void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n)
+{
+}
+
 #endif
 
 int avcodec_is_open(AVCodecContext *s)
@@ -2325,13 +3413,36 @@ int avcodec_is_open(AVCodecContext *s)
     return !!s->internal;
 }
 
-const uint8_t *avpriv_find_start_code(const uint8_t *restrict p,
+int avpriv_bprint_to_extradata(AVCodecContext *avctx, struct AVBPrint *buf)
+{
+    int ret;
+    char *str;
+
+    ret = av_bprint_finalize(buf, &str);
+    if (ret < 0)
+        return ret;
+    if (!av_bprint_is_complete(buf)) {
+        av_free(str);
+        return AVERROR(ENOMEM);
+    }
+
+    avctx->extradata = str;
+    /* Note: the string is NUL terminated (so extradata can be read as a
+     * string), but the ending character is not accounted in the size (in
+     * binary formats you are likely not supposed to mux that character). When
+     * extradata is copied, it is also padded with AV_INPUT_BUFFER_PADDING_SIZE
+     * zeros. */
+    avctx->extradata_size = buf->len;
+    return 0;
+}
+
+const uint8_t *avpriv_find_start_code(const uint8_t *av_restrict p,
                                       const uint8_t *end,
-                                      uint32_t * restrict state)
+                                      uint32_t *av_restrict state)
 {
     int i;
 
-    assert(p <= end);
+    av_assert0(p <= end);
     if (p >= end)
         return end;
 
diff --git a/libavcodec/utvideo.c b/libavcodec/utvideo.c
index eb5a924222..308adb75d9 100644
--- a/libavcodec/utvideo.c
+++ b/libavcodec/utvideo.c
@@ -2,20 +2,20 @@
  * Common Ut Video code
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/utvideo.h b/libavcodec/utvideo.h
index 718273c47f..78c3ec54dc 100644
--- a/libavcodec/utvideo.h
+++ b/libavcodec/utvideo.h
@@ -2,20 +2,20 @@
  * Common Ut Video header
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/utvideodec.c b/libavcodec/utvideodec.c
index 30ca4d2164..760d9e5a7f 100644
--- a/libavcodec/utvideodec.c
+++ b/libavcodec/utvideodec.c
@@ -2,20 +2,20 @@
  * Ut Video decoder
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,13 +56,14 @@ static int build_huff(const uint8_t *src, VLC *vlc, int *fsym)
         *fsym = he[0].sym;
         return 0;
     }
-    if (he[0].len > 32)
-        return -1;
 
     last = 255;
     while (he[last].len == 255 && last)
         last--;
 
+    if (he[last].len > 32)
+        return -1;
+
     code = 1;
     for (i = last; i >= 0; i--) {
         codes[i] = code >> (32 - he[i].len);
@@ -71,7 +72,7 @@ static int build_huff(const uint8_t *src, VLC *vlc, int *fsym)
         code += 0x80000000u >> (he[i].len - 1);
     }
 
-    return ff_init_vlc_sparse(vlc, FFMIN(he[last].len, 9), last + 1,
+    return ff_init_vlc_sparse(vlc, FFMIN(he[last].len, 11), last + 1,
                               bits,  sizeof(*bits),  sizeof(*bits),
                               codes, sizeof(*codes), sizeof(*codes),
                               syms,  sizeof(*syms),  sizeof(*syms), 0);
@@ -156,7 +157,7 @@ static int decode_plane(UtvideoContext *c, int plane_no,
                            "Slice decoding ran out of bits\n");
                     goto fail;
                 }
-                pix = get_vlc2(&gb, vlc.table, vlc.bits, 4);
+                pix = get_vlc2(&gb, vlc.table, vlc.bits, 3);
                 if (pix < 0) {
                     av_log(c->avctx, AV_LOG_ERROR, "Decoding error\n");
                     goto fail;
@@ -213,9 +214,9 @@ static void restore_median(uint8_t *src, int step, int stride,
         slice_start  = ((slice * height) / slices) & cmask;
         slice_height = ((((slice + 1) * height) / slices) & cmask) -
                        slice_start;
+
         if (!slice_height)
             continue;
-
         bsrc = src + slice_start * stride;
 
         // first line - left neighbour prediction
@@ -226,7 +227,7 @@ static void restore_median(uint8_t *src, int step, int stride,
             A        = bsrc[i];
         }
         bsrc += stride;
-        if (slice_height == 1)
+        if (slice_height <= 1)
             continue;
         // second line - first element has top prediction, the rest uses median
         C        = bsrc[-stride];
@@ -288,7 +289,7 @@ static void restore_median_il(uint8_t *src, int step, int stride,
             A                 = bsrc[stride + i];
         }
         bsrc += stride2;
-        if (slice_height == 1)
+        if (slice_height <= 1)
             continue;
         // second line - first element has top prediction, the rest uses median
         C        = bsrc[-stride2];
@@ -339,12 +340,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     GetByteContext gb;
     ThreadFrame frame = { .f = data };
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
-    }
-
-    ff_thread_finish_setup(avctx);
 
     /* parse plane structure to get frame flags and validate slice offsets */
     bytestream2_init(&gb, buf, buf_size);
diff --git a/libavcodec/utvideoenc.c b/libavcodec/utvideoenc.c
index 9af0f66a71..b8e1cc33e5 100644
--- a/libavcodec/utvideoenc.c
+++ b/libavcodec/utvideoenc.c
@@ -2,20 +2,20 @@
  * Ut Video encoder
  * Copyright (c) 2012 Jan Ekström
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -380,7 +380,7 @@ static int write_huff_codes(uint8_t *src, uint8_t *dst, int dst_size,
 }
 
 static int encode_plane(AVCodecContext *avctx, uint8_t *src,
-                        uint8_t *dst, int stride,
+                        uint8_t *dst, int stride, int plane_no,
                         int width, int height, PutByteContext *pb)
 {
     UtvideoContext *c        = avctx->priv_data;
@@ -390,15 +390,17 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     HuffEntry he[256];
 
     uint32_t offset = 0, slice_len = 0;
+    const int cmask = ~(!plane_no && avctx->pix_fmt == AV_PIX_FMT_YUV420P);
     int      i, sstart, send = 0;
     int      symbol;
+    int      ret;
 
     /* Do prediction / make planes */
     switch (c->frame_pred) {
     case PRED_NONE:
         for (i = 0; i < c->slices; i++) {
             sstart = send;
-            send   = height * (i + 1) / c->slices;
+            send   = height * (i + 1) / c->slices & cmask;
             av_image_copy_plane(dst + sstart * width, width,
                                 src + sstart * stride, stride,
                                 width, send - sstart);
@@ -407,7 +409,7 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     case PRED_LEFT:
         for (i = 0; i < c->slices; i++) {
             sstart = send;
-            send   = height * (i + 1) / c->slices;
+            send   = height * (i + 1) / c->slices & cmask;
             left_predict(src + sstart * stride, dst + sstart * width,
                          stride, width, send - sstart);
         }
@@ -415,7 +417,7 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     case PRED_MEDIAN:
         for (i = 0; i < c->slices; i++) {
             sstart = send;
-            send   = height * (i + 1) / c->slices;
+            send   = height * (i + 1) / c->slices & cmask;
             median_predict(c, src + sstart * stride, dst + sstart * width,
                            stride, width, send - sstart);
         }
@@ -434,7 +436,7 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
         /* If non-zero count is found, see if it matches width * height */
         if (counts[symbol]) {
             /* Special case if only one symbol was used */
-            if (counts[symbol] == width * height) {
+            if (counts[symbol] == width * (int64_t)height) {
                 /*
                  * Write a zero for the single symbol
                  * used in the plane, else 0xFF.
@@ -458,7 +460,8 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     }
 
     /* Calculate huffman lengths */
-    ff_huff_gen_len_table(lengths, counts);
+    if ((ret = ff_huff_gen_len_table(lengths, counts, 256, 1)) < 0)
+        return ret;
 
     /*
      * Write the plane's header into the output packet:
@@ -478,14 +481,14 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     send = 0;
     for (i = 0; i < c->slices; i++) {
         sstart  = send;
-        send    = height * (i + 1) / c->slices;
+        send    = height * (i + 1) / c->slices & cmask;
 
         /*
          * Write the huffman codes to a buffer,
          * get the offset in bits and convert to bytes.
          */
         offset += write_huff_codes(dst + sstart * width, c->slice_bits,
-                                   width * (send - sstart), width,
+                                   width * height + 4, width,
                                    send - sstart, he) >> 3;
 
         slice_len = offset - slice_len;
@@ -532,22 +535,17 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int i, ret = 0;
 
     /* Allocate a new packet if needed, and set it to the pointer dst */
-    ret = ff_alloc_packet(pkt, (256 + 4 * c->slices + width * height) *
-                          c->planes + 4);
+    ret = ff_alloc_packet2(avctx, pkt, (256 + 4 * c->slices + width * height) *
+                           c->planes + 4, 0);
 
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Error allocating the output packet, or the provided packet "
-               "was too small.\n");
+    if (ret < 0)
         return ret;
-    }
 
     dst = pkt->data;
 
     bytestream2_init_writer(&pb, dst, pkt->size);
 
-    av_fast_malloc(&c->slice_bits, &c->slice_bits_size,
-                   width * height + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&c->slice_bits, &c->slice_bits_size, width * height + 4);
 
     if (!c->slice_bits) {
         av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer 2.\n");
@@ -565,7 +563,7 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_RGBA:
         for (i = 0; i < c->planes; i++) {
             ret = encode_plane(avctx, c->slice_buffer[i] + 2 * c->slice_stride,
-                               c->slice_buffer[i], c->slice_stride,
+                               c->slice_buffer[i], c->slice_stride, i,
                                width, height, &pb);
 
             if (ret) {
@@ -577,7 +575,7 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_YUV422P:
         for (i = 0; i < c->planes; i++) {
             ret = encode_plane(avctx, pic->data[i], c->slice_buffer[0],
-                               pic->linesize[i], width >> !!i, height, &pb);
+                               pic->linesize[i], i, width >> !!i, height, &pb);
 
             if (ret) {
                 av_log(avctx, AV_LOG_ERROR, "Error encoding plane %d.\n", i);
@@ -588,7 +586,7 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_YUV420P:
         for (i = 0; i < c->planes; i++) {
             ret = encode_plane(avctx, pic->data[i], c->slice_buffer[0],
-                               pic->linesize[i], width >> !!i, height >> !!i,
+                               pic->linesize[i], i, width >> !!i, height >> !!i,
                                &pb);
 
             if (ret) {
@@ -640,6 +638,7 @@ AVCodec ff_utvideo_encoder = {
     .init           = utvideo_encode_init,
     .encode2        = utvideo_encode_frame,
     .close          = utvideo_encode_close,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
                           AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA, AV_PIX_FMT_YUV422P,
                           AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE
diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index 74301fe48b..12d5d5407f 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -4,31 +4,55 @@
  * Copyright (C) 2009 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2009 Baptiste Coudurier <baptiste dot coudurier at gmail dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
 #include "internal.h"
+#include "v210dec.h"
 #include "libavutil/bswap.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 
+#define READ_PIXELS(a, b, c)         \
+    do {                             \
+        val  = av_le2ne32(*src++);   \
+        *a++ =  val & 0x3FF;         \
+        *b++ = (val >> 10) & 0x3FF;  \
+        *c++ = (val >> 20) & 0x3FF;  \
+    } while (0)
+
+static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+{
+    uint32_t val;
+    int i;
+
+    for( i = 0; i < width-5; i += 6 ){
+        READ_PIXELS(u, y, v);
+        READ_PIXELS(y, u, y);
+        READ_PIXELS(v, y, u);
+        READ_PIXELS(y, v, y);
+    }
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
+    V210DecContext *s = avctx->priv_data;
+
     if (avctx->width & 1) {
         av_log(avctx, AV_LOG_ERROR, "v210 needs even width\n");
         return AVERROR_INVALIDDATA;
@@ -36,22 +60,48 @@ static av_cold int decode_init(AVCodecContext *avctx)
     avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
     avctx->bits_per_raw_sample = 10;
 
+    s->unpack_frame            = v210_planar_unpack_c;
+
+    if (HAVE_MMX)
+        ff_v210_x86_init(s);
+
     return 0;
 }
 
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
-    int h, w, ret;
+    V210DecContext *s = avctx->priv_data;
+
+    int h, w, ret, stride, aligned_input;
     AVFrame *pic = data;
     const uint8_t *psrc = avpkt->data;
     uint16_t *y, *u, *v;
-    int aligned_width = ((avctx->width + 47) / 48) * 48;
-    int stride = aligned_width * 8 / 3;
+
+    if (s->custom_stride )
+        stride = s->custom_stride;
+    else {
+        int aligned_width = ((avctx->width + 47) / 48) * 48;
+        stride = aligned_width * 8 / 3;
+    }
 
     if (avpkt->size < stride * avctx->height) {
-        av_log(avctx, AV_LOG_ERROR, "packet too small\n");
-        return AVERROR_INVALIDDATA;
+        if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
+            stride = avpkt->size / avctx->height;
+            if (!s->stride_warning_shown)
+                av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small padding (64 byte) detected\n");
+            s->stride_warning_shown = 1;
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
+    if (aligned_input != s->aligned_input) {
+        s->aligned_input = aligned_input;
+        if (HAVE_MMX)
+            ff_v210_x86_init(s);
     }
 
     if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
@@ -63,36 +113,31 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     pic->pict_type = AV_PICTURE_TYPE_I;
     pic->key_frame = 1;
 
-#define READ_PIXELS(a, b, c)         \
-    do {                             \
-        val  = av_le2ne32(*src++);   \
-        *a++ =  val & 0x3FF;         \
-        *b++ = (val >> 10) & 0x3FF;  \
-        *c++ = (val >> 20) & 0x3FF;  \
-    } while (0)
-
     for (h = 0; h < avctx->height; h++) {
         const uint32_t *src = (const uint32_t*)psrc;
         uint32_t val;
-        for (w = 0; w < avctx->width - 5; w += 6) {
-            READ_PIXELS(u, y, v);
-            READ_PIXELS(y, u, y);
-            READ_PIXELS(v, y, u);
-            READ_PIXELS(y, v, y);
-        }
+
+        w = (avctx->width / 6) * 6;
+        s->unpack_frame(src, y, u, v, w);
+
+        y += w;
+        u += w >> 1;
+        v += w >> 1;
+        src += (w << 1) / 3;
+
         if (w < avctx->width - 1) {
             READ_PIXELS(u, y, v);
 
             val  = av_le2ne32(*src++);
             *y++ =  val & 0x3FF;
-        }
-        if (w < avctx->width - 3) {
-            *u++ = (val >> 10) & 0x3FF;
-            *y++ = (val >> 20) & 0x3FF;
+            if (w < avctx->width - 3) {
+                *u++ = (val >> 10) & 0x3FF;
+                *y++ = (val >> 20) & 0x3FF;
 
-            val  = av_le2ne32(*src++);
-            *v++ =  val & 0x3FF;
-            *y++ = (val >> 10) & 0x3FF;
+                val  = av_le2ne32(*src++);
+                *v++ =  val & 0x3FF;
+                *y++ = (val >> 10) & 0x3FF;
+            }
         }
 
         psrc += stride;
@@ -101,17 +146,40 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         v += pic->linesize[2] / 2 - avctx->width / 2;
     }
 
+    if (avctx->field_order > AV_FIELD_PROGRESSIVE) {
+        /* we have interlaced material flagged in container */
+        pic->interlaced_frame = 1;
+        if (avctx->field_order == AV_FIELD_TT || avctx->field_order == AV_FIELD_TB)
+            pic->top_field_first = 1;
+    }
+
     *got_frame      = 1;
 
     return avpkt->size;
 }
 
+#define V210DEC_FLAGS AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
+static const AVOption v210dec_options[] = {
+    {"custom_stride", "Custom V210 stride", offsetof(V210DecContext, custom_stride), AV_OPT_TYPE_INT,
+     {.i64 = 0}, INT_MIN, INT_MAX, V210DEC_FLAGS},
+    {NULL}
+};
+
+static const AVClass v210dec_class = {
+    "V210 Decoder",
+    av_default_item_name,
+    v210dec_options,
+    LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_v210_decoder = {
     .name           = "v210",
     .long_name      = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_V210,
+    .priv_data_size = sizeof(V210DecContext),
     .init           = decode_init,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_class     = &v210dec_class,
 };
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
new file mode 100644
index 0000000000..533afc435c
--- /dev/null
+++ b/libavcodec/v210dec.h
@@ -0,0 +1,36 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_V210DEC_H
+#define AVCODEC_V210DEC_H
+
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+
+
+typedef struct {
+    AVClass *av_class;
+    int custom_stride;
+    int aligned_input;
+    int stride_warning_shown;
+    void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+} V210DecContext;
+
+void ff_v210_x86_init(V210DecContext *s);
+
+#endif /* AVCODEC_V210DEC_H */
diff --git a/libavcodec/v210enc.c b/libavcodec/v210enc.c
index ca6ad2ee2f..061224885f 100644
--- a/libavcodec/v210enc.c
+++ b/libavcodec/v210enc.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2009 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2009 Baptiste Coudurier <baptiste dot coudurier at gmail dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -121,7 +121,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int h, w, ret;
     uint8_t *dst;
 
-    ret = ff_alloc_packet(pkt, avctx->height * stride);
+    ret = ff_alloc_packet2(avctx, pkt, avctx->height * stride, avctx->height * stride);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
         return ret;
diff --git a/libavcodec/v210enc.h b/libavcodec/v210enc.h
index 81a3531228..a2054270d3 100644
--- a/libavcodec/v210enc.h
+++ b/libavcodec/v210enc.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/v210x.c b/libavcodec/v210x.c
index 3f220ff6f4..f6a453aabf 100644
--- a/libavcodec/v210x.c
+++ b/libavcodec/v210x.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2009 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/v308dec.c b/libavcodec/v308dec.c
new file mode 100644
index 0000000000..dd53fbded4
--- /dev/null
+++ b/libavcodec/v308dec.c
@@ -0,0 +1,83 @@
+/*
+ * v308 decoder
+ * Copyright (c) 2011 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int v308_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+
+    if (avctx->width & 1)
+        av_log(avctx, AV_LOG_WARNING, "v308 requires width to be even.\n");
+
+    return 0;
+}
+
+static int v308_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *pic = data;
+    const uint8_t *src = avpkt->data;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if (avpkt->size < 3 * avctx->height * avctx->width) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+
+    for (i = 0; i < avctx->height; i++) {
+        for (j = 0; j < avctx->width; j++) {
+            v[j] = *src++;
+            y[j] = *src++;
+            u[j] = *src++;
+        }
+
+        y += pic->linesize[0];
+        u += pic->linesize[1];
+        v += pic->linesize[2];
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_v308_decoder = {
+    .name         = "v308",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_V308,
+    .init         = v308_decode_init,
+    .decode       = v308_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/v308enc.c b/libavcodec/v308enc.c
new file mode 100644
index 0000000000..b60a72cee6
--- /dev/null
+++ b/libavcodec/v308enc.c
@@ -0,0 +1,83 @@
+/*
+ * v308 encoder
+ *
+ * Copyright (c) 2011 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int v308_encode_init(AVCodecContext *avctx)
+{
+    if (avctx->width & 1) {
+        av_log(avctx, AV_LOG_ERROR, "v308 requires width to be even.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int v308_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    uint8_t *dst;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width * avctx->height * 3, 0)) < 0)
+        return ret;
+    dst = pkt->data;
+
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+
+    for (i = 0; i < avctx->height; i++) {
+        for (j = 0; j < avctx->width; j++) {
+            *dst++ = v[j];
+            *dst++ = y[j];
+            *dst++ = u[j];
+        }
+        y += pic->linesize[0];
+        u += pic->linesize[1];
+        v += pic->linesize[2];
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int v308_encode_close(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+AVCodec ff_v308_encoder = {
+    .name         = "v308",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_V308,
+    .init         = v308_encode_init,
+    .encode2      = v308_encode_frame,
+    .close        = v308_encode_close,
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV444P, AV_PIX_FMT_NONE },
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY,
+};
diff --git a/libavcodec/v408dec.c b/libavcodec/v408dec.c
new file mode 100644
index 0000000000..acff95d6bb
--- /dev/null
+++ b/libavcodec/v408dec.c
@@ -0,0 +1,103 @@
+/*
+ * v408 decoder
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int v408_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+
+    return 0;
+}
+
+static int v408_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *pic = data;
+    const uint8_t *src = avpkt->data;
+    uint8_t *y, *u, *v, *a;
+    int i, j, ret;
+
+    if (avpkt->size < 4 * avctx->height * avctx->width) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+    a = pic->data[3];
+
+    for (i = 0; i < avctx->height; i++) {
+        for (j = 0; j < avctx->width; j++) {
+            if (avctx->codec_id==AV_CODEC_ID_AYUV) {
+                v[j] = *src++;
+                u[j] = *src++;
+                y[j] = *src++;
+                a[j] = *src++;
+            } else {
+                u[j] = *src++;
+                y[j] = *src++;
+                v[j] = *src++;
+                a[j] = *src++;
+            }
+        }
+
+        y += pic->linesize[0];
+        u += pic->linesize[1];
+        v += pic->linesize[2];
+        a += pic->linesize[3];
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+#if CONFIG_AYUV_DECODER
+AVCodec ff_ayuv_decoder = {
+    .name         = "ayuv",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed MS 4:4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_AYUV,
+    .init         = v408_decode_init,
+    .decode       = v408_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
+#endif
+#if CONFIG_V408_DECODER
+AVCodec ff_v408_decoder = {
+    .name         = "v408",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed QT 4:4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_V408,
+    .init         = v408_decode_init,
+    .decode       = v408_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
+#endif
diff --git a/libavcodec/v408enc.c b/libavcodec/v408enc.c
new file mode 100644
index 0000000000..f37f360b73
--- /dev/null
+++ b/libavcodec/v408enc.c
@@ -0,0 +1,104 @@
+/*
+ * v408 encoder
+ *
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int v408_encode_init(AVCodecContext *avctx)
+{
+
+    return 0;
+}
+
+static int v408_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    uint8_t *dst;
+    uint8_t *y, *u, *v, *a;
+    int i, j, ret;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width * avctx->height * 4, 0)) < 0)
+        return ret;
+    dst = pkt->data;
+
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+    a = pic->data[3];
+
+    for (i = 0; i < avctx->height; i++) {
+        for (j = 0; j < avctx->width; j++) {
+           if (avctx->codec_id==AV_CODEC_ID_AYUV) {
+                *dst++ = v[j];
+                *dst++ = u[j];
+                *dst++ = y[j];
+                *dst++ = a[j];
+            } else {
+                *dst++ = u[j];
+                *dst++ = y[j];
+                *dst++ = v[j];
+                *dst++ = a[j];
+            }
+        }
+        y += pic->linesize[0];
+        u += pic->linesize[1];
+        v += pic->linesize[2];
+        a += pic->linesize[3];
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int v408_encode_close(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+#if CONFIG_AYUV_ENCODER
+AVCodec ff_ayuv_encoder = {
+    .name         = "ayuv",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed MS 4:4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_AYUV,
+    .init         = v408_encode_init,
+    .encode2      = v408_encode_frame,
+    .close        = v408_encode_close,
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUVA444P, AV_PIX_FMT_NONE },
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif
+#if CONFIG_V408_ENCODER
+AVCodec ff_v408_encoder = {
+    .name         = "v408",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed QT 4:4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_V408,
+    .init         = v408_encode_init,
+    .encode2      = v408_encode_frame,
+    .close        = v408_encode_close,
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUVA444P, AV_PIX_FMT_NONE },
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif
diff --git a/libavcodec/v410dec.c b/libavcodec/v410dec.c
index ca68e0b7ec..48fab68273 100644
--- a/libavcodec/v410dec.c
+++ b/libavcodec/v410dec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2011 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,17 +49,15 @@ static int v410_decode_frame(AVCodecContext *avctx, void *data,
     uint8_t *src = avpkt->data;
     uint16_t *y, *u, *v;
     uint32_t val;
-    int i, j;
+    int i, j, ret;
 
     if (avpkt->size < 4 * avctx->height * avctx->width) {
         av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
         return AVERROR(EINVAL);
     }
 
-    if (ff_get_buffer(avctx, pic, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
-        return AVERROR(ENOMEM);
-    }
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
 
     pic->key_frame = 1;
     pic->pict_type = AV_PICTURE_TYPE_I;
diff --git a/libavcodec/v410enc.c b/libavcodec/v410enc.c
index 1e3f38fd71..f35ff75963 100644
--- a/libavcodec/v410enc.c
+++ b/libavcodec/v410enc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2011 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
 static av_cold int v410_encode_init(AVCodecContext *avctx)
 {
     if (avctx->width & 1) {
-        av_log(avctx, AV_LOG_ERROR, "v410 requires even width.\n");
+        av_log(avctx, AV_LOG_ERROR, "v410 requires width to be even.\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -43,10 +43,9 @@ static int v410_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     uint32_t val;
     int i, j, ret;
 
-    if ((ret = ff_alloc_packet(pkt, avctx->width * avctx->height * 4)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width * avctx->height * 4,
+                                            avctx->width * avctx->height * 4)) < 0)
         return ret;
-    }
     dst = pkt->data;
 
 #if FF_API_CODED_FRAME
diff --git a/libavcodec/vaapi.c b/libavcodec/vaapi.c
index d00be2fc98..a1ea790add 100644
--- a/libavcodec/vaapi.c
+++ b/libavcodec/vaapi.c
@@ -4,24 +4,24 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "h264.h"
+#include "libavutil/log.h"
 #include "mpegvideo.h"
 #include "vaapi_internal.h"
 
@@ -35,27 +35,56 @@ static void destroy_buffers(VADisplay display, VABufferID *buffers, unsigned int
 {
     unsigned int i;
     for (i = 0; i < n_buffers; i++) {
-        if (buffers[i]) {
+        if (buffers[i] != VA_INVALID_ID) {
             vaDestroyBuffer(display, buffers[i]);
-            buffers[i] = 0;
+            buffers[i] = VA_INVALID_ID;
         }
     }
 }
 
-int ff_vaapi_render_picture(struct vaapi_context *vactx, VASurfaceID surface)
+int ff_vaapi_context_init(AVCodecContext *avctx)
+{
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
+    const struct vaapi_context * const user_vactx = avctx->hwaccel_context;
+
+    if (!user_vactx) {
+        av_log(avctx, AV_LOG_ERROR, "Hardware acceleration context (hwaccel_context) does not exist.\n");
+        return AVERROR(ENOSYS);
+    }
+
+    vactx->display              = user_vactx->display;
+    vactx->config_id            = user_vactx->config_id;
+    vactx->context_id           = user_vactx->context_id;
+
+    vactx->pic_param_buf_id     = VA_INVALID_ID;
+    vactx->iq_matrix_buf_id     = VA_INVALID_ID;
+    vactx->bitplane_buf_id      = VA_INVALID_ID;
+
+    return 0;
+}
+
+int ff_vaapi_context_fini(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+int ff_vaapi_render_picture(FFVAContext *vactx, VASurfaceID surface)
 {
     VABufferID va_buffers[3];
     unsigned int n_va_buffers = 0;
 
+    if (vactx->pic_param_buf_id == VA_INVALID_ID)
+        return 0;
+
     vaUnmapBuffer(vactx->display, vactx->pic_param_buf_id);
     va_buffers[n_va_buffers++] = vactx->pic_param_buf_id;
 
-    if (vactx->iq_matrix_buf_id) {
+    if (vactx->iq_matrix_buf_id != VA_INVALID_ID) {
         vaUnmapBuffer(vactx->display, vactx->iq_matrix_buf_id);
         va_buffers[n_va_buffers++] = vactx->iq_matrix_buf_id;
     }
 
-    if (vactx->bitplane_buf_id) {
+    if (vactx->bitplane_buf_id != VA_INVALID_ID) {
         vaUnmapBuffer(vactx->display, vactx->bitplane_buf_id);
         va_buffers[n_va_buffers++] = vactx->bitplane_buf_id;
     }
@@ -79,7 +108,7 @@ int ff_vaapi_render_picture(struct vaapi_context *vactx, VASurfaceID surface)
     return 0;
 }
 
-int ff_vaapi_commit_slices(struct vaapi_context *vactx)
+int ff_vaapi_commit_slices(FFVAContext *vactx)
 {
     VABufferID *slice_buf_ids;
     VABufferID slice_param_buf_id, slice_data_buf_id;
@@ -95,7 +124,7 @@ int ff_vaapi_commit_slices(struct vaapi_context *vactx)
         return -1;
     vactx->slice_buf_ids = slice_buf_ids;
 
-    slice_param_buf_id = 0;
+    slice_param_buf_id = VA_INVALID_ID;
     if (vaCreateBuffer(vactx->display, vactx->context_id,
                        VASliceParameterBufferType,
                        vactx->slice_param_size,
@@ -104,7 +133,7 @@ int ff_vaapi_commit_slices(struct vaapi_context *vactx)
         return -1;
     vactx->slice_count = 0;
 
-    slice_data_buf_id = 0;
+    slice_data_buf_id = VA_INVALID_ID;
     if (vaCreateBuffer(vactx->display, vactx->context_id,
                        VASliceDataBufferType,
                        vactx->slice_data_size,
@@ -119,11 +148,11 @@ int ff_vaapi_commit_slices(struct vaapi_context *vactx)
     return 0;
 }
 
-static void *alloc_buffer(struct vaapi_context *vactx, int type, unsigned int size, uint32_t *buf_id)
+static void *alloc_buffer(FFVAContext *vactx, int type, unsigned int size, uint32_t *buf_id)
 {
     void *data = NULL;
 
-    *buf_id = 0;
+    *buf_id = VA_INVALID_ID;
     if (vaCreateBuffer(vactx->display, vactx->context_id,
                        type, size, 1, NULL, buf_id) == VA_STATUS_SUCCESS)
         vaMapBuffer(vactx->display, *buf_id, &data);
@@ -131,22 +160,22 @@ static void *alloc_buffer(struct vaapi_context *vactx, int type, unsigned int si
     return data;
 }
 
-void *ff_vaapi_alloc_pic_param(struct vaapi_context *vactx, unsigned int size)
+void *ff_vaapi_alloc_pic_param(FFVAContext *vactx, unsigned int size)
 {
     return alloc_buffer(vactx, VAPictureParameterBufferType, size, &vactx->pic_param_buf_id);
 }
 
-void *ff_vaapi_alloc_iq_matrix(struct vaapi_context *vactx, unsigned int size)
+void *ff_vaapi_alloc_iq_matrix(FFVAContext *vactx, unsigned int size)
 {
     return alloc_buffer(vactx, VAIQMatrixBufferType, size, &vactx->iq_matrix_buf_id);
 }
 
-uint8_t *ff_vaapi_alloc_bitplane(struct vaapi_context *vactx, uint32_t size)
+uint8_t *ff_vaapi_alloc_bitplane(FFVAContext *vactx, uint32_t size)
 {
     return alloc_buffer(vactx, VABitPlaneBufferType, size, &vactx->bitplane_buf_id);
 }
 
-VASliceParameterBufferBase *ff_vaapi_alloc_slice(struct vaapi_context *vactx, const uint8_t *buffer, uint32_t size)
+VASliceParameterBufferBase *ff_vaapi_alloc_slice(FFVAContext *vactx, const uint8_t *buffer, uint32_t size)
 {
     uint8_t *slice_params;
     VASliceParameterBufferBase *slice_param;
@@ -179,7 +208,7 @@ VASliceParameterBufferBase *ff_vaapi_alloc_slice(struct vaapi_context *vactx, co
 
 void ff_vaapi_common_end_frame(AVCodecContext *avctx)
 {
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
 
     ff_dlog(avctx, "ff_vaapi_common_end_frame()\n");
 
@@ -200,7 +229,7 @@ void ff_vaapi_common_end_frame(AVCodecContext *avctx)
     CONFIG_VC1_VAAPI_HWACCEL   || CONFIG_WMV3_VAAPI_HWACCEL
 int ff_vaapi_mpeg_end_frame(AVCodecContext *avctx)
 {
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     MpegEncContext *s = avctx->priv_data;
     int ret;
 
diff --git a/libavcodec/vaapi.h b/libavcodec/vaapi.h
index 39e88259d6..7a29f6f881 100644
--- a/libavcodec/vaapi.h
+++ b/libavcodec/vaapi.h
@@ -1,23 +1,23 @@
 /*
- * Video Acceleration API (shared data between Libav and the video player)
+ * Video Acceleration API (shared data between FFmpeg and the video player)
  * HW decode acceleration for MPEG-2, MPEG-4, H.264 and VC-1
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,8 @@
  */
 
 #include <stdint.h>
+#include "libavutil/attributes.h"
+#include "version.h"
 
 /**
  * @defgroup lavc_codec_hwaccel_vaapi VA API Decoding
@@ -39,7 +41,7 @@
  */
 
 /**
- * This structure is used to share data between the Libav library and
+ * This structure is used to share data between the FFmpeg library and
  * the client video application.
  * This shall be zero-allocated and available as
  * AVCodecContext.hwaccel_context. All user members can be set once
@@ -72,12 +74,14 @@ struct vaapi_context {
      */
     uint32_t context_id;
 
+#if FF_API_VAAPI_CONTEXT
     /**
      * VAPictureParameterBuffer ID
      *
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     uint32_t pic_param_buf_id;
 
     /**
@@ -86,6 +90,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     uint32_t iq_matrix_buf_id;
 
     /**
@@ -94,6 +99,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     uint32_t bitplane_buf_id;
 
     /**
@@ -102,6 +108,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     uint32_t *slice_buf_ids;
 
     /**
@@ -110,6 +117,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     unsigned int n_slice_buf_ids;
 
     /**
@@ -118,6 +126,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     unsigned int slice_buf_ids_alloc;
 
     /**
@@ -126,6 +135,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     void *slice_params;
 
     /**
@@ -134,6 +144,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     unsigned int slice_param_size;
 
     /**
@@ -142,6 +153,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     unsigned int slice_params_alloc;
 
     /**
@@ -150,6 +162,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     unsigned int slice_count;
 
     /**
@@ -157,6 +170,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     const uint8_t *slice_data;
 
     /**
@@ -165,7 +179,9 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     uint32_t slice_data_size;
+#endif
 };
 
 /* @} */
diff --git a/libavcodec/vaapi_h264.c b/libavcodec/vaapi_h264.c
index 82a49f6060..ded2cb3d49 100644
--- a/libavcodec/vaapi_h264.c
+++ b/libavcodec/vaapi_h264.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 
 /**
  * @file
- * This file implements the glue code between Libav's and VA API's
+ * This file implements the glue code between FFmpeg's and VA API's
  * structures for H.264 decoding.
  */
 
@@ -44,10 +44,10 @@ static void init_vaapi_pic(VAPictureH264 *va_pic)
 }
 
 /**
- * Translate an Libav Picture into its VA API form.
+ * Translate an FFmpeg Picture into its VA API form.
  *
  * @param[out] va_pic          A pointer to VA API's own picture struct
- * @param[in]  pic             A pointer to the Libav picture struct to convert
+ * @param[in]  pic             A pointer to the FFmpeg picture struct to convert
  * @param[in]  pic_structure   The picture field type (as defined in mpegvideo.h),
  *                             supersedes pic's field type if nonzero.
  */
@@ -148,11 +148,11 @@ static int fill_vaapi_ReferenceFrames(VAPictureParameterBufferH264 *pic_param,
 }
 
 /**
- * Fill in VA API reference picture lists from the Libav reference
+ * Fill in VA API reference picture lists from the FFmpeg reference
  * picture list.
  *
  * @param[out] RefPicList  VA API internal reference picture list
- * @param[in]  ref_list    A pointer to the Libav reference list
+ * @param[in]  ref_list    A pointer to the FFmpeg reference list
  * @param[in]  ref_count   The number of reference pictures in ref_list
  */
 static void fill_vaapi_RefPicList(VAPictureH264 RefPicList[32],
@@ -162,7 +162,8 @@ static void fill_vaapi_RefPicList(VAPictureH264 RefPicList[32],
     unsigned int i, n = 0;
     for (i = 0; i < ref_count; i++)
         if (ref_list[i].reference)
-            fill_vaapi_pic(&RefPicList[n++], ref_list[i].parent, 0);
+            fill_vaapi_pic(&RefPicList[n++], ref_list[i].parent,
+                           ref_list[i].reference);
 
     for (; n < 32; n++)
         init_vaapi_pic(&RefPicList[n]);
@@ -226,7 +227,7 @@ static int vaapi_h264_start_frame(AVCodecContext          *avctx,
                                   av_unused uint32_t       size)
 {
     H264Context * const h = avctx->priv_data;
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VAPictureParameterBufferH264 *pic_param;
     VAIQMatrixBufferH264 *iq_matrix;
 
@@ -260,7 +261,7 @@ static int vaapi_h264_start_frame(AVCodecContext          *avctx,
     pic_param->seq_fields.bits.delta_pic_order_always_zero_flag = h->sps.delta_pic_order_always_zero_flag;
     pic_param->num_slice_groups_minus1                          = h->pps.slice_group_count - 1;
     pic_param->slice_group_map_type                             = h->pps.mb_slice_group_map_type;
-    pic_param->slice_group_change_rate_minus1                   = 0; /* XXX: unimplemented in Libav */
+    pic_param->slice_group_change_rate_minus1                   = 0; /* XXX: unimplemented in FFmpeg */
     pic_param->pic_init_qp_minus26                              = h->pps.init_qp - 26;
     pic_param->pic_init_qs_minus26                              = h->pps.init_qs - 26;
     pic_param->chroma_qp_index_offset                           = h->pps.chroma_qp_index_offset[0];
@@ -291,7 +292,7 @@ static int vaapi_h264_start_frame(AVCodecContext          *avctx,
 /** End a hardware decoding based frame. */
 static int vaapi_h264_end_frame(AVCodecContext *avctx)
 {
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     H264Context * const h = avctx->priv_data;
     H264SliceContext *sl = &h->slice_ctx[0];
     int ret;
@@ -317,6 +318,7 @@ static int vaapi_h264_decode_slice(AVCodecContext *avctx,
                                    const uint8_t  *buffer,
                                    uint32_t        size)
 {
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     H264Context * const h = avctx->priv_data;
     H264SliceContext *sl  = &h->slice_ctx[0];
     VASliceParameterBufferH264 *slice_param;
@@ -325,7 +327,7 @@ static int vaapi_h264_decode_slice(AVCodecContext *avctx,
             buffer, size);
 
     /* Fill in VASliceParameterBufferH264. */
-    slice_param = (VASliceParameterBufferH264 *)ff_vaapi_alloc_slice(avctx->hwaccel_context, buffer, size);
+    slice_param = (VASliceParameterBufferH264 *)ff_vaapi_alloc_slice(vactx, buffer, size);
     if (!slice_param)
         return -1;
     slice_param->slice_data_bit_offset          = get_bits_count(&sl->gb) + 8; /* bit buffer started beyond nal_unit_type */
@@ -358,8 +360,11 @@ AVHWAccel ff_h264_vaapi_hwaccel = {
     .name           = "h264_vaapi",
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_H264,
-    .pix_fmt        = AV_PIX_FMT_VAAPI_VLD,
+    .pix_fmt        = AV_PIX_FMT_VAAPI,
     .start_frame    = vaapi_h264_start_frame,
     .end_frame      = vaapi_h264_end_frame,
     .decode_slice   = vaapi_h264_decode_slice,
+    .init           = ff_vaapi_context_init,
+    .uninit         = ff_vaapi_context_fini,
+    .priv_data_size = sizeof(FFVAContext),
 };
diff --git a/libavcodec/vaapi_hevc.c b/libavcodec/vaapi_hevc.c
new file mode 100644
index 0000000000..762511f6e5
--- /dev/null
+++ b/libavcodec/vaapi_hevc.c
@@ -0,0 +1,490 @@
+/*
+ * HEVC HW decode acceleration through VA API
+ *
+ * Copyright (C) 2015 Timo Rothenpieler <timo@rothenpieler.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vaapi_internal.h"
+#include "hevc.h"
+#include "mpegutils.h"
+
+/**
+ * @file
+ * This file implements the glue code between FFmpeg's and VA API's
+ * structures for HEVC decoding.
+ */
+
+typedef struct vaapi_hevc_frame_data {
+    VAPictureParameterBufferHEVC *pic_param;
+    VASliceParameterBufferHEVC *last_slice_param;
+} vaapi_hevc_frame_data;
+
+/**
+ * Initialize an empty VA API picture.
+ *
+ * VA API requires a fixed-size reference picture array.
+ */
+static void init_vaapi_pic(VAPictureHEVC *va_pic)
+{
+    va_pic->picture_id = VA_INVALID_ID;
+    va_pic->flags = VA_PICTURE_HEVC_INVALID;
+    va_pic->pic_order_cnt = 0;
+}
+
+static void fill_vaapi_pic(VAPictureHEVC *va_pic, const HEVCFrame *pic, int rps_type)
+{
+    va_pic->picture_id = ff_vaapi_get_surface_id(pic->frame);
+    va_pic->pic_order_cnt = pic->poc;
+    va_pic->flags = rps_type;
+
+    if (pic->flags & HEVC_FRAME_FLAG_LONG_REF)
+        va_pic->flags |= VA_PICTURE_HEVC_LONG_TERM_REFERENCE;
+
+    if (pic->frame->interlaced_frame) {
+        va_pic->flags |= VA_PICTURE_HEVC_FIELD_PIC;
+
+        if (!pic->frame->top_field_first) {
+            va_pic->flags |= VA_PICTURE_HEVC_BOTTOM_FIELD;
+        }
+    }
+}
+
+static int find_frame_rps_type(const HEVCContext *h, const HEVCFrame *pic)
+{
+    VASurfaceID pic_surf = ff_vaapi_get_surface_id(pic->frame);
+    int i;
+
+    for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; ++i) {
+        if (pic_surf == ff_vaapi_get_surface_id(h->rps[ST_CURR_BEF].ref[i]->frame))
+            return VA_PICTURE_HEVC_RPS_ST_CURR_BEFORE;
+    }
+
+    for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; ++i) {
+        if (pic_surf == ff_vaapi_get_surface_id(h->rps[ST_CURR_AFT].ref[i]->frame))
+            return VA_PICTURE_HEVC_RPS_ST_CURR_AFTER;
+    }
+
+    for (i = 0; i < h->rps[LT_CURR].nb_refs; ++i) {
+        if (pic_surf == ff_vaapi_get_surface_id(h->rps[LT_CURR].ref[i]->frame))
+            return VA_PICTURE_HEVC_RPS_LT_CURR;
+    }
+
+    return 0;
+}
+
+static void fill_vaapi_ReferenceFrames(const HEVCContext *h, VAPictureParameterBufferHEVC *pp)
+{
+    const HEVCFrame *current_picture = h->ref;
+    int i, j, rps_type;
+
+    for (i = 0, j = 0; i < FF_ARRAY_ELEMS(pp->ReferenceFrames); i++) {
+        const HEVCFrame *frame = NULL;
+
+        while (!frame && j < FF_ARRAY_ELEMS(h->DPB)) {
+            if (&h->DPB[j] != current_picture && (h->DPB[j].flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF)))
+                frame = &h->DPB[j];
+            j++;
+        }
+
+        init_vaapi_pic(&pp->ReferenceFrames[i]);
+
+        if (frame) {
+            rps_type = find_frame_rps_type(h, frame);
+            fill_vaapi_pic(&pp->ReferenceFrames[i], frame, rps_type);
+        }
+    }
+}
+
+static uint8_t get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame)
+{
+    vaapi_hevc_frame_data *frame_data = h->ref->hwaccel_picture_private;
+    VAPictureParameterBufferHEVC *pp = frame_data->pic_param;
+    uint8_t i;
+
+    if (!frame)
+        return 0xff;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(pp->ReferenceFrames); ++i) {
+        VASurfaceID pid = pp->ReferenceFrames[i].picture_id;
+        int poc = pp->ReferenceFrames[i].pic_order_cnt;
+        if (pid != VA_INVALID_ID && pid == ff_vaapi_get_surface_id(frame->frame) && poc == frame->poc)
+            return i;
+    }
+
+    return 0xff;
+}
+
+static void fill_picture_parameters(const HEVCContext *h, VAPictureParameterBufferHEVC *pp)
+{
+    int i;
+
+    pp->pic_fields.value = 0;
+    pp->slice_parsing_fields.value = 0;
+
+    fill_vaapi_pic(&pp->CurrPic, h->ref, 0);
+    fill_vaapi_ReferenceFrames(h, pp);
+
+    pp->pic_width_in_luma_samples  = h->ps.sps->width;
+    pp->pic_height_in_luma_samples = h->ps.sps->height;
+
+    pp->log2_min_luma_coding_block_size_minus3 = h->ps.sps->log2_min_cb_size - 3;
+
+    pp->pic_fields.bits.chroma_format_idc = h->ps.sps->chroma_format_idc;
+
+    pp->sps_max_dec_pic_buffering_minus1 = h->ps.sps->temporal_layer[h->ps.sps->max_sub_layers - 1].max_dec_pic_buffering - 1;
+    pp->log2_diff_max_min_luma_coding_block_size = h->ps.sps->log2_diff_max_min_coding_block_size;
+    pp->log2_min_transform_block_size_minus2 = h->ps.sps->log2_min_tb_size - 2;
+    pp->log2_diff_max_min_transform_block_size = h->ps.sps->log2_max_trafo_size  - h->ps.sps->log2_min_tb_size;
+    pp->max_transform_hierarchy_depth_inter = h->ps.sps->max_transform_hierarchy_depth_inter;
+    pp->max_transform_hierarchy_depth_intra = h->ps.sps->max_transform_hierarchy_depth_intra;
+    pp->num_short_term_ref_pic_sets = h->ps.sps->nb_st_rps;
+    pp->num_long_term_ref_pic_sps = h->ps.sps->num_long_term_ref_pics_sps;
+
+    pp->num_ref_idx_l0_default_active_minus1 = h->ps.pps->num_ref_idx_l0_default_active - 1;
+    pp->num_ref_idx_l1_default_active_minus1 = h->ps.pps->num_ref_idx_l1_default_active - 1;
+    pp->init_qp_minus26 = h->ps.pps->pic_init_qp_minus26;
+
+    pp->pps_cb_qp_offset = h->ps.pps->cb_qp_offset;
+    pp->pps_cr_qp_offset = h->ps.pps->cr_qp_offset;
+
+    pp->pic_fields.bits.tiles_enabled_flag = h->ps.pps->tiles_enabled_flag;
+    pp->pic_fields.bits.separate_colour_plane_flag = h->ps.sps->separate_colour_plane_flag;
+    pp->pic_fields.bits.pcm_enabled_flag = h->ps.sps->pcm_enabled_flag;
+    pp->pic_fields.bits.scaling_list_enabled_flag = h->ps.sps->scaling_list_enable_flag;
+    pp->pic_fields.bits.transform_skip_enabled_flag = h->ps.pps->transform_skip_enabled_flag;
+    pp->pic_fields.bits.amp_enabled_flag = h->ps.sps->amp_enabled_flag;
+    pp->pic_fields.bits.strong_intra_smoothing_enabled_flag = h->ps.sps->sps_strong_intra_smoothing_enable_flag;
+    pp->pic_fields.bits.sign_data_hiding_enabled_flag = h->ps.pps->sign_data_hiding_flag;
+    pp->pic_fields.bits.constrained_intra_pred_flag = h->ps.pps->constrained_intra_pred_flag;
+    pp->pic_fields.bits.cu_qp_delta_enabled_flag = h->ps.pps->cu_qp_delta_enabled_flag;
+    pp->pic_fields.bits.weighted_pred_flag = h->ps.pps->weighted_pred_flag;
+    pp->pic_fields.bits.weighted_bipred_flag = h->ps.pps->weighted_bipred_flag;
+    pp->pic_fields.bits.transquant_bypass_enabled_flag = h->ps.pps->transquant_bypass_enable_flag;
+    pp->pic_fields.bits.entropy_coding_sync_enabled_flag = h->ps.pps->entropy_coding_sync_enabled_flag;
+    pp->pic_fields.bits.pps_loop_filter_across_slices_enabled_flag = h->ps.pps->seq_loop_filter_across_slices_enabled_flag;
+    pp->pic_fields.bits.loop_filter_across_tiles_enabled_flag = h->ps.pps->loop_filter_across_tiles_enabled_flag;
+
+    pp->pic_fields.bits.pcm_loop_filter_disabled_flag = h->ps.sps->pcm.loop_filter_disable_flag;
+    pp->pcm_sample_bit_depth_luma_minus1 = h->ps.sps->pcm.bit_depth - 1;
+    pp->pcm_sample_bit_depth_chroma_minus1 = h->ps.sps->pcm.bit_depth_chroma - 1;
+    pp->log2_min_pcm_luma_coding_block_size_minus3 = h->ps.sps->pcm.log2_min_pcm_cb_size - 3;
+    pp->log2_diff_max_min_pcm_luma_coding_block_size = h->ps.sps->pcm.log2_max_pcm_cb_size - h->ps.sps->pcm.log2_min_pcm_cb_size;
+
+    memset(pp->column_width_minus1, 0, sizeof(pp->column_width_minus1));
+    memset(pp->row_height_minus1, 0, sizeof(pp->row_height_minus1));
+
+    if (h->ps.pps->tiles_enabled_flag) {
+        pp->num_tile_columns_minus1 = h->ps.pps->num_tile_columns - 1;
+        pp->num_tile_rows_minus1 = h->ps.pps->num_tile_rows - 1;
+
+        for (i = 0; i < h->ps.pps->num_tile_columns; i++)
+            pp->column_width_minus1[i] = h->ps.pps->column_width[i] - 1;
+
+        for (i = 0; i < h->ps.pps->num_tile_rows; i++)
+            pp->row_height_minus1[i] = h->ps.pps->row_height[i] - 1;
+    }
+
+    pp->diff_cu_qp_delta_depth = h->ps.pps->diff_cu_qp_delta_depth;
+    pp->pps_beta_offset_div2 = h->ps.pps->beta_offset / 2;
+    pp->pps_tc_offset_div2 = h->ps.pps->tc_offset / 2;
+    pp->log2_parallel_merge_level_minus2 = h->ps.pps->log2_parallel_merge_level - 2;
+
+    /* Diffrent chroma/luma bit depths are currently not supported by ffmpeg. */
+    pp->bit_depth_luma_minus8 = h->ps.sps->bit_depth - 8;
+    pp->bit_depth_chroma_minus8 = h->ps.sps->bit_depth - 8;
+
+    pp->slice_parsing_fields.bits.lists_modification_present_flag = h->ps.pps->lists_modification_present_flag;
+    pp->slice_parsing_fields.bits.long_term_ref_pics_present_flag = h->ps.sps->long_term_ref_pics_present_flag;
+    pp->slice_parsing_fields.bits.sps_temporal_mvp_enabled_flag = h->ps.sps->sps_temporal_mvp_enabled_flag;
+    pp->slice_parsing_fields.bits.cabac_init_present_flag = h->ps.pps->cabac_init_present_flag;
+    pp->slice_parsing_fields.bits.output_flag_present_flag = h->ps.pps->output_flag_present_flag;
+    pp->slice_parsing_fields.bits.dependent_slice_segments_enabled_flag = h->ps.pps->dependent_slice_segments_enabled_flag;
+    pp->slice_parsing_fields.bits.pps_slice_chroma_qp_offsets_present_flag = h->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag;
+    pp->slice_parsing_fields.bits.sample_adaptive_offset_enabled_flag = h->ps.sps->sao_enabled;
+    pp->slice_parsing_fields.bits.deblocking_filter_override_enabled_flag = h->ps.pps->deblocking_filter_override_enabled_flag;
+    pp->slice_parsing_fields.bits.pps_disable_deblocking_filter_flag = h->ps.pps->disable_dbf;
+    pp->slice_parsing_fields.bits.slice_segment_header_extension_present_flag = h->ps.pps->slice_header_extension_present_flag;
+
+    pp->log2_max_pic_order_cnt_lsb_minus4 = h->ps.sps->log2_max_poc_lsb - 4;
+    pp->num_extra_slice_header_bits = h->ps.pps->num_extra_slice_header_bits;
+
+    if (h->nal_unit_type >= NAL_BLA_W_LP && h->nal_unit_type <= NAL_CRA_NUT) {
+        pp->slice_parsing_fields.bits.RapPicFlag = 1;
+    } else {
+        pp->slice_parsing_fields.bits.RapPicFlag = 0;
+    }
+
+    if (IS_IDR(h)) {
+        pp->slice_parsing_fields.bits.IdrPicFlag = 1;
+    } else {
+        pp->slice_parsing_fields.bits.IdrPicFlag = 0;
+    }
+
+    if (IS_IRAP(h)) {
+        pp->slice_parsing_fields.bits.IntraPicFlag = 1;
+    } else {
+        pp->slice_parsing_fields.bits.IntraPicFlag = 0;
+    }
+
+    if (h->sh.short_term_ref_pic_set_sps_flag == 0 && h->sh.short_term_rps) {
+        pp->st_rps_bits = h->sh.short_term_ref_pic_set_size;
+    } else {
+        pp->st_rps_bits = 0;
+    }
+
+    /* TODO */
+    pp->pic_fields.bits.NoPicReorderingFlag = 0;
+    pp->pic_fields.bits.NoBiPredFlag = 0;
+}
+
+
+/** Initialize and start decoding a frame with VA API. */
+static int vaapi_hevc_start_frame(AVCodecContext          *avctx,
+                                  av_unused const uint8_t *buffer,
+                                  av_unused uint32_t       size)
+{
+    HEVCContext * const h = avctx->priv_data;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
+    vaapi_hevc_frame_data *frame_data = h->ref->hwaccel_picture_private;
+    VAPictureParameterBufferHEVC *pic_param;
+    VAIQMatrixBufferHEVC *iq_matrix;
+    ScalingList const * scaling_list;
+    int i, j, pos;
+
+    ff_dlog(avctx, "vaapi_hevc_start_frame()\n");
+
+    vactx->slice_param_size = sizeof(VASliceParameterBufferHEVC);
+
+    /* Fill in VAPictureParameterBufferHEVC. */
+    pic_param = ff_vaapi_alloc_pic_param(vactx, sizeof(VAPictureParameterBufferHEVC));
+    if (!pic_param)
+        return -1;
+    fill_picture_parameters(h, pic_param);
+    frame_data->pic_param = pic_param;
+
+    /* Fill in VAIQMatrixBufferHEVC. */
+    if (h->ps.pps->scaling_list_data_present_flag) {
+        scaling_list = &h->ps.pps->scaling_list;
+    } else if (h->ps.sps->scaling_list_enable_flag) {
+        scaling_list = &h->ps.sps->scaling_list;
+    } else {
+        return 0;
+    }
+
+    iq_matrix = ff_vaapi_alloc_iq_matrix(vactx, sizeof(VAIQMatrixBufferHEVC));
+    if (!iq_matrix)
+        return -1;
+
+    for (i = 0; i < 6; ++i) {
+        for (j = 0; j < 16; ++j) {
+            pos = 4 * ff_hevc_diag_scan4x4_y[j] + ff_hevc_diag_scan4x4_x[j];
+            iq_matrix->ScalingList4x4[i][j] = scaling_list->sl[0][i][pos];
+        }
+        for (j = 0; j < 64; ++j) {
+            pos = 8 * ff_hevc_diag_scan8x8_y[j] + ff_hevc_diag_scan8x8_x[j];
+            iq_matrix->ScalingList8x8[i][j] = scaling_list->sl[1][i][pos];
+            iq_matrix->ScalingList16x16[i][j] = scaling_list->sl[2][i][pos];
+            if (i < 2) {
+                iq_matrix->ScalingList32x32[i][j] = scaling_list->sl[3][i * 3][pos];
+            }
+        }
+        iq_matrix->ScalingListDC16x16[i] = scaling_list->sl_dc[0][i];
+        if (i < 2) {
+            iq_matrix->ScalingListDC32x32[i] = scaling_list->sl_dc[1][i * 3];
+        }
+    }
+
+    return 0;
+}
+
+/** End a hardware decoding based frame. */
+static int vaapi_hevc_end_frame(AVCodecContext *avctx)
+{
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
+    HEVCContext * const h = avctx->priv_data;
+    vaapi_hevc_frame_data *frame_data = h->ref->hwaccel_picture_private;
+    int ret;
+
+    ff_dlog(avctx, "vaapi_hevc_end_frame()\n");
+
+    frame_data->last_slice_param->LongSliceFlags.fields.LastSliceOfPic = 1;
+
+    ret = ff_vaapi_commit_slices(vactx);
+    if (ret < 0)
+        goto finish;
+
+    ret = ff_vaapi_render_picture(vactx, ff_vaapi_get_surface_id(h->ref->frame));
+    if (ret < 0)
+        goto finish;
+
+finish:
+    ff_vaapi_common_end_frame(avctx);
+    return ret;
+}
+
+static int fill_pred_weight_table(HEVCContext * const h,
+                                  VASliceParameterBufferHEVC *slice_param,
+                                  SliceHeader * const sh)
+{
+    int i;
+
+    memset(slice_param->delta_luma_weight_l0, 0, sizeof(slice_param->delta_luma_weight_l0));
+    memset(slice_param->delta_luma_weight_l1, 0, sizeof(slice_param->delta_luma_weight_l1));
+    memset(slice_param->luma_offset_l0, 0, sizeof(slice_param->luma_offset_l0));
+    memset(slice_param->luma_offset_l1, 0, sizeof(slice_param->luma_offset_l1));
+    memset(slice_param->delta_chroma_weight_l0, 0, sizeof(slice_param->delta_chroma_weight_l0));
+    memset(slice_param->delta_chroma_weight_l1, 0, sizeof(slice_param->delta_chroma_weight_l1));
+    memset(slice_param->ChromaOffsetL0, 0, sizeof(slice_param->ChromaOffsetL0));
+    memset(slice_param->ChromaOffsetL1, 0, sizeof(slice_param->ChromaOffsetL1));
+
+    slice_param->delta_chroma_log2_weight_denom = 0;
+    slice_param->luma_log2_weight_denom = 0;
+
+    if (  sh->slice_type == I_SLICE
+      || (sh->slice_type == P_SLICE && !h->ps.pps->weighted_pred_flag)
+      || (sh->slice_type == B_SLICE && !h->ps.pps->weighted_bipred_flag)) {
+        return 0;
+    }
+
+    slice_param->luma_log2_weight_denom = sh->luma_log2_weight_denom;
+
+    if (h->ps.sps->chroma_format_idc) {
+        slice_param->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom;
+    }
+
+    for (i = 0; i < 15 && i < sh->nb_refs[L0]; ++i) {
+        slice_param->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - (1 << sh->luma_log2_weight_denom);
+        slice_param->luma_offset_l0[i] = sh->luma_offset_l0[i];
+        slice_param->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - (1 << sh->chroma_log2_weight_denom);
+        slice_param->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - (1 << sh->chroma_log2_weight_denom);
+        slice_param->ChromaOffsetL0[i][0] = sh->chroma_offset_l0[i][0];
+        slice_param->ChromaOffsetL0[i][1] = sh->chroma_offset_l0[i][1];
+    }
+
+    if (sh->slice_type == B_SLICE) {
+        for (i = 0; i < 15 && i < sh->nb_refs[L1]; ++i) {
+            slice_param->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - (1 << sh->luma_log2_weight_denom);
+            slice_param->luma_offset_l1[i] = sh->luma_offset_l1[i];
+            slice_param->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - (1 << sh->chroma_log2_weight_denom);
+            slice_param->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - (1 << sh->chroma_log2_weight_denom);
+            slice_param->ChromaOffsetL1[i][0] = sh->chroma_offset_l1[i][0];
+            slice_param->ChromaOffsetL1[i][1] = sh->chroma_offset_l1[i][1];
+        }
+    }
+
+    return 0;
+}
+
+/** Decode the given hevc slice with VA API. */
+static int vaapi_hevc_decode_slice(AVCodecContext *avctx,
+                                   const uint8_t  *buffer,
+                                   uint32_t        size)
+{
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
+    HEVCContext * const h = avctx->priv_data;
+    vaapi_hevc_frame_data *frame_data = h->ref->hwaccel_picture_private;
+    SliceHeader * const sh = &h->sh;
+    VASliceParameterBufferHEVC *slice_param;
+    int i, list_idx;
+    uint8_t nb_list = sh->slice_type == B_SLICE ? 2 : 1;
+
+    if (sh->slice_type == I_SLICE)
+        nb_list = 0;
+
+    ff_dlog(avctx, "vaapi_hevc_decode_slice(): buffer %p, size %d\n", buffer, size);
+
+    /* Fill in VASliceParameterBufferH264. */
+    slice_param = (VASliceParameterBufferHEVC *)ff_vaapi_alloc_slice(vactx, buffer, size);
+    if (!slice_param)
+        return -1;
+
+    frame_data->last_slice_param = slice_param;
+
+    /* The base structure changed, so this has to be re-set in order to be valid on every byte order. */
+    slice_param->slice_data_flag = VA_SLICE_DATA_FLAG_ALL;
+
+    /* Add 1 to the bits count here to account for the byte_alignment bit, which allways is at least one bit and not accounted for otherwise. */
+    slice_param->slice_data_byte_offset = (get_bits_count(&h->HEVClc->gb) + 1 + 7) / 8;
+
+    slice_param->slice_segment_address = sh->slice_segment_addr;
+
+    slice_param->LongSliceFlags.value = 0;
+    slice_param->LongSliceFlags.fields.dependent_slice_segment_flag = sh->dependent_slice_segment_flag;
+    slice_param->LongSliceFlags.fields.slice_type = sh->slice_type;
+    slice_param->LongSliceFlags.fields.color_plane_id = sh->colour_plane_id;
+    slice_param->LongSliceFlags.fields.mvd_l1_zero_flag = sh->mvd_l1_zero_flag;
+    slice_param->LongSliceFlags.fields.cabac_init_flag = sh->cabac_init_flag;
+    slice_param->LongSliceFlags.fields.slice_temporal_mvp_enabled_flag = sh->slice_temporal_mvp_enabled_flag;
+    slice_param->LongSliceFlags.fields.slice_deblocking_filter_disabled_flag = sh->disable_deblocking_filter_flag;
+    slice_param->LongSliceFlags.fields.collocated_from_l0_flag = sh->collocated_list == L0 ? 1 : 0;
+    slice_param->LongSliceFlags.fields.slice_loop_filter_across_slices_enabled_flag = sh->slice_loop_filter_across_slices_enabled_flag;
+
+    slice_param->LongSliceFlags.fields.slice_sao_luma_flag = sh->slice_sample_adaptive_offset_flag[0];
+    if (h->ps.sps->chroma_format_idc) {
+        slice_param->LongSliceFlags.fields.slice_sao_chroma_flag = sh->slice_sample_adaptive_offset_flag[1];
+    }
+
+    if (sh->slice_temporal_mvp_enabled_flag) {
+        slice_param->collocated_ref_idx = sh->collocated_ref_idx;
+    } else {
+        slice_param->collocated_ref_idx = 0xFF;
+    }
+
+    slice_param->slice_qp_delta = sh->slice_qp_delta;
+    slice_param->slice_cb_qp_offset = sh->slice_cb_qp_offset;
+    slice_param->slice_cr_qp_offset = sh->slice_cr_qp_offset;
+    slice_param->slice_beta_offset_div2 = sh->beta_offset / 2;
+    slice_param->slice_tc_offset_div2 = sh->tc_offset / 2;
+
+    if (sh->slice_type == I_SLICE) {
+        slice_param->five_minus_max_num_merge_cand = 0;
+    } else {
+        slice_param->five_minus_max_num_merge_cand = 5 - sh->max_num_merge_cand;
+    }
+
+    slice_param->num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0;
+    slice_param->num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0;
+
+    memset(slice_param->RefPicList, 0xFF, sizeof(slice_param->RefPicList));
+
+    /* h->ref->refPicList is updated befor calling each slice */
+    for (list_idx = 0; list_idx < nb_list; ++list_idx) {
+        RefPicList *rpl = &h->ref->refPicList[list_idx];
+
+        for (i = 0; i < rpl->nb_refs; ++i) {
+            slice_param->RefPicList[list_idx][i] = get_ref_pic_index(h, rpl->ref[i]);
+        }
+    }
+
+    return fill_pred_weight_table(h, slice_param, sh);
+}
+
+AVHWAccel ff_hevc_vaapi_hwaccel = {
+    .name                 = "hevc_vaapi",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_HEVC,
+    .pix_fmt              = AV_PIX_FMT_VAAPI,
+    .start_frame          = vaapi_hevc_start_frame,
+    .end_frame            = vaapi_hevc_end_frame,
+    .decode_slice         = vaapi_hevc_decode_slice,
+    .init                 = ff_vaapi_context_init,
+    .uninit               = ff_vaapi_context_fini,
+    .priv_data_size       = sizeof(FFVAContext),
+    .frame_priv_data_size = sizeof(vaapi_hevc_frame_data),
+};
diff --git a/libavcodec/vaapi_internal.h b/libavcodec/vaapi_internal.h
index 5e2a6ca631..306ae13b37 100644
--- a/libavcodec/vaapi_internal.h
+++ b/libavcodec/vaapi_internal.h
@@ -4,20 +4,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include <va/va.h>
 #include "vaapi.h"
 #include "avcodec.h"
+#include "internal.h"
 
 /**
  * @addtogroup VAAPI_Decoding
@@ -34,23 +35,53 @@
  * @{
  */
 
+typedef struct {
+    VADisplay display;                  ///< Windowing system dependent handle
+    VAConfigID config_id;               ///< Configuration ID
+    VAContextID context_id;             ///< Context ID (video decode pipeline)
+    VABufferID pic_param_buf_id;        ///< Picture parameter buffer
+    VABufferID iq_matrix_buf_id;        ///< Inverse quantiser matrix buffer
+    VABufferID bitplane_buf_id;         ///< Bitplane buffer (for VC-1 decoding)
+    VABufferID *slice_buf_ids;          ///< Slice parameter/data buffers
+    unsigned int n_slice_buf_ids;       ///< Number of effective slice buffers
+    unsigned int slice_buf_ids_alloc;   ///< Number of allocated slice buffers
+    void *slice_params;                 ///< Pointer to slice parameter buffers
+    unsigned int slice_param_size;      ///< Size of a slice parameter element
+    unsigned int slice_params_alloc;    ///< Number of allocated slice parameters
+    unsigned int slice_count;           ///< Number of slices currently filled in
+    const uint8_t *slice_data;          ///< Pointer to slice data buffer base
+    unsigned int slice_data_size;       ///< Current size of slice data
+} FFVAContext;
+
+/** Extract vaapi_context from an AVCodecContext */
+static inline FFVAContext *ff_vaapi_get_context(AVCodecContext *avctx)
+{
+    return avctx->internal->hwaccel_priv_data;
+}
+
 /** Extract VASurfaceID from an AVFrame */
 static inline VASurfaceID ff_vaapi_get_surface_id(AVFrame *pic)
 {
     return (uintptr_t)pic->data[3];
 }
 
+/** Common AVHWAccel.init() implementation */
+int ff_vaapi_context_init(AVCodecContext *avctx);
+
+/** Common AVHWAccel.uninit() implementation */
+int ff_vaapi_context_fini(AVCodecContext *avctx);
+
 /** Common AVHWAccel.end_frame() implementation */
 void ff_vaapi_common_end_frame(AVCodecContext *avctx);
 
 /** Allocate a new picture parameter buffer */
-void *ff_vaapi_alloc_pic_param(struct vaapi_context *vactx, unsigned int size);
+void *ff_vaapi_alloc_pic_param(FFVAContext *vactx, unsigned int size);
 
 /** Allocate a new IQ matrix buffer */
-void *ff_vaapi_alloc_iq_matrix(struct vaapi_context *vactx, unsigned int size);
+void *ff_vaapi_alloc_iq_matrix(FFVAContext *vactx, unsigned int size);
 
 /** Allocate a new bit-plane buffer */
-uint8_t *ff_vaapi_alloc_bitplane(struct vaapi_context *vactx, uint32_t size);
+uint8_t *ff_vaapi_alloc_bitplane(FFVAContext *vactx, uint32_t size);
 
 /**
  * Allocate a new slice descriptor for the input slice.
@@ -60,11 +91,11 @@ uint8_t *ff_vaapi_alloc_bitplane(struct vaapi_context *vactx, uint32_t size);
  * @param size the size of the slice in bytes
  * @return the newly allocated slice parameter
  */
-VASliceParameterBufferBase *ff_vaapi_alloc_slice(struct vaapi_context *vactx, const uint8_t *buffer, uint32_t size);
+VASliceParameterBufferBase *ff_vaapi_alloc_slice(FFVAContext *vactx, const uint8_t *buffer, uint32_t size);
 
 int ff_vaapi_mpeg_end_frame(AVCodecContext *avctx);
-int ff_vaapi_commit_slices(struct vaapi_context *vactx);
-int ff_vaapi_render_picture(struct vaapi_context *vactx, VASurfaceID surface);
+int ff_vaapi_commit_slices(FFVAContext *vactx);
+int ff_vaapi_render_picture(FFVAContext *vactx, VASurfaceID surface);
 
 /* @} */
 
diff --git a/libavcodec/vaapi_mpeg2.c b/libavcodec/vaapi_mpeg2.c
index 7d0e205932..518fec0493 100644
--- a/libavcodec/vaapi_mpeg2.c
+++ b/libavcodec/vaapi_mpeg2.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,7 +41,7 @@ static inline int mpeg2_get_is_frame_start(MpegEncContext *s)
 static int vaapi_mpeg2_start_frame(AVCodecContext *avctx, av_unused const uint8_t *buffer, av_unused uint32_t size)
 {
     struct MpegEncContext * const s = avctx->priv_data;
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VAPictureParameterBufferMPEG2 *pic_param;
     VAIQMatrixBufferMPEG2 *iq_matrix;
     int i;
@@ -104,6 +104,7 @@ static int vaapi_mpeg2_start_frame(AVCodecContext *avctx, av_unused const uint8_
 static int vaapi_mpeg2_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
 {
     MpegEncContext * const s = avctx->priv_data;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VASliceParameterBufferMPEG2 *slice_param;
     GetBitContext gb;
     uint32_t quantiser_scale_code, intra_slice_flag, macroblock_offset;
@@ -118,13 +119,13 @@ static int vaapi_mpeg2_decode_slice(AVCodecContext *avctx, const uint8_t *buffer
     intra_slice_flag = get_bits1(&gb);
     if (intra_slice_flag) {
         skip_bits(&gb, 8);
-        while (get_bits1(&gb) != 0)
-            skip_bits(&gb, 8);
+        if (skip_1stop_8data_bits(&gb) < 0)
+            return AVERROR_INVALIDDATA;
     }
     macroblock_offset = get_bits_count(&gb);
 
     /* Fill in VASliceParameterBufferMPEG2 */
-    slice_param = (VASliceParameterBufferMPEG2 *)ff_vaapi_alloc_slice(avctx->hwaccel_context, buffer, size);
+    slice_param = (VASliceParameterBufferMPEG2 *)ff_vaapi_alloc_slice(vactx, buffer, size);
     if (!slice_param)
         return -1;
     slice_param->macroblock_offset              = macroblock_offset;
@@ -139,8 +140,11 @@ AVHWAccel ff_mpeg2_vaapi_hwaccel = {
     .name           = "mpeg2_vaapi",
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_MPEG2VIDEO,
-    .pix_fmt        = AV_PIX_FMT_VAAPI_VLD,
+    .pix_fmt        = AV_PIX_FMT_VAAPI,
     .start_frame    = vaapi_mpeg2_start_frame,
     .end_frame      = ff_vaapi_mpeg_end_frame,
     .decode_slice   = vaapi_mpeg2_decode_slice,
+    .init           = ff_vaapi_context_init,
+    .uninit         = ff_vaapi_context_fini,
+    .priv_data_size = sizeof(FFVAContext),
 };
diff --git a/libavcodec/vaapi_mpeg4.c b/libavcodec/vaapi_mpeg4.c
index 1b9053c79b..b5b946dba1 100644
--- a/libavcodec/vaapi_mpeg4.c
+++ b/libavcodec/vaapi_mpeg4.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -46,7 +46,7 @@ static int vaapi_mpeg4_start_frame(AVCodecContext *avctx, av_unused const uint8_
 {
     Mpeg4DecContext *ctx = avctx->priv_data;
     MpegEncContext * const s = &ctx->m;
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VAPictureParameterBufferMPEG4 *pic_param;
     VAIQMatrixBufferMPEG4 *iq_matrix;
     int i;
@@ -122,29 +122,19 @@ static int vaapi_mpeg4_start_frame(AVCodecContext *avctx, av_unused const uint8_
 static int vaapi_mpeg4_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
 {
     MpegEncContext * const s = avctx->priv_data;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VASliceParameterBufferMPEG4 *slice_param;
 
     ff_dlog(avctx, "vaapi_mpeg4_decode_slice(): buffer %p, size %d\n", buffer, size);
 
-    /* video_plane_with_short_video_header() contains all GOBs
-     * in-order, and this is what VA API (Intel backend) expects: only
-     * a single slice param. So fake macroblock_number for Libav so
-     * that we don't call vaapi_mpeg4_decode_slice() again
-     */
-    if (avctx->codec->id == AV_CODEC_ID_H263)
-        size = s->gb.buffer_end - buffer;
-
     /* Fill in VASliceParameterBufferMPEG4 */
-    slice_param = (VASliceParameterBufferMPEG4 *)ff_vaapi_alloc_slice(avctx->hwaccel_context, buffer, size);
+    slice_param = (VASliceParameterBufferMPEG4 *)ff_vaapi_alloc_slice(vactx, buffer, size);
     if (!slice_param)
         return -1;
     slice_param->macroblock_offset      = get_bits_count(&s->gb) % 8;
-    slice_param->macroblock_number      = s->mb_y * s->mb_width + s->mb_x;
+    slice_param->macroblock_number      = 0;
     slice_param->quant_scale            = s->qscale;
 
-    if (avctx->codec->id == AV_CODEC_ID_H263)
-        s->mb_y = s->mb_height;
-
     return 0;
 }
 
@@ -153,10 +143,13 @@ AVHWAccel ff_mpeg4_vaapi_hwaccel = {
     .name           = "mpeg4_vaapi",
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_MPEG4,
-    .pix_fmt        = AV_PIX_FMT_VAAPI_VLD,
+    .pix_fmt        = AV_PIX_FMT_VAAPI,
     .start_frame    = vaapi_mpeg4_start_frame,
     .end_frame      = ff_vaapi_mpeg_end_frame,
     .decode_slice   = vaapi_mpeg4_decode_slice,
+    .init           = ff_vaapi_context_init,
+    .uninit         = ff_vaapi_context_fini,
+    .priv_data_size = sizeof(FFVAContext),
 };
 #endif
 
@@ -165,9 +158,12 @@ AVHWAccel ff_h263_vaapi_hwaccel = {
     .name           = "h263_vaapi",
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_H263,
-    .pix_fmt        = AV_PIX_FMT_VAAPI_VLD,
+    .pix_fmt        = AV_PIX_FMT_VAAPI,
     .start_frame    = vaapi_mpeg4_start_frame,
     .end_frame      = ff_vaapi_mpeg_end_frame,
     .decode_slice   = vaapi_mpeg4_decode_slice,
+    .init           = ff_vaapi_context_init,
+    .uninit         = ff_vaapi_context_fini,
+    .priv_data_size = sizeof(FFVAContext),
 };
 #endif
diff --git a/libavcodec/vaapi_vc1.c b/libavcodec/vaapi_vc1.c
index fe01c52080..5ded5dba89 100644
--- a/libavcodec/vaapi_vc1.c
+++ b/libavcodec/vaapi_vc1.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,7 @@
 #include "vc1.h"
 #include "vc1data.h"
 
-/** Translate Libav MV modes to VA API */
+/** Translate FFmpeg MV modes to VA API */
 static int get_VAMvModeVC1(enum MVModes mv_mode)
 {
     switch (mv_mode) {
@@ -129,7 +129,7 @@ static inline int vc1_get_TTFRM(VC1Context *v)
     return 0;
 }
 
-/** Pack Libav bitplanes into a VABitPlaneBuffer element */
+/** Pack FFmpeg bitplanes into a VABitPlaneBuffer element */
 static inline void vc1_pack_bitplanes(uint8_t *bitplane, int n, const uint8_t *ff_bp[3], int x, int y, int stride)
 {
     const int bitplane_index = n / 2;
@@ -148,7 +148,7 @@ static int vaapi_vc1_start_frame(AVCodecContext *avctx, av_unused const uint8_t
 {
     VC1Context * const v = avctx->priv_data;
     MpegEncContext * const s = &v->s;
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VAPictureParameterBufferVC1 *pic_param;
 
     ff_dlog(avctx, "vaapi_vc1_start_frame()\n");
@@ -315,6 +315,7 @@ static int vaapi_vc1_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
 {
     VC1Context * const v = avctx->priv_data;
     MpegEncContext * const s = &v->s;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VASliceParameterBufferVC1 *slice_param;
 
     ff_dlog(avctx, "vaapi_vc1_decode_slice(): buffer %p, size %d\n", buffer, size);
@@ -326,7 +327,7 @@ static int vaapi_vc1_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
     }
 
     /* Fill in VASliceParameterBufferVC1 */
-    slice_param = (VASliceParameterBufferVC1 *)ff_vaapi_alloc_slice(avctx->hwaccel_context, buffer, size);
+    slice_param = (VASliceParameterBufferVC1 *)ff_vaapi_alloc_slice(vactx, buffer, size);
     if (!slice_param)
         return -1;
     slice_param->macroblock_offset       = get_bits_count(&s->gb);
@@ -339,10 +340,13 @@ AVHWAccel ff_wmv3_vaapi_hwaccel = {
     .name           = "wmv3_vaapi",
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_WMV3,
-    .pix_fmt        = AV_PIX_FMT_VAAPI_VLD,
+    .pix_fmt        = AV_PIX_FMT_VAAPI,
     .start_frame    = vaapi_vc1_start_frame,
     .end_frame      = ff_vaapi_mpeg_end_frame,
     .decode_slice   = vaapi_vc1_decode_slice,
+    .init           = ff_vaapi_context_init,
+    .uninit         = ff_vaapi_context_fini,
+    .priv_data_size = sizeof(FFVAContext),
 };
 #endif
 
@@ -350,8 +354,11 @@ AVHWAccel ff_vc1_vaapi_hwaccel = {
     .name           = "vc1_vaapi",
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_VC1,
-    .pix_fmt        = AV_PIX_FMT_VAAPI_VLD,
+    .pix_fmt        = AV_PIX_FMT_VAAPI,
     .start_frame    = vaapi_vc1_start_frame,
     .end_frame      = ff_vaapi_mpeg_end_frame,
     .decode_slice   = vaapi_vc1_decode_slice,
+    .init           = ff_vaapi_context_init,
+    .uninit         = ff_vaapi_context_fini,
+    .priv_data_size = sizeof(FFVAContext),
 };
diff --git a/libavcodec/vb.c b/libavcodec/vb.c
index 43954c16ee..560165adc7 100644
--- a/libavcodec/vb.c
+++ b/libavcodec/vb.c
@@ -2,20 +2,20 @@
  * Beam Software VB decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -73,7 +73,7 @@ static void vb_decode_palette(VBDecContext *c, int data_size)
         return;
     }
     for (i = start; i <= start + size; i++)
-        c->pal[i] = bytestream2_get_be24(&c->stream);
+        c->pal[i] = 0xFFU << 24 | bytestream2_get_be24(&c->stream);
 }
 
 static inline int check_pixel(uint8_t *buf, uint8_t *start, uint8_t *end)
@@ -197,10 +197,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     bytestream2_init(&c->stream, avpkt->data, avpkt->size);
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     flags = bytestream2_get_le16(&c->stream);
 
@@ -211,6 +209,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     }
     if (flags & VB_HAS_VIDEO) {
         size = bytestream2_get_le32(&c->stream);
+        if(size > bytestream2_get_bytes_left(&c->stream)+4 || size<4){
+            av_log(avctx, AV_LOG_ERROR, "Frame size invalid\n");
+            return -1;
+        }
         vb_decode_framedata(c, offset);
         bytestream2_skip(&c->stream, size - 4);
     }
@@ -249,6 +251,12 @@ static av_cold int decode_init(AVCodecContext *avctx)
     c->frame      = av_mallocz(avctx->width * avctx->height);
     c->prev_frame = av_mallocz(avctx->width * avctx->height);
 
+    if (!c->frame || !c->prev_frame) {
+        av_freep(&c->frame);
+        av_freep(&c->prev_frame);
+        return AVERROR(ENOMEM);
+    }
+
     return 0;
 }
 
diff --git a/libavcodec/vble.c b/libavcodec/vble.c
index 996a984a58..30b77cecf6 100644
--- a/libavcodec/vble.c
+++ b/libavcodec/vble.c
@@ -2,20 +2,20 @@
  * VBLE Decoder
  * Copyright (c) 2011 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,56 +37,52 @@ typedef struct VBLEContext {
     HuffYUVDSPContext hdsp;
 
     int            size;
-    uint8_t        *val; /* First holds the lengths of vlc symbols and then their values */
+    uint8_t        *val; ///< This array first holds the lengths of vlc symbols and then their value.
 } VBLEContext;
 
-static uint8_t vble_read_reverse_unary(GetBitContext *gb)
-{
-    /* At most we need to read 9 bits total to get indices up to 8 */
-    uint8_t val = show_bits(gb, 8);
-
-    if (val) {
-        val = 7 - av_log2_16bit(ff_reverse[val]);
-        skip_bits(gb, val + 1);
-        return val;
-    } else {
-        skip_bits(gb, 8);
-        if (get_bits1(gb))
-            return 8;
-    }
-
-    /* Return something larger than 8 on error */
-    return UINT8_MAX;
-}
-
 static int vble_unpack(VBLEContext *ctx, GetBitContext *gb)
 {
     int i;
+    int allbits = 0;
+    static const uint8_t LUT[256] = {
+        8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+    };
 
     /* Read all the lengths in first */
     for (i = 0; i < ctx->size; i++) {
-        ctx->val[i] = vble_read_reverse_unary(gb);
-
-        if (ctx->val[i] == UINT8_MAX)
-            return -1;
-    }
-
-    for (i = 0; i < ctx->size; i++) {
-        /* Check we have enough bits left */
-        if (get_bits_left(gb) < ctx->val[i])
-            return -1;
-
-        /* get_bits can't take a length of 0 */
-        if (ctx->val[i])
-            ctx->val[i] = (1 << ctx->val[i]) + get_bits(gb, ctx->val[i]) - 1;
+        /* At most we need to read 9 bits total to get indices up to 8 */
+        int val = show_bits(gb, 8);
+
+        // read reverse unary
+        if (val) {
+            val = LUT[val];
+            skip_bits(gb, val + 1);
+            ctx->val[i] = val;
+        } else {
+            skip_bits(gb, 8);
+            if (!get_bits1(gb))
+                return -1;
+            ctx->val[i] = 8;
+        }
+        allbits += ctx->val[i];
     }
 
+    /* Check we have enough bits left */
+    if (get_bits_left(gb) < allbits)
+        return -1;
     return 0;
 }
 
 static void vble_restore_plane(VBLEContext *ctx, AVFrame *pic,
-                               int plane, int offset,
-                               int width, int height)
+                               GetBitContext *gb, int plane,
+                               int offset, int width, int height)
 {
     uint8_t *dst = pic->data[plane];
     uint8_t *val = ctx->val + offset;
@@ -94,9 +90,13 @@ static void vble_restore_plane(VBLEContext *ctx, AVFrame *pic,
     int i, j, left, left_top;
 
     for (i = 0; i < height; i++) {
-        for (j = 0; j < width; j++)
-            val[j] = (val[j] >> 1) ^ -(val[j] & 1);
-
+        for (j = 0; j < width; j++) {
+            /* get_bits can't take a length of 0 */
+            if (val[j]) {
+                int v = (1 << val[j]) + get_bits(gb, val[j]) - 1;
+                val[j] = (v >> 1) ^ -(v & 1);
+            }
+        }
         if (i) {
             left = 0;
             left_top = dst[-stride];
@@ -122,13 +122,17 @@ static int vble_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int version;
     int offset = 0;
     int width_uv = avctx->width / 2, height_uv = avctx->height / 2;
+    int ret;
 
-    /* Allocate buffer */
-    if (ff_get_buffer(avctx, pic, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
-        return AVERROR(ENOMEM);
+    if (avpkt->size < 4 || avpkt->size - 4 > INT_MAX/8) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid packet size\n");
+        return AVERROR_INVALIDDATA;
     }
 
+    /* Allocate buffer */
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
     /* Set flags */
     pic->key_frame = 1;
     pic->pict_type = AV_PICTURE_TYPE_I;
@@ -148,15 +152,15 @@ static int vble_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     }
 
     /* Restore planes. Should be almost identical to Huffyuv's. */
-    vble_restore_plane(ctx, pic, 0, offset, avctx->width, avctx->height);
+    vble_restore_plane(ctx, pic, &gb, 0, offset, avctx->width, avctx->height);
 
     /* Chroma */
     if (!(ctx->avctx->flags & AV_CODEC_FLAG_GRAY)) {
         offset += avctx->width * avctx->height;
-        vble_restore_plane(ctx, pic, 1, offset, width_uv, height_uv);
+        vble_restore_plane(ctx, pic, &gb, 1, offset, width_uv, height_uv);
 
         offset += width_uv * height_uv;
-        vble_restore_plane(ctx, pic, 2, offset, width_uv, height_uv);
+        vble_restore_plane(ctx, pic, &gb, 2, offset, width_uv, height_uv);
     }
 
     *got_frame       = 1;
@@ -186,7 +190,7 @@ static av_cold int vble_decode_init(AVCodecContext *avctx)
     ctx->size = avpicture_get_size(avctx->pix_fmt,
                                    avctx->width, avctx->height);
 
-    ctx->val = av_malloc(ctx->size * sizeof(*ctx->val));
+    ctx->val = av_malloc_array(ctx->size, sizeof(*ctx->val));
 
     if (!ctx->val) {
         av_log(avctx, AV_LOG_ERROR, "Could not allocate values buffer.\n");
diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c
index 0631439a1c..08e1f2c4c7 100644
--- a/libavcodec/vc1.c
+++ b/libavcodec/vc1.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,9 +37,6 @@
 #include "unary.h"
 #include "simple_idct.h"
 
-#undef NDEBUG
-#include <assert.h>
-
 /***********************************************************************/
 /**
  * @name VC-1 Bitplane decoding
@@ -47,21 +44,6 @@
  * @{
  */
 
-/**
- * Imode types
- * @{
- */
-enum Imode {
-    IMODE_RAW,
-    IMODE_NORM2,
-    IMODE_DIFF2,
-    IMODE_NORM6,
-    IMODE_DIFF6,
-    IMODE_ROWSKIP,
-    IMODE_COLSKIP
-};
-/** @} */ //imode defines
-
 /** Decode rows by checking if they are skipped
  * @param plane Buffer to store decoded bits
  * @param[in] width Width of this buffer
@@ -137,12 +119,16 @@ static int bitplane_decoding(uint8_t* data, int *raw_flag, VC1Context *v)
     case IMODE_NORM2:
         if ((height * width) & 1) {
             *planep++ = get_bits1(gb);
-            offset    = 1;
+            y = offset = 1;
+            if (offset == width) {
+                offset = 0;
+                planep += stride - width;
+            }
         }
         else
-            offset = 0;
+            y = offset = 0;
         // decode bitplane as one long line
-        for (y = offset; y < height * width; y += 2) {
+        for (; y < height * width; y += 2) {
             code = get_vlc2(gb, ff_vc1_norm2_vlc.table, VC1_NORM2_VLC_BITS, 1);
             *planep++ = code & 1;
             offset++;
@@ -248,37 +234,34 @@ static int vop_dquant_decoding(VC1Context *v)
     int pqdiff;
 
     //variable size
-    if (v->dquant == 2) {
-        pqdiff = get_bits(gb, 3);
-        if (pqdiff == 7)
-            v->altpq = get_bits(gb, 5);
-        else
-            v->altpq = v->pq + pqdiff + 1;
-    } else {
+    if (v->dquant != 2) {
         v->dquantfrm = get_bits1(gb);
-        if (v->dquantfrm) {
-            v->dqprofile = get_bits(gb, 2);
-            switch (v->dqprofile) {
-            case DQPROFILE_SINGLE_EDGE:
-            case DQPROFILE_DOUBLE_EDGES:
-                v->dqsbedge = get_bits(gb, 2);
-                break;
-            case DQPROFILE_ALL_MBS:
-                v->dqbilevel = get_bits1(gb);
-                if (!v->dqbilevel)
-                    v->halfpq = 0;
-            default:
-                break; //Forbidden ?
-            }
-            if (v->dqbilevel || v->dqprofile != DQPROFILE_ALL_MBS) {
-                pqdiff = get_bits(gb, 3);
-                if (pqdiff == 7)
-                    v->altpq = get_bits(gb, 5);
-                else
-                    v->altpq = v->pq + pqdiff + 1;
+        if (!v->dquantfrm)
+            return 0;
+
+        v->dqprofile = get_bits(gb, 2);
+        switch (v->dqprofile) {
+        case DQPROFILE_SINGLE_EDGE:
+        case DQPROFILE_DOUBLE_EDGES:
+            v->dqsbedge = get_bits(gb, 2);
+            break;
+        case DQPROFILE_ALL_MBS:
+            v->dqbilevel = get_bits1(gb);
+            if (!v->dqbilevel) {
+                v->halfpq = 0;
+                return 0;
             }
+        default:
+            break; //Forbidden ?
         }
     }
+
+    pqdiff = get_bits(gb, 3);
+    if (pqdiff == 7)
+        v->altpq = get_bits(gb, 5);
+    else
+        v->altpq = v->pq + pqdiff + 1;
+
     return 0;
 }
 
@@ -293,7 +276,7 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb);
  */
 int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitContext *gb)
 {
-    av_log(avctx, AV_LOG_DEBUG, "Header: %0X\n", show_bits(gb, 32));
+    av_log(avctx, AV_LOG_DEBUG, "Header: %0X\n", show_bits_long(gb, 32));
     v->profile = get_bits(gb, 2);
     if (v->profile == PROFILE_COMPLEX) {
         av_log(avctx, AV_LOG_WARNING, "WMV3 Complex Profile is not fully supported\n");
@@ -304,6 +287,7 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
         v->zz_4x8 = ff_vc1_adv_progressive_4x8_zz;
         return decode_sequence_header_adv(v, gb);
     } else {
+        v->chromaformat = 1;
         v->zz_8x4 = ff_wmv2_scantableA;
         v->zz_4x8 = ff_wmv2_scantableB;
         v->res_y411   = get_bits1(gb);
@@ -348,8 +332,7 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
         return -1;
     }
     v->extended_mv     = get_bits1(gb); //common
-    if (!v->profile && v->extended_mv)
-    {
+    if (!v->profile && v->extended_mv) {
         av_log(avctx, AV_LOG_ERROR,
                "Extended MVs unavailable in Simple Profile\n");
         return -1;
@@ -358,8 +341,7 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
     v->vstransform     = get_bits1(gb); //common
 
     v->res_transtab    = get_bits1(gb);
-    if (v->res_transtab)
-    {
+    if (v->res_transtab) {
         av_log(avctx, AV_LOG_ERROR,
                "1 for reserved RES_TRANSTAB is forbidden\n");
         return -1;
@@ -380,8 +362,13 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
     v->finterpflag = get_bits1(gb); //common
 
     if (v->res_sprite) {
-        v->s.avctx->width  = v->s.avctx->coded_width  = get_bits(gb, 11);
-        v->s.avctx->height = v->s.avctx->coded_height = get_bits(gb, 11);
+        int w = get_bits(gb, 11);
+        int h = get_bits(gb, 11);
+        int ret = ff_set_dimensions(v->s.avctx, w, h);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to set dimensions %d %d\n", w, h);
+            return ret;
+        }
         skip_bits(gb, 5); //frame rate
         v->res_x8 = get_bits1(gb);
         if (get_bits1(gb)) { // something to do with DC VLC selection
@@ -433,10 +420,8 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb)
     v->bitrtq_postproc       = get_bits(gb, 5); //common
     v->postprocflag          = get_bits1(gb);   //common
 
-    v->s.avctx->coded_width  = (get_bits(gb, 12) + 1) << 1;
-    v->s.avctx->coded_height = (get_bits(gb, 12) + 1) << 1;
-    v->s.avctx->width        = v->s.avctx->coded_width;
-    v->s.avctx->height       = v->s.avctx->coded_height;
+    v->max_coded_width       = (get_bits(gb, 12) + 1) << 1;
+    v->max_coded_height      = (get_bits(gb, 12) + 1) << 1;
     v->broadcast             = get_bits1(gb);
     v->interlace             = get_bits1(gb);
     v->tfcntrflag            = get_bits1(gb);
@@ -497,7 +482,6 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb)
                 }
             }
             if (v->broadcast) { // Pulldown may be present
-                v->s.avctx->framerate.num  *= 2;
                 v->s.avctx->ticks_per_frame = 2;
             }
         }
@@ -526,6 +510,8 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb)
 int ff_vc1_decode_entry_point(AVCodecContext *avctx, VC1Context *v, GetBitContext *gb)
 {
     int i;
+    int w,h;
+    int ret;
 
     av_log(avctx, AV_LOG_DEBUG, "Entry point: %08X\n", show_bits_long(gb, 32));
     v->broken_link    = get_bits1(gb);
@@ -533,6 +519,8 @@ int ff_vc1_decode_entry_point(AVCodecContext *avctx, VC1Context *v, GetBitContex
     v->panscanflag    = get_bits1(gb);
     v->refdist_flag   = get_bits1(gb);
     v->s.loop_filter  = get_bits1(gb);
+    if (v->s.avctx->skip_loop_filter >= AVDISCARD_ALL)
+        v->s.loop_filter = 0;
     v->fastuvmc       = get_bits1(gb);
     v->extended_mv    = get_bits1(gb);
     v->dquant         = get_bits(gb, 2);
@@ -546,10 +534,18 @@ int ff_vc1_decode_entry_point(AVCodecContext *avctx, VC1Context *v, GetBitContex
         }
     }
 
-    if (get_bits1(gb)) {
-        avctx->width  = avctx->coded_width  = (get_bits(gb, 12) + 1) << 1;
-        avctx->height = avctx->coded_height = (get_bits(gb, 12) + 1) << 1;
+    if(get_bits1(gb)){
+        w = (get_bits(gb, 12)+1)<<1;
+        h = (get_bits(gb, 12)+1)<<1;
+    } else {
+        w = v->max_coded_width;
+        h = v->max_coded_height;
+    }
+    if ((ret = ff_set_dimensions(avctx, w, h)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set dimensions %d %d\n", w, h);
+        return ret;
     }
+
     if (v->extended_mv)
         v->extended_dmv = get_bits1(gb);
     if ((v->range_mapy_flag = get_bits1(gb))) {
@@ -576,13 +572,13 @@ int ff_vc1_decode_entry_point(AVCodecContext *avctx, VC1Context *v, GetBitContex
         int scale, shift, i;                                                  \
         if (!lumscale) {                                                      \
             scale = -64;                                                      \
-            shift = (255 - lumshift * 2) << 6;                                \
+            shift = (255 - lumshift * 2) * 64;                                \
             if (lumshift > 31)                                                \
                 shift += 128 << 6;                                            \
         } else {                                                              \
             scale = lumscale + 32;                                            \
             if (lumshift > 31)                                                \
-                shift = (lumshift - 64) << 6;                                 \
+                shift = (lumshift - 64) * 64;                                 \
             else                                                              \
                 shift = lumshift << 6;                                        \
         }                                                                     \
@@ -601,32 +597,44 @@ static void rotate_luts(VC1Context *v)
             C = A;                                            \
         } else {                                              \
             DEF;                                              \
-            memcpy(&tmp, &L  , sizeof(tmp));                  \
-            memcpy(&L  , &N  , sizeof(tmp));                  \
-            memcpy(&N  , &tmp, sizeof(tmp));                  \
+            memcpy(&tmp, L   , sizeof(tmp));                  \
+            memcpy(L   , N   , sizeof(tmp));                  \
+            memcpy(N   , &tmp, sizeof(tmp));                  \
             C = N;                                            \
         }                                                     \
     } while(0)
 
-    ROTATE(int tmp,             v->last_use_ic, v->next_use_ic, v->curr_use_ic, v->aux_use_ic);
+    ROTATE(int tmp,             &v->last_use_ic, &v->next_use_ic, v->curr_use_ic, &v->aux_use_ic);
     ROTATE(uint8_t tmp[2][256], v->last_luty,   v->next_luty,   v->curr_luty,   v->aux_luty);
     ROTATE(uint8_t tmp[2][256], v->last_lutuv,  v->next_lutuv,  v->curr_lutuv,  v->aux_lutuv);
 
     INIT_LUT(32, 0, v->curr_luty[0], v->curr_lutuv[0], 0);
     INIT_LUT(32, 0, v->curr_luty[1], v->curr_lutuv[1], 0);
-    v->curr_use_ic = 0;
-    if (v->curr_luty == v->next_luty) {
-        // If we just initialized next_lut, clear next_use_ic to match.
-        v->next_use_ic = 0;
+    *v->curr_use_ic = 0;
+}
+
+static int read_bfraction(VC1Context *v, GetBitContext* gb) {
+    int bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
+
+    if (bfraction_lut_index == 21 || bfraction_lut_index < 0) {
+        av_log(v->s.avctx, AV_LOG_ERROR, "bfraction invalid\n");
+        return AVERROR_INVALIDDATA;
     }
+    v->bfraction_lut_index = bfraction_lut_index;
+    v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+    return 0;
 }
 
 int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
 {
     int pqindex, lowquant, status;
 
+    v->field_mode = 0;
+    v->fcm = 0;
     if (v->finterpflag)
         v->interpfrm = get_bits1(gb);
+    if (!v->s.avctx->codec)
+        return -1;
     if (v->s.avctx->codec_id == AV_CODEC_ID_MSS2)
         v->respic   =
         v->rangered =
@@ -636,22 +644,19 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
     v->rangeredfrm = 0;
     if (v->rangered)
         v->rangeredfrm = get_bits1(gb);
-    v->s.pict_type = get_bits1(gb);
-    if (v->s.avctx->max_b_frames) {
-        if (!v->s.pict_type) {
-            if (get_bits1(gb))
-                v->s.pict_type = AV_PICTURE_TYPE_I;
-            else
-                v->s.pict_type = AV_PICTURE_TYPE_B;
+    if (get_bits1(gb)) {
+        v->s.pict_type = AV_PICTURE_TYPE_P;
+    } else {
+        if (v->s.avctx->max_b_frames && !get_bits1(gb)) {
+            v->s.pict_type = AV_PICTURE_TYPE_B;
         } else
-            v->s.pict_type = AV_PICTURE_TYPE_P;
-    } else
-        v->s.pict_type = v->s.pict_type ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
+            v->s.pict_type = AV_PICTURE_TYPE_I;
+    }
 
     v->bi_type = 0;
     if (v->s.pict_type == AV_PICTURE_TYPE_B) {
-        v->bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
-        v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+        if (read_bfraction(v, gb) < 0)
+            return AVERROR_INVALIDDATA;
         if (v->bfraction == 0) {
             v->s.pict_type = AV_PICTURE_TYPE_BI;
         }
@@ -676,19 +681,25 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
         v->pq = ff_vc1_pquant_table[0][pqindex];
     else
         v->pq = ff_vc1_pquant_table[1][pqindex];
-
-    v->pquantizer = 1;
-    if (v->quantizer_mode == QUANT_FRAME_IMPLICIT)
-        v->pquantizer = pqindex < 9;
-    if (v->quantizer_mode == QUANT_NON_UNIFORM)
-        v->pquantizer = 0;
     v->pqindex = pqindex;
     if (pqindex < 9)
         v->halfpq = get_bits1(gb);
     else
         v->halfpq = 0;
-    if (v->quantizer_mode == QUANT_FRAME_EXPLICIT)
+    switch (v->quantizer_mode) {
+    case QUANT_FRAME_IMPLICIT:
+        v->pquantizer = pqindex < 9;
+        break;
+    case QUANT_NON_UNIFORM:
+        v->pquantizer = 0;
+        break;
+    case QUANT_FRAME_EXPLICIT:
         v->pquantizer = get_bits1(gb);
+        break;
+    default:
+        v->pquantizer = 1;
+        break;
+    }
     v->dquantfrm = 0;
     if (v->extended_mv == 1)
         v->mvrange = get_unary(gb, 0, 3);
@@ -712,9 +723,7 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
 
     switch (v->s.pict_type) {
     case AV_PICTURE_TYPE_P:
-        if (v->pq < 5)       v->tt_index = 0;
-        else if (v->pq < 13) v->tt_index = 1;
-        else                 v->tt_index = 2;
+        v->tt_index = (v->pq > 4) + (v->pq > 12);
 
         lowquant = (v->pq > 12) ? 0 : 1;
         v->mv_mode = ff_vc1_mv_pmode_table[lowquant][get_unary(gb, 1, 4)];
@@ -728,16 +737,15 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
             INIT_LUT(v->lumscale, v->lumshift, v->last_luty[1], v->last_lutuv[1], 1);
         }
         v->qs_last = v->s.quarter_sample;
-        if (v->mv_mode == MV_PMODE_1MV_HPEL || v->mv_mode == MV_PMODE_1MV_HPEL_BILIN)
-            v->s.quarter_sample = 0;
-        else if (v->mv_mode == MV_PMODE_INTENSITY_COMP) {
-            if (v->mv_mode2 == MV_PMODE_1MV_HPEL || v->mv_mode2 == MV_PMODE_1MV_HPEL_BILIN)
-                v->s.quarter_sample = 0;
-            else
-                v->s.quarter_sample = 1;
-        } else
-            v->s.quarter_sample = 1;
-        v->s.mspel = !(v->mv_mode == MV_PMODE_1MV_HPEL_BILIN || (v->mv_mode == MV_PMODE_INTENSITY_COMP && v->mv_mode2 == MV_PMODE_1MV_HPEL_BILIN));
+        if (v->mv_mode == MV_PMODE_INTENSITY_COMP) {
+            v->s.quarter_sample = (v->mv_mode2 != MV_PMODE_1MV_HPEL &&
+                                   v->mv_mode2 != MV_PMODE_1MV_HPEL_BILIN);
+            v->s.mspel          = (v->mv_mode2 != MV_PMODE_1MV_HPEL_BILIN);
+        } else {
+            v->s.quarter_sample = (v->mv_mode != MV_PMODE_1MV_HPEL &&
+                                   v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
+            v->s.mspel          = (v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
+        }
 
         if ((v->mv_mode  == MV_PMODE_INTENSITY_COMP &&
              v->mv_mode2 == MV_PMODE_MIXED_MV)      ||
@@ -766,21 +774,19 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
             vop_dquant_decoding(v);
         }
 
-        v->ttfrm = 0; //FIXME Is that so ?
         if (v->vstransform) {
             v->ttmbf = get_bits1(gb);
             if (v->ttmbf) {
                 v->ttfrm = ff_vc1_ttfrm_to_tt[get_bits(gb, 2)];
-            }
+            } else
+                v->ttfrm = 0; //FIXME Is that so ?
         } else {
             v->ttmbf = 1;
             v->ttfrm = TT_8X8;
         }
         break;
     case AV_PICTURE_TYPE_B:
-        if (v->pq < 5)       v->tt_index = 0;
-        else if (v->pq < 13) v->tt_index = 1;
-        else                 v->tt_index = 2;
+        v->tt_index = (v->pq > 4) + (v->pq > 12);
 
         v->mv_mode          = get_bits1(gb) ? MV_PMODE_1MV : MV_PMODE_1MV_HPEL_BILIN;
         v->qs_last          = v->s.quarter_sample;
@@ -806,12 +812,12 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
             vop_dquant_decoding(v);
         }
 
-        v->ttfrm = 0;
         if (v->vstransform) {
             v->ttmbf = get_bits1(gb);
             if (v->ttmbf) {
                 v->ttfrm = ff_vc1_ttfrm_to_tt[get_bits(gb, 2)];
-            }
+            } else
+                v->ttfrm = 0;
         } else {
             v->ttmbf = 1;
             v->ttfrm = TT_8X8;
@@ -846,9 +852,12 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
     v->numref          = 0;
     v->p_frame_skipped = 0;
     if (v->second_field) {
-        v->s.pict_type = (v->fptype & 1) ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
+        if (v->fcm != ILACE_FIELD || v->field_mode!=1)
+            return -1;
         if (v->fptype & 4)
             v->s.pict_type = (v->fptype & 1) ? AV_PICTURE_TYPE_BI : AV_PICTURE_TYPE_B;
+        else
+            v->s.pict_type = (v->fptype & 1) ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
         v->s.current_picture_ptr->f->pict_type = v->s.pict_type;
         if (!v->pic_header_flag)
             goto parse_common_info;
@@ -869,12 +878,15 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
     v->field_mode = field_mode;
     v->fcm = fcm;
 
+    av_assert0(    v->s.mb_height == v->s.height + 15 >> 4
+                || v->s.mb_height == FFALIGN(v->s.height + 15 >> 4, 2));
     if (v->field_mode) {
         v->s.mb_height = FFALIGN(v->s.height + 15 >> 4, 2);
         v->fptype = get_bits(gb, 3);
-        v->s.pict_type = (v->fptype & 2) ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
         if (v->fptype & 4) // B-picture
             v->s.pict_type = (v->fptype & 2) ? AV_PICTURE_TYPE_BI : AV_PICTURE_TYPE_B;
+        else
+            v->s.pict_type = (v->fptype & 2) ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
     } else {
         v->s.mb_height = v->s.height + 15 >> 4;
         switch (get_unary(gb, 0, 4)) {
@@ -905,6 +917,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
             v->tff = get_bits1(gb);
             v->rff = get_bits1(gb);
         }
+    } else {
+        v->tff = 1;
     }
     if (v->panscanflag) {
         avpriv_report_missing_feature(v->s.avctx, "Pan-scan");
@@ -916,6 +930,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
     v->rnd = get_bits1(gb);
     if (v->interlace)
         v->uvsamp = get_bits1(gb);
+    if(!ff_vc1_bfraction_vlc.table)
+        return 0; //parsing only, vlc tables havnt been allocated
     if (v->field_mode) {
         if (!v->refdist_flag)
             v->refdist = 0;
@@ -925,8 +941,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
                 v->refdist += get_unary(gb, 0, 16);
         }
         if ((v->s.pict_type == AV_PICTURE_TYPE_B) || (v->s.pict_type == AV_PICTURE_TYPE_BI)) {
-            v->bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
-            v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+            if (read_bfraction(v, gb) < 0)
+                return AVERROR_INVALIDDATA;
             v->frfd = (v->bfraction * v->refdist) >> 8;
             v->brfd = v->refdist - v->frfd - 1;
             if (v->brfd < 0)
@@ -938,8 +954,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         if (v->finterpflag)
             v->interpfrm = get_bits1(gb);
         if (v->s.pict_type == AV_PICTURE_TYPE_B) {
-            v->bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
-            v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+            if (read_bfraction(v, gb) < 0)
+                return AVERROR_INVALIDDATA;
             if (v->bfraction == 0) {
                 v->s.pict_type = AV_PICTURE_TYPE_BI; /* XXX: should not happen here */
             }
@@ -952,24 +968,29 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
     pqindex = get_bits(gb, 5);
     if (!pqindex)
         return -1;
-    v->pqindex = pqindex;
     if (v->quantizer_mode == QUANT_FRAME_IMPLICIT)
         v->pq = ff_vc1_pquant_table[0][pqindex];
     else
         v->pq = ff_vc1_pquant_table[1][pqindex];
-
-    v->pquantizer = 1;
-    if (v->quantizer_mode == QUANT_FRAME_IMPLICIT)
-        v->pquantizer = pqindex < 9;
-    if (v->quantizer_mode == QUANT_NON_UNIFORM)
-        v->pquantizer = 0;
     v->pqindex = pqindex;
     if (pqindex < 9)
         v->halfpq = get_bits1(gb);
     else
         v->halfpq = 0;
-    if (v->quantizer_mode == QUANT_FRAME_EXPLICIT)
+    switch (v->quantizer_mode) {
+    case QUANT_FRAME_IMPLICIT:
+        v->pquantizer = pqindex < 9;
+        break;
+    case QUANT_NON_UNIFORM:
+        v->pquantizer = 0;
+        break;
+    case QUANT_FRAME_EXPLICIT:
         v->pquantizer = get_bits1(gb);
+        break;
+    default:
+        v->pquantizer = 1;
+        break;
+    }
     if (v->postprocflag)
         v->postproc = get_bits(gb, 2);
 
@@ -1059,12 +1080,7 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         v->range_x = 1 << (v->k_x - 1);
         v->range_y = 1 << (v->k_y - 1);
 
-        if (v->pq < 5)
-            v->tt_index = 0;
-        else if (v->pq < 13)
-            v->tt_index = 1;
-        else
-            v->tt_index = 2;
+        v->tt_index = (v->pq > 4) + (v->pq > 12);
         if (v->fcm != ILACE_FRAME) {
             int mvmode;
             mvmode     = get_unary(gb, 1, 4);
@@ -1100,7 +1116,7 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
                         INIT_LUT(v->lumscale2, v->lumshift2, v->curr_luty[v->cur_field_type^1], v->curr_lutuv[v->cur_field_type^1], 0);
                         INIT_LUT(v->lumscale , v->lumshift , v->last_luty[v->cur_field_type  ], v->last_lutuv[v->cur_field_type  ], 1);
                     }
-                    v->next_use_ic = v->curr_use_ic = 1;
+                    v->next_use_ic = *v->curr_use_ic = 1;
                 } else {
                     INIT_LUT(v->lumscale , v->lumshift , v->last_luty[0], v->last_lutuv[0], 1);
                     INIT_LUT(v->lumscale2, v->lumshift2, v->last_luty[1], v->last_lutuv[1], 1);
@@ -1108,18 +1124,15 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
                 v->last_use_ic = 1;
             }
             v->qs_last = v->s.quarter_sample;
-            if (v->mv_mode == MV_PMODE_1MV_HPEL || v->mv_mode == MV_PMODE_1MV_HPEL_BILIN)
-                v->s.quarter_sample = 0;
-            else if (v->mv_mode == MV_PMODE_INTENSITY_COMP) {
-                if (v->mv_mode2 == MV_PMODE_1MV_HPEL || v->mv_mode2 == MV_PMODE_1MV_HPEL_BILIN)
-                    v->s.quarter_sample = 0;
-                else
-                    v->s.quarter_sample = 1;
-            } else
-                v->s.quarter_sample = 1;
-            v->s.mspel = !(v->mv_mode == MV_PMODE_1MV_HPEL_BILIN
-                           || (v->mv_mode == MV_PMODE_INTENSITY_COMP
-                               && v->mv_mode2 == MV_PMODE_1MV_HPEL_BILIN));
+            if (v->mv_mode == MV_PMODE_INTENSITY_COMP) {
+                v->s.quarter_sample = (v->mv_mode2 != MV_PMODE_1MV_HPEL &&
+                                       v->mv_mode2 != MV_PMODE_1MV_HPEL_BILIN);
+                v->s.mspel          = (v->mv_mode2 != MV_PMODE_1MV_HPEL_BILIN);
+            } else {
+                v->s.quarter_sample = (v->mv_mode != MV_PMODE_1MV_HPEL &&
+                                       v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
+                v->s.mspel          = (v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
+            }
         }
         if (v->fcm == PROGRESSIVE) { // progressive
             if ((v->mv_mode == MV_PMODE_INTENSITY_COMP &&
@@ -1170,12 +1183,12 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
             vop_dquant_decoding(v);
         }
 
-        v->ttfrm = 0; //FIXME Is that so ?
         if (v->vstransform) {
             v->ttmbf = get_bits1(gb);
             if (v->ttmbf) {
                 v->ttfrm = ff_vc1_ttfrm_to_tt[get_bits(gb, 2)];
-            }
+            } else
+                v->ttfrm = 0; //FIXME Is that so ?
         } else {
             v->ttmbf = 1;
             v->ttfrm = TT_8X8;
@@ -1183,8 +1196,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         break;
     case AV_PICTURE_TYPE_B:
         if (v->fcm == ILACE_FRAME) {
-            v->bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
-            v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+            if (read_bfraction(v, gb) < 0)
+                return AVERROR_INVALIDDATA;
             if (v->bfraction == 0) {
                 return -1;
             }
@@ -1198,15 +1211,11 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         v->range_x = 1 << (v->k_x - 1);
         v->range_y = 1 << (v->k_y - 1);
 
-        if (v->pq < 5)
-            v->tt_index = 0;
-        else if (v->pq < 13)
-            v->tt_index = 1;
-        else
-            v->tt_index = 2;
+        v->tt_index = (v->pq > 4) + (v->pq > 12);
 
         if (v->field_mode) {
             int mvmode;
+            av_log(v->s.avctx, AV_LOG_DEBUG, "B Fields\n");
             if (v->extended_dmv)
                 v->dmvrange = get_unary(gb, 0, 3);
             mvmode = get_unary(gb, 1, 3);
@@ -1290,12 +1299,12 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
             vop_dquant_decoding(v);
         }
 
-        v->ttfrm = 0;
         if (v->vstransform) {
             v->ttmbf = get_bits1(gb);
             if (v->ttmbf) {
                 v->ttfrm = ff_vc1_ttfrm_to_tt[get_bits(gb, 2)];
-            }
+            } else
+                v->ttfrm = 0;
         } else {
             v->ttmbf = 1;
             v->ttfrm = TT_8X8;
@@ -1321,11 +1330,10 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         vop_dquant_decoding(v);
     }
 
-    v->bi_type = 0;
-    if (v->s.pict_type == AV_PICTURE_TYPE_BI) {
+    v->bi_type = (v->s.pict_type == AV_PICTURE_TYPE_BI);
+    if (v->bi_type)
         v->s.pict_type = AV_PICTURE_TYPE_B;
-        v->bi_type = 1;
-    }
+
     return 0;
 }
 
diff --git a/libavcodec/vc1.h b/libavcodec/vc1.h
index 0c4958ca77..eefb613ee8 100644
--- a/libavcodec/vc1.h
+++ b/libavcodec/vc1.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -151,6 +151,21 @@ enum FrameCodingMode {
     ILACE_FIELD         ///<  in the bitstream is reported as 11b
 };
 
+/**
+ * Imode types
+ * @{
+ */
+enum Imode {
+    IMODE_RAW,
+    IMODE_NORM2,
+    IMODE_DIFF2,
+    IMODE_NORM6,
+    IMODE_DIFF6,
+    IMODE_ROWSKIP,
+    IMODE_COLSKIP
+};
+/** @} */ //imode defines
+
 /** The VC1 Context
  * @todo Change size wherever another size is more efficient
  * Many members are only used for Advanced Profile
@@ -203,6 +218,7 @@ typedef struct VC1Context{
     int profile;          ///< 2bits, Profile
     int frmrtq_postproc;  ///< 3bits,
     int bitrtq_postproc;  ///< 5bits, quantized framerate-based postprocessing strength
+    int max_coded_width, max_coded_height;
     int fastuvmc;         ///< Rounding of qpel vector to hpel ? (not in Simple)
     int extended_mv;      ///< Ext MV in P/B (not in Simple)
     int dquant;           ///< How qscale varies with MBs, 2bits (not in Simple)
@@ -278,7 +294,7 @@ typedef struct VC1Context{
     uint8_t  aux_luty[2][256],  aux_lutuv[2][256];  ///< lookup tables used for intensity compensation
     uint8_t next_luty[2][256], next_lutuv[2][256];  ///< lookup tables used for intensity compensation
     uint8_t (*curr_luty)[256]  ,(*curr_lutuv)[256];
-    int last_use_ic, curr_use_ic, next_use_ic, aux_use_ic;
+    int last_use_ic, *curr_use_ic, next_use_ic, aux_use_ic;
     int rnd;                        ///< rounding control
 
     /** Frame decoding info for S/M profiles only */
@@ -329,7 +345,7 @@ typedef struct VC1Context{
     uint8_t fourmvbp;
     uint8_t* fieldtx_plane;
     int fieldtx_is_raw;
-    int8_t zzi_8x8[64];
+    uint8_t zzi_8x8[64];
     uint8_t *blk_mv_type_base, *blk_mv_type;    ///< 0: frame MV, 1: field MV (interlaced frame)
     uint8_t *mv_f_base, *mv_f[2];               ///< 0: MV obtained from same field, 1: opposite field
     uint8_t *mv_f_next_base, *mv_f_next[2];
diff --git a/libavcodec/vc1_block.c b/libavcodec/vc1_block.c
index 4588af4100..255ba1da70 100644
--- a/libavcodec/vc1_block.c
+++ b/libavcodec/vc1_block.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,8 +40,10 @@
 #define DC_VLC_BITS 9
 
 // offset tables for interlaced picture MVDATA decoding
-static const int offset_table1[9] = {  0,  1,  2,  4,  8, 16, 32,  64, 128 };
-static const int offset_table2[9] = {  0,  1,  3,  7, 15, 31, 63, 127, 255 };
+static const uint8_t offset_table[2][9] = {
+    {  0,  1,  2,  4,  8, 16, 32,  64, 128 },
+    {  0,  1,  3,  7, 15, 31, 63, 127, 255 },
+};
 
 /***********************************************************************/
 /**
@@ -50,22 +52,8 @@ static const int offset_table2[9] = {  0,  1,  3,  7, 15, 31, 63, 127, 255 };
  * @{
  */
 
-/**
- * Imode types
- * @{
- */
-enum Imode {
-    IMODE_RAW,
-    IMODE_NORM2,
-    IMODE_DIFF2,
-    IMODE_NORM6,
-    IMODE_DIFF6,
-    IMODE_ROWSKIP,
-    IMODE_COLSKIP
-};
-/** @} */ //imode defines
 
-static void init_block_index(VC1Context *v)
+static inline void init_block_index(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
     ff_init_block_index(s);
@@ -111,12 +99,14 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
             s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][3],
                                               s->dest[0] - v_dist * s->linesize - 8,
                                               stride_y);
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
             s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][4],
                                               s->dest[1] - 8 * s->uvlinesize - 8,
                                               s->uvlinesize);
             s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][5],
                                               s->dest[2] - 8 * s->uvlinesize - 8,
                                               s->uvlinesize);
+            }
         }
         if (s->mb_x == s->mb_width - 1) {
             top_mb_pos = (s->mb_y - 1) * s->mb_stride + s->mb_x;
@@ -136,12 +126,14 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
             s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][3],
                                               s->dest[0] - v_dist * s->linesize + 8,
                                               stride_y);
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
             s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][4],
                                               s->dest[1] - 8 * s->uvlinesize,
                                               s->uvlinesize);
             s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][5],
                                               s->dest[2] - 8 * s->uvlinesize,
                                               s->uvlinesize);
+            }
         }
     }
 
@@ -230,33 +222,32 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
         s->mb_intra = 1;                                                \
     } else {                                                            \
         index1 = index % 6;                                             \
-        if (!s->quarter_sample && index1 == 5) val = 1;                 \
-        else                                   val = 0;                 \
-        if (size_table[index1] - val > 0)                               \
-            val = get_bits(gb, size_table[index1] - val);               \
-        else                                   val = 0;                 \
-        sign = 0 - (val&1);                                             \
-        _dmv_x = (sign ^ ((val>>1) + offset_table[index1])) - sign;     \
+        _dmv_x = offset_table[1][index1];                               \
+        val = size_table[index1] - (!s->quarter_sample && index1 == 5); \
+        if (val > 0) {                                                  \
+            val = get_bits(gb, val);                                    \
+            sign = 0 - (val & 1);                                       \
+            _dmv_x = (sign ^ ((val >> 1) + _dmv_x)) - sign;             \
+        }                                                               \
                                                                         \
         index1 = index / 6;                                             \
-        if (!s->quarter_sample && index1 == 5) val = 1;                 \
-        else                                   val = 0;                 \
-        if (size_table[index1] - val > 0)                               \
-            val = get_bits(gb, size_table[index1] - val);               \
-        else                                   val = 0;                 \
-        sign = 0 - (val & 1);                                           \
-        _dmv_y = (sign ^ ((val >> 1) + offset_table[index1])) - sign;   \
+        _dmv_y = offset_table[1][index1];                               \
+        val = size_table[index1] - (!s->quarter_sample && index1 == 5); \
+        if (val > 0) {                                                  \
+            val = get_bits(gb, val);                                    \
+            sign = 0 - (val & 1);                                       \
+            _dmv_y = (sign ^ ((val >> 1) + _dmv_y)) - sign;             \
+        }                                                               \
     }
 
 static av_always_inline void get_mvdata_interlaced(VC1Context *v, int *dmv_x,
                                                    int *dmv_y, int *pred_flag)
 {
     int index, index1;
-    int extend_x = 0, extend_y = 0;
+    int extend_x, extend_y;
     GetBitContext *gb = &v->s.gb;
     int bits, esc;
     int val, sign;
-    const int* offs_tab;
 
     if (v->numref) {
         bits = VC1_2REF_MVDATA_VLC_BITS;
@@ -265,51 +256,32 @@ static av_always_inline void get_mvdata_interlaced(VC1Context *v, int *dmv_x,
         bits = VC1_1REF_MVDATA_VLC_BITS;
         esc  = 71;
     }
-    switch (v->dmvrange) {
-    case 1:
-        extend_x = 1;
-        break;
-    case 2:
-        extend_y = 1;
-        break;
-    case 3:
-        extend_x = extend_y = 1;
-        break;
-    }
+    extend_x = v->dmvrange & 1;
+    extend_y = (v->dmvrange >> 1) & 1;
     index = get_vlc2(gb, v->imv_vlc->table, bits, 3);
     if (index == esc) {
         *dmv_x = get_bits(gb, v->k_x);
         *dmv_y = get_bits(gb, v->k_y);
         if (v->numref) {
-            if (pred_flag) {
+            if (pred_flag)
                 *pred_flag = *dmv_y & 1;
-                *dmv_y     = (*dmv_y + *pred_flag) >> 1;
-            } else {
-                *dmv_y     = (*dmv_y + (*dmv_y & 1)) >> 1;
-            }
+            *dmv_y = (*dmv_y + (*dmv_y & 1)) >> 1;
         }
     }
     else {
-        if (extend_x)
-            offs_tab = offset_table2;
-        else
-            offs_tab = offset_table1;
+        av_assert0(index < esc);
         index1 = (index + 1) % 9;
         if (index1 != 0) {
             val    = get_bits(gb, index1 + extend_x);
-            sign   = 0 -(val & 1);
-            *dmv_x = (sign ^ ((val >> 1) + offs_tab[index1])) - sign;
+            sign   = 0 - (val & 1);
+            *dmv_x = (sign ^ ((val >> 1) + offset_table[extend_x][index1])) - sign;
         } else
             *dmv_x = 0;
-        if (extend_y)
-            offs_tab = offset_table2;
-        else
-            offs_tab = offset_table1;
         index1 = (index + 1) / 9;
         if (index1 > v->numref) {
-            val    = get_bits(gb, (index1 + (extend_y << v->numref)) >> v->numref);
+            val    = get_bits(gb, (index1 >> v->numref) + extend_y);
             sign   = 0 - (val & 1);
-            *dmv_y = (sign ^ ((val >> 1) + offs_tab[index1 >> v->numref])) - sign;
+            *dmv_y = (sign ^ ((val >> 1) + offset_table[extend_y][index1 >> v->numref])) - sign;
         } else
             *dmv_y = 0;
         if (v->numref && pred_flag)
@@ -420,6 +392,12 @@ static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
     int q1, q2 = 0;
     int dqscale_index;
 
+    /* scale predictors if needed */
+    q1 = s->current_picture.qscale_table[mb_pos];
+    dqscale_index = s->y_dc_scale_table[q1] - 1;
+    if (dqscale_index < 0)
+        return 0;
+
     wrap = s->block_wrap[n];
     dc_val = s->dc_val[0] + s->block_index[n];
 
@@ -429,11 +407,7 @@ static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
     c = dc_val[ - 1];
     b = dc_val[ - 1 - wrap];
     a = dc_val[ - wrap];
-    /* scale predictors if needed */
-    q1 = s->current_picture.qscale_table[mb_pos];
-    dqscale_index = s->y_dc_scale_table[q1] - 1;
-    if (dqscale_index < 0)
-        return 0;
+
     if (c_avail && (n != 1 && n != 3)) {
         q2 = s->current_picture.qscale_table[mb_pos - 1];
         if (q2 && q2 != q1)
@@ -455,20 +429,12 @@ static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
             b = (b * s->y_dc_scale_table[q2] * ff_vc1_dqscale[dqscale_index] + 0x20000) >> 18;
     }
 
-    if (a_avail && c_avail) {
-        if (abs(a - b) <= abs(b - c)) {
-            pred     = c;
-            *dir_ptr = 1; // left
-        } else {
-            pred     = a;
-            *dir_ptr = 0; // top
-        }
+    if (c_avail && (!a_avail || abs(a - b) <= abs(b - c))) {
+        pred     = c;
+        *dir_ptr = 1; // left
     } else if (a_avail) {
         pred     = a;
         *dir_ptr = 0; // top
-    } else if (c_avail) {
-        pred     = c;
-        *dir_ptr = 1; // left
     } else {
         pred     = 0;
         *dir_ptr = 1; // left
@@ -527,17 +493,16 @@ static void vc1_decode_ac_coeff(VC1Context *v, int *last, int *skip,
                                 int *value, int codingset)
 {
     GetBitContext *gb = &v->s.gb;
-    int index, escape, run = 0, level = 0, lst = 0;
+    int index, run, level, lst, sign;
 
     index = get_vlc2(gb, ff_vc1_ac_coeff_table[codingset].table, AC_VLC_BITS, 3);
     if (index != ff_vc1_ac_sizes[codingset] - 1) {
         run   = vc1_index_decode_table[codingset][index][0];
         level = vc1_index_decode_table[codingset][index][1];
         lst   = index >= vc1_last_decode_table[codingset] || get_bits_left(gb) < 0;
-        if (get_bits1(gb))
-            level = -level;
+        sign  = get_bits1(gb);
     } else {
-        escape = decode210(gb);
+        int escape = decode210(gb);
         if (escape != 2) {
             index = get_vlc2(gb, ff_vc1_ac_coeff_table[codingset].table, AC_VLC_BITS, 3);
             run   = vc1_index_decode_table[codingset][index][0];
@@ -554,10 +519,8 @@ static void vc1_decode_ac_coeff(VC1Context *v, int *last, int *skip,
                 else
                     run += vc1_delta_run_table[codingset][level] + 1;
             }
-            if (get_bits1(gb))
-                level = -level;
+            sign = get_bits1(gb);
         } else {
-            int sign;
             lst = get_bits1(gb);
             if (v->s.esc3_level_length == 0) {
                 if (v->pq < 8 || v->dquantfrm) { // table 59
@@ -572,14 +535,12 @@ static void vc1_decode_ac_coeff(VC1Context *v, int *last, int *skip,
             run   = get_bits(gb, v->s.esc3_run_length);
             sign  = get_bits1(gb);
             level = get_bits(gb, v->s.esc3_level_length);
-            if (sign)
-                level = -level;
         }
     }
 
     *last  = lst;
     *skip  = run;
-    *value = level;
+    *value = (level ^ -sign) + sign;
 }
 
 /** Decode intra block in intra frames - should be faster than decode_intra_block
@@ -598,7 +559,7 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
     int i;
     int16_t *dc_val;
     int16_t *ac_val, *ac_val2;
-    int dcdiff;
+    int dcdiff, scale;
 
     /* Get DC differential */
     if (n < 4) {
@@ -611,16 +572,12 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
         return -1;
     }
     if (dcdiff) {
+        const int m = (v->pq == 1 || v->pq == 2) ? 3 - v->pq : 0;
         if (dcdiff == 119 /* ESC index value */) {
-            /* TODO: Optimize */
-            if (v->pq == 1)      dcdiff = get_bits(gb, 10);
-            else if (v->pq == 2) dcdiff = get_bits(gb, 9);
-            else                 dcdiff = get_bits(gb, 8);
+            dcdiff = get_bits(gb, 8 + m);
         } else {
-            if (v->pq == 1)
-                dcdiff = (dcdiff << 2) + get_bits(gb, 2) - 3;
-            else if (v->pq == 2)
-                dcdiff = (dcdiff << 1) + get_bits1(gb)   - 1;
+            if (m)
+                dcdiff = (dcdiff << m) + get_bits(gb, m) - ((1 << m) - 1);
         }
         if (get_bits1(gb))
             dcdiff = -dcdiff;
@@ -631,27 +588,29 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
     *dc_val = dcdiff;
 
     /* Store the quantized DC coeff, used for prediction */
-    if (n < 4) {
-        block[0] = dcdiff * s->y_dc_scale;
-    } else {
-        block[0] = dcdiff * s->c_dc_scale;
-    }
-    /* Skip ? */
-    if (!coded) {
-        goto not_coded;
-    }
+    if (n < 4)
+        scale = s->y_dc_scale;
+    else
+        scale = s->c_dc_scale;
+    block[0] = dcdiff * scale;
 
-    // AC Decoding
-    i = 1;
+    ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
+    ac_val2 = ac_val;
+    if (dc_pred_dir) // left
+        ac_val -= 16;
+    else // top
+        ac_val -= 16 * s->block_wrap[n];
 
-    {
+    scale = v->pq * 2 + v->halfpq;
+
+    //AC Decoding
+    i = !!coded;
+
+    if (coded) {
         int last = 0, skip, value;
         const uint8_t *zz_table;
-        int scale;
         int k;
 
-        scale = v->pq * 2 + v->halfpq;
-
         if (v->s.ac_pred) {
             if (!dc_pred_dir)
                 zz_table = v->zz_8x8[2];
@@ -660,13 +619,6 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
         } else
             zz_table = v->zz_8x8[1];
 
-        ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
-        ac_val2 = ac_val;
-        if (dc_pred_dir) // left
-            ac_val -= 16;
-        else // top
-            ac_val -= 16 * s->block_wrap[n];
-
         while (!last) {
             vc1_decode_ac_coeff(v, &last, &skip, &value, codingset);
             i += skip;
@@ -677,13 +629,15 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
 
         /* apply AC prediction if needed */
         if (s->ac_pred) {
+            int sh;
             if (dc_pred_dir) { // left
-                for (k = 1; k < 8; k++)
-                    block[k << v->left_blk_sh] += ac_val[k];
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++)
-                    block[k << v->top_blk_sh] += ac_val[k + 8];
+                sh = v->top_blk_sh;
+                ac_val += 8;
             }
+            for (k = 1; k < 8; k++)
+                block[k << sh] += ac_val[k];
         }
         /* save AC coeffs for further prediction */
         for (k = 1; k < 8; k++) {
@@ -699,46 +653,30 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
                     block[k] += (block[k] < 0) ? -v->pq : v->pq;
             }
 
-        if (s->ac_pred) i = 63;
-    }
-
-not_coded:
-    if (!coded) {
-        int k, scale;
-        ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
-        ac_val2 = ac_val;
+    } else {
+        int k;
 
-        i = 0;
-        scale = v->pq * 2 + v->halfpq;
         memset(ac_val2, 0, 16 * 2);
-        if (dc_pred_dir) { // left
-            ac_val -= 16;
-            if (s->ac_pred)
-                memcpy(ac_val2, ac_val, 8 * 2);
-        } else { // top
-            ac_val -= 16 * s->block_wrap[n];
-            if (s->ac_pred)
-                memcpy(ac_val2 + 8, ac_val + 8, 8 * 2);
-        }
 
         /* apply AC prediction if needed */
         if (s->ac_pred) {
+            int sh;
             if (dc_pred_dir) { //left
-                for (k = 1; k < 8; k++) {
-                    block[k << v->left_blk_sh] = ac_val[k] * scale;
-                    if (!v->pquantizer && block[k << v->left_blk_sh])
-                        block[k << v->left_blk_sh] += (block[k << v->left_blk_sh] < 0) ? -v->pq : v->pq;
-                }
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++) {
-                    block[k << v->top_blk_sh] = ac_val[k + 8] * scale;
-                    if (!v->pquantizer && block[k << v->top_blk_sh])
-                        block[k << v->top_blk_sh] += (block[k << v->top_blk_sh] < 0) ? -v->pq : v->pq;
-                }
+                sh = v->top_blk_sh;
+                ac_val  += 8;
+                ac_val2 += 8;
+            }
+            memcpy(ac_val2, ac_val, 8 * 2);
+            for (k = 1; k < 8; k++) {
+                block[k << sh] = ac_val[k] * scale;
+                if (!v->pquantizer && block[k << sh])
+                    block[k << sh] += (block[k << sh] < 0) ? -v->pq : v->pq;
             }
-            i = 63;
         }
     }
+    if (s->ac_pred) i = 63;
     s->block_last_index[n] = i;
 
     return 0;
@@ -759,7 +697,7 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
     MpegEncContext *s = &v->s;
     int dc_pred_dir = 0; /* Direction of the DC prediction used */
     int i;
-    int16_t *dc_val;
+    int16_t *dc_val = NULL;
     int16_t *ac_val, *ac_val2;
     int dcdiff;
     int a_avail = v->a_avail, c_avail = v->c_avail;
@@ -779,16 +717,12 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
         return -1;
     }
     if (dcdiff) {
+        const int m = (mquant == 1 || mquant == 2) ? 3 - mquant : 0;
         if (dcdiff == 119 /* ESC index value */) {
-            /* TODO: Optimize */
-            if (mquant == 1)      dcdiff = get_bits(gb, 10);
-            else if (mquant == 2) dcdiff = get_bits(gb, 9);
-            else                  dcdiff = get_bits(gb, 8);
+            dcdiff = get_bits(gb, 8 + m);
         } else {
-            if (mquant == 1)
-                dcdiff = (dcdiff << 2) + get_bits(gb, 2) - 3;
-            else if (mquant == 2)
-                dcdiff = (dcdiff << 1) + get_bits1(gb)   - 1;
+            if (m)
+                dcdiff = (dcdiff << m) + get_bits(gb, m) - ((1 << m) - 1);
         }
         if (get_bits1(gb))
             dcdiff = -dcdiff;
@@ -799,39 +733,42 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
     *dc_val = dcdiff;
 
     /* Store the quantized DC coeff, used for prediction */
-    if (n < 4) {
-        block[0] = dcdiff * s->y_dc_scale;
-    } else {
-        block[0] = dcdiff * s->c_dc_scale;
-    }
-
-    //AC Decoding
-    i = 1;
+    if (n < 4)
+        scale = s->y_dc_scale;
+    else
+        scale = s->c_dc_scale;
+    block[0] = dcdiff * scale;
 
     /* check if AC is needed at all */
     if (!a_avail && !c_avail)
         use_pred = 0;
-    ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
-    ac_val2 = ac_val;
 
     scale = mquant * 2 + ((mquant == v->pq) ? v->halfpq : 0);
 
+    ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
+    ac_val2 = ac_val;
     if (dc_pred_dir) // left
         ac_val -= 16;
     else // top
         ac_val -= 16 * s->block_wrap[n];
 
     q1 = s->current_picture.qscale_table[mb_pos];
-    if (dc_pred_dir && c_avail && mb_pos)
-        q2 = s->current_picture.qscale_table[mb_pos - 1];
-    if (!dc_pred_dir && a_avail && mb_pos >= s->mb_stride)
-        q2 = s->current_picture.qscale_table[mb_pos - s->mb_stride];
-    if (dc_pred_dir && n == 1)
-        q2 = q1;
-    if (!dc_pred_dir && n == 2)
-        q2 = q1;
     if (n == 3)
         q2 = q1;
+    else if (dc_pred_dir) {
+        if (n == 1)
+            q2 = q1;
+        else if (c_avail && mb_pos)
+            q2 = s->current_picture.qscale_table[mb_pos - 1];
+    } else {
+        if (n == 2)
+            q2 = q1;
+        else if (a_avail && mb_pos >= s->mb_stride)
+            q2 = s->current_picture.qscale_table[mb_pos - s->mb_stride];
+    }
+
+    //AC Decoding
+    i = 1;
 
     if (coded) {
         int last = 0, skip, value;
@@ -864,28 +801,24 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
 
         /* apply AC prediction if needed */
         if (use_pred) {
+            int sh;
+            if (dc_pred_dir) { // left
+                sh = v->left_blk_sh;
+            } else { // top
+                sh = v->top_blk_sh;
+                ac_val += 8;
+            }
             /* scale predictors if needed*/
             if (q2 && q1 != q2) {
                 q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-
                 if (q1 < 1)
                     return AVERROR_INVALIDDATA;
-                if (dc_pred_dir) { // left
-                    for (k = 1; k < 8; k++)
-                        block[k << v->left_blk_sh] += (ac_val[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                } else { // top
-                    for (k = 1; k < 8; k++)
-                        block[k << v->top_blk_sh] += (ac_val[k + 8] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
+                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
+                for (k = 1; k < 8; k++)
+                    block[k << sh] += (ac_val[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
             } else {
-                if (dc_pred_dir) { //left
-                    for (k = 1; k < 8; k++)
-                        block[k << v->left_blk_sh] += ac_val[k];
-                } else { //top
-                    for (k = 1; k < 8; k++)
-                        block[k << v->top_blk_sh] += ac_val[k + 8];
-                }
+                for (k = 1; k < 8; k++)
+                    block[k << sh] += ac_val[k];
             }
         }
         /* save AC coeffs for further prediction */
@@ -902,55 +835,38 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
                     block[k] += (block[k] < 0) ? -mquant : mquant;
             }
 
-        if (use_pred) i = 63;
     } else { // no AC coeffs
         int k;
 
         memset(ac_val2, 0, 16 * 2);
-        if (dc_pred_dir) { // left
-            if (use_pred) {
-                memcpy(ac_val2, ac_val, 8 * 2);
-                if (q2 && q1 != q2) {
-                    q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                    q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-                    if (q1 < 1)
-                        return AVERROR_INVALIDDATA;
-                    for (k = 1; k < 8; k++)
-                        ac_val2[k] = (ac_val2[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
-            }
-        } else { // top
-            if (use_pred) {
-                memcpy(ac_val2 + 8, ac_val + 8, 8 * 2);
-                if (q2 && q1 != q2) {
-                    q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                    q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-                    if (q1 < 1)
-                        return AVERROR_INVALIDDATA;
-                    for (k = 1; k < 8; k++)
-                        ac_val2[k + 8] = (ac_val2[k + 8] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
-            }
-        }
 
         /* apply AC prediction if needed */
         if (use_pred) {
+            int sh;
             if (dc_pred_dir) { // left
-                for (k = 1; k < 8; k++) {
-                    block[k << v->left_blk_sh] = ac_val2[k] * scale;
-                    if (!v->pquantizer && block[k << v->left_blk_sh])
-                        block[k << v->left_blk_sh] += (block[k << v->left_blk_sh] < 0) ? -mquant : mquant;
-                }
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++) {
-                    block[k << v->top_blk_sh] = ac_val2[k + 8] * scale;
-                    if (!v->pquantizer && block[k << v->top_blk_sh])
-                        block[k << v->top_blk_sh] += (block[k << v->top_blk_sh] < 0) ? -mquant : mquant;
-                }
+                sh = v->top_blk_sh;
+                ac_val  += 8;
+                ac_val2 += 8;
+            }
+            memcpy(ac_val2, ac_val, 8 * 2);
+            if (q2 && q1 != q2) {
+                q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
+                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
+                if (q1 < 1)
+                    return AVERROR_INVALIDDATA;
+                for (k = 1; k < 8; k++)
+                    ac_val2[k] = (ac_val2[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
+            }
+            for (k = 1; k < 8; k++) {
+                block[k << sh] = ac_val2[k] * scale;
+                if (!v->pquantizer && block[k << sh])
+                    block[k << sh] += (block[k << sh] < 0) ? -mquant : mquant;
             }
-            i = 63;
         }
     }
+    if (use_pred) i = 63;
     s->block_last_index[n] = i;
 
     return 0;
@@ -971,7 +887,7 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
     MpegEncContext *s = &v->s;
     int dc_pred_dir = 0; /* Direction of the DC prediction used */
     int i;
-    int16_t *dc_val;
+    int16_t *dc_val = NULL;
     int16_t *ac_val, *ac_val2;
     int dcdiff;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
@@ -983,7 +899,7 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
     s->bdsp.clear_block(block);
 
     /* XXX: Guard against dumb values of mquant */
-    mquant = (mquant < 1) ? 0 : ((mquant > 31) ? 31 : mquant);
+    mquant = av_clip_uintp2(mquant, 5);
 
     /* Set DC scale - y and c use the same */
     s->y_dc_scale = s->y_dc_scale_table[mquant];
@@ -1000,16 +916,12 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
         return -1;
     }
     if (dcdiff) {
+        const int m = (mquant == 1 || mquant == 2) ? 3 - mquant : 0;
         if (dcdiff == 119 /* ESC index value */) {
-            /* TODO: Optimize */
-            if (mquant == 1)      dcdiff = get_bits(gb, 10);
-            else if (mquant == 2) dcdiff = get_bits(gb, 9);
-            else                  dcdiff = get_bits(gb, 8);
+            dcdiff = get_bits(gb, 8 + m);
         } else {
-            if (mquant == 1)
-                dcdiff = (dcdiff << 2) + get_bits(gb, 2) - 3;
-            else if (mquant == 2)
-                dcdiff = (dcdiff << 1) + get_bits1(gb)   - 1;
+            if (m)
+                dcdiff = (dcdiff << m) + get_bits(gb, m) - ((1 << m) - 1);
         }
         if (get_bits1(gb))
             dcdiff = -dcdiff;
@@ -1333,8 +1245,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
 
 /** @} */ // Macroblock group
 
-static const int size_table  [6] = { 0, 2, 3, 4,  5,  8 };
-static const int offset_table[6] = { 0, 1, 3, 7, 15, 31 };
+static const uint8_t size_table[6] = { 0, 2, 3, 4,  5,  8 };
 
 /** Decode one P-frame MB
  */
@@ -1416,7 +1327,7 @@ static int vc1_decode_p_mb(VC1Context *v)
 
                     vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                            (i & 4) ? v->codingset2 : v->codingset);
-                    if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                    if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                         continue;
                     v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
                     if (v->rangeredfrm)
@@ -1437,7 +1348,7 @@ static int vc1_decode_p_mb(VC1Context *v)
                 } else if (val) {
                     pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb, first_block,
                                              s->dest[dst_idx] + off, (i & 4) ? s->uvlinesize : s->linesize,
-                                             (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
+                                             CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
@@ -1527,7 +1438,7 @@ static int vc1_decode_p_mb(VC1Context *v)
 
                     vc1_decode_intra_block(v, s->block[i], i, is_coded[i], mquant,
                                            (i & 4) ? v->codingset2 : v->codingset);
-                    if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                    if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                         continue;
                     v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
                     if (v->rangeredfrm)
@@ -1549,7 +1460,7 @@ static int vc1_decode_p_mb(VC1Context *v)
                     pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                              first_block, s->dest[dst_idx] + off,
                                              (i & 4) ? s->uvlinesize : s->linesize,
-                                             (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY),
+                                             CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY),
                                              &block_tt);
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
@@ -1602,7 +1513,7 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
     int idx_mbmode = 0, mvbp;
     int stride_y, fieldtx;
 
-    mquant = v->pq; /* Loosy initialization */
+    mquant = v->pq; /* Lossy initialization */
 
     if (v->skip_is_raw)
         skipped = get_bits1(gb);
@@ -1675,7 +1586,7 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
 
                 vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                        (i & 4) ? v->codingset2 : v->codingset);
-                if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                     continue;
                 v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
                 if (i < 4) {
@@ -1711,19 +1622,14 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
             dst_idx = 0;
             if (fourmv) {
                 mvbp = v->fourmvbp;
-                for (i = 0; i < 6; i++) {
-                    if (i < 4) {
-                        dmv_x = dmv_y = 0;
-                        val   = ((mvbp >> (3 - i)) & 1);
-                        if (val) {
-                            get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
-                        }
-                        ff_vc1_pred_mv_intfr(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], 0);
-                        ff_vc1_mc_4mv_luma(v, i, 0, 0);
-                    } else if (i == 4) {
-                        ff_vc1_mc_4mv_chroma4(v, 0, 0, 0);
-                    }
+                for (i = 0; i < 4; i++) {
+                    dmv_x = dmv_y = 0;
+                    if (mvbp & (8 >> i))
+                        get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
+                    ff_vc1_pred_mv_intfr(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], 0);
+                    ff_vc1_mc_4mv_luma(v, i, 0, 0);
                 }
+                ff_vc1_mc_4mv_chroma4(v, 0, 0, 0);
             } else if (twomv) {
                 mvbp  = v->twomvbp;
                 dmv_x = dmv_y = 0;
@@ -1767,7 +1673,7 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
                     pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                              first_block, s->dest[dst_idx] + off,
                                              (i & 4) ? s->uvlinesize : (s->linesize << fieldtx),
-                                             (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
+                                             CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
@@ -1810,11 +1716,11 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
     int val; /* temp values */
     int first_block = 1;
     int dst_idx, off;
-    int pred_flag;
+    int pred_flag = 0;
     int block_cbp = 0, pat, block_tt = 0;
     int idx_mbmode = 0;
 
-    mquant = v->pq; /* Loosy initialization */
+    mquant = v->pq; /* Lossy initialization */
 
     idx_mbmode = get_vlc2(gb, v->mbmode_vlc->table, VC1_IF_MBMODE_VLC_BITS, 2);
     if (idx_mbmode <= 1) { // intra MB
@@ -1846,7 +1752,7 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
 
             vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                    (i & 4) ? v->codingset2 : v->codingset);
-            if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 continue;
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             off  = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize);
@@ -1859,7 +1765,8 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
     } else {
         s->mb_intra = v->is_intra[s->mb_x] = 0;
         s->current_picture.mb_type[mb_pos + v->mb_off] = MB_TYPE_16x16;
-        for (i = 0; i < 6; i++) v->mb_type[0][s->block_index[i]] = 0;
+        for (i = 0; i < 6; i++)
+            v->mb_type[0][s->block_index[i]] = 0;
         if (idx_mbmode <= 5) { // 1-MV
             dmv_x = dmv_y = pred_flag = 0;
             if (idx_mbmode & 1) {
@@ -1870,18 +1777,14 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
             mb_has_coeffs = !(idx_mbmode & 2);
         } else { // 4-MV
             v->fourmvbp = get_vlc2(gb, v->fourmvbp_vlc->table, VC1_4MV_BLOCK_PATTERN_VLC_BITS, 1);
-            for (i = 0; i < 6; i++) {
-                if (i < 4) {
-                    dmv_x = dmv_y = pred_flag = 0;
-                    val   = ((v->fourmvbp >> (3 - i)) & 1);
-                    if (val) {
-                        get_mvdata_interlaced(v, &dmv_x, &dmv_y, &pred_flag);
-                    }
-                    ff_vc1_pred_mv(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], pred_flag, 0);
-                    ff_vc1_mc_4mv_luma(v, i, 0, 0);
-                } else if (i == 4)
-                    ff_vc1_mc_4mv_chroma(v, 0);
+            for (i = 0; i < 4; i++) {
+                dmv_x = dmv_y = pred_flag = 0;
+                if (v->fourmvbp & (8 >> i))
+                    get_mvdata_interlaced(v, &dmv_x, &dmv_y, &pred_flag);
+                ff_vc1_pred_mv(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], pred_flag, 0);
+                ff_vc1_mc_4mv_luma(v, i, 0, 0);
             }
+            ff_vc1_mc_4mv_chroma(v, 0);
             mb_has_coeffs = idx_mbmode & 1;
         }
         if (mb_has_coeffs)
@@ -1903,10 +1806,11 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
                 pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                          first_block, s->dest[dst_idx] + off,
                                          (i & 4) ? s->uvlinesize : s->linesize,
-                                         (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY),
+                                         CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY),
                                          &block_tt);
                 block_cbp |= pat << (i << 2);
-                if (!v->ttmbf && ttmb < 8) ttmb = -1;
+                if (!v->ttmbf && ttmb < 8)
+                    ttmb = -1;
                 first_block = 0;
             }
         }
@@ -2049,7 +1953,7 @@ static void vc1_decode_b_mb(VC1Context *v)
 
             vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                    (i & 4) ? v->codingset2 : v->codingset);
-            if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 continue;
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             if (v->rangeredfrm)
@@ -2063,7 +1967,7 @@ static void vc1_decode_b_mb(VC1Context *v)
             vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                first_block, s->dest[dst_idx] + off,
                                (i & 4) ? s->uvlinesize : s->linesize,
-                               (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), NULL);
+                               CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), NULL);
             if (!v->ttmbf && ttmb < 8)
                 ttmb = -1;
             first_block = 0;
@@ -2089,9 +1993,9 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
     int fwd;
     int dmv_x[2], dmv_y[2], pred_flag[2];
     int bmvtype = BMV_TYPE_BACKWARD;
-    int idx_mbmode, interpmvp;
+    int idx_mbmode;
 
-    mquant      = v->pq; /* Loosy initialization */
+    mquant      = v->pq; /* Lossy initialization */
     s->mb_intra = 0;
 
     idx_mbmode = get_vlc2(gb, v->mbmode_vlc->table, VC1_IF_MBMODE_VLC_BITS, 2);
@@ -2124,7 +2028,7 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
 
             vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                    (i & 4) ? v->codingset2 : v->codingset);
-            if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 continue;
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             if (v->rangeredfrm)
@@ -2140,12 +2044,14 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
     } else {
         s->mb_intra = v->is_intra[s->mb_x] = 0;
         s->current_picture.mb_type[mb_pos + v->mb_off] = MB_TYPE_16x16;
-        for (i = 0; i < 6; i++) v->mb_type[0][s->block_index[i]] = 0;
+        for (i = 0; i < 6; i++)
+            v->mb_type[0][s->block_index[i]] = 0;
         if (v->fmb_is_raw)
             fwd = v->forward_mb_plane[mb_pos] = get_bits1(gb);
         else
             fwd = v->forward_mb_plane[mb_pos];
         if (idx_mbmode <= 5) { // 1-MV
+            int interpmvp = 0;
             dmv_x[0]     = dmv_x[1] = dmv_y[0] = dmv_y[1] = 0;
             pred_flag[0] = pred_flag[1] = 0;
             if (fwd)
@@ -2168,12 +2074,16 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
             if (bmvtype != BMV_TYPE_DIRECT && idx_mbmode & 1) {
                 get_mvdata_interlaced(v, &dmv_x[bmvtype == BMV_TYPE_BACKWARD], &dmv_y[bmvtype == BMV_TYPE_BACKWARD], &pred_flag[bmvtype == BMV_TYPE_BACKWARD]);
             }
-            if (bmvtype == BMV_TYPE_INTERPOLATED && interpmvp) {
+            if (interpmvp) {
                 get_mvdata_interlaced(v, &dmv_x[1], &dmv_y[1], &pred_flag[1]);
             }
             if (bmvtype == BMV_TYPE_DIRECT) {
                 dmv_x[0] = dmv_y[0] = pred_flag[0] = 0;
                 dmv_x[1] = dmv_y[1] = pred_flag[0] = 0;
+                if (!s->next_picture_ptr->field_picture) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Mixed field/frame direct mode not supported\n");
+                    return;
+                }
             }
             ff_vc1_pred_b_mv_intfi(v, 0, dmv_x, dmv_y, 1, pred_flag);
             vc1_b_mc(v, dmv_x, dmv_y, (bmvtype == BMV_TYPE_DIRECT), bmvtype);
@@ -2183,21 +2093,18 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
                 bmvtype = BMV_TYPE_FORWARD;
             v->bmvtype  = bmvtype;
             v->fourmvbp = get_vlc2(gb, v->fourmvbp_vlc->table, VC1_4MV_BLOCK_PATTERN_VLC_BITS, 1);
-            for (i = 0; i < 6; i++) {
-                if (i < 4) {
-                    dmv_x[0] = dmv_y[0] = pred_flag[0] = 0;
-                    dmv_x[1] = dmv_y[1] = pred_flag[1] = 0;
-                    val = ((v->fourmvbp >> (3 - i)) & 1);
-                    if (val) {
-                        get_mvdata_interlaced(v, &dmv_x[bmvtype == BMV_TYPE_BACKWARD],
-                                                 &dmv_y[bmvtype == BMV_TYPE_BACKWARD],
-                                             &pred_flag[bmvtype == BMV_TYPE_BACKWARD]);
-                    }
-                    ff_vc1_pred_b_mv_intfi(v, i, dmv_x, dmv_y, 0, pred_flag);
-                    ff_vc1_mc_4mv_luma(v, i, bmvtype == BMV_TYPE_BACKWARD, 0);
-                } else if (i == 4)
-                    ff_vc1_mc_4mv_chroma(v, bmvtype == BMV_TYPE_BACKWARD);
+            for (i = 0; i < 4; i++) {
+                dmv_x[0] = dmv_y[0] = pred_flag[0] = 0;
+                dmv_x[1] = dmv_y[1] = pred_flag[1] = 0;
+                if (v->fourmvbp & (8 >> i)) {
+                    get_mvdata_interlaced(v, &dmv_x[bmvtype == BMV_TYPE_BACKWARD],
+                                             &dmv_y[bmvtype == BMV_TYPE_BACKWARD],
+                                         &pred_flag[bmvtype == BMV_TYPE_BACKWARD]);
+                }
+                ff_vc1_pred_b_mv_intfi(v, i, dmv_x, dmv_y, 0, pred_flag);
+                ff_vc1_mc_4mv_luma(v, i, bmvtype == BMV_TYPE_BACKWARD, 0);
             }
+            ff_vc1_mc_4mv_chroma(v, bmvtype == BMV_TYPE_BACKWARD);
             mb_has_coeffs = idx_mbmode & 1;
         }
         if (mb_has_coeffs)
@@ -2219,7 +2126,7 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
                 vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                    first_block, s->dest[dst_idx] + off,
                                    (i & 4) ? s->uvlinesize : s->linesize,
-                                   (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), NULL);
+                                   CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), NULL);
                 if (!v->ttmbf && ttmb < 8)
                     ttmb = -1;
                 first_block = 0;
@@ -2281,6 +2188,8 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
         direct = v->direct_mb_plane[mb_pos];
 
     if (direct) {
+        if (s->next_picture_ptr->field_picture)
+            av_log(s->avctx, AV_LOG_WARNING, "Mixed frame/field direct mode not supported\n");
         s->mv[0][0][0] = s->current_picture.motion_val[0][s->block_index[0]][0] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][0], v->bfraction, 0, s->quarter_sample);
         s->mv[0][0][1] = s->current_picture.motion_val[0][s->block_index[0]][1] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][1], v->bfraction, 0, s->quarter_sample);
         s->mv[1][0][0] = s->current_picture.motion_val[1][s->block_index[0]][0] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][0], v->bfraction, 1, s->quarter_sample);
@@ -2342,7 +2251,7 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
 
             vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                    (i & 4) ? v->codingset2 : v->codingset);
-            if (i > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            if (CONFIG_GRAY && i > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 continue;
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             if (i < 4) {
@@ -2508,7 +2417,7 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
                     pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                              first_block, s->dest[dst_idx] + off,
                                              (i & 4) ? s->uvlinesize : (s->linesize << fieldtx),
-                                             (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
+                                             CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
@@ -2651,7 +2560,7 @@ static void vc1_decode_i_blocks(VC1Context *v)
 
                 vc1_decode_i_block(v, s->block[k], k, val, (k < 4) ? v->codingset : v->codingset2);
 
-                if (k > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                if (CONFIG_GRAY && k > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                     continue;
                 v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
                 if (v->pq >= 9 && v->overlap) {
@@ -2675,7 +2584,7 @@ static void vc1_decode_i_blocks(VC1Context *v)
                 if (s->mb_x) {
                     v->vc1dsp.vc1_h_overlap(s->dest[0], s->linesize);
                     v->vc1dsp.vc1_h_overlap(s->dest[0] + 8 * s->linesize, s->linesize);
-                    if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+                    if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
                         v->vc1dsp.vc1_h_overlap(s->dest[1], s->uvlinesize);
                         v->vc1dsp.vc1_h_overlap(s->dest[2], s->uvlinesize);
                     }
@@ -2685,7 +2594,7 @@ static void vc1_decode_i_blocks(VC1Context *v)
                 if (!s->first_slice_line) {
                     v->vc1dsp.vc1_v_overlap(s->dest[0], s->linesize);
                     v->vc1dsp.vc1_v_overlap(s->dest[0] + 8, s->linesize);
-                    if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+                    if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
                         v->vc1dsp.vc1_v_overlap(s->dest[1], s->uvlinesize);
                         v->vc1dsp.vc1_v_overlap(s->dest[2], s->uvlinesize);
                     }
@@ -2814,7 +2723,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
                 vc1_decode_i_block_adv(v, block[k], k, val,
                                        (k < 4) ? v->codingset : v->codingset2, mquant);
 
-                if (k > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                if (CONFIG_GRAY && k > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                     continue;
                 v->vc1dsp.vc1_inv_trans_8x8(block[k]);
             }
@@ -2842,15 +2751,14 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
     /* raw bottom MB row */
     s->mb_x = 0;
     init_block_index(v);
-
-    for (;s->mb_x < s->mb_width; s->mb_x++) {
+    for (; s->mb_x < s->mb_width; s->mb_x++) {
         ff_update_block_index(s);
         vc1_put_signed_blocks_clamped(v);
         if (v->s.loop_filter)
             ff_vc1_loop_filter_iblk_delayed(v, v->pq);
     }
     if (v->s.loop_filter)
-        ff_mpeg_draw_horiz_band(s, (s->end_mb_y-1)*16, 16);
+        ff_mpeg_draw_horiz_band(s, (s->end_mb_y - 1) * 16, 16);
     ff_er_add_slice(&s->er, 0, s->start_mb_y << v->field_mode, s->mb_width - 1,
                     (s->end_mb_y << v->field_mode) - 1, ER_MB_END);
 }
@@ -2914,7 +2822,8 @@ static void vc1_decode_p_blocks(VC1Context *v)
         memmove(v->ttblk_base,    v->ttblk,    sizeof(v->ttblk_base[0])    * s->mb_stride);
         memmove(v->is_intra_base, v->is_intra, sizeof(v->is_intra_base[0]) * s->mb_stride);
         memmove(v->luma_mv_base,  v->luma_mv,  sizeof(v->luma_mv_base[0])  * s->mb_stride);
-        if (s->mb_y != s->start_mb_y) ff_mpeg_draw_horiz_band(s, (s->mb_y - 1) * 16, 16);
+        if (s->mb_y != s->start_mb_y)
+            ff_mpeg_draw_horiz_band(s, (s->mb_y - 1) * 16, 16);
         s->first_slice_line = 0;
     }
     if (apply_loop_filter) {
diff --git a/libavcodec/vc1_common.h b/libavcodec/vc1_common.h
index 788d324cd8..b46c33f7e2 100644
--- a/libavcodec/vc1_common.h
+++ b/libavcodec/vc1_common.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "internal.h"
 
 /** Markers used in VC-1 AP frame data */
 //@{
@@ -57,12 +58,9 @@ enum Profile {
  */
 static av_always_inline const uint8_t* find_next_marker(const uint8_t *src, const uint8_t *end)
 {
-    uint32_t mrk = 0xFFFFFFFF;
-
-    if (end-src < 4)
-        return end;
-    while (src < end) {
-        mrk = (mrk << 8) | *src++;
+    if (end - src >= 4) {
+        uint32_t mrk = 0xFFFFFFFF;
+        src = avpriv_find_start_code(src, end, &mrk);
         if (IS_MARKER(mrk))
             return src - 4;
     }
diff --git a/libavcodec/vc1_loopfilter.c b/libavcodec/vc1_loopfilter.c
index 52cff1e49b..025776bac9 100644
--- a/libavcodec/vc1_loopfilter.c
+++ b/libavcodec/vc1_loopfilter.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,6 +40,7 @@ void ff_vc1_loop_filter_iblk(VC1Context *v, int pq)
         if (s->mb_x)
             v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize, s->linesize, pq);
         v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize + 8, s->linesize, pq);
+        if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))
         for (j = 0; j < 2; j++) {
             v->vc1dsp.vc1_v_loop_filter8(s->dest[j + 1], s->uvlinesize, pq);
             if (s->mb_x)
@@ -51,8 +52,10 @@ void ff_vc1_loop_filter_iblk(VC1Context *v, int pq)
     if (s->mb_y == s->end_mb_y - 1) {
         if (s->mb_x) {
             v->vc1dsp.vc1_h_loop_filter16(s->dest[0], s->linesize, pq);
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
             v->vc1dsp.vc1_h_loop_filter8(s->dest[1], s->uvlinesize, pq);
             v->vc1dsp.vc1_h_loop_filter8(s->dest[2], s->uvlinesize, pq);
+            }
         }
         v->vc1dsp.vc1_h_loop_filter16(s->dest[0] + 8, s->linesize, pq);
     }
@@ -73,6 +76,7 @@ void ff_vc1_loop_filter_iblk_delayed(VC1Context *v, int pq)
                 if (s->mb_x >= 2)
                     v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize - 16, s->linesize, pq);
                 v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize - 8, s->linesize, pq);
+                if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 for (j = 0; j < 2; j++) {
                     v->vc1dsp.vc1_v_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize - 8, s->uvlinesize, pq);
                     if (s->mb_x >= 2) {
@@ -90,6 +94,7 @@ void ff_vc1_loop_filter_iblk_delayed(VC1Context *v, int pq)
                 if (s->mb_x)
                     v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize, s->linesize, pq);
                 v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize + 8, s->linesize, pq);
+                if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 for (j = 0; j < 2; j++) {
                     v->vc1dsp.vc1_v_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize, s->uvlinesize, pq);
                     if (s->mb_x >= 2) {
@@ -105,7 +110,7 @@ void ff_vc1_loop_filter_iblk_delayed(VC1Context *v, int pq)
                 if (s->mb_x >= 2)
                     v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize - 16, s->linesize, pq);
                 v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize - 8, s->linesize, pq);
-                if (s->mb_x >= 2) {
+                if (s->mb_x >= 2 && (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))) {
                     for (j = 0; j < 2; j++) {
                         v->vc1dsp.vc1_h_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize - 8, s->uvlinesize, pq);
                     }
@@ -116,7 +121,7 @@ void ff_vc1_loop_filter_iblk_delayed(VC1Context *v, int pq)
                 if (s->mb_x)
                     v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize, s->linesize, pq);
                 v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize + 8, s->linesize, pq);
-                if (s->mb_x) {
+                if (s->mb_x && (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))) {
                     for (j = 0; j < 2; j++) {
                         v->vc1dsp.vc1_h_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize, s->uvlinesize, pq);
                     }
@@ -150,7 +155,7 @@ void ff_vc1_smooth_overlap_filter_iblk(VC1Context *v)
                                       v->block[v->cur_blk_idx][0]);
             v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][3],
                                       v->block[v->cur_blk_idx][2]);
-            if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
                 v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][4],
                                           v->block[v->cur_blk_idx][4]);
                 v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][5],
@@ -169,7 +174,7 @@ void ff_vc1_smooth_overlap_filter_iblk(VC1Context *v)
                                           v->block[v->cur_blk_idx][0]);
                 v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][3],
                                           v->block[v->cur_blk_idx][1]);
-                if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+                if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
                     v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][4],
                                               v->block[v->cur_blk_idx][4]);
                     v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][5],
@@ -189,7 +194,7 @@ void ff_vc1_smooth_overlap_filter_iblk(VC1Context *v)
                                       v->block[v->left_blk_idx][0]);
             v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][3],
                                       v->block[v->left_blk_idx][1]);
-            if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
                 v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][4],
                                           v->block[v->left_blk_idx][4]);
                 v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][5],
@@ -209,7 +214,7 @@ static av_always_inline void vc1_apply_p_v_loop_filter(VC1Context *v, int block_
     int mb_cbp         = v->cbp[s->mb_x - s->mb_stride],
         block_cbp      = mb_cbp      >> (block_num * 4), bottom_cbp,
         mb_is_intra    = v->is_intra[s->mb_x - s->mb_stride],
-        block_is_intra = mb_is_intra >> (block_num * 4), bottom_is_intra;
+        block_is_intra = mb_is_intra >> block_num, bottom_is_intra;
     int idx, linesize  = block_num > 3 ? s->uvlinesize : s->linesize, ttblk;
     uint8_t *dst;
 
@@ -331,21 +336,22 @@ void ff_vc1_apply_p_loop_filter(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
     int i;
+    int block_count = CONFIG_GRAY && (s->avctx->flags & AV_CODEC_FLAG_GRAY) ? 4 : 6;
 
-    for (i = 0; i < 6; i++) {
+    for (i = 0; i < block_count; i++) {
         vc1_apply_p_v_loop_filter(v, i);
     }
 
     /* V always precedes H, therefore we run H one MB before V;
      * at the end of a row, we catch up to complete the row */
     if (s->mb_x) {
-        for (i = 0; i < 6; i++) {
+        for (i = 0; i < block_count; i++) {
             vc1_apply_p_h_loop_filter(v, i);
         }
         if (s->mb_x == s->mb_width - 1) {
             s->mb_x++;
             ff_update_block_index(s);
-            for (i = 0; i < 6; i++) {
+            for (i = 0; i < block_count; i++) {
                 vc1_apply_p_h_loop_filter(v, i);
             }
         }
diff --git a/libavcodec/vc1_mc.c b/libavcodec/vc1_mc.c
index 0faa6f26a9..4467646e73 100644
--- a/libavcodec/vc1_mc.c
+++ b/libavcodec/vc1_mc.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,140 @@
 #include "mpegvideo.h"
 #include "vc1.h"
 
+static av_always_inline void vc1_scale_luma(uint8_t *srcY,
+                                            int k, int linesize)
+{
+    int i, j;
+    for (j = 0; j < k; j++) {
+        for (i = 0; i < k; i++)
+            srcY[i] = ((srcY[i] - 128) >> 1) + 128;
+        srcY += linesize;
+    }
+}
+
+static av_always_inline void vc1_scale_chroma(uint8_t *srcU, uint8_t *srcV,
+                                              int k, int uvlinesize)
+{
+    int i, j;
+    for (j = 0; j < k; j++) {
+        for (i = 0; i < k; i++) {
+            srcU[i] = ((srcU[i] - 128) >> 1) + 128;
+            srcV[i] = ((srcV[i] - 128) >> 1) + 128;
+        }
+        srcU += uvlinesize;
+        srcV += uvlinesize;
+    }
+}
+
+static av_always_inline void vc1_lut_scale_luma(uint8_t *srcY,
+                                                uint8_t *lut1, uint8_t *lut2,
+                                                int k, int linesize)
+{
+    int i, j;
+
+    for (j = 0; j < k; j += 2) {
+        for (i = 0; i < k; i++)
+            srcY[i] = lut1[srcY[i]];
+        srcY += linesize;
+
+        if (j + 1 == k)
+            break;
+
+        for (i = 0; i < k; i++)
+            srcY[i] = lut2[srcY[i]];
+        srcY += linesize;
+    }
+}
+
+static av_always_inline void vc1_lut_scale_chroma(uint8_t *srcU, uint8_t *srcV,
+                                                  uint8_t *lut1, uint8_t *lut2,
+                                                  int k, int uvlinesize)
+{
+    int i, j;
+
+    for (j = 0; j < k; j += 2) {
+        for (i = 0; i < k; i++) {
+            srcU[i] = lut1[srcU[i]];
+            srcV[i] = lut1[srcV[i]];
+        }
+        srcU += uvlinesize;
+        srcV += uvlinesize;
+
+        if (j + 1 == k)
+            break;
+
+        for (i = 0; i < k; i++) {
+            srcU[i] = lut2[srcU[i]];
+            srcV[i] = lut2[srcV[i]];
+        }
+        srcU += uvlinesize;
+        srcV += uvlinesize;
+    }
+}
+
+static const uint8_t popcount4[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+
+static av_always_inline int get_luma_mv(VC1Context *v, int dir, int16_t *tx, int16_t *ty)
+{
+    MpegEncContext *s = &v->s;
+    int idx = v->mv_f[dir][s->block_index[0] + v->blocks_off] |
+             (v->mv_f[dir][s->block_index[1] + v->blocks_off] << 1) |
+             (v->mv_f[dir][s->block_index[2] + v->blocks_off] << 2) |
+             (v->mv_f[dir][s->block_index[3] + v->blocks_off] << 3);
+    static const uint8_t index2[16] = { 0, 0, 0, 0x23, 0, 0x13, 0x03, 0, 0, 0x12, 0x02, 0, 0x01, 0, 0, 0 };
+    int opp_count = popcount4[idx];
+
+    switch (opp_count) {
+    case 0:
+    case 4:
+        *tx = median4(s->mv[dir][0][0], s->mv[dir][1][0], s->mv[dir][2][0], s->mv[dir][3][0]);
+        *ty = median4(s->mv[dir][0][1], s->mv[dir][1][1], s->mv[dir][2][1], s->mv[dir][3][1]);
+        break;
+    case 1:
+        *tx = mid_pred(s->mv[dir][idx < 2][0], s->mv[dir][1 + (idx < 4)][0], s->mv[dir][2 + (idx < 8)][0]);
+        *ty = mid_pred(s->mv[dir][idx < 2][1], s->mv[dir][1 + (idx < 4)][1], s->mv[dir][2 + (idx < 8)][1]);
+        break;
+    case 3:
+        *tx = mid_pred(s->mv[dir][idx > 0xd][0], s->mv[dir][1 + (idx > 0xb)][0], s->mv[dir][2 + (idx > 0x7)][0]);
+        *ty = mid_pred(s->mv[dir][idx > 0xd][1], s->mv[dir][1 + (idx > 0xb)][1], s->mv[dir][2 + (idx > 0x7)][1]);
+        break;
+    case 2:
+        *tx = (s->mv[dir][index2[idx] >> 4][0] + s->mv[dir][index2[idx] & 0xf][0]) / 2;
+        *ty = (s->mv[dir][index2[idx] >> 4][1] + s->mv[dir][index2[idx] & 0xf][1]) / 2;
+        break;
+    }
+    return opp_count;
+}
+
+static av_always_inline int get_chroma_mv(VC1Context *v, int dir, int16_t *tx, int16_t *ty)
+{
+    MpegEncContext *s = &v->s;
+    int idx = !v->mb_type[0][s->block_index[0]] |
+             (!v->mb_type[0][s->block_index[1]] << 1) |
+             (!v->mb_type[0][s->block_index[2]] << 2) |
+             (!v->mb_type[0][s->block_index[3]] << 3);
+    static const uint8_t index2[16] = { 0, 0, 0, 0x01, 0, 0x02, 0x12, 0, 0, 0x03, 0x13, 0, 0x23, 0, 0, 0 };
+    int valid_count = popcount4[idx];
+
+    switch (valid_count) {
+    case 4:
+        *tx = median4(s->mv[dir][0][0], s->mv[dir][1][0], s->mv[dir][2][0], s->mv[dir][3][0]);
+        *ty = median4(s->mv[dir][0][1], s->mv[dir][1][1], s->mv[dir][2][1], s->mv[dir][3][1]);
+        break;
+    case 3:
+        *tx = mid_pred(s->mv[dir][idx > 0xd][0], s->mv[dir][1 + (idx > 0xb)][0], s->mv[dir][2 + (idx > 0x7)][0]);
+        *ty = mid_pred(s->mv[dir][idx > 0xd][1], s->mv[dir][1 + (idx > 0xb)][1], s->mv[dir][2 + (idx > 0x7)][1]);
+        break;
+    case 2:
+        *tx = (s->mv[dir][index2[idx] >> 4][0] + s->mv[dir][index2[idx] & 0xf][0]) / 2;
+        *ty = (s->mv[dir][index2[idx] >> 4][1] + s->mv[dir][index2[idx] & 0xf][1]) / 2;
+        break;
+    default:
+        return 0;
+    }
+    return valid_count;
+}
+
 /** Do motion compensation over 1 macroblock
  * Mostly adapted hpel_motion and qpel_motion from mpegvideo.c
  */
@@ -85,7 +219,7 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
             srcV = s->current_picture.f->data[2];
             luty  = v->curr_luty;
             lutuv = v->curr_lutuv;
-            use_ic = v->curr_use_ic;
+            use_ic = *v->curr_use_ic;
         } else {
             srcY = s->last_picture.f->data[0];
             srcU = s->last_picture.f->data[1];
@@ -136,7 +270,7 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
     }
 
     /* for grayscale we should not try to read from unknown area */
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY) {
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY) {
         srcU = s->sc.edge_emu_buffer + 18 * s->linesize;
         srcV = s->sc.edge_emu_buffer + 18 * s->linesize;
     }
@@ -145,81 +279,51 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
         || s->h_edge_pos < 22 || v_edge_pos < 22
         || (unsigned)(src_x - s->mspel) > s->h_edge_pos - (mx&3) - 16 - s->mspel * 3
         || (unsigned)(src_y - 1)        > v_edge_pos    - (my&3) - 16 - 3) {
-        uint8_t *uvbuf = s->sc.edge_emu_buffer + 19 * s->linesize;
+        uint8_t *ubuf = s->sc.edge_emu_buffer + 19 * s->linesize;
+        uint8_t *vbuf = ubuf + 9 * s->uvlinesize;
+        const int k = 17 + s->mspel * 2;
 
         srcY -= s->mspel * (1 + s->linesize);
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcY,
                                  s->linesize, s->linesize,
-                                 17 + s->mspel * 2, 17 + s->mspel * 2,
+                                 k, k,
                                  src_x - s->mspel, src_y - s->mspel,
                                  s->h_edge_pos, v_edge_pos);
         srcY = s->sc.edge_emu_buffer;
-        s->vdsp.emulated_edge_mc(uvbuf, srcU,
+        s->vdsp.emulated_edge_mc(ubuf, srcU,
                                  s->uvlinesize, s->uvlinesize,
                                  8 + 1, 8 + 1,
-                                 uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
-        s->vdsp.emulated_edge_mc(uvbuf + 16, srcV,
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, v_edge_pos >> 1);
+        s->vdsp.emulated_edge_mc(vbuf, srcV,
                                  s->uvlinesize, s->uvlinesize,
                                  8 + 1, 8 + 1,
-                                 uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
-        srcU = uvbuf;
-        srcV = uvbuf + 16;
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, v_edge_pos >> 1);
+        srcU = ubuf;
+        srcV = vbuf;
         /* if we deal with range reduction we need to scale source blocks */
         if (v->rangeredfrm) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src = srcY;
-            for (j = 0; j < 17 + s->mspel * 2; j++) {
-                for (i = 0; i < 17 + s->mspel * 2; i++)
-                    src[i] = ((src[i] - 128) >> 1) + 128;
-                src += s->linesize;
-            }
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                for (i = 0; i < 9; i++) {
-                    src[i]  = ((src[i]  - 128) >> 1) + 128;
-                    src2[i] = ((src2[i] - 128) >> 1) + 128;
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_scale_luma(srcY, k, s->linesize);
+            vc1_scale_chroma(srcU, srcV, 9, s->uvlinesize);
         }
         /* if we deal with intensity compensation we need to scale source blocks */
         if (use_ic) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src = srcY;
-            for (j = 0; j < 17 + s->mspel * 2; j++) {
-                int f = v->field_mode ? v->ref_field_type[dir] : ((j + src_y - s->mspel) & 1) ;
-                for (i = 0; i < 17 + s->mspel * 2; i++)
-                    src[i] = luty[f][src[i]];
-                src += s->linesize;
-            }
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                int f = v->field_mode ? v->ref_field_type[dir] : ((j + uvsrc_y) & 1);
-                for (i = 0; i < 9; i++) {
-                    src[i]  = lutuv[f][src[i]];
-                    src2[i] = lutuv[f][src2[i]];
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_lut_scale_luma(srcY,
+                               luty[v->field_mode ? v->ref_field_type[dir] : ((0 + src_y - s->mspel) & 1)],
+                               luty[v->field_mode ? v->ref_field_type[dir] : ((1 + src_y - s->mspel) & 1)],
+                               k, s->linesize);
+            vc1_lut_scale_chroma(srcU, srcV,
+                                 lutuv[v->field_mode ? v->ref_field_type[dir] : ((0 + uvsrc_y) & 1)],
+                                 lutuv[v->field_mode ? v->ref_field_type[dir] : ((1 + uvsrc_y) & 1)],
+                                 9, s->uvlinesize);
         }
         srcY += s->mspel * (1 + s->linesize);
     }
 
     if (s->mspel) {
         dxy = ((my & 3) << 2) | (mx & 3);
-        v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0]    , srcY    , s->linesize, v->rnd);
-        v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0] + 8, srcY + 8, s->linesize, v->rnd);
-        srcY += s->linesize * 8;
-        v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0] + 8 * s->linesize    , srcY    , s->linesize, v->rnd);
-        v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0] + 8 * s->linesize + 8, srcY + 8, s->linesize, v->rnd);
+        v->vc1dsp.put_vc1_mspel_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, v->rnd);
     } else { // hpel mc - always used for luma
         dxy = (my & 2) | ((mx & 2) >> 1);
         if (!v->rnd)
@@ -228,7 +332,7 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
             s->hdsp.put_no_rnd_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, 16);
     }
 
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
     /* Chroma MC always uses qpel bilinear */
     uvmx = (uvmx & 3) << 1;
@@ -242,17 +346,6 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
     }
 }
 
-static inline int median4(int a, int b, int c, int d)
-{
-    if (a < b) {
-        if (c < d) return (FFMIN(b, d) + FFMAX(a, c)) / 2;
-        else       return (FFMIN(b, c) + FFMAX(a, d)) / 2;
-    } else {
-        if (c < d) return (FFMIN(a, d) + FFMAX(b, c)) / 2;
-        else       return (FFMIN(a, c) + FFMAX(b, d)) / 2;
-    }
-}
-
 /** Do motion compensation for 4-MV macroblock - luminance block
  */
 void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
@@ -278,7 +371,7 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
         if (v->field_mode && (v->cur_field_type != v->ref_field_type[dir]) && v->second_field) {
             srcY = s->current_picture.f->data[0];
             luty = v->curr_luty;
-            use_ic = v->curr_use_ic;
+            use_ic = *v->curr_use_ic;
         } else {
             srcY = s->last_picture.f->data[0];
             luty = v->last_luty;
@@ -301,35 +394,10 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
     }
 
     if (s->pict_type == AV_PICTURE_TYPE_P && n == 3 && v->field_mode) {
-        int same_count = 0, opp_count = 0, k;
-        int chosen_mv[2][4][2], f;
-        int tx = 0, ty = 0;
-        for (k = 0; k < 4; k++) {
-            f = v->mv_f[0][s->block_index[k] + v->blocks_off];
-            chosen_mv[f][f ? opp_count : same_count][0] = s->mv[0][k][0];
-            chosen_mv[f][f ? opp_count : same_count][1] = s->mv[0][k][1];
-            opp_count  += f;
-            same_count += 1 - f;
-        }
-        f = opp_count > same_count;
-        switch (f ? opp_count : same_count) {
-        case 4:
-            tx = median4(chosen_mv[f][0][0], chosen_mv[f][1][0],
-                         chosen_mv[f][2][0], chosen_mv[f][3][0]);
-            ty = median4(chosen_mv[f][0][1], chosen_mv[f][1][1],
-                         chosen_mv[f][2][1], chosen_mv[f][3][1]);
-            break;
-        case 3:
-            tx = mid_pred(chosen_mv[f][0][0], chosen_mv[f][1][0], chosen_mv[f][2][0]);
-            ty = mid_pred(chosen_mv[f][0][1], chosen_mv[f][1][1], chosen_mv[f][2][1]);
-            break;
-        case 2:
-            tx = (chosen_mv[f][0][0] + chosen_mv[f][1][0]) / 2;
-            ty = (chosen_mv[f][0][1] + chosen_mv[f][1][1]) / 2;
-            break;
-        }
-        s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0] = tx;
-        s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][1] = ty;
+        int opp_count = get_luma_mv(v, 0,
+                                    &s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0],
+                                    &s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][1]);
+        int k, f = opp_count > 2;
         for (k = 0; k < 4; k++)
             v->mv_f[1][s->block_index[k] + v->blocks_off] = f;
     }
@@ -385,46 +453,36 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
     if (v->field_mode && v->ref_field_type[dir])
         srcY += s->current_picture_ptr->f->linesize[0];
 
-    if (fieldmv && !(src_y & 1))
-        v_edge_pos--;
-    if (fieldmv && (src_y & 1) && src_y < 4)
-        src_y--;
+    if (fieldmv) {
+        if (!(src_y & 1))
+            v_edge_pos--;
+        else
+            src_y -= (src_y < 4);
+    }
     if (v->rangeredfrm || use_ic
         || s->h_edge_pos < 13 || v_edge_pos < 23
         || (unsigned)(src_x - s->mspel) > s->h_edge_pos - (mx & 3) - 8 - s->mspel * 2
         || (unsigned)(src_y - (s->mspel << fieldmv)) > v_edge_pos - (my & 3) - ((8 + s->mspel * 2) << fieldmv)) {
+        const int k = 9 + s->mspel * 2;
+
         srcY -= s->mspel * (1 + (s->linesize << fieldmv));
         /* check emulate edge stride and offset */
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcY,
                                  s->linesize, s->linesize,
-                                 9 + s->mspel * 2, (9 + s->mspel * 2) << fieldmv,
+                                 k, k << fieldmv,
                                  src_x - s->mspel, src_y - (s->mspel << fieldmv),
                                  s->h_edge_pos, v_edge_pos);
         srcY = s->sc.edge_emu_buffer;
         /* if we deal with range reduction we need to scale source blocks */
         if (v->rangeredfrm) {
-            int i, j;
-            uint8_t *src;
-
-            src = srcY;
-            for (j = 0; j < 9 + s->mspel * 2; j++) {
-                for (i = 0; i < 9 + s->mspel * 2; i++)
-                    src[i] = ((src[i] - 128) >> 1) + 128;
-                src += s->linesize << fieldmv;
-            }
+            vc1_scale_luma(srcY, k, s->linesize << fieldmv);
         }
         /* if we deal with intensity compensation we need to scale source blocks */
         if (use_ic) {
-            int i, j;
-            uint8_t *src;
-
-            src = srcY;
-            for (j = 0; j < 9 + s->mspel * 2; j++) {
-                int f = v->field_mode ? v->ref_field_type[dir] : (((j<<fieldmv)+src_y - (s->mspel << fieldmv)) & 1);
-                for (i = 0; i < 9 + s->mspel * 2; i++)
-                    src[i] = luty[f][src[i]];
-                src += s->linesize << fieldmv;
-            }
+            vc1_lut_scale_luma(srcY,
+                               luty[v->field_mode ? v->ref_field_type[dir] : (((0<<fieldmv)+src_y - (s->mspel << fieldmv)) & 1)],
+                               luty[v->field_mode ? v->ref_field_type[dir] : (((1<<fieldmv)+src_y - (s->mspel << fieldmv)) & 1)],
+                               k, s->linesize << fieldmv);
         }
         srcY += s->mspel * (1 + (s->linesize << fieldmv));
     }
@@ -432,9 +490,9 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
     if (s->mspel) {
         dxy = ((my & 3) << 2) | (mx & 3);
         if (avg)
-            v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off, srcY, s->linesize << fieldmv, v->rnd);
+            v->vc1dsp.avg_vc1_mspel_pixels_tab[1][dxy](s->dest[0] + off, srcY, s->linesize << fieldmv, v->rnd);
         else
-            v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0] + off, srcY, s->linesize << fieldmv, v->rnd);
+            v->vc1dsp.put_vc1_mspel_pixels_tab[1][dxy](s->dest[0] + off, srcY, s->linesize << fieldmv, v->rnd);
     } else { // hpel mc - always used for luma
         dxy = (my & 2) | ((mx & 2) >> 1);
         if (!v->rnd)
@@ -444,59 +502,6 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
     }
 }
 
-static av_always_inline int get_chroma_mv(int *mvx, int *mvy, int *a, int flag, int *tx, int *ty)
-{
-    int idx, i;
-    static const int count[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
-
-    idx =  ((a[3] != flag) << 3)
-         | ((a[2] != flag) << 2)
-         | ((a[1] != flag) << 1)
-         |  (a[0] != flag);
-    if (!idx) {
-        *tx = median4(mvx[0], mvx[1], mvx[2], mvx[3]);
-        *ty = median4(mvy[0], mvy[1], mvy[2], mvy[3]);
-        return 4;
-    } else if (count[idx] == 1) {
-        switch (idx) {
-        case 0x1:
-            *tx = mid_pred(mvx[1], mvx[2], mvx[3]);
-            *ty = mid_pred(mvy[1], mvy[2], mvy[3]);
-            return 3;
-        case 0x2:
-            *tx = mid_pred(mvx[0], mvx[2], mvx[3]);
-            *ty = mid_pred(mvy[0], mvy[2], mvy[3]);
-            return 3;
-        case 0x4:
-            *tx = mid_pred(mvx[0], mvx[1], mvx[3]);
-            *ty = mid_pred(mvy[0], mvy[1], mvy[3]);
-            return 3;
-        case 0x8:
-            *tx = mid_pred(mvx[0], mvx[1], mvx[2]);
-            *ty = mid_pred(mvy[0], mvy[1], mvy[2]);
-            return 3;
-        }
-    } else if (count[idx] == 2) {
-        int t1 = 0, t2 = 0;
-        for (i = 0; i < 3; i++)
-            if (!a[i]) {
-                t1 = i;
-                break;
-            }
-        for (i = t1 + 1; i < 4; i++)
-            if (!a[i]) {
-                t2 = i;
-                break;
-            }
-        *tx = (mvx[t1] + mvx[t2]) / 2;
-        *ty = (mvy[t1] + mvy[t2]) / 2;
-        return 2;
-    } else {
-        return 0;
-    }
-    return -1;
-}
-
 /** Do motion compensation for 4-MV macroblock - both chroma blocks
  */
 void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
@@ -505,44 +510,30 @@ void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
     H264ChromaContext *h264chroma = &v->h264chroma;
     uint8_t *srcU, *srcV;
     int uvmx, uvmy, uvsrc_x, uvsrc_y;
-    int k, tx = 0, ty = 0;
-    int mvx[4], mvy[4], intra[4], mv_f[4];
-    int valid_count;
-    int chroma_ref_type = v->cur_field_type;
+    int16_t tx, ty;
+    int chroma_ref_type;
     int v_edge_pos = s->v_edge_pos >> v->field_mode;
     uint8_t (*lutuv)[256];
     int use_ic;
 
     if (!v->field_mode && !v->s.last_picture.f->data[0])
         return;
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
 
-    for (k = 0; k < 4; k++) {
-        mvx[k] = s->mv[dir][k][0];
-        mvy[k] = s->mv[dir][k][1];
-        intra[k] = v->mb_type[0][s->block_index[k]];
-        if (v->field_mode)
-            mv_f[k] = v->mv_f[dir][s->block_index[k] + v->blocks_off];
-    }
-
     /* calculate chroma MV vector from four luma MVs */
-    if (!v->field_mode || (v->field_mode && !v->numref)) {
-        valid_count = get_chroma_mv(mvx, mvy, intra, 0, &tx, &ty);
-        chroma_ref_type = v->reffield;
+    if (!v->field_mode || !v->numref) {
+        int valid_count = get_chroma_mv(v, dir, &tx, &ty);
         if (!valid_count) {
             s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0] = 0;
             s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][1] = 0;
             v->luma_mv[s->mb_x][0] = v->luma_mv[s->mb_x][1] = 0;
             return; //no need to do MC for intra blocks
         }
+        chroma_ref_type = v->ref_field_type[dir];
     } else {
-        int dominant = 0;
-        if (mv_f[0] + mv_f[1] + mv_f[2] + mv_f[3] > 2)
-            dominant = 1;
-        valid_count = get_chroma_mv(mvx, mvy, mv_f, dominant, &tx, &ty);
-        if (dominant)
-            chroma_ref_type = !v->cur_field_type;
+        int opp_count = get_luma_mv(v, dir, &tx, &ty);
+        chroma_ref_type = v->cur_field_type ^ (opp_count > 2);
     }
     if (v->field_mode && chroma_ref_type == 1 && v->cur_field_type == 1 && !v->s.last_picture.f->data[0])
         return;
@@ -578,7 +569,7 @@ void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
             srcU = s->current_picture.f->data[1];
             srcV = s->current_picture.f->data[2];
             lutuv = v->curr_lutuv;
-            use_ic = v->curr_use_ic;
+            use_ic = *v->curr_use_ic;
         } else {
             srcU = s->last_picture.f->data[1];
             srcV = s->last_picture.f->data[2];
@@ -624,36 +615,14 @@ void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
 
         /* if we deal with range reduction we need to scale source blocks */
         if (v->rangeredfrm) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                for (i = 0; i < 9; i++) {
-                    src[i]  = ((src[i]  - 128) >> 1) + 128;
-                    src2[i] = ((src2[i] - 128) >> 1) + 128;
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_scale_chroma(srcU, srcV, 9, s->uvlinesize);
         }
         /* if we deal with intensity compensation we need to scale source blocks */
         if (use_ic) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                int f = v->field_mode ? chroma_ref_type : ((j + uvsrc_y) & 1);
-                for (i = 0; i < 9; i++) {
-                    src[i]  = lutuv[f][src[i]];
-                    src2[i] = lutuv[f][src2[i]];
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_lut_scale_chroma(srcU, srcV,
+                                 lutuv[v->field_mode ? chroma_ref_type : ((0 + uvsrc_y) & 1)],
+                                 lutuv[v->field_mode ? chroma_ref_type : ((1 + uvsrc_y) & 1)],
+                                 9, s->uvlinesize);
         }
     }
 
@@ -680,13 +649,13 @@ void ff_vc1_mc_4mv_chroma4(VC1Context *v, int dir, int dir2, int avg)
     int uvmx_field[4], uvmy_field[4];
     int i, off, tx, ty;
     int fieldmv = v->blk_mv_type[s->block_index[0]];
-    static const int s_rndtblfield[16] = { 0, 0, 1, 2, 4, 4, 5, 6, 2, 2, 3, 8, 6, 6, 7, 12 };
+    static const uint8_t s_rndtblfield[16] = { 0, 0, 1, 2, 4, 4, 5, 6, 2, 2, 3, 8, 6, 6, 7, 12 };
     int v_dist = fieldmv ? 1 : 4; // vertical offset for lower sub-blocks
     int v_edge_pos = s->v_edge_pos >> 1;
     int use_ic;
     uint8_t (*lutuv)[256];
 
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
 
     for (i = 0; i < 4; i++) {
@@ -708,23 +677,29 @@ void ff_vc1_mc_4mv_chroma4(VC1Context *v, int dir, int dir2, int avg)
         uvsrc_x = av_clip(uvsrc_x, -8, s->avctx->coded_width  >> 1);
         uvsrc_y = av_clip(uvsrc_y, -8, s->avctx->coded_height >> 1);
         if (i < 2 ? dir : dir2) {
-            srcU = s->next_picture.f->data[1] + uvsrc_y * s->uvlinesize + uvsrc_x;
-            srcV = s->next_picture.f->data[2] + uvsrc_y * s->uvlinesize + uvsrc_x;
+            srcU = s->next_picture.f->data[1];
+            srcV = s->next_picture.f->data[2];
             lutuv  = v->next_lutuv;
             use_ic = v->next_use_ic;
         } else {
-            srcU = s->last_picture.f->data[1] + uvsrc_y * s->uvlinesize + uvsrc_x;
-            srcV = s->last_picture.f->data[2] + uvsrc_y * s->uvlinesize + uvsrc_x;
+            srcU = s->last_picture.f->data[1];
+            srcV = s->last_picture.f->data[2];
             lutuv  = v->last_lutuv;
             use_ic = v->last_use_ic;
         }
+        if (!srcU)
+            return;
+        srcU += uvsrc_y * s->uvlinesize + uvsrc_x;
+        srcV += uvsrc_y * s->uvlinesize + uvsrc_x;
         uvmx_field[i] = (uvmx_field[i] & 3) << 1;
         uvmy_field[i] = (uvmy_field[i] & 3) << 1;
 
-        if (fieldmv && !(uvsrc_y & 1))
-            v_edge_pos--;
-        if (fieldmv && (uvsrc_y & 1) && uvsrc_y < 2)
-            uvsrc_y--;
+        if (fieldmv) {
+            if (!(uvsrc_y & 1))
+                v_edge_pos = (s->v_edge_pos >> 1) - 1;
+            else
+                uvsrc_y -= (uvsrc_y < 2);
+        }
         if (use_ic
             || s->h_edge_pos < 10 || v_edge_pos < (5 << fieldmv)
             || (unsigned)uvsrc_x > (s->h_edge_pos >> 1) - 5
@@ -742,20 +717,10 @@ void ff_vc1_mc_4mv_chroma4(VC1Context *v, int dir, int dir2, int avg)
 
             /* if we deal with intensity compensation we need to scale source blocks */
             if (use_ic) {
-                int i, j;
-                uint8_t *src, *src2;
-
-                src  = srcU;
-                src2 = srcV;
-                for (j = 0; j < 5; j++) {
-                    int f = (uvsrc_y + (j << fieldmv)) & 1;
-                    for (i = 0; i < 5; i++) {
-                        src[i]  = lutuv[f][src[i]];
-                        src2[i] = lutuv[f][src2[i]];
-                    }
-                    src  += s->uvlinesize << fieldmv;
-                    src2 += s->uvlinesize << fieldmv;
-                }
+                vc1_lut_scale_chroma(srcU, srcV,
+                                     lutuv[(uvsrc_y + (0 << fieldmv)) & 1],
+                                     lutuv[(uvsrc_y + (1 << fieldmv)) & 1],
+                                     5, s->uvlinesize << fieldmv);
             }
         }
         if (avg) {
@@ -786,7 +751,6 @@ void ff_vc1_interp_mc(VC1Context *v)
     H264ChromaContext *h264chroma = &v->h264chroma;
     uint8_t *srcY, *srcU, *srcV;
     int dxy, mx, my, uvmx, uvmy, src_x, src_y, uvsrc_x, uvsrc_y;
-    int off, off_uv;
     int v_edge_pos = s->v_edge_pos >> v->field_mode;
     int use_ic = v->next_use_ic;
 
@@ -837,7 +801,7 @@ void ff_vc1_interp_mc(VC1Context *v)
     }
 
     /* for grayscale we should not try to read from unknown area */
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY) {
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY) {
         srcU = s->sc.edge_emu_buffer + 18 * s->linesize;
         srcV = s->sc.edge_emu_buffer + 18 * s->linesize;
     }
@@ -845,105 +809,72 @@ void ff_vc1_interp_mc(VC1Context *v)
     if (v->rangeredfrm || s->h_edge_pos < 22 || v_edge_pos < 22 || use_ic
         || (unsigned)(src_x - 1) > s->h_edge_pos - (mx & 3) - 16 - 3
         || (unsigned)(src_y - 1) > v_edge_pos    - (my & 3) - 16 - 3) {
-        uint8_t *uvbuf = s->sc.edge_emu_buffer + 19 * s->linesize;
+        uint8_t *ubuf = s->sc.edge_emu_buffer + 19 * s->linesize;
+        uint8_t *vbuf = ubuf + 9 * s->uvlinesize;
+        const int k = 17 + s->mspel * 2;
 
         srcY -= s->mspel * (1 + s->linesize);
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcY,
                                  s->linesize, s->linesize,
-                                 17 + s->mspel * 2, 17 + s->mspel * 2,
+                                 k, k,
                                  src_x - s->mspel, src_y - s->mspel,
                                  s->h_edge_pos, v_edge_pos);
         srcY = s->sc.edge_emu_buffer;
-        s->vdsp.emulated_edge_mc(uvbuf, srcU,
+        s->vdsp.emulated_edge_mc(ubuf, srcU,
                                  s->uvlinesize, s->uvlinesize,
                                  8 + 1, 8 + 1,
-                                 uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
-        s->vdsp.emulated_edge_mc(uvbuf + 16, srcV,
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, v_edge_pos >> 1);
+        s->vdsp.emulated_edge_mc(vbuf, srcV,
                                  s->uvlinesize, s->uvlinesize,
                                  8 + 1, 8 + 1,
-                                 uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
-        srcU = uvbuf;
-        srcV = uvbuf + 16;
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, v_edge_pos >> 1);
+        srcU = ubuf;
+        srcV = vbuf;
         /* if we deal with range reduction we need to scale source blocks */
         if (v->rangeredfrm) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src = srcY;
-            for (j = 0; j < 17 + s->mspel * 2; j++) {
-                for (i = 0; i < 17 + s->mspel * 2; i++)
-                    src[i] = ((src[i] - 128) >> 1) + 128;
-                src += s->linesize;
-            }
-            src = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                for (i = 0; i < 9; i++) {
-                    src[i]  = ((src[i]  - 128) >> 1) + 128;
-                    src2[i] = ((src2[i] - 128) >> 1) + 128;
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_scale_luma(srcY, k, s->linesize);
+            vc1_scale_chroma(srcU, srcV, 9, s->uvlinesize);
         }
 
         if (use_ic) {
             uint8_t (*luty )[256] = v->next_luty;
             uint8_t (*lutuv)[256] = v->next_lutuv;
-            int i, j;
-            uint8_t *src, *src2;
-
-            src = srcY;
-            for (j = 0; j < 17 + s->mspel * 2; j++) {
-                int f = v->field_mode ? v->ref_field_type[1] : ((j+src_y - s->mspel) & 1);
-                for (i = 0; i < 17 + s->mspel * 2; i++)
-                    src[i] = luty[f][src[i]];
-                src += s->linesize;
-            }
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                int f = v->field_mode ? v->ref_field_type[1] : ((j+uvsrc_y) & 1);
-                for (i = 0; i < 9; i++) {
-                    src[i]  = lutuv[f][src[i]];
-                    src2[i] = lutuv[f][src2[i]];
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_lut_scale_luma(srcY,
+                               luty[v->field_mode ? v->ref_field_type[1] : ((0+src_y - s->mspel) & 1)],
+                               luty[v->field_mode ? v->ref_field_type[1] : ((1+src_y - s->mspel) & 1)],
+                               k, s->linesize);
+            vc1_lut_scale_chroma(srcU, srcV,
+                                 lutuv[v->field_mode ? v->ref_field_type[1] : ((0+uvsrc_y) & 1)],
+                                 lutuv[v->field_mode ? v->ref_field_type[1] : ((1+uvsrc_y) & 1)],
+                                 9, s->uvlinesize);
         }
         srcY += s->mspel * (1 + s->linesize);
     }
 
-    off    = 0;
-    off_uv = 0;
-
     if (s->mspel) {
         dxy = ((my & 3) << 2) | (mx & 3);
-        v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off    , srcY    , s->linesize, v->rnd);
-        v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off + 8, srcY + 8, s->linesize, v->rnd);
-        srcY += s->linesize * 8;
-        v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off + 8 * s->linesize    , srcY    , s->linesize, v->rnd);
-        v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off + 8 * s->linesize + 8, srcY + 8, s->linesize, v->rnd);
+        v->vc1dsp.avg_vc1_mspel_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, v->rnd);
     } else { // hpel mc
         dxy = (my & 2) | ((mx & 2) >> 1);
 
         if (!v->rnd)
-            s->hdsp.avg_pixels_tab[0][dxy](s->dest[0] + off, srcY, s->linesize, 16);
+            s->hdsp.avg_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, 16);
         else
-            s->hdsp.avg_no_rnd_pixels_tab[dxy](s->dest[0] + off, srcY, s->linesize, 16);
+            s->hdsp.avg_no_rnd_pixels_tab[dxy](s->dest[0], srcY, s->linesize, 16);
     }
 
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
     /* Chroma MC always uses qpel blilinear */
     uvmx = (uvmx & 3) << 1;
     uvmy = (uvmy & 3) << 1;
     if (!v->rnd) {
-        h264chroma->avg_h264_chroma_pixels_tab[0](s->dest[1] + off_uv, srcU, s->uvlinesize, 8, uvmx, uvmy);
-        h264chroma->avg_h264_chroma_pixels_tab[0](s->dest[2] + off_uv, srcV, s->uvlinesize, 8, uvmx, uvmy);
+        h264chroma->avg_h264_chroma_pixels_tab[0](s->dest[1], srcU, s->uvlinesize, 8, uvmx, uvmy);
+        h264chroma->avg_h264_chroma_pixels_tab[0](s->dest[2], srcV, s->uvlinesize, 8, uvmx, uvmy);
     } else {
-        v->vc1dsp.avg_no_rnd_vc1_chroma_pixels_tab[0](s->dest[1] + off_uv, srcU, s->uvlinesize, 8, uvmx, uvmy);
-        v->vc1dsp.avg_no_rnd_vc1_chroma_pixels_tab[0](s->dest[2] + off_uv, srcV, s->uvlinesize, 8, uvmx, uvmy);
+        v->vc1dsp.avg_no_rnd_vc1_chroma_pixels_tab[0](s->dest[1], srcU, s->uvlinesize, 8, uvmx, uvmy);
+        v->vc1dsp.avg_no_rnd_vc1_chroma_pixels_tab[0](s->dest[2], srcV, s->uvlinesize, 8, uvmx, uvmy);
     }
 }
diff --git a/libavcodec/vc1_parser.c b/libavcodec/vc1_parser.c
index 38b62f72bd..9ca6154e71 100644
--- a/libavcodec/vc1_parser.c
+++ b/libavcodec/vc1_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,7 @@
 #include "parser.h"
 #include "vc1.h"
 #include "get_bits.h"
+#include "internal.h"
 
 /** The maximum number of bytes of a sequence, entry point or
  *  frame header whose values we pay any attention to */
@@ -63,6 +64,7 @@ static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
     /* Parse the header we just finished unescaping */
     VC1ParseContext *vpc = s->priv_data;
     GetBitContext gb;
+    int ret;
     vpc->v.s.avctx = avctx;
     vpc->v.parse_only = 1;
     init_get_bits(&gb, buf, buf_size * 8);
@@ -75,9 +77,12 @@ static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
         break;
     case VC1_CODE_FRAME & 0xFF:
         if(vpc->v.profile < PROFILE_ADVANCED)
-            ff_vc1_parse_frame_header    (&vpc->v, &gb);
+            ret = ff_vc1_parse_frame_header    (&vpc->v, &gb);
         else
-            ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
+            ret = ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
+
+        if (ret < 0)
+            break;
 
         /* keep AV_PICTURE_TYPE_BI internal to VC1 */
         if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI)
@@ -108,6 +113,8 @@ static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
 
         break;
     }
+    if (avctx->framerate.num)
+        avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
 }
 
 static int vc1_parse(AVCodecParserContext *s,
@@ -233,7 +240,7 @@ static int vc1_parse(AVCodecParserContext *s,
      * the start code we've already seen, or cause extra bytes to be
      * inserted at the start of the unescaped buffer. */
     vpc->bytes_to_skip = 4;
-    if (next < 0 && start_code_found)
+    if (next < 0 && next != END_NOT_FOUND)
         vpc->bytes_to_skip += next;
 
     *poutbuf = buf;
@@ -244,20 +251,18 @@ static int vc1_parse(AVCodecParserContext *s,
 static int vc1_split(AVCodecContext *avctx,
                            const uint8_t *buf, int buf_size)
 {
-    int i;
-    uint32_t state= -1;
-    int charged=0;
+    uint32_t state = -1;
+    int charged = 0;
+    const uint8_t *ptr = buf, *end = buf + buf_size;
 
-    for(i=0; i<buf_size; i++){
-        state= (state<<8) | buf[i];
-        if(IS_MARKER(state)){
-            if(state == VC1_CODE_SEQHDR || state == VC1_CODE_ENTRYPOINT){
-                charged=1;
-            }else if(charged){
-                return i-3;
-            }
-        }
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
+        if (state == VC1_CODE_SEQHDR || state == VC1_CODE_ENTRYPOINT) {
+            charged = 1;
+        } else if (charged && IS_MARKER(state))
+            return ptr - 4 - buf;
     }
+
     return 0;
 }
 
@@ -265,6 +270,7 @@ static av_cold int vc1_parse_init(AVCodecParserContext *s)
 {
     VC1ParseContext *vpc = s->priv_data;
     vpc->v.s.slice_context_count = 1;
+    vpc->v.first_pic_header_flag = 1;
     vpc->prev_start_code = 0;
     vpc->bytes_to_skip = 0;
     vpc->unesc_index = 0;
diff --git a/libavcodec/vc1_pred.c b/libavcodec/vc1_pred.c
index 96426f53ea..13134e5d8a 100644
--- a/libavcodec/vc1_pred.c
+++ b/libavcodec/vc1_pred.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -170,9 +170,9 @@ static av_always_inline int scaleforsame(VC1Context *v, int i, int n /* MV */,
     n >>= hpel;
     if (v->s.pict_type != AV_PICTURE_TYPE_B || v->second_field || !dir) {
         if (dim)
-            n = scaleforsame_y(v, i, n, dir) << hpel;
+            n = scaleforsame_y(v, i, n, dir) * (1 << hpel);
         else
-            n = scaleforsame_x(v, n, dir) << hpel;
+            n = scaleforsame_x(v, n, dir) * (1 << hpel);
         return n;
     }
     brfd      = FFMIN(v->brfd, 3);
@@ -202,7 +202,7 @@ static av_always_inline int scaleforopp(VC1Context *v, int n /* MV */,
         refdist = dir ? v->brfd : v->frfd;
     scaleopp = ff_vc1_field_mvpred_scales[dir ^ v->second_field][0][refdist];
 
-    n = (n * scaleopp >> 8) << hpel;
+    n = (n * scaleopp >> 8) * (1 << hpel);
     return n;
 }
 
@@ -231,8 +231,10 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
     else
         mixedmv_pic = 0;
     /* scale MV difference to be quad-pel */
-    dmv_x <<= 1 - s->quarter_sample;
-    dmv_y <<= 1 - s->quarter_sample;
+    if (!s->quarter_sample) {
+        dmv_x *= 2;
+        dmv_y *= 2;
+    }
 
     wrap = s->b8_stride;
     xy   = s->block_index[n];
@@ -392,17 +394,13 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
     /* Pullback MV as specified in 8.3.5.3.4 */
     if (!v->field_mode) {
         int qx, qy, X, Y;
+        int MV = mv1 ? -60 : -28;
         qx = (s->mb_x << 6) + ((n == 1 || n == 3) ? 32 : 0);
         qy = (s->mb_y << 6) + ((n == 2 || n == 3) ? 32 : 0);
         X  = (s->mb_width  << 6) - 4;
         Y  = (s->mb_height << 6) - 4;
-        if (mv1) {
-            if (qx + px < -60) px = -60 - qx;
-            if (qy + py < -60) py = -60 - qy;
-        } else {
-            if (qx + px < -28) px = -28 - qx;
-            if (qy + py < -28) py = -28 - qy;
-        }
+        if (qx + px < MV) px = MV - qx;
+        if (qy + py < MV) py = MV - qy;
         if (qx + px > X) px = X - qx;
         if (qy + py > Y) py = Y - qy;
     }
@@ -602,9 +600,9 @@ void ff_vc1_pred_mv_intfr(VC1Context *v, int n, int dmv_x, int dmv_y,
                 px = mid_pred(A[0], B[0], C[0]);
                 py = mid_pred(A[1], B[1], C[1]);
             } else if (total_valid) {
-                if (a_valid) { px = A[0]; py = A[1]; }
-                if (b_valid) { px = B[0]; py = B[1]; }
-                if (c_valid) { px = C[0]; py = C[1]; }
+                if      (a_valid) { px = A[0]; py = A[1]; }
+                else if (b_valid) { px = B[0]; py = B[1]; }
+                else              { px = C[0]; py = C[1]; }
             }
         }
     } else {
@@ -644,7 +642,8 @@ void ff_vc1_pred_mv_intfr(VC1Context *v, int n, int dmv_x, int dmv_y,
                 } else if (!field_b && b_valid) {
                     px = B[0];
                     py = B[1];
-                } else if (c_valid) {
+                } else /*if (c_valid)*/ {
+                    av_assert1(c_valid);
                     px = C[0];
                     py = C[1];
                 }
@@ -652,7 +651,8 @@ void ff_vc1_pred_mv_intfr(VC1Context *v, int n, int dmv_x, int dmv_y,
                 if (field_a && a_valid) {
                     px = A[0];
                     py = A[1];
-                } else if (field_b && b_valid) {
+                } else /*if (field_b && b_valid)*/ {
+                    av_assert1(field_b && b_valid);
                     px = B[0];
                     py = B[1];
                 }
@@ -692,25 +692,31 @@ void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
     int r_x, r_y;
     const uint8_t *is_intra = v->mb_type[0];
 
+    av_assert0(!v->field_mode);
+
     r_x = v->range_x;
     r_y = v->range_y;
     /* scale MV difference to be quad-pel */
-    dmv_x[0] <<= 1 - s->quarter_sample;
-    dmv_y[0] <<= 1 - s->quarter_sample;
-    dmv_x[1] <<= 1 - s->quarter_sample;
-    dmv_y[1] <<= 1 - s->quarter_sample;
+    if (!s->quarter_sample) {
+        dmv_x[0] *= 2;
+        dmv_y[0] *= 2;
+        dmv_x[1] *= 2;
+        dmv_y[1] *= 2;
+    }
 
     wrap = s->b8_stride;
     xy = s->block_index[0];
 
     if (s->mb_intra) {
-        s->current_picture.motion_val[0][xy + v->blocks_off][0] =
-        s->current_picture.motion_val[0][xy + v->blocks_off][1] =
-        s->current_picture.motion_val[1][xy + v->blocks_off][0] =
-        s->current_picture.motion_val[1][xy + v->blocks_off][1] = 0;
+        s->current_picture.motion_val[0][xy][0] =
+        s->current_picture.motion_val[0][xy][1] =
+        s->current_picture.motion_val[1][xy][0] =
+        s->current_picture.motion_val[1][xy][1] = 0;
         return;
     }
-    if (!v->field_mode) {
+        if (direct && s->next_picture_ptr->field_picture)
+            av_log(s->avctx, AV_LOG_WARNING, "Mixed frame/field direct mode not supported\n");
+
         s->mv[0][0][0] = scale_mv(s->next_picture.motion_val[1][xy][0], v->bfraction, 0, s->quarter_sample);
         s->mv[0][0][1] = scale_mv(s->next_picture.motion_val[1][xy][1], v->bfraction, 0, s->quarter_sample);
         s->mv[1][0][0] = scale_mv(s->next_picture.motion_val[1][xy][0], v->bfraction, 1, s->quarter_sample);
@@ -721,12 +727,11 @@ void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
         s->mv[0][0][1] = av_clip(s->mv[0][0][1], -60 - (s->mb_y << 6), (s->mb_height << 6) - 4 - (s->mb_y << 6));
         s->mv[1][0][0] = av_clip(s->mv[1][0][0], -60 - (s->mb_x << 6), (s->mb_width  << 6) - 4 - (s->mb_x << 6));
         s->mv[1][0][1] = av_clip(s->mv[1][0][1], -60 - (s->mb_y << 6), (s->mb_height << 6) - 4 - (s->mb_y << 6));
-    }
     if (direct) {
-        s->current_picture.motion_val[0][xy + v->blocks_off][0] = s->mv[0][0][0];
-        s->current_picture.motion_val[0][xy + v->blocks_off][1] = s->mv[0][0][1];
-        s->current_picture.motion_val[1][xy + v->blocks_off][0] = s->mv[1][0][0];
-        s->current_picture.motion_val[1][xy + v->blocks_off][1] = s->mv[1][0][1];
+        s->current_picture.motion_val[0][xy][0] = s->mv[0][0][0];
+        s->current_picture.motion_val[0][xy][1] = s->mv[0][0][1];
+        s->current_picture.motion_val[1][xy][0] = s->mv[1][0][0];
+        s->current_picture.motion_val[1][xy][1] = s->mv[1][0][1];
         return;
     }
 
@@ -754,25 +759,16 @@ void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
         /* Pullback MV as specified in 8.3.5.3.4 */
         {
             int qx, qy, X, Y;
-            if (v->profile < PROFILE_ADVANCED) {
-                qx = (s->mb_x << 5);
-                qy = (s->mb_y << 5);
-                X  = (s->mb_width  << 5) - 4;
-                Y  = (s->mb_height << 5) - 4;
-                if (qx + px < -28) px = -28 - qx;
-                if (qy + py < -28) py = -28 - qy;
-                if (qx + px > X) px = X - qx;
-                if (qy + py > Y) py = Y - qy;
-            } else {
-                qx = (s->mb_x << 6);
-                qy = (s->mb_y << 6);
-                X  = (s->mb_width  << 6) - 4;
-                Y  = (s->mb_height << 6) - 4;
-                if (qx + px < -60) px = -60 - qx;
-                if (qy + py < -60) py = -60 - qy;
-                if (qx + px > X) px = X - qx;
-                if (qy + py > Y) py = Y - qy;
-            }
+            int sh = v->profile < PROFILE_ADVANCED ? 5 : 6;
+            int MV = 4 - (1 << sh);
+            qx = (s->mb_x << sh);
+            qy = (s->mb_y << sh);
+            X  = (s->mb_width  << sh) - 4;
+            Y  = (s->mb_height << sh) - 4;
+            if (qx + px < MV) px = MV - qx;
+            if (qy + py < MV) py = MV - qy;
+            if (qx + px > X) px = X - qx;
+            if (qy + py > Y) py = Y - qy;
         }
         /* Calculate hybrid prediction as specified in 8.3.5.3.5 */
         if (0 && !s->first_slice_line && s->mb_x) {
@@ -833,25 +829,16 @@ void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
         /* Pullback MV as specified in 8.3.5.3.4 */
         {
             int qx, qy, X, Y;
-            if (v->profile < PROFILE_ADVANCED) {
-                qx = (s->mb_x << 5);
-                qy = (s->mb_y << 5);
-                X  = (s->mb_width  << 5) - 4;
-                Y  = (s->mb_height << 5) - 4;
-                if (qx + px < -28) px = -28 - qx;
-                if (qy + py < -28) py = -28 - qy;
-                if (qx + px > X) px = X - qx;
-                if (qy + py > Y) py = Y - qy;
-            } else {
-                qx = (s->mb_x << 6);
-                qy = (s->mb_y << 6);
-                X  = (s->mb_width  << 6) - 4;
-                Y  = (s->mb_height << 6) - 4;
-                if (qx + px < -60) px = -60 - qx;
-                if (qy + py < -60) py = -60 - qy;
-                if (qx + px > X) px = X - qx;
-                if (qy + py > Y) py = Y - qy;
-            }
+            int sh = v->profile < PROFILE_ADVANCED ? 5 : 6;
+            int MV = 4 - (1 << sh);
+            qx = (s->mb_x << sh);
+            qy = (s->mb_y << sh);
+            X  = (s->mb_width  << sh) - 4;
+            Y  = (s->mb_height << sh) - 4;
+            if (qx + px < MV) px = MV - qx;
+            if (qy + py < MV) py = MV - qy;
+            if (qx + px > X) px = X - qx;
+            if (qy + py > Y) py = Y - qy;
         }
         /* Calculate hybrid prediction as specified in 8.3.5.3.5 */
         if (0 && !s->first_slice_line && s->mb_x) {
diff --git a/libavcodec/vc1_pred.h b/libavcodec/vc1_pred.h
index 34c9c1a08c..4d47f86696 100644
--- a/libavcodec/vc1_pred.h
+++ b/libavcodec/vc1_pred.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vc1acdata.h b/libavcodec/vc1acdata.h
index 73ebe40bdf..a70b44ae05 100644
--- a/libavcodec/vc1acdata.h
+++ b/libavcodec/vc1acdata.h
@@ -2,20 +2,20 @@
  * VC-1 and WMV3 decoder
  * copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vc1data.c b/libavcodec/vc1data.c
index 70cead8b2c..fc9ba6da13 100644
--- a/libavcodec/vc1data.c
+++ b/libavcodec/vc1data.c
@@ -4,20 +4,20 @@
  * copyright (c) 2006 Konstantin Shishkov
  * (c) 2005 anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -1019,21 +1019,21 @@ const uint8_t ff_vc1_mv_diff_bits[4][73] = {
 /* DC differentials low+hi-mo, p217 are the same as in msmpeg4data .h */
 
 /* Table 232 */
-const int8_t ff_vc1_simple_progressive_4x4_zz [16] = {
+const uint8_t ff_vc1_simple_progressive_4x4_zz [16] = {
      0,     8,    16,     1,
      9,    24,    17,     2,
     10,    18,    25,     3,
     11,    26,    19,    27
 };
 
-const int8_t ff_vc1_adv_progressive_8x4_zz [32] = { /* Table 233 */
+const uint8_t ff_vc1_adv_progressive_8x4_zz [32] = { /* Table 233 */
      0,     8,     1,    16,     2,     9,    10,     3,
     24,    17,     4,    11,    18,    12,     5,    19,
     25,    13,    20,    26,    27,     6,    21,    28,
     14,    22,    29,     7,    30,    15,    23,    31
 };
 
-const int8_t ff_vc1_adv_progressive_4x8_zz [32] = { /* Table 234 */
+const uint8_t ff_vc1_adv_progressive_4x8_zz [32] = { /* Table 234 */
      0,     1,     8,     2,
      9,    16,    17,    24,
     10,    32,    25,    18,
@@ -1044,7 +1044,7 @@ const int8_t ff_vc1_adv_progressive_4x8_zz [32] = { /* Table 234 */
     35,    43,    51,    59
 };
 
-const int8_t ff_vc1_adv_interlaced_8x8_zz [64] = { /* Table 235 */
+const uint8_t ff_vc1_adv_interlaced_8x8_zz [64] = { /* Table 235 */
      0,     8,     1,    16,    24,     9,     2,    32,
     40,    48,    56,    17,    10,     3,    25,    18,
     11,     4,    33,    41,    49,    57,    26,    34,
@@ -1055,14 +1055,14 @@ const int8_t ff_vc1_adv_interlaced_8x8_zz [64] = { /* Table 235 */
     61,    62,    54,    46,    39,    47,    55,    63
 };
 
-const int8_t ff_vc1_adv_interlaced_8x4_zz [32] = { /* Table 236 */
+const uint8_t ff_vc1_adv_interlaced_8x4_zz [32] = { /* Table 236 */
      0,     8,    16,    24,     1,     9,     2,    17,
     25,    10,     3,    18,    26,     4,    11,    19,
     12,     5,    13,    20,    27,     6,    21,    28,
     14,    22,    29,     7,    30,    15,    23,    31
 };
 
-const int8_t ff_vc1_adv_interlaced_4x8_zz [32] = { /* Table 237 */
+const uint8_t ff_vc1_adv_interlaced_4x8_zz [32] = { /* Table 237 */
      0,     1,     2,     8,
     16,     9,    24,    17,
     10,     3,    32,    40,
@@ -1073,7 +1073,7 @@ const int8_t ff_vc1_adv_interlaced_4x8_zz [32] = { /* Table 237 */
     35,    43,    51,    59
 };
 
-const int8_t ff_vc1_adv_interlaced_4x4_zz [16] = { /* Table 238 */
+const uint8_t ff_vc1_adv_interlaced_4x4_zz [16] = { /* Table 238 */
      0,     8,    16,    24,
      1,     9,    17,     2,
     25,    10,    18,     3,
diff --git a/libavcodec/vc1data.h b/libavcodec/vc1data.h
index 66c569b797..763cd48a60 100644
--- a/libavcodec/vc1data.h
+++ b/libavcodec/vc1data.h
@@ -3,20 +3,20 @@
  * copyright (c) 2006 Konstantin Shishkov
  * (c) 2005 anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -181,15 +181,15 @@ extern const uint8_t ff_vc1_2ref_mvdata_bits[8][126];
 /* DC differentials low+hi-mo, p217 are the same as in msmpeg4data .h */
 
 /* Scantables/ZZ scan are at 11.9 (p262) and 8.1.1.12 (p10) */
-extern const int8_t ff_vc1_simple_progressive_4x4_zz [16];
-extern const int8_t ff_vc1_adv_progressive_8x4_zz [32];
-extern const int8_t ff_vc1_adv_progressive_4x8_zz [32];
-extern const int8_t ff_vc1_adv_interlaced_8x8_zz [64];
-extern const int8_t ff_vc1_adv_interlaced_8x4_zz [32];
-extern const int8_t ff_vc1_adv_interlaced_4x8_zz [32];
-extern const int8_t ff_vc1_adv_interlaced_4x4_zz [16];
-extern const int8_t ff_vc1_intra_horz_8x8_zz [64];
-extern const int8_t ff_vc1_intra_vert_8x8_zz [64];
+extern const uint8_t ff_vc1_simple_progressive_4x4_zz [16];
+extern const uint8_t ff_vc1_adv_progressive_8x4_zz [32];
+extern const uint8_t ff_vc1_adv_progressive_4x8_zz [32];
+extern const uint8_t ff_vc1_adv_interlaced_8x8_zz [64];
+extern const uint8_t ff_vc1_adv_interlaced_8x4_zz [32];
+extern const uint8_t ff_vc1_adv_interlaced_4x8_zz [32];
+extern const uint8_t ff_vc1_adv_interlaced_4x4_zz [16];
+extern const uint8_t ff_vc1_intra_horz_8x8_zz [64];
+extern const uint8_t ff_vc1_intra_vert_8x8_zz [64];
 
 /* DQScale as specified in 8.1.3.9 - almost identical to 0x40000/i */
 extern const int32_t ff_vc1_dqscale[63];
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index 8e8411289e..acd29bcd2b 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,6 +36,9 @@
 #include "msmpeg4data.h"
 #include "vc1.h"
 #include "vc1data.h"
+#include "vdpau_compat.h"
+#include "libavutil/avassert.h"
+
 
 #if CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER
 
@@ -97,7 +100,7 @@ static void vc1_sprite_parse_transform(GetBitContext* gb, int c[7])
         c[6] = 1 << 16;
 }
 
-static void vc1_parse_sprites(VC1Context *v, GetBitContext* gb, SpriteData* sd)
+static int vc1_parse_sprites(VC1Context *v, GetBitContext* gb, SpriteData* sd)
 {
     AVCodecContext *avctx = v->s.avctx;
     int sprite, i;
@@ -141,7 +144,7 @@ static void vc1_parse_sprites(VC1Context *v, GetBitContext* gb, SpriteData* sd)
         sd->effect_pcount2 = get_bits(gb, 16);
         if (sd->effect_pcount2 > 10) {
             av_log(avctx, AV_LOG_ERROR, "Too many effect parameters\n");
-            return;
+            return AVERROR_INVALIDDATA;
         } else if (sd->effect_pcount2) {
             i = -1;
             av_log(avctx, AV_LOG_DEBUG, "Effect params 2: ");
@@ -158,10 +161,14 @@ static void vc1_parse_sprites(VC1Context *v, GetBitContext* gb, SpriteData* sd)
         av_log(avctx, AV_LOG_DEBUG, "Effect flag set\n");
 
     if (get_bits_count(gb) >= gb->size_in_bits +
-       (avctx->codec_id == AV_CODEC_ID_WMV3IMAGE ? 64 : 0))
+       (avctx->codec_id == AV_CODEC_ID_WMV3IMAGE ? 64 : 0)) {
         av_log(avctx, AV_LOG_ERROR, "Buffer overrun\n");
+        return AVERROR_INVALIDDATA;
+    }
     if (get_bits_count(gb) < gb->size_in_bits - 8)
         av_log(avctx, AV_LOG_WARNING, "Buffer not fully read\n");
+
+    return 0;
 }
 
 static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
@@ -173,7 +180,7 @@ static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
     int ysub[2];
     MpegEncContext *s = &v->s;
 
-    for (i = 0; i < 2; i++) {
+    for (i = 0; i <= v->two_sprites; i++) {
         xoff[i] = av_clip(sd->coefs[i][2], 0, v->sprite_width-1 << 16);
         xadv[i] = sd->coefs[i][0];
         if (xadv[i] != 1<<16 || (v->sprite_width << 16) - (v->output_width << 16) - xoff[i])
@@ -184,7 +191,7 @@ static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
     }
     alpha = av_clip_uint16(sd->coefs[1][6]);
 
-    for (plane = 0; plane < (s->avctx->flags & AV_CODEC_FLAG_GRAY ? 1 : 3); plane++) {
+    for (plane = 0; plane < (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY ? 1 : 3); plane++) {
         int width = v->output_width>>!!plane;
 
         for (row = 0; row < v->output_height>>!!plane; row++) {
@@ -251,7 +258,7 @@ static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
         }
 
         if (!plane) {
-            for (i = 0; i < 2; i++) {
+            for (i = 0; i <= v->two_sprites; i++) {
                 xoff[i] >>= 1;
                 yoff[i] >>= 1;
             }
@@ -263,15 +270,20 @@ static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
 
 static int vc1_decode_sprites(VC1Context *v, GetBitContext* gb)
 {
+    int ret;
     MpegEncContext *s     = &v->s;
     AVCodecContext *avctx = s->avctx;
     SpriteData sd;
 
-    vc1_parse_sprites(v, gb, &sd);
+    memset(&sd, 0, sizeof(sd));
 
-    if (!s->current_picture.f->data[0]) {
+    ret = vc1_parse_sprites(v, gb, &sd);
+    if (ret < 0)
+        return ret;
+
+    if (!s->current_picture.f || !s->current_picture.f->data[0]) {
         av_log(avctx, AV_LOG_ERROR, "Got no sprites\n");
-        return -1;
+        return AVERROR_UNKNOWN;
     }
 
     if (v->two_sprites && (!s->last_picture_ptr || !s->last_picture.f->data[0])) {
@@ -280,10 +292,8 @@ static int vc1_decode_sprites(VC1Context *v, GetBitContext* gb)
     }
 
     av_frame_unref(v->sprite_output_frame);
-    if (ff_get_buffer(avctx, v->sprite_output_frame, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return -1;
-    }
+    if ((ret = ff_get_buffer(avctx, v->sprite_output_frame, 0)) < 0)
+        return ret;
 
     vc1_draw_sprites(v, &sd);
 
@@ -302,7 +312,7 @@ static void vc1_sprite_flush(AVCodecContext *avctx)
        wrong but it looks better than doing nothing. */
 
     if (f && f->data[0])
-        for (plane = 0; plane < (s->avctx->flags & AV_CODEC_FLAG_GRAY ? 1 : 3); plane++)
+        for (plane = 0; plane < (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY ? 1 : 3); plane++)
             for (i = 0; i < v->sprite_height>>!!plane; i++)
                 memset(f->data[plane] + i * f->linesize[plane],
                        plane ? 128 : 0, f->linesize[plane]);
@@ -332,7 +342,7 @@ av_cold int ff_vc1_decode_init_alloc_tables(VC1Context *v)
     v->ttblk            = v->ttblk_base + s->mb_stride;
     v->is_intra_base    = av_mallocz(sizeof(v->is_intra_base[0]) * 2 * s->mb_stride);
     v->is_intra         = v->is_intra_base + s->mb_stride;
-    v->luma_mv_base     = av_malloc(sizeof(v->luma_mv_base[0]) * 2 * s->mb_stride);
+    v->luma_mv_base     = av_mallocz(sizeof(v->luma_mv_base[0]) * 2 * s->mb_stride);
     v->luma_mv          = v->luma_mv_base + s->mb_stride;
 
     /* allocate block type info in that way so it could be used with s->block_index[] */
@@ -363,7 +373,8 @@ av_cold int ff_vc1_decode_init_alloc_tables(VC1Context *v)
 
     if (s->avctx->codec_id == AV_CODEC_ID_WMV3IMAGE || s->avctx->codec_id == AV_CODEC_ID_VC1IMAGE) {
         for (i = 0; i < 4; i++)
-            if (!(v->sr_rows[i >> 1][i & 1] = av_malloc(v->output_width))) return -1;
+            if (!(v->sr_rows[i >> 1][i & 1] = av_malloc(v->output_width)))
+                return AVERROR(ENOMEM);
     }
 
     if (!v->mv_type_mb_plane || !v->direct_mb_plane || !v->acpred_plane || !v->over_flags_plane ||
@@ -389,7 +400,7 @@ av_cold void ff_vc1_init_transposed_scantables(VC1Context *v)
 {
     int i;
     for (i = 0; i < 64; i++) {
-#define transpose(x) ((x >> 3) | ((x & 7) << 3))
+#define transpose(x) (((x) >> 3) | (((x) & 7) << 3))
         v->zz_8x8[0][i] = transpose(ff_wmv1_scantable[0][i]);
         v->zz_8x8[1][i] = transpose(ff_wmv1_scantable[1][i]);
         v->zz_8x8[2][i] = transpose(ff_wmv1_scantable[2][i]);
@@ -409,6 +420,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
     VC1Context *v = avctx->priv_data;
     MpegEncContext *s = &v->s;
     GetBitContext gb;
+    int ret;
 
     /* save the container output size for WMImage */
     v->output_width  = avctx->width;
@@ -416,17 +428,10 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
 
     if (!avctx->extradata_size || !avctx->extradata)
         return -1;
-    if (!(avctx->flags & AV_CODEC_FLAG_GRAY))
-        avctx->pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
-    else
-        avctx->pix_fmt = AV_PIX_FMT_GRAY8;
     v->s.avctx = avctx;
 
-    if (ff_vc1_init_common(v) < 0)
-        return -1;
-    ff_blockdsp_init(&s->bdsp, avctx);
-    ff_h264chroma_init(&v->h264chroma, 8);
-    ff_qpeldsp_init(&s->qdsp);
+    if ((ret = ff_vc1_init_common(v)) < 0)
+        return ret;
 
     if (avctx->codec_id == AV_CODEC_ID_WMV3 || avctx->codec_id == AV_CODEC_ID_WMV3IMAGE) {
         int count = 0;
@@ -438,8 +443,8 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
 
         init_get_bits(&gb, avctx->extradata, avctx->extradata_size*8);
 
-        if (ff_vc1_decode_sequence_header(avctx, v, &gb) < 0)
-          return -1;
+        if ((ret = ff_vc1_decode_sequence_header(avctx, v, &gb)) < 0)
+          return ret;
 
         count = avctx->extradata_size*8 - get_bits_count(&gb);
         if (count > 0) {
@@ -462,6 +467,9 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
         }
 
         buf2  = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+        if (!buf2)
+            return AVERROR(ENOMEM);
+
         start = find_next_marker(start, end); // in WVC1 extradata first byte is its size, but can be 0 in mkv
         next  = start;
         for (; next < end; start = next) {
@@ -473,16 +481,16 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
             init_get_bits(&gb, buf2, buf2_size * 8);
             switch (AV_RB32(start)) {
             case VC1_CODE_SEQHDR:
-                if (ff_vc1_decode_sequence_header(avctx, v, &gb) < 0) {
+                if ((ret = ff_vc1_decode_sequence_header(avctx, v, &gb)) < 0) {
                     av_free(buf2);
-                    return -1;
+                    return ret;
                 }
                 seq_initialized = 1;
                 break;
             case VC1_CODE_ENTRYPOINT:
-                if (ff_vc1_decode_entry_point(avctx, v, &gb) < 0) {
+                if ((ret = ff_vc1_decode_entry_point(avctx, v, &gb)) < 0) {
                     av_free(buf2);
-                    return -1;
+                    return ret;
                 }
                 ep_initialized = 1;
                 break;
@@ -496,14 +504,38 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
         v->res_sprite = (avctx->codec_id == AV_CODEC_ID_VC1IMAGE);
     }
 
-    v->sprite_output_frame = av_frame_alloc();
-    if (!v->sprite_output_frame)
-        return AVERROR(ENOMEM);
-
     avctx->profile = v->profile;
     if (v->profile == PROFILE_ADVANCED)
         avctx->level = v->level;
 
+    if (!CONFIG_GRAY || !(avctx->flags & AV_CODEC_FLAG_GRAY))
+        avctx->pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
+    else {
+        avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        if (avctx->color_range == AVCOL_RANGE_UNSPECIFIED)
+            avctx->color_range = AVCOL_RANGE_MPEG;
+    }
+
+    // ensure static VLC tables are initialized
+    if ((ret = ff_msmpeg4_decode_init(avctx)) < 0)
+        return ret;
+    if ((ret = ff_vc1_decode_init_alloc_tables(v)) < 0)
+        return ret;
+    // Hack to ensure the above functions will be called
+    // again once we know all necessary settings.
+    // That this is necessary might indicate a bug.
+    ff_vc1_decode_end(avctx);
+
+    ff_blockdsp_init(&s->bdsp, avctx);
+    ff_h264chroma_init(&v->h264chroma, 8);
+    ff_qpeldsp_init(&s->qdsp);
+
+    // Must happen after calling ff_vc1_decode_end
+    // to avoid de-allocating the sprite_output_frame
+    v->sprite_output_frame = av_frame_alloc();
+    if (!v->sprite_output_frame)
+        return AVERROR(ENOMEM);
+
     avctx->has_b_frames = !!avctx->max_b_frames;
 
     if (v->color_prim == 1 || v->color_prim == 5 || v->color_prim == 6)
@@ -536,6 +568,11 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
             v->sprite_height > 1 << 14 ||
             v->output_width  > 1 << 14 ||
             v->output_height > 1 << 14) return -1;
+
+        if ((v->sprite_width&1) || (v->sprite_height&1)) {
+            avpriv_request_sample(avctx, "odd sprites support");
+            return AVERROR_PATCHWELCOME;
+        }
     }
     return 0;
 }
@@ -587,14 +624,19 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
     MpegEncContext *s = &v->s;
     AVFrame *pict = data;
     uint8_t *buf2 = NULL;
-    const uint8_t *buf_start = buf;
-    int mb_height, n_slices1;
+    const uint8_t *buf_start = buf, *buf_start_second_field = NULL;
+    int mb_height, n_slices1=-1;
     struct {
         uint8_t *buf;
         GetBitContext gb;
         int mby_start;
     } *slices = NULL, *tmp;
 
+    v->second_field = 0;
+
+    if(s->avctx->flags & AV_CODEC_FLAG_LOW_DELAY)
+        s->low_delay = 1;
+
     /* no supplementary picture */
     if (buf_size == 0 || (buf_size == 4 && AV_RB32(buf) == VC1_CODE_ENDOFSEQ)) {
         /* special case for last picture */
@@ -606,13 +648,24 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
             *got_frame = 1;
         }
 
-        return 0;
+        return buf_size;
     }
 
+#if FF_API_CAP_VDPAU
+    if (s->avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU) {
+        if (v->profile < PROFILE_ADVANCED)
+            avctx->pix_fmt = AV_PIX_FMT_VDPAU_WMV3;
+        else
+            avctx->pix_fmt = AV_PIX_FMT_VDPAU_VC1;
+    }
+#endif
+
     //for advanced profile we may need to parse and unescape data
     if (avctx->codec_id == AV_CODEC_ID_VC1 || avctx->codec_id == AV_CODEC_ID_VC1IMAGE) {
         int buf_size2 = 0;
         buf2 = av_mallocz(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+        if (!buf2)
+            return AVERROR(ENOMEM);
 
         if (IS_MARKER(AV_RB32(buf))) { /* frame starts with marker and needs to be parsed */
             const uint8_t *start, *end, *next;
@@ -625,26 +678,40 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 if (size <= 0) continue;
                 switch (AV_RB32(start)) {
                 case VC1_CODE_FRAME:
-                    if (avctx->hwaccel)
+                    if (avctx->hwaccel
+#if FF_API_CAP_VDPAU
+                        || s->avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU
+#endif
+                        )
                         buf_start = start;
                     buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
                     break;
                 case VC1_CODE_FIELD: {
                     int buf_size3;
-                    tmp = av_realloc(slices, sizeof(*slices) * (n_slices+1));
-                    if (!tmp)
+                    if (avctx->hwaccel
+#if FF_API_CAP_VDPAU
+                        || s->avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU
+#endif
+                        )
+                        buf_start_second_field = start;
+                    tmp = av_realloc_array(slices, sizeof(*slices), (n_slices+1));
+                    if (!tmp) {
+                        ret = AVERROR(ENOMEM);
                         goto err;
+                    }
                     slices = tmp;
                     slices[n_slices].buf = av_mallocz(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-                    if (!slices[n_slices].buf)
+                    if (!slices[n_slices].buf) {
+                        ret = AVERROR(ENOMEM);
                         goto err;
+                    }
                     buf_size3 = vc1_unescape_buffer(start + 4, size,
                                                     slices[n_slices].buf);
                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                                   buf_size3 << 3);
                     /* assuming that the field marker is at the exact middle,
                        hope it's correct */
-                    slices[n_slices].mby_start = s->mb_height >> 1;
+                    slices[n_slices].mby_start = s->mb_height + 1 >> 1;
                     n_slices1 = n_slices - 1; // index of the last slice of the first field
                     n_slices++;
                     break;
@@ -656,13 +723,17 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     break;
                 case VC1_CODE_SLICE: {
                     int buf_size3;
-                    tmp = av_realloc(slices, sizeof(*slices) * (n_slices+1));
-                    if (!tmp)
+                    tmp = av_realloc_array(slices, sizeof(*slices), (n_slices+1));
+                    if (!tmp) {
+                        ret = AVERROR(ENOMEM);
                         goto err;
+                    }
                     slices = tmp;
                     slices[n_slices].buf = av_mallocz(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-                    if (!slices[n_slices].buf)
+                    if (!slices[n_slices].buf) {
+                        ret = AVERROR(ENOMEM);
                         goto err;
+                    }
                     buf_size3 = vc1_unescape_buffer(start + 4, size,
                                                     slices[n_slices].buf);
                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
@@ -680,19 +751,30 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
             divider = find_next_marker(buf, buf + buf_size);
             if ((divider == (buf + buf_size)) || AV_RB32(divider) != VC1_CODE_FIELD) {
                 av_log(avctx, AV_LOG_ERROR, "Error in WVC1 interlaced frame\n");
+                ret = AVERROR_INVALIDDATA;
                 goto err;
             } else { // found field marker, unescape second field
-                tmp = av_realloc(slices, sizeof(*slices) * (n_slices+1));
-                if (!tmp)
+                if (avctx->hwaccel
+#if FF_API_CAP_VDPAU
+                    || s->avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU
+#endif
+                    )
+                    buf_start_second_field = divider;
+                tmp = av_realloc_array(slices, sizeof(*slices), (n_slices+1));
+                if (!tmp) {
+                    ret = AVERROR(ENOMEM);
                     goto err;
+                }
                 slices = tmp;
                 slices[n_slices].buf = av_mallocz(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-                if (!slices[n_slices].buf)
+                if (!slices[n_slices].buf) {
+                    ret = AVERROR(ENOMEM);
                     goto err;
+                }
                 buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
                 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                               buf_size3 << 3);
-                slices[n_slices].mby_start = s->mb_height >> 1;
+                slices[n_slices].mby_start = s->mb_height + 1 >> 1;
                 n_slices1 = n_slices - 1;
                 n_slices++;
             }
@@ -729,9 +811,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     if (!s->context_initialized) {
-        if (ff_msmpeg4_decode_init(avctx) < 0)
+        if ((ret = ff_msmpeg4_decode_init(avctx)) < 0)
             goto err;
-        if (ff_vc1_decode_init_alloc_tables(v) < 0) {
+        if ((ret = ff_vc1_decode_init_alloc_tables(v)) < 0) {
             ff_mpv_common_end(s);
             goto err;
         }
@@ -739,6 +821,10 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
         s->low_delay = !avctx->has_b_frames || v->res_sprite;
 
         if (v->profile == PROFILE_ADVANCED) {
+            if(avctx->coded_width<=1 || avctx->coded_height<=1) {
+                ret = AVERROR_INVALIDDATA;
+                goto err;
+            }
             s->h_edge_pos = avctx->coded_width;
             s->v_edge_pos = avctx->coded_height;
         }
@@ -748,19 +834,29 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
     v->pic_header_flag = 0;
     v->first_pic_header_flag = 1;
     if (v->profile < PROFILE_ADVANCED) {
-        if (ff_vc1_parse_frame_header(v, &s->gb) < 0) {
+        if ((ret = ff_vc1_parse_frame_header(v, &s->gb)) < 0) {
             goto err;
         }
     } else {
-        if (ff_vc1_parse_frame_header_adv(v, &s->gb) < 0) {
+        if ((ret = ff_vc1_parse_frame_header_adv(v, &s->gb)) < 0) {
             goto err;
         }
     }
     v->first_pic_header_flag = 0;
 
+    if (avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(v->s.avctx, AV_LOG_DEBUG, "pict_type: %c\n", av_get_picture_type_char(s->pict_type));
+
     if ((avctx->codec_id == AV_CODEC_ID_WMV3IMAGE || avctx->codec_id == AV_CODEC_ID_VC1IMAGE)
         && s->pict_type != AV_PICTURE_TYPE_I) {
         av_log(v->s.avctx, AV_LOG_ERROR, "Sprite decoder: expected I-frame\n");
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
+    if ((s->mb_height >> v->field_mode) == 0) {
+        av_log(v->s.avctx, AV_LOG_ERROR, "image too short\n");
+        ret = AVERROR_INVALIDDATA;
         goto err;
     }
 
@@ -770,6 +866,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
 
     /* skip B-frames if we don't have reference frames */
     if (!s->last_picture_ptr && (s->pict_type == AV_PICTURE_TYPE_B || s->droppable)) {
+        av_log(v->s.avctx, AV_LOG_DEBUG, "Skipping B frame without reference frames\n");
         goto end;
     }
     if ((avctx->skip_frame >= AVDISCARD_NONREF && s->pict_type == AV_PICTURE_TYPE_B) ||
@@ -785,10 +882,14 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
             s->next_p_frame_damaged = 0;
     }
 
-    if (ff_mpv_frame_start(s, avctx) < 0) {
+    if ((ret = ff_mpv_frame_start(s, avctx)) < 0) {
         goto err;
     }
 
+    v->s.current_picture_ptr->field_picture = v->field_mode;
+    v->s.current_picture_ptr->f->interlaced_frame = (v->fcm != PROGRESSIVE);
+    v->s.current_picture_ptr->f->top_field_first  = v->tff;
+
     // process pulldown flags
     s->current_picture_ptr->f->repeat_pict = 0;
     // Pulldown flags are only valid when 'broadcast' has been set.
@@ -804,13 +905,55 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
     s->me.qpel_put = s->qdsp.put_qpel_pixels_tab;
     s->me.qpel_avg = s->qdsp.avg_qpel_pixels_tab;
 
+#if FF_API_CAP_VDPAU
+    if ((CONFIG_VC1_VDPAU_DECODER)
+        &&s->avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU) {
+        if (v->field_mode && buf_start_second_field) {
+            ff_vdpau_vc1_decode_picture(s, buf_start, buf_start_second_field - buf_start);
+            ff_vdpau_vc1_decode_picture(s, buf_start_second_field, (buf + buf_size) - buf_start_second_field);
+        } else {
+            ff_vdpau_vc1_decode_picture(s, buf_start, (buf + buf_size) - buf_start);
+        }
+    } else
+#endif
     if (avctx->hwaccel) {
-        if (avctx->hwaccel->start_frame(avctx, buf, buf_size) < 0)
-            goto err;
-        if (avctx->hwaccel->decode_slice(avctx, buf_start, (buf + buf_size) - buf_start) < 0)
-            goto err;
-        if (avctx->hwaccel->end_frame(avctx) < 0)
-            goto err;
+        if (v->field_mode && buf_start_second_field) {
+            // decode first field
+            s->picture_structure = PICT_BOTTOM_FIELD - v->tff;
+            if ((ret = avctx->hwaccel->start_frame(avctx, buf_start, buf_start_second_field - buf_start)) < 0)
+                goto err;
+            if ((ret = avctx->hwaccel->decode_slice(avctx, buf_start, buf_start_second_field - buf_start)) < 0)
+                goto err;
+            if ((ret = avctx->hwaccel->end_frame(avctx)) < 0)
+                goto err;
+
+            // decode second field
+            s->gb = slices[n_slices1 + 1].gb;
+            s->picture_structure = PICT_TOP_FIELD + v->tff;
+            v->second_field = 1;
+            v->pic_header_flag = 0;
+            if (ff_vc1_parse_frame_header_adv(v, &s->gb) < 0) {
+                av_log(avctx, AV_LOG_ERROR, "parsing header for second field failed");
+                ret = AVERROR_INVALIDDATA;
+                goto err;
+            }
+            v->s.current_picture_ptr->f->pict_type = v->s.pict_type;
+
+            if ((ret = avctx->hwaccel->start_frame(avctx, buf_start_second_field, (buf + buf_size) - buf_start_second_field)) < 0)
+                goto err;
+            if ((ret = avctx->hwaccel->decode_slice(avctx, buf_start_second_field, (buf + buf_size) - buf_start_second_field)) < 0)
+                goto err;
+            if ((ret = avctx->hwaccel->end_frame(avctx)) < 0)
+                goto err;
+        } else {
+            s->picture_structure = PICT_FRAME;
+            if ((ret = avctx->hwaccel->start_frame(avctx, buf_start, (buf + buf_size) - buf_start)) < 0)
+                goto err;
+            if ((ret = avctx->hwaccel->decode_slice(avctx, buf_start, (buf + buf_size) - buf_start)) < 0)
+                goto err;
+            if ((ret = avctx->hwaccel->end_frame(avctx)) < 0)
+                goto err;
+        }
     } else {
         int header_ret = 0;
 
@@ -827,10 +970,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
         }
         mb_height = s->mb_height >> v->field_mode;
 
-        if (!mb_height) {
-            av_log(v->s.avctx, AV_LOG_ERROR, "Invalid mb_height.\n");
-            goto err;
-        }
+        av_assert0 (mb_height > 0);
 
         for (i = 0; i <= n_slices; i++) {
             if (i > 0 &&  slices[i - 1].mby_start >= mb_height) {
@@ -841,7 +981,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     continue;
                 }
                 v->second_field = 1;
-                v->blocks_off   = s->mb_width  * s->mb_height << 1;
+                av_assert0((s->mb_height & 1) == 0);
+                v->blocks_off   = s->b8_stride * (s->mb_height&~1);
                 v->mb_off       = s->mb_stride * s->mb_height >> 1;
             } else {
                 v->second_field = 0;
@@ -853,6 +994,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 if (v->field_mode && i == n_slices1 + 2) {
                     if ((header_ret = ff_vc1_parse_frame_header_adv(v, &s->gb)) < 0) {
                         av_log(v->s.avctx, AV_LOG_ERROR, "Field header damaged\n");
+                        ret = AVERROR_INVALIDDATA;
                         if (avctx->err_recognition & AV_EF_EXPLODE)
                             goto err;
                         continue;
@@ -861,6 +1003,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     v->pic_header_flag = 1;
                     if ((header_ret = ff_vc1_parse_frame_header_adv(v, &s->gb)) < 0) {
                         av_log(v->s.avctx, AV_LOG_ERROR, "Slice header damaged\n");
+                        ret = AVERROR_INVALIDDATA;
                         if (avctx->err_recognition & AV_EF_EXPLODE)
                             goto err;
                         continue;
@@ -872,8 +1015,21 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
             s->start_mb_y = (i == 0) ? 0 : FFMAX(0, slices[i-1].mby_start % mb_height);
             if (!v->field_mode || v->second_field)
                 s->end_mb_y = (i == n_slices     ) ? mb_height : FFMIN(mb_height, slices[i].mby_start % mb_height);
-            else
+            else {
+                if (i >= n_slices) {
+                    av_log(v->s.avctx, AV_LOG_ERROR, "first field slice count too large\n");
+                    continue;
+                }
                 s->end_mb_y = (i <= n_slices1 + 1) ? mb_height : FFMIN(mb_height, slices[i].mby_start % mb_height);
+            }
+            if (s->end_mb_y <= s->start_mb_y) {
+                av_log(v->s.avctx, AV_LOG_ERROR, "end mb y %d %d invalid\n", s->end_mb_y, s->start_mb_y);
+                continue;
+            }
+            if (!v->p_frame_skipped && s->pict_type != AV_PICTURE_TYPE_I && !v->cbpcy_vlc) {
+                av_log(v->s.avctx, AV_LOG_ERROR, "missing cbpcy_vlc\n");
+                continue;
+            }
             ff_vc1_decode_blocks(v);
             if (i != n_slices)
                 s->gb = slices[i].gb;
@@ -894,6 +1050,10 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 get_bits_count(&s->gb), s->gb.size_in_bits);
 //  if (get_bits_count(&s->gb) > buf_size * 8)
 //      return -1;
+        if(s->er.error_occurred && s->pict_type == AV_PICTURE_TYPE_B) {
+            ret = AVERROR_INVALIDDATA;
+            goto err;
+        }
         if (!v->field_mode)
             ff_er_frame_end(&s->er);
     }
@@ -907,7 +1067,7 @@ image:
         if (avctx->skip_frame >= AVDISCARD_NONREF)
             goto end;
 #if CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER
-        if (vc1_decode_sprites(v, &s->gb))
+        if ((ret = vc1_decode_sprites(v, &s->gb)) < 0)
             goto err;
 #endif
         if ((ret = av_frame_ref(pict, v->sprite_output_frame)) < 0)
@@ -917,12 +1077,12 @@ image:
         if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
             if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
                 goto err;
-            ff_print_debug_info(s, s->current_picture_ptr);
+            ff_print_debug_info(s, s->current_picture_ptr, pict);
             *got_frame = 1;
         } else if (s->last_picture_ptr) {
             if ((ret = av_frame_ref(pict, s->last_picture_ptr->f)) < 0)
                 goto err;
-            ff_print_debug_info(s, s->last_picture_ptr);
+            ff_print_debug_info(s, s->last_picture_ptr, pict);
             *got_frame = 1;
         }
     }
@@ -939,7 +1099,7 @@ err:
     for (i = 0; i < n_slices; i++)
         av_free(slices[i].buf);
     av_free(slices);
-    return -1;
+    return ret;
 }
 
 
@@ -959,7 +1119,7 @@ static const enum AVPixelFormat vc1_hwaccel_pixfmt_list_420[] = {
     AV_PIX_FMT_D3D11VA_VLD,
 #endif
 #if CONFIG_VC1_VAAPI_HWACCEL
-    AV_PIX_FMT_VAAPI_VLD,
+    AV_PIX_FMT_VAAPI,
 #endif
 #if CONFIG_VC1_VDPAU_HWACCEL
     AV_PIX_FMT_VDPAU,
@@ -1000,6 +1160,38 @@ AVCodec ff_wmv3_decoder = {
 };
 #endif
 
+#if CONFIG_WMV3_VDPAU_DECODER && FF_API_VDPAU
+AVCodec ff_wmv3_vdpau_decoder = {
+    .name           = "wmv3_vdpau",
+    .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Video 9 VDPAU"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_WMV3,
+    .priv_data_size = sizeof(VC1Context),
+    .init           = vc1_decode_init,
+    .close          = ff_vc1_decode_end,
+    .decode         = vc1_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HWACCEL_VDPAU,
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_VDPAU_WMV3, AV_PIX_FMT_NONE },
+    .profiles       = NULL_IF_CONFIG_SMALL(profiles)
+};
+#endif
+
+#if CONFIG_VC1_VDPAU_DECODER && FF_API_VDPAU
+AVCodec ff_vc1_vdpau_decoder = {
+    .name           = "vc1_vdpau",
+    .long_name      = NULL_IF_CONFIG_SMALL("SMPTE VC-1 VDPAU"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_VC1,
+    .priv_data_size = sizeof(VC1Context),
+    .init           = vc1_decode_init,
+    .close          = ff_vc1_decode_end,
+    .decode         = vc1_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HWACCEL_VDPAU,
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_VDPAU_VC1, AV_PIX_FMT_NONE },
+    .profiles       = NULL_IF_CONFIG_SMALL(profiles)
+};
+#endif
+
 #if CONFIG_WMV3IMAGE_DECODER
 AVCodec ff_wmv3image_decoder = {
     .name           = "wmv3image",
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index a193dd704d..a16c8d512c 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -2,20 +2,20 @@
  * VC-1 and WMV3 decoder - DSP functions
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,12 @@
  *
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 #include "h264chroma.h"
 #include "qpeldsp.h"
+#include "rnd_avg.h"
 #include "vc1dsp.h"
 #include "startcode.h"
 
@@ -582,10 +585,10 @@ static av_always_inline int vc1_mspel_filter(const uint8_t *src, int stride,
 }
 
 /* Function used to do motion compensation with bicubic interpolation */
-#define VC1_MSPEL_MC(OP, OPNAME)                                              \
+#define VC1_MSPEL_MC(OP, OP4, OPNAME)                                         \
 static av_always_inline void OPNAME ## vc1_mspel_mc(uint8_t *dst,             \
                                                     const uint8_t *src,       \
-                                                    int stride,               \
+                                                    ptrdiff_t stride,         \
                                                     int hmode,                \
                                                     int vmode,                \
                                                     int rnd)                  \
@@ -640,13 +643,93 @@ static av_always_inline void OPNAME ## vc1_mspel_mc(uint8_t *dst,             \
         dst += stride;                                                        \
         src += stride;                                                        \
     }                                                                         \
+}\
+static av_always_inline void OPNAME ## vc1_mspel_mc_16(uint8_t *dst,          \
+                                                       const uint8_t *src,    \
+                                                       ptrdiff_t stride,      \
+                                                       int hmode,             \
+                                                       int vmode,             \
+                                                       int rnd)               \
+{                                                                             \
+    int i, j;                                                                 \
+                                                                              \
+    if (vmode) { /* Horizontal filter to apply */                             \
+        int r;                                                                \
+                                                                              \
+        if (hmode) { /* Vertical filter to apply, output to tmp */            \
+            static const int shift_value[] = { 0, 5, 1, 5 };                  \
+            int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;       \
+            int16_t tmp[19 * 16], *tptr = tmp;                                \
+                                                                              \
+            r = (1 << (shift - 1)) + rnd - 1;                                 \
+                                                                              \
+            src -= 1;                                                         \
+            for (j = 0; j < 16; j++) {                                        \
+                for (i = 0; i < 19; i++)                                      \
+                    tptr[i] = (vc1_mspel_ver_filter_16bits(src + i, stride, vmode) + r) >> shift; \
+                src  += stride;                                               \
+                tptr += 19;                                                   \
+            }                                                                 \
+                                                                              \
+            r    = 64 - rnd;                                                  \
+            tptr = tmp + 1;                                                   \
+            for (j = 0; j < 16; j++) {                                        \
+                for (i = 0; i < 16; i++)                                      \
+                    OP(dst[i], (vc1_mspel_hor_filter_16bits(tptr + i, 1, hmode) + r) >> 7); \
+                dst  += stride;                                               \
+                tptr += 19;                                                   \
+            }                                                                 \
+                                                                              \
+            return;                                                           \
+        } else { /* No horizontal filter, output 8 lines to dst */            \
+            r = 1 - rnd;                                                      \
+                                                                              \
+            for (j = 0; j < 16; j++) {                                        \
+                for (i = 0; i < 16; i++)                                      \
+                    OP(dst[i], vc1_mspel_filter(src + i, stride, vmode, r));  \
+                src += stride;                                                \
+                dst += stride;                                                \
+            }                                                                 \
+            return;                                                           \
+        }                                                                     \
+    }                                                                         \
+                                                                              \
+    /* Horizontal mode with no vertical mode */                               \
+    for (j = 0; j < 16; j++) {                                                \
+        for (i = 0; i < 16; i++)                                              \
+            OP(dst[i], vc1_mspel_filter(src + i, 1, hmode, rnd));             \
+        dst += stride;                                                        \
+        src += stride;                                                        \
+    }                                                                         \
+}\
+static void OPNAME ## pixels8x8_c(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int rnd){\
+    int i;\
+    for(i=0; i<8; i++){\
+        OP4(*(uint32_t*)(block  ), AV_RN32(pixels  ));\
+        OP4(*(uint32_t*)(block+4), AV_RN32(pixels+4));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+static void OPNAME ## pixels16x16_c(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int rnd){\
+    int i;\
+    for(i=0; i<16; i++){\
+        OP4(*(uint32_t*)(block   ), AV_RN32(pixels   ));\
+        OP4(*(uint32_t*)(block+ 4), AV_RN32(pixels+ 4));\
+        OP4(*(uint32_t*)(block+ 8), AV_RN32(pixels+ 8));\
+        OP4(*(uint32_t*)(block+12), AV_RN32(pixels+12));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
 }
 
-#define op_put(a, b) a = av_clip_uint8(b)
-#define op_avg(a, b) a = (a + av_clip_uint8(b) + 1) >> 1
+#define op_put(a, b) (a) = av_clip_uint8(b)
+#define op_avg(a, b) (a) = ((a) + av_clip_uint8(b) + 1) >> 1
+#define op4_avg(a, b) (a) = rnd_avg32(a, b)
+#define op4_put(a, b) (a) = (b)
 
-VC1_MSPEL_MC(op_put, put_)
-VC1_MSPEL_MC(op_avg, avg_)
+VC1_MSPEL_MC(op_put, op4_put, put_)
+VC1_MSPEL_MC(op_avg, op4_avg, avg_)
 
 /* pixel functions - really are entry points to vc1_mspel_mc */
 
@@ -662,6 +745,18 @@ static void avg_vc1_mspel_mc ## a ## b ## _c(uint8_t *dst,                    \
                                              ptrdiff_t stride, int rnd)       \
 {                                                                             \
     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                            \
+}                                                                             \
+static void put_vc1_mspel_mc ## a ## b ## _16_c(uint8_t *dst,                 \
+                                                const uint8_t *src,           \
+                                                ptrdiff_t stride, int rnd)    \
+{                                                                             \
+    put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                         \
+}                                                                             \
+static void avg_vc1_mspel_mc ## a ## b ## _16_c(uint8_t *dst,                 \
+                                                const uint8_t *src,           \
+                                                ptrdiff_t stride, int rnd)    \
+{                                                                             \
+    avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                         \
 }
 
 PUT_VC1_MSPEL(1, 0)
@@ -683,19 +778,6 @@ PUT_VC1_MSPEL(1, 3)
 PUT_VC1_MSPEL(2, 3)
 PUT_VC1_MSPEL(3, 3)
 
-
-static void put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride, int rnd)
-{
-    ff_put_pixels8x8_c(dst, src, stride);
-}
-
-static void avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride, int rnd)
-{
-    ff_avg_pixels8x8_c(dst, src, stride);
-}
-
 #define chroma_mc(a) \
     ((A * src[a] + B * src[a + 1] + \
       C * src[stride + a] + D * src[stride + a + 1] + 32 - 4) >> 6)
@@ -709,7 +791,7 @@ static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
     const int D =     (x) *     (y);
     int i;
 
-    assert(x < 8 && y < 8 && x >= 0 && y >= 0);
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     for (i = 0; i < h; i++) {
         dst[0] = chroma_mc(0);
@@ -734,7 +816,7 @@ static void put_no_rnd_vc1_chroma_mc4_c(uint8_t *dst, uint8_t *src,
     const int D =     (x) *     (y);
     int i;
 
-    assert(x < 8 && y < 8 && x >= 0 && y >= 0);
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     for (i = 0; i < h; i++) {
         dst[0] = chroma_mc(0);
@@ -757,7 +839,7 @@ static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
     const int D =     (x) *     (y);
     int i;
 
-    assert(x < 8 && y < 8 && x >= 0 && y >= 0);
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     for (i = 0; i < h; i++) {
         dst[0] = avg2(dst[0], chroma_mc(0));
@@ -783,7 +865,7 @@ static void avg_no_rnd_vc1_chroma_mc4_c(uint8_t *dst /* align 8 */,
     const int D = (    x) * (    y);
     int i;
 
-    assert(x < 8 && y < 8 && x >= 0 && y >= 0);
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     for (i = 0; i < h; i++) {
         dst[0] = avg2(dst[0], chroma_mc(0));
@@ -878,6 +960,11 @@ static void sprite_v_double_twoscale_c(uint8_t *dst,
 }
 
 #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
+#define FN_ASSIGN(X, Y) \
+    dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = put_vc1_mspel_mc##X##Y##_c; \
+    dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = put_vc1_mspel_mc##X##Y##_16_c; \
+    dsp->avg_vc1_mspel_pixels_tab[1][X+4*Y] = avg_vc1_mspel_mc##X##Y##_c; \
+    dsp->avg_vc1_mspel_pixels_tab[0][X+4*Y] = avg_vc1_mspel_mc##X##Y##_16_c
 
 av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
 {
@@ -902,39 +989,28 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
     dsp->vc1_v_loop_filter16  = vc1_v_loop_filter16_c;
     dsp->vc1_h_loop_filter16  = vc1_h_loop_filter16_c;
 
-    dsp->put_vc1_mspel_pixels_tab[0]  = put_vc1_mspel_mc00_c;
-    dsp->put_vc1_mspel_pixels_tab[1]  = put_vc1_mspel_mc10_c;
-    dsp->put_vc1_mspel_pixels_tab[2]  = put_vc1_mspel_mc20_c;
-    dsp->put_vc1_mspel_pixels_tab[3]  = put_vc1_mspel_mc30_c;
-    dsp->put_vc1_mspel_pixels_tab[4]  = put_vc1_mspel_mc01_c;
-    dsp->put_vc1_mspel_pixels_tab[5]  = put_vc1_mspel_mc11_c;
-    dsp->put_vc1_mspel_pixels_tab[6]  = put_vc1_mspel_mc21_c;
-    dsp->put_vc1_mspel_pixels_tab[7]  = put_vc1_mspel_mc31_c;
-    dsp->put_vc1_mspel_pixels_tab[8]  = put_vc1_mspel_mc02_c;
-    dsp->put_vc1_mspel_pixels_tab[9]  = put_vc1_mspel_mc12_c;
-    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_c;
-    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_c;
-    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_c;
-    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_c;
-    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_c;
-    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_c;
-
-    dsp->avg_vc1_mspel_pixels_tab[0]  = avg_vc1_mspel_mc00_c;
-    dsp->avg_vc1_mspel_pixels_tab[1]  = avg_vc1_mspel_mc10_c;
-    dsp->avg_vc1_mspel_pixels_tab[2]  = avg_vc1_mspel_mc20_c;
-    dsp->avg_vc1_mspel_pixels_tab[3]  = avg_vc1_mspel_mc30_c;
-    dsp->avg_vc1_mspel_pixels_tab[4]  = avg_vc1_mspel_mc01_c;
-    dsp->avg_vc1_mspel_pixels_tab[5]  = avg_vc1_mspel_mc11_c;
-    dsp->avg_vc1_mspel_pixels_tab[6]  = avg_vc1_mspel_mc21_c;
-    dsp->avg_vc1_mspel_pixels_tab[7]  = avg_vc1_mspel_mc31_c;
-    dsp->avg_vc1_mspel_pixels_tab[8]  = avg_vc1_mspel_mc02_c;
-    dsp->avg_vc1_mspel_pixels_tab[9]  = avg_vc1_mspel_mc12_c;
-    dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_c;
-    dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_c;
-    dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_c;
-    dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_c;
-    dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_c;
-    dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_c;
+    dsp->put_vc1_mspel_pixels_tab[0][0] = put_pixels16x16_c;
+    dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_pixels16x16_c;
+    dsp->put_vc1_mspel_pixels_tab[1][0] = put_pixels8x8_c;
+    dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_pixels8x8_c;
+    FN_ASSIGN(0, 1);
+    FN_ASSIGN(0, 2);
+    FN_ASSIGN(0, 3);
+
+    FN_ASSIGN(1, 0);
+    FN_ASSIGN(1, 1);
+    FN_ASSIGN(1, 2);
+    FN_ASSIGN(1, 3);
+
+    FN_ASSIGN(2, 0);
+    FN_ASSIGN(2, 1);
+    FN_ASSIGN(2, 2);
+    FN_ASSIGN(2, 3);
+
+    FN_ASSIGN(3, 0);
+    FN_ASSIGN(3, 1);
+    FN_ASSIGN(3, 2);
+    FN_ASSIGN(3, 3);
 
     dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_c;
     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_c;
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index 682f682144..e2f75ac7dc 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -2,20 +2,20 @@
  * VC-1 and WMV3 decoder - DSP functions
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,8 @@
 #include "hpeldsp.h"
 #include "h264chroma.h"
 
+typedef void (*vc1op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, ptrdiff_t line_size, int h);
+
 typedef struct VC1DSPContext {
     /* vc1 functions */
     void (*vc1_inv_trans_8x8)(int16_t *b);
@@ -55,8 +57,8 @@ typedef struct VC1DSPContext {
     /* put 8x8 block with bicubic interpolation and quarterpel precision
      * last argument is actually round value instead of height
      */
-    op_pixels_func put_vc1_mspel_pixels_tab[16];
-    op_pixels_func avg_vc1_mspel_pixels_tab[16];
+    vc1op_pixels_func put_vc1_mspel_pixels_tab[2][16];
+    vc1op_pixels_func avg_vc1_mspel_pixels_tab[2][16];
 
     /* This is really one func used in VC-1 decoding */
     h264_chroma_mc_func put_no_rnd_vc1_chroma_pixels_tab[3];
diff --git a/libavcodec/vcr1.c b/libavcodec/vcr1.c
index 76c47eb2a4..28a5eec7d1 100644
--- a/libavcodec/vcr1.c
+++ b/libavcodec/vcr1.c
@@ -2,20 +2,20 @@
  * ATI VCR1 codec
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 
 #include "avcodec.h"
 #include "internal.h"
+#include "libavutil/avassert.h"
 #include "libavutil/internal.h"
 
 typedef struct VCR1Context {
@@ -37,8 +38,8 @@ static av_cold int vcr1_decode_init(AVCodecContext *avctx)
 {
     avctx->pix_fmt = AV_PIX_FMT_YUV410P;
 
-    if (avctx->width & 7) {
-        av_log(avctx, AV_LOG_ERROR, "Width %d is not divisble by 8.\n", avctx->width);
+    if (avctx->width % 8 || avctx->height%4) {
+        avpriv_request_sample(avctx, "odd dimensions (%d x %d) support", avctx->width, avctx->height);
         return AVERROR_INVALIDDATA;
     }
 
@@ -48,27 +49,25 @@ static av_cold int vcr1_decode_init(AVCodecContext *avctx)
 static int vcr1_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame, AVPacket *avpkt)
 {
-    const uint8_t *buf        = avpkt->data;
-    int buf_size              = avpkt->size;
     VCR1Context *const a      = avctx->priv_data;
     AVFrame *const p          = data;
-    const uint8_t *bytestream = buf;
+    const uint8_t *bytestream = avpkt->data;
+    const uint8_t *bytestream_end = bytestream + avpkt->size;
     int i, x, y, ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
+    if(avpkt->size < 32 + avctx->height + avctx->width*avctx->height*5/8){
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data. %d < %d\n", avpkt->size ,  32 + avctx->height + avctx->width*avctx->height*5/8);
+        return AVERROR(EINVAL);
     }
+
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
+        return ret;
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
-    if (buf_size < 32)
-        goto packet_small;
-
     for (i = 0; i < 16; i++) {
         a->delta[i] = *bytestream++;
         bytestream++;
-        buf_size--;
     }
 
     for (y = 0; y < avctx->height; y++) {
@@ -79,12 +78,10 @@ static int vcr1_decode_frame(AVCodecContext *avctx, void *data,
             uint8_t *cb = &p->data[1][(y >> 2) * p->linesize[1]];
             uint8_t *cr = &p->data[2][(y >> 2) * p->linesize[2]];
 
-            if (buf_size < 4 + avctx->width)
-                goto packet_small;
+            av_assert0 (bytestream_end - bytestream >= 4 + avctx->width);
 
             for (i = 0; i < 4; i++)
                 a->offset[i] = *bytestream++;
-            buf_size -= 4;
 
             offset = a->offset[0] - a->delta[bytestream[2] & 0xF];
             for (x = 0; x < avctx->width; x += 4) {
@@ -98,11 +95,9 @@ static int vcr1_decode_frame(AVCodecContext *avctx, void *data,
                 *cr++       = bytestream[1];
 
                 bytestream += 4;
-                buf_size   -= 4;
             }
         } else {
-            if (buf_size < avctx->width / 2)
-                goto packet_small;
+            av_assert0 (bytestream_end - bytestream >= avctx->width / 2);
 
             offset = a->offset[y & 3] - a->delta[bytestream[2] & 0xF];
 
@@ -117,17 +112,13 @@ static int vcr1_decode_frame(AVCodecContext *avctx, void *data,
                 luma[7]     = offset += a->delta[bytestream[1] >>  4];
                 luma       += 8;
                 bytestream += 4;
-                buf_size   -= 4;
             }
         }
     }
 
     *got_frame = 1;
 
-    return buf_size;
-packet_small:
-    av_log(avctx, AV_LOG_ERROR, "Input packet too small.\n");
-    return AVERROR_INVALIDDATA;
+    return bytestream - avpkt->data;
 }
 
 AVCodec ff_vcr1_decoder = {
diff --git a/libavcodec/vda.c b/libavcodec/vda.c
index eb4b9982cb..4670140c76 100644
--- a/libavcodec/vda.c
+++ b/libavcodec/vda.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,7 +21,7 @@
 #include "libavutil/mem.h"
 
 #include "vda.h"
-#include "vda_internal.h"
+#include "vda_vt_internal.h"
 
 #if CONFIG_H264_VDA_HWACCEL
 AVVDAContext *av_vda_alloc_context(void)
diff --git a/libavcodec/vda.h b/libavcodec/vda.h
index 5e7228c94e..bde14e31d7 100644
--- a/libavcodec/vda.h
+++ b/libavcodec/vda.h
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2011 Sebastien Zwickert
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,6 @@
  */
 
 #include "libavcodec/avcodec.h"
-#include "libavcodec/version.h"
 
 #include <stdint.h>
 
@@ -42,6 +41,14 @@
 #include <VideoDecodeAcceleration/VDADecoder.h>
 #undef Picture
 
+#include "libavcodec/version.h"
+
+// extra flags not defined in VDADecoder.h
+enum {
+    kVDADecodeInfo_Asynchronous = 1UL << 0,
+    kVDADecodeInfo_FrameDropped = 1UL << 1
+};
+
 /**
  * @defgroup lavc_codec_hwaccel_vda VDA
  * @ingroup lavc_codec_hwaccel
@@ -51,7 +58,7 @@
 
 /**
  * This structure is used to provide the necessary configurations and data
- * to the VDA Libav HWAccel implementation.
+ * to the VDA FFmpeg HWAccel implementation.
  *
  * The application must make it available as AVCodecContext.hwaccel_context.
  */
@@ -126,6 +133,17 @@ struct vda_context {
      * unused
      */
     int                 priv_allocated_size;
+
+    /**
+     * Use av_buffer to manage buffer.
+     * When the flag is set, the CVPixelBuffers returned by the decoder will
+     * be released automatically, so you have to retain them if necessary.
+     * Not setting this flag may cause memory leak.
+     *
+     * encoding: unused
+     * decoding: Set by user.
+     */
+    int                 use_ref_buffer;
 };
 
 /** Create the video decoder. */
diff --git a/libavcodec/vda_h264.c b/libavcodec/vda_h264.c
index 62ad5e9f8d..8c526c0754 100644
--- a/libavcodec/vda_h264.c
+++ b/libavcodec/vda_h264.c
@@ -1,49 +1,40 @@
 /*
- * VDA H.264 hardware acceleration
+ * VDA H264 HW acceleration.
  *
  * copyright (c) 2011 Sebastien Zwickert
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <CoreFoundation/CFDictionary.h>
 #include <CoreFoundation/CFNumber.h>
 #include <CoreFoundation/CFData.h>
-#include <CoreFoundation/CFString.h>
 
+#include "vda.h"
 #include "libavutil/avutil.h"
 #include "h264.h"
-#include "internal.h"
-#include "vda.h"
-#include "vda_internal.h"
-
-typedef struct VDAContext {
-    // The current bitstream buffer.
-    uint8_t             *bitstream;
-
-    // The current size of the bitstream.
-    int                  bitstream_size;
-
-    // The reference size used for fast reallocation.
-    int                  allocated_size;
 
-    CVImageBufferRef frame;
-} VDAContext;
+struct vda_buffer {
+    CVPixelBufferRef cv_buffer;
+};
+#include "internal.h"
+#include "vda_vt_internal.h"
 
-/* Decoder callback that adds the VDA frame to the queue in display order. */
+/* Decoder callback that adds the vda frame to the queue in display order. */
 static void vda_decoder_callback(void *vda_hw_ctx,
                                  CFDictionaryRef user_info,
                                  OSStatus status,
@@ -52,6 +43,9 @@ static void vda_decoder_callback(void *vda_hw_ctx,
 {
     struct vda_context *vda_ctx = vda_hw_ctx;
 
+    if (infoFlags & kVDADecodeInfo_FrameDropped)
+        vda_ctx->cv_buffer = NULL;
+
     if (!image_buffer)
         return;
 
@@ -61,7 +55,7 @@ static void vda_decoder_callback(void *vda_hw_ctx,
     vda_ctx->cv_buffer = CVPixelBufferRetain(image_buffer);
 }
 
-static int vda_sync_decode(VDAContext *ctx, struct vda_context *vda_ctx)
+static int vda_sync_decode(VTContext *ctx, struct vda_context *vda_ctx)
 {
     OSStatus status;
     CFDataRef coded_frame;
@@ -86,8 +80,8 @@ static int vda_old_h264_start_frame(AVCodecContext *avctx,
                                 av_unused const uint8_t *buffer,
                                 av_unused uint32_t size)
 {
-    VDAContext *vda = avctx->internal->hwaccel_priv_data;
-    struct vda_context *vda_ctx         = avctx->hwaccel_context;
+    VTContext *vda = avctx->internal->hwaccel_priv_data;
+    struct vda_context *vda_ctx = avctx->hwaccel_context;
 
     if (!vda_ctx->decoder)
         return -1;
@@ -101,8 +95,8 @@ static int vda_old_h264_decode_slice(AVCodecContext *avctx,
                                  const uint8_t *buffer,
                                  uint32_t size)
 {
-    VDAContext *vda                     = avctx->internal->hwaccel_priv_data;
-    struct vda_context *vda_ctx         = avctx->hwaccel_context;
+    VTContext *vda              = avctx->internal->hwaccel_priv_data;
+    struct vda_context *vda_ctx = avctx->hwaccel_context;
     void *tmp;
 
     if (!vda_ctx->decoder)
@@ -124,12 +118,21 @@ static int vda_old_h264_decode_slice(AVCodecContext *avctx,
     return 0;
 }
 
+static void vda_h264_release_buffer(void *opaque, uint8_t *data)
+{
+    struct vda_buffer *context = opaque;
+    CVPixelBufferRelease(context->cv_buffer);
+    av_free(context);
+}
+
 static int vda_old_h264_end_frame(AVCodecContext *avctx)
 {
     H264Context *h                      = avctx->priv_data;
-    VDAContext *vda                     = avctx->internal->hwaccel_priv_data;
+    VTContext *vda                      = avctx->internal->hwaccel_priv_data;
     struct vda_context *vda_ctx         = avctx->hwaccel_context;
     AVFrame *frame                      = h->cur_pic_ptr->f;
+    struct vda_buffer *context;
+    AVBufferRef *buffer;
     int status;
 
     if (!vda_ctx->decoder || !vda->bitstream)
@@ -141,6 +144,20 @@ static int vda_old_h264_end_frame(AVCodecContext *avctx)
     if (status)
         av_log(avctx, AV_LOG_ERROR, "Failed to decode frame (%d)\n", status);
 
+    if (!vda_ctx->use_ref_buffer || status)
+        return status;
+
+    context = av_mallocz(sizeof(*context));
+    buffer = av_buffer_create(NULL, 0, vda_h264_release_buffer, context, 0);
+    if (!context || !buffer) {
+        CVPixelBufferRelease(vda_ctx->cv_buffer);
+        av_free(context);
+        return -1;
+    }
+
+    context->cv_buffer = vda_ctx->cv_buffer;
+    frame->buf[3] = buffer;
+
     return status;
 }
 
@@ -148,7 +165,7 @@ int ff_vda_create_decoder(struct vda_context *vda_ctx,
                           uint8_t *extradata,
                           int extradata_size)
 {
-    OSStatus status = kVDADecoderNoErr;
+    OSStatus status;
     CFNumberRef height;
     CFNumberRef width;
     CFNumberRef format;
@@ -158,7 +175,10 @@ int ff_vda_create_decoder(struct vda_context *vda_ctx,
     CFMutableDictionaryRef io_surface_properties;
     CFNumberRef cv_pix_fmt;
 
-    /* Each VCL NAL in the bistream sent to the decoder
+    vda_ctx->priv_bitstream = NULL;
+    vda_ctx->priv_allocated_size = 0;
+
+    /* Each VCL NAL in the bitstream sent to the decoder
      * is preceded by a 4 bytes length header.
      * Change the avcC atom header if needed, to signal headers of 4 bytes. */
     if (extradata_size >= 4 && (extradata[4] & 0x03) != 0x03) {
@@ -200,9 +220,9 @@ int ff_vda_create_decoder(struct vda_context *vda_ctx,
                                                       0,
                                                       &kCFTypeDictionaryKeyCallBacks,
                                                       &kCFTypeDictionaryValueCallBacks);
-    cv_pix_fmt      = CFNumberCreate(kCFAllocatorDefault,
-                                     kCFNumberSInt32Type,
-                                     &vda_ctx->cv_pix_fmt_type);
+    cv_pix_fmt  = CFNumberCreate(kCFAllocatorDefault,
+                                 kCFNumberSInt32Type,
+                                 &vda_ctx->cv_pix_fmt_type);
     CFDictionarySetValue(buffer_attributes,
                          kCVPixelBufferPixelFormatTypeKey,
                          cv_pix_fmt);
@@ -238,15 +258,6 @@ int ff_vda_destroy_decoder(struct vda_context *vda_ctx)
     return status;
 }
 
-static int vda_h264_uninit(AVCodecContext *avctx)
-{
-    VDAContext *vda = avctx->internal->hwaccel_priv_data;
-    av_freep(&vda->bitstream);
-    if (vda->frame)
-        CVPixelBufferRelease(vda->frame);
-    return 0;
-}
-
 AVHWAccel ff_h264_vda_old_hwaccel = {
     .name           = "h264_vda",
     .type           = AVMEDIA_TYPE_VIDEO,
@@ -255,8 +266,8 @@ AVHWAccel ff_h264_vda_old_hwaccel = {
     .start_frame    = vda_old_h264_start_frame,
     .decode_slice   = vda_old_h264_decode_slice,
     .end_frame      = vda_old_h264_end_frame,
-    .uninit         = vda_h264_uninit,
-    .priv_data_size = sizeof(VDAContext),
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
 };
 
 void ff_vda_output_callback(void *opaque,
@@ -266,7 +277,7 @@ void ff_vda_output_callback(void *opaque,
                             CVImageBufferRef image_buffer)
 {
     AVCodecContext *ctx = opaque;
-    VDAContext *vda = ctx->internal->hwaccel_priv_data;
+    VTContext *vda = ctx->internal->hwaccel_priv_data;
 
 
     if (vda->frame) {
@@ -280,50 +291,10 @@ void ff_vda_output_callback(void *opaque,
     vda->frame = CVPixelBufferRetain(image_buffer);
 }
 
-static int vda_h264_start_frame(AVCodecContext *avctx,
-                                const uint8_t *buffer,
-                                uint32_t size)
-{
-    VDAContext *vda = avctx->internal->hwaccel_priv_data;
-
-    vda->bitstream_size = 0;
-
-    return 0;
-}
-
-static int vda_h264_decode_slice(AVCodecContext *avctx,
-                                 const uint8_t *buffer,
-                                 uint32_t size)
-{
-    VDAContext *vda       = avctx->internal->hwaccel_priv_data;
-    void *tmp;
-
-    tmp = av_fast_realloc(vda->bitstream,
-                          &vda->allocated_size,
-                          vda->bitstream_size + size + 4);
-    if (!tmp)
-        return AVERROR(ENOMEM);
-
-    vda->bitstream = tmp;
-
-    AV_WB32(vda->bitstream + vda->bitstream_size, size);
-    memcpy(vda->bitstream + vda->bitstream_size + 4, buffer, size);
-
-    vda->bitstream_size += size + 4;
-
-    return 0;
-}
-
-static void release_buffer(void *opaque, uint8_t *data)
-{
-    CVImageBufferRef frame = (CVImageBufferRef)data;
-    CVPixelBufferRelease(frame);
-}
-
 static int vda_h264_end_frame(AVCodecContext *avctx)
 {
     H264Context *h        = avctx->priv_data;
-    VDAContext *vda       = avctx->internal->hwaccel_priv_data;
+    VTContext *vda        = avctx->internal->hwaccel_priv_data;
     AVVDAContext *vda_ctx = avctx->hwaccel_context;
     AVFrame *frame        = h->cur_pic_ptr->f;
     uint32_t flush_flags  = 1 << 0; ///< kVDADecoderFlush_emitFrames
@@ -353,19 +324,7 @@ static int vda_h264_end_frame(AVCodecContext *avctx)
         return AVERROR_UNKNOWN;
     }
 
-    av_buffer_unref(&frame->buf[0]);
-
-    frame->buf[0] = av_buffer_create((uint8_t*)vda->frame,
-                                     sizeof(vda->frame),
-                                     release_buffer, NULL,
-                                     AV_BUFFER_FLAG_READONLY);
-    if (!frame->buf[0])
-        return AVERROR(ENOMEM);
-
-    frame->data[3] = (uint8_t*)vda->frame;
-    vda->frame = NULL;
-
-    return 0;
+    return ff_videotoolbox_buffer_create(vda, frame);
 }
 
 int ff_vda_default_init(AVCodecContext *avctx)
@@ -384,26 +343,7 @@ int ff_vda_default_init(AVCodecContext *avctx)
 
     // kCVPixelFormatType_420YpCbCr8Planar;
 
-    /* Each VCL NAL in the bistream sent to the decoder
-     * is preceded by a 4 bytes length header.
-     * Change the avcC atom header if needed, to signal headers of 4 bytes. */
-    if (avctx->extradata_size >= 4 && (avctx->extradata[4] & 0x03) != 0x03) {
-        uint8_t *rw_extradata;
-
-        if (!(rw_extradata = av_malloc(avctx->extradata_size)))
-            return AVERROR(ENOMEM);
-
-        memcpy(rw_extradata, avctx->extradata, avctx->extradata_size);
-
-        rw_extradata[4] |= 0x03;
-
-        avc_data = CFDataCreate(kCFAllocatorDefault, rw_extradata, avctx->extradata_size);
-
-        av_freep(&rw_extradata);
-    } else {
-        avc_data = CFDataCreate(kCFAllocatorDefault,
-                                avctx->extradata, avctx->extradata_size);
-    }
+    avc_data = ff_videotoolbox_avcc_extradata_create(avctx);
 
     config_info = CFDictionaryCreateMutable(kCFAllocatorDefault,
                                             4,
@@ -471,27 +411,15 @@ int ff_vda_default_init(AVCodecContext *avctx)
     }
 }
 
-static int vda_h264_alloc_frame(AVCodecContext *avctx, AVFrame *frame)
-{
-    frame->width  = avctx->width;
-    frame->height = avctx->height;
-    frame->format = avctx->pix_fmt;
-    frame->buf[0] = av_buffer_alloc(1);
-
-    if (!frame->buf[0])
-        return AVERROR(ENOMEM);
-    return 0;
-}
-
 AVHWAccel ff_h264_vda_hwaccel = {
     .name           = "h264_vda",
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_H264,
     .pix_fmt        = AV_PIX_FMT_VDA,
-    .alloc_frame    = vda_h264_alloc_frame,
-    .start_frame    = vda_h264_start_frame,
-    .decode_slice   = vda_h264_decode_slice,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = ff_videotoolbox_h264_start_frame,
+    .decode_slice   = ff_videotoolbox_h264_decode_slice,
     .end_frame      = vda_h264_end_frame,
-    .uninit         = vda_h264_uninit,
-    .priv_data_size = sizeof(VDAContext),
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
 };
diff --git a/libavcodec/vda_h264_dec.c b/libavcodec/vda_h264_dec.c
new file mode 100644
index 0000000000..a092693dbc
--- /dev/null
+++ b/libavcodec/vda_h264_dec.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2012, Xidorn Quan
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 decoder via VDA
+ * @author Xidorn Quan <quanxunzhen@gmail.com>
+ */
+
+#include <string.h>
+#include <CoreFoundation/CoreFoundation.h>
+
+#include "vda.h"
+#include "h264.h"
+#include "avcodec.h"
+
+#ifndef kCFCoreFoundationVersionNumber10_7
+#define kCFCoreFoundationVersionNumber10_7      635.00
+#endif
+
+extern AVCodec ff_h264_decoder, ff_h264_vda_decoder;
+
+static const enum AVPixelFormat vda_pixfmts_prior_10_7[] = {
+    AV_PIX_FMT_UYVY422,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE
+};
+
+static const enum AVPixelFormat vda_pixfmts[] = {
+    AV_PIX_FMT_UYVY422,
+    AV_PIX_FMT_YUYV422,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE
+};
+
+typedef struct {
+    H264Context h264ctx;
+    int h264_initialized;
+    struct vda_context vda_ctx;
+    enum AVPixelFormat pix_fmt;
+
+    /* for backing-up fields set by user.
+     * we have to gain full control of such fields here */
+    void *hwaccel_context;
+    enum AVPixelFormat (*get_format)(struct AVCodecContext *s, const enum AVPixelFormat * fmt);
+    int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
+} VDADecoderContext;
+
+static enum AVPixelFormat get_format(struct AVCodecContext *avctx,
+        const enum AVPixelFormat *fmt)
+{
+    return AV_PIX_FMT_VDA_VLD;
+}
+
+typedef struct {
+    CVPixelBufferRef cv_buffer;
+} VDABufferContext;
+
+static void release_buffer(void *opaque, uint8_t *data)
+{
+    VDABufferContext *context = opaque;
+    CVPixelBufferUnlockBaseAddress(context->cv_buffer, 0);
+    CVPixelBufferRelease(context->cv_buffer);
+    av_free(context);
+}
+
+static int get_buffer2(AVCodecContext *avctx, AVFrame *pic, int flag)
+{
+    VDABufferContext *context = av_mallocz(sizeof(VDABufferContext));
+    AVBufferRef *buffer = av_buffer_create(NULL, 0, release_buffer, context, 0);
+    if (!context || !buffer) {
+        av_free(context);
+        return AVERROR(ENOMEM);
+    }
+
+    pic->buf[0] = buffer;
+    pic->data[0] = (void *)1;
+    return 0;
+}
+
+static inline void set_context(AVCodecContext *avctx)
+{
+    VDADecoderContext *ctx = avctx->priv_data;
+    ctx->hwaccel_context = avctx->hwaccel_context;
+    avctx->hwaccel_context = &ctx->vda_ctx;
+    ctx->get_format = avctx->get_format;
+    avctx->get_format = get_format;
+    ctx->get_buffer2 = avctx->get_buffer2;
+    avctx->get_buffer2 = get_buffer2;
+}
+
+static inline void restore_context(AVCodecContext *avctx)
+{
+    VDADecoderContext *ctx = avctx->priv_data;
+    avctx->hwaccel_context = ctx->hwaccel_context;
+    avctx->get_format = ctx->get_format;
+    avctx->get_buffer2 = ctx->get_buffer2;
+}
+
+static int vdadec_decode(AVCodecContext *avctx,
+        void *data, int *got_frame, AVPacket *avpkt)
+{
+    VDADecoderContext *ctx = avctx->priv_data;
+    AVFrame *pic = data;
+    int ret;
+
+    set_context(avctx);
+    ret = ff_h264_decoder.decode(avctx, data, got_frame, avpkt);
+    restore_context(avctx);
+    if (*got_frame) {
+        AVBufferRef *buffer = pic->buf[0];
+        VDABufferContext *context = av_buffer_get_opaque(buffer);
+        CVPixelBufferRef cv_buffer = (CVPixelBufferRef)pic->data[3];
+
+        CVPixelBufferRetain(cv_buffer);
+        CVPixelBufferLockBaseAddress(cv_buffer, 0);
+        context->cv_buffer = cv_buffer;
+        pic->format = ctx->pix_fmt;
+        if (CVPixelBufferIsPlanar(cv_buffer)) {
+            int i, count = CVPixelBufferGetPlaneCount(cv_buffer);
+            av_assert0(count < 4);
+            for (i = 0; i < count; i++) {
+                pic->data[i] = CVPixelBufferGetBaseAddressOfPlane(cv_buffer, i);
+                pic->linesize[i] = CVPixelBufferGetBytesPerRowOfPlane(cv_buffer, i);
+            }
+        } else {
+            pic->data[0] = CVPixelBufferGetBaseAddress(cv_buffer);
+            pic->linesize[0] = CVPixelBufferGetBytesPerRow(cv_buffer);
+        }
+    }
+    avctx->pix_fmt = ctx->pix_fmt;
+
+    return ret;
+}
+
+static av_cold int vdadec_close(AVCodecContext *avctx)
+{
+    VDADecoderContext *ctx = avctx->priv_data;
+    /* release buffers and decoder */
+    ff_vda_destroy_decoder(&ctx->vda_ctx);
+    /* close H.264 decoder */
+    if (ctx->h264_initialized) {
+        set_context(avctx);
+        ff_h264_decoder.close(avctx);
+        restore_context(avctx);
+    }
+    return 0;
+}
+
+static av_cold int vdadec_init(AVCodecContext *avctx)
+{
+    VDADecoderContext *ctx = avctx->priv_data;
+    struct vda_context *vda_ctx = &ctx->vda_ctx;
+    OSStatus status;
+    int ret, i;
+
+    ctx->h264_initialized = 0;
+
+    /* init pix_fmts of codec */
+    if (!ff_h264_vda_decoder.pix_fmts) {
+        if (kCFCoreFoundationVersionNumber < kCFCoreFoundationVersionNumber10_7)
+            ff_h264_vda_decoder.pix_fmts = vda_pixfmts_prior_10_7;
+        else
+            ff_h264_vda_decoder.pix_fmts = vda_pixfmts;
+    }
+
+    /* init vda */
+    memset(vda_ctx, 0, sizeof(struct vda_context));
+    vda_ctx->width = avctx->width;
+    vda_ctx->height = avctx->height;
+    vda_ctx->format = 'avc1';
+    vda_ctx->use_sync_decoding = 1;
+    vda_ctx->use_ref_buffer = 1;
+    ctx->pix_fmt = avctx->get_format(avctx, avctx->codec->pix_fmts);
+    switch (ctx->pix_fmt) {
+    case AV_PIX_FMT_UYVY422:
+        vda_ctx->cv_pix_fmt_type = '2vuy';
+        break;
+    case AV_PIX_FMT_YUYV422:
+        vda_ctx->cv_pix_fmt_type = 'yuvs';
+        break;
+    case AV_PIX_FMT_NV12:
+        vda_ctx->cv_pix_fmt_type = '420v';
+        break;
+    case AV_PIX_FMT_YUV420P:
+        vda_ctx->cv_pix_fmt_type = 'y420';
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format: %d\n", avctx->pix_fmt);
+        goto failed;
+    }
+    status = ff_vda_create_decoder(vda_ctx,
+                                   avctx->extradata, avctx->extradata_size);
+    if (status != kVDADecoderNoErr) {
+        av_log(avctx, AV_LOG_ERROR,
+                "Failed to init VDA decoder: %d.\n", status);
+        goto failed;
+    }
+
+    /* init H.264 decoder */
+    set_context(avctx);
+    ret = ff_h264_decoder.init(avctx);
+    restore_context(avctx);
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to open H.264 decoder.\n");
+        goto failed;
+    }
+    ctx->h264_initialized = 1;
+
+    for (i = 0; i < MAX_SPS_COUNT; i++) {
+        SPS *sps = ctx->h264ctx.sps_buffers[i];
+        if (sps && (sps->bit_depth_luma != 8 ||
+                sps->chroma_format_idc == 2 ||
+                sps->chroma_format_idc == 3)) {
+            av_log(avctx, AV_LOG_ERROR, "Format is not supported.\n");
+            goto failed;
+        }
+    }
+
+    return 0;
+
+failed:
+    vdadec_close(avctx);
+    return -1;
+}
+
+static void vdadec_flush(AVCodecContext *avctx)
+{
+    set_context(avctx);
+    ff_h264_decoder.flush(avctx);
+    restore_context(avctx);
+}
+
+AVCodec ff_h264_vda_decoder = {
+    .name           = "h264_vda",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .priv_data_size = sizeof(VDADecoderContext),
+    .init           = vdadec_init,
+    .close          = vdadec_close,
+    .decode         = vdadec_decode,
+    .capabilities   = AV_CODEC_CAP_DELAY,
+    .flush          = vdadec_flush,
+    .long_name      = NULL_IF_CONFIG_SMALL("H.264 (VDA acceleration)"),
+};
diff --git a/libavcodec/vda_internal.h b/libavcodec/vda_internal.h
deleted file mode 100644
index 9d0ed807bf..0000000000
--- a/libavcodec/vda_internal.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_VDA_INTERNAL_H
-#define AVCODEC_VDA_INTERNAL_H
-
-#include "vda.h"
-
-void ff_vda_output_callback(void *vda_hw_ctx,
-                            CFDictionaryRef user_info,
-                            OSStatus status,
-                            uint32_t infoFlags,
-                            CVImageBufferRef image_buffer);
-
-int ff_vda_default_init(AVCodecContext *avctx);
-void ff_vda_default_free(AVCodecContext *avctx);
-
-#endif /* AVCODEC_VDA_INTERNAL_H */
diff --git a/libavcodec/vda_vt_internal.h b/libavcodec/vda_vt_internal.h
new file mode 100644
index 0000000000..9ff63ccc52
--- /dev/null
+++ b/libavcodec/vda_vt_internal.h
@@ -0,0 +1,55 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VDA_VT_INTERNAL_H
+#define AVCODEC_VDA_VT_INTERNAL_H
+
+void ff_vda_output_callback(void *vda_hw_ctx,
+                            CFDictionaryRef user_info,
+                            OSStatus status,
+                            uint32_t infoFlags,
+                            CVImageBufferRef image_buffer);
+
+int ff_vda_default_init(AVCodecContext *avctx);
+void ff_vda_default_free(AVCodecContext *avctx);
+
+typedef struct VTContext {
+    // The current bitstream buffer.
+    uint8_t                     *bitstream;
+
+    // The current size of the bitstream.
+    int                         bitstream_size;
+
+    // The reference size used for fast reallocation.
+    int                         allocated_size;
+
+    // The core video buffer
+    CVImageBufferRef            frame;
+} VTContext;
+
+int ff_videotoolbox_alloc_frame(AVCodecContext *avctx, AVFrame *frame);
+int ff_videotoolbox_uninit(AVCodecContext *avctx);
+int ff_videotoolbox_buffer_create(VTContext *vtctx, AVFrame *frame);
+int ff_videotoolbox_h264_start_frame(AVCodecContext *avctx,
+                                     const uint8_t *buffer,
+                                     uint32_t size);
+int ff_videotoolbox_h264_decode_slice(AVCodecContext *avctx,
+                                      const uint8_t *buffer,
+                                      uint32_t size);
+CFDataRef ff_videotoolbox_avcc_extradata_create(AVCodecContext *avctx);
+#endif /* AVCODEC_VDA_VT_INTERNAL_H */
diff --git a/libavcodec/vdpau.c b/libavcodec/vdpau.c
index 77b649b9ff..60c7235bac 100644
--- a/libavcodec/vdpau.c
+++ b/libavcodec/vdpau.c
@@ -4,20 +4,20 @@
  *
  * Copyright (c) 2008 NVIDIA
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include <assert.h>
 
 #include "vdpau.h"
+#include "vdpau_compat.h"
 #include "vdpau_internal.h"
 
 /**
@@ -64,6 +65,13 @@ static int vdpau_error(VdpStatus status)
     }
 }
 
+AVVDPAUContext *av_alloc_vdpaucontext(void)
+{
+    return av_vdpau_alloc_context();
+}
+
+MAKE_ACCESSORS(AVVDPAUContext, vdpau_hwaccel, AVVDPAU_Render2, render2)
+
 int av_vdpau_get_surface_parameters(AVCodecContext *avctx,
                                     VdpChromaType *type,
                                     uint32_t *width, uint32_t *height)
@@ -122,7 +130,12 @@ int ff_vdpau_common_init(AVCodecContext *avctx, VdpDecoderProfile profile,
 
     vdctx->width            = UINT32_MAX;
     vdctx->height           = UINT32_MAX;
-    hwctx->reset            = 0;
+
+    if (!hwctx) {
+        vdctx->device  = VDP_INVALID_HANDLE;
+        av_log(avctx, AV_LOG_WARNING, "hwaccel_context has not been setup by the user application, cannot initialize\n");
+        return 0;
+    }
 
     if (hwctx->context.decoder != VDP_INVALID_HANDLE) {
         vdctx->decoder = hwctx->context.decoder;
@@ -130,6 +143,7 @@ int ff_vdpau_common_init(AVCodecContext *avctx, VdpDecoderProfile profile,
         vdctx->device  = VDP_INVALID_HANDLE;
         return 0; /* Decoder created by user */
     }
+    hwctx->reset            = 0;
 
     vdctx->device           = hwctx->device;
     vdctx->get_proc_address = hwctx->get_proc_address;
@@ -263,6 +277,7 @@ int ff_vdpau_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
                               struct vdpau_picture_context *pic_ctx)
 {
     VDPAUContext *vdctx = avctx->internal->hwaccel_priv_data;
+    AVVDPAUContext *hwctx = avctx->hwaccel_context;
     VdpVideoSurface surf = ff_vdpau_get_surface_id(frame);
     VdpStatus status;
     int val;
@@ -271,11 +286,34 @@ int ff_vdpau_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
     if (val < 0)
         return val;
 
+#if FF_API_BUFS_VDPAU
+FF_DISABLE_DEPRECATION_WARNINGS
+    av_assert0(sizeof(hwctx->info) <= sizeof(pic_ctx->info));
+    memcpy(&hwctx->info, &pic_ctx->info, sizeof(hwctx->info));
+    hwctx->bitstream_buffers = pic_ctx->bitstream_buffers;
+    hwctx->bitstream_buffers_used = pic_ctx->bitstream_buffers_used;
+    hwctx->bitstream_buffers_allocated = pic_ctx->bitstream_buffers_allocated;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    if (!hwctx->render && hwctx->render2) {
+        status = hwctx->render2(avctx, frame, (void *)&pic_ctx->info,
+                                pic_ctx->bitstream_buffers_used, pic_ctx->bitstream_buffers);
+    } else
     status = vdctx->render(vdctx->decoder, surf, (void *)&pic_ctx->info,
                            pic_ctx->bitstream_buffers_used,
                            pic_ctx->bitstream_buffers);
 
     av_freep(&pic_ctx->bitstream_buffers);
+
+#if FF_API_BUFS_VDPAU
+FF_DISABLE_DEPRECATION_WARNINGS
+    hwctx->bitstream_buffers = NULL;
+    hwctx->bitstream_buffers_used = 0;
+    hwctx->bitstream_buffers_allocated = 0;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
     return vdpau_error(status);
 }
 
@@ -317,6 +355,345 @@ int ff_vdpau_add_buffer(struct vdpau_picture_context *pic_ctx,
     return 0;
 }
 
+/* Obsolete non-hwaccel VDPAU support below... */
+
+#if FF_API_VDPAU
+void ff_vdpau_add_data_chunk(uint8_t *data, const uint8_t *buf, int buf_size)
+{
+    struct vdpau_render_state *render = (struct vdpau_render_state*)data;
+    assert(render);
+
+    render->bitstream_buffers= av_fast_realloc(
+        render->bitstream_buffers,
+        &render->bitstream_buffers_allocated,
+        sizeof(*render->bitstream_buffers)*(render->bitstream_buffers_used + 1)
+    );
+
+    render->bitstream_buffers[render->bitstream_buffers_used].struct_version  = VDP_BITSTREAM_BUFFER_VERSION;
+    render->bitstream_buffers[render->bitstream_buffers_used].bitstream       = buf;
+    render->bitstream_buffers[render->bitstream_buffers_used].bitstream_bytes = buf_size;
+    render->bitstream_buffers_used++;
+}
+
+#if CONFIG_H264_VDPAU_DECODER
+void ff_vdpau_h264_set_reference_frames(H264Context *h)
+{
+    struct vdpau_render_state *render, *render_ref;
+    VdpReferenceFrameH264 *rf, *rf2;
+    H264Picture *pic;
+    int i, list, pic_frame_idx;
+
+    render = (struct vdpau_render_state *)h->cur_pic_ptr->f->data[0];
+    assert(render);
+
+    rf = &render->info.h264.referenceFrames[0];
+#define H264_RF_COUNT FF_ARRAY_ELEMS(render->info.h264.referenceFrames)
+
+    for (list = 0; list < 2; ++list) {
+        H264Picture **lp = list ? h->long_ref : h->short_ref;
+        int ls = list ? 16 : h->short_ref_count;
+
+        for (i = 0; i < ls; ++i) {
+            pic = lp[i];
+            if (!pic || !pic->reference)
+                continue;
+            pic_frame_idx = pic->long_ref ? pic->pic_id : pic->frame_num;
+
+            render_ref = (struct vdpau_render_state *)pic->f->data[0];
+            assert(render_ref);
+
+            rf2 = &render->info.h264.referenceFrames[0];
+            while (rf2 != rf) {
+                if (
+                    (rf2->surface == render_ref->surface)
+                    && (rf2->is_long_term == pic->long_ref)
+                    && (rf2->frame_idx == pic_frame_idx)
+                )
+                    break;
+                ++rf2;
+            }
+            if (rf2 != rf) {
+                rf2->top_is_reference    |= (pic->reference & PICT_TOP_FIELD)    ? VDP_TRUE : VDP_FALSE;
+                rf2->bottom_is_reference |= (pic->reference & PICT_BOTTOM_FIELD) ? VDP_TRUE : VDP_FALSE;
+                continue;
+            }
+
+            if (rf >= &render->info.h264.referenceFrames[H264_RF_COUNT])
+                continue;
+
+            rf->surface             = render_ref->surface;
+            rf->is_long_term        = pic->long_ref;
+            rf->top_is_reference    = (pic->reference & PICT_TOP_FIELD)    ? VDP_TRUE : VDP_FALSE;
+            rf->bottom_is_reference = (pic->reference & PICT_BOTTOM_FIELD) ? VDP_TRUE : VDP_FALSE;
+            rf->field_order_cnt[0]  = pic->field_poc[0];
+            rf->field_order_cnt[1]  = pic->field_poc[1];
+            rf->frame_idx           = pic_frame_idx;
+
+            ++rf;
+        }
+    }
+
+    for (; rf < &render->info.h264.referenceFrames[H264_RF_COUNT]; ++rf) {
+        rf->surface             = VDP_INVALID_HANDLE;
+        rf->is_long_term        = 0;
+        rf->top_is_reference    = 0;
+        rf->bottom_is_reference = 0;
+        rf->field_order_cnt[0]  = 0;
+        rf->field_order_cnt[1]  = 0;
+        rf->frame_idx           = 0;
+    }
+}
+
+void ff_vdpau_h264_picture_start(H264Context *h)
+{
+    struct vdpau_render_state *render;
+    int i;
+
+    render = (struct vdpau_render_state *)h->cur_pic_ptr->f->data[0];
+    assert(render);
+
+    for (i = 0; i < 2; ++i) {
+        int foc = h->cur_pic_ptr->field_poc[i];
+        if (foc == INT_MAX)
+            foc = 0;
+        render->info.h264.field_order_cnt[i] = foc;
+    }
+
+    render->info.h264.frame_num = h->frame_num;
+}
+
+void ff_vdpau_h264_picture_complete(H264Context *h)
+{
+    struct vdpau_render_state *render;
+
+    render = (struct vdpau_render_state *)h->cur_pic_ptr->f->data[0];
+    assert(render);
+
+    render->info.h264.slice_count = h->current_slice;
+    if (render->info.h264.slice_count < 1)
+        return;
+
+    render->info.h264.is_reference                           = (h->cur_pic_ptr->reference & 3) ? VDP_TRUE : VDP_FALSE;
+    render->info.h264.field_pic_flag                         = h->picture_structure != PICT_FRAME;
+    render->info.h264.bottom_field_flag                      = h->picture_structure == PICT_BOTTOM_FIELD;
+    render->info.h264.num_ref_frames                         = h->sps.ref_frame_count;
+    render->info.h264.mb_adaptive_frame_field_flag           = h->sps.mb_aff && !render->info.h264.field_pic_flag;
+    render->info.h264.constrained_intra_pred_flag            = h->pps.constrained_intra_pred;
+    render->info.h264.weighted_pred_flag                     = h->pps.weighted_pred;
+    render->info.h264.weighted_bipred_idc                    = h->pps.weighted_bipred_idc;
+    render->info.h264.frame_mbs_only_flag                    = h->sps.frame_mbs_only_flag;
+    render->info.h264.transform_8x8_mode_flag                = h->pps.transform_8x8_mode;
+    render->info.h264.chroma_qp_index_offset                 = h->pps.chroma_qp_index_offset[0];
+    render->info.h264.second_chroma_qp_index_offset          = h->pps.chroma_qp_index_offset[1];
+    render->info.h264.pic_init_qp_minus26                    = h->pps.init_qp - 26;
+    render->info.h264.num_ref_idx_l0_active_minus1           = h->pps.ref_count[0] - 1;
+    render->info.h264.num_ref_idx_l1_active_minus1           = h->pps.ref_count[1] - 1;
+    render->info.h264.log2_max_frame_num_minus4              = h->sps.log2_max_frame_num - 4;
+    render->info.h264.pic_order_cnt_type                     = h->sps.poc_type;
+    render->info.h264.log2_max_pic_order_cnt_lsb_minus4      = h->sps.poc_type ? 0 : h->sps.log2_max_poc_lsb - 4;
+    render->info.h264.delta_pic_order_always_zero_flag       = h->sps.delta_pic_order_always_zero_flag;
+    render->info.h264.direct_8x8_inference_flag              = h->sps.direct_8x8_inference_flag;
+    render->info.h264.entropy_coding_mode_flag               = h->pps.cabac;
+    render->info.h264.pic_order_present_flag                 = h->pps.pic_order_present;
+    render->info.h264.deblocking_filter_control_present_flag = h->pps.deblocking_filter_parameters_present;
+    render->info.h264.redundant_pic_cnt_present_flag         = h->pps.redundant_pic_cnt_present;
+    memcpy(render->info.h264.scaling_lists_4x4, h->pps.scaling_matrix4, sizeof(render->info.h264.scaling_lists_4x4));
+    memcpy(render->info.h264.scaling_lists_8x8[0], h->pps.scaling_matrix8[0], sizeof(render->info.h264.scaling_lists_8x8[0]));
+    memcpy(render->info.h264.scaling_lists_8x8[1], h->pps.scaling_matrix8[3], sizeof(render->info.h264.scaling_lists_8x8[0]));
+
+    ff_h264_draw_horiz_band(h, &h->slice_ctx[0], 0, h->avctx->height);
+    render->bitstream_buffers_used = 0;
+}
+#endif /* CONFIG_H264_VDPAU_DECODER */
+
+#if CONFIG_MPEG_VDPAU_DECODER || CONFIG_MPEG1_VDPAU_DECODER
+void ff_vdpau_mpeg_picture_complete(MpegEncContext *s, const uint8_t *buf,
+                                    int buf_size, int slice_count)
+{
+    struct vdpau_render_state *render, *last, *next;
+    int i;
+
+    if (!s->current_picture_ptr) return;
+
+    render = (struct vdpau_render_state *)s->current_picture_ptr->f->data[0];
+    assert(render);
+
+    /* fill VdpPictureInfoMPEG1Or2 struct */
+    render->info.mpeg.picture_structure          = s->picture_structure;
+    render->info.mpeg.picture_coding_type        = s->pict_type;
+    render->info.mpeg.intra_dc_precision         = s->intra_dc_precision;
+    render->info.mpeg.frame_pred_frame_dct       = s->frame_pred_frame_dct;
+    render->info.mpeg.concealment_motion_vectors = s->concealment_motion_vectors;
+    render->info.mpeg.intra_vlc_format           = s->intra_vlc_format;
+    render->info.mpeg.alternate_scan             = s->alternate_scan;
+    render->info.mpeg.q_scale_type               = s->q_scale_type;
+    render->info.mpeg.top_field_first            = s->top_field_first;
+    render->info.mpeg.full_pel_forward_vector    = s->full_pel[0]; // MPEG-1 only.  Set 0 for MPEG-2
+    render->info.mpeg.full_pel_backward_vector   = s->full_pel[1]; // MPEG-1 only.  Set 0 for MPEG-2
+    render->info.mpeg.f_code[0][0]               = s->mpeg_f_code[0][0]; // For MPEG-1 fill both horiz. & vert.
+    render->info.mpeg.f_code[0][1]               = s->mpeg_f_code[0][1];
+    render->info.mpeg.f_code[1][0]               = s->mpeg_f_code[1][0];
+    render->info.mpeg.f_code[1][1]               = s->mpeg_f_code[1][1];
+    for (i = 0; i < 64; ++i) {
+        render->info.mpeg.intra_quantizer_matrix[i]     = s->intra_matrix[i];
+        render->info.mpeg.non_intra_quantizer_matrix[i] = s->inter_matrix[i];
+    }
+
+    render->info.mpeg.forward_reference          = VDP_INVALID_HANDLE;
+    render->info.mpeg.backward_reference         = VDP_INVALID_HANDLE;
+
+    switch(s->pict_type){
+    case  AV_PICTURE_TYPE_B:
+        next = (struct vdpau_render_state *)s->next_picture.f->data[0];
+        assert(next);
+        render->info.mpeg.backward_reference     = next->surface;
+        // no return here, going to set forward prediction
+    case  AV_PICTURE_TYPE_P:
+        last = (struct vdpau_render_state *)s->last_picture.f->data[0];
+        if (!last) // FIXME: Does this test make sense?
+            last = render; // predict second field from the first
+        render->info.mpeg.forward_reference      = last->surface;
+    }
+
+    ff_vdpau_add_data_chunk(s->current_picture_ptr->f->data[0], buf, buf_size);
+
+    render->info.mpeg.slice_count                = slice_count;
+
+    if (slice_count)
+        ff_mpeg_draw_horiz_band(s, 0, s->avctx->height);
+    render->bitstream_buffers_used               = 0;
+}
+#endif /* CONFIG_MPEG_VDPAU_DECODER || CONFIG_MPEG1_VDPAU_DECODER */
+
+#if CONFIG_VC1_VDPAU_DECODER
+void ff_vdpau_vc1_decode_picture(MpegEncContext *s, const uint8_t *buf,
+                                 int buf_size)
+{
+    VC1Context *v = s->avctx->priv_data;
+    struct vdpau_render_state *render, *last, *next;
+
+    render = (struct vdpau_render_state *)s->current_picture.f->data[0];
+    assert(render);
+
+    /*  fill LvPictureInfoVC1 struct */
+    render->info.vc1.frame_coding_mode  = v->fcm ? v->fcm + 1 : 0;
+    render->info.vc1.postprocflag       = v->postprocflag;
+    render->info.vc1.pulldown           = v->broadcast;
+    render->info.vc1.interlace          = v->interlace;
+    render->info.vc1.tfcntrflag         = v->tfcntrflag;
+    render->info.vc1.finterpflag        = v->finterpflag;
+    render->info.vc1.psf                = v->psf;
+    render->info.vc1.dquant             = v->dquant;
+    render->info.vc1.panscan_flag       = v->panscanflag;
+    render->info.vc1.refdist_flag       = v->refdist_flag;
+    render->info.vc1.quantizer          = v->quantizer_mode;
+    render->info.vc1.extended_mv        = v->extended_mv;
+    render->info.vc1.extended_dmv       = v->extended_dmv;
+    render->info.vc1.overlap            = v->overlap;
+    render->info.vc1.vstransform        = v->vstransform;
+    render->info.vc1.loopfilter         = v->s.loop_filter;
+    render->info.vc1.fastuvmc           = v->fastuvmc;
+    render->info.vc1.range_mapy_flag    = v->range_mapy_flag;
+    render->info.vc1.range_mapy         = v->range_mapy;
+    render->info.vc1.range_mapuv_flag   = v->range_mapuv_flag;
+    render->info.vc1.range_mapuv        = v->range_mapuv;
+    /* Specific to simple/main profile only */
+    render->info.vc1.multires           = v->multires;
+    render->info.vc1.syncmarker         = v->resync_marker;
+    render->info.vc1.rangered           = v->rangered | (v->rangeredfrm << 1);
+    render->info.vc1.maxbframes         = v->s.max_b_frames;
+
+    render->info.vc1.deblockEnable      = v->postprocflag & 1;
+    render->info.vc1.pquant             = v->pq;
+
+    render->info.vc1.forward_reference  = VDP_INVALID_HANDLE;
+    render->info.vc1.backward_reference = VDP_INVALID_HANDLE;
+
+    if (v->bi_type)
+        render->info.vc1.picture_type = 4;
+    else
+        render->info.vc1.picture_type = s->pict_type - 1 + s->pict_type / 3;
+
+    switch(s->pict_type){
+    case  AV_PICTURE_TYPE_B:
+        next = (struct vdpau_render_state *)s->next_picture.f->data[0];
+        assert(next);
+        render->info.vc1.backward_reference = next->surface;
+        // no break here, going to set forward prediction
+    case  AV_PICTURE_TYPE_P:
+        last = (struct vdpau_render_state *)s->last_picture.f->data[0];
+        if (!last) // FIXME: Does this test make sense?
+            last = render; // predict second field from the first
+        render->info.vc1.forward_reference = last->surface;
+    }
+
+    ff_vdpau_add_data_chunk(s->current_picture_ptr->f->data[0], buf, buf_size);
+
+    render->info.vc1.slice_count          = 1;
+
+    ff_mpeg_draw_horiz_band(s, 0, s->avctx->height);
+    render->bitstream_buffers_used        = 0;
+}
+#endif /* (CONFIG_VC1_VDPAU_DECODER */
+
+#if CONFIG_MPEG4_VDPAU_DECODER
+void ff_vdpau_mpeg4_decode_picture(Mpeg4DecContext *ctx, const uint8_t *buf,
+                                   int buf_size)
+{
+    MpegEncContext *s = &ctx->m;
+    struct vdpau_render_state *render, *last, *next;
+    int i;
+
+    if (!s->current_picture_ptr) return;
+
+    render = (struct vdpau_render_state *)s->current_picture_ptr->f->data[0];
+    assert(render);
+
+    /* fill VdpPictureInfoMPEG4Part2 struct */
+    render->info.mpeg4.trd[0]                            = s->pp_time;
+    render->info.mpeg4.trb[0]                            = s->pb_time;
+    render->info.mpeg4.trd[1]                            = s->pp_field_time >> 1;
+    render->info.mpeg4.trb[1]                            = s->pb_field_time >> 1;
+    render->info.mpeg4.vop_time_increment_resolution     = s->avctx->time_base.den;
+    render->info.mpeg4.vop_coding_type                   = 0;
+    render->info.mpeg4.vop_fcode_forward                 = s->f_code;
+    render->info.mpeg4.vop_fcode_backward                = s->b_code;
+    render->info.mpeg4.resync_marker_disable             = !ctx->resync_marker;
+    render->info.mpeg4.interlaced                        = !s->progressive_sequence;
+    render->info.mpeg4.quant_type                        = s->mpeg_quant;
+    render->info.mpeg4.quarter_sample                    = s->quarter_sample;
+    render->info.mpeg4.short_video_header                = s->avctx->codec->id == AV_CODEC_ID_H263;
+    render->info.mpeg4.rounding_control                  = s->no_rounding;
+    render->info.mpeg4.alternate_vertical_scan_flag      = s->alternate_scan;
+    render->info.mpeg4.top_field_first                   = s->top_field_first;
+    for (i = 0; i < 64; ++i) {
+        render->info.mpeg4.intra_quantizer_matrix[i]     = s->intra_matrix[i];
+        render->info.mpeg4.non_intra_quantizer_matrix[i] = s->inter_matrix[i];
+    }
+    render->info.mpeg4.forward_reference                 = VDP_INVALID_HANDLE;
+    render->info.mpeg4.backward_reference                = VDP_INVALID_HANDLE;
+
+    switch (s->pict_type) {
+    case AV_PICTURE_TYPE_B:
+        next = (struct vdpau_render_state *)s->next_picture.f->data[0];
+        assert(next);
+        render->info.mpeg4.backward_reference     = next->surface;
+        render->info.mpeg4.vop_coding_type        = 2;
+        // no break here, going to set forward prediction
+    case AV_PICTURE_TYPE_P:
+        last = (struct vdpau_render_state *)s->last_picture.f->data[0];
+        assert(last);
+        render->info.mpeg4.forward_reference      = last->surface;
+    }
+
+    ff_vdpau_add_data_chunk(s->current_picture_ptr->f->data[0], buf, buf_size);
+
+    ff_mpeg_draw_horiz_band(s, 0, s->avctx->height);
+    render->bitstream_buffers_used = 0;
+}
+#endif /* CONFIG_MPEG4_VDPAU_DECODER */
+#endif /* FF_API_VDPAU */
+
 int av_vdpau_get_profile(AVCodecContext *avctx, VdpDecoderProfile *profile)
 {
 #define PROFILE(prof)                      \
diff --git a/libavcodec/vdpau.h b/libavcodec/vdpau.h
index d25ee1ce17..e85e4d9e9a 100644
--- a/libavcodec/vdpau.h
+++ b/libavcodec/vdpau.h
@@ -4,20 +4,20 @@
  *
  * Copyright (C) 2008 NVIDIA
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,7 +39,7 @@
  * - VDPAU decoding
  * - VDPAU presentation
  *
- * The VDPAU decoding module parses all headers using Libav
+ * The VDPAU decoding module parses all headers using FFmpeg
  * parsing mechanisms and uses VDPAU for the actual decoding.
  *
  * As per the current implementation, the actual decoding
@@ -51,7 +51,7 @@
 
 #include <vdpau/vdpau.h>
 #include <vdpau/vdpau_x11.h>
-
+#include "libavutil/avconfig.h"
 #include "libavutil/attributes.h"
 
 #include "avcodec.h"
@@ -66,10 +66,18 @@ union AVVDPAUPictureInfo {
 };
 #endif
 
+struct AVCodecContext;
+struct AVFrame;
+
+typedef int (*AVVDPAU_Render2)(struct AVCodecContext *, struct AVFrame *,
+                               const VdpPictureInfo *, uint32_t,
+                               const VdpBitstreamBuffer *);
+
 /**
  * This structure is used to share data between the libavcodec library and
  * the client video application.
- * The user shall zero-allocate the structure and make it available as
+ * The user shall allocate the structure via the av_alloc_vdpau_hwaccel
+ * function and make it available as
  * AVCodecContext.hwaccel_context. Members can be set by the user once
  * during initialization or through each AVCodecContext.get_buffer()
  * function call. In any case, they must be valid prior to calling
@@ -128,9 +136,20 @@ typedef struct AVVDPAUContext {
     attribute_deprecated
     VdpBitstreamBuffer *bitstream_buffers;
 #endif
+    AVVDPAU_Render2 render2;
 } AVVDPAUContext;
 
 /**
+ * @brief allocation function for AVVDPAUContext
+ *
+ * Allows extending the struct without breaking API/ABI
+ */
+AVVDPAUContext *av_alloc_vdpaucontext(void);
+
+AVVDPAU_Render2 av_vdpau_hwaccel_get_render2(const AVVDPAUContext *);
+void av_vdpau_hwaccel_set_render2(AVVDPAUContext *, AVVDPAU_Render2);
+
+/**
  * Associate a VDPAU device with a codec context for hardware acceleration.
  * This function is meant to be called from the get_format() codec callback,
  * or earlier. It can also be called after avcodec_flush_buffers() to change
@@ -138,7 +157,7 @@ typedef struct AVVDPAUContext {
  * display preemption).
  *
  * @note get_format() must return AV_PIX_FMT_VDPAU if this function completes
- * succesfully.
+ * successfully.
  *
  * @param avctx decoding context whose get_format() callback is invoked
  * @param device VDPAU device handle to use for hardware acceleration
@@ -206,11 +225,11 @@ int av_vdpau_get_profile(AVCodecContext *avctx, VdpDecoderProfile *profile);
 #define FF_VDPAU_STATE_USED_FOR_REFERENCE 2
 
 /**
- * @brief This structure is used as a callback between the Libav
+ * @brief This structure is used as a callback between the FFmpeg
  * decoder (vd_) and presentation (vo_) module.
  * This is used for defining a video frame containing surface,
  * picture parameter, bitstream information etc which are passed
- * between the Libav decoder and its clients.
+ * between the FFmpeg decoder and its clients.
  */
 struct vdpau_render_state {
     VdpVideoSurface surface; ///< Used as rendered surface, never changed.
diff --git a/libavcodec/vdpau_compat.h b/libavcodec/vdpau_compat.h
new file mode 100644
index 0000000000..6b4b0865c9
--- /dev/null
+++ b/libavcodec/vdpau_compat.h
@@ -0,0 +1,48 @@
+/*
+ * Video Decode and Presentation API for UNIX (VDPAU) is used for
+ * HW decode acceleration for MPEG-1/2, H.264 and VC-1.
+ *
+ * Copyright (C) 2008 NVIDIA
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VDPAU_COMPAT_H
+#define AVCODEC_VDPAU_COMPAT_H
+
+#include <stdint.h>
+
+#include "h264.h"
+#include "mpeg4video.h"
+
+void ff_vdpau_add_data_chunk(uint8_t *data, const uint8_t *buf,
+                             int buf_size);
+
+void ff_vdpau_mpeg_picture_complete(MpegEncContext *s, const uint8_t *buf,
+                                    int buf_size, int slice_count);
+
+void ff_vdpau_h264_picture_start(H264Context *h);
+void ff_vdpau_h264_set_reference_frames(H264Context *h);
+void ff_vdpau_h264_picture_complete(H264Context *h);
+
+void ff_vdpau_vc1_decode_picture(MpegEncContext *s, const uint8_t *buf,
+                                 int buf_size);
+
+void ff_vdpau_mpeg4_decode_picture(Mpeg4DecContext *s, const uint8_t *buf,
+                                   int buf_size);
+
+#endif /* AVCODEC_VDPAU_COMPAT_H */
diff --git a/libavcodec/vdpau_h264.c b/libavcodec/vdpau_h264.c
index d03d127ee5..10376ac931 100644
--- a/libavcodec/vdpau_h264.c
+++ b/libavcodec/vdpau_h264.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 NVIDIA
  * Copyright (c) 2013 Rémi Denis-Courmont
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vdpau_hevc.c b/libavcodec/vdpau_hevc.c
new file mode 100644
index 0000000000..3c1dc5f223
--- /dev/null
+++ b/libavcodec/vdpau_hevc.c
@@ -0,0 +1,437 @@
+/*
+ * MPEG-H Part 2 / HEVC / H.265 HW decode acceleration through VDPAU
+ *
+ * Copyright (c) 2013 Philip Langdale
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <vdpau/vdpau.h>
+
+#include "avcodec.h"
+#include "internal.h"
+#include "hevc.h"
+#include "vdpau.h"
+#include "vdpau_internal.h"
+
+static int vdpau_hevc_start_frame(AVCodecContext *avctx,
+                                  const uint8_t *buffer, uint32_t size)
+{
+    HEVCContext *h = avctx->priv_data;
+    HEVCFrame *pic = h->ref;
+    struct vdpau_picture_context *pic_ctx = pic->hwaccel_picture_private;
+
+    VdpPictureInfoHEVC *info = &pic_ctx->info.hevc;
+
+    const HEVCSPS *sps = h->ps.sps;
+    const HEVCPPS *pps = h->ps.pps;
+    const SliceHeader *sh = &h->sh;
+    const ScalingList *sl = pps->scaling_list_data_present_flag ?
+                            &pps->scaling_list : &sps->scaling_list;
+
+    /* init VdpPictureInfoHEVC */
+
+    /* SPS */
+    info->chroma_format_idc = sps->chroma_format_idc;
+    info->separate_colour_plane_flag = sps->separate_colour_plane_flag;
+    info->pic_width_in_luma_samples = sps->width;
+    info->pic_height_in_luma_samples = sps->height;
+    info->bit_depth_luma_minus8 = sps->bit_depth - 8;
+    info->bit_depth_chroma_minus8 = sps->bit_depth - 8;
+    info->log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4;
+    /** Provides the value corresponding to the nuh_temporal_id of the frame
+        to be decoded. */
+    info->sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1;
+    info->log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3;
+    info->log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size;
+    info->log2_min_transform_block_size_minus2 = sps->log2_min_tb_size - 2;
+    info->log2_diff_max_min_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size;
+    info->max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter;
+    info->max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra;
+    info->scaling_list_enabled_flag = sps->scaling_list_enable_flag;
+    /** Scaling lists, in diagonal order, to be used for this frame. */
+    for (size_t i = 0; i < 6; i++) {
+        for (size_t j = 0; j < 16; j++) {
+            /** Scaling List for 4x4 quantization matrix,
+                indexed as ScalingList4x4[matrixId][i]. */
+            uint8_t pos = 4 * ff_hevc_diag_scan4x4_y[j] + ff_hevc_diag_scan4x4_x[j];
+            info->ScalingList4x4[i][j] = sl->sl[0][i][pos];
+        }
+        for (size_t j = 0; j < 64; j++) {
+            uint8_t pos = 8 * ff_hevc_diag_scan8x8_y[j] + ff_hevc_diag_scan8x8_x[j];
+            /** Scaling List for 8x8 quantization matrix,
+                indexed as ScalingList8x8[matrixId][i]. */
+            info->ScalingList8x8[i][j] = sl->sl[1][i][pos];
+            /** Scaling List for 16x16 quantization matrix,
+                indexed as ScalingList16x16[matrixId][i]. */
+            info->ScalingList16x16[i][j] = sl->sl[2][i][pos];
+            if (i < 2) {
+                /** Scaling List for 32x32 quantization matrix,
+                    indexed as ScalingList32x32[matrixId][i]. */
+                info->ScalingList32x32[i][j] = sl->sl[3][i * 3][pos];
+            }
+        }
+        /** Scaling List DC Coefficients for 16x16,
+            indexed as ScalingListDCCoeff16x16[matrixId]. */
+        info->ScalingListDCCoeff16x16[i] = sl->sl_dc[0][i];
+        if (i < 2) {
+            /** Scaling List DC Coefficients for 32x32,
+                indexed as ScalingListDCCoeff32x32[matrixId]. */
+            info->ScalingListDCCoeff32x32[i] = sl->sl_dc[1][i * 3];
+        }
+    }
+    info->amp_enabled_flag = sps->amp_enabled_flag;
+    info->sample_adaptive_offset_enabled_flag = sps->sao_enabled;
+    info->pcm_enabled_flag = sps->pcm_enabled_flag;
+    if (info->pcm_enabled_flag) {
+        /** Only needs to be set if pcm_enabled_flag is set. Ignored otherwise. */
+        info->pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1;
+        /** Only needs to be set if pcm_enabled_flag is set. Ignored otherwise. */
+        info->pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1;
+        /** Only needs to be set if pcm_enabled_flag is set. Ignored otherwise. */
+        info->log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3;
+        /** Only needs to be set if pcm_enabled_flag is set. Ignored otherwise. */
+        info->log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size;
+        /** Only needs to be set if pcm_enabled_flag is set. Ignored otherwise. */
+        info->pcm_loop_filter_disabled_flag = sps->pcm.loop_filter_disable_flag;
+    }
+    /** Per spec, when zero, assume short_term_ref_pic_set_sps_flag
+        is also zero. */
+    info->num_short_term_ref_pic_sets = sps->nb_st_rps;
+    info->long_term_ref_pics_present_flag = sps->long_term_ref_pics_present_flag;
+    /** Only needed if long_term_ref_pics_present_flag is set. Ignored
+        otherwise. */
+    info->num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps;
+    info->sps_temporal_mvp_enabled_flag = sps->sps_temporal_mvp_enabled_flag;
+    info->strong_intra_smoothing_enabled_flag = sps->sps_strong_intra_smoothing_enable_flag;
+    /** @} */
+
+    /** \name HEVC Picture Parameter Set
+     *
+     * Copies of the HEVC Picture Parameter Set bitstream fields.
+     * @{ */
+    info->dependent_slice_segments_enabled_flag = pps->dependent_slice_segments_enabled_flag;
+    info->output_flag_present_flag = pps->output_flag_present_flag;
+    info->num_extra_slice_header_bits = pps->num_extra_slice_header_bits;
+    info->sign_data_hiding_enabled_flag = pps->sign_data_hiding_flag;
+    info->cabac_init_present_flag = pps->cabac_init_present_flag;
+    info->num_ref_idx_l0_default_active_minus1 = pps->num_ref_idx_l0_default_active - 1;
+    info->num_ref_idx_l1_default_active_minus1 = pps->num_ref_idx_l1_default_active - 1;
+    info->init_qp_minus26 = pps->pic_init_qp_minus26;
+    info->constrained_intra_pred_flag = pps->constrained_intra_pred_flag;
+    info->transform_skip_enabled_flag = pps->transform_skip_enabled_flag;
+    info->cu_qp_delta_enabled_flag = pps->cu_qp_delta_enabled_flag;
+    /** Only needed if cu_qp_delta_enabled_flag is set. Ignored otherwise. */
+    info->diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth;
+    info->pps_cb_qp_offset = pps->cb_qp_offset;
+    info->pps_cr_qp_offset = pps->cr_qp_offset;
+    info->pps_slice_chroma_qp_offsets_present_flag = pps->pic_slice_level_chroma_qp_offsets_present_flag;
+    info->weighted_pred_flag = pps->weighted_pred_flag;
+    info->weighted_bipred_flag = pps->weighted_bipred_flag;
+    info->transquant_bypass_enabled_flag = pps->transquant_bypass_enable_flag;
+    info->tiles_enabled_flag = pps->tiles_enabled_flag;
+    info->entropy_coding_sync_enabled_flag = pps->entropy_coding_sync_enabled_flag;
+    if (info->tiles_enabled_flag) {
+        /** Only valid if tiles_enabled_flag is set. Ignored otherwise. */
+        info->num_tile_columns_minus1 = pps->num_tile_columns - 1;
+        /** Only valid if tiles_enabled_flag is set. Ignored otherwise. */
+        info->num_tile_rows_minus1 = pps->num_tile_rows - 1;
+        /** Only valid if tiles_enabled_flag is set. Ignored otherwise. */
+        info->uniform_spacing_flag = pps->uniform_spacing_flag;
+        /** Only need to set 0..num_tile_columns_minus1. The struct
+            definition reserves up to the maximum of 20. Invalid values are
+            ignored. */
+        for (ssize_t i = 0; i < pps->num_tile_columns; i++) {
+            info->column_width_minus1[i] = pps->column_width[i] - 1;
+        }
+        /** Only need to set 0..num_tile_rows_minus1. The struct
+          definition reserves up to the maximum of 22. Invalid values are
+          ignored.*/
+        for (ssize_t i = 0; i < pps->num_tile_rows; i++) {
+            info->row_height_minus1[i] = pps->row_height[i] - 1;
+        }
+        /** Only needed if tiles_enabled_flag is set. Invalid values are
+          ignored. */
+        info->loop_filter_across_tiles_enabled_flag = pps->loop_filter_across_tiles_enabled_flag;
+    }
+    info->pps_loop_filter_across_slices_enabled_flag = pps->seq_loop_filter_across_slices_enabled_flag;
+    info->deblocking_filter_control_present_flag = pps->deblocking_filter_control_present_flag;
+    /** Only valid if deblocking_filter_control_present_flag is set. Ignored
+        otherwise. */
+    info->deblocking_filter_override_enabled_flag = pps->deblocking_filter_override_enabled_flag;
+    /** Only valid if deblocking_filter_control_present_flag is set. Ignored
+        otherwise. */
+    info->pps_deblocking_filter_disabled_flag = pps->disable_dbf;
+    /** Only valid if deblocking_filter_control_present_flag is set and
+        pps_deblocking_filter_disabled_flag is not set. Ignored otherwise.*/
+    info->pps_beta_offset_div2 = pps->beta_offset / 2;
+    /** Only valid if deblocking_filter_control_present_flag is set and
+        pps_deblocking_filter_disabled_flag is not set. Ignored otherwise. */
+    info->pps_tc_offset_div2 = pps->tc_offset / 2;
+    info->lists_modification_present_flag = pps->lists_modification_present_flag;
+    info->log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2;
+    info->slice_segment_header_extension_present_flag = pps->slice_header_extension_present_flag;
+
+    /** \name HEVC Slice Segment Header
+     *
+     * Copies of the HEVC Slice Segment Header bitstream fields and calculated
+     * values detailed in the specification.
+     * @{ */
+    /** Set to 1 if nal_unit_type is equal to IDR_W_RADL or IDR_N_LP.
+        Set to zero otherwise. */
+    info->IDRPicFlag = IS_IDR(h);
+    /** Set to 1 if nal_unit_type in the range of BLA_W_LP to
+        RSV_IRAP_VCL23, inclusive. Set to zero otherwise.*/
+    info->RAPPicFlag = IS_IRAP(h);
+    /** See section 7.4.7.1 of the specification. */
+    info->CurrRpsIdx = sps->nb_st_rps;
+    if (sh->short_term_ref_pic_set_sps_flag == 1) {
+        for (size_t i = 0; i < sps->nb_st_rps; i++) {
+            if (sh->short_term_rps == &sps->st_rps[i]) {
+                info->CurrRpsIdx = i;
+                break;
+            }
+        }
+    }
+    /** See section 7.4.7.2 of the specification. */
+    info->NumPocTotalCurr = ff_hevc_frame_nb_refs(h);
+    if (sh->short_term_ref_pic_set_sps_flag == 0 && sh->short_term_rps) {
+        /** Corresponds to specification field, NumDeltaPocs[RefRpsIdx].
+            Only applicable when short_term_ref_pic_set_sps_flag == 0.
+            Implementations will ignore this value in other cases. See 7.4.8. */
+        info->NumDeltaPocsOfRefRpsIdx = sh->short_term_rps->rps_idx_num_delta_pocs;
+    }
+    /** Section 7.6.3.1 of the H.265/HEVC Specification defines the syntax of
+        the slice_segment_header. This header contains information that
+        some VDPAU implementations may choose to skip. The VDPAU API
+        requires client applications to track the number of bits used in the
+        slice header for structures associated with short term and long term
+        reference pictures. First, VDPAU requires the number of bits used by
+        the short_term_ref_pic_set array in the slice_segment_header. */
+    info->NumShortTermPictureSliceHeaderBits = sh->short_term_ref_pic_set_size;
+    /** Second, VDPAU requires the number of bits used for long term reference
+        pictures in the slice_segment_header. This is equal to the number
+        of bits used for the contents of the block beginning with
+        "if(long_term_ref_pics_present_flag)". */
+    info->NumLongTermPictureSliceHeaderBits = sh->long_term_ref_pic_set_size;
+    /** @} */
+
+    /** Slice Decoding Process - Picture Order Count */
+    /** The value of PicOrderCntVal of the picture in the access unit
+        containing the SEI message. The picture being decoded. */
+    info->CurrPicOrderCntVal = h->poc;
+
+    /** Slice Decoding Process - Reference Picture Sets */
+    for (size_t i = 0; i < 16; i++) {
+        info->RefPics[i] = VDP_INVALID_HANDLE;
+        info->PicOrderCntVal[i] = 0;
+        info->IsLongTerm[i] = 0;
+    }
+    for (size_t i = 0, j = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) {
+        const HEVCFrame *frame = &h->DPB[i];
+        if (frame != h->ref && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF |
+                                                HEVC_FRAME_FLAG_SHORT_REF))) {
+            if (j > 16) {
+                av_log(avctx, AV_LOG_WARNING,
+                     "VDPAU only supports up to 16 references in the DPB. "
+                     "This frame may not be decoded correctly.\n");
+                break;
+            }
+            /** Array of video reference surfaces.
+                Set any unused positions to VDP_INVALID_HANDLE. */
+            info->RefPics[j] = ff_vdpau_get_surface_id(frame->frame);
+            /** Array of picture order counts. These correspond to positions
+                in the RefPics array. */
+            info->PicOrderCntVal[j] = frame->poc;
+            /** Array used to specify whether a particular RefPic is
+                a long term reference. A value of "1" indicates a long-term
+                reference. */
+            // XXX: Setting this caused glitches in the nvidia implementation
+            // Always setting it to zero, produces correct results
+            //info->IsLongTerm[j] = frame->flags & HEVC_FRAME_FLAG_LONG_REF;
+            info->IsLongTerm[j] = 0;
+            j++;
+        }
+    }
+    /** Copy of specification field, see Section 8.3.2 of the
+        H.265/HEVC Specification. */
+    info->NumPocStCurrBefore = h->rps[ST_CURR_BEF].nb_refs;
+    if (info->NumPocStCurrBefore > 8) {
+        av_log(avctx, AV_LOG_WARNING,
+             "VDPAU only supports up to 8 references in StCurrBefore. "
+             "This frame may not be decoded correctly.\n");
+        info->NumPocStCurrBefore = 8;
+    }
+    /** Copy of specification field, see Section 8.3.2 of the
+        H.265/HEVC Specification. */
+    info->NumPocStCurrAfter = h->rps[ST_CURR_AFT].nb_refs;
+    if (info->NumPocStCurrAfter > 8) {
+        av_log(avctx, AV_LOG_WARNING,
+             "VDPAU only supports up to 8 references in StCurrAfter. "
+             "This frame may not be decoded correctly.\n");
+        info->NumPocStCurrAfter = 8;
+    }
+    /** Copy of specification field, see Section 8.3.2 of the
+        H.265/HEVC Specification. */
+    info->NumPocLtCurr = h->rps[LT_CURR].nb_refs;
+    if (info->NumPocLtCurr > 8) {
+        av_log(avctx, AV_LOG_WARNING,
+             "VDPAU only supports up to 8 references in LtCurr. "
+             "This frame may not be decoded correctly.\n");
+        info->NumPocLtCurr = 8;
+    }
+    /** Reference Picture Set list, one of the short-term RPS. These
+        correspond to positions in the RefPics array. */
+    for (ssize_t i = 0, j = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) {
+        HEVCFrame *frame = h->rps[ST_CURR_BEF].ref[i];
+        if (frame) {
+            uint8_t found = 0;
+            uintptr_t id = ff_vdpau_get_surface_id(frame->frame);
+            for (size_t k = 0; k < 16; k++) {
+                if (id == info->RefPics[k]) {
+                    info->RefPicSetStCurrBefore[j] = k;
+                    j++;
+                    found = 1;
+                    break;
+                }
+            }
+            if (!found) {
+                av_log(avctx, AV_LOG_WARNING, "missing surface: %p\n",
+                       (void *)id);
+            }
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "missing STR Before frame: %zd\n", i);
+        }
+    }
+    /** Reference Picture Set list, one of the short-term RPS. These
+        correspond to positions in the RefPics array. */
+    for (ssize_t i = 0, j = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) {
+        HEVCFrame *frame = h->rps[ST_CURR_AFT].ref[i];
+        if (frame) {
+            uint8_t found = 0;
+            uintptr_t id = ff_vdpau_get_surface_id(frame->frame);
+            for (size_t k = 0; k < 16; k++) {
+                if (id == info->RefPics[k]) {
+                    info->RefPicSetStCurrAfter[j] = k;
+                    j++;
+                    found = 1;
+                    break;
+                }
+            }
+            if (!found) {
+                av_log(avctx, AV_LOG_WARNING, "missing surface: %p\n",
+                       (void *)id);
+            }
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "missing STR After frame: %zd\n", i);
+        }
+    }
+    /** Reference Picture Set list, one of the long-term RPS. These
+        correspond to positions in the RefPics array. */
+    for (ssize_t i = 0, j = 0; i < h->rps[LT_CURR].nb_refs; i++) {
+        HEVCFrame *frame = h->rps[LT_CURR].ref[i];
+        if (frame) {
+            uint8_t found = 0;
+            uintptr_t id = ff_vdpau_get_surface_id(frame->frame);
+            for (size_t k = 0; k < 16; k++) {
+                if (id == info->RefPics[k]) {
+                    info->RefPicSetLtCurr[j] = k;
+                    j++;
+                    found = 1;
+                    break;
+                }
+            }
+            if (!found) {
+                av_log(avctx, AV_LOG_WARNING, "missing surface: %p\n",
+                       (void *)id);
+            }
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "missing LTR frame: %zd\n", i);
+        }
+    }
+
+    return ff_vdpau_common_start_frame(pic_ctx, buffer, size);
+}
+
+static const uint8_t start_code_prefix[3] = { 0x00, 0x00, 0x01 };
+
+static int vdpau_hevc_decode_slice(AVCodecContext *avctx,
+                                   const uint8_t *buffer, uint32_t size)
+{
+    HEVCContext *h = avctx->priv_data;
+    struct vdpau_picture_context *pic_ctx = h->ref->hwaccel_picture_private;
+    int val;
+
+    val = ff_vdpau_add_buffer(pic_ctx, start_code_prefix, 3);
+    if (val)
+        return val;
+
+    val = ff_vdpau_add_buffer(pic_ctx, buffer, size);
+    if (val)
+        return val;
+
+    return 0;
+}
+
+static int vdpau_hevc_end_frame(AVCodecContext *avctx)
+{
+    HEVCContext *h = avctx->priv_data;
+    struct vdpau_picture_context *pic_ctx = h->ref->hwaccel_picture_private;
+    int val;
+
+    val = ff_vdpau_common_end_frame(avctx, h->ref->frame, pic_ctx);
+    if (val < 0)
+        return val;
+
+    return 0;
+}
+
+static int vdpau_hevc_init(AVCodecContext *avctx)
+{
+    VdpDecoderProfile profile;
+    uint32_t level = avctx->level;
+
+    switch (avctx->profile) {
+    case FF_PROFILE_HEVC_MAIN:
+        profile = VDP_DECODER_PROFILE_HEVC_MAIN;
+        break;
+    case FF_PROFILE_HEVC_MAIN_10:
+        profile = VDP_DECODER_PROFILE_HEVC_MAIN_10;
+        break;
+    case FF_PROFILE_HEVC_MAIN_STILL_PICTURE:
+        profile = VDP_DECODER_PROFILE_HEVC_MAIN_STILL;
+        break;
+    default:
+        return AVERROR(ENOTSUP);
+    }
+
+    return ff_vdpau_common_init(avctx, profile, level);
+}
+
+AVHWAccel ff_hevc_vdpau_hwaccel = {
+    .name           = "hevc_vdpau",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_HEVC,
+    .pix_fmt        = AV_PIX_FMT_VDPAU,
+    .start_frame    = vdpau_hevc_start_frame,
+    .end_frame      = vdpau_hevc_end_frame,
+    .decode_slice   = vdpau_hevc_decode_slice,
+    .frame_priv_data_size = sizeof(struct vdpau_picture_context),
+    .init           = vdpau_hevc_init,
+    .uninit         = ff_vdpau_common_uninit,
+    .priv_data_size = sizeof(VDPAUContext),
+};
diff --git a/libavcodec/vdpau_internal.h b/libavcodec/vdpau_internal.h
index f35c99d3a5..8a63733546 100644
--- a/libavcodec/vdpau_internal.h
+++ b/libavcodec/vdpau_internal.h
@@ -4,33 +4,35 @@
  *
  * Copyright (C) 2008 NVIDIA
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_VDPAU_INTERNAL_H
 #define AVCODEC_VDPAU_INTERNAL_H
 
+#include "config.h"
 #include <stdint.h>
+#if CONFIG_VDPAU
 #include <vdpau/vdpau.h>
+#endif
 
 #include "libavutil/frame.h"
 
 #include "avcodec.h"
-#include "vdpau.h"
 
 /** Extract VdpVideoSurface from an AVFrame */
 static inline uintptr_t ff_vdpau_get_surface_id(AVFrame *pic)
@@ -38,6 +40,8 @@ static inline uintptr_t ff_vdpau_get_surface_id(AVFrame *pic)
     return (uintptr_t)pic->data[3];
 }
 
+struct vdpau_picture_context;
+#if CONFIG_VDPAU
 union VDPAUPictureInfo {
     VdpPictureInfoH264        h264;
     VdpPictureInfoMPEG1Or2    mpeg;
@@ -46,8 +50,13 @@ union VDPAUPictureInfo {
 #ifdef VDP_DECODER_PROFILE_H264_HIGH_444_PREDICTIVE
     VdpPictureInfoH264Predictive h264_predictive;
 #endif
+#ifdef VDP_DECODER_PROFILE_HEVC_MAIN
+    VdpPictureInfoHEVC        hevc;
+#endif
 };
 
+#include "vdpau.h"
+
 typedef struct VDPAUHWContext {
     AVVDPAUContext context;
     VdpDevice device;
@@ -105,6 +114,8 @@ struct vdpau_picture_context {
 
 int ff_vdpau_common_init(AVCodecContext *avctx, VdpDecoderProfile profile,
                          int level);
+#endif //CONFIG_VDPAU
+
 int ff_vdpau_common_uninit(AVCodecContext *avctx);
 
 int ff_vdpau_common_start_frame(struct vdpau_picture_context *pic,
diff --git a/libavcodec/vdpau_mpeg12.c b/libavcodec/vdpau_mpeg12.c
index cb6f81a76d..3ac2cb827d 100644
--- a/libavcodec/vdpau_mpeg12.c
+++ b/libavcodec/vdpau_mpeg12.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 NVIDIA
  * Copyright (c) 2013 Rémi Denis-Courmont
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vdpau_mpeg4.c b/libavcodec/vdpau_mpeg4.c
index 978456ad2a..9141becd9f 100644
--- a/libavcodec/vdpau_mpeg4.c
+++ b/libavcodec/vdpau_mpeg4.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 NVIDIA
  * Copyright (c) 2013 Rémi Denis-Courmont
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -119,6 +119,9 @@ static int vdpau_mpeg4_init(AVCodecContext *avctx)
     case FF_PROFILE_MPEG4_SIMPLE:
         profile = VDP_DECODER_PROFILE_MPEG4_PART2_SP;
         break;
+    // As any ASP decoder must be able to decode SP, this
+    // should be a safe fallback if profile is unknown/unspecified.
+    case FF_PROFILE_UNKNOWN:
     case FF_PROFILE_MPEG4_ADVANCED_SIMPLE:
         profile = VDP_DECODER_PROFILE_MPEG4_PART2_ASP;
         break;
diff --git a/libavcodec/vdpau_vc1.c b/libavcodec/vdpau_vc1.c
index 4f87c52ecc..ffd6505d13 100644
--- a/libavcodec/vdpau_vc1.c
+++ b/libavcodec/vdpau_vc1.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 NVIDIA
  * Copyright (c) 2013 Rémi Denis-Courmont
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,14 +44,18 @@ static int vdpau_vc1_start_frame(AVCodecContext *avctx,
 
     switch (s->pict_type) {
     case AV_PICTURE_TYPE_B:
+        if (s->next_picture_ptr) {
         ref = ff_vdpau_get_surface_id(s->next_picture.f);
         assert(ref != VDP_INVALID_HANDLE);
         info->backward_reference = ref;
+        }
         /* fall-through */
     case AV_PICTURE_TYPE_P:
+        if (s->last_picture_ptr) {
         ref = ff_vdpau_get_surface_id(s->last_picture.f);
         assert(ref != VDP_INVALID_HANDLE);
         info->forward_reference  = ref;
+        }
     }
 
     info->slice_count       = 0;
diff --git a/libavcodec/version.h b/libavcodec/version.h
index 4b487ca948..1c97a1e5d2 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -1,19 +1,19 @@
 /*
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,9 @@
 
 #include "libavutil/version.h"
 
-#define LIBAVCODEC_VERSION_MAJOR 57
-#define LIBAVCODEC_VERSION_MINOR  3
-#define LIBAVCODEC_VERSION_MICRO  0
+#define LIBAVCODEC_VERSION_MAJOR  57
+#define LIBAVCODEC_VERSION_MINOR   4
+#define LIBAVCODEC_VERSION_MICRO 100
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
                                                LIBAVCODEC_VERSION_MINOR, \
@@ -46,8 +46,21 @@
  * FF_API_* defines may be placed below to indicate public API that will be
  * dropped at a future version bump. The defines themselves are not part of
  * the public API and may change, break or disappear at any time.
+ *
+ * @note, when bumping the major version it is recommended to manually
+ * disable each FF_API_* in its own commit instead of disabling them all
+ * at once through the bump. This improves the git bisect-ability of the change.
  */
 
+#ifndef FF_API_VIMA_DECODER
+#define FF_API_VIMA_DECODER     (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_AUDIO_CONVERT
+#define FF_API_AUDIO_CONVERT     (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_AVCODEC_RESAMPLE
+#define FF_API_AVCODEC_RESAMPLE  FF_API_AUDIO_CONVERT
+#endif
 #ifndef FF_API_MISSING_SAMPLE
 #define FF_API_MISSING_SAMPLE    (LIBAVCODEC_VERSION_MAJOR < 58)
 #endif
@@ -96,9 +109,6 @@
 #ifndef FF_API_MAX_BFRAMES
 #define FF_API_MAX_BFRAMES       (LIBAVCODEC_VERSION_MAJOR < 58)
 #endif
-#ifndef FF_API_FAST_MALLOC
-#define FF_API_FAST_MALLOC       (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
 #ifndef FF_API_NEG_LINESIZES
 #define FF_API_NEG_LINESIZES     (LIBAVCODEC_VERSION_MAJOR < 58)
 #endif
@@ -135,9 +145,16 @@
 #ifndef FF_API_AFD
 #define FF_API_AFD               (LIBAVCODEC_VERSION_MAJOR < 58)
 #endif
+#ifndef FF_API_VISMV
+/* XXX: don't forget to drop the -vismv documentation */
+#define FF_API_VISMV             (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
 #ifndef FF_API_AUDIOENC_DELAY
 #define FF_API_AUDIOENC_DELAY    (LIBAVCODEC_VERSION_MAJOR < 58)
 #endif
+#ifndef FF_API_VAAPI_CONTEXT
+#define FF_API_VAAPI_CONTEXT     (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
 #ifndef FF_API_AVCTX_TIMEBASE
 #define FF_API_AVCTX_TIMEBASE    (LIBAVCODEC_VERSION_MAJOR < 59)
 #endif
diff --git a/libavcodec/videodsp.c b/libavcodec/videodsp.c
index e6d9303903..ba618a7bbc 100644
--- a/libavcodec/videodsp.c
+++ b/libavcodec/videodsp.c
@@ -1,24 +1,25 @@
 /*
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "videodsp.h"
 
diff --git a/libavcodec/videodsp.h b/libavcodec/videodsp.h
index 04c012a826..fc01a31dca 100644
--- a/libavcodec/videodsp.h
+++ b/libavcodec/videodsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,14 +29,25 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#define EMULATED_EDGE(depth) \
+void ff_emulated_edge_mc_ ## depth(uint8_t *dst, const uint8_t *src, \
+                                   ptrdiff_t dst_stride, ptrdiff_t src_stride, \
+                                   int block_w, int block_h,\
+                                   int src_x, int src_y, int w, int h);
+
+EMULATED_EDGE(8)
+EMULATED_EDGE(16)
+
 typedef struct VideoDSPContext {
     /**
      * Copy a rectangular area of samples to a temporary buffer and replicate
      * the border samples.
      *
-     * @param buf destination buffer
+     * @param dst destination buffer
+     * @param dst_stride number of bytes between 2 vertically adjacent samples
+     *                   in destination buffer
      * @param src source buffer
-     * @param buf_linesize number of bytes between 2 vertically adjacent
+     * @param dst_linesize number of bytes between 2 vertically adjacent
      *                     samples in the destination buffer
      * @param src_linesize number of bytes between 2 vertically adjacent
      *                     samples in both the source buffer
@@ -49,8 +60,8 @@ typedef struct VideoDSPContext {
      * @param w width of the source buffer
      * @param h height of the source buffer
      */
-    void (*emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
-                             ptrdiff_t buf_linesize,
+    void (*emulated_edge_mc)(uint8_t *dst, const uint8_t *src,
+                             ptrdiff_t dst_linesize,
                              ptrdiff_t src_linesize,
                              int block_w, int block_h,
                              int src_x, int src_y, int w, int h);
diff --git a/libavcodec/videodsp_template.c b/libavcodec/videodsp_template.c
index 28b8c3286d..94c1b7188d 100644
--- a/libavcodec/videodsp_template.c
+++ b/libavcodec/videodsp_template.c
@@ -1,42 +1,46 @@
 /*
- * Copyright (c) 2002-2004 Michael Niedermayer
+ * Copyright (c) 2002-2012 Michael Niedermayer
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
-
 #include "bit_depth_template.c"
-
-static void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
-                                      ptrdiff_t buf_linesize,
-                                      ptrdiff_t src_linesize,
-                                      int block_w, int block_h,
-                                      int src_x, int src_y, int w, int h)
+void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
+                               ptrdiff_t buf_linesize,
+                               ptrdiff_t src_linesize,
+                               int block_w, int block_h,
+                               int src_x, int src_y, int w, int h)
 {
     int x, y;
     int start_y, start_x, end_y, end_x;
 
+    if (!w || !h)
+        return;
+
+    av_assert2(block_w * sizeof(pixel) <= FFABS(buf_linesize));
+
     if (src_y >= h) {
-        src  += (h - 1 - src_y) * src_linesize;
+        src -= src_y * src_linesize;
+        src += (h - 1) * src_linesize;
         src_y = h - 1;
     } else if (src_y <= -block_h) {
-        src  += (1 - block_h - src_y) * src_linesize;
+        src -= src_y * src_linesize;
+        src += (1 - block_h) * src_linesize;
         src_y = 1 - block_h;
     }
     if (src_x >= w) {
@@ -51,8 +55,8 @@ static void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
     start_x = FFMAX(0, -src_x);
     end_y = FFMIN(block_h, h-src_y);
     end_x = FFMIN(block_w, w-src_x);
-    assert(start_y < end_y && block_h);
-    assert(start_x < end_x && block_w);
+    av_assert2(start_y < end_y && block_h);
+    av_assert2(start_x < end_x && block_w);
 
     w    = end_x - start_x;
     src += start_y * src_linesize + start_x * sizeof(pixel);
diff --git a/libavcodec/videotoolbox.c b/libavcodec/videotoolbox.c
new file mode 100644
index 0000000000..cc1e592ac9
--- /dev/null
+++ b/libavcodec/videotoolbox.c
@@ -0,0 +1,699 @@
+/*
+ * Videotoolbox hardware acceleration
+ *
+ * copyright (c) 2012 Sebastien Zwickert
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#if CONFIG_VIDEOTOOLBOX
+#  include "videotoolbox.h"
+#else
+#  include "vda.h"
+#endif
+#include "vda_vt_internal.h"
+#include "libavutil/avutil.h"
+#include "bytestream.h"
+#include "h264.h"
+#include "mpegvideo.h"
+
+#ifndef kVTVideoDecoderSpecification_RequireHardwareAcceleratedVideoDecoder
+#  define kVTVideoDecoderSpecification_RequireHardwareAcceleratedVideoDecoder CFSTR("RequireHardwareAcceleratedVideoDecoder")
+#endif
+
+#define VIDEOTOOLBOX_ESDS_EXTRADATA_PADDING  12
+
+static void videotoolbox_buffer_release(void *opaque, uint8_t *data)
+{
+    CVPixelBufferRef cv_buffer = (CVImageBufferRef)data;
+    CVPixelBufferRelease(cv_buffer);
+}
+
+static int videotoolbox_buffer_copy(VTContext *vtctx,
+                                    const uint8_t *buffer,
+                                    uint32_t size)
+{
+    void *tmp;
+
+    tmp = av_fast_realloc(vtctx->bitstream,
+                         &vtctx->allocated_size,
+                         size);
+
+    if (!tmp)
+        return AVERROR(ENOMEM);
+
+    vtctx->bitstream = tmp;
+    memcpy(vtctx->bitstream, buffer, size);
+    vtctx->bitstream_size = size;
+
+    return 0;
+}
+
+int ff_videotoolbox_alloc_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+    frame->width  = avctx->width;
+    frame->height = avctx->height;
+    frame->format = avctx->pix_fmt;
+    frame->buf[0] = av_buffer_alloc(1);
+
+    if (!frame->buf[0])
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+#define AV_W8(p, v) *(p) = (v)
+
+CFDataRef ff_videotoolbox_avcc_extradata_create(AVCodecContext *avctx)
+{
+    H264Context *h     = avctx->priv_data;
+    CFDataRef data = NULL;
+    uint8_t *p;
+    int vt_extradata_size = 6 + 3 + h->sps.data_size + 4 + h->pps.data_size;
+    uint8_t *vt_extradata = av_malloc(vt_extradata_size);
+    if (!vt_extradata)
+        return NULL;
+
+    p = vt_extradata;
+
+    AV_W8(p + 0, 1); /* version */
+    AV_W8(p + 1, h->sps.data[0]); /* profile */
+    AV_W8(p + 2, h->sps.data[1]); /* profile compat */
+    AV_W8(p + 3, h->sps.data[2]); /* level */
+    AV_W8(p + 4, 0xff); /* 6 bits reserved (111111) + 2 bits nal size length - 3 (11) */
+    AV_W8(p + 5, 0xe1); /* 3 bits reserved (111) + 5 bits number of sps (00001) */
+    AV_WB16(p + 6, h->sps.data_size + 1);
+    AV_W8(p + 8, NAL_SPS | (3 << 5)); // NAL unit header
+    memcpy(p + 9, h->sps.data, h->sps.data_size);
+    p += 9 + h->sps.data_size;
+    AV_W8(p + 0, 1); /* number of pps */
+    AV_WB16(p + 1, h->pps.data_size + 1);
+    AV_W8(p + 3, NAL_PPS | (3 << 5)); // NAL unit header
+    memcpy(p + 4, h->pps.data, h->pps.data_size);
+
+    p += 4 + h->pps.data_size;
+    av_assert0(p - vt_extradata == vt_extradata_size);
+
+    data = CFDataCreate(kCFAllocatorDefault, vt_extradata, vt_extradata_size);
+    av_free(vt_extradata);
+    return data;
+}
+
+int ff_videotoolbox_buffer_create(VTContext *vtctx, AVFrame *frame)
+{
+    av_buffer_unref(&frame->buf[0]);
+
+    frame->buf[0] = av_buffer_create((uint8_t*)vtctx->frame,
+                                     sizeof(vtctx->frame),
+                                     videotoolbox_buffer_release,
+                                     NULL,
+                                     AV_BUFFER_FLAG_READONLY);
+    if (!frame->buf[0]) {
+        return AVERROR(ENOMEM);
+    }
+
+    frame->data[3] = (uint8_t*)vtctx->frame;
+    vtctx->frame = NULL;
+
+    return 0;
+}
+
+int ff_videotoolbox_h264_start_frame(AVCodecContext *avctx,
+                                     const uint8_t *buffer,
+                                     uint32_t size)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    H264Context *h  = avctx->priv_data;
+
+    vtctx->bitstream_size = 0;
+
+    if (h->is_avc == 1) {
+        return videotoolbox_buffer_copy(vtctx, buffer, size);
+    }
+
+    return 0;
+}
+
+int ff_videotoolbox_h264_decode_slice(AVCodecContext *avctx,
+                                      const uint8_t *buffer,
+                                      uint32_t size)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    H264Context *h  = avctx->priv_data;
+    void *tmp;
+
+    if (h->is_avc == 1)
+        return 0;
+
+    tmp = av_fast_realloc(vtctx->bitstream,
+                          &vtctx->allocated_size,
+                          vtctx->bitstream_size+size+4);
+    if (!tmp)
+        return AVERROR(ENOMEM);
+
+    vtctx->bitstream = tmp;
+
+    AV_WB32(vtctx->bitstream + vtctx->bitstream_size, size);
+    memcpy(vtctx->bitstream + vtctx->bitstream_size + 4, buffer, size);
+
+    vtctx->bitstream_size += size + 4;
+
+    return 0;
+}
+
+int ff_videotoolbox_uninit(AVCodecContext *avctx)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    if (vtctx) {
+        av_freep(&vtctx->bitstream);
+        if (vtctx->frame)
+            CVPixelBufferRelease(vtctx->frame);
+    }
+
+    return 0;
+}
+
+#if CONFIG_VIDEOTOOLBOX
+static void videotoolbox_write_mp4_descr_length(PutByteContext *pb, int length)
+{
+    int i;
+    uint8_t b;
+
+    for (i = 3; i >= 0; i--) {
+        b = (length >> (i * 7)) & 0x7F;
+        if (i != 0)
+            b |= 0x80;
+
+        bytestream2_put_byteu(pb, b);
+    }
+}
+
+static CFDataRef videotoolbox_esds_extradata_create(AVCodecContext *avctx)
+{
+    CFDataRef data;
+    uint8_t *rw_extradata;
+    PutByteContext pb;
+    int full_size = 3 + 5 + 13 + 5 + avctx->extradata_size + 3;
+    // ES_DescrTag data + DecoderConfigDescrTag + data + DecSpecificInfoTag + size + SLConfigDescriptor
+    int config_size = 13 + 5 + avctx->extradata_size;
+    int s;
+
+    if (!(rw_extradata = av_mallocz(full_size + VIDEOTOOLBOX_ESDS_EXTRADATA_PADDING)))
+        return NULL;
+
+    bytestream2_init_writer(&pb, rw_extradata, full_size + VIDEOTOOLBOX_ESDS_EXTRADATA_PADDING);
+    bytestream2_put_byteu(&pb, 0);        // version
+    bytestream2_put_ne24(&pb, 0);         // flags
+
+    // elementary stream descriptor
+    bytestream2_put_byteu(&pb, 0x03);     // ES_DescrTag
+    videotoolbox_write_mp4_descr_length(&pb, full_size);
+    bytestream2_put_ne16(&pb, 0);         // esid
+    bytestream2_put_byteu(&pb, 0);        // stream priority (0-32)
+
+    // decoder configuration descriptor
+    bytestream2_put_byteu(&pb, 0x04);     // DecoderConfigDescrTag
+    videotoolbox_write_mp4_descr_length(&pb, config_size);
+    bytestream2_put_byteu(&pb, 32);       // object type indication. 32 = AV_CODEC_ID_MPEG4
+    bytestream2_put_byteu(&pb, 0x11);     // stream type
+    bytestream2_put_ne24(&pb, 0);         // buffer size
+    bytestream2_put_ne32(&pb, 0);         // max bitrate
+    bytestream2_put_ne32(&pb, 0);         // avg bitrate
+
+    // decoder specific descriptor
+    bytestream2_put_byteu(&pb, 0x05);     ///< DecSpecificInfoTag
+    videotoolbox_write_mp4_descr_length(&pb, avctx->extradata_size);
+
+    bytestream2_put_buffer(&pb, avctx->extradata, avctx->extradata_size);
+
+    // SLConfigDescriptor
+    bytestream2_put_byteu(&pb, 0x06);     // SLConfigDescrTag
+    bytestream2_put_byteu(&pb, 0x01);     // length
+    bytestream2_put_byteu(&pb, 0x02);     //
+
+    s = bytestream2_size_p(&pb);
+
+    data = CFDataCreate(kCFAllocatorDefault, rw_extradata, s);
+
+    av_freep(&rw_extradata);
+    return data;
+}
+
+static CMSampleBufferRef videotoolbox_sample_buffer_create(CMFormatDescriptionRef fmt_desc,
+                                                           void *buffer,
+                                                           int size)
+{
+    OSStatus status;
+    CMBlockBufferRef  block_buf;
+    CMSampleBufferRef sample_buf;
+
+    block_buf  = NULL;
+    sample_buf = NULL;
+
+    status = CMBlockBufferCreateWithMemoryBlock(kCFAllocatorDefault,// structureAllocator
+                                                buffer,             // memoryBlock
+                                                size,               // blockLength
+                                                kCFAllocatorNull,   // blockAllocator
+                                                NULL,               // customBlockSource
+                                                0,                  // offsetToData
+                                                size,               // dataLength
+                                                0,                  // flags
+                                                &block_buf);
+
+    if (!status) {
+        status = CMSampleBufferCreate(kCFAllocatorDefault,  // allocator
+                                      block_buf,            // dataBuffer
+                                      TRUE,                 // dataReady
+                                      0,                    // makeDataReadyCallback
+                                      0,                    // makeDataReadyRefcon
+                                      fmt_desc,             // formatDescription
+                                      1,                    // numSamples
+                                      0,                    // numSampleTimingEntries
+                                      NULL,                 // sampleTimingArray
+                                      0,                    // numSampleSizeEntries
+                                      NULL,                 // sampleSizeArray
+                                      &sample_buf);
+    }
+
+    if (block_buf)
+        CFRelease(block_buf);
+
+    return sample_buf;
+}
+
+static void videotoolbox_decoder_callback(void *opaque,
+                                          void *sourceFrameRefCon,
+                                          OSStatus status,
+                                          VTDecodeInfoFlags flags,
+                                          CVImageBufferRef image_buffer,
+                                          CMTime pts,
+                                          CMTime duration)
+{
+    AVCodecContext *avctx = opaque;
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    if (vtctx->frame) {
+        CVPixelBufferRelease(vtctx->frame);
+        vtctx->frame = NULL;
+    }
+
+    if (!image_buffer) {
+        av_log(NULL, AV_LOG_DEBUG, "vt decoder cb: output image buffer is null\n");
+        return;
+    }
+
+    vtctx->frame = CVPixelBufferRetain(image_buffer);
+}
+
+static OSStatus videotoolbox_session_decode_frame(AVCodecContext *avctx)
+{
+    OSStatus status;
+    CMSampleBufferRef sample_buf;
+    AVVideotoolboxContext *videotoolbox = avctx->hwaccel_context;
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    sample_buf = videotoolbox_sample_buffer_create(videotoolbox->cm_fmt_desc,
+                                                   vtctx->bitstream,
+                                                   vtctx->bitstream_size);
+
+    if (!sample_buf)
+        return -1;
+
+    status = VTDecompressionSessionDecodeFrame(videotoolbox->session,
+                                               sample_buf,
+                                               0,       // decodeFlags
+                                               NULL,    // sourceFrameRefCon
+                                               0);      // infoFlagsOut
+    if (status == noErr)
+        status = VTDecompressionSessionWaitForAsynchronousFrames(videotoolbox->session);
+
+    CFRelease(sample_buf);
+
+    return status;
+}
+
+static int videotoolbox_common_end_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+    int status;
+    AVVideotoolboxContext *videotoolbox = avctx->hwaccel_context;
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    if (!videotoolbox->session || !vtctx->bitstream)
+        return AVERROR_INVALIDDATA;
+
+    status = videotoolbox_session_decode_frame(avctx);
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to decode frame (%d)\n", status);
+        return AVERROR_UNKNOWN;
+    }
+
+    if (!vtctx->frame)
+        return AVERROR_UNKNOWN;
+
+    return ff_videotoolbox_buffer_create(vtctx, frame);
+}
+
+static int videotoolbox_h264_end_frame(AVCodecContext *avctx)
+{
+    H264Context *h = avctx->priv_data;
+    AVFrame *frame = h->cur_pic_ptr->f;
+
+    return videotoolbox_common_end_frame(avctx, frame);
+}
+
+static int videotoolbox_mpeg_start_frame(AVCodecContext *avctx,
+                                         const uint8_t *buffer,
+                                         uint32_t size)
+{
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    return videotoolbox_buffer_copy(vtctx, buffer, size);
+}
+
+static int videotoolbox_mpeg_decode_slice(AVCodecContext *avctx,
+                                          const uint8_t *buffer,
+                                          uint32_t size)
+{
+    return 0;
+}
+
+static int videotoolbox_mpeg_end_frame(AVCodecContext *avctx)
+{
+    MpegEncContext *s = avctx->priv_data;
+    AVFrame *frame = s->current_picture_ptr->f;
+
+    return videotoolbox_common_end_frame(avctx, frame);
+}
+
+static CFDictionaryRef videotoolbox_decoder_config_create(CMVideoCodecType codec_type,
+                                                          AVCodecContext *avctx)
+{
+    CFMutableDictionaryRef config_info = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                                                   1,
+                                                                   &kCFTypeDictionaryKeyCallBacks,
+                                                                   &kCFTypeDictionaryValueCallBacks);
+
+    CFDictionarySetValue(config_info,
+                         kVTVideoDecoderSpecification_RequireHardwareAcceleratedVideoDecoder,
+                         kCFBooleanTrue);
+
+    if (avctx->extradata_size) {
+        CFMutableDictionaryRef avc_info;
+        CFDataRef data = NULL;
+
+        avc_info = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                             1,
+                                             &kCFTypeDictionaryKeyCallBacks,
+                                             &kCFTypeDictionaryValueCallBacks);
+
+        switch (codec_type) {
+        case kCMVideoCodecType_MPEG4Video :
+            data = videotoolbox_esds_extradata_create(avctx);
+            if (data)
+                CFDictionarySetValue(avc_info, CFSTR("esds"), data);
+            break;
+        case kCMVideoCodecType_H264 :
+            data = ff_videotoolbox_avcc_extradata_create(avctx);
+            if (data)
+                CFDictionarySetValue(avc_info, CFSTR("avcC"), data);
+            break;
+        default:
+            break;
+        }
+
+        CFDictionarySetValue(config_info,
+                kCMFormatDescriptionExtension_SampleDescriptionExtensionAtoms,
+                avc_info);
+
+        if (data)
+            CFRelease(data);
+
+        CFRelease(avc_info);
+    }
+    return config_info;
+}
+
+static CFDictionaryRef videotoolbox_buffer_attributes_create(int width,
+                                                             int height,
+                                                             OSType pix_fmt)
+{
+    CFMutableDictionaryRef buffer_attributes;
+    CFMutableDictionaryRef io_surface_properties;
+    CFNumberRef cv_pix_fmt;
+    CFNumberRef w;
+    CFNumberRef h;
+
+    w = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &width);
+    h = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &height);
+    cv_pix_fmt = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &pix_fmt);
+
+    buffer_attributes = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                                  4,
+                                                  &kCFTypeDictionaryKeyCallBacks,
+                                                  &kCFTypeDictionaryValueCallBacks);
+    io_surface_properties = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                                      0,
+                                                      &kCFTypeDictionaryKeyCallBacks,
+                                                      &kCFTypeDictionaryValueCallBacks);
+
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferPixelFormatTypeKey, cv_pix_fmt);
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferIOSurfacePropertiesKey, io_surface_properties);
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferWidthKey, w);
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferHeightKey, h);
+
+    CFRelease(io_surface_properties);
+    CFRelease(cv_pix_fmt);
+    CFRelease(w);
+    CFRelease(h);
+
+    return buffer_attributes;
+}
+
+static CMVideoFormatDescriptionRef videotoolbox_format_desc_create(CMVideoCodecType codec_type,
+                                                                   CFDictionaryRef decoder_spec,
+                                                                   int width,
+                                                                   int height)
+{
+    CMFormatDescriptionRef cm_fmt_desc;
+    OSStatus status;
+
+    status = CMVideoFormatDescriptionCreate(kCFAllocatorDefault,
+                                            codec_type,
+                                            width,
+                                            height,
+                                            decoder_spec, // Dictionary of extension
+                                            &cm_fmt_desc);
+
+    if (status)
+        return NULL;
+
+    return cm_fmt_desc;
+}
+
+static int videotoolbox_default_init(AVCodecContext *avctx)
+{
+    AVVideotoolboxContext *videotoolbox = avctx->hwaccel_context;
+    OSStatus status;
+    VTDecompressionOutputCallbackRecord decoder_cb;
+    CFDictionaryRef decoder_spec;
+    CFDictionaryRef buf_attr;
+
+    if (!videotoolbox) {
+        av_log(avctx, AV_LOG_ERROR, "hwaccel context is not set\n");
+        return -1;
+    }
+
+    switch( avctx->codec_id ) {
+    case AV_CODEC_ID_H263 :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_H263;
+        break;
+    case AV_CODEC_ID_H264 :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_H264;
+        break;
+    case AV_CODEC_ID_MPEG1VIDEO :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_MPEG1Video;
+        break;
+    case AV_CODEC_ID_MPEG2VIDEO :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_MPEG2Video;
+        break;
+    case AV_CODEC_ID_MPEG4 :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_MPEG4Video;
+        break;
+    default :
+        break;
+    }
+
+    decoder_spec = videotoolbox_decoder_config_create(videotoolbox->cm_codec_type, avctx);
+
+    videotoolbox->cm_fmt_desc = videotoolbox_format_desc_create(videotoolbox->cm_codec_type,
+                                                                decoder_spec,
+                                                                avctx->width,
+                                                                avctx->height);
+    if (!videotoolbox->cm_fmt_desc) {
+        if (decoder_spec)
+            CFRelease(decoder_spec);
+
+        av_log(avctx, AV_LOG_ERROR, "format description creation failed\n");
+        return -1;
+    }
+
+    buf_attr = videotoolbox_buffer_attributes_create(avctx->width,
+                                                     avctx->height,
+                                                     videotoolbox->cv_pix_fmt_type);
+
+    decoder_cb.decompressionOutputCallback = videotoolbox_decoder_callback;
+    decoder_cb.decompressionOutputRefCon   = avctx;
+
+    status = VTDecompressionSessionCreate(NULL,                      // allocator
+                                          videotoolbox->cm_fmt_desc, // videoFormatDescription
+                                          decoder_spec,              // videoDecoderSpecification
+                                          buf_attr,                  // destinationImageBufferAttributes
+                                          &decoder_cb,               // outputCallback
+                                          &videotoolbox->session);   // decompressionSessionOut
+
+    if (decoder_spec)
+        CFRelease(decoder_spec);
+    if (buf_attr)
+        CFRelease(buf_attr);
+
+    switch (status) {
+    case kVTVideoDecoderNotAvailableNowErr:
+    case kVTVideoDecoderUnsupportedDataFormatErr:
+        return AVERROR(ENOSYS);
+    case kVTVideoDecoderMalfunctionErr:
+        return AVERROR(EINVAL);
+    case kVTVideoDecoderBadDataErr :
+        return AVERROR_INVALIDDATA;
+    case 0:
+        return 0;
+    default:
+        return AVERROR_UNKNOWN;
+    }
+}
+
+static void videotoolbox_default_free(AVCodecContext *avctx)
+{
+    AVVideotoolboxContext *videotoolbox = avctx->hwaccel_context;
+
+    if (videotoolbox) {
+        if (videotoolbox->cm_fmt_desc)
+            CFRelease(videotoolbox->cm_fmt_desc);
+
+        if (videotoolbox->session)
+            VTDecompressionSessionInvalidate(videotoolbox->session);
+    }
+}
+
+AVHWAccel ff_h263_videotoolbox_hwaccel = {
+    .name           = "h263_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H263,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_mpeg_start_frame,
+    .decode_slice   = videotoolbox_mpeg_decode_slice,
+    .end_frame      = videotoolbox_mpeg_end_frame,
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+AVHWAccel ff_h264_videotoolbox_hwaccel = {
+    .name           = "h264_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = ff_videotoolbox_h264_start_frame,
+    .decode_slice   = ff_videotoolbox_h264_decode_slice,
+    .end_frame      = videotoolbox_h264_end_frame,
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+AVHWAccel ff_mpeg1_videotoolbox_hwaccel = {
+    .name           = "mpeg1_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG1VIDEO,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_mpeg_start_frame,
+    .decode_slice   = videotoolbox_mpeg_decode_slice,
+    .end_frame      = videotoolbox_mpeg_end_frame,
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+AVHWAccel ff_mpeg2_videotoolbox_hwaccel = {
+    .name           = "mpeg2_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_mpeg_start_frame,
+    .decode_slice   = videotoolbox_mpeg_decode_slice,
+    .end_frame      = videotoolbox_mpeg_end_frame,
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+AVHWAccel ff_mpeg4_videotoolbox_hwaccel = {
+    .name           = "mpeg4_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG4,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_mpeg_start_frame,
+    .decode_slice   = videotoolbox_mpeg_decode_slice,
+    .end_frame      = videotoolbox_mpeg_end_frame,
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+AVVideotoolboxContext *av_videotoolbox_alloc_context(void)
+{
+    AVVideotoolboxContext *ret = av_mallocz(sizeof(*ret));
+
+    if (ret) {
+        ret->output_callback = videotoolbox_decoder_callback;
+        ret->cv_pix_fmt_type = kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange;
+    }
+
+    return ret;
+}
+
+int av_videotoolbox_default_init(AVCodecContext *avctx)
+{
+    return av_videotoolbox_default_init2(avctx, NULL);
+}
+
+int av_videotoolbox_default_init2(AVCodecContext *avctx, AVVideotoolboxContext *vtctx)
+{
+    avctx->hwaccel_context = vtctx ?: av_videotoolbox_alloc_context();
+    if (!avctx->hwaccel_context)
+        return AVERROR(ENOMEM);
+    return videotoolbox_default_init(avctx);
+}
+
+void av_videotoolbox_default_free(AVCodecContext *avctx)
+{
+
+    videotoolbox_default_free(avctx);
+    av_freep(&avctx->hwaccel_context);
+}
+#endif /* CONFIG_VIDEOTOOLBOX */
diff --git a/libavcodec/videotoolbox.h b/libavcodec/videotoolbox.h
new file mode 100644
index 0000000000..a48638e2b2
--- /dev/null
+++ b/libavcodec/videotoolbox.h
@@ -0,0 +1,126 @@
+/*
+ * Videotoolbox hardware acceleration
+ *
+ * copyright (c) 2012 Sebastien Zwickert
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VIDEOTOOLBOX_H
+#define AVCODEC_VIDEOTOOLBOX_H
+
+/**
+ * @file
+ * @ingroup lavc_codec_hwaccel_videotoolbox
+ * Public libavcodec Videotoolbox header.
+ */
+
+#include <stdint.h>
+
+#define Picture QuickdrawPicture
+#include <VideoToolbox/VideoToolbox.h>
+#undef Picture
+
+#include "libavcodec/avcodec.h"
+
+/**
+ * This struct holds all the information that needs to be passed
+ * between the caller and libavcodec for initializing Videotoolbox decoding.
+ * Its size is not a part of the public ABI, it must be allocated with
+ * av_videotoolbox_alloc_context() and freed with av_free().
+ */
+typedef struct AVVideotoolboxContext {
+    /**
+     * Videotoolbox decompression session object.
+     * Created and freed the caller.
+     */
+    VTDecompressionSessionRef session;
+
+    /**
+     * The output callback that must be passed to the session.
+     * Set by av_videottoolbox_default_init()
+     */
+    VTDecompressionOutputCallback output_callback;
+
+    /**
+     * CVPixelBuffer Format Type that Videotoolbox will use for decoded frames.
+     * set by the caller.
+     */
+    OSType cv_pix_fmt_type;
+
+    /**
+     * CoreMedia Format Description that Videotoolbox will use to create the decompression session.
+     * Set by the caller.
+     */
+    CMVideoFormatDescriptionRef cm_fmt_desc;
+
+    /**
+     * CoreMedia codec type that Videotoolbox will use to create the decompression session.
+     * Set by the caller.
+     */
+    int cm_codec_type;
+} AVVideotoolboxContext;
+
+/**
+ * Allocate and initialize a Videotoolbox context.
+ *
+ * This function should be called from the get_format() callback when the caller
+ * selects the AV_PIX_FMT_VIDETOOLBOX format. The caller must then create
+ * the decoder object (using the output callback provided by libavcodec) that
+ * will be used for Videotoolbox-accelerated decoding.
+ *
+ * When decoding with Videotoolbox is finished, the caller must destroy the decoder
+ * object and free the Videotoolbox context using av_free().
+ *
+ * @return the newly allocated context or NULL on failure
+ */
+AVVideotoolboxContext *av_videotoolbox_alloc_context(void);
+
+/**
+ * This is a convenience function that creates and sets up the Videotoolbox context using
+ * an internal implementation.
+ *
+ * @param avctx the corresponding codec context
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure
+ */
+int av_videotoolbox_default_init(AVCodecContext *avctx);
+
+/**
+ * This is a convenience function that creates and sets up the Videotoolbox context using
+ * an internal implementation.
+ *
+ * @param avctx the corresponding codec context
+ * @param vtctx the Videotoolbox context to use
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure
+ */
+int av_videotoolbox_default_init2(AVCodecContext *avctx, AVVideotoolboxContext *vtctx);
+
+/**
+ * This function must be called to free the Videotoolbox context initialized with
+ * av_videotoolbox_default_init().
+ *
+ * @param avctx the corresponding codec context
+ */
+void av_videotoolbox_default_free(AVCodecContext *avctx);
+
+/**
+ * @}
+ */
+
+#endif /* AVCODEC_VIDEOTOOLBOX_H */
diff --git a/libavcodec/vima.c b/libavcodec/vima.c
index 6f539a86b6..b4620acf6b 100644
--- a/libavcodec/vima.c
+++ b/libavcodec/vima.c
@@ -2,20 +2,20 @@
  * LucasArts VIMA decoder
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vmdaudio.c b/libavcodec/vmdaudio.c
index 9e02ba7633..e8c8a064c7 100644
--- a/libavcodec/vmdaudio.c
+++ b/libavcodec/vmdaudio.c
@@ -1,20 +1,21 @@
 /*
  * Sierra VMD audio decoder
+ * Copyright (c) 2004 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +35,7 @@
 
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
@@ -74,7 +76,7 @@ static av_cold int vmdaudio_decode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "invalid number of channels\n");
         return AVERROR(EINVAL);
     }
-    if (avctx->block_align < 1) {
+    if (avctx->block_align < 1 || avctx->block_align % avctx->channels) {
         av_log(avctx, AV_LOG_ERROR, "invalid block align\n");
         return AVERROR(EINVAL);
     }
@@ -180,17 +182,16 @@ static int vmdaudio_decode_frame(AVCodecContext *avctx, void *data,
     /* get output buffer */
     frame->nb_samples = ((silent_chunks + audio_chunks) * avctx->block_align) /
                         avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     output_samples_u8  =            frame->data[0];
     output_samples_s16 = (int16_t *)frame->data[0];
 
     /* decode silent chunks */
     if (silent_chunks > 0) {
-        int silent_size = FFMIN(avctx->block_align * silent_chunks,
-                                frame->nb_samples * avctx->channels);
+        int silent_size = avctx->block_align * silent_chunks;
+        av_assert0(avctx->block_align * silent_chunks <= frame->nb_samples * avctx->channels);
+
         if (s->out_bps == 2) {
             memset(output_samples_s16, 0x00, silent_size * 2);
             output_samples_s16 += silent_size;
@@ -202,8 +203,9 @@ static int vmdaudio_decode_frame(AVCodecContext *avctx, void *data,
 
     /* decode audio chunks */
     if (audio_chunks > 0) {
-        buf_end = buf + (buf_size & ~(avctx->channels > 1));
-        while (buf + s->chunk_size <= buf_end) {
+        buf_end = buf + buf_size;
+        av_assert0((buf_size & (avctx->channels > 1)) == 0);
+        while (buf_end - buf >= s->chunk_size) {
             if (s->out_bps == 2) {
                 decode_audio_s16(output_samples_s16, buf, s->chunk_size,
                                  avctx->channels);
diff --git a/libavcodec/vmdvideo.c b/libavcodec/vmdvideo.c
index 2e91c06374..b97032ff7e 100644
--- a/libavcodec/vmdvideo.c
+++ b/libavcodec/vmdvideo.c
@@ -1,20 +1,21 @@
 /*
  * Sierra VMD video decoder
+ * Copyright (c) 2004 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,7 +63,7 @@ typedef struct VmdVideoContext {
 #define QUEUE_SIZE 0x1000
 #define QUEUE_MASK 0x0FFF
 
-static void lz_unpack(const unsigned char *src, int src_len,
+static int lz_unpack(const unsigned char *src, int src_len,
                       unsigned char *dest, int dest_len)
 {
     unsigned char *d;
@@ -83,9 +84,9 @@ static void lz_unpack(const unsigned char *src, int src_len,
     dataleft = bytestream2_get_le32(&gb);
     memset(queue, 0x20, QUEUE_SIZE);
     if (bytestream2_get_bytes_left(&gb) < 4)
-        return;
+        return AVERROR_INVALIDDATA;
     if (bytestream2_peek_le32(&gb) == 0x56781234) {
-        bytestream2_get_le32(&gb);
+        bytestream2_skipu(&gb, 4);
         qpos = 0x111;
         speclen = 0xF + 3;
     } else {
@@ -96,8 +97,8 @@ static void lz_unpack(const unsigned char *src, int src_len,
     while (dataleft > 0 && bytestream2_get_bytes_left(&gb) > 0) {
         tag = bytestream2_get_byteu(&gb);
         if ((tag == 0xFF) && (dataleft > 8)) {
-            if (d + 8 > d_end || bytestream2_get_bytes_left(&gb) < 8)
-                return;
+            if (d_end - d < 8 || bytestream2_get_bytes_left(&gb) < 8)
+                return AVERROR_INVALIDDATA;
             for (i = 0; i < 8; i++) {
                 queue[qpos++] = *d++ = bytestream2_get_byteu(&gb);
                 qpos &= QUEUE_MASK;
@@ -108,9 +109,9 @@ static void lz_unpack(const unsigned char *src, int src_len,
                 if (dataleft == 0)
                     break;
                 if (tag & 0x01) {
-                    if (d + 1 > d_end || bytestream2_get_bytes_left(&gb) < 1)
-                        return;
-                    queue[qpos++] = *d++ = bytestream2_get_byte(&gb);
+                    if (d_end - d < 1 || bytestream2_get_bytes_left(&gb) < 1)
+                        return AVERROR_INVALIDDATA;
+                    queue[qpos++] = *d++ = bytestream2_get_byteu(&gb);
                     qpos &= QUEUE_MASK;
                     dataleft--;
                 } else {
@@ -120,8 +121,8 @@ static void lz_unpack(const unsigned char *src, int src_len,
                     if (chainlen == speclen) {
                         chainlen = bytestream2_get_byte(&gb) + 0xF + 3;
                     }
-                    if (d + chainlen > d_end)
-                        return;
+                    if (d_end - d < chainlen)
+                        return AVERROR_INVALIDDATA;
                     for (j = 0; j < chainlen; j++) {
                         *d = queue[chainofs++ & QUEUE_MASK];
                         queue[qpos++] = *d++;
@@ -133,10 +134,10 @@ static void lz_unpack(const unsigned char *src, int src_len,
             }
         }
     }
+    return d - dest;
 }
-
 static int rle_unpack(const unsigned char *src, unsigned char *dest,
-    int src_count, int src_size, int dest_len)
+                      int src_count, int src_size, int dest_len)
 {
     unsigned char *pd;
     int i, l, used = 0;
@@ -159,12 +160,12 @@ static int rle_unpack(const unsigned char *src, unsigned char *dest,
         l = bytestream2_get_byteu(&gb);
         if (l & 0x80) {
             l = (l & 0x7F) * 2;
-            if (pd + l > dest_end || bytestream2_get_bytes_left(&gb) < l)
+            if (dest_end - pd < l || bytestream2_get_bytes_left(&gb) < l)
                 return bytestream2_tell(&gb);
-            bytestream2_get_buffer(&gb, pd, l);
+            bytestream2_get_bufferu(&gb, pd, l);
             pd += l;
         } else {
-            if (pd + l > dest_end || bytestream2_get_bytes_left(&gb) < 2)
+            if (dest_end - pd < 2*l || bytestream2_get_bytes_left(&gb) < 2)
                 return bytestream2_tell(&gb);
             run_val = bytestream2_get_ne16(&gb);
             for (i = 0; i < l; i++) {
@@ -200,6 +201,16 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
     frame_y = AV_RL16(&s->buf[8]);
     frame_width = AV_RL16(&s->buf[10]) - frame_x + 1;
     frame_height = AV_RL16(&s->buf[12]) - frame_y + 1;
+
+    if ((frame_width == s->avctx->width && frame_height == s->avctx->height) &&
+        (frame_x || frame_y)) {
+
+        s->x_off = frame_x;
+        s->y_off = frame_y;
+    }
+    frame_x -= s->x_off;
+    frame_y -= s->y_off;
+
     if (frame_x < 0 || frame_width < 0 ||
         frame_x >= s->avctx->width ||
         frame_width > s->avctx->width ||
@@ -219,15 +230,6 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
         return AVERROR_INVALIDDATA;
     }
 
-    if ((frame_width == s->avctx->width && frame_height == s->avctx->height) &&
-        (frame_x || frame_y)) {
-
-        s->x_off = frame_x;
-        s->y_off = frame_y;
-    }
-    frame_x -= s->x_off;
-    frame_y -= s->y_off;
-
     /* if only a certain region will be updated, copy the entire previous
      * frame before the decode */
     if (s->prev_frame->data[0] &&
@@ -248,13 +250,13 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
                 r = bytestream2_get_byteu(&gb) * 4;
                 g = bytestream2_get_byteu(&gb) * 4;
                 b = bytestream2_get_byteu(&gb) * 4;
-                palette32[i] = (r << 16) | (g << 8) | (b);
+                palette32[i] = 0xFFU << 24 | (r << 16) | (g << 8) | (b);
+                palette32[i] |= palette32[i] >> 6 & 0x30303;
             }
         } else {
             av_log(s->avctx, AV_LOG_ERROR, "Incomplete palette\n");
             return AVERROR_INVALIDDATA;
         }
-        s->size -= PALETTE_COUNT * 3 + 2;
     }
 
     if (!s->size)
@@ -265,15 +267,18 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
         return AVERROR_INVALIDDATA;
     meth = bytestream2_get_byteu(&gb);
     if (meth & 0x80) {
+        int size;
         if (!s->unpack_buffer_size) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "Trying to unpack LZ-compressed frame with no LZ buffer\n");
             return AVERROR_INVALIDDATA;
         }
-        lz_unpack(gb.buffer, bytestream2_get_bytes_left(&gb),
-                  s->unpack_buffer, s->unpack_buffer_size);
+        size = lz_unpack(gb.buffer, bytestream2_get_bytes_left(&gb),
+                         s->unpack_buffer, s->unpack_buffer_size);
+        if (size < 0)
+            return size;
         meth &= 0x7F;
-        bytestream2_init(&gb, s->unpack_buffer, s->unpack_buffer_size);
+        bytestream2_init(&gb, s->unpack_buffer, size);
     }
 
     dp = &frame->data[0][frame_y * frame->linesize[0] + frame_x];
@@ -289,7 +294,7 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
                     if (ofs + len > frame_width ||
                         bytestream2_get_bytes_left(&gb) < len)
                         return AVERROR_INVALIDDATA;
-                    bytestream2_get_buffer(&gb, &dp[ofs], len);
+                    bytestream2_get_bufferu(&gb, &dp[ofs], len);
                     ofs += len;
                 } else {
                     /* interframe pixel copy */
@@ -301,7 +306,7 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
             } while (ofs < frame_width);
             if (ofs > frame_width) {
                 av_log(s->avctx, AV_LOG_ERROR,
-                       "VMD video: offset > width (%d > %d)\n",
+                       "offset > width (%d > %d)\n",
                        ofs, frame_width);
                 return AVERROR_INVALIDDATA;
             }
@@ -334,6 +339,9 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
                         ofs += slen;
                         bytestream2_skip(&gb, len);
                     } else {
+                        if (ofs + len > frame_width ||
+                            bytestream2_get_bytes_left(&gb) < len)
+                            return AVERROR_INVALIDDATA;
                         bytestream2_get_buffer(&gb, &dp[ofs], len);
                         ofs += len;
                     }
@@ -347,7 +355,7 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
             } while (ofs < frame_width);
             if (ofs > frame_width) {
                 av_log(s->avctx, AV_LOG_ERROR,
-                       "VMD video: offset > width (%d > %d)\n",
+                       "offset > width (%d > %d)\n",
                        ofs, frame_width);
                 return AVERROR_INVALIDDATA;
             }
@@ -364,7 +372,8 @@ static av_cold int vmdvideo_decode_end(AVCodecContext *avctx)
     VmdVideoContext *s = avctx->priv_data;
 
     av_frame_free(&s->prev_frame);
-    av_free(s->unpack_buffer);
+    av_freep(&s->unpack_buffer);
+    s->unpack_buffer_size = 0;
 
     return 0;
 }
@@ -384,9 +393,9 @@ static av_cold int vmdvideo_decode_init(AVCodecContext *avctx)
 
     /* make sure the VMD header made it */
     if (s->avctx->extradata_size != VMD_HEADER_SIZE) {
-        av_log(s->avctx, AV_LOG_ERROR, "VMD video: expected extradata size of %d\n",
+        av_log(s->avctx, AV_LOG_ERROR, "expected extradata size of %d\n",
             VMD_HEADER_SIZE);
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     vmd_header = (unsigned char *)avctx->extradata;
 
@@ -404,7 +413,8 @@ static av_cold int vmdvideo_decode_init(AVCodecContext *avctx)
         r = raw_palette[palette_index++] * 4;
         g = raw_palette[palette_index++] * 4;
         b = raw_palette[palette_index++] * 4;
-        palette32[i] = (r << 16) | (g << 8) | (b);
+        palette32[i] = 0xFFU << 24 | (r << 16) | (g << 8) | (b);
+        palette32[i] |= palette32[i] >> 6 & 0x30303;
     }
 
     s->prev_frame = av_frame_alloc();
@@ -432,10 +442,8 @@ static int vmdvideo_decode_frame(AVCodecContext *avctx,
     if (buf_size < 16)
         return AVERROR_INVALIDDATA;
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "VMD Video: get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if ((ret = vmd_decode(s, frame)) < 0)
         return ret;
diff --git a/libavcodec/vmnc.c b/libavcodec/vmnc.c
index 3ef2134117..49abb776f2 100644
--- a/libavcodec/vmnc.c
+++ b/libavcodec/vmnc.c
@@ -2,20 +2,20 @@
  * VMware Screen Codec (VMnc) decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -291,6 +291,11 @@ static int decode_hextile(VmncContext *c, uint8_t* dst, GetByteContext *gb,
                         fg = vmnc_get_pixel(gb, bpp, c->bigendian);
                     xy = bytestream2_get_byte(gb);
                     wh = bytestream2_get_byte(gb);
+                    if (   (xy >> 4) + (wh >> 4) + 1 > w - i
+                        || (xy & 0xF) + (wh & 0xF)+1 > h - j) {
+                        av_log(c->avctx, AV_LOG_ERROR, "Rectangle outside picture\n");
+                        return AVERROR_INVALIDDATA;
+                    }
                     paint_rect(dst2, xy >> 4, xy & 0xF,
                                (wh>>4)+1, (wh & 0xF)+1, fg, bpp, stride);
                 }
@@ -307,6 +312,8 @@ static void reset_buffers(VmncContext *c)
     av_freep(&c->curmask);
     av_freep(&c->screendta);
     c->cur_w = c->cur_h = 0;
+    c->cur_hx = c->cur_hy = 0;
+
 }
 
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
@@ -319,10 +326,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     uint8_t *outptr;
     int dx, dy, w, h, depth, enc, chunks, res, size_left, ret;
 
-    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
         return ret;
-    }
 
     bytestream2_init(gb, buf, buf_size);
 
@@ -360,6 +365,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     bytestream2_skip(gb, 2);
     chunks = bytestream2_get_be16(gb);
     while (chunks--) {
+        if (bytestream2_get_bytes_left(gb) < 12) {
+            av_log(avctx, AV_LOG_ERROR, "Premature end of data!\n");
+            return -1;
+        }
         dx  = bytestream2_get_be16(gb);
         dy  = bytestream2_get_be16(gb);
         w   = bytestream2_get_be16(gb);
@@ -369,6 +378,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         size_left = bytestream2_get_bytes_left(gb);
         switch (enc) {
         case MAGIC_WMVd: // cursor
+            if (w*(int64_t)h*c->bpp2 > INT_MAX/2 - 2) {
+                av_log(avctx, AV_LOG_ERROR, "dimensions too large\n");
+                return AVERROR_INVALIDDATA;
+            }
             if (size_left < 2 + w * h * c->bpp2 * 2) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Premature end of data! (need %i got %i)\n",
@@ -419,18 +432,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             c->pic->pict_type = AV_PICTURE_TYPE_I;
             depth = bytestream2_get_byte(gb);
             if (depth != c->bpp) {
-                av_log(avctx, AV_LOG_WARNING, "Depth mismatch. "
-                       "Container %i bpp / Codec %i bpp\n", c->bpp, depth);
-
-                if (depth != 8 && depth != 16 && depth != 32) {
-                    av_log(avctx, AV_LOG_ERROR,
-                           "Unsupported codec bitdepth %i\n", depth);
-                    return AVERROR_INVALIDDATA;
-                }
-
-                /* reset values */
-                c->bpp  = depth;
-                c->bpp2 = c->bpp / 8;
+                av_log(avctx, AV_LOG_INFO,
+                       "Depth mismatch. Container %i bpp, "
+                       "Frame data: %i bpp\n",
+                       c->bpp, depth);
             }
             bytestream2_skip(gb, 1);
             c->bigendian = bytestream2_get_byte(gb);
@@ -523,7 +528,6 @@ static av_cold int decode_init(AVCodecContext *avctx)
     c->width  = avctx->width;
     c->height = avctx->height;
     c->bpp    = avctx->bits_per_coded_sample;
-    c->bpp2   = c->bpp / 8;
 
     switch (c->bpp) {
     case 8:
@@ -534,14 +538,16 @@ static av_cold int decode_init(AVCodecContext *avctx)
         break;
     case 24:
         /* 24 bits is not technically supported, but some clients might
-         * mistakenly set it -- delay the actual check until decode_frame() */
+         * mistakenly set it, so let's assume they actually meant 32 bits */
+        c->bpp = 32;
     case 32:
-        avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        avctx->pix_fmt = AV_PIX_FMT_0RGB32;
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Unsupported bitdepth %i\n", c->bpp);
         return AVERROR_INVALIDDATA;
     }
+    c->bpp2 = c->bpp / 8;
 
     c->pic = av_frame_alloc();
     if (!c->pic)
@@ -556,9 +562,9 @@ static av_cold int decode_end(AVCodecContext *avctx)
 
     av_frame_free(&c->pic);
 
-    av_free(c->curbits);
-    av_free(c->curmask);
-    av_free(c->screendta);
+    av_freep(&c->curbits);
+    av_freep(&c->curmask);
+    av_freep(&c->screendta);
     return 0;
 }
 
diff --git a/libavcodec/vorbis.c b/libavcodec/vorbis.c
index 66fa21b9e1..86d10407f4 100644
--- a/libavcodec/vorbis.c
+++ b/libavcodec/vorbis.c
@@ -1,18 +1,22 @@
-/*
- * This file is part of Libav.
+/**
+ * @file
+ * Common code for Vorbis I encoder and decoder
+ * @author Denes Balatoni  ( dbalatoni programozo hu )
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -67,7 +71,7 @@ int ff_vorbis_len2vlc(uint8_t *bits, uint32_t *codes, unsigned num)
 
     codes[p] = 0;
     if (bits[p] > 32)
-        return 1;
+        return AVERROR_INVALIDDATA;
     for (i = 0; i < bits[p]; ++i)
         exit_at_level[i+1] = 1 << i;
 
@@ -81,9 +85,14 @@ int ff_vorbis_len2vlc(uint8_t *bits, uint32_t *codes, unsigned num)
 
     ++p;
 
+    for (i = p; (bits[i] == 0) && (i < num); ++i)
+        ;
+    if (i == num)
+        return 0;
+
     for (; p < num; ++p) {
         if (bits[p] > 32)
-             return 1;
+             return AVERROR_INVALIDDATA;
         if (bits[p] == 0)
              continue;
         // find corresponding exit(node which the tree can grow further from)
@@ -91,7 +100,7 @@ int ff_vorbis_len2vlc(uint8_t *bits, uint32_t *codes, unsigned num)
             if (exit_at_level[i])
                 break;
         if (!i) // overspecified tree
-             return 1;
+             return AVERROR_INVALIDDATA;
         code = exit_at_level[i];
         exit_at_level[i] = 0;
         // construct code (append 0s to end) and introduce new exits
@@ -112,7 +121,7 @@ int ff_vorbis_len2vlc(uint8_t *bits, uint32_t *codes, unsigned num)
     //no exits should be left (underspecified tree - ie. unused valid vlcs - not allowed by SPEC)
     for (p = 1; p < 33; p++)
         if (exit_at_level[p])
-            return 1;
+            return AVERROR_INVALIDDATA;
 
     return 0;
 }
diff --git a/libavcodec/vorbis.h b/libavcodec/vorbis.h
index 5ae20ac243..98dd14f9d4 100644
--- a/libavcodec/vorbis.h
+++ b/libavcodec/vorbis.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Oded Shimon <ods15@ods15.dyndns.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vorbis_data.c b/libavcodec/vorbis_data.c
index bafb77b1e8..063a075ce0 100644
--- a/libavcodec/vorbis_data.c
+++ b/libavcodec/vorbis_data.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2005 Denes Balatoni ( dbalatoni programozo hu )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vorbis_enc_data.h b/libavcodec/vorbis_enc_data.h
index a1e743ebe5..a51aaec978 100644
--- a/libavcodec/vorbis_enc_data.h
+++ b/libavcodec/vorbis_enc_data.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Oded Shimon <ods15@ods15.dyndns.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -492,13 +492,13 @@ static const struct {
     int dim;
     int subclass;
     int masterbook;
-    const int *nbooks;
+    const int nbooks[4];
 } floor_classes[] = {
-    { 3, 0, 0, (const int[]){  4             } },
-    { 4, 1, 0, (const int[]){  5,  6         } },
-    { 3, 1, 1, (const int[]){  7,  8         } },
-    { 4, 2, 2, (const int[]){ -1,  9, 10, 11 } },
-    { 3, 2, 3, (const int[]){ -1, 12, 13, 14 } },
+    { 3, 0, 0, {  4             } },
+    { 4, 1, 0, {  5,  6         } },
+    { 3, 1, 1, {  7,  8         } },
+    { 4, 2, 2, { -1,  9, 10, 11 } },
+    { 3, 2, 3, { -1, 12, 13, 14 } },
 };
 
 #endif /* AVCODEC_VORBIS_ENC_DATA_H */
diff --git a/libavcodec/vorbis_parser.c b/libavcodec/vorbis_parser.c
index 054635d100..0b2c97cde5 100644
--- a/libavcodec/vorbis_parser.c
+++ b/libavcodec/vorbis_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -173,7 +173,7 @@ static int parse_setup_header(AVVorbisParseContext *s,
     skip_bits_long(&gb, got_framing_bit);
     for (i = mode_count - 1; i >= 0; i--) {
         skip_bits_long(&gb, 40);
-        s->mode_blocksize[i] = s->blocksize[get_bits1(&gb)];
+        s->mode_blocksize[i] = get_bits1(&gb);
     }
 
 bad_header:
@@ -184,7 +184,7 @@ bad_header:
 static int vorbis_parse_init(AVVorbisParseContext *s,
                              const uint8_t *extradata, int extradata_size)
 {
-    uint8_t *header_start[3];
+    const uint8_t *header_start[3];
     int header_len[3];
     int ret;
 
@@ -205,13 +205,13 @@ static int vorbis_parse_init(AVVorbisParseContext *s,
         return ret;
 
     s->valid_extradata = 1;
-    s->previous_blocksize = s->mode_blocksize[0];
+    s->previous_blocksize = s->blocksize[s->mode_blocksize[0]];
 
     return 0;
 }
 
-int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
-                          int buf_size)
+int av_vorbis_parse_frame_flags(AVVorbisParseContext *s, const uint8_t *buf,
+                                int buf_size, int *flags)
 {
     int duration = 0;
 
@@ -220,6 +220,24 @@ int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
         int previous_blocksize = s->previous_blocksize;
 
         if (buf[0] & 1) {
+            /* If the user doesn't care about special packets, it's a bad one. */
+            if (!flags)
+                goto bad_packet;
+
+            /* Set the flag for which kind of special packet it is. */
+            if (buf[0] == 1)
+                *flags |= VORBIS_FLAG_HEADER;
+            else if (buf[0] == 3)
+                *flags |= VORBIS_FLAG_COMMENT;
+            else if (buf[0] == 5)
+                *flags |= VORBIS_FLAG_SETUP;
+            else
+                goto bad_packet;
+
+            /* Special packets have no duration. */
+            return 0;
+
+bad_packet:
             av_log(s, AV_LOG_ERROR, "Invalid packet\n");
             return AVERROR_INVALIDDATA;
         }
@@ -231,11 +249,11 @@ int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
             av_log(s, AV_LOG_ERROR, "Invalid mode in packet\n");
             return AVERROR_INVALIDDATA;
         }
-        if (mode) {
+        if(s->mode_blocksize[mode]){
             int flag = !!(buf[0] & s->prev_mask);
             previous_blocksize = s->blocksize[flag];
         }
-        current_blocksize     = s->mode_blocksize[mode];
+        current_blocksize     = s->blocksize[s->mode_blocksize[mode]];
         duration              = (previous_blocksize + current_blocksize) >> 2;
         s->previous_blocksize = current_blocksize;
     }
@@ -243,10 +261,16 @@ int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
     return duration;
 }
 
+int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
+                          int buf_size)
+{
+    return av_vorbis_parse_frame_flags(s, buf, buf_size, NULL);
+}
+
 void av_vorbis_parse_reset(AVVorbisParseContext *s)
 {
     if (s->valid_extradata)
-        s->previous_blocksize = s->mode_blocksize[0];
+        s->previous_blocksize = s->blocksize[0];
 }
 
 void av_vorbis_parse_free(AVVorbisParseContext **s)
@@ -272,22 +296,6 @@ AVVorbisParseContext *av_vorbis_parse_init(const uint8_t *extradata,
     return s;
 }
 
-#if LIBAVCODEC_VERSION_MAJOR < 57
-int avpriv_vorbis_parse_extradata(AVCodecContext *avctx, AVVorbisParseContext *s)
-{
-    return vorbis_parse_init(s, avctx->extradata, avctx->extradata_size);
-}
-void avpriv_vorbis_parse_reset(AVVorbisParseContext *s)
-{
-    av_vorbis_parse_reset(s);
-}
-int avpriv_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
-                              int buf_size)
-{
-    return av_vorbis_parse_frame(s, buf, buf_size);
-}
-#endif
-
 #if CONFIG_VORBIS_PARSER
 
 typedef struct VorbisParseContext {
diff --git a/libavcodec/vorbis_parser.h b/libavcodec/vorbis_parser.h
index 9ae1630f9f..06e48bd3b0 100644
--- a/libavcodec/vorbis_parser.h
+++ b/libavcodec/vorbis_parser.h
@@ -1,19 +1,19 @@
 /*
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,6 +45,24 @@ AVVorbisParseContext *av_vorbis_parse_init(const uint8_t *extradata,
  */
 void av_vorbis_parse_free(AVVorbisParseContext **s);
 
+#define VORBIS_FLAG_HEADER  0x00000001
+#define VORBIS_FLAG_COMMENT 0x00000002
+#define VORBIS_FLAG_SETUP   0x00000004
+
+/**
+ * Get the duration for a Vorbis packet.
+ *
+ * If @p flags is @c NULL,
+ * special frames are considered invalid.
+ *
+ * @param s        Vorbis parser context
+ * @param buf      buffer containing a Vorbis frame
+ * @param buf_size size of the buffer
+ * @param flags    flags for special frames
+ */
+int av_vorbis_parse_frame_flags(AVVorbisParseContext *s, const uint8_t *buf,
+                                int buf_size, int *flags);
+
 /**
  * Get the duration for a Vorbis packet.
  *
diff --git a/libavcodec/vorbis_parser_internal.h b/libavcodec/vorbis_parser_internal.h
index 8f76af706d..98e48524a4 100644
--- a/libavcodec/vorbis_parser_internal.h
+++ b/libavcodec/vorbis_parser_internal.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,29 +43,4 @@ struct AVVorbisParseContext {
     int prev_mask;              ///< bitmask used to get the previous mode flag in each packet
 };
 
-#if LIBAVCODEC_VERSION_MAJOR < 57
-/**
- * Initialize the Vorbis parser using headers in the extradata.
- *
- * @param avctx codec context
- * @param s     Vorbis parser context
- */
-int avpriv_vorbis_parse_extradata(AVCodecContext *avctx, AVVorbisParseContext *s);
-
-/**
- * Get the duration for a Vorbis packet.
- *
- * avpriv_vorbis_parse_extradata() must have been successfully called prior to
- * this in order for a correct duration to be returned.
- *
- * @param s        Vorbis parser context
- * @param buf      buffer containing a Vorbis frame
- * @param buf_size size of the buffer
- */
-int avpriv_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
-                              int buf_size);
-
-void avpriv_vorbis_parse_reset(AVVorbisParseContext *s);
-#endif
-
 #endif /* AVCODEC_VORBIS_PARSER_H */
diff --git a/libavcodec/vorbisdec.c b/libavcodec/vorbisdec.c
index abc01c8617..f773afaa41 100644
--- a/libavcodec/vorbisdec.c
+++ b/libavcodec/vorbisdec.c
@@ -1,18 +1,22 @@
-/*
- * This file is part of Libav.
+/**
+ * @file
+ * Vorbis I decoder
+ * @author Denes Balatoni  ( dbalatoni programozo hu )
+ *
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +31,7 @@
 
 #define BITSTREAM_READER_LE
 #include "libavutil/float_dsp.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "fft.h"
@@ -121,7 +126,7 @@ typedef struct vorbis_context_s {
     AVCodecContext *avctx;
     GetBitContext gb;
     VorbisDSPContext dsp;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
 
     FFTContext mdct[2];
     uint8_t       first_frame;
@@ -144,7 +149,7 @@ typedef struct vorbis_context_s {
     uint8_t       mode_count;
     vorbis_mode  *modes;
     uint8_t       mode_number; // mode number for the current packet
-    uint8_t       previous_window;
+    int8_t       previous_window;
     float        *channel_residues;
     float        *saved;
 } vorbis_context;
@@ -186,38 +191,43 @@ static void vorbis_free(vorbis_context *vc)
 
     av_freep(&vc->channel_residues);
     av_freep(&vc->saved);
+    av_freep(&vc->fdsp);
 
-    for (i = 0; i < vc->residue_count; i++)
-        av_free(vc->residues[i].classifs);
+    if (vc->residues)
+        for (i = 0; i < vc->residue_count; i++)
+            av_freep(&vc->residues[i].classifs);
     av_freep(&vc->residues);
     av_freep(&vc->modes);
 
     ff_mdct_end(&vc->mdct[0]);
     ff_mdct_end(&vc->mdct[1]);
 
-    for (i = 0; i < vc->codebook_count; ++i) {
-        av_free(vc->codebooks[i].codevectors);
-        ff_free_vlc(&vc->codebooks[i].vlc);
-    }
+    if (vc->codebooks)
+        for (i = 0; i < vc->codebook_count; ++i) {
+            av_freep(&vc->codebooks[i].codevectors);
+            ff_free_vlc(&vc->codebooks[i].vlc);
+        }
     av_freep(&vc->codebooks);
 
-    for (i = 0; i < vc->floor_count; ++i) {
-        if (vc->floors[i].floor_type == 0) {
-            av_free(vc->floors[i].data.t0.map[0]);
-            av_free(vc->floors[i].data.t0.map[1]);
-            av_free(vc->floors[i].data.t0.book_list);
-            av_free(vc->floors[i].data.t0.lsp);
-        } else {
-            av_free(vc->floors[i].data.t1.list);
+    if (vc->floors)
+        for (i = 0; i < vc->floor_count; ++i) {
+            if (vc->floors[i].floor_type == 0) {
+                av_freep(&vc->floors[i].data.t0.map[0]);
+                av_freep(&vc->floors[i].data.t0.map[1]);
+                av_freep(&vc->floors[i].data.t0.book_list);
+                av_freep(&vc->floors[i].data.t0.lsp);
+            } else {
+                av_freep(&vc->floors[i].data.t1.list);
+            }
         }
-    }
     av_freep(&vc->floors);
 
-    for (i = 0; i < vc->mapping_count; ++i) {
-        av_free(vc->mappings[i].magnitude);
-        av_free(vc->mappings[i].angle);
-        av_free(vc->mappings[i].mux);
-    }
+    if (vc->mappings)
+        for (i = 0; i < vc->mapping_count; ++i) {
+            av_freep(&vc->mappings[i].magnitude);
+            av_freep(&vc->mappings[i].angle);
+            av_freep(&vc->mappings[i].mux);
+        }
     av_freep(&vc->mappings);
 }
 
@@ -369,10 +379,12 @@ static int vorbis_parse_setup_hdr_codebooks(vorbis_context *vc)
 // Weed out unused vlcs and build codevector vector
             if (used_entries) {
                 codebook_setup->codevectors =
-                    av_mallocz(used_entries * codebook_setup->dimensions *
+                    av_mallocz_array(used_entries, codebook_setup->dimensions *
                                sizeof(*codebook_setup->codevectors));
-                if (!codebook_setup->codevectors)
-                    return AVERROR(ENOMEM);
+                if (!codebook_setup->codevectors) {
+                    ret = AVERROR(ENOMEM);
+                    goto error;
+                }
             } else
                 codebook_setup->codevectors = NULL;
 
@@ -555,7 +567,7 @@ static int vorbis_parse_setup_hdr_floors(vorbis_context *vc)
             for (j = 0; j < floor_setup->data.t1.partitions; ++j)
                 floor_setup->data.t1.x_list_dim+=floor_setup->data.t1.class_dimensions[floor_setup->data.t1.partition_class[j]];
 
-            floor_setup->data.t1.list = av_mallocz(floor_setup->data.t1.x_list_dim *
+            floor_setup->data.t1.list = av_mallocz_array(floor_setup->data.t1.x_list_dim,
                                                    sizeof(*floor_setup->data.t1.list));
             if (!floor_setup->data.t1.list)
                 return AVERROR(ENOMEM);
@@ -634,8 +646,8 @@ static int vorbis_parse_setup_hdr_floors(vorbis_context *vc)
             /* codebook dim is for padding if codebook dim doesn't *
              * divide order+1 then we need to read more data       */
             floor_setup->data.t0.lsp =
-                av_malloc((floor_setup->data.t0.order + 1 + max_codebook_dim)
-                          * sizeof(*floor_setup->data.t0.lsp));
+                av_malloc_array((floor_setup->data.t0.order + 1 + max_codebook_dim),
+                                sizeof(*floor_setup->data.t0.lsp));
             if (!floor_setup->data.t0.lsp)
                 return AVERROR(ENOMEM);
 
@@ -695,8 +707,7 @@ static int vorbis_parse_setup_hdr_residues(vorbis_context *vc)
         res_setup->partition_size = get_bits(gb, 24) + 1;
         /* Validations to prevent a buffer overflow later. */
         if (res_setup->begin>res_setup->end ||
-            res_setup->end > (res_setup->type == 2 ? vc->avctx->channels : 1) * vc->blocksize[1] / 2 ||
-            (res_setup->end-res_setup->begin) / res_setup->partition_size > V_MAX_PARTITIONS) {
+            (res_setup->end-res_setup->begin) / res_setup->partition_size > FFMIN(V_MAX_PARTITIONS, 65535)) {
             av_log(vc->avctx, AV_LOG_ERROR,
                    "partition out of bounds: type, begin, end, size, blocksize: %"PRIu16", %"PRIu32", %"PRIu32", %u, %"PRIu32"\n",
                    res_setup->type, res_setup->begin, res_setup->end,
@@ -709,7 +720,7 @@ static int vorbis_parse_setup_hdr_residues(vorbis_context *vc)
 
         res_setup->ptns_to_read =
             (res_setup->end - res_setup->begin) / res_setup->partition_size;
-        res_setup->classifs = av_malloc(res_setup->ptns_to_read *
+        res_setup->classifs = av_malloc_array(res_setup->ptns_to_read,
                                         vc->audio_channels *
                                         sizeof(*res_setup->classifs));
         if (!res_setup->classifs)
@@ -802,7 +813,7 @@ static int vorbis_parse_setup_hdr_mappings(vorbis_context *vc)
         }
 
         if (mapping_setup->submaps>1) {
-            mapping_setup->mux = av_mallocz(vc->audio_channels *
+            mapping_setup->mux = av_mallocz_array(vc->audio_channels,
                                             sizeof(*mapping_setup->mux));
             if (!mapping_setup->mux)
                 return AVERROR(ENOMEM);
@@ -837,7 +848,7 @@ static int create_map(vorbis_context *vc, unsigned floor_number)
     for (blockflag = 0; blockflag < 2; ++blockflag) {
         n = vc->blocksize[blockflag] / 2;
         floors[floor_number].data.t0.map[blockflag] =
-            av_malloc((n + 1) * sizeof(int32_t)); // n + sentinel
+            av_malloc_array(n + 1, sizeof(int32_t)); // n + sentinel
         if (!floors[floor_number].data.t0.map[blockflag])
             return AVERROR(ENOMEM);
 
@@ -964,12 +975,12 @@ static int vorbis_parse_id_hdr(vorbis_context *vc)
     vc->bitrate_minimum = get_bits_long(gb, 32);
     bl0 = get_bits(gb, 4);
     bl1 = get_bits(gb, 4);
-    vc->blocksize[0] = (1 << bl0);
-    vc->blocksize[1] = (1 << bl1);
     if (bl0 > 13 || bl0 < 6 || bl1 > 13 || bl1 < 6 || bl1 < bl0) {
         av_log(vc->avctx, AV_LOG_ERROR, " Vorbis id header packet corrupt (illegal blocksize). \n");
         return AVERROR_INVALIDDATA;
     }
+    vc->blocksize[0] = (1 << bl0);
+    vc->blocksize[1] = (1 << bl1);
     vc->win[0] = ff_vorbis_vwin[bl0 - 6];
     vc->win[1] = ff_vorbis_vwin[bl1 - 6];
 
@@ -978,15 +989,18 @@ static int vorbis_parse_id_hdr(vorbis_context *vc)
         return AVERROR_INVALIDDATA;
     }
 
-    vc->channel_residues =  av_malloc((vc->blocksize[1]  / 2) * vc->audio_channels * sizeof(*vc->channel_residues));
-    vc->saved            =  av_mallocz((vc->blocksize[1] / 4) * vc->audio_channels * sizeof(*vc->saved));
+    vc->channel_residues =  av_malloc_array(vc->blocksize[1]  / 2, vc->audio_channels * sizeof(*vc->channel_residues));
+    vc->saved            =  av_mallocz_array(vc->blocksize[1] / 4, vc->audio_channels * sizeof(*vc->saved));
     if (!vc->channel_residues || !vc->saved)
         return AVERROR(ENOMEM);
 
-    vc->previous_window  = 0;
+    vc->previous_window  = -1;
 
     ff_mdct_init(&vc->mdct[0], bl0, 1, -1.0);
     ff_mdct_init(&vc->mdct[1], bl1, 1, -1.0);
+    vc->fdsp = avpriv_float_dsp_alloc(vc->avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!vc->fdsp)
+        return AVERROR(ENOMEM);
 
     ff_dlog(NULL, " vorbis version %d \n audio_channels %d \n audio_samplerate %d \n bitrate_max %d \n bitrate_nom %d \n bitrate_min %d \n blk_0 %d blk_1 %d \n ",
             vc->version, vc->audio_channels, vc->audio_samplerate, vc->bitrate_maximum, vc->bitrate_nominal, vc->bitrate_minimum, vc->blocksize[0], vc->blocksize[1]);
@@ -1008,14 +1022,13 @@ static av_cold int vorbis_decode_init(AVCodecContext *avctx)
     vorbis_context *vc = avctx->priv_data;
     uint8_t *headers   = avctx->extradata;
     int headers_len    = avctx->extradata_size;
-    uint8_t *header_start[3];
+    const uint8_t *header_start[3];
     int header_len[3];
     GetBitContext *gb = &vc->gb;
     int hdr_type, ret;
 
     vc->avctx = avctx;
     ff_vorbisdsp_init(&vc->dsp);
-    avpriv_float_dsp_init(&vc->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
 
     avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
 
@@ -1187,7 +1200,7 @@ static int vorbis_floor1_decode(vorbis_context *vc,
     uint16_t floor1_Y[258];
     uint16_t floor1_Y_final[258];
     int floor1_flag[258];
-    unsigned class, cdim, cbits, csub, cval, offset, i, j;
+    unsigned partition_class, cdim, cbits, csub, cval, offset, i, j;
     int book, adx, ady, dy, off, predicted, err;
 
 
@@ -1203,28 +1216,31 @@ static int vorbis_floor1_decode(vorbis_context *vc,
 
     offset = 2;
     for (i = 0; i < vf->partitions; ++i) {
-        class = vf->partition_class[i];
-        cdim   = vf->class_dimensions[class];
-        cbits  = vf->class_subclasses[class];
+        partition_class = vf->partition_class[i];
+        cdim   = vf->class_dimensions[partition_class];
+        cbits  = vf->class_subclasses[partition_class];
         csub = (1 << cbits) - 1;
         cval = 0;
 
         ff_dlog(NULL, "Cbits %u\n", cbits);
 
         if (cbits) // this reads all subclasses for this partition's class
-            cval = get_vlc2(gb, vc->codebooks[vf->class_masterbook[class]].vlc.table,
-                            vc->codebooks[vf->class_masterbook[class]].nb_bits, 3);
+            cval = get_vlc2(gb, vc->codebooks[vf->class_masterbook[partition_class]].vlc.table,
+                            vc->codebooks[vf->class_masterbook[partition_class]].nb_bits, 3);
 
         for (j = 0; j < cdim; ++j) {
-            book = vf->subclass_books[class][cval & csub];
+            book = vf->subclass_books[partition_class][cval & csub];
 
             ff_dlog(NULL, "book %d Cbits %u cval %u  bits:%d\n",
                     book, cbits, cval, get_bits_count(gb));
 
             cval = cval >> cbits;
             if (book > -1) {
-                floor1_Y[offset+j] = get_vlc2(gb, vc->codebooks[book].vlc.table,
-                vc->codebooks[book].nb_bits, 3);
+                int v = get_vlc2(gb, vc->codebooks[book].vlc.table,
+                                 vc->codebooks[book].nb_bits, 3);
+                if (v < 0)
+                    return AVERROR_INVALIDDATA;
+                floor1_Y[offset+j] = v;
             } else {
                 floor1_Y[offset+j] = 0;
             }
@@ -1305,7 +1321,9 @@ static av_always_inline int setup_classifs(vorbis_context *vc,
                                            vorbis_residue *vr,
                                            uint8_t *do_not_decode,
                                            unsigned ch_used,
-                                           int partition_count)
+                                           int partition_count,
+                                           int ptns_to_read
+                                          )
 {
     vorbis_codebook *codebook = vc->codebooks + vr->classbook;
     int p, j, i;
@@ -1319,21 +1337,25 @@ static av_always_inline int setup_classifs(vorbis_context *vc,
 
             ff_dlog(NULL, "Classword: %u\n", temp);
 
+            av_assert0(temp < 65536);
+
             if (temp < 0) {
                 av_log(vc->avctx, AV_LOG_ERROR,
                        "Invalid vlc code decoding %d channel.", j);
                 return AVERROR_INVALIDDATA;
             }
 
+            av_assert0(vr->classifications > 1); //needed for inverse[]
+
             for (i = partition_count + c_p_c - 1; i >= partition_count; i--) {
                 temp2 = (((uint64_t)temp) * inverse_class) >> 32;
 
-                if (i < vr->ptns_to_read)
+                if (i < ptns_to_read)
                     vr->classifs[p + i] = temp - temp2 * vr->classifications;
                 temp = temp2;
             }
         }
-        p += vr->ptns_to_read;
+        p += ptns_to_read;
     }
     return 0;
 }
@@ -1354,6 +1376,7 @@ static av_always_inline int vorbis_residue_decode_internal(vorbis_context *vc,
     unsigned pass, ch_used, i, j, k, l;
     unsigned max_output = (ch - 1) * vlen;
     int ptns_to_read = vr->ptns_to_read;
+    int libvorbis_bug = 0;
 
     if (vr_type == 2) {
         for (j = 1; j < ch; ++j)
@@ -1368,8 +1391,13 @@ static av_always_inline int vorbis_residue_decode_internal(vorbis_context *vc,
     }
 
     if (max_output > ch_left * vlen) {
-        av_log(vc->avctx, AV_LOG_ERROR, "Insufficient output buffer\n");
-        return AVERROR_INVALIDDATA;
+        if (max_output <= ch_left * vlen + vr->partition_size*ch_used/ch) {
+            ptns_to_read--;
+            libvorbis_bug = 1;
+        } else {
+            av_log(vc->avctx, AV_LOG_ERROR, "Insufficient output buffer\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
 
     ff_dlog(NULL, " residue type 0/1/2 decode begin, ch: %d  cpc %d  \n", ch, c_p_c);
@@ -1380,7 +1408,7 @@ static av_always_inline int vorbis_residue_decode_internal(vorbis_context *vc,
         voffset = vr->begin;
         for (partition_count = 0; partition_count < ptns_to_read;) {  // SPEC        error
             if (!pass) {
-                int ret = setup_classifs(vc, vr, do_not_decode, ch_used, partition_count);
+                int ret = setup_classifs(vc, vr, do_not_decode, ch_used, partition_count, ptns_to_read);
                 if (ret < 0)
                     return ret;
             }
@@ -1478,6 +1506,14 @@ static av_always_inline int vorbis_residue_decode_internal(vorbis_context *vc,
                 voffset += vr->partition_size;
             }
         }
+        if (libvorbis_bug && !pass) {
+            for (j = 0; j < ch_used; ++j) {
+                if (!do_not_decode[j]) {
+                    get_vlc2(&vc->gb, vc->codebooks[vr->classbook].vlc.table,
+                                vc->codebooks[vr->classbook].nb_bits, 3);
+                }
+            }
+        }
     }
     return 0;
 }
@@ -1530,7 +1566,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
 {
     GetBitContext *gb = &vc->gb;
     FFTContext *mdct;
-    unsigned previous_window = vc->previous_window;
+    int previous_window = vc->previous_window;
     unsigned mode_number, blockflag, blocksize;
     int i, j;
     uint8_t no_residue[255];
@@ -1563,9 +1599,11 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
     blocksize = vc->blocksize[blockflag];
     vlen = blocksize / 2;
     if (blockflag) {
-        previous_window = get_bits(gb, 1);
-        skip_bits1(gb); // next_window
-    }
+        int code = get_bits(gb, 2);
+        if (previous_window < 0)
+            previous_window = code>>1;
+    } else if (previous_window < 0)
+        previous_window = 0;
 
     memset(ch_res_ptr,   0, sizeof(float) * vc->audio_channels * vlen); //FIXME can this be removed ?
     for (i = 0; i < vc->audio_channels; ++i)
@@ -1653,7 +1691,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
 
     for (j = vc->audio_channels-1;j >= 0; j--) {
         ch_res_ptr   = vc->channel_residues + res_chan[j] * blocksize / 2;
-        vc->fdsp.vector_fmul(floor_ptr[j], floor_ptr[j], ch_res_ptr, blocksize / 2);
+        vc->fdsp->vector_fmul(floor_ptr[j], floor_ptr[j], ch_res_ptr, blocksize / 2);
         mdct->imdct_half(mdct, ch_res_ptr, floor_ptr[j]);
     }
 
@@ -1670,13 +1708,13 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
         const float *win  = vc->win[blockflag & previous_window];
 
         if (blockflag == previous_window) {
-            vc->fdsp.vector_fmul_window(ret, saved, buf, win, blocksize / 4);
+            vc->fdsp->vector_fmul_window(ret, saved, buf, win, blocksize / 4);
         } else if (blockflag > previous_window) {
-            vc->fdsp.vector_fmul_window(ret, saved, buf, win, bs0 / 4);
+            vc->fdsp->vector_fmul_window(ret, saved, buf, win, bs0 / 4);
             memcpy(ret+bs0/2, buf+bs0/4, ((bs1-bs0)/4) * sizeof(float));
         } else {
             memcpy(ret, saved, ((bs1 - bs0) / 4) * sizeof(float));
-            vc->fdsp.vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, bs0 / 4);
+            vc->fdsp->vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, bs0 / 4);
         }
         memcpy(saved, buf + blocksize / 4, blocksize / 4 * sizeof(float));
     }
@@ -1700,12 +1738,53 @@ static int vorbis_decode_frame(AVCodecContext *avctx, void *data,
 
     ff_dlog(NULL, "packet length %d \n", buf_size);
 
+    if (*buf == 1 && buf_size > 7) {
+        if ((ret = init_get_bits8(gb, buf + 1, buf_size - 1)) < 0)
+            return ret;
+
+        vorbis_free(vc);
+        if ((ret = vorbis_parse_id_hdr(vc))) {
+            av_log(avctx, AV_LOG_ERROR, "Id header corrupt.\n");
+            vorbis_free(vc);
+            return ret;
+        }
+
+        if (vc->audio_channels > 8)
+            avctx->channel_layout = 0;
+        else
+            avctx->channel_layout = ff_vorbis_channel_layouts[vc->audio_channels - 1];
+
+        avctx->channels    = vc->audio_channels;
+        avctx->sample_rate = vc->audio_samplerate;
+        return buf_size;
+    }
+
+    if (*buf == 3 && buf_size > 7) {
+        av_log(avctx, AV_LOG_DEBUG, "Ignoring comment header\n");
+        return buf_size;
+    }
+
+    if (*buf == 5 && buf_size > 7 && vc->channel_residues && !vc->modes) {
+        if ((ret = init_get_bits8(gb, buf + 1, buf_size - 1)) < 0)
+            return ret;
+
+        if ((ret = vorbis_parse_setup_hdr(vc))) {
+            av_log(avctx, AV_LOG_ERROR, "Setup header corrupt.\n");
+            vorbis_free(vc);
+            return ret;
+        }
+        return buf_size;
+    }
+
+    if (!vc->channel_residues || !vc->modes) {
+        av_log(avctx, AV_LOG_ERROR, "Data packet before valid headers\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* get output buffer */
     frame->nb_samples = vc->blocksize[1] / 2;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if (vc->audio_channels > 8) {
         for (i = 0; i < vc->audio_channels; i++)
@@ -1717,7 +1796,8 @@ static int vorbis_decode_frame(AVCodecContext *avctx, void *data,
         }
     }
 
-    init_get_bits(gb, buf, buf_size*8);
+    if ((ret = init_get_bits8(gb, buf, buf_size)) < 0)
+        return ret;
 
     if ((len = vorbis_parse_audio_packet(vc, channel_ptrs)) <= 0)
         return len;
@@ -1757,7 +1837,8 @@ static av_cold void vorbis_decode_flush(AVCodecContext *avctx)
         memset(vc->saved, 0, (vc->blocksize[1] / 4) * vc->audio_channels *
                              sizeof(*vc->saved));
     }
-    vc->previous_window = 0;
+    vc->previous_window = -1;
+    vc->first_frame = 0;
 }
 
 AVCodec ff_vorbis_decoder = {
diff --git a/libavcodec/vorbisdsp.c b/libavcodec/vorbisdsp.c
index c37e2c4124..362a276296 100644
--- a/libavcodec/vorbisdsp.c
+++ b/libavcodec/vorbisdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vorbisdsp.h b/libavcodec/vorbisdsp.h
index ea41c401ce..7abec4e4b7 100644
--- a/libavcodec/vorbisdsp.h
+++ b/libavcodec/vorbisdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vorbisenc.c b/libavcodec/vorbisenc.c
index 35bdd57fb1..2974ca2cf9 100644
--- a/libavcodec/vorbisenc.c
+++ b/libavcodec/vorbisenc.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Oded Shimon <ods15@ods15.dyndns.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -142,9 +142,9 @@ typedef struct vorbis_enc_context {
 static inline int put_codeword(PutBitContext *pb, vorbis_enc_codebook *cb,
                                int entry)
 {
-    assert(entry >= 0);
-    assert(entry < cb->nentries);
-    assert(cb->lens[entry]);
+    av_assert2(entry >= 0);
+    av_assert2(entry < cb->nentries);
+    av_assert2(cb->lens[entry]);
     if (pb->size_in_bits - put_bits_count(pb) < cb->lens[entry])
         return AVERROR(EINVAL);
     put_bits(pb, cb->lens[entry], cb->codewords[entry]);
@@ -170,8 +170,8 @@ static int ready_codebook(vorbis_enc_codebook *cb)
         cb->pow2 = cb->dimensions = NULL;
     } else {
         int vals = cb_lookup_vals(cb->lookup, cb->ndimensions, cb->nentries);
-        cb->dimensions = av_malloc(sizeof(float) * cb->nentries * cb->ndimensions);
-        cb->pow2 = av_mallocz(sizeof(float) * cb->nentries);
+        cb->dimensions = av_malloc_array(cb->nentries, sizeof(float) * cb->ndimensions);
+        cb->pow2 = av_mallocz_array(cb->nentries, sizeof(float));
         if (!cb->dimensions || !cb->pow2)
             return AVERROR(ENOMEM);
         for (i = 0; i < cb->nentries; i++) {
@@ -200,8 +200,8 @@ static int ready_codebook(vorbis_enc_codebook *cb)
 static int ready_residue(vorbis_enc_residue *rc, vorbis_enc_context *venc)
 {
     int i;
-    assert(rc->type == 2);
-    rc->maxes = av_mallocz(sizeof(float[2]) * rc->classifications);
+    av_assert0(rc->type == 2);
+    rc->maxes = av_mallocz_array(rc->classifications, sizeof(float[2]));
     if (!rc->maxes)
         return AVERROR(ENOMEM);
     for (i = 0; i < rc->classifications; i++) {
@@ -266,8 +266,8 @@ static int create_vorbis_context(vorbis_enc_context *venc,
         cb->lookup      = cvectors[book].lookup;
         cb->seq_p       = 0;
 
-        cb->lens      = av_malloc(sizeof(uint8_t)  * cb->nentries);
-        cb->codewords = av_malloc(sizeof(uint32_t) * cb->nentries);
+        cb->lens      = av_malloc_array(cb->nentries, sizeof(uint8_t));
+        cb->codewords = av_malloc_array(cb->nentries, sizeof(uint32_t));
         if (!cb->lens || !cb->codewords)
             return AVERROR(ENOMEM);
         memcpy(cb->lens, cvectors[book].clens, cvectors[book].len);
@@ -275,7 +275,7 @@ static int create_vorbis_context(vorbis_enc_context *venc,
 
         if (cb->lookup) {
             vals = cb_lookup_vals(cb->lookup, cb->ndimensions, cb->nentries);
-            cb->quantlist = av_malloc(sizeof(int) * vals);
+            cb->quantlist = av_malloc_array(vals, sizeof(int));
             if (!cb->quantlist)
                 return AVERROR(ENOMEM);
             for (i = 0; i < vals; i++)
@@ -305,7 +305,7 @@ static int create_vorbis_context(vorbis_enc_context *venc,
         fc->nclasses = FFMAX(fc->nclasses, fc->partition_to_class[i]);
     }
     fc->nclasses++;
-    fc->classes = av_malloc(sizeof(vorbis_enc_floor_class) * fc->nclasses);
+    fc->classes = av_malloc_array(fc->nclasses, sizeof(vorbis_enc_floor_class));
     if (!fc->classes)
         return AVERROR(ENOMEM);
     for (i = 0; i < fc->nclasses; i++) {
@@ -315,7 +315,7 @@ static int create_vorbis_context(vorbis_enc_context *venc,
         c->subclass   = floor_classes[i].subclass;
         c->masterbook = floor_classes[i].masterbook;
         books         = (1 << c->subclass);
-        c->books      = av_malloc(sizeof(int) * books);
+        c->books      = av_malloc_array(books, sizeof(int));
         if (!c->books)
             return AVERROR(ENOMEM);
         for (j = 0; j < books; j++)
@@ -328,7 +328,7 @@ static int create_vorbis_context(vorbis_enc_context *venc,
     for (i = 0; i < fc->partitions; i++)
         fc->values += fc->classes[fc->partition_to_class[i]].dim;
 
-    fc->list = av_malloc(sizeof(vorbis_floor1_entry) * fc->values);
+    fc->list = av_malloc_array(fc->values, sizeof(vorbis_floor1_entry));
     if (!fc->list)
         return AVERROR(ENOMEM);
     fc->list[0].x = 0;
@@ -419,10 +419,10 @@ static int create_vorbis_context(vorbis_enc_context *venc,
     venc->modes[0].mapping   = 0;
 
     venc->have_saved = 0;
-    venc->saved      = av_malloc(sizeof(float) * venc->channels * (1 << venc->log2_blocksize[1]) / 2);
-    venc->samples    = av_malloc(sizeof(float) * venc->channels * (1 << venc->log2_blocksize[1]));
-    venc->floor      = av_malloc(sizeof(float) * venc->channels * (1 << venc->log2_blocksize[1]) / 2);
-    venc->coeffs     = av_malloc(sizeof(float) * venc->channels * (1 << venc->log2_blocksize[1]) / 2);
+    venc->saved      = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]) / 2);
+    venc->samples    = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]));
+    venc->floor      = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]) / 2);
+    venc->coeffs     = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]) / 2);
     if (!venc->saved || !venc->samples || !venc->floor || !venc->coeffs)
         return AVERROR(ENOMEM);
 
@@ -585,9 +585,11 @@ static int put_main_header(vorbis_enc_context *venc, uint8_t **out)
 {
     int i;
     PutBitContext pb;
-    uint8_t buffer[50000] = {0}, *p = buffer;
-    int buffer_len = sizeof buffer;
     int len, hlens[3];
+    int buffer_len = 50000;
+    uint8_t *buffer = av_mallocz(buffer_len), *p = buffer;
+    if (!buffer)
+        return AVERROR(ENOMEM);
 
     // identification header
     init_put_bits(&pb, p, buffer_len);
@@ -710,6 +712,7 @@ static int put_main_header(vorbis_enc_context *venc, uint8_t **out)
         buffer_len += hlens[i];
     }
 
+    av_freep(&buffer);
     return p - *out;
 }
 
@@ -880,8 +883,8 @@ static int residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
     int classes[MAX_CHANNELS][NUM_RESIDUE_PARTITIONS];
     int classwords = venc->codebooks[rc->classbook].ndimensions;
 
-    assert(rc->type == 2);
-    assert(real_ch == 2);
+    av_assert0(rc->type == 2);
+    av_assert0(real_ch == 2);
     for (p = 0; p < partitions; p++) {
         float max1 = 0.0, max2 = 0.0;
         int s = rc->begin + p * psize;
@@ -1015,7 +1018,6 @@ static int apply_window_and_mdct(vorbis_enc_context *venc,
     return 1;
 }
 
-
 static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                                const AVFrame *frame, int *got_packet_ptr)
 {
@@ -1031,10 +1033,8 @@ static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         return 0;
     samples = 1 << (venc->log2_blocksize[0] - 1);
 
-    if ((ret = ff_alloc_packet(avpkt, 8192))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 8192, 0)) < 0)
         return ret;
-    }
 
     init_put_bits(&pb, avpkt->data, avpkt->size);
 
@@ -1170,7 +1170,7 @@ static av_cold int vorbis_encode_init(AVCodecContext *avctx)
     int ret;
 
     if (avctx->channels != 2) {
-        av_log(avctx, AV_LOG_ERROR, "Current Libav Vorbis encoder only supports 2 channels.\n");
+        av_log(avctx, AV_LOG_ERROR, "Current FFmpeg Vorbis encoder only supports 2 channels.\n");
         return -1;
     }
 
@@ -1181,7 +1181,7 @@ static av_cold int vorbis_encode_init(AVCodecContext *avctx)
     if (avctx->flags & AV_CODEC_FLAG_QSCALE)
         venc->quality = avctx->global_quality / (float)FF_QP2LAMBDA;
     else
-        venc->quality = 3.0;
+        venc->quality = 8;
     venc->quality *= venc->quality;
 
     if ((ret = put_main_header(venc, (uint8_t**)&avctx->extradata)) < 0)
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index c2fc3c90d9..1e5547443f 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2003-2004 the ffmpeg project
+ * Copyright (c) 2003-2004 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,6 +77,10 @@ typedef struct Vp3Fragment {
 /* special internal mode */
 #define MODE_COPY             8
 
+static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb);
+static int theora_decode_tables(AVCodecContext *avctx, GetBitContext *gb);
+
+
 /* There are 6 preset schemes, plus a free-form scheme */
 static const int ModeAlphabet[6][CODING_MODE_COUNT] = {
     /* scheme 1: Last motion vector dominates */
@@ -173,6 +177,7 @@ typedef struct Vp3DecodeContext {
     int data_offset[3];
     uint8_t offset_x;
     uint8_t offset_y;
+    int offset_x_warned;
 
     int8_t (*motion_val[2])[2];
 
@@ -260,6 +265,20 @@ typedef struct Vp3DecodeContext {
  * VP3 specific functions
  ************************************************************************/
 
+static av_cold void free_tables(AVCodecContext *avctx)
+{
+    Vp3DecodeContext *s = avctx->priv_data;
+
+    av_freep(&s->superblock_coding);
+    av_freep(&s->all_fragments);
+    av_freep(&s->coded_fragment_list[0]);
+    av_freep(&s->dct_tokens_base);
+    av_freep(&s->superblock_fragments);
+    av_freep(&s->macroblock_coding);
+    av_freep(&s->motion_val[0]);
+    av_freep(&s->motion_val[1]);
+}
+
 static void vp3_decode_flush(AVCodecContext *avctx)
 {
     Vp3DecodeContext *s = avctx->priv_data;
@@ -277,16 +296,11 @@ static av_cold int vp3_decode_end(AVCodecContext *avctx)
     Vp3DecodeContext *s = avctx->priv_data;
     int i;
 
-    av_freep(&s->superblock_coding);
-    av_freep(&s->all_fragments);
-    av_freep(&s->coded_fragment_list[0]);
-    av_freep(&s->dct_tokens_base);
-    av_freep(&s->superblock_fragments);
-    av_freep(&s->macroblock_coding);
-    av_freep(&s->motion_val[0]);
-    av_freep(&s->motion_val[1]);
+    free_tables(avctx);
     av_freep(&s->edge_emu_buffer);
 
+    s->theora_tables = 0;
+
     /* release all frames */
     vp3_decode_flush(avctx);
     av_frame_free(&s->current_frame.f);
@@ -312,7 +326,7 @@ static av_cold int vp3_decode_end(AVCodecContext *avctx)
     return 0;
 }
 
-/*
+/**
  * This function sets up all of the various blocks mappings:
  * superblocks <-> fragments, macroblocks <-> fragments,
  * superblocks <-> macroblocks
@@ -403,7 +417,7 @@ static void init_loop_filter(Vp3DecodeContext *s)
     int value;
 
     filter_limit = s->filter_limit_values[s->qps[0]];
-    assert(filter_limit < 128);
+    av_assert0(filter_limit < 128U);
 
     /* set up the bounding values */
     memset(s->bounding_values_array, 0, 256 * sizeof(int));
@@ -456,7 +470,7 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
             if (current_run == 34)
                 current_run += get_bits(gb, 12);
 
-            if (current_superblock + current_run > s->superblock_count) {
+            if (current_run > s->superblock_count - current_superblock) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "Invalid partially coded superblock run length\n");
                 return -1;
@@ -1021,7 +1035,7 @@ static int unpack_vlcs(Vp3DecodeContext *s, GetBitContext *gb,
     if (blocks_ended > s->num_coded_frags[plane][coeff_index])
         av_log(s->avctx, AV_LOG_ERROR, "More blocks ended than coded!\n");
 
-    // decrement the number of blocks that have higher coeffecients for each
+    // decrement the number of blocks that have higher coefficients for each
     // EOB run at this level
     if (blocks_ended)
         for (i = coeff_index + 1; i < 64; i++)
@@ -1600,20 +1614,14 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                         /* invert DCT and place (or add) in final output */
 
                         if (s->all_fragments[i].coding_method == MODE_INTRA) {
-                            int index;
-                            index = vp3_dequant(s, s->all_fragments + i,
-                                                plane, 0, block);
-                            if (index > 63)
-                                continue;
+                            vp3_dequant(s, s->all_fragments + i,
+                                        plane, 0, block);
                             s->vp3dsp.idct_put(output_plane + first_pixel,
                                                stride,
                                                block);
                         } else {
-                            int index = vp3_dequant(s, s->all_fragments + i,
-                                                    plane, 1, block);
-                            if (index > 63)
-                                continue;
-                            if (index > 0) {
+                            if (vp3_dequant(s, s->all_fragments + i,
+                                            plane, 1, block)) {
                                 s->vp3dsp.idct_add(output_plane + first_pixel,
                                                    stride,
                                                    block);
@@ -1657,22 +1665,24 @@ static av_cold int allocate_tables(AVCodecContext *avctx)
     Vp3DecodeContext *s = avctx->priv_data;
     int y_fragment_count, c_fragment_count;
 
+    free_tables(avctx);
+
     y_fragment_count = s->fragment_width[0] * s->fragment_height[0];
     c_fragment_count = s->fragment_width[1] * s->fragment_height[1];
 
-    s->superblock_coding = av_malloc(s->superblock_count);
-    s->all_fragments     = av_malloc(s->fragment_count * sizeof(Vp3Fragment));
+    s->superblock_coding = av_mallocz(s->superblock_count);
+    s->all_fragments     = av_mallocz_array(s->fragment_count, sizeof(Vp3Fragment));
 
-    s->coded_fragment_list[0] = av_malloc(s->fragment_count * sizeof(int));
+    s->coded_fragment_list[0] = av_mallocz_array(s->fragment_count, sizeof(int));
 
-    s->dct_tokens_base = av_malloc(64 * s->fragment_count *
-                                   sizeof(*s->dct_tokens_base));
-    s->motion_val[0] = av_malloc(y_fragment_count * sizeof(*s->motion_val[0]));
-    s->motion_val[1] = av_malloc(c_fragment_count * sizeof(*s->motion_val[1]));
+    s->dct_tokens_base = av_mallocz_array(s->fragment_count,
+                                          64 * sizeof(*s->dct_tokens_base));
+    s->motion_val[0] = av_mallocz_array(y_fragment_count, sizeof(*s->motion_val[0]));
+    s->motion_val[1] = av_mallocz_array(c_fragment_count, sizeof(*s->motion_val[1]));
 
     /* work out the block mapping tables */
-    s->superblock_fragments = av_malloc(s->superblock_count * 16 * sizeof(int));
-    s->macroblock_coding    = av_malloc(s->macroblock_count + 1);
+    s->superblock_fragments = av_mallocz_array(s->superblock_count, 16 * sizeof(int));
+    s->macroblock_coding    = av_mallocz(s->macroblock_count + 1);
 
     if (!s->superblock_coding    || !s->all_fragments          ||
         !s->dct_tokens_base      || !s->coded_fragment_list[0] ||
@@ -1725,7 +1735,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
     s->avctx  = avctx;
     s->width  = FFALIGN(avctx->coded_width, 16);
     s->height = FFALIGN(avctx->coded_height, 16);
-    if (avctx->pix_fmt == AV_PIX_FMT_NONE)
+    if (avctx->codec_id != AV_CODEC_ID_THEORA)
         avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
     ff_hpeldsp_init(&s->hdsp, avctx->flags | AV_CODEC_FLAG_BITEXACT);
@@ -1733,7 +1743,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
     ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
 
     for (i = 0; i < 64; i++) {
-#define TRANSPOSE(x) (x >> 3) | ((x & 7) << 3)
+#define TRANSPOSE(x) (((x) >> 3) | (((x) & 7) << 3))
         s->idct_permutation[i] = TRANSPOSE(i);
         s->idct_scantable[i]   = TRANSPOSE(ff_zigzag_direct[i]);
 #undef TRANSPOSE
@@ -1744,8 +1754,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 3; i++)
         s->qps[i] = -1;
 
-    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_x_shift,
-                                     &s->chroma_y_shift);
+    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_x_shift, &s->chroma_y_shift);
 
     s->y_superblock_width  = (s->width  + 31) / 32;
     s->y_superblock_height = (s->height + 31) / 32;
@@ -1921,6 +1930,7 @@ static int ref_frames(Vp3DecodeContext *dst, Vp3DecodeContext *src)
     return 0;
 }
 
+#if HAVE_THREADS
 static int vp3_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
 {
     Vp3DecodeContext *s = dst->priv_data, *s1 = src->priv_data;
@@ -1938,6 +1948,8 @@ static int vp3_update_thread_context(AVCodecContext *dst, const AVCodecContext *
     }
 
     if (s != s1) {
+        if (!s->current_frame.f)
+            return AVERROR(ENOMEM);
         // init tables if the first frame hasn't been decoded
         if (!s->current_frame.f->data[0]) {
             int y_fragment_count, c_fragment_count;
@@ -1978,6 +1990,7 @@ static int vp3_update_thread_context(AVCodecContext *dst, const AVCodecContext *
 
     return update_frames(dst);
 }
+#endif
 
 static int vp3_decode_frame(AVCodecContext *avctx,
                             void *data, int *got_frame,
@@ -1989,15 +2002,47 @@ static int vp3_decode_frame(AVCodecContext *avctx,
     GetBitContext gb;
     int i, ret;
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
 
+#if CONFIG_THEORA_DECODER
     if (s->theora && get_bits1(&gb)) {
+        int type = get_bits(&gb, 7);
+        skip_bits_long(&gb, 6*8); /* "theora" */
+
+        if (s->avctx->active_thread_type&FF_THREAD_FRAME) {
+            av_log(avctx, AV_LOG_ERROR, "midstream reconfiguration with multithreading is unsupported, try -threads 1\n");
+            return AVERROR_PATCHWELCOME;
+        }
+        if (type == 0) {
+            vp3_decode_end(avctx);
+            ret = theora_decode_header(avctx, &gb);
+
+            if (ret < 0) {
+                vp3_decode_end(avctx);
+            } else
+                ret = vp3_decode_init(avctx);
+            return ret;
+        } else if (type == 2) {
+            ret = theora_decode_tables(avctx, &gb);
+            if (ret < 0) {
+                vp3_decode_end(avctx);
+            } else
+                ret = vp3_decode_init(avctx);
+            return ret;
+        }
+
         av_log(avctx, AV_LOG_ERROR,
                "Header packet passed to frame decoder, skipping\n");
         return -1;
     }
+#endif
 
     s->keyframe = !get_bits1(&gb);
+    if (!s->all_fragments) {
+        av_log(avctx, AV_LOG_ERROR, "Data packet without prior valid headers\n");
+        return -1;
+    }
     if (!s->theora)
         skip_bits(&gb, 1);
     for (i = 0; i < 3; i++)
@@ -2032,10 +2077,9 @@ static int vp3_decode_frame(AVCodecContext *avctx,
 
     s->current_frame.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
                                                 : AV_PICTURE_TYPE_P;
-    if (ff_thread_get_buffer(avctx, &s->current_frame, AV_GET_BUFFER_FLAG_REF) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    s->current_frame.f->key_frame = s->keyframe;
+    if (ff_thread_get_buffer(avctx, &s->current_frame, AV_GET_BUFFER_FLAG_REF) < 0)
         goto error;
-    }
 
     if (!s->edge_emu_buffer)
         s->edge_emu_buffer = av_malloc(9 * FFABS(s->current_frame.f->linesize[0]));
@@ -2064,10 +2108,8 @@ static int vp3_decode_frame(AVCodecContext *avctx,
 
             s->golden_frame.f->pict_type = AV_PICTURE_TYPE_I;
             if (ff_thread_get_buffer(avctx, &s->golden_frame,
-                                     AV_GET_BUFFER_FLAG_REF) < 0) {
-                av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+                                     AV_GET_BUFFER_FLAG_REF) < 0)
                 goto error;
-            }
             ff_thread_release_buffer(avctx, &s->last_frame);
             if ((ret = ff_thread_ref_frame(&s->last_frame,
                                            &s->golden_frame)) < 0)
@@ -2181,6 +2223,7 @@ static int read_huffman_tree(AVCodecContext *avctx, GetBitContext *gb)
     return 0;
 }
 
+#if HAVE_THREADS
 static int vp3_init_thread_copy(AVCodecContext *avctx)
 {
     Vp3DecodeContext *s = avctx->priv_data;
@@ -2197,6 +2240,7 @@ static int vp3_init_thread_copy(AVCodecContext *avctx)
 
     return init_frames(s);
 }
+#endif
 
 #if CONFIG_THEORA_DECODER
 static const enum AVPixelFormat theora_pix_fmts[4] = {
@@ -2239,7 +2283,7 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
     if (av_image_check_size(visible_width, visible_height, 0, avctx) < 0 ||
         visible_width  + offset_x > s->width ||
         visible_height + offset_y > s->height) {
-        av_log(s, AV_LOG_ERROR,
+        av_log(avctx, AV_LOG_ERROR,
                "Invalid frame dimensions - w:%d h:%d x:%d y:%d (%dx%d).\n",
                visible_width, visible_height, offset_x, offset_y,
                s->width, s->height);
@@ -2276,14 +2320,17 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
     if (s->theora >= 0x030200) {
         skip_bits(gb, 5); /* keyframe frequency force */
         avctx->pix_fmt = theora_pix_fmts[get_bits(gb, 2)];
+        if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid pixel format\n");
+            return AVERROR_INVALIDDATA;
+        }
         skip_bits(gb, 3); /* reserved */
     }
 
     ret = ff_set_dimensions(avctx, s->width, s->height);
     if (ret < 0)
         return ret;
-    if (!(avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) &&
-        (visible_width != s->width || visible_height != s->height)) {
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP)) {
         avctx->width  = visible_width;
         avctx->height = visible_height;
         // translate offsets from theora axis ([0,0] lower left)
@@ -2293,9 +2340,12 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
 
         if ((s->offset_x & 0x1F) && !(avctx->flags & AV_CODEC_FLAG_UNALIGNED)) {
             s->offset_x &= ~0x1F;
-            av_log(avctx, AV_LOG_WARNING, "Reducing offset_x from %d to %d"
-                   "chroma samples to preserve alignment.\n",
-                   offset_x, s->offset_x);
+            if (!s->offset_x_warned) {
+                s->offset_x_warned = 1;
+                av_log(avctx, AV_LOG_WARNING, "Reducing offset_x from %d to %d"
+                    "chroma samples to preserve alignment.\n",
+                    offset_x, s->offset_x);
+            }
         }
     }
 
@@ -2426,9 +2476,12 @@ static av_cold int theora_decode_init(AVCodecContext *avctx)
     Vp3DecodeContext *s = avctx->priv_data;
     GetBitContext gb;
     int ptype;
-    uint8_t *header_start[3];
+    const uint8_t *header_start[3];
     int header_len[3];
     int i;
+    int ret;
+
+    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
     s->theora = 1;
 
@@ -2446,7 +2499,9 @@ static av_cold int theora_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 3; i++) {
         if (header_len[i] <= 0)
             continue;
-        init_get_bits(&gb, header_start[i], header_len[i] * 8);
+        ret = init_get_bits8(&gb, header_start[i], header_len[i]);
+        if (ret < 0)
+            return ret;
 
         ptype = get_bits(&gb, 8);
 
@@ -2460,7 +2515,8 @@ static av_cold int theora_decode_init(AVCodecContext *avctx)
 
         switch (ptype) {
         case 0x80:
-            theora_decode_header(avctx, &gb);
+            if (theora_decode_header(avctx, &gb) < 0)
+                return -1;
             break;
         case 0x81:
 // FIXME: is this needed? it breaks sometimes
diff --git a/libavcodec/vp3_parser.c b/libavcodec/vp3_parser.c
index e8fdcca488..7ee046c543 100644
--- a/libavcodec/vp3_parser.c
+++ b/libavcodec/vp3_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2008 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp3data.h b/libavcodec/vp3data.h
index da325c0345..da31f6d026 100644
--- a/libavcodec/vp3data.h
+++ b/libavcodec/vp3data.h
@@ -1,20 +1,20 @@
 /*
- * copyright (C) 2003 the ffmpeg project
+ * copyright (C) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 
 /* these coefficients dequantize intraframe Y plane coefficients
  * (note: same as JPEG) */
-static const int16_t vp31_intra_y_dequant[64] = {
+static const int8_t vp31_intra_y_dequant[64] = {
     16, 11, 10, 16,  24,  40,  51,  61,
     12, 12, 14, 19,  26,  58,  60,  55,
     14, 13, 16, 24,  40,  57,  69,  56,
@@ -39,7 +39,7 @@ static const int16_t vp31_intra_y_dequant[64] = {
 
 /* these coefficients dequantize intraframe C plane coefficients
  * (note: same as JPEG) */
-static const int16_t vp31_intra_c_dequant[64] = {
+static const int8_t vp31_intra_c_dequant[64] = {
     17, 18, 24, 47, 99, 99, 99, 99,
     18, 21, 26, 66, 99, 99, 99, 99,
     24, 26, 56, 99, 99, 99, 99, 99,
@@ -51,7 +51,7 @@ static const int16_t vp31_intra_c_dequant[64] = {
 };
 
 /* these coefficients dequantize interframe coefficients (all planes) */
-static const int16_t vp31_inter_dequant[64] = {
+static const int8_t vp31_inter_dequant[64] = {
     16, 16, 16, 20, 24, 28,  32,  40,
     16, 16, 20, 24, 28, 32,  40,  48,
     16, 20, 24, 28, 32, 40,  48,  64,
@@ -62,7 +62,7 @@ static const int16_t vp31_inter_dequant[64] = {
     40, 48, 64, 64, 64, 96, 128, 128
 };
 
-static const int16_t vp31_dc_scale_factor[64] = {
+static const uint8_t vp31_dc_scale_factor[64] = {
     220, 200, 190, 180, 170, 170, 160, 160,
     150, 150, 140, 140, 130, 130, 120, 120,
     110, 110, 100, 100,  90,  90,  90,  80,
@@ -176,7 +176,7 @@ static const uint8_t motion_vector_vlc_table[63][2] = {
     { 0xFC, 8 }, { 0xFD, 8 }, { 0xFE, 8 }, { 0xFF, 8 }
 };
 
-static const int motion_vector_table[63] = {
+static const int8_t motion_vector_table[63] = {
      0,   1, -1,
      2,  -2,
      3,  -3,
@@ -198,21 +198,21 @@ static const int8_t fixed_motion_vector_table[64] = {
 };
 
 /* only tokens 0..6 indicate eob runs */
-static const int eob_run_base[7] = {
+static const uint8_t eob_run_base[7] = {
     1, 2, 3, 4, 8, 16, 0
 };
-static const int eob_run_get_bits[7] = {
+static const uint8_t eob_run_get_bits[7] = {
     0, 0, 0, 2, 3, 4, 12
 };
 
-static const int zero_run_base[32] = {
+static const uint8_t zero_run_base[32] = {
     0,  0, 0, 0, 0, 0, 0,   /* 0..6 are never used */
     0,  0,                  /* 7..8 */
     0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9..22 */
     1,  2, 3, 4, 5,         /* 23..27 */
     6, 10, 1, 2             /* 28..31 */
 };
-static const int zero_run_get_bits[32] = {
+static const uint8_t zero_run_get_bits[32] = {
     0, 0, 0, 0, 0, 0, 0,    /* 0..6 are never used */
     3, 6,                   /* 7..8 */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9..22 */
@@ -220,7 +220,7 @@ static const int zero_run_get_bits[32] = {
     2, 3, 0, 1              /* 28..31 */
 };
 
-static const int coeff_get_bits[32] = {
+static const uint8_t coeff_get_bits[32] = {
     0, 0, 0, 0, 0, 0, 0,    /* 0..6 are never used */
     0, 0, 0, 0, 0, 0,       /* 7..12 use constant coeffs */
     1, 1, 1, 1,             /* 13..16 are constants but still need sign bit */
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index af687ec5aa..d8a3e0a354 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2004 the ffmpeg project
+ * Copyright (c) 2004 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,6 @@
  */
 
 #include "libavutil/attributes.h"
-#include "libavutil/intreadwrite.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
index 3099a7e13c..b95adae79e 100644
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp5.c b/libavcodec/vp5.c
index 81c725d96e..5bcf9b6217 100644
--- a/libavcodec/vp5.c
+++ b/libavcodec/vp5.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,8 +35,7 @@
 #include "vp5data.h"
 
 
-static int vp5_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
-                            int *golden_frame)
+static int vp5_parse_header(VP56Context *s, const uint8_t *buf, int buf_size)
 {
     VP56RangeCoder *c = &s->c;
     int rows, cols;
@@ -86,7 +85,7 @@ static void vp5_parse_vector_adjustment(VP56Context *s, VP56mv *vect)
 
     for (comp=0; comp<2; comp++) {
         int delta = 0;
-        if (vp56_rac_get_prob(c, model->vector_dct[comp])) {
+        if (vp56_rac_get_prob_branchy(c, model->vector_dct[comp])) {
             int sign = vp56_rac_get_prob(c, model->vector_sig[comp]);
             di  = vp56_rac_get_prob(c, model->vector_pdi[comp][0]);
             di |= vp56_rac_get_prob(c, model->vector_pdi[comp][1]) << 1;
@@ -109,19 +108,19 @@ static void vp5_parse_vector_models(VP56Context *s)
     int comp, node;
 
     for (comp=0; comp<2; comp++) {
-        if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][0]))
+        if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][0]))
             model->vector_dct[comp] = vp56_rac_gets_nn(c, 7);
-        if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][1]))
+        if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][1]))
             model->vector_sig[comp] = vp56_rac_gets_nn(c, 7);
-        if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][2]))
+        if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][2]))
             model->vector_pdi[comp][0] = vp56_rac_gets_nn(c, 7);
-        if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][3]))
+        if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][3]))
             model->vector_pdi[comp][1] = vp56_rac_gets_nn(c, 7);
     }
 
     for (comp=0; comp<2; comp++)
         for (node=0; node<7; node++)
-            if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][4 + node]))
+            if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][4 + node]))
                 model->vector_pdv[comp][node] = vp56_rac_gets_nn(c, 7);
 }
 
@@ -138,7 +137,7 @@ static int vp5_parse_coeff_models(VP56Context *s)
 
     for (pt=0; pt<2; pt++)
         for (node=0; node<11; node++)
-            if (vp56_rac_get_prob(c, vp5_dccv_pct[pt][node])) {
+            if (vp56_rac_get_prob_branchy(c, vp5_dccv_pct[pt][node])) {
                 def_prob[node] = vp56_rac_gets_nn(c, 7);
                 model->coeff_dccv[pt][node] = def_prob[node];
             } else if (s->frames[VP56_FRAME_CURRENT]->key_frame) {
@@ -149,7 +148,7 @@ static int vp5_parse_coeff_models(VP56Context *s)
         for (pt=0; pt<2; pt++)
             for (cg=0; cg<6; cg++)
                 for (node=0; node<11; node++)
-                    if (vp56_rac_get_prob(c, vp5_ract_pct[ct][pt][cg][node])) {
+                    if (vp56_rac_get_prob_branchy(c, vp5_ract_pct[ct][pt][cg][node])) {
                         def_prob[node] = vp56_rac_gets_nn(c, 7);
                         model->coeff_ract[pt][ct][cg][node] = def_prob[node];
                     } else if (s->frames[VP56_FRAME_CURRENT]->key_frame) {
@@ -194,9 +193,9 @@ static void vp5_parse_coeff(VP56Context *s)
 
         coeff_idx = 0;
         for (;;) {
-            if (vp56_rac_get_prob(c, model2[0])) {
-                if (vp56_rac_get_prob(c, model2[2])) {
-                    if (vp56_rac_get_prob(c, model2[3])) {
+            if (vp56_rac_get_prob_branchy(c, model2[0])) {
+                if (vp56_rac_get_prob_branchy(c, model2[2])) {
+                    if (vp56_rac_get_prob_branchy(c, model2[3])) {
                         s->coeff_ctx[ff_vp56_b6to4[b]][coeff_idx] = 4;
                         idx = vp56_rac_get_tree(c, ff_vp56_pc_tree, model1);
                         sign = vp56_rac_get(c);
@@ -204,7 +203,7 @@ static void vp5_parse_coeff(VP56Context *s)
                         for (i=ff_vp56_coeff_bit_length[idx]; i>=0; i--)
                             coeff += vp56_rac_get_prob(c, ff_vp56_coeff_parse_table[idx][i]) << i;
                     } else {
-                        if (vp56_rac_get_prob(c, model2[4])) {
+                        if (vp56_rac_get_prob_branchy(c, model2[4])) {
                             coeff = 3 + vp56_rac_get_prob(c, model1[5]);
                             s->coeff_ctx[ff_vp56_b6to4[b]][coeff_idx] = 3;
                         } else {
@@ -225,7 +224,7 @@ static void vp5_parse_coeff(VP56Context *s)
                     coeff *= s->dequant_ac;
                 s->block_coeff[b][permute[coeff_idx]] = coeff;
             } else {
-                if (ct && !vp56_rac_get_prob(c, model2[1]))
+                if (ct && !vp56_rac_get_prob_branchy(c, model2[1]))
                     break;
                 ct = 0;
                 s->coeff_ctx[ff_vp56_b6to4[b]][coeff_idx] = 0;
diff --git a/libavcodec/vp56.c b/libavcodec/vp56.c
index 2f1de5a5a0..631924828d 100644
--- a/libavcodec/vp56.c
+++ b/libavcodec/vp56.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -83,16 +83,16 @@ static void vp56_parse_mb_type_models(VP56Context *s)
     int i, ctx, type;
 
     for (ctx=0; ctx<3; ctx++) {
-        if (vp56_rac_get_prob(c, 174)) {
+        if (vp56_rac_get_prob_branchy(c, 174)) {
             int idx = vp56_rac_gets(c, 4);
             memcpy(model->mb_types_stats[ctx],
                    ff_vp56_pre_def_mb_type_stats[idx][ctx],
                    sizeof(model->mb_types_stats[ctx]));
         }
-        if (vp56_rac_get_prob(c, 254)) {
+        if (vp56_rac_get_prob_branchy(c, 254)) {
             for (type=0; type<10; type++) {
                 for(i=0; i<2; i++) {
-                    if (vp56_rac_get_prob(c, 205)) {
+                    if (vp56_rac_get_prob_branchy(c, 205)) {
                         int delta, sign = vp56_rac_get(c);
 
                         delta = vp56_rac_get_tree(c, ff_vp56_pmbtm_tree,
@@ -153,7 +153,7 @@ static VP56mb vp56_parse_mb_type(VP56Context *s,
     uint8_t *mb_type_model = s->modelp->mb_type[ctx][prev_type];
     VP56RangeCoder *c = &s->c;
 
-    if (vp56_rac_get_prob(c, mb_type_model[0]))
+    if (vp56_rac_get_prob_branchy(c, mb_type_model[0]))
         return prev_type;
     else
         return vp56_rac_get_tree(c, ff_vp56_pmbt_tree, mb_type_model);
@@ -340,11 +340,11 @@ static void vp56_mc(VP56Context *s, int b, int plane, uint8_t *src,
     if (x<0 || x+12>=s->plane_width[plane] ||
         y<0 || y+12>=s->plane_height[plane]) {
         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                            src + s->block_offset[b] + (dy-2)*stride + (dx-2),
-                            stride, stride,
-                            12, 12, x, y,
-                            s->plane_width[plane],
-                            s->plane_height[plane]);
+                                 src + s->block_offset[b] + (dy-2)*stride + (dx-2),
+                                 stride, stride,
+                                 12, 12, x, y,
+                                 s->plane_width[plane],
+                                 s->plane_height[plane]);
         src_block = s->edge_emu_buffer;
         src_offset = 2 + 2*stride;
     } else if (deblock_filtering) {
@@ -453,9 +453,9 @@ static void vp56_decode_mb(VP56Context *s, int row, int col, int is_alpha)
     }
 }
 
-static int vp56_size_changed(AVCodecContext *avctx)
+static int vp56_size_changed(VP56Context *s)
 {
-    VP56Context *s = avctx->priv_data;
+    AVCodecContext *avctx = s->avctx;
     int stride = s->frames[VP56_FRAME_CURRENT]->linesize[0];
     int i;
 
@@ -476,19 +476,26 @@ static int vp56_size_changed(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    s->above_blocks = av_realloc(s->above_blocks,
-                                 (4*s->mb_width+6) * sizeof(*s->above_blocks));
-    s->macroblocks = av_realloc(s->macroblocks,
-                                s->mb_width*s->mb_height*sizeof(*s->macroblocks));
+    av_reallocp_array(&s->above_blocks, 4*s->mb_width+6,
+                      sizeof(*s->above_blocks));
+    av_reallocp_array(&s->macroblocks, s->mb_width*s->mb_height,
+                      sizeof(*s->macroblocks));
     av_free(s->edge_emu_buffer_alloc);
     s->edge_emu_buffer_alloc = av_malloc(16*stride);
     s->edge_emu_buffer = s->edge_emu_buffer_alloc;
+    if (!s->above_blocks || !s->macroblocks || !s->edge_emu_buffer_alloc)
+        return AVERROR(ENOMEM);
     if (s->flip < 0)
         s->edge_emu_buffer += 15 * stride;
 
+    if (s->alpha_context)
+        return vp56_size_changed(s->alpha_context);
+
     return 0;
 }
 
+static int ff_vp56_decode_mbs(AVCodecContext *avctx, void *, int, int);
+
 int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                          AVPacket *avpkt)
 {
@@ -496,8 +503,9 @@ int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     VP56Context *s = avctx->priv_data;
     AVFrame *const p = s->frames[VP56_FRAME_CURRENT];
     int remaining_buf_size = avpkt->size;
-    int is_alpha, av_uninit(alpha_offset);
-    int res;
+    int av_uninit(alpha_offset);
+    int i, res;
+    int ret;
 
     if (s->has_alpha) {
         if (remaining_buf_size < 3)
@@ -508,156 +516,184 @@ int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return AVERROR_INVALIDDATA;
     }
 
-    for (is_alpha=0; is_alpha < 1+s->has_alpha; is_alpha++) {
-        int mb_row, mb_col, mb_row_flip, mb_offset = 0;
-        int block, y, uv;
-        ptrdiff_t stride_y, stride_uv;
-        int golden_frame = 0;
+    res = s->parse_header(s, buf, remaining_buf_size);
+    if (res < 0)
+        return res;
 
-        s->modelp = &s->models[is_alpha];
+    if (res == VP56_SIZE_CHANGE) {
+        for (i = 0; i < 4; i++) {
+            av_frame_unref(s->frames[i]);
+            if (s->alpha_context)
+                av_frame_unref(s->alpha_context->frames[i]);
+        }
+    }
 
-        res = s->parse_header(s, buf, remaining_buf_size, &golden_frame);
-        if (res < 0) {
-            int i;
-            for (i = 0; i < 4; i++)
-                av_frame_unref(s->frames[i]);
-            return res;
+    ret = ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF);
+    if (ret < 0)
+        return ret;
+
+    if (avctx->pix_fmt == AV_PIX_FMT_YUVA420P) {
+        av_frame_unref(s->alpha_context->frames[VP56_FRAME_CURRENT]);
+        if ((ret = av_frame_ref(s->alpha_context->frames[VP56_FRAME_CURRENT], p)) < 0) {
+            av_frame_unref(p);
+            return ret;
         }
+    }
 
-        if (res == VP56_SIZE_CHANGE) {
-            int i;
-            for (i = 0; i < 4; i++)
-                av_frame_unref(s->frames[i]);
-            if (is_alpha) {
-                ff_set_dimensions(avctx, 0, 0);
-                return AVERROR_INVALIDDATA;
-            }
+    if (res == VP56_SIZE_CHANGE) {
+        if (vp56_size_changed(s)) {
+            av_frame_unref(p);
+            return AVERROR_INVALIDDATA;
         }
+    }
 
-        if (!is_alpha) {
-            int ret = ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF);
-            if (ret < 0) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-                return ret;
+    if (avctx->pix_fmt == AV_PIX_FMT_YUVA420P) {
+        int bak_w = avctx->width;
+        int bak_h = avctx->height;
+        int bak_cw = avctx->coded_width;
+        int bak_ch = avctx->coded_height;
+        buf += alpha_offset;
+        remaining_buf_size -= alpha_offset;
+
+        res = s->alpha_context->parse_header(s->alpha_context, buf, remaining_buf_size);
+        if (res != 0) {
+            if(res==VP56_SIZE_CHANGE) {
+                av_log(avctx, AV_LOG_ERROR, "Alpha reconfiguration\n");
+                avctx->width  = bak_w;
+                avctx->height = bak_h;
+                avctx->coded_width  = bak_cw;
+                avctx->coded_height = bak_ch;
             }
-
-            if (res == VP56_SIZE_CHANGE)
-                if (vp56_size_changed(avctx)) {
-                    av_frame_unref(p);
-                    return AVERROR_INVALIDDATA;
-                }
+            av_frame_unref(p);
+            return AVERROR_INVALIDDATA;
         }
+    }
 
-        if (p->key_frame) {
-            p->pict_type = AV_PICTURE_TYPE_I;
-            s->default_models_init(s);
-            for (block=0; block<s->mb_height*s->mb_width; block++)
-                s->macroblocks[block].type = VP56_MB_INTRA;
-        } else {
-            p->pict_type = AV_PICTURE_TYPE_P;
-            vp56_parse_mb_type_models(s);
-            s->parse_vector_models(s);
-            s->mb_type = VP56_MB_INTER_NOVEC_PF;
-        }
+    avctx->execute2(avctx, ff_vp56_decode_mbs, 0, 0, (avctx->pix_fmt == AV_PIX_FMT_YUVA420P) + 1);
 
-        if (s->parse_coeff_models(s))
-            goto next;
+    if ((res = av_frame_ref(data, p)) < 0)
+        return res;
+    *got_frame = 1;
 
-        memset(s->prev_dc, 0, sizeof(s->prev_dc));
-        s->prev_dc[1][VP56_FRAME_CURRENT] = 128;
-        s->prev_dc[2][VP56_FRAME_CURRENT] = 128;
+    return avpkt->size;
+}
 
-        for (block=0; block < 4*s->mb_width+6; block++) {
-            s->above_blocks[block].ref_frame = VP56_FRAME_NONE;
-            s->above_blocks[block].dc_coeff = 0;
-            s->above_blocks[block].not_null_dc = 0;
-        }
-        s->above_blocks[2*s->mb_width + 2].ref_frame = VP56_FRAME_CURRENT;
-        s->above_blocks[3*s->mb_width + 4].ref_frame = VP56_FRAME_CURRENT;
+static int ff_vp56_decode_mbs(AVCodecContext *avctx, void *data,
+                              int jobnr, int threadnr)
+{
+    VP56Context *s0 = avctx->priv_data;
+    int is_alpha = (jobnr == 1);
+    VP56Context *s = is_alpha ? s0->alpha_context : s0;
+    AVFrame *const p = s->frames[VP56_FRAME_CURRENT];
+    int mb_row, mb_col, mb_row_flip, mb_offset = 0;
+    int block, y, uv;
+    ptrdiff_t stride_y, stride_uv;
+    int res;
+
+    if (p->key_frame) {
+        p->pict_type = AV_PICTURE_TYPE_I;
+        s->default_models_init(s);
+        for (block=0; block<s->mb_height*s->mb_width; block++)
+            s->macroblocks[block].type = VP56_MB_INTRA;
+    } else {
+        p->pict_type = AV_PICTURE_TYPE_P;
+        vp56_parse_mb_type_models(s);
+        s->parse_vector_models(s);
+        s->mb_type = VP56_MB_INTER_NOVEC_PF;
+    }
+
+    if (s->parse_coeff_models(s))
+        goto next;
 
-        stride_y  = p->linesize[0];
-        stride_uv = p->linesize[1];
+    memset(s->prev_dc, 0, sizeof(s->prev_dc));
+    s->prev_dc[1][VP56_FRAME_CURRENT] = 128;
+    s->prev_dc[2][VP56_FRAME_CURRENT] = 128;
 
+    for (block=0; block < 4*s->mb_width+6; block++) {
+        s->above_blocks[block].ref_frame = VP56_FRAME_NONE;
+        s->above_blocks[block].dc_coeff = 0;
+        s->above_blocks[block].not_null_dc = 0;
+    }
+    s->above_blocks[2*s->mb_width + 2].ref_frame = VP56_FRAME_CURRENT;
+    s->above_blocks[3*s->mb_width + 4].ref_frame = VP56_FRAME_CURRENT;
+
+    stride_y  = p->linesize[0];
+    stride_uv = p->linesize[1];
+
+    if (s->flip < 0)
+        mb_offset = 7;
+
+    /* main macroblocks loop */
+    for (mb_row=0; mb_row<s->mb_height; mb_row++) {
         if (s->flip < 0)
-            mb_offset = 7;
-
-        /* main macroblocks loop */
-        for (mb_row=0; mb_row<s->mb_height; mb_row++) {
-            if (s->flip < 0)
-                mb_row_flip = s->mb_height - mb_row - 1;
-            else
-                mb_row_flip = mb_row;
-
-            for (block=0; block<4; block++) {
-                s->left_block[block].ref_frame = VP56_FRAME_NONE;
-                s->left_block[block].dc_coeff = 0;
-                s->left_block[block].not_null_dc = 0;
-            }
-            memset(s->coeff_ctx, 0, sizeof(s->coeff_ctx));
-            memset(s->coeff_ctx_last, 24, sizeof(s->coeff_ctx_last));
-
-            s->above_block_idx[0] = 1;
-            s->above_block_idx[1] = 2;
-            s->above_block_idx[2] = 1;
-            s->above_block_idx[3] = 2;
-            s->above_block_idx[4] = 2*s->mb_width + 2 + 1;
-            s->above_block_idx[5] = 3*s->mb_width + 4 + 1;
-
-            s->block_offset[s->frbi] = (mb_row_flip*16 + mb_offset) * stride_y;
-            s->block_offset[s->srbi] = s->block_offset[s->frbi] + 8*stride_y;
-            s->block_offset[1] = s->block_offset[0] + 8;
-            s->block_offset[3] = s->block_offset[2] + 8;
-            s->block_offset[4] = (mb_row_flip*8 + mb_offset) * stride_uv;
-            s->block_offset[5] = s->block_offset[4];
-
-            for (mb_col=0; mb_col<s->mb_width; mb_col++) {
-                vp56_decode_mb(s, mb_row, mb_col, is_alpha);
-
-                for (y=0; y<4; y++) {
-                    s->above_block_idx[y] += 2;
-                    s->block_offset[y] += 16;
-                }
+            mb_row_flip = s->mb_height - mb_row - 1;
+        else
+            mb_row_flip = mb_row;
 
-                for (uv=4; uv<6; uv++) {
-                    s->above_block_idx[uv] += 1;
-                    s->block_offset[uv] += 8;
-                }
-            }
+        for (block=0; block<4; block++) {
+            s->left_block[block].ref_frame = VP56_FRAME_NONE;
+            s->left_block[block].dc_coeff = 0;
+            s->left_block[block].not_null_dc = 0;
         }
+        memset(s->coeff_ctx, 0, sizeof(s->coeff_ctx));
+        memset(s->coeff_ctx_last, 24, sizeof(s->coeff_ctx_last));
+
+        s->above_block_idx[0] = 1;
+        s->above_block_idx[1] = 2;
+        s->above_block_idx[2] = 1;
+        s->above_block_idx[3] = 2;
+        s->above_block_idx[4] = 2*s->mb_width + 2 + 1;
+        s->above_block_idx[5] = 3*s->mb_width + 4 + 1;
+
+        s->block_offset[s->frbi] = (mb_row_flip*16 + mb_offset) * stride_y;
+        s->block_offset[s->srbi] = s->block_offset[s->frbi] + 8*stride_y;
+        s->block_offset[1] = s->block_offset[0] + 8;
+        s->block_offset[3] = s->block_offset[2] + 8;
+        s->block_offset[4] = (mb_row_flip*8 + mb_offset) * stride_uv;
+        s->block_offset[5] = s->block_offset[4];
+
+        for (mb_col=0; mb_col<s->mb_width; mb_col++) {
+            vp56_decode_mb(s, mb_row, mb_col, is_alpha);
+
+            for (y=0; y<4; y++) {
+                s->above_block_idx[y] += 2;
+                s->block_offset[y] += 16;
+            }
 
-    next:
-        if (p->key_frame || golden_frame) {
-            av_frame_unref(s->frames[VP56_FRAME_GOLDEN]);
-            if ((res = av_frame_ref(s->frames[VP56_FRAME_GOLDEN], p)) < 0)
-                return res;
+            for (uv=4; uv<6; uv++) {
+                s->above_block_idx[uv] += 1;
+                s->block_offset[uv] += 8;
+            }
         }
+    }
 
-        if (s->has_alpha) {
-            FFSWAP(AVFrame *, s->frames[VP56_FRAME_GOLDEN],
-                              s->frames[VP56_FRAME_GOLDEN2]);
-            buf += alpha_offset;
-            remaining_buf_size -= alpha_offset;
-        }
+next:
+    if (p->key_frame || s->golden_frame) {
+        av_frame_unref(s->frames[VP56_FRAME_GOLDEN]);
+        if ((res = av_frame_ref(s->frames[VP56_FRAME_GOLDEN], p)) < 0)
+            return res;
     }
 
     av_frame_unref(s->frames[VP56_FRAME_PREVIOUS]);
     FFSWAP(AVFrame *, s->frames[VP56_FRAME_CURRENT],
                       s->frames[VP56_FRAME_PREVIOUS]);
-
-    if ((res = av_frame_ref(data, p)) < 0)
-        return res;
-    *got_frame = 1;
-
-    return avpkt->size;
+    return 0;
 }
 
 av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
 {
     VP56Context *s = avctx->priv_data;
+    return ff_vp56_init_context(avctx, s, flip, has_alpha);
+}
+
+av_cold int ff_vp56_init_context(AVCodecContext *avctx, VP56Context *s,
+                                  int flip, int has_alpha)
+{
     int i;
 
     s->avctx = avctx;
     avctx->pix_fmt = has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P;
+    if (avctx->skip_alpha) avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
     ff_h264chroma_init(&s->h264chroma, 8);
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
@@ -665,7 +701,7 @@ av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
     ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
     ff_vp56dsp_init(&s->vp56dsp, avctx->codec->id);
     for (i = 0; i < 64; i++) {
-#define TRANSPOSE(x) (x >> 3) | ((x & 7) << 3)
+#define TRANSPOSE(x) (((x) >> 3) | (((x) & 7) << 3))
         s->idct_scantable[i] = TRANSPOSE(ff_zigzag_direct[i]);
 #undef TRANSPOSE
     }
@@ -683,10 +719,14 @@ av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
     s->macroblocks = NULL;
     s->quantizer = -1;
     s->deblock_filtering = 1;
+    s->golden_frame = 0;
 
     s->filter = NULL;
 
     s->has_alpha = has_alpha;
+
+    s->modelp = &s->model;
+
     if (flip) {
         s->flip = -1;
         s->frbi = 2;
@@ -703,6 +743,11 @@ av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
 av_cold int ff_vp56_free(AVCodecContext *avctx)
 {
     VP56Context *s = avctx->priv_data;
+    return ff_vp56_free_context(s);
+}
+
+av_cold int ff_vp56_free_context(VP56Context *s)
+{
     int i;
 
     av_freep(&s->above_blocks);
diff --git a/libavcodec/vp56.h b/libavcodec/vp56.h
index f2ed770b85..56c30919b7 100644
--- a/libavcodec/vp56.h
+++ b/libavcodec/vp56.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -79,7 +79,7 @@ typedef void (*VP56DefaultModelsInit)(VP56Context *s);
 typedef void (*VP56ParseVectorModels)(VP56Context *s);
 typedef int  (*VP56ParseCoeffModels)(VP56Context *s);
 typedef int  (*VP56ParseHeader)(VP56Context *s, const uint8_t *buf,
-                                int buf_size, int *golden_frame);
+                                int buf_size);
 
 typedef struct VP56RangeCoder {
     int high;
@@ -135,6 +135,7 @@ struct vp56_context {
     int sub_version;
 
     /* frame info */
+    int golden_frame;
     int plane_width[4];
     int plane_height[4];
     int mb_width;   /* number of horizontal MB */
@@ -189,8 +190,11 @@ struct vp56_context {
     VP56ParseCoeffModels parse_coeff_models;
     VP56ParseHeader parse_header;
 
+    /* for "slice" parallelism between YUV and A */
+    VP56Context *alpha_context;
+
     VP56Model *modelp;
-    VP56Model models[2];
+    VP56Model model;
 
     /* huffman decoding */
     int use_huffman;
@@ -203,7 +207,10 @@ struct vp56_context {
 
 
 int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha);
+int ff_vp56_init_context(AVCodecContext *avctx, VP56Context *s,
+                          int flip, int has_alpha);
 int ff_vp56_free(AVCodecContext *avctx);
+int ff_vp56_free_context(VP56Context *s);
 void ff_vp56_init_dequant(VP56Context *s, int quantizer);
 int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                          AVPacket *avpkt);
@@ -356,7 +363,7 @@ int vp56_rac_get_tree(VP56RangeCoder *c,
                       const uint8_t *probs)
 {
     while (tree->val > 0) {
-        if (vp56_rac_get_prob(c, probs[tree->prob_idx]))
+        if (vp56_rac_get_prob_branchy(c, probs[tree->prob_idx]))
             tree += tree->val;
         else
             tree++;
@@ -364,15 +371,13 @@ int vp56_rac_get_tree(VP56RangeCoder *c,
     return -tree->val;
 }
 
-/**
- * This is identical to vp8_rac_get_tree except for the possibility of starting
- * on a node other than the root node, needed for coeff decode where this is
- * used to save a bit after a 0 token (by disallowing EOB to immediately follow.)
- */
-static av_always_inline
-int vp8_rac_get_tree_with_offset(VP56RangeCoder *c, const int8_t (*tree)[2],
-                                 const uint8_t *probs, int i)
+// how probabilities are associated with decisions is different I think
+// well, the new scheme fits in the old but this way has one fewer branches per decision
+static av_always_inline int vp8_rac_get_tree(VP56RangeCoder *c, const int8_t (*tree)[2],
+                                   const uint8_t *probs)
 {
+    int i = 0;
+
     do {
         i = tree[i][vp56_rac_get_prob(c, probs[i])];
     } while (i > 0);
@@ -380,15 +385,6 @@ int vp8_rac_get_tree_with_offset(VP56RangeCoder *c, const int8_t (*tree)[2],
     return -i;
 }
 
-// how probabilities are associated with decisions is different I think
-// well, the new scheme fits in the old but this way has one fewer branches per decision
-static av_always_inline
-int vp8_rac_get_tree(VP56RangeCoder *c, const int8_t (*tree)[2],
-                     const uint8_t *probs)
-{
-    return vp8_rac_get_tree_with_offset(c, tree, probs, 0);
-}
-
 // DCTextra
 static av_always_inline int vp8_rac_get_coeff(VP56RangeCoder *c, const uint8_t *prob)
 {
diff --git a/libavcodec/vp56data.c b/libavcodec/vp56data.c
index 989c76a649..0080370f02 100644
--- a/libavcodec/vp56data.c
+++ b/libavcodec/vp56data.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp56data.h b/libavcodec/vp56data.h
index 21907bdb50..3be268c317 100644
--- a/libavcodec/vp56data.h
+++ b/libavcodec/vp56data.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp56dsp.c b/libavcodec/vp56dsp.c
index 5e09d2414e..fa533ec5c9 100644
--- a/libavcodec/vp56dsp.c
+++ b/libavcodec/vp56dsp.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2006 Aurelien Jacobs <aurel@gnuage.org>
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp56dsp.h b/libavcodec/vp56dsp.h
index 389d35901c..7807baa4b1 100644
--- a/libavcodec/vp56dsp.h
+++ b/libavcodec/vp56dsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp56rac.c b/libavcodec/vp56rac.c
index 270a3cad3a..6061b7ee72 100644
--- a/libavcodec/vp56rac.c
+++ b/libavcodec/vp56rac.c
@@ -2,20 +2,20 @@
  * VP5/6/8 decoder
  * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp5data.h b/libavcodec/vp5data.h
index b11b99d9a9..e16ff2da4b 100644
--- a/libavcodec/vp5data.h
+++ b/libavcodec/vp5data.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp6.c b/libavcodec/vp6.c
index c48c2b8180..a2bb4578d5 100644
--- a/libavcodec/vp6.c
+++ b/libavcodec/vp6.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,8 +43,7 @@
 static void vp6_parse_coeff(VP56Context *s);
 static void vp6_parse_coeff_huffman(VP56Context *s);
 
-static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
-                            int *golden_frame)
+static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size)
 {
     VP56RangeCoder *c = &s->c;
     int parse_filter_info = 0;
@@ -113,6 +112,7 @@ static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
         if (sub_version < 8)
             vrt_shift = 5;
         s->sub_version = sub_version;
+        s->golden_frame = 0;
     } else {
         if (!s->sub_version || !s->avctx->coded_width || !s->avctx->coded_height)
             return AVERROR_INVALIDDATA;
@@ -124,7 +124,7 @@ static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
         }
         ff_vp56_init_range_decoder(c, buf+1, buf_size-1);
 
-        *golden_frame = vp56_rac_get(c);
+        s->golden_frame = vp56_rac_get(c);
         if (s->filter_header) {
             s->deblock_filtering = vp56_rac_get(c);
             if (s->deblock_filtering)
@@ -211,20 +211,20 @@ static void vp6_parse_vector_models(VP56Context *s)
     int comp, node;
 
     for (comp=0; comp<2; comp++) {
-        if (vp56_rac_get_prob(c, vp6_sig_dct_pct[comp][0]))
+        if (vp56_rac_get_prob_branchy(c, vp6_sig_dct_pct[comp][0]))
             model->vector_dct[comp] = vp56_rac_gets_nn(c, 7);
-        if (vp56_rac_get_prob(c, vp6_sig_dct_pct[comp][1]))
+        if (vp56_rac_get_prob_branchy(c, vp6_sig_dct_pct[comp][1]))
             model->vector_sig[comp] = vp56_rac_gets_nn(c, 7);
     }
 
     for (comp=0; comp<2; comp++)
         for (node=0; node<7; node++)
-            if (vp56_rac_get_prob(c, vp6_pdv_pct[comp][node]))
+            if (vp56_rac_get_prob_branchy(c, vp6_pdv_pct[comp][node]))
                 model->vector_pdv[comp][node] = vp56_rac_gets_nn(c, 7);
 
     for (comp=0; comp<2; comp++)
         for (node=0; node<8; node++)
-            if (vp56_rac_get_prob(c, vp6_fdv_pct[comp][node]))
+            if (vp56_rac_get_prob_branchy(c, vp6_fdv_pct[comp][node]))
                 model->vector_fdv[comp][node] = vp56_rac_gets_nn(c, 7);
 }
 
@@ -270,7 +270,7 @@ static int vp6_parse_coeff_models(VP56Context *s)
 
     for (pt=0; pt<2; pt++)
         for (node=0; node<11; node++)
-            if (vp56_rac_get_prob(c, vp6_dccv_pct[pt][node])) {
+            if (vp56_rac_get_prob_branchy(c, vp6_dccv_pct[pt][node])) {
                 def_prob[node] = vp56_rac_gets_nn(c, 7);
                 model->coeff_dccv[pt][node] = def_prob[node];
             } else if (s->frames[VP56_FRAME_CURRENT]->key_frame) {
@@ -279,21 +279,21 @@ static int vp6_parse_coeff_models(VP56Context *s)
 
     if (vp56_rac_get(c)) {
         for (pos=1; pos<64; pos++)
-            if (vp56_rac_get_prob(c, vp6_coeff_reorder_pct[pos]))
+            if (vp56_rac_get_prob_branchy(c, vp6_coeff_reorder_pct[pos]))
                 model->coeff_reorder[pos] = vp56_rac_gets(c, 4);
         vp6_coeff_order_table_init(s);
     }
 
     for (cg=0; cg<2; cg++)
         for (node=0; node<14; node++)
-            if (vp56_rac_get_prob(c, vp6_runv_pct[cg][node]))
+            if (vp56_rac_get_prob_branchy(c, vp6_runv_pct[cg][node]))
                 model->coeff_runv[cg][node] = vp56_rac_gets_nn(c, 7);
 
     for (ct=0; ct<3; ct++)
         for (pt=0; pt<2; pt++)
             for (cg=0; cg<6; cg++)
                 for (node=0; node<11; node++)
-                    if (vp56_rac_get_prob(c, vp6_ract_pct[ct][pt][cg][node])) {
+                    if (vp56_rac_get_prob_branchy(c, vp6_ract_pct[ct][pt][cg][node])) {
                         def_prob[node] = vp56_rac_gets_nn(c, 7);
                         model->coeff_ract[pt][ct][cg][node] = def_prob[node];
                     } else if (s->frames[VP56_FRAME_CURRENT]->key_frame) {
@@ -339,7 +339,7 @@ static void vp6_parse_vector_adjustment(VP56Context *s, VP56mv *vect)
     for (comp=0; comp<2; comp++) {
         int i, delta = 0;
 
-        if (vp56_rac_get_prob(c, model->vector_dct[comp])) {
+        if (vp56_rac_get_prob_branchy(c, model->vector_dct[comp])) {
             static const uint8_t prob_order[] = {0, 1, 2, 7, 6, 5, 4};
             for (i=0; i<sizeof(prob_order); i++) {
                 int j = prob_order[i];
@@ -354,7 +354,7 @@ static void vp6_parse_vector_adjustment(VP56Context *s, VP56mv *vect)
                                       model->vector_pdv[comp]);
         }
 
-        if (delta && vp56_rac_get_prob(c, model->vector_sig[comp]))
+        if (delta && vp56_rac_get_prob_branchy(c, model->vector_sig[comp]))
             delta = -delta;
 
         if (!comp)
@@ -462,16 +462,16 @@ static void vp6_parse_coeff(VP56Context *s)
 
         coeff_idx = 0;
         for (;;) {
-            if ((coeff_idx>1 && ct==0) || vp56_rac_get_prob(c, model2[0])) {
+            if ((coeff_idx>1 && ct==0) || vp56_rac_get_prob_branchy(c, model2[0])) {
                 /* parse a coeff */
-                if (vp56_rac_get_prob(c, model2[2])) {
-                    if (vp56_rac_get_prob(c, model2[3])) {
+                if (vp56_rac_get_prob_branchy(c, model2[2])) {
+                    if (vp56_rac_get_prob_branchy(c, model2[3])) {
                         idx = vp56_rac_get_tree(c, ff_vp56_pc_tree, model1);
                         coeff = ff_vp56_coeff_bias[idx+5];
                         for (i=ff_vp56_coeff_bit_length[idx]; i>=0; i--)
                             coeff += vp56_rac_get_prob(c, ff_vp56_coeff_parse_table[idx][i]) << i;
                     } else {
-                        if (vp56_rac_get_prob(c, model2[4]))
+                        if (vp56_rac_get_prob_branchy(c, model2[4]))
                             coeff = 3 + vp56_rac_get_prob(c, model1[5]);
                         else
                             coeff = 2;
@@ -492,7 +492,7 @@ static void vp6_parse_coeff(VP56Context *s)
                 /* parse a run */
                 ct = 0;
                 if (coeff_idx > 0) {
-                    if (!vp56_rac_get_prob(c, model2[1]))
+                    if (!vp56_rac_get_prob_branchy(c, model2[1]))
                         break;
 
                     model3 = model->coeff_runv[coeff_idx >= 6];
@@ -604,6 +604,8 @@ static void vp6_filter(VP56Context *s, uint8_t *dst, uint8_t *src,
     }
 }
 
+static av_cold void vp6_decode_init_context(VP56Context *s);
+
 static av_cold int vp6_decode_init(AVCodecContext *avctx)
 {
     VP56Context *s = avctx->priv_data;
@@ -613,6 +615,21 @@ static av_cold int vp6_decode_init(AVCodecContext *avctx)
                             avctx->codec->id == AV_CODEC_ID_VP6A)) < 0)
         return ret;
 
+    vp6_decode_init_context(s);
+
+    if (s->has_alpha) {
+        s->alpha_context = av_mallocz(sizeof(VP56Context));
+        ff_vp56_init_context(avctx, s->alpha_context,
+                             s->flip == -1, s->has_alpha);
+        vp6_decode_init_context(s->alpha_context);
+    }
+
+    return 0;
+}
+
+static av_cold void vp6_decode_init_context(VP56Context *s)
+{
+    s->deblock_filtering = 0;
     s->vp56_coord_div = vp6_coord_div;
     s->parse_vector_adjustment = vp6_parse_vector_adjustment;
     s->filter = vp6_filter;
@@ -620,16 +637,29 @@ static av_cold int vp6_decode_init(AVCodecContext *avctx)
     s->parse_vector_models = vp6_parse_vector_models;
     s->parse_coeff_models = vp6_parse_coeff_models;
     s->parse_header = vp6_parse_header;
-
-    return 0;
 }
 
+static av_cold void vp6_decode_free_context(VP56Context *s);
+
 static av_cold int vp6_decode_free(AVCodecContext *avctx)
 {
     VP56Context *s = avctx->priv_data;
-    int pt, ct, cg;
 
     ff_vp56_free(avctx);
+    vp6_decode_free_context(s);
+
+    if (s->alpha_context) {
+        ff_vp56_free_context(s->alpha_context);
+        vp6_decode_free_context(s->alpha_context);
+        av_freep(&s->alpha_context);
+    }
+
+    return 0;
+}
+
+static av_cold void vp6_decode_free_context(VP56Context *s)
+{
+    int pt, ct, cg;
 
     for (pt=0; pt<2; pt++) {
         ff_free_vlc(&s->dccv_vlc[pt]);
@@ -638,7 +668,6 @@ static av_cold int vp6_decode_free(AVCodecContext *avctx)
             for (cg=0; cg<6; cg++)
                 ff_free_vlc(&s->ract_vlc[pt][ct][cg]);
     }
-    return 0;
 }
 
 AVCodec ff_vp6_decoder = {
@@ -676,5 +705,5 @@ AVCodec ff_vp6a_decoder = {
     .init           = vp6_decode_init,
     .close          = vp6_decode_free,
     .decode         = ff_vp56_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS,
 };
diff --git a/libavcodec/vp6data.h b/libavcodec/vp6data.h
index 2de90e7be7..539e19a627 100644
--- a/libavcodec/vp6data.h
+++ b/libavcodec/vp6data.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp6dsp.c b/libavcodec/vp6dsp.c
index 54a96ed13a..67c6be07de 100644
--- a/libavcodec/vp6dsp.c
+++ b/libavcodec/vp6dsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 55ebae69fd..64037fc089 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -7,20 +7,20 @@
  * Copyright (C) 2012 Daniel Kang
  * Copyright (C) 2014 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,6 +37,14 @@
 #   include "arm/vp8.h"
 #endif
 
+#if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
+#define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
+#elif CONFIG_VP7_DECODER
+#define VPX(vp7, f) vp7_ ## f
+#else // CONFIG_VP8_DECODER
+#define VPX(vp7, f) vp8_ ## f
+#endif
+
 static void free_buffers(VP8Context *s)
 {
     int i;
@@ -143,7 +151,7 @@ int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
     AVCodecContext *avctx = s->avctx;
     int i, ret;
 
-    if (width  != s->avctx->width ||
+    if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
         height != s->avctx->height) {
         vp8_decode_flush_impl(s->avctx, 1);
 
@@ -156,7 +164,7 @@ int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
     s->mb_height = (s->avctx->coded_height + 15) / 16;
 
     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
-                   FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1;
+                   avctx->thread_count > 1;
     if (!s->mb_layout) { // Frame threading and one thread
         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
                                                sizeof(*s->macroblocks));
@@ -202,6 +210,7 @@ static int vp8_update_dimensions(VP8Context *s, int width, int height)
     return update_dimensions(s, width, height, IS_VP8);
 }
 
+
 static void parse_segment_info(VP8Context *s)
 {
     VP56RangeCoder *c = &s->c;
@@ -292,7 +301,7 @@ static void vp7_get_quants(VP8Context *s)
     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 }
 
-static void get_quants(VP8Context *s)
+static void vp8_get_quants(VP8Context *s)
 {
     VP56RangeCoder *c = &s->c;
     int i, base_qi;
@@ -413,7 +422,7 @@ static void update_refs(VP8Context *s)
     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 }
 
-static void copy_luma(AVFrame *dst, AVFrame *src, int width, int height)
+static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 {
     int i, j;
 
@@ -424,16 +433,16 @@ static void copy_luma(AVFrame *dst, AVFrame *src, int width, int height)
     }
 }
 
-static void fade(uint8_t *dst, uint8_t *src,
-                 int width, int height, int linesize,
+static void fade(uint8_t *dst, int dst_linesize,
+                 const uint8_t *src, int src_linesize,
+                 int width, int height,
                  int alpha, int beta)
 {
     int i, j;
-
     for (j = 0; j < height; j++) {
         for (i = 0; i < width; i++) {
-            uint8_t y = src[j * linesize + i];
-            dst[j * linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
+            uint8_t y = src[j * src_linesize + i];
+            dst[j * dst_linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
         }
     }
 }
@@ -449,8 +458,11 @@ static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
         int height = s->mb_height * 16;
         AVFrame *src, *dst;
 
-        if (!s->framep[VP56_FRAME_PREVIOUS])
+        if (!s->framep[VP56_FRAME_PREVIOUS] ||
+            !s->framep[VP56_FRAME_GOLDEN]) {
+            av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
             return AVERROR_INVALIDDATA;
+        }
 
         dst =
         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
@@ -459,15 +471,16 @@ static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
-               return ret;
+                return ret;
 
             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 
-            copy_luma(dst, src, width, height);
+            copy_chroma(dst, src, width, height);
         }
 
-        fade(dst->data[0], src->data[0],
-             width, height, dst->linesize[0], alpha, beta);
+        fade(dst->data[0], dst->linesize[0],
+             src->data[0], src->linesize[0],
+             width, height, alpha, beta);
     }
 
     return 0;
@@ -494,13 +507,14 @@ static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     s->invisible = 0;
     part1_size   = AV_RL24(buf) >> 4;
 
-    buf      += 4 - s->profile;
-    buf_size -= 4 - s->profile;
-
-    if (buf_size < part1_size) {
+    if (buf_size < 4 - s->profile + part1_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
         return AVERROR_INVALIDDATA;
     }
 
+    buf      += 4 - s->profile;
+    buf_size -= 4 - s->profile;
+
     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 
     ff_vp56_init_range_decoder(c, buf, part1_size);
@@ -546,7 +560,7 @@ static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
              if (vp7_feature_value_size[s->profile][i])
                  for (j = 0; j < 4; j++)
                      s->feature_value[i][j] =
-                         vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
+                        vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
         }
     }
 
@@ -629,6 +643,11 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     int width  = s->avctx->width;
     int height = s->avctx->height;
 
+    if (buf_size < 3) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
+        return AVERROR_INVALIDDATA;
+    }
+
     s->keyframe  = !(buf[0] & 1);
     s->profile   =  (buf[0]>>1) & 7;
     s->invisible = !(buf[0] & 0x10);
@@ -709,11 +728,12 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     }
 
     if (!s->macroblocks_base || /* first frame */
-        width != s->avctx->width || height != s->avctx->height)
+        width != s->avctx->width || height != s->avctx->height ||
+        (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
             return ret;
 
-    get_quants(s);
+    vp8_get_quants(s);
 
     if (!s->keyframe) {
         update_refs(s);
@@ -746,14 +766,16 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
 static av_always_inline
 void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 {
-    dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
-    dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
+    dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
+                             av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
+    dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
+                             av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 }
 
 /**
  * Motion vector coding, 17.1.
  */
-static int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
+static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 {
     int bit, x = 0;
 
@@ -781,6 +803,16 @@ static int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 }
 
+static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
+{
+    return read_mv_component(c, p, 1);
+}
+
+static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
+{
+    return read_mv_component(c, p, 0);
+}
+
 static av_always_inline
 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 {
@@ -971,8 +1003,8 @@ void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                     mb->mode = VP8_MVMODE_SPLIT;
                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
                 } else {
-                    mb->mv.y += read_mv_component(c, s->prob->mvc[0], IS_VP7);
-                    mb->mv.x += read_mv_component(c, s->prob->mvc[1], IS_VP7);
+                    mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
+                    mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
                     mb->bmv[0] = mb->mv;
                 }
             } else {
@@ -1071,8 +1103,8 @@ void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                     mb->mode = VP8_MVMODE_SPLIT;
                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                 } else {
-                    mb->mv.y  += read_mv_component(c, s->prob->mvc[0], IS_VP8);
-                    mb->mv.x  += read_mv_component(c, s->prob->mvc[1], IS_VP8);
+                    mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
+                    mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
                     mb->bmv[0] = mb->mv;
                 }
             } else {
@@ -1096,7 +1128,7 @@ void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 {
     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
 
-    if (layout == 1) {
+    if (layout) {
         VP8Macroblock *mb_top = mb - s->mb_width - 1;
         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
     }
@@ -1104,7 +1136,7 @@ void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
         int x, y;
         uint8_t *top;
         uint8_t *const left = s->intra4x4_pred_mode_left;
-        if (layout == 1)
+        if (layout)
             top = mb->intra4x4_pred_mode_top;
         else
             top = s->intra4x4_pred_mode_top + 4 * mb_x;
@@ -1139,7 +1171,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
         *segment = 0;
         for (i = 0; i < 4; i++) {
             if (s->feature_enabled[i]) {
-                if (vp56_rac_get_prob(c, s->feature_present_prob[i])) {
+                if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
                                                    s->feature_index_prob[i]);
                       av_log(s->avctx, AV_LOG_WARNING,
@@ -1148,9 +1180,10 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
                 }
            }
         }
-    } else if (s->segmentation.update_map)
-        *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
-    else if (s->segmentation.enabled)
+    } else if (s->segmentation.update_map) {
+        int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
+        *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
+    } else if (s->segmentation.enabled)
         *segment = ref ? *ref : *segment;
     mb->segment = *segment;
 
@@ -1165,7 +1198,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
         } else {
             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
-            if (s->mb_layout == 1)
+            if (s->mb_layout)
                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
             else
                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
@@ -1329,6 +1362,7 @@ static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
  * @param zero_nhood the initial prediction context for number of surrounding
  *                   all-zero blocks (only left/top, so 0-2)
  * @param qmul       array holding the dc/ac dequant factor at position 0/1
+ * @param scan       scan pattern (VP7 only)
  *
  * @return 0 if no coeffs were decoded
  *         otherwise, the index of the last coeff decoded plus one
@@ -1592,7 +1626,7 @@ void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
             for (x = 0; x < 4; x++) {
                 int copy = 0, linesize = s->linesize;
                 uint8_t *dst = ptr + 4 * x;
-                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5 * 8];
+                LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
 
                 if ((y == 0 || x == 3) && mb_y == 0) {
                     topright = tr_top;
@@ -1698,8 +1732,8 @@ void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
     if (AV_RN32A(mv)) {
         int src_linesize = linesize;
 
-        int mx = (mv->x << 1) & 7, mx_idx = subpel_idx[0][mx];
-        int my = (mv->y << 1) & 7, my_idx = subpel_idx[0][my];
+        int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
+        int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
 
         x_off += mv->x >> 2;
         y_off += mv->y >> 2;
@@ -1769,7 +1803,8 @@ void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                      src1 - my_idx * linesize - mx_idx,
                                      EDGE_EMU_LINESIZE, linesize,
-                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
+                                     block_w + subpel_idx[1][mx],
+                                     block_h + subpel_idx[1][my],
                                      x_off - mx_idx, y_off - my_idx, width, height);
             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
@@ -1777,7 +1812,8 @@ void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                      src2 - my_idx * linesize - mx_idx,
                                      EDGE_EMU_LINESIZE, linesize,
-                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
+                                     block_w + subpel_idx[1][mx],
+                                     block_h + subpel_idx[1][my],
                                      x_off - mx_idx, y_off - my_idx, width, height);
             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
@@ -2213,7 +2249,7 @@ static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
             td->wait_mb_pos = INT_MAX;                                        \
             pthread_mutex_unlock(&otd->lock);                                 \
         }                                                                     \
-    } while (0);
+    } while (0)
 
 #define update_pos(td, mb_y, mb_x)                                            \
     do {                                                                      \
@@ -2232,13 +2268,13 @@ static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
             pthread_cond_broadcast(&td->cond);                                \
             pthread_mutex_unlock(&td->lock);                                  \
         }                                                                     \
-    } while (0);
+    } while (0)
 #else
-#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
-#define update_pos(td, mb_y, mb_x)
+#define check_thread_pos(td, otd, mb_x_check, mb_y_check) while(0)
+#define update_pos(td, mb_y, mb_x) while(0)
 #endif
 
-static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
+static av_always_inline void decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                         int jobnr, int threadnr, int is_vp7)
 {
     VP8Context *s = avctx->priv_data;
@@ -2359,7 +2395,19 @@ static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
     }
 }
 
-static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
+static void vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
+                                        int jobnr, int threadnr)
+{
+    decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
+}
+
+static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
+                                        int jobnr, int threadnr)
+{
+    decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
+}
+
+static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
                               int jobnr, int threadnr, int is_vp7)
 {
     VP8Context *s = avctx->priv_data;
@@ -2418,6 +2466,18 @@ static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
     }
 }
 
+static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
+                              int jobnr, int threadnr)
+{
+    filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
+}
+
+static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
+                              int jobnr, int threadnr)
+{
+    filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
+}
+
 static av_always_inline
 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
                               int threadnr, int is_vp7)
@@ -2433,9 +2493,9 @@ int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
         if (mb_y >= s->mb_height)
             break;
         td->thread_mb_pos = mb_y << 16;
-        vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, is_vp7);
+        s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
         if (s->deblock_filter)
-            vp8_filter_mb_row(avctx, tdata, jobnr, threadnr, is_vp7);
+            s->filter_mb_row(avctx, tdata, jobnr, threadnr);
         update_pos(td, mb_y, INT_MAX & 0xFFFF);
 
         s->mv_min.y -= 64;
@@ -2528,10 +2588,8 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     curframe->tf.f->key_frame = s->keyframe;
     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
                                             : AV_PICTURE_TYPE_P;
-    if ((ret = vp8_alloc_frame(s, curframe, referenced))) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
+    if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
         goto err;
-    }
 
     // check if golden and altref are swapped
     if (s->update_altref != VP56_FRAME_NONE)
@@ -2551,7 +2609,8 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     s->next_framep[VP56_FRAME_CURRENT] = curframe;
 
-    ff_thread_finish_setup(avctx);
+    if (avctx->codec->update_thread_context)
+        ff_thread_finish_setup(avctx);
 
     s->linesize   = curframe->tf.f->linesize[0];
     s->uvlinesize = curframe->tf.f->linesize[1];
@@ -2639,6 +2698,9 @@ av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
     VP8Context *s = avctx->priv_data;
     int i;
 
+    if (!s)
+        return 0;
+
     vp8_decode_flush_impl(avctx, 1);
     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
         av_frame_free(&s->frames[i].tf.f);
@@ -2664,6 +2726,7 @@ int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
     int ret;
 
     s->avctx = avctx;
+    s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     avctx->internal->allocate_progress = 1;
 
@@ -2673,9 +2736,13 @@ int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
     if (CONFIG_VP7_DECODER && is_vp7) {
         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
         ff_vp7dsp_init(&s->vp8dsp);
+        s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
+        s->filter_mb_row           = vp7_filter_mb_row;
     } else if (CONFIG_VP8_DECODER && !is_vp7) {
         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
         ff_vp8dsp_init(&s->vp8dsp);
+        s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
+        s->filter_mb_row           = vp8_filter_mb_row;
     }
 
     /* does not change for VP8 */
@@ -2702,6 +2769,7 @@ av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
 }
 
 #if CONFIG_VP8_DECODER
+#if HAVE_THREADS
 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
 {
     VP8Context *s = avctx->priv_data;
@@ -2717,7 +2785,7 @@ static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
     return 0;
 }
 
-#define REBASE(pic) pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
+#define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
 
 static int vp8_decode_update_thread_context(AVCodecContext *dst,
                                             const AVCodecContext *src)
@@ -2752,6 +2820,7 @@ static int vp8_decode_update_thread_context(AVCodecContext *dst,
 
     return 0;
 }
+#endif /* HAVE_THREADS */
 #endif /* CONFIG_VP8_DECODER */
 
 #if CONFIG_VP7_DECODER
diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index 2919a143af..2135bd9d83 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -6,20 +6,20 @@
  * Copyright (C) 2010 Fiona Glaser
  * Copyright (C) 2012 Daniel Kang
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,6 +35,8 @@
 
 #if HAVE_PTHREADS
 #   include <pthread.h>
+#elif HAVE_OS2THREADS
+#   include "compat/os2threads.h"
 #elif HAVE_W32THREADS
 #   include "compat/w32pthreads.h"
 #endif
@@ -132,6 +134,11 @@ typedef struct VP8Frame {
     AVBufferRef *seg_map;
 } VP8Frame;
 
+typedef struct VP8intmv {
+    int x;
+    int y;
+} VP8intmv;
+
 #define MAX_THREADS 8
 typedef struct VP8Context {
     VP8ThreadData *thread_data;
@@ -150,8 +157,8 @@ typedef struct VP8Context {
     uint8_t deblock_filter;
     uint8_t mbskip_enabled;
     uint8_t profile;
-    VP56mv mv_min;
-    VP56mv mv_max;
+    VP8intmv mv_min;
+    VP8intmv mv_max;
 
     int8_t sign_bias[4]; ///< one state [0, 1] per ref frame type
     int ref_count[3];
@@ -275,6 +282,11 @@ typedef struct VP8Context {
      */
     int mb_layout;
 
+    void (*decode_mb_row_no_filter)(AVCodecContext *avctx, void *tdata, int jobnr, int threadnr);
+    void (*filter_mb_row)(AVCodecContext *avctx, void *tdata, int jobnr, int threadnr);
+
+    int vp7;
+
     /**
      * Fade bit present in bitstream (VP7)
      */
diff --git a/libavcodec/vp8_parser.c b/libavcodec/vp8_parser.c
index 8f6459ccec..afc7f991e6 100644
--- a/libavcodec/vp8_parser.c
+++ b/libavcodec/vp8_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2008 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp8data.h b/libavcodec/vp8data.h
index b49dea98fa..f9dbf56feb 100644
--- a/libavcodec/vp8data.h
+++ b/libavcodec/vp8data.h
@@ -2,20 +2,20 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c
index 4e4012f88e..07bea69c78 100644
--- a/libavcodec/vp8dsp.c
+++ b/libavcodec/vp8dsp.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2014 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
  */
 
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 
 #include "mathops.h"
 #include "vp8dsp.h"
@@ -71,10 +72,7 @@ static void vp7_luma_dc_wht_c(int16_t block[4][4][16], int16_t dc[16])
         b1 = (tmp[i + 0] - tmp[i + 8]) * 23170;
         c1 = tmp[i + 4] * 12540 - tmp[i + 12] * 30274;
         d1 = tmp[i + 4] * 30274 + tmp[i + 12] * 12540;
-        dc[i * 4 + 0] = 0;
-        dc[i * 4 + 1] = 0;
-        dc[i * 4 + 2] = 0;
-        dc[i * 4 + 3] = 0;
+        AV_ZERO64(dc + i * 4);
         block[0][i][0] = (a1 + d1 + 0x20000) >> 18;
         block[3][i][0] = (a1 - d1 + 0x20000) >> 18;
         block[1][i][0] = (b1 + c1 + 0x20000) >> 18;
@@ -105,10 +103,7 @@ static void vp7_idct_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
         b1 = (block[i * 4 + 0] - block[i * 4 + 2]) * 23170;
         c1 = block[i * 4 + 1] * 12540 - block[i * 4 + 3] * 30274;
         d1 = block[i * 4 + 1] * 30274 + block[i * 4 + 3] * 12540;
-        block[i * 4 + 0] = 0;
-        block[i * 4 + 1] = 0;
-        block[i * 4 + 2] = 0;
-        block[i * 4 + 3] = 0;
+        AV_ZERO64(block + i * 4);
         tmp[i * 4 + 0] = (a1 + d1) >> 14;
         tmp[i * 4 + 3] = (a1 - d1) >> 14;
         tmp[i * 4 + 1] = (b1 + c1) >> 14;
@@ -171,10 +166,7 @@ static void vp8_luma_dc_wht_c(int16_t block[4][4][16], int16_t dc[16])
         t1 = dc[i * 4 + 1] + dc[i * 4 + 2];
         t2 = dc[i * 4 + 1] - dc[i * 4 + 2];
         t3 = dc[i * 4 + 0] - dc[i * 4 + 3] + 3; // rounding
-        dc[i * 4 + 0] = 0;
-        dc[i * 4 + 1] = 0;
-        dc[i * 4 + 2] = 0;
-        dc[i * 4 + 3] = 0;
+        AV_ZERO64(dc + i * 4);
 
         block[i][0][0] = (t0 + t1) >> 3;
         block[i][1][0] = (t3 + t2) >> 3;
@@ -262,7 +254,7 @@ MK_IDCT_DC_ADD4_C(vp8)
     int av_unused q2 = p[ 2 * stride];                                        \
     int av_unused q3 = p[ 3 * stride];
 
-#define clip_int8(n) (cm[n + 0x80] - 0x80)
+#define clip_int8(n) (cm[(n) + 0x80] - 0x80)
 
 static av_always_inline void filter_common(uint8_t *p, ptrdiff_t stride,
                                            int is4tap, int is_vp7)
@@ -743,5 +735,7 @@ av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
         ff_vp8dsp_init_arm(dsp);
     if (ARCH_X86)
         ff_vp8dsp_init_x86(dsp);
+    if (ARCH_MIPS)
+        ff_vp8dsp_init_mips(dsp);
 }
 #endif /* CONFIG_VP8_DECODER */
diff --git a/libavcodec/vp8dsp.h b/libavcodec/vp8dsp.h
index 4864cf7969..0401c9221b 100644
--- a/libavcodec/vp8dsp.h
+++ b/libavcodec/vp8dsp.h
@@ -2,20 +2,20 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -98,6 +98,7 @@ void ff_vp78dsp_init_x86(VP8DSPContext *c);
 void ff_vp8dsp_init(VP8DSPContext *c);
 void ff_vp8dsp_init_arm(VP8DSPContext *c);
 void ff_vp8dsp_init_x86(VP8DSPContext *c);
+void ff_vp8dsp_init_mips(VP8DSPContext *c);
 
 #define IS_VP7 1
 #define IS_VP8 0
diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 45f0771695..ed0d905a0a 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -4,99 +4,414 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/avassert.h"
-
 #include "avcodec.h"
 #include "get_bits.h"
 #include "internal.h"
+#include "thread.h"
 #include "videodsp.h"
 #include "vp56.h"
 #include "vp9.h"
 #include "vp9data.h"
+#include "vp9dsp.h"
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
 
 #define VP9_SYNCCODE 0x498342
-#define MAX_PROB 255
 
-static void vp9_decode_flush(AVCodecContext *avctx)
+enum CompPredMode {
+    PRED_SINGLEREF,
+    PRED_COMPREF,
+    PRED_SWITCHABLE,
+};
+
+enum BlockLevel {
+    BL_64X64,
+    BL_32X32,
+    BL_16X16,
+    BL_8X8,
+};
+
+enum BlockSize {
+    BS_64x64,
+    BS_64x32,
+    BS_32x64,
+    BS_32x32,
+    BS_32x16,
+    BS_16x32,
+    BS_16x16,
+    BS_16x8,
+    BS_8x16,
+    BS_8x8,
+    BS_8x4,
+    BS_4x8,
+    BS_4x4,
+    N_BS_SIZES,
+};
+
+struct VP9mvrefPair {
+    VP56mv mv[2];
+    int8_t ref[2];
+};
+
+typedef struct VP9Frame {
+    ThreadFrame tf;
+    AVBufferRef *extradata;
+    uint8_t *segmentation_map;
+    struct VP9mvrefPair *mv;
+    int uses_2pass;
+} VP9Frame;
+
+struct VP9Filter {
+    uint8_t level[8 * 8];
+    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
+                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
+};
+
+typedef struct VP9Block {
+    uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
+    enum FilterMode filter;
+    VP56mv mv[4 /* b_idx */][2 /* ref */];
+    enum BlockSize bs;
+    enum TxfmMode tx, uvtx;
+    enum BlockLevel bl;
+    enum BlockPartition bp;
+} VP9Block;
+
+typedef struct VP9Context {
+    VP9DSPContext dsp;
+    VideoDSPContext vdsp;
+    GetBitContext gb;
+    VP56RangeCoder c;
+    VP56RangeCoder *c_b;
+    unsigned c_b_size;
+    VP9Block *b_base, *b;
+    int pass;
+    int row, row7, col, col7;
+    uint8_t *dst[3];
+    ptrdiff_t y_stride, uv_stride;
+
+    // bitstream header
+    uint8_t keyframe, last_keyframe;
+    uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
+    uint8_t invisible;
+    uint8_t use_last_frame_mvs;
+    uint8_t errorres;
+    uint8_t ss_h, ss_v;
+    uint8_t intraonly;
+    uint8_t resetctx;
+    uint8_t refreshrefmask;
+    uint8_t highprecisionmvs;
+    enum FilterMode filtermode;
+    uint8_t allowcompinter;
+    uint8_t fixcompref;
+    uint8_t refreshctx;
+    uint8_t parallelmode;
+    uint8_t framectxid;
+    uint8_t refidx[3];
+    uint8_t signbias[3];
+    uint8_t varcompref[2];
+    ThreadFrame refs[8], next_refs[8];
+#define CUR_FRAME 0
+#define REF_FRAME_MVPAIR 1
+#define REF_FRAME_SEGMAP 2
+    VP9Frame frames[3];
+
+    struct {
+        uint8_t level;
+        int8_t sharpness;
+        uint8_t lim_lut[64];
+        uint8_t mblim_lut[64];
+    } filter;
+    struct {
+        uint8_t enabled;
+        int8_t mode[2];
+        int8_t ref[4];
+    } lf_delta;
+    uint8_t yac_qi;
+    int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
+    uint8_t lossless;
+#define MAX_SEGMENT 8
+    struct {
+        uint8_t enabled;
+        uint8_t temporal;
+        uint8_t absolute_vals;
+        uint8_t update_map;
+        struct {
+            uint8_t q_enabled;
+            uint8_t lf_enabled;
+            uint8_t ref_enabled;
+            uint8_t skip_enabled;
+            uint8_t ref_val;
+            int16_t q_val;
+            int8_t lf_val;
+            int16_t qmul[2][2];
+            uint8_t lflvl[4][2];
+        } feat[MAX_SEGMENT];
+    } segmentation;
+    struct {
+        unsigned log2_tile_cols, log2_tile_rows;
+        unsigned tile_cols, tile_rows;
+        unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
+    } tiling;
+    unsigned sb_cols, sb_rows, rows, cols;
+    struct {
+        prob_context p;
+        uint8_t coef[4][2][2][6][6][3];
+    } prob_ctx[4];
+    struct {
+        prob_context p;
+        uint8_t coef[4][2][2][6][6][11];
+        uint8_t seg[7];
+        uint8_t segpred[3];
+    } prob;
+    struct {
+        unsigned y_mode[4][10];
+        unsigned uv_mode[10][10];
+        unsigned filter[4][3];
+        unsigned mv_mode[7][4];
+        unsigned intra[4][2];
+        unsigned comp[5][2];
+        unsigned single_ref[5][2][2];
+        unsigned comp_ref[5][2];
+        unsigned tx32p[2][4];
+        unsigned tx16p[2][3];
+        unsigned tx8p[2][2];
+        unsigned skip[3][2];
+        unsigned mv_joint[4];
+        struct {
+            unsigned sign[2];
+            unsigned classes[11];
+            unsigned class0[2];
+            unsigned bits[10][2];
+            unsigned class0_fp[2][4];
+            unsigned fp[4];
+            unsigned class0_hp[2];
+            unsigned hp[2];
+        } mv_comp[2];
+        unsigned partition[4][4][4];
+        unsigned coef[4][2][2][6][6][3];
+        unsigned eob[4][2][2][6][6][2];
+    } counts;
+    enum TxfmMode txfmmode;
+    enum CompPredMode comppredmode;
+
+    // contextual (left/above) cache
+    DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
+    DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
+    DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
+    DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
+    DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
+    uint8_t *above_partition_ctx;
+    uint8_t *above_mode_ctx;
+    // FIXME maybe merge some of the below in a flags field?
+    uint8_t *above_y_nnz_ctx;
+    uint8_t *above_uv_nnz_ctx[2];
+    uint8_t *above_skip_ctx; // 1bit
+    uint8_t *above_txfm_ctx; // 2bit
+    uint8_t *above_segpred_ctx; // 1bit
+    uint8_t *above_intra_ctx; // 1bit
+    uint8_t *above_comp_ctx; // 1bit
+    uint8_t *above_ref_ctx; // 2bit
+    uint8_t *above_filter_ctx;
+    VP56mv (*above_mv_ctx)[2];
+
+    // whole-frame cache
+    uint8_t *intra_pred_data[3];
+    struct VP9Filter *lflvl;
+    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
+
+    // block reconstruction intermediates
+    int block_alloc_using_2pass;
+    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
+    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
+    struct { int x, y; } min_mv, max_mv;
+    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
+    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
+    uint16_t mvscale[3][2];
+    uint8_t mvstep[3][2];
+} VP9Context;
+
+static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
+    {
+        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
+        { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
+    }, {
+        { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
+        { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+    }
+};
+
+static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
 {
-    VP9Context *s = avctx->priv_data;
-    int i;
+    VP9Context *s = ctx->priv_data;
+    int ret, sz;
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->refs); i++)
-        av_frame_unref(s->refs[i]);
+    if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
+        return ret;
+    sz = 64 * s->sb_cols * s->sb_rows;
+    if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
+        ff_thread_release_buffer(ctx, &f->tf);
+        return AVERROR(ENOMEM);
+    }
+
+    f->segmentation_map = f->extradata->data;
+    f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
+
+    return 0;
+}
+
+static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
+{
+    ff_thread_release_buffer(ctx, &f->tf);
+    av_buffer_unref(&f->extradata);
+    f->segmentation_map = NULL;
 }
 
-static int update_size(AVCodecContext *avctx, int w, int h)
+static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
 {
-    VP9Context *s = avctx->priv_data;
+    int res;
+
+    if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
+        return res;
+    } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
+        vp9_unref_frame(ctx, dst);
+        return AVERROR(ENOMEM);
+    }
+
+    dst->segmentation_map = src->segmentation_map;
+    dst->mv = src->mv;
+    dst->uses_2pass = src->uses_2pass;
+
+    return 0;
+}
+
+static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt)
+{
+    VP9Context *s = ctx->priv_data;
     uint8_t *p;
+    int bytesperpixel = s->bytesperpixel;
 
-    if (s->above_partition_ctx && w == avctx->width && h == avctx->height)
+    av_assert0(w > 0 && h > 0);
+
+    if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && ctx->pix_fmt == fmt)
         return 0;
 
-    vp9_decode_flush(avctx);
+    ctx->width   = w;
+    ctx->height  = h;
+    ctx->pix_fmt = fmt;
+    s->sb_cols   = (w + 63) >> 6;
+    s->sb_rows   = (h + 63) >> 6;
+    s->cols      = (w + 7) >> 3;
+    s->rows      = (h + 7) >> 3;
 
-    if (w <= 0 || h <= 0)
-        return AVERROR_INVALIDDATA;
-
-    avctx->width  = w;
-    avctx->height = h;
-    s->sb_cols    = (w + 63) >> 6;
-    s->sb_rows    = (h + 63) >> 6;
-    s->cols       = (w +  7) >> 3;
-    s->rows       = (h +  7) >> 3;
-
-#define assign(var, type, n) var = (type)p; p += s->sb_cols * n * sizeof(*var)
-    av_free(s->above_partition_ctx);
-    p = av_malloc(s->sb_cols *
-                  (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx) +
-                   64 * s->sb_rows * (1 + sizeof(*s->mv[0]) * 2)));
+#define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
+    av_freep(&s->intra_pred_data[0]);
+    // FIXME we slightly over-allocate here for subsampled chroma, but a little
+    // bit of padding shouldn't affect performance...
+    p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
+                                sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
     if (!p)
         return AVERROR(ENOMEM);
-    assign(s->above_partition_ctx, uint8_t *,     8);
-    assign(s->above_skip_ctx,      uint8_t *,     8);
-    assign(s->above_txfm_ctx,      uint8_t *,     8);
-    assign(s->above_mode_ctx,      uint8_t *,    16);
-    assign(s->above_y_nnz_ctx,     uint8_t *,    16);
-    assign(s->above_uv_nnz_ctx[0], uint8_t *,     8);
-    assign(s->above_uv_nnz_ctx[1], uint8_t *,     8);
-    assign(s->intra_pred_data[0],  uint8_t *,    64);
-    assign(s->intra_pred_data[1],  uint8_t *,    32);
-    assign(s->intra_pred_data[2],  uint8_t *,    32);
-    assign(s->above_segpred_ctx,   uint8_t *,     8);
-    assign(s->above_intra_ctx,     uint8_t *,     8);
-    assign(s->above_comp_ctx,      uint8_t *,     8);
-    assign(s->above_ref_ctx,       uint8_t *,     8);
-    assign(s->above_filter_ctx,    uint8_t *,     8);
-    assign(s->lflvl,               VP9Filter *,   1);
-    assign(s->above_mv_ctx,        VP56mv(*)[2], 16);
-    assign(s->segmentation_map,    uint8_t *,      64 * s->sb_rows);
-    assign(s->mv[0],               VP9MVRefPair *, 64 * s->sb_rows);
-    assign(s->mv[1],               VP9MVRefPair *, 64 * s->sb_rows);
+    assign(s->intra_pred_data[0],  uint8_t *,             64 * bytesperpixel);
+    assign(s->intra_pred_data[1],  uint8_t *,             64 * bytesperpixel);
+    assign(s->intra_pred_data[2],  uint8_t *,             64 * bytesperpixel);
+    assign(s->above_y_nnz_ctx,     uint8_t *,             16);
+    assign(s->above_mode_ctx,      uint8_t *,             16);
+    assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
+    assign(s->above_uv_nnz_ctx[0], uint8_t *,             16);
+    assign(s->above_uv_nnz_ctx[1], uint8_t *,             16);
+    assign(s->above_partition_ctx, uint8_t *,              8);
+    assign(s->above_skip_ctx,      uint8_t *,              8);
+    assign(s->above_txfm_ctx,      uint8_t *,              8);
+    assign(s->above_segpred_ctx,   uint8_t *,              8);
+    assign(s->above_intra_ctx,     uint8_t *,              8);
+    assign(s->above_comp_ctx,      uint8_t *,              8);
+    assign(s->above_ref_ctx,       uint8_t *,              8);
+    assign(s->above_filter_ctx,    uint8_t *,              8);
+    assign(s->lflvl,               struct VP9Filter *,     1);
 #undef assign
 
+    // these will be re-allocated a little later
+    av_freep(&s->b_base);
+    av_freep(&s->block_base);
+
+    if (s->bpp != s->last_bpp) {
+        ff_vp9dsp_init(&s->dsp, s->bpp, ctx->flags & AV_CODEC_FLAG_BITEXACT);
+        ff_videodsp_init(&s->vdsp, s->bpp);
+        s->last_bpp = s->bpp;
+    }
+
+    return 0;
+}
+
+static int update_block_buffers(AVCodecContext *ctx)
+{
+    VP9Context *s = ctx->priv_data;
+    int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
+
+    if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->frames[CUR_FRAME].uses_2pass)
+        return 0;
+
+    av_free(s->b_base);
+    av_free(s->block_base);
+    chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
+    chroma_eobs   = 16 * 16 >> (s->ss_h + s->ss_v);
+    if (s->frames[CUR_FRAME].uses_2pass) {
+        int sbs = s->sb_cols * s->sb_rows;
+
+        s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
+        s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
+                                    16 * 16 + 2 * chroma_eobs) * sbs);
+        if (!s->b_base || !s->block_base)
+            return AVERROR(ENOMEM);
+        s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
+        s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
+        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
+        s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
+        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
+    } else {
+        s->b_base = av_malloc(sizeof(VP9Block));
+        s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
+                                   16 * 16 + 2 * chroma_eobs);
+        if (!s->b_base || !s->block_base)
+            return AVERROR(ENOMEM);
+        s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
+        s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
+        s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
+        s->uveob_base[0] = s->eob_base + 16 * 16;
+        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
+    }
+    s->block_alloc_using_2pass = s->frames[CUR_FRAME].uses_2pass;
+
     return 0;
 }
 
-// The sign bit is at the end, not the start, of a bit sequence
-static av_always_inline int get_bits_with_sign(GetBitContext *gb, int n)
+// for some reason the sign bit is at the end, not the start, of a bit sequence
+static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
 {
     int v = get_bits(gb, n);
     return get_bits1(gb) ? -v : v;
@@ -104,17 +419,13 @@ static av_always_inline int get_bits_with_sign(GetBitContext *gb, int n)
 
 static av_always_inline int inv_recenter_nonneg(int v, int m)
 {
-    if (v > 2 * m)
-        return v;
-    if (v & 1)
-        return m - ((v + 1) >> 1);
-    return m + (v >> 1);
+    return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
 }
 
 // differential forward probability updates
 static int update_prob(VP56RangeCoder *c, int p)
 {
-    static const int inv_map_table[MAX_PROB - 1] = {
+    static const int inv_map_table[255] = {
           7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
         189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
          10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
@@ -133,19 +444,19 @@ static int update_prob(VP56RangeCoder *c, int p)
         207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
         222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
         237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
-        252, 253,
+        252, 253, 253,
     };
     int d;
 
     /* This code is trying to do a differential probability update. For a
      * current probability A in the range [1, 255], the difference to a new
-     * probability of any value can be expressed differentially as 1-A, 255-A
+     * probability of any value can be expressed differentially as 1-A,255-A
      * where some part of this (absolute range) exists both in positive as
      * well as the negative part, whereas another part only exists in one
      * half. We're trying to code this shared part differentially, i.e.
      * times two where the value of the lowest bit specifies the sign, and
      * the single part is then coded on top of this. This absolute difference
-     * then again has a value of [0, 254], but a bigger value in this range
+     * then again has a value of [0,254], but a bigger value in this range
      * indicates that we're further away from the original value A, so we
      * can code this as a VLC code, since higher values are increasingly
      * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
@@ -160,101 +471,146 @@ static int update_prob(VP56RangeCoder *c, int p)
         d = vp8_rac_get_uint(c, 5) + 32;
     } else {
         d = vp8_rac_get_uint(c, 7);
-        if (d >= 65) {
+        if (d >= 65)
             d = (d << 1) - 65 + vp8_rac_get(c);
-            d = av_clip(d, 0, MAX_PROB - 65 - 1);
-        }
         d += 64;
+        av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
     }
 
-    return p <= 128
-           ?   1 + inv_recenter_nonneg(inv_map_table[d], p - 1)
-           : 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
+    return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
+                    255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
 }
 
-static int decode_frame_header(AVCodecContext *avctx,
+static enum AVPixelFormat read_colorspace_details(AVCodecContext *ctx)
+{
+    static const enum AVColorSpace colorspaces[8] = {
+        AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
+        AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
+    };
+    VP9Context *s = ctx->priv_data;
+    enum AVPixelFormat res;
+    int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
+
+    s->bpp_index = bits;
+    s->bpp = 8 + bits * 2;
+    s->bytesperpixel = (7 + s->bpp) >> 3;
+    ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
+    if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
+        static const enum AVPixelFormat pix_fmt_rgb[3] = {
+            AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
+        };
+        if (ctx->profile & 1) {
+            s->ss_h = s->ss_v = 0;
+            res = pix_fmt_rgb[bits];
+            ctx->color_range = AVCOL_RANGE_JPEG;
+            if (get_bits1(&s->gb)) {
+                av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
+                return AVERROR_INVALIDDATA;
+            }
+        } else {
+            av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
+                   ctx->profile);
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
+            { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
+              { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
+            { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
+              { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
+            { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
+              { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
+        };
+        ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
+        if (ctx->profile & 1) {
+            s->ss_h = get_bits1(&s->gb);
+            s->ss_v = get_bits1(&s->gb);
+            if ((res = pix_fmt_for_ss[bits][s->ss_v][s->ss_h]) == AV_PIX_FMT_YUV420P) {
+                av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
+                       ctx->profile);
+                return AVERROR_INVALIDDATA;
+            } else if (get_bits1(&s->gb)) {
+                av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
+                       ctx->profile);
+                return AVERROR_INVALIDDATA;
+            }
+        } else {
+            s->ss_h = s->ss_v = 1;
+            res = pix_fmt_for_ss[bits][1][1];
+        }
+    }
+
+    return res;
+}
+
+static int decode_frame_header(AVCodecContext *ctx,
                                const uint8_t *data, int size, int *ref)
 {
-    VP9Context *s = avctx->priv_data;
-    int c, i, j, k, l, m, n, w, h, max, size2, ret, sharp;
+    VP9Context *s = ctx->priv_data;
+    int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
+    enum AVPixelFormat fmt = ctx->pix_fmt;
     int last_invisible;
     const uint8_t *data2;
 
     /* general header */
-    if ((ret = init_get_bits8(&s->gb, data, size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
-        return ret;
+    if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
+        return res;
     }
     if (get_bits(&s->gb, 2) != 0x2) { // frame marker
-        av_log(avctx, AV_LOG_ERROR, "Invalid frame marker\n");
+        av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
         return AVERROR_INVALIDDATA;
     }
-    s->profile = get_bits1(&s->gb);
-    if (get_bits1(&s->gb)) { // reserved bit
-        av_log(avctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
+    ctx->profile  = get_bits1(&s->gb);
+    ctx->profile |= get_bits1(&s->gb) << 1;
+    if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
+    if (ctx->profile > 3) {
+        av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
         return AVERROR_INVALIDDATA;
     }
     if (get_bits1(&s->gb)) {
         *ref = get_bits(&s->gb, 3);
         return 0;
     }
-
-    s->last_keyframe = s->keyframe;
-    s->keyframe      = !get_bits1(&s->gb);
-
-    last_invisible = s->invisible;
-    s->invisible   = !get_bits1(&s->gb);
-    s->errorres    = get_bits1(&s->gb);
-    // FIXME disable this upon resolution change
+    s->last_keyframe  = s->keyframe;
+    s->keyframe       = !get_bits1(&s->gb);
+    last_invisible    = s->invisible;
+    s->invisible      = !get_bits1(&s->gb);
+    s->errorres       = get_bits1(&s->gb);
     s->use_last_frame_mvs = !s->errorres && !last_invisible;
-
     if (s->keyframe) {
         if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
-            av_log(avctx, AV_LOG_ERROR, "Invalid sync code\n");
-            return AVERROR_INVALIDDATA;
-        }
-        s->colorspace = get_bits(&s->gb, 3);
-        if (s->colorspace == 7) { // RGB = profile 1
-            av_log(avctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
+            av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
             return AVERROR_INVALIDDATA;
         }
-        s->fullrange = get_bits1(&s->gb);
-
-        // subsampling bits
-        if (s->profile == 1 || s->profile == 3) {
-            s->sub_x = get_bits1(&s->gb);
-            s->sub_y = get_bits1(&s->gb);
-            if (s->sub_x && s->sub_y) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "4:2:0 color not supported in profile 1 or 3\n");
-                return AVERROR_INVALIDDATA;
-            }
-            if (get_bits1(&s->gb)) { // reserved bit
-                av_log(avctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
-                return AVERROR_INVALIDDATA;
-            }
-        } else {
-            s->sub_x = s->sub_y = 1;
-        }
-        if (!s->sub_x || !s->sub_y) {
-            avpriv_report_missing_feature(avctx, "Subsampling %d:%d",
-                                          s->sub_x, s->sub_y);
-            return AVERROR_PATCHWELCOME;
-        }
-
+        if ((fmt = read_colorspace_details(ctx)) < 0)
+            return fmt;
+        // for profile 1, here follows the subsampling bits
         s->refreshrefmask = 0xff;
         w = get_bits(&s->gb, 16) + 1;
         h = get_bits(&s->gb, 16) + 1;
         if (get_bits1(&s->gb)) // display size
             skip_bits(&s->gb, 32);
     } else {
-        s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
-        s->resetctx  = s->errorres ? 0 : get_bits(&s->gb, 2);
+        s->intraonly  = s->invisible ? get_bits1(&s->gb) : 0;
+        s->resetctx   = s->errorres ? 0 : get_bits(&s->gb, 2);
         if (s->intraonly) {
             if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
-                av_log(avctx, AV_LOG_ERROR, "Invalid sync code\n");
+                av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
                 return AVERROR_INVALIDDATA;
             }
+            if (ctx->profile >= 1) {
+                if ((fmt = read_colorspace_details(ctx)) < 0)
+                    return fmt;
+            } else {
+                s->ss_h = s->ss_v = 1;
+                s->bpp = 8;
+                s->bpp_index = 0;
+                s->bytesperpixel = 1;
+                fmt = AV_PIX_FMT_YUV420P;
+                ctx->colorspace = AVCOL_SPC_BT470BG;
+                ctx->color_range = AVCOL_RANGE_JPEG;
+            }
             s->refreshrefmask = get_bits(&s->gb, 8);
             w = get_bits(&s->gb, 16) + 1;
             h = get_bits(&s->gb, 16) + 1;
@@ -263,38 +619,42 @@ static int decode_frame_header(AVCodecContext *avctx,
         } else {
             s->refreshrefmask = get_bits(&s->gb, 8);
             s->refidx[0]      = get_bits(&s->gb, 3);
-            s->signbias[0]    = get_bits1(&s->gb);
+            s->signbias[0]    = get_bits1(&s->gb) && !s->errorres;
             s->refidx[1]      = get_bits(&s->gb, 3);
-            s->signbias[1]    = get_bits1(&s->gb);
+            s->signbias[1]    = get_bits1(&s->gb) && !s->errorres;
             s->refidx[2]      = get_bits(&s->gb, 3);
-            s->signbias[2]    = get_bits1(&s->gb);
-            if (!s->refs[s->refidx[0]]->buf[0] ||
-                !s->refs[s->refidx[1]]->buf[0] ||
-                !s->refs[s->refidx[2]]->buf[0]) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "Not all references are available\n");
+            s->signbias[2]    = get_bits1(&s->gb) && !s->errorres;
+            if (!s->refs[s->refidx[0]].f->data[0] ||
+                !s->refs[s->refidx[1]].f->data[0] ||
+                !s->refs[s->refidx[2]].f->data[0]) {
+                av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
                 return AVERROR_INVALIDDATA;
             }
             if (get_bits1(&s->gb)) {
-                w = s->refs[s->refidx[0]]->width;
-                h = s->refs[s->refidx[0]]->height;
+                w = s->refs[s->refidx[0]].f->width;
+                h = s->refs[s->refidx[0]].f->height;
             } else if (get_bits1(&s->gb)) {
-                w = s->refs[s->refidx[1]]->width;
-                h = s->refs[s->refidx[1]]->height;
+                w = s->refs[s->refidx[1]].f->width;
+                h = s->refs[s->refidx[1]].f->height;
             } else if (get_bits1(&s->gb)) {
-                w = s->refs[s->refidx[2]]->width;
-                h = s->refs[s->refidx[2]]->height;
+                w = s->refs[s->refidx[2]].f->width;
+                h = s->refs[s->refidx[2]].f->height;
             } else {
                 w = get_bits(&s->gb, 16) + 1;
                 h = get_bits(&s->gb, 16) + 1;
             }
+            // Note that in this code, "CUR_FRAME" is actually before we
+            // have formally allocated a frame, and thus actually represents
+            // the _last_ frame
+            s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
+                                     s->frames[CUR_FRAME].tf.f->height == h;
             if (get_bits1(&s->gb)) // display size
                 skip_bits(&s->gb, 32);
             s->highprecisionmvs = get_bits1(&s->gb);
-            s->filtermode       = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
-                                  get_bits(&s->gb, 2);
-            s->allowcompinter   = s->signbias[0] != s->signbias[1] ||
-                                  s->signbias[0] != s->signbias[2];
+            s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
+                                                get_bits(&s->gb, 2);
+            s->allowcompinter = (s->signbias[0] != s->signbias[1] ||
+                                 s->signbias[0] != s->signbias[2]);
             if (s->allowcompinter) {
                 if (s->signbias[0] == s->signbias[1]) {
                     s->fixcompref    = 2;
@@ -310,18 +670,53 @@ static int decode_frame_header(AVCodecContext *avctx,
                     s->varcompref[1] = 2;
                 }
             }
+
+            for (i = 0; i < 3; i++) {
+                AVFrame *ref = s->refs[s->refidx[i]].f;
+                int refw = ref->width, refh = ref->height;
+
+                if (ref->format != fmt) {
+                    av_log(ctx, AV_LOG_ERROR,
+                           "Ref pixfmt (%s) did not match current frame (%s)",
+                           av_get_pix_fmt_name(ref->format),
+                           av_get_pix_fmt_name(fmt));
+                    return AVERROR_INVALIDDATA;
+                } else if (refw == w && refh == h) {
+                    s->mvscale[i][0] = s->mvscale[i][1] = 0;
+                } else {
+                    if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
+                        av_log(ctx, AV_LOG_ERROR,
+                               "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
+                               refw, refh, w, h);
+                        return AVERROR_INVALIDDATA;
+                    }
+                    s->mvscale[i][0] = (refw << 14) / w;
+                    s->mvscale[i][1] = (refh << 14) / h;
+                    s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
+                    s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
+                }
+            }
         }
     }
-
     s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
     s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
     s->framectxid   = c = get_bits(&s->gb, 2);
 
     /* loopfilter header data */
+    if (s->keyframe || s->errorres || s->intraonly) {
+        // reset loopfilter defaults
+        s->lf_delta.ref[0] = 1;
+        s->lf_delta.ref[1] = 0;
+        s->lf_delta.ref[2] = -1;
+        s->lf_delta.ref[3] = -1;
+        s->lf_delta.mode[0] = 0;
+        s->lf_delta.mode[1] = 0;
+        memset(s->segmentation.feat, 0, sizeof(s->segmentation.feat));
+    }
     s->filter.level = get_bits(&s->gb, 6);
-    sharp           = get_bits(&s->gb, 3);
-    /* If sharpness changed, reinit lim/mblim LUTs. if it didn't change,
-     * keep the old cache values since they are still valid. */
+    sharp = get_bits(&s->gb, 3);
+    // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
+    // the old cache values since they are still valid
     if (s->filter.sharpness != sharp)
         memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
     s->filter.sharpness = sharp;
@@ -329,22 +724,22 @@ static int decode_frame_header(AVCodecContext *avctx,
         if (get_bits1(&s->gb)) {
             for (i = 0; i < 4; i++)
                 if (get_bits1(&s->gb))
-                    s->lf_delta.ref[i] = get_bits_with_sign(&s->gb, 6);
+                    s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
             for (i = 0; i < 2; i++)
                 if (get_bits1(&s->gb))
-                    s->lf_delta.mode[i] = get_bits_with_sign(&s->gb, 6);
+                    s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
         }
-    } else {
-        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
     }
 
     /* quantization header data */
     s->yac_qi      = get_bits(&s->gb, 8);
-    s->ydc_qdelta  = get_bits1(&s->gb) ? get_bits_with_sign(&s->gb, 4) : 0;
-    s->uvdc_qdelta = get_bits1(&s->gb) ? get_bits_with_sign(&s->gb, 4) : 0;
-    s->uvac_qdelta = get_bits1(&s->gb) ? get_bits_with_sign(&s->gb, 4) : 0;
+    s->ydc_qdelta  = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
+    s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
+    s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
     s->lossless    = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
                      s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
+    if (s->lossless)
+        ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
 
     /* segmentation header info */
     if ((s->segmentation.enabled = get_bits1(&s->gb))) {
@@ -352,83 +747,83 @@ static int decode_frame_header(AVCodecContext *avctx,
             for (i = 0; i < 7; i++)
                 s->prob.seg[i] = get_bits1(&s->gb) ?
                                  get_bits(&s->gb, 8) : 255;
-            if ((s->segmentation.temporal = get_bits1(&s->gb)))
+            if ((s->segmentation.temporal = get_bits1(&s->gb))) {
                 for (i = 0; i < 3; i++)
                     s->prob.segpred[i] = get_bits1(&s->gb) ?
                                          get_bits(&s->gb, 8) : 255;
+            }
         }
 
         if (get_bits1(&s->gb)) {
             s->segmentation.absolute_vals = get_bits1(&s->gb);
             for (i = 0; i < 8; i++) {
                 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
-                    s->segmentation.feat[i].q_val = get_bits_with_sign(&s->gb, 8);
+                    s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
                 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
-                    s->segmentation.feat[i].lf_val = get_bits_with_sign(&s->gb, 6);
+                    s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
                 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
                     s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
                 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
             }
         }
-    } else {
-        s->segmentation.feat[0].q_enabled    = 0;
-        s->segmentation.feat[0].lf_enabled   = 0;
-        s->segmentation.feat[0].skip_enabled = 0;
-        s->segmentation.feat[0].ref_enabled  = 0;
     }
 
     // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
     for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
         int qyac, qydc, quvac, quvdc, lflvl, sh;
 
-        if (s->segmentation.feat[i].q_enabled) {
+        if (s->segmentation.enabled && s->segmentation.feat[i].q_enabled) {
             if (s->segmentation.absolute_vals)
-                qyac = s->segmentation.feat[i].q_val;
+                qyac = av_clip_uintp2(s->segmentation.feat[i].q_val, 8);
             else
-                qyac = s->yac_qi + s->segmentation.feat[i].q_val;
+                qyac = av_clip_uintp2(s->yac_qi + s->segmentation.feat[i].q_val, 8);
         } else {
-            qyac = s->yac_qi;
+            qyac  = s->yac_qi;
         }
         qydc  = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
         quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
         quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
         qyac  = av_clip_uintp2(qyac, 8);
 
-        s->segmentation.feat[i].qmul[0][0] = ff_vp9_dc_qlookup[qydc];
-        s->segmentation.feat[i].qmul[0][1] = ff_vp9_ac_qlookup[qyac];
-        s->segmentation.feat[i].qmul[1][0] = ff_vp9_dc_qlookup[quvdc];
-        s->segmentation.feat[i].qmul[1][1] = ff_vp9_ac_qlookup[quvac];
+        s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
+        s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
+        s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
+        s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
 
         sh = s->filter.level >= 32;
-        if (s->segmentation.feat[i].lf_enabled) {
+        if (s->segmentation.enabled && s->segmentation.feat[i].lf_enabled) {
             if (s->segmentation.absolute_vals)
-                lflvl = s->segmentation.feat[i].lf_val;
+                lflvl = av_clip_uintp2(s->segmentation.feat[i].lf_val, 6);
             else
-                lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
+                lflvl = av_clip_uintp2(s->filter.level + s->segmentation.feat[i].lf_val, 6);
         } else {
-            lflvl = s->filter.level;
+            lflvl  = s->filter.level;
         }
-        s->segmentation.feat[i].lflvl[0][0] =
-        s->segmentation.feat[i].lflvl[0][1] =
-            av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
-        for (j = 1; j < 4; j++) {
-            s->segmentation.feat[i].lflvl[j][0] =
-                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
-                                         s->lf_delta.mode[0]) << sh), 6);
-            s->segmentation.feat[i].lflvl[j][1] =
-                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
-                                         s->lf_delta.mode[1]) << sh), 6);
+        if (s->lf_delta.enabled) {
+            s->segmentation.feat[i].lflvl[0][0] =
+            s->segmentation.feat[i].lflvl[0][1] =
+                av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
+            for (j = 1; j < 4; j++) {
+                s->segmentation.feat[i].lflvl[j][0] =
+                    av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
+                                             s->lf_delta.mode[0]) * (1 << sh)), 6);
+                s->segmentation.feat[i].lflvl[j][1] =
+                    av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
+                                             s->lf_delta.mode[1]) * (1 << sh)), 6);
+            }
+        } else {
+            memset(s->segmentation.feat[i].lflvl, lflvl,
+                   sizeof(s->segmentation.feat[i].lflvl));
         }
     }
 
     /* tiling info */
-    if ((ret = update_size(avctx, w, h)) < 0) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Failed to initialize decoder for %dx%d\n", w, h);
-        return ret;
+    if ((res = update_size(ctx, w, h, fmt)) < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n", w, h, fmt);
+        return res;
     }
     for (s->tiling.log2_tile_cols = 0;
-         (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
+         s->sb_cols > (64 << s->tiling.log2_tile_cols);
          s->tiling.log2_tile_cols++) ;
     for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
     max = FFMAX(0, max - 1);
@@ -439,56 +834,56 @@ static int decode_frame_header(AVCodecContext *avctx,
             break;
     }
     s->tiling.log2_tile_rows = decode012(&s->gb);
-    s->tiling.tile_rows      = 1 << s->tiling.log2_tile_rows;
+    s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
     if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
         s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
-        s->c_b              = av_fast_realloc(s->c_b, &s->c_b_size,
-                                              sizeof(VP56RangeCoder) *
-                                              s->tiling.tile_cols);
+        s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
+                                 sizeof(VP56RangeCoder) * s->tiling.tile_cols);
         if (!s->c_b) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Ran out of memory during range coder init\n");
+            av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
             return AVERROR(ENOMEM);
         }
     }
 
-    if (s->keyframe || s->errorres || s->intraonly) {
-        s->prob_ctx[0].p =
-        s->prob_ctx[1].p =
-        s->prob_ctx[2].p =
-        s->prob_ctx[3].p = ff_vp9_default_probs;
-        memcpy(s->prob_ctx[0].coef, ff_vp9_default_coef_probs,
-               sizeof(ff_vp9_default_coef_probs));
-        memcpy(s->prob_ctx[1].coef, ff_vp9_default_coef_probs,
-               sizeof(ff_vp9_default_coef_probs));
-        memcpy(s->prob_ctx[2].coef, ff_vp9_default_coef_probs,
-               sizeof(ff_vp9_default_coef_probs));
-        memcpy(s->prob_ctx[3].coef, ff_vp9_default_coef_probs,
-               sizeof(ff_vp9_default_coef_probs));
+    if (s->keyframe || s->errorres || (s->intraonly && s->resetctx == 3)) {
+        s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
+                           s->prob_ctx[3].p = vp9_default_probs;
+        memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
+               sizeof(vp9_default_coef_probs));
+        memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
+               sizeof(vp9_default_coef_probs));
+        memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
+               sizeof(vp9_default_coef_probs));
+        memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
+               sizeof(vp9_default_coef_probs));
+    } else if (s->intraonly && s->resetctx == 2) {
+        s->prob_ctx[c].p = vp9_default_probs;
+        memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
+               sizeof(vp9_default_coef_probs));
     }
 
     // next 16 bits is size of the rest of the header (arith-coded)
     size2 = get_bits(&s->gb, 16);
     data2 = align_get_bits(&s->gb);
     if (size2 > size - (data2 - data)) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid compressed header size\n");
+        av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
         return AVERROR_INVALIDDATA;
     }
     ff_vp56_init_range_decoder(&s->c, data2, size2);
     if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
-        av_log(avctx, AV_LOG_ERROR, "Marker bit was set\n");
+        av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if (s->keyframe || s->intraonly)
-        memset(s->counts.coef, 0,
-               sizeof(s->counts.coef) + sizeof(s->counts.eob));
-    else
+    if (s->keyframe || s->intraonly) {
+        memset(s->counts.coef, 0, sizeof(s->counts.coef));
+        memset(s->counts.eob,  0, sizeof(s->counts.eob));
+    } else {
         memset(&s->counts, 0, sizeof(s->counts));
-
-    /* FIXME is it faster to not copy here, but do it down in the fw updates
-     * as explicit copies if the fw update is missing (and skip the copy upon
-     * fw update)? */
+    }
+    // FIXME is it faster to not copy here, but do it down in the fw updates
+    // as explicit copies if the fw update is missing (and skip the copy upon
+    // fw update)?
     s->prob.p = s->prob_ctx[c].p;
 
     // txfm updates
@@ -529,10 +924,11 @@ static int decode_frame_header(AVCodecContext *avctx,
                             if (m >= 3 && l == 0) // dc only has 3 pt
                                 break;
                             for (n = 0; n < 3; n++) {
-                                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                                if (vp56_rac_get_prob_branchy(&s->c, 252)) {
                                     p[n] = update_prob(&s->c, r[n]);
-                                else
+                                } else {
                                     p[n] = r[n];
+                                }
                             }
                             p[3] = 0;
                         }
@@ -617,8 +1013,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                 for (k = 0; k < 3; k++)
                     if (vp56_rac_get_prob_branchy(&s->c, 252))
                         s->prob.p.partition[3 - i][j][k] =
-                            update_prob(&s->c,
-                                        s->prob.p.partition[3 - i][j][k]);
+                            update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
 
         // mv fields don't use the update_prob subexp model for some reason
         for (i = 0; i < 3; i++)
@@ -627,8 +1022,7 @@ static int decode_frame_header(AVCodecContext *avctx,
 
         for (i = 0; i < 2; i++) {
             if (vp56_rac_get_prob_branchy(&s->c, 252))
-                s->prob.p.mv_comp[i].sign =
-                    (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+                s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 
             for (j = 0; j < 10; j++)
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
@@ -636,8 +1030,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 
             if (vp56_rac_get_prob_branchy(&s->c, 252))
-                s->prob.p.mv_comp[i].class0 =
-                    (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+                s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 
             for (j = 0; j < 10; j++)
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
@@ -674,640 +1067,3321 @@ static int decode_frame_header(AVCodecContext *avctx,
     return (data2 - data) + size2;
 }
 
-static int decode_subblock(AVCodecContext *avctx, int row, int col,
-                           VP9Filter *lflvl,
-                           ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
+static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
+                                      VP9Context *s)
+{
+    dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
+    dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
+}
+
+static void find_ref_mvs(VP9Context *s,
+                         VP56mv *pmv, int ref, int z, int idx, int sb)
+{
+    static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
+        [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
+                      { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
+        [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
+                      { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
+        [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
+                      { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
+        [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
+                      { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
+        [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
+                      { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
+        [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
+                      {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
+        [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
+                      { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
+        [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
+                      {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
+        [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
+                      { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
+        [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
+                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
+        [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
+                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
+        [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
+                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
+        [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
+                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
+    };
+    VP9Block *b = s->b;
+    int row = s->row, col = s->col, row7 = s->row7;
+    const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
+#define INVALID_MV 0x80008000U
+    uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
+    int i;
+
+#define RETURN_DIRECT_MV(mv) \
+    do { \
+        uint32_t m = AV_RN32A(&mv); \
+        if (!idx) { \
+            AV_WN32A(pmv, m); \
+            return; \
+        } else if (mem == INVALID_MV) { \
+            mem = m; \
+        } else if (m != mem) { \
+            AV_WN32A(pmv, m); \
+            return; \
+        } \
+    } while (0)
+
+    if (sb >= 0) {
+        if (sb == 2 || sb == 1) {
+            RETURN_DIRECT_MV(b->mv[0][z]);
+        } else if (sb == 3) {
+            RETURN_DIRECT_MV(b->mv[2][z]);
+            RETURN_DIRECT_MV(b->mv[1][z]);
+            RETURN_DIRECT_MV(b->mv[0][z]);
+        }
+
+#define RETURN_MV(mv) \
+    do { \
+        if (sb > 0) { \
+            VP56mv tmp; \
+            uint32_t m; \
+            av_assert2(idx == 1); \
+            av_assert2(mem != INVALID_MV); \
+            if (mem_sub8x8 == INVALID_MV) { \
+                clamp_mv(&tmp, &mv, s); \
+                m = AV_RN32A(&tmp); \
+                if (m != mem) { \
+                    AV_WN32A(pmv, m); \
+                    return; \
+                } \
+                mem_sub8x8 = AV_RN32A(&mv); \
+            } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
+                clamp_mv(&tmp, &mv, s); \
+                m = AV_RN32A(&tmp); \
+                if (m != mem) { \
+                    AV_WN32A(pmv, m); \
+                } else { \
+                    /* BUG I'm pretty sure this isn't the intention */ \
+                    AV_WN32A(pmv, 0); \
+                } \
+                return; \
+            } \
+        } else { \
+            uint32_t m = AV_RN32A(&mv); \
+            if (!idx) { \
+                clamp_mv(pmv, &mv, s); \
+                return; \
+            } else if (mem == INVALID_MV) { \
+                mem = m; \
+            } else if (m != mem) { \
+                clamp_mv(pmv, &mv, s); \
+                return; \
+            } \
+        } \
+    } while (0)
+
+        if (row > 0) {
+            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
+            if (mv->ref[0] == ref) {
+                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
+            } else if (mv->ref[1] == ref) {
+                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
+            }
+        }
+        if (col > s->tiling.tile_col_start) {
+            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
+            if (mv->ref[0] == ref) {
+                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
+            } else if (mv->ref[1] == ref) {
+                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
+            }
+        }
+        i = 2;
+    } else {
+        i = 0;
+    }
+
+    // previously coded MVs in this neighbourhood, using same reference frame
+    for (; i < 8; i++) {
+        int c = p[i][0] + col, r = p[i][1] + row;
+
+        if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
+            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
+
+            if (mv->ref[0] == ref) {
+                RETURN_MV(mv->mv[0]);
+            } else if (mv->ref[1] == ref) {
+                RETURN_MV(mv->mv[1]);
+            }
+        }
+    }
+
+    // MV at this position in previous frame, using same reference frame
+    if (s->use_last_frame_mvs) {
+        struct VP9mvrefPair *mv = &s->frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
+
+        if (!s->frames[REF_FRAME_MVPAIR].uses_2pass)
+            ff_thread_await_progress(&s->frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
+        if (mv->ref[0] == ref) {
+            RETURN_MV(mv->mv[0]);
+        } else if (mv->ref[1] == ref) {
+            RETURN_MV(mv->mv[1]);
+        }
+    }
+
+#define RETURN_SCALE_MV(mv, scale) \
+    do { \
+        if (scale) { \
+            VP56mv mv_temp = { -mv.x, -mv.y }; \
+            RETURN_MV(mv_temp); \
+        } else { \
+            RETURN_MV(mv); \
+        } \
+    } while (0)
+
+    // previously coded MVs in this neighbourhood, using different reference frame
+    for (i = 0; i < 8; i++) {
+        int c = p[i][0] + col, r = p[i][1] + row;
+
+        if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
+            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
+
+            if (mv->ref[0] != ref && mv->ref[0] >= 0) {
+                RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
+            }
+            if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
+                // BUG - libvpx has this condition regardless of whether
+                // we used the first ref MV and pre-scaling
+                AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
+                RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
+            }
+        }
+    }
+
+    // MV at this position in previous frame, using different reference frame
+    if (s->use_last_frame_mvs) {
+        struct VP9mvrefPair *mv = &s->frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
+
+        // no need to await_progress, because we already did that above
+        if (mv->ref[0] != ref && mv->ref[0] >= 0) {
+            RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
+        }
+        if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
+            // BUG - libvpx has this condition regardless of whether
+            // we used the first ref MV and pre-scaling
+            AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
+            RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
+        }
+    }
+
+    AV_ZERO32(pmv);
+    clamp_mv(pmv, pmv, s);
+#undef INVALID_MV
+#undef RETURN_MV
+#undef RETURN_SCALE_MV
+}
+
+static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
+{
+    int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
+    int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
+                                s->prob.p.mv_comp[idx].classes);
+
+    s->counts.mv_comp[idx].sign[sign]++;
+    s->counts.mv_comp[idx].classes[c]++;
+    if (c) {
+        int m;
+
+        for (n = 0, m = 0; m < c; m++) {
+            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
+            n |= bit << m;
+            s->counts.mv_comp[idx].bits[m][bit]++;
+        }
+        n <<= 3;
+        bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
+        n |= bit << 1;
+        s->counts.mv_comp[idx].fp[bit]++;
+        if (hp) {
+            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
+            s->counts.mv_comp[idx].hp[bit]++;
+            n |= bit;
+        } else {
+            n |= 1;
+            // bug in libvpx - we count for bw entropy purposes even if the
+            // bit wasn't coded
+            s->counts.mv_comp[idx].hp[1]++;
+        }
+        n += 8 << c;
+    } else {
+        n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
+        s->counts.mv_comp[idx].class0[n]++;
+        bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
+                               s->prob.p.mv_comp[idx].class0_fp[n]);
+        s->counts.mv_comp[idx].class0_fp[n][bit]++;
+        n = (n << 3) | (bit << 1);
+        if (hp) {
+            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
+            s->counts.mv_comp[idx].class0_hp[bit]++;
+            n |= bit;
+        } else {
+            n |= 1;
+            // bug in libvpx - we count for bw entropy purposes even if the
+            // bit wasn't coded
+            s->counts.mv_comp[idx].class0_hp[1]++;
+        }
+    }
+
+    return sign ? -(n + 1) : (n + 1);
+}
+
+static void fill_mv(VP9Context *s,
+                    VP56mv *mv, int mode, int sb)
+{
+    VP9Block *b = s->b;
+
+    if (mode == ZEROMV) {
+        AV_ZERO64(mv);
+    } else {
+        int hp;
+
+        // FIXME cache this value and reuse for other subblocks
+        find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
+                     mode == NEWMV ? -1 : sb);
+        // FIXME maybe move this code into find_ref_mvs()
+        if ((mode == NEWMV || sb == -1) &&
+            !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
+            if (mv[0].y & 1) {
+                if (mv[0].y < 0)
+                    mv[0].y++;
+                else
+                    mv[0].y--;
+            }
+            if (mv[0].x & 1) {
+                if (mv[0].x < 0)
+                    mv[0].x++;
+                else
+                    mv[0].x--;
+            }
+        }
+        if (mode == NEWMV) {
+            enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
+                                              s->prob.p.mv_joint);
+
+            s->counts.mv_joint[j]++;
+            if (j >= MV_JOINT_V)
+                mv[0].y += read_mv_component(s, 0, hp);
+            if (j & 1)
+                mv[0].x += read_mv_component(s, 1, hp);
+        }
+
+        if (b->comp) {
+            // FIXME cache this value and reuse for other subblocks
+            find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
+                         mode == NEWMV ? -1 : sb);
+            if ((mode == NEWMV || sb == -1) &&
+                !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
+                if (mv[1].y & 1) {
+                    if (mv[1].y < 0)
+                        mv[1].y++;
+                    else
+                        mv[1].y--;
+                }
+                if (mv[1].x & 1) {
+                    if (mv[1].x < 0)
+                        mv[1].x++;
+                    else
+                        mv[1].x--;
+                }
+            }
+            if (mode == NEWMV) {
+                enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
+                                                  s->prob.p.mv_joint);
+
+                s->counts.mv_joint[j]++;
+                if (j >= MV_JOINT_V)
+                    mv[1].y += read_mv_component(s, 0, hp);
+                if (j & 1)
+                    mv[1].x += read_mv_component(s, 1, hp);
+            }
+        }
+    }
+}
+
+static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
+                                       ptrdiff_t stride, int v)
+{
+    switch (w) {
+    case 1:
+        do {
+            *ptr = v;
+            ptr += stride;
+        } while (--h);
+        break;
+    case 2: {
+        int v16 = v * 0x0101;
+        do {
+            AV_WN16A(ptr, v16);
+            ptr += stride;
+        } while (--h);
+        break;
+    }
+    case 4: {
+        uint32_t v32 = v * 0x01010101;
+        do {
+            AV_WN32A(ptr, v32);
+            ptr += stride;
+        } while (--h);
+        break;
+    }
+    case 8: {
+#if HAVE_FAST_64BIT
+        uint64_t v64 = v * 0x0101010101010101ULL;
+        do {
+            AV_WN64A(ptr, v64);
+            ptr += stride;
+        } while (--h);
+#else
+        uint32_t v32 = v * 0x01010101;
+        do {
+            AV_WN32A(ptr,     v32);
+            AV_WN32A(ptr + 4, v32);
+            ptr += stride;
+        } while (--h);
+#endif
+        break;
+    }
+    }
+}
+
+static void decode_mode(AVCodecContext *ctx)
+{
+    static const uint8_t left_ctx[N_BS_SIZES] = {
+        0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
+    };
+    static const uint8_t above_ctx[N_BS_SIZES] = {
+        0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
+    };
+    static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
+        TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
+        TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
+    };
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    int row = s->row, col = s->col, row7 = s->row7;
+    enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
+    int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
+    int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
+    int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
+    int vref, filter_id;
+
+    if (!s->segmentation.enabled) {
+        b->seg_id = 0;
+    } else if (s->keyframe || s->intraonly) {
+        b->seg_id = !s->segmentation.update_map ? 0 :
+                    vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
+    } else if (!s->segmentation.update_map ||
+               (s->segmentation.temporal &&
+                vp56_rac_get_prob_branchy(&s->c,
+                    s->prob.segpred[s->above_segpred_ctx[col] +
+                                    s->left_segpred_ctx[row7]]))) {
+        if (!s->errorres && s->frames[REF_FRAME_SEGMAP].segmentation_map) {
+            int pred = 8, x;
+            uint8_t *refsegmap = s->frames[REF_FRAME_SEGMAP].segmentation_map;
+
+            if (!s->frames[REF_FRAME_SEGMAP].uses_2pass)
+                ff_thread_await_progress(&s->frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
+            for (y = 0; y < h4; y++) {
+                int idx_base = (y + row) * 8 * s->sb_cols + col;
+                for (x = 0; x < w4; x++)
+                    pred = FFMIN(pred, refsegmap[idx_base + x]);
+            }
+            av_assert1(pred < 8);
+            b->seg_id = pred;
+        } else {
+            b->seg_id = 0;
+        }
+
+        memset(&s->above_segpred_ctx[col], 1, w4);
+        memset(&s->left_segpred_ctx[row7], 1, h4);
+    } else {
+        b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
+                                     s->prob.seg);
+
+        memset(&s->above_segpred_ctx[col], 0, w4);
+        memset(&s->left_segpred_ctx[row7], 0, h4);
+    }
+    if (s->segmentation.enabled &&
+        (s->segmentation.update_map || s->keyframe || s->intraonly)) {
+        setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
+                  bw4, bh4, 8 * s->sb_cols, b->seg_id);
+    }
+
+    b->skip = s->segmentation.enabled &&
+        s->segmentation.feat[b->seg_id].skip_enabled;
+    if (!b->skip) {
+        int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
+        b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
+        s->counts.skip[c][b->skip]++;
+    }
+
+    if (s->keyframe || s->intraonly) {
+        b->intra = 1;
+    } else if (s->segmentation.enabled && s->segmentation.feat[b->seg_id].ref_enabled) {
+        b->intra = !s->segmentation.feat[b->seg_id].ref_val;
+    } else {
+        int c, bit;
+
+        if (have_a && have_l) {
+            c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
+            c += (c == 2);
+        } else {
+            c = have_a ? 2 * s->above_intra_ctx[col] :
+                have_l ? 2 * s->left_intra_ctx[row7] : 0;
+        }
+        bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
+        s->counts.intra[c][bit]++;
+        b->intra = !bit;
+    }
+
+    if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
+        int c;
+        if (have_a) {
+            if (have_l) {
+                c = (s->above_skip_ctx[col] ? max_tx :
+                     s->above_txfm_ctx[col]) +
+                    (s->left_skip_ctx[row7] ? max_tx :
+                     s->left_txfm_ctx[row7]) > max_tx;
+            } else {
+                c = s->above_skip_ctx[col] ? 1 :
+                    (s->above_txfm_ctx[col] * 2 > max_tx);
+            }
+        } else if (have_l) {
+            c = s->left_skip_ctx[row7] ? 1 :
+                (s->left_txfm_ctx[row7] * 2 > max_tx);
+        } else {
+            c = 1;
+        }
+        switch (max_tx) {
+        case TX_32X32:
+            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
+            if (b->tx) {
+                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
+                if (b->tx == 2)
+                    b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
+            }
+            s->counts.tx32p[c][b->tx]++;
+            break;
+        case TX_16X16:
+            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
+            if (b->tx)
+                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
+            s->counts.tx16p[c][b->tx]++;
+            break;
+        case TX_8X8:
+            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
+            s->counts.tx8p[c][b->tx]++;
+            break;
+        case TX_4X4:
+            b->tx = TX_4X4;
+            break;
+        }
+    } else {
+        b->tx = FFMIN(max_tx, s->txfmmode);
+    }
+
+    if (s->keyframe || s->intraonly) {
+        uint8_t *a = &s->above_mode_ctx[col * 2];
+        uint8_t *l = &s->left_mode_ctx[(row7) << 1];
+
+        b->comp = 0;
+        if (b->bs > BS_8x8) {
+            // FIXME the memory storage intermediates here aren't really
+            // necessary, they're just there to make the code slightly
+            // simpler for now
+            b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                    vp9_default_kf_ymode_probs[a[0]][l[0]]);
+            if (b->bs != BS_8x4) {
+                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
+                l[0] = a[1] = b->mode[1];
+            } else {
+                l[0] = a[1] = b->mode[1] = b->mode[0];
+            }
+            if (b->bs != BS_4x8) {
+                b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                        vp9_default_kf_ymode_probs[a[0]][l[1]]);
+                if (b->bs != BS_8x4) {
+                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
+                    l[1] = a[1] = b->mode[3];
+                } else {
+                    l[1] = a[1] = b->mode[3] = b->mode[2];
+                }
+            } else {
+                b->mode[2] = b->mode[0];
+                l[1] = a[1] = b->mode[3] = b->mode[1];
+            }
+        } else {
+            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                          vp9_default_kf_ymode_probs[*a][*l]);
+            b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
+            // FIXME this can probably be optimized
+            memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
+            memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
+        }
+        b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                     vp9_default_kf_uvmode_probs[b->mode[3]]);
+    } else if (b->intra) {
+        b->comp = 0;
+        if (b->bs > BS_8x8) {
+            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                          s->prob.p.y_mode[0]);
+            s->counts.y_mode[0][b->mode[0]]++;
+            if (b->bs != BS_8x4) {
+                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                              s->prob.p.y_mode[0]);
+                s->counts.y_mode[0][b->mode[1]]++;
+            } else {
+                b->mode[1] = b->mode[0];
+            }
+            if (b->bs != BS_4x8) {
+                b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                              s->prob.p.y_mode[0]);
+                s->counts.y_mode[0][b->mode[2]]++;
+                if (b->bs != BS_8x4) {
+                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                                  s->prob.p.y_mode[0]);
+                    s->counts.y_mode[0][b->mode[3]]++;
+                } else {
+                    b->mode[3] = b->mode[2];
+                }
+            } else {
+                b->mode[2] = b->mode[0];
+                b->mode[3] = b->mode[1];
+            }
+        } else {
+            static const uint8_t size_group[10] = {
+                3, 3, 3, 3, 2, 2, 2, 1, 1, 1
+            };
+            int sz = size_group[b->bs];
+
+            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                          s->prob.p.y_mode[sz]);
+            b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
+            s->counts.y_mode[sz][b->mode[3]]++;
+        }
+        b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                     s->prob.p.uv_mode[b->mode[3]]);
+        s->counts.uv_mode[b->mode[3]][b->uvmode]++;
+    } else {
+        static const uint8_t inter_mode_ctx_lut[14][14] = {
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
+        };
+
+        if (s->segmentation.enabled && s->segmentation.feat[b->seg_id].ref_enabled) {
+            av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
+            b->comp = 0;
+            b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
+        } else {
+            // read comp_pred flag
+            if (s->comppredmode != PRED_SWITCHABLE) {
+                b->comp = s->comppredmode == PRED_COMPREF;
+            } else {
+                int c;
+
+                // FIXME add intra as ref=0xff (or -1) to make these easier?
+                if (have_a) {
+                    if (have_l) {
+                        if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
+                            c = 4;
+                        } else if (s->above_comp_ctx[col]) {
+                            c = 2 + (s->left_intra_ctx[row7] ||
+                                     s->left_ref_ctx[row7] == s->fixcompref);
+                        } else if (s->left_comp_ctx[row7]) {
+                            c = 2 + (s->above_intra_ctx[col] ||
+                                     s->above_ref_ctx[col] == s->fixcompref);
+                        } else {
+                            c = (!s->above_intra_ctx[col] &&
+                                 s->above_ref_ctx[col] == s->fixcompref) ^
+                            (!s->left_intra_ctx[row7] &&
+                             s->left_ref_ctx[row & 7] == s->fixcompref);
+                        }
+                    } else {
+                        c = s->above_comp_ctx[col] ? 3 :
+                        (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
+                    }
+                } else if (have_l) {
+                    c = s->left_comp_ctx[row7] ? 3 :
+                    (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
+                } else {
+                    c = 1;
+                }
+                b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
+                s->counts.comp[c][b->comp]++;
+            }
+
+            // read actual references
+            // FIXME probably cache a few variables here to prevent repetitive
+            // memory accesses below
+            if (b->comp) /* two references */ {
+                int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
+
+                b->ref[fix_idx] = s->fixcompref;
+                // FIXME can this codeblob be replaced by some sort of LUT?
+                if (have_a) {
+                    if (have_l) {
+                        if (s->above_intra_ctx[col]) {
+                            if (s->left_intra_ctx[row7]) {
+                                c = 2;
+                            } else {
+                                c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
+                            }
+                        } else if (s->left_intra_ctx[row7]) {
+                            c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
+                        } else {
+                            int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
+
+                            if (refl == refa && refa == s->varcompref[1]) {
+                                c = 0;
+                            } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
+                                if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
+                                    (refl == s->fixcompref && refa == s->varcompref[0])) {
+                                    c = 4;
+                                } else {
+                                    c = (refa == refl) ? 3 : 1;
+                                }
+                            } else if (!s->left_comp_ctx[row7]) {
+                                if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
+                                    c = 1;
+                                } else {
+                                    c = (refl == s->varcompref[1] &&
+                                         refa != s->varcompref[1]) ? 2 : 4;
+                                }
+                            } else if (!s->above_comp_ctx[col]) {
+                                if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
+                                    c = 1;
+                                } else {
+                                    c = (refa == s->varcompref[1] &&
+                                         refl != s->varcompref[1]) ? 2 : 4;
+                                }
+                            } else {
+                                c = (refl == refa) ? 4 : 2;
+                            }
+                        }
+                    } else {
+                        if (s->above_intra_ctx[col]) {
+                            c = 2;
+                        } else if (s->above_comp_ctx[col]) {
+                            c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
+                        } else {
+                            c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
+                        }
+                    }
+                } else if (have_l) {
+                    if (s->left_intra_ctx[row7]) {
+                        c = 2;
+                    } else if (s->left_comp_ctx[row7]) {
+                        c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
+                    } else {
+                        c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
+                    }
+                } else {
+                    c = 2;
+                }
+                bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
+                b->ref[var_idx] = s->varcompref[bit];
+                s->counts.comp_ref[c][bit]++;
+            } else /* single reference */ {
+                int bit, c;
+
+                if (have_a && !s->above_intra_ctx[col]) {
+                    if (have_l && !s->left_intra_ctx[row7]) {
+                        if (s->left_comp_ctx[row7]) {
+                            if (s->above_comp_ctx[col]) {
+                                c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
+                                         !s->above_ref_ctx[col]);
+                            } else {
+                                c = (3 * !s->above_ref_ctx[col]) +
+                                    (!s->fixcompref || !s->left_ref_ctx[row7]);
+                            }
+                        } else if (s->above_comp_ctx[col]) {
+                            c = (3 * !s->left_ref_ctx[row7]) +
+                                (!s->fixcompref || !s->above_ref_ctx[col]);
+                        } else {
+                            c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
+                        }
+                    } else if (s->above_intra_ctx[col]) {
+                        c = 2;
+                    } else if (s->above_comp_ctx[col]) {
+                        c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
+                    } else {
+                        c = 4 * (!s->above_ref_ctx[col]);
+                    }
+                } else if (have_l && !s->left_intra_ctx[row7]) {
+                    if (s->left_intra_ctx[row7]) {
+                        c = 2;
+                    } else if (s->left_comp_ctx[row7]) {
+                        c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
+                    } else {
+                        c = 4 * (!s->left_ref_ctx[row7]);
+                    }
+                } else {
+                    c = 2;
+                }
+                bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
+                s->counts.single_ref[c][0][bit]++;
+                if (!bit) {
+                    b->ref[0] = 0;
+                } else {
+                    // FIXME can this codeblob be replaced by some sort of LUT?
+                    if (have_a) {
+                        if (have_l) {
+                            if (s->left_intra_ctx[row7]) {
+                                if (s->above_intra_ctx[col]) {
+                                    c = 2;
+                                } else if (s->above_comp_ctx[col]) {
+                                    c = 1 + 2 * (s->fixcompref == 1 ||
+                                                 s->above_ref_ctx[col] == 1);
+                                } else if (!s->above_ref_ctx[col]) {
+                                    c = 3;
+                                } else {
+                                    c = 4 * (s->above_ref_ctx[col] == 1);
+                                }
+                            } else if (s->above_intra_ctx[col]) {
+                                if (s->left_intra_ctx[row7]) {
+                                    c = 2;
+                                } else if (s->left_comp_ctx[row7]) {
+                                    c = 1 + 2 * (s->fixcompref == 1 ||
+                                                 s->left_ref_ctx[row7] == 1);
+                                } else if (!s->left_ref_ctx[row7]) {
+                                    c = 3;
+                                } else {
+                                    c = 4 * (s->left_ref_ctx[row7] == 1);
+                                }
+                            } else if (s->above_comp_ctx[col]) {
+                                if (s->left_comp_ctx[row7]) {
+                                    if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
+                                        c = 3 * (s->fixcompref == 1 ||
+                                                 s->left_ref_ctx[row7] == 1);
+                                    } else {
+                                        c = 2;
+                                    }
+                                } else if (!s->left_ref_ctx[row7]) {
+                                    c = 1 + 2 * (s->fixcompref == 1 ||
+                                                 s->above_ref_ctx[col] == 1);
+                                } else {
+                                    c = 3 * (s->left_ref_ctx[row7] == 1) +
+                                    (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
+                                }
+                            } else if (s->left_comp_ctx[row7]) {
+                                if (!s->above_ref_ctx[col]) {
+                                    c = 1 + 2 * (s->fixcompref == 1 ||
+                                                 s->left_ref_ctx[row7] == 1);
+                                } else {
+                                    c = 3 * (s->above_ref_ctx[col] == 1) +
+                                    (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
+                                }
+                            } else if (!s->above_ref_ctx[col]) {
+                                if (!s->left_ref_ctx[row7]) {
+                                    c = 3;
+                                } else {
+                                    c = 4 * (s->left_ref_ctx[row7] == 1);
+                                }
+                            } else if (!s->left_ref_ctx[row7]) {
+                                c = 4 * (s->above_ref_ctx[col] == 1);
+                            } else {
+                                c = 2 * (s->left_ref_ctx[row7] == 1) +
+                                2 * (s->above_ref_ctx[col] == 1);
+                            }
+                        } else {
+                            if (s->above_intra_ctx[col] ||
+                                (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
+                                c = 2;
+                            } else if (s->above_comp_ctx[col]) {
+                                c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
+                            } else {
+                                c = 4 * (s->above_ref_ctx[col] == 1);
+                            }
+                        }
+                    } else if (have_l) {
+                        if (s->left_intra_ctx[row7] ||
+                            (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
+                            c = 2;
+                        } else if (s->left_comp_ctx[row7]) {
+                            c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
+                        } else {
+                            c = 4 * (s->left_ref_ctx[row7] == 1);
+                        }
+                    } else {
+                        c = 2;
+                    }
+                    bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
+                    s->counts.single_ref[c][1][bit]++;
+                    b->ref[0] = 1 + bit;
+                }
+            }
+        }
+
+        if (b->bs <= BS_8x8) {
+            if (s->segmentation.enabled && s->segmentation.feat[b->seg_id].skip_enabled) {
+                b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
+            } else {
+                static const uint8_t off[10] = {
+                    3, 0, 0, 1, 0, 0, 0, 0, 0, 0
+                };
+
+                // FIXME this needs to use the LUT tables from find_ref_mvs
+                // because not all are -1,0/0,-1
+                int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
+                                          [s->left_mode_ctx[row7 + off[b->bs]]];
+
+                b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
+                                              s->prob.p.mv_mode[c]);
+                b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
+                s->counts.mv_mode[c][b->mode[0] - 10]++;
+            }
+        }
+
+        if (s->filtermode == FILTER_SWITCHABLE) {
+            int c;
+
+            if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
+                if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
+                    c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
+                        s->left_filter_ctx[row7] : 3;
+                } else {
+                    c = s->above_filter_ctx[col];
+                }
+            } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
+                c = s->left_filter_ctx[row7];
+            } else {
+                c = 3;
+            }
+
+            filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
+                                         s->prob.p.filter[c]);
+            s->counts.filter[c][filter_id]++;
+            b->filter = vp9_filter_lut[filter_id];
+        } else {
+            b->filter = s->filtermode;
+        }
+
+        if (b->bs > BS_8x8) {
+            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
+
+            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
+                                          s->prob.p.mv_mode[c]);
+            s->counts.mv_mode[c][b->mode[0] - 10]++;
+            fill_mv(s, b->mv[0], b->mode[0], 0);
+
+            if (b->bs != BS_8x4) {
+                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
+                                              s->prob.p.mv_mode[c]);
+                s->counts.mv_mode[c][b->mode[1] - 10]++;
+                fill_mv(s, b->mv[1], b->mode[1], 1);
+            } else {
+                b->mode[1] = b->mode[0];
+                AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
+                AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
+            }
+
+            if (b->bs != BS_4x8) {
+                b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
+                                              s->prob.p.mv_mode[c]);
+                s->counts.mv_mode[c][b->mode[2] - 10]++;
+                fill_mv(s, b->mv[2], b->mode[2], 2);
+
+                if (b->bs != BS_8x4) {
+                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
+                                                  s->prob.p.mv_mode[c]);
+                    s->counts.mv_mode[c][b->mode[3] - 10]++;
+                    fill_mv(s, b->mv[3], b->mode[3], 3);
+                } else {
+                    b->mode[3] = b->mode[2];
+                    AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
+                    AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
+                }
+            } else {
+                b->mode[2] = b->mode[0];
+                AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
+                AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
+                b->mode[3] = b->mode[1];
+                AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
+                AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
+            }
+        } else {
+            fill_mv(s, b->mv[0], b->mode[0], -1);
+            AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
+            AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
+            AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
+            AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
+            AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
+            AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
+        }
+
+        vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
+    }
+
+#if HAVE_FAST_64BIT
+#define SPLAT_CTX(var, val, n) \
+    switch (n) { \
+    case 1:  var = val;                                    break; \
+    case 2:  AV_WN16A(&var, val *             0x0101);     break; \
+    case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
+    case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
+    case 16: { \
+        uint64_t v64 = val * 0x0101010101010101ULL; \
+        AV_WN64A(              &var,     v64); \
+        AV_WN64A(&((uint8_t *) &var)[8], v64); \
+        break; \
+    } \
+    }
+#else
+#define SPLAT_CTX(var, val, n) \
+    switch (n) { \
+    case 1:  var = val;                         break; \
+    case 2:  AV_WN16A(&var, val *     0x0101);  break; \
+    case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
+    case 8: { \
+        uint32_t v32 = val * 0x01010101; \
+        AV_WN32A(              &var,     v32); \
+        AV_WN32A(&((uint8_t *) &var)[4], v32); \
+        break; \
+    } \
+    case 16: { \
+        uint32_t v32 = val * 0x01010101; \
+        AV_WN32A(              &var,      v32); \
+        AV_WN32A(&((uint8_t *) &var)[4],  v32); \
+        AV_WN32A(&((uint8_t *) &var)[8],  v32); \
+        AV_WN32A(&((uint8_t *) &var)[12], v32); \
+        break; \
+    } \
+    }
+#endif
+
+    switch (bwh_tab[1][b->bs][0]) {
+#define SET_CTXS(dir, off, n) \
+    do { \
+        SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
+        SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
+        SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
+        if (!s->keyframe && !s->intraonly) { \
+            SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
+            SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
+            SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
+            if (!b->intra) { \
+                SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
+                if (s->filtermode == FILTER_SWITCHABLE) { \
+                    SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
+                } \
+            } \
+        } \
+    } while (0)
+    case 1: SET_CTXS(above, col, 1); break;
+    case 2: SET_CTXS(above, col, 2); break;
+    case 4: SET_CTXS(above, col, 4); break;
+    case 8: SET_CTXS(above, col, 8); break;
+    }
+    switch (bwh_tab[1][b->bs][1]) {
+    case 1: SET_CTXS(left, row7, 1); break;
+    case 2: SET_CTXS(left, row7, 2); break;
+    case 4: SET_CTXS(left, row7, 4); break;
+    case 8: SET_CTXS(left, row7, 8); break;
+    }
+#undef SPLAT_CTX
+#undef SET_CTXS
+
+    if (!s->keyframe && !s->intraonly) {
+        if (b->bs > BS_8x8) {
+            int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
+
+            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
+            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
+            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
+            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
+            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
+            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
+            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
+            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
+        } else {
+            int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
+
+            for (n = 0; n < w4 * 2; n++) {
+                AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
+                AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
+            }
+            for (n = 0; n < h4 * 2; n++) {
+                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
+                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
+            }
+        }
+    }
+
+    // FIXME kinda ugly
+    for (y = 0; y < h4; y++) {
+        int x, o = (row + y) * s->sb_cols * 8 + col;
+        struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
+
+        if (b->intra) {
+            for (x = 0; x < w4; x++) {
+                mv[x].ref[0] =
+                mv[x].ref[1] = -1;
+            }
+        } else if (b->comp) {
+            for (x = 0; x < w4; x++) {
+                mv[x].ref[0] = b->ref[0];
+                mv[x].ref[1] = b->ref[1];
+                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
+                AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
+            }
+        } else {
+            for (x = 0; x < w4; x++) {
+                mv[x].ref[0] = b->ref[0];
+                mv[x].ref[1] = -1;
+                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
+            }
+        }
+    }
+}
+
+// FIXME merge cnt/eob arguments?
+static av_always_inline int
+decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
+                        int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
+                        unsigned (*eob)[6][2], uint8_t (*p)[6][11],
+                        int nnz, const int16_t *scan, const int16_t (*nb)[2],
+                        const int16_t *band_counts, const int16_t *qmul)
+{
+    int i = 0, band = 0, band_left = band_counts[band];
+    uint8_t *tp = p[0][nnz];
+    uint8_t cache[1024];
+
+    do {
+        int val, rc;
+
+        val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
+        eob[band][nnz][val]++;
+        if (!val)
+            break;
+
+    skip_eob:
+        if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
+            cnt[band][nnz][0]++;
+            if (!--band_left)
+                band_left = band_counts[++band];
+            cache[scan[i]] = 0;
+            nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
+            tp = p[band][nnz];
+            if (++i == n_coeffs)
+                break; //invalid input; blocks should end with EOB
+            goto skip_eob;
+        }
+
+        rc = scan[i];
+        if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
+            cnt[band][nnz][1]++;
+            val = 1;
+            cache[rc] = 1;
+        } else {
+            // fill in p[3-10] (model fill) - only once per frame for each pos
+            if (!tp[3])
+                memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
+
+            cnt[band][nnz][2]++;
+            if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
+                if (!vp56_rac_get_prob_branchy(c, tp[4])) {
+                    cache[rc] = val = 2;
+                } else {
+                    val = 3 + vp56_rac_get_prob(c, tp[5]);
+                    cache[rc] = 3;
+                }
+            } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
+                cache[rc] = 4;
+                if (!vp56_rac_get_prob_branchy(c, tp[7])) {
+                    val = 5 + vp56_rac_get_prob(c, 159);
+                } else {
+                    val  = 7 + (vp56_rac_get_prob(c, 165) << 1);
+                    val +=      vp56_rac_get_prob(c, 145);
+                }
+            } else { // cat 3-6
+                cache[rc] = 5;
+                if (!vp56_rac_get_prob_branchy(c, tp[8])) {
+                    if (!vp56_rac_get_prob_branchy(c, tp[9])) {
+                        val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
+                        val +=      (vp56_rac_get_prob(c, 148) << 1);
+                        val +=       vp56_rac_get_prob(c, 140);
+                    } else {
+                        val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
+                        val +=      (vp56_rac_get_prob(c, 155) << 2);
+                        val +=      (vp56_rac_get_prob(c, 140) << 1);
+                        val +=       vp56_rac_get_prob(c, 135);
+                    }
+                } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
+                    val  = 35 + (vp56_rac_get_prob(c, 180) << 4);
+                    val +=      (vp56_rac_get_prob(c, 157) << 3);
+                    val +=      (vp56_rac_get_prob(c, 141) << 2);
+                    val +=      (vp56_rac_get_prob(c, 134) << 1);
+                    val +=       vp56_rac_get_prob(c, 130);
+                } else {
+                    val = 67;
+                    if (!is8bitsperpixel) {
+                        if (bpp == 12) {
+                            val += vp56_rac_get_prob(c, 255) << 17;
+                            val += vp56_rac_get_prob(c, 255) << 16;
+                        }
+                        val +=  (vp56_rac_get_prob(c, 255) << 15);
+                        val +=  (vp56_rac_get_prob(c, 255) << 14);
+                    }
+                    val +=      (vp56_rac_get_prob(c, 254) << 13);
+                    val +=      (vp56_rac_get_prob(c, 254) << 12);
+                    val +=      (vp56_rac_get_prob(c, 254) << 11);
+                    val +=      (vp56_rac_get_prob(c, 252) << 10);
+                    val +=      (vp56_rac_get_prob(c, 249) << 9);
+                    val +=      (vp56_rac_get_prob(c, 243) << 8);
+                    val +=      (vp56_rac_get_prob(c, 230) << 7);
+                    val +=      (vp56_rac_get_prob(c, 196) << 6);
+                    val +=      (vp56_rac_get_prob(c, 177) << 5);
+                    val +=      (vp56_rac_get_prob(c, 153) << 4);
+                    val +=      (vp56_rac_get_prob(c, 140) << 3);
+                    val +=      (vp56_rac_get_prob(c, 133) << 2);
+                    val +=      (vp56_rac_get_prob(c, 130) << 1);
+                    val +=       vp56_rac_get_prob(c, 129);
+                }
+            }
+        }
+#define STORE_COEF(c, i, v) do { \
+    if (is8bitsperpixel) { \
+        c[i] = v; \
+    } else { \
+        AV_WN32A(&c[i * 2], v); \
+    } \
+} while (0)
+        if (!--band_left)
+            band_left = band_counts[++band];
+        if (is_tx32x32)
+            STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
+        else
+            STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
+        nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
+        tp = p[band][nnz];
+    } while (++i < n_coeffs);
+
+    return i;
+}
+
+static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
+                                unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                const int16_t (*nb)[2], const int16_t *band_counts,
+                                const int16_t *qmul)
+{
+    return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
+                                  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                  const int16_t (*nb)[2], const int16_t *band_counts,
+                                  const int16_t *qmul)
+{
+    return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
+                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                 const int16_t (*nb)[2], const int16_t *band_counts,
+                                 const int16_t *qmul)
 {
-    VP9Context *s = avctx->priv_data;
-    int c = ((s->above_partition_ctx[col]       >> (3 - bl)) & 1) |
+    return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
+                                   unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                   uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                   const int16_t (*nb)[2], const int16_t *band_counts,
+                                   const int16_t *qmul)
+{
+    return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
+{
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    int row = s->row, col = s->col;
+    uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
+    unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
+    unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
+    int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
+    int end_x = FFMIN(2 * (s->cols - col), w4);
+    int end_y = FFMIN(2 * (s->rows - row), h4);
+    int n, pl, x, y, res;
+    int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
+    int tx = 4 * s->lossless + b->tx;
+    const int16_t * const *yscans = vp9_scans[tx];
+    const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
+    const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
+    const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
+    uint8_t *a = &s->above_y_nnz_ctx[col * 2];
+    uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
+    static const int16_t band_counts[4][8] = {
+        { 1, 2, 3, 4,  3,   16 - 13 },
+        { 1, 2, 3, 4, 11,   64 - 21 },
+        { 1, 2, 3, 4, 11,  256 - 21 },
+        { 1, 2, 3, 4, 11, 1024 - 21 },
+    };
+    const int16_t *y_band_counts = band_counts[b->tx];
+    const int16_t *uv_band_counts = band_counts[b->uvtx];
+    int bytesperpixel = is8bitsperpixel ? 1 : 2;
+    int total_coeff = 0;
+
+#define MERGE(la, end, step, rd) \
+    for (n = 0; n < end; n += step) \
+        la[n] = !!rd(&la[n])
+#define MERGE_CTX(step, rd) \
+    do { \
+        MERGE(l, end_y, step, rd); \
+        MERGE(a, end_x, step, rd); \
+    } while (0)
+
+#define DECODE_Y_COEF_LOOP(step, mode_index, v) \
+    for (n = 0, y = 0; y < end_y; y += step) { \
+        for (x = 0; x < end_x; x += step, n += step * step) { \
+            enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
+            res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
+                                    (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
+                                     c, e, p, a[x] + l[y], yscans[txtp], \
+                                     ynbs[txtp], y_band_counts, qmul[0]); \
+            a[x] = l[y] = !!res; \
+            total_coeff |= !!res; \
+            if (step >= 4) { \
+                AV_WN16A(&s->eob[n], res); \
+            } else { \
+                s->eob[n] = res; \
+            } \
+        } \
+    }
+
+#define SPLAT(la, end, step, cond) \
+    if (step == 2) { \
+        for (n = 1; n < end; n += step) \
+            la[n] = la[n - 1]; \
+    } else if (step == 4) { \
+        if (cond) { \
+            for (n = 0; n < end; n += step) \
+                AV_WN32A(&la[n], la[n] * 0x01010101); \
+        } else { \
+            for (n = 0; n < end; n += step) \
+                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
+        } \
+    } else /* step == 8 */ { \
+        if (cond) { \
+            if (HAVE_FAST_64BIT) { \
+                for (n = 0; n < end; n += step) \
+                    AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
+            } else { \
+                for (n = 0; n < end; n += step) { \
+                    uint32_t v32 = la[n] * 0x01010101; \
+                    AV_WN32A(&la[n],     v32); \
+                    AV_WN32A(&la[n + 4], v32); \
+                } \
+            } \
+        } else { \
+            for (n = 0; n < end; n += step) \
+                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
+        } \
+    }
+#define SPLAT_CTX(step) \
+    do { \
+        SPLAT(a, end_x, step, end_x == w4); \
+        SPLAT(l, end_y, step, end_y == h4); \
+    } while (0)
+
+    /* y tokens */
+    switch (b->tx) {
+    case TX_4X4:
+        DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
+        break;
+    case TX_8X8:
+        MERGE_CTX(2, AV_RN16A);
+        DECODE_Y_COEF_LOOP(2, 0,);
+        SPLAT_CTX(2);
+        break;
+    case TX_16X16:
+        MERGE_CTX(4, AV_RN32A);
+        DECODE_Y_COEF_LOOP(4, 0,);
+        SPLAT_CTX(4);
+        break;
+    case TX_32X32:
+        MERGE_CTX(8, AV_RN64A);
+        DECODE_Y_COEF_LOOP(8, 0, 32);
+        SPLAT_CTX(8);
+        break;
+    }
+
+#define DECODE_UV_COEF_LOOP(step, v) \
+    for (n = 0, y = 0; y < end_y; y += step) { \
+        for (x = 0; x < end_x; x += step, n += step * step) { \
+            res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
+                                    (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
+                                     16 * step * step, c, e, p, a[x] + l[y], \
+                                     uvscan, uvnb, uv_band_counts, qmul[1]); \
+            a[x] = l[y] = !!res; \
+            total_coeff |= !!res; \
+            if (step >= 4) { \
+                AV_WN16A(&s->uveob[pl][n], res); \
+            } else { \
+                s->uveob[pl][n] = res; \
+            } \
+        } \
+    }
+
+    p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
+    c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
+    e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
+    w4 >>= s->ss_h;
+    end_x >>= s->ss_h;
+    h4 >>= s->ss_v;
+    end_y >>= s->ss_v;
+    for (pl = 0; pl < 2; pl++) {
+        a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
+        l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
+        switch (b->uvtx) {
+        case TX_4X4:
+            DECODE_UV_COEF_LOOP(1,);
+            break;
+        case TX_8X8:
+            MERGE_CTX(2, AV_RN16A);
+            DECODE_UV_COEF_LOOP(2,);
+            SPLAT_CTX(2);
+            break;
+        case TX_16X16:
+            MERGE_CTX(4, AV_RN32A);
+            DECODE_UV_COEF_LOOP(4,);
+            SPLAT_CTX(4);
+            break;
+        case TX_32X32:
+            MERGE_CTX(8, AV_RN64A);
+            DECODE_UV_COEF_LOOP(8, 32);
+            SPLAT_CTX(8);
+            break;
+        }
+    }
+
+    return total_coeff;
+}
+
+static int decode_coeffs_8bpp(AVCodecContext *ctx)
+{
+    return decode_coeffs(ctx, 1);
+}
+
+static int decode_coeffs_16bpp(AVCodecContext *ctx)
+{
+    return decode_coeffs(ctx, 0);
+}
+
+static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
+                                             uint8_t *dst_edge, ptrdiff_t stride_edge,
+                                             uint8_t *dst_inner, ptrdiff_t stride_inner,
+                                             uint8_t *l, int col, int x, int w,
+                                             int row, int y, enum TxfmMode tx,
+                                             int p, int ss_h, int ss_v, int bytesperpixel)
+{
+    int have_top = row > 0 || y > 0;
+    int have_left = col > s->tiling.tile_col_start || x > 0;
+    int have_right = x < w - 1;
+    int bpp = s->bpp;
+    static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
+        [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED },
+                                   { DC_127_PRED,          VERT_PRED } },
+        [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED },
+                                   { HOR_PRED,             HOR_PRED } },
+        [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED },
+                                   { LEFT_DC_PRED,         DC_PRED } },
+        [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED },
+                                   { DC_127_PRED,          DIAG_DOWN_LEFT_PRED } },
+        [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
+                                   { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
+        [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED },
+                                   { VERT_RIGHT_PRED,      VERT_RIGHT_PRED } },
+        [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED },
+                                   { HOR_DOWN_PRED,        HOR_DOWN_PRED } },
+        [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED },
+                                   { DC_127_PRED,          VERT_LEFT_PRED } },
+        [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED },
+                                   { HOR_UP_PRED,          HOR_UP_PRED } },
+        [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED },
+                                   { HOR_PRED,             TM_VP8_PRED } },
+    };
+    static const struct {
+        uint8_t needs_left:1;
+        uint8_t needs_top:1;
+        uint8_t needs_topleft:1;
+        uint8_t needs_topright:1;
+        uint8_t invert_left:1;
+    } edges[N_INTRA_PRED_MODES] = {
+        [VERT_PRED]            = { .needs_top  = 1 },
+        [HOR_PRED]             = { .needs_left = 1 },
+        [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
+        [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
+        [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+        [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+        [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+        [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
+        [HOR_UP_PRED]          = { .needs_left = 1, .invert_left = 1 },
+        [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+        [LEFT_DC_PRED]         = { .needs_left = 1 },
+        [TOP_DC_PRED]          = { .needs_top  = 1 },
+        [DC_128_PRED]          = { 0 },
+        [DC_127_PRED]          = { 0 },
+        [DC_129_PRED]          = { 0 }
+    };
+
+    av_assert2(mode >= 0 && mode < 10);
+    mode = mode_conv[mode][have_left][have_top];
+    if (edges[mode].needs_top) {
+        uint8_t *top, *topleft;
+        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
+        int n_px_need_tr = 0;
+
+        if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
+            n_px_need_tr = 4;
+
+        // if top of sb64-row, use s->intra_pred_data[] instead of
+        // dst[-stride] for intra prediction (it contains pre- instead of
+        // post-loopfilter data)
+        if (have_top) {
+            top = !(row & 7) && !y ?
+                s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
+                y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
+            if (have_left)
+                topleft = !(row & 7) && !y ?
+                    s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
+                    y == 0 || x == 0 ? &dst_edge[-stride_edge] :
+                    &dst_inner[-stride_inner];
+        }
+
+        if (have_top &&
+            (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
+            (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
+            n_px_need + n_px_need_tr <= n_px_have) {
+            *a = top;
+        } else {
+            if (have_top) {
+                if (n_px_need <= n_px_have) {
+                    memcpy(*a, top, n_px_need * bytesperpixel);
+                } else {
+#define memset_bpp(c, i1, v, i2, num) do { \
+    if (bytesperpixel == 1) { \
+        memset(&(c)[(i1)], (v)[(i2)], (num)); \
+    } else { \
+        int n, val = AV_RN16A(&(v)[(i2) * 2]); \
+        for (n = 0; n < (num); n++) { \
+            AV_WN16A(&(c)[((i1) + n) * 2], val); \
+        } \
+    } \
+} while (0)
+                    memcpy(*a, top, n_px_have * bytesperpixel);
+                    memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
+                }
+            } else {
+#define memset_val(c, val, num) do { \
+    if (bytesperpixel == 1) { \
+        memset((c), (val), (num)); \
+    } else { \
+        int n; \
+        for (n = 0; n < (num); n++) { \
+            AV_WN16A(&(c)[n * 2], (val)); \
+        } \
+    } \
+} while (0)
+                memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
+            }
+            if (edges[mode].needs_topleft) {
+                if (have_left && have_top) {
+#define assign_bpp(c, i1, v, i2) do { \
+    if (bytesperpixel == 1) { \
+        (c)[(i1)] = (v)[(i2)]; \
+    } else { \
+        AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
+    } \
+} while (0)
+                    assign_bpp(*a, -1, topleft, -1);
+                } else {
+#define assign_val(c, i, v) do { \
+    if (bytesperpixel == 1) { \
+        (c)[(i)] = (v); \
+    } else { \
+        AV_WN16A(&(c)[(i) * 2], (v)); \
+    } \
+} while (0)
+                    assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
+                }
+            }
+            if (tx == TX_4X4 && edges[mode].needs_topright) {
+                if (have_top && have_right &&
+                    n_px_need + n_px_need_tr <= n_px_have) {
+                    memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
+                } else {
+                    memset_bpp(*a, 4, *a, 3, 4);
+                }
+            }
+        }
+    }
+    if (edges[mode].needs_left) {
+        if (have_left) {
+            int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
+            uint8_t *dst = x == 0 ? dst_edge : dst_inner;
+            ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
+
+            if (edges[mode].invert_left) {
+                if (n_px_need <= n_px_have) {
+                    for (i = 0; i < n_px_need; i++)
+                        assign_bpp(l, i, &dst[i * stride], -1);
+                } else {
+                    for (i = 0; i < n_px_have; i++)
+                        assign_bpp(l, i, &dst[i * stride], -1);
+                    memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
+                }
+            } else {
+                if (n_px_need <= n_px_have) {
+                    for (i = 0; i < n_px_need; i++)
+                        assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
+                } else {
+                    for (i = 0; i < n_px_have; i++)
+                        assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
+                    memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
+                }
+            }
+        } else {
+            memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
+        }
+    }
+
+    return mode;
+}
+
+static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
+                                         ptrdiff_t uv_off, int bytesperpixel)
+{
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    int row = s->row, col = s->col;
+    int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
+    int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
+    int end_x = FFMIN(2 * (s->cols - col), w4);
+    int end_y = FFMIN(2 * (s->rows - row), h4);
+    int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
+    int uvstep1d = 1 << b->uvtx, p;
+    uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
+    LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
+    LOCAL_ALIGNED_32(uint8_t, l, [64]);
+
+    for (n = 0, y = 0; y < end_y; y += step1d) {
+        uint8_t *ptr = dst, *ptr_r = dst_r;
+        for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
+                               ptr_r += 4 * step1d * bytesperpixel, n += step) {
+            int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
+                               y * 2 + x : 0];
+            uint8_t *a = &a_buf[32];
+            enum TxfmType txtp = vp9_intra_txfm_type[mode];
+            int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
+
+            mode = check_intra_mode(s, mode, &a, ptr_r,
+                                    s->frames[CUR_FRAME].tf.f->linesize[0],
+                                    ptr, s->y_stride, l,
+                                    col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
+            s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
+            if (eob)
+                s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
+                                           s->block + 16 * n * bytesperpixel, eob);
+        }
+        dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
+        dst   += 4 * step1d * s->y_stride;
+    }
+
+    // U/V
+    w4 >>= s->ss_h;
+    end_x >>= s->ss_h;
+    end_y >>= s->ss_v;
+    step = 1 << (b->uvtx * 2);
+    for (p = 0; p < 2; p++) {
+        dst   = s->dst[1 + p];
+        dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
+        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
+            uint8_t *ptr = dst, *ptr_r = dst_r;
+            for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
+                                   ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
+                int mode = b->uvmode;
+                uint8_t *a = &a_buf[32];
+                int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
+
+                mode = check_intra_mode(s, mode, &a, ptr_r,
+                                        s->frames[CUR_FRAME].tf.f->linesize[1],
+                                        ptr, s->uv_stride, l, col, x, w4, row, y,
+                                        b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
+                s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
+                if (eob)
+                    s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
+                                                    s->uvblock[p] + 16 * n * bytesperpixel, eob);
+            }
+            dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
+            dst   += 4 * uvstep1d * s->uv_stride;
+        }
+    }
+}
+
+static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
+{
+    intra_recon(ctx, y_off, uv_off, 1);
+}
+
+static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
+{
+    intra_recon(ctx, y_off, uv_off, 2);
+}
+
+static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
+                                              uint8_t *dst, ptrdiff_t dst_stride,
+                                              const uint8_t *ref, ptrdiff_t ref_stride,
+                                              ThreadFrame *ref_frame,
+                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                              int bw, int bh, int w, int h, int bytesperpixel)
+{
+    int mx = mv->x, my = mv->y, th;
+
+    y += my >> 3;
+    x += mx >> 3;
+    ref += y * ref_stride + x * bytesperpixel;
+    mx &= 7;
+    my &= 7;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + bh + 4 * !!my + 7) >> 6;
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    if (x < !!mx * 3 || y < !!my * 3 ||
+        x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
+                                 160, ref_stride,
+                                 bw + !!mx * 7, bh + !!my * 7,
+                                 x - !!mx * 3, y - !!my * 3, w, h);
+        ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
+        ref_stride = 160;
+    }
+    mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
+}
+
+static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
+                                                uint8_t *dst_u, uint8_t *dst_v,
+                                                ptrdiff_t dst_stride,
+                                                const uint8_t *ref_u, ptrdiff_t src_stride_u,
+                                                const uint8_t *ref_v, ptrdiff_t src_stride_v,
+                                                ThreadFrame *ref_frame,
+                                                ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                                int bw, int bh, int w, int h, int bytesperpixel)
+{
+    int mx = mv->x << !s->ss_h, my = mv->y << !s->ss_v, th;
+
+    y += my >> 4;
+    x += mx >> 4;
+    ref_u += y * src_stride_u + x * bytesperpixel;
+    ref_v += y * src_stride_v + x * bytesperpixel;
+    mx &= 15;
+    my &= 15;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    if (x < !!mx * 3 || y < !!my * 3 ||
+        x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
+                                 160, src_stride_u,
+                                 bw + !!mx * 7, bh + !!my * 7,
+                                 x - !!mx * 3, y - !!my * 3, w, h);
+        ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
+        mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
+
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
+                                 160, src_stride_v,
+                                 bw + !!mx * 7, bh + !!my * 7,
+                                 x - !!mx * 3, y - !!my * 3, w, h);
+        ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
+        mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
+    } else {
+        mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
+        mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
+    }
+}
+
+#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
+                    px, py, pw, ph, bw, bh, w, h, i) \
+    mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
+                     mv, bw, bh, w, h, bytesperpixel)
+#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
+    mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                       row, col, mv, bw, bh, w, h, bytesperpixel)
+#define SCALED 0
+#define FN(x) x##_8bpp
+#define BYTES_PER_PIXEL 1
+#include "vp9_mc_template.c"
+#undef FN
+#undef BYTES_PER_PIXEL
+#define FN(x) x##_16bpp
+#define BYTES_PER_PIXEL 2
+#include "vp9_mc_template.c"
+#undef mc_luma_dir
+#undef mc_chroma_dir
+#undef FN
+#undef BYTES_PER_PIXEL
+#undef SCALED
+
+static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
+                                            vp9_mc_func (*mc)[2],
+                                            uint8_t *dst, ptrdiff_t dst_stride,
+                                            const uint8_t *ref, ptrdiff_t ref_stride,
+                                            ThreadFrame *ref_frame,
+                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
+                                            int px, int py, int pw, int ph,
+                                            int bw, int bh, int w, int h, int bytesperpixel,
+                                            const uint16_t *scale, const uint8_t *step)
+{
+    if (s->frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
+        s->frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
+        mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
+                         y, x, in_mv, bw, bh, w, h, bytesperpixel);
+    } else {
+#define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
+    int mx, my;
+    int refbw_m1, refbh_m1;
+    int th;
+    VP56mv mv;
+
+    mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
+    mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
+    // BUG libvpx seems to scale the two components separately. This introduces
+    // rounding errors but we have to reproduce them to be exactly compatible
+    // with the output from libvpx...
+    mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
+    my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
+
+    y = my >> 4;
+    x = mx >> 4;
+    ref += y * ref_stride + x * bytesperpixel;
+    mx &= 15;
+    my &= 15;
+    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
+    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + refbh_m1 + 4 + 7) >> 6;
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref - 3 * ref_stride - 3 * bytesperpixel,
+                                 288, ref_stride,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
+        ref_stride = 288;
+    }
+    smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
+    }
+}
+
+static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
+                                              vp9_mc_func (*mc)[2],
+                                              uint8_t *dst_u, uint8_t *dst_v,
+                                              ptrdiff_t dst_stride,
+                                              const uint8_t *ref_u, ptrdiff_t src_stride_u,
+                                              const uint8_t *ref_v, ptrdiff_t src_stride_v,
+                                              ThreadFrame *ref_frame,
+                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
+                                              int px, int py, int pw, int ph,
+                                              int bw, int bh, int w, int h, int bytesperpixel,
+                                              const uint16_t *scale, const uint8_t *step)
+{
+    if (s->frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
+        s->frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
+        mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
+                           ref_v, src_stride_v, ref_frame,
+                           y, x, in_mv, bw, bh, w, h, bytesperpixel);
+    } else {
+    int mx, my;
+    int refbw_m1, refbh_m1;
+    int th;
+    VP56mv mv;
+
+    if (s->ss_h) {
+        // BUG https://code.google.com/p/webm/issues/detail?id=820
+        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
+        mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
+    } else {
+        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
+        mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
+    }
+    if (s->ss_v) {
+        // BUG https://code.google.com/p/webm/issues/detail?id=820
+        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
+        my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
+    } else {
+        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
+        my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
+    }
+#undef scale_mv
+    y = my >> 4;
+    x = mx >> 4;
+    ref_u += y * src_stride_u + x * bytesperpixel;
+    ref_v += y * src_stride_v + x * bytesperpixel;
+    mx &= 15;
+    my &= 15;
+    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
+    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
+                                 288, src_stride_u,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
+        smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
+
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
+                                 288, src_stride_v,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
+        smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
+    } else {
+        smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
+        smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
+    }
+    }
+}
+
+#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
+                    px, py, pw, ph, bw, bh, w, h, i) \
+    mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
+                   mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
+                   s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
+    mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                     row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
+                     s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#define SCALED 1
+#define FN(x) x##_scaled_8bpp
+#define BYTES_PER_PIXEL 1
+#include "vp9_mc_template.c"
+#undef FN
+#undef BYTES_PER_PIXEL
+#define FN(x) x##_scaled_16bpp
+#define BYTES_PER_PIXEL 2
+#include "vp9_mc_template.c"
+#undef mc_luma_dir
+#undef mc_chroma_dir
+#undef FN
+#undef BYTES_PER_PIXEL
+#undef SCALED
+
+static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
+{
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    int row = s->row, col = s->col;
+
+    if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
+        if (bytesperpixel == 1) {
+            inter_pred_scaled_8bpp(ctx);
+        } else {
+            inter_pred_scaled_16bpp(ctx);
+        }
+    } else {
+        if (bytesperpixel == 1) {
+            inter_pred_8bpp(ctx);
+        } else {
+            inter_pred_16bpp(ctx);
+        }
+    }
+    if (!b->skip) {
+        /* mostly copied intra_recon() */
+
+        int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
+        int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
+        int end_x = FFMIN(2 * (s->cols - col), w4);
+        int end_y = FFMIN(2 * (s->rows - row), h4);
+        int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
+        int uvstep1d = 1 << b->uvtx, p;
+        uint8_t *dst = s->dst[0];
+
+        // y itxfm add
+        for (n = 0, y = 0; y < end_y; y += step1d) {
+            uint8_t *ptr = dst;
+            for (x = 0; x < end_x; x += step1d,
+                 ptr += 4 * step1d * bytesperpixel, n += step) {
+                int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
+
+                if (eob)
+                    s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
+                                                  s->block + 16 * n * bytesperpixel, eob);
+            }
+            dst += 4 * s->y_stride * step1d;
+        }
+
+        // uv itxfm add
+        end_x >>= s->ss_h;
+        end_y >>= s->ss_v;
+        step = 1 << (b->uvtx * 2);
+        for (p = 0; p < 2; p++) {
+            dst = s->dst[p + 1];
+            for (n = 0, y = 0; y < end_y; y += uvstep1d) {
+                uint8_t *ptr = dst;
+                for (x = 0; x < end_x; x += uvstep1d,
+                     ptr += 4 * uvstep1d * bytesperpixel, n += step) {
+                    int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
+
+                    if (eob)
+                        s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
+                                                        s->uvblock[p] + 16 * n * bytesperpixel, eob);
+                }
+                dst += 4 * uvstep1d * s->uv_stride;
+            }
+        }
+    }
+}
+
+static void inter_recon_8bpp(AVCodecContext *ctx)
+{
+    inter_recon(ctx, 1);
+}
+
+static void inter_recon_16bpp(AVCodecContext *ctx)
+{
+    inter_recon(ctx, 2);
+}
+
+static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
+                                        int row_and_7, int col_and_7,
+                                        int w, int h, int col_end, int row_end,
+                                        enum TxfmMode tx, int skip_inter)
+{
+    static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
+    static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
+
+    // FIXME I'm pretty sure all loops can be replaced by a single LUT if
+    // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
+    // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
+    // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
+
+    // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
+    // edges. This means that for UV, we work on two subsampled blocks at
+    // a time, and we only use the topleft block's mode information to set
+    // things like block strength. Thus, for any block size smaller than
+    // 16x16, ignore the odd portion of the block.
+    if (tx == TX_4X4 && (ss_v | ss_h)) {
+        if (h == ss_v) {
+            if (row_and_7 & 1)
+                return;
+            if (!row_end)
+                h += 1;
+        }
+        if (w == ss_h) {
+            if (col_and_7 & 1)
+                return;
+            if (!col_end)
+                w += 1;
+        }
+    }
+
+    if (tx == TX_4X4 && !skip_inter) {
+        int t = 1 << col_and_7, m_col = (t << w) - t, y;
+        // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
+        int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
+
+        for (y = row_and_7; y < h + row_and_7; y++) {
+            int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
+
+            mask[0][y][1] |= m_row_8;
+            mask[0][y][2] |= m_row_4;
+            // for odd lines, if the odd col is not being filtered,
+            // skip odd row also:
+            // .---. <-- a
+            // |   |
+            // |___| <-- b
+            // ^   ^
+            // c   d
+            //
+            // if a/c are even row/col and b/d are odd, and d is skipped,
+            // e.g. right edge of size-66x66.webm, then skip b also (bug)
+            if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
+                mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
+            } else {
+                mask[1][y][col_mask_id] |= m_col;
+            }
+            if (!ss_h)
+                mask[0][y][3] |= m_col;
+            if (!ss_v) {
+                if (ss_h && (col_end & 1))
+                    mask[1][y][3] |= (t << (w - 1)) - t;
+                else
+                    mask[1][y][3] |= m_col;
+            }
+        }
+    } else {
+        int y, t = 1 << col_and_7, m_col = (t << w) - t;
+
+        if (!skip_inter) {
+            int mask_id = (tx == TX_8X8);
+            static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
+            int l2 = tx + ss_h - 1, step1d;
+            int m_row = m_col & masks[l2];
+
+            // at odd UV col/row edges tx16/tx32 loopfilter edges, force
+            // 8wd loopfilter to prevent going off the visible edge.
+            if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
+                int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
+                int m_row_8 = m_row - m_row_16;
+
+                for (y = row_and_7; y < h + row_and_7; y++) {
+                    mask[0][y][0] |= m_row_16;
+                    mask[0][y][1] |= m_row_8;
+                }
+            } else {
+                for (y = row_and_7; y < h + row_and_7; y++)
+                    mask[0][y][mask_id] |= m_row;
+            }
+
+            l2 = tx + ss_v - 1;
+            step1d = 1 << l2;
+            if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
+                for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
+                    mask[1][y][0] |= m_col;
+                if (y - row_and_7 == h - 1)
+                    mask[1][y][1] |= m_col;
+            } else {
+                for (y = row_and_7; y < h + row_and_7; y += step1d)
+                    mask[1][y][mask_id] |= m_col;
+            }
+        } else if (tx != TX_4X4) {
+            int mask_id;
+
+            mask_id = (tx == TX_8X8) || (h == ss_v);
+            mask[1][row_and_7][mask_id] |= m_col;
+            mask_id = (tx == TX_8X8) || (w == ss_h);
+            for (y = row_and_7; y < h + row_and_7; y++)
+                mask[0][y][mask_id] |= t;
+        } else {
+            int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
+
+            for (y = row_and_7; y < h + row_and_7; y++) {
+                mask[0][y][2] |= t4;
+                mask[0][y][1] |= t8;
+            }
+            mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
+        }
+    }
+}
+
+static void decode_b(AVCodecContext *ctx, int row, int col,
+                     struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
+                     enum BlockLevel bl, enum BlockPartition bp)
+{
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    enum BlockSize bs = bl * 3 + bp;
+    int bytesperpixel = s->bytesperpixel;
+    int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
+    int emu[2];
+    AVFrame *f = s->frames[CUR_FRAME].tf.f;
+
+    s->row = row;
+    s->row7 = row & 7;
+    s->col = col;
+    s->col7 = col & 7;
+    s->min_mv.x = -(128 + col * 64);
+    s->min_mv.y = -(128 + row * 64);
+    s->max_mv.x = 128 + (s->cols - col - w4) * 64;
+    s->max_mv.y = 128 + (s->rows - row - h4) * 64;
+    if (s->pass < 2) {
+        b->bs = bs;
+        b->bl = bl;
+        b->bp = bp;
+        decode_mode(ctx);
+        b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
+                           (s->ss_v && h4 * 2 == (1 << b->tx)));
+
+        if (!b->skip) {
+            int has_coeffs;
+
+            if (bytesperpixel == 1) {
+                has_coeffs = decode_coeffs_8bpp(ctx);
+            } else {
+                has_coeffs = decode_coeffs_16bpp(ctx);
+            }
+            if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
+                b->skip = 1;
+                memset(&s->above_skip_ctx[col], 1, w4);
+                memset(&s->left_skip_ctx[s->row7], 1, h4);
+            }
+        } else {
+            int row7 = s->row7;
+
+#define SPLAT_ZERO_CTX(v, n) \
+    switch (n) { \
+    case 1:  v = 0;          break; \
+    case 2:  AV_ZERO16(&v);  break; \
+    case 4:  AV_ZERO32(&v);  break; \
+    case 8:  AV_ZERO64(&v);  break; \
+    case 16: AV_ZERO128(&v); break; \
+    }
+#define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
+    do { \
+        SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
+        if (s->ss_##dir2) { \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
+        } else { \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
+        } \
+    } while (0)
+
+            switch (w4) {
+            case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
+            case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
+            case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
+            case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
+            }
+            switch (h4) {
+            case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
+            case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
+            case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
+            case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
+            }
+        }
+        if (s->pass == 1) {
+            s->b++;
+            s->block += w4 * h4 * 64 * bytesperpixel;
+            s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
+            s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
+            s->eob += 4 * w4 * h4;
+            s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
+            s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
+
+            return;
+        }
+    }
+
+    // emulated overhangs if the stride of the target buffer can't hold. This
+    // makes it possible to support emu-edge and so on even if we have large block
+    // overhangs
+    emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
+             (row + h4) > s->rows;
+    emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
+             (row + h4) > s->rows;
+    if (emu[0]) {
+        s->dst[0] = s->tmp_y;
+        s->y_stride = 128;
+    } else {
+        s->dst[0] = f->data[0] + yoff;
+        s->y_stride = f->linesize[0];
+    }
+    if (emu[1]) {
+        s->dst[1] = s->tmp_uv[0];
+        s->dst[2] = s->tmp_uv[1];
+        s->uv_stride = 128;
+    } else {
+        s->dst[1] = f->data[1] + uvoff;
+        s->dst[2] = f->data[2] + uvoff;
+        s->uv_stride = f->linesize[1];
+    }
+    if (b->intra) {
+        if (s->bpp > 8) {
+            intra_recon_16bpp(ctx, yoff, uvoff);
+        } else {
+            intra_recon_8bpp(ctx, yoff, uvoff);
+        }
+    } else {
+        if (s->bpp > 8) {
+            inter_recon_16bpp(ctx);
+        } else {
+            inter_recon_8bpp(ctx);
+        }
+    }
+    if (emu[0]) {
+        int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
+
+        for (n = 0; o < w; n++) {
+            int bw = 64 >> n;
+
+            av_assert2(n <= 4);
+            if (w & bw) {
+                s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
+                                         s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
+                o += bw;
+            }
+        }
+    }
+    if (emu[1]) {
+        int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
+        int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
+
+        for (n = s->ss_h; o < w; n++) {
+            int bw = 64 >> n;
+
+            av_assert2(n <= 4);
+            if (w & bw) {
+                s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
+                                         s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
+                s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
+                                         s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
+                o += bw;
+            }
+        }
+    }
+
+    // pick filter level and find edges to apply filter to
+    if (s->filter.level &&
+        (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
+                                                    [b->mode[3] != ZEROMV]) > 0) {
+        int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
+        int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
+
+        setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
+        mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
+        if (s->ss_h || s->ss_v)
+            mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
+                       s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
+                       s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
+                       b->uvtx, skip_inter);
+
+        if (!s->filter.lim_lut[lvl]) {
+            int sharp = s->filter.sharpness;
+            int limit = lvl;
+
+            if (sharp > 0) {
+                limit >>= (sharp + 3) >> 2;
+                limit = FFMIN(limit, 9 - sharp);
+            }
+            limit = FFMAX(limit, 1);
+
+            s->filter.lim_lut[lvl] = limit;
+            s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
+        }
+    }
+
+    if (s->pass == 2) {
+        s->b++;
+        s->block += w4 * h4 * 64 * bytesperpixel;
+        s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
+        s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
+        s->eob += 4 * w4 * h4;
+        s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
+        s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
+    }
+}
+
+static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
+                      ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
+{
+    VP9Context *s = ctx->priv_data;
+    int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
             (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
-    int ret;
-    const uint8_t *p = s->keyframe ? ff_vp9_default_kf_partition_probs[bl][c]
-                                   : s->prob.p.partition[bl][c];
+    const uint8_t *p = s->keyframe || s->intraonly ? vp9_default_kf_partition_probs[bl][c] :
+                                                     s->prob.p.partition[bl][c];
     enum BlockPartition bp;
     ptrdiff_t hbs = 4 >> bl;
+    AVFrame *f = s->frames[CUR_FRAME].tf.f;
+    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
+    int bytesperpixel = s->bytesperpixel;
 
     if (bl == BL_8X8) {
-        bp  = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
-        ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
-    } else if (col + hbs < s->cols) {
-        if (row + hbs < s->rows) {
-            bp = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
+        bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
+        decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
+    } else if (col + hbs < s->cols) { // FIXME why not <=?
+        if (row + hbs < s->rows) { // FIXME why not <=?
+            bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
             switch (bp) {
             case PARTITION_NONE:
-                ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                          bl, bp);
+                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_H:
-                ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                          bl, bp);
-                if (!ret) {
-                    yoff  += hbs * 8 * s->cur_frame->linesize[0];
-                    uvoff += hbs * 4 * s->cur_frame->linesize[1];
-                    ret    = ff_vp9_decode_block(avctx, row + hbs, col, lflvl,
-                                                 yoff, uvoff, bl, bp);
-                }
+                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
+                yoff  += hbs * 8 * y_stride;
+                uvoff += hbs * 8 * uv_stride >> s->ss_v;
+                decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_V:
-                ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                          bl, bp);
-                if (!ret) {
-                    yoff  += hbs * 8;
-                    uvoff += hbs * 4;
-                    ret    = ff_vp9_decode_block(avctx, row, col + hbs, lflvl,
-                                                 yoff, uvoff, bl, bp);
-                }
+                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
+                yoff  += hbs * 8 * bytesperpixel;
+                uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
+                decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_SPLIT:
-                ret = decode_subblock(avctx, row, col, lflvl,
-                                      yoff, uvoff, bl + 1);
-                if (!ret) {
-                    ret = decode_subblock(avctx, row, col + hbs, lflvl,
-                                          yoff + 8 * hbs, uvoff + 4 * hbs,
-                                          bl + 1);
-                    if (!ret) {
-                        yoff  += hbs * 8 * s->cur_frame->linesize[0];
-                        uvoff += hbs * 4 * s->cur_frame->linesize[1];
-                        ret    = decode_subblock(avctx, row + hbs, col, lflvl,
-                                                 yoff, uvoff, bl + 1);
-                        if (!ret) {
-                            ret = decode_subblock(avctx, row + hbs, col + hbs,
-                                                  lflvl, yoff + 8 * hbs,
-                                                  uvoff + 4 * hbs, bl + 1);
-                        }
-                    }
-                }
+                decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb(ctx, row, col + hbs, lflvl,
+                          yoff + 8 * hbs * bytesperpixel,
+                          uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
+                yoff  += hbs * 8 * y_stride;
+                uvoff += hbs * 8 * uv_stride >> s->ss_v;
+                decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb(ctx, row + hbs, col + hbs, lflvl,
+                          yoff + 8 * hbs * bytesperpixel,
+                          uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                 break;
             default:
-                av_log(avctx, AV_LOG_ERROR, "Unexpected partition %d.", bp);
-                return AVERROR_INVALIDDATA;
+                av_assert0(0);
             }
         } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
-            bp  = PARTITION_SPLIT;
-            ret = decode_subblock(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
-            if (!ret)
-                ret = decode_subblock(avctx, row, col + hbs, lflvl,
-                                      yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
+            bp = PARTITION_SPLIT;
+            decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
+            decode_sb(ctx, row, col + hbs, lflvl,
+                      yoff + 8 * hbs * bytesperpixel,
+                      uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
         } else {
-            bp  = PARTITION_H;
-            ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                      bl, bp);
+            bp = PARTITION_H;
+            decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
         }
-    } else if (row + hbs < s->rows) {
+    } else if (row + hbs < s->rows) { // FIXME why not <=?
         if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
-            bp  = PARTITION_SPLIT;
-            ret = decode_subblock(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
-            if (!ret) {
-                yoff  += hbs * 8 * s->cur_frame->linesize[0];
-                uvoff += hbs * 4 * s->cur_frame->linesize[1];
-                ret    = decode_subblock(avctx, row + hbs, col, lflvl,
-                                         yoff, uvoff, bl + 1);
-            }
+            bp = PARTITION_SPLIT;
+            decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
+            yoff  += hbs * 8 * y_stride;
+            uvoff += hbs * 8 * uv_stride >> s->ss_v;
+            decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
         } else {
-            bp  = PARTITION_V;
-            ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                      bl, bp);
+            bp = PARTITION_V;
+            decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
         }
     } else {
-        bp  = PARTITION_SPLIT;
-        ret = decode_subblock(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+        bp = PARTITION_SPLIT;
+        decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
     }
     s->counts.partition[bl][c][bp]++;
+}
+
+static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
+                          ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
+{
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    ptrdiff_t hbs = 4 >> bl;
+    AVFrame *f = s->frames[CUR_FRAME].tf.f;
+    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
+    int bytesperpixel = s->bytesperpixel;
 
-    return ret;
+    if (bl == BL_8X8) {
+        av_assert2(b->bl == BL_8X8);
+        decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
+    } else if (s->b->bl == bl) {
+        decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
+        if (b->bp == PARTITION_H && row + hbs < s->rows) {
+            yoff  += hbs * 8 * y_stride;
+            uvoff += hbs * 8 * uv_stride >> s->ss_v;
+            decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
+        } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
+            yoff  += hbs * 8 * bytesperpixel;
+            uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
+            decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
+        }
+    } else {
+        decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
+        if (col + hbs < s->cols) { // FIXME why not <=?
+            if (row + hbs < s->rows) {
+                decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
+                              uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
+                yoff  += hbs * 8 * y_stride;
+                uvoff += hbs * 8 * uv_stride >> s->ss_v;
+                decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
+                              yoff + 8 * hbs * bytesperpixel,
+                              uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
+            } else {
+                yoff  += hbs * 8 * bytesperpixel;
+                uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
+                decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
+            }
+        } else if (row + hbs < s->rows) {
+            yoff  += hbs * 8 * y_stride;
+            uvoff += hbs * 8 * uv_stride >> s->ss_v;
+            decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+        }
+    }
 }
 
-static void loopfilter_subblock(AVCodecContext *avctx, VP9Filter *lflvl,
-                                int row, int col,
-                                ptrdiff_t yoff, ptrdiff_t uvoff)
+static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
+                                               uint8_t *lvl, uint8_t (*mask)[4],
+                                               uint8_t *dst, ptrdiff_t ls)
 {
-    VP9Context *s = avctx->priv_data;
-    uint8_t *dst   = s->cur_frame->data[0] + yoff, *lvl = lflvl->level;
-    ptrdiff_t ls_y = s->cur_frame->linesize[0], ls_uv = s->cur_frame->linesize[1];
-    int y, x, p;
-
-    /* FIXME: In how far can we interleave the v/h loopfilter calls? E.g.
-     * if you think of them as acting on a 8x8 block max, we can interleave
-     * each v/h within the single x loop, but that only works if we work on
-     * 8 pixel blocks, and we won't always do that (we want at least 16px
-     * to use SSE2 optimizations, perhaps 32 for AVX2). */
-
-    // filter edges between columns, Y plane (e.g. block1 | block2)
-    for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
-        uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
-        uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
+    int y, x, bytesperpixel = s->bytesperpixel;
+
+    // filter edges between columns (e.g. block1 | block2)
+    for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
+        uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
         unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
         unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
-        unsigned hm  = hm1 | hm2 | hm13 | hm23;
+        unsigned hm = hm1 | hm2 | hm13 | hm23;
 
-        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
-            if (hm1 & x) {
-                int L = *l, H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
+            if (col || x > 1) {
+                if (hm1 & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
 
-                if (col || x > 1) {
                     if (hmask1[0] & x) {
                         if (hmask2[0] & x) {
-                            av_assert2(l[8] == L);
-                            s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
+                            av_assert2(l[8 << ss_v] == L);
+                            s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
                         } else {
-                            s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
+                            s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
                         }
                     } else if (hm2 & x) {
-                        L  = l[8];
+                        L = l[8 << ss_v];
                         H |= (L >> 4) << 8;
                         E |= s->filter.mblim_lut[L] << 8;
                         I |= s->filter.lim_lut[L] << 8;
                         s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
                                                [!!(hmask2[1] & x)]
-                                               [0](ptr, ls_y, E, I, H);
+                                               [0](ptr, ls, E, I, H);
                     } else {
                         s->dsp.loop_filter_8[!!(hmask1[1] & x)]
-                                            [0](ptr, ls_y, E, I, H);
+                                            [0](ptr, ls, E, I, H);
                     }
-                }
-            } else if (hm2 & x) {
-                int L = l[8], H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+                } else if (hm2 & x) {
+                    int L = l[8 << ss_v], H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
 
-                if (col || x > 1) {
                     s->dsp.loop_filter_8[!!(hmask2[1] & x)]
-                                        [0](ptr + 8 * ls_y, ls_y, E, I, H);
+                                        [0](ptr + 8 * ls, ls, E, I, H);
                 }
             }
-            if (hm13 & x) {
-                int L = *l, H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+            if (ss_h) {
+                if (x & 0xAA)
+                    l += 2;
+            } else {
+                if (hm13 & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
 
-                if (hm23 & x) {
-                    L  = l[8];
-                    H |= (L >> 4) << 8;
-                    E |= s->filter.mblim_lut[L] << 8;
-                    I |= s->filter.lim_lut[L] << 8;
-                    s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
-                } else {
-                    s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
-                }
-            } else if (hm23 & x) {
-                int L = l[8], H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+                    if (hm23 & x) {
+                        L = l[8 << ss_v];
+                        H |= (L >> 4) << 8;
+                        E |= s->filter.mblim_lut[L] << 8;
+                        I |= s->filter.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
+                    }
+                } else if (hm23 & x) {
+                    int L = l[8 << ss_v], H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
 
-                s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
+                    s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
+                }
+                l++;
             }
         }
     }
+}
 
-    //                                          block1
-    // filter edges between rows, Y plane (e.g. ------)
-    //                                          block2
-    dst = s->cur_frame->data[0] + yoff;
-    lvl = lflvl->level;
-    for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
-        uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
+static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
+                                               uint8_t *lvl, uint8_t (*mask)[4],
+                                               uint8_t *dst, ptrdiff_t ls)
+{
+    int y, x, bytesperpixel = s->bytesperpixel;
+
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
+        uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
         unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
 
-        for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
+        for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
             if (row || y) {
                 if (vm & x) {
                     int L = *l, H = L >> 4;
                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
 
                     if (vmask[0] & x) {
-                        if (vmask[0] & (x << 1)) {
-                            av_assert2(l[1] == L);
-                            s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
+                        if (vmask[0] & (x << (1 + ss_h))) {
+                            av_assert2(l[1 + ss_h] == L);
+                            s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
                         } else {
-                            s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
+                            s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
                         }
-                    } else if (vm & (x << 1)) {
-                        L  = l[1];
+                    } else if (vm & (x << (1 + ss_h))) {
+                        L = l[1 + ss_h];
                         H |= (L >> 4) << 8;
                         E |= s->filter.mblim_lut[L] << 8;
                         I |= s->filter.lim_lut[L] << 8;
                         s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
-                                               [!!(vmask[1] & (x << 1))]
-                                               [1](ptr, ls_y, E, I, H);
+                                               [!!(vmask[1] & (x << (1 + ss_h)))]
+                                               [1](ptr, ls, E, I, H);
                     } else {
                         s->dsp.loop_filter_8[!!(vmask[1] & x)]
-                                            [1](ptr, ls_y, E, I, H);
+                                            [1](ptr, ls, E, I, H);
                     }
-                } else if (vm & (x << 1)) {
-                    int L = l[1], H = L >> 4;
+                } else if (vm & (x << (1 + ss_h))) {
+                    int L = l[1 + ss_h], H = L >> 4;
                     int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
 
-                    s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
-                                        [1](ptr + 8, ls_y, E, I, H);
+                    s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
+                                        [1](ptr + 8 * bytesperpixel, ls, E, I, H);
                 }
             }
-            if (vm3 & x) {
-                int L = *l, H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+            if (!ss_v) {
+                if (vm3 & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
 
-                if (vm3 & (x << 1)) {
-                    L  = l[1];
-                    H |= (L >> 4) << 8;
-                    E |= s->filter.mblim_lut[L] << 8;
-                    I |= s->filter.lim_lut[L] << 8;
-                    s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
-                } else {
-                    s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
-                }
-            } else if (vm3 & (x << 1)) {
-                int L = l[1], H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+                    if (vm3 & (x << (1 + ss_h))) {
+                        L = l[1 + ss_h];
+                        H |= (L >> 4) << 8;
+                        E |= s->filter.mblim_lut[L] << 8;
+                        I |= s->filter.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
+                    }
+                } else if (vm3 & (x << (1 + ss_h))) {
+                    int L = l[1 + ss_h], H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
 
-                s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
+                    s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
+                }
             }
         }
+        if (ss_v) {
+            if (y & 1)
+                lvl += 16;
+        } else {
+            lvl += 8;
+        }
     }
+}
+
+static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
+                          int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
+{
+    VP9Context *s = ctx->priv_data;
+    AVFrame *f = s->frames[CUR_FRAME].tf.f;
+    uint8_t *dst = f->data[0] + yoff;
+    ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
+    uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
+    int p;
+
+    // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
+    // if you think of them as acting on a 8x8 block max, we can interleave
+    // each v/h within the single x loop, but that only works if we work on
+    // 8 pixel blocks, and we won't always do that (we want at least 16px
+    // to use SSE2 optimizations, perhaps 32 for AVX2)
+
+    filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
+    filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
 
-    // same principle but for U/V planes
     for (p = 0; p < 2; p++) {
-        lvl = lflvl->level;
-        dst = s->cur_frame->data[1 + p] + uvoff;
-        for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
-            uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
-            uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
-            unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
-            unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
-
-            for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
-                if (col || x > 1) {
-                    if (hm1 & x) {
-                        int L = *l, H = L >> 4;
-                        int E = s->filter.mblim_lut[L];
-                        int I = s->filter.lim_lut[L];
-
-                        if (hmask1[0] & x) {
-                            if (hmask2[0] & x) {
-                                av_assert2(l[16] == L);
-                                s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
-                            } else {
-                                s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
-                            }
-                        } else if (hm2 & x) {
-                            L  = l[16];
-                            H |= (L >> 4) << 8;
-                            E |= s->filter.mblim_lut[L] << 8;
-                            I |= s->filter.lim_lut[L] << 8;
-                            s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
-                                                   [!!(hmask2[1] & x)]
-                                                   [0](ptr, ls_uv, E, I, H);
-                        } else {
-                            s->dsp.loop_filter_8[!!(hmask1[1] & x)]
-                                                [0](ptr, ls_uv, E, I, H);
-                        }
-                    } else if (hm2 & x) {
-                        int L = l[16], H = L >> 4;
-                        int E = s->filter.mblim_lut[L];
-                        int I = s->filter.lim_lut[L];
+        dst = f->data[1 + p] + uvoff;
+        filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
+        filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
+    }
+}
 
-                        s->dsp.loop_filter_8[!!(hmask2[1] & x)]
-                                            [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
+static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
+{
+    int sb_start = ( idx      * n) >> log2_n;
+    int sb_end   = ((idx + 1) * n) >> log2_n;
+    *start = FFMIN(sb_start, n) << 3;
+    *end   = FFMIN(sb_end,   n) << 3;
+}
+
+static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
+                                        int max_count, int update_factor)
+{
+    unsigned ct = ct0 + ct1, p2, p1;
+
+    if (!ct)
+        return;
+
+    p1 = *p;
+    p2 = ((ct0 << 8) + (ct >> 1)) / ct;
+    p2 = av_clip(p2, 1, 255);
+    ct = FFMIN(ct, max_count);
+    update_factor = FASTDIV(update_factor * ct, max_count);
+
+    // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
+    *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
+}
+
+static void adapt_probs(VP9Context *s)
+{
+    int i, j, k, l, m;
+    prob_context *p = &s->prob_ctx[s->framectxid].p;
+    int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
+
+    // coefficients
+    for (i = 0; i < 4; i++)
+        for (j = 0; j < 2; j++)
+            for (k = 0; k < 2; k++)
+                for (l = 0; l < 6; l++)
+                    for (m = 0; m < 6; m++) {
+                        uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
+                        unsigned *e = s->counts.eob[i][j][k][l][m];
+                        unsigned *c = s->counts.coef[i][j][k][l][m];
+
+                        if (l == 0 && m >= 3) // dc only has 3 pt
+                            break;
+
+                        adapt_prob(&pp[0], e[0], e[1], 24, uf);
+                        adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
+                        adapt_prob(&pp[2], c[1], c[2], 24, uf);
                     }
-                }
-                if (x & 0xAA)
-                    l += 2;
-            }
+
+    if (s->keyframe || s->intraonly) {
+        memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
+        memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
+        memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
+        memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
+        return;
+    }
+
+    // skip flag
+    for (i = 0; i < 3; i++)
+        adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
+
+    // intra/inter flag
+    for (i = 0; i < 4; i++)
+        adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
+
+    // comppred flag
+    if (s->comppredmode == PRED_SWITCHABLE) {
+      for (i = 0; i < 5; i++)
+          adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
+    }
+
+    // reference frames
+    if (s->comppredmode != PRED_SINGLEREF) {
+      for (i = 0; i < 5; i++)
+          adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
+                     s->counts.comp_ref[i][1], 20, 128);
+    }
+
+    if (s->comppredmode != PRED_COMPREF) {
+      for (i = 0; i < 5; i++) {
+          uint8_t *pp = p->single_ref[i];
+          unsigned (*c)[2] = s->counts.single_ref[i];
+
+          adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
+          adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
+      }
+    }
+
+    // block partitioning
+    for (i = 0; i < 4; i++)
+        for (j = 0; j < 4; j++) {
+            uint8_t *pp = p->partition[i][j];
+            unsigned *c = s->counts.partition[i][j];
+
+            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
+            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
+            adapt_prob(&pp[2], c[2], c[3], 20, 128);
         }
-        lvl = lflvl->level;
-        dst = s->cur_frame->data[1 + p] + uvoff;
-        for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
-            uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
-            unsigned vm = vmask[0] | vmask[1] | vmask[2];
 
-            for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
-                if (row || y) {
-                    if (vm & x) {
-                        int L = *l, H = L >> 4;
-                        int E = s->filter.mblim_lut[L];
-                        int I = s->filter.lim_lut[L];
+    // tx size
+    if (s->txfmmode == TX_SWITCHABLE) {
+      for (i = 0; i < 2; i++) {
+          unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
 
-                        if (vmask[0] & x) {
-                            if (vmask[0] & (x << 2)) {
-                                av_assert2(l[2] == L);
-                                s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
-                            } else {
-                                s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
-                            }
-                        } else if (vm & (x << 2)) {
-                            L  = l[2];
-                            H |= (L >> 4) << 8;
-                            E |= s->filter.mblim_lut[L] << 8;
-                            I |= s->filter.lim_lut[L] << 8;
-                            s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
-                                                   [!!(vmask[1] & (x << 2))]
-                                                   [1](ptr, ls_uv, E, I, H);
-                        } else {
-                            s->dsp.loop_filter_8[!!(vmask[1] & x)]
-                                                [1](ptr, ls_uv, E, I, H);
-                        }
-                    } else if (vm & (x << 2)) {
-                        int L = l[2], H = L >> 4;
-                        int E = s->filter.mblim_lut[L];
-                        int I = s->filter.lim_lut[L];
+          adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
+          adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
+          adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
+          adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
+          adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
+          adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
+      }
+    }
 
-                        s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
-                                            [1](ptr + 8, ls_uv, E, I, H);
-                    }
-                }
-            }
-            if (y & 1)
-                lvl += 16;
+    // interpolation filter
+    if (s->filtermode == FILTER_SWITCHABLE) {
+        for (i = 0; i < 4; i++) {
+            uint8_t *pp = p->filter[i];
+            unsigned *c = s->counts.filter[i];
+
+            adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
+            adapt_prob(&pp[1], c[1], c[2], 20, 128);
         }
     }
+
+    // inter modes
+    for (i = 0; i < 7; i++) {
+        uint8_t *pp = p->mv_mode[i];
+        unsigned *c = s->counts.mv_mode[i];
+
+        adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
+        adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
+        adapt_prob(&pp[2], c[1], c[3], 20, 128);
+    }
+
+    // mv joints
+    {
+        uint8_t *pp = p->mv_joint;
+        unsigned *c = s->counts.mv_joint;
+
+        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
+        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
+        adapt_prob(&pp[2], c[2], c[3], 20, 128);
+    }
+
+    // mv components
+    for (i = 0; i < 2; i++) {
+        uint8_t *pp;
+        unsigned *c, (*c2)[2], sum;
+
+        adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
+                   s->counts.mv_comp[i].sign[1], 20, 128);
+
+        pp = p->mv_comp[i].classes;
+        c = s->counts.mv_comp[i].classes;
+        sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
+        adapt_prob(&pp[0], c[0], sum, 20, 128);
+        sum -= c[1];
+        adapt_prob(&pp[1], c[1], sum, 20, 128);
+        sum -= c[2] + c[3];
+        adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
+        adapt_prob(&pp[3], c[2], c[3], 20, 128);
+        sum -= c[4] + c[5];
+        adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
+        adapt_prob(&pp[5], c[4], c[5], 20, 128);
+        sum -= c[6];
+        adapt_prob(&pp[6], c[6], sum, 20, 128);
+        adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
+        adapt_prob(&pp[8], c[7], c[8], 20, 128);
+        adapt_prob(&pp[9], c[9], c[10], 20, 128);
+
+        adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
+                   s->counts.mv_comp[i].class0[1], 20, 128);
+        pp = p->mv_comp[i].bits;
+        c2 = s->counts.mv_comp[i].bits;
+        for (j = 0; j < 10; j++)
+            adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
+
+        for (j = 0; j < 2; j++) {
+            pp = p->mv_comp[i].class0_fp[j];
+            c = s->counts.mv_comp[i].class0_fp[j];
+            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
+            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
+            adapt_prob(&pp[2], c[2], c[3], 20, 128);
+        }
+        pp = p->mv_comp[i].fp;
+        c = s->counts.mv_comp[i].fp;
+        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
+        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
+        adapt_prob(&pp[2], c[2], c[3], 20, 128);
+
+        if (s->highprecisionmvs) {
+            adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
+                       s->counts.mv_comp[i].class0_hp[1], 20, 128);
+            adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
+                       s->counts.mv_comp[i].hp[1], 20, 128);
+        }
+    }
+
+    // y intra modes
+    for (i = 0; i < 4; i++) {
+        uint8_t *pp = p->y_mode[i];
+        unsigned *c = s->counts.y_mode[i], sum, s2;
+
+        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
+        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
+        sum -= c[TM_VP8_PRED];
+        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
+        sum -= c[VERT_PRED];
+        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
+        s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
+        sum -= s2;
+        adapt_prob(&pp[3], s2, sum, 20, 128);
+        s2 -= c[HOR_PRED];
+        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
+        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
+        sum -= c[DIAG_DOWN_LEFT_PRED];
+        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
+        sum -= c[VERT_LEFT_PRED];
+        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
+        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
+    }
+
+    // uv intra modes
+    for (i = 0; i < 10; i++) {
+        uint8_t *pp = p->uv_mode[i];
+        unsigned *c = s->counts.uv_mode[i], sum, s2;
+
+        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
+        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
+        sum -= c[TM_VP8_PRED];
+        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
+        sum -= c[VERT_PRED];
+        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
+        s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
+        sum -= s2;
+        adapt_prob(&pp[3], s2, sum, 20, 128);
+        s2 -= c[HOR_PRED];
+        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
+        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
+        sum -= c[DIAG_DOWN_LEFT_PRED];
+        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
+        sum -= c[VERT_LEFT_PRED];
+        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
+        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
+    }
 }
 
-static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
+static void free_buffers(VP9Context *s)
 {
-    int sb_start =  (idx      * n) >> log2_n;
-    int sb_end   = ((idx + 1) * n) >> log2_n;
-    *start = FFMIN(sb_start, n) << 3;
-    *end   = FFMIN(sb_end,   n) << 3;
+    av_freep(&s->intra_pred_data[0]);
+    av_freep(&s->b_base);
+    av_freep(&s->block_base);
 }
 
-static int vp9_decode_frame(AVCodecContext *avctx, AVFrame *frame,
-                            int *got_frame, const uint8_t *data, int size)
+static av_cold int vp9_decode_free(AVCodecContext *ctx)
 {
-    VP9Context *s = avctx->priv_data;
-    int ret, tile_row, tile_col, i, ref = -1, row, col;
-    ptrdiff_t yoff = 0, uvoff = 0;
+    VP9Context *s = ctx->priv_data;
+    int i;
 
-    ret = decode_frame_header(avctx, data, size, &ref);
-    if (ret < 0) {
-        return ret;
-    } else if (!ret) {
-        if (!s->refs[ref]->buf[0]) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Requested reference %d not available\n", ref);
+    for (i = 0; i < 3; i++) {
+        if (s->frames[i].tf.f->data[0])
+            vp9_unref_frame(ctx, &s->frames[i]);
+        av_frame_free(&s->frames[i].tf.f);
+    }
+    for (i = 0; i < 8; i++) {
+        if (s->refs[i].f->data[0])
+            ff_thread_release_buffer(ctx, &s->refs[i]);
+        av_frame_free(&s->refs[i].f);
+        if (s->next_refs[i].f->data[0])
+            ff_thread_release_buffer(ctx, &s->next_refs[i]);
+        av_frame_free(&s->next_refs[i].f);
+    }
+    free_buffers(s);
+    av_freep(&s->c_b);
+    s->c_b_size = 0;
+
+    return 0;
+}
+
+
+static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
+                            int *got_frame, AVPacket *pkt)
+{
+    const uint8_t *data = pkt->data;
+    int size = pkt->size;
+    VP9Context *s = ctx->priv_data;
+    int res, tile_row, tile_col, i, ref, row, col;
+    int retain_segmap_ref = s->frames[REF_FRAME_SEGMAP].segmentation_map &&
+                            (!s->segmentation.enabled || !s->segmentation.update_map);
+    ptrdiff_t yoff, uvoff, ls_y, ls_uv;
+    AVFrame *f;
+    int bytesperpixel;
+
+    if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
+        return res;
+    } else if (res == 0) {
+        if (!s->refs[ref].f->data[0]) {
+            av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
             return AVERROR_INVALIDDATA;
         }
-
-        ret = av_frame_ref(frame, s->refs[ref]);
-        if (ret < 0)
-            return ret;
+        if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
+            return res;
+        ((AVFrame *)frame)->pkt_pts = pkt->pts;
+        ((AVFrame *)frame)->pkt_dts = pkt->dts;
+        for (i = 0; i < 8; i++) {
+            if (s->next_refs[i].f->data[0])
+                ff_thread_release_buffer(ctx, &s->next_refs[i]);
+            if (s->refs[i].f->data[0] &&
+                (res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i])) < 0)
+                return res;
+        }
         *got_frame = 1;
-        return 0;
+        return pkt->size;
     }
-    data += ret;
-    size -= ret;
+    data += res;
+    size -= res;
 
-    s->cur_frame = frame;
-
-    av_frame_unref(s->cur_frame);
-    if ((ret = ff_get_buffer(avctx, s->cur_frame,
-                             s->refreshrefmask ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
-        return ret;
-    s->cur_frame->key_frame = s->keyframe;
-    s->cur_frame->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
-                                          : AV_PICTURE_TYPE_P;
+    if (!retain_segmap_ref || s->keyframe || s->intraonly) {
+        if (s->frames[REF_FRAME_SEGMAP].tf.f->data[0])
+            vp9_unref_frame(ctx, &s->frames[REF_FRAME_SEGMAP]);
+        if (!s->keyframe && !s->intraonly && !s->errorres && s->frames[CUR_FRAME].tf.f->data[0] &&
+            (res = vp9_ref_frame(ctx, &s->frames[REF_FRAME_SEGMAP], &s->frames[CUR_FRAME])) < 0)
+            return res;
+    }
+    if (s->frames[REF_FRAME_MVPAIR].tf.f->data[0])
+        vp9_unref_frame(ctx, &s->frames[REF_FRAME_MVPAIR]);
+    if (!s->intraonly && !s->keyframe && !s->errorres && s->frames[CUR_FRAME].tf.f->data[0] &&
+        (res = vp9_ref_frame(ctx, &s->frames[REF_FRAME_MVPAIR], &s->frames[CUR_FRAME])) < 0)
+        return res;
+    if (s->frames[CUR_FRAME].tf.f->data[0])
+        vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
+    if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
+        return res;
+    f = s->frames[CUR_FRAME].tf.f;
+    f->key_frame = s->keyframe;
+    f->pict_type = (s->keyframe || s->intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+    ls_y = f->linesize[0];
+    ls_uv =f->linesize[1];
 
-    if (s->fullrange)
-        avctx->color_range = AVCOL_RANGE_JPEG;
-    else
-        avctx->color_range = AVCOL_RANGE_MPEG;
+    if (s->frames[REF_FRAME_SEGMAP].tf.f->data[0] &&
+        (s->frames[REF_FRAME_MVPAIR].tf.f->width  != s->frames[CUR_FRAME].tf.f->width ||
+         s->frames[REF_FRAME_MVPAIR].tf.f->height != s->frames[CUR_FRAME].tf.f->height)) {
+        vp9_unref_frame(ctx, &s->frames[REF_FRAME_SEGMAP]);
+    }
 
-    switch (s->colorspace) {
-    case 1: avctx->colorspace = AVCOL_SPC_BT470BG; break;
-    case 2: avctx->colorspace = AVCOL_SPC_BT709; break;
-    case 3: avctx->colorspace = AVCOL_SPC_SMPTE170M; break;
-    case 4: avctx->colorspace = AVCOL_SPC_SMPTE240M; break;
+    // ref frame setup
+    for (i = 0; i < 8; i++) {
+        if (s->next_refs[i].f->data[0])
+            ff_thread_release_buffer(ctx, &s->next_refs[i]);
+        if (s->refreshrefmask & (1 << i)) {
+            res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
+        } else if (s->refs[i].f->data[0]) {
+            res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
+        }
+        if (res < 0)
+            return res;
     }
 
     // main tile decode loop
+    bytesperpixel = s->bytesperpixel;
     memset(s->above_partition_ctx, 0, s->cols);
     memset(s->above_skip_ctx, 0, s->cols);
-    if (s->keyframe || s->intraonly)
+    if (s->keyframe || s->intraonly) {
         memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
-    else
+    } else {
         memset(s->above_mode_ctx, NEARESTMV, s->cols);
+    }
     memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
-    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
-    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
+    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
+    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
     memset(s->above_segpred_ctx, 0, s->cols);
-    for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
-        set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
-                        tile_row, s->tiling.log2_tile_rows, s->sb_rows);
-        for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
-            int64_t tile_size;
-
-            if (tile_col == s->tiling.tile_cols - 1 &&
-                tile_row == s->tiling.tile_rows - 1) {
-                tile_size = size;
-            } else {
-                tile_size = AV_RB32(data);
-                data     += 4;
-                size     -= 4;
-            }
-            if (tile_size > size)
-                return AVERROR_INVALIDDATA;
-            ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
-            if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) // marker bit
-                return AVERROR_INVALIDDATA;
-            data += tile_size;
-            size -= tile_size;
-        }
-
-        for (row = s->tiling.tile_row_start;
-             row < s->tiling.tile_row_end;
-             row += 8, yoff += s->cur_frame->linesize[0] * 64,
-             uvoff += s->cur_frame->linesize[1] * 32) {
-            VP9Filter *lflvl = s->lflvl;
-            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
-
-            for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
-                set_tile_offset(&s->tiling.tile_col_start,
-                                &s->tiling.tile_col_end,
-                                tile_col, s->tiling.log2_tile_cols, s->sb_cols);
-
-                memset(s->left_partition_ctx, 0, 8);
-                memset(s->left_skip_ctx, 0, 8);
-                if (s->keyframe || s->intraonly)
-                    memset(s->left_mode_ctx, DC_PRED, 16);
-                else
-                    memset(s->left_mode_ctx, NEARESTMV, 8);
-                memset(s->left_y_nnz_ctx, 0, 16);
-                memset(s->left_uv_nnz_ctx, 0, 16);
-                memset(s->left_segpred_ctx, 0, 8);
-
-                memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
-                for (col = s->tiling.tile_col_start;
-                     col < s->tiling.tile_col_end;
-                     col += 8, yoff2 += 64, uvoff2 += 32, lflvl++) {
-                    // FIXME integrate with lf code (i.e. zero after each
-                    // use, similar to invtxfm coefficients, or similar)
-                    memset(lflvl->mask, 0, sizeof(lflvl->mask));
-
-                    if ((ret = decode_subblock(avctx, row, col, lflvl,
-                                               yoff2, uvoff2, BL_64X64)) < 0)
-                        return ret;
-                }
-                memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
-            }
-
-            // backup pre-loopfilter reconstruction data for intra
-            // prediction of next row of sb64s
-            if (row + 8 < s->rows) {
-                memcpy(s->intra_pred_data[0],
-                       s->cur_frame->data[0] + yoff +
-                       63 * s->cur_frame->linesize[0],
-                       8 * s->cols);
-                memcpy(s->intra_pred_data[1],
-                       s->cur_frame->data[1] + uvoff +
-                       31 * s->cur_frame->linesize[1],
-                       4 * s->cols);
-                memcpy(s->intra_pred_data[2],
-                       s->cur_frame->data[2] + uvoff +
-                       31 * s->cur_frame->linesize[2],
-                       4 * s->cols);
-            }
-
-            // loopfilter one row
-            if (s->filter.level) {
-                yoff2  = yoff;
-                uvoff2 = uvoff;
-                lflvl  = s->lflvl;
-                for (col = 0; col < s->cols;
-                     col += 8, yoff2 += 64, uvoff2 += 32, lflvl++)
-                    loopfilter_subblock(avctx, lflvl, row, col, yoff2, uvoff2);
-            }
-        }
-    }
-
-    // bw adaptivity (or in case of parallel decoding mode, fw adaptivity
-    // probability maintenance between frames)
-    if (s->refreshctx) {
-        if (s->parallelmode) {
-            int j, k, l, m;
-            for (i = 0; i < 4; i++) {
-                for (j = 0; j < 2; j++)
-                    for (k = 0; k < 2; k++)
-                        for (l = 0; l < 6; l++)
-                            for (m = 0; m < 6; m++)
-                                memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
-                                       s->prob.coef[i][j][k][l][m], 3);
-                if (s->txfmmode == i)
-                    break;
-            }
-            s->prob_ctx[s->framectxid].p = s->prob.p;
-        } else {
-            ff_vp9_adapt_probs(s);
-        }
+    s->pass = s->frames[CUR_FRAME].uses_2pass =
+        ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
+    if ((res = update_block_buffers(ctx)) < 0) {
+        av_log(ctx, AV_LOG_ERROR,
+               "Failed to allocate block buffers\n");
+        return res;
     }
-    FFSWAP(VP9MVRefPair *, s->mv[0], s->mv[1]);
+    if (s->refreshctx && s->parallelmode) {
+        int j, k, l, m;
 
-    // ref frame setup
-    for (i = 0; i < 8; i++)
-        if (s->refreshrefmask & (1 << i)) {
-            av_frame_unref(s->refs[i]);
-            ret = av_frame_ref(s->refs[i], s->cur_frame);
-            if (ret < 0)
-                return ret;
+        for (i = 0; i < 4; i++) {
+            for (j = 0; j < 2; j++)
+                for (k = 0; k < 2; k++)
+                    for (l = 0; l < 6; l++)
+                        for (m = 0; m < 6; m++)
+                            memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
+                                   s->prob.coef[i][j][k][l][m], 3);
+            if (s->txfmmode == i)
+                break;
         }
+        s->prob_ctx[s->framectxid].p = s->prob.p;
+        ff_thread_finish_setup(ctx);
+    } else if (!s->refreshctx) {
+        ff_thread_finish_setup(ctx);
+    }
 
-    if (s->invisible)
-        av_frame_unref(s->cur_frame);
-    else
-        *got_frame = 1;
+    do {
+        yoff = uvoff = 0;
+        s->b = s->b_base;
+        s->block = s->block_base;
+        s->uvblock[0] = s->uvblock_base[0];
+        s->uvblock[1] = s->uvblock_base[1];
+        s->eob = s->eob_base;
+        s->uveob[0] = s->uveob_base[0];
+        s->uveob[1] = s->uveob_base[1];
 
-    return 0;
-}
+        for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
+            set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
+                            tile_row, s->tiling.log2_tile_rows, s->sb_rows);
+            if (s->pass != 2) {
+                for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
+                    int64_t tile_size;
 
-static int vp9_decode_packet(AVCodecContext *avctx, void *frame,
-                             int *got_frame, AVPacket *avpkt)
-{
-    const uint8_t *data = avpkt->data;
-    int size            = avpkt->size;
-    int marker, ret;
-
-    /* Read superframe index - this is a collection of individual frames
-     * that together lead to one visible frame */
-    marker = data[size - 1];
-    if ((marker & 0xe0) == 0xc0) {
-        int nbytes   = 1 + ((marker >> 3) & 0x3);
-        int n_frames = 1 + (marker & 0x7);
-        int idx_sz   = 2 + n_frames * nbytes;
-
-        if (size >= idx_sz && data[size - idx_sz] == marker) {
-            const uint8_t *idx = data + size + 1 - idx_sz;
-
-            while (n_frames--) {
-                unsigned sz = AV_RL32(idx);
-
-                if (nbytes < 4)
-                    sz &= (1 << (8 * nbytes)) - 1;
-                idx += nbytes;
-
-                if (sz > size) {
-                    av_log(avctx, AV_LOG_ERROR,
-                           "Superframe packet size too big: %u > %d\n",
-                           sz, size);
-                    return AVERROR_INVALIDDATA;
+                    if (tile_col == s->tiling.tile_cols - 1 &&
+                        tile_row == s->tiling.tile_rows - 1) {
+                        tile_size = size;
+                    } else {
+                        tile_size = AV_RB32(data);
+                        data += 4;
+                        size -= 4;
+                    }
+                    if (tile_size > size) {
+                        ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
+                        return AVERROR_INVALIDDATA;
+                    }
+                    ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
+                    if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
+                        ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
+                        return AVERROR_INVALIDDATA;
+                    }
+                    data += tile_size;
+                    size -= tile_size;
                 }
+            }
+
+            for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
+                 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
+                struct VP9Filter *lflvl_ptr = s->lflvl;
+                ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
+
+                for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
+                    set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
+                                    tile_col, s->tiling.log2_tile_cols, s->sb_cols);
+
+                    if (s->pass != 2) {
+                        memset(s->left_partition_ctx, 0, 8);
+                        memset(s->left_skip_ctx, 0, 8);
+                        if (s->keyframe || s->intraonly) {
+                            memset(s->left_mode_ctx, DC_PRED, 16);
+                        } else {
+                            memset(s->left_mode_ctx, NEARESTMV, 8);
+                        }
+                        memset(s->left_y_nnz_ctx, 0, 16);
+                        memset(s->left_uv_nnz_ctx, 0, 32);
+                        memset(s->left_segpred_ctx, 0, 8);
 
-                ret = vp9_decode_frame(avctx, frame, got_frame, data, sz);
-                if (ret < 0)
-                    return ret;
-                data += sz;
-                size -= sz;
+                        memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
+                    }
+
+                    for (col = s->tiling.tile_col_start;
+                         col < s->tiling.tile_col_end;
+                         col += 8, yoff2 += 64 * bytesperpixel,
+                         uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                        // FIXME integrate with lf code (i.e. zero after each
+                        // use, similar to invtxfm coefficients, or similar)
+                        if (s->pass != 1) {
+                            memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
+                        }
+
+                        if (s->pass == 2) {
+                            decode_sb_mem(ctx, row, col, lflvl_ptr,
+                                          yoff2, uvoff2, BL_64X64);
+                        } else {
+                            decode_sb(ctx, row, col, lflvl_ptr,
+                                      yoff2, uvoff2, BL_64X64);
+                        }
+                    }
+                    if (s->pass != 2) {
+                        memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
+                    }
+                }
+
+                if (s->pass == 1) {
+                    continue;
+                }
+
+                // backup pre-loopfilter reconstruction data for intra
+                // prediction of next row of sb64s
+                if (row + 8 < s->rows) {
+                    memcpy(s->intra_pred_data[0],
+                           f->data[0] + yoff + 63 * ls_y,
+                           8 * s->cols * bytesperpixel);
+                    memcpy(s->intra_pred_data[1],
+                           f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                           8 * s->cols * bytesperpixel >> s->ss_h);
+                    memcpy(s->intra_pred_data[2],
+                           f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                           8 * s->cols * bytesperpixel >> s->ss_h);
+                }
+
+                // loopfilter one row
+                if (s->filter.level) {
+                    yoff2 = yoff;
+                    uvoff2 = uvoff;
+                    lflvl_ptr = s->lflvl;
+                    for (col = 0; col < s->cols;
+                         col += 8, yoff2 += 64 * bytesperpixel,
+                         uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                        loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
+                    }
+                }
+
+                // FIXME maybe we can make this more finegrained by running the
+                // loopfilter per-block instead of after each sbrow
+                // In fact that would also make intra pred left preparation easier?
+                ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
             }
-            return size;
         }
+
+        if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
+            adapt_probs(s);
+            ff_thread_finish_setup(ctx);
+        }
+    } while (s->pass++ == 1);
+    ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
+
+    // ref frame setup
+    for (i = 0; i < 8; i++) {
+        if (s->refs[i].f->data[0])
+            ff_thread_release_buffer(ctx, &s->refs[i]);
+        if (s->next_refs[i].f->data[0] &&
+            (res = ff_thread_ref_frame(&s->refs[i], &s->next_refs[i])) < 0)
+            return res;
     }
 
-    /* If we get here, there was no valid superframe index, i.e. this is just
-     * one whole single frame. Decode it as such from the complete input buf. */
-    if ((ret = vp9_decode_frame(avctx, frame, got_frame, data, size)) < 0)
-        return ret;
-    return size;
+    if (!s->invisible) {
+        if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
+            return res;
+        *got_frame = 1;
+    }
+
+    return pkt->size;
 }
 
-static av_cold int vp9_decode_free(AVCodecContext *avctx)
+static void vp9_decode_flush(AVCodecContext *ctx)
 {
-    VP9Context *s = avctx->priv_data;
+    VP9Context *s = ctx->priv_data;
     int i;
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->refs); i++)
-        av_frame_free(&s->refs[i]);
+    for (i = 0; i < 3; i++)
+        vp9_unref_frame(ctx, &s->frames[i]);
+    for (i = 0; i < 8; i++)
+        ff_thread_release_buffer(ctx, &s->refs[i]);
+}
+
+static int init_frames(AVCodecContext *ctx)
+{
+    VP9Context *s = ctx->priv_data;
+    int i;
 
-    av_freep(&s->c_b);
-    av_freep(&s->above_partition_ctx);
+    for (i = 0; i < 3; i++) {
+        s->frames[i].tf.f = av_frame_alloc();
+        if (!s->frames[i].tf.f) {
+            vp9_decode_free(ctx);
+            av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
+            return AVERROR(ENOMEM);
+        }
+    }
+    for (i = 0; i < 8; i++) {
+        s->refs[i].f = av_frame_alloc();
+        s->next_refs[i].f = av_frame_alloc();
+        if (!s->refs[i].f || !s->next_refs[i].f) {
+            vp9_decode_free(ctx);
+            av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
+            return AVERROR(ENOMEM);
+        }
+    }
 
     return 0;
 }
 
-static av_cold int vp9_decode_init(AVCodecContext *avctx)
+static av_cold int vp9_decode_init(AVCodecContext *ctx)
 {
-    VP9Context *s = avctx->priv_data;
-    int i;
+    VP9Context *s = ctx->priv_data;
 
-    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+    ctx->internal->allocate_progress = 1;
+    s->last_bpp = 0;
+    s->filter.sharpness = -1;
 
-    ff_vp9dsp_init(&s->dsp);
-    ff_videodsp_init(&s->vdsp, 8);
+    return init_frames(ctx);
+}
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->refs); i++) {
-        s->refs[i] = av_frame_alloc();
-        if (!s->refs[i]) {
-            vp9_decode_free(avctx);
-            return AVERROR(ENOMEM);
+#if HAVE_THREADS
+static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
+{
+    return init_frames(avctx);
+}
+
+static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
+{
+    int i, res;
+    VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
+
+    // detect size changes in other threads
+    if (s->intra_pred_data[0] &&
+        (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols ||
+         s->rows != ssrc->rows || s->bpp != ssrc->bpp)) {
+        free_buffers(s);
+    }
+
+    for (i = 0; i < 3; i++) {
+        if (s->frames[i].tf.f->data[0])
+            vp9_unref_frame(dst, &s->frames[i]);
+        if (ssrc->frames[i].tf.f->data[0]) {
+            if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
+                return res;
+        }
+    }
+    for (i = 0; i < 8; i++) {
+        if (s->refs[i].f->data[0])
+            ff_thread_release_buffer(dst, &s->refs[i]);
+        if (ssrc->next_refs[i].f->data[0]) {
+            if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
+                return res;
         }
     }
 
-    s->filter.sharpness = -1;
+    s->invisible = ssrc->invisible;
+    s->keyframe = ssrc->keyframe;
+    s->intraonly = ssrc->intraonly;
+    s->ss_v = ssrc->ss_v;
+    s->ss_h = ssrc->ss_h;
+    s->segmentation.enabled = ssrc->segmentation.enabled;
+    s->segmentation.update_map = ssrc->segmentation.update_map;
+    s->segmentation.absolute_vals = ssrc->segmentation.absolute_vals;
+    s->bytesperpixel = ssrc->bytesperpixel;
+    s->bpp = ssrc->bpp;
+    s->bpp_index = ssrc->bpp_index;
+    memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
+    memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
+    memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
+           sizeof(s->segmentation.feat));
 
     return 0;
 }
+#endif
+
+static const AVProfile profiles[] = {
+    { FF_PROFILE_VP9_0, "Profile 0" },
+    { FF_PROFILE_VP9_1, "Profile 1" },
+    { FF_PROFILE_VP9_2, "Profile 2" },
+    { FF_PROFILE_VP9_3, "Profile 3" },
+    { FF_PROFILE_UNKNOWN },
+};
 
 AVCodec ff_vp9_decoder = {
-    .name           = "vp9",
-    .long_name      = NULL_IF_CONFIG_SMALL("Google VP9"),
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_VP9,
-    .priv_data_size = sizeof(VP9Context),
-    .init           = vp9_decode_init,
-    .decode         = vp9_decode_packet,
-    .flush          = vp9_decode_flush,
-    .close          = vp9_decode_free,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .name                  = "vp9",
+    .long_name             = NULL_IF_CONFIG_SMALL("Google VP9"),
+    .type                  = AVMEDIA_TYPE_VIDEO,
+    .id                    = AV_CODEC_ID_VP9,
+    .priv_data_size        = sizeof(VP9Context),
+    .init                  = vp9_decode_init,
+    .close                 = vp9_decode_free,
+    .decode                = vp9_decode_frame,
+    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .flush                 = vp9_decode_flush,
+    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
+    .profiles              = NULL_IF_CONFIG_SMALL(profiles),
 };
diff --git a/libavcodec/vp9.h b/libavcodec/vp9.h
index c6f395e93a..9a29416e8d 100644
--- a/libavcodec/vp9.h
+++ b/libavcodec/vp9.h
@@ -4,34 +4,26 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_VP9_H
 #define AVCODEC_VP9_H
 
-#include <stddef.h>
-#include <stdint.h>
-
-#include "libavutil/internal.h"
-
-#include "avcodec.h"
-#include "vp56.h"
-
 enum TxfmMode {
     TX_4X4,
     TX_8X8,
@@ -77,346 +69,4 @@ enum FilterMode {
     FILTER_SWITCHABLE,
 };
 
-enum BlockPartition {
-    PARTITION_NONE,    // [ ] <-.
-    PARTITION_H,       // [-]   |
-    PARTITION_V,       // [|]   |
-    PARTITION_SPLIT,   // [+] --'
-};
-
-enum InterPredMode {
-    NEARESTMV = 10,
-    NEARMV    = 11,
-    ZEROMV    = 12,
-    NEWMV     = 13,
-};
-
-enum MVJoint {
-    MV_JOINT_ZERO,
-    MV_JOINT_H,
-    MV_JOINT_V,
-    MV_JOINT_HV,
-};
-
-typedef struct ProbContext {
-    uint8_t y_mode[4][9];
-    uint8_t uv_mode[10][9];
-    uint8_t filter[4][2];
-    uint8_t mv_mode[7][3];
-    uint8_t intra[4];
-    uint8_t comp[5];
-    uint8_t single_ref[5][2];
-    uint8_t comp_ref[5];
-    uint8_t tx32p[2][3];
-    uint8_t tx16p[2][2];
-    uint8_t tx8p[2];
-    uint8_t skip[3];
-    uint8_t mv_joint[3];
-    struct {
-        uint8_t sign;
-        uint8_t classes[10];
-        uint8_t class0;
-        uint8_t bits[10];
-        uint8_t class0_fp[2][3];
-        uint8_t fp[3];
-        uint8_t class0_hp;
-        uint8_t hp;
-    } mv_comp[2];
-    uint8_t partition[4][4][3];
-} ProbContext;
-
-typedef void (*vp9_mc_func)(uint8_t *dst, const uint8_t *ref,
-                            ptrdiff_t dst_stride,
-                            ptrdiff_t ref_stride,
-                            int h, int mx, int my);
-
-typedef struct VP9DSPContext {
-    /*
-     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32
-     * dimension 2: intra prediction modes
-     *
-     * dst/left/top is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
-     * stride is aligned by 16 pixels
-     * top[-1] is top/left; top[4,7] is top-right for 4x4
-     */
-    // FIXME(rbultje) maybe replace left/top pointers with HAVE_TOP/
-    // HAVE_LEFT/HAVE_TOPRIGHT flags instead, and then handle it in-place?
-    // also needs to fit in with what h264/vp8/etc do
-    void (*intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst,
-                                                         ptrdiff_t stride,
-                                                         const uint8_t *left,
-                                                         const uint8_t *top);
-
-    /*
-     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32, 4=lossless (3-4=dct only)
-     * dimension 2: 0=dct/dct, 1=dct/adst, 2=adst/dct, 3=adst/adst
-     *
-     * dst is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
-     * stride is aligned by 16 pixels
-     * block is 16-byte aligned
-     * eob indicates the position (+1) of the last non-zero coefficient,
-     * in scan-order. This can be used to write faster versions, e.g. a
-     * dc-only 4x4/8x8/16x16/32x32, or a 4x4-only (eob<10) 8x8/16x16/32x32,
-     * etc.
-     */
-    // FIXME also write idct_add_block() versions for whole (inter) pred
-    // blocks, so we can do 2 4x4s at once
-    void (*itxfm_add[N_TXFM_SIZES + 1][N_TXFM_TYPES])(uint8_t *dst,
-                                                      ptrdiff_t stride,
-                                                      int16_t *block, int eob);
-
-    /*
-     * dimension 1: width of filter (0=4, 1=8, 2=16)
-     * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)
-     *
-     * dst/stride are aligned by 8
-     */
-    void (*loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride,
-                                int mb_lim, int lim, int hev_thr);
-
-    /*
-     * dimension 1: 0=col-edge filter (h), 1=row-edge filter (v)
-     *
-     * The width of filter is assumed to be 16; dst/stride are aligned by 16
-     */
-    void (*loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride,
-                              int mb_lim, int lim, int hev_thr);
-
-    /*
-     * dimension 1/2: width of filter (0=4, 1=8) for each filter half
-     * dimension 3: 0=col-edge filter (h), 1=row-edge filter (v)
-     *
-     * dst/stride are aligned by operation size
-     * this basically calls loop_filter[d1][d3][0](), followed by
-     * loop_filter[d2][d3][0]() on the next 8 pixels
-     * mb_lim/lim/hev_thr contain two values in the lowest two bytes of the
-     * integer.
-     */
-    // FIXME perhaps a mix4 that operates on 32px (for AVX2)
-    void (*loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride,
-                                      int mb_lim, int lim, int hev_thr);
-
-    /*
-     * dimension 1: hsize (0: 64, 1: 32, 2: 16, 3: 8, 4: 4)
-     * dimension 2: filter type (0: smooth, 1: regular, 2: sharp, 3: bilin)
-     * dimension 3: averaging type (0: put, 1: avg)
-     * dimension 4: x subpel interpolation (0: none, 1: 8tap/bilin)
-     * dimension 5: y subpel interpolation (1: none, 1: 8tap/bilin)
-     *
-     * dst/stride are aligned by hsize
-     */
-    vp9_mc_func mc[5][4][2][2][2];
-} VP9DSPContext;
-
-enum CompPredMode {
-    PRED_SINGLEREF,
-    PRED_COMPREF,
-    PRED_SWITCHABLE,
-};
-
-typedef struct VP9MVRefPair {
-    VP56mv mv[2];
-    int8_t ref[2];
-} VP9MVRefPair;
-
-typedef struct VP9Filter {
-    uint8_t level[8 * 8];
-    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
-                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
-} VP9Filter;
-
-enum BlockLevel {
-    BL_64X64,
-    BL_32X32,
-    BL_16X16,
-    BL_8X8,
-};
-
-enum BlockSize {
-    BS_64x64,
-    BS_64x32,
-    BS_32x64,
-    BS_32x32,
-    BS_32x16,
-    BS_16x32,
-    BS_16x16,
-    BS_16x8,
-    BS_8x16,
-    BS_8x8,
-    BS_8x4,
-    BS_4x8,
-    BS_4x4,
-    N_BS_SIZES,
-};
-
-typedef struct VP9Block {
-    uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
-    enum FilterMode filter;
-    VP56mv mv[4 /* b_idx */][2 /* ref */];
-    enum BlockSize bs;
-    enum TxfmMode tx, uvtx;
-
-    int row, row7, col, col7;
-    uint8_t *dst[3];
-    ptrdiff_t y_stride, uv_stride;
-} VP9Block;
-
-typedef struct VP9Context {
-    VP9DSPContext dsp;
-    VideoDSPContext vdsp;
-    GetBitContext gb;
-    VP56RangeCoder c;
-    VP56RangeCoder *c_b;
-    unsigned c_b_size;
-    VP9Block b;
-
-    // bitstream header
-    uint8_t profile;
-    uint8_t keyframe, last_keyframe;
-    uint8_t invisible;
-    uint8_t use_last_frame_mvs;
-    uint8_t errorres;
-    uint8_t colorspace;
-    uint8_t sub_x;
-    uint8_t sub_y;
-    uint8_t fullrange;
-    uint8_t intraonly;
-    uint8_t resetctx;
-    uint8_t refreshrefmask;
-    uint8_t highprecisionmvs;
-    enum FilterMode filtermode;
-    uint8_t allowcompinter;
-    uint8_t fixcompref;
-    uint8_t refreshctx;
-    uint8_t parallelmode;
-    uint8_t framectxid;
-    uint8_t refidx[3];
-    uint8_t signbias[3];
-    uint8_t varcompref[2];
-    AVFrame *refs[8];
-    AVFrame *cur_frame;
-
-    struct {
-        uint8_t level;
-        int8_t sharpness;
-        uint8_t lim_lut[64];
-        uint8_t mblim_lut[64];
-    } filter;
-    struct {
-        uint8_t enabled;
-        int8_t mode[2];
-        int8_t ref[4];
-    } lf_delta;
-    uint8_t yac_qi;
-    int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
-    uint8_t lossless;
-    struct {
-        uint8_t enabled;
-        uint8_t temporal;
-        uint8_t absolute_vals;
-        uint8_t update_map;
-        #define MAX_SEGMENT 8
-        struct {
-            uint8_t q_enabled;
-            uint8_t lf_enabled;
-            uint8_t ref_enabled;
-            uint8_t skip_enabled;
-            uint8_t ref_val;
-            int16_t q_val;
-            int8_t lf_val;
-            int16_t qmul[2][2];
-            uint8_t lflvl[4][2];
-        } feat[MAX_SEGMENT];
-    } segmentation;
-    struct {
-        unsigned log2_tile_cols, log2_tile_rows;
-        unsigned tile_cols, tile_rows;
-        unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
-    } tiling;
-    unsigned sb_cols, sb_rows, rows, cols;
-    struct {
-        ProbContext p;
-        uint8_t coef[4][2][2][6][6][3];
-    } prob_ctx[4];
-    struct {
-        ProbContext p;
-        uint8_t coef[4][2][2][6][6][11];
-        uint8_t seg[7];
-        uint8_t segpred[3];
-    } prob;
-    struct {
-        unsigned y_mode[4][10];
-        unsigned uv_mode[10][10];
-        unsigned filter[4][3];
-        unsigned mv_mode[7][4];
-        unsigned intra[4][2];
-        unsigned comp[5][2];
-        unsigned single_ref[5][2][2];
-        unsigned comp_ref[5][2];
-        unsigned tx32p[2][4];
-        unsigned tx16p[2][3];
-        unsigned tx8p[2][2];
-        unsigned skip[3][2];
-        unsigned mv_joint[4];
-        struct {
-            unsigned sign[2];
-            unsigned classes[11];
-            unsigned class0[2];
-            unsigned bits[10][2];
-            unsigned class0_fp[2][4];
-            unsigned fp[4];
-            unsigned class0_hp[2];
-            unsigned hp[2];
-        } mv_comp[2];
-        unsigned partition[4][4][4];
-        unsigned coef[4][2][2][6][6][3];
-        unsigned eob[4][2][2][6][6][2];
-    } counts;
-    enum TxfmMode txfmmode;
-    enum CompPredMode comppredmode;
-
-    // contextual (left/above) cache
-    uint8_t left_partition_ctx[8], *above_partition_ctx;
-    uint8_t left_mode_ctx[16], *above_mode_ctx;
-    // FIXME maybe merge some of the below in a flags field?
-    uint8_t left_y_nnz_ctx[16], *above_y_nnz_ctx;
-    uint8_t left_uv_nnz_ctx[2][8], *above_uv_nnz_ctx[2];
-    uint8_t left_skip_ctx[8], *above_skip_ctx; // 1bit
-    uint8_t left_txfm_ctx[8], *above_txfm_ctx; // 2bit
-    uint8_t left_segpred_ctx[8], *above_segpred_ctx; // 1bit
-    uint8_t left_intra_ctx[8], *above_intra_ctx; // 1bit
-    uint8_t left_comp_ctx[8], *above_comp_ctx; // 1bit
-    uint8_t left_ref_ctx[8], *above_ref_ctx; // 2bit
-    uint8_t left_filter_ctx[8], *above_filter_ctx;
-    VP56mv left_mv_ctx[16][2], (*above_mv_ctx)[2];
-
-    // whole-frame cache
-    uint8_t *intra_pred_data[3];
-    uint8_t *segmentation_map;
-    VP9MVRefPair *mv[2];
-    VP9Filter *lflvl;
-    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71 * 80];
-
-    // block reconstruction intermediates
-    DECLARE_ALIGNED(32, int16_t, block)[4096];
-    DECLARE_ALIGNED(32, int16_t, uvblock)[2][1024];
-    uint8_t eob[256];
-    uint8_t uveob[2][64];
-    VP56mv min_mv, max_mv;
-    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64];
-    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32 * 32];
-} VP9Context;
-
-void ff_vp9dsp_init(VP9DSPContext *dsp);
-
-void ff_vp9dsp_init_x86(VP9DSPContext *dsp);
-
-void ff_vp9_fill_mv(VP9Context *s, VP56mv *mv, int mode, int sb);
-
-void ff_vp9_adapt_probs(VP9Context *s);
-
-int ff_vp9_decode_block(AVCodecContext *avctx, int row, int col,
-                        VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
-                        enum BlockLevel bl, enum BlockPartition bp);
-
 #endif /* AVCODEC_VP9_H */
diff --git a/libavcodec/vp9_mc_template.c b/libavcodec/vp9_mc_template.c
new file mode 100644
index 0000000000..f4eb4e56ac
--- /dev/null
+++ b/libavcodec/vp9_mc_template.c
@@ -0,0 +1,435 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define ROUNDED_DIV_MVx2(a, b) \
+    (VP56mv) { .x = ROUNDED_DIV(a.x + b.x, 2), .y = ROUNDED_DIV(a.y + b.y, 2) }
+#define ROUNDED_DIV_MVx4(a, b, c, d) \
+    (VP56mv) { .x = ROUNDED_DIV(a.x + b.x + c.x + d.x, 4), \
+               .y = ROUNDED_DIV(a.y + b.y + c.y + d.y, 4) }
+
+static void FN(inter_pred)(AVCodecContext *ctx)
+{
+    static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
+        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
+        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
+    };
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    int row = s->row, col = s->col;
+    ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
+    AVFrame *ref1 = tref1->f, *ref2;
+    int w1 = ref1->width, h1 = ref1->height, w2, h2;
+    ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
+    int bytesperpixel = BYTES_PER_PIXEL;
+
+    if (b->comp) {
+        tref2 = &s->refs[s->refidx[b->ref[1]]];
+        ref2 = tref2->f;
+        w2 = ref2->width;
+        h2 = ref2->height;
+    }
+
+    // y inter pred
+    if (b->bs > BS_8x8) {
+        VP56mv uvmv;
+
+#if SCALED == 0
+        if (b->bs == BS_8x4) {
+            mc_luma_dir(s, mc[3][b->filter][0], s->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0],,,,, 8, 4, w1, h1, 0);
+            mc_luma_dir(s, mc[3][b->filter][0],
+                        s->dst[0] + 4 * ls_y, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, col << 3, &b->mv[2][0],,,,, 8, 4, w1, h1, 0);
+            w1 = (w1 + s->ss_h) >> s->ss_h;
+            if (s->ss_v) {
+                h1 = (h1 + 1) >> 1;
+                uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][0],
+                              s->dst[1], s->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << 2, col << (3 - s->ss_h),
+                              &uvmv,,,,, 8 >> s->ss_h, 4, w1, h1, 0);
+            } else {
+                mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][0],
+                              s->dst[1], s->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << 3, col << (3 - s->ss_h),
+                              &b->mv[0][0],,,,, 8 >> s->ss_h, 4, w1, h1, 0);
+                // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
+                // to get the motion vector for the bottom 4x4 block
+                // https://code.google.com/p/webm/issues/detail?id=993
+                if (s->ss_h == 0) {
+                    uvmv = b->mv[2][0];
+                } else {
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                }
+                mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][0],
+                              s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              (row << 3) + 4, col << (3 - s->ss_h),
+                              &uvmv,,,,, 8 >> s->ss_h, 4, w1, h1, 0);
+            }
+
+            if (b->comp) {
+                mc_luma_dir(s, mc[3][b->filter][1], s->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1],,,,, 8, 4, w2, h2, 1);
+                mc_luma_dir(s, mc[3][b->filter][1],
+                            s->dst[0] + 4 * ls_y, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, col << 3, &b->mv[2][1],,,,, 8, 4, w2, h2, 1);
+                w2 = (w2 + s->ss_h) >> s->ss_h;
+                if (s->ss_v) {
+                    h2 = (h2 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                    mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][1],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << 2, col << (3 - s->ss_h),
+                                  &uvmv,,,,, 8 >> s->ss_h, 4, w2, h2, 1);
+                } else {
+                    mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][1],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << 3, col << (3 - s->ss_h),
+                                  &b->mv[0][1],,,,, 8 >> s->ss_h, 4, w2, h2, 1);
+                    // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
+                    // to get the motion vector for the bottom 4x4 block
+                    // https://code.google.com/p/webm/issues/detail?id=993
+                    if (s->ss_h == 0) {
+                        uvmv = b->mv[2][1];
+                    } else {
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                    }
+                    mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][1],
+                                  s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  (row << 3) + 4, col << (3 - s->ss_h),
+                                  &uvmv,,,,, 8 >> s->ss_h, 4, w2, h2, 1);
+                }
+            }
+        } else if (b->bs == BS_4x8) {
+            mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0],,,,, 4, 8, w1, h1, 0);
+            mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4 * bytesperpixel, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, (col << 3) + 4, &b->mv[1][0],,,,, 4, 8, w1, h1, 0);
+            h1 = (h1 + s->ss_v) >> s->ss_v;
+            if (s->ss_h) {
+                w1 = (w1 + 1) >> 1;
+                uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[1][0]);
+                mc_chroma_dir(s, mc[4][b->filter][0],
+                              s->dst[1], s->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), col << 2,
+                              &uvmv,,,,, 4, 8 >> s->ss_v, w1, h1, 0);
+            } else {
+                mc_chroma_dir(s, mc[4][b->filter][0],
+                              s->dst[1], s->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), col << 3,
+                              &b->mv[0][0],,,,, 4, 8 >> s->ss_v, w1, h1, 0);
+                mc_chroma_dir(s, mc[4][b->filter][0],
+                              s->dst[1] + 4 * bytesperpixel,
+                              s->dst[2] + 4 * bytesperpixel, ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), (col << 3) + 4,
+                              &b->mv[1][0],,,,, 4, 8 >> s->ss_v, w1, h1, 0);
+            }
+
+            if (b->comp) {
+                mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1],,,,, 4, 8, w2, h2, 1);
+                mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4 * bytesperpixel, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, (col << 3) + 4, &b->mv[1][1],,,,, 4, 8, w2, h2, 1);
+                h2 = (h2 + s->ss_v) >> s->ss_v;
+                if (s->ss_h) {
+                    w2 = (w2 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[1][1]);
+                    mc_chroma_dir(s, mc[4][b->filter][1],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), col << 2,
+                                  &uvmv,,,,, 4, 8 >> s->ss_v, w2, h2, 1);
+                } else {
+                    mc_chroma_dir(s, mc[4][b->filter][1],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), col << 3,
+                                  &b->mv[0][1],,,,, 4, 8 >> s->ss_v, w2, h2, 1);
+                    mc_chroma_dir(s, mc[4][b->filter][1],
+                                  s->dst[1] + 4 * bytesperpixel,
+                                  s->dst[2] + 4 * bytesperpixel, ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), (col << 3) + 4,
+                                  &b->mv[1][1],,,,, 4, 8 >> s->ss_v, w2, h2, 1);
+                }
+            }
+        } else
+#endif
+        {
+            av_assert2(b->bs == BS_4x4);
+
+            // FIXME if two horizontally adjacent blocks have the same MV,
+            // do a w8 instead of a w4 call
+            mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0],
+                        0, 0, 8, 8, 4, 4, w1, h1, 0);
+            mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4 * bytesperpixel, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, (col << 3) + 4, &b->mv[1][0],
+                        4, 0, 8, 8, 4, 4, w1, h1, 0);
+            mc_luma_dir(s, mc[4][b->filter][0],
+                        s->dst[0] + 4 * ls_y, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, col << 3, &b->mv[2][0],
+                        0, 4, 8, 8, 4, 4, w1, h1, 0);
+            mc_luma_dir(s, mc[4][b->filter][0],
+                        s->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0],
+                        4, 4, 8, 8, 4, 4, w1, h1, 0);
+            if (s->ss_v) {
+                h1 = (h1 + 1) >> 1;
+                if (s->ss_h) {
+                    w1 = (w1 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx4(b->mv[0][0], b->mv[1][0],
+                                            b->mv[2][0], b->mv[3][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, col << 2,
+                                  &uvmv, 0, 0, 4, 4, 4, 4, w1, h1, 0);
+                } else {
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, col << 3,
+                                  &uvmv, 0, 0, 8, 4, 4, 4, w1, h1, 0);
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[3][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * bytesperpixel,
+                                  s->dst[2] + 4 * bytesperpixel, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, (col << 3) + 4,
+                                  &uvmv, 4, 0, 8, 4, 4, 4, w1, h1, 0);
+                }
+            } else {
+                if (s->ss_h) {
+                    w1 = (w1 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[1][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, col << 2,
+                                  &uvmv, 0, 0, 4, 8, 4, 4, w1, h1, 0);
+                    // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
+                    // bottom block
+                    // https://code.google.com/p/webm/issues/detail?id=993
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[2][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, col << 2,
+                                  &uvmv, 0, 4, 4, 8, 4, 4, w1, h1, 0);
+                } else {
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, col << 3,
+                                  &b->mv[0][0], 0, 0, 8, 8, 4, 4, w1, h1, 0);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * bytesperpixel,
+                                  s->dst[2] + 4 * bytesperpixel, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, (col << 3) + 4,
+                                  &b->mv[1][0], 4, 0, 8, 8, 4, 4, w1, h1, 0);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, col << 3,
+                                  &b->mv[2][0], 0, 4, 8, 8, 4, 4, w1, h1, 0);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * ls_uv + 4 * bytesperpixel,
+                                  s->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, (col << 3) + 4,
+                                  &b->mv[3][0], 4, 4, 8, 8, 4, 4, w1, h1, 0);
+                }
+            }
+
+            if (b->comp) {
+                mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1], 0, 0, 8, 8, 4, 4, w2, h2, 1);
+                mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4 * bytesperpixel, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 0, 8, 8, 4, 4, w2, h2, 1);
+                mc_luma_dir(s, mc[4][b->filter][1],
+                            s->dst[0] + 4 * ls_y, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, col << 3, &b->mv[2][1], 0, 4, 8, 8, 4, 4, w2, h2, 1);
+                mc_luma_dir(s, mc[4][b->filter][1],
+                            s->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, 8, 8, 4, 4, w2, h2, 1);
+                if (s->ss_v) {
+                    h2 = (h2 + 1) >> 1;
+                    if (s->ss_h) {
+                        w2 = (w2 + 1) >> 1;
+                        uvmv = ROUNDED_DIV_MVx4(b->mv[0][1], b->mv[1][1],
+                                                b->mv[2][1], b->mv[3][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1], s->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, col << 2,
+                                      &uvmv, 0, 0, 4, 4, 4, 4, w2, h2, 1);
+                    } else {
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1], s->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, col << 3,
+                                      &uvmv, 0, 0, 8, 4, 4, 4, w2, h2, 1);
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[3][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * bytesperpixel,
+                                      s->dst[2] + 4 * bytesperpixel, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, (col << 3) + 4,
+                                      &uvmv, 4, 0, 8, 4, 4, 4, w2, h2, 1);
+                    }
+                } else {
+                    if (s->ss_h) {
+                        w2 = (w2 + 1) >> 1;
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[1][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1], s->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, col << 2,
+                                      &uvmv, 0, 0, 4, 8, 4, 4, w2, h2, 1);
+                        // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
+                        // bottom block
+                        // https://code.google.com/p/webm/issues/detail?id=993
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[2][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, col << 2,
+                                      &uvmv, 0, 4, 4, 8, 4, 4, w2, h2, 1);
+                    } else {
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1], s->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, col << 3,
+                                      &b->mv[0][1], 0, 0, 8, 8, 4, 4, w2, h2, 1);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * bytesperpixel,
+                                      s->dst[2] + 4 * bytesperpixel, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, (col << 3) + 4,
+                                      &b->mv[1][1], 4, 0, 8, 8, 4, 4, w2, h2, 1);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, col << 3,
+                                      &b->mv[2][1], 0, 4, 8, 8, 4, 4, w2, h2, 1);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * ls_uv + 4 * bytesperpixel,
+                                      s->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, (col << 3) + 4,
+                                      &b->mv[3][1], 4, 4, 8, 8, 4, 4, w2, h2, 1);
+                    }
+                }
+            }
+        }
+    } else {
+        int bwl = bwlog_tab[0][b->bs];
+        int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
+        int uvbw = bwh_tab[s->ss_h][b->bs][0] * 4, uvbh = bwh_tab[s->ss_v][b->bs][1] * 4;
+
+        mc_luma_dir(s, mc[bwl][b->filter][0], s->dst[0], ls_y,
+                    ref1->data[0], ref1->linesize[0], tref1,
+                    row << 3, col << 3, &b->mv[0][0], 0, 0, bw, bh, bw, bh, w1, h1, 0);
+        w1 = (w1 + s->ss_h) >> s->ss_h;
+        h1 = (h1 + s->ss_v) >> s->ss_v;
+        mc_chroma_dir(s, mc[bwl + s->ss_h][b->filter][0],
+                      s->dst[1], s->dst[2], ls_uv,
+                      ref1->data[1], ref1->linesize[1],
+                      ref1->data[2], ref1->linesize[2], tref1,
+                      row << (3 - s->ss_v), col << (3 - s->ss_h),
+                      &b->mv[0][0], 0, 0, uvbw, uvbh, uvbw, uvbh, w1, h1, 0);
+
+        if (b->comp) {
+            mc_luma_dir(s, mc[bwl][b->filter][1], s->dst[0], ls_y,
+                        ref2->data[0], ref2->linesize[0], tref2,
+                        row << 3, col << 3, &b->mv[0][1], 0, 0, bw, bh, bw, bh, w2, h2, 1);
+            w2 = (w2 + s->ss_h) >> s->ss_h;
+            h2 = (h2 + s->ss_v) >> s->ss_v;
+            mc_chroma_dir(s, mc[bwl + s->ss_h][b->filter][1],
+                          s->dst[1], s->dst[2], ls_uv,
+                          ref2->data[1], ref2->linesize[1],
+                          ref2->data[2], ref2->linesize[2], tref2,
+                          row << (3 - s->ss_v), col << (3 - s->ss_h),
+                          &b->mv[0][1], 0, 0, uvbw, uvbh, uvbw, uvbh, w2, h2, 1);
+        }
+    }
+}
diff --git a/libavcodec/vp9_parser.c b/libavcodec/vp9_parser.c
new file mode 100644
index 0000000000..f1f7e350d2
--- /dev/null
+++ b/libavcodec/vp9_parser.c
@@ -0,0 +1,156 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "libavcodec/get_bits.h"
+#include "parser.h"
+
+typedef struct VP9ParseContext {
+    int n_frames; // 1-8
+    int size[8];
+    int64_t pts;
+} VP9ParseContext;
+
+static int parse_frame(AVCodecParserContext *ctx, const uint8_t *buf, int size)
+{
+    VP9ParseContext *s = ctx->priv_data;
+    GetBitContext gb;
+    int res, profile, keyframe, invisible;
+
+    if ((res = init_get_bits8(&gb, buf, size)) < 0)
+        return res;
+    get_bits(&gb, 2); // frame marker
+    profile  = get_bits1(&gb);
+    profile |= get_bits1(&gb) << 1;
+    if (profile == 3) profile += get_bits1(&gb);
+
+    if (get_bits1(&gb)) {
+        keyframe = 0;
+        invisible = 0;
+    } else {
+        keyframe  = !get_bits1(&gb);
+        invisible = !get_bits1(&gb);
+    }
+
+    if (!keyframe) {
+        ctx->pict_type = AV_PICTURE_TYPE_P;
+        ctx->key_frame = 0;
+    } else {
+        ctx->pict_type = AV_PICTURE_TYPE_I;
+        ctx->key_frame = 1;
+    }
+
+    if (!invisible) {
+        if (ctx->pts == AV_NOPTS_VALUE)
+            ctx->pts = s->pts;
+        s->pts = AV_NOPTS_VALUE;
+    } else {
+        s->pts = ctx->pts;
+        ctx->pts = AV_NOPTS_VALUE;
+    }
+
+    return 0;
+}
+
+static int parse(AVCodecParserContext *ctx,
+                 AVCodecContext *avctx,
+                 const uint8_t **out_data, int *out_size,
+                 const uint8_t *data, int size)
+{
+    VP9ParseContext *s = ctx->priv_data;
+    int full_size = size;
+    int marker;
+
+    if (size <= 0) {
+        *out_size = 0;
+        *out_data = data;
+
+        return 0;
+    }
+
+    if (s->n_frames > 0) {
+        *out_data = data;
+        *out_size = s->size[--s->n_frames];
+        parse_frame(ctx, *out_data, *out_size);
+
+        return s->n_frames > 0 ? *out_size : size /* i.e. include idx tail */;
+    }
+
+    marker = data[size - 1];
+    if ((marker & 0xe0) == 0xc0) {
+        int nbytes = 1 + ((marker >> 3) & 0x3);
+        int n_frames = 1 + (marker & 0x7), idx_sz = 2 + n_frames * nbytes;
+
+        if (size >= idx_sz && data[size - idx_sz] == marker) {
+            const uint8_t *idx = data + size + 1 - idx_sz;
+            int first = 1;
+
+            switch (nbytes) {
+#define case_n(a, rd) \
+            case a: \
+                while (n_frames--) { \
+                    unsigned sz = rd; \
+                    idx += a; \
+                    if (sz > size) { \
+                        s->n_frames = 0; \
+                        *out_size = size; \
+                        *out_data = data; \
+                        av_log(avctx, AV_LOG_ERROR, \
+                               "Superframe packet size too big: %u > %d\n", \
+                               sz, size); \
+                        return full_size; \
+                    } \
+                    if (first) { \
+                        first = 0; \
+                        *out_data = data; \
+                        *out_size = sz; \
+                        s->n_frames = n_frames; \
+                    } else { \
+                        s->size[n_frames] = sz; \
+                    } \
+                    data += sz; \
+                    size -= sz; \
+                } \
+                parse_frame(ctx, *out_data, *out_size); \
+                return *out_size
+
+                case_n(1, *idx);
+                case_n(2, AV_RL16(idx));
+                case_n(3, AV_RL24(idx));
+                case_n(4, AV_RL32(idx));
+            }
+        }
+    }
+
+    *out_data = data;
+    *out_size = size;
+    parse_frame(ctx, data, size);
+
+    return size;
+}
+
+AVCodecParser ff_vp9_parser = {
+    .codec_ids      = { AV_CODEC_ID_VP9 },
+    .priv_data_size = sizeof(VP9ParseContext),
+    .parser_parse   = parse,
+};
diff --git a/libavcodec/vp9block.c b/libavcodec/vp9block.c
deleted file mode 100644
index a92c794ebf..0000000000
--- a/libavcodec/vp9block.c
+++ /dev/null
@@ -1,1685 +0,0 @@
-/*
- * VP9 compatible video decoder
- *
- * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
- * Copyright (C) 2013 Clément Bœsch <u pkh me>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/avassert.h"
-
-#include "avcodec.h"
-#include "get_bits.h"
-#include "internal.h"
-#include "videodsp.h"
-#include "vp56.h"
-#include "vp9.h"
-#include "vp9data.h"
-
-static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
-    {
-        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
-        {  4,  4 }, {  4, 2 }, { 2,  4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
-    },  {
-        {  8,  8 }, {  8, 4 }, { 4,  8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
-        {  2,  2 }, {  2, 1 }, { 1,  2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
-    }
-};
-
-// differential forward probability updates
-static void decode_mode(VP9Context *s, VP9Block *const b)
-{
-    static const uint8_t left_ctx[N_BS_SIZES] = {
-        0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
-    };
-    static const uint8_t above_ctx[N_BS_SIZES] = {
-        0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
-    };
-    static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
-        TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
-        TX_16X16, TX_8X8,   TX_8X8,   TX_8X8,   TX_4X4,   TX_4X4,  TX_4X4
-    };
-    int row = b->row, col = b->col, row7 = b->row7;
-    enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
-    int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
-    int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]);
-    int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
-    int y;
-
-    if (!s->segmentation.enabled) {
-        b->seg_id = 0;
-    } else if (s->keyframe || s->intraonly) {
-        b->seg_id = s->segmentation.update_map ?
-                    vp8_rac_get_tree(&s->c, ff_vp9_segmentation_tree, s->prob.seg) : 0;
-    } else if (!s->segmentation.update_map ||
-               (s->segmentation.temporal &&
-                vp56_rac_get_prob_branchy(&s->c,
-                                          s->prob.segpred[s->above_segpred_ctx[col] +
-                                                          s->left_segpred_ctx[row7]]))) {
-        int pred = MAX_SEGMENT - 1;
-        int x;
-
-        for (y = 0; y < h4; y++)
-            for (x = 0; x < w4; x++)
-                pred = FFMIN(pred,
-                             s->segmentation_map[(y + row) * 8 * s->sb_cols + x + col]);
-        b->seg_id = pred;
-
-        memset(&s->above_segpred_ctx[col], 1, w4);
-        memset(&s->left_segpred_ctx[row7], 1, h4);
-    } else {
-        b->seg_id = vp8_rac_get_tree(&s->c, ff_vp9_segmentation_tree,
-                                     s->prob.seg);
-
-        memset(&s->above_segpred_ctx[col], 0, w4);
-        memset(&s->left_segpred_ctx[row7], 0, h4);
-    }
-    if ((s->segmentation.enabled && s->segmentation.update_map) || s->keyframe) {
-        for (y = 0; y < h4; y++)
-            memset(&s->segmentation_map[(y + row) * 8 * s->sb_cols + col],
-                   b->seg_id, w4);
-    }
-
-    b->skip = s->segmentation.enabled &&
-              s->segmentation.feat[b->seg_id].skip_enabled;
-    if (!b->skip) {
-        int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
-        b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
-        s->counts.skip[c][b->skip]++;
-    }
-
-    if (s->keyframe || s->intraonly) {
-        b->intra = 1;
-    } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
-        b->intra = !s->segmentation.feat[b->seg_id].ref_val;
-    } else {
-        int c, bit;
-
-        if (have_a && have_l) {
-            c  = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
-            c += (c == 2);
-        } else {
-            c = have_a ? 2 * s->above_intra_ctx[col] :
-                have_l ? 2 * s->left_intra_ctx[row7] : 0;
-        }
-        bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
-        s->counts.intra[c][bit]++;
-        b->intra = !bit;
-    }
-
-    if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
-        int c;
-        if (have_a) {
-            if (have_l) {
-                c = (s->above_skip_ctx[col] ? max_tx :
-                     s->above_txfm_ctx[col]) +
-                    (s->left_skip_ctx[row7] ? max_tx :
-                     s->left_txfm_ctx[row7]) > max_tx;
-            } else {
-                c = s->above_skip_ctx[col] ? 1 :
-                    (s->above_txfm_ctx[col] * 2 > max_tx);
-            }
-        } else if (have_l) {
-            c = s->left_skip_ctx[row7] ? 1 :
-                (s->left_txfm_ctx[row7] * 2 > max_tx);
-        } else {
-            c = 1;
-        }
-        switch (max_tx) {
-        case TX_32X32:
-            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
-            if (b->tx) {
-                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
-                if (b->tx == 2)
-                    b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
-            }
-            s->counts.tx32p[c][b->tx]++;
-            break;
-        case TX_16X16:
-            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
-            if (b->tx)
-                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
-            s->counts.tx16p[c][b->tx]++;
-            break;
-        case TX_8X8:
-            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
-            s->counts.tx8p[c][b->tx]++;
-            break;
-        case TX_4X4:
-            b->tx = TX_4X4;
-            break;
-        }
-    } else {
-        b->tx = FFMIN(max_tx, s->txfmmode);
-    }
-
-    if (s->keyframe || s->intraonly) {
-        uint8_t *a = &s->above_mode_ctx[col * 2];
-        uint8_t *l = &s->left_mode_ctx[(row7) << 1];
-
-        b->comp = 0;
-        if (b->bs > BS_8x8) {
-            // FIXME the memory storage intermediates here aren't really
-            // necessary, they're just there to make the code slightly
-            // simpler for now
-            b->mode[0] =
-            a[0]       = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                          ff_vp9_default_kf_ymode_probs[a[0]][l[0]]);
-            if (b->bs != BS_8x4) {
-                b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                              ff_vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
-                l[0]       =
-                a[1]       = b->mode[1];
-            } else {
-                l[0]       =
-                a[1]       =
-                b->mode[1] = b->mode[0];
-            }
-            if (b->bs != BS_4x8) {
-                b->mode[2] =
-                a[0]       = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                              ff_vp9_default_kf_ymode_probs[a[0]][l[1]]);
-                if (b->bs != BS_8x4) {
-                    b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                                  ff_vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
-                    l[1]       =
-                    a[1]       = b->mode[3];
-                } else {
-                    l[1]       =
-                    a[1]       =
-                    b->mode[3] = b->mode[2];
-                }
-            } else {
-                b->mode[2] = b->mode[0];
-                l[1]       =
-                a[1]       =
-                b->mode[3] = b->mode[1];
-            }
-        } else {
-            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                          ff_vp9_default_kf_ymode_probs[*a][*l]);
-            b->mode[3] =
-            b->mode[2] =
-            b->mode[1] = b->mode[0];
-            // FIXME this can probably be optimized
-            memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
-            memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
-        }
-        b->uvmode = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                     ff_vp9_default_kf_uvmode_probs[b->mode[3]]);
-    } else if (b->intra) {
-        b->comp = 0;
-        if (b->bs > BS_8x8) {
-            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                          s->prob.p.y_mode[0]);
-            s->counts.y_mode[0][b->mode[0]]++;
-            if (b->bs != BS_8x4) {
-                b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                              s->prob.p.y_mode[0]);
-                s->counts.y_mode[0][b->mode[1]]++;
-            } else {
-                b->mode[1] = b->mode[0];
-            }
-            if (b->bs != BS_4x8) {
-                b->mode[2] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                              s->prob.p.y_mode[0]);
-                s->counts.y_mode[0][b->mode[2]]++;
-                if (b->bs != BS_8x4) {
-                    b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                                  s->prob.p.y_mode[0]);
-                    s->counts.y_mode[0][b->mode[3]]++;
-                } else {
-                    b->mode[3] = b->mode[2];
-                }
-            } else {
-                b->mode[2] = b->mode[0];
-                b->mode[3] = b->mode[1];
-            }
-        } else {
-            static const uint8_t size_group[10] = {
-                3, 3, 3, 3, 2, 2, 2, 1, 1, 1
-            };
-            int sz = size_group[b->bs];
-
-            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                          s->prob.p.y_mode[sz]);
-            b->mode[1] =
-            b->mode[2] =
-            b->mode[3] = b->mode[0];
-            s->counts.y_mode[sz][b->mode[3]]++;
-        }
-        b->uvmode = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                     s->prob.p.uv_mode[b->mode[3]]);
-        s->counts.uv_mode[b->mode[3]][b->uvmode]++;
-    } else {
-        static const uint8_t inter_mode_ctx_lut[14][14] = {
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
-            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
-            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
-            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
-        };
-
-        if (s->segmentation.feat[b->seg_id].ref_enabled) {
-            av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
-            b->comp   = 0;
-            b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
-        } else {
-            // read comp_pred flag
-            if (s->comppredmode != PRED_SWITCHABLE) {
-                b->comp = s->comppredmode == PRED_COMPREF;
-            } else {
-                int c;
-
-                // FIXME add intra as ref=0xff (or -1) to make these easier?
-                if (have_a) {
-                    if (have_l) {
-                        if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
-                            c = 4;
-                        } else if (s->above_comp_ctx[col]) {
-                            c = 2 + (s->left_intra_ctx[row7] ||
-                                     s->left_ref_ctx[row7] == s->fixcompref);
-                        } else if (s->left_comp_ctx[row7]) {
-                            c = 2 + (s->above_intra_ctx[col] ||
-                                     s->above_ref_ctx[col] == s->fixcompref);
-                        } else {
-                            c = (!s->above_intra_ctx[col] &&
-                                 s->above_ref_ctx[col] == s->fixcompref) ^
-                                (!s->left_intra_ctx[row7] &&
-                                 s->left_ref_ctx[row & 7] == s->fixcompref);
-                        }
-                    } else {
-                        c = s->above_comp_ctx[col] ? 3 :
-                            (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
-                    }
-                } else if (have_l) {
-                    c = s->left_comp_ctx[row7] ? 3 :
-                        (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
-                } else {
-                    c = 1;
-                }
-                b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
-                s->counts.comp[c][b->comp]++;
-            }
-
-            // read actual references
-            // FIXME probably cache a few variables here to prevent repetitive
-            // memory accesses below
-            if (b->comp) { /* two references */
-                int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
-
-                b->ref[fix_idx] = s->fixcompref;
-                // FIXME can this codeblob be replaced by some sort of LUT?
-                if (have_a) {
-                    if (have_l) {
-                        if (s->above_intra_ctx[col]) {
-                            if (s->left_intra_ctx[row7]) {
-                                c = 2;
-                            } else {
-                                c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
-                            }
-                        } else if (s->left_intra_ctx[row7]) {
-                            c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
-                        } else {
-                            int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
-
-                            if (refl == refa && refa == s->varcompref[1]) {
-                                c = 0;
-                            } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
-                                if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
-                                    (refl == s->fixcompref && refa == s->varcompref[0])) {
-                                    c = 4;
-                                } else {
-                                    c = (refa == refl) ? 3 : 1;
-                                }
-                            } else if (!s->left_comp_ctx[row7]) {
-                                if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
-                                    c = 1;
-                                } else {
-                                    c = (refl == s->varcompref[1] &&
-                                         refa != s->varcompref[1]) ? 2 : 4;
-                                }
-                            } else if (!s->above_comp_ctx[col]) {
-                                if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
-                                    c = 1;
-                                } else {
-                                    c = (refa == s->varcompref[1] &&
-                                         refl != s->varcompref[1]) ? 2 : 4;
-                                }
-                            } else {
-                                c = (refl == refa) ? 4 : 2;
-                            }
-                        }
-                    } else {
-                        if (s->above_intra_ctx[col]) {
-                            c = 2;
-                        } else if (s->above_comp_ctx[col]) {
-                            c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
-                        } else {
-                            c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
-                        }
-                    }
-                } else if (have_l) {
-                    if (s->left_intra_ctx[row7]) {
-                        c = 2;
-                    } else if (s->left_comp_ctx[row7]) {
-                        c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
-                    } else {
-                        c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
-                    }
-                } else {
-                    c = 2;
-                }
-                bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
-                b->ref[var_idx] = s->varcompref[bit];
-                s->counts.comp_ref[c][bit]++;
-            } else { /* single reference */
-                int bit, c;
-
-                if (have_a && !s->above_intra_ctx[col]) {
-                    if (have_l && !s->left_intra_ctx[row7]) {
-                        if (s->left_comp_ctx[row7]) {
-                            if (s->above_comp_ctx[col]) {
-                                c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
-                                         !s->above_ref_ctx[col]);
-                            } else {
-                                c = (3 * !s->above_ref_ctx[col]) +
-                                    (!s->fixcompref || !s->left_ref_ctx[row7]);
-                            }
-                        } else if (s->above_comp_ctx[col]) {
-                            c = (3 * !s->left_ref_ctx[row7]) +
-                                (!s->fixcompref || !s->above_ref_ctx[col]);
-                        } else {
-                            c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
-                        }
-                    } else if (s->above_intra_ctx[col]) {
-                        c = 2;
-                    } else if (s->above_comp_ctx[col]) {
-                        c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
-                    } else {
-                        c = 4 * (!s->above_ref_ctx[col]);
-                    }
-                } else if (have_l && !s->left_intra_ctx[row7]) {
-                    if (s->left_intra_ctx[row7]) {
-                        c = 2;
-                    } else if (s->left_comp_ctx[row7]) {
-                        c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
-                    } else {
-                        c = 4 * (!s->left_ref_ctx[row7]);
-                    }
-                } else {
-                    c = 2;
-                }
-                bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
-                s->counts.single_ref[c][0][bit]++;
-                if (!bit) {
-                    b->ref[0] = 0;
-                } else {
-                    // FIXME can this codeblob be replaced by some sort of LUT?
-                    if (have_a) {
-                        if (have_l) {
-                            if (s->left_intra_ctx[row7]) {
-                                if (s->above_intra_ctx[col]) {
-                                    c = 2;
-                                } else if (s->above_comp_ctx[col]) {
-                                    c = 1 + 2 * (s->fixcompref == 1 ||
-                                                 s->above_ref_ctx[col] == 1);
-                                } else if (!s->above_ref_ctx[col]) {
-                                    c = 3;
-                                } else {
-                                    c = 4 * (s->above_ref_ctx[col] == 1);
-                                }
-                            } else if (s->above_intra_ctx[col]) {
-                                if (s->left_intra_ctx[row7]) {
-                                    c = 2;
-                                } else if (s->left_comp_ctx[row7]) {
-                                    c = 1 + 2 * (s->fixcompref == 1 ||
-                                                 s->left_ref_ctx[row7] == 1);
-                                } else if (!s->left_ref_ctx[row7]) {
-                                    c = 3;
-                                } else {
-                                    c = 4 * (s->left_ref_ctx[row7] == 1);
-                                }
-                            } else if (s->above_comp_ctx[col]) {
-                                if (s->left_comp_ctx[row7]) {
-                                    if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
-                                        c = 3 * (s->fixcompref == 1 ||
-                                                 s->left_ref_ctx[row7] == 1);
-                                    } else {
-                                        c = 2;
-                                    }
-                                } else if (!s->left_ref_ctx[row7]) {
-                                    c = 1 + 2 * (s->fixcompref == 1 ||
-                                                 s->above_ref_ctx[col] == 1);
-                                } else {
-                                    c = 3 * (s->left_ref_ctx[row7] == 1) +
-                                        (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
-                                }
-                            } else if (s->left_comp_ctx[row7]) {
-                                if (!s->above_ref_ctx[col]) {
-                                    c = 1 + 2 * (s->fixcompref == 1 ||
-                                                 s->left_ref_ctx[row7] == 1);
-                                } else {
-                                    c = 3 * (s->above_ref_ctx[col] == 1) +
-                                        (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
-                                }
-                            } else if (!s->above_ref_ctx[col]) {
-                                if (!s->left_ref_ctx[row7]) {
-                                    c = 3;
-                                } else {
-                                    c = 4 * (s->left_ref_ctx[row7] == 1);
-                                }
-                            } else if (!s->left_ref_ctx[row7]) {
-                                c = 4 * (s->above_ref_ctx[col] == 1);
-                            } else {
-                                c = 2 * (s->left_ref_ctx[row7] == 1) +
-                                    2 * (s->above_ref_ctx[col] == 1);
-                            }
-                        } else {
-                            if (s->above_intra_ctx[col] ||
-                                (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
-                                c = 2;
-                            } else if (s->above_comp_ctx[col]) {
-                                c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
-                            } else {
-                                c = 4 * (s->above_ref_ctx[col] == 1);
-                            }
-                        }
-                    } else if (have_l) {
-                        if (s->left_intra_ctx[row7] ||
-                            (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
-                            c = 2;
-                        } else if (s->left_comp_ctx[row7]) {
-                            c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
-                        } else {
-                            c = 4 * (s->left_ref_ctx[row7] == 1);
-                        }
-                    } else {
-                        c = 2;
-                    }
-                    bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
-                    s->counts.single_ref[c][1][bit]++;
-                    b->ref[0] = 1 + bit;
-                }
-            }
-        }
-
-        if (b->bs <= BS_8x8) {
-            if (s->segmentation.feat[b->seg_id].skip_enabled) {
-                b->mode[0] =
-                b->mode[1] =
-                b->mode[2] =
-                b->mode[3] = ZEROMV;
-            } else {
-                static const uint8_t off[10] = {
-                    3, 0, 0, 1, 0, 0, 0, 0, 0, 0
-                };
-
-                // FIXME this needs to use the LUT tables from find_ref_mvs
-                // because not all are -1,0/0,-1
-                int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
-                                          [s->left_mode_ctx[row7 + off[b->bs]]];
-
-                b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
-                                              s->prob.p.mv_mode[c]);
-                b->mode[1] =
-                b->mode[2] =
-                b->mode[3] = b->mode[0];
-                s->counts.mv_mode[c][b->mode[0] - 10]++;
-            }
-        }
-
-        if (s->filtermode == FILTER_SWITCHABLE) {
-            int c;
-
-            if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
-                if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
-                    c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
-                        s->left_filter_ctx[row7] : 3;
-                } else {
-                    c = s->above_filter_ctx[col];
-                }
-            } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
-                c = s->left_filter_ctx[row7];
-            } else {
-                c = 3;
-            }
-
-            b->filter = vp8_rac_get_tree(&s->c, ff_vp9_filter_tree,
-                                         s->prob.p.filter[c]);
-            s->counts.filter[c][b->filter]++;
-        } else {
-            b->filter = s->filtermode;
-        }
-
-        if (b->bs > BS_8x8) {
-            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
-
-            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
-                                          s->prob.p.mv_mode[c]);
-            s->counts.mv_mode[c][b->mode[0] - 10]++;
-            ff_vp9_fill_mv(s, b->mv[0], b->mode[0], 0);
-
-            if (b->bs != BS_8x4) {
-                b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
-                                              s->prob.p.mv_mode[c]);
-                s->counts.mv_mode[c][b->mode[1] - 10]++;
-                ff_vp9_fill_mv(s, b->mv[1], b->mode[1], 1);
-            } else {
-                b->mode[1] = b->mode[0];
-                AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
-                AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
-            }
-
-            if (b->bs != BS_4x8) {
-                b->mode[2] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
-                                              s->prob.p.mv_mode[c]);
-                s->counts.mv_mode[c][b->mode[2] - 10]++;
-                ff_vp9_fill_mv(s, b->mv[2], b->mode[2], 2);
-
-                if (b->bs != BS_8x4) {
-                    b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
-                                                  s->prob.p.mv_mode[c]);
-                    s->counts.mv_mode[c][b->mode[3] - 10]++;
-                    ff_vp9_fill_mv(s, b->mv[3], b->mode[3], 3);
-                } else {
-                    b->mode[3] = b->mode[2];
-                    AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
-                    AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
-                }
-            } else {
-                b->mode[2] = b->mode[0];
-                AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
-                AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
-                b->mode[3] = b->mode[1];
-                AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
-                AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
-            }
-        } else {
-            ff_vp9_fill_mv(s, b->mv[0], b->mode[0], -1);
-            AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
-            AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
-            AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
-            AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
-            AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
-            AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
-        }
-    }
-
-    // FIXME this can probably be optimized
-    memset(&s->above_skip_ctx[col], b->skip, w4);
-    memset(&s->left_skip_ctx[row7], b->skip, h4);
-    memset(&s->above_txfm_ctx[col], b->tx, w4);
-    memset(&s->left_txfm_ctx[row7], b->tx, h4);
-    memset(&s->above_partition_ctx[col], above_ctx[b->bs], w4);
-    memset(&s->left_partition_ctx[row7], left_ctx[b->bs], h4);
-    if (!s->keyframe && !s->intraonly) {
-        memset(&s->above_intra_ctx[col], b->intra, w4);
-        memset(&s->left_intra_ctx[row7], b->intra, h4);
-        memset(&s->above_comp_ctx[col], b->comp, w4);
-        memset(&s->left_comp_ctx[row7], b->comp, h4);
-        memset(&s->above_mode_ctx[col], b->mode[3], w4);
-        memset(&s->left_mode_ctx[row7], b->mode[3], h4);
-        if (s->filtermode == FILTER_SWITCHABLE && !b->intra) {
-            memset(&s->above_filter_ctx[col], b->filter, w4);
-            memset(&s->left_filter_ctx[row7], b->filter, h4);
-            b->filter = ff_vp9_filter_lut[b->filter];
-        }
-        if (b->bs > BS_8x8) {
-            int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
-
-            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
-            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
-            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
-            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
-            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
-            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
-            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
-            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
-        } else {
-            int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
-
-            for (n = 0; n < w4 * 2; n++) {
-                AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
-                AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
-            }
-            for (n = 0; n < h4 * 2; n++) {
-                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
-                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
-            }
-        }
-
-        if (!b->intra) { // FIXME write 0xff or -1 if intra, so we can use this
-                         // as a direct check in above branches
-            int vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
-
-            memset(&s->above_ref_ctx[col], vref, w4);
-            memset(&s->left_ref_ctx[row7], vref, h4);
-        }
-    }
-
-    // FIXME kinda ugly
-    for (y = 0; y < h4; y++) {
-        int x, o = (row + y) * s->sb_cols * 8 + col;
-
-        if (b->intra) {
-            for (x = 0; x < w4; x++) {
-                s->mv[0][o + x].ref[0] =
-                s->mv[0][o + x].ref[1] = -1;
-            }
-        } else if (b->comp) {
-            for (x = 0; x < w4; x++) {
-                s->mv[0][o + x].ref[0] = b->ref[0];
-                s->mv[0][o + x].ref[1] = b->ref[1];
-                AV_COPY32(&s->mv[0][o + x].mv[0], &b->mv[3][0]);
-                AV_COPY32(&s->mv[0][o + x].mv[1], &b->mv[3][1]);
-            }
-        } else {
-            for (x = 0; x < w4; x++) {
-                s->mv[0][o + x].ref[0] = b->ref[0];
-                s->mv[0][o + x].ref[1] = -1;
-                AV_COPY32(&s->mv[0][o + x].mv[0], &b->mv[3][0]);
-            }
-        }
-    }
-}
-
-// FIXME remove tx argument, and merge cnt/eob arguments?
-static int decode_block_coeffs(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
-                               enum TxfmMode tx, unsigned (*cnt)[6][3],
-                               unsigned (*eob)[6][2], uint8_t(*p)[6][11],
-                               int nnz, const int16_t *scan,
-                               const int16_t(*nb)[2],
-                               const int16_t *band_counts, const int16_t *qmul)
-{
-    int i = 0, band = 0, band_left = band_counts[band];
-    uint8_t *tp = p[0][nnz];
-    uint8_t cache[1024];
-
-    do {
-        int val, rc;
-
-        val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
-        eob[band][nnz][val]++;
-        if (!val)
-            break;
-
-skip_eob:
-        if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
-            cnt[band][nnz][0]++;
-            if (!--band_left)
-                band_left = band_counts[++band];
-            cache[scan[i]] = 0;
-            nnz            = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
-            tp             = p[band][nnz];
-            if (++i == n_coeffs)
-                break;  //invalid input; blocks should end with EOB
-            goto skip_eob;
-        }
-
-        rc = scan[i];
-        if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
-            cnt[band][nnz][1]++;
-            val       = 1;
-            cache[rc] = 1;
-        } else {
-            // fill in p[3-10] (model fill) - only once per frame for each pos
-            if (!tp[3])
-                memcpy(&tp[3], ff_vp9_model_pareto8[tp[2]], 8);
-
-            cnt[band][nnz][2]++;
-            if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
-                if (!vp56_rac_get_prob_branchy(c, tp[4])) {
-                    cache[rc] = val = 2;
-                } else {
-                    val       = 3 + vp56_rac_get_prob(c, tp[5]);
-                    cache[rc] = 3;
-                }
-            } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
-                cache[rc] = 4;
-                if (!vp56_rac_get_prob_branchy(c, tp[7])) {
-                    val  =  vp56_rac_get_prob(c, 159) + 5;
-                } else {
-                    val  = (vp56_rac_get_prob(c, 165) << 1) + 7;
-                    val +=  vp56_rac_get_prob(c, 145);
-                }
-            } else { // cat 3-6
-                cache[rc] = 5;
-                if (!vp56_rac_get_prob_branchy(c, tp[8])) {
-                    if (!vp56_rac_get_prob_branchy(c, tp[9])) {
-                        val  = (vp56_rac_get_prob(c, 173) << 2) + 11;
-                        val += (vp56_rac_get_prob(c, 148) << 1);
-                        val +=  vp56_rac_get_prob(c, 140);
-                    } else {
-                        val  = (vp56_rac_get_prob(c, 176) << 3) + 19;
-                        val += (vp56_rac_get_prob(c, 155) << 2);
-                        val += (vp56_rac_get_prob(c, 140) << 1);
-                        val +=  vp56_rac_get_prob(c, 135);
-                    }
-                } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
-                    val  = (vp56_rac_get_prob(c, 180) << 4) + 35;
-                    val += (vp56_rac_get_prob(c, 157) << 3);
-                    val += (vp56_rac_get_prob(c, 141) << 2);
-                    val += (vp56_rac_get_prob(c, 134) << 1);
-                    val +=  vp56_rac_get_prob(c, 130);
-                } else {
-                    val  = (vp56_rac_get_prob(c, 254) << 13) + 67;
-                    val += (vp56_rac_get_prob(c, 254) << 12);
-                    val += (vp56_rac_get_prob(c, 254) << 11);
-                    val += (vp56_rac_get_prob(c, 252) << 10);
-                    val += (vp56_rac_get_prob(c, 249) << 9);
-                    val += (vp56_rac_get_prob(c, 243) << 8);
-                    val += (vp56_rac_get_prob(c, 230) << 7);
-                    val += (vp56_rac_get_prob(c, 196) << 6);
-                    val += (vp56_rac_get_prob(c, 177) << 5);
-                    val += (vp56_rac_get_prob(c, 153) << 4);
-                    val += (vp56_rac_get_prob(c, 140) << 3);
-                    val += (vp56_rac_get_prob(c, 133) << 2);
-                    val += (vp56_rac_get_prob(c, 130) << 1);
-                    val +=  vp56_rac_get_prob(c, 129);
-                }
-            }
-        }
-        if (!--band_left)
-            band_left = band_counts[++band];
-        if (tx == TX_32X32) // FIXME slow
-            coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
-        else
-            coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
-        nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
-        tp  = p[band][nnz];
-    } while (++i < n_coeffs);
-
-    return i;
-}
-
-static int decode_coeffs(AVCodecContext *avctx)
-{
-    VP9Context *s = avctx->priv_data;
-    VP9Block *const b = &s->b;
-    int row = b->row, col = b->col;
-    uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
-    unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
-    unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
-    int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
-    int end_x = FFMIN(2 * (s->cols - col), w4);
-    int end_y = FFMIN(2 * (s->rows - row), h4);
-    int n, pl, x, y, step1d = 1 << b->tx, step = 1 << (b->tx * 2);
-    int uvstep1d = 1 << b->uvtx, uvstep = 1 << (b->uvtx * 2), ret;
-    int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
-    int tx = 4 * s->lossless + b->tx;
-    const int16_t **yscans = ff_vp9_scans[tx];
-    const int16_t (**ynbs)[2] = ff_vp9_scans_nb[tx];
-    const int16_t *uvscan = ff_vp9_scans[b->uvtx][DCT_DCT];
-    const int16_t (*uvnb)[2] = ff_vp9_scans_nb[b->uvtx][DCT_DCT];
-    uint8_t *a = &s->above_y_nnz_ctx[col * 2];
-    uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
-    static const int16_t band_counts[4][8] = {
-        { 1, 2, 3, 4,  3,   16 - 13, 0 },
-        { 1, 2, 3, 4, 11,   64 - 21, 0 },
-        { 1, 2, 3, 4, 11,  256 - 21, 0 },
-        { 1, 2, 3, 4, 11, 1024 - 21, 0 },
-    };
-    const int16_t *y_band_counts  = band_counts[b->tx];
-    const int16_t *uv_band_counts = band_counts[b->uvtx];
-
-    /* y tokens */
-    if (b->tx > TX_4X4) { // FIXME slow
-        for (y = 0; y < end_y; y += step1d)
-            for (x = 1; x < step1d; x++)
-                l[y] |= l[y + x];
-        for (x = 0; x < end_x; x += step1d)
-            for (y = 1; y < step1d; y++)
-                a[x] |= a[x + y];
-    }
-    for (n = 0, y = 0; y < end_y; y += step1d) {
-        for (x = 0; x < end_x; x += step1d, n += step) {
-            enum TxfmType txtp = ff_vp9_intra_txfm_type[b->mode[b->tx == TX_4X4 &&
-                                                                b->bs > BS_8x8 ?
-                                                                n : 0]];
-            int nnz = a[x] + l[y];
-            if ((ret = decode_block_coeffs(&s->c, s->block + 16 * n, 16 * step,
-                                           b->tx, c, e, p, nnz, yscans[txtp],
-                                           ynbs[txtp], y_band_counts,
-                                           qmul[0])) < 0)
-                return ret;
-            a[x] = l[y] = !!ret;
-            if (b->tx > TX_8X8)
-                AV_WN16A(&s->eob[n], ret);
-            else
-                s->eob[n] = ret;
-        }
-    }
-    if (b->tx > TX_4X4) { // FIXME slow
-        for (y = 0; y < end_y; y += step1d)
-            memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, step1d - 1));
-        for (x = 0; x < end_x; x += step1d)
-            memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, step1d - 1));
-    }
-
-    p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
-    c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
-    e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
-    w4    >>= 1;
-    h4    >>= 1;
-    end_x >>= 1;
-    end_y >>= 1;
-    for (pl = 0; pl < 2; pl++) {
-        a = &s->above_uv_nnz_ctx[pl][col];
-        l = &s->left_uv_nnz_ctx[pl][row & 7];
-        if (b->uvtx > TX_4X4) { // FIXME slow
-            for (y = 0; y < end_y; y += uvstep1d)
-                for (x = 1; x < uvstep1d; x++)
-                    l[y] |= l[y + x];
-            for (x = 0; x < end_x; x += uvstep1d)
-                for (y = 1; y < uvstep1d; y++)
-                    a[x] |= a[x + y];
-        }
-        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
-            for (x = 0; x < end_x; x += uvstep1d, n += uvstep) {
-                int nnz = a[x] + l[y];
-                if ((ret = decode_block_coeffs(&s->c, s->uvblock[pl] + 16 * n,
-                                               16 * uvstep, b->uvtx, c, e, p,
-                                               nnz, uvscan, uvnb,
-                                               uv_band_counts, qmul[1])) < 0)
-                    return ret;
-                a[x] = l[y] = !!ret;
-                if (b->uvtx > TX_8X8)
-                    AV_WN16A(&s->uveob[pl][n], ret);
-                else
-                    s->uveob[pl][n] = ret;
-            }
-        }
-        if (b->uvtx > TX_4X4) { // FIXME slow
-            for (y = 0; y < end_y; y += uvstep1d)
-                memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, uvstep1d - 1));
-            for (x = 0; x < end_x; x += uvstep1d)
-                memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, uvstep1d - 1));
-        }
-    }
-
-    return 0;
-}
-
-static av_always_inline int check_intra_mode(VP9Context *s, int mode,
-                                             uint8_t **a,
-                                             uint8_t *dst_edge,
-                                             ptrdiff_t stride_edge,
-                                             uint8_t *dst_inner,
-                                             ptrdiff_t stride_inner,
-                                             uint8_t *l, int col, int x, int w,
-                                             int row, int y, enum TxfmMode tx,
-                                             int p)
-{
-    int have_top   = row > 0 || y > 0;
-    int have_left  = col > s->tiling.tile_col_start || x > 0;
-    int have_right = x < w - 1;
-    static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
-        [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED            },
-                                   { DC_127_PRED,          VERT_PRED            } },
-        [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED          },
-                                   { HOR_PRED,             HOR_PRED             } },
-        [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED          },
-                                   { LEFT_DC_PRED,         DC_PRED              } },
-        [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  },
-                                   { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  } },
-        [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
-                                   { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
-        [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      },
-                                   { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      } },
-        [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED        },
-                                   { HOR_DOWN_PRED,        HOR_DOWN_PRED        } },
-        [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED       },
-                                   { DC_127_PRED,          VERT_LEFT_PRED       } },
-        [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED          },
-                                   { HOR_UP_PRED,          HOR_UP_PRED          } },
-        [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED            },
-                                   { HOR_PRED,             TM_VP8_PRED          } },
-    };
-    static const struct {
-        uint8_t needs_left:1;
-        uint8_t needs_top:1;
-        uint8_t needs_topleft:1;
-        uint8_t needs_topright:1;
-    } edges[N_INTRA_PRED_MODES] = {
-        [VERT_PRED]            = { .needs_top  = 1 },
-        [HOR_PRED]             = { .needs_left = 1 },
-        [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
-        [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
-        [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1,
-                                   .needs_topleft = 1 },
-        [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1,
-                                   .needs_topleft = 1 },
-        [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1,
-                                   .needs_topleft = 1 },
-        [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
-        [HOR_UP_PRED]          = { .needs_left = 1 },
-        [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1,
-                                   .needs_topleft = 1 },
-        [LEFT_DC_PRED]         = { .needs_left = 1 },
-        [TOP_DC_PRED]          = { .needs_top  = 1 },
-        [DC_128_PRED]          = { 0 },
-        [DC_127_PRED]          = { 0 },
-        [DC_129_PRED]          = { 0 }
-    };
-
-    av_assert2(mode >= 0 && mode < 10);
-    mode = mode_conv[mode][have_left][have_top];
-    if (edges[mode].needs_top) {
-        uint8_t *top = NULL, *topleft = NULL;
-        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
-        int n_px_need_tr = 0;
-
-        if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
-            n_px_need_tr = 4;
-
-        // if top of sb64-row, use s->intra_pred_data[] instead of
-        // dst[-stride] for intra prediction (it contains pre- instead of
-        // post-loopfilter data)
-        if (have_top) {
-            top = !(row & 7) && !y ?
-                  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
-                  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
-            if (have_left)
-                topleft = !(row & 7) && !y ?
-                          s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
-                          y == 0 || x == 0 ? &dst_edge[-stride_edge] :
-                          &dst_inner[-stride_inner];
-        }
-
-        if (have_top &&
-            (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
-            (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
-            n_px_need + n_px_need_tr <= n_px_have) {
-            *a = top;
-        } else {
-            if (have_top) {
-                if (n_px_need <= n_px_have) {
-                    memcpy(*a, top, n_px_need);
-                } else {
-                    memcpy(*a, top, n_px_have);
-                    memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
-                           n_px_need - n_px_have);
-                }
-            } else {
-                memset(*a, 127, n_px_need);
-            }
-            if (edges[mode].needs_topleft) {
-                if (have_left && have_top)
-                    (*a)[-1] = topleft[-1];
-                else
-                    (*a)[-1] = have_top ? 129 : 127;
-            }
-            if (tx == TX_4X4 && edges[mode].needs_topright) {
-                if (have_top && have_right &&
-                    n_px_need + n_px_need_tr <= n_px_have) {
-                    memcpy(&(*a)[4], &top[4], 4);
-                } else {
-                    memset(&(*a)[4], (*a)[3], 4);
-                }
-            }
-        }
-    }
-    if (edges[mode].needs_left) {
-        if (have_left) {
-            int i;
-            int n_px_need = 4 << tx;
-            int n_px_have = (((s->rows - row) << !p) - y) * 4;
-            uint8_t *dst     = x == 0 ? dst_edge : dst_inner;
-            ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
-
-            if (n_px_need <= n_px_have) {
-                for (i = 0; i < n_px_need; i++)
-                    l[i] = dst[i * stride - 1];
-            } else {
-                for (i = 0; i < n_px_have; i++)
-                    l[i] = dst[i * stride - 1];
-                memset(&l[i], l[i - 1], n_px_need - n_px_have);
-            }
-        } else {
-            memset(l, 129, 4 << tx);
-        }
-    }
-
-    return mode;
-}
-
-static void intra_recon(AVCodecContext *avctx, ptrdiff_t y_off, ptrdiff_t uv_off)
-{
-    VP9Context *s = avctx->priv_data;
-    VP9Block *const b = &s->b;
-    int row = b->row, col = b->col;
-    int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
-    int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
-    int end_x = FFMIN(2 * (s->cols - col), w4);
-    int end_y = FFMIN(2 * (s->rows - row), h4);
-    int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
-    int uvstep1d = 1 << b->uvtx, p;
-    uint8_t *dst = b->dst[0], *dst_r = s->cur_frame->data[0] + y_off;
-
-    for (n = 0, y = 0; y < end_y; y += step1d) {
-        uint8_t *ptr = dst, *ptr_r = dst_r;
-        for (x = 0; x < end_x;
-             x += step1d, ptr += 4 * step1d, ptr_r += 4 * step1d, n += step) {
-            int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
-                               y * 2 + x : 0];
-            LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
-            uint8_t *a = &a_buf[16], l[32];
-            enum TxfmType txtp = ff_vp9_intra_txfm_type[mode];
-            int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
-
-            mode = check_intra_mode(s, mode, &a, ptr_r,
-                                    s->cur_frame->linesize[0],
-                                    ptr, b->y_stride, l,
-                                    col, x, w4, row, y, b->tx, 0);
-            s->dsp.intra_pred[b->tx][mode](ptr, b->y_stride, l, a);
-            if (eob)
-                s->dsp.itxfm_add[tx][txtp](ptr, b->y_stride,
-                                           s->block + 16 * n, eob);
-        }
-        dst_r += 4 * s->cur_frame->linesize[0] * step1d;
-        dst   += 4 * b->y_stride * step1d;
-    }
-
-    // U/V
-    h4    >>= 1;
-    w4    >>= 1;
-    end_x >>= 1;
-    end_y >>= 1;
-    step    = 1 << (b->uvtx * 2);
-    for (p = 0; p < 2; p++) {
-        dst   = b->dst[1 + p];
-        dst_r = s->cur_frame->data[1 + p] + uv_off;
-        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
-            uint8_t *ptr = dst, *ptr_r = dst_r;
-            for (x = 0; x < end_x;
-                 x += uvstep1d, ptr += 4 * uvstep1d,
-                 ptr_r += 4 * uvstep1d, n += step) {
-                int mode = b->uvmode;
-                LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
-                uint8_t *a = &a_buf[16], l[32];
-                int eob    = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n])
-                                              : s->uveob[p][n];
-
-                mode = check_intra_mode(s, mode, &a, ptr_r,
-                                        s->cur_frame->linesize[1],
-                                        ptr, b->uv_stride, l,
-                                        col, x, w4, row, y, b->uvtx, p + 1);
-                s->dsp.intra_pred[b->uvtx][mode](ptr, b->uv_stride, l, a);
-                if (eob)
-                    s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, b->uv_stride,
-                                                    s->uvblock[p] + 16 * n,
-                                                    eob);
-            }
-            dst_r += 4 * uvstep1d * s->cur_frame->linesize[1];
-            dst   += 4 * uvstep1d * b->uv_stride;
-        }
-    }
-}
-
-static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func(*mc)[2],
-                                         uint8_t *dst, ptrdiff_t dst_stride,
-                                         const uint8_t *ref,
-                                         ptrdiff_t ref_stride,
-                                         ptrdiff_t y, ptrdiff_t x,
-                                         const VP56mv *mv,
-                                         int bw, int bh, int w, int h)
-{
-    int mx = mv->x, my = mv->y;
-
-    y   += my >> 3;
-    x   += mx >> 3;
-    ref += y * ref_stride + x;
-    mx  &= 7;
-    my  &= 7;
-    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
-    if (x < !!mx * 3 || y < !!my * 3 ||
-        x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                                 ref - !!my * 3 * ref_stride - !!mx * 3,
-                                 80,
-                                 ref_stride,
-                                 bw + !!mx * 7, bh + !!my * 7,
-                                 x - !!mx * 3, y - !!my * 3, w, h);
-        ref        = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
-        ref_stride = 80;
-    }
-    mc[!!mx][!!my](dst, ref, dst_stride, ref_stride, bh, mx << 1, my << 1);
-}
-
-static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func(*mc)[2],
-                                           uint8_t *dst_u, uint8_t *dst_v,
-                                           ptrdiff_t dst_stride,
-                                           const uint8_t *ref_u,
-                                           ptrdiff_t src_stride_u,
-                                           const uint8_t *ref_v,
-                                           ptrdiff_t src_stride_v,
-                                           ptrdiff_t y, ptrdiff_t x,
-                                           const VP56mv *mv,
-                                           int bw, int bh, int w, int h)
-{
-    int mx = mv->x, my = mv->y;
-
-    y     += my >> 4;
-    x     += mx >> 4;
-    ref_u += y * src_stride_u + x;
-    ref_v += y * src_stride_v + x;
-    mx    &= 15;
-    my    &= 15;
-    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
-    if (x < !!mx * 3 || y < !!my * 3 ||
-        x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                                 ref_u - !!my * 3 * src_stride_u - !!mx * 3,
-                                 80,
-                                 src_stride_u,
-                                 bw + !!mx * 7, bh + !!my * 7,
-                                 x - !!mx * 3, y - !!my * 3, w, h);
-        ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
-        mc[!!mx][!!my](dst_u, ref_u, dst_stride, 80, bh, mx, my);
-
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                                 ref_v - !!my * 3 * src_stride_v - !!mx * 3,
-                                 80,
-                                 src_stride_v,
-                                 bw + !!mx * 7, bh + !!my * 7,
-                                 x - !!mx * 3, y - !!my * 3, w, h);
-        ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
-        mc[!!mx][!!my](dst_v, ref_v, dst_stride, 80, bh, mx, my);
-    } else {
-        mc[!!mx][!!my](dst_u, ref_u, dst_stride, src_stride_u, bh, mx, my);
-        mc[!!mx][!!my](dst_v, ref_v, dst_stride, src_stride_v, bh, mx, my);
-    }
-}
-
-static int inter_recon(AVCodecContext *avctx)
-{
-    static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
-        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
-        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
-    };
-    VP9Context *s = avctx->priv_data;
-    VP9Block *const b = &s->b;
-    int row = b->row, col = b->col;
-    AVFrame *ref1 = s->refs[s->refidx[b->ref[0]]];
-    AVFrame *ref2 = b->comp ? s->refs[s->refidx[b->ref[1]]] : NULL;
-    int w = avctx->width, h = avctx->height;
-    ptrdiff_t ls_y = b->y_stride, ls_uv = b->uv_stride;
-
-    if (!ref1->data[0] || (b->comp && !ref2->data[0]))
-        return AVERROR_INVALIDDATA;
-
-    // y inter pred
-    if (b->bs > BS_8x8) {
-        if (b->bs == BS_8x4) {
-            mc_luma_dir(s, s->dsp.mc[3][b->filter][0], b->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        row << 3, col << 3, &b->mv[0][0], 8, 4, w, h);
-            mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
-                        b->dst[0] + 4 * ls_y, ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w, h);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[3][b->filter][1], b->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            row << 3, col << 3, &b->mv[0][1], 8, 4, w, h);
-                mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
-                            b->dst[0] + 4 * ls_y, ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w, h);
-            }
-        } else if (b->bs == BS_4x8) {
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        row << 3, col << 3, &b->mv[0][0], 4, 8, w, h);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0] + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w, h);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            row << 3, col << 3, &b->mv[0][1], 4, 8, w, h);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0] + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w, h);
-            }
-        } else {
-            av_assert2(b->bs == BS_4x4);
-
-            // FIXME if two horizontally adjacent blocks have the same MV,
-            // do a w8 instead of a w4 call
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        row << 3, col << 3, &b->mv[0][0], 4, 4, w, h);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0] + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w, h);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
-                        b->dst[0] + 4 * ls_y, ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w, h);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
-                        b->dst[0] + 4 * ls_y + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w, h);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            row << 3, col << 3, &b->mv[0][1], 4, 4, w, h);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0] + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w, h);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
-                            b->dst[0] + 4 * ls_y, ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w, h);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
-                            b->dst[0] + 4 * ls_y + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w, h);
-            }
-        }
-    } else {
-        int bwl = bwlog_tab[0][b->bs];
-        int bw  = bwh_tab[0][b->bs][0] * 4;
-        int bh  = bwh_tab[0][b->bs][1] * 4;
-
-        mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], b->dst[0], ls_y,
-                    ref1->data[0], ref1->linesize[0],
-                    row << 3, col << 3, &b->mv[0][0], bw, bh, w, h);
-
-        if (b->comp)
-            mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], b->dst[0], ls_y,
-                        ref2->data[0], ref2->linesize[0],
-                        row << 3, col << 3, &b->mv[0][1], bw, bh, w, h);
-    }
-
-    // uv inter pred
-    {
-        int bwl = bwlog_tab[1][b->bs];
-        int bw  = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
-        VP56mv mvuv;
-
-        w = (w + 1) >> 1;
-        h = (h + 1) >> 1;
-        if (b->bs > BS_8x8) {
-            mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x +
-                                 b->mv[2][0].x + b->mv[3][0].x, 4);
-            mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y +
-                                 b->mv[2][0].y + b->mv[3][0].y, 4);
-        } else {
-            mvuv = b->mv[0][0];
-        }
-
-        mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
-                      b->dst[1], b->dst[2], ls_uv,
-                      ref1->data[1], ref1->linesize[1],
-                      ref1->data[2], ref1->linesize[2],
-                      row << 2, col << 2, &mvuv, bw, bh, w, h);
-
-        if (b->comp) {
-            if (b->bs > BS_8x8) {
-                mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x +
-                                     b->mv[2][1].x + b->mv[3][1].x, 4);
-                mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y +
-                                     b->mv[2][1].y + b->mv[3][1].y, 4);
-            } else {
-                mvuv = b->mv[0][1];
-            }
-            mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
-                          b->dst[1], b->dst[2], ls_uv,
-                          ref2->data[1], ref2->linesize[1],
-                          ref2->data[2], ref2->linesize[2],
-                          row << 2, col << 2, &mvuv, bw, bh, w, h);
-        }
-    }
-
-    if (!b->skip) {
-        /* mostly copied intra_reconn() */
-
-        int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
-        int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
-        int end_x = FFMIN(2 * (s->cols - col), w4);
-        int end_y = FFMIN(2 * (s->rows - row), h4);
-        int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
-        int uvstep1d = 1 << b->uvtx, p;
-        uint8_t *dst = b->dst[0];
-
-        // y itxfm add
-        for (n = 0, y = 0; y < end_y; y += step1d) {
-            uint8_t *ptr = dst;
-            for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
-                int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
-
-                if (eob)
-                    s->dsp.itxfm_add[tx][DCT_DCT](ptr, b->y_stride,
-                                                  s->block + 16 * n, eob);
-            }
-            dst += 4 * b->y_stride * step1d;
-        }
-
-        // uv itxfm add
-        h4    >>= 1;
-        w4    >>= 1;
-        end_x >>= 1;
-        end_y >>= 1;
-        step    = 1 << (b->uvtx * 2);
-        for (p = 0; p < 2; p++) {
-            dst = b->dst[p + 1];
-            for (n = 0, y = 0; y < end_y; y += uvstep1d) {
-                uint8_t *ptr = dst;
-                for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
-                    int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n])
-                                               : s->uveob[p][n];
-                    if (eob)
-                        s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, b->uv_stride,
-                                                        s->uvblock[p] + 16 * n, eob);
-                }
-                dst += 4 * uvstep1d * b->uv_stride;
-            }
-        }
-    }
-    return 0;
-}
-
-static av_always_inline void mask_edges(VP9Filter *lflvl, int is_uv,
-                                        int row_and_7, int col_and_7,
-                                        int w, int h, int col_end, int row_end,
-                                        enum TxfmMode tx, int skip_inter)
-{
-    // FIXME I'm pretty sure all loops can be replaced by a single LUT if
-    // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
-    // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
-    // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
-
-    // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
-    // edges. This means that for UV, we work on two subsampled blocks at
-    // a time, and we only use the topleft block's mode information to set
-    // things like block strength. Thus, for any block size smaller than
-    // 16x16, ignore the odd portion of the block.
-    if (tx == TX_4X4 && is_uv) {
-        if (h == 1) {
-            if (row_and_7 & 1)
-                return;
-            if (!row_end)
-                h += 1;
-        }
-        if (w == 1) {
-            if (col_and_7 & 1)
-                return;
-            if (!col_end)
-                w += 1;
-        }
-    }
-
-    if (tx == TX_4X4 && !skip_inter) {
-        int t = 1 << col_and_7, m_col = (t << w) - t, y;
-        int m_col_odd = (t << (w - 1)) - t;
-
-        // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
-        if (is_uv) {
-            int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                int col_mask_id = 2 - !(y & 7);
-
-                lflvl->mask[is_uv][0][y][1] |= m_row_8;
-                lflvl->mask[is_uv][0][y][2] |= m_row_4;
-                // for odd lines, if the odd col is not being filtered,
-                // skip odd row also:
-                // .---. <-- a
-                // |   |
-                // |___| <-- b
-                // ^   ^
-                // c   d
-                //
-                // if a/c are even row/col and b/d are odd, and d is skipped,
-                // e.g. right edge of size-66x66.webm, then skip b also (bug)
-                if ((col_end & 1) && (y & 1)) {
-                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
-                } else {
-                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
-                }
-            }
-        } else {
-            int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                int col_mask_id = 2 - !(y & 3);
-
-                lflvl->mask[is_uv][0][y][1]           |= m_row_8; // row edge
-                lflvl->mask[is_uv][0][y][2]           |= m_row_4;
-                lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
-                lflvl->mask[is_uv][0][y][3]           |= m_col;
-                lflvl->mask[is_uv][1][y][3]           |= m_col;
-            }
-        }
-    } else {
-        int y, t = 1 << col_and_7, m_col = (t << w) - t;
-
-        if (!skip_inter) {
-            int mask_id = (tx == TX_8X8);
-            int l2 = tx + is_uv - 1, step1d = 1 << l2;
-            static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
-            int m_row = m_col & masks[l2];
-
-            // at odd UV col/row edges tx16/tx32 loopfilter edges, force
-            // 8wd loopfilter to prevent going off the visible edge.
-            if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
-                int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
-                int m_row_8  = m_row - m_row_16;
-
-                for (y = row_and_7; y < h + row_and_7; y++) {
-                    lflvl->mask[is_uv][0][y][0] |= m_row_16;
-                    lflvl->mask[is_uv][0][y][1] |= m_row_8;
-                }
-            } else {
-                for (y = row_and_7; y < h + row_and_7; y++)
-                    lflvl->mask[is_uv][0][y][mask_id] |= m_row;
-            }
-
-            if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
-                for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
-                    lflvl->mask[is_uv][1][y][0] |= m_col;
-                if (y - row_and_7 == h - 1)
-                    lflvl->mask[is_uv][1][y][1] |= m_col;
-            } else {
-                for (y = row_and_7; y < h + row_and_7; y += step1d)
-                    lflvl->mask[is_uv][1][y][mask_id] |= m_col;
-            }
-        } else if (tx != TX_4X4) {
-            int mask_id;
-
-            mask_id = (tx == TX_8X8) || (is_uv && h == 1);
-            lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
-            mask_id = (tx == TX_8X8) || (is_uv && w == 1);
-            for (y = row_and_7; y < h + row_and_7; y++)
-                lflvl->mask[is_uv][0][y][mask_id] |= t;
-        } else if (is_uv) {
-            int t8 = t & 0x01, t4 = t - t8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                lflvl->mask[is_uv][0][y][2] |= t4;
-                lflvl->mask[is_uv][0][y][1] |= t8;
-            }
-            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
-        } else {
-            int t8 = t & 0x11, t4 = t - t8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                lflvl->mask[is_uv][0][y][2] |= t4;
-                lflvl->mask[is_uv][0][y][1] |= t8;
-            }
-            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
-        }
-    }
-}
-
-int ff_vp9_decode_block(AVCodecContext *avctx, int row, int col,
-                        VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
-                        enum BlockLevel bl, enum BlockPartition bp)
-{
-    VP9Context *s = avctx->priv_data;
-    VP9Block *const b = &s->b;
-    enum BlockSize bs = bl * 3 + bp;
-    int ret, y, w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
-    int emu[2];
-
-    b->row  = row;
-    b->row7 = row & 7;
-    b->col  = col;
-    b->col7 = col & 7;
-
-    s->min_mv.x = -(128 + col * 64);
-    s->min_mv.y = -(128 + row * 64);
-    s->max_mv.x = 128 + (s->cols - col - w4) * 64;
-    s->max_mv.y = 128 + (s->rows - row - h4) * 64;
-
-    b->bs = bs;
-    decode_mode(s, b);
-    b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
-
-    if (!b->skip) {
-        if ((ret = decode_coeffs(avctx)) < 0)
-            return ret;
-    } else {
-        int pl;
-
-        memset(&s->above_y_nnz_ctx[col * 2], 0, w4 * 2);
-        memset(&s->left_y_nnz_ctx[(row & 7) << 1], 0, h4 * 2);
-        for (pl = 0; pl < 2; pl++) {
-            memset(&s->above_uv_nnz_ctx[pl][col], 0, w4);
-            memset(&s->left_uv_nnz_ctx[pl][row & 7], 0, h4);
-        }
-    }
-
-    /* Emulated overhangs if the stride of the target buffer can't hold.
-     * This allows to support emu-edge and so on even if we have large
-     * block overhangs. */
-    emu[0] = (col + w4) * 8 > s->cur_frame->linesize[0] ||
-             (row + h4) > s->rows;
-    emu[1] = (col + w4) * 4 > s->cur_frame->linesize[1] ||
-             (row + h4) > s->rows;
-    if (emu[0]) {
-        b->dst[0]   = s->tmp_y;
-        b->y_stride = 64;
-    } else {
-        b->dst[0]   = s->cur_frame->data[0] + yoff;
-        b->y_stride = s->cur_frame->linesize[0];
-    }
-    if (emu[1]) {
-        b->dst[1]    = s->tmp_uv[0];
-        b->dst[2]    = s->tmp_uv[1];
-        b->uv_stride = 32;
-    } else {
-        b->dst[1]    = s->cur_frame->data[1] + uvoff;
-        b->dst[2]    = s->cur_frame->data[2] + uvoff;
-        b->uv_stride = s->cur_frame->linesize[1];
-    }
-    if (b->intra) {
-        intra_recon(avctx, yoff, uvoff);
-    } else {
-        if ((ret = inter_recon(avctx)) < 0)
-            return ret;
-    }
-    if (emu[0]) {
-        int w = FFMIN(s->cols - col, w4) * 8;
-        int h = FFMIN(s->rows - row, h4) * 8;
-        int n, o = 0;
-
-        for (n = 0; o < w; n++) {
-            int bw = 64 >> n;
-
-            av_assert2(n <= 4);
-            if (w & bw) {
-                s->dsp.mc[n][0][0][0][0](s->cur_frame->data[0] + yoff + o,
-                                         s->tmp_y + o,
-                                         s->cur_frame->linesize[0],
-                                         64, h, 0, 0);
-                o += bw;
-            }
-        }
-    }
-    if (emu[1]) {
-        int w = FFMIN(s->cols - col, w4) * 4;
-        int h = FFMIN(s->rows - row, h4) * 4;
-        int n, o = 0;
-
-        for (n = 1; o < w; n++) {
-            int bw = 64 >> n;
-
-            av_assert2(n <= 4);
-            if (w & bw) {
-                s->dsp.mc[n][0][0][0][0](s->cur_frame->data[1] + uvoff + o,
-                                         s->tmp_uv[0] + o,
-                                         s->cur_frame->linesize[1],
-                                         32, h, 0, 0);
-                s->dsp.mc[n][0][0][0][0](s->cur_frame->data[2] + uvoff + o,
-                                         s->tmp_uv[1] + o,
-                                         s->cur_frame->linesize[2],
-                                         32, h, 0, 0);
-                o += bw;
-            }
-        }
-    }
-
-    // pick filter level and find edges to apply filter to
-    if (s->filter.level &&
-        (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
-                                                    [b->mode[3] != ZEROMV]) > 0) {
-        int x_end = FFMIN(s->cols - col, w4);
-        int y_end = FFMIN(s->rows - row, h4);
-        int skip_inter = !b->intra && b->skip;
-
-        for (y = 0; y < h4; y++)
-            memset(&lflvl->level[((row & 7) + y) * 8 + (col & 7)], lvl, w4);
-        mask_edges(lflvl, 0, row & 7, col & 7, x_end, y_end, 0, 0, b->tx, skip_inter);
-        mask_edges(lflvl, 1, row & 7, col & 7, x_end, y_end,
-                   s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
-                   s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
-                   b->uvtx, skip_inter);
-
-        if (!s->filter.lim_lut[lvl]) {
-            int sharp = s->filter.sharpness;
-            int limit = lvl;
-
-            if (sharp > 0) {
-                limit >>= (sharp + 3) >> 2;
-                limit   = FFMIN(limit, 9 - sharp);
-            }
-            limit = FFMAX(limit, 1);
-
-            s->filter.lim_lut[lvl]   = limit;
-            s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
-        }
-    }
-
-    return 0;
-}
diff --git a/libavcodec/vp9data.c b/libavcodec/vp9data.c
deleted file mode 100644
index 374fa8bb8c..0000000000
--- a/libavcodec/vp9data.c
+++ /dev/null
@@ -1,2133 +0,0 @@
-/*
- * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
- * Copyright (C) 2013 Clément Bœsch <u pkh me>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "vp9.h"
-#include "vp9data.h"
-
-const int8_t ff_vp9_partition_tree[3][2] = {
-    { -PARTITION_NONE,                1 }, // '0'
-    {    -PARTITION_H,                2 }, // '10'
-    {    -PARTITION_V, -PARTITION_SPLIT }, // '110', '111'
-};
-
-const uint8_t ff_vp9_default_kf_partition_probs[4][4][3] = {
-    { /* 64x64 -> 32x32 */
-        { 174,  35,  49 } /* a/l both not split */,
-        {  68,  11,  27 } /* a split, l not split */,
-        {  57,  15,   9 } /* l split, a not split */,
-        {  12,   3,   3 } /* a/l both split */
-    }, { /* 32x32 -> 16x16 */
-        { 150,  40,  39 } /* a/l both not split */,
-        {  78,  12,  26 } /* a split, l not split */,
-        {  67,  33,  11 } /* l split, a not split */,
-        {  24,   7,   5 } /* a/l both split */,
-    }, { /* 16x16 -> 8x8 */
-        { 149,  53,  53 } /* a/l both not split */,
-        {  94,  20,  48 } /* a split, l not split */,
-        {  83,  53,  24 } /* l split, a not split */,
-        {  52,  18,  18 } /* a/l both split */,
-    }, { /* 8x8 -> 4x4 */
-        { 158,  97,  94 } /* a/l both not split */,
-        {  93,  24,  99 } /* a split, l not split */,
-        {  85, 119,  44 } /* l split, a not split */,
-        {  62,  59,  67 } /* a/l both split */,
-    },
-};
-
-const int8_t ff_vp9_segmentation_tree[7][2] = {
-    {  1,  2 },
-    {  3,  4 },
-    {  5,  6 },
-    { -0, -1 }, // '00x'
-    { -2, -3 }, // '01x'
-    { -4, -5 }, // '10x'
-    { -6, -7 }, // '11x'
-};
-
-const int8_t ff_vp9_intramode_tree[9][2] = {
-    {              -DC_PRED,                1 }, // '0'
-    {          -TM_VP8_PRED,                2 }, // '10'
-    {            -VERT_PRED,                3 }, // '110'
-    {                     4,                6 },
-    {             -HOR_PRED,                5 }, // '11100'
-    { -DIAG_DOWN_RIGHT_PRED, -VERT_RIGHT_PRED }, // '11101x'
-    {  -DIAG_DOWN_LEFT_PRED,                7 }, // '11110'
-    {       -VERT_LEFT_PRED,                8 }, // '111110'
-    {        -HOR_DOWN_PRED,     -HOR_UP_PRED }, // '111111x'
-};
-
-const uint8_t ff_vp9_default_kf_ymode_probs[10][10][9] = {
-    { /* above = v */
-        {  43,  46, 168, 134, 107, 128,  69, 142,  92 } /* left = v */,
-        {  44,  29,  68, 159, 201, 177,  50,  57,  77 } /* left = h */,
-        {  63,  36, 126, 146, 123, 158,  60,  90,  96 } /* left = dc */,
-        {  58,  38,  76, 114,  97, 172,  78, 133,  92 } /* left = d45 */,
-        {  46,  41,  76, 140,  63, 184,  69, 112,  57 } /* left = d135 */,
-        {  38,  32,  85, 140,  46, 112,  54, 151, 133 } /* left = d117 */,
-        {  39,  27,  61, 131, 110, 175,  44,  75, 136 } /* left = d153 */,
-        {  47,  35,  80, 100,  74, 143,  64, 163,  74 } /* left = d63 */,
-        {  52,  30,  74, 113, 130, 175,  51,  64,  58 } /* left = d27 */,
-        {  36,  61, 116, 114, 128, 162,  80, 125,  82 } /* left = tm */
-    }, { /* above = h */
-        {  55,  44,  68, 166, 179, 192,  57,  57, 108 } /* left = v */,
-        {  42,  26,  11, 199, 241, 228,  23,  15,  85 } /* left = h */,
-        {  82,  26,  26, 171, 208, 204,  44,  32, 105 } /* left = dc */,
-        {  68,  42,  19, 131, 160, 199,  55,  52,  83 } /* left = d45 */,
-        {  58,  50,  25, 139, 115, 232,  39,  52, 118 } /* left = d135 */,
-        {  50,  35,  33, 153, 104, 162,  64,  59, 131 } /* left = d117 */,
-        {  44,  24,  16, 150, 177, 202,  33,  19, 156 } /* left = d153 */,
-        {  53,  49,  21, 110, 116, 168,  59,  80,  76 } /* left = d63 */,
-        {  55,  27,  12, 153, 203, 218,  26,  27,  49 } /* left = d27 */,
-        {  38,  72,  19, 168, 203, 212,  50,  50, 107 } /* left = tm */
-    }, { /* above = dc */
-        {  92,  45, 102, 136, 116, 180,  74,  90, 100 } /* left = v */,
-        {  73,  32,  19, 187, 222, 215,  46,  34, 100 } /* left = h */,
-        { 137,  30,  42, 148, 151, 207,  70,  52,  91 } /* left = dc */,
-        {  91,  30,  32, 116, 121, 186,  93,  86,  94 } /* left = d45 */,
-        {  72,  35,  36, 149,  68, 206,  68,  63, 105 } /* left = d135 */,
-        {  73,  31,  28, 138,  57, 124,  55, 122, 151 } /* left = d117 */,
-        {  67,  23,  21, 140, 126, 197,  40,  37, 171 } /* left = d153 */,
-        {  74,  32,  27, 107,  86, 160,  63, 134, 102 } /* left = d63 */,
-        {  86,  27,  28, 128, 154, 212,  45,  43,  53 } /* left = d27 */,
-        {  59,  67,  44, 140, 161, 202,  78,  67, 119 } /* left = tm */
-    }, { /* above = d45 */
-        {  59,  38,  83, 112, 103, 162,  98, 136,  90 } /* left = v */,
-        {  62,  30,  23, 158, 200, 207,  59,  57,  50 } /* left = h */,
-        { 103,  26,  36, 129, 132, 201,  83,  80,  93 } /* left = dc */,
-        {  67,  30,  29,  84,  86, 191, 102,  91,  59 } /* left = d45 */,
-        {  60,  32,  33, 112,  71, 220,  64,  89, 104 } /* left = d135 */,
-        {  53,  26,  34, 130,  56, 149,  84, 120, 103 } /* left = d117 */,
-        {  53,  21,  23, 133, 109, 210,  56,  77, 172 } /* left = d153 */,
-        {  61,  29,  29,  93,  97, 165,  83, 175, 162 } /* left = d63 */,
-        {  77,  19,  29, 112, 142, 228,  55,  66,  36 } /* left = d27 */,
-        {  47,  47,  43, 114, 137, 181, 100,  99,  95 } /* left = tm */
-    }, { /* above = d135 */
-        {  53,  40,  55, 139,  69, 183,  61,  80, 110 } /* left = v */,
-        {  40,  29,  19, 161, 180, 207,  43,  24,  91 } /* left = h */,
-        {  69,  23,  29, 128,  83, 199,  46,  44, 101 } /* left = dc */,
-        {  60,  34,  19, 105,  61, 198,  53,  64,  89 } /* left = d45 */,
-        {  52,  31,  22, 158,  40, 209,  58,  62,  89 } /* left = d135 */,
-        {  44,  31,  29, 147,  46, 158,  56, 102, 198 } /* left = d117 */,
-        {  35,  19,  12, 135,  87, 209,  41,  45, 167 } /* left = d153 */,
-        {  51,  38,  25, 113,  58, 164,  70,  93,  97 } /* left = d63 */,
-        {  55,  25,  21, 118,  95, 215,  38,  39,  66 } /* left = d27 */,
-        {  47,  54,  34, 146, 108, 203,  72, 103, 151 } /* left = tm */
-    }, { /* above = d117 */
-        {  46,  27,  80, 150,  55, 124,  55, 121, 135 } /* left = v */,
-        {  36,  23,  27, 165, 149, 166,  54,  64, 118 } /* left = h */,
-        {  64,  19,  37, 156,  66, 138,  49,  95, 133 } /* left = dc */,
-        {  53,  21,  36, 131,  63, 163,  60, 109,  81 } /* left = d45 */,
-        {  40,  26,  35, 154,  40, 185,  51,  97, 123 } /* left = d135 */,
-        {  35,  19,  34, 179,  19,  97,  48, 129, 124 } /* left = d117 */,
-        {  36,  20,  26, 136,  62, 164,  33,  77, 154 } /* left = d153 */,
-        {  45,  26,  28, 129,  45, 129,  49, 147, 123 } /* left = d63 */,
-        {  45,  18,  32, 130,  90, 157,  40,  79,  91 } /* left = d27 */,
-        {  38,  44,  51, 136,  74, 162,  57,  97, 121 } /* left = tm */
-    }, { /* above = d153 */
-        {  56,  39,  58, 133, 117, 173,  48,  53, 187 } /* left = v */,
-        {  35,  21,  12, 161, 212, 207,  20,  23, 145 } /* left = h */,
-        {  75,  17,  22, 136, 138, 185,  32,  34, 166 } /* left = dc */,
-        {  56,  29,  19, 117, 109, 181,  55,  68, 112 } /* left = d45 */,
-        {  47,  29,  17, 153,  64, 220,  59,  51, 114 } /* left = d135 */,
-        {  46,  16,  24, 136,  76, 147,  41,  64, 172 } /* left = d117 */,
-        {  34,  17,  11, 108, 152, 187,  13,  15, 209 } /* left = d153 */,
-        {  55,  30,  18, 122,  79, 179,  44,  88, 116 } /* left = d63 */,
-        {  51,  24,  14, 115, 133, 209,  32,  26, 104 } /* left = d27 */,
-        {  37,  49,  25, 129, 168, 164,  41,  54, 148 } /* left = tm */
-    }, { /* above = d63 */
-        {  48,  34,  86, 101,  92, 146,  78, 179, 134 } /* left = v */,
-        {  47,  22,  24, 138, 187, 178,  68,  69,  59 } /* left = h */,
-        {  78,  23,  39, 111, 117, 170,  74, 124,  94 } /* left = dc */,
-        {  56,  25,  33, 105, 112, 187,  95, 177, 129 } /* left = d45 */,
-        {  48,  31,  27, 114,  63, 183,  82, 116,  56 } /* left = d135 */,
-        {  43,  28,  37, 121,  63, 123,  61, 192, 169 } /* left = d117 */,
-        {  42,  17,  24, 109,  97, 177,  56,  76, 122 } /* left = d153 */,
-        {  46,  23,  32,  74,  86, 150,  67, 183,  88 } /* left = d63 */,
-        {  58,  18,  28, 105, 139, 182,  70,  92,  63 } /* left = d27 */,
-        {  36,  38,  48,  92, 122, 165,  88, 137,  91 } /* left = tm */
-    }, { /* above = d27 */
-        {  62,  44,  61, 123, 105, 189,  48,  57,  64 } /* left = v */,
-        {  47,  25,  17, 175, 222, 220,  24,  30,  86 } /* left = h */,
-        {  82,  22,  32, 127, 143, 213,  39,  41,  70 } /* left = dc */,
-        {  68,  36,  17, 106, 102, 206,  59,  74,  74 } /* left = d45 */,
-        {  57,  39,  23, 151,  68, 216,  55,  63,  58 } /* left = d135 */,
-        {  49,  30,  35, 141,  70, 168,  82,  40, 115 } /* left = d117 */,
-        {  51,  25,  15, 136, 129, 202,  38,  35, 139 } /* left = d153 */,
-        {  59,  39,  19, 114,  75, 180,  77, 104,  42 } /* left = d63 */,
-        {  68,  26,  16, 111, 141, 215,  29,  28,  28 } /* left = d27 */,
-        {  40,  61,  26, 126, 152, 206,  61,  59,  93 } /* left = tm */
-    }, { /* above = tm */
-        {  44,  78, 115, 132, 119, 173,  71, 112,  93 } /* left = v */,
-        {  39,  38,  21, 184, 227, 206,  42,  32,  64 } /* left = h */,
-        {  65,  70,  60, 155, 159, 199,  61,  60,  81 } /* left = dc */,
-        {  58,  47,  36, 124, 137, 193,  80,  82,  78 } /* left = d45 */,
-        {  49,  50,  35, 144,  95, 205,  63,  78,  59 } /* left = d135 */,
-        {  41,  53,  52, 148,  71, 142,  65, 128,  51 } /* left = d117 */,
-        {  40,  36,  28, 143, 143, 202,  40,  55, 137 } /* left = d153 */,
-        {  42,  44,  44, 104, 105, 164,  64, 130,  80 } /* left = d63 */,
-        {  52,  34,  29, 129, 183, 227,  42,  35,  43 } /* left = d27 */,
-        {  43,  81,  53, 140, 169, 204,  68,  84,  72 } /* left = tm */
-    }
-};
-
-const uint8_t ff_vp9_default_kf_uvmode_probs[10][9] = {
-    { 118,  15, 123, 148, 131, 101,  44,  93, 131 } /* y = v */,
-    { 113,  12,  23, 188, 226, 142,  26,  32, 125 } /* y = h */,
-    { 144,  11,  54, 157, 195, 130,  46,  58, 108 } /* y = dc */,
-    { 120,  11,  50, 123, 163, 135,  64,  77, 103 } /* y = d45 */,
-    { 113,   9,  36, 155, 111, 157,  32,  44, 161 } /* y = d135 */,
-    { 116,   9,  55, 176,  76,  96,  37,  61, 149 } /* y = d117 */,
-    { 115,   9,  28, 141, 161, 167,  21,  25, 193 } /* y = d153 */,
-    { 116,  12,  64, 120, 140, 125,  49, 115, 121 } /* y = d63 */,
-    { 120,  12,  32, 145, 195, 142,  32,  38,  86 } /* y = d27 */,
-    { 102,  19,  66, 162, 182, 122,  35,  59, 128 } /* y = tm */
-};
-
-const int8_t ff_vp9_inter_mode_tree[3][2] = {
-    {    -ZEROMV,      1 }, // '0'
-    { -NEARESTMV,      2 }, // '10'
-    {    -NEARMV, -NEWMV }, // '11x'
-};
-
-const int8_t ff_vp9_filter_tree[2][2] = {
-    { -0,  1 },  // '0'
-    { -1, -2 },  // '1x'
-};
-
-const enum FilterMode ff_vp9_filter_lut[3] = {
-    FILTER_8TAP_REGULAR,
-    FILTER_8TAP_SMOOTH,
-    FILTER_8TAP_SHARP,
-};
-
-const int16_t ff_vp9_dc_qlookup[256] = {
-       4,    8,    8,    9,   10,   11,   12,   12,
-      13,   14,   15,   16,   17,   18,   19,   19,
-      20,   21,   22,   23,   24,   25,   26,   26,
-      27,   28,   29,   30,   31,   32,   32,   33,
-      34,   35,   36,   37,   38,   38,   39,   40,
-      41,   42,   43,   43,   44,   45,   46,   47,
-      48,   48,   49,   50,   51,   52,   53,   53,
-      54,   55,   56,   57,   57,   58,   59,   60,
-      61,   62,   62,   63,   64,   65,   66,   66,
-      67,   68,   69,   70,   70,   71,   72,   73,
-      74,   74,   75,   76,   77,   78,   78,   79,
-      80,   81,   81,   82,   83,   84,   85,   85,
-      87,   88,   90,   92,   93,   95,   96,   98,
-      99,  101,  102,  104,  105,  107,  108,  110,
-     111,  113,  114,  116,  117,  118,  120,  121,
-     123,  125,  127,  129,  131,  134,  136,  138,
-     140,  142,  144,  146,  148,  150,  152,  154,
-     156,  158,  161,  164,  166,  169,  172,  174,
-     177,  180,  182,  185,  187,  190,  192,  195,
-     199,  202,  205,  208,  211,  214,  217,  220,
-     223,  226,  230,  233,  237,  240,  243,  247,
-     250,  253,  257,  261,  265,  269,  272,  276,
-     280,  284,  288,  292,  296,  300,  304,  309,
-     313,  317,  322,  326,  330,  335,  340,  344,
-     349,  354,  359,  364,  369,  374,  379,  384,
-     389,  395,  400,  406,  411,  417,  423,  429,
-     435,  441,  447,  454,  461,  467,  475,  482,
-     489,  497,  505,  513,  522,  530,  539,  549,
-     559,  569,  579,  590,  602,  614,  626,  640,
-     654,  668,  684,  700,  717,  736,  755,  775,
-     796,  819,  843,  869,  896,  925,  955,  988,
-    1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
-};
-
-const int16_t ff_vp9_ac_qlookup[256] = {
-       4,    8,    9,   10,   11,   12,   13,   14,
-      15,   16,   17,   18,   19,   20,   21,   22,
-      23,   24,   25,   26,   27,   28,   29,   30,
-      31,   32,   33,   34,   35,   36,   37,   38,
-      39,   40,   41,   42,   43,   44,   45,   46,
-      47,   48,   49,   50,   51,   52,   53,   54,
-      55,   56,   57,   58,   59,   60,   61,   62,
-      63,   64,   65,   66,   67,   68,   69,   70,
-      71,   72,   73,   74,   75,   76,   77,   78,
-      79,   80,   81,   82,   83,   84,   85,   86,
-      87,   88,   89,   90,   91,   92,   93,   94,
-      95,   96,   97,   98,   99,  100,  101,  102,
-     104,  106,  108,  110,  112,  114,  116,  118,
-     120,  122,  124,  126,  128,  130,  132,  134,
-     136,  138,  140,  142,  144,  146,  148,  150,
-     152,  155,  158,  161,  164,  167,  170,  173,
-     176,  179,  182,  185,  188,  191,  194,  197,
-     200,  203,  207,  211,  215,  219,  223,  227,
-     231,  235,  239,  243,  247,  251,  255,  260,
-     265,  270,  275,  280,  285,  290,  295,  300,
-     305,  311,  317,  323,  329,  335,  341,  347,
-     353,  359,  366,  373,  380,  387,  394,  401,
-     408,  416,  424,  432,  440,  448,  456,  465,
-     474,  483,  492,  501,  510,  520,  530,  540,
-     550,  560,  571,  582,  593,  604,  615,  627,
-     639,  651,  663,  676,  689,  702,  715,  729,
-     743,  757,  771,  786,  801,  816,  832,  848,
-     864,  881,  898,  915,  933,  951,  969,  988,
-    1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151,
-    1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
-    1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567,
-    1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
-};
-
-const enum TxfmType ff_vp9_intra_txfm_type[14] = {
-    [VERT_PRED]            = ADST_DCT,
-    [HOR_PRED]             = DCT_ADST,
-    [DC_PRED]              = DCT_DCT,
-    [DIAG_DOWN_LEFT_PRED]  = DCT_DCT,
-    [DIAG_DOWN_RIGHT_PRED] = ADST_ADST,
-    [VERT_RIGHT_PRED]      = ADST_DCT,
-    [HOR_DOWN_PRED]        = DCT_ADST,
-    [VERT_LEFT_PRED]       = ADST_DCT,
-    [HOR_UP_PRED]          = DCT_ADST,
-    [TM_VP8_PRED]          = ADST_ADST,
-    [NEARESTMV]            = DCT_DCT,
-    [NEARMV]               = DCT_DCT,
-    [ZEROMV]               = DCT_DCT,
-    [NEWMV]                = DCT_DCT,
-};
-
-const int16_t ff_vp9_default_scan_4x4[16] = {
-     0,  1,  4,  5,
-     2,  8,  3,  6,
-    12,  9,  7, 10,
-    13, 11, 14, 15,
-};
-
-const int16_t ff_vp9_col_scan_4x4[16] = {
-     0,  1,  2,  4,
-     3,  5,  6,  8,
-     7,  9, 10, 12,
-    13, 11, 14, 15,
-};
-
-const int16_t ff_vp9_row_scan_4x4[16] = {
-     0,  4,  1,  8,
-     5, 12,  9,  2,
-     6, 13,  3, 10,
-     7, 14, 11, 15,
-};
-
-const int16_t ff_vp9_default_scan_8x8[64] = {
-     0,  1,  8,  2,  9, 16, 10,  3,
-    17, 24, 18, 11,  4, 25, 32, 19,
-    12, 26,  5, 33, 20, 27, 40, 13,
-    34,  6, 41, 28, 21, 35, 42, 48,
-    14,  7, 36, 29, 43, 56, 49, 22,
-    15, 37, 50, 44, 57, 30, 23, 51,
-    45, 58, 38, 31, 52, 59, 39, 46,
-    53, 60, 47, 54, 61, 55, 62, 63,
-};
-
-const int16_t ff_vp9_col_scan_8x8[64] = {
-     0,  1,  2,  8,  3,  9,  4, 10,
-    16,  5, 11, 17, 12, 18,  6, 24,
-    19, 13, 25,  7, 26, 20, 32, 14,
-    27, 21, 33, 28, 34, 15, 22, 35,
-    40, 29, 41, 36, 23, 30, 42, 37,
-    48, 43, 31, 44, 49, 38, 50, 56,
-    45, 39, 51, 57, 52, 46, 58, 53,
-    59, 47, 60, 54, 61, 55, 62, 63,
-};
-
-const int16_t ff_vp9_row_scan_8x8[64] = {
-     0,  8, 16,  1,  9, 24,  2, 17,
-    32, 10, 25,  3, 40, 18, 11, 33,
-    26, 19,  4, 48, 41, 34, 12, 27,
-    56, 20,  5, 42, 35, 13, 49, 28,
-     6, 21, 43, 36, 14, 50, 29, 57,
-     7, 44, 22, 37, 51, 15, 58, 30,
-    23, 45, 52, 38, 59, 31, 46, 53,
-    39, 60, 47, 61, 54, 62, 55, 63,
-};
-
-const int16_t ff_vp9_default_scan_16x16[256] = {
-      0,   1,  16,   2,  17,  32,   3,  18,  33,  48,   4,  34,  19,  49,  20,   5,
-     35,  64,  50,  36,  65,  21,   6,  51,  80,  66,  37,  22,  52,   7,  81,  67,
-     38,  82,  53,  23,  96,  68,   8,  83,  97,  54,  39,  69, 112,  24,  98,  84,
-     70,  55,   9,  40,  85,  99, 113, 128,  25, 114, 100,  71,  86,  56,  10,  41,
-    115, 101, 129, 116,  72,  87,  26, 130, 144, 102,  57,  11,  42, 117, 131, 145,
-     88, 103,  27,  73, 132, 118, 146,  58, 160,  12,  43, 133, 147, 104,  89, 119,
-    161,  74, 148, 134,  28, 162,  59,  13, 176, 120, 149,  90, 135, 105, 163,  44,
-     75, 177, 164,  29, 150, 121, 136, 178, 165,  14, 106,  60,  91, 151,  45, 179,
-    192, 137, 166, 122,  76, 180, 152,  30,  61,  15, 107, 167, 181, 193,  92, 208,
-     46, 138, 123, 153, 194,  77, 168, 182,  31, 195, 209, 183, 108, 139,  62, 154,
-     47, 196,  93, 169, 210, 197, 224, 124, 184, 211,  78, 109, 170, 155,  63, 198,
-    212, 185, 225, 240, 140,  94, 199, 125,  79, 213, 226, 171, 186, 156, 214, 200,
-    110, 227, 141,  95, 241, 215, 228, 201, 126, 242, 187, 172, 157, 229, 111, 216,
-    243, 142, 202, 230, 127, 217, 244, 173, 188, 231, 158, 203, 143, 245, 218, 232,
-    189, 246, 159, 174, 233, 247, 219, 204, 175, 190, 248, 234, 205, 220, 249, 191,
-    235, 221, 250, 206, 222, 251, 236, 207, 237, 223, 252, 238, 253, 239, 254, 255,
-};
-
-const int16_t ff_vp9_col_scan_16x16[256] = {
-      0,   1,   2,   3,  16,   4,  17,   5,  18,   6,  19,  32,  20,   7,  33,  21,
-     34,   8,  35,  22,  48,  36,   9,  49,  23,  50,  37,  10,  38,  51,  24,  64,
-     52,  11,  65,  39,  25,  53,  66,  54,  40,  67,  12,  80,  26,  68,  55,  81,
-     41,  69,  13,  27,  82,  56,  70,  83,  42,  14,  84,  96,  71,  28,  57,  85,
-     97,  15,  72,  98,  43,  86,  58,  99,  29,  87, 100, 112,  73,  44, 101,  59,
-     30, 113,  88, 114,  74, 128, 102,  45,  31, 115,  60, 103,  89, 116,  75, 129,
-    117,  46, 104,  90,  61, 130, 118, 131, 132, 105,  76,  47, 119, 144,  91,  62,
-    133, 106, 145, 120, 146, 134,  77, 147, 121,  92, 135, 148,  63, 107, 136, 122,
-     93, 149, 160,  78, 150, 137, 108, 161, 162, 151, 123,  79, 138, 163, 152,  94,
-    164, 109, 165, 153, 124, 139, 176, 166,  95, 177, 167, 110, 154, 178, 125, 179,
-    140, 168, 155, 111, 180, 192, 181, 169, 141, 126, 182, 193, 194, 156, 183, 170,
-    195, 127, 142, 196, 184, 208, 197, 157, 171, 143, 185, 198, 209, 199, 210, 172,
-    158, 186, 211, 224, 212, 200, 240, 159, 213, 225, 187, 201, 173, 226, 214, 215,
-    227, 202, 228, 188, 241, 216, 174, 229, 242, 203, 243, 217, 230, 175, 189, 244,
-    231, 204, 218, 232, 245, 219, 246, 190, 233, 205, 191, 247, 234, 248, 220, 206,
-    249, 235, 221, 207, 250, 236, 222, 251, 223, 237, 238, 252, 239, 253, 254, 255,
-};
-
-const int16_t ff_vp9_row_scan_16x16[256] = {
-      0,  16,  32,   1,  48,  17,  64,  33,   2,  80,  18,  49,  96,  34,   3,  65,
-     19, 112,  50,  81,  35,   4, 128,  66,  20,  97,  51,  82,   5, 144,  36,  67,
-    113,  98,  21,  52, 160,  83, 129,  37,  68,   6, 114, 176,  99,  53,  22,  84,
-    145,  38,  69, 130,   7, 115, 192, 100,  54,  23,  85, 161, 146, 131,  39,  70,
-    208, 116,   8, 101, 177,  55,  86,  24, 162, 147, 132,  71, 224, 117,  40, 102,
-      9, 148,  56,  87, 193, 163, 240, 133, 178,  25, 118,  72,  41, 103, 164,  10,
-    149,  88, 134, 209, 179,  57, 119, 194,  26,  73, 165, 150, 104,  42, 135,  11,
-    180, 120,  89, 225, 195,  58,  27, 210, 151, 181, 166,  74,  43, 105,  12, 136,
-     90,  59, 241, 121,  28, 196, 167, 211, 152,  44, 182, 137,  75,  13, 226, 106,
-    122,  60, 197,  91, 168,  29, 183, 153,  14,  76, 212, 138,  45, 107,  15, 198,
-     92, 227, 169,  30, 123, 154,  61, 242, 184, 213, 139,  46,  77,  31, 108, 170,
-    199, 185, 124, 228,  93, 155, 214,  62, 140, 243,  78,  47, 200, 109, 186, 171,
-    201,  94,  63, 215, 229, 156,  79, 125, 141, 110, 216, 187, 172, 244, 202, 230,
-    217,  95, 157, 126, 245, 111, 142, 231, 188, 127, 158, 218, 173, 232, 246, 233,
-    203, 143, 247, 174, 189, 159, 219, 204, 248, 234, 249, 175, 190, 220, 205, 250,
-    235, 191, 221, 251, 236, 206, 252, 222, 207, 237, 223, 253, 238, 254, 239, 255,
-};
-
-const int16_t ff_vp9_default_scan_32x32[1024] = {
-       0,    1,   32,    2,   33,   64,    3,   34,   65,    4,   96,   35,   66,    5,   36,   97,
-      67,  128,   98,   68,   37,    6,  129,   99,    7,  160,   69,   38,  130,  100,  161,  131,
-      39,   70,    8,  101,  162,  132,  192,   71,   40,    9,  102,  163,  133,  193,   72,  224,
-     103,   41,  164,   10,  194,  134,  165,   73,  104,  135,  225,   42,  195,   11,  256,  166,
-     226,  196,   74,  105,  136,   43,   12,  167,  197,  227,  257,   75,  106,  137,  228,   44,
-     198,  168,  258,  288,   13,  229,   76,  107,  199,  138,  259,  169,  289,   45,  230,  260,
-     200,  108,   14,  170,  139,  320,  290,   77,  231,  261,   46,  201,  140,  291,  109,  232,
-     321,  262,  171,   78,  292,   15,  322,  202,  263,  352,  172,  293,  233,  141,  323,  110,
-      47,  203,  264,  234,  294,  353,  324,   16,   79,  204,  265,  295,  325,  173,  354,  142,
-     235,  384,   48,  296,  111,  266,  355,  326,   80,   17,  205,  236,  174,  356,  385,  327,
-     143,  297,  267,  357,  386,  112,   49,  328,  298,  206,  416,  237,  358,  387,   81,  175,
-      18,  329,  359,  388,  299,  330,  389,  113,  417,  238,  360,   50,  207,  418,  390,  331,
-      19,  448,  361,   82,  419,  391,  239,   51,  362,  420,  114,  449,  480,  421,   83,  363,
-     450,  422,  512,  451,  423,  115,  452,  481,  453,  482,  454,  544,  483,  455,  513,  484,
-     514,  485,  515,  486,  545,  576,  487,  546,  547,  608,  577,  578,  579,  609,  610,  611,
-      20,  144,  268,  392,  516,  640,   21,   52,  145,  176,  269,  300,  393,  424,  517,  548,
-     641,  672,   22,   53,   84,  146,  177,  208,  270,  301,  332,  394,  425,  456,  518,  549,
-     580,  642,  673,  704,   23,   54,   85,  116,  147,  178,  209,  240,  271,  302,  333,  364,
-     395,  426,  457,  488,  519,  550,  581,  612,  643,  674,  705,  736,   55,   86,  117,  179,
-     210,  241,  303,  334,  365,  427,  458,  489,  551,  582,  613,  675,  706,  737,   87,  118,
-     211,  242,  335,  366,  459,  490,  583,  614,  707,  738,  119,  243,  367,  491,  615,  739,
-      24,  148,  272,  396,  520,  644,  768,   25,   56,  149,  180,  273,  304,  397,  428,  521,
-     552,  645,  676,  769,  800,   26,   57,   88,  150,  181,  212,  274,  305,  336,  398,  429,
-     460,  522,  553,  584,  646,  677,  708,  770,  801,  832,   27,   58,   89,  120,  151,  182,
-     213,  244,  275,  306,  337,  368,  399,  430,  461,  492,  523,  554,  585,  616,  647,  678,
-     709,  740,  771,  802,  833,  864,   59,   90,  121,  183,  214,  245,  307,  338,  369,  431,
-     462,  493,  555,  586,  617,  679,  710,  741,  803,  834,  865,   91,  122,  215,  246,  339,
-     370,  463,  494,  587,  618,  711,  742,  835,  866,  123,  247,  371,  495,  619,  743,  867,
-      28,  152,  276,  400,  524,  648,  772,  896,   29,   60,  153,  184,  277,  308,  401,  432,
-     525,  556,  649,  680,  773,  804,  897,  928,   30,   61,   92,  154,  185,  216,  278,  309,
-     340,  402,  433,  464,  526,  557,  588,  650,  681,  712,  774,  805,  836,  898,  929,  960,
-      31,   62,   93,  124,  155,  186,  217,  248,  279,  310,  341,  372,  403,  434,  465,  496,
-     527,  558,  589,  620,  651,  682,  713,  744,  775,  806,  837,  868,  899,  930,  961,  992,
-      63,   94,  125,  187,  218,  249,  311,  342,  373,  435,  466,  497,  559,  590,  621,  683,
-     714,  745,  807,  838,  869,  931,  962,  993,   95,  126,  219,  250,  343,  374,  467,  498,
-     591,  622,  715,  746,  839,  870,  963,  994,  127,  251,  375,  499,  623,  747,  871,  995,
-     156,  280,  404,  528,  652,  776,  900,  157,  188,  281,  312,  405,  436,  529,  560,  653,
-     684,  777,  808,  901,  932,  158,  189,  220,  282,  313,  344,  406,  437,  468,  530,  561,
-     592,  654,  685,  716,  778,  809,  840,  902,  933,  964,  159,  190,  221,  252,  283,  314,
-     345,  376,  407,  438,  469,  500,  531,  562,  593,  624,  655,  686,  717,  748,  779,  810,
-     841,  872,  903,  934,  965,  996,  191,  222,  253,  315,  346,  377,  439,  470,  501,  563,
-     594,  625,  687,  718,  749,  811,  842,  873,  935,  966,  997,  223,  254,  347,  378,  471,
-     502,  595,  626,  719,  750,  843,  874,  967,  998,  255,  379,  503,  627,  751,  875,  999,
-     284,  408,  532,  656,  780,  904,  285,  316,  409,  440,  533,  564,  657,  688,  781,  812,
-     905,  936,  286,  317,  348,  410,  441,  472,  534,  565,  596,  658,  689,  720,  782,  813,
-     844,  906,  937,  968,  287,  318,  349,  380,  411,  442,  473,  504,  535,  566,  597,  628,
-     659,  690,  721,  752,  783,  814,  845,  876,  907,  938,  969, 1000,  319,  350,  381,  443,
-     474,  505,  567,  598,  629,  691,  722,  753,  815,  846,  877,  939,  970, 1001,  351,  382,
-     475,  506,  599,  630,  723,  754,  847,  878,  971, 1002,  383,  507,  631,  755,  879, 1003,
-     412,  536,  660,  784,  908,  413,  444,  537,  568,  661,  692,  785,  816,  909,  940,  414,
-     445,  476,  538,  569,  600,  662,  693,  724,  786,  817,  848,  910,  941,  972,  415,  446,
-     477,  508,  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,  911,  942,
-     973, 1004,  447,  478,  509,  571,  602,  633,  695,  726,  757,  819,  850,  881,  943,  974,
-    1005,  479,  510,  603,  634,  727,  758,  851,  882,  975, 1006,  511,  635,  759,  883, 1007,
-     540,  664,  788,  912,  541,  572,  665,  696,  789,  820,  913,  944,  542,  573,  604,  666,
-     697,  728,  790,  821,  852,  914,  945,  976,  543,  574,  605,  636,  667,  698,  729,  760,
-     791,  822,  853,  884,  915,  946,  977, 1008,  575,  606,  637,  699,  730,  761,  823,  854,
-     885,  947,  978, 1009,  607,  638,  731,  762,  855,  886,  979, 1010,  639,  763,  887, 1011,
-     668,  792,  916,  669,  700,  793,  824,  917,  948,  670,  701,  732,  794,  825,  856,  918,
-     949,  980,  671,  702,  733,  764,  795,  826,  857,  888,  919,  950,  981, 1012,  703,  734,
-     765,  827,  858,  889,  951,  982, 1013,  735,  766,  859,  890,  983, 1014,  767,  891, 1015,
-     796,  920,  797,  828,  921,  952,  798,  829,  860,  922,  953,  984,  799,  830,  861,  892,
-     923,  954,  985, 1016,  831,  862,  893,  955,  986, 1017,  863,  894,  987, 1018,  895, 1019,
-     924,  925,  956,  926,  957,  988,  927,  958,  989, 1020,  959,  990, 1021,  991, 1022, 1023,
-};
-
-const int16_t *ff_vp9_scans[5][4] = {
-    {
-        ff_vp9_default_scan_4x4, ff_vp9_col_scan_4x4,
-        ff_vp9_row_scan_4x4, ff_vp9_default_scan_4x4
-    }, {
-        ff_vp9_default_scan_8x8, ff_vp9_col_scan_8x8,
-        ff_vp9_row_scan_8x8, ff_vp9_default_scan_8x8
-    }, {
-        ff_vp9_default_scan_16x16, ff_vp9_col_scan_16x16,
-        ff_vp9_row_scan_16x16, ff_vp9_default_scan_16x16
-    }, {
-        ff_vp9_default_scan_32x32, ff_vp9_default_scan_32x32,
-        ff_vp9_default_scan_32x32, ff_vp9_default_scan_32x32
-    }, { // lossless
-        ff_vp9_default_scan_4x4, ff_vp9_default_scan_4x4,
-        ff_vp9_default_scan_4x4, ff_vp9_default_scan_4x4
-    }
-};
-
-const int16_t ff_vp9_default_scan_4x4_nb[16][2] = {
-    {  0,  0 }, {  0,  0 }, {  4,  1 }, {  1,  1 },
-    {  4,  4 }, {  2,  2 }, {  5,  2 }, {  8,  8 },
-    {  8,  5 }, {  6,  3 }, {  9,  6 }, { 12,  9 },
-    { 10,  7 }, { 13, 10 }, { 14, 11 }, {  0,  0 },
-};
-
-const int16_t ff_vp9_col_scan_4x4_nb[16][2] = {
-    {  0,  0 }, {  1,  1 }, {  0,  0 }, {  2,  2 },
-    {  4,  4 }, {  5,  5 }, {  4,  4 }, {  6,  6 },
-    {  8,  8 }, {  9,  9 }, {  8,  8 }, { 12, 12 },
-    { 10, 10 }, { 13, 13 }, { 14, 14 }, {  0,  0 },
-};
-
-const int16_t ff_vp9_row_scan_4x4_nb[16][2] = {
-    {  0,  0 }, {  0,  0 }, {  4,  4 }, {  1,  1 },
-    {  8,  8 }, {  5,  5 }, {  1,  1 }, {  2,  2 },
-    {  9,  9 }, {  2,  2 }, {  6,  6 }, {  3,  3 },
-    { 10, 10 }, {  7,  7 }, { 11, 11 }, {  0,  0 },
-};
-
-const int16_t ff_vp9_default_scan_8x8_nb[64][2] = {
-    {  0,  0 }, {  0,  0 }, {  1,  1 }, {  8,  1 },
-    {  8,  8 }, {  9,  2 }, {  2,  2 }, { 16,  9 },
-    { 16, 16 }, { 17, 10 }, { 10,  3 }, {  3,  3 },
-    { 24, 17 }, { 24, 24 }, { 18, 11 }, { 11,  4 },
-    { 25, 18 }, {  4,  4 }, { 32, 25 }, { 19, 12 },
-    { 26, 19 }, { 32, 32 }, { 12,  5 }, { 33, 26 },
-    {  5,  5 }, { 40, 33 }, { 27, 20 }, { 20, 13 },
-    { 34, 27 }, { 41, 34 }, { 40, 40 }, { 13,  6 },
-    {  6,  6 }, { 35, 28 }, { 28, 21 }, { 42, 35 },
-    { 48, 48 }, { 48, 41 }, { 21, 14 }, { 14,  7 },
-    { 36, 29 }, { 49, 42 }, { 43, 36 }, { 56, 49 },
-    { 29, 22 }, { 22, 15 }, { 50, 43 }, { 44, 37 },
-    { 57, 50 }, { 37, 30 }, { 30, 23 }, { 51, 44 },
-    { 58, 51 }, { 38, 31 }, { 45, 38 }, { 52, 45 },
-    { 59, 52 }, { 46, 39 }, { 53, 46 }, { 60, 53 },
-    { 54, 47 }, { 61, 54 }, { 62, 55 }, {  0,  0 },
-};
-
-const int16_t ff_vp9_col_scan_8x8_nb[64][2] = {
-    {  0,  0 }, {  1,  1 }, {  0,  0 }, {  2,  2 },
-    {  8,  8 }, {  3,  3 }, {  9,  9 }, {  8,  8 },
-    {  4,  4 }, { 10, 10 }, { 16, 16 }, { 11, 11 },
-    { 17, 17 }, {  5,  5 }, { 16, 16 }, { 18, 18 },
-    { 12, 12 }, { 24, 24 }, {  6,  6 }, { 25, 25 },
-    { 19, 19 }, { 24, 24 }, { 13, 13 }, { 26, 26 },
-    { 20, 20 }, { 32, 32 }, { 27, 27 }, { 33, 33 },
-    { 14, 14 }, { 21, 21 }, { 34, 34 }, { 32, 32 },
-    { 28, 28 }, { 40, 40 }, { 35, 35 }, { 22, 22 },
-    { 29, 29 }, { 41, 41 }, { 36, 36 }, { 40, 40 },
-    { 42, 42 }, { 30, 30 }, { 43, 43 }, { 48, 48 },
-    { 37, 37 }, { 49, 49 }, { 48, 48 }, { 44, 44 },
-    { 38, 38 }, { 50, 50 }, { 56, 56 }, { 51, 51 },
-    { 45, 45 }, { 57, 57 }, { 52, 52 }, { 58, 58 },
-    { 46, 46 }, { 59, 59 }, { 53, 53 }, { 60, 60 },
-    { 54, 54 }, { 61, 61 }, { 62, 62 }, {  0,  0 },
-};
-
-const int16_t ff_vp9_row_scan_8x8_nb[64][2] = {
-    {  0,  0 }, {  8,  8 }, {  0,  0 }, {  1,  1 },
-    { 16, 16 }, {  1,  1 }, {  9,  9 }, { 24, 24 },
-    {  2,  2 }, { 17, 17 }, {  2,  2 }, { 32, 32 },
-    { 10, 10 }, {  3,  3 }, { 25, 25 }, { 18, 18 },
-    { 11, 11 }, {  3,  3 }, { 40, 40 }, { 33, 33 },
-    { 26, 26 }, {  4,  4 }, { 19, 19 }, { 48, 48 },
-    { 12, 12 }, {  4,  4 }, { 34, 34 }, { 27, 27 },
-    {  5,  5 }, { 41, 41 }, { 20, 20 }, {  5,  5 },
-    { 13, 13 }, { 35, 35 }, { 28, 28 }, {  6,  6 },
-    { 42, 42 }, { 21, 21 }, { 49, 49 }, {  6,  6 },
-    { 36, 36 }, { 14, 14 }, { 29, 29 }, { 43, 43 },
-    {  7,  7 }, { 50, 50 }, { 22, 22 }, { 15, 15 },
-    { 37, 37 }, { 44, 44 }, { 30, 30 }, { 51, 51 },
-    { 23, 23 }, { 38, 38 }, { 45, 45 }, { 31, 31 },
-    { 52, 52 }, { 39, 39 }, { 53, 53 }, { 46, 46 },
-    { 54, 54 }, { 47, 47 }, { 55, 55 }, {  0,  0 },
-};
-
-const int16_t ff_vp9_default_scan_16x16_nb[256][2] = {
-    {   0,   0 }, {   0,   0 }, {   1,   1 }, {  16,   1 },
-    {  16,  16 }, {   2,   2 }, {  17,   2 }, {  32,  17 },
-    {  32,  32 }, {   3,   3 }, {  33,  18 }, {  18,   3 },
-    {  48,  33 }, {  19,   4 }, {   4,   4 }, {  34,  19 },
-    {  48,  48 }, {  49,  34 }, {  35,  20 }, {  64,  49 },
-    {  20,   5 }, {   5,   5 }, {  50,  35 }, {  64,  64 },
-    {  65,  50 }, {  36,  21 }, {  21,   6 }, {  51,  36 },
-    {   6,   6 }, {  80,  65 }, {  66,  51 }, {  37,  22 },
-    {  81,  66 }, {  52,  37 }, {  22,   7 }, {  80,  80 },
-    {  67,  52 }, {   7,   7 }, {  82,  67 }, {  96,  81 },
-    {  53,  38 }, {  38,  23 }, {  68,  53 }, {  96,  96 },
-    {  23,   8 }, {  97,  82 }, {  83,  68 }, {  69,  54 },
-    {  54,  39 }, {   8,   8 }, {  39,  24 }, {  84,  69 },
-    {  98,  83 }, { 112,  97 }, { 112, 112 }, {  24,   9 },
-    { 113,  98 }, {  99,  84 }, {  70,  55 }, {  85,  70 },
-    {  55,  40 }, {   9,   9 }, {  40,  25 }, { 114,  99 },
-    { 100,  85 }, { 128, 113 }, { 115, 100 }, {  71,  56 },
-    {  86,  71 }, {  25,  10 }, { 129, 114 }, { 128, 128 },
-    { 101,  86 }, {  56,  41 }, {  10,  10 }, {  41,  26 },
-    { 116, 101 }, { 130, 115 }, { 144, 129 }, {  87,  72 },
-    { 102,  87 }, {  26,  11 }, {  72,  57 }, { 131, 116 },
-    { 117, 102 }, { 145, 130 }, {  57,  42 }, { 144, 144 },
-    {  11,  11 }, {  42,  27 }, { 132, 117 }, { 146, 131 },
-    { 103,  88 }, {  88,  73 }, { 118, 103 }, { 160, 145 },
-    {  73,  58 }, { 147, 132 }, { 133, 118 }, {  27,  12 },
-    { 161, 146 }, {  58,  43 }, {  12,  12 }, { 160, 160 },
-    { 119, 104 }, { 148, 133 }, {  89,  74 }, { 134, 119 },
-    { 104,  89 }, { 162, 147 }, {  43,  28 }, {  74,  59 },
-    { 176, 161 }, { 163, 148 }, {  28,  13 }, { 149, 134 },
-    { 120, 105 }, { 135, 120 }, { 177, 162 }, { 164, 149 },
-    {  13,  13 }, { 105,  90 }, {  59,  44 }, {  90,  75 },
-    { 150, 135 }, {  44,  29 }, { 178, 163 }, { 176, 176 },
-    { 136, 121 }, { 165, 150 }, { 121, 106 }, {  75,  60 },
-    { 179, 164 }, { 151, 136 }, {  29,  14 }, {  60,  45 },
-    {  14,  14 }, { 106,  91 }, { 166, 151 }, { 180, 165 },
-    { 192, 177 }, {  91,  76 }, { 192, 192 }, {  45,  30 },
-    { 137, 122 }, { 122, 107 }, { 152, 137 }, { 193, 178 },
-    {  76,  61 }, { 167, 152 }, { 181, 166 }, {  30,  15 },
-    { 194, 179 }, { 208, 193 }, { 182, 167 }, { 107,  92 },
-    { 138, 123 }, {  61,  46 }, { 153, 138 }, {  46,  31 },
-    { 195, 180 }, {  92,  77 }, { 168, 153 }, { 209, 194 },
-    { 196, 181 }, { 208, 208 }, { 123, 108 }, { 183, 168 },
-    { 210, 195 }, {  77,  62 }, { 108,  93 }, { 169, 154 },
-    { 154, 139 }, {  62,  47 }, { 197, 182 }, { 211, 196 },
-    { 184, 169 }, { 224, 209 }, { 224, 224 }, { 139, 124 },
-    {  93,  78 }, { 198, 183 }, { 124, 109 }, {  78,  63 },
-    { 212, 197 }, { 225, 210 }, { 170, 155 }, { 185, 170 },
-    { 155, 140 }, { 213, 198 }, { 199, 184 }, { 109,  94 },
-    { 226, 211 }, { 140, 125 }, {  94,  79 }, { 240, 225 },
-    { 214, 199 }, { 227, 212 }, { 200, 185 }, { 125, 110 },
-    { 241, 226 }, { 186, 171 }, { 171, 156 }, { 156, 141 },
-    { 228, 213 }, { 110,  95 }, { 215, 200 }, { 242, 227 },
-    { 141, 126 }, { 201, 186 }, { 229, 214 }, { 126, 111 },
-    { 216, 201 }, { 243, 228 }, { 172, 157 }, { 187, 172 },
-    { 230, 215 }, { 157, 142 }, { 202, 187 }, { 142, 127 },
-    { 244, 229 }, { 217, 202 }, { 231, 216 }, { 188, 173 },
-    { 245, 230 }, { 158, 143 }, { 173, 158 }, { 232, 217 },
-    { 246, 231 }, { 218, 203 }, { 203, 188 }, { 174, 159 },
-    { 189, 174 }, { 247, 232 }, { 233, 218 }, { 204, 189 },
-    { 219, 204 }, { 248, 233 }, { 190, 175 }, { 234, 219 },
-    { 220, 205 }, { 249, 234 }, { 205, 190 }, { 221, 206 },
-    { 250, 235 }, { 235, 220 }, { 206, 191 }, { 236, 221 },
-    { 222, 207 }, { 251, 236 }, { 237, 222 }, { 252, 237 },
-    { 238, 223 }, { 253, 238 }, { 254, 239 }, {   0,   0 },
-};
-
-const int16_t ff_vp9_col_scan_16x16_nb[256][2] = {
-    {   0,   0 }, {   1,   1 }, {   2,   2 }, {   0,   0 },
-    {   3,   3 }, {  16,  16 }, {   4,   4 }, {  17,  17 },
-    {   5,   5 }, {  18,  18 }, {  16,  16 }, {  19,  19 },
-    {   6,   6 }, {  32,  32 }, {  20,  20 }, {  33,  33 },
-    {   7,   7 }, {  34,  34 }, {  21,  21 }, {  32,  32 },
-    {  35,  35 }, {   8,   8 }, {  48,  48 }, {  22,  22 },
-    {  49,  49 }, {  36,  36 }, {   9,   9 }, {  37,  37 },
-    {  50,  50 }, {  23,  23 }, {  48,  48 }, {  51,  51 },
-    {  10,  10 }, {  64,  64 }, {  38,  38 }, {  24,  24 },
-    {  52,  52 }, {  65,  65 }, {  53,  53 }, {  39,  39 },
-    {  66,  66 }, {  11,  11 }, {  64,  64 }, {  25,  25 },
-    {  67,  67 }, {  54,  54 }, {  80,  80 }, {  40,  40 },
-    {  68,  68 }, {  12,  12 }, {  26,  26 }, {  81,  81 },
-    {  55,  55 }, {  69,  69 }, {  82,  82 }, {  41,  41 },
-    {  13,  13 }, {  83,  83 }, {  80,  80 }, {  70,  70 },
-    {  27,  27 }, {  56,  56 }, {  84,  84 }, {  96,  96 },
-    {  14,  14 }, {  71,  71 }, {  97,  97 }, {  42,  42 },
-    {  85,  85 }, {  57,  57 }, {  98,  98 }, {  28,  28 },
-    {  86,  86 }, {  99,  99 }, {  96,  96 }, {  72,  72 },
-    {  43,  43 }, { 100, 100 }, {  58,  58 }, {  29,  29 },
-    { 112, 112 }, {  87,  87 }, { 113, 113 }, {  73,  73 },
-    { 112, 112 }, { 101, 101 }, {  44,  44 }, {  30,  30 },
-    { 114, 114 }, {  59,  59 }, { 102, 102 }, {  88,  88 },
-    { 115, 115 }, {  74,  74 }, { 128, 128 }, { 116, 116 },
-    {  45,  45 }, { 103, 103 }, {  89,  89 }, {  60,  60 },
-    { 129, 129 }, { 117, 117 }, { 130, 130 }, { 131, 131 },
-    { 104, 104 }, {  75,  75 }, {  46,  46 }, { 118, 118 },
-    { 128, 128 }, {  90,  90 }, {  61,  61 }, { 132, 132 },
-    { 105, 105 }, { 144, 144 }, { 119, 119 }, { 145, 145 },
-    { 133, 133 }, {  76,  76 }, { 146, 146 }, { 120, 120 },
-    {  91,  91 }, { 134, 134 }, { 147, 147 }, {  62,  62 },
-    { 106, 106 }, { 135, 135 }, { 121, 121 }, {  92,  92 },
-    { 148, 148 }, { 144, 144 }, {  77,  77 }, { 149, 149 },
-    { 136, 136 }, { 107, 107 }, { 160, 160 }, { 161, 161 },
-    { 150, 150 }, { 122, 122 }, {  78,  78 }, { 137, 137 },
-    { 162, 162 }, { 151, 151 }, {  93,  93 }, { 163, 163 },
-    { 108, 108 }, { 164, 164 }, { 152, 152 }, { 123, 123 },
-    { 138, 138 }, { 160, 160 }, { 165, 165 }, {  94,  94 },
-    { 176, 176 }, { 166, 166 }, { 109, 109 }, { 153, 153 },
-    { 177, 177 }, { 124, 124 }, { 178, 178 }, { 139, 139 },
-    { 167, 167 }, { 154, 154 }, { 110, 110 }, { 179, 179 },
-    { 176, 176 }, { 180, 180 }, { 168, 168 }, { 140, 140 },
-    { 125, 125 }, { 181, 181 }, { 192, 192 }, { 193, 193 },
-    { 155, 155 }, { 182, 182 }, { 169, 169 }, { 194, 194 },
-    { 126, 126 }, { 141, 141 }, { 195, 195 }, { 183, 183 },
-    { 192, 192 }, { 196, 196 }, { 156, 156 }, { 170, 170 },
-    { 142, 142 }, { 184, 184 }, { 197, 197 }, { 208, 208 },
-    { 198, 198 }, { 209, 209 }, { 171, 171 }, { 157, 157 },
-    { 185, 185 }, { 210, 210 }, { 208, 208 }, { 211, 211 },
-    { 199, 199 }, { 224, 224 }, { 158, 158 }, { 212, 212 },
-    { 224, 224 }, { 186, 186 }, { 200, 200 }, { 172, 172 },
-    { 225, 225 }, { 213, 213 }, { 214, 214 }, { 226, 226 },
-    { 201, 201 }, { 227, 227 }, { 187, 187 }, { 240, 240 },
-    { 215, 215 }, { 173, 173 }, { 228, 228 }, { 241, 241 },
-    { 202, 202 }, { 242, 242 }, { 216, 216 }, { 229, 229 },
-    { 174, 174 }, { 188, 188 }, { 243, 243 }, { 230, 230 },
-    { 203, 203 }, { 217, 217 }, { 231, 231 }, { 244, 244 },
-    { 218, 218 }, { 245, 245 }, { 189, 189 }, { 232, 232 },
-    { 204, 204 }, { 190, 190 }, { 246, 246 }, { 233, 233 },
-    { 247, 247 }, { 219, 219 }, { 205, 205 }, { 248, 248 },
-    { 234, 234 }, { 220, 220 }, { 206, 206 }, { 249, 249 },
-    { 235, 235 }, { 221, 221 }, { 250, 250 }, { 222, 222 },
-    { 236, 236 }, { 237, 237 }, { 251, 251 }, { 238, 238 },
-    { 252, 252 }, { 253, 253 }, { 254, 254 }, {   0,   0 },
-};
-
-const int16_t ff_vp9_row_scan_16x16_nb[256][2] = {
-    {   0,   0 }, {  16,  16 }, {   0,   0 }, {  32,  32 },
-    {   1,   1 }, {  48,  48 }, {  17,  17 }, {   1,   1 },
-    {  64,  64 }, {   2,   2 }, {  33,  33 }, {  80,  80 },
-    {  18,  18 }, {   2,   2 }, {  49,  49 }, {   3,   3 },
-    {  96,  96 }, {  34,  34 }, {  65,  65 }, {  19,  19 },
-    {   3,   3 }, { 112, 112 }, {  50,  50 }, {   4,   4 },
-    {  81,  81 }, {  35,  35 }, {  66,  66 }, {   4,   4 },
-    { 128, 128 }, {  20,  20 }, {  51,  51 }, {  97,  97 },
-    {  82,  82 }, {   5,   5 }, {  36,  36 }, { 144, 144 },
-    {  67,  67 }, { 113, 113 }, {  21,  21 }, {  52,  52 },
-    {   5,   5 }, {  98,  98 }, { 160, 160 }, {  83,  83 },
-    {  37,  37 }, {   6,   6 }, {  68,  68 }, { 129, 129 },
-    {  22,  22 }, {  53,  53 }, { 114, 114 }, {   6,   6 },
-    {  99,  99 }, { 176, 176 }, {  84,  84 }, {  38,  38 },
-    {   7,   7 }, {  69,  69 }, { 145, 145 }, { 130, 130 },
-    { 115, 115 }, {  23,  23 }, {  54,  54 }, { 192, 192 },
-    { 100, 100 }, {   7,   7 }, {  85,  85 }, { 161, 161 },
-    {  39,  39 }, {  70,  70 }, {   8,   8 }, { 146, 146 },
-    { 131, 131 }, { 116, 116 }, {  55,  55 }, { 208, 208 },
-    { 101, 101 }, {  24,  24 }, {  86,  86 }, {   8,   8 },
-    { 132, 132 }, {  40,  40 }, {  71,  71 }, { 177, 177 },
-    { 147, 147 }, { 224, 224 }, { 117, 117 }, { 162, 162 },
-    {   9,   9 }, { 102, 102 }, {  56,  56 }, {  25,  25 },
-    {  87,  87 }, { 148, 148 }, {   9,   9 }, { 133, 133 },
-    {  72,  72 }, { 118, 118 }, { 193, 193 }, { 163, 163 },
-    {  41,  41 }, { 103, 103 }, { 178, 178 }, {  10,  10 },
-    {  57,  57 }, { 149, 149 }, { 134, 134 }, {  88,  88 },
-    {  26,  26 }, { 119, 119 }, {  10,  10 }, { 164, 164 },
-    { 104, 104 }, {  73,  73 }, { 209, 209 }, { 179, 179 },
-    {  42,  42 }, {  11,  11 }, { 194, 194 }, { 135, 135 },
-    { 165, 165 }, { 150, 150 }, {  58,  58 }, {  27,  27 },
-    {  89,  89 }, {  11,  11 }, { 120, 120 }, {  74,  74 },
-    {  43,  43 }, { 225, 225 }, { 105, 105 }, {  12,  12 },
-    { 180, 180 }, { 151, 151 }, { 195, 195 }, { 136, 136 },
-    {  28,  28 }, { 166, 166 }, { 121, 121 }, {  59,  59 },
-    {  12,  12 }, { 210, 210 }, {  90,  90 }, { 106, 106 },
-    {  44,  44 }, { 181, 181 }, {  75,  75 }, { 152, 152 },
-    {  13,  13 }, { 167, 167 }, { 137, 137 }, {  13,  13 },
-    {  60,  60 }, { 196, 196 }, { 122, 122 }, {  29,  29 },
-    {  91,  91 }, {  14,  14 }, { 182, 182 }, {  76,  76 },
-    { 211, 211 }, { 153, 153 }, {  14,  14 }, { 107, 107 },
-    { 138, 138 }, {  45,  45 }, { 226, 226 }, { 168, 168 },
-    { 197, 197 }, { 123, 123 }, {  30,  30 }, {  61,  61 },
-    {  15,  15 }, {  92,  92 }, { 154, 154 }, { 183, 183 },
-    { 169, 169 }, { 108, 108 }, { 212, 212 }, {  77,  77 },
-    { 139, 139 }, { 198, 198 }, {  46,  46 }, { 124, 124 },
-    { 227, 227 }, {  62,  62 }, {  31,  31 }, { 184, 184 },
-    {  93,  93 }, { 170, 170 }, { 155, 155 }, { 185, 185 },
-    {  78,  78 }, {  47,  47 }, { 199, 199 }, { 213, 213 },
-    { 140, 140 }, {  63,  63 }, { 109, 109 }, { 125, 125 },
-    {  94,  94 }, { 200, 200 }, { 171, 171 }, { 156, 156 },
-    { 228, 228 }, { 186, 186 }, { 214, 214 }, { 201, 201 },
-    {  79,  79 }, { 141, 141 }, { 110, 110 }, { 229, 229 },
-    {  95,  95 }, { 126, 126 }, { 215, 215 }, { 172, 172 },
-    { 111, 111 }, { 142, 142 }, { 202, 202 }, { 157, 157 },
-    { 216, 216 }, { 230, 230 }, { 217, 217 }, { 187, 187 },
-    { 127, 127 }, { 231, 231 }, { 158, 158 }, { 173, 173 },
-    { 143, 143 }, { 203, 203 }, { 188, 188 }, { 232, 232 },
-    { 218, 218 }, { 233, 233 }, { 159, 159 }, { 174, 174 },
-    { 204, 204 }, { 189, 189 }, { 234, 234 }, { 219, 219 },
-    { 175, 175 }, { 205, 205 }, { 235, 235 }, { 220, 220 },
-    { 190, 190 }, { 236, 236 }, { 206, 206 }, { 191, 191 },
-    { 221, 221 }, { 207, 207 }, { 237, 237 }, { 222, 222 },
-    { 238, 238 }, { 223, 223 }, { 239, 239 }, {   0,   0 },
-};
-
-const int16_t ff_vp9_default_scan_32x32_nb[1024][2] = {
-    {    0,    0 }, {    0,    0 }, {    1,    1 }, {   32,    1 },
-    {   32,   32 }, {    2,    2 }, {   33,    2 }, {   64,   33 },
-    {    3,    3 }, {   64,   64 }, {   34,    3 }, {   65,   34 },
-    {    4,    4 }, {   35,    4 }, {   96,   65 }, {   66,   35 },
-    {   96,   96 }, {   97,   66 }, {   67,   36 }, {   36,    5 },
-    {    5,    5 }, {  128,   97 }, {   98,   67 }, {    6,    6 },
-    {  128,  128 }, {   68,   37 }, {   37,    6 }, {  129,   98 },
-    {   99,   68 }, {  160,  129 }, {  130,   99 }, {   38,    7 },
-    {   69,   38 }, {    7,    7 }, {  100,   69 }, {  161,  130 },
-    {  131,  100 }, {  160,  160 }, {   70,   39 }, {   39,    8 },
-    {    8,    8 }, {  101,   70 }, {  162,  131 }, {  132,  101 },
-    {  192,  161 }, {   71,   40 }, {  192,  192 }, {  102,   71 },
-    {   40,    9 }, {  163,  132 }, {    9,    9 }, {  193,  162 },
-    {  133,  102 }, {  164,  133 }, {   72,   41 }, {  103,   72 },
-    {  134,  103 }, {  224,  193 }, {   41,   10 }, {  194,  163 },
-    {   10,   10 }, {  224,  224 }, {  165,  134 }, {  225,  194 },
-    {  195,  164 }, {   73,   42 }, {  104,   73 }, {  135,  104 },
-    {   42,   11 }, {   11,   11 }, {  166,  135 }, {  196,  165 },
-    {  226,  195 }, {  256,  225 }, {   74,   43 }, {  105,   74 },
-    {  136,  105 }, {  227,  196 }, {   43,   12 }, {  197,  166 },
-    {  167,  136 }, {  257,  226 }, {  256,  256 }, {   12,   12 },
-    {  228,  197 }, {   75,   44 }, {  106,   75 }, {  198,  167 },
-    {  137,  106 }, {  258,  227 }, {  168,  137 }, {  288,  257 },
-    {   44,   13 }, {  229,  198 }, {  259,  228 }, {  199,  168 },
-    {  107,   76 }, {   13,   13 }, {  169,  138 }, {  138,  107 },
-    {  288,  288 }, {  289,  258 }, {   76,   45 }, {  230,  199 },
-    {  260,  229 }, {   45,   14 }, {  200,  169 }, {  139,  108 },
-    {  290,  259 }, {  108,   77 }, {  231,  200 }, {  320,  289 },
-    {  261,  230 }, {  170,  139 }, {   77,   46 }, {  291,  260 },
-    {   14,   14 }, {  321,  290 }, {  201,  170 }, {  262,  231 },
-    {  320,  320 }, {  171,  140 }, {  292,  261 }, {  232,  201 },
-    {  140,  109 }, {  322,  291 }, {  109,   78 }, {   46,   15 },
-    {  202,  171 }, {  263,  232 }, {  233,  202 }, {  293,  262 },
-    {  352,  321 }, {  323,  292 }, {   15,   15 }, {   78,   47 },
-    {  203,  172 }, {  264,  233 }, {  294,  263 }, {  324,  293 },
-    {  172,  141 }, {  353,  322 }, {  141,  110 }, {  234,  203 },
-    {  352,  352 }, {   47,   16 }, {  295,  264 }, {  110,   79 },
-    {  265,  234 }, {  354,  323 }, {  325,  294 }, {   79,   48 },
-    {   16,   16 }, {  204,  173 }, {  235,  204 }, {  173,  142 },
-    {  355,  324 }, {  384,  353 }, {  326,  295 }, {  142,  111 },
-    {  296,  265 }, {  266,  235 }, {  356,  325 }, {  385,  354 },
-    {  111,   80 }, {   48,   17 }, {  327,  296 }, {  297,  266 },
-    {  205,  174 }, {  384,  384 }, {  236,  205 }, {  357,  326 },
-    {  386,  355 }, {   80,   49 }, {  174,  143 }, {   17,   17 },
-    {  328,  297 }, {  358,  327 }, {  387,  356 }, {  298,  267 },
-    {  329,  298 }, {  388,  357 }, {  112,   81 }, {  416,  385 },
-    {  237,  206 }, {  359,  328 }, {   49,   18 }, {  206,  175 },
-    {  417,  386 }, {  389,  358 }, {  330,  299 }, {   18,   18 },
-    {  416,  416 }, {  360,  329 }, {   81,   50 }, {  418,  387 },
-    {  390,  359 }, {  238,  207 }, {   50,   19 }, {  361,  330 },
-    {  419,  388 }, {  113,   82 }, {  448,  417 }, {  448,  448 },
-    {  420,  389 }, {   82,   51 }, {  362,  331 }, {  449,  418 },
-    {  421,  390 }, {  480,  480 }, {  450,  419 }, {  422,  391 },
-    {  114,   83 }, {  451,  420 }, {  480,  449 }, {  452,  421 },
-    {  481,  450 }, {  453,  422 }, {  512,  512 }, {  482,  451 },
-    {  454,  423 }, {  512,  481 }, {  483,  452 }, {  513,  482 },
-    {  484,  453 }, {  514,  483 }, {  485,  454 }, {  544,  513 },
-    {  544,  544 }, {  486,  455 }, {  545,  514 }, {  546,  515 },
-    {  576,  576 }, {  576,  545 }, {  577,  546 }, {  578,  547 },
-    {  608,  577 }, {  609,  578 }, {  610,  579 }, {   19,   19 },
-    {  143,  112 }, {  267,  236 }, {  391,  360 }, {  515,  484 },
-    {  608,  608 }, {   20,   20 }, {   51,   20 }, {  144,  113 },
-    {  175,  144 }, {  268,  237 }, {  299,  268 }, {  392,  361 },
-    {  423,  392 }, {  516,  485 }, {  547,  516 }, {  640,  609 },
-    {  640,  640 }, {   21,   21 }, {   52,   21 }, {   83,   52 },
-    {  145,  114 }, {  176,  145 }, {  207,  176 }, {  269,  238 },
-    {  300,  269 }, {  331,  300 }, {  393,  362 }, {  424,  393 },
-    {  455,  424 }, {  517,  486 }, {  548,  517 }, {  579,  548 },
-    {  641,  610 }, {  672,  641 }, {  672,  672 }, {   22,   22 },
-    {   53,   22 }, {   84,   53 }, {  115,   84 }, {  146,  115 },
-    {  177,  146 }, {  208,  177 }, {  239,  208 }, {  270,  239 },
-    {  301,  270 }, {  332,  301 }, {  363,  332 }, {  394,  363 },
-    {  425,  394 }, {  456,  425 }, {  487,  456 }, {  518,  487 },
-    {  549,  518 }, {  580,  549 }, {  611,  580 }, {  642,  611 },
-    {  673,  642 }, {  704,  673 }, {  704,  704 }, {   54,   23 },
-    {   85,   54 }, {  116,   85 }, {  178,  147 }, {  209,  178 },
-    {  240,  209 }, {  302,  271 }, {  333,  302 }, {  364,  333 },
-    {  426,  395 }, {  457,  426 }, {  488,  457 }, {  550,  519 },
-    {  581,  550 }, {  612,  581 }, {  674,  643 }, {  705,  674 },
-    {  736,  705 }, {   86,   55 }, {  117,   86 }, {  210,  179 },
-    {  241,  210 }, {  334,  303 }, {  365,  334 }, {  458,  427 },
-    {  489,  458 }, {  582,  551 }, {  613,  582 }, {  706,  675 },
-    {  737,  706 }, {  118,   87 }, {  242,  211 }, {  366,  335 },
-    {  490,  459 }, {  614,  583 }, {  738,  707 }, {   23,   23 },
-    {  147,  116 }, {  271,  240 }, {  395,  364 }, {  519,  488 },
-    {  643,  612 }, {  736,  736 }, {   24,   24 }, {   55,   24 },
-    {  148,  117 }, {  179,  148 }, {  272,  241 }, {  303,  272 },
-    {  396,  365 }, {  427,  396 }, {  520,  489 }, {  551,  520 },
-    {  644,  613 }, {  675,  644 }, {  768,  737 }, {  768,  768 },
-    {   25,   25 }, {   56,   25 }, {   87,   56 }, {  149,  118 },
-    {  180,  149 }, {  211,  180 }, {  273,  242 }, {  304,  273 },
-    {  335,  304 }, {  397,  366 }, {  428,  397 }, {  459,  428 },
-    {  521,  490 }, {  552,  521 }, {  583,  552 }, {  645,  614 },
-    {  676,  645 }, {  707,  676 }, {  769,  738 }, {  800,  769 },
-    {  800,  800 }, {   26,   26 }, {   57,   26 }, {   88,   57 },
-    {  119,   88 }, {  150,  119 }, {  181,  150 }, {  212,  181 },
-    {  243,  212 }, {  274,  243 }, {  305,  274 }, {  336,  305 },
-    {  367,  336 }, {  398,  367 }, {  429,  398 }, {  460,  429 },
-    {  491,  460 }, {  522,  491 }, {  553,  522 }, {  584,  553 },
-    {  615,  584 }, {  646,  615 }, {  677,  646 }, {  708,  677 },
-    {  739,  708 }, {  770,  739 }, {  801,  770 }, {  832,  801 },
-    {  832,  832 }, {   58,   27 }, {   89,   58 }, {  120,   89 },
-    {  182,  151 }, {  213,  182 }, {  244,  213 }, {  306,  275 },
-    {  337,  306 }, {  368,  337 }, {  430,  399 }, {  461,  430 },
-    {  492,  461 }, {  554,  523 }, {  585,  554 }, {  616,  585 },
-    {  678,  647 }, {  709,  678 }, {  740,  709 }, {  802,  771 },
-    {  833,  802 }, {  864,  833 }, {   90,   59 }, {  121,   90 },
-    {  214,  183 }, {  245,  214 }, {  338,  307 }, {  369,  338 },
-    {  462,  431 }, {  493,  462 }, {  586,  555 }, {  617,  586 },
-    {  710,  679 }, {  741,  710 }, {  834,  803 }, {  865,  834 },
-    {  122,   91 }, {  246,  215 }, {  370,  339 }, {  494,  463 },
-    {  618,  587 }, {  742,  711 }, {  866,  835 }, {   27,   27 },
-    {  151,  120 }, {  275,  244 }, {  399,  368 }, {  523,  492 },
-    {  647,  616 }, {  771,  740 }, {  864,  864 }, {   28,   28 },
-    {   59,   28 }, {  152,  121 }, {  183,  152 }, {  276,  245 },
-    {  307,  276 }, {  400,  369 }, {  431,  400 }, {  524,  493 },
-    {  555,  524 }, {  648,  617 }, {  679,  648 }, {  772,  741 },
-    {  803,  772 }, {  896,  865 }, {  896,  896 }, {   29,   29 },
-    {   60,   29 }, {   91,   60 }, {  153,  122 }, {  184,  153 },
-    {  215,  184 }, {  277,  246 }, {  308,  277 }, {  339,  308 },
-    {  401,  370 }, {  432,  401 }, {  463,  432 }, {  525,  494 },
-    {  556,  525 }, {  587,  556 }, {  649,  618 }, {  680,  649 },
-    {  711,  680 }, {  773,  742 }, {  804,  773 }, {  835,  804 },
-    {  897,  866 }, {  928,  897 }, {  928,  928 }, {   30,   30 },
-    {   61,   30 }, {   92,   61 }, {  123,   92 }, {  154,  123 },
-    {  185,  154 }, {  216,  185 }, {  247,  216 }, {  278,  247 },
-    {  309,  278 }, {  340,  309 }, {  371,  340 }, {  402,  371 },
-    {  433,  402 }, {  464,  433 }, {  495,  464 }, {  526,  495 },
-    {  557,  526 }, {  588,  557 }, {  619,  588 }, {  650,  619 },
-    {  681,  650 }, {  712,  681 }, {  743,  712 }, {  774,  743 },
-    {  805,  774 }, {  836,  805 }, {  867,  836 }, {  898,  867 },
-    {  929,  898 }, {  960,  929 }, {  960,  960 }, {   62,   31 },
-    {   93,   62 }, {  124,   93 }, {  186,  155 }, {  217,  186 },
-    {  248,  217 }, {  310,  279 }, {  341,  310 }, {  372,  341 },
-    {  434,  403 }, {  465,  434 }, {  496,  465 }, {  558,  527 },
-    {  589,  558 }, {  620,  589 }, {  682,  651 }, {  713,  682 },
-    {  744,  713 }, {  806,  775 }, {  837,  806 }, {  868,  837 },
-    {  930,  899 }, {  961,  930 }, {  992,  961 }, {   94,   63 },
-    {  125,   94 }, {  218,  187 }, {  249,  218 }, {  342,  311 },
-    {  373,  342 }, {  466,  435 }, {  497,  466 }, {  590,  559 },
-    {  621,  590 }, {  714,  683 }, {  745,  714 }, {  838,  807 },
-    {  869,  838 }, {  962,  931 }, {  993,  962 }, {  126,   95 },
-    {  250,  219 }, {  374,  343 }, {  498,  467 }, {  622,  591 },
-    {  746,  715 }, {  870,  839 }, {  994,  963 }, {  155,  124 },
-    {  279,  248 }, {  403,  372 }, {  527,  496 }, {  651,  620 },
-    {  775,  744 }, {  899,  868 }, {  156,  125 }, {  187,  156 },
-    {  280,  249 }, {  311,  280 }, {  404,  373 }, {  435,  404 },
-    {  528,  497 }, {  559,  528 }, {  652,  621 }, {  683,  652 },
-    {  776,  745 }, {  807,  776 }, {  900,  869 }, {  931,  900 },
-    {  157,  126 }, {  188,  157 }, {  219,  188 }, {  281,  250 },
-    {  312,  281 }, {  343,  312 }, {  405,  374 }, {  436,  405 },
-    {  467,  436 }, {  529,  498 }, {  560,  529 }, {  591,  560 },
-    {  653,  622 }, {  684,  653 }, {  715,  684 }, {  777,  746 },
-    {  808,  777 }, {  839,  808 }, {  901,  870 }, {  932,  901 },
-    {  963,  932 }, {  158,  127 }, {  189,  158 }, {  220,  189 },
-    {  251,  220 }, {  282,  251 }, {  313,  282 }, {  344,  313 },
-    {  375,  344 }, {  406,  375 }, {  437,  406 }, {  468,  437 },
-    {  499,  468 }, {  530,  499 }, {  561,  530 }, {  592,  561 },
-    {  623,  592 }, {  654,  623 }, {  685,  654 }, {  716,  685 },
-    {  747,  716 }, {  778,  747 }, {  809,  778 }, {  840,  809 },
-    {  871,  840 }, {  902,  871 }, {  933,  902 }, {  964,  933 },
-    {  995,  964 }, {  190,  159 }, {  221,  190 }, {  252,  221 },
-    {  314,  283 }, {  345,  314 }, {  376,  345 }, {  438,  407 },
-    {  469,  438 }, {  500,  469 }, {  562,  531 }, {  593,  562 },
-    {  624,  593 }, {  686,  655 }, {  717,  686 }, {  748,  717 },
-    {  810,  779 }, {  841,  810 }, {  872,  841 }, {  934,  903 },
-    {  965,  934 }, {  996,  965 }, {  222,  191 }, {  253,  222 },
-    {  346,  315 }, {  377,  346 }, {  470,  439 }, {  501,  470 },
-    {  594,  563 }, {  625,  594 }, {  718,  687 }, {  749,  718 },
-    {  842,  811 }, {  873,  842 }, {  966,  935 }, {  997,  966 },
-    {  254,  223 }, {  378,  347 }, {  502,  471 }, {  626,  595 },
-    {  750,  719 }, {  874,  843 }, {  998,  967 }, {  283,  252 },
-    {  407,  376 }, {  531,  500 }, {  655,  624 }, {  779,  748 },
-    {  903,  872 }, {  284,  253 }, {  315,  284 }, {  408,  377 },
-    {  439,  408 }, {  532,  501 }, {  563,  532 }, {  656,  625 },
-    {  687,  656 }, {  780,  749 }, {  811,  780 }, {  904,  873 },
-    {  935,  904 }, {  285,  254 }, {  316,  285 }, {  347,  316 },
-    {  409,  378 }, {  440,  409 }, {  471,  440 }, {  533,  502 },
-    {  564,  533 }, {  595,  564 }, {  657,  626 }, {  688,  657 },
-    {  719,  688 }, {  781,  750 }, {  812,  781 }, {  843,  812 },
-    {  905,  874 }, {  936,  905 }, {  967,  936 }, {  286,  255 },
-    {  317,  286 }, {  348,  317 }, {  379,  348 }, {  410,  379 },
-    {  441,  410 }, {  472,  441 }, {  503,  472 }, {  534,  503 },
-    {  565,  534 }, {  596,  565 }, {  627,  596 }, {  658,  627 },
-    {  689,  658 }, {  720,  689 }, {  751,  720 }, {  782,  751 },
-    {  813,  782 }, {  844,  813 }, {  875,  844 }, {  906,  875 },
-    {  937,  906 }, {  968,  937 }, {  999,  968 }, {  318,  287 },
-    {  349,  318 }, {  380,  349 }, {  442,  411 }, {  473,  442 },
-    {  504,  473 }, {  566,  535 }, {  597,  566 }, {  628,  597 },
-    {  690,  659 }, {  721,  690 }, {  752,  721 }, {  814,  783 },
-    {  845,  814 }, {  876,  845 }, {  938,  907 }, {  969,  938 },
-    { 1000,  969 }, {  350,  319 }, {  381,  350 }, {  474,  443 },
-    {  505,  474 }, {  598,  567 }, {  629,  598 }, {  722,  691 },
-    {  753,  722 }, {  846,  815 }, {  877,  846 }, {  970,  939 },
-    { 1001,  970 }, {  382,  351 }, {  506,  475 }, {  630,  599 },
-    {  754,  723 }, {  878,  847 }, { 1002,  971 }, {  411,  380 },
-    {  535,  504 }, {  659,  628 }, {  783,  752 }, {  907,  876 },
-    {  412,  381 }, {  443,  412 }, {  536,  505 }, {  567,  536 },
-    {  660,  629 }, {  691,  660 }, {  784,  753 }, {  815,  784 },
-    {  908,  877 }, {  939,  908 }, {  413,  382 }, {  444,  413 },
-    {  475,  444 }, {  537,  506 }, {  568,  537 }, {  599,  568 },
-    {  661,  630 }, {  692,  661 }, {  723,  692 }, {  785,  754 },
-    {  816,  785 }, {  847,  816 }, {  909,  878 }, {  940,  909 },
-    {  971,  940 }, {  414,  383 }, {  445,  414 }, {  476,  445 },
-    {  507,  476 }, {  538,  507 }, {  569,  538 }, {  600,  569 },
-    {  631,  600 }, {  662,  631 }, {  693,  662 }, {  724,  693 },
-    {  755,  724 }, {  786,  755 }, {  817,  786 }, {  848,  817 },
-    {  879,  848 }, {  910,  879 }, {  941,  910 }, {  972,  941 },
-    { 1003,  972 }, {  446,  415 }, {  477,  446 }, {  508,  477 },
-    {  570,  539 }, {  601,  570 }, {  632,  601 }, {  694,  663 },
-    {  725,  694 }, {  756,  725 }, {  818,  787 }, {  849,  818 },
-    {  880,  849 }, {  942,  911 }, {  973,  942 }, { 1004,  973 },
-    {  478,  447 }, {  509,  478 }, {  602,  571 }, {  633,  602 },
-    {  726,  695 }, {  757,  726 }, {  850,  819 }, {  881,  850 },
-    {  974,  943 }, { 1005,  974 }, {  510,  479 }, {  634,  603 },
-    {  758,  727 }, {  882,  851 }, { 1006,  975 }, {  539,  508 },
-    {  663,  632 }, {  787,  756 }, {  911,  880 }, {  540,  509 },
-    {  571,  540 }, {  664,  633 }, {  695,  664 }, {  788,  757 },
-    {  819,  788 }, {  912,  881 }, {  943,  912 }, {  541,  510 },
-    {  572,  541 }, {  603,  572 }, {  665,  634 }, {  696,  665 },
-    {  727,  696 }, {  789,  758 }, {  820,  789 }, {  851,  820 },
-    {  913,  882 }, {  944,  913 }, {  975,  944 }, {  542,  511 },
-    {  573,  542 }, {  604,  573 }, {  635,  604 }, {  666,  635 },
-    {  697,  666 }, {  728,  697 }, {  759,  728 }, {  790,  759 },
-    {  821,  790 }, {  852,  821 }, {  883,  852 }, {  914,  883 },
-    {  945,  914 }, {  976,  945 }, { 1007,  976 }, {  574,  543 },
-    {  605,  574 }, {  636,  605 }, {  698,  667 }, {  729,  698 },
-    {  760,  729 }, {  822,  791 }, {  853,  822 }, {  884,  853 },
-    {  946,  915 }, {  977,  946 }, { 1008,  977 }, {  606,  575 },
-    {  637,  606 }, {  730,  699 }, {  761,  730 }, {  854,  823 },
-    {  885,  854 }, {  978,  947 }, { 1009,  978 }, {  638,  607 },
-    {  762,  731 }, {  886,  855 }, { 1010,  979 }, {  667,  636 },
-    {  791,  760 }, {  915,  884 }, {  668,  637 }, {  699,  668 },
-    {  792,  761 }, {  823,  792 }, {  916,  885 }, {  947,  916 },
-    {  669,  638 }, {  700,  669 }, {  731,  700 }, {  793,  762 },
-    {  824,  793 }, {  855,  824 }, {  917,  886 }, {  948,  917 },
-    {  979,  948 }, {  670,  639 }, {  701,  670 }, {  732,  701 },
-    {  763,  732 }, {  794,  763 }, {  825,  794 }, {  856,  825 },
-    {  887,  856 }, {  918,  887 }, {  949,  918 }, {  980,  949 },
-    { 1011,  980 }, {  702,  671 }, {  733,  702 }, {  764,  733 },
-    {  826,  795 }, {  857,  826 }, {  888,  857 }, {  950,  919 },
-    {  981,  950 }, { 1012,  981 }, {  734,  703 }, {  765,  734 },
-    {  858,  827 }, {  889,  858 }, {  982,  951 }, { 1013,  982 },
-    {  766,  735 }, {  890,  859 }, { 1014,  983 }, {  795,  764 },
-    {  919,  888 }, {  796,  765 }, {  827,  796 }, {  920,  889 },
-    {  951,  920 }, {  797,  766 }, {  828,  797 }, {  859,  828 },
-    {  921,  890 }, {  952,  921 }, {  983,  952 }, {  798,  767 },
-    {  829,  798 }, {  860,  829 }, {  891,  860 }, {  922,  891 },
-    {  953,  922 }, {  984,  953 }, { 1015,  984 }, {  830,  799 },
-    {  861,  830 }, {  892,  861 }, {  954,  923 }, {  985,  954 },
-    { 1016,  985 }, {  862,  831 }, {  893,  862 }, {  986,  955 },
-    { 1017,  986 }, {  894,  863 }, { 1018,  987 }, {  923,  892 },
-    {  924,  893 }, {  955,  924 }, {  925,  894 }, {  956,  925 },
-    {  987,  956 }, {  926,  895 }, {  957,  926 }, {  988,  957 },
-    { 1019,  988 }, {  958,  927 }, {  989,  958 }, { 1020,  989 },
-    {  990,  959 }, { 1021,  990 }, { 1022,  991 }, {    0,    0 },
-};
-
-const int16_t (*ff_vp9_scans_nb[5][4])[2] = {
-    {
-        ff_vp9_default_scan_4x4_nb, ff_vp9_col_scan_4x4_nb,
-        ff_vp9_row_scan_4x4_nb, ff_vp9_default_scan_4x4_nb
-    }, {
-        ff_vp9_default_scan_8x8_nb, ff_vp9_col_scan_8x8_nb,
-        ff_vp9_row_scan_8x8_nb, ff_vp9_default_scan_8x8_nb
-    }, {
-        ff_vp9_default_scan_16x16_nb, ff_vp9_col_scan_16x16_nb,
-        ff_vp9_row_scan_16x16_nb, ff_vp9_default_scan_16x16_nb
-    }, {
-        ff_vp9_default_scan_32x32_nb, ff_vp9_default_scan_32x32_nb,
-        ff_vp9_default_scan_32x32_nb, ff_vp9_default_scan_32x32_nb
-    }, { // lossless
-        ff_vp9_default_scan_4x4_nb, ff_vp9_default_scan_4x4_nb,
-        ff_vp9_default_scan_4x4_nb, ff_vp9_default_scan_4x4_nb
-    }
-};
-
-const uint8_t ff_vp9_model_pareto8[256][8] = {
-    {   6,  86, 128,  11,  87,  42,  91,  52 },
-    {   3,  86, 128,   6,  86,  23,  88,  29 },
-    {   6,  86, 128,  11,  87,  42,  91,  52 },
-    {   9,  86, 129,  17,  88,  61,  94,  76 },
-    {  12,  86, 129,  22,  88,  77,  97,  93 },
-    {  15,  87, 129,  28,  89,  93, 100, 110 },
-    {  17,  87, 129,  33,  90, 105, 103, 123 },
-    {  20,  88, 130,  38,  91, 118, 106, 136 },
-    {  23,  88, 130,  43,  91, 128, 108, 146 },
-    {  26,  89, 131,  48,  92, 139, 111, 156 },
-    {  28,  89, 131,  53,  93, 147, 114, 163 },
-    {  31,  90, 131,  58,  94, 156, 117, 171 },
-    {  34,  90, 131,  62,  94, 163, 119, 177 },
-    {  37,  90, 132,  66,  95, 171, 122, 184 },
-    {  39,  90, 132,  70,  96, 177, 124, 189 },
-    {  42,  91, 132,  75,  97, 183, 127, 194 },
-    {  44,  91, 132,  79,  97, 188, 129, 198 },
-    {  47,  92, 133,  83,  98, 193, 132, 202 },
-    {  49,  92, 133,  86,  99, 197, 134, 205 },
-    {  52,  93, 133,  90, 100, 201, 137, 208 },
-    {  54,  93, 133,  94, 100, 204, 139, 211 },
-    {  57,  94, 134,  98, 101, 208, 142, 214 },
-    {  59,  94, 134, 101, 102, 211, 144, 216 },
-    {  62,  94, 135, 105, 103, 214, 146, 218 },
-    {  64,  94, 135, 108, 103, 216, 148, 220 },
-    {  66,  95, 135, 111, 104, 219, 151, 222 },
-    {  68,  95, 135, 114, 105, 221, 153, 223 },
-    {  71,  96, 136, 117, 106, 224, 155, 225 },
-    {  73,  96, 136, 120, 106, 225, 157, 226 },
-    {  76,  97, 136, 123, 107, 227, 159, 228 },
-    {  78,  97, 136, 126, 108, 229, 160, 229 },
-    {  80,  98, 137, 129, 109, 231, 162, 231 },
-    {  82,  98, 137, 131, 109, 232, 164, 232 },
-    {  84,  98, 138, 134, 110, 234, 166, 233 },
-    {  86,  98, 138, 137, 111, 235, 168, 234 },
-    {  89,  99, 138, 140, 112, 236, 170, 235 },
-    {  91,  99, 138, 142, 112, 237, 171, 235 },
-    {  93, 100, 139, 145, 113, 238, 173, 236 },
-    {  95, 100, 139, 147, 114, 239, 174, 237 },
-    {  97, 101, 140, 149, 115, 240, 176, 238 },
-    {  99, 101, 140, 151, 115, 241, 177, 238 },
-    { 101, 102, 140, 154, 116, 242, 179, 239 },
-    { 103, 102, 140, 156, 117, 242, 180, 239 },
-    { 105, 103, 141, 158, 118, 243, 182, 240 },
-    { 107, 103, 141, 160, 118, 243, 183, 240 },
-    { 109, 104, 141, 162, 119, 244, 185, 241 },
-    { 111, 104, 141, 164, 119, 244, 186, 241 },
-    { 113, 104, 142, 166, 120, 245, 187, 242 },
-    { 114, 104, 142, 168, 121, 245, 188, 242 },
-    { 116, 105, 143, 170, 122, 246, 190, 243 },
-    { 118, 105, 143, 171, 122, 246, 191, 243 },
-    { 120, 106, 143, 173, 123, 247, 192, 244 },
-    { 121, 106, 143, 175, 124, 247, 193, 244 },
-    { 123, 107, 144, 177, 125, 248, 195, 244 },
-    { 125, 107, 144, 178, 125, 248, 196, 244 },
-    { 127, 108, 145, 180, 126, 249, 197, 245 },
-    { 128, 108, 145, 181, 127, 249, 198, 245 },
-    { 130, 109, 145, 183, 128, 249, 199, 245 },
-    { 132, 109, 145, 184, 128, 249, 200, 245 },
-    { 134, 110, 146, 186, 129, 250, 201, 246 },
-    { 135, 110, 146, 187, 130, 250, 202, 246 },
-    { 137, 111, 147, 189, 131, 251, 203, 246 },
-    { 138, 111, 147, 190, 131, 251, 204, 246 },
-    { 140, 112, 147, 192, 132, 251, 205, 247 },
-    { 141, 112, 147, 193, 132, 251, 206, 247 },
-    { 143, 113, 148, 194, 133, 251, 207, 247 },
-    { 144, 113, 148, 195, 134, 251, 207, 247 },
-    { 146, 114, 149, 197, 135, 252, 208, 248 },
-    { 147, 114, 149, 198, 135, 252, 209, 248 },
-    { 149, 115, 149, 199, 136, 252, 210, 248 },
-    { 150, 115, 149, 200, 137, 252, 210, 248 },
-    { 152, 115, 150, 201, 138, 252, 211, 248 },
-    { 153, 115, 150, 202, 138, 252, 212, 248 },
-    { 155, 116, 151, 204, 139, 253, 213, 249 },
-    { 156, 116, 151, 205, 139, 253, 213, 249 },
-    { 158, 117, 151, 206, 140, 253, 214, 249 },
-    { 159, 117, 151, 207, 141, 253, 215, 249 },
-    { 161, 118, 152, 208, 142, 253, 216, 249 },
-    { 162, 118, 152, 209, 142, 253, 216, 249 },
-    { 163, 119, 153, 210, 143, 253, 217, 249 },
-    { 164, 119, 153, 211, 143, 253, 217, 249 },
-    { 166, 120, 153, 212, 144, 254, 218, 250 },
-    { 167, 120, 153, 212, 145, 254, 219, 250 },
-    { 168, 121, 154, 213, 146, 254, 220, 250 },
-    { 169, 121, 154, 214, 146, 254, 220, 250 },
-    { 171, 122, 155, 215, 147, 254, 221, 250 },
-    { 172, 122, 155, 216, 147, 254, 221, 250 },
-    { 173, 123, 155, 217, 148, 254, 222, 250 },
-    { 174, 123, 155, 217, 149, 254, 222, 250 },
-    { 176, 124, 156, 218, 150, 254, 223, 250 },
-    { 177, 124, 156, 219, 150, 254, 223, 250 },
-    { 178, 125, 157, 220, 151, 254, 224, 251 },
-    { 179, 125, 157, 220, 151, 254, 224, 251 },
-    { 180, 126, 157, 221, 152, 254, 225, 251 },
-    { 181, 126, 157, 221, 152, 254, 225, 251 },
-    { 183, 127, 158, 222, 153, 254, 226, 251 },
-    { 184, 127, 158, 223, 154, 254, 226, 251 },
-    { 185, 128, 159, 224, 155, 255, 227, 251 },
-    { 186, 128, 159, 224, 155, 255, 227, 251 },
-    { 187, 129, 160, 225, 156, 255, 228, 251 },
-    { 188, 130, 160, 225, 156, 255, 228, 251 },
-    { 189, 131, 160, 226, 157, 255, 228, 251 },
-    { 190, 131, 160, 226, 158, 255, 228, 251 },
-    { 191, 132, 161, 227, 159, 255, 229, 251 },
-    { 192, 132, 161, 227, 159, 255, 229, 251 },
-    { 193, 133, 162, 228, 160, 255, 230, 252 },
-    { 194, 133, 162, 229, 160, 255, 230, 252 },
-    { 195, 134, 163, 230, 161, 255, 231, 252 },
-    { 196, 134, 163, 230, 161, 255, 231, 252 },
-    { 197, 135, 163, 231, 162, 255, 231, 252 },
-    { 198, 135, 163, 231, 162, 255, 231, 252 },
-    { 199, 136, 164, 232, 163, 255, 232, 252 },
-    { 200, 136, 164, 232, 164, 255, 232, 252 },
-    { 201, 137, 165, 233, 165, 255, 233, 252 },
-    { 201, 137, 165, 233, 165, 255, 233, 252 },
-    { 202, 138, 166, 233, 166, 255, 233, 252 },
-    { 203, 138, 166, 233, 166, 255, 233, 252 },
-    { 204, 139, 166, 234, 167, 255, 234, 252 },
-    { 205, 139, 166, 234, 167, 255, 234, 252 },
-    { 206, 140, 167, 235, 168, 255, 235, 252 },
-    { 206, 140, 167, 235, 168, 255, 235, 252 },
-    { 207, 141, 168, 236, 169, 255, 235, 252 },
-    { 208, 141, 168, 236, 170, 255, 235, 252 },
-    { 209, 142, 169, 237, 171, 255, 236, 252 },
-    { 209, 143, 169, 237, 171, 255, 236, 252 },
-    { 210, 144, 169, 237, 172, 255, 236, 252 },
-    { 211, 144, 169, 237, 172, 255, 236, 252 },
-    { 212, 145, 170, 238, 173, 255, 237, 252 },
-    { 213, 145, 170, 238, 173, 255, 237, 252 },
-    { 214, 146, 171, 239, 174, 255, 237, 253 },
-    { 214, 146, 171, 239, 174, 255, 237, 253 },
-    { 215, 147, 172, 240, 175, 255, 238, 253 },
-    { 215, 147, 172, 240, 175, 255, 238, 253 },
-    { 216, 148, 173, 240, 176, 255, 238, 253 },
-    { 217, 148, 173, 240, 176, 255, 238, 253 },
-    { 218, 149, 173, 241, 177, 255, 239, 253 },
-    { 218, 149, 173, 241, 178, 255, 239, 253 },
-    { 219, 150, 174, 241, 179, 255, 239, 253 },
-    { 219, 151, 174, 241, 179, 255, 239, 253 },
-    { 220, 152, 175, 242, 180, 255, 240, 253 },
-    { 221, 152, 175, 242, 180, 255, 240, 253 },
-    { 222, 153, 176, 242, 181, 255, 240, 253 },
-    { 222, 153, 176, 242, 181, 255, 240, 253 },
-    { 223, 154, 177, 243, 182, 255, 240, 253 },
-    { 223, 154, 177, 243, 182, 255, 240, 253 },
-    { 224, 155, 178, 244, 183, 255, 241, 253 },
-    { 224, 155, 178, 244, 183, 255, 241, 253 },
-    { 225, 156, 178, 244, 184, 255, 241, 253 },
-    { 225, 157, 178, 244, 184, 255, 241, 253 },
-    { 226, 158, 179, 244, 185, 255, 242, 253 },
-    { 227, 158, 179, 244, 185, 255, 242, 253 },
-    { 228, 159, 180, 245, 186, 255, 242, 253 },
-    { 228, 159, 180, 245, 186, 255, 242, 253 },
-    { 229, 160, 181, 245, 187, 255, 242, 253 },
-    { 229, 160, 181, 245, 187, 255, 242, 253 },
-    { 230, 161, 182, 246, 188, 255, 243, 253 },
-    { 230, 162, 182, 246, 188, 255, 243, 253 },
-    { 231, 163, 183, 246, 189, 255, 243, 253 },
-    { 231, 163, 183, 246, 189, 255, 243, 253 },
-    { 232, 164, 184, 247, 190, 255, 243, 253 },
-    { 232, 164, 184, 247, 190, 255, 243, 253 },
-    { 233, 165, 185, 247, 191, 255, 244, 253 },
-    { 233, 165, 185, 247, 191, 255, 244, 253 },
-    { 234, 166, 185, 247, 192, 255, 244, 253 },
-    { 234, 167, 185, 247, 192, 255, 244, 253 },
-    { 235, 168, 186, 248, 193, 255, 244, 253 },
-    { 235, 168, 186, 248, 193, 255, 244, 253 },
-    { 236, 169, 187, 248, 194, 255, 244, 253 },
-    { 236, 169, 187, 248, 194, 255, 244, 253 },
-    { 236, 170, 188, 248, 195, 255, 245, 253 },
-    { 236, 170, 188, 248, 195, 255, 245, 253 },
-    { 237, 171, 189, 249, 196, 255, 245, 254 },
-    { 237, 172, 189, 249, 196, 255, 245, 254 },
-    { 238, 173, 190, 249, 197, 255, 245, 254 },
-    { 238, 173, 190, 249, 197, 255, 245, 254 },
-    { 239, 174, 191, 249, 198, 255, 245, 254 },
-    { 239, 174, 191, 249, 198, 255, 245, 254 },
-    { 240, 175, 192, 249, 199, 255, 246, 254 },
-    { 240, 176, 192, 249, 199, 255, 246, 254 },
-    { 240, 177, 193, 250, 200, 255, 246, 254 },
-    { 240, 177, 193, 250, 200, 255, 246, 254 },
-    { 241, 178, 194, 250, 201, 255, 246, 254 },
-    { 241, 178, 194, 250, 201, 255, 246, 254 },
-    { 242, 179, 195, 250, 202, 255, 246, 254 },
-    { 242, 180, 195, 250, 202, 255, 246, 254 },
-    { 242, 181, 196, 250, 203, 255, 247, 254 },
-    { 242, 181, 196, 250, 203, 255, 247, 254 },
-    { 243, 182, 197, 251, 204, 255, 247, 254 },
-    { 243, 183, 197, 251, 204, 255, 247, 254 },
-    { 244, 184, 198, 251, 205, 255, 247, 254 },
-    { 244, 184, 198, 251, 205, 255, 247, 254 },
-    { 244, 185, 199, 251, 206, 255, 247, 254 },
-    { 244, 185, 199, 251, 206, 255, 247, 254 },
-    { 245, 186, 200, 251, 207, 255, 247, 254 },
-    { 245, 187, 200, 251, 207, 255, 247, 254 },
-    { 246, 188, 201, 252, 207, 255, 248, 254 },
-    { 246, 188, 201, 252, 207, 255, 248, 254 },
-    { 246, 189, 202, 252, 208, 255, 248, 254 },
-    { 246, 190, 202, 252, 208, 255, 248, 254 },
-    { 247, 191, 203, 252, 209, 255, 248, 254 },
-    { 247, 191, 203, 252, 209, 255, 248, 254 },
-    { 247, 192, 204, 252, 210, 255, 248, 254 },
-    { 247, 193, 204, 252, 210, 255, 248, 254 },
-    { 248, 194, 205, 252, 211, 255, 248, 254 },
-    { 248, 194, 205, 252, 211, 255, 248, 254 },
-    { 248, 195, 206, 252, 212, 255, 249, 254 },
-    { 248, 196, 206, 252, 212, 255, 249, 254 },
-    { 249, 197, 207, 253, 213, 255, 249, 254 },
-    { 249, 197, 207, 253, 213, 255, 249, 254 },
-    { 249, 198, 208, 253, 214, 255, 249, 254 },
-    { 249, 199, 209, 253, 214, 255, 249, 254 },
-    { 250, 200, 210, 253, 215, 255, 249, 254 },
-    { 250, 200, 210, 253, 215, 255, 249, 254 },
-    { 250, 201, 211, 253, 215, 255, 249, 254 },
-    { 250, 202, 211, 253, 215, 255, 249, 254 },
-    { 250, 203, 212, 253, 216, 255, 249, 254 },
-    { 250, 203, 212, 253, 216, 255, 249, 254 },
-    { 251, 204, 213, 253, 217, 255, 250, 254 },
-    { 251, 205, 213, 253, 217, 255, 250, 254 },
-    { 251, 206, 214, 254, 218, 255, 250, 254 },
-    { 251, 206, 215, 254, 218, 255, 250, 254 },
-    { 252, 207, 216, 254, 219, 255, 250, 254 },
-    { 252, 208, 216, 254, 219, 255, 250, 254 },
-    { 252, 209, 217, 254, 220, 255, 250, 254 },
-    { 252, 210, 217, 254, 220, 255, 250, 254 },
-    { 252, 211, 218, 254, 221, 255, 250, 254 },
-    { 252, 212, 218, 254, 221, 255, 250, 254 },
-    { 253, 213, 219, 254, 222, 255, 250, 254 },
-    { 253, 213, 220, 254, 222, 255, 250, 254 },
-    { 253, 214, 221, 254, 223, 255, 250, 254 },
-    { 253, 215, 221, 254, 223, 255, 250, 254 },
-    { 253, 216, 222, 254, 224, 255, 251, 254 },
-    { 253, 217, 223, 254, 224, 255, 251, 254 },
-    { 253, 218, 224, 254, 225, 255, 251, 254 },
-    { 253, 219, 224, 254, 225, 255, 251, 254 },
-    { 254, 220, 225, 254, 225, 255, 251, 254 },
-    { 254, 221, 226, 254, 225, 255, 251, 254 },
-    { 254, 222, 227, 255, 226, 255, 251, 254 },
-    { 254, 223, 227, 255, 226, 255, 251, 254 },
-    { 254, 224, 228, 255, 227, 255, 251, 254 },
-    { 254, 225, 229, 255, 227, 255, 251, 254 },
-    { 254, 226, 230, 255, 228, 255, 251, 254 },
-    { 254, 227, 230, 255, 229, 255, 251, 254 },
-    { 255, 228, 231, 255, 230, 255, 251, 254 },
-    { 255, 229, 232, 255, 230, 255, 251, 254 },
-    { 255, 230, 233, 255, 231, 255, 252, 254 },
-    { 255, 231, 234, 255, 231, 255, 252, 254 },
-    { 255, 232, 235, 255, 232, 255, 252, 254 },
-    { 255, 233, 236, 255, 232, 255, 252, 254 },
-    { 255, 235, 237, 255, 233, 255, 252, 254 },
-    { 255, 236, 238, 255, 234, 255, 252, 254 },
-    { 255, 238, 240, 255, 235, 255, 252, 255 },
-    { 255, 239, 241, 255, 235, 255, 252, 254 },
-    { 255, 241, 243, 255, 236, 255, 252, 254 },
-    { 255, 243, 245, 255, 237, 255, 252, 254 },
-    { 255, 246, 247, 255, 239, 255, 253, 255 },
-};
-
-const ProbContext ff_vp9_default_probs = {
-    { /* y_mode */
-        {  65,  32,  18, 144, 162, 194,  41,  51,  98 } /* bsize < 8x8 */,
-        { 132,  68,  18, 165, 217, 196,  45,  40,  78 } /* bsize < 16x16 */,
-        { 173,  80,  19, 176, 240, 193,  64,  35,  46 } /* bsize < 32x32 */,
-        { 221, 135,  38, 194, 248, 121,  96,  85,  29 } /* bsize >= 32x32 */
-    }, { /* uv_mode */
-        {  48,  12, 154, 155, 139,  90,  34, 117, 119 } /* y = v */,
-        {  67,   6,  25, 204, 243, 158,  13,  21,  96 } /* y = h */,
-        { 120,   7,  76, 176, 208, 126,  28,  54, 103 } /* y = dc */,
-        {  97,   5,  44, 131, 176, 139,  48,  68,  97 } /* y = d45 */,
-        {  83,   5,  42, 156, 111, 152,  26,  49, 152 } /* y = d135 */,
-        {  80,   5,  58, 178,  74,  83,  33,  62, 145 } /* y = d117 */,
-        {  86,   5,  32, 154, 192, 168,  14,  22, 163 } /* y = d153 */,
-        {  77,   7,  64, 116, 132, 122,  37, 126, 120 } /* y = d63 */,
-        {  85,   5,  32, 156, 216, 148,  19,  29,  73 } /* y = d27 */,
-        { 101,  21, 107, 181, 192, 103,  19,  67, 125 } /* y = tm */
-    }, { /* filter */
-        { 235, 162, },
-        {  36, 255, },
-        {  34,   3, },
-        { 149, 144, },
-    }, { /* mv_mode */
-        {  2, 173,  34 },  // 0 = both zero mv
-        {  7, 145,  85 },  // 1 = one zero mv + one a predicted mv
-        {  7, 166,  63 },  // 2 = two predicted mvs
-        {  7,  94,  66 },  // 3 = one predicted/zero and one new mv
-        {  8,  64,  46 },  // 4 = two new mvs
-        { 17,  81,  31 },  // 5 = one intra neighbor + x
-        { 25,  29,  30 },  // 6 = two intra neighbors
-    }, { /* intra */
-        9, 102, 187, 225
-    }, { /* comp */
-        239, 183, 119,  96,  41
-    }, { /* single_ref */
-        {  33,  16 },
-        {  77,  74 },
-        { 142, 142 },
-        { 172, 170 },
-        { 238, 247 }
-    }, { /* comp_ref */
-        50, 126, 123, 221, 226
-    }, { /* tx32p */
-        { 3, 136, 37, },
-        { 5,  52, 13, },
-    }, { /* tx16p */
-        { 20, 152, },
-        { 15, 101, },
-    }, { /* tx8p */
-        100, 66
-    }, { /* skip */
-        192, 128, 64
-    }, { /* mv_joint */
-        32, 64, 96
-    }, {
-        { /* mv vertical component */
-            128, /* sign */
-            { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 }, /* class */
-            216, /* class0 */
-            { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, /* bits */
-            { /* class0_fp */
-                { 128, 128, 64 },
-                {  96, 112, 64 }
-            },
-            { 64, 96, 64 }, /* fp */
-            160, /* class0_hp bit */
-            128, /* hp */
-        }, { /* mv horizontal component */
-            128, /* sign */
-            { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 }, /* class */
-            208, /* class0 */
-            { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, /* bits */
-            { /* class0_fp */
-                { 128, 128, 64 },
-                {  96, 112, 64 }
-            },
-            { 64, 96, 64 }, /* fp */
-            160, /* class0_hp bit */
-            128, /* hp */
-        }
-    }, { /* partition */
-        { /* 64x64 -> 32x32 */
-            { 222,  34,  30 } /* a/l both not split */,
-            {  72,  16,  44 } /* a split, l not split */,
-            {  58,  32,  12 } /* l split, a not split */,
-            {  10,   7,   6 } /* a/l both split */,
-        }, { /* 32x32 -> 16x16 */
-            { 177,  58,  59 } /* a/l both not split */,
-            {  68,  26,  63 } /* a split, l not split */,
-            {  52,  79,  25 } /* l split, a not split */,
-            {  17,  14,  12 } /* a/l both split */,
-        }, { /* 16x16 -> 8x8 */
-            { 174,  73,  87 } /* a/l both not split */,
-            {  92,  41,  83 } /* a split, l not split */,
-            {  82,  99,  50 } /* l split, a not split */,
-            {  53,  39,  39 } /* a/l both split */,
-        }, { /* 8x8 -> 4x4 */
-            { 199, 122, 141 } /* a/l both not split */,
-            { 147,  63, 159 } /* a split, l not split */,
-            { 148, 133, 118 } /* l split, a not split */,
-            { 121, 104, 114 } /* a/l both split */,
-        }
-    },
-};
-
-const uint8_t ff_vp9_default_coef_probs[4][2][2][6][6][3] = {
-    { /* tx = 4x4 */
-        { /* block Type 0 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    { 195,  29, 183 },
-                    {  84,  49, 136 },
-                    {   8,  42,  71 }
-                }, { /* Coeff Band 1 */
-                    {  31, 107, 169 },
-                    {  35,  99, 159 },
-                    {  17,  82, 140 },
-                    {   8,  66, 114 },
-                    {   2,  44,  76 },
-                    {   1,  19,  32 }
-                }, { /* Coeff Band 2 */
-                    {  40, 132, 201 },
-                    {  29, 114, 187 },
-                    {  13,  91, 157 },
-                    {   7,  75, 127 },
-                    {   3,  58,  95 },
-                    {   1,  28,  47 }
-                }, { /* Coeff Band 3 */
-                    {  69, 142, 221 },
-                    {  42, 122, 201 },
-                    {  15,  91, 159 },
-                    {   6,  67, 121 },
-                    {   1,  42,  77 },
-                    {   1,  17,  31 }
-                }, { /* Coeff Band 4 */
-                    { 102, 148, 228 },
-                    {  67, 117, 204 },
-                    {  17,  82, 154 },
-                    {   6,  59, 114 },
-                    {   2,  39,  75 },
-                    {   1,  15,  29 }
-                }, { /* Coeff Band 5 */
-                    { 156,  57, 233 },
-                    { 119,  57, 212 },
-                    {  58,  48, 163 },
-                    {  29,  40, 124 },
-                    {  12,  30,  81 },
-                    {   3,  12,  31 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    { 191, 107, 226 },
-                    { 124, 117, 204 },
-                    {  25,  99, 155 }
-                }, { /* Coeff Band 1 */
-                    {  29, 148, 210 },
-                    {  37, 126, 194 },
-                    {   8,  93, 157 },
-                    {   2,  68, 118 },
-                    {   1,  39,  69 },
-                    {   1,  17,  33 }
-                }, { /* Coeff Band 2 */
-                    {  41, 151, 213 },
-                    {  27, 123, 193 },
-                    {   3,  82, 144 },
-                    {   1,  58, 105 },
-                    {   1,  32,  60 },
-                    {   1,  13,  26 }
-                }, { /* Coeff Band 3 */
-                    {  59, 159, 220 },
-                    {  23, 126, 198 },
-                    {   4,  88, 151 },
-                    {   1,  66, 114 },
-                    {   1,  38,  71 },
-                    {   1,  18,  34 }
-                }, { /* Coeff Band 4 */
-                    { 114, 136, 232 },
-                    {  51, 114, 207 },
-                    {  11,  83, 155 },
-                    {   3,  56, 105 },
-                    {   1,  33,  65 },
-                    {   1,  17,  34 }
-                }, { /* Coeff Band 5 */
-                    { 149,  65, 234 },
-                    { 121,  57, 215 },
-                    {  61,  49, 166 },
-                    {  28,  36, 114 },
-                    {  12,  25,  76 },
-                    {   3,  16,  42 }
-                }
-            }
-        }, { /* block Type 1 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    { 214,  49, 220 },
-                    { 132,  63, 188 },
-                    {  42,  65, 137 }
-                }, { /* Coeff Band 1 */
-                    {  85, 137, 221 },
-                    { 104, 131, 216 },
-                    {  49, 111, 192 },
-                    {  21,  87, 155 },
-                    {   2,  49,  87 },
-                    {   1,  16,  28 }
-                }, { /* Coeff Band 2 */
-                    {  89, 163, 230 },
-                    {  90, 137, 220 },
-                    {  29, 100, 183 },
-                    {  10,  70, 135 },
-                    {   2,  42,  81 },
-                    {   1,  17,  33 }
-                }, { /* Coeff Band 3 */
-                    { 108, 167, 237 },
-                    {  55, 133, 222 },
-                    {  15,  97, 179 },
-                    {   4,  72, 135 },
-                    {   1,  45,  85 },
-                    {   1,  19,  38 }
-                }, { /* Coeff Band 4 */
-                    { 124, 146, 240 },
-                    {  66, 124, 224 },
-                    {  17,  88, 175 },
-                    {   4,  58, 122 },
-                    {   1,  36,  75 },
-                    {   1,  18,  37 }
-                }, { /* Coeff Band 5 */
-                    { 141,  79, 241 },
-                    { 126,  70, 227 },
-                    {  66,  58, 182 },
-                    {  30,  44, 136 },
-                    {  12,  34,  96 },
-                    {   2,  20,  47 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    { 229,  99, 249 },
-                    { 143, 111, 235 },
-                    {  46, 109, 192 }
-                }, { /* Coeff Band 1 */
-                    {  82, 158, 236 },
-                    {  94, 146, 224 },
-                    {  25, 117, 191 },
-                    {   9,  87, 149 },
-                    {   3,  56,  99 },
-                    {   1,  33,  57 }
-                }, { /* Coeff Band 2 */
-                    {  83, 167, 237 },
-                    {  68, 145, 222 },
-                    {  10, 103, 177 },
-                    {   2,  72, 131 },
-                    {   1,  41,  79 },
-                    {   1,  20,  39 }
-                }, { /* Coeff Band 3 */
-                    {  99, 167, 239 },
-                    {  47, 141, 224 },
-                    {  10, 104, 178 },
-                    {   2,  73, 133 },
-                    {   1,  44,  85 },
-                    {   1,  22,  47 }
-                }, { /* Coeff Band 4 */
-                    { 127, 145, 243 },
-                    {  71, 129, 228 },
-                    {  17,  93, 177 },
-                    {   3,  61, 124 },
-                    {   1,  41,  84 },
-                    {   1,  21,  52 }
-                }, { /* Coeff Band 5 */
-                    { 157,  78, 244 },
-                    { 140,  72, 231 },
-                    {  69,  58, 184 },
-                    {  31,  44, 137 },
-                    {  14,  38, 105 },
-                    {   8,  23,  61 }
-                }
-            }
-        }
-    }, { /* tx = 8x8 */
-        { /* block Type 0 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    { 125,  34, 187 },
-                    {  52,  41, 133 },
-                    {   6,  31,  56 }
-                }, { /* Coeff Band 1 */
-                    {  37, 109, 153 },
-                    {  51, 102, 147 },
-                    {  23,  87, 128 },
-                    {   8,  67, 101 },
-                    {   1,  41,  63 },
-                    {   1,  19,  29 }
-                }, { /* Coeff Band 2 */
-                    {  31, 154, 185 },
-                    {  17, 127, 175 },
-                    {   6,  96, 145 },
-                    {   2,  73, 114 },
-                    {   1,  51,  82 },
-                    {   1,  28,  45 }
-                }, { /* Coeff Band 3 */
-                    {  23, 163, 200 },
-                    {  10, 131, 185 },
-                    {   2,  93, 148 },
-                    {   1,  67, 111 },
-                    {   1,  41,  69 },
-                    {   1,  14,  24 }
-                }, { /* Coeff Band 4 */
-                    {  29, 176, 217 },
-                    {  12, 145, 201 },
-                    {   3, 101, 156 },
-                    {   1,  69, 111 },
-                    {   1,  39,  63 },
-                    {   1,  14,  23 }
-                }, { /* Coeff Band 5 */
-                    {  57, 192, 233 },
-                    {  25, 154, 215 },
-                    {   6, 109, 167 },
-                    {   3,  78, 118 },
-                    {   1,  48,  69 },
-                    {   1,  21,  29 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    { 202, 105, 245 },
-                    { 108, 106, 216 },
-                    {  18,  90, 144 }
-                }, { /* Coeff Band 1 */
-                    {  33, 172, 219 },
-                    {  64, 149, 206 },
-                    {  14, 117, 177 },
-                    {   5,  90, 141 },
-                    {   2,  61,  95 },
-                    {   1,  37,  57 }
-                }, { /* Coeff Band 2 */
-                    {  33, 179, 220 },
-                    {  11, 140, 198 },
-                    {   1,  89, 148 },
-                    {   1,  60, 104 },
-                    {   1,  33,  57 },
-                    {   1,  12,  21 }
-                }, { /* Coeff Band 3 */
-                    {  30, 181, 221 },
-                    {   8, 141, 198 },
-                    {   1,  87, 145 },
-                    {   1,  58, 100 },
-                    {   1,  31,  55 },
-                    {   1,  12,  20 }
-                }, { /* Coeff Band 4 */
-                    {  32, 186, 224 },
-                    {   7, 142, 198 },
-                    {   1,  86, 143 },
-                    {   1,  58, 100 },
-                    {   1,  31,  55 },
-                    {   1,  12,  22 }
-                }, { /* Coeff Band 5 */
-                    {  57, 192, 227 },
-                    {  20, 143, 204 },
-                    {   3,  96, 154 },
-                    {   1,  68, 112 },
-                    {   1,  42,  69 },
-                    {   1,  19,  32 }
-                }
-            }
-        }, { /* block Type 1 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    { 212,  35, 215 },
-                    { 113,  47, 169 },
-                    {  29,  48, 105 }
-                }, { /* Coeff Band 1 */
-                    {  74, 129, 203 },
-                    { 106, 120, 203 },
-                    {  49, 107, 178 },
-                    {  19,  84, 144 },
-                    {   4,  50,  84 },
-                    {   1,  15,  25 }
-                }, { /* Coeff Band 2 */
-                    {  71, 172, 217 },
-                    {  44, 141, 209 },
-                    {  15, 102, 173 },
-                    {   6,  76, 133 },
-                    {   2,  51,  89 },
-                    {   1,  24,  42 }
-                }, { /* Coeff Band 3 */
-                    {  64, 185, 231 },
-                    {  31, 148, 216 },
-                    {   8, 103, 175 },
-                    {   3,  74, 131 },
-                    {   1,  46,  81 },
-                    {   1,  18,  30 }
-                }, { /* Coeff Band 4 */
-                    {  65, 196, 235 },
-                    {  25, 157, 221 },
-                    {   5, 105, 174 },
-                    {   1,  67, 120 },
-                    {   1,  38,  69 },
-                    {   1,  15,  30 }
-                }, { /* Coeff Band 5 */
-                    {  65, 204, 238 },
-                    {  30, 156, 224 },
-                    {   7, 107, 177 },
-                    {   2,  70, 124 },
-                    {   1,  42,  73 },
-                    {   1,  18,  34 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    { 225,  86, 251 },
-                    { 144, 104, 235 },
-                    {  42,  99, 181 }
-                }, { /* Coeff Band 1 */
-                    {  85, 175, 239 },
-                    { 112, 165, 229 },
-                    {  29, 136, 200 },
-                    {  12, 103, 162 },
-                    {   6,  77, 123 },
-                    {   2,  53,  84 }
-                }, { /* Coeff Band 2 */
-                    {  75, 183, 239 },
-                    {  30, 155, 221 },
-                    {   3, 106, 171 },
-                    {   1,  74, 128 },
-                    {   1,  44,  76 },
-                    {   1,  17,  28 }
-                }, { /* Coeff Band 3 */
-                    {  73, 185, 240 },
-                    {  27, 159, 222 },
-                    {   2, 107, 172 },
-                    {   1,  75, 127 },
-                    {   1,  42,  73 },
-                    {   1,  17,  29 }
-                }, { /* Coeff Band 4 */
-                    {  62, 190, 238 },
-                    {  21, 159, 222 },
-                    {   2, 107, 172 },
-                    {   1,  72, 122 },
-                    {   1,  40,  71 },
-                    {   1,  18,  32 }
-                }, { /* Coeff Band 5 */
-                    {  61, 199, 240 },
-                    {  27, 161, 226 },
-                    {   4, 113, 180 },
-                    {   1,  76, 129 },
-                    {   1,  46,  80 },
-                    {   1,  23,  41 }
-                }
-            }
-        }
-    }, { /* tx = 16x16 */
-        { /* block Type 0 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    {   7,  27, 153 },
-                    {   5,  30,  95 },
-                    {   1,  16,  30 }
-                }, { /* Coeff Band 1 */
-                    {  50,  75, 127 },
-                    {  57,  75, 124 },
-                    {  27,  67, 108 },
-                    {  10,  54,  86 },
-                    {   1,  33,  52 },
-                    {   1,  12,  18 }
-                }, { /* Coeff Band 2 */
-                    {  43, 125, 151 },
-                    {  26, 108, 148 },
-                    {   7,  83, 122 },
-                    {   2,  59,  89 },
-                    {   1,  38,  60 },
-                    {   1,  17,  27 }
-                }, { /* Coeff Band 3 */
-                    {  23, 144, 163 },
-                    {  13, 112, 154 },
-                    {   2,  75, 117 },
-                    {   1,  50,  81 },
-                    {   1,  31,  51 },
-                    {   1,  14,  23 }
-                }, { /* Coeff Band 4 */
-                    {  18, 162, 185 },
-                    {   6, 123, 171 },
-                    {   1,  78, 125 },
-                    {   1,  51,  86 },
-                    {   1,  31,  54 },
-                    {   1,  14,  23 }
-                }, { /* Coeff Band 5 */
-                    {  15, 199, 227 },
-                    {   3, 150, 204 },
-                    {   1,  91, 146 },
-                    {   1,  55,  95 },
-                    {   1,  30,  53 },
-                    {   1,  11,  20 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    {  19,  55, 240 },
-                    {  19,  59, 196 },
-                    {   3,  52, 105 }
-                }, { /* Coeff Band 1 */
-                    {  41, 166, 207 },
-                    { 104, 153, 199 },
-                    {  31, 123, 181 },
-                    {  14, 101, 152 },
-                    {   5,  72, 106 },
-                    {   1,  36,  52 }
-                }, { /* Coeff Band 2 */
-                    {  35, 176, 211 },
-                    {  12, 131, 190 },
-                    {   2,  88, 144 },
-                    {   1,  60, 101 },
-                    {   1,  36,  60 },
-                    {   1,  16,  28 }
-                }, { /* Coeff Band 3 */
-                    {  28, 183, 213 },
-                    {   8, 134, 191 },
-                    {   1,  86, 142 },
-                    {   1,  56,  96 },
-                    {   1,  30,  53 },
-                    {   1,  12,  20 }
-                }, { /* Coeff Band 4 */
-                    {  20, 190, 215 },
-                    {   4, 135, 192 },
-                    {   1,  84, 139 },
-                    {   1,  53,  91 },
-                    {   1,  28,  49 },
-                    {   1,  11,  20 }
-                }, { /* Coeff Band 5 */
-                    {  13, 196, 216 },
-                    {   2, 137, 192 },
-                    {   1,  86, 143 },
-                    {   1,  57,  99 },
-                    {   1,  32,  56 },
-                    {   1,  13,  24 }
-                }
-            }
-        }, { /* block Type 1 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    { 211,  29, 217 },
-                    {  96,  47, 156 },
-                    {  22,  43,  87 }
-                }, { /* Coeff Band 1 */
-                    {  78, 120, 193 },
-                    { 111, 116, 186 },
-                    {  46, 102, 164 },
-                    {  15,  80, 128 },
-                    {   2,  49,  76 },
-                    {   1,  18,  28 }
-                }, { /* Coeff Band 2 */
-                    {  71, 161, 203 },
-                    {  42, 132, 192 },
-                    {  10,  98, 150 },
-                    {   3,  69, 109 },
-                    {   1,  44,  70 },
-                    {   1,  18,  29 }
-                }, { /* Coeff Band 3 */
-                    {  57, 186, 211 },
-                    {  30, 140, 196 },
-                    {   4,  93, 146 },
-                    {   1,  62, 102 },
-                    {   1,  38,  65 },
-                    {   1,  16,  27 }
-                }, { /* Coeff Band 4 */
-                    {  47, 199, 217 },
-                    {  14, 145, 196 },
-                    {   1,  88, 142 },
-                    {   1,  57,  98 },
-                    {   1,  36,  62 },
-                    {   1,  15,  26 }
-                }, { /* Coeff Band 5 */
-                    {  26, 219, 229 },
-                    {   5, 155, 207 },
-                    {   1,  94, 151 },
-                    {   1,  60, 104 },
-                    {   1,  36,  62 },
-                    {   1,  16,  28 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    { 233,  29, 248 },
-                    { 146,  47, 220 },
-                    {  43,  52, 140 }
-                }, { /* Coeff Band 1 */
-                    { 100, 163, 232 },
-                    { 179, 161, 222 },
-                    {  63, 142, 204 },
-                    {  37, 113, 174 },
-                    {  26,  89, 137 },
-                    {  18,  68,  97 }
-                }, { /* Coeff Band 2 */
-                    {  85, 181, 230 },
-                    {  32, 146, 209 },
-                    {   7, 100, 164 },
-                    {   3,  71, 121 },
-                    {   1,  45,  77 },
-                    {   1,  18,  30 }
-                }, { /* Coeff Band 3 */
-                    {  65, 187, 230 },
-                    {  20, 148, 207 },
-                    {   2,  97, 159 },
-                    {   1,  68, 116 },
-                    {   1,  40,  70 },
-                    {   1,  14,  29 }
-                }, { /* Coeff Band 4 */
-                    {  40, 194, 227 },
-                    {   8, 147, 204 },
-                    {   1,  94, 155 },
-                    {   1,  65, 112 },
-                    {   1,  39,  66 },
-                    {   1,  14,  26 }
-                }, { /* Coeff Band 5 */
-                    {  16, 208, 228 },
-                    {   3, 151, 207 },
-                    {   1,  98, 160 },
-                    {   1,  67, 117 },
-                    {   1,  41,  74 },
-                    {   1,  17,  31 }
-                }
-            }
-        }
-    }, { /* tx = 32x32 */
-        { /* block Type 0 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    {  17,  38, 140 },
-                    {   7,  34,  80 },
-                    {   1,  17,  29 }
-                }, { /* Coeff Band 1 */
-                    {  37,  75, 128 },
-                    {  41,  76, 128 },
-                    {  26,  66, 116 },
-                    {  12,  52,  94 },
-                    {   2,  32,  55 },
-                    {   1,  10,  16 }
-                }, { /* Coeff Band 2 */
-                    {  50, 127, 154 },
-                    {  37, 109, 152 },
-                    {  16,  82, 121 },
-                    {   5,  59,  85 },
-                    {   1,  35,  54 },
-                    {   1,  13,  20 }
-                }, { /* Coeff Band 3 */
-                    {  40, 142, 167 },
-                    {  17, 110, 157 },
-                    {   2,  71, 112 },
-                    {   1,  44,  72 },
-                    {   1,  27,  45 },
-                    {   1,  11,  17 }
-                }, { /* Coeff Band 4 */
-                    {  30, 175, 188 },
-                    {   9, 124, 169 },
-                    {   1,  74, 116 },
-                    {   1,  48,  78 },
-                    {   1,  30,  49 },
-                    {   1,  11,  18 }
-                }, { /* Coeff Band 5 */
-                    {  10, 222, 223 },
-                    {   2, 150, 194 },
-                    {   1,  83, 128 },
-                    {   1,  48,  79 },
-                    {   1,  27,  45 },
-                    {   1,  11,  17 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    {  36,  41, 235 },
-                    {  29,  36, 193 },
-                    {  10,  27, 111 }
-                }, { /* Coeff Band 1 */
-                    {  85, 165, 222 },
-                    { 177, 162, 215 },
-                    { 110, 135, 195 },
-                    {  57, 113, 168 },
-                    {  23,  83, 120 },
-                    {  10,  49,  61 }
-                }, { /* Coeff Band 2 */
-                    {  85, 190, 223 },
-                    {  36, 139, 200 },
-                    {   5,  90, 146 },
-                    {   1,  60, 103 },
-                    {   1,  38,  65 },
-                    {   1,  18,  30 }
-                }, { /* Coeff Band 3 */
-                    {  72, 202, 223 },
-                    {  23, 141, 199 },
-                    {   2,  86, 140 },
-                    {   1,  56,  97 },
-                    {   1,  36,  61 },
-                    {   1,  16,  27 }
-                }, { /* Coeff Band 4 */
-                    {  55, 218, 225 },
-                    {  13, 145, 200 },
-                    {   1,  86, 141 },
-                    {   1,  57,  99 },
-                    {   1,  35,  61 },
-                    {   1,  13,  22 }
-                }, { /* Coeff Band 5 */
-                    {  15, 235, 212 },
-                    {   1, 132, 184 },
-                    {   1,  84, 139 },
-                    {   1,  57,  97 },
-                    {   1,  34,  56 },
-                    {   1,  14,  23 }
-                }
-            }
-        }, { /* block Type 1 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    { 181,  21, 201 },
-                    {  61,  37, 123 },
-                    {  10,  38,  71 }
-                }, { /* Coeff Band 1 */
-                    {  47, 106, 172 },
-                    {  95, 104, 173 },
-                    {  42,  93, 159 },
-                    {  18,  77, 131 },
-                    {   4,  50,  81 },
-                    {   1,  17,  23 }
-                }, { /* Coeff Band 2 */
-                    {  62, 147, 199 },
-                    {  44, 130, 189 },
-                    {  28, 102, 154 },
-                    {  18,  75, 115 },
-                    {   2,  44,  65 },
-                    {   1,  12,  19 }
-                }, { /* Coeff Band 3 */
-                    {  55, 153, 210 },
-                    {  24, 130, 194 },
-                    {   3,  93, 146 },
-                    {   1,  61,  97 },
-                    {   1,  31,  50 },
-                    {   1,  10,  16 }
-                }, { /* Coeff Band 4 */
-                    {  49, 186, 223 },
-                    {  17, 148, 204 },
-                    {   1,  96, 142 },
-                    {   1,  53,  83 },
-                    {   1,  26,  44 },
-                    {   1,  11,  17 }
-                }, { /* Coeff Band 5 */
-                    {  13, 217, 212 },
-                    {   2, 136, 180 },
-                    {   1,  78, 124 },
-                    {   1,  50,  83 },
-                    {   1,  29,  49 },
-                    {   1,  14,  23 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    { 197,  13, 247 },
-                    {  82,  17, 222 },
-                    {  25,  17, 162 }
-                }, { /* Coeff Band 1 */
-                    { 126, 186, 247 },
-                    { 234, 191, 243 },
-                    { 176, 177, 234 },
-                    { 104, 158, 220 },
-                    {  66, 128, 186 },
-                    {  55,  90, 137 }
-                }, { /* Coeff Band 2 */
-                    { 111, 197, 242 },
-                    {  46, 158, 219 },
-                    {   9, 104, 171 },
-                    {   2,  65, 125 },
-                    {   1,  44,  80 },
-                    {   1,  17,  91 }
-                }, { /* Coeff Band 3 */
-                    { 104, 208, 245 },
-                    {  39, 168, 224 },
-                    {   3, 109, 162 },
-                    {   1,  79, 124 },
-                    {   1,  50, 102 },
-                    {   1,  43, 102 }
-                }, { /* Coeff Band 4 */
-                    {  84, 220, 246 },
-                    {  31, 177, 231 },
-                    {   2, 115, 180 },
-                    {   1,  79, 134 },
-                    {   1,  55,  77 },
-                    {   1,  60,  79 }
-                }, { /* Coeff Band 5 */
-                    {  43, 243, 240 },
-                    {   8, 180, 217 },
-                    {   1, 115, 166 },
-                    {   1,  84, 121 },
-                    {   1,  51,  67 },
-                    {   1,  16,   6 }
-                }
-            }
-        }
-    }
-};
-
-const int8_t ff_vp9_mv_joint_tree[3][2] = {
-    { -MV_JOINT_ZERO,            1 }, // '0'
-    {    -MV_JOINT_H,            2 }, // '10'
-    {    -MV_JOINT_V, -MV_JOINT_HV }, // '11x'
-};
-
-const int8_t ff_vp9_mv_class_tree[10][2] = {
-    { -0,   1 }, // '0'
-    { -1,   2 }, // '10'
-    {  3,   4 },
-    { -2,  -3 }, // '110x'
-    {  5,   6 },
-    { -4,  -5 }, // '1110x'
-    { -6,   7 }, // '11110'
-    {  8,   9 },
-    { -7,  -8 }, // '111110x'
-    { -9, -10 }, // '111111x'
-};
-
-const int8_t ff_vp9_mv_fp_tree[3][2] = {
-    { -0,  1 },   // '0'
-    { -1,  2 },   // '10'
-    { -2, -3 },   // '11x'
-};
diff --git a/libavcodec/vp9data.h b/libavcodec/vp9data.h
index a52cc0a353..4142cea52f 100644
--- a/libavcodec/vp9data.h
+++ b/libavcodec/vp9data.h
@@ -2,20 +2,20 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,45 +26,2266 @@
 
 #include "vp9.h"
 
-extern const int8_t ff_vp9_partition_tree[3][2];
-extern const uint8_t ff_vp9_default_kf_partition_probs[4][4][3];
-extern const int8_t ff_vp9_segmentation_tree[7][2];
-extern const int8_t ff_vp9_intramode_tree[9][2];
-extern const uint8_t ff_vp9_default_kf_ymode_probs[10][10][9];
-extern const uint8_t ff_vp9_default_kf_uvmode_probs[10][9];
-extern const int8_t ff_vp9_inter_mode_tree[3][2];
-extern const int8_t ff_vp9_filter_tree[2][2];
-extern const enum FilterMode ff_vp9_filter_lut[3];
-extern const int16_t ff_vp9_dc_qlookup[256];
-extern const int16_t ff_vp9_ac_qlookup[256];
-extern const enum TxfmType ff_vp9_intra_txfm_type[14];
-extern const int16_t ff_vp9_default_scan_4x4[16];
-extern const int16_t ff_vp9_col_scan_4x4[16];
-extern const int16_t ff_vp9_row_scan_4x4[16];
-extern const int16_t ff_vp9_default_scan_8x8[64];
-extern const int16_t ff_vp9_col_scan_8x8[64];
-extern const int16_t ff_vp9_row_scan_8x8[64];
-extern const int16_t ff_vp9_default_scan_16x16[256];
-extern const int16_t ff_vp9_col_scan_16x16[256];
-extern const int16_t ff_vp9_row_scan_16x16[256];
-extern const int16_t ff_vp9_default_scan_32x32[1024];
-extern const int16_t *ff_vp9_scans[5][4];
-extern const int16_t ff_vp9_default_scan_4x4_nb[16][2];
-extern const int16_t ff_vp9_col_scan_4x4_nb[16][2];
-extern const int16_t ff_vp9_row_scan_4x4_nb[16][2];
-extern const int16_t ff_vp9_default_scan_8x8_nb[64][2];
-extern const int16_t ff_vp9_col_scan_8x8_nb[64][2];
-extern const int16_t ff_vp9_row_scan_8x8_nb[64][2];
-extern const int16_t ff_vp9_default_scan_16x16_nb[256][2];
-extern const int16_t ff_vp9_col_scan_16x16_nb[256][2];
-extern const int16_t ff_vp9_row_scan_16x16_nb[256][2];
-extern const int16_t ff_vp9_default_scan_32x32_nb[1024][2];
-extern const int16_t (*ff_vp9_scans_nb[5][4])[2];
-extern const uint8_t ff_vp9_model_pareto8[256][8];
-extern const ProbContext ff_vp9_default_probs;
-extern const uint8_t ff_vp9_default_coef_probs[4][2][2][6][6][3];
-extern const int8_t ff_vp9_mv_joint_tree[3][2];
-extern const int8_t ff_vp9_mv_class_tree[10][2];
-extern const int8_t ff_vp9_mv_fp_tree[3][2];
+enum BlockPartition {
+    PARTITION_NONE,    // [ ] <-.
+    PARTITION_H,       // [-]   |
+    PARTITION_V,       // [|]   |
+    PARTITION_SPLIT,   // [+] --'
+};
+
+static const int8_t vp9_partition_tree[3][2] = {
+    { -PARTITION_NONE, 1 },               // '0'
+     { -PARTITION_H, 2 },                 // '10'
+      { -PARTITION_V, -PARTITION_SPLIT }, // '110', '111'
+};
+
+static const uint8_t vp9_default_kf_partition_probs[4][4][3] = {
+    { /* 64x64 -> 32x32 */
+        { 174,  35,  49 } /* a/l both not split */,
+        {  68,  11,  27 } /* a split, l not split */,
+        {  57,  15,   9 } /* l split, a not split */,
+        {  12,   3,   3 } /* a/l both split */
+    }, { /* 32x32 -> 16x16 */
+        { 150,  40,  39 } /* a/l both not split */,
+        {  78,  12,  26 } /* a split, l not split */,
+        {  67,  33,  11 } /* l split, a not split */,
+        {  24,   7,   5 } /* a/l both split */,
+    }, { /* 16x16 -> 8x8 */
+        { 149,  53,  53 } /* a/l both not split */,
+        {  94,  20,  48 } /* a split, l not split */,
+        {  83,  53,  24 } /* l split, a not split */,
+        {  52,  18,  18 } /* a/l both split */,
+    }, { /* 8x8 -> 4x4 */
+        { 158,  97,  94 } /* a/l both not split */,
+        {  93,  24,  99 } /* a split, l not split */,
+        {  85, 119,  44 } /* l split, a not split */,
+        {  62,  59,  67 } /* a/l both split */,
+    },
+};
+
+static const int8_t vp9_segmentation_tree[7][2] = {
+    { 1, 2 },
+     { 3, 4 },
+     { 5, 6 },
+      { -0, -1 }, // '00x'
+      { -2, -3 }, // '01x'
+      { -4, -5 }, // '10x'
+      { -6, -7 }, // '11x'
+};
+
+static const int8_t vp9_intramode_tree[9][2] = {
+    { -DC_PRED, 1 },                                  // '0'
+     { -TM_VP8_PRED, 2 },                             // '10'
+      { -VERT_PRED, 3 },                              // '110'
+       { 4, 6 },
+        { -HOR_PRED, 5 },                             // '11100'
+         { -DIAG_DOWN_RIGHT_PRED, -VERT_RIGHT_PRED }, // '11101x'
+        { -DIAG_DOWN_LEFT_PRED, 7 },                  // '11110'
+         { -VERT_LEFT_PRED, 8 },                      // '111110'
+          { -HOR_DOWN_PRED, -HOR_UP_PRED },           // '111111x'
+};
+
+static const uint8_t vp9_default_kf_ymode_probs[10][10][9] = {
+    { /* above = v */
+        {  43,  46, 168, 134, 107, 128,  69, 142,  92 } /* left = v */,
+        {  44,  29,  68, 159, 201, 177,  50,  57,  77 } /* left = h */,
+        {  63,  36, 126, 146, 123, 158,  60,  90,  96 } /* left = dc */,
+        {  58,  38,  76, 114,  97, 172,  78, 133,  92 } /* left = d45 */,
+        {  46,  41,  76, 140,  63, 184,  69, 112,  57 } /* left = d135 */,
+        {  38,  32,  85, 140,  46, 112,  54, 151, 133 } /* left = d117 */,
+        {  39,  27,  61, 131, 110, 175,  44,  75, 136 } /* left = d153 */,
+        {  47,  35,  80, 100,  74, 143,  64, 163,  74 } /* left = d63 */,
+        {  52,  30,  74, 113, 130, 175,  51,  64,  58 } /* left = d27 */,
+        {  36,  61, 116, 114, 128, 162,  80, 125,  82 } /* left = tm */
+    }, { /* above = h */
+        {  55,  44,  68, 166, 179, 192,  57,  57, 108 } /* left = v */,
+        {  42,  26,  11, 199, 241, 228,  23,  15,  85 } /* left = h */,
+        {  82,  26,  26, 171, 208, 204,  44,  32, 105 } /* left = dc */,
+        {  68,  42,  19, 131, 160, 199,  55,  52,  83 } /* left = d45 */,
+        {  58,  50,  25, 139, 115, 232,  39,  52, 118 } /* left = d135 */,
+        {  50,  35,  33, 153, 104, 162,  64,  59, 131 } /* left = d117 */,
+        {  44,  24,  16, 150, 177, 202,  33,  19, 156 } /* left = d153 */,
+        {  53,  49,  21, 110, 116, 168,  59,  80,  76 } /* left = d63 */,
+        {  55,  27,  12, 153, 203, 218,  26,  27,  49 } /* left = d27 */,
+        {  38,  72,  19, 168, 203, 212,  50,  50, 107 } /* left = tm */
+    }, { /* above = dc */
+        {  92,  45, 102, 136, 116, 180,  74,  90, 100 } /* left = v */,
+        {  73,  32,  19, 187, 222, 215,  46,  34, 100 } /* left = h */,
+        { 137,  30,  42, 148, 151, 207,  70,  52,  91 } /* left = dc */,
+        {  91,  30,  32, 116, 121, 186,  93,  86,  94 } /* left = d45 */,
+        {  72,  35,  36, 149,  68, 206,  68,  63, 105 } /* left = d135 */,
+        {  73,  31,  28, 138,  57, 124,  55, 122, 151 } /* left = d117 */,
+        {  67,  23,  21, 140, 126, 197,  40,  37, 171 } /* left = d153 */,
+        {  74,  32,  27, 107,  86, 160,  63, 134, 102 } /* left = d63 */,
+        {  86,  27,  28, 128, 154, 212,  45,  43,  53 } /* left = d27 */,
+        {  59,  67,  44, 140, 161, 202,  78,  67, 119 } /* left = tm */
+    }, { /* above = d45 */
+        {  59,  38,  83, 112, 103, 162,  98, 136,  90 } /* left = v */,
+        {  62,  30,  23, 158, 200, 207,  59,  57,  50 } /* left = h */,
+        { 103,  26,  36, 129, 132, 201,  83,  80,  93 } /* left = dc */,
+        {  67,  30,  29,  84,  86, 191, 102,  91,  59 } /* left = d45 */,
+        {  60,  32,  33, 112,  71, 220,  64,  89, 104 } /* left = d135 */,
+        {  53,  26,  34, 130,  56, 149,  84, 120, 103 } /* left = d117 */,
+        {  53,  21,  23, 133, 109, 210,  56,  77, 172 } /* left = d153 */,
+        {  61,  29,  29,  93,  97, 165,  83, 175, 162 } /* left = d63 */,
+        {  77,  19,  29, 112, 142, 228,  55,  66,  36 } /* left = d27 */,
+        {  47,  47,  43, 114, 137, 181, 100,  99,  95 } /* left = tm */
+    }, { /* above = d135 */
+        {  53,  40,  55, 139,  69, 183,  61,  80, 110 } /* left = v */,
+        {  40,  29,  19, 161, 180, 207,  43,  24,  91 } /* left = h */,
+        {  69,  23,  29, 128,  83, 199,  46,  44, 101 } /* left = dc */,
+        {  60,  34,  19, 105,  61, 198,  53,  64,  89 } /* left = d45 */,
+        {  52,  31,  22, 158,  40, 209,  58,  62,  89 } /* left = d135 */,
+        {  44,  31,  29, 147,  46, 158,  56, 102, 198 } /* left = d117 */,
+        {  35,  19,  12, 135,  87, 209,  41,  45, 167 } /* left = d153 */,
+        {  51,  38,  25, 113,  58, 164,  70,  93,  97 } /* left = d63 */,
+        {  55,  25,  21, 118,  95, 215,  38,  39,  66 } /* left = d27 */,
+        {  47,  54,  34, 146, 108, 203,  72, 103, 151 } /* left = tm */
+    }, { /* above = d117 */
+        {  46,  27,  80, 150,  55, 124,  55, 121, 135 } /* left = v */,
+        {  36,  23,  27, 165, 149, 166,  54,  64, 118 } /* left = h */,
+        {  64,  19,  37, 156,  66, 138,  49,  95, 133 } /* left = dc */,
+        {  53,  21,  36, 131,  63, 163,  60, 109,  81 } /* left = d45 */,
+        {  40,  26,  35, 154,  40, 185,  51,  97, 123 } /* left = d135 */,
+        {  35,  19,  34, 179,  19,  97,  48, 129, 124 } /* left = d117 */,
+        {  36,  20,  26, 136,  62, 164,  33,  77, 154 } /* left = d153 */,
+        {  45,  26,  28, 129,  45, 129,  49, 147, 123 } /* left = d63 */,
+        {  45,  18,  32, 130,  90, 157,  40,  79,  91 } /* left = d27 */,
+        {  38,  44,  51, 136,  74, 162,  57,  97, 121 } /* left = tm */
+    }, { /* above = d153 */
+        {  56,  39,  58, 133, 117, 173,  48,  53, 187 } /* left = v */,
+        {  35,  21,  12, 161, 212, 207,  20,  23, 145 } /* left = h */,
+        {  75,  17,  22, 136, 138, 185,  32,  34, 166 } /* left = dc */,
+        {  56,  29,  19, 117, 109, 181,  55,  68, 112 } /* left = d45 */,
+        {  47,  29,  17, 153,  64, 220,  59,  51, 114 } /* left = d135 */,
+        {  46,  16,  24, 136,  76, 147,  41,  64, 172 } /* left = d117 */,
+        {  34,  17,  11, 108, 152, 187,  13,  15, 209 } /* left = d153 */,
+        {  55,  30,  18, 122,  79, 179,  44,  88, 116 } /* left = d63 */,
+        {  51,  24,  14, 115, 133, 209,  32,  26, 104 } /* left = d27 */,
+        {  37,  49,  25, 129, 168, 164,  41,  54, 148 } /* left = tm */
+    }, { /* above = d63 */
+        {  48,  34,  86, 101,  92, 146,  78, 179, 134 } /* left = v */,
+        {  47,  22,  24, 138, 187, 178,  68,  69,  59 } /* left = h */,
+        {  78,  23,  39, 111, 117, 170,  74, 124,  94 } /* left = dc */,
+        {  56,  25,  33, 105, 112, 187,  95, 177, 129 } /* left = d45 */,
+        {  48,  31,  27, 114,  63, 183,  82, 116,  56 } /* left = d135 */,
+        {  43,  28,  37, 121,  63, 123,  61, 192, 169 } /* left = d117 */,
+        {  42,  17,  24, 109,  97, 177,  56,  76, 122 } /* left = d153 */,
+        {  46,  23,  32,  74,  86, 150,  67, 183,  88 } /* left = d63 */,
+        {  58,  18,  28, 105, 139, 182,  70,  92,  63 } /* left = d27 */,
+        {  36,  38,  48,  92, 122, 165,  88, 137,  91 } /* left = tm */
+    }, { /* above = d27 */
+        {  62,  44,  61, 123, 105, 189,  48,  57,  64 } /* left = v */,
+        {  47,  25,  17, 175, 222, 220,  24,  30,  86 } /* left = h */,
+        {  82,  22,  32, 127, 143, 213,  39,  41,  70 } /* left = dc */,
+        {  68,  36,  17, 106, 102, 206,  59,  74,  74 } /* left = d45 */,
+        {  57,  39,  23, 151,  68, 216,  55,  63,  58 } /* left = d135 */,
+        {  49,  30,  35, 141,  70, 168,  82,  40, 115 } /* left = d117 */,
+        {  51,  25,  15, 136, 129, 202,  38,  35, 139 } /* left = d153 */,
+        {  59,  39,  19, 114,  75, 180,  77, 104,  42 } /* left = d63 */,
+        {  68,  26,  16, 111, 141, 215,  29,  28,  28 } /* left = d27 */,
+        {  40,  61,  26, 126, 152, 206,  61,  59,  93 } /* left = tm */
+    }, { /* above = tm */
+        {  44,  78, 115, 132, 119, 173,  71, 112,  93 } /* left = v */,
+        {  39,  38,  21, 184, 227, 206,  42,  32,  64 } /* left = h */,
+        {  65,  70,  60, 155, 159, 199,  61,  60,  81 } /* left = dc */,
+        {  58,  47,  36, 124, 137, 193,  80,  82,  78 } /* left = d45 */,
+        {  49,  50,  35, 144,  95, 205,  63,  78,  59 } /* left = d135 */,
+        {  41,  53,  52, 148,  71, 142,  65, 128,  51 } /* left = d117 */,
+        {  40,  36,  28, 143, 143, 202,  40,  55, 137 } /* left = d153 */,
+        {  42,  44,  44, 104, 105, 164,  64, 130,  80 } /* left = d63 */,
+        {  52,  34,  29, 129, 183, 227,  42,  35,  43 } /* left = d27 */,
+        {  43,  81,  53, 140, 169, 204,  68,  84,  72 } /* left = tm */
+    }
+};
+
+static const uint8_t vp9_default_kf_uvmode_probs[10][9] = {
+    { 118,  15, 123, 148, 131, 101,  44,  93, 131 } /* y = v */,
+    { 113,  12,  23, 188, 226, 142,  26,  32, 125 } /* y = h */,
+    { 144,  11,  54, 157, 195, 130,  46,  58, 108 } /* y = dc */,
+    { 120,  11,  50, 123, 163, 135,  64,  77, 103 } /* y = d45 */,
+    { 113,   9,  36, 155, 111, 157,  32,  44, 161 } /* y = d135 */,
+    { 116,   9,  55, 176,  76,  96,  37,  61, 149 } /* y = d117 */,
+    { 115,   9,  28, 141, 161, 167,  21,  25, 193 } /* y = d153 */,
+    { 116,  12,  64, 120, 140, 125,  49, 115, 121 } /* y = d63 */,
+    { 120,  12,  32, 145, 195, 142,  32,  38,  86 } /* y = d27 */,
+    { 102,  19,  66, 162, 182, 122,  35,  59, 128 } /* y = tm */
+};
+
+enum InterPredMode {
+    NEARESTMV = 10,
+    NEARMV = 11,
+    ZEROMV = 12,
+    NEWMV = 13,
+};
+
+static const int8_t vp9_inter_mode_tree[3][2] = {
+    { -ZEROMV, 1 },        // '0'
+     { -NEARESTMV, 2 },    // '10'
+      { -NEARMV, -NEWMV }, // '11x'
+};
+
+static const int8_t vp9_filter_tree[2][2] = {
+    { -0, 1 },   // '0'
+     { -1, -2 }, // '1x'
+};
+
+static const enum FilterMode vp9_filter_lut[3] = {
+    FILTER_8TAP_REGULAR,
+    FILTER_8TAP_SMOOTH,
+    FILTER_8TAP_SHARP,
+};
+
+static const int16_t vp9_dc_qlookup[3][256] = {
+    {
+            4,     8,     8,     9,    10,    11,    12,    12,
+           13,    14,    15,    16,    17,    18,    19,    19,
+           20,    21,    22,    23,    24,    25,    26,    26,
+           27,    28,    29,    30,    31,    32,    32,    33,
+           34,    35,    36,    37,    38,    38,    39,    40,
+           41,    42,    43,    43,    44,    45,    46,    47,
+           48,    48,    49,    50,    51,    52,    53,    53,
+           54,    55,    56,    57,    57,    58,    59,    60,
+           61,    62,    62,    63,    64,    65,    66,    66,
+           67,    68,    69,    70,    70,    71,    72,    73,
+           74,    74,    75,    76,    77,    78,    78,    79,
+           80,    81,    81,    82,    83,    84,    85,    85,
+           87,    88,    90,    92,    93,    95,    96,    98,
+           99,   101,   102,   104,   105,   107,   108,   110,
+          111,   113,   114,   116,   117,   118,   120,   121,
+          123,   125,   127,   129,   131,   134,   136,   138,
+          140,   142,   144,   146,   148,   150,   152,   154,
+          156,   158,   161,   164,   166,   169,   172,   174,
+          177,   180,   182,   185,   187,   190,   192,   195,
+          199,   202,   205,   208,   211,   214,   217,   220,
+          223,   226,   230,   233,   237,   240,   243,   247,
+          250,   253,   257,   261,   265,   269,   272,   276,
+          280,   284,   288,   292,   296,   300,   304,   309,
+          313,   317,   322,   326,   330,   335,   340,   344,
+          349,   354,   359,   364,   369,   374,   379,   384,
+          389,   395,   400,   406,   411,   417,   423,   429,
+          435,   441,   447,   454,   461,   467,   475,   482,
+          489,   497,   505,   513,   522,   530,   539,   549,
+          559,   569,   579,   590,   602,   614,   626,   640,
+          654,   668,   684,   700,   717,   736,   755,   775,
+          796,   819,   843,   869,   896,   925,   955,   988,
+         1022,  1058,  1098,  1139,  1184,  1232,  1282,  1336,
+    }, {
+            4,     9,    10,    13,    15,    17,    20,    22,
+           25,    28,    31,    34,    37,    40,    43,    47,
+           50,    53,    57,    60,    64,    68,    71,    75,
+           78,    82,    86,    90,    93,    97,   101,   105,
+          109,   113,   116,   120,   124,   128,   132,   136,
+          140,   143,   147,   151,   155,   159,   163,   166,
+          170,   174,   178,   182,   185,   189,   193,   197,
+          200,   204,   208,   212,   215,   219,   223,   226,
+          230,   233,   237,   241,   244,   248,   251,   255,
+          259,   262,   266,   269,   273,   276,   280,   283,
+          287,   290,   293,   297,   300,   304,   307,   310,
+          314,   317,   321,   324,   327,   331,   334,   337,
+          343,   350,   356,   362,   369,   375,   381,   387,
+          394,   400,   406,   412,   418,   424,   430,   436,
+          442,   448,   454,   460,   466,   472,   478,   484,
+          490,   499,   507,   516,   525,   533,   542,   550,
+          559,   567,   576,   584,   592,   601,   609,   617,
+          625,   634,   644,   655,   666,   676,   687,   698,
+          708,   718,   729,   739,   749,   759,   770,   782,
+          795,   807,   819,   831,   844,   856,   868,   880,
+          891,   906,   920,   933,   947,   961,   975,   988,
+         1001,  1015,  1030,  1045,  1061,  1076,  1090,  1105,
+         1120,  1137,  1153,  1170,  1186,  1202,  1218,  1236,
+         1253,  1271,  1288,  1306,  1323,  1342,  1361,  1379,
+         1398,  1416,  1436,  1456,  1476,  1496,  1516,  1537,
+         1559,  1580,  1601,  1624,  1647,  1670,  1692,  1717,
+         1741,  1766,  1791,  1817,  1844,  1871,  1900,  1929,
+         1958,  1990,  2021,  2054,  2088,  2123,  2159,  2197,
+         2236,  2276,  2319,  2363,  2410,  2458,  2508,  2561,
+         2616,  2675,  2737,  2802,  2871,  2944,  3020,  3102,
+         3188,  3280,  3375,  3478,  3586,  3702,  3823,  3953,
+         4089,  4236,  4394,  4559,  4737,  4929,  5130,  5347,
+    }, {
+            4,    12,    18,    25,    33,    41,    50,    60,
+           70,    80,    91,   103,   115,   127,   140,   153,
+          166,   180,   194,   208,   222,   237,   251,   266,
+          281,   296,   312,   327,   343,   358,   374,   390,
+          405,   421,   437,   453,   469,   484,   500,   516,
+          532,   548,   564,   580,   596,   611,   627,   643,
+          659,   674,   690,   706,   721,   737,   752,   768,
+          783,   798,   814,   829,   844,   859,   874,   889,
+          904,   919,   934,   949,   964,   978,   993,  1008,
+         1022,  1037,  1051,  1065,  1080,  1094,  1108,  1122,
+         1136,  1151,  1165,  1179,  1192,  1206,  1220,  1234,
+         1248,  1261,  1275,  1288,  1302,  1315,  1329,  1342,
+         1368,  1393,  1419,  1444,  1469,  1494,  1519,  1544,
+         1569,  1594,  1618,  1643,  1668,  1692,  1717,  1741,
+         1765,  1789,  1814,  1838,  1862,  1885,  1909,  1933,
+         1957,  1992,  2027,  2061,  2096,  2130,  2165,  2199,
+         2233,  2267,  2300,  2334,  2367,  2400,  2434,  2467,
+         2499,  2532,  2575,  2618,  2661,  2704,  2746,  2788,
+         2830,  2872,  2913,  2954,  2995,  3036,  3076,  3127,
+         3177,  3226,  3275,  3324,  3373,  3421,  3469,  3517,
+         3565,  3621,  3677,  3733,  3788,  3843,  3897,  3951,
+         4005,  4058,  4119,  4181,  4241,  4301,  4361,  4420,
+         4479,  4546,  4612,  4677,  4742,  4807,  4871,  4942,
+         5013,  5083,  5153,  5222,  5291,  5367,  5442,  5517,
+         5591,  5665,  5745,  5825,  5905,  5984,  6063,  6149,
+         6234,  6319,  6404,  6495,  6587,  6678,  6769,  6867,
+         6966,  7064,  7163,  7269,  7376,  7483,  7599,  7715,
+         7832,  7958,  8085,  8214,  8352,  8492,  8635,  8788,
+         8945,  9104,  9275,  9450,  9639,  9832, 10031, 10245,
+        10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409,
+        12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812,
+        16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387,
+    }
+};
+
+static const int16_t vp9_ac_qlookup[3][256] = {
+    {
+            4,     8,     9,    10,    11,    12,    13,    14,
+           15,    16,    17,    18,    19,    20,    21,    22,
+           23,    24,    25,    26,    27,    28,    29,    30,
+           31,    32,    33,    34,    35,    36,    37,    38,
+           39,    40,    41,    42,    43,    44,    45,    46,
+           47,    48,    49,    50,    51,    52,    53,    54,
+           55,    56,    57,    58,    59,    60,    61,    62,
+           63,    64,    65,    66,    67,    68,    69,    70,
+           71,    72,    73,    74,    75,    76,    77,    78,
+           79,    80,    81,    82,    83,    84,    85,    86,
+           87,    88,    89,    90,    91,    92,    93,    94,
+           95,    96,    97,    98,    99,   100,   101,   102,
+          104,   106,   108,   110,   112,   114,   116,   118,
+          120,   122,   124,   126,   128,   130,   132,   134,
+          136,   138,   140,   142,   144,   146,   148,   150,
+          152,   155,   158,   161,   164,   167,   170,   173,
+          176,   179,   182,   185,   188,   191,   194,   197,
+          200,   203,   207,   211,   215,   219,   223,   227,
+          231,   235,   239,   243,   247,   251,   255,   260,
+          265,   270,   275,   280,   285,   290,   295,   300,
+          305,   311,   317,   323,   329,   335,   341,   347,
+          353,   359,   366,   373,   380,   387,   394,   401,
+          408,   416,   424,   432,   440,   448,   456,   465,
+          474,   483,   492,   501,   510,   520,   530,   540,
+          550,   560,   571,   582,   593,   604,   615,   627,
+          639,   651,   663,   676,   689,   702,   715,   729,
+          743,   757,   771,   786,   801,   816,   832,   848,
+          864,   881,   898,   915,   933,   951,   969,   988,
+         1007,  1026,  1046,  1066,  1087,  1108,  1129,  1151,
+         1173,  1196,  1219,  1243,  1267,  1292,  1317,  1343,
+         1369,  1396,  1423,  1451,  1479,  1508,  1537,  1567,
+         1597,  1628,  1660,  1692,  1725,  1759,  1793,  1828,
+    }, {
+            4,     9,    11,    13,    16,    18,    21,    24,
+           27,    30,    33,    37,    40,    44,    48,    51,
+           55,    59,    63,    67,    71,    75,    79,    83,
+           88,    92,    96,   100,   105,   109,   114,   118,
+          122,   127,   131,   136,   140,   145,   149,   154,
+          158,   163,   168,   172,   177,   181,   186,   190,
+          195,   199,   204,   208,   213,   217,   222,   226,
+          231,   235,   240,   244,   249,   253,   258,   262,
+          267,   271,   275,   280,   284,   289,   293,   297,
+          302,   306,   311,   315,   319,   324,   328,   332,
+          337,   341,   345,   349,   354,   358,   362,   367,
+          371,   375,   379,   384,   388,   392,   396,   401,
+          409,   417,   425,   433,   441,   449,   458,   466,
+          474,   482,   490,   498,   506,   514,   523,   531,
+          539,   547,   555,   563,   571,   579,   588,   596,
+          604,   616,   628,   640,   652,   664,   676,   688,
+          700,   713,   725,   737,   749,   761,   773,   785,
+          797,   809,   825,   841,   857,   873,   889,   905,
+          922,   938,   954,   970,   986,  1002,  1018,  1038,
+         1058,  1078,  1098,  1118,  1138,  1158,  1178,  1198,
+         1218,  1242,  1266,  1290,  1314,  1338,  1362,  1386,
+         1411,  1435,  1463,  1491,  1519,  1547,  1575,  1603,
+         1631,  1663,  1695,  1727,  1759,  1791,  1823,  1859,
+         1895,  1931,  1967,  2003,  2039,  2079,  2119,  2159,
+         2199,  2239,  2283,  2327,  2371,  2415,  2459,  2507,
+         2555,  2603,  2651,  2703,  2755,  2807,  2859,  2915,
+         2971,  3027,  3083,  3143,  3203,  3263,  3327,  3391,
+         3455,  3523,  3591,  3659,  3731,  3803,  3876,  3952,
+         4028,  4104,  4184,  4264,  4348,  4432,  4516,  4604,
+         4692,  4784,  4876,  4972,  5068,  5168,  5268,  5372,
+         5476,  5584,  5692,  5804,  5916,  6032,  6148,  6268,
+         6388,  6512,  6640,  6768,  6900,  7036,  7172,  7312,
+    }, {
+            4,    13,    19,    27,    35,    44,    54,    64,
+           75,    87,    99,   112,   126,   139,   154,   168,
+          183,   199,   214,   230,   247,   263,   280,   297,
+          314,   331,   349,   366,   384,   402,   420,   438,
+          456,   475,   493,   511,   530,   548,   567,   586,
+          604,   623,   642,   660,   679,   698,   716,   735,
+          753,   772,   791,   809,   828,   846,   865,   884,
+          902,   920,   939,   957,   976,   994,  1012,  1030,
+         1049,  1067,  1085,  1103,  1121,  1139,  1157,  1175,
+         1193,  1211,  1229,  1246,  1264,  1282,  1299,  1317,
+         1335,  1352,  1370,  1387,  1405,  1422,  1440,  1457,
+         1474,  1491,  1509,  1526,  1543,  1560,  1577,  1595,
+         1627,  1660,  1693,  1725,  1758,  1791,  1824,  1856,
+         1889,  1922,  1954,  1987,  2020,  2052,  2085,  2118,
+         2150,  2183,  2216,  2248,  2281,  2313,  2346,  2378,
+         2411,  2459,  2508,  2556,  2605,  2653,  2701,  2750,
+         2798,  2847,  2895,  2943,  2992,  3040,  3088,  3137,
+         3185,  3234,  3298,  3362,  3426,  3491,  3555,  3619,
+         3684,  3748,  3812,  3876,  3941,  4005,  4069,  4149,
+         4230,  4310,  4390,  4470,  4550,  4631,  4711,  4791,
+         4871,  4967,  5064,  5160,  5256,  5352,  5448,  5544,
+         5641,  5737,  5849,  5961,  6073,  6185,  6297,  6410,
+         6522,  6650,  6778,  6906,  7034,  7162,  7290,  7435,
+         7579,  7723,  7867,  8011,  8155,  8315,  8475,  8635,
+         8795,  8956,  9132,  9308,  9484,  9660,  9836, 10028,
+        10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661,
+        11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565,
+        13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806,
+        16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414,
+        18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486,
+        21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070,
+        25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247,
+    }
+};
+
+static const enum TxfmType vp9_intra_txfm_type[14] = {
+    [VERT_PRED]            = ADST_DCT,
+    [HOR_PRED]             = DCT_ADST,
+    [DC_PRED]              = DCT_DCT,
+    [DIAG_DOWN_LEFT_PRED]  = DCT_DCT,
+    [DIAG_DOWN_RIGHT_PRED] = ADST_ADST,
+    [VERT_RIGHT_PRED]      = ADST_DCT,
+    [HOR_DOWN_PRED]        = DCT_ADST,
+    [VERT_LEFT_PRED]       = ADST_DCT,
+    [HOR_UP_PRED]          = DCT_ADST,
+    [TM_VP8_PRED]          = ADST_ADST,
+    [NEARESTMV]            = DCT_DCT,
+    [NEARMV]               = DCT_DCT,
+    [ZEROMV]               = DCT_DCT,
+    [NEWMV]                = DCT_DCT,
+};
+
+static const int16_t vp9_default_scan_4x4[16] = {
+     0,  1,  4,  5,
+     2,  8,  3,  6,
+    12,  9,  7, 10,
+    13, 11, 14, 15,
+};
+
+static const int16_t vp9_col_scan_4x4[16] = {
+     0,  1,  2,  4,
+     3,  5,  6,  8,
+     7,  9, 10, 12,
+    13, 11, 14, 15,
+};
+
+static const int16_t vp9_row_scan_4x4[16] = {
+     0,  4,  1,  8,
+     5, 12,  9,  2,
+     6, 13,  3, 10,
+     7, 14, 11, 15,
+};
+
+static const int16_t vp9_default_scan_8x8[64] = {
+     0,  1,  8,  2,  9, 16, 10,  3,
+    17, 24, 18, 11,  4, 25, 32, 19,
+    12, 26,  5, 33, 20, 27, 40, 13,
+    34,  6, 41, 28, 21, 35, 42, 48,
+    14,  7, 36, 29, 43, 56, 49, 22,
+    15, 37, 50, 44, 57, 30, 23, 51,
+    45, 58, 38, 31, 52, 59, 39, 46,
+    53, 60, 47, 54, 61, 55, 62, 63,
+};
+
+static const int16_t vp9_col_scan_8x8[64] = {
+     0,  1,  2,  8,  3,  9,  4, 10,
+    16,  5, 11, 17, 12, 18,  6, 24,
+    19, 13, 25,  7, 26, 20, 32, 14,
+    27, 21, 33, 28, 34, 15, 22, 35,
+    40, 29, 41, 36, 23, 30, 42, 37,
+    48, 43, 31, 44, 49, 38, 50, 56,
+    45, 39, 51, 57, 52, 46, 58, 53,
+    59, 47, 60, 54, 61, 55, 62, 63,
+};
+
+static const int16_t vp9_row_scan_8x8[64] = {
+     0,  8, 16,  1,  9, 24,  2, 17,
+    32, 10, 25,  3, 40, 18, 11, 33,
+    26, 19,  4, 48, 41, 34, 12, 27,
+    56, 20,  5, 42, 35, 13, 49, 28,
+     6, 21, 43, 36, 14, 50, 29, 57,
+     7, 44, 22, 37, 51, 15, 58, 30,
+    23, 45, 52, 38, 59, 31, 46, 53,
+    39, 60, 47, 61, 54, 62, 55, 63,
+};
+
+static const int16_t vp9_default_scan_16x16[256] = {
+      0,   1,  16,   2,  17,  32,   3,  18,  33,  48,   4,  34,  19,  49,  20,   5,
+     35,  64,  50,  36,  65,  21,   6,  51,  80,  66,  37,  22,  52,   7,  81,  67,
+     38,  82,  53,  23,  96,  68,   8,  83,  97,  54,  39,  69, 112,  24,  98,  84,
+     70,  55,   9,  40,  85,  99, 113, 128,  25, 114, 100,  71,  86,  56,  10,  41,
+    115, 101, 129, 116,  72,  87,  26, 130, 144, 102,  57,  11,  42, 117, 131, 145,
+     88, 103,  27,  73, 132, 118, 146,  58, 160,  12,  43, 133, 147, 104,  89, 119,
+    161,  74, 148, 134,  28, 162,  59,  13, 176, 120, 149,  90, 135, 105, 163,  44,
+     75, 177, 164,  29, 150, 121, 136, 178, 165,  14, 106,  60,  91, 151,  45, 179,
+    192, 137, 166, 122,  76, 180, 152,  30,  61,  15, 107, 167, 181, 193,  92, 208,
+     46, 138, 123, 153, 194,  77, 168, 182,  31, 195, 209, 183, 108, 139,  62, 154,
+     47, 196,  93, 169, 210, 197, 224, 124, 184, 211,  78, 109, 170, 155,  63, 198,
+    212, 185, 225, 240, 140,  94, 199, 125,  79, 213, 226, 171, 186, 156, 214, 200,
+    110, 227, 141,  95, 241, 215, 228, 201, 126, 242, 187, 172, 157, 229, 111, 216,
+    243, 142, 202, 230, 127, 217, 244, 173, 188, 231, 158, 203, 143, 245, 218, 232,
+    189, 246, 159, 174, 233, 247, 219, 204, 175, 190, 248, 234, 205, 220, 249, 191,
+    235, 221, 250, 206, 222, 251, 236, 207, 237, 223, 252, 238, 253, 239, 254, 255,
+};
+
+static const int16_t vp9_col_scan_16x16[256] = {
+      0,   1,   2,   3,  16,   4,  17,   5,  18,   6,  19,  32,  20,   7,  33,  21,
+     34,   8,  35,  22,  48,  36,   9,  49,  23,  50,  37,  10,  38,  51,  24,  64,
+     52,  11,  65,  39,  25,  53,  66,  54,  40,  67,  12,  80,  26,  68,  55,  81,
+     41,  69,  13,  27,  82,  56,  70,  83,  42,  14,  84,  96,  71,  28,  57,  85,
+     97,  15,  72,  98,  43,  86,  58,  99,  29,  87, 100, 112,  73,  44, 101,  59,
+     30, 113,  88, 114,  74, 128, 102,  45,  31, 115,  60, 103,  89, 116,  75, 129,
+    117,  46, 104,  90,  61, 130, 118, 131, 132, 105,  76,  47, 119, 144,  91,  62,
+    133, 106, 145, 120, 146, 134,  77, 147, 121,  92, 135, 148,  63, 107, 136, 122,
+     93, 149, 160,  78, 150, 137, 108, 161, 162, 151, 123,  79, 138, 163, 152,  94,
+    164, 109, 165, 153, 124, 139, 176, 166,  95, 177, 167, 110, 154, 178, 125, 179,
+    140, 168, 155, 111, 180, 192, 181, 169, 141, 126, 182, 193, 194, 156, 183, 170,
+    195, 127, 142, 196, 184, 208, 197, 157, 171, 143, 185, 198, 209, 199, 210, 172,
+    158, 186, 211, 224, 212, 200, 240, 159, 213, 225, 187, 201, 173, 226, 214, 215,
+    227, 202, 228, 188, 241, 216, 174, 229, 242, 203, 243, 217, 230, 175, 189, 244,
+    231, 204, 218, 232, 245, 219, 246, 190, 233, 205, 191, 247, 234, 248, 220, 206,
+    249, 235, 221, 207, 250, 236, 222, 251, 223, 237, 238, 252, 239, 253, 254, 255,
+};
+
+static const int16_t vp9_row_scan_16x16[256] = {
+      0,  16,  32,   1,  48,  17,  64,  33,   2,  80,  18,  49,  96,  34,   3,  65,
+     19, 112,  50,  81,  35,   4, 128,  66,  20,  97,  51,  82,   5, 144,  36,  67,
+    113,  98,  21,  52, 160,  83, 129,  37,  68,   6, 114, 176,  99,  53,  22,  84,
+    145,  38,  69, 130,   7, 115, 192, 100,  54,  23,  85, 161, 146, 131,  39,  70,
+    208, 116,   8, 101, 177,  55,  86,  24, 162, 147, 132,  71, 224, 117,  40, 102,
+      9, 148,  56,  87, 193, 163, 240, 133, 178,  25, 118,  72,  41, 103, 164,  10,
+    149,  88, 134, 209, 179,  57, 119, 194,  26,  73, 165, 150, 104,  42, 135,  11,
+    180, 120,  89, 225, 195,  58,  27, 210, 151, 181, 166,  74,  43, 105,  12, 136,
+     90,  59, 241, 121,  28, 196, 167, 211, 152,  44, 182, 137,  75,  13, 226, 106,
+    122,  60, 197,  91, 168,  29, 183, 153,  14,  76, 212, 138,  45, 107,  15, 198,
+     92, 227, 169,  30, 123, 154,  61, 242, 184, 213, 139,  46,  77,  31, 108, 170,
+    199, 185, 124, 228,  93, 155, 214,  62, 140, 243,  78,  47, 200, 109, 186, 171,
+    201,  94,  63, 215, 229, 156,  79, 125, 141, 110, 216, 187, 172, 244, 202, 230,
+    217,  95, 157, 126, 245, 111, 142, 231, 188, 127, 158, 218, 173, 232, 246, 233,
+    203, 143, 247, 174, 189, 159, 219, 204, 248, 234, 249, 175, 190, 220, 205, 250,
+    235, 191, 221, 251, 236, 206, 252, 222, 207, 237, 223, 253, 238, 254, 239, 255,
+};
+
+static const int16_t vp9_default_scan_32x32[1024] = {
+       0,    1,   32,    2,   33,   64,    3,   34,   65,    4,   96,   35,   66,    5,   36,   97,   67,  128,   98,   68,   37,    6,  129,   99,    7,  160,   69,   38,  130,  100,  161,  131,
+      39,   70,    8,  101,  162,  132,  192,   71,   40,    9,  102,  163,  133,  193,   72,  224,  103,   41,  164,   10,  194,  134,  165,   73,  104,  135,  225,   42,  195,   11,  256,  166,
+     226,  196,   74,  105,  136,   43,   12,  167,  197,  227,  257,   75,  106,  137,  228,   44,  198,  168,  258,  288,   13,  229,   76,  107,  199,  138,  259,  169,  289,   45,  230,  260,
+     200,  108,   14,  170,  139,  320,  290,   77,  231,  261,   46,  201,  140,  291,  109,  232,  321,  262,  171,   78,  292,   15,  322,  202,  263,  352,  172,  293,  233,  141,  323,  110,
+      47,  203,  264,  234,  294,  353,  324,   16,   79,  204,  265,  295,  325,  173,  354,  142,  235,  384,   48,  296,  111,  266,  355,  326,   80,   17,  205,  236,  174,  356,  385,  327,
+     143,  297,  267,  357,  386,  112,   49,  328,  298,  206,  416,  237,  358,  387,   81,  175,   18,  329,  359,  388,  299,  330,  389,  113,  417,  238,  360,   50,  207,  418,  390,  331,
+      19,  448,  361,   82,  419,  391,  239,   51,  362,  420,  114,  449,  480,  421,   83,  363,  450,  422,  512,  451,  423,  115,  452,  481,  453,  482,  454,  544,  483,  455,  513,  484,
+     514,  485,  515,  486,  545,  576,  487,  546,  547,  608,  577,  578,  579,  609,  610,  611,   20,  144,  268,  392,  516,  640,   21,   52,  145,  176,  269,  300,  393,  424,  517,  548,
+     641,  672,   22,   53,   84,  146,  177,  208,  270,  301,  332,  394,  425,  456,  518,  549,  580,  642,  673,  704,   23,   54,   85,  116,  147,  178,  209,  240,  271,  302,  333,  364,
+     395,  426,  457,  488,  519,  550,  581,  612,  643,  674,  705,  736,   55,   86,  117,  179,  210,  241,  303,  334,  365,  427,  458,  489,  551,  582,  613,  675,  706,  737,   87,  118,
+     211,  242,  335,  366,  459,  490,  583,  614,  707,  738,  119,  243,  367,  491,  615,  739,   24,  148,  272,  396,  520,  644,  768,   25,   56,  149,  180,  273,  304,  397,  428,  521,
+     552,  645,  676,  769,  800,   26,   57,   88,  150,  181,  212,  274,  305,  336,  398,  429,  460,  522,  553,  584,  646,  677,  708,  770,  801,  832,   27,   58,   89,  120,  151,  182,
+     213,  244,  275,  306,  337,  368,  399,  430,  461,  492,  523,  554,  585,  616,  647,  678,  709,  740,  771,  802,  833,  864,   59,   90,  121,  183,  214,  245,  307,  338,  369,  431,
+     462,  493,  555,  586,  617,  679,  710,  741,  803,  834,  865,   91,  122,  215,  246,  339,  370,  463,  494,  587,  618,  711,  742,  835,  866,  123,  247,  371,  495,  619,  743,  867,
+      28,  152,  276,  400,  524,  648,  772,  896,   29,   60,  153,  184,  277,  308,  401,  432,  525,  556,  649,  680,  773,  804,  897,  928,   30,   61,   92,  154,  185,  216,  278,  309,
+     340,  402,  433,  464,  526,  557,  588,  650,  681,  712,  774,  805,  836,  898,  929,  960,   31,   62,   93,  124,  155,  186,  217,  248,  279,  310,  341,  372,  403,  434,  465,  496,
+     527,  558,  589,  620,  651,  682,  713,  744,  775,  806,  837,  868,  899,  930,  961,  992,   63,   94,  125,  187,  218,  249,  311,  342,  373,  435,  466,  497,  559,  590,  621,  683,
+     714,  745,  807,  838,  869,  931,  962,  993,   95,  126,  219,  250,  343,  374,  467,  498,  591,  622,  715,  746,  839,  870,  963,  994,  127,  251,  375,  499,  623,  747,  871,  995,
+     156,  280,  404,  528,  652,  776,  900,  157,  188,  281,  312,  405,  436,  529,  560,  653,  684,  777,  808,  901,  932,  158,  189,  220,  282,  313,  344,  406,  437,  468,  530,  561,
+     592,  654,  685,  716,  778,  809,  840,  902,  933,  964,  159,  190,  221,  252,  283,  314,  345,  376,  407,  438,  469,  500,  531,  562,  593,  624,  655,  686,  717,  748,  779,  810,
+     841,  872,  903,  934,  965,  996,  191,  222,  253,  315,  346,  377,  439,  470,  501,  563,  594,  625,  687,  718,  749,  811,  842,  873,  935,  966,  997,  223,  254,  347,  378,  471,
+     502,  595,  626,  719,  750,  843,  874,  967,  998,  255,  379,  503,  627,  751,  875,  999,  284,  408,  532,  656,  780,  904,  285,  316,  409,  440,  533,  564,  657,  688,  781,  812,
+     905,  936,  286,  317,  348,  410,  441,  472,  534,  565,  596,  658,  689,  720,  782,  813,  844,  906,  937,  968,  287,  318,  349,  380,  411,  442,  473,  504,  535,  566,  597,  628,
+     659,  690,  721,  752,  783,  814,  845,  876,  907,  938,  969, 1000,  319,  350,  381,  443,  474,  505,  567,  598,  629,  691,  722,  753,  815,  846,  877,  939,  970, 1001,  351,  382,
+     475,  506,  599,  630,  723,  754,  847,  878,  971, 1002,  383,  507,  631,  755,  879, 1003,  412,  536,  660,  784,  908,  413,  444,  537,  568,  661,  692,  785,  816,  909,  940,  414,
+     445,  476,  538,  569,  600,  662,  693,  724,  786,  817,  848,  910,  941,  972,  415,  446,  477,  508,  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,  911,  942,
+     973, 1004,  447,  478,  509,  571,  602,  633,  695,  726,  757,  819,  850,  881,  943,  974, 1005,  479,  510,  603,  634,  727,  758,  851,  882,  975, 1006,  511,  635,  759,  883, 1007,
+     540,  664,  788,  912,  541,  572,  665,  696,  789,  820,  913,  944,  542,  573,  604,  666,  697,  728,  790,  821,  852,  914,  945,  976,  543,  574,  605,  636,  667,  698,  729,  760,
+     791,  822,  853,  884,  915,  946,  977, 1008,  575,  606,  637,  699,  730,  761,  823,  854,  885,  947,  978, 1009,  607,  638,  731,  762,  855,  886,  979, 1010,  639,  763,  887, 1011,
+     668,  792,  916,  669,  700,  793,  824,  917,  948,  670,  701,  732,  794,  825,  856,  918,  949,  980,  671,  702,  733,  764,  795,  826,  857,  888,  919,  950,  981, 1012,  703,  734,
+     765,  827,  858,  889,  951,  982, 1013,  735,  766,  859,  890,  983, 1014,  767,  891, 1015,  796,  920,  797,  828,  921,  952,  798,  829,  860,  922,  953,  984,  799,  830,  861,  892,
+     923,  954,  985, 1016,  831,  862,  893,  955,  986, 1017,  863,  894,  987, 1018,  895, 1019,  924,  925,  956,  926,  957,  988,  927,  958,  989, 1020,  959,  990, 1021,  991, 1022, 1023,
+};
+
+static const int16_t * const vp9_scans[5][4] = {
+    {
+        vp9_default_scan_4x4, vp9_col_scan_4x4,
+        vp9_row_scan_4x4, vp9_default_scan_4x4
+    }, {
+        vp9_default_scan_8x8, vp9_col_scan_8x8,
+        vp9_row_scan_8x8, vp9_default_scan_8x8
+    }, {
+        vp9_default_scan_16x16, vp9_col_scan_16x16,
+        vp9_row_scan_16x16, vp9_default_scan_16x16
+    }, {
+        vp9_default_scan_32x32, vp9_default_scan_32x32,
+        vp9_default_scan_32x32, vp9_default_scan_32x32
+    }, { // lossless
+        vp9_default_scan_4x4, vp9_default_scan_4x4,
+        vp9_default_scan_4x4, vp9_default_scan_4x4
+    }
+};
+
+static const int16_t vp9_default_scan_4x4_nb[16][2] = {
+    {  0,  0 }, {  0,  0 }, {  4,  1 }, {  1,  1 },
+    {  4,  4 }, {  2,  2 }, {  5,  2 }, {  8,  8 },
+    {  8,  5 }, {  6,  3 }, {  9,  6 }, { 12,  9 },
+    { 10,  7 }, { 13, 10 }, { 14, 11 }, {  0,  0 },
+};
+
+static const int16_t vp9_col_scan_4x4_nb[16][2] = {
+    {  0,  0 }, {  1,  1 }, {  0,  0 }, {  2,  2 },
+    {  4,  4 }, {  5,  5 }, {  4,  4 }, {  6,  6 },
+    {  8,  8 }, {  9,  9 }, {  8,  8 }, { 12, 12 },
+    { 10, 10 }, { 13, 13 }, { 14, 14 }, {  0,  0 },
+};
+
+static const int16_t vp9_row_scan_4x4_nb[16][2] = {
+    {  0,  0 }, {  0,  0 }, {  4,  4 }, {  1,  1 },
+    {  8,  8 }, {  5,  5 }, {  1,  1 }, {  2,  2 },
+    {  9,  9 }, {  2,  2 }, {  6,  6 }, {  3,  3 },
+    { 10, 10 }, {  7,  7 }, { 11, 11 }, {  0,  0 },
+};
+
+static const int16_t vp9_default_scan_8x8_nb[64][2] = {
+    {  0,  0 }, {  0,  0 }, {  1,  1 }, {  8,  1 },
+    {  8,  8 }, {  9,  2 }, {  2,  2 }, { 16,  9 },
+    { 16, 16 }, { 17, 10 }, { 10,  3 }, {  3,  3 },
+    { 24, 17 }, { 24, 24 }, { 18, 11 }, { 11,  4 },
+    { 25, 18 }, {  4,  4 }, { 32, 25 }, { 19, 12 },
+    { 26, 19 }, { 32, 32 }, { 12,  5 }, { 33, 26 },
+    {  5,  5 }, { 40, 33 }, { 27, 20 }, { 20, 13 },
+    { 34, 27 }, { 41, 34 }, { 40, 40 }, { 13,  6 },
+    {  6,  6 }, { 35, 28 }, { 28, 21 }, { 42, 35 },
+    { 48, 48 }, { 48, 41 }, { 21, 14 }, { 14,  7 },
+    { 36, 29 }, { 49, 42 }, { 43, 36 }, { 56, 49 },
+    { 29, 22 }, { 22, 15 }, { 50, 43 }, { 44, 37 },
+    { 57, 50 }, { 37, 30 }, { 30, 23 }, { 51, 44 },
+    { 58, 51 }, { 38, 31 }, { 45, 38 }, { 52, 45 },
+    { 59, 52 }, { 46, 39 }, { 53, 46 }, { 60, 53 },
+    { 54, 47 }, { 61, 54 }, { 62, 55 }, {  0,  0 },
+};
+
+static const int16_t vp9_col_scan_8x8_nb[64][2] = {
+    {  0,  0 }, {  1,  1 }, {  0,  0 }, {  2,  2 },
+    {  8,  8 }, {  3,  3 }, {  9,  9 }, {  8,  8 },
+    {  4,  4 }, { 10, 10 }, { 16, 16 }, { 11, 11 },
+    { 17, 17 }, {  5,  5 }, { 16, 16 }, { 18, 18 },
+    { 12, 12 }, { 24, 24 }, {  6,  6 }, { 25, 25 },
+    { 19, 19 }, { 24, 24 }, { 13, 13 }, { 26, 26 },
+    { 20, 20 }, { 32, 32 }, { 27, 27 }, { 33, 33 },
+    { 14, 14 }, { 21, 21 }, { 34, 34 }, { 32, 32 },
+    { 28, 28 }, { 40, 40 }, { 35, 35 }, { 22, 22 },
+    { 29, 29 }, { 41, 41 }, { 36, 36 }, { 40, 40 },
+    { 42, 42 }, { 30, 30 }, { 43, 43 }, { 48, 48 },
+    { 37, 37 }, { 49, 49 }, { 48, 48 }, { 44, 44 },
+    { 38, 38 }, { 50, 50 }, { 56, 56 }, { 51, 51 },
+    { 45, 45 }, { 57, 57 }, { 52, 52 }, { 58, 58 },
+    { 46, 46 }, { 59, 59 }, { 53, 53 }, { 60, 60 },
+    { 54, 54 }, { 61, 61 }, { 62, 62 }, {  0,  0 },
+};
+
+static const int16_t vp9_row_scan_8x8_nb[64][2] = {
+    {  0,  0 }, {  8,  8 }, {  0,  0 }, {  1,  1 },
+    { 16, 16 }, {  1,  1 }, {  9,  9 }, { 24, 24 },
+    {  2,  2 }, { 17, 17 }, {  2,  2 }, { 32, 32 },
+    { 10, 10 }, {  3,  3 }, { 25, 25 }, { 18, 18 },
+    { 11, 11 }, {  3,  3 }, { 40, 40 }, { 33, 33 },
+    { 26, 26 }, {  4,  4 }, { 19, 19 }, { 48, 48 },
+    { 12, 12 }, {  4,  4 }, { 34, 34 }, { 27, 27 },
+    {  5,  5 }, { 41, 41 }, { 20, 20 }, {  5,  5 },
+    { 13, 13 }, { 35, 35 }, { 28, 28 }, {  6,  6 },
+    { 42, 42 }, { 21, 21 }, { 49, 49 }, {  6,  6 },
+    { 36, 36 }, { 14, 14 }, { 29, 29 }, { 43, 43 },
+    {  7,  7 }, { 50, 50 }, { 22, 22 }, { 15, 15 },
+    { 37, 37 }, { 44, 44 }, { 30, 30 }, { 51, 51 },
+    { 23, 23 }, { 38, 38 }, { 45, 45 }, { 31, 31 },
+    { 52, 52 }, { 39, 39 }, { 53, 53 }, { 46, 46 },
+    { 54, 54 }, { 47, 47 }, { 55, 55 }, {  0,  0 },
+};
+
+static const int16_t vp9_default_scan_16x16_nb[256][2] = {
+    {   0,   0 }, {   0,   0 }, {   1,   1 }, {  16,   1 },
+    {  16,  16 }, {   2,   2 }, {  17,   2 }, {  32,  17 },
+    {  32,  32 }, {   3,   3 }, {  33,  18 }, {  18,   3 },
+    {  48,  33 }, {  19,   4 }, {   4,   4 }, {  34,  19 },
+    {  48,  48 }, {  49,  34 }, {  35,  20 }, {  64,  49 },
+    {  20,   5 }, {   5,   5 }, {  50,  35 }, {  64,  64 },
+    {  65,  50 }, {  36,  21 }, {  21,   6 }, {  51,  36 },
+    {   6,   6 }, {  80,  65 }, {  66,  51 }, {  37,  22 },
+    {  81,  66 }, {  52,  37 }, {  22,   7 }, {  80,  80 },
+    {  67,  52 }, {   7,   7 }, {  82,  67 }, {  96,  81 },
+    {  53,  38 }, {  38,  23 }, {  68,  53 }, {  96,  96 },
+    {  23,   8 }, {  97,  82 }, {  83,  68 }, {  69,  54 },
+    {  54,  39 }, {   8,   8 }, {  39,  24 }, {  84,  69 },
+    {  98,  83 }, { 112,  97 }, { 112, 112 }, {  24,   9 },
+    { 113,  98 }, {  99,  84 }, {  70,  55 }, {  85,  70 },
+    {  55,  40 }, {   9,   9 }, {  40,  25 }, { 114,  99 },
+    { 100,  85 }, { 128, 113 }, { 115, 100 }, {  71,  56 },
+    {  86,  71 }, {  25,  10 }, { 129, 114 }, { 128, 128 },
+    { 101,  86 }, {  56,  41 }, {  10,  10 }, {  41,  26 },
+    { 116, 101 }, { 130, 115 }, { 144, 129 }, {  87,  72 },
+    { 102,  87 }, {  26,  11 }, {  72,  57 }, { 131, 116 },
+    { 117, 102 }, { 145, 130 }, {  57,  42 }, { 144, 144 },
+    {  11,  11 }, {  42,  27 }, { 132, 117 }, { 146, 131 },
+    { 103,  88 }, {  88,  73 }, { 118, 103 }, { 160, 145 },
+    {  73,  58 }, { 147, 132 }, { 133, 118 }, {  27,  12 },
+    { 161, 146 }, {  58,  43 }, {  12,  12 }, { 160, 160 },
+    { 119, 104 }, { 148, 133 }, {  89,  74 }, { 134, 119 },
+    { 104,  89 }, { 162, 147 }, {  43,  28 }, {  74,  59 },
+    { 176, 161 }, { 163, 148 }, {  28,  13 }, { 149, 134 },
+    { 120, 105 }, { 135, 120 }, { 177, 162 }, { 164, 149 },
+    {  13,  13 }, { 105,  90 }, {  59,  44 }, {  90,  75 },
+    { 150, 135 }, {  44,  29 }, { 178, 163 }, { 176, 176 },
+    { 136, 121 }, { 165, 150 }, { 121, 106 }, {  75,  60 },
+    { 179, 164 }, { 151, 136 }, {  29,  14 }, {  60,  45 },
+    {  14,  14 }, { 106,  91 }, { 166, 151 }, { 180, 165 },
+    { 192, 177 }, {  91,  76 }, { 192, 192 }, {  45,  30 },
+    { 137, 122 }, { 122, 107 }, { 152, 137 }, { 193, 178 },
+    {  76,  61 }, { 167, 152 }, { 181, 166 }, {  30,  15 },
+    { 194, 179 }, { 208, 193 }, { 182, 167 }, { 107,  92 },
+    { 138, 123 }, {  61,  46 }, { 153, 138 }, {  46,  31 },
+    { 195, 180 }, {  92,  77 }, { 168, 153 }, { 209, 194 },
+    { 196, 181 }, { 208, 208 }, { 123, 108 }, { 183, 168 },
+    { 210, 195 }, {  77,  62 }, { 108,  93 }, { 169, 154 },
+    { 154, 139 }, {  62,  47 }, { 197, 182 }, { 211, 196 },
+    { 184, 169 }, { 224, 209 }, { 224, 224 }, { 139, 124 },
+    {  93,  78 }, { 198, 183 }, { 124, 109 }, {  78,  63 },
+    { 212, 197 }, { 225, 210 }, { 170, 155 }, { 185, 170 },
+    { 155, 140 }, { 213, 198 }, { 199, 184 }, { 109,  94 },
+    { 226, 211 }, { 140, 125 }, {  94,  79 }, { 240, 225 },
+    { 214, 199 }, { 227, 212 }, { 200, 185 }, { 125, 110 },
+    { 241, 226 }, { 186, 171 }, { 171, 156 }, { 156, 141 },
+    { 228, 213 }, { 110,  95 }, { 215, 200 }, { 242, 227 },
+    { 141, 126 }, { 201, 186 }, { 229, 214 }, { 126, 111 },
+    { 216, 201 }, { 243, 228 }, { 172, 157 }, { 187, 172 },
+    { 230, 215 }, { 157, 142 }, { 202, 187 }, { 142, 127 },
+    { 244, 229 }, { 217, 202 }, { 231, 216 }, { 188, 173 },
+    { 245, 230 }, { 158, 143 }, { 173, 158 }, { 232, 217 },
+    { 246, 231 }, { 218, 203 }, { 203, 188 }, { 174, 159 },
+    { 189, 174 }, { 247, 232 }, { 233, 218 }, { 204, 189 },
+    { 219, 204 }, { 248, 233 }, { 190, 175 }, { 234, 219 },
+    { 220, 205 }, { 249, 234 }, { 205, 190 }, { 221, 206 },
+    { 250, 235 }, { 235, 220 }, { 206, 191 }, { 236, 221 },
+    { 222, 207 }, { 251, 236 }, { 237, 222 }, { 252, 237 },
+    { 238, 223 }, { 253, 238 }, { 254, 239 }, {   0,   0 },
+};
+
+static const int16_t vp9_col_scan_16x16_nb[256][2] = {
+    {   0,   0 }, {   1,   1 }, {   2,   2 }, {   0,   0 },
+    {   3,   3 }, {  16,  16 }, {   4,   4 }, {  17,  17 },
+    {   5,   5 }, {  18,  18 }, {  16,  16 }, {  19,  19 },
+    {   6,   6 }, {  32,  32 }, {  20,  20 }, {  33,  33 },
+    {   7,   7 }, {  34,  34 }, {  21,  21 }, {  32,  32 },
+    {  35,  35 }, {   8,   8 }, {  48,  48 }, {  22,  22 },
+    {  49,  49 }, {  36,  36 }, {   9,   9 }, {  37,  37 },
+    {  50,  50 }, {  23,  23 }, {  48,  48 }, {  51,  51 },
+    {  10,  10 }, {  64,  64 }, {  38,  38 }, {  24,  24 },
+    {  52,  52 }, {  65,  65 }, {  53,  53 }, {  39,  39 },
+    {  66,  66 }, {  11,  11 }, {  64,  64 }, {  25,  25 },
+    {  67,  67 }, {  54,  54 }, {  80,  80 }, {  40,  40 },
+    {  68,  68 }, {  12,  12 }, {  26,  26 }, {  81,  81 },
+    {  55,  55 }, {  69,  69 }, {  82,  82 }, {  41,  41 },
+    {  13,  13 }, {  83,  83 }, {  80,  80 }, {  70,  70 },
+    {  27,  27 }, {  56,  56 }, {  84,  84 }, {  96,  96 },
+    {  14,  14 }, {  71,  71 }, {  97,  97 }, {  42,  42 },
+    {  85,  85 }, {  57,  57 }, {  98,  98 }, {  28,  28 },
+    {  86,  86 }, {  99,  99 }, {  96,  96 }, {  72,  72 },
+    {  43,  43 }, { 100, 100 }, {  58,  58 }, {  29,  29 },
+    { 112, 112 }, {  87,  87 }, { 113, 113 }, {  73,  73 },
+    { 112, 112 }, { 101, 101 }, {  44,  44 }, {  30,  30 },
+    { 114, 114 }, {  59,  59 }, { 102, 102 }, {  88,  88 },
+    { 115, 115 }, {  74,  74 }, { 128, 128 }, { 116, 116 },
+    {  45,  45 }, { 103, 103 }, {  89,  89 }, {  60,  60 },
+    { 129, 129 }, { 117, 117 }, { 130, 130 }, { 131, 131 },
+    { 104, 104 }, {  75,  75 }, {  46,  46 }, { 118, 118 },
+    { 128, 128 }, {  90,  90 }, {  61,  61 }, { 132, 132 },
+    { 105, 105 }, { 144, 144 }, { 119, 119 }, { 145, 145 },
+    { 133, 133 }, {  76,  76 }, { 146, 146 }, { 120, 120 },
+    {  91,  91 }, { 134, 134 }, { 147, 147 }, {  62,  62 },
+    { 106, 106 }, { 135, 135 }, { 121, 121 }, {  92,  92 },
+    { 148, 148 }, { 144, 144 }, {  77,  77 }, { 149, 149 },
+    { 136, 136 }, { 107, 107 }, { 160, 160 }, { 161, 161 },
+    { 150, 150 }, { 122, 122 }, {  78,  78 }, { 137, 137 },
+    { 162, 162 }, { 151, 151 }, {  93,  93 }, { 163, 163 },
+    { 108, 108 }, { 164, 164 }, { 152, 152 }, { 123, 123 },
+    { 138, 138 }, { 160, 160 }, { 165, 165 }, {  94,  94 },
+    { 176, 176 }, { 166, 166 }, { 109, 109 }, { 153, 153 },
+    { 177, 177 }, { 124, 124 }, { 178, 178 }, { 139, 139 },
+    { 167, 167 }, { 154, 154 }, { 110, 110 }, { 179, 179 },
+    { 176, 176 }, { 180, 180 }, { 168, 168 }, { 140, 140 },
+    { 125, 125 }, { 181, 181 }, { 192, 192 }, { 193, 193 },
+    { 155, 155 }, { 182, 182 }, { 169, 169 }, { 194, 194 },
+    { 126, 126 }, { 141, 141 }, { 195, 195 }, { 183, 183 },
+    { 192, 192 }, { 196, 196 }, { 156, 156 }, { 170, 170 },
+    { 142, 142 }, { 184, 184 }, { 197, 197 }, { 208, 208 },
+    { 198, 198 }, { 209, 209 }, { 171, 171 }, { 157, 157 },
+    { 185, 185 }, { 210, 210 }, { 208, 208 }, { 211, 211 },
+    { 199, 199 }, { 224, 224 }, { 158, 158 }, { 212, 212 },
+    { 224, 224 }, { 186, 186 }, { 200, 200 }, { 172, 172 },
+    { 225, 225 }, { 213, 213 }, { 214, 214 }, { 226, 226 },
+    { 201, 201 }, { 227, 227 }, { 187, 187 }, { 240, 240 },
+    { 215, 215 }, { 173, 173 }, { 228, 228 }, { 241, 241 },
+    { 202, 202 }, { 242, 242 }, { 216, 216 }, { 229, 229 },
+    { 174, 174 }, { 188, 188 }, { 243, 243 }, { 230, 230 },
+    { 203, 203 }, { 217, 217 }, { 231, 231 }, { 244, 244 },
+    { 218, 218 }, { 245, 245 }, { 189, 189 }, { 232, 232 },
+    { 204, 204 }, { 190, 190 }, { 246, 246 }, { 233, 233 },
+    { 247, 247 }, { 219, 219 }, { 205, 205 }, { 248, 248 },
+    { 234, 234 }, { 220, 220 }, { 206, 206 }, { 249, 249 },
+    { 235, 235 }, { 221, 221 }, { 250, 250 }, { 222, 222 },
+    { 236, 236 }, { 237, 237 }, { 251, 251 }, { 238, 238 },
+    { 252, 252 }, { 253, 253 }, { 254, 254 }, {   0,   0 },
+};
+
+static const int16_t vp9_row_scan_16x16_nb[256][2] = {
+    {   0,   0 }, {  16,  16 }, {   0,   0 }, {  32,  32 },
+    {   1,   1 }, {  48,  48 }, {  17,  17 }, {   1,   1 },
+    {  64,  64 }, {   2,   2 }, {  33,  33 }, {  80,  80 },
+    {  18,  18 }, {   2,   2 }, {  49,  49 }, {   3,   3 },
+    {  96,  96 }, {  34,  34 }, {  65,  65 }, {  19,  19 },
+    {   3,   3 }, { 112, 112 }, {  50,  50 }, {   4,   4 },
+    {  81,  81 }, {  35,  35 }, {  66,  66 }, {   4,   4 },
+    { 128, 128 }, {  20,  20 }, {  51,  51 }, {  97,  97 },
+    {  82,  82 }, {   5,   5 }, {  36,  36 }, { 144, 144 },
+    {  67,  67 }, { 113, 113 }, {  21,  21 }, {  52,  52 },
+    {   5,   5 }, {  98,  98 }, { 160, 160 }, {  83,  83 },
+    {  37,  37 }, {   6,   6 }, {  68,  68 }, { 129, 129 },
+    {  22,  22 }, {  53,  53 }, { 114, 114 }, {   6,   6 },
+    {  99,  99 }, { 176, 176 }, {  84,  84 }, {  38,  38 },
+    {   7,   7 }, {  69,  69 }, { 145, 145 }, { 130, 130 },
+    { 115, 115 }, {  23,  23 }, {  54,  54 }, { 192, 192 },
+    { 100, 100 }, {   7,   7 }, {  85,  85 }, { 161, 161 },
+    {  39,  39 }, {  70,  70 }, {   8,   8 }, { 146, 146 },
+    { 131, 131 }, { 116, 116 }, {  55,  55 }, { 208, 208 },
+    { 101, 101 }, {  24,  24 }, {  86,  86 }, {   8,   8 },
+    { 132, 132 }, {  40,  40 }, {  71,  71 }, { 177, 177 },
+    { 147, 147 }, { 224, 224 }, { 117, 117 }, { 162, 162 },
+    {   9,   9 }, { 102, 102 }, {  56,  56 }, {  25,  25 },
+    {  87,  87 }, { 148, 148 }, {   9,   9 }, { 133, 133 },
+    {  72,  72 }, { 118, 118 }, { 193, 193 }, { 163, 163 },
+    {  41,  41 }, { 103, 103 }, { 178, 178 }, {  10,  10 },
+    {  57,  57 }, { 149, 149 }, { 134, 134 }, {  88,  88 },
+    {  26,  26 }, { 119, 119 }, {  10,  10 }, { 164, 164 },
+    { 104, 104 }, {  73,  73 }, { 209, 209 }, { 179, 179 },
+    {  42,  42 }, {  11,  11 }, { 194, 194 }, { 135, 135 },
+    { 165, 165 }, { 150, 150 }, {  58,  58 }, {  27,  27 },
+    {  89,  89 }, {  11,  11 }, { 120, 120 }, {  74,  74 },
+    {  43,  43 }, { 225, 225 }, { 105, 105 }, {  12,  12 },
+    { 180, 180 }, { 151, 151 }, { 195, 195 }, { 136, 136 },
+    {  28,  28 }, { 166, 166 }, { 121, 121 }, {  59,  59 },
+    {  12,  12 }, { 210, 210 }, {  90,  90 }, { 106, 106 },
+    {  44,  44 }, { 181, 181 }, {  75,  75 }, { 152, 152 },
+    {  13,  13 }, { 167, 167 }, { 137, 137 }, {  13,  13 },
+    {  60,  60 }, { 196, 196 }, { 122, 122 }, {  29,  29 },
+    {  91,  91 }, {  14,  14 }, { 182, 182 }, {  76,  76 },
+    { 211, 211 }, { 153, 153 }, {  14,  14 }, { 107, 107 },
+    { 138, 138 }, {  45,  45 }, { 226, 226 }, { 168, 168 },
+    { 197, 197 }, { 123, 123 }, {  30,  30 }, {  61,  61 },
+    {  15,  15 }, {  92,  92 }, { 154, 154 }, { 183, 183 },
+    { 169, 169 }, { 108, 108 }, { 212, 212 }, {  77,  77 },
+    { 139, 139 }, { 198, 198 }, {  46,  46 }, { 124, 124 },
+    { 227, 227 }, {  62,  62 }, {  31,  31 }, { 184, 184 },
+    {  93,  93 }, { 170, 170 }, { 155, 155 }, { 185, 185 },
+    {  78,  78 }, {  47,  47 }, { 199, 199 }, { 213, 213 },
+    { 140, 140 }, {  63,  63 }, { 109, 109 }, { 125, 125 },
+    {  94,  94 }, { 200, 200 }, { 171, 171 }, { 156, 156 },
+    { 228, 228 }, { 186, 186 }, { 214, 214 }, { 201, 201 },
+    {  79,  79 }, { 141, 141 }, { 110, 110 }, { 229, 229 },
+    {  95,  95 }, { 126, 126 }, { 215, 215 }, { 172, 172 },
+    { 111, 111 }, { 142, 142 }, { 202, 202 }, { 157, 157 },
+    { 216, 216 }, { 230, 230 }, { 217, 217 }, { 187, 187 },
+    { 127, 127 }, { 231, 231 }, { 158, 158 }, { 173, 173 },
+    { 143, 143 }, { 203, 203 }, { 188, 188 }, { 232, 232 },
+    { 218, 218 }, { 233, 233 }, { 159, 159 }, { 174, 174 },
+    { 204, 204 }, { 189, 189 }, { 234, 234 }, { 219, 219 },
+    { 175, 175 }, { 205, 205 }, { 235, 235 }, { 220, 220 },
+    { 190, 190 }, { 236, 236 }, { 206, 206 }, { 191, 191 },
+    { 221, 221 }, { 207, 207 }, { 237, 237 }, { 222, 222 },
+    { 238, 238 }, { 223, 223 }, { 239, 239 }, {   0,   0 },
+};
+
+static const int16_t vp9_default_scan_32x32_nb[1024][2] = {
+    {    0,    0 }, {    0,    0 }, {    1,    1 }, {   32,    1 },
+    {   32,   32 }, {    2,    2 }, {   33,    2 }, {   64,   33 },
+    {    3,    3 }, {   64,   64 }, {   34,    3 }, {   65,   34 },
+    {    4,    4 }, {   35,    4 }, {   96,   65 }, {   66,   35 },
+    {   96,   96 }, {   97,   66 }, {   67,   36 }, {   36,    5 },
+    {    5,    5 }, {  128,   97 }, {   98,   67 }, {    6,    6 },
+    {  128,  128 }, {   68,   37 }, {   37,    6 }, {  129,   98 },
+    {   99,   68 }, {  160,  129 }, {  130,   99 }, {   38,    7 },
+    {   69,   38 }, {    7,    7 }, {  100,   69 }, {  161,  130 },
+    {  131,  100 }, {  160,  160 }, {   70,   39 }, {   39,    8 },
+    {    8,    8 }, {  101,   70 }, {  162,  131 }, {  132,  101 },
+    {  192,  161 }, {   71,   40 }, {  192,  192 }, {  102,   71 },
+    {   40,    9 }, {  163,  132 }, {    9,    9 }, {  193,  162 },
+    {  133,  102 }, {  164,  133 }, {   72,   41 }, {  103,   72 },
+    {  134,  103 }, {  224,  193 }, {   41,   10 }, {  194,  163 },
+    {   10,   10 }, {  224,  224 }, {  165,  134 }, {  225,  194 },
+    {  195,  164 }, {   73,   42 }, {  104,   73 }, {  135,  104 },
+    {   42,   11 }, {   11,   11 }, {  166,  135 }, {  196,  165 },
+    {  226,  195 }, {  256,  225 }, {   74,   43 }, {  105,   74 },
+    {  136,  105 }, {  227,  196 }, {   43,   12 }, {  197,  166 },
+    {  167,  136 }, {  257,  226 }, {  256,  256 }, {   12,   12 },
+    {  228,  197 }, {   75,   44 }, {  106,   75 }, {  198,  167 },
+    {  137,  106 }, {  258,  227 }, {  168,  137 }, {  288,  257 },
+    {   44,   13 }, {  229,  198 }, {  259,  228 }, {  199,  168 },
+    {  107,   76 }, {   13,   13 }, {  169,  138 }, {  138,  107 },
+    {  288,  288 }, {  289,  258 }, {   76,   45 }, {  230,  199 },
+    {  260,  229 }, {   45,   14 }, {  200,  169 }, {  139,  108 },
+    {  290,  259 }, {  108,   77 }, {  231,  200 }, {  320,  289 },
+    {  261,  230 }, {  170,  139 }, {   77,   46 }, {  291,  260 },
+    {   14,   14 }, {  321,  290 }, {  201,  170 }, {  262,  231 },
+    {  320,  320 }, {  171,  140 }, {  292,  261 }, {  232,  201 },
+    {  140,  109 }, {  322,  291 }, {  109,   78 }, {   46,   15 },
+    {  202,  171 }, {  263,  232 }, {  233,  202 }, {  293,  262 },
+    {  352,  321 }, {  323,  292 }, {   15,   15 }, {   78,   47 },
+    {  203,  172 }, {  264,  233 }, {  294,  263 }, {  324,  293 },
+    {  172,  141 }, {  353,  322 }, {  141,  110 }, {  234,  203 },
+    {  352,  352 }, {   47,   16 }, {  295,  264 }, {  110,   79 },
+    {  265,  234 }, {  354,  323 }, {  325,  294 }, {   79,   48 },
+    {   16,   16 }, {  204,  173 }, {  235,  204 }, {  173,  142 },
+    {  355,  324 }, {  384,  353 }, {  326,  295 }, {  142,  111 },
+    {  296,  265 }, {  266,  235 }, {  356,  325 }, {  385,  354 },
+    {  111,   80 }, {   48,   17 }, {  327,  296 }, {  297,  266 },
+    {  205,  174 }, {  384,  384 }, {  236,  205 }, {  357,  326 },
+    {  386,  355 }, {   80,   49 }, {  174,  143 }, {   17,   17 },
+    {  328,  297 }, {  358,  327 }, {  387,  356 }, {  298,  267 },
+    {  329,  298 }, {  388,  357 }, {  112,   81 }, {  416,  385 },
+    {  237,  206 }, {  359,  328 }, {   49,   18 }, {  206,  175 },
+    {  417,  386 }, {  389,  358 }, {  330,  299 }, {   18,   18 },
+    {  416,  416 }, {  360,  329 }, {   81,   50 }, {  418,  387 },
+    {  390,  359 }, {  238,  207 }, {   50,   19 }, {  361,  330 },
+    {  419,  388 }, {  113,   82 }, {  448,  417 }, {  448,  448 },
+    {  420,  389 }, {   82,   51 }, {  362,  331 }, {  449,  418 },
+    {  421,  390 }, {  480,  480 }, {  450,  419 }, {  422,  391 },
+    {  114,   83 }, {  451,  420 }, {  480,  449 }, {  452,  421 },
+    {  481,  450 }, {  453,  422 }, {  512,  512 }, {  482,  451 },
+    {  454,  423 }, {  512,  481 }, {  483,  452 }, {  513,  482 },
+    {  484,  453 }, {  514,  483 }, {  485,  454 }, {  544,  513 },
+    {  544,  544 }, {  486,  455 }, {  545,  514 }, {  546,  515 },
+    {  576,  576 }, {  576,  545 }, {  577,  546 }, {  578,  547 },
+    {  608,  577 }, {  609,  578 }, {  610,  579 }, {   19,   19 },
+    {  143,  112 }, {  267,  236 }, {  391,  360 }, {  515,  484 },
+    {  608,  608 }, {   20,   20 }, {   51,   20 }, {  144,  113 },
+    {  175,  144 }, {  268,  237 }, {  299,  268 }, {  392,  361 },
+    {  423,  392 }, {  516,  485 }, {  547,  516 }, {  640,  609 },
+    {  640,  640 }, {   21,   21 }, {   52,   21 }, {   83,   52 },
+    {  145,  114 }, {  176,  145 }, {  207,  176 }, {  269,  238 },
+    {  300,  269 }, {  331,  300 }, {  393,  362 }, {  424,  393 },
+    {  455,  424 }, {  517,  486 }, {  548,  517 }, {  579,  548 },
+    {  641,  610 }, {  672,  641 }, {  672,  672 }, {   22,   22 },
+    {   53,   22 }, {   84,   53 }, {  115,   84 }, {  146,  115 },
+    {  177,  146 }, {  208,  177 }, {  239,  208 }, {  270,  239 },
+    {  301,  270 }, {  332,  301 }, {  363,  332 }, {  394,  363 },
+    {  425,  394 }, {  456,  425 }, {  487,  456 }, {  518,  487 },
+    {  549,  518 }, {  580,  549 }, {  611,  580 }, {  642,  611 },
+    {  673,  642 }, {  704,  673 }, {  704,  704 }, {   54,   23 },
+    {   85,   54 }, {  116,   85 }, {  178,  147 }, {  209,  178 },
+    {  240,  209 }, {  302,  271 }, {  333,  302 }, {  364,  333 },
+    {  426,  395 }, {  457,  426 }, {  488,  457 }, {  550,  519 },
+    {  581,  550 }, {  612,  581 }, {  674,  643 }, {  705,  674 },
+    {  736,  705 }, {   86,   55 }, {  117,   86 }, {  210,  179 },
+    {  241,  210 }, {  334,  303 }, {  365,  334 }, {  458,  427 },
+    {  489,  458 }, {  582,  551 }, {  613,  582 }, {  706,  675 },
+    {  737,  706 }, {  118,   87 }, {  242,  211 }, {  366,  335 },
+    {  490,  459 }, {  614,  583 }, {  738,  707 }, {   23,   23 },
+    {  147,  116 }, {  271,  240 }, {  395,  364 }, {  519,  488 },
+    {  643,  612 }, {  736,  736 }, {   24,   24 }, {   55,   24 },
+    {  148,  117 }, {  179,  148 }, {  272,  241 }, {  303,  272 },
+    {  396,  365 }, {  427,  396 }, {  520,  489 }, {  551,  520 },
+    {  644,  613 }, {  675,  644 }, {  768,  737 }, {  768,  768 },
+    {   25,   25 }, {   56,   25 }, {   87,   56 }, {  149,  118 },
+    {  180,  149 }, {  211,  180 }, {  273,  242 }, {  304,  273 },
+    {  335,  304 }, {  397,  366 }, {  428,  397 }, {  459,  428 },
+    {  521,  490 }, {  552,  521 }, {  583,  552 }, {  645,  614 },
+    {  676,  645 }, {  707,  676 }, {  769,  738 }, {  800,  769 },
+    {  800,  800 }, {   26,   26 }, {   57,   26 }, {   88,   57 },
+    {  119,   88 }, {  150,  119 }, {  181,  150 }, {  212,  181 },
+    {  243,  212 }, {  274,  243 }, {  305,  274 }, {  336,  305 },
+    {  367,  336 }, {  398,  367 }, {  429,  398 }, {  460,  429 },
+    {  491,  460 }, {  522,  491 }, {  553,  522 }, {  584,  553 },
+    {  615,  584 }, {  646,  615 }, {  677,  646 }, {  708,  677 },
+    {  739,  708 }, {  770,  739 }, {  801,  770 }, {  832,  801 },
+    {  832,  832 }, {   58,   27 }, {   89,   58 }, {  120,   89 },
+    {  182,  151 }, {  213,  182 }, {  244,  213 }, {  306,  275 },
+    {  337,  306 }, {  368,  337 }, {  430,  399 }, {  461,  430 },
+    {  492,  461 }, {  554,  523 }, {  585,  554 }, {  616,  585 },
+    {  678,  647 }, {  709,  678 }, {  740,  709 }, {  802,  771 },
+    {  833,  802 }, {  864,  833 }, {   90,   59 }, {  121,   90 },
+    {  214,  183 }, {  245,  214 }, {  338,  307 }, {  369,  338 },
+    {  462,  431 }, {  493,  462 }, {  586,  555 }, {  617,  586 },
+    {  710,  679 }, {  741,  710 }, {  834,  803 }, {  865,  834 },
+    {  122,   91 }, {  246,  215 }, {  370,  339 }, {  494,  463 },
+    {  618,  587 }, {  742,  711 }, {  866,  835 }, {   27,   27 },
+    {  151,  120 }, {  275,  244 }, {  399,  368 }, {  523,  492 },
+    {  647,  616 }, {  771,  740 }, {  864,  864 }, {   28,   28 },
+    {   59,   28 }, {  152,  121 }, {  183,  152 }, {  276,  245 },
+    {  307,  276 }, {  400,  369 }, {  431,  400 }, {  524,  493 },
+    {  555,  524 }, {  648,  617 }, {  679,  648 }, {  772,  741 },
+    {  803,  772 }, {  896,  865 }, {  896,  896 }, {   29,   29 },
+    {   60,   29 }, {   91,   60 }, {  153,  122 }, {  184,  153 },
+    {  215,  184 }, {  277,  246 }, {  308,  277 }, {  339,  308 },
+    {  401,  370 }, {  432,  401 }, {  463,  432 }, {  525,  494 },
+    {  556,  525 }, {  587,  556 }, {  649,  618 }, {  680,  649 },
+    {  711,  680 }, {  773,  742 }, {  804,  773 }, {  835,  804 },
+    {  897,  866 }, {  928,  897 }, {  928,  928 }, {   30,   30 },
+    {   61,   30 }, {   92,   61 }, {  123,   92 }, {  154,  123 },
+    {  185,  154 }, {  216,  185 }, {  247,  216 }, {  278,  247 },
+    {  309,  278 }, {  340,  309 }, {  371,  340 }, {  402,  371 },
+    {  433,  402 }, {  464,  433 }, {  495,  464 }, {  526,  495 },
+    {  557,  526 }, {  588,  557 }, {  619,  588 }, {  650,  619 },
+    {  681,  650 }, {  712,  681 }, {  743,  712 }, {  774,  743 },
+    {  805,  774 }, {  836,  805 }, {  867,  836 }, {  898,  867 },
+    {  929,  898 }, {  960,  929 }, {  960,  960 }, {   62,   31 },
+    {   93,   62 }, {  124,   93 }, {  186,  155 }, {  217,  186 },
+    {  248,  217 }, {  310,  279 }, {  341,  310 }, {  372,  341 },
+    {  434,  403 }, {  465,  434 }, {  496,  465 }, {  558,  527 },
+    {  589,  558 }, {  620,  589 }, {  682,  651 }, {  713,  682 },
+    {  744,  713 }, {  806,  775 }, {  837,  806 }, {  868,  837 },
+    {  930,  899 }, {  961,  930 }, {  992,  961 }, {   94,   63 },
+    {  125,   94 }, {  218,  187 }, {  249,  218 }, {  342,  311 },
+    {  373,  342 }, {  466,  435 }, {  497,  466 }, {  590,  559 },
+    {  621,  590 }, {  714,  683 }, {  745,  714 }, {  838,  807 },
+    {  869,  838 }, {  962,  931 }, {  993,  962 }, {  126,   95 },
+    {  250,  219 }, {  374,  343 }, {  498,  467 }, {  622,  591 },
+    {  746,  715 }, {  870,  839 }, {  994,  963 }, {  155,  124 },
+    {  279,  248 }, {  403,  372 }, {  527,  496 }, {  651,  620 },
+    {  775,  744 }, {  899,  868 }, {  156,  125 }, {  187,  156 },
+    {  280,  249 }, {  311,  280 }, {  404,  373 }, {  435,  404 },
+    {  528,  497 }, {  559,  528 }, {  652,  621 }, {  683,  652 },
+    {  776,  745 }, {  807,  776 }, {  900,  869 }, {  931,  900 },
+    {  157,  126 }, {  188,  157 }, {  219,  188 }, {  281,  250 },
+    {  312,  281 }, {  343,  312 }, {  405,  374 }, {  436,  405 },
+    {  467,  436 }, {  529,  498 }, {  560,  529 }, {  591,  560 },
+    {  653,  622 }, {  684,  653 }, {  715,  684 }, {  777,  746 },
+    {  808,  777 }, {  839,  808 }, {  901,  870 }, {  932,  901 },
+    {  963,  932 }, {  158,  127 }, {  189,  158 }, {  220,  189 },
+    {  251,  220 }, {  282,  251 }, {  313,  282 }, {  344,  313 },
+    {  375,  344 }, {  406,  375 }, {  437,  406 }, {  468,  437 },
+    {  499,  468 }, {  530,  499 }, {  561,  530 }, {  592,  561 },
+    {  623,  592 }, {  654,  623 }, {  685,  654 }, {  716,  685 },
+    {  747,  716 }, {  778,  747 }, {  809,  778 }, {  840,  809 },
+    {  871,  840 }, {  902,  871 }, {  933,  902 }, {  964,  933 },
+    {  995,  964 }, {  190,  159 }, {  221,  190 }, {  252,  221 },
+    {  314,  283 }, {  345,  314 }, {  376,  345 }, {  438,  407 },
+    {  469,  438 }, {  500,  469 }, {  562,  531 }, {  593,  562 },
+    {  624,  593 }, {  686,  655 }, {  717,  686 }, {  748,  717 },
+    {  810,  779 }, {  841,  810 }, {  872,  841 }, {  934,  903 },
+    {  965,  934 }, {  996,  965 }, {  222,  191 }, {  253,  222 },
+    {  346,  315 }, {  377,  346 }, {  470,  439 }, {  501,  470 },
+    {  594,  563 }, {  625,  594 }, {  718,  687 }, {  749,  718 },
+    {  842,  811 }, {  873,  842 }, {  966,  935 }, {  997,  966 },
+    {  254,  223 }, {  378,  347 }, {  502,  471 }, {  626,  595 },
+    {  750,  719 }, {  874,  843 }, {  998,  967 }, {  283,  252 },
+    {  407,  376 }, {  531,  500 }, {  655,  624 }, {  779,  748 },
+    {  903,  872 }, {  284,  253 }, {  315,  284 }, {  408,  377 },
+    {  439,  408 }, {  532,  501 }, {  563,  532 }, {  656,  625 },
+    {  687,  656 }, {  780,  749 }, {  811,  780 }, {  904,  873 },
+    {  935,  904 }, {  285,  254 }, {  316,  285 }, {  347,  316 },
+    {  409,  378 }, {  440,  409 }, {  471,  440 }, {  533,  502 },
+    {  564,  533 }, {  595,  564 }, {  657,  626 }, {  688,  657 },
+    {  719,  688 }, {  781,  750 }, {  812,  781 }, {  843,  812 },
+    {  905,  874 }, {  936,  905 }, {  967,  936 }, {  286,  255 },
+    {  317,  286 }, {  348,  317 }, {  379,  348 }, {  410,  379 },
+    {  441,  410 }, {  472,  441 }, {  503,  472 }, {  534,  503 },
+    {  565,  534 }, {  596,  565 }, {  627,  596 }, {  658,  627 },
+    {  689,  658 }, {  720,  689 }, {  751,  720 }, {  782,  751 },
+    {  813,  782 }, {  844,  813 }, {  875,  844 }, {  906,  875 },
+    {  937,  906 }, {  968,  937 }, {  999,  968 }, {  318,  287 },
+    {  349,  318 }, {  380,  349 }, {  442,  411 }, {  473,  442 },
+    {  504,  473 }, {  566,  535 }, {  597,  566 }, {  628,  597 },
+    {  690,  659 }, {  721,  690 }, {  752,  721 }, {  814,  783 },
+    {  845,  814 }, {  876,  845 }, {  938,  907 }, {  969,  938 },
+    { 1000,  969 }, {  350,  319 }, {  381,  350 }, {  474,  443 },
+    {  505,  474 }, {  598,  567 }, {  629,  598 }, {  722,  691 },
+    {  753,  722 }, {  846,  815 }, {  877,  846 }, {  970,  939 },
+    { 1001,  970 }, {  382,  351 }, {  506,  475 }, {  630,  599 },
+    {  754,  723 }, {  878,  847 }, { 1002,  971 }, {  411,  380 },
+    {  535,  504 }, {  659,  628 }, {  783,  752 }, {  907,  876 },
+    {  412,  381 }, {  443,  412 }, {  536,  505 }, {  567,  536 },
+    {  660,  629 }, {  691,  660 }, {  784,  753 }, {  815,  784 },
+    {  908,  877 }, {  939,  908 }, {  413,  382 }, {  444,  413 },
+    {  475,  444 }, {  537,  506 }, {  568,  537 }, {  599,  568 },
+    {  661,  630 }, {  692,  661 }, {  723,  692 }, {  785,  754 },
+    {  816,  785 }, {  847,  816 }, {  909,  878 }, {  940,  909 },
+    {  971,  940 }, {  414,  383 }, {  445,  414 }, {  476,  445 },
+    {  507,  476 }, {  538,  507 }, {  569,  538 }, {  600,  569 },
+    {  631,  600 }, {  662,  631 }, {  693,  662 }, {  724,  693 },
+    {  755,  724 }, {  786,  755 }, {  817,  786 }, {  848,  817 },
+    {  879,  848 }, {  910,  879 }, {  941,  910 }, {  972,  941 },
+    { 1003,  972 }, {  446,  415 }, {  477,  446 }, {  508,  477 },
+    {  570,  539 }, {  601,  570 }, {  632,  601 }, {  694,  663 },
+    {  725,  694 }, {  756,  725 }, {  818,  787 }, {  849,  818 },
+    {  880,  849 }, {  942,  911 }, {  973,  942 }, { 1004,  973 },
+    {  478,  447 }, {  509,  478 }, {  602,  571 }, {  633,  602 },
+    {  726,  695 }, {  757,  726 }, {  850,  819 }, {  881,  850 },
+    {  974,  943 }, { 1005,  974 }, {  510,  479 }, {  634,  603 },
+    {  758,  727 }, {  882,  851 }, { 1006,  975 }, {  539,  508 },
+    {  663,  632 }, {  787,  756 }, {  911,  880 }, {  540,  509 },
+    {  571,  540 }, {  664,  633 }, {  695,  664 }, {  788,  757 },
+    {  819,  788 }, {  912,  881 }, {  943,  912 }, {  541,  510 },
+    {  572,  541 }, {  603,  572 }, {  665,  634 }, {  696,  665 },
+    {  727,  696 }, {  789,  758 }, {  820,  789 }, {  851,  820 },
+    {  913,  882 }, {  944,  913 }, {  975,  944 }, {  542,  511 },
+    {  573,  542 }, {  604,  573 }, {  635,  604 }, {  666,  635 },
+    {  697,  666 }, {  728,  697 }, {  759,  728 }, {  790,  759 },
+    {  821,  790 }, {  852,  821 }, {  883,  852 }, {  914,  883 },
+    {  945,  914 }, {  976,  945 }, { 1007,  976 }, {  574,  543 },
+    {  605,  574 }, {  636,  605 }, {  698,  667 }, {  729,  698 },
+    {  760,  729 }, {  822,  791 }, {  853,  822 }, {  884,  853 },
+    {  946,  915 }, {  977,  946 }, { 1008,  977 }, {  606,  575 },
+    {  637,  606 }, {  730,  699 }, {  761,  730 }, {  854,  823 },
+    {  885,  854 }, {  978,  947 }, { 1009,  978 }, {  638,  607 },
+    {  762,  731 }, {  886,  855 }, { 1010,  979 }, {  667,  636 },
+    {  791,  760 }, {  915,  884 }, {  668,  637 }, {  699,  668 },
+    {  792,  761 }, {  823,  792 }, {  916,  885 }, {  947,  916 },
+    {  669,  638 }, {  700,  669 }, {  731,  700 }, {  793,  762 },
+    {  824,  793 }, {  855,  824 }, {  917,  886 }, {  948,  917 },
+    {  979,  948 }, {  670,  639 }, {  701,  670 }, {  732,  701 },
+    {  763,  732 }, {  794,  763 }, {  825,  794 }, {  856,  825 },
+    {  887,  856 }, {  918,  887 }, {  949,  918 }, {  980,  949 },
+    { 1011,  980 }, {  702,  671 }, {  733,  702 }, {  764,  733 },
+    {  826,  795 }, {  857,  826 }, {  888,  857 }, {  950,  919 },
+    {  981,  950 }, { 1012,  981 }, {  734,  703 }, {  765,  734 },
+    {  858,  827 }, {  889,  858 }, {  982,  951 }, { 1013,  982 },
+    {  766,  735 }, {  890,  859 }, { 1014,  983 }, {  795,  764 },
+    {  919,  888 }, {  796,  765 }, {  827,  796 }, {  920,  889 },
+    {  951,  920 }, {  797,  766 }, {  828,  797 }, {  859,  828 },
+    {  921,  890 }, {  952,  921 }, {  983,  952 }, {  798,  767 },
+    {  829,  798 }, {  860,  829 }, {  891,  860 }, {  922,  891 },
+    {  953,  922 }, {  984,  953 }, { 1015,  984 }, {  830,  799 },
+    {  861,  830 }, {  892,  861 }, {  954,  923 }, {  985,  954 },
+    { 1016,  985 }, {  862,  831 }, {  893,  862 }, {  986,  955 },
+    { 1017,  986 }, {  894,  863 }, { 1018,  987 }, {  923,  892 },
+    {  924,  893 }, {  955,  924 }, {  925,  894 }, {  956,  925 },
+    {  987,  956 }, {  926,  895 }, {  957,  926 }, {  988,  957 },
+    { 1019,  988 }, {  958,  927 }, {  989,  958 }, { 1020,  989 },
+    {  990,  959 }, { 1021,  990 }, { 1022,  991 }, {    0,    0 },
+};
+
+static const int16_t (* const vp9_scans_nb[5][4])[2] = {
+    {
+        vp9_default_scan_4x4_nb, vp9_col_scan_4x4_nb,
+        vp9_row_scan_4x4_nb, vp9_default_scan_4x4_nb
+    }, {
+        vp9_default_scan_8x8_nb, vp9_col_scan_8x8_nb,
+        vp9_row_scan_8x8_nb, vp9_default_scan_8x8_nb
+    }, {
+        vp9_default_scan_16x16_nb, vp9_col_scan_16x16_nb,
+        vp9_row_scan_16x16_nb, vp9_default_scan_16x16_nb
+    }, {
+        vp9_default_scan_32x32_nb, vp9_default_scan_32x32_nb,
+        vp9_default_scan_32x32_nb, vp9_default_scan_32x32_nb
+    }, { // lossless
+        vp9_default_scan_4x4_nb, vp9_default_scan_4x4_nb,
+        vp9_default_scan_4x4_nb, vp9_default_scan_4x4_nb
+    }
+};
+
+static const uint8_t vp9_model_pareto8[256][8] = {
+    {   6,  86, 128,  11,  87,  42,  91,  52 },
+    {   3,  86, 128,   6,  86,  23,  88,  29 },
+    {   6,  86, 128,  11,  87,  42,  91,  52 },
+    {   9,  86, 129,  17,  88,  61,  94,  76 },
+    {  12,  86, 129,  22,  88,  77,  97,  93 },
+    {  15,  87, 129,  28,  89,  93, 100, 110 },
+    {  17,  87, 129,  33,  90, 105, 103, 123 },
+    {  20,  88, 130,  38,  91, 118, 106, 136 },
+    {  23,  88, 130,  43,  91, 128, 108, 146 },
+    {  26,  89, 131,  48,  92, 139, 111, 156 },
+    {  28,  89, 131,  53,  93, 147, 114, 163 },
+    {  31,  90, 131,  58,  94, 156, 117, 171 },
+    {  34,  90, 131,  62,  94, 163, 119, 177 },
+    {  37,  90, 132,  66,  95, 171, 122, 184 },
+    {  39,  90, 132,  70,  96, 177, 124, 189 },
+    {  42,  91, 132,  75,  97, 183, 127, 194 },
+    {  44,  91, 132,  79,  97, 188, 129, 198 },
+    {  47,  92, 133,  83,  98, 193, 132, 202 },
+    {  49,  92, 133,  86,  99, 197, 134, 205 },
+    {  52,  93, 133,  90, 100, 201, 137, 208 },
+    {  54,  93, 133,  94, 100, 204, 139, 211 },
+    {  57,  94, 134,  98, 101, 208, 142, 214 },
+    {  59,  94, 134, 101, 102, 211, 144, 216 },
+    {  62,  94, 135, 105, 103, 214, 146, 218 },
+    {  64,  94, 135, 108, 103, 216, 148, 220 },
+    {  66,  95, 135, 111, 104, 219, 151, 222 },
+    {  68,  95, 135, 114, 105, 221, 153, 223 },
+    {  71,  96, 136, 117, 106, 224, 155, 225 },
+    {  73,  96, 136, 120, 106, 225, 157, 226 },
+    {  76,  97, 136, 123, 107, 227, 159, 228 },
+    {  78,  97, 136, 126, 108, 229, 160, 229 },
+    {  80,  98, 137, 129, 109, 231, 162, 231 },
+    {  82,  98, 137, 131, 109, 232, 164, 232 },
+    {  84,  98, 138, 134, 110, 234, 166, 233 },
+    {  86,  98, 138, 137, 111, 235, 168, 234 },
+    {  89,  99, 138, 140, 112, 236, 170, 235 },
+    {  91,  99, 138, 142, 112, 237, 171, 235 },
+    {  93, 100, 139, 145, 113, 238, 173, 236 },
+    {  95, 100, 139, 147, 114, 239, 174, 237 },
+    {  97, 101, 140, 149, 115, 240, 176, 238 },
+    {  99, 101, 140, 151, 115, 241, 177, 238 },
+    { 101, 102, 140, 154, 116, 242, 179, 239 },
+    { 103, 102, 140, 156, 117, 242, 180, 239 },
+    { 105, 103, 141, 158, 118, 243, 182, 240 },
+    { 107, 103, 141, 160, 118, 243, 183, 240 },
+    { 109, 104, 141, 162, 119, 244, 185, 241 },
+    { 111, 104, 141, 164, 119, 244, 186, 241 },
+    { 113, 104, 142, 166, 120, 245, 187, 242 },
+    { 114, 104, 142, 168, 121, 245, 188, 242 },
+    { 116, 105, 143, 170, 122, 246, 190, 243 },
+    { 118, 105, 143, 171, 122, 246, 191, 243 },
+    { 120, 106, 143, 173, 123, 247, 192, 244 },
+    { 121, 106, 143, 175, 124, 247, 193, 244 },
+    { 123, 107, 144, 177, 125, 248, 195, 244 },
+    { 125, 107, 144, 178, 125, 248, 196, 244 },
+    { 127, 108, 145, 180, 126, 249, 197, 245 },
+    { 128, 108, 145, 181, 127, 249, 198, 245 },
+    { 130, 109, 145, 183, 128, 249, 199, 245 },
+    { 132, 109, 145, 184, 128, 249, 200, 245 },
+    { 134, 110, 146, 186, 129, 250, 201, 246 },
+    { 135, 110, 146, 187, 130, 250, 202, 246 },
+    { 137, 111, 147, 189, 131, 251, 203, 246 },
+    { 138, 111, 147, 190, 131, 251, 204, 246 },
+    { 140, 112, 147, 192, 132, 251, 205, 247 },
+    { 141, 112, 147, 193, 132, 251, 206, 247 },
+    { 143, 113, 148, 194, 133, 251, 207, 247 },
+    { 144, 113, 148, 195, 134, 251, 207, 247 },
+    { 146, 114, 149, 197, 135, 252, 208, 248 },
+    { 147, 114, 149, 198, 135, 252, 209, 248 },
+    { 149, 115, 149, 199, 136, 252, 210, 248 },
+    { 150, 115, 149, 200, 137, 252, 210, 248 },
+    { 152, 115, 150, 201, 138, 252, 211, 248 },
+    { 153, 115, 150, 202, 138, 252, 212, 248 },
+    { 155, 116, 151, 204, 139, 253, 213, 249 },
+    { 156, 116, 151, 205, 139, 253, 213, 249 },
+    { 158, 117, 151, 206, 140, 253, 214, 249 },
+    { 159, 117, 151, 207, 141, 253, 215, 249 },
+    { 161, 118, 152, 208, 142, 253, 216, 249 },
+    { 162, 118, 152, 209, 142, 253, 216, 249 },
+    { 163, 119, 153, 210, 143, 253, 217, 249 },
+    { 164, 119, 153, 211, 143, 253, 217, 249 },
+    { 166, 120, 153, 212, 144, 254, 218, 250 },
+    { 167, 120, 153, 212, 145, 254, 219, 250 },
+    { 168, 121, 154, 213, 146, 254, 220, 250 },
+    { 169, 121, 154, 214, 146, 254, 220, 250 },
+    { 171, 122, 155, 215, 147, 254, 221, 250 },
+    { 172, 122, 155, 216, 147, 254, 221, 250 },
+    { 173, 123, 155, 217, 148, 254, 222, 250 },
+    { 174, 123, 155, 217, 149, 254, 222, 250 },
+    { 176, 124, 156, 218, 150, 254, 223, 250 },
+    { 177, 124, 156, 219, 150, 254, 223, 250 },
+    { 178, 125, 157, 220, 151, 254, 224, 251 },
+    { 179, 125, 157, 220, 151, 254, 224, 251 },
+    { 180, 126, 157, 221, 152, 254, 225, 251 },
+    { 181, 126, 157, 221, 152, 254, 225, 251 },
+    { 183, 127, 158, 222, 153, 254, 226, 251 },
+    { 184, 127, 158, 223, 154, 254, 226, 251 },
+    { 185, 128, 159, 224, 155, 255, 227, 251 },
+    { 186, 128, 159, 224, 155, 255, 227, 251 },
+    { 187, 129, 160, 225, 156, 255, 228, 251 },
+    { 188, 130, 160, 225, 156, 255, 228, 251 },
+    { 189, 131, 160, 226, 157, 255, 228, 251 },
+    { 190, 131, 160, 226, 158, 255, 228, 251 },
+    { 191, 132, 161, 227, 159, 255, 229, 251 },
+    { 192, 132, 161, 227, 159, 255, 229, 251 },
+    { 193, 133, 162, 228, 160, 255, 230, 252 },
+    { 194, 133, 162, 229, 160, 255, 230, 252 },
+    { 195, 134, 163, 230, 161, 255, 231, 252 },
+    { 196, 134, 163, 230, 161, 255, 231, 252 },
+    { 197, 135, 163, 231, 162, 255, 231, 252 },
+    { 198, 135, 163, 231, 162, 255, 231, 252 },
+    { 199, 136, 164, 232, 163, 255, 232, 252 },
+    { 200, 136, 164, 232, 164, 255, 232, 252 },
+    { 201, 137, 165, 233, 165, 255, 233, 252 },
+    { 201, 137, 165, 233, 165, 255, 233, 252 },
+    { 202, 138, 166, 233, 166, 255, 233, 252 },
+    { 203, 138, 166, 233, 166, 255, 233, 252 },
+    { 204, 139, 166, 234, 167, 255, 234, 252 },
+    { 205, 139, 166, 234, 167, 255, 234, 252 },
+    { 206, 140, 167, 235, 168, 255, 235, 252 },
+    { 206, 140, 167, 235, 168, 255, 235, 252 },
+    { 207, 141, 168, 236, 169, 255, 235, 252 },
+    { 208, 141, 168, 236, 170, 255, 235, 252 },
+    { 209, 142, 169, 237, 171, 255, 236, 252 },
+    { 209, 143, 169, 237, 171, 255, 236, 252 },
+    { 210, 144, 169, 237, 172, 255, 236, 252 },
+    { 211, 144, 169, 237, 172, 255, 236, 252 },
+    { 212, 145, 170, 238, 173, 255, 237, 252 },
+    { 213, 145, 170, 238, 173, 255, 237, 252 },
+    { 214, 146, 171, 239, 174, 255, 237, 253 },
+    { 214, 146, 171, 239, 174, 255, 237, 253 },
+    { 215, 147, 172, 240, 175, 255, 238, 253 },
+    { 215, 147, 172, 240, 175, 255, 238, 253 },
+    { 216, 148, 173, 240, 176, 255, 238, 253 },
+    { 217, 148, 173, 240, 176, 255, 238, 253 },
+    { 218, 149, 173, 241, 177, 255, 239, 253 },
+    { 218, 149, 173, 241, 178, 255, 239, 253 },
+    { 219, 150, 174, 241, 179, 255, 239, 253 },
+    { 219, 151, 174, 241, 179, 255, 239, 253 },
+    { 220, 152, 175, 242, 180, 255, 240, 253 },
+    { 221, 152, 175, 242, 180, 255, 240, 253 },
+    { 222, 153, 176, 242, 181, 255, 240, 253 },
+    { 222, 153, 176, 242, 181, 255, 240, 253 },
+    { 223, 154, 177, 243, 182, 255, 240, 253 },
+    { 223, 154, 177, 243, 182, 255, 240, 253 },
+    { 224, 155, 178, 244, 183, 255, 241, 253 },
+    { 224, 155, 178, 244, 183, 255, 241, 253 },
+    { 225, 156, 178, 244, 184, 255, 241, 253 },
+    { 225, 157, 178, 244, 184, 255, 241, 253 },
+    { 226, 158, 179, 244, 185, 255, 242, 253 },
+    { 227, 158, 179, 244, 185, 255, 242, 253 },
+    { 228, 159, 180, 245, 186, 255, 242, 253 },
+    { 228, 159, 180, 245, 186, 255, 242, 253 },
+    { 229, 160, 181, 245, 187, 255, 242, 253 },
+    { 229, 160, 181, 245, 187, 255, 242, 253 },
+    { 230, 161, 182, 246, 188, 255, 243, 253 },
+    { 230, 162, 182, 246, 188, 255, 243, 253 },
+    { 231, 163, 183, 246, 189, 255, 243, 253 },
+    { 231, 163, 183, 246, 189, 255, 243, 253 },
+    { 232, 164, 184, 247, 190, 255, 243, 253 },
+    { 232, 164, 184, 247, 190, 255, 243, 253 },
+    { 233, 165, 185, 247, 191, 255, 244, 253 },
+    { 233, 165, 185, 247, 191, 255, 244, 253 },
+    { 234, 166, 185, 247, 192, 255, 244, 253 },
+    { 234, 167, 185, 247, 192, 255, 244, 253 },
+    { 235, 168, 186, 248, 193, 255, 244, 253 },
+    { 235, 168, 186, 248, 193, 255, 244, 253 },
+    { 236, 169, 187, 248, 194, 255, 244, 253 },
+    { 236, 169, 187, 248, 194, 255, 244, 253 },
+    { 236, 170, 188, 248, 195, 255, 245, 253 },
+    { 236, 170, 188, 248, 195, 255, 245, 253 },
+    { 237, 171, 189, 249, 196, 255, 245, 254 },
+    { 237, 172, 189, 249, 196, 255, 245, 254 },
+    { 238, 173, 190, 249, 197, 255, 245, 254 },
+    { 238, 173, 190, 249, 197, 255, 245, 254 },
+    { 239, 174, 191, 249, 198, 255, 245, 254 },
+    { 239, 174, 191, 249, 198, 255, 245, 254 },
+    { 240, 175, 192, 249, 199, 255, 246, 254 },
+    { 240, 176, 192, 249, 199, 255, 246, 254 },
+    { 240, 177, 193, 250, 200, 255, 246, 254 },
+    { 240, 177, 193, 250, 200, 255, 246, 254 },
+    { 241, 178, 194, 250, 201, 255, 246, 254 },
+    { 241, 178, 194, 250, 201, 255, 246, 254 },
+    { 242, 179, 195, 250, 202, 255, 246, 254 },
+    { 242, 180, 195, 250, 202, 255, 246, 254 },
+    { 242, 181, 196, 250, 203, 255, 247, 254 },
+    { 242, 181, 196, 250, 203, 255, 247, 254 },
+    { 243, 182, 197, 251, 204, 255, 247, 254 },
+    { 243, 183, 197, 251, 204, 255, 247, 254 },
+    { 244, 184, 198, 251, 205, 255, 247, 254 },
+    { 244, 184, 198, 251, 205, 255, 247, 254 },
+    { 244, 185, 199, 251, 206, 255, 247, 254 },
+    { 244, 185, 199, 251, 206, 255, 247, 254 },
+    { 245, 186, 200, 251, 207, 255, 247, 254 },
+    { 245, 187, 200, 251, 207, 255, 247, 254 },
+    { 246, 188, 201, 252, 207, 255, 248, 254 },
+    { 246, 188, 201, 252, 207, 255, 248, 254 },
+    { 246, 189, 202, 252, 208, 255, 248, 254 },
+    { 246, 190, 202, 252, 208, 255, 248, 254 },
+    { 247, 191, 203, 252, 209, 255, 248, 254 },
+    { 247, 191, 203, 252, 209, 255, 248, 254 },
+    { 247, 192, 204, 252, 210, 255, 248, 254 },
+    { 247, 193, 204, 252, 210, 255, 248, 254 },
+    { 248, 194, 205, 252, 211, 255, 248, 254 },
+    { 248, 194, 205, 252, 211, 255, 248, 254 },
+    { 248, 195, 206, 252, 212, 255, 249, 254 },
+    { 248, 196, 206, 252, 212, 255, 249, 254 },
+    { 249, 197, 207, 253, 213, 255, 249, 254 },
+    { 249, 197, 207, 253, 213, 255, 249, 254 },
+    { 249, 198, 208, 253, 214, 255, 249, 254 },
+    { 249, 199, 209, 253, 214, 255, 249, 254 },
+    { 250, 200, 210, 253, 215, 255, 249, 254 },
+    { 250, 200, 210, 253, 215, 255, 249, 254 },
+    { 250, 201, 211, 253, 215, 255, 249, 254 },
+    { 250, 202, 211, 253, 215, 255, 249, 254 },
+    { 250, 203, 212, 253, 216, 255, 249, 254 },
+    { 250, 203, 212, 253, 216, 255, 249, 254 },
+    { 251, 204, 213, 253, 217, 255, 250, 254 },
+    { 251, 205, 213, 253, 217, 255, 250, 254 },
+    { 251, 206, 214, 254, 218, 255, 250, 254 },
+    { 251, 206, 215, 254, 218, 255, 250, 254 },
+    { 252, 207, 216, 254, 219, 255, 250, 254 },
+    { 252, 208, 216, 254, 219, 255, 250, 254 },
+    { 252, 209, 217, 254, 220, 255, 250, 254 },
+    { 252, 210, 217, 254, 220, 255, 250, 254 },
+    { 252, 211, 218, 254, 221, 255, 250, 254 },
+    { 252, 212, 218, 254, 221, 255, 250, 254 },
+    { 253, 213, 219, 254, 222, 255, 250, 254 },
+    { 253, 213, 220, 254, 222, 255, 250, 254 },
+    { 253, 214, 221, 254, 223, 255, 250, 254 },
+    { 253, 215, 221, 254, 223, 255, 250, 254 },
+    { 253, 216, 222, 254, 224, 255, 251, 254 },
+    { 253, 217, 223, 254, 224, 255, 251, 254 },
+    { 253, 218, 224, 254, 225, 255, 251, 254 },
+    { 253, 219, 224, 254, 225, 255, 251, 254 },
+    { 254, 220, 225, 254, 225, 255, 251, 254 },
+    { 254, 221, 226, 254, 225, 255, 251, 254 },
+    { 254, 222, 227, 255, 226, 255, 251, 254 },
+    { 254, 223, 227, 255, 226, 255, 251, 254 },
+    { 254, 224, 228, 255, 227, 255, 251, 254 },
+    { 254, 225, 229, 255, 227, 255, 251, 254 },
+    { 254, 226, 230, 255, 228, 255, 251, 254 },
+    { 254, 227, 230, 255, 229, 255, 251, 254 },
+    { 255, 228, 231, 255, 230, 255, 251, 254 },
+    { 255, 229, 232, 255, 230, 255, 251, 254 },
+    { 255, 230, 233, 255, 231, 255, 252, 254 },
+    { 255, 231, 234, 255, 231, 255, 252, 254 },
+    { 255, 232, 235, 255, 232, 255, 252, 254 },
+    { 255, 233, 236, 255, 232, 255, 252, 254 },
+    { 255, 235, 237, 255, 233, 255, 252, 254 },
+    { 255, 236, 238, 255, 234, 255, 252, 254 },
+    { 255, 238, 240, 255, 235, 255, 252, 255 },
+    { 255, 239, 241, 255, 235, 255, 252, 254 },
+    { 255, 241, 243, 255, 236, 255, 252, 254 },
+    { 255, 243, 245, 255, 237, 255, 252, 254 },
+    { 255, 246, 247, 255, 239, 255, 253, 255 },
+};
+
+typedef struct {
+    uint8_t y_mode[4][9];
+    uint8_t uv_mode[10][9];
+    uint8_t filter[4][2];
+    uint8_t mv_mode[7][3];
+    uint8_t intra[4];
+    uint8_t comp[5];
+    uint8_t single_ref[5][2];
+    uint8_t comp_ref[5];
+    uint8_t tx32p[2][3];
+    uint8_t tx16p[2][2];
+    uint8_t tx8p[2];
+    uint8_t skip[3];
+    uint8_t mv_joint[3];
+    struct {
+        uint8_t sign;
+        uint8_t classes[10];
+        uint8_t class0;
+        uint8_t bits[10];
+        uint8_t class0_fp[2][3];
+        uint8_t fp[3];
+        uint8_t class0_hp;
+        uint8_t hp;
+    } mv_comp[2];
+    uint8_t partition[4][4][3];
+} prob_context;
+
+static const prob_context vp9_default_probs = {
+    { /* y_mode */
+        {  65,  32,  18, 144, 162, 194,  41,  51,  98 } /* bsize < 8x8 */,
+        { 132,  68,  18, 165, 217, 196,  45,  40,  78 } /* bsize < 16x16 */,
+        { 173,  80,  19, 176, 240, 193,  64,  35,  46 } /* bsize < 32x32 */,
+        { 221, 135,  38, 194, 248, 121,  96,  85,  29 } /* bsize >= 32x32 */
+    }, { /* uv_mode */
+        {  48,  12, 154, 155, 139,  90,  34, 117, 119 } /* y = v */,
+        {  67,   6,  25, 204, 243, 158,  13,  21,  96 } /* y = h */,
+        { 120,   7,  76, 176, 208, 126,  28,  54, 103 } /* y = dc */,
+        {  97,   5,  44, 131, 176, 139,  48,  68,  97 } /* y = d45 */,
+        {  83,   5,  42, 156, 111, 152,  26,  49, 152 } /* y = d135 */,
+        {  80,   5,  58, 178,  74,  83,  33,  62, 145 } /* y = d117 */,
+        {  86,   5,  32, 154, 192, 168,  14,  22, 163 } /* y = d153 */,
+        {  77,   7,  64, 116, 132, 122,  37, 126, 120 } /* y = d63 */,
+        {  85,   5,  32, 156, 216, 148,  19,  29,  73 } /* y = d27 */,
+        { 101,  21, 107, 181, 192, 103,  19,  67, 125 } /* y = tm */
+    }, { /* filter */
+        { 235, 162, },
+        {  36, 255, },
+        {  34,   3, },
+        { 149, 144, },
+    }, { /* mv_mode */
+        {  2, 173,  34},  // 0 = both zero mv
+        {  7, 145,  85},  // 1 = one zero mv + one a predicted mv
+        {  7, 166,  63},  // 2 = two predicted mvs
+        {  7,  94,  66},  // 3 = one predicted/zero and one new mv
+        {  8,  64,  46},  // 4 = two new mvs
+        { 17,  81,  31},  // 5 = one intra neighbour + x
+        { 25,  29,  30},  // 6 = two intra neighbours
+    }, { /* intra */
+        9, 102, 187, 225
+    }, { /* comp */
+        239, 183, 119,  96,  41
+    }, { /* single_ref */
+        {  33,  16 },
+        {  77,  74 },
+        { 142, 142 },
+        { 172, 170 },
+        { 238, 247 }
+    }, { /* comp_ref */
+        50, 126, 123, 221, 226
+    }, { /* tx32p */
+        { 3, 136, 37, },
+        { 5,  52, 13, },
+    }, { /* tx16p */
+        { 20, 152, },
+        { 15, 101, },
+    }, { /* tx8p */
+        100, 66
+    }, { /* skip */
+        192, 128, 64
+    }, { /* mv_joint */
+        32, 64, 96
+    }, {
+        { /* mv vertical component */
+            128, /* sign */
+            { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 }, /* class */
+            216, /* class0 */
+            { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */
+            { /* class0_fp */
+                { 128, 128, 64 },
+                {  96, 112, 64 }
+            },
+            { 64, 96, 64 }, /* fp */
+            160, /* class0_hp bit */
+            128, /* hp */
+        }, { /* mv horizontal component */
+            128, /* sign */
+            { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 }, /* class */
+            208, /* class0 */
+            { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, /* bits */
+            { /* class0_fp */
+                { 128, 128, 64 },
+                {  96, 112, 64 }
+            },
+            { 64, 96, 64 }, /* fp */
+            160, /* class0_hp bit */
+            128, /* hp */
+        }
+    }, { /* partition */
+        { /* 64x64 -> 32x32 */
+            { 222,  34,  30 } /* a/l both not split */,
+            {  72,  16,  44 } /* a split, l not split */,
+            {  58,  32,  12 } /* l split, a not split */,
+            {  10,   7,   6 } /* a/l both split */,
+        }, { /* 32x32 -> 16x16 */
+            { 177,  58,  59 } /* a/l both not split */,
+            {  68,  26,  63 } /* a split, l not split */,
+            {  52,  79,  25 } /* l split, a not split */,
+            {  17,  14,  12 } /* a/l both split */,
+        }, { /* 16x16 -> 8x8 */
+            { 174,  73,  87 } /* a/l both not split */,
+            {  92,  41,  83 } /* a split, l not split */,
+            {  82,  99,  50 } /* l split, a not split */,
+            {  53,  39,  39 } /* a/l both split */,
+        }, { /* 8x8 -> 4x4 */
+            { 199, 122, 141 } /* a/l both not split */,
+            { 147,  63, 159 } /* a split, l not split */,
+            { 148, 133, 118 } /* l split, a not split */,
+            { 121, 104, 114 } /* a/l both split */,
+        }
+    },
+};
+
+static const uint8_t vp9_default_coef_probs[4][2][2][6][6][3] = {
+    { /* tx = 4x4 */
+        { /* block Type 0 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 195,  29, 183 },
+                    {  84,  49, 136 },
+                    {   8,  42,  71 }
+                }, { /* Coeff Band 1 */
+                    {  31, 107, 169 },
+                    {  35,  99, 159 },
+                    {  17,  82, 140 },
+                    {   8,  66, 114 },
+                    {   2,  44,  76 },
+                    {   1,  19,  32 }
+                }, { /* Coeff Band 2 */
+                    {  40, 132, 201 },
+                    {  29, 114, 187 },
+                    {  13,  91, 157 },
+                    {   7,  75, 127 },
+                    {   3,  58,  95 },
+                    {   1,  28,  47 }
+                }, { /* Coeff Band 3 */
+                    {  69, 142, 221 },
+                    {  42, 122, 201 },
+                    {  15,  91, 159 },
+                    {   6,  67, 121 },
+                    {   1,  42,  77 },
+                    {   1,  17,  31 }
+                }, { /* Coeff Band 4 */
+                    { 102, 148, 228 },
+                    {  67, 117, 204 },
+                    {  17,  82, 154 },
+                    {   6,  59, 114 },
+                    {   2,  39,  75 },
+                    {   1,  15,  29 }
+                }, { /* Coeff Band 5 */
+                    { 156,  57, 233 },
+                    { 119,  57, 212 },
+                    {  58,  48, 163 },
+                    {  29,  40, 124 },
+                    {  12,  30,  81 },
+                    {   3,  12,  31 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 191, 107, 226 },
+                    { 124, 117, 204 },
+                    {  25,  99, 155 }
+                }, { /* Coeff Band 1 */
+                    {  29, 148, 210 },
+                    {  37, 126, 194 },
+                    {   8,  93, 157 },
+                    {   2,  68, 118 },
+                    {   1,  39,  69 },
+                    {   1,  17,  33 }
+                }, { /* Coeff Band 2 */
+                    {  41, 151, 213 },
+                    {  27, 123, 193 },
+                    {   3,  82, 144 },
+                    {   1,  58, 105 },
+                    {   1,  32,  60 },
+                    {   1,  13,  26 }
+                }, { /* Coeff Band 3 */
+                    {  59, 159, 220 },
+                    {  23, 126, 198 },
+                    {   4,  88, 151 },
+                    {   1,  66, 114 },
+                    {   1,  38,  71 },
+                    {   1,  18,  34 }
+                }, { /* Coeff Band 4 */
+                    { 114, 136, 232 },
+                    {  51, 114, 207 },
+                    {  11,  83, 155 },
+                    {   3,  56, 105 },
+                    {   1,  33,  65 },
+                    {   1,  17,  34 }
+                }, { /* Coeff Band 5 */
+                    { 149,  65, 234 },
+                    { 121,  57, 215 },
+                    {  61,  49, 166 },
+                    {  28,  36, 114 },
+                    {  12,  25,  76 },
+                    {   3,  16,  42 }
+                }
+            }
+        }, { /* block Type 1 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 214,  49, 220 },
+                    { 132,  63, 188 },
+                    {  42,  65, 137 }
+                }, { /* Coeff Band 1 */
+                    {  85, 137, 221 },
+                    { 104, 131, 216 },
+                    {  49, 111, 192 },
+                    {  21,  87, 155 },
+                    {   2,  49,  87 },
+                    {   1,  16,  28 }
+                }, { /* Coeff Band 2 */
+                    {  89, 163, 230 },
+                    {  90, 137, 220 },
+                    {  29, 100, 183 },
+                    {  10,  70, 135 },
+                    {   2,  42,  81 },
+                    {   1,  17,  33 }
+                }, { /* Coeff Band 3 */
+                    { 108, 167, 237 },
+                    {  55, 133, 222 },
+                    {  15,  97, 179 },
+                    {   4,  72, 135 },
+                    {   1,  45,  85 },
+                    {   1,  19,  38 }
+                }, { /* Coeff Band 4 */
+                    { 124, 146, 240 },
+                    {  66, 124, 224 },
+                    {  17,  88, 175 },
+                    {   4,  58, 122 },
+                    {   1,  36,  75 },
+                    {   1,  18,  37 }
+                }, { /* Coeff Band 5 */
+                    { 141,  79, 241 },
+                    { 126,  70, 227 },
+                    {  66,  58, 182 },
+                    {  30,  44, 136 },
+                    {  12,  34,  96 },
+                    {   2,  20,  47 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 229,  99, 249 },
+                    { 143, 111, 235 },
+                    {  46, 109, 192 }
+                }, { /* Coeff Band 1 */
+                    {  82, 158, 236 },
+                    {  94, 146, 224 },
+                    {  25, 117, 191 },
+                    {   9,  87, 149 },
+                    {   3,  56,  99 },
+                    {   1,  33,  57 }
+                }, { /* Coeff Band 2 */
+                    {  83, 167, 237 },
+                    {  68, 145, 222 },
+                    {  10, 103, 177 },
+                    {   2,  72, 131 },
+                    {   1,  41,  79 },
+                    {   1,  20,  39 }
+                }, { /* Coeff Band 3 */
+                    {  99, 167, 239 },
+                    {  47, 141, 224 },
+                    {  10, 104, 178 },
+                    {   2,  73, 133 },
+                    {   1,  44,  85 },
+                    {   1,  22,  47 }
+                }, { /* Coeff Band 4 */
+                    { 127, 145, 243 },
+                    {  71, 129, 228 },
+                    {  17,  93, 177 },
+                    {   3,  61, 124 },
+                    {   1,  41,  84 },
+                    {   1,  21,  52 }
+                }, { /* Coeff Band 5 */
+                    { 157,  78, 244 },
+                    { 140,  72, 231 },
+                    {  69,  58, 184 },
+                    {  31,  44, 137 },
+                    {  14,  38, 105 },
+                    {   8,  23,  61 }
+                }
+            }
+        }
+    }, { /* tx = 8x8 */
+        { /* block Type 0 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 125,  34, 187 },
+                    {  52,  41, 133 },
+                    {   6,  31,  56 }
+                }, { /* Coeff Band 1 */
+                    {  37, 109, 153 },
+                    {  51, 102, 147 },
+                    {  23,  87, 128 },
+                    {   8,  67, 101 },
+                    {   1,  41,  63 },
+                    {   1,  19,  29 }
+                }, { /* Coeff Band 2 */
+                    {  31, 154, 185 },
+                    {  17, 127, 175 },
+                    {   6,  96, 145 },
+                    {   2,  73, 114 },
+                    {   1,  51,  82 },
+                    {   1,  28,  45 }
+                }, { /* Coeff Band 3 */
+                    {  23, 163, 200 },
+                    {  10, 131, 185 },
+                    {   2,  93, 148 },
+                    {   1,  67, 111 },
+                    {   1,  41,  69 },
+                    {   1,  14,  24 }
+                }, { /* Coeff Band 4 */
+                    {  29, 176, 217 },
+                    {  12, 145, 201 },
+                    {   3, 101, 156 },
+                    {   1,  69, 111 },
+                    {   1,  39,  63 },
+                    {   1,  14,  23 }
+                }, { /* Coeff Band 5 */
+                    {  57, 192, 233 },
+                    {  25, 154, 215 },
+                    {   6, 109, 167 },
+                    {   3,  78, 118 },
+                    {   1,  48,  69 },
+                    {   1,  21,  29 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 202, 105, 245 },
+                    { 108, 106, 216 },
+                    {  18,  90, 144 }
+                }, { /* Coeff Band 1 */
+                    {  33, 172, 219 },
+                    {  64, 149, 206 },
+                    {  14, 117, 177 },
+                    {   5,  90, 141 },
+                    {   2,  61,  95 },
+                    {   1,  37,  57 }
+                }, { /* Coeff Band 2 */
+                    {  33, 179, 220 },
+                    {  11, 140, 198 },
+                    {   1,  89, 148 },
+                    {   1,  60, 104 },
+                    {   1,  33,  57 },
+                    {   1,  12,  21 }
+                }, { /* Coeff Band 3 */
+                    {  30, 181, 221 },
+                    {   8, 141, 198 },
+                    {   1,  87, 145 },
+                    {   1,  58, 100 },
+                    {   1,  31,  55 },
+                    {   1,  12,  20 }
+                }, { /* Coeff Band 4 */
+                    {  32, 186, 224 },
+                    {   7, 142, 198 },
+                    {   1,  86, 143 },
+                    {   1,  58, 100 },
+                    {   1,  31,  55 },
+                    {   1,  12,  22 }
+                }, { /* Coeff Band 5 */
+                    {  57, 192, 227 },
+                    {  20, 143, 204 },
+                    {   3,  96, 154 },
+                    {   1,  68, 112 },
+                    {   1,  42,  69 },
+                    {   1,  19,  32 }
+                }
+            }
+        }, { /* block Type 1 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 212,  35, 215 },
+                    { 113,  47, 169 },
+                    {  29,  48, 105 }
+                }, { /* Coeff Band 1 */
+                    {  74, 129, 203 },
+                    { 106, 120, 203 },
+                    {  49, 107, 178 },
+                    {  19,  84, 144 },
+                    {   4,  50,  84 },
+                    {   1,  15,  25 }
+                }, { /* Coeff Band 2 */
+                    {  71, 172, 217 },
+                    {  44, 141, 209 },
+                    {  15, 102, 173 },
+                    {   6,  76, 133 },
+                    {   2,  51,  89 },
+                    {   1,  24,  42 }
+                }, { /* Coeff Band 3 */
+                    {  64, 185, 231 },
+                    {  31, 148, 216 },
+                    {   8, 103, 175 },
+                    {   3,  74, 131 },
+                    {   1,  46,  81 },
+                    {   1,  18,  30 }
+                }, { /* Coeff Band 4 */
+                    {  65, 196, 235 },
+                    {  25, 157, 221 },
+                    {   5, 105, 174 },
+                    {   1,  67, 120 },
+                    {   1,  38,  69 },
+                    {   1,  15,  30 }
+                }, { /* Coeff Band 5 */
+                    {  65, 204, 238 },
+                    {  30, 156, 224 },
+                    {   7, 107, 177 },
+                    {   2,  70, 124 },
+                    {   1,  42,  73 },
+                    {   1,  18,  34 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 225,  86, 251 },
+                    { 144, 104, 235 },
+                    {  42,  99, 181 }
+                }, { /* Coeff Band 1 */
+                    {  85, 175, 239 },
+                    { 112, 165, 229 },
+                    {  29, 136, 200 },
+                    {  12, 103, 162 },
+                    {   6,  77, 123 },
+                    {   2,  53,  84 }
+                }, { /* Coeff Band 2 */
+                    {  75, 183, 239 },
+                    {  30, 155, 221 },
+                    {   3, 106, 171 },
+                    {   1,  74, 128 },
+                    {   1,  44,  76 },
+                    {   1,  17,  28 }
+                }, { /* Coeff Band 3 */
+                    {  73, 185, 240 },
+                    {  27, 159, 222 },
+                    {   2, 107, 172 },
+                    {   1,  75, 127 },
+                    {   1,  42,  73 },
+                    {   1,  17,  29 }
+                }, { /* Coeff Band 4 */
+                    {  62, 190, 238 },
+                    {  21, 159, 222 },
+                    {   2, 107, 172 },
+                    {   1,  72, 122 },
+                    {   1,  40,  71 },
+                    {   1,  18,  32 }
+                }, { /* Coeff Band 5 */
+                    {  61, 199, 240 },
+                    {  27, 161, 226 },
+                    {   4, 113, 180 },
+                    {   1,  76, 129 },
+                    {   1,  46,  80 },
+                    {   1,  23,  41 }
+                }
+            }
+        }
+    }, { /* tx = 16x16 */
+        { /* block Type 0 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    {   7,  27, 153 },
+                    {   5,  30,  95 },
+                    {   1,  16,  30 }
+                }, { /* Coeff Band 1 */
+                    {  50,  75, 127 },
+                    {  57,  75, 124 },
+                    {  27,  67, 108 },
+                    {  10,  54,  86 },
+                    {   1,  33,  52 },
+                    {   1,  12,  18 }
+                }, { /* Coeff Band 2 */
+                    {  43, 125, 151 },
+                    {  26, 108, 148 },
+                    {   7,  83, 122 },
+                    {   2,  59,  89 },
+                    {   1,  38,  60 },
+                    {   1,  17,  27 }
+                }, { /* Coeff Band 3 */
+                    {  23, 144, 163 },
+                    {  13, 112, 154 },
+                    {   2,  75, 117 },
+                    {   1,  50,  81 },
+                    {   1,  31,  51 },
+                    {   1,  14,  23 }
+                }, { /* Coeff Band 4 */
+                    {  18, 162, 185 },
+                    {   6, 123, 171 },
+                    {   1,  78, 125 },
+                    {   1,  51,  86 },
+                    {   1,  31,  54 },
+                    {   1,  14,  23 }
+                }, { /* Coeff Band 5 */
+                    {  15, 199, 227 },
+                    {   3, 150, 204 },
+                    {   1,  91, 146 },
+                    {   1,  55,  95 },
+                    {   1,  30,  53 },
+                    {   1,  11,  20 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    {  19,  55, 240 },
+                    {  19,  59, 196 },
+                    {   3,  52, 105 }
+                }, { /* Coeff Band 1 */
+                    {  41, 166, 207 },
+                    { 104, 153, 199 },
+                    {  31, 123, 181 },
+                    {  14, 101, 152 },
+                    {   5,  72, 106 },
+                    {   1,  36,  52 }
+                }, { /* Coeff Band 2 */
+                    {  35, 176, 211 },
+                    {  12, 131, 190 },
+                    {   2,  88, 144 },
+                    {   1,  60, 101 },
+                    {   1,  36,  60 },
+                    {   1,  16,  28 }
+                }, { /* Coeff Band 3 */
+                    {  28, 183, 213 },
+                    {   8, 134, 191 },
+                    {   1,  86, 142 },
+                    {   1,  56,  96 },
+                    {   1,  30,  53 },
+                    {   1,  12,  20 }
+                }, { /* Coeff Band 4 */
+                    {  20, 190, 215 },
+                    {   4, 135, 192 },
+                    {   1,  84, 139 },
+                    {   1,  53,  91 },
+                    {   1,  28,  49 },
+                    {   1,  11,  20 }
+                }, { /* Coeff Band 5 */
+                    {  13, 196, 216 },
+                    {   2, 137, 192 },
+                    {   1,  86, 143 },
+                    {   1,  57,  99 },
+                    {   1,  32,  56 },
+                    {   1,  13,  24 }
+                }
+            }
+        }, { /* block Type 1 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 211,  29, 217 },
+                    {  96,  47, 156 },
+                    {  22,  43,  87 }
+                }, { /* Coeff Band 1 */
+                    {  78, 120, 193 },
+                    { 111, 116, 186 },
+                    {  46, 102, 164 },
+                    {  15,  80, 128 },
+                    {   2,  49,  76 },
+                    {   1,  18,  28 }
+                }, { /* Coeff Band 2 */
+                    {  71, 161, 203 },
+                    {  42, 132, 192 },
+                    {  10,  98, 150 },
+                    {   3,  69, 109 },
+                    {   1,  44,  70 },
+                    {   1,  18,  29 }
+                }, { /* Coeff Band 3 */
+                    {  57, 186, 211 },
+                    {  30, 140, 196 },
+                    {   4,  93, 146 },
+                    {   1,  62, 102 },
+                    {   1,  38,  65 },
+                    {   1,  16,  27 }
+                }, { /* Coeff Band 4 */
+                    {  47, 199, 217 },
+                    {  14, 145, 196 },
+                    {   1,  88, 142 },
+                    {   1,  57,  98 },
+                    {   1,  36,  62 },
+                    {   1,  15,  26 }
+                }, { /* Coeff Band 5 */
+                    {  26, 219, 229 },
+                    {   5, 155, 207 },
+                    {   1,  94, 151 },
+                    {   1,  60, 104 },
+                    {   1,  36,  62 },
+                    {   1,  16,  28 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 233,  29, 248 },
+                    { 146,  47, 220 },
+                    {  43,  52, 140 }
+                }, { /* Coeff Band 1 */
+                    { 100, 163, 232 },
+                    { 179, 161, 222 },
+                    {  63, 142, 204 },
+                    {  37, 113, 174 },
+                    {  26,  89, 137 },
+                    {  18,  68,  97 }
+                }, { /* Coeff Band 2 */
+                    {  85, 181, 230 },
+                    {  32, 146, 209 },
+                    {   7, 100, 164 },
+                    {   3,  71, 121 },
+                    {   1,  45,  77 },
+                    {   1,  18,  30 }
+                }, { /* Coeff Band 3 */
+                    {  65, 187, 230 },
+                    {  20, 148, 207 },
+                    {   2,  97, 159 },
+                    {   1,  68, 116 },
+                    {   1,  40,  70 },
+                    {   1,  14,  29 }
+                }, { /* Coeff Band 4 */
+                    {  40, 194, 227 },
+                    {   8, 147, 204 },
+                    {   1,  94, 155 },
+                    {   1,  65, 112 },
+                    {   1,  39,  66 },
+                    {   1,  14,  26 }
+                }, { /* Coeff Band 5 */
+                    {  16, 208, 228 },
+                    {   3, 151, 207 },
+                    {   1,  98, 160 },
+                    {   1,  67, 117 },
+                    {   1,  41,  74 },
+                    {   1,  17,  31 }
+                }
+            }
+        }
+    }, { /* tx = 32x32 */
+        { /* block Type 0 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    {  17,  38, 140 },
+                    {   7,  34,  80 },
+                    {   1,  17,  29 }
+                }, { /* Coeff Band 1 */
+                    {  37,  75, 128 },
+                    {  41,  76, 128 },
+                    {  26,  66, 116 },
+                    {  12,  52,  94 },
+                    {   2,  32,  55 },
+                    {   1,  10,  16 }
+                }, { /* Coeff Band 2 */
+                    {  50, 127, 154 },
+                    {  37, 109, 152 },
+                    {  16,  82, 121 },
+                    {   5,  59,  85 },
+                    {   1,  35,  54 },
+                    {   1,  13,  20 }
+                }, { /* Coeff Band 3 */
+                    {  40, 142, 167 },
+                    {  17, 110, 157 },
+                    {   2,  71, 112 },
+                    {   1,  44,  72 },
+                    {   1,  27,  45 },
+                    {   1,  11,  17 }
+                }, { /* Coeff Band 4 */
+                    {  30, 175, 188 },
+                    {   9, 124, 169 },
+                    {   1,  74, 116 },
+                    {   1,  48,  78 },
+                    {   1,  30,  49 },
+                    {   1,  11,  18 }
+                }, { /* Coeff Band 5 */
+                    {  10, 222, 223 },
+                    {   2, 150, 194 },
+                    {   1,  83, 128 },
+                    {   1,  48,  79 },
+                    {   1,  27,  45 },
+                    {   1,  11,  17 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    {  36,  41, 235 },
+                    {  29,  36, 193 },
+                    {  10,  27, 111 }
+                }, { /* Coeff Band 1 */
+                    {  85, 165, 222 },
+                    { 177, 162, 215 },
+                    { 110, 135, 195 },
+                    {  57, 113, 168 },
+                    {  23,  83, 120 },
+                    {  10,  49,  61 }
+                }, { /* Coeff Band 2 */
+                    {  85, 190, 223 },
+                    {  36, 139, 200 },
+                    {   5,  90, 146 },
+                    {   1,  60, 103 },
+                    {   1,  38,  65 },
+                    {   1,  18,  30 }
+                }, { /* Coeff Band 3 */
+                    {  72, 202, 223 },
+                    {  23, 141, 199 },
+                    {   2,  86, 140 },
+                    {   1,  56,  97 },
+                    {   1,  36,  61 },
+                    {   1,  16,  27 }
+                }, { /* Coeff Band 4 */
+                    {  55, 218, 225 },
+                    {  13, 145, 200 },
+                    {   1,  86, 141 },
+                    {   1,  57,  99 },
+                    {   1,  35,  61 },
+                    {   1,  13,  22 }
+                }, { /* Coeff Band 5 */
+                    {  15, 235, 212 },
+                    {   1, 132, 184 },
+                    {   1,  84, 139 },
+                    {   1,  57,  97 },
+                    {   1,  34,  56 },
+                    {   1,  14,  23 }
+                }
+            }
+        }, { /* block Type 1 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 181,  21, 201 },
+                    {  61,  37, 123 },
+                    {  10,  38,  71 }
+                }, { /* Coeff Band 1 */
+                    {  47, 106, 172 },
+                    {  95, 104, 173 },
+                    {  42,  93, 159 },
+                    {  18,  77, 131 },
+                    {   4,  50,  81 },
+                    {   1,  17,  23 }
+                }, { /* Coeff Band 2 */
+                    {  62, 147, 199 },
+                    {  44, 130, 189 },
+                    {  28, 102, 154 },
+                    {  18,  75, 115 },
+                    {   2,  44,  65 },
+                    {   1,  12,  19 }
+                }, { /* Coeff Band 3 */
+                    {  55, 153, 210 },
+                    {  24, 130, 194 },
+                    {   3,  93, 146 },
+                    {   1,  61,  97 },
+                    {   1,  31,  50 },
+                    {   1,  10,  16 }
+                }, { /* Coeff Band 4 */
+                    {  49, 186, 223 },
+                    {  17, 148, 204 },
+                    {   1,  96, 142 },
+                    {   1,  53,  83 },
+                    {   1,  26,  44 },
+                    {   1,  11,  17 }
+                }, { /* Coeff Band 5 */
+                    {  13, 217, 212 },
+                    {   2, 136, 180 },
+                    {   1,  78, 124 },
+                    {   1,  50,  83 },
+                    {   1,  29,  49 },
+                    {   1,  14,  23 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 197,  13, 247 },
+                    {  82,  17, 222 },
+                    {  25,  17, 162 }
+                }, { /* Coeff Band 1 */
+                    { 126, 186, 247 },
+                    { 234, 191, 243 },
+                    { 176, 177, 234 },
+                    { 104, 158, 220 },
+                    {  66, 128, 186 },
+                    {  55,  90, 137 }
+                }, { /* Coeff Band 2 */
+                    { 111, 197, 242 },
+                    {  46, 158, 219 },
+                    {   9, 104, 171 },
+                    {   2,  65, 125 },
+                    {   1,  44,  80 },
+                    {   1,  17,  91 }
+                }, { /* Coeff Band 3 */
+                    { 104, 208, 245 },
+                    {  39, 168, 224 },
+                    {   3, 109, 162 },
+                    {   1,  79, 124 },
+                    {   1,  50, 102 },
+                    {   1,  43, 102 }
+                }, { /* Coeff Band 4 */
+                    {  84, 220, 246 },
+                    {  31, 177, 231 },
+                    {   2, 115, 180 },
+                    {   1,  79, 134 },
+                    {   1,  55,  77 },
+                    {   1,  60,  79 }
+                }, { /* Coeff Band 5 */
+                    {  43, 243, 240 },
+                    {   8, 180, 217 },
+                    {   1, 115, 166 },
+                    {   1,  84, 121 },
+                    {   1,  51,  67 },
+                    {   1,  16,   6 }
+                }
+            }
+        }
+    }
+};
+
+enum MVJoint {
+    MV_JOINT_ZERO,
+    MV_JOINT_H,
+    MV_JOINT_V,
+    MV_JOINT_HV,
+};
+
+static const int8_t vp9_mv_joint_tree[3][2] = {
+    { -MV_JOINT_ZERO, 1 },           // '0'
+     { -MV_JOINT_H, 2 },             // '10'
+      { -MV_JOINT_V, -MV_JOINT_HV }, // '11x'
+};
+
+static const int8_t vp9_mv_class_tree[10][2] = {
+    { -0, 1 },         // '0'
+     { -1, 2 },        // '10'
+      { 3, 4 },
+       { -2, -3 },     // '110x'
+       { 5, 6 },
+        { -4, -5 },    // '1110x'
+        { -6, 7 },     // '11110'
+         { 8, 9 },
+          { -7, -8 },  // '111110x'
+          { -9, -10 }, // '111111x'
+};
+
+static const int8_t vp9_mv_fp_tree[3][2] = {
+    { -0, 1 },    // '0'
+     { -1, 2 },   // '10'
+      { -2, -3 }, // '11x'
+};
 
 #endif /* AVCODEC_VP9DATA_H */
diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c
index c83defeda3..54e77e267b 100644
--- a/libavcodec/vp9dsp.c
+++ b/libavcodec/vp9dsp.c
@@ -4,2171 +4,38 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
-#include "libavutil/intreadwrite.h"
+#include "vp9dsp.h"
 
-#include "rnd_avg.h"
-#include "vp9.h"
-
-// FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
-// back with h264pred.[ch]
-
-static void vert_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    unsigned p4 = AV_RN32A(top);
-
-    AV_WN32A(dst + stride * 0, p4);
-    AV_WN32A(dst + stride * 1, p4);
-    AV_WN32A(dst + stride * 2, p4);
-    AV_WN32A(dst + stride * 3, p4);
-}
-
-static void vert_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    uint64_t p8 = AV_RN64A(top);
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, p8);
-        dst += stride;
-    }
-}
-
-static void vert_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    uint64_t p8a = AV_RN64A(top + 0), p8b = AV_RN64A(top + 8);
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, p8a);
-        AV_WN64A(dst + 8, p8b);
-        dst += stride;
-    }
-}
-
-static void vert_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    uint64_t p8a = AV_RN64A(top + 0),  p8b = AV_RN64A(top + 8),
-             p8c = AV_RN64A(top + 16), p8d = AV_RN64A(top + 24);
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, p8a);
-        AV_WN64A(dst +  8, p8b);
-        AV_WN64A(dst + 16, p8c);
-        AV_WN64A(dst + 24, p8d);
-        dst += stride;
-    }
-}
-
-static void hor_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                      const uint8_t *left, const uint8_t *top)
-{
-    AV_WN32A(dst + stride * 0, left[0] * 0x01010101U);
-    AV_WN32A(dst + stride * 1, left[1] * 0x01010101U);
-    AV_WN32A(dst + stride * 2, left[2] * 0x01010101U);
-    AV_WN32A(dst + stride * 3, left[3] * 0x01010101U);
-}
-
-static void hor_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                      const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, left[y] * 0x0101010101010101ULL);
-        dst += stride;
-    }
-}
-
-static void hor_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                        const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        uint64_t p8 = left[y] * 0x0101010101010101ULL;
-
-        AV_WN64A(dst + 0, p8);
-        AV_WN64A(dst + 8, p8);
-        dst += stride;
-    }
-}
-
-static void hor_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                        const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        uint64_t p8 = left[y] * 0x0101010101010101ULL;
-
-        AV_WN64A(dst +  0, p8);
-        AV_WN64A(dst +  8, p8);
-        AV_WN64A(dst + 16, p8);
-        AV_WN64A(dst + 24, p8);
-        dst += stride;
-    }
-}
-
-static void tm_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                     const uint8_t *left, const uint8_t *top)
-{
-    int y, tl = top[-1];
-
-    for (y = 0; y < 4; y++) {
-        int l_m_tl = left[y] - tl;
-
-        dst[0] = av_clip_uint8(top[0] + l_m_tl);
-        dst[1] = av_clip_uint8(top[1] + l_m_tl);
-        dst[2] = av_clip_uint8(top[2] + l_m_tl);
-        dst[3] = av_clip_uint8(top[3] + l_m_tl);
-        dst   += stride;
-    }
-}
-
-static void tm_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                     const uint8_t *left, const uint8_t *top)
-{
-    int y, tl = top[-1];
-
-    for (y = 0; y < 8; y++) {
-        int l_m_tl = left[y] - tl;
-
-        dst[0] = av_clip_uint8(top[0] + l_m_tl);
-        dst[1] = av_clip_uint8(top[1] + l_m_tl);
-        dst[2] = av_clip_uint8(top[2] + l_m_tl);
-        dst[3] = av_clip_uint8(top[3] + l_m_tl);
-        dst[4] = av_clip_uint8(top[4] + l_m_tl);
-        dst[5] = av_clip_uint8(top[5] + l_m_tl);
-        dst[6] = av_clip_uint8(top[6] + l_m_tl);
-        dst[7] = av_clip_uint8(top[7] + l_m_tl);
-        dst   += stride;
-    }
-}
-
-static void tm_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    int y, tl = top[-1];
-
-    for (y = 0; y < 16; y++) {
-        int l_m_tl = left[y] - tl;
-
-        dst[0]  = av_clip_uint8(top[0]  + l_m_tl);
-        dst[1]  = av_clip_uint8(top[1]  + l_m_tl);
-        dst[2]  = av_clip_uint8(top[2]  + l_m_tl);
-        dst[3]  = av_clip_uint8(top[3]  + l_m_tl);
-        dst[4]  = av_clip_uint8(top[4]  + l_m_tl);
-        dst[5]  = av_clip_uint8(top[5]  + l_m_tl);
-        dst[6]  = av_clip_uint8(top[6]  + l_m_tl);
-        dst[7]  = av_clip_uint8(top[7]  + l_m_tl);
-        dst[8]  = av_clip_uint8(top[8]  + l_m_tl);
-        dst[9]  = av_clip_uint8(top[9]  + l_m_tl);
-        dst[10] = av_clip_uint8(top[10] + l_m_tl);
-        dst[11] = av_clip_uint8(top[11] + l_m_tl);
-        dst[12] = av_clip_uint8(top[12] + l_m_tl);
-        dst[13] = av_clip_uint8(top[13] + l_m_tl);
-        dst[14] = av_clip_uint8(top[14] + l_m_tl);
-        dst[15] = av_clip_uint8(top[15] + l_m_tl);
-        dst    += stride;
-    }
-}
-
-static void tm_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    int y, tl = top[-1];
-
-    for (y = 0; y < 32; y++) {
-        int l_m_tl = left[y] - tl;
-
-        dst[0]  = av_clip_uint8(top[0]  + l_m_tl);
-        dst[1]  = av_clip_uint8(top[1]  + l_m_tl);
-        dst[2]  = av_clip_uint8(top[2]  + l_m_tl);
-        dst[3]  = av_clip_uint8(top[3]  + l_m_tl);
-        dst[4]  = av_clip_uint8(top[4]  + l_m_tl);
-        dst[5]  = av_clip_uint8(top[5]  + l_m_tl);
-        dst[6]  = av_clip_uint8(top[6]  + l_m_tl);
-        dst[7]  = av_clip_uint8(top[7]  + l_m_tl);
-        dst[8]  = av_clip_uint8(top[8]  + l_m_tl);
-        dst[9]  = av_clip_uint8(top[9]  + l_m_tl);
-        dst[10] = av_clip_uint8(top[10] + l_m_tl);
-        dst[11] = av_clip_uint8(top[11] + l_m_tl);
-        dst[12] = av_clip_uint8(top[12] + l_m_tl);
-        dst[13] = av_clip_uint8(top[13] + l_m_tl);
-        dst[14] = av_clip_uint8(top[14] + l_m_tl);
-        dst[15] = av_clip_uint8(top[15] + l_m_tl);
-        dst[16] = av_clip_uint8(top[16] + l_m_tl);
-        dst[17] = av_clip_uint8(top[17] + l_m_tl);
-        dst[18] = av_clip_uint8(top[18] + l_m_tl);
-        dst[19] = av_clip_uint8(top[19] + l_m_tl);
-        dst[20] = av_clip_uint8(top[20] + l_m_tl);
-        dst[21] = av_clip_uint8(top[21] + l_m_tl);
-        dst[22] = av_clip_uint8(top[22] + l_m_tl);
-        dst[23] = av_clip_uint8(top[23] + l_m_tl);
-        dst[24] = av_clip_uint8(top[24] + l_m_tl);
-        dst[25] = av_clip_uint8(top[25] + l_m_tl);
-        dst[26] = av_clip_uint8(top[26] + l_m_tl);
-        dst[27] = av_clip_uint8(top[27] + l_m_tl);
-        dst[28] = av_clip_uint8(top[28] + l_m_tl);
-        dst[29] = av_clip_uint8(top[29] + l_m_tl);
-        dst[30] = av_clip_uint8(top[30] + l_m_tl);
-        dst[31] = av_clip_uint8(top[31] + l_m_tl);
-        dst    += stride;
-    }
-}
-
-static void dc_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                     const uint8_t *left, const uint8_t *top)
-{
-    unsigned dc = 0x01010101U *
-                  ((left[0] + left[1] + left[2] + left[3] +
-                    top[0]  + top[1]  + top[2]  + top[3]  + 4) >> 3);
-
-    AV_WN32A(dst + stride * 0, dc);
-    AV_WN32A(dst + stride * 1, dc);
-    AV_WN32A(dst + stride * 2, dc);
-    AV_WN32A(dst + stride * 3, dc);
-}
-
-static void dc_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                     const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0] + left[1] + left[2] + left[3] +
-                    left[4] + left[5] + left[6] + left[7] +
-                    top[0]  + top[1]  + top[2]  + top[3]  +
-                    top[4]  + top[5]  + top[6]  + top[7]  + 8) >> 4);
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, dc);
-        dst += stride;
-    }
-}
-
-static void dc_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0]  + left[1]  + left[2]  + left[3]  +
-                    left[4]  + left[5]  + left[6]  + left[7]  +
-                    left[8]  + left[9]  + left[10] + left[11] +
-                    left[12] + left[13] + left[14] + left[15] +
-                    top[0]   + top[1]   + top[2]   + top[3]   +
-                    top[4]   + top[5]   + top[6]   + top[7]   +
-                    top[8]   + top[9]   + top[10]  + top[11]  +
-                    top[12]  + top[13]  + top[14]  + top[15]  + 16) >> 5);
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, dc);
-        AV_WN64A(dst + 8, dc);
-        dst += stride;
-    }
-}
-
-static void dc_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0]  + left[1]  + left[2]  + left[3]  +
-                    left[4]  + left[5]  + left[6]  + left[7]  +
-                    left[8]  + left[9]  + left[10] + left[11] +
-                    left[12] + left[13] + left[14] + left[15] +
-                    left[16] + left[17] + left[18] + left[19] +
-                    left[20] + left[21] + left[22] + left[23] +
-                    left[24] + left[25] + left[26] + left[27] +
-                    left[28] + left[29] + left[30] + left[31] +
-                    top[0]   + top[1]   + top[2]   + top[3]   +
-                    top[4]   + top[5]   + top[6]   + top[7]   +
-                    top[8]   + top[9]   + top[10]  + top[11]  +
-                    top[12]  + top[13]  + top[14]  + top[15]  +
-                    top[16]  + top[17]  + top[18]  + top[19]  +
-                    top[20]  + top[21]  + top[22]  + top[23]  +
-                    top[24]  + top[25]  + top[26]  + top[27]  +
-                    top[28]  + top[29]  + top[30]  + top[31]  + 32) >> 6);
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, dc);
-        AV_WN64A(dst +  8, dc);
-        AV_WN64A(dst + 16, dc);
-        AV_WN64A(dst + 24, dc);
-        dst += stride;
-    }
-}
-
-static void dc_left_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                          const uint8_t *left, const uint8_t *top)
-{
-    unsigned dc = 0x01010101U *
-                  ((left[0] + left[1] + left[2] + left[3] + 2) >> 2);
-
-    AV_WN32A(dst + stride * 0, dc);
-    AV_WN32A(dst + stride * 1, dc);
-    AV_WN32A(dst + stride * 2, dc);
-    AV_WN32A(dst + stride * 3, dc);
-}
-
-static void dc_left_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                          const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0] + left[1] + left[2] + left[3] +
-                    left[4] + left[5] + left[6] + left[7] + 4) >> 3);
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, dc);
-        dst += stride;
-    }
-}
-
-static void dc_left_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0]  + left[1]  + left[2]  + left[3]  +
-                    left[4]  + left[5]  + left[6]  + left[7]  +
-                    left[8]  + left[9]  + left[10] + left[11] +
-                    left[12] + left[13] + left[14] + left[15] + 8) >> 4);
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, dc);
-        AV_WN64A(dst + 8, dc);
-        dst += stride;
-    }
-}
-
-static void dc_left_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0]  + left[1]  + left[2]  + left[3]  +
-                    left[4]  + left[5]  + left[6]  + left[7]  +
-                    left[8]  + left[9]  + left[10] + left[11] +
-                    left[12] + left[13] + left[14] + left[15] +
-                    left[16] + left[17] + left[18] + left[19] +
-                    left[20] + left[21] + left[22] + left[23] +
-                    left[24] + left[25] + left[26] + left[27] +
-                    left[28] + left[29] + left[30] + left[31] + 16) >> 5);
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, dc);
-        AV_WN64A(dst +  8, dc);
-        AV_WN64A(dst + 16, dc);
-        AV_WN64A(dst + 24, dc);
-        dst += stride;
-    }
-}
-
-static void dc_top_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    unsigned dc = 0x01010101U * ((top[0] + top[1] + top[2] + top[3] + 2) >> 2);
-
-    AV_WN32A(dst + stride * 0, dc);
-    AV_WN32A(dst + stride * 1, dc);
-    AV_WN32A(dst + stride * 2, dc);
-    AV_WN32A(dst + stride * 3, dc);
-}
-
-static void dc_top_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((top[0] + top[1] + top[2] + top[3] +
-                    top[4] + top[5] + top[6] + top[7] + 4) >> 3);
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, dc);
-        dst += stride;
-    }
-}
-
-static void dc_top_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((top[0]  + top[1]  + top[2]  + top[3]  +
-                    top[4]  + top[5]  + top[6]  + top[7]  +
-                    top[8]  + top[9]  + top[10] + top[11] +
-                    top[12] + top[13] + top[14] + top[15] + 8) >> 4);
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, dc);
-        AV_WN64A(dst + 8, dc);
-        dst += stride;
-    }
-}
-
-static void dc_top_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((top[0]  + top[1]  + top[2]  + top[3]  +
-                    top[4]  + top[5]  + top[6]  + top[7]  +
-                    top[8]  + top[9]  + top[10] + top[11] +
-                    top[12] + top[13] + top[14] + top[15] +
-                    top[16] + top[17] + top[18] + top[19] +
-                    top[20] + top[21] + top[22] + top[23] +
-                    top[24] + top[25] + top[26] + top[27] +
-                    top[28] + top[29] + top[30] + top[31] + 16) >> 5);
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, dc);
-        AV_WN64A(dst +  8, dc);
-        AV_WN64A(dst + 16, dc);
-        AV_WN64A(dst + 24, dc);
-        dst += stride;
-    }
-}
-
-static void dc_128_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    AV_WN32A(dst + stride * 0, 0x80808080U);
-    AV_WN32A(dst + stride * 1, 0x80808080U);
-    AV_WN32A(dst + stride * 2, 0x80808080U);
-    AV_WN32A(dst + stride * 3, 0x80808080U);
-}
-
-static void dc_128_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, 0x8080808080808080ULL);
-        dst += stride;
-    }
-}
-
-static void dc_128_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, 0x8080808080808080ULL);
-        AV_WN64A(dst + 8, 0x8080808080808080ULL);
-        dst += stride;
-    }
-}
-
-static void dc_128_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, 0x8080808080808080ULL);
-        AV_WN64A(dst +  8, 0x8080808080808080ULL);
-        AV_WN64A(dst + 16, 0x8080808080808080ULL);
-        AV_WN64A(dst + 24, 0x8080808080808080ULL);
-        dst += stride;
-    }
-}
-
-static void dc_127_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    AV_WN32A(dst + stride * 0, 0x7F7F7F7FU);
-    AV_WN32A(dst + stride * 1, 0x7F7F7F7FU);
-    AV_WN32A(dst + stride * 2, 0x7F7F7F7FU);
-    AV_WN32A(dst + stride * 3, 0x7F7F7F7FU);
-}
-
-static void dc_127_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, 0x7F7F7F7F7F7F7F7FULL);
-        dst += stride;
-    }
-}
-
-static void dc_127_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, 0x7F7F7F7F7F7F7F7FULL);
-        AV_WN64A(dst + 8, 0x7F7F7F7F7F7F7F7FULL);
-        dst += stride;
-    }
-}
-
-static void dc_127_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, 0x7F7F7F7F7F7F7F7FULL);
-        AV_WN64A(dst +  8, 0x7F7F7F7F7F7F7F7FULL);
-        AV_WN64A(dst + 16, 0x7F7F7F7F7F7F7F7FULL);
-        AV_WN64A(dst + 24, 0x7F7F7F7F7F7F7F7FULL);
-        dst += stride;
-    }
-}
-
-static void dc_129_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    AV_WN32A(dst + stride * 0, 0x81818181U);
-    AV_WN32A(dst + stride * 1, 0x81818181U);
-    AV_WN32A(dst + stride * 2, 0x81818181U);
-    AV_WN32A(dst + stride * 3, 0x81818181U);
-}
-
-static void dc_129_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, 0x8181818181818181ULL);
-        dst += stride;
-    }
-}
-
-static void dc_129_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, 0x8181818181818181ULL);
-        AV_WN64A(dst + 8, 0x8181818181818181ULL);
-        dst += stride;
-    }
-}
-
-static void dc_129_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, 0x8181818181818181ULL);
-        AV_WN64A(dst +  8, 0x8181818181818181ULL);
-        AV_WN64A(dst + 16, 0x8181818181818181ULL);
-        AV_WN64A(dst + 24, 0x8181818181818181ULL);
-        dst += stride;
-    }
-}
-
-#define DST(x, y) dst[(x) + (y) * stride]
-
-static void diag_downleft_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *left, const uint8_t *top)
-{
-    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
-        a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7];
-
-    DST(0, 0) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(1, 0) =
-    DST(0, 1) = (a1 + a2 * 2 + a3 + 2) >> 2;
-    DST(2, 0) =
-    DST(1, 1) =
-    DST(0, 2) = (a2 + a3 * 2 + a4 + 2) >> 2;
-    DST(3, 0) =
-    DST(2, 1) =
-    DST(1, 2) =
-    DST(0, 3) = (a3 + a4 * 2 + a5 + 2) >> 2;
-    DST(3, 1) =
-    DST(2, 2) =
-    DST(1, 3) = (a4 + a5 * 2 + a6 + 2) >> 2;
-    DST(3, 2) =
-    DST(2, 3) = (a5 + a6 * 2 + a7 + 2) >> 2;
-    DST(3, 3) = a7;  // note: this is different from vp8 and such
-}
-
-#define def_diag_downleft(size)                                             \
-static void diag_downleft_ ## size ## x ## size ## _c(uint8_t *dst,         \
-                                                      ptrdiff_t stride,     \
-                                                      const uint8_t *left,  \
-                                                      const uint8_t *top)   \
-{                                                                           \
-    int i, j;                                                               \
-    uint8_t v[size - 1];                                                    \
-                                                                            \
-    for (i = 0; i < size - 2; i++)                                          \
-        v[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2;             \
-    v[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2;             \
-                                                                            \
-    for (j = 0; j < size; j++) {                                            \
-        memcpy(dst + j * stride, v + j, size - 1 - j);                      \
-        memset(dst + j * stride + size - 1 - j, top[size - 1], j + 1);      \
-    }                                                                       \
-}
-
-def_diag_downleft(8)
-def_diag_downleft(16)
-def_diag_downleft(32)
-
-static void diag_downright_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *left, const uint8_t *top)
-{
-    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
-        l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
-
-    DST(0, 3) = (l1 + l2 * 2 + l3 + 2) >> 2;
-    DST(0, 2) =
-    DST(1, 3) = (l0 + l1 * 2 + l2 + 2) >> 2;
-    DST(0, 1) =
-    DST(1, 2) =
-    DST(2, 3) = (tl + l0 * 2 + l1 + 2) >> 2;
-    DST(0, 0) =
-    DST(1, 1) =
-    DST(2, 2) =
-    DST(3, 3) = (l0 + tl * 2 + a0 + 2) >> 2;
-    DST(1, 0) =
-    DST(2, 1) =
-    DST(3, 2) = (tl + a0 * 2 + a1 + 2) >> 2;
-    DST(2, 0) =
-    DST(3, 1) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(3, 0) = (a1 + a2 * 2 + a3 + 2) >> 2;
-}
-
-#define def_diag_downright(size)                                            \
-static void diag_downright_ ## size ## x ## size ## _c(uint8_t *dst,        \
-                                                       ptrdiff_t stride,    \
-                                                       const uint8_t *left, \
-                                                       const uint8_t *top)  \
-{                                                                           \
-    int i, j;                                                               \
-    uint8_t v[size + size - 1];                                             \
-                                                                            \
-    for (i = 0; i < size - 2; i++) {                                        \
-        v[i]            = (left[size - 1 - i] +                             \
-                           left[size - 2 - i] * 2 +                         \
-                           left[size - 3 - i] + 2) >> 2;                    \
-        v[size + 1 + i] = (top[i]             +                             \
-                           top[i + 1]         * 2 +                         \
-                           top[i + 2]         + 2) >> 2;                    \
-    }                                                                       \
-    v[size - 2] = (left[1] + left[0] * 2 + top[-1] + 2) >> 2;               \
-    v[size - 1] = (left[0] + top[-1] * 2 + top[0]  + 2) >> 2;               \
-    v[size]     = (top[-1] + top[0]  * 2 + top[1]  + 2) >> 2;               \
-                                                                            \
-    for (j = 0; j < size; j++)                                              \
-        memcpy(dst + j * stride, v + size - 1 - j, size);                   \
-}
-
-def_diag_downright(8)
-def_diag_downright(16)
-def_diag_downright(32)
-
-static void vert_right_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *left, const uint8_t *top)
-{
-    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
-        l0 = left[0], l1 = left[1], l2 = left[2];
-
-    DST(0, 3) = (l0 + l1 * 2 + l2 + 2) >> 2;
-    DST(0, 2) = (tl + l0 * 2 + l1 + 2) >> 2;
-    DST(0, 0) =
-    DST(1, 2) = (tl + a0          + 1) >> 1;
-    DST(0, 1) =
-    DST(1, 3) = (l0 + tl * 2 + a0 + 2) >> 2;
-    DST(1, 0) =
-    DST(2, 2) = (a0 + a1          + 1) >> 1;
-    DST(1, 1) =
-    DST(2, 3) = (tl + a0 * 2 + a1 + 2) >> 2;
-    DST(2, 0) =
-    DST(3, 2) = (a1 + a2          + 1) >> 1;
-    DST(2, 1) =
-    DST(3, 3) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(3, 0) = (a2 + a3          + 1) >> 1;
-    DST(3, 1) = (a1 + a2 * 2 + a3 + 2) >> 2;
-}
-
-#define def_vert_right(size)                                                \
-static void vert_right_ ## size ## x ## size ## _c(uint8_t *dst,            \
-                                                   ptrdiff_t stride,        \
-                                                   const uint8_t *left,     \
-                                                   const uint8_t *top)      \
-{                                                                           \
-    int i, j;                                                               \
-    uint8_t ve[size + size / 2 - 1], vo[size + size / 2 - 1];               \
-                                                                            \
-    for (i = 0; i < size / 2 - 2; i++) {                                    \
-        vo[i] = (left[size - 4 - i * 2] +                                   \
-                 left[size - 3 - i * 2] * 2 +                               \
-                 left[size - 2 - i * 2] + 2) >> 2;                          \
-        ve[i] = (left[size - 5 - i * 2] +                                   \
-                 left[size - 4 - i * 2] * 2 +                               \
-                 left[size - 3 - i * 2] + 2) >> 2;                          \
-    }                                                                       \
-    vo[size / 2 - 2] = (left[0] + left[1] * 2 + left[2] + 2) >> 2;          \
-    ve[size / 2 - 2] = (top[-1] + left[0] * 2 + left[1] + 2) >> 2;          \
-                                                                            \
-    ve[size / 2 - 1] = (top[-1] + top[0] + 1) >> 1;                         \
-    vo[size / 2 - 1] = (left[0] + top[-1] * 2 + top[0] + 2) >> 2;           \
-    for (i = 0; i < size - 1; i++) {                                        \
-        ve[size / 2 + i] = (top[i] + top[i + 1] + 1) >> 1;                  \
-        vo[size / 2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
-    }                                                                       \
-                                                                            \
-    for (j = 0; j < size / 2; j++) {                                        \
-        memcpy(dst +  j * 2      * stride, ve + size / 2 - 1 - j, size);    \
-        memcpy(dst + (j * 2 + 1) * stride, vo + size / 2 - 1 - j, size);    \
-    }                                                                       \
-}
-
-def_vert_right(8)
-def_vert_right(16)
-def_vert_right(32)
-
-static void hor_down_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3],
-        tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2];
-
-    DST(2, 0) = (tl + a0 * 2 + a1 + 2) >> 2;
-    DST(3, 0) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(0, 0) =
-    DST(2, 1) = (tl + l0          + 1) >> 1;
-    DST(1, 0) =
-    DST(3, 1) = (a0 + tl * 2 + l0 + 2) >> 2;
-    DST(0, 1) =
-    DST(2, 2) = (l0 + l1          + 1) >> 1;
-    DST(1, 1) =
-    DST(3, 2) = (tl + l0 * 2 + l1 + 2) >> 2;
-    DST(0, 2) =
-    DST(2, 3) = (l1 + l2          + 1) >> 1;
-    DST(1, 2) =
-    DST(3, 3) = (l0 + l1 * 2 + l2 + 2) >> 2;
-    DST(0, 3) = (l2 + l3          + 1) >> 1;
-    DST(1, 3) = (l1 + l2 * 2 + l3 + 2) >> 2;
-}
-
-#define def_hor_down(size)                                              \
-static void hor_down_ ## size ## x ## size ## _c(uint8_t *dst,          \
-                                                 ptrdiff_t stride,      \
-                                                 const uint8_t *left,   \
-                                                 const uint8_t *top)    \
-{                                                                       \
-    int i, j;                                                           \
-    uint8_t v[size * 3 - 2];                                            \
-                                                                        \
-    for (i = 0; i < size - 2; i++) {                                    \
-        v[i * 2]        = (left[size - 2 - i] +                         \
-                           left[size - 1 - i] + 1) >> 1;                \
-        v[i * 2    + 1] = (left[size - 3 - i] +                         \
-                           left[size - 2 - i] * 2 +                     \
-                           left[size - 1 - i] + 2) >> 2;                \
-        v[size * 2 + i] = (top[i - 1] +                                 \
-                           top[i] * 2 +                                 \
-                           top[i + 1] + 2) >> 2;                        \
-    }                                                                   \
-    v[size * 2 - 2] = (top[-1] + left[0] + 1) >> 1;                     \
-    v[size * 2 - 4] = (left[0] + left[1] + 1) >> 1;                     \
-    v[size * 2 - 1] = (top[0]  + top[-1] * 2 + left[0] + 2) >> 2;       \
-    v[size * 2 - 3] = (top[-1] + left[0] * 2 + left[1] + 2) >> 2;       \
-                                                                        \
-    for (j = 0; j < size; j++)                                          \
-        memcpy(dst + j * stride, v + size * 2 - 2 - j * 2, size);       \
-}
-
-def_hor_down(8)
-def_hor_down(16)
-def_hor_down(32)
-
-static void vert_left_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *left, const uint8_t *top)
-{
-    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
-        a4 = top[4], a5 = top[5], a6 = top[6];
-
-    DST(0, 0) = (a0 + a1          + 1) >> 1;
-    DST(0, 1) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(1, 0) =
-    DST(0, 2) = (a1 + a2          + 1) >> 1;
-    DST(1, 1) =
-    DST(0, 3) = (a1 + a2 * 2 + a3 + 2) >> 2;
-    DST(2, 0) =
-    DST(1, 2) = (a2 + a3          + 1) >> 1;
-    DST(2, 1) =
-    DST(1, 3) = (a2 + a3 * 2 + a4 + 2) >> 2;
-    DST(3, 0) =
-    DST(2, 2) = (a3 + a4          + 1) >> 1;
-    DST(3, 1) =
-    DST(2, 3) = (a3 + a4 * 2 + a5 + 2) >> 2;
-    DST(3, 2) = (a4 + a5          + 1) >> 1;
-    DST(3, 3) = (a4 + a5 * 2 + a6 + 2) >> 2;
-}
-
-#define def_vert_left(size)                                             \
-static void vert_left_ ## size ## x ## size ## _c(uint8_t *dst,         \
-                                                  ptrdiff_t stride,     \
-                                                  const uint8_t *left,  \
-                                                  const uint8_t *top)   \
-{                                                                       \
-    int i, j;                                                           \
-    uint8_t ve[size - 1], vo[size - 1];                                 \
-                                                                        \
-    for (i = 0; i < size - 2; i++) {                                    \
-        ve[i] = (top[i] + top[i + 1] + 1) >> 1;                         \
-        vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2;        \
-    }                                                                   \
-    ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1;            \
-    vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2;        \
-                                                                        \
-    for (j = 0; j < size / 2; j++) {                                    \
-        memcpy(dst +  j * 2      * stride, ve + j, size - (j + 1));     \
-        memset(dst +  j * 2      * stride + size - j - 1,               \
-               top[size - 1], j + 1);                                   \
-        memcpy(dst + (j * 2 + 1) * stride, vo + j, size - (j + 1));     \
-        memset(dst + (j * 2 + 1) * stride + size - j - 1,               \
-               top[size - 1], j + 1);                                   \
-    }                                                                   \
-}
-
-def_vert_left(8)
-def_vert_left(16)
-def_vert_left(32)
-
-static void hor_up_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
-
-    DST(0, 0) = (l0 + l1          + 1) >> 1;
-    DST(1, 0) = (l0 + l1 * 2 + l2 + 2) >> 2;
-    DST(0, 1) =
-    DST(2, 0) = (l1 + l2          + 1) >> 1;
-    DST(1, 1) =
-    DST(3, 0) = (l1 + l2 * 2 + l3 + 2) >> 2;
-    DST(0, 2) =
-    DST(2, 1) = (l2 + l3          + 1) >> 1;
-    DST(1, 2) =
-    DST(3, 1) = (l2 + l3 * 3      + 2) >> 2;
-    DST(0, 3) =
-    DST(1, 3) =
-    DST(2, 2) =
-    DST(2, 3) =
-    DST(3, 2) =
-    DST(3, 3) = l3;
-}
-
-#define def_hor_up(size)                                                    \
-static void hor_up_ ## size ## x ## size ## _c(uint8_t *dst,                \
-                                               ptrdiff_t stride,            \
-                                               const uint8_t *left,         \
-                                               const uint8_t *top)          \
-{                                                                           \
-    int i, j;                                                               \
-    uint8_t v[size * 2 - 2];                                                \
-                                                                            \
-    for (i = 0; i < size - 2; i++) {                                        \
-        v[i * 2]     = (left[i] + left[i + 1] + 1) >> 1;                    \
-        v[i * 2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2;  \
-    }                                                                       \
-    v[size * 2 - 4] = (left[size - 2] + left[size - 1]     + 1) >> 1;       \
-    v[size * 2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2;       \
-                                                                            \
-    for (j = 0; j < size / 2; j++)                                          \
-        memcpy(dst + j * stride, v + j * 2, size);                          \
-    for (j = size / 2; j < size; j++) {                                     \
-        memcpy(dst + j * stride, v + j * 2, size * 2 - 2 - j * 2);          \
-        memset(dst + j * stride + size * 2 - 2 - j * 2, left[size - 1],     \
-               2 + j * 2 - size);                                           \
-    }                                                                       \
-}
-
-def_hor_up(8)
-def_hor_up(16)
-def_hor_up(32)
-
-#undef DST
-
-static av_cold void vp9dsp_intrapred_init(VP9DSPContext *dsp)
-{
-#define init_intra_pred(tx, sz)                                              \
-    dsp->intra_pred[tx][VERT_PRED]            = vert_           ## sz ## _c; \
-    dsp->intra_pred[tx][HOR_PRED]             = hor_            ## sz ## _c; \
-    dsp->intra_pred[tx][DC_PRED]              = dc_             ## sz ## _c; \
-    dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED]  = diag_downleft_  ## sz ## _c; \
-    dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_ ## sz ## _c; \
-    dsp->intra_pred[tx][VERT_RIGHT_PRED]      = vert_right_     ## sz ## _c; \
-    dsp->intra_pred[tx][HOR_DOWN_PRED]        = hor_down_       ## sz ## _c; \
-    dsp->intra_pred[tx][VERT_LEFT_PRED]       = vert_left_      ## sz ## _c; \
-    dsp->intra_pred[tx][HOR_UP_PRED]          = hor_up_         ## sz ## _c; \
-    dsp->intra_pred[tx][TM_VP8_PRED]          = tm_             ## sz ## _c; \
-    dsp->intra_pred[tx][LEFT_DC_PRED]         = dc_left_        ## sz ## _c; \
-    dsp->intra_pred[tx][TOP_DC_PRED]          = dc_top_         ## sz ## _c; \
-    dsp->intra_pred[tx][DC_128_PRED]          = dc_128_         ## sz ## _c; \
-    dsp->intra_pred[tx][DC_127_PRED]          = dc_127_         ## sz ## _c; \
-    dsp->intra_pred[tx][DC_129_PRED]          = dc_129_         ## sz ## _c
-
-    init_intra_pred(TX_4X4,   4x4);
-    init_intra_pred(TX_8X8,   8x8);
-    init_intra_pred(TX_16X16, 16x16);
-    init_intra_pred(TX_32X32, 32x32);
-
-#undef init_intra_pred
-}
-
-#define itxfm_wrapper(type_a, type_b, sz, bits)                             \
-static void                                                                 \
-type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
-                                                      ptrdiff_t stride,     \
-                                                      int16_t *block,       \
-                                                      int eob)              \
-{                                                                           \
-    int i, j;                                                               \
-    int16_t tmp[sz * sz], out[sz];                                          \
-    for (i = 0; i < sz; i++)                                                \
-        type_a ## sz ## _1d(tmp + i * sz, block + i, sz, 0);                \
-    memset(block, 0, sz * sz * sizeof(*block));                             \
-    for (i = 0; i < sz; i++) {                                              \
-        type_b ## sz ## _1d(out, tmp + i, sz, 1);                           \
-        for (j = 0; j < sz; j++)                                            \
-            dst[j * stride] =                                               \
-                av_clip_uint8(dst[j * stride] +                             \
-                              (bits ? (out[j] + (1 << (bits - 1))) >> bits  \
-                                    : out[j]));                             \
-        dst++;                                                              \
-    }                                                                       \
-}
-
-#define itxfm_wrap(sz, bits)             \
-    itxfm_wrapper(idct, idct, sz, bits)  \
-    itxfm_wrapper(iadst, idct, sz, bits) \
-    itxfm_wrapper(idct, iadst, sz, bits) \
-    itxfm_wrapper(iadst, iadst, sz, bits)
-
-#define IN(x) in[x * stride]
-
-static av_always_inline void idct4_1d(int16_t *out, const int16_t *in,
-                                      ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3;
-
-    t0 = ((IN(0)        + IN(2)) * 11585 + (1 << 13)) >> 14;
-    t1 = ((IN(0)        - IN(2)) * 11585 + (1 << 13)) >> 14;
-    t2 = (IN(1) *  6270 - IN(3)  * 15137 + (1 << 13)) >> 14;
-    t3 = (IN(1) * 15137 + IN(3)  *  6270 + (1 << 13)) >> 14;
-
-    out[0] = t0 + t3;
-    out[1] = t1 + t2;
-    out[2] = t1 - t2;
-    out[3] = t0 - t3;
-}
-
-static av_always_inline void iadst4_1d(int16_t *out, const int16_t *in,
-                                       ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3;
-
-    t0 =  5283 * IN(0) + 15212 * IN(2) +  9929 * IN(3);
-    t1 =  9929 * IN(0) -  5283 * IN(2) - 15212 * IN(3);
-    t2 = 13377 * (IN(0) - IN(2) + IN(3));
-    t3 = 13377 * IN(1);
-
-    out[0] = (t0 + t3      + (1 << 13)) >> 14;
-    out[1] = (t1 + t3      + (1 << 13)) >> 14;
-    out[2] = (t2           + (1 << 13)) >> 14;
-    out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14;
-}
-
-itxfm_wrap(4, 4)
-
-static av_always_inline void idct8_1d(int16_t *out, const int16_t *in,
-                                      ptrdiff_t stride, int pass)
-{
-    int t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
-
-    t0a = ((IN(0)        + IN(4)) * 11585 + (1 << 13)) >> 14;
-    t1a = ((IN(0)        - IN(4)) * 11585 + (1 << 13)) >> 14;
-    t2a = (IN(2) *  6270 - IN(6)  * 15137 + (1 << 13)) >> 14;
-    t3a = (IN(2) * 15137 + IN(6)  *  6270 + (1 << 13)) >> 14;
-    t4a = (IN(1) *  3196 - IN(7)  * 16069 + (1 << 13)) >> 14;
-    t5a = (IN(5) * 13623 - IN(3)  *  9102 + (1 << 13)) >> 14;
-    t6a = (IN(5) *  9102 + IN(3)  * 13623 + (1 << 13)) >> 14;
-    t7a = (IN(1) * 16069 + IN(7)  *  3196 + (1 << 13)) >> 14;
-
-    t0  = t0a + t3a;
-    t1  = t1a + t2a;
-    t2  = t1a - t2a;
-    t3  = t0a - t3a;
-    t4  = t4a + t5a;
-    t5a = t4a - t5a;
-    t7  = t7a + t6a;
-    t6a = t7a - t6a;
-
-    t5  = ((t6a - t5a) * 11585 + (1 << 13)) >> 14;
-    t6  = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;
-
-    out[0] = t0 + t7;
-    out[1] = t1 + t6;
-    out[2] = t2 + t5;
-    out[3] = t3 + t4;
-    out[4] = t3 - t4;
-    out[5] = t2 - t5;
-    out[6] = t1 - t6;
-    out[7] = t0 - t7;
-}
-
-static av_always_inline void iadst8_1d(int16_t *out, const int16_t *in,
-                                       ptrdiff_t stride, int pass)
-{
-    int t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
-
-    t0a = 16305 * IN(7) +  1606 * IN(0);
-    t1a =  1606 * IN(7) - 16305 * IN(0);
-    t2a = 14449 * IN(5) +  7723 * IN(2);
-    t3a =  7723 * IN(5) - 14449 * IN(2);
-    t4a = 10394 * IN(3) + 12665 * IN(4);
-    t5a = 12665 * IN(3) - 10394 * IN(4);
-    t6a =  4756 * IN(1) + 15679 * IN(6);
-    t7a = 15679 * IN(1) -  4756 * IN(6);
-
-    t0  = (t0a + t4a + (1 << 13)) >> 14;
-    t1  = (t1a + t5a + (1 << 13)) >> 14;
-    t2  = (t2a + t6a + (1 << 13)) >> 14;
-    t3  = (t3a + t7a + (1 << 13)) >> 14;
-    t4  = (t0a - t4a + (1 << 13)) >> 14;
-    t5  = (t1a - t5a + (1 << 13)) >> 14;
-    t6  = (t2a - t6a + (1 << 13)) >> 14;
-    t7  = (t3a - t7a + (1 << 13)) >> 14;
-
-    t4a = 15137 * t4 +  6270 * t5;
-    t5a =  6270 * t4 - 15137 * t5;
-    t6a = 15137 * t7 -  6270 * t6;
-    t7a =  6270 * t7 + 15137 * t6;
-
-    out[0] =   t0 + t2;
-    out[7] = -(t1 + t3);
-    t2     =   t0 - t2;
-    t3     =   t1 - t3;
-
-    out[1] = -((t4a + t6a + (1 << 13)) >> 14);
-    out[6] =   (t5a + t7a + (1 << 13)) >> 14;
-    t6     =   (t4a - t6a + (1 << 13)) >> 14;
-    t7     =   (t5a - t7a + (1 << 13)) >> 14;
-
-    out[3] = -(((t2 + t3) * 11585 + (1 << 13)) >> 14);
-    out[4] =   ((t2 - t3) * 11585 + (1 << 13)) >> 14;
-    out[2] =   ((t6 + t7) * 11585 + (1 << 13)) >> 14;
-    out[5] = -(((t6 - t7) * 11585 + (1 << 13)) >> 14);
-}
-
-itxfm_wrap(8, 5)
-
-static av_always_inline void idct16_1d(int16_t *out, const int16_t *in,
-                                       ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
-    int t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
-    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
-
-    t0a  = ((IN(0)         + IN(8)) * 11585 + (1 << 13)) >> 14;
-    t1a  = ((IN(0)         - IN(8)) * 11585 + (1 << 13)) >> 14;
-    t2a  = (IN(4)  *  6270 - IN(12) * 15137 + (1 << 13)) >> 14;
-    t3a  = (IN(4)  * 15137 + IN(12) *  6270 + (1 << 13)) >> 14;
-    t4a  = (IN(2)  *  3196 - IN(14) * 16069 + (1 << 13)) >> 14;
-    t7a  = (IN(2)  * 16069 + IN(14) *  3196 + (1 << 13)) >> 14;
-    t5a  = (IN(10) * 13623 - IN(6)  *  9102 + (1 << 13)) >> 14;
-    t6a  = (IN(10) *  9102 + IN(6)  * 13623 + (1 << 13)) >> 14;
-    t8a  = (IN(1)  *  1606 - IN(15) * 16305 + (1 << 13)) >> 14;
-    t15a = (IN(1)  * 16305 + IN(15) *  1606 + (1 << 13)) >> 14;
-    t9a  = (IN(9)  * 12665 - IN(7)  * 10394 + (1 << 13)) >> 14;
-    t14a = (IN(9)  * 10394 + IN(7)  * 12665 + (1 << 13)) >> 14;
-    t10a = (IN(5)  *  7723 - IN(11) * 14449 + (1 << 13)) >> 14;
-    t13a = (IN(5)  * 14449 + IN(11) *  7723 + (1 << 13)) >> 14;
-    t11a = (IN(13) * 15679 - IN(3)  *  4756 + (1 << 13)) >> 14;
-    t12a = (IN(13) *  4756 + IN(3)  * 15679 + (1 << 13)) >> 14;
-
-    t0   = t0a  + t3a;
-    t1   = t1a  + t2a;
-    t2   = t1a  - t2a;
-    t3   = t0a  - t3a;
-    t4   = t4a  + t5a;
-    t5   = t4a  - t5a;
-    t6   = t7a  - t6a;
-    t7   = t7a  + t6a;
-    t8   = t8a  + t9a;
-    t9   = t8a  - t9a;
-    t10  = t11a - t10a;
-    t11  = t11a + t10a;
-    t12  = t12a + t13a;
-    t13  = t12a - t13a;
-    t14  = t15a - t14a;
-    t15  = t15a + t14a;
-
-    t5a  =   ((t6         - t5) * 11585  + (1 << 13)) >> 14;
-    t6a  =   ((t6         + t5) * 11585  + (1 << 13)) >> 14;
-    t9a  =   (t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
-    t14a =   (t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
-    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
-    t13a =   (t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
-
-    t0a  = t0   + t7;
-    t1a  = t1   + t6a;
-    t2a  = t2   + t5a;
-    t3a  = t3   + t4;
-    t4   = t3   - t4;
-    t5   = t2   - t5a;
-    t6   = t1   - t6a;
-    t7   = t0   - t7;
-    t8a  = t8   + t11;
-    t9   = t9a  + t10a;
-    t10  = t9a  - t10a;
-    t11a = t8   - t11;
-    t12a = t15  - t12;
-    t13  = t14a - t13a;
-    t14  = t14a + t13a;
-    t15a = t15  + t12;
-
-    t10a = ((t13  - t10)  * 11585 + (1 << 13)) >> 14;
-    t13a = ((t13  + t10)  * 11585 + (1 << 13)) >> 14;
-    t11  = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
-    t12  = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;
-
-    out[0]  = t0a + t15a;
-    out[1]  = t1a + t14;
-    out[2]  = t2a + t13a;
-    out[3]  = t3a + t12;
-    out[4]  = t4  + t11;
-    out[5]  = t5  + t10a;
-    out[6]  = t6  + t9;
-    out[7]  = t7  + t8a;
-    out[8]  = t7  - t8a;
-    out[9]  = t6  - t9;
-    out[10] = t5  - t10a;
-    out[11] = t4  - t11;
-    out[12] = t3a - t12;
-    out[13] = t2a - t13a;
-    out[14] = t1a - t14;
-    out[15] = t0a - t15a;
-}
-
-static av_always_inline void iadst16_1d(int16_t *out, const int16_t *in,
-                                        ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
-    int t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
-    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
-
-    t0  = IN(15) * 16364 + IN(0)  *   804;
-    t1  = IN(15) *   804 - IN(0)  * 16364;
-    t2  = IN(13) * 15893 + IN(2)  *  3981;
-    t3  = IN(13) *  3981 - IN(2)  * 15893;
-    t4  = IN(11) * 14811 + IN(4)  *  7005;
-    t5  = IN(11) *  7005 - IN(4)  * 14811;
-    t6  = IN(9)  * 13160 + IN(6)  *  9760;
-    t7  = IN(9)  *  9760 - IN(6)  * 13160;
-    t8  = IN(7)  * 11003 + IN(8)  * 12140;
-    t9  = IN(7)  * 12140 - IN(8)  * 11003;
-    t10 = IN(5)  *  8423 + IN(10) * 14053;
-    t11 = IN(5)  * 14053 - IN(10) *  8423;
-    t12 = IN(3)  *  5520 + IN(12) * 15426;
-    t13 = IN(3)  * 15426 - IN(12) *  5520;
-    t14 = IN(1)  *  2404 + IN(14) * 16207;
-    t15 = IN(1)  * 16207 - IN(14) *  2404;
-
-    t0a  = (t0 + t8  + (1 << 13)) >> 14;
-    t1a  = (t1 + t9  + (1 << 13)) >> 14;
-    t2a  = (t2 + t10 + (1 << 13)) >> 14;
-    t3a  = (t3 + t11 + (1 << 13)) >> 14;
-    t4a  = (t4 + t12 + (1 << 13)) >> 14;
-    t5a  = (t5 + t13 + (1 << 13)) >> 14;
-    t6a  = (t6 + t14 + (1 << 13)) >> 14;
-    t7a  = (t7 + t15 + (1 << 13)) >> 14;
-    t8a  = (t0 - t8  + (1 << 13)) >> 14;
-    t9a  = (t1 - t9  + (1 << 13)) >> 14;
-    t10a = (t2 - t10 + (1 << 13)) >> 14;
-    t11a = (t3 - t11 + (1 << 13)) >> 14;
-    t12a = (t4 - t12 + (1 << 13)) >> 14;
-    t13a = (t5 - t13 + (1 << 13)) >> 14;
-    t14a = (t6 - t14 + (1 << 13)) >> 14;
-    t15a = (t7 - t15 + (1 << 13)) >> 14;
-
-    t8   = t8a  * 16069 + t9a  *  3196;
-    t9   = t8a  *  3196 - t9a  * 16069;
-    t10  = t10a *  9102 + t11a * 13623;
-    t11  = t10a * 13623 - t11a *  9102;
-    t12  = t13a * 16069 - t12a *  3196;
-    t13  = t13a *  3196 + t12a * 16069;
-    t14  = t15a *  9102 - t14a * 13623;
-    t15  = t15a * 13623 + t14a *  9102;
-
-    t0   = t0a  + t4a;
-    t1   = t1a  + t5a;
-    t2   = t2a  + t6a;
-    t3   = t3a  + t7a;
-    t4   = t0a  - t4a;
-    t5   = t1a  - t5a;
-    t6   = t2a  - t6a;
-    t7   = t3a  - t7a;
-    t8a  = (t8  + t12 + (1 << 13)) >> 14;
-    t9a  = (t9  + t13 + (1 << 13)) >> 14;
-    t10a = (t10 + t14 + (1 << 13)) >> 14;
-    t11a = (t11 + t15 + (1 << 13)) >> 14;
-    t12a = (t8  - t12 + (1 << 13)) >> 14;
-    t13a = (t9  - t13 + (1 << 13)) >> 14;
-    t14a = (t10 - t14 + (1 << 13)) >> 14;
-    t15a = (t11 - t15 + (1 << 13)) >> 14;
-
-    t4a  = t4   * 15137 + t5   *  6270;
-    t5a  = t4   *  6270 - t5   * 15137;
-    t6a  = t7   * 15137 - t6   *  6270;
-    t7a  = t7   *  6270 + t6   * 15137;
-    t12  = t12a * 15137 + t13a *  6270;
-    t13  = t12a *  6270 - t13a * 15137;
-    t14  = t15a * 15137 - t14a *  6270;
-    t15  = t15a *  6270 + t14a * 15137;
-
-    out[0]  =     t0 + t2;
-    out[15] =   -(t1 + t3);
-    t2a     =     t0 - t2;
-    t3a     =     t1 - t3;
-    out[3]  = -((t4a + t6a + (1 << 13)) >> 14);
-    out[12] =   (t5a + t7a + (1 << 13)) >> 14;
-    t6      =   (t4a - t6a + (1 << 13)) >> 14;
-    t7      =   (t5a - t7a + (1 << 13)) >> 14;
-    out[1]  =  -(t8a + t10a);
-    out[14] =    t9a + t11a;
-    t10     =    t8a - t10a;
-    t11     =    t9a - t11a;
-    out[2]  =   (t12 + t14 + (1 << 13)) >> 14;
-    out[13] = -((t13 + t15 + (1 << 13)) >> 14);
-    t14a    =   (t12 - t14 + (1 << 13)) >> 14;
-    t15a    =   (t13 - t15 + (1 << 13)) >> 14;
-
-    out[7]  = ((t2a  + t3a)  * -11585 + (1 << 13)) >> 14;
-    out[8]  = ((t2a  - t3a)  *  11585 + (1 << 13)) >> 14;
-    out[4]  = ((t7   + t6)   *  11585 + (1 << 13)) >> 14;
-    out[11] = ((t7   - t6)   *  11585 + (1 << 13)) >> 14;
-    out[6]  = ((t11  + t10)  *  11585 + (1 << 13)) >> 14;
-    out[9]  = ((t11  - t10)  *  11585 + (1 << 13)) >> 14;
-    out[5]  = ((t14a + t15a) * -11585 + (1 << 13)) >> 14;
-    out[10] = ((t14a - t15a) *  11585 + (1 << 13)) >> 14;
-}
-
-itxfm_wrap(16, 6)
-
-static av_always_inline void idct32_1d(int16_t *out, const int16_t *in,
-                                       ptrdiff_t stride, int pass)
+av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact)
 {
-    int t0a  = ((IN(0)         + IN(16)) * 11585 + (1 << 13)) >> 14;
-    int t1a  = ((IN(0)         - IN(16)) * 11585 + (1 << 13)) >> 14;
-    int t2a  = (IN(8)  *  6270 - IN(24)  * 15137 + (1 << 13)) >> 14;
-    int t3a  = (IN(8)  * 15137 + IN(24)  *  6270 + (1 << 13)) >> 14;
-    int t4a  = (IN(4)  *  3196 - IN(28)  * 16069 + (1 << 13)) >> 14;
-    int t7a  = (IN(4)  * 16069 + IN(28)  *  3196 + (1 << 13)) >> 14;
-    int t5a  = (IN(20) * 13623 - IN(12)  *  9102 + (1 << 13)) >> 14;
-    int t6a  = (IN(20) *  9102 + IN(12)  * 13623 + (1 << 13)) >> 14;
-    int t8a  = (IN(2)  *  1606 - IN(30)  * 16305 + (1 << 13)) >> 14;
-    int t15a = (IN(2)  * 16305 + IN(30)  *  1606 + (1 << 13)) >> 14;
-    int t9a  = (IN(18) * 12665 - IN(14)  * 10394 + (1 << 13)) >> 14;
-    int t14a = (IN(18) * 10394 + IN(14)  * 12665 + (1 << 13)) >> 14;
-    int t10a = (IN(10) *  7723 - IN(22)  * 14449 + (1 << 13)) >> 14;
-    int t13a = (IN(10) * 14449 + IN(22)  *  7723 + (1 << 13)) >> 14;
-    int t11a = (IN(26) * 15679 - IN(6)   *  4756 + (1 << 13)) >> 14;
-    int t12a = (IN(26) *  4756 + IN(6)   * 15679 + (1 << 13)) >> 14;
-    int t16a = (IN(1)  *   804 - IN(31)  * 16364 + (1 << 13)) >> 14;
-    int t31a = (IN(1)  * 16364 + IN(31)  *   804 + (1 << 13)) >> 14;
-    int t17a = (IN(17) * 12140 - IN(15)  * 11003 + (1 << 13)) >> 14;
-    int t30a = (IN(17) * 11003 + IN(15)  * 12140 + (1 << 13)) >> 14;
-    int t18a = (IN(9)  *  7005 - IN(23)  * 14811 + (1 << 13)) >> 14;
-    int t29a = (IN(9)  * 14811 + IN(23)  *  7005 + (1 << 13)) >> 14;
-    int t19a = (IN(25) * 15426 - IN(7)   *  5520 + (1 << 13)) >> 14;
-    int t28a = (IN(25) *  5520 + IN(7)   * 15426 + (1 << 13)) >> 14;
-    int t20a = (IN(5)  *  3981 - IN(27)  * 15893 + (1 << 13)) >> 14;
-    int t27a = (IN(5)  * 15893 + IN(27)  *  3981 + (1 << 13)) >> 14;
-    int t21a = (IN(21) * 14053 - IN(11)  *  8423 + (1 << 13)) >> 14;
-    int t26a = (IN(21) *  8423 + IN(11)  * 14053 + (1 << 13)) >> 14;
-    int t22a = (IN(13) *  9760 - IN(19)  * 13160 + (1 << 13)) >> 14;
-    int t25a = (IN(13) * 13160 + IN(19)  *  9760 + (1 << 13)) >> 14;
-    int t23a = (IN(29) * 16207 - IN(3)   *  2404 + (1 << 13)) >> 14;
-    int t24a = (IN(29) *  2404 + IN(3)   * 16207 + (1 << 13)) >> 14;
-
-    int t0  = t0a  + t3a;
-    int t1  = t1a  + t2a;
-    int t2  = t1a  - t2a;
-    int t3  = t0a  - t3a;
-    int t4  = t4a  + t5a;
-    int t5  = t4a  - t5a;
-    int t6  = t7a  - t6a;
-    int t7  = t7a  + t6a;
-    int t8  = t8a  + t9a;
-    int t9  = t8a  - t9a;
-    int t10 = t11a - t10a;
-    int t11 = t11a + t10a;
-    int t12 = t12a + t13a;
-    int t13 = t12a - t13a;
-    int t14 = t15a - t14a;
-    int t15 = t15a + t14a;
-    int t16 = t16a + t17a;
-    int t17 = t16a - t17a;
-    int t18 = t19a - t18a;
-    int t19 = t19a + t18a;
-    int t20 = t20a + t21a;
-    int t21 = t20a - t21a;
-    int t22 = t23a - t22a;
-    int t23 = t23a + t22a;
-    int t24 = t24a + t25a;
-    int t25 = t24a - t25a;
-    int t26 = t27a - t26a;
-    int t27 = t27a + t26a;
-    int t28 = t28a + t29a;
-    int t29 = t28a - t29a;
-    int t30 = t31a - t30a;
-    int t31 = t31a + t30a;
-
-    t5a  =   ((t6         - t5) * 11585  + (1 << 13)) >> 14;
-    t6a  =   ((t6         + t5) * 11585  + (1 << 13)) >> 14;
-    t9a  =   (t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
-    t14a =   (t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
-    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
-    t13a =   (t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
-    t17a =   (t30 *  3196 - t17 * 16069  + (1 << 13)) >> 14;
-    t30a =   (t30 * 16069 + t17 *  3196  + (1 << 13)) >> 14;
-    t18a = (-(t29 * 16069 + t18 *  3196) + (1 << 13)) >> 14;
-    t29a =   (t29 *  3196 - t18 * 16069  + (1 << 13)) >> 14;
-    t21a =   (t26 * 13623 - t21 *  9102  + (1 << 13)) >> 14;
-    t26a =   (t26 *  9102 + t21 * 13623  + (1 << 13)) >> 14;
-    t22a = (-(t25 *  9102 + t22 * 13623) + (1 << 13)) >> 14;
-    t25a =   (t25 * 13623 - t22 *  9102  + (1 << 13)) >> 14;
-
-    t0a  = t0   + t7;
-    t1a  = t1   + t6a;
-    t2a  = t2   + t5a;
-    t3a  = t3   + t4;
-    t4a  = t3   - t4;
-    t5   = t2   - t5a;
-    t6   = t1   - t6a;
-    t7a  = t0   - t7;
-    t8a  = t8   + t11;
-    t9   = t9a  + t10a;
-    t10  = t9a  - t10a;
-    t11a = t8   - t11;
-    t12a = t15  - t12;
-    t13  = t14a - t13a;
-    t14  = t14a + t13a;
-    t15a = t15  + t12;
-    t16a = t16  + t19;
-    t17  = t17a + t18a;
-    t18  = t17a - t18a;
-    t19a = t16  - t19;
-    t20a = t23  - t20;
-    t21  = t22a - t21a;
-    t22  = t22a + t21a;
-    t23a = t23  + t20;
-    t24a = t24  + t27;
-    t25  = t25a + t26a;
-    t26  = t25a - t26a;
-    t27a = t24  - t27;
-    t28a = t31  - t28;
-    t29  = t30a - t29a;
-    t30  = t30a + t29a;
-    t31a = t31  + t28;
-
-    t10a = ((t13           - t10)  * 11585  + (1 << 13)) >> 14;
-    t13a = ((t13           + t10)  * 11585  + (1 << 13)) >> 14;
-    t11  = ((t12a          - t11a) * 11585  + (1 << 13)) >> 14;
-    t12  = ((t12a          + t11a) * 11585  + (1 << 13)) >> 14;
-    t18a =   (t29  *  6270 - t18   * 15137  + (1 << 13)) >> 14;
-    t29a =   (t29  * 15137 + t18   *  6270  + (1 << 13)) >> 14;
-    t19  =   (t28a *  6270 - t19a  * 15137  + (1 << 13)) >> 14;
-    t28  =   (t28a * 15137 + t19a  *  6270  + (1 << 13)) >> 14;
-    t20  = (-(t27a * 15137 + t20a  *  6270) + (1 << 13)) >> 14;
-    t27  =   (t27a *  6270 - t20a  * 15137  + (1 << 13)) >> 14;
-    t21a = (-(t26  * 15137 + t21   *  6270) + (1 << 13)) >> 14;
-    t26a =   (t26  *  6270 - t21   * 15137  + (1 << 13)) >> 14;
-
-    t0   = t0a  + t15a;
-    t1   = t1a  + t14;
-    t2   = t2a  + t13a;
-    t3   = t3a  + t12;
-    t4   = t4a  + t11;
-    t5a  = t5   + t10a;
-    t6a  = t6   + t9;
-    t7   = t7a  + t8a;
-    t8   = t7a  - t8a;
-    t9a  = t6   - t9;
-    t10  = t5   - t10a;
-    t11a = t4a  - t11;
-    t12a = t3a  - t12;
-    t13  = t2a  - t13a;
-    t14a = t1a  - t14;
-    t15  = t0a  - t15a;
-    t16  = t16a + t23a;
-    t17a = t17  + t22;
-    t18  = t18a + t21a;
-    t19a = t19  + t20;
-    t20a = t19  - t20;
-    t21  = t18a - t21a;
-    t22a = t17  - t22;
-    t23  = t16a - t23a;
-    t24  = t31a - t24a;
-    t25a = t30  - t25;
-    t26  = t29a - t26a;
-    t27a = t28  - t27;
-    t28a = t28  + t27;
-    t29  = t29a + t26a;
-    t30a = t30  + t25;
-    t31  = t31a + t24a;
-
-    t20  = ((t27a - t20a) * 11585 + (1 << 13)) >> 14;
-    t27  = ((t27a + t20a) * 11585 + (1 << 13)) >> 14;
-    t21a = ((t26  - t21)  * 11585 + (1 << 13)) >> 14;
-    t26a = ((t26  + t21)  * 11585 + (1 << 13)) >> 14;
-    t22  = ((t25a - t22a) * 11585 + (1 << 13)) >> 14;
-    t25  = ((t25a + t22a) * 11585 + (1 << 13)) >> 14;
-    t23a = ((t24  - t23)  * 11585 + (1 << 13)) >> 14;
-    t24a = ((t24  + t23)  * 11585 + (1 << 13)) >> 14;
-
-    out[0]  = t0   + t31;
-    out[1]  = t1   + t30a;
-    out[2]  = t2   + t29;
-    out[3]  = t3   + t28a;
-    out[4]  = t4   + t27;
-    out[5]  = t5a  + t26a;
-    out[6]  = t6a  + t25;
-    out[7]  = t7   + t24a;
-    out[8]  = t8   + t23a;
-    out[9]  = t9a  + t22;
-    out[10] = t10  + t21a;
-    out[11] = t11a + t20;
-    out[12] = t12a + t19a;
-    out[13] = t13  + t18;
-    out[14] = t14a + t17a;
-    out[15] = t15  + t16;
-    out[16] = t15  - t16;
-    out[17] = t14a - t17a;
-    out[18] = t13  - t18;
-    out[19] = t12a - t19a;
-    out[20] = t11a - t20;
-    out[21] = t10  - t21a;
-    out[22] = t9a  - t22;
-    out[23] = t8   - t23a;
-    out[24] = t7   - t24a;
-    out[25] = t6a  - t25;
-    out[26] = t5a  - t26a;
-    out[27] = t4   - t27;
-    out[28] = t3   - t28a;
-    out[29] = t2   - t29;
-    out[30] = t1   - t30a;
-    out[31] = t0   - t31;
-}
-
-itxfm_wrapper(idct, idct, 32, 6)
-
-static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in,
-                                      ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3, t4;
-
-    if (pass == 0) {
-        t0 = IN(0) >> 2;
-        t1 = IN(3) >> 2;
-        t2 = IN(1) >> 2;
-        t3 = IN(2) >> 2;
+    if (bpp == 8) {
+        ff_vp9dsp_init_8(dsp);
+    } else if (bpp == 10) {
+        ff_vp9dsp_init_10(dsp);
     } else {
-        t0 = IN(0);
-        t1 = IN(3);
-        t2 = IN(1);
-        t3 = IN(2);
+        av_assert0(bpp == 12);
+        ff_vp9dsp_init_12(dsp);
     }
 
-    t0 += t2;
-    t3 -= t1;
-    t4 = (t0 - t3) >> 1;
-    t1 = t4 - t1;
-    t2 = t4 - t2;
-    t0 -= t1;
-    t3 += t2;
-
-    out[0] = t0;
-    out[1] = t1;
-    out[2] = t2;
-    out[3] = t3;
-}
-
-itxfm_wrapper(iwht, iwht, 4, 0)
-
-#undef IN
-#undef itxfm_wrapper
-#undef itxfm_wrap
-
-static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
-{
-#define init_itxfm(tx, sz)                                        \
-    dsp->itxfm_add[tx][DCT_DCT]   = idct_idct_   ## sz ## _add_c; \
-    dsp->itxfm_add[tx][DCT_ADST]  = iadst_idct_  ## sz ## _add_c; \
-    dsp->itxfm_add[tx][ADST_DCT]  = idct_iadst_  ## sz ## _add_c; \
-    dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_ ## sz ## _add_c
-
-#define init_idct(tx, nm)                               \
-    dsp->itxfm_add[tx][DCT_DCT]   =                     \
-    dsp->itxfm_add[tx][ADST_DCT]  =                     \
-    dsp->itxfm_add[tx][DCT_ADST]  =                     \
-    dsp->itxfm_add[tx][ADST_ADST] = nm ## _add_c
-
-    init_itxfm(TX_4X4, 4x4);
-    init_itxfm(TX_8X8, 8x8);
-    init_itxfm(TX_16X16, 16x16);
-    init_idct(TX_32X32, idct_idct_32x32);
-    init_idct(4 /* lossless */, iwht_iwht_4x4);
-
-#undef init_itxfm
-#undef init_idct
-}
-
-static av_always_inline void loop_filter(uint8_t *dst, ptrdiff_t stride,
-                                         int E, int I, int H,
-                                         ptrdiff_t stridea, ptrdiff_t strideb,
-                                         int wd)
-{
-    int i;
-
-    for (i = 0; i < 8; i++, dst += stridea) {
-        int p7, p6, p5, p4;
-        int p3 = dst[strideb * -4], p2 = dst[strideb * -3];
-        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
-        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
-        int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
-        int q4, q5, q6, q7;
-        int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
-                 FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
-                 FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
-                 FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
-        int flat8out, flat8in;
-
-        if (!fm)
-            continue;
-
-        if (wd >= 16) {
-            p7 = dst[strideb * -8];
-            p6 = dst[strideb * -7];
-            p5 = dst[strideb * -6];
-            p4 = dst[strideb * -5];
-            q4 = dst[strideb * +4];
-            q5 = dst[strideb * +5];
-            q6 = dst[strideb * +6];
-            q7 = dst[strideb * +7];
-
-            flat8out = FFABS(p7 - p0) <= 1 && FFABS(p6 - p0) <= 1 &&
-                       FFABS(p5 - p0) <= 1 && FFABS(p4 - p0) <= 1 &&
-                       FFABS(q4 - q0) <= 1 && FFABS(q5 - q0) <= 1 &&
-                       FFABS(q6 - q0) <= 1 && FFABS(q7 - q0) <= 1;
-        }
-
-        if (wd >= 8)
-            flat8in = FFABS(p3 - p0) <= 1 && FFABS(p2 - p0) <= 1 &&
-                      FFABS(p1 - p0) <= 1 && FFABS(q1 - q0) <= 1 &&
-                      FFABS(q2 - q0) <= 1 && FFABS(q3 - q0) <= 1;
-
-        if (wd >= 16 && flat8out && flat8in) {
-            dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
-                                 p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
-            dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
-                                 p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
-            dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
-                                 p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
-            dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
-                                 p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
-            dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
-                                 p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
-            dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
-                                 p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
-            dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
-                                 q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
-            dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
-                                 q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
-            dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
-                                 q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
-            dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
-                                 q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
-            dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
-                                 q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
-            dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
-                                 q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
-            dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
-                                 q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
-            dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
-                                 q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
-        } else if (wd >= 8 && flat8in) {
-            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
-            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
-            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
-            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
-            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
-            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
-        } else {
-            int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H;
-
-            if (hev) {
-                int f = av_clip_int8(3 * (q0 - p0) + av_clip_int8(p1 - q1));
-                int f1 = FFMIN(f + 4, 127) >> 3;
-                int f2 = FFMIN(f + 3, 127) >> 3;
-
-                dst[strideb * -1] = av_clip_uint8(p0 + f2);
-                dst[strideb * +0] = av_clip_uint8(q0 - f1);
-            } else {
-                int f = av_clip_int8(3 * (q0 - p0));
-                int f1 = FFMIN(f + 4, 127) >> 3;
-                int f2 = FFMIN(f + 3, 127) >> 3;
-
-                dst[strideb * -1] = av_clip_uint8(p0 + f2);
-                dst[strideb * +0] = av_clip_uint8(q0 - f1);
-
-                f = (f1 + 1) >> 1;
-                dst[strideb * -2] = av_clip_uint8(p1 + f);
-                dst[strideb * +1] = av_clip_uint8(q1 - f);
-            }
-        }
-    }
-}
-
-#define lf_8_fn(dir, wd, stridea, strideb)                                  \
-static void loop_filter_ ## dir ## _ ## wd  ## _8_c(uint8_t *dst,           \
-                                                    ptrdiff_t stride,       \
-                                                    int E, int I, int H)    \
-{                                                                           \
-    loop_filter(dst, stride, E, I, H, stridea, strideb, wd);                \
-}
-
-#define lf_8_fns(wd)          \
-    lf_8_fn(h, wd, stride, 1) \
-    lf_8_fn(v, wd, 1, stride)
-
-lf_8_fns(4)
-lf_8_fns(8)
-lf_8_fns(16)
-
-#undef lf_8_fn
-#undef lf_8_fns
-
-#define lf_16_fn(dir, stridea)                                          \
-static void loop_filter_ ## dir ## _16_16_c(uint8_t *dst,               \
-                                            ptrdiff_t stride,           \
-                                            int E, int I, int H)        \
-{                                                                       \
-    loop_filter_ ## dir ## _16_8_c(dst, stride, E, I, H);               \
-    loop_filter_ ## dir ## _16_8_c(dst + 8 * stridea, stride, E, I, H); \
-}
-
-lf_16_fn(h, stride)
-lf_16_fn(v, 1)
-
-#undef lf_16_fn
-
-#define lf_mix_fn(dir, wd1, wd2, stridea)                                     \
-static void loop_filter_ ## dir ## _ ## wd1 ## wd2 ## _16_c(uint8_t *dst,     \
-                                                            ptrdiff_t stride, \
-                                                            int E, int I,     \
-                                                            int H)            \
-{                                                                             \
-    loop_filter_ ## dir ## _ ## wd1 ## _8_c(dst, stride, E & 0xff,            \
-                                            I & 0xff, H & 0xff);              \
-    loop_filter_ ## dir ## _ ## wd2 ## _8_c(dst + 8 * stridea, stride,        \
-                                            E >> 8, I >> 8, H >> 8);          \
-}
-
-#define lf_mix_fns(wd1, wd2)       \
-    lf_mix_fn(h, wd1, wd2, stride) \
-    lf_mix_fn(v, wd1, wd2, 1)
-
-lf_mix_fns(4, 4)
-lf_mix_fns(4, 8)
-lf_mix_fns(8, 4)
-lf_mix_fns(8, 8)
-
-#undef lf_mix_fn
-#undef lf_mix_fns
-
-static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
-{
-    dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
-    dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
-    dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
-    dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
-    dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
-    dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;
-
-    dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
-    dsp->loop_filter_16[1] = loop_filter_v_16_16_c;
-
-    dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
-    dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
-    dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
-    dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
-    dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
-    dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
-    dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
-    dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
-}
-
-static av_always_inline void copy_c(uint8_t *dst, const uint8_t *src,
-                                    ptrdiff_t dst_stride,
-                                    ptrdiff_t src_stride,
-                                    int w, int h)
-{
-    do {
-        memcpy(dst, src, w);
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-static av_always_inline void avg_c(uint8_t *dst, const uint8_t *src,
-                                   ptrdiff_t dst_stride,
-                                   ptrdiff_t src_stride,
-                                   int w, int h)
-{
-    do {
-        int x;
-
-        for (x = 0; x < w; x += 4)
-            AV_WN32A(&dst[x], rnd_avg32(AV_RN32A(&dst[x]), AV_RN32(&src[x])));
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-#define fpel_fn(type, sz)                                      \
-static void type ## sz ## _c(uint8_t *dst, const uint8_t *src, \
-                             ptrdiff_t dst_stride,             \
-                             ptrdiff_t src_stride,             \
-                             int h, int mx, int my)            \
-{                                                              \
-    type ## _c(dst, src, dst_stride, src_stride, sz, h);       \
-}
-
-#define copy_avg_fn(sz) \
-    fpel_fn(copy, sz)   \
-    fpel_fn(avg, sz)
-
-copy_avg_fn(64)
-copy_avg_fn(32)
-copy_avg_fn(16)
-copy_avg_fn(8)
-copy_avg_fn(4)
-
-#undef fpel_fn
-#undef copy_avg_fn
-
-static const int8_t vp9_subpel_filters[3][15][8] = {
-    [FILTER_8TAP_REGULAR] = {
-        {  0,  1,  -5, 126,   8,  -3,  1,  0 },
-        { -1,  3, -10, 122,  18,  -6,  2,  0 },
-        { -1,  4, -13, 118,  27,  -9,  3, -1 },
-        { -1,  4, -16, 112,  37, -11,  4, -1 },
-        { -1,  5, -18, 105,  48, -14,  4, -1 },
-        { -1,  5, -19,  97,  58, -16,  5, -1 },
-        { -1,  6, -19,  88,  68, -18,  5, -1 },
-        { -1,  6, -19,  78,  78, -19,  6, -1 },
-        { -1,  5, -18,  68,  88, -19,  6, -1 },
-        { -1,  5, -16,  58,  97, -19,  5, -1 },
-        { -1,  4, -14,  48, 105, -18,  5, -1 },
-        { -1,  4, -11,  37, 112, -16,  4, -1 },
-        { -1,  3,  -9,  27, 118, -13,  4, -1 },
-        {  0,  2,  -6,  18, 122, -10,  3, -1 },
-        {  0,  1,  -3,   8, 126,  -5,  1,  0 },
-    }, [FILTER_8TAP_SHARP] = {
-        { -1,  3,  -7, 127,   8,  -3,  1,  0 },
-        { -2,  5, -13, 125,  17,  -6,  3, -1 },
-        { -3,  7, -17, 121,  27, -10,  5, -2 },
-        { -4,  9, -20, 115,  37, -13,  6, -2 },
-        { -4, 10, -23, 108,  48, -16,  8, -3 },
-        { -4, 10, -24, 100,  59, -19,  9, -3 },
-        { -4, 11, -24,  90,  70, -21, 10, -4 },
-        { -4, 11, -23,  80,  80, -23, 11, -4 },
-        { -4, 10, -21,  70,  90, -24, 11, -4 },
-        { -3,  9, -19,  59, 100, -24, 10, -4 },
-        { -3,  8, -16,  48, 108, -23, 10, -4 },
-        { -2,  6, -13,  37, 115, -20,  9, -4 },
-        { -2,  5, -10,  27, 121, -17,  7, -3 },
-        { -1,  3,  -6,  17, 125, -13,  5, -2 },
-        {  0,  1,  -3,   8, 127,  -7,  3, -1 },
-    }, [FILTER_8TAP_SMOOTH] = {
-        { -3, -1,  32,  64,  38,   1, -3,  0 },
-        { -2, -2,  29,  63,  41,   2, -3,  0 },
-        { -2, -2,  26,  63,  43,   4, -4,  0 },
-        { -2, -3,  24,  62,  46,   5, -4,  0 },
-        { -2, -3,  21,  60,  49,   7, -4,  0 },
-        { -1, -4,  18,  59,  51,   9, -4,  0 },
-        { -1, -4,  16,  57,  53,  12, -4, -1 },
-        { -1, -4,  14,  55,  55,  14, -4, -1 },
-        { -1, -4,  12,  53,  57,  16, -4, -1 },
-        {  0, -4,   9,  51,  59,  18, -4, -1 },
-        {  0, -4,   7,  49,  60,  21, -3, -2 },
-        {  0, -4,   5,  46,  62,  24, -3, -2 },
-        {  0, -4,   4,  43,  63,  26, -2, -2 },
-        {  0, -3,   2,  41,  63,  29, -2, -2 },
-        {  0, -3,   1,  38,  64,  32, -1, -3 },
-    }
-};
-
-#define FILTER_8TAP(src, x, F, stride)              \
-    av_clip_uint8((F[0] * src[x + -3 * stride] +    \
-                   F[1] * src[x + -2 * stride] +    \
-                   F[2] * src[x + -1 * stride] +    \
-                   F[3] * src[x + +0 * stride] +    \
-                   F[4] * src[x + +1 * stride] +    \
-                   F[5] * src[x + +2 * stride] +    \
-                   F[6] * src[x + +3 * stride] +    \
-                   F[7] * src[x + +4 * stride] + 64) >> 7)
-
-static av_always_inline void do_8tap_1d_c(uint8_t *dst, const uint8_t *src,
-                                          ptrdiff_t dst_stride,
-                                          ptrdiff_t src_stride,
-                                          int w, int h, ptrdiff_t ds,
-                                          const int8_t *filter, int avg)
-{
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            if (avg)
-                dst[x] = (dst[x] + FILTER_8TAP(src, x, filter, ds) + 1) >> 1;
-            else
-                dst[x] = FILTER_8TAP(src, x, filter, ds);
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-#define filter_8tap_1d_fn(opn, opa, dir, ds)                                \
-static av_noinline void opn ## _8tap_1d_ ## dir ## _c(uint8_t *dst,         \
-                                                      const uint8_t *src,   \
-                                                      ptrdiff_t dst_stride, \
-                                                      ptrdiff_t src_stride, \
-                                                      int w, int h,         \
-                                                      const int8_t *filter) \
-{                                                                           \
-    do_8tap_1d_c(dst, src, dst_stride, src_stride, w, h, ds, filter, opa);  \
-}
-
-filter_8tap_1d_fn(put, 0, v, src_stride)
-filter_8tap_1d_fn(put, 0, h, 1)
-filter_8tap_1d_fn(avg, 1, v, src_stride)
-filter_8tap_1d_fn(avg, 1, h, 1)
-
-#undef filter_8tap_1d_fn
-
-static av_always_inline void do_8tap_2d_c(uint8_t *dst, const uint8_t *src,
-                                          ptrdiff_t dst_stride,
-                                          ptrdiff_t src_stride,
-                                          int w, int h, const int8_t *filterx,
-                                          const int8_t *filtery, int avg)
-{
-    int tmp_h = h + 7;
-    uint8_t tmp[64 * 71], *tmp_ptr = tmp;
-
-    src -= src_stride * 3;
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1);
-
-        tmp_ptr += 64;
-        src     += src_stride;
-    } while (--tmp_h);
-
-    tmp_ptr = tmp + 64 * 3;
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            if (avg)
-                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1;
-            else
-                dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64);
-
-        tmp_ptr += 64;
-        dst += dst_stride;
-    } while (--h);
-}
-
-#define filter_8tap_2d_fn(opn, opa)                                     \
-static av_noinline void opn ## _8tap_2d_hv_c(uint8_t *dst,              \
-                                             const uint8_t *src,        \
-                                             ptrdiff_t dst_stride,      \
-                                             ptrdiff_t src_stride,      \
-                                             int w, int h,              \
-                                             const int8_t *filterx,     \
-                                             const int8_t *filtery)     \
-{                                                                       \
-    do_8tap_2d_c(dst, src, dst_stride, src_stride,                      \
-                 w, h, filterx, filtery, opa);                          \
-}
-
-filter_8tap_2d_fn(put, 0)
-filter_8tap_2d_fn(avg, 1)
-
-#undef filter_8tap_2d_fn
-
-#undef FILTER_8TAP
-
-#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg)                   \
-static void                                                                 \
-avg ## _8tap_ ## type ## _ ## sz ## dir ## _c(uint8_t *dst,                 \
-                                              const uint8_t *src,           \
-                                              ptrdiff_t dst_stride,         \
-                                              ptrdiff_t src_stride,         \
-                                              int h, int mx, int my)        \
-{                                                                           \
-    avg ## _8tap_1d_ ## dir ## _c(dst, src, dst_stride, src_stride, sz, h,  \
-                                  vp9_subpel_filters[type_idx][dir_m - 1]); \
-}
-
-#define filter_fn_2d(sz, type, type_idx, avg)                               \
-static void avg ## _8tap_ ## type ## _ ## sz ## hv_c(uint8_t *dst,          \
-                                                     const uint8_t *src,    \
-                                                     ptrdiff_t dst_stride,  \
-                                                     ptrdiff_t src_stride,  \
-                                                     int h, int mx, int my) \
-{                                                                           \
-    avg ## _8tap_2d_hv_c(dst, src, dst_stride, src_stride, sz, h,           \
-                         vp9_subpel_filters[type_idx][mx - 1],              \
-                         vp9_subpel_filters[type_idx][my - 1]);             \
-}
-
-#define FILTER_BILIN(src, x, mxy, stride)                       \
-    (src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4))
-
-static av_always_inline void do_bilin_1d_c(uint8_t *dst,
-                                           const uint8_t *src,
-                                           ptrdiff_t dst_stride,
-                                           ptrdiff_t src_stride,
-                                           int w, int h, ptrdiff_t ds,
-                                           int mxy, int avg)
-{
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            if (avg)
-                dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1;
-            else
-                dst[x] = FILTER_BILIN(src, x, mxy, ds);
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-#define bilin_1d_fn(opn, opa, dir, ds)                                        \
-static av_noinline void opn ## _bilin_1d_ ## dir ## _c(uint8_t *dst,          \
-                                                       const uint8_t *src,    \
-                                                       ptrdiff_t dst_stride,  \
-                                                       ptrdiff_t src_stride,  \
-                                                       int w, int h, int mxy) \
-{                                                                             \
-    do_bilin_1d_c(dst, src, dst_stride, src_stride, w, h, ds, mxy, opa);      \
-}
-
-bilin_1d_fn(put, 0, v, src_stride)
-bilin_1d_fn(put, 0, h, 1)
-bilin_1d_fn(avg, 1, v, src_stride)
-bilin_1d_fn(avg, 1, h, 1)
-
-#undef bilin_1d_fn
-
-static av_always_inline void do_bilin_2d_c(uint8_t *dst,
-                                           const uint8_t *src,
-                                           ptrdiff_t dst_stride,
-                                           ptrdiff_t src_stride,
-                                           int w, int h, int mx, int my,
-                                           int avg)
-{
-    uint8_t tmp[64 * 65], *tmp_ptr = tmp;
-    int tmp_h = h + 1;
-
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);
-
-        tmp_ptr += 64;
-        src     += src_stride;
-    } while (--tmp_h);
-
-    tmp_ptr = tmp;
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            if (avg)
-                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
-            else
-                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
-
-        tmp_ptr += 64;
-        dst += dst_stride;
-    } while (--h);
-}
-
-#define bilin_2d_fn(opn, opa)                                           \
-static av_noinline void opn ## _bilin_2d_hv_c(uint8_t *dst,             \
-                                              const uint8_t *src,       \
-                                              ptrdiff_t dst_stride,     \
-                                              ptrdiff_t src_stride,     \
-                                              int w, int h,             \
-                                              int mx, int my)           \
-{                                                                       \
-    do_bilin_2d_c(dst, src, dst_stride, src_stride, w, h, mx, my, opa); \
-}
-
-bilin_2d_fn(put, 0)
-bilin_2d_fn(avg, 1)
-
-#undef bilin_2d_fn
-
-#undef FILTER_BILIN
-
-#define bilinf_fn_1d(sz, dir, dir_m, avg)                               \
-static void avg ## _bilin_ ## sz ## dir ## _c(uint8_t *dst,             \
-                                              const uint8_t *src,       \
-                                              ptrdiff_t dst_stride,     \
-                                              ptrdiff_t src_stride,     \
-                                              int h, int mx, int my)    \
-{                                                                       \
-    avg ## _bilin_1d_ ## dir ## _c(dst, src, dst_stride, src_stride,    \
-                                   sz, h, dir_m);                       \
-}
-
-#define bilinf_fn_2d(sz, avg)                                        \
-static void avg ## _bilin_ ## sz ## hv_c(uint8_t *dst,               \
-                                         const uint8_t *src,         \
-                                         ptrdiff_t dst_stride,       \
-                                         ptrdiff_t src_stride,       \
-                                         int h, int mx, int my)      \
-{                                                                    \
-    avg ## _bilin_2d_hv_c(dst, src, dst_stride, src_stride,          \
-                          sz, h, mx, my);                            \
-}
-
-#define filter_fn(sz, avg)                                     \
-    filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
-    filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
-    filter_fn_2d(sz, regular, FILTER_8TAP_REGULAR, avg)        \
-    filter_fn_1d(sz, h, mx, smooth, FILTER_8TAP_SMOOTH, avg)   \
-    filter_fn_1d(sz, v, my, smooth, FILTER_8TAP_SMOOTH, avg)   \
-    filter_fn_2d(sz, smooth, FILTER_8TAP_SMOOTH, avg)          \
-    filter_fn_1d(sz, h, mx, sharp, FILTER_8TAP_SHARP, avg)     \
-    filter_fn_1d(sz, v, my, sharp, FILTER_8TAP_SHARP, avg)     \
-    filter_fn_2d(sz, sharp, FILTER_8TAP_SHARP, avg)            \
-    bilinf_fn_1d(sz, h, mx, avg)                               \
-    bilinf_fn_1d(sz, v, my, avg)                               \
-    bilinf_fn_2d(sz, avg)
-
-#define filter_fn_set(avg) \
-    filter_fn(64, avg)     \
-    filter_fn(32, avg)     \
-    filter_fn(16, avg)     \
-    filter_fn(8, avg)      \
-    filter_fn(4, avg)
-
-filter_fn_set(put)
-filter_fn_set(avg)
-
-#undef filter_fn
-#undef filter_fn_set
-#undef filter_fn_1d
-#undef filter_fn_2d
-#undef bilinf_fn_1d
-#undef bilinf_fn_2d
-
-static av_cold void vp9dsp_mc_init(VP9DSPContext *dsp)
-{
-#define init_fpel(idx1, idx2, sz, type)                                \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][0][0]  = type ## sz ## _c; \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type ## sz ## _c; \
-    dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][0][0]   = type ## sz ## _c; \
-    dsp->mc[idx1][FILTER_BILINEAR][idx2][0][0]     = type ## sz ## _c
-
-#define init_copy_avg(idx, sz)          \
-    init_fpel(idx, 0, sz, copy);        \
-    init_fpel(idx, 1, sz, avg)
-
-    init_copy_avg(0, 64);
-    init_copy_avg(1, 32);
-    init_copy_avg(2, 16);
-    init_copy_avg(3,  8);
-    init_copy_avg(4,  4);
-
-#undef init_copy_avg
-#undef init_fpel
-
-#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type)             \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv]  = type ## _8tap_smooth_  ## sz ## dir ## _c; \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _c; \
-    dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv]   = type ## _8tap_sharp_   ## sz ## dir ## _c; \
-    dsp->mc[idx1][FILTER_BILINEAR][idx2][idxh][idxv]     = type ## _bilin_        ## sz ## dir ## _c
-
-#define init_subpel2(idx, idxh, idxv, dir, type)     \
-    init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
-    init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
-    init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
-    init_subpel1(3, idx, idxh, idxv,  8, dir, type); \
-    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
-
-#define init_subpel3(idx, type)         \
-    init_subpel2(idx, 1, 1, hv, type);  \
-    init_subpel2(idx, 0, 1, v, type);   \
-    init_subpel2(idx, 1, 0, h, type)
-
-    init_subpel3(0, put);
-    init_subpel3(1, avg);
-
-#undef init_subpel1
-#undef init_subpel2
-#undef init_subpel3
-}
-
-av_cold void ff_vp9dsp_init(VP9DSPContext *dsp)
-{
-    vp9dsp_intrapred_init(dsp);
-    vp9dsp_itxfm_init(dsp);
-    vp9dsp_loopfilter_init(dsp);
-    vp9dsp_mc_init(dsp);
-
-    if (ARCH_X86)
-        ff_vp9dsp_init_x86(dsp);
+    if (ARCH_X86) ff_vp9dsp_init_x86(dsp, bpp, bitexact);
+    if (ARCH_MIPS) ff_vp9dsp_init_mips(dsp, bpp);
 }
diff --git a/libavcodec/vp9dsp.h b/libavcodec/vp9dsp.h
new file mode 100644
index 0000000000..016a9bb231
--- /dev/null
+++ b/libavcodec/vp9dsp.h
@@ -0,0 +1,132 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VP9DSP_H
+#define AVCODEC_VP9DSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "vp9.h"
+
+typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+                            const uint8_t *ref, ptrdiff_t ref_stride,
+                            int h, int mx, int my);
+typedef void (*vp9_scaled_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+                                   const uint8_t *ref, ptrdiff_t ref_stride,
+                                   int h, int mx, int my, int dx, int dy);
+
+typedef struct VP9DSPContext {
+    /*
+     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32
+     * dimension 2: intra prediction modes
+     *
+     * dst/left/top is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
+     * stride is aligned by 16 pixels
+     * top[-1] is top/left; top[4,7] is top-right for 4x4
+     */
+    // FIXME(rbultje) maybe replace left/top pointers with HAVE_TOP/
+    // HAVE_LEFT/HAVE_TOPRIGHT flags instead, and then handle it in-place?
+    // also needs to fit in with what h264/vp8/etc do
+    void (*intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst,
+                                                         ptrdiff_t stride,
+                                                         const uint8_t *left,
+                                                         const uint8_t *top);
+
+    /*
+     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32, 4=lossless (3-4=dct only)
+     * dimension 2: 0=dct/dct, 1=dct/adst, 2=adst/dct, 3=adst/adst
+     *
+     * dst is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
+     * stride is aligned by 16 pixels
+     * block is 16-byte aligned
+     * eob indicates the position (+1) of the last non-zero coefficient,
+     * in scan-order. This can be used to write faster versions, e.g. a
+     * dc-only 4x4/8x8/16x16/32x32, or a 4x4-only (eob<10) 8x8/16x16/32x32,
+     * etc.
+     */
+    // FIXME also write idct_add_block() versions for whole (inter) pred
+    // blocks, so we can do 2 4x4s at once
+    void (*itxfm_add[N_TXFM_SIZES + 1][N_TXFM_TYPES])(uint8_t *dst,
+                                                      ptrdiff_t stride,
+                                                      int16_t *block, int eob);
+
+    /*
+     * dimension 1: width of filter (0=4, 1=8, 2=16)
+     * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * dst/stride are aligned by 8
+     */
+    void (*loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride,
+                                int mb_lim, int lim, int hev_thr);
+
+    /*
+     * dimension 1: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * The width of filter is assumed to be 16; dst/stride are aligned by 16
+     */
+    void (*loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride,
+                              int mb_lim, int lim, int hev_thr);
+
+    /*
+     * dimension 1/2: width of filter (0=4, 1=8) for each filter half
+     * dimension 3: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * dst/stride are aligned by operation size
+     * this basically calls loop_filter[d1][d3][0](), followed by
+     * loop_filter[d2][d3][0]() on the next 8 pixels
+     * mb_lim/lim/hev_thr contain two values in the lowest two bytes of the
+     * integer.
+     */
+    // FIXME perhaps a mix4 that operates on 32px (for AVX2)
+    void (*loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride,
+                                      int mb_lim, int lim, int hev_thr);
+
+    /*
+     * dimension 1: hsize (0: 64, 1: 32, 2: 16, 3: 8, 4: 4)
+     * dimension 2: filter type (0: smooth, 1: regular, 2: sharp, 3: bilin)
+     * dimension 3: averaging type (0: put, 1: avg)
+     * dimension 4: x subpel interpolation (0: none, 1: 8tap/bilin)
+     * dimension 5: y subpel interpolation (0: none, 1: 8tap/bilin)
+     *
+     * dst/stride are aligned by hsize
+     */
+    vp9_mc_func mc[5][4][2][2][2];
+
+    /*
+     * for scalable MC, first 3 dimensions identical to above, the other two
+     * don't exist since it changes per stepsize.
+     */
+    vp9_scaled_mc_func smc[5][4][2];
+} VP9DSPContext;
+
+void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact);
+
+void ff_vp9dsp_init_8(VP9DSPContext *dsp);
+void ff_vp9dsp_init_10(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12(VP9DSPContext *dsp);
+
+void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact);
+void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp);
+
+#endif /* AVCODEC_VP9DSP_H */
diff --git a/libavcodec/vp9dsp_10bpp.c b/libavcodec/vp9dsp_10bpp.c
new file mode 100644
index 0000000000..62ce182070
--- /dev/null
+++ b/libavcodec/vp9dsp_10bpp.c
@@ -0,0 +1,26 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BIT_DEPTH 10
+#define dctint int64_t
+#include "vp9dsp_template.c"
diff --git a/libavcodec/vp9dsp_12bpp.c b/libavcodec/vp9dsp_12bpp.c
new file mode 100644
index 0000000000..2f36471c5b
--- /dev/null
+++ b/libavcodec/vp9dsp_12bpp.c
@@ -0,0 +1,26 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BIT_DEPTH 12
+#define dctint int64_t
+#include "vp9dsp_template.c"
diff --git a/libavcodec/vp9dsp_8bpp.c b/libavcodec/vp9dsp_8bpp.c
new file mode 100644
index 0000000000..4b219b06b0
--- /dev/null
+++ b/libavcodec/vp9dsp_8bpp.c
@@ -0,0 +1,26 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BIT_DEPTH 8
+#define dctint int
+#include "vp9dsp_template.c"
diff --git a/libavcodec/vp9dsp_template.c b/libavcodec/vp9dsp_template.c
new file mode 100644
index 0000000000..4d810fec3a
--- /dev/null
+++ b/libavcodec/vp9dsp_template.c
@@ -0,0 +1,2601 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "bit_depth_template.c"
+#include "vp9dsp.h"
+
+#if BIT_DEPTH != 12
+
+// FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
+// back with h264pred.[ch]
+
+static void vert_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4 = AV_RN4PA(top);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, p4);
+    AV_WN4PA(dst + stride * 1, p4);
+    AV_WN4PA(dst + stride * 2, p4);
+    AV_WN4PA(dst + stride * 3, p4);
+}
+
+static void vert_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4a = AV_RN4PA(top + 0);
+    pixel4 p4b = AV_RN4PA(top + 4);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, p4a);
+        AV_WN4PA(dst + 4, p4b);
+        dst += stride;
+    }
+}
+
+static void vert_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4a = AV_RN4PA(top +  0);
+    pixel4 p4b = AV_RN4PA(top +  4);
+    pixel4 p4c = AV_RN4PA(top +  8);
+    pixel4 p4d = AV_RN4PA(top + 12);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, p4a);
+        AV_WN4PA(dst +  4, p4b);
+        AV_WN4PA(dst +  8, p4c);
+        AV_WN4PA(dst + 12, p4d);
+        dst += stride;
+    }
+}
+
+static void vert_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4a = AV_RN4PA(top +  0);
+    pixel4 p4b = AV_RN4PA(top +  4);
+    pixel4 p4c = AV_RN4PA(top +  8);
+    pixel4 p4d = AV_RN4PA(top + 12);
+    pixel4 p4e = AV_RN4PA(top + 16);
+    pixel4 p4f = AV_RN4PA(top + 20);
+    pixel4 p4g = AV_RN4PA(top + 24);
+    pixel4 p4h = AV_RN4PA(top + 28);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, p4a);
+        AV_WN4PA(dst +  4, p4b);
+        AV_WN4PA(dst +  8, p4c);
+        AV_WN4PA(dst + 12, p4d);
+        AV_WN4PA(dst + 16, p4e);
+        AV_WN4PA(dst + 20, p4f);
+        AV_WN4PA(dst + 24, p4g);
+        AV_WN4PA(dst + 28, p4h);
+        dst += stride;
+    }
+}
+
+static void hor_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                      const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, PIXEL_SPLAT_X4(left[3]));
+    AV_WN4PA(dst + stride * 1, PIXEL_SPLAT_X4(left[2]));
+    AV_WN4PA(dst + stride * 2, PIXEL_SPLAT_X4(left[1]));
+    AV_WN4PA(dst + stride * 3, PIXEL_SPLAT_X4(left[0]));
+}
+
+static void hor_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                      const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        pixel4 p4 = PIXEL_SPLAT_X4(left[7 - y]);
+
+        AV_WN4PA(dst + 0, p4);
+        AV_WN4PA(dst + 4, p4);
+        dst += stride;
+    }
+}
+
+static void hor_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                        const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        pixel4 p4 = PIXEL_SPLAT_X4(left[15 - y]);
+
+        AV_WN4PA(dst +  0, p4);
+        AV_WN4PA(dst +  4, p4);
+        AV_WN4PA(dst +  8, p4);
+        AV_WN4PA(dst + 12, p4);
+        dst += stride;
+    }
+}
+
+static void hor_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                        const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        pixel4 p4 = PIXEL_SPLAT_X4(left[31 - y]);
+
+        AV_WN4PA(dst +  0, p4);
+        AV_WN4PA(dst +  4, p4);
+        AV_WN4PA(dst +  8, p4);
+        AV_WN4PA(dst + 12, p4);
+        AV_WN4PA(dst + 16, p4);
+        AV_WN4PA(dst + 20, p4);
+        AV_WN4PA(dst + 24, p4);
+        AV_WN4PA(dst + 28, p4);
+        dst += stride;
+    }
+}
+
+#endif /* BIT_DEPTH != 12 */
+
+static void tm_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1];
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 4; y++) {
+        int l_m_tl = left[3 - y] - tl;
+
+        dst[0] = av_clip_pixel(top[0] + l_m_tl);
+        dst[1] = av_clip_pixel(top[1] + l_m_tl);
+        dst[2] = av_clip_pixel(top[2] + l_m_tl);
+        dst[3] = av_clip_pixel(top[3] + l_m_tl);
+        dst += stride;
+    }
+}
+
+static void tm_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1];
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        int l_m_tl = left[7 - y] - tl;
+
+        dst[0] = av_clip_pixel(top[0] + l_m_tl);
+        dst[1] = av_clip_pixel(top[1] + l_m_tl);
+        dst[2] = av_clip_pixel(top[2] + l_m_tl);
+        dst[3] = av_clip_pixel(top[3] + l_m_tl);
+        dst[4] = av_clip_pixel(top[4] + l_m_tl);
+        dst[5] = av_clip_pixel(top[5] + l_m_tl);
+        dst[6] = av_clip_pixel(top[6] + l_m_tl);
+        dst[7] = av_clip_pixel(top[7] + l_m_tl);
+        dst += stride;
+    }
+}
+
+static void tm_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1];
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        int l_m_tl = left[15 - y] - tl;
+
+        dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
+        dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
+        dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
+        dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
+        dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
+        dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
+        dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
+        dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
+        dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
+        dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
+        dst[10] = av_clip_pixel(top[10] + l_m_tl);
+        dst[11] = av_clip_pixel(top[11] + l_m_tl);
+        dst[12] = av_clip_pixel(top[12] + l_m_tl);
+        dst[13] = av_clip_pixel(top[13] + l_m_tl);
+        dst[14] = av_clip_pixel(top[14] + l_m_tl);
+        dst[15] = av_clip_pixel(top[15] + l_m_tl);
+        dst += stride;
+    }
+}
+
+static void tm_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1];
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        int l_m_tl = left[31 - y] - tl;
+
+        dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
+        dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
+        dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
+        dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
+        dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
+        dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
+        dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
+        dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
+        dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
+        dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
+        dst[10] = av_clip_pixel(top[10] + l_m_tl);
+        dst[11] = av_clip_pixel(top[11] + l_m_tl);
+        dst[12] = av_clip_pixel(top[12] + l_m_tl);
+        dst[13] = av_clip_pixel(top[13] + l_m_tl);
+        dst[14] = av_clip_pixel(top[14] + l_m_tl);
+        dst[15] = av_clip_pixel(top[15] + l_m_tl);
+        dst[16] = av_clip_pixel(top[16] + l_m_tl);
+        dst[17] = av_clip_pixel(top[17] + l_m_tl);
+        dst[18] = av_clip_pixel(top[18] + l_m_tl);
+        dst[19] = av_clip_pixel(top[19] + l_m_tl);
+        dst[20] = av_clip_pixel(top[20] + l_m_tl);
+        dst[21] = av_clip_pixel(top[21] + l_m_tl);
+        dst[22] = av_clip_pixel(top[22] + l_m_tl);
+        dst[23] = av_clip_pixel(top[23] + l_m_tl);
+        dst[24] = av_clip_pixel(top[24] + l_m_tl);
+        dst[25] = av_clip_pixel(top[25] + l_m_tl);
+        dst[26] = av_clip_pixel(top[26] + l_m_tl);
+        dst[27] = av_clip_pixel(top[27] + l_m_tl);
+        dst[28] = av_clip_pixel(top[28] + l_m_tl);
+        dst[29] = av_clip_pixel(top[29] + l_m_tl);
+        dst[30] = av_clip_pixel(top[30] + l_m_tl);
+        dst[31] = av_clip_pixel(top[31] + l_m_tl);
+        dst += stride;
+    }
+}
+
+#if BIT_DEPTH != 12
+
+static void dc_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] +
+                                top[0] + top[1] + top[2] + top[3] + 4) >> 3);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, dc);
+    AV_WN4PA(dst + stride * 1, dc);
+    AV_WN4PA(dst + stride * 2, dc);
+    AV_WN4PA(dst + stride * 3, dc);
+}
+
+static void dc_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
+          left[6] + left[7] + top[0] + top[1] + top[2] + top[3] +
+          top[4] + top[5] + top[6] + top[7] + 8) >> 4);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, dc);
+        AV_WN4PA(dst + 4, dc);
+        dst += stride;
+    }
+}
+
+static void dc_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] +
+          left[7] + left[8] + left[9] + left[10] + left[11] + left[12] +
+          left[13] + left[14] + left[15] + top[0] + top[1] + top[2] + top[3] +
+          top[4] + top[5] + top[6] + top[7] + top[8] + top[9] + top[10] +
+          top[11] + top[12] + top[13] + top[14] + top[15] + 16) >> 5);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, dc);
+        AV_WN4PA(dst +  4, dc);
+        AV_WN4PA(dst +  8, dc);
+        AV_WN4PA(dst + 12, dc);
+        dst += stride;
+    }
+}
+
+static void dc_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] +
+          left[7] + left[8] + left[9] + left[10] + left[11] + left[12] +
+          left[13] + left[14] + left[15] + left[16] + left[17] + left[18] +
+          left[19] + left[20] + left[21] + left[22] + left[23] + left[24] +
+          left[25] + left[26] + left[27] + left[28] + left[29] + left[30] +
+          left[31] + top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
+          top[6] + top[7] + top[8] + top[9] + top[10] + top[11] + top[12] +
+          top[13] + top[14] + top[15] + top[16] + top[17] + top[18] + top[19] +
+          top[20] + top[21] + top[22] + top[23] + top[24] + top[25] + top[26] +
+          top[27] + top[28] + top[29] + top[30] + top[31] + 32) >> 6);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, dc);
+        AV_WN4PA(dst +  4, dc);
+        AV_WN4PA(dst +  8, dc);
+        AV_WN4PA(dst + 12, dc);
+        AV_WN4PA(dst + 16, dc);
+        AV_WN4PA(dst + 20, dc);
+        AV_WN4PA(dst + 24, dc);
+        AV_WN4PA(dst + 28, dc);
+        dst += stride;
+    }
+}
+
+static void dc_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                          const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] + 2) >> 2);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, dc);
+    AV_WN4PA(dst + stride * 1, dc);
+    AV_WN4PA(dst + stride * 2, dc);
+    AV_WN4PA(dst + stride * 3, dc);
+}
+
+static void dc_left_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                          const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((left[0] + left[1] + left[2] + left[3] +
+          left[4] + left[5] + left[6] + left[7] + 4) >> 3);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, dc);
+        AV_WN4PA(dst + 4, dc);
+        dst += stride;
+    }
+}
+
+static void dc_left_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                            const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
+          left[6] + left[7] + left[8] + left[9] + left[10] + left[11] +
+          left[12] + left[13] + left[14] + left[15] + 8) >> 4);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, dc);
+        AV_WN4PA(dst +  4, dc);
+        AV_WN4PA(dst +  8, dc);
+        AV_WN4PA(dst + 12, dc);
+        dst += stride;
+    }
+}
+
+static void dc_left_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                            const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
+          left[6] + left[7] + left[8] + left[9] + left[10] + left[11] +
+          left[12] + left[13] + left[14] + left[15] + left[16] + left[17] +
+          left[18] + left[19] + left[20] + left[21] + left[22] + left[23] +
+          left[24] + left[25] + left[26] + left[27] + left[28] + left[29] +
+          left[30] + left[31] + 16) >> 5);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, dc);
+        AV_WN4PA(dst +  4, dc);
+        AV_WN4PA(dst +  8, dc);
+        AV_WN4PA(dst + 12, dc);
+        AV_WN4PA(dst + 16, dc);
+        AV_WN4PA(dst + 20, dc);
+        AV_WN4PA(dst + 24, dc);
+        AV_WN4PA(dst + 28, dc);
+        dst += stride;
+    }
+}
+
+static void dc_top_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4((top[0] + top[1] + top[2] + top[3] + 2) >> 2);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, dc);
+    AV_WN4PA(dst + stride * 1, dc);
+    AV_WN4PA(dst + stride * 2, dc);
+    AV_WN4PA(dst + stride * 3, dc);
+}
+
+static void dc_top_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((top[0] + top[1] + top[2] + top[3] +
+          top[4] + top[5] + top[6] + top[7] + 4) >> 3);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, dc);
+        AV_WN4PA(dst + 4, dc);
+        dst += stride;
+    }
+}
+
+static void dc_top_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
+          top[6] + top[7] + top[8] + top[9] + top[10] + top[11] +
+          top[12] + top[13] + top[14] + top[15] + 8) >> 4);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, dc);
+        AV_WN4PA(dst +  4, dc);
+        AV_WN4PA(dst +  8, dc);
+        AV_WN4PA(dst + 12, dc);
+        dst += stride;
+    }
+}
+
+static void dc_top_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 dc = PIXEL_SPLAT_X4
+        ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
+          top[6] + top[7] + top[8] + top[9] + top[10] + top[11] +
+          top[12] + top[13] + top[14] + top[15] + top[16] + top[17] +
+          top[18] + top[19] + top[20] + top[21] + top[22] + top[23] +
+          top[24] + top[25] + top[26] + top[27] + top[28] + top[29] +
+          top[30] + top[31] + 16) >> 5);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, dc);
+        AV_WN4PA(dst +  4, dc);
+        AV_WN4PA(dst +  8, dc);
+        AV_WN4PA(dst + 12, dc);
+        AV_WN4PA(dst + 16, dc);
+        AV_WN4PA(dst + 20, dc);
+        AV_WN4PA(dst + 24, dc);
+        AV_WN4PA(dst + 28, dc);
+        dst += stride;
+    }
+}
+
+#endif /* BIT_DEPTH != 12 */
+
+static void dc_128_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, val);
+    AV_WN4PA(dst + stride * 1, val);
+    AV_WN4PA(dst + stride * 2, val);
+    AV_WN4PA(dst + stride * 3, val);
+}
+
+static void dc_128_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, val);
+        AV_WN4PA(dst + 4, val);
+        dst += stride;
+    }
+}
+
+static void dc_128_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, val);
+        AV_WN4PA(dst +  4, val);
+        AV_WN4PA(dst +  8, val);
+        AV_WN4PA(dst + 12, val);
+        dst += stride;
+    }
+}
+
+static void dc_128_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, val);
+        AV_WN4PA(dst +  4, val);
+        AV_WN4PA(dst +  8, val);
+        AV_WN4PA(dst + 12, val);
+        AV_WN4PA(dst + 16, val);
+        AV_WN4PA(dst + 20, val);
+        AV_WN4PA(dst + 24, val);
+        AV_WN4PA(dst + 28, val);
+        dst += stride;
+    }
+}
+
+static void dc_127_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, val);
+    AV_WN4PA(dst + stride * 1, val);
+    AV_WN4PA(dst + stride * 2, val);
+    AV_WN4PA(dst + stride * 3, val);}
+
+static void dc_127_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, val);
+        AV_WN4PA(dst + 4, val);
+        dst += stride;
+    }
+}
+
+static void dc_127_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, val);
+        AV_WN4PA(dst +  4, val);
+        AV_WN4PA(dst +  8, val);
+        AV_WN4PA(dst + 12, val);
+        dst += stride;
+    }
+}
+
+static void dc_127_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, val);
+        AV_WN4PA(dst +  4, val);
+        AV_WN4PA(dst +  8, val);
+        AV_WN4PA(dst + 12, val);
+        AV_WN4PA(dst + 16, val);
+        AV_WN4PA(dst + 20, val);
+        AV_WN4PA(dst + 24, val);
+        AV_WN4PA(dst + 28, val);
+        dst += stride;
+    }
+}
+
+static void dc_129_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, val);
+    AV_WN4PA(dst + stride * 1, val);
+    AV_WN4PA(dst + stride * 2, val);
+    AV_WN4PA(dst + stride * 3, val);
+}
+
+static void dc_129_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, val);
+        AV_WN4PA(dst + 4, val);
+        dst += stride;
+    }
+}
+
+static void dc_129_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, val);
+        AV_WN4PA(dst +  4, val);
+        AV_WN4PA(dst +  8, val);
+        AV_WN4PA(dst + 12, val);
+        dst += stride;
+    }
+}
+
+static void dc_129_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, val);
+        AV_WN4PA(dst +  4, val);
+        AV_WN4PA(dst +  8, val);
+        AV_WN4PA(dst + 12, val);
+        AV_WN4PA(dst + 16, val);
+        AV_WN4PA(dst + 20, val);
+        AV_WN4PA(dst + 24, val);
+        AV_WN4PA(dst + 28, val);
+        dst += stride;
+    }
+}
+
+#if BIT_DEPTH != 12
+
+#if BIT_DEPTH == 8
+#define memset_bpc memset
+#else
+static inline void memset_bpc(uint16_t *dst, int val, int len) {
+    int n;
+    for (n = 0; n < len; n++) {
+        dst[n] = val;
+    }
+}
+#endif
+
+#define DST(x, y) dst[(x) + (y) * stride]
+
+static void diag_downleft_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                                const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
+        a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7];
+
+    stride /= sizeof(pixel);
+    DST(0,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(1,0) = DST(0,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
+    DST(2,0) = DST(1,1) = DST(0,2) = (a2 + a3 * 2 + a4 + 2) >> 2;
+    DST(3,0) = DST(2,1) = DST(1,2) = DST(0,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
+    DST(3,1) = DST(2,2) = DST(1,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
+    DST(3,2) = DST(2,3) = (a5 + a6 * 2 + a7 + 2) >> 2;
+    DST(3,3) = a7;  // note: this is different from vp8 and such
+}
+
+#define def_diag_downleft(size) \
+static void diag_downleft_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                              const uint8_t *left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    int i, j; \
+    pixel v[size - 1]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) \
+        v[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
+    v[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
+\
+    for (j = 0; j < size; j++) { \
+        memcpy(dst + j*stride, v + j, (size - 1 - j) * sizeof(pixel)); \
+        memset_bpc(dst + j*stride + size - 1 - j, top[size - 1], j + 1); \
+    } \
+}
+
+def_diag_downleft(8)
+def_diag_downleft(16)
+def_diag_downleft(32)
+
+static void diag_downright_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                                 const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
+        l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0];
+
+    stride /= sizeof(pixel);
+    DST(0,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
+    DST(0,2) = DST(1,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
+    DST(0,1) = DST(1,2) = DST(2,3) = (tl + l0 * 2 + l1 + 2) >> 2;
+    DST(0,0) = DST(1,1) = DST(2,2) = DST(3,3) = (l0 + tl * 2 + a0 + 2) >> 2;
+    DST(1,0) = DST(2,1) = DST(3,2) = (tl + a0 * 2 + a1 + 2) >> 2;
+    DST(2,0) = DST(3,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(3,0) = (a1 + a2 * 2 + a3 + 2) >> 2;
+}
+
+#define def_diag_downright(size) \
+static void diag_downright_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                               const uint8_t *_left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel v[size + size - 1]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        v[i           ] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
+        v[size + 1 + i] = (top[i]  + top[i + 1]  * 2 + top[i + 2]  + 2) >> 2; \
+    } \
+    v[size - 2] = (left[size - 2] + left[size - 1] * 2 + top[-1] + 2) >> 2; \
+    v[size - 1] = (left[size - 1] + top[-1] * 2 + top[ 0] + 2) >> 2; \
+    v[size    ] = (top[-1] + top[0]  * 2 + top[ 1] + 2) >> 2; \
+\
+    for (j = 0; j < size; j++) \
+        memcpy(dst + j*stride, v + size - 1 - j, size * sizeof(pixel)); \
+}
+
+def_diag_downright(8)
+def_diag_downright(16)
+def_diag_downright(32)
+
+static void vert_right_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                             const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
+        l0 = left[3], l1 = left[2], l2 = left[1];
+
+    stride /= sizeof(pixel);
+    DST(0,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
+    DST(0,2) = (tl + l0 * 2 + l1 + 2) >> 2;
+    DST(0,0) = DST(1,2) = (tl + a0 + 1) >> 1;
+    DST(0,1) = DST(1,3) = (l0 + tl * 2 + a0 + 2) >> 2;
+    DST(1,0) = DST(2,2) = (a0 + a1 + 1) >> 1;
+    DST(1,1) = DST(2,3) = (tl + a0 * 2 + a1 + 2) >> 2;
+    DST(2,0) = DST(3,2) = (a1 + a2 + 1) >> 1;
+    DST(2,1) = DST(3,3) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(3,0) = (a2 + a3 + 1) >> 1;
+    DST(3,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
+}
+
+#define def_vert_right(size) \
+static void vert_right_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                           const uint8_t *_left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel ve[size + size/2 - 1], vo[size + size/2 - 1]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size/2 - 2; i++) { \
+        vo[i] = (left[i*2 + 3] + left[i*2 + 2] * 2 + left[i*2 + 1] + 2) >> 2; \
+        ve[i] = (left[i*2 + 4] + left[i*2 + 3] * 2 + left[i*2 + 2] + 2) >> 2; \
+    } \
+    vo[size/2 - 2] = (left[size - 1] + left[size - 2] * 2 + left[size - 3] + 2) >> 2; \
+    ve[size/2 - 2] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
+\
+    ve[size/2 - 1] = (top[-1] + top[0] + 1) >> 1; \
+    vo[size/2 - 1] = (left[size - 1] + top[-1] * 2 + top[0] + 2) >> 2; \
+    for (i = 0; i < size - 1; i++) { \
+        ve[size/2 + i] = (top[i] + top[i + 1] + 1) >> 1; \
+        vo[size/2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
+    } \
+\
+    for (j = 0; j < size / 2; j++) { \
+        memcpy(dst +  j*2     *stride, ve + size/2 - 1 - j, size * sizeof(pixel)); \
+        memcpy(dst + (j*2 + 1)*stride, vo + size/2 - 1 - j, size * sizeof(pixel)); \
+    } \
+}
+
+def_vert_right(8)
+def_vert_right(16)
+def_vert_right(32)
+
+static void hor_down_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    int l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0],
+        tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2];
+
+    stride /= sizeof(pixel);
+    DST(2,0) = (tl + a0 * 2 + a1 + 2) >> 2;
+    DST(3,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(0,0) = DST(2,1) = (tl + l0 + 1) >> 1;
+    DST(1,0) = DST(3,1) = (a0 + tl * 2 + l0 + 2) >> 2;
+    DST(0,1) = DST(2,2) = (l0 + l1 + 1) >> 1;
+    DST(1,1) = DST(3,2) = (tl + l0 * 2 + l1 + 2) >> 2;
+    DST(0,2) = DST(2,3) = (l1 + l2 + 1) >> 1;
+    DST(1,2) = DST(3,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
+    DST(0,3) = (l2 + l3 + 1) >> 1;
+    DST(1,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
+}
+
+#define def_hor_down(size) \
+static void hor_down_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                         const uint8_t *_left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel v[size * 3 - 2]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        v[i*2       ] = (left[i + 1] + left[i + 0] + 1) >> 1; \
+        v[i*2    + 1] = (left[i + 2] + left[i + 1] * 2 + left[i + 0] + 2) >> 2; \
+        v[size*2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
+    } \
+    v[size*2 - 2] = (top[-1] + left[size - 1] + 1) >> 1; \
+    v[size*2 - 4] = (left[size - 1] + left[size - 2] + 1) >> 1; \
+    v[size*2 - 1] = (top[0]  + top[-1] * 2 + left[size - 1] + 2) >> 2; \
+    v[size*2 - 3] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
+\
+    for (j = 0; j < size; j++) \
+        memcpy(dst + j*stride, v + size*2 - 2 - j*2, size * sizeof(pixel)); \
+}
+
+def_hor_down(8)
+def_hor_down(16)
+def_hor_down(32)
+
+static void vert_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                            const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
+        a4 = top[4], a5 = top[5], a6 = top[6];
+
+    stride /= sizeof(pixel);
+    DST(0,0) = (a0 + a1 + 1) >> 1;
+    DST(0,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(1,0) = DST(0,2) = (a1 + a2 + 1) >> 1;
+    DST(1,1) = DST(0,3) = (a1 + a2 * 2 + a3 + 2) >> 2;
+    DST(2,0) = DST(1,2) = (a2 + a3 + 1) >> 1;
+    DST(2,1) = DST(1,3) = (a2 + a3 * 2 + a4 + 2) >> 2;
+    DST(3,0) = DST(2,2) = (a3 + a4 + 1) >> 1;
+    DST(3,1) = DST(2,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
+    DST(3,2) = (a4 + a5 + 1) >> 1;
+    DST(3,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
+}
+
+#define def_vert_left(size) \
+static void vert_left_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                          const uint8_t *left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    int i, j; \
+    pixel ve[size - 1], vo[size - 1]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        ve[i] = (top[i] + top[i + 1] + 1) >> 1; \
+        vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
+    } \
+    ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1; \
+    vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
+\
+    for (j = 0; j < size / 2; j++) { \
+        memcpy(dst +  j*2      * stride, ve + j, (size - j - 1) * sizeof(pixel)); \
+        memset_bpc(dst +  j*2      * stride + size - j - 1, top[size - 1], j + 1); \
+        memcpy(dst + (j*2 + 1) * stride, vo + j, (size - j - 1) * sizeof(pixel)); \
+        memset_bpc(dst + (j*2 + 1) * stride + size - j - 1, top[size - 1], j + 1); \
+    } \
+}
+
+def_vert_left(8)
+def_vert_left(16)
+def_vert_left(32)
+
+static void hor_up_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
+
+    stride /= sizeof(pixel);
+    DST(0,0) = (l0 + l1 + 1) >> 1;
+    DST(1,0) = (l0 + l1 * 2 + l2 + 2) >> 2;
+    DST(0,1) = DST(2,0) = (l1 + l2 + 1) >> 1;
+    DST(1,1) = DST(3,0) = (l1 + l2 * 2 + l3 + 2) >> 2;
+    DST(0,2) = DST(2,1) = (l2 + l3 + 1) >> 1;
+    DST(1,2) = DST(3,1) = (l2 + l3 * 3 + 2) >> 2;
+    DST(0,3) = DST(1,3) = DST(2,2) = DST(2,3) = DST(3,2) = DST(3,3) = l3;
+}
+
+#define def_hor_up(size) \
+static void hor_up_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                       const uint8_t *_left, const uint8_t *top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel v[size*2 - 2]; \
+\
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        v[i*2    ] = (left[i] + left[i + 1] + 1) >> 1; \
+        v[i*2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
+    } \
+    v[size*2 - 4] = (left[size - 2] + left[size - 1] + 1) >> 1; \
+    v[size*2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2; \
+\
+    for (j = 0; j < size / 2; j++) \
+        memcpy(dst + j*stride, v + j*2, size * sizeof(pixel)); \
+    for (j = size / 2; j < size; j++) { \
+        memcpy(dst + j*stride, v + j*2, (size*2 - 2 - j*2) * sizeof(pixel)); \
+        memset_bpc(dst + j*stride + size*2 - 2 - j*2, left[size - 1], \
+                   2 + j*2 - size); \
+    } \
+}
+
+def_hor_up(8)
+def_hor_up(16)
+def_hor_up(32)
+
+#undef DST
+
+#endif /* BIT_DEPTH != 12 */
+
+#if BIT_DEPTH != 8
+void ff_vp9dsp_intrapred_init_10(VP9DSPContext *dsp);
+#endif
+#if BIT_DEPTH != 10
+static
+#endif
+av_cold void FUNC(ff_vp9dsp_intrapred_init)(VP9DSPContext *dsp)
+{
+#define init_intra_pred_bd_aware(tx, sz) \
+    dsp->intra_pred[tx][TM_VP8_PRED]          = tm_##sz##_c; \
+    dsp->intra_pred[tx][DC_128_PRED]          = dc_128_##sz##_c; \
+    dsp->intra_pred[tx][DC_127_PRED]          = dc_127_##sz##_c; \
+    dsp->intra_pred[tx][DC_129_PRED]          = dc_129_##sz##_c
+
+#if BIT_DEPTH == 12
+    ff_vp9dsp_intrapred_init_10(dsp);
+#define init_intra_pred(tx, sz) \
+    init_intra_pred_bd_aware(tx, sz)
+#else
+    #define init_intra_pred(tx, sz) \
+    dsp->intra_pred[tx][VERT_PRED]            = vert_##sz##_c; \
+    dsp->intra_pred[tx][HOR_PRED]             = hor_##sz##_c; \
+    dsp->intra_pred[tx][DC_PRED]              = dc_##sz##_c; \
+    dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED]  = diag_downleft_##sz##_c; \
+    dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_##sz##_c; \
+    dsp->intra_pred[tx][VERT_RIGHT_PRED]      = vert_right_##sz##_c; \
+    dsp->intra_pred[tx][HOR_DOWN_PRED]        = hor_down_##sz##_c; \
+    dsp->intra_pred[tx][VERT_LEFT_PRED]       = vert_left_##sz##_c; \
+    dsp->intra_pred[tx][HOR_UP_PRED]          = hor_up_##sz##_c; \
+    dsp->intra_pred[tx][LEFT_DC_PRED]         = dc_left_##sz##_c; \
+    dsp->intra_pred[tx][TOP_DC_PRED]          = dc_top_##sz##_c; \
+    init_intra_pred_bd_aware(tx, sz)
+#endif
+
+    init_intra_pred(TX_4X4,   4x4);
+    init_intra_pred(TX_8X8,   8x8);
+    init_intra_pred(TX_16X16, 16x16);
+    init_intra_pred(TX_32X32, 32x32);
+
+#undef init_intra_pred
+#undef init_intra_pred_bd_aware
+}
+
+#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \
+static void type_a##_##type_b##_##sz##x##sz##_add_c(uint8_t *_dst, \
+                                                    ptrdiff_t stride, \
+                                                    int16_t *_block, int eob) \
+{ \
+    int i, j; \
+    pixel *dst = (pixel *) _dst; \
+    dctcoef *block = (dctcoef *) _block, tmp[sz * sz], out[sz]; \
+\
+    stride /= sizeof(pixel); \
+    if (has_dconly && eob == 1) { \
+        const int t  = ((((dctint) block[0] * 11585 + (1 << 13)) >> 14) \
+                                            * 11585 + (1 << 13)) >> 14; \
+        block[0] = 0; \
+        for (i = 0; i < sz; i++) { \
+            for (j = 0; j < sz; j++) \
+                dst[j * stride] = av_clip_pixel(dst[j * stride] + \
+                                                (bits ? \
+                                                 (t + (1 << (bits - 1))) >> bits : \
+                                                 t)); \
+            dst++; \
+        } \
+        return; \
+    } \
+\
+    for (i = 0; i < sz; i++) \
+        type_a##sz##_1d(block + i, sz, tmp + i * sz, 0); \
+    memset(block, 0, sz * sz * sizeof(*block)); \
+    for (i = 0; i < sz; i++) { \
+        type_b##sz##_1d(tmp + i, sz, out, 1); \
+        for (j = 0; j < sz; j++) \
+            dst[j * stride] = av_clip_pixel(dst[j * stride] + \
+                                            (bits ? \
+                                             (out[j] + (1 << (bits - 1))) >> bits : \
+                                             out[j])); \
+        dst++; \
+    } \
+}
+
+#define itxfm_wrap(sz, bits) \
+itxfm_wrapper(idct,  idct,  sz, bits, 1) \
+itxfm_wrapper(iadst, idct,  sz, bits, 0) \
+itxfm_wrapper(idct,  iadst, sz, bits, 0) \
+itxfm_wrapper(iadst, iadst, sz, bits, 0)
+
+#define IN(x) ((dctint) in[(x) * stride])
+
+static av_always_inline void idct4_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass)
+{
+    dctint t0, t1, t2, t3;
+
+    t0 = ((IN(0) + IN(2)) * 11585 + (1 << 13)) >> 14;
+    t1 = ((IN(0) - IN(2)) * 11585 + (1 << 13)) >> 14;
+    t2 = (IN(1) *  6270 - IN(3) * 15137 + (1 << 13)) >> 14;
+    t3 = (IN(1) * 15137 + IN(3) *  6270 + (1 << 13)) >> 14;
+
+    out[0] = t0 + t3;
+    out[1] = t1 + t2;
+    out[2] = t1 - t2;
+    out[3] = t0 - t3;
+}
+
+static av_always_inline void iadst4_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass)
+{
+    dctint t0, t1, t2, t3;
+
+    t0 =  5283 * IN(0) + 15212 * IN(2) +  9929 * IN(3);
+    t1 =  9929 * IN(0) -  5283 * IN(2) - 15212 * IN(3);
+    t2 = 13377 * (IN(0) - IN(2) + IN(3));
+    t3 = 13377 * IN(1);
+
+    out[0] = (t0 + t3      + (1 << 13)) >> 14;
+    out[1] = (t1 + t3      + (1 << 13)) >> 14;
+    out[2] = (t2           + (1 << 13)) >> 14;
+    out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14;
+}
+
+itxfm_wrap(4, 4)
+
+static av_always_inline void idct8_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass)
+{
+    dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
+
+    t0a = ((IN(0) + IN(4)) * 11585 + (1 << 13)) >> 14;
+    t1a = ((IN(0) - IN(4)) * 11585 + (1 << 13)) >> 14;
+    t2a = (IN(2) *  6270 - IN(6) * 15137 + (1 << 13)) >> 14;
+    t3a = (IN(2) * 15137 + IN(6) *  6270 + (1 << 13)) >> 14;
+    t4a = (IN(1) *  3196 - IN(7) * 16069 + (1 << 13)) >> 14;
+    t5a = (IN(5) * 13623 - IN(3) *  9102 + (1 << 13)) >> 14;
+    t6a = (IN(5) *  9102 + IN(3) * 13623 + (1 << 13)) >> 14;
+    t7a = (IN(1) * 16069 + IN(7) *  3196 + (1 << 13)) >> 14;
+
+    t0  = t0a + t3a;
+    t1  = t1a + t2a;
+    t2  = t1a - t2a;
+    t3  = t0a - t3a;
+    t4  = t4a + t5a;
+    t5a = t4a - t5a;
+    t7  = t7a + t6a;
+    t6a = t7a - t6a;
+
+    t5  = ((t6a - t5a) * 11585 + (1 << 13)) >> 14;
+    t6  = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;
+
+    out[0] = t0 + t7;
+    out[1] = t1 + t6;
+    out[2] = t2 + t5;
+    out[3] = t3 + t4;
+    out[4] = t3 - t4;
+    out[5] = t2 - t5;
+    out[6] = t1 - t6;
+    out[7] = t0 - t7;
+}
+
+static av_always_inline void iadst8_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass)
+{
+    dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
+
+    t0a = 16305 * IN(7) +  1606 * IN(0);
+    t1a =  1606 * IN(7) - 16305 * IN(0);
+    t2a = 14449 * IN(5) +  7723 * IN(2);
+    t3a =  7723 * IN(5) - 14449 * IN(2);
+    t4a = 10394 * IN(3) + 12665 * IN(4);
+    t5a = 12665 * IN(3) - 10394 * IN(4);
+    t6a =  4756 * IN(1) + 15679 * IN(6);
+    t7a = 15679 * IN(1) -  4756 * IN(6);
+
+    t0 = (t0a + t4a + (1 << 13)) >> 14;
+    t1 = (t1a + t5a + (1 << 13)) >> 14;
+    t2 = (t2a + t6a + (1 << 13)) >> 14;
+    t3 = (t3a + t7a + (1 << 13)) >> 14;
+    t4 = (t0a - t4a + (1 << 13)) >> 14;
+    t5 = (t1a - t5a + (1 << 13)) >> 14;
+    t6 = (t2a - t6a + (1 << 13)) >> 14;
+    t7 = (t3a - t7a + (1 << 13)) >> 14;
+
+    t4a = 15137 * t4 +  6270 * t5;
+    t5a =  6270 * t4 - 15137 * t5;
+    t6a = 15137 * t7 -  6270 * t6;
+    t7a =  6270 * t7 + 15137 * t6;
+
+    out[0] =   t0 + t2;
+    out[7] = -(t1 + t3);
+    t2     =   t0 - t2;
+    t3     =   t1 - t3;
+
+    out[1] = -((t4a + t6a + (1 << 13)) >> 14);
+    out[6] =   (t5a + t7a + (1 << 13)) >> 14;
+    t6     =   (t4a - t6a + (1 << 13)) >> 14;
+    t7     =   (t5a - t7a + (1 << 13)) >> 14;
+
+    out[3] = -(((t2 + t3) * 11585 + (1 << 13)) >> 14);
+    out[4] =   ((t2 - t3) * 11585 + (1 << 13)) >> 14;
+    out[2] =   ((t6 + t7) * 11585 + (1 << 13)) >> 14;
+    out[5] = -(((t6 - t7) * 11585 + (1 << 13)) >> 14);
+}
+
+itxfm_wrap(8, 5)
+
+static av_always_inline void idct16_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass)
+{
+    dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
+    dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+
+    t0a  = ((IN(0) + IN(8)) * 11585 + (1 << 13)) >> 14;
+    t1a  = ((IN(0) - IN(8)) * 11585 + (1 << 13)) >> 14;
+    t2a  = (IN(4)  *  6270 - IN(12) * 15137 + (1 << 13)) >> 14;
+    t3a  = (IN(4)  * 15137 + IN(12) *  6270 + (1 << 13)) >> 14;
+    t4a  = (IN(2)  *  3196 - IN(14) * 16069 + (1 << 13)) >> 14;
+    t7a  = (IN(2)  * 16069 + IN(14) *  3196 + (1 << 13)) >> 14;
+    t5a  = (IN(10) * 13623 - IN(6)  *  9102 + (1 << 13)) >> 14;
+    t6a  = (IN(10) *  9102 + IN(6)  * 13623 + (1 << 13)) >> 14;
+    t8a  = (IN(1)  *  1606 - IN(15) * 16305 + (1 << 13)) >> 14;
+    t15a = (IN(1)  * 16305 + IN(15) *  1606 + (1 << 13)) >> 14;
+    t9a  = (IN(9)  * 12665 - IN(7)  * 10394 + (1 << 13)) >> 14;
+    t14a = (IN(9)  * 10394 + IN(7)  * 12665 + (1 << 13)) >> 14;
+    t10a = (IN(5)  *  7723 - IN(11) * 14449 + (1 << 13)) >> 14;
+    t13a = (IN(5)  * 14449 + IN(11) *  7723 + (1 << 13)) >> 14;
+    t11a = (IN(13) * 15679 - IN(3)  *  4756 + (1 << 13)) >> 14;
+    t12a = (IN(13) *  4756 + IN(3)  * 15679 + (1 << 13)) >> 14;
+
+    t0  = t0a  + t3a;
+    t1  = t1a  + t2a;
+    t2  = t1a  - t2a;
+    t3  = t0a  - t3a;
+    t4  = t4a  + t5a;
+    t5  = t4a  - t5a;
+    t6  = t7a  - t6a;
+    t7  = t7a  + t6a;
+    t8  = t8a  + t9a;
+    t9  = t8a  - t9a;
+    t10 = t11a - t10a;
+    t11 = t11a + t10a;
+    t12 = t12a + t13a;
+    t13 = t12a - t13a;
+    t14 = t15a - t14a;
+    t15 = t15a + t14a;
+
+    t5a  = ((t6 - t5) * 11585 + (1 << 13)) >> 14;
+    t6a  = ((t6 + t5) * 11585 + (1 << 13)) >> 14;
+    t9a  = (  t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
+    t14a = (  t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
+    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
+    t13a = (  t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
+
+    t0a  = t0   + t7;
+    t1a  = t1   + t6a;
+    t2a  = t2   + t5a;
+    t3a  = t3   + t4;
+    t4   = t3   - t4;
+    t5   = t2   - t5a;
+    t6   = t1   - t6a;
+    t7   = t0   - t7;
+    t8a  = t8   + t11;
+    t9   = t9a  + t10a;
+    t10  = t9a  - t10a;
+    t11a = t8   - t11;
+    t12a = t15  - t12;
+    t13  = t14a - t13a;
+    t14  = t14a + t13a;
+    t15a = t15  + t12;
+
+    t10a = ((t13  - t10)  * 11585 + (1 << 13)) >> 14;
+    t13a = ((t13  + t10)  * 11585 + (1 << 13)) >> 14;
+    t11  = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
+    t12  = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;
+
+    out[ 0] = t0a + t15a;
+    out[ 1] = t1a + t14;
+    out[ 2] = t2a + t13a;
+    out[ 3] = t3a + t12;
+    out[ 4] = t4  + t11;
+    out[ 5] = t5  + t10a;
+    out[ 6] = t6  + t9;
+    out[ 7] = t7  + t8a;
+    out[ 8] = t7  - t8a;
+    out[ 9] = t6  - t9;
+    out[10] = t5  - t10a;
+    out[11] = t4  - t11;
+    out[12] = t3a - t12;
+    out[13] = t2a - t13a;
+    out[14] = t1a - t14;
+    out[15] = t0a - t15a;
+}
+
+static av_always_inline void iadst16_1d(const dctcoef *in, ptrdiff_t stride,
+                                        dctcoef *out, int pass)
+{
+    dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
+    dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+
+    t0  = IN(15) * 16364 + IN(0)  *   804;
+    t1  = IN(15) *   804 - IN(0)  * 16364;
+    t2  = IN(13) * 15893 + IN(2)  *  3981;
+    t3  = IN(13) *  3981 - IN(2)  * 15893;
+    t4  = IN(11) * 14811 + IN(4)  *  7005;
+    t5  = IN(11) *  7005 - IN(4)  * 14811;
+    t6  = IN(9)  * 13160 + IN(6)  *  9760;
+    t7  = IN(9)  *  9760 - IN(6)  * 13160;
+    t8  = IN(7)  * 11003 + IN(8)  * 12140;
+    t9  = IN(7)  * 12140 - IN(8)  * 11003;
+    t10 = IN(5)  *  8423 + IN(10) * 14053;
+    t11 = IN(5)  * 14053 - IN(10) *  8423;
+    t12 = IN(3)  *  5520 + IN(12) * 15426;
+    t13 = IN(3)  * 15426 - IN(12) *  5520;
+    t14 = IN(1)  *  2404 + IN(14) * 16207;
+    t15 = IN(1)  * 16207 - IN(14) *  2404;
+
+    t0a  = (t0 + t8  + (1 << 13)) >> 14;
+    t1a  = (t1 + t9  + (1 << 13)) >> 14;
+    t2a  = (t2 + t10 + (1 << 13)) >> 14;
+    t3a  = (t3 + t11 + (1 << 13)) >> 14;
+    t4a  = (t4 + t12 + (1 << 13)) >> 14;
+    t5a  = (t5 + t13 + (1 << 13)) >> 14;
+    t6a  = (t6 + t14 + (1 << 13)) >> 14;
+    t7a  = (t7 + t15 + (1 << 13)) >> 14;
+    t8a  = (t0 - t8  + (1 << 13)) >> 14;
+    t9a  = (t1 - t9  + (1 << 13)) >> 14;
+    t10a = (t2 - t10 + (1 << 13)) >> 14;
+    t11a = (t3 - t11 + (1 << 13)) >> 14;
+    t12a = (t4 - t12 + (1 << 13)) >> 14;
+    t13a = (t5 - t13 + (1 << 13)) >> 14;
+    t14a = (t6 - t14 + (1 << 13)) >> 14;
+    t15a = (t7 - t15 + (1 << 13)) >> 14;
+
+    t8   = t8a  * 16069 + t9a  *  3196;
+    t9   = t8a  *  3196 - t9a  * 16069;
+    t10  = t10a *  9102 + t11a * 13623;
+    t11  = t10a * 13623 - t11a *  9102;
+    t12  = t13a * 16069 - t12a *  3196;
+    t13  = t13a *  3196 + t12a * 16069;
+    t14  = t15a *  9102 - t14a * 13623;
+    t15  = t15a * 13623 + t14a *  9102;
+
+    t0   = t0a + t4a;
+    t1   = t1a + t5a;
+    t2   = t2a + t6a;
+    t3   = t3a + t7a;
+    t4   = t0a - t4a;
+    t5   = t1a - t5a;
+    t6   = t2a - t6a;
+    t7   = t3a - t7a;
+    t8a  = (t8  + t12 + (1 << 13)) >> 14;
+    t9a  = (t9  + t13 + (1 << 13)) >> 14;
+    t10a = (t10 + t14 + (1 << 13)) >> 14;
+    t11a = (t11 + t15 + (1 << 13)) >> 14;
+    t12a = (t8  - t12 + (1 << 13)) >> 14;
+    t13a = (t9  - t13 + (1 << 13)) >> 14;
+    t14a = (t10 - t14 + (1 << 13)) >> 14;
+    t15a = (t11 - t15 + (1 << 13)) >> 14;
+
+    t4a  = t4 * 15137 + t5 *  6270;
+    t5a  = t4 *  6270 - t5 * 15137;
+    t6a  = t7 * 15137 - t6 *  6270;
+    t7a  = t7 *  6270 + t6 * 15137;
+    t12  = t12a * 15137 + t13a *  6270;
+    t13  = t12a *  6270 - t13a * 15137;
+    t14  = t15a * 15137 - t14a *  6270;
+    t15  = t15a *  6270 + t14a * 15137;
+
+    out[ 0] =   t0 + t2;
+    out[15] = -(t1 + t3);
+    t2a     =   t0 - t2;
+    t3a     =   t1 - t3;
+    out[ 3] = -((t4a + t6a + (1 << 13)) >> 14);
+    out[12] =   (t5a + t7a + (1 << 13)) >> 14;
+    t6      =   (t4a - t6a + (1 << 13)) >> 14;
+    t7      =   (t5a - t7a + (1 << 13)) >> 14;
+    out[ 1] = -(t8a + t10a);
+    out[14] =   t9a + t11a;
+    t10     =   t8a - t10a;
+    t11     =   t9a - t11a;
+    out[ 2] =   (t12 + t14 + (1 << 13)) >> 14;
+    out[13] = -((t13 + t15 + (1 << 13)) >> 14);
+    t14a    =   (t12 - t14 + (1 << 13)) >> 14;
+    t15a    =   (t13 - t15 + (1 << 13)) >> 14;
+
+    out[ 7] = ((t2a  + t3a)  * -11585 + (1 << 13)) >> 14;
+    out[ 8] = ((t2a  - t3a)  *  11585 + (1 << 13)) >> 14;
+    out[ 4] = ((t7   + t6)   *  11585 + (1 << 13)) >> 14;
+    out[11] = ((t7   - t6)   *  11585 + (1 << 13)) >> 14;
+    out[ 6] = ((t11  + t10)  *  11585 + (1 << 13)) >> 14;
+    out[ 9] = ((t11  - t10)  *  11585 + (1 << 13)) >> 14;
+    out[ 5] = ((t14a + t15a) * -11585 + (1 << 13)) >> 14;
+    out[10] = ((t14a - t15a) *  11585 + (1 << 13)) >> 14;
+}
+
+itxfm_wrap(16, 6)
+
+static av_always_inline void idct32_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass)
+{
+    dctint t0a  = ((IN(0) + IN(16)) * 11585 + (1 << 13)) >> 14;
+    dctint t1a  = ((IN(0) - IN(16)) * 11585 + (1 << 13)) >> 14;
+    dctint t2a  = (IN( 8) *  6270 - IN(24) * 15137 + (1 << 13)) >> 14;
+    dctint t3a  = (IN( 8) * 15137 + IN(24) *  6270 + (1 << 13)) >> 14;
+    dctint t4a  = (IN( 4) *  3196 - IN(28) * 16069 + (1 << 13)) >> 14;
+    dctint t7a  = (IN( 4) * 16069 + IN(28) *  3196 + (1 << 13)) >> 14;
+    dctint t5a  = (IN(20) * 13623 - IN(12) *  9102 + (1 << 13)) >> 14;
+    dctint t6a  = (IN(20) *  9102 + IN(12) * 13623 + (1 << 13)) >> 14;
+    dctint t8a  = (IN( 2) *  1606 - IN(30) * 16305 + (1 << 13)) >> 14;
+    dctint t15a = (IN( 2) * 16305 + IN(30) *  1606 + (1 << 13)) >> 14;
+    dctint t9a  = (IN(18) * 12665 - IN(14) * 10394 + (1 << 13)) >> 14;
+    dctint t14a = (IN(18) * 10394 + IN(14) * 12665 + (1 << 13)) >> 14;
+    dctint t10a = (IN(10) *  7723 - IN(22) * 14449 + (1 << 13)) >> 14;
+    dctint t13a = (IN(10) * 14449 + IN(22) *  7723 + (1 << 13)) >> 14;
+    dctint t11a = (IN(26) * 15679 - IN( 6) *  4756 + (1 << 13)) >> 14;
+    dctint t12a = (IN(26) *  4756 + IN( 6) * 15679 + (1 << 13)) >> 14;
+    dctint t16a = (IN( 1) *   804 - IN(31) * 16364 + (1 << 13)) >> 14;
+    dctint t31a = (IN( 1) * 16364 + IN(31) *   804 + (1 << 13)) >> 14;
+    dctint t17a = (IN(17) * 12140 - IN(15) * 11003 + (1 << 13)) >> 14;
+    dctint t30a = (IN(17) * 11003 + IN(15) * 12140 + (1 << 13)) >> 14;
+    dctint t18a = (IN( 9) *  7005 - IN(23) * 14811 + (1 << 13)) >> 14;
+    dctint t29a = (IN( 9) * 14811 + IN(23) *  7005 + (1 << 13)) >> 14;
+    dctint t19a = (IN(25) * 15426 - IN( 7) *  5520 + (1 << 13)) >> 14;
+    dctint t28a = (IN(25) *  5520 + IN( 7) * 15426 + (1 << 13)) >> 14;
+    dctint t20a = (IN( 5) *  3981 - IN(27) * 15893 + (1 << 13)) >> 14;
+    dctint t27a = (IN( 5) * 15893 + IN(27) *  3981 + (1 << 13)) >> 14;
+    dctint t21a = (IN(21) * 14053 - IN(11) *  8423 + (1 << 13)) >> 14;
+    dctint t26a = (IN(21) *  8423 + IN(11) * 14053 + (1 << 13)) >> 14;
+    dctint t22a = (IN(13) *  9760 - IN(19) * 13160 + (1 << 13)) >> 14;
+    dctint t25a = (IN(13) * 13160 + IN(19) *  9760 + (1 << 13)) >> 14;
+    dctint t23a = (IN(29) * 16207 - IN( 3) *  2404 + (1 << 13)) >> 14;
+    dctint t24a = (IN(29) *  2404 + IN( 3) * 16207 + (1 << 13)) >> 14;
+
+    dctint t0  = t0a  + t3a;
+    dctint t1  = t1a  + t2a;
+    dctint t2  = t1a  - t2a;
+    dctint t3  = t0a  - t3a;
+    dctint t4  = t4a  + t5a;
+    dctint t5  = t4a  - t5a;
+    dctint t6  = t7a  - t6a;
+    dctint t7  = t7a  + t6a;
+    dctint t8  = t8a  + t9a;
+    dctint t9  = t8a  - t9a;
+    dctint t10 = t11a - t10a;
+    dctint t11 = t11a + t10a;
+    dctint t12 = t12a + t13a;
+    dctint t13 = t12a - t13a;
+    dctint t14 = t15a - t14a;
+    dctint t15 = t15a + t14a;
+    dctint t16 = t16a + t17a;
+    dctint t17 = t16a - t17a;
+    dctint t18 = t19a - t18a;
+    dctint t19 = t19a + t18a;
+    dctint t20 = t20a + t21a;
+    dctint t21 = t20a - t21a;
+    dctint t22 = t23a - t22a;
+    dctint t23 = t23a + t22a;
+    dctint t24 = t24a + t25a;
+    dctint t25 = t24a - t25a;
+    dctint t26 = t27a - t26a;
+    dctint t27 = t27a + t26a;
+    dctint t28 = t28a + t29a;
+    dctint t29 = t28a - t29a;
+    dctint t30 = t31a - t30a;
+    dctint t31 = t31a + t30a;
+
+    t5a = ((t6 - t5) * 11585 + (1 << 13)) >> 14;
+    t6a = ((t6 + t5) * 11585 + (1 << 13)) >> 14;
+    t9a  = (  t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
+    t14a = (  t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
+    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
+    t13a = (  t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
+    t17a = (  t30 *  3196 - t17 * 16069  + (1 << 13)) >> 14;
+    t30a = (  t30 * 16069 + t17 *  3196  + (1 << 13)) >> 14;
+    t18a = (-(t29 * 16069 + t18 *  3196) + (1 << 13)) >> 14;
+    t29a = (  t29 *  3196 - t18 * 16069  + (1 << 13)) >> 14;
+    t21a = (  t26 * 13623 - t21 *  9102  + (1 << 13)) >> 14;
+    t26a = (  t26 *  9102 + t21 * 13623  + (1 << 13)) >> 14;
+    t22a = (-(t25 *  9102 + t22 * 13623) + (1 << 13)) >> 14;
+    t25a = (  t25 * 13623 - t22 *  9102  + (1 << 13)) >> 14;
+
+    t0a  = t0   + t7;
+    t1a  = t1   + t6a;
+    t2a  = t2   + t5a;
+    t3a  = t3   + t4;
+    t4a  = t3   - t4;
+    t5   = t2   - t5a;
+    t6   = t1   - t6a;
+    t7a  = t0   - t7;
+    t8a  = t8   + t11;
+    t9   = t9a  + t10a;
+    t10  = t9a  - t10a;
+    t11a = t8   - t11;
+    t12a = t15  - t12;
+    t13  = t14a - t13a;
+    t14  = t14a + t13a;
+    t15a = t15  + t12;
+    t16a = t16  + t19;
+    t17  = t17a + t18a;
+    t18  = t17a - t18a;
+    t19a = t16  - t19;
+    t20a = t23  - t20;
+    t21  = t22a - t21a;
+    t22  = t22a + t21a;
+    t23a = t23  + t20;
+    t24a = t24  + t27;
+    t25  = t25a + t26a;
+    t26  = t25a - t26a;
+    t27a = t24  - t27;
+    t28a = t31  - t28;
+    t29  = t30a - t29a;
+    t30  = t30a + t29a;
+    t31a = t31  + t28;
+
+    t10a = ((t13  - t10)  * 11585 + (1 << 13)) >> 14;
+    t13a = ((t13  + t10)  * 11585 + (1 << 13)) >> 14;
+    t11  = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
+    t12  = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;
+    t18a = (  t29  *  6270 - t18  * 15137  + (1 << 13)) >> 14;
+    t29a = (  t29  * 15137 + t18  *  6270  + (1 << 13)) >> 14;
+    t19  = (  t28a *  6270 - t19a * 15137  + (1 << 13)) >> 14;
+    t28  = (  t28a * 15137 + t19a *  6270  + (1 << 13)) >> 14;
+    t20  = (-(t27a * 15137 + t20a *  6270) + (1 << 13)) >> 14;
+    t27  = (  t27a *  6270 - t20a * 15137  + (1 << 13)) >> 14;
+    t21a = (-(t26  * 15137 + t21  *  6270) + (1 << 13)) >> 14;
+    t26a = (  t26  *  6270 - t21  * 15137  + (1 << 13)) >> 14;
+
+    t0   = t0a + t15a;
+    t1   = t1a + t14;
+    t2   = t2a + t13a;
+    t3   = t3a + t12;
+    t4   = t4a + t11;
+    t5a  = t5  + t10a;
+    t6a  = t6  + t9;
+    t7   = t7a + t8a;
+    t8   = t7a - t8a;
+    t9a  = t6  - t9;
+    t10  = t5  - t10a;
+    t11a = t4a - t11;
+    t12a = t3a - t12;
+    t13  = t2a - t13a;
+    t14a = t1a - t14;
+    t15  = t0a - t15a;
+    t16  = t16a + t23a;
+    t17a = t17  + t22;
+    t18  = t18a + t21a;
+    t19a = t19  + t20;
+    t20a = t19  - t20;
+    t21  = t18a - t21a;
+    t22a = t17  - t22;
+    t23  = t16a - t23a;
+    t24  = t31a - t24a;
+    t25a = t30  - t25;
+    t26  = t29a - t26a;
+    t27a = t28  - t27;
+    t28a = t28  + t27;
+    t29  = t29a + t26a;
+    t30a = t30  + t25;
+    t31  = t31a + t24a;
+
+    t20  = ((t27a - t20a) * 11585 + (1 << 13)) >> 14;
+    t27  = ((t27a + t20a) * 11585 + (1 << 13)) >> 14;
+    t21a = ((t26  - t21 ) * 11585 + (1 << 13)) >> 14;
+    t26a = ((t26  + t21 ) * 11585 + (1 << 13)) >> 14;
+    t22  = ((t25a - t22a) * 11585 + (1 << 13)) >> 14;
+    t25  = ((t25a + t22a) * 11585 + (1 << 13)) >> 14;
+    t23a = ((t24  - t23 ) * 11585 + (1 << 13)) >> 14;
+    t24a = ((t24  + t23 ) * 11585 + (1 << 13)) >> 14;
+
+    out[ 0] = t0   + t31;
+    out[ 1] = t1   + t30a;
+    out[ 2] = t2   + t29;
+    out[ 3] = t3   + t28a;
+    out[ 4] = t4   + t27;
+    out[ 5] = t5a  + t26a;
+    out[ 6] = t6a  + t25;
+    out[ 7] = t7   + t24a;
+    out[ 8] = t8   + t23a;
+    out[ 9] = t9a  + t22;
+    out[10] = t10  + t21a;
+    out[11] = t11a + t20;
+    out[12] = t12a + t19a;
+    out[13] = t13  + t18;
+    out[14] = t14a + t17a;
+    out[15] = t15  + t16;
+    out[16] = t15  - t16;
+    out[17] = t14a - t17a;
+    out[18] = t13  - t18;
+    out[19] = t12a - t19a;
+    out[20] = t11a - t20;
+    out[21] = t10  - t21a;
+    out[22] = t9a  - t22;
+    out[23] = t8   - t23a;
+    out[24] = t7   - t24a;
+    out[25] = t6a  - t25;
+    out[26] = t5a  - t26a;
+    out[27] = t4   - t27;
+    out[28] = t3   - t28a;
+    out[29] = t2   - t29;
+    out[30] = t1   - t30a;
+    out[31] = t0   - t31;
+}
+
+itxfm_wrapper(idct, idct, 32, 6, 1)
+
+static av_always_inline void iwht4_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass)
+{
+    int t0, t1, t2, t3, t4;
+
+    if (pass == 0) {
+        t0 = IN(0) >> 2;
+        t1 = IN(3) >> 2;
+        t2 = IN(1) >> 2;
+        t3 = IN(2) >> 2;
+    } else {
+        t0 = IN(0);
+        t1 = IN(3);
+        t2 = IN(1);
+        t3 = IN(2);
+    }
+
+    t0 += t2;
+    t3 -= t1;
+    t4 = (t0 - t3) >> 1;
+    t1 = t4 - t1;
+    t2 = t4 - t2;
+    t0 -= t1;
+    t3 += t2;
+
+    out[0] = t0;
+    out[1] = t1;
+    out[2] = t2;
+    out[3] = t3;
+}
+
+itxfm_wrapper(iwht, iwht, 4, 0, 0)
+
+#undef IN
+#undef itxfm_wrapper
+#undef itxfm_wrap
+
+static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
+{
+#define init_itxfm(tx, sz) \
+    dsp->itxfm_add[tx][DCT_DCT]   = idct_idct_##sz##_add_c; \
+    dsp->itxfm_add[tx][DCT_ADST]  = iadst_idct_##sz##_add_c; \
+    dsp->itxfm_add[tx][ADST_DCT]  = idct_iadst_##sz##_add_c; \
+    dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_##sz##_add_c
+
+#define init_idct(tx, nm) \
+    dsp->itxfm_add[tx][DCT_DCT]   = \
+    dsp->itxfm_add[tx][ADST_DCT]  = \
+    dsp->itxfm_add[tx][DCT_ADST]  = \
+    dsp->itxfm_add[tx][ADST_ADST] = nm##_add_c
+
+    init_itxfm(TX_4X4,   4x4);
+    init_itxfm(TX_8X8,   8x8);
+    init_itxfm(TX_16X16, 16x16);
+    init_idct(TX_32X32,  idct_idct_32x32);
+    init_idct(4 /* lossless */, iwht_iwht_4x4);
+
+#undef init_itxfm
+#undef init_idct
+}
+
+static av_always_inline void loop_filter(pixel *dst, int E, int I, int H,
+                                         ptrdiff_t stridea, ptrdiff_t strideb,
+                                         int wd)
+{
+    int i, F = 1 << (BIT_DEPTH - 8);
+
+    E <<= (BIT_DEPTH - 8);
+    I <<= (BIT_DEPTH - 8);
+    H <<= (BIT_DEPTH - 8);
+    for (i = 0; i < 8; i++, dst += stridea) {
+        int p7, p6, p5, p4;
+        int p3 = dst[strideb * -4], p2 = dst[strideb * -3];
+        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
+        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
+        int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
+        int q4, q5, q6, q7;
+        int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
+                 FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
+                 FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
+                 FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
+        int flat8out, flat8in;
+
+        if (!fm)
+            continue;
+
+        if (wd >= 16) {
+            p7 = dst[strideb * -8];
+            p6 = dst[strideb * -7];
+            p5 = dst[strideb * -6];
+            p4 = dst[strideb * -5];
+            q4 = dst[strideb * +4];
+            q5 = dst[strideb * +5];
+            q6 = dst[strideb * +6];
+            q7 = dst[strideb * +7];
+
+            flat8out = FFABS(p7 - p0) <= F && FFABS(p6 - p0) <= F &&
+                       FFABS(p5 - p0) <= F && FFABS(p4 - p0) <= F &&
+                       FFABS(q4 - q0) <= F && FFABS(q5 - q0) <= F &&
+                       FFABS(q6 - q0) <= F && FFABS(q7 - q0) <= F;
+        }
+
+        if (wd >= 8)
+            flat8in = FFABS(p3 - p0) <= F && FFABS(p2 - p0) <= F &&
+                      FFABS(p1 - p0) <= F && FFABS(q1 - q0) <= F &&
+                      FFABS(q2 - q0) <= F && FFABS(q3 - q0) <= F;
+
+        if (wd >= 16 && flat8out && flat8in) {
+            dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
+                                 p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+            dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
+                                 p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+            dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
+                                 p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+            dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
+                                 p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+            dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
+                                 p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+            dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+                                 p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+            dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                                 q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+            dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+                                 q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
+            dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+                                 q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
+            dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+                                 q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
+                                 q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
+                                 q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
+                                 q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
+                                 q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+        } else if (wd >= 8 && flat8in) {
+            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
+            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
+        } else {
+            int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H;
+
+            if (hev) {
+                int f = av_clip_intp2(p1 - q1, BIT_DEPTH - 1), f1, f2;
+                f = av_clip_intp2(3 * (q0 - p0) + f, BIT_DEPTH - 1);
+
+                f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+                f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+
+                dst[strideb * -1] = av_clip_pixel(p0 + f2);
+                dst[strideb * +0] = av_clip_pixel(q0 - f1);
+            } else {
+                int f = av_clip_intp2(3 * (q0 - p0), BIT_DEPTH - 1), f1, f2;
+
+                f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+                f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+
+                dst[strideb * -1] = av_clip_pixel(p0 + f2);
+                dst[strideb * +0] = av_clip_pixel(q0 - f1);
+
+                f = (f1 + 1) >> 1;
+                dst[strideb * -2] = av_clip_pixel(p1 + f);
+                dst[strideb * +1] = av_clip_pixel(q1 - f);
+            }
+        }
+    }
+}
+
+#define lf_8_fn(dir, wd, stridea, strideb) \
+static void loop_filter_##dir##_##wd##_8_c(uint8_t *_dst, \
+                                           ptrdiff_t stride, \
+                                           int E, int I, int H) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    stride /= sizeof(pixel); \
+    loop_filter(dst, E, I, H, stridea, strideb, wd); \
+}
+
+#define lf_8_fns(wd) \
+lf_8_fn(h, wd, stride, 1) \
+lf_8_fn(v, wd, 1, stride)
+
+lf_8_fns(4)
+lf_8_fns(8)
+lf_8_fns(16)
+
+#undef lf_8_fn
+#undef lf_8_fns
+
+#define lf_16_fn(dir, stridea) \
+static void loop_filter_##dir##_16_16_c(uint8_t *dst, \
+                                        ptrdiff_t stride, \
+                                        int E, int I, int H) \
+{ \
+    loop_filter_##dir##_16_8_c(dst, stride, E, I, H); \
+    loop_filter_##dir##_16_8_c(dst + 8 * stridea, stride, E, I, H); \
+}
+
+lf_16_fn(h, stride)
+lf_16_fn(v, sizeof(pixel))
+
+#undef lf_16_fn
+
+#define lf_mix_fn(dir, wd1, wd2, stridea) \
+static void loop_filter_##dir##_##wd1##wd2##_16_c(uint8_t *dst, \
+                                                  ptrdiff_t stride, \
+                                                  int E, int I, int H) \
+{ \
+    loop_filter_##dir##_##wd1##_8_c(dst, stride, E & 0xff, I & 0xff, H & 0xff); \
+    loop_filter_##dir##_##wd2##_8_c(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); \
+}
+
+#define lf_mix_fns(wd1, wd2) \
+lf_mix_fn(h, wd1, wd2, stride) \
+lf_mix_fn(v, wd1, wd2, sizeof(pixel))
+
+lf_mix_fns(4, 4)
+lf_mix_fns(4, 8)
+lf_mix_fns(8, 4)
+lf_mix_fns(8, 8)
+
+#undef lf_mix_fn
+#undef lf_mix_fns
+
+static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
+{
+    dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
+    dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
+    dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
+    dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
+    dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
+    dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;
+
+    dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
+    dsp->loop_filter_16[1] = loop_filter_v_16_16_c;
+
+    dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
+    dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
+    dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
+    dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
+    dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
+    dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
+    dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
+    dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
+}
+
+#if BIT_DEPTH != 12
+
+static av_always_inline void copy_c(uint8_t *dst, ptrdiff_t dst_stride,
+                                    const uint8_t *src, ptrdiff_t src_stride,
+                                    int w, int h)
+{
+    do {
+        memcpy(dst, src, w * sizeof(pixel));
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+static av_always_inline void avg_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                   const uint8_t *_src, ptrdiff_t src_stride,
+                                   int w, int h)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+
+        for (x = 0; x < w; x += 4)
+            AV_WN4PA(&dst[x], rnd_avg_pixel4(AV_RN4PA(&dst[x]), AV_RN4P(&src[x])));
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+#define fpel_fn(type, sz) \
+static void type##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                         const uint8_t *src, ptrdiff_t src_stride, \
+                         int h, int mx, int my) \
+{ \
+    type##_c(dst, dst_stride, src, src_stride, sz, h); \
+}
+
+#define copy_avg_fn(sz) \
+fpel_fn(copy, sz) \
+fpel_fn(avg,  sz)
+
+copy_avg_fn(64)
+copy_avg_fn(32)
+copy_avg_fn(16)
+copy_avg_fn(8)
+copy_avg_fn(4)
+
+#undef fpel_fn
+#undef copy_avg_fn
+
+#endif /* BIT_DEPTH != 12 */
+
+static const int16_t vp9_subpel_filters[3][16][8] = {
+    [FILTER_8TAP_REGULAR] = {
+        {  0,  0,   0, 128,   0,   0,  0,  0 },
+        {  0,  1,  -5, 126,   8,  -3,  1,  0 },
+        { -1,  3, -10, 122,  18,  -6,  2,  0 },
+        { -1,  4, -13, 118,  27,  -9,  3, -1 },
+        { -1,  4, -16, 112,  37, -11,  4, -1 },
+        { -1,  5, -18, 105,  48, -14,  4, -1 },
+        { -1,  5, -19,  97,  58, -16,  5, -1 },
+        { -1,  6, -19,  88,  68, -18,  5, -1 },
+        { -1,  6, -19,  78,  78, -19,  6, -1 },
+        { -1,  5, -18,  68,  88, -19,  6, -1 },
+        { -1,  5, -16,  58,  97, -19,  5, -1 },
+        { -1,  4, -14,  48, 105, -18,  5, -1 },
+        { -1,  4, -11,  37, 112, -16,  4, -1 },
+        { -1,  3,  -9,  27, 118, -13,  4, -1 },
+        {  0,  2,  -6,  18, 122, -10,  3, -1 },
+        {  0,  1,  -3,   8, 126,  -5,  1,  0 },
+    }, [FILTER_8TAP_SHARP] = {
+        {  0,  0,   0, 128,   0,   0,  0,  0 },
+        { -1,  3,  -7, 127,   8,  -3,  1,  0 },
+        { -2,  5, -13, 125,  17,  -6,  3, -1 },
+        { -3,  7, -17, 121,  27, -10,  5, -2 },
+        { -4,  9, -20, 115,  37, -13,  6, -2 },
+        { -4, 10, -23, 108,  48, -16,  8, -3 },
+        { -4, 10, -24, 100,  59, -19,  9, -3 },
+        { -4, 11, -24,  90,  70, -21, 10, -4 },
+        { -4, 11, -23,  80,  80, -23, 11, -4 },
+        { -4, 10, -21,  70,  90, -24, 11, -4 },
+        { -3,  9, -19,  59, 100, -24, 10, -4 },
+        { -3,  8, -16,  48, 108, -23, 10, -4 },
+        { -2,  6, -13,  37, 115, -20,  9, -4 },
+        { -2,  5, -10,  27, 121, -17,  7, -3 },
+        { -1,  3,  -6,  17, 125, -13,  5, -2 },
+        {  0,  1,  -3,   8, 127,  -7,  3, -1 },
+    }, [FILTER_8TAP_SMOOTH] = {
+        {  0,  0,   0, 128,   0,   0,  0,  0 },
+        { -3, -1,  32,  64,  38,   1, -3,  0 },
+        { -2, -2,  29,  63,  41,   2, -3,  0 },
+        { -2, -2,  26,  63,  43,   4, -4,  0 },
+        { -2, -3,  24,  62,  46,   5, -4,  0 },
+        { -2, -3,  21,  60,  49,   7, -4,  0 },
+        { -1, -4,  18,  59,  51,   9, -4,  0 },
+        { -1, -4,  16,  57,  53,  12, -4, -1 },
+        { -1, -4,  14,  55,  55,  14, -4, -1 },
+        { -1, -4,  12,  53,  57,  16, -4, -1 },
+        {  0, -4,   9,  51,  59,  18, -4, -1 },
+        {  0, -4,   7,  49,  60,  21, -3, -2 },
+        {  0, -4,   5,  46,  62,  24, -3, -2 },
+        {  0, -4,   4,  43,  63,  26, -2, -2 },
+        {  0, -3,   2,  41,  63,  29, -2, -2 },
+        {  0, -3,   1,  38,  64,  32, -1, -3 },
+    }
+};
+
+#define FILTER_8TAP(src, x, F, stride) \
+    av_clip_pixel((F[0] * src[x + -3 * stride] + \
+                   F[1] * src[x + -2 * stride] + \
+                   F[2] * src[x + -1 * stride] + \
+                   F[3] * src[x + +0 * stride] + \
+                   F[4] * src[x + +1 * stride] + \
+                   F[5] * src[x + +2 * stride] + \
+                   F[6] * src[x + +3 * stride] + \
+                   F[7] * src[x + +4 * stride] + 64) >> 7)
+
+static av_always_inline void do_8tap_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                          const uint8_t *_src, ptrdiff_t src_stride,
+                                          int w, int h, ptrdiff_t ds,
+                                          const int16_t *filter, int avg)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_8TAP(src, x, filter, ds) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_8TAP(src, x, filter, ds);
+            }
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+#define filter_8tap_1d_fn(opn, opa, dir, ds) \
+static av_noinline void opn##_8tap_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                const uint8_t *src, ptrdiff_t src_stride, \
+                                                int w, int h, const int16_t *filter) \
+{ \
+    do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); \
+}
+
+filter_8tap_1d_fn(put, 0, v, src_stride / sizeof(pixel))
+filter_8tap_1d_fn(put, 0, h, 1)
+filter_8tap_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
+filter_8tap_1d_fn(avg, 1, h, 1)
+
+#undef filter_8tap_1d_fn
+
+static av_always_inline void do_8tap_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                          const uint8_t *_src, ptrdiff_t src_stride,
+                                          int w, int h, const int16_t *filterx,
+                                          const int16_t *filtery, int avg)
+{
+    int tmp_h = h + 7;
+    pixel tmp[64 * 71], *tmp_ptr = tmp;
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    src -= src_stride * 3;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1);
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+
+    tmp_ptr = tmp + 64 * 3;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64);
+            }
+
+        tmp_ptr += 64;
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define filter_8tap_2d_fn(opn, opa) \
+static av_noinline void opn##_8tap_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int w, int h, const int16_t *filterx, \
+                                           const int16_t *filtery) \
+{ \
+    do_8tap_2d_c(dst, dst_stride, src, src_stride, w, h, filterx, filtery, opa); \
+}
+
+filter_8tap_2d_fn(put, 0)
+filter_8tap_2d_fn(avg, 1)
+
+#undef filter_8tap_2d_fn
+
+#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \
+static void avg##_8tap_##type##_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                              const uint8_t *src, ptrdiff_t src_stride, \
+                                              int h, int mx, int my) \
+{ \
+    avg##_8tap_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, \
+                            vp9_subpel_filters[type_idx][dir_m]); \
+}
+
+#define filter_fn_2d(sz, type, type_idx, avg) \
+static void avg##_8tap_##type##_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int h, int mx, int my) \
+{ \
+    avg##_8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, \
+                       vp9_subpel_filters[type_idx][mx], \
+                       vp9_subpel_filters[type_idx][my]); \
+}
+
+#if BIT_DEPTH != 12
+
+#define FILTER_BILIN(src, x, mxy, stride) \
+    (src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4))
+
+static av_always_inline void do_bilin_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                           const uint8_t *_src, ptrdiff_t src_stride,
+                                           int w, int h, ptrdiff_t ds, int mxy, int avg)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(src, x, mxy, ds);
+            }
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+#define bilin_1d_fn(opn, opa, dir, ds) \
+static av_noinline void opn##_bilin_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                 const uint8_t *src, ptrdiff_t src_stride, \
+                                                 int w, int h, int mxy) \
+{ \
+    do_bilin_1d_c(dst, dst_stride, src, src_stride, w, h, ds, mxy, opa); \
+}
+
+bilin_1d_fn(put, 0, v, src_stride / sizeof(pixel))
+bilin_1d_fn(put, 0, h, 1)
+bilin_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
+bilin_1d_fn(avg, 1, h, 1)
+
+#undef bilin_1d_fn
+
+static av_always_inline void do_bilin_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                           const uint8_t *_src, ptrdiff_t src_stride,
+                                           int w, int h, int mx, int my, int avg)
+{
+    pixel tmp[64 * 65], *tmp_ptr = tmp;
+    int tmp_h = h + 1;
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+
+    tmp_ptr = tmp;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
+            }
+
+        tmp_ptr += 64;
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define bilin_2d_fn(opn, opa) \
+static av_noinline void opn##_bilin_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                            const uint8_t *src, ptrdiff_t src_stride, \
+                                            int w, int h, int mx, int my) \
+{ \
+    do_bilin_2d_c(dst, dst_stride, src, src_stride, w, h, mx, my, opa); \
+}
+
+bilin_2d_fn(put, 0)
+bilin_2d_fn(avg, 1)
+
+#undef bilin_2d_fn
+
+#define bilinf_fn_1d(sz, dir, dir_m, avg) \
+static void avg##_bilin_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                      const uint8_t *src, ptrdiff_t src_stride, \
+                                      int h, int mx, int my) \
+{ \
+    avg##_bilin_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, dir_m); \
+}
+
+#define bilinf_fn_2d(sz, avg) \
+static void avg##_bilin_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                   const uint8_t *src, ptrdiff_t src_stride, \
+                                   int h, int mx, int my) \
+{ \
+    avg##_bilin_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, mx, my); \
+}
+
+#else
+
+#define bilinf_fn_1d(a, b, c, d)
+#define bilinf_fn_2d(a, b)
+
+#endif
+
+#define filter_fn(sz, avg) \
+filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
+filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
+filter_fn_2d(sz,        regular, FILTER_8TAP_REGULAR, avg) \
+filter_fn_1d(sz, h, mx, smooth,  FILTER_8TAP_SMOOTH,  avg) \
+filter_fn_1d(sz, v, my, smooth,  FILTER_8TAP_SMOOTH,  avg) \
+filter_fn_2d(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
+filter_fn_1d(sz, h, mx, sharp,   FILTER_8TAP_SHARP,   avg) \
+filter_fn_1d(sz, v, my, sharp,   FILTER_8TAP_SHARP,   avg) \
+filter_fn_2d(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
+bilinf_fn_1d(sz, h, mx,                               avg) \
+bilinf_fn_1d(sz, v, my,                               avg) \
+bilinf_fn_2d(sz,                                      avg)
+
+#define filter_fn_set(avg) \
+filter_fn(64, avg) \
+filter_fn(32, avg) \
+filter_fn(16, avg) \
+filter_fn(8,  avg) \
+filter_fn(4,  avg)
+
+filter_fn_set(put)
+filter_fn_set(avg)
+
+#undef filter_fn
+#undef filter_fn_set
+#undef filter_fn_1d
+#undef filter_fn_2d
+#undef bilinf_fn_1d
+#undef bilinf_fn_2d
+
+#if BIT_DEPTH != 8
+void ff_vp9dsp_mc_init_10(VP9DSPContext *dsp);
+#endif
+#if BIT_DEPTH != 10
+static
+#endif
+av_cold void FUNC(ff_vp9dsp_mc_init)(VP9DSPContext *dsp)
+{
+#if BIT_DEPTH == 12
+    ff_vp9dsp_mc_init_10(dsp);
+#else /* BIT_DEPTH == 12 */
+
+#define init_fpel(idx1, idx2, sz, type) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = type##sz##_c; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type##sz##_c; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = type##sz##_c; \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = type##sz##_c
+
+#define init_copy_avg(idx, sz) \
+    init_fpel(idx, 0, sz, copy); \
+    init_fpel(idx, 1, sz, avg)
+
+    init_copy_avg(0, 64);
+    init_copy_avg(1, 32);
+    init_copy_avg(2, 16);
+    init_copy_avg(3,  8);
+    init_copy_avg(4,  4);
+
+#undef init_copy_avg
+#undef init_fpel
+
+#endif /* BIT_DEPTH == 12 */
+
+#define init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_c; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_c; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_c
+
+#if BIT_DEPTH == 12
+#define init_subpel1 init_subpel1_bd_aware
+#else
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \
+    init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type); \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] = type##_bilin_##sz##dir##_c
+#endif
+
+#define init_subpel2(idx, idxh, idxv, dir, type) \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type); \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
+
+#define init_subpel3(idx, type) \
+    init_subpel2(idx, 1, 1, hv, type); \
+    init_subpel2(idx, 0, 1, v, type); \
+    init_subpel2(idx, 1, 0, h, type)
+
+    init_subpel3(0, put);
+    init_subpel3(1, avg);
+
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
+#undef init_subpel1_bd_aware
+}
+
+static av_always_inline void do_scaled_8tap_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                              const uint8_t *_src, ptrdiff_t src_stride,
+                                              int w, int h, int mx, int my,
+                                              int dx, int dy, int avg,
+                                              const int16_t (*filters)[8])
+{
+    int tmp_h = (((h - 1) * dy + my) >> 4) + 8;
+    pixel tmp[64 * 135], *tmp_ptr = tmp;
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    src -= src_stride * 3;
+    do {
+        int x;
+        int imx = mx, ioff = 0;
+
+        for (x = 0; x < w; x++) {
+            tmp_ptr[x] = FILTER_8TAP(src, ioff, filters[imx], 1);
+            imx += dx;
+            ioff += imx >> 4;
+            imx &= 0xf;
+        }
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+
+    tmp_ptr = tmp + 64 * 3;
+    do {
+        int x;
+        const int16_t *filter = filters[my];
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filter, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_8TAP(tmp_ptr, x, filter, 64);
+            }
+
+        my += dy;
+        tmp_ptr += (my >> 4) * 64;
+        my &= 0xf;
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define scaled_filter_8tap_fn(opn, opa) \
+static av_noinline void opn##_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                            const uint8_t *src, ptrdiff_t src_stride, \
+                                            int w, int h, int mx, int my, int dx, int dy, \
+                                            const int16_t (*filters)[8]) \
+{ \
+    do_scaled_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
+                     opa, filters); \
+}
+
+scaled_filter_8tap_fn(put, 0)
+scaled_filter_8tap_fn(avg, 1)
+
+#undef scaled_filter_8tap_fn
+
+#undef FILTER_8TAP
+
+#define scaled_filter_fn(sz, type, type_idx, avg) \
+static void avg##_scaled_##type##_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int h, int mx, int my, int dx, int dy) \
+{ \
+    avg##_scaled_8tap_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy, \
+                        vp9_subpel_filters[type_idx]); \
+}
+
+#if BIT_DEPTH != 12
+
+static av_always_inline void do_scaled_bilin_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                               const uint8_t *_src, ptrdiff_t src_stride,
+                                               int w, int h, int mx, int my,
+                                               int dx, int dy, int avg)
+{
+    pixel tmp[64 * 129], *tmp_ptr = tmp;
+    int tmp_h = (((h - 1) * dy + my) >> 4) + 2;
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+        int imx = mx, ioff = 0;
+
+        for (x = 0; x < w; x++) {
+            tmp_ptr[x] = FILTER_BILIN(src, ioff, imx, 1);
+            imx += dx;
+            ioff += imx >> 4;
+            imx &= 0xf;
+        }
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+
+    tmp_ptr = tmp;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
+            }
+
+        my += dy;
+        tmp_ptr += (my >> 4) * 64;
+        my &= 0xf;
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define scaled_bilin_fn(opn, opa) \
+static av_noinline void opn##_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                             const uint8_t *src, ptrdiff_t src_stride, \
+                                             int w, int h, int mx, int my, int dx, int dy) \
+{ \
+    do_scaled_bilin_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, opa); \
+}
+
+scaled_bilin_fn(put, 0)
+scaled_bilin_fn(avg, 1)
+
+#undef scaled_bilin_fn
+
+#undef FILTER_BILIN
+
+#define scaled_bilinf_fn(sz, avg) \
+static void avg##_scaled_bilin_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                        const uint8_t *src, ptrdiff_t src_stride, \
+                                        int h, int mx, int my, int dx, int dy) \
+{ \
+    avg##_scaled_bilin_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy); \
+}
+
+#else
+
+#define scaled_bilinf_fn(a, b)
+
+#endif
+
+#define scaled_filter_fns(sz, avg) \
+scaled_filter_fn(sz,        regular, FILTER_8TAP_REGULAR, avg) \
+scaled_filter_fn(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
+scaled_filter_fn(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
+scaled_bilinf_fn(sz,                                      avg)
+
+#define scaled_filter_fn_set(avg) \
+scaled_filter_fns(64, avg) \
+scaled_filter_fns(32, avg) \
+scaled_filter_fns(16, avg) \
+scaled_filter_fns(8,  avg) \
+scaled_filter_fns(4,  avg)
+
+scaled_filter_fn_set(put)
+scaled_filter_fn_set(avg)
+
+#undef scaled_filter_fns
+#undef scaled_filter_fn_set
+#undef scaled_filter_fn
+#undef scaled_bilinf_fn
+
+#if BIT_DEPTH != 8
+void ff_vp9dsp_scaled_mc_init_10(VP9DSPContext *dsp);
+#endif
+#if BIT_DEPTH != 10
+static
+#endif
+av_cold void FUNC(ff_vp9dsp_scaled_mc_init)(VP9DSPContext *dsp)
+{
+#define init_scaled_bd_aware(idx1, idx2, sz, type) \
+    dsp->smc[idx1][FILTER_8TAP_SMOOTH ][idx2] = type##_scaled_smooth_##sz##_c; \
+    dsp->smc[idx1][FILTER_8TAP_REGULAR][idx2] = type##_scaled_regular_##sz##_c; \
+    dsp->smc[idx1][FILTER_8TAP_SHARP  ][idx2] = type##_scaled_sharp_##sz##_c
+
+#if BIT_DEPTH == 12
+    ff_vp9dsp_scaled_mc_init_10(dsp);
+#define init_scaled(a,b,c,d) init_scaled_bd_aware(a,b,c,d)
+#else
+#define init_scaled(idx1, idx2, sz, type) \
+    init_scaled_bd_aware(idx1, idx2, sz, type); \
+    dsp->smc[idx1][FILTER_BILINEAR    ][idx2] = type##_scaled_bilin_##sz##_c
+#endif
+
+#define init_scaled_put_avg(idx, sz) \
+    init_scaled(idx, 0, sz, put); \
+    init_scaled(idx, 1, sz, avg)
+
+    init_scaled_put_avg(0, 64);
+    init_scaled_put_avg(1, 32);
+    init_scaled_put_avg(2, 16);
+    init_scaled_put_avg(3,  8);
+    init_scaled_put_avg(4,  4);
+
+#undef init_scaled_put_avg
+#undef init_scaled
+#undef init_scaled_bd_aware
+}
+
+av_cold void FUNC(ff_vp9dsp_init)(VP9DSPContext *dsp)
+{
+    FUNC(ff_vp9dsp_intrapred_init)(dsp);
+    vp9dsp_itxfm_init(dsp);
+    vp9dsp_loopfilter_init(dsp);
+    FUNC(ff_vp9dsp_mc_init)(dsp);
+    FUNC(ff_vp9dsp_scaled_mc_init)(dsp);
+}
diff --git a/libavcodec/vp9mvs.c b/libavcodec/vp9mvs.c
deleted file mode 100644
index 1f65aaac0a..0000000000
--- a/libavcodec/vp9mvs.c
+++ /dev/null
@@ -1,352 +0,0 @@
-/*
- * VP9 compatible video decoder
- *
- * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
- * Copyright (C) 2013 Clément Bœsch <u pkh me>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "internal.h"
-#include "vp56.h"
-#include "vp9.h"
-#include "vp9data.h"
-
-static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
-                                      VP9Context *s)
-{
-    dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
-    dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
-}
-
-static void find_ref_mvs(VP9Context *s,
-                         VP56mv *pmv, int ref, int z, int idx, int sb)
-{
-    static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
-        [BS_64x64] = { {  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
-                       { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 } },
-        [BS_64x32] = { {  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
-                       { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 } },
-        [BS_32x64] = { { -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
-                       { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 } },
-        [BS_32x32] = { {  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
-                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 } },
-        [BS_32x16] = { {  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
-                       { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 } },
-        [BS_16x32] = { { -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
-                       {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 } },
-        [BS_16x16] = { {  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
-                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 } },
-        [BS_16x8]  = { {  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
-                       {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 } },
-        [BS_8x16]  = { { -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
-                       { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 } },
-        [BS_8x8]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
-                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
-        [BS_8x4]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
-                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
-        [BS_4x8]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
-                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
-        [BS_4x4]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
-                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
-    };
-    VP9Block *const b = &s->b;
-    int row = b->row, col = b->col, row7 = b->row7;
-    const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
-#define INVALID_MV 0x80008000U
-    uint32_t mem = INVALID_MV;
-    int i;
-
-#define RETURN_DIRECT_MV(mv)                    \
-    do {                                        \
-        uint32_t m = AV_RN32A(&mv);             \
-        if (!idx) {                             \
-            AV_WN32A(pmv, m);                   \
-            return;                             \
-        } else if (mem == INVALID_MV) {         \
-            mem = m;                            \
-        } else if (m != mem) {                  \
-            AV_WN32A(pmv, m);                   \
-            return;                             \
-        }                                       \
-    } while (0)
-
-    if (sb >= 0) {
-        if (sb == 2 || sb == 1) {
-            RETURN_DIRECT_MV(b->mv[0][z]);
-        } else if (sb == 3) {
-            RETURN_DIRECT_MV(b->mv[2][z]);
-            RETURN_DIRECT_MV(b->mv[1][z]);
-            RETURN_DIRECT_MV(b->mv[0][z]);
-        }
-
-#define RETURN_MV(mv)                           \
-    do {                                        \
-        if (sb > 0) {                           \
-            VP56mv tmp;                         \
-            uint32_t m;                         \
-            clamp_mv(&tmp, &mv, s);             \
-            m = AV_RN32A(&tmp);                 \
-            if (!idx) {                         \
-                AV_WN32A(pmv, m);               \
-                return;                         \
-            } else if (mem == INVALID_MV) {     \
-                mem = m;                        \
-            } else if (m != mem) {              \
-                AV_WN32A(pmv, m);               \
-                return;                         \
-            }                                   \
-        } else {                                \
-            uint32_t m = AV_RN32A(&mv);         \
-            if (!idx) {                         \
-                clamp_mv(pmv, &mv, s);          \
-                return;                         \
-            } else if (mem == INVALID_MV) {     \
-                mem = m;                        \
-            } else if (m != mem) {              \
-                clamp_mv(pmv, &mv, s);          \
-                return;                         \
-            }                                   \
-        }                                       \
-    } while (0)
-
-        if (row > 0) {
-            VP9MVRefPair *mv = &s->mv[0][(row - 1) * s->sb_cols * 8 + col];
-
-            if (mv->ref[0] == ref)
-                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
-            else if (mv->ref[1] == ref)
-                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
-        }
-        if (col > s->tiling.tile_col_start) {
-            VP9MVRefPair *mv = &s->mv[0][row * s->sb_cols * 8 + col - 1];
-
-            if (mv->ref[0] == ref)
-                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
-            else if (mv->ref[1] == ref)
-                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
-        }
-        i = 2;
-    } else {
-        i = 0;
-    }
-
-    // previously coded MVs in the neighborhood, using same reference frame
-    for (; i < 8; i++) {
-        int c = p[i][0] + col, r = p[i][1] + row;
-
-        if (c >= s->tiling.tile_col_start && c < s->cols &&
-            r >= 0 && r < s->rows) {
-            VP9MVRefPair *mv = &s->mv[0][r * s->sb_cols * 8 + c];
-
-            if (mv->ref[0] == ref)
-                RETURN_MV(mv->mv[0]);
-            else if (mv->ref[1] == ref)
-                RETURN_MV(mv->mv[1]);
-        }
-    }
-
-    // MV at this position in previous frame, using same reference frame
-    if (s->use_last_frame_mvs) {
-        VP9MVRefPair *mv = &s->mv[1][row * s->sb_cols * 8 + col];
-
-        if (mv->ref[0] == ref)
-            RETURN_MV(mv->mv[0]);
-        else if (mv->ref[1] == ref)
-            RETURN_MV(mv->mv[1]);
-    }
-
-#define RETURN_SCALE_MV(mv, scale)              \
-    do {                                        \
-        if (scale) {                            \
-            VP56mv mv_temp = { -mv.x, -mv.y };  \
-            RETURN_MV(mv_temp);                 \
-        } else {                                \
-            RETURN_MV(mv);                      \
-        }                                       \
-    } while (0)
-
-    // previously coded MVs in the neighborhood, using different reference frame
-    for (i = 0; i < 8; i++) {
-        int c = p[i][0] + col, r = p[i][1] + row;
-
-        if (c >= s->tiling.tile_col_start && c < s->cols &&
-            r >= 0 && r < s->rows) {
-            VP9MVRefPair *mv = &s->mv[0][r * s->sb_cols * 8 + c];
-
-            if (mv->ref[0] != ref && mv->ref[0] >= 0)
-                RETURN_SCALE_MV(mv->mv[0],
-                                s->signbias[mv->ref[0]] != s->signbias[ref]);
-            if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
-                // BUG - libvpx has this condition regardless of whether
-                // we used the first ref MV and pre-scaling
-                AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
-                RETURN_SCALE_MV(mv->mv[1],
-                                s->signbias[mv->ref[1]] != s->signbias[ref]);
-            }
-        }
-    }
-
-    // MV at this position in previous frame, using different reference frame
-    if (s->use_last_frame_mvs) {
-        VP9MVRefPair *mv = &s->mv[1][row * s->sb_cols * 8 + col];
-
-        if (mv->ref[0] != ref && mv->ref[0] >= 0)
-            RETURN_SCALE_MV(mv->mv[0],
-                            s->signbias[mv->ref[0]] != s->signbias[ref]);
-        if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
-            // BUG - libvpx has this condition regardless of whether
-            // we used the first ref MV and pre-scaling
-            AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
-            RETURN_SCALE_MV(mv->mv[1],
-                            s->signbias[mv->ref[1]] != s->signbias[ref]);
-        }
-    }
-
-    AV_ZERO32(pmv);
-#undef INVALID_MV
-#undef RETURN_MV
-#undef RETURN_SCALE_MV
-}
-
-static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
-{
-    int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
-    int n, c = vp8_rac_get_tree(&s->c, ff_vp9_mv_class_tree,
-                                s->prob.p.mv_comp[idx].classes);
-
-    s->counts.mv_comp[idx].sign[sign]++;
-    s->counts.mv_comp[idx].classes[c]++;
-    if (c) {
-        int m;
-
-        for (n = 0, m = 0; m < c; m++) {
-            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
-            n  |= bit << m;
-            s->counts.mv_comp[idx].bits[m][bit]++;
-        }
-        n <<= 3;
-        bit = vp8_rac_get_tree(&s->c, ff_vp9_mv_fp_tree,
-                               s->prob.p.mv_comp[idx].fp);
-        n  |= bit << 1;
-        s->counts.mv_comp[idx].fp[bit]++;
-        if (hp) {
-            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
-            s->counts.mv_comp[idx].hp[bit]++;
-            n |= bit;
-        } else {
-            n |= 1;
-            // bug in libvpx - we count for bw entropy purposes even if the
-            // bit wasn't coded
-            s->counts.mv_comp[idx].hp[1]++;
-        }
-        n += 8 << c;
-    } else {
-        n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
-        s->counts.mv_comp[idx].class0[n]++;
-        bit = vp8_rac_get_tree(&s->c, ff_vp9_mv_fp_tree,
-                               s->prob.p.mv_comp[idx].class0_fp[n]);
-        s->counts.mv_comp[idx].class0_fp[n][bit]++;
-        n = (n << 3) | (bit << 1);
-        if (hp) {
-            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
-            s->counts.mv_comp[idx].class0_hp[bit]++;
-            n |= bit;
-        } else {
-            n |= 1;
-            // bug in libvpx - we count for bw entropy purposes even if the
-            // bit wasn't coded
-            s->counts.mv_comp[idx].class0_hp[1]++;
-        }
-    }
-
-    return sign ? -(n + 1) : (n + 1);
-}
-
-void ff_vp9_fill_mv(VP9Context *s, VP56mv *mv, int mode, int sb)
-{
-    VP9Block *const b = &s->b;
-
-    if (mode == ZEROMV) {
-        memset(mv, 0, sizeof(*mv) * 2);
-    } else {
-        int hp;
-
-        // FIXME cache this value and reuse for other subblocks
-        find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
-                     mode == NEWMV ? -1 : sb);
-        // FIXME maybe move this code into find_ref_mvs()
-        if ((mode == NEWMV || sb == -1) &&
-            !(hp = s->highprecisionmvs &&
-              abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
-            if (mv[0].y & 1) {
-                if (mv[0].y < 0)
-                    mv[0].y++;
-                else
-                    mv[0].y--;
-            }
-            if (mv[0].x & 1) {
-                if (mv[0].x < 0)
-                    mv[0].x++;
-                else
-                    mv[0].x--;
-            }
-        }
-        if (mode == NEWMV) {
-            enum MVJoint j = vp8_rac_get_tree(&s->c, ff_vp9_mv_joint_tree,
-                                              s->prob.p.mv_joint);
-
-            s->counts.mv_joint[j]++;
-            if (j >= MV_JOINT_V)
-                mv[0].y += read_mv_component(s, 0, hp);
-            if (j & 1)
-                mv[0].x += read_mv_component(s, 1, hp);
-        }
-
-        if (b->comp) {
-            // FIXME cache this value and reuse for other subblocks
-            find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
-                         mode == NEWMV ? -1 : sb);
-            if ((mode == NEWMV || sb == -1) &&
-                !(hp = s->highprecisionmvs &&
-                  abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
-                if (mv[1].y & 1) {
-                    if (mv[1].y < 0)
-                        mv[1].y++;
-                    else
-                        mv[1].y--;
-                }
-                if (mv[1].x & 1) {
-                    if (mv[1].x < 0)
-                        mv[1].x++;
-                    else
-                        mv[1].x--;
-                }
-            }
-            if (mode == NEWMV) {
-                enum MVJoint j = vp8_rac_get_tree(&s->c, ff_vp9_mv_joint_tree,
-                                                  s->prob.p.mv_joint);
-
-                s->counts.mv_joint[j]++;
-                if (j >= MV_JOINT_V)
-                    mv[1].y += read_mv_component(s, 0, hp);
-                if (j & 1)
-                    mv[1].x += read_mv_component(s, 1, hp);
-            }
-        }
-    }
-}
diff --git a/libavcodec/vp9prob.c b/libavcodec/vp9prob.c
deleted file mode 100644
index b8a7c22af4..0000000000
--- a/libavcodec/vp9prob.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * VP9 compatible video decoder
- *
- * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
- * Copyright (C) 2013 Clément Bœsch <u pkh me>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "vp56.h"
-#include "vp9.h"
-#include "vp9data.h"
-
-static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
-                                        int max_count, int update_factor)
-{
-    unsigned ct = ct0 + ct1, p2, p1;
-
-    if (!ct)
-        return;
-
-    p1 = *p;
-    p2 = ((ct0 << 8) + (ct >> 1)) / ct;
-    p2 = av_clip(p2, 1, 255);
-    ct = FFMIN(ct, max_count);
-    update_factor = FASTDIV(update_factor * ct, max_count);
-
-    // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
-    *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
-}
-
-void ff_vp9_adapt_probs(VP9Context *s)
-{
-    int i, j, k, l, m;
-    ProbContext *p = &s->prob_ctx[s->framectxid].p;
-    int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
-
-    // coefficients
-    for (i = 0; i < 4; i++)
-        for (j = 0; j < 2; j++)
-            for (k = 0; k < 2; k++)
-                for (l = 0; l < 6; l++)
-                    for (m = 0; m < 6; m++) {
-                        uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
-                        unsigned *e = s->counts.eob[i][j][k][l][m];
-                        unsigned *c = s->counts.coef[i][j][k][l][m];
-
-                        if (l == 0 && m >= 3) // dc only has 3 pt
-                            break;
-
-                        adapt_prob(&pp[0], e[0], e[1], 24, uf);
-                        adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
-                        adapt_prob(&pp[2], c[1], c[2], 24, uf);
-                    }
-
-    if (s->keyframe || s->intraonly) {
-        memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
-        memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
-        memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
-        memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
-        return;
-    }
-
-    // skip flag
-    for (i = 0; i < 3; i++)
-        adapt_prob(&p->skip[i], s->counts.skip[i][0],
-                   s->counts.skip[i][1], 20, 128);
-
-    // intra/inter flag
-    for (i = 0; i < 4; i++)
-        adapt_prob(&p->intra[i], s->counts.intra[i][0],
-                   s->counts.intra[i][1], 20, 128);
-
-    // comppred flag
-    if (s->comppredmode == PRED_SWITCHABLE) {
-        for (i = 0; i < 5; i++)
-            adapt_prob(&p->comp[i], s->counts.comp[i][0],
-                       s->counts.comp[i][1], 20, 128);
-    }
-
-    // reference frames
-    if (s->comppredmode != PRED_SINGLEREF) {
-        for (i = 0; i < 5; i++)
-            adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
-                       s->counts.comp_ref[i][1], 20, 128);
-    }
-
-    if (s->comppredmode != PRED_COMPREF) {
-        for (i = 0; i < 5; i++) {
-            uint8_t *pp = p->single_ref[i];
-            unsigned (*c)[2] = s->counts.single_ref[i];
-
-            adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
-            adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
-        }
-    }
-
-    // block partitioning
-    for (i = 0; i < 4; i++)
-        for (j = 0; j < 4; j++) {
-            uint8_t *pp = p->partition[i][j];
-            unsigned *c = s->counts.partition[i][j];
-
-            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
-            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
-            adapt_prob(&pp[2], c[2], c[3], 20, 128);
-        }
-
-    // tx size
-    if (s->txfmmode == TX_SWITCHABLE) {
-        for (i = 0; i < 2; i++) {
-            unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
-
-            adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0],
-                       s->counts.tx8p[i][1], 20, 128);
-            adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
-            adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
-            adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
-            adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
-            adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
-        }
-    }
-
-    // interpolation filter
-    if (s->filtermode == FILTER_SWITCHABLE) {
-        for (i = 0; i < 4; i++) {
-            uint8_t *pp = p->filter[i];
-            unsigned *c = s->counts.filter[i];
-
-            adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
-            adapt_prob(&pp[1], c[1], c[2], 20, 128);
-        }
-    }
-
-    // inter modes
-    for (i = 0; i < 7; i++) {
-        uint8_t *pp = p->mv_mode[i];
-        unsigned *c = s->counts.mv_mode[i];
-
-        adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
-        adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
-        adapt_prob(&pp[2], c[1], c[3], 20, 128);
-    }
-
-    // mv joints
-    {
-        uint8_t *pp = p->mv_joint;
-        unsigned *c = s->counts.mv_joint;
-
-        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
-        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
-        adapt_prob(&pp[2], c[2], c[3], 20, 128);
-    }
-
-    // mv components
-    for (i = 0; i < 2; i++) {
-        uint8_t *pp;
-        unsigned *c, (*c2)[2], sum;
-
-        adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
-                   s->counts.mv_comp[i].sign[1], 20, 128);
-
-        pp  = p->mv_comp[i].classes;
-        c   = s->counts.mv_comp[i].classes;
-        sum = c[1] + c[2] + c[3] + c[4] + c[5] +
-              c[6] + c[7] + c[8] + c[9] + c[10];
-        adapt_prob(&pp[0], c[0], sum, 20, 128);
-        sum -= c[1];
-        adapt_prob(&pp[1], c[1], sum, 20, 128);
-        sum -= c[2] + c[3];
-        adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
-        adapt_prob(&pp[3], c[2], c[3], 20, 128);
-        sum -= c[4] + c[5];
-        adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
-        adapt_prob(&pp[5], c[4], c[5], 20, 128);
-        sum -= c[6];
-        adapt_prob(&pp[6], c[6], sum, 20, 128);
-        adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
-        adapt_prob(&pp[8], c[7], c[8], 20, 128);
-        adapt_prob(&pp[9], c[9], c[10], 20, 128);
-
-        adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
-                   s->counts.mv_comp[i].class0[1], 20, 128);
-        pp = p->mv_comp[i].bits;
-        c2 = s->counts.mv_comp[i].bits;
-        for (j = 0; j < 10; j++)
-            adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
-
-        for (j = 0; j < 2; j++) {
-            pp = p->mv_comp[i].class0_fp[j];
-            c  = s->counts.mv_comp[i].class0_fp[j];
-            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
-            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
-            adapt_prob(&pp[2], c[2], c[3], 20, 128);
-        }
-        pp = p->mv_comp[i].fp;
-        c  = s->counts.mv_comp[i].fp;
-        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
-        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
-        adapt_prob(&pp[2], c[2], c[3], 20, 128);
-
-        if (s->highprecisionmvs) {
-            adapt_prob(&p->mv_comp[i].class0_hp,
-                       s->counts.mv_comp[i].class0_hp[0],
-                       s->counts.mv_comp[i].class0_hp[1], 20, 128);
-            adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
-                       s->counts.mv_comp[i].hp[1], 20, 128);
-        }
-    }
-
-    // y intra modes
-    for (i = 0; i < 4; i++) {
-        uint8_t *pp = p->y_mode[i];
-        unsigned *c = s->counts.y_mode[i], sum, s2;
-
-        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
-        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
-        sum -= c[TM_VP8_PRED];
-        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
-        sum -= c[VERT_PRED];
-        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
-        s2   = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
-        sum -= s2;
-        adapt_prob(&pp[3], s2, sum, 20, 128);
-        s2 -= c[HOR_PRED];
-        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
-        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED],
-                   20, 128);
-        sum -= c[DIAG_DOWN_LEFT_PRED];
-        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
-        sum -= c[VERT_LEFT_PRED];
-        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
-        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
-    }
-
-    // uv intra modes
-    for (i = 0; i < 10; i++) {
-        uint8_t *pp = p->uv_mode[i];
-        unsigned *c = s->counts.uv_mode[i], sum, s2;
-
-        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
-        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
-        sum -= c[TM_VP8_PRED];
-        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
-        sum -= c[VERT_PRED];
-        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
-        s2   = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
-        sum -= s2;
-        adapt_prob(&pp[3], s2, sum, 20, 128);
-        s2 -= c[HOR_PRED];
-        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
-        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED],
-                   20, 128);
-        sum -= c[DIAG_DOWN_LEFT_PRED];
-        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
-        sum -= c[VERT_LEFT_PRED];
-        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
-        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
-    }
-}
diff --git a/libavcodec/vqavideo.c b/libavcodec/vqavideo.c
index e58bb26f89..3ed9652d3b 100644
--- a/libavcodec/vqavideo.c
+++ b/libavcodec/vqavideo.c
@@ -1,21 +1,21 @@
 /*
  * Westwood Studios VQA Video Decoder
- * Copyright (C) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -128,7 +128,7 @@ static av_cold int vqa_decode_init(AVCodecContext *avctx)
 
     /* make sure the extradata made it */
     if (s->avctx->extradata_size != VQA_HEADER_SIZE) {
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: expected extradata size of %d\n", VQA_HEADER_SIZE);
+        av_log(s->avctx, AV_LOG_ERROR, "expected extradata size of %d\n", VQA_HEADER_SIZE);
         return AVERROR(EINVAL);
     }
 
@@ -162,8 +162,7 @@ static av_cold int vqa_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    if (s->width  & (s->vector_width  - 1) ||
-        s->height & (s->vector_height - 1)) {
+    if (s->width % s->vector_width || s->height % s->vector_height) {
         av_log(avctx, AV_LOG_ERROR, "Image size not multiple of block size\n");
         return AVERROR_INVALIDDATA;
     }
@@ -180,7 +179,7 @@ static av_cold int vqa_decode_init(AVCodecContext *avctx)
     /* allocate decode buffer */
     s->decode_buffer_size = (s->width / s->vector_width) *
         (s->height / s->vector_height) * 2;
-    s->decode_buffer = av_malloc(s->decode_buffer_size);
+    s->decode_buffer = av_mallocz(s->decode_buffer_size);
     if (!s->decode_buffer)
         goto fail;
 
@@ -208,22 +207,22 @@ fail:
 
 #define CHECK_COUNT() \
     if (dest_index + count > dest_size) { \
-        av_log(NULL, AV_LOG_ERROR, "  VQA video: decode_format80 problem: next op would overflow dest_index\n"); \
-        av_log(NULL, AV_LOG_ERROR, "  VQA video: current dest_index = %d, count = %d, dest_size = %d\n", \
+        av_log(s->avctx, AV_LOG_ERROR, "decode_format80 problem: next op would overflow dest_index\n"); \
+        av_log(s->avctx, AV_LOG_ERROR, "current dest_index = %d, count = %d, dest_size = %d\n", \
             dest_index, count, dest_size); \
         return AVERROR_INVALIDDATA; \
     }
 
 #define CHECK_COPY(idx) \
     if (idx < 0 || idx + count > dest_size) { \
-        av_log(NULL, AV_LOG_ERROR, "  VQA video: decode_format80 problem: next op would overflow dest_index\n"); \
-        av_log(NULL, AV_LOG_ERROR, "  VQA video: current src_pos = %d, count = %d, dest_size = %d\n", \
+        av_log(s->avctx, AV_LOG_ERROR, "decode_format80 problem: next op would overflow dest_index\n"); \
+        av_log(s->avctx, AV_LOG_ERROR, "current src_pos = %d, count = %d, dest_size = %d\n", \
             src_pos, count, dest_size); \
         return AVERROR_INVALIDDATA; \
     }
 
 
-static int decode_format80(GetByteContext *gb, int src_size,
+static int decode_format80(VqaContext *s, int src_size,
     unsigned char *dest, int dest_size, int check_size) {
 
     int dest_index = 0;
@@ -232,26 +231,32 @@ static int decode_format80(GetByteContext *gb, int src_size,
     unsigned char color;
     int i;
 
-    start = bytestream2_tell(gb);
-    while (bytestream2_tell(gb) - start < src_size) {
-        opcode = bytestream2_get_byte(gb);
-        ff_dlog(NULL, "      opcode %02X: ", opcode);
+    if (src_size < 0 || src_size > bytestream2_get_bytes_left(&s->gb)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Chunk size %d is out of range\n",
+               src_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    start = bytestream2_tell(&s->gb);
+    while (bytestream2_tell(&s->gb) - start < src_size) {
+        opcode = bytestream2_get_byte(&s->gb);
+        ff_tlog(s->avctx, "opcode %02X: ", opcode);
 
         /* 0x80 means that frame is finished */
         if (opcode == 0x80)
-            return 0;
+            break;
 
         if (dest_index >= dest_size) {
-            av_log(NULL, AV_LOG_ERROR, "  VQA video: decode_format80 problem: dest_index (%d) exceeded dest_size (%d)\n",
+            av_log(s->avctx, AV_LOG_ERROR, "decode_format80 problem: dest_index (%d) exceeded dest_size (%d)\n",
                 dest_index, dest_size);
             return AVERROR_INVALIDDATA;
         }
 
         if (opcode == 0xFF) {
 
-            count   = bytestream2_get_le16(gb);
-            src_pos = bytestream2_get_le16(gb);
-            ff_dlog(NULL, "(1) copy %X bytes from absolute pos %X\n", count, src_pos);
+            count   = bytestream2_get_le16(&s->gb);
+            src_pos = bytestream2_get_le16(&s->gb);
+            ff_tlog(s->avctx, "(1) copy %X bytes from absolute pos %X\n", count, src_pos);
             CHECK_COUNT();
             CHECK_COPY(src_pos);
             for (i = 0; i < count; i++)
@@ -260,9 +265,9 @@ static int decode_format80(GetByteContext *gb, int src_size,
 
         } else if (opcode == 0xFE) {
 
-            count = bytestream2_get_le16(gb);
-            color = bytestream2_get_byte(gb);
-            ff_dlog(NULL, "(2) set %X bytes to %02X\n", count, color);
+            count = bytestream2_get_le16(&s->gb);
+            color = bytestream2_get_byte(&s->gb);
+            ff_tlog(s->avctx, "(2) set %X bytes to %02X\n", count, color);
             CHECK_COUNT();
             memset(&dest[dest_index], color, count);
             dest_index += count;
@@ -270,8 +275,8 @@ static int decode_format80(GetByteContext *gb, int src_size,
         } else if ((opcode & 0xC0) == 0xC0) {
 
             count = (opcode & 0x3F) + 3;
-            src_pos = bytestream2_get_le16(gb);
-            ff_dlog(NULL, "(3) copy %X bytes from absolute pos %X\n", count, src_pos);
+            src_pos = bytestream2_get_le16(&s->gb);
+            ff_tlog(s->avctx, "(3) copy %X bytes from absolute pos %X\n", count, src_pos);
             CHECK_COUNT();
             CHECK_COPY(src_pos);
             for (i = 0; i < count; i++)
@@ -281,16 +286,16 @@ static int decode_format80(GetByteContext *gb, int src_size,
         } else if (opcode > 0x80) {
 
             count = opcode & 0x3F;
-            ff_dlog(NULL, "(4) copy %X bytes from source to dest\n", count);
+            ff_tlog(s->avctx, "(4) copy %X bytes from source to dest\n", count);
             CHECK_COUNT();
-            bytestream2_get_buffer(gb, &dest[dest_index], count);
+            bytestream2_get_buffer(&s->gb, &dest[dest_index], count);
             dest_index += count;
 
         } else {
 
             count = ((opcode & 0x70) >> 4) + 3;
-            src_pos = bytestream2_get_byte(gb) | ((opcode & 0x0F) << 8);
-            ff_dlog(NULL, "(5) copy %X bytes from relpos %X\n", count, src_pos);
+            src_pos = bytestream2_get_byte(&s->gb) | ((opcode & 0x0F) << 8);
+            ff_tlog(s->avctx, "(5) copy %X bytes from relpos %X\n", count, src_pos);
             CHECK_COUNT();
             CHECK_COPY(dest_index - src_pos);
             for (i = 0; i < count; i++)
@@ -304,9 +309,11 @@ static int decode_format80(GetByteContext *gb, int src_size,
      * codebook entry; it is not important for compressed codebooks because
      * not every entry needs to be filled */
     if (check_size)
-        if (dest_index < dest_size)
-            av_log(NULL, AV_LOG_ERROR, "  VQA video: decode_format80 problem: decode finished with dest_index (%d) < dest_size (%d)\n",
+        if (dest_index < dest_size) {
+            av_log(s->avctx, AV_LOG_ERROR, "decode_format80 problem: decode finished with dest_index (%d) < dest_size (%d)\n",
                 dest_index, dest_size);
+            memset(dest + dest_index, 0, dest_size - dest_index);
+        }
 
     return 0; // let's display what we decoded anyway
 }
@@ -377,7 +384,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
             break;
 
         default:
-            av_log(s->avctx, AV_LOG_ERROR, "  VQA video: Found unknown chunk type: %c%c%c%c (%08X)\n",
+            av_log(s->avctx, AV_LOG_ERROR, "Found unknown chunk type: %c%c%c%c (%08X)\n",
             (chunk_type >> 24) & 0xFF,
             (chunk_type >> 16) & 0xFF,
             (chunk_type >>  8) & 0xFF,
@@ -394,7 +401,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
     if ((cpl0_chunk != -1) && (cplz_chunk != -1)) {
 
         /* a chunk should not have both chunk types */
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: found both CPL0 and CPLZ chunks\n");
+        av_log(s->avctx, AV_LOG_ERROR, "problem: found both CPL0 and CPLZ chunks\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -412,7 +419,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
         chunk_size = bytestream2_get_be32(&s->gb);
         /* sanity check the palette size */
         if (chunk_size / 3 > 256 || chunk_size > bytestream2_get_bytes_left(&s->gb)) {
-            av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: found a palette chunk with %d colors\n",
+            av_log(s->avctx, AV_LOG_ERROR, "problem: found a palette chunk with %d colors\n",
                 chunk_size / 3);
             return AVERROR_INVALIDDATA;
         }
@@ -421,7 +428,8 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
             r = bytestream2_get_byteu(&s->gb) * 4;
             g = bytestream2_get_byteu(&s->gb) * 4;
             b = bytestream2_get_byteu(&s->gb) * 4;
-            s->palette[i] = (r << 16) | (g << 8) | (b);
+            s->palette[i] = 0xFFU << 24 | r << 16 | g << 8 | b;
+            s->palette[i] |= s->palette[i] >> 6 & 0x30303;
         }
     }
 
@@ -429,7 +437,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
     if ((cbf0_chunk != -1) && (cbfz_chunk != -1)) {
 
         /* a chunk should not have both chunk types */
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: found both CBF0 and CBFZ chunks\n");
+        av_log(s->avctx, AV_LOG_ERROR, "problem: found both CBF0 and CBFZ chunks\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -438,7 +446,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
 
         bytestream2_seek(&s->gb, cbfz_chunk, SEEK_SET);
         chunk_size = bytestream2_get_be32(&s->gb);
-        if ((res = decode_format80(&s->gb, chunk_size, s->codebook,
+        if ((res = decode_format80(s, chunk_size, s->codebook,
                                    s->codebook_size, 0)) < 0)
             return res;
     }
@@ -450,7 +458,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
         chunk_size = bytestream2_get_be32(&s->gb);
         /* sanity check the full codebook size */
         if (chunk_size > MAX_CODEBOOK_SIZE) {
-            av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: CBF0 chunk too large (0x%X bytes)\n",
+            av_log(s->avctx, AV_LOG_ERROR, "problem: CBF0 chunk too large (0x%X bytes)\n",
                 chunk_size);
             return AVERROR_INVALIDDATA;
         }
@@ -462,13 +470,13 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
     if (vptz_chunk == -1) {
 
         /* something is wrong if there is no VPTZ chunk */
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: no VPTZ chunk found\n");
+        av_log(s->avctx, AV_LOG_ERROR, "problem: no VPTZ chunk found\n");
         return AVERROR_INVALIDDATA;
     }
 
     bytestream2_seek(&s->gb, vptz_chunk, SEEK_SET);
     chunk_size = bytestream2_get_be32(&s->gb);
-    if ((res = decode_format80(&s->gb, chunk_size,
+    if ((res = decode_format80(s, chunk_size,
                                s->decode_buffer, s->decode_buffer_size, 1)) < 0)
         return res;
 
@@ -531,7 +539,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
     /* handle partial codebook */
     if ((cbp0_chunk != -1) && (cbpz_chunk != -1)) {
         /* a chunk should not have both chunk types */
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: found both CBP0 and CBPZ chunks\n");
+        av_log(s->avctx, AV_LOG_ERROR, "problem: found both CBP0 and CBPZ chunks\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -552,7 +560,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
         s->next_codebook_buffer_index += chunk_size;
 
         s->partial_countdown--;
-        if (s->partial_countdown == 0) {
+        if (s->partial_countdown <= 0) {
 
             /* time to replace codebook */
             memcpy(s->codebook, s->next_codebook_buffer,
@@ -581,12 +589,10 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
         s->next_codebook_buffer_index += chunk_size;
 
         s->partial_countdown--;
-        if (s->partial_countdown == 0) {
-            GetByteContext gb;
-
-            bytestream2_init(&gb, s->next_codebook_buffer, s->next_codebook_buffer_index);
+        if (s->partial_countdown <= 0) {
+            bytestream2_init(&s->gb, s->next_codebook_buffer, s->next_codebook_buffer_index);
             /* decompress codebook */
-            if ((res = decode_format80(&gb, s->next_codebook_buffer_index,
+            if ((res = decode_format80(s, s->next_codebook_buffer_index,
                                        s->codebook, s->codebook_size, 0)) < 0)
                 return res;
 
@@ -607,10 +613,8 @@ static int vqa_decode_frame(AVCodecContext *avctx,
     AVFrame *frame = data;
     int res;
 
-    if ((res = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA Video: get_buffer() failed\n");
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
-    }
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
     if ((res = vqa_decode_chunk(s, frame)) < 0)
diff --git a/libavcodec/wavpack.c b/libavcodec/wavpack.c
index 599e36ccc5..d20556d312 100644
--- a/libavcodec/wavpack.c
+++ b/libavcodec/wavpack.c
@@ -2,20 +2,20 @@
  * WavPack lossless audio decoder
  * Copyright (c) 2006,2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,60 +25,16 @@
 #include "avcodec.h"
 #include "get_bits.h"
 #include "internal.h"
+#include "thread.h"
 #include "unary.h"
 #include "bytestream.h"
+#include "wavpack.h"
 
 /**
  * @file
  * WavPack lossless audio decoder
  */
 
-#define WV_HEADER_SIZE    32
-
-#define WV_MONO           0x00000004
-#define WV_JOINT_STEREO   0x00000010
-#define WV_FALSE_STEREO   0x40000000
-
-#define WV_HYBRID_MODE    0x00000008
-#define WV_HYBRID_SHAPE   0x00000008
-#define WV_HYBRID_BITRATE 0x00000200
-#define WV_HYBRID_BALANCE 0x00000400
-#define WV_INITIAL_BLOCK  0x00000800
-#define WV_FINAL_BLOCK    0x00001000
-
-#define WV_SINGLE_BLOCK (WV_INITIAL_BLOCK | WV_FINAL_BLOCK)
-
-#define WV_FLT_SHIFT_ONES 0x01
-#define WV_FLT_SHIFT_SAME 0x02
-#define WV_FLT_SHIFT_SENT 0x04
-#define WV_FLT_ZERO_SENT  0x08
-#define WV_FLT_ZERO_SIGN  0x10
-
-enum WP_ID_Flags {
-    WP_IDF_MASK   = 0x3F,
-    WP_IDF_IGNORE = 0x20,
-    WP_IDF_ODD    = 0x40,
-    WP_IDF_LONG   = 0x80
-};
-
-enum WP_ID {
-    WP_ID_DUMMY = 0,
-    WP_ID_ENCINFO,
-    WP_ID_DECTERMS,
-    WP_ID_DECWEIGHTS,
-    WP_ID_DECSAMPLES,
-    WP_ID_ENTROPY,
-    WP_ID_HYBRID,
-    WP_ID_SHAPING,
-    WP_ID_FLOATINFO,
-    WP_ID_INT32INFO,
-    WP_ID_DATA,
-    WP_ID_CORR,
-    WP_ID_EXTRABITS,
-    WP_ID_CHANINFO,
-    WP_ID_SAMPLE_RATE = 0x27,
-};
-
 typedef struct SavedContext {
     int offset;
     int size;
@@ -86,23 +42,6 @@ typedef struct SavedContext {
     uint32_t crc;
 } SavedContext;
 
-#define MAX_TERMS 16
-
-typedef struct Decorr {
-    int delta;
-    int value;
-    int weightA;
-    int weightB;
-    int samplesA[8];
-    int samplesB[8];
-} Decorr;
-
-typedef struct WvChannel {
-    int median[3];
-    int slow_level, error_limit;
-    int bitrate_acc, bitrate_delta;
-} WvChannel;
-
 typedef struct WavpackFrameContext {
     AVCodecContext *avctx;
     int frame_flags;
@@ -144,101 +83,7 @@ typedef struct WavpackContext {
     int ch_offset;
 } WavpackContext;
 
-static const int wv_rates[16] = {
-     6000,  8000,  9600, 11025, 12000, 16000,  22050, 24000,
-    32000, 44100, 48000, 64000, 88200, 96000, 192000,     0
-};
-
-// exponent table copied from WavPack source
-static const uint8_t wp_exp2_table[256] = {
-    0x00, 0x01, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b,
-    0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x10, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x16,
-    0x17, 0x18, 0x19, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1d, 0x1e, 0x1f, 0x20, 0x20, 0x21, 0x22, 0x23,
-    0x24, 0x24, 0x25, 0x26, 0x27, 0x28, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
-    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3a, 0x3b, 0x3c, 0x3d,
-    0x3e, 0x3f, 0x40, 0x41, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x48, 0x49, 0x4a, 0x4b,
-    0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
-    0x5b, 0x5c, 0x5d, 0x5e, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
-    0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
-    0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x87, 0x88, 0x89, 0x8a,
-    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b,
-    0x9c, 0x9d, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad,
-    0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0,
-    0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc8, 0xc9, 0xca, 0xcb, 0xcd, 0xce, 0xcf, 0xd0, 0xd2, 0xd3, 0xd4,
-    0xd6, 0xd7, 0xd8, 0xd9, 0xdb, 0xdc, 0xdd, 0xde, 0xe0, 0xe1, 0xe2, 0xe4, 0xe5, 0xe6, 0xe8, 0xe9,
-    0xea, 0xec, 0xed, 0xee, 0xf0, 0xf1, 0xf2, 0xf4, 0xf5, 0xf6, 0xf8, 0xf9, 0xfa, 0xfc, 0xfd, 0xff
-};
-
-static const uint8_t wp_log2_table [] = {
-    0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x14, 0x15,
-    0x16, 0x18, 0x19, 0x1a, 0x1c, 0x1d, 0x1e, 0x20, 0x21, 0x22, 0x24, 0x25, 0x26, 0x28, 0x29, 0x2a,
-    0x2c, 0x2d, 0x2e, 0x2f, 0x31, 0x32, 0x33, 0x34, 0x36, 0x37, 0x38, 0x39, 0x3b, 0x3c, 0x3d, 0x3e,
-    0x3f, 0x41, 0x42, 0x43, 0x44, 0x45, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4d, 0x4e, 0x4f, 0x50, 0x51,
-    0x52, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63,
-    0x64, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x74, 0x75,
-    0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85,
-    0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
-    0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
-    0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb2,
-    0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc0,
-    0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcb, 0xcc, 0xcd, 0xce,
-    0xcf, 0xd0, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd8, 0xd9, 0xda, 0xdb,
-    0xdc, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe4, 0xe5, 0xe6, 0xe7, 0xe7,
-    0xe8, 0xe9, 0xea, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xee, 0xef, 0xf0, 0xf1, 0xf1, 0xf2, 0xf3, 0xf4,
-    0xf4, 0xf5, 0xf6, 0xf7, 0xf7, 0xf8, 0xf9, 0xf9, 0xfa, 0xfb, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, 0xff
-};
-
-static av_always_inline int wp_exp2(int16_t val)
-{
-    int res, neg = 0;
-
-    if (val < 0) {
-        val = -val;
-        neg = 1;
-    }
-
-    res   = wp_exp2_table[val & 0xFF] | 0x100;
-    val >>= 8;
-    res   = (val > 9) ? (res << (val - 9)) : (res >> (9 - val));
-    return neg ? -res : res;
-}
-
-static av_always_inline int wp_log2(int32_t val)
-{
-    int bits;
-
-    if (!val)
-        return 0;
-    if (val == 1)
-        return 256;
-    val += val >> 9;
-    bits = av_log2(val) + 1;
-    if (bits < 9)
-        return (bits << 8) + wp_log2_table[(val << (9 - bits)) & 0xFF];
-    else
-        return (bits << 8) + wp_log2_table[(val >> (bits - 9)) & 0xFF];
-}
-
-#define LEVEL_DECAY(a)  ((a + 0x80) >> 8)
-
-// macros for manipulating median values
-#define GET_MED(n) ((c->median[n] >> 4) + 1)
-#define DEC_MED(n) c->median[n] -= ((c->median[n] + (128 >> n) - 2) / (128 >> n)) * 2
-#define INC_MED(n) c->median[n] += ((c->median[n] + (128 >> n)    ) / (128 >> n)) * 5
-
-// macros for applying weight
-#define UPDATE_WEIGHT_CLIP(weight, delta, samples, in) \
-    if (samples && in) { \
-        if ((samples ^ in) < 0) { \
-            weight -= delta; \
-            if (weight < -1024) \
-                weight = -1024; \
-        } else { \
-            weight += delta; \
-            if (weight > 1024) \
-                weight = 1024; \
-        } \
-    }
+#define LEVEL_DECAY(a)  (((a) + 0x80) >> 8)
 
 static av_always_inline int get_tail(GetBitContext *gb, int k)
 {
@@ -310,7 +155,7 @@ static int wv_get_value(WavpackFrameContext *ctx, GetBitContext *gb,
             if (t >= 2) {
                 if (get_bits_left(gb) < t - 1)
                     goto error;
-                t = get_bits(gb, t - 1) | (1 << (t - 1));
+                t = get_bits_long(gb, t - 1) | (1 << (t - 1));
             } else {
                 if (get_bits_left(gb) < 0)
                     goto error;
@@ -341,7 +186,7 @@ static int wv_get_value(WavpackFrameContext *ctx, GetBitContext *gb,
             } else {
                 if (get_bits_left(gb) < t2 - 1)
                     goto error;
-                t += get_bits(gb, t2 - 1) | (1 << (t2 - 1));
+                t += get_bits_long(gb, t2 - 1) | (1 << (t2 - 1));
             }
         }
 
@@ -381,6 +226,10 @@ static int wv_get_value(WavpackFrameContext *ctx, GetBitContext *gb,
         INC_MED(2);
     }
     if (!c->error_limit) {
+        if (add >= 0x2000000U) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "k %d is too large\n", add);
+            goto error;
+        }
         ret = base + get_tail(gb, add);
         if (get_bits_left(gb) <= 0)
             goto error;
@@ -404,6 +253,10 @@ static int wv_get_value(WavpackFrameContext *ctx, GetBitContext *gb,
     return sign ? ~ret : ret;
 
 error:
+    ret = get_bits_left(gb);
+    if (ret <= 0) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "Too few bits (%d) left\n", ret);
+    }
     *last = 1;
     return 0;
 }
@@ -418,7 +271,7 @@ static inline int wv_get_value_integer(WavpackFrameContext *s, uint32_t *crc,
 
         if (s->got_extra_bits &&
             get_bits_left(&s->gb_extra_bits) >= s->extra_bits) {
-            S   |= get_bits(&s->gb_extra_bits, s->extra_bits);
+            S   |= get_bits_long(&s->gb_extra_bits, s->extra_bits);
             *crc = *crc * 9 + (S & 0xffff) * 3 + ((unsigned)S >> 16);
         }
     }
@@ -619,6 +472,14 @@ static inline int wv_unpack_stereo(WavpackFrameContext *s, GetBitContext *gb,
                 s->decorr[i].samplesB[0] = L;
             }
         }
+
+        if (type == AV_SAMPLE_FMT_S16P) {
+            if (FFABS(L) + FFABS(R) > (1<<19)) {
+                av_log(s->avctx, AV_LOG_ERROR, "sample %d %d too large\n", L, R);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+
         pos = (pos + 1) & 7;
         if (s->joint)
             L += (R -= (L >> 1));
@@ -638,6 +499,13 @@ static inline int wv_unpack_stereo(WavpackFrameContext *s, GetBitContext *gb,
     } while (!last && count < s->samples);
 
     wv_reset_saved_context(s);
+
+    if (last && count < s->samples) {
+        int size = av_get_bytes_per_sample(type);
+        memset((uint8_t*)dst_l + count*size, 0, (s->samples-count)*size);
+        memset((uint8_t*)dst_r + count*size, 0, (s->samples-count)*size);
+    }
+
     if ((s->avctx->err_recognition & AV_EF_CRCCHECK) &&
         wv_check_crc(s, crc, crc_extra_bits))
         return AVERROR_INVALIDDATA;
@@ -699,6 +567,12 @@ static inline int wv_unpack_mono(WavpackFrameContext *s, GetBitContext *gb,
     } while (!last && count < s->samples);
 
     wv_reset_saved_context(s);
+
+    if (last && count < s->samples) {
+        int size = av_get_bytes_per_sample(type);
+        memset((uint8_t*)dst + count*size, 0, (s->samples-count)*size);
+    }
+
     if (s->avctx->err_recognition & AV_EF_CRCCHECK) {
         int ret = wv_check_crc(s, crc, crc_extra_bits);
         if (ret < 0 && s->avctx->err_recognition & AV_EF_EXPLODE)
@@ -723,6 +597,15 @@ static av_cold int wv_alloc_frame_context(WavpackContext *c)
     return 0;
 }
 
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    WavpackContext *s = avctx->priv_data;
+    s->avctx = avctx;
+    return 0;
+}
+#endif
+
 static av_cold int wavpack_decode_init(AVCodecContext *avctx)
 {
     WavpackContext *s = avctx->priv_data;
@@ -750,9 +633,10 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
                                 AVFrame *frame, const uint8_t *buf, int buf_size)
 {
     WavpackContext *wc = avctx->priv_data;
+    ThreadFrame tframe = { .f = frame };
     WavpackFrameContext *s;
     GetByteContext gb;
-    void *samples_l, *samples_r;
+    void *samples_l = NULL, *samples_r = NULL;
     int ret;
     int got_terms   = 0, got_weights = 0, got_samples = 0,
         got_entropy = 0, got_bs      = 0, got_float   = 0, got_hybrid = 0;
@@ -910,7 +794,7 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
         case WP_ID_ENTROPY:
             if (size != 6 * (s->stereo_in + 1)) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "Entropy vars size should be %i, got %i",
+                       "Entropy vars size should be %i, got %i.\n",
                        6 * (s->stereo_in + 1), size);
                 bytestream2_skip(&gb, ssize);
                 continue;
@@ -953,7 +837,11 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
                 continue;
             }
             bytestream2_get_buffer(&gb, val, 4);
-            if (val[0]) {
+            if (val[0] > 32) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Invalid INT32INFO, extra_bits = %d (> 32)\n", val[0]);
+                continue;
+            } else if (val[0]) {
                 s->extra_bits = val[0];
             } else if (val[1]) {
                 s->shift = val[1];
@@ -990,7 +878,8 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
         case WP_ID_DATA:
             s->sc.offset = bytestream2_tell(&gb);
             s->sc.size   = size * 8;
-            init_get_bits(&s->gb, gb.buffer, size * 8);
+            if ((ret = init_get_bits8(&s->gb, gb.buffer, size)) < 0)
+                return ret;
             s->data_size = size * 8;
             bytestream2_skip(&gb, size);
             got_bs       = 1;
@@ -1004,7 +893,8 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
             }
             s->extra_sc.offset = bytestream2_tell(&gb);
             s->extra_sc.size   = size * 8;
-            init_get_bits(&s->gb_extra_bits, gb.buffer, size * 8);
+            if ((ret = init_get_bits8(&s->gb_extra_bits, gb.buffer, size)) < 0)
+                return ret;
             s->crc_extra_bits  = get_bits_long(&s->gb_extra_bits, 32);
             bytestream2_skip(&gb, size);
             s->got_extra_bits  = 1;
@@ -1027,10 +917,13 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
                 chmask = bytestream2_get_le24(&gb);
                 break;
             case 3:
-                chmask = bytestream2_get_le32(&gb);;
+                chmask = bytestream2_get_le32(&gb);
                 break;
             case 5:
-                bytestream2_skip(&gb, 1);
+                size = bytestream2_get_byte(&gb);
+                if (avctx->channels != size)
+                    av_log(avctx, AV_LOG_WARNING, "%i channels signalled"
+                           " instead of %i.\n", size, avctx->channels);
                 chan  |= (bytestream2_get_byte(&gb) & 0xF) << 8;
                 chmask = bytestream2_get_le16(&gb);
                 break;
@@ -1115,11 +1008,10 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
         }
 
         /* get output buffer */
-        frame->nb_samples = s->samples;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        frame->nb_samples = s->samples + 1;
+        if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
             return ret;
-        }
+        frame->nb_samples = s->samples;
     }
 
     if (wc->ch_offset + s->stereo >= avctx->channels) {
@@ -1176,7 +1068,7 @@ static int wavpack_decode_frame(AVCodecContext *avctx, void *data,
     /* determine number of samples */
     s->samples  = AV_RL32(buf + 20);
     frame_flags = AV_RL32(buf + 24);
-    if (s->samples <= 0) {
+    if (s->samples <= 0 || s->samples > WV_MAX_SAMPLES) {
         av_log(avctx, AV_LOG_ERROR, "Invalid number of samples: %d\n",
                s->samples);
         return AVERROR_INVALIDDATA;
@@ -1234,5 +1126,6 @@ AVCodec ff_wavpack_decoder = {
     .close          = wavpack_decode_end,
     .decode         = wavpack_decode_frame,
     .flush          = wavpack_decode_flush,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
 };
diff --git a/libavcodec/wavpack.h b/libavcodec/wavpack.h
new file mode 100644
index 0000000000..a1b46d5bd7
--- /dev/null
+++ b/libavcodec/wavpack.h
@@ -0,0 +1,194 @@
+/*
+ * WavPack decoder/encoder common code
+ * Copyright (c) 2006,2011 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_WAVPACK_H
+#define AVCODEC_WAVPACK_H
+
+#include "libavutil/common.h"
+
+#define MAX_TERMS      16
+#define MAX_TERM        8
+
+#define WV_HEADER_SIZE    32
+
+#define WV_MONO           0x00000004
+#define WV_JOINT_STEREO   0x00000010
+#define WV_CROSS_DECORR   0x00000020
+#define WV_FLOAT_DATA     0x00000080
+#define WV_INT32_DATA     0x00000100
+#define WV_FALSE_STEREO   0x40000000
+
+#define WV_HYBRID_MODE    0x00000008
+#define WV_HYBRID_SHAPE   0x00000008
+#define WV_HYBRID_BITRATE 0x00000200
+#define WV_HYBRID_BALANCE 0x00000400
+#define WV_INITIAL_BLOCK  0x00000800
+#define WV_FINAL_BLOCK    0x00001000
+
+#define WV_MONO_DATA    (WV_MONO | WV_FALSE_STEREO)
+
+#define WV_SINGLE_BLOCK (WV_INITIAL_BLOCK | WV_FINAL_BLOCK)
+
+#define WV_FLT_SHIFT_ONES 0x01
+#define WV_FLT_SHIFT_SAME 0x02
+#define WV_FLT_SHIFT_SENT 0x04
+#define WV_FLT_ZERO_SENT  0x08
+#define WV_FLT_ZERO_SIGN  0x10
+
+#define WV_MAX_SAMPLES    150000
+
+enum WP_ID_Flags {
+    WP_IDF_MASK   = 0x3F,
+    WP_IDF_IGNORE = 0x20,
+    WP_IDF_ODD    = 0x40,
+    WP_IDF_LONG   = 0x80
+};
+
+enum WP_ID {
+    WP_ID_DUMMY = 0,
+    WP_ID_ENCINFO,
+    WP_ID_DECTERMS,
+    WP_ID_DECWEIGHTS,
+    WP_ID_DECSAMPLES,
+    WP_ID_ENTROPY,
+    WP_ID_HYBRID,
+    WP_ID_SHAPING,
+    WP_ID_FLOATINFO,
+    WP_ID_INT32INFO,
+    WP_ID_DATA,
+    WP_ID_CORR,
+    WP_ID_EXTRABITS,
+    WP_ID_CHANINFO,
+    WP_ID_SAMPLE_RATE = 0x27,
+};
+
+typedef struct Decorr {
+    int delta;
+    int value;
+    int weightA;
+    int weightB;
+    int samplesA[MAX_TERM];
+    int samplesB[MAX_TERM];
+    int sumA;
+    int sumB;
+} Decorr;
+
+typedef struct WvChannel {
+    int median[3];
+    int slow_level, error_limit;
+    int bitrate_acc, bitrate_delta;
+} WvChannel;
+
+// macros for manipulating median values
+#define GET_MED(n) ((c->median[n] >> 4) + 1)
+#define DEC_MED(n) c->median[n] -= ((c->median[n] + (128 >> (n)) - 2) / (128 >> (n))) * 2
+#define INC_MED(n) c->median[n] += ((c->median[n] + (128 >> (n))    ) / (128 >> (n))) * 5
+
+// macros for applying weight
+#define UPDATE_WEIGHT_CLIP(weight, delta, samples, in) \
+    if ((samples) && (in)) { \
+        if (((samples) ^ (in)) < 0) { \
+            (weight) -= (delta); \
+            if ((weight) < -1024) \
+                (weight) = -1024; \
+        } else { \
+            (weight) += (delta); \
+            if ((weight) > 1024) \
+                (weight) = 1024; \
+        } \
+    }
+
+static const int wv_rates[16] = {
+     6000,  8000,  9600, 11025, 12000, 16000,  22050, 24000,
+    32000, 44100, 48000, 64000, 88200, 96000, 192000,     0
+};
+
+// exponent table copied from WavPack source
+static const uint8_t wp_exp2_table[256] = {
+    0x00, 0x01, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b,
+    0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x10, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x16,
+    0x17, 0x18, 0x19, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1d, 0x1e, 0x1f, 0x20, 0x20, 0x21, 0x22, 0x23,
+    0x24, 0x24, 0x25, 0x26, 0x27, 0x28, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
+    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3a, 0x3b, 0x3c, 0x3d,
+    0x3e, 0x3f, 0x40, 0x41, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x48, 0x49, 0x4a, 0x4b,
+    0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
+    0x5b, 0x5c, 0x5d, 0x5e, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+    0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+    0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x87, 0x88, 0x89, 0x8a,
+    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b,
+    0x9c, 0x9d, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad,
+    0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0,
+    0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc8, 0xc9, 0xca, 0xcb, 0xcd, 0xce, 0xcf, 0xd0, 0xd2, 0xd3, 0xd4,
+    0xd6, 0xd7, 0xd8, 0xd9, 0xdb, 0xdc, 0xdd, 0xde, 0xe0, 0xe1, 0xe2, 0xe4, 0xe5, 0xe6, 0xe8, 0xe9,
+    0xea, 0xec, 0xed, 0xee, 0xf0, 0xf1, 0xf2, 0xf4, 0xf5, 0xf6, 0xf8, 0xf9, 0xfa, 0xfc, 0xfd, 0xff
+};
+
+static const uint8_t wp_log2_table [] = {
+    0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x14, 0x15,
+    0x16, 0x18, 0x19, 0x1a, 0x1c, 0x1d, 0x1e, 0x20, 0x21, 0x22, 0x24, 0x25, 0x26, 0x28, 0x29, 0x2a,
+    0x2c, 0x2d, 0x2e, 0x2f, 0x31, 0x32, 0x33, 0x34, 0x36, 0x37, 0x38, 0x39, 0x3b, 0x3c, 0x3d, 0x3e,
+    0x3f, 0x41, 0x42, 0x43, 0x44, 0x45, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4d, 0x4e, 0x4f, 0x50, 0x51,
+    0x52, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63,
+    0x64, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x74, 0x75,
+    0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85,
+    0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
+    0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
+    0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb2,
+    0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc0,
+    0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcb, 0xcc, 0xcd, 0xce,
+    0xcf, 0xd0, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd8, 0xd9, 0xda, 0xdb,
+    0xdc, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe4, 0xe5, 0xe6, 0xe7, 0xe7,
+    0xe8, 0xe9, 0xea, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xee, 0xef, 0xf0, 0xf1, 0xf1, 0xf2, 0xf3, 0xf4,
+    0xf4, 0xf5, 0xf6, 0xf7, 0xf7, 0xf8, 0xf9, 0xf9, 0xfa, 0xfb, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, 0xff
+};
+
+static av_always_inline int wp_exp2(int16_t val)
+{
+    int res, neg = 0;
+
+    if (val < 0) {
+        val = -val;
+        neg = 1;
+    }
+
+    res   = wp_exp2_table[val & 0xFF] | 0x100;
+    val >>= 8;
+    res   = (val > 9) ? (res << (val - 9)) : (res >> (9 - val));
+    return neg ? -res : res;
+}
+
+static av_always_inline int wp_log2(int32_t val)
+{
+    int bits;
+
+    if (!val)
+        return 0;
+    if (val == 1)
+        return 256;
+    val += val >> 9;
+    bits = av_log2(val) + 1;
+    if (bits < 9)
+        return (bits << 8) + wp_log2_table[(val << (9 - bits)) & 0xFF];
+    else
+        return (bits << 8) + wp_log2_table[(val >> (bits - 9)) & 0xFF];
+}
+
+#endif /* AVCODEC_WAVPACK_H */
diff --git a/libavcodec/wavpackenc.c b/libavcodec/wavpackenc.c
new file mode 100644
index 0000000000..977bcf0e04
--- /dev/null
+++ b/libavcodec/wavpackenc.c
@@ -0,0 +1,2986 @@
+/*
+ * WavPack lossless audio encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BITSTREAM_WRITER_LE
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "put_bits.h"
+#include "bytestream.h"
+#include "wavpackenc.h"
+#include "wavpack.h"
+
+#define UPDATE_WEIGHT(weight, delta, source, result) \
+    if ((source) && (result)) { \
+        int32_t s = (int32_t) ((source) ^ (result)) >> 31; \
+        weight = ((delta) ^ s) + ((weight) - s); \
+    }
+
+#define APPLY_WEIGHT_F(weight, sample) ((((((sample) & 0xffff) * (weight)) >> 9) + \
+    ((((sample) & ~0xffff) >> 9) * (weight)) + 1) >> 1)
+
+#define APPLY_WEIGHT_I(weight, sample) (((weight) * (sample) + 512) >> 10)
+
+#define APPLY_WEIGHT(weight, sample) ((sample) != (short) (sample) ? \
+    APPLY_WEIGHT_F(weight, sample) : APPLY_WEIGHT_I (weight, sample))
+
+#define CLEAR(destin) memset(&destin, 0, sizeof(destin));
+
+#define SHIFT_LSB       13
+#define SHIFT_MASK      (0x1FU << SHIFT_LSB)
+
+#define MAG_LSB         18
+#define MAG_MASK        (0x1FU << MAG_LSB)
+
+#define SRATE_LSB       23
+#define SRATE_MASK      (0xFU << SRATE_LSB)
+
+#define EXTRA_TRY_DELTAS     1
+#define EXTRA_ADJUST_DELTAS  2
+#define EXTRA_SORT_FIRST     4
+#define EXTRA_BRANCHES       8
+#define EXTRA_SORT_LAST     16
+
+typedef struct WavPackExtraInfo {
+    struct Decorr dps[MAX_TERMS];
+    int nterms, log_limit, gt16bit;
+    uint32_t best_bits;
+} WavPackExtraInfo;
+
+typedef struct WavPackWords {
+    int pend_data, holding_one, zeros_acc;
+    int holding_zero, pend_count;
+    WvChannel c[2];
+} WavPackWords;
+
+typedef struct WavPackEncodeContext {
+    AVClass *class;
+    AVCodecContext *avctx;
+    PutBitContext pb;
+    int block_samples;
+    int buffer_size;
+    int sample_index;
+    int stereo, stereo_in;
+    int ch_offset;
+
+    int32_t *samples[2];
+    int samples_size[2];
+
+    int32_t *sampleptrs[MAX_TERMS+2][2];
+    int sampleptrs_size[MAX_TERMS+2][2];
+
+    int32_t *temp_buffer[2][2];
+    int temp_buffer_size[2][2];
+
+    int32_t *best_buffer[2];
+    int best_buffer_size[2];
+
+    int32_t *js_left, *js_right;
+    int js_left_size, js_right_size;
+
+    int32_t *orig_l, *orig_r;
+    int orig_l_size, orig_r_size;
+
+    unsigned extra_flags;
+    int optimize_mono;
+    int decorr_filter;
+    int joint;
+    int num_branches;
+
+    uint32_t flags;
+    uint32_t crc_x;
+    WavPackWords w;
+
+    uint8_t int32_sent_bits, int32_zeros, int32_ones, int32_dups;
+    uint8_t float_flags, float_shift, float_max_exp, max_exp;
+    int32_t shifted_ones, shifted_zeros, shifted_both;
+    int32_t false_zeros, neg_zeros, ordata;
+
+    int num_terms, shift, joint_stereo, false_stereo;
+    int num_decorrs, num_passes, best_decorr, mask_decorr;
+    struct Decorr decorr_passes[MAX_TERMS];
+    const WavPackDecorrSpec *decorr_specs;
+    float delta_decay;
+} WavPackEncodeContext;
+
+static av_cold int wavpack_encode_init(AVCodecContext *avctx)
+{
+    WavPackEncodeContext *s = avctx->priv_data;
+
+    s->avctx = avctx;
+
+    if (!avctx->frame_size) {
+        int block_samples;
+        if (!(avctx->sample_rate & 1))
+            block_samples = avctx->sample_rate / 2;
+        else
+            block_samples = avctx->sample_rate;
+
+        while (block_samples * avctx->channels > WV_MAX_SAMPLES)
+            block_samples /= 2;
+
+        while (block_samples * avctx->channels < 40000)
+            block_samples *= 2;
+        avctx->frame_size = block_samples;
+    } else if (avctx->frame_size && (avctx->frame_size < 128 ||
+                              avctx->frame_size > WV_MAX_SAMPLES)) {
+        av_log(avctx, AV_LOG_ERROR, "invalid block size: %d\n", avctx->frame_size);
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->compression_level != FF_COMPRESSION_DEFAULT) {
+        if (avctx->compression_level >= 3) {
+            s->decorr_filter = 3;
+            s->num_passes = 9;
+            if      (avctx->compression_level >= 8) {
+                s->num_branches = 4;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_SORT_FIRST|EXTRA_SORT_LAST|EXTRA_BRANCHES;
+            } else if (avctx->compression_level >= 7) {
+                s->num_branches = 3;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_SORT_FIRST|EXTRA_BRANCHES;
+            } else if (avctx->compression_level >= 6) {
+                s->num_branches = 2;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_SORT_FIRST|EXTRA_BRANCHES;
+            } else if (avctx->compression_level >= 5) {
+                s->num_branches = 1;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_SORT_FIRST|EXTRA_BRANCHES;
+            } else if (avctx->compression_level >= 4) {
+                s->num_branches = 1;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_BRANCHES;
+            }
+        } else if (avctx->compression_level == 2) {
+            s->decorr_filter = 2;
+            s->num_passes = 4;
+        } else if (avctx->compression_level == 1) {
+            s->decorr_filter = 1;
+            s->num_passes = 2;
+        } else if (avctx->compression_level < 1) {
+            s->decorr_filter = 0;
+            s->num_passes = 0;
+        }
+    }
+
+    s->num_decorrs = decorr_filter_sizes[s->decorr_filter];
+    s->decorr_specs = decorr_filters[s->decorr_filter];
+
+    s->delta_decay = 2.0;
+
+    return 0;
+}
+
+static void shift_mono(int32_t *samples, int nb_samples, int shift)
+{
+    int i;
+    for (i = 0; i < nb_samples; i++)
+        samples[i] >>= shift;
+}
+
+static void shift_stereo(int32_t *left, int32_t *right,
+                         int nb_samples, int shift)
+{
+    int i;
+    for (i = 0; i < nb_samples; i++) {
+        left [i] >>= shift;
+        right[i] >>= shift;
+    }
+}
+
+#define FLOAT_SHIFT_ONES 1
+#define FLOAT_SHIFT_SAME 2
+#define FLOAT_SHIFT_SENT 4
+#define FLOAT_ZEROS_SENT 8
+#define FLOAT_NEG_ZEROS  0x10
+#define FLOAT_EXCEPTIONS 0x20
+
+#define get_mantissa(f)     ((f) & 0x7fffff)
+#define get_exponent(f)     (((f) >> 23) & 0xff)
+#define get_sign(f)         (((f) >> 31) & 0x1)
+
+static void process_float(WavPackEncodeContext *s, int32_t *sample)
+{
+    int32_t shift_count, value, f = *sample;
+
+    if (get_exponent(f) == 255) {
+        s->float_flags |= FLOAT_EXCEPTIONS;
+        value = 0x1000000;
+        shift_count = 0;
+    } else if (get_exponent(f)) {
+        shift_count = s->max_exp - get_exponent(f);
+        value = 0x800000 + get_mantissa(f);
+    } else {
+        shift_count = s->max_exp ? s->max_exp - 1 : 0;
+        value = get_mantissa(f);
+    }
+
+    if (shift_count < 25)
+        value >>= shift_count;
+    else
+        value = 0;
+
+    if (!value) {
+        if (get_exponent(f) || get_mantissa(f))
+            s->false_zeros++;
+        else if (get_sign(f))
+            s->neg_zeros++;
+    } else if (shift_count) {
+        int32_t mask = (1 << shift_count) - 1;
+
+        if (!(get_mantissa(f) & mask))
+            s->shifted_zeros++;
+        else if ((get_mantissa(f) & mask) == mask)
+            s->shifted_ones++;
+        else
+            s->shifted_both++;
+    }
+
+    s->ordata |= value;
+    *sample = get_sign(f) ? -value : value;
+}
+
+static int scan_float(WavPackEncodeContext *s,
+                      int32_t *samples_l, int32_t *samples_r,
+                      int nb_samples)
+{
+    uint32_t crc = 0xffffffffu;
+    int i;
+
+    s->shifted_ones = s->shifted_zeros = s->shifted_both = s->ordata = 0;
+    s->float_shift = s->float_flags = 0;
+    s->false_zeros = s->neg_zeros = 0;
+    s->max_exp = 0;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t f = samples_l[i];
+            crc = crc * 27 + get_mantissa(f) * 9 + get_exponent(f) * 3 + get_sign(f);
+
+            if (get_exponent(f) > s->max_exp && get_exponent(f) < 255)
+                s->max_exp = get_exponent(f);
+        }
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t f;
+
+            f = samples_l[i];
+            crc = crc * 27 + get_mantissa(f) * 9 + get_exponent(f) * 3 + get_sign(f);
+            if (get_exponent(f) > s->max_exp && get_exponent(f) < 255)
+                s->max_exp = get_exponent(f);
+
+            f = samples_r[i];
+            crc = crc * 27 + get_mantissa(f) * 9 + get_exponent(f) * 3 + get_sign(f);
+
+            if (get_exponent(f) > s->max_exp && get_exponent(f) < 255)
+                s->max_exp = get_exponent(f);
+        }
+    }
+
+    s->crc_x = crc;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++)
+            process_float(s, &samples_l[i]);
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            process_float(s, &samples_l[i]);
+            process_float(s, &samples_r[i]);
+        }
+    }
+
+    s->float_max_exp = s->max_exp;
+
+    if (s->shifted_both)
+        s->float_flags |= FLOAT_SHIFT_SENT;
+    else if (s->shifted_ones && !s->shifted_zeros)
+        s->float_flags |= FLOAT_SHIFT_ONES;
+    else if (s->shifted_ones && s->shifted_zeros)
+        s->float_flags |= FLOAT_SHIFT_SAME;
+    else if (s->ordata && !(s->ordata & 1)) {
+        do {
+            s->float_shift++;
+            s->ordata >>= 1;
+        } while (!(s->ordata & 1));
+
+        if (s->flags & WV_MONO_DATA)
+            shift_mono(samples_l, nb_samples, s->float_shift);
+        else
+            shift_stereo(samples_l, samples_r, nb_samples, s->float_shift);
+    }
+
+    s->flags &= ~MAG_MASK;
+
+    while (s->ordata) {
+        s->flags += 1 << MAG_LSB;
+        s->ordata >>= 1;
+    }
+
+    if (s->false_zeros || s->neg_zeros)
+        s->float_flags |= FLOAT_ZEROS_SENT;
+
+    if (s->neg_zeros)
+        s->float_flags |= FLOAT_NEG_ZEROS;
+
+    return s->float_flags & (FLOAT_EXCEPTIONS | FLOAT_ZEROS_SENT |
+                             FLOAT_SHIFT_SENT | FLOAT_SHIFT_SAME);
+}
+
+static void scan_int23(WavPackEncodeContext *s,
+                       int32_t *samples_l, int32_t *samples_r,
+                       int nb_samples)
+{
+    uint32_t magdata = 0, ordata = 0, xordata = 0, anddata = ~0;
+    int i, total_shift = 0;
+
+    s->int32_sent_bits = s->int32_zeros = s->int32_ones = s->int32_dups = 0;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t M = samples_l[i];
+
+            magdata |= (M < 0) ? ~M : M;
+            xordata |= M ^ -(M & 1);
+            anddata &= M;
+            ordata  |= M;
+
+            if ((ordata & 1) && !(anddata & 1) && (xordata & 2))
+                return;
+        }
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t L = samples_l[i];
+            int32_t R = samples_r[i];
+
+            magdata |= (L < 0) ? ~L : L;
+            magdata |= (R < 0) ? ~R : R;
+            xordata |= L ^ -(L & 1);
+            xordata |= R ^ -(R & 1);
+            anddata &= L & R;
+            ordata  |= L | R;
+
+            if ((ordata & 1) && !(anddata & 1) && (xordata & 2))
+                return;
+        }
+    }
+
+    s->flags &= ~MAG_MASK;
+
+    while (magdata) {
+        s->flags += 1 << MAG_LSB;
+        magdata >>= 1;
+    }
+
+    if (!(s->flags & MAG_MASK))
+        return;
+
+    if (!(ordata & 1)) {
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_zeros++;
+            total_shift++;
+            ordata >>= 1;
+        } while (!(ordata & 1));
+    } else if (anddata & 1) {
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_ones++;
+            total_shift++;
+            anddata >>= 1;
+        } while (anddata & 1);
+    } else if (!(xordata & 2)) {
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_dups++;
+            total_shift++;
+            xordata >>= 1;
+        } while (!(xordata & 2));
+    }
+
+    if (total_shift) {
+        s->flags |= WV_INT32_DATA;
+
+        if (s->flags & WV_MONO_DATA)
+            shift_mono(samples_l, nb_samples, total_shift);
+        else
+            shift_stereo(samples_l, samples_r, nb_samples, total_shift);
+    }
+}
+
+static int scan_int32(WavPackEncodeContext *s,
+                      int32_t *samples_l, int32_t *samples_r,
+                      int nb_samples)
+{
+    uint32_t magdata = 0, ordata = 0, xordata = 0, anddata = ~0;
+    uint32_t crc = 0xffffffffu;
+    int i, total_shift = 0;
+
+    s->int32_sent_bits = s->int32_zeros = s->int32_ones = s->int32_dups = 0;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t M = samples_l[i];
+
+            crc = crc * 9 + (M & 0xffff) * 3 + ((M >> 16) & 0xffff);
+            magdata |= (M < 0) ? ~M : M;
+            xordata |= M ^ -(M & 1);
+            anddata &= M;
+            ordata  |= M;
+        }
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t L = samples_l[i];
+            int32_t R = samples_r[i];
+
+            crc = crc * 9 + (L & 0xffff) * 3 + ((L >> 16) & 0xffff);
+            crc = crc * 9 + (R & 0xffff) * 3 + ((R >> 16) & 0xffff);
+            magdata |= (L < 0) ? ~L : L;
+            magdata |= (R < 0) ? ~R : R;
+            xordata |= L ^ -(L & 1);
+            xordata |= R ^ -(R & 1);
+            anddata &= L & R;
+            ordata  |= L | R;
+        }
+    }
+
+    s->crc_x = crc;
+    s->flags &= ~MAG_MASK;
+
+    while (magdata) {
+        s->flags += 1 << MAG_LSB;
+        magdata >>= 1;
+    }
+
+    if (!((s->flags & MAG_MASK) >> MAG_LSB)) {
+        s->flags &= ~WV_INT32_DATA;
+        return 0;
+    }
+
+    if (!(ordata & 1))
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_zeros++;
+            total_shift++;
+            ordata >>= 1;
+        } while (!(ordata & 1));
+    else if (anddata & 1)
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_ones++;
+            total_shift++;
+            anddata >>= 1;
+        } while (anddata & 1);
+    else if (!(xordata & 2))
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_dups++;
+            total_shift++;
+            xordata >>= 1;
+        } while (!(xordata & 2));
+
+    if (((s->flags & MAG_MASK) >> MAG_LSB) > 23) {
+        s->int32_sent_bits = (uint8_t)(((s->flags & MAG_MASK) >> MAG_LSB) - 23);
+        total_shift += s->int32_sent_bits;
+        s->flags &= ~MAG_MASK;
+        s->flags += 23 << MAG_LSB;
+    }
+
+    if (total_shift) {
+        s->flags |= WV_INT32_DATA;
+
+        if (s->flags & WV_MONO_DATA)
+            shift_mono(samples_l, nb_samples, total_shift);
+        else
+            shift_stereo(samples_l, samples_r, nb_samples, total_shift);
+    }
+
+    return s->int32_sent_bits;
+}
+
+static int8_t store_weight(int weight)
+{
+    weight = av_clip(weight, -1024, 1024);
+    if (weight > 0)
+        weight -= (weight + 64) >> 7;
+
+    return (weight + 4) >> 3;
+}
+
+static int restore_weight(int8_t weight)
+{
+    int result;
+
+    if ((result = (int) weight << 3) > 0)
+        result += (result + 64) >> 7;
+
+    return result;
+}
+
+static int log2s(int32_t value)
+{
+    return (value < 0) ? -wp_log2(-value) : wp_log2(value);
+}
+
+static void decorr_mono(int32_t *in_samples, int32_t *out_samples,
+                        int nb_samples, struct Decorr *dpp, int dir)
+{
+    int m = 0, i;
+
+    dpp->sumA = 0;
+
+    if (dir < 0) {
+        out_samples += (nb_samples - 1);
+        in_samples  += (nb_samples - 1);
+    }
+
+    dpp->weightA = restore_weight(store_weight(dpp->weightA));
+
+    for (i = 0; i < MAX_TERM; i++)
+        dpp->samplesA[i] = wp_exp2(log2s(dpp->samplesA[i]));
+
+    if (dpp->value > MAX_TERM) {
+        while (nb_samples--) {
+            int32_t left, sam_A;
+
+            sam_A = ((3 - (dpp->value & 1)) * dpp->samplesA[0] - dpp->samplesA[1]) >> !(dpp->value & 1);
+
+            dpp->samplesA[1] = dpp->samplesA[0];
+            dpp->samplesA[0] = left = in_samples[0];
+
+            left -= APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam_A, left);
+            dpp->sumA += dpp->weightA;
+            out_samples[0] = left;
+            in_samples += dir;
+            out_samples += dir;
+        }
+    } else if (dpp->value > 0) {
+        while (nb_samples--) {
+            int k = (m + dpp->value) & (MAX_TERM - 1);
+            int32_t left, sam_A;
+
+            sam_A = dpp->samplesA[m];
+            dpp->samplesA[k] = left = in_samples[0];
+            m = (m + 1) & (MAX_TERM - 1);
+
+            left -= APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam_A, left);
+            dpp->sumA += dpp->weightA;
+            out_samples[0] = left;
+            in_samples += dir;
+            out_samples += dir;
+        }
+    }
+
+    if (m && dpp->value > 0 && dpp->value <= MAX_TERM) {
+        int32_t temp_A[MAX_TERM];
+
+        memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+
+        for (i = 0; i < MAX_TERM; i++) {
+            dpp->samplesA[i] = temp_A[m];
+            m = (m + 1) & (MAX_TERM - 1);
+        }
+    }
+}
+
+static void reverse_mono_decorr(struct Decorr *dpp)
+{
+    if (dpp->value > MAX_TERM) {
+        int32_t sam_A;
+
+        if (dpp->value & 1)
+            sam_A = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+        else
+            sam_A = (3 * dpp->samplesA[0] - dpp->samplesA[1]) >> 1;
+
+        dpp->samplesA[1] = dpp->samplesA[0];
+        dpp->samplesA[0] = sam_A;
+
+        if (dpp->value & 1)
+            sam_A = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+        else
+            sam_A = (3 * dpp->samplesA[0] - dpp->samplesA[1]) >> 1;
+
+        dpp->samplesA[1] = sam_A;
+    } else if (dpp->value > 1) {
+        int i, j, k;
+
+        for (i = 0, j = dpp->value - 1, k = 0; k < dpp->value / 2; i++, j--, k++) {
+            i &= (MAX_TERM - 1);
+            j &= (MAX_TERM - 1);
+            dpp->samplesA[i] ^= dpp->samplesA[j];
+            dpp->samplesA[j] ^= dpp->samplesA[i];
+            dpp->samplesA[i] ^= dpp->samplesA[j];
+        }
+    }
+}
+
+static uint32_t log2sample(uint32_t v, int limit, uint32_t *result)
+{
+    uint32_t dbits;
+
+    if ((v += v >> 9) < (1 << 8)) {
+        dbits = nbits_table[v];
+        *result += (dbits << 8) + wp_log2_table[(v << (9 - dbits)) & 0xff];
+    } else {
+        if (v < (1 << 16))
+            dbits = nbits_table[v >> 8] + 8;
+        else if (v < (1 << 24))
+            dbits = nbits_table[v >> 16] + 16;
+        else
+            dbits = nbits_table[v >> 24] + 24;
+
+        *result += dbits = (dbits << 8) + wp_log2_table[(v >> (dbits - 9)) & 0xff];
+
+        if (limit && dbits >= limit)
+            return 1;
+    }
+
+    return 0;
+}
+
+static uint32_t log2mono(int32_t *samples, int nb_samples, int limit)
+{
+    uint32_t result = 0;
+    while (nb_samples--) {
+        if (log2sample(abs(*samples++), limit, &result))
+            return UINT32_MAX;
+    }
+    return result;
+}
+
+static uint32_t log2stereo(int32_t *samples_l, int32_t *samples_r,
+                           int nb_samples, int limit)
+{
+    uint32_t result = 0;
+    while (nb_samples--) {
+        if (log2sample(abs(*samples_l++), limit, &result) ||
+            log2sample(abs(*samples_r++), limit, &result))
+            return UINT32_MAX;
+    }
+    return result;
+}
+
+static void decorr_mono_buffer(int32_t *samples, int32_t *outsamples,
+                               int nb_samples, struct Decorr *dpp,
+                               int tindex)
+{
+    struct Decorr dp, *dppi = dpp + tindex;
+    int delta = dppi->delta, pre_delta, term = dppi->value;
+
+    if (delta == 7)
+        pre_delta = 7;
+    else if (delta < 2)
+        pre_delta = 3;
+    else
+        pre_delta = delta + 1;
+
+    CLEAR(dp);
+    dp.value = term;
+    dp.delta = pre_delta;
+    decorr_mono(samples, outsamples, FFMIN(2048, nb_samples), &dp, -1);
+    dp.delta = delta;
+
+    if (tindex == 0)
+        reverse_mono_decorr(&dp);
+    else
+        CLEAR(dp.samplesA);
+
+    memcpy(dppi->samplesA, dp.samplesA, sizeof(dp.samplesA));
+    dppi->weightA = dp.weightA;
+
+    if (delta == 0) {
+        dp.delta = 1;
+        decorr_mono(samples, outsamples, nb_samples, &dp, 1);
+        dp.delta = 0;
+        memcpy(dp.samplesA, dppi->samplesA, sizeof(dp.samplesA));
+        dppi->weightA = dp.weightA = dp.sumA / nb_samples;
+    }
+
+    decorr_mono(samples, outsamples, nb_samples, &dp, 1);
+}
+
+static void recurse_mono(WavPackEncodeContext *s, WavPackExtraInfo *info,
+                         int depth, int delta, uint32_t input_bits)
+{
+    int term, branches = s->num_branches - depth;
+    int32_t *samples, *outsamples;
+    uint32_t term_bits[22], bits;
+
+    if (branches < 1 || depth + 1 == info->nterms)
+        branches = 1;
+
+    CLEAR(term_bits);
+    samples = s->sampleptrs[depth][0];
+    outsamples = s->sampleptrs[depth + 1][0];
+
+    for (term = 1; term <= 18; term++) {
+        if (term == 17 && branches == 1 && depth + 1 < info->nterms)
+            continue;
+
+        if (term > 8 && term < 17)
+            continue;
+
+        if (!s->extra_flags && (term > 4 && term < 17))
+            continue;
+
+        info->dps[depth].value = term;
+        info->dps[depth].delta = delta;
+        decorr_mono_buffer(samples, outsamples, s->block_samples, info->dps, depth);
+        bits = log2mono(outsamples, s->block_samples, info->log_limit);
+
+        if (bits < info->best_bits) {
+            info->best_bits = bits;
+            CLEAR(s->decorr_passes);
+            memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * (depth + 1));
+            memcpy(s->sampleptrs[info->nterms + 1][0],
+                   s->sampleptrs[depth + 1][0], s->block_samples * 4);
+        }
+
+        term_bits[term + 3] = bits;
+    }
+
+    while (depth + 1 < info->nterms && branches--) {
+        uint32_t local_best_bits = input_bits;
+        int best_term = 0, i;
+
+        for (i = 0; i < 22; i++)
+            if (term_bits[i] && term_bits[i] < local_best_bits) {
+                local_best_bits = term_bits[i];
+                best_term = i - 3;
+            }
+
+        if (!best_term)
+            break;
+
+        term_bits[best_term + 3] = 0;
+
+        info->dps[depth].value = best_term;
+        info->dps[depth].delta = delta;
+        decorr_mono_buffer(samples, outsamples, s->block_samples, info->dps, depth);
+
+        recurse_mono(s, info, depth + 1, delta, local_best_bits);
+    }
+}
+
+static void sort_mono(WavPackEncodeContext *s, WavPackExtraInfo *info)
+{
+    int reversed = 1;
+    uint32_t bits;
+
+    while (reversed) {
+        int ri, i;
+
+        memcpy(info->dps, s->decorr_passes, sizeof(s->decorr_passes));
+        reversed = 0;
+
+        for (ri = 0; ri < info->nterms && s->decorr_passes[ri].value; ri++) {
+
+            if (ri + 1 >= info->nterms || !s->decorr_passes[ri+1].value)
+                break;
+
+            if (s->decorr_passes[ri].value == s->decorr_passes[ri+1].value) {
+                decorr_mono_buffer(s->sampleptrs[ri][0], s->sampleptrs[ri+1][0],
+                                   s->block_samples, info->dps, ri);
+                continue;
+            }
+
+            info->dps[ri  ] = s->decorr_passes[ri+1];
+            info->dps[ri+1] = s->decorr_passes[ri  ];
+
+            for (i = ri; i < info->nterms && s->decorr_passes[i].value; i++)
+                decorr_mono_buffer(s->sampleptrs[i][0], s->sampleptrs[i+1][0],
+                                   s->block_samples, info->dps, i);
+
+            bits = log2mono(s->sampleptrs[i][0], s->block_samples, info->log_limit);
+            if (bits < info->best_bits) {
+                reversed = 1;
+                info->best_bits = bits;
+                CLEAR(s->decorr_passes);
+                memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+                memcpy(s->sampleptrs[info->nterms + 1][0], s->sampleptrs[i][0],
+                       s->block_samples * 4);
+            } else {
+                info->dps[ri  ] = s->decorr_passes[ri];
+                info->dps[ri+1] = s->decorr_passes[ri+1];
+                decorr_mono_buffer(s->sampleptrs[ri][0], s->sampleptrs[ri+1][0],
+                                   s->block_samples, info->dps, ri);
+            }
+        }
+    }
+}
+
+static void delta_mono(WavPackEncodeContext *s, WavPackExtraInfo *info)
+{
+    int lower = 0, delta, d;
+    uint32_t bits;
+
+    if (!s->decorr_passes[0].value)
+        return;
+    delta = s->decorr_passes[0].delta;
+
+    for (d = delta - 1; d >= 0; d--) {
+        int i;
+
+        for (i = 0; i < info->nterms && s->decorr_passes[i].value; i++) {
+            info->dps[i].value = s->decorr_passes[i].value;
+            info->dps[i].delta = d;
+            decorr_mono_buffer(s->sampleptrs[i][0], s->sampleptrs[i+1][0],
+                               s->block_samples, info->dps, i);
+        }
+
+        bits = log2mono(s->sampleptrs[i][0], s->block_samples, info->log_limit);
+        if (bits >= info->best_bits)
+            break;
+
+        lower = 1;
+        info->best_bits = bits;
+        CLEAR(s->decorr_passes);
+        memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+        memcpy(s->sampleptrs[info->nterms + 1][0],  s->sampleptrs[i][0],
+               s->block_samples * 4);
+    }
+
+    for (d = delta + 1; !lower && d <= 7; d++) {
+        int i;
+
+        for (i = 0; i < info->nterms && s->decorr_passes[i].value; i++) {
+            info->dps[i].value = s->decorr_passes[i].value;
+            info->dps[i].delta = d;
+            decorr_mono_buffer(s->sampleptrs[i][0], s->sampleptrs[i+1][0],
+                               s->block_samples, info->dps, i);
+        }
+
+        bits = log2mono(s->sampleptrs[i][0], s->block_samples, info->log_limit);
+        if (bits >= info->best_bits)
+            break;
+
+        info->best_bits = bits;
+        CLEAR(s->decorr_passes);
+        memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+        memcpy(s->sampleptrs[info->nterms + 1][0], s->sampleptrs[i][0],
+               s->block_samples * 4);
+    }
+}
+
+static int allocate_buffers2(WavPackEncodeContext *s, int nterms)
+{
+    int i;
+
+    for (i = 0; i < nterms + 2; i++) {
+        av_fast_padded_malloc(&s->sampleptrs[i][0], &s->sampleptrs_size[i][0],
+                              s->block_samples * 4);
+        if (!s->sampleptrs[i][0])
+            return AVERROR(ENOMEM);
+        if (!(s->flags & WV_MONO_DATA)) {
+            av_fast_padded_malloc(&s->sampleptrs[i][1], &s->sampleptrs_size[i][1],
+                                  s->block_samples * 4);
+            if (!s->sampleptrs[i][1])
+                return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
+static int allocate_buffers(WavPackEncodeContext *s)
+{
+    int i;
+
+    for (i = 0; i < 2; i++) {
+        av_fast_padded_malloc(&s->best_buffer[0], &s->best_buffer_size[0],
+                              s->block_samples * 4);
+        if (!s->best_buffer[0])
+            return AVERROR(ENOMEM);
+
+        av_fast_padded_malloc(&s->temp_buffer[i][0], &s->temp_buffer_size[i][0],
+                              s->block_samples * 4);
+        if (!s->temp_buffer[i][0])
+            return AVERROR(ENOMEM);
+        if (!(s->flags & WV_MONO_DATA)) {
+            av_fast_padded_malloc(&s->best_buffer[1], &s->best_buffer_size[1],
+                                  s->block_samples * 4);
+            if (!s->best_buffer[1])
+                return AVERROR(ENOMEM);
+
+            av_fast_padded_malloc(&s->temp_buffer[i][1], &s->temp_buffer_size[i][1],
+                                  s->block_samples * 4);
+            if (!s->temp_buffer[i][1])
+                return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
+static void analyze_mono(WavPackEncodeContext *s, int32_t *samples, int do_samples)
+{
+    WavPackExtraInfo info;
+    int i;
+
+    info.log_limit = (((s->flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
+    info.log_limit = FFMIN(6912, info.log_limit);
+
+    info.nterms = s->num_terms;
+
+    if (allocate_buffers2(s, s->num_terms))
+        return;
+
+    memcpy(info.dps, s->decorr_passes, sizeof(info.dps));
+    memcpy(s->sampleptrs[0][0], samples, s->block_samples * 4);
+
+    for (i = 0; i < info.nterms && info.dps[i].value; i++)
+        decorr_mono(s->sampleptrs[i][0], s->sampleptrs[i + 1][0],
+                    s->block_samples, info.dps + i, 1);
+
+    info.best_bits = log2mono(s->sampleptrs[info.nterms][0], s->block_samples, 0) * 1;
+    memcpy(s->sampleptrs[info.nterms + 1][0], s->sampleptrs[i][0], s->block_samples * 4);
+
+    if (s->extra_flags & EXTRA_BRANCHES)
+        recurse_mono(s, &info, 0, (int) floor(s->delta_decay + 0.5),
+                     log2mono(s->sampleptrs[0][0], s->block_samples, 0));
+
+    if (s->extra_flags & EXTRA_SORT_FIRST)
+        sort_mono(s, &info);
+
+    if (s->extra_flags & EXTRA_TRY_DELTAS) {
+        delta_mono(s, &info);
+
+        if ((s->extra_flags & EXTRA_ADJUST_DELTAS) && s->decorr_passes[0].value)
+            s->delta_decay = (float)((s->delta_decay * 2.0 + s->decorr_passes[0].delta) / 3.0);
+        else
+            s->delta_decay = 2.0;
+    }
+
+    if (s->extra_flags & EXTRA_SORT_LAST)
+        sort_mono(s, &info);
+
+    if (do_samples)
+        memcpy(samples, s->sampleptrs[info.nterms + 1][0], s->block_samples * 4);
+
+    for (i = 0; i < info.nterms; i++)
+        if (!s->decorr_passes[i].value)
+            break;
+
+    s->num_terms = i;
+}
+
+static void scan_word(WavPackEncodeContext *s, WvChannel *c,
+                      int32_t *samples, int nb_samples, int dir)
+{
+    if (dir < 0)
+        samples += nb_samples - 1;
+
+    while (nb_samples--) {
+        uint32_t low, value = labs(samples[0]);
+
+        if (value < GET_MED(0)) {
+            DEC_MED(0);
+        } else {
+            low = GET_MED(0);
+            INC_MED(0);
+
+            if (value - low < GET_MED(1)) {
+                DEC_MED(1);
+            } else {
+                low += GET_MED(1);
+                INC_MED(1);
+
+                if (value - low < GET_MED(2)) {
+                    DEC_MED(2);
+                } else {
+                    INC_MED(2);
+                }
+            }
+        }
+        samples += dir;
+    }
+}
+
+static int wv_mono(WavPackEncodeContext *s, int32_t *samples,
+                   int no_history, int do_samples)
+{
+    struct Decorr temp_decorr_pass, save_decorr_passes[MAX_TERMS] = {{0}};
+    int nb_samples = s->block_samples;
+    int buf_size = sizeof(int32_t) * nb_samples;
+    uint32_t best_size = UINT32_MAX, size;
+    int log_limit, pi, i, ret;
+
+    for (i = 0; i < nb_samples; i++)
+        if (samples[i])
+            break;
+
+    if (i == nb_samples) {
+        CLEAR(s->decorr_passes);
+        CLEAR(s->w);
+        s->num_terms = 0;
+        return 0;
+    }
+
+    log_limit = (((s->flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
+    log_limit = FFMIN(6912, log_limit);
+
+    if ((ret = allocate_buffers(s)) < 0)
+        return ret;
+
+    if (no_history || s->num_passes >= 7)
+        s->best_decorr = s->mask_decorr = 0;
+
+    for (pi = 0; pi < s->num_passes;) {
+        const WavPackDecorrSpec *wpds;
+        int nterms, c, j;
+
+        if (!pi) {
+            c = s->best_decorr;
+        } else {
+            if (s->mask_decorr == 0)
+                c = 0;
+            else
+                c = (s->best_decorr & (s->mask_decorr - 1)) | s->mask_decorr;
+
+            if (c == s->best_decorr) {
+                s->mask_decorr = s->mask_decorr ? ((s->mask_decorr << 1) & (s->num_decorrs - 1)) : 1;
+                continue;
+            }
+        }
+
+        wpds = &s->decorr_specs[c];
+        nterms = decorr_filter_nterms[s->decorr_filter];
+
+        while (1) {
+        memcpy(s->temp_buffer[0][0], samples, buf_size);
+        CLEAR(save_decorr_passes);
+
+        for (j = 0; j < nterms; j++) {
+            CLEAR(temp_decorr_pass);
+            temp_decorr_pass.delta = wpds->delta;
+            temp_decorr_pass.value = wpds->terms[j];
+
+            if (temp_decorr_pass.value < 0)
+                temp_decorr_pass.value = 1;
+
+            decorr_mono(s->temp_buffer[j&1][0], s->temp_buffer[~j&1][0],
+                        FFMIN(nb_samples, 2048), &temp_decorr_pass, -1);
+
+            if (j) {
+                CLEAR(temp_decorr_pass.samplesA);
+            } else {
+                reverse_mono_decorr(&temp_decorr_pass);
+            }
+
+            memcpy(save_decorr_passes + j, &temp_decorr_pass, sizeof(struct Decorr));
+            decorr_mono(s->temp_buffer[j&1][0], s->temp_buffer[~j&1][0],
+                        nb_samples, &temp_decorr_pass, 1);
+        }
+
+        size = log2mono(s->temp_buffer[j&1][0], nb_samples, log_limit);
+        if (size != UINT32_MAX || !nterms)
+            break;
+        nterms >>= 1;
+        }
+
+        if (size < best_size) {
+            memcpy(s->best_buffer[0], s->temp_buffer[j&1][0], buf_size);
+            memcpy(s->decorr_passes, save_decorr_passes, sizeof(struct Decorr) * MAX_TERMS);
+            s->num_terms = nterms;
+            s->best_decorr = c;
+            best_size = size;
+        }
+
+        if (pi++)
+            s->mask_decorr = s->mask_decorr ? ((s->mask_decorr << 1) & (s->num_decorrs - 1)) : 1;
+    }
+
+    if (s->extra_flags)
+        analyze_mono(s, samples, do_samples);
+    else if (do_samples)
+        memcpy(samples, s->best_buffer[0], buf_size);
+
+    if (no_history || s->extra_flags) {
+        CLEAR(s->w);
+        scan_word(s, &s->w.c[0], s->best_buffer[0], nb_samples, -1);
+    }
+    return 0;
+}
+
+static void decorr_stereo(int32_t *in_left, int32_t *in_right,
+                          int32_t *out_left, int32_t *out_right,
+                          int nb_samples, struct Decorr *dpp, int dir)
+{
+    int m = 0, i;
+
+    dpp->sumA = dpp->sumB = 0;
+
+    if (dir < 0) {
+        out_left  += nb_samples - 1;
+        out_right += nb_samples - 1;
+        in_left   += nb_samples - 1;
+        in_right  += nb_samples - 1;
+    }
+
+    dpp->weightA = restore_weight(store_weight(dpp->weightA));
+    dpp->weightB = restore_weight(store_weight(dpp->weightB));
+
+    for (i = 0; i < MAX_TERM; i++) {
+        dpp->samplesA[i] = wp_exp2(log2s(dpp->samplesA[i]));
+        dpp->samplesB[i] = wp_exp2(log2s(dpp->samplesB[i]));
+    }
+
+    switch (dpp->value) {
+    case 2:
+        while (nb_samples--) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0];
+            dpp->samplesA[0] = dpp->samplesA[1];
+            out_left[0] = tmp = (dpp->samplesA[1] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+            dpp->sumA += dpp->weightA;
+
+            sam = dpp->samplesB[0];
+            dpp->samplesB[0] = dpp->samplesB[1];
+            out_right[0] = tmp = (dpp->samplesB[1] = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    case 17:
+        while (nb_samples--) {
+            int32_t sam, tmp;
+
+            sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            dpp->samplesA[1] = dpp->samplesA[0];
+            out_left[0] = tmp = (dpp->samplesA[0] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+            dpp->sumA += dpp->weightA;
+
+            sam = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+            dpp->samplesB[1] = dpp->samplesB[0];
+            out_right[0] = tmp = (dpp->samplesB[0] = in_right[0]) - APPLY_WEIGHT (dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    case 18:
+        while (nb_samples--) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0] + ((dpp->samplesA[0] - dpp->samplesA[1]) >> 1);
+            dpp->samplesA[1] = dpp->samplesA[0];
+            out_left[0] = tmp = (dpp->samplesA[0] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+            dpp->sumA += dpp->weightA;
+
+            sam = dpp->samplesB[0] + ((dpp->samplesB[0] - dpp->samplesB[1]) >> 1);
+            dpp->samplesB[1] = dpp->samplesB[0];
+            out_right[0] = tmp = (dpp->samplesB[0] = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    default: {
+        int k = dpp->value & (MAX_TERM - 1);
+
+        while (nb_samples--) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[m];
+            out_left[0] = tmp = (dpp->samplesA[k] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+            dpp->sumA += dpp->weightA;
+
+            sam = dpp->samplesB[m];
+            out_right[0] = tmp = (dpp->samplesB[k] = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+            m = (m + 1) & (MAX_TERM - 1);
+            k = (k + 1) & (MAX_TERM - 1);
+        }
+
+        if (m) {
+            int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+            int k;
+
+            memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+            memcpy(temp_B, dpp->samplesB, sizeof(dpp->samplesB));
+
+            for (k = 0; k < MAX_TERM; k++) {
+                dpp->samplesA[k] = temp_A[m];
+                dpp->samplesB[k] = temp_B[m];
+                m = (m + 1) & (MAX_TERM - 1);
+            }
+        }
+        break;
+        }
+    case -1:
+        while (nb_samples--) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            out_left[0] = tmp = (sam_B = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+            dpp->sumA += dpp->weightA;
+
+            out_right[0] = tmp = (dpp->samplesA[0] = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    case -2:
+        while (nb_samples--) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_B = dpp->samplesB[0];
+            out_right[0] = tmp = (sam_A = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+            dpp->sumB += dpp->weightB;
+
+            out_left[0] = tmp = (dpp->samplesB[0] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+            dpp->sumA += dpp->weightA;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    case -3:
+        while (nb_samples--) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            sam_B = dpp->samplesB[0];
+
+            dpp->samplesA[0] = tmp = in_right[0];
+            out_right[0] = tmp -= APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+            dpp->sumB += dpp->weightB;
+
+            dpp->samplesB[0] = tmp = in_left[0];
+            out_left[0] = tmp -= APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+            dpp->sumA += dpp->weightA;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    }
+}
+
+static void reverse_decorr(struct Decorr *dpp)
+{
+    if (dpp->value > MAX_TERM) {
+        int32_t sam_A, sam_B;
+
+        if (dpp->value & 1) {
+            sam_A = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            sam_B = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+        } else {
+            sam_A = (3 * dpp->samplesA[0] - dpp->samplesA[1]) >> 1;
+            sam_B = (3 * dpp->samplesB[0] - dpp->samplesB[1]) >> 1;
+        }
+
+        dpp->samplesA[1] = dpp->samplesA[0];
+        dpp->samplesB[1] = dpp->samplesB[0];
+        dpp->samplesA[0] = sam_A;
+        dpp->samplesB[0] = sam_B;
+
+        if (dpp->value & 1) {
+            sam_A = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            sam_B = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+        } else {
+            sam_A = (3 * dpp->samplesA[0] - dpp->samplesA[1]) >> 1;
+            sam_B = (3 * dpp->samplesB[0] - dpp->samplesB[1]) >> 1;
+        }
+
+        dpp->samplesA[1] = sam_A;
+        dpp->samplesB[1] = sam_B;
+    } else if (dpp->value > 1) {
+        int i, j, k;
+
+        for (i = 0, j = dpp->value - 1, k = 0; k < dpp->value / 2; i++, j--, k++) {
+            i &= (MAX_TERM - 1);
+            j &= (MAX_TERM - 1);
+            dpp->samplesA[i] ^= dpp->samplesA[j];
+            dpp->samplesA[j] ^= dpp->samplesA[i];
+            dpp->samplesA[i] ^= dpp->samplesA[j];
+            dpp->samplesB[i] ^= dpp->samplesB[j];
+            dpp->samplesB[j] ^= dpp->samplesB[i];
+            dpp->samplesB[i] ^= dpp->samplesB[j];
+        }
+    }
+}
+
+static void decorr_stereo_quick(int32_t *in_left,  int32_t *in_right,
+                                int32_t *out_left, int32_t *out_right,
+                                int nb_samples, struct Decorr *dpp)
+{
+    int m = 0, i;
+
+    dpp->weightA = restore_weight(store_weight(dpp->weightA));
+    dpp->weightB = restore_weight(store_weight(dpp->weightB));
+
+    for (i = 0; i < MAX_TERM; i++) {
+        dpp->samplesA[i] = wp_exp2(log2s(dpp->samplesA[i]));
+        dpp->samplesB[i] = wp_exp2(log2s(dpp->samplesB[i]));
+    }
+
+    switch (dpp->value) {
+    case 2:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0];
+            dpp->samplesA[0] = dpp->samplesA[1];
+            out_left[i] = tmp = (dpp->samplesA[1] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[0];
+            dpp->samplesB[0] = dpp->samplesB[1];
+            out_right[i] = tmp = (dpp->samplesB[1] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    case 17:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            dpp->samplesA[1] = dpp->samplesA[0];
+            out_left[i] = tmp = (dpp->samplesA[0] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+            dpp->samplesB[1] = dpp->samplesB[0];
+            out_right[i] = tmp = (dpp->samplesB[0] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    case 18:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0] + ((dpp->samplesA[0] - dpp->samplesA[1]) >> 1);
+            dpp->samplesA[1] = dpp->samplesA[0];
+            out_left[i] = tmp = (dpp->samplesA[0] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[0] + ((dpp->samplesB[0] - dpp->samplesB[1]) >> 1);
+            dpp->samplesB[1] = dpp->samplesB[0];
+            out_right[i] = tmp = (dpp->samplesB[0] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    default: {
+        int k = dpp->value & (MAX_TERM - 1);
+
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[m];
+            out_left[i] = tmp = (dpp->samplesA[k] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[m];
+            out_right[i] = tmp = (dpp->samplesB[k] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+
+            m = (m + 1) & (MAX_TERM - 1);
+            k = (k + 1) & (MAX_TERM - 1);
+        }
+
+        if (m) {
+            int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+            int k;
+
+            memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+            memcpy(temp_B, dpp->samplesB, sizeof(dpp->samplesB));
+
+            for (k = 0; k < MAX_TERM; k++) {
+                dpp->samplesA[k] = temp_A[m];
+                dpp->samplesB[k] = temp_B[m];
+                m = (m + 1) & (MAX_TERM - 1);
+            }
+        }
+        break;
+    }
+    case -1:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            out_left[i] = tmp = (sam_B = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+
+            out_right[i] = tmp = (dpp->samplesA[0] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+        }
+        break;
+    case -2:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_B = dpp->samplesB[0];
+            out_right[i] = tmp = (sam_A = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            out_left[i] = tmp = (dpp->samplesB[0] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    case -3:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            sam_B = dpp->samplesB[0];
+
+            dpp->samplesA[0] = tmp = in_right[i];
+            out_right[i] = tmp -= APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            dpp->samplesB[0] = tmp = in_left[i];
+            out_left[i] = tmp -= APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    }
+}
+
+static void decorr_stereo_buffer(WavPackExtraInfo *info,
+                                 int32_t *in_left,  int32_t *in_right,
+                                 int32_t *out_left, int32_t *out_right,
+                                 int nb_samples, int tindex)
+{
+    struct Decorr dp = {0}, *dppi = info->dps + tindex;
+    int delta = dppi->delta, pre_delta;
+    int term = dppi->value;
+
+    if (delta == 7)
+        pre_delta = 7;
+    else if (delta < 2)
+        pre_delta = 3;
+    else
+        pre_delta = delta + 1;
+
+    dp.value = term;
+    dp.delta = pre_delta;
+    decorr_stereo(in_left, in_right, out_left, out_right,
+                  FFMIN(2048, nb_samples), &dp, -1);
+    dp.delta = delta;
+
+    if (tindex == 0) {
+        reverse_decorr(&dp);
+    } else {
+        CLEAR(dp.samplesA);
+        CLEAR(dp.samplesB);
+    }
+
+    memcpy(dppi->samplesA, dp.samplesA, sizeof(dp.samplesA));
+    memcpy(dppi->samplesB, dp.samplesB, sizeof(dp.samplesB));
+    dppi->weightA = dp.weightA;
+    dppi->weightB = dp.weightB;
+
+    if (delta == 0) {
+        dp.delta = 1;
+        decorr_stereo(in_left, in_right, out_left, out_right, nb_samples, &dp, 1);
+        dp.delta = 0;
+        memcpy(dp.samplesA, dppi->samplesA, sizeof(dp.samplesA));
+        memcpy(dp.samplesB, dppi->samplesB, sizeof(dp.samplesB));
+        dppi->weightA = dp.weightA = dp.sumA / nb_samples;
+        dppi->weightB = dp.weightB = dp.sumB / nb_samples;
+    }
+
+    if (info->gt16bit)
+        decorr_stereo(in_left, in_right, out_left, out_right,
+                           nb_samples, &dp, 1);
+    else
+        decorr_stereo_quick(in_left, in_right, out_left, out_right,
+                            nb_samples, &dp);
+}
+
+static void sort_stereo(WavPackEncodeContext *s, WavPackExtraInfo *info)
+{
+    int reversed = 1;
+    uint32_t bits;
+
+    while (reversed) {
+        int ri, i;
+
+        memcpy(info->dps, s->decorr_passes, sizeof(s->decorr_passes));
+        reversed = 0;
+
+        for (ri = 0; ri < info->nterms && s->decorr_passes[ri].value; ri++) {
+
+            if (ri + 1 >= info->nterms || !s->decorr_passes[ri+1].value)
+                break;
+
+            if (s->decorr_passes[ri].value == s->decorr_passes[ri+1].value) {
+                decorr_stereo_buffer(info,
+                                     s->sampleptrs[ri  ][0], s->sampleptrs[ri  ][1],
+                                     s->sampleptrs[ri+1][0], s->sampleptrs[ri+1][1],
+                                     s->block_samples, ri);
+                continue;
+            }
+
+            info->dps[ri  ] = s->decorr_passes[ri+1];
+            info->dps[ri+1] = s->decorr_passes[ri  ];
+
+            for (i = ri; i < info->nterms && s->decorr_passes[i].value; i++)
+                decorr_stereo_buffer(info,
+                                     s->sampleptrs[i  ][0], s->sampleptrs[i  ][1],
+                                     s->sampleptrs[i+1][0], s->sampleptrs[i+1][1],
+                                     s->block_samples, i);
+
+            bits = log2stereo(s->sampleptrs[i][0], s->sampleptrs[i][1],
+                              s->block_samples, info->log_limit);
+
+            if (bits < info->best_bits) {
+                reversed = 1;
+                info->best_bits = bits;
+                CLEAR(s->decorr_passes);
+                memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+                memcpy(s->sampleptrs[info->nterms + 1][0],
+                       s->sampleptrs[i][0], s->block_samples * 4);
+                memcpy(s->sampleptrs[info->nterms + 1][1],
+                       s->sampleptrs[i][1], s->block_samples * 4);
+            } else {
+                info->dps[ri  ] = s->decorr_passes[ri  ];
+                info->dps[ri+1] = s->decorr_passes[ri+1];
+                decorr_stereo_buffer(info,
+                                     s->sampleptrs[ri  ][0], s->sampleptrs[ri  ][1],
+                                     s->sampleptrs[ri+1][0], s->sampleptrs[ri+1][1],
+                                     s->block_samples, ri);
+            }
+        }
+    }
+}
+
+static void delta_stereo(WavPackEncodeContext *s, WavPackExtraInfo *info)
+{
+    int lower = 0, delta, d, i;
+    uint32_t bits;
+
+    if (!s->decorr_passes[0].value)
+        return;
+    delta = s->decorr_passes[0].delta;
+
+    for (d = delta - 1; d >= 0; d--) {
+        for (i = 0; i < info->nterms && s->decorr_passes[i].value; i++) {
+            info->dps[i].value = s->decorr_passes[i].value;
+            info->dps[i].delta = d;
+            decorr_stereo_buffer(info,
+                                 s->sampleptrs[i  ][0], s->sampleptrs[i  ][1],
+                                 s->sampleptrs[i+1][0], s->sampleptrs[i+1][1],
+                                 s->block_samples, i);
+        }
+
+        bits = log2stereo(s->sampleptrs[i][0], s->sampleptrs[i][1],
+                          s->block_samples, info->log_limit);
+        if (bits >= info->best_bits)
+            break;
+        lower = 1;
+        info->best_bits = bits;
+        CLEAR(s->decorr_passes);
+        memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+        memcpy(s->sampleptrs[info->nterms + 1][0], s->sampleptrs[i][0],
+               s->block_samples * 4);
+        memcpy(s->sampleptrs[info->nterms + 1][1], s->sampleptrs[i][1],
+               s->block_samples * 4);
+    }
+
+    for (d = delta + 1; !lower && d <= 7; d++) {
+        for (i = 0; i < info->nterms && s->decorr_passes[i].value; i++) {
+            info->dps[i].value = s->decorr_passes[i].value;
+            info->dps[i].delta = d;
+            decorr_stereo_buffer(info,
+                                 s->sampleptrs[i  ][0], s->sampleptrs[i  ][1],
+                                 s->sampleptrs[i+1][0], s->sampleptrs[i+1][1],
+                                 s->block_samples, i);
+        }
+
+        bits = log2stereo(s->sampleptrs[i][0], s->sampleptrs[i][1],
+                          s->block_samples, info->log_limit);
+
+        if (bits < info->best_bits) {
+            info->best_bits = bits;
+            CLEAR(s->decorr_passes);
+            memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+            memcpy(s->sampleptrs[info->nterms + 1][0],
+                   s->sampleptrs[i][0], s->block_samples * 4);
+            memcpy(s->sampleptrs[info->nterms + 1][1],
+                   s->sampleptrs[i][1], s->block_samples * 4);
+        }
+        else
+            break;
+    }
+}
+
+static void recurse_stereo(WavPackEncodeContext *s, WavPackExtraInfo *info,
+                           int depth, int delta, uint32_t input_bits)
+{
+    int term, branches = s->num_branches - depth;
+    int32_t *in_left, *in_right, *out_left, *out_right;
+    uint32_t term_bits[22], bits;
+
+    if (branches < 1 || depth + 1 == info->nterms)
+        branches = 1;
+
+    CLEAR(term_bits);
+    in_left   = s->sampleptrs[depth    ][0];
+    in_right  = s->sampleptrs[depth    ][1];
+    out_left  = s->sampleptrs[depth + 1][0];
+    out_right = s->sampleptrs[depth + 1][1];
+
+    for (term = -3; term <= 18; term++) {
+        if (!term || (term > 8 && term < 17))
+            continue;
+
+        if (term == 17 && branches == 1 && depth + 1 < info->nterms)
+            continue;
+
+        if (term == -1 || term == -2)
+            if (!(s->flags & WV_CROSS_DECORR))
+                continue;
+
+        if (!s->extra_flags && (term > 4 && term < 17))
+            continue;
+
+        info->dps[depth].value = term;
+        info->dps[depth].delta = delta;
+        decorr_stereo_buffer(info, in_left, in_right, out_left, out_right,
+                             s->block_samples, depth);
+        bits = log2stereo(out_left, out_right, s->block_samples, info->log_limit);
+
+        if (bits < info->best_bits) {
+            info->best_bits = bits;
+            CLEAR(s->decorr_passes);
+            memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * (depth + 1));
+            memcpy(s->sampleptrs[info->nterms + 1][0], s->sampleptrs[depth + 1][0],
+                   s->block_samples * 4);
+            memcpy(s->sampleptrs[info->nterms + 1][1], s->sampleptrs[depth + 1][1],
+                   s->block_samples * 4);
+        }
+
+        term_bits[term + 3] = bits;
+    }
+
+    while (depth + 1 < info->nterms && branches--) {
+        uint32_t local_best_bits = input_bits;
+        int best_term = 0, i;
+
+        for (i = 0; i < 22; i++)
+            if (term_bits[i] && term_bits[i] < local_best_bits) {
+                local_best_bits = term_bits[i];
+                best_term = i - 3;
+            }
+
+        if (!best_term)
+            break;
+
+        term_bits[best_term + 3] = 0;
+
+        info->dps[depth].value = best_term;
+        info->dps[depth].delta = delta;
+        decorr_stereo_buffer(info, in_left, in_right, out_left, out_right,
+                             s->block_samples, depth);
+
+        recurse_stereo(s, info, depth + 1, delta, local_best_bits);
+    }
+}
+
+static void analyze_stereo(WavPackEncodeContext *s,
+                           int32_t *in_left, int32_t *in_right,
+                           int do_samples)
+{
+    WavPackExtraInfo info;
+    int i;
+
+    info.gt16bit = ((s->flags & MAG_MASK) >> MAG_LSB) >= 16;
+
+    info.log_limit = (((s->flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
+    info.log_limit = FFMIN(6912, info.log_limit);
+
+    info.nterms = s->num_terms;
+
+    if (allocate_buffers2(s, s->num_terms))
+        return;
+
+    memcpy(info.dps, s->decorr_passes, sizeof(info.dps));
+    memcpy(s->sampleptrs[0][0], in_left,  s->block_samples * 4);
+    memcpy(s->sampleptrs[0][1], in_right, s->block_samples * 4);
+
+    for (i = 0; i < info.nterms && info.dps[i].value; i++)
+        if (info.gt16bit)
+            decorr_stereo(s->sampleptrs[i    ][0], s->sampleptrs[i    ][1],
+                          s->sampleptrs[i + 1][0], s->sampleptrs[i + 1][1],
+                          s->block_samples, info.dps + i, 1);
+        else
+            decorr_stereo_quick(s->sampleptrs[i    ][0], s->sampleptrs[i    ][1],
+                                s->sampleptrs[i + 1][0], s->sampleptrs[i + 1][1],
+                                s->block_samples, info.dps + i);
+
+    info.best_bits = log2stereo(s->sampleptrs[info.nterms][0], s->sampleptrs[info.nterms][1],
+                                s->block_samples, 0);
+
+    memcpy(s->sampleptrs[info.nterms + 1][0], s->sampleptrs[i][0], s->block_samples * 4);
+    memcpy(s->sampleptrs[info.nterms + 1][1], s->sampleptrs[i][1], s->block_samples * 4);
+
+    if (s->extra_flags & EXTRA_BRANCHES)
+        recurse_stereo(s, &info, 0, (int) floor(s->delta_decay + 0.5),
+                       log2stereo(s->sampleptrs[0][0], s->sampleptrs[0][1],
+                                  s->block_samples, 0));
+
+    if (s->extra_flags & EXTRA_SORT_FIRST)
+        sort_stereo(s, &info);
+
+    if (s->extra_flags & EXTRA_TRY_DELTAS) {
+        delta_stereo(s, &info);
+
+        if ((s->extra_flags & EXTRA_ADJUST_DELTAS) && s->decorr_passes[0].value)
+            s->delta_decay = (float)((s->delta_decay * 2.0 + s->decorr_passes[0].delta) / 3.0);
+        else
+            s->delta_decay = 2.0;
+    }
+
+    if (s->extra_flags & EXTRA_SORT_LAST)
+        sort_stereo(s, &info);
+
+    if (do_samples) {
+        memcpy(in_left,  s->sampleptrs[info.nterms + 1][0], s->block_samples * 4);
+        memcpy(in_right, s->sampleptrs[info.nterms + 1][1], s->block_samples * 4);
+    }
+
+    for (i = 0; i < info.nterms; i++)
+        if (!s->decorr_passes[i].value)
+            break;
+
+    s->num_terms = i;
+}
+
+static int wv_stereo(WavPackEncodeContext *s,
+                     int32_t *samples_l, int32_t *samples_r,
+                     int no_history, int do_samples)
+{
+    struct Decorr temp_decorr_pass, save_decorr_passes[MAX_TERMS] = {{0}};
+    int nb_samples = s->block_samples, ret;
+    int buf_size = sizeof(int32_t) * nb_samples;
+    int log_limit, force_js = 0, force_ts = 0, got_js = 0, pi, i;
+    uint32_t best_size = UINT32_MAX, size;
+
+    for (i = 0; i < nb_samples; i++)
+        if (samples_l[i] || samples_r[i])
+            break;
+
+    if (i == nb_samples) {
+        s->flags &= ~((uint32_t) WV_JOINT_STEREO);
+        CLEAR(s->decorr_passes);
+        CLEAR(s->w);
+        s->num_terms = 0;
+        return 0;
+    }
+
+    log_limit = (((s->flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
+    log_limit = FFMIN(6912, log_limit);
+
+    if (s->joint != -1) {
+        force_js =  s->joint;
+        force_ts = !s->joint;
+    }
+
+    if ((ret = allocate_buffers(s)) < 0)
+        return ret;
+
+    if (no_history || s->num_passes >= 7)
+        s->best_decorr = s->mask_decorr = 0;
+
+    for (pi = 0; pi < s->num_passes;) {
+        const WavPackDecorrSpec *wpds;
+        int nterms, c, j;
+
+        if (!pi)
+            c = s->best_decorr;
+        else {
+            if (s->mask_decorr == 0)
+                c = 0;
+            else
+                c = (s->best_decorr & (s->mask_decorr - 1)) | s->mask_decorr;
+
+            if (c == s->best_decorr) {
+                s->mask_decorr = s->mask_decorr ? ((s->mask_decorr << 1) & (s->num_decorrs - 1)) : 1;
+                continue;
+            }
+        }
+
+        wpds = &s->decorr_specs[c];
+        nterms = decorr_filter_nterms[s->decorr_filter];
+
+        while (1) {
+            if (force_js || (wpds->joint_stereo && !force_ts)) {
+                if (!got_js) {
+                    av_fast_padded_malloc(&s->js_left,  &s->js_left_size,  buf_size);
+                    av_fast_padded_malloc(&s->js_right, &s->js_right_size, buf_size);
+                    memcpy(s->js_left,  samples_l, buf_size);
+                    memcpy(s->js_right, samples_r, buf_size);
+
+                    for (i = 0; i < nb_samples; i++)
+                        s->js_right[i] += ((s->js_left[i] -= s->js_right[i]) >> 1);
+                    got_js = 1;
+                }
+
+                memcpy(s->temp_buffer[0][0], s->js_left,  buf_size);
+                memcpy(s->temp_buffer[0][1], s->js_right, buf_size);
+            } else {
+                memcpy(s->temp_buffer[0][0], samples_l, buf_size);
+                memcpy(s->temp_buffer[0][1], samples_r, buf_size);
+            }
+
+            CLEAR(save_decorr_passes);
+
+            for (j = 0; j < nterms; j++) {
+                CLEAR(temp_decorr_pass);
+                temp_decorr_pass.delta = wpds->delta;
+                temp_decorr_pass.value = wpds->terms[j];
+
+                if (temp_decorr_pass.value < 0 && !(s->flags & WV_CROSS_DECORR))
+                    temp_decorr_pass.value = -3;
+
+                decorr_stereo(s->temp_buffer[ j&1][0], s->temp_buffer[ j&1][1],
+                              s->temp_buffer[~j&1][0], s->temp_buffer[~j&1][1],
+                              FFMIN(2048, nb_samples), &temp_decorr_pass, -1);
+
+                if (j) {
+                    CLEAR(temp_decorr_pass.samplesA);
+                    CLEAR(temp_decorr_pass.samplesB);
+                } else {
+                    reverse_decorr(&temp_decorr_pass);
+                }
+
+                memcpy(save_decorr_passes + j, &temp_decorr_pass, sizeof(struct Decorr));
+
+                if (((s->flags & MAG_MASK) >> MAG_LSB) >= 16)
+                    decorr_stereo(s->temp_buffer[ j&1][0], s->temp_buffer[ j&1][1],
+                                  s->temp_buffer[~j&1][0], s->temp_buffer[~j&1][1],
+                                  nb_samples, &temp_decorr_pass, 1);
+                else
+                    decorr_stereo_quick(s->temp_buffer[ j&1][0], s->temp_buffer[ j&1][1],
+                                        s->temp_buffer[~j&1][0], s->temp_buffer[~j&1][1],
+                                        nb_samples, &temp_decorr_pass);
+            }
+
+            size = log2stereo(s->temp_buffer[j&1][0], s->temp_buffer[j&1][1],
+                              nb_samples, log_limit);
+            if (size != UINT32_MAX || !nterms)
+                break;
+            nterms >>= 1;
+        }
+
+        if (size < best_size) {
+            memcpy(s->best_buffer[0], s->temp_buffer[j&1][0], buf_size);
+            memcpy(s->best_buffer[1], s->temp_buffer[j&1][1], buf_size);
+            memcpy(s->decorr_passes, save_decorr_passes, sizeof(struct Decorr) * MAX_TERMS);
+            s->num_terms = nterms;
+            s->best_decorr = c;
+            best_size = size;
+        }
+
+        if (pi++)
+            s->mask_decorr = s->mask_decorr ? ((s->mask_decorr << 1) & (s->num_decorrs - 1)) : 1;
+    }
+
+    if (force_js || (s->decorr_specs[s->best_decorr].joint_stereo && !force_ts))
+        s->flags |= WV_JOINT_STEREO;
+    else
+        s->flags &= ~((uint32_t) WV_JOINT_STEREO);
+
+    if (s->extra_flags) {
+        if (s->flags & WV_JOINT_STEREO) {
+            analyze_stereo(s, s->js_left, s->js_right, do_samples);
+
+            if (do_samples) {
+                memcpy(samples_l, s->js_left,  buf_size);
+                memcpy(samples_r, s->js_right, buf_size);
+            }
+        } else
+            analyze_stereo(s, samples_l, samples_r, do_samples);
+    } else if (do_samples) {
+        memcpy(samples_l, s->best_buffer[0], buf_size);
+        memcpy(samples_r, s->best_buffer[1], buf_size);
+    }
+
+    if (s->extra_flags || no_history ||
+        s->joint_stereo != s->decorr_specs[s->best_decorr].joint_stereo) {
+        s->joint_stereo = s->decorr_specs[s->best_decorr].joint_stereo;
+        CLEAR(s->w);
+        scan_word(s, &s->w.c[0], s->best_buffer[0], nb_samples, -1);
+        scan_word(s, &s->w.c[1], s->best_buffer[1], nb_samples, -1);
+    }
+    return 0;
+}
+
+#define count_bits(av) ( \
+ (av) < (1 << 8) ? nbits_table[av] : \
+  ( \
+   (av) < (1 << 16) ? nbits_table[(av) >> 8] + 8 : \
+   ((av) < (1 << 24) ? nbits_table[(av) >> 16] + 16 : nbits_table[(av) >> 24] + 24) \
+  ) \
+)
+
+static void encode_flush(WavPackEncodeContext *s)
+{
+    WavPackWords *w = &s->w;
+    PutBitContext *pb = &s->pb;
+
+    if (w->zeros_acc) {
+        int cbits = count_bits(w->zeros_acc);
+
+        do {
+            if (cbits > 31) {
+                put_bits(pb, 31, 0x7FFFFFFF);
+                cbits -= 31;
+            } else {
+                put_bits(pb, cbits, (1 << cbits) - 1);
+                cbits = 0;
+            }
+        } while (cbits);
+
+        put_bits(pb, 1, 0);
+
+        while (w->zeros_acc > 1) {
+            put_bits(pb, 1, w->zeros_acc & 1);
+            w->zeros_acc >>= 1;
+        }
+
+        w->zeros_acc = 0;
+    }
+
+    if (w->holding_one) {
+        if (w->holding_one >= 16) {
+            int cbits;
+
+            put_bits(pb, 16, (1 << 16) - 1);
+            put_bits(pb, 1, 0);
+            w->holding_one -= 16;
+            cbits = count_bits(w->holding_one);
+
+            do {
+                if (cbits > 31) {
+                    put_bits(pb, 31, 0x7FFFFFFF);
+                    cbits -= 31;
+                } else {
+                    put_bits(pb, cbits, (1 << cbits) - 1);
+                    cbits = 0;
+                }
+            } while (cbits);
+
+            put_bits(pb, 1, 0);
+
+            while (w->holding_one > 1) {
+                put_bits(pb, 1, w->holding_one & 1);
+                w->holding_one >>= 1;
+            }
+
+            w->holding_zero = 0;
+        } else {
+            put_bits(pb, w->holding_one, (1 << w->holding_one) - 1);
+        }
+
+        w->holding_one = 0;
+    }
+
+    if (w->holding_zero) {
+        put_bits(pb, 1, 0);
+        w->holding_zero = 0;
+    }
+
+    if (w->pend_count) {
+        put_bits(pb, w->pend_count, w->pend_data);
+        w->pend_data = w->pend_count = 0;
+    }
+}
+
+static void wavpack_encode_sample(WavPackEncodeContext *s, WvChannel *c, int32_t sample)
+{
+    WavPackWords *w = &s->w;
+    uint32_t ones_count, low, high;
+    int sign = sample < 0;
+
+    if (s->w.c[0].median[0] < 2 && !s->w.holding_zero && s->w.c[1].median[0] < 2) {
+        if (w->zeros_acc) {
+            if (sample)
+                encode_flush(s);
+            else {
+                w->zeros_acc++;
+                return;
+            }
+        } else if (sample) {
+            put_bits(&s->pb, 1, 0);
+        } else {
+            CLEAR(s->w.c[0].median);
+            CLEAR(s->w.c[1].median);
+            w->zeros_acc = 1;
+            return;
+        }
+    }
+
+    if (sign)
+        sample = ~sample;
+
+    if (sample < (int32_t) GET_MED(0)) {
+        ones_count = low = 0;
+        high = GET_MED(0) - 1;
+        DEC_MED(0);
+    } else {
+        low = GET_MED(0);
+        INC_MED(0);
+
+        if (sample - low < GET_MED(1)) {
+            ones_count = 1;
+            high = low + GET_MED(1) - 1;
+            DEC_MED(1);
+        } else {
+            low += GET_MED(1);
+            INC_MED(1);
+
+            if (sample - low < GET_MED(2)) {
+                ones_count = 2;
+                high = low + GET_MED(2) - 1;
+                DEC_MED(2);
+            } else {
+                ones_count = 2 + (sample - low) / GET_MED(2);
+                low += (ones_count - 2) * GET_MED(2);
+                high = low + GET_MED(2) - 1;
+                INC_MED(2);
+            }
+        }
+    }
+
+    if (w->holding_zero) {
+        if (ones_count)
+            w->holding_one++;
+
+        encode_flush(s);
+
+        if (ones_count) {
+            w->holding_zero = 1;
+            ones_count--;
+        } else
+            w->holding_zero = 0;
+    } else
+        w->holding_zero = 1;
+
+    w->holding_one = ones_count * 2;
+
+    if (high != low) {
+        uint32_t maxcode = high - low, code = sample - low;
+        int bitcount = count_bits(maxcode);
+        uint32_t extras = (1 << bitcount) - maxcode - 1;
+
+        if (code < extras) {
+            w->pend_data |= code << w->pend_count;
+            w->pend_count += bitcount - 1;
+        } else {
+            w->pend_data |= ((code + extras) >> 1) << w->pend_count;
+            w->pend_count += bitcount - 1;
+            w->pend_data |= ((code + extras) & 1) << w->pend_count++;
+        }
+    }
+
+    w->pend_data |= ((int32_t) sign << w->pend_count++);
+
+    if (!w->holding_zero)
+        encode_flush(s);
+}
+
+static void pack_int32(WavPackEncodeContext *s,
+                       int32_t *samples_l, int32_t *samples_r,
+                       int nb_samples)
+{
+    const int sent_bits = s->int32_sent_bits;
+    PutBitContext *pb = &s->pb;
+    int i, pre_shift;
+
+    pre_shift = s->int32_zeros + s->int32_ones + s->int32_dups;
+
+    if (!sent_bits)
+        return;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++) {
+            put_sbits(pb, sent_bits, samples_l[i] >> pre_shift);
+        }
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            put_sbits(pb, sent_bits, samples_l[i] >> pre_shift);
+            put_sbits(pb, sent_bits, samples_r[i] >> pre_shift);
+        }
+    }
+}
+
+static void pack_float_sample(WavPackEncodeContext *s, int32_t *sample)
+{
+    const int max_exp = s->float_max_exp;
+    PutBitContext *pb = &s->pb;
+    int32_t value, shift_count;
+
+    if (get_exponent(*sample) == 255) {
+        if (get_mantissa(*sample)) {
+            put_bits(pb, 1, 1);
+            put_bits(pb, 23, get_mantissa(*sample));
+        } else {
+            put_bits(pb, 1, 0);
+        }
+
+        value = 0x1000000;
+        shift_count = 0;
+    } else if (get_exponent(*sample)) {
+        shift_count = max_exp - get_exponent(*sample);
+        value = 0x800000 + get_mantissa(*sample);
+    } else {
+        shift_count = max_exp ? max_exp - 1 : 0;
+        value = get_mantissa(*sample);
+    }
+
+    if (shift_count < 25)
+        value >>= shift_count;
+    else
+        value = 0;
+
+    if (!value) {
+        if (s->float_flags & FLOAT_ZEROS_SENT) {
+            if (get_exponent(*sample) || get_mantissa(*sample)) {
+                put_bits(pb, 1, 1);
+                put_bits(pb, 23, get_mantissa(*sample));
+
+                if (max_exp >= 25)
+                    put_bits(pb, 8, get_exponent(*sample));
+
+                put_bits(pb, 1, get_sign(*sample));
+            } else {
+                put_bits(pb, 1, 0);
+
+                if (s->float_flags & FLOAT_NEG_ZEROS)
+                    put_bits(pb, 1, get_sign(*sample));
+            }
+        }
+    } else if (shift_count) {
+        if (s->float_flags & FLOAT_SHIFT_SENT) {
+            int32_t data = get_mantissa(*sample) & ((1 << shift_count) - 1);
+            put_bits(pb, shift_count, data);
+        } else if (s->float_flags & FLOAT_SHIFT_SAME) {
+            put_bits(pb, 1, get_mantissa(*sample) & 1);
+        }
+    }
+}
+
+static void pack_float(WavPackEncodeContext *s,
+                       int32_t *samples_l, int32_t *samples_r,
+                       int nb_samples)
+{
+    int i;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++)
+            pack_float_sample(s, &samples_l[i]);
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            pack_float_sample(s, &samples_l[i]);
+            pack_float_sample(s, &samples_r[i]);
+        }
+    }
+}
+
+static void decorr_stereo_pass2(struct Decorr *dpp,
+                                int32_t *samples_l, int32_t *samples_r,
+                                int nb_samples)
+{
+    int i, m, k;
+
+    switch (dpp->value) {
+    case 17:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            dpp->samplesA[1] = dpp->samplesA[0];
+            samples_l[i] = tmp = (dpp->samplesA[0] = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+            dpp->samplesB[1] = dpp->samplesB[0];
+            samples_r[i] = tmp = (dpp->samplesB[0] = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    case 18:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0] + ((dpp->samplesA[0] - dpp->samplesA[1]) >> 1);
+            dpp->samplesA[1] = dpp->samplesA[0];
+            samples_l[i] = tmp = (dpp->samplesA[0] = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[0] + ((dpp->samplesB[0] - dpp->samplesB[1]) >> 1);
+            dpp->samplesB[1] = dpp->samplesB[0];
+            samples_r[i] = tmp = (dpp->samplesB[0] = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    default:
+        for (m = 0, k = dpp->value & (MAX_TERM - 1), i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[m];
+            samples_l[i] = tmp = (dpp->samplesA[k] = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[m];
+            samples_r[i] = tmp = (dpp->samplesB[k] = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+
+            m = (m + 1) & (MAX_TERM - 1);
+            k = (k + 1) & (MAX_TERM - 1);
+        }
+        if (m) {
+            int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+
+            memcpy(temp_A, dpp->samplesA, sizeof (dpp->samplesA));
+            memcpy(temp_B, dpp->samplesB, sizeof (dpp->samplesB));
+
+            for (k = 0; k < MAX_TERM; k++) {
+                dpp->samplesA[k] = temp_A[m];
+                dpp->samplesB[k] = temp_B[m];
+                m = (m + 1) & (MAX_TERM - 1);
+            }
+        }
+        break;
+    case -1:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            samples_l[i] = tmp = (sam_B = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+
+            samples_r[i] = tmp = (dpp->samplesA[0] = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+        }
+        break;
+    case -2:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_B = dpp->samplesB[0];
+            samples_r[i] = tmp = (sam_A = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            samples_l[i] = tmp = (dpp->samplesB[0] = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    case -3:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            sam_B = dpp->samplesB[0];
+
+            dpp->samplesA[0] = tmp = samples_r[i];
+            samples_r[i] = tmp -= APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            dpp->samplesB[0] = tmp = samples_l[i];
+            samples_l[i] = tmp -= APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    }
+}
+
+#define update_weight_d2(weight, delta, source, result) \
+    if (source && result) \
+        weight -= (((source ^ result) >> 29) & 4) - 2;
+
+#define update_weight_clip_d2(weight, delta, source, result) \
+    if (source && result) { \
+        const int32_t s = (source ^ result) >> 31; \
+        if ((weight = (weight ^ s) + (2 - s)) > 1024) weight = 1024; \
+        weight = (weight ^ s) - s; \
+    }
+
+static void decorr_stereo_pass_id2(struct Decorr *dpp,
+                                   int32_t *samples_l, int32_t *samples_r,
+                                   int nb_samples)
+{
+    int i, m, k;
+
+    switch (dpp->value) {
+    case 17:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            dpp->samplesA[1] = dpp->samplesA[0];
+            samples_l[i] = tmp = (dpp->samplesA[0] = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            update_weight_d2(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+            dpp->samplesB[1] = dpp->samplesB[0];
+            samples_r[i] = tmp = (dpp->samplesB[0] = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            update_weight_d2(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    case 18:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0] + ((dpp->samplesA[0] - dpp->samplesA[1]) >> 1);
+            dpp->samplesA[1] = dpp->samplesA[0];
+            samples_l[i] = tmp = (dpp->samplesA[0] = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            update_weight_d2(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[0] + ((dpp->samplesB[0] - dpp->samplesB[1]) >> 1);
+            dpp->samplesB[1] = dpp->samplesB[0];
+            samples_r[i] = tmp = (dpp->samplesB[0] = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            update_weight_d2(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    default:
+        for (m = 0, k = dpp->value & (MAX_TERM - 1), i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[m];
+            samples_l[i] = tmp = (dpp->samplesA[k] = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            update_weight_d2(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[m];
+            samples_r[i] = tmp = (dpp->samplesB[k] = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            update_weight_d2(dpp->weightB, dpp->delta, sam, tmp);
+
+            m = (m + 1) & (MAX_TERM - 1);
+            k = (k + 1) & (MAX_TERM - 1);
+        }
+
+        if (m) {
+            int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+
+            memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+            memcpy(temp_B, dpp->samplesB, sizeof(dpp->samplesB));
+
+            for (k = 0; k < MAX_TERM; k++) {
+                dpp->samplesA[k] = temp_A[m];
+                dpp->samplesB[k] = temp_B[m];
+                m = (m + 1) & (MAX_TERM - 1);
+            }
+        }
+        break;
+    case -1:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            samples_l[i] = tmp = (sam_B = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            update_weight_clip_d2(dpp->weightA, dpp->delta, sam_A, tmp);
+
+            samples_r[i] = tmp = (dpp->samplesA[0] = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            update_weight_clip_d2(dpp->weightB, dpp->delta, sam_B, tmp);
+        }
+        break;
+    case -2:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_B = dpp->samplesB[0];
+            samples_r[i] = tmp = (sam_A = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            update_weight_clip_d2(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            samples_l[i] = tmp = (dpp->samplesB[0] = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            update_weight_clip_d2(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    case -3:
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            sam_B = dpp->samplesB[0];
+
+            dpp->samplesA[0] = tmp = samples_r[i];
+            samples_r[i] = tmp -= APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            update_weight_clip_d2(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            dpp->samplesB[0] = tmp = samples_l[i];
+            samples_l[i] = tmp -= APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            update_weight_clip_d2(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    }
+}
+
+static void put_metadata_block(PutByteContext *pb, int flags, int size)
+{
+    if (size & 1)
+        flags |= WP_IDF_ODD;
+
+    bytestream2_put_byte(pb, flags);
+    bytestream2_put_byte(pb, (size + 1) >> 1);
+}
+
+static int wavpack_encode_block(WavPackEncodeContext *s,
+                                int32_t *samples_l, int32_t *samples_r,
+                                uint8_t *out, int out_size)
+{
+    int block_size, start, end, data_size, tcount, temp, m = 0;
+    int i, j, ret = 0, got_extra = 0, nb_samples = s->block_samples;
+    uint32_t crc = 0xffffffffu;
+    struct Decorr *dpp;
+    PutByteContext pb;
+
+    if (s->flags & WV_MONO_DATA) {
+        CLEAR(s->w);
+    }
+    if (!(s->flags & WV_MONO) && s->optimize_mono) {
+        int32_t lor = 0, diff = 0;
+
+        for (i = 0; i < nb_samples; i++) {
+            lor  |= samples_l[i] | samples_r[i];
+            diff |= samples_l[i] - samples_r[i];
+
+            if (lor && diff)
+                break;
+        }
+
+        if (i == nb_samples && lor && !diff) {
+            s->flags &= ~(WV_JOINT_STEREO | WV_CROSS_DECORR);
+            s->flags |= WV_FALSE_STEREO;
+
+            if (!s->false_stereo) {
+                s->false_stereo = 1;
+                s->num_terms = 0;
+                CLEAR(s->w);
+            }
+        } else if (s->false_stereo) {
+            s->false_stereo = 0;
+            s->num_terms = 0;
+            CLEAR(s->w);
+        }
+    }
+
+    if (s->flags & SHIFT_MASK) {
+        int shift = (s->flags & SHIFT_MASK) >> SHIFT_LSB;
+        int mag = (s->flags & MAG_MASK) >> MAG_LSB;
+
+        if (s->flags & WV_MONO_DATA)
+            shift_mono(samples_l, nb_samples, shift);
+        else
+            shift_stereo(samples_l, samples_r, nb_samples, shift);
+
+        if ((mag -= shift) < 0)
+            s->flags &= ~MAG_MASK;
+        else
+            s->flags -= (1 << MAG_LSB) * shift;
+    }
+
+    if ((s->flags & WV_FLOAT_DATA) || (s->flags & MAG_MASK) >> MAG_LSB >= 24) {
+        av_fast_padded_malloc(&s->orig_l, &s->orig_l_size, sizeof(int32_t) * nb_samples);
+        memcpy(s->orig_l, samples_l, sizeof(int32_t) * nb_samples);
+        if (!(s->flags & WV_MONO_DATA)) {
+            av_fast_padded_malloc(&s->orig_r, &s->orig_r_size, sizeof(int32_t) * nb_samples);
+            memcpy(s->orig_r, samples_r, sizeof(int32_t) * nb_samples);
+        }
+
+        if (s->flags & WV_FLOAT_DATA)
+            got_extra = scan_float(s, samples_l, samples_r, nb_samples);
+        else
+            got_extra = scan_int32(s, samples_l, samples_r, nb_samples);
+        s->num_terms = 0;
+    } else {
+        scan_int23(s, samples_l, samples_r, nb_samples);
+        if (s->shift != s->int32_zeros + s->int32_ones + s->int32_dups) {
+            s->shift = s->int32_zeros + s->int32_ones + s->int32_dups;
+            s->num_terms = 0;
+        }
+    }
+
+    if (!s->num_passes && !s->num_terms) {
+        s->num_passes = 1;
+
+        if (s->flags & WV_MONO_DATA)
+            ret = wv_mono(s, samples_l, 1, 0);
+        else
+            ret = wv_stereo(s, samples_l, samples_r, 1, 0);
+
+        s->num_passes = 0;
+    }
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++)
+            crc += (crc << 1) + samples_l[i];
+
+        if (s->num_passes)
+            ret = wv_mono(s, samples_l, !s->num_terms, 1);
+    } else {
+        for (i = 0; i < nb_samples; i++)
+            crc += (crc << 3) + (samples_l[i] << 1) + samples_l[i] + samples_r[i];
+
+        if (s->num_passes)
+            ret = wv_stereo(s, samples_l, samples_r, !s->num_terms, 1);
+    }
+    if (ret < 0)
+        return ret;
+
+    if (!s->ch_offset)
+        s->flags |= WV_INITIAL_BLOCK;
+
+    s->ch_offset += 1 + !(s->flags & WV_MONO);
+
+    if (s->ch_offset == s->avctx->channels)
+        s->flags |= WV_FINAL_BLOCK;
+
+    bytestream2_init_writer(&pb, out, out_size);
+    bytestream2_put_le32(&pb, MKTAG('w', 'v', 'p', 'k'));
+    bytestream2_put_le32(&pb, 0);
+    bytestream2_put_le16(&pb, 0x410);
+    bytestream2_put_le16(&pb, 0);
+    bytestream2_put_le32(&pb, 0);
+    bytestream2_put_le32(&pb, s->sample_index);
+    bytestream2_put_le32(&pb, nb_samples);
+    bytestream2_put_le32(&pb, s->flags);
+    bytestream2_put_le32(&pb, crc);
+
+    if (s->flags & WV_INITIAL_BLOCK &&
+        s->avctx->channel_layout != AV_CH_LAYOUT_MONO &&
+        s->avctx->channel_layout != AV_CH_LAYOUT_STEREO) {
+        put_metadata_block(&pb, WP_ID_CHANINFO, 5);
+        bytestream2_put_byte(&pb, s->avctx->channels);
+        bytestream2_put_le32(&pb, s->avctx->channel_layout);
+        bytestream2_put_byte(&pb, 0);
+    }
+
+    if ((s->flags & SRATE_MASK) == SRATE_MASK) {
+        put_metadata_block(&pb, WP_ID_SAMPLE_RATE, 3);
+        bytestream2_put_le24(&pb, s->avctx->sample_rate);
+        bytestream2_put_byte(&pb, 0);
+    }
+
+    put_metadata_block(&pb, WP_ID_DECTERMS, s->num_terms);
+    for (i = 0; i < s->num_terms; i++) {
+        struct Decorr *dpp = &s->decorr_passes[i];
+        bytestream2_put_byte(&pb, ((dpp->value + 5) & 0x1f) | ((dpp->delta << 5) & 0xe0));
+    }
+    if (s->num_terms & 1)
+        bytestream2_put_byte(&pb, 0);
+
+#define WRITE_DECWEIGHT(type) do {            \
+        temp = store_weight(type);    \
+        bytestream2_put_byte(&pb, temp);      \
+        type = restore_weight(temp);  \
+    } while (0)
+
+    bytestream2_put_byte(&pb, WP_ID_DECWEIGHTS);
+    bytestream2_put_byte(&pb, 0);
+    start = bytestream2_tell_p(&pb);
+    for (i = s->num_terms - 1; i >= 0; --i) {
+        struct Decorr *dpp = &s->decorr_passes[i];
+
+        if (store_weight(dpp->weightA) ||
+            (!(s->flags & WV_MONO_DATA) && store_weight(dpp->weightB)))
+                break;
+    }
+    tcount = i + 1;
+    for (i = 0; i < s->num_terms; i++) {
+        struct Decorr *dpp = &s->decorr_passes[i];
+        if (i < tcount) {
+            WRITE_DECWEIGHT(dpp->weightA);
+            if (!(s->flags & WV_MONO_DATA))
+                WRITE_DECWEIGHT(dpp->weightB);
+        } else {
+            dpp->weightA = dpp->weightB = 0;
+        }
+    }
+    end = bytestream2_tell_p(&pb);
+    out[start - 2] = WP_ID_DECWEIGHTS | (((end - start) & 1) ? WP_IDF_ODD: 0);
+    out[start - 1] = (end - start + 1) >> 1;
+    if ((end - start) & 1)
+        bytestream2_put_byte(&pb, 0);
+
+#define WRITE_DECSAMPLE(type) do {        \
+        temp = log2s(type);               \
+        type = wp_exp2(temp);             \
+        bytestream2_put_le16(&pb, temp);  \
+    } while (0)
+
+    bytestream2_put_byte(&pb, WP_ID_DECSAMPLES);
+    bytestream2_put_byte(&pb, 0);
+    start = bytestream2_tell_p(&pb);
+    for (i = 0; i < s->num_terms; i++) {
+        struct Decorr *dpp = &s->decorr_passes[i];
+        if (i == 0) {
+            if (dpp->value > MAX_TERM) {
+                WRITE_DECSAMPLE(dpp->samplesA[0]);
+                WRITE_DECSAMPLE(dpp->samplesA[1]);
+                if (!(s->flags & WV_MONO_DATA)) {
+                    WRITE_DECSAMPLE(dpp->samplesB[0]);
+                    WRITE_DECSAMPLE(dpp->samplesB[1]);
+                }
+            } else if (dpp->value < 0) {
+                WRITE_DECSAMPLE(dpp->samplesA[0]);
+                WRITE_DECSAMPLE(dpp->samplesB[0]);
+            } else {
+                for (j = 0; j < dpp->value; j++) {
+                    WRITE_DECSAMPLE(dpp->samplesA[j]);
+                    if (!(s->flags & WV_MONO_DATA))
+                        WRITE_DECSAMPLE(dpp->samplesB[j]);
+                }
+            }
+        } else {
+            CLEAR(dpp->samplesA);
+            CLEAR(dpp->samplesB);
+        }
+    }
+    end = bytestream2_tell_p(&pb);
+    out[start - 1] = (end - start) >> 1;
+
+#define WRITE_CHAN_ENTROPY(chan) do {               \
+        for (i = 0; i < 3; i++) {                   \
+            temp = wp_log2(s->w.c[chan].median[i]); \
+            bytestream2_put_le16(&pb, temp);        \
+            s->w.c[chan].median[i] = wp_exp2(temp); \
+        }                                           \
+    } while (0)
+
+    put_metadata_block(&pb, WP_ID_ENTROPY, 6 * (1 + (!(s->flags & WV_MONO_DATA))));
+    WRITE_CHAN_ENTROPY(0);
+    if (!(s->flags & WV_MONO_DATA))
+        WRITE_CHAN_ENTROPY(1);
+
+    if (s->flags & WV_FLOAT_DATA) {
+        put_metadata_block(&pb, WP_ID_FLOATINFO, 4);
+        bytestream2_put_byte(&pb, s->float_flags);
+        bytestream2_put_byte(&pb, s->float_shift);
+        bytestream2_put_byte(&pb, s->float_max_exp);
+        bytestream2_put_byte(&pb, 127);
+    }
+
+    if (s->flags & WV_INT32_DATA) {
+        put_metadata_block(&pb, WP_ID_INT32INFO, 4);
+        bytestream2_put_byte(&pb, s->int32_sent_bits);
+        bytestream2_put_byte(&pb, s->int32_zeros);
+        bytestream2_put_byte(&pb, s->int32_ones);
+        bytestream2_put_byte(&pb, s->int32_dups);
+    }
+
+    if (s->flags & WV_MONO_DATA && !s->num_passes) {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t code = samples_l[i];
+
+            for (tcount = s->num_terms, dpp = s->decorr_passes; tcount--; dpp++) {
+                int32_t sam;
+
+                if (dpp->value > MAX_TERM) {
+                    if (dpp->value & 1)
+                        sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+                    else
+                        sam = (3 * dpp->samplesA[0] - dpp->samplesA[1]) >> 1;
+
+                    dpp->samplesA[1] = dpp->samplesA[0];
+                    dpp->samplesA[0] = code;
+                } else {
+                    sam = dpp->samplesA[m];
+                    dpp->samplesA[(m + dpp->value) & (MAX_TERM - 1)] = code;
+                }
+
+                code -= APPLY_WEIGHT(dpp->weightA, sam);
+                UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, code);
+            }
+
+            m = (m + 1) & (MAX_TERM - 1);
+            samples_l[i] = code;
+        }
+        if (m) {
+            for (tcount = s->num_terms, dpp = s->decorr_passes; tcount--; dpp++)
+                if (dpp->value > 0 && dpp->value <= MAX_TERM) {
+                int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+                int k;
+
+                memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+                memcpy(temp_B, dpp->samplesB, sizeof(dpp->samplesB));
+
+                for (k = 0; k < MAX_TERM; k++) {
+                    dpp->samplesA[k] = temp_A[m];
+                    dpp->samplesB[k] = temp_B[m];
+                    m = (m + 1) & (MAX_TERM - 1);
+                }
+            }
+        }
+    } else if (!s->num_passes) {
+        if (s->flags & WV_JOINT_STEREO) {
+            for (i = 0; i < nb_samples; i++)
+                samples_r[i] += ((samples_l[i] -= samples_r[i]) >> 1);
+        }
+
+        for (i = 0; i < s->num_terms; i++) {
+            struct Decorr *dpp = &s->decorr_passes[i];
+            if (((s->flags & MAG_MASK) >> MAG_LSB) >= 16 || dpp->delta != 2)
+                decorr_stereo_pass2(dpp, samples_l, samples_r, nb_samples);
+            else
+                decorr_stereo_pass_id2(dpp, samples_l, samples_r, nb_samples);
+        }
+    }
+
+    bytestream2_put_byte(&pb, WP_ID_DATA | WP_IDF_LONG);
+    init_put_bits(&s->pb, pb.buffer + 3, bytestream2_get_bytes_left_p(&pb));
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++)
+            wavpack_encode_sample(s, &s->w.c[0], s->samples[0][i]);
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            wavpack_encode_sample(s, &s->w.c[0], s->samples[0][i]);
+            wavpack_encode_sample(s, &s->w.c[1], s->samples[1][i]);
+        }
+    }
+    encode_flush(s);
+    flush_put_bits(&s->pb);
+    data_size = put_bits_count(&s->pb) >> 3;
+    bytestream2_put_le24(&pb, (data_size + 1) >> 1);
+    bytestream2_skip_p(&pb, data_size);
+    if (data_size & 1)
+        bytestream2_put_byte(&pb, 0);
+
+    if (got_extra) {
+        bytestream2_put_byte(&pb, WP_ID_EXTRABITS | WP_IDF_LONG);
+        init_put_bits(&s->pb, pb.buffer + 7, bytestream2_get_bytes_left_p(&pb));
+        if (s->flags & WV_FLOAT_DATA)
+            pack_float(s, s->orig_l, s->orig_r, nb_samples);
+        else
+            pack_int32(s, s->orig_l, s->orig_r, nb_samples);
+        flush_put_bits(&s->pb);
+        data_size = put_bits_count(&s->pb) >> 3;
+        bytestream2_put_le24(&pb, (data_size + 5) >> 1);
+        bytestream2_put_le32(&pb, s->crc_x);
+        bytestream2_skip_p(&pb, data_size);
+        if (data_size & 1)
+            bytestream2_put_byte(&pb, 0);
+    }
+
+    block_size = bytestream2_tell_p(&pb);
+    AV_WL32(out + 4, block_size - 8);
+
+    av_assert0(!bytestream2_get_eof(&pb));
+
+    return block_size;
+}
+
+static void fill_buffer(WavPackEncodeContext *s,
+                        const int8_t *src, int32_t *dst,
+                        int nb_samples)
+{
+    int i;
+
+#define COPY_SAMPLES(type, offset, shift) do {            \
+        const type *sptr = (const type *)src;             \
+        for (i = 0; i < nb_samples; i++)                  \
+            dst[i] = (sptr[i] - offset) >> shift;         \
+    } while (0)
+
+    switch (s->avctx->sample_fmt) {
+    case AV_SAMPLE_FMT_U8P:
+        COPY_SAMPLES(int8_t, 0x80, 0);
+        break;
+    case AV_SAMPLE_FMT_S16P:
+        COPY_SAMPLES(int16_t, 0, 0);
+        break;
+    case AV_SAMPLE_FMT_S32P:
+        if (s->avctx->bits_per_raw_sample <= 24) {
+            COPY_SAMPLES(int32_t, 0, 8);
+            break;
+        }
+    case AV_SAMPLE_FMT_FLTP:
+        memcpy(dst, src, nb_samples * 4);
+    }
+}
+
+static void set_samplerate(WavPackEncodeContext *s)
+{
+    int i;
+
+    for (i = 0; i < 15; i++) {
+        if (wv_rates[i] == s->avctx->sample_rate)
+            break;
+    }
+
+    s->flags = i << SRATE_LSB;
+}
+
+static int wavpack_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                                const AVFrame *frame, int *got_packet_ptr)
+{
+    WavPackEncodeContext *s = avctx->priv_data;
+    int buf_size, ret;
+    uint8_t *buf;
+
+    s->block_samples = frame->nb_samples;
+    av_fast_padded_malloc(&s->samples[0], &s->samples_size[0],
+                          sizeof(int32_t) * s->block_samples);
+    if (!s->samples[0])
+        return AVERROR(ENOMEM);
+    if (avctx->channels > 1) {
+        av_fast_padded_malloc(&s->samples[1], &s->samples_size[1],
+                              sizeof(int32_t) * s->block_samples);
+        if (!s->samples[1])
+            return AVERROR(ENOMEM);
+    }
+
+    buf_size = s->block_samples * avctx->channels * 8
+             + 200 /* for headers */;
+    if ((ret = ff_alloc_packet2(avctx, avpkt, buf_size, 0)) < 0)
+        return ret;
+    buf = avpkt->data;
+
+    for (s->ch_offset = 0; s->ch_offset < avctx->channels;) {
+        set_samplerate(s);
+
+        switch (s->avctx->sample_fmt) {
+        case AV_SAMPLE_FMT_S16P: s->flags |= 1; break;
+        case AV_SAMPLE_FMT_S32P: s->flags |= 3 - (s->avctx->bits_per_raw_sample <= 24); break;
+        case AV_SAMPLE_FMT_FLTP: s->flags |= 3 | WV_FLOAT_DATA;
+        }
+
+        fill_buffer(s, frame->extended_data[s->ch_offset], s->samples[0], s->block_samples);
+        if (avctx->channels - s->ch_offset == 1) {
+            s->flags |= WV_MONO;
+        } else {
+            s->flags |= WV_CROSS_DECORR;
+            fill_buffer(s, frame->extended_data[s->ch_offset + 1], s->samples[1], s->block_samples);
+        }
+
+        s->flags += (1 << MAG_LSB) * ((s->flags & 3) * 8 + 7);
+
+        if ((ret = wavpack_encode_block(s, s->samples[0], s->samples[1],
+                                        buf, buf_size)) < 0)
+            return ret;
+
+        buf      += ret;
+        buf_size -= ret;
+    }
+    s->sample_index += frame->nb_samples;
+
+    avpkt->pts      = frame->pts;
+    avpkt->size     = buf - avpkt->data;
+    avpkt->duration = ff_samples_to_time_base(avctx, frame->nb_samples);
+    *got_packet_ptr = 1;
+    return 0;
+}
+
+static av_cold int wavpack_encode_close(AVCodecContext *avctx)
+{
+    WavPackEncodeContext *s = avctx->priv_data;
+    int i;
+
+    for (i = 0; i < MAX_TERMS + 2; i++) {
+        av_freep(&s->sampleptrs[i][0]);
+        av_freep(&s->sampleptrs[i][1]);
+        s->sampleptrs_size[i][0] = s->sampleptrs_size[i][1] = 0;
+    }
+
+    for (i = 0; i < 2; i++) {
+        av_freep(&s->samples[i]);
+        s->samples_size[i] = 0;
+
+        av_freep(&s->best_buffer[i]);
+        s->best_buffer_size[i] = 0;
+
+        av_freep(&s->temp_buffer[i][0]);
+        av_freep(&s->temp_buffer[i][1]);
+        s->temp_buffer_size[i][0] = s->temp_buffer_size[i][1] = 0;
+    }
+
+    av_freep(&s->js_left);
+    av_freep(&s->js_right);
+    s->js_left_size = s->js_right_size = 0;
+
+    av_freep(&s->orig_l);
+    av_freep(&s->orig_r);
+    s->orig_l_size = s->orig_r_size = 0;
+
+    return 0;
+}
+
+#define OFFSET(x) offsetof(WavPackEncodeContext, x)
+#define FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM
+static const AVOption options[] = {
+    { "joint_stereo",  "", OFFSET(joint), AV_OPT_TYPE_BOOL, {.i64=-1}, -1, 1, FLAGS },
+    { "optimize_mono", "", OFFSET(optimize_mono), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
+    { NULL },
+};
+
+static const AVClass wavpack_encoder_class = {
+    .class_name = "WavPack encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_wavpack_encoder = {
+    .name           = "wavpack",
+    .long_name      = NULL_IF_CONFIG_SMALL("WavPack"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_WAVPACK,
+    .priv_data_size = sizeof(WavPackEncodeContext),
+    .priv_class     = &wavpack_encoder_class,
+    .init           = wavpack_encode_init,
+    .encode2        = wavpack_encode_frame,
+    .close          = wavpack_encode_close,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_U8P,
+                                                     AV_SAMPLE_FMT_S16P,
+                                                     AV_SAMPLE_FMT_S32P,
+                                                     AV_SAMPLE_FMT_FLTP,
+                                                     AV_SAMPLE_FMT_NONE },
+};
diff --git a/libavcodec/wavpackenc.h b/libavcodec/wavpackenc.h
new file mode 100644
index 0000000000..9dd2a01bbe
--- /dev/null
+++ b/libavcodec/wavpackenc.h
@@ -0,0 +1,664 @@
+/*
+ * WavPack lossless audio encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_WAVPACKENC_H
+#define AVCODEC_WAVPACKENC_H
+
+#include "wavpack.h"
+
+typedef struct WavPackDecorrSpec {
+    int8_t joint_stereo, delta, terms[MAX_TERMS+1];
+} WavPackDecorrSpec;
+
+static const WavPackDecorrSpec fast_specs[] = {
+ { 1, 2, { 18,17 } }, { 1, 1, { 17,17 } }, { 0, 2, { 18,17 } },
+ { 0, 1, { 17,17 } }, { 1, 3, {  1,18 } }, { 1, 1, { 17, 1 } },
+ { 0, 1, {  1,17 } }, { 0, 1, { -2,17 } }, { 0, 2, { -1,17 } },
+ { 1, 1, { 17, 2 } }, { 0, 3, { 18,18 } }, { 0, 1, { 17, 1 } },
+ { 1, 6, {  1, 2 } }, { 1, 1, { 17, 3 } }, { 0, 1, { -2, 3 } },
+ { 0, 1, {  2,17 } }, { 0, 1, { 18,-2 } }, { 0, 1, { -1,17 } },
+ { 0, 1, { 18,17 } }, { 0, 1, { 17, 2 } }, { 1, 2, { 18,-2 } },
+ { 1, 1, {  1,17 } }, { 0, 3, { 18, 2 } }, { 0, 1, { 17,-2 } },
+ { 0, 1, { 18,-2 } }, { 1, 2, { 17,-3 } }, { 0, 1, { 18, 3 } },
+ { 0, 1, { 18,18 } }, { 1, 1, {  1, 3 } }, { 1, 1, { 18, 3 } },
+ { 1, 1, {  1, 3 } }, { 0, 2, { 18,17 } }, { 1, 1, {  1,17 } },
+ { 1, 1, { 17, 3 } }, { 0, 3, { 18,17 } }, { 0, 1, { 18,18 } },
+ { 1, 1, {  1, 3 } }, { 1, 1, {  1,18 } }, { 0, 1, { 18,-2 } },
+ { 0, 2, { 18,17 } }, { 0, 1, { -1,18 } }, { 1, 1, { 17, 3 } },
+ { 0, 1, { 17, 2 } }, { 0, 1, { 17, 3 } }, { 1, 1, { 18, 2 } },
+ { 1, 1, { 17,-2 } }, { 0, 1, {  1,-2 } }, { 0, 2, { 18,17 } },
+ { 0, 1, { 17,-2 } }, { 1, 1, { 17,-2 } }, { 0, 1, { 18, 3 } },
+ { 0, 1, {  2,17 } }, { 1, 2, { 18,-3 } }, { 1, 2, {  1,18 } },
+ { 1, 2, { 18, 2 } }, { 0, 1, { 17,-1 } }, { 0, 1, { 17,-2 } },
+ { 1, 1, { 17,-2 } }, { 1, 1, {  1, 3 } }, { 0, 1, {  1,17 } },
+ { 1, 2, { 18,-2 } }, { 1, 2, { 17,-3 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 1, { 17, 2 } }, { 1, 2, { 18,18 } },
+ { 0, 1, { 17, 2 } }, { 0, 1, { 18,17 } }, { 1, 1, {  1,17 } },
+ { 1, 1, { 17, 2 } }, { 0, 2, { 18,18 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 17,-3 } }, { 1, 6, {  1, 2 } }, { 0, 3, { 17,17 } },
+ { 0, 1, {  1,18 } }, { 0, 1, {  1,-2 } }, { 1, 1, { 17, 2 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 1, 1, { 18, 3 } },
+ { 1, 2, { 17,-3 } }, { 0, 1, { 17, 2 } }, { 0, 1, { 17, 3 } },
+ { 0, 1, { 18,-2 } }, { 1, 1, { 18,18 } }, { 1, 6, {  1, 2 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 1, { -1,17 } },
+ { 1, 1, { 18, 3 } }, { 0, 1, { 17,18 } }, { 1, 1, { 17, 3 } },
+ { 0, 1, { 18, 3 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 18, 2 } }, { 0, 1, { -2, 3 } }, { 0, 4, { 18,-1 } },
+ { 0, 2, { 18,18 } }, { 0, 1, { -2, 3 } }, { 1, 1, { 17,-2 } },
+ { 0, 1, { 17, 3 } }, { 0, 2, { 18,17 } }, { 0, 2, { -1,18 } },
+ { 1, 1, {  2,17 } }, { 0, 2, { 17,-2 } }, { 0, 1, { 17, 2 } },
+ { 1, 2, { 18,-3 } }, { 0, 1, { 17,-2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 1, { 17,-2 } }, { 1, 2, { 17,-3 } },
+ { 1, 1, {  1, 3 } }, { 1, 1, {  2,17 } }, { 1, 2, { 18, 2 } },
+ { 1, 1, {  2,17 } }, { 1, 1, { 18, 2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 1, { 17,-2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 17,-1 } }, { 0, 2, { 18,-2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 1, 1, {  1, 3 } },
+ { 0, 2, { -2,17 } }, { 0, 2, { 18,-2 } }, { 0, 2, { 17,-2 } },
+ { 1, 1, {  2,17 } }, { 1, 1, {  1, 3 } }, { 0, 1, {  2,17 } },
+ { 0, 2, { 18,17 } }, { 0, 3, { -1,17 } }, { 1, 1, {  2,17 } },
+ { 0, 2, { 18,18 } }, { 0, 1, { 17, 2 } }, { 1, 4, { 18,-3 } },
+ { 1, 1, { 18, 1 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 18,-1 } }, { 0, 1, { -1,18 } }, { 1, 6, {  1, 2 } },
+ { 1, 1, { 17, 2 } }, { 1, 4, { 18, 3 } }, { 0, 1, {  1,17 } },
+ { 0, 1, { 18, 2 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 17, 2 } }, { 0, 2, { 18,-2 } }, { 0, 1, {  1,18 } },
+ { 1, 2, { 18,-3 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 2, { 18,18 } }, { 1, 3, { 17,17 } },
+ { 0, 1, { -2,17 } }, { 0, 1, { 17,18 } }, { 0, 1, { -1, 3 } },
+ { 1, 1, {  2,17 } }, { 0, 2, { 18,-1 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 1, { 17,-2 } }, { 1, 2, { 17, 2 } },
+ { 1, 1, { 18, 3 } }, { 0, 1, { 18, 2 } }, { 1, 2, { 17,-3 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 1, { -2,17 } },
+ { 0, 1, { 17,-1 } }, { 0, 1, { 18,-1 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 17,-3 } }, { 1, 1, {  1,18 } }, { 1, 3, { 18, 2 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 3, { 18,18 } }, { 0, 1, {  1,-2 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 17,-3 } }, { 1, 1, { 18,18 } }, { 0, 2, { 18, 2 } },
+ { 0, 1, { 17,18 } }, { 1, 2, { 18, 2 } }, { 1, 1, { 17,-2 } },
+ { 0, 2, { 17,-1 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 1, {  1,-2 } }, { 0, 1, { 18, 1 } },
+ { 1, 2, { 18,-2 } }, { 0, 1, { 17, 2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 1, { 17, 3 } }, { 0, 1, { 17,-1 } },
+ { 0, 1, { 18, 2 } }, { 1, 1, { 17, 3 } }, { 1, 1, { 17,-2 } },
+ { 0, 1, { 18,18 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 1, { 17,18 } }, { 0, 1, { -2, 3 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 1, 2, { 18,-3 } },
+ { 0, 2, { 18,17 } }, { 0, 3, { 18, 2 } }, { 0, 1, {  1,18 } },
+ { 0, 2, { 18,17 } }, { 0, 1, { 17,-1 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 1, { -2, 3 } },
+ { 0, 3, { 17,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 1, { 17, 2 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 1, { 17, 2 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18, 2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } },
+};
+
+static const WavPackDecorrSpec default_specs[] = {
+ { 1, 2, { 18,18, 2,17, 3 } }, { 0, 2, { 18,17,-1, 3, 2 } },
+ { 1, 1, { 17,18,18,-2, 2 } }, { 0, 2, { 18,17, 3,-2,17 } },
+ { 1, 2, { 18,17, 2,17, 3 } }, { 0, 1, { 18,18,-1, 2,17 } },
+ { 0, 1, { 17,17,-2, 2, 3 } }, { 0, 1, { 18,-2,18, 2,17 } },
+ { 1, 2, { 18,18,-1, 2, 3 } }, { 0, 2, { 18,17, 3, 2, 5 } },
+ { 1, 1, { 18,17,18, 2, 5 } }, { 0, 1, { 17,17,-2, 2, 3 } },
+ { 0, 1, { 18,-2,18, 2, 5 } }, { 0, 1, { 17,-2,17, 2,-3 } },
+ { 1, 1, { 17,-2,17, 1, 2 } }, { 0, 1, { 17,17,-2, 2, 3 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 1, 4, { 18,18, 2, 3,-2 } },
+ { 0, 1, {  1,-1,-1, 2,17 } }, { 0, 2, { 18,17, 3, 2, 5 } },
+ { 0, 1, { 18,18,18, 2,17 } }, { 0, 1, { 18,17,-1, 2,18 } },
+ { 1, 1, { 17, 3, 2, 1, 7 } }, { 0, 2, { 18,-2,18, 2, 3 } },
+ { 1, 3, { 18,-3,18, 2, 3 } }, { 0, 3, { 18,17, 2, 3,17 } },
+ { 1, 1, { 17,17, 2, 1, 4 } }, { 0, 1, { 17,18,-2, 2,17 } },
+ { 1, 1, { 18,18, 3, 5, 2 } }, { 0, 1, { 17,17, 2,18, 4 } },
+ { 0, 1, { 18,17, 1, 4, 6 } }, { 1, 1, {  3,17,18, 2,17 } },
+ { 1, 1, { 17, 3, 2, 1, 7 } }, { 0, 1, { 18,17,-1, 2, 3 } },
+ { 1, 1, { 17,17, 2, 1, 4 } }, { 1, 2, { 18,17,-1,17, 3 } },
+ { 1, 2, { 18,17, 2, 3,-1 } }, { 0, 2, { 18,18,-2, 2,17 } },
+ { 0, 1, { 17,17, 2,18, 4 } }, { 0, 5, { -2,18,18,18, 2 } },
+ { 1, 1, { 18,18,-1, 6, 3 } }, { 0, 1, { 17,17,-2, 2, 3 } },
+ { 1, 1, { 18,17,18, 2,17 } }, { 0, 1, { 18,17, 4, 3, 1 } },
+ { 0, 1, { -2,18, 2, 2,18 } }, { 1, 2, { 18,18,-2, 2,-1 } },
+ { 1, 1, { 17,17, 2, 1, 4 } }, { 0, 1, { 17,18,-2, 2,17 } },
+ { 1, 1, { 17, 3, 2, 1, 7 } }, { 1, 3, { 18,-3,18, 2, 3 } },
+ { 1, 2, { 18,18,-2, 2,-1 } }, { 1, 1, { 18,18, 3, 5, 2 } },
+ { 0, 2, { 18,18,-1, 2,17 } }, { 0, 1, { 18,-1,17,18, 2 } },
+ { 0, 1, { 17,-1, 2, 3, 6 } }, { 0, 1, { 18,-2,18, 2, 5 } },
+ { 1, 2, { 18,18,-2, 2,-1 } }, { 0, 3, { 18,18, 2, 3,17 } },
+ { 0, 1, { 17,17, 2,18, 4 } }, { 1, 1, { 17,-2,17, 1, 2 } },
+ { 0, 1, { -1, 3, 5, 4, 7 } }, { 0, 3, { 18,18, 3, 2, 5 } },
+ { 0, 1, { 17,17, 2,18, 4 } }, { 0, 1, { 18,17,-2,18, 3 } },
+ { 0, 2, { 18,18,-2, 2,17 } }, { 0, 3, { 18,17,-2, 2, 3 } },
+ { 1, 1, { 18,18,-2, 2,17 } }, { 0, 1, { 18,17, 4, 3, 1 } },
+ { 1, 2, {  3,18,17, 2,17 } }, { 1, 2, { 18,18, 2,-2,18 } },
+ { 1, 2, { 18,18,-1,18, 2 } }, { 0, 2, { 18,18,-2, 2,17 } },
+ { 1, 3, { 18,18, 2, 3,-2 } }, { 0, 3, { 18,18, 3, 2, 5 } },
+ { 0, 1, { 18,-2,18, 2, 5 } }, { 1, 1, { 17, 3, 2, 1, 7 } },
+ { 1, 3, { 18,18,-2, 2,18 } }, { 1, 1, { 17,18,18,-2, 2 } },
+ { 0, 1, { 18,-2,18, 2, 5 } }, { 0, 2, { 18,-2,18, 2, 3 } },
+ { 0, 1, { -1, 3, 4, 5, 7 } }, { 1, 1, { 17,17, 2,-1, 7 } },
+ { 0, 1, { 18,-1,-1, 2,-2 } }, { 0, 2, { 18,17, 2, 3,17 } },
+ { 0, 1, { 18,17, 2,18, 2 } }, { 0, 2, { 18,17,-1, 2,17 } },
+ { 0, 1, {  1,18, 3, 2, 5 } }, { 0, 2, { 18,-2, 4,18, 2 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 0, 1, { 18,17,18, 2, 5 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 0, 4, { 18,18,-2, 2,18 } },
+ { 1, 1, { 18,18, 3, 2, 5 } }, { 1, 1, { 17,17, 2, 1, 4 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 1, { 18,18, 2, 1, 3 } }, { 1, 1, { 17,17, 2, 1, 4 } },
+ { 1, 2, { 17,17, 2,18, 3 } }, { 0, 1, { 18,17, 1, 4, 6 } },
+ { 1, 2, { 18,18,-2, 2,-1 } }, { 0, 1, { 18,-2,18, 2, 5 } },
+ { 1, 1, { 17, 2,18, 2,17 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,18, 3, 6,-1 } }, { 0, 1, { 18,17, 2,18, 3 } },
+ { 0, 1, { 18,17,-2, 2,17 } }, { 1, 1, {  3,17,18, 2,17 } },
+ { 1, 3, { 18,-3,18, 2, 3 } }, { 1, 3, { 18,18,-3,18, 2 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 0, 1, { 17,-2,17, 2,-3 } },
+ { 1, 1, { 18,18, 3, 5, 2 } }, { 1, 2, { 18,18,-2, 2,-1 } },
+ { 0, 1, { 18,-1,-1, 2,-2 } }, { 1, 1, { 18, 3, 1, 5, 4 } },
+ { 0, 3, { 18,17,-1, 2,17 } }, { 1, 3, { 18,17, 2,18,-2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 1, 2, { 18,18,-2, 2,-1 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 0, 4, {  3,18,18, 2,17 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 1, 1, { 18,17,-1,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 1, { 18,18,18, 3, 2 } }, { 0, 1, { 17,-1, 2, 3, 6 } },
+ { 0, 1, { 17,-1, 2, 3, 6 } }, { 0, 2, { 18,17,-2, 3, 2 } },
+ { 1, 3, { 18,17, 2,-2,18 } }, { 0, 2, { 18,18, 2,17, 3 } },
+ { 0, 1, { 18,18, 2,18,-2 } }, { 0, 2, { 18,-2, 4,18, 2 } },
+ { 0, 1, { -2,18, 2, 2,18 } }, { 0, 2, { 18,17, 3, 6, 2 } },
+ { 0, 1, { 18,17,18, 2, 5 } }, { 0, 3, { 18,18,-2, 3, 2 } },
+ { 1, 1, { 18,18, 2,18, 5 } }, { 0, 1, { 17,-1, 2, 3, 6 } },
+ { 1, 4, { 18,18, 2, 3,-2 } }, { 0, 2, { 18,17,18, 2,-2 } },
+ { 0, 1, {  1,18, 3, 2, 5 } }, { 1, 4, { 18,-2,18, 2, 3 } },
+ { 1, 2, { 18, 2,18, 3,-2 } }, { 0, 2, { 18,18,18, 2, 4 } },
+ { 0, 2, {  3,17,18, 2,17 } }, { 1, 1, { 18,-1,18, 2,17 } },
+ { 1, 2, { 17,17, 2,18, 3 } }, { 0, 2, { 18,17,-2, 3, 2 } },
+ { 0, 1, {  1,-1,-1, 2,17 } }, { 0, 3, {  3,18,18, 2,17 } },
+ { 0, 1, { 18,-1,17,18, 2 } }, { 0, 1, { 18,17, 2,18, 3 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 1, { 18,17, 2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 2, { 17,17, 2,18, 3 } }, { 0, 1, { 18,17,-2, 2, 3 } },
+ { 0, 1, { 18,-2,18, 2, 5 } }, { 1, 4, { 18,-2,18, 2, 3 } },
+ { 1, 3, { 18,17, 2, 3, 6 } }, { 0, 2, { 18,18, 2,17, 3 } },
+ { 0, 2, { 18,17, 2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 1, { 18,18, 3, 5, 2 } }, { 0, 2, { 18,18,-2, 2, 3 } },
+ { 1, 2, { 18,17, 2,17, 3 } }, { 0, 1, { 18,17, 2, 3,18 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 1, 4, { 18,18, 2, 3,-2 } },
+ { 0, 1, { 17,-2,17, 2,-3 } }, { 0, 1, { 17,17, 2,18, 4 } },
+ { 1, 1, { 18,18,18, 2, 4 } }, { 1, 2, { 18, 2,18, 3,-2 } },
+ { 1, 1, { 18,18,-2, 2,17 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18, 2,17, 3 } }, { 0, 2, { 18,18,18, 2, 4 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,17,-2, 3, 2 } },
+ { 0, 1, {  1,-1,-1, 2,17 } }, { 1, 4, { 18,18, 2, 3,-2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 1, { 18,-2,18, 3, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,18,-2, 2,17 } }, { 0, 3, { 18,17, 2, 3,17 } },
+ { 1, 2, { 18,18, 2,-2,18 } }, { 0, 1, { -1, 3, 5, 4, 7 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 1, 1, { 18,18,-2,18, 3 } },
+ { 0, 2, { 18,17,18, 2,-2 } }, { 0, 2, { 18,18, 2,17, 3 } },
+ { 1, 2, { 18, 2,18, 3,-2 } }, { 1, 4, { 18,18, 2, 3,-2 } },
+ { 1, 3, { 18,17, 2, 3, 6 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 2, { 18,17,-2,-1,17 } }, { 0, 1, { 17,-1, 2, 3, 6 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2, 2, 3 } },
+ { 1, 1, { 18,18,18, 2, 5 } }, { 0, 1, { 17,17,-2, 2, 3 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,17, 3, 6, 2 } },
+ { 0, 2, { 18,17,18, 2, 3 } }, { 0, 3, { 18,17,-3,18, 2 } },
+ { 0, 1, { 18,18,18, 2, 3 } }, { 0, 1, { 18,-2,-3, 2, 6 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 1, 1, { 18,17,18, 2, 5 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 1, { 18,17,18, 2, 5 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,18,18, 2, 3 } }, { 1, 1, { 17,-2,17, 1, 2 } },
+ { 1, 1, { 17,17, 2,-1, 7 } }, { 0, 1, { 18,17, 4, 3, 1 } },
+ { 1, 3, { 18,-3,18, 2, 3 } }, { 0, 1, {  1,18, 3, 2, 5 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,18, 3, 6, 2 } }, { 0, 1, { 17,17, 2,18, 4 } },
+ { 0, 1, { 17,17, 2,18, 4 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 2, { 18,-2,18, 3, 2 } }, { 1, 1, { 17,-2,17, 1, 2 } },
+ { 1, 1, { 18,18, 3, 2, 5 } }, { 0, 1, { 18,18,-1, 2, 3 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,17,18, 2, 5 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 1, {  3,18,18, 2,17 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+};
+
+static const WavPackDecorrSpec high_specs[] = {
+ { 1, 2, { 18,18,18,-2, 2, 3, 5,-1,17, 4 } }, { 0, 1, { 18,17,-2, 2,18, 3, 7, 2, 5, 4 } },
+ { 1, 2, {  1,18, 3, 6,-2,18, 2, 3, 4, 5 } }, { 0, 2, { 18,18,-2, 2,18, 3, 6, 2,17, 4 } },
+ { 1, 2, { 18,18, 2,18, 3, 2,-1, 4,18, 5 } }, { 1, 1, {  7, 6, 5, 3, 4, 2, 5, 4, 3, 7 } },
+ { 1, 1, { 17, 3,18, 7, 2, 6, 1, 4, 3, 5 } }, { 1, 1, { -2,18,18,18, 3,-2, 6, 5, 2, 1 } },
+ { 1, 2, { 18,18,-1,18, 2, 3, 6,-2,17, 5 } }, { 0, 1, { 17,17,18, 3, 6, 4, 5, 2,18,-2 } },
+ { 1, 2, {  1,18,-2, 3, 5, 2, 4,-1, 6, 1 } }, { 0, 2, { 18,18, 3, 6,18, 2, 4, 8, 5, 3 } },
+ { 0, 1, { -2, 1,18, 2,-2, 7,18, 2,-1, 5 } }, { 1, 1, {  4, 3, 8, 1, 5, 2, 5, 6, 2, 8 } },
+ { 1, 1, { 17,18, 2, 6, 3, 4,-1, 1, 8, 6 } }, { 0, 1, { 18,18, 3, 6, 3,-2, 2, 5,-1, 1 } },
+ { 0, 1, { 18,18,17,-1, 2,-2,18, 3, 4, 5 } }, { 1, 2, { 18,17, 2,-2,18, 3, 5, 7, 2, 4 } },
+ { 1, 2, { 18,18, 3, 6,-2,18, 2, 5, 8, 3 } }, { 0, 1, { 18,17, 2,18,18, 2, 6, 5,17, 7 } },
+ { 1, 2, { 18,17, 2,18, 3, 2, 6,18,-1, 4 } }, { 1, 1, {  5, 3, 6, 5, 3, 4, 1, 2, 4, 7 } },
+ { 1, 1, {  5, 3, 6, 5, 3, 4, 1, 2, 4, 7 } }, { 0, 1, { -2,18,18,18,-2, 3, 2, 4, 6, 5 } },
+ { 1, 2, { 18,17,-3, 3,-1,18, 2, 3, 6, 5 } }, { 0, 1, { 17,18, 7, 3,-2, 7, 1, 2, 4, 5 } },
+ { 1, 1, {  2,18,18,-2, 2, 4,-1,18, 3, 6 } }, { 0, 3, {  1,18, 4, 3, 5, 2, 4,18, 2, 3 } },
+ { 0, 1, { -2,18, 2,18, 3, 7,18, 2, 6,-2 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18,18, 5, 4, 6, 4, 5, 1, 4, 3 } }, { 1, 1, { 18, 3, 6, 5, 7, 8, 2, 3, 1,-1 } },
+ { 1, 1, { 18,18,18, 2,-2, 3, 5,18, 2, 8 } }, { 0, 2, { 18,17,-2, 2, 3,18,-3, 5, 2, 7 } },
+ { 1, 1, {  1, 1,-1, 8,17, 3,-2, 2, 6,17 } }, { 0, 2, { 18,18,17, 2,-2, 3, 2, 4,18, 5 } },
+ { 1, 1, { 17,18, 2,-1, 5, 7,18, 3, 4, 6 } }, { 1, 1, {  5, 4, 5,17, 3, 6, 3, 4, 7, 2 } },
+ { 0, 1, { 17, 3, 1, 7, 4, 2, 5,-2,18, 6 } }, { 0, 1, { 17,18, 2,18, 4, 3, 5, 7,-3, 6 } },
+ { 1, 2, { 17,17,-3,-2, 2, 8,18,-1, 3, 5 } }, { 0, 1, { 17,17,18, 2, 3, 6,-2, 8, 1, 7 } },
+ { 1, 1, {  1, 2, 6,-2,18, 2, 5,-3, 7,-2 } }, { 0, 1, { 18,18, 3,18, 6, 8,-2, 2, 3, 5 } },
+ { 0, 1, { 18,17, 2,18,-2, 3, 7, 6, 2, 4 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18,18, 2,-1, 3, 6, 1, 3, 4, 8 } }, { 0, 1, { 18,18, 3, 6, 5, 3,-2, 2,18,-1 } },
+ { 0, 1, { 18,17,-3,18, 2, 4,-2, 3, 6,17 } }, { 1, 3, {  1, 2,17, 3,18, 7,-1, 5, 2, 4 } },
+ { 1, 1, { 18, 3,18, 6, 8,18,-2, 5, 7, 2 } }, { 0, 1, { 17, 2,18, 6, 3, 2, 5, 4, 8, 1 } },
+ { 0, 1, { 18,17,-1, 2, 3,18,18, 2, 3,17 } }, { 1, 1, { 18, 7, 6, 5, 5, 3, 1, 4, 2, 4 } },
+ { 1, 1, {  6,17, 3, 8, 1, 5, 7,-1, 2, 1 } }, { 1, 1, { 18,-2,18, 3,-2, 2, 7, 4, 6,18 } },
+ { 1, 3, { 18,-3,18, 2, 3,18,-1, 7, 2, 5 } }, { 0, 2, { 18,-2, 7, 1, 3, 2, 4, 6,-3, 7 } },
+ { 1, 1, { 18,-2, 2,-3,18,-2,17,-1, 4, 2 } }, { 0, 3, { 17,17, 2, 5, 3, 7,18, 6, 4, 2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18,17, 4, 6, 6, 4, 5, 3, 4, 1 } }, { 0, 1, { 18, 5, 3, 6, 2, 3, 8, 1, 3, 7 } },
+ { 1, 2, { 18,17,-2, 2,18, 3, 5, 7,-1, 2 } }, { 0, 1, {  1,18,18, 3, 6,-1, 4, 8, 5, 2 } },
+ { 1, 1, {  1, 5, 3, 4, 1, 1, 3, 5, 7, 3 } }, { 0, 1, {  3,18,18, 2,18,18,-1, 2, 3,18 } },
+ { 1, 2, { 18,18,-1,18, 2, 3, 4, 6,18, 5 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18, 3, 1, 4, 5, 2, 7, 1, 3, 6 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,18,-1,18, 2, 3, 5,-2, 6, 8 } }, { 1, 1, { 17,18, 4, 8, 3, 2, 5, 2, 7, 6 } },
+ { 1, 4, {  1, 2, 5,18,-2, 2, 3, 7,-1, 4 } }, { 0, 2, { 18,17,-1, 3, 6,18, 2, 3, 7, 5 } },
+ { 0, 1, { -2,18, 2,-3, 6,18, 4, 3,-2, 5 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 17,17, 6, 2, 4, 8, 3, 5,-1,17 } }, { 1, 1, { 18, 3,18, 6, 8,18,-2, 5, 7, 2 } },
+ { 1, 2, { 17,17,-3, 2,18,-2, 8, 3, 6,-1 } }, { 1, 1, { 18,-2,17,18, 2, 3,-2, 6, 5, 4 } },
+ { 1, 2, { 18,17,-1, 3,18, 2, 5, 3, 6,-3 } }, { 0, 1, { 18,17, 2,18, 7,18, 2, 4, 3,17 } },
+ { 1, 3, { 18,18, 5, 6, 4, 3, 4,18, 6, 5 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, {  7, 6, 5, 3, 4, 2, 5, 4, 3, 7 } }, { 0, 1, { -2,18,18,18, 3, 6, 4, 2, 5, 2 } },
+ { 0, 3, { 18,17,-3,18, 3, 2, 5,-1,17, 3 } }, { 1, 1, { 17,18, 7, 3, 1, 7, 4, 2, 6, 5 } },
+ { 1, 1, { 18, 2,-2,-1,18, 5, 3,-2, 1, 2 } }, { 0, 3, { 18,18,-1, 3, 2, 7, 5,18, 4, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,18, 2,-2, 4, 8,18, 3, 6, 5 } }, { 0, 2, { 18,17, 3, 5,-2, 7, 2,18, 3,-1 } },
+ { 1, 1, { 18, 2,-2,-1,18, 5, 3,-2, 1, 2 } }, { 0, 2, {  3,17,18,18, 2, 5, 7, 6,18, 3 } },
+ { 1, 1, { 17,18,18, 4, 3, 2,18, 7, 8,-1 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 17, 1, 2, 3, 5, 6, 1, 4, 8,17 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 18,17,-1,18,-3, 2, 8, 3, 6,17 } }, { 1, 1, { 17,17, 1, 2, 4, 5,-1, 2, 1, 6 } },
+ { 1, 1, {  1, 2, 6,-2,18, 2,-3, 3,-2, 5 } }, { 0, 1, { 18, 3,18, 6,18, 5, 2, 4,-1, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18,18,-1, 2,18, 3, 6, 4,-2, 7 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { -1,18,18,18, 2,-2, 4, 7, 2, 3 } }, { 0, 3, {  3,17,-2, 5, 2, 7,18, 6, 4, 5 } },
+ { 0, 1, { 17, 6,18, 3, 8, 4, 5, 3, 8,18 } }, { 0, 2, { 18, 2, 6, 2,18, 3, 2, 4, 5, 8 } },
+ { 0, 1, {  3,18,18, 2,18,-1, 2,18, 2,17 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, {  3, 6,17,-2, 5, 1, 2, 7, 4, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 3, {  3,18,17, 5, 6, 2, 7,-2, 8,18 } }, { 1, 1, { 18,-1, 3, 1, 7, 2,-1, 4, 6,17 } },
+ { 1, 1, { 18, 2,-2,-1,18, 5, 3,-2, 1, 2 } }, { 0, 2, { 18, 1, 2,18, 3, 6, 5, 2, 4, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,-2, 2,18,18, 8, 5, 3, 2, 6 } }, { 0, 1, { 18,17, 2,18, 3, 2, 7,-2,18, 4 } },
+ { 1, 2, {  1,18, 2, 3,-1, 5, 6, 4, 7,17 } }, { 0, 2, { 18,17, 3, 6,-2, 2, 3, 8, 5,17 } },
+ { 0, 2, { 18,18, 3, 2,18,-1, 2, 4, 3,17 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 17,-1,18, 2, 3,-2, 5,18, 2, 7 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,-3,18, 2, 3,-2,18, 5, 6,-3 } }, { 0, 2, { 18,17, 3, 5,-2, 7, 2,18, 3,-1 } },
+ { 1, 1, {  1,18,-1, 2, 3, 1,-2, 8, 2, 5 } }, { 0, 1, { 18,18, 3, 6,18, 2, 3, 4, 8, 5 } },
+ { 0, 1, { -2, 1,18, 2,-2, 5, 7,18, 2,-1 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,18,-1, 2, 8, 3, 4, 5, 1, 7 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 18,18,-1, 2,18, 3,-2, 5, 4, 2 } }, { 1, 1, { 18,17, 2,18, 3, 8, 5, 2, 7,17 } },
+ { 0, 1, { 18,18, 3,18, 6, 8,-2, 2, 3, 5 } }, { 0, 1, { 18,18, 2,18, 2, 6,18, 2,17, 7 } },
+ { 1, 3, { 18,17,18, 2, 8,18, 5,-1, 3, 6 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18, 7, 6, 5, 5, 3, 1, 4, 2, 4 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,17,-1, 3, 6,18, 2, 5, 8, 3 } }, { 0, 1, { 17,18,18, 4, 7, 2, 3,-2,18, 5 } },
+ { 1, 2, { 18, 1, 2, 6, 2, 5,18, 2, 4, 8 } }, { 0, 4, { 18, 4, 1, 2, 3, 5, 4, 1, 2, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 2, { 18,17, 2,-1,18, 3,-3, 5, 2, 4 } },
+ { 0, 1, { 17,17, 3, 6, 3, 5,-2, 2,18,-1 } }, { 0, 2, { 18,18, 3,-2,18, 2,-3, 5, 3, 6 } },
+ { 1, 1, { 17,17, 2, 4, 1, 3, 5, 2, 6,-3 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 17, 1, 3, 2, 7, 1, 6, 3, 4, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 17,-1,18, 2, 1, 5, 3, 8,-1,-2 } }, { 1, 1, { 17,18,-1, 8, 2, 5, 3, 4, 1, 6 } },
+ { 1, 2, {  1,18, 3,-1, 5, 1, 2, 4, 7, 6 } }, { 0, 1, { 18,18, 3, 6, 5, 3,-2, 2,18,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, {  1,18,-1, 3, 8, 5, 6, 1, 2, 3 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 18,18, 2, 3, 6,18,-1, 4, 2, 3 } }, { 1, 1, {  1, 3, 5,18, 2, 6, 7, 2, 3, 1 } },
+ { 1, 1, {  1, 3, 8,18, 5, 2, 7, 1, 3,-2 } }, { 0, 2, { 17, 2,18, 3, 6, 2, 4, 5, 8, 3 } },
+ { 0, 1, { 18,17, 2,18, 3, 2, 7,-2,18, 4 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,-3,18,-1, 3,-2, 5, 7, 1, 2 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 3, { 18,18, 2, 6,18, 5,18, 2, 3,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 3, {  1,-1, 1, 3,-2, 2, 5, 7,-3,18 } }, { 1, 2, { 18, 7, 3,-3, 2, 8, 2, 5, 4,17 } },
+ { 1, 1, {  1, 4, 5, 1, 3, 4, 6, 7, 8, 3 } }, { 0, 1, { 18,17, 2,18,-1, 2, 3,18, 2, 4 } },
+ { 0, 2, { 18,18,-2,18, 2, 3, 4, 7, 5,17 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,18, 2, 1, 3, 2, 5, 1, 2, 3 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 18,18,-1, 2, 3, 5, 8, 6, 1,-2 } }, { 0, 1, { 17,18, 8, 3, 4, 6, 5, 2, 8, 7 } },
+ { 1, 2, {  1, 3,-2,18, 2, 5, 1, 7,-1,-2 } }, { 0, 3, { 18,17,-1, 3,18, 2, 3, 6, 4,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,18, 4,18, 6, 7, 8, 3,18, 2 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 17,-3,17, 2,-2, 8, 3,18, 4,-3 } }, { 1, 1, { 18,17, 3, 5, 6, 2, 8, 1, 3, 7 } },
+ { 0, 1, { 18,18, 3, 6, 5, 3,-2, 2,18,-1 } }, { 0, 3, { 18,18, 2, 6,18, 5,18, 2, 3,17 } },
+ { 1, 1, { 18,18, 5, 4, 6, 4, 5, 1, 4, 3 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, {  3,17,18,-3, 2, 5,18, 6,-1, 7 } }, { 1, 1, { 17,18, 3, 2, 5,-1, 6, 8, 4, 7 } },
+ { 1, 1, { 18, 1,-2, 3, 2, 1, 7, 6, 3, 4 } }, { 0, 3, {  1, 2,17, 3,18, 2, 7, 5, 4,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,-2, 2,18,18, 8, 5, 3, 2, 6 } }, { 0, 2, { 18, 5,18, 2, 3, 7,-2, 1, 6, 8 } },
+ { 0, 1, {  2,-1,18,-1, 2, 4,-3, 5,18, 3 } }, { 0, 1, {  3,17,18, 5, 2,18, 7, 3, 6, 5 } },
+ { 1, 4, {  1, 2, 5,18,-2, 2, 3, 7,-1, 4 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, {  1,18, 2, 1, 3, 4, 1, 5, 2, 7 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { 17,17,18, 2, 4, 5,18,-2, 6, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 2, { 18,18,-1, 3, 5, 6, 8,18, 2, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 18,18, 4, 6, 8,18, 7, 3, 2, 5 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { -1,18,18,18, 2, 4,-2, 2, 3, 6 } }, { 0, 2, { 18,-2, 7, 1, 3, 2, 4, 6,-3, 7 } },
+ { 1, 1, { 17,18, 8, 3, 4, 6,-2, 5, 3, 8 } }, { 0, 2, { 18, 1, 2, 6, 2, 8, 3,18, 5, 4 } },
+ { 1, 1, {  3,18,18, 2,18, 2,18, 3, 2,18 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, {  3,17,18, 5, 2, 6, 7, 1, 4, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+};
+
+static const WavPackDecorrSpec very_high_specs[] = {
+ { 1, 2, { 18,18, 2, 3,-2,18, 2, 4, 7, 5, 3, 6, 8,-1,18, 2 } },
+ { 0, 1, { 18,18,-1,18, 2, 3, 4, 6, 5, 7,18,-3, 8, 2,-1, 3 } },
+ { 1, 2, {  1,18,-2, 4,18, 2, 3, 6,-1, 7, 5,-2,18, 8, 2, 4 } },
+ { 0, 1, { 17,17, 2, 3, 4,18,-1, 5, 6, 7,18, 2, 8,17, 3,-2 } },
+ { 1, 1, { 18,18, 2,18, 3, 2,18, 4,-1, 3,18, 2, 6, 8,17, 5 } },
+ { 0, 2, { 18,17, 2, 3,-2, 5,18,-3, 2, 4, 7, 3, 6, 8, 5,17 } },
+ { 1, 1, { 18,-2, 2,-3,18, 5,-2,18, 2, 3, 6, 2,17, 4, 7,-1 } },
+ { 1, 1, { 17, 8,18, 3,-2, 2, 5, 4,18, 6, 3, 8, 7, 2, 5, 4 } },
+ { 0, 2, { 18,17,-2, 2,18, 3, 2, 5,-3, 4, 7,18, 3, 8, 6, 2 } },
+ { 1, 1, {  3, 6, 5, 5, 1, 3, 7, 4, 2, 6, 4,18, 3, 7, 5, 6 } },
+ { 1, 2, {  1,18, 3, 2,-2, 1, 5, 4, 6, 2, 7, 1, 8, 3,-1, 1 } },
+ { 0, 1, { 18,18, 2, 3, 6, 3, 5,-2, 2, 4,18, 3,-2,-1, 6, 7 } },
+ { 0, 1, { -2,18, 2,18, 7, 2, 6,-2, 3, 4,18,18, 2,-3, 8, 5 } },
+ { 0, 2, { 18,18,18, 2, 4, 3,18, 5, 3, 6,-2, 2, 4,18, 8, 7 } },
+ { 0, 1, { -2, 1,18, 2,-2,18,-1, 5, 7, 2, 3, 4,18, 2, 6, 2 } },
+ { 1, 1, { 17,18, 3, 2, 1, 7,-1, 2, 4, 3, 5, 6,-2,18, 7, 8 } },
+ { 1, 1, { 18,18, 2,18, 3, 4, 6,-2,18, 5, 8, 2, 3, 7, 4,-1 } },
+ { 0, 1, { 18,18,18,-1, 2, 3, 4, 6, 8,18, 3, 5, 2, 6, 7, 4 } },
+ { 1, 1, { 17,-2,18,18, 2, 5, 3, 8, 2,-1, 6, 1, 3, 4, 7, 5 } },
+ { 0, 1, { 17,17,18, 2, 3, 6,-2, 8, 1, 7, 5, 2, 3, 1, 4, 8 } },
+ { 1, 1, { 17,17, 3, 2, 7, 1, 4, 3, 6, 2, 5,-2, 8, 7,18, 6 } },
+ { 0, 1, { 18,17,-2, 2,18, 3,-3, 7, 6, 5, 2, 4,-1, 8, 3,17 } },
+ { 1, 1, {  2,18,18,-2, 2, 4,-1, 5,18, 3, 8, 6, 2, 7,17, 4 } },
+ { 0, 1, { 17, 3, 6, 8, 5, 4, 3, 8, 1,18, 7, 2, 4, 5, 6, 3 } },
+ { 1, 2, { 17,18, 4, 8, 3, 2, 5, 7, 6, 8, 2, 7,-2,18, 3, 4 } },
+ { 1, 1, {  6, 5, 5, 3, 4, 7, 3, 2, 4, 6, 3, 7, 1, 5, 2, 4 } },
+ { 1, 1, {  1,18,-1, 2, 1, 3, 8,-2, 2, 5, 6, 3, 8, 7,18, 4 } },
+ { 0, 1, {  1,17,-1,18, 3, 2, 5, 4, 6, 7, 8, 3, 4, 2, 1,-2 } },
+ { 0, 1, { 18, 2,18,18, 2,18, 6,-2,18, 7, 5, 4, 3, 2,18,-2 } },
+ { 0, 3, {  1, 4,18, 3, 2, 4, 1, 5, 2, 3, 6,18, 8, 7, 2, 4 } },
+ { 0, 1, { 17,-2, 1,-3, 2,18, 3,-2, 4,18, 3, 6, 7,-3, 2, 8 } },
+ { 1, 1, { 17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 } },
+ { 1, 2, { 18,-1,17,18, 2, 3,-2,18, 5, 8, 2, 4, 3, 7, 6,-1 } },
+ { 1, 1, { 18,18,18,-2, 4, 2, 3,18, 5, 8, 2, 4, 6, 7,-2, 3 } },
+ { 1, 2, { 18,18,-2,18,-1, 3, 2, 5,18,-2, 7, 2, 3, 4, 6, 8 } },
+ { 0, 1, { 17,18,-1, 2, 4,18, 8, 3, 6, 5, 7,-3, 2, 4, 3,17 } },
+ { 1, 1, { 18,18,17, 2,-1,18, 3, 2,18, 6, 5, 4,18, 7, 2,-1 } },
+ { 0, 2, {  1,18,-1,18, 3, 2, 4, 6,-3, 7,-1, 5, 1, 2, 3, 8 } },
+ { 1, 1, {  1,17,-2, 2,-3, 6, 3, 5, 1, 2, 7, 6, 8,-2, 4, 1 } },
+ { 0, 1, { 17,-1, 5, 1, 4, 3, 6, 2,-2,18, 3, 2, 4, 5, 8,-1 } },
+ { 0, 2, { 18,18,17, 2, 3,-2, 5,18, 2, 4, 7, 8, 6,17, 3, 5 } },
+ { 1, 1, {  1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 } },
+ { 1, 2, {  1,-1, 3, 2,18, 7,-2, 5, 2, 6, 4, 3,-1,18, 8, 7 } },
+ { 0, 2, { 18,17, 3,18, 2, 5, 4, 3, 6, 2, 7, 8,18, 3, 4, 5 } },
+ { 1, 1, {  3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 } },
+ { 0, 2, { 18,18, 3,-3,18, 2, 6, 5, 3, 7,18, 4,-2, 8, 2, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 } },
+ { 1, 1, {  3, 6, 5, 5, 1, 3, 7, 4, 2, 6, 4,18, 3, 7, 5, 6 } },
+ { 0, 1, { 18,18,18, 2, 4,-1,18, 8,-1, 2, 3, 4, 6,-2, 1, 7 } },
+ { 1, 1, { 18,-2,17,18, 2, 6, 3,-2, 5, 4, 7, 1,-3, 8, 2, 6 } },
+ { 0, 1, { 17,18,18, 4, 2, 7, 3, 6,-2,18, 8, 4, 5, 2, 7,17 } },
+ { 1, 1, { 18,18, 5, 4, 6, 4, 1, 5, 4, 3, 2, 5, 6, 1, 4, 5 } },
+ { 0, 1, { 18,18,-2,18, 2,-3, 3, 8, 5,18, 6, 4, 3,-1, 7, 2 } },
+ { 1, 1, { 18, 2,-2,-3,18, 5, 2, 3,-2, 4, 6, 1,-3, 2, 7, 8 } },
+ { 0, 1, { 18, 3, 5, 8, 2, 6, 7, 3, 1, 5, 2,-1, 8, 6, 7, 4 } },
+ { 1, 1, {  4, 3, 8, 1, 5, 6, 2, 5, 8,-2, 2, 7, 3,18, 5, 4 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 3,18,18, 7, 2, 4,18, 6, 2, 3,-1, 8, 5,18,-3 } },
+ { 0, 1, {  3,17,18, 2,18, 6, 7,-3,18, 2, 5, 6, 3, 8, 7,-1 } },
+ { 1, 1, { 18,18, 2,18,18, 2,-1, 7, 3,18, 5, 2, 6, 4,-1,18 } },
+ { 0, 3, { 18, 3, 4, 1, 5, 2,18, 4, 2, 3,18, 7, 6, 1, 2, 4 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 1,18, 2, 3, 6, 4, 5, 7,18, 3, 8, 2, 4,-2,17 } },
+ { 1, 2, { 18,17, 2, 3, 5,18, 6,-2, 7, 3, 2, 4,18, 8,-1, 5 } },
+ { 0, 2, {  1,18,-1,18, 3, 2, 4, 6,-3, 7,-1, 5, 1, 2, 3, 8 } },
+ { 1, 1, {  1,18,-1, 8, 2, 6, 3,-2, 1, 2, 5, 4,-3, 8, 6, 3 } },
+ { 0, 1, { 18,18, 2,18, 2,18, 7, 6,18, 2,-2, 3, 5, 4,18, 8 } },
+ { 1, 2, { 18,17, 2, 3,18,-1, 2, 3, 6,18, 5, 4, 3, 7, 2, 8 } },
+ { 1, 2, { 18,18, 3,-2, 4,18, 5, 7, 6, 2, 4,-3, 8, 5,18, 3 } },
+ { 1, 1, { 17,-2,18,18, 2, 5, 3, 8, 2,-1, 6, 1, 3, 4, 7, 5 } },
+ { 1, 1, {  3,17,18, 5, 7, 2, 4, 6, 1, 8,-1, 3, 7, 4, 1, 2 } },
+ { 0, 2, {  1,-2, 2,18, 3, 5, 2, 4, 7,-1, 2, 3, 5,18,-2, 4 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  1, 2,-2, 6,18,-3, 2, 7, 3,-2, 5, 6, 1, 8, 2, 4 } },
+ { 0, 1, { 18,18,18, 3,-2, 6,18, 2, 4, 3, 5, 8, 7, 6, 2,-2 } },
+ { 1, 1, {  1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 } },
+ { 0, 1, {  3,17,18, 2, 5,18, 6, 7, 5,-2, 2, 4,18, 3, 6, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, { 17,-1,18, 2, 4,-1, 8, 3,18, 7,-3, 4, 5, 1, 2,-2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 8, 6, 4, 5, 7,-1 } },
+ { 1, 1, { 18,18, 3, 6, 4, 8,-2, 2, 5, 3, 7,18, 6, 8, 4, 2 } },
+ { 1, 1, { 17,18,18,-2, 5, 2, 3, 1, 4,-1, 8, 6, 5, 3, 2,18 } },
+ { 1, 1, { 17,17, 1, 2, 4, 5, 2, 6,-1, 3, 1, 1,-2, 4, 2, 7 } },
+ { 1, 1, { 17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 } },
+ { 0, 1, { 18,17,-2,-3, 1, 2, 3, 2, 5, 4, 7,-3, 6,-2, 2, 1 } },
+ { 1, 1, {  1, 3, 5,18, 1, 2, 7, 3, 6, 2, 5, 8,-1, 1, 4, 7 } },
+ { 1, 1, { 17, 3, 6, 8, 1, 4, 5, 3,-2, 7, 2, 8, 5, 6,18, 3 } },
+ { 1, 1, { 17,18, 2, 4, 8,-2, 3, 1, 5, 6, 7, 1, 2, 3, 4, 7 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  3, 1, 8,18, 5, 2, 3,18, 6, 7,-2, 4, 3, 2, 8,18 } },
+ { 0, 1, { 18,17, 2,18, 3, 4,-1,18, 7, 6, 2, 8, 4,18,18, 5 } },
+ { 0, 1, { 18,18, 2,18,18, 2, 7,-2, 6, 5, 4, 3,18, 3, 2,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 8,18, 3, 2, 1, 5, 4, 6,-1, 3,-3, 8,18, 7, 2 } },
+ { 1, 2, { 18,17,18, 2, 3, 5,-2,18, 6,-1, 2, 3, 7, 4, 8,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 8, 6, 4, 5, 7,-1 } },
+ { 1, 2, { 18,18,-2,17, 2,18, 3, 4,18, 8, 7,-1, 2, 4, 5,17 } },
+ { 0, 2, { 17,-3,17, 3, 2,-2,18, 8, 4,-3, 2,18, 5, 3,-2, 6 } },
+ { 0, 1, { 18,18, 2,18,18, 2, 7,-2, 6, 5, 4, 3,18, 3, 2,17 } },
+ { 0, 2, {  1,18,-1, 3, 5, 2,-3,18, 7, 3,-1, 6, 4, 2,17, 5 } },
+ { 1, 1, { 17,-2,17, 2,-3, 1, 5,-1, 4, 6, 3, 2, 8, 7,-2, 5 } },
+ { 1, 1, {  1,18, 1, 3, 5, 8, 6, 2, 3,-1, 7, 1, 4, 8, 5,-3 } },
+ { 0, 2, {  3,18,18, 2,18,-2, 6, 5, 7, 2, 4,18, 3, 6,-3, 5 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 } },
+ { 0, 4, { 18, 2,17, 3,18,-2, 2, 6,18, 2, 7, 3, 5, 4, 8,18 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 18,18, 2, 3, 6, 3, 5,-2, 2, 4,18, 3,-2,-1, 6, 7 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 1, 2, 5, 3,-2, 1, 4, 3, 7, 6,-3, 2, 1, 1, 2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 18,18,-2,18,-2, 2, 3, 6,18, 4,-1, 2, 3, 8, 1, 4 } },
+ { 1, 1, { 17,-2,17, 2,-3, 1, 5,-1, 4, 6, 3, 2, 8, 7,-2, 5 } },
+ { 0, 1, { 17,17,18, 3, 2,18,18, 6, 8, 2,-2, 3, 5, 4,17,18 } },
+ { 1, 1, {  1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 } },
+ { 1, 1, {  1, 3,-3,18,18, 6, 5,18, 2,-1, 3, 8, 7,-3, 4,17 } },
+ { 1, 1, { 18, 1, 2, 1, 3, 8, 7, 4, 1, 5, 2,-1,-3,18, 6, 2 } },
+ { 0, 1, { 18, 3, 5, 2, 6, 8,18, 5, 7, 2, 3,-1, 6, 7, 8, 5 } },
+ { 0, 2, { 18, 3,-2, 7, 8, 2, 5, 4,-3, 8, 3, 2,18, 5, 4, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 3, {  1, 1, 2, 5, 2, 7, 4, 3,-1,18,-2, 8, 2, 1, 6, 7 } },
+ { 0, 1, {  3,17,18, 5, 2, 6, 7,18, 4, 5, 3, 6,18, 2, 7, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, {  1,18, 1, 2, 3, 5, 1, 2, 6, 7, 4, 3, 8, 1,17, 5 } },
+ { 1, 2, { 17,-1,18,-2, 2, 3, 5,18, 2, 4, 6, 7, 3,-1, 5, 8 } },
+ { 1, 1, { 18,18,-3,18,-2, 2, 3,-2,18, 6, 4, 5, 8, 3,17,-3 } },
+ { 1, 1, { 18, 7, 6, 5, 5, 3, 1, 4, 2, 7, 3, 4,-3, 6,18, 8 } },
+ { 0, 2, { 18,18, 2, 3, 5,18, 2, 4, 3, 6,18, 7, 8,-1, 5, 2 } },
+ { 0, 1, { 18,17,-1, 2,18, 3, 2,18, 4, 3,18, 2, 6, 5, 8,17 } },
+ { 0, 2, { 18,17, 2, 3,18, 5,-1, 6, 7, 8, 2, 3, 4, 5,18, 6 } },
+ { 1, 2, { 18,-3,18, 2, 3,-2,-3, 5,18, 7, 6, 2, 4, 3, 8,-2 } },
+ { 1, 1, { 17,18,18,-2, 2, 3, 5, 4, 8,18,-1, 5, 3, 6,-2, 7 } },
+ { 1, 2, { 18,17, 2,-2,18, 3,-1, 4,18, 2, 7, 5, 3, 8, 6, 4 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 } },
+ { 0, 2, { 18,18, 3, 3,-2, 2, 5,18, 6, 3,-1, 4, 7,-1, 1, 2 } },
+ { 0, 1, { -2, 1,18, 2,-2, 5, 7,18, 3, 2, 6, 2,-1, 4,-2,17 } },
+ { 0, 2, { 18,18,18, 2, 3,-2,18, 5, 4, 2, 6, 8, 3,-2, 4,18 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17,18,-1, 3, 2, 5, 1, 3, 2, 8, 4, 7, 6, 2,-1, 5 } },
+ { 1, 1, { 17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 } },
+ { 0, 1, { 18,18,-2,18, 2, 3, 4, 5, 6,18, 8, 2, 3, 7,-2, 4 } },
+ { 0, 1, { 18,-2,18,18,-3,-2, 2, 3, 5, 8, 1, 2, 6, 4, 7,-1 } },
+ { 0, 1, { 18,17, 2,18, 3,-2, 2, 7, 6, 4,18, 3, 8, 7, 4, 2 } },
+ { 1, 1, { 17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 } },
+ { 1, 1, { 18,17,18, 2, 5, 3,-2,18, 6, 2, 3, 4, 8, 7, 5,-1 } },
+ { 0, 1, {  2,-1,18,-1, 2, 4,-3,18, 5, 3, 6,18, 2, 4, 7, 8 } },
+ { 1, 1, { 17,18, 8, 3, 6, 4,-1, 5, 2, 7, 3, 8, 6, 5,18, 4 } },
+ { 0, 2, { 18, 3,-2, 7, 8, 2, 5, 4,-3, 8, 3, 2,18, 5, 4, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  1,18,-1, 8, 2, 6, 3,-2, 1, 2, 5, 4,-3, 8, 6, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 17,18,18, 4, 2, 7, 3, 6,-2,18, 8, 4, 5, 2, 7,17 } },
+ { 1, 2, { 18,-1,18, 3,-2,18, 2, 5, 3, 6, 7, 2,-1,18, 8, 4 } },
+ { 1, 2, {  1,18,-2, 4,18, 2, 3, 6,-1, 7, 5,-2,18, 8, 2, 4 } },
+ { 1, 2, {  1,18,-3, 2, 3,18,-1, 5, 6, 2, 8, 3, 4, 1,-2, 7 } },
+ { 0, 1, {  1,17,-1,18, 3, 2, 5, 4, 6, 7, 8, 3, 4, 2, 1,-2 } },
+ { 1, 1, { 18,17,18, 4, 3, 5, 1, 2, 6, 3, 4, 7, 1, 8, 5, 2 } },
+ { 0, 1, { 18,-2, 7, 1, 3, 2,-3, 4, 6,-2, 7, 8, 1, 5, 4, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, { 18,18,18,-2, 2, 5, 3, 7,18, 2, 4,-3, 5, 6, 3, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 3, {  3,18,-1, 5, 2, 7,18, 6, 5, 2, 4, 3,-1, 7,18, 6 } },
+ { 0, 2, { 18,18,18, 4, 3, 2, 6, 4, 8,18, 5, 3, 2, 7,-2, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, { 18,18,18, 2, 3,-2,18, 5, 4, 2, 6, 8, 3,-2, 4,18 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 8,18, 3, 2, 1, 5, 4, 6,-1, 3,-3, 8,18, 7, 2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18,18, 2, 4, 6,-2, 2, 8, 3, 4,18, 7,-1, 6 } },
+ { 0, 1, { 18, 1,-2, 2, 4, 1, 3,-1, 2, 5, 7, 1, 6, 8,-2,17 } },
+ { 0, 1, { 17,17,18, 2, 5, 4,18, 3, 8, 7, 4, 6, 8, 1, 5, 2 } },
+ { 1, 2, { 18,18, 5, 4, 6, 3, 4,18, 8, 4,-1, 7, 5, 3, 6, 2 } },
+ { 0, 1, { 18,18,-3,18, 3, 6, 2, 5, 7,18, 3, 8,-1, 4, 5, 2 } },
+ { 1, 1, { 18, 2,-2,-3,18, 5, 2,-2, 4, 3, 6,18, 8,-1, 2, 7 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 17,18, 3,18, 2, 5, 4, 7,-3, 6, 3, 2,18, 4, 7, 3 } },
+ { 1, 1, {  1, 7, 4, 5, 3, 4, 5, 1, 3, 6, 3, 2, 4, 8,-2, 7 } },
+ { 0, 1, {  1,18,-1,-2,18, 3, 2,-1, 6, 7, 4, 5, 3,18, 2,-3 } },
+ { 1, 1, { 18,18,-1, 3, 6,18, 5, 4, 8, 2, 3, 6,18, 7, 4,-2 } },
+ { 0, 2, { 18,18, 2, 6,18, 2,18, 5, 3,18, 2, 4, 7, 8, 3,18 } },
+ { 1, 1, {  3,18,18, 5,18, 6, 2, 4, 7,-2,18, 5, 8, 6, 3, 2 } },
+ { 0, 1, { 18,-2, 7, 1, 3, 2,-3, 4, 6,-2, 7, 8, 1, 5, 4, 3 } },
+ { 1, 1, { 18,-2,18, 2, 5,18, 3,-2, 4, 7, 2,-1, 8, 6, 5, 1 } },
+ { 1, 1, { 17,17, 5,18, 4, 1, 2, 8, 6, 4,-2, 3, 5,-1, 1, 8 } },
+ { 0, 2, {  1, 2,17, 3, 7,18, 2,-1, 4, 5,18, 2, 7, 3, 6, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, { 18,18,18, 2,-2, 3, 6, 4, 8,18, 2, 5, 7, 4, 3, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 18, 1, 8, 3, 5, 6, 4,-1, 8, 3, 7,18, 2, 5, 8, 4 } },
+ { 1, 1, { 17,18, 5, 2, 4, 3, 1, 6,-2, 1, 3, 2, 4, 5,-1,17 } },
+ { 1, 1, { 18,17, 2,18, 3,-3, 7, 2, 6, 4, 3, 5,18, 8, 2,-2 } },
+ { 1, 1, { 18,17,18, 4, 3, 5,-1,18, 2, 7, 8, 4, 6, 3,18, 5 } },
+ { 0, 1, { 18,17,18,-2, 2,-3, 3, 4, 8, 5, 2,18, 6, 3, 7,-2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17,18, 8, 3, 4, 6,18, 5,-2, 3, 8, 5, 2, 4, 7, 6 } },
+ { 0, 1, { 18,-2, 3, 5, 1, 7, 3, 2, 6,-3, 4, 1, 5, 8, 3,-2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  3,17,18, 5,-1,18, 2, 6, 7,18, 5, 3,-3,-1, 6, 2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 3, { 18,17,-2, 3,-1,18, 2, 5, 3, 7, 6, 2, 4, 8,18, 5 } },
+ { 0, 1, { 18,-1,18, 2,18, 3, 5,18, 2, 8,18, 5, 4,-1, 6, 2 } },
+ { 1, 2, { 18,-2,18,18, 2, 3, 4,-3, 2, 5,18, 7, 4, 3, 8, 6 } },
+ { 0, 2, { 17,-1,18, 2,-1, 1, 7, 3, 8, 5,-2, 4, 1, 2,-3, 6 } },
+ { 0, 1, { 18,17, 2,18, 2,18, 6, 7, 4, 3,18, 5, 2,-2,17, 8 } },
+ { 0, 3, { 18,17, 2, 3,-3,-1,18, 2, 4, 5,18, 7, 3, 2,-3, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, {  3,18,18,18, 2, 6, 5,18, 7, 2, 4, 6,18, 5, 3, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 18,18, 3, 6, 3,-2, 2,18, 5,-1, 7, 3, 4,-2, 2, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 18,17,18,18,-2, 2, 3,-3,18, 6, 4, 2,-2, 8, 3, 7 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 18,18,18, 4, 2, 7, 8,18, 3, 2,-2, 4, 7, 6,17, 5 } },
+ { 1, 1, { 18,18,-1,-2, 8, 3,18, 6, 3, 5, 8, 2, 4, 7, 1, 6 } },
+ { 1, 1, {  1,-3, 3,18,18, 2,-1, 3, 6, 5,18, 4, 7,-2, 8, 3 } },
+ { 1, 1, {  1,18, 4, 2, 5,18, 1, 3,-1, 6, 1, 4, 8, 2, 5, 1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+};
+
+static const WavPackDecorrSpec * const decorr_filters[] = {
+    &fast_specs[0], &default_specs[0], &high_specs[0], &very_high_specs[0],
+};
+
+static const uint16_t decorr_filter_sizes[] = {
+    FF_ARRAY_ELEMS(fast_specs),
+    FF_ARRAY_ELEMS(default_specs),
+    FF_ARRAY_ELEMS(high_specs),
+    FF_ARRAY_ELEMS(very_high_specs),
+};
+
+static const uint8_t decorr_filter_nterms[] = { 2, 5, 10, 16 };
+
+static const int8_t nbits_table[] = {
+ 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
+};
+
+#endif /* AVCODEC_WAVPACKENC_H */
diff --git a/libavcodec/webp.c b/libavcodec/webp.c
index c4757446d3..5c2961ff19 100644
--- a/libavcodec/webp.c
+++ b/libavcodec/webp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2013 Aneesh Dogra <aneesh@sugarlabs.org>
  * Copyright (c) 2013 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,16 +31,20 @@
  * Lossless decoder
  * Compressed alpha for lossy
  *
+ * @author James Almer <jamrial@gmail.com>
+ * Exif metadata
+ *
  * Unimplemented:
  *   - Animation
  *   - ICC profile
- *   - Exif and XMP metadata
+ *   - XMP metadata
  */
 
 #define BITSTREAM_READER_LE
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "bytestream.h"
+#include "exif.h"
 #include "internal.h"
 #include "get_bits.h"
 #include "thread.h"
@@ -191,6 +195,8 @@ typedef struct WebPContext {
     enum AlphaFilter alpha_filter;      /* filtering method for alpha chunk */
     uint8_t *alpha_data;                /* alpha chunk data */
     int alpha_data_size;                /* alpha chunk data size */
+    int has_exif;                       /* set after an EXIF chunk has been processed */
+    AVDictionary *exif_metadata;        /* EXIF chunk data */
     int width;                          /* image width */
     int height;                         /* image height */
     int lossless;                       /* indicates lossless or lossy */
@@ -303,7 +309,7 @@ static int huff_reader_build_canonical(HuffReader *r, int *code_lengths,
     if (max_code_length == 0 || max_code_length > MAX_HUFFMAN_CODE_LENGTH)
         return AVERROR(EINVAL);
 
-    codes = av_malloc(alphabet_size * sizeof(*codes));
+    codes = av_malloc_array(alphabet_size, sizeof(*codes));
     if (!codes)
         return AVERROR(ENOMEM);
 
@@ -1027,7 +1033,7 @@ static int apply_color_indexing_transform(WebPContext *s)
     ImageContext *img;
     ImageContext *pal;
     int i, x, y;
-    uint8_t *p, *pi;
+    uint8_t *p;
 
     img = &s->image[IMAGE_ROLE_ARGB];
     pal = &s->image[IMAGE_ROLE_COLOR_INDEXING];
@@ -1060,16 +1066,33 @@ static int apply_color_indexing_transform(WebPContext *s)
         av_free(line);
     }
 
-    for (y = 0; y < img->frame->height; y++) {
-        for (x = 0; x < img->frame->width; x++) {
-            p = GET_PIXEL(img->frame, x, y);
-            i = p[2];
-            if (i >= pal->frame->width) {
-                av_log(s->avctx, AV_LOG_ERROR, "invalid palette index %d\n", i);
-                return AVERROR_INVALIDDATA;
+    // switch to local palette if it's worth initializing it
+    if (img->frame->height * img->frame->width > 300) {
+        uint8_t palette[256 * 4];
+        const int size = pal->frame->width * 4;
+        av_assert0(size <= 1024U);
+        memcpy(palette, GET_PIXEL(pal->frame, 0, 0), size);   // copy palette
+        // set extra entries to transparent black
+        memset(palette + size, 0, 256 * 4 - size);
+        for (y = 0; y < img->frame->height; y++) {
+            for (x = 0; x < img->frame->width; x++) {
+                p = GET_PIXEL(img->frame, x, y);
+                i = p[2];
+                AV_COPY32(p, &palette[i * 4]);
+            }
+        }
+    } else {
+        for (y = 0; y < img->frame->height; y++) {
+            for (x = 0; x < img->frame->width; x++) {
+                p = GET_PIXEL(img->frame, x, y);
+                i = p[2];
+                if (i >= pal->frame->width) {
+                    AV_WB32(p, 0x00000000);
+                } else {
+                    const uint8_t *pi = GET_PIXEL(pal->frame, i, 0);
+                    AV_COPY32(p, pi);
+                }
             }
-            pi = GET_PIXEL(pal->frame, i, 0);
-            AV_COPY32(p, pi);
         }
     }
 
@@ -1088,7 +1111,7 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
         avctx->pix_fmt = AV_PIX_FMT_ARGB;
     }
 
-    ret = init_get_bits(&s->gb, data_start, data_size * 8);
+    ret = init_get_bits8(&s->gb, data_start, data_size);
     if (ret < 0)
         return ret;
 
@@ -1134,7 +1157,6 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
     used = 0;
     while (get_bits1(&s->gb)) {
         enum TransformType transform = get_bits(&s->gb, 2);
-        s->transforms[s->nb_transforms++] = transform;
         if (used & (1 << transform)) {
             av_log(avctx, AV_LOG_ERROR, "Transform %d used more than once\n",
                    transform);
@@ -1142,6 +1164,7 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
             goto free_and_return;
         }
         used |= (1 << transform);
+        s->transforms[s->nb_transforms++] = transform;
         switch (transform) {
         case PREDICTOR_TRANSFORM:
             ret = parse_transform_predictor(s);
@@ -1343,6 +1366,7 @@ static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     s->height    = 0;
     *got_frame   = 0;
     s->has_alpha = 0;
+    s->has_exif  = 0;
     bytestream2_init(&gb, avpkt->data, avpkt->size);
 
     if (bytestream2_get_bytes_left(&gb) < 12)
@@ -1362,6 +1386,7 @@ static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
+    av_dict_free(&s->exif_metadata);
     while (bytestream2_get_bytes_left(&gb) > 8) {
         char chunk_str[5] = { 0 };
 
@@ -1392,6 +1417,7 @@ static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                                                 chunk_size, 0);
                 if (ret < 0)
                     return ret;
+                avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
             }
             bytestream2_skip(&gb, chunk_size);
             break;
@@ -1435,10 +1461,44 @@ static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
             break;
         }
+        case MKTAG('E', 'X', 'I', 'F'): {
+            int le, ifd_offset, exif_offset = bytestream2_tell(&gb);
+            GetByteContext exif_gb;
+
+            if (s->has_exif) {
+                av_log(avctx, AV_LOG_VERBOSE, "Ignoring extra EXIF chunk\n");
+                goto exif_end;
+            }
+            if (!(vp8x_flags & VP8X_FLAG_EXIF_METADATA))
+                av_log(avctx, AV_LOG_WARNING,
+                       "EXIF chunk present, but Exif bit not set in the "
+                       "VP8X header\n");
+
+            s->has_exif = 1;
+            bytestream2_init(&exif_gb, avpkt->data + exif_offset,
+                             avpkt->size - exif_offset);
+            if (ff_tdecode_header(&exif_gb, &le, &ifd_offset) < 0) {
+                av_log(avctx, AV_LOG_ERROR, "invalid TIFF header "
+                       "in Exif data\n");
+                goto exif_end;
+            }
+
+            bytestream2_seek(&exif_gb, ifd_offset, SEEK_SET);
+            if (avpriv_exif_decode_ifd(avctx, &exif_gb, le, 0, &s->exif_metadata) < 0) {
+                av_log(avctx, AV_LOG_ERROR, "error decoding Exif data\n");
+                goto exif_end;
+            }
+
+            av_dict_copy(avpriv_frame_get_metadatap(data), s->exif_metadata, 0);
+
+exif_end:
+            av_dict_free(&s->exif_metadata);
+            bytestream2_skip(&gb, chunk_size);
+            break;
+        }
         case MKTAG('I', 'C', 'C', 'P'):
         case MKTAG('A', 'N', 'I', 'M'):
         case MKTAG('A', 'N', 'M', 'F'):
-        case MKTAG('E', 'X', 'I', 'F'):
         case MKTAG('X', 'M', 'P', ' '):
             AV_WL32(chunk_str, chunk_type);
             av_log(avctx, AV_LOG_VERBOSE, "skipping unsupported chunk: %s\n",
diff --git a/libavcodec/webvttdec.c b/libavcodec/webvttdec.c
new file mode 100644
index 0000000000..1284a172c4
--- /dev/null
+++ b/libavcodec/webvttdec.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebVTT subtitle decoder
+ * @see http://dev.w3.org/html5/webvtt/
+ * @todo need to support extended markups and cue settings
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/bprint.h"
+
+static const struct {
+    const char *from;
+    const char *to;
+} webvtt_tag_replace[] = {
+    {"<i>", "{\\i1}"}, {"</i>", "{\\i0}"},
+    {"<b>", "{\\b1}"}, {"</b>", "{\\b0}"},
+    {"<u>", "{\\u1}"}, {"</u>", "{\\u0}"},
+    {"{", "\\{"}, {"}", "\\}"}, // escape to avoid ASS markup conflicts
+};
+
+static int webvtt_event_to_ass(AVBPrint *buf, const char *p)
+{
+    int i, skip = 0;
+
+    while (*p) {
+
+        for (i = 0; i < FF_ARRAY_ELEMS(webvtt_tag_replace); i++) {
+            const char *from = webvtt_tag_replace[i].from;
+            const size_t len = strlen(from);
+            if (!strncmp(p, from, len)) {
+                av_bprintf(buf, "%s", webvtt_tag_replace[i].to);
+                p += len;
+                break;
+            }
+        }
+        if (!*p)
+            break;
+
+        if (*p == '<')
+            skip = 1;
+        else if (*p == '>')
+            skip = 0;
+        else if (p[0] == '\n' && p[1])
+            av_bprintf(buf, "\\N");
+        else if (!skip && *p != '\r')
+            av_bprint_chars(buf, *p, 1);
+        p++;
+    }
+    return 0;
+}
+
+static int webvtt_decode_frame(AVCodecContext *avctx,
+                               void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret = 0;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    AVBPrint buf;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (ptr && avpkt->size > 0 && !webvtt_event_to_ass(&buf, ptr)) {
+        int ts_start     = av_rescale_q(avpkt->pts, avctx->time_base, (AVRational){1,100});
+        int ts_duration  = avpkt->duration != -1 ?
+                           av_rescale_q(avpkt->duration, avctx->time_base, (AVRational){1,100}) : -1;
+        ret = ff_ass_add_rect_bprint(sub, &buf, ts_start, ts_duration);
+    }
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+AVCodec ff_webvtt_decoder = {
+    .name           = "webvtt",
+    .long_name      = NULL_IF_CONFIG_SMALL("WebVTT subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_WEBVTT,
+    .decode         = webvtt_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+};
diff --git a/libavcodec/webvttenc.c b/libavcodec/webvttenc.c
new file mode 100644
index 0000000000..9f67a2eab6
--- /dev/null
+++ b/libavcodec/webvttenc.c
@@ -0,0 +1,219 @@
+/*
+ * WebVTT subtitle encoder
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (c) 2014  Aman Gupta <ffmpeg@tmm1.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdarg.h>
+#include "avcodec.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+#include "ass_split.h"
+#include "ass.h"
+
+#define WEBVTT_STACK_SIZE 64
+typedef struct {
+    AVCodecContext *avctx;
+    ASSSplitContext *ass_ctx;
+    AVBPrint buffer;
+    unsigned timestamp_end;
+    int count;
+    char stack[WEBVTT_STACK_SIZE];
+    int stack_ptr;
+} WebVTTContext;
+
+#ifdef __GNUC__
+__attribute__ ((__format__ (__printf__, 2, 3)))
+#endif
+static void webvtt_print(WebVTTContext *s, const char *str, ...)
+{
+    va_list vargs;
+    va_start(vargs, str);
+    av_vbprintf(&s->buffer, str, vargs);
+    va_end(vargs);
+}
+
+static int webvtt_stack_push(WebVTTContext *s, const char c)
+{
+    if (s->stack_ptr >= WEBVTT_STACK_SIZE)
+        return -1;
+    s->stack[s->stack_ptr++] = c;
+    return 0;
+}
+
+static char webvtt_stack_pop(WebVTTContext *s)
+{
+    if (s->stack_ptr <= 0)
+        return 0;
+    return s->stack[--s->stack_ptr];
+}
+
+static int webvtt_stack_find(WebVTTContext *s, const char c)
+{
+    int i;
+    for (i = s->stack_ptr-1; i >= 0; i--)
+        if (s->stack[i] == c)
+            break;
+    return i;
+}
+
+static void webvtt_close_tag(WebVTTContext *s, char tag)
+{
+    webvtt_print(s, "</%c>", tag);
+}
+
+static void webvtt_stack_push_pop(WebVTTContext *s, const char c, int close)
+{
+    if (close) {
+        int i = c ? webvtt_stack_find(s, c) : 0;
+        if (i < 0)
+            return;
+        while (s->stack_ptr != i)
+            webvtt_close_tag(s, webvtt_stack_pop(s));
+    } else if (webvtt_stack_push(s, c) < 0)
+        av_log(s->avctx, AV_LOG_ERROR, "tag stack overflow\n");
+}
+
+static void webvtt_style_apply(WebVTTContext *s, const char *style)
+{
+    ASSStyle *st = ff_ass_style_get(s->ass_ctx, style);
+    if (st) {
+        if (st->bold != ASS_DEFAULT_BOLD) {
+            webvtt_print(s, "<b>");
+            webvtt_stack_push(s, 'b');
+        }
+        if (st->italic != ASS_DEFAULT_ITALIC) {
+            webvtt_print(s, "<i>");
+            webvtt_stack_push(s, 'i');
+        }
+        if (st->underline != ASS_DEFAULT_UNDERLINE) {
+            webvtt_print(s, "<u>");
+            webvtt_stack_push(s, 'u');
+        }
+    }
+}
+
+static void webvtt_text_cb(void *priv, const char *text, int len)
+{
+    WebVTTContext *s = priv;
+    av_bprint_append_data(&s->buffer, text, len);
+}
+
+static void webvtt_new_line_cb(void *priv, int forced)
+{
+    webvtt_print(priv, "\n");
+}
+
+static void webvtt_style_cb(void *priv, char style, int close)
+{
+    if (style == 's') // strikethrough unsupported
+        return;
+
+    webvtt_stack_push_pop(priv, style, close);
+    if (!close)
+        webvtt_print(priv, "<%c>", style);
+}
+
+static void webvtt_cancel_overrides_cb(void *priv, const char *style)
+{
+    webvtt_stack_push_pop(priv, 0, 1);
+    webvtt_style_apply(priv, style);
+}
+
+static void webvtt_end_cb(void *priv)
+{
+    webvtt_stack_push_pop(priv, 0, 1);
+}
+
+static const ASSCodesCallbacks webvtt_callbacks = {
+    .text             = webvtt_text_cb,
+    .new_line         = webvtt_new_line_cb,
+    .style            = webvtt_style_cb,
+    .color            = NULL,
+    .font_name        = NULL,
+    .font_size        = NULL,
+    .alignment        = NULL,
+    .cancel_overrides = webvtt_cancel_overrides_cb,
+    .move             = NULL,
+    .end              = webvtt_end_cb,
+};
+
+static int webvtt_encode_frame(AVCodecContext *avctx,
+                               unsigned char *buf, int bufsize, const AVSubtitle *sub)
+{
+    WebVTTContext *s = avctx->priv_data;
+    ASSDialog *dialog;
+    int i, num;
+
+    av_bprint_clear(&s->buffer);
+
+    for (i=0; i<sub->num_rects; i++) {
+        if (sub->rects[i]->type != SUBTITLE_ASS) {
+            av_log(avctx, AV_LOG_ERROR, "Only SUBTITLE_ASS type supported.\n");
+            return AVERROR(ENOSYS);
+        }
+
+        dialog = ff_ass_split_dialog(s->ass_ctx, sub->rects[i]->ass, 0, &num);
+        for (; dialog && num--; dialog++) {
+            webvtt_style_apply(s, dialog->style);
+            ff_ass_split_override_codes(&webvtt_callbacks, s, dialog->text);
+        }
+    }
+
+    if (!av_bprint_is_complete(&s->buffer))
+        return AVERROR(ENOMEM);
+    if (!s->buffer.len)
+        return 0;
+
+    if (s->buffer.len > bufsize) {
+        av_log(avctx, AV_LOG_ERROR, "Buffer too small for ASS event.\n");
+        return -1;
+    }
+    memcpy(buf, s->buffer.str, s->buffer.len);
+
+    return s->buffer.len;
+}
+
+static int webvtt_encode_close(AVCodecContext *avctx)
+{
+    WebVTTContext *s = avctx->priv_data;
+    ff_ass_split_free(s->ass_ctx);
+    av_bprint_finalize(&s->buffer, NULL);
+    return 0;
+}
+
+static av_cold int webvtt_encode_init(AVCodecContext *avctx)
+{
+    WebVTTContext *s = avctx->priv_data;
+    s->avctx = avctx;
+    s->ass_ctx = ff_ass_split(avctx->subtitle_header);
+    av_bprint_init(&s->buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
+    return s->ass_ctx ? 0 : AVERROR_INVALIDDATA;
+}
+
+AVCodec ff_webvtt_encoder = {
+    .name           = "webvtt",
+    .long_name      = NULL_IF_CONFIG_SMALL("WebVTT subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_WEBVTT,
+    .priv_data_size = sizeof(WebVTTContext),
+    .init           = webvtt_encode_init,
+    .encode_sub     = webvtt_encode_frame,
+    .close          = webvtt_encode_close,
+};
diff --git a/libavcodec/wma.c b/libavcodec/wma.c
index f0aabfcba3..6d1c7e5c2f 100644
--- a/libavcodec/wma.c
+++ b/libavcodec/wma.c
@@ -1,21 +1,21 @@
 /*
  * WMA compatible codec
- * Copyright (c) 2002-2007 The Libav Project
+ * Copyright (c) 2002-2007 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,9 +29,6 @@
 #include "wma_freqs.h"
 #include "wmadata.h"
 
-#undef NDEBUG
-#include <assert.h>
-
 /* XXX: use same run/length optimization as mpeg decoders */
 // FIXME maybe split decode / encode or pass flag
 static av_cold int init_coef_vlc(VLC *vlc, uint16_t **prun_table,
@@ -48,10 +45,10 @@ static av_cold int init_coef_vlc(VLC *vlc, uint16_t **prun_table,
 
     init_vlc(vlc, VLCBITS, n, table_bits, 1, 1, table_codes, 4, 4, 0);
 
-    run_table    = av_malloc(n * sizeof(uint16_t));
-    level_table  = av_malloc(n * sizeof(uint16_t));
-    flevel_table = av_malloc(n * sizeof(*flevel_table));
-    int_table    = av_malloc(n * sizeof(uint16_t));
+    run_table    = av_malloc_array(n, sizeof(uint16_t));
+    level_table  = av_malloc_array(n, sizeof(uint16_t));
+    flevel_table = av_malloc_array(n, sizeof(*flevel_table));
+    int_table    = av_malloc_array(n, sizeof(uint16_t));
     if (!run_table || !level_table || !flevel_table || !int_table) {
         av_freep(&run_table);
         av_freep(&level_table);
@@ -95,7 +92,6 @@ av_cold int ff_wma_init(AVCodecContext *avctx, int flags2)
         avctx->bit_rate    <= 0)
         return -1;
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
 
     if (avctx->codec->id == AV_CODEC_ID_WMAV1)
         s->version = 1;
@@ -144,6 +140,10 @@ av_cold int ff_wma_init(AVCodecContext *avctx, int flags2)
     bps                 = (float) avctx->bit_rate /
                           (float) (avctx->channels * avctx->sample_rate);
     s->byte_offset_bits = av_log2((int) (bps * s->frame_len / 8.0 + 0.5)) + 2;
+    if (s->byte_offset_bits + 3 > MIN_CACHE_BITS) {
+        av_log(avctx, AV_LOG_ERROR, "byte_offset_bits %d is too large\n", s->byte_offset_bits);
+        return AVERROR_PATCHWELCOME;
+    }
 
     /* compute high frequency value and choose if noise coding should
      * be activated */
@@ -185,8 +185,8 @@ av_cold int ff_wma_init(AVCodecContext *avctx, int flags2)
             high_freq = high_freq * 0.5;
     }
     ff_dlog(s->avctx, "flags2=0x%x\n", flags2);
-    ff_dlog(s->avctx, "version=%d channels=%d sample_rate=%d bitrate=%d block_align=%d\n",
-            s->version, avctx->channels, avctx->sample_rate, avctx->bit_rate,
+    ff_dlog(s->avctx, "version=%d channels=%d sample_rate=%d bitrate=%"PRId64" block_align=%d\n",
+            s->version, avctx->channels, avctx->sample_rate, (int64_t)avctx->bit_rate,
             avctx->block_align);
     ff_dlog(s->avctx, "bps=%f bps1=%f high_freq=%f bitoffset=%d\n",
             bps, bps1, high_freq, s->byte_offset_bits);
@@ -338,6 +338,10 @@ av_cold int ff_wma_init(AVCodecContext *avctx, int flags2)
 #endif /* TRACE */
     }
 
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
+
     /* choose the VLC tables for the coefficients */
     coef_vlc_table = 2;
     if (avctx->sample_rate >= 32000) {
@@ -385,10 +389,11 @@ int ff_wma_end(AVCodecContext *avctx)
         ff_free_vlc(&s->hgain_vlc);
     for (i = 0; i < 2; i++) {
         ff_free_vlc(&s->coef_vlc[i]);
-        av_free(s->run_table[i]);
-        av_free(s->level_table[i]);
-        av_free(s->int_table[i]);
+        av_freep(&s->run_table[i]);
+        av_freep(&s->level_table[i]);
+        av_freep(&s->int_table[i]);
     }
+    av_freep(&s->fdsp);
 
     return 0;
 }
@@ -447,7 +452,7 @@ int ff_wma_run_level_decode(AVCodecContext *avctx, GetBitContext *gb,
             /** normal code */
             offset                  += run_table[code];
             sign                     = get_bits1(gb) - 1;
-            iptr[offset & coef_mask] = ilvl[code] ^ sign << 31;
+            iptr[offset & coef_mask] = ilvl[code] ^ (sign & 0x80000000);
         } else if (code == 1) {
             /** EOB */
             break;
@@ -479,7 +484,11 @@ int ff_wma_run_level_decode(AVCodecContext *avctx, GetBitContext *gb,
     }
     /** NOTE: EOB can be omitted */
     if (offset > num_coefs) {
-        av_log(avctx, AV_LOG_ERROR, "overflow in spectral RLE, ignoring\n");
+        av_log(avctx, AV_LOG_ERROR,
+               "overflow (%d > %d) in spectral RLE, ignoring\n",
+               offset,
+               num_coefs
+              );
         return -1;
     }
 
diff --git a/libavcodec/wma.h b/libavcodec/wma.h
index c954d71e28..4aad713fa8 100644
--- a/libavcodec/wma.h
+++ b/libavcodec/wma.h
@@ -1,21 +1,21 @@
 /*
  * WMA compatible codec
- * Copyright (c) 2002-2007 The Libav Project
+ * Copyright (c) 2002-2007 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,7 +42,7 @@
 #define NB_LSP_COEFS 10
 
 /* XXX: is it a suitable value ? */
-#define MAX_CODED_SUPERFRAME_SIZE 16384
+#define MAX_CODED_SUPERFRAME_SIZE 32768
 
 #define MAX_CHANNELS 2
 
@@ -116,7 +116,7 @@ typedef struct WMACodecContext {
     DECLARE_ALIGNED(32, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
     DECLARE_ALIGNED(32, FFTSample, output)[BLOCK_MAX_SIZE * 2];
     FFTContext mdct_ctx[BLOCK_NB_SIZES];
-    float *windows[BLOCK_NB_SIZES];
+    const float *windows[BLOCK_NB_SIZES];
     /* output buffer for one frame and the last for IMDCT windowing */
     DECLARE_ALIGNED(32, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
     /* last frame info */
@@ -131,7 +131,7 @@ typedef struct WMACodecContext {
     float lsp_pow_e_table[256];
     float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
     float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
 
 #ifdef TRACE
     int frame_count;
diff --git a/libavcodec/wma_common.c b/libavcodec/wma_common.c
index cf76f5cf52..c01e0f4e70 100644
--- a/libavcodec/wma_common.c
+++ b/libavcodec/wma_common.c
@@ -1,20 +1,20 @@
 /*
  * common code shared by all WMA variants
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wma_common.h b/libavcodec/wma_common.h
index 61b1a35e02..55404afc45 100644
--- a/libavcodec/wma_common.h
+++ b/libavcodec/wma_common.h
@@ -1,20 +1,20 @@
 /*
  * common code shared by all WMA variants
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wma_freqs.c b/libavcodec/wma_freqs.c
index 82cef3bfc3..03a283fca5 100644
--- a/libavcodec/wma_freqs.c
+++ b/libavcodec/wma_freqs.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wma_freqs.h b/libavcodec/wma_freqs.h
index d40ab65b96..6fd93e4eff 100644
--- a/libavcodec/wma_freqs.h
+++ b/libavcodec/wma_freqs.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmadata.h b/libavcodec/wmadata.h
index 58bffedd6e..641cb1813c 100644
--- a/libavcodec/wmadata.h
+++ b/libavcodec/wmadata.h
@@ -1,21 +1,21 @@
 /*
  * WMA compatible decoder
- * copyright (c) 2002 The Libav Project
+ * copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmadec.c b/libavcodec/wmadec.c
index b79dd2a101..e814007ead 100644
--- a/libavcodec/wmadec.c
+++ b/libavcodec/wmadec.c
@@ -1,21 +1,21 @@
 /*
  * WMA compatible decoder
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,9 +39,6 @@
 #include "internal.h"
 #include "wma.h"
 
-#undef NDEBUG
-#include <assert.h>
-
 #define EXPVLCBITS 8
 #define EXPMAX     ((19 + EXPVLCBITS - 1) / EXPVLCBITS)
 
@@ -94,6 +91,16 @@ static av_cold int wma_decode_init(AVCodecContext *avctx)
     s->use_bit_reservoir      = flags2 & 0x0002;
     s->use_variable_block_len = flags2 & 0x0004;
 
+    if (avctx->codec->id == AV_CODEC_ID_WMAV2 && avctx->extradata_size >= 8){
+        if (AV_RL16(extradata+4)==0xd && s->use_variable_block_len){
+            av_log(avctx, AV_LOG_WARNING, "Disabling use_variable_block_len, if this fails contact the ffmpeg developers and send us the file\n");
+            s->use_variable_block_len= 0; // this fixes issue1503
+        }
+    }
+
+    for (i=0; i<MAX_CHANNELS; i++)
+        s->max_exponent[i] = 1.0;
+
     if (ff_wma_init(avctx, flags2) < 0)
         return -1;
 
@@ -380,14 +387,14 @@ static void wma_window(WMACodecContext *s, float *out)
         block_len = s->block_len;
         bsize     = s->frame_len_bits - s->block_len_bits;
 
-        s->fdsp.vector_fmul_add(out, in, s->windows[bsize],
+        s->fdsp->vector_fmul_add(out, in, s->windows[bsize],
                                 out, block_len);
     } else {
         block_len = 1 << s->prev_block_len_bits;
         n         = (s->block_len - block_len) / 2;
         bsize     = s->frame_len_bits - s->prev_block_len_bits;
 
-        s->fdsp.vector_fmul_add(out + n, in + n, s->windows[bsize],
+        s->fdsp->vector_fmul_add(out + n, in + n, s->windows[bsize],
                                 out + n, block_len);
 
         memcpy(out + n + block_len, in + n + block_len, n * sizeof(float));
@@ -401,7 +408,7 @@ static void wma_window(WMACodecContext *s, float *out)
         block_len = s->block_len;
         bsize     = s->frame_len_bits - s->block_len_bits;
 
-        s->fdsp.vector_fmul_reverse(out, in, s->windows[bsize], block_len);
+        s->fdsp->vector_fmul_reverse(out, in, s->windows[bsize], block_len);
     } else {
         block_len = 1 << s->next_block_len_bits;
         n         = (s->block_len - block_len) / 2;
@@ -409,7 +416,7 @@ static void wma_window(WMACodecContext *s, float *out)
 
         memcpy(out, in, n * sizeof(float));
 
-        s->fdsp.vector_fmul_reverse(out + n, in + n, s->windows[bsize],
+        s->fdsp->vector_fmul_reverse(out + n, in + n, s->windows[bsize],
                                     block_len);
 
         memset(out + n + block_len, 0, n * sizeof(float));
@@ -475,6 +482,11 @@ static int wma_decode_block(WMACodecContext *s)
         s->block_len_bits      = s->frame_len_bits;
     }
 
+    if (s->frame_len_bits - s->block_len_bits >= s->nb_block_sizes){
+        av_log(s->avctx, AV_LOG_ERROR, "block_len_bits not initialized to a valid value\n");
+        return -1;
+    }
+
     /* now check if the block length is coherent with the frame length */
     s->block_len = 1 << s->block_len_bits;
     if ((s->block_pos + s->block_len) > s->frame_len) {
@@ -502,6 +514,10 @@ static int wma_decode_block(WMACodecContext *s)
      * coef escape coding */
     total_gain = 1;
     for (;;) {
+        if (get_bits_left(&s->gb) < 7) {
+            av_log(s->avctx, AV_LOG_ERROR, "total_gain overread\n");
+            return AVERROR_INVALIDDATA;
+        }
         a           = get_bits(&s->gb, 7);
         total_gain += a;
         if (a != 127)
@@ -681,7 +697,7 @@ static int wma_decode_block(WMACodecContext *s)
 
                 /* very high freqs : noise */
                 n     = s->block_len - s->coefs_end[bsize];
-                mult1 = mult * exponents[((-1 << bsize)) >> esize];
+                mult1 = mult * exponents[(-(1 << bsize)) >> esize];
                 for (i = 0; i < n; i++) {
                     *coefs++       = s->noise_table[s->noise_index] * mult1;
                     s->noise_index = (s->noise_index + 1) & (NOISE_TAB_SIZE - 1);
@@ -719,7 +735,7 @@ static int wma_decode_block(WMACodecContext *s)
             s->channel_coded[0] = 1;
         }
 
-        s->fdsp.butterflies_float(s->coefs[0], s->coefs[1], s->block_len);
+        s->fdsp->butterflies_float(s->coefs[0], s->coefs[1], s->block_len);
     }
 
 next:
@@ -811,7 +827,8 @@ static int wma_decode_superframe(AVCodecContext *avctx, void *data,
                buf_size, avctx->block_align);
         return AVERROR_INVALIDDATA;
     }
-    buf_size = avctx->block_align;
+    if (avctx->block_align)
+        buf_size = avctx->block_align;
 
     init_get_bits(&s->gb, buf, buf_size * 8);
 
@@ -819,15 +836,38 @@ static int wma_decode_superframe(AVCodecContext *avctx, void *data,
         /* read super frame header */
         skip_bits(&s->gb, 4); /* super frame index */
         nb_frames = get_bits(&s->gb, 4) - (s->last_superframe_len <= 0);
+        if (nb_frames <= 0) {
+            int is_error = nb_frames < 0 || get_bits_left(&s->gb) <= 8;
+            av_log(avctx, is_error ? AV_LOG_ERROR : AV_LOG_WARNING,
+                   "nb_frames is %d bits left %d\n",
+                   nb_frames, get_bits_left(&s->gb));
+            if (is_error)
+                return AVERROR_INVALIDDATA;
+
+            if ((s->last_superframe_len + buf_size - 1) >
+                MAX_CODED_SUPERFRAME_SIZE)
+                goto fail;
+
+            q   = s->last_superframe + s->last_superframe_len;
+            len = buf_size - 1;
+            while (len > 0) {
+                *q++ = get_bits (&s->gb, 8);
+                len --;
+            }
+            memset(q, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+
+            s->last_superframe_len += 8*buf_size - 8;
+//             s->reset_block_lengths = 1; //XXX is this needed ?
+            *got_frame_ptr = 0;
+            return buf_size;
+        }
     } else
         nb_frames = 1;
 
     /* get output buffer */
     frame->nb_samples = nb_frames * s->frame_len;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples        = (float **) frame->extended_data;
     samples_offset = 0;
 
@@ -904,13 +944,13 @@ static int wma_decode_superframe(AVCodecContext *avctx, void *data,
         samples_offset += s->frame_len;
     }
 
-    ff_dlog(s->avctx, "%d %d %d %d outbytes:%td eaten:%d\n",
+    ff_dlog(s->avctx, "%d %d %d %d outbytes:%"PTRDIFF_SPECIFIER" eaten:%d\n",
             s->frame_len_bits, s->block_len_bits, s->frame_len, s->block_len,
             (int8_t *) samples - (int8_t *) data, avctx->block_align);
 
     *got_frame_ptr = 1;
 
-    return avctx->block_align;
+    return buf_size;
 
 fail:
     /* when error, we reset the bit reservoir */
@@ -926,6 +966,7 @@ static av_cold void flush(AVCodecContext *avctx)
     s->last_superframe_len = 0;
 }
 
+#if CONFIG_WMAV1_DECODER
 AVCodec ff_wmav1_decoder = {
     .name           = "wmav1",
     .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio 1"),
@@ -940,7 +981,8 @@ AVCodec ff_wmav1_decoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_WMAV2_DECODER
 AVCodec ff_wmav2_decoder = {
     .name           = "wmav2",
     .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio 2"),
@@ -955,3 +997,4 @@ AVCodec ff_wmav2_decoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
+#endif
diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c
index c176daa7c9..faf0cb518d 100644
--- a/libavcodec/wmaenc.c
+++ b/libavcodec/wmaenc.c
@@ -2,20 +2,20 @@
  * WMA compatible encoder
  * Copyright (c) 2007 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,9 +24,7 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "wma.h"
-
-#undef NDEBUG
-#include <assert.h>
+#include "libavutil/avassert.h"
 
 
 static av_cold int encode_init(AVCodecContext *avctx)
@@ -39,21 +37,21 @@ static av_cold int encode_init(AVCodecContext *avctx)
 
     if (avctx->channels > MAX_CHANNELS) {
         av_log(avctx, AV_LOG_ERROR,
-               "too many channels: got %i, need %i or fewer",
+               "too many channels: got %i, need %i or fewer\n",
                avctx->channels, MAX_CHANNELS);
         return AVERROR(EINVAL);
     }
 
     if (avctx->sample_rate > 48000) {
-        av_log(avctx, AV_LOG_ERROR, "sample rate is too high: %d > 48kHz",
+        av_log(avctx, AV_LOG_ERROR, "sample rate is too high: %d > 48kHz\n",
                avctx->sample_rate);
         return AVERROR(EINVAL);
     }
 
     if (avctx->bit_rate < 24 * 1000) {
         av_log(avctx, AV_LOG_ERROR,
-               "bitrate too low: got %i, need 24000 or higher\n",
-               avctx->bit_rate);
+               "bitrate too low: got %"PRId64", need 24000 or higher\n",
+               (int64_t)avctx->bit_rate);
         return AVERROR(EINVAL);
     }
 
@@ -75,7 +73,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
         AV_WL32(extradata, flags1);
         AV_WL16(extradata + 4, flags2);
     } else {
-        assert(0);
+        av_assert0(0);
     }
     avctx->extradata          = extradata;
     s->use_exp_vlc            = flags2 & 0x0001;
@@ -94,8 +92,6 @@ static av_cold int encode_init(AVCodecContext *avctx)
                          (avctx->sample_rate * 8);
     block_align        = FFMIN(block_align, MAX_CODED_SUPERFRAME_SIZE);
     avctx->block_align = block_align;
-    avctx->bit_rate    = avctx->block_align * 8LL * avctx->sample_rate /
-                         s->frame_len;
     avctx->frame_size = avctx->initial_padding = s->frame_len;
 
     return 0;
@@ -115,10 +111,10 @@ static void apply_window_and_mdct(AVCodecContext *avctx, const AVFrame *frame)
 
     for (ch = 0; ch < avctx->channels; ch++) {
         memcpy(s->output, s->frame_out[ch], window_len * sizeof(*s->output));
-        s->fdsp.vector_fmul_scalar(s->frame_out[ch], audio[ch], n, len);
-        s->fdsp.vector_fmul_reverse(&s->output[window_len], s->frame_out[ch],
+        s->fdsp->vector_fmul_scalar(s->frame_out[ch], audio[ch], n, len);
+        s->fdsp->vector_fmul_reverse(&s->output[window_len], s->frame_out[ch],
                                     win, len);
-        s->fdsp.vector_fmul(s->frame_out[ch], s->frame_out[ch], win, len);
+        s->fdsp->vector_fmul(s->frame_out[ch], s->frame_out[ch], win, len);
         mdct->mdct_calc(mdct, s->coefs[ch], s->output);
     }
 }
@@ -157,7 +153,7 @@ static void encode_exp_vlc(WMACodecContext *s, int ch, const int *exp_param)
     q_end = q + s->block_len;
     if (s->version == 1) {
         last_exp = *exp_param++;
-        assert(last_exp - 10 >= 0 && last_exp - 10 < 32);
+        av_assert0(last_exp - 10 >= 0 && last_exp - 10 < 32);
         put_bits(&s->pb, 5, last_exp - 10);
         q += *ptr++;
     } else
@@ -165,7 +161,7 @@ static void encode_exp_vlc(WMACodecContext *s, int ch, const int *exp_param)
     while (q < q_end) {
         int exp  = *exp_param++;
         int code = exp - last_exp + 60;
-        assert(code >= 0 && code < 120);
+        av_assert1(code >= 0 && code < 120);
         put_bits(&s->pb, ff_aac_scalefactor_bits[code],
                  ff_aac_scalefactor_code[code]);
         /* XXX: use a table */
@@ -190,7 +186,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
 
     // FIXME remove duplication relative to decoder
     if (s->use_variable_block_len) {
-        assert(0); // FIXME not implemented
+        av_assert0(0); // FIXME not implemented
     } else {
         /* fixed block len */
         s->next_block_len_bits = s->frame_len_bits;
@@ -199,7 +195,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
     }
 
     s->block_len = 1 << s->block_len_bits;
-//     assert((s->block_pos + s->block_len) <= s->frame_len);
+//     av_assert0((s->block_pos + s->block_len) <= s->frame_len);
     bsize = s->frame_len_bits - s->block_len_bits;
 
     // FIXME factor
@@ -235,7 +231,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
             mult     *= mdct_norm;
             coefs     = src_coefs[ch];
             if (s->use_noise_coding && 0) {
-                assert(0); // FIXME not implemented
+                av_assert0(0); // FIXME not implemented
             } else {
                 coefs += s->coefs_start;
                 n      = nb_coefs[ch];
@@ -290,13 +286,13 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
                 if (s->use_exp_vlc) {
                     encode_exp_vlc(s, ch, fixed_exp);
                 } else {
-                    assert(0); // FIXME not implemented
+                    av_assert0(0); // FIXME not implemented
 //                    encode_exp_lsp(s, ch);
                 }
             }
         }
     } else
-        assert(0); // FIXME not implemented
+        av_assert0(0); // FIXME not implemented
 
     for (ch = 0; ch < s->avctx->channels; ch++) {
         if (s->channel_coded[ch]) {
@@ -316,7 +312,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
                         if (run < s->coef_vlcs[tindex]->levels[abs_level - 1])
                             code = run + s->int_table[tindex][abs_level - 1];
 
-                    assert(code < s->coef_vlcs[tindex]->n);
+                    av_assert2(code < s->coef_vlcs[tindex]->n);
                     put_bits(&s->pb, s->coef_vlcs[tindex]->huffbits[code],
                              s->coef_vlcs[tindex]->huffcodes[code]);
 
@@ -349,7 +345,7 @@ static int encode_frame(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
     init_put_bits(&s->pb, buf, buf_size);
 
     if (s->use_bit_reservoir)
-        assert(0); // FIXME not implemented
+        av_assert0(0); // FIXME not implemented
     else if (encode_block(s, src_coefs, total_gain) < 0)
         return INT_MAX;
 
@@ -362,7 +358,7 @@ static int encode_superframe(AVCodecContext *avctx, AVPacket *avpkt,
                              const AVFrame *frame, int *got_packet_ptr)
 {
     WMACodecContext *s = avctx->priv_data;
-    int i, total_gain, ret;
+    int i, total_gain, ret, error;
 
     s->block_len_bits = s->frame_len_bits; // required by non variable block len
     s->block_len      = 1 << s->block_len_bits;
@@ -381,46 +377,32 @@ static int encode_superframe(AVCodecContext *avctx, AVPacket *avpkt,
         }
     }
 
-    if ((ret = ff_alloc_packet(avpkt, 2 * MAX_CODED_SUPERFRAME_SIZE))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 2 * MAX_CODED_SUPERFRAME_SIZE, 0)) < 0)
         return ret;
-    }
 
-#if 1
     total_gain = 128;
     for (i = 64; i; i >>= 1) {
-        int error = encode_frame(s, s->coefs, avpkt->data, avpkt->size,
+        error = encode_frame(s, s->coefs, avpkt->data, avpkt->size,
                                  total_gain - i);
-        if (error < 0)
-            total_gain -= i;
-    }
-#else
-    total_gain = 90;
-    best = encode_frame(s, s->coefs, avpkt->data, avpkt->size, total_gain);
-    for (i = 32; i; i >>= 1) {
-        int scoreL = encode_frame(s, s->coefs, avpkt->data, avpkt->size, total_gain - i);
-        int scoreR = encode_frame(s, s->coefs, avpkt->data, avpkt->size, total_gain + i);
-        av_log(NULL, AV_LOG_ERROR, "%d %d %d (%d)\n", scoreL, best, scoreR, total_gain);
-        if (scoreL < FFMIN(best, scoreR)) {
-            best        = scoreL;
+        if (error <= 0)
             total_gain -= i;
-        } else if (scoreR < best) {
-            best        = scoreR;
-            total_gain += i;
-        }
     }
-#endif /* 1 */
 
-    if ((i = encode_frame(s, s->coefs, avpkt->data, avpkt->size, total_gain)) >= 0) {
-        av_log(avctx, AV_LOG_ERROR, "required frame size too large. please "
-                                    "use a higher bit rate.\n");
+    while(total_gain <= 128 && error > 0)
+        error = encode_frame(s, s->coefs, avpkt->data, avpkt->size, total_gain++);
+    if (error > 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid input data or requested bitrate too low, cannot encode\n");
+        avpkt->size = 0;
         return AVERROR(EINVAL);
     }
-    assert((put_bits_count(&s->pb) & 7) == 0);
-    while (i++)
+    av_assert0((put_bits_count(&s->pb) & 7) == 0);
+    i= avctx->block_align - (put_bits_count(&s->pb)+7)/8;
+    av_assert0(i>=0);
+    while(i--)
         put_bits(&s->pb, 8, 'N');
 
     flush_put_bits(&s->pb);
+    av_assert0(put_bits_ptr(&s->pb) - s->pb.buf == avctx->block_align);
 
     if (frame->pts != AV_NOPTS_VALUE)
         avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->initial_padding);
@@ -430,6 +412,7 @@ static int encode_superframe(AVCodecContext *avctx, AVPacket *avpkt,
     return 0;
 }
 
+#if CONFIG_WMAV1_ENCODER
 AVCodec ff_wmav1_encoder = {
     .name           = "wmav1",
     .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio 1"),
@@ -442,7 +425,8 @@ AVCodec ff_wmav1_encoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_WMAV2_ENCODER
 AVCodec ff_wmav2_encoder = {
     .name           = "wmav2",
     .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio 2"),
@@ -455,3 +439,4 @@ AVCodec ff_wmav2_encoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
+#endif
diff --git a/libavcodec/wmalosslessdec.c b/libavcodec/wmalosslessdec.c
index e752bd02ea..e9d0bca616 100644
--- a/libavcodec/wmalosslessdec.c
+++ b/libavcodec/wmalosslessdec.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2011 Andreas Öman
  * Copyright (c) 2011 - 2012 Mashiat Sarker Shakkhar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 #include "internal.h"
 #include "get_bits.h"
 #include "put_bits.h"
+#include "lossless_audiodsp.h"
 #include "wma.h"
 #include "wma_common.h"
 
@@ -46,6 +47,7 @@
 #define WMALL_BLOCK_MAX_SIZE (1 << WMALL_BLOCK_MAX_BITS)    ///< maximum block size
 #define WMALL_BLOCK_SIZES    (WMALL_BLOCK_MAX_BITS - WMALL_BLOCK_MIN_BITS + 1) ///< possible block sizes
 
+#define WMALL_COEFF_PAD_SIZE   16                       ///< pad coef buffers with 0 for use with SIMD
 
 /**
  * @brief frame-specific decoder context for a single channel
@@ -69,6 +71,7 @@ typedef struct WmallDecodeCtx {
     /* generic decoder variables */
     AVCodecContext  *avctx;
     AVFrame         *frame;
+    LLAudDSPContext dsp;                           ///< accelerated DSP functions
     uint8_t         frame_data[MAX_FRAMESIZE + AV_INPUT_BUFFER_PADDING_SIZE];  ///< compressed frame data
     PutBitContext   pb;                             ///< context for filling the frame_data buffer
 
@@ -124,8 +127,8 @@ typedef struct WmallDecodeCtx {
 
     int8_t  acfilter_order;
     int8_t  acfilter_scaling;
-    int64_t acfilter_coeffs[16];
-    int     acfilter_prevvalues[2][16];
+    int16_t acfilter_coeffs[16];
+    int     acfilter_prevvalues[WMALL_MAX_CHANNELS][16];
 
     int8_t  mclms_order;
     int8_t  mclms_scaling;
@@ -143,35 +146,37 @@ typedef struct WmallDecodeCtx {
         int scaling;
         int coefsend;
         int bitsend;
-        int16_t coefs[MAX_ORDER];
-        int16_t lms_prevvalues[MAX_ORDER * 2];
-        int16_t lms_updates[MAX_ORDER * 2];
+        DECLARE_ALIGNED(16, int16_t, coefs)[MAX_ORDER + WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
+        DECLARE_ALIGNED(16, int16_t, lms_prevvalues)[MAX_ORDER * 2 + WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
+        DECLARE_ALIGNED(16, int16_t, lms_updates)[MAX_ORDER * 2 + WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
         int recent;
-    } cdlms[2][9];
+    } cdlms[WMALL_MAX_CHANNELS][9];
 
-    int cdlms_ttl[2];
+    int cdlms_ttl[WMALL_MAX_CHANNELS];
 
     int bV3RTM;
 
-    int is_channel_coded[2];
-    int update_speed[2];
+    int is_channel_coded[WMALL_MAX_CHANNELS];
+    int update_speed[WMALL_MAX_CHANNELS];
 
-    int transient[2];
-    int transient_pos[2];
+    int transient[WMALL_MAX_CHANNELS];
+    int transient_pos[WMALL_MAX_CHANNELS];
     int seekable_tile;
 
-    int ave_sum[2];
+    int ave_sum[WMALL_MAX_CHANNELS];
 
-    int channel_residues[2][WMALL_BLOCK_MAX_SIZE];
+    int channel_residues[WMALL_MAX_CHANNELS][WMALL_BLOCK_MAX_SIZE];
 
-    int lpc_coefs[2][40];
+    int lpc_coefs[WMALL_MAX_CHANNELS][40];
     int lpc_order;
     int lpc_scaling;
     int lpc_intbits;
 
-    int channel_coeffs[2][WMALL_BLOCK_MAX_SIZE];
+    int channel_coeffs[WMALL_MAX_CHANNELS][WMALL_BLOCK_MAX_SIZE];
 } WmallDecodeCtx;
 
+/** Get sign of integer (1 for positive, -1 for negative and 0 for zero) */
+#define WMASIGN(x) (((x) > 0) - ((x) < 0))
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
@@ -180,7 +185,13 @@ static av_cold int decode_init(AVCodecContext *avctx)
     unsigned int channel_mask;
     int i, log2_max_num_subframes;
 
+    if (!avctx->block_align) {
+        av_log(avctx, AV_LOG_ERROR, "block_align is not set\n");
+        return AVERROR(EINVAL);
+    }
+
     s->avctx = avctx;
+    ff_llauddsp_init(&s->dsp);
     init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
 
     if (avctx->extradata_size >= 18) {
@@ -190,9 +201,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
         if (s->bits_per_sample == 16)
             avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
         else if (s->bits_per_sample == 24) {
+            av_log(avctx, AV_LOG_WARNING, "Decoding audio at 24 bit-depth\n");
             avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
-            avpriv_report_missing_feature(avctx, "Bit-depth higher than 16");
-            return AVERROR_PATCHWELCOME;
+            avctx->bits_per_raw_sample = 24;
         } else {
             av_log(avctx, AV_LOG_ERROR, "Unknown bit-depth: %"PRIu8"\n",
                    s->bits_per_sample);
@@ -345,11 +356,11 @@ static int decode_tilehdr(WmallDecodeCtx *s)
             if (num_samples[c] == min_channel_len) {
                 if (fixed_channel_layout || channels_for_cur_subframe == 1 ||
                    (min_channel_len == s->samples_per_frame - s->min_samples_per_subframe)) {
-                    contains_subframe[c] = in_use = 1;
+                    contains_subframe[c] = 1;
                 } else {
-                    if (get_bits1(&s->gb))
-                        contains_subframe[c] = in_use = 1;
+                    contains_subframe[c] = get_bits1(&s->gb);
                 }
+                in_use |= contains_subframe[c];
             } else
                 contains_subframe[c] = 0;
         }
@@ -454,6 +465,13 @@ static int decode_cdlms(WmallDecodeCtx *s)
                 s->cdlms[0][0].order = 0;
                 return AVERROR_INVALIDDATA;
             }
+            if(s->cdlms[c][i].order & 8) {
+                static int warned;
+                if(!warned)
+                    avpriv_request_sample(s->avctx, "CDLMS of order %d",
+                                          s->cdlms[c][i].order);
+                warned = 1;
+            }
         }
 
         for (i = 0; i < s->cdlms_ttl[c]; i++)
@@ -471,7 +489,7 @@ static int decode_cdlms(WmallDecodeCtx *s)
                 if ((1 << cbits) < s->cdlms[c][i].scaling + 1)
                     cbits++;
 
-                s->cdlms[c][i].bitsend = get_bits(&s->gb, cbits) + 2;
+                s->cdlms[c][i].bitsend = (cbits ? get_bits(&s->gb, cbits) : 0) + 2;
                 shift_l = 32 - s->cdlms[c][i].bitsend;
                 shift_r = 32 - s->cdlms[c][i].scaling - 2;
                 for (j = 0; j < s->cdlms[c][i].coefsend; j++)
@@ -479,6 +497,10 @@ static int decode_cdlms(WmallDecodeCtx *s)
                         (get_bits(&s->gb, s->cdlms[c][i].bitsend) << shift_l) >> shift_r;
             }
         }
+
+        for (i = 0; i < s->cdlms_ttl[c]; i++)
+            memset(s->cdlms[c][i].coefs + s->cdlms[c][i].order,
+                   0, WMALL_COEFF_PAD_SIZE);
     }
 
     return 0;
@@ -505,9 +527,9 @@ static int decode_channel_residues(WmallDecodeCtx *s, int ch, int tile_size)
 
     if (s->seekable_tile) {
         if (s->do_inter_ch_decorr)
-            s->channel_residues[ch][0] = get_sbits(&s->gb, s->bits_per_sample + 1);
+            s->channel_residues[ch][0] = get_sbits_long(&s->gb, s->bits_per_sample + 1);
         else
-            s->channel_residues[ch][0] = get_sbits(&s->gb, s->bits_per_sample);
+            s->channel_residues[ch][0] = get_sbits_long(&s->gb, s->bits_per_sample);
         i++;
     }
     for (; i < tile_size; i++) {
@@ -525,17 +547,14 @@ static int decode_channel_residues(WmallDecodeCtx *s, int ch, int tile_size)
             residue = quo;
         else {
             rem_bits = av_ceil_log2(ave_mean);
-            rem      = rem_bits ? get_bits_long(&s->gb, rem_bits) : 0;
+            rem      = get_bits_long(&s->gb, rem_bits);
             residue  = (quo << rem_bits) + rem;
         }
 
         s->ave_sum[ch] = residue + s->ave_sum[ch] -
                          (s->ave_sum[ch] >> s->movave_scaling);
 
-        if (residue & 1)
-            residue = -(residue >> 1) - 1;
-        else
-            residue = residue >> 1;
+        residue = (residue >> 1) ^ -(residue & 1);
         s->channel_residues[ch][i] = residue;
     }
 
@@ -612,47 +631,31 @@ static void mclms_update(WmallDecodeCtx *s, int icoef, int *pred)
             for (i = 0; i < order * num_channels; i++)
                 s->mclms_coeffs[i + ich * order * num_channels] +=
                     s->mclms_updates[s->mclms_recent + i];
-            for (j = 0; j < ich; j++) {
-                if (s->channel_residues[j][icoef] > 0)
-                    s->mclms_coeffs_cur[ich * num_channels + j] += 1;
-                else if (s->channel_residues[j][icoef] < 0)
-                    s->mclms_coeffs_cur[ich * num_channels + j] -= 1;
-            }
+            for (j = 0; j < ich; j++)
+                s->mclms_coeffs_cur[ich * num_channels + j] += WMASIGN(s->channel_residues[j][icoef]);
         } else if (pred_error < 0) {
             for (i = 0; i < order * num_channels; i++)
                 s->mclms_coeffs[i + ich * order * num_channels] -=
                     s->mclms_updates[s->mclms_recent + i];
-            for (j = 0; j < ich; j++) {
-                if (s->channel_residues[j][icoef] > 0)
-                    s->mclms_coeffs_cur[ich * num_channels + j] -= 1;
-                else if (s->channel_residues[j][icoef] < 0)
-                    s->mclms_coeffs_cur[ich * num_channels + j] += 1;
-            }
+            for (j = 0; j < ich; j++)
+                s->mclms_coeffs_cur[ich * num_channels + j] -= WMASIGN(s->channel_residues[j][icoef]);
         }
     }
 
     for (ich = num_channels - 1; ich >= 0; ich--) {
         s->mclms_recent--;
-        s->mclms_prevvalues[s->mclms_recent] = s->channel_residues[ich][icoef];
-        if (s->channel_residues[ich][icoef] > range - 1)
-            s->mclms_prevvalues[s->mclms_recent] = range - 1;
-        else if (s->channel_residues[ich][icoef] < -range)
-            s->mclms_prevvalues[s->mclms_recent] = -range;
-
-        s->mclms_updates[s->mclms_recent] = 0;
-        if (s->channel_residues[ich][icoef] > 0)
-            s->mclms_updates[s->mclms_recent] = 1;
-        else if (s->channel_residues[ich][icoef] < 0)
-            s->mclms_updates[s->mclms_recent] = -1;
+        s->mclms_prevvalues[s->mclms_recent] = av_clip(s->channel_residues[ich][icoef],
+            -range, range - 1);
+        s->mclms_updates[s->mclms_recent] = WMASIGN(s->channel_residues[ich][icoef]);
     }
 
     if (s->mclms_recent == 0) {
         memcpy(&s->mclms_prevvalues[order * num_channels],
                s->mclms_prevvalues,
-               2 * order * num_channels);
+               sizeof(int16_t) * order * num_channels);
         memcpy(&s->mclms_updates[order * num_channels],
                s->mclms_updates,
-               2 * order * num_channels);
+               sizeof(int16_t) * order * num_channels);
         s->mclms_recent = num_channels * order;
     }
 }
@@ -688,58 +691,30 @@ static void revert_mclms(WmallDecodeCtx *s, int tile_size)
     }
 }
 
-static int lms_predict(WmallDecodeCtx *s, int ich, int ilms)
-{
-    int pred = 0, icoef;
-    int recent = s->cdlms[ich][ilms].recent;
-
-    for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-        pred += s->cdlms[ich][ilms].coefs[icoef] *
-                s->cdlms[ich][ilms].lms_prevvalues[icoef + recent];
-
-    return pred;
-}
-
-static void lms_update(WmallDecodeCtx *s, int ich, int ilms,
-                       int input, int residue)
+static void lms_update(WmallDecodeCtx *s, int ich, int ilms, int input)
 {
-    int icoef;
     int recent = s->cdlms[ich][ilms].recent;
     int range  = 1 << s->bits_per_sample - 1;
-
-    if (residue < 0) {
-        for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-            s->cdlms[ich][ilms].coefs[icoef] -=
-                s->cdlms[ich][ilms].lms_updates[icoef + recent];
-    } else if (residue > 0) {
-        for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-            s->cdlms[ich][ilms].coefs[icoef] +=
-                s->cdlms[ich][ilms].lms_updates[icoef + recent];
-    }
+    int order  = s->cdlms[ich][ilms].order;
 
     if (recent)
         recent--;
     else {
-        memcpy(&s->cdlms[ich][ilms].lms_prevvalues[s->cdlms[ich][ilms].order],
-               s->cdlms[ich][ilms].lms_prevvalues,
-               2 * s->cdlms[ich][ilms].order);
-        memcpy(&s->cdlms[ich][ilms].lms_updates[s->cdlms[ich][ilms].order],
-               s->cdlms[ich][ilms].lms_updates,
-               2 * s->cdlms[ich][ilms].order);
-        recent = s->cdlms[ich][ilms].order - 1;
+        memcpy(s->cdlms[ich][ilms].lms_prevvalues + order,
+               s->cdlms[ich][ilms].lms_prevvalues, sizeof(*s->cdlms[ich][ilms].lms_prevvalues) * order);
+        memcpy(s->cdlms[ich][ilms].lms_updates + order,
+               s->cdlms[ich][ilms].lms_updates, sizeof(*s->cdlms[ich][ilms].lms_updates) * order);
+        recent = order - 1;
     }
 
     s->cdlms[ich][ilms].lms_prevvalues[recent] = av_clip(input, -range, range - 1);
-    if (!input)
-        s->cdlms[ich][ilms].lms_updates[recent] = 0;
-    else if (input < 0)
-        s->cdlms[ich][ilms].lms_updates[recent] = -s->update_speed[ich];
-    else
-        s->cdlms[ich][ilms].lms_updates[recent] = s->update_speed[ich];
+    s->cdlms[ich][ilms].lms_updates[recent] = WMASIGN(input) * s->update_speed[ich];
 
-    s->cdlms[ich][ilms].lms_updates[recent + (s->cdlms[ich][ilms].order >> 4)] >>= 2;
-    s->cdlms[ich][ilms].lms_updates[recent + (s->cdlms[ich][ilms].order >> 3)] >>= 1;
+    s->cdlms[ich][ilms].lms_updates[recent + (order >> 4)] >>= 2;
+    s->cdlms[ich][ilms].lms_updates[recent + (order >> 3)] >>= 1;
     s->cdlms[ich][ilms].recent = recent;
+    memset(s->cdlms[ich][ilms].lms_updates + recent + order, 0,
+           sizeof(s->cdlms[ich][ilms].lms_updates) - 2*(recent+order));
 }
 
 static void use_high_update_speed(WmallDecodeCtx *s, int ich)
@@ -787,12 +762,20 @@ static void revert_cdlms(WmallDecodeCtx *s, int ch,
         for (icoef = coef_begin; icoef < coef_end; icoef++) {
             pred = 1 << (s->cdlms[ch][ilms].scaling - 1);
             residue = s->channel_residues[ch][icoef];
-            pred += lms_predict(s, ch, ilms);
+            pred += s->dsp.scalarproduct_and_madd_int16(s->cdlms[ch][ilms].coefs,
+                                                        s->cdlms[ch][ilms].lms_prevvalues
+                                                            + s->cdlms[ch][ilms].recent,
+                                                        s->cdlms[ch][ilms].lms_updates
+                                                            + s->cdlms[ch][ilms].recent,
+                                                        FFALIGN(s->cdlms[ch][ilms].order,
+                                                                WMALL_COEFF_PAD_SIZE),
+                                                        WMASIGN(residue));
             input = residue + (pred >> s->cdlms[ch][ilms].scaling);
-            lms_update(s, ch, ilms, input, residue);
+            lms_update(s, ch, ilms, input);
             s->channel_residues[ch][icoef] = input;
         }
     }
+    emms_c();
 }
 
 static void revert_inter_ch_decorr(WmallDecodeCtx *s, int tile_size)
@@ -811,7 +794,7 @@ static void revert_inter_ch_decorr(WmallDecodeCtx *s, int tile_size)
 static void revert_acfilter(WmallDecodeCtx *s, int tile_size)
 {
     int ich, pred, i, j;
-    int64_t *filter_coeffs = s->acfilter_coeffs;
+    int16_t *filter_coeffs = s->acfilter_coeffs;
     int scaling            = s->acfilter_scaling;
     int order              = s->acfilter_order;
 
@@ -955,7 +938,7 @@ static int decode_subframe(WmallDecodeCtx *s)
                 bits * s->num_channels * subframe_len, get_bits_count(&s->gb));
         for (i = 0; i < s->num_channels; i++)
             for (j = 0; j < subframe_len; j++)
-                s->channel_coeffs[i][j] = get_sbits(&s->gb, bits);
+                s->channel_coeffs[i][j] = get_sbits_long(&s->gb, bits);
     } else {
         for (i = 0; i < s->num_channels; i++)
             if (s->is_channel_coded[i]) {
@@ -991,7 +974,7 @@ static int decode_subframe(WmallDecodeCtx *s)
             if (s->bits_per_sample == 16) {
                 *s->samples_16[c]++ = (int16_t) s->channel_residues[c][j] << padding_zeroes;
             } else {
-                *s->samples_32[c]++ = s->channel_residues[c][j] << padding_zeroes;
+                *s->samples_32[c]++ = s->channel_residues[c][j] << (padding_zeroes + 8);
             }
         }
     }
@@ -1022,9 +1005,8 @@ static int decode_frame(WmallDecodeCtx *s)
     s->frame->nb_samples = s->samples_per_frame;
     if ((ret = ff_get_buffer(s->avctx, s->frame, 0)) < 0) {
         /* return an error if no frame could be decoded at all */
-        av_log(s->avctx, AV_LOG_ERROR,
-               "not enough space for the output samples\n");
         s->packet_loss = 1;
+        s->frame->nb_samples = 0;
         return ret;
     }
     for (i = 0; i < s->num_channels; i++) {
@@ -1037,9 +1019,10 @@ static int decode_frame(WmallDecodeCtx *s)
         len = get_bits(gb, s->log2_frame_size);
 
     /* decode tile information */
-    if (decode_tilehdr(s)) {
+    if ((ret = decode_tilehdr(s))) {
         s->packet_loss = 1;
-        return 0;
+        av_frame_unref(s->frame);
+        return ret;
     }
 
     /* read drc info */
@@ -1074,16 +1057,18 @@ static int decode_frame(WmallDecodeCtx *s)
 
     /* decode all subframes */
     while (!s->parsed_all_subframes) {
+        int decoded_samples = s->channel[0].decoded_samples;
         if (decode_subframe(s) < 0) {
             s->packet_loss = 1;
+            if (s->frame->nb_samples)
+                s->frame->nb_samples = decoded_samples;
             return 0;
         }
     }
 
     ff_dlog(s->avctx, "Frame done\n");
 
-    if (s->skip_frame)
-        s->skip_frame = 0;
+    s->skip_frame = 0;
 
     if (s->len_prefix) {
         if (len != (get_bits_count(gb) - s->frame_offset) + 2) {
@@ -1182,9 +1167,13 @@ static int decode_packet(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     if (s->packet_done || s->packet_loss) {
         s->packet_done = 0;
 
-        /* sanity check for the buffer length */
-        if (buf_size < avctx->block_align)
+        if (!buf_size)
             return 0;
+        /* sanity check for the buffer length */
+        if (buf_size < avctx->block_align) {
+            av_log(avctx, AV_LOG_ERROR, "buf size %d invalid\n", buf_size);
+            return AVERROR_INVALIDDATA;
+        }
 
         s->next_packet_start = buf_size - avctx->block_align;
         buf_size             = avctx->block_align;
diff --git a/libavcodec/wmaprodata.h b/libavcodec/wmaprodata.h
index f8a52bf4f4..53824799d5 100644
--- a/libavcodec/wmaprodata.h
+++ b/libavcodec/wmaprodata.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Baptiste Coudurier, Benjamin Larsson, Ulion
  * Copyright (c) 2008 - 2009 Sascha Sommer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmaprodec.c b/libavcodec/wmaprodec.c
index 46710c54ae..19dc335b32 100644
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Baptiste Coudurier, Benjamin Larsson, Ulion
  * Copyright (c) 2008 - 2011 Sascha Sommer, Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -171,13 +171,13 @@ typedef struct WMAProChannelGrp {
 typedef struct WMAProDecodeCtx {
     /* generic decoder variables */
     AVCodecContext*  avctx;                         ///< codec context for av_log
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     uint8_t          frame_data[MAX_FRAMESIZE +
                       AV_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
     PutBitContext    pb;                            ///< context for filling the frame_data buffer
     FFTContext       mdct_ctx[WMAPRO_BLOCK_SIZES];  ///< MDCT context per block size
     DECLARE_ALIGNED(32, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
-    float*           windows[WMAPRO_BLOCK_SIZES];   ///< windows for the different block sizes
+    const float*     windows[WMAPRO_BLOCK_SIZES];   ///< windows for the different block sizes
 
     /* frame size dependent frame information (set during initialization) */
     uint32_t         decode_flags;                  ///< used compression features
@@ -260,6 +260,8 @@ static av_cold int decode_end(AVCodecContext *avctx)
     WMAProDecodeCtx *s = avctx->priv_data;
     int i;
 
+    av_freep(&s->fdsp);
+
     for (i = 0; i < WMAPRO_BLOCK_SIZES; i++)
         ff_mdct_end(&s->mdct_ctx[i]);
 
@@ -286,7 +288,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
     }
 
     s->avctx = avctx;
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
 
     init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
 
@@ -308,6 +312,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     /** generic init */
     s->log2_frame_size = av_log2(avctx->block_align) + 4;
+    if (s->log2_frame_size > 25) {
+        avpriv_request_sample(avctx, "Large block align");
+        return AVERROR_PATCHWELCOME;
+    }
 
     /** frame info */
     s->skip_frame  = 1; /* skip first frame */
@@ -340,8 +348,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
     }
 
     if (s->min_samples_per_subframe < WMAPRO_BLOCK_MIN_SIZE) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid minimum block size %"PRId8"\n",
-               s->max_num_subframes);
+        av_log(avctx, AV_LOG_ERROR, "min_samples_per_subframe of %d too small\n",
+               s->min_samples_per_subframe);
         return AVERROR_INVALIDDATA;
     }
 
@@ -418,9 +426,16 @@ static av_cold int decode_init(AVCodecContext *avctx)
             offset &= ~3;
             if (offset > s->sfb_offsets[i][band - 1])
                 s->sfb_offsets[i][band++] = offset;
+
+            if (offset >= subframe_len)
+                break;
         }
         s->sfb_offsets[i][band - 1] = subframe_len;
         s->num_sfb[i]               = band - 1;
+        if (s->num_sfb[i] <= 0) {
+            av_log(avctx, AV_LOG_ERROR, "num_sfb invalid\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
 
 
@@ -437,9 +452,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
                            + s->sfb_offsets[i][b + 1] - 1) << i) >> 1;
             for (x = 0; x < num_possible_block_sizes; x++) {
                 int v = 0;
-                while (s->sfb_offsets[x][v + 1] << x < offset)
-                    if (++v >= MAX_BANDS)
-                        return AVERROR_INVALIDDATA;
+                while (s->sfb_offsets[x][v + 1] << x < offset) {
+                    v++;
+                    av_assert0(v < MAX_BANDS);
+                }
                 s->sf_offsets[i][x][b] = v;
             }
         }
@@ -493,6 +509,9 @@ static int decode_subframe_length(WMAProDecodeCtx *s, int offset)
     if (offset == s->samples_per_frame - s->min_samples_per_subframe)
         return s->min_samples_per_subframe;
 
+    if (get_bits_left(&s->gb) < 1)
+        return AVERROR_INVALIDDATA;
+
     /** 1 bit indicates if the subframe is of maximum length */
     if (s->max_subframe_len_bit) {
         if (get_bits1(&s->gb))
@@ -671,7 +690,7 @@ static void decode_decorrelation_matrix(WMAProDecodeCtx *s,
 /**
  *@brief Decode channel transformation parameters
  *@param s codec context
- *@return 0 in case of success, < 0 in case of bitstream errors
+ *@return >= 0 in case of success, < 0 in case of bitstream errors
  */
 static int decode_channel_transform(WMAProDecodeCtx* s)
 {
@@ -1022,10 +1041,10 @@ static void inverse_channel_transform(WMAProDecodeCtx *s)
                     }
                 } else if (s->avctx->channels == 2) {
                     int len = FFMIN(sfb[1], s->subframe_len) - sfb[0];
-                    s->fdsp.vector_fmul_scalar(ch_data[0] + sfb[0],
+                    s->fdsp->vector_fmul_scalar(ch_data[0] + sfb[0],
                                                ch_data[0] + sfb[0],
                                                181.0 / 128, len);
-                    s->fdsp.vector_fmul_scalar(ch_data[1] + sfb[0],
+                    s->fdsp->vector_fmul_scalar(ch_data[1] + sfb[0],
                                                ch_data[1] + sfb[0],
                                                181.0 / 128, len);
                 }
@@ -1043,7 +1062,7 @@ static void wmapro_window(WMAProDecodeCtx *s)
     int i;
     for (i = 0; i < s->channels_for_cur_subframe; i++) {
         int c = s->channel_indexes_for_cur_subframe[i];
-        float* window;
+        const float* window;
         int winlen = s->channel[c].prev_block_len;
         float* start = s->channel[c].coeffs - (winlen >> 1);
 
@@ -1056,7 +1075,7 @@ static void wmapro_window(WMAProDecodeCtx *s)
 
         winlen >>= 1;
 
-        s->fdsp.vector_fmul_window(start, start, start + winlen,
+        s->fdsp->vector_fmul_window(start, start, start + winlen,
                                    window, winlen);
 
         s->channel[c].prev_block_len = s->subframe_len;
@@ -1145,7 +1164,7 @@ static int decode_subframe(WMAProDecodeCtx *s)
         int num_fill_bits;
         if (!(num_fill_bits = get_bits(&s->gb, 2))) {
             int len = get_bits(&s->gb, 4);
-            num_fill_bits = get_bits(&s->gb, len) + 1;
+            num_fill_bits = (len ? get_bits(&s->gb, len) : 0) + 1;
         }
 
         if (num_fill_bits >= 0) {
@@ -1175,6 +1194,7 @@ static int decode_subframe(WMAProDecodeCtx *s)
             transmit_coeffs = 1;
     }
 
+    av_assert0(s->subframe_len <= WMAPRO_BLOCK_MAX_SIZE);
     if (transmit_coeffs) {
         int step;
         int quant_step = 90 * s->bits_per_sample >> 4;
@@ -1185,10 +1205,11 @@ static int decode_subframe(WMAProDecodeCtx *s)
             for (i = 0; i < s->channels_for_cur_subframe; i++) {
                 int c = s->channel_indexes_for_cur_subframe[i];
                 int num_vec_coeffs = get_bits(&s->gb, num_bits) << 2;
-                if (num_vec_coeffs + offset > FF_ARRAY_ELEMS(s->channel[c].out)) {
+                if (num_vec_coeffs > s->subframe_len) {
                     av_log(s->avctx, AV_LOG_ERROR, "num_vec_coeffs %d is too large\n", num_vec_coeffs);
                     return AVERROR_INVALIDDATA;
                 }
+                av_assert0(num_vec_coeffs + offset <= FF_ARRAY_ELEMS(s->channel[c].out));
                 s->channel[c].num_vec_coeffs = num_vec_coeffs;
             }
         } else {
@@ -1274,7 +1295,7 @@ static int decode_subframe(WMAProDecodeCtx *s)
                             s->channel[c].scale_factor_step;
                 const float quant = pow(10.0, exp / 20.0);
                 int start = s->cur_sfb_offsets[b];
-                s->fdsp.vector_fmul_scalar(s->tmp + start,
+                s->fdsp->vector_fmul_scalar(s->tmp + start,
                                            s->channel[c].coeffs + start,
                                            quant, end - start);
             }
@@ -1381,7 +1402,6 @@ static int decode_frame(WMAProDecodeCtx *s, AVFrame *frame, int *got_frame_ptr)
     /* get output buffer */
     frame->nb_samples = s->samples_per_frame;
     if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
         s->packet_loss = 1;
         return 0;
     }
@@ -1455,7 +1475,7 @@ static void save_bits(WMAProDecodeCtx *s, GetBitContext* gb, int len,
     int buflen;
 
     /** when the frame data does not need to be concatenated, the input buffer
-        is resetted and additional bits from the previous frame are copyed
+        is reset and additional bits from the previous frame are copied
         and skipped later so that a fast byte copy is possible */
 
     if (!append) {
@@ -1464,7 +1484,7 @@ static void save_bits(WMAProDecodeCtx *s, GetBitContext* gb, int len,
         init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
     }
 
-    buflen = (s->num_saved_bits + len + 8) >> 3;
+    buflen = (put_bits_count(&s->pb) + len + 8) >> 3;
 
     if (len <= 0 || buflen > MAX_FRAMESIZE) {
         avpriv_request_sample(s->avctx, "Too small input buffer");
@@ -1472,13 +1492,7 @@ static void save_bits(WMAProDecodeCtx *s, GetBitContext* gb, int len,
         return;
     }
 
-    if (len > put_bits_left(&s->pb)) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "Cannot append %d bits, only %d bits available.\n",
-               len, put_bits_left(&s->pb));
-        s->packet_loss = 1;
-        return;
-    }
+    av_assert0(len <= put_bits_left(&s->pb));
 
     s->num_saved_bits += len;
     if (!append) {
@@ -1593,7 +1607,8 @@ static int decode_packet(AVCodecContext *avctx, void *data,
             (frame_size = show_bits(gb, s->log2_frame_size)) &&
             frame_size <= remaining_bits(s, gb)) {
             save_bits(s, gb, frame_size, 0);
-            s->packet_done = !decode_frame(s, data, got_frame_ptr);
+            if (!s->packet_loss)
+                s->packet_done = !decode_frame(s, data, got_frame_ptr);
         } else if (!s->len_prefix
                    && s->num_saved_bits > get_bits_count(&s->gb)) {
             /** when the frames do not have a length prefix, we don't know
diff --git a/libavcodec/wmavoice.c b/libavcodec/wmavoice.c
index 92261b9a77..029dfdd8bf 100644
--- a/libavcodec/wmavoice.c
+++ b/libavcodec/wmavoice.c
@@ -2,20 +2,20 @@
  * Windows Media Audio Voice decoder.
  * Copyright (c) 2009 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,8 +25,6 @@
  * @author Ronald S. Bultje <rsbultje@gmail.com>
  */
 
-#define UNCHECKED_BITSTREAM_READER 1
-
 #include <math.h>
 
 #include "libavutil/channel_layout.h"
@@ -520,7 +518,7 @@ static int kalman_smoothen(WMAVoiceContext *s, int pitch,
     float optimal_gain = 0, dot;
     const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
                 *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
-                *best_hist_ptr;
+                *best_hist_ptr = NULL;
 
     /* find best fitting point in history */
     do {
@@ -780,7 +778,7 @@ static void postfilter(WMAVoiceContext *s, const float *synth,
           *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
           *synth_filter_in = zero_exc_pf;
 
-    assert(size <= MAX_FRAMESIZE / 2);
+    av_assert0(size <= MAX_FRAMESIZE / 2);
 
     /* generate excitation from input signal */
     ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
@@ -1249,7 +1247,7 @@ static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
     float gain;
     int n, r_idx;
 
-    assert(size <= MAX_FRAMESIZE);
+    av_assert0(size <= MAX_FRAMESIZE);
 
     /* Set the offset from which we start reading wmavoice_std_codebook */
     if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
@@ -1285,7 +1283,7 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
     int n, idx, gain_weight;
     AMRFixed fcb;
 
-    assert(size <= MAX_FRAMESIZE / 2);
+    av_assert0(size <= MAX_FRAMESIZE / 2);
     memset(pulses, 0, sizeof(*pulses) * size);
 
     fcb.pitch_lag      = block_pitch_sh2 >> 2;
@@ -1456,8 +1454,8 @@ static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
                        float *excitation, float *synth)
 {
     WMAVoiceContext *s = ctx->priv_data;
-    int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
-    int pitch[MAX_BLOCKS], last_block_pitch;
+    int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
+    int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
 
     /* Parse frame type ("frame header"), see frame_descs */
     int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
@@ -1674,7 +1672,7 @@ static int check_bits_for_superframe(GetBitContext *orig_gb,
     /* initialize a copy */
     init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
     skip_bits_long(gb, get_bits_count(orig_gb));
-    assert(get_bits_left(gb) == get_bits_left(orig_gb));
+    av_assert1(get_bits_left(gb) == get_bits_left(orig_gb));
 
     /* superframe header */
     if (get_bits_left(gb) < 14)
@@ -1820,10 +1818,8 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
 
     /* get output buffer */
     frame->nb_samples = 480;
-    if ((res = ff_get_buffer(ctx, frame, 0)) < 0) {
-        av_log(ctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
         return res;
-    }
     frame->nb_samples = n_samples;
     samples = (float *)frame->data[0];
 
@@ -1955,7 +1951,7 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
     int size, res, pos;
 
     /* Packets are sometimes a multiple of ctx->block_align, with a packet
-     * header at each ctx->block_align bytes. However, Libav's ASF demuxer
+     * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
      * feeds us ASF packets, which may concatenate multiple "codec" packets
      * in a single "muxer" packet, so we artificially emulate that by
      * capping the packet size at ctx->block_align. */
@@ -1986,7 +1982,14 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
                     *got_frame_ptr) {
                     cnt += s->spillover_nbits;
                     s->skip_bits_next = cnt & 7;
-                    return cnt >> 3;
+                    res = cnt >> 3;
+                    if (res > avpkt->size) {
+                        av_log(ctx, AV_LOG_ERROR,
+                               "Trying to skip %d bytes in packet of size %d\n",
+                               res, avpkt->size);
+                        return AVERROR_INVALIDDATA;
+                    }
+                    return res;
                 } else
                     skip_bits_long (gb, s->spillover_nbits - cnt +
                                     get_bits_count(gb)); // resync
@@ -2005,12 +2008,19 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
     } else if (*got_frame_ptr) {
         int cnt = get_bits_count(gb);
         s->skip_bits_next = cnt & 7;
-        return cnt >> 3;
+        res = cnt >> 3;
+        if (res > avpkt->size) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "Trying to skip %d bytes in packet of size %d\n",
+                   res, avpkt->size);
+            return AVERROR_INVALIDDATA;
+        }
+        return res;
     } else if ((s->sframe_cache_size = pos) > 0) {
         /* rewind bit reader to start of last (incomplete) superframe... */
         init_get_bits(gb, avpkt->data, size << 3);
         skip_bits_long(gb, (size << 3) - pos);
-        assert(get_bits_left(gb) == pos);
+        av_assert1(get_bits_left(gb) == pos);
 
         /* ...and cache it for spillover in next packet */
         init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
diff --git a/libavcodec/wmavoice_data.h b/libavcodec/wmavoice_data.h
index 7f14fb8350..cbf65b043e 100644
--- a/libavcodec/wmavoice_data.h
+++ b/libavcodec/wmavoice_data.h
@@ -2,20 +2,20 @@
  * Windows Media Voice (WMAVoice) tables.
  * Copyright (c) 2009 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmv2.c b/libavcodec/wmv2.c
index 3974254fab..9c3acbcd2d 100644
--- a/libavcodec/wmv2.c
+++ b/libavcodec/wmv2.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -103,8 +103,8 @@ void ff_mspel_motion(MpegEncContext *s, uint8_t *dest_y,
 {
     Wmv2Context *const w = (Wmv2Context *) s;
     uint8_t *ptr;
-    int dxy, offset, mx, my, src_x, src_y, v_edge_pos;
-    ptrdiff_t linesize, uvlinesize;
+    int dxy, mx, my, src_x, src_y, v_edge_pos;
+    ptrdiff_t offset, linesize, uvlinesize;
     int emu = 0;
 
     dxy   = ((motion_y & 1) << 1) | (motion_x & 1);
@@ -144,21 +144,13 @@ void ff_mspel_motion(MpegEncContext *s, uint8_t *dest_y,
     if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
 
-    if (s->out_format == FMT_H263) {
-        dxy = 0;
-        if ((motion_x & 3) != 0)
-            dxy |= 1;
-        if ((motion_y & 3) != 0)
-            dxy |= 2;
-        mx = motion_x >> 2;
-        my = motion_y >> 2;
-    } else {
-        mx   = motion_x / 2;
-        my   = motion_y / 2;
-        dxy  = ((my & 1) << 1) | (mx & 1);
-        mx >>= 1;
-        my >>= 1;
-    }
+    dxy = 0;
+    if ((motion_x & 3) != 0)
+        dxy |= 1;
+    if ((motion_y & 3) != 0)
+        dxy |= 2;
+    mx = motion_x >> 2;
+    my = motion_y >> 2;
 
     src_x = s->mb_x * 8 + mx;
     src_y = s->mb_y * 8 + my;
diff --git a/libavcodec/wmv2.h b/libavcodec/wmv2.h
index b77dd98cd5..31593b8c38 100644
--- a/libavcodec/wmv2.h
+++ b/libavcodec/wmv2.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,4 +70,16 @@ void ff_mspel_motion(MpegEncContext *s,
                      uint8_t **ref_picture, op_pixels_func (*pix_op)[4],
                      int motion_x, int motion_y, int h);
 
+
+static av_always_inline int wmv2_get_cbp_table_index(MpegEncContext *s, int cbp_index)
+{
+    static const uint8_t map[3][3] = {
+        { 0, 2, 1 },
+        { 1, 0, 2 },
+        { 2, 1, 0 },
+    };
+
+    return map[(s->qscale > 10) + (s->qscale > 20)][cbp_index];
+}
+
 #endif /* AVCODEC_WMV2_H */
diff --git a/libavcodec/wmv2dec.c b/libavcodec/wmv2dec.c
index c71fd3dfea..99c95d397a 100644
--- a/libavcodec/wmv2dec.c
+++ b/libavcodec/wmv2dec.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -108,7 +108,7 @@ static int decode_ext_header(Wmv2Context *w)
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
         av_log(s->avctx, AV_LOG_DEBUG,
-               "fps:%d, br:%d, qpbit:%d, abt_flag:%d, j_type_bit:%d, "
+               "fps:%d, br:%"PRId64", qpbit:%d, abt_flag:%d, j_type_bit:%d, "
                "tl_mv_flag:%d, mbrl_bit:%d, code:%d, loop_filter:%d, "
                "slices:%d\n",
                fps, s->bit_rate, w->mspel_bit, w->abt_flag, w->j_type_bit,
@@ -174,16 +174,7 @@ int ff_wmv2_decode_secondary_picture_header(MpegEncContext *s)
 
         parse_mb_skip(w);
         cbp_index = decode012(&s->gb);
-        if (s->qscale <= 10) {
-            int map[3]         = { 0, 2, 1 };
-            w->cbp_table_index = map[cbp_index];
-        } else if (s->qscale <= 20) {
-            int map[3]         = { 1, 0, 2 };
-            w->cbp_table_index = map[cbp_index];
-        } else {
-            int map[3]         = {2,1,0};
-            w->cbp_table_index = map[cbp_index];
-        }
+        w->cbp_table_index = wmv2_get_cbp_table_index(s, cbp_index);
 
         if (w->mspel_bit)
             s->mspel = get_bits1(&s->gb);
@@ -462,6 +453,10 @@ static av_cold int wmv2_decode_init(AVCodecContext *avctx)
     Wmv2Context *const w = avctx->priv_data;
     int ret;
 
+#if FF_API_EMU_EDGE
+    avctx->flags |= CODEC_FLAG_EMU_EDGE;
+#endif
+
     if ((ret = ff_msmpeg4_decode_init(avctx)) < 0)
         return ret;
 
diff --git a/libavcodec/wmv2dsp.c b/libavcodec/wmv2dsp.c
index 2e3a3ff552..40e0bef0da 100644
--- a/libavcodec/wmv2dsp.c
+++ b/libavcodec/wmv2dsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmv2dsp.h b/libavcodec/wmv2dsp.h
index f2f258ed34..0bf9489b07 100644
--- a/libavcodec/wmv2dsp.h
+++ b/libavcodec/wmv2dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmv2enc.c b/libavcodec/wmv2enc.c
index 06ad2b65cf..3ed8b5fb53 100644
--- a/libavcodec/wmv2enc.c
+++ b/libavcodec/wmv2enc.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,7 +62,7 @@ static av_cold int wmv2_encode_init(AVCodecContext *avctx)
     ff_wmv2_common_init(w);
 
     avctx->extradata_size = 4;
-    avctx->extradata      = av_mallocz(avctx->extradata_size + 10);
+    avctx->extradata      = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!avctx->extradata)
         return AVERROR(ENOMEM);
 
@@ -88,10 +88,10 @@ int ff_wmv2_encode_picture_header(MpegEncContext *s, int picture_number)
     w->abt_type        = 0;
     w->j_type          = 0;
 
-    assert(s->flipflop_rounding);
+    av_assert0(s->flipflop_rounding);
 
     if (s->pict_type == AV_PICTURE_TYPE_I) {
-        assert(s->no_rounding == 1);
+        av_assert0(s->no_rounding == 1);
         if (w->j_type_bit)
             put_bits(&s->pb, 1, w->j_type);
 
@@ -112,16 +112,7 @@ int ff_wmv2_encode_picture_header(MpegEncContext *s, int picture_number)
         put_bits(&s->pb, 2, SKIP_TYPE_NONE);
 
         ff_msmpeg4_code012(&s->pb, cbp_index = 0);
-        if (s->qscale <= 10) {
-            int map[3]         = { 0, 2, 1 };
-            w->cbp_table_index = map[cbp_index];
-        } else if (s->qscale <= 20) {
-            int map[3]         = { 1, 0, 2 };
-            w->cbp_table_index = map[cbp_index];
-        } else {
-            int map[3]         = { 2, 1, 0 };
-            w->cbp_table_index = map[cbp_index];
-        }
+        w->cbp_table_index = wmv2_get_cbp_table_index(s, cbp_index);
 
         if (w->mspel_bit)
             put_bits(&s->pb, 1, s->mspel);
@@ -174,10 +165,12 @@ void ff_wmv2_encode_mb(MpegEncContext *s, int16_t block[6][64],
                  ff_wmv2_inter_table[w->cbp_table_index][cbp + 64][1],
                  ff_wmv2_inter_table[w->cbp_table_index][cbp + 64][0]);
 
+        s->misc_bits += get_bits_diff(s);
         /* motion vector */
         ff_h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
         ff_msmpeg4_encode_motion(s, motion_x - pred_x,
                                  motion_y - pred_y);
+        s->mv_bits += get_bits_diff(s);
     } else {
         /* compute cbp */
         cbp       = 0;
@@ -210,10 +203,15 @@ void ff_wmv2_encode_mb(MpegEncContext *s, int16_t block[6][64],
                      ff_table_inter_intra[s->h263_aic_dir][1],
                      ff_table_inter_intra[s->h263_aic_dir][0]);
         }
+        s->misc_bits += get_bits_diff(s);
     }
 
     for (i = 0; i < 6; i++)
         ff_msmpeg4_encode_block(s, block[i], i);
+    if (s->mb_intra)
+        s->i_tex_bits += get_bits_diff(s);
+    else
+        s->p_tex_bits += get_bits_diff(s);
 }
 
 static const AVClass wmv2_class = {
diff --git a/libavcodec/wnv1.c b/libavcodec/wnv1.c
index d0304c97ad..9ff99b2f98 100644
--- a/libavcodec/wnv1.c
+++ b/libavcodec/wnv1.c
@@ -2,20 +2,20 @@
  * Winnov WNV1 codec
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,8 +31,6 @@
 
 
 typedef struct WNV1Context {
-    AVCodecContext *avctx;
-
     int shift;
     GetBitContext gb;
 } WNV1Context;
@@ -70,8 +68,8 @@ static int decode_frame(AVCodecContext *avctx,
     int prev_y = 0, prev_u = 0, prev_v = 0;
     uint8_t *rbuf;
 
-    if (buf_size < 8) {
-        av_log(avctx, AV_LOG_ERROR, "Packet is too short\n");
+    if (buf_size <= 8) {
+        av_log(avctx, AV_LOG_ERROR, "Packet size %d is too small\n", buf_size);
         return AVERROR_INVALIDDATA;
     }
 
@@ -80,9 +78,9 @@ static int decode_frame(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer\n");
         return AVERROR(ENOMEM);
     }
+    memset(rbuf + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
 
     if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
         av_free(rbuf);
         return ret;
     }
@@ -90,7 +88,9 @@ static int decode_frame(AVCodecContext *avctx,
 
     for (i = 8; i < buf_size; i++)
         rbuf[i] = ff_reverse[buf[i]];
-    init_get_bits(&l->gb, rbuf + 8, (buf_size - 8) * 8);
+
+    if ((ret = init_get_bits8(&l->gb, rbuf + 8, buf_size - 8)) < 0)
+        return ret;
 
     if (buf[2] >> 4 == 6)
         l->shift = 2;
@@ -134,10 +134,8 @@ static int decode_frame(AVCodecContext *avctx,
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
-    WNV1Context * const l = avctx->priv_data;
     static VLC_TYPE code_table[1 << CODE_VLC_BITS][2];
 
-    l->avctx       = avctx;
     avctx->pix_fmt = AV_PIX_FMT_YUV422P;
 
     code_vlc.table           = code_table;
diff --git a/libavcodec/ws-snd1.c b/libavcodec/ws-snd1.c
index 11b7289185..0f005807ae 100644
--- a/libavcodec/ws-snd1.c
+++ b/libavcodec/ws-snd1.c
@@ -2,20 +2,20 @@
  * Westwood SNDx codecs
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -76,15 +76,13 @@ static int ws_snd_decode_frame(AVCodecContext *avctx, void *data,
 
     if (in_size > buf_size) {
         av_log(avctx, AV_LOG_ERROR, "Frame data is larger than input buffer\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     /* get output buffer */
     frame->nb_samples = out_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples     = frame->data[0];
     samples_end = samples + out_size;
 
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 1cf47a8567..5ff3a77e37 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -3,11 +3,13 @@ OBJS                                   += x86/constants.o               \
 # subsystems
 OBJS-$(CONFIG_AC3DSP)                  += x86/ac3dsp_init.o
 OBJS-$(CONFIG_AUDIODSP)                += x86/audiodsp_init.o
-OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp.o
+OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp_init.o
 OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
 OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
 OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
+OBJS-$(CONFIG_FLAC_DECODER)            += x86/flacdsp_init.o
+OBJS-$(CONFIG_FLAC_ENCODER)            += x86/flacdsp_init.o
 OBJS-$(CONFIG_FMTCONVERT)              += x86/fmtconvert_init.o
 OBJS-$(CONFIG_H263DSP)                 += x86/h263dsp_init.o
 OBJS-$(CONFIG_H264CHROMA)              += x86/h264chroma_init.o
@@ -15,6 +17,8 @@ OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
 OBJS-$(CONFIG_H264PRED)                += x86/h264_intrapred_init.o
 OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
+OBJS-$(CONFIG_LLAUDDSP)                += x86/lossless_audiodsp_init.o
+OBJS-$(CONFIG_LLVIDDSP)                += x86/lossless_videodsp_init.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
 OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_mmx.o
 OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
@@ -34,48 +38,62 @@ OBJS-$(CONFIG_VP8DSP)                  += x86/vp8dsp_init.o
 OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 
 # decoders/encoders
-OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
-OBJS-$(CONFIG_APE_DECODER)             += x86/apedsp_init.o
+OBJS-$(CONFIG_AAC_DECODER)             += x86/aacpsdsp_init.o          \
+                                          x86/sbrdsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_DECODER)      += x86/g722dsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_ENCODER)      += x86/g722dsp_init.o
+OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
-OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp.o
+OBJS-$(CONFIG_JPEG2000_DECODER)        += x86/jpeg2000dsp_init.o
+OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp_init.o
 OBJS-$(CONFIG_MPEG4_DECODER)           += x86/xvididct_init.o
 OBJS-$(CONFIG_PNG_DECODER)             += x86/pngdsp_init.o
 OBJS-$(CONFIG_PRORES_DECODER)          += x86/proresdsp_init.o
+OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += x86/proresdsp_init.o
 OBJS-$(CONFIG_RV40_DECODER)            += x86/rv40dsp_init.o
-OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc.o
-OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp.o
+OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc_init.o
+OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp_init.o
+OBJS-$(CONFIG_TTA_DECODER)             += x86/ttadsp_init.o
+OBJS-$(CONFIG_V210_DECODER)            += x86/v210-init.o
 OBJS-$(CONFIG_V210_ENCODER)            += x86/v210enc_init.o
 OBJS-$(CONFIG_VC1_DECODER)             += x86/vc1dsp_init.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
 OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
-OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o
+OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o            \
+                                          x86/vp9dsp_init_10bpp.o      \
+                                          x86/vp9dsp_init_12bpp.o      \
+                                          x86/vp9dsp_init_16bpp.o
+OBJS-$(CONFIG_WEBP_DECODER)            += x86/vp8dsp_init.o
 
 
 # GCC inline assembly optimizations
 # subsystems
-MMX-OBJS-$(CONFIG_AUDIODSP)            += x86/audiodsp_mmx.o
-MMX-OBJS-$(CONFIG_HPELDSP)             += x86/fpel_mmx.o                \
-                                          x86/hpeldsp_mmx.o
+MMX-OBJS-$(CONFIG_DIRAC_DECODER)       += x86/dirac_dwt.o
 MMX-OBJS-$(CONFIG_FDCTDSP)             += x86/fdct.o
-MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/idctdsp_mmx.o             \
-                                          x86/simple_idct.o
-MMX-OBJS-$(CONFIG_QPELDSP)             += x86/fpel_mmx.o
+MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/simple_idct.o
 
 # decoders/encoders
-MMX-OBJS-$(CONFIG_MPEG4_DECODER)       += x86/xvididct_mmx.o            \
-                                          x86/xvididct_sse2.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp.o
+MMX-OBJS-$(CONFIG_SNOW_ENCODER)        += x86/snowdsp.o
 MMX-OBJS-$(CONFIG_VC1_DECODER)         += x86/vc1dsp_mmx.o
 
 # subsystems
 YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o
 YASM-OBJS-$(CONFIG_AUDIODSP)           += x86/audiodsp.o
+YASM-OBJS-$(CONFIG_BLOCKDSP)           += x86/blockdsp.o
 YASM-OBJS-$(CONFIG_BSWAPDSP)           += x86/bswapdsp.o
 YASM-OBJS-$(CONFIG_DCT)                += x86/dct32.o
+YASM-OBJS-$(CONFIG_DIRAC_DECODER)      += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\
+                                          x86/dwt_yasm.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft.o
+YASM-OBJS-$(CONFIG_FLAC_DECODER)       += x86/flacdsp.o
+ifdef CONFIG_GPL
+YASM-OBJS-$(CONFIG_FLAC_ENCODER)       += x86/flac_dsp_gpl.o
+endif
 YASM-OBJS-$(CONFIG_FMTCONVERT)         += x86/fmtconvert.o
 YASM-OBJS-$(CONFIG_H263DSP)            += x86/h263_loopfilter.o
 YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o           \
@@ -95,6 +113,9 @@ YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o          \
 YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
                                           x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
+YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/idctdsp.o
+YASM-OBJS-$(CONFIG_LLAUDDSP)           += x86/lossless_audiodsp.o
+YASM-OBJS-$(CONFIG_LLVIDDSP)           += x86/lossless_videodsp.o
 YASM-OBJS-$(CONFIG_ME_CMP)             += x86/me_cmp.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
@@ -103,21 +124,44 @@ YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o                 \
                                           x86/fpel.o                    \
                                           x86/qpel.o
 YASM-OBJS-$(CONFIG_RV34DSP)            += x86/rv34dsp.o
-YASM-OBJS-$(CONFIG_V210_ENCODER)       += x86/v210enc.o
 YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 YASM-OBJS-$(CONFIG_VP8DSP)             += x86/vp8dsp.o                  \
                                           x86/vp8dsp_loopfilter.o
 
 # decoders/encoders
-YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
-YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
+YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/aacpsdsp.o                \
+                                          x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
+YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
+YASM-OBJS-$(CONFIG_APNG_DECODER)       += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
-YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o
+YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o                 \
+                                          x86/hevc_deblock.o            \
+                                          x86/hevc_idct.o               \
+                                          x86/hevc_res_add.o            \
+                                          x86/hevc_sao.o                \
+                                          x86/hevc_sao_10bit.o
+YASM-OBJS-$(CONFIG_JPEG2000_DECODER)   += x86/jpeg2000dsp.o
+YASM-OBJS-$(CONFIG_MLP_DECODER)        += x86/mlpdsp.o
+YASM-OBJS-$(CONFIG_MPEG4_DECODER)      += x86/xvididct.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
+YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv40dsp.o
+YASM-OBJS-$(CONFIG_SVQ1_ENCODER)       += x86/svq1enc.o
+YASM-OBJS-$(CONFIG_TRUEHD_DECODER)     += x86/mlpdsp.o
+YASM-OBJS-$(CONFIG_TTA_DECODER)        += x86/ttadsp.o
+YASM-OBJS-$(CONFIG_V210_ENCODER)       += x86/v210enc.o
+YASM-OBJS-$(CONFIG_V210_DECODER)       += x86/v210.o
 YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
 YASM-OBJS-$(CONFIG_VORBIS_DECODER)     += x86/vorbisdsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp6dsp.o
-YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9dsp.o
+YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9intrapred.o            \
+                                          x86/vp9intrapred_16bpp.o      \
+                                          x86/vp9itxfm.o                \
+                                          x86/vp9lpf.o                  \
+                                          x86/vp9lpf_16bpp.o            \
+                                          x86/vp9mc.o                   \
+                                          x86/vp9mc_16bpp.o
+YASM-OBJS-$(CONFIG_WEBP_DECODER)       += x86/vp8dsp.o
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
new file mode 100644
index 0000000000..d1187df4db
--- /dev/null
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -0,0 +1,215 @@
+;******************************************************************************
+;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
+;*
+;* Copyright (C) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
+
+SECTION .text
+
+;*************************************************************************
+;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
+;*************************************************************************
+%macro PS_ADD_SQUARES 1
+cglobal ps_add_squares, 3, 3, %1, dst, src, n
+.loop:
+    movaps m0, [srcq]
+    movaps m1, [srcq+mmsize]
+    mulps  m0, m0
+    mulps  m1, m1
+%if cpuflag(sse3)
+    haddps m0, m1
+%else
+    movaps m3, m0
+    movaps m4, m1
+    shufps m3, m3, q0301
+    shufps m4, m4, q0301
+    addps  m0, m3
+    addps  m1, m4
+    shufps m0, m1, q2020
+%endif
+    addps  m0, [dstq]
+    movaps [dstq], m0
+    add  dstq, mmsize
+    add  srcq, mmsize*2
+    sub    nd, mmsize/4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+PS_ADD_SQUARES 3
+INIT_XMM sse3
+PS_ADD_SQUARES 5
+
+;*******************************************************************
+;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
+;                                   float *src1, int n);
+;*******************************************************************
+INIT_XMM sse
+cglobal ps_mul_pair_single, 4, 5, 4, dst, src1, src2, n
+    xor r4q, r4q
+
+.loop:
+    movu     m0, [src1q+r4q]
+    movu     m1, [src1q+r4q+mmsize]
+    mova     m2, [src2q]
+    mova     m3, m2
+    unpcklps m2, m2
+    unpckhps m3, m3
+    mulps    m0, m2
+    mulps    m1, m3
+    mova [dstq+r4q], m0
+    mova [dstq+r4q+mmsize], m1
+    add   src2q, mmsize
+    add     r4q, mmsize*2
+    sub      nd, mmsize/4
+    jg .loop
+    REP_RET
+
+;***********************************************************************
+;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
+;                                   float h[2][4], float h_step[2][4],
+;                                   int len);
+;***********************************************************************
+INIT_XMM sse3
+cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
+    movaps   m0, [hq]
+    movaps   m1, [h_stepq]
+    cmp      nd, 0
+    jle .ret
+    shl      nd, 3
+    add      lq, nq
+    add      rq, nq
+    neg      nq
+
+align 16
+.loop:
+    addps    m0, m1
+    movddup  m2, [lq+nq]
+    movddup  m3, [rq+nq]
+    movaps   m4, m0
+    movaps   m5, m0
+    unpcklps m4, m4
+    unpckhps m5, m5
+    mulps    m2, m4
+    mulps    m3, m5
+    addps    m2, m3
+    movsd  [lq+nq], m2
+    movhps [rq+nq], m2
+    add      nq, 8
+    jl .loop
+.ret:
+    REP_RET
+
+;*******************************************************************
+;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
+;                                 const float (*filter)[8][2],
+;                                 int stride, int n);
+;*******************************************************************
+%macro PS_HYBRID_ANALYSIS_LOOP 3
+    movu     %1, [inq+mmsize*%3]
+    movu     m1, [inq+mmsize*(5-%3)+8]
+%if cpuflag(sse3)
+    pshufd   %2, %1, q2301
+    pshufd   m4, m1, q0123
+    pshufd   m1, m1, q1032
+    pshufd   m2, [filterq+nq+mmsize*%3], q2301
+    addsubps %2, m4
+    addsubps %1, m1
+%else
+    mova     m2, [filterq+nq+mmsize*%3]
+    mova     %2, %1
+    mova     m4, m1
+    shufps   %2, %2, q2301
+    shufps   m4, m4, q0123
+    shufps   m1, m1, q1032
+    shufps   m2, m2, q2301
+    xorps    m4, m7
+    xorps    m1, m7
+    subps    %2, m4
+    subps    %1, m1
+%endif
+    mulps    %2, m2
+    mulps    %1, m2
+%if %3
+    addps    m3, %2
+    addps    m0, %1
+%endif
+%endmacro
+
+%macro PS_HYBRID_ANALYSIS 0
+cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
+%if cpuflag(sse3)
+%define MOVH movsd
+%else
+%define MOVH movlps
+%endif
+    shl strided, 3
+    shl nd, 6
+    add filterq, nq
+    neg nq
+    mova m7, [ps_p1m1p1m1]
+
+align 16
+.loop:
+    PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
+    PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
+    PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
+
+%if cpuflag(sse3)
+    pshufd   m3, m3, q2301
+    xorps    m0, m7
+    hsubps   m3, m0
+    pshufd   m1, m3, q0020
+    pshufd   m3, m3, q0031
+    addps    m1, m3
+    movsd    m2, [inq+6*8]
+%else
+    mova     m1, m3
+    mova     m2, m0
+    shufps   m1, m1, q2301
+    shufps   m2, m2, q2301
+    subps    m1, m3
+    addps    m2, m0
+    unpcklps m3, m1, m2
+    unpckhps m1, m2
+    addps    m1, m3
+    movu     m2, [inq+6*8] ; faster than movlps and no risk of overread
+%endif
+    movss    m3, [filterq+nq+8*6]
+    SPLATD   m3
+    mulps    m2, m3
+    addps    m1, m2
+    MOVH [outq], m1
+    add    outq, strideq
+    add      nq, 64
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+PS_HYBRID_ANALYSIS
+INIT_XMM sse3
+PS_HYBRID_ANALYSIS
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
new file mode 100644
index 0000000000..f6d6c039c3
--- /dev/null
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -0,0 +1,55 @@
+/*
+ * SIMD optimized MPEG-4 Parametric Stereo decoding functions
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/aacpsdsp.h"
+
+void ff_ps_add_squares_sse  (float *dst, const float (*src)[2], int n);
+void ff_ps_add_squares_sse3 (float *dst, const float (*src)[2], int n);
+void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
+                                float *src1, int n);
+void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
+                                const float (*filter)[8][2],
+                                int stride, int n);
+void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
+                                const float (*filter)[8][2],
+                                int stride, int n);
+void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
+                                   float h[2][4], float h_step[2][4],
+                                   int len);
+
+av_cold void ff_psdsp_init_x86(PSDSPContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        s->add_squares            = ff_ps_add_squares_sse;
+        s->mul_pair_single        = ff_ps_mul_pair_single_sse;
+        s->hybrid_analysis        = ff_ps_hybrid_analysis_sse;
+    }
+    if (EXTERNAL_SSE3(cpu_flags)) {
+        s->add_squares            = ff_ps_add_squares_sse3;
+        s->stereo_interpolate[0]  = ff_ps_stereo_interpolate_sse3;
+        s->hybrid_analysis        = ff_ps_hybrid_analysis_sse3;
+    }
+}
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 817d5a319c..675ade3101 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -2,20 +2,20 @@
 ;* x86-optimized AC-3 DSP functions
 ;* Copyright (c) 2011 Justin Ruggles
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -32,7 +32,7 @@ pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
 pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
 
 ; used in ff_ac3_extract_exponents()
-pd_1:   times 4 dd 1
+cextern pd_1
 pd_151: times 4 dd 151
 
 ; used in ff_apply_window_int16()
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index cd638b9228..eea2736bfa 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -2,20 +2,20 @@
  * x86-optimized AC-3 DSP functions
  * Copyright (c) 2011 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,6 +64,11 @@ void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
 
+#if ARCH_X86_32 && defined(__INTEL_COMPILER)
+#       undef HAVE_7REGS
+#       define HAVE_7REGS 0
+#endif
+
 #if HAVE_SSE_INLINE && HAVE_7REGS
 
 #define IF1(x) x
@@ -160,7 +165,7 @@ static void ac3_downmix_sse(float **samples, float (*matrix)[2],
                matrix_cmp[3][0] == matrix_cmp[4][0]) {
         MIX5(IF1, IF0);
     } else {
-        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
+        LOCAL_ALIGNED(16, float, matrix_simd, [AC3_MAX_CHANNELS], [2][4]);
         float *samp[AC3_MAX_CHANNELS];
 
         for (j = 0; j < in_ch; j++)
diff --git a/libavcodec/x86/audiodsp.asm b/libavcodec/x86/audiodsp.asm
index 696a73bd81..3ffb27fcf9 100644
--- a/libavcodec/x86/audiodsp.asm
+++ b/libavcodec/x86/audiodsp.asm
@@ -2,20 +2,20 @@
 ;* optimized audio functions
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -40,15 +40,11 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     paddd   m2, m1
     add     orderq, mmsize*2
     jl .loop
-%if mmsize == 16
-    movhlps m0, m2
-    paddd   m2, m0
-    pshuflw m0, m2, 0x4e
-%else
-    pshufw  m0, m2, 0x4e
-%endif
-    paddd   m2, m0
+    HADDD   m2, m0
     movd   eax, m2
+%if mmsize == 8
+    emms
+%endif
     RET
 %endmacro
 
@@ -80,17 +76,17 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
     SPLATD    m4
     SPLATD    m5
 .loop:
-%assign %%i 1
+%assign %%i 0
 %rep %2
-    mova      m0,  [srcq+mmsize*0*%%i]
-    mova      m1,  [srcq+mmsize*1*%%i]
-    mova      m2,  [srcq+mmsize*2*%%i]
-    mova      m3,  [srcq+mmsize*3*%%i]
+    mova      m0,  [srcq+mmsize*(0+%%i)]
+    mova      m1,  [srcq+mmsize*(1+%%i)]
+    mova      m2,  [srcq+mmsize*(2+%%i)]
+    mova      m3,  [srcq+mmsize*(3+%%i)]
 %if %3
-    mova      m7,  [srcq+mmsize*4*%%i]
-    mova      m8,  [srcq+mmsize*5*%%i]
-    mova      m9,  [srcq+mmsize*6*%%i]
-    mova      m10, [srcq+mmsize*7*%%i]
+    mova      m7,  [srcq+mmsize*(4+%%i)]
+    mova      m8,  [srcq+mmsize*(5+%%i)]
+    mova      m9,  [srcq+mmsize*(6+%%i)]
+    mova      m10, [srcq+mmsize*(7+%%i)]
 %endif
     CLIPD  m0,  m4, m5, m6
     CLIPD  m1,  m4, m5, m6
@@ -102,17 +98,17 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
     CLIPD  m9,  m4, m5, m6
     CLIPD  m10, m4, m5, m6
 %endif
-    mova  [dstq+mmsize*0*%%i], m0
-    mova  [dstq+mmsize*1*%%i], m1
-    mova  [dstq+mmsize*2*%%i], m2
-    mova  [dstq+mmsize*3*%%i], m3
+    mova  [dstq+mmsize*(0+%%i)], m0
+    mova  [dstq+mmsize*(1+%%i)], m1
+    mova  [dstq+mmsize*(2+%%i)], m2
+    mova  [dstq+mmsize*(3+%%i)], m3
 %if %3
-    mova  [dstq+mmsize*4*%%i], m7
-    mova  [dstq+mmsize*5*%%i], m8
-    mova  [dstq+mmsize*6*%%i], m9
-    mova  [dstq+mmsize*7*%%i], m10
+    mova  [dstq+mmsize*(4+%%i)], m7
+    mova  [dstq+mmsize*(5+%%i)], m8
+    mova  [dstq+mmsize*(6+%%i)], m9
+    mova  [dstq+mmsize*(7+%%i)], m10
 %endif
-%assign %%i %%i+1
+%assign %%i %%i+4*(%3+1)
 %endrep
     add     srcq, mmsize*4*(%2+%3)
     add     dstq, mmsize*4*(%2+%3)
@@ -135,3 +131,47 @@ VECTOR_CLIP_INT32 11, 1, 1, 0
 %else
 VECTOR_CLIP_INT32 6, 1, 0, 0
 %endif
+
+;-----------------------------------------------------
+;void ff_vector_clipf(float *dst, const float *src,
+;                     float min, float max, int len)
+;-----------------------------------------------------
+INIT_XMM sse
+%if UNIX64
+cglobal vector_clipf, 3,3,6, dst, src, len
+%else
+cglobal vector_clipf, 5,5,6, dst, src, min, max, len
+%endif
+%if WIN64
+    SWAP 0, 2
+    SWAP 1, 3
+%elif ARCH_X86_32
+    movss   m0, minm
+    movss   m1, maxm
+%endif
+    SPLATD  m0
+    SPLATD  m1
+        shl lend, 2
+        add srcq, lenq
+        add dstq, lenq
+        neg lenq
+.loop:
+    mova    m2,  [srcq+lenq+mmsize*0]
+    mova    m3,  [srcq+lenq+mmsize*1]
+    mova    m4,  [srcq+lenq+mmsize*2]
+    mova    m5,  [srcq+lenq+mmsize*3]
+    maxps   m2, m0
+    maxps   m3, m0
+    maxps   m4, m0
+    maxps   m5, m0
+    minps   m2, m1
+    minps   m3, m1
+    minps   m4, m1
+    minps   m5, m1
+    mova    [dstq+lenq+mmsize*0], m2
+    mova    [dstq+lenq+mmsize*1], m3
+    mova    [dstq+lenq+mmsize*2], m4
+    mova    [dstq+lenq+mmsize*3], m5
+    add     lenq, mmsize*4
+    jl .loop
+    REP_RET
diff --git a/libavcodec/x86/audiodsp_init.c b/libavcodec/x86/audiodsp_init.c
index 743f5a3699..a2ce231f32 100644
--- a/libavcodec/x86/audiodsp_init.c
+++ b/libavcodec/x86/audiodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,7 +24,6 @@
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/audiodsp.h"
-#include "audiodsp.h"
 
 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                       int order);
@@ -39,6 +38,8 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                    int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
                                int32_t min, int32_t max, unsigned int len);
+void ff_vector_clipf_sse(float *dst, const float *src,
+                         float min, float max, int len);
 
 av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
 {
@@ -50,7 +51,7 @@ av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
     if (EXTERNAL_MMXEXT(cpu_flags))
         c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
 
-    if (INLINE_SSE(cpu_flags))
+    if (EXTERNAL_SSE(cpu_flags))
         c->vector_clipf = ff_vector_clipf_sse;
 
     if (EXTERNAL_SSE2(cpu_flags)) {
diff --git a/libavcodec/x86/audiodsp_mmx.c b/libavcodec/x86/audiodsp_mmx.c
deleted file mode 100644
index cb550598f9..0000000000
--- a/libavcodec/x86/audiodsp_mmx.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/x86/asm.h"
-#include "audiodsp.h"
-
-#if HAVE_INLINE_ASM
-
-void ff_vector_clipf_sse(float *dst, const float *src,
-                         float min, float max, int len)
-{
-    x86_reg i = (len - 16) * 4;
-    __asm__ volatile (
-        "movss          %3, %%xmm4      \n\t"
-        "movss          %4, %%xmm5      \n\t"
-        "shufps $0, %%xmm4, %%xmm4      \n\t"
-        "shufps $0, %%xmm5, %%xmm5      \n\t"
-        "1:                             \n\t"
-        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
-        "movaps 16(%2, %0), %%xmm1      \n\t"
-        "movaps 32(%2, %0), %%xmm2      \n\t"
-        "movaps 48(%2, %0), %%xmm3      \n\t"
-        "maxps      %%xmm4, %%xmm0      \n\t"
-        "maxps      %%xmm4, %%xmm1      \n\t"
-        "maxps      %%xmm4, %%xmm2      \n\t"
-        "maxps      %%xmm4, %%xmm3      \n\t"
-        "minps      %%xmm5, %%xmm0      \n\t"
-        "minps      %%xmm5, %%xmm1      \n\t"
-        "minps      %%xmm5, %%xmm2      \n\t"
-        "minps      %%xmm5, %%xmm3      \n\t"
-        "movaps     %%xmm0,   (%1, %0)  \n\t"
-        "movaps     %%xmm1, 16(%1, %0)  \n\t"
-        "movaps     %%xmm2, 32(%1, %0)  \n\t"
-        "movaps     %%xmm3, 48(%1, %0)  \n\t"
-        "sub           $64, %0          \n\t"
-        "jge            1b              \n\t"
-        : "+&r" (i)
-        : "r" (dst), "r" (src), "m" (min), "m" (max)
-        : "memory");
-}
-
-#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/blockdsp.asm b/libavcodec/x86/blockdsp.asm
new file mode 100644
index 0000000000..7cbfa3a843
--- /dev/null
+++ b/libavcodec/x86/blockdsp.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* SIMD-optimized clear block functions
+;* Copyright (c) 2002 Michael Niedermayer
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2009 Fiona Glaser
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;----------------------------------------
+; void ff_clear_block(int16_t *blocks);
+;----------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline store loops
+%macro CLEAR_BLOCK 2
+cglobal clear_block, 1, 1, %1, blocks
+    ZERO  m0, m0
+%assign %%i 0
+%rep %2
+    mova  [blocksq+mmsize*(0+%%i)], m0
+    mova  [blocksq+mmsize*(1+%%i)], m0
+    mova  [blocksq+mmsize*(2+%%i)], m0
+    mova  [blocksq+mmsize*(3+%%i)], m0
+    mova  [blocksq+mmsize*(4+%%i)], m0
+    mova  [blocksq+mmsize*(5+%%i)], m0
+    mova  [blocksq+mmsize*(6+%%i)], m0
+    mova  [blocksq+mmsize*(7+%%i)], m0
+%assign %%i %%i+8
+%endrep
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCK 0, 2
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCK 1, 1
+
+;-----------------------------------------
+; void ff_clear_blocks(int16_t *blocks);
+;-----------------------------------------
+; %1 = number of xmm registers used
+%macro CLEAR_BLOCKS 1
+cglobal clear_blocks, 1, 2, %1, blocks, len
+    add   blocksq, 768
+    mov      lenq, -768
+    ZERO       m0, m0
+.loop:
+    mova  [blocksq+lenq+mmsize*0], m0
+    mova  [blocksq+lenq+mmsize*1], m0
+    mova  [blocksq+lenq+mmsize*2], m0
+    mova  [blocksq+lenq+mmsize*3], m0
+    mova  [blocksq+lenq+mmsize*4], m0
+    mova  [blocksq+lenq+mmsize*5], m0
+    mova  [blocksq+lenq+mmsize*6], m0
+    mova  [blocksq+lenq+mmsize*7], m0
+    add   lenq, mmsize*8
+    js .loop
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCKS 0
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCKS 1
diff --git a/libavcodec/x86/blockdsp.c b/libavcodec/x86/blockdsp.c
deleted file mode 100644
index b5294242ab..0000000000
--- a/libavcodec/x86/blockdsp.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/internal.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/blockdsp.h"
-#include "libavcodec/version.h"
-
-#if HAVE_INLINE_ASM
-
-#define CLEAR_BLOCKS(name, n)                           \
-static void name(int16_t *blocks)                       \
-{                                                       \
-    __asm__ volatile (                                  \
-        "pxor %%mm7, %%mm7              \n\t"           \
-        "mov     %1,        %%"REG_a"   \n\t"           \
-        "1:                             \n\t"           \
-        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
-        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
-        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
-        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
-        "add    $32, %%"REG_a"          \n\t"           \
-        "js      1b                     \n\t"           \
-        :: "r"(((uint8_t *) blocks) + 128 * n),         \
-           "i"(-128 * n)                                \
-        : "%"REG_a);                                    \
-}
-CLEAR_BLOCKS(clear_blocks_mmx, 6)
-CLEAR_BLOCKS(clear_block_mmx, 1)
-
-static void clear_block_sse(int16_t *block)
-{
-    __asm__ volatile (
-        "xorps  %%xmm0, %%xmm0          \n"
-        "movaps %%xmm0,    (%0)         \n"
-        "movaps %%xmm0,  16(%0)         \n"
-        "movaps %%xmm0,  32(%0)         \n"
-        "movaps %%xmm0,  48(%0)         \n"
-        "movaps %%xmm0,  64(%0)         \n"
-        "movaps %%xmm0,  80(%0)         \n"
-        "movaps %%xmm0,  96(%0)         \n"
-        "movaps %%xmm0, 112(%0)         \n"
-        :: "r" (block)
-        : "memory");
-}
-
-static void clear_blocks_sse(int16_t *blocks)
-{
-    __asm__ volatile (
-        "xorps  %%xmm0, %%xmm0              \n"
-        "mov        %1,         %%"REG_a"   \n"
-        "1:                                 \n"
-        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
-        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
-        "add      $128,         %%"REG_a"   \n"
-        "js         1b                      \n"
-        :: "r"(((uint8_t *) blocks) + 128 * 6), "i"(-128 * 6)
-        : "%"REG_a);
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-#if FF_API_XVMC
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth,
-                                  AVCodecContext *avctx)
-#else
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth)
-#endif /* FF_API_XVMC */
-{
-#if HAVE_INLINE_ASM
-    int cpu_flags = av_get_cpu_flags();
-
-    if (!high_bit_depth) {
-        if (INLINE_MMX(cpu_flags)) {
-            c->clear_block  = clear_block_mmx;
-            c->clear_blocks = clear_blocks_mmx;
-        }
-
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
-    if (CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)
-        return;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
-
-        if (INLINE_SSE(cpu_flags)) {
-            c->clear_block  = clear_block_sse;
-            c->clear_blocks = clear_blocks_sse;
-        }
-    }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/blockdsp_init.c b/libavcodec/x86/blockdsp_init.c
new file mode 100644
index 0000000000..21599934ff
--- /dev/null
+++ b/libavcodec/x86/blockdsp_init.c
@@ -0,0 +1,54 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/blockdsp.h"
+#include "libavcodec/version.h"
+
+void ff_clear_block_mmx(int16_t *block);
+void ff_clear_block_sse(int16_t *block);
+void ff_clear_blocks_mmx(int16_t *blocks);
+void ff_clear_blocks_sse(int16_t *blocks);
+
+av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
+                                  AVCodecContext *avctx)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->clear_block  = ff_clear_block_mmx;
+        c->clear_blocks = ff_clear_blocks_mmx;
+    }
+
+    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
+    if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
+        return;
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        c->clear_block  = ff_clear_block_sse;
+        c->clear_blocks = ff_clear_blocks_sse;
+    }
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 4810867921..56d8083622 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -1,21 +1,23 @@
 ;******************************************************************************
 ;* optimized bswap buffer functions
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -24,6 +26,8 @@
 SECTION_RODATA
 pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
+cextern pb_80
+
 SECTION .text
 
 ; %1 = aligned/unaligned
@@ -84,11 +88,14 @@ SECTION .text
 %macro BSWAP32_BUF 0
 %if cpuflag(ssse3)
 cglobal bswap32_buf, 3,4,3
+    mov      r3, r1
     mova     m2, [pb_bswap32]
 %else
 cglobal bswap32_buf, 3,4,5
+    mov      r3, r1
 %endif
-    test     r1, 15
+    or       r3, r0
+    test     r3, 15
     jz       .start_align
     BSWAP_LOOPS  u
     jmp      .left
diff --git a/libavcodec/x86/bswapdsp_init.c b/libavcodec/x86/bswapdsp_init.c
index ba40f2dbe1..c042e56371 100644
--- a/libavcodec/x86/bswapdsp_init.c
+++ b/libavcodec/x86/bswapdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index d1701bf071..3510336f95 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,8 +27,28 @@
 #include "libavutil/x86/asm.h"
 #include "config.h"
 
+#if   (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+   || (                  !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
+   || (defined(__INTEL_COMPILER) && defined(_MSC_VER))
+#       define BROKEN_COMPILER 1
+#else
+#       define BROKEN_COMPILER 0
+#endif
+
 #if HAVE_INLINE_ASM
 
+#ifndef UNCHECKED_BITSTREAM_READER
+#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
+#endif
+
+#if UNCHECKED_BITSTREAM_READER
+#define END_CHECK(end) ""
+#else
+#define END_CHECK(end) \
+        "cmp    "end"       , %%"REG_c"                                 \n\t"\
+        "jge    1f                                                      \n\t"
+#endif
+
 #ifdef BROKEN_RELOCATIONS
 #define TABLES_ARG , "r"(tables)
 
@@ -73,8 +93,7 @@
         "test   "lowword"   , "lowword"                                 \n\t"\
         "jnz    2f                                                      \n\t"\
         "mov    "byte"      , %%"REG_c"                                 \n\t"\
-        "cmp    "end"       , %%"REG_c"                                 \n\t"\
-        "jge    1f                                                      \n\t"\
+        END_CHECK(end)\
         "add"OPSIZE" $2     , "byte"                                    \n\t"\
         "1:                                                             \n\t"\
         "movzwl (%%"REG_c") , "tmp"                                     \n\t"\
@@ -92,7 +111,8 @@
         "2:                                                             \n\t"
 
 #else /* BROKEN_RELOCATIONS */
-#define TABLES_ARG
+#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
+#define RIP_ARG
 
 #if HAVE_FAST_CMOV
 #define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
@@ -134,8 +154,7 @@
         "test   "lowword"   , "lowword"                                 \n\t"\
         " jnz   2f                                                      \n\t"\
         "mov    "byte"      , %%"REG_c"                                 \n\t"\
-        "cmp    "end"       , %%"REG_c"                                 \n\t"\
-        "jge    1f                                                      \n\t"\
+        END_CHECK(end)\
         "add"OPSIZE" $2     , "byte"                                    \n\t"\
         "1:                                                             \n\t"\
         "movzwl (%%"REG_c")     , "tmp"                                 \n\t"\
@@ -154,8 +173,7 @@
 
 #endif /* BROKEN_RELOCATIONS */
 
-
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
 #define get_cabac_inline get_cabac_inline_x86
 static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                                                  uint8_t *const state)
@@ -167,6 +185,7 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
     __asm__ volatile(
         "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
 
@@ -178,17 +197,19 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                              AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                              AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                              "%8")
-        : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
+        : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
         : "r"(state), "r"(c),
           "i"(offsetof(CABACContext, bytestream)),
           "i"(offsetof(CABACContext, bytestream_end))
           TABLES_ARG
+          ,"1"(c->low), "2"(c->range)
         : "%"REG_c, "memory"
     );
     return bit & 1;
 }
-#endif /* HAVE_7REGS */
+#endif /* HAVE_7REGS && !BROKEN_COMPILER */
 
+#if !BROKEN_COMPILER
 #define get_cabac_bypass_sign get_cabac_bypass_sign_x86
 static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
 {
@@ -199,7 +220,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "shl             $17, %k1       \n\t"
         "add           %%eax, %%eax     \n\t"
         "sub             %k1, %%eax     \n\t"
-        "cltd                           \n\t"
+        "cdq                            \n\t"
         "and           %%edx, %k1       \n\t"
         "add             %k1, %%eax     \n\t"
         "xor           %%edx, %%ecx     \n\t"
@@ -211,10 +232,16 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "movzwl         (%1), %%edx     \n\t"
         "bswap         %%edx            \n\t"
         "shrl            $15, %%edx     \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "add              $2, %1        \n\t"
+        "addl          %%edx, %%eax     \n\t"
+        "mov              %1, %c4(%2)   \n\t"
+#else
         "addl          %%edx, %%eax     \n\t"
         "cmp         %c5(%2), %1        \n\t"
         "jge              1f            \n\t"
         "add"OPSIZE"      $2, %c4(%2)   \n\t"
+#endif
         "1:                             \n\t"
         "movl          %%eax, %c3(%2)   \n\t"
 
@@ -240,7 +267,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
         "shl             $17, %k1       \n\t"
         "add           %%eax, %%eax     \n\t"
         "sub             %k1, %%eax     \n\t"
-        "cltd                           \n\t"
+        "cdq                            \n\t"
         "and           %%edx, %k1       \n\t"
         "add             %k1, %%eax     \n\t"
         "inc           %%edx            \n\t"
@@ -268,6 +295,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
     );
     return res;
 }
+#endif /* !BROKEN_COMPILER */
 
 #endif /* HAVE_INLINE_ASM */
 #endif /* AVCODEC_X86_CABAC_H */
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index 39eec4b3ee..4b20e655a7 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -5,20 +5,20 @@
  * MMX-optimized DSP functions, based on H.264 optimizations by
  * Michael Niedermayer and Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -139,7 +139,7 @@ static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
 static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
 {
     int i;
-    DECLARE_ALIGNED(8, int16_t, b2)[64];
+    LOCAL_ALIGNED(16, int16_t, b2, [64]);
 
     for(i=0; i<2; i++){
         cavs_idct8_1d(block + 4 * i, ff_pw_4.a);
@@ -196,7 +196,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
         );
     }
 
-    ff_add_pixels_clamped_mmx(b2, dst, stride);
+    ff_add_pixels_clamped(b2, dst, stride);
 }
 
 #endif /* HAVE_MMX_INLINE */
@@ -210,10 +210,10 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
  ****************************************************************************/
 
 /* vertical filter [-1 -2 96 42 -7  0]  */
-#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
         "movd (%0), "#F"            \n\t"\
         "movq "#C", %%mm6           \n\t"\
-        "pmullw %5, %%mm6           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
         "movq "#D", %%mm7           \n\t"\
         "pmullw "MANGLE(MUL2)", %%mm7\n\t"\
         "psllw $3, "#E"             \n\t"\
@@ -228,35 +228,35 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
         "psubw "#B", %%mm6          \n\t"\
         "psraw $1, "#B"             \n\t"\
         "psubw "#A", %%mm6          \n\t"\
-        "paddw %4, %%mm6            \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
         "psraw $7, %%mm6            \n\t"\
         "packuswb %%mm6, %%mm6      \n\t"\
         OP(%%mm6, (%1), A, d)            \
         "add %3, %1                 \n\t"
 
 /* vertical filter [ 0 -1  5  5 -1  0]  */
-#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
         "movd (%0), "#F"            \n\t"\
         "movq "#C", %%mm6           \n\t"\
         "paddw "#D", %%mm6          \n\t"\
-        "pmullw %5, %%mm6           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
         "add %2, %0                 \n\t"\
         "punpcklbw %%mm7, "#F"      \n\t"\
         "psubw "#B", %%mm6          \n\t"\
         "psubw "#E", %%mm6          \n\t"\
-        "paddw %4, %%mm6            \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
         "psraw $3, %%mm6            \n\t"\
         "packuswb %%mm6, %%mm6      \n\t"\
         OP(%%mm6, (%1), A, d)            \
         "add %3, %1                 \n\t"
 
 /* vertical filter [ 0 -7 42 96 -2 -1]  */
-#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
         "movd (%0), "#F"            \n\t"\
         "movq "#C", %%mm6           \n\t"\
         "pmullw "MANGLE(MUL2)", %%mm6\n\t"\
         "movq "#D", %%mm7           \n\t"\
-        "pmullw %5, %%mm7           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm7\n\t"\
         "psllw $3, "#B"             \n\t"\
         "psubw "#B", %%mm6          \n\t"\
         "psraw $3, "#B"             \n\t"\
@@ -269,7 +269,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
         "psubw "#E", %%mm6          \n\t"\
         "psraw $1, "#E"             \n\t"\
         "psubw "#F", %%mm6          \n\t"\
-        "paddw %4, %%mm6            \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
         "psraw $7, %%mm6            \n\t"\
         "packuswb %%mm6, %%mm6      \n\t"\
         OP(%%mm6, (%1), A, d)            \
@@ -298,32 +298,34 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
         "punpcklbw %%mm7, %%mm2     \n\t"\
         "punpcklbw %%mm7, %%mm3     \n\t"\
         "punpcklbw %%mm7, %%mm4     \n\t"\
-        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
-        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
-        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
-        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
-        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
-        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
-        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
-        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
+        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
         \
         : "+a"(src), "+c"(dst)\
-        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
+        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+          NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
         : "memory"\
      );\
      if(h==16){\
         __asm__ volatile(\
-            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
-            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
-            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
-            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
-            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
-            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
-            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
-            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
+            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
             \
            : "+a"(src), "+c"(dst)\
-           : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD),  "m"(MUL1)\
+           : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+             NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
            : "memory"\
         );\
      }\
@@ -336,7 +338,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, int
     int h=8;\
     __asm__ volatile(\
         "pxor %%mm7, %%mm7          \n\t"\
-        "movq %5, %%mm6             \n\t"\
+        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
         "1:                         \n\t"\
         "movq    (%0), %%mm0        \n\t"\
         "movq   1(%0), %%mm2        \n\t"\
@@ -362,7 +364,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, int
         "paddw %%mm3, %%mm5         \n\t"\
         "psubw %%mm2, %%mm0         \n\t"\
         "psubw %%mm5, %%mm1         \n\t"\
-        "movq %6, %%mm5             \n\t"\
+        "movq "MANGLE(ff_pw_4)", %%mm5\n\t"\
         "paddw %%mm5, %%mm0         \n\t"\
         "paddw %%mm5, %%mm1         \n\t"\
         "psraw $3, %%mm0            \n\t"\
@@ -374,7 +376,8 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, int
         "decl %2                    \n\t"\
         " jnz 1b                    \n\t"\
         : "+a"(src), "+c"(dst), "+m"(h)\
-        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
+        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
+          NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\
         : "memory"\
     );\
 }\
@@ -384,7 +387,7 @@ static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8
 }\
 \
 static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
-  QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5)         \
+  QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42)        \
 }\
 \
 static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
@@ -457,7 +460,7 @@ static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
 
 #endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
 
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_EXTERNAL
 static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t stride)
 {
@@ -470,6 +473,12 @@ static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
     ff_avg_pixels8_mmx(dst, src, stride, 8);
 }
 
+static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride)
+{
+    ff_avg_pixels8_mmxext(dst, src, stride, 8);
+}
+
 static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
 {
@@ -482,18 +491,40 @@ static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
     ff_avg_pixels16_mmx(dst, src, stride, 16);
 }
 
+static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+                                        ptrdiff_t stride)
+{
+    ff_avg_pixels16_mmxext(dst, src, stride, 16);
+}
+
+static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride)
+{
+    ff_put_pixels16_sse2(dst, src, stride, 16);
+}
+
+static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride)
+{
+    ff_avg_pixels16_sse2(dst, src, stride, 16);
+}
+#endif
+
 static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
                                      AVCodecContext *avctx)
 {
+#if HAVE_MMX_EXTERNAL
     c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
     c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
     c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
     c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
+#endif
 
+#if HAVE_MMX_INLINE
     c->cavs_idct8_add = cavs_idct8_add_mmx;
     c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
-}
 #endif /* HAVE_MMX_INLINE */
+}
 
 #define DSPFUNC(PFX, IDX, NUM, EXT)                                                       \
     c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
@@ -509,15 +540,6 @@ CAVS_MC(put_,  8, mmxext)
 CAVS_MC(put_, 16, mmxext)
 CAVS_MC(avg_,  8, mmxext)
 CAVS_MC(avg_, 16, mmxext)
-
-static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c,
-                                        AVCodecContext *avctx)
-{
-    DSPFUNC(put, 0, 16, mmxext);
-    DSPFUNC(put, 1,  8, mmxext);
-    DSPFUNC(avg, 0, 16, mmxext);
-    DSPFUNC(avg, 1,  8, mmxext);
-}
 #endif /* HAVE_MMXEXT_INLINE */
 
 #if HAVE_AMD3DNOW_INLINE
@@ -541,18 +563,31 @@ static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
 
 av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
 {
-#if HAVE_MMX_INLINE
-    int cpu_flags = av_get_cpu_flags();
+    av_unused int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags))
-        cavsdsp_init_mmx(c, avctx);
-#endif /* HAVE_MMX_INLINE */
+    cavsdsp_init_mmx(c, avctx);
 #if HAVE_AMD3DNOW_INLINE
     if (INLINE_AMD3DNOW(cpu_flags))
         cavsdsp_init_3dnow(c, avctx);
 #endif /* HAVE_AMD3DNOW_INLINE */
 #if HAVE_MMXEXT_INLINE
-    if (INLINE_MMXEXT(cpu_flags))
-        cavsdsp_init_mmxext(c, avctx);
-#endif /* HAVE_MMXEXT_INLINE */
+    if (INLINE_MMXEXT(cpu_flags)) {
+        DSPFUNC(put, 0, 16, mmxext);
+        DSPFUNC(put, 1,  8, mmxext);
+        DSPFUNC(avg, 0, 16, mmxext);
+        DSPFUNC(avg, 1,  8, mmxext);
+    }
+#endif
+#if HAVE_MMX_EXTERNAL
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext;
+        c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext;
+    }
+#endif
+#if HAVE_SSE2_EXTERNAL
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
+        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
+    }
+#endif
 }
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 5b8d1b224f..3f3ee0ffc1 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -1,20 +1,20 @@
 /*
- * MMX/SSE constants used across x86 dsp optimizations.
+ * MMX/SSE/AVX constants used across x86 dsp optimizations.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,10 +22,10 @@
 #include "libavutil/x86/asm.h" // for xmm_reg
 #include "constants.h"
 
-DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
-
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL,
+                                                    0x0001000100010001ULL, 0x0001000100010001ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL,
+                                                    0x0002000200020002ULL, 0x0002000200020002ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
@@ -35,19 +35,55 @@ DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_20)   = { 0x0014001400140014ULL, 0x0014001400140014ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_255)  = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+                                                    0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_256)  = { 0x0100010001000100ULL, 0x0100010001000100ULL,
+                                                    0x0100010001000100ULL, 0x0100010001000100ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL,
+                                                    0x0200020002000200ULL, 0x0200020002000200ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
+                                                    0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
+                                                    0x0400040004000400ULL, 0x0400040004000400ULL};
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
+                                                    0x0800080008000800ULL, 0x0800080008000800ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
+                                                    0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
+                                                    0x1000100010001000ULL, 0x1000100010001000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
+                                                    0x2000200020002000ULL, 0x2000200020002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_m1)   = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
+                                                    0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
 
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL,
+                                                    0x0000000000000000ULL, 0x0000000000000000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL,
+                                                    0x0101010101010101ULL, 0x0101010101010101ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_2)    = { 0x0202020202020202ULL, 0x0202020202020202ULL,
+                                                    0x0202020202020202ULL, 0x0202020202020202ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL,
+                                                    0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const xmm_reg,  ff_pb_15)   = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED(16, const xmm_reg,  ff_ps_neg)  = { 0x8000000080000000ULL, 0x8000000080000000ULL };
+
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_1)    = { 0x0000000100000001ULL, 0x0000000100000001ULL,
+                                                    0x0000000100000001ULL, 0x0000000100000001ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_16)   = { 0x0000001000000010ULL, 0x0000001000000010ULL,
+                                                    0x0000001000000010ULL, 0x0000001000000010ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_32)   = { 0x0000002000000020ULL, 0x0000002000000020ULL,
+                                                    0x0000002000000020ULL, 0x0000002000000020ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+                                                    0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index f38fbe3425..ee8422eeb3 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -1,20 +1,20 @@
 /*
  * MMX/SSE constants used across x86 dsp optimizations.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,27 +25,46 @@
 
 #include "libavutil/x86/asm.h"
 
-extern const uint64_t ff_wtwo;
-
+extern const ymm_reg  ff_pw_1;
+extern const ymm_reg  ff_pw_2;
 extern const xmm_reg  ff_pw_3;
 extern const xmm_reg  ff_pw_4;
 extern const xmm_reg  ff_pw_5;
 extern const xmm_reg  ff_pw_8;
+extern const xmm_reg  ff_pw_9;
 extern const uint64_t ff_pw_15;
 extern const xmm_reg  ff_pw_16;
 extern const xmm_reg  ff_pw_18;
-extern const uint64_t ff_pw_20;
+extern const xmm_reg  ff_pw_20;
 extern const xmm_reg  ff_pw_32;
 extern const uint64_t ff_pw_42;
 extern const uint64_t ff_pw_53;
 extern const xmm_reg  ff_pw_64;
 extern const uint64_t ff_pw_96;
 extern const uint64_t ff_pw_128;
-extern const uint64_t ff_pw_255;
+extern const ymm_reg  ff_pw_255;
+extern const ymm_reg  ff_pw_512;
+extern const ymm_reg  ff_pw_1023;
+extern const ymm_reg  ff_pw_1024;
+extern const ymm_reg  ff_pw_2048;
+extern const ymm_reg  ff_pw_4095;
+extern const ymm_reg  ff_pw_4096;
+extern const ymm_reg  ff_pw_8192;
+extern const ymm_reg  ff_pw_m1;
 
-extern const xmm_reg  ff_pb_1;
-extern const xmm_reg  ff_pb_3;
-extern const xmm_reg  ff_pb_F8;
+extern const ymm_reg  ff_pb_0;
+extern const ymm_reg  ff_pb_1;
+extern const ymm_reg  ff_pb_2;
+extern const ymm_reg  ff_pb_3;
+extern const xmm_reg  ff_pb_80;
+extern const xmm_reg  ff_pb_FE;
 extern const uint64_t ff_pb_FC;
 
+extern const xmm_reg  ff_ps_neg;
+
+extern const ymm_reg  ff_pd_1;
+extern const ymm_reg  ff_pd_16;
+extern const ymm_reg  ff_pd_32;
+extern const ymm_reg  ff_pd_65535;
+
 #endif /* AVCODEC_X86_CONSTANTS_H */
diff --git a/libavcodec/x86/dca.h b/libavcodec/x86/dca.h
deleted file mode 100644
index 11d45ae61c..0000000000
--- a/libavcodec/x86/dca.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_DCA_H
-#define AVCODEC_X86_DCA_H
-
-#include "config.h"
-
-#if ARCH_X86_64 && HAVE_SSE2_INLINE
-# include "libavutil/x86/asm.h"
-# include "libavutil/mem.h"
-#include "libavcodec/dcadsp.h"
-
-# define int8x8_fmul_int32 int8x8_fmul_int32
-static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
-                                     float *dst, const int8_t *src, int scale)
-{
-    DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000;
-    __asm__ volatile (
-        "cvtsi2ss        %2, %%xmm0 \n\t"
-        "mulss           %3, %%xmm0 \n\t"
-        "movq          (%1), %%xmm1 \n\t"
-        "punpcklbw   %%xmm1, %%xmm1 \n\t"
-        "movaps      %%xmm1, %%xmm2 \n\t"
-        "punpcklwd   %%xmm1, %%xmm1 \n\t"
-        "punpckhwd   %%xmm2, %%xmm2 \n\t"
-        "psrad          $24, %%xmm1 \n\t"
-        "psrad          $24, %%xmm2 \n\t"
-        "shufps  $0, %%xmm0, %%xmm0 \n\t"
-        "cvtdq2ps    %%xmm1, %%xmm1 \n\t"
-        "cvtdq2ps    %%xmm2, %%xmm2 \n\t"
-        "mulps       %%xmm0, %%xmm1 \n\t"
-        "mulps       %%xmm0, %%xmm2 \n\t"
-        "movaps      %%xmm1,  0(%0) \n\t"
-        "movaps      %%xmm2, 16(%0) \n\t"
-        :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16)
-        XMM_CLOBBERS_ONLY("xmm0", "xmm1", "xmm2")
-    );
-}
-
-#endif /* ARCH_X86_64 && HAVE_SSE2_INLINE */
-
-#endif /* AVCODEC_X86_DCA_H */
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 18c7a0c354..548cec10b1 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -2,20 +2,20 @@
 ;* SSE-optimized functions for the DCA decoder
 ;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -132,11 +132,16 @@ DECODE_HF
     mulps       va, %2
     mulps       vb, %2
 %if %0 == 3
+%if cpuflag(fma3)
+    fmaddps     va, m4, %3, va
+    fmaddps     vb, m0, %3, vb
+%else
     mulps       m4, %3
     mulps       m0, %3
     addps       va, m4
     addps       vb, m0
 %endif
+%endif
     ; va = va1 va2 va3 va4
     ; vb = vb1 vb2 vb3 vb4
 %if %1
@@ -198,6 +203,10 @@ cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
 INIT_XMM sse
 DCA_LFE_FIR 0
 DCA_LFE_FIR 1
+%if HAVE_FMA3_EXTERNAL
+INIT_XMM fma3
+DCA_LFE_FIR 0
+%endif
 
 %macro SETZERO 1
 %if cpuflag(sse2) && notcpuflag(avx)
@@ -324,7 +333,7 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
 %if ARCH_X86_32
     mov         buf2, synth_buf2mp
 %endif
-.mainloop
+.mainloop:
     ; m1 = a  m2 = b  m3 = c  m4 = d
     SETZERO       m3
     SETZERO       m4
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 7c2bec1f9b..1a19f6b807 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@ void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS
                        int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
 void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
 void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
+void ff_dca_lfe_fir0_fma3(float *out, const float *in, const float *coefs);
 
 av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
 {
@@ -54,6 +55,10 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
     if (EXTERNAL_SSE4(cpu_flags)) {
         s->decode_hf = ff_decode_hf_sse4;
     }
+
+    if (EXTERNAL_FMA3(cpu_flags)) {
+        s->lfe_fir[0]        = ff_dca_lfe_fir0_fma3;
+    }
 }
 
 
diff --git a/libavcodec/x86/dct-test.c b/libavcodec/x86/dct-test.c
index 9d4aaf5415..0414381e65 100644
--- a/libavcodec/x86/dct-test.c
+++ b/libavcodec/x86/dct-test.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,6 +22,37 @@
 #include "xvididct.h"
 #include "simple_idct.h"
 
+#if (CONFIG_PRORES_DECODER || CONFIG_PRORES_LGPL_DECODER) && ARCH_X86_64 && HAVE_YASM
+void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
+                                int16_t *block, int16_t *qmat);
+
+#define PR_WRAP(INSN) \
+static void ff_prores_idct_put_10_##INSN##_wrap(int16_t *dst){ \
+    LOCAL_ALIGNED(16, int16_t, qmat, [64]); \
+    LOCAL_ALIGNED(16, int16_t, tmp, [64]); \
+    int i; \
+ \
+    for(i=0; i<64; i++){ \
+        qmat[i]=4; \
+        tmp[i]= dst[i]; \
+    } \
+    ff_prores_idct_put_10_##INSN (dst, 16, tmp, qmat); \
+ \
+    for(i=0; i<64; i++) { \
+         dst[i] -= 512; \
+    } \
+}
+
+PR_WRAP(sse2)
+
+# if HAVE_AVX_EXTERNAL
+void ff_prores_idct_put_10_avx(uint16_t *dst, int linesize,
+                               int16_t *block, int16_t *qmat);
+PR_WRAP(avx)
+# endif
+
+#endif
+
 static const struct algo fdct_tab_arch[] = {
 #if HAVE_MMX_INLINE
     { "MMX",    ff_fdct_mmx,    FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX },
@@ -39,21 +70,25 @@ static const struct algo idct_tab_arch[] = {
 #if HAVE_MMX_INLINE
     { "SIMPLE-MMX",  ff_simple_idct_mmx,  FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_MMX },
 #endif
-#if CONFIG_MPEG4_DECODER
-#if HAVE_MMX_INLINE
+#if CONFIG_MPEG4_DECODER && HAVE_YASM
+#if ARCH_X86_32
     { "XVID-MMX",    ff_xvid_idct_mmx,    FF_IDCT_PERM_NONE,   AV_CPU_FLAG_MMX,    1 },
-#endif
-#if HAVE_MMXEXT_INLINE
     { "XVID-MMXEXT", ff_xvid_idct_mmxext, FF_IDCT_PERM_NONE,   AV_CPU_FLAG_MMXEXT, 1 },
 #endif
-#if HAVE_SSE2_INLINE
+#if HAVE_SSE2_EXTERNAL
     { "XVID-SSE2",   ff_xvid_idct_sse2,   FF_IDCT_PERM_SSE2,   AV_CPU_FLAG_SSE2,   1 },
 #endif
-#endif /* CONFIG_MPEG4_DECODER */
+#endif /* CONFIG_MPEG4_DECODER && HAVE_YASM */
+#if (CONFIG_PRORES_DECODER || CONFIG_PRORES_LGPL_DECODER) && ARCH_X86_64 && HAVE_YASM
+    { "PR-SSE2",     ff_prores_idct_put_10_sse2_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
+# if HAVE_AVX_EXTERNAL
+    { "PR-AVX",      ff_prores_idct_put_10_avx_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX, 1 },
+# endif
+#endif
     { 0 }
 };
 
-static short idct_simple_mmx_perm[64] = {
+static const uint8_t idct_simple_mmx_perm[64] = {
     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
diff --git a/libavcodec/x86/dct32.asm b/libavcodec/x86/dct32.asm
index 2c4c32eb11..4e657b5460 100644
--- a/libavcodec/x86/dct32.asm
+++ b/libavcodec/x86/dct32.asm
@@ -2,20 +2,20 @@
 ;* 32 point SSE-optimized DCT transform
 ;* Copyright (c) 2010 Vitor Sessak
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -192,6 +192,7 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
 
 INIT_YMM avx
 SECTION .text
+%if HAVE_AVX_EXTERNAL
 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
 cglobal dct32_float, 2,3,8, out, in, tmp
     ; pass 1
@@ -264,6 +265,7 @@ cglobal dct32_float, 2,3,8, out, in, tmp
 INIT_XMM
     PASS6_AND_PERMUTE
     RET
+%endif
 
 %if ARCH_X86_64
 %define SPILL SWAP
diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c
index b2e43a9be1..c31ef92238 100644
--- a/libavcodec/x86/dct_init.c
+++ b/libavcodec/x86/dct_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/dirac_dwt.c b/libavcodec/x86/dirac_dwt.c
new file mode 100644
index 0000000000..3c51ea6ffa
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt.c
@@ -0,0 +1,202 @@
+/*
+ * MMX optimized discrete wavelet transform
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "dirac_dwt.h"
+
+#define COMPOSE_VERTICAL(ext, align) \
+void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
+void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
+void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
+void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
+void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
+void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
+void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
+\
+static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) \
+        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
+\
+    ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) \
+        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
+\
+    ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
+                                           IDWTELEM *b3, IDWTELEM *b4, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) \
+        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+    ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+\
+static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
+                                          IDWTELEM *b3, IDWTELEM *b4, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) \
+        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+    ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) { \
+        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
+        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
+    } \
+\
+    ff_vertical_compose_haar##ext(b0, b1, width_align); \
+} \
+static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    ff_horizontal_compose_haar0i##ext(b, tmp, w);\
+\
+    for (; x < w2; x++) {\
+        b[2*x  ] = tmp[x];\
+        b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
+    }\
+}\
+static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    ff_horizontal_compose_haar1i##ext(b, tmp, w);\
+\
+    for (; x < w2; x++) {\
+        b[2*x  ] = (tmp[x] + 1)>>1;\
+        b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
+    }\
+}\
+\
+
+#if HAVE_YASM
+#if !ARCH_X86_64
+COMPOSE_VERTICAL(_mmx, 4)
+#endif
+COMPOSE_VERTICAL(_sse2, 8)
+
+
+void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w);
+
+static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w)
+{
+    int w2= w>>1;
+    int x= w2 - (w2&7);
+    ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
+
+    for (; x < w2; x++) {
+        b[2*x  ] = (tmp[x] + 1)>>1;
+        b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+    }
+}
+#endif
+
+void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_YASM
+  int mm_flags = av_get_cpu_flags();
+
+#if !ARCH_X86_64
+    if (!(mm_flags & AV_CPU_FLAG_MMX))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+        break;
+    case DWT_DIRAC_HAAR0:
+        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
+        d->horizontal_compose = horizontal_compose_haar0i_mmx;
+        break;
+    case DWT_DIRAC_HAAR1:
+        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
+        d->horizontal_compose = horizontal_compose_haar1i_mmx;
+        break;
+    }
+#endif
+
+    if (!(mm_flags & AV_CPU_FLAG_SSE2))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+        break;
+    case DWT_DIRAC_HAAR0:
+        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
+        d->horizontal_compose = horizontal_compose_haar0i_sse2;
+        break;
+    case DWT_DIRAC_HAAR1:
+        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
+        d->horizontal_compose = horizontal_compose_haar1i_sse2;
+        break;
+    }
+
+    if (!(mm_flags & AV_CPU_FLAG_SSSE3))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->horizontal_compose = horizontal_compose_dd97i_ssse3;
+        break;
+    }
+#endif // HAVE_YASM
+}
diff --git a/libavcodec/x86/dirac_dwt.h b/libavcodec/x86/dirac_dwt.h
new file mode 100644
index 0000000000..126b29029f
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt.h
@@ -0,0 +1,30 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DIRAC_DWT_H
+#define AVCODEC_X86_DIRAC_DWT_H
+
+#include "libavcodec/dirac_dwt.h"
+
+void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+
+void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type);
+
+#endif
diff --git a/libavcodec/x86/diracdsp_mmx.c b/libavcodec/x86/diracdsp_mmx.c
new file mode 100644
index 0000000000..11df5e395e
--- /dev/null
+++ b/libavcodec/x86/diracdsp_mmx.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+#include "diracdsp_mmx.h"
+#include "fpel.h"
+
+void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+
+#define HPEL_FILTER(MMSIZE, EXT)                                                             \
+    void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int);               \
+    void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int);                    \
+                                                                                             \
+    static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,       \
+                                          const uint8_t *src, int stride, int width, int height)   \
+    {                                                                                        \
+        while( height-- )                                                                    \
+        {                                                                                    \
+            ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
+            ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width);                                \
+            ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width);                               \
+                                                                                             \
+            dsth += stride;                                                                  \
+            dstv += stride;                                                                  \
+            dstc += stride;                                                                  \
+            src  += stride;                                                                  \
+        }                                                                                    \
+    }
+
+#if !ARCH_X86_64
+HPEL_FILTER(8, mmx)
+#endif
+HPEL_FILTER(16, sse2)
+
+#define PIXFUNC(PFX, IDX, EXT)                                                   \
+    /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/  \
+    c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
+    c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
+
+#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
+void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3)\
+        ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
+    else\
+        OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3)\
+        ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
+    else\
+        OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3) {\
+        ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
+    } else {\
+        OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
+        OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+    }\
+}
+
+DIRAC_PIXOP(put, ff_put, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmxext)
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3)
+        ff_put_dirac_pixels16_c(dst, src, stride, h);
+    else
+    ff_put_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3)
+        ff_avg_dirac_pixels16_c(dst, src, stride, h);
+    else
+    ff_avg_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3) {
+        ff_put_dirac_pixels32_c(dst, src, stride, h);
+    } else {
+    ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
+    ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
+    }
+}
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3) {
+        ff_avg_dirac_pixels32_c(dst, src, stride, h);
+    } else {
+    ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
+    ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
+    }
+}
+
+void ff_diracdsp_init_mmx(DiracDSPContext* c)
+{
+    int mm_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(mm_flags)) {
+    c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
+#if !ARCH_X86_64
+    c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
+    c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
+    c->dirac_hpel_filter = dirac_hpel_filter_mmx;
+    c->add_rect_clamped = ff_add_rect_clamped_mmx;
+    c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx;
+#endif
+    PIXFUNC(put, 0, mmx);
+    PIXFUNC(avg, 0, mmx);
+    }
+
+    if (EXTERNAL_MMXEXT(mm_flags)) {
+        PIXFUNC(avg, 0, mmxext);
+    }
+
+    if (EXTERNAL_SSE2(mm_flags)) {
+        c->dirac_hpel_filter = dirac_hpel_filter_sse2;
+        c->add_rect_clamped = ff_add_rect_clamped_sse2;
+        c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2;
+
+        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
+        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
+
+        c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
+        c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
+        c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
+        c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
+    }
+}
diff --git a/libavcodec/x86/diracdsp_mmx.h b/libavcodec/x86/diracdsp_mmx.h
new file mode 100644
index 0000000000..89858544f3
--- /dev/null
+++ b/libavcodec/x86/diracdsp_mmx.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DIRACDSP_H
+#define AVCODEC_X86_DIRACDSP_H
+
+#include "libavcodec/diracdsp.h"
+
+void ff_diracdsp_init_mmx(DiracDSPContext* c);
+
+DECL_DIRAC_PIXOP(put, mmx);
+DECL_DIRAC_PIXOP(avg, mmx);
+DECL_DIRAC_PIXOP(avg, mmxext);
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+
+void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+#endif
diff --git a/libavcodec/x86/diracdsp_yasm.asm b/libavcodec/x86/diracdsp_yasm.asm
new file mode 100644
index 0000000000..40fe2c8212
--- /dev/null
+++ b/libavcodec/x86/diracdsp_yasm.asm
@@ -0,0 +1,265 @@
+;******************************************************************************
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_7: times 8 dw 7
+
+cextern pw_3
+cextern pw_16
+cextern pw_32
+cextern pb_80
+
+section .text
+
+%macro UNPACK_ADD 6
+    mov%5   %1, %3
+    mov%6   m5, %4
+    mova    m4, %1
+    mova    %2, m5
+    punpcklbw %1, m7
+    punpcklbw m5, m7
+    punpckhbw m4, m7
+    punpckhbw %2, m7
+    paddw   %1, m5
+    paddw   %2, m4
+%endmacro
+
+%macro HPEL_FILTER 1
+; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
+cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
+    mov     src0q, srcq
+    lea     stridex3q, [3*strideq]
+    sub     src0q, stridex3q
+    pxor    m7, m7
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
+    pmullw  m0, [pw_7]
+    pmullw  m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
+    paddw   m0, m2
+    paddw   m1, m3
+    pmullw  m0, [pw_3]
+    pmullw  m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
+    pmullw  m2, [pw_7]
+    pmullw  m3, [pw_7]
+    psubw   m0, m2
+    psubw   m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
+    psubw   m0, m2
+    psubw   m1, m3
+
+    paddw   m0, [pw_16]
+    paddw   m1, [pw_16]
+    psraw   m0, 5
+    psraw   m1, 5
+    packuswb m0, m1
+    mova    [dstq], m0
+    add     dstq, mmsize
+    add     srcq, mmsize
+    add     src0q, mmsize
+    sub     widthd, mmsize
+    jg      .loop
+    RET
+
+; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
+cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
+    dec     widthd
+    pxor    m7, m7
+    and     widthd, ~(mmsize-1)
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
+    pmullw  m0, [pw_7]
+    pmullw  m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
+    paddw   m0, m2
+    paddw   m1, m3
+    pmullw  m0, [pw_3]
+    pmullw  m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
+    pmullw  m2, [pw_7]
+    pmullw  m3, [pw_7]
+    psubw   m0, m2
+    psubw   m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
+    psubw   m0, m2
+    psubw   m1, m3
+
+    paddw   m0, [pw_16]
+    paddw   m1, [pw_16]
+    psraw   m0, 5
+    psraw   m1, 5
+    packuswb m0, m1
+    mova    [dstq + widthq], m0
+    sub     widthd, mmsize
+    jge     .loop
+    RET
+%endmacro
+
+%macro PUT_RECT 1
+; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
+    mova    m0, [pb_80]
+    add     wd, (mmsize-1)
+    and     wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+    movsxd   dst_strideq, dst_strided
+    movsxd   src_strideq, src_strided
+    mov   r7d, r5m
+    mov   r8d, wd
+    %define wspill r8d
+    %define hd r7d
+%else
+    mov    r4m, wd
+    %define wspill r4m
+    %define hd r5mp
+%endif
+
+.loopy:
+    lea     src2q, [srcq+src_strideq*2]
+    lea     dst2q, [dstq+dst_strideq]
+.loopx:
+    sub      wd, mmsize
+    mova     m1, [srcq +2*wq]
+    mova     m2, [src2q+2*wq]
+    packsswb m1, [srcq +2*wq+mmsize]
+    packsswb m2, [src2q+2*wq+mmsize]
+    paddb    m1, m0
+    paddb    m2, m0
+    mova    [dstq +wq], m1
+    mova    [dst2q+wq], m2
+    jg      .loopx
+
+    lea   srcq, [srcq+src_strideq*4]
+    lea   dstq, [dstq+dst_strideq*2]
+    sub     hd, 2
+    mov     wd, wspill
+    jg      .loopy
+    RET
+%endm
+
+%macro ADD_RECT 1
+; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
+cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
+    mova    m0, [pw_32]
+    add     wd, (mmsize-1)
+    and     wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+    movsxd   strideq, strided
+    movsxd   idwt_strideq, idwt_strided
+    mov   r8d, wd
+    %define wspill r8d
+%else
+    mov    r5m, wd
+    %define wspill r5m
+%endif
+
+.loop:
+    sub     wd, mmsize
+    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
+    paddw   m1, m0
+    psraw   m1, 6
+    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
+    paddw   m2, m0
+    psraw   m2, 6
+    paddw   m1, [idwtq+2*wq]
+    paddw   m2, [idwtq+2*wq+mmsize]
+    packuswb m1, m2
+    mova    [dstq +wq], m1
+    jg      .loop
+
+    lea   srcq, [srcq + 2*strideq]
+    add   dstq, strideq
+    lea  idwtq, [idwtq+ 2*idwt_strideq]
+    sub     hd, 1
+    mov     wd, wspill
+    jg      .loop
+    RET
+%endm
+
+%macro ADD_OBMC 2
+; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
+cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
+    pxor        m4, m4
+.loop:
+%assign i 0
+%rep %1 / mmsize
+    mova        m0, [srcq+i]
+    mova        m1, m0
+    punpcklbw   m0, m4
+    punpckhbw   m1, m4
+    mova        m2, [obmcq+i]
+    mova        m3, m2
+   punpcklbw   m2, m4
+    punpckhbw   m3, m4
+    pmullw      m0, m2
+    pmullw      m1, m3
+    movu        m2, [dstq+2*i]
+    movu        m3, [dstq+2*i+mmsize]
+    paddw       m0, m2
+    paddw       m1, m3
+    movu        [dstq+2*i], m0
+    movu        [dstq+2*i+mmsize], m1
+%assign i i+mmsize
+%endrep
+    lea         srcq, [srcq+strideq]
+    lea         dstq, [dstq+2*strideq]
+    add         obmcq, 32
+    sub         yblend, 1
+    jg          .loop
+    RET
+%endm
+
+INIT_MMX
+%if ARCH_X86_64 == 0
+PUT_RECT mmx
+ADD_RECT mmx
+
+HPEL_FILTER mmx
+ADD_OBMC 32, mmx
+ADD_OBMC 16, mmx
+%endif
+ADD_OBMC 8, mmx
+
+INIT_XMM
+PUT_RECT sse2
+ADD_RECT sse2
+
+HPEL_FILTER sse2
+ADD_OBMC 32, sse2
+ADD_OBMC 16, sse2
diff --git a/libavcodec/x86/dnxhdenc.asm b/libavcodec/x86/dnxhdenc.asm
index d39b07b9f4..9dd6d51ee6 100644
--- a/libavcodec/x86/dnxhdenc.asm
+++ b/libavcodec/x86/dnxhdenc.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
 ;* Copyright (c) 2014 Tiancheng "Timothy" Gu <timothygu99@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/dnxhdenc_init.c b/libavcodec/x86/dnxhdenc_init.c
index f1ff7bd986..fd6f15005a 100644
--- a/libavcodec/x86/dnxhdenc_init.c
+++ b/libavcodec/x86/dnxhdenc_init.c
@@ -4,20 +4,20 @@
  *
  * VC-3 encoder funded by the British Broadcasting Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/dwt_yasm.asm b/libavcodec/x86/dwt_yasm.asm
new file mode 100644
index 0000000000..658acc13fc
--- /dev/null
+++ b/libavcodec/x86/dwt_yasm.asm
@@ -0,0 +1,307 @@
+;******************************************************************************
+;* MMX optimized discrete wavelet trasnform
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_1991: times 4 dw 9,-1
+
+cextern pw_1
+cextern pw_2
+cextern pw_8
+cextern pw_16
+
+section .text
+
+; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2
+%macro COMPOSE_53iL0 4
+    paddw   %2, %3
+    paddw   %2, %4
+    psraw   %2, 2
+    psubw   %1, %2
+%endm
+
+; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
+; if %4 is supplied, %1 is loaded unaligned from there
+; m2: clobbered  m3: pw_8  m4: pw_1991
+%macro COMPOSE_DD97iH0 3-4
+    paddw   m0, %3
+    paddw   m1, %2
+    psubw   m0, m3
+    mova    m2, m1
+    punpcklwd m1, m0
+    punpckhwd m2, m0
+    pmaddwd m1, m4
+    pmaddwd m2, m4
+%if %0 > 3
+    movu    %1, %4
+%endif
+    psrad   m1, 4
+    psrad   m2, 4
+    packssdw m1, m2
+    paddw   m1, %1
+%endm
+
+%macro COMPOSE_VERTICAL 1
+; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
+    mova    m2, [pw_2]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m1, [b0q+2*widthq]
+    mova    m0, [b1q+2*widthq]
+    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
+    mova    [b1q+2*widthq], m0
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
+    mova    m1, [pw_1]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    paddw   m0, [b2q+2*widthq]
+    paddw   m0, m1
+    psraw   m0, 1
+    paddw   m0, [b1q+2*widthq]
+    mova    [b1q+2*widthq], m0
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                               IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_8]
+    mova    m4, [pw_1991]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
+    mova    [b2q+2*widthq], m1
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_16]
+    mova    m4, [pw_1991]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    mova    m5, [b2q+2*widthq]
+    paddw   m0, [b4q+2*widthq]
+    paddw   m1, [b3q+2*widthq]
+    psubw   m0, m3
+    mova    m2, m1
+    punpcklwd m1, m0
+    punpckhwd m2, m0
+    pmaddwd m1, m4
+    pmaddwd m2, m4
+    psrad   m1, 5
+    psrad   m2, 5
+    packssdw m1, m2
+    psubw   m5, m1
+    mova    [b2q+2*widthq], m5
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
+    mova    m3, [pw_1]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m1, [b1q+2*widthq]
+    mova    m0, [b0q+2*widthq]
+    mova    m2, m1
+    paddw   m1, m3
+    psraw   m1, 1
+    psubw   m0, m1
+    mova    [b0q+2*widthq], m0
+    paddw   m2, m0
+    mova    [b1q+2*widthq], m2
+    jg      .loop
+    REP_RET
+%endmacro
+
+; extend the left and right edges of the tmp array by %1 and %2 respectively
+%macro EDGE_EXTENSION 3
+    mov     %3, [tmpq]
+%assign %%i 1
+%rep %1
+    mov     [tmpq-2*%%i], %3
+    %assign %%i %%i+1
+%endrep
+    mov     %3, [tmpq+2*w2q-2]
+%assign %%i 0
+%rep %2
+    mov     [tmpq+2*w2q+2*%%i], %3
+    %assign %%i %%i+1
+%endrep
+%endmacro
+
+
+%macro HAAR_HORIZONTAL 2
+; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
+    mov    w2d, wd
+    xor     xq, xq
+    shr    w2d, 1
+    lea  b_w2q, [bq+wq]
+    mova    m3, [pw_1]
+.lowpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [bq    + 2*xq]
+    paddw   m1, m3
+    psraw   m1, 1
+    psubw   m0, m1
+    mova    [tmpq + 2*xq], m0
+    add     xq, mmsize/2
+    cmp     xq, w2q
+    jl      .lowpass_loop
+
+    xor     xq, xq
+    and    w2q, ~(mmsize/2 - 1)
+    cmp    w2q, mmsize/2
+    jl      .end
+
+.highpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [tmpq  + 2*xq]
+    paddw   m1, m0
+
+    ; shift and interleave
+%if %2 == 1
+    paddw   m0, m3
+    paddw   m1, m3
+    psraw   m0, 1
+    psraw   m1, 1
+%endif
+    mova    m2, m0
+    punpcklwd m0, m1
+    punpckhwd m2, m1
+    mova    [bq+4*xq], m0
+    mova    [bq+4*xq+mmsize], m2
+
+    add     xq, mmsize/2
+    cmp     xq, w2q
+    jl      .highpass_loop
+.end:
+    REP_RET
+%endmacro
+
+
+INIT_XMM
+; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
+    mov    w2d, wd
+    xor     xd, xd
+    shr    w2d, 1
+    lea  b_w2q, [bq+wq]
+    movu    m4, [bq+wq]
+    mova    m7, [pw_2]
+    pslldq  m4, 14
+.lowpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [bq    + 2*xq]
+    mova    m2, m1
+    palignr m1, m4, 14
+    mova    m4, m2
+    COMPOSE_53iL0 m0, m1, m2, m7
+    mova    [tmpq + 2*xq], m0
+    add     xd, mmsize/2
+    cmp     xd, w2d
+    jl      .lowpass_loop
+
+    EDGE_EXTENSION 1, 2, xw
+    ; leave the last up to 7 (sse) or 3 (mmx) values for C
+    xor     xd, xd
+    and    w2d, ~(mmsize/2 - 1)
+    cmp    w2d, mmsize/2
+    jl      .end
+
+    mova    m7, [tmpq-mmsize]
+    mova    m0, [tmpq]
+    mova    m5, [pw_1]
+    mova    m3, [pw_8]
+    mova    m4, [pw_1991]
+.highpass_loop:
+    mova    m6, m0
+    palignr m0, m7, 14
+    mova    m7, [tmpq + 2*xq + 16]
+    mova    m1, m7
+    mova    m2, m7
+    palignr m1, m6, 2
+    palignr m2, m6, 4
+    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
+    mova    m0, m7
+    mova    m7, m6
+
+    ; shift and interleave
+    paddw   m6, m5
+    paddw   m1, m5
+    psraw   m6, 1
+    psraw   m1, 1
+    mova    m2, m6
+    punpcklwd m6, m1
+    punpckhwd m2, m1
+    mova    [bq+4*xq], m6
+    mova    [bq+4*xq+mmsize], m2
+
+    add     xd, mmsize/2
+    cmp     xd, w2d
+    jl      .highpass_loop
+.end:
+    REP_RET
+
+
+%if ARCH_X86_64 == 0
+INIT_MMX
+COMPOSE_VERTICAL mmx
+HAAR_HORIZONTAL mmx, 0
+HAAR_HORIZONTAL mmx, 1
+%endif
+
+;;INIT_XMM
+INIT_XMM
+COMPOSE_VERTICAL sse2
+HAAR_HORIZONTAL sse2, 0
+HAAR_HORIZONTAL sse2, 1
diff --git a/libavcodec/x86/fdct.c b/libavcodec/x86/fdct.c
index 6528b57361..112566ded0 100644
--- a/libavcodec/x86/fdct.c
+++ b/libavcodec/x86/fdct.c
@@ -13,20 +13,20 @@
  * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
  * Skal's fdct at http://skal.planet-d.net/coding/dct.html
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,7 +70,7 @@ DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
 
 DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
 
-static struct
+static const struct
 {
  DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
 } fdct_r_row_sse2 =
@@ -153,7 +153,7 @@ DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = {  // forward_dct
   29692,  -12299,   26722,  -31521,
 };
 
-static struct
+static const struct
 {
  DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
 } tab_frw_01234567_sse2 =
diff --git a/libavcodec/x86/fdct.h b/libavcodec/x86/fdct.h
index c94a977e8f..648cdc5350 100644
--- a/libavcodec/x86/fdct.h
+++ b/libavcodec/x86/fdct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fdctdsp_init.c b/libavcodec/x86/fdctdsp_init.c
index 4e8e4eb60d..0cb5fd625b 100644
--- a/libavcodec/x86/fdctdsp_init.c
+++ b/libavcodec/x86/fdctdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm
index d3be72e576..22d9866665 100644
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@@ -6,20 +6,20 @@
 ;* This algorithm (though not any of the implementation details) is
 ;* based on libdjbfft by D. J. Bernstein.
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -51,13 +51,12 @@ struc FFTContext
     .imdcthalf:pointer 1
 endstruc
 
-SECTION_RODATA
+SECTION_RODATA 32
 
 %define M_SQRT1_2 0.70710678118654752440
 %define M_COS_PI_1_8 0.923879532511287
 %define M_COS_PI_3_8 0.38268343236509
 
-align 32
 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
 
@@ -69,9 +68,10 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
-ps_m1m1m1m1: times 4 dd 1<<31
 ps_m1p1: dd 1<<31, 0
 
+cextern ps_neg
+
 %assign i 16
 %rep 13
 cextern cos_ %+ i
@@ -305,6 +305,7 @@ IF%1 mova  Z(1), m5
 
 INIT_YMM avx
 
+%if HAVE_AVX_EXTERNAL
 align 16
 fft8_avx:
     mova      m0, Z(0)
@@ -394,6 +395,8 @@ fft32_interleave_avx:
     jg .deint_loop
     ret
 
+%endif
+
 INIT_XMM sse
 
 align 16
@@ -537,6 +540,7 @@ DEFINE_ARGS zc, w, n, o1, o3
 
 INIT_YMM avx
 
+%if HAVE_AVX_EXTERNAL
 %macro INTERL_AVX 5
     vunpckhps      %3, %2, %1
     vunpcklps      %2, %2, %1
@@ -558,6 +562,7 @@ cglobal fft_calc, 2,5,8
     FFT_DISPATCH _interleave %+ SUFFIX, r1
     REP_RET
 
+%endif
 
 INIT_XMM sse
 
@@ -681,7 +686,7 @@ cglobal imdct_calc, 3,5,3
     mov     r2, r3
     sub     r3, mmsize
     neg     r2
-    mova    m2, [ps_m1m1m1m1]
+    mova    m2, [ps_neg]
 .loop:
 %if mmsize == 8
     PSWAPD  m0, [r1 + r3]
@@ -776,9 +781,11 @@ align 8
 dispatch_tab %+ fullsuffix: pointer list_of_fft
 %endmacro ; DECL_FFT
 
+%if HAVE_AVX_EXTERNAL
 INIT_YMM avx
 DECL_FFT 6
 DECL_FFT 6, _interleave
+%endif
 INIT_XMM sse
 DECL_FFT 5
 DECL_FFT 5, _interleave
@@ -992,7 +999,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
     sub   r4, r3
 %endif
 %if notcpuflag(3dnowext) && mmsize == 8
-    movd  m7, [ps_m1m1m1m1]
+    movd  m7, [ps_neg]
 %endif
 .pre:
 %if ARCH_X86_64 == 0
@@ -1080,4 +1087,7 @@ DECL_IMDCT POSROTATESHUF_3DNOW
 %endif
 
 INIT_YMM avx
+
+%if HAVE_AVX_EXTERNAL
 DECL_IMDCT POSROTATESHUF_AVX
+%endif
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index a604956836..398091eb1f 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c
index 5c0273de9d..5085f11380 100644
--- a/libavcodec/x86/fft_init.c
+++ b/libavcodec/x86/fft_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
new file mode 100644
index 0000000000..e285158185
--- /dev/null
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -0,0 +1,101 @@
+;******************************************************************************
+;* FLAC DSP functions
+;*
+;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+%if ARCH_X86_64
+    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
+    DECLARE_REG_TMP 5, 6
+    %define length r2d
+
+    movsxd orderq, orderd
+%else
+    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
+    DECLARE_REG_TMP 2, 5
+    %define length r2mp
+%endif
+
+; Here we assume that the maximum order value is 32.  This means that we only
+; need to copy a maximum of 32 samples.  Therefore we let the preprocessor
+; unroll this loop and copy all 32.
+%assign iter 0
+%rep 32/(mmsize/4)
+    movu  m0,         [smpq+iter]
+    movu [resq+iter],  m0
+    %assign iter iter+mmsize
+%endrep
+
+lea  resq,   [resq+orderq*4]
+lea  smpq,   [smpq+orderq*4]
+lea  coefsq, [coefsq+orderq*4]
+sub  length,  orderd
+movd m3,      r5m
+neg  orderq
+
+%define posj t0q
+%define negj t1q
+
+.looplen:
+    pxor m0,   m0
+    pxor m4,   m4
+    pxor m6,   m6
+    mov  posj, orderq
+    xor  negj, negj
+
+    .looporder:
+        movd   m2, [coefsq+posj*4] ; c = coefs[j]
+        SPLATD m2
+        movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+        movu   m5, [smpq+negj*4-4+mmsize]
+        movu   m7, [smpq+negj*4-4+mmsize*2]
+        pmulld m1,  m2
+        pmulld m5,  m2
+        pmulld m7,  m2
+        paddd  m0,  m1             ; p += c * s
+        paddd  m4,  m5
+        paddd  m6,  m7
+
+        dec    negj
+        inc    posj
+    jnz .looporder
+
+    psrad  m0,     m3              ; p >>= shift
+    psrad  m4,     m3
+    psrad  m6,     m3
+    movu   m1,    [smpq]
+    movu   m5,    [smpq+mmsize]
+    movu   m7,    [smpq+mmsize*2]
+    psubd  m1,     m0              ; smp[i] - p
+    psubd  m5,     m4
+    psubd  m7,     m6
+    movu  [resq],  m1              ; res[i] = smp[i] - (p >> shift)
+    movu  [resq+mmsize], m5
+    movu  [resq+mmsize*2], m7
+
+    add resq,    3*mmsize
+    add smpq,    3*mmsize
+    sub length, (3*mmsize)/4
+jg .looplen
+RET
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
new file mode 100644
index 0000000000..7138611526
--- /dev/null
+++ b/libavcodec/x86/flacdsp.asm
@@ -0,0 +1,313 @@
+;******************************************************************************
+;* FLAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 Loren Merritt
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro PMACSDQL 5
+%if cpuflag(xop)
+    pmacsdql %1, %2, %3, %1
+%else
+    pmuldq   %2, %3
+    paddq    %1, %2
+%endif
+%endmacro
+
+%macro LPC_32 1
+INIT_XMM %1
+cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
+    sub    lend, pred_orderd
+    jle .ret
+    lea    decodedq, [decodedq+pred_orderq*4-8]
+    lea    coeffsq, [coeffsq+pred_orderq*4]
+    neg    pred_orderq
+    movd   m4, qlevelm
+ALIGN 16
+.loop_sample:
+    movd   m0, [decodedq+pred_orderq*4+8]
+    add    decodedq, 8
+    movd   m1, [coeffsq+pred_orderq*4]
+    pxor   m2, m2
+    pxor   m3, m3
+    lea    jq, [pred_orderq+1]
+    test   jq, jq
+    jz .end_order
+.loop_order:
+    PMACSDQL m2, m0, m1, m2, m0
+    movd   m0, [decodedq+jq*4]
+    PMACSDQL m3, m1, m0, m3, m1
+    movd   m1, [coeffsq+jq*4]
+    inc    jq
+    jl .loop_order
+.end_order:
+    PMACSDQL m2, m0, m1, m2, m0
+    psrlq  m2, m4
+    movd   m0, [decodedq]
+    paddd  m0, m2
+    movd   [decodedq], m0
+    sub  lend, 2
+    jl .ret
+    PMACSDQL m3, m1, m0, m3, m1
+    psrlq  m3, m4
+    movd   m1, [decodedq+4]
+    paddd  m1, m3
+    movd   [decodedq+4], m1
+    jg .loop_sample
+.ret:
+    REP_RET
+%endmacro
+
+%if HAVE_XOP_EXTERNAL
+LPC_32 xop
+%endif
+LPC_32 sse4
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
+;                                                   int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_16 3-4
+cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+    mov      lend, lenm
+%endif
+    movd       m3, r4m
+    shl      lend, 2
+    mov      in1q, [in0q + gprsize]
+    mov      in0q, [in0q]
+    mov      outq, [outq]
+    add      in1q, lenq
+    add      in0q, lenq
+    add      outq, lenq
+    neg      lenq
+
+align 16
+.loop:
+    mova       m0, [in0q + lenq]
+    mova       m1, [in1q + lenq]
+%ifidn %1, ms
+    psrad      m2, m1, 1
+    psubd      m0, m2
+%endif
+%ifnidn %1, indep2
+    p%4d       m2, m0, m1
+%endif
+    packssdw  m%2, m%2
+    packssdw  m%3, m%3
+    punpcklwd m%2, m%3
+    psllw     m%2, m3
+    mova [outq + lenq], m%2
+    add      lenq, 16
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 ls, 0, 2, sub
+FLAC_DECORRELATE_16 rs, 2, 1, add
+FLAC_DECORRELATE_16 ms, 2, 0, add
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
+;                                        int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_32 5
+cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+    mov      lend, lenm
+%endif
+    movd       m3, r4m
+    mov      in1q, [in0q + gprsize]
+    mov      in0q, [in0q]
+    mov      outq, [outq]
+    sub      in1q, in0q
+
+align 16
+.loop:
+    mova       m0, [in0q]
+    mova       m1, [in0q + in1q]
+%ifidn %1, ms
+    psrad      m2, m1, 1
+    psubd      m0, m2
+%endif
+    p%5d       m2, m0, m1
+    pslld     m%2, m3
+    pslld     m%3, m3
+
+    SBUTTERFLY dq, %2, %3, %4
+
+    mova  [outq         ], m%2
+    mova  [outq + mmsize], m%3
+
+    add      in0q, mmsize
+    add      outq, mmsize*2
+    sub      lend, mmsize/4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
+FLAC_DECORRELATE_32 rs, 2, 1, 0, add
+FLAC_DECORRELATE_32 ms, 2, 0, 1, add
+
+;-----------------------------------------------------------------------------------------
+;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
+;                                            int len, int shift);
+;-----------------------------------------------------------------------------------------
+;%1 = bps
+;%2 = channels
+;%3 = last xmm reg used
+;%4 = word/dword (shift instruction)
+%macro FLAC_DECORRELATE_INDEP 4
+%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
+cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
+%if ARCH_X86_32
+%if %2 == 6
+    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
+    %define  lend  dword r3m
+%else
+    mov      lend, lenm
+%endif
+%endif
+    movd      m%3, r4m
+
+%assign %%i 1
+%rep %2-1
+    mov      in %+ %%i %+ q, [in0q+%%i*gprsize]
+%assign %%i %%i+1
+%endrep
+
+    mov      in0q, [in0q]
+    mov      outq, [outq]
+
+%assign %%i 1
+%rep %2-1
+    sub      in %+ %%i %+ q, in0q
+%assign %%i %%i+1
+%endrep
+
+align 16
+.loop:
+    mova       m0, [in0q]
+
+%assign %%i 1
+%rep REPCOUNT-1
+    mova     m %+ %%i, [in0q + in %+ %%i %+ q]
+%assign %%i %%i+1
+%endrep
+
+%if %1 == 32
+
+%if %2 == 8
+    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
+%elif %2 == 6
+    SBUTTERFLY dq, 0, 1, 6
+    SBUTTERFLY dq, 2, 3, 6
+    SBUTTERFLY dq, 4, 5, 6
+
+    punpcklqdq m6, m0, m2
+    punpckhqdq m2, m4
+    shufps     m4, m0, 0xe4
+    punpcklqdq m0, m1, m3
+    punpckhqdq m3, m5
+    shufps     m5, m1, 0xe4
+    SWAP 0,6,1,4,5,3
+%elif %2 == 4
+    TRANSPOSE4x4D 0, 1, 2, 3, 4
+%else ; %2 == 2
+    SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%else ; %1 == 16
+
+%if %2 == 8
+    packssdw   m0, [in0q + in4q]
+    packssdw   m1, [in0q + in5q]
+    packssdw   m2, [in0q + in6q]
+    packssdw   m3, [in0q + in7q]
+    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+%elif %2 == 6
+    packssdw   m0, [in0q + in3q]
+    packssdw   m1, [in0q + in4q]
+    packssdw   m2, [in0q + in5q]
+    pshufd     m3, m0,     q1032
+    punpcklwd  m0, m1
+    punpckhwd  m1, m2
+    punpcklwd  m2, m3
+
+    shufps     m3, m0, m2, q2020
+    shufps     m0, m1,     q2031
+    shufps     m2, m1,     q3131
+    shufps     m1, m2, m3, q3120
+    shufps     m3, m0,     q0220
+    shufps     m0, m2,     q3113
+    SWAP 2, 0, 3
+%else ; %2 == 4
+    packssdw   m0, [in0q + in2q]
+    packssdw   m1, [in0q + in3q]
+    SBUTTERFLY wd, 0, 1, 2
+    SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%endif
+
+%assign %%i 0
+%rep REPCOUNT
+    psll%4   m %+ %%i, m%3
+%assign %%i %%i+1
+%endrep
+
+%assign %%i 0
+%rep REPCOUNT
+    mova [outq + %%i*mmsize], m %+ %%i
+%assign %%i %%i+1
+%endrep
+
+    add      in0q, mmsize
+    add      outq, mmsize*REPCOUNT
+    sub      lend, mmsize/4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
+FLAC_DECORRELATE_INDEP 32, 2, 3, d
+FLAC_DECORRELATE_INDEP 16, 4, 3, w
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 16, 6, 4, w
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
+
+INIT_XMM avx
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
new file mode 100644
index 0000000000..e28c5c9322
--- /dev/null
+++ b/libavcodec/x86/flacdsp_init.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/flacdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
+                         int qlevel, int len);
+void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
+                        int qlevel, int len);
+
+void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
+
+#define DECORRELATE_FUNCS(fmt, opt)                                                      \
+void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                             int len, int shift);                        \
+void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift);                       \
+void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift);                       \
+void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift)
+
+DECORRELATE_FUNCS(16, sse2);
+DECORRELATE_FUNCS(16,  avx);
+DECORRELATE_FUNCS(32, sse2);
+DECORRELATE_FUNCS(32,  avx);
+
+av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
+                                 int bps)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+#if CONFIG_FLAC_DECODER
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        if (fmt == AV_SAMPLE_FMT_S16) {
+            if (channels == 2)
+                c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
+            else if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
+            c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
+            c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
+            c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
+        } else if (fmt == AV_SAMPLE_FMT_S32) {
+            if (channels == 2)
+                c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
+            else if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
+            c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
+            c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
+            c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
+        }
+    }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->lpc32 = ff_flac_lpc_32_sse4;
+    }
+    if (EXTERNAL_AVX(cpu_flags)) {
+        if (fmt == AV_SAMPLE_FMT_S16) {
+            if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
+        } else if (fmt == AV_SAMPLE_FMT_S32) {
+            if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
+        }
+    }
+    if (EXTERNAL_XOP(cpu_flags)) {
+        c->lpc32 = ff_flac_lpc_32_xop;
+    }
+#endif
+
+#if CONFIG_FLAC_ENCODER
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        if (CONFIG_GPL)
+            c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
+    }
+#endif
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 727daa9774..20d8315fd9 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -2,20 +2,20 @@
 ;* x86 optimized Format Conversion Utils
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -68,3 +68,43 @@ INIT_XMM sse
 INT32_TO_FLOAT_FMUL_SCALAR 5
 INIT_XMM sse2
 INT32_TO_FLOAT_FMUL_SCALAR 3
+
+;------------------------------------------------------------------------------
+; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
+;                                    const float *mul, int len);
+;------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
+cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
+    shl     lend, 2
+    add     srcq, lenq
+    add     dstq, lenq
+    neg     lenq
+.loop:
+    movss     m0, [mulq]
+    SPLATD    m0
+%if cpuflag(sse2)
+    cvtdq2ps  m1, [srcq+lenq   ]
+    cvtdq2ps  m2, [srcq+lenq+16]
+%else
+    cvtpi2ps  m1, [srcq+lenq   ]
+    cvtpi2ps  m3, [srcq+lenq+ 8]
+    cvtpi2ps  m2, [srcq+lenq+16]
+    cvtpi2ps  m4, [srcq+lenq+24]
+    movlhps   m1, m3
+    movlhps   m2, m4
+%endif
+    mulps     m1, m0
+    mulps     m2, m0
+    mova  [dstq+lenq   ], m1
+    mova  [dstq+lenq+16], m2
+    add     mulq, 4
+    add     lenq, 32
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+INT32_TO_FLOAT_FMUL_ARRAY8
+INIT_XMM sse2
+INT32_TO_FLOAT_FMUL_ARRAY8
+
diff --git a/libavcodec/x86/fmtconvert_init.c b/libavcodec/x86/fmtconvert_init.c
index 1871b477bb..e4cbadcce7 100644
--- a/libavcodec/x86/fmtconvert_init.c
+++ b/libavcodec/x86/fmtconvert_init.c
@@ -5,20 +5,20 @@
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,10 @@
 
 void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
 void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
+void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
+                                        const float *mul, int len);
+void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
+                                        const float *mul, int len);
 
 #endif /* HAVE_YASM */
 
@@ -42,9 +46,11 @@ av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx
 
     if (EXTERNAL_SSE(cpu_flags)) {
         c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
+        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
+        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
     }
 #endif /* HAVE_YASM */
 }
diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm
index b581471296..0e3b444e2a 100644
--- a/libavcodec/x86/fpel.asm
+++ b/libavcodec/x86/fpel.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2003-2013 Michael Niedermayer
 ;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -25,85 +25,83 @@
 
 SECTION .text
 
-INIT_MMX mmxext
+%macro PAVGB_MMX 4
+    LOAD   %3, %1
+    por    %3, %2
+    pxor   %2, %1
+    pand   %2, %4
+    psrlq  %2, 1
+    psubb  %3, %2
+    SWAP   %2, %3
+%endmacro
+
 ; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
 ;                        ptrdiff_t line_size, int h)
-%macro PIXELS48 2
-%if %2 == 4
-%define OP movh
+%macro OP_PIXELS 2
+%if %2 == mmsize/2
+%define LOAD movh
+%define SAVE movh
+%define LEN  mmsize
 %else
-%define OP mova
+%define LOAD movu
+%define SAVE mova
+%define LEN  %2
 %endif
-cglobal %1_pixels%2, 4,5
+cglobal %1_pixels%2, 4,5,4
     movsxdifnidn r2, r2d
     lea          r4, [r2*3]
+%ifidn %1, avg
+%if notcpuflag(mmxext)
+    pcmpeqd      m6, m6
+    paddb        m6, m6
+%endif
+%endif
 .loop:
-    OP           m0, [r1]
-    OP           m1, [r1+r2]
-    OP           m2, [r1+r2*2]
-    OP           m3, [r1+r4]
-    lea          r1, [r1+r2*4]
+%assign %%i 0
+%rep LEN/mmsize
+    LOAD         m0, [r1 + %%i]
+    LOAD         m1, [r1+r2 + %%i]
+    LOAD         m2, [r1+r2*2 + %%i]
+    LOAD         m3, [r1+r4 + %%i]
 %ifidn %1, avg
-    pavgb        m0, [r0]
-    pavgb        m1, [r0+r2]
-    pavgb        m2, [r0+r2*2]
-    pavgb        m3, [r0+r4]
+%if notcpuflag(mmxext)
+    PAVGB_MMX    [r0 + %%i], m0, m4, m6
+    PAVGB_MMX    [r0+r2 + %%i], m1, m5, m6
+    PAVGB_MMX    [r0+r2*2 + %%i], m2, m4, m6
+    PAVGB_MMX    [r0+r4 + %%i], m3, m5, m6
+%else
+    pavgb        m0, [r0 + %%i]
+    pavgb        m1, [r0+r2 + %%i]
+    pavgb        m2, [r0+r2*2 + %%i]
+    pavgb        m3, [r0+r4 + %%i]
+%endif
 %endif
-    OP         [r0], m0
-    OP      [r0+r2], m1
-    OP    [r0+r2*2], m2
-    OP      [r0+r4], m3
+    SAVE       [r0 + %%i], m0
+    SAVE    [r0+r2 + %%i], m1
+    SAVE  [r0+r2*2 + %%i], m2
+    SAVE    [r0+r4 + %%i], m3
+%assign %%i %%i+mmsize
+%endrep
     sub         r3d, 4
+    lea          r1, [r1+r2*4]
     lea          r0, [r0+r2*4]
     jne       .loop
     RET
 %endmacro
 
-PIXELS48 put, 4
-PIXELS48 avg, 4
-PIXELS48 put, 8
-PIXELS48 avg, 8
+INIT_MMX mmx
+OP_PIXELS put, 4
+OP_PIXELS avg, 4
+OP_PIXELS put, 8
+OP_PIXELS avg, 8
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
 
+INIT_MMX mmxext
+OP_PIXELS avg, 4
+OP_PIXELS avg, 8
+OP_PIXELS avg, 16
 
 INIT_XMM sse2
-; void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-;                           ptrdiff_t line_size, int h)
-cglobal put_pixels16, 4,5,4
-    lea          r4, [r2*3]
-.loop:
-    movu         m0, [r1]
-    movu         m1, [r1+r2]
-    movu         m2, [r1+r2*2]
-    movu         m3, [r1+r4]
-    lea          r1, [r1+r2*4]
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova  [r0+r2*2], m2
-    mova    [r0+r4], m3
-    sub         r3d, 4
-    lea          r0, [r0+r2*4]
-    jnz       .loop
-    REP_RET
-
-; void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-;                           ptrdiff_t line_size, int h)
-cglobal avg_pixels16, 4,5,4
-    lea          r4, [r2*3]
-.loop:
-    movu         m0, [r1]
-    movu         m1, [r1+r2]
-    movu         m2, [r1+r2*2]
-    movu         m3, [r1+r4]
-    lea          r1, [r1+r2*4]
-    pavgb        m0, [r0]
-    pavgb        m1, [r0+r2]
-    pavgb        m2, [r0+r2*2]
-    pavgb        m3, [r0+r4]
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova  [r0+r2*2], m2
-    mova    [r0+r4], m3
-    sub         r3d, 4
-    lea          r0, [r0+r2*4]
-    jnz       .loop
-    REP_RET
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h
index 88d1415ade..4d93959a96 100644
--- a/libavcodec/x86/fpel.h
+++ b/libavcodec/x86/fpel.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,8 @@ void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
 void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
+void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h);
 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
 void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c
deleted file mode 100644
index eef05ecc74..0000000000
--- a/libavcodec/x86/fpel_mmx.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "fpel.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-// in case more speed is needed - unrolling would certainly help
-void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
-                        ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-             "movq  %0, %%mm0           \n\t"
-             "movq  %1, %%mm1           \n\t"
-             PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movq  %%mm2, %0           \n\t"
-             :"+m"(*block)
-             :"m"(*pixels)
-             :"memory");
-        pixels += line_size;
-        block += line_size;
-    }
-    while (--h);
-}
-
-void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                         ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-             "movq  %0, %%mm0           \n\t"
-             "movq  %1, %%mm1           \n\t"
-             PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movq  %%mm2, %0           \n\t"
-             "movq  8%0, %%mm0          \n\t"
-             "movq  8%1, %%mm1          \n\t"
-             PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movq  %%mm2, 8%0          \n\t"
-             :"+m"(*block)
-             :"m"(*pixels)
-             :"memory");
-        pixels += line_size;
-        block += line_size;
-    }
-    while (--h);
-}
-
-void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
-                        ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"REG_a"      \n\t"
-        ".p2align     3                 \n\t"
-        "1:                             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq     %%mm0, (%2)           \n\t"
-        "movq     %%mm1, (%2, %3)       \n\t"
-        "add  %%"REG_a", %1             \n\t"
-        "add  %%"REG_a", %2             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq     %%mm0, (%2)           \n\t"
-        "movq     %%mm1, (%2, %3)       \n\t"
-        "add  %%"REG_a", %1             \n\t"
-        "add  %%"REG_a", %2             \n\t"
-        "subl        $4, %0             \n\t"
-        "jnz         1b                 \n\t"
-        : "+g"(h), "+r"(pixels),  "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"REG_a, "memory"
-        );
-}
-
-void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                         ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"REG_a"      \n\t"
-        ".p2align     3                 \n\t"
-        "1:                             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq 8(%1    ), %%mm4          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq 8(%1, %3), %%mm5          \n\t"
-        "movq     %%mm0,  (%2)          \n\t"
-        "movq     %%mm4, 8(%2)          \n\t"
-        "movq     %%mm1,  (%2, %3)      \n\t"
-        "movq     %%mm5, 8(%2, %3)      \n\t"
-        "add  %%"REG_a", %1             \n\t"
-        "add  %%"REG_a", %2             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq 8(%1    ), %%mm4          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq 8(%1, %3), %%mm5          \n\t"
-        "movq     %%mm0,  (%2)          \n\t"
-        "movq     %%mm4, 8(%2)          \n\t"
-        "movq     %%mm1,  (%2, %3)      \n\t"
-        "movq     %%mm5, 8(%2, %3)      \n\t"
-        "add  %%"REG_a", %1             \n\t"
-        "add  %%"REG_a", %2             \n\t"
-        "subl        $4, %0             \n\t"
-        "jnz         1b                 \n\t"
-        : "+g"(h), "+r"(pixels),  "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"REG_a, "memory"
-        );
-}
-
-#endif /* HAVE_MMX_INLINE */
diff --git a/libavcodec/x86/g722dsp.asm b/libavcodec/x86/g722dsp.asm
new file mode 100644
index 0000000000..a529422262
--- /dev/null
+++ b/libavcodec/x86/g722dsp.asm
@@ -0,0 +1,54 @@
+;******************************************************************************
+;* SIMD optimized DSP functions for G722 coding
+;*
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_qmf_coeffs:  dw   3, -210,  -11, -805,  -11,  951,  53, 3876
+pw_qmf_coeffs2: dw  12, 3876, -156,  951,   32, -805, 362, -210
+pw_qmf_coeffs3: dw 362,    0 ,  32,    0, -156,    0,  12,    0
+pw_qmf_coeffs4: dw  53,    0,  -11,    0,  -11,    0,   3,    0
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal g722_apply_qmf, 2, 2, 5, prev, out
+    movu m0, [prevq+mmsize*0]
+    movu m1, [prevq+mmsize*1]
+    movu m2, [prevq+mmsize*2]
+    punpcklwd m3, m0, m1
+    punpckhwd m0, m1
+    punpcklwd m4, m2, m2
+    punpckhwd m2, m2
+    pmaddwd   m3, [pw_qmf_coeffs ]
+    pmaddwd   m0, [pw_qmf_coeffs2]
+    pmaddwd   m4, [pw_qmf_coeffs3]
+    pmaddwd   m2, [pw_qmf_coeffs4]
+    paddd     m0, m3
+    paddd     m2, m4
+    paddd     m0, m2
+    pshufd    m2, m0, q0032
+    paddd     m0, m2
+    pshufd    m0, m0, q0001
+    movq  [outq], m0
+    RET
diff --git a/libavcodec/x86/g722dsp_init.c b/libavcodec/x86/g722dsp_init.c
new file mode 100644
index 0000000000..614695193b
--- /dev/null
+++ b/libavcodec/x86/g722dsp_init.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/g722dsp.h"
+
+void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);
+
+av_cold void ff_g722dsp_init_x86(G722DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        dsp->apply_qmf = ff_g722_apply_qmf_sse2;
+}
diff --git a/libavcodec/x86/h263_loopfilter.asm b/libavcodec/x86/h263_loopfilter.asm
index cd726ba86d..77c8cf154d 100644
--- a/libavcodec/x86/h263_loopfilter.asm
+++ b/libavcodec/x86/h263_loopfilter.asm
@@ -1,20 +1,22 @@
 ;******************************************************************************
 ;* MMX-optimized H.263 loop filter
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h263dsp_init.c b/libavcodec/x86/h263dsp_init.c
index d4fab981bf..ab81063233 100644
--- a/libavcodec/x86/h263dsp_init.c
+++ b/libavcodec/x86/h263dsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index cc41f00461..107ae51cbc 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
 ;*               2005-2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
index 7b003515cc..c358482092 100644
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -252,8 +252,10 @@ cglobal %1_h264_chroma_mc2_10, 6,7
 %define CHROMAMC_AVG  NOTHING
 INIT_XMM sse2
 CHROMA_MC8 put
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 CHROMA_MC8 put
+%endif
 INIT_MMX mmxext
 CHROMA_MC4 put
 CHROMA_MC2 put
@@ -261,8 +263,10 @@ CHROMA_MC2 put
 %define CHROMAMC_AVG  AVG
 INIT_XMM sse2
 CHROMA_MC8 avg
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 CHROMA_MC8 avg
+%endif
 INIT_MMX mmxext
 CHROMA_MC4 avg
 CHROMA_MC2 avg
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 33fd5a9dd7..5151f3c9cd 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -7,20 +7,20 @@
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -384,8 +384,10 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
 
 INIT_XMM sse2
 DEBLOCK_LUMA
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA
+%endif
 
 %else
 
@@ -499,8 +501,10 @@ INIT_MMX mmxext
 DEBLOCK_LUMA v8, 8
 INIT_XMM sse2
 DEBLOCK_LUMA v, 16
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA v, 16
+%endif
 
 %endif ; ARCH
 
@@ -772,8 +776,10 @@ cglobal deblock_h_luma_intra_8, 2,4,8,0x80
 
 INIT_XMM sse2
 DEBLOCK_LUMA_INTRA v
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_INTRA v
+%endif
 %if ARCH_X86_64 == 0
 INIT_MMX mmxext
 DEBLOCK_LUMA_INTRA v8
@@ -836,7 +842,11 @@ cglobal deblock_h_chroma_8, 5,7
     TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
     movq  buf0, m0
     movq  buf1, m3
-    call ff_chroma_inter_body_mmxext
+    LOAD_MASK  r2d, r3d
+    movd       m6, [r4] ; tc0
+    punpcklbw  m6, m6
+    pand       m7, m6
+    DEBLOCK_P0_Q0
     movq  m0, buf0
     movq  m3, buf1
     TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index d049c62bf2..ebf8a3f109 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -7,34 +7,32 @@
 ;*          Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
 SECTION .text
 
 cextern pw_2
 cextern pw_3
 cextern pw_4
+cextern pw_1023
+%define pw_pixel_max pw_1023
 
 ; out: %4 = |%1-%2|-%3
 ; clobbers: %5
@@ -418,9 +416,11 @@ cglobal deblock_h_luma_10, 5,7,15
 
 INIT_XMM sse2
 DEBLOCK_LUMA_64
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_64
 %endif
+%endif
 
 %macro SWAPMOVA 2
 %ifid %1
@@ -715,8 +715,10 @@ cglobal deblock_h_luma_intra_10, 4,7,16
 
 INIT_XMM sse2
 DEBLOCK_LUMA_INTRA_64
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_INTRA_64
+%endif
 
 %endif
 
@@ -802,10 +804,12 @@ DEBLOCK_LUMA_INTRA
 INIT_XMM sse2
 DEBLOCK_LUMA
 DEBLOCK_LUMA_INTRA
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA
 DEBLOCK_LUMA_INTRA
 %endif
+%endif
 
 ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
 ; out: %1=p0', %2=q0'
@@ -918,5 +922,7 @@ DEBLOCK_CHROMA
 %endif
 INIT_XMM sse2
 DEBLOCK_CHROMA
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_CHROMA
+%endif
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index bb881c35df..49ad0e0fa0 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,9 +36,15 @@
 
 #if HAVE_INLINE_ASM
 
+#if ARCH_X86_64
+#define REG64 "r"
+#else
+#define REG64 "m"
+#endif
+
 //FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
 //as that would make optimization work hard)
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
 #define decode_significance decode_significance_x86
 static int decode_significance_x86(CABACContext *c, int max_coeff,
                                    uint8_t *significant_coeff_ctx_base,
@@ -55,6 +61,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
     __asm__ volatile(
         "lea   "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
 
@@ -130,6 +137,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
     __asm__ volatile(
         "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
 
@@ -138,7 +146,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
         "3:                                     \n\t"
 
         "mov %10, %0                            \n\t"
-        "movzbl (%0, %6), %k6                   \n\t"
+        "movzb (%0, %6), %6                     \n\t"
         "add %9, %6                             \n\t"
 
         BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
@@ -149,14 +157,14 @@ static int decode_significance_8x8_x86(CABACContext *c,
                              AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                              "%15")
 
-        "mov %1, %k6                            \n\t"
+        "mov %1, %6                             \n\t"
         "test $1, %4                            \n\t"
         " jz 4f                                 \n\t"
 
 #ifdef BROKEN_RELOCATIONS
-        "movzbl %c14(%15, %q6), %k6\n\t"
+        "movzb %c14(%15, %q6), %6\n\t"
 #else
-        "movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t"
+        "movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
 #endif
         "add %11, %6                            \n\t"
 
@@ -169,8 +177,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
                              "%15")
 
         "mov %2, %0                             \n\t"
-        "mov %1, %k6                            \n\t"
-        "movl %k6, (%0)                         \n\t"
+        "mov %1, %6                             \n\t"
+        "mov %k6, (%0)                          \n\t"
 
         "test $1, %4                            \n\t"
         " jnz 5f                                \n\t"
@@ -178,19 +186,19 @@ static int decode_significance_8x8_x86(CABACContext *c,
         "add"OPSIZE"  $4, %2                    \n\t"
 
         "4:                                     \n\t"
-        "addl $1, %k6                           \n\t"
-        "mov %k6, %1                            \n\t"
-        "cmpl $63, %k6                          \n\t"
+        "add $1, %6                             \n\t"
+        "mov %6, %1                             \n\t"
+        "cmp $63, %6                            \n\t"
         " jb 3b                                 \n\t"
         "mov %2, %0                             \n\t"
-        "movl %k6, (%0)                         \n\t"
+        "mov %k6, (%0)                          \n\t"
         "5:                                     \n\t"
         "addl %8, %k0                           \n\t"
         "shr $2, %k0                            \n\t"
-        : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
+        : "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
           "=&r"(bit), "+&r"(c->range), "=&r"(state)
         : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
-          "m"(sig_off), "m"(last_coeff_ctx_base),
+          REG64(sig_off), REG64(last_coeff_ctx_base),
           "i"(offsetof(CABACContext, bytestream)),
           "i"(offsetof(CABACContext, bytestream_end)),
           "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
@@ -198,7 +206,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
     );
     return coeff_count;
 }
-#endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
+#endif /* HAVE_7REGS && BROKEN_COMPILER */
 
 #endif /* HAVE_INLINE_ASM */
 #endif /* AVCODEC_X86_H264_I386_H */
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 313791a5d9..7fafe195f1 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -9,20 +9,20 @@
 ;*          Holger Lubitz <hal@duncan.ol.sub.de>
 ;*          Min Chen <chenm001.163.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index b7d51051d3..f1c2c81ef8 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -5,32 +5,31 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pd_32:        times 4 dd 32
-
 SECTION .text
 
+cextern pw_1023
+%define pw_pixel_max pw_1023
+cextern pd_32
+
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
 ;-----------------------------------------------------------------------------
@@ -83,8 +82,10 @@ cglobal h264_idct_add_10, 3,3
 
 INIT_XMM sse2
 IDCT_ADD_10
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD_10
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
@@ -117,9 +118,11 @@ add4x4_idct %+ SUFFIX:
 INIT_XMM sse2
 ALIGN 16
 ADD4x4IDCT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 ALIGN 16
 ADD4x4IDCT
+%endif
 
 %macro ADD16_OP 2
     cmp          byte [r4+%2], 0
@@ -155,8 +158,10 @@ cglobal h264_idct_add16_10, 5,6
 
 INIT_XMM sse2
 IDCT_ADD16_10
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD16_10
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
@@ -220,8 +225,10 @@ cglobal h264_idct8_dc_add_10,3,4,7
 
 INIT_XMM sse2
 IDCT8_DC_ADD
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_DC_ADD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
@@ -293,8 +300,10 @@ cglobal h264_idct_add16intra_10,5,7,8
 
 INIT_XMM sse2
 IDCT_ADD16INTRA_10
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD16INTRA_10
+%endif
 
 %assign last_block 36
 ;-----------------------------------------------------------------------------
@@ -330,8 +339,10 @@ cglobal h264_idct_add8_10,5,8,7
 
 INIT_XMM sse2
 IDCT_ADD8
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD8
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
@@ -537,8 +548,10 @@ h264_idct8_add1_10 %+ SUFFIX:
 
 INIT_XMM sse2
 IDCT8_ADD
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_ADD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
@@ -577,5 +590,7 @@ cglobal h264_idct8_add4_10, 0,7,16
 
 INIT_XMM sse2
 IDCT8_ADD4
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_ADD4
+%endif
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index df657a443c..c88d91b49e 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -5,20 +5,20 @@
 ;* Copyright (c) 2010 Loren Merritt
 ;* Copyright (c) 2010 Ronald S. Bultje
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -2497,10 +2497,7 @@ cglobal pred4x4_tm_vp8_8, 3,3
     pshufb     mm3, mm6
     pshufb     mm4, mm6
     pshufb     mm5, mm6
-    psubw      mm2, mm7
-    psubw      mm3, mm7
-    psubw      mm4, mm7
-    psubw      mm5, mm7
+    psubw      mm0, mm7
     paddw      mm2, mm0
     paddw      mm3, mm0
     paddw      mm4, mm0
diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
index 55790a956e..9e40cfe24b 100644
--- a/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/libavcodec/x86/h264_intrapred_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,18 +26,19 @@
 
 SECTION_RODATA
 
+cextern pw_1023
+%define pw_pixel_max pw_1023
+cextern pw_512
 cextern pw_16
 cextern pw_8
 cextern pw_4
 cextern pw_2
 cextern pw_1
+cextern pd_16
 
 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
 pw_m3:        times 8 dw -3
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pw_512:       times 8 dw 512
 pd_17:        times 4 dd 17
-pd_16:        times 4 dd 16
 
 SECTION .text
 
@@ -82,8 +83,10 @@ INIT_XMM sse2
 PRED4x4_DR
 INIT_XMM ssse3
 PRED4x4_DR
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_DR
+%endif
 
 ;------------------------------------------------------------------------------
 ; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
@@ -119,8 +122,10 @@ INIT_XMM sse2
 PRED4x4_VR
 INIT_XMM ssse3
 PRED4x4_VR
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_VR
+%endif
 
 ;-------------------------------------------------------------------------------
 ; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
@@ -159,28 +164,14 @@ INIT_XMM sse2
 PRED4x4_HD
 INIT_XMM ssse3
 PRED4x4_HD
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_HD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
 ;-----------------------------------------------------------------------------
-%macro HADDD 2 ; sum junk
-%if mmsize == 16
-    movhlps %2, %1
-    paddd   %1, %2
-    pshuflw %2, %1, 0xE
-    paddd   %1, %2
-%else
-    pshufw  %2, %1, 0xE
-    paddd   %1, %2
-%endif
-%endmacro
-
-%macro HADDW 2
-    pmaddwd %1, [pw_1]
-    HADDD   %1, %2
-%endmacro
 
 INIT_MMX mmxext
 cglobal pred4x4_dc_10, 3, 3
@@ -228,8 +219,10 @@ cglobal pred4x4_down_left_10, 3, 3
 
 INIT_XMM sse2
 PRED4x4_DL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_DL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
@@ -255,8 +248,10 @@ cglobal pred4x4_vertical_left_10, 3, 3
 
 INIT_XMM sse2
 PRED4x4_VL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_VL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
@@ -565,8 +560,10 @@ cglobal pred8x8l_top_dc_10, 4, 4, 6
 
 INIT_XMM sse2
 PRED8x8L_TOP_DC
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_TOP_DC
+%endif
 
 ;-------------------------------------------------------------------------------
 ; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
@@ -622,8 +619,10 @@ cglobal pred8x8l_dc_10, 4, 6, 6
 
 INIT_XMM sse2
 PRED8x8L_DC
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_DC
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
@@ -656,8 +655,10 @@ cglobal pred8x8l_vertical_10, 4, 4, 6
 
 INIT_XMM sse2
 PRED8x8L_VERTICAL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_VERTICAL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright,
@@ -711,8 +712,10 @@ INIT_XMM sse2
 PRED8x8L_HORIZONTAL
 INIT_XMM ssse3
 PRED8x8L_HORIZONTAL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_HORIZONTAL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
@@ -778,8 +781,10 @@ INIT_XMM sse2
 PRED8x8L_DOWN_LEFT
 INIT_XMM ssse3
 PRED8x8L_DOWN_LEFT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_DOWN_LEFT
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
@@ -851,8 +856,10 @@ INIT_XMM sse2
 PRED8x8L_DOWN_RIGHT
 INIT_XMM ssse3
 PRED8x8L_DOWN_RIGHT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_DOWN_RIGHT
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
@@ -920,8 +927,10 @@ INIT_XMM sse2
 PRED8x8L_VERTICAL_RIGHT
 INIT_XMM ssse3
 PRED8x8L_VERTICAL_RIGHT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_VERTICAL_RIGHT
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
@@ -980,8 +989,10 @@ INIT_XMM sse2
 PRED8x8L_HORIZONTAL_UP
 INIT_XMM ssse3
 PRED8x8L_HORIZONTAL_UP
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_HORIZONTAL_UP
+%endif
 
 
 ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index 0e572b1226..528b92e497 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 76d2ab05d5..d9cb5f264c 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  * Copyright (c) 2011 Daniel Kang
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,8 +29,8 @@
 #include "fpel.h"
 
 #if HAVE_YASM
-void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h);
+void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
 void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
 void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
@@ -49,9 +49,9 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t
 #define ff_avg_pixels8_l2_sse2  ff_avg_pixels8_l2_mmxext
 #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
 #define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
-
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext  ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext   ff_put_pixels8_mmx
+#define ff_put_pixels4_mmxext   ff_put_pixels4_mmx
 
 #define DEF_QPEL(OPNAME)\
 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
@@ -282,7 +282,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uin
 #define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
 }\
@@ -294,7 +294,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uin
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
 }\
@@ -302,74 +302,74 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
 #define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
+    LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
 }\
diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
index f92c4aab2b..872268300a 100644
--- a/libavcodec/x86/h264_qpel_10bit.asm
+++ b/libavcodec/x86/h264_qpel_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,12 +26,13 @@
 
 SECTION_RODATA 32
 
+cextern pd_65535
+cextern pw_1023
+%define pw_pixel_max pw_1023
 cextern pw_16
 cextern pw_1
 cextern pb_0
 
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
 pad10: times 8 dw 10*1023
 pad20: times 8 dw 20*1023
 pad30: times 8 dw 30*1023
@@ -42,7 +43,6 @@ unpad: times 8 dw 16*1022/32 ; needs to be mod 16
 tap1: times 4 dw  1, -5
 tap2: times 4 dw 20, 20
 tap3: times 4 dw -5,  1
-pd_0f: times 4 dd 0xffff
 
 SECTION .text
 
@@ -386,7 +386,7 @@ MC_CACHE MC10
 ; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
 ;-----------------------------------------------------------------------------
 %macro V_FILT 10
-v_filt%9_%10_10
+v_filt%9_%10_10:
     add    r4, r2
 .no_addr4:
     FILT_V m0, m1, m2, m3, m4, m5, m6, m7
@@ -708,7 +708,7 @@ h%1_loop_op:
     psrad      m1, 10
     psrad      m2, 10
     pslld      m2, 16
-    pand       m1, [pd_0f]
+    pand       m1, [pd_65535]
     por        m1, m2
 %if num_mmregs <= 8
     pxor       m0, m0
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index bc6c72541b..2d287ba443 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -6,20 +6,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index 9ad26de832..6c57d57bc0 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -139,12 +139,12 @@ WEIGHT_FUNC_HALF_MM 8, 8
     je .nonnormal
     cmp        r5, 128
     jne .normal
-.nonnormal
+.nonnormal:
     sar        r5, 1
     sar        r6, 1
     sar  off_regd, 1
     sub        r4, 1
-.normal
+.normal:
 %if cpuflag(ssse3)
     movd       m4, r5d
     movd       m0, r6d
diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
index 961ec8ca45..f924e55854 100644
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,11 +26,12 @@
 
 SECTION_RODATA 32
 
-pw_pixel_max: times 8 dw ((1 << 10)-1)
 sq_1: dq 1
       dq 0
 
 cextern pw_1
+cextern pw_1023
+%define pw_pixel_max pw_1023
 
 SECTION .text
 
diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c
index 8ec8a79aba..e08af2759e 100644
--- a/libavcodec/x86/h264chroma_init.c
+++ b/libavcodec/x86/h264chroma_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 134d594ca9..35db20014a 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -210,6 +210,7 @@ H264_BIWEIGHT_10_SSE(4,  10)
 av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                                  const int chroma_format_idc)
 {
+#if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
@@ -365,4 +366,5 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 #endif /* HAVE_ALIGNED_STACK */
         }
     }
+#endif
 }
diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index 1e895f0aa5..48a597530b 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,13 +26,15 @@
 
 SECTION_RODATA
 
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pw_m1:        times 8 dw -1
-pw_m2:        times 8 dw -2
-pd_1 :        times 4 dd  1
+cextern pw_1023
+%define pw_pixel_max_10 pw_1023
+pw_pixel_max_12: times 8 dw ((1 << 12)-1)
+pw_m2:           times 8 dw -2
+pd_1 :           times 4 dd  1
 
 cextern pw_4
 cextern pw_8
+cextern pw_m1
 
 SECTION .text
 INIT_XMM sse2
@@ -57,10 +59,10 @@ INIT_XMM sse2
     movd             m4, %5
     movd             m6, %6
     movd             m5, %7
-    movd             m7, %8
+    movd             m3, %8
 
     punpcklbw        m4, m6
-    punpcklbw        m5, m7
+    punpcklbw        m5, m3
     punpcklwd        m4, m5
 
     punpckhdq        m2, m0, m4
@@ -76,16 +78,10 @@ INIT_XMM sse2
 ; in: 4 rows of 8 words in m0..m3
 ; out: 8 rows of 4 bytes in %1..%8
 %macro TRANSPOSE8x4B_STORE 8
-    packuswb         m0, m0
-    packuswb         m1, m1
-    packuswb         m2, m2
-    packuswb         m3, m3
-
-    punpcklbw        m0, m1
-    punpcklbw        m2, m3
-
-    punpckhwd        m6, m0, m2
-    punpcklwd        m0, m2
+    packuswb         m0, m2
+    packuswb         m1, m3
+    SBUTTERFLY bw, 0, 1, 2
+    SBUTTERFLY wd, 0, 1, 2
 
     movd             %1, m0
     pshufd           m0, m0, 0x39
@@ -95,13 +91,13 @@ INIT_XMM sse2
     pshufd           m0, m0, 0x39
     movd             %4, m0
 
-    movd             %5, m6
-    pshufd           m6, m6, 0x39
-    movd             %6, m6
-    pshufd           m6, m6, 0x39
-    movd             %7, m6
-    pshufd           m6, m6, 0x39
-    movd             %8, m6
+    movd             %5, m1
+    pshufd           m1, m1, 0x39
+    movd             %6, m1
+    pshufd           m1, m1, 0x39
+    movd             %7, m1
+    pshufd           m1, m1, 0x39
+    movd             %8, m1
 %endmacro
 
 ; in: 8 rows of 4 words in %4..%11
@@ -120,10 +116,10 @@ INIT_XMM sse2
     movq             m4, %5
     movq             m6, %6
     movq             m5, %7
-    movq             m7, %8
+    movq             m3, %8
 
     punpcklwd        m4, m6
-    punpcklwd        m5, m7
+    punpcklwd        m5, m3
     punpckhdq        m6, m4, m5
     punpckldq        m4, m5
 
@@ -136,32 +132,23 @@ INIT_XMM sse2
 
 ; in: 4 rows of 8 words in m0..m3
 ; out: 8 rows of 4 words in %1..%8
-%macro TRANSPOSE8x4W_STORE 8
-    pxor             m5, m5; zeros reg
-    CLIPW            m0, m5, [pw_pixel_max]
-    CLIPW            m1, m5, [pw_pixel_max]
-    CLIPW            m2, m5, [pw_pixel_max]
-    CLIPW            m3, m5, [pw_pixel_max]
+%macro TRANSPOSE8x4W_STORE 9
+    TRANSPOSE4x4W     0, 1, 2, 3, 4
 
-    punpckhwd        m4, m0, m1
-    punpcklwd        m0, m1
-    punpckhwd        m5, m2, m3
-    punpcklwd        m2, m3
-    punpckhdq        m6, m0, m2
-    punpckldq        m0, m2
+    pxor             m5, m5; zeros reg
+    CLIPW            m0, m5, %9
+    CLIPW            m1, m5, %9
+    CLIPW            m2, m5, %9
+    CLIPW            m3, m5, %9
 
     movq             %1, m0
     movhps           %2, m0
-    movq             %3, m6
-    movhps           %4, m6
-
-    punpckhdq        m6, m4, m5
-    punpckldq        m4, m5
-
-    movq             %5, m4
-    movhps           %6, m4
-    movq             %7, m6
-    movhps           %8, m6
+    movq             %3, m1
+    movhps           %4, m1
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
 %endmacro
 
 ; in: 8 rows of 8 bytes in %1..%8
@@ -212,40 +199,20 @@ INIT_XMM sse2
 ; in: 8 rows of 8 words in m0..m8
 ; out: 8 rows of 8 bytes in %1..%8
 %macro TRANSPOSE8x8B_STORE 8
-    packuswb         m0, m0
-    packuswb         m1, m1
-    packuswb         m2, m2
-    packuswb         m3, m3
-    packuswb         m4, m4
-    packuswb         m5, m5
-    packuswb         m6, m6
-    packuswb         m7, m7
-
-    punpcklbw        m0, m1
-    punpcklbw        m2, m3
-
-    punpckhwd        m8, m0, m2
-    punpcklwd        m0, m2
-
-    punpcklbw        m4, m5
-    punpcklbw        m6, m7
-
-    punpckhwd        m9, m4, m6
-    punpcklwd        m4, m6
+    packuswb         m0, m4
+    packuswb         m1, m5
+    packuswb         m2, m6
+    packuswb         m3, m7
+    TRANSPOSE2x4x4B   0, 1, 2, 3, 4
 
-    punpckhdq       m10, m0, m4; 2, 3
-    punpckldq        m0, m4;   0, 1
-
-    punpckldq       m11, m8, m9;  4, 5
-    punpckhdq        m8, m9;   6, 7
     movq             %1, m0
     movhps           %2, m0
-    movq             %3, m10
-    movhps           %4, m10
-    movq             %5, m11
-    movhps           %6, m11
-    movq             %7, m8
-    movhps           %8, m8
+    movq             %3, m1
+    movhps           %4, m1
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
 %endmacro
 
 ; in: 8 rows of 8 words in %1..%8
@@ -264,18 +231,18 @@ INIT_XMM sse2
 
 ; in: 8 rows of 8 words in m0..m8
 ; out: 8 rows of 8 words in %1..%8
-%macro TRANSPOSE8x8W_STORE 8
+%macro TRANSPOSE8x8W_STORE 9
     TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
 
     pxor             m8, m8
-    CLIPW            m0, m8, [pw_pixel_max]
-    CLIPW            m1, m8, [pw_pixel_max]
-    CLIPW            m2, m8, [pw_pixel_max]
-    CLIPW            m3, m8, [pw_pixel_max]
-    CLIPW            m4, m8, [pw_pixel_max]
-    CLIPW            m5, m8, [pw_pixel_max]
-    CLIPW            m6, m8, [pw_pixel_max]
-    CLIPW            m7, m8, [pw_pixel_max]
+    CLIPW            m0, m8, %9
+    CLIPW            m1, m8, %9
+    CLIPW            m2, m8, %9
+    CLIPW            m3, m8, %9
+    CLIPW            m4, m8, %9
+    CLIPW            m5, m8, %9
+    CLIPW            m6, m8, %9
+    CLIPW            m7, m8, %9
 
     movdqu           %1, m0
     movdqu           %2, m1
@@ -318,13 +285,14 @@ ALIGN 16
     paddw            m5, m4;
 
     ;tc calculations
-    movd             m6, [r2]; tc0
-    add              r2, 4;
+    movq             m6, [tcq]; tc0
     punpcklwd        m6, m6
-    movd             m7, [r2]; tc1
-    punpcklwd        m7, m7
-    shufps           m6, m7, 0; tc0, tc1
+    pshufd           m6, m6, 0xA0; tc0, tc1
+%if cpuflag(ssse3)
+    psignw           m4, m6, [pw_m1]; -tc0, -tc1
+%else
     pmullw           m4, m6, [pw_m1]; -tc0, -tc1
+%endif
     ;end tc calculations
 
     paddw            m5, [pw_4]; +4
@@ -362,11 +330,11 @@ ALIGN 16
 
     paddw            m9, m10, m11;   0d0, 0d3  ,  1d0, 1d3
 
-    pshufhw         m14, m9,  q0033 ;0b00001111;  0d3 0d3 0d0 0d0 in high
-    pshuflw         m14, m14, q0033 ;0b00001111;  1d3 1d3 1d0 1d0 in low
+    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
+    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low
 
-    pshufhw          m9, m9, q3300 ;0b11110000; 0d0 0d0 0d3 0d3
-    pshuflw          m9, m9, q3300 ;0b11110000; 1d0 1d0 1d3 1d3
+    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
+    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3
 
     paddw           m14, m9; 0d0+0d3, 1d0+1d3
 
@@ -380,7 +348,7 @@ ALIGN 16
     psraw           m15, m13, 2;   beta >> 2
     psllw            m8, m9, 1;
     pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
-    movmskps        r14, m15;
+    movmskps        r6, m15;
     ;end weak / strong decision
 
     ; weak filter nd_p/q calculation
@@ -388,19 +356,15 @@ ALIGN 16
     psrld            m8, 16
     paddw            m8, m10
     movd            r7d, m8
-    and              r7, 0xffff; 1dp0 + 1dp3
     pshufd           m8, m8, 0x4E
     movd            r8d, m8
-    and              r8, 0xffff; 0dp0 + 0dp3
 
     pshufd           m8, m11, 0x31
     psrld            m8, 16
     paddw            m8, m11
     movd            r9d, m8
-    and              r9, 0xffff; 1dq0 + 1dq3
     pshufd           m8, m8, 0x4E
     movd           r10d, m8
-    and             r10, 0xffff; 0dq0 + 0dq3
     ; end calc for weak filter
 
     ; filtering mask
@@ -422,14 +386,13 @@ ALIGN 16
     shl             r11, %1 - 8
 %endif
     movd             m8, r11d; tc0
-    add             tcq, 4;
-    mov             r3d, [tcq];
+    mov             r3d, [tcq+4];
 %if %1 > 8
     shl              r3, %1 - 8
 %endif
-    movd             m9, r3d; tc1
     add            r11d, r3d; tc0 + tc1
     jz             .bypassluma
+    movd             m9, r3d; tc1
     punpcklwd        m8, m8
     punpcklwd        m9, m9
     shufps           m8, m9, 0; tc0, tc1
@@ -453,7 +416,7 @@ ALIGN 16
     psraw           m13, 3; beta >> 3
     pcmpgtw         m13, m12;
     movmskps        r11, m13;
-    and             r14, r11; strong mask , beta_2 and beta_3 comparisons
+    and             r6, r11; strong mask , beta_2 and beta_3 comparisons
     ;----beta_3 comparison end-----
     ;----tc25 comparison---
     psubw           m12, m3, m4;      p0 - q0
@@ -464,23 +427,23 @@ ALIGN 16
 
     pcmpgtw          m8, m12; tc25 comparisons
     movmskps        r11, m8;
-    and             r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons
+    and             r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
     ;----tc25 comparison end---
-    mov             r11, r14;
+    mov             r11, r6;
     shr             r11, 1;
-    and             r14, r11; strong mask, bits 2 and 0
+    and             r6, r11; strong mask, bits 2 and 0
 
     pmullw          m14, m9, [pw_m2]; -tc * 2
     paddw            m9, m9
 
-    and             r14, 5; 0b101
-    mov             r11, r14; strong mask
-    shr             r14, 2;
-    movd            m12, r14d; store to xmm for mask generation
-    shl             r14, 1
+    and             r6, 5; 0b101
+    mov             r11, r6; strong mask
+    shr             r6, 2;
+    movd            m12, r6d; store to xmm for mask generation
+    shl             r6, 1
     and             r11, 1
     movd            m10, r11d; store to xmm for mask generation
-    or              r14, r11; final strong mask, bits 1 and 0
+    or              r6, r11; final strong mask, bits 1 and 0
     jz      .weakfilter
 
     shufps          m10, m12, 0
@@ -565,16 +528,16 @@ ALIGN 16
     MASKED_COPY      m3, m12
 
 .weakfilter:
-    not             r14; strong mask -> weak mask
-    and             r14, r13; final weak filtering mask, bits 0 and 1
+    not             r6; strong mask -> weak mask
+    and             r6, r13; final weak filtering mask, bits 0 and 1
     jz             .store
 
     ; weak filtering mask
-    mov             r11, r14
+    mov             r11, r6
     shr             r11, 1
     movd            m12, r11d
-    and             r14, 1
-    movd            m11, r14d
+    and             r6, 1
+    movd            m11, r6d
     shufps          m11, m12, 0
     pcmpeqd         m11, [pd_1]; filtering mask
 
@@ -609,7 +572,11 @@ ALIGN 16
     pminsw          m12, m9;  av_clip(delta0, -tc, tc)
 
     psraw            m9, 1;   tc -> tc / 2
+%if cpuflag(ssse3)
+    psignw          m14, m9, [pw_m1]; -tc / 2
+%else
     pmullw          m14, m9, [pw_m1]; -tc / 2
+%endif
 
     pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
     psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
@@ -658,117 +625,161 @@ ALIGN 16
     MASKED_COPY      m4, m8
 %endmacro
 
-INIT_XMM sse2
 ;-----------------------------------------------------------------------------
-; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
+; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
 ;                                   uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_chroma_8, 3, 6, 8
-    sub              r0, 2
-    lea              r5, [3 * r1]
-    mov              r4, r0
-    add              r0, r5
-    TRANSPOSE4x8B_LOAD  PASS8ROWS(r4, r0, r1, r5)
+%macro LOOP_FILTER_CHROMA 0
+cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 2
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
     CHROMA_DEBLOCK_BODY 8
-    TRANSPOSE8x4B_STORE PASS8ROWS(r4, r0, r1, r5)
+    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
     RET
 
-cglobal hevc_v_loop_filter_chroma_10, 3, 6, 8
-    sub              r0, 4
-    lea              r5, [3 * r1]
-    mov              r4, r0
-    add              r0, r5
-    TRANSPOSE4x8W_LOAD  PASS8ROWS(r4, r0, r1, r5)
+cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 4
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
     CHROMA_DEBLOCK_BODY 10
-    TRANSPOSE8x4W_STORE PASS8ROWS(r4, r0, r1, r5)
+    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
+    RET
+
+cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 4
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
+    CHROMA_DEBLOCK_BODY 12
+    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
     RET
 
 ;-----------------------------------------------------------------------------
-; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
+; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
 ;                                   uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_chroma_8, 3, 6, 8
-    mov              r5, r0; pix
-    sub              r5, r1
-    sub              r5, r1
-    movh             m0, [r5];      p1
-    movh             m1, [r5 + r1]; p0
-    movh             m2, [r0];      q0
-    movh             m3, [r0 + r1]; q1
+cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
+    mov           pix0q, pixq
+    sub           pix0q, strideq
+    sub           pix0q, strideq
+    movq             m0, [pix0q];    p1
+    movq             m1, [pix0q+strideq]; p0
+    movq             m2, [pixq];    q0
+    movq             m3, [pixq+strideq]; q1
     pxor             m5, m5; zeros reg
     punpcklbw        m0, m5
     punpcklbw        m1, m5
     punpcklbw        m2, m5
     punpcklbw        m3, m5
     CHROMA_DEBLOCK_BODY  8
-    packuswb          m1, m2
-    movh       [r5 + r1], m1
-    movhps          [r0], m1
+    packuswb         m1, m2
+    movh[pix0q+strideq], m1
+    movhps       [pixq], m1
     RET
 
-cglobal hevc_h_loop_filter_chroma_10, 3, 6, 8
-    mov             r5, r0; pix
-    sub             r5, r1
-    sub             r5, r1
-    movdqu          m0, [r5];      p1
-    movdqu          m1, [r5+r1];   p0
-    movdqu          m2, [r0];      q0
-    movdqu          m3, [r0 + r1]; q1
+cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
+    mov          pix0q, pixq
+    sub          pix0q, strideq
+    sub          pix0q, strideq
+    movu            m0, [pix0q];    p1
+    movu            m1, [pix0q+strideq]; p0
+    movu            m2, [pixq];    q0
+    movu            m3, [pixq+strideq]; q1
     CHROMA_DEBLOCK_BODY 10
     pxor            m5, m5; zeros reg
-    CLIPW           m1, m5, [pw_pixel_max]
-    CLIPW           m2, m5, [pw_pixel_max]
-    movdqu   [r5 + r1], m1
-    movdqu        [r0], m2
+    CLIPW           m1, m5, [pw_pixel_max_10]
+    CLIPW           m2, m5, [pw_pixel_max_10]
+    movu [pix0q+strideq], m1
+    movu        [pixq], m2
+    RET
+
+cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
+    mov          pix0q, pixq
+    sub          pix0q, strideq
+    sub          pix0q, strideq
+    movu            m0, [pix0q];    p1
+    movu            m1, [pix0q+strideq]; p0
+    movu            m2, [pixq];    q0
+    movu            m3, [pixq+strideq]; q1
+    CHROMA_DEBLOCK_BODY 12
+    pxor            m5, m5; zeros reg
+    CLIPW           m1, m5, [pw_pixel_max_12]
+    CLIPW           m2, m5, [pw_pixel_max_12]
+    movu [pix0q+strideq], m1
+    movu        [pixq], m2
     RET
+%endmacro
+
+INIT_XMM sse2
+LOOP_FILTER_CHROMA
+INIT_XMM avx
+LOOP_FILTER_CHROMA
 
 %if ARCH_X86_64
-INIT_XMM ssse3
+%macro LOOP_FILTER_LUMA 0
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
-;                                 int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc
-    sub              r0, 4
-    lea              r5, [3 * r1]
-    mov              r6, r0
-    add              r0, r5
-    TRANSPOSE8x8B_LOAD  PASS8ROWS(r6, r0, r1, r5)
+cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    sub            pixq, 4
+    lea           pix0q, [3 * r1]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, r1, pix0q)
     LUMA_DEBLOCK_BODY 8, v
 .store:
-    TRANSPOSE8x8B_STORE PASS8ROWS(r6, r0, r1, r5)
+    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
 .bypassluma:
     RET
 
-cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
+cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
     sub            pixq, 8
-    lea              r5, [3 * strideq]
-    mov              r6, pixq
-    add            pixq, r5
-    TRANSPOSE8x8W_LOAD  PASS8ROWS(r6, pixq, strideq, r5)
+    lea           pix0q, [3 * strideq]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
     LUMA_DEBLOCK_BODY 10, v
 .store:
-    TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5)
+    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
+.bypassluma:
+    RET
+
+cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    sub            pixq, 8
+    lea           pix0q, [3 * strideq]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
+    LUMA_DEBLOCK_BODY 12, v
+.store:
+    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
 .bypassluma:
     RET
 
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
-;                                 int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
     lea     src3strideq, [3 * strideq]
     mov           pix0q, pixq
     sub           pix0q, src3strideq
     sub           pix0q, strideq
-    movdqu           m0, [pix0q];               p3
-    movdqu           m1, [pix0q +     strideq]; p2
-    movdqu           m2, [pix0q + 2 * strideq]; p1
-    movdqu           m3, [pix0q + src3strideq]; p0
-    movdqu           m4, [pixq];                q0
-    movdqu           m5, [pixq +     strideq];  q1
-    movdqu           m6, [pixq + 2 * strideq];  q2
-    movdqu           m7, [pixq + src3strideq];  q3
+    movq             m0, [pix0q];               p3
+    movq             m1, [pix0q +     strideq]; p2
+    movq             m2, [pix0q + 2 * strideq]; p1
+    movq             m3, [pix0q + src3strideq]; p0
+    movq             m4, [pixq];                q0
+    movq             m5, [pixq +     strideq];  q1
+    movq             m6, [pixq + 2 * strideq];  q2
+    movq             m7, [pixq + src3strideq];  q3
     pxor             m8, m8
     punpcklbw        m0, m8
     punpcklbw        m1, m8
@@ -783,16 +794,16 @@ cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0
     packuswb          m1, m2
     packuswb          m3, m4
     packuswb          m5, m6
-    movh   [r5 +     r1], m1
-    movhps [r5 + 2 * r1], m1
-    movh   [r5 +     r6], m3
-    movhps [r0         ], m3
-    movh   [r0 +     r1], m5
-    movhps [r0 + 2 * r1], m5
+    movh   [pix0q +     strideq], m1
+    movhps [pix0q + 2 * strideq], m1
+    movh   [pix0q + src3strideq], m3
+    movhps [pixq               ], m3
+    movh   [pixq  +     strideq], m5
+    movhps [pixq  + 2 * strideq], m5
 .bypassluma:
     RET
 
-cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
     lea                  src3strideq, [3 * strideq]
     mov                        pix0q, pixq
     sub                        pix0q, src3strideq
@@ -808,12 +819,43 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
     LUMA_DEBLOCK_BODY             10, h
 .store:
     pxor                          m8, m8; zeros reg
-    CLIPW                         m1, m8, [pw_pixel_max]
-    CLIPW                         m2, m8, [pw_pixel_max]
-    CLIPW                         m3, m8, [pw_pixel_max]
-    CLIPW                         m4, m8, [pw_pixel_max]
-    CLIPW                         m5, m8, [pw_pixel_max]
-    CLIPW                         m6, m8, [pw_pixel_max]
+    CLIPW                         m1, m8, [pw_pixel_max_10]
+    CLIPW                         m2, m8, [pw_pixel_max_10]
+    CLIPW                         m3, m8, [pw_pixel_max_10]
+    CLIPW                         m4, m8, [pw_pixel_max_10]
+    CLIPW                         m5, m8, [pw_pixel_max_10]
+    CLIPW                         m6, m8, [pw_pixel_max_10]
+    movdqu     [pix0q +     strideq], m1;  p2
+    movdqu     [pix0q + 2 * strideq], m2;  p1
+    movdqu     [pix0q + src3strideq], m3;  p0
+    movdqu     [pixq               ], m4;  q0
+    movdqu     [pixq  +     strideq], m5;  q1
+    movdqu     [pixq  + 2 * strideq], m6;  q2
+.bypassluma:
+    RET
+
+cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    lea                  src3strideq, [3 * strideq]
+    mov                        pix0q, pixq
+    sub                        pix0q, src3strideq
+    sub                        pix0q, strideq
+    movdqu                        m0, [pix0q];               p3
+    movdqu                        m1, [pix0q +     strideq]; p2
+    movdqu                        m2, [pix0q + 2 * strideq]; p1
+    movdqu                        m3, [pix0q + src3strideq]; p0
+    movdqu                        m4, [pixq];                q0
+    movdqu                        m5, [pixq  +     strideq]; q1
+    movdqu                        m6, [pixq  + 2 * strideq]; q2
+    movdqu                        m7, [pixq  + src3strideq]; q3
+    LUMA_DEBLOCK_BODY             12, h
+.store:
+    pxor                          m8, m8; zeros reg
+    CLIPW                         m1, m8, [pw_pixel_max_12]
+    CLIPW                         m2, m8, [pw_pixel_max_12]
+    CLIPW                         m3, m8, [pw_pixel_max_12]
+    CLIPW                         m4, m8, [pw_pixel_max_12]
+    CLIPW                         m5, m8, [pw_pixel_max_12]
+    CLIPW                         m6, m8, [pw_pixel_max_12]
     movdqu     [pix0q +     strideq], m1;  p2
     movdqu     [pix0q + 2 * strideq], m2;  p1
     movdqu     [pix0q + src3strideq], m3;  p0
@@ -822,4 +864,13 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
     movdqu     [pixq  + 2 * strideq], m6;  q2
 .bypassluma:
     RET
+
+%endmacro
+
+INIT_XMM sse2
+LOOP_FILTER_LUMA
+INIT_XMM ssse3
+LOOP_FILTER_LUMA
+INIT_XMM avx
+LOOP_FILTER_LUMA
 %endif
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
new file mode 100644
index 0000000000..2edaf9aef1
--- /dev/null
+++ b/libavcodec/x86/hevc_idct.asm
@@ -0,0 +1,122 @@
+; /*
+; * SIMD optimized idct functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; * Copyright (c) 2014 James Almer
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
+; %1 = HxW
+; %2 = number of loops
+; %3 = bitdepth
+%macro IDCT_DC 3
+cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
+    movsx             tmpq, word [coeffq]
+    add               tmpw, ((1 << 14-%3) + 1)
+    sar               tmpw, (15-%3)
+    movd               xm0, tmpd
+    SPLATW              m0, xm0
+    DEFINE_ARGS coeff, cnt
+    mov               cntd, %2
+.loop:
+    mova [coeffq+mmsize*0], m0
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+    mova [coeffq+mmsize*4], m0
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+    add  coeffq, mmsize*8
+    dec  cntd
+    jg  .loop
+    RET
+%endmacro
+
+; %1 = HxW
+; %2 = bitdepth
+%macro IDCT_DC_NL 2 ; No loop
+cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp
+    movsx             tmpq, word [coeffq]
+    add               tmpw, ((1 << 14-%2) + 1)
+    sar               tmpw, (15-%2)
+    movd                m0, tmpd
+    SPLATW              m0, xm0
+    mova [coeffq+mmsize*0], m0
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+%if mmsize == 16
+    mova [coeffq+mmsize*4], m0
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+%endif
+    RET
+%endmacro
+
+; 8-bit
+INIT_MMX mmxext
+IDCT_DC_NL  4,      8
+IDCT_DC     8,  2,  8
+
+INIT_XMM sse2
+IDCT_DC_NL  8,      8
+IDCT_DC    16,  4,  8
+IDCT_DC    32, 16,  8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2,  8
+IDCT_DC    32,  8,  8
+%endif ;HAVE_AVX2_EXTERNAL
+
+; 10-bit
+INIT_MMX mmxext
+IDCT_DC_NL  4,     10
+IDCT_DC     8,  2, 10
+
+INIT_XMM sse2
+IDCT_DC_NL  8,     10
+IDCT_DC    16,  4, 10
+IDCT_DC    32, 16, 10
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2, 10
+IDCT_DC    32,  8, 10
+%endif ;HAVE_AVX2_EXTERNAL
+
+; 12-bit
+INIT_MMX mmxext
+IDCT_DC_NL  4,     12
+IDCT_DC     8,  2, 12
+
+INIT_XMM sse2
+IDCT_DC_NL  8,     12
+IDCT_DC    16,  4, 12
+IDCT_DC    32, 16, 12
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2, 12
+IDCT_DC    32,  8, 12
+%endif ;HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
new file mode 100644
index 0000000000..ff6ed0711a
--- /dev/null
+++ b/libavcodec/x86/hevc_mc.asm
@@ -0,0 +1,1672 @@
+; /*
+; * Provide SSE luma and chroma mc functions for HEVC decoding
+; * Copyright (c) 2013 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+cextern pw_255
+cextern pw_512
+cextern pw_2048
+cextern pw_8192
+cextern pw_1023
+cextern pw_1024
+cextern pw_4096
+%define pw_8 pw_512
+%define pw_10 pw_2048
+%define pw_12 pw_8192
+%define pw_bi_10 pw_1024
+%define pw_bi_12 pw_4096
+%define max_pixels_8 pw_255
+%define max_pixels_10 pw_1023
+pw_bi_8:                times 16 dw  (1 <<  8)
+max_pixels_12:          times 16 dw ((1 << 12)-1)
+cextern pd_1
+cextern pb_0
+
+%macro EPEL_TABLE 4
+hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
+                        times %2 d%3 10, -2
+                        times %2 d%3 -4, 54
+                        times %2 d%3 16, -2
+                        times %2 d%3 -6, 46
+                        times %2 d%3 28, -4
+                        times %2 d%3 -4, 36
+                        times %2 d%3 36, -4
+                        times %2 d%3 -4, 28
+                        times %2 d%3 46, -6
+                        times %2 d%3 -2, 16
+                        times %2 d%3 54, -4
+                        times %2 d%3 -2, 10
+                        times %2 d%3 58, -2
+%endmacro
+
+
+EPEL_TABLE  8,16, b, avx2
+EPEL_TABLE 10, 8, w, avx2
+
+EPEL_TABLE  8, 8, b, sse4
+EPEL_TABLE 10, 4, w, sse4
+EPEL_TABLE 12, 4, w, sse4
+
+%macro QPEL_TABLE 4
+hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
+                        times %2 d%3 -10, 58
+                        times %2 d%3  17, -5
+                        times %2 d%3   1,  0
+                        times %2 d%3  -1,  4
+                        times %2 d%3 -11, 40
+                        times %2 d%3  40,-11
+                        times %2 d%3   4, -1
+                        times %2 d%3   0,  1
+                        times %2 d%3  -5, 17
+                        times %2 d%3  58,-10
+                        times %2 d%3   4, -1
+%endmacro
+
+QPEL_TABLE  8, 8, b, sse4
+QPEL_TABLE 10, 4, w, sse4
+QPEL_TABLE 12, 4, w, sse4
+
+QPEL_TABLE  8,16, b, avx2
+QPEL_TABLE 10, 8, w, avx2
+
+SECTION .text
+
+%define MAX_PB_SIZE  64
+
+%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
+
+%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
+
+%if ARCH_X86_64
+
+%macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
+%if %1 <= 4
+    movq              %3, [%2]                                              ; load data from source2
+%elif %1 <= 8
+    movdqa            %3, [%2]                                              ; load data from source2
+%elif %1 <= 12
+%if cpuflag(avx2)
+    mova              %3, [%2]
+%else
+    movdqa            %3, [%2]                                              ; load data from source2
+    movq              %4, [%2+16]                                           ; load data from source2
+%endif ;avx
+%elif %1 <= 16
+%if cpuflag(avx2)
+    mova              %3, [%2]
+%else
+    movdqa            %3, [%2]                                              ; load data from source2
+    movdqa            %4, [%2+16]                                           ; load data from source2
+%endif ; avx
+%else ; %1 = 32
+    mova              %3, [%2]
+    mova              %4, [%2+32]
+%endif
+%endmacro
+
+%macro SIMPLE_LOAD 4    ;width, bitd, tab, r1
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+    movd              %4, [%3]                                               ; load data from source
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+    movq              %4, [%3]                                               ; load data from source
+%elif notcpuflag(avx)
+    movu              %4, [%3]                                               ; load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+    movdqu           %4, [%3]
+%else
+    movu              %4, [%3]
+%endif
+%endmacro
+
+
+%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
+%if cpuflag(avx2)
+%assign %%offset 32
+%ifdef PIC
+    lea              %5q, [hevc_epel_filters_avx2_%1]
+    %define FILTER %5q
+%else
+    %define FILTER hevc_epel_filters_avx2_%1
+%endif
+%else
+%assign %%offset 16
+%ifdef PIC
+    lea              %5q, [hevc_epel_filters_sse4_%1]
+    %define FILTER %5q
+%else
+    %define FILTER hevc_epel_filters_sse4_%1
+%endif
+%endif ;cpuflag(avx2)
+    sub              %2q, 1
+%if cpuflag(avx2)
+    shl              %2q, 6                      ; multiply by 64
+  %else
+    shl              %2q, 5                      ; multiply by 32
+%endif
+    mova           %3, [FILTER + %2q]        ; get 2 first values of filters
+    mova           %4, [FILTER + %2q+%%offset]     ; get 2 last values of filters
+%endmacro
+
+%macro EPEL_HV_FILTER 1
+%if cpuflag(avx2)
+%assign %%offset 32
+%assign %%shift  6
+%define %%table  hevc_epel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift  5
+%define %%table  hevc_epel_filters_sse4_%1
+%endif
+
+%ifdef PIC
+    lea           r3srcq, [%%table]
+    %define FILTER r3srcq
+%else
+    %define FILTER %%table
+%endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, %%shift                ; multiply by 32
+    shl              myq, %%shift                ; multiply by 32
+    mova             m14, [FILTER + mxq]        ; get 2 first values of filters
+    mova             m15, [FILTER + mxq+%%offset]     ; get 2 last values of filters
+
+%if cpuflag(avx2)
+%define %%table  hevc_epel_filters_avx2_10
+%else
+%define %%table  hevc_epel_filters_sse4_10
+%endif
+%ifdef PIC
+    lea           r3srcq, [%%table]
+    %define FILTER r3srcq
+%else
+    %define FILTER %%table
+%endif
+    mova             m12, [FILTER + myq]        ; get 2 first values of filters
+    mova             m13, [FILTER + myq+%%offset]     ; get 2 last values of filters
+    lea           r3srcq, [srcstrideq*3]
+%endmacro
+
+%macro QPEL_FILTER 2
+
+%if cpuflag(avx2)
+%assign %%offset 32
+%assign %%shift  7
+%define %%table  hevc_qpel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift  6
+%define %%table  hevc_qpel_filters_sse4_%1
+%endif
+
+%ifdef PIC
+    lea         rfilterq, [%%table]
+%else
+    %define rfilterq %%table
+%endif
+    sub              %2q, 1
+    shl              %2q, %%shift                        ; multiply by 32
+    mova             m12, [rfilterq + %2q]               ; get 4 first values of filters
+    mova             m13, [rfilterq + %2q +   %%offset]  ; get 4 first values of filters
+    mova             m14, [rfilterq + %2q + 2*%%offset]  ; get 4 first values of filters
+    mova             m15, [rfilterq + %2q + 3*%%offset]  ; get 4 first values of filters
+%endmacro
+
+%macro EPEL_LOAD 4
+%if (%1 == 8 && %4 <= 4)
+%define %%load movd
+%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
+%define %%load movq
+%else
+%define %%load movdqu
+%endif
+
+    %%load            m0, [%2q ]
+%ifnum %3
+    %%load            m1, [%2q+  %3]
+    %%load            m2, [%2q+2*%3]
+    %%load            m3, [%2q+3*%3]
+%else
+    %%load            m1, [%2q+  %3q]
+    %%load            m2, [%2q+2*%3q]
+    %%load            m3, [%2q+r3srcq]
+%endif
+%if %1 == 8
+%if %4 > 8
+    SBUTTERFLY        bw, 0, 1, 7
+    SBUTTERFLY        bw, 2, 3, 7
+%else
+    punpcklbw         m0, m1
+    punpcklbw         m2, m3
+%endif
+%else
+%if %4 > 4
+    SBUTTERFLY        wd, 0, 1, 7
+    SBUTTERFLY        wd, 2, 3, 7
+%else
+    punpcklwd         m0, m1
+    punpcklwd         m2, m3
+%endif
+%endif
+%endmacro
+
+
+%macro QPEL_H_LOAD 4
+%assign %%stride (%1+7)/8
+%if %1 == 8
+%if %3 <= 4
+%define %%load movd
+%elif %3 == 8
+%define %%load movq
+%else
+%define %%load movu
+%endif
+%else
+%if %3 == 2
+%define %%load movd
+%elif %3 == 4
+%define %%load movq
+%else
+%define %%load movu
+%endif
+%endif
+    %%load            m0, [%2-3*%%stride]        ;load data from source
+    %%load            m1, [%2-2*%%stride]
+    %%load            m2, [%2-%%stride  ]
+    %%load            m3, [%2           ]
+    %%load            m4, [%2+%%stride  ]
+    %%load            m5, [%2+2*%%stride]
+    %%load            m6, [%2+3*%%stride]
+    %%load            m7, [%2+4*%%stride]
+
+%if %1 == 8
+%if %3 > 8
+    SBUTTERFLY        wd, 0, 1, %4
+    SBUTTERFLY        wd, 2, 3, %4
+    SBUTTERFLY        wd, 4, 5, %4
+    SBUTTERFLY        wd, 6, 7, %4
+%else
+    punpcklbw         m0, m1
+    punpcklbw         m2, m3
+    punpcklbw         m4, m5
+    punpcklbw         m6, m7
+%endif
+%else
+%if %3 > 4
+    SBUTTERFLY        dq, 0, 1, %4
+    SBUTTERFLY        dq, 2, 3, %4
+    SBUTTERFLY        dq, 4, 5, %4
+    SBUTTERFLY        dq, 6, 7, %4
+%else
+    punpcklwd         m0, m1
+    punpcklwd         m2, m3
+    punpcklwd         m4, m5
+    punpcklwd         m6, m7
+%endif
+%endif
+%endmacro
+
+%macro QPEL_V_LOAD 5
+    lea              %5q, [%2]
+    sub              %5q, r3srcq
+    movu              m0, [%5q            ]      ;load x- 3*srcstride
+    movu              m1, [%5q+   %3q     ]      ;load x- 2*srcstride
+    movu              m2, [%5q+ 2*%3q     ]      ;load x-srcstride
+    movu              m3, [%2       ]      ;load x
+    movu              m4, [%2+   %3q]      ;load x+stride
+    movu              m5, [%2+ 2*%3q]      ;load x+2*stride
+    movu              m6, [%2+r3srcq]      ;load x+3*stride
+    movu              m7, [%2+ 4*%3q]      ;load x+4*stride
+%if %1 == 8
+%if %4 > 8
+    SBUTTERFLY        bw, 0, 1, 8
+    SBUTTERFLY        bw, 2, 3, 8
+    SBUTTERFLY        bw, 4, 5, 8
+    SBUTTERFLY        bw, 6, 7, 8
+%else
+    punpcklbw         m0, m1
+    punpcklbw         m2, m3
+    punpcklbw         m4, m5
+    punpcklbw         m6, m7
+%endif
+%else
+%if %4 > 4
+    SBUTTERFLY        wd, 0, 1, 8
+    SBUTTERFLY        wd, 2, 3, 8
+    SBUTTERFLY        wd, 4, 5, 8
+    SBUTTERFLY        wd, 6, 7, 8
+%else
+    punpcklwd         m0, m1
+    punpcklwd         m2, m3
+    punpcklwd         m4, m5
+    punpcklwd         m6, m7
+%endif
+%endif
+%endmacro
+
+%macro PEL_12STORE2 3
+    movd           [%1], %2
+%endmacro
+%macro PEL_12STORE4 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_12STORE6 3
+    movq           [%1], %2
+    psrldq            %2, 8
+    movd         [%1+8], %2
+%endmacro
+%macro PEL_12STORE8 3
+    movdqa         [%1], %2
+%endmacro
+%macro PEL_12STORE12 3
+    movdqa         [%1], %2
+    movq        [%1+16], %3
+%endmacro
+%macro PEL_12STORE16 3
+    PEL_12STORE8      %1, %2, %3
+    movdqa       [%1+16], %3
+%endmacro
+
+%macro PEL_10STORE2 3
+    movd           [%1], %2
+%endmacro
+%macro PEL_10STORE4 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_10STORE6 3
+    movq           [%1], %2
+    psrldq            %2, 8
+    movd         [%1+8], %2
+%endmacro
+%macro PEL_10STORE8 3
+    movdqa         [%1], %2
+%endmacro
+%macro PEL_10STORE12 3
+    movdqa         [%1], %2
+    movq        [%1+16], %3
+%endmacro
+%macro PEL_10STORE16 3
+%if cpuflag(avx2)
+    movu            [%1], %2
+%else
+    PEL_10STORE8      %1, %2, %3
+    movdqa       [%1+16], %3
+%endif
+%endmacro
+
+%macro PEL_10STORE32 3
+    PEL_10STORE16     %1, %2, %3
+    movu         [%1+32], %3
+%endmacro
+
+%macro PEL_8STORE2 3
+    pextrw          [%1], %2, 0
+%endmacro
+%macro PEL_8STORE4 3
+    movd            [%1], %2
+%endmacro
+%macro PEL_8STORE6 3
+    movd            [%1], %2
+    pextrw        [%1+4], %2, 2
+%endmacro
+%macro PEL_8STORE8 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_8STORE12 3
+    movq            [%1], %2
+    psrldq            %2, 8
+    movd          [%1+8], %2
+%endmacro
+%macro PEL_8STORE16 3
+%if cpuflag(avx2)
+    movdqu        [%1], %2
+%else
+    mova          [%1], %2
+%endif ; avx
+%endmacro
+%macro PEL_8STORE32 3
+    movu          [%1], %2
+%endmacro
+
+%macro LOOP_END 3
+    add              %1q, 2*MAX_PB_SIZE          ; dst += dststride
+    add              %2q, %3q                    ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+%endmacro
+
+
+%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
+%if %2 == 8
+%if cpuflag(avx2) && %0 ==3
+%if %1 > 16
+    vextracti128 xm1, m0, 1
+    pmovzxbw      m1, xm1
+    psllw         m1, 14-%2
+%endif
+    pmovzxbw      m0, xm0
+%else ; not avx
+%if %1 > 8
+    punpckhbw     m1, m0, m2
+    psllw         m1, 14-%2
+%endif
+    punpcklbw     m0, m2
+%endif
+%endif ;avx
+    psllw         m0, 14-%2
+%endmacro
+
+%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
+%if %0 == 8
+%define %%reg0 %5
+%define %%reg2 %6
+%define %%reg1 %7
+%define %%reg3 %8
+%else
+%define %%reg0 m0
+%define %%reg2 m2
+%define %%reg1 m1
+%define %%reg3 m3
+%endif
+%if %1 == 8
+%if cpuflag(avx2) && (%0 == 5)
+%if %2 > 16
+    vperm2i128    m10, m0, m1, q0301
+%endif
+    vinserti128    m0, m0, xm1, 1
+    mova           m1, m10
+%if %2 > 16
+    vperm2i128    m10, m2, m3, q0301
+%endif
+    vinserti128    m2, m2, xm3, 1
+    mova           m3, m10
+%endif
+    pmaddubsw      %%reg0, %3   ;x1*c1+x2*c2
+    pmaddubsw      %%reg2, %4   ;x3*c3+x4*c4
+    paddw          %%reg0, %%reg2
+%if %2 > 8
+    pmaddubsw      %%reg1, %3
+    pmaddubsw      %%reg3, %4
+    paddw          %%reg1, %%reg3
+%endif
+%else
+    pmaddwd        %%reg0, %3
+    pmaddwd        %%reg2, %4
+    paddd          %%reg0, %%reg2
+%if %2 > 4
+    pmaddwd        %%reg1, %3
+    pmaddwd        %%reg3, %4
+    paddd          %%reg1, %%reg3
+%if %1 != 8
+    psrad          %%reg1, %1-8
+%endif
+%endif
+%if %1 != 8
+    psrad          %%reg0, %1-8
+%endif
+    packssdw       %%reg0, %%reg1
+%endif
+%endmacro
+
+%macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx
+
+%if cpuflag(avx2)
+%assign %%offset 32
+%define %%table  hevc_qpel_filters_avx2_%2
+%else
+%assign %%offset 16
+%define %%table  hevc_qpel_filters_sse4_%2
+%endif
+
+%ifdef PIC
+    lea         rfilterq, [%%table]
+%else
+    %define rfilterq %%table
+%endif
+
+%if %2 == 8
+    pmaddubsw         m0, [rfilterq + %3q*8   ]   ;x1*c1+x2*c2
+    pmaddubsw         m2, [rfilterq + %3q*8+%%offset]   ;x3*c3+x4*c4
+    pmaddubsw         m4, [rfilterq + %3q*8+2*%%offset]   ;x5*c5+x6*c6
+    pmaddubsw         m6, [rfilterq + %3q*8+3*%%offset]   ;x7*c7+x8*c8
+    paddw             m0, m2
+    paddw             m4, m6
+    paddw             m0, m4
+%else
+    pmaddwd           m0, [rfilterq + %3q*8   ]
+    pmaddwd           m2, [rfilterq + %3q*8+%%offset]
+    pmaddwd           m4, [rfilterq + %3q*8+2*%%offset]
+    pmaddwd           m6, [rfilterq + %3q*8+3*%%offset]
+    paddd             m0, m2
+    paddd             m4, m6
+    paddd             m0, m4
+%if %2 != 8
+    psrad             m0, %2-8
+%endif
+%if %1 > 4
+    pmaddwd           m1, [rfilterq + %3q*8   ]
+    pmaddwd           m3, [rfilterq + %3q*8+%%offset]
+    pmaddwd           m5, [rfilterq + %3q*8+2*%%offset]
+    pmaddwd           m7, [rfilterq + %3q*8+3*%%offset]
+    paddd             m1, m3
+    paddd             m5, m7
+    paddd             m1, m5
+%if %2 != 8
+    psrad             m1, %2-8
+%endif
+%endif
+    p%4               m0, m1
+%endif
+%endmacro
+
+%macro QPEL_COMPUTE 2-3     ; width, bitdepth
+%if %2 == 8
+%if cpuflag(avx2) && (%0 == 3)
+
+    vperm2i128 m10, m0,  m1, q0301
+    vinserti128 m0, m0, xm1, 1
+    SWAP 1, 10
+
+    vperm2i128 m10, m2,  m3, q0301
+    vinserti128 m2, m2, xm3, 1
+    SWAP 3, 10
+
+
+    vperm2i128 m10, m4,  m5, q0301
+    vinserti128 m4, m4, xm5, 1
+    SWAP 5, 10
+
+    vperm2i128 m10, m6,  m7, q0301
+    vinserti128 m6, m6, xm7, 1
+    SWAP 7, 10
+%endif
+
+    pmaddubsw         m0, m12   ;x1*c1+x2*c2
+    pmaddubsw         m2, m13   ;x3*c3+x4*c4
+    pmaddubsw         m4, m14   ;x5*c5+x6*c6
+    pmaddubsw         m6, m15   ;x7*c7+x8*c8
+    paddw             m0, m2
+    paddw             m4, m6
+    paddw             m0, m4
+%if %1 > 8
+    pmaddubsw         m1, m12
+    pmaddubsw         m3, m13
+    pmaddubsw         m5, m14
+    pmaddubsw         m7, m15
+    paddw             m1, m3
+    paddw             m5, m7
+    paddw             m1, m5
+%endif
+%else
+    pmaddwd           m0, m12
+    pmaddwd           m2, m13
+    pmaddwd           m4, m14
+    pmaddwd           m6, m15
+    paddd             m0, m2
+    paddd             m4, m6
+    paddd             m0, m4
+%if %2 != 8
+    psrad             m0, %2-8
+%endif
+%if %1 > 4
+    pmaddwd           m1, m12
+    pmaddwd           m3, m13
+    pmaddwd           m5, m14
+    pmaddwd           m7, m15
+    paddd             m1, m3
+    paddd             m5, m7
+    paddd             m1, m5
+%if %2 != 8
+    psrad             m1, %2-8
+%endif
+%endif
+%endif
+%endmacro
+
+%macro BI_COMPUTE 7-8     ; width, bitd, src1l, src1h, scr2l, scr2h, pw
+    paddsw            %3, %5
+%if %1 > 8
+    paddsw            %4, %6
+%endif
+    UNI_COMPUTE       %1, %2, %3, %4, %7
+%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
+    vpermq            %3, %3, 216
+    vpermq            %4, %4, 216
+%endif
+%endmacro
+
+%macro UNI_COMPUTE 5
+    pmulhrsw          %3, %5
+%if %1 > 8 || (%2 > 8 && %1 > 4)
+    pmulhrsw          %4, %5
+%endif
+%if %2 == 8
+    packuswb          %3, %4
+%else
+    CLIPW             %3, [pb_0], [max_pixels_%2]
+%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
+    CLIPW             %4, [pb_0], [max_pixels_%2]
+%endif
+%endif
+%endmacro
+
+
+; ******************************
+; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
+;                         uint8_t *_src, ptrdiff_t _srcstride,
+;                         int height, int mx, int my)
+; ******************************
+
+%macro HEVC_PUT_HEVC_PEL_PIXELS 2
+HEVC_PEL_PIXELS     %1, %2
+HEVC_UNI_PEL_PIXELS %1, %2
+HEVC_BI_PEL_PIXELS  %1, %2
+%endmacro
+
+%macro HEVC_PEL_PIXELS 2
+cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
+    pxor               m2, m2
+.loop:
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    MC_PIXEL_COMPUTE  %1, %2, 1
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END         dst, src, srcstride
+    RET
+ %endmacro
+
+%macro HEVC_UNI_PEL_PIXELS 2
+cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
+.loop:
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+%macro HEVC_BI_PEL_PIXELS 2
+cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
+    pxor              m2, m2
+    movdqa            m5, [pw_bi_%2]
+.loop:
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    SIMPLE_BILOAD     %1, src2q, m3, m4
+    MC_PIXEL_COMPUTE  %1, %2, 1
+    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+
+; ******************************
+; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my, int width);
+; ******************************
+
+
+%macro HEVC_PUT_HEVC_EPEL 2
+%if cpuflag(avx2)
+%define XMM_REGS  11
+%else
+%define XMM_REGS  8
+%endif
+
+cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
+    EPEL_FILTER       %2, mx, m4, m5, rfilter
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
+    PEL_10STORE%1      dstq, m0, m1
+    LOOP_END         dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
+    movdqa            m6, [pw_%2]
+    EPEL_FILTER       %2, mx, m4, m5, rfilter
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5
+    UNI_COMPUTE       %1, %2, m0, m1, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
+    movdqa            m6, [pw_bi_%2]
+    EPEL_FILTER       %2, mx, m4, m5, rfilter
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
+    SIMPLE_BILOAD     %1, src2q, m2, m3
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+; ******************************
+; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
+;                      uint8_t *_src, ptrdiff_t _srcstride,
+;                      int height, int mx, int my, int width)
+; ******************************
+
+cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
+    movifnidn        myd, mym
+    sub             srcq, srcstrideq
+    EPEL_FILTER       %2, my, m4, m5, r3src
+    lea           r3srcq, [srcstrideq*3]
+.loop:
+    EPEL_LOAD         %2, srcq, srcstride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END          dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
+    movifnidn        myd, mym
+    movdqa            m6, [pw_%2]
+    sub             srcq, srcstrideq
+    EPEL_FILTER       %2, my, m4, m5, r3src
+    lea           r3srcq, [srcstrideq*3]
+.loop:
+    EPEL_LOAD         %2, srcq, srcstride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5
+    UNI_COMPUTE       %1, %2, m0, m1, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+
+cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
+    movifnidn        myd, mym
+    movdqa            m6, [pw_bi_%2]
+    sub             srcq, srcstrideq
+    EPEL_FILTER       %2, my, m4, m5, r3src
+    lea           r3srcq, [srcstrideq*3]
+.loop:
+    EPEL_LOAD         %2, srcq, srcstride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
+    SIMPLE_BILOAD     %1, src2q, m2, m3
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+
+; ******************************
+; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my, int width)
+; ******************************
+
+%macro HEVC_PUT_HEVC_EPEL_HV 2
+cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+    sub             srcq, srcstrideq
+    EPEL_HV_FILTER    %2
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m8, m1
+%endif
+    SWAP              m4, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m9, m1
+%endif
+    SWAP              m5, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m10, m1
+%endif
+    SWAP              m6, m0
+    add             srcq, srcstrideq
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m11, m1
+%endif
+    SWAP              m7, m0
+    punpcklwd         m0, m4, m5
+    punpcklwd         m2, m6, m7
+%if %1 > 4
+    punpckhwd         m1, m4, m5
+    punpckhwd         m3, m6, m7
+%endif
+    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+%if cpuflag(avx2)
+    vinserti128       m2, m0, xm4, 1
+    vperm2i128        m3, m0, m4, q0301
+    PEL_10STORE%1     dstq, m2, m3
+%else
+    PEL_10STORE%1     dstq, m0, m4
+%endif
+%else
+    PEL_10STORE%1     dstq, m0, m1
+%endif
+    movdqa            m4, m5
+    movdqa            m5, m6
+    movdqa            m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova             m10, m11
+%endif
+    LOOP_END         dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+    sub             srcq, srcstrideq
+    EPEL_HV_FILTER    %2
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m8, m1
+%endif
+    SWAP              m4, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m9, m1
+%endif
+    SWAP              m5, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m10, m1
+%endif
+    SWAP              m6, m0
+    add             srcq, srcstrideq
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m11, m1
+%endif
+    mova              m7, m0
+    punpcklwd         m0, m4, m5
+    punpcklwd         m2, m6, m7
+%if %1 > 4
+    punpckhwd         m1, m4, m5
+    punpckhwd         m3, m6, m7
+%endif
+    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+    UNI_COMPUTE       %1, %2, m0, m4, [pw_%2]
+%else
+    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
+%endif
+    PEL_%2STORE%1   dstq, m0, m1
+    mova              m4, m5
+    mova              m5, m6
+    mova              m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova             m10, m11
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+    sub             srcq, srcstrideq
+    EPEL_HV_FILTER    %2
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m8, m1
+%endif
+    SWAP              m4, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m9, m1
+%endif
+    SWAP              m5, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m10, m1
+%endif
+    SWAP              m6, m0
+    add             srcq, srcstrideq
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m11, m1
+%endif
+    SWAP              m7, m0
+    punpcklwd         m0, m4, m5
+    punpcklwd         m2, m6, m7
+%if %1 > 4
+    punpckhwd         m1, m4, m5
+    punpckhwd         m3, m6, m7
+%endif
+    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+    SIMPLE_BILOAD     %1, src2q, m8, m3
+%if cpuflag(avx2)
+    vinserti128       m1, m8, xm3, 1
+    vperm2i128        m2, m8, m3, q0301
+    BI_COMPUTE        %1, %2, m0, m4, m1, m2, [pw_bi_%2]
+%else
+    BI_COMPUTE        %1, %2, m0, m4, m8, m3, [pw_bi_%2]
+%endif
+%else
+    SIMPLE_BILOAD     %1, src2q, m8, m9
+    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+%endif
+    PEL_%2STORE%1   dstq, m0, m4
+    mova              m4, m5
+    mova              m5, m6
+    mova              m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova             m10, m11
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+; ******************************
+; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my, int width)
+; ******************************
+
+%macro HEVC_PUT_HEVC_QPEL 2
+cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
+    QPEL_FILTER       %2, mx
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 10
+    QPEL_COMPUTE      %1, %2, 1
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END          dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
+    mova              m9, [pw_%2]
+    QPEL_FILTER       %2, mx
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 10
+    QPEL_COMPUTE      %1, %2
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    UNI_COMPUTE       %1, %2, m0, m1, m9
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
+    movdqa            m9, [pw_bi_%2]
+    QPEL_FILTER       %2, mx
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 10
+    QPEL_COMPUTE      %1, %2, 1
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    SIMPLE_BILOAD     %1, src2q, m10, m11
+    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+
+; ******************************
+; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my, int width)
+; ******************************
+
+cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
+    movifnidn        myd, mym
+    lea           r3srcq, [srcstrideq*3]
+    QPEL_FILTER       %2, my
+.loop:
+    QPEL_V_LOAD       %2, srcq, srcstride, %1, r7
+    QPEL_COMPUTE      %1, %2, 1
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END         dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
+    movifnidn        myd, mym
+    movdqa            m9, [pw_%2]
+    lea           r3srcq, [srcstrideq*3]
+    QPEL_FILTER       %2, my
+.loop:
+    QPEL_V_LOAD       %2, srcq, srcstride, %1, r8
+    QPEL_COMPUTE      %1, %2
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    UNI_COMPUTE       %1, %2, m0, m1, m9
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
+    movifnidn        myd, mym
+    movdqa            m9, [pw_bi_%2]
+    lea           r3srcq, [srcstrideq*3]
+    QPEL_FILTER       %2, my
+.loop:
+    QPEL_V_LOAD       %2, srcq, srcstride, %1, r9
+    QPEL_COMPUTE      %1, %2, 1
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    SIMPLE_BILOAD     %1, src2q, m10, m11
+    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+
+; ******************************
+; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my)
+; ******************************
+%macro HEVC_PUT_HEVC_QPEL_HV 2
+cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, %%shift                ; multiply by 32
+    shl              myq, %%shift                ; multiply by 32
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, r3srcq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m8, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m9, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m10, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m11, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m12, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m13, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m14, m0
+    add             srcq, srcstrideq
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m15, m0
+    punpcklwd         m0, m8, m9
+    punpcklwd         m2, m10, m11
+    punpcklwd         m4, m12, m13
+    punpcklwd         m6, m14, m15
+%if %1 > 4
+    punpckhwd         m1, m8, m9
+    punpckhwd         m3, m10, m11
+    punpckhwd         m5, m12, m13
+    punpckhwd         m7, m14, m15
+%endif
+    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
+    PEL_10STORE%1     dstq, m0, m1
+%if %1 <= 4
+    movq              m8, m9
+    movq              m9, m10
+    movq             m10, m11
+    movq             m11, m12
+    movq             m12, m13
+    movq             m13, m14
+    movq             m14, m15
+%else
+    movdqa            m8, m9
+    movdqa            m9, m10
+    movdqa           m10, m11
+    movdqa           m11, m12
+    movdqa           m12, m13
+    movdqa           m13, m14
+    movdqa           m14, m15
+%endif
+    LOOP_END         dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, %%shift                ; multiply by 32
+    shl              myq, %%shift                ; multiply by 32
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, r3srcq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m8, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m9, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m10, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m11, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m12, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m13, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m14, m0
+    add             srcq, srcstrideq
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m15, m0
+    punpcklwd         m0, m8, m9
+    punpcklwd         m2, m10, m11
+    punpcklwd         m4, m12, m13
+    punpcklwd         m6, m14, m15
+%if %1 > 4
+    punpckhwd         m1, m8, m9
+    punpckhwd         m3, m10, m11
+    punpckhwd         m5, m12, m13
+    punpckhwd         m7, m14, m15
+%endif
+    QPEL_HV_COMPUTE   %1, 14, my, ackusdw
+    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
+    PEL_%2STORE%1   dstq, m0, m1
+
+%if %1 <= 4
+    movq              m8, m9
+    movq              m9, m10
+    movq             m10, m11
+    movq             m11, m12
+    movq             m12, m13
+    movq             m13, m14
+    movq             m14, m15
+%else
+    mova            m8, m9
+    mova            m9, m10
+    mova           m10, m11
+    mova           m11, m12
+    mova           m12, m13
+    mova           m13, m14
+    mova           m14, m15
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, %%shift                ; multiply by 32
+    shl              myq, %%shift                ; multiply by 32
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, r3srcq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m8, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m9, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m10, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m11, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m12, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m13, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m14, m0
+    add             srcq, srcstrideq
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m15, m0
+    punpcklwd         m0, m8, m9
+    punpcklwd         m2, m10, m11
+    punpcklwd         m4, m12, m13
+    punpcklwd         m6, m14, m15
+%if %1 > 4
+    punpckhwd         m1, m8, m9
+    punpckhwd         m3, m10, m11
+    punpckhwd         m5, m12, m13
+    punpckhwd         m7, m14, m15
+%endif
+    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
+    SIMPLE_BILOAD     %1, src2q, m8, m9 ;m9 not used in this case
+    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+    PEL_%2STORE%1   dstq, m0, m1
+
+%if %1 <= 4
+    movq              m8, m9
+    movq              m9, m10
+    movq             m10, m11
+    movq             m11, m12
+    movq             m12, m13
+    movq             m13, m14
+    movq             m14, m15
+%else
+    movdqa            m8, m9
+    movdqa            m9, m10
+    movdqa           m10, m11
+    movdqa           m11, m12
+    movdqa           m12, m13
+    movdqa           m13, m14
+    movdqa           m14, m15
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+%macro WEIGHTING_FUNCS 2
+%if WIN64 || ARCH_X86_32
+cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
+    mov             r4d, denomm
+%define SHIFT  r4d
+%else
+cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
+%define SHIFT  denomd
+%endif
+    lea           SHIFT, [SHIFT+14-%2]          ; shift = 14 - bitd + denom
+%if %1 <= 4
+    pxor             m1, m1
+%endif
+    movd             m2, wxm        ; WX
+    movd             m4, SHIFT      ; shift
+%if %1 <= 4
+    punpcklwd        m2, m1
+%else
+    punpcklwd        m2, m2
+%endif
+    dec           SHIFT
+    movdqu           m5, [pd_1]
+    movd             m6, SHIFT
+    pshufd           m2, m2, 0
+    mov           SHIFT, oxm
+    pslld            m5, m6
+%if %2 != 8
+    shl           SHIFT, %2-8       ; ox << (bitd - 8)
+%endif
+    movd             m3, SHIFT      ; OX
+    pshufd           m3, m3, 0
+%if WIN64 || ARCH_X86_32
+    mov           SHIFT, heightm
+%endif
+.loop:
+   SIMPLE_LOAD        %1, 10, srcq, m0
+%if %1 <= 4
+    punpcklwd         m0, m1
+    pmaddwd           m0, m2
+    paddd             m0, m5
+    psrad             m0, m4
+    paddd             m0, m3
+%else
+    pmulhw            m6, m0, m2
+    pmullw            m0, m2
+    punpckhwd         m1, m0, m6
+    punpcklwd         m0, m6
+    paddd             m0, m5
+    paddd             m1, m5
+    psrad             m0, m4
+    psrad             m1, m4
+    paddd             m0, m3
+    paddd             m1, m3
+%endif
+    packssdw          m0, m1
+%if %2 == 8
+    packuswb          m0, m0
+%else
+    CLIPW             m0, [pb_0], [max_pixels_%2]
+%endif
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
+    movifnidn        r5d, denomm
+%if %1 <= 4
+    pxor              m1, m1
+%endif
+    movd              m2, wx0m         ; WX0
+    lea              r5d, [r5d+14-%2]  ; shift = 14 - bitd + denom
+    movd              m3, wx1m         ; WX1
+    movd              m0, r5d          ; shift
+%if %1 <= 4
+    punpcklwd         m2, m1
+    punpcklwd         m3, m1
+%else
+    punpcklwd         m2, m2
+    punpcklwd         m3, m3
+%endif
+    inc              r5d
+    movd              m5, r5d          ; shift+1
+    pshufd            m2, m2, 0
+    mov              r5d, ox0m
+    pshufd            m3, m3, 0
+    add              r5d, ox1m
+%if %2 != 8
+    shl              r5d, %2-8         ; ox << (bitd - 8)
+%endif
+    inc              r5d
+    movd              m4, r5d          ; offset
+    pshufd            m4, m4, 0
+%if UNIX64
+%define h heightd
+%else
+    mov              r5d, heightm
+%define h r5d
+%endif
+    pslld             m4, m0
+
+.loop:
+   SIMPLE_LOAD        %1, 10, srcq,  m0
+   SIMPLE_LOAD        %1, 10, src2q, m8
+%if %1 <= 4
+    punpcklwd         m0, m1
+    punpcklwd         m8, m1
+    pmaddwd           m0, m3
+    pmaddwd           m8, m2
+    paddd             m0, m4
+    paddd             m0, m8
+    psrad             m0, m5
+%else
+    pmulhw            m6, m0, m3
+    pmullw            m0, m3
+    pmulhw            m7, m8, m2
+    pmullw            m8, m2
+    punpckhwd         m1, m0, m6
+    punpcklwd         m0, m6
+    punpckhwd         m9, m8, m7
+    punpcklwd         m8, m7
+    paddd             m0, m8
+    paddd             m1, m9
+    paddd             m0, m4
+    paddd             m1, m4
+    psrad             m0, m5
+    psrad             m1, m5
+%endif
+    packssdw          m0, m1
+%if %2 == 8
+    packuswb          m0, m0
+%else
+     CLIPW            m0, [pb_0], [max_pixels_%2]
+%endif
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src2 += srcstride
+    dec                h                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name
+
+WEIGHTING_FUNCS 2, 8
+WEIGHTING_FUNCS 4, 8
+WEIGHTING_FUNCS 6, 8
+WEIGHTING_FUNCS 8, 8
+
+WEIGHTING_FUNCS 2, 10
+WEIGHTING_FUNCS 4, 10
+WEIGHTING_FUNCS 6, 10
+WEIGHTING_FUNCS 8, 10
+
+WEIGHTING_FUNCS 2, 12
+WEIGHTING_FUNCS 4, 12
+WEIGHTING_FUNCS 6, 12
+WEIGHTING_FUNCS 8, 12
+
+HEVC_PUT_HEVC_PEL_PIXELS  2, 8
+HEVC_PUT_HEVC_PEL_PIXELS  4, 8
+HEVC_PUT_HEVC_PEL_PIXELS  6, 8
+HEVC_PUT_HEVC_PEL_PIXELS  8, 8
+HEVC_PUT_HEVC_PEL_PIXELS 12, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 8
+
+HEVC_PUT_HEVC_PEL_PIXELS 2, 10
+HEVC_PUT_HEVC_PEL_PIXELS 4, 10
+HEVC_PUT_HEVC_PEL_PIXELS 6, 10
+HEVC_PUT_HEVC_PEL_PIXELS 8, 10
+
+HEVC_PUT_HEVC_PEL_PIXELS 2, 12
+HEVC_PUT_HEVC_PEL_PIXELS 4, 12
+HEVC_PUT_HEVC_PEL_PIXELS 6, 12
+HEVC_PUT_HEVC_PEL_PIXELS 8, 12
+
+HEVC_PUT_HEVC_EPEL 2,  8
+HEVC_PUT_HEVC_EPEL 4,  8
+HEVC_PUT_HEVC_EPEL 6,  8
+HEVC_PUT_HEVC_EPEL 8,  8
+HEVC_PUT_HEVC_EPEL 12, 8
+HEVC_PUT_HEVC_EPEL 16, 8
+
+
+HEVC_PUT_HEVC_EPEL 2, 10
+HEVC_PUT_HEVC_EPEL 4, 10
+HEVC_PUT_HEVC_EPEL 6, 10
+HEVC_PUT_HEVC_EPEL 8, 10
+
+HEVC_PUT_HEVC_EPEL 2, 12
+HEVC_PUT_HEVC_EPEL 4, 12
+HEVC_PUT_HEVC_EPEL 6, 12
+HEVC_PUT_HEVC_EPEL 8, 12
+
+HEVC_PUT_HEVC_EPEL_HV 2,  8
+HEVC_PUT_HEVC_EPEL_HV 4,  8
+HEVC_PUT_HEVC_EPEL_HV 6,  8
+HEVC_PUT_HEVC_EPEL_HV 8,  8
+HEVC_PUT_HEVC_EPEL_HV 16, 8
+
+HEVC_PUT_HEVC_EPEL_HV 2, 10
+HEVC_PUT_HEVC_EPEL_HV 4, 10
+HEVC_PUT_HEVC_EPEL_HV 6, 10
+HEVC_PUT_HEVC_EPEL_HV 8, 10
+
+HEVC_PUT_HEVC_EPEL_HV 2, 12
+HEVC_PUT_HEVC_EPEL_HV 4, 12
+HEVC_PUT_HEVC_EPEL_HV 6, 12
+HEVC_PUT_HEVC_EPEL_HV 8, 12
+
+HEVC_PUT_HEVC_QPEL 4,  8
+HEVC_PUT_HEVC_QPEL 8,  8
+HEVC_PUT_HEVC_QPEL 12, 8
+HEVC_PUT_HEVC_QPEL 16, 8
+
+HEVC_PUT_HEVC_QPEL 4, 10
+HEVC_PUT_HEVC_QPEL 8, 10
+
+HEVC_PUT_HEVC_QPEL 4, 12
+HEVC_PUT_HEVC_QPEL 8, 12
+
+HEVC_PUT_HEVC_QPEL_HV 2, 8
+HEVC_PUT_HEVC_QPEL_HV 4, 8
+HEVC_PUT_HEVC_QPEL_HV 6, 8
+HEVC_PUT_HEVC_QPEL_HV 8, 8
+
+HEVC_PUT_HEVC_QPEL_HV 2, 10
+HEVC_PUT_HEVC_QPEL_HV 4, 10
+HEVC_PUT_HEVC_QPEL_HV 6, 10
+HEVC_PUT_HEVC_QPEL_HV 8, 10
+
+HEVC_PUT_HEVC_QPEL_HV 2, 12
+HEVC_PUT_HEVC_QPEL_HV 4, 12
+HEVC_PUT_HEVC_QPEL_HV 6, 12
+HEVC_PUT_HEVC_QPEL_HV 8, 12
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2  ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
+
+HEVC_PUT_HEVC_PEL_PIXELS 32, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 10
+
+HEVC_PUT_HEVC_EPEL 32, 8
+HEVC_PUT_HEVC_EPEL 16, 10
+
+HEVC_PUT_HEVC_EPEL_HV 16, 10
+HEVC_PUT_HEVC_EPEL_HV 32, 8
+
+HEVC_PUT_HEVC_QPEL 32, 8
+
+HEVC_PUT_HEVC_QPEL 16, 10
+
+HEVC_PUT_HEVC_QPEL_HV 16, 10
+
+%endif ;AVX2
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
new file mode 100644
index 0000000000..dc3e88a373
--- /dev/null
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -0,0 +1,388 @@
+; /*
+; * Provide SIMD optimizations for transform_add functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+cextern pw_1023
+%define max_pixels_10 pw_1023
+
+
+;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
+%macro TR_ADD_MMX_4_8 0
+    mova              m2, [r1]
+    mova              m4, [r1+8]
+    pxor              m3, m3
+    psubw             m3, m2
+    packuswb          m2, m2
+    packuswb          m3, m3
+    pxor              m5, m5
+    psubw             m5, m4
+    packuswb          m4, m4
+    packuswb          m5, m5
+
+    movh              m0, [r0     ]
+    movh              m1, [r0+r2  ]
+    paddusb           m0, m2
+    paddusb           m1, m4
+    psubusb           m0, m3
+    psubusb           m1, m5
+    movh       [r0     ], m0
+    movh       [r0+r2  ], m1
+%endmacro
+
+
+INIT_MMX mmxext
+; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add4_8, 3, 4, 6
+    TR_ADD_MMX_4_8
+    add               r1, 16
+    lea               r0, [r0+r2*2]
+    TR_ADD_MMX_4_8
+    RET
+
+%macro TR_ADD_SSE_8_8 0
+    pxor              m3, m3
+    mova              m4, [r1]
+    mova              m6, [r1+16]
+    mova              m0, [r1+32]
+    mova              m2, [r1+48]
+    psubw             m5, m3, m4
+    psubw             m7, m3, m6
+    psubw             m1, m3, m0
+    packuswb          m4, m0
+    packuswb          m5, m1
+    psubw             m3, m2
+    packuswb          m6, m2
+    packuswb          m7, m3
+
+    movq                m0, [r0     ]
+    movq                m1, [r0+r2  ]
+    movhps              m0, [r0+r2*2]
+    movhps              m1, [r0+r3  ]
+    paddusb             m0, m4
+    paddusb             m1, m6
+    psubusb             m0, m5
+    psubusb             m1, m7
+    movq         [r0     ], m0
+    movq         [r0+r2  ], m1
+    movhps       [r0+2*r2], m0
+    movhps       [r0+r3  ], m1
+%endmacro
+
+%macro TR_ADD_SSE_16_32_8 3
+    mova             xm2, [r1+%1   ]
+    mova             xm6, [r1+%1+16]
+%if cpuflag(avx2)
+    vinserti128       m2, m2, [r1+%1+32], 1
+    vinserti128       m6, m6, [r1+%1+48], 1
+%endif
+%if cpuflag(avx)
+    psubw             m1, m0, m2
+    psubw             m5, m0, m6
+%else
+    mova              m1, m0
+    mova              m5, m0
+    psubw             m1, m2
+    psubw             m5, m6
+%endif
+    packuswb          m2, m6
+    packuswb          m1, m5
+
+    mova             xm4, [r1+%1+mmsize*2   ]
+    mova             xm6, [r1+%1+mmsize*2+16]
+%if cpuflag(avx2)
+    vinserti128       m4, m4, [r1+%1+96 ], 1
+    vinserti128       m6, m6, [r1+%1+112], 1
+%endif
+%if cpuflag(avx)
+    psubw             m3, m0, m4
+    psubw             m5, m0, m6
+%else
+    mova              m3, m0
+    mova              m5, m0
+    psubw             m3, m4
+    psubw             m5, m6
+%endif
+    packuswb          m4, m6
+    packuswb          m3, m5
+
+    paddusb           m2, [%2]
+    paddusb           m4, [%3]
+    psubusb           m2, m1
+    psubusb           m4, m3
+    mova            [%2], m2
+    mova            [%3], m4
+%endmacro
+
+
+%macro TRANSFORM_ADD_8 0
+; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add8_8, 3, 4, 8
+    lea               r3, [r2*3]
+    TR_ADD_SSE_8_8
+    add               r1, 64
+    lea               r0, [r0+r2*4]
+    TR_ADD_SSE_8_8
+    RET
+
+; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add16_8, 3, 4, 7
+    pxor              m0, m0
+    lea               r3, [r2*3]
+    TR_ADD_SSE_16_32_8  0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+%rep 3
+    add                r1, 128
+    lea                r0, [r0+r2*4]
+    TR_ADD_SSE_16_32_8  0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+%endrep
+    RET
+
+; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add32_8, 3, 4, 7
+    pxor               m0, m0
+    TR_ADD_SSE_16_32_8  0, r0,    r0+16
+    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+%rep 15
+    add                r1, 128
+    lea                r0, [r0+r2*2]
+    TR_ADD_SSE_16_32_8  0, r0,    r0+16
+    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+%endrep
+    RET
+%endmacro
+
+INIT_XMM sse2
+TRANSFORM_ADD_8
+INIT_XMM avx
+TRANSFORM_ADD_8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add32_8, 3, 4, 7
+    pxor              m0, m0
+    lea               r3, [r2*3]
+    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%rep 7
+    add                r1, 256
+    lea                r0, [r0+r2*4]
+    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%endrep
+    RET
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
+;-----------------------------------------------------------------------------
+%macro TR_ADD_SSE_8_10 4
+    mova              m0, [%4]
+    mova              m1, [%4+16]
+    mova              m2, [%4+32]
+    mova              m3, [%4+48]
+    paddw             m0, [%1+0   ]
+    paddw             m1, [%1+%2  ]
+    paddw             m2, [%1+%2*2]
+    paddw             m3, [%1+%3  ]
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova       [%1+0   ], m0
+    mova       [%1+%2  ], m1
+    mova       [%1+%2*2], m2
+    mova       [%1+%3  ], m3
+%endmacro
+
+%macro TR_ADD_MMX4_10 3
+    mova              m0, [%1+0   ]
+    mova              m1, [%1+%2  ]
+    paddw             m0, [%3]
+    paddw             m1, [%3+8]
+    CLIPW             m0, m2, m3
+    CLIPW             m1, m2, m3
+    mova       [%1+0   ], m0
+    mova       [%1+%2  ], m1
+%endmacro
+
+%macro TRANS_ADD_SSE_16_10 3
+    mova              m0, [%3]
+    mova              m1, [%3+16]
+    mova              m2, [%3+32]
+    mova              m3, [%3+48]
+    paddw             m0, [%1      ]
+    paddw             m1, [%1+16   ]
+    paddw             m2, [%1+%2   ]
+    paddw             m3, [%1+%2+16]
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova      [%1      ], m0
+    mova      [%1+16   ], m1
+    mova      [%1+%2   ], m2
+    mova      [%1+%2+16], m3
+%endmacro
+
+%macro TRANS_ADD_SSE_32_10 2
+    mova              m0, [%2]
+    mova              m1, [%2+16]
+    mova              m2, [%2+32]
+    mova              m3, [%2+48]
+
+    paddw             m0, [%1   ]
+    paddw             m1, [%1+16]
+    paddw             m2, [%1+32]
+    paddw             m3, [%1+48]
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova         [%1   ], m0
+    mova         [%1+16], m1
+    mova         [%1+32], m2
+    mova         [%1+48], m3
+%endmacro
+
+%macro TRANS_ADD16_AVX2 4
+    mova              m0, [%4]
+    mova              m1, [%4+32]
+    mova              m2, [%4+64]
+    mova              m3, [%4+96]
+
+    paddw             m0, [%1+0   ]
+    paddw             m1, [%1+%2  ]
+    paddw             m2, [%1+%2*2]
+    paddw             m3, [%1+%3  ]
+
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova       [%1+0   ], m0
+    mova       [%1+%2  ], m1
+    mova       [%1+%2*2], m2
+    mova       [%1+%3  ], m3
+%endmacro
+
+%macro TRANS_ADD32_AVX2 3
+    mova              m0, [%3]
+    mova              m1, [%3+32]
+    mova              m2, [%3+64]
+    mova              m3, [%3+96]
+
+    paddw             m0, [%1      ]
+    paddw             m1, [%1+32   ]
+    paddw             m2, [%1+%2   ]
+    paddw             m3, [%1+%2+32]
+
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova      [%1      ], m0
+    mova      [%1+32   ], m1
+    mova      [%1+%2   ], m2
+    mova      [%1+%2+32], m3
+%endmacro
+
+
+INIT_MMX mmxext
+cglobal hevc_transform_add4_10,3,4, 6
+    pxor              m2, m2
+    mova              m3, [max_pixels_10]
+    TR_ADD_MMX4_10     r0, r2, r1
+    add               r1, 16
+    lea               r0, [r0+2*r2]
+    TR_ADD_MMX4_10     r0, r2, r1
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal hevc_transform_add8_10,3,4,6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+    lea               r3, [r2*3]
+
+    TR_ADD_SSE_8_10      r0, r2, r3, r1
+    lea               r0, [r0+r2*4]
+    add               r1, 64
+    TR_ADD_SSE_8_10      r0, r2, r3, r1
+    RET
+
+cglobal hevc_transform_add16_10,3,4,6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+
+    TRANS_ADD_SSE_16_10 r0, r2, r1
+%rep 7
+    lea                 r0, [r0+r2*2]
+    add                 r1, 64
+    TRANS_ADD_SSE_16_10 r0, r2, r1
+%endrep
+    RET
+
+cglobal hevc_transform_add32_10,3,4,6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+
+    TRANS_ADD_SSE_32_10 r0, r1
+%rep 31
+    lea                 r0, [r0+r2]
+    add                 r1, 64
+    TRANS_ADD_SSE_32_10 r0, r1
+%endrep
+    RET
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+
+cglobal hevc_transform_add16_10,3,4,6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+    lea               r3, [r2*3]
+
+    TRANS_ADD16_AVX2  r0, r2, r3, r1
+%rep 3
+    lea               r0, [r0+r2*4]
+    add               r1, 128
+    TRANS_ADD16_AVX2  r0, r2, r3, r1
+%endrep
+    RET
+
+cglobal hevc_transform_add32_10,3,4,6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+
+    TRANS_ADD32_AVX2  r0, r2, r1
+%rep 15
+    lea               r0, [r0+r2*2]
+    add               r1, 128
+    TRANS_ADD32_AVX2  r0, r2, r1
+%endrep
+    RET
+%endif ;HAVE_AVX_EXTERNAL
diff --git a/libavcodec/x86/hevc_sao.asm b/libavcodec/x86/hevc_sao.asm
new file mode 100644
index 0000000000..888a28afa7
--- /dev/null
+++ b/libavcodec/x86/hevc_sao.asm
@@ -0,0 +1,340 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC 8bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+pb_eo:                   db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+cextern pb_1
+cextern pb_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
+
+%macro HEVC_SAO_BAND_FILTER_INIT 0
+    and            leftq, 31
+    movd             xm0, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm1, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm2, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm3, leftd
+
+    SPLATW            m0, xm0
+    SPLATW            m1, xm1
+    SPLATW            m2, xm2
+    SPLATW            m3, xm3
+%if mmsize > 16
+    SPLATW            m4, [offsetq + 2]
+    SPLATW            m5, [offsetq + 4]
+    SPLATW            m6, [offsetq + 6]
+    SPLATW            m7, [offsetq + 8]
+%else
+    movq              m7, [offsetq + 2]
+    SPLATW            m4, m7, 0
+    SPLATW            m5, m7, 1
+    SPLATW            m6, m7, 2
+    SPLATW            m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+    pxor             m14, m14
+
+%else ; ARCH_X86_32
+    mova  [rsp+mmsize*0], m0
+    mova  [rsp+mmsize*1], m1
+    mova  [rsp+mmsize*2], m2
+    mova  [rsp+mmsize*3], m3
+    mova  [rsp+mmsize*4], m4
+    mova  [rsp+mmsize*5], m5
+    mova  [rsp+mmsize*6], m6
+    pxor              m0, m0
+    %assign MMSIZE mmsize
+    %define m14 m0
+    %define m13 m1
+    %define  m9 m2
+    %define  m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+    mov          heightd, r7m
+%endmacro
+
+%macro HEVC_SAO_BAND_FILTER_COMPUTE 2
+    psraw             %1, %2, 3
+%if ARCH_X86_64
+    pcmpeqw          m10, %1, m0
+    pcmpeqw          m11, %1, m1
+    pcmpeqw          m12, %1, m2
+    pcmpeqw           %1, m3
+    pand             m10, m4
+    pand             m11, m5
+    pand             m12, m6
+    pand              %1, m7
+    por              m10, m11
+    por              m12, %1
+    por              m10, m12
+    paddw             %2, m10
+%else ; ARCH_X86_32
+    pcmpeqw           m4, %1, [rsp+MMSIZE*0]
+    pcmpeqw           m5, %1, [rsp+MMSIZE*1]
+    pcmpeqw           m6, %1, [rsp+MMSIZE*2]
+    pcmpeqw           %1, [rsp+MMSIZE*3]
+    pand              m4, [rsp+MMSIZE*4]
+    pand              m5, [rsp+MMSIZE*5]
+    pand              m6, [rsp+MMSIZE*6]
+    pand              %1, m7
+    por               m4, m5
+    por               m6, %1
+    por               m4, m6
+    paddw             %2, m4
+%endif ; ARCH
+%endmacro
+
+;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+;                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
+%macro HEVC_SAO_BAND_FILTER 2
+cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+    HEVC_SAO_BAND_FILTER_INIT
+
+align 16
+.loop:
+%if %1 == 8
+    movq              m8, [srcq]
+    punpcklbw         m8, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
+    packuswb          m8, m14
+    movq          [dstq], m8
+%endif ; %1 == 8
+
+%assign i 0
+%rep %2
+    mova             m13, [srcq + i]
+    punpcklbw         m8, m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
+    punpckhbw        m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
+    packuswb          m8, m13
+    mova      [dstq + i], m8
+%assign i i+mmsize
+%endrep
+
+%if %1 == 48
+INIT_XMM cpuname
+
+    mova             m13, [srcq + i]
+    punpcklbw         m8, m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
+    punpckhbw        m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
+    packuswb          m8, m13
+    mova      [dstq + i], m8
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif ; %1 == 48
+
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    REP_RET
+%endmacro
+
+
+%macro HEVC_SAO_BAND_FILTER_FUNCS 0
+HEVC_SAO_BAND_FILTER  8, 0
+HEVC_SAO_BAND_FILTER 16, 1
+HEVC_SAO_BAND_FILTER 32, 2
+HEVC_SAO_BAND_FILTER 48, 2
+HEVC_SAO_BAND_FILTER 64, 4
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+HEVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER  8, 0
+HEVC_SAO_BAND_FILTER 16, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 32, 1
+HEVC_SAO_BAND_FILTER 48, 1
+HEVC_SAO_BAND_FILTER 64, 2
+%endif
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
+
+%define MAX_PB_SIZE  64
+%define PADDING_SIZE 32 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+
+%macro HEVC_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+    movsxd           eoq, dword eom
+%elif ARCH_X86_64
+    movsxd           eoq, eod
+%else
+    mov              eoq, r4m
+%endif
+    lea            tmp2q, [pb_eo]
+    movsx      a_strideq, byte [tmp2q+eoq*4+1]
+    movsx      b_strideq, byte [tmp2q+eoq*4+3]
+    imul       a_strideq, EDGE_SRCSTRIDE
+    imul       b_strideq, EDGE_SRCSTRIDE
+    movsx           tmpq, byte [tmp2q+eoq*4]
+    add        a_strideq, tmpq
+    movsx           tmpq, byte [tmp2q+eoq*4+2]
+    add        b_strideq, tmpq
+%endmacro
+
+%macro HEVC_SAO_EDGE_FILTER_COMPUTE 1
+    pminub            m4, m1, m2
+    pminub            m5, m1, m3
+    pcmpeqb           m2, m4
+    pcmpeqb           m3, m5
+    pcmpeqb           m4, m1
+    pcmpeqb           m5, m1
+    psubb             m4, m2
+    psubb             m5, m3
+    paddb             m4, m6
+    paddb             m4, m5
+
+    pshufb            m2, m0, m4
+%if %1 > 8
+    punpckhbw         m5, m7, m1
+    punpckhbw         m4, m2, m7
+    punpcklbw         m3, m7, m1
+    punpcklbw         m2, m7
+    pmaddubsw         m5, m4
+    pmaddubsw         m3, m2
+    packuswb          m3, m5
+%else
+    punpcklbw         m3, m7, m1
+    punpcklbw         m2, m7
+    pmaddubsw         m3, m2
+    packuswb          m3, m3
+%endif
+%endmacro
+
+;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+;                                             int eo, int width, int height);
+%macro HEVC_SAO_EDGE_FILTER 2-3
+%if ARCH_X86_64
+cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov          heightd, r6m
+
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
+%define eoq   srcq
+%define tmpq  heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov             srcq, srcm
+    mov          offsetq, r3m
+    mov       dststrideq, dststridem
+%endif ; ARCH
+
+%if mmsize > 16
+    vbroadcasti128    m0, [offsetq]
+%else
+    movu              m0, [offsetq]
+%endif
+    mova              m1, [pb_edge_shuffle]
+    packsswb          m0, m0
+    mova              m7, [pb_1]
+    pshufb            m0, m1
+    mova              m6, [pb_2]
+%if ARCH_X86_32
+    mov          heightd, r6m
+%endif
+
+align 16
+.loop:
+
+%if %1 == 8
+    movq              m1, [srcq]
+    movq              m2, [srcq + a_strideq]
+    movq              m3, [srcq + b_strideq]
+    HEVC_SAO_EDGE_FILTER_COMPUTE %1
+    movq          [dstq], m3
+%endif
+
+%assign i 0
+%rep %2
+    mova              m1, [srcq + i]
+    movu              m2, [srcq + a_strideq + i]
+    movu              m3, [srcq + b_strideq + i]
+    HEVC_SAO_EDGE_FILTER_COMPUTE %1
+    mov%3     [dstq + i], m3
+%assign i i+mmsize
+%endrep
+
+%if %1 == 48
+INIT_XMM cpuname
+
+    mova              m1, [srcq + i]
+    movu              m2, [srcq + a_strideq + i]
+    movu              m3, [srcq + b_strideq + i]
+    HEVC_SAO_EDGE_FILTER_COMPUTE %1
+    mova      [dstq + i], m3
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif
+
+    add             dstq, dststrideq
+    add             srcq, EDGE_SRCSTRIDE
+    dec          heightd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+HEVC_SAO_EDGE_FILTER  8, 0
+HEVC_SAO_EDGE_FILTER 16, 1, a
+HEVC_SAO_EDGE_FILTER 32, 2, a
+HEVC_SAO_EDGE_FILTER 48, 2, a
+HEVC_SAO_EDGE_FILTER 64, 4, a
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 32, 1, a
+HEVC_SAO_EDGE_FILTER 48, 1, u
+HEVC_SAO_EDGE_FILTER 64, 2, a
+%endif
diff --git a/libavcodec/x86/hevc_sao_10bit.asm b/libavcodec/x86/hevc_sao_10bit.asm
new file mode 100644
index 0000000000..f45fc56cfa
--- /dev/null
+++ b/libavcodec/x86/hevc_sao_10bit.asm
@@ -0,0 +1,433 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC 10/12bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m2:     times 16 dw -2
+pw_mask10: times 16 dw 0x03FF
+pw_mask12: times 16 dw 0x0FFF
+pb_eo:              db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+cextern pw_m1
+cextern pw_1
+cextern pw_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
+
+%macro HEVC_SAO_BAND_FILTER_INIT 1
+    and            leftq, 31
+    movd             xm0, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm1, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm2, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm3, leftd
+
+    SPLATW            m0, xm0
+    SPLATW            m1, xm1
+    SPLATW            m2, xm2
+    SPLATW            m3, xm3
+%if mmsize > 16
+    SPLATW            m4, [offsetq + 2]
+    SPLATW            m5, [offsetq + 4]
+    SPLATW            m6, [offsetq + 6]
+    SPLATW            m7, [offsetq + 8]
+%else
+    movq              m7, [offsetq + 2]
+    SPLATW            m4, m7, 0
+    SPLATW            m5, m7, 1
+    SPLATW            m6, m7, 2
+    SPLATW            m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+    mova             m13, [pw_mask %+ %1]
+    pxor             m14, m14
+
+%else ; ARCH_X86_32
+    mova  [rsp+mmsize*0], m0
+    mova  [rsp+mmsize*1], m1
+    mova  [rsp+mmsize*2], m2
+    mova  [rsp+mmsize*3], m3
+    mova  [rsp+mmsize*4], m4
+    mova  [rsp+mmsize*5], m5
+    mova  [rsp+mmsize*6], m6
+    mova              m1, [pw_mask %+ %1]
+    pxor              m0, m0
+    %assign MMSIZE mmsize
+    %define m14 m0
+    %define m13 m1
+    %define  m9 m2
+    %define  m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+    mov          heightd, r7m
+%endmacro
+
+%macro HEVC_SAO_BAND_FILTER_COMPUTE 3
+    psraw             %2, %3, %1-5
+%if ARCH_X86_64
+    pcmpeqw          m10, %2, m0
+    pcmpeqw          m11, %2, m1
+    pcmpeqw          m12, %2, m2
+    pcmpeqw           %2, m3
+    pand             m10, m4
+    pand             m11, m5
+    pand             m12, m6
+    pand              %2, m7
+    por              m10, m11
+    por              m12, %2
+    por              m10, m12
+    paddw             %3, m10
+%else ; ARCH_X86_32
+    pcmpeqw           m4, %2, [rsp+MMSIZE*0]
+    pcmpeqw           m5, %2, [rsp+MMSIZE*1]
+    pcmpeqw           m6, %2, [rsp+MMSIZE*2]
+    pcmpeqw           %2, [rsp+MMSIZE*3]
+    pand              m4, [rsp+MMSIZE*4]
+    pand              m5, [rsp+MMSIZE*5]
+    pand              m6, [rsp+MMSIZE*6]
+    pand              %2, m7
+    por               m4, m5
+    por               m6, %2
+    por               m4, m6
+    paddw             %3, m4
+%endif ; ARCH
+%endmacro
+
+;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+;                                                   int16_t *sao_offset_val, int sao_left_class, int width, int height);
+%macro HEVC_SAO_BAND_FILTER 3
+cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+    HEVC_SAO_BAND_FILTER_INIT %1
+
+align 16
+.loop:
+%if %2 == 8
+    movu              m8, [srcq]
+    HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8
+    CLIPW             m8, m14, m13
+    movu          [dstq], m8
+%endif
+
+%assign i 0
+%rep %3
+    mova              m8, [srcq + i]
+    HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8
+    CLIPW             m8, m14, m13
+    mova      [dstq + i], m8
+
+    mova              m9, [srcq + i + mmsize]
+    HEVC_SAO_BAND_FILTER_COMPUTE %1, m8, m9
+    CLIPW             m9, m14, m13
+    mova      [dstq + i + mmsize], m9
+%assign i i+mmsize*2
+%endrep
+
+%if %2 == 48
+INIT_XMM cpuname
+    mova              m8, [srcq + i]
+    HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8
+    CLIPW             m8, m14, m13
+    mova      [dstq + i], m8
+
+    mova              m9, [srcq + i + mmsize]
+    HEVC_SAO_BAND_FILTER_COMPUTE %1, m8, m9
+    CLIPW             m9, m14, m13
+    mova      [dstq + i + mmsize], m9
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif ; %1 == 48
+
+    add             dstq, dststrideq
+    add             srcq, srcstrideq
+    dec          heightd
+    jg .loop
+    REP_RET
+%endmacro
+
+%macro HEVC_SAO_BAND_FILTER_FUNCS 0
+HEVC_SAO_BAND_FILTER 10,  8, 0
+HEVC_SAO_BAND_FILTER 10, 16, 1
+HEVC_SAO_BAND_FILTER 10, 32, 2
+HEVC_SAO_BAND_FILTER 10, 48, 2
+HEVC_SAO_BAND_FILTER 10, 64, 4
+
+HEVC_SAO_BAND_FILTER 12,  8, 0
+HEVC_SAO_BAND_FILTER 12, 16, 1
+HEVC_SAO_BAND_FILTER 12, 32, 2
+HEVC_SAO_BAND_FILTER 12, 48, 2
+HEVC_SAO_BAND_FILTER 12, 64, 4
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+HEVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 10,  8, 0
+HEVC_SAO_BAND_FILTER 10, 16, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 10, 32, 1
+HEVC_SAO_BAND_FILTER 10, 48, 1
+HEVC_SAO_BAND_FILTER 10, 64, 2
+
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 12,  8, 0
+HEVC_SAO_BAND_FILTER 12, 16, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 12, 32, 1
+HEVC_SAO_BAND_FILTER 12, 48, 1
+HEVC_SAO_BAND_FILTER 12, 64, 2
+%endif
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
+
+%define MAX_PB_SIZE  64
+%define PADDING_SIZE 32 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+
+%macro PMINUW 4
+%if cpuflag(sse4)
+    pminuw            %1, %2, %3
+%else
+    psubusw           %4, %2, %3
+    psubw             %1, %2, %4
+%endif
+%endmacro
+
+%macro HEVC_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+    movsxd           eoq, dword eom
+%elif ARCH_X86_64
+    movsxd           eoq, eod
+%else
+    mov              eoq, r4m
+%endif
+    lea            tmp2q, [pb_eo]
+    movsx      a_strideq, byte [tmp2q+eoq*4+1]
+    movsx      b_strideq, byte [tmp2q+eoq*4+3]
+    imul       a_strideq, EDGE_SRCSTRIDE >> 1
+    imul       b_strideq, EDGE_SRCSTRIDE >> 1
+    movsx           tmpq, byte [tmp2q+eoq*4]
+    add        a_strideq, tmpq
+    movsx           tmpq, byte [tmp2q+eoq*4+2]
+    add        b_strideq, tmpq
+%endmacro
+
+%macro HEVC_SAO_EDGE_FILTER_COMPUTE 0
+    PMINUW            m4, m1, m2, m6
+    PMINUW            m5, m1, m3, m7
+    pcmpeqw           m2, m4
+    pcmpeqw           m3, m5
+    pcmpeqw           m4, m1
+    pcmpeqw           m5, m1
+    psubw             m4, m2
+    psubw             m5, m3
+
+    paddw             m4, m5
+    pcmpeqw           m2, m4, [pw_m2]
+%if ARCH_X86_64
+    pcmpeqw           m3, m4, m13
+    pcmpeqw           m5, m4, m0
+    pcmpeqw           m6, m4, m14
+    pcmpeqw           m7, m4, m15
+    pand              m2, m8
+    pand              m3, m9
+    pand              m5, m10
+    pand              m6, m11
+    pand              m7, m12
+%else
+    pcmpeqw           m3, m4, [pw_m1]
+    pcmpeqw           m5, m4, m0
+    pcmpeqw           m6, m4, [pw_1]
+    pcmpeqw           m7, m4, [pw_2]
+    pand              m2, [rsp+MMSIZE*0]
+    pand              m3, [rsp+MMSIZE*1]
+    pand              m5, [rsp+MMSIZE*2]
+    pand              m6, [rsp+MMSIZE*3]
+    pand              m7, [rsp+MMSIZE*4]
+%endif
+    paddw             m2, m3
+    paddw             m5, m6
+    paddw             m2, m7
+    paddw             m2, m1
+    paddw             m2, m5
+%endmacro
+
+;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+;                                                   int eo, int width, int height);
+%macro HEVC_SAO_EDGE_FILTER 3
+%if ARCH_X86_64
+cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov          heightd, r6m
+    add        a_strideq, a_strideq
+    add        b_strideq, b_strideq
+
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
+%assign MMSIZE mmsize
+%define eoq   srcq
+%define tmpq  heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+%define m8 m1
+%define m9 m2
+%define m10 m3
+%define m11 m4
+%define m12 m5
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov             srcq, srcm
+    mov          offsetq, r3m
+    mov       dststrideq, dststridem
+    add        a_strideq, a_strideq
+    add        b_strideq, b_strideq
+
+%endif ; ARCH
+
+%if cpuflag(avx2)
+    SPLATW            m8, [offsetq+2]
+    SPLATW            m9, [offsetq+4]
+    SPLATW           m10, [offsetq+0]
+    SPLATW           m11, [offsetq+6]
+    SPLATW           m12, [offsetq+8]
+%else
+    movq             m10, [offsetq+0]
+    movd             m12, [offsetq+6]
+    SPLATW            m8, xm10, 1
+    SPLATW            m9, xm10, 2
+    SPLATW           m10, xm10, 0
+    SPLATW           m11, xm12, 0
+    SPLATW           m12, xm12, 1
+%endif
+    pxor              m0, m0
+%if ARCH_X86_64
+    mova             m13, [pw_m1]
+    mova             m14, [pw_1]
+    mova             m15, [pw_2]
+%else
+    mov          heightd, r6m
+    mova  [rsp+mmsize*0], m8
+    mova  [rsp+mmsize*1], m9
+    mova  [rsp+mmsize*2], m10
+    mova  [rsp+mmsize*3], m11
+    mova  [rsp+mmsize*4], m12
+%endif
+
+align 16
+.loop:
+
+%if %2 == 8
+    mova              m1, [srcq]
+    movu              m2, [srcq+a_strideq]
+    movu              m3, [srcq+b_strideq]
+
+    HEVC_SAO_EDGE_FILTER_COMPUTE
+    CLIPW             m2, m0, [pw_mask %+ %1]
+    movu          [dstq], m2
+%endif
+
+%assign i 0
+%rep %3
+    mova              m1, [srcq + i]
+    movu              m2, [srcq+a_strideq + i]
+    movu              m3, [srcq+b_strideq + i]
+    HEVC_SAO_EDGE_FILTER_COMPUTE
+    CLIPW             m2, m0, [pw_mask %+ %1]
+    mova      [dstq + i], m2
+
+    mova              m1, [srcq + i + mmsize]
+    movu              m2, [srcq+a_strideq + i + mmsize]
+    movu              m3, [srcq+b_strideq + i + mmsize]
+    HEVC_SAO_EDGE_FILTER_COMPUTE
+    CLIPW             m2, m0, [pw_mask %+ %1]
+    mova [dstq + i + mmsize], m2
+%assign i i+mmsize*2
+%endrep
+
+%if %2 == 48
+INIT_XMM cpuname
+    mova              m1, [srcq + i]
+    movu              m2, [srcq+a_strideq + i]
+    movu              m3, [srcq+b_strideq + i]
+    HEVC_SAO_EDGE_FILTER_COMPUTE
+    CLIPW             m2, m0, [pw_mask %+ %1]
+    mova              [dstq + i], m2
+
+    mova              m1, [srcq + i + mmsize]
+    movu              m2, [srcq+a_strideq + i + mmsize]
+    movu              m3, [srcq+b_strideq + i + mmsize]
+    HEVC_SAO_EDGE_FILTER_COMPUTE
+    CLIPW             m2, m0, [pw_mask %+ %1]
+    mova [dstq + i + mmsize], m2
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif
+
+    add             dstq, dststrideq
+    add             srcq, EDGE_SRCSTRIDE
+    dec          heightd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_EDGE_FILTER 10,  8, 0
+HEVC_SAO_EDGE_FILTER 10, 16, 1
+HEVC_SAO_EDGE_FILTER 10, 32, 2
+HEVC_SAO_EDGE_FILTER 10, 48, 2
+HEVC_SAO_EDGE_FILTER 10, 64, 4
+
+HEVC_SAO_EDGE_FILTER 12,  8, 0
+HEVC_SAO_EDGE_FILTER 12, 16, 1
+HEVC_SAO_EDGE_FILTER 12, 32, 2
+HEVC_SAO_EDGE_FILTER 12, 48, 2
+HEVC_SAO_EDGE_FILTER 12, 64, 4
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 10, 32, 1
+HEVC_SAO_EDGE_FILTER 10, 48, 1
+HEVC_SAO_EDGE_FILTER 10, 64, 2
+
+HEVC_SAO_EDGE_FILTER 12, 32, 1
+HEVC_SAO_EDGE_FILTER 12, 48, 1
+HEVC_SAO_EDGE_FILTER 12, 64, 2
+%endif
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
new file mode 100644
index 0000000000..ad8168fb5b
--- /dev/null
+++ b/libavcodec/x86/hevcdsp.h
@@ -0,0 +1,261 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_HEVCDSP_H
+#define AVCODEC_X86_HEVCDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+#define idct_dc_proto(size, bitd, opt) \
+                void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+
+#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
+dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
+dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
+dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
+dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
+dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
+
+
+#define PEL_PROTOTYPE(name, D, opt) \
+void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
+void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// MC functions
+///////////////////////////////////////////////////////////////////////////////
+
+#define EPEL_PROTOTYPES(fname, bitd, opt) \
+        PEL_PROTOTYPE(fname##4,  bitd, opt); \
+        PEL_PROTOTYPE(fname##6,  bitd, opt); \
+        PEL_PROTOTYPE(fname##8,  bitd, opt); \
+        PEL_PROTOTYPE(fname##12, bitd, opt); \
+        PEL_PROTOTYPE(fname##16, bitd, opt); \
+        PEL_PROTOTYPE(fname##24, bitd, opt); \
+        PEL_PROTOTYPE(fname##32, bitd, opt); \
+        PEL_PROTOTYPE(fname##48, bitd, opt); \
+        PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define QPEL_PROTOTYPES(fname, bitd, opt) \
+        PEL_PROTOTYPE(fname##4,  bitd, opt); \
+        PEL_PROTOTYPE(fname##8,  bitd, opt); \
+        PEL_PROTOTYPE(fname##12, bitd, opt); \
+        PEL_PROTOTYPE(fname##16, bitd, opt); \
+        PEL_PROTOTYPE(fname##24, bitd, opt); \
+        PEL_PROTOTYPE(fname##32, bitd, opt); \
+        PEL_PROTOTYPE(fname##48, bitd, opt); \
+        PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
+void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom,  int _wx, int _ox); \
+void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)
+
+#define WEIGHTING_PROTOTYPES(bitd, opt) \
+        WEIGHTING_PROTOTYPE(2, bitd, opt); \
+        WEIGHTING_PROTOTYPE(4, bitd, opt); \
+        WEIGHTING_PROTOTYPE(6, bitd, opt); \
+        WEIGHTING_PROTOTYPE(8, bitd, opt); \
+        WEIGHTING_PROTOTYPE(12, bitd, opt); \
+        WEIGHTING_PROTOTYPE(16, bitd, opt); \
+        WEIGHTING_PROTOTYPE(24, bitd, opt); \
+        WEIGHTING_PROTOTYPE(32, bitd, opt); \
+        WEIGHTING_PROTOTYPE(48, bitd, opt); \
+        WEIGHTING_PROTOTYPE(64, bitd, opt)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL_PIXELS EPEL_PIXELS
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(pel_pixels ,  8, sse4);
+EPEL_PROTOTYPES(pel_pixels , 10, sse4);
+EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+
+void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+
+
+void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
+void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
+
+
+void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+///////////////////////////////////////////////////////////////////////////////
+// EPEL
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(epel_h ,  8, sse4);
+EPEL_PROTOTYPES(epel_h , 10, sse4);
+EPEL_PROTOTYPES(epel_h , 12, sse4);
+
+EPEL_PROTOTYPES(epel_v ,  8, sse4);
+EPEL_PROTOTYPES(epel_v , 10, sse4);
+EPEL_PROTOTYPES(epel_v , 12, sse4);
+
+EPEL_PROTOTYPES(epel_hv ,  8, sse4);
+EPEL_PROTOTYPES(epel_hv , 10, sse4);
+EPEL_PROTOTYPES(epel_hv , 12, sse4);
+
+PEL_PROTOTYPE(epel_h16, 8, avx2);
+PEL_PROTOTYPE(epel_h24, 8, avx2);
+PEL_PROTOTYPE(epel_h32, 8, avx2);
+PEL_PROTOTYPE(epel_h48, 8, avx2);
+PEL_PROTOTYPE(epel_h64, 8, avx2);
+
+PEL_PROTOTYPE(epel_h16,10, avx2);
+PEL_PROTOTYPE(epel_h24,10, avx2);
+PEL_PROTOTYPE(epel_h32,10, avx2);
+PEL_PROTOTYPE(epel_h48,10, avx2);
+PEL_PROTOTYPE(epel_h64,10, avx2);
+
+PEL_PROTOTYPE(epel_v16, 8, avx2);
+PEL_PROTOTYPE(epel_v24, 8, avx2);
+PEL_PROTOTYPE(epel_v32, 8, avx2);
+PEL_PROTOTYPE(epel_v48, 8, avx2);
+PEL_PROTOTYPE(epel_v64, 8, avx2);
+
+PEL_PROTOTYPE(epel_v16,10, avx2);
+PEL_PROTOTYPE(epel_v24,10, avx2);
+PEL_PROTOTYPE(epel_v32,10, avx2);
+PEL_PROTOTYPE(epel_v48,10, avx2);
+PEL_PROTOTYPE(epel_v64,10, avx2);
+
+PEL_PROTOTYPE(epel_hv16, 8, avx2);
+PEL_PROTOTYPE(epel_hv24, 8, avx2);
+PEL_PROTOTYPE(epel_hv32, 8, avx2);
+PEL_PROTOTYPE(epel_hv48, 8, avx2);
+PEL_PROTOTYPE(epel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(epel_hv16,10, avx2);
+PEL_PROTOTYPE(epel_hv24,10, avx2);
+PEL_PROTOTYPE(epel_hv32,10, avx2);
+PEL_PROTOTYPE(epel_hv48,10, avx2);
+PEL_PROTOTYPE(epel_hv64,10, avx2);
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL
+///////////////////////////////////////////////////////////////////////////////
+QPEL_PROTOTYPES(qpel_h ,  8, sse4);
+QPEL_PROTOTYPES(qpel_h , 10, sse4);
+QPEL_PROTOTYPES(qpel_h , 12, sse4);
+
+QPEL_PROTOTYPES(qpel_v,  8, sse4);
+QPEL_PROTOTYPES(qpel_v, 10, sse4);
+QPEL_PROTOTYPES(qpel_v, 12, sse4);
+
+QPEL_PROTOTYPES(qpel_hv,  8, sse4);
+QPEL_PROTOTYPES(qpel_hv, 10, sse4);
+QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+
+PEL_PROTOTYPE(qpel_h16, 8, avx2);
+PEL_PROTOTYPE(qpel_h24, 8, avx2);
+PEL_PROTOTYPE(qpel_h32, 8, avx2);
+PEL_PROTOTYPE(qpel_h48, 8, avx2);
+PEL_PROTOTYPE(qpel_h64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_h16,10, avx2);
+PEL_PROTOTYPE(qpel_h24,10, avx2);
+PEL_PROTOTYPE(qpel_h32,10, avx2);
+PEL_PROTOTYPE(qpel_h48,10, avx2);
+PEL_PROTOTYPE(qpel_h64,10, avx2);
+
+PEL_PROTOTYPE(qpel_v16, 8, avx2);
+PEL_PROTOTYPE(qpel_v24, 8, avx2);
+PEL_PROTOTYPE(qpel_v32, 8, avx2);
+PEL_PROTOTYPE(qpel_v48, 8, avx2);
+PEL_PROTOTYPE(qpel_v64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_v16,10, avx2);
+PEL_PROTOTYPE(qpel_v24,10, avx2);
+PEL_PROTOTYPE(qpel_v32,10, avx2);
+PEL_PROTOTYPE(qpel_v48,10, avx2);
+PEL_PROTOTYPE(qpel_v64,10, avx2);
+
+PEL_PROTOTYPE(qpel_hv16, 8, avx2);
+PEL_PROTOTYPE(qpel_hv24, 8, avx2);
+PEL_PROTOTYPE(qpel_hv32, 8, avx2);
+PEL_PROTOTYPE(qpel_hv48, 8, avx2);
+PEL_PROTOTYPE(qpel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_hv16,10, avx2);
+PEL_PROTOTYPE(qpel_hv24,10, avx2);
+PEL_PROTOTYPE(qpel_hv32,10, avx2);
+PEL_PROTOTYPE(qpel_hv48,10, avx2);
+PEL_PROTOTYPE(qpel_hv64,10, avx2);
+
+WEIGHTING_PROTOTYPES(8, sse4);
+WEIGHTING_PROTOTYPES(10, sse4);
+WEIGHTING_PROTOTYPES(12, sse4);
+
+///////////////////////////////////////////////////////////////////////////////
+// TRANSFORM_ADD
+///////////////////////////////////////////////////////////////////////////////
+void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_transform_add16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+#endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 04203c22a0..ddc876dfcf 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -2,29 +2,31 @@
  * Copyright (c) 2013 Seppo Tomperi
  * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
  *
- * This file is part of Libav.
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
-
 #include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
-
+#include "libavcodec/get_bits.h" /* required for hevcdsp.h GetBitContext */
 #include "libavcodec/hevcdsp.h"
+#include "libavcodec/x86/hevcdsp.h"
 
 #define LFC_FUNC(DIR, DEPTH, OPT) \
 void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q);
@@ -32,40 +34,1081 @@ void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix,
 #define LFL_FUNC(DIR, DEPTH, OPT) \
 void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);
 
-#define LFC_FUNCS(type, depth) \
-    LFC_FUNC(h, depth, sse2)   \
-    LFC_FUNC(v, depth, sse2)
+#define LFC_FUNCS(type, depth, opt) \
+    LFC_FUNC(h, depth, opt)  \
+    LFC_FUNC(v, depth, opt)
+
+#define LFL_FUNCS(type, depth, opt) \
+    LFL_FUNC(h, depth, opt)  \
+    LFL_FUNC(v, depth, opt)
+
+LFC_FUNCS(uint8_t,   8, sse2)
+LFC_FUNCS(uint8_t,  10, sse2)
+LFC_FUNCS(uint8_t,  12, sse2)
+LFC_FUNCS(uint8_t,   8, avx)
+LFC_FUNCS(uint8_t,  10, avx)
+LFC_FUNCS(uint8_t,  12, avx)
+LFL_FUNCS(uint8_t,   8, sse2)
+LFL_FUNCS(uint8_t,  10, sse2)
+LFL_FUNCS(uint8_t,  12, sse2)
+LFL_FUNCS(uint8_t,   8, ssse3)
+LFL_FUNCS(uint8_t,  10, ssse3)
+LFL_FUNCS(uint8_t,  12, ssse3)
+LFL_FUNCS(uint8_t,   8, avx)
+LFL_FUNCS(uint8_t,  10, avx)
+LFL_FUNCS(uint8_t,  12, avx)
+
+#define IDCT_FUNCS(W, opt) \
+void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \
+void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs); \
+void ff_hevc_idct##W##_dc_12_##opt(int16_t *coeffs)
+
+IDCT_FUNCS(4x4,   mmxext);
+IDCT_FUNCS(8x8,   mmxext);
+IDCT_FUNCS(8x8,   sse2);
+IDCT_FUNCS(16x16, sse2);
+IDCT_FUNCS(32x32, sse2);
+IDCT_FUNCS(16x16, avx2);
+IDCT_FUNCS(32x32, avx2);
+
+#define mc_rep_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst,                                                 \
+                                                uint8_t *_src, ptrdiff_t _srcstride, int height,                \
+                                                intptr_t mx, intptr_t my, int width)                            \
+{                                                                                                               \
+    int i;                                                                                                      \
+    uint8_t *src;                                                                                               \
+    int16_t *dst;                                                                                               \
+    for (i = 0; i < W; i += step) {                                                                             \
+        src  = _src + (i * ((bitd + 7) / 8));                                                                   \
+        dst = _dst + i;                                                                                         \
+        ff_hevc_put_hevc_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width);            \
+    }                                                                                                           \
+}
+#define mc_rep_uni_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride,                        \
+                                                    uint8_t *_src, ptrdiff_t _srcstride, int height,            \
+                                                    intptr_t mx, intptr_t my, int width)                        \
+{                                                                                                               \
+    int i;                                                                                                      \
+    uint8_t *src;                                                                                               \
+    uint8_t *dst;                                                                                               \
+    for (i = 0; i < W; i += step) {                                                                             \
+        src = _src + (i * ((bitd + 7) / 8));                                                                    \
+        dst = _dst + (i * ((bitd + 7) / 8));                                                                    \
+        ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride,                     \
+                                                          height, mx, my, width);                               \
+    }                                                                                                           \
+}
+#define mc_rep_bi_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, uint8_t *_src,          \
+                                                   ptrdiff_t _srcstride, int16_t* _src2,                        \
+                                                   int height, intptr_t mx, intptr_t my, int width)             \
+{                                                                                                               \
+    int i;                                                                                                      \
+    uint8_t  *src;                                                                                              \
+    uint8_t  *dst;                                                                                              \
+    int16_t  *src2;                                                                                             \
+    for (i = 0; i < W ; i += step) {                                                                            \
+        src  = _src + (i * ((bitd + 7) / 8));                                                                   \
+        dst  = _dst + (i * ((bitd + 7) / 8));                                                                   \
+        src2 = _src2 + i;                                                                                       \
+        ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2,                \
+                                                          height, mx, my, width);                               \
+    }                                                                                                           \
+}
+
+#define mc_rep_funcs(name, bitd, step, W, opt)        \
+    mc_rep_func(name, bitd, step, W, opt);            \
+    mc_rep_uni_func(name, bitd, step, W, opt);        \
+    mc_rep_bi_func(name, bitd, step, W, opt)
+
+#define mc_rep_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst,                                                  \
+                                                 uint8_t *src, ptrdiff_t _srcstride, int height,                \
+                                                 intptr_t mx, intptr_t my, int width)                           \
+{                                                                                                               \
+    ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width);               \
+    ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)),              \
+                                                    _srcstride, height, mx, my, width);                         \
+}
+#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride,                         \
+                                                     uint8_t *src, ptrdiff_t _srcstride, int height,            \
+                                                     intptr_t mx, intptr_t my, int width)                       \
+{                                                                                                               \
+    ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width);\
+    ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride,            \
+                                                        src + (step1 * ((bitd + 7) / 8)), _srcstride,           \
+                                                        height, mx, my, width);                                 \
+}
+#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,            \
+                                                    ptrdiff_t _srcstride, int16_t* src2,                        \
+                                                    int height, intptr_t mx, intptr_t my, int width)            \
+{                                                                                                               \
+    ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\
+    ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride,             \
+                                                       src + (step1 * ((bitd + 7) / 8)), _srcstride,            \
+                                                       src2 + step1, height, mx, my, width);                    \
+}
+
+#define mc_rep_funcs(name, bitd, step, W, opt)        \
+    mc_rep_func(name, bitd, step, W, opt);            \
+    mc_rep_uni_func(name, bitd, step, W, opt);        \
+    mc_rep_bi_func(name, bitd, step, W, opt)
+
+#define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
+    mc_rep_func2(name, bitd, step1, step2, W, opt);     \
+    mc_rep_uni_func2(name, bitd, step1, step2, W, opt); \
+    mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
+
+#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+
+#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                                       \
+void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride,            \
+                                                 int height, intptr_t mx, intptr_t my, int width)             \
+                                                                                                              \
+{                                                                                                             \
+    ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width);                 \
+    ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                                    \
+void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,          \
+                                                    ptrdiff_t _srcstride, int16_t *src2,                      \
+                                                    int height, intptr_t mx, intptr_t my, int width)          \
+{                                                                                                             \
+    ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2,                     \
+                                                   height, mx, my, width);                                    \
+    ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2,\
+                                                   height, mx, my, width);                                    \
+}
+
+#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                                   \
+void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride,                       \
+                                                     uint8_t *src, ptrdiff_t _srcstride, int height,          \
+                                                     intptr_t mx, intptr_t my, int width)                     \
+{                                                                                                             \
+    ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride,                          \
+                                                      height, mx, my, width);                                 \
+    ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride,            \
+                                                      height, mx, my, width);                                 \
+}
+
+#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4)    \
+mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4);            \
+mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4);         \
+mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
+
+#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2)                                                \
+void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride,             \
+                                                int height, intptr_t mx, intptr_t my, int width)              \
+                                                                                                              \
+{                                                                                                             \
+    ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width);                  \
+    ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width);  \
+}
+
+#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2)                                             \
+void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,           \
+                                                   ptrdiff_t _srcstride, int16_t* src2,                       \
+                                                   int height, intptr_t mx, intptr_t my, int width)           \
+{                                                                                                             \
+    ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride,                            \
+                                                  src2, height, mx, my, width);                               \
+    ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride,              \
+                                                  src2+width2, height, mx, my, width);                        \
+}
+
+#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)                                            \
+void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride,                        \
+                                                    uint8_t *src, ptrdiff_t _srcstride, int height,           \
+                                                    intptr_t mx, intptr_t my, int width)                      \
+{                                                                                                             \
+    ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride,                           \
+                                                   height, mx, my, width);                                    \
+    ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride,             \
+                                                   height, mx, my, width);                                    \
+}
+
+#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2)    \
+mc_rep_mix_8(name, width1, width2, width3, opt1, opt2);            \
+mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2);         \
+mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
+
+#if HAVE_AVX2_EXTERNAL
+
+mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4);
+mc_rep_mixs_8(epel_hv,    48, 32, 16, avx2, sse4);
+mc_rep_mixs_8(epel_h ,    48, 32, 16, avx2, sse4);
+mc_rep_mixs_8(epel_v ,    48, 32, 16, avx2, sse4);
+
+mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32);
+mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(epel_hv,   24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(epel_h ,   24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(epel_v ,   24, 16, 8, avx2, sse4, 32);
+
+
+mc_rep_mixs_10(qpel_h ,   24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(qpel_v ,   24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(qpel_hv,   24, 16, 8, avx2, sse4, 32);
+
+
+mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2);//used for 10bit
+mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2); //used for 10bit
+
+mc_rep_funcs(pel_pixels, 8, 32, 64, avx2);
+
+mc_rep_func(pel_pixels, 10, 16, 32, avx2);
+mc_rep_func(pel_pixels, 10, 16, 48, avx2);
+mc_rep_func(pel_pixels, 10, 32, 64, avx2);
+
+mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2);
+mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2);
+mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2);
+
+mc_rep_funcs(epel_h, 8, 32, 64, avx2);
+
+mc_rep_funcs(epel_v, 8, 32, 64, avx2);
+
+mc_rep_funcs(epel_h, 10, 16, 32, avx2);
+mc_rep_funcs(epel_h, 10, 16, 48, avx2);
+mc_rep_funcs(epel_h, 10, 32, 64, avx2);
+
+mc_rep_funcs(epel_v, 10, 16, 32, avx2);
+mc_rep_funcs(epel_v, 10, 16, 48, avx2);
+mc_rep_funcs(epel_v, 10, 32, 64, avx2);
+
+
+mc_rep_funcs(epel_hv,  8, 32, 64, avx2);
+
+mc_rep_funcs(epel_hv, 10, 16, 32, avx2);
+mc_rep_funcs(epel_hv, 10, 16, 48, avx2);
+mc_rep_funcs(epel_hv, 10, 32, 64, avx2);
+
+mc_rep_funcs(qpel_h, 8, 32, 64, avx2);
+mc_rep_mixs_8(qpel_h ,  48, 32, 16, avx2, sse4);
+
+mc_rep_funcs(qpel_v, 8, 32, 64, avx2);
+mc_rep_mixs_8(qpel_v,  48, 32, 16, avx2, sse4);
+
+mc_rep_funcs(qpel_h, 10, 16, 32, avx2);
+mc_rep_funcs(qpel_h, 10, 16, 48, avx2);
+mc_rep_funcs(qpel_h, 10, 32, 64, avx2);
+
+mc_rep_funcs(qpel_v, 10, 16, 32, avx2);
+mc_rep_funcs(qpel_v, 10, 16, 48, avx2);
+mc_rep_funcs(qpel_v, 10, 32, 64, avx2);
+
+mc_rep_funcs(qpel_hv, 10, 16, 32, avx2);
+mc_rep_funcs(qpel_hv, 10, 16, 48, avx2);
+mc_rep_funcs(qpel_hv, 10, 32, 64, avx2);
+
+#endif //AVX2
+
+mc_rep_funcs(pel_pixels, 8, 16, 64, sse4);
+mc_rep_funcs(pel_pixels, 8, 16, 48, sse4);
+mc_rep_funcs(pel_pixels, 8, 16, 32, sse4);
+mc_rep_funcs(pel_pixels, 8,  8, 24, sse4);
+mc_rep_funcs(pel_pixels,10,  8, 64, sse4);
+mc_rep_funcs(pel_pixels,10,  8, 48, sse4);
+mc_rep_funcs(pel_pixels,10,  8, 32, sse4);
+mc_rep_funcs(pel_pixels,10,  8, 24, sse4);
+mc_rep_funcs(pel_pixels,10,  8, 16, sse4);
+mc_rep_funcs(pel_pixels,10,  4, 12, sse4);
+mc_rep_funcs(pel_pixels,12,  8, 64, sse4);
+mc_rep_funcs(pel_pixels,12,  8, 48, sse4);
+mc_rep_funcs(pel_pixels,12,  8, 32, sse4);
+mc_rep_funcs(pel_pixels,12,  8, 24, sse4);
+mc_rep_funcs(pel_pixels,12,  8, 16, sse4);
+mc_rep_funcs(pel_pixels,12,  4, 12, sse4);
+
+mc_rep_funcs(epel_h, 8, 16, 64, sse4);
+mc_rep_funcs(epel_h, 8, 16, 48, sse4);
+mc_rep_funcs(epel_h, 8, 16, 32, sse4);
+mc_rep_funcs(epel_h, 8,  8, 24, sse4);
+mc_rep_funcs(epel_h,10,  8, 64, sse4);
+mc_rep_funcs(epel_h,10,  8, 48, sse4);
+mc_rep_funcs(epel_h,10,  8, 32, sse4);
+mc_rep_funcs(epel_h,10,  8, 24, sse4);
+mc_rep_funcs(epel_h,10,  8, 16, sse4);
+mc_rep_funcs(epel_h,10,  4, 12, sse4);
+mc_rep_funcs(epel_h,12,  8, 64, sse4);
+mc_rep_funcs(epel_h,12,  8, 48, sse4);
+mc_rep_funcs(epel_h,12,  8, 32, sse4);
+mc_rep_funcs(epel_h,12,  8, 24, sse4);
+mc_rep_funcs(epel_h,12,  8, 16, sse4);
+mc_rep_funcs(epel_h,12,  4, 12, sse4);
+mc_rep_funcs(epel_v, 8, 16, 64, sse4);
+mc_rep_funcs(epel_v, 8, 16, 48, sse4);
+mc_rep_funcs(epel_v, 8, 16, 32, sse4);
+mc_rep_funcs(epel_v, 8,  8, 24, sse4);
+mc_rep_funcs(epel_v,10,  8, 64, sse4);
+mc_rep_funcs(epel_v,10,  8, 48, sse4);
+mc_rep_funcs(epel_v,10,  8, 32, sse4);
+mc_rep_funcs(epel_v,10,  8, 24, sse4);
+mc_rep_funcs(epel_v,10,  8, 16, sse4);
+mc_rep_funcs(epel_v,10,  4, 12, sse4);
+mc_rep_funcs(epel_v,12,  8, 64, sse4);
+mc_rep_funcs(epel_v,12,  8, 48, sse4);
+mc_rep_funcs(epel_v,12,  8, 32, sse4);
+mc_rep_funcs(epel_v,12,  8, 24, sse4);
+mc_rep_funcs(epel_v,12,  8, 16, sse4);
+mc_rep_funcs(epel_v,12,  4, 12, sse4);
+mc_rep_funcs(epel_hv, 8, 16, 64, sse4);
+mc_rep_funcs(epel_hv, 8, 16, 48, sse4);
+mc_rep_funcs(epel_hv, 8, 16, 32, sse4);
+mc_rep_funcs(epel_hv, 8,  8, 24, sse4);
+mc_rep_funcs2(epel_hv,8,  8,  4, 12, sse4);
+mc_rep_funcs(epel_hv,10,  8, 64, sse4);
+mc_rep_funcs(epel_hv,10,  8, 48, sse4);
+mc_rep_funcs(epel_hv,10,  8, 32, sse4);
+mc_rep_funcs(epel_hv,10,  8, 24, sse4);
+mc_rep_funcs(epel_hv,10,  8, 16, sse4);
+mc_rep_funcs(epel_hv,10,  4, 12, sse4);
+mc_rep_funcs(epel_hv,12,  8, 64, sse4);
+mc_rep_funcs(epel_hv,12,  8, 48, sse4);
+mc_rep_funcs(epel_hv,12,  8, 32, sse4);
+mc_rep_funcs(epel_hv,12,  8, 24, sse4);
+mc_rep_funcs(epel_hv,12,  8, 16, sse4);
+mc_rep_funcs(epel_hv,12,  4, 12, sse4);
+
+mc_rep_funcs(qpel_h, 8, 16, 64, sse4);
+mc_rep_funcs(qpel_h, 8, 16, 48, sse4);
+mc_rep_funcs(qpel_h, 8, 16, 32, sse4);
+mc_rep_funcs(qpel_h, 8,  8, 24, sse4);
+mc_rep_funcs(qpel_h,10,  8, 64, sse4);
+mc_rep_funcs(qpel_h,10,  8, 48, sse4);
+mc_rep_funcs(qpel_h,10,  8, 32, sse4);
+mc_rep_funcs(qpel_h,10,  8, 24, sse4);
+mc_rep_funcs(qpel_h,10,  8, 16, sse4);
+mc_rep_funcs(qpel_h,10,  4, 12, sse4);
+mc_rep_funcs(qpel_h,12,  8, 64, sse4);
+mc_rep_funcs(qpel_h,12,  8, 48, sse4);
+mc_rep_funcs(qpel_h,12,  8, 32, sse4);
+mc_rep_funcs(qpel_h,12,  8, 24, sse4);
+mc_rep_funcs(qpel_h,12,  8, 16, sse4);
+mc_rep_funcs(qpel_h,12,  4, 12, sse4);
+mc_rep_funcs(qpel_v, 8, 16, 64, sse4);
+mc_rep_funcs(qpel_v, 8, 16, 48, sse4);
+mc_rep_funcs(qpel_v, 8, 16, 32, sse4);
+mc_rep_funcs(qpel_v, 8,  8, 24, sse4);
+mc_rep_funcs(qpel_v,10,  8, 64, sse4);
+mc_rep_funcs(qpel_v,10,  8, 48, sse4);
+mc_rep_funcs(qpel_v,10,  8, 32, sse4);
+mc_rep_funcs(qpel_v,10,  8, 24, sse4);
+mc_rep_funcs(qpel_v,10,  8, 16, sse4);
+mc_rep_funcs(qpel_v,10,  4, 12, sse4);
+mc_rep_funcs(qpel_v,12,  8, 64, sse4);
+mc_rep_funcs(qpel_v,12,  8, 48, sse4);
+mc_rep_funcs(qpel_v,12,  8, 32, sse4);
+mc_rep_funcs(qpel_v,12,  8, 24, sse4);
+mc_rep_funcs(qpel_v,12,  8, 16, sse4);
+mc_rep_funcs(qpel_v,12,  4, 12, sse4);
+mc_rep_funcs(qpel_hv, 8,  8, 64, sse4);
+mc_rep_funcs(qpel_hv, 8,  8, 48, sse4);
+mc_rep_funcs(qpel_hv, 8,  8, 32, sse4);
+mc_rep_funcs(qpel_hv, 8,  8, 24, sse4);
+mc_rep_funcs(qpel_hv, 8,  8, 16, sse4);
+mc_rep_funcs2(qpel_hv,8,  8,  4, 12, sse4);
+mc_rep_funcs(qpel_hv,10,  8, 64, sse4);
+mc_rep_funcs(qpel_hv,10,  8, 48, sse4);
+mc_rep_funcs(qpel_hv,10,  8, 32, sse4);
+mc_rep_funcs(qpel_hv,10,  8, 24, sse4);
+mc_rep_funcs(qpel_hv,10,  8, 16, sse4);
+mc_rep_funcs(qpel_hv,10,  4, 12, sse4);
+mc_rep_funcs(qpel_hv,12,  8, 64, sse4);
+mc_rep_funcs(qpel_hv,12,  8, 48, sse4);
+mc_rep_funcs(qpel_hv,12,  8, 32, sse4);
+mc_rep_funcs(qpel_hv,12,  8, 24, sse4);
+mc_rep_funcs(qpel_hv,12,  8, 16, sse4);
+mc_rep_funcs(qpel_hv,12,  4, 12, sse4);
+
+#define mc_rep_uni_w(bitd, step, W, opt) \
+void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
+                                               int height, int denom,  int _wx, int _ox)                                \
+{                                                                                                                       \
+    int i;                                                                                                              \
+    int16_t *src;                                                                                                       \
+    uint8_t *dst;                                                                                                       \
+    for (i = 0; i < W; i += step) {                                                                                     \
+        src= _src + i;                                                                                                  \
+        dst= _dst + (i * ((bitd + 7) / 8));                                                                             \
+        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src,                                   \
+                                                     height, denom, _wx, _ox);                                          \
+    }                                                                                                                   \
+}
+
+mc_rep_uni_w(8, 6, 12, sse4);
+mc_rep_uni_w(8, 8, 16, sse4);
+mc_rep_uni_w(8, 8, 24, sse4);
+mc_rep_uni_w(8, 8, 32, sse4);
+mc_rep_uni_w(8, 8, 48, sse4);
+mc_rep_uni_w(8, 8, 64, sse4);
+
+mc_rep_uni_w(10, 6, 12, sse4);
+mc_rep_uni_w(10, 8, 16, sse4);
+mc_rep_uni_w(10, 8, 24, sse4);
+mc_rep_uni_w(10, 8, 32, sse4);
+mc_rep_uni_w(10, 8, 48, sse4);
+mc_rep_uni_w(10, 8, 64, sse4);
+
+mc_rep_uni_w(12, 6, 12, sse4);
+mc_rep_uni_w(12, 8, 16, sse4);
+mc_rep_uni_w(12, 8, 24, sse4);
+mc_rep_uni_w(12, 8, 32, sse4);
+mc_rep_uni_w(12, 8, 48, sse4);
+mc_rep_uni_w(12, 8, 64, sse4);
+
+#define mc_rep_bi_w(bitd, step, W, opt) \
+void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
+                                              int16_t *_src2, int height,                                               \
+                                              int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)                      \
+{                                                                                                                       \
+    int i;                                                                                                              \
+    int16_t *src;                                                                                                       \
+    int16_t *src2;                                                                                                      \
+    uint8_t *dst;                                                                                                       \
+    for (i = 0; i < W; i += step) {                                                                                     \
+        src  = _src  + i;                                                                                               \
+        src2 = _src2 + i;                                                                                               \
+        dst  = _dst  + (i * ((bitd + 7) / 8));                                                                          \
+        ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, src2,                             \
+                                                     height, denom, _wx0, _wx1, _ox0, _ox1);                             \
+    }                                                                                                                   \
+}
+
+mc_rep_bi_w(8, 6, 12, sse4);
+mc_rep_bi_w(8, 8, 16, sse4);
+mc_rep_bi_w(8, 8, 24, sse4);
+mc_rep_bi_w(8, 8, 32, sse4);
+mc_rep_bi_w(8, 8, 48, sse4);
+mc_rep_bi_w(8, 8, 64, sse4);
+
+mc_rep_bi_w(10, 6, 12, sse4);
+mc_rep_bi_w(10, 8, 16, sse4);
+mc_rep_bi_w(10, 8, 24, sse4);
+mc_rep_bi_w(10, 8, 32, sse4);
+mc_rep_bi_w(10, 8, 48, sse4);
+mc_rep_bi_w(10, 8, 64, sse4);
+
+mc_rep_bi_w(12, 6, 12, sse4);
+mc_rep_bi_w(12, 8, 16, sse4);
+mc_rep_bi_w(12, 8, 24, sse4);
+mc_rep_bi_w(12, 8, 32, sse4);
+mc_rep_bi_w(12, 8, 48, sse4);
+mc_rep_bi_w(12, 8, 64, sse4);
+
+#define mc_uni_w_func(name, bitd, W, opt) \
+void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,         \
+                                                      uint8_t *_src, ptrdiff_t _srcstride,          \
+                                                      int height, int denom,                        \
+                                                      int _wx, int _ox,                             \
+                                                      intptr_t mx, intptr_t my, int width)          \
+{                                                                                                   \
+    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                                            \
+    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);     \
+    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, height, denom, _wx, _ox);\
+}
+
+#define mc_uni_w_funcs(name, bitd, opt)       \
+        mc_uni_w_func(name, bitd, 4, opt);    \
+        mc_uni_w_func(name, bitd, 8, opt);    \
+        mc_uni_w_func(name, bitd, 12, opt);   \
+        mc_uni_w_func(name, bitd, 16, opt);   \
+        mc_uni_w_func(name, bitd, 24, opt);   \
+        mc_uni_w_func(name, bitd, 32, opt);   \
+        mc_uni_w_func(name, bitd, 48, opt);   \
+        mc_uni_w_func(name, bitd, 64, opt)
+
+mc_uni_w_funcs(pel_pixels, 8, sse4);
+mc_uni_w_func(pel_pixels, 8, 6, sse4);
+mc_uni_w_funcs(epel_h, 8, sse4);
+mc_uni_w_func(epel_h, 8, 6, sse4);
+mc_uni_w_funcs(epel_v, 8, sse4);
+mc_uni_w_func(epel_v, 8, 6, sse4);
+mc_uni_w_funcs(epel_hv, 8, sse4);
+mc_uni_w_func(epel_hv, 8, 6, sse4);
+mc_uni_w_funcs(qpel_h, 8, sse4);
+mc_uni_w_funcs(qpel_v, 8, sse4);
+mc_uni_w_funcs(qpel_hv, 8, sse4);
+
+mc_uni_w_funcs(pel_pixels, 10, sse4);
+mc_uni_w_func(pel_pixels, 10, 6, sse4);
+mc_uni_w_funcs(epel_h, 10, sse4);
+mc_uni_w_func(epel_h, 10, 6, sse4);
+mc_uni_w_funcs(epel_v, 10, sse4);
+mc_uni_w_func(epel_v, 10, 6, sse4);
+mc_uni_w_funcs(epel_hv, 10, sse4);
+mc_uni_w_func(epel_hv, 10, 6, sse4);
+mc_uni_w_funcs(qpel_h, 10, sse4);
+mc_uni_w_funcs(qpel_v, 10, sse4);
+mc_uni_w_funcs(qpel_hv, 10, sse4);
+
+mc_uni_w_funcs(pel_pixels, 12, sse4);
+mc_uni_w_func(pel_pixels, 12, 6, sse4);
+mc_uni_w_funcs(epel_h, 12, sse4);
+mc_uni_w_func(epel_h, 12, 6, sse4);
+mc_uni_w_funcs(epel_v, 12, sse4);
+mc_uni_w_func(epel_v, 12, 6, sse4);
+mc_uni_w_funcs(epel_hv, 12, sse4);
+mc_uni_w_func(epel_hv, 12, 6, sse4);
+mc_uni_w_funcs(qpel_h, 12, sse4);
+mc_uni_w_funcs(qpel_v, 12, sse4);
+mc_uni_w_funcs(qpel_hv, 12, sse4);
+
+#define mc_bi_w_func(name, bitd, W, opt) \
+void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,           \
+                                                     uint8_t *_src, ptrdiff_t _srcstride,            \
+                                                     int16_t *_src2,                                 \
+                                                     int height, int denom,                          \
+                                                     int _wx0, int _wx1, int _ox0, int _ox1,         \
+                                                     intptr_t mx, intptr_t my, int width)            \
+{                                                                                                    \
+    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                                             \
+    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);      \
+    ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, _src2,                         \
+                                              height, denom, _wx0, _wx1, _ox0, _ox1);                \
+}
+
+#define mc_bi_w_funcs(name, bitd, opt)       \
+        mc_bi_w_func(name, bitd, 4, opt);    \
+        mc_bi_w_func(name, bitd, 8, opt);    \
+        mc_bi_w_func(name, bitd, 12, opt);   \
+        mc_bi_w_func(name, bitd, 16, opt);   \
+        mc_bi_w_func(name, bitd, 24, opt);   \
+        mc_bi_w_func(name, bitd, 32, opt);   \
+        mc_bi_w_func(name, bitd, 48, opt);   \
+        mc_bi_w_func(name, bitd, 64, opt)
+
+mc_bi_w_funcs(pel_pixels, 8, sse4);
+mc_bi_w_func(pel_pixels, 8, 6, sse4);
+mc_bi_w_funcs(epel_h, 8, sse4);
+mc_bi_w_func(epel_h, 8, 6, sse4);
+mc_bi_w_funcs(epel_v, 8, sse4);
+mc_bi_w_func(epel_v, 8, 6, sse4);
+mc_bi_w_funcs(epel_hv, 8, sse4);
+mc_bi_w_func(epel_hv, 8, 6, sse4);
+mc_bi_w_funcs(qpel_h, 8, sse4);
+mc_bi_w_funcs(qpel_v, 8, sse4);
+mc_bi_w_funcs(qpel_hv, 8, sse4);
+
+mc_bi_w_funcs(pel_pixels, 10, sse4);
+mc_bi_w_func(pel_pixels, 10, 6, sse4);
+mc_bi_w_funcs(epel_h, 10, sse4);
+mc_bi_w_func(epel_h, 10, 6, sse4);
+mc_bi_w_funcs(epel_v, 10, sse4);
+mc_bi_w_func(epel_v, 10, 6, sse4);
+mc_bi_w_funcs(epel_hv, 10, sse4);
+mc_bi_w_func(epel_hv, 10, 6, sse4);
+mc_bi_w_funcs(qpel_h, 10, sse4);
+mc_bi_w_funcs(qpel_v, 10, sse4);
+mc_bi_w_funcs(qpel_hv, 10, sse4);
+
+mc_bi_w_funcs(pel_pixels, 12, sse4);
+mc_bi_w_func(pel_pixels, 12, 6, sse4);
+mc_bi_w_funcs(epel_h, 12, sse4);
+mc_bi_w_func(epel_h, 12, 6, sse4);
+mc_bi_w_funcs(epel_v, 12, sse4);
+mc_bi_w_func(epel_v, 12, 6, sse4);
+mc_bi_w_funcs(epel_hv, 12, sse4);
+mc_bi_w_func(epel_hv, 12, 6, sse4);
+mc_bi_w_funcs(qpel_h, 12, sse4);
+mc_bi_w_funcs(qpel_v, 12, sse4);
+mc_bi_w_funcs(qpel_hv, 12, sse4);
+#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
 
-#define LFL_FUNCS(type, depth) \
-    LFL_FUNC(h, depth, ssse3)  \
-    LFL_FUNC(v, depth, ssse3)
+#define SAO_BAND_FILTER_FUNCS(bitd, opt)                                                                                   \
+void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,  \
+                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
+void ff_hevc_sao_band_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
+void ff_hevc_sao_band_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
+void ff_hevc_sao_band_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
+void ff_hevc_sao_band_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+                                             int16_t *sao_offset_val, int sao_left_class, int width, int height)
 
-LFC_FUNCS(uint8_t, 8)
-LFC_FUNCS(uint8_t, 10)
-LFL_FUNCS(uint8_t, 8)
-LFL_FUNCS(uint8_t, 10)
+SAO_BAND_FILTER_FUNCS(8,  sse2);
+SAO_BAND_FILTER_FUNCS(10, sse2);
+SAO_BAND_FILTER_FUNCS(12, sse2);
+SAO_BAND_FILTER_FUNCS(8,   avx);
+SAO_BAND_FILTER_FUNCS(10,  avx);
+SAO_BAND_FILTER_FUNCS(12,  avx);
+SAO_BAND_FILTER_FUNCS(8,  avx2);
+SAO_BAND_FILTER_FUNCS(10, avx2);
+SAO_BAND_FILTER_FUNCS(12, avx2);
+
+#define SAO_BAND_INIT(bitd, opt) do {                                       \
+    c->sao_band_filter[0]      = ff_hevc_sao_band_filter_8_##bitd##_##opt;  \
+    c->sao_band_filter[1]      = ff_hevc_sao_band_filter_16_##bitd##_##opt; \
+    c->sao_band_filter[2]      = ff_hevc_sao_band_filter_32_##bitd##_##opt; \
+    c->sao_band_filter[3]      = ff_hevc_sao_band_filter_48_##bitd##_##opt; \
+    c->sao_band_filter[4]      = ff_hevc_sao_band_filter_64_##bitd##_##opt; \
+} while (0)
+
+#define SAO_EDGE_FILTER_FUNCS(bitd, opt)                                                                                    \
+void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,  \
+                                              int eo, int width, int height);                                               \
+void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+                                               int eo, int width, int height);                                              \
+void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+                                               int eo, int width, int height);                                              \
+void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+                                               int eo, int width, int height);                                              \
+void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+                                               int eo, int width, int height);                                              \
+
+SAO_EDGE_FILTER_FUNCS(8, ssse3);
+SAO_EDGE_FILTER_FUNCS(8, avx2);
+SAO_EDGE_FILTER_FUNCS(10, sse2);
+SAO_EDGE_FILTER_FUNCS(10, avx2);
+SAO_EDGE_FILTER_FUNCS(12, sse2);
+SAO_EDGE_FILTER_FUNCS(12, avx2);
+
+#define SAO_EDGE_INIT(bitd, opt) do {                                       \
+    c->sao_edge_filter[0]      = ff_hevc_sao_edge_filter_8_##bitd##_##opt;  \
+    c->sao_edge_filter[1]      = ff_hevc_sao_edge_filter_16_##bitd##_##opt; \
+    c->sao_edge_filter[2]      = ff_hevc_sao_edge_filter_32_##bitd##_##opt; \
+    c->sao_edge_filter[3]      = ff_hevc_sao_edge_filter_48_##bitd##_##opt; \
+    c->sao_edge_filter[4]      = ff_hevc_sao_edge_filter_64_##bitd##_##opt; \
+} while (0)
+
+#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt )           \
+        PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
+        PEL_LINK(pointer, 2, my , mx , fname##6 ,  bitd, opt ); \
+        PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
+        PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
+        PEL_LINK(pointer, 5, my , mx , fname##16,  bitd, opt ); \
+        PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
+        PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
+        PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
+        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt )
+#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt)           \
+        PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
+        PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
+        PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
+        PEL_LINK(pointer, 5, my , mx , fname##16,  bitd, opt ); \
+        PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
+        PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
+        PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
+        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt )
 
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
 
     if (bit_depth == 8) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
+            c->transform_add[0]    =  ff_hevc_transform_add4_8_mmxext;
+        }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
+
+            }
+            SAO_BAND_INIT(8, sse2);
+
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
+
+            c->transform_add[1]    = ff_hevc_transform_add8_8_sse2;
+            c->transform_add[2]    = ff_hevc_transform_add16_8_sse2;
+            c->transform_add[3]    = ff_hevc_transform_add32_8_sse2;
         }
-        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
-            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
-            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+        if (EXTERNAL_SSSE3(cpu_flags)) {
+            if(ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+            }
+            SAO_EDGE_INIT(8, ssse3);
+        }
+        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
+            }
+            SAO_BAND_INIT(8, avx);
+
+            c->transform_add[1]    = ff_hevc_transform_add8_8_avx;
+            c->transform_add[2]    = ff_hevc_transform_add16_8_avx;
+            c->transform_add[3]    = ff_hevc_transform_add32_8_avx;
+        }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
+            if (ARCH_X86_64) {
+                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
+                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
+                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
+
+                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
+                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
+                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
+
+                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
+                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
+                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
+
+                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
+                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
+                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
+
+                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
+                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
+                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
+
+                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
+                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
+                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
+
+                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
+                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
+                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
+
+                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
+                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
+                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
+
+                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
+                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
+                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
+
+                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
+                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
+                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
+                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
+                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
+                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
+                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
+                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
+                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
+                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
+                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
+                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
+                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
+            }
+            SAO_BAND_INIT(8, avx2);
+
+            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
+            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
+            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
+
+            c->transform_add[3]    = ff_hevc_transform_add32_8_avx2;
         }
     } else if (bit_depth == 10) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->transform_add[0] = ff_hevc_transform_add4_10_mmxext;
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
+        }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
+            }
+            SAO_BAND_INIT(10, sse2);
+            SAO_EDGE_INIT(10, sse2);
+
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
+
+            c->transform_add[1]    = ff_hevc_transform_add8_10_sse2;
+            c->transform_add[2]    = ff_hevc_transform_add16_10_sse2;
+            c->transform_add[3]    = ff_hevc_transform_add32_10_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
             c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
         }
+        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
+            }
+            SAO_BAND_INIT(10, avx);
+        }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
+            if (ARCH_X86_64) {
+                c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+                c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
+
+                c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+                c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
+
+                c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+                c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+                c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+                c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+                c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+                c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+
+                c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
+                c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
+                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
+                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
+                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
+
+                c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
+                c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
+                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
+                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
+                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
+
+                c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
+                c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
+                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
+                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
+                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
+
+                c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
+                c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
+                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
+                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
+                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
+
+                c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
+                c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
+                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
+                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
+                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
+
+                c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
+                c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
+                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
+                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
+                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
+
+                c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
+                c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
+                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
+                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
+                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
+
+                c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
+                c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
+                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
+                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
+                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
+
+                c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
+                c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
+                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
+                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
+                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
+
+                c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
+                c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
+                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
+                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
+                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
+                c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
+                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
+                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
+                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
+                c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
+                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
+                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
+                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
+                c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
+                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
+                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
+                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
+                c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
+                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
+                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
+                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
+                c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
+                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
+                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
+                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
+                c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
+                c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
+                c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
+                c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
+                c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
+                c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
+                c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
+                c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
+                c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
+                c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
+                c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
+                c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
+            }
+            SAO_BAND_INIT(10, avx2);
+            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_10_avx2;
+            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_10_avx2;
+            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_10_avx2;
+
+            c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
+            c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
+
+        }
+    } else if (bit_depth == 12) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext;
+        }
+        if (EXTERNAL_SSE2(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
+            }
+            SAO_BAND_INIT(12, sse2);
+            SAO_EDGE_INIT(12, sse2);
+
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2;
+        }
+        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
+            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
+        }
+        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
+            }
+            SAO_BAND_INIT(12, avx);
+        }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
+
+            SAO_BAND_INIT(12, avx2);
+            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_12_avx2;
+            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_12_avx2;
+            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_12_avx2;
+        }
     }
 }
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index b8929b96d4..82fb8934af 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -1,20 +1,27 @@
 ;******************************************************************************
+;*
+;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
+;* Copyright (c)      Nick Kurshev <nickols_k@mail.ru>
+;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
+;* Copyright (c) 2013 Daniel Kang
+;*
 ;* SIMD-optimized halfpel functions
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -22,26 +29,49 @@
 
 SECTION_RODATA
 cextern pb_1
+cextern pw_2
+pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7
+
+cextern pw_8192
 
 SECTION .text
 
 ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_x2, 4,5,4
+%else
 cglobal put_pixels8_x2, 4,5
+%endif
     lea          r4, [r2*2]
 .loop:
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
     mova       [r0], m0
     mova    [r0+r2], m1
     add          r1, r4
     add          r0, r4
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m1
@@ -99,6 +129,9 @@ INIT_MMX mmxext
 PUT_PIXELS_16
 INIT_MMX 3dnow
 PUT_PIXELS_16
+; The 8_X2 macro can easily be used here
+INIT_XMM sse2
+PUT_PIXELS8_X2
 
 
 ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -191,20 +224,24 @@ PUT_NO_RND_PIXELS8_X2_EXACT
 
 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_y2, 4,5,3
+%else
 cglobal put_pixels8_y2, 4,5
+%endif
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
     add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     add          r0, r4
     add          r1, r4
     PAVGB        m2, m1
@@ -221,6 +258,9 @@ INIT_MMX mmxext
 PUT_PIXELS8_Y2
 INIT_MMX 3dnow
 PUT_PIXELS8_Y2
+; actually, put_pixels16_y2_sse2
+INIT_XMM sse2
+PUT_PIXELS8_Y2
 
 
 ; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -334,26 +374,48 @@ AVG_PIXELS8
 
 ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro AVG_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_x2, 4,5,4
+%else
 cglobal avg_pixels8_x2, 4,5
+%endif
     lea          r4, [r2*2]
+%if notcpuflag(mmxext)
+    pcmpeqd      m5, m5
+    paddb        m5, m5
+%endif
 .loop:
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m2, [r1+r2+1]
-    PAVGB        m0, [r0]
-    PAVGB        m2, [r0+r2]
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
+%if cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
+%else
+    PAVGB        m0, [r1+1], m3, m5
+    PAVGB        m2, [r1+r2+1], m4, m5
+%endif
+    PAVGB        m0, [r0], m3, m5
+    PAVGB        m2, [r0+r2], m4, m5
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m2
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m2, [r1+r2+1]
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
+%if cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
+%else
+    PAVGB        m0, [r1+1], m3, m5
+    PAVGB        m2, [r1+r2+1], m4, m5
+%endif
     add          r0, r4
     add          r1, r4
-    PAVGB        m0, [r0]
-    PAVGB        m2, [r0+r2]
+    PAVGB        m0, [r0], m3, m5
+    PAVGB        m2, [r0+r2], m4, m5
     mova       [r0], m0
     mova    [r0+r2], m2
     add          r0, r4
@@ -362,40 +424,45 @@ cglobal avg_pixels8_x2, 4,5
     REP_RET
 %endmacro
 
+INIT_MMX mmx
+AVG_PIXELS8_X2
 INIT_MMX mmxext
 AVG_PIXELS8_X2
 INIT_MMX 3dnow
 AVG_PIXELS8_X2
+; actually avg_pixels16_x2
+INIT_XMM sse2
+AVG_PIXELS8_X2
 
 
 ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro AVG_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_y2, 4,5,3
+%else
 cglobal avg_pixels8_y2, 4,5
+%endif
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
     add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
-    PAVGB        m0, m3
-    PAVGB        m1, m4
+    PAVGB        m0, [r0+r2]
+    PAVGB        m1, [r0+r4]
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     PAVGB        m2, m1
     PAVGB        m1, m0
     add          r0, r4
     add          r1, r4
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
-    PAVGB        m2, m3
-    PAVGB        m1, m4
+    PAVGB        m2, [r0+r2]
+    PAVGB        m1, [r0+r4]
     mova    [r0+r2], m2
     mova    [r0+r4], m1
     add          r0, r4
@@ -408,11 +475,16 @@ INIT_MMX mmxext
 AVG_PIXELS8_Y2
 INIT_MMX 3dnow
 AVG_PIXELS8_Y2
+; actually avg_pixels16_y2
+INIT_XMM sse2
+AVG_PIXELS8_Y2
 
 
 ; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro AVG_PIXELS8_XY2 0
-cglobal avg_pixels8_xy2, 4,5
+; Note this is not correctly rounded, and is therefore used for
+; not-bitexact output
+%macro AVG_APPROX_PIXELS8_XY2 0
+cglobal avg_approx_pixels8_xy2, 4,5
     mova         m6, [pb_1]
     lea          r4, [r2*2]
     mova         m0, [r1]
@@ -449,6 +521,160 @@ cglobal avg_pixels8_xy2, 4,5
 %endmacro
 
 INIT_MMX mmxext
-AVG_PIXELS8_XY2
+AVG_APPROX_PIXELS8_XY2
 INIT_MMX 3dnow
-AVG_PIXELS8_XY2
+AVG_APPROX_PIXELS8_XY2
+
+
+; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro SET_PIXELS_XY2 1
+%if cpuflag(sse2)
+cglobal %1_pixels16_xy2, 4,5,8
+%else
+cglobal %1_pixels8_xy2, 4,5
+%endif
+    pxor        m7, m7
+    mova        m6, [pw_2]
+    movu        m0, [r1]
+    movu        m4, [r1+1]
+    mova        m1, m0
+    mova        m5, m4
+    punpcklbw   m0, m7
+    punpcklbw   m4, m7
+    punpckhbw   m1, m7
+    punpckhbw   m5, m7
+    paddusw     m4, m0
+    paddusw     m5, m1
+    xor         r4, r4
+    add         r1, r2
+.loop:
+    movu        m0, [r1+r4]
+    movu        m2, [r1+r4+1]
+    mova        m1, m0
+    mova        m3, m2
+    punpcklbw   m0, m7
+    punpcklbw   m2, m7
+    punpckhbw   m1, m7
+    punpckhbw   m3, m7
+    paddusw     m0, m2
+    paddusw     m1, m3
+    paddusw     m4, m6
+    paddusw     m5, m6
+    paddusw     m4, m0
+    paddusw     m5, m1
+    psrlw       m4, 2
+    psrlw       m5, 2
+%ifidn %1, avg
+    mova        m3, [r0+r4]
+    packuswb    m4, m5
+    PAVGB       m4, m3
+%else
+    packuswb    m4, m5
+%endif
+    mova   [r0+r4], m4
+    add         r4, r2
+
+    movu        m2, [r1+r4]
+    movu        m4, [r1+r4+1]
+    mova        m3, m2
+    mova        m5, m4
+    punpcklbw   m2, m7
+    punpcklbw   m4, m7
+    punpckhbw   m3, m7
+    punpckhbw   m5, m7
+    paddusw     m4, m2
+    paddusw     m5, m3
+    paddusw     m0, m6
+    paddusw     m1, m6
+    paddusw     m0, m4
+    paddusw     m1, m5
+    psrlw       m0, 2
+    psrlw       m1, 2
+%ifidn %1, avg
+    mova        m3, [r0+r4]
+    packuswb    m0, m1
+    PAVGB       m0, m3
+%else
+    packuswb    m0, m1
+%endif
+    mova   [r0+r4], m0
+    add         r4, r2
+    sub        r3d, 2
+    jnz .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+SET_PIXELS_XY2 avg
+INIT_MMX 3dnow
+SET_PIXELS_XY2 avg
+INIT_XMM sse2
+SET_PIXELS_XY2 put
+SET_PIXELS_XY2 avg
+
+%macro SSSE3_PIXELS_XY2 1-2
+%if %0 == 2 ; sse2
+cglobal %1_pixels16_xy2, 4,5,%2
+    mova        m4, [pb_interleave16]
+%else
+cglobal %1_pixels8_xy2, 4,5
+    mova        m4, [pb_interleave8]
+%endif
+    mova        m5, [pb_1]
+    movu        m0, [r1]
+    movu        m1, [r1+1]
+    pmaddubsw   m0, m5
+    pmaddubsw   m1, m5
+    xor         r4, r4
+    add         r1, r2
+.loop:
+    movu        m2, [r1+r4]
+    movu        m3, [r1+r4+1]
+    pmaddubsw   m2, m5
+    pmaddubsw   m3, m5
+    paddusw     m0, m2
+    paddusw     m1, m3
+    pmulhrsw    m0, [pw_8192]
+    pmulhrsw    m1, [pw_8192]
+%ifidn %1, avg
+    mova        m6, [r0+r4]
+    packuswb    m0, m1
+    pshufb      m0, m4
+    pavgb       m0, m6
+%else
+    packuswb    m0, m1
+    pshufb      m0, m4
+%endif
+    mova   [r0+r4], m0
+    add         r4, r2
+
+    movu        m0, [r1+r4]
+    movu        m1, [r1+r4+1]
+    pmaddubsw   m0, m5
+    pmaddubsw   m1, m5
+    paddusw     m2, m0
+    paddusw     m3, m1
+    pmulhrsw    m2, [pw_8192]
+    pmulhrsw    m3, [pw_8192]
+%ifidn %1, avg
+    mova        m6, [r0+r4]
+    packuswb    m2, m3
+    pshufb      m2, m4
+    pavgb       m2, m6
+%else
+    packuswb    m2, m3
+    pshufb      m2, m4
+%endif
+    mova   [r0+r4], m2
+    add         r4, r2
+    sub        r3d, 2
+    jnz .loop
+    REP_RET
+%endmacro
+
+INIT_MMX ssse3
+SSSE3_PIXELS_XY2 put
+SSSE3_PIXELS_XY2 avg
+INIT_XMM ssse3
+SSSE3_PIXELS_XY2 put, 6
+SSSE3_PIXELS_XY2 avg, 7
diff --git a/libavcodec/x86/hpeldsp.h b/libavcodec/x86/hpeldsp.h
index 47b0b8b825..5fae990a4f 100644
--- a/libavcodec/x86/hpeldsp.h
+++ b/libavcodec/x86/hpeldsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,12 +27,27 @@ void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
 
 void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+
 void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
 
 void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
 void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
 
 #endif /* AVCODEC_X86_HPELDSP_H */
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 59cb5e143a..5c5da28360 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
@@ -40,6 +40,14 @@ void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
@@ -74,10 +82,12 @@ void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
-                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+                                      ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
 
 #define avg_pixels8_mmx         ff_avg_pixels8_mmx
 #define avg_pixels8_x2_mmx      ff_avg_pixels8_x2_mmx
@@ -111,11 +121,13 @@ void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
 #undef PAVGB
 #undef STATIC
 
+#if HAVE_MMX
 CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
 CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
 
 CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
 CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
+#endif
 
 /***********************************/
 /* MMX rounding */
@@ -138,11 +150,13 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
 #undef PAVGBP
 #undef PAVGB
 
+#if HAVE_MMX
 CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)
 CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)
 
 CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)
 CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
+#endif
 
 #endif /* HAVE_INLINE_ASM */
 
@@ -156,32 +170,49 @@ CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
     CALL_2X_PIXELS(avg_pixels16           ## CPUEXT, ff_avg_pixels8           ## CPUEXT, 8) \
     CALL_2X_PIXELS(avg_pixels16_x2        ## CPUEXT, ff_avg_pixels8_x2        ## CPUEXT, 8) \
     CALL_2X_PIXELS(avg_pixels16_y2        ## CPUEXT, ff_avg_pixels8_y2        ## CPUEXT, 8) \
-    CALL_2X_PIXELS(avg_pixels16_xy2       ## CPUEXT, ff_avg_pixels8_xy2       ## CPUEXT, 8)
+    CALL_2X_PIXELS(avg_pixels16_xy2       ## CPUEXT, ff_avg_pixels8_xy2       ## CPUEXT, 8) \
+    CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
 
 HPELDSP_AVG_PIXELS16(_3dnow)
 HPELDSP_AVG_PIXELS16(_mmxext)
 
 #endif /* HAVE_YASM */
 
+#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                             \
+    if (HAVE_MMX_EXTERNAL)                                                  \
+    c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU;
+
+#if HAVE_MMX_INLINE
 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
     do {                                                                        \
-        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
+        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
         c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
         c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
         c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
     } while (0)
+#else
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
+    do {                                                                        \
+        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
+    } while (0)
+#endif
 
 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags)
 {
-#if HAVE_MMX_INLINE
     SET_HPEL_FUNCS(put,        [0], 16, mmx);
     SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
     SET_HPEL_FUNCS(avg,        [0], 16, mmx);
     SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
     SET_HPEL_FUNCS(put,        [1],  8, mmx);
     SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
-    SET_HPEL_FUNCS(avg,        [1],  8, mmx);
-#endif /* HAVE_MMX_INLINE */
+    if (HAVE_MMX_EXTERNAL) {
+        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
+        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
+    }
+#if HAVE_MMX_INLINE
+    c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
+#endif
 }
 
 static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
@@ -193,6 +224,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
     c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
     c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
     c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
 
     c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
     c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
@@ -200,6 +232,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
     c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
     c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
     c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
 
     if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
         c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
@@ -207,11 +240,11 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
 
-        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
-        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
+        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
+        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
     }
 
-    if (flags & AV_CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
+    if (CONFIG_VP3_DECODER && flags & AV_CODEC_FLAG_BITEXACT) {
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
     }
@@ -227,6 +260,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
     c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
     c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
     c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
 
     c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
     c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
@@ -234,6 +268,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
     c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
     c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
     c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
 
     if (!(flags & AV_CODEC_FLAG_BITEXACT)){
         c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
@@ -241,11 +276,11 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
 
-        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
-        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
+        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;
+        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;
     }
 
-    if (flags & AV_CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
+    if (CONFIG_VP3_DECODER && flags & AV_CODEC_FLAG_BITEXACT) {
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
     }
@@ -259,11 +294,27 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
         // these functions are slower than mmx on AMD, but faster on Intel
         c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
         c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+        c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_sse2;
+        c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_sse2;
+        c->put_pixels_tab[0][3]        = ff_put_pixels16_xy2_sse2;
         c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
+        c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
+        c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_sse2;
+        c->avg_pixels_tab[0][3]        = ff_avg_pixels16_xy2_sse2;
     }
 #endif /* HAVE_SSE2_EXTERNAL */
 }
 
+static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags, int cpu_flags)
+{
+#if HAVE_SSSE3_EXTERNAL
+    c->put_pixels_tab[0][3]            = ff_put_pixels16_xy2_ssse3;
+    c->avg_pixels_tab[0][3]            = ff_avg_pixels16_xy2_ssse3;
+    c->put_pixels_tab[1][3]            = ff_put_pixels8_xy2_ssse3;
+    c->avg_pixels_tab[1][3]            = ff_avg_pixels8_xy2_ssse3;
+#endif
+}
+
 av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -279,4 +330,7 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
 
     if (EXTERNAL_SSE2(cpu_flags))
         hpeldsp_init_sse2(c, flags, cpu_flags);
+
+    if (EXTERNAL_SSSE3(cpu_flags))
+        hpeldsp_init_ssse3(c, flags, cpu_flags);
 }
diff --git a/libavcodec/x86/hpeldsp_mmx.c b/libavcodec/x86/hpeldsp_mmx.c
deleted file mode 100644
index c93c78e40e..0000000000
--- a/libavcodec/x86/hpeldsp_mmx.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2001 Fabrice Bellard
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "hpeldsp.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  1%1, %%mm1           \n\t"
-            "movq  %0, %%mm3            \n\t"
-            PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-            PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
-            :"memory");
-        pixels += line_size;
-        block += line_size;
-    } while (--h);
-}
-
-#endif /* HAVE_MMX_INLINE */
diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c
index d854e8a2fc..e20d0658cd 100644
--- a/libavcodec/x86/hpeldsp_rnd_template.c
+++ b/libavcodec/x86/hpeldsp_rnd_template.c
@@ -7,20 +7,20 @@
  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  * and improved by Zdenek Kabelac <kabi@users.sf.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
 #include <stdint.h>
 
 // put_pixels
-static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm__ volatile(
@@ -60,7 +60,7 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
         :REG_a, "memory");
 }
 
-static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm__ volatile(
@@ -106,7 +106,7 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff
         :REG_a, "memory");
 }
 
-static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm__ volatile(
@@ -135,33 +135,34 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
         :REG_a, "memory");
 }
 
-static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
         __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  1%1, %%mm1           \n\t"
-            "movq  %0, %%mm3            \n\t"
+            ".p2align 3                 \n\t"
+            "1:                         \n\t"
+            "movq  (%1), %%mm0          \n\t"
+            "movq  1(%1), %%mm1         \n\t"
+            "movq  (%2), %%mm3          \n\t"
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
             PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            "movq  8%1, %%mm0           \n\t"
-            "movq  9%1, %%mm1           \n\t"
-            "movq  8%0, %%mm3           \n\t"
+            "movq  %%mm0, (%2)          \n\t"
+            "movq  8(%1), %%mm0         \n\t"
+            "movq  9(%1), %%mm1         \n\t"
+            "movq  8(%2), %%mm3         \n\t"
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
             PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, 8%0           \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
+            "movq  %%mm0, 8(%2)         \n\t"
+            "add    %3, %1              \n\t"
+            "add    %3, %2              \n\t"
+            "subl   $1, %0              \n\t"
+            "jnz    1b                  \n\t"
+            :"+g"(h), "+S"(pixels), "+D"(block)
+            :"r"((x86_reg)line_size)
             :"memory");
-        pixels += line_size;
-        block += line_size;
-    } while (--h);
 }
 
-static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm__ volatile(
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index e7536da998..0dbe598421 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -1,28 +1,29 @@
 ;******************************************************************************
 ;* SIMD-optimized HuffYUV functions
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Christophe Gisquet
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
-pb_f: times 16 db 15
+cextern pb_15
 pb_zzzzzzzz77777777: times 8 db -1
 pb_7: times 8 db 7
 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
@@ -33,64 +34,72 @@ SECTION .text
 ; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
 ;                                     const uint8_t *diff, int w,
 ;                                     int *left, int *left_top)
-INIT_MMX mmxext
-cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
-    movq    mm0, [topq]
-    movq    mm2, mm0
-    movd    mm4, [left_topq]
-    psllq   mm2, 8
-    movq    mm1, mm0
-    por     mm4, mm2
-    movd    mm3, [leftq]
-    psubb   mm0, mm4 ; t-tl
+%macro HFYU_MEDIAN 0
+cglobal add_hfyu_median_pred, 6,6,8, dst, top, diff, w, left, left_top
+    movu    m0, [topq]
+    mova    m2, m0
+    movd    m4, [left_topq]
+    LSHIFT  m2, 1
+    mova    m1, m0
+    por     m4, m2
+    movd    m3, [leftq]
+    psubb   m0, m4 ; t-tl
     add    dstq, wq
     add    topq, wq
     add   diffq, wq
     neg      wq
     jmp .skip
 .loop:
-    movq    mm4, [topq+wq]
-    movq    mm0, mm4
-    psllq   mm4, 8
-    por     mm4, mm1
-    movq    mm1, mm0 ; t
-    psubb   mm0, mm4 ; t-tl
+    movu    m4, [topq+wq]
+    mova    m0, m4
+    LSHIFT  m4, 1
+    por     m4, m1
+    mova    m1, m0 ; t
+    psubb   m0, m4 ; t-tl
 .skip:
-    movq    mm2, [diffq+wq]
+    movu    m2, [diffq+wq]
 %assign i 0
-%rep 8
-    movq    mm4, mm0
-    paddb   mm4, mm3 ; t-tl+l
-    movq    mm5, mm3
-    pmaxub  mm3, mm1
-    pminub  mm5, mm1
-    pminub  mm3, mm4
-    pmaxub  mm3, mm5 ; median
-    paddb   mm3, mm2 ; +residual
+%rep mmsize
+    mova    m4, m0
+    paddb   m4, m3 ; t-tl+l
+    mova    m5, m3
+    pmaxub  m3, m1
+    pminub  m5, m1
+    pminub  m3, m4
+    pmaxub  m3, m5 ; median
+    paddb   m3, m2 ; +residual
 %if i==0
-    movq    mm7, mm3
-    psllq   mm7, 56
+    mova    m7, m3
+    LSHIFT  m7, mmsize-1
 %else
-    movq    mm6, mm3
-    psrlq   mm7, 8
-    psllq   mm6, 56
-    por     mm7, mm6
+    mova    m6, m3
+    RSHIFT  m7, 1
+    LSHIFT  m6, mmsize-1
+    por     m7, m6
 %endif
-%if i<7
-    psrlq   mm0, 8
-    psrlq   mm1, 8
-    psrlq   mm2, 8
+%if i<mmsize-1
+    RSHIFT  m0, 1
+    RSHIFT  m1, 1
+    RSHIFT  m2, 1
 %endif
 %assign i i+1
 %endrep
-    movq [dstq+wq], mm7
-    add      wq, 8
+    movu [dstq+wq], m7
+    add      wq, mmsize
     jl .loop
     movzx   r2d, byte [dstq-1]
     mov [leftq], r2d
     movzx   r2d, byte [topq-1]
     mov [left_topq], r2d
     RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmxext
+HFYU_MEDIAN
+%endif
+INIT_XMM sse2
+HFYU_MEDIAN
 
 
 %macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
@@ -148,7 +157,7 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
 
 INIT_XMM sse4
 cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
-    mova    m5, [pb_f]
+    mova    m5, [pb_15]
     mova    m6, [pb_zzzzzzzz77777777]
     mova    m4, [pb_zzzz3333zzzzbbbb]
     mova    m3, [pb_zz11zz55zz99zzdd]
@@ -163,3 +172,82 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
     ADD_HFYU_LEFT_LOOP 0, 1
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP 0, 0
+
+%macro ADD_BYTES 0
+cglobal add_bytes, 3,4,2, dst, src, w, size
+    mov  sizeq, wq
+    and  sizeq, -2*mmsize
+    jz  .2
+    add   dstq, sizeq
+    add   srcq, sizeq
+    neg  sizeq
+.1:
+    mova    m0, [srcq + sizeq]
+    mova    m1, [srcq + sizeq + mmsize]
+    paddb   m0, [dstq + sizeq]
+    paddb   m1, [dstq + sizeq + mmsize]
+    mova   [dstq + sizeq], m0
+    mova   [dstq + sizeq + mmsize], m1
+    add  sizeq, 2*mmsize
+    jl .1
+.2:
+    and     wq, 2*mmsize-1
+    jz    .end
+    add   dstq, wq
+    add   srcq, wq
+    neg     wq
+.3:
+    mov  sizeb, [srcq + wq]
+    add [dstq + wq], sizeb
+    inc     wq
+    jl .3
+.end:
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+ADD_BYTES
+%endif
+INIT_XMM sse2
+ADD_BYTES
+
+; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
+;                               intptr_t w, uint8_t *left)
+%macro LEFT_BGR32 0
+cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
+    shl           wq, 2
+    movd          m0, [leftq]
+    lea         dstq, [dstq + wq]
+    lea         srcq, [srcq + wq]
+    LSHIFT        m0, mmsize-4
+    neg           wq
+.loop:
+    movu          m1, [srcq+wq]
+    mova          m2, m1
+%if mmsize == 8
+    punpckhdq     m0, m0
+%endif
+    LSHIFT        m1, 4
+    paddb         m1, m2
+%if mmsize == 16
+    pshufd        m0, m0, q3333
+    mova          m2, m1
+    LSHIFT        m1, 8
+    paddb         m1, m2
+%endif
+    paddb         m0, m1
+    movu   [dstq+wq], m0
+    add           wq, mmsize
+    jl         .loop
+    movd          m0, [dstq-4]
+    movd     [leftq], m0
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+LEFT_BGR32
+%endif
+INIT_XMM sse2
+LEFT_BGR32
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 75537d7a4c..3ced3c0a1c 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,20 +25,29 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/huffyuvdsp.h"
 
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, intptr_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w);
+
 void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
-                                    const uint8_t *diff, int w,
+                                    const uint8_t *diff, intptr_t w,
                                     int *left, int *left_top);
+void ff_add_hfyu_median_pred_sse2(uint8_t *dst, const uint8_t *top,
+                                  const uint8_t *diff, intptr_t w,
+                                  int *left, int *left_top);
 
 int  ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
-                                 int w, int left);
+                                 intptr_t w, int left);
 int  ff_add_hfyu_left_pred_sse4(uint8_t *dst, const uint8_t *src,
-                                int w, int left);
+                                intptr_t w, int left);
 
-#if HAVE_INLINE_ASM
+void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
+                                     intptr_t w, uint8_t *left);
+void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src,
+                                      intptr_t w, uint8_t *left);
 
-#if HAVE_7REGS
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
 static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
-                                      const uint8_t *diff, int w,
+                                      const uint8_t *diff, intptr_t w,
                                       int *left, int *left_top)
 {
     x86_reg w2 = -w;
@@ -72,56 +81,34 @@ static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
     *left     = l;
     *left_top = tl;
 }
-#endif /* HAVE_7REGS */
-
-static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
-    x86_reg i = 0;
-
-    __asm__ volatile (
-        "jmp          2f                \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %0), %%mm0         \n\t"
-        "movq   (%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, (%2, %0)      \n\t"
-        "movq  8(%1, %0), %%mm0         \n\t"
-        "movq  8(%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, 8(%2, %0)     \n\t"
-        "add         $16, %0            \n\t"
-        "2:                             \n\t"
-        "cmp          %3, %0            \n\t"
-        "js           1b                \n\t"
-        : "+r" (i)
-        : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-
-    for (; i < w; i++)
-        dst[i + 0] += src[i + 0];
-}
-
-#endif /* HAVE_INLINE_ASM */
+#endif
 
 av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-#if HAVE_INLINE_ASM
-#if HAVE_7REGS
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
     if (cpu_flags & AV_CPU_FLAG_CMOV)
         c->add_hfyu_median_pred = add_hfyu_median_pred_cmov;
-#endif /* HAVE_7REGS */
+#endif
 
-    if (INLINE_MMX(cpu_flags))
-        c->add_bytes = add_bytes_mmx;
-#endif /* HAVE_INLINE_ASM */
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+        c->add_bytes = ff_add_bytes_mmx;
+        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
+    }
 
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
+    if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
         /* slower than cmov version on AMD */
         if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
             c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
     }
 
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_bytes            = ff_add_bytes_sse2;
+        c->add_hfyu_median_pred = ff_add_hfyu_median_pred_sse2;
+        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2;
+    }
+
     if (EXTERNAL_SSSE3(cpu_flags)) {
         c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
         if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/huffyuvencdsp_mmx.c
index 8ffaced37d..63d8e3cc73 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/huffyuvencdsp_mmx.c
@@ -5,20 +5,20 @@
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,10 +31,11 @@
 
 #if HAVE_INLINE_ASM
 
-static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
+static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
 {
     x86_reg i = 0;
 
+    if (w >= 16)
     __asm__ volatile (
         "1:                             \n\t"
         "movq  (%2, %0), %%mm0          \n\t"
diff --git a/libavcodec/x86/idctdsp.asm b/libavcodec/x86/idctdsp.asm
new file mode 100644
index 0000000000..089425a9ab
--- /dev/null
+++ b/libavcodec/x86/idctdsp.asm
@@ -0,0 +1,183 @@
+;******************************************************************************
+;* SIMD-optimized IDCT-related routines
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_80
+
+SECTION .text
+
+;--------------------------------------------------------------------------
+;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                                  ptrdiff_t line_size)
+;--------------------------------------------------------------------------
+
+%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
+    mova     m1, [blockq+mmsize*0+%1]
+    mova     m2, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m3, [blockq+mmsize*4+%1]
+    mova     m4, [blockq+mmsize*6+%1]
+%endif
+    packsswb m1, [blockq+mmsize*1+%1]
+    packsswb m2, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packsswb m3, [blockq+mmsize*5+%1]
+    packsswb m4, [blockq+mmsize*7+%1]
+%endif
+    paddb    m1, m0
+    paddb    m2, m0
+%if mmsize == 8
+    paddb    m3, m0
+    paddb    m4, m0
+    movq     [pixelsq+lsizeq*0], m1
+    movq     [pixelsq+lsizeq*1], m2
+    movq     [pixelsq+lsizeq*2], m3
+    movq     [pixelsq+lsize3q ], m4
+%else
+    movq     [pixelsq+lsizeq*0], m1
+    movhps   [pixelsq+lsizeq*1], m1
+    movq     [pixelsq+lsizeq*2], m2
+    movhps   [pixelsq+lsize3q ], m2
+%endif
+%endmacro
+
+%macro PUT_SIGNED_PIXELS_CLAMPED 1
+cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
+    mova     m0, [pb_80]
+    lea      lsize3q, [lsizeq*3]
+    PUT_SIGNED_PIXELS_CLAMPED_HALF 0
+    lea      pixelsq, [pixelsq+lsizeq*4]
+    PUT_SIGNED_PIXELS_CLAMPED_HALF 64
+    RET
+%endmacro
+
+INIT_MMX mmx
+PUT_SIGNED_PIXELS_CLAMPED 0
+INIT_XMM sse2
+PUT_SIGNED_PIXELS_CLAMPED 3
+
+;--------------------------------------------------------------------------
+; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                            ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = block offset
+%macro PUT_PIXELS_CLAMPED_HALF 1
+    mova     m0, [blockq+mmsize*0+%1]
+    mova     m1, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m2, [blockq+mmsize*4+%1]
+    mova     m3, [blockq+mmsize*6+%1]
+%endif
+    packuswb m0, [blockq+mmsize*1+%1]
+    packuswb m1, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packuswb m2, [blockq+mmsize*5+%1]
+    packuswb m3, [blockq+mmsize*7+%1]
+    movq           [pixelsq], m0
+    movq    [lsizeq+pixelsq], m1
+    movq  [2*lsizeq+pixelsq], m2
+    movq   [lsize3q+pixelsq], m3
+%else
+    movq           [pixelsq], m0
+    movhps  [lsizeq+pixelsq], m0
+    movq  [2*lsizeq+pixelsq], m1
+    movhps [lsize3q+pixelsq], m1
+%endif
+%endmacro
+
+%macro PUT_PIXELS_CLAMPED 0
+cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
+    lea lsize3q, [lsizeq*3]
+    PUT_PIXELS_CLAMPED_HALF 0
+    lea pixelsq, [pixelsq+lsizeq*4]
+    PUT_PIXELS_CLAMPED_HALF 64
+    RET
+%endmacro
+
+INIT_MMX mmx
+PUT_PIXELS_CLAMPED
+INIT_XMM sse2
+PUT_PIXELS_CLAMPED
+
+;--------------------------------------------------------------------------
+; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                            ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = block offset
+%macro ADD_PIXELS_CLAMPED 1
+    mova       m0, [blockq+mmsize*0+%1]
+    mova       m1, [blockq+mmsize*1+%1]
+%if mmsize == 8
+    mova       m5, [blockq+mmsize*2+%1]
+    mova       m6, [blockq+mmsize*3+%1]
+%endif
+    movq       m2, [pixelsq]
+    movq       m3, [pixelsq+lsizeq]
+%if mmsize == 8
+    mova       m7, m2
+    punpcklbw  m2, m4
+    punpckhbw  m7, m4
+    paddsw     m0, m2
+    paddsw     m1, m7
+    mova       m7, m3
+    punpcklbw  m3, m4
+    punpckhbw  m7, m4
+    paddsw     m5, m3
+    paddsw     m6, m7
+%else
+    punpcklbw  m2, m4
+    punpcklbw  m3, m4
+    paddsw     m0, m2
+    paddsw     m1, m3
+%endif
+    packuswb   m0, m1
+%if mmsize == 8
+    packuswb   m5, m6
+    movq       [pixelsq], m0
+    movq       [pixelsq+lsizeq], m5
+%else
+    movq       [pixelsq], m0
+    movhps     [pixelsq+lsizeq], m0
+%endif
+%endmacro
+
+%macro ADD_PIXELS_CLAMPED 0
+cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
+    pxor       m4, m4
+    ADD_PIXELS_CLAMPED 0
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 32
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 64
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 96
+    RET
+%endmacro
+
+INIT_MMX mmx
+ADD_PIXELS_CLAMPED
+INIT_XMM sse2
+ADD_PIXELS_CLAMPED
diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h
index 22df3dd758..daa4e798ed 100644
--- a/libavcodec/x86/idctdsp.h
+++ b/libavcodec/x86/idctdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,12 +20,19 @@
 #define AVCODEC_X86_IDCTDSP_H
 
 #include <stdint.h>
+#include <stddef.h>
 
 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size);
+                               ptrdiff_t line_size);
+void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size);
+                               ptrdiff_t line_size);
+void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                                      int line_size);
+                                      ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                       ptrdiff_t line_size);
 
 #endif /* AVCODEC_X86_IDCTDSP_H */
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 853c6a3661..2c26a98850 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,12 +64,10 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
     int cpu_flags = av_get_cpu_flags();
 
     if (INLINE_MMX(cpu_flags)) {
-        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
-        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
-        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
-
         if (!high_bit_depth &&
+            avctx->lowres == 0 &&
             (avctx->idct_algo == FF_IDCT_AUTO ||
+             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
              avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
                 c->idct_put  = ff_simple_idct_put_mmx;
                 c->idct_add  = ff_simple_idct_add_mmx;
@@ -77,4 +75,14 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 c->perm_type = FF_IDCT_PERM_SIMPLE;
         }
     }
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
+        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
+    }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
+        c->put_pixels_clamped        = ff_put_pixels_clamped_sse2;
+        c->add_pixels_clamped        = ff_add_pixels_clamped_sse2;
+    }
 }
diff --git a/libavcodec/x86/idctdsp_mmx.c b/libavcodec/x86/idctdsp_mmx.c
deleted file mode 100644
index 7285b1d357..0000000000
--- a/libavcodec/x86/idctdsp_mmx.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * SIMD-optimized IDCT-related routines
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "idctdsp.h"
-#include "inline_asm.h"
-
-#if HAVE_INLINE_ASM
-
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    /* unrolled loop */
-    __asm__ volatile (
-        "movq      (%3), %%mm0          \n\t"
-        "movq     8(%3), %%mm1          \n\t"
-        "movq    16(%3), %%mm2          \n\t"
-        "movq    24(%3), %%mm3          \n\t"
-        "movq    32(%3), %%mm4          \n\t"
-        "movq    40(%3), %%mm5          \n\t"
-        "movq    48(%3), %%mm6          \n\t"
-        "movq    56(%3), %%mm7          \n\t"
-        "packuswb %%mm1, %%mm0          \n\t"
-        "packuswb %%mm3, %%mm2          \n\t"
-        "packuswb %%mm5, %%mm4          \n\t"
-        "packuswb %%mm7, %%mm6          \n\t"
-        "movq     %%mm0, (%0)           \n\t"
-        "movq     %%mm2, (%0, %1)       \n\t"
-        "movq     %%mm4, (%0, %1, 2)    \n\t"
-        "movq     %%mm6, (%0, %2)       \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-    pix += line_size * 4;
-    p   += 32;
-
-    // if here would be an exact copy of the code above
-    // compiler would generate some very strange code
-    // thus using "r"
-    __asm__ volatile (
-        "movq       (%3), %%mm0         \n\t"
-        "movq      8(%3), %%mm1         \n\t"
-        "movq     16(%3), %%mm2         \n\t"
-        "movq     24(%3), %%mm3         \n\t"
-        "movq     32(%3), %%mm4         \n\t"
-        "movq     40(%3), %%mm5         \n\t"
-        "movq     48(%3), %%mm6         \n\t"
-        "movq     56(%3), %%mm7         \n\t"
-        "packuswb  %%mm1, %%mm0         \n\t"
-        "packuswb  %%mm3, %%mm2         \n\t"
-        "packuswb  %%mm5, %%mm4         \n\t"
-        "packuswb  %%mm7, %%mm6         \n\t"
-        "movq      %%mm0, (%0)          \n\t"
-        "movq      %%mm2, (%0, %1)      \n\t"
-        "movq      %%mm4, (%0, %1, 2)   \n\t"
-        "movq      %%mm6, (%0, %2)      \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-}
-
-#define put_signed_pixels_clamped_mmx_half(off)             \
-    "movq          "#off"(%2), %%mm1        \n\t"           \
-    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
-    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
-    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
-    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
-    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
-    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
-    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
-    "paddb              %%mm0, %%mm1        \n\t"           \
-    "paddb              %%mm0, %%mm2        \n\t"           \
-    "paddb              %%mm0, %%mm3        \n\t"           \
-    "paddb              %%mm0, %%mm4        \n\t"           \
-    "movq               %%mm1, (%0)         \n\t"           \
-    "movq               %%mm2, (%0, %3)     \n\t"           \
-    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
-    "movq               %%mm4, (%0, %1)     \n\t"
-
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                                      int line_size)
-{
-    x86_reg line_skip = line_size;
-    x86_reg line_skip3;
-
-    __asm__ volatile (
-        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
-        "lea         (%3, %3, 2), %1        \n\t"
-        put_signed_pixels_clamped_mmx_half(0)
-        "lea         (%0, %3, 4), %0        \n\t"
-        put_signed_pixels_clamped_mmx_half(64)
-        : "+&r" (pixels), "=&r" (line_skip3)
-        : "r" (block), "r" (line_skip)
-        : "memory");
-}
-
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-    int i;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    MOVQ_ZERO(mm7);
-    i = 4;
-    do {
-        __asm__ volatile (
-            "movq        (%2), %%mm0    \n\t"
-            "movq       8(%2), %%mm1    \n\t"
-            "movq      16(%2), %%mm2    \n\t"
-            "movq      24(%2), %%mm3    \n\t"
-            "movq          %0, %%mm4    \n\t"
-            "movq          %1, %%mm6    \n\t"
-            "movq       %%mm4, %%mm5    \n\t"
-            "punpcklbw  %%mm7, %%mm4    \n\t"
-            "punpckhbw  %%mm7, %%mm5    \n\t"
-            "paddsw     %%mm4, %%mm0    \n\t"
-            "paddsw     %%mm5, %%mm1    \n\t"
-            "movq       %%mm6, %%mm5    \n\t"
-            "punpcklbw  %%mm7, %%mm6    \n\t"
-            "punpckhbw  %%mm7, %%mm5    \n\t"
-            "paddsw     %%mm6, %%mm2    \n\t"
-            "paddsw     %%mm5, %%mm3    \n\t"
-            "packuswb   %%mm1, %%mm0    \n\t"
-            "packuswb   %%mm3, %%mm2    \n\t"
-            "movq       %%mm0, %0       \n\t"
-            "movq       %%mm2, %1       \n\t"
-            : "+m" (*pix), "+m" (*(pix + line_size))
-            : "r" (p)
-            : "memory");
-        pix += line_size * 2;
-        p   += 16;
-    } while (--i);
-}
-
-#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/imdct36.asm b/libavcodec/x86/imdct36.asm
index f85e2e4cc3..7218f06ba5 100644
--- a/libavcodec/x86/imdct36.asm
+++ b/libavcodec/x86/imdct36.asm
@@ -2,20 +2,20 @@
 ;* 36 point SSE-optimized IMDCT transform
 ;* Copyright (c) 2011 Vitor Sessak
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -50,7 +50,7 @@ ps_cosh_sse3:  dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
                dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
                dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
                dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
-               dd 1.0,  0.70710678118654752439,  0.0,  0.0
+               dd 1.0, -0.70710678118654752439,  0.0,  0.0
 
 costabs:  times 4 dd  0.98480773
           times 4 dd  0.93969262
@@ -129,6 +129,19 @@ SECTION .text
 %endif
 %endmacro
 
+%macro BUTTERF2 3
+%if cpuflag(sse3)
+    mulps    %1, %1, [ps_cosh_sse3 + %3]
+    PSHUFD   %2, %1, 0xe1
+    addsubps %1, %1, %2
+%else
+    mulps    %1, [ps_cosh + %3]
+    PSHUFD   %2, %1, 0xe1
+    xorps    %1, [ps_p1m1p1m1]
+    addps    %1, %2
+%endif
+%endmacro
+
 %macro STORE 4
     movhlps %2, %1
     movss   [%3       ], %1
@@ -279,11 +292,7 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
     BUTTERF  m7, m2, 16
     BUTTERF  m3, m6, 32
     BUTTERF  m4, m1, 48
-
-    mulps   m5, m5, [ps_cosh + 64]
-    PSHUFD  m1, m5, 0xe1
-    xorps   m5, m5, [ps_p1m1p1m1]
-    addps   m5, m5, m1
+    BUTTERF2 m5, m1, 64
 
     ; permutates:
     ; m0    0  1  2  3     =>     2  6 10 14   m1
@@ -358,8 +367,10 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
     RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_XMM sse
 DEFINE_IMDCT
+%endif
 
 INIT_XMM sse2
 DEFINE_IMDCT
@@ -370,8 +381,10 @@ DEFINE_IMDCT
 INIT_XMM ssse3
 DEFINE_IMDCT
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEFINE_IMDCT
+%endif
 
 INIT_XMM sse
 
@@ -716,5 +729,7 @@ cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
 INIT_XMM sse
 DEFINE_FOUR_IMDCT
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEFINE_FOUR_IMDCT
+%endif
diff --git a/libavcodec/x86/inline_asm.h b/libavcodec/x86/inline_asm.h
index e4affabc87..3e65a76973 100644
--- a/libavcodec/x86/inline_asm.h
+++ b/libavcodec/x86/inline_asm.h
@@ -1,20 +1,20 @@
 /*
  * inline assembly helper macros
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,7 +37,7 @@
         "paddb   %%"#regd", %%"#regd"   \n\t" ::)
 
 #ifndef PIC
-#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
 #else
 // for shared library it's better to use this way for accessing constants
 // pcmpeqd -> -1
diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm
new file mode 100644
index 0000000000..56b5fbd606
--- /dev/null
+++ b/libavcodec/x86/jpeg2000dsp.asm
@@ -0,0 +1,144 @@
+;******************************************************************************
+;* SIMD-optimized JPEG2000 DSP functions
+;* Copyright (c) 2014 Nicolas Bertrand
+;* Copyright (c) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pf_ict0: times 8 dd 1.402
+pf_ict1: times 8 dd 0.34413
+pf_ict2: times 8 dd 0.71414
+pf_ict3: times 8 dd 1.772
+
+SECTION .text
+
+;***********************************************************************
+; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
+;***********************************************************************
+%macro ICT_FLOAT 1
+cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
+    shl  csized, 2
+    add   src0q, csizeq
+    add   src1q, csizeq
+    add   src2q, csizeq
+    neg  csizeq
+    movaps   m6, [pf_ict0]
+    movaps   m7, [pf_ict1]
+    %define ICT0 m6
+    %define ICT1 m7
+
+%if ARCH_X86_64
+    movaps   m8, [pf_ict2]
+    %define ICT2 m8
+%if cpuflag(avx)
+    movaps   m3, [pf_ict3]
+    %define ICT3 m3
+%else
+    movaps   m9, [pf_ict3]
+    %define ICT3 m9
+%endif
+
+%else ; ARCH_X86_32
+    %define ICT2 [pf_ict2]
+%if cpuflag(avx)
+    movaps   m3, [pf_ict3]
+    %define ICT3 m3
+%else
+    %define ICT3 [pf_ict3]
+%endif
+
+%endif ; ARCH
+
+align 16
+.loop:
+    movaps   m0, [src0q+csizeq]
+    movaps   m1, [src1q+csizeq]
+    movaps   m2, [src2q+csizeq]
+
+%if cpuflag(avx)
+    mulps    m5, m1, ICT1
+    mulps    m4, m2, ICT0
+    mulps    m1, m1, ICT3
+    mulps    m2, m2, ICT2
+    subps    m5, m0, m5
+%else ; sse
+    movaps   m3, m1
+    movaps   m4, m2
+    movaps   m5, m0
+    mulps    m3, ICT1
+    mulps    m4, ICT0
+    mulps    m1, ICT3
+    mulps    m2, ICT2
+    subps    m5, m3
+%endif
+    addps    m4, m4, m0
+    addps    m0, m0, m1
+    subps    m5, m5, m2
+
+    movaps   [src0q+csizeq], m4
+    movaps   [src2q+csizeq], m0
+    movaps   [src1q+csizeq], m5
+    add  csizeq, mmsize
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+ICT_FLOAT 10
+INIT_YMM avx
+ICT_FLOAT 9
+
+;***************************************************************************
+; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
+;***************************************************************************
+%macro RCT_INT 0
+cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
+    shl  csized, 2
+    add   src0q, csizeq
+    add   src1q, csizeq
+    add   src2q, csizeq
+    neg  csizeq
+
+align 16
+.loop:
+    mova   m1, [src1q+csizeq]
+    mova   m2, [src2q+csizeq]
+    mova   m0, [src0q+csizeq]
+    paddd  m3, m1, m2
+    psrad  m3, 2
+    psubd  m0, m3
+    paddd  m1, m0
+    paddd  m2, m0
+    mova   [src1q+csizeq], m0
+    mova   [src2q+csizeq], m1
+    mova   [src0q+csizeq], m2
+    add  csizeq, mmsize
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+RCT_INT
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+RCT_INT
+%endif
diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c
new file mode 100644
index 0000000000..0dbd2db7f5
--- /dev/null
+++ b/libavcodec/x86/jpeg2000dsp_init.c
@@ -0,0 +1,50 @@
+/*
+ * SIMD optimized JPEG 2000 DSP functions
+ * Copyright (c) 2015 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/jpeg2000dsp.h"
+
+void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
+void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
+void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
+void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
+
+av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (EXTERNAL_SSE(cpu_flags)) {
+        c->mct_decode[FF_DWT97] = ff_ict_float_sse;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->mct_decode[FF_DWT53] = ff_rct_int_sse2;
+    }
+
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        c->mct_decode[FF_DWT97] = ff_ict_float_avx;
+    }
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
+    }
+}
diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/lossless_audiodsp.asm
index d6abd982e8..5597dadab4 100644
--- a/libavcodec/x86/apedsp.asm
+++ b/libavcodec/x86/lossless_audiodsp.asm
@@ -1,20 +1,20 @@
 ;******************************************************************************
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -58,14 +58,7 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
     mova    [v1q + orderq + mmsize], m3
     add     orderq, mmsize*2
     jl .loop
-%if mmsize == 16
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-%else
-    pshufw  m0, m6, 0x4e
-%endif
-    paddd   m6, m0
+    HADDD   m6, m0
     movd   eax, m6
     RET
 %endmacro
@@ -159,9 +152,6 @@ SCALARPRODUCT_LOOP 4
 SCALARPRODUCT_LOOP 2
 SCALARPRODUCT_LOOP 0
 .end:
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-    paddd   m6, m0
+    HADDD   m6, m0
     movd   eax, m6
     RET
diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/lossless_audiodsp_init.c
index f692c2b9b6..197173caf4 100644
--- a/libavcodec/x86/apedsp_init.c
+++ b/libavcodec/x86/lossless_audiodsp_init.c
@@ -1,25 +1,25 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
 
 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                                const int16_t *v3,
@@ -31,8 +31,9 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
 
-av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
 {
+#if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMXEXT(cpu_flags))
@@ -44,4 +45,5 @@ av_cold void ff_apedsp_init_x86(APEDSPContext *c)
     if (EXTERNAL_SSSE3(cpu_flags) &&
         !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+#endif
 }
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
new file mode 100644
index 0000000000..f06fcdf7cf
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -0,0 +1,294 @@
+;******************************************************************************
+;* SIMD lossless video DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Michael Niedermayer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_ef: times 8 db 14,15
+pb_67: times 8 db  6, 7
+pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
+pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
+
+SECTION .text
+
+%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
+    movd    m4, maskd
+    SPLATW  m4, m4
+    add     wd, wd
+    test    wq, 2*mmsize - 1
+    jz %%.tomainloop
+    push  tmpq
+%%.wordloop:
+    sub     wq, 2
+%ifidn %2, add
+    mov   tmpw, [srcq+wq]
+    add   tmpw, [dstq+wq]
+%else
+    mov   tmpw, [src1q+wq]
+    sub   tmpw, [src2q+wq]
+%endif
+    and   tmpw, maskw
+    mov     [dstq+wq], tmpw
+    test    wq, 2*mmsize - 1
+    jnz %%.wordloop
+    pop   tmpq
+%%.tomainloop:
+%ifidn %2, add
+    add     srcq, wq
+%else
+    add     src1q, wq
+    add     src2q, wq
+%endif
+    add     dstq, wq
+    neg     wq
+    jz      %%.end
+%%.loop:
+%ifidn %2, add
+    mov%1   m0, [srcq+wq]
+    mov%1   m1, [dstq+wq]
+    mov%1   m2, [srcq+wq+mmsize]
+    mov%1   m3, [dstq+wq+mmsize]
+%else
+    mov%1   m0, [src1q+wq]
+    mov%1   m1, [src2q+wq]
+    mov%1   m2, [src1q+wq+mmsize]
+    mov%1   m3, [src2q+wq+mmsize]
+%endif
+    p%2w    m0, m1
+    p%2w    m2, m3
+    pand    m0, m4
+    pand    m2, m4
+    mov%1   [dstq+wq]       , m0
+    mov%1   [dstq+wq+mmsize], m2
+    add     wq, 2*mmsize
+    jl %%.loop
+%%.end:
+    RET
+%endmacro
+
+INIT_MMX mmx
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
+    INT16_LOOP a, add
+
+INIT_XMM sse2
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
+    test srcq, mmsize-1
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+    INT16_LOOP a, add
+.unaligned:
+    INT16_LOOP u, add
+
+INIT_MMX mmx
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+    INT16_LOOP a, sub
+
+INIT_XMM sse2
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+    test src1q, mmsize-1
+    jnz .unaligned
+    test src2q, mmsize-1
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+    INT16_LOOP a, sub
+.unaligned:
+    INT16_LOOP u, sub
+
+
+%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
+    add     wd, wd
+    add     srcq, wq
+    add     dstq, wq
+    neg     wq
+%%.loop:
+    mov%2   m1, [srcq+wq]
+    mova    m2, m1
+    pslld   m1, 16
+    paddw   m1, m2
+    mova    m2, m1
+
+    pshufb  m1, m3
+    paddw   m1, m2
+    pshufb  m0, m5
+%if mmsize == 16
+    mova    m2, m1
+    pshufb  m1, m4
+    paddw   m1, m2
+%endif
+    paddw   m0, m1
+    pand    m0, m7
+%ifidn %1, a
+    mova    [dstq+wq], m0
+%else
+    movq    [dstq+wq], m0
+    movhps  [dstq+wq+8], m0
+%endif
+    add     wq, mmsize
+    jl %%.loop
+    mov     eax, mmsize-1
+    sub     eax, wd
+    mov     wd, eax
+    shl     wd, 8
+    lea     eax, [wd+eax-1]
+    movd    m1, eax
+    pshufb  m0, m1
+    movd    eax, m0
+    RET
+%endmacro
+
+; int add_hfyu_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
+INIT_MMX ssse3
+cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
+.skip_prologue:
+    mova    m5, [pb_67]
+    mova    m3, [pb_zzzz2323zzzzabab]
+    movd    m0, leftm
+    psllq   m0, 48
+    movd    m7, maskm
+    SPLATW  m7 ,m7
+    ADD_HFYU_LEFT_LOOP_INT16 a, a
+
+INIT_XMM sse4
+cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
+    mova    m5, [pb_ef]
+    mova    m4, [pb_zzzzzzzz67676767]
+    mova    m3, [pb_zzzz2323zzzzabab]
+    movd    m0, leftm
+    pslldq  m0, 14
+    movd    m7, maskm
+    SPLATW  m7 ,m7
+    test    srcq, 15
+    jnz .src_unaligned
+    test    dstq, 15
+    jnz .dst_unaligned
+    ADD_HFYU_LEFT_LOOP_INT16 a, a
+.dst_unaligned:
+    ADD_HFYU_LEFT_LOOP_INT16 u, a
+.src_unaligned:
+    ADD_HFYU_LEFT_LOOP_INT16 u, u
+
+; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int mask, int w, int *left, int *left_top)
+INIT_MMX mmxext
+cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
+    add      wd, wd
+    movd    mm6, maskd
+    SPLATW  mm6, mm6
+    movq    mm0, [topq]
+    movq    mm2, mm0
+    movd    mm4, [left_topq]
+    psllq   mm2, 16
+    movq    mm1, mm0
+    por     mm4, mm2
+    movd    mm3, [leftq]
+    psubw   mm0, mm4 ; t-tl
+    add    dstq, wq
+    add    topq, wq
+    add   diffq, wq
+    neg      wq
+    jmp .skip
+.loop:
+    movq    mm4, [topq+wq]
+    movq    mm0, mm4
+    psllq   mm4, 16
+    por     mm4, mm1
+    movq    mm1, mm0 ; t
+    psubw   mm0, mm4 ; t-tl
+.skip:
+    movq    mm2, [diffq+wq]
+%assign i 0
+%rep 4
+    movq    mm4, mm0
+    paddw   mm4, mm3 ; t-tl+l
+    pand    mm4, mm6
+    movq    mm5, mm3
+    pmaxsw  mm3, mm1
+    pminsw  mm5, mm1
+    pminsw  mm3, mm4
+    pmaxsw  mm3, mm5 ; median
+    paddw   mm3, mm2 ; +residual
+    pand    mm3, mm6
+%if i==0
+    movq    mm7, mm3
+    psllq   mm7, 48
+%else
+    movq    mm4, mm3
+    psrlq   mm7, 16
+    psllq   mm4, 48
+    por     mm7, mm4
+%endif
+%if i<3
+    psrlq   mm0, 16
+    psrlq   mm1, 16
+    psrlq   mm2, 16
+%endif
+%assign i i+1
+%endrep
+    movq [dstq+wq], mm7
+    add      wq, 8
+    jl .loop
+    movzx   r2d, word [dstq-2]
+    mov [leftq], r2d
+    movzx   r2d, word [topq-2]
+    mov [left_topq], r2d
+    RET
+
+cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
+    add      wd, wd
+    movd    mm7, maskd
+    SPLATW  mm7, mm7
+    movq    mm0, [src1q]
+    movq    mm2, [src2q]
+    psllq   mm0, 16
+    psllq   mm2, 16
+    movd    mm6, [left_topq]
+    por     mm0, mm6
+    movd    mm6, [leftq]
+    por     mm2, mm6
+    xor     maskq, maskq
+.loop:
+    movq    mm1, [src1q + maskq]
+    movq    mm3, [src2q + maskq]
+    movq    mm4, mm2
+    psubw   mm2, mm0
+    paddw   mm2, mm1
+    pand    mm2, mm7
+    movq    mm5, mm4
+    pmaxsw  mm4, mm1
+    pminsw  mm1, mm5
+    pminsw  mm4, mm2
+    pmaxsw  mm4, mm1
+    psubw   mm3, mm4
+    pand    mm3, mm7
+    movq    [dstq + maskq], mm3
+    add     maskq, 8
+    movq    mm0, [src1q + maskq - 2]
+    movq    mm2, [src2q + maskq - 2]
+    cmp     maskq, wq
+        jb .loop
+    movzx maskd, word [src1q + wq - 2]
+    mov [left_topq], maskd
+    movzx maskd, word [src2q + wq - 2]
+    mov [leftq], maskd
+    RET
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
new file mode 100644
index 0000000000..b0fbcfef8f
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -0,0 +1,62 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "../lossless_videodsp.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/x86/cpu.h"
+
+void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w);
+void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w);
+int ff_add_hfyu_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc);
+int ff_add_hfyu_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc);
+void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
+void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
+
+
+void ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->add_int16 = ff_add_int16_mmx;
+        c->diff_int16 = ff_diff_int16_mmx;
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc->comp[0].depth<16) {
+        c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
+        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_int16 = ff_add_int16_sse2;
+        c->diff_int16 = ff_diff_int16_sse2;
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->add_hfyu_left_pred_int16 = ff_add_hfyu_left_pred_int16_ssse3;
+    }
+
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->add_hfyu_left_pred_int16 = ff_add_hfyu_left_pred_int16_sse4;
+    }
+}
diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc.c
index ea5d2eab56..3a9493f728 100644
--- a/libavcodec/x86/lpc.c
+++ b/libavcodec/x86/lpc.c
@@ -2,26 +2,25 @@
  * SIMD-optimized LPC functions
  * Copyright (c) 2007 Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
@@ -73,6 +72,7 @@ static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
         "3:                                    \n\t"
         :"+&r"(i), "+&r"(j)
         :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
+         NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2)
          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                                     "%xmm5", "%xmm6", "%xmm7")
     );
@@ -117,6 +117,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
                 "movsd     %%xmm2, 16(%1)           \n\t"
                 :"+&r"(i)
                 :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
+                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
                 :"memory"
             );
         } else {
@@ -140,6 +141,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
                 "movsd     %%xmm1, %2               \n\t"
                 :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
                 :"r"(data+len), "r"(data+len-j)
+                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
             );
         }
     }
@@ -152,7 +154,7 @@ av_cold void ff_lpc_init_x86(LPCContext *c)
 #if HAVE_SSE2_INLINE
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
+    if (HAVE_SSE2_INLINE && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
         c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
         c->lpc_compute_autocorr   = lpc_compute_autocorr_sse2;
     }
diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h
index 2c04d9d1bd..6298f5ed19 100644
--- a/libavcodec/x86/mathops.h
+++ b/libavcodec/x86/mathops.h
@@ -2,20 +2,20 @@
  * simple math operations
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,7 +105,7 @@ __asm__ volatile(\
 #endif /* HAVE_I686 */
 
 #define MASK_ABS(mask, level)                   \
-    __asm__ ("cltd                   \n\t"      \
+    __asm__ ("cdq                    \n\t"      \
              "xorl %1, %0            \n\t"      \
              "subl %1, %0            \n\t"      \
              : "+a"(level), "=&d"(mask))
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 1a87f37b39..ad06d485ab 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -4,25 +4,30 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+
+cextern pb_1
+cextern pb_80
+
 SECTION .text
 
 %macro DIFF_PIXELS_1 4
@@ -210,7 +215,7 @@ hadamard8_16_wrapper %1, 3
 %elif cpuflag(mmx)
 ALIGN 16
 ; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
-;                               uint8_t *src2, int stride, int h)
+;                               uint8_t *src2, ptrdiff_t stride, int h)
 ; r0 = void *s = unused, int h = unused (always 8)
 ; note how r1, r2 and r3 are not clobbered in this function, so 16x16
 ; can simply call this 2x2x (and that's why we access rsp+gprsize
@@ -274,19 +279,27 @@ INIT_XMM ssse3
 %define ABS_SUM_8x8 ABS_SUM_8x8_64
 HADAMARD8_DIFF 9
 
-INIT_XMM sse2
-; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-;                   int line_size, int h);
-cglobal sse16, 5, 5, 8
-    shr      r4d, 1
+; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;               ptrdiff_t line_size, int h)
+
+%macro SUM_SQUARED_ERRORS 1
+cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
+%if %1 == mmsize
+    shr       hd, 1
+%endif
     pxor      m0, m0         ; mm0 = 0
     pxor      m7, m7         ; mm7 holds the sum
 
 .next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
-    movu      m1, [r1   ]    ; mm1 = pix1[0][0-15]
-    movu      m2, [r2   ]    ; mm2 = pix2[0][0-15]
-    movu      m3, [r1+r3]    ; mm3 = pix1[1][0-15]
-    movu      m4, [r2+r3]    ; mm4 = pix2[1][0-15]
+    movu      m1, [pix1q]    ; m1 = pix1[0][0-15], [0-7] for mmx
+    movu      m2, [pix2q]    ; m2 = pix2[0][0-15], [0-7] for mmx
+%if %1 == mmsize
+    movu      m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
+    movu      m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
+%else  ; %1 / 2 == mmsize; mmx only
+    mova      m3, [pix1q+8]  ; m3 = pix1[0][8-15]
+    mova      m4, [pix2q+8]  ; m4 = pix2[0][8-15]
+%endif
 
     ; todo: mm1-mm2, mm3-mm4
     ; algo: subtract mm1 from mm2 with saturation and vice versa
@@ -315,22 +328,607 @@ cglobal sse16, 5, 5, 8
     pmaddwd   m1, m1
     pmaddwd   m3, m3
 
-    lea       r1, [r1+r3*2]  ; pix1 += 2*line_size
-    lea       r2, [r2+r3*2]  ; pix2 += 2*line_size
-
     paddd     m1, m2
     paddd     m3, m4
     paddd     m7, m1
     paddd     m7, m3
 
-    dec       r4
+%if %1 == mmsize
+    lea    pix1q, [pix1q + 2*lsizeq]
+    lea    pix2q, [pix2q + 2*lsizeq]
+%else
+    add    pix1q, lsizeq
+    add    pix2q, lsizeq
+%endif
+    dec       hd
     jnz .next2lines
 
-    mova      m1, m7
-    psrldq    m7, 8          ; shift hi qword to lo
-    paddd     m7, m1
-    mova      m1, m7
-    psrldq    m7, 4          ; shift hi dword to lo
-    paddd     m7, m1
+    HADDD     m7, m1
     movd     eax, m7         ; return value
     RET
+%endmacro
+
+INIT_MMX mmx
+SUM_SQUARED_ERRORS 8
+
+INIT_MMX mmx
+SUM_SQUARED_ERRORS 16
+
+INIT_XMM sse2
+SUM_SQUARED_ERRORS 16
+
+;-----------------------------------------------
+;int ff_sum_abs_dctelem(int16_t *block)
+;-----------------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline loops
+
+%macro SUM_ABS_DCTELEM 2
+cglobal sum_abs_dctelem, 1, 1, %1, block
+    pxor    m0, m0
+    pxor    m1, m1
+%assign %%i 0
+%rep %2
+    mova      m2, [blockq+mmsize*(0+%%i)]
+    mova      m3, [blockq+mmsize*(1+%%i)]
+    mova      m4, [blockq+mmsize*(2+%%i)]
+    mova      m5, [blockq+mmsize*(3+%%i)]
+    ABS1_SUM  m2, m6, m0
+    ABS1_SUM  m3, m6, m1
+    ABS1_SUM  m4, m6, m0
+    ABS1_SUM  m5, m6, m1
+%assign %%i %%i+4
+%endrep
+    paddusw m0, m1
+    HSUM    m0, m1, eax
+    and     eax, 0xFFFF
+    RET
+%endmacro
+
+INIT_MMX mmx
+SUM_ABS_DCTELEM 0, 4
+INIT_MMX mmxext
+SUM_ABS_DCTELEM 0, 4
+INIT_XMM sse2
+SUM_ABS_DCTELEM 7, 2
+INIT_XMM ssse3
+SUM_ABS_DCTELEM 6, 2
+
+;------------------------------------------------------------------------------
+; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h)
+;------------------------------------------------------------------------------
+; %1 = 8/16. %2-5=m#
+%macro HF_NOISE_PART1 5
+    mova      m%2, [pix1q]
+%if %1 == 8
+    mova      m%3, m%2
+    psllq     m%2, 8
+    psrlq     m%3, 8
+    psrlq     m%2, 8
+%else
+    mova      m%3, [pix1q+1]
+%endif
+    mova      m%4, m%2
+    mova      m%5, m%3
+    punpcklbw m%2, m7
+    punpcklbw m%3, m7
+    punpckhbw m%4, m7
+    punpckhbw m%5, m7
+    psubw     m%2, m%3
+    psubw     m%4, m%5
+%endmacro
+
+; %1-2 = m#
+%macro HF_NOISE_PART2 4
+    psubw     m%1, m%3
+    psubw     m%2, m%4
+    pxor       m3, m3
+    pxor       m1, m1
+    pcmpgtw    m3, m%1
+    pcmpgtw    m1, m%2
+    pxor      m%1, m3
+    pxor      m%2, m1
+    psubw     m%1, m3
+    psubw     m%2, m1
+    paddw     m%2, m%1
+    paddw      m6, m%2
+%endmacro
+
+; %1 = 8/16
+%macro HF_NOISE 1
+cglobal hf_noise%1, 3,3,0, pix1, lsize, h
+    sub        hd, 2
+    pxor       m7, m7
+    pxor       m6, m6
+    HF_NOISE_PART1 %1, 0, 1, 2, 3
+    add     pix1q, lsizeq
+    HF_NOISE_PART1 %1, 4, 1, 5, 3
+    HF_NOISE_PART2     0, 2, 4, 5
+    add     pix1q, lsizeq
+.loop:
+    HF_NOISE_PART1 %1, 0, 1, 2, 3
+    HF_NOISE_PART2     4, 5, 0, 2
+    add     pix1q, lsizeq
+    HF_NOISE_PART1 %1, 4, 1, 5, 3
+    HF_NOISE_PART2     0, 2, 4, 5
+    add     pix1q, lsizeq
+    sub        hd, 2
+        jne .loop
+
+    mova       m0, m6
+    punpcklwd  m0, m7
+    punpckhwd  m6, m7
+    paddd      m6, m0
+    mova       m0, m6
+    psrlq      m6, 32
+    paddd      m0, m6
+    movd      eax, m0   ; eax = result of hf_noise8;
+    REP_RET                 ; return eax;
+%endmacro
+
+INIT_MMX mmx
+HF_NOISE 8
+HF_NOISE 16
+
+;---------------------------------------------------------------------------------------
+;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;---------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD 1
+cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
+    movu      m2, [pix2q]
+    movu      m1, [pix2q+strideq]
+    psadbw    m2, [pix1q]
+    psadbw    m1, [pix1q+strideq]
+    paddw     m2, m1
+%if %1 != mmsize
+    movu      m0, [pix2q+8]
+    movu      m1, [pix2q+strideq+8]
+    psadbw    m0, [pix1q+8]
+    psadbw    m1, [pix1q+strideq+8]
+    paddw     m2, m0
+    paddw     m2, m1
+%endif
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+strideq*2]
+    lea    pix2q, [pix2q+strideq*2]
+    movu      m0, [pix2q]
+    movu      m1, [pix2q+strideq]
+    psadbw    m0, [pix1q]
+    psadbw    m1, [pix1q+strideq]
+    paddw     m2, m0
+    paddw     m2, m1
+%if %1 != mmsize
+    movu      m0, [pix2q+8]
+    movu      m1, [pix2q+strideq+8]
+    psadbw    m0, [pix1q+8]
+    psadbw    m1, [pix1q+strideq+8]
+    paddw     m2, m0
+    paddw     m2, m1
+%endif
+    sub       hd, 2
+    jg .loop
+%if mmsize == 16
+    movhlps   m0, m2
+    paddw     m2, m0
+%endif
+    movd     eax, m2
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD 8
+SAD 16
+INIT_XMM sse2
+SAD 16
+
+;------------------------------------------------------------------------------------------
+;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_X2 1
+cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
+    movu      m0, [pix2q]
+    movu      m2, [pix2q+strideq]
+%if mmsize == 16
+    movu      m3, [pix2q+1]
+    movu      m4, [pix2q+strideq+1]
+    pavgb     m0, m3
+    pavgb     m2, m4
+%else
+    pavgb     m0, [pix2q+1]
+    pavgb     m2, [pix2q+strideq+1]
+%endif
+    psadbw    m0, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m2
+%if %1 != mmsize
+    movu      m1, [pix2q+8]
+    movu      m2, [pix2q+strideq+8]
+    pavgb     m1, [pix2q+9]
+    pavgb     m2, [pix2q+strideq+9]
+    psadbw    m1, [pix1q+8]
+    psadbw    m2, [pix1q+strideq+8]
+    paddw     m0, m1
+    paddw     m0, m2
+%endif
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+2*strideq]
+    lea    pix2q, [pix2q+2*strideq]
+    movu      m1, [pix2q]
+    movu      m2, [pix2q+strideq]
+%if mmsize == 16
+    movu      m3, [pix2q+1]
+    movu      m4, [pix2q+strideq+1]
+    pavgb     m1, m3
+    pavgb     m2, m4
+%else
+    pavgb     m1, [pix2q+1]
+    pavgb     m2, [pix2q+strideq+1]
+%endif
+    psadbw    m1, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+%if %1 != mmsize
+    movu      m1, [pix2q+8]
+    movu      m2, [pix2q+strideq+8]
+    pavgb     m1, [pix2q+9]
+    pavgb     m2, [pix2q+strideq+9]
+    psadbw    m1, [pix1q+8]
+    psadbw    m2, [pix1q+strideq+8]
+    paddw     m0, m1
+    paddw     m0, m2
+%endif
+    sub       hd, 2
+    jg .loop
+%if mmsize == 16
+    movhlps   m1, m0
+    paddw     m0, m1
+%endif
+    movd     eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_X2 8
+SAD_X2 16
+INIT_XMM sse2
+SAD_X2 16
+
+;------------------------------------------------------------------------------------------
+;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_Y2 1
+cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
+    movu      m1, [pix2q]
+    movu      m0, [pix2q+strideq]
+    movu      m3, [pix2q+2*strideq]
+    pavgb     m1, m0
+    pavgb     m0, m3
+    psadbw    m1, [pix1q]
+    psadbw    m0, [pix1q+strideq]
+    paddw     m0, m1
+    mova      m1, m3
+%if %1 != mmsize
+    movu      m4, [pix2q+8]
+    movu      m5, [pix2q+strideq+8]
+    movu      m6, [pix2q+2*strideq+8]
+    pavgb     m4, m5
+    pavgb     m5, m6
+    psadbw    m4, [pix1q+8]
+    psadbw    m5, [pix1q+strideq+8]
+    paddw     m0, m4
+    paddw     m0, m5
+    mova      m4, m6
+%endif
+    add    pix2q, strideq
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+2*strideq]
+    lea    pix2q, [pix2q+2*strideq]
+    movu      m2, [pix2q]
+    movu      m3, [pix2q+strideq]
+    pavgb     m1, m2
+    pavgb     m2, m3
+    psadbw    m1, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+    mova      m1, m3
+%if %1 != mmsize
+    movu      m5, [pix2q+8]
+    movu      m6, [pix2q+strideq+8]
+    pavgb     m4, m5
+    pavgb     m5, m6
+    psadbw    m4, [pix1q+8]
+    psadbw    m5, [pix1q+strideq+8]
+    paddw     m0, m4
+    paddw     m0, m5
+    mova      m4, m6
+%endif
+    sub       hd, 2
+    jg .loop
+%if mmsize == 16
+    movhlps   m1, m0
+    paddw     m0, m1
+%endif
+    movd     eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_Y2 8
+SAD_Y2 16
+INIT_XMM sse2
+SAD_Y2 16
+
+;-------------------------------------------------------------------------------------------
+;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;-------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_APPROX_XY2 1
+cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
+    mova      m4, [pb_1]
+    movu      m1, [pix2q]
+    movu      m0, [pix2q+strideq]
+    movu      m3, [pix2q+2*strideq]
+%if mmsize == 16
+    movu      m5, [pix2q+1]
+    movu      m6, [pix2q+strideq+1]
+    movu      m2, [pix2q+2*strideq+1]
+    pavgb     m1, m5
+    pavgb     m0, m6
+    pavgb     m3, m2
+%else
+    pavgb     m1, [pix2q+1]
+    pavgb     m0, [pix2q+strideq+1]
+    pavgb     m3, [pix2q+2*strideq+1]
+%endif
+    psubusb   m0, m4
+    pavgb     m1, m0
+    pavgb     m0, m3
+    psadbw    m1, [pix1q]
+    psadbw    m0, [pix1q+strideq]
+    paddw     m0, m1
+    mova      m1, m3
+%if %1 != mmsize
+    movu      m5, [pix2q+8]
+    movu      m6, [pix2q+strideq+8]
+    movu      m7, [pix2q+2*strideq+8]
+    pavgb     m5, [pix2q+1+8]
+    pavgb     m6, [pix2q+strideq+1+8]
+    pavgb     m7, [pix2q+2*strideq+1+8]
+    psubusb   m6, m4
+    pavgb     m5, m6
+    pavgb     m6, m7
+    psadbw    m5, [pix1q+8]
+    psadbw    m6, [pix1q+strideq+8]
+    paddw     m0, m5
+    paddw     m0, m6
+    mova      m5, m7
+%endif
+    add    pix2q, strideq
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+2*strideq]
+    lea    pix2q, [pix2q+2*strideq]
+    movu      m2, [pix2q]
+    movu      m3, [pix2q+strideq]
+%if mmsize == 16
+    movu      m5, [pix2q+1]
+    movu      m6, [pix2q+strideq+1]
+    pavgb     m2, m5
+    pavgb     m3, m6
+%else
+    pavgb     m2, [pix2q+1]
+    pavgb     m3, [pix2q+strideq+1]
+%endif
+    psubusb   m2, m4
+    pavgb     m1, m2
+    pavgb     m2, m3
+    psadbw    m1, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+    mova      m1, m3
+%if %1 != mmsize
+    movu      m6, [pix2q+8]
+    movu      m7, [pix2q+strideq+8]
+    pavgb     m6, [pix2q+8+1]
+    pavgb     m7, [pix2q+strideq+8+1]
+    psubusb   m6, m4
+    pavgb     m5, m6
+    pavgb     m6, m7
+    psadbw    m5, [pix1q+8]
+    psadbw    m6, [pix1q+strideq+8]
+    paddw     m0, m5
+    paddw     m0, m6
+    mova      m5, m7
+%endif
+    sub       hd, 2
+    jg .loop
+%if mmsize == 16
+    movhlps   m1, m0
+    paddw     m0, m1
+%endif
+    movd     eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_APPROX_XY2 8
+SAD_APPROX_XY2 16
+INIT_XMM sse2
+SAD_APPROX_XY2 16
+
+;--------------------------------------------------------------------
+;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;                  ptrdiff_t line_size, int h);
+;--------------------------------------------------------------------
+; %1 = 8/16
+%macro VSAD_INTRA 1
+cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
+    mova      m0, [pix1q]
+%if %1 == mmsize
+    mova      m2, [pix1q+lsizeq]
+    psadbw    m0, m2
+%else
+    mova      m2, [pix1q+lsizeq]
+    mova      m3, [pix1q+8]
+    mova      m4, [pix1q+lsizeq+8]
+    psadbw    m0, m2
+    psadbw    m3, m4
+    paddw     m0, m3
+%endif
+    sub       hd, 2
+
+.loop:
+    lea    pix1q, [pix1q + 2*lsizeq]
+%if %1 == mmsize
+    mova      m1, [pix1q]
+    psadbw    m2, m1
+    paddw     m0, m2
+    mova      m2, [pix1q+lsizeq]
+    psadbw    m1, m2
+    paddw     m0, m1
+%else
+    mova      m1, [pix1q]
+    mova      m3, [pix1q+8]
+    psadbw    m2, m1
+    psadbw    m4, m3
+    paddw     m0, m2
+    paddw     m0, m4
+    mova      m2, [pix1q+lsizeq]
+    mova      m4, [pix1q+lsizeq+8]
+    psadbw    m1, m2
+    psadbw    m3, m4
+    paddw     m0, m1
+    paddw     m0, m3
+%endif
+    sub       hd, 2
+    jg     .loop
+
+%if mmsize == 16
+    pshufd m1, m0, 0xe
+    paddd  m0, m1
+%endif
+    movd eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_INTRA 8
+VSAD_INTRA 16
+INIT_XMM sse2
+VSAD_INTRA 16
+
+;---------------------------------------------------------------------
+;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;                   ptrdiff_t line_size, int h);
+;---------------------------------------------------------------------
+; %1 = 8/16
+%macro VSAD_APPROX 1
+cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
+    mova   m1, [pb_80]
+    mova   m0, [pix1q]
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+    mova   m4, [pix1q+lsizeq]
+%if mmsize == 16
+    movu   m3, [pix2q]
+    movu   m2, [pix2q+lsizeq]
+    psubb  m0, m3
+    psubb  m4, m2
+%else
+    psubb  m0, [pix2q]
+    psubb  m4, [pix2q+lsizeq]
+%endif
+    pxor   m0, m1
+    pxor   m4, m1
+    psadbw m0, m4
+%else ; vsad16_mmxext
+    mova   m3, [pix1q+8]
+    psubb  m0, [pix2q]
+    psubb  m3, [pix2q+8]
+    pxor   m0, m1
+    pxor   m3, m1
+    mova   m4, [pix1q+lsizeq]
+    mova   m5, [pix1q+lsizeq+8]
+    psubb  m4, [pix2q+lsizeq]
+    psubb  m5, [pix2q+lsizeq+8]
+    pxor   m4, m1
+    pxor   m5, m1
+    psadbw m0, m4
+    psadbw m3, m5
+    paddw  m0, m3
+%endif
+    sub    hd, 2
+
+.loop:
+    lea pix1q, [pix1q + 2*lsizeq]
+    lea pix2q, [pix2q + 2*lsizeq]
+    mova   m2, [pix1q]
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+%if mmsize == 16
+    movu   m3, [pix2q]
+    psubb  m2, m3
+%else
+    psubb  m2, [pix2q]
+%endif
+    pxor   m2, m1
+    psadbw m4, m2
+    paddw  m0, m4
+    mova   m4, [pix1q+lsizeq]
+    movu   m3, [pix2q+lsizeq]
+    psubb  m4, m3
+    pxor   m4, m1
+    psadbw m2, m4
+    paddw  m0, m2
+%else ; vsad16_mmxext
+    mova   m3, [pix1q+8]
+    psubb  m2, [pix2q]
+    psubb  m3, [pix2q+8]
+    pxor   m2, m1
+    pxor   m3, m1
+    psadbw m4, m2
+    psadbw m5, m3
+    paddw  m0, m4
+    paddw  m0, m5
+    mova   m4, [pix1q+lsizeq]
+    mova   m5, [pix1q+lsizeq+8]
+    psubb  m4, [pix2q+lsizeq]
+    psubb  m5, [pix2q+lsizeq+8]
+    pxor   m4, m1
+    pxor   m5, m1
+    psadbw m2, m4
+    psadbw m3, m5
+    paddw  m0, m2
+    paddw  m0, m3
+%endif
+    sub    hd, 2
+    jg  .loop
+
+%if mmsize == 16
+    pshufd m1, m0, 0xe
+    paddd  m0, m1
+%endif
+    movd  eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_APPROX 8
+VSAD_APPROX 16
+INIT_XMM sse2
+VSAD_APPROX 16
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index b906bb6caa..49f50d0eed 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -5,20 +5,20 @@
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,382 +29,67 @@
 #include "libavcodec/me_cmp.h"
 #include "libavcodec/mpegvideo.h"
 
-#if HAVE_INLINE_ASM
-
-static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                    ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    __asm__ volatile (
-        "movl         %4, %%ecx          \n"
-        "shr          $1, %%ecx          \n"
-        "pxor      %%mm0, %%mm0          \n" /* mm0 = 0 */
-        "pxor      %%mm7, %%mm7          \n" /* mm7 holds the sum */
-        "1:                              \n"
-        "movq       (%0), %%mm1          \n" /* mm1 = pix1[0][0 - 7] */
-        "movq       (%1), %%mm2          \n" /* mm2 = pix2[0][0 - 7] */
-        "movq   (%0, %3), %%mm3          \n" /* mm3 = pix1[1][0 - 7] */
-        "movq   (%1, %3), %%mm4          \n" /* mm4 = pix2[1][0 - 7] */
-
-        /* todo: mm1-mm2, mm3-mm4 */
-        /* algo: subtract mm1 from mm2 with saturation and vice versa */
-        /*       OR the results to get absolute difference */
-        "movq      %%mm1, %%mm5          \n"
-        "movq      %%mm3, %%mm6          \n"
-        "psubusb   %%mm2, %%mm1          \n"
-        "psubusb   %%mm4, %%mm3          \n"
-        "psubusb   %%mm5, %%mm2          \n"
-        "psubusb   %%mm6, %%mm4          \n"
-
-        "por       %%mm1, %%mm2          \n"
-        "por       %%mm3, %%mm4          \n"
-
-        /* now convert to 16-bit vectors so we can square them */
-        "movq      %%mm2, %%mm1          \n"
-        "movq      %%mm4, %%mm3          \n"
-
-        "punpckhbw %%mm0, %%mm2          \n"
-        "punpckhbw %%mm0, %%mm4          \n"
-        "punpcklbw %%mm0, %%mm1          \n" /* mm1 now spread over (mm1, mm2) */
-        "punpcklbw %%mm0, %%mm3          \n" /* mm4 now spread over (mm3, mm4) */
-
-        "pmaddwd   %%mm2, %%mm2          \n"
-        "pmaddwd   %%mm4, %%mm4          \n"
-        "pmaddwd   %%mm1, %%mm1          \n"
-        "pmaddwd   %%mm3, %%mm3          \n"
-
-        "lea (%0, %3, 2), %0             \n" /* pix1 += 2 * stride */
-        "lea (%1, %3, 2), %1             \n" /* pix2 += 2 * stride */
-
-        "paddd     %%mm2, %%mm1          \n"
-        "paddd     %%mm4, %%mm3          \n"
-        "paddd     %%mm1, %%mm7          \n"
-        "paddd     %%mm3, %%mm7          \n"
-
-        "decl      %%ecx                 \n"
-        "jnz       1b                    \n"
-
-        "movq      %%mm7, %%mm1          \n"
-        "psrlq       $32, %%mm7          \n" /* shift hi dword to lo */
-        "paddd     %%mm7, %%mm1          \n"
-        "movd      %%mm1, %2             \n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-
-static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                     ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    __asm__ volatile (
-        "movl %4, %%ecx\n"
-        "pxor %%mm0, %%mm0\n"    /* mm0 = 0 */
-        "pxor %%mm7, %%mm7\n"    /* mm7 holds the sum */
-        "1:\n"
-        "movq (%0), %%mm1\n"     /* mm1 = pix1[0 -  7] */
-        "movq (%1), %%mm2\n"     /* mm2 = pix2[0 -  7] */
-        "movq 8(%0), %%mm3\n"    /* mm3 = pix1[8 - 15] */
-        "movq 8(%1), %%mm4\n"    /* mm4 = pix2[8 - 15] */
-
-        /* todo: mm1-mm2, mm3-mm4 */
-        /* algo: subtract mm1 from mm2 with saturation and vice versa */
-        /*       OR the results to get absolute difference */
-        "movq %%mm1, %%mm5\n"
-        "movq %%mm3, %%mm6\n"
-        "psubusb %%mm2, %%mm1\n"
-        "psubusb %%mm4, %%mm3\n"
-        "psubusb %%mm5, %%mm2\n"
-        "psubusb %%mm6, %%mm4\n"
-
-        "por %%mm1, %%mm2\n"
-        "por %%mm3, %%mm4\n"
-
-        /* now convert to 16-bit vectors so we can square them */
-        "movq %%mm2, %%mm1\n"
-        "movq %%mm4, %%mm3\n"
-
-        "punpckhbw %%mm0, %%mm2\n"
-        "punpckhbw %%mm0, %%mm4\n"
-        "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
-        "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
-
-        "pmaddwd %%mm2, %%mm2\n"
-        "pmaddwd %%mm4, %%mm4\n"
-        "pmaddwd %%mm1, %%mm1\n"
-        "pmaddwd %%mm3, %%mm3\n"
-
-        "add %3, %0\n"
-        "add %3, %1\n"
-
-        "paddd %%mm2, %%mm1\n"
-        "paddd %%mm4, %%mm3\n"
-        "paddd %%mm1, %%mm7\n"
-        "paddd %%mm3, %%mm7\n"
-
-        "decl %%ecx\n"
-        "jnz 1b\n"
-
-        "movq %%mm7, %%mm1\n"
-        "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
-        "paddd %%mm7, %%mm1\n"
-        "movd %%mm1, %2\n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-
-static int hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm7, %%mm7\n"
-        "pxor %%mm6, %%mm6\n"
-
-        "movq (%0), %%mm0\n"
-        "movq %%mm0, %%mm1\n"
-        "psllq $8, %%mm0\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm0\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq %%mm4, %%mm1\n"
-        "psllq $8, %%mm4\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm4\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "1:\n"
-
-        "movq (%0), %%mm0\n"
-        "movq %%mm0, %%mm1\n"
-        "psllq $8, %%mm0\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm0\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-        "psubw %%mm0, %%mm4\n"
-        "psubw %%mm2, %%mm5\n"
-        "pxor  %%mm3, %%mm3\n"
-        "pxor  %%mm1, %%mm1\n"
-        "pcmpgtw %%mm4, %%mm3\n\t"
-        "pcmpgtw %%mm5, %%mm1\n\t"
-        "pxor  %%mm3, %%mm4\n"
-        "pxor  %%mm1, %%mm5\n"
-        "psubw %%mm3, %%mm4\n"
-        "psubw %%mm1, %%mm5\n"
-        "paddw %%mm4, %%mm5\n"
-        "paddw %%mm5, %%mm6\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq      %%mm4, %%mm1\n"
-        "psllq $8, %%mm4\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm4\n"
-        "movq      %%mm4, %%mm5\n"
-        "movq      %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw     %%mm1, %%mm4\n"
-        "psubw     %%mm3, %%mm5\n"
-        "psubw     %%mm4, %%mm0\n"
-        "psubw     %%mm5, %%mm2\n"
-        "pxor      %%mm3, %%mm3\n"
-        "pxor      %%mm1, %%mm1\n"
-        "pcmpgtw   %%mm0, %%mm3\n\t"
-        "pcmpgtw   %%mm2, %%mm1\n\t"
-        "pxor      %%mm3, %%mm0\n"
-        "pxor      %%mm1, %%mm2\n"
-        "psubw     %%mm3, %%mm0\n"
-        "psubw     %%mm1, %%mm2\n"
-        "paddw     %%mm0, %%mm2\n"
-        "paddw     %%mm2, %%mm6\n"
-
-        "add  %2, %0\n"
-        "subl $2, %%ecx\n"
-        " jnz 1b\n"
-
-        "movq      %%mm6, %%mm0\n"
-        "punpcklwd %%mm7, %%mm0\n"
-        "punpckhwd %%mm7, %%mm6\n"
-        "paddd     %%mm0, %%mm6\n"
-
-        "movq  %%mm6, %%mm0\n"
-        "psrlq $32,   %%mm6\n"
-        "paddd %%mm6, %%mm0\n"
-        "movd  %%mm0, %1\n"
-        : "+r" (pix1), "=r" (tmp)
-        : "r" (stride), "g" (h - 2)
-        : "%ecx");
-
-    return tmp;
-}
-
-static int hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
-{
-    int tmp;
-    uint8_t *pix = pix1;
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm7, %%mm7\n"
-        "pxor %%mm6, %%mm6\n"
-
-        "movq (%0), %%mm0\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "1:\n"
-
-        "movq (%0), %%mm0\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-        "psubw %%mm0, %%mm4\n"
-        "psubw %%mm2, %%mm5\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm4, %%mm3\n\t"
-        "pcmpgtw %%mm5, %%mm1\n\t"
-        "pxor %%mm3, %%mm4\n"
-        "pxor %%mm1, %%mm5\n"
-        "psubw %%mm3, %%mm4\n"
-        "psubw %%mm1, %%mm5\n"
-        "paddw %%mm4, %%mm5\n"
-        "paddw %%mm5, %%mm6\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "subl $2, %%ecx\n"
-        " jnz 1b\n"
-
-        "movq %%mm6, %%mm0\n"
-        "punpcklwd %%mm7, %%mm0\n"
-        "punpckhwd %%mm7, %%mm6\n"
-        "paddd %%mm0, %%mm6\n"
+int ff_sum_abs_dctelem_mmx(int16_t *block);
+int ff_sum_abs_dctelem_mmxext(int16_t *block);
+int ff_sum_abs_dctelem_sse2(int16_t *block);
+int ff_sum_abs_dctelem_ssse3(int16_t *block);
+int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                ptrdiff_t stride, int h);
+int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                 ptrdiff_t stride, int h);
+int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
+int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
+int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
+int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                   ptrdiff_t stride, int h);
+int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    ptrdiff_t stride, int h);
+int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
+int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      ptrdiff_t stride, int h);
+int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h);
+int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     ptrdiff_t stride, int h);
+int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      ptrdiff_t stride, int h);
+int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h);
+int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     ptrdiff_t stride, int h);
+int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                              ptrdiff_t stride, int h);
+int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                               ptrdiff_t stride, int h);
+int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                             ptrdiff_t stride, int h);
+int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                          ptrdiff_t stride, int h);
+int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                           ptrdiff_t stride, int h);
+int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         ptrdiff_t stride, int h);
+int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    ptrdiff_t stride, int h);
+int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     ptrdiff_t stride, int h);
+int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                   ptrdiff_t stride, int h);
 
-        "movq %%mm6, %%mm0\n"
-        "psrlq $32, %%mm6\n"
-        "paddd %%mm6, %%mm0\n"
-        "movd %%mm0, %1\n"
-        : "+r" (pix1), "=r" (tmp)
-        : "r" (stride), "g" (h - 2)
-        : "%ecx");
+#define hadamard_func(cpu)                                                    \
+    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
+                                  uint8_t *src2, ptrdiff_t stride, int h);    \
+    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
+                                    uint8_t *src2, ptrdiff_t stride, int h);
 
-    return tmp + hf_noise8_mmx(pix + 8, stride, h);
-}
+hadamard_func(mmx)
+hadamard_func(mmxext)
+hadamard_func(sse2)
+hadamard_func(ssse3)
 
+#if HAVE_YASM
 static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
 {
@@ -413,9 +98,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
     if (c)
         score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
     else
-        score1 = sse16_mmx(c, pix1, pix2, stride, h);
-    score2 = hf_noise16_mmx(pix1, stride, h) -
-             hf_noise16_mmx(pix2, stride, h);
+        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
+    score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
+           - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
 
     if (c)
         return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -426,9 +111,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
 static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
 {
-    int score1 = sse8_mmx(c, pix1, pix2, stride, h);
-    int score2 = hf_noise8_mmx(pix1, stride, h) -
-                 hf_noise8_mmx(pix2, stride, h);
+    int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
+    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
+                 ff_hf_noise8_mmx(pix2, stride, h);
 
     if (c)
         return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -436,13 +121,17 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
         return score1 + FFABS(score2) * 8;
 }
 
+#endif /* HAVE_YASM */
+
+#if HAVE_INLINE_ASM
+
 static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                             ptrdiff_t stride, int h)
 {
     int tmp;
 
-    assert((((int) pix) & 7) == 0);
-    assert((stride & 7) == 0);
+    av_assert2((((int) pix) & 7) == 0);
+    av_assert2((stride & 7) == 0);
 
 #define SUM(in0, in1, out0, out1)               \
     "movq (%0), %%mm2\n"                        \
@@ -500,57 +189,14 @@ static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
 }
 #undef SUM
 
-static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
-                               ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    assert((((int) pix) & 7) == 0);
-    assert((stride & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), " #out0 "\n"                    \
-    "movq 8(%0), " #out1 "\n"                   \
-    "add %2, %0\n"                              \
-    "psadbw " #out0 ", " #in0 "\n"              \
-    "psadbw " #out1 ", " #in1 "\n"              \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw " #in0 ", %%mm6\n"
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pxor %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq 8(%0), %%mm1\n"
-        "add %2, %0\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movd %%mm6, %1\n"
-        : "+r" (pix), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-#undef SUM
-
 static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
 {
     int tmp;
 
-    assert((((int) pix1) & 7) == 0);
-    assert((((int) pix2) & 7) == 0);
-    assert((stride & 7) == 0);
+    av_assert2((((int) pix1) & 7) == 0);
+    av_assert2((((int) pix2) & 7) == 0);
+    av_assert2((stride & 7) == 0);
 
 #define SUM(in0, in1, out0, out1)       \
     "movq (%0), %%mm2\n"                \
@@ -624,191 +270,16 @@ static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 }
 #undef SUM
 
-static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                         ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    assert((((int) pix1) & 7) == 0);
-    assert((((int) pix2) & 7) == 0);
-    assert((stride & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), " #out0 "\n"                    \
-    "movq (%1), %%mm2\n"                        \
-    "movq 8(%0), " #out1 "\n"                   \
-    "movq 8(%1), %%mm3\n"                       \
-    "add %3, %0\n"                              \
-    "add %3, %1\n"                              \
-    "psubb %%mm2, " #out0 "\n"                  \
-    "psubb %%mm3, " #out1 "\n"                  \
-    "pxor %%mm7, " #out0 "\n"                   \
-    "pxor %%mm7, " #out1 "\n"                   \
-    "psadbw " #out0 ", " #in0 "\n"              \
-    "psadbw " #out1 ", " #in1 "\n"              \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw " #in0 ", %%mm6\n    "
-
-    __asm__ volatile (
-        "movl %4, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pcmpeqw %%mm7, %%mm7\n"
-        "psllw $15, %%mm7\n"
-        "packsswb %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq (%1), %%mm2\n"
-        "movq 8(%0), %%mm1\n"
-        "movq 8(%1), %%mm3\n"
-        "add %3, %0\n"
-        "add %3, %1\n"
-        "psubb %%mm2, %%mm0\n"
-        "psubb %%mm3, %%mm1\n"
-        "pxor %%mm7, %%mm0\n"
-        "pxor %%mm7, %%mm1\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movd %%mm6, %2\n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-#undef SUM
-
-#define MMABS_MMX(a,z)                          \
-    "pxor "    #z ", " #z "             \n\t"   \
-    "pcmpgtw " #a ", " #z "             \n\t"   \
-    "pxor "    #z ", " #a "             \n\t"   \
-    "psubw "   #z ", " #a "             \n\t"
-
-#define MMABS_MMXEXT(a, z)                      \
-    "pxor "    #z ", " #z "             \n\t"   \
-    "psubw "   #a ", " #z "             \n\t"   \
-    "pmaxsw "  #z ", " #a "             \n\t"
-
-#define MMABS_SSSE3(a,z)                        \
-    "pabsw "   #a ", " #a "             \n\t"
-
-#define MMABS_SUM(a,z, sum)                     \
-    MMABS(a,z)                                  \
-    "paddusw " #a ", " #sum "           \n\t"
-
-/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
- * up to about 100k on extreme inputs. But that's very unlikely to occur in
- * natural video, and it's even more unlikely to not have any alternative
- * mvs/modes with lower cost. */
-#define HSUM_MMX(a, t, dst)                     \
-    "movq    " #a ", " #t "             \n\t"   \
-    "psrlq      $32, " #a "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movq    " #a ", " #t "             \n\t"   \
-    "psrlq      $16, " #a "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define HSUM_MMXEXT(a, t, dst)                  \
-    "pshufw   $0x0E, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshufw   $0x01, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define HSUM_SSE2(a, t, dst)                    \
-    "movhlps " #a ", " #t "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshuflw  $0x0E, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshuflw  $0x01, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define DCT_SAD4(m, mm, o)                      \
-    "mov"#m" "#o" +  0(%1), " #mm "2    \n\t"   \
-    "mov"#m" "#o" + 16(%1), " #mm "3    \n\t"   \
-    "mov"#m" "#o" + 32(%1), " #mm "4    \n\t"   \
-    "mov"#m" "#o" + 48(%1), " #mm "5    \n\t"   \
-    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0)        \
-    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1)        \
-    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0)        \
-    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1)        \
-
-#define DCT_SAD_MMX                             \
-    "pxor    %%mm0, %%mm0               \n\t"   \
-    "pxor    %%mm1, %%mm1               \n\t"   \
-    DCT_SAD4(q, %%mm, 0)                        \
-    DCT_SAD4(q, %%mm, 8)                        \
-    DCT_SAD4(q, %%mm, 64)                       \
-    DCT_SAD4(q, %%mm, 72)                       \
-    "paddusw %%mm1, %%mm0               \n\t"   \
-    HSUM(%%mm0, %%mm1, %0)
-
-#define DCT_SAD_SSE2                            \
-    "pxor    %%xmm0, %%xmm0             \n\t"   \
-    "pxor    %%xmm1, %%xmm1             \n\t"   \
-    DCT_SAD4(dqa, %%xmm, 0)                     \
-    DCT_SAD4(dqa, %%xmm, 64)                    \
-    "paddusw %%xmm1, %%xmm0             \n\t"   \
-    HSUM(%%xmm0, %%xmm1, %0)
-
-#define DCT_SAD_FUNC(cpu)                           \
-static int sum_abs_dctelem_ ## cpu(int16_t *block)  \
-{                                                   \
-    int sum;                                        \
-    __asm__ volatile (                              \
-        DCT_SAD                                     \
-        :"=r"(sum)                                  \
-        :"r"(block));                               \
-    return sum & 0xFFFF;                            \
-}
-
-#define DCT_SAD         DCT_SAD_MMX
-#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
-#define MMABS(a, z)     MMABS_MMX(a, z)
-DCT_SAD_FUNC(mmx)
-#undef MMABS
-#undef HSUM
-
-#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
-#define MMABS(a, z)     MMABS_MMXEXT(a, z)
-DCT_SAD_FUNC(mmxext)
-#undef HSUM
-#undef DCT_SAD
-
-#define DCT_SAD         DCT_SAD_SSE2
-#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
-DCT_SAD_FUNC(sse2)
-#undef MMABS
-
-#if HAVE_SSSE3_INLINE
-#define MMABS(a, z)     MMABS_SSSE3(a, z)
-DCT_SAD_FUNC(ssse3)
-#undef MMABS
-#endif
-#undef HSUM
-#undef DCT_SAD
-
-
 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
     0x0000000000000000ULL,
     0x0001000100010001ULL,
     0x0002000200020002ULL,
 };
 
-DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
-
 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                               ptrdiff_t stride, int h)
 {
-    x86_reg len = -(stride * h);
+    x86_reg len = -stride * h;
     __asm__ volatile (
         ".p2align 4                     \n\t"
         "1:                             \n\t"
@@ -841,133 +312,10 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
         : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
 }
 
-static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                 ptrdiff_t stride, int h)
-{
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" (stride));
-}
-
-static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
-                      ptrdiff_t stride, int h)
-{
-    int ret;
-    __asm__ volatile (
-        "pxor %%xmm2, %%xmm2            \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movdqu (%1), %%xmm0            \n\t"
-        "movdqu (%1, %4), %%xmm1        \n\t"
-        "psadbw (%2), %%xmm0            \n\t"
-        "psadbw (%2, %4), %%xmm1        \n\t"
-        "paddw %%xmm0, %%xmm2           \n\t"
-        "paddw %%xmm1, %%xmm2           \n\t"
-        "lea (%1,%4,2), %1              \n\t"
-        "lea (%2,%4,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        "movhlps %%xmm2, %%xmm0         \n\t"
-        "paddw   %%xmm0, %%xmm2         \n\t"
-        "movd    %%xmm2, %3             \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
-        : "r" (stride));
-    return ret;
-}
-
-static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                   ptrdiff_t stride, int h)
-{
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "pavgb 1(%1), %%mm0             \n\t"
-        "pavgb 1(%1, %3), %%mm1         \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" (stride));
-}
-
-static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                   ptrdiff_t stride, int h)
-{
-    __asm__ volatile (
-        "movq (%1), %%mm0               \n\t"
-        "add %3, %1                     \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm1               \n\t"
-        "movq (%1, %3), %%mm2           \n\t"
-        "pavgb %%mm1, %%mm0             \n\t"
-        "pavgb %%mm2, %%mm1             \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "movq %%mm2, %%mm0              \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" (stride));
-}
-
-static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                 ptrdiff_t stride, int h)
-{
-    __asm__ volatile (
-        "movq "MANGLE(bone)", %%mm5     \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "pavgb 1(%1), %%mm0             \n\t"
-        "add %3, %1                     \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm1               \n\t"
-        "movq (%1,%3), %%mm2            \n\t"
-        "pavgb 1(%1), %%mm1             \n\t"
-        "pavgb 1(%1,%3), %%mm2          \n\t"
-        "psubusb %%mm5, %%mm1           \n\t"
-        "pavgb %%mm1, %%mm0             \n\t"
-        "pavgb %%mm2, %%mm1             \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2,%3), %%mm1          \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "movq %%mm2, %%mm0              \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" (stride));
-}
-
 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                               ptrdiff_t stride, int h)
 {
-    x86_reg len = -(stride * h);
+    x86_reg len = -stride * h;
     __asm__ volatile (
         ".p2align 4                     \n\t"
         "1:                             \n\t"
@@ -1006,7 +354,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                               ptrdiff_t stride, int h)
 {
-    x86_reg len = -(stride * h);
+    x86_reg len = -stride * h;
     __asm__ volatile (
         "movq  (%1, %%"REG_a"), %%mm0   \n\t"
         "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
@@ -1030,7 +378,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
         "punpckhbw %%mm7, %%mm5         \n\t"
         "paddw %%mm4, %%mm2             \n\t"
         "paddw %%mm5, %%mm3             \n\t"
-        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
+        "movq %5, %%mm5                 \n\t"
         "paddw %%mm2, %%mm0             \n\t"
         "paddw %%mm3, %%mm1             \n\t"
         "paddw %%mm5, %%mm0             \n\t"
@@ -1054,7 +402,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
         " js 1b                         \n\t"
         : "+a" (len)
         : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
-          "r" (stride));
+          "r" (stride), "m" (round_tab[2]));
 }
 
 static inline int sum_mmx(void)
@@ -1072,15 +420,6 @@ static inline int sum_mmx(void)
     return ret & 0xFFFF;
 }
 
-static inline int sum_mmxext(void)
-{
-    int ret;
-    __asm__ volatile (
-        "movd %%mm6, %0                 \n\t"
-        : "=r" (ret));
-    return ret;
-}
-
 static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                 ptrdiff_t stride, int h)
 {
@@ -1097,7 +436,7 @@ static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                         uint8_t *blk1, ptrdiff_t stride, int h)         \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1111,7 +450,7 @@ static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                            uint8_t *blk1, ptrdiff_t stride, int h)      \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1126,7 +465,7 @@ static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                            uint8_t *blk1, ptrdiff_t stride, int h)      \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1141,7 +480,7 @@ static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                             uint8_t *blk1, ptrdiff_t stride, int h)     \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1211,32 +550,15 @@ static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
 }                                                                       \
 
 PIX_SAD(mmx)
-PIX_SAD(mmxext)
 
 #endif /* HAVE_INLINE_ASM */
 
-int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                  ptrdiff_t stride, int h);
-
-#define hadamard_func(cpu)                                                    \
-    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
-                                  uint8_t *src2, ptrdiff_t stride, int h);    \
-    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
-                                    uint8_t *src2, ptrdiff_t stride, int h);
-
-hadamard_func(mmx)
-hadamard_func(mmxext)
-hadamard_func(sse2)
-hadamard_func(ssse3)
-
 av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
 
 #if HAVE_INLINE_ASM
     if (INLINE_MMX(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
-
         c->pix_abs[0][0] = sad16_mmx;
         c->pix_abs[0][1] = sad16_x2_mmx;
         c->pix_abs[0][2] = sad16_y2_mmx;
@@ -1249,77 +571,81 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
         c->sad[0] = sad16_mmx;
         c->sad[1] = sad8_mmx;
 
-        c->sse[0]  = sse16_mmx;
-        c->sse[1]  = sse8_mmx;
         c->vsad[4] = vsad_intra16_mmx;
 
-        c->nsse[0] = nsse16_mmx;
-        c->nsse[1] = nsse8_mmx;
-
         if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
             c->vsad[0] = vsad16_mmx;
         }
     }
 
-    if (INLINE_MMXEXT(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
-
-        c->vsad[4] = vsad_intra16_mmxext;
-
-        c->pix_abs[0][0] = sad16_mmxext;
-        c->pix_abs[1][0] = sad8_mmxext;
-
-        c->sad[0] = sad16_mmxext;
-        c->sad[1] = sad8_mmxext;
-
-        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
-            c->pix_abs[0][1] = sad16_x2_mmxext;
-            c->pix_abs[0][2] = sad16_y2_mmxext;
-            c->pix_abs[0][3] = sad16_xy2_mmxext;
-            c->pix_abs[1][1] = sad8_x2_mmxext;
-            c->pix_abs[1][2] = sad8_y2_mmxext;
-            c->pix_abs[1][3] = sad8_xy2_mmxext;
-
-            c->vsad[0] = vsad16_mmxext;
-        }
-    }
-
-    if (INLINE_SSE2(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
-    }
-
-    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
-        c->sad[0] = sad16_sse2;
-    }
-
-#if HAVE_SSSE3_INLINE
-    if (INLINE_SSSE3(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
-    }
-#endif
 #endif /* HAVE_INLINE_ASM */
 
     if (EXTERNAL_MMX(cpu_flags)) {
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
+        c->sse[0]            = ff_sse16_mmx;
+        c->sse[1]            = ff_sse8_mmx;
+#if HAVE_YASM
+        c->nsse[0]           = nsse16_mmx;
+        c->nsse[1]           = nsse8_mmx;
+#endif
     }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
+
+        c->sad[0] = ff_sad16_mmxext;
+        c->sad[1] = ff_sad8_mmxext;
+
+        c->pix_abs[0][0] = ff_sad16_mmxext;
+        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
+        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
+        c->pix_abs[1][0] = ff_sad8_mmxext;
+        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
+        c->pix_abs[1][2] = ff_sad8_y2_mmxext;
+
+        c->vsad[4] = ff_vsad_intra16_mmxext;
+        c->vsad[5] = ff_vsad_intra8_mmxext;
+
+        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
+            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
+            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
+
+            c->vsad[0] = ff_vsad16_approx_mmxext;
+            c->vsad[1] = ff_vsad8_approx_mmxext;
+        }
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->sse[0] = ff_sse16_sse2;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;
 
 #if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
         c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
 #endif
+        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
+            c->sad[0]        = ff_sad16_sse2;
+            c->pix_abs[0][0] = ff_sad16_sse2;
+            c->pix_abs[0][1] = ff_sad16_x2_sse2;
+            c->pix_abs[0][2] = ff_sad16_y2_sse2;
+
+            c->vsad[4]       = ff_vsad_intra16_sse2;
+            if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
+                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
+                c->vsad[0]       = ff_vsad16_approx_sse2;
+            }
+        }
     }
 
-    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
+#if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
         c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+#endif
     }
 }
diff --git a/libavcodec/x86/mlpdsp.asm b/libavcodec/x86/mlpdsp.asm
new file mode 100644
index 0000000000..3dc641e89e
--- /dev/null
+++ b/libavcodec/x86/mlpdsp.asm
@@ -0,0 +1,196 @@
+;******************************************************************************
+;* SIMD-optimized MLP DSP functions
+;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%if ARCH_X86_64
+
+%macro SHLX 2
+%if cpuflag(bmi2)
+   shlx %1, %1, %2q
+%else
+   shl  %1, %2b
+%endif
+%endmacro
+
+%macro REMATRIX 0
+    movdqa        m0, [samplesq]
+    movdqa        m1, [coeffsq ]
+    pshufd        m2, m0, q2301
+    pshufd        m3, m1, q2301
+    pmuldq        m0, m1
+    pmuldq        m3, m2
+    paddq         m0, m3
+%if notcpuflag(avx2)
+    movdqa        m1, [samplesq + 16]
+    movdqa        m2, [coeffsq  + 16]
+    pshufd        m3, m1, q2301
+    pshufd        m4, m2, q2301
+    pmuldq        m1, m2
+    pmuldq        m4, m3
+    paddq         m0, m1
+    paddq         m0, m4
+%else
+    vextracti128 xm1, m0, 1
+    paddq        xm0, xm1
+%endif
+%endmacro
+
+%macro LOOP_END 0
+    pshufd       xm1, xm0, q0032
+    paddq        xm0, xm1
+    movq      accumq, xm0
+    movzx     blsbsd, byte [blsbs_ptrq]             ; load *bypassed_lsbs
+    sar       accumq, 14                            ; accum >>= 14
+    and       accumd, maskd                         ; accum &= mask
+    add       accumd, blsbsd                        ; accum += *bypassed_lsbs
+    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
+    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
+    add     samplesq, 32                            ; samples += MAX_CHANNELS;
+    cmp   blsbs_ptrq, cntq
+%endmacro
+
+%macro LOOP_SHIFT_END 0
+    pshufd       xm1, xm0, q0032
+    paddq        xm0, xm1
+    movq      accumq, xm0
+    and       indexd, auspd                         ; index &= access_unit_size_pow2;
+    movsx     noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index]
+    add       indexd, index2d                       ; index += index2
+    SHLX      noiseq, mns                           ; noise_buffer[index] <<= matrix_noise_shift
+    add       accumq, noiseq                        ; accum += noise_buffer[index]
+    movzx     noised, byte [blsbs_ptrq]             ; load *bypassed_lsbs (reuse tmp noise register)
+    sar       accumq, 14                            ; accum >>= 14
+    and       accumd, maskd                         ; accum &= mask
+    add       accumd, noised                        ; accum += *bypassed_lsbs
+    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
+    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
+    add     samplesq, 32                            ; samples += MAX_CHANNELS;
+    cmp   blsbs_ptrq, cntq
+%endmacro
+
+;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
+;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
+;                             int index, unsigned int dest_ch, uint16_t blockpos,
+;                             unsigned int maxchan, int matrix_noise_shift,
+;                             int access_unit_size_pow2, int32_t mask)
+%macro MLP_REMATRIX_CHANNEL 0
+cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
+                                        index, dest_ch, blockpos, maxchan, mns, \
+                                        accum, mask, cnt
+    mov         mnsd, mnsm                          ; load matrix_noise_shift
+    movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
+    mov     maxchand, maxchanm                      ; load maxchan
+    mov        maskd, maskm                         ; load mask
+%if WIN64
+    mov     dest_chd, dest_chm                      ; load dest_chd (not needed on UNIX64)
+%endif
+    shl     dest_chd, 2
+    lea         cntq, [blsbs_ptrq + blockposq*8]
+    test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
+    jne .shift                                      ; jump if true
+    cmp     maxchand, 4                             ; is maxchan < 4?
+    jl .loop4                                       ; jump if true
+
+align 16
+.loop8:
+    ; Process 5 or more channels
+    REMATRIX
+    LOOP_END
+    jne .loop8
+    RET
+
+align 16
+.loop4:
+    ; Process up to 4 channels
+    movdqa       xm0, [samplesq]
+    movdqa       xm1, [coeffsq ]
+    pshufd       xm2, xm0, q2301
+    pshufd       xm3, xm1, q2301
+    pmuldq       xm0, xm1
+    pmuldq       xm3, xm2
+    paddq        xm0, xm3
+    LOOP_END
+    jne .loop4
+    RET
+
+.shift:
+%if WIN64
+    mov       indexd, indexm         ; load index (not needed on UNIX64)
+%endif
+    mov          r9d, r9m            ; load access_unit_size_pow2
+%if cpuflag(bmi2)
+    ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
+    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
+                index, dest_ch, accum, index2, mns, \
+                ausp, mask, cnt, noise
+    add         mnsd, 7              ; matrix_noise_shift += 7
+%else ; sse4
+    mov           r6, rcx            ; move rcx elsewhere so we can use cl for matrix_noise_shift
+%if WIN64
+    ; r0 = rcx
+    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
+                index2, accum, ausp, mask, cnt, noise
+%else ; UNIX64
+    ; r3 = rcx
+    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
+                index2, accum, ausp, mask, cnt, noise
+%endif
+    lea         mnsd, [r8 + 7]       ; rcx = matrix_noise_shift + 7
+%endif ; cpuflag
+    sub        auspd, 1              ; access_unit_size_pow2 -= 1
+    cmp          r7d, 4              ; is maxchan < 4?
+    lea      index2q, [indexq*2 + 1] ; index2 = 2 * index + 1;
+    jl .loop4_shift                  ; jump if maxchan < 4
+
+align 16
+.loop8_shift:
+    ; Process 5 or more channels
+    REMATRIX
+    LOOP_SHIFT_END
+    jne .loop8_shift
+    RET
+
+align 16
+.loop4_shift:
+    ; Process up to 4 channels
+    movdqa       xm0, [samplesq]
+    movdqa       xm1, [coeffsq ]
+    pshufd       xm2, xm0, q2301
+    pshufd       xm3, xm1, q2301
+    pmuldq       xm0, xm1
+    pmuldq       xm3, xm2
+    paddq        xm0, xm3
+    LOOP_SHIFT_END
+    jne .loop4_shift
+    RET
+%endmacro
+
+INIT_XMM sse4
+MLP_REMATRIX_CHANNEL
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2, bmi2
+MLP_REMATRIX_CHANNEL
+%endif
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp_init.c
index 72fc637764..e9d9b1bf18 100644
--- a/libavcodec/x86/mlpdsp.c
+++ b/libavcodec/x86/mlpdsp_init.c
@@ -2,32 +2,47 @@
  * MLP DSP functions x86-optimized
  * Copyright (c) 2009 Ramiro Polla
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
-#include "libavutil/internal.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mlpdsp.h"
 #include "libavcodec/mlp.h"
 
-#if HAVE_7REGS && HAVE_INLINE_ASM
+#define REMATRIX_CHANNEL_FUNC(opt) \
+void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
+                                   const int32_t *coeffs, \
+                                   const uint8_t *bypassed_lsbs, \
+                                   const int8_t *noise_buffer, \
+                                   int index, \
+                                   unsigned int dest_ch, \
+                                   uint16_t blockpos, \
+                                   unsigned int maxchan, \
+                                   int matrix_noise_shift, \
+                                   int access_unit_size_pow2, \
+                                   int32_t mask);
+
+REMATRIX_CHANNEL_FUNC(sse4)
+REMATRIX_CHANNEL_FUNC(avx2_bmi2)
+
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
 
 extern char ff_mlp_firorder_8;
 extern char ff_mlp_firorder_7;
@@ -45,12 +60,12 @@ extern char ff_mlp_iirorder_2;
 extern char ff_mlp_iirorder_1;
 extern char ff_mlp_iirorder_0;
 
-static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
+static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
                                    &ff_mlp_firorder_2, &ff_mlp_firorder_3,
                                    &ff_mlp_firorder_4, &ff_mlp_firorder_5,
                                    &ff_mlp_firorder_6, &ff_mlp_firorder_7,
                                    &ff_mlp_firorder_8 };
-static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
+static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
                                    &ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
                                    &ff_mlp_iirorder_4 };
 
@@ -133,8 +148,8 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
         FIRMUL   (ff_mlp_firorder_6, 0x14   )
         FIRMUL   (ff_mlp_firorder_5, 0x10   )
         FIRMUL   (ff_mlp_firorder_4, 0x0c   )
-        FIRMULREG(ff_mlp_firorder_3, 0x08,10)
-        FIRMULREG(ff_mlp_firorder_2, 0x04, 9)
+        FIRMUL   (ff_mlp_firorder_3, 0x08   )
+        FIRMUL   (ff_mlp_firorder_2, 0x04   )
         FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
         LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
         "jmp  *%6                     \n\t"
@@ -163,8 +178,6 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
         : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
           /* 6*/"r"(iirjump)      , /* 7*/"c"(filter_shift)
         , /* 8*/"r"((int64_t)coeff[0])
-        , /* 9*/"r"((int64_t)coeff[1])
-        , /*10*/"r"((int64_t)coeff[2])
         : "rax", "rdx", "rsi"
 #else /* ARCH_X86_32 */
           /* 3*/"+m"(blocksize)
@@ -179,9 +192,13 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
 
 av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
 {
-#if HAVE_7REGS && HAVE_INLINE_ASM
     int cpu_flags = av_get_cpu_flags();
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
     if (INLINE_MMX(cpu_flags))
         c->mlp_filter_channel = mlp_filter_channel_x86;
 #endif
+    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
+        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
+    if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
+        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
 }
diff --git a/libavcodec/x86/mpegaudiodsp.c b/libavcodec/x86/mpegaudiodsp.c
index 533b4a7c3f..d969f1df38 100644
--- a/libavcodec/x86/mpegaudiodsp.c
+++ b/libavcodec/x86/mpegaudiodsp.c
@@ -2,20 +2,20 @@
  * SIMD-optimized MP3 decoding functions
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,11 +26,20 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mpegaudiodsp.h"
 
-void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
+#define DECL(CPU)\
+static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
+void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
+
+#if HAVE_YASM
+#if ARCH_X86_32
+DECL(sse)
+#endif
+DECL(sse2)
+DECL(sse3)
+DECL(ssse3)
+DECL(avx)
+#endif /* HAVE_YASM */
+
 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                                float *tmpbuf);
 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
@@ -38,7 +47,7 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
 
 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
 
-#if HAVE_SSE2_INLINE
+#if HAVE_6REGS && HAVE_SSE_INLINE
 
 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
@@ -182,7 +191,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
     *out = sum;
 }
 
-#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
 
 #if HAVE_YASM
 #define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
@@ -217,16 +226,22 @@ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
     }                                                                   \
 }
 
+#if HAVE_SSE
+#if ARCH_X86_32
 DECL_IMDCT_BLOCKS(sse,sse)
+#endif
 DECL_IMDCT_BLOCKS(sse2,sse)
 DECL_IMDCT_BLOCKS(sse3,sse)
 DECL_IMDCT_BLOCKS(ssse3,sse)
+#endif
+#if HAVE_AVX_EXTERNAL
 DECL_IMDCT_BLOCKS(avx,avx)
+#endif
 #endif /* HAVE_YASM */
 
 av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
 {
-    int cpu_flags = av_get_cpu_flags();
+    av_unused int cpu_flags = av_get_cpu_flags();
 
     int i, j;
     for (j = 0; j < 4; j++) {
@@ -242,16 +257,19 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
         }
     }
 
-#if HAVE_SSE2_INLINE
-    if (INLINE_SSE2(cpu_flags)) {
+#if HAVE_6REGS && HAVE_SSE_INLINE
+    if (INLINE_SSE(cpu_flags)) {
         s->apply_window_float = apply_window_mp3;
     }
-#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_SSE_INLINE */
 
 #if HAVE_YASM
+#if HAVE_SSE
+#if ARCH_X86_32
     if (EXTERNAL_SSE(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_sse;
     }
+#endif
     if (EXTERNAL_SSE2(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_sse2;
     }
@@ -261,8 +279,11 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
     if (EXTERNAL_SSSE3(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_ssse3;
     }
+#endif
+#if HAVE_AVX_EXTERNAL
     if (EXTERNAL_AVX(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_avx;
     }
+#endif
 #endif /* HAVE_YASM */
 }
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index b739a8f9a8..af47422cf2 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -2,20 +2,20 @@
  * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
  * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,8 +25,9 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideo.h"
+#include "libavcodec/mpegvideodata.h"
 
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
 
 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
                                   int16_t *block, int n, int qscale)
@@ -35,7 +36,7 @@ static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
 
     qmul = qscale << 1;
 
-    assert(s->block_last_index[n]>=0 || s->h263_aic);
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
     if (!s->h263_aic) {
         if (n < 4)
@@ -111,7 +112,7 @@ static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
     qmul = qscale << 1;
     qadd = (qscale - 1) | 1;
 
-    assert(s->block_last_index[n]>=0 || s->h263_aic);
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
     nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
 
@@ -171,7 +172,7 @@ static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int block0;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
@@ -239,7 +240,7 @@ static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
@@ -306,7 +307,10 @@ static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int block0;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
+
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
 
     if(s->alternate_scan) nCoeffs= 63; //FIXME
     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -345,8 +349,8 @@ __asm__ volatile(
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
-                "psraw $3, %%mm0                \n\t"
-                "psraw $3, %%mm1                \n\t"
+                "psraw $4, %%mm0                \n\t"
+                "psraw $4, %%mm1                \n\t"
                 "pxor %%mm2, %%mm0              \n\t"
                 "pxor %%mm3, %%mm1              \n\t"
                 "psubw %%mm2, %%mm0             \n\t"
@@ -371,7 +375,10 @@ static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
+
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
 
     if(s->alternate_scan) nCoeffs= 63; //FIXME
     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -410,8 +417,8 @@ __asm__ volatile(
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
-                "psrlw $4, %%mm0                \n\t"
-                "psrlw $4, %%mm1                \n\t"
+                "psrlw $5, %%mm0                \n\t"
+                "psrlw $5, %%mm1                \n\t"
                 "pxor %%mm2, %%mm0              \n\t"
                 "pxor %%mm3, %%mm1              \n\t"
                 "psubw %%mm2, %%mm0             \n\t"
@@ -442,11 +449,11 @@ __asm__ volatile(
         );
 }
 
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
 
 av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
 {
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
     int cpu_flags = av_get_cpu_flags();
 
     if (INLINE_MMX(cpu_flags)) {
@@ -458,5 +465,5 @@ av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
     }
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
 }
diff --git a/libavcodec/x86/mpegvideodsp.c b/libavcodec/x86/mpegvideodsp.c
index 0e5dd0f153..941a8e2e4c 100644
--- a/libavcodec/x86/mpegvideodsp.c
+++ b/libavcodec/x86/mpegvideodsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,6 +22,7 @@
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mpegvideodsp.h"
+#include "libavcodec/videodsp.h"
 
 #if HAVE_INLINE_ASM
 
@@ -43,20 +44,24 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
     const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
     const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
     const uint64_t shift2  = 2 * shift;
+#define MAX_STRIDE 4096U
+#define MAX_H 8U
+    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
     int x, y;
 
     const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
     const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
     const int dxh = dxy * (h - 1);
     const int dyw = dyx * (w - 1);
+    int need_emu  =  (unsigned) ix >= width  - w ||
+                     (unsigned) iy >= height - h;
 
     if ( // non-constant fullpel offset (3% of blocks)
         ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
          (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) ||
         // uses more than 16 bits of subpel mv (only at huge resolution)
         (dxx | dxy | dyx | dyy) & 15 ||
-        (unsigned) ix >= width  - w ||
-        (unsigned) iy >= height - h) {
+        (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
         // FIXME could still use mmx for some of the rows
         ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                  shift, r, width, height);
@@ -64,6 +69,10 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
     }
 
     src += ix + iy * stride;
+    if (need_emu) {
+        ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
+        src = edge_buf;
+    }
 
     __asm__ volatile (
         "movd         %0, %%mm6         \n\t"
@@ -150,4 +159,3 @@ av_cold void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c)
         c->gmc = gmc_mmx;
 #endif /* HAVE_INLINE_ASM */
 }
-
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index 47349d17ec..67b26178a8 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -2,20 +2,20 @@
  * The simplest mpeg encoder (well, it was the simplest!)
  * Copyright (c) 2000,2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,8 @@
 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
 DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64];
 
+#if HAVE_6REGS
+
 #if HAVE_MMX_INLINE
 #define COMPILE_TEMPLATE_MMXEXT 0
 #define COMPILE_TEMPLATE_SSE2   0
@@ -81,7 +83,10 @@ DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64];
 #include "mpegvideoenc_template.c"
 #endif /* HAVE_SSSE3_INLINE */
 
+#endif /* HAVE_6REGS */
+
 #if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
 static void  denoise_dct_mmx(MpegEncContext *s, int16_t *block){
     const int intra= s->mb_intra;
     int *sum= s->dct_error_sum[intra];
@@ -135,7 +140,9 @@ static void  denoise_dct_mmx(MpegEncContext *s, int16_t *block){
         : "r"(block+64)
     );
 }
+#endif /* HAVE_MMX_INLINE */
 
+#if HAVE_SSE2_INLINE
 static void  denoise_dct_sse2(MpegEncContext *s, int16_t *block){
     const int intra= s->mb_intra;
     int *sum= s->dct_error_sum[intra];
@@ -191,9 +198,10 @@ static void  denoise_dct_sse2(MpegEncContext *s, int16_t *block){
                             "%xmm4", "%xmm5", "%xmm6", "%xmm7")
     );
 }
+#endif /* HAVE_SSE2_INLINE */
 #endif /* HAVE_INLINE_ASM */
 
-av_cold void ff_mpv_encode_init_x86(MpegEncContext *s)
+av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
 {
     const int dct_algo = s->avctx->dct_algo;
     int i;
@@ -205,21 +213,25 @@ av_cold void ff_mpv_encode_init_x86(MpegEncContext *s)
 #if HAVE_MMX_INLINE
         int cpu_flags = av_get_cpu_flags();
         if (INLINE_MMX(cpu_flags)) {
+#if HAVE_6REGS
             s->dct_quantize = dct_quantize_mmx;
+#endif
             s->denoise_dct  = denoise_dct_mmx;
         }
 #endif
-#if HAVE_MMXEXT_INLINE
+#if HAVE_6REGS && HAVE_MMXEXT_INLINE
         if (INLINE_MMXEXT(cpu_flags))
             s->dct_quantize = dct_quantize_mmxext;
 #endif
 #if HAVE_SSE2_INLINE
         if (INLINE_SSE2(cpu_flags)) {
+#if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
+#endif
             s->denoise_dct  = denoise_dct_sse2;
         }
 #endif
-#if HAVE_SSSE3_INLINE
+#if HAVE_6REGS && HAVE_SSSE3_INLINE
         if (INLINE_SSSE3(cpu_flags))
             s->dct_quantize = dct_quantize_ssse3;
 #endif
diff --git a/libavcodec/x86/mpegvideoenc_qns_template.c b/libavcodec/x86/mpegvideoenc_qns_template.c
index 8d8d68762a..882d486205 100644
--- a/libavcodec/x86/mpegvideoenc_qns_template.c
+++ b/libavcodec/x86/mpegvideoenc_qns_template.c
@@ -5,26 +5,26 @@
  * MMX optimization by Michael Niedermayer <michaelni@gmx.at>
  * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
 #include <stdint.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/x86/asm.h"
 
@@ -36,7 +36,7 @@ static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[
 {
     x86_reg i=0;
 
-    assert(FFABS(scale) < MAX_ABS);
+    av_assert2(FFABS(scale) < MAX_ABS);
     scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
 
     SET_RND(mm6);
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index a54c9042ce..da76459cd6 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -108,7 +108,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
     const uint16_t *qmat, *bias;
     LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
 
-    assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
+    av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
 
     //s->fdct (block);
     RENAME_FDCT(ff_fdct)(block); // cannot be anything else ...
@@ -118,10 +118,15 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 
     if (s->mb_intra) {
         int dummy;
-        if (n < 4)
+        if (n < 4){
             q = s->y_dc_scale;
-        else
+            bias = s->q_intra_matrix16[qscale][1];
+            qmat = s->q_intra_matrix16[qscale][0];
+        }else{
             q = s->c_dc_scale;
+            bias = s->q_chroma_intra_matrix16[qscale][1];
+            qmat = s->q_chroma_intra_matrix16[qscale][0];
+        }
         /* note: block[0] is assumed to be positive */
         if (!s->h263_aic) {
         __asm__ volatile (
@@ -136,8 +141,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         block[0]=0; //avoid fake overflow
 //        temp_block[0] = (block[0] + (q >> 1)) / q;
         last_non_zero_p1 = 1;
-        bias = s->q_intra_matrix16[qscale][1];
-        qmat = s->q_intra_matrix16[qscale][0];
     } else {
         last_non_zero_p1 = 0;
         bias = s->q_inter_matrix16[qscale][1];
@@ -173,7 +176,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
             " js 1b                             \n\t"
             PMAX(MM"3", MM"0")
             "movd "MM"3, %%"REG_a"              \n\t"
-            "movzb %%al, %%"REG_a"              \n\t" // last_non_zero_p1
+            "movzbl %%al, %%eax                 \n\t" // last_non_zero_p1
             : "+a" (last_non_zero_p1)
             : "r" (block+64), "r" (qmat), "r" (bias),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -207,7 +210,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
             " js 1b                             \n\t"
             PMAX(MM"3", MM"0")
             "movd "MM"3, %%"REG_a"              \n\t"
-            "movzb %%al, %%"REG_a"              \n\t" // last_non_zero_p1
+            "movzbl %%al, %%eax                 \n\t" // last_non_zero_p1
             : "+a" (last_non_zero_p1)
             : "r" (block+64), "r" (qmat+64), "r" (bias+64),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -221,7 +224,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         "psubusw "MM"1, "MM"4               \n\t"
         "packuswb "MM"4, "MM"4              \n\t"
 #if COMPILE_TEMPLATE_SSE2
-        "packuswb "MM"4, "MM"4              \n\t"
+        "packsswb "MM"4, "MM"4              \n\t"
 #endif
         "movd "MM"4, %0                     \n\t" // *overflow
         : "=g" (*overflow)
@@ -275,6 +278,50 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
         block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
         block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+    }else if(s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2){
+        if(last_non_zero_p1 <= 1) goto end;
+        block[0x04] = temp_block[0x01];
+        block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
+        if(last_non_zero_p1 <= 4) goto end;
+        block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
+        block[0x05] = temp_block[0x03];
+        if(last_non_zero_p1 <= 7) goto end;
+        block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
+        block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
+        if(last_non_zero_p1 <= 11) goto end;
+        block[0x1C] = temp_block[0x19];
+        block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
+        block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
+        if(last_non_zero_p1 <= 16) goto end;
+        block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
+        block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
+        block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
+        block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
+        if(last_non_zero_p1 <= 24) goto end;
+        block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
+        block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
+        block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
+        block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
+        if(last_non_zero_p1 <= 32) goto end;
+        block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
+        block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
+        block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
+        block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
+        if(last_non_zero_p1 <= 40) goto end;
+        block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
+        block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
+        block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
+        block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
+        if(last_non_zero_p1 <= 48) goto end;
+        block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
+        block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
+            block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+        block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
+        if(last_non_zero_p1 <= 56) goto end;
+        block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
+        block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
+        block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
+        block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
     }else{
         if(last_non_zero_p1 <= 1) goto end;
         block[0x01] = temp_block[0x01];
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 9326ee776d..aec73f82dc 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -4,92 +4,151 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION .text
+SECTION_RODATA
 
-INIT_MMX mmx
+cextern pw_1
+
+SECTION .text
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-cglobal pix_sum16, 2, 3
+; %1 = number of loops
+; %2 = number of GPRs used
+%macro PIX_SUM16 3
+cglobal pix_sum16, 2, %2, 6
     movsxdifnidn r1, r1d
-    mov          r2, r1
-    neg          r2
-    shl          r2, 4
-    sub          r0, r2
-    pxor         m7, m7
-    pxor         m6, m6
+    mov          r2, %1
+%if mmsize == 16
+    lea          r3, [r1*3]
+%endif
+%if notcpuflag(xop)
+    pxor         m5, m5
+%endif
+    pxor         m4, m4
 .loop:
-    mova         m0, [r0+r2+0]
-    mova         m1, [r0+r2+0]
-    mova         m2, [r0+r2+8]
-    mova         m3, [r0+r2+8]
-    punpcklbw    m0, m7
-    punpckhbw    m1, m7
-    punpcklbw    m2, m7
-    punpckhbw    m3, m7
+%if cpuflag(xop)
+    vphaddubq    m0, [r0]
+    vphaddubq    m1, [r0+r1]
+    vphaddubq    m2, [r0+r1*2]
+    vphaddubq    m3, [r0+r3]
+%else
+    mova         m0, [r0]
+%if mmsize == 8
+    mova         m1, [r0+8]
+%if cpuflag(mmxext)
+    mova         m2, [r0+r1]
+    mova         m3, [r0+r1+8]
+%endif
+%else ; sse2
+    mova         m1, [r0+r1]
+    mova         m2, [r0+r1*2]
+    mova         m3, [r0+r3]
+%endif
+%if cpuflag(mmxext)
+    psadbw       m0, m5
+    psadbw       m1, m5
+    psadbw       m2, m5
+    psadbw       m3, m5
+%else ; mmx
+    punpckhbw    m2, m0, m5
+    punpcklbw    m0, m5
+    punpckhbw    m3, m1, m5
+    punpcklbw    m1, m5
+%endif ; cpuflag(mmxext)
+%endif ; cpuflag(xop)
     paddw        m1, m0
     paddw        m3, m2
     paddw        m3, m1
-    paddw        m6, m3
-    add          r2, r1
-    js .loop
-    mova         m5, m6
-    psrlq        m6, 32
-    paddw        m6, m5
-    mova         m5, m6
-    psrlq        m6, 16
-    paddw        m6, m5
-    movd        eax, m6
-    and         eax, 0xffff
+    paddw        m4, m3
+%if cpuflag(mmxext)
+    lea          r0, [r0+r1*%3]
+%else
+    add          r0, r1
+%endif
+    dec r2
+    jne .loop
+%if mmsize == 16
+    pshufd       m0, m4, q0032
+    paddd        m4, m0
+%elif notcpuflag(mmxext)
+    HADDW        m4, m5
+%endif
+    movd        eax, m4
     RET
+%endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
+PIX_SUM16 16, 3, 0
+INIT_MMX mmxext
+PIX_SUM16  8, 4, 2
+%endif
+INIT_XMM sse2
+PIX_SUM16  4, 4, 4
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+PIX_SUM16  4, 4, 4
+%endif
+
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_NORM1 2
+cglobal pix_norm1, 2, 3, %1
     movsxdifnidn r1, r1d
-    mov          r2, 16
+    mov          r2, %2
     pxor         m0, m0
-    pxor         m7, m7
+    pxor         m5, m5
 .loop:
     mova         m2, [r0+0]
+%if mmsize == 8
     mova         m3, [r0+8]
-    mova         m1, m2
-    punpckhbw    m1, m0
+%else
+    mova         m3, [r0+r1]
+%endif
+    punpckhbw    m1, m2, m0
     punpcklbw    m2, m0
-    mova         m4, m3
-    punpckhbw    m3, m0
-    punpcklbw    m4, m0
+    punpckhbw    m4, m3, m0
+    punpcklbw    m3, m0
     pmaddwd      m1, m1
     pmaddwd      m2, m2
     pmaddwd      m3, m3
     pmaddwd      m4, m4
     paddd        m2, m1
     paddd        m4, m3
-    paddd        m7, m2
+    paddd        m5, m2
+    paddd        m5, m4
+%if mmsize == 8
     add          r0, r1
-    paddd        m7, m4
+%else
+    lea          r0, [r0+r1*2]
+%endif
     dec r2
     jne .loop
-    mova         m1, m7
-    psrlq        m7, 32
-    paddd        m1, m7
-    movd        eax, m1
+    HADDD        m5, m1
+    movd        eax, m5
     RET
+%endmacro
+
+INIT_MMX mmx
+PIX_NORM1 0, 16
+INIT_XMM sse2
+PIX_NORM1 6, 8
 
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index 71fbf2874f..532836cec9 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -1,29 +1,34 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
+int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
+int ff_pix_sum16_xop(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
 
 #if HAVE_INLINE_ASM
 
@@ -123,7 +128,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
             : "+r" (ptr)
             : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
               "r" (ptr + wrap * height));
-    } else {
+    } else if (w == 16) {
         __asm__ volatile (
             "1:                                 \n\t"
             "movd            (%0), %%mm0        \n\t"
@@ -141,6 +146,25 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
             "add               %1, %0           \n\t"
             "cmp               %3, %0           \n\t"
             "jb                1b               \n\t"
+            : "+r"(ptr)
+            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
+            );
+    } else {
+        av_assert1(w == 4);
+        __asm__ volatile (
+            "1:                             \n\t"
+            "movd            (%0), %%mm0    \n\t"
+            "punpcklbw      %%mm0, %%mm0    \n\t"
+            "punpcklwd      %%mm0, %%mm0    \n\t"
+            "movd           %%mm0, -4(%0)   \n\t"
+            "movd      -4(%0, %2), %%mm1    \n\t"
+            "punpcklbw      %%mm1, %%mm1    \n\t"
+            "punpckhwd      %%mm1, %%mm1    \n\t"
+            "punpckhdq      %%mm1, %%mm1    \n\t"
+            "movd           %%mm1, (%0, %2) \n\t"
+            "add               %1, %0       \n\t"
+            "cmp               %3, %0       \n\t"
+            "jb                1b           \n\t"
             : "+r" (ptr)
             : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
               "r" (ptr + wrap * height));
@@ -195,11 +219,26 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
         c->pix_sum   = ff_pix_sum16_mmx;
         c->pix_norm1 = ff_pix_norm1_mmx;
     }
 
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_mmxext;
+    }
+#endif
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_sse2;
+        c->pix_norm1   = ff_pix_norm1_sse2;
+    }
+
+    if (EXTERNAL_XOP(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_xop;
+    }
+
 #if HAVE_INLINE_ASM
 
     if (INLINE_MMX(cpu_flags)) {
diff --git a/libavcodec/x86/pixblockdsp.asm b/libavcodec/x86/pixblockdsp.asm
index c8fd1b24a1..7c5377b2bb 100644
--- a/libavcodec/x86/pixblockdsp.asm
+++ b/libavcodec/x86/pixblockdsp.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
@@ -26,9 +26,8 @@
 SECTION .text
 
 INIT_MMX mmx
-; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
+; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size)
 cglobal get_pixels, 3,4
-    movsxdifnidn r2, r2d
     add          r0, 128
     mov          r3, -128
     pxor         m7, m7
@@ -51,8 +50,7 @@ cglobal get_pixels, 3,4
     REP_RET
 
 INIT_XMM sse2
-cglobal get_pixels, 3, 4
-    movsxdifnidn r2, r2d
+cglobal get_pixels, 3, 4, 5
     lea          r3, [r2*3]
     pxor         m4, m4
     movh         m0, [r1]
@@ -108,3 +106,28 @@ cglobal diff_pixels, 4,5
     add          r4, 16
     jne .loop
     REP_RET
+
+INIT_XMM sse2
+cglobal diff_pixels, 4, 5, 5
+    movsxdifnidn r3, r3d
+    pxor         m4, m4
+    add          r0,  128
+    mov          r4, -128
+.loop:
+    movh         m0, [r1]
+    movh         m2, [r2]
+    movh         m1, [r1+r3]
+    movh         m3, [r2+r3]
+    punpcklbw    m0, m4
+    punpcklbw    m1, m4
+    punpcklbw    m2, m4
+    punpcklbw    m3, m4
+    psubw        m0, m2
+    psubw        m1, m3
+    mova [r0+r4+0 ], m0
+    mova [r0+r4+16], m1
+    lea          r1, [r1+r3*2]
+    lea          r2, [r2+r3*2]
+    add          r4, 32
+    jne .loop
+    RET
diff --git a/libavcodec/x86/pixblockdsp_init.c b/libavcodec/x86/pixblockdsp_init.c
index 9582e0b5c2..4d06a44c6d 100644
--- a/libavcodec/x86/pixblockdsp_init.c
+++ b/libavcodec/x86/pixblockdsp_init.c
@@ -1,20 +1,20 @@
 /*
  * SIMD-optimized pixel operations
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,10 +23,12 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/pixblockdsp.h"
 
-void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
-void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
+void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size);
+void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size);
 void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                         int stride);
+void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
+                         int stride);
 
 av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
                                      AVCodecContext *avctx,
@@ -43,5 +45,6 @@ av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
     if (EXTERNAL_SSE2(cpu_flags)) {
         if (!high_bit_depth)
             c->get_pixels = ff_get_pixels_sse2;
+        c->diff_pixels = ff_diff_pixels_sse2;
     }
 }
diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm
index 722caf0fd1..50e4255dec 100644
--- a/libavcodec/x86/pngdsp.asm
+++ b/libavcodec/x86/pngdsp.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
 ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -42,12 +42,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
     and                waq, ~(mmsize*2-1)
     jmp .end_v
 .loop_v:
-    mova                m0, [src1q+iq]
-    mova                m1, [src1q+iq+mmsize]
-    paddb               m0, [src2q+iq]
-    paddb               m1, [src2q+iq+mmsize]
-    mova  [dstq+iq       ], m0
-    mova  [dstq+iq+mmsize], m1
+    movu                m0, [src2q+iq]
+    movu                m1, [src2q+iq+mmsize]
+    paddb               m0, [src1q+iq]
+    paddb               m1, [src1q+iq+mmsize]
+    movu  [dstq+iq       ], m0
+    movu  [dstq+iq+mmsize], m1
     add                 iq, mmsize*2
 .end_v:
     cmp                 iq, waq
@@ -157,7 +157,7 @@ cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
     movh            [dstq], m3
     add               dstq, bppq
     cmp               dstq, endq
-    jle .loop
+    jl .loop
 
     mov               dstq, [rsp]
     dec              cntrq
diff --git a/libavcodec/x86/pngdsp_init.c b/libavcodec/x86/pngdsp_init.c
index 34a3da36d7..7dca62c675 100644
--- a/libavcodec/x86/pngdsp_init.c
+++ b/libavcodec/x86/pngdsp_init.c
@@ -2,20 +2,20 @@
  * x86 PNG optimizations.
  * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index a0e97b3951..632ece6ebf 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -1,23 +1,24 @@
 ;******************************************************************************
 ;* x86-SIMD-optimized IDCT for prores
-;* this is identical to "simple" IDCT except for the clip range
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
 ;*
 ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -47,10 +48,10 @@ w1_plus_w5: times 4 dw W1sh2, +W5sh2
 w5_min_w1:  times 4 dw W5sh2, -W1sh2
 w5_plus_w7: times 4 dw W5sh2, +W7sh2
 w7_min_w5:  times 4 dw W7sh2, -W5sh2
-row_round:  times 8 dw (1<<14)
+pw_88:      times 8 dw 0x2008
 
+cextern pw_1
 cextern pw_4
-cextern pw_8
 cextern pw_512
 cextern pw_1019
 
@@ -91,14 +92,12 @@ section .text align=16
     ; a2 -= W6 * row[2];
     ; a3 -= W2 * row[2];
 %ifidn %1, col
-    paddw       m10,[pw_8]
+    paddw       m10,[pw_88]
 %endif
-    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
 %ifidn %1, row
-    psubw       m10,[row_round]
+    paddw       m10,[pw_1]
 %endif
-    SIGNEXTEND  m8,  m9,  m14      ; { row[2] }[0-3] / [4-7]
-    SIGNEXTEND  m10, m11, m14      ; { row[0] }[0-3] / [4-7]
+    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
     pmaddwd     m2,  m0, [w4_plus_w6]
     pmaddwd     m3,  m1, [w4_plus_w6]
     pmaddwd     m4,  m0, [w4_min_w6]
@@ -107,75 +106,33 @@ section .text align=16
     pmaddwd     m7,  m1, [w4_min_w2]
     pmaddwd     m0, [w4_plus_w2]
     pmaddwd     m1, [w4_plus_w2]
-    pslld       m2,  2
-    pslld       m3,  2
-    pslld       m4,  2
-    pslld       m5,  2
-    pslld       m6,  2
-    pslld       m7,  2
-    pslld       m0,  2
-    pslld       m1,  2
 
     ; a0: -1*row[0]-1*row[2]
     ; a1: -1*row[0]
     ; a2: -1*row[0]
     ; a3: -1*row[0]+1*row[2]
-    psubd       m2,  m10           ; a1[0-3]
-    psubd       m3,  m11           ; a1[4-7]
-    psubd       m4,  m10           ; a2[0-3]
-    psubd       m5,  m11           ; a2[4-7]
-    psubd       m0,  m10
-    psubd       m1,  m11
-    psubd       m6,  m10
-    psubd       m7,  m11
-    psubd       m0,  m8            ; a0[0-3]
-    psubd       m1,  m9            ; a0[4-7]
-    paddd       m6,  m8            ; a3[0-3]
-    paddd       m7,  m9            ; a3[4-7]
 
     ; a0 +=   W4*row[4] + W6*row[6]; i.e. -1*row[4]
     ; a1 -=   W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
     ; a2 -=   W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
     ; a3 +=   W4*row[4] - W6*row[6]; i.e. -1*row[4]
     SBUTTERFLY3 wd,  8,  9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
-    SIGNEXTEND  m13, m14, m10      ; { row[4] }[0-3] / [4-7]
     pmaddwd     m10, m8, [w4_plus_w6]
     pmaddwd     m11, m9, [w4_plus_w6]
-    pslld       m10, 2
-    pslld       m11, 2
-    psubd       m10,  m13
-    psubd       m11,  m14
     paddd       m0,  m10            ; a0[0-3]
     paddd       m1,  m11            ; a0[4-7]
     pmaddwd     m10, m8, [w4_min_w6]
     pmaddwd     m11, m9, [w4_min_w6]
-    pslld       m10, 2
-    pslld       m11, 2
-    psubd       m10, m13
-    psubd       m11, m14
     paddd       m6,  m10           ; a3[0-3]
     paddd       m7,  m11           ; a3[4-7]
     pmaddwd     m10, m8, [w4_min_w2]
     pmaddwd     m11, m9, [w4_min_w2]
     pmaddwd     m8, [w4_plus_w2]
     pmaddwd     m9, [w4_plus_w2]
-    pslld       m10, 2
-    pslld       m11, 2
-    pslld       m8,  2
-    pslld       m9,  2
-    psubd       m10, m13
-    psubd       m11, m14
-    psubd       m8,  m13
-    psubd       m9,  m14
     psubd       m4,  m10           ; a2[0-3] intermediate
     psubd       m5,  m11           ; a2[4-7] intermediate
     psubd       m2,  m8            ; a1[0-3] intermediate
     psubd       m3,  m9            ; a1[4-7] intermediate
-    SIGNEXTEND  m12, m13, m10      ; { row[6] }[0-3] / [4-7]
-    psubd       m4,  m12           ; a2[0-3]
-    psubd       m5,  m13           ; a2[4-7]
-    paddd       m2,  m12           ; a1[0-3]
-    paddd       m3,  m13           ; a1[4-7]
 
     ; load/store
     mova   [r2+  0], m0
@@ -206,8 +163,6 @@ section .text align=16
     ; b3 = MUL(W7, row[1]);
     ; MAC(b3, -W5, row[3]);
     SBUTTERFLY3 wd,  0,  1, 10, 8  ; { row[1], row[3] }[0-3]/[4-7]
-    SIGNEXTEND  m10, m11, m12      ; { row[1] }[0-3] / [4-7]
-    SIGNEXTEND  m8,  m9,  m12      ; { row[3] }[0-3] / [4-7]
     pmaddwd     m2,  m0, [w3_min_w7]
     pmaddwd     m3,  m1, [w3_min_w7]
     pmaddwd     m4,  m0, [w5_min_w1]
@@ -216,35 +171,11 @@ section .text align=16
     pmaddwd     m7,  m1, [w7_min_w5]
     pmaddwd     m0, [w1_plus_w3]
     pmaddwd     m1, [w1_plus_w3]
-    pslld       m2,  2
-    pslld       m3,  2
-    pslld       m4,  2
-    pslld       m5,  2
-    pslld       m6,  2
-    pslld       m7,  2
-    pslld       m0,  2
-    pslld       m1,  2
 
     ; b0: +1*row[1]+2*row[3]
     ; b1: +2*row[1]-1*row[3]
     ; b2: -1*row[1]-1*row[3]
     ; b3: +1*row[1]+1*row[3]
-    psubd       m2,  m8
-    psubd       m3,  m9
-    paddd       m0,  m8
-    paddd       m1,  m9
-    paddd       m8,  m10           ; { row[1] + row[3] }[0-3]
-    paddd       m9,  m11           ; { row[1] + row[3] }[4-7]
-    paddd       m10, m10
-    paddd       m11, m11
-    paddd       m0,  m8            ; b0[0-3]
-    paddd       m1,  m9            ; b0[4-7]
-    paddd       m2,  m10           ; b1[0-3]
-    paddd       m3,  m11           ; b2[4-7]
-    psubd       m4,  m8            ; b2[0-3]
-    psubd       m5,  m9            ; b2[4-7]
-    paddd       m6,  m8            ; b3[0-3]
-    paddd       m7,  m9            ; b3[4-7]
 
     ; MAC(b0,  W5, row[5]);
     ; MAC(b0,  W7, row[7]);
@@ -255,38 +186,16 @@ section .text align=16
     ; MAC(b3,  W3, row[5]);
     ; MAC(b3, -W1, row[7]);
     SBUTTERFLY3 wd,  8,  9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
-    SIGNEXTEND  m13, m12, m11      ; { row[5] }[0-3] / [4-7]
-    SIGNEXTEND  m14, m11, m10      ; { row[7] }[0-3] / [4-7]
 
     ; b0: -1*row[5]+1*row[7]
     ; b1: -1*row[5]+1*row[7]
     ; b2: +1*row[5]+2*row[7]
     ; b3: +2*row[5]-1*row[7]
-    paddd       m4,  m13
-    paddd       m5,  m12
-    paddd       m6,  m13
-    paddd       m7,  m12
-    psubd       m13, m14           ; { row[5] - row[7] }[0-3]
-    psubd       m12, m11           ; { row[5] - row[7] }[4-7]
-    paddd       m14, m14
-    paddd       m11, m11
-    psubd       m0,  m13
-    psubd       m1,  m12
-    psubd       m2,  m13
-    psubd       m3,  m12
-    paddd       m4,  m14
-    paddd       m5,  m11
-    paddd       m6,  m13
-    paddd       m7,  m12
 
     pmaddwd     m10, m8, [w1_plus_w5]
     pmaddwd     m11, m9, [w1_plus_w5]
     pmaddwd     m12, m8, [w5_plus_w7]
     pmaddwd     m13, m9, [w5_plus_w7]
-    pslld       m10, 2
-    pslld       m11, 2
-    pslld       m12,  2
-    pslld       m13,  2
     psubd       m2,  m10           ; b1[0-3]
     psubd       m3,  m11           ; b1[4-7]
     paddd       m0,  m12            ; b0[0-3]
@@ -295,10 +204,6 @@ section .text align=16
     pmaddwd     m13, m9, [w7_plus_w3]
     pmaddwd     m8, [w3_min_w1]
     pmaddwd     m9, [w3_min_w1]
-    pslld       m12, 2
-    pslld       m13, 2
-    pslld       m8,  2
-    pslld       m9,  2
     paddd       m4,  m12           ; b2[0-3]
     paddd       m5,  m13           ; b2[4-7]
     paddd       m6,  m8            ; b3[0-3]
@@ -345,7 +250,7 @@ cglobal prores_idct_put_10, 4, 4, %1
     pmullw      m13,[r3+64]
     pmullw      m12,[r3+96]
 
-    IDCT_1D     row, 17
+    IDCT_1D     row, 15
 
     ; transpose for second part of IDCT
     TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
@@ -360,20 +265,11 @@ cglobal prores_idct_put_10, 4, 4, %1
 
     ; for (i = 0; i < 8; i++)
     ;     idctSparseColAdd(dest + i, line_size, block + i);
-    IDCT_1D     col, 20
+    IDCT_1D     col, 18
 
     ; clip/store
-    mova        m6, [pw_512]
     mova        m3, [pw_4]
     mova        m5, [pw_1019]
-    paddw       m8,  m6
-    paddw       m0,  m6
-    paddw       m1,  m6
-    paddw       m2,  m6
-    paddw       m4,  m6
-    paddw       m11, m6
-    paddw       m9,  m6
-    paddw       m10, m6
     pmaxsw      m8,  m3
     pmaxsw      m0,  m3
     pmaxsw      m1,  m3
@@ -404,25 +300,11 @@ cglobal prores_idct_put_10, 4, 4, %1
     RET
 %endmacro
 
-%macro SIGNEXTEND 2-3
-%if cpuflag(sse4) ; dstlow, dsthigh
-    movhlps     %2,  %1
-    pmovsxwd    %1,  %1
-    pmovsxwd    %2,  %2
-%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
-    pxor        %3,  %3
-    pcmpgtw     %3,  %1
-    mova        %2,  %1
-    punpcklwd   %1,  %3
-    punpckhwd   %2,  %3
-%endif
-%endmacro
-
 INIT_XMM sse2
 idct_put_fn 16
-INIT_XMM sse4
-idct_put_fn 16
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 idct_put_fn 16
+%endif
 
 %endif
diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c
index e82dac0448..ead11ae9c1 100644
--- a/libavcodec/x86/proresdsp_init.c
+++ b/libavcodec/x86/proresdsp_init.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,12 +27,10 @@
 
 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
                                 int16_t *block, const int16_t *qmat);
-void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize,
-                                int16_t *block, const int16_t *qmat);
 void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize,
                                 int16_t *block, const int16_t *qmat);
 
-av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp)
+av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx)
 {
 #if ARCH_X86_64
     int cpu_flags = av_get_cpu_flags();
@@ -42,11 +40,6 @@ av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp)
         dsp->idct_put = ff_prores_idct_put_10_sse2;
     }
 
-    if (EXTERNAL_SSE4(cpu_flags)) {
-        dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
-        dsp->idct_put = ff_prores_idct_put_10_sse4;
-    }
-
     if (EXTERNAL_AVX(cpu_flags)) {
         dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
         dsp->idct_put = ff_prores_idct_put_10_avx;
diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm
index 27a1c63b8a..4e72d5084f 100644
--- a/libavcodec/x86/qpel.asm
+++ b/libavcodec/x86/qpel.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2003-2013 Michael Niedermayer
 ;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm
index ef5f1d8826..282faed14f 100644
--- a/libavcodec/x86/qpeldsp.asm
+++ b/libavcodec/x86/qpeldsp.asm
@@ -1,22 +1,23 @@
 ;******************************************************************************
-;* quarterpel DSP functions
-;*
+;* mpeg4 qpel
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index cdefe50a3c..3268d907ab 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -1,20 +1,22 @@
 /*
  * quarterpel DSP functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,13 +79,13 @@ void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride);
-#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
-#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
+#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx
 
 #if HAVE_YASM
 
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext  ff_put_pixels8_mmx
 
 #define QPEL_OP(OPNAME, RND, MMX)                                       \
 static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst,                  \
diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c
index a9fb13234b..ddca4eb590 100644
--- a/libavcodec/x86/rnd_template.c
+++ b/libavcodec/x86/rnd_template.c
@@ -7,20 +7,20 @@
  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  * and improved by Zdenek Kabelac <kabi@users.sf.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
 #include "inline_asm.h"
 
 // put_pixels
-STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
+av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
 {
     MOVQ_ZERO(mm7);
@@ -99,7 +99,7 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
 
 // avg_pixels
 // this routine is 'slightly' suboptimal but mostly unused
-STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
+av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
 {
     MOVQ_ZERO(mm7);
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 4d9c35b600..7732d65b2a 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -2,20 +2,20 @@
 ;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 586e4e9a6d..99c56f9d09 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -2,20 +2,20 @@
  * RV30/40 MMX/SSE2 optimizations
  * Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 0a242b54e3..fdd81a0a37 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index e006c76584..12b85c9450 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -2,20 +2,20 @@
  * RV40 decoder motion compensation functions x86-optimised
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,13 @@
 #include "libavutil/x86/cpu.h"
 #include "hpeldsp.h"
 
+#define DEFINE_FN(op, size, insn) \
+static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \
+                                               ptrdiff_t stride) \
+{ \
+    ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \
+}
+
 #if HAVE_YASM
 void ff_put_rv40_chroma_mc8_mmx  (uint8_t *dst, uint8_t *src,
                                   int stride, int h, int x, int y);
@@ -75,7 +82,7 @@ static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst,  \
 {                                                                       \
     int i;                                                              \
     if (PH && PV) {                                                     \
-        DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)];           \
+        LOCAL_ALIGNED(16, uint8_t, tmp, [SIZE * (SIZE + 5)]);           \
         uint8_t *tmpptr = tmp + SIZE * 2;                               \
         src -= stride * 2;                                              \
                                                                         \
@@ -127,8 +134,8 @@ QPEL_FUNCS_DECL(OP, 3, 2, OPT)
 /** @} */
 
 #define LOOPSIZE  8
-#define HCOFF(x)  (32 * (x - 1))
-#define VCOFF(x)  (32 * (x - 1))
+#define HCOFF(x)  (32 * ((x) - 1))
+#define VCOFF(x)  (32 * ((x) - 1))
 QPEL_MC_DECL(put_, _ssse3)
 QPEL_MC_DECL(avg_, _ssse3)
 
@@ -136,8 +143,8 @@ QPEL_MC_DECL(avg_, _ssse3)
 #undef HCOFF
 #undef VCOFF
 #define LOOPSIZE  8
-#define HCOFF(x)  (64 * (x - 1))
-#define VCOFF(x)  (64 * (x - 1))
+#define HCOFF(x)  (64 * ((x) - 1))
+#define VCOFF(x)  (64 * ((x) - 1))
 QPEL_MC_DECL(put_, _sse2)
 QPEL_MC_DECL(avg_, _sse2)
 
@@ -146,8 +153,8 @@ QPEL_MC_DECL(avg_, _sse2)
 #undef HCOFF
 #undef VCOFF
 #define LOOPSIZE  4
-#define HCOFF(x)  (64 * (x - 1))
-#define VCOFF(x)  (64 * (x - 1))
+#define HCOFF(x)  (64 * ((x) - 1))
+#define VCOFF(x)  (64 * ((x) - 1))
 
 QPEL_MC_DECL(put_, _mmx)
 
@@ -186,34 +193,28 @@ QPEL_FUNCS_SET (OP, 3, 1, OPT) \
 QPEL_FUNCS_SET (OP, 3, 2, OPT)
 /** @} */
 
+DEFINE_FN(put, 8, ssse3)
+
+DEFINE_FN(put, 16, sse2)
+DEFINE_FN(put, 16, ssse3)
+
+DEFINE_FN(avg, 8, mmxext)
+DEFINE_FN(avg, 8, ssse3)
+
+DEFINE_FN(avg, 16, sse2)
+DEFINE_FN(avg, 16, ssse3)
 #endif /* HAVE_YASM */
 
 #if HAVE_MMX_INLINE
-static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                    ptrdiff_t stride)
-{
-    ff_put_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void put_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                     ptrdiff_t stride)
-{
-    ff_put_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-static void avg_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                    ptrdiff_t stride)
-{
-    ff_avg_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void avg_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                     ptrdiff_t stride)
-{
-    ff_avg_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-#endif /* HAVE_MMX_INLINE */
+DEFINE_FN(put, 8, mmx)
+DEFINE_FN(avg, 8, mmx)
+DEFINE_FN(put, 16, mmx)
+DEFINE_FN(avg, 16, mmx)
+#endif
 
 av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 {
-    int cpu_flags = av_get_cpu_flags();
+    av_unused int cpu_flags = av_get_cpu_flags();
 
 #if HAVE_MMX_INLINE
     if (INLINE_MMX(cpu_flags)) {
@@ -240,6 +241,7 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 #endif
     }
     if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->avg_pixels_tab[1][15]        = avg_rv40_qpel8_mc33_mmxext;
         c->avg_chroma_pixels_tab[0]     = ff_avg_rv40_chroma_mc8_mmxext;
         c->avg_chroma_pixels_tab[1]     = ff_avg_rv40_chroma_mc4_mmxext;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
@@ -251,6 +253,8 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 #endif
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_sse2;
+        c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_sse2;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
         c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
         c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
@@ -259,6 +263,10 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
         QPEL_MC_SET(avg_, _sse2)
     }
     if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_ssse3;
+        c->put_pixels_tab[1][15]        = put_rv40_qpel8_mc33_ssse3;
+        c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_ssse3;
+        c->avg_pixels_tab[1][15]        = avg_rv40_qpel8_mc33_ssse3;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
         c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
         c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index b449de5f9a..b6fa53517c 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -2,20 +2,20 @@
 ;* AAC Spectral Band Replication decoding functions
 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -25,7 +25,14 @@ SECTION_RODATA
 ; mask equivalent for multiply by -1.0 1.0
 ps_mask         times 2 dd 1<<31, 0
 ps_mask2        times 2 dd 0, 1<<31
-ps_neg          times 4 dd 1<<31
+ps_mask3        dd  0, 0, 0, 1<<31
+ps_noise0       times 2 dd  1.0,  0.0,
+ps_noise2       times 2 dd -1.0,  0.0
+ps_noise13      dd  0.0,  1.0, 0.0, -1.0
+                dd  0.0, -1.0, 0.0,  1.0
+                dd  0.0,  1.0, 0.0, -1.0
+cextern         sbr_noise_table
+cextern         ps_neg
 
 SECTION .text
 
@@ -136,7 +143,6 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
     mulps      m2, bw             ; (a1[0] a1[1])*bw*bw = (a0 a1)
     mova       m3, m1
     mova       m4, m2
-    mova       m7, [ps_mask]
 
     ; Set pointers
 %if ARCH_X86_64 == 0 || WIN64
@@ -156,30 +162,28 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
     shl      start, 3            ; offset from num loops
 
     mova        m0, [X_lowq + start]
-    movlhps     m1, m1           ; (a2 a3 a2 a3)
-    movlhps     m2, m2           ; (a0 a1 a0 a1)
-    shufps      m3, m3, q0101    ; (a3 a2 a3 a2)
-    shufps      m4, m4, q0101    ; (a1 a0 a1 a0)
-    xorps       m3, m7           ; (-a3 a2 -a3 a2)
-    xorps       m4, m7           ; (-a1 a0 -a1 a0)
+    shufps      m3, m3, q1111
+    shufps      m4, m4, q1111
+    xorps       m3, [ps_mask]
+    shufps      m1, m1, q0000
+    shufps      m2, m2, q0000
+    xorps       m4, [ps_mask]
 .loop2:
-    mova        m5, m0
+    movu        m7, [X_lowq + start + 8]        ; BbCc
     mova        m6, m0
-    shufps      m0, m0, q2200    ; {Xl[-2][0],",Xl[-1][0],"}
-    shufps      m5, m5, q3311    ; {Xl[-2][1],",Xl[-1][1],"}
-    mulps       m0, m2
-    mulps       m5, m4
-    mova        m7, m6
-    addps       m5, m0
-    mova        m0, [X_lowq + start + 2*2*4]
-    shufps      m6, m0, q0022    ; {Xl[-1][0],",Xl[0][0],"}
-    shufps      m7, m0, q1133    ; {Xl[-1][1],",Xl[1][1],"}
-    mulps       m6, m1
+    mova        m5, m7
+    shufps      m0, m0, q2301                   ; aAbB
+    shufps      m7, m7, q2301                   ; bBcC
+    mulps       m0, m4
     mulps       m7, m3
-    addps       m5, m6
+    mulps       m6, m2
+    mulps       m5, m1
+    addps       m7, m0
+    mova        m0, [X_lowq + start +16]        ; CcDd
     addps       m7, m0
-    addps       m5, m7
-    mova  [X_highq + start], m5
+    addps       m6, m5
+    addps       m7, m6
+    mova  [X_highq + start], m7
     add     start, 16
     jnz         .loop2
     RET
@@ -246,33 +250,47 @@ cglobal sbr_neg_odd_64, 1,2,4,z
     jne      .loop
     REP_RET
 
-INIT_XMM sse2
 ; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
+%macro SBR_QMF_DEINT_BFLY  0
 cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
     mov               cq, 64*4-2*mmsize
     lea            vrevq, [vq + 64*4]
 .loop:
     mova              m0, [src0q+cq]
     mova              m1, [src1q]
-    mova              m2, [src0q+cq+mmsize]
-    mova              m3, [src1q+mmsize]
-    pshufd            m4, m0, q0123
-    pshufd            m5, m1, q0123
-    pshufd            m6, m2, q0123
-    pshufd            m7, m3, q0123
-    addps             m3, m4
+    mova              m4, [src0q+cq+mmsize]
+    mova              m5, [src1q+mmsize]
+%if cpuflag(sse2)
+    pshufd            m2, m0, q0123
+    pshufd            m3, m1, q0123
+    pshufd            m6, m4, q0123
+    pshufd            m7, m5, q0123
+%else
+    shufps            m2, m0, m0, q0123
+    shufps            m3, m1, m1, q0123
+    shufps            m6, m4, m4, q0123
+    shufps            m7, m5, m5, q0123
+%endif
+    addps             m5, m2
     subps             m0, m7
     addps             m1, m6
-    subps             m2, m5
+    subps             m4, m3
     mova         [vrevq], m1
-    mova  [vrevq+mmsize], m3
+    mova  [vrevq+mmsize], m5
     mova         [vq+cq], m0
-    mova  [vq+cq+mmsize], m2
+    mova  [vq+cq+mmsize], m4
     add            src1q, 2*mmsize
     add            vrevq, 2*mmsize
     sub               cq, 2*mmsize
     jge            .loop
     REP_RET
+%endmacro
+
+INIT_XMM sse
+SBR_QMF_DEINT_BFLY
+
+INIT_XMM sse2
+SBR_QMF_DEINT_BFLY
 
 INIT_XMM sse2
 cglobal sbr_qmf_pre_shuffle, 1,4,6,z
@@ -303,3 +321,244 @@ cglobal sbr_qmf_pre_shuffle, 1,4,6,z
     movq       m2, [zq]
     movq    [r2q], m2
     REP_RET
+
+%ifdef PIC
+%define NREGS 1
+%if UNIX64
+%define NOISE_TABLE r6q ; r5q is m_max
+%else
+%define NOISE_TABLE r5q
+%endif
+%else
+%define NREGS 0
+%define NOISE_TABLE sbr_noise_table
+%endif
+
+%macro LOAD_NST  1
+%ifdef PIC
+    lea  NOISE_TABLE, [%1]
+    mova          m0, [kxq + NOISE_TABLE]
+%else
+    mova          m0, [kxq + %1]
+%endif
+%endmacro
+
+INIT_XMM sse2
+; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    mova       m0, [ps_noise0]
+    jmp apply_noise_main
+
+; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    and       kxq, 1
+    shl       kxq, 4
+    LOAD_NST  ps_noise13
+    jmp apply_noise_main
+
+; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    mova       m0, [ps_noise2]
+    jmp apply_noise_main
+
+; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    and       kxq, 1
+    shl       kxq, 4
+    LOAD_NST  ps_noise13+16
+
+apply_noise_main:
+%if ARCH_X86_64 == 0 || WIN64
+    mov       kxd, m_maxm
+%define count kxq
+%else
+%define count m_maxq
+%endif
+    movsxdifnidn    noiseq, noised
+    dec    noiseq
+    shl    count, 2
+%ifdef PIC
+    lea NOISE_TABLE, [sbr_noise_table]
+%endif
+    lea        Yq, [Yq + 2*count]
+    add      s_mq, count
+    add   q_filtq, count
+    shl    noiseq, 3
+    pxor       m5, m5
+    neg    count
+.loop:
+    mova       m1, [q_filtq + count]
+    movu       m3, [noiseq + NOISE_TABLE + 1*mmsize]
+    movu       m4, [noiseq + NOISE_TABLE + 2*mmsize]
+    add    noiseq, 2*mmsize
+    and    noiseq, 0x1ff<<3
+    punpckhdq  m2, m1, m1
+    punpckldq  m1, m1
+    mulps      m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
+    mulps      m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
+    mova       m3, [s_mq + count]
+    ; TODO: replace by a vpermd in AVX2
+    punpckhdq  m4, m3, m3
+    punpckldq  m3, m3
+    pcmpeqd    m6, m3, m5 ; m6 == 0
+    pcmpeqd    m7, m4, m5 ; m7 == 0
+    mulps      m3, m0 ; s_m[m] * phi_sign
+    mulps      m4, m0 ; s_m[m] * phi_sign
+    pand       m1, m6
+    pand       m2, m7
+    movu       m6, [Yq + 2*count]
+    movu       m7, [Yq + 2*count + mmsize]
+    addps      m3, m1
+    addps      m4, m2
+    addps      m6, m3
+    addps      m7, m4
+    movu    [Yq + 2*count], m6
+    movu    [Yq + 2*count + mmsize], m7
+    add    count, mmsize
+    jl      .loop
+    RET
+
+INIT_XMM sse
+cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
+%define COUNT  32*4
+%define OFFSET 32*4
+    mov        cq, -COUNT
+    lea     vrevq, [vq + OFFSET + COUNT]
+    add        vq, OFFSET-mmsize
+    add      srcq, 2*COUNT
+    mova       m3, [ps_neg]
+.loop:
+    mova       m0, [srcq + 2*cq + 0*mmsize]
+    mova       m1, [srcq + 2*cq + 1*mmsize]
+    shufps     m2, m0, m1, q2020
+    shufps     m1, m0, q1313
+    xorps      m2, m3
+    mova     [vq], m1
+    mova  [vrevq + cq], m2
+    sub        vq, mmsize
+    add        cq, mmsize
+    jl      .loop
+    REP_RET
+
+%macro SBR_AUTOCORRELATE 0
+cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
+    mov   cntq, 37*8
+    add     xq, cntq
+    neg   cntq
+
+%if cpuflag(sse3)
+%define   MOVH  movsd
+    movddup m5, [xq+cntq]
+%else
+%define   MOVH  movlps
+    movlps  m5, [xq+cntq]
+    movlhps m5, m5
+%endif
+    MOVH    m7, [xq+cntq+8 ]
+    MOVH    m1, [xq+cntq+16]
+    shufps  m7, m7, q0110
+    shufps  m1, m1, q0110
+    mulps   m3, m5, m7   ;              x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
+    mulps   m4, m5, m5   ;              x[0][0] * x[0][0], x[0][1] * x[0][1];
+    mulps   m5, m1       ; real_sum2  = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
+    movaps  [rsp   ], m3
+    movaps  [rsp+16], m4
+    add   cntq, 8
+
+    MOVH    m2, [xq+cntq+16]
+    movlhps m7, m7
+    shufps  m2, m2, q0110
+    mulps   m6, m7, m1   ; real_sum1  = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
+    mulps   m4, m7, m2
+    mulps   m7, m7       ; real_sum0  = x[1][0] * x[1][0], x[1][1] * x[1][1];
+    addps   m5, m4       ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]
+
+align 16
+.loop:
+    add   cntq, 8
+    MOVH    m0, [xq+cntq+16]
+    movlhps m1, m1
+    shufps  m0, m0, q0110
+    mulps   m3, m1, m2
+    mulps   m4, m1, m0
+    mulps   m1, m1
+    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+    addps   m7, m1       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
+    add   cntq, 8
+    MOVH    m1, [xq+cntq+16]
+    movlhps m2, m2
+    shufps  m1, m1, q0110
+    mulps   m3, m2, m0
+    mulps   m4, m2, m1
+    mulps   m2, m2
+    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+    addps   m7, m2       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
+    add   cntq, 8
+    MOVH    m2, [xq+cntq+16]
+    movlhps m0, m0
+    shufps  m2, m2, q0110
+    mulps   m3, m0, m1
+    mulps   m4, m0, m2
+    mulps   m0, m0
+    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+    addps   m7, m0       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
+    jl .loop
+
+    movlhps m1, m1
+    mulps   m2, m1
+    mulps   m1, m1
+    addps   m2, m6       ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
+    addps   m1, m7       ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
+    addps   m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
+    addps   m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];
+
+    xorps   m2, [ps_mask3]
+    xorps   m5, [ps_mask3]
+    xorps   m6, [ps_mask3]
+%if cpuflag(sse3)
+    movshdup m0, m1
+    haddps  m2, m5
+    haddps  m7, m6
+    addss   m1, m0
+%else
+    movaps  m3, m2
+    movaps  m0, m5
+    movaps  m4, m6
+    shufps  m3, m3, q0301
+    shufps  m0, m0, q0301
+    shufps  m4, m4, q0301
+    addps   m2, m3
+    addps   m5, m0
+    addps   m6, m4
+
+    movss   m0, m7
+    movss   m3, m1
+    shufps  m7, m7, q0001
+    shufps  m1, m1, q0001
+    addss   m7, m0
+    addss   m1, m3
+    shufps  m2, m5, q2020
+    shufps  m7, m6, q2020
+%endif
+    movaps  [phiq     ], m2
+    movhps  [phiq+0x18], m7
+    movss   [phiq+0x28], m7
+    movss   [phiq+0x10], m1
+    RET
+%endmacro
+
+INIT_XMM sse
+SBR_AUTOCORRELATE
+INIT_XMM sse3
+SBR_AUTOCORRELATE
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index 9600852163..6911a1a515 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -2,20 +2,20 @@
  * AAC Spectral Band Replication decoding functions
  * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,9 +34,28 @@ void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
                        float bw, int start, int end);
 void ff_sbr_neg_odd_64_sse(float *z);
 void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
+void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
 void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
 void ff_sbr_qmf_pre_shuffle_sse2(float *z);
 
+void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+
+void ff_sbr_qmf_deint_neg_sse(float *v, const float *src);
+
+void ff_sbr_autocorrelate_sse (const float x[40][2], float phi[3][2][2]);
+void ff_sbr_autocorrelate_sse3(const float x[40][2], float phi[3][2][2]);
+
 av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -48,10 +67,21 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
         s->hf_g_filt  = ff_sbr_hf_g_filt_sse;
         s->hf_gen     = ff_sbr_hf_gen_sse;
         s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
+        s->qmf_deint_bfly   = ff_sbr_qmf_deint_bfly_sse;
+        s->qmf_deint_neg    = ff_sbr_qmf_deint_neg_sse;
+        s->autocorrelate    = ff_sbr_autocorrelate_sse;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         s->qmf_deint_bfly   = ff_sbr_qmf_deint_bfly_sse2;
         s->qmf_pre_shuffle  = ff_sbr_qmf_pre_shuffle_sse2;
+        s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
+        s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
+        s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
+        s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
+    }
+
+    if (EXTERNAL_SSE3(cpu_flags)) {
+        s->autocorrelate = ff_sbr_autocorrelate_sse3;
     }
 }
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
index 71763dbf75..1d46212a13 100644
--- a/libavcodec/x86/simple_idct.c
+++ b/libavcodec/x86/simple_idct.c
@@ -3,24 +3,23 @@
  *
  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 
@@ -86,7 +85,7 @@ DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
 
 static inline void idct(int16_t *block)
 {
-        DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+        LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
         int16_t * const temp= (int16_t*)align_tmp;
 
         __asm__ volatile(
@@ -1148,6 +1147,7 @@ Temp
 
 "9: \n\t"
                 :: "r" (block), "r" (temp), "r" (coeffs)
+                   NAMED_CONSTRAINTS_ADD(wm1010,d40000)
                 : "%eax"
         );
 }
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index 4fc29141b5..4a98732503 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/snowdsp.c b/libavcodec/x86/snowdsp.c
new file mode 100644
index 0000000000..e2ad511d0a
--- /dev/null
+++ b/libavcodec/x86/snowdsp.c
@@ -0,0 +1,908 @@
+/*
+ * MMX and SSE2 optimized snow DSP utils
+ * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/snow.h"
+#include "libavcodec/snow_dwt.h"
+
+#if HAVE_INLINE_ASM
+
+static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
+    const int w2= (width+1)>>1;
+    const int w_l= (width>>1);
+    const int w_r= w2 - 1;
+    int i;
+
+    { // Lift 0
+        IDWTELEM * const ref = b + w2 - 1;
+        IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
+        // (the first time erroneously), we allow the SSE2 code to run an extra pass.
+        // The savings in code and time are well worth having to store this value and
+        // calculate b[0] correctly afterwards.
+
+        i = 0;
+        __asm__ volatile(
+            "pcmpeqd   %%xmm7, %%xmm7         \n\t"
+            "pcmpeqd   %%xmm3, %%xmm3         \n\t"
+            "psllw         $1, %%xmm3         \n\t"
+            "paddw     %%xmm7, %%xmm3         \n\t"
+            "psllw        $13, %%xmm3         \n\t"
+        ::);
+        for(; i<w_l-15; i+=16){
+            __asm__ volatile(
+                "movdqu   (%1), %%xmm1        \n\t"
+                "movdqu 16(%1), %%xmm5        \n\t"
+                "movdqu  2(%1), %%xmm2        \n\t"
+                "movdqu 18(%1), %%xmm6        \n\t"
+                "paddw  %%xmm1, %%xmm2        \n\t"
+                "paddw  %%xmm5, %%xmm6        \n\t"
+                "paddw  %%xmm7, %%xmm2        \n\t"
+                "paddw  %%xmm7, %%xmm6        \n\t"
+                "pmulhw %%xmm3, %%xmm2        \n\t"
+                "pmulhw %%xmm3, %%xmm6        \n\t"
+                "paddw    (%0), %%xmm2        \n\t"
+                "paddw  16(%0), %%xmm6        \n\t"
+                "movdqa %%xmm2, (%0)          \n\t"
+                "movdqa %%xmm6, 16(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+    }
+
+    { // Lift 1
+        IDWTELEM * const dst = b+w2;
+
+        i = 0;
+        for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
+            dst[i] = dst[i] - (b[i] + b[i + 1]);
+        }
+        for(; i<w_r-15; i+=16){
+            __asm__ volatile(
+                "movdqu   (%1), %%xmm1        \n\t"
+                "movdqu 16(%1), %%xmm5        \n\t"
+                "movdqu  2(%1), %%xmm2        \n\t"
+                "movdqu 18(%1), %%xmm6        \n\t"
+                "paddw  %%xmm1, %%xmm2        \n\t"
+                "paddw  %%xmm5, %%xmm6        \n\t"
+                "movdqa   (%0), %%xmm0        \n\t"
+                "movdqa 16(%0), %%xmm4        \n\t"
+                "psubw  %%xmm2, %%xmm0        \n\t"
+                "psubw  %%xmm6, %%xmm4        \n\t"
+                "movdqa %%xmm0, (%0)          \n\t"
+                "movdqa %%xmm4, 16(%0)        \n\t"
+                :: "r"(&dst[i]), "r"(&b[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+    }
+
+    { // Lift 2
+        IDWTELEM * const ref = b+w2 - 1;
+        IDWTELEM b_0 = b[0];
+
+        i = 0;
+        __asm__ volatile(
+            "psllw         $15, %%xmm7        \n\t"
+            "pcmpeqw    %%xmm6, %%xmm6        \n\t"
+            "psrlw         $13, %%xmm6        \n\t"
+            "paddw      %%xmm7, %%xmm6        \n\t"
+        ::);
+        for(; i<w_l-15; i+=16){
+            __asm__ volatile(
+                "movdqu   (%1), %%xmm0        \n\t"
+                "movdqu 16(%1), %%xmm4        \n\t"
+                "movdqu  2(%1), %%xmm1        \n\t"
+                "movdqu 18(%1), %%xmm5        \n\t" //FIXME try aligned reads and shifts
+                "paddw  %%xmm6, %%xmm0        \n\t"
+                "paddw  %%xmm6, %%xmm4        \n\t"
+                "paddw  %%xmm7, %%xmm1        \n\t"
+                "paddw  %%xmm7, %%xmm5        \n\t"
+                "pavgw  %%xmm1, %%xmm0        \n\t"
+                "pavgw  %%xmm5, %%xmm4        \n\t"
+                "psubw  %%xmm7, %%xmm0        \n\t"
+                "psubw  %%xmm7, %%xmm4        \n\t"
+                "psraw      $1, %%xmm0        \n\t"
+                "psraw      $1, %%xmm4        \n\t"
+                "movdqa   (%0), %%xmm1        \n\t"
+                "movdqa 16(%0), %%xmm5        \n\t"
+                "paddw  %%xmm1, %%xmm0        \n\t"
+                "paddw  %%xmm5, %%xmm4        \n\t"
+                "psraw      $2, %%xmm0        \n\t"
+                "psraw      $2, %%xmm4        \n\t"
+                "paddw  %%xmm1, %%xmm0        \n\t"
+                "paddw  %%xmm5, %%xmm4        \n\t"
+                "movdqa %%xmm0, (%0)          \n\t"
+                "movdqa %%xmm4, 16(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+        b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
+    }
+
+    { // Lift 3
+        IDWTELEM * const src = b+w2;
+
+        i = 0;
+        for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
+            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
+        }
+        for(; i<w_r-7; i+=8){
+            __asm__ volatile(
+                "movdqu  2(%1), %%xmm2        \n\t"
+                "movdqu 18(%1), %%xmm6        \n\t"
+                "paddw    (%1), %%xmm2        \n\t"
+                "paddw  16(%1), %%xmm6        \n\t"
+                "movdqu   (%0), %%xmm0        \n\t"
+                "movdqu 16(%0), %%xmm4        \n\t"
+                "paddw  %%xmm2, %%xmm0        \n\t"
+                "paddw  %%xmm6, %%xmm4        \n\t"
+                "psraw      $1, %%xmm2        \n\t"
+                "psraw      $1, %%xmm6        \n\t"
+                "paddw  %%xmm0, %%xmm2        \n\t"
+                "paddw  %%xmm4, %%xmm6        \n\t"
+                "movdqa %%xmm2, (%2)          \n\t"
+                "movdqa %%xmm6, 16(%2)        \n\t"
+                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+    }
+
+    {
+        snow_interleave_line_header(&i, width, b, temp);
+
+        for (; (i & 0x3E) != 0x3E; i-=2){
+            b[i+1] = temp[i>>1];
+            b[i] = b[i>>1];
+        }
+        for (i-=62; i>=0; i-=64){
+            __asm__ volatile(
+                "movdqa      (%1), %%xmm0       \n\t"
+                "movdqa    16(%1), %%xmm2       \n\t"
+                "movdqa    32(%1), %%xmm4       \n\t"
+                "movdqa    48(%1), %%xmm6       \n\t"
+                "movdqa      (%1), %%xmm1       \n\t"
+                "movdqa    16(%1), %%xmm3       \n\t"
+                "movdqa    32(%1), %%xmm5       \n\t"
+                "movdqa    48(%1), %%xmm7       \n\t"
+                "punpcklwd   (%2), %%xmm0       \n\t"
+                "punpcklwd 16(%2), %%xmm2       \n\t"
+                "punpcklwd 32(%2), %%xmm4       \n\t"
+                "punpcklwd 48(%2), %%xmm6       \n\t"
+                "movdqa    %%xmm0, (%0)         \n\t"
+                "movdqa    %%xmm2, 32(%0)       \n\t"
+                "movdqa    %%xmm4, 64(%0)       \n\t"
+                "movdqa    %%xmm6, 96(%0)       \n\t"
+                "punpckhwd   (%2), %%xmm1       \n\t"
+                "punpckhwd 16(%2), %%xmm3       \n\t"
+                "punpckhwd 32(%2), %%xmm5       \n\t"
+                "punpckhwd 48(%2), %%xmm7       \n\t"
+                "movdqa    %%xmm1, 16(%0)       \n\t"
+                "movdqa    %%xmm3, 48(%0)       \n\t"
+                "movdqa    %%xmm5, 80(%0)       \n\t"
+                "movdqa    %%xmm7, 112(%0)      \n\t"
+                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
+                 : "memory"
+               );
+        }
+    }
+}
+
+static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
+    const int w2= (width+1)>>1;
+    const int w_l= (width>>1);
+    const int w_r= w2 - 1;
+    int i;
+
+    { // Lift 0
+        IDWTELEM * const ref = b + w2 - 1;
+
+        i = 1;
+        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+        __asm__ volatile(
+            "pcmpeqw    %%mm7, %%mm7         \n\t"
+            "pcmpeqw    %%mm3, %%mm3         \n\t"
+            "psllw         $1, %%mm3         \n\t"
+            "paddw      %%mm7, %%mm3         \n\t"
+            "psllw        $13, %%mm3         \n\t"
+           ::);
+        for(; i<w_l-7; i+=8){
+            __asm__ volatile(
+                "movq     (%1), %%mm2        \n\t"
+                "movq    8(%1), %%mm6        \n\t"
+                "paddw   2(%1), %%mm2        \n\t"
+                "paddw  10(%1), %%mm6        \n\t"
+                "paddw   %%mm7, %%mm2        \n\t"
+                "paddw   %%mm7, %%mm6        \n\t"
+                "pmulhw  %%mm3, %%mm2        \n\t"
+                "pmulhw  %%mm3, %%mm6        \n\t"
+                "paddw    (%0), %%mm2        \n\t"
+                "paddw   8(%0), %%mm6        \n\t"
+                "movq    %%mm2, (%0)         \n\t"
+                "movq    %%mm6, 8(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+    }
+
+    { // Lift 1
+        IDWTELEM * const dst = b+w2;
+
+        i = 0;
+        for(; i<w_r-7; i+=8){
+            __asm__ volatile(
+                "movq     (%1), %%mm2        \n\t"
+                "movq    8(%1), %%mm6        \n\t"
+                "paddw   2(%1), %%mm2        \n\t"
+                "paddw  10(%1), %%mm6        \n\t"
+                "movq     (%0), %%mm0        \n\t"
+                "movq    8(%0), %%mm4        \n\t"
+                "psubw   %%mm2, %%mm0        \n\t"
+                "psubw   %%mm6, %%mm4        \n\t"
+                "movq    %%mm0, (%0)         \n\t"
+                "movq    %%mm4, 8(%0)        \n\t"
+                :: "r"(&dst[i]), "r"(&b[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+    }
+
+    { // Lift 2
+        IDWTELEM * const ref = b+w2 - 1;
+
+        i = 1;
+        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
+        __asm__ volatile(
+            "psllw         $15, %%mm7        \n\t"
+            "pcmpeqw     %%mm6, %%mm6        \n\t"
+            "psrlw         $13, %%mm6        \n\t"
+            "paddw       %%mm7, %%mm6        \n\t"
+           ::);
+        for(; i<w_l-7; i+=8){
+            __asm__ volatile(
+                "movq     (%1), %%mm0        \n\t"
+                "movq    8(%1), %%mm4        \n\t"
+                "movq    2(%1), %%mm1        \n\t"
+                "movq   10(%1), %%mm5        \n\t"
+                "paddw   %%mm6, %%mm0        \n\t"
+                "paddw   %%mm6, %%mm4        \n\t"
+                "paddw   %%mm7, %%mm1        \n\t"
+                "paddw   %%mm7, %%mm5        \n\t"
+                "pavgw   %%mm1, %%mm0        \n\t"
+                "pavgw   %%mm5, %%mm4        \n\t"
+                "psubw   %%mm7, %%mm0        \n\t"
+                "psubw   %%mm7, %%mm4        \n\t"
+                "psraw      $1, %%mm0        \n\t"
+                "psraw      $1, %%mm4        \n\t"
+                "movq     (%0), %%mm1        \n\t"
+                "movq    8(%0), %%mm5        \n\t"
+                "paddw   %%mm1, %%mm0        \n\t"
+                "paddw   %%mm5, %%mm4        \n\t"
+                "psraw      $2, %%mm0        \n\t"
+                "psraw      $2, %%mm4        \n\t"
+                "paddw   %%mm1, %%mm0        \n\t"
+                "paddw   %%mm5, %%mm4        \n\t"
+                "movq    %%mm0, (%0)         \n\t"
+                "movq    %%mm4, 8(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+    }
+
+    { // Lift 3
+        IDWTELEM * const src = b+w2;
+        i = 0;
+
+        for(; i<w_r-7; i+=8){
+            __asm__ volatile(
+                "movq    2(%1), %%mm2        \n\t"
+                "movq   10(%1), %%mm6        \n\t"
+                "paddw    (%1), %%mm2        \n\t"
+                "paddw   8(%1), %%mm6        \n\t"
+                "movq     (%0), %%mm0        \n\t"
+                "movq    8(%0), %%mm4        \n\t"
+                "paddw   %%mm2, %%mm0        \n\t"
+                "paddw   %%mm6, %%mm4        \n\t"
+                "psraw      $1, %%mm2        \n\t"
+                "psraw      $1, %%mm6        \n\t"
+                "paddw   %%mm0, %%mm2        \n\t"
+                "paddw   %%mm4, %%mm6        \n\t"
+                "movq    %%mm2, (%2)         \n\t"
+                "movq    %%mm6, 8(%2)        \n\t"
+                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+    }
+
+    {
+        snow_interleave_line_header(&i, width, b, temp);
+
+        for (; (i & 0x1E) != 0x1E; i-=2){
+            b[i+1] = temp[i>>1];
+            b[i] = b[i>>1];
+        }
+        for (i-=30; i>=0; i-=32){
+            __asm__ volatile(
+                "movq        (%1), %%mm0       \n\t"
+                "movq       8(%1), %%mm2       \n\t"
+                "movq      16(%1), %%mm4       \n\t"
+                "movq      24(%1), %%mm6       \n\t"
+                "movq        (%1), %%mm1       \n\t"
+                "movq       8(%1), %%mm3       \n\t"
+                "movq      16(%1), %%mm5       \n\t"
+                "movq      24(%1), %%mm7       \n\t"
+                "punpcklwd   (%2), %%mm0       \n\t"
+                "punpcklwd  8(%2), %%mm2       \n\t"
+                "punpcklwd 16(%2), %%mm4       \n\t"
+                "punpcklwd 24(%2), %%mm6       \n\t"
+                "movq       %%mm0, (%0)        \n\t"
+                "movq       %%mm2, 16(%0)      \n\t"
+                "movq       %%mm4, 32(%0)      \n\t"
+                "movq       %%mm6, 48(%0)      \n\t"
+                "punpckhwd   (%2), %%mm1       \n\t"
+                "punpckhwd  8(%2), %%mm3       \n\t"
+                "punpckhwd 16(%2), %%mm5       \n\t"
+                "punpckhwd 24(%2), %%mm7       \n\t"
+                "movq       %%mm1, 8(%0)       \n\t"
+                "movq       %%mm3, 24(%0)      \n\t"
+                "movq       %%mm5, 40(%0)      \n\t"
+                "movq       %%mm7, 56(%0)      \n\t"
+                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
+                 : "memory"
+               );
+        }
+    }
+}
+
+#if HAVE_7REGS
+#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
+        ""op" ("r",%%"REG_d"), %%"t0"      \n\t"\
+        ""op" 16("r",%%"REG_d"), %%"t1"    \n\t"\
+        ""op" 32("r",%%"REG_d"), %%"t2"    \n\t"\
+        ""op" 48("r",%%"REG_d"), %%"t3"    \n\t"
+
+#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
+        snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
+        snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "psubw %%"s0", %%"t0" \n\t"\
+        "psubw %%"s1", %%"t1" \n\t"\
+        "psubw %%"s2", %%"t2" \n\t"\
+        "psubw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
+        "movdqa %%"s0", ("w",%%"REG_d")      \n\t"\
+        "movdqa %%"s1", 16("w",%%"REG_d")    \n\t"\
+        "movdqa %%"s2", 32("w",%%"REG_d")    \n\t"\
+        "movdqa %%"s3", 48("w",%%"REG_d")    \n\t"
+
+#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
+        "psraw $"n", %%"t0" \n\t"\
+        "psraw $"n", %%"t1" \n\t"\
+        "psraw $"n", %%"t2" \n\t"\
+        "psraw $"n", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "paddw %%"s0", %%"t0" \n\t"\
+        "paddw %%"s1", %%"t1" \n\t"\
+        "paddw %%"s2", %%"t2" \n\t"\
+        "paddw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "pmulhw %%"s0", %%"t0" \n\t"\
+        "pmulhw %%"s1", %%"t1" \n\t"\
+        "pmulhw %%"s2", %%"t2" \n\t"\
+        "pmulhw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "movdqa %%"s0", %%"t0" \n\t"\
+        "movdqa %%"s1", %%"t1" \n\t"\
+        "movdqa %%"s2", %%"t2" \n\t"\
+        "movdqa %%"s3", %%"t3" \n\t"
+
+static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+    x86_reg i = width;
+
+    while(i & 0x1F)
+    {
+        i--;
+        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+    }
+    i+=i;
+
+         __asm__ volatile (
+        "jmp 2f                                      \n\t"
+        "1:                                          \n\t"
+        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
+
+
+        "pcmpeqw    %%xmm0, %%xmm0                   \n\t"
+        "pcmpeqw    %%xmm2, %%xmm2                   \n\t"
+        "paddw      %%xmm2, %%xmm2                   \n\t"
+        "paddw      %%xmm0, %%xmm2                   \n\t"
+        "psllw         $13, %%xmm2                   \n\t"
+        snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
+
+        "pcmpeqw %%xmm7, %%xmm7                      \n\t"
+        "pcmpeqw %%xmm5, %%xmm5                      \n\t"
+        "psllw $15, %%xmm7                           \n\t"
+        "psrlw $13, %%xmm5                           \n\t"
+        "paddw %%xmm7, %%xmm5                        \n\t"
+        snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+        "movq   (%2,%%"REG_d"), %%xmm1        \n\t"
+        "movq  8(%2,%%"REG_d"), %%xmm3        \n\t"
+        "paddw %%xmm7, %%xmm1                        \n\t"
+        "paddw %%xmm7, %%xmm3                        \n\t"
+        "pavgw %%xmm1, %%xmm0                        \n\t"
+        "pavgw %%xmm3, %%xmm2                        \n\t"
+        "movq 16(%2,%%"REG_d"), %%xmm1        \n\t"
+        "movq 24(%2,%%"REG_d"), %%xmm3        \n\t"
+        "paddw %%xmm7, %%xmm1                        \n\t"
+        "paddw %%xmm7, %%xmm3                        \n\t"
+        "pavgw %%xmm1, %%xmm4                        \n\t"
+        "pavgw %%xmm3, %%xmm6                        \n\t"
+        snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+
+        snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
+
+        "2:                                          \n\t"
+        "sub $64, %%"REG_d"                          \n\t"
+        "jge 1b                                      \n\t"
+        :"+d"(i)
+        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+
+#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
+        ""op" ("r",%%"REG_d"), %%"t0"   \n\t"\
+        ""op" 8("r",%%"REG_d"), %%"t1"  \n\t"\
+        ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
+        ""op" 24("r",%%"REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
+        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
+        snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
+        "movq %%"s0", ("w",%%"REG_d")   \n\t"\
+        "movq %%"s1", 8("w",%%"REG_d")  \n\t"\
+        "movq %%"s2", 16("w",%%"REG_d") \n\t"\
+        "movq %%"s3", 24("w",%%"REG_d") \n\t"
+
+#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "movq %%"s0", %%"t0" \n\t"\
+        "movq %%"s1", %%"t1" \n\t"\
+        "movq %%"s2", %%"t2" \n\t"\
+        "movq %%"s3", %%"t3" \n\t"
+
+
+static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+    x86_reg i = width;
+    while(i & 15)
+    {
+        i--;
+        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+    }
+    i+=i;
+    __asm__ volatile(
+        "jmp 2f                                      \n\t"
+        "1:                                          \n\t"
+
+        snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
+        "pcmpeqw    %%mm0, %%mm0                     \n\t"
+        "pcmpeqw    %%mm2, %%mm2                     \n\t"
+        "paddw      %%mm2, %%mm2                     \n\t"
+        "paddw      %%mm0, %%mm2                     \n\t"
+        "psllw        $13, %%mm2                     \n\t"
+        snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
+        "pcmpeqw %%mm7, %%mm7                        \n\t"
+        "pcmpeqw %%mm5, %%mm5                        \n\t"
+        "psllw $15, %%mm7                            \n\t"
+        "psrlw $13, %%mm5                            \n\t"
+        "paddw %%mm7, %%mm5                          \n\t"
+        snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+        "movq   (%2,%%"REG_d"), %%mm1         \n\t"
+        "movq  8(%2,%%"REG_d"), %%mm3         \n\t"
+        "paddw %%mm7, %%mm1                          \n\t"
+        "paddw %%mm7, %%mm3                          \n\t"
+        "pavgw %%mm1, %%mm0                          \n\t"
+        "pavgw %%mm3, %%mm2                          \n\t"
+        "movq 16(%2,%%"REG_d"), %%mm1         \n\t"
+        "movq 24(%2,%%"REG_d"), %%mm3         \n\t"
+        "paddw %%mm7, %%mm1                          \n\t"
+        "paddw %%mm7, %%mm3                          \n\t"
+        "pavgw %%mm1, %%mm4                          \n\t"
+        "pavgw %%mm3, %%mm6                          \n\t"
+        snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+
+        snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
+
+        "2:                                          \n\t"
+        "sub $32, %%"REG_d"                          \n\t"
+        "jge 1b                                      \n\t"
+        :"+d"(i)
+        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+#endif //HAVE_7REGS
+
+#if HAVE_6REGS
+#define snow_inner_add_yblock_sse2_header \
+    IDWTELEM * * dst_array = sb->line + src_y;\
+    x86_reg tmp;\
+    __asm__ volatile(\
+             "mov  %7, %%"REG_c"             \n\t"\
+             "mov  %6, %2                    \n\t"\
+             "mov  %4, %%"REG_S"             \n\t"\
+             "pxor %%xmm7, %%xmm7            \n\t" /* 0 */\
+             "pcmpeqd %%xmm3, %%xmm3         \n\t"\
+             "psllw $15, %%xmm3              \n\t"\
+             "psrlw $12, %%xmm3              \n\t" /* FRAC_BITS >> 1 */\
+             "1:                             \n\t"\
+             "mov %1, %%"REG_D"              \n\t"\
+             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
+             "add %3, %%"REG_D"              \n\t"
+
+#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
+             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
+             "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
+             "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
+             "punpcklbw %%xmm7, %%xmm0       \n\t"\
+             "punpcklbw %%xmm7, %%xmm4       \n\t"\
+             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
+             "pmullw %%xmm4, %%"out_reg2"    \n\t"
+
+#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
+             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
+             "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
+             "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
+             "punpcklbw %%xmm7, %%xmm0       \n\t"\
+             "punpcklbw %%xmm7, %%xmm4       \n\t"\
+             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
+             "pmullw %%xmm4, %%"out_reg2"    \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
+             snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
+             "paddusw %%xmm2, %%xmm1         \n\t"\
+             "paddusw %%xmm6, %%xmm5         \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
+             snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
+             "paddusw %%xmm2, %%xmm1         \n\t"\
+             "paddusw %%xmm6, %%xmm5         \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common1\
+             "add $32, %%"REG_S"             \n\t"\
+             "add %%"REG_c", %0              \n\t"\
+             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
+             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
+             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
+             "add %%"REG_c", (%%"REG_a")     \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common2\
+             "jnz 1b                         \n\t"\
+             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+             :\
+             "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+             XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
+             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+
+#define snow_inner_add_yblock_sse2_end_8\
+             "sal $1, %%"REG_c"              \n\t"\
+             "add"OPSIZE" $"PTR_SIZE"*2, %1  \n\t"\
+             snow_inner_add_yblock_sse2_end_common1\
+             "sar $1, %%"REG_c"              \n\t"\
+             "sub $2, %2                     \n\t"\
+             snow_inner_add_yblock_sse2_end_common2
+
+#define snow_inner_add_yblock_sse2_end_16\
+             "add"OPSIZE" $"PTR_SIZE"*1, %1  \n\t"\
+             snow_inner_add_yblock_sse2_end_common1\
+             "dec %2                         \n\t"\
+             snow_inner_add_yblock_sse2_end_common2
+
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_8("2", "8")
+snow_inner_add_yblock_sse2_accum_8("1", "128")
+snow_inner_add_yblock_sse2_accum_8("0", "136")
+
+             "mov %0, %%"REG_d"              \n\t"
+             "movdqa (%%"REG_D"), %%xmm0     \n\t"
+             "movdqa %%xmm1, %%xmm2          \n\t"
+
+             "punpckhwd %%xmm7, %%xmm1       \n\t"
+             "punpcklwd %%xmm7, %%xmm2       \n\t"
+             "paddd %%xmm2, %%xmm0           \n\t"
+             "movdqa 16(%%"REG_D"), %%xmm2   \n\t"
+             "paddd %%xmm1, %%xmm2           \n\t"
+             "paddd %%xmm3, %%xmm0           \n\t"
+             "paddd %%xmm3, %%xmm2           \n\t"
+
+             "mov %1, %%"REG_D"              \n\t"
+             "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
+             "add %3, %%"REG_D"              \n\t"
+
+             "movdqa (%%"REG_D"), %%xmm4     \n\t"
+             "movdqa %%xmm5, %%xmm6          \n\t"
+             "punpckhwd %%xmm7, %%xmm5       \n\t"
+             "punpcklwd %%xmm7, %%xmm6       \n\t"
+             "paddd %%xmm6, %%xmm4           \n\t"
+             "movdqa 16(%%"REG_D"), %%xmm6   \n\t"
+             "paddd %%xmm5, %%xmm6           \n\t"
+             "paddd %%xmm3, %%xmm4           \n\t"
+             "paddd %%xmm3, %%xmm6           \n\t"
+
+             "psrad $8, %%xmm0               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%xmm2               \n\t" /* FRAC_BITS. */
+             "packssdw %%xmm2, %%xmm0        \n\t"
+             "packuswb %%xmm7, %%xmm0        \n\t"
+             "movq %%xmm0, (%%"REG_d")       \n\t"
+
+             "psrad $8, %%xmm4               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%xmm6               \n\t" /* FRAC_BITS. */
+             "packssdw %%xmm6, %%xmm4        \n\t"
+             "packuswb %%xmm7, %%xmm4        \n\t"
+             "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
+snow_inner_add_yblock_sse2_end_8
+}
+
+static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_16("2", "16")
+snow_inner_add_yblock_sse2_accum_16("1", "512")
+snow_inner_add_yblock_sse2_accum_16("0", "528")
+
+             "mov %0, %%"REG_d"              \n\t"
+             "psrlw $4, %%xmm1               \n\t"
+             "psrlw $4, %%xmm5               \n\t"
+             "paddw   (%%"REG_D"), %%xmm1    \n\t"
+             "paddw 16(%%"REG_D"), %%xmm5    \n\t"
+             "paddw %%xmm3, %%xmm1           \n\t"
+             "paddw %%xmm3, %%xmm5           \n\t"
+             "psraw $4, %%xmm1               \n\t" /* FRAC_BITS. */
+             "psraw $4, %%xmm5               \n\t" /* FRAC_BITS. */
+             "packuswb %%xmm5, %%xmm1        \n\t"
+
+             "movdqu %%xmm1, (%%"REG_d")       \n\t"
+
+snow_inner_add_yblock_sse2_end_16
+}
+
+#define snow_inner_add_yblock_mmx_header \
+    IDWTELEM * * dst_array = sb->line + src_y;\
+    x86_reg tmp;\
+    __asm__ volatile(\
+             "mov  %7, %%"REG_c"             \n\t"\
+             "mov  %6, %2                    \n\t"\
+             "mov  %4, %%"REG_S"             \n\t"\
+             "pxor %%mm7, %%mm7              \n\t" /* 0 */\
+             "pcmpeqd %%mm3, %%mm3           \n\t"\
+             "psllw $15, %%mm3               \n\t"\
+             "psrlw $12, %%mm3               \n\t" /* FRAC_BITS >> 1 */\
+             "1:                             \n\t"\
+             "mov %1, %%"REG_D"              \n\t"\
+             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
+             "add %3, %%"REG_D"              \n\t"
+
+#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
+             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+             "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
+             "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
+             "punpcklbw %%mm7, %%"out_reg1" \n\t"\
+             "punpcklbw %%mm7, %%"out_reg2" \n\t"\
+             "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
+             "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
+             "punpcklbw %%mm7, %%mm0       \n\t"\
+             "punpcklbw %%mm7, %%mm4       \n\t"\
+             "pmullw %%mm0, %%"out_reg1"    \n\t"\
+             "pmullw %%mm4, %%"out_reg2"    \n\t"
+
+#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
+             snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
+             "paddusw %%mm2, %%mm1         \n\t"\
+             "paddusw %%mm6, %%mm5         \n\t"
+
+#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
+             "mov %0, %%"REG_d"              \n\t"\
+             "psrlw $4, %%mm1                \n\t"\
+             "psrlw $4, %%mm5                \n\t"\
+             "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
+             "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
+             "paddw %%mm3, %%mm1             \n\t"\
+             "paddw %%mm3, %%mm5             \n\t"\
+             "psraw $4, %%mm1                \n\t"\
+             "psraw $4, %%mm5                \n\t"\
+             "packuswb %%mm5, %%mm1          \n\t"\
+             "movq %%mm1, "write_offset"(%%"REG_d") \n\t"
+
+#define snow_inner_add_yblock_mmx_end(s_step)\
+             "add $"s_step", %%"REG_S"             \n\t"\
+             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
+             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
+             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
+             "add %%"REG_c", (%%"REG_a")     \n\t"\
+             "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\
+             "add %%"REG_c", %0              \n\t"\
+             "dec %2                         \n\t"\
+             "jnz 1b                         \n\t"\
+             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+             :\
+             "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+
+static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "8", "0")
+snow_inner_add_yblock_mmx_accum("1", "128", "0")
+snow_inner_add_yblock_mmx_accum("0", "136", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+snow_inner_add_yblock_mmx_end("16")
+}
+
+static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "16", "0")
+snow_inner_add_yblock_mmx_accum("1", "512", "0")
+snow_inner_add_yblock_mmx_accum("0", "528", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
+snow_inner_add_yblock_mmx_accum("2", "24", "8")
+snow_inner_add_yblock_mmx_accum("1", "520", "8")
+snow_inner_add_yblock_mmx_accum("0", "536", "8")
+snow_inner_add_yblock_mmx_mix("16", "8")
+snow_inner_add_yblock_mmx_end("32")
+}
+
+static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+
+    if (b_w == 16)
+        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else if (b_w == 8 && obmc_stride == 16) {
+        if (!(b_h & 1))
+            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+        else
+            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    } else
+         ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+
+static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    if (b_w == 16)
+        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else if (b_w == 8 && obmc_stride == 16)
+        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else
+        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+#endif /* HAVE_6REGS */
+
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_dwt_init_x86(SnowDWTContext *c)
+{
+#if HAVE_INLINE_ASM
+    int mm_flags = av_get_cpu_flags();
+
+    if (mm_flags & AV_CPU_FLAG_MMX) {
+        if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
+            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+#if HAVE_7REGS
+            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+#endif
+#if HAVE_6REGS
+            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+#endif
+        }
+        else{
+            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+            c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+#if HAVE_7REGS
+            c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+#endif
+            }
+#if HAVE_6REGS
+            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+#endif
+        }
+    }
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/x86/svq1enc.asm b/libavcodec/x86/svq1enc.asm
new file mode 100644
index 0000000000..a87632836d
--- /dev/null
+++ b/libavcodec/x86/svq1enc.asm
@@ -0,0 +1,61 @@
+;******************************************************************************
+;* SIMD-optimized SVQ1 encoder functions
+;* Copyright (c) 2007 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SSD_INT8_VS_INT16 0
+cglobal ssd_int8_vs_int16, 3, 3, 3, pix1, pix2, size
+    pxor m0, m0
+.loop:
+    sub       sizeq, 8
+    movq      m1, [pix1q + sizeq]
+    mova      m2, [pix2q + sizeq*2]
+%if mmsize == 8
+    movq      m3, [pix2q + sizeq*2 + mmsize]
+    punpckhbw m4, m1
+    punpcklbw m1, m1
+    psraw     m4, 8
+    psraw     m1, 8
+    psubw     m3, m4
+    psubw     m2, m1
+    pmaddwd   m3, m3
+    pmaddwd   m2, m2
+    paddd     m0, m3
+    paddd     m0, m2
+%else
+    punpcklbw m1, m1
+    psraw     m1, 8
+    psubw     m2, m1
+    pmaddwd   m2, m2
+    paddd     m0, m2
+%endif
+    jg .loop
+    HADDD     m0, m1
+    movd     eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmx
+SSD_INT8_VS_INT16
+INIT_XMM sse2
+SSD_INT8_VS_INT16
diff --git a/libavcodec/x86/svq1enc.c b/libavcodec/x86/svq1enc.c
deleted file mode 100644
index 02b0a84b8c..0000000000
--- a/libavcodec/x86/svq1enc.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/svq1enc.h"
-
-#if HAVE_INLINE_ASM
-
-static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
-                                 int size)
-{
-    int sum;
-    x86_reg i = size;
-
-    __asm__ volatile (
-        "pxor %%mm4, %%mm4 \n"
-        "1: \n"
-        "sub $8, %0 \n"
-        "movq (%2, %0), %%mm2 \n"
-        "movq (%3, %0, 2), %%mm0 \n"
-        "movq 8(%3, %0, 2), %%mm1 \n"
-        "punpckhbw %%mm2, %%mm3 \n"
-        "punpcklbw %%mm2, %%mm2 \n"
-        "psraw $8, %%mm3 \n"
-        "psraw $8, %%mm2 \n"
-        "psubw %%mm3, %%mm1 \n"
-        "psubw %%mm2, %%mm0 \n"
-        "pmaddwd %%mm1, %%mm1 \n"
-        "pmaddwd %%mm0, %%mm0 \n"
-        "paddd %%mm1, %%mm4 \n"
-        "paddd %%mm0, %%mm4 \n"
-        "jg 1b \n"
-        "movq %%mm4, %%mm3 \n"
-        "psrlq $32, %%mm3 \n"
-        "paddd %%mm3, %%mm4 \n"
-        "movd %%mm4, %1 \n"
-        : "+r" (i), "=r" (sum)
-        : "r" (pix1), "r" (pix2));
-
-    return sum;
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
-{
-#if HAVE_INLINE_ASM
-    int cpu_flags = av_get_cpu_flags();
-
-    if (INLINE_MMX(cpu_flags)) {
-        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
-    }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/svq1enc_init.c b/libavcodec/x86/svq1enc_init.c
new file mode 100644
index 0000000000..40b4b0e183
--- /dev/null
+++ b/libavcodec/x86/svq1enc_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2007 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/svq1enc.h"
+
+int ff_ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
+                             intptr_t size);
+int ff_ssd_int8_vs_int16_sse2(const int8_t *pix1, const int16_t *pix2,
+                              intptr_t size);
+
+av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_mmx;
+    }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_sse2;
+    }
+}
diff --git a/libavcodec/x86/ttadsp.asm b/libavcodec/x86/ttadsp.asm
new file mode 100644
index 0000000000..8f489498a3
--- /dev/null
+++ b/libavcodec/x86/ttadsp.asm
@@ -0,0 +1,119 @@
+;******************************************************************************
+;* TTA DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_n0113: dd ~0, ~1, ~1, ~3
+pd_1224:  dd 1, 2, 2, 4
+
+SECTION .text
+
+%macro TTA_FILTER 2
+INIT_XMM %1
+cglobal ttafilter_process_dec, 5,5,%2, qm, dx, dl, error, in, shift, round
+    mova       m2, [qmq       ]
+    mova       m3, [qmq + 0x10]
+    mova       m4, [dxq       ]
+    mova       m5, [dxq + 0x10]
+
+    movd       m6, [errorq]         ; if (filter->error < 0) {
+    SPLATD     m6                   ;     for (int i = 0; i < 8; i++)
+    psignd     m0, m4, m6           ;         filter->qm[i] -= filter->dx[i];
+    psignd     m1, m5, m6           ; } else if (filter->error > 0) {
+    paddd      m2, m0               ;     for (int i = 0; i < 8; i++)
+    paddd      m3, m1               ;         filter->qm[i] += filter->dx[i];
+    mova       [qmq       ], m2     ; }
+    mova       [qmq + 0x10], m3     ;
+
+    mova       m0, [dlq       ]
+    mova       m1, [dlq + 0x10]
+
+%if cpuflag(sse4)
+    pmulld     m2, m0
+    pmulld     m3, m1
+%else
+    pshufd     m6, m0, 0xb1
+    pshufd     m7, m2, 0xb1
+    pmuludq    m6, m7
+    pshufd     m6, m6, 0xd8
+    pmuludq    m2, m0
+    pshufd     m2, m2, 0xd8
+    punpckldq  m2, m6
+
+    pshufd     m6, m1, 0xb1
+    pshufd     m7, m3, 0xb1
+    pmuludq    m6, m7
+    pshufd     m6, m6, 0xd8
+    pmuludq    m3, m1
+    pshufd     m3, m3, 0xd8
+    punpckldq  m3, m6
+%endif
+    ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
+    paddd      m2, m3               ; int sum = filter->round +
+                                    ;           filter->dl[0] * filter->qm[0] +
+    pshufd     m3, m2, 0xe          ;           filter->dl[1] * filter->qm[1] +
+    paddd      m2, m3               ;           filter->dl[2] * filter->qm[2] +
+                                    ;           filter->dl[3] * filter->qm[3] +
+    movd       m6, roundm           ;           filter->dl[4] * filter->qm[4] +
+    paddd      m6, m2               ;           filter->dl[5] * filter->qm[5] +
+    pshufd     m2, m2, 0x1          ;           filter->dl[6] * filter->qm[6] +
+    paddd      m6, m2               ;           filter->dl[7] * filter->qm[7];
+
+    palignr    m5, m4, 4            ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2];
+                                    ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4];
+
+    palignr    m2, m1, m0, 4        ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2];
+                                    ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4];
+
+    psrad      m4, m1, 30           ; filter->dx[4] = ((filter->dl[4] >> 30) | 1);
+    por        m4, [pd_1224 ]       ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1;
+    pand       m4, [pd_n0113]       ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1;
+                                    ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3;
+
+    mova       [dlq       ], m2
+    mova       [dxq       ], m5
+    mova       [dxq + 0x10], m4
+    movd       m0, [inq]            ; filter->error = *in;
+    movd       [errorq], m0         ;
+
+    movd       m2, shiftm           ; *in += (sum >> filter->shift);
+    psrad      m6, m2               ;
+    paddd      m0, m6               ;
+    movd       [inq], m0            ;
+
+    psrldq     m1, 4                ;
+    pslldq     m0, 12               ; filter->dl[4] = -filter->dl[5];
+    pshufd     m0, m0, 0xf0         ; filter->dl[5] = -filter->dl[6];
+    psubd      m0, m1               ; filter->dl[6] = *in - filter->dl[7];
+    psrldq     m1, m0, 4            ; filter->dl[7] = *in;
+    pshufd     m1, m1, 0xf4         ; filter->dl[5] += filter->dl[6];
+    paddd      m0, m1               ; filter->dl[4] += filter->dl[5];
+    psrldq     m1, 4                ;
+    paddd      m0, m1               ;
+    mova       [dlq + 0x10], m0     ;
+    RET
+%endmacro
+
+TTA_FILTER ssse3, 8
+TTA_FILTER sse4,  7
diff --git a/libavcodec/x86/ttadsp_init.c b/libavcodec/x86/ttadsp_init.c
new file mode 100644
index 0000000000..47dc87f6af
--- /dev/null
+++ b/libavcodec/x86/ttadsp_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/ttadsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_ttafilter_process_dec_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
+                                    int32_t *error, int32_t *in, int32_t shift,
+                                    int32_t round);
+void ff_ttafilter_process_dec_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
+                                   int32_t *error, int32_t *in, int32_t shift,
+                                   int32_t round);
+
+av_cold void ff_ttadsp_init_x86(TTADSPContext *c)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSSE3(cpu_flags))
+        c->ttafilter_process_dec = ff_ttafilter_process_dec_ssse3;
+    if (EXTERNAL_SSE4(cpu_flags))
+        c->ttafilter_process_dec = ff_ttafilter_process_dec_sse4;
+#endif
+}
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
new file mode 100644
index 0000000000..f579307aa0
--- /dev/null
+++ b/libavcodec/x86/v210-init.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/v210dec.h"
+
+extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+av_cold void ff_v210_x86_init(V210DecContext *s)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (s->aligned_input) {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
+
+        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+    }
+    else {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
+
+        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+    }
+#endif
+}
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
new file mode 100644
index 0000000000..c24c765e5b
--- /dev/null
+++ b/libavcodec/x86/v210.asm
@@ -0,0 +1,90 @@
+;******************************************************************************
+;* V210 SIMD unpack
+;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_mask: times 4 dd 0x3ff
+v210_mult: dw 64,4,64,4,64,4,64,4
+v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
+v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
+
+SECTION .text
+
+%macro v210_planar_unpack 1
+
+; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+cglobal v210_planar_unpack_%1, 5, 5, 7
+    movsxdifnidn r4, r4d
+    lea    r1, [r1+2*r4]
+    add    r2, r4
+    add    r3, r4
+    neg    r4
+
+    mova   m3, [v210_mult]
+    mova   m4, [v210_mask]
+    mova   m5, [v210_luma_shuf]
+    mova   m6, [v210_chroma_shuf]
+.loop:
+%ifidn %1, unaligned
+    movu   m0, [r0]
+%else
+    mova   m0, [r0]
+%endif
+
+    pmullw m1, m0, m3
+    psrld  m0, 10
+    psrlw  m1, 6  ; u0 v0 y1 y2 v1 u2 y4 y5
+    pand   m0, m4 ; y0 __ u1 __ y3 __ v2 __
+
+    shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
+    pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
+    movu   [r1+2*r4], m2
+
+    shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
+    pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
+    movq   [r2+r4], m1
+    movhps [r3+r4], m1
+
+    add r0, mmsize
+    add r4, 6
+    jl  .loop
+
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+v210_planar_unpack unaligned
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+v210_planar_unpack unaligned
+%endif
+
+INIT_XMM ssse3
+v210_planar_unpack aligned
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+v210_planar_unpack aligned
+%endif
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index 595c8907b3..859e2d9455 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -2,20 +2,20 @@
 ;* V210 SIMD pack
 ;* Copyright (c) 2014 Kieran Kunhya <kierank@obe.tv>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -23,7 +23,8 @@
 
 SECTION_RODATA
 
-v210_enc_min_10: times 8 dw 0x4
+cextern pw_4
+%define v210_enc_min_10 pw_4
 v210_enc_max_10: times 8 dw 0x3fb
 
 v210_enc_luma_mult_10: dw 4,1,16,4,1,16,0,0
@@ -32,8 +33,10 @@ v210_enc_luma_shuf_10: db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
 v210_enc_chroma_mult_10: dw 1,4,16,0,16,1,4,0
 v210_enc_chroma_shuf_10: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1
 
-v210_enc_min_8: times 16 db 0x1
-v210_enc_max_8: times 16 db 0xfe
+cextern pb_1
+%define v210_enc_min_8 pb_1
+cextern pb_FE
+%define v210_enc_max_8 pb_FE
 
 v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
 v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0
@@ -57,7 +60,7 @@ cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
     mova    m2, [v210_enc_min_10]
     mova    m3, [v210_enc_max_10]
 
-.loop
+.loop:
     movu    m0, [yq+2*widthq]
     CLIPW   m0, m2, m3
 
@@ -99,7 +102,7 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
     mova    m5, [v210_enc_max_8]
     pxor    m6, m6
 
-.loop
+.loop:
     movu    m1, [yq+2*widthq]
     CLIPUB  m1, m4, m5
 
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index 95b999bc05..2afb1b2d7b 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index adf08d7d84..546688cf9d 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -2,20 +2,20 @@
 ;* VC1 deblocking optimizations
 ;* Copyright (c) 2009 David Conrad
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vc1dsp.h b/libavcodec/x86/vc1dsp.h
index 9b6c8ada26..fdd4de1813 100644
--- a/libavcodec/x86/vc1dsp.h
+++ b/libavcodec/x86/vc1dsp.h
@@ -1,20 +1,20 @@
 /*
  * VC-1 and WMV3 decoder - X86 DSP init functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index aff4b264e3..2bef5f5fb5 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -27,6 +27,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
 #include "libavcodec/vc1dsp.h"
 #include "fpel.h"
 #include "vc1dsp.h"
@@ -62,12 +63,17 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
     ff_vc1_h_loop_filter8_sse4(src,          stride, pq);
     ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
 }
-
 static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride, int rnd)
 {
     ff_avg_pixels8_mmxext(dst, src, stride, 8);
 }
+static void avg_vc1_mspel_mc00_16_sse2(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride, int rnd)
+{
+    ff_avg_pixels16_sse2(dst, src, stride, 16);
+}
+
 #endif /* HAVE_YASM */
 
 void ff_put_vc1_chroma_mc8_nornd_mmx  (uint8_t *dst, uint8_t *src,
@@ -86,10 +92,10 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags))
+    if (HAVE_6REGS && INLINE_MMX(cpu_flags))
         ff_vc1dsp_init_mmx(dsp);
 
-    if (INLINE_MMXEXT(cpu_flags))
+    if (HAVE_6REGS && INLINE_MMXEXT(cpu_flags))
         ff_vc1dsp_init_mmxext(dsp);
 
 #define ASSIGN_LF(EXT) \
@@ -111,13 +117,14 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
         ASSIGN_LF(mmxext);
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
 
-        dsp->avg_vc1_mspel_pixels_tab[0]         = avg_vc1_mspel_mc00_mmxext;
+        dsp->avg_vc1_mspel_pixels_tab[1][0]      = avg_vc1_mspel_mc00_mmxext;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
         dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_sse2;
         dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse2;
         dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
         dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
+        dsp->avg_vc1_mspel_pixels_tab[0][0]      = avg_vc1_mspel_mc00_16_sse2;
     }
     if (EXTERNAL_SSSE3(cpu_flags)) {
         ASSIGN_LF(ssse3);
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 046affbc26..a7eb59df47 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -25,7 +25,6 @@
  */
 
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
@@ -34,7 +33,7 @@
 #include "fpel.h"
 #include "vc1dsp.h"
 
-#if HAVE_INLINE_ASM
+#if HAVE_6REGS && HAVE_INLINE_ASM
 
 #define OP_PUT(S,D)
 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -81,7 +80,7 @@
     "movq      %%mm"#R1", "#OFF"(%1)   \n\t"    \
     "add       %2, %0                  \n\t"
 
-/** Sacrifying mm6 allows to pipeline loads from src */
+/** Sacrificing mm6 makes it possible to pipeline loads from src */
 static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                        const uint8_t *src, x86_reg stride,
                                        int rnd, int64_t shift)
@@ -111,6 +110,7 @@ static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
         : "+r"(src), "+r"(dst)
         : "r"(stride), "r"(-2*stride),
           "m"(shift), "m"(rnd), "r"(9*stride-4)
+          NAMED_CONSTRAINTS_ADD(ff_pw_9)
         : "%"REG_c, "memory"
     );
 }
@@ -155,6 +155,7 @@ static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
         "jnz 1b                            \n\t"\
         : "+r"(h), "+r" (src),  "+r" (dst)\
         : "r"(stride), "m"(rnd)\
+          NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\
         : "memory"\
     );\
 }
@@ -213,6 +214,7 @@ static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
         : "+r"(src),  "+r"(dst)\
         : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
           "g"(stride-offset)\
+          NAMED_CONSTRAINTS_ADD(ff_pw_9)\
         : "%"REG_c, "memory"\
     );\
 }
@@ -315,6 +317,7 @@ vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src,      \
         : "+r"(h), "+r" (src),  "+r" (dst)                              \
         : "r"(src_stride), "r"(3*src_stride),                           \
           "m"(rnd), "m"(shift)                                          \
+          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18)              \
         : "memory"                                                      \
     );                                                                  \
 }
@@ -352,6 +355,7 @@ OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride,    \
         "jnz 1b                    \n\t"                                \
         : "+r"(h), "+r" (src),  "+r" (dst)                              \
         : "r"(stride), "m"(rnd)                                         \
+          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128)    \
         : "memory"                                                      \
     );                                                                  \
 }
@@ -387,6 +391,7 @@ OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src,         \
         "jnz 1b                    \n\t"                                \
         : "+r"(h), "+r" (src),  "+r" (dst)                              \
         : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd)             \
+          NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3)              \
         : "memory"                                                      \
     );                                                                  \
 }
@@ -441,7 +446,7 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
             static const int shift_value[] = { 0, 5, 1, 5 };\
             int              shift = (shift_value[hmode]+shift_value[vmode])>>1;\
             int              r;\
-            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
+            LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);\
 \
             r = (1<<(shift-1)) + rnd-1;\
             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
@@ -457,6 +462,15 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
 \
     /* Horizontal mode with no vertical mode */\
     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
+} \
+static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
+                                  int stride, int hmode, int vmode, int rnd)\
+{ \
+    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
+    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
+    dst += 8*stride; src += 8*stride; \
+    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
+    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
 }
 
 VC1_MSPEL_MC(put_)
@@ -477,6 +491,20 @@ static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst,         \
                                                   int rnd)              \
 {                                                                       \
      avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
+}\
+static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst,         \
+                                                  const uint8_t *src,   \
+                                                  ptrdiff_t stride,     \
+                                                  int rnd)              \
+{                                                                       \
+     put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
+}\
+static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst,      \
+                                                     const uint8_t *src,\
+                                                     ptrdiff_t stride,  \
+                                                     int rnd)           \
+{                                                                       \
+     avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
 }
 
 DECLARE_FUNCTION(0, 1)
@@ -700,59 +728,83 @@ static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
     );
 }
 
+#if HAVE_MMX_EXTERNAL
 static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int rnd)
 {
     ff_put_pixels8_mmx(dst, src, stride, 8);
 }
+static void put_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride, int rnd)
+{
+    ff_put_pixels16_mmx(dst, src, stride, 16);
+}
+static void avg_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t stride, int rnd)
+{
+    ff_avg_pixels8_mmx(dst, src, stride, 8);
+}
+static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride, int rnd)
+{
+    ff_avg_pixels16_mmx(dst, src, stride, 16);
+}
+#endif
+
+#define FN_ASSIGN(OP, X, Y, INSN) \
+    dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
+    dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
 
 av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 {
-    dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
-    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
-
-    dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
-    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
-
-    dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
-    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
-    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
-
-    dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
-    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
-    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
+#if HAVE_MMX_EXTERNAL
+    FN_ASSIGN(put_, 0, 0, _mmx);
+    FN_ASSIGN(avg_, 0, 0, _mmx);
+#endif
+    FN_ASSIGN(put_, 0, 1, _mmx);
+    FN_ASSIGN(put_, 0, 2, _mmx);
+    FN_ASSIGN(put_, 0, 3, _mmx);
+
+    FN_ASSIGN(put_, 1, 0, _mmx);
+    FN_ASSIGN(put_, 1, 1, _mmx);
+    FN_ASSIGN(put_, 1, 2, _mmx);
+    FN_ASSIGN(put_, 1, 3, _mmx);
+
+    FN_ASSIGN(put_, 2, 0, _mmx);
+    FN_ASSIGN(put_, 2, 1, _mmx);
+    FN_ASSIGN(put_, 2, 2, _mmx);
+    FN_ASSIGN(put_, 2, 3, _mmx);
+
+    FN_ASSIGN(put_, 3, 0, _mmx);
+    FN_ASSIGN(put_, 3, 1, _mmx);
+    FN_ASSIGN(put_, 3, 2, _mmx);
+    FN_ASSIGN(put_, 3, 3, _mmx);
 }
 
 av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
 {
-    dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
+    FN_ASSIGN(avg_, 0, 1, _mmxext);
+    FN_ASSIGN(avg_, 0, 2, _mmxext);
+    FN_ASSIGN(avg_, 0, 3, _mmxext);
 
-    dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;
+    FN_ASSIGN(avg_, 1, 0, _mmxext);
+    FN_ASSIGN(avg_, 1, 1, _mmxext);
+    FN_ASSIGN(avg_, 1, 2, _mmxext);
+    FN_ASSIGN(avg_, 1, 3, _mmxext);
 
-    dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;
+    FN_ASSIGN(avg_, 2, 0, _mmxext);
+    FN_ASSIGN(avg_, 2, 1, _mmxext);
+    FN_ASSIGN(avg_, 2, 2, _mmxext);
+    FN_ASSIGN(avg_, 2, 3, _mmxext);
 
-    dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;
+    FN_ASSIGN(avg_, 3, 0, _mmxext);
+    FN_ASSIGN(avg_, 3, 1, _mmxext);
+    FN_ASSIGN(avg_, 3, 2, _mmxext);
+    FN_ASSIGN(avg_, 3, 3, _mmxext);
 
     dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
     dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
     dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
     dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
 }
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_6REGS && HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
index 53b9e8292c..25d43640ab 100644
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@@ -2,20 +2,20 @@
 ;* Core video DSP functions
 ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -54,13 +54,13 @@ SECTION .text
 ; |    |    <- bottom is copied from last line in body of source
 ; '----' <- bh
 %if ARCH_X86_64
-cglobal emu_edge_vvar, 7, 8, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
                                 start_y, end_y, bh, w
 %else ; x86-32
 cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
 %define src_strideq r3mp
-%define dst_strideq r2mp
-    mov            srcq, r1mp
+%define dst_strideq r1mp
+    mov            srcq, r2mp
     mov        start_yq, r4mp
     mov          end_yq, r5mp
     mov             bhq, r6mp
@@ -97,7 +97,10 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
     neg        n_wordsq
     lea        start_xq, [start_xq+n_wordsq*2]
 .y_loop:                                        ; do {
-    ; FIXME also write a ssse3 version using pshufb
+%if cpuflag(avx2)
+    vpbroadcastb     m0, [dstq+start_xq]
+    mov              wq, n_wordsq               ;   initialize w
+%else
     movzx            wd, byte [dstq+start_xq]   ;   w = read(1)
     imul             wd, 0x01010101             ;   w *= 0x01010101
     movd             m0, wd
@@ -107,6 +110,7 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
 %else ; mmx
     punpckldq        m0, m0                     ;   splat
 %endif ; mmx/sse
+%endif ; avx2
 .x_loop:                                        ;   do {
     movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
     add              wq, mmsize/2               ;     w -= $mmsize/2
@@ -127,6 +131,11 @@ hvar_fn
 INIT_XMM sse2
 hvar_fn
 
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+hvar_fn
+%endif
+
 ; macro to read/write a horizontal number of pixels (%2) to/from registers
 ; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
 ;         - if (%2 & 8)  fills 8 bytes into xmm$next
@@ -262,30 +271,30 @@ hvar_fn
 %rep 1+%2-%1
 %if %%n <= 3
 %if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                        start_y, end_y, val, bh
     mov             bhq, r6mp                   ; r6mp = bhmp
 %else ; x86-32
 cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
     mov            dstq, r0mp
-    mov            srcq, r1mp
+    mov            srcq, r2mp
     mov        start_yq, r4mp
     mov          end_yq, r5mp
     mov             bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
 %define src_strideq r3mp
 %endif ; x86-64/32
 %else
 %if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                        start_y, end_y, bh
 %else ; x86-32
 cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
-    mov            srcq, r1mp
+    mov            srcq, r2mp
     mov        start_yq, r4mp
     mov          end_yq, r5mp
     mov             bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
 %define src_strideq r3mp
 %endif ; x86-64/32
 %endif
@@ -344,9 +353,8 @@ VERTICAL_EXTEND 16, 22
 ; obviously not the same on both sides.
 
 %macro READ_V_PIXEL 2
-%if %1 == 2
-    movzx          valw, byte %2
-    imul           valw, 0x0101
+%if cpuflag(avx2)
+    vpbroadcastb     m0, %2
 %else
     movzx          vald, byte %2
     imul           vald, 0x01010101
@@ -356,13 +364,16 @@ VERTICAL_EXTEND 16, 22
     pshufd           m0, m0, q0000
 %else
     punpckldq        m0, m0
-%endif
-%endif ; %1 >= 8
-%endif
+%endif ; mmsize == 16
+%endif ; %1 > 16
+%endif ; avx2
 %endmacro ; READ_V_PIXEL
 
 %macro WRITE_V_PIXEL 2
 %assign %%off 0
+
+%if %1 >= 8
+
 %rep %1/mmsize
     movu     [%2+%%off], m0
 %assign %%off %%off+mmsize
@@ -378,34 +389,44 @@ VERTICAL_EXTEND 16, 22
 %assign %%off %%off+8
 %endif
 %endif ; %1-%%off >= 8
-%endif
+%endif ; mmsize == 16
 
 %if %1-%%off >= 4
 %if %1 > 8 && %1-%%off > 4
     movq      [%2+%1-8], m0
 %assign %%off %1
-%elif %1 >= 8 && %1-%%off >= 4
-    movd     [%2+%%off], m0
-%assign %%off %%off+4
 %else
-    mov      [%2+%%off], vald
+    movd     [%2+%%off], m0
 %assign %%off %%off+4
 %endif
 %endif ; %1-%%off >= 4
 
-%if %1-%%off >= 2
-%if %1 >= 8
-    movd      [%2+%1-4], m0
+%else ; %1 < 8
+
+%rep %1/4
+    mov      [%2+%%off], vald
+%assign %%off %%off+4
+%endrep ; %1/4
+
+%endif ; %1 >=/< 8
+
+%if %1-%%off == 2
+%if cpuflag(avx2)
+    movd     [%2+%%off-2], m0
 %else
     mov      [%2+%%off], valw
-%endif
+%endif ; avx2
 %endif ; (%1-%%off)/2
 %endmacro ; WRITE_V_PIXEL
 
 %macro H_EXTEND 2
 %assign %%n %1
 %rep 1+(%2-%1)/2
+%if cpuflag(avx2)
+cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
+%else
 cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
+%endif
 .loop_y:                                        ; do {
     READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
     WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
@@ -426,6 +447,11 @@ H_EXTEND 16, 22
 INIT_XMM sse2
 H_EXTEND 16, 22
 
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+H_EXTEND 8, 22
+%endif
+
 %macro PREFETCH_FN 1
 cglobal prefetch, 3, 3, 0, buf, stride, h
 .loop:
diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c
index 8ee837096a..26e072bb12 100644
--- a/libavcodec/x86/videodsp_init.c
+++ b/libavcodec/x86/videodsp_init.c
@@ -1,25 +1,27 @@
 /*
+ * Copyright (C) 2002-2012 Michael Niedermayer
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
@@ -28,11 +30,11 @@
 #include "libavcodec/videodsp.h"
 
 #if HAVE_YASM
-typedef void emu_edge_vfix_func(uint8_t *dst, const uint8_t *src,
-                                x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
+                                const uint8_t *src, x86_reg src_stride,
                                 x86_reg start_y, x86_reg end_y, x86_reg bh);
-typedef void emu_edge_vvar_func(uint8_t *dst, const uint8_t *src,
-                                x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
+                                const uint8_t *src, x86_reg src_stride,
                                 x86_reg start_y, x86_reg end_y, x86_reg bh,
                                 x86_reg w);
 
@@ -59,7 +61,7 @@ extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
 extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
 extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
 #if ARCH_X86_32
-static emu_edge_vfix_func *vfixtbl_mmx[22] = {
+static emu_edge_vfix_func * const vfixtbl_mmx[22] = {
     &ff_emu_edge_vfix1_mmx,  &ff_emu_edge_vfix2_mmx,  &ff_emu_edge_vfix3_mmx,
     &ff_emu_edge_vfix4_mmx,  &ff_emu_edge_vfix5_mmx,  &ff_emu_edge_vfix6_mmx,
     &ff_emu_edge_vfix7_mmx,  &ff_emu_edge_vfix8_mmx,  &ff_emu_edge_vfix9_mmx,
@@ -78,7 +80,7 @@ extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
 extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
 extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
 extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
-static emu_edge_vfix_func *vfixtbl_sse[22] = {
+static emu_edge_vfix_func * const vfixtbl_sse[22] = {
     ff_emu_edge_vfix1_mmx,  ff_emu_edge_vfix2_mmx,  ff_emu_edge_vfix3_mmx,
     ff_emu_edge_vfix4_mmx,  ff_emu_edge_vfix5_mmx,  ff_emu_edge_vfix6_mmx,
     ff_emu_edge_vfix7_mmx,  ff_emu_edge_vfix8_mmx,  ff_emu_edge_vfix9_mmx,
@@ -107,7 +109,7 @@ extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
 extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
 extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
 #if ARCH_X86_32
-static emu_edge_hfix_func *hfixtbl_mmx[11] = {
+static emu_edge_hfix_func * const hfixtbl_mmx[11] = {
     ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
     ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
     ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
@@ -119,13 +121,30 @@ extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
-static emu_edge_hfix_func *hfixtbl_sse2[11] = {
+static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
     ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
     ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
     ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
     ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
 };
 extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
+#if HAVE_AVX2_EXTERNAL
+extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
+static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
+    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
+    ff_emu_edge_hfix8_avx2,  ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
+    ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
+    ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
+};
+extern emu_edge_hvar_func ff_emu_edge_hvar_avx2;
+#endif
 
 static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
                                               ptrdiff_t dst_stride,
@@ -133,22 +152,26 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
                                               x86_reg block_w, x86_reg block_h,
                                               x86_reg src_x, x86_reg src_y,
                                               x86_reg w, x86_reg h,
-                                              emu_edge_vfix_func **vfix_tbl,
+                                              emu_edge_vfix_func * const *vfix_tbl,
                                               emu_edge_vvar_func *v_extend_var,
-                                              emu_edge_hfix_func **hfix_tbl,
+                                              emu_edge_hfix_func * const *hfix_tbl,
                                               emu_edge_hvar_func *h_extend_var)
 {
     x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
 
     if (!w || !h)
-         return;
+        return;
+
+    av_assert2(block_w <= FFABS(dst_stride));
 
     if (src_y >= h) {
-        src  -= src_y * src_stride;
-        src_y = src_y_add = h - 1;
+        src -= src_y*src_stride;
+        src_y_add = h - 1;
+        src_y     = h - 1;
     } else if (src_y <= -block_h) {
-        src  -= src_y*src_stride;
-        src_y = src_y_add = 1 - block_h;
+        src -= src_y*src_stride;
+        src_y_add = 1 - block_h;
+        src_y     = 1 - block_h;
     }
     if (src_x >= w) {
         src   += w - 1 - src_x;
@@ -162,18 +185,17 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
     start_x = FFMAX(0, -src_x);
     end_y   = FFMIN(block_h, h-src_y);
     end_x   = FFMIN(block_w, w-src_x);
-    assert(start_x < end_x && block_w > 0);
-    assert(start_y < end_y && block_h > 0);
+    av_assert2(start_x < end_x && block_w > 0);
+    av_assert2(start_y < end_y && block_h > 0);
 
     // fill in the to-be-copied part plus all above/below
     src += (src_y_add + start_y) * src_stride + start_x;
     w = end_x - start_x;
     if (w <= 22) {
-        vfix_tbl[w - 1](dst + start_x, src,
-                        dst_stride, src_stride,
+        vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
                         start_y, end_y, block_h);
     } else {
-        v_extend_var(dst + start_x, src, dst_stride, src_stride,
+        v_extend_var(dst + start_x, dst_stride, src, src_stride,
                      start_y, end_y, block_h, w);
     }
 
@@ -212,7 +234,7 @@ static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                      hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
 }
 
-static av_noinline void emulated_edge_mc_sse(uint8_t * buf,const uint8_t *src,
+static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t buf_stride,
                                              ptrdiff_t src_stride,
                                              int block_w, int block_h,
@@ -231,10 +253,24 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
                                               int src_x, int src_y, int w,
                                               int h)
 {
-    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, src_x,
-                     src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
                      hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
 }
+
+#if HAVE_AVX2_EXTERNAL
+static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
+                                              ptrdiff_t buf_stride,
+                                              ptrdiff_t src_stride,
+                                              int block_w, int block_h,
+                                              int src_x, int src_y, int w,
+                                              int h)
+{
+    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+                     hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
+}
+#endif /* HAVE_AVX2_EXTERNAL */
 #endif /* HAVE_YASM */
 
 void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
@@ -264,5 +300,10 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
     if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
         ctx->emulated_edge_mc = emulated_edge_mc_sse2;
     }
+#if HAVE_AVX2_EXTERNAL
+    if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
+        ctx->emulated_edge_mc = emulated_edge_mc_avx2;
+    }
+#endif
 #endif /* HAVE_YASM */
 }
diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm
index c54650eef5..b25d838868 100644
--- a/libavcodec/x86/vorbisdsp.asm
+++ b/libavcodec/x86/vorbisdsp.asm
@@ -2,20 +2,20 @@
 ;* Vorbis x86 optimizations
 ;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vorbisdsp_init.c b/libavcodec/x86/vorbisdsp_init.c
index bbd83195cc..bc1cc43a18 100644
--- a/libavcodec/x86/vorbisdsp_init.c
+++ b/libavcodec/x86/vorbisdsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index fc8a047224..d457cd7de5 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -2,20 +2,20 @@
 ;* MMX/SSE2-optimized functions for the VP3 decoder
 ;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -40,6 +40,7 @@ pb_81: times 8 db 0x81
 cextern pb_1
 cextern pb_3
 cextern pb_80
+cextern pb_FE
 
 cextern pw_8
 
@@ -147,6 +148,49 @@ cglobal vp3_h_loop_filter, 3, 4
     STORE_4_WORDS m3
     RET
 
+%macro PAVGB_NO_RND 0
+    mova   m4, m0
+    mova   m5, m2
+    pand   m4, m1
+    pand   m5, m3
+    pxor   m1, m0
+    pxor   m3, m2
+    pand   m1, m6
+    pand   m3, m6
+    psrlq  m1, 1
+    psrlq  m3, 1
+    paddb  m4, m1
+    paddb  m5, m3
+%endmacro
+
+INIT_MMX mmx
+cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
+    mova   m6, [pb_FE]
+    lea    stride3q,[strideq+strideq*2]
+.loop:
+    mova   m0, [src1q]
+    mova   m1, [src2q]
+    mova   m2, [src1q+strideq]
+    mova   m3, [src2q+strideq]
+    PAVGB_NO_RND
+    mova   [dstq], m4
+    mova   [dstq+strideq], m5
+
+    mova   m0, [src1q+strideq*2]
+    mova   m1, [src2q+strideq*2]
+    mova   m2, [src1q+stride3q]
+    mova   m3, [src2q+stride3q]
+    PAVGB_NO_RND
+    mova   [dstq+strideq*2], m4
+    mova   [dstq+stride3q],  m5
+
+    lea    src1q, [src1q+strideq*4]
+    lea    src2q, [src2q+strideq*4]
+    lea    dstq,  [dstq+strideq*4]
+    sub    hd, 4
+    jnz .loop
+    RET
+
 ; from original comments: The Macro does IDct on 4 1-D Dcts
 %macro BeginIDCT 0
     movq          m2, I(3)
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index b320dc5db9..2ece9ab7d5 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2009 David Conrad <lessen42@gmail.com>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,7 +25,6 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/vp3dsp.h"
-#include "config.h"
 
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
@@ -39,16 +40,21 @@ void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
 void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
                                  int *bounding_values);
 
+void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
+                                     const uint8_t *b, ptrdiff_t stride,
+                                     int h);
+
 av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
 {
     int cpu_flags = av_get_cpu_flags();
 
-#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
+        c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx;
+#if ARCH_X86_32
         c->idct_put  = ff_vp3_idct_put_mmx;
         c->idct_add  = ff_vp3_idct_add_mmx;
-    }
 #endif
+    }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
diff --git a/libavcodec/x86/vp56_arith.h b/libavcodec/x86/vp56_arith.h
index 0a693684af..810cc8dcd8 100644
--- a/libavcodec/x86/vp56_arith.h
+++ b/libavcodec/x86/vp56_arith.h
@@ -4,49 +4,46 @@
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  * Copyright (C) 2010  Eli Friedman
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_X86_VP56_ARITH_H
 #define AVCODEC_X86_VP56_ARITH_H
 
-#if HAVE_INLINE_ASM && HAVE_FAST_CMOV
+#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS
 #define vp56_rac_get_prob vp56_rac_get_prob
 static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
 {
     unsigned int code_word = vp56_rac_renorm(c);
-    unsigned int high = c->high;
-    unsigned int low = 1 + (((high - 1) * prob) >> 8);
+    unsigned int low = 1 + (((c->high - 1) * prob) >> 8);
     unsigned int low_shift = low << 16;
     int bit = 0;
+    c->code_word = code_word;
 
     __asm__(
         "subl  %4, %1      \n\t"
         "subl  %3, %2      \n\t"
-        "leal (%2, %3), %3 \n\t"
         "setae %b0         \n\t"
         "cmovb %4, %1      \n\t"
-        "cmovb %3, %2      \n\t"
-        : "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift)
-        : "r"(low)
+        "cmovb %5, %2      \n\t"
+        : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word)
+        : "r"(low_shift), "r"(low), "r"(code_word)
     );
 
-    c->high      = high;
-    c->code_word = code_word;
     return bit;
 }
 #endif
diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm
index 80f8ca5f38..3d874ea62a 100644
--- a/libavcodec/x86/vp6dsp.asm
+++ b/libavcodec/x86/vp6dsp.asm
@@ -3,20 +3,20 @@
 ;* Copyright (C) 2009  Sebastien Lucas <sebastien.lucas@gmail.com>
 ;* Copyright (C) 2009  Zuxy Meng <zuxy.meng@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vp6dsp_init.c b/libavcodec/x86/vp6dsp_init.c
index cd94f3e038..82baee7e97 100644
--- a/libavcodec/x86/vp6dsp_init.c
+++ b/libavcodec/x86/vp6dsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2009  Sebastien Lucas <sebastien.lucas@gmail.com>
  * Copyright (C) 2009  Zuxy Meng <zuxy.meng@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index adc9730dfa..538b3f4a9b 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -143,13 +143,13 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
 
-pw_256:   times 8 dw 256
 pw_20091: times 4 dw 20091
 pw_17734: times 4 dw 17734
 
 cextern pw_3
 cextern pw_4
 cextern pw_64
+cextern pw_256
 
 SECTION .text
 
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index e5afd493bb..8d5d033744 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -169,7 +169,7 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
     uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
     ptrdiff_t srcstride, int height, int mx, int my) \
 { \
-    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
+    LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \
     uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
     src -= srcstride * (TAPNUMY / 2 - 1); \
     ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
@@ -214,7 +214,7 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
     uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
     ptrdiff_t srcstride, int height, int mx, int my) \
 { \
-    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
+    LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \
     ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
         tmp, SIZE,      src, srcstride, height + 1, mx, my); \
     ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
@@ -347,7 +347,7 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
     }
 
-    if (EXTERNAL_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
+    if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
         VP8_LUMA_MC_FUNC(0, 16, sse2);
         VP8_MC_FUNC(1, 8, sse2);
         VP8_BILINEAR_MC_FUNC(0, 16, sse2);
@@ -417,7 +417,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
         c->vp8_luma_dc_wht                      = ff_vp8_luma_dc_wht_sse;
     }
 
-    if (EXTERNAL_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
+    if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
 
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
@@ -430,7 +430,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->vp8_idct_dc_add4y          = ff_vp8_idct_dc_add4y_sse2;
 
-        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
+        c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse2;
 
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
@@ -455,7 +455,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
     }
 
     if (EXTERNAL_SSE4(cpu_flags)) {
-        c->vp8_idct_dc_add                  = ff_vp8_idct_dc_add_sse4;
+        c->vp8_idct_dc_add            = ff_vp8_idct_dc_add_sse4;
 
         c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse4;
         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse4;
diff --git a/libavcodec/x86/vp8dsp_loopfilter.asm b/libavcodec/x86/vp8dsp_loopfilter.asm
index 5d792e8207..98bb6696a0 100644
--- a/libavcodec/x86/vp8dsp_loopfilter.asm
+++ b/libavcodec/x86/vp8dsp_loopfilter.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vp9dsp.asm b/libavcodec/x86/vp9dsp.asm
deleted file mode 100644
index 6488f3092d..0000000000
--- a/libavcodec/x86/vp9dsp.asm
+++ /dev/null
@@ -1,277 +0,0 @@
-;******************************************************************************
-;* VP9 SIMD optimizations
-;*
-;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
-;*
-;* This file is part of Libav.
-;*
-;* Libav is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* Libav is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-; FIXME share with vp8dsp.asm
-pw_256:   times 8 dw 256
-
-%macro F8_TAPS 8
-times 8 db %1, %2
-times 8 db %3, %4
-times 8 db %5, %6
-times 8 db %7, %8
-%endmacro
-; int8_t ff_filters_ssse3[3][15][4][16]
-const filters_ssse3 ; smooth
-                    F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
-                    F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
-                    F8_TAPS -2, -2,  26,  63,  43,   4, -4,  0
-                    F8_TAPS -2, -3,  24,  62,  46,   5, -4,  0
-                    F8_TAPS -2, -3,  21,  60,  49,   7, -4,  0
-                    F8_TAPS -1, -4,  18,  59,  51,   9, -4,  0
-                    F8_TAPS -1, -4,  16,  57,  53,  12, -4, -1
-                    F8_TAPS -1, -4,  14,  55,  55,  14, -4, -1
-                    F8_TAPS -1, -4,  12,  53,  57,  16, -4, -1
-                    F8_TAPS  0, -4,   9,  51,  59,  18, -4, -1
-                    F8_TAPS  0, -4,   7,  49,  60,  21, -3, -2
-                    F8_TAPS  0, -4,   5,  46,  62,  24, -3, -2
-                    F8_TAPS  0, -4,   4,  43,  63,  26, -2, -2
-                    F8_TAPS  0, -3,   2,  41,  63,  29, -2, -2
-                    F8_TAPS  0, -3,   1,  38,  64,  32, -1, -3
-                    ; regular
-                    F8_TAPS  0,  1,  -5, 126,   8,  -3,  1,  0
-                    F8_TAPS -1,  3, -10, 122,  18,  -6,  2,  0
-                    F8_TAPS -1,  4, -13, 118,  27,  -9,  3, -1
-                    F8_TAPS -1,  4, -16, 112,  37, -11,  4, -1
-                    F8_TAPS -1,  5, -18, 105,  48, -14,  4, -1
-                    F8_TAPS -1,  5, -19,  97,  58, -16,  5, -1
-                    F8_TAPS -1,  6, -19,  88,  68, -18,  5, -1
-                    F8_TAPS -1,  6, -19,  78,  78, -19,  6, -1
-                    F8_TAPS -1,  5, -18,  68,  88, -19,  6, -1
-                    F8_TAPS -1,  5, -16,  58,  97, -19,  5, -1
-                    F8_TAPS -1,  4, -14,  48, 105, -18,  5, -1
-                    F8_TAPS -1,  4, -11,  37, 112, -16,  4, -1
-                    F8_TAPS -1,  3,  -9,  27, 118, -13,  4, -1
-                    F8_TAPS  0,  2,  -6,  18, 122, -10,  3, -1
-                    F8_TAPS  0,  1,  -3,   8, 126,  -5,  1,  0
-                    ; sharp
-                    F8_TAPS -1,  3,  -7, 127,   8,  -3,  1,  0
-                    F8_TAPS -2,  5, -13, 125,  17,  -6,  3, -1
-                    F8_TAPS -3,  7, -17, 121,  27, -10,  5, -2
-                    F8_TAPS -4,  9, -20, 115,  37, -13,  6, -2
-                    F8_TAPS -4, 10, -23, 108,  48, -16,  8, -3
-                    F8_TAPS -4, 10, -24, 100,  59, -19,  9, -3
-                    F8_TAPS -4, 11, -24,  90,  70, -21, 10, -4
-                    F8_TAPS -4, 11, -23,  80,  80, -23, 11, -4
-                    F8_TAPS -4, 10, -21,  70,  90, -24, 11, -4
-                    F8_TAPS -3,  9, -19,  59, 100, -24, 10, -4
-                    F8_TAPS -3,  8, -16,  48, 108, -23, 10, -4
-                    F8_TAPS -2,  6, -13,  37, 115, -20,  9, -4
-                    F8_TAPS -2,  5, -10,  27, 121, -17,  7, -3
-                    F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
-                    F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1
-
-SECTION .text
-
-%macro filter_h_fn 1
-%assign %%px mmsize/2
-cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, src, dstride, sstride, h, filtery
-    mova        m6, [pw_256]
-    mova        m7, [filteryq+ 0]
-%if ARCH_X86_64 && mmsize > 8
-    mova        m8, [filteryq+16]
-    mova        m9, [filteryq+32]
-    mova       m10, [filteryq+48]
-%endif
-.loop:
-    movh        m0, [srcq-3]
-    movh        m1, [srcq-2]
-    movh        m2, [srcq-1]
-    movh        m3, [srcq+0]
-    movh        m4, [srcq+1]
-    movh        m5, [srcq+2]
-    punpcklbw   m0, m1
-    punpcklbw   m2, m3
-    movh        m1, [srcq+3]
-    movh        m3, [srcq+4]
-    add       srcq, sstrideq
-    punpcklbw   m4, m5
-    punpcklbw   m1, m3
-    pmaddubsw   m0, m7
-%if ARCH_X86_64 && mmsize > 8
-    pmaddubsw   m2, m8
-    pmaddubsw   m4, m9
-    pmaddubsw   m1, m10
-%else
-    pmaddubsw   m2, [filteryq+16]
-    pmaddubsw   m4, [filteryq+32]
-    pmaddubsw   m1, [filteryq+48]
-%endif
-    paddw       m0, m2
-    paddw       m4, m1
-    paddsw      m0, m4
-    pmulhrsw    m0, m6
-%ifidn %1, avg
-    movh        m1, [dstq]
-%endif
-    packuswb    m0, m0
-%ifidn %1, avg
-    pavgb       m0, m1
-%endif
-    movh    [dstq], m0
-    add       dstq, dstrideq
-    dec         hd
-    jg .loop
-    RET
-%endmacro
-
-INIT_MMX ssse3
-filter_h_fn put
-filter_h_fn avg
-
-INIT_XMM ssse3
-filter_h_fn put
-filter_h_fn avg
-
-%macro filter_v_fn 1
-%assign %%px mmsize/2
-%if ARCH_X86_64
-cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, src, dstride, sstride, h, filtery, src4, sstride3
-%else
-cglobal %1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, src, dstride, sstride, filtery, src4, sstride3
-    mov   filteryq, r5mp
-%define hd r4mp
-%endif
-    sub       srcq, sstrideq
-    lea  sstride3q, [sstrideq*3]
-    sub       srcq, sstrideq
-    mova        m6, [pw_256]
-    sub       srcq, sstrideq
-    mova        m7, [filteryq+ 0]
-    lea      src4q, [srcq+sstrideq*4]
-%if ARCH_X86_64 && mmsize > 8
-    mova        m8, [filteryq+16]
-    mova        m9, [filteryq+32]
-    mova       m10, [filteryq+48]
-%endif
-.loop:
-    ; FIXME maybe reuse loads from previous rows, or just more generally
-    ; unroll this to prevent multiple loads of the same data?
-    movh        m0, [srcq]
-    movh        m1, [srcq+sstrideq]
-    movh        m2, [srcq+sstrideq*2]
-    movh        m3, [srcq+sstride3q]
-    movh        m4, [src4q]
-    movh        m5, [src4q+sstrideq]
-    punpcklbw   m0, m1
-    punpcklbw   m2, m3
-    movh        m1, [src4q+sstrideq*2]
-    movh        m3, [src4q+sstride3q]
-    add       srcq, sstrideq
-    add      src4q, sstrideq
-    punpcklbw   m4, m5
-    punpcklbw   m1, m3
-    pmaddubsw   m0, m7
-%if ARCH_X86_64 && mmsize > 8
-    pmaddubsw   m2, m8
-    pmaddubsw   m4, m9
-    pmaddubsw   m1, m10
-%else
-    pmaddubsw   m2, [filteryq+16]
-    pmaddubsw   m4, [filteryq+32]
-    pmaddubsw   m1, [filteryq+48]
-%endif
-    paddw       m0, m2
-    paddw       m4, m1
-    paddsw      m0, m4
-    pmulhrsw    m0, m6
-%ifidn %1, avg
-    movh        m1, [dstq]
-%endif
-    packuswb    m0, m0
-%ifidn %1, avg
-    pavgb       m0, m1
-%endif
-    movh    [dstq], m0
-    add       dstq, dstrideq
-    dec         hd
-    jg .loop
-    RET
-%endmacro
-
-INIT_MMX ssse3
-filter_v_fn put
-filter_v_fn avg
-
-INIT_XMM ssse3
-filter_v_fn put
-filter_v_fn avg
-
-%macro fpel_fn 6
-%if %2 == 4
-%define %%srcfn movh
-%define %%dstfn movh
-%else
-%define %%srcfn movu
-%define %%dstfn mova
-%endif
-
-%if %2 <= 16
-cglobal %1%2, 5, 7, 4, dst, src, dstride, sstride, h, dstride3, sstride3
-    lea  sstride3q, [sstrideq*3]
-    lea  dstride3q, [dstrideq*3]
-%else
-cglobal %1%2, 5, 5, 4, dst, src, dstride, sstride, h
-%endif
-.loop:
-    %%srcfn     m0, [srcq]
-    %%srcfn     m1, [srcq+s%3]
-    %%srcfn     m2, [srcq+s%4]
-    %%srcfn     m3, [srcq+s%5]
-    lea       srcq, [srcq+sstrideq*%6]
-%ifidn %1, avg
-    pavgb       m0, [dstq]
-    pavgb       m1, [dstq+d%3]
-    pavgb       m2, [dstq+d%4]
-    pavgb       m3, [dstq+d%5]
-%endif
-    %%dstfn [dstq], m0
-    %%dstfn [dstq+d%3], m1
-    %%dstfn [dstq+d%4], m2
-    %%dstfn [dstq+d%5], m3
-    lea       dstq, [dstq+dstrideq*%6]
-    sub         hd, %6
-    jnz .loop
-    RET
-%endmacro
-
-%define d16 16
-%define s16 16
-INIT_MMX mmx
-fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
-fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
-INIT_MMX sse
-fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4
-fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4
-INIT_XMM sse
-fpel_fn put, 16, strideq, strideq*2, stride3q, 4
-fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
-fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
-INIT_XMM sse2
-fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2
-fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1
-%undef s16
-%undef d16
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index ce58c08a3b..cd4af992d0 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -1,240 +1,394 @@
 /*
  * VP9 SIMD optimizations
  *
- * Copyright (c) 2013 Ronald S. Bultje <rsbultje@gmail.com>
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/vp9.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
 
 #if HAVE_YASM
 
-#define fpel_func(avg, sz, opt)                                         \
-void ff_ ## avg ## sz ## _ ## opt(uint8_t *dst, const uint8_t *src,     \
-                                  ptrdiff_t dst_stride,                 \
-                                  ptrdiff_t src_stride,                 \
-                                  int h, int mx, int my)
-
-fpel_func(put,  4, mmx);
-fpel_func(put,  8, mmx);
-fpel_func(put, 16, sse);
-fpel_func(put, 32, sse);
-fpel_func(put, 64, sse);
-fpel_func(avg,  4, sse);
-fpel_func(avg,  8, sse);
-fpel_func(avg, 16, sse2);
-fpel_func(avg, 32, sse2);
-fpel_func(avg, 64, sse2);
-#undef fpel_func
-
-#define mc_func(avg, sz, dir, opt)                                          \
-void                                                                        \
-ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst,         \
-                                                      const uint8_t *src,   \
-                                                      ptrdiff_t dst_stride, \
-                                                      ptrdiff_t src_stride, \
-                                                      int h,                \
-                                                      const int8_t (*filter)[16])
-
-#define mc_funcs(sz)            \
-    mc_func(put, sz, h, ssse3); \
-    mc_func(avg, sz, h, ssse3); \
-    mc_func(put, sz, v, ssse3); \
-    mc_func(avg, sz, v, ssse3)
-
-mc_funcs(4);
-mc_funcs(8);
-
-#undef mc_funcs
-#undef mc_func
-
-#define mc_rep_func(avg, sz, hsz, dir, opt)                                 \
-static av_always_inline void                                                \
-ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst,         \
-                                                      const uint8_t *src,   \
-                                                      ptrdiff_t dst_stride, \
-                                                      ptrdiff_t src_stride, \
-                                                      int h,                \
-                                                      const int8_t (*filter)[16]) \
-{                                                                           \
-    ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst, src,        \
-                                                           dst_stride,      \
-                                                           src_stride,      \
-                                                           h,               \
-                                                           filter);         \
-    ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst + hsz,       \
-                                                           src + hsz,       \
-                                                           dst_stride,      \
-                                                           src_stride,      \
-                                                           h, filter);      \
-}
-
-#define mc_rep_funcs(sz, hsz)            \
-    mc_rep_func(put, sz, hsz, h, ssse3); \
-    mc_rep_func(avg, sz, hsz, h, ssse3); \
-    mc_rep_func(put, sz, hsz, v, ssse3); \
-    mc_rep_func(avg, sz, hsz, v, ssse3)
-
-mc_rep_funcs(16, 8);
-mc_rep_funcs(32, 16);
-mc_rep_funcs(64, 32);
-
-#undef mc_rep_funcs
-#undef mc_rep_func
-
-extern const int8_t ff_filters_ssse3[3][15][4][16];
-
-#define filter_8tap_2d_fn(op, sz, f, fname)                             \
-static void                                                             \
-op ## _8tap_ ## fname ## _ ## sz ## hv_ssse3(uint8_t *dst,              \
-                                             const uint8_t *src,        \
-                                             ptrdiff_t dst_stride,      \
-                                             ptrdiff_t src_stride,      \
-                                             int h, int mx, int my)     \
-{                                                                       \
-    LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]);                         \
-    ff_put_8tap_1d_h_ ## sz ## _ssse3(temp, src - 3 * src_stride,       \
-                                      64, src_stride,                   \
-                                      h + 7,                            \
-                                      ff_filters_ssse3[f][mx - 1]);     \
-    ff_ ## op ## _8tap_1d_v_ ## sz ## _ssse3(dst, temp + 3 * 64,        \
-                                             dst_stride, 64,            \
-                                             h,                         \
-                                             ff_filters_ssse3[f][my - 1]); \
-}
-
-#define filters_8tap_2d_fn(op, sz)                          \
-    filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \
-    filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp)     \
-    filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth)
-
-#define filters_8tap_2d_fn2(op) \
-    filters_8tap_2d_fn(op, 64)  \
-    filters_8tap_2d_fn(op, 32)  \
-    filters_8tap_2d_fn(op, 16)  \
-    filters_8tap_2d_fn(op, 8)   \
-    filters_8tap_2d_fn(op, 4)
-
-filters_8tap_2d_fn2(put)
-filters_8tap_2d_fn2(avg)
-
-#undef filters_8tap_2d_fn2
-#undef filters_8tap_2d_fn
-#undef filter_8tap_2d_fn
-
-#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar)                  \
-static void                                                             \
-op ## _8tap_ ## fname ## _ ## sz ## dir ## _ssse3(uint8_t *dst,         \
-                                                  const uint8_t *src,   \
-                                                  ptrdiff_t dst_stride, \
-                                                  ptrdiff_t src_stride, \
-                                                  int h, int mx,        \
-                                                  int my)               \
-{                                                                       \
-    ff_ ## op ## _8tap_1d_ ## dir ## _ ## sz ## _ssse3(dst, src,        \
-                                                       dst_stride,      \
-                                                       src_stride, h,   \
-                                                       ff_filters_ssse3[f][dvar - 1]); \
-}
-
-#define filters_8tap_1d_fn(op, sz, dir, dvar)                          \
-    filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \
-    filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar)     \
-    filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar)
-
-#define filters_8tap_1d_fn2(op, sz)             \
-    filters_8tap_1d_fn(op, sz, h, mx)           \
-    filters_8tap_1d_fn(op, sz, v, my)
-
-#define filters_8tap_1d_fn3(op) \
-    filters_8tap_1d_fn2(op, 64) \
-    filters_8tap_1d_fn2(op, 32) \
-    filters_8tap_1d_fn2(op, 16) \
-    filters_8tap_1d_fn2(op,  8) \
-    filters_8tap_1d_fn2(op,  4)
-
-filters_8tap_1d_fn3(put)
-filters_8tap_1d_fn3(avg)
-
-#undef filters_8tap_1d_fn
-#undef filters_8tap_1d_fn2
-#undef filters_8tap_1d_fn3
-#undef filter_8tap_1d_fn
+decl_fpel_func(put,  4,   , mmx);
+decl_fpel_func(put,  8,   , mmx);
+decl_fpel_func(put, 16,   , sse);
+decl_fpel_func(put, 32,   , sse);
+decl_fpel_func(put, 64,   , sse);
+decl_fpel_func(avg,  4, _8, mmxext);
+decl_fpel_func(avg,  8, _8, mmxext);
+decl_fpel_func(avg, 16, _8, sse2);
+decl_fpel_func(avg, 32, _8, sse2);
+decl_fpel_func(avg, 64, _8, sse2);
+decl_fpel_func(put, 32,   , avx);
+decl_fpel_func(put, 64,   , avx);
+decl_fpel_func(avg, 32, _8, avx2);
+decl_fpel_func(avg, 64, _8, avx2);
+
+decl_mc_funcs(4, mmxext, int16_t, 8, 8);
+decl_mc_funcs(8, sse2, int16_t,  8, 8);
+decl_mc_funcs(4, ssse3, int8_t, 32, 8);
+decl_mc_funcs(8, ssse3, int8_t, 32, 8);
+#if ARCH_X86_64
+decl_mc_funcs(16, ssse3, int8_t, 32, 8);
+decl_mc_funcs(32, avx2, int8_t, 32, 8);
+#endif
+
+mc_rep_funcs(16,  8,  8,  sse2, int16_t,  8, 8);
+#if ARCH_X86_32
+mc_rep_funcs(16,  8,  8, ssse3, int8_t,  32, 8);
+#endif
+mc_rep_funcs(32, 16, 16, sse2,  int16_t,  8, 8);
+mc_rep_funcs(32, 16, 16, ssse3, int8_t,  32, 8);
+mc_rep_funcs(64, 32, 32, sse2,  int16_t,  8, 8);
+mc_rep_funcs(64, 32, 32, ssse3, int8_t,  32, 8);
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+mc_rep_funcs(64, 32, 32, avx2,  int8_t,  32, 8);
+#endif
+
+extern const int8_t ff_filters_ssse3[3][15][4][32];
+extern const int16_t ff_filters_sse2[3][15][8][8];
+
+filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3)
+filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3)
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3)
+#endif
+
+filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3)
+filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3)
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3)
+#endif
+
+#define itxfm_func(typea, typeb, size, opt) \
+void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                            int16_t *block, int eob)
+#define itxfm_funcs(size, opt) \
+itxfm_func(idct,  idct,  size, opt); \
+itxfm_func(iadst, idct,  size, opt); \
+itxfm_func(idct,  iadst, size, opt); \
+itxfm_func(iadst, iadst, size, opt)
+
+itxfm_func(idct,  idct,  4, mmxext);
+itxfm_func(idct,  iadst, 4, sse2);
+itxfm_func(iadst, idct,  4, sse2);
+itxfm_func(iadst, iadst, 4, sse2);
+itxfm_funcs(4, ssse3);
+itxfm_funcs(8, sse2);
+itxfm_funcs(8, ssse3);
+itxfm_funcs(8, avx);
+itxfm_funcs(16, sse2);
+itxfm_funcs(16, ssse3);
+itxfm_funcs(16, avx);
+itxfm_func(idct, idct, 32, sse2);
+itxfm_func(idct, idct, 32, ssse3);
+itxfm_func(idct, idct, 32, avx);
+itxfm_func(iwht, iwht, 4, mmx);
+
+#undef itxfm_func
+#undef itxfm_funcs
+
+#define lpf_funcs(size1, size2, opt) \
+void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                    int E, int I, int H); \
+void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                    int E, int I, int H)
+
+lpf_funcs(16, 16, sse2);
+lpf_funcs(16, 16, ssse3);
+lpf_funcs(16, 16, avx);
+lpf_funcs(44, 16, sse2);
+lpf_funcs(44, 16, ssse3);
+lpf_funcs(44, 16, avx);
+lpf_funcs(84, 16, sse2);
+lpf_funcs(84, 16, ssse3);
+lpf_funcs(84, 16, avx);
+lpf_funcs(48, 16, sse2);
+lpf_funcs(48, 16, ssse3);
+lpf_funcs(48, 16, avx);
+lpf_funcs(88, 16, sse2);
+lpf_funcs(88, 16, ssse3);
+lpf_funcs(88, 16, avx);
+
+#undef lpf_funcs
+
+#define ipred_func(size, type, opt) \
+void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                   const uint8_t *l, const uint8_t *a)
+
+ipred_func(8, v, mmx);
+
+#define ipred_dc_funcs(size, opt) \
+ipred_func(size, dc, opt); \
+ipred_func(size, dc_left, opt); \
+ipred_func(size, dc_top, opt)
+
+ipred_dc_funcs(4, mmxext);
+ipred_dc_funcs(8, mmxext);
+
+#define ipred_dir_tm_funcs(size, opt) \
+ipred_func(size, tm, opt); \
+ipred_func(size, dl, opt); \
+ipred_func(size, dr, opt); \
+ipred_func(size, hd, opt); \
+ipred_func(size, hu, opt); \
+ipred_func(size, vl, opt); \
+ipred_func(size, vr, opt)
+
+ipred_dir_tm_funcs(4, mmxext);
+
+ipred_func(16, v, sse);
+ipred_func(32, v, sse);
+
+ipred_dc_funcs(16, sse2);
+ipred_dc_funcs(32, sse2);
+
+#define ipred_dir_tm_h_funcs(size, opt) \
+ipred_dir_tm_funcs(size, opt); \
+ipred_func(size, h, opt)
+
+ipred_dir_tm_h_funcs(8, sse2);
+ipred_dir_tm_h_funcs(16, sse2);
+ipred_dir_tm_h_funcs(32, sse2);
+
+ipred_func(4, h, sse2);
+
+#define ipred_all_funcs(size, opt) \
+ipred_dc_funcs(size, opt); \
+ipred_dir_tm_h_funcs(size, opt)
+
+// FIXME hd/vl_4x4_ssse3 does not exist
+ipred_all_funcs(4, ssse3);
+ipred_all_funcs(8, ssse3);
+ipred_all_funcs(16, ssse3);
+ipred_all_funcs(32, ssse3);
+
+ipred_dir_tm_h_funcs(8, avx);
+ipred_dir_tm_h_funcs(16, avx);
+ipred_dir_tm_h_funcs(32, avx);
+
+ipred_func(32, v, avx);
+
+ipred_dc_funcs(32, avx2);
+ipred_func(32, h, avx2);
+ipred_func(32, tm, avx2);
+
+#undef ipred_func
+#undef ipred_dir_tm_h_funcs
+#undef ipred_dir_tm_funcs
+#undef ipred_dc_funcs
 
 #endif /* HAVE_YASM */
 
-av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
+av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 {
 #if HAVE_YASM
-    int cpu_flags = av_get_cpu_flags();
-
-#define init_fpel(idx1, idx2, sz, type, opt)                            \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] =                    \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] =                    \
-    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] =                    \
-    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_ ## type ## sz ## _ ## opt
-
-
-#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv]  = type ## _8tap_smooth_  ## sz ## dir ## _ ## opt; \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _ ## opt; \
-    dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv]   = type ## _8tap_sharp_   ## sz ## dir ## _ ## opt
-
-#define init_subpel2(idx, idxh, idxv, dir, type, opt)     \
-    init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \
-    init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt); \
-    init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \
-    init_subpel1(3, idx, idxh, idxv,  8, dir, type, opt); \
-    init_subpel1(4, idx, idxh, idxv,  4, dir, type, opt)
+    int cpu_flags;
+
+    if (bpp == 10) {
+        ff_vp9dsp_init_10bpp_x86(dsp);
+        return;
+    } else if (bpp == 12) {
+        ff_vp9dsp_init_12bpp_x86(dsp);
+        return;
+    }
 
-#define init_subpel3(idx, type, opt)        \
-    init_subpel2(idx, 1, 1, hv, type, opt); \
-    init_subpel2(idx, 0, 1,  v, type, opt); \
-    init_subpel2(idx, 1, 0,  h, type, opt)
+    cpu_flags = av_get_cpu_flags();
+
+#define init_lpf(opt) do { \
+    dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
+    dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
+    dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
+    dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \
+    dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
+    dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
+    dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
+    dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
+    dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
+    dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
+} while (0)
+
+#define init_ipred(sz, opt, t, e) \
+    dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt
+
+#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext
+#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext
+#define init_dir_tm_ipred(sz, opt) do { \
+    init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \
+    init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \
+    init_ipred(sz, opt, hd, HOR_DOWN); \
+    init_ipred(sz, opt, vl, VERT_LEFT); \
+    init_ipred(sz, opt, hu, HOR_UP); \
+    init_ipred(sz, opt, tm, TM_VP8); \
+    init_ipred(sz, opt, vr, VERT_RIGHT); \
+} while (0)
+#define init_dir_tm_h_ipred(sz, opt) do { \
+    init_dir_tm_ipred(sz, opt); \
+    init_ipred(sz, opt, h,  HOR); \
+} while (0)
+#define init_dc_ipred(sz, opt) do { \
+    init_ipred(sz, opt, dc,      DC); \
+    init_ipred(sz, opt, dc_left, LEFT_DC); \
+    init_ipred(sz, opt, dc_top,  TOP_DC); \
+} while (0)
+#define init_all_ipred(sz, opt) do { \
+    init_dc_ipred(sz, opt); \
+    init_dir_tm_h_ipred(sz, opt); \
+} while (0)
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        init_fpel(4, 0,  4, put, mmx);
-        init_fpel(3, 0,  8, put, mmx);
+        init_fpel_func(4, 0,  4, put, , mmx);
+        init_fpel_func(3, 0,  8, put, , mmx);
+        if (!bitexact) {
+            dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
+            dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
+            dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
+            dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
+        }
+        init_ipred(8, mmx, v, VERT);
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_subpel2(4, 0, 4, put, 8, mmxext);
+        init_subpel2(4, 1, 4, avg, 8, mmxext);
+        init_fpel_func(4, 1,  4, avg, _8, mmxext);
+        init_fpel_func(3, 1,  8, avg, _8, mmxext);
+        dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
+        init_dc_ipred(4, mmxext);
+        init_dc_ipred(8, mmxext);
+        init_dir_tm_ipred(4, mmxext);
     }
 
     if (EXTERNAL_SSE(cpu_flags)) {
-        init_fpel(2, 0, 16, put, sse);
-        init_fpel(1, 0, 32, put, sse);
-        init_fpel(0, 0, 64, put, sse);
-        init_fpel(4, 1,  4, avg, sse);
-        init_fpel(3, 1,  8, avg, sse);
+        init_fpel_func(2, 0, 16, put, , sse);
+        init_fpel_func(1, 0, 32, put, , sse);
+        init_fpel_func(0, 0, 64, put, , sse);
+        init_ipred(16, sse, v, VERT);
+        init_ipred(32, sse, v, VERT);
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
-        init_fpel(2, 1, 16, avg, sse2);
-        init_fpel(1, 1, 32, avg, sse2);
-        init_fpel(0, 1, 64, avg, sse2);
+        init_subpel3_8to64(0, put, 8, sse2);
+        init_subpel3_8to64(1, avg, 8, sse2);
+        init_fpel_func(2, 1, 16, avg,  _8, sse2);
+        init_fpel_func(1, 1, 32, avg,  _8, sse2);
+        init_fpel_func(0, 1, 64, avg,  _8, sse2);
+        init_lpf(sse2);
+        dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_sse2;
+        dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_sse2;
+        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2;
+        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2;
+        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_sse2;
+        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_sse2;
+        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2;
+        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_sse2;
+        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_sse2;
+        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_sse2;
+        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2;
+        dsp->itxfm_add[TX_32X32][ADST_ADST] =
+        dsp->itxfm_add[TX_32X32][ADST_DCT] =
+        dsp->itxfm_add[TX_32X32][DCT_ADST] =
+        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2;
+        init_dc_ipred(16, sse2);
+        init_dc_ipred(32, sse2);
+        init_dir_tm_h_ipred(8, sse2);
+        init_dir_tm_h_ipred(16, sse2);
+        init_dir_tm_h_ipred(32, sse2);
+        init_ipred(4, sse2, h, HOR);
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
-        init_subpel3(0, put, ssse3);
-        init_subpel3(1, avg, ssse3);
+        init_subpel3(0, put, 8, ssse3);
+        init_subpel3(1, avg, 8, ssse3);
+        dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
+        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
+        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_ssse3;
+        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_ssse3;
+        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3;
+        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_ssse3;
+        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_ssse3;
+        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_ssse3;
+        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
+        dsp->itxfm_add[TX_32X32][ADST_ADST] =
+        dsp->itxfm_add[TX_32X32][ADST_DCT] =
+        dsp->itxfm_add[TX_32X32][DCT_ADST] =
+        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
+        init_lpf(ssse3);
+        init_all_ipred(4, ssse3);
+        init_all_ipred(8, ssse3);
+        init_all_ipred(16, ssse3);
+        init_all_ipred(32, ssse3);
+    }
+
+    if (EXTERNAL_AVX(cpu_flags)) {
+        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
+        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_avx;
+        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_avx;
+        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
+        dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
+        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_avx;
+        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_avx;
+        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
+        dsp->itxfm_add[TX_32X32][ADST_ADST] =
+        dsp->itxfm_add[TX_32X32][ADST_DCT] =
+        dsp->itxfm_add[TX_32X32][DCT_ADST] =
+        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
+        init_lpf(avx);
+        init_dir_tm_h_ipred(8, avx);
+        init_dir_tm_h_ipred(16, avx);
+        init_dir_tm_h_ipred(32, avx);
+    }
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        init_fpel_func(1, 0, 32, put, , avx);
+        init_fpel_func(0, 0, 64, put, , avx);
+        init_ipred(32, avx, v, VERT);
+    }
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        init_fpel_func(1, 1, 32, avg, _8, avx2);
+        init_fpel_func(0, 1, 64, avg, _8, avx2);
+        if (ARCH_X86_64) {
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+            init_subpel3_32_64(0, put, 8, avx2);
+            init_subpel3_32_64(1, avg, 8, avx2);
+#endif
+        }
+        init_dc_ipred(32, avx2);
+        init_ipred(32, avx2, h,  HOR);
+        init_ipred(32, avx2, tm, TM_VP8);
     }
 
 #undef init_fpel
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
new file mode 100644
index 0000000000..5842282398
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -0,0 +1,176 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP9DSP_INIT_H
+#define AVCODEC_X86_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+#define decl_fpel_func(avg, sz, bpp, opt) \
+void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                   const uint8_t *src, ptrdiff_t src_stride, \
+                                   int h, int mx, int my)
+
+#define decl_mc_func(avg, sz, dir, opt, type, f_sz, bpp) \
+void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                         const uint8_t *src, ptrdiff_t src_stride, \
+                                                         int h, const type (*filter)[f_sz])
+
+#define decl_mc_funcs(sz, opt, type, fsz, bpp) \
+decl_mc_func(put, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
+
+#define decl_ipred_fn(type, sz, bpp, opt) \
+void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
+                                                       ptrdiff_t stride, \
+                                                       const uint8_t *l, \
+                                                       const uint8_t *a)
+
+#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
+decl_ipred_fn(type,  4, bpp, opt4); \
+decl_ipred_fn(type,  8, bpp, opt8_16_32); \
+decl_ipred_fn(type, 16, bpp, opt8_16_32); \
+decl_ipred_fn(type, 32, bpp, opt8_16_32)
+
+#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
+static av_always_inline void \
+ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                    const uint8_t *src, ptrdiff_t src_stride, \
+                                                    int h, const type (*filter)[f_sz]) \
+{ \
+    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst,        dst_stride, src, \
+                                                         src_stride, h, filter); \
+    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst + hszb, dst_stride, src + hszb, \
+                                                         src_stride, h, filter); \
+}
+
+#define mc_rep_funcs(sz, hsz, hszb, opt, type, fsz, bpp) \
+mc_rep_func(put, sz, hsz, hszb, h, opt, type, fsz, bpp); \
+mc_rep_func(avg, sz, hsz, hszb, h, opt, type, fsz, bpp); \
+mc_rep_func(put, sz, hsz, hszb, v, opt, type, fsz, bpp); \
+mc_rep_func(avg, sz, hsz, hszb, v, opt, type, fsz, bpp)
+
+#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, bpp, opt) \
+static void op##_8tap_##fname##_##sz##dir##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                          const uint8_t *src, ptrdiff_t src_stride, \
+                                                          int h, int mx, int my) \
+{ \
+    ff_vp9_##op##_8tap_1d_##dir##_##sz##_##bpp##_##opt(dst, dst_stride, src, src_stride, \
+                                                       h, ff_filters_##f_opt[f][dvar - 1]); \
+}
+
+#define filters_8tap_1d_fn(op, sz, dir, dvar, bpp, opt, f_opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP,   f_opt, sharp,   dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH,  f_opt, smooth,  dir, dvar, bpp, opt)
+
+#define filters_8tap_1d_fn2(op, sz, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt)
+
+#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 8, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 4, bpp, opt4, f_opt)
+
+#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \
+static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                       const uint8_t *src, ptrdiff_t src_stride, \
+                                                       int h, int mx, int my) \
+{ \
+    LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64 * bytes]); \
+    ff_vp9_put_8tap_1d_h_##sz##_##bpp##_##opt(temp, 64 * bytes, src - 3 * src_stride, \
+                                              src_stride,  h + 7, \
+                                              ff_filters_##f_opt[f][mx - 1]); \
+    ff_vp9_##op##_8tap_1d_v_##sz##_##bpp##_##opt(dst, dst_stride, temp + 3 * bytes * 64, \
+                                                 64 * bytes, h, \
+                                                 ff_filters_##f_opt[f][my - 1]); \
+}
+
+#define filters_8tap_2d_fn(op, sz, align, bpp, bytes, opt, f_opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP,   f_opt, sharp, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH,  f_opt, smooth, align, bpp, bytes, opt)
+
+#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \
+filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 8, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
+
+#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, bpp, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
+        type##_8tap_smooth_##sz##dir##_##bpp##_##opt; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
+        type##_8tap_regular_##sz##dir##_##bpp##_##opt; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = \
+        type##_8tap_sharp_##sz##dir##_##bpp##_##opt
+
+#define init_subpel2(idx1, idx2, sz, type, bpp, opt) \
+    init_subpel1(idx1, idx2, 1, 1, sz, hv, type, bpp, opt); \
+    init_subpel1(idx1, idx2, 0, 1, sz, v,  type, bpp, opt); \
+    init_subpel1(idx1, idx2, 1, 0, sz, h,  type, bpp, opt)
+
+#define init_subpel3_32_64(idx, type, bpp, opt) \
+    init_subpel2(0, idx, 64, type, bpp, opt); \
+    init_subpel2(1, idx, 32, type, bpp, opt)
+
+#define init_subpel3_8to64(idx, type, bpp, opt) \
+    init_subpel3_32_64(idx, type, bpp, opt); \
+    init_subpel2(2, idx, 16, type, bpp, opt); \
+    init_subpel2(3, idx,  8, type, bpp, opt)
+
+#define init_subpel3(idx, type, bpp, opt) \
+    init_subpel3_8to64(idx, type, bpp, opt); \
+    init_subpel2(4, idx,  4, type, bpp, opt)
+
+#define cat(a, bpp, b) a##bpp##b
+
+#define init_ipred_func(type, enum, sz, bpp, opt) \
+    dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
+        cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
+
+#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
+    init_ipred_func(type, enum,  8, bpp, opt); \
+    init_ipred_func(type, enum, 16, bpp, opt); \
+    init_ipred_func(type, enum, 32, bpp, opt)
+
+#define init_ipred_funcs(type, enum, bpp, opt) \
+    init_ipred_func(type, enum,  4, bpp, opt); \
+    init_8_16_32_ipred_funcs(type, enum, bpp, opt)
+
+void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
+void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_X86_VP9DSP_INIT_H */
diff --git a/libavcodec/x86/vp9dsp_init_10bpp.c b/libavcodec/x86/vp9dsp_init_10bpp.c
new file mode 100644
index 0000000000..2694c06cb2
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_10bpp.c
@@ -0,0 +1,25 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPC 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_x86
+#include "vp9dsp_init_16bpp_template.c"
diff --git a/libavcodec/x86/vp9dsp_init_12bpp.c b/libavcodec/x86/vp9dsp_init_12bpp.c
new file mode 100644
index 0000000000..5da3bc1840
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_12bpp.c
@@ -0,0 +1,25 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPC 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_x86
+#include "vp9dsp_init_16bpp_template.c"
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
new file mode 100644
index 0000000000..4ceb4d4b49
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -0,0 +1,139 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_YASM
+
+decl_fpel_func(put,   8,    , mmx);
+decl_fpel_func(avg,   8, _16, mmxext);
+decl_fpel_func(put,  16,    , sse);
+decl_fpel_func(put,  32,    , sse);
+decl_fpel_func(put,  64,    , sse);
+decl_fpel_func(put, 128,    , sse);
+decl_fpel_func(avg,  16, _16, sse2);
+decl_fpel_func(avg,  32, _16, sse2);
+decl_fpel_func(avg,  64, _16, sse2);
+decl_fpel_func(avg, 128, _16, sse2);
+decl_fpel_func(put,  32,    , avx);
+decl_fpel_func(put,  64,    , avx);
+decl_fpel_func(put, 128,    , avx);
+decl_fpel_func(avg,  32, _16, avx2);
+decl_fpel_func(avg,  64, _16, avx2);
+decl_fpel_func(avg, 128, _16, avx2);
+
+decl_ipred_fns(v,       16, mmx,    sse);
+decl_ipred_fns(h,       16, mmxext, sse2);
+decl_ipred_fns(dc,      16, mmxext, sse2);
+decl_ipred_fns(dc_top,  16, mmxext, sse2);
+decl_ipred_fns(dc_left, 16, mmxext, sse2);
+
+#define decl_ipred_dir_funcs(type) \
+decl_ipred_fns(type, 16, sse2,  sse2); \
+decl_ipred_fns(type, 16, ssse3, ssse3); \
+decl_ipred_fns(type, 16, avx,   avx)
+
+decl_ipred_dir_funcs(dl);
+decl_ipred_dir_funcs(dr);
+decl_ipred_dir_funcs(vl);
+decl_ipred_dir_funcs(vr);
+decl_ipred_dir_funcs(hu);
+decl_ipred_dir_funcs(hd);
+#endif /* HAVE_YASM */
+
+av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        init_fpel_func(4, 0,   8, put, , mmx);
+        init_ipred_func(v, VERT, 4, 16, mmx);
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_fpel_func(4, 1,   8, avg, _16, mmxext);
+        init_ipred_func(h, HOR, 4, 16, mmxext);
+        init_ipred_func(dc, DC, 4, 16, mmxext);
+        init_ipred_func(dc_top,  TOP_DC,  4, 16, mmxext);
+        init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext);
+    }
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        init_fpel_func(3, 0,  16, put, , sse);
+        init_fpel_func(2, 0,  32, put, , sse);
+        init_fpel_func(1, 0,  64, put, , sse);
+        init_fpel_func(0, 0, 128, put, , sse);
+        init_8_16_32_ipred_funcs(v, VERT, 16, sse);
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        init_fpel_func(3, 1,  16, avg, _16, sse2);
+        init_fpel_func(2, 1,  32, avg, _16, sse2);
+        init_fpel_func(1, 1,  64, avg, _16, sse2);
+        init_fpel_func(0, 1, 128, avg, _16, sse2);
+        init_8_16_32_ipred_funcs(h, HOR, 16, sse2);
+        init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
+        init_8_16_32_ipred_funcs(dc_top,  TOP_DC,  16, sse2);
+        init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
+        init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2);
+        init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2);
+        init_ipred_funcs(vl, VERT_LEFT, 16, sse2);
+        init_ipred_funcs(vr, VERT_RIGHT, 16, sse2);
+        init_ipred_funcs(hu, HOR_UP, 16, sse2);
+        init_ipred_funcs(hd, HOR_DOWN, 16, sse2);
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3);
+        init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3);
+        init_ipred_funcs(vl, VERT_LEFT, 16, ssse3);
+        init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3);
+        init_ipred_funcs(hu, HOR_UP, 16, ssse3);
+        init_ipred_funcs(hd, HOR_DOWN, 16, ssse3);
+    }
+
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        init_fpel_func(2, 0,  32, put, , avx);
+        init_fpel_func(1, 0,  64, put, , avx);
+        init_fpel_func(0, 0, 128, put, , avx);
+        init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx);
+        init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx);
+        init_ipred_funcs(vl, VERT_LEFT, 16, avx);
+        init_ipred_funcs(vr, VERT_RIGHT, 16, avx);
+        init_ipred_funcs(hu, HOR_UP, 16, avx);
+        init_ipred_funcs(hd, HOR_DOWN, 16, avx);
+    }
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        init_fpel_func(2, 1,  32, avg, _16, avx2);
+        init_fpel_func(1, 1,  64, avg, _16, avx2);
+        init_fpel_func(0, 1, 128, avg, _16, avx2);
+    }
+
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
new file mode 100644
index 0000000000..f486caf1a1
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -0,0 +1,189 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_YASM
+
+extern const int16_t ff_filters_16bpp[3][15][4][16];
+
+decl_mc_funcs(4, sse2, int16_t, 16, BPC);
+decl_mc_funcs(8, sse2, int16_t, 16, BPC);
+decl_mc_funcs(16, avx2, int16_t, 16, BPC);
+
+mc_rep_funcs(16,  8, 16, sse2, int16_t, 16, BPC);
+mc_rep_funcs(32, 16, 32, sse2, int16_t, 16, BPC);
+mc_rep_funcs(64, 32, 64, sse2, int16_t, 16, BPC);
+#if HAVE_AVX2_EXTERNAL
+mc_rep_funcs(32, 16, 32, avx2, int16_t, 16, BPC);
+mc_rep_funcs(64, 32, 64, avx2, int16_t, 16, BPC);
+#endif
+
+filters_8tap_2d_fn2(put, 16, BPC, 2, sse2, sse2, 16bpp)
+filters_8tap_2d_fn2(avg, 16, BPC, 2, sse2, sse2, 16bpp)
+#if HAVE_AVX2_EXTERNAL
+filters_8tap_2d_fn(put, 64, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 64, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(put, 32, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 32, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(put, 16, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 16, 32, BPC, 2, avx2, 16bpp)
+#endif
+
+filters_8tap_1d_fn3(put, BPC, sse2, sse2, 16bpp)
+filters_8tap_1d_fn3(avg, BPC, sse2, sse2, 16bpp)
+#if HAVE_AVX2_EXTERNAL
+filters_8tap_1d_fn2(put, 64, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 64, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(put, 32, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 32, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp)
+#endif
+
+#define decl_lpf_func(dir, wd, bpp, opt) \
+void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                     int E, int I, int H)
+
+#define decl_lpf_funcs(dir, wd, bpp) \
+decl_lpf_func(dir, wd, bpp, sse2); \
+decl_lpf_func(dir, wd, bpp, ssse3); \
+decl_lpf_func(dir, wd, bpp, avx)
+
+#define decl_lpf_funcs_wd(dir) \
+decl_lpf_funcs(dir,  4, BPC); \
+decl_lpf_funcs(dir,  8, BPC); \
+decl_lpf_funcs(dir, 16, BPC)
+
+decl_lpf_funcs_wd(h);
+decl_lpf_funcs_wd(v);
+
+#define lpf_16_wrapper(dir, off, bpp, opt) \
+static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                 int E, int I, int H) \
+{ \
+    ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst,       stride, E, I, H); \
+    ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \
+}
+
+#define lpf_16_wrappers(bpp, opt) \
+lpf_16_wrapper(h, 8 * stride, bpp, opt); \
+lpf_16_wrapper(v, 16,         bpp, opt)
+
+lpf_16_wrappers(BPC, sse2);
+lpf_16_wrappers(BPC, ssse3);
+lpf_16_wrappers(BPC, avx);
+
+#define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) \
+static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                           int E, int I, int H) \
+{ \
+    ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst,       stride, \
+                                                     E & 0xff, I & 0xff, H & 0xff); \
+    ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \
+                                                     E >> 8,   I >> 8,   H >> 8); \
+}
+
+#define lpf_mix2_wrappers(wd1, wd2, bpp, opt) \
+lpf_mix2_wrapper(h, 8 * stride, wd1, wd2, bpp, opt); \
+lpf_mix2_wrapper(v, 16,         wd1, wd2, bpp, opt)
+
+#define lpf_mix2_wrappers_set(bpp, opt) \
+lpf_mix2_wrappers(4, 4, bpp, opt); \
+lpf_mix2_wrappers(4, 8, bpp, opt); \
+lpf_mix2_wrappers(8, 4, bpp, opt); \
+lpf_mix2_wrappers(8, 8, bpp, opt); \
+
+lpf_mix2_wrappers_set(BPC, sse2);
+lpf_mix2_wrappers_set(BPC, ssse3);
+lpf_mix2_wrappers_set(BPC, avx);
+
+decl_ipred_fns(tm, BPC, mmxext, sse2);
+#endif /* HAVE_YASM */
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \
+    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt
+#define init_lpf_16_func(idx, dir, bpp, opt) \
+    dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt
+#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \
+    dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt
+
+#define init_lpf_funcs(bpp, opt) \
+    init_lpf_8_func(0, 0, h,  4, bpp, opt); \
+    init_lpf_8_func(0, 1, v,  4, bpp, opt); \
+    init_lpf_8_func(1, 0, h,  8, bpp, opt); \
+    init_lpf_8_func(1, 1, v,  8, bpp, opt); \
+    init_lpf_8_func(2, 0, h, 16, bpp, opt); \
+    init_lpf_8_func(2, 1, v, 16, bpp, opt); \
+    init_lpf_16_func(0, h, bpp, opt); \
+    init_lpf_16_func(1, v, bpp, opt); \
+    init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \
+    init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \
+    init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \
+    init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \
+    init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \
+    init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \
+    init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
+    init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        init_subpel3(0, put, BPC, sse2);
+        init_subpel3(1, avg, BPC, sse2);
+        init_lpf_funcs(BPC, sse2);
+        init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        init_lpf_funcs(BPC, ssse3);
+    }
+
+    if (EXTERNAL_AVX(cpu_flags)) {
+        init_lpf_funcs(BPC, avx);
+    }
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+#if HAVE_AVX2_EXTERNAL
+        init_subpel3_32_64(0,  put, BPC, avx2);
+        init_subpel3_32_64(1,  avg, BPC, avx2);
+        init_subpel2(2, 0, 16, put, BPC, avx2);
+        init_subpel2(2, 1, 16, avg, BPC, avx2);
+#endif
+    }
+
+#endif /* HAVE_YASM */
+
+    ff_vp9dsp_init_16bpp_x86(dsp);
+}
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
new file mode 100644
index 0000000000..31f7d449fd
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -0,0 +1,2044 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* Parts based on:
+;* H.264 intra prediction asm optimizations
+;* Copyright (c) 2010 Fiona Glaser
+;* Copyright (c) 2010 Holger Lubitz
+;* Copyright (c) 2010 Loren Merritt
+;* Copyright (c) 2010 Ronald S. Bultje
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m256: times 16 dw -256
+pw_m255: times 16 dw -255
+pw_4096: times 8 dw 4096
+
+pb_4x3_4x2_4x1_4x0: times 4 db 3
+                    times 4 db 2
+                    times 4 db 1
+                    times 4 db 0
+pb_8x1_8x0:   times 8 db 1
+              times 8 db 0
+pb_8x3_8x2:   times 8 db 3
+              times 8 db 2
+pb_0to5_2x7:  db 0, 1, 2, 3, 4, 5, 7, 7
+              times 8 db -1
+pb_0to6_9x7:  db 0, 1, 2, 3, 4, 5, 6
+              times 9 db 7
+pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
+              times 10 db 7
+pb_2to6_3x7:
+pb_2to6_11x7: db 2, 3, 4, 5, 6
+              times 11 db 7
+pb_1toE_2xF:  db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+pb_2toE_3xF:  db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+pb_13456_3xm1: db 1, 3, 4, 5, 6
+               times 3 db -1
+pb_6012_4xm1: db 6, 0, 1, 2
+              times 4 db -1
+pb_6xm1_246_8toE: times 6 db -1
+                  db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
+pb_6xm1_BDF_0to6: times 6 db -1
+                  db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
+pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+
+pb_15x0_1xm1: times 15 db 0
+              db -1
+pb_0to2_5x3: db 0, 1, 2
+             times 5 db 3
+pb_6xm1_2x0: times 6 db -1
+             times 2 db 0
+pb_6x0_2xm1: times 6 db 0
+             times 2 db -1
+
+cextern pb_1
+cextern pb_2
+cextern pb_3
+cextern pb_15
+cextern pw_2
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_255
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_8192
+
+SECTION .text
+
+; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+%macro DC_4to8_FUNCS 0
+cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [lq]
+    punpckldq               m0, [aq]
+    pxor                    m1, m1
+    psadbw                  m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_4096]
+    pshufb                  m0, m1
+%else
+    paddw                   m0, [pw_4]
+    psraw                   m0, 3
+    punpcklbw               m0, m0
+    pshufw                  m0, m0, q0000
+%endif
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    lea                   dstq, [dstq+strideq*2]
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    RET
+
+cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [lq]
+    movq                    m1, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_2048]
+    pshufb                  m0, m2
+%else
+    paddw                   m0, [pw_8]
+    psraw                   m0, 4
+    punpcklbw               m0, m0
+    pshufw                  m0, m0, q0000
+%endif
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+DC_4to8_FUNCS
+INIT_MMX ssse3
+DC_4to8_FUNCS
+
+%macro DC_16to32_FUNCS 0
+cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    movhlps                 m1, m0
+    paddw                   m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_1024]
+    pshufb                  m0, m2
+%else
+    paddw                   m0, [pw_16]
+    psraw                   m0, 5
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+%endif
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [lq+16]
+    mova                    m2, [aq]
+    mova                    m3, [aq+16]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m4, m4
+    psadbw                  m0, m4
+    psadbw                  m1, m4
+    psadbw                  m2, m4
+    psadbw                  m3, m4
+    paddw                   m0, m1
+    paddw                   m2, m3
+    paddw                   m0, m2
+    movhlps                 m1, m0
+    paddw                   m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_512]
+    pshufb                  m0, m4
+%else
+    paddw                   m0, [pw_32]
+    psraw                   m0, 6
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+%endif
+    mov                   cntd, 8
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DC_16to32_FUNCS
+INIT_XMM ssse3
+DC_16to32_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    vextracti128           xm1, m0, 1
+    paddw                  xm0, xm1
+    movhlps                xm1, xm0
+    paddw                  xm0, xm1
+    pmulhrsw               xm0, [pw_512]
+    vpbroadcastb            m0, xm0
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endif
+
+; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
+cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [%2q]
+    pxor                    m1, m1
+    psadbw                  m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_8192]
+    pshufb                  m0, m1
+%else
+    paddw                   m0, [pw_2]
+    psraw                   m0, 2
+    punpcklbw               m0, m0
+    pshufw                  m0, m0, q0000
+%endif
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    lea                   dstq, [dstq+strideq*2]
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    RET
+
+cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [%2q]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pxor                    m1, m1
+    psadbw                  m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_4096]
+    pshufb                  m0, m1
+%else
+    paddw                   m0, [pw_4]
+    psraw                   m0, 3
+    punpcklbw               m0, m0
+    pshufw                  m0, m0, q0000
+%endif
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+DC_1D_4to8_FUNCS top,  a
+DC_1D_4to8_FUNCS left, l
+INIT_MMX ssse3
+DC_1D_4to8_FUNCS top,  a
+DC_1D_4to8_FUNCS left, l
+
+%macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l)
+cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    movhlps                 m1, m0
+    paddw                   m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_2048]
+    pshufb                  m0, m2
+%else
+    paddw                   m0, [pw_8]
+    psraw                   m0, 4
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+%endif
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q]
+    mova                    m1, [%2q+16]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    movhlps                 m1, m0
+    paddw                   m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_1024]
+    pshufb                  m0, m2
+%else
+    paddw                   m0, [pw_16]
+    psraw                   m0, 5
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+%endif
+    mov                   cntd, 8
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DC_1D_16to32_FUNCS top,  a
+DC_1D_16to32_FUNCS left, l
+INIT_XMM ssse3
+DC_1D_16to32_FUNCS top,  a
+DC_1D_16to32_FUNCS left, l
+
+%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
+%if HAVE_AVX2_EXTERNAL
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    vextracti128           xm1, m0, 1
+    paddw                  xm0, xm1
+    movhlps                xm1, xm0
+    paddw                  xm0, xm1
+    pmulhrsw               xm0, [pw_1024]
+    vpbroadcastb            m0, xm0
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endif
+%endmacro
+
+INIT_YMM avx2
+DC_1D_AVX2_FUNCS top,  a
+DC_1D_AVX2_FUNCS left, l
+
+; v
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [aq]
+    mova                    m1, [aq+16]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 8
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m1
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m1
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_YMM avx
+cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+; h
+
+%macro H_XMM_FUNCS 2
+%if notcpuflag(avx)
+cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
+    movd                    m0, [lq]
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_4x3_4x2_4x1_4x0]
+%else
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0123
+    punpcklwd               m0, m0
+%endif
+    lea               stride3q, [strideq*3]
+    movd      [dstq+strideq*0], m0
+    psrldq                  m0, 4
+    movd      [dstq+strideq*1], m0
+    psrldq                  m0, 4
+    movd      [dstq+strideq*2], m0
+    psrldq                  m0, 4
+    movd      [dstq+stride3q ], m0
+    RET
+%endif
+
+cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+    mova                    m2, [pb_8x1_8x0]
+    mova                    m3, [pb_8x3_8x2]
+%endif
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 1
+.loop:
+    movd                    m0, [lq+cntq*4]
+%if cpuflag(ssse3)
+    pshufb                  m1, m0, m3
+    pshufb                  m0, m2
+%else
+    punpcklbw               m0, m0
+    punpcklwd               m0, m0
+    pshufd                  m1, m0, q2233
+    pshufd                  m0, m0, q0011
+%endif
+    movq      [dstq+strideq*0], m1
+    movhps    [dstq+strideq*1], m1
+    movq      [dstq+strideq*2], m0
+    movhps    [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop
+    RET
+
+cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+    mova                    m5, [pb_1]
+    mova                    m6, [pb_2]
+    mova                    m7, [pb_3]
+    pxor                    m4, m4
+%endif
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 3
+.loop:
+    movd                    m3, [lq+cntq*4]
+%if cpuflag(ssse3)
+    pshufb                  m0, m3, m7
+    pshufb                  m1, m3, m6
+%else
+    punpcklbw               m3, m3
+    punpcklwd               m3, m3
+    pshufd                  m0, m3, q3333
+    pshufd                  m1, m3, q2222
+%endif
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+%if cpuflag(ssse3)
+    pshufb                  m2, m3, m5
+    pshufb                  m3, m4
+%else
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+%endif
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop
+    RET
+
+cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt
+%if cpuflag(ssse3)
+    mova                    m5, [pb_1]
+    mova                    m6, [pb_2]
+    mova                    m7, [pb_3]
+    pxor                    m4, m4
+%endif
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 7
+.loop:
+    movd                    m3, [lq+cntq*4]
+%if cpuflag(ssse3)
+    pshufb                  m0, m3, m7
+    pshufb                  m1, m3, m6
+%else
+    punpcklbw               m3, m3
+    punpcklwd               m3, m3
+    pshufd                  m0, m3, q3333
+    pshufd                  m1, m3, q2222
+%endif
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m1
+    mova   [dstq+strideq*1+16], m1
+%if cpuflag(ssse3)
+    pshufb                  m2, m3, m5
+    pshufb                  m3, m4
+%else
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+%endif
+    mova   [dstq+strideq*2+ 0], m2
+    mova   [dstq+strideq*2+16], m2
+    mova   [dstq+stride3q + 0], m3
+    mova   [dstq+stride3q +16], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+H_XMM_FUNCS 2, 4
+INIT_XMM ssse3
+H_XMM_FUNCS 4, 8
+INIT_XMM avx
+H_XMM_FUNCS 4, 8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
+    mova                    m5, [pb_1]
+    mova                    m6, [pb_2]
+    mova                    m7, [pb_3]
+    pxor                    m4, m4
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 7
+.loop:
+    movd                   xm3, [lq+cntq*4]
+    vinserti128             m3, m3, xm3, 1
+    pshufb                  m0, m3, m7
+    pshufb                  m1, m3, m6
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    pshufb                  m2, m3, m5
+    pshufb                  m3, m4
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop
+    RET
+%endif
+
+; tm
+
+%macro TM_MMX_FUNCS 0
+cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
+    pxor                    m1, m1
+    movd                    m0, [aq]
+    pinsrw                  m2, [aq-1], 0
+    punpcklbw               m0, m1
+    DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+    mova                    m3, [pw_m256]
+    mova                    m1, [pw_m255]
+    pshufb                  m2, m3
+%else
+    punpcklbw               m2, m1
+    pshufw                  m2, m2, q0000
+%endif
+    psubw                   m0, m2
+    mov                   cntq, 1
+.loop:
+    pinsrw                  m2, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+    pshufb                  m4, m2, m1
+    pshufb                  m2, m3
+%else
+    punpcklbw               m2, m1
+    pshufw                  m4, m2, q1111
+    pshufw                  m2, m2, q0000
+%endif
+    paddw                   m4, m0
+    paddw                   m2, m0
+    packuswb                m4, m4
+    packuswb                m2, m2
+    movd      [dstq+strideq*0], m4
+    movd      [dstq+strideq*1], m2
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+%endmacro
+
+INIT_MMX mmxext
+TM_MMX_FUNCS
+INIT_MMX ssse3
+TM_MMX_FUNCS
+
+%macro TM_XMM_FUNCS 0
+cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
+    pxor                    m1, m1
+    movh                    m0, [aq]
+    pinsrw                  m2, [aq-1], 0
+    punpcklbw               m0, m1
+    DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+    mova                    m3, [pw_m256]
+    mova                    m1, [pw_m255]
+    pshufb                  m2, m3
+%else
+    punpcklbw               m2, m1
+    punpcklwd               m2, m2
+    pshufd                  m2, m2, q0000
+%endif
+    psubw                   m0, m2
+    mov                   cntq, 3
+.loop:
+    pinsrw                  m2, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+    pshufb                  m4, m2, m1
+    pshufb                  m2, m3
+%else
+    punpcklbw               m2, m1
+    punpcklwd               m2, m2
+    pshufd                  m4, m2, q1111
+    pshufd                  m2, m2, q0000
+%endif
+    paddw                   m4, m0
+    paddw                   m2, m0
+    packuswb                m4, m2
+    movh      [dstq+strideq*0], m4
+    movhps    [dstq+strideq*1], m4
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+
+cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
+    pxor                    m3, m3
+    mova                    m0, [aq]
+    pinsrw                  m2, [aq-1], 0
+    punpckhbw               m1, m0, m3
+    punpcklbw               m0, m3
+    DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+    mova                    m4, [pw_m256]
+    mova                    m3, [pw_m255]
+    pshufb                  m2, m4
+%else
+    punpcklbw               m2, m3
+    punpcklwd               m2, m2
+    pshufd                  m2, m2, q0000
+%endif
+    psubw                   m1, m2
+    psubw                   m0, m2
+    mov                   cntq, 7
+.loop:
+    pinsrw                  m7, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+    pshufb                  m5, m7, m3
+    pshufb                  m7, m4
+%else
+    punpcklbw               m7, m3
+    punpcklwd               m7, m7
+    pshufd                  m5, m7, q1111
+    pshufd                  m7, m7, q0000
+%endif
+    paddw                   m2, m5, m0
+    paddw                   m5, m1
+    paddw                   m6, m7, m0
+    paddw                   m7, m1
+    packuswb                m2, m5
+    packuswb                m6, m7
+    mova      [dstq+strideq*0], m2
+    mova      [dstq+strideq*1], m6
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+
+%if ARCH_X86_64
+%define mem 0
+%else
+%define mem 64
+%endif
+cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a
+    pxor                    m5, m5
+    pinsrw                  m4, [aq-1], 0
+    mova                    m0, [aq]
+    mova                    m2, [aq+16]
+    DEFINE_ARGS dst, stride, l, cnt
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+    mova                   m12, [pw_m256]
+    mova                   m13, [pw_m255]
+%define pw_m256_reg m12
+%define pw_m255_reg m13
+%else
+%define pw_m256_reg [pw_m256]
+%define pw_m255_reg [pw_m255]
+%endif
+    pshufb                  m4, pw_m256_reg
+%else
+    punpcklbw               m4, m5
+    punpcklwd               m4, m4
+    pshufd                  m4, m4, q0000
+%endif
+    punpckhbw               m1, m0,  m5
+    punpckhbw               m3, m2,  m5
+    punpcklbw               m0, m5
+    punpcklbw               m2, m5
+    psubw                   m1, m4
+    psubw                   m0, m4
+    psubw                   m3, m4
+    psubw                   m2, m4
+%if ARCH_X86_64
+    SWAP                     0, 8
+    SWAP                     1, 9
+    SWAP                     2, 10
+    SWAP                     3, 11
+%else
+    mova            [rsp+0*16], m0
+    mova            [rsp+1*16], m1
+    mova            [rsp+2*16], m2
+    mova            [rsp+3*16], m3
+%endif
+    mov                   cntq, 15
+.loop:
+    pinsrw                  m3, [lq+cntq*2], 0
+%if cpuflag(ssse3)
+    pshufb                  m7, m3, pw_m255_reg
+    pshufb                  m3, pw_m256_reg
+%else
+    pxor                    m7, m7
+    punpcklbw               m3, m7
+    punpcklwd               m3, m3
+    pshufd                  m7, m3, q1111
+    pshufd                  m3, m3, q0000
+%endif
+%if ARCH_X86_64
+    paddw                   m4, m7, m8
+    paddw                   m5, m7, m9
+    paddw                   m6, m7, m10
+    paddw                   m7, m11
+    paddw                   m0, m3, m8
+    paddw                   m1, m3, m9
+    paddw                   m2, m3, m10
+    paddw                   m3, m11
+%else
+    paddw                   m4, m7, [rsp+0*16]
+    paddw                   m5, m7, [rsp+1*16]
+    paddw                   m6, m7, [rsp+2*16]
+    paddw                   m7, [rsp+3*16]
+    paddw                   m0, m3, [rsp+0*16]
+    paddw                   m1, m3, [rsp+1*16]
+    paddw                   m2, m3, [rsp+2*16]
+    paddw                   m3, [rsp+3*16]
+%endif
+    packuswb                m4, m5
+    packuswb                m6, m7
+    packuswb                m0, m1
+    packuswb                m2, m3
+    mova   [dstq+strideq*0+ 0], m4
+    mova   [dstq+strideq*0+16], m6
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m2
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+%undef pw_m256_reg
+%undef pw_m255_reg
+%undef mem
+%endmacro
+
+INIT_XMM sse2
+TM_XMM_FUNCS
+INIT_XMM ssse3
+TM_XMM_FUNCS
+INIT_XMM avx
+TM_XMM_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
+    pxor                    m3, m3
+    pinsrw                 xm2, [aq-1], 0
+    vinserti128             m2, m2, xm2, 1
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, l, cnt
+    mova                    m4, [pw_m256]
+    mova                    m5, [pw_m255]
+    pshufb                  m2, m4
+    punpckhbw               m1, m0, m3
+    punpcklbw               m0, m3
+    psubw                   m1, m2
+    psubw                   m0, m2
+    mov                   cntq, 15
+.loop:
+    pinsrw                 xm7, [lq+cntq*2], 0
+    vinserti128             m7, m7, xm7, 1
+    pshufb                  m3, m7, m5
+    pshufb                  m7, m4
+    paddw                   m2, m3, m0
+    paddw                   m3, m1
+    paddw                   m6, m7, m0
+    paddw                   m7, m1
+    packuswb                m2, m3
+    packuswb                m6, m7
+    mova      [dstq+strideq*0], m2
+    mova      [dstq+strideq*1], m6
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+%endif
+
+; dl
+
+%macro LOWPASS 4 ; left [dst], center, right, tmp
+    pxor                   m%4, m%1, m%3
+    pand                   m%4, [pb_1]
+    pavgb                  m%1, m%3
+    psubusb                m%1, m%4
+    pavgb                  m%1, m%2
+%endmacro
+
+%macro DL_MMX_FUNCS 0
+cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
+    movq                    m1, [aq]
+%if cpuflag(ssse3)
+    pshufb                  m0, m1, [pb_0to5_2x7]
+    pshufb                  m2, m1, [pb_2to6_3x7]
+%else
+    punpckhbw               m3, m1, m1              ; 44556677
+    pand                    m0, m1, [pb_6xm1_2x0]   ; 012345__
+    pand                    m3, [pb_6x0_2xm1]       ; ______77
+    psrlq                   m2, m1, 16              ; 234567__
+    por                     m0, m3                  ; 01234577
+    por                     m2, m3                  ; 23456777
+%endif
+    psrlq                   m1, 8
+    LOWPASS                  0, 1, 2, 3
+
+    pshufw                  m1, m0, q3321
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*2], m1
+    psrlq                   m0, 8
+    psrlq                   m1, 8
+    add                   dstq, strideq
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*2], m1
+    RET
+%endmacro
+
+INIT_MMX mmxext
+DL_MMX_FUNCS
+INIT_MMX ssse3
+DL_MMX_FUNCS
+
+%macro DL_XMM_FUNCS 0
+cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
+    movq                    m0, [aq]
+    lea               stride5q, [strideq*5]
+%if cpuflag(ssse3)
+    pshufb                  m1, m0, [pb_1to6_10x7]
+%else
+    punpcklbw               m1, m0, m0              ; 0011223344556677
+    punpckhwd               m1, m1                  ; 4x4,4x5,4x6,4x7
+%endif
+    shufps                  m0, m1, q3310
+%if notcpuflag(ssse3)
+    psrldq                  m1, m0, 1
+    shufps                  m1, m0, q3210
+%endif
+    psrldq                  m2, m1, 1
+    LOWPASS                  0, 1, 2, 3
+
+    pshufd                  m1, m0, q3321
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*4], m1
+    psrldq                  m0, 1
+    psrldq                  m1, 1
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+stride5q ], m1
+    lea                   dstq, [dstq+strideq*2]
+    psrldq                  m0, 1
+    psrldq                  m1, 1
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*4], m1
+    psrldq                  m0, 1
+    psrldq                  m1, 1
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+stride5q ], m1
+    RET
+
+cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
+    mova                    m0, [aq]
+%if cpuflag(ssse3)
+    mova                    m5, [pb_1toE_2xF]
+    pshufb                  m1, m0, m5
+    pshufb                  m2, m1, m5
+    pshufb                  m4, m0, [pb_15]
+%else
+    pand                    m5, m0, [pb_15x0_1xm1]      ; _______________F
+    psrldq                  m1, m0, 1                   ; 123456789ABCDEF_
+    por                     m1, m5                      ; 123456789ABCDEFF
+    psrldq                  m2, m1, 1                   ; 23456789ABCDEFF_
+    por                     m2, m5                      ; 23456789ABCDEFFF
+    pshufhw                 m4, m1, q3333               ; xxxxxxxxFFFFFFFF
+%endif
+    LOWPASS                  0, 1, 2, 3
+    DEFINE_ARGS dst, stride, cnt, stride9
+    lea               stride9q, [strideq+strideq*8]
+    mov                   cntd, 4
+
+.loop:
+    movhlps                 m4, m0
+    mova      [dstq+strideq*0], m0
+%if cpuflag(ssse3)
+    pshufb                  m0, m5
+%else
+    psrldq                  m0, 1
+    por                     m0, m5
+%endif
+    mova      [dstq+strideq*8], m4
+    movhlps                 m4, m0
+    mova      [dstq+strideq*1], m0
+%if cpuflag(ssse3)
+    pshufb                  m0, m5
+%else
+    psrldq                  m0, 1
+    por                     m0, m5
+%endif
+    mova      [dstq+stride9q ], m4
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
+    mova                    m0, [aq]
+    mova                    m1, [aq+16]
+    PALIGNR                 m2, m1, m0, 1, m4
+    PALIGNR                 m3, m1, m0, 2, m4
+    LOWPASS                  0, 2, 3, 4
+%if cpuflag(ssse3)
+    mova                    m5, [pb_1toE_2xF]
+    pshufb                  m2, m1, m5
+    pshufb                  m3, m2, m5
+    pshufb                  m6, m1, [pb_15]
+    mova                    m7, m6
+%else
+    pand                    m5, m1, [pb_15x0_1xm1]      ; _______________F
+    psrldq                  m2, m1, 1                   ; 123456789ABCDEF_
+    por                     m2, m5                      ; 123456789ABCDEFF
+    psrldq                  m3, m2, 1                   ; 23456789ABCDEFF_
+    por                     m3, m5                      ; 23456789ABCDEFFF
+    pshufhw                 m7, m2, q3333               ; xxxxxxxxFFFFFFFF
+    pshufd                  m6, m7, q3333
+%endif
+    LOWPASS                  1, 2, 3, 4
+    lea                 dst16q, [dstq  +strideq*8]
+    mov                   cntd, 8
+    lea                 dst16q, [dst16q+strideq*8]
+.loop:
+    movhlps                 m7, m1
+    mova [dstq  +strideq*0+ 0], m0
+    mova [dstq  +strideq*0+16], m1
+    movhps [dstq+strideq*8+ 0], m0
+    movq [dstq  +strideq*8+ 8], m1
+    mova [dstq  +strideq*8+16], m7
+    mova [dst16q+strideq*0+ 0], m1
+    mova [dst16q+strideq*0+16], m6
+    mova [dst16q+strideq*8+ 0], m7
+    mova [dst16q+strideq*8+16], m6
+%if cpuflag(avx)
+    vpalignr                m0, m1, m0, 1
+    pshufb                  m1, m5
+%elif cpuflag(ssse3)
+    palignr                 m2, m1, m0, 1
+    pshufb                  m1, m5
+    mova                    m0, m2
+%else
+    mova                    m4, m1
+    psrldq                  m0, 1
+    pslldq                  m4, 15
+    psrldq                  m1, 1
+    por                     m0, m4
+    por                     m1, m5
+%endif
+    add                   dstq, strideq
+    add                 dst16q, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DL_XMM_FUNCS
+INIT_XMM ssse3
+DL_XMM_FUNCS
+INIT_XMM avx
+DL_XMM_FUNCS
+
+; dr
+
+%macro DR_MMX_FUNCS 0
+cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [lq]
+    punpckldq               m0, [aq-1]
+    movd                    m1, [aq+3]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    PALIGNR                 m1, m0, 1, m3
+    psrlq                   m2, m1, 8
+    LOWPASS                  0, 1, 2, 3
+
+    movd      [dstq+stride3q ], m0
+    psrlq                   m0, 8
+    movd      [dstq+strideq*2], m0
+    psrlq                   m0, 8
+    movd      [dstq+strideq*1], m0
+    psrlq                   m0, 8
+    movd      [dstq+strideq*0], m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+DR_MMX_FUNCS
+INIT_MMX ssse3
+DR_MMX_FUNCS
+
+%macro DR_XMM_FUNCS 0
+cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
+    movq                    m1, [lq]
+    movhps                  m1, [aq-1]
+    movd                    m2, [aq+7]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pslldq                  m0, m1, 1
+    PALIGNR                 m2, m1, 1, m3
+    LOWPASS                  0, 1, 2, 3
+
+    movhps    [dstq+strideq*0], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*1], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*2], m0
+    pslldq                  m0, 1
+    movhps    [dstq+stride3q ], m0
+    pslldq                  m0, 1
+    lea                   dstq, [dstq+strideq*4]
+    movhps    [dstq+strideq*0], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*1], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*2], m0
+    pslldq                  m0, 1
+    movhps    [dstq+stride3q ], m0
+    RET
+
+cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
+    mova                    m1, [lq]
+    movu                    m2, [aq-1]
+    movd                    m4, [aq+15]
+    DEFINE_ARGS dst, stride, stride9, cnt
+    lea               stride9q, [strideq *3]
+    mov                   cntd, 4
+    lea               stride9q, [stride9q*3]
+    PALIGNR                 m4, m2, 1, m5
+    PALIGNR                 m3, m2, m1, 15, m5
+    LOWPASS                  3,  2, 4, 5
+    pslldq                  m0, m1, 1
+    PALIGNR                 m2, m1, 1, m4
+    LOWPASS                  0,  1, 2, 4
+
+.loop:
+    mova    [dstq+strideq*0  ], m3
+    movhps  [dstq+strideq*8+0], m0
+    movq    [dstq+strideq*8+8], m3
+    PALIGNR                 m3, m0, 15, m1
+    pslldq                  m0, 1
+    mova    [dstq+strideq*1  ], m3
+    movhps  [dstq+stride9q +0], m0
+    movq    [dstq+stride9q +8], m3
+    PALIGNR                 m3, m0, 15, m1
+    pslldq                  m0, 1
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
+    mova                    m1, [lq]
+    mova                    m2, [lq+16]
+    movu                    m3, [aq-1]
+    movu                    m4, [aq+15]
+    movd                    m5, [aq+31]
+    DEFINE_ARGS dst, stride, stride8, cnt
+    lea               stride8q, [strideq*8]
+    PALIGNR                 m5, m4, 1, m7
+    PALIGNR                 m6, m4, m3, 15, m7
+    LOWPASS                  5,  4,  6,  7
+    PALIGNR                 m4, m3, 1, m7
+    PALIGNR                 m6, m3, m2, 15, m7
+    LOWPASS                  4,  3,  6,  7
+    PALIGNR                 m3, m2, 1, m7
+    PALIGNR                 m6, m2, m1, 15, m7
+    LOWPASS                  3,  2,  6,  7
+    PALIGNR                 m2, m1, 1, m6
+    pslldq                  m0, m1, 1
+    LOWPASS                  2,  1,  0,  6
+    mov                   cntd, 16
+
+    ; out=m2/m3/m4/m5
+.loop:
+    mova  [dstq+stride8q*0+ 0], m4
+    mova  [dstq+stride8q*0+16], m5
+    mova  [dstq+stride8q*2+ 0], m3
+    mova  [dstq+stride8q*2+16], m4
+    PALIGNR                 m5, m4, 15, m6
+    PALIGNR                 m4, m3, 15, m6
+    PALIGNR                 m3, m2, 15, m6
+    pslldq                  m2, 1
+    add                   dstq, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DR_XMM_FUNCS
+INIT_XMM ssse3
+DR_XMM_FUNCS
+INIT_XMM avx
+DR_XMM_FUNCS
+
+; vl
+
+INIT_MMX mmxext
+cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [aq]
+    psrlq                   m1, m0, 8
+    psrlq                   m2, m1, 8
+    LOWPASS                  2,  1, 0, 3
+    pavgb                   m1, m0
+    movd      [dstq+strideq*0], m1
+    movd      [dstq+strideq*1], m2
+    lea                   dstq, [dstq+strideq*2]
+    psrlq                   m1, 8
+    psrlq                   m2, 8
+    movd      [dstq+strideq*0], m1
+    movd      [dstq+strideq*1], m2
+    RET
+
+%macro VL_XMM_FUNCS 0
+cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
+    movq                    m0, [aq]
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_0to6_9x7]
+%else
+    punpcklbw               m1, m0, m0
+    punpckhwd               m1, m1
+    shufps                  m0, m1, q3310
+%endif
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    psrldq                  m1, m0, 1
+    psrldq                  m2, m0, 2
+    LOWPASS                  2,  1,  0,  3
+    pavgb                   m1, m0
+
+    movq      [dstq+strideq*0], m1
+    movq      [dstq+strideq*1], m2
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    movq      [dstq+strideq*2], m1
+    movq      [dstq+stride3q ], m2
+    lea                   dstq, [dstq+strideq*4]
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    movq      [dstq+strideq*0], m1
+    movq      [dstq+strideq*1], m2
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    movq      [dstq+strideq*2], m1
+    movq      [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+%if cpuflag(ssse3)
+    mova                    m4, [pb_1toE_2xF]
+    pshufb                  m1, m0, m4
+    pshufb                  m2, m1, m4
+%else
+    pand                    m4, m0, [pb_15x0_1xm1]  ; _______________F
+    psrldq                  m1, m0, 1               ; 123456789ABCDEF_
+    por                     m1, m4                  ; 123456789ABCDEFF
+    psrldq                  m2, m1, 1               ; 23456789ABCDEFF_
+    por                     m2, m4                  ; 23456789ABCDEFFF
+%endif
+    LOWPASS                  2,  1,  0, 3
+    pavgb                   m1, m0
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*1], m2
+%if cpuflag(ssse3)
+    pshufb                  m1, m4
+    pshufb                  m2, m4
+%else
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    por                     m1, m4
+    por                     m2, m4
+%endif
+    mova      [dstq+strideq*2], m1
+    mova      [dstq+stride3q ], m2
+%if cpuflag(ssse3)
+    pshufb                  m1, m4
+    pshufb                  m2, m4
+%else
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    por                     m1, m4
+    por                     m2, m4
+%endif
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
+    mova                    m0, [aq]
+    mova                    m5, [aq+16]
+    DEFINE_ARGS dst, stride, dst16, cnt
+    PALIGNR                 m2, m5, m0, 1, m4
+    PALIGNR                 m3, m5, m0, 2, m4
+    lea                 dst16q, [dstq  +strideq*8]
+    LOWPASS                  3,  2,  0, 6
+    pavgb                   m2, m0
+%if cpuflag(ssse3)
+    mova                    m4, [pb_1toE_2xF]
+    pshufb                  m0, m5, m4
+    pshufb                  m1, m0, m4
+%else
+    pand                    m4, m5, [pb_15x0_1xm1]  ; _______________F
+    psrldq                  m0, m5, 1               ; 123456789ABCDEF_
+    por                     m0, m4                  ; 123456789ABCDEFF
+    psrldq                  m1, m0, 1               ; 23456789ABCDEFF_
+    por                     m1, m4                  ; 23456789ABCDEFFF
+%endif
+    lea                 dst16q, [dst16q+strideq*8]
+    LOWPASS                  1,  0,  5, 6
+    pavgb                   m0, m5
+%if cpuflag(ssse3)
+    pshufb                  m5, [pb_15]
+%else
+    punpckhbw               m5, m4, m4
+    pshufhw                 m5, m5, q3333
+    punpckhqdq              m5, m5
+%endif
+    mov                   cntd, 8
+
+.loop:
+%macro %%write 3
+    mova    [dstq+stride%1+ 0], %2
+    mova    [dstq+stride%1+16], %3
+    movhps  [dst16q+stride%1 ], %2
+    movu  [dst16q+stride%1+ 8], %3
+    movq  [dst16q+stride%1+24], m5
+%if cpuflag(avx)
+    palignr                 %2, %3, %2, 1
+    pshufb                  %3, m4
+%elif cpuflag(ssse3)
+    palignr                 m6, %3, %2, 1
+    pshufb                  %3, m4
+    mova                    %2, m6
+%else
+    pslldq                  m6, %3, 15
+    psrldq                  %3, 1
+    psrldq                  %2, 1
+    por                     %3, m4
+    por                     %2, m6
+%endif
+%endmacro
+
+    %%write                q*0, m2, m0
+    %%write                q*1, m3, m1
+    lea                   dstq, [dstq  +strideq*2]
+    lea                 dst16q, [dst16q+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+VL_XMM_FUNCS
+INIT_XMM ssse3
+VL_XMM_FUNCS
+INIT_XMM avx
+VL_XMM_FUNCS
+
+; vr
+
+%macro VR_MMX_FUNCS 0
+cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
+    movq                    m1, [aq-1]
+    punpckldq               m2, [lq]
+    movd                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pavgb                   m0, m1
+    PALIGNR                 m1, m2, 5, m3
+    psrlq                   m2, m1, 8
+    psllq                   m3, m1, 8
+    LOWPASS                  2,  1, 3, 4
+
+    ; ABCD <- for the following predictor:
+    ; EFGH
+    ; IABC  | m0 contains ABCDxxxx
+    ; JEFG  | m2 contains xJIEFGHx
+
+%if cpuflag(ssse3)
+    punpckldq               m0, m2
+    pshufb                  m2, [pb_13456_3xm1]
+    movd      [dstq+strideq*0], m0
+    pshufb                  m0, [pb_6012_4xm1]
+    movd      [dstq+stride3q ], m2
+    psrlq                   m2, 8
+    movd      [dstq+strideq*2], m0
+    movd      [dstq+strideq*1], m2
+%else
+    psllq                   m1, m2, 40
+    psrlq                   m2, 24
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m2
+    PALIGNR                 m0, m1, 7, m3
+    psllq                   m1, 8
+    PALIGNR                 m2, m1, 7, m3
+    movd      [dstq+strideq*2], m0
+    movd      [dstq+stride3q ], m2
+%endif
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VR_MMX_FUNCS
+INIT_MMX ssse3
+VR_MMX_FUNCS
+
+%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16
+cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
+    movu                    m1, [aq-1]
+    movhps                  m2, [lq]
+    movq                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pavgb                   m0, m1
+    PALIGNR                 m1, m2, 9, m3
+    pslldq                  m2, m1, 1
+    pslldq                  m3, m1, 2
+    LOWPASS                  1,  2, 3, 4
+
+    ; ABCDEFGH <- for the following predictor:
+    ; IJKLMNOP
+    ; QABCDEFG  | m0 contains ABCDEFGHxxxxxxxx
+    ; RIJKLMNO  | m1 contains xxVUTSRQIJKLMNOP
+    ; SQABCDEF
+    ; TRIJKLMN
+    ; USQABCDE
+    ; VTRIJKLM
+
+%if cpuflag(ssse3)
+    punpcklqdq              m0, m1 ; ABCDEFGHxxVUTSRQ
+%endif
+    movq      [dstq+strideq*0], m0
+    movhps    [dstq+strideq*1], m1
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_6xm1_BDF_0to6]  ; xxxxxxUSQABCDEFG
+    pshufb                  m1, [pb_6xm1_246_8toE]  ; xxxxxxVTRIJKLMNO
+%else
+    psrlw                   m2, m1, 8               ; x_U_S_Q_xxxxxxxx
+    pand                    m3, m1, [pw_255]        ; x_V_T_R_xxxxxxxx
+    packuswb                m3, m2                  ; xVTRxxxxxUSQxxxx
+    pslldq                  m3, 4                   ; xxxxxVTRxxxxxUSQ
+    PALIGNR                 m0, m3, 7, m4           ; xxxxxxUSQABCDEFG
+    psrldq                  m1, 8
+    pslldq                  m3, 8
+    PALIGNR                 m1, m3, 7, m4           ; xxxxxxVTRIJKLMNO
+%endif
+    movhps    [dstq+strideq*2], m0
+    movhps    [dstq+stride3q ], m1
+    lea                   dstq, [dstq+strideq*4]
+    pslldq                  m0, 1
+    pslldq                  m1, 1
+    movhps    [dstq+strideq*0], m0
+    movhps    [dstq+strideq*1], m1
+    pslldq                  m0, 1
+    pslldq                  m1, 1
+    movhps    [dstq+strideq*2], m0
+    movhps    [dstq+stride3q ], m1
+    RET
+
+cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a
+    mova                    m0, [aq]
+    movu                    m1, [aq-1]
+    mova                    m2, [lq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    PALIGNR                 m3, m1, m2, 15, m6
+    LOWPASS                  3,  1,  0,  4
+    pavgb                   m0, m1
+    PALIGNR                 m1, m2,  1, m6
+    pslldq                  m4, m2,  1
+    LOWPASS                  1,  2,  4,  5
+%if cpuflag(ssse3)
+    pshufb                  m1, [pb_02468ACE_13579BDF]
+%else
+    psrlw                   m5, m1, 8
+    pand                    m1, [pw_255]
+    packuswb                m1, m5
+%endif
+    mov                   cntd, 4
+
+.loop:
+    movlhps                 m2, m1
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m3
+    PALIGNR                 m4, m0, m1, 15, m6
+    PALIGNR                 m5, m3, m2, 15, m6
+    mova      [dstq+strideq*2], m4
+    mova      [dstq+stride3q ], m5
+    lea                   dstq, [dstq+strideq*4]
+    PALIGNR                 m0, m1, 14, m6
+    PALIGNR                 m3, m2, 14, m6
+    pslldq                  m1, 2
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
+    mova                    m0, [aq]
+    mova                    m2, [aq+16]
+    movu                    m1, [aq-1]
+    PALIGNR                 m3, m2, m0, 15, m6
+    PALIGNR                 m4, m2, m0, 14, m6
+    LOWPASS                  4,  3,  2,  5
+    pavgb                   m3, m2
+    mova                    m2, [lq+16]
+    PALIGNR                 m5, m1, m2, 15, m6
+    LOWPASS                  5,  1,  0,  6
+    pavgb                   m0, m1
+    mova                    m6, [lq]
+%if ARCH_X86_64
+    SWAP                     0, 8
+%else
+    mova                [dstq], m0
+%endif
+    PALIGNR                 m1, m2,  1, m0
+    PALIGNR                 m7, m2, m6, 15, m0
+    LOWPASS                  1,  2,  7,  0
+    PALIGNR                 m2, m6,  1, m0
+    pslldq                  m7, m6,  1
+    LOWPASS                  2,  6,  7,  0
+%if cpuflag(ssse3)
+    pshufb                  m1, [pb_02468ACE_13579BDF]
+    pshufb                  m2, [pb_02468ACE_13579BDF]
+%else
+    psrlw                   m0, m1, 8
+    psrlw                   m6, m2, 8
+    pand                    m1, [pw_255]
+    pand                    m2, [pw_255]
+    packuswb                m1, m0
+    packuswb                m2, m6
+%endif
+    DEFINE_ARGS dst, stride, dst16, cnt
+    lea                 dst16q, [dstq  +strideq*8]
+    lea                 dst16q, [dst16q+strideq*8]
+    SBUTTERFLY             qdq,  2,  1,  6
+%if ARCH_X86_64
+    SWAP                     0, 8
+%else
+    mova                    m0, [dstq]
+%endif
+    mov                   cntd, 8
+
+.loop:
+    ; even lines (0, 2, 4, ...): m1 | m0, m3
+    ;  odd lines (1, 3, 5, ...): m2 | m5, m4
+%macro %%write 4
+    mova    [dstq+stride%1+ 0], %3
+    mova    [dstq+stride%1+16], %4
+    movhps  [dst16q+stride%1 ], %2
+    movu  [dst16q+stride%1+ 8], %3
+    movq  [dst16q+stride%1+24], %4
+    PALIGNR                 %4, %3, 15, m6
+    PALIGNR                 %3, %2, 15, m6
+    pslldq                  %2,  1
+%endmacro
+
+    %%write                q*0, m1, m0, m3
+    %%write                q*1, m2, m5, m4
+    lea                   dstq, [dstq  +strideq*2]
+    lea                 dst16q, [dst16q+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+VR_XMM_FUNCS 7
+INIT_XMM ssse3
+VR_XMM_FUNCS 6
+INIT_XMM avx
+VR_XMM_FUNCS 6
+
+; hd
+
+INIT_MMX mmxext
+cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [lq]
+    punpckldq               m0, [aq-1]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    psrlq                   m1, m0, 8
+    psrlq                   m2, m1, 8
+    LOWPASS                  2,  1, 0,  3
+    pavgb                   m1, m0
+
+    ; DHIJ <- for the following predictor:
+    ; CGDH
+    ; BFCG  | m1 contains ABCDxxxx
+    ; AEBF  | m2 contains EFGHIJxx
+
+    punpcklbw               m1, m2
+    punpckhdq               m0, m1, m2
+
+    ; m1 contains AEBFCGDH
+    ; m0 contains CGDHIJxx
+
+    movd      [dstq+stride3q ], m1
+    movd      [dstq+strideq*1], m0
+    psrlq                   m1, 16
+    psrlq                   m0, 16
+    movd      [dstq+strideq*2], m1
+    movd      [dstq+strideq*0], m0
+    RET
+
+%macro HD_XMM_FUNCS 0
+cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a
+    movq                    m0, [lq]
+    movhps                  m0, [aq-1]
+    DEFINE_ARGS dst, stride, stride3, dst4
+    lea               stride3q, [strideq*3]
+    lea                  dst4q, [dstq+strideq*4]
+    psrldq                  m1, m0, 1
+    psrldq                  m2, m1, 1
+    LOWPASS                  2,  1,  0,  3
+    pavgb                   m1, m0
+
+    ; HPQRSTUV <- for the following predictor
+    ; GOHPQRST
+    ; FNGOHPQR  | m1 contains ABCDEFGHxxxxxxxx
+    ; EMFNGOHP  | m2 contains IJKLMNOPQRSTUVxx
+    ; DLEMFNGO
+    ; CKDLEMFN
+    ; BJCKDLEM
+    ; AIBJCKDL
+
+    punpcklbw               m1, m2
+    movhlps                 m2, m2
+
+    ; m1 contains AIBJCKDLEMFNGOHP
+    ; m2 contains QRSTUVxxxxxxxxxx
+
+    movhps   [dstq +stride3q ], m1
+    movq     [dst4q+stride3q ], m1
+    PALIGNR                 m3, m2, m1, 2, m4
+    movhps   [dstq +strideq*2], m3
+    movq     [dst4q+strideq*2], m3
+    PALIGNR                 m3, m2, m1, 4, m4
+    movhps   [dstq +strideq*1], m3
+    movq     [dst4q+strideq*1], m3
+    PALIGNR                 m2, m1, 6, m4
+    movhps   [dstq +strideq*0], m2
+    movq     [dst4q+strideq*0], m2
+    RET
+
+cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
+    mova                    m0, [lq]
+    movu                    m3, [aq-1]
+    DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
+    lea               stride4q, [strideq*4]
+    lea                  dst4q, [dstq +stride4q]
+    lea                  dst8q, [dst4q+stride4q]
+    lea                 dst12q, [dst8q+stride4q]
+    psrldq                  m4, m3,  1
+    psrldq                  m5, m3,  2
+    LOWPASS                  5,  4,  3,  6
+    PALIGNR                 m1, m3, m0,  1, m6
+    PALIGNR                 m2, m3, m0,  2, m6
+    LOWPASS                  2,  1,  0,  6
+    pavgb                   m1, m0
+    SBUTTERFLY              bw,  1,  2,  6
+
+    ; I PROBABLY INVERTED L0 ad L16 here
+    ; m1, m2, m5
+.loop:
+    sub               stride4q, strideq
+    movhps [dstq +stride4q +0], m2
+    movq   [dstq +stride4q +8], m5
+    mova   [dst4q+stride4q   ], m2
+    movhps [dst8q+stride4q +0], m1
+    movq   [dst8q+stride4q +8], m2
+    mova  [dst12q+stride4q   ], m1
+%if cpuflag(avx)
+    palignr                 m1, m2, m1, 2
+    palignr                 m2, m5, m2, 2
+%elif cpuflag(ssse3)
+    palignr                 m3, m2, m1, 2
+    palignr                 m0, m5, m2, 2
+    mova                    m1, m3
+    mova                    m2, m0
+%else
+    ; slightly modified version of PALIGNR
+    mova                    m6, m2
+    mova                    m4, m5
+    pslldq                  m6, 14
+    pslldq                  m4, 14
+    psrldq                  m1, 2
+    psrldq                  m2, 2
+    por                     m1, m6
+    por                     m2, m4
+%endif
+    psrldq                  m5, 2
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [lq+16]
+    movu                    m2, [aq-1]
+    movu                    m3, [aq+15]
+    DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
+    lea               stride8q, [strideq*8]
+    lea                  dst8q, [dstq  +stride8q]
+    lea                 dst16q, [dst8q +stride8q]
+    lea                 dst24q, [dst16q+stride8q]
+    psrldq                  m4, m3,  1
+    psrldq                  m5, m3,  2
+    LOWPASS                  5,  4,  3,  6
+    PALIGNR                 m4, m3, m2,  2, m6
+    PALIGNR                 m3, m2,  1, m6
+    LOWPASS                  4,  3,  2,  6
+    PALIGNR                 m3, m2, m1,  2, m6
+    PALIGNR                 m2, m1,  1, m6
+    LOWPASS                  3,  2,  1,  6
+    pavgb                   m2, m1
+    PALIGNR                 m6, m1, m0,  1, m7
+    PALIGNR                 m1, m0,  2, m7
+    LOWPASS                  1,  6,  0,  7
+    pavgb                   m0, m6
+    SBUTTERFLY              bw,  2,  3,  6
+    SBUTTERFLY              bw,  0,  1,  6
+
+    ; m0, m1, m2, m3, m4, m5
+.loop:
+    sub               stride8q, strideq
+    mova  [dstq  +stride8q+ 0], m3
+    mova  [dstq  +stride8q+16], m4
+    mova  [dst8q +stride8q+ 0], m2
+    mova  [dst8q +stride8q+16], m3
+    mova  [dst16q+stride8q+ 0], m1
+    mova  [dst16q+stride8q+16], m2
+    mova  [dst24q+stride8q+ 0], m0
+    mova  [dst24q+stride8q+16], m1
+%if cpuflag(avx)
+    palignr                 m0, m1, m0, 2
+    palignr                 m1, m2, m1, 2
+    palignr                 m2, m3, m2, 2
+    palignr                 m3, m4, m3, 2
+    palignr                 m4, m5, m4, 2
+    psrldq                  m5, 2
+%elif cpuflag(ssse3)
+    psrldq                  m6, m5, 2
+    palignr                 m5, m4, 2
+    palignr                 m4, m3, 2
+    palignr                 m3, m2, 2
+    palignr                 m2, m1, 2
+    palignr                 m1, m0, 2
+    mova                    m0, m1
+    mova                    m1, m2
+    mova                    m2, m3
+    mova                    m3, m4
+    mova                    m4, m5
+    mova                    m5, m6
+%else
+    ; sort of a half-integrated version of PALIGNR
+    pslldq                  m7, m4, 14
+    pslldq                  m6, m5, 14
+    psrldq                  m4, 2
+    psrldq                  m5, 2
+    por                     m4, m6
+    pslldq                  m6, m3, 14
+    psrldq                  m3, 2
+    por                     m3, m7
+    pslldq                  m7, m2, 14
+    psrldq                  m2, 2
+    por                     m2, m6
+    pslldq                  m6, m1, 14
+    psrldq                  m1, 2
+    por                     m1, m7
+    psrldq                  m0, 2
+    por                     m0, m6
+%endif
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+HD_XMM_FUNCS
+INIT_XMM ssse3
+HD_XMM_FUNCS
+INIT_XMM avx
+HD_XMM_FUNCS
+
+%macro HU_MMX_FUNCS 0
+cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
+    movd                    m0, [lq]
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_0to2_5x3]
+%else
+    punpcklbw               m1, m0, m0          ; 00112233
+    pshufw                  m1, m1, q3333       ; 33333333
+    punpckldq               m0, m1              ; 01233333
+%endif
+    psrlq                   m1, m0, 8
+    psrlq                   m2, m1, 8
+    LOWPASS                  2,  1, 0, 3
+    pavgb                   m1, m0
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    SBUTTERFLY              bw,  1, 2, 0
+    PALIGNR                 m2, m1, 2, m0
+    movd      [dstq+strideq*0], m1
+    movd      [dstq+strideq*1], m2
+    punpckhdq               m1, m1
+    punpckhdq               m2, m2
+    movd      [dstq+strideq*2], m1
+    movd      [dstq+stride3q ], m2
+    RET
+%endmacro
+
+INIT_MMX mmxext
+HU_MMX_FUNCS
+INIT_MMX ssse3
+HU_MMX_FUNCS
+
+%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32
+cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
+    movq                    m0, [lq]
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_0to6_9x7]
+%else
+    punpcklbw               m1, m0, m0          ; 0011223344556677
+    punpckhwd               m1, m1              ; 4444555566667777
+    shufps                  m0, m1, q3310       ; 0123456777777777
+%endif
+    psrldq                  m1, m0, 1
+    psrldq                  m2, m1, 1
+    LOWPASS                  2,  1, 0, 3
+    pavgb                   m1, m0
+    DEFINE_ARGS dst, stride, stride3, dst4
+    lea               stride3q, [strideq*3]
+    lea                  dst4q, [dstq+strideq*4]
+    SBUTTERFLY              bw,  1, 2, 0
+    movq     [dstq +strideq*0], m1
+    movhps   [dst4q+strideq*0], m1
+    PALIGNR                 m0, m2, m1, 2, m3
+    movq     [dstq +strideq*1], m0
+    movhps   [dst4q+strideq*1], m0
+    PALIGNR                 m0, m2, m1, 4, m3
+    movq     [dstq +strideq*2], m0
+    movhps   [dst4q+strideq*2], m0
+    PALIGNR                 m2, m1, 6, m3
+    movq     [dstq +stride3q ], m2
+    movhps   [dst4q+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
+    mova                    m0, [lq]
+%if cpuflag(ssse3)
+    mova                    m3, [pb_2toE_3xF]
+    pshufb                  m1, m0, [pb_1toE_2xF]
+    pshufb                  m2, m0, m3
+%else
+    pand                    m3, m0, [pb_15x0_1xm1]
+    psrldq                  m1, m0, 1
+    por                     m1, m3
+    punpckhbw               m3, m3
+    psrldq                  m2, m0, 2
+    por                     m2, m3
+%endif
+    LOWPASS                  2,  1,  0,  4
+    pavgb                   m1, m0
+    DEFINE_ARGS dst, stride, stride9, cnt
+    lea                stride9q, [strideq*8+strideq]
+    mov                   cntd,  4
+    SBUTTERFLY              bw,  1,  2,  0
+
+.loop:
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*8], m2
+    PALIGNR                 m0, m2, m1, 2, m4
+%if cpuflag(ssse3)
+    pshufb                  m2, m3
+%else
+    psrldq                  m2, 2
+    por                     m2, m3
+%endif
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+stride9q ], m2
+    PALIGNR                 m1, m2, m0, 2, m4
+%if cpuflag(ssse3)
+    pshufb                  m2, m3
+%else
+    psrldq                  m2, 2
+    por                     m2, m3
+%endif
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l
+    mova                    m1, [lq]
+    mova                    m0, [lq+16]
+    PALIGNR                 m2, m0, m1,  1, m5
+    PALIGNR                 m3, m0, m1,  2, m5
+    LOWPASS                  3,  2,  1,  5
+    pavgb                   m2, m1
+%if cpuflag(ssse3)
+    mova                    m4, [pb_2toE_3xF]
+    pshufb                  m5, m0, [pb_1toE_2xF]
+    pshufb                  m1, m0, m4
+%else
+    pand                    m4, m0, [pb_15x0_1xm1]
+    psrldq                  m5, m0, 1
+    por                     m5, m4
+    punpckhbw               m4, m4
+    psrldq                  m1, m0, 2
+    por                     m1, m4
+%endif
+    LOWPASS                  1,  5,  0,  6
+    pavgb                   m0, m5
+    DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
+    mov                   cntd,  8
+    xor               stride0q, stride0q
+    lea                  dst8q, [dstq  +strideq*8]
+    lea                 dst16q, [dst8q +strideq*8]
+    lea                 dst24q, [dst16q+strideq*8]
+    SBUTTERFLY              bw,  0,  1,  5
+    SBUTTERFLY              bw,  2,  3,  5
+%if cpuflag(ssse3)
+    pshufb                  m6, m1, [pb_15]
+%else
+    pshufhw                 m6, m4, q3333
+    punpckhqdq              m6, m6
+%endif
+
+.loop:
+    mova  [dstq  +stride0q+ 0], m2
+    mova  [dstq  +stride0q+16], m3
+    mova  [dst8q +stride0q+ 0], m3
+    mova  [dst8q +stride0q+16], m0
+    mova  [dst16q+stride0q+ 0], m0
+    mova  [dst16q+stride0q+16], m1
+    mova  [dst24q+stride0q+ 0], m1
+    mova  [dst24q+stride0q+16], m6
+%if cpuflag(avx)
+    palignr                 m2, m3, m2, 2
+    palignr                 m3, m0, m3, 2
+    palignr                 m0, m1, m0, 2
+    pshufb                  m1, m4
+%elif cpuflag(ssse3)
+    pshufb                  m5, m1, m4
+    palignr                 m1, m0, 2
+    palignr                 m0, m3, 2
+    palignr                 m3, m2, 2
+    mova                    m2, m3
+    mova                    m3, m0
+    mova                    m0, m1
+    mova                    m1, m5
+%else
+    ; half-integrated version of PALIGNR
+    pslldq                  m5, m1, 14
+    pslldq                  m7, m0, 14
+    psrldq                  m1, 2
+    psrldq                  m0, 2
+    por                     m1, m4
+    por                     m0, m5
+    pslldq                  m5, m3, 14
+    psrldq                  m3, 2
+    por                     m3, m7
+    psrldq                  m2, 2
+    por                     m2, m5
+%endif
+    add               stride0q, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+HU_XMM_FUNCS 8
+INIT_XMM ssse3
+HU_XMM_FUNCS 7
+INIT_XMM avx
+HU_XMM_FUNCS 7
+
+; FIXME 127, 128, 129 ?
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
new file mode 100644
index 0000000000..0dca645601
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -0,0 +1,2135 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_2: times 8 dd 2
+pd_4: times 8 dd 4
+pd_8: times 8 dd 8
+
+pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
+pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
+pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7
+
+cextern pw_1
+cextern pw_1023
+cextern pw_4095
+cextern pd_16
+cextern pd_32
+cextern pd_65535;
+
+; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
+; only 3 registers on x86-32, which would make it one cycle faster, but that
+; would make the code quite a bit uglier...
+
+SECTION .text
+
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+    SWAP                %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+    mova              [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+    SWAP                %1, %2
+%else
+    mova               m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+    mova               m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    mova                    m1, [aq+mmsize]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m1
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m1
+    lea                   dstq, [dstq+strideq*4]
+    dec               cntd
+    jg .loop
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq+mmsize*0]
+    mova                    m1, [aq+mmsize*1]
+    mova                    m2, [aq+mmsize*2]
+    mova                    m3, [aq+mmsize*3]
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 16
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m3
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*1+32], m2
+    mova   [dstq+strideq*1+48], m3
+    lea                   dstq, [dstq+strideq*2]
+    dec               cntd
+    jg .loop
+    RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
+    mova                    m3, [lq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pshufw                  m0, m3, q3333
+    pshufw                  m1, m3, q2222
+    pshufw                  m2, m3, q1111
+    pshufw                  m3, m3, q0000
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
+    mova                    m2, [lq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    punpckhwd               m3, m2, m2
+    pshufd                  m0, m3, q3333
+    pshufd                  m1, m3, q2222
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    pshufd                  m0, m3, q1111
+    pshufd                  m1, m3, q0000
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m1
+    lea                   dstq, [dstq+strideq*4]
+    punpcklwd               m2, m2
+    pshufd                  m0, m2, q3333
+    pshufd                  m1, m2, q2222
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    pshufd                  m0, m2, q1111
+    pshufd                  m1, m2, q0000
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m1
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
+    mov                   cntd, 3
+    lea               stride3q, [strideq*3]
+.loop:
+    movh                    m3, [lq+cntq*8]
+    punpcklwd               m3, m3
+    pshufd                  m0, m3, q3333
+    pshufd                  m1, m3, q2222
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+    mova    [dstq+strideq*0+ 0], m0
+    mova    [dstq+strideq*0+16], m0
+    mova    [dstq+strideq*1+ 0], m1
+    mova    [dstq+strideq*1+16], m1
+    mova    [dstq+strideq*2+ 0], m2
+    mova    [dstq+strideq*2+16], m2
+    mova    [dstq+stride3q + 0], m3
+    mova    [dstq+stride3q +16], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jge .loop
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
+    mov                   cntd, 7
+    lea               stride3q, [strideq*3]
+.loop:
+    movh                    m3, [lq+cntq*8]
+    punpcklwd               m3, m3
+    pshufd                  m0, m3, q3333
+    pshufd                  m1, m3, q2222
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*0+32], m0
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m1
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*1+32], m1
+    mova   [dstq+strideq*1+48], m1
+    mova   [dstq+strideq*2+ 0], m2
+    mova   [dstq+strideq*2+16], m2
+    mova   [dstq+strideq*2+32], m2
+    mova   [dstq+strideq*2+48], m2
+    mova   [dstq+stride3q + 0], m3
+    mova   [dstq+stride3q +16], m3
+    mova   [dstq+stride3q +32], m3
+    mova   [dstq+stride3q +48], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jge .loop
+    RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [lq]
+    paddw                   m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pmaddwd                 m0, [pw_1]
+    pshufw                  m1, m0, q3232
+    paddd                   m0, [pd_4]
+    paddd                   m0, m1
+    psrad                   m0, 3
+    pshufw                  m0, m0, q0000
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [lq]
+    paddw                   m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_8]
+    paddd                   m0, m1
+    psrad                   m0, 4
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [lq]
+    paddw                   m0, [lq+mmsize]
+    paddw                   m0, [aq]
+    paddw                   m0, [aq+mmsize]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_16]
+    paddd                   m0, m1
+    psrad                   m0, 5
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [lq+mmsize*0]
+    paddw                   m0, [lq+mmsize*1]
+    paddw                   m0, [lq+mmsize*2]
+    paddw                   m0, [lq+mmsize*3]
+    paddw                   m0, [aq+mmsize*0]
+    paddw                   m0, [aq+mmsize*1]
+    paddw                   m0, [aq+mmsize*2]
+    paddw                   m0, [aq+mmsize*3]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 16
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_32]
+    paddd                   m0, m1
+    psrad                   m0, 6
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*0+32], m0
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*1+32], m0
+    mova   [dstq+strideq*1+48], m0
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+
+%macro DC_1D_FNS 2
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pmaddwd                 m0, [pw_1]
+    pshufw                  m1, m0, q3232
+    paddd                   m0, [pd_2]
+    paddd                   m0, m1
+    psrad                   m0, 2
+    pshufw                  m0, m0, q0000
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_4]
+    paddd                   m0, m1
+    psrad                   m0, 3
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2]
+    paddw                   m0, [%2+mmsize]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_8]
+    paddd                   m0, m1
+    psrad                   m0, 4
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2+mmsize*0]
+    paddw                   m0, [%2+mmsize*1]
+    paddw                   m0, [%2+mmsize*2]
+    paddw                   m0, [%2+mmsize*3]
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 16
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_16]
+    paddd                   m0, m1
+    psrad                   m0, 5
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*0+32], m0
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*1+32], m0
+    mova   [dstq+strideq*1+48], m0
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+DC_1D_FNS top,  aq
+DC_1D_FNS left, lq
+
+INIT_MMX mmxext
+cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
+    mova                    m5, [pw_1023]
+.body:
+    mova                    m4, [aq]
+    mova                    m3, [lq]
+    movd                    m0, [aq-4]
+    pshufw                  m0, m0, q1111
+    psubw                   m4, m0
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pshufw                  m0, m3, q3333
+    pshufw                  m1, m3, q2222
+    pshufw                  m2, m3, q1111
+    pshufw                  m3, m3, q0000
+    paddw                   m0, m4
+    paddw                   m1, m4
+    paddw                   m2, m4
+    paddw                   m3, m4
+    pxor                    m4, m4
+    pmaxsw                  m0, m4
+    pmaxsw                  m1, m4
+    pmaxsw                  m2, m4
+    pmaxsw                  m3, m4
+    pminsw                  m0, m5
+    pminsw                  m1, m5
+    pminsw                  m2, m5
+    pminsw                  m3, m5
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    RET
+
+cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
+    mova                    m5, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
+    mova                    m4, [pw_1023]
+.body:
+    pxor                    m6, m6
+    mova                    m5, [aq]
+    movd                    m0, [aq-4]
+    pshuflw                 m0, m0, q1111
+    punpcklqdq              m0, m0
+    psubw                   m5, m0
+    DEFINE_ARGS dst, stride, l, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 1
+.loop:
+    movh                    m3, [lq+cntq*8]
+    punpcklwd               m3, m3
+    pshufd                  m0, m3, q3333
+    pshufd                  m1, m3, q2222
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+    paddw                   m0, m5
+    paddw                   m1, m5
+    paddw                   m2, m5
+    paddw                   m3, m5
+    pmaxsw                  m0, m6
+    pmaxsw                  m1, m6
+    pmaxsw                  m2, m6
+    pmaxsw                  m3, m6
+    pminsw                  m0, m4
+    pminsw                  m1, m4
+    pminsw                  m2, m4
+    pminsw                  m3, m4
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jge .loop
+    RET
+
+cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
+    mova                    m4, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
+    mova                    m7, [pw_1023]
+.body:
+    pxor                    m6, m6
+    mova                    m4, [aq]
+    mova                    m5, [aq+mmsize]
+    movd                    m0, [aq-4]
+    pshuflw                 m0, m0, q1111
+    punpcklqdq              m0, m0
+    psubw                   m4, m0
+    psubw                   m5, m0
+    DEFINE_ARGS dst, stride, l, cnt
+    mov                   cntd, 7
+.loop:
+    movd                    m3, [lq+cntq*4]
+    punpcklwd               m3, m3
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+    paddw                   m0, m2, m4
+    paddw                   m2, m5
+    paddw                   m1, m3, m4
+    paddw                   m3, m5
+    pmaxsw                  m0, m6
+    pmaxsw                  m2, m6
+    pmaxsw                  m1, m6
+    pmaxsw                  m3, m6
+    pminsw                  m0, m7
+    pminsw                  m2, m7
+    pminsw                  m1, m7
+    pminsw                  m3, m7
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m2
+    mova   [dstq+strideq*1+ 0], m1
+    mova   [dstq+strideq*1+16], m3
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jge .loop
+    RET
+
+cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
+    mova                    m7, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
+    mova                    m0, [pw_1023]
+.body:
+    pxor                    m1, m1
+%if ARCH_X86_64
+    SWAP                     0, 8
+    SWAP                     1, 9
+%define reg_min m9
+%define reg_max m8
+%else
+    mova              [rsp+ 0], m0
+    mova              [rsp+16], m1
+%define reg_min [rsp+16]
+%define reg_max [rsp+ 0]
+%endif
+
+    mova                    m4, [aq+mmsize*0]
+    mova                    m5, [aq+mmsize*1]
+    mova                    m6, [aq+mmsize*2]
+    mova                    m7, [aq+mmsize*3]
+    movd                    m0, [aq-4]
+    pshuflw                 m0, m0, q1111
+    punpcklqdq              m0, m0
+    psubw                   m4, m0
+    psubw                   m5, m0
+    psubw                   m6, m0
+    psubw                   m7, m0
+    DEFINE_ARGS dst, stride, l, cnt
+    mov                   cntd, 31
+.loop:
+    pinsrw                  m3, [lq+cntq*2], 0
+    punpcklwd               m3, m3
+    pshufd                  m3, m3, q0000
+    paddw                   m0, m3, m4
+    paddw                   m1, m3, m5
+    paddw                   m2, m3, m6
+    paddw                   m3, m7
+    pmaxsw                  m0, reg_min
+    pmaxsw                  m1, reg_min
+    pmaxsw                  m2, reg_min
+    pmaxsw                  m3, reg_min
+    pminsw                  m0, reg_max
+    pminsw                  m1, reg_max
+    pminsw                  m2, reg_max
+    pminsw                  m3, reg_max
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m3
+    add                   dstq, strideq
+    dec                   cntd
+    jge .loop
+    RET
+
+cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
+    mova                    m0, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
+
+; Directional intra predicion functions
+;
+; in the functions below, 'abcdefgh' refers to above data (sometimes simply
+; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
+; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
+; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
+; top-left data.
+
+; left=(left+2*center+right+2)>>2
+%macro LOWPASS 3 ; left [dst], center, right
+    paddw                  m%1, m%3
+    psraw                  m%1, 1
+    pavgw                  m%1, m%2
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst)
+; dst/src can be the same register
+%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
+%if cpuflag(ssse3)
+    pshufb                  %1, %2, %3              ; abcdefgh -> bcdefghh
+%else
+    psrldq                  %1, %2, 2               ; abcdefgh -> bcdefgh.
+    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
+%endif
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
+%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
+%if cpuflag(ssse3)
+    pshufb                  %1, %3, %4              ; abcdefgh -> bcdefghh
+    pshufb                  %2, %1, %4              ; bcdefghh -> cdefghhh
+%else
+    psrldq                  %1, %3, 2               ; abcdefgh -> bcdefgh.
+    psrldq                  %2, %3, 4               ; abcdefgh -> cdefgh..
+    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
+    pshufhw                 %2, %2, q1110           ; cdefgh.. -> cdefghhh
+%endif
+%endmacro
+
+%macro DL_FUNCS 0
+cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
+    movifnidn               aq, amp
+    movu                    m1, [aq]                ; abcdefgh
+    pshufhw                 m0, m1, q3310           ; abcdefhh
+    SHIFT_RIGHT             m1, m1                  ; bcdefghh
+    psrldq                  m2, m1, 2               ; cdefghh.
+    LOWPASS                  0,  1,  2              ; BCDEFGh.
+    pshufd                  m1, m0, q3321           ; DEFGh...
+    movh      [dstq+strideq*0], m0
+    movh      [dstq+strideq*2], m1
+    add                   dstq, strideq
+    psrldq                  m0, 2                   ; CDEFGh..
+    psrldq                  m1, 2                   ; EFGh....
+    movh      [dstq+strideq*0], m0
+    movh      [dstq+strideq*2], m1
+    RET
+
+cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]                ; abcdefgh
+%if cpuflag(ssse3)
+    mova                    m4, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m1, m2, m0, m4          ; bcdefghh/cdefghhh
+    LOWPASS                  0,  1,  2              ; BCDEFGHh
+    shufps                  m1, m0, m2, q3332       ; FGHhhhhh
+    shufps                  m3, m0, m1, q2121       ; DEFGHhhh
+    DEFINE_ARGS dst, stride, stride5
+    lea               stride5q, [strideq*5]
+
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*4], m1
+    SHIFT_RIGHT             m0, m0, m4              ; CDEFGHhh
+    pshuflw                 m1, m1, q3321           ; GHhhhhhh
+    pshufd                  m2, m0, q3321           ; EFGHhhhh
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+stride5q ], m1
+    lea                   dstq, [dstq+strideq*2]
+    pshuflw                 m1, m1, q3321           ; Hhhhhhhh
+    mova      [dstq+strideq*0], m3
+    mova      [dstq+strideq*4], m1
+    pshuflw                 m1, m1, q3321           ; hhhhhhhh
+    mova      [dstq+strideq*1], m2
+    mova      [dstq+stride5q ], m1
+    RET
+
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]                ; abcdefgh
+    mova                    m3, [aq+mmsize]         ; ijklmnop
+    PALIGNR                 m1, m3, m0, 2, m4       ; bcdefghi
+    PALIGNR                 m2, m3, m0, 4, m4       ; cdefghij
+    LOWPASS                  0,  1,  2              ; BCDEFGHI
+%if cpuflag(ssse3)
+    mova                    m4, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m2, m1, m3, m4          ; jklmnopp/klmnoppp
+    LOWPASS                  1,  2,  3              ; JKLMNOPp
+    pshufd                  m2, m2, q3333           ; pppppppp
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 8
+
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*8+ 0], m1
+    mova   [dstq+strideq*8+16], m2
+    add                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m0, m1, m0, 2
+%else
+    PALIGNR                 m3, m1, m0, 2, m4
+    mova                    m0, m3
+%endif
+    SHIFT_RIGHT             m1, m1, m4
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq+mmsize*0]       ; abcdefgh
+    mova                    m1, [aq+mmsize*1]       ; ijklmnop
+    mova                    m2, [aq+mmsize*2]       ; qrstuvwx
+    mova                    m3, [aq+mmsize*3]       ; yz012345
+    PALIGNR                 m4, m1, m0, 2, m6
+    PALIGNR                 m5, m1, m0, 4, m6
+    LOWPASS                  0,  4,  5              ; BCDEFGHI
+    PALIGNR                 m4, m2, m1, 2, m6
+    PALIGNR                 m5, m2, m1, 4, m6
+    LOWPASS                  1,  4,  5              ; JKLMNOPQ
+    PALIGNR                 m4, m3, m2, 2, m6
+    PALIGNR                 m5, m3, m2, 4, m6
+    LOWPASS                  2,  4,  5              ; RSTUVWXY
+%if cpuflag(ssse3)
+    mova                    m6, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m4, m5, m3, m6
+    LOWPASS                  3,  4,  5              ; Z0123455
+    pshufd                  m4, m4, q3333           ; 55555555
+    DEFINE_ARGS dst, stride, stride8, stride24, cnt
+    mov                   cntd, 8
+    lea               stride8q, [strideq*8]
+    lea              stride24q, [stride8q*3]
+
+.loop:
+    mova  [dstq+stride8q*0+ 0], m0
+    mova  [dstq+stride8q*0+16], m1
+    mova  [dstq+stride8q*0+32], m2
+    mova  [dstq+stride8q*0+48], m3
+    mova  [dstq+stride8q*1+ 0], m1
+    mova  [dstq+stride8q*1+16], m2
+    mova  [dstq+stride8q*1+32], m3
+    mova  [dstq+stride8q*1+48], m4
+    mova  [dstq+stride8q*2+ 0], m2
+    mova  [dstq+stride8q*2+16], m3
+    mova  [dstq+stride8q*2+32], m4
+    mova  [dstq+stride8q*2+48], m4
+    mova  [dstq+stride24q + 0], m3
+    mova  [dstq+stride24q +16], m4
+    mova  [dstq+stride24q +32], m4
+    mova  [dstq+stride24q +48], m4
+    add                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m0, m1, m0, 2
+    vpalignr                m1, m2, m1, 2
+    vpalignr                m2, m3, m2, 2
+%else
+    PALIGNR                 m5, m1, m0, 2, m6
+    mova                    m0, m5
+    PALIGNR                 m5, m2, m1, 2, m6
+    mova                    m1, m5
+    PALIGNR                 m5, m3, m2, 2, m6
+    mova                    m2, m5
+%endif
+    SHIFT_RIGHT             m3, m3, m6
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DL_FUNCS
+INIT_XMM ssse3
+DL_FUNCS
+INIT_XMM avx
+DL_FUNCS
+
+%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
+    movh                    m0, [lq]                ; wxyz....
+    movhps                  m0, [aq-2]              ; wxyz*abc
+    movd                    m1, [aq+6]              ; d.......
+    PALIGNR                 m1, m0, 2, m2           ; xyz*abcd
+    psrldq                  m2, m1, 2               ; yz*abcd.
+    LOWPASS                  0, 1, 2                ; XYZ#ABC.
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    movh      [dstq+stride3q ], m0
+    psrldq                  m0, 2                   ; YZ#ABC..
+    movh      [dstq+strideq*2], m0
+    psrldq                  m0, 2                   ; Z#ABC...
+    movh      [dstq+strideq*1], m0
+    psrldq                  m0, 2                   ; #ABC....
+    movh      [dstq+strideq*0], m0
+    RET
+
+cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
+    mova                    m0, [lq]                ; stuvwxyz
+    movu                    m1, [aq-2]              ; *abcdefg
+    mova                    m2, [aq]                ; abcdefgh
+    psrldq                  m3, m2, 2               ; bcdefgh.
+    LOWPASS                  3,  2, 1               ; ABCDEFG.
+    PALIGNR                 m1, m0, 2, m4           ; tuvwxyz*
+    PALIGNR                 m2, m1, 2, m4           ; uvwxyz*a
+    LOWPASS                  2,  1, 0               ; TUVWXYZ#
+    DEFINE_ARGS dst, stride, dst4, stride3
+    lea               stride3q, [strideq*3]
+    lea                  dst4q, [dstq+strideq*4]
+
+    movhps [dstq +stride3q +0], m2
+    movh   [dstq+ stride3q +8], m3
+    mova   [dst4q+stride3q +0], m2
+    PALIGNR                 m1, m3, m2, 2, m0
+    psrldq                  m3, 2
+    movhps [dstq +strideq*2+0], m1
+    movh   [dstq+ strideq*2+8], m3
+    mova   [dst4q+strideq*2+0], m1
+    PALIGNR                 m2, m3, m1, 2, m0
+    psrldq                  m3, 2
+    movhps [dstq +strideq*1+0], m2
+    movh   [dstq+ strideq*1+8], m3
+    mova   [dst4q+strideq*1+0], m2
+    PALIGNR                 m1, m3, m2, 2, m0
+    psrldq                  m3, 2
+    movhps [dstq +strideq*0+0], m1
+    movh   [dstq+ strideq*0+8], m3
+    mova   [dst4q+strideq*0+0], m1
+    RET
+
+cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
+    mova                    m0, [lq]                ; klmnopqr
+    mova                    m1, [lq+mmsize]         ; stuvwxyz
+    movu                    m2, [aq-2]              ; *abcdefg
+    movu                    m3, [aq+mmsize-2]       ; hijklmno
+    mova                    m4, [aq]                ; abcdefgh
+    mova                    m5, [aq+mmsize]         ; ijklmnop
+    psrldq                  m6, m5, 2               ; jklmnop.
+    LOWPASS                  6,  5, 3               ; IJKLMNO.
+    PALIGNR                 m5, m4, 2, m3           ; bcdefghi
+    LOWPASS                  5,  4, 2               ; ABCDEFGH
+    PALIGNR                 m2, m1, 2, m3           ; tuvwxyz*
+    PALIGNR                 m4, m2, 2, m3           ; uvwxyz*a
+    LOWPASS                  4,  2, 1               ; TUVWXYZ#
+    PALIGNR                 m1, m0, 2, m3           ; lmnopqrs
+    PALIGNR                 m2, m1, 2, m3           ; mnopqrst
+    LOWPASS                  2, 1, 0                ; LMNOPQRS
+    DEFINE_ARGS dst, stride, dst8, cnt
+    lea                  dst8q, [dstq+strideq*8]
+    mov                   cntd, 8
+
+.loop:
+    sub                  dst8q, strideq
+    mova  [dst8q+strideq*0+ 0], m4
+    mova  [dst8q+strideq*0+16], m5
+    mova  [dst8q+strideq*8+ 0], m2
+    mova  [dst8q+strideq*8+16], m4
+%if cpuflag(avx)
+    vpalignr                m2, m4, m2, 2
+    vpalignr                m4, m5, m4, 2
+    vpalignr                m5, m6, m5, 2
+%else
+    PALIGNR                 m0, m4, m2, 2, m1
+    mova                    m2, m0
+    PALIGNR                 m0, m5, m4, 2, m1
+    mova                    m4, m0
+    PALIGNR                 m0, m6, m5, 2, m1
+    mova                    m5, m0
+%endif
+    psrldq                  m6, 2
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
+                               %1 * ARCH_X86_32 * mmsize, dst, stride, l, a
+    mova                    m0, [aq+mmsize*3]       ; a[24-31]
+    movu                    m1, [aq+mmsize*3-2]     ; a[23-30]
+    psrldq                  m2, m0, 2               ; a[25-31].
+    LOWPASS                  2,  0, 1               ; A[24-30].
+    mova                    m1, [aq+mmsize*2]       ; a[16-23]
+    movu                    m3, [aq+mmsize*2-2]     ; a[15-22]
+    PALIGNR                 m0, m1, 2, m4           ; a[17-24]
+    LOWPASS                  0,  1, 3               ; A[16-23]
+    mova                    m3, [aq+mmsize*1]       ; a[8-15]
+    movu                    m4, [aq+mmsize*1-2]     ; a[7-14]
+    PALIGNR                 m1, m3, 2, m5           ; a[9-16]
+    LOWPASS                  1,  3, 4               ; A[8-15]
+    mova                    m4, [aq+mmsize*0]       ; a[0-7]
+    movu                    m5, [aq+mmsize*0-2]     ; *a[0-6]
+    PALIGNR                 m3, m4, 2, m6           ; a[1-8]
+    LOWPASS                  3,  4, 5               ; A[0-7]
+    SCRATCH                  1,  8, rsp+0*mmsize
+    SCRATCH                  3,  9, rsp+1*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH                  0, 10, rsp+2*mmsize
+%endif
+    mova                    m6, [lq+mmsize*3]       ; l[24-31]
+    PALIGNR                 m5, m6, 2, m0           ; l[25-31]*
+    PALIGNR                 m4, m5, 2, m0           ; l[26-31]*a
+    LOWPASS                  4,  5, 6               ; L[25-31]#
+    mova                    m7, [lq+mmsize*2]       ; l[16-23]
+    PALIGNR                 m6, m7, 2, m0           ; l[17-24]
+    PALIGNR                 m5, m6, 2, m0           ; l[18-25]
+    LOWPASS                  5,  6, 7               ; L[17-24]
+    mova                    m1, [lq+mmsize*1]       ; l[8-15]
+    PALIGNR                 m7, m1, 2, m0           ; l[9-16]
+    PALIGNR                 m6, m7, 2, m0           ; l[10-17]
+    LOWPASS                  6,  7, 1               ; L[9-16]
+    mova                    m3, [lq+mmsize*0]       ; l[0-7]
+    PALIGNR                 m1, m3, 2, m0           ; l[1-8]
+    PALIGNR                 m7, m1, 2, m0           ; l[2-9]
+    LOWPASS                  7,  1, 3               ; L[1-8]
+%if cpuflag(ssse3)
+%if cpuflag(avx)
+    UNSCRATCH                1,  8, rsp+0*mmsize
+%endif
+    UNSCRATCH                3,  9, rsp+1*mmsize
+%else
+    UNSCRATCH                0, 10, rsp+2*mmsize
+%endif
+    DEFINE_ARGS dst8, stride, stride8, stride24, cnt
+    lea               stride8q, [strideq*8]
+    lea              stride24q, [stride8q*3]
+    lea                  dst8q, [dst8q+strideq*8]
+    mov                   cntd, 8
+
+.loop:
+    sub                  dst8q, strideq
+%if notcpuflag(avx)
+    UNSCRATCH                1,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH                3,  9, rsp+1*mmsize
+%endif
+%endif
+    mova [dst8q+stride8q*0+ 0], m4
+    mova [dst8q+stride8q*0+16], m3
+    mova [dst8q+stride8q*0+32], m1
+    mova [dst8q+stride8q*0+48], m0
+    mova [dst8q+stride8q*1+ 0], m5
+    mova [dst8q+stride8q*1+16], m4
+    mova [dst8q+stride8q*1+32], m3
+    mova [dst8q+stride8q*1+48], m1
+    mova [dst8q+stride8q*2+ 0], m6
+    mova [dst8q+stride8q*2+16], m5
+    mova [dst8q+stride8q*2+32], m4
+    mova [dst8q+stride8q*2+48], m3
+    mova [dst8q+stride24q + 0], m7
+    mova [dst8q+stride24q +16], m6
+    mova [dst8q+stride24q +32], m5
+    mova [dst8q+stride24q +48], m4
+%if cpuflag(avx)
+    vpalignr                m7, m6, m7, 2
+    vpalignr                m6, m5, m6, 2
+    vpalignr                m5, m4, m5, 2
+    vpalignr                m4, m3, m4, 2
+    vpalignr                m3, m1, m3, 2
+    vpalignr                m1, m0, m1, 2
+    vpalignr                m0, m2, m0, 2
+%else
+    SCRATCH                  2,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH                  0,  9, rsp+1*mmsize
+%endif
+    PALIGNR                 m2, m6, m7, 2, m0
+    mova                    m7, m2
+    PALIGNR                 m2, m5, m6, 2, m0
+    mova                    m6, m2
+    PALIGNR                 m2, m4, m5, 2, m0
+    mova                    m5, m2
+    PALIGNR                 m2, m3, m4, 2, m0
+    mova                    m4, m2
+    PALIGNR                 m2, m1, m3, 2, m0
+    mova                    m3, m2
+%if notcpuflag(ssse3)
+    UNSCRATCH                0,  9, rsp+1*mmsize
+    SCRATCH                  3,  9, rsp+1*mmsize
+%endif
+    PALIGNR                 m2, m0, m1, 2, m3
+    mova                    m1, m2
+    UNSCRATCH                2,  8, rsp+0*mmsize
+    SCRATCH                  1,  8, rsp+0*mmsize
+    PALIGNR                 m1, m2, m0, 2, m3
+    mova                    m0, m1
+%endif
+    psrldq                  m2, 2
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DR_FUNCS 3
+INIT_XMM ssse3
+DR_FUNCS 2
+INIT_XMM avx
+DR_FUNCS 2
+
+%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
+    movifnidn               aq, amp
+    movu                    m0, [aq]                ; abcdefgh
+    psrldq                  m1, m0, 2               ; bcdefgh.
+    psrldq                  m2, m0, 4               ; cdefgh..
+    LOWPASS                  2,  1, 0               ; BCDEFGH.
+    pavgw                   m1, m0                  ; ABCDEFG.
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    movh      [dstq+strideq*0], m1
+    movh      [dstq+strideq*1], m2
+    psrldq                  m1, 2
+    psrldq                  m2, 2
+    movh      [dstq+strideq*2], m1
+    movh      [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]                ; abcdefgh
+%if cpuflag(ssse3)
+    mova                    m3, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m1, m2, m0, m3          ; bcdefghh/cdefghhh
+    LOWPASS                  2,  1, 0               ; BCDEFGHh
+    pavgw                   m1, m0                  ; ABCDEFGh
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*1], m2
+    SHIFT_RIGHT             m1, m1, m3
+    SHIFT_RIGHT             m2, m2, m3
+    mova      [dstq+strideq*2], m1
+    mova      [dstq+stride3q ], m2
+    lea                   dstq, [dstq+strideq*4]
+    SHIFT_RIGHT             m1, m1, m3
+    SHIFT_RIGHT             m2, m2, m3
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*1], m2
+    SHIFT_RIGHT             m1, m1, m3
+    SHIFT_RIGHT             m2, m2, m3
+    mova      [dstq+strideq*2], m1
+    mova      [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    mova                    m1, [aq+mmsize]
+    PALIGNR                 m2, m1, m0, 2, m3
+    PALIGNR                 m3, m1, m0, 4, m4
+    LOWPASS                  3,  2,  0
+    pavgw                   m2, m0
+%if cpuflag(ssse3)
+    mova                    m4, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m5, m0, m1, m4
+    LOWPASS                  0,  5,  1
+    pavgw                   m1, m5
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 8
+
+.loop:
+    mova   [dstq+strideq*0+ 0], m2
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*1+ 0], m3
+    mova   [dstq+strideq*1+16], m0
+    lea                   dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+    vpalignr                m2, m1, m2, 2
+    vpalignr                m3, m0, m3, 2
+%else
+    PALIGNR                 m5, m1, m2, 2, m4
+    mova                    m2, m5
+    PALIGNR                 m5, m0, m3, 2, m4
+    mova                    m3, m5
+%endif
+    SHIFT_RIGHT             m1, m1, m4
+    SHIFT_RIGHT             m0, m0, m4
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq+mmsize*0]
+    mova                    m1, [aq+mmsize*1]
+    mova                    m2, [aq+mmsize*2]
+    PALIGNR                 m6, m1, m0, 2, m5
+    PALIGNR                 m7, m1, m0, 4, m5
+    LOWPASS                  7,  6,  0
+    pavgw                   m6, m0
+    SCRATCH                  6,  8, rsp+0*mmsize
+    PALIGNR                 m4, m2, m1, 2, m0
+    PALIGNR                 m5, m2, m1, 4, m0
+    LOWPASS                  5,  4,  1
+    pavgw                   m4, m1
+    mova                    m0, [aq+mmsize*3]
+    PALIGNR                 m1, m0, m2, 2, m6
+    PALIGNR                 m3, m0, m2, 4, m6
+    LOWPASS                  3,  1,  2
+    pavgw                   m2, m1
+%if cpuflag(ssse3)
+    PRELOAD                 10, pb_2to15_14_15, shuf
+%endif
+    SHIFT_RIGHTx2           m6, m1, m0, reg_shuf
+    LOWPASS                  1,  6,  0
+    pavgw                   m0, m6
+%if ARCH_X86_64
+    pshufd                  m9, m6, q3333
+%endif
+%if cpuflag(avx)
+    UNSCRATCH                6,  8, rsp+0*mmsize
+%endif
+    DEFINE_ARGS dst, stride, cnt, stride16, stride17
+    mov              stride16q, strideq
+    mov                   cntd, 8
+    shl              stride16q, 4
+    lea              stride17q, [stride16q+strideq]
+
+    ; FIXME m8 is unused for avx, so we could save one register here for win64
+.loop:
+%if notcpuflag(avx)
+    UNSCRATCH                6,  8, rsp+0*mmsize
+%endif
+    mova   [dstq+strideq*0+ 0], m6
+    mova   [dstq+strideq*0+16], m4
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m7
+    mova   [dstq+strideq*1+16], m5
+    mova   [dstq+strideq*1+32], m3
+    mova   [dstq+strideq*1+48], m1
+    mova   [dstq+stride16q+ 0], m4
+    mova   [dstq+stride16q+16], m2
+    mova   [dstq+stride16q+32], m0
+%if ARCH_X86_64
+    mova   [dstq+stride16q+48], m9
+%endif
+    mova   [dstq+stride17q+ 0], m5
+    mova   [dstq+stride17q+16], m3
+    mova   [dstq+stride17q+32], m1
+%if ARCH_X86_64
+    mova   [dstq+stride17q+48], m9
+%endif
+    lea                   dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+    vpalignr                m6, m4, m6, 2
+    vpalignr                m4, m2, m4, 2
+    vpalignr                m2, m0, m2, 2
+    vpalignr                m7, m5, m7, 2
+    vpalignr                m5, m3, m5, 2
+    vpalignr                m3, m1, m3, 2
+%else
+    SCRATCH                  3,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH                  1, 10, rsp+1*mmsize
+%endif
+    PALIGNR                 m3, m4, m6, 2, m1
+    mova                    m6, m3
+    PALIGNR                 m3, m2, m4, 2, m1
+    mova                    m4, m3
+    PALIGNR                 m3, m0, m2, 2, m1
+    mova                    m2, m3
+    PALIGNR                 m3, m5, m7, 2, m1
+    mova                    m7, m3
+    UNSCRATCH                3,  8, rsp+0*mmsize
+    SCRATCH                  6,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH                1, 10, rsp+1*mmsize
+    SCRATCH                  7, 10, rsp+1*mmsize
+%endif
+    PALIGNR                 m6, m3, m5, 2, m7
+    mova                    m5, m6
+    PALIGNR                 m6, m1, m3, 2, m7
+    mova                    m3, m6
+%if notcpuflag(ssse3)
+    UNSCRATCH                7, 10, rsp+1*mmsize
+%endif
+%endif
+    SHIFT_RIGHT             m1, m1, reg_shuf
+    SHIFT_RIGHT             m0, m0, reg_shuf
+    dec                   cntd
+    jg .loop
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+%assign %%n 0
+%rep 4
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+48], m0
+    mova   [dstq+strideq*2+48], m0
+    mova   [dstq+stride3q +48], m0
+%if %%n < 3
+    lea                   dstq, [dstq+strideq*4]
+%endif
+%assign %%n (%%n+1)
+%endrep
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+VL_FUNCS 2
+INIT_XMM ssse3
+VL_FUNCS 1
+INIT_XMM avx
+VL_FUNCS 1
+
+%macro VR_FUNCS 0
+cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
+    movu                    m0, [aq-2]
+    movhps                  m1, [lq]
+    PALIGNR                 m0, m1, 10, m2          ; xyz*abcd
+    pslldq                  m1, m0, 2               ; .xyz*abc
+    pslldq                  m2, m0, 4               ; ..xyz*ab
+    LOWPASS                  2,  1, 0               ; ..YZ#ABC
+    pavgw                   m1, m0                  ; ....#ABC
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    movhps    [dstq+strideq*0], m1
+    movhps    [dstq+strideq*1], m2
+    shufps                  m0, m2, m1, q3210
+%if cpuflag(ssse3)
+    pshufb                  m2, [pb_4_5_8to13_8x0]
+%else
+    pshuflw                 m2, m2, q2222
+    psrldq                  m2, 6
+%endif
+    psrldq                  m0, 6
+    movh      [dstq+strideq*2], m0
+    movh      [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
+    movu                    m1, [aq-2]              ; *abcdefg
+    movu                    m2, [lq]                ; stuvwxyz
+    mova                    m0, [aq]                ; abcdefgh
+    PALIGNR                 m3, m1, m2, 14, m4      ; z*abcdef
+    LOWPASS                  3,  1,  0
+    pavgw                   m0, m1
+    PALIGNR                 m1, m2,  2, m4          ; tuvwxyz*
+    pslldq                  m4, m2,  2              ; .stuvwxy
+    LOWPASS                  4,  2,  1
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m3
+    PALIGNR                 m0, m4, 14, m1
+    pslldq                  m4, 2
+    PALIGNR                 m3, m4, 14, m1
+    pslldq                  m4, 2
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    PALIGNR                 m0, m4, 14, m1
+    pslldq                  m4, 2
+    PALIGNR                 m3, m4, 14, m1
+    pslldq                  m4, 2
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m3
+    PALIGNR                 m0, m4, 14, m1
+    pslldq                  m4, 2
+    PALIGNR                 m3, m4, 14, m4
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m3
+    RET
+
+cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
+    movu                    m1, [aq-2]              ; *abcdefg
+    movu                    m2, [aq+mmsize-2]       ; hijklmno
+    mova                    m3, [aq]                ; abcdefgh
+    mova                    m4, [aq+mmsize]         ; ijklmnop
+    mova                    m5, [lq+mmsize]         ; stuvwxyz
+    PALIGNR                 m0, m1, m5, 14, m6      ; z*abcdef
+    movu                    m6, [aq+mmsize-4]       ; ghijklmn
+    LOWPASS                  6,  2,  4
+    pavgw                   m2, m4
+    LOWPASS                  0,  1,  3
+    pavgw                   m3, m1
+    PALIGNR                 m1, m5,  2, m7          ; tuvwxyz*
+    movu                    m7, [lq+mmsize-2]       ; rstuvwxy
+    LOWPASS                  1,  5,  7
+    movu                    m5, [lq+2]              ; lmnopqrs
+    pslldq                  m4, m5,  2              ; .lmnopqr
+    pslldq                  m7, m5,  4              ; ..lmnopq
+    LOWPASS                  5,  4,  7
+    psrld                   m4, m1, 16
+    psrld                   m7, m5, 16
+    pand                    m1, [pd_65535]
+    pand                    m5, [pd_65535]
+    packssdw                m7, m4
+    packssdw                m5, m1
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 8
+
+.loop:
+    mova   [dstq+strideq*0+ 0], m3
+    mova   [dstq+strideq*0+16], m2
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m6
+    lea                   dstq, [dstq+strideq*2]
+    PALIGNR                 m2, m3, 14, m4
+    PALIGNR                 m3, m7, 14, m4
+    pslldq                  m7, 2
+    PALIGNR                 m6, m0, 14, m4
+    PALIGNR                 m0, m5, 14, m4
+    pslldq                  m5, 2
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
+    movu                    m0, [aq+mmsize*0-2]     ; *a[0-6]
+    movu                    m1, [aq+mmsize*1-2]     ; a[7-14]
+    movu                    m2, [aq+mmsize*2-2]     ; a[15-22]
+    movu                    m3, [aq+mmsize*3-2]     ; a[23-30]
+    mova                    m4, [aq+mmsize*3+0]     ; a[24-31]
+    movu                    m5, [aq+mmsize*3-4]     ; a[22-29]
+    LOWPASS                  5,  3,  4              ; A[23-30]
+    SCRATCH                  5,  8, rsp+0*mmsize
+    pavgw                   m3, m4
+    mova                    m4, [aq+mmsize*2+0]     ; a[16-23]
+    movu                    m6, [aq+mmsize*2-4]     ; a[14-21]
+    LOWPASS                  6,  2,  4              ; A[15-22]
+    SCRATCH                  6,  9, rsp+1*mmsize
+    pavgw                   m2, m4
+    mova                    m4, [aq+mmsize*1+0]     ; a[8-15]
+    movu                    m7, [aq+mmsize*1-4]     ; a[6-13]
+    LOWPASS                  7,  1,  4              ; A[7-14]
+    SCRATCH                  7, 10, rsp+2*mmsize
+    pavgw                   m1, m4
+    mova                    m4, [aq+mmsize*0+0]     ; a[0-7]
+    mova                    m5, [lq+mmsize*3+0]     ; l[24-31]
+    PALIGNR                 m6, m0, m5, 14, m7      ; l[31]*a[0-5]
+    LOWPASS                  6,  0,  4              ; #A[0-6]
+    SCRATCH                  6, 11, rsp+3*mmsize
+    pavgw                   m4, m0
+    PALIGNR                 m0, m5,  2, m7          ; l[25-31]*
+    movu                    m7, [lq+mmsize*3-2]     ; l[23-30]
+    LOWPASS                  0,  5,  7              ; L[24-31]
+    movu                    m5, [lq+mmsize*2-2]     ; l[15-22]
+    mova                    m7, [lq+mmsize*2+0]     ; l[16-23]
+    movu                    m6, [lq+mmsize*2+2]     ; l[17-24]
+    LOWPASS                  5,  7,  6              ; L[16-23]
+    psrld                   m7, m0, 16
+    psrld                   m6, m5, 16
+    pand                    m0, [pd_65535]
+    pand                    m5, [pd_65535]
+    packssdw                m6, m7
+    packssdw                m5, m0
+    SCRATCH                  5, 12, rsp+4*mmsize
+    SCRATCH                  6, 13, rsp+5*mmsize
+    movu                    m6, [lq+mmsize*1-2]     ; l[7-14]
+    mova                    m0, [lq+mmsize*1+0]     ; l[8-15]
+    movu                    m5, [lq+mmsize*1+2]     ; l[9-16]
+    LOWPASS                  6,  0,  5              ; L[8-15]
+    movu                    m0, [lq+mmsize*0+2]     ; l[1-8]
+    pslldq                  m5, m0,  2              ; .l[1-7]
+    pslldq                  m7, m0,  4              ; ..l[1-6]
+    LOWPASS                  0,  5,  7
+    psrld                   m5, m6, 16
+    psrld                   m7, m0, 16
+    pand                    m6, [pd_65535]
+    pand                    m0, [pd_65535]
+    packssdw                m7, m5
+    packssdw                m0, m6
+    UNSCRATCH                6, 13, rsp+5*mmsize
+    DEFINE_ARGS dst, stride, stride16, cnt, stride17
+    mov              stride16q, strideq
+    mov                   cntd, 8
+    shl              stride16q, 4
+%if ARCH_X86_64
+    lea              stride17q, [stride16q+strideq]
+%endif
+
+.loop:
+    mova   [dstq+strideq*0+ 0], m4
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m3
+%if ARCH_X86_64
+    mova   [dstq+strideq*1+ 0], m11
+    mova   [dstq+strideq*1+16], m10
+    mova   [dstq+strideq*1+32], m9
+    mova   [dstq+strideq*1+48], m8
+%endif
+    mova   [dstq+stride16q+ 0], m6
+    mova   [dstq+stride16q+16], m4
+    mova   [dstq+stride16q+32], m1
+    mova   [dstq+stride16q+48], m2
+%if ARCH_X86_64
+    mova   [dstq+stride17q+ 0], m12
+    mova   [dstq+stride17q+16], m11
+    mova   [dstq+stride17q+32], m10
+    mova   [dstq+stride17q+48], m9
+%endif
+    lea                   dstq, [dstq+strideq*2]
+    PALIGNR                 m3, m2,  14, m5
+    PALIGNR                 m2, m1,  14, m5
+    PALIGNR                 m1, m4,  14, m5
+    PALIGNR                 m4, m6,  14, m5
+    PALIGNR                 m6, m7,  14, m5
+    pslldq                  m7, 2
+%if ARCH_X86_64
+    PALIGNR                 m8, m9,  14, m5
+    PALIGNR                 m9, m10, 14, m5
+    PALIGNR                m10, m11, 14, m5
+    PALIGNR                m11, m12, 14, m5
+    PALIGNR                m12, m0,  14, m5
+    pslldq                  m0, 2
+%endif
+    dec                   cntd
+    jg .loop
+
+%if ARCH_X86_32
+    UNSCRATCH                5, 12, rsp+4*mmsize
+    UNSCRATCH                4, 11, rsp+3*mmsize
+    UNSCRATCH                3, 10, rsp+2*mmsize
+    UNSCRATCH                2,  9, rsp+1*mmsize
+    UNSCRATCH                1,  8, rsp+0*mmsize
+    mov                   dstq, dstm
+    mov                   cntd, 8
+    add                   dstq, strideq
+.loop2:
+    mova   [dstq+strideq*0+ 0], m4
+    mova   [dstq+strideq*0+16], m3
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m1
+    mova   [dstq+stride16q+ 0], m5
+    mova   [dstq+stride16q+16], m4
+    mova   [dstq+stride16q+32], m3
+    mova   [dstq+stride16q+48], m2
+    lea                   dstq, [dstq+strideq*2]
+    PALIGNR                 m1, m2,  14, m6
+    PALIGNR                 m2, m3,  14, m6
+    PALIGNR                 m3, m4,  14, m6
+    PALIGNR                 m4, m5,  14, m6
+    PALIGNR                 m5, m0,  14, m6
+    pslldq                  m0, 2
+    dec                   cntd
+    jg .loop2
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+VR_FUNCS
+INIT_XMM ssse3
+VR_FUNCS
+INIT_XMM avx
+VR_FUNCS
+
+%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
+    movh                    m0, [lq]                ; abcd
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_0to7_67x4]      ; abcddddd
+%else
+    punpcklqdq              m0, m0
+    pshufhw                 m0, m0, q3333           ; abcddddd
+%endif
+    psrldq                  m1, m0,  2              ; bcddddd.
+    psrldq                  m2, m0,  4              ; cddddd..
+    LOWPASS                  2,  1,  0              ; BCDddd..
+    pavgw                   m1, m0                  ; abcddddd
+    SBUTTERFLY          wd,  1,  2,  0              ; aBbCcDdd, dddddddd
+    PALIGNR                 m2, m1,  4, m0          ; bCcDdddd
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    movh      [dstq+strideq*0], m1                  ; aBbC
+    movh      [dstq+strideq*1], m2                  ; bCcD
+    movhps    [dstq+strideq*2], m1                  ; cDdd
+    movhps    [dstq+stride3q ], m2                  ; dddd
+    RET
+
+cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
+    mova                    m0, [lq]
+%if cpuflag(ssse3)
+    mova                    m3, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m1, m2, m0, m3
+    LOWPASS                  2,  1,  0
+    pavgw                   m1, m0
+    SBUTTERFLY          wd,  1,  2,  0
+    shufps                  m0, m1, m2, q1032
+    pshufd                  m3, m2, q3332
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    mova     [dstq+strideq *0], m1
+    mova     [dstq+strideq *2], m0
+    mova     [dstq+strideq *4], m2
+    mova     [dstq+stride3q*2], m3
+    add                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m1, m2, m1, 4
+%else
+    PALIGNR                 m0, m2, m1, 4, m3
+    mova                    m1, m0
+%endif
+    pshufd                  m2, m2, q3321
+    shufps                  m0, m1, m2, q1032
+    pshufd                  m3, m2, q3332
+    mova     [dstq+strideq *0], m1
+    mova     [dstq+strideq *2], m0
+    mova     [dstq+strideq *4], m2
+    mova     [dstq+stride3q*2], m3
+    RET
+
+cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m3, [lq+mmsize]
+    movu                    m1, [lq+2]
+    movu                    m2, [lq+4]
+    LOWPASS                  2,  1,  0
+    pavgw                   m1, m0
+    SBUTTERFLY           wd, 1,  2,  0
+%if cpuflag(ssse3)
+    mova                    m5, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m0, m4, m3, m5
+    LOWPASS                  4,  0,  3
+    pavgw                   m3, m0
+    SBUTTERFLY           wd, 3,  4,  5
+    pshufd                  m0, m0, q3333
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+
+.loop:
+    mova  [dstq+strideq *0+ 0], m1
+    mova  [dstq+strideq *0+16], m2
+    mova  [dstq+strideq *4+ 0], m2
+    mova  [dstq+strideq *4+16], m3
+    mova  [dstq+strideq *8+ 0], m3
+    mova  [dstq+strideq *8+16], m4
+    mova  [dstq+stride3q*4+ 0], m4
+    mova  [dstq+stride3q*4+16], m0
+    add                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m1, m2, m1, 4
+    vpalignr                m2, m3, m2, 4
+    vpalignr                m3, m4, m3, 4
+    vpalignr                m4, m0, m4, 4
+%else
+    PALIGNR                 m5, m2, m1, 4, m6
+    mova                    m1, m5
+    PALIGNR                 m5, m3, m2, 4, m6
+    mova                    m2, m5
+    PALIGNR                 m5, m4, m3, 4, m6
+    mova                    m3, m5
+    PALIGNR                 m5, m0, m4, 4, m6
+    mova                    m4, m5
+%endif
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
+                               %1 * mmsize * ARCH_X86_32, dst, stride, l, a
+    mova                    m2, [lq+mmsize*0+0]
+    movu                    m1, [lq+mmsize*0+2]
+    movu                    m0, [lq+mmsize*0+4]
+    LOWPASS                  0,  1,  2
+    pavgw                   m1, m2
+    SBUTTERFLY           wd, 1,  0,  2
+    SCRATCH                  1,  8, rsp+0*mmsize
+    mova                    m4, [lq+mmsize*1+0]
+    movu                    m3, [lq+mmsize*1+2]
+    movu                    m2, [lq+mmsize*1+4]
+    LOWPASS                  2,  3,  4
+    pavgw                   m3, m4
+    SBUTTERFLY           wd, 3,  2,  4
+    mova                    m6, [lq+mmsize*2+0]
+    movu                    m5, [lq+mmsize*2+2]
+    movu                    m4, [lq+mmsize*2+4]
+    LOWPASS                  4,  5,  6
+    pavgw                   m5, m6
+    SBUTTERFLY           wd, 5,  4,  6
+    mova                    m7, [lq+mmsize*3+0]
+    SCRATCH                  0,  9, rsp+1*mmsize
+%if cpuflag(ssse3)
+    mova                    m0, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m1, m6, m7, m0
+    LOWPASS                  6,  1,  7
+    pavgw                   m7, m1
+    SBUTTERFLY           wd, 7,  6,  0
+    pshufd                  m1, m1, q3333
+    UNSCRATCH                0,  9, rsp+1*mmsize
+    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+    lea               stride3q, [strideq*3]
+    lea               stride4q, [strideq*4]
+    lea              stride28q, [stride4q*8]
+    lea              stride20q, [stride4q*5]
+    sub              stride28q, stride4q
+    mov                   cntd, 4
+
+.loop:
+%if ARCH_X86_64
+    SWAP                     1,  8
+%else
+    mova        [rsp+1*mmsize], m1
+    mova                    m1, [rsp+0*mmsize]
+%endif
+    mova  [dstq+strideq *0+ 0], m1
+    mova  [dstq+strideq *0+16], m0
+    mova  [dstq+strideq *0+32], m3
+    mova  [dstq+strideq *0+48], m2
+    mova  [dstq+stride4q*1+ 0], m0
+    mova  [dstq+stride4q*1+16], m3
+    mova  [dstq+stride4q*1+32], m2
+    mova  [dstq+stride4q*1+48], m5
+    mova  [dstq+stride4q*2+ 0], m3
+    mova  [dstq+stride4q*2+16], m2
+    mova  [dstq+stride4q*2+32], m5
+    mova  [dstq+stride4q*2+48], m4
+%if cpuflag(avx)
+    vpalignr                m1, m0, m1, 4
+    vpalignr                m0, m3, m0, 4
+    vpalignr                m3, m2, m3, 4
+%else
+    SCRATCH                  6,  9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH                  7, 10, rsp+3*mmsize
+%endif
+    PALIGNR                 m6, m0, m1, 4, m7
+    mova                    m1, m6
+    PALIGNR                 m6, m3, m0, 4, m7
+    mova                    m0, m6
+    PALIGNR                 m6, m2, m3, 4, m7
+    mova                    m3, m6
+    UNSCRATCH                6,  9, rsp+2*mmsize
+    SCRATCH                  0,  9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH                7, 10, rsp+3*mmsize
+    SCRATCH                  3, 10, rsp+3*mmsize
+%endif
+%endif
+%if ARCH_X86_64
+    SWAP                     1,  8
+%else
+    mova        [rsp+0*mmsize], m1
+    mova                    m1, [rsp+1*mmsize]
+%endif
+    mova  [dstq+stride3q*4+ 0], m2
+    mova  [dstq+stride3q*4+16], m5
+    mova  [dstq+stride3q*4+32], m4
+    mova  [dstq+stride3q*4+48], m7
+    mova  [dstq+stride4q*4+ 0], m5
+    mova  [dstq+stride4q*4+16], m4
+    mova  [dstq+stride4q*4+32], m7
+    mova  [dstq+stride4q*4+48], m6
+    mova  [dstq+stride20q + 0], m4
+    mova  [dstq+stride20q +16], m7
+    mova  [dstq+stride20q +32], m6
+    mova  [dstq+stride20q +48], m1
+    mova  [dstq+stride3q*8+ 0], m7
+    mova  [dstq+stride3q*8+16], m6
+    mova  [dstq+stride3q*8+32], m1
+    mova  [dstq+stride3q*8+48], m1
+    mova  [dstq+stride28q + 0], m6
+    mova  [dstq+stride28q +16], m1
+    mova  [dstq+stride28q +32], m1
+    mova  [dstq+stride28q +48], m1
+%if cpuflag(avx)
+    vpalignr                m2, m5, m2, 4
+    vpalignr                m5, m4, m5, 4
+    vpalignr                m4, m7, m4, 4
+    vpalignr                m7, m6, m7, 4
+    vpalignr                m6, m1, m6, 4
+%else
+    PALIGNR                 m0, m5, m2, 4, m3
+    mova                    m2, m0
+    PALIGNR                 m0, m4, m5, 4, m3
+    mova                    m5, m0
+    PALIGNR                 m0, m7, m4, 4, m3
+    mova                    m4, m0
+    PALIGNR                 m0, m6, m7, 4, m3
+    mova                    m7, m0
+    PALIGNR                 m0, m1, m6, 4, m3
+    mova                    m6, m0
+    UNSCRATCH                0,  9, rsp+2*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH                3, 10, rsp+3*mmsize
+%endif
+%endif
+    add                   dstq, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+HU_FUNCS 4
+INIT_XMM ssse3
+HU_FUNCS 3
+INIT_XMM avx
+HU_FUNCS 2
+
+%macro HD_FUNCS 0
+cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
+    movh                    m0, [lq]
+    movhps                  m0, [aq-2]
+    psrldq                  m1, m0, 2
+    psrldq                  m2, m0, 4
+    LOWPASS                  2,  1,  0
+    pavgw                   m1, m0
+    punpcklwd               m1, m2
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    movh      [dstq+stride3q ], m1
+    movhps    [dstq+strideq*1], m1
+    movhlps                 m2, m2
+    PALIGNR                 m2, m1, 4, m0
+    movh      [dstq+strideq*2], m2
+    movhps    [dstq+strideq*0], m2
+    RET
+
+cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
+    mova                    m0, [lq]
+    movu                    m1, [aq-2]
+    PALIGNR                 m2, m1, m0, 2, m3
+    PALIGNR                 m3, m1, m0, 4, m4
+    LOWPASS                  3,  2,  0
+    pavgw                   m2, m0
+    SBUTTERFLY           wd, 2,  3,  0
+    psrldq                  m0, m1,  2
+    psrldq                  m4, m1,  4
+    LOWPASS                  1,  0,  4
+    DEFINE_ARGS dst8, mstride, cnt
+    lea                  dst8q, [dst8q+mstrideq*8]
+    neg               mstrideq
+    mov                   cntd, 4
+
+.loop:
+    add                  dst8q, mstrideq
+    mova    [dst8q+mstrideq*0], m2
+    mova    [dst8q+mstrideq*4], m3
+%if cpuflag(avx)
+    vpalignr                m2, m3, m2, 4
+    vpalignr                m3, m1, m3, 4
+%else
+    PALIGNR                 m0, m3, m2, 4, m4
+    mova                    m2, m0
+    PALIGNR                 m0, m1, m3, 4, m4
+    mova                    m3, m0
+%endif
+    psrldq                  m1, 4
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
+    mova                    m2, [lq]
+    movu                    m1, [lq+2]
+    movu                    m0, [lq+4]
+    LOWPASS                  0,  1,  2
+    pavgw                   m1, m2
+    mova                    m4, [lq+mmsize]
+    movu                    m5, [aq-2]
+    PALIGNR                 m3, m5, m4, 2, m6
+    PALIGNR                 m2, m5, m4, 4, m6
+    LOWPASS                  2,  3,  4
+    pavgw                   m3, m4
+    SBUTTERFLY           wd, 1,  0,  4
+    SBUTTERFLY           wd, 3,  2,  4
+    mova                    m6, [aq]
+    movu                    m4, [aq+2]
+    LOWPASS                  4,  6,  5
+    movu                    m5, [aq+mmsize-2]
+    psrldq                  m6, m5,  2
+    psrldq                  m7, m5,  4
+    LOWPASS                  5,  6,  7
+    DEFINE_ARGS dst, mstride, mstride3, cnt
+    lea                   dstq, [dstq+mstrideq*8]
+    lea                   dstq, [dstq+mstrideq*8]
+    neg               mstrideq
+    lea              mstride3q, [mstrideq*3]
+    mov                   cntd, 4
+
+.loop:
+    add                  dstq, mstrideq
+    mova [dstq+mstride3q*4+ 0], m2
+    mova [dstq+mstride3q*4+16], m4
+    mova [dstq+mstrideq *8+ 0], m3
+    mova [dstq+mstrideq *8+16], m2
+    mova [dstq+mstrideq *4+ 0], m0
+    mova [dstq+mstrideq *4+16], m3
+    mova [dstq+mstrideq *0+ 0], m1
+    mova [dstq+mstrideq *0+16], m0
+%if cpuflag(avx)
+    vpalignr                m1, m0, m1, 4
+    vpalignr                m0, m3, m0, 4
+    vpalignr                m3, m2, m3, 4
+    vpalignr                m2, m4, m2, 4
+    vpalignr                m4, m5, m4, 4
+%else
+    PALIGNR                 m6, m0, m1, 4, m7
+    mova                    m1, m6
+    PALIGNR                 m6, m3, m0, 4, m7
+    mova                    m0, m6
+    PALIGNR                 m6, m2, m3, 4, m7
+    mova                    m3, m6
+    PALIGNR                 m6, m4, m2, 4, m7
+    mova                    m2, m6
+    PALIGNR                 m6, m5, m4, 4, m7
+    mova                    m4, m6
+%endif
+    psrldq                  m5, 4
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
+                               10 * mmsize * ARCH_X86_32, dst, stride, l, a
+    mova                    m2, [lq+mmsize*0+0]
+    movu                    m1, [lq+mmsize*0+2]
+    movu                    m0, [lq+mmsize*0+4]
+    LOWPASS                  0,  1,  2
+    pavgw                   m1, m2
+    SBUTTERFLY           wd, 1,  0,  2
+    mova                    m4, [lq+mmsize*1+0]
+    movu                    m3, [lq+mmsize*1+2]
+    movu                    m2, [lq+mmsize*1+4]
+    LOWPASS                  2,  3,  4
+    pavgw                   m3, m4
+    SBUTTERFLY           wd, 3,  2,  4
+    SCRATCH                  0,  8, rsp+0*mmsize
+    SCRATCH                  1,  9, rsp+1*mmsize
+    SCRATCH                  2, 10, rsp+2*mmsize
+    SCRATCH                  3, 11, rsp+3*mmsize
+    mova                    m6, [lq+mmsize*2+0]
+    movu                    m5, [lq+mmsize*2+2]
+    movu                    m4, [lq+mmsize*2+4]
+    LOWPASS                  4,  5,  6
+    pavgw                   m5, m6
+    SBUTTERFLY           wd, 5,  4,  6
+    mova                    m0, [lq+mmsize*3+0]
+    movu                    m1, [aq+mmsize*0-2]
+    PALIGNR                 m7, m1, m0, 2, m2
+    PALIGNR                 m6, m1, m0, 4, m2
+    LOWPASS                  6,  7,  0
+    pavgw                   m7, m0
+    SBUTTERFLY           wd, 7,  6,  0
+    mova                    m2, [aq+mmsize*0+0]
+    movu                    m0, [aq+mmsize*0+2]
+    LOWPASS                  0,  2,  1
+    movu                    m1, [aq+mmsize*1-2]
+    mova                    m2, [aq+mmsize*1+0]
+    movu                    m3, [aq+mmsize*1+2]
+    LOWPASS                  1,  2,  3
+    SCRATCH                  6, 12, rsp+6*mmsize
+    SCRATCH                  7, 13, rsp+7*mmsize
+    movu                    m2, [aq+mmsize*2-2]
+    mova                    m3, [aq+mmsize*2+0]
+    movu                    m6, [aq+mmsize*2+2]
+    LOWPASS                  2,  3,  6
+    movu                    m3, [aq+mmsize*3-2]
+    psrldq                  m6, m3,  2
+    psrldq                  m7, m3,  4
+    LOWPASS                  3,  6,  7
+    UNSCRATCH                6, 12, rsp+6*mmsize
+    UNSCRATCH                7, 13, rsp+7*mmsize
+%if ARCH_X86_32
+    mova        [rsp+4*mmsize], m4
+    mova        [rsp+5*mmsize], m5
+    ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
+    ; to do it again here
+%endif
+    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+    mov                   cntd, 4
+    lea               stride3q, [strideq*3]
+%if ARCH_X86_64
+    lea               stride4q, [strideq*4]
+    lea              stride28q, [stride4q*8]
+    lea              stride20q, [stride4q*5]
+    sub              stride28q, stride4q
+%endif
+    add                   dstq, stride3q
+
+    ; x86-32 doesn't have enough registers, so on that platform, we split
+    ; the loop in 2... Otherwise you spend most of the loop (un)scratching
+.loop:
+%if ARCH_X86_64
+    mova  [dstq+stride28q + 0], m9
+    mova  [dstq+stride28q +16], m8
+    mova  [dstq+stride28q +32], m11
+    mova  [dstq+stride28q +48], m10
+    mova  [dstq+stride3q*8+ 0], m8
+    mova  [dstq+stride3q*8+16], m11
+    mova  [dstq+stride3q*8+32], m10
+    mova  [dstq+stride3q*8+48], m5
+    mova  [dstq+stride20q + 0], m11
+    mova  [dstq+stride20q +16], m10
+    mova  [dstq+stride20q +32], m5
+    mova  [dstq+stride20q +48], m4
+    mova  [dstq+stride4q*4+ 0], m10
+    mova  [dstq+stride4q*4+16], m5
+    mova  [dstq+stride4q*4+32], m4
+    mova  [dstq+stride4q*4+48], m7
+%endif
+    mova  [dstq+stride3q*4+ 0], m5
+    mova  [dstq+stride3q*4+16], m4
+    mova  [dstq+stride3q*4+32], m7
+    mova  [dstq+stride3q*4+48], m6
+    mova  [dstq+strideq* 8+ 0], m4
+    mova  [dstq+strideq* 8+16], m7
+    mova  [dstq+strideq* 8+32], m6
+    mova  [dstq+strideq* 8+48], m0
+    mova  [dstq+strideq* 4+ 0], m7
+    mova  [dstq+strideq* 4+16], m6
+    mova  [dstq+strideq* 4+32], m0
+    mova  [dstq+strideq* 4+48], m1
+    mova  [dstq+strideq* 0+ 0], m6
+    mova  [dstq+strideq* 0+16], m0
+    mova  [dstq+strideq* 0+32], m1
+    mova  [dstq+strideq* 0+48], m2
+    sub                   dstq, strideq
+%if cpuflag(avx)
+%if ARCH_X86_64
+    vpalignr                m9, m8,  m9,  4
+    vpalignr                m8, m11, m8,  4
+    vpalignr               m11, m10, m11, 4
+    vpalignr               m10, m5,  m10, 4
+%endif
+    vpalignr                m5, m4,  m5,  4
+    vpalignr                m4, m7,  m4,  4
+    vpalignr                m7, m6,  m7,  4
+    vpalignr                m6, m0,  m6,  4
+    vpalignr                m0, m1,  m0,  4
+    vpalignr                m1, m2,  m1,  4
+    vpalignr                m2, m3,  m2,  4
+%else
+%if ARCH_X86_64
+    PALIGNR                m12, m8,  m9,  4, m13
+    mova                    m9, m12
+    PALIGNR                m12, m11, m8,  4, m13
+    mova                    m8, m12
+    PALIGNR                m12, m10, m11, 4, m13
+    mova                   m11, m12
+    PALIGNR                m12, m5,  m10, 4, m13
+    mova                   m10, m12
+%endif
+    SCRATCH                  3, 12, rsp+8*mmsize, sh
+%if notcpuflag(ssse3)
+    SCRATCH                  2, 13, rsp+9*mmsize
+%endif
+    PALIGNR                 m3, m4,  m5,  4, m2
+    mova                    m5, m3
+    PALIGNR                 m3, m7,  m4,  4, m2
+    mova                    m4, m3
+    PALIGNR                 m3, m6,  m7,  4, m2
+    mova                    m7, m3
+    PALIGNR                 m3, m0,  m6,  4, m2
+    mova                    m6, m3
+    PALIGNR                 m3, m1,  m0,  4, m2
+    mova                    m0, m3
+%if notcpuflag(ssse3)
+    UNSCRATCH                2, 13, rsp+9*mmsize
+    SCRATCH                  0, 13, rsp+9*mmsize
+%endif
+    PALIGNR                 m3, m2,  m1,  4, m0
+    mova                    m1, m3
+    PALIGNR                 m3, reg_sh,  m2,  4, m0
+    mova                    m2, m3
+%if notcpuflag(ssse3)
+    UNSCRATCH                0, 13, rsp+9*mmsize
+%endif
+    UNSCRATCH                3, 12, rsp+8*mmsize, sh
+%endif
+    psrldq                  m3, 4
+    dec                   cntd
+    jg .loop
+
+%if ARCH_X86_32
+    UNSCRATCH                0,  8, rsp+0*mmsize
+    UNSCRATCH                1,  9, rsp+1*mmsize
+    UNSCRATCH                2, 10, rsp+2*mmsize
+    UNSCRATCH                3, 11, rsp+3*mmsize
+    mova                    m4, [rsp+4*mmsize]
+    mova                    m5, [rsp+5*mmsize]
+    mova                    m6, [rsp+6*mmsize]
+    mova                    m7, [rsp+7*mmsize]
+    DEFINE_ARGS dst, stride, stride5, stride3
+    lea               stride5q, [strideq*5]
+    lea                   dstq, [dstq+stride5q*4]
+    DEFINE_ARGS dst, stride, cnt, stride3
+    mov                   cntd, 4
+.loop_2:
+    mova  [dstq+stride3q*4+ 0], m1
+    mova  [dstq+stride3q*4+16], m0
+    mova  [dstq+stride3q*4+32], m3
+    mova  [dstq+stride3q*4+48], m2
+    mova  [dstq+strideq* 8+ 0], m0
+    mova  [dstq+strideq* 8+16], m3
+    mova  [dstq+strideq* 8+32], m2
+    mova  [dstq+strideq* 8+48], m5
+    mova  [dstq+strideq* 4+ 0], m3
+    mova  [dstq+strideq* 4+16], m2
+    mova  [dstq+strideq* 4+32], m5
+    mova  [dstq+strideq* 4+48], m4
+    mova  [dstq+strideq* 0+ 0], m2
+    mova  [dstq+strideq* 0+16], m5
+    mova  [dstq+strideq* 0+32], m4
+    mova  [dstq+strideq* 0+48], m7
+    sub                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m1, m0,  m1,  4
+    vpalignr                m0, m3,  m0,  4
+    vpalignr                m3, m2,  m3,  4
+    vpalignr                m2, m5,  m2,  4
+    vpalignr                m5, m4,  m5,  4
+    vpalignr                m4, m7,  m4,  4
+    vpalignr                m7, m6,  m7,  4
+%else
+    SCRATCH                  6, 12, rsp+8*mmsize, sh
+%if notcpuflag(ssse3)
+    SCRATCH                  7, 13, rsp+9*mmsize
+%endif
+    PALIGNR                 m6, m0,  m1,  4, m7
+    mova                    m1, m6
+    PALIGNR                 m6, m3,  m0,  4, m7
+    mova                    m0, m6
+    PALIGNR                 m6, m2,  m3,  4, m7
+    mova                    m3, m6
+    PALIGNR                 m6, m5,  m2,  4, m7
+    mova                    m2, m6
+    PALIGNR                 m6, m4,  m5,  4, m7
+    mova                    m5, m6
+%if notcpuflag(ssse3)
+    UNSCRATCH                7, 13, rsp+9*mmsize
+    SCRATCH                  5, 13, rsp+9*mmsize
+%endif
+    PALIGNR                 m6, m7,  m4,  4, m5
+    mova                    m4, m6
+    PALIGNR                 m6, reg_sh,  m7,  4, m5
+    mova                    m7, m6
+%if notcpuflag(ssse3)
+    UNSCRATCH                5, 13, rsp+9*mmsize
+%endif
+    UNSCRATCH                6, 12, rsp+8*mmsize, sh
+%endif
+    psrldq                  m6, 4
+    dec                   cntd
+    jg .loop_2
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+HD_FUNCS
+INIT_XMM ssse3
+HD_FUNCS
+INIT_XMM avx
+HD_FUNCS
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
new file mode 100644
index 0000000000..4d6a73cde9
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -0,0 +1,2725 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_11585x2:  times 8 dw 23170
+pw_m11585x2: times 8 dw -23170
+pw_m11585_11585: times 4 dw -11585, 11585
+pw_11585_11585: times 8 dw 11585
+pw_m11585_m11585: times 8 dw -11585
+
+%macro VP9_IDCT_COEFFS 2-3 0
+pw_%1x2:    times 8 dw  %1*2
+pw_m%1x2:   times 8 dw -%1*2
+pw_%2x2:    times 8 dw  %2*2
+pw_m%2x2:   times 8 dw -%2*2
+pw_m%1_%2:  times 4 dw -%1,  %2
+pw_%2_%1:   times 4 dw  %2,  %1
+pw_m%2_m%1: times 4 dw -%2, -%1
+%if %3 == 1
+pw_m%2_%1:  times 4 dw -%2,  %1
+pw_%1_%2:   times 4 dw  %1,  %2
+%endif
+%endmacro
+
+VP9_IDCT_COEFFS 15137,  6270, 1
+VP9_IDCT_COEFFS 16069,  3196, 1
+VP9_IDCT_COEFFS  9102, 13623, 1
+VP9_IDCT_COEFFS 16305,  1606
+VP9_IDCT_COEFFS 10394, 12665
+VP9_IDCT_COEFFS 14449,  7723
+VP9_IDCT_COEFFS  4756, 15679
+VP9_IDCT_COEFFS 16364,   804
+VP9_IDCT_COEFFS 11003, 12140
+VP9_IDCT_COEFFS 14811,  7005
+VP9_IDCT_COEFFS  5520, 15426
+VP9_IDCT_COEFFS 15893,  3981
+VP9_IDCT_COEFFS  8423, 14053
+VP9_IDCT_COEFFS 13160,  9760
+VP9_IDCT_COEFFS  2404, 16207
+
+pw_5283_13377: times 4 dw 5283, 13377
+pw_9929_13377: times 4 dw 9929, 13377
+pw_15212_m13377: times 4 dw 15212, -13377
+pw_15212_9929: times 4 dw 15212, 9929
+pw_m5283_m15212: times 4 dw -5283, -15212
+pw_13377x2: times 8 dw 13377*2
+pw_m13377_13377: times 4 dw -13377, 13377
+pw_13377_0: times 4 dw 13377, 0
+
+pd_8192: times 4 dd 8192
+
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_m1
+
+SECTION .text
+
+; (a*x + b*y + round) >> shift
+%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
+    pmaddwd            m%1, m%2, %4
+    pmaddwd            m%2,  %5
+    paddd              m%1,  %3
+    paddd              m%2,  %3
+    psrad              m%1,  14
+    psrad              m%2,  14
+%endmacro
+
+%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
+    VP9_MULSUB_2W_2X    %7,  %6,  %5, [pw_m%3_%4], [pw_%4_%3]
+    VP9_MULSUB_2W_2X    %1,  %2,  %5, [pw_m%3_%4], [pw_%4_%3]
+    packssdw           m%1, m%7
+    packssdw           m%2, m%6
+%endmacro
+
+%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
+%if %0 == 7
+    punpckhwd          m%6, m%2, m%1
+    punpcklwd          m%2, m%1
+    VP9_MULSUB_2W_4X   %1, %2, %3, %4, %5, %6, %7
+%else
+    punpckhwd          m%8, m%4, m%3
+    punpcklwd          m%2, m%4, m%3
+    VP9_MULSUB_2W_4X   %1, %2, %5, %6, %7, %8, %9
+%endif
+%endmacro
+
+%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
+    punpckhwd          m%4, m%2, m%1
+    punpcklwd          m%2, m%1
+    pmaddwd            m%3, m%4, [pw_m%5_%6]
+    pmaddwd            m%4, [pw_%6_%5]
+    pmaddwd            m%1, m%2, [pw_m%5_%6]
+    pmaddwd            m%2, [pw_%6_%5]
+%endmacro
+
+%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
+    SUMSUB_BA            d, %1, %2, %5
+    SUMSUB_BA            d, %3, %4, %5
+    paddd              m%1, %6
+    paddd              m%2, %6
+    paddd              m%3, %6
+    paddd              m%4, %6
+    psrad              m%1, 14
+    psrad              m%2, 14
+    psrad              m%3, 14
+    psrad              m%4, 14
+    packssdw           m%1, m%3
+    packssdw           m%2, m%4
+%endmacro
+
+%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
+    movh               m%3, [%6]
+    movh               m%4, [%6+strideq]
+    punpcklbw          m%3, m%5
+    punpcklbw          m%4, m%5
+    paddw              m%3, m%1
+    paddw              m%4, m%2
+    packuswb           m%3, m%5
+    packuswb           m%4, m%5
+    movh              [%6], m%3
+    movh      [%6+strideq], m%4
+%endmacro
+
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
+%assign %%y 0
+%rep %3
+%assign %%x 0
+%rep %3*2/mmsize
+    mova      [%1+%%y+%%x], %4
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IWHT4_1D 0
+    SWAP                 1, 2, 3
+    paddw               m0, m2
+    psubw               m3, m1
+    psubw               m4, m0, m3
+    psraw               m4, 1
+    psubw               m5, m4, m1
+    SWAP                 5, 1
+    psubw               m4, m2
+    SWAP                 4, 2
+    psubw               m0, m1
+    paddw               m3, m2
+    SWAP                 3, 2, 1
+%endmacro
+
+INIT_MMX mmx
+cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
+    mova                m0, [blockq+0*8]
+    mova                m1, [blockq+1*8]
+    mova                m2, [blockq+2*8]
+    mova                m3, [blockq+3*8]
+    psraw               m0, 2
+    psraw               m1, 2
+    psraw               m2, 2
+    psraw               m3, 2
+
+    VP9_IWHT4_1D
+    TRANSPOSE4x4W        0, 1, 2, 3, 4
+    VP9_IWHT4_1D
+
+    pxor                m4, m4
+    VP9_STORE_2X         0, 1, 5, 6, 4
+    lea               dstq, [dstq+strideq*2]
+    VP9_STORE_2X         2, 3, 5, 6, 4
+    ZERO_BLOCK      blockq, 8, 4, m4
+    RET
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT4_1D_FINALIZE 0
+    SUMSUB_BA            w, 3, 2, 4                         ; m3=t3+t0, m2=-t3+t0
+    SUMSUB_BA            w, 1, 0, 4                         ; m1=t2+t1, m0=-t2+t1
+    SWAP                 0, 3, 2                            ; 3102 -> 0123
+%endmacro
+
+%macro VP9_IDCT4_1D 0
+%if cpuflag(ssse3)
+    SUMSUB_BA            w, 2, 0, 4                         ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
+    pmulhrsw            m2, m6                              ; m2=t0
+    pmulhrsw            m0, m6                              ; m0=t1
+%else ; <= sse2
+    VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5    ; m0=t1, m1=t0
+%endif
+    VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5     ; m1=t2, m3=t3
+    VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+; 2x2 top left corner
+%macro VP9_IDCT4_2x2_1D 0
+    pmulhrsw            m0, m5                              ; m0=t1
+    mova                m2, m0                              ; m2=t0
+    mova                m3, m1
+    pmulhrsw            m1, m6                              ; m1=t2
+    pmulhrsw            m3, m7                              ; m3=t3
+    VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT4_WRITEOUT 0
+%if cpuflag(ssse3)
+    mova                m5, [pw_2048]
+    pmulhrsw            m0, m5              ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+    pmulhrsw            m1, m5
+%else
+    mova                m5, [pw_8]
+    paddw               m0, m5
+    paddw               m1, m5
+    psraw               m0, 4
+    psraw               m1, 4
+%endif
+    VP9_STORE_2X         0,  1,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+%if cpuflag(ssse3)
+    pmulhrsw            m2, m5
+    pmulhrsw            m3, m5
+%else
+    paddw               m2, m5
+    paddw               m3, m5
+    psraw               m2, 4
+    psraw               m3, 4
+%endif
+    VP9_STORE_2X         2,  3,  6,  7,  4
+%endmacro
+
+%macro IDCT_4x4_FN 1
+INIT_MMX %1
+cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob
+
+%if cpuflag(ssse3)
+    cmp eobd, 4 ; 2x2 or smaller
+    jg .idctfull
+
+    cmp eobd, 1 ; faster path for when only DC is set
+    jne .idct2x2
+%else
+    cmp eobd, 1
+    jg .idctfull
+%endif
+
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    mova                m5, [pw_11585x2]
+    pmulhrsw            m0, m5
+    pmulhrsw            m0, m5
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx            coefd, word [blockq]
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, (8 << 14) + 8192
+    sar              coefd, 14 + 4
+    movd                m0, coefd
+%endif
+    pshufw              m0, m0, 0
+    pxor                m4, m4
+    movh          [blockq], m4
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    RET
+
+%if cpuflag(ssse3)
+; faster path for when only top left 2x2 block is set
+.idct2x2:
+    movd                m0, [blockq+0]
+    movd                m1, [blockq+8]
+    mova                m5, [pw_11585x2]
+    mova                m6, [pw_6270x2]
+    mova                m7, [pw_15137x2]
+    VP9_IDCT4_2x2_1D
+    ; partial 2x4 transpose
+    punpcklwd           m0, m1
+    punpcklwd           m2, m3
+    SBUTTERFLY          dq, 0, 2, 1
+    SWAP                1, 2
+    VP9_IDCT4_2x2_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    movh       [blockq+ 0], m4
+    movh       [blockq+ 8], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endif
+
+.idctfull: ; generic full 4x4 idct/idct
+    mova                m0, [blockq+ 0]
+    mova                m1, [blockq+ 8]
+    mova                m2, [blockq+16]
+    mova                m3, [blockq+24]
+%if cpuflag(ssse3)
+    mova                m6, [pw_11585x2]
+%endif
+    mova                m7, [pd_8192]       ; rounding
+    VP9_IDCT4_1D
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_IDCT4_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    mova       [blockq+ 0], m4
+    mova       [blockq+ 8], m4
+    mova       [blockq+16], m4
+    mova       [blockq+24], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+IDCT_4x4_FN mmxext
+IDCT_4x4_FN ssse3
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IADST4_1D 0
+    movq2dq           xmm0, m0
+    movq2dq           xmm1, m1
+    movq2dq           xmm2, m2
+    movq2dq           xmm3, m3
+%if cpuflag(ssse3)
+    paddw               m3, m0
+%endif
+    punpcklwd         xmm0, xmm1
+    punpcklwd         xmm2, xmm3
+    pmaddwd           xmm1, xmm0, [pw_5283_13377]
+    pmaddwd           xmm4, xmm0, [pw_9929_13377]
+%if notcpuflag(ssse3)
+    pmaddwd           xmm6, xmm0, [pw_13377_0]
+%endif
+    pmaddwd           xmm0, [pw_15212_m13377]
+    pmaddwd           xmm3, xmm2, [pw_15212_9929]
+%if notcpuflag(ssse3)
+    pmaddwd           xmm7, xmm2, [pw_m13377_13377]
+%endif
+    pmaddwd           xmm2, [pw_m5283_m15212]
+%if cpuflag(ssse3)
+    psubw               m3, m2
+%else
+    paddd             xmm6, xmm7
+%endif
+    paddd             xmm0, xmm2
+    paddd             xmm3, xmm5
+    paddd             xmm2, xmm5
+%if notcpuflag(ssse3)
+    paddd             xmm6, xmm5
+%endif
+    paddd             xmm1, xmm3
+    paddd             xmm0, xmm3
+    paddd             xmm4, xmm2
+    psrad             xmm1, 14
+    psrad             xmm0, 14
+    psrad             xmm4, 14
+%if cpuflag(ssse3)
+    pmulhrsw            m3, [pw_13377x2]        ; out2
+%else
+    psrad             xmm6, 14
+%endif
+    packssdw          xmm0, xmm0
+    packssdw          xmm1, xmm1
+    packssdw          xmm4, xmm4
+%if notcpuflag(ssse3)
+    packssdw          xmm6, xmm6
+%endif
+    movdq2q             m0, xmm0                ; out3
+    movdq2q             m1, xmm1                ; out0
+    movdq2q             m2, xmm4                ; out1
+%if notcpuflag(ssse3)
+    movdq2q             m3, xmm6                ; out2
+%endif
+    SWAP                 0, 1, 2, 3
+%endmacro
+
+%macro IADST4_FN 5
+INIT_MMX %5
+cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+    WIN64_SPILL_XMM 8
+%endif
+    movdqa            xmm5, [pd_8192]
+    mova                m0, [blockq+ 0]
+    mova                m1, [blockq+ 8]
+    mova                m2, [blockq+16]
+    mova                m3, [blockq+24]
+%if cpuflag(ssse3)
+    mova                m6, [pw_11585x2]
+%endif
+%ifnidn %1%3, iadstiadst
+    movdq2q             m7, xmm5
+%endif
+    VP9_%2_1D
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_%4_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    mova       [blockq+ 0], m4
+    mova       [blockq+ 8], m4
+    mova       [blockq+16], m4
+    mova       [blockq+24], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+IADST4_FN idct,  IDCT4,  iadst, IADST4, sse2
+IADST4_FN iadst, IADST4, idct,  IDCT4,  sse2
+IADST4_FN iadst, IADST4, iadst, IADST4, sse2
+
+IADST4_FN idct,  IDCT4,  iadst, IADST4, ssse3
+IADST4_FN iadst, IADST4, idct,  IDCT4,  ssse3
+IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
+
+%macro SCRATCH 3
+%if ARCH_X86_64
+    SWAP                %1, %2
+%else
+    mova              [%3], m%1
+%endif
+%endmacro
+
+%macro UNSCRATCH 3
+%if ARCH_X86_64
+    SWAP                %1, %2
+%else
+    mova               m%1, [%3]
+%endif
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT8_1D_FINALIZE 0
+    SUMSUB_BA            w,  3,  6, 5                       ; m3=t0+t7, m6=t0-t7
+    SUMSUB_BA            w,  1,  2, 5                       ; m1=t1+t6, m2=t1-t6
+    SUMSUB_BA            w,  7,  0, 5                       ; m7=t2+t5, m0=t2-t5
+
+    UNSCRATCH            5, 8, blockq+ 0
+    SCRATCH              2, 8, blockq+ 0
+
+    SUMSUB_BA            w,  5,  4, 2                       ; m5=t3+t4, m4=t3-t4
+    SWAP                 7,  6,  2
+    SWAP                 3,  5,  0
+
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+%endmacro
+
+; x86-32
+; - in: m0/m4 is in mem
+; - out: m6 is in mem
+; x86-64:
+; - everything is in registers (m0-7)
+%macro VP9_IDCT8_1D 0
+%if ARCH_X86_64
+    SWAP                 0, 8
+    SWAP                 4, 9
+%endif
+
+    VP9_UNPACK_MULSUB_2W_4X 5,  3,  9102, 13623, D_8192_REG, 0, 4  ; m5=t5a, m3=t6a
+    VP9_UNPACK_MULSUB_2W_4X 1,  7, 16069,  3196, D_8192_REG, 0, 4  ; m1=t4a, m7=t7a
+    SUMSUB_BA            w,  5,  1, 0                       ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
+    SUMSUB_BA            w,  3,  7, 0                       ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
+%if cpuflag(ssse3)
+    SUMSUB_BA            w,  1,  7, 0                       ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
+    pmulhrsw            m1, W_11585x2_REG                   ; m1=t6
+    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
+%else
+    VP9_UNPACK_MULSUB_2W_4X 7,  1, 11585, 11585, D_8192_REG, 0, 4
+%endif
+    VP9_UNPACK_MULSUB_2W_4X 2,  6, 15137,  6270, D_8192_REG, 0, 4  ; m2=t2a, m6=t3a
+
+    UNSCRATCH            0, 8, blockq+ 0    ; IN(0)
+    UNSCRATCH            4, 9, blockq+64    ; IN(4)
+    SCRATCH              5, 8, blockq+ 0
+
+%if cpuflag(ssse3)
+    SUMSUB_BA            w, 4, 0, 5                         ; m4=IN(0)+IN(4) m0=IN(0)-IN(4)
+    pmulhrsw            m4, W_11585x2_REG                   ; m4=t0a
+    pmulhrsw            m0, W_11585x2_REG                   ; m0=t1a
+%else
+    SCRATCH              7, 9, blockq+64
+    VP9_UNPACK_MULSUB_2W_4X 0,  4, 11585, 11585, D_8192_REG, 5, 7
+    UNSCRATCH            7, 9, blockq+64
+%endif
+    SUMSUB_BA            w,  6,  4, 5                       ; m6=t0a+t3a (t0), m4=t0a-t3a (t3)
+    SUMSUB_BA            w,  2,  0, 5                       ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+
+    VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_4x4_1D 0
+    pmulhrsw            m0, W_11585x2_REG                   ; m0=t1a/t0a
+    pmulhrsw            m6, m2, [pw_15137x2]                ; m6=t3a
+    pmulhrsw            m2, [pw_6270x2]                     ; m2=t2a
+    pmulhrsw            m7, m1, [pw_16069x2]                ; m7=t7a
+    pmulhrsw            m1, [pw_3196x2]                     ; m1=t4a
+    pmulhrsw            m5, m3, [pw_m9102x2]                ; m5=t5a
+    pmulhrsw            m3, [pw_13623x2]                    ; m3=t6a
+    SUMSUB_BA            w,  5,  1, 4                       ; m1=t4a+t5a (t4), m5=t4a-t5a (t5a)
+    SUMSUB_BA            w,  3,  7, 4                       ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
+    SUMSUB_BA            w,  1,  7, 4                       ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
+    pmulhrsw            m1, W_11585x2_REG                   ; m1=t6
+    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
+    psubw               m4, m0, m6                          ; m4=t0a-t3a (t3)
+    paddw               m6, m0                              ; m6=t0a+t3a (t0)
+    SCRATCH              5,  8, blockq+ 0
+    SUMSUB_BA            w,  2,  0, 5                       ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+    VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_2x2_1D 1
+    pmulhrsw            m0, W_11585x2_REG                   ; m0=t0
+    pmulhrsw            m3, m1, W_16069x2_REG               ; m3=t7
+    pmulhrsw            m1, W_3196x2_REG                    ; m1=t4
+    psubw               m7, m3, m1                          ; t5 = t7a - t4a
+    paddw               m5, m3, m1                          ; t6 = t7a + t4a
+    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
+    pmulhrsw            m5, W_11585x2_REG                   ; m5=t6
+    SWAP                 5,  1
+    ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
+    psubw               m6, m0, m3                          ; m6=t0-t7
+    paddw               m3, m0                              ; m3=t0+t7
+    psubw               m2, m0, m1                          ; m2=t1-t6
+    paddw               m1, m0                              ; m1=t1+t6
+%if %1 == 1
+    punpcklwd           m3, m1
+%define SCRATCH_REG 1
+%elif ARCH_X86_32
+    mova       [blockq+ 0], m2
+%define SCRATCH_REG 2
+%else
+%define SCRATCH_REG 8
+%endif
+    psubw               m4, m0, m5                          ; m4=t3-t4
+    paddw               m5, m0                              ; m5=t3+t4
+    SUMSUB_BA            w,  7,  0, SCRATCH_REG             ; m7=t2+t5, m0=t2-t5
+    SWAP                 7,  6,  2
+    SWAP                 3,  5,  0
+%undef SCRATCH_REG
+%endmacro
+
+%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift
+%if cpuflag(ssse3)
+    pmulhrsw           m%1, %6              ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+    pmulhrsw           m%2, %6
+%else
+    paddw              m%1, %6
+    paddw              m%2, %6
+    psraw              m%1, %7
+    psraw              m%2, %7
+%endif
+%if %0 <= 7
+    VP9_STORE_2X        %1, %2, %3, %4, %5
+%else
+    VP9_STORE_2X        %1, %2, %3, %4, %5, %8
+%endif
+%endmacro
+
+; x86-32:
+; - m6 is in mem
+; x86-64:
+; - m8 holds m6 (SWAP)
+; m6 holds zero
+%macro VP9_IDCT8_WRITEOUT 0
+%if ARCH_X86_64
+%if cpuflag(ssse3)
+    mova                m9, [pw_1024]
+%else
+    mova                m9, [pw_16]
+%endif
+%define ROUND_REG m9
+%else
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_1024]
+%else
+%define ROUND_REG [pw_16]
+%endif
+%endif
+    SCRATCH              5, 10, blockq+16
+    SCRATCH              7, 11, blockq+32
+    VP9_IDCT8_WRITEx2    0,  1, 5, 7, 6, ROUND_REG
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2    2,  3, 5, 7, 6, ROUND_REG
+    lea               dstq, [dstq+2*strideq]
+    UNSCRATCH            5, 10, blockq+16
+    UNSCRATCH            7, 11, blockq+32
+    VP9_IDCT8_WRITEx2    4,  5, 0, 1, 6, ROUND_REG
+    lea               dstq, [dstq+2*strideq]
+    UNSCRATCH            5, 8, blockq+ 0
+    VP9_IDCT8_WRITEx2    5,  7, 0, 1, 6, ROUND_REG
+
+%undef ROUND_REG
+%endmacro
+
+%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2
+INIT_XMM %1
+cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob
+
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+    mova               m12, [pw_11585x2]    ; often used
+%define W_11585x2_REG m12
+%else
+%define W_11585x2_REG [pw_11585x2]
+%endif
+
+    cmp eobd, 12 ; top left half or less
+    jg .idctfull
+
+    cmp eobd, 3  ; top left corner or less
+    jg .idcthalf
+
+    cmp eobd, 1 ; faster path for when only DC is set
+    jne .idcttopleftcorner
+%else
+    cmp eobd, 1
+    jg .idctfull
+%endif
+
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    pmulhrsw            m0, W_11585x2_REG
+    pmulhrsw            m0, W_11585x2_REG
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx            coefd, word [blockq]
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, (16 << 14) + 8192
+    sar              coefd, 14 + 5
+    movd                m0, coefd
+%endif
+    SPLATW              m0, m0, 0
+    pxor                m4, m4
+    movd          [blockq], m4
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_1024]       ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+%endif
+%rep 3
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+%endrep
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    RET
+
+%if cpuflag(ssse3)
+; faster path for when only left corner is set (3 input: DC, right to DC, below
+; to DC). Note: also working with a 2x2 block
+.idcttopleftcorner:
+    movd                m0, [blockq+0]
+    movd                m1, [blockq+16]
+%if ARCH_X86_64
+    mova               m10, [pw_3196x2]
+    mova               m11, [pw_16069x2]
+%define W_3196x2_REG m10
+%define W_16069x2_REG m11
+%else
+%define W_3196x2_REG [pw_3196x2]
+%define W_16069x2_REG [pw_16069x2]
+%endif
+    VP9_IDCT8_2x2_1D 1
+    ; partial 2x8 transpose
+    ; punpcklwd m0, m1 already done inside idct
+    punpcklwd           m2, m3
+    punpcklwd           m4, m5
+    punpcklwd           m6, m7
+    punpckldq           m0, m2
+    punpckldq           m4, m6
+    SBUTTERFLY         qdq, 0, 4, 1
+    SWAP                 1, 4
+    VP9_IDCT8_2x2_1D 2
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
+    VP9_IDCT8_WRITEOUT
+%if ARCH_X86_64
+    movd       [blockq+ 0], m6
+    movd       [blockq+16], m6
+%else
+    mova       [blockq+ 0], m6
+    mova       [blockq+16], m6
+    mova       [blockq+32], m6
+%endif
+    RET
+
+.idcthalf:
+    movh                m0, [blockq + 0]
+    movh                m1, [blockq +16]
+    movh                m2, [blockq +32]
+    movh                m3, [blockq +48]
+    VP9_IDCT8_4x4_1D
+    ; partial 4x8 transpose
+%if ARCH_X86_32
+    mova                m6, [blockq+ 0]
+%endif
+    punpcklwd           m0, m1
+    punpcklwd           m2, m3
+    punpcklwd           m4, m5
+    punpcklwd           m6, m7
+    SBUTTERFLY          dq, 0, 2, 1
+    SBUTTERFLY          dq, 4, 6, 5
+    SBUTTERFLY         qdq, 0, 4, 1
+    SBUTTERFLY         qdq, 2, 6, 5
+    SWAP                 1, 4
+    SWAP                 3, 6
+    VP9_IDCT8_4x4_1D
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+    pxor                m6, m6
+    VP9_IDCT8_WRITEOUT
+%if ARCH_X86_64
+    movh       [blockq+ 0], m6
+    movh       [blockq+16], m6
+    movh       [blockq+32], m6
+%else
+    mova       [blockq+ 0], m6
+    mova       [blockq+16], m6
+    mova       [blockq+32], m6
+%endif
+    movh       [blockq+48], m6
+    RET
+%endif
+
+.idctfull: ; generic full 8x8 idct/idct
+%if ARCH_X86_64
+    mova                m0, [blockq+  0]    ; IN(0)
+%endif
+    mova                m1, [blockq+ 16]    ; IN(1)
+    mova                m2, [blockq+ 32]    ; IN(2)
+    mova                m3, [blockq+ 48]    ; IN(3)
+%if ARCH_X86_64
+    mova                m4, [blockq+ 64]    ; IN(4)
+%endif
+    mova                m5, [blockq+ 80]    ; IN(5)
+    mova                m6, [blockq+ 96]    ; IN(6)
+    mova                m7, [blockq+112]    ; IN(7)
+%if ARCH_X86_64
+    mova               m11, [pd_8192]       ; rounding
+%define D_8192_REG m11
+%else
+%define D_8192_REG [pd_8192]
+%endif
+    VP9_IDCT8_1D
+%if ARCH_X86_64
+    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
+    mova        [blockq+0], m0
+%endif
+    VP9_IDCT8_1D
+
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
+    VP9_IDCT8_WRITEOUT
+    ZERO_BLOCK      blockq, 16, 8, m6
+    RET
+%undef W_11585x2_REG
+%endmacro
+
+VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
+VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
+VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; x86-32:
+; - in: m0/3/4/7 are in mem [blockq+N*16]
+; - out: m6 is in mem [blockq+0]
+; x86-64:
+; - everything is in registers
+%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7
+%if ARCH_X86_64
+    SWAP                     0, 8
+    SWAP                     3, 9
+    SWAP                     4, 10
+    SWAP                     7, 11
+%endif
+
+    VP9_UNPACK_MULSUB_2D_4X  5,  2,  0,  3, 14449,  7723    ; m5/2=t3[d], m2/4=t2[d]
+    VP9_UNPACK_MULSUB_2D_4X  1,  6,  4,  7,  4756, 15679    ; m1/4=t7[d], m6/7=t6[d]
+    SCRATCH                  4, 12, blockq+1*16
+    VP9_RND_SH_SUMSUB_BA     6,  2,  7,  3, 4, D_8192_REG  ; m6=t2[w], m2=t6[w]
+    UNSCRATCH                4, 12, blockq+1*16
+    VP9_RND_SH_SUMSUB_BA     1,  5,  4,  0, 3, D_8192_REG  ; m1=t3[w], m5=t7[w]
+
+    UNSCRATCH                0,  8, blockq+16*0
+    UNSCRATCH                3,  9, blockq+16*3
+    UNSCRATCH                4, 10, blockq+16*4
+    UNSCRATCH                7, 11, blockq+16*7
+    SCRATCH                  1,  8, blockq+16*1
+    SCRATCH                  2,  9, blockq+16*2
+    SCRATCH                  5, 10, blockq+16*5
+    SCRATCH                  6, 11, blockq+16*6
+
+    VP9_UNPACK_MULSUB_2D_4X  7,  0,  1,  2, 16305,  1606    ; m7/1=t1[d], m0/2=t0[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  4,  5,  6, 10394, 12665    ; m3/5=t5[d], m4/6=t4[d]
+    SCRATCH                  1, 12, blockq+ 0*16
+    VP9_RND_SH_SUMSUB_BA     4,  0,  6,  2, 1, D_8192_REG  ; m4=t0[w], m0=t4[w]
+    UNSCRATCH                1, 12, blockq+ 0*16
+    VP9_RND_SH_SUMSUB_BA     3,  7,  5,  1, 2, D_8192_REG  ; m3=t1[w], m7=t5[w]
+
+    UNSCRATCH                2,  9, blockq+16*2
+    UNSCRATCH                5, 10, blockq+16*5
+    SCRATCH                  3,  9, blockq+16*3
+    SCRATCH                  4, 10, blockq+16*4
+
+    ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7
+
+    VP9_UNPACK_MULSUB_2D_4X  0,  7,  1,  3, 15137,  6270    ; m0/1=t5[d], m7/3=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X  5,  2,  4,  6,  6270, 15137    ; m5/4=t6[d], m2/6=t7[d]
+    SCRATCH                  1, 12, blockq+ 0*16
+    VP9_RND_SH_SUMSUB_BA     5,  7,  4,  3, 1, D_8192_REG
+    UNSCRATCH                1, 12, blockq+ 0*16
+    PSIGNW                  m5, W_M1_REG                    ; m5=out1[w], m7=t6[w]
+    VP9_RND_SH_SUMSUB_BA     2,  0,  6,  1, 3, D_8192_REG   ; m2=out6[w], m0=t7[w]
+
+    UNSCRATCH                1,  8, blockq+16*1
+    UNSCRATCH                3,  9, blockq+16*3
+    UNSCRATCH                4, 10, blockq+16*4
+    UNSCRATCH                6, 11, blockq+16*6
+    SCRATCH                  2,  8, blockq+16*0
+
+    SUMSUB_BA                w,  6,  4, 2                   ; m6=out0[w], m4=t2[w]
+    SUMSUB_BA                w,  1,  3, 2
+    PSIGNW                  m1, W_M1_REG                    ; m1=out7[w], m3=t3[w]
+
+    ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7
+
+    ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
+    SUMSUB_BA                w,  3,  4,  2
+    SUMSUB_BA                w,  0,  7,  2
+    pmulhrsw                m3, W_11585x2_REG
+    pmulhrsw                m7, W_11585x2_REG
+    pmulhrsw                m4, W_11585x2_REG               ; out4
+    pmulhrsw                m0, W_11585x2_REG               ; out2
+%else
+    SCRATCH                  5,  9, blockq+16*1
+    VP9_UNPACK_MULSUB_2W_4X  4, 3, 11585, 11585, D_8192_REG, 2, 5
+    VP9_UNPACK_MULSUB_2W_4X  7, 0, 11585, 11585, D_8192_REG, 2, 5
+    UNSCRATCH                5,  9, blockq+16*1
+%endif
+    PSIGNW                  m3, W_M1_REG                    ; out3
+    PSIGNW                  m7, W_M1_REG                    ; out5
+
+    ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7
+
+%if ARCH_X86_64
+    SWAP                     2, 8
+%endif
+    SWAP                     0, 6, 2
+    SWAP                     7, 1, 5
+%endmacro
+
+%macro IADST8_FN 6
+INIT_XMM %5
+cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob
+
+%ifidn %1, idct
+%define first_is_idct 1
+%else
+%define first_is_idct 0
+%endif
+
+%ifidn %3, idct
+%define second_is_idct 1
+%else
+%define second_is_idct 0
+%endif
+
+%if ARCH_X86_64
+    mova                m0, [blockq+  0]    ; IN(0)
+%endif
+    mova                m1, [blockq+ 16]    ; IN(1)
+    mova                m2, [blockq+ 32]    ; IN(2)
+%if ARCH_X86_64 || first_is_idct
+    mova                m3, [blockq+ 48]    ; IN(3)
+%endif
+%if ARCH_X86_64
+    mova                m4, [blockq+ 64]    ; IN(4)
+%endif
+    mova                m5, [blockq+ 80]    ; IN(5)
+    mova                m6, [blockq+ 96]    ; IN(6)
+%if ARCH_X86_64 || first_is_idct
+    mova                m7, [blockq+112]    ; IN(7)
+%endif
+%if ARCH_X86_64
+%if cpuflag(ssse3)
+    mova               m15, [pw_11585x2]    ; often used
+%endif
+    mova               m13, [pd_8192]       ; rounding
+    mova               m14, [pw_m1]
+%define W_11585x2_REG m15
+%define D_8192_REG m13
+%define W_M1_REG m14
+%else
+%define W_11585x2_REG [pw_11585x2]
+%define D_8192_REG [pd_8192]
+%define W_M1_REG [pw_m1]
+%endif
+
+    ; note different calling conventions for idct8 vs. iadst8 on x86-32
+    VP9_%2_1D
+%if ARCH_X86_64
+    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
+    mova      [blockq+  0], m0
+%if second_is_idct == 0
+    mova      [blockq+ 48], m3
+    mova      [blockq+112], m7
+%endif
+%endif
+    VP9_%4_1D
+
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
+    VP9_IDCT8_WRITEOUT
+    ZERO_BLOCK      blockq, 16, 8, m6
+    RET
+
+%undef W_11585x2_REG
+%undef first_is_idct
+%undef second_is_idct
+
+%endmacro
+
+IADST8_FN idct,  IDCT8,  iadst, IADST8, sse2, 15
+IADST8_FN iadst, IADST8, idct,  IDCT8,  sse2, 15
+IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
+IADST8_FN idct,  IDCT8,  iadst, IADST8, ssse3, 16
+IADST8_FN idct,  IDCT8,  iadst, IADST8, avx, 16
+IADST8_FN iadst, IADST8, idct,  IDCT8,  ssse3, 16
+IADST8_FN iadst, IADST8, idct,  IDCT8,  avx, 16
+IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
+IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; x86-64:
+; at the end of this macro, m7 is stored in [%4+15*%5]
+; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
+; the following sumsubs have not been done yet:
+;    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
+;    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
+; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1,
+; and the following simsubs have not been done yet:
+;    SUMSUB_BA            w, x13, x14, 7       ; t6, t9
+;    SUMSUB_BA            w, x15, x12, 7       ; t7, t8
+
+%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
+%if %2 <= 4
+    mova                m3, [%1+ 1*%3]      ; IN(1)
+    mova                m0, [%1+ 3*%3]      ; IN(3)
+
+    pmulhrsw            m4, m3,  [pw_16305x2]       ; t14-15
+    pmulhrsw            m3, [pw_1606x2]             ; t8-9
+    pmulhrsw            m7, m0,  [pw_m4756x2]       ; t10-11
+    pmulhrsw            m0, [pw_15679x2]            ; t12-13
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+    VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137,  6270, [pd_8192], 1, 6 ; t9,  t14
+    SCRATCH              4, 10, %4+ 1*%5
+    SCRATCH              5, 11, %4+ 7*%5
+    VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
+    UNSCRATCH            5, 11, %4+ 7*%5
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+%else
+    mova                m5, [%1+ 1*%3]      ; IN(1)
+    mova                m4, [%1+ 7*%3]      ; IN(7)
+%if %2 <= 8
+    pmulhrsw            m2, m5,  [pw_16305x2]       ; t15
+    pmulhrsw            m5, [pw_1606x2]             ; t8
+    pmulhrsw            m3, m4,  [pw_m10394x2]      ; t9
+    pmulhrsw            m4, [pw_12665x2]            ; t14
+%else
+    mova                m3, [%1+ 9*%3]      ; IN(9)
+    mova                m2, [%1+15*%3]      ; IN(15)
+
+    ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
+    ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15
+
+    VP9_UNPACK_MULSUB_2W_4X   5,   2, 16305,  1606, [pd_8192], 0, 1 ; t8,  t15
+    VP9_UNPACK_MULSUB_2W_4X   3,   4, 10394, 12665, [pd_8192], 0, 1 ; t9,  t14
+%endif
+
+    SUMSUB_BA            w,  3,  5, 0       ; t8,  t9
+    SUMSUB_BA            w,  4,  2, 0       ; t15, t14
+
+    VP9_UNPACK_MULSUB_2W_4X   2,   5, 15137,  6270, [pd_8192], 0, 1 ; t9,  t14
+
+    SCRATCH              4, 10, %4+ 1*%5
+    SCRATCH              5, 11, %4+ 7*%5
+
+    mova                m6, [%1+ 3*%3]      ; IN(3)
+    mova                m7, [%1+ 5*%3]      ; IN(5)
+%if %2 <= 8
+    pmulhrsw            m0, m7,  [pw_14449x2]       ; t13
+    pmulhrsw            m7, [pw_7723x2]             ; t10
+    pmulhrsw            m1, m6,  [pw_m4756x2]       ; t11
+    pmulhrsw            m6, [pw_15679x2]            ; t12
+%else
+    mova                m0, [%1+11*%3]      ; IN(11)
+    mova                m1, [%1+13*%3]      ; IN(13)
+
+    VP9_UNPACK_MULSUB_2W_4X   7,   0, 14449,  7723, [pd_8192], 4, 5 ; t10, t13
+    VP9_UNPACK_MULSUB_2W_4X   1,   6,  4756, 15679, [pd_8192], 4, 5 ; t11, t12
+%endif
+
+    ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
+    ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15
+
+    SUMSUB_BA            w,  7,  1, 4       ; t11, t10
+    SUMSUB_BA            w,  0,  6, 4       ; t12, t13
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+    VP9_UNPACK_MULSUB_2W_4X   6,   1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
+
+    UNSCRATCH            5, 11, %4+ 7*%5
+%endif
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
+    ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
+
+    SUMSUB_BA            w,  7,  3, 4       ; t8,  t11
+
+    ; backup first register
+    mova        [%4+15*%5], m7
+
+    SUMSUB_BA            w,  6,  2, 7       ; t9,  t10
+    UNSCRATCH            4, 10, %4+ 1*%5
+    SUMSUB_BA            w,  0,  4, 7       ; t15, t12
+    SUMSUB_BA            w,  1,  5, 7       ; t14. t13
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+
+%if cpuflag(ssse3) && %6 == 0
+    SUMSUB_BA            w,  2,  5, 7
+    SUMSUB_BA            w,  3,  4, 7
+    pmulhrsw            m5, [pw_11585x2]    ; t10
+    pmulhrsw            m4, [pw_11585x2]    ; t11
+    pmulhrsw            m3, [pw_11585x2]    ; t12
+    pmulhrsw            m2, [pw_11585x2]    ; t13
+%else
+    SCRATCH              6, 10, %4+ 1*%5
+    VP9_UNPACK_MULSUB_2W_4X   5,   2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13
+    VP9_UNPACK_MULSUB_2W_4X   4,   3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12
+    UNSCRATCH            6, 10, %4+ 1*%5
+%endif
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
+
+    SCRATCH              0,  8, %4+ 1*%5
+    SCRATCH              1,  9, %4+ 3*%5
+    SCRATCH              2, 10, %4+ 5*%5
+    SCRATCH              3, 11, %4+ 7*%5
+    SCRATCH              4, 12, %4+ 9*%5
+    SCRATCH              5, 13, %4+11*%5
+    SCRATCH              6, 14, %4+13*%5
+
+    ; even (tx8x8)
+%if %2 <= 4
+    mova                m3, [%1+ 0*%3]      ; IN(0)
+    mova                m4, [%1+ 2*%3]      ; IN(2)
+
+    pmulhrsw            m3, [pw_11585x2]    ; t0-t3
+    pmulhrsw            m7, m4, [pw_16069x2]        ; t6-7
+    pmulhrsw            m4, [pw_3196x2]             ; t4-5
+
+%if 0 ; overflows :(
+    paddw               m6, m7, m4
+    psubw               m5, m7, m4
+    pmulhrsw            m5, [pw_11585x2]            ; t5
+    pmulhrsw            m6, [pw_11585x2]            ; t6
+%else
+    VP9_UNPACK_MULSUB_2W_4X  5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5,  t6
+%endif
+
+    psubw               m0, m3, m7
+    paddw               m7, m3
+    psubw               m1, m3, m6
+    paddw               m6, m3
+    psubw               m2, m3, m5
+    paddw               m5, m3
+
+%if ARCH_X86_32
+    SWAP                 0, 7
+%endif
+    SCRATCH              7, 15, %4+12*%5
+%else
+    mova                m6, [%1+ 2*%3]      ; IN(2)
+    mova                m1, [%1+ 4*%3]      ; IN(4)
+    mova                m7, [%1+ 6*%3]      ; IN(6)
+%if %2 <= 8
+    pmulhrsw            m0, m1,  [pw_15137x2]       ; t3
+    pmulhrsw            m1, [pw_6270x2]             ; t2
+    pmulhrsw            m5, m6, [pw_16069x2]        ; t7
+    pmulhrsw            m6, [pw_3196x2]             ; t4
+    pmulhrsw            m4, m7, [pw_m9102x2]        ; t5
+    pmulhrsw            m7, [pw_13623x2]            ; t6
+%else
+    mova                m4, [%1+10*%3]      ; IN(10)
+    mova                m0, [%1+12*%3]      ; IN(12)
+    mova                m5, [%1+14*%3]      ; IN(14)
+
+    VP9_UNPACK_MULSUB_2W_4X   1,   0, 15137,  6270, [pd_8192], 2, 3 ; t2,  t3
+    VP9_UNPACK_MULSUB_2W_4X   6,   5, 16069,  3196, [pd_8192], 2, 3 ; t4,  t7
+    VP9_UNPACK_MULSUB_2W_4X   4,   7,  9102, 13623, [pd_8192], 2, 3 ; t5,  t6
+%endif
+
+    SUMSUB_BA            w,  4,  6, 2       ; t4,  t5
+    SUMSUB_BA            w,  7,  5, 2       ; t7,  t6
+
+%if cpuflag(ssse3) && %6 == 0
+    SUMSUB_BA            w,  6,  5, 2
+    pmulhrsw            m5, [pw_11585x2]                              ; t5
+    pmulhrsw            m6, [pw_11585x2]                              ; t6
+%else
+    VP9_UNPACK_MULSUB_2W_4X  5,  6, 11585, 11585, [pd_8192], 2, 3 ; t5,  t6
+%endif
+
+    SCRATCH              5, 15, %4+10*%5
+    mova                m2, [%1+ 0*%3]      ; IN(0)
+%if %2 <= 8
+    pmulhrsw            m2, [pw_11585x2]    ; t0 and t1
+    psubw               m3, m2, m0
+    paddw               m0, m2
+
+    SUMSUB_BA            w,  7,  0, 5       ; t0,  t7
+%else
+    mova                m3, [%1+ 8*%3]      ; IN(8)
+
+    ; from 3 stages back
+%if cpuflag(ssse3) && %6 == 0
+    SUMSUB_BA            w,  3,  2, 5
+    pmulhrsw            m3, [pw_11585x2]    ; t0
+    pmulhrsw            m2, [pw_11585x2]    ; t1
+%else
+    mova        [%1+ 0*%3], m0
+    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585,  11585, [pd_8192], 5, 0 ; t0, t1
+    mova                m0, [%1+ 0*%3]
+%endif
+
+    ; from 2 stages back
+    SUMSUB_BA            w,  0,  3, 5      ; t0,  t3
+
+    SUMSUB_BA            w,  7,  0, 5      ; t0,  t7
+%endif
+    UNSCRATCH            5, 15, %4+10*%5
+%if ARCH_X86_32
+    SWAP                 0, 7
+%endif
+    SCRATCH              7, 15, %4+12*%5
+    SUMSUB_BA            w,  1,  2, 7       ; t1,  t2
+
+    ; from 1 stage back
+    SUMSUB_BA            w,  6,  1, 7       ; t1,  t6
+    SUMSUB_BA            w,  5,  2, 7       ; t2,  t5
+%endif
+    SUMSUB_BA            w,  4,  3, 7       ; t3,  t4
+
+%if ARCH_X86_64
+    SWAP                 0, 8
+    SWAP                 1, 9
+    SWAP                 2, 10
+    SWAP                 3, 11
+    SWAP                 4, 12
+    SWAP                 5, 13
+    SWAP                 6, 14
+
+    SUMSUB_BA            w,  0, 15, 7       ; t0, t15
+    SUMSUB_BA            w,  1, 14, 7       ; t1, t14
+    SUMSUB_BA            w,  2, 13, 7       ; t2, t13
+    SUMSUB_BA            w,  3, 12, 7       ; t3, t12
+    SUMSUB_BA            w,  4, 11, 7       ; t4, t11
+    SUMSUB_BA            w,  5, 10, 7       ; t5, t10
+%else
+    SWAP                 1, 6
+    SWAP                 2, 5
+    SWAP                 3, 4
+    mova        [%4+14*%5], m6
+
+%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride
+    mova                m6, [%4+%2*%5]
+    SUMSUB_BA            w,  6, %1, 7
+    SWAP                %1, 6
+    mova        [%4+%3*%5], m6
+%endmacro
+
+    %%SUMSUB_BA_STORE    0,  1,  1, %4, %5  ; t0, t15
+    %%SUMSUB_BA_STORE    1,  3,  3, %4, %5  ; t1, t14
+    %%SUMSUB_BA_STORE    2,  5,  5, %4, %5  ; t2, t13
+    %%SUMSUB_BA_STORE    3,  7,  7, %4, %5  ; t3, t12
+    %%SUMSUB_BA_STORE    4,  9,  9, %4, %5  ; t4, t11
+    %%SUMSUB_BA_STORE    5, 11, 11, %4, %5  ; t5, t10
+%endif
+%endmacro
+
+%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst
+%if %2 == 1
+    VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4
+
+%if ARCH_X86_64
+    ; backup a different register
+    mova                m7, [tmpq+15*16]
+    mova      [tmpq+ 1*16], m15
+
+    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
+    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
+
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 15
+    mova        [tmpq+  0], m0
+    mova        [tmpq+ 32], m1
+    mova        [tmpq+ 64], m2
+    mova        [tmpq+ 96], m3
+    mova        [tmpq+128], m4
+    mova        [tmpq+160], m5
+    mova        [tmpq+192], m6
+    mova        [tmpq+224], m7
+
+    mova               m15, [tmpq+ 1*16]
+    TRANSPOSE8x8W        8, 9, 10, 11, 12, 13, 14, 15, 0
+    mova        [tmpq+ 16], m8
+    mova        [tmpq+ 48], m9
+    mova        [tmpq+ 80], m10
+    mova        [tmpq+112], m11
+    mova        [tmpq+144], m12
+    mova        [tmpq+176], m13
+    mova        [tmpq+208], m14
+    mova        [tmpq+240], m15
+%else
+    mova                m6, [tmpq+13*16]
+    mova                m7, [tmpq+14*16]
+    SUMSUB_BA            w, 6, 7                ; t6, t9
+    mova      [tmpq+14*16], m6
+    mova      [tmpq+13*16], m7
+    mova                m7, [tmpq+15*16]
+    mova                m6, [tmpq+12*16]
+    SUMSUB_BA            w, 7, 6                ; t7, t8
+    mova      [tmpq+15*16], m6
+
+    TRANSPOSE8x8W       0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1
+    mova     [tmpq+ 0*16], m0
+    mova     [tmpq+ 2*16], m1
+    mova     [tmpq+ 4*16], m2
+    mova     [tmpq+ 6*16], m3
+    mova     [tmpq+10*16], m5
+    mova     [tmpq+12*16], m6
+    mova     [tmpq+14*16], m7
+
+    mova                m0, [tmpq+15*16]
+    mova                m1, [tmpq+13*16]
+    mova                m2, [tmpq+11*16]
+    mova                m3, [tmpq+ 9*16]
+    mova                m4, [tmpq+ 7*16]
+    mova                m5, [tmpq+ 5*16]
+    mova                m7, [tmpq+ 1*16]
+    TRANSPOSE8x8W       0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1
+    mova     [tmpq+ 1*16], m0
+    mova     [tmpq+ 3*16], m1
+    mova     [tmpq+ 5*16], m2
+    mova     [tmpq+ 7*16], m3
+    mova     [tmpq+11*16], m5
+    mova     [tmpq+13*16], m6
+    mova     [tmpq+15*16], m7
+%endif
+%else ; %2 == 2
+    VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+    pxor                m7, m7
+%if ARCH_X86_64
+    ; backup more registers
+    mova        [%1+ 2*32], m8
+    mova        [%1+ 3*32], m9
+
+    VP9_IDCT8_WRITEx2    0,  1, 8, 9, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    2,  3, 8, 9, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    4,  5, 8, 9, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    ; restore from cache
+    SWAP                 0, 7               ; move zero from m7 to m0
+    mova                m7, [%1+15*32]
+    mova                m8, [%1+ 2*32]
+    mova                m9, [%1+ 3*32]
+
+    SUMSUB_BA            w,  6,  9, 3       ; t6, t9
+    SUMSUB_BA            w,  7,  8, 3       ; t7, t8
+
+    VP9_IDCT8_WRITEx2    6,  7, 3, 4, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    8,  9, 3, 4, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2   10, 11, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2   12, 13, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2   14, 15, 1, 2, 0, ROUND_REG, 6
+%else
+    mova      [tmpq+ 0*32], m5
+
+    VP9_IDCT8_WRITEx2    0,  1, 5, 6, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    2,  3, 5, 6, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    SWAP                 0, 7               ; move zero from m7 to m0
+    mova                m5, [tmpq+ 0*32]
+
+    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    mova                m4, [tmpq+13*32]
+    mova                m7, [tmpq+14*32]
+    mova                m5, [tmpq+15*32]
+    mova                m6, [tmpq+12*32]
+    SUMSUB_BADC w, 4, 7, 5, 6, 1
+
+    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    mova                m4, [tmpq+11*32]
+    mova                m5, [tmpq+ 9*32]
+    mova                m6, [tmpq+ 7*32]
+    mova                m7, [tmpq+ 5*32]
+
+    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    mova                m4, [tmpq+ 3*32]
+    mova                m5, [tmpq+ 1*32]
+
+    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+%endif
+
+%undef ROUND_REG
+%endif ; %2 == 1/2
+%endmacro
+
+%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
+    mova               m%3, [dstq]
+    mova               m%5, [dstq+%7]
+    punpcklbw          m%2, m%3, m%6
+    punpckhbw          m%3, m%6
+    punpcklbw          m%4, m%5, m%6
+    punpckhbw          m%5, m%6
+    paddw              m%2, m%1
+    paddw              m%3, m%1
+    paddw              m%4, m%1
+    paddw              m%5, m%1
+    packuswb           m%2, m%3
+    packuswb           m%4, m%5
+    mova            [dstq], m%2
+    mova         [dstq+%7], m%4
+%endmacro
+
+%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
+%if cpuflag(ssse3)
+    ; 2x2=eob=3, 4x4=eob=10
+    cmp eobd, 38
+    jg .idctfull
+    cmp eobd, 1 ; faster path for when only DC is set
+    jne .idct8x8
+%else
+    cmp eobd, 1 ; faster path for when only DC is set
+    jg .idctfull
+%endif
+
+    ; dc-only
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    mova                m1, [pw_11585x2]
+    pmulhrsw            m0, m1
+    pmulhrsw            m0, m1
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx            coefd, word [blockq]
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, (32 << 14) + 8192
+    sar              coefd, 14 + 6
+    movd                m0, coefd
+%endif
+    SPLATW              m0, m0, q0000
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_512]
+%endif
+    pxor                m5, m5
+    movd          [blockq], m5
+%rep 7
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
+    lea               dstq, [dstq+2*strideq]
+%endrep
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
+    RET
+
+    DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
+%if cpuflag(ssse3)
+.idct8x8:
+    mov               tmpq, rsp
+    VP9_IDCT16_1D   blockq, 1, 8, 0
+
+    mov               cntd, 2
+    mov           dst_bakq, dstq
+.loop2_8x8:
+    VP9_IDCT16_1D     tmpq, 2, 8, 0
+    lea               dstq, [dst_bakq+8]
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_8x8
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 32, 8, m0
+    RET
+%endif
+
+.idctfull:
+    mov               cntd, 2
+    mov               tmpq, rsp
+.loop1_full:
+    VP9_IDCT16_1D   blockq, 1, 16, 0
+    add             blockq, 16
+    add               tmpq, 256
+    dec               cntd
+    jg .loop1_full
+    sub             blockq, 32
+
+    mov               cntd, 2
+    mov               tmpq, rsp
+    mov           dst_bakq, dstq
+.loop2_full:
+    VP9_IDCT16_1D     tmpq, 2, 16, 0
+    lea               dstq, [dst_bakq+8]
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 32, 16, m0
+    RET
+%endmacro
+
+VP9_IDCT_IDCT_16x16_ADD_XMM sse2
+VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
+VP9_IDCT_IDCT_16x16_ADD_XMM avx
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IADST16_1D 2 ; src, pass
+%assign %%str 16*%2
+    mova                m0, [%1+ 0*32]  ; in0
+    mova                m1, [%1+15*32]  ; in15
+    mova                m2, [%1+ 7*32]  ; in7
+    mova                m3, [%1+ 8*32]  ; in8
+
+    VP9_UNPACK_MULSUB_2D_4X  1,  0,  4,  5, 16364,   804    ; m1/4=t1[d], m0/5=t0[d]
+    VP9_UNPACK_MULSUB_2D_4X  2,  3,  7,  6, 11003, 12140    ; m2/7=t9[d], m3/6=t8[d]
+    SCRATCH              4, 8, tmpq+ 0*%%str
+    VP9_RND_SH_SUMSUB_BA     3,  0,  6,  5,  4, [pd_8192]   ; m3=t0[w], m0=t8[w]
+    UNSCRATCH            4, 8, tmpq+ 0*%%str
+    VP9_RND_SH_SUMSUB_BA     2,  1,  7,  4,  5, [pd_8192]   ; m2=t1[w], m1=t9[w]
+
+    SCRATCH              0, 10, tmpq+ 0*%%str
+    SCRATCH              1, 11, tmpq+15*%%str
+    mova   [tmpq+ 7*%%str], m2
+    mova   [tmpq+ 8*%%str], m3
+
+    mova                m1, [%1+ 2*32]  ; in2
+    mova                m0, [%1+13*32]  ; in13
+    mova                m3, [%1+ 5*32]  ; in5
+    mova                m2, [%1+10*32]  ; in10
+
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 15893,  3981    ; m0/6=t3[d], m1/7=t2[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  4,  5,  8423, 14053    ; m3/4=t11[d], m2/5=t10[d]
+    SCRATCH              4, 12, tmpq+ 2*%%str
+    VP9_RND_SH_SUMSUB_BA     2,  1,  5,  7,  4, [pd_8192]   ; m2=t2[w], m1=t10[w]
+    UNSCRATCH            4, 12, tmpq+ 2*%%str
+    VP9_RND_SH_SUMSUB_BA     3,  0,  4,  6,  5, [pd_8192]   ; m3=t3[w], m0=t11[w]
+
+    SCRATCH              0, 12, tmpq+ 2*%%str
+    SCRATCH              1, 13, tmpq+13*%%str
+    mova   [tmpq+ 5*%%str], m2
+    mova   [tmpq+10*%%str], m3
+
+    mova                m2, [%1+ 4*32]  ; in4
+    mova                m3, [%1+11*32]  ; in11
+    mova                m0, [%1+ 3*32]  ; in3
+    mova                m1, [%1+12*32]  ; in12
+
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 14811,  7005    ; m3/7=t5[d], m2/6=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  4,  5,  5520, 15426    ; m0/4=t13[d], m1/5=t12[d]
+    SCRATCH              4, 9, tmpq+ 4*%%str
+    VP9_RND_SH_SUMSUB_BA     1,  2,  5,  6,  4, [pd_8192]   ; m1=t4[w], m2=t12[w]
+    UNSCRATCH            4, 9, tmpq+ 4*%%str
+    VP9_RND_SH_SUMSUB_BA     0,  3,  4,  7,  6, [pd_8192]   ; m0=t5[w], m3=t13[w]
+
+    SCRATCH              0,  8, tmpq+ 4*%%str
+    mova   [tmpq+11*%%str], m1          ; t4:m1->r11
+    UNSCRATCH            0, 10, tmpq+ 0*%%str
+    UNSCRATCH            1, 11, tmpq+15*%%str
+
+    ; round 2 interleaved part 1
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 16069,  3196    ; m1/7=t8[d], m0/6=t9[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  5,  4,  3196, 16069    ; m3/5=t12[d], m2/4=t13[d]
+    SCRATCH              4, 9, tmpq+ 3*%%str
+    VP9_RND_SH_SUMSUB_BA     3,  1,  5,  7,  4, [pd_8192]   ; m3=t8[w], m1=t12[w]
+    UNSCRATCH            4, 9, tmpq+ 3*%%str
+    VP9_RND_SH_SUMSUB_BA     2,  0,  4,  6,  5, [pd_8192]   ; m2=t9[w], m0=t13[w]
+
+    SCRATCH              0, 10, tmpq+ 0*%%str
+    SCRATCH              1, 11, tmpq+15*%%str
+    SCRATCH              2, 14, tmpq+ 3*%%str
+    SCRATCH              3, 15, tmpq+12*%%str
+
+    mova                m2, [%1+ 6*32]  ; in6
+    mova                m3, [%1+ 9*32]  ; in9
+    mova                m0, [%1+ 1*32]  ; in1
+    mova                m1, [%1+14*32]  ; in14
+
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 13160,  9760    ; m3/7=t7[d], m2/6=t6[d]
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  4,  5,  2404, 16207    ; m0/4=t15[d], m1/5=t14[d]
+    SCRATCH              4, 9, tmpq+ 6*%%str
+    VP9_RND_SH_SUMSUB_BA     1,  2,  5,  6,  4, [pd_8192]   ; m1=t6[w], m2=t14[w]
+    UNSCRATCH            4, 9, tmpq+ 6*%%str
+    VP9_RND_SH_SUMSUB_BA     0,  3,  4,  7,  6, [pd_8192]   ; m0=t7[w], m3=t15[w]
+
+    ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7
+    ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15
+
+    UNSCRATCH            4, 12, tmpq+ 2*%%str
+    UNSCRATCH            5, 13, tmpq+13*%%str
+    SCRATCH              0, 12, tmpq+ 1*%%str
+    SCRATCH              1, 13, tmpq+14*%%str
+
+    ; remainder of round 2 (rest of t8-15)
+    VP9_UNPACK_MULSUB_2D_4X  5,  4,  6,  7,  9102, 13623    ; m5/6=t11[d], m4/7=t10[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  1,  0, 13623,  9102    ; m3/1=t14[d], m2/0=t15[d]
+    SCRATCH              0, 9, tmpq+ 6*%%str
+    VP9_RND_SH_SUMSUB_BA     3,  4,  1,  7,  0, [pd_8192]   ; m3=t10[w], m4=t14[w]
+    UNSCRATCH            0, 9, tmpq+ 6*%%str
+    VP9_RND_SH_SUMSUB_BA     2,  5,  0,  6,  1, [pd_8192]   ; m2=t11[w], m5=t15[w]
+
+    ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15
+
+    UNSCRATCH            6, 14, tmpq+ 3*%%str
+    UNSCRATCH            7, 15, tmpq+12*%%str
+
+    SUMSUB_BA                w,  3,  7,  1
+    PSIGNW                  m3, [pw_m1]                     ; m3=out1[w], m7=t10[w]
+    SUMSUB_BA                w,  2,  6,  1                  ; m2=out14[w], m6=t11[w]
+
+    ; unfortunately, the code below overflows in some cases, e.g.
+    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm
+%if 0; cpuflag(ssse3)
+    SUMSUB_BA                w,  7,  6,  1
+    pmulhrsw                m7, [pw_11585x2]                ; m7=out6[w]
+    pmulhrsw                m6, [pw_11585x2]                ; m6=out9[w]
+%else
+    VP9_UNPACK_MULSUB_2W_4X  6,  7, 11585, 11585, [pd_8192], 1, 0
+%endif
+
+    mova       [tmpq+ 3*%%str], m6
+    mova       [tmpq+ 6*%%str], m7
+    UNSCRATCH                6, 10, tmpq+ 0*%%str
+    UNSCRATCH                7, 11, tmpq+15*%%str
+    mova       [tmpq+13*%%str], m2
+    SCRATCH                  3, 11, tmpq+ 9*%%str
+
+    VP9_UNPACK_MULSUB_2D_4X  7,  6,  2,  3, 15137,  6270    ; m6/3=t13[d], m7/2=t12[d]
+    VP9_UNPACK_MULSUB_2D_4X  5,  4,  1,  0,  6270, 15137    ; m5/1=t14[d], m4/0=t15[d]
+    SCRATCH              0, 9, tmpq+ 2*%%str
+    VP9_RND_SH_SUMSUB_BA     5,  6,  1,  3,  0, [pd_8192]   ; m5=out2[w], m6=t14[w]
+    UNSCRATCH            0, 9, tmpq+ 2*%%str
+    VP9_RND_SH_SUMSUB_BA     4,  7,  0,  2,  1, [pd_8192]
+    PSIGNW                  m4, [pw_m1]                     ; m4=out13[w], m7=t15[w]
+
+    ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
+    SUMSUB_BA                w,  7,  6,  1
+    pmulhrsw                m7, [pw_m11585x2]               ; m7=out5[w]
+    pmulhrsw                m6, [pw_11585x2]                ; m6=out10[w]
+%else
+    PSIGNW                  m7, [pw_m1]
+    VP9_UNPACK_MULSUB_2W_4X  7,  6, 11585, 11585, [pd_8192], 1, 0
+%endif
+
+    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14
+
+    mova                    m2, [tmpq+ 8*%%str]
+    mova                    m3, [tmpq+ 7*%%str]
+    mova                    m1, [tmpq+11*%%str]
+    mova       [tmpq+ 7*%%str], m6
+    mova       [tmpq+11*%%str], m4
+    mova                    m4, [tmpq+ 5*%%str]
+    SCRATCH                  5, 14, tmpq+ 5*%%str
+    SCRATCH                  7, 15, tmpq+ 8*%%str
+    UNSCRATCH                6,  8, tmpq+ 4*%%str
+    UNSCRATCH                5, 12, tmpq+ 1*%%str
+    UNSCRATCH                7, 13, tmpq+14*%%str
+
+    ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7
+    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
+
+    SUMSUB_BA                w,  1,  2, 0                   ; m1=t0[w], m2=t4[w]
+    mova                    m0, [tmpq+10*%%str]
+    SCRATCH                  1, 12, tmpq+ 1*%%str
+    SUMSUB_BA                w,  6,  3, 1                   ; m8=t1[w], m3=t5[w]
+    SCRATCH                  6, 13, tmpq+ 4*%%str
+    SUMSUB_BA                w,  7,  4, 1                   ; m13=t2[w], m9=t6[w]
+    SCRATCH                  7,  8, tmpq+10*%%str
+    SUMSUB_BA                w,  5,  0, 1                   ; m12=t3[w], m0=t7[w]
+    SCRATCH                  5,  9, tmpq+14*%%str
+
+    VP9_UNPACK_MULSUB_2D_4X  2,  3,  7,  5, 15137,  6270    ; m2/6=t5[d], m3/10=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X  0,  4,  1,  6,  6270, 15137    ; m0/14=t6[d], m9/15=t7[d]
+    SCRATCH                  6, 10, tmpq+ 0*%%str
+    VP9_RND_SH_SUMSUB_BA     0,  3,  1,  5,  6, [pd_8192]
+    UNSCRATCH                6, 10, tmpq+ 0*%%str
+    PSIGNW                  m0, [pw_m1]                     ; m0=out3[w], m3=t6[w]
+    VP9_RND_SH_SUMSUB_BA     4,  2,  6,  7,  5, [pd_8192]   ; m9=out12[w], m2=t7[w]
+
+    UNSCRATCH                1,  8, tmpq+10*%%str
+    UNSCRATCH                5,  9, tmpq+14*%%str
+    UNSCRATCH                6, 12, tmpq+ 1*%%str
+    UNSCRATCH                7, 13, tmpq+ 4*%%str
+    SCRATCH                  4,  9, tmpq+14*%%str
+
+    SUMSUB_BA                w,  1,  6,  4                  ; m13=out0[w], m1=t2[w]
+    SUMSUB_BA                w,  5,  7,  4
+    PSIGNW                  m5, [pw_m1]                     ; m12=out15[w], m8=t3[w]
+
+    ; unfortunately, the code below overflows in some cases, e.g.
+    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
+%if 0 ; cpuflag(ssse3)
+    SUMSUB_BA               w,   7,  6,  4
+    pmulhrsw                m7, [pw_m11585x2]               ; m8=out7[w]
+    pmulhrsw                m6, [pw_11585x2]                ; m1=out8[w]
+    SWAP                     6,  7
+    SUMSUB_BA                w,  3,  2,  4
+    pmulhrsw                m3, [pw_11585x2]                ; m3=out4[w]
+    pmulhrsw                m2, [pw_11585x2]                ; m2=out11[w]
+%else
+    SCRATCH                  5,  8, tmpq+10*%%str
+    VP9_UNPACK_MULSUB_2W_4X  6,  7, 11585, m11585, [pd_8192],  5,  4
+    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585, 11585, [pd_8192],  5,  4
+    UNSCRATCH                5,  8, tmpq+10*%%str
+%endif
+
+    ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15
+    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
+
+%if %2 == 1
+%if ARCH_X86_64
+    mova                   m13, [tmpq+ 6*%%str]
+    TRANSPOSE8x8W            1, 11, 14, 0, 3, 15, 13, 6, 10
+    mova          [tmpq+ 0*16], m1
+    mova          [tmpq+ 2*16], m11
+    mova          [tmpq+ 4*16], m14
+    mova          [tmpq+ 6*16], m0
+    mova                    m1, [tmpq+ 3*%%str]
+    mova                   m11, [tmpq+ 7*%%str]
+    mova                   m14, [tmpq+11*%%str]
+    mova                    m0, [tmpq+13*%%str]
+    mova          [tmpq+ 8*16], m3
+    mova          [tmpq+10*16], m15
+    mova          [tmpq+12*16], m13
+    mova          [tmpq+14*16], m6
+
+    TRANSPOSE8x8W            7, 1, 11, 2, 9, 14, 0, 5, 10
+    mova          [tmpq+ 1*16], m7
+    mova          [tmpq+ 3*16], m1
+    mova          [tmpq+ 5*16], m11
+    mova          [tmpq+ 7*16], m2
+    mova          [tmpq+ 9*16], m9
+    mova          [tmpq+11*16], m14
+    mova          [tmpq+13*16], m0
+    mova          [tmpq+15*16], m5
+%else
+    mova       [tmpq+12*%%str], m2
+    mova       [tmpq+ 1*%%str], m5
+    mova       [tmpq+15*%%str], m7
+    mova                    m2, [tmpq+ 9*%%str]
+    mova                    m5, [tmpq+ 5*%%str]
+    mova                    m7, [tmpq+ 8*%%str]
+    TRANSPOSE8x8W            1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
+    mova          [tmpq+ 0*16], m1
+    mova          [tmpq+ 2*16], m2
+    mova          [tmpq+ 4*16], m5
+    mova          [tmpq+ 6*16], m0
+    mova          [tmpq+10*16], m7
+    mova                    m3, [tmpq+12*%%str]
+    mova          [tmpq+12*16], m4
+    mova                    m4, [tmpq+14*%%str]
+    mova          [tmpq+14*16], m6
+
+    mova                    m0, [tmpq+15*%%str]
+    mova                    m1, [tmpq+ 3*%%str]
+    mova                    m2, [tmpq+ 7*%%str]
+    mova                    m5, [tmpq+11*%%str]
+    mova                    m7, [tmpq+ 1*%%str]
+    TRANSPOSE8x8W            0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1
+    mova          [tmpq+ 1*16], m0
+    mova          [tmpq+ 3*16], m1
+    mova          [tmpq+ 5*16], m2
+    mova          [tmpq+ 7*16], m3
+    mova          [tmpq+11*16], m5
+    mova          [tmpq+13*16], m6
+    mova          [tmpq+15*16], m7
+%endif
+%else
+    pxor                    m4, m4
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+%if ARCH_X86_64
+    mova                   m12, [tmpq+ 6*%%str]
+    VP9_IDCT8_WRITEx2        1, 11, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2       14,  0, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2        3, 15, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2       12,  6, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+
+    mova                    m1, [tmpq+ 3*%%str]
+    mova                   m11, [tmpq+ 7*%%str]
+    mova                   m14, [tmpq+11*%%str]
+    mova                    m0, [tmpq+13*%%str]
+
+    VP9_IDCT8_WRITEx2        7,  1, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2       11,  2, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2        9, 14, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2        0,  5, 10,  8,  4, ROUND_REG, 6
+%else
+    mova       [tmpq+ 0*%%str], m2
+    mova       [tmpq+ 1*%%str], m5
+    mova       [tmpq+ 2*%%str], m7
+    mova                    m2, [tmpq+ 9*%%str]
+    VP9_IDCT8_WRITEx2        1,  2,  5,  7,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m5, [tmpq+ 5*%%str]
+    VP9_IDCT8_WRITEx2        5,  0,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m5, [tmpq+ 8*%%str]
+    VP9_IDCT8_WRITEx2        3,  5,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m5, [tmpq+ 6*%%str]
+    VP9_IDCT8_WRITEx2        5,  6,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+
+    mova                    m0, [tmpq+ 2*%%str]
+    mova                    m3, [tmpq+ 3*%%str]
+    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m0, [tmpq+ 7*%%str]
+    mova                    m3, [tmpq+ 0*%%str]
+    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m0, [tmpq+14*%%str]
+    mova                    m3, [tmpq+11*%%str]
+    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m0, [tmpq+13*%%str]
+    mova                    m3, [tmpq+ 1*%%str]
+    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
+%endif
+
+    SWAP                     0,  4 ; zero
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro IADST16_FN 5
+INIT_XMM %5
+cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
+    mov               cntd, 2
+    mov               tmpq, rsp
+.loop1_full:
+    VP9_%2_1D       blockq, 1
+    add             blockq, 16
+    add               tmpq, 256
+    dec               cntd
+    jg .loop1_full
+    sub             blockq, 32
+
+    mov               cntd, 2
+    mov               tmpq, rsp
+    mov           dst_bakq, dstq
+.loop2_full:
+    VP9_%4_1D         tmpq, 2
+    lea               dstq, [dst_bakq+8]
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 32, 16, m0
+    RET
+%endmacro
+
+IADST16_FN idct,  IDCT16,  iadst, IADST16, sse2
+IADST16_FN iadst, IADST16, idct,  IDCT16,  sse2
+IADST16_FN iadst, IADST16, iadst, IADST16, sse2
+IADST16_FN idct,  IDCT16,  iadst, IADST16, ssse3
+IADST16_FN iadst, IADST16, idct,  IDCT16,  ssse3
+IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
+IADST16_FN idct,  IDCT16,  iadst, IADST16, avx
+IADST16_FN iadst, IADST16, idct,  IDCT16,  avx
+IADST16_FN iadst, IADST16, iadst, IADST16, avx
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
+%assign %%str 16*%2*%2
+    ; first do t0-15, this can be done identical to idct16x16
+    VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1
+
+    ; store everything on stack to make space available for t16-31
+    ; we store interleaved with the output of the second half (t16-31)
+    ; so we don't need to allocate extra stack space
+    mova    [tmpq+ 0*%%str], m0     ; t0
+    mova    [tmpq+ 4*%%str], m1     ; t1
+    mova    [tmpq+ 8*%%str], m2     ; t2
+    mova    [tmpq+12*%%str], m3     ; t3
+    mova    [tmpq+16*%%str], m4     ; t4
+    mova    [tmpq+20*%%str], m5     ; t5
+%if ARCH_X86_64
+    mova    [tmpq+22*%%str], m10    ; t10
+    mova    [tmpq+18*%%str], m11    ; t11
+    mova    [tmpq+14*%%str], m12    ; t12
+    mova    [tmpq+10*%%str], m13    ; t13
+    mova    [tmpq+ 6*%%str], m14    ; t14
+    mova    [tmpq+ 2*%%str], m15    ; t15
+%endif
+
+    mova                m0, [tmpq+ 30*%%str]
+    UNSCRATCH            1,  6, tmpq+26*%%str
+    UNSCRATCH            2,  8, tmpq+24*%%str
+    UNSCRATCH            3,  9, tmpq+28*%%str
+    SUMSUB_BA            w,  1,  3, 4       ; t6, t9
+    SUMSUB_BA            w,  0,  2, 4       ; t7, t8
+
+    mova    [tmpq+24*%%str], m1     ; t6
+    mova    [tmpq+28*%%str], m0     ; t7
+    mova    [tmpq+30*%%str], m2     ; t8
+    mova    [tmpq+26*%%str], m3     ; t9
+
+    ; then, secondly, do t16-31
+%if %3 <= 8
+    mova                 m4, [%1+ 1*64]
+    mova                 m7, [%1+ 7*64]
+
+    pmulhrsw             m1,  m4, [pw_16364x2] ;t31
+    pmulhrsw             m4, [pw_804x2] ;t16
+
+    VP9_UNPACK_MULSUB_2W_4X   5,  0,  1,  4, 16069,  3196, [pd_8192], 6,  2 ; t17, t30
+
+    pmulhrsw             m3,  m7, [pw_m5520x2] ;t19
+    pmulhrsw             m7, [pw_15426x2] ;t28
+
+    SCRATCH               4, 13, tmpq+ 1*%%str
+    SCRATCH               5, 12, tmpq+15*%%str
+
+    VP9_UNPACK_MULSUB_2W_4X   2,  6,  7,  3, 3196, m16069, [pd_8192], 4,  5 ; t18, t29
+%else
+    mova                 m0, [%1+ 1*64]
+    mova                 m1, [%1+15*64]
+%if %3 <= 16
+    pmulhrsw             m5, m0, [pw_16364x2]
+    pmulhrsw             m0, [pw_804x2]
+    pmulhrsw             m4, m1, [pw_m11003x2]
+    pmulhrsw             m1, [pw_12140x2]
+%else
+    mova                 m4, [%1+17*64]
+    mova                 m5, [%1+31*64]
+
+    VP9_UNPACK_MULSUB_2W_4X   0,  5, 16364,   804, [pd_8192], 2, 3 ; t16, t31
+    VP9_UNPACK_MULSUB_2W_4X   4,  1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
+%endif
+    SUMSUB_BA             w,  4,  0,  2
+    SUMSUB_BA             w,  1,  5,  2
+
+    VP9_UNPACK_MULSUB_2W_4X   5,  0, 16069,  3196, [pd_8192], 2, 3 ; t17, t30
+
+    SCRATCH               4, 13, tmpq+ 1*%%str
+    SCRATCH               5, 12, tmpq+15*%%str
+
+    mova                 m2, [%1+ 7*64]
+    mova                 m3, [%1+ 9*64]
+%if %3 <= 16
+    pmulhrsw             m7,  m3, [pw_14811x2]
+    pmulhrsw             m3, [pw_7005x2]
+    pmulhrsw             m6,  m2, [pw_m5520x2]
+    pmulhrsw             m2, [pw_15426x2]
+%else
+    mova                 m7, [%1+23*64]
+    mova                 m6, [%1+25*64]
+
+    VP9_UNPACK_MULSUB_2W_4X   3,  7, 14811,  7005, [pd_8192], 4, 5 ; t18, t29
+    VP9_UNPACK_MULSUB_2W_4X   6,  2,  5520, 15426, [pd_8192], 4, 5 ; t19, t28
+%endif
+    SUMSUB_BA             w,  3,  6,  4
+    SUMSUB_BA             w,  7,  2,  4
+
+    VP9_UNPACK_MULSUB_2W_4X   2,  6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
+%endif
+
+    UNSCRATCH             5, 12, tmpq+15*%%str
+    SUMSUB_BA             w,  6,  0,  4
+    mova    [tmpq+25*%%str], m6             ; t19
+    UNSCRATCH             4, 13, tmpq+ 1*%%str
+    SUMSUB_BA             w,  7,  1,  6
+    SUMSUB_BA             w,  3,  4,  6
+    mova    [tmpq+23*%%str], m3             ; t16
+    SUMSUB_BA             w,  2,  5,  6
+
+    VP9_UNPACK_MULSUB_2W_4X   0,  5, 15137,  6270, [pd_8192], 6, 3 ; t18, t29
+    VP9_UNPACK_MULSUB_2W_4X   1,  4, 15137,  6270, [pd_8192], 6, 3 ; t19, t28
+
+    SCRATCH               0, 10, tmpq+ 1*%%str
+    SCRATCH               1, 11, tmpq+ 7*%%str
+    SCRATCH               2,  9, tmpq+ 9*%%str
+    SCRATCH               4, 14, tmpq+15*%%str
+    SCRATCH               5, 15, tmpq+17*%%str
+    SCRATCH               7, 13, tmpq+31*%%str
+
+%if %3 <= 8
+    mova                 m0, [%1+ 5*64]
+    mova                 m3, [%1+ 3*64]
+
+    pmulhrsw             m5,  m0, [pw_15893x2] ;t27
+    pmulhrsw             m0, [pw_3981x2] ;t20
+
+    VP9_UNPACK_MULSUB_2W_4X   1,  4,  5,  0,  9102, 13623, [pd_8192], 7,  2 ; t21, t26
+
+    pmulhrsw             m6,  m3, [pw_m2404x2] ;t23
+    pmulhrsw             m3, [pw_16207x2] ;t24
+
+    SCRATCH               5,  8, tmpq+ 5*%%str
+    SCRATCH               4, 12, tmpq+11*%%str
+
+    VP9_UNPACK_MULSUB_2W_4X   7,  2,  3,  6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
+%else
+    mova                 m4, [%1+ 5*64]
+    mova                 m5, [%1+11*64]
+%if %3 <= 16
+    pmulhrsw             m1, m4, [pw_15893x2]
+    pmulhrsw             m4, [pw_3981x2]
+    pmulhrsw             m0, m5, [pw_m8423x2]
+    pmulhrsw             m5, [pw_14053x2]
+%else
+    mova                 m0, [%1+21*64]
+    mova                 m1, [%1+27*64]
+
+    VP9_UNPACK_MULSUB_2W_4X   4,  1, 15893,  3981, [pd_8192], 2, 3 ; t20, t27
+    VP9_UNPACK_MULSUB_2W_4X   0,  5,  8423, 14053, [pd_8192], 2, 3 ; t21, t26
+%endif
+    SUMSUB_BA             w,  0,  4,  2
+    SUMSUB_BA             w,  5,  1,  2
+
+    VP9_UNPACK_MULSUB_2W_4X   1,  4,  9102, 13623, [pd_8192], 2, 3 ; t21, t26
+
+    SCRATCH               5,  8, tmpq+ 5*%%str
+    SCRATCH               4, 12, tmpq+11*%%str
+
+    mova                 m7, [%1+ 3*64]
+    mova                 m6, [%1+13*64]
+%if %3 <= 16
+    pmulhrsw             m3, m6, [pw_13160x2]
+    pmulhrsw             m6, [pw_9760x2]
+    pmulhrsw             m2, m7, [pw_m2404x2]
+    pmulhrsw             m7, [pw_16207x2]
+%else
+    mova                 m2, [%1+29*64]
+    mova                 m3, [%1+19*64]
+    VP9_UNPACK_MULSUB_2W_4X   6,  3, 13160,  9760, [pd_8192], 4, 5 ; t22, t25
+    VP9_UNPACK_MULSUB_2W_4X   2,  7,  2404, 16207, [pd_8192], 4, 5 ; t23, t24
+%endif
+    SUMSUB_BA             w,  6,  2,  4
+    SUMSUB_BA             w,  3,  7,  4
+
+    VP9_UNPACK_MULSUB_2W_4X   7,  2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
+%endif
+
+    ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
+    ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
+
+    UNSCRATCH             4, 12, tmpq+11*%%str
+    SUMSUB_BA             w,  0,  6, 5
+    SUMSUB_BA             w,  4,  2, 5
+    UNSCRATCH             5,  8, tmpq+ 5*%%str
+    SCRATCH               4,  8, tmpq+11*%%str
+    SUMSUB_BA             w,  1,  7, 4
+    SUMSUB_BA             w,  5,  3, 4
+    SCRATCH               5, 12, tmpq+ 5*%%str
+
+    VP9_UNPACK_MULSUB_2W_4X   3,  6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27
+    VP9_UNPACK_MULSUB_2W_4X   2,  7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26
+
+    ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
+    ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
+
+    UNSCRATCH             5,  9, tmpq+ 9*%%str
+    mova                 m4, [tmpq+23*%%str] ; t16
+%if ARCH_X86_64
+    SUMSUB_BA             w,  1,  5,  9
+    SUMSUB_BA             w,  0,  4,  9
+%else
+    SUMSUB_BADC           w,  1,  5,  0,  4
+%endif
+    mova    [tmpq+29*%%str], m1     ; t17
+    mova    [tmpq+21*%%str], m0     ; t16
+    UNSCRATCH             0, 10, tmpq+ 1*%%str
+    UNSCRATCH             1, 11, tmpq+ 7*%%str
+%if ARCH_X86_64
+    SUMSUB_BA             w,  2,  0,  9
+    SUMSUB_BA             w,  3,  1,  9
+%else
+    SUMSUB_BADC           w,  2,  0,  3,  1
+%endif
+    mova    [tmpq+ 9*%%str], m2     ; t18
+    mova    [tmpq+13*%%str], m3     ; t19
+    SCRATCH               0, 10, tmpq+23*%%str
+    SCRATCH               1, 11, tmpq+27*%%str
+
+    UNSCRATCH             2, 14, tmpq+15*%%str
+    UNSCRATCH             3, 15, tmpq+17*%%str
+    SUMSUB_BA             w,  6,  2, 0
+    SUMSUB_BA             w,  7,  3, 0
+    SCRATCH               6, 14, tmpq+ 3*%%str
+    SCRATCH               7, 15, tmpq+ 7*%%str
+
+    UNSCRATCH             0,  8, tmpq+11*%%str
+    mova                 m1, [tmpq+25*%%str] ; t19
+    UNSCRATCH             6, 12, tmpq+ 5*%%str
+    UNSCRATCH             7, 13, tmpq+31*%%str
+%if ARCH_X86_64
+    SUMSUB_BA             w,  0,  1,  9
+    SUMSUB_BA             w,  6,  7,  9
+%else
+    SUMSUB_BADC           w,  0,  1,  6,  7
+%endif
+
+    ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
+    ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+%if 0; cpuflag(ssse3)
+%if ARCH_X86_64
+    SUMSUB_BA             w,  4,  7,  8
+    SUMSUB_BA             w,  5,  1,  8
+%else
+    SUMSUB_BADC           w,  4,  7,  5,  1
+%endif
+
+    pmulhrsw             m7, [pw_11585x2]
+    pmulhrsw             m4, [pw_11585x2]
+    pmulhrsw             m1, [pw_11585x2]
+    pmulhrsw             m5, [pw_11585x2]
+
+    mova    [tmpq+ 5*%%str], m7     ; t23
+    SCRATCH               1, 13, tmpq+25*%%str
+    UNSCRATCH             7, 10, tmpq+23*%%str
+    UNSCRATCH             1, 11, tmpq+27*%%str
+
+%if ARCH_X86_64
+    SUMSUB_BA             w,  7,  3, 10
+    SUMSUB_BA             w,  1,  2, 10
+%else
+    SUMSUB_BADC           w,  7,  3,  1,  2
+%endif
+
+    pmulhrsw             m3, [pw_11585x2]
+    pmulhrsw             m7, [pw_11585x2]
+    pmulhrsw             m2, [pw_11585x2]
+    pmulhrsw             m1, [pw_11585x2]
+%else
+    SCRATCH               0,  8, tmpq+15*%%str
+    SCRATCH               6,  9, tmpq+17*%%str
+    VP9_UNPACK_MULSUB_2W_4X  7,  4, 11585, 11585, [pd_8192], 0, 6
+    mova    [tmpq+ 5*%%str], m7     ; t23
+    UNSCRATCH             7, 10, tmpq+23*%%str
+    VP9_UNPACK_MULSUB_2W_4X  1,  5, 11585, 11585, [pd_8192], 0, 6
+    SCRATCH               1, 13, tmpq+25*%%str
+    UNSCRATCH             1, 11, tmpq+27*%%str
+    VP9_UNPACK_MULSUB_2W_4X  3,  7, 11585, 11585, [pd_8192], 0, 6
+    VP9_UNPACK_MULSUB_2W_4X  2,  1, 11585, 11585, [pd_8192], 0, 6
+    UNSCRATCH             0,  8, tmpq+15*%%str
+    UNSCRATCH             6,  9, tmpq+17*%%str
+%endif
+
+    ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
+    ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+    ; then do final pass to sumsub+store the two halves
+%if %2 == 1
+    mova    [tmpq+17*%%str], m2     ; t20
+    mova    [tmpq+ 1*%%str], m3     ; t21
+%if ARCH_X86_64
+    mova    [tmpq+25*%%str], m13    ; t22
+
+    mova                 m8, [tmpq+ 0*%%str] ; t0
+    mova                 m9, [tmpq+ 4*%%str] ; t1
+    mova                m12, [tmpq+ 8*%%str] ; t2
+    mova                m11, [tmpq+12*%%str] ; t3
+    mova                 m2, [tmpq+16*%%str] ; t4
+    mova                 m3, [tmpq+20*%%str] ; t5
+    mova                m13, [tmpq+24*%%str] ; t6
+
+    SUMSUB_BA             w,  6,  8,  10
+    mova    [tmpq+ 3*%%str], m8              ; t15
+    mova                m10, [tmpq+28*%%str] ; t7
+    SUMSUB_BA             w,  0,  9,  8
+    SUMSUB_BA             w, 15, 12,  8
+    SUMSUB_BA             w, 14, 11,  8
+    SUMSUB_BA             w,  1,  2,  8
+    SUMSUB_BA             w,  7,  3,  8
+    SUMSUB_BA             w,  5, 13,  8
+    SUMSUB_BA             w,  4, 10,  8
+
+    TRANSPOSE8x8W         6, 0, 15, 14, 1, 7, 5, 4, 8
+    mova    [tmpq+ 0*%%str], m6
+    mova    [tmpq+ 4*%%str], m0
+    mova    [tmpq+ 8*%%str], m15
+    mova    [tmpq+12*%%str], m14
+    mova    [tmpq+16*%%str], m1
+    mova    [tmpq+20*%%str], m7
+    mova    [tmpq+24*%%str], m5
+    mova    [tmpq+28*%%str], m4
+
+    mova                  m8, [tmpq+ 3*%%str] ; t15
+    TRANSPOSE8x8W         10, 13, 3, 2, 11, 12, 9, 8, 0
+    mova    [tmpq+ 3*%%str], m10
+    mova    [tmpq+ 7*%%str], m13
+    mova    [tmpq+11*%%str], m3
+    mova    [tmpq+15*%%str], m2
+    mova    [tmpq+19*%%str], m11
+    mova    [tmpq+23*%%str], m12
+    mova    [tmpq+27*%%str], m9
+    mova    [tmpq+31*%%str], m8
+
+    mova                m15, [tmpq+30*%%str] ; t8
+    mova                m14, [tmpq+26*%%str] ; t9
+    mova                m13, [tmpq+22*%%str] ; t10
+    mova                m12, [tmpq+18*%%str] ; t11
+    mova                m11, [tmpq+14*%%str] ; t12
+    mova                m10, [tmpq+10*%%str] ; t13
+    mova                 m9, [tmpq+ 6*%%str] ; t14
+    mova                 m8, [tmpq+ 2*%%str] ; t15
+    mova                 m7, [tmpq+21*%%str] ; t16
+    mova                 m6, [tmpq+29*%%str] ; t17
+    mova                 m5, [tmpq+ 9*%%str] ; t18
+    mova                 m4, [tmpq+13*%%str] ; t19
+    mova                 m3, [tmpq+17*%%str] ; t20
+    mova                 m2, [tmpq+ 1*%%str] ; t21
+    mova                 m1, [tmpq+25*%%str] ; t22
+
+    SUMSUB_BA             w,  7,  8, 0
+    mova    [tmpq+ 2*%%str], m8
+    mova                 m0, [tmpq+ 5*%%str] ; t23
+    SUMSUB_BA             w,  6,  9, 8
+    SUMSUB_BA             w,  5, 10, 8
+    SUMSUB_BA             w,  4, 11, 8
+    SUMSUB_BA             w,  3, 12, 8
+    SUMSUB_BA             w,  2, 13, 8
+    SUMSUB_BA             w,  1, 14, 8
+    SUMSUB_BA             w,  0, 15, 8
+
+    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, 8
+    mova    [tmpq+ 1*%%str], m0
+    mova    [tmpq+ 5*%%str], m1
+    mova    [tmpq+ 9*%%str], m2
+    mova    [tmpq+13*%%str], m3
+    mova    [tmpq+17*%%str], m4
+    mova    [tmpq+21*%%str], m5
+    mova    [tmpq+25*%%str], m6
+    mova    [tmpq+29*%%str], m7
+
+    mova                 m8, [tmpq+ 2*%%str]
+    TRANSPOSE8x8W         8, 9, 10, 11, 12, 13, 14, 15, 0
+    mova    [tmpq+ 2*%%str], m8
+    mova    [tmpq+ 6*%%str], m9
+    mova    [tmpq+10*%%str], m10
+    mova    [tmpq+14*%%str], m11
+    mova    [tmpq+18*%%str], m12
+    mova    [tmpq+22*%%str], m13
+    mova    [tmpq+26*%%str], m14
+    mova    [tmpq+30*%%str], m15
+%else
+    mova                 m2, [tmpq+24*%%str] ; t6
+    mova                 m3, [tmpq+28*%%str] ; t7
+    SUMSUB_BADC           w,  5,  2,  4,  3
+    mova    [tmpq+24*%%str], m5
+    mova    [tmpq+23*%%str], m2
+    mova    [tmpq+28*%%str], m4
+    mova    [tmpq+19*%%str], m3
+
+    mova                 m2, [tmpq+16*%%str] ; t4
+    mova                 m3, [tmpq+20*%%str] ; t5
+    SUMSUB_BA             w,  1,  2,  5
+    SUMSUB_BA             w,  7,  3,  5
+    mova    [tmpq+15*%%str], m2
+    mova    [tmpq+11*%%str], m3
+
+    mova                 m2, [tmpq+ 0*%%str] ; t0
+    mova                 m3, [tmpq+ 4*%%str] ; t1
+    SUMSUB_BA             w,  6,  2,  5
+    SUMSUB_BA             w,  0,  3,  5
+    mova    [tmpq+31*%%str], m2
+    mova    [tmpq+27*%%str], m3
+
+    mova                 m2, [tmpq+ 8*%%str] ; t2
+    mova                 m3, [tmpq+12*%%str] ; t3
+    mova                 m5, [tmpq+ 7*%%str]
+    mova                 m4, [tmpq+ 3*%%str]
+    SUMSUB_BADC           w,  5,  2,  4,  3
+    mova    [tmpq+ 7*%%str], m2
+    mova    [tmpq+ 3*%%str], m3
+
+    mova                 m3, [tmpq+28*%%str]
+    TRANSPOSE8x8W         6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1
+    mova    [tmpq+ 0*%%str], m6
+    mova    [tmpq+ 4*%%str], m0
+    mova    [tmpq+ 8*%%str], m5
+    mova    [tmpq+12*%%str], m4
+    mova    [tmpq+20*%%str], m7
+    mova    [tmpq+24*%%str], m2
+    mova    [tmpq+28*%%str], m3
+
+    mova                 m6, [tmpq+19*%%str]
+    mova                 m0, [tmpq+23*%%str]
+    mova                 m5, [tmpq+11*%%str]
+    mova                 m4, [tmpq+15*%%str]
+    mova                 m1, [tmpq+ 3*%%str]
+    mova                 m7, [tmpq+ 7*%%str]
+    mova                 m3, [tmpq+31*%%str]
+    TRANSPOSE8x8W         6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1
+    mova    [tmpq+ 3*%%str], m6
+    mova    [tmpq+ 7*%%str], m0
+    mova    [tmpq+11*%%str], m5
+    mova    [tmpq+15*%%str], m4
+    mova    [tmpq+23*%%str], m7
+    mova    [tmpq+27*%%str], m2
+    mova    [tmpq+31*%%str], m3
+
+    mova                 m1, [tmpq+ 6*%%str] ; t14
+    mova                 m0, [tmpq+ 2*%%str] ; t15
+    mova                 m7, [tmpq+21*%%str] ; t16
+    mova                 m6, [tmpq+29*%%str] ; t17
+    SUMSUB_BA             w,  7,  0,  2
+    SUMSUB_BA             w,  6,  1,  2
+    mova    [tmpq+29*%%str], m7
+    mova    [tmpq+ 2*%%str], m0
+    mova    [tmpq+21*%%str], m6
+    mova    [tmpq+ 6*%%str], m1
+
+    mova                 m1, [tmpq+14*%%str] ; t12
+    mova                 m0, [tmpq+10*%%str] ; t13
+    mova                 m5, [tmpq+ 9*%%str] ; t18
+    mova                 m4, [tmpq+13*%%str] ; t19
+    SUMSUB_BA             w,  5,  0,  2
+    SUMSUB_BA             w,  4,  1,  2
+    mova     [tmpq+10*%%str], m0
+    mova     [tmpq+14*%%str], m1
+
+    mova                 m1, [tmpq+22*%%str] ; t10
+    mova                 m0, [tmpq+18*%%str] ; t11
+    mova                 m3, [tmpq+17*%%str] ; t20
+    mova                 m2, [tmpq+ 1*%%str] ; t21
+    SUMSUB_BA             w,  3,  0,  6
+    SUMSUB_BA             w,  2,  1,  6
+    mova     [tmpq+18*%%str], m0
+    mova     [tmpq+22*%%str], m1
+
+    mova                 m7, [tmpq+30*%%str] ; t8
+    mova                 m6, [tmpq+26*%%str] ; t9
+    mova                 m1, [tmpq+25*%%str] ; t22
+    mova                 m0, [tmpq+ 5*%%str] ; t23
+    SUMSUB_BADC           w,  1,  6,  0,  7
+    mova     [tmpq+26*%%str], m6
+    mova     [tmpq+30*%%str], m7
+
+    mova                 m7, [tmpq+29*%%str]
+    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1
+    mova    [tmpq+ 1*%%str], m0
+    mova    [tmpq+ 5*%%str], m1
+    mova    [tmpq+ 9*%%str], m2
+    mova    [tmpq+13*%%str], m3
+    mova    [tmpq+21*%%str], m5
+    mova    [tmpq+25*%%str], m6
+    mova    [tmpq+29*%%str], m7
+
+    mova                 m0, [tmpq+ 2*%%str]
+    mova                 m1, [tmpq+ 6*%%str]
+    mova                 m2, [tmpq+10*%%str]
+    mova                 m3, [tmpq+14*%%str]
+    mova                 m4, [tmpq+18*%%str]
+    mova                 m5, [tmpq+22*%%str]
+    mova                 m7, [tmpq+30*%%str]
+    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1
+    mova    [tmpq+ 2*%%str], m0
+    mova    [tmpq+ 6*%%str], m1
+    mova    [tmpq+10*%%str], m2
+    mova    [tmpq+14*%%str], m3
+    mova    [tmpq+22*%%str], m5
+    mova    [tmpq+26*%%str], m6
+    mova    [tmpq+30*%%str], m7
+%endif
+%else
+    ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
+    ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
+    ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
+    ; t20-22 is in m4-6
+    ; t24-31 is in m8-15
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
+    SUMSUB_BA            w, %4, %1, %5
+    SUMSUB_BA            w, %3, %2, %5
+    VP9_IDCT8_WRITEx2   %4, %3, %5, %6, %7, ROUND_REG, 6
+%if %8 == 1
+    add               dstq, stride2q
+%endif
+    VP9_IDCT8_WRITEx2   %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq
+%if %8 == 1
+    sub           dst_endq, stride2q
+%endif
+%endmacro
+
+%if ARCH_X86_64
+    pxor               m10, m10
+
+    ; store t0-1 and t30-31
+    mova                m8, [tmpq+ 0*%%str]
+    mova                m9, [tmpq+ 4*%%str]
+    %%STORE_2X2          8,  9,  0,  6, 12, 11, 10
+
+    ; store t2-3 and t28-29
+    mova                m8, [tmpq+ 8*%%str]
+    mova                m9, [tmpq+12*%%str]
+    %%STORE_2X2          8,  9, 14, 15, 12, 11, 10
+
+    ; store t4-5 and t26-27
+    mova                m8, [tmpq+16*%%str]
+    mova                m9, [tmpq+20*%%str]
+    %%STORE_2X2          8,  9,  7,  1, 12, 11, 10
+
+    ; store t6-7 and t24-25
+    mova                m8, [tmpq+24*%%str]
+    mova                m9, [tmpq+28*%%str]
+    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10
+
+    ; store t8-9 and t22-23
+    mova                m8, [tmpq+30*%%str]
+    mova                m9, [tmpq+26*%%str]
+    mova                m0, [tmpq+ 5*%%str]
+    %%STORE_2X2          8,  9, 13,  0, 12, 11, 10
+
+    ; store t10-11 and t20-21
+    mova                m8, [tmpq+22*%%str]
+    mova                m9, [tmpq+18*%%str]
+    %%STORE_2X2          8,  9,  2,  3, 12, 11, 10
+
+    ; store t12-13 and t18-19
+    mova                m8, [tmpq+14*%%str]
+    mova                m9, [tmpq+10*%%str]
+    mova                m5, [tmpq+13*%%str]
+    mova                m4, [tmpq+ 9*%%str]
+    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10
+
+    ; store t14-17
+    mova                m8, [tmpq+ 6*%%str]
+    mova                m9, [tmpq+ 2*%%str]
+    mova                m5, [tmpq+29*%%str]
+    mova                m4, [tmpq+21*%%str]
+    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10, 0
+
+    SWAP                 1, 10 ; zero
+%else
+    mova   [tmpq+ 1*%%str], m1
+    mova   [tmpq+11*%%str], m2
+    mova   [tmpq+15*%%str], m3
+    mova   [tmpq+17*%%str], m4
+    mova   [tmpq+19*%%str], m5
+    pxor                m1, m1
+
+    ; store t0-1 and t30-31
+    mova                m2, [tmpq+ 0*%%str]
+    mova                m3, [tmpq+ 4*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t2-3 and t28-29
+    mova                m2, [tmpq+ 8*%%str]
+    mova                m3, [tmpq+12*%%str]
+    mova                m0, [tmpq+ 3*%%str]
+    mova                m6, [tmpq+ 7*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t4-5 and t26-27
+    mova                m2, [tmpq+16*%%str]
+    mova                m3, [tmpq+20*%%str]
+    mova                m0, [tmpq+ 1*%%str]
+    %%STORE_2X2          2,  3,  7,  0, 4, 5, 1
+
+    ; store t6-7 and t24-25
+    mova                m2, [tmpq+24*%%str]
+    mova                m3, [tmpq+28*%%str]
+    mova                m0, [tmpq+17*%%str]
+    mova                m6, [tmpq+19*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t8-9 and t22-23
+    mova                m2, [tmpq+30*%%str]
+    mova                m3, [tmpq+26*%%str]
+    mova                m0, [tmpq+25*%%str]
+    mova                m6, [tmpq+ 5*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t10-11 and t20-21
+    mova                m2, [tmpq+22*%%str]
+    mova                m3, [tmpq+18*%%str]
+    mova                m0, [tmpq+11*%%str]
+    mova                m6, [tmpq+15*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t12-13 and t18-19
+    mova                m2, [tmpq+14*%%str]
+    mova                m3, [tmpq+10*%%str]
+    mova                m6, [tmpq+13*%%str]
+    mova                m0, [tmpq+ 9*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t14-17
+    mova                m2, [tmpq+ 6*%%str]
+    mova                m3, [tmpq+ 2*%%str]
+    mova                m6, [tmpq+29*%%str]
+    mova                m0, [tmpq+21*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1, 0
+%endif
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
+    movifnidn         eobd, dword eobm
+%if cpuflag(ssse3)
+    cmp eobd, 135
+    jg .idctfull
+    cmp eobd, 34
+    jg .idct16x16
+    cmp eobd, 1
+    jg .idct8x8
+%else
+    cmp eobd, 1
+    jg .idctfull
+%endif
+
+    ; dc-only case
+    movifnidn       blockq, blockmp
+    movifnidn         dstq, dstmp
+    movifnidn      strideq, stridemp
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    mova                m1, [pw_11585x2]
+    pmulhrsw            m0, m1
+    pmulhrsw            m0, m1
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx            coefd, word [blockq]
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, (32 << 14) + 8192
+    sar              coefd, 14 + 6
+    movd                m0, coefd
+%endif
+    SPLATW              m0, m0, q0000
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_512]
+%endif
+    pxor                m5, m5
+    movd          [blockq], m5
+%rep 31
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
+    add               dstq, strideq
+%endrep
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
+    RET
+
+%if ARCH_X86_64
+    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+%else
+%define dst_bakq r0mp
+%endif
+%if cpuflag(ssse3)
+.idct8x8:
+%if ARCH_X86_32
+    DEFINE_ARGS block, u1, u2, u3, u4, tmp
+    mov             blockq, r2mp
+%endif
+    mov               tmpq, rsp
+    VP9_IDCT32_1D   blockq, 1, 8
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov            strideq, r1mp
+%define cntd dword r3m
+%endif
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    sub          stride30q, stride2q        ; stride*30
+.loop2_8x8:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2, 8
+    add           dst_bakq, 8
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_8x8
+
+    ; at the end of the loop, m7 should still be zero
+    ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov             blockq, r2mp
+%endif
+    ZERO_BLOCK      blockq, 64,  8, m1
+    RET
+
+.idct16x16:
+%if ARCH_X86_32
+    DEFINE_ARGS block, tmp, cnt
+    mov             blockq, r2mp
+%endif
+    mov               cntd, 2
+    mov               tmpq, rsp
+.loop1_16x16:
+    VP9_IDCT32_1D   blockq, 1, 16
+    add             blockq, 16
+    add               tmpq, 512
+    dec               cntd
+    jg .loop1_16x16
+
+%if ARCH_X86_64
+    sub             blockq, 32
+%else
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov            strideq, r1mp
+%define cntd dword r3m
+%endif
+
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    mov               tmpq, rsp
+    sub          stride30q, stride2q        ; stride*30
+.loop2_16x16:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2, 16
+    add           dst_bakq, 8
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_16x16
+
+    ; at the end of the loop, m7 should still be zero
+    ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov             blockq, r2mp
+%endif
+    ZERO_BLOCK      blockq, 64, 16, m1
+    RET
+%endif
+
+.idctfull:
+%if ARCH_X86_32
+    DEFINE_ARGS block, tmp, cnt
+    mov             blockq, r2mp
+%endif
+    mov               cntd, 4
+    mov               tmpq, rsp
+.loop1_full:
+    VP9_IDCT32_1D   blockq, 1
+    add             blockq, 16
+    add               tmpq, 512
+    dec               cntd
+    jg .loop1_full
+
+%if ARCH_X86_64
+    sub             blockq, 64
+%else
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov            strideq, r1mp
+%define cntd dword r3m
+%endif
+
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    mov               tmpq, rsp
+    sub          stride30q, stride2q        ; stride*30
+.loop2_full:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2
+    add           dst_bakq, 8
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m7 should still be zero
+    ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov             blockq, r2mp
+%endif
+    ZERO_BLOCK      blockq, 64, 32, m1
+    RET
+%endmacro
+
+VP9_IDCT_IDCT_32x32_ADD_XMM sse2
+VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
+VP9_IDCT_IDCT_32x32_ADD_XMM avx
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
new file mode 100644
index 0000000000..2c4fe214da
--- /dev/null
+++ b/libavcodec/x86/vp9lpf.asm
@@ -0,0 +1,1139 @@
+;******************************************************************************
+;* VP9 loop filter SIMD optimizations
+;*
+;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me>
+;* Copyright (C) 2014 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_3
+cextern pb_80
+
+pb_4:   times 16 db 0x04
+pb_10:  times 16 db 0x10
+pb_40:  times 16 db 0x40
+pb_81:  times 16 db 0x81
+pb_f8:  times 16 db 0xf8
+pb_fe:  times 16 db 0xfe
+pb_ff:  times 16 db 0xff
+
+cextern pw_4
+cextern pw_8
+
+; with mix functions, two 8-bit thresholds are stored in a 16-bit storage,
+; the following mask is used to splat both in the same register
+mask_mix: times 8 db 0
+          times 8 db 1
+
+mask_mix84: times 8 db 0xff
+            times 8 db 0x00
+mask_mix48: times 8 db 0x00
+            times 8 db 0xff
+
+SECTION .text
+
+%macro SCRATCH 3
+%if ARCH_X86_64
+    SWAP                %1, %2
+%else
+    mova              [%3], m%1
+%endif
+%endmacro
+
+%macro UNSCRATCH 3
+%if ARCH_X86_64
+    SWAP                %1, %2
+%else
+    mova               m%1, [%3]
+%endif
+%endmacro
+
+; %1 = abs(%2-%3)
+%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp
+%if ARCH_X86_64
+    psubusb             %1, %3, %2
+    psubusb             %4, %2, %3
+%else
+    mova                %1, %3
+    mova                %4, %2
+    psubusb             %1, %2
+    psubusb             %4, %3
+%endif
+    por                 %1, %4
+%endmacro
+
+; %1 = %1>%2
+%macro CMP_GT 2-3 ; src/dst, cmp, pb_80
+%if %0 == 3
+    pxor                %1, %3
+%endif
+    pcmpgtb             %1, %2
+%endmacro
+
+; %1 = abs(%2-%3) > %4
+%macro ABSSUB_GT 5-6 [pb_80]; dst, src1, src2, cmp, tmp, [pb_80]
+    ABSSUB              %1, %2, %3, %5      ; dst = abs(src1-src2)
+    CMP_GT              %1, %4, %6          ; dst > cmp
+%endmacro
+
+%macro MASK_APPLY 4 ; %1=new_data/dst %2=old_data %3=mask %4=tmp
+    pand                %1, %3              ; new &= mask
+    pandn               %4, %3, %2          ; tmp = ~mask & old
+    por                 %1, %4              ; new&mask | old&~mask
+%endmacro
+
+%macro UNPACK 4
+%if ARCH_X86_64
+    punpck%1bw          %2, %3, %4
+%else
+    mova                %2, %3
+    punpck%1bw          %2, %4
+%endif
+%endmacro
+
+%macro FILTER_SUBx2_ADDx2 11 ; %1=dst %2=h/l %3=cache %4=stack_off %5=sub1 %6=sub2 %7=add1
+                             ; %8=add2 %9=rshift, [unpack], [unpack_is_mem_on_x86_32]
+    psubw               %3, [rsp+%4+%5*32]
+    psubw               %3, [rsp+%4+%6*32]
+    paddw               %3, [rsp+%4+%7*32]
+%ifnidn %10, ""
+%if %11 == 0
+    punpck%2bw          %1, %10, m0
+%else
+    UNPACK          %2, %1, %10, m0
+%endif
+    mova    [rsp+%4+%8*32], %1
+    paddw               %3, %1
+%else
+    paddw               %3, [rsp+%4+%8*32]
+%endif
+    psraw               %1, %3, %9
+%endmacro
+
+; FIXME interleave l/h better (for instruction pairing)
+%macro FILTER_INIT 9 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, filterid, mask, source
+    FILTER%7_INIT       %1, l, %3, %6 +  0
+    FILTER%7_INIT       %2, h, %4, %6 + 16
+    packuswb            %1, %2
+    MASK_APPLY          %1, %9, %8, %2
+    mova                %5, %1
+%endmacro
+
+
+%macro FILTER_UPDATE 12-16 "", "", "", 0 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, -, -, +, +, rshift,
+                                         ; mask, [source], [unpack + src], [unpack_is_mem_on_x86_32]
+; FIXME interleave this properly with the subx2/addx2
+%ifnidn %15, ""
+%if %16 == 0 || ARCH_X86_64
+    mova               %14, %15
+%endif
+%endif
+    FILTER_SUBx2_ADDx2  %1, l, %3, %6 +  0, %7, %8, %9, %10, %11, %14, %16
+    FILTER_SUBx2_ADDx2  %2, h, %4, %6 + 16, %7, %8, %9, %10, %11, %14, %16
+    packuswb            %1, %2
+%ifnidn %13, ""
+    MASK_APPLY          %1, %13, %12, %2
+%else
+    MASK_APPLY          %1, %5, %12, %2
+%endif
+    mova                %5, %1
+%endmacro
+
+%macro SRSHIFT3B_2X 4 ; reg1, reg2, [pb_10], tmp
+    mova                %4, [pb_f8]
+    pand                %1, %4
+    pand                %2, %4
+    psrlq               %1, 3
+    psrlq               %2, 3
+    pxor                %1, %3
+    pxor                %2, %3
+    psubb               %1, %3
+    psubb               %2, %3
+%endmacro
+
+%macro EXTRACT_POS_NEG 3 ; i8, neg, pos
+    pxor                %3, %3
+    pxor                %2, %2
+    pcmpgtb             %3, %1                          ; i8 < 0 mask
+    psubb               %2, %1                          ; neg values (only the originally - will be kept)
+    pand                %2, %3                          ; negative values of i8 (but stored as +)
+    pandn               %3, %1                          ; positive values of i8
+%endmacro
+
+; clip_u8(u8 + i8)
+%macro SIGN_ADD 4 ; dst, u8, i8, tmp1
+    EXTRACT_POS_NEG     %3, %4, %1
+    paddusb             %1, %2                          ; add the positives
+    psubusb             %1, %4                          ; sub the negatives
+%endmacro
+
+; clip_u8(u8 - i8)
+%macro SIGN_SUB 4 ; dst, u8, i8, tmp1
+    EXTRACT_POS_NEG     %3, %1, %4
+    paddusb             %1, %2                          ; add the negatives
+    psubusb             %1, %4                          ; sub the positives
+%endmacro
+
+%macro FILTER6_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
+    UNPACK          %2, %1, rp3, m0                     ; p3: B->W
+    mova     [rsp+%4+0*32], %1
+    paddw               %3, %1, %1                      ; p3*2
+    paddw               %3, %1                          ; p3*3
+    punpck%2bw          %1, m1,  m0                     ; p2: B->W
+    mova     [rsp+%4+1*32], %1
+    paddw               %3, %1                          ; p3*3 + p2
+    paddw               %3, %1                          ; p3*3 + p2*2
+    UNPACK          %2, %1, rp1, m0                     ; p1: B->W
+    mova     [rsp+%4+2*32], %1
+    paddw               %3, %1                          ; p3*3 + p2*2 + p1
+    UNPACK          %2, %1, rp0, m0                     ; p0: B->W
+    mova     [rsp+%4+3*32], %1
+    paddw               %3, %1                          ; p3*3 + p2*2 + p1 + p0
+    UNPACK          %2, %1, rq0, m0                     ; q0: B->W
+    mova     [rsp+%4+4*32], %1
+    paddw               %3, %1                          ; p3*3 + p2*2 + p1 + p0 + q0
+    paddw               %3, [pw_4]                      ; p3*3 + p2*2 + p1 + p0 + q0 + 4
+    psraw               %1, %3, 3                       ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
+%endmacro
+
+%macro FILTER14_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
+    punpck%2bw          %1, m2, m0                      ; p7: B->W
+    mova    [rsp+%4+ 8*32], %1
+    psllw               %3, %1, 3                       ; p7*8
+    psubw               %3, %1                          ; p7*7
+    punpck%2bw          %1, m3, m0                      ; p6: B->W
+    mova    [rsp+%4+ 9*32], %1
+    paddw               %3, %1                          ; p7*7 + p6
+    paddw               %3, %1                          ; p7*7 + p6*2
+    UNPACK          %2, %1, rp5, m0                     ; p5: B->W
+    mova    [rsp+%4+10*32], %1
+    paddw               %3, %1                          ; p7*7 + p6*2 + p5
+    UNPACK          %2, %1, rp4, m0                     ; p4: B->W
+    mova    [rsp+%4+11*32], %1
+    paddw               %3, %1                          ; p7*7 + p6*2 + p5 + p4
+    paddw               %3, [rsp+%4+ 0*32]              ; p7*7 + p6*2 + p5 + p4 + p3
+    paddw               %3, [rsp+%4+ 1*32]              ; p7*7 + p6*2 + p5 + .. + p2
+    paddw               %3, [rsp+%4+ 2*32]              ; p7*7 + p6*2 + p5 + .. + p1
+    paddw               %3, [rsp+%4+ 3*32]              ; p7*7 + p6*2 + p5 + .. + p0
+    paddw               %3, [rsp+%4+ 4*32]              ; p7*7 + p6*2 + p5 + .. + p0 + q0
+    paddw               %3, [pw_8]                      ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8
+    psraw               %1, %3, 4                       ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
+%endmacro
+
+%macro TRANSPOSE16x16B 17
+    mova %17, m%16
+    SBUTTERFLY bw,  %1,  %2,  %16
+    SBUTTERFLY bw,  %3,  %4,  %16
+    SBUTTERFLY bw,  %5,  %6,  %16
+    SBUTTERFLY bw,  %7,  %8,  %16
+    SBUTTERFLY bw,  %9,  %10, %16
+    SBUTTERFLY bw,  %11, %12, %16
+    SBUTTERFLY bw,  %13, %14, %16
+    mova m%16,  %17
+    mova  %17, m%14
+    SBUTTERFLY bw,  %15, %16, %14
+    SBUTTERFLY wd,  %1,  %3,  %14
+    SBUTTERFLY wd,  %2,  %4,  %14
+    SBUTTERFLY wd,  %5,  %7,  %14
+    SBUTTERFLY wd,  %6,  %8,  %14
+    SBUTTERFLY wd,  %9,  %11, %14
+    SBUTTERFLY wd,  %10, %12, %14
+    SBUTTERFLY wd,  %13, %15, %14
+    mova m%14,  %17
+    mova  %17, m%12
+    SBUTTERFLY wd,  %14, %16, %12
+    SBUTTERFLY dq,  %1,  %5,  %12
+    SBUTTERFLY dq,  %2,  %6,  %12
+    SBUTTERFLY dq,  %3,  %7,  %12
+    SBUTTERFLY dq,  %4,  %8,  %12
+    SBUTTERFLY dq,  %9,  %13, %12
+    SBUTTERFLY dq,  %10, %14, %12
+    SBUTTERFLY dq,  %11, %15, %12
+    mova m%12, %17
+    mova  %17, m%8
+    SBUTTERFLY dq,  %12, %16, %8
+    SBUTTERFLY qdq, %1,  %9,  %8
+    SBUTTERFLY qdq, %2,  %10, %8
+    SBUTTERFLY qdq, %3,  %11, %8
+    SBUTTERFLY qdq, %4,  %12, %8
+    SBUTTERFLY qdq, %5,  %13, %8
+    SBUTTERFLY qdq, %6,  %14, %8
+    SBUTTERFLY qdq, %7,  %15, %8
+    mova m%8, %17
+    mova %17, m%1
+    SBUTTERFLY qdq, %8,  %16, %1
+    mova m%1, %17
+    SWAP %2,  %9
+    SWAP %3,  %5
+    SWAP %4,  %13
+    SWAP %6,  %11
+    SWAP %8,  %15
+    SWAP %12, %14
+%endmacro
+
+%macro TRANSPOSE8x8B 13
+    SBUTTERFLY bw,  %1, %2, %7
+    movdq%10 m%7, %9
+    movdqa %11, m%2
+    SBUTTERFLY bw,  %3, %4, %2
+    SBUTTERFLY bw,  %5, %6, %2
+    SBUTTERFLY bw,  %7, %8, %2
+    SBUTTERFLY wd,  %1, %3, %2
+    movdqa m%2, %11
+    movdqa %11, m%3
+    SBUTTERFLY wd,  %2, %4, %3
+    SBUTTERFLY wd,  %5, %7, %3
+    SBUTTERFLY wd,  %6, %8, %3
+    SBUTTERFLY dq, %1, %5, %3
+    SBUTTERFLY dq, %2, %6, %3
+    movdqa m%3, %11
+    movh   %12, m%2
+    movhps %13, m%2
+    SBUTTERFLY dq, %3, %7, %2
+    SBUTTERFLY dq, %4, %8, %2
+    SWAP %2, %5
+    SWAP %4, %7
+%endmacro
+
+%macro DEFINE_REAL_P7_TO_Q7 0-1 0
+%define P7 dstq  + 4*mstrideq  + %1
+%define P6 dstq  +   mstride3q + %1
+%define P5 dstq  + 2*mstrideq  + %1
+%define P4 dstq  +   mstrideq  + %1
+%define P3 dstq                + %1
+%define P2 dstq  +    strideq  + %1
+%define P1 dstq  + 2* strideq  + %1
+%define P0 dstq  +    stride3q + %1
+%define Q0 dstq  + 4* strideq  + %1
+%define Q1 dst2q +   mstride3q + %1
+%define Q2 dst2q + 2*mstrideq  + %1
+%define Q3 dst2q +   mstrideq  + %1
+%define Q4 dst2q               + %1
+%define Q5 dst2q +    strideq  + %1
+%define Q6 dst2q + 2* strideq  + %1
+%define Q7 dst2q +    stride3q + %1
+%endmacro
+
+%macro DEFINE_TRANSPOSED_P7_TO_Q7 0-1 0
+%define P3 rsp +   0 + %1
+%define P2 rsp +  16 + %1
+%define P1 rsp +  32 + %1
+%define P0 rsp +  48 + %1
+%define Q0 rsp +  64 + %1
+%define Q1 rsp +  80 + %1
+%define Q2 rsp +  96 + %1
+%define Q3 rsp + 112 + %1
+%define P7 rsp + 128 + %1
+%define P6 rsp + 144 + %1
+%define P5 rsp + 160 + %1
+%define P4 rsp + 176 + %1
+%define Q4 rsp + 192 + %1
+%define Q5 rsp + 208 + %1
+%define Q6 rsp + 224 + %1
+%define Q7 rsp + 240 + %1
+%endmacro
+
+; ..............AB -> AAAAAAAABBBBBBBB
+%macro SPLATB_MIX 1-2 [mask_mix]
+%if cpuflag(ssse3)
+    pshufb     %1, %2
+%else
+    punpcklbw  %1, %1
+    punpcklwd  %1, %1
+    punpckldq  %1, %1
+%endif
+%endmacro
+
+%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=32bit stack only
+%if UNIX64
+cglobal vp9_loop_filter_%1_%2_16, 5, 9, 16, %3 + %4, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
+%else
+%if WIN64
+cglobal vp9_loop_filter_%1_%2_16, 4, 8, 16, %3 + %4, dst, stride, E, I, mstride, dst2, stride3, mstride3
+%else
+cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride, dst2, stride3, mstride3
+%define Ed dword r2m
+%define Id dword r3m
+%endif
+%define Hd dword r4m
+%endif
+
+    mov               mstrideq, strideq
+    neg               mstrideq
+
+    lea               stride3q, [strideq*3]
+    lea              mstride3q, [mstrideq*3]
+
+%ifidn %1, h
+%if %2 > 16
+%define movx movh
+    lea                   dstq, [dstq + 4*strideq - 4]
+%else
+%define movx movu
+    lea                   dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos)
+%endif
+    lea                  dst2q, [dstq + 8*strideq]
+%else
+    lea                   dstq, [dstq + 4*mstrideq]
+    lea                  dst2q, [dstq + 8*strideq]
+%endif
+
+    DEFINE_REAL_P7_TO_Q7
+
+%ifidn %1, h
+    movx                    m0, [P7]
+    movx                    m1, [P6]
+    movx                    m2, [P5]
+    movx                    m3, [P4]
+    movx                    m4, [P3]
+    movx                    m5, [P2]
+%if ARCH_X86_64 || %2 != 16
+    movx                    m6, [P1]
+%endif
+    movx                    m7, [P0]
+%if ARCH_X86_64
+    movx                    m8, [Q0]
+    movx                    m9, [Q1]
+    movx                   m10, [Q2]
+    movx                   m11, [Q3]
+    movx                   m12, [Q4]
+    movx                   m13, [Q5]
+    movx                   m14, [Q6]
+    movx                   m15, [Q7]
+    DEFINE_TRANSPOSED_P7_TO_Q7
+%if %2 == 16
+    TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
+    mova           [P7],  m0
+    mova           [P6],  m1
+    mova           [P5],  m2
+    mova           [P4],  m3
+%else ; %2 == 44/48/84/88
+    ; 8x16 transpose
+    punpcklbw        m0,  m1
+    punpcklbw        m2,  m3
+    punpcklbw        m4,  m5
+    punpcklbw        m6,  m7
+    punpcklbw        m8,  m9
+    punpcklbw       m10, m11
+    punpcklbw       m12, m13
+    punpcklbw       m14, m15
+    TRANSPOSE8x8W     0, 2, 4, 6, 8, 10, 12, 14, 15
+    SWAP              0,  4
+    SWAP              2,  5
+    SWAP              0,  6
+    SWAP              0,  7
+    SWAP             10,  9
+    SWAP             12, 10
+    SWAP             14, 11
+%endif ; %2
+    mova           [P3],  m4
+    mova           [P2],  m5
+    mova           [P1],  m6
+    mova           [P0],  m7
+    mova           [Q0],  m8
+    mova           [Q1],  m9
+    mova           [Q2], m10
+    mova           [Q3], m11
+%if %2 == 16
+    mova           [Q4], m12
+    mova           [Q5], m13
+    mova           [Q6], m14
+    mova           [Q7], m15
+%endif ; %2
+%else ; x86-32
+%if %2 == 16
+    TRANSPOSE8x8B    0, 1, 2, 3, 4, 5, 6, 7, [P1], u, [rsp+%3+%4], [rsp+64], [rsp+80]
+    DEFINE_TRANSPOSED_P7_TO_Q7
+    movh          [P7], m0
+    movh          [P5], m1
+    movh          [P3], m2
+    movh          [P1], m3
+    movh          [Q2], m5
+    movh          [Q4], m6
+    movh          [Q6], m7
+    movhps        [P6], m0
+    movhps        [P4], m1
+    movhps        [P2], m2
+    movhps        [P0], m3
+    movhps        [Q3], m5
+    movhps        [Q5], m6
+    movhps        [Q7], m7
+    DEFINE_REAL_P7_TO_Q7
+    movx                    m0, [Q0]
+    movx                    m1, [Q1]
+    movx                    m2, [Q2]
+    movx                    m3, [Q3]
+    movx                    m4, [Q4]
+    movx                    m5, [Q5]
+    movx                    m7, [Q7]
+    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [Q6], u, [rsp+%3+%4], [rsp+72], [rsp+88]
+    DEFINE_TRANSPOSED_P7_TO_Q7 8
+    movh          [P7], m0
+    movh          [P5], m1
+    movh          [P3], m2
+    movh          [P1], m3
+    movh          [Q2], m5
+    movh          [Q4], m6
+    movh          [Q6], m7
+    movhps        [P6], m0
+    movhps        [P4], m1
+    movhps        [P2], m2
+    movhps        [P0], m3
+    movhps        [Q3], m5
+    movhps        [Q5], m6
+    movhps        [Q7], m7
+    DEFINE_TRANSPOSED_P7_TO_Q7
+%else ; %2 == 44/48/84/88
+    punpcklbw        m0, m1
+    punpcklbw        m2, m3
+    punpcklbw        m4, m5
+    punpcklbw        m6, m7
+    movx             m1, [Q0]
+    movx             m3, [Q1]
+    movx             m5, [Q2]
+    movx             m7, [Q3]
+    punpcklbw        m1, m3
+    punpcklbw        m5, m7
+    movx             m3, [Q4]
+    movx             m7, [Q5]
+    punpcklbw        m3, m7
+    mova          [rsp], m3
+    movx             m3, [Q6]
+    movx             m7, [Q7]
+    punpcklbw        m3, m7
+    DEFINE_TRANSPOSED_P7_TO_Q7
+    TRANSPOSE8x8W     0, 2, 4, 6, 1, 5, 7, 3, [rsp], [Q0], 1
+    mova           [P3],  m0
+    mova           [P2],  m2
+    mova           [P1],  m4
+    mova           [P0],  m6
+    mova           [Q1],  m5
+    mova           [Q2],  m7
+    mova           [Q3],  m3
+%endif ; %2
+%endif ; x86-32/64
+%endif ; %1 == h
+
+    ; calc fm mask
+%if %2 == 16
+%if cpuflag(ssse3)
+    pxor                m0, m0
+%endif
+    SPLATB_REG          m2, I, m0                       ; I I I I ...
+    SPLATB_REG          m3, E, m0                       ; E E E E ...
+%else
+%if cpuflag(ssse3)
+    mova                m0, [mask_mix]
+%endif
+    movd                m2, Id
+    movd                m3, Ed
+    SPLATB_MIX          m2, m0
+    SPLATB_MIX          m3, m0
+%endif
+    mova                m0, [pb_80]
+    pxor                m2, m0
+    pxor                m3, m0
+%if ARCH_X86_64
+%ifidn %1, v
+    mova                m8, [P3]
+    mova                m9, [P2]
+    mova               m10, [P1]
+    mova               m11, [P0]
+    mova               m12, [Q0]
+    mova               m13, [Q1]
+    mova               m14, [Q2]
+    mova               m15, [Q3]
+%else
+    ; In case of horizontal, P3..Q3 are already present in some registers due
+    ; to the previous transpose, so we just swap registers.
+    SWAP                 8,  4, 12
+    SWAP                 9,  5, 13
+    SWAP                10,  6, 14
+    SWAP                11,  7, 15
+%endif
+%define rp3 m8
+%define rp2 m9
+%define rp1 m10
+%define rp0 m11
+%define rq0 m12
+%define rq1 m13
+%define rq2 m14
+%define rq3 m15
+%else
+%define rp3 [P3]
+%define rp2 [P2]
+%define rp1 [P1]
+%define rp0 [P0]
+%define rq0 [Q0]
+%define rq1 [Q1]
+%define rq2 [Q2]
+%define rq3 [Q3]
+%endif
+    ABSSUB_GT           m5, rp3, rp2, m2, m7, m0        ; m5 = abs(p3-p2) <= I
+    ABSSUB_GT           m1, rp2, rp1, m2, m7, m0        ; m1 = abs(p2-p1) <= I
+    por                 m5, m1
+    ABSSUB_GT           m1, rp1, rp0, m2, m7, m0        ; m1 = abs(p1-p0) <= I
+    por                 m5, m1
+    ABSSUB_GT           m1, rq0, rq1, m2, m7, m0        ; m1 = abs(q1-q0) <= I
+    por                 m5, m1
+    ABSSUB_GT           m1, rq1, rq2, m2, m7, m0        ; m1 = abs(q2-q1) <= I
+    por                 m5, m1
+    ABSSUB_GT           m1, rq2, rq3, m2, m7, m0        ; m1 = abs(q3-q2) <= I
+    por                 m5, m1
+    ABSSUB              m1, rp0, rq0, m7                ; abs(p0-q0)
+    paddusb             m1, m1                          ; abs(p0-q0) * 2
+    ABSSUB              m2, rp1, rq1, m7                ; abs(p1-q1)
+    pand                m2, [pb_fe]                     ; drop lsb so shift can work
+    psrlq               m2, 1                           ; abs(p1-q1)/2
+    paddusb             m1, m2                          ; abs(p0-q0)*2 + abs(p1-q1)/2
+    pxor                m1, m0
+    pcmpgtb             m1, m3
+    por                 m1, m5                          ; fm final value
+    SWAP                 1, 3
+    pxor                m3, [pb_ff]
+
+    ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
+    ; calc flat8in (if not 44_16) and hev masks
+%if %2 != 44
+    mova                m6, [pb_81]                     ; [1 1 1 1 ...] ^ 0x80
+    ABSSUB_GT           m2, rp3, rp0, m6, m5            ; abs(p3 - p0) <= 1
+%if ARCH_X86_64
+    mova                m8, [pb_80]
+%define rb80 m8
+%else
+%define rb80 [pb_80]
+%endif
+    ABSSUB_GT           m1, rp2, rp0, m6, m5, rb80      ; abs(p2 - p0) <= 1
+    por                 m2, m1
+    ABSSUB              m4, rp1, rp0, m5                ; abs(p1 - p0)
+%if %2 == 16
+%if cpuflag(ssse3)
+    pxor                m0, m0
+%endif
+    SPLATB_REG          m7, H, m0                       ; H H H H ...
+%else
+    movd                m7, Hd
+    SPLATB_MIX          m7
+%endif
+    pxor                m7, rb80
+    pxor                m4, rb80
+    pcmpgtb             m0, m4, m7                      ; abs(p1 - p0) > H (1/2 hev condition)
+    CMP_GT              m4, m6                          ; abs(p1 - p0) <= 1
+    por                 m2, m4                          ; (flat8in)
+    ABSSUB              m4, rq1, rq0, m1                ; abs(q1 - q0)
+    pxor                m4, rb80
+    pcmpgtb             m5, m4, m7                      ; abs(q1 - q0) > H (2/2 hev condition)
+    por                 m0, m5                          ; hev final value
+    CMP_GT              m4, m6                          ; abs(q1 - q0) <= 1
+    por                 m2, m4                          ; (flat8in)
+    ABSSUB_GT           m1, rq2, rq0, m6, m5, rb80      ; abs(q2 - q0) <= 1
+    por                 m2, m1
+    ABSSUB_GT           m1, rq3, rq0, m6, m5, rb80      ; abs(q3 - q0) <= 1
+    por                 m2, m1                          ; flat8in final value
+    pxor                m2, [pb_ff]
+%if %2 == 84 || %2 == 48
+    pand                m2, [mask_mix%2]
+%endif
+%else
+    mova                m6, [pb_80]
+    movd                m7, Hd
+    SPLATB_MIX          m7
+    pxor                m7, m6
+    ABSSUB              m4, rp1, rp0, m1                ; abs(p1 - p0)
+    pxor                m4, m6
+    pcmpgtb             m0, m4, m7                      ; abs(p1 - p0) > H (1/2 hev condition)
+    ABSSUB              m4, rq1, rq0, m1                ; abs(q1 - q0)
+    pxor                m4, m6
+    pcmpgtb             m5, m4, m7                      ; abs(q1 - q0) > H (2/2 hev condition)
+    por                 m0, m5                          ; hev final value
+%endif
+
+%if %2 == 16
+    ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
+    ; calc flat8out mask
+%if ARCH_X86_64
+    mova                m8, [P7]
+    mova                m9, [P6]
+%define rp7 m8
+%define rp6 m9
+%else
+%define rp7 [P7]
+%define rp6 [P6]
+%endif
+    ABSSUB_GT           m1, rp7, rp0, m6, m5            ; abs(p7 - p0) <= 1
+    ABSSUB_GT           m7, rp6, rp0, m6, m5            ; abs(p6 - p0) <= 1
+    por                 m1, m7
+%if ARCH_X86_64
+    mova                m8, [P5]
+    mova                m9, [P4]
+%define rp5 m8
+%define rp4 m9
+%else
+%define rp5 [P5]
+%define rp4 [P4]
+%endif
+    ABSSUB_GT           m7, rp5, rp0, m6, m5            ; abs(p5 - p0) <= 1
+    por                 m1, m7
+    ABSSUB_GT           m7, rp4, rp0, m6, m5            ; abs(p4 - p0) <= 1
+    por                 m1, m7
+%if ARCH_X86_64
+    mova                m14, [Q4]
+    mova                m15, [Q5]
+%define rq4 m14
+%define rq5 m15
+%else
+%define rq4 [Q4]
+%define rq5 [Q5]
+%endif
+    ABSSUB_GT           m7, rq4, rq0, m6, m5            ; abs(q4 - q0) <= 1
+    por                 m1, m7
+    ABSSUB_GT           m7, rq5, rq0, m6, m5            ; abs(q5 - q0) <= 1
+    por                 m1, m7
+%if ARCH_X86_64
+    mova                m14, [Q6]
+    mova                m15, [Q7]
+%define rq6 m14
+%define rq7 m15
+%else
+%define rq6 [Q6]
+%define rq7 [Q7]
+%endif
+    ABSSUB_GT           m7, rq6, rq0, m6, m5            ; abs(q4 - q0) <= 1
+    por                 m1, m7
+    ABSSUB_GT           m7, rq7, rq0, m6, m5            ; abs(q5 - q0) <= 1
+    por                 m1, m7                          ; flat8out final value
+    pxor                m1, [pb_ff]
+%endif
+
+    ; if (fm) {
+    ;     if (out && in) filter_14()
+    ;     else if (in)   filter_6()
+    ;     else if (hev)  filter_2()
+    ;     else           filter_4()
+    ; }
+    ;
+    ; f14:                                                                            fm &  out &  in
+    ; f6:  fm & ~f14 & in        => fm & ~(out & in) & in                          => fm & ~out &  in
+    ; f2:  fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev          => fm &  ~in &  hev
+    ; f4:  fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm &  ~in & ~hev
+
+    ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
+    ; filter2()
+%if %2 != 44
+    mova                m6, [pb_80]                     ; already in m6 if 44_16
+    SCRATCH              2, 15, rsp+%3+%4
+%if %2 == 16
+    SCRATCH              1,  8, rsp+%3+%4+16
+%endif
+%endif
+    pxor                m2, m6, rq0                     ; q0 ^ 0x80
+    pxor                m4, m6, rp0                     ; p0 ^ 0x80
+    psubsb              m2, m4                          ; (signed) q0 - p0
+    pxor                m4, m6, rp1                     ; p1 ^ 0x80
+    pxor                m5, m6, rq1                     ; q1 ^ 0x80
+    psubsb              m4, m5                          ; (signed) p1 - q1
+    paddsb              m4, m2                          ;   (q0 - p0) + (p1 - q1)
+    paddsb              m4, m2                          ; 2*(q0 - p0) + (p1 - q1)
+    paddsb              m4, m2                          ; 3*(q0 - p0) + (p1 - q1)
+    paddsb              m6, m4, [pb_4]                  ; m6: f1 = clip(f + 4, 127)
+    paddsb              m4, [pb_3]                      ; m4: f2 = clip(f + 3, 127)
+%if ARCH_X86_64
+    mova                m14, [pb_10]                    ; will be reused in filter4()
+%define rb10 m14
+%else
+%define rb10 [pb_10]
+%endif
+    SRSHIFT3B_2X        m6, m4, rb10, m7                ; f1 and f2 sign byte shift by 3
+    SIGN_SUB            m7, rq0, m6, m5                 ; m7 = q0 - f1
+    SIGN_ADD            m1, rp0, m4, m5                 ; m1 = p0 + f2
+%if %2 != 44
+%if ARCH_X86_64
+    pandn               m6, m15, m3                     ;  ~mask(in) & mask(fm)
+%else
+    mova                m6, [rsp+%3+%4]
+    pandn               m6, m3
+%endif
+    pand                m6, m0                          ; (~mask(in) & mask(fm)) & mask(hev)
+%else
+    pand                m6, m3, m0
+%endif
+    MASK_APPLY          m7, rq0, m6, m5                 ; m7 = filter2(q0) & mask / we write it in filter4()
+    MASK_APPLY          m1, rp0, m6, m5                 ; m1 = filter2(p0) & mask / we write it in filter4()
+
+    ; (m0: hev, m1: p0', m2: q0-p0, m3: fm, m7: q0', [m8: flat8out], m10..13: p1 p0 q0 q1, m14: pb_10, [m15: flat8in], )
+    ; filter4()
+    mova                m4, m2
+    paddsb              m2, m4                          ; 2 * (q0 - p0)
+    paddsb              m2, m4                          ; 3 * (q0 - p0)
+    paddsb              m6, m2, [pb_4]                  ; m6:  f1 = clip(f + 4, 127)
+    paddsb              m2, [pb_3]                      ; m2: f2 = clip(f + 3, 127)
+    SRSHIFT3B_2X        m6, m2, rb10, m4                ; f1 and f2 sign byte shift by 3
+%if %2 != 44
+%if ARCH_X86_64
+    pandn               m5, m15, m3                     ;               ~mask(in) & mask(fm)
+%else
+    mova                m5, [rsp+%3+%4]
+    pandn               m5, m3
+%endif
+    pandn               m0, m5                          ; ~mask(hev) & (~mask(in) & mask(fm))
+%else
+    pandn               m0, m3
+%endif
+    SIGN_SUB            m5, rq0, m6, m4                 ; q0 - f1
+    MASK_APPLY          m5, m7, m0, m4                  ; filter4(q0) & mask
+    mova                [Q0], m5
+    SIGN_ADD            m7, rp0, m2, m4                 ; p0 + f2
+    MASK_APPLY          m7, m1, m0, m4                  ; filter4(p0) & mask
+    mova                [P0], m7
+    paddb               m6, [pb_80]                     ;
+    pxor                m1, m1                          ;   f=(f1+1)>>1
+    pavgb               m6, m1                          ;
+    psubb               m6, [pb_40]                     ;
+    SIGN_ADD            m1, rp1, m6, m2                 ; p1 + f
+    SIGN_SUB            m4, rq1, m6, m2                 ; q1 - f
+    MASK_APPLY          m1, rp1, m0, m2                 ; m1 = filter4(p1)
+    MASK_APPLY          m4, rq1, m0, m2                 ; m4 = filter4(q1)
+    mova                [P1], m1
+    mova                [Q1], m4
+
+%if %2 != 44
+    UNSCRATCH            2, 15, rsp+%3+%4
+%endif
+
+    ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
+    ; filter6()
+%if %2 != 44
+    pxor                m0, m0
+%if %2 > 16
+    pand                m3, m2
+%else
+    pand                m2, m3                          ;               mask(fm) & mask(in)
+%if ARCH_X86_64
+    pandn               m3, m8, m2                      ; ~mask(out) & (mask(fm) & mask(in))
+%else
+    mova                m3, [rsp+%3+%4+16]
+    pandn               m3, m2
+%endif
+%endif
+%if ARCH_X86_64
+    mova               m14, [P3]
+    mova                m9, [Q3]
+%define rp3 m14
+%define rq3 m9
+%else
+%define rp3 [P3]
+%define rq3 [Q3]
+%endif
+    mova                m1, [P2]
+    FILTER_INIT         m4, m5, m6, m7, [P2], %4, 6,             m3,  m1             ; [p2]
+    mova                m1, [Q2]
+    FILTER_UPDATE       m4, m5, m6, m7, [P1], %4, 0, 1, 2, 5, 3, m3,  "", rq1, "", 1 ; [p1] -p3 -p2 +p1 +q1
+    FILTER_UPDATE       m4, m5, m6, m7, [P0], %4, 0, 2, 3, 6, 3, m3,  "", m1         ; [p0] -p3 -p1 +p0 +q2
+    FILTER_UPDATE       m4, m5, m6, m7, [Q0], %4, 0, 3, 4, 7, 3, m3,  "", rq3, "", 1 ; [q0] -p3 -p0 +q0 +q3
+    FILTER_UPDATE       m4, m5, m6, m7, [Q1], %4, 1, 4, 5, 7, 3, m3,  ""             ; [q1] -p2 -q0 +q1 +q3
+    FILTER_UPDATE       m4, m5, m6, m7, [Q2], %4, 2, 5, 6, 7, 3, m3,  m1             ; [q2] -p1 -q1 +q2 +q3
+%endif
+
+%if %2 == 16
+    UNSCRATCH            1,  8, rsp+%3+%4+16
+%endif
+
+    ; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2)
+    ; filter14()
+    ;
+    ;                            m2  m3  m8  m9 m14 m15 m10 m11 m12 m13
+    ;
+    ;                                    q2  q3  p3  p2  p1  p0  q0  q1
+    ; p6  -7                     p7  p6  p5  p4   .   .   .   .   .
+    ; p5  -6  -p7 -p6 +p5 +q1     .   .   .                           .
+    ; p4  -5  -p7 -p5 +p4 +q2     .       .   .                      q2
+    ; p3  -4  -p7 -p4 +p3 +q3     .           .   .                  q3
+    ; p2  -3  -p7 -p3 +p2 +q4     .               .   .              q4
+    ; p1  -2  -p7 -p2 +p1 +q5     .                   .   .          q5
+    ; p0  -1  -p7 -p1 +p0 +q6     .                       .   .      q6
+    ; q0  +0  -p7 -p0 +q0 +q7     .                           .   .  q7
+    ; q1  +1  -p6 -q0 +q1 +q7    q1   .                           .   .
+    ; q2  +2  -p5 -q1 +q2 +q7     .  q2   .                           .
+    ; q3  +3  -p4 -q2 +q3 +q7         .  q3   .                       .
+    ; q4  +4  -p3 -q3 +q4 +q7             .  q4   .                   .
+    ; q5  +5  -p2 -q4 +q5 +q7                 .  q5   .               .
+    ; q6  +6  -p1 -q5 +q6 +q7                     .  q6   .           .
+
+%if %2 == 16
+    pand            m1, m2                                                              ; mask(out) & (mask(fm) & mask(in))
+    mova            m2, [P7]
+    mova            m3, [P6]
+%if ARCH_X86_64
+    mova            m8, [P5]
+    mova            m9, [P4]
+%define rp5 m8
+%define rp4 m9
+%define rp5s m8
+%define rp4s m9
+%define rp3s m14
+%define rq4 m8
+%define rq5 m9
+%define rq6 m14
+%define rq7 m15
+%define rq4s m8
+%define rq5s m9
+%define rq6s m14
+%else
+%define rp5 [P5]
+%define rp4 [P4]
+%define rp5s ""
+%define rp4s ""
+%define rp3s ""
+%define rq4 [Q4]
+%define rq5 [Q5]
+%define rq6 [Q6]
+%define rq7 [Q7]
+%define rq4s ""
+%define rq5s ""
+%define rq6s ""
+%endif
+    FILTER_INIT     m4, m5, m6, m7, [P6], %4, 14,                m1,  m3            ; [p6]
+    FILTER_UPDATE   m4, m5, m6, m7, [P5], %4,  8,  9, 10,  5, 4, m1, rp5s           ; [p5] -p7 -p6 +p5 +q1
+    FILTER_UPDATE   m4, m5, m6, m7, [P4], %4,  8, 10, 11,  6, 4, m1, rp4s           ; [p4] -p7 -p5 +p4 +q2
+    FILTER_UPDATE   m4, m5, m6, m7, [P3], %4,  8, 11,  0,  7, 4, m1, rp3s           ; [p3] -p7 -p4 +p3 +q3
+    FILTER_UPDATE   m4, m5, m6, m7, [P2], %4,  8,  0,  1, 12, 4, m1,  "", rq4, [Q4], 1 ; [p2] -p7 -p3 +p2 +q4
+    FILTER_UPDATE   m4, m5, m6, m7, [P1], %4,  8,  1,  2, 13, 4, m1,  "", rq5, [Q5], 1 ; [p1] -p7 -p2 +p1 +q5
+    FILTER_UPDATE   m4, m5, m6, m7, [P0], %4,  8,  2,  3, 14, 4, m1,  "", rq6, [Q6], 1 ; [p0] -p7 -p1 +p0 +q6
+    FILTER_UPDATE   m4, m5, m6, m7, [Q0], %4,  8,  3,  4, 15, 4, m1,  "", rq7, [Q7], 1 ; [q0] -p7 -p0 +q0 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q1], %4,  9,  4,  5, 15, 4, m1,  ""            ; [q1] -p6 -q0 +q1 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q2], %4, 10,  5,  6, 15, 4, m1,  ""            ; [q2] -p5 -q1 +q2 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q3], %4, 11,  6,  7, 15, 4, m1,  ""            ; [q3] -p4 -q2 +q3 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q4], %4,  0,  7, 12, 15, 4, m1, rq4s           ; [q4] -p3 -q3 +q4 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q5], %4,  1, 12, 13, 15, 4, m1, rq5s           ; [q5] -p2 -q4 +q5 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q6], %4,  2, 13, 14, 15, 4, m1, rq6s           ; [q6] -p1 -q5 +q6 +q7
+%endif
+
+%ifidn %1, h
+%if %2 == 16
+    mova                    m0, [P7]
+    mova                    m1, [P6]
+    mova                    m2, [P5]
+    mova                    m3, [P4]
+    mova                    m4, [P3]
+    mova                    m5, [P2]
+%if ARCH_X86_64
+    mova                    m6, [P1]
+%endif
+    mova                    m7, [P0]
+%if ARCH_X86_64
+    mova                    m8, [Q0]
+    mova                    m9, [Q1]
+    mova                   m10, [Q2]
+    mova                   m11, [Q3]
+    mova                   m12, [Q4]
+    mova                   m13, [Q5]
+    mova                   m14, [Q6]
+    mova                   m15, [Q7]
+    TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
+    DEFINE_REAL_P7_TO_Q7
+    movu  [P7],  m0
+    movu  [P6],  m1
+    movu  [P5],  m2
+    movu  [P4],  m3
+    movu  [P3],  m4
+    movu  [P2],  m5
+    movu  [P1],  m6
+    movu  [P0],  m7
+    movu  [Q0],  m8
+    movu  [Q1],  m9
+    movu  [Q2], m10
+    movu  [Q3], m11
+    movu  [Q4], m12
+    movu  [Q5], m13
+    movu  [Q6], m14
+    movu  [Q7], m15
+%else
+    DEFINE_REAL_P7_TO_Q7
+    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+32], a, [rsp+%3+%4], [Q0], [Q1]
+    movh   [P7],  m0
+    movh   [P5],  m1
+    movh   [P3],  m2
+    movh   [P1],  m3
+    movh   [Q2],  m5
+    movh   [Q4],  m6
+    movh   [Q6],  m7
+    movhps [P6],  m0
+    movhps [P4],  m1
+    movhps [P2],  m2
+    movhps [P0],  m3
+    movhps [Q3],  m5
+    movhps [Q5],  m6
+    movhps [Q7],  m7
+    DEFINE_TRANSPOSED_P7_TO_Q7
+    mova                    m0, [Q0]
+    mova                    m1, [Q1]
+    mova                    m2, [Q2]
+    mova                    m3, [Q3]
+    mova                    m4, [Q4]
+    mova                    m5, [Q5]
+    mova                    m7, [Q7]
+    DEFINE_REAL_P7_TO_Q7 8
+    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+224], a, [rsp+%3+%4], [Q0], [Q1]
+    movh   [P7],  m0
+    movh   [P5],  m1
+    movh   [P3],  m2
+    movh   [P1],  m3
+    movh   [Q2],  m5
+    movh   [Q4],  m6
+    movh   [Q6],  m7
+    movhps [P6],  m0
+    movhps [P4],  m1
+    movhps [P2],  m2
+    movhps [P0],  m3
+    movhps [Q3],  m5
+    movhps [Q5],  m6
+    movhps [Q7],  m7
+%endif
+%elif %2 == 44
+    SWAP 0, 1   ; m0 = p1
+    SWAP 1, 7   ; m1 = p0
+    SWAP 2, 5   ; m2 = q0
+    SWAP 3, 4   ; m3 = q1
+    DEFINE_REAL_P7_TO_Q7 2
+    SBUTTERFLY  bw, 0, 1, 4
+    SBUTTERFLY  bw, 2, 3, 4
+    SBUTTERFLY  wd, 0, 2, 4
+    SBUTTERFLY  wd, 1, 3, 4
+    movd  [P7], m0
+    movd  [P3], m2
+    movd  [Q0], m1
+    movd  [Q4], m3
+    psrldq  m0, 4
+    psrldq  m1, 4
+    psrldq  m2, 4
+    psrldq  m3, 4
+    movd  [P6], m0
+    movd  [P2], m2
+    movd  [Q1], m1
+    movd  [Q5], m3
+    psrldq  m0, 4
+    psrldq  m1, 4
+    psrldq  m2, 4
+    psrldq  m3, 4
+    movd  [P5], m0
+    movd  [P1], m2
+    movd  [Q2], m1
+    movd  [Q6], m3
+    psrldq  m0, 4
+    psrldq  m1, 4
+    psrldq  m2, 4
+    psrldq  m3, 4
+    movd  [P4], m0
+    movd  [P0], m2
+    movd  [Q3], m1
+    movd  [Q7], m3
+%else
+    ; the following code do a transpose of 8 full lines to 16 half
+    ; lines (high part). It is inlined to avoid the need of a staging area
+    mova                    m0, [P3]
+    mova                    m1, [P2]
+    mova                    m2, [P1]
+    mova                    m3, [P0]
+    mova                    m4, [Q0]
+    mova                    m5, [Q1]
+%if ARCH_X86_64
+    mova                    m6, [Q2]
+%endif
+    mova                    m7, [Q3]
+    DEFINE_REAL_P7_TO_Q7
+%if ARCH_X86_64
+    SBUTTERFLY  bw,  0,  1, 8
+    SBUTTERFLY  bw,  2,  3, 8
+    SBUTTERFLY  bw,  4,  5, 8
+    SBUTTERFLY  bw,  6,  7, 8
+    SBUTTERFLY  wd,  0,  2, 8
+    SBUTTERFLY  wd,  1,  3, 8
+    SBUTTERFLY  wd,  4,  6, 8
+    SBUTTERFLY  wd,  5,  7, 8
+    SBUTTERFLY  dq,  0,  4, 8
+    SBUTTERFLY  dq,  1,  5, 8
+    SBUTTERFLY  dq,  2,  6, 8
+    SBUTTERFLY  dq,  3,  7, 8
+%else
+    SBUTTERFLY  bw,  0,  1, 6
+    mova  [rsp+64], m1
+    mova        m6, [rsp+96]
+    SBUTTERFLY  bw,  2,  3, 1
+    SBUTTERFLY  bw,  4,  5, 1
+    SBUTTERFLY  bw,  6,  7, 1
+    SBUTTERFLY  wd,  0,  2, 1
+    mova  [rsp+96], m2
+    mova        m1, [rsp+64]
+    SBUTTERFLY  wd,  1,  3, 2
+    SBUTTERFLY  wd,  4,  6, 2
+    SBUTTERFLY  wd,  5,  7, 2
+    SBUTTERFLY  dq,  0,  4, 2
+    SBUTTERFLY  dq,  1,  5, 2
+    movh      [Q0], m1
+    movhps    [Q1], m1
+    mova        m2, [rsp+96]
+    SBUTTERFLY  dq,  2,  6, 1
+    SBUTTERFLY  dq,  3,  7, 1
+%endif
+    SWAP         3, 6
+    SWAP         1, 4
+    movh      [P7], m0
+    movhps    [P6], m0
+    movh      [P5], m1
+    movhps    [P4], m1
+    movh      [P3], m2
+    movhps    [P2], m2
+    movh      [P1], m3
+    movhps    [P0], m3
+%if ARCH_X86_64
+    movh      [Q0], m4
+    movhps    [Q1], m4
+%endif
+    movh      [Q2], m5
+    movhps    [Q3], m5
+    movh      [Q4], m6
+    movhps    [Q5], m6
+    movh      [Q6], m7
+    movhps    [Q7], m7
+%endif
+%endif
+
+    RET
+%endmacro
+
+%macro LPF_16_VH 5
+INIT_XMM %5
+LOOPFILTER v, %1, %2,  0, %4
+LOOPFILTER h, %1, %2, %3, %4
+%endmacro
+
+%macro LPF_16_VH_ALL_OPTS 4
+LPF_16_VH %1, %2, %3, %4, sse2
+LPF_16_VH %1, %2, %3, %4, ssse3
+LPF_16_VH %1, %2, %3, %4, avx
+%endmacro
+
+LPF_16_VH_ALL_OPTS 16, 512, 256, 32
+LPF_16_VH_ALL_OPTS 44,   0, 128,  0
+LPF_16_VH_ALL_OPTS 48, 256, 128, 16
+LPF_16_VH_ALL_OPTS 84, 256, 128, 16
+LPF_16_VH_ALL_OPTS 88, 256, 128, 16
diff --git a/libavcodec/x86/vp9lpf_16bpp.asm b/libavcodec/x86/vp9lpf_16bpp.asm
new file mode 100644
index 0000000000..c15437b8ba
--- /dev/null
+++ b/libavcodec/x86/vp9lpf_16bpp.asm
@@ -0,0 +1,823 @@
+;******************************************************************************
+;* VP9 loop filter SIMD optimizations
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_511: times 16 dw 511
+pw_2047: times 16 dw 2047
+pw_16384: times 16 dw 16384
+pw_m512: times 16 dw -512
+pw_m2048: times 16 dw -2048
+
+cextern pw_1
+cextern pw_3
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_256
+cextern pw_1023
+cextern pw_4095
+cextern pw_m1
+
+SECTION .text
+
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+    SWAP                %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+    mova              [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+    SWAP                %1, %2
+%else
+    mova               m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+    mova               m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+; calulate p or q portion of flat8out
+%macro FLAT8OUT_HALF 0
+    psubw               m4, m0                      ; q4-q0
+    psubw               m5, m0                      ; q5-q0
+    psubw               m6, m0                      ; q6-q0
+    psubw               m7, m0                      ; q7-q0
+    ABS2                m4, m5, m2, m3              ; abs(q4-q0) | abs(q5-q0)
+    ABS2                m6, m7, m2, m3              ; abs(q6-q0) | abs(q7-q0)
+    pcmpgtw             m4, reg_F                   ; abs(q4-q0) > F
+    pcmpgtw             m5, reg_F                   ; abs(q5-q0) > F
+    pcmpgtw             m6, reg_F                   ; abs(q6-q0) > F
+    pcmpgtw             m7, reg_F                   ; abs(q7-q0) > F
+    por                 m5, m4
+    por                 m7, m6
+    por                 m7, m5                      ; !flat8out, q portion
+%endmacro
+
+; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
+%macro FLAT8IN_HALF 1
+%if %1 > 4
+    psubw               m4, m3, m0                  ; q3-q0
+    psubw               m5, m2, m0                  ; q2-q0
+    ABS2                m4, m5, m6, m7              ; abs(q3-q0) | abs(q2-q0)
+    pcmpgtw             m4, reg_F                   ; abs(q3-q0) > F
+    pcmpgtw             m5, reg_F                   ; abs(q2-q0) > F
+%endif
+    psubw               m3, m2                      ; q3-q2
+    psubw               m2, m1                      ; q2-q1
+    ABS2                m3, m2, m6, m7              ; abs(q3-q2) | abs(q2-q1)
+    pcmpgtw             m3, reg_I                   ; abs(q3-q2) > I
+    pcmpgtw             m2, reg_I                   ; abs(q2-q1) > I
+%if %1 > 4
+    por                 m4, m5
+%endif
+    por                 m2, m3
+    psubw               m3, m1, m0                  ; q1-q0
+    ABS1                m3, m5                      ; abs(q1-q0)
+%if %1 > 4
+    pcmpgtw             m6, m3, reg_F               ; abs(q1-q0) > F
+%endif
+    pcmpgtw             m7, m3, reg_H               ; abs(q1-q0) > H
+    pcmpgtw             m3, reg_I                   ; abs(q1-q0) > I
+%if %1 > 4
+    por                 m4, m6
+%endif
+    por                 m2, m3
+%endmacro
+
+; one step in filter_14/filter_6
+;
+; take sum $reg, downshift, apply mask and write into dst
+;
+; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
+; step's sum $reg. This is omitted for the last row in each filter.
+;
+; if dont_store is set, don't write the result into memory, instead keep the
+; values in register so we can write it out later
+%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
+                                      ; src/sub1, sub2, add1, add2, dont_store
+    psrlw               %1, %2, %4
+    psubw               %1, %6                      ; abs->delta
+%ifnidn %7, ""
+    psubw               %2, %6
+    psubw               %2, %7
+    paddw               %2, %8
+    paddw               %2, %9
+%endif
+    pand                %1, reg_%3                  ; apply mask
+%if %10 == 1
+    paddw               %6, %1                      ; delta->abs
+%else
+    paddw               %1, %6                      ; delta->abs
+    mova              [%5], %1
+%endif
+%endmacro
+
+; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}
+
+%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]
+
+%if ARCH_X86_64
+%if %2 == 16
+%assign %%num_xmm_regs 16
+%elif %2 == 8
+%assign %%num_xmm_regs 15
+%else ; %2 == 4
+%assign %%num_xmm_regs 14
+%endif ; %2
+%assign %%bak_mem 0
+%else ; ARCH_X86_32
+%assign %%num_xmm_regs 8
+%if %2 == 16
+%assign %%bak_mem 7
+%elif %2 == 8
+%assign %%bak_mem 6
+%else ; %2 == 4
+%assign %%bak_mem 5
+%endif ; %2
+%endif ; ARCH_X86_64/32
+
+%if %2 == 16
+%ifidn %1, v
+%assign %%num_gpr_regs 6
+%else ; %1 == h
+%assign %%num_gpr_regs 5
+%endif ; %1
+%assign %%wd_mem 6
+%else ; %2 == 8/4
+%assign %%num_gpr_regs 5
+%if ARCH_X86_32 && %2 == 8
+%assign %%wd_mem 2
+%else ; ARCH_X86_64 || %2 == 4
+%assign %%wd_mem 0
+%endif ; ARCH_X86_64/32 etc.
+%endif ; %2
+
+%ifidn %1, v
+%assign %%tsp_mem 0
+%elif %2 == 16 ; && %1 == h
+%assign %%tsp_mem 16
+%else ; %1 == h && %1 == 8/4
+%assign %%tsp_mem 8
+%endif ; %1/%2
+
+%assign %%off %%wd_mem
+%assign %%tspoff %%bak_mem+%%wd_mem
+%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)
+
+%if %3 == 10
+%define %%maxsgn 511
+%define %%minsgn m512
+%define %%maxusgn 1023
+%define %%maxf 4
+%else ; %3 == 12
+%define %%maxsgn 2047
+%define %%minsgn m2048
+%define %%maxusgn 4095
+%define %%maxf 16
+%endif ; %3
+
+cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
+    ; prepare E, I and H masks
+    shl                 Ed, %3-8
+    shl                 Id, %3-8
+    shl                 Hd, %3-8
+%if cpuflag(ssse3)
+    mova                m0, [pw_256]
+%endif
+    movd                m1, Ed
+    movd                m2, Id
+    movd                m3, Hd
+%if cpuflag(ssse3)
+    pshufb              m1, m0                      ; E << (bit_depth - 8)
+    pshufb              m2, m0                      ; I << (bit_depth - 8)
+    pshufb              m3, m0                      ; H << (bit_depth - 8)
+%else
+    punpcklwd           m1, m1
+    punpcklwd           m2, m2
+    punpcklwd           m3, m3
+    pshufd              m1, m1, q0000
+    pshufd              m2, m2, q0000
+    pshufd              m3, m3, q0000
+%endif
+    SCRATCH              1,  8, rsp+(%%off+0)*mmsize,  E
+    SCRATCH              2,  9, rsp+(%%off+1)*mmsize,  I
+    SCRATCH              3, 10, rsp+(%%off+2)*mmsize,  H
+%if %2 > 4
+    PRELOAD                 11, pw_ %+ %%maxf, F
+%endif
+
+    ; set up variables to load data
+%ifidn %1, v
+    DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
+    lea           stride3q, [strideq*3]
+    neg            strideq
+%if %2 == 16
+    lea              dst0q, [dst8q+strideq*8]
+%else
+    lea              dst4q, [dst8q+strideq*4]
+%endif
+    neg            strideq
+%if %2 == 16
+    lea             dst12q, [dst8q+strideq*4]
+    lea              dst4q, [dst0q+strideq*4]
+%endif
+
+%if %2 == 16
+%define %%p7 dst0q
+%define %%p6 dst0q+strideq
+%define %%p5 dst0q+strideq*2
+%define %%p4 dst0q+stride3q
+%endif
+%define %%p3 dst4q
+%define %%p2 dst4q+strideq
+%define %%p1 dst4q+strideq*2
+%define %%p0 dst4q+stride3q
+%define %%q0 dst8q
+%define %%q1 dst8q+strideq
+%define %%q2 dst8q+strideq*2
+%define %%q3 dst8q+stride3q
+%if %2 == 16
+%define %%q4 dst12q
+%define %%q5 dst12q+strideq
+%define %%q6 dst12q+strideq*2
+%define %%q7 dst12q+stride3q
+%endif
+%else ; %1 == h
+    DEFINE_ARGS dst0, stride, stride3, dst4
+    lea           stride3q, [strideq*3]
+    lea              dst4q, [dst0q+strideq*4]
+
+%define %%p3 rsp+(%%tspoff+0)*mmsize
+%define %%p2 rsp+(%%tspoff+1)*mmsize
+%define %%p1 rsp+(%%tspoff+2)*mmsize
+%define %%p0 rsp+(%%tspoff+3)*mmsize
+%define %%q0 rsp+(%%tspoff+4)*mmsize
+%define %%q1 rsp+(%%tspoff+5)*mmsize
+%define %%q2 rsp+(%%tspoff+6)*mmsize
+%define %%q3 rsp+(%%tspoff+7)*mmsize
+
+%if %2 < 16
+    movu                m0, [dst0q+strideq*0-8]
+    movu                m1, [dst0q+strideq*1-8]
+    movu                m2, [dst0q+strideq*2-8]
+    movu                m3, [dst0q+stride3q -8]
+    movu                m4, [dst4q+strideq*0-8]
+    movu                m5, [dst4q+strideq*1-8]
+    movu                m6, [dst4q+strideq*2-8]
+    movu                m7, [dst4q+stride3q -8]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
+%endif
+
+    mova            [%%p3], m0
+    mova            [%%p2], m1
+    mova            [%%p1], m2
+    mova            [%%p0], m3
+%if ARCH_X86_64
+    mova            [%%q0], m4
+%endif
+    mova            [%%q1], m5
+    mova            [%%q2], m6
+    mova            [%%q3], m7
+
+    ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
+    ; order here accordingly
+%else ; %2 == 16
+
+%define %%p7 rsp+(%%tspoff+ 8)*mmsize
+%define %%p6 rsp+(%%tspoff+ 9)*mmsize
+%define %%p5 rsp+(%%tspoff+10)*mmsize
+%define %%p4 rsp+(%%tspoff+11)*mmsize
+%define %%q4 rsp+(%%tspoff+12)*mmsize
+%define %%q5 rsp+(%%tspoff+13)*mmsize
+%define %%q6 rsp+(%%tspoff+14)*mmsize
+%define %%q7 rsp+(%%tspoff+15)*mmsize
+
+    mova                m0, [dst0q+strideq*0-16]
+    mova                m1, [dst0q+strideq*1-16]
+    mova                m2, [dst0q+strideq*2-16]
+    mova                m3, [dst0q+stride3q -16]
+    mova                m4, [dst4q+strideq*0-16]
+    mova                m5, [dst4q+strideq*1-16]
+%if ARCH_X86_64
+    mova                m6, [dst4q+strideq*2-16]
+%endif
+    mova                m7, [dst4q+stride3q -16]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
+%endif
+
+    mova            [%%p7], m0
+    mova            [%%p6], m1
+    mova            [%%p5], m2
+    mova            [%%p4], m3
+%if ARCH_X86_64
+    mova            [%%p3], m4
+%endif
+    mova            [%%p2], m5
+    mova            [%%p1], m6
+    mova            [%%p0], m7
+
+    mova                m0, [dst0q+strideq*0]
+    mova                m1, [dst0q+strideq*1]
+    mova                m2, [dst0q+strideq*2]
+    mova                m3, [dst0q+stride3q ]
+    mova                m4, [dst4q+strideq*0]
+    mova                m5, [dst4q+strideq*1]
+%if ARCH_X86_64
+    mova                m6, [dst4q+strideq*2]
+%endif
+    mova                m7, [dst4q+stride3q ]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
+%endif
+
+    mova            [%%q0], m0
+    mova            [%%q1], m1
+    mova            [%%q2], m2
+    mova            [%%q3], m3
+%if ARCH_X86_64
+    mova            [%%q4], m4
+%endif
+    mova            [%%q5], m5
+    mova            [%%q6], m6
+    mova            [%%q7], m7
+
+    ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
+    ; order here accordingly
+%endif ; %2
+%endif ; %1
+
+    ; load q0|q4-7 data
+    mova                m0, [%%q0]
+%if %2 == 16
+    mova                m4, [%%q4]
+    mova                m5, [%%q5]
+    mova                m6, [%%q6]
+    mova                m7, [%%q7]
+
+    ; flat8out q portion
+    FLAT8OUT_HALF
+    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
+%endif
+
+    ; load q1-3 data
+    mova                m1, [%%q1]
+    mova                m2, [%%q2]
+    mova                m3, [%%q3]
+
+    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+    ; r9[m15]=!flatout[q]
+    ; m12-14=free
+    ; m0-3=q0-q3
+    ; m4-7=free
+
+    ; flat8in|fm|hev q portion
+    FLAT8IN_HALF        %2
+    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
+%if %2 > 4
+    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8I
+%endif
+
+    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+    ; r9[m15]=!flat8out[q]
+    ; r10[m13]=hev[q]
+    ; r11[m14]=!flat8in[q]
+    ; m2=!fm[q]
+    ; m0,1=q0-q1
+    ; m2-7=free
+    ; m12=free
+
+    ; load p0-1
+    mova                m3, [%%p0]
+    mova                m4, [%%p1]
+
+    ; fm mb_edge portion
+    psubw               m5, m3, m0                  ; q0-p0
+    psubw               m6, m4, m1                  ; q1-p1
+%if ARCH_X86_64
+    ABS2                m5, m6, m7, m12             ; abs(q0-p0) | abs(q1-p1)
+%else
+    ABS1                m5, m7                      ; abs(q0-p0)
+    ABS1                m6, m7                      ; abs(q1-p1)
+%endif
+    paddw               m5, m5
+    psraw               m6, 1
+    paddw               m6, m5                      ; abs(q0-p0)*2+(abs(q1-p1)>>1)
+    pcmpgtw             m6, reg_E
+    por                 m2, m6
+    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, FM
+
+    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+    ; r9[m15]=!flat8out[q]
+    ; r10[m13]=hev[q]
+    ; r11[m14]=!flat8in[q]
+    ; r12[m12]=!fm[q]
+    ; m3-4=q0-1
+    ; m0-2/5-7=free
+
+    ; load p4-7 data
+    SWAP                 3, 0                       ; p0
+    SWAP                 4, 1                       ; p1
+%if %2 == 16
+    mova                m7, [%%p7]
+    mova                m6, [%%p6]
+    mova                m5, [%%p5]
+    mova                m4, [%%p4]
+
+    ; flat8out p portion
+    FLAT8OUT_HALF
+    por                 m7, reg_F8O
+    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
+%endif
+
+    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+    ; r9[m15]=!flat8out
+    ; r10[m13]=hev[q]
+    ; r11[m14]=!flat8in[q]
+    ; r12[m12]=!fm[q]
+    ; m0=p0
+    ; m1-7=free
+
+    ; load p2-3 data
+    mova                m2, [%%p2]
+    mova                m3, [%%p3]
+
+    ; flat8in|fm|hev p portion
+    FLAT8IN_HALF        %2
+    por                 m7, reg_HEV
+%if %2 > 4
+    por                 m4, reg_F8I
+%endif
+    por                 m2, reg_FM
+%if %2 > 4
+    por                 m4, m2                      ; !flat8|!fm
+%if %2 == 16
+    por                 m5, m4, reg_F8O             ; !flat16|!fm
+    pandn               m2, m4                      ; filter4_mask
+    pandn               m4, m5                      ; filter8_mask
+    pxor                m5, [pw_m1]                 ; filter16_mask
+    SCRATCH              5, 15, rsp+(%%off+6)*mmsize, F16M
+%else
+    pandn               m2, m4                      ; filter4_mask
+    pxor                m4, [pw_m1]                 ; filter8_mask
+%endif
+    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8M
+%else
+    pxor                m2, [pw_m1]                 ; filter4_mask
+%endif
+    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
+    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, F4M
+
+    ; r9[m15]=filter16_mask
+    ; r10[m13]=hev
+    ; r11[m14]=filter8_mask
+    ; r12[m12]=filter4_mask
+    ; m0,1=p0-p1
+    ; m2-7=free
+    ; m8-11=free
+
+%if %2 > 4
+%if %2 == 16
+    ; filter_14
+    mova                m2, [%%p7]
+    mova                m3, [%%p6]
+    mova                m6, [%%p5]
+    mova                m7, [%%p4]
+    PRELOAD              8, %%p3, P3
+    PRELOAD              9, %%p2, P2
+%endif
+    PRELOAD             10, %%q0, Q0
+    PRELOAD             11, %%q1, Q1
+%if %2 == 16
+    psllw               m4, m2, 3
+    paddw               m5, m3, m3
+    paddw               m4, m6
+    paddw               m5, m7
+    paddw               m4, reg_P3
+    paddw               m5, reg_P2
+    paddw               m4, m1
+    paddw               m5, m0
+    paddw               m4, reg_Q0                  ; q0+p1+p3+p5+p7*8
+    psubw               m5, m2                      ; p0+p2+p4+p6*2-p7
+    paddw               m4, [pw_8]
+    paddw               m5, m4                      ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8
+
+    ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
+    ; at the end of the filter
+
+    mova    [rsp+0*mmsize], m3
+    FILTER_STEP         m4, m5, F16M, 4, %%p6, m3,     m2,             m6,     reg_Q1
+%endif
+    mova                m3, [%%q2]
+%if %2 == 16
+    mova    [rsp+1*mmsize], m6
+    FILTER_STEP         m4, m5, F16M, 4, %%p5, m6,     m2,             m7,     m3
+%endif
+    mova                m6, [%%q3]
+%if %2 == 16
+    mova    [rsp+2*mmsize], m7
+    FILTER_STEP         m4, m5, F16M, 4, %%p4, m7,     m2,             reg_P3, m6
+    mova                m7, [%%q4]
+%if ARCH_X86_64
+    mova    [rsp+3*mmsize], reg_P3
+%else
+    mova                m4, reg_P3
+    mova    [rsp+3*mmsize], m4
+%endif
+    FILTER_STEP         m4, m5, F16M, 4, %%p3, reg_P3, m2,             reg_P2, m7
+    PRELOAD              8, %%q5, Q5
+%if ARCH_X86_64
+    mova    [rsp+4*mmsize], reg_P2
+%else
+    mova                m4, reg_P2
+    mova    [rsp+4*mmsize], m4
+%endif
+    FILTER_STEP         m4, m5, F16M, 4, %%p2, reg_P2, m2,             m1,     reg_Q5
+    PRELOAD              9, %%q6, Q6
+    mova    [rsp+5*mmsize], m1
+    FILTER_STEP         m4, m5, F16M, 4, %%p1, m1,     m2,             m0,     reg_Q6
+    mova                m1, [%%q7]
+    FILTER_STEP         m4, m5, F16M, 4, %%p0, m0,     m2,             reg_Q0, m1,     1
+    FILTER_STEP         m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1,     ARCH_X86_64
+    FILTER_STEP         m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m1,     ARCH_X86_64
+    FILTER_STEP         m4, m5, F16M, 4, %%q2, m3,     [rsp+2*mmsize], m6,     m1,     1
+    FILTER_STEP         m4, m5, F16M, 4, %%q3, m6,     [rsp+3*mmsize], m7,     m1
+    FILTER_STEP         m4, m5, F16M, 4, %%q4, m7,     [rsp+4*mmsize], reg_Q5, m1
+    FILTER_STEP         m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
+    FILTER_STEP         m4, m5, F16M, 4, %%q6, reg_Q6
+
+    mova                m7, [%%p1]
+%else
+    SWAP                 1, 7
+%endif
+
+    mova                m2, [%%p3]
+    mova                m1, [%%p2]
+
+    ; reg_Q0-1 (m10-m11)
+    ; m0=p0
+    ; m1=p2
+    ; m2=p3
+    ; m3=q2
+    ; m4-5=free
+    ; m6=q3
+    ; m7=p1
+    ; m8-9 unused
+
+    ; filter_6
+    psllw               m4, m2, 2
+    paddw               m5, m1, m1
+    paddw               m4, m7
+    psubw               m5, m2
+    paddw               m4, m0
+    paddw               m5, reg_Q0
+    paddw               m4, [pw_4]
+    paddw               m5, m4
+
+%if ARCH_X86_64
+    mova                m8, m1
+    mova                m9, m7
+%else
+    mova    [rsp+0*mmsize], m1
+    mova    [rsp+1*mmsize], m7
+%endif
+%ifidn %1, v
+    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1
+%else
+    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1, 1
+%endif
+    FILTER_STEP         m4, m5, F8M, 3, %%p1, m7,     m2,             m0,     m3, 1
+    FILTER_STEP         m4, m5, F8M, 3, %%p0, m0,     m2,             reg_Q0, m6, 1
+%if ARCH_X86_64
+    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, m8,             reg_Q1, m6, ARCH_X86_64
+    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, m9,             m3,     m6, ARCH_X86_64
+%else
+    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
+    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m6, ARCH_X86_64
+%endif
+    FILTER_STEP         m4, m5, F8M, 3, %%q2, m3
+
+    UNSCRATCH            2, 10, %%q0
+    UNSCRATCH            6, 11, %%q1
+%else
+    SWAP                 1, 7
+    mova                m2, [%%q0]
+    mova                m6, [%%q1]
+%endif
+    UNSCRATCH            3, 13, rsp+(%%off+4)*mmsize, HEV
+
+    ; m0=p0
+    ; m1=p2
+    ; m2=q0
+    ; m3=hev_mask
+    ; m4-5=free
+    ; m6=q1
+    ; m7=p1
+
+    ; filter_4
+    psubw               m4, m7, m6              ; p1-q1
+    psubw               m5, m2, m0              ; q0-p0
+    pand                m4, m3
+    pminsw              m4, [pw_ %+ %%maxsgn]
+    pmaxsw              m4, [pw_ %+ %%minsgn]   ; clip_intp2(p1-q1, 9) -> f
+    paddw               m4, m5
+    paddw               m5, m5
+    paddw               m4, m5                  ; 3*(q0-p0)+f
+    pminsw              m4, [pw_ %+ %%maxsgn]
+    pmaxsw              m4, [pw_ %+ %%minsgn]   ; clip_intp2(3*(q0-p0)+f, 9) -> f
+    pand                m4, reg_F4M
+    paddw               m5, m4, [pw_4]
+    paddw               m4, [pw_3]
+    pminsw              m5, [pw_ %+ %%maxsgn]
+    pminsw              m4, [pw_ %+ %%maxsgn]
+    psraw               m5, 3                   ; min_intp2(f+4, 9)>>3 -> f1
+    psraw               m4, 3                   ; min_intp2(f+3, 9)>>3 -> f2
+    psubw               m2, m5                  ; q0-f1
+    paddw               m0, m4                  ; p0+f2
+    pandn               m3, m5                  ; f1 & !hev (for p1/q1 adj)
+    pxor                m4, m4
+    mova                m5, [pw_ %+ %%maxusgn]
+    pmaxsw              m2, m4
+    pmaxsw              m0, m4
+    pminsw              m2, m5
+    pminsw              m0, m5
+%if cpuflag(ssse3)
+    pmulhrsw            m3, [pw_16384]          ; (f1+1)>>1
+%else
+    paddw               m3, [pw_1]
+    psraw               m3, 1
+%endif
+    paddw               m7, m3                  ; p1+f
+    psubw               m6, m3                  ; q1-f
+    pmaxsw              m7, m4
+    pmaxsw              m6, m4
+    pminsw              m7, m5
+    pminsw              m6, m5
+
+    ; store
+%ifidn %1, v
+    mova            [%%p1], m7
+    mova            [%%p0], m0
+    mova            [%%q0], m2
+    mova            [%%q1], m6
+%else ; %1 == h
+%if %2 == 4
+    TRANSPOSE4x4W        7, 0, 2, 6, 1
+    movh   [dst0q+strideq*0-4], m7
+    movhps [dst0q+strideq*1-4], m7
+    movh   [dst0q+strideq*2-4], m0
+    movhps [dst0q+stride3q -4], m0
+    movh   [dst4q+strideq*0-4], m2
+    movhps [dst4q+strideq*1-4], m2
+    movh   [dst4q+strideq*2-4], m6
+    movhps [dst4q+stride3q -4], m6
+%elif %2 == 8
+    mova                m3, [%%p3]
+    mova                m4, [%%q2]
+    mova                m5, [%%q3]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, 8
+%else
+    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
+    mova                m2, [%%q0]
+%endif
+
+    movu [dst0q+strideq*0-8], m3
+    movu [dst0q+strideq*1-8], m1
+    movu [dst0q+strideq*2-8], m7
+    movu [dst0q+stride3q -8], m0
+    movu [dst4q+strideq*0-8], m2
+    movu [dst4q+strideq*1-8], m6
+    movu [dst4q+strideq*2-8], m4
+    movu [dst4q+stride3q -8], m5
+%else ; %2 == 16
+    SCRATCH              2, 8, %%q0
+    SCRATCH              6, 9, %%q1
+    mova                m2, [%%p7]
+    mova                m3, [%%p6]
+    mova                m4, [%%p5]
+    mova                m5, [%%p4]
+    mova                m6, [%%p3]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, 10
+%else
+    mova            [%%p1], m7
+    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
+%endif
+
+    mova [dst0q+strideq*0-16], m2
+    mova [dst0q+strideq*1-16], m3
+    mova [dst0q+strideq*2-16], m4
+    mova [dst0q+stride3q -16], m5
+%if ARCH_X86_64
+    mova [dst4q+strideq*0-16], m6
+%endif
+    mova [dst4q+strideq*1-16], m1
+    mova [dst4q+strideq*2-16], m7
+    mova [dst4q+stride3q -16], m0
+
+    UNSCRATCH            2, 8, %%q0
+    UNSCRATCH            6, 9, %%q1
+    mova                m0, [%%q2]
+    mova                m1, [%%q3]
+    mova                m3, [%%q4]
+    mova                m4, [%%q5]
+%if ARCH_X86_64
+    mova                m5, [%%q6]
+%endif
+    mova                m7, [%%q7]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, 8
+%else
+    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
+%endif
+
+    mova [dst0q+strideq*0], m2
+    mova [dst0q+strideq*1], m6
+    mova [dst0q+strideq*2], m0
+    mova [dst0q+stride3q ], m1
+%if ARCH_X86_64
+    mova [dst4q+strideq*0], m3
+%endif
+    mova [dst4q+strideq*1], m4
+    mova [dst4q+strideq*2], m5
+    mova [dst4q+stride3q ], m7
+%endif ; %2
+%endif ; %1
+    RET
+%endmacro
+
+%macro LOOP_FILTER_CPUSETS 3
+INIT_XMM sse2
+LOOP_FILTER %1, %2, %3
+INIT_XMM ssse3
+LOOP_FILTER %1, %2, %3
+INIT_XMM avx
+LOOP_FILTER %1, %2, %3
+%endmacro
+
+%macro LOOP_FILTER_WDSETS 2
+LOOP_FILTER_CPUSETS %1,  4, %2
+LOOP_FILTER_CPUSETS %1,  8, %2
+LOOP_FILTER_CPUSETS %1, 16, %2
+%endmacro
+
+LOOP_FILTER_WDSETS h, 10
+LOOP_FILTER_WDSETS v, 10
+LOOP_FILTER_WDSETS h, 12
+LOOP_FILTER_WDSETS v, 12
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
new file mode 100644
index 0000000000..9152ba541b
--- /dev/null
+++ b/libavcodec/x86/vp9mc.asm
@@ -0,0 +1,676 @@
+;******************************************************************************
+;* VP9 MC SIMD optimizations
+;*
+;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+cextern pw_256
+cextern pw_64
+
+%macro F8_SSSE3_TAPS 8
+times 16 db %1, %2
+times 16 db %3, %4
+times 16 db %5, %6
+times 16 db %7, %8
+%endmacro
+
+%macro F8_SSE2_TAPS 8
+times 8 dw %1
+times 8 dw %2
+times 8 dw %3
+times 8 dw %4
+times 8 dw %5
+times 8 dw %6
+times 8 dw %7
+times 8 dw %8
+%endmacro
+
+%macro F8_16BPP_TAPS 8
+times 8 dw %1, %2
+times 8 dw %3, %4
+times 8 dw %5, %6
+times 8 dw %7, %8
+%endmacro
+
+%macro FILTER 1
+const filters_%1 ; smooth
+                    F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
+                    F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
+                    F8_TAPS -2, -2,  26,  63,  43,   4, -4,  0
+                    F8_TAPS -2, -3,  24,  62,  46,   5, -4,  0
+                    F8_TAPS -2, -3,  21,  60,  49,   7, -4,  0
+                    F8_TAPS -1, -4,  18,  59,  51,   9, -4,  0
+                    F8_TAPS -1, -4,  16,  57,  53,  12, -4, -1
+                    F8_TAPS -1, -4,  14,  55,  55,  14, -4, -1
+                    F8_TAPS -1, -4,  12,  53,  57,  16, -4, -1
+                    F8_TAPS  0, -4,   9,  51,  59,  18, -4, -1
+                    F8_TAPS  0, -4,   7,  49,  60,  21, -3, -2
+                    F8_TAPS  0, -4,   5,  46,  62,  24, -3, -2
+                    F8_TAPS  0, -4,   4,  43,  63,  26, -2, -2
+                    F8_TAPS  0, -3,   2,  41,  63,  29, -2, -2
+                    F8_TAPS  0, -3,   1,  38,  64,  32, -1, -3
+                    ; regular
+                    F8_TAPS  0,  1,  -5, 126,   8,  -3,  1,  0
+                    F8_TAPS -1,  3, -10, 122,  18,  -6,  2,  0
+                    F8_TAPS -1,  4, -13, 118,  27,  -9,  3, -1
+                    F8_TAPS -1,  4, -16, 112,  37, -11,  4, -1
+                    F8_TAPS -1,  5, -18, 105,  48, -14,  4, -1
+                    F8_TAPS -1,  5, -19,  97,  58, -16,  5, -1
+                    F8_TAPS -1,  6, -19,  88,  68, -18,  5, -1
+                    F8_TAPS -1,  6, -19,  78,  78, -19,  6, -1
+                    F8_TAPS -1,  5, -18,  68,  88, -19,  6, -1
+                    F8_TAPS -1,  5, -16,  58,  97, -19,  5, -1
+                    F8_TAPS -1,  4, -14,  48, 105, -18,  5, -1
+                    F8_TAPS -1,  4, -11,  37, 112, -16,  4, -1
+                    F8_TAPS -1,  3,  -9,  27, 118, -13,  4, -1
+                    F8_TAPS  0,  2,  -6,  18, 122, -10,  3, -1
+                    F8_TAPS  0,  1,  -3,   8, 126,  -5,  1,  0
+                    ; sharp
+                    F8_TAPS -1,  3,  -7, 127,   8,  -3,  1,  0
+                    F8_TAPS -2,  5, -13, 125,  17,  -6,  3, -1
+                    F8_TAPS -3,  7, -17, 121,  27, -10,  5, -2
+                    F8_TAPS -4,  9, -20, 115,  37, -13,  6, -2
+                    F8_TAPS -4, 10, -23, 108,  48, -16,  8, -3
+                    F8_TAPS -4, 10, -24, 100,  59, -19,  9, -3
+                    F8_TAPS -4, 11, -24,  90,  70, -21, 10, -4
+                    F8_TAPS -4, 11, -23,  80,  80, -23, 11, -4
+                    F8_TAPS -4, 10, -21,  70,  90, -24, 11, -4
+                    F8_TAPS -3,  9, -19,  59, 100, -24, 10, -4
+                    F8_TAPS -3,  8, -16,  48, 108, -23, 10, -4
+                    F8_TAPS -2,  6, -13,  37, 115, -20,  9, -4
+                    F8_TAPS -2,  5, -10,  27, 121, -17,  7, -3
+                    F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
+                    F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1
+%endmacro
+
+%define F8_TAPS F8_SSSE3_TAPS
+; int8_t ff_filters_ssse3[3][15][4][32]
+FILTER ssse3
+%define F8_TAPS F8_SSE2_TAPS
+; int16_t ff_filters_sse2[3][15][8][8]
+FILTER sse2
+%define F8_TAPS F8_16BPP_TAPS
+; int16_t ff_filters_16bpp[3][15][4][16]
+FILTER 16bpp
+
+SECTION .text
+
+%macro filter_sse2_h_fn 1
+%assign %%px mmsize/2
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
+    pxor        m5, m5
+    mova        m6, [pw_64]
+    mova        m7, [filteryq+  0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+ 16]
+    mova        m9, [filteryq+ 32]
+    mova       m10, [filteryq+ 48]
+    mova       m11, [filteryq+ 64]
+    mova       m12, [filteryq+ 80]
+    mova       m13, [filteryq+ 96]
+    mova       m14, [filteryq+112]
+%endif
+.loop:
+    movh        m0, [srcq-3]
+    movh        m1, [srcq-2]
+    movh        m2, [srcq-1]
+    movh        m3, [srcq+0]
+    movh        m4, [srcq+1]
+    punpcklbw   m0, m5
+    punpcklbw   m1, m5
+    punpcklbw   m2, m5
+    punpcklbw   m3, m5
+    punpcklbw   m4, m5
+    pmullw      m0, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmullw      m1, m8
+    pmullw      m2, m9
+    pmullw      m3, m10
+    pmullw      m4, m11
+%else
+    pmullw      m1, [filteryq+ 16]
+    pmullw      m2, [filteryq+ 32]
+    pmullw      m3, [filteryq+ 48]
+    pmullw      m4, [filteryq+ 64]
+%endif
+    paddw       m0, m1
+    paddw       m2, m3
+    paddw       m0, m4
+    movh        m1, [srcq+2]
+    movh        m3, [srcq+3]
+    movh        m4, [srcq+4]
+    add       srcq, sstrideq
+    punpcklbw   m1, m5
+    punpcklbw   m3, m5
+    punpcklbw   m4, m5
+%if ARCH_X86_64 && mmsize > 8
+    pmullw      m1, m12
+    pmullw      m3, m13
+    pmullw      m4, m14
+%else
+    pmullw      m1, [filteryq+ 80]
+    pmullw      m3, [filteryq+ 96]
+    pmullw      m4, [filteryq+112]
+%endif
+    paddw       m0, m1
+    paddw       m3, m4
+    paddw       m0, m6
+    paddw       m2, m3
+    paddsw      m0, m2
+    psraw       m0, 7
+%ifidn %1, avg
+    movh        m1, [dstq]
+%endif
+    packuswb    m0, m0
+%ifidn %1, avg
+    pavgb       m0, m1
+%endif
+    movh    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_MMX mmxext
+filter_sse2_h_fn put
+filter_sse2_h_fn avg
+
+INIT_XMM sse2
+filter_sse2_h_fn put
+filter_sse2_h_fn avg
+
+%macro filter_h_fn 1
+%assign %%px mmsize/2
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
+    mova        m6, [pw_256]
+    mova        m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+32]
+    mova        m9, [filteryq+64]
+    mova       m10, [filteryq+96]
+%endif
+.loop:
+    movh        m0, [srcq-3]
+    movh        m1, [srcq-2]
+    movh        m2, [srcq-1]
+    movh        m3, [srcq+0]
+    movh        m4, [srcq+1]
+    movh        m5, [srcq+2]
+    punpcklbw   m0, m1
+    punpcklbw   m2, m3
+    movh        m1, [srcq+3]
+    movh        m3, [srcq+4]
+    add       srcq, sstrideq
+    punpcklbw   m4, m5
+    punpcklbw   m1, m3
+    pmaddubsw   m0, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddubsw   m2, m8
+    pmaddubsw   m4, m9
+    pmaddubsw   m1, m10
+%else
+    pmaddubsw   m2, [filteryq+32]
+    pmaddubsw   m4, [filteryq+64]
+    pmaddubsw   m1, [filteryq+96]
+%endif
+    paddw       m0, m4
+    paddw       m2, m1
+    paddsw      m0, m2
+    pmulhrsw    m0, m6
+%ifidn %1, avg
+    movh        m1, [dstq]
+%endif
+    packuswb    m0, m0
+%ifidn %1, avg
+    pavgb       m0, m1
+%endif
+    movh    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_MMX ssse3
+filter_h_fn put
+filter_h_fn avg
+
+INIT_XMM ssse3
+filter_h_fn put
+filter_h_fn avg
+
+%if ARCH_X86_64
+%macro filter_hx2_fn 1
+%assign %%px mmsize
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
+    mova       m13, [pw_256]
+    mova        m8, [filteryq+ 0]
+    mova        m9, [filteryq+32]
+    mova       m10, [filteryq+64]
+    mova       m11, [filteryq+96]
+.loop:
+    movu        m0, [srcq-3]
+    movu        m1, [srcq-2]
+    movu        m2, [srcq-1]
+    movu        m3, [srcq+0]
+    movu        m4, [srcq+1]
+    movu        m5, [srcq+2]
+    movu        m6, [srcq+3]
+    movu        m7, [srcq+4]
+    add       srcq, sstrideq
+    SBUTTERFLY  bw, 0, 1, 12
+    SBUTTERFLY  bw, 2, 3, 12
+    SBUTTERFLY  bw, 4, 5, 12
+    SBUTTERFLY  bw, 6, 7, 12
+    pmaddubsw   m0, m8
+    pmaddubsw   m1, m8
+    pmaddubsw   m2, m9
+    pmaddubsw   m3, m9
+    pmaddubsw   m4, m10
+    pmaddubsw   m5, m10
+    pmaddubsw   m6, m11
+    pmaddubsw   m7, m11
+    paddw       m0, m4
+    paddw       m1, m5
+    paddw       m2, m6
+    paddw       m3, m7
+    paddsw      m0, m2
+    paddsw      m1, m3
+    pmulhrsw    m0, m13
+    pmulhrsw    m1, m13
+    packuswb    m0, m1
+%ifidn %1, avg
+    pavgb       m0, [dstq]
+%endif
+    mova    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+filter_hx2_fn put
+filter_hx2_fn avg
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_hx2_fn put
+filter_hx2_fn avg
+%endif
+
+%endif ; ARCH_X86_64
+
+%macro filter_sse2_v_fn 1
+%assign %%px mmsize/2
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%define hd r4mp
+%endif
+    pxor        m5, m5
+    mova        m6, [pw_64]
+    lea  sstride3q, [sstrideq*3]
+    lea      src4q, [srcq+sstrideq]
+    sub       srcq, sstride3q
+    mova        m7, [filteryq+  0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+ 16]
+    mova        m9, [filteryq+ 32]
+    mova       m10, [filteryq+ 48]
+    mova       m11, [filteryq+ 64]
+    mova       m12, [filteryq+ 80]
+    mova       m13, [filteryq+ 96]
+    mova       m14, [filteryq+112]
+%endif
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movh        m0, [srcq]
+    movh        m1, [srcq+sstrideq]
+    movh        m2, [srcq+sstrideq*2]
+    movh        m3, [srcq+sstride3q]
+    add       srcq, sstrideq
+    movh        m4, [src4q]
+    punpcklbw   m0, m5
+    punpcklbw   m1, m5
+    punpcklbw   m2, m5
+    punpcklbw   m3, m5
+    punpcklbw   m4, m5
+    pmullw      m0, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmullw      m1, m8
+    pmullw      m2, m9
+    pmullw      m3, m10
+    pmullw      m4, m11
+%else
+    pmullw      m1, [filteryq+ 16]
+    pmullw      m2, [filteryq+ 32]
+    pmullw      m3, [filteryq+ 48]
+    pmullw      m4, [filteryq+ 64]
+%endif
+    paddw       m0, m1
+    paddw       m2, m3
+    paddw       m0, m4
+    movh        m1, [src4q+sstrideq]
+    movh        m3, [src4q+sstrideq*2]
+    movh        m4, [src4q+sstride3q]
+    add      src4q, sstrideq
+    punpcklbw   m1, m5
+    punpcklbw   m3, m5
+    punpcklbw   m4, m5
+%if ARCH_X86_64 && mmsize > 8
+    pmullw      m1, m12
+    pmullw      m3, m13
+    pmullw      m4, m14
+%else
+    pmullw      m1, [filteryq+ 80]
+    pmullw      m3, [filteryq+ 96]
+    pmullw      m4, [filteryq+112]
+%endif
+    paddw       m0, m1
+    paddw       m3, m4
+    paddw       m0, m6
+    paddw       m2, m3
+    paddsw      m0, m2
+    psraw       m0, 7
+%ifidn %1, avg
+    movh        m1, [dstq]
+%endif
+    packuswb    m0, m0
+%ifidn %1, avg
+    pavgb       m0, m1
+%endif
+    movh    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_MMX mmxext
+filter_sse2_v_fn put
+filter_sse2_v_fn avg
+
+INIT_XMM sse2
+filter_sse2_v_fn put
+filter_sse2_v_fn avg
+
+%macro filter_v_fn 1
+%assign %%px mmsize/2
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%define hd r4mp
+%endif
+    mova        m6, [pw_256]
+    lea  sstride3q, [sstrideq*3]
+    lea      src4q, [srcq+sstrideq]
+    sub       srcq, sstride3q
+    mova        m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+32]
+    mova        m9, [filteryq+64]
+    mova       m10, [filteryq+96]
+%endif
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movh        m0, [srcq]
+    movh        m1, [srcq+sstrideq]
+    movh        m2, [srcq+sstrideq*2]
+    movh        m3, [srcq+sstride3q]
+    movh        m4, [src4q]
+    movh        m5, [src4q+sstrideq]
+    punpcklbw   m0, m1
+    punpcklbw   m2, m3
+    movh        m1, [src4q+sstrideq*2]
+    movh        m3, [src4q+sstride3q]
+    add       srcq, sstrideq
+    add      src4q, sstrideq
+    punpcklbw   m4, m5
+    punpcklbw   m1, m3
+    pmaddubsw   m0, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddubsw   m2, m8
+    pmaddubsw   m4, m9
+    pmaddubsw   m1, m10
+%else
+    pmaddubsw   m2, [filteryq+32]
+    pmaddubsw   m4, [filteryq+64]
+    pmaddubsw   m1, [filteryq+96]
+%endif
+    paddw       m0, m4
+    paddw       m2, m1
+    paddsw      m0, m2
+    pmulhrsw    m0, m6
+%ifidn %1, avg
+    movh        m1, [dstq]
+%endif
+    packuswb    m0, m0
+%ifidn %1, avg
+    pavgb       m0, m1
+%endif
+    movh    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_MMX ssse3
+filter_v_fn put
+filter_v_fn avg
+
+INIT_XMM ssse3
+filter_v_fn put
+filter_v_fn avg
+
+%if ARCH_X86_64
+
+%macro filter_vx2_fn 1
+%assign %%px mmsize
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
+    mova       m13, [pw_256]
+    lea  sstride3q, [sstrideq*3]
+    lea      src4q, [srcq+sstrideq]
+    sub       srcq, sstride3q
+    mova        m8, [filteryq+ 0]
+    mova        m9, [filteryq+32]
+    mova       m10, [filteryq+64]
+    mova       m11, [filteryq+96]
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movu        m0, [srcq]
+    movu        m1, [srcq+sstrideq]
+    movu        m2, [srcq+sstrideq*2]
+    movu        m3, [srcq+sstride3q]
+    movu        m4, [src4q]
+    movu        m5, [src4q+sstrideq]
+    movu        m6, [src4q+sstrideq*2]
+    movu        m7, [src4q+sstride3q]
+    add       srcq, sstrideq
+    add      src4q, sstrideq
+    SBUTTERFLY  bw, 0, 1, 12
+    SBUTTERFLY  bw, 2, 3, 12
+    SBUTTERFLY  bw, 4, 5, 12
+    SBUTTERFLY  bw, 6, 7, 12
+    pmaddubsw   m0, m8
+    pmaddubsw   m1, m8
+    pmaddubsw   m2, m9
+    pmaddubsw   m3, m9
+    pmaddubsw   m4, m10
+    pmaddubsw   m5, m10
+    pmaddubsw   m6, m11
+    pmaddubsw   m7, m11
+    paddw       m0, m4
+    paddw       m1, m5
+    paddw       m2, m6
+    paddw       m3, m7
+    paddsw      m0, m2
+    paddsw      m1, m3
+    pmulhrsw    m0, m13
+    pmulhrsw    m1, m13
+    packuswb    m0, m1
+%ifidn %1, avg
+    pavgb       m0, [dstq]
+%endif
+    mova    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+filter_vx2_fn put
+filter_vx2_fn avg
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_vx2_fn put
+filter_vx2_fn avg
+%endif
+
+%endif ; ARCH_X86_64
+
+%macro fpel_fn 6-8 0, 4
+%if %2 == 4
+%define %%srcfn movh
+%define %%dstfn movh
+%else
+%define %%srcfn movu
+%define %%dstfn mova
+%endif
+
+%if %7 == 8
+%define %%pavg pavgb
+%define %%szsuf _8
+%elif %7 == 16
+%define %%pavg pavgw
+%define %%szsuf _16
+%else
+%define %%szsuf
+%endif
+
+%if %2 <= mmsize
+cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
+    lea  sstride3q, [sstrideq*3]
+    lea  dstride3q, [dstrideq*3]
+%else
+cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
+%endif
+.loop:
+    %%srcfn     m0, [srcq]
+    %%srcfn     m1, [srcq+s%3]
+    %%srcfn     m2, [srcq+s%4]
+    %%srcfn     m3, [srcq+s%5]
+%if %2/mmsize == 8
+    %%srcfn     m4, [srcq+mmsize*4]
+    %%srcfn     m5, [srcq+mmsize*5]
+    %%srcfn     m6, [srcq+mmsize*6]
+    %%srcfn     m7, [srcq+mmsize*7]
+%endif
+    lea       srcq, [srcq+sstrideq*%6]
+%ifidn %1, avg
+    %%pavg      m0, [dstq]
+    %%pavg      m1, [dstq+d%3]
+    %%pavg      m2, [dstq+d%4]
+    %%pavg      m3, [dstq+d%5]
+%if %2/mmsize == 8
+    %%pavg      m4, [dstq+mmsize*4]
+    %%pavg      m5, [dstq+mmsize*5]
+    %%pavg      m6, [dstq+mmsize*6]
+    %%pavg      m7, [dstq+mmsize*7]
+%endif
+%endif
+    %%dstfn [dstq], m0
+    %%dstfn [dstq+d%3], m1
+    %%dstfn [dstq+d%4], m2
+    %%dstfn [dstq+d%5], m3
+%if %2/mmsize == 8
+    %%dstfn [dstq+mmsize*4], m4
+    %%dstfn [dstq+mmsize*5], m5
+    %%dstfn [dstq+mmsize*6], m6
+    %%dstfn [dstq+mmsize*7], m7
+%endif
+    lea       dstq, [dstq+dstrideq*%6]
+    sub         hd, %6
+    jnz .loop
+    RET
+%endmacro
+
+%define d16 16
+%define s16 16
+%define d32 32
+%define s32 32
+INIT_MMX mmx
+fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
+fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
+INIT_MMX mmxext
+fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4, 8
+INIT_XMM sse
+fpel_fn put, 16, strideq, strideq*2, stride3q, 4
+fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
+fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
+fpel_fn put, 128, mmsize, mmsize*2,  mmsize*3, 1, 0, 8
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2, 8
+fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1, 8
+INIT_YMM avx
+fpel_fn put, 32, strideq, strideq*2, stride3q, 4
+fpel_fn put, 64, mmsize,  strideq,   strideq+mmsize, 2
+fpel_fn put, 128, mmsize, mmsize*2,     mmsize*3, 1
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 64, mmsize,  strideq,   strideq+mmsize, 2, 8
+%endif
+INIT_MMX mmxext
+fpel_fn avg,  8,  strideq, strideq*2, stride3q, 4, 16
+INIT_XMM sse2
+fpel_fn avg,  16, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg,  32, mmsize,  strideq,   strideq+mmsize, 2, 16
+fpel_fn avg,  64, mmsize,  mmsize*2,  mmsize*3, 1, 16
+fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16, 8
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg,  32, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg,  64, mmsize,  strideq,   strideq+mmsize, 2, 16
+fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16
+%endif
+%undef s16
+%undef d16
+%undef s32
+%undef d32
diff --git a/libavcodec/x86/vp9mc_16bpp.asm b/libavcodec/x86/vp9mc_16bpp.asm
new file mode 100644
index 0000000000..9a462eaf80
--- /dev/null
+++ b/libavcodec/x86/vp9mc_16bpp.asm
@@ -0,0 +1,431 @@
+;******************************************************************************
+;* VP9 MC SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_64: times 8 dd 64
+
+cextern pw_1023
+cextern pw_4095
+
+SECTION .text
+
+%macro filter_h4_fn 1-2 12
+cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+    pxor       m11, m11
+%endif
+    mova        m6, [pd_64]
+    mova        m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+32]
+    mova        m9, [filteryq+64]
+    mova       m10, [filteryq+96]
+%endif
+.loop:
+    movh        m0, [srcq-6]
+    movh        m1, [srcq-4]
+    movh        m2, [srcq-2]
+    movh        m3, [srcq+0]
+    movh        m4, [srcq+2]
+    punpcklwd   m0, m1
+    punpcklwd   m2, m3
+    pmaddwd     m0, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m8
+%else
+    pmaddwd     m2, [filteryq+32]
+%endif
+    movu        m1, [srcq+4]
+    movu        m3, [srcq+6]
+    paddd       m0, m2
+    movu        m2, [srcq+8]
+    add       srcq, sstrideq
+    punpcklwd   m4, m1
+    punpcklwd   m3, m2
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m4, m9
+    pmaddwd     m3, m10
+%else
+    pmaddwd     m4, [filteryq+64]
+    pmaddwd     m3, [filteryq+96]
+%endif
+    paddd       m0, m4
+    paddd       m0, m3
+    paddd       m0, m6
+    psrad       m0, 7
+%if cpuflag(sse4)
+    packusdw    m0, m0
+%else
+    packssdw    m0, m0
+%endif
+%ifidn %1, avg
+    movh        m1, [dstq]
+%endif
+    pminsw      m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+    pmaxsw      m0, m11
+%else
+    pxor        m2, m2
+    pmaxsw      m0, m2
+%endif
+%endif
+%ifidn %1, avg
+    pavgw       m0, m1
+%endif
+    movh    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+
+cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m5, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_h4_fn put
+filter_h4_fn avg
+
+%macro filter_h_fn 1-2 12
+%assign %%px mmsize/2
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+    pxor       m11, m11
+%endif
+    mova        m6, [pd_64]
+    mova        m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+32]
+    mova        m9, [filteryq+64]
+    mova       m10, [filteryq+96]
+%endif
+.loop:
+    movu        m0, [srcq-6]
+    movu        m1, [srcq-4]
+    movu        m2, [srcq-2]
+    movu        m3, [srcq+0]
+    movu        m4, [srcq+2]
+    pmaddwd     m0, m7
+    pmaddwd     m1, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m8
+    pmaddwd     m3, m8
+    pmaddwd     m4, m9
+%else
+    pmaddwd     m2, [filteryq+32]
+    pmaddwd     m3, [filteryq+32]
+    pmaddwd     m4, [filteryq+64]
+%endif
+    paddd       m0, m2
+    paddd       m1, m3
+    paddd       m0, m4
+    movu        m2, [srcq+4]
+    movu        m3, [srcq+6]
+    movu        m4, [srcq+8]
+    add       srcq, sstrideq
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m9
+    pmaddwd     m3, m10
+    pmaddwd     m4, m10
+%else
+    pmaddwd     m2, [filteryq+64]
+    pmaddwd     m3, [filteryq+96]
+    pmaddwd     m4, [filteryq+96]
+%endif
+    paddd       m1, m2
+    paddd       m0, m3
+    paddd       m1, m4
+    paddd       m0, m6
+    paddd       m1, m6
+    psrad       m0, 7
+    psrad       m1, 7
+%if cpuflag(sse4)
+    packusdw    m0, m0
+    packusdw    m1, m1
+%else
+    packssdw    m0, m0
+    packssdw    m1, m1
+%endif
+    punpcklwd   m0, m1
+    pminsw      m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+    pmaxsw      m0, m11
+%else
+    pxor        m2, m2
+    pmaxsw      m0, m2
+%endif
+%endif
+%ifidn %1, avg
+    pavgw       m0, [dstq]
+%endif
+    mova    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m5, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_h_fn put
+filter_h_fn avg
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_h_fn put
+filter_h_fn avg
+%endif
+
+%macro filter_v4_fn 1-2 12
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%define hd r4mp
+%endif
+    mova        m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+    pxor       m11, m11
+%endif
+    mova        m6, [pd_64]
+    lea  sstride3q, [sstrideq*3]
+    lea      src4q, [srcq+sstrideq]
+    sub       srcq, sstride3q
+    mova        m7, [filteryq+  0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+ 32]
+    mova        m9, [filteryq+ 64]
+    mova       m10, [filteryq+ 96]
+%endif
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movh        m0, [srcq]
+    movh        m1, [srcq+sstrideq]
+    movh        m2, [srcq+sstrideq*2]
+    movh        m3, [srcq+sstride3q]
+    add       srcq, sstrideq
+    movh        m4, [src4q]
+    punpcklwd   m0, m1
+    punpcklwd   m2, m3
+    pmaddwd     m0, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m8
+%else
+    pmaddwd     m2, [filteryq+ 32]
+%endif
+    movh        m1, [src4q+sstrideq]
+    movh        m3, [src4q+sstrideq*2]
+    paddd       m0, m2
+    movh        m2, [src4q+sstride3q]
+    add      src4q, sstrideq
+    punpcklwd   m4, m1
+    punpcklwd   m3, m2
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m4, m9
+    pmaddwd     m3, m10
+%else
+    pmaddwd     m4, [filteryq+ 64]
+    pmaddwd     m3, [filteryq+ 96]
+%endif
+    paddd       m0, m4
+    paddd       m0, m3
+    paddd       m0, m6
+    psrad       m0, 7
+%if cpuflag(sse4)
+    packusdw    m0, m0
+%else
+    packssdw    m0, m0
+%endif
+%ifidn %1, avg
+    movh        m1, [dstq]
+%endif
+    pminsw      m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+    pmaxsw      m0, m11
+%else
+    pxor        m2, m2
+    pmaxsw      m0, m2
+%endif
+%endif
+%ifidn %1, avg
+    pavgw       m0, m1
+%endif
+    movh    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%endif
+    mova        m5, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_v4_fn put
+filter_v4_fn avg
+
+%macro filter_v_fn 1-2 13
+%assign %%px mmsize/2
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%define hd r4mp
+%endif
+    mova        m5, [pw_1023]
+.body:
+%if notcpuflag(sse4) && ARCH_X86_64
+    pxor       m12, m12
+%endif
+%if ARCH_X86_64
+    mova       m11, [pd_64]
+%endif
+    lea  sstride3q, [sstrideq*3]
+    lea      src4q, [srcq+sstrideq]
+    sub       srcq, sstride3q
+    mova        m7, [filteryq+  0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+ 32]
+    mova        m9, [filteryq+ 64]
+    mova       m10, [filteryq+ 96]
+%endif
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movu        m0, [srcq]
+    movu        m1, [srcq+sstrideq]
+    movu        m2, [srcq+sstrideq*2]
+    movu        m3, [srcq+sstride3q]
+    add       srcq, sstrideq
+    movu        m4, [src4q]
+    SBUTTERFLY  wd, 0, 1, 6
+    SBUTTERFLY  wd, 2, 3, 6
+    pmaddwd     m0, m7
+    pmaddwd     m1, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m8
+    pmaddwd     m3, m8
+%else
+    pmaddwd     m2, [filteryq+ 32]
+    pmaddwd     m3, [filteryq+ 32]
+%endif
+    paddd       m0, m2
+    paddd       m1, m3
+    movu        m2, [src4q+sstrideq]
+    movu        m3, [src4q+sstrideq*2]
+    SBUTTERFLY  wd, 4, 2, 6
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m4, m9
+    pmaddwd     m2, m9
+%else
+    pmaddwd     m4, [filteryq+ 64]
+    pmaddwd     m2, [filteryq+ 64]
+%endif
+    paddd       m0, m4
+    paddd       m1, m2
+    movu        m4, [src4q+sstride3q]
+    add      src4q, sstrideq
+    SBUTTERFLY  wd, 3, 4, 6
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m3, m10
+    pmaddwd     m4, m10
+%else
+    pmaddwd     m3, [filteryq+ 96]
+    pmaddwd     m4, [filteryq+ 96]
+%endif
+    paddd       m0, m3
+    paddd       m1, m4
+%if ARCH_X86_64
+    paddd       m0, m11
+    paddd       m1, m11
+%else
+    paddd       m0, [pd_64]
+    paddd       m1, [pd_64]
+%endif
+    psrad       m0, 7
+    psrad       m1, 7
+%if cpuflag(sse4)
+    packusdw    m0, m1
+%else
+    packssdw    m0, m1
+%endif
+    pminsw      m0, m5
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+    pmaxsw      m0, m12
+%else
+    pxor        m2, m2
+    pmaxsw      m0, m2
+%endif
+%endif
+%ifidn %1, avg
+    pavgw       m0, [dstq]
+%endif
+    mova    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%endif
+    mova        m5, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2
+filter_v_fn put
+filter_v_fn avg
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_v_fn put
+filter_v_fn avg
+%endif
diff --git a/libavcodec/x86/w64xmmtest.c b/libavcodec/x86/w64xmmtest.c
index 2f064cad7b..94b3049a7e 100644
--- a/libavcodec/x86/w64xmmtest.c
+++ b/libavcodec/x86/w64xmmtest.c
@@ -2,20 +2,20 @@
  * check XMM registers for clobbers on Win64
  * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
new file mode 100644
index 0000000000..0220885da6
--- /dev/null
+++ b/libavcodec/x86/xvididct.asm
@@ -0,0 +1,983 @@
+; XVID MPEG-4 VIDEO CODEC
+;
+; Conversion from gcc syntax to x264asm syntax with modifications
+; by Christophe Gisquet <christophe.gisquet@gmail.com>
+;
+; ===========     SSE2 inverse discrete cosine transform     ===========
+;
+; Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
+;
+; Conversion to gcc syntax with modifications
+; by Alexander Strange <astrange@ithinksw.com>
+;
+; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
+;
+; Vertical pass is an implementation of the scheme:
+;  Loeffler C., Ligtenberg A., and Moschytz C.S.:
+;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
+;  Proc. ICASSP 1989, 988-991.
+;
+; Horizontal pass is a double 4x4 vector/matrix multiplication,
+; (see also Intel's Application Note 922:
+;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+;  Copyright (C) 1999 Intel Corporation)
+;
+; More details at http://skal.planet-d.net/coding/dct.html
+;
+; =======     MMX and XMM forward discrete cosine transform     =======
+;
+; Copyright(C) 2001 Peter Ross <pross@xvid.org>
+;
+; Originally provided by Intel at AP-922
+; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+; (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
+; but in a limited edition.
+; New macro implements a column part for precise iDCT
+; The routine precision now satisfies IEEE standard 1180-1990.
+;
+; Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
+; Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
+;
+; http://www.elecard.com/peter/idct.html
+; http://www.linuxvideo.org/mpeg2dec/
+;
+; These examples contain code fragments for first stage iDCT 8x8
+; (for rows) and first stage DCT 8x8 (for columns)
+;
+; conversion to gcc syntax by Michael Niedermayer
+;
+; ======================================================================
+;
+; This file is part of FFmpeg.
+;
+; FFmpeg is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2.1 of the License, or (at your option) any later version.
+;
+; FFmpeg is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public License
+; along with FFmpeg; if not, write to the Free Software Foundation,
+; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+; Similar to tg_1_16 in MMX code
+tan1:   times 8 dw 13036
+tan2:   times 8 dw 27146
+tan3:   times 8 dw 43790
+sqrt2:  times 8 dw 23170
+
+; SSE2 tables
+iTab1:  dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
+        dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
+        dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7
+        dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
+iTab2:  dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5
+        dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04
+        dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41
+        dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
+iTab3:  dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf
+        dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf
+        dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d
+        dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
+iTab4:  dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
+        dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac
+        dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
+        dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
+
+%if ARCH_X86_32
+; -----------------------------------------------------------------------------
+;
+; The first stage iDCT 8x8 - inverse DCTs of rows
+;
+; -----------------------------------------------------------------------------
+; The 8-point inverse DCT direct algorithm
+; -----------------------------------------------------------------------------
+;
+; static const short w[32] = {
+;     FIX(cos_4_16),  FIX(cos_2_16),  FIX(cos_4_16),  FIX(cos_6_16),
+;     FIX(cos_4_16),  FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
+;     FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16),  FIX(cos_2_16),
+;     FIX(cos_4_16), -FIX(cos_2_16),  FIX(cos_4_16), -FIX(cos_6_16),
+;     FIX(cos_1_16),  FIX(cos_3_16),  FIX(cos_5_16),  FIX(cos_7_16),
+;     FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
+;     FIX(cos_5_16), -FIX(cos_1_16),  FIX(cos_7_16),  FIX(cos_3_16),
+;     FIX(cos_7_16), -FIX(cos_5_16),  FIX(cos_3_16), -FIX(cos_1_16) };
+;
+; #define DCT_8_INV_ROW(x, y)
+; {
+;     int a0, a1, a2, a3, b0, b1, b2, b3;
+;
+;     a0 = x[0] * w[0]  + x[2] * w[1]  + x[4] * w[2]  + x[6] * w[3];
+;     a1 = x[0] * w[4]  + x[2] * w[5]  + x[4] * w[6]  + x[6] * w[7];
+;     a2 = x[0] * w[8]  + x[2] * w[9]  + x[4] * w[10] + x[6] * w[11];
+;     a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
+;     b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
+;     b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
+;     b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
+;     b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
+;
+;     y[0] = SHIFT_ROUND(a0 + b0);
+;     y[1] = SHIFT_ROUND(a1 + b1);
+;     y[2] = SHIFT_ROUND(a2 + b2);
+;     y[3] = SHIFT_ROUND(a3 + b3);
+;     y[4] = SHIFT_ROUND(a3 - b3);
+;     y[5] = SHIFT_ROUND(a2 - b2);
+;     y[6] = SHIFT_ROUND(a1 - b1);
+;     y[7] = SHIFT_ROUND(a0 - b0);
+; }
+;
+; -----------------------------------------------------------------------------
+;
+; In this implementation the outputs of the iDCT-1D are multiplied
+;     for rows 0,4 - by cos_4_16,
+;     for rows 1,7 - by cos_1_16,
+;     for rows 2,6 - by cos_2_16,
+;     for rows 3,5 - by cos_3_16
+; and are shifted to the left for better accuracy.
+;
+; For the constants used,
+;     FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
+;
+; -----------------------------------------------------------------------------
+
+; -----------------------------------------------------------------------------
+; Tables for mmx processors
+; -----------------------------------------------------------------------------
+
+; Table for rows 0,4 - constants are multiplied by cos_4_16
+tab_i_04_mmx: dw  16384,  16384,  16384, -16384
+              dw  21407,   8867,   8867, -21407 ; w07 w05 w03 w01
+              dw  16384, -16384,  16384,  16384 ; w14 w12 w10 w08
+              dw  -8867,  21407, -21407,  -8867 ; w15 w13 w11 w09
+              dw  22725,  12873,  19266, -22725 ; w22 w20 w18 w16
+              dw  19266,   4520,  -4520, -12873 ; w23 w21 w19 w17
+              dw  12873,   4520,   4520,  19266 ; w30 w28 w26 w24
+              dw -22725,  19266, -12873, -22725 ; w31 w29 w27 w25
+; Table for rows 1,7 - constants are multiplied by cos_1_16
+              dw  22725,  22725,  22725, -22725 ; movq-> w06 w04 w02 w00
+              dw  29692,  12299,  12299, -29692 ; w07 w05 w03 w01
+              dw  22725, -22725,  22725,  22725 ; w14 w12 w10 w08
+              dw -12299,  29692, -29692, -12299 ; w15 w13 w11 w09
+              dw  31521,  17855,  26722, -31521 ; w22 w20 w18 w16
+              dw  26722,   6270,  -6270, -17855 ; w23 w21 w19 w17
+              dw  17855,   6270,   6270,  26722 ; w30 w28 w26 w24
+              dw -31521,  26722, -17855, -31521 ; w31 w29 w27 w25
+; Table for rows 2,6 - constants are multiplied by cos_2_16
+              dw  21407,  21407,  21407, -21407 ; movq-> w06 w04 w02 w00
+              dw  27969,  11585,  11585, -27969 ; w07 w05 w03 w01
+              dw  21407, -21407,  21407,  21407 ; w14 w12 w10 w08
+              dw -11585,  27969, -27969, -11585 ; w15 w13 w11 w09
+              dw  29692,  16819,  25172, -29692 ; w22 w20 w18 w16
+              dw  25172,   5906,  -5906, -16819 ; w23 w21 w19 w17
+              dw  16819,   5906,   5906,  25172 ; w30 w28 w26 w24
+              dw -29692,  25172, -16819, -29692 ; w31 w29 w27 w25
+; Table for rows 3,5 - constants are multiplied by cos_3_16
+              dw  19266,  19266,  19266, -19266 ; movq-> w06 w04 w02 w00
+              dw  25172,  10426,  10426, -25172 ; w07 w05 w03 w01
+              dw  19266, -19266,  19266,  19266 ; w14 w12 w10 w08
+              dw -10426,  25172, -25172, -10426 ; w15 w13 w11 w09
+              dw  26722,  15137,  22654, -26722 ; w22 w20 w18 w16
+              dw  22654,   5315,  -5315, -15137 ; w23 w21 w19 w17
+              dw  15137,   5315,   5315,  22654 ; w30 w28 w26 w24
+              dw -26722,  22654, -15137, -26722 ; w31 w29 w27 w25
+
+; -----------------------------------------------------------------------------
+; Tables for xmm processors
+; -----------------------------------------------------------------------------
+
+; %3 for rows 0,4 - constants are multiplied by cos_4_16
+tab_i_04_xmm: dw  16384,  21407,  16384,   8867 ; movq-> w05 w04 w01 w00
+              dw  16384,   8867, -16384, -21407 ; w07 w06 w03 w02
+              dw  16384,  -8867,  16384, -21407 ; w13 w12 w09 w08
+              dw -16384,  21407,  16384,  -8867 ; w15 w14 w11 w10
+              dw  22725,  19266,  19266,  -4520 ; w21 w20 w17 w16
+              dw  12873,   4520, -22725, -12873 ; w23 w22 w19 w18
+              dw  12873, -22725,   4520, -12873 ; w29 w28 w25 w24
+              dw   4520,  19266,  19266, -22725 ; w31 w30 w27 w26
+; %3 for rows 1,7 - constants are multiplied by cos_1_16
+              dw  22725,  29692,  22725,  12299 ; movq-> w05 w04 w01 w00
+              dw  22725,  12299, -22725, -29692 ; w07 w06 w03 w02
+              dw  22725, -12299,  22725, -29692 ; w13 w12 w09 w08
+              dw -22725,  29692,  22725, -12299 ; w15 w14 w11 w10
+              dw  31521,  26722,  26722,  -6270 ; w21 w20 w17 w16
+              dw  17855,   6270, -31521, -17855 ; w23 w22 w19 w18
+              dw  17855, -31521,   6270, -17855 ; w29 w28 w25 w24
+              dw   6270,  26722,  26722, -31521 ; w31 w30 w27 w26
+; %3 for rows 2,6 - constants are multiplied by cos_2_16
+              dw  21407,  27969,  21407,  11585 ; movq-> w05 w04 w01 w00
+              dw  21407,  11585, -21407, -27969 ; w07 w06 w03 w02
+              dw  21407, -11585,  21407, -27969 ; w13 w12 w09 w08
+              dw -21407,  27969,  21407, -11585 ; w15 w14 w11 w10
+              dw  29692,  25172,  25172,  -5906 ; w21 w20 w17 w16
+              dw  16819,   5906, -29692, -16819 ; w23 w22 w19 w18
+              dw  16819, -29692,   5906, -16819 ; w29 w28 w25 w24
+              dw   5906,  25172,  25172, -29692 ; w31 w30 w27 w26
+; %3 for rows 3,5 - constants are multiplied by cos_3_16
+              dw  19266,  25172,  19266,  10426 ; movq-> w05 w04 w01 w00
+              dw  19266,  10426, -19266, -25172 ; w07 w06 w03 w02
+              dw  19266, -10426,  19266, -25172 ; w13 w12 w09 w08
+              dw -19266,  25172,  19266, -10426 ; w15 w14 w11 w10
+              dw  26722,  22654,  22654,  -5315 ; w21 w20 w17 w16
+              dw  15137,   5315, -26722, -15137 ; w23 w22 w19 w18
+              dw  15137, -26722,   5315, -15137 ; w29 w28 w25 w24
+              dw   5315,  22654,  22654, -26722 ; w31 w30 w27 w26
+%endif ; ~ARCH_X86_32
+
+; Similar to rounder_0 in MMX code
+; 4 first similar, then: 4*8->6*16  5*8->4*16  6/7*8->5*16
+walkenIdctRounders: times 4 dd 65536
+                    times 4 dd  3597
+                    times 4 dd  2260
+                    times 4 dd  1203
+                    times 4 dd   120
+                    times 4 dd   512
+                    times 2 dd     0
+
+pb_127: times 8 db 127
+
+SECTION .text
+
+; Temporary storage before the column pass
+%define ROW1 xmm6
+%define ROW3 xmm4
+%define ROW5 xmm5
+%define ROW7 xmm7
+
+%macro CLEAR_ODD 1
+    pxor      %1, %1
+%endmacro
+%macro PUT_ODD 1
+    pshufhw   %1, xmm2, 0x1B
+%endmacro
+
+%macro MOV32 2
+%if ARCH_X86_32
+    movdqa    %2, %1
+%endif
+%endmacro
+
+%macro CLEAR_EVEN 1
+%if ARCH_X86_64
+    CLEAR_ODD %1
+%endif
+%endmacro
+
+%macro PUT_EVEN 1
+%if ARCH_X86_64
+    PUT_ODD   %1
+%else
+    pshufhw xmm2, xmm2, 0x1B
+    movdqa    %1, xmm2
+%endif
+%endmacro
+
+%if ARCH_X86_64
+%define ROW0  xmm8
+%define REG0  ROW0
+%define ROW2  xmm9
+%define REG2  ROW2
+%define ROW4  xmm10
+%define REG4  ROW4
+%define ROW6  xmm11
+%define REG6  ROW6
+%define XMMS  xmm12
+%define SREG2 REG2
+%define TAN3  xmm13
+%define TAN1  xmm14
+%else
+%define ROW0  [BLOCK + 0*16]
+%define REG0  xmm4
+%define ROW2  [BLOCK + 2*16]
+%define REG2  xmm4
+%define ROW4  [BLOCK + 4*16]
+%define REG4  xmm6
+%define ROW6  [BLOCK + 6*16]
+%define REG6  xmm6
+%define XMMS  xmm2
+%define SREG2 xmm7
+%define TAN3  xmm0
+%define TAN1  xmm2
+%endif
+
+%macro JZ  2
+    test      %1, %1
+    jz       .%2
+%endmacro
+
+%macro JNZ  2
+    test      %1, %1
+    jnz      .%2
+%endmacro
+
+%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
+    %3        %4
+    movq     mm1, [%1]
+    por      mm1, [%1 + 8]
+    paddusb  mm1, mm0
+    pmovmskb  %2, mm1
+%endmacro
+
+;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
+%macro  TEST_TWO_ROWS  8
+    %5         %6
+    %7         %8
+    movq      mm1, [%1 + 0]
+    por       mm1, [%1 + 8]
+    movq      mm2, [%2 + 0]
+    por       mm2, [%2 + 8]
+    paddusb   mm1, mm0
+    paddusb   mm2, mm0
+    pmovmskb   %3, mm1
+    pmovmskb   %4, mm2
+%endmacro
+
+; IDCT pass on rows.
+%macro iMTX_MULT   4-5 ; src, table, put, arg, rounder
+    movdqa       xmm3, [%1]
+    movdqa       xmm0, xmm3
+    pshufd       xmm1, xmm3, 0x11 ; 4602
+    punpcklqdq   xmm0, xmm0       ; 0246
+    pmaddwd      xmm0, [%2]
+    pmaddwd      xmm1, [%2+16]
+    pshufd       xmm2, xmm3, 0xBB ; 5713
+    punpckhqdq   xmm3, xmm3       ; 1357
+    pmaddwd      xmm2, [%2+32]
+    pmaddwd      xmm3, [%2+48]
+    paddd        xmm0, xmm1
+    paddd        xmm2, xmm3
+%if %0 == 5
+    paddd        xmm0, [walkenIdctRounders+%5]
+%endif
+    movdqa       xmm3, xmm2
+    paddd        xmm2, xmm0
+    psubd        xmm0, xmm3
+    psrad        xmm2, 11
+    psrad        xmm0, 11
+    packssdw     xmm2, xmm0
+    %3           %4
+%endmacro
+
+%macro iLLM_HEAD 0
+    movdqa   TAN3, [tan3]
+    movdqa   TAN1, [tan1]
+%endmacro
+
+%macro FIRST_HALF 2  ; %1=dct  %2=type(normal,add,put)
+    psraw    xmm5, 6
+    psraw    REG0, 6
+    psraw    TAN3, 6
+    psraw    xmm3, 6
+    ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+    movdqa   [%1+1*16], TAN3
+    movdqa   [%1+2*16], xmm3
+    movdqa   [%1+5*16], REG0
+    movdqa   [%1+6*16], xmm5
+%else
+    ; Must now load args as gprs are no longer used for masks
+    ; DEST is set to where address of dest was loaded
+    %if ARCH_X86_32
+        %if %2 == 2 ; Not enough xmms, store
+    movdqa   [%1+1*16], TAN3
+    movdqa   [%1+2*16], xmm3
+    movdqa   [%1+5*16], REG0
+    movdqa   [%1+6*16], xmm5
+        %endif
+    %xdefine DEST r2q ; BLOCK is r0, stride r1
+    movifnidn DEST, destm
+    movifnidn strideq, stridem
+    %else
+    %xdefine DEST r0q
+    %endif
+    lea      r3q, [3*strideq]
+    %if %2 == 1
+    packuswb TAN3, xmm3
+    packuswb xmm5, REG0
+    movq     [DEST + strideq], TAN3
+    movhps   [DEST + 2*strideq], TAN3
+    ; REG0 and TAN3 are now available (and likely used in second half)
+    %endif
+%endif
+%endmacro
+
+%macro SECOND_HALF 6 ; %1=dct  %2=type(normal,add,put) 3-6: xmms
+    psraw    %3, 6
+    psraw    %4, 6
+    psraw    %5, 6
+    psraw    %6, 6
+    ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+    movdqa   [%1+0*16], %3
+    movdqa   [%1+3*16], %5
+    movdqa   [%1+4*16], %6
+    movdqa   [%1+7*16], %4
+%elif %2 == 1
+    packuswb %3, %5
+    packuswb %6, %4
+    ; address of dest may have been loaded
+    movq     [DEST], %3
+    movhps   [DEST + r3q], %3
+    lea      DEST, [DEST + 4*strideq]
+    movq     [DEST], %6
+    movhps   [DEST + r3q], %6
+    ; and now write remainder of first half
+    movq     [DEST + 2*strideq], xmm5
+    movhps   [DEST + strideq], xmm5
+%elif %2 == 2
+    pxor        xmm0, xmm0
+    %if ARCH_X86_32
+    ; free: m3 REG0=m4 m5
+    ; input: m1, m7, m2, m6
+    movq        xmm3, [DEST+0*strideq]
+    movq        xmm4, [DEST+1*strideq]
+    punpcklbw   xmm3, xmm0
+    punpcklbw   xmm4, xmm0
+    paddsw      xmm3, %3
+    paddsw      xmm4, [%1 + 1*16]
+    movq          %3, [DEST+2*strideq]
+    movq        xmm5, [DEST+      r3q]
+    punpcklbw     %3, xmm0
+    punpcklbw   xmm5, xmm0
+    paddsw        %3, [%1 + 2*16]
+    paddsw      xmm5, %5
+    packuswb    xmm3, xmm4
+    packuswb      %3, xmm5
+    movq    [DEST+0*strideq], xmm3
+    movhps  [DEST+1*strideq], xmm3
+    movq    [DEST+2*strideq], %3
+    movhps  [DEST+      r3q], %3
+    lea         DEST, [DEST+4*strideq]
+    movq        xmm3, [DEST+0*strideq]
+    movq        xmm4, [DEST+1*strideq]
+    movq          %3, [DEST+2*strideq]
+    movq        xmm5, [DEST+      r3q]
+    punpcklbw   xmm3, xmm0
+    punpcklbw   xmm4, xmm0
+    punpcklbw     %3, xmm0
+    punpcklbw   xmm5, xmm0
+    paddsw      xmm3, %6
+    paddsw      xmm4, [%1 + 5*16]
+    paddsw        %3, [%1 + 6*16]
+    paddsw      xmm5, %4
+    packuswb    xmm3, xmm4
+    packuswb      %3, xmm5
+    movq    [DEST+0*strideq], xmm3
+    movhps  [DEST+1*strideq], xmm3
+    movq    [DEST+2*strideq], %3
+    movhps  [DEST+      r3q], %3
+    %else
+    ; l1:TAN3=m13  l2:m3  l5:REG0=m8 l6=m5
+    ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
+    movq        xmm2, [DEST+0*strideq]
+    movq        xmm4, [DEST+1*strideq]
+    movq       xmm12, [DEST+2*strideq]
+    movq       xmm11, [DEST+      r3q]
+    punpcklbw   xmm2, xmm0
+    punpcklbw   xmm4, xmm0
+    punpcklbw  xmm12, xmm0
+    punpcklbw  xmm11, xmm0
+    paddsw      xmm2, %3
+    paddsw      xmm4, TAN3
+    paddsw     xmm12, xmm3
+    paddsw     xmm11, %5
+    packuswb    xmm2, xmm4
+    packuswb   xmm12, xmm11
+    movq    [DEST+0*strideq], xmm2
+    movhps  [DEST+1*strideq], xmm2
+    movq    [DEST+2*strideq], xmm12
+    movhps  [DEST+      r3q], xmm12
+    lea         DEST, [DEST+4*strideq]
+    movq        xmm2, [DEST+0*strideq]
+    movq        xmm4, [DEST+1*strideq]
+    movq       xmm12, [DEST+2*strideq]
+    movq       xmm11, [DEST+      r3q]
+    punpcklbw   xmm2, xmm0
+    punpcklbw   xmm4, xmm0
+    punpcklbw  xmm12, xmm0
+    punpcklbw  xmm11, xmm0
+    paddsw      xmm2, %6
+    paddsw      xmm4, REG0
+    paddsw     xmm12, xmm5
+    paddsw     xmm11, %4
+    packuswb    xmm2, xmm4
+    packuswb   xmm12, xmm11
+    movq    [DEST+0*strideq], xmm2
+    movhps  [DEST+1*strideq], xmm2
+    movq    [DEST+2*strideq], xmm12
+    movhps  [DEST+      r3q], xmm12
+    %endif
+%endif
+%endmacro
+
+
+; IDCT pass on columns.
+%macro iLLM_PASS  2  ; %1=dct  %2=type(normal,add,put)
+    movdqa   xmm1, TAN3
+    movdqa   xmm3, TAN1
+    pmulhw   TAN3, xmm4
+    pmulhw   xmm1, xmm5
+    paddsw   TAN3, xmm4
+    paddsw   xmm1, xmm5
+    psubsw   TAN3, xmm5
+    paddsw   xmm1, xmm4
+    pmulhw   xmm3, xmm7
+    pmulhw   TAN1, xmm6
+    paddsw   xmm3, xmm6
+    psubsw   TAN1, xmm7
+    movdqa   xmm7, xmm3
+    movdqa   xmm6, TAN1
+    psubsw   xmm3, xmm1
+    psubsw   TAN1, TAN3
+    paddsw   xmm1, xmm7
+    paddsw   TAN3, xmm6
+    movdqa   xmm6, xmm3
+    psubsw   xmm3, TAN3
+    paddsw   TAN3, xmm6
+    movdqa   xmm4, [sqrt2]
+    pmulhw   xmm3, xmm4
+    pmulhw   TAN3, xmm4
+    paddsw   TAN3, TAN3
+    paddsw   xmm3, xmm3
+    movdqa   xmm7, [tan2]
+    MOV32    ROW2, REG2
+    MOV32    ROW6, REG6
+    movdqa   xmm5, xmm7
+    pmulhw   xmm7, REG6
+    pmulhw   xmm5, REG2
+    paddsw   xmm7, REG2
+    psubsw   xmm5, REG6
+    MOV32    ROW0, REG0
+    MOV32    ROW4, REG4
+    MOV32    TAN1, [BLOCK]
+    movdqa   XMMS, REG0
+    psubsw   REG0, REG4
+    paddsw   REG4, XMMS
+    movdqa   XMMS, REG4
+    psubsw   REG4, xmm7
+    paddsw   xmm7, XMMS
+    movdqa   XMMS, REG0
+    psubsw   REG0, xmm5
+    paddsw   xmm5, XMMS
+    movdqa   XMMS, xmm5
+    psubsw   xmm5, TAN3
+    paddsw   TAN3, XMMS
+    movdqa   XMMS, REG0
+    psubsw   REG0, xmm3
+    paddsw   xmm3, XMMS
+    MOV32    [BLOCK], TAN1
+
+    FIRST_HALF %1, %2
+
+    movdqa   xmm0, xmm7
+    movdqa   xmm4, REG4
+    psubsw   xmm7, xmm1
+    psubsw   REG4, TAN1
+    paddsw   xmm1, xmm0
+    paddsw   TAN1, xmm4
+
+    SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
+%endmacro
+
+; IDCT pass on columns, assuming rows 4-7 are zero
+%macro iLLM_PASS_SPARSE   2 ; %1=dct   %2=type(normal,put,add)
+    pmulhw   TAN3, xmm4
+    paddsw   TAN3, xmm4
+    movdqa   xmm3, xmm6
+    pmulhw   TAN1, xmm6
+    movdqa   xmm1, xmm4
+    psubsw   xmm3, xmm1
+    paddsw   xmm1, xmm6
+    movdqa   xmm6, TAN1
+    psubsw   TAN1, TAN3
+    paddsw   TAN3, xmm6
+    movdqa   xmm6, xmm3
+    psubsw   xmm3, TAN3
+    paddsw   TAN3, xmm6
+    movdqa   xmm4, [sqrt2]
+    pmulhw   xmm3, xmm4
+    pmulhw   TAN3, xmm4
+    paddsw   TAN3, TAN3
+    paddsw   xmm3, xmm3
+    movdqa   xmm5, [tan2]
+    MOV32    ROW2, SREG2
+    pmulhw   xmm5, SREG2
+    MOV32    ROW0, REG0
+    movdqa   xmm6, REG0
+    psubsw   xmm6, SREG2
+    paddsw  SREG2, REG0
+    MOV32    TAN1, [BLOCK]
+    movdqa   XMMS, REG0
+    psubsw   REG0, xmm5
+    paddsw   xmm5, XMMS
+    movdqa   XMMS, xmm5
+    psubsw   xmm5, TAN3
+    paddsw   TAN3, XMMS
+    movdqa   XMMS, REG0
+    psubsw   REG0, xmm3
+    paddsw   xmm3, XMMS
+    MOV32    [BLOCK], TAN1
+
+    FIRST_HALF %1, %2
+
+    movdqa   xmm0, SREG2
+    movdqa   xmm4, xmm6
+    psubsw  SREG2, xmm1
+    psubsw   xmm6, TAN1
+    paddsw   xmm1, xmm0
+    paddsw   TAN1, xmm4
+
+    SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
+%endmacro
+
+%macro IDCT_SSE2 1 ; 0=normal  1=put  2=add
+%if %1 == 0 || ARCH_X86_32
+    %define GPR0  r1d
+    %define GPR1  r2d
+    %define GPR2  r3d
+    %define GPR3  r4d
+    %define NUM_GPRS 5
+%else
+    %define GPR0  r3d
+    %define GPR1  r4d
+    %define GPR2  r5d
+    %define GPR3  r6d
+    %define NUM_GPRS 7
+%endif
+%if %1 == 0
+cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
+%xdefine BLOCK blockq
+%else
+    %if %1 == 1
+cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+    %else
+cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+    %endif
+    %if ARCH_X86_64
+    %xdefine BLOCK blockq
+    %else
+    mov    r0q, blockm
+    %xdefine BLOCK r0q
+    %endif
+%endif
+    movq           mm0, [pb_127]
+    iMTX_MULT      BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
+    iMTX_MULT      BLOCK + 1*16, iTab2, PUT_ODD, ROW1,  1*16
+    iMTX_MULT      BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
+
+    TEST_TWO_ROWS  BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
+    JZ   GPR0, col1
+    iMTX_MULT      BLOCK + 3*16, iTab4, PUT_ODD, ROW3,  3*16
+.col1:
+    TEST_TWO_ROWS  BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
+    TEST_ONE_ROW   BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
+
+    iLLM_HEAD
+    JNZ  GPR1, 2
+    JNZ  GPR0, 3
+    JNZ  GPR2, 4
+    JNZ  GPR3, 5
+    iLLM_PASS_SPARSE BLOCK, %1
+    jmp .6
+.2:
+    iMTX_MULT     BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
+.3:
+    iMTX_MULT     BLOCK + 5*16, iTab4, PUT_ODD, ROW5,  4*16
+    JZ   GPR2, col2
+.4:
+    iMTX_MULT     BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
+.col2:
+    JZ   GPR3, col3
+.5:
+    iMTX_MULT     BLOCK + 7*16, iTab2, PUT_ODD, ROW7,  5*16
+.col3:
+%if ARCH_X86_32
+    iLLM_HEAD
+%endif
+    iLLM_PASS     BLOCK, %1
+.6:
+    RET
+%endmacro
+
+INIT_XMM sse2
+IDCT_SSE2 0
+IDCT_SSE2 1
+IDCT_SSE2 2
+
+%if ARCH_X86_32
+
+; %1=offset  %2=tab_offset
+; %3=rnd_offset where 4*8->6*16  5*8->4*16  6/7*8->5*16
+%macro DCT_8_INV_ROW  3
+    movq       mm0, [r0+16*%1+0]  ; 0 ; x3 x2 x1 x0
+    movq       mm1, [r0+16*%1+8]  ; 1 ; x7 x6 x5 x4
+    movq       mm2, mm0       ; 2 ; x3 x2 x1 x0
+    movq       mm3, [%2+ 0]   ; 3 ; w06 w04 w02 w00
+%if cpuflag(mmxext)
+    pshufw     mm0, mm0, 0x88 ; x2 x0 x2 x0
+    movq       mm4, [%2+ 8]   ; 4 ; w07 w06 w03 w02
+    movq       mm5, mm1       ; 5 ; x7 x6 x5 x4
+    pmaddwd    mm3, mm0       ; x2*w05+x0*w04 x2*w01+x0*w00
+    movq       mm6, [%2+32]   ; 6 ; w21 w20 w17 w16
+    pshufw     mm1, mm1, 0x88 ; x6 x4 x6 x4
+    pmaddwd    mm4, mm1       ; x6*w07+x4*w06 x6*w03+x4*w02
+    movq       mm7, [%2+40]   ; 7; w23 w22 w19 w18
+    pshufw     mm2, mm2, 0xdd ; x3 x1 x3 x1
+    pmaddwd    mm6, mm2       ; x3*w21+x1*w20 x3*w17+x1*w16
+    pshufw     mm5, mm5, 0xdd ; x7 x5 x7 x5
+    pmaddwd    mm7, mm5       ; x7*w23+x5*w22 x7*w19+x5*w18
+    paddd      mm3, [walkenIdctRounders + %3]      ; +%3
+    pmaddwd    mm0, [%2+16]   ; x2*w13+x0*w12 x2*w09+x0*w08
+    paddd      mm3, mm4       ; 4 ; a1=sum(even1) a0=sum(even0)
+    pmaddwd    mm1, [%2+24]   ; x6*w15+x4*w14 x6*w11+x4*w10
+    movq       mm4, mm3       ; 4 ; a1 a0
+    pmaddwd    mm2, [%2+48]   ; x3*w29+x1*w28 x3*w25+x1*w24
+    paddd      mm6, mm7       ; 7 ; b1=sum(odd1) b0=sum(odd0)
+    pmaddwd    mm5, [%2+56]   ; x7*w31+x5*w30 x7*w27+x5*w26
+    paddd      mm3, mm6       ; a1+b1 a0+b0
+    paddd      mm0, [walkenIdctRounders + %3]      ; +%3
+    psrad      mm3, 11        ; y1=a1+b1 y0=a0+b0
+    paddd      mm0, mm1       ; 1 ; a3=sum(even3) a2=sum(even2)
+    psubd      mm4, mm6       ; 6 ; a1-b1 a0-b0
+    movq       mm7, mm0       ; 7 ; a3 a2
+    paddd      mm2, mm5       ; 5 ; b3=sum(odd3) b2=sum(odd2)
+    paddd      mm0, mm2       ; a3+b3 a2+b2
+    psrad      mm4, 11        ; y6=a1-b1 y7=a0-b0
+    psubd      mm7, mm2       ; 2 ; a3-b3 a2-b2
+    psrad      mm0, 11        ; y3=a3+b3 y2=a2+b2
+    psrad      mm7, 11        ; y4=a3-b3 y5=a2-b2
+    packssdw   mm3, mm0       ; 0 ; y3 y2 y1 y0
+    packssdw   mm7, mm4       ; 4 ; y6 y7 y4 y5
+    movq  [r0+16*%1+0], mm3       ; 3 ; save y3 y2 y1 y0
+    pshufw     mm7, mm7, 0xb1 ; y7 y6 y5 y4
+%else
+    punpcklwd  mm0, mm1       ; x5 x1 x4 x0
+    movq       mm5, mm0       ; 5 ; x5 x1 x4 x0
+    punpckldq  mm0, mm0       ; x4 x0 x4 x0
+    movq       mm4, [%2+ 8]   ; 4 ; w07 w05 w03 w01
+    punpckhwd  mm2, mm1       ; 1 ; x7 x3 x6 x2
+    pmaddwd    mm3, mm0       ; x4*w06+x0*w04 x4*w02+x0*w00
+    movq       mm6, mm2       ; 6 ; x7 x3 x6 x2
+    movq       mm1, [%2+32]   ; 1 ; w22 w20 w18 w16
+    punpckldq  mm2, mm2       ; x6 x2 x6 x2
+    pmaddwd    mm4, mm2       ; x6*w07+x2*w05 x6*w03+x2*w01
+    punpckhdq  mm5, mm5       ; x5 x1 x5 x1
+    pmaddwd    mm0, [%2+16]   ; x4*w14+x0*w12 x4*w10+x0*w08
+    punpckhdq  mm6, mm6       ; x7 x3 x7 x3
+    movq       mm7, [%2+40]   ; 7 ; w23 w21 w19 w17
+    pmaddwd    mm1, mm5       ; x5*w22+x1*w20 x5*w18+x1*w16
+    paddd      mm3, [walkenIdctRounders + %3]     ; +%3
+    pmaddwd    mm7, mm6       ; x7*w23+x3*w21 x7*w19+x3*w17
+    pmaddwd    mm2, [%2+24]   ; x6*w15+x2*w13 x6*w11+x2*w09
+    paddd      mm3, mm4       ; 4 ; a1=sum(even1) a0=sum(even0)
+    pmaddwd    mm5, [%2+48]   ; x5*w30+x1*w28 x5*w26+x1*w24
+    movq       mm4, mm3       ; 4 ; a1 a0
+    pmaddwd    mm6, [%2+56]   ; x7*w31+x3*w29 x7*w27+x3*w25
+    paddd      mm1, mm7       ; 7 ; b1=sum(odd1) b0=sum(odd0)
+    paddd      mm0, [walkenIdctRounders + %3]     ; +%3
+    psubd      mm3, mm1       ; a1-b1 a0-b0
+    psrad      mm3, 11        ; y6=a1-b1 y7=a0-b0
+    paddd      mm1, mm4       ; 4 ; a1+b1 a0+b0
+    paddd      mm0, mm2       ; 2 ; a3=sum(even3) a2=sum(even2)
+    psrad      mm1, 11        ; y1=a1+b1 y0=a0+b0
+    paddd      mm5, mm6       ; 6 ; b3=sum(odd3) b2=sum(odd2)
+    movq       mm4, mm0       ; 4 ; a3 a2
+    paddd      mm0, mm5       ; a3+b3 a2+b2
+    psubd      mm4, mm5       ; 5 ; a3-b3 a2-b2
+    psrad      mm0, 11        ; y3=a3+b3 y2=a2+b2
+    psrad      mm4, 11        ; y4=a3-b3 y5=a2-b2
+    packssdw   mm1, mm0       ; 0 ; y3 y2 y1 y0
+    packssdw   mm4, mm3       ; 3 ; y6 y7 y4 y5
+    movq       mm7, mm4       ; 7 ; y6 y7 y4 y5
+    psrld      mm4, 16        ; 0 y6 0 y4
+    pslld      mm7, 16        ; y7 0 y5 0
+    movq  [r0+16*%1+0], mm1   ; 1 ; save y3 y2 y1 y0
+    por        mm7, mm4       ; 4 ; y7 y6 y5 y4
+%endif
+    movq  [r0+16*%1+8], mm7   ; 7 ; save y7 y6 y5 y4
+%endmacro
+
+; -----------------------------------------------------------------------------
+;
+; The first stage DCT 8x8 - forward DCTs of columns
+;
+; The %2puts are multiplied
+; for rows 0,4 - on cos_4_16,
+; for rows 1,7 - on cos_1_16,
+; for rows 2,6 - on cos_2_16,
+; for rows 3,5 - on cos_3_16
+; and are shifted to the left for rise of accuracy
+;
+; -----------------------------------------------------------------------------
+;
+; The 8-point scaled forward DCT algorithm (26a8m)
+;
+; -----------------------------------------------------------------------------
+;
+;#define DCT_8_FRW_COL(x, y)
+; {
+;     short t0, t1, t2, t3, t4, t5, t6, t7;
+;     short tp03, tm03, tp12, tm12, tp65, tm65;
+;     short tp465, tm465, tp765, tm765;
+;
+;     t0 = LEFT_SHIFT(x[0] + x[7]);
+;     t1 = LEFT_SHIFT(x[1] + x[6]);
+;     t2 = LEFT_SHIFT(x[2] + x[5]);
+;     t3 = LEFT_SHIFT(x[3] + x[4]);
+;     t4 = LEFT_SHIFT(x[3] - x[4]);
+;     t5 = LEFT_SHIFT(x[2] - x[5]);
+;     t6 = LEFT_SHIFT(x[1] - x[6]);
+;     t7 = LEFT_SHIFT(x[0] - x[7]);
+;
+;     tp03 = t0 + t3;
+;     tm03 = t0 - t3;
+;     tp12 = t1 + t2;
+;     tm12 = t1 - t2;
+;
+;     y[0] = tp03 + tp12;
+;     y[4] = tp03 - tp12;
+;
+;     y[2] = tm03 + tm12 * tg_2_16;
+;     y[6] = tm03 * tg_2_16 - tm12;
+;
+;     tp65 = (t6 + t5) * cos_4_16;
+;     tm65 = (t6 - t5) * cos_4_16;
+;
+;     tp765 = t7 + tp65;
+;     tm765 = t7 - tp65;
+;     tp465 = t4 + tm65;
+;     tm465 = t4 - tm65;
+;
+;     y[1] = tp765 + tp465 * tg_1_16;
+;     y[7] = tp765 * tg_1_16 - tp465;
+;     y[5] = tm765 * tg_3_16 + tm465;
+;     y[3] = tm765 - tm465 * tg_3_16;
+; }
+;
+; -----------------------------------------------------------------------------
+
+; -----------------------------------------------------------------------------
+; DCT_8_INV_COL_4  INP,OUT
+; -----------------------------------------------------------------------------
+%macro DCT_8_INV_COL 1
+    movq        mm0, [tan3]
+    movq        mm3, [%1+16*3]
+    movq        mm1, mm0 ; tg_3_16
+    movq        mm5, [%1+16*5]
+    pmulhw      mm0, mm3 ; x3*(tg_3_16-1)
+    movq        mm4, [tan1]
+    pmulhw      mm1, mm5 ; x5*(tg_3_16-1)
+    movq        mm7, [%1+16*7]
+    movq        mm2, mm4 ; tg_1_16
+    movq        mm6, [%1+16*1]
+    pmulhw      mm4, mm7 ; x7*tg_1_16
+    paddsw      mm0, mm3 ; x3*tg_3_16
+    pmulhw      mm2, mm6 ; x1*tg_1_16
+    paddsw      mm1, mm3 ; x3+x5*(tg_3_16-1)
+    psubsw      mm0, mm5 ; x3*tg_3_16-x5 = tm35
+    movq        mm3, [sqrt2]
+    paddsw      mm1, mm5 ; x3+x5*tg_3_16 = tp35
+    paddsw      mm4, mm6 ; x1+tg_1_16*x7 = tp17
+    psubsw      mm2, mm7 ; x1*tg_1_16-x7 = tm17
+    movq        mm5, mm4 ; tp17
+    movq        mm6, mm2 ; tm17
+    paddsw      mm5, mm1 ; tp17+tp35 = b0
+    psubsw      mm6, mm0 ; tm17-tm35 = b3
+    psubsw      mm4, mm1 ; tp17-tp35 = t1
+    paddsw      mm2, mm0 ; tm17+tm35 = t2
+    movq        mm7, [tan2]
+    movq        mm1, mm4 ; t1
+    movq  [%1+3*16], mm5 ; save b0
+    paddsw      mm1, mm2 ; t1+t2
+    movq  [%1+5*16], mm6 ; save b3
+    psubsw      mm4, mm2 ; t1-t2
+    movq        mm5, [%1+2*16]
+    movq        mm0, mm7 ; tg_2_16
+    movq        mm6, [%1+6*16]
+    pmulhw      mm0, mm5 ; x2*tg_2_16
+    pmulhw      mm7, mm6 ; x6*tg_2_16
+    pmulhw      mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2
+    movq        mm2, [%1+0*16]
+    pmulhw      mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2
+    psubsw      mm0, mm6 ; t2*tg_2_16-x6 = tm26
+    movq        mm3, mm2 ; x0
+    movq        mm6, [%1+4*16]
+    paddsw      mm7, mm5 ; x2+x6*tg_2_16 = tp26
+    paddsw      mm2, mm6 ; x0+x4 = tp04
+    psubsw      mm3, mm6 ; x0-x4 = tm04
+    movq        mm5, mm2 ; tp04
+    movq        mm6, mm3 ; tm04
+    psubsw      mm2, mm7 ; tp04-tp26 = a3
+    paddsw      mm3, mm0 ; tm04+tm26 = a1
+    paddsw      mm1, mm1 ; b1
+    paddsw      mm4, mm4 ; b2
+    paddsw      mm5, mm7 ; tp04+tp26 = a0
+    psubsw      mm6, mm0 ; tm04-tm26 = a2
+    movq        mm7, mm3 ; a1
+    movq        mm0, mm6 ; a2
+    paddsw      mm3, mm1 ; a1+b1
+    paddsw      mm6, mm4 ; a2+b2
+    psraw       mm3, 6   ; dst1
+    psubsw      mm7, mm1 ; a1-b1
+    psraw       mm6, 6   ; dst2
+    psubsw      mm0, mm4 ; a2-b2
+    movq        mm1, [%1+3*16] ; load b0
+    psraw       mm7, 6   ; dst6
+    movq        mm4, mm5 ; a0
+    psraw       mm0, 6   ; dst5
+    movq  [%1+1*16], mm3
+    paddsw      mm5, mm1 ; a0+b0
+    movq  [%1+2*16], mm6
+    psubsw      mm4, mm1 ; a0-b0
+    movq        mm3, [%1+5*16] ; load b3
+    psraw       mm5, 6   ; dst0
+    movq        mm6, mm2 ; a3
+    psraw       mm4, 6   ; dst7
+    movq  [%1+5*16], mm0
+    paddsw      mm2, mm3 ; a3+b3
+    movq  [%1+6*16], mm7
+    psubsw      mm6, mm3 ; a3-b3
+    movq  [%1+0*16], mm5
+    psraw       mm2, 6   ; dst3
+    movq  [%1+7*16], mm4
+    psraw       mm6, 6   ; dst4
+    movq  [%1+3*16], mm2
+    movq  [%1+4*16], mm6
+%endmacro
+
+%macro XVID_IDCT_MMX 0
+cglobal xvid_idct, 1, 1, 0, block
+%if cpuflag(mmxext)
+%define TAB tab_i_04_xmm
+%else
+%define TAB tab_i_04_mmx
+%endif
+    ; Process each row - beware of rounder offset
+    DCT_8_INV_ROW  0, TAB + 64 * 0, 0*16
+    DCT_8_INV_ROW  1, TAB + 64 * 1, 1*16
+    DCT_8_INV_ROW  2, TAB + 64 * 2, 2*16
+    DCT_8_INV_ROW  3, TAB + 64 * 3, 3*16
+    DCT_8_INV_ROW  4, TAB + 64 * 0, 6*16
+    DCT_8_INV_ROW  5, TAB + 64 * 3, 4*16
+    DCT_8_INV_ROW  6, TAB + 64 * 2, 5*16
+    DCT_8_INV_ROW  7, TAB + 64 * 1, 5*16
+
+    ; Process the columns (4 at a time)
+    DCT_8_INV_COL  r0+0
+    DCT_8_INV_COL  r0+8
+
+    RET
+%endmacro
+
+INIT_MMX mmx
+XVID_IDCT_MMX
+INIT_MMX mmxext
+XVID_IDCT_MMX
+
+%endif ; ~ARCH_X86_32
diff --git a/libavcodec/x86/xvididct.h b/libavcodec/x86/xvididct.h
index 13a4e85890..573b25c6b5 100644
--- a/libavcodec/x86/xvididct.h
+++ b/libavcodec/x86/xvididct.h
@@ -1,20 +1,20 @@
 /*
  * XVID MPEG-4 VIDEO CODEC
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,7 +37,7 @@ void ff_xvid_idct_mmxext_put(uint8_t *dest, int line_size, int16_t *block);
 void ff_xvid_idct_mmxext_add(uint8_t *dest, int line_size, int16_t *block);
 
 void ff_xvid_idct_sse2(short *block);
-void ff_xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block);
-void ff_xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block);
+void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
+void ff_xvid_idct_add_sse2(uint8_t *dest, int line_size, short *block);
 
 #endif /* AVCODEC_X86_XVIDIDCT_H */
diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index e4f7345795..8b9d8de0cd 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,9 +26,36 @@
 #include "idctdsp.h"
 #include "xvididct.h"
 
+#if ARCH_X86_32 && HAVE_YASM
+static void xvid_idct_mmx_put(uint8_t *dest, int line_size, short *block)
+{
+    ff_xvid_idct_mmx(block);
+    ff_put_pixels_clamped(block, dest, line_size);
+}
+
+static void xvid_idct_mmx_add(uint8_t *dest, int line_size, short *block)
+{
+    ff_xvid_idct_mmx(block);
+    ff_add_pixels_clamped(block, dest, line_size);
+}
+
+static void xvid_idct_mmxext_put(uint8_t *dest, int line_size, short *block)
+{
+    ff_xvid_idct_mmxext(block);
+    ff_put_pixels_clamped(block, dest, line_size);
+}
+
+static void xvid_idct_mmxext_add(uint8_t *dest, int line_size, short *block)
+{
+    ff_xvid_idct_mmxext(block);
+    ff_add_pixels_clamped(block, dest, line_size);
+}
+#endif
+
 av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
 {
+#if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
 
     if (high_bit_depth ||
@@ -36,24 +63,27 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
           avctx->idct_algo == FF_IDCT_XVID))
         return;
 
-    if (INLINE_MMX(cpu_flags)) {
-        c->idct_put  = ff_xvid_idct_mmx_put;
-        c->idct_add  = ff_xvid_idct_mmx_add;
+#if ARCH_X86_32
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->idct_put  = xvid_idct_mmx_put;
+        c->idct_add  = xvid_idct_mmx_add;
         c->idct      = ff_xvid_idct_mmx;
         c->perm_type = FF_IDCT_PERM_NONE;
     }
 
-    if (INLINE_MMXEXT(cpu_flags)) {
-        c->idct_put  = ff_xvid_idct_mmxext_put;
-        c->idct_add  = ff_xvid_idct_mmxext_add;
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->idct_put  = xvid_idct_mmxext_put;
+        c->idct_add  = xvid_idct_mmxext_add;
         c->idct      = ff_xvid_idct_mmxext;
         c->perm_type = FF_IDCT_PERM_NONE;
     }
+#endif
 
-    if (INLINE_SSE2(cpu_flags)) {
-        c->idct_put  = ff_xvid_idct_sse2_put;
-        c->idct_add  = ff_xvid_idct_sse2_add;
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->idct_put  = ff_xvid_idct_put_sse2;
+        c->idct_add  = ff_xvid_idct_add_sse2;
         c->idct      = ff_xvid_idct_sse2;
         c->perm_type = FF_IDCT_PERM_SSE2;
     }
+#endif /* HAVE_YASM */
 }
diff --git a/libavcodec/x86/xvididct_mmx.c b/libavcodec/x86/xvididct_mmx.c
deleted file mode 100644
index e371142974..0000000000
--- a/libavcodec/x86/xvididct_mmx.c
+++ /dev/null
@@ -1,548 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - MMX and XMM forward discrete cosine transform -
- *
- * Copyright(C) 2001 Peter Ross <pross@xvid.org>
- *
- * Originally provided by Intel at AP-922
- * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
- * but in a limited edition.
- * New macro implements a column part for precise iDCT
- * The routine precision now satisfies IEEE standard 1180-1990.
- *
- * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
- * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
- *
- * http://www.elecard.com/peter/idct.html
- * http://www.linuxvideo.org/mpeg2dec/
- *
- * These examples contain code fragments for first stage iDCT 8x8
- * (for rows) and first stage DCT 8x8 (for columns)
- *
- * conversion to gcc syntax by Michael Niedermayer
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <inttypes.h>
-
-#include "config.h"
-
-#include "libavutil/mem.h"
-
-#include "libavcodec/avcodec.h"
-
-#include "idctdsp.h"
-#include "xvididct.h"
-
-#if HAVE_MMX_INLINE
-
-// -----------------------------------------------------------------------------
-// Various memory constants (trigonometric values or rounding values)
-// -----------------------------------------------------------------------------
-
-DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4 * 4] = {
-     13036,  13036,  13036,  13036, // tg * (2 << 16) + 0.5
-     27146,  27146,  27146,  27146, // tg * (2 << 16) + 0.5
-    -21746, -21746, -21746, -21746, // tg * (2 << 16) + 0.5
-     23170,  23170,  23170,  23170
-};                                  // cos * (2 << 15) + 0.5
-
-DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2 * 8] = {
-    65536, 65536,
-    3597,   3597,
-    2260,   2260,
-    1203,   1203,
-    0,         0,
-    120,     120,
-    512,     512,
-    512, 512
-};
-
-// -----------------------------------------------------------------------------
-//
-// The first stage iDCT 8x8 - inverse DCTs of rows
-//
-// -----------------------------------------------------------------------------
-// The 8-point inverse DCT direct algorithm
-// -----------------------------------------------------------------------------
-//
-// static const short w[32] = {
-//     FIX(cos_4_16),  FIX(cos_2_16),  FIX(cos_4_16),  FIX(cos_6_16),
-//     FIX(cos_4_16),  FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
-//     FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16),  FIX(cos_2_16),
-//     FIX(cos_4_16), -FIX(cos_2_16),  FIX(cos_4_16), -FIX(cos_6_16),
-//     FIX(cos_1_16),  FIX(cos_3_16),  FIX(cos_5_16),  FIX(cos_7_16),
-//     FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
-//     FIX(cos_5_16), -FIX(cos_1_16),  FIX(cos_7_16),  FIX(cos_3_16),
-//     FIX(cos_7_16), -FIX(cos_5_16),  FIX(cos_3_16), -FIX(cos_1_16) };
-//
-// #define DCT_8_INV_ROW(x, y)
-// {
-//     int a0, a1, a2, a3, b0, b1, b2, b3;
-//
-//     a0 = x[0] * w[0]  + x[2] * w[1]  + x[4] * w[2]  + x[6] * w[3];
-//     a1 = x[0] * w[4]  + x[2] * w[5]  + x[4] * w[6]  + x[6] * w[7];
-//     a2 = x[0] * w[8]  + x[2] * w[9]  + x[4] * w[10] + x[6] * w[11];
-//     a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
-//     b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
-//     b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
-//     b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
-//     b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
-//
-//     y[0] = SHIFT_ROUND(a0 + b0);
-//     y[1] = SHIFT_ROUND(a1 + b1);
-//     y[2] = SHIFT_ROUND(a2 + b2);
-//     y[3] = SHIFT_ROUND(a3 + b3);
-//     y[4] = SHIFT_ROUND(a3 - b3);
-//     y[5] = SHIFT_ROUND(a2 - b2);
-//     y[6] = SHIFT_ROUND(a1 - b1);
-//     y[7] = SHIFT_ROUND(a0 - b0);
-// }
-//
-// -----------------------------------------------------------------------------
-//
-// In this implementation the outputs of the iDCT-1D are multiplied
-//     for rows 0,4 - by cos_4_16,
-//     for rows 1,7 - by cos_1_16,
-//     for rows 2,6 - by cos_2_16,
-//     for rows 3,5 - by cos_3_16
-// and are shifted to the left for better accuracy.
-//
-// For the constants used,
-//     FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
-//
-// -----------------------------------------------------------------------------
-
-// -----------------------------------------------------------------------------
-// Tables for mmx processors
-// -----------------------------------------------------------------------------
-
-// Table for rows 0,4 - constants are multiplied by cos_4_16
-DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32 * 4] = {
-     16384,  16384,  16384, -16384, // movq-> w06 w04 w02 w00
-     21407,   8867,   8867, -21407, // w07 w05 w03 w01
-     16384, -16384,  16384,  16384, // w14 w12 w10 w08
-     -8867,  21407, -21407,  -8867, // w15 w13 w11 w09
-     22725,  12873,  19266, -22725, // w22 w20 w18 w16
-     19266,   4520,  -4520, -12873, // w23 w21 w19 w17
-     12873,   4520,   4520,  19266, // w30 w28 w26 w24
-    -22725,  19266, -12873, -22725, // w31 w29 w27 w25
-// Table for rows 1,7 - constants are multiplied by cos_1_16
-     22725,  22725,  22725, -22725, // movq-> w06 w04 w02 w00
-     29692,  12299,  12299, -29692, // w07 w05 w03 w01
-     22725, -22725,  22725,  22725, // w14 w12 w10 w08
-    -12299,  29692, -29692, -12299, // w15 w13 w11 w09
-     31521,  17855,  26722, -31521, // w22 w20 w18 w16
-     26722,   6270,  -6270, -17855, // w23 w21 w19 w17
-     17855,   6270,   6270,  26722, // w30 w28 w26 w24
-    -31521,  26722, -17855, -31521, // w31 w29 w27 w25
-// Table for rows 2,6 - constants are multiplied by cos_2_16
-     21407,  21407,  21407, -21407, // movq-> w06 w04 w02 w00
-     27969,  11585,  11585, -27969, // w07 w05 w03 w01
-     21407, -21407,  21407,  21407, // w14 w12 w10 w08
-    -11585,  27969, -27969, -11585, // w15 w13 w11 w09
-     29692,  16819,  25172, -29692, // w22 w20 w18 w16
-     25172,   5906,  -5906, -16819, // w23 w21 w19 w17
-     16819,   5906,   5906,  25172, // w30 w28 w26 w24
-    -29692,  25172, -16819, -29692, // w31 w29 w27 w25
-// Table for rows 3,5 - constants are multiplied by cos_3_16
-     19266,  19266,  19266, -19266, // movq-> w06 w04 w02 w00
-     25172,  10426,  10426, -25172, // w07 w05 w03 w01
-     19266, -19266,  19266,  19266, // w14 w12 w10 w08
-    -10426,  25172, -25172, -10426, // w15 w13 w11 w09
-     26722,  15137,  22654, -26722, // w22 w20 w18 w16
-     22654,   5315,  -5315, -15137, // w23 w21 w19 w17
-     15137,   5315,   5315,  22654, // w30 w28 w26 w24
-    -26722,  22654, -15137, -26722, // w31 w29 w27 w25
-};
-// -----------------------------------------------------------------------------
-// Tables for xmm processors
-// -----------------------------------------------------------------------------
-
-// %3 for rows 0,4 - constants are multiplied by cos_4_16
-DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32 * 4] = {
-     16384,  21407,  16384,   8867, // movq-> w05 w04 w01 w00
-     16384,   8867, -16384, -21407, // w07 w06 w03 w02
-     16384,  -8867,  16384, -21407, // w13 w12 w09 w08
-    -16384,  21407,  16384,  -8867, // w15 w14 w11 w10
-     22725,  19266,  19266,  -4520, // w21 w20 w17 w16
-     12873,   4520, -22725, -12873, // w23 w22 w19 w18
-     12873, -22725,   4520, -12873, // w29 w28 w25 w24
-      4520,  19266,  19266, -22725, // w31 w30 w27 w26
-// %3 for rows 1,7 - constants are multiplied by cos_1_16
-     22725,  29692,  22725,  12299, // movq-> w05 w04 w01 w00
-     22725,  12299, -22725, -29692, // w07 w06 w03 w02
-     22725, -12299,  22725, -29692, // w13 w12 w09 w08
-    -22725,  29692,  22725, -12299, // w15 w14 w11 w10
-     31521,  26722,  26722,  -6270, // w21 w20 w17 w16
-     17855,   6270, -31521, -17855, // w23 w22 w19 w18
-     17855, -31521,   6270, -17855, // w29 w28 w25 w24
-      6270,  26722,  26722, -31521, // w31 w30 w27 w26
-// %3 for rows 2,6 - constants are multiplied by cos_2_16
-     21407,  27969,  21407,  11585, // movq-> w05 w04 w01 w00
-     21407,  11585, -21407, -27969, // w07 w06 w03 w02
-     21407, -11585,  21407, -27969, // w13 w12 w09 w08
-    -21407,  27969,  21407, -11585, // w15 w14 w11 w10
-     29692,  25172,  25172,  -5906, // w21 w20 w17 w16
-     16819,   5906, -29692, -16819, // w23 w22 w19 w18
-     16819, -29692,   5906, -16819, // w29 w28 w25 w24
-      5906,  25172,  25172, -29692, // w31 w30 w27 w26
-// %3 for rows 3,5 - constants are multiplied by cos_3_16
-     19266,  25172,  19266,  10426, // movq-> w05 w04 w01 w00
-     19266,  10426, -19266, -25172, // w07 w06 w03 w02
-     19266, -10426,  19266, -25172, // w13 w12 w09 w08
-    -19266,  25172,  19266, -10426, // w15 w14 w11 w10
-     26722,  22654,  22654,  -5315, // w21 w20 w17 w16
-     15137,   5315, -26722, -15137, // w23 w22 w19 w18
-     15137, -26722,   5315, -15137, // w29 w28 w25 w24
-      5315,  22654,  22654, -26722, // w31 w30 w27 w26
-};
-// =============================================================================
-// Helper macros for the code
-// =============================================================================
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_ROW_MMX(A1, A2, A3, A4)                                       \
-    "movq       "#A1", %%mm0    \n\t" /* 0 ; x3 x2 x1 x0 */                     \
-    "movq     8+"#A1", %%mm1    \n\t" /* 1 ; x7 x6 x5 x4 */                     \
-    "movq       %%mm0, %%mm2    \n\t" /* 2 ; x3 x2 x1 x0 */                     \
-    "movq       "#A3", %%mm3    \n\t" /* 3 ; w06 w04 w02 w00 */                 \
-    "punpcklwd  %%mm1, %%mm0    \n\t" /* x5 x1 x4 x0 */                         \
-    "movq       %%mm0, %%mm5    \n\t" /* 5 ; x5 x1 x4 x0 */                     \
-    "punpckldq  %%mm0, %%mm0    \n\t" /* x4 x0 x4 x0 */                         \
-    "movq     8+"#A3", %%mm4    \n\t" /* 4 ; w07 w05 w03 w01 */                 \
-    "punpckhwd  %%mm1, %%mm2    \n\t" /* 1 ; x7 x3 x6 x2 */                     \
-    "pmaddwd    %%mm0, %%mm3    \n\t" /* x4*w06+x0*w04 x4*w02+x0*w00 */         \
-    "movq       %%mm2, %%mm6    \n\t" /* 6 ; x7 x3 x6 x2 */                     \
-    "movq    32+"#A3", %%mm1    \n\t" /* 1 ; w22 w20 w18 w16 */                 \
-    "punpckldq  %%mm2, %%mm2    \n\t" /* x6 x2 x6 x2 */                         \
-    "pmaddwd    %%mm2, %%mm4    \n\t" /* x6*w07+x2*w05 x6*w03+x2*w01 */         \
-    "punpckhdq  %%mm5, %%mm5    \n\t" /* x5 x1 x5 x1 */                         \
-    "pmaddwd 16+"#A3", %%mm0    \n\t" /* x4*w14+x0*w12 x4*w10+x0*w08 */         \
-    "punpckhdq  %%mm6, %%mm6    \n\t" /* x7 x3 x7 x3 */                         \
-    "movq 40+   "#A3", %%mm7    \n\t" /* 7 ; w23 w21 w19 w17 */                 \
-    "pmaddwd    %%mm5, %%mm1    \n\t" /* x5*w22+x1*w20 x5*w18+x1*w16 */         \
-    "paddd      "#A4", %%mm3    \n\t" /* +%4 */                                 \
-    "pmaddwd    %%mm6, %%mm7    \n\t" /* x7*w23+x3*w21 x7*w19+x3*w17 */         \
-    "pmaddwd 24+"#A3", %%mm2    \n\t" /* x6*w15+x2*w13 x6*w11+x2*w09 */         \
-    "paddd      %%mm4, %%mm3    \n\t" /* 4 ; a1=sum(even1) a0=sum(even0) */     \
-    "pmaddwd 48+"#A3", %%mm5    \n\t" /* x5*w30+x1*w28 x5*w26+x1*w24 */         \
-    "movq       %%mm3, %%mm4    \n\t" /* 4 ; a1 a0 */                           \
-    "pmaddwd 56+"#A3", %%mm6    \n\t" /* x7*w31+x3*w29 x7*w27+x3*w25 */         \
-    "paddd      %%mm7, %%mm1    \n\t" /* 7 ; b1=sum(odd1) b0=sum(odd0) */       \
-    "paddd      "#A4", %%mm0    \n\t" /* +%4 */                                 \
-    "psubd      %%mm1, %%mm3    \n\t" /* a1-b1 a0-b0 */                         \
-    "psrad        $11, %%mm3    \n\t" /* y6=a1-b1 y7=a0-b0 */                   \
-    "paddd      %%mm4, %%mm1    \n\t" /* 4 ; a1+b1 a0+b0 */                     \
-    "paddd      %%mm2, %%mm0    \n\t" /* 2 ; a3=sum(even3) a2=sum(even2) */     \
-    "psrad        $11, %%mm1    \n\t" /* y1=a1+b1 y0=a0+b0 */                   \
-    "paddd      %%mm6, %%mm5    \n\t" /* 6 ; b3=sum(odd3) b2=sum(odd2) */       \
-    "movq       %%mm0, %%mm4    \n\t" /* 4 ; a3 a2 */                           \
-    "paddd      %%mm5, %%mm0    \n\t" /* a3+b3 a2+b2 */                         \
-    "psubd      %%mm5, %%mm4    \n\t" /* 5 ; a3-b3 a2-b2 */                     \
-    "psrad        $11, %%mm0    \n\t" /* y3=a3+b3 y2=a2+b2 */                   \
-    "psrad        $11, %%mm4    \n\t" /* y4=a3-b3 y5=a2-b2 */                   \
-    "packssdw   %%mm0, %%mm1    \n\t" /* 0 ; y3 y2 y1 y0 */                     \
-    "packssdw   %%mm3, %%mm4    \n\t" /* 3 ; y6 y7 y4 y5 */                     \
-    "movq       %%mm4, %%mm7    \n\t" /* 7 ; y6 y7 y4 y5 */                     \
-    "psrld        $16, %%mm4    \n\t" /* 0 y6 0 y4 */                           \
-    "pslld        $16, %%mm7    \n\t" /* y7 0 y5 0 */                           \
-    "movq       %%mm1, "#A2"    \n\t" /* 1 ; save y3 y2 y1 y0 */                \
-    "por        %%mm4, %%mm7    \n\t" /* 4 ; y7 y6 y5 y4 */                     \
-    "movq       %%mm7, 8+"#A2"  \n\t" /* 7 ; save y7 y6 y5 y4 */                \
-
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_ROW_XMM(A1, A2, A3, A4)                                       \
-    "movq       "#A1", %%mm0        \n\t" /* 0 ; x3 x2 x1 x0 */                 \
-    "movq     8+"#A1", %%mm1        \n\t" /* 1 ; x7 x6 x5 x4 */                 \
-    "movq       %%mm0, %%mm2        \n\t" /* 2 ; x3 x2 x1 x0 */                 \
-    "movq       "#A3", %%mm3        \n\t" /* 3 ; w05 w04 w01 w00 */             \
-    "pshufw     $0x88, %%mm0, %%mm0 \n\t" /* x2 x0 x2 x0 */                     \
-    "movq     8+"#A3", %%mm4        \n\t" /* 4 ; w07 w06 w03 w02 */             \
-    "movq       %%mm1, %%mm5        \n\t" /* 5 ; x7 x6 x5 x4 */                 \
-    "pmaddwd    %%mm0, %%mm3        \n\t" /* x2*w05+x0*w04 x2*w01+x0*w00 */     \
-    "movq    32+"#A3", %%mm6        \n\t" /* 6 ; w21 w20 w17 w16 */             \
-    "pshufw     $0x88, %%mm1, %%mm1 \n\t" /* x6 x4 x6 x4 */                     \
-    "pmaddwd    %%mm1, %%mm4        \n\t" /* x6*w07+x4*w06 x6*w03+x4*w02 */     \
-    "movq    40+"#A3", %%mm7        \n\t" /* 7; w23 w22 w19 w18 */              \
-    "pshufw     $0xdd, %%mm2, %%mm2 \n\t" /* x3 x1 x3 x1 */                     \
-    "pmaddwd    %%mm2, %%mm6        \n\t" /* x3*w21+x1*w20 x3*w17+x1*w16 */     \
-    "pshufw     $0xdd, %%mm5, %%mm5 \n\t" /* x7 x5 x7 x5 */                     \
-    "pmaddwd    %%mm5, %%mm7        \n\t" /* x7*w23+x5*w22 x7*w19+x5*w18 */     \
-    "paddd      "#A4", %%mm3        \n\t" /* +%4 */                             \
-    "pmaddwd 16+"#A3", %%mm0        \n\t" /* x2*w13+x0*w12 x2*w09+x0*w08 */     \
-    "paddd      %%mm4, %%mm3        \n\t" /* 4 ; a1=sum(even1) a0=sum(even0) */ \
-    "pmaddwd 24+"#A3", %%mm1        \n\t" /* x6*w15+x4*w14 x6*w11+x4*w10 */     \
-    "movq       %%mm3, %%mm4        \n\t" /* 4 ; a1 a0 */                       \
-    "pmaddwd 48+"#A3", %%mm2        \n\t" /* x3*w29+x1*w28 x3*w25+x1*w24 */     \
-    "paddd      %%mm7, %%mm6        \n\t" /* 7 ; b1=sum(odd1) b0=sum(odd0) */   \
-    "pmaddwd 56+"#A3", %%mm5        \n\t" /* x7*w31+x5*w30 x7*w27+x5*w26 */     \
-    "paddd      %%mm6, %%mm3        \n\t" /* a1+b1 a0+b0 */                     \
-    "paddd      "#A4", %%mm0        \n\t" /* +%4 */                             \
-    "psrad        $11, %%mm3        \n\t" /* y1=a1+b1 y0=a0+b0 */               \
-    "paddd      %%mm1, %%mm0        \n\t" /* 1 ; a3=sum(even3) a2=sum(even2) */ \
-    "psubd      %%mm6, %%mm4        \n\t" /* 6 ; a1-b1 a0-b0 */                 \
-    "movq       %%mm0, %%mm7        \n\t" /* 7 ; a3 a2 */                       \
-    "paddd      %%mm5, %%mm2        \n\t" /* 5 ; b3=sum(odd3) b2=sum(odd2) */   \
-    "paddd      %%mm2, %%mm0        \n\t" /* a3+b3 a2+b2 */                     \
-    "psrad        $11, %%mm4        \n\t" /* y6=a1-b1 y7=a0-b0 */               \
-    "psubd      %%mm2, %%mm7        \n\t" /* 2 ; a3-b3 a2-b2 */                 \
-    "psrad        $11, %%mm0        \n\t" /* y3=a3+b3 y2=a2+b2 */               \
-    "psrad        $11, %%mm7        \n\t" /* y4=a3-b3 y5=a2-b2 */               \
-    "packssdw   %%mm0, %%mm3        \n\t" /* 0 ; y3 y2 y1 y0 */                 \
-    "packssdw   %%mm4, %%mm7        \n\t" /* 4 ; y6 y7 y4 y5 */                 \
-    "movq       %%mm3, "#A2"        \n\t" /* 3 ; save y3 y2 y1 y0 */            \
-    "pshufw     $0xb1, %%mm7, %%mm7 \n\t" /* y7 y6 y5 y4 */                     \
-    "movq       %%mm7, 8+"#A2"      \n\t" /* 7 ; save y7 y6 y5 y4 */            \
-
-
-// -----------------------------------------------------------------------------
-//
-// The first stage DCT 8x8 - forward DCTs of columns
-//
-// The %2puts are multiplied
-// for rows 0,4 - on cos_4_16,
-// for rows 1,7 - on cos_1_16,
-// for rows 2,6 - on cos_2_16,
-// for rows 3,5 - on cos_3_16
-// and are shifted to the left for rise of accuracy
-//
-// -----------------------------------------------------------------------------
-//
-// The 8-point scaled forward DCT algorithm (26a8m)
-//
-// -----------------------------------------------------------------------------
-//
-//#define DCT_8_FRW_COL(x, y)
-// {
-//     short t0, t1, t2, t3, t4, t5, t6, t7;
-//     short tp03, tm03, tp12, tm12, tp65, tm65;
-//     short tp465, tm465, tp765, tm765;
-//
-//     t0 = LEFT_SHIFT(x[0] + x[7]);
-//     t1 = LEFT_SHIFT(x[1] + x[6]);
-//     t2 = LEFT_SHIFT(x[2] + x[5]);
-//     t3 = LEFT_SHIFT(x[3] + x[4]);
-//     t4 = LEFT_SHIFT(x[3] - x[4]);
-//     t5 = LEFT_SHIFT(x[2] - x[5]);
-//     t6 = LEFT_SHIFT(x[1] - x[6]);
-//     t7 = LEFT_SHIFT(x[0] - x[7]);
-//
-//     tp03 = t0 + t3;
-//     tm03 = t0 - t3;
-//     tp12 = t1 + t2;
-//     tm12 = t1 - t2;
-//
-//     y[0] = tp03 + tp12;
-//     y[4] = tp03 - tp12;
-//
-//     y[2] = tm03 + tm12 * tg_2_16;
-//     y[6] = tm03 * tg_2_16 - tm12;
-//
-//     tp65 = (t6 + t5) * cos_4_16;
-//     tm65 = (t6 - t5) * cos_4_16;
-//
-//     tp765 = t7 + tp65;
-//     tm765 = t7 - tp65;
-//     tp465 = t4 + tm65;
-//     tm465 = t4 - tm65;
-//
-//     y[1] = tp765 + tp465 * tg_1_16;
-//     y[7] = tp765 * tg_1_16 - tp465;
-//     y[5] = tm765 * tg_3_16 + tm465;
-//     y[3] = tm765 - tm465 * tg_3_16;
-// }
-//
-// -----------------------------------------------------------------------------
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_COL_4  INP,OUT
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_COL(A1, A2)                                                   \
-    "movq    2*8(%3), %%mm0         \n\t"                                       \
-    "movq 16*3+"#A1", %%mm3         \n\t"                                       \
-    "movq      %%mm0, %%mm1         \n\t" /* tg_3_16 */                         \
-    "movq 16*5+"#A1", %%mm5         \n\t"                                       \
-    "pmulhw    %%mm3, %%mm0         \n\t" /* x3*(tg_3_16-1) */                  \
-    "movq       (%3), %%mm4         \n\t"                                       \
-    "pmulhw    %%mm5, %%mm1         \n\t" /* x5*(tg_3_16-1) */                  \
-    "movq 16*7+"#A1", %%mm7         \n\t"                                       \
-    "movq      %%mm4, %%mm2         \n\t" /* tg_1_16 */                         \
-    "movq 16*1+"#A1", %%mm6         \n\t"                                       \
-    "pmulhw    %%mm7, %%mm4         \n\t" /* x7*tg_1_16 */                      \
-    "paddsw    %%mm3, %%mm0         \n\t" /* x3*tg_3_16 */                      \
-    "pmulhw    %%mm6, %%mm2         \n\t" /* x1*tg_1_16 */                      \
-    "paddsw    %%mm3, %%mm1         \n\t" /* x3+x5*(tg_3_16-1) */               \
-    "psubsw    %%mm5, %%mm0         \n\t" /* x3*tg_3_16-x5 = tm35 */            \
-    "movq    3*8(%3), %%mm3         \n\t"                                       \
-    "paddsw    %%mm5, %%mm1         \n\t" /* x3+x5*tg_3_16 = tp35 */            \
-    "paddsw    %%mm6, %%mm4         \n\t" /* x1+tg_1_16*x7 = tp17 */            \
-    "psubsw    %%mm7, %%mm2         \n\t" /* x1*tg_1_16-x7 = tm17 */            \
-    "movq      %%mm4, %%mm5         \n\t" /* tp17 */                            \
-    "movq      %%mm2, %%mm6         \n\t" /* tm17 */                            \
-    "paddsw    %%mm1, %%mm5         \n\t" /* tp17+tp35 = b0 */                  \
-    "psubsw    %%mm0, %%mm6         \n\t" /* tm17-tm35 = b3 */                  \
-    "psubsw    %%mm1, %%mm4         \n\t" /* tp17-tp35 = t1 */                  \
-    "paddsw    %%mm0, %%mm2         \n\t" /* tm17+tm35 = t2 */                  \
-    "movq    1*8(%3), %%mm7         \n\t"                                       \
-    "movq      %%mm4, %%mm1         \n\t" /* t1 */                              \
-    "movq      %%mm5, 3*16+"#A2"    \n\t" /* save b0 */                         \
-    "paddsw    %%mm2, %%mm1         \n\t" /* t1+t2 */                           \
-    "movq      %%mm6, 5*16+"#A2"    \n\t" /* save b3 */                         \
-    "psubsw    %%mm2, %%mm4         \n\t" /* t1-t2 */                           \
-    "movq 2*16+"#A1", %%mm5         \n\t"                                       \
-    "movq      %%mm7, %%mm0         \n\t" /* tg_2_16 */                         \
-    "movq 6*16+"#A1", %%mm6         \n\t"                                       \
-    "pmulhw    %%mm5, %%mm0         \n\t" /* x2*tg_2_16 */                      \
-    "pmulhw    %%mm6, %%mm7         \n\t" /* x6*tg_2_16 */                      \
-    "pmulhw    %%mm3, %%mm1         \n\t" /* ocos_4_16*(t1+t2) = b1/2 */        \
-    "movq 0*16+"#A1", %%mm2         \n\t"                                       \
-    "pmulhw    %%mm3, %%mm4         \n\t" /* ocos_4_16*(t1-t2) = b2/2 */        \
-    "psubsw    %%mm6, %%mm0         \n\t" /* t2*tg_2_16-x6 = tm26 */            \
-    "movq      %%mm2, %%mm3         \n\t" /* x0 */                              \
-    "movq 4*16+"#A1", %%mm6         \n\t"                                       \
-    "paddsw    %%mm5, %%mm7         \n\t" /* x2+x6*tg_2_16 = tp26 */            \
-    "paddsw    %%mm6, %%mm2         \n\t" /* x0+x4 = tp04 */                    \
-    "psubsw    %%mm6, %%mm3         \n\t" /* x0-x4 = tm04 */                    \
-    "movq      %%mm2, %%mm5         \n\t" /* tp04 */                            \
-    "movq      %%mm3, %%mm6         \n\t" /* tm04 */                            \
-    "psubsw    %%mm7, %%mm2         \n\t" /* tp04-tp26 = a3 */                  \
-    "paddsw    %%mm0, %%mm3         \n\t" /* tm04+tm26 = a1 */                  \
-    "paddsw    %%mm1, %%mm1         \n\t" /* b1 */                              \
-    "paddsw    %%mm4, %%mm4         \n\t" /* b2 */                              \
-    "paddsw    %%mm7, %%mm5         \n\t" /* tp04+tp26 = a0 */                  \
-    "psubsw    %%mm0, %%mm6         \n\t" /* tm04-tm26 = a2 */                  \
-    "movq      %%mm3, %%mm7         \n\t" /* a1 */                              \
-    "movq      %%mm6, %%mm0         \n\t" /* a2 */                              \
-    "paddsw    %%mm1, %%mm3         \n\t" /* a1+b1 */                           \
-    "paddsw    %%mm4, %%mm6         \n\t" /* a2+b2 */                           \
-    "psraw        $6, %%mm3         \n\t" /* dst1 */                            \
-    "psubsw    %%mm1, %%mm7         \n\t" /* a1-b1 */                           \
-    "psraw        $6, %%mm6         \n\t" /* dst2 */                            \
-    "psubsw    %%mm4, %%mm0         \n\t" /* a2-b2 */                           \
-    "movq 3*16+"#A2", %%mm1         \n\t" /* load b0 */                         \
-    "psraw        $6, %%mm7         \n\t" /* dst6 */                            \
-    "movq      %%mm5, %%mm4         \n\t" /* a0 */                              \
-    "psraw        $6, %%mm0         \n\t" /* dst5 */                            \
-    "movq      %%mm3, 1*16+"#A2"    \n\t"                                       \
-    "paddsw    %%mm1, %%mm5         \n\t" /* a0+b0 */                           \
-    "movq      %%mm6, 2*16+"#A2"    \n\t"                                       \
-    "psubsw    %%mm1, %%mm4         \n\t" /* a0-b0 */                           \
-    "movq 5*16+"#A2", %%mm3         \n\t" /* load b3 */                         \
-    "psraw        $6, %%mm5         \n\t" /* dst0 */                            \
-    "movq      %%mm2, %%mm6         \n\t" /* a3 */                              \
-    "psraw        $6, %%mm4         \n\t" /* dst7 */                            \
-    "movq      %%mm0, 5*16+"#A2"    \n\t"                                       \
-    "paddsw    %%mm3, %%mm2         \n\t" /* a3+b3 */                           \
-    "movq      %%mm7, 6*16+"#A2"    \n\t"                                       \
-    "psubsw    %%mm3, %%mm6         \n\t" /* a3-b3 */                           \
-    "movq      %%mm5, 0*16+"#A2"    \n\t"                                       \
-    "psraw        $6, %%mm2         \n\t" /* dst3 */                            \
-    "movq      %%mm4, 7*16+"#A2"    \n\t"                                       \
-    "psraw        $6, %%mm6         \n\t" /* dst4 */                            \
-    "movq      %%mm2, 3*16+"#A2"    \n\t"                                       \
-    "movq      %%mm6, 4*16+"#A2"    \n\t"                                       \
-
-// =============================================================================
-// Code
-// =============================================================================
-
-// -----------------------------------------------------------------------------
-// void idct_mmx(uint16_t block[64]);
-// -----------------------------------------------------------------------------
-
-void ff_xvid_idct_mmx(short *block)
-{
-    __asm__ volatile (
-        // # Process each row
-        DCT_8_INV_ROW_MMX(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1))
-        DCT_8_INV_ROW_MMX(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1))
-        DCT_8_INV_ROW_MMX(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1))
-        DCT_8_INV_ROW_MMX(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1))
-        DCT_8_INV_ROW_MMX(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1))
-        DCT_8_INV_ROW_MMX(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1))
-        DCT_8_INV_ROW_MMX(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1))
-        DCT_8_INV_ROW_MMX(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1))
-
-        // # Process the columns (4 at a time)
-        DCT_8_INV_COL(0(%0), 0(%0))
-        DCT_8_INV_COL(8(%0), 8(%0))
-        :: "r" (block), "r" (rounder_0), "r" (tab_i_04_mmx), "r" (tg_1_16));
-}
-
-void ff_xvid_idct_mmx_put(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_xvid_idct_mmx(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_mmx_add(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_xvid_idct_mmx(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMX_INLINE */
-
-#if HAVE_MMXEXT_INLINE
-
-// -----------------------------------------------------------------------------
-// void idct_xmm(uint16_t block[64]);
-// -----------------------------------------------------------------------------
-
-void ff_xvid_idct_mmxext(short *block)
-{
-    __asm__ volatile (
-        // # Process each row
-        DCT_8_INV_ROW_XMM(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1))
-        DCT_8_INV_ROW_XMM(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1))
-        DCT_8_INV_ROW_XMM(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1))
-        DCT_8_INV_ROW_XMM(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1))
-        DCT_8_INV_ROW_XMM(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1))
-        DCT_8_INV_ROW_XMM(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1))
-        DCT_8_INV_ROW_XMM(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1))
-        DCT_8_INV_ROW_XMM(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1))
-
-        // # Process the columns (4 at a time)
-        DCT_8_INV_COL(0(%0), 0(%0))
-        DCT_8_INV_COL(8(%0), 8(%0))
-        :: "r" (block), "r" (rounder_0), "r" (tab_i_04_xmm), "r" (tg_1_16));
-}
-
-void ff_xvid_idct_mmxext_put(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_xvid_idct_mmxext(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_mmxext_add(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_xvid_idct_mmxext(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMXEXT_INLINE */
diff --git a/libavcodec/x86/xvididct_sse2.c b/libavcodec/x86/xvididct_sse2.c
deleted file mode 100644
index d4f01693ea..0000000000
--- a/libavcodec/x86/xvididct_sse2.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - SSE2 inverse discrete cosine transform -
- *
- * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
- *
- * Conversion to gcc syntax with modifications
- * by Alexander Strange <astrange@ithinksw.com>
- *
- * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
- *
- * This file is part of Libav.
- *
- * Vertical pass is an implementation of the scheme:
- *  Loeffler C., Ligtenberg A., and Moschytz C.S.:
- *  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
- *  Proc. ICASSP 1989, 988-991.
- *
- * Horizontal pass is a double 4x4 vector/matrix multiplication,
- * (see also Intel's Application Note 922:
- *  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- *  Copyright (C) 1999 Intel Corporation)
- *
- * More details at http://skal.planet-d.net/coding/dct.html
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/internal.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-
-#include "idctdsp.h"
-#include "xvididct.h"
-
-#if HAVE_SSE2_INLINE
-
-/**
- * @file
- * @brief SSE2 IDCT compatible with the Xvid IDCT
- */
-
-#define X8(x) x, x, x, x, x, x, x, x
-
-DECLARE_ASM_CONST(16, int16_t, tan1)[]  = { X8(13036) }; // tan( pi/16)
-DECLARE_ASM_CONST(16, int16_t, tan2)[]  = { X8(27146) }; // tan(2pi/16) = sqrt(2)-1
-DECLARE_ASM_CONST(16, int16_t, tan3)[]  = { X8(43790) }; // tan(3pi/16)-1
-DECLARE_ASM_CONST(16, int16_t, sqrt2)[] = { X8(23170) }; // 0.5/sqrt(2)
-DECLARE_ASM_CONST(8,  uint8_t, m127)[]  = { X8(127) };
-
-DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
-    0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
-    0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
-    0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
-    0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
-    0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
-    0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
-    0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
-    0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
-    0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
-    0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
-    0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
-    0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
-    0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
-    0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
-    0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
-    0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
-};
-
-DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
-    65536, 65536, 65536, 65536,
-     3597,  3597,  3597,  3597,
-     2260,  2260,  2260,  2260,
-     1203,  1203,  1203,  1203,
-      120,   120,   120,   120,
-      512,   512,   512,   512
-};
-
-// Temporary storage before the column pass
-#define ROW1 "%%xmm6"
-#define ROW3 "%%xmm4"
-#define ROW5 "%%xmm5"
-#define ROW7 "%%xmm7"
-
-#define CLEAR_ODD(r) "pxor  "r","r" \n\t"
-#define PUT_ODD(dst) "pshufhw  $0x1B, %%xmm2, "dst"   \n\t"
-
-#if ARCH_X86_64
-
-# define ROW0 "%%xmm8"
-# define REG0 ROW0
-# define ROW2 "%%xmm9"
-# define REG2 ROW2
-# define ROW4 "%%xmm10"
-# define REG4 ROW4
-# define ROW6 "%%xmm11"
-# define REG6 ROW6
-# define CLEAR_EVEN(r) CLEAR_ODD(r)
-# define PUT_EVEN(dst) PUT_ODD(dst)
-# define XMMS "%%xmm12"
-# define MOV_32_ONLY "#"
-# define SREG2 REG2
-# define TAN3 "%%xmm13"
-# define TAN1 "%%xmm14"
-
-#else
-
-# define ROW0 "(%0)"
-# define REG0 "%%xmm4"
-# define ROW2 "2*16(%0)"
-# define REG2 "%%xmm4"
-# define ROW4 "4*16(%0)"
-# define REG4 "%%xmm6"
-# define ROW6 "6*16(%0)"
-# define REG6 "%%xmm6"
-# define CLEAR_EVEN(r)
-# define PUT_EVEN(dst) \
-    "pshufhw  $0x1B, %%xmm2, %%xmm2   \n\t" \
-    "movdqa          %%xmm2, "dst"    \n\t"
-# define XMMS "%%xmm2"
-# define MOV_32_ONLY "movdqa "
-# define SREG2 "%%xmm7"
-# define TAN3 "%%xmm0"
-# define TAN1 "%%xmm2"
-
-#endif
-
-#define ROUND(x) "paddd   "MANGLE(x)
-
-#define JZ(reg, to)                         \
-    "testl     "reg","reg"            \n\t" \
-    "jz        "to"                   \n\t"
-
-#define JNZ(reg, to)                        \
-    "testl     "reg","reg"            \n\t" \
-    "jnz       "to"                   \n\t"
-
-#define TEST_ONE_ROW(src, reg, clear)       \
-    clear                                   \
-    "movq     "src", %%mm1            \n\t" \
-    "por    8+"src", %%mm1            \n\t" \
-    "paddusb  %%mm0, %%mm1            \n\t" \
-    "pmovmskb %%mm1, "reg"            \n\t"
-
-#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
-    clear1                                                    \
-    clear2                                                    \
-    "movq     "row1", %%mm1           \n\t"                   \
-    "por    8+"row1", %%mm1           \n\t"                   \
-    "movq     "row2", %%mm2           \n\t"                   \
-    "por    8+"row2", %%mm2           \n\t"                   \
-    "paddusb   %%mm0, %%mm1           \n\t"                   \
-    "paddusb   %%mm0, %%mm2           \n\t"                   \
-    "pmovmskb  %%mm1, "reg1"          \n\t"                   \
-    "pmovmskb  %%mm2, "reg2"          \n\t"
-
-/// IDCT pass on rows.
-#define iMTX_MULT(src, table, rounder, put)            \
-    "movdqa        "src", %%xmm3      \n\t"            \
-    "movdqa       %%xmm3, %%xmm0      \n\t"            \
-    "pshufd   $0x11, %%xmm3, %%xmm1   \n\t" /* 4602 */ \
-    "punpcklqdq   %%xmm0, %%xmm0      \n\t" /* 0246 */ \
-    "pmaddwd     "table", %%xmm0      \n\t"            \
-    "pmaddwd  16+"table", %%xmm1      \n\t"            \
-    "pshufd   $0xBB, %%xmm3, %%xmm2   \n\t" /* 5713 */ \
-    "punpckhqdq   %%xmm3, %%xmm3      \n\t" /* 1357 */ \
-    "pmaddwd  32+"table", %%xmm2      \n\t"            \
-    "pmaddwd  48+"table", %%xmm3      \n\t"            \
-    "paddd        %%xmm1, %%xmm0      \n\t"            \
-    "paddd        %%xmm3, %%xmm2      \n\t"            \
-    rounder",     %%xmm0              \n\t"            \
-    "movdqa       %%xmm2, %%xmm3      \n\t"            \
-    "paddd        %%xmm0, %%xmm2      \n\t"            \
-    "psubd        %%xmm3, %%xmm0      \n\t"            \
-    "psrad           $11, %%xmm2      \n\t"            \
-    "psrad           $11, %%xmm0      \n\t"            \
-    "packssdw     %%xmm0, %%xmm2      \n\t"            \
-    put                                                \
-    "1:                               \n\t"
-
-#define iLLM_HEAD                           \
-    "movdqa   "MANGLE(tan3)", "TAN3"  \n\t" \
-    "movdqa   "MANGLE(tan1)", "TAN1"  \n\t" \
-
-/// IDCT pass on columns.
-#define iLLM_PASS(dct)                      \
-    "movdqa   "TAN3", %%xmm1          \n\t" \
-    "movdqa   "TAN1", %%xmm3          \n\t" \
-    "pmulhw   %%xmm4, "TAN3"          \n\t" \
-    "pmulhw   %%xmm5, %%xmm1          \n\t" \
-    "paddsw   %%xmm4, "TAN3"          \n\t" \
-    "paddsw   %%xmm5, %%xmm1          \n\t" \
-    "psubsw   %%xmm5, "TAN3"          \n\t" \
-    "paddsw   %%xmm4, %%xmm1          \n\t" \
-    "pmulhw   %%xmm7, %%xmm3          \n\t" \
-    "pmulhw   %%xmm6, "TAN1"          \n\t" \
-    "paddsw   %%xmm6, %%xmm3          \n\t" \
-    "psubsw   %%xmm7, "TAN1"          \n\t" \
-    "movdqa   %%xmm3, %%xmm7          \n\t" \
-    "movdqa   "TAN1", %%xmm6          \n\t" \
-    "psubsw   %%xmm1, %%xmm3          \n\t" \
-    "psubsw   "TAN3", "TAN1"          \n\t" \
-    "paddsw   %%xmm7, %%xmm1          \n\t" \
-    "paddsw   %%xmm6, "TAN3"          \n\t" \
-    "movdqa   %%xmm3, %%xmm6          \n\t" \
-    "psubsw   "TAN3", %%xmm3          \n\t" \
-    "paddsw   %%xmm6, "TAN3"          \n\t" \
-    "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \
-    "pmulhw   %%xmm4, %%xmm3          \n\t" \
-    "pmulhw   %%xmm4, "TAN3"          \n\t" \
-    "paddsw   "TAN3", "TAN3"          \n\t" \
-    "paddsw   %%xmm3, %%xmm3          \n\t" \
-    "movdqa   "MANGLE(tan2)", %%xmm7  \n\t" \
-    MOV_32_ONLY ROW2", "REG2"         \n\t" \
-    MOV_32_ONLY ROW6", "REG6"         \n\t" \
-    "movdqa   %%xmm7, %%xmm5          \n\t" \
-    "pmulhw   "REG6", %%xmm7          \n\t" \
-    "pmulhw   "REG2", %%xmm5          \n\t" \
-    "paddsw   "REG2", %%xmm7          \n\t" \
-    "psubsw   "REG6", %%xmm5          \n\t" \
-    MOV_32_ONLY ROW0", "REG0"         \n\t" \
-    MOV_32_ONLY ROW4", "REG4"         \n\t" \
-    MOV_32_ONLY"  "TAN1", (%0)        \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   "REG4", "REG0"          \n\t" \
-    "paddsw   "XMMS", "REG4"          \n\t" \
-    "movdqa   "REG4", "XMMS"          \n\t" \
-    "psubsw   %%xmm7, "REG4"          \n\t" \
-    "paddsw   "XMMS", %%xmm7          \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   %%xmm5, "REG0"          \n\t" \
-    "paddsw   "XMMS", %%xmm5          \n\t" \
-    "movdqa   %%xmm5, "XMMS"          \n\t" \
-    "psubsw   "TAN3", %%xmm5          \n\t" \
-    "paddsw   "XMMS", "TAN3"          \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   %%xmm3, "REG0"          \n\t" \
-    "paddsw   "XMMS", %%xmm3          \n\t" \
-    MOV_32_ONLY"  (%0), "TAN1"        \n\t" \
-    "psraw        $6, %%xmm5          \n\t" \
-    "psraw        $6, "REG0"          \n\t" \
-    "psraw        $6, "TAN3"          \n\t" \
-    "psraw        $6, %%xmm3          \n\t" \
-    "movdqa   "TAN3", 1*16("dct")     \n\t" \
-    "movdqa   %%xmm3, 2*16("dct")     \n\t" \
-    "movdqa   "REG0", 5*16("dct")     \n\t" \
-    "movdqa   %%xmm5, 6*16("dct")     \n\t" \
-    "movdqa   %%xmm7, %%xmm0          \n\t" \
-    "movdqa   "REG4", %%xmm4          \n\t" \
-    "psubsw   %%xmm1, %%xmm7          \n\t" \
-    "psubsw   "TAN1", "REG4"          \n\t" \
-    "paddsw   %%xmm0, %%xmm1          \n\t" \
-    "paddsw   %%xmm4, "TAN1"          \n\t" \
-    "psraw        $6, %%xmm1          \n\t" \
-    "psraw        $6, %%xmm7          \n\t" \
-    "psraw        $6, "TAN1"          \n\t" \
-    "psraw        $6, "REG4"          \n\t" \
-    "movdqa   %%xmm1, ("dct")         \n\t" \
-    "movdqa   "TAN1", 3*16("dct")     \n\t" \
-    "movdqa   "REG4", 4*16("dct")     \n\t" \
-    "movdqa   %%xmm7, 7*16("dct")     \n\t"
-
-/// IDCT pass on columns, assuming rows 4-7 are zero.
-#define iLLM_PASS_SPARSE(dct)               \
-    "pmulhw   %%xmm4, "TAN3"          \n\t" \
-    "paddsw   %%xmm4, "TAN3"          \n\t" \
-    "movdqa   %%xmm6, %%xmm3          \n\t" \
-    "pmulhw   %%xmm6, "TAN1"          \n\t" \
-    "movdqa   %%xmm4, %%xmm1          \n\t" \
-    "psubsw   %%xmm1, %%xmm3          \n\t" \
-    "paddsw   %%xmm6, %%xmm1          \n\t" \
-    "movdqa   "TAN1", %%xmm6          \n\t" \
-    "psubsw   "TAN3", "TAN1"          \n\t" \
-    "paddsw   %%xmm6, "TAN3"          \n\t" \
-    "movdqa   %%xmm3, %%xmm6          \n\t" \
-    "psubsw   "TAN3", %%xmm3          \n\t" \
-    "paddsw   %%xmm6, "TAN3"          \n\t" \
-    "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \
-    "pmulhw   %%xmm4, %%xmm3          \n\t" \
-    "pmulhw   %%xmm4, "TAN3"          \n\t" \
-    "paddsw   "TAN3", "TAN3"          \n\t" \
-    "paddsw   %%xmm3, %%xmm3          \n\t" \
-    "movdqa   "MANGLE(tan2)", %%xmm5  \n\t" \
-    MOV_32_ONLY ROW2", "SREG2"        \n\t" \
-    "pmulhw   "SREG2", %%xmm5         \n\t" \
-    MOV_32_ONLY ROW0", "REG0"         \n\t" \
-    "movdqa   "REG0", %%xmm6          \n\t" \
-    "psubsw   "SREG2", %%xmm6         \n\t" \
-    "paddsw   "REG0", "SREG2"         \n\t" \
-    MOV_32_ONLY"  "TAN1", (%0)        \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   %%xmm5, "REG0"          \n\t" \
-    "paddsw   "XMMS", %%xmm5          \n\t" \
-    "movdqa   %%xmm5, "XMMS"          \n\t" \
-    "psubsw   "TAN3", %%xmm5          \n\t" \
-    "paddsw   "XMMS", "TAN3"          \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   %%xmm3, "REG0"          \n\t" \
-    "paddsw   "XMMS", %%xmm3          \n\t" \
-    MOV_32_ONLY"  (%0), "TAN1"        \n\t" \
-    "psraw        $6, %%xmm5          \n\t" \
-    "psraw        $6, "REG0"          \n\t" \
-    "psraw        $6, "TAN3"          \n\t" \
-    "psraw        $6, %%xmm3          \n\t" \
-    "movdqa   "TAN3", 1*16("dct")     \n\t" \
-    "movdqa   %%xmm3, 2*16("dct")     \n\t" \
-    "movdqa   "REG0", 5*16("dct")     \n\t" \
-    "movdqa   %%xmm5, 6*16("dct")     \n\t" \
-    "movdqa   "SREG2", %%xmm0         \n\t" \
-    "movdqa   %%xmm6, %%xmm4          \n\t" \
-    "psubsw   %%xmm1, "SREG2"         \n\t" \
-    "psubsw   "TAN1", %%xmm6          \n\t" \
-    "paddsw   %%xmm0, %%xmm1          \n\t" \
-    "paddsw   %%xmm4, "TAN1"          \n\t" \
-    "psraw        $6, %%xmm1          \n\t" \
-    "psraw        $6, "SREG2"         \n\t" \
-    "psraw        $6, "TAN1"          \n\t" \
-    "psraw        $6, %%xmm6          \n\t" \
-    "movdqa   %%xmm1, ("dct")         \n\t" \
-    "movdqa   "TAN1", 3*16("dct")     \n\t" \
-    "movdqa   %%xmm6, 4*16("dct")     \n\t" \
-    "movdqa   "SREG2", 7*16("dct")    \n\t"
-
-inline void ff_xvid_idct_sse2(short *block)
-{
-    __asm__ volatile (
-        "movq     "MANGLE (m127) ", %%mm0                              \n\t"
-        iMTX_MULT("(%0)",     MANGLE(iTab1), ROUND(walkenIdctRounders),          PUT_EVEN(ROW0))
-        iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 1 * 16), PUT_ODD(ROW1))
-        iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 2 * 16), PUT_EVEN(ROW2))
-
-        TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
-        JZ("%%eax", "1f")
-        iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 3 * 16), PUT_ODD(ROW3))
-
-        TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
-        TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
-        iLLM_HEAD
-        ".p2align 4 \n\t"
-        JNZ("%%ecx", "2f")
-        JNZ("%%eax", "3f")
-        JNZ("%%edx", "4f")
-        JNZ("%%esi", "5f")
-        iLLM_PASS_SPARSE("%0")
-        "jmp 6f                                                      \n\t"
-        "2:                                                          \n\t"
-        iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
-        "3:                                                          \n\t"
-        iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 4 * 16), PUT_ODD(ROW5))
-        JZ("%%edx", "1f")
-        "4:                                                          \n\t"
-        iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 5 * 16), PUT_EVEN(ROW6))
-        JZ("%%esi", "1f")
-        "5:                                                          \n\t"
-        iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 5 * 16), PUT_ODD(ROW7))
-#if ARCH_X86_32
-        iLLM_HEAD
-#endif
-        iLLM_PASS("%0")
-        "6:                                                          \n\t"
-        : "+r" (block)
-        :
-        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                       "%xmm4", "%xmm5", "%xmm6", "%xmm7", )
-#if ARCH_X86_64
-          XMM_CLOBBERS("%xmm8", "%xmm9", "%xmm10", "%xmm11",
-                       "%xmm12", "%xmm13", "%xmm14", )
-#endif
-          "%eax", "%ecx", "%edx", "%esi", "memory");
-}
-
-void ff_xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block)
-{
-    ff_xvid_idct_sse2(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
-{
-    ff_xvid_idct_sse2(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_SSE2_INLINE */
diff --git a/libavcodec/xan.c b/libavcodec/xan.c
index e7bbe413b6..662386af9a 100644
--- a/libavcodec/xan.c
+++ b/libavcodec/xan.c
@@ -1,21 +1,21 @@
 /*
  * Wing Commander/Xan Video Decoder
- * Copyright (C) 2003 the ffmpeg project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,13 +54,13 @@ typedef struct XanContext {
     AVCodecContext *avctx;
     AVFrame *last_frame;
 
-    const unsigned char *buf;
+    const uint8_t *buf;
     int size;
 
     /* scratch space */
-    unsigned char *buffer1;
+    uint8_t *buffer1;
     int buffer1_size;
-    unsigned char *buffer2;
+    uint8_t *buffer2;
     int buffer2_size;
 
     unsigned *palettes;
@@ -113,22 +113,21 @@ static av_cold int xan_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static int xan_huffman_decode(unsigned char *dest, int dest_len,
-                              const unsigned char *src, int src_len)
+static int xan_huffman_decode(uint8_t *dest, int dest_len,
+                              const uint8_t *src, int src_len)
 {
-    unsigned char byte = *src++;
-    unsigned char ival = byte + 0x16;
-    const unsigned char * ptr = src + byte*2;
+    uint8_t byte = *src++;
+    uint8_t ival = byte + 0x16;
+    const uint8_t * ptr = src + byte*2;
     int ptr_len = src_len - 1 - byte*2;
-    unsigned char val = ival;
-    unsigned char *dest_end = dest + dest_len;
-    unsigned char *dest_start = dest;
+    uint8_t val = ival;
+    uint8_t *dest_end = dest + dest_len;
+    uint8_t *dest_start = dest;
+    int ret;
     GetBitContext gb;
 
-    if (ptr_len < 0)
-        return AVERROR_INVALIDDATA;
-
-    init_get_bits(&gb, ptr, ptr_len * 8);
+    if ((ret = init_get_bits8(&gb, ptr, ptr_len)) < 0)
+        return ret;
 
     while (val != 0x16) {
         unsigned idx = val - 0x17 + get_bits1(&gb) * byte;
@@ -152,13 +151,13 @@ static int xan_huffman_decode(unsigned char *dest, int dest_len,
  *
  * @param dest destination buffer of dest_len, must be padded with at least 130 bytes
  */
-static void xan_unpack(unsigned char *dest, int dest_len,
-                       const unsigned char *src, int src_len)
+static void xan_unpack(uint8_t *dest, int dest_len,
+                       const uint8_t *src, int src_len)
 {
-    unsigned char opcode;
+    uint8_t opcode;
     int size;
-    unsigned char *dest_org = dest;
-    unsigned char *dest_end = dest + dest_len;
+    uint8_t *dest_org = dest;
+    uint8_t *dest_end = dest + dest_len;
     GetByteContext ctx;
 
     bytestream2_init(&ctx, src, src_len);
@@ -207,14 +206,14 @@ static void xan_unpack(unsigned char *dest, int dest_len,
 }
 
 static inline void xan_wc3_output_pixel_run(XanContext *s, AVFrame *frame,
-    const unsigned char *pixel_buffer, int x, int y, int pixel_count)
+    const uint8_t *pixel_buffer, int x, int y, int pixel_count)
 {
     int stride;
     int line_inc;
     int index;
     int current_x;
     int width = s->avctx->width;
-    unsigned char *palette_plane;
+    uint8_t *palette_plane;
 
     palette_plane = frame->data[0];
     stride = frame->linesize[0];
@@ -246,7 +245,7 @@ static inline void xan_wc3_copy_pixel_run(XanContext *s, AVFrame *frame,
     int curframe_index, prevframe_index;
     int curframe_x, prevframe_x;
     int width = s->avctx->width;
-    unsigned char *palette_plane, *prev_palette_plane;
+    uint8_t *palette_plane, *prev_palette_plane;
 
     if (y + motion_y < 0 || y + motion_y >= s->avctx->height ||
         x + motion_x < 0 || x + motion_x >= s->avctx->width)
@@ -262,6 +261,12 @@ static inline void xan_wc3_copy_pixel_run(XanContext *s, AVFrame *frame,
     curframe_x = x;
     prevframe_index = (y + motion_y) * stride + x + motion_x;
     prevframe_x = x + motion_x;
+
+    if (prev_palette_plane == palette_plane && FFABS(curframe_index - prevframe_index) < pixel_count) {
+         avpriv_request_sample(s->avctx, "Overlapping copy");
+         return ;
+    }
+
     while (pixel_count &&
            curframe_index  < s->frame_size &&
            prevframe_index < s->frame_size) {
@@ -294,22 +299,22 @@ static int xan_wc3_decode_frame(XanContext *s, AVFrame *frame)
     int width  = s->avctx->width;
     int height = s->avctx->height;
     int total_pixels = width * height;
-    unsigned char opcode;
-    unsigned char flag = 0;
+    uint8_t opcode;
+    uint8_t flag = 0;
     int size = 0;
     int motion_x, motion_y;
     int x, y, ret;
 
-    unsigned char *opcode_buffer = s->buffer1;
-    unsigned char *opcode_buffer_end = s->buffer1 + s->buffer1_size;
+    uint8_t *opcode_buffer = s->buffer1;
+    uint8_t *opcode_buffer_end = s->buffer1 + s->buffer1_size;
     int opcode_buffer_size = s->buffer1_size;
-    const unsigned char *imagedata_buffer = s->buffer2;
+    const uint8_t *imagedata_buffer = s->buffer2;
 
     /* pointers to segments inside the compressed chunk */
-    const unsigned char *huffman_segment;
+    const uint8_t *huffman_segment;
     GetByteContext       size_segment;
     GetByteContext       vector_segment;
-    const unsigned char *imagedata_segment;
+    const uint8_t *imagedata_segment;
     int huffman_offset, size_offset, vector_offset, imagedata_offset,
         imagedata_size;
 
@@ -382,16 +387,28 @@ static int xan_wc3_decode_frame(XanContext *s, AVFrame *frame)
 
         case 9:
         case 19:
+            if (bytestream2_get_bytes_left(&size_segment) < 1) {
+                av_log(s->avctx, AV_LOG_ERROR, "size_segment overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             size = bytestream2_get_byte(&size_segment);
             break;
 
         case 10:
         case 20:
+            if (bytestream2_get_bytes_left(&size_segment) < 2) {
+                av_log(s->avctx, AV_LOG_ERROR, "size_segment overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             size = bytestream2_get_be16(&size_segment);
             break;
 
         case 11:
         case 21:
+            if (bytestream2_get_bytes_left(&size_segment) < 3) {
+                av_log(s->avctx, AV_LOG_ERROR, "size_segment overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             size = bytestream2_get_be24(&size_segment);
             break;
         }
@@ -413,8 +430,13 @@ static int xan_wc3_decode_frame(XanContext *s, AVFrame *frame)
                 imagedata_size -= size;
             }
         } else {
+            uint8_t vector;
+            if (bytestream2_get_bytes_left(&vector_segment) <= 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "vector_segment overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             /* run-based motion compensation from last frame */
-            uint8_t vector = bytestream2_get_byte(&vector_segment);
+            vector = bytestream2_get_byte(&vector_segment);
             motion_x = sign_extend(vector >> 4,  4);
             motion_y = sign_extend(vector & 0xF, 4);
 
@@ -534,6 +556,10 @@ static int xan_decode_frame(AVCodecContext *avctx,
         int i;
         tag  = bytestream2_get_le32(&ctx);
         size = bytestream2_get_be32(&ctx);
+        if (size < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid tag size %d\n", size);
+            return AVERROR_INVALIDDATA;
+        }
         size = FFMIN(size, bytestream2_get_bytes_left(&ctx));
         switch (tag) {
         case PALT_TAG:
@@ -541,8 +567,8 @@ static int xan_decode_frame(AVCodecContext *avctx,
                 return AVERROR_INVALIDDATA;
             if (s->palettes_count >= PALETTES_MAX)
                 return AVERROR_INVALIDDATA;
-            tmpptr = av_realloc(s->palettes,
-                                (s->palettes_count + 1) * AVPALETTE_SIZE);
+            tmpptr = av_realloc_array(s->palettes,
+                                      s->palettes_count + 1, AVPALETTE_SIZE);
             if (!tmpptr)
                 return AVERROR(ENOMEM);
             s->palettes = tmpptr;
@@ -557,7 +583,7 @@ static int xan_decode_frame(AVCodecContext *avctx,
                 int g = gamma_lookup[bytestream2_get_byteu(&ctx)];
                 int b = gamma_lookup[bytestream2_get_byteu(&ctx)];
 #endif
-                *tmpptr++ = (r << 16) | (g << 8) | b;
+                *tmpptr++ = (0xFFU << 24) | (r << 16) | (g << 8) | b;
             }
             s->palettes_count++;
             break;
@@ -584,10 +610,8 @@ static int xan_decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF))) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if (!s->frame_size)
         s->frame_size = frame->linesize[0] * s->avctx->height;
diff --git a/libavcodec/xbmdec.c b/libavcodec/xbmdec.c
index 2ce1465170..d19bdaee23 100644
--- a/libavcodec/xbmdec.c
+++ b/libavcodec/xbmdec.c
@@ -1,20 +1,22 @@
 /*
  * XBM image format
  *
- * This file is part of Libav.
+ * Copyright (c) 2012 Paul B Mahol
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,43 +26,54 @@
 #include "internal.h"
 #include "mathops.h"
 
+static int convert(uint8_t x)
+{
+    if (x >= 'a')
+        x -= 87;
+    else if (x >= 'A')
+        x -= 55;
+    else
+        x -= '0';
+    return x;
+}
+
+static int parse_str_int(const uint8_t *p, int len, const uint8_t *key)
+{
+    const uint8_t *end = p + len;
+
+    for(; p<end - strlen(key); p++) {
+        if (!memcmp(p, key, strlen(key)))
+            break;
+    }
+    p += strlen(key);
+    if (p >= end)
+        return INT_MIN;
+
+    for(; p<end; p++) {
+        char *eptr;
+        int64_t ret = strtol(p, &eptr, 10);
+        if ((const uint8_t *)eptr != p)
+            return ret;
+    }
+    return INT_MIN;
+}
+
 static int xbm_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame, AVPacket *avpkt)
 {
     AVFrame *p = data;
-    int ret, linesize, i;
+    int ret, linesize, i, j;
     int width  = 0;
     int height = 0;
-    const uint8_t *ptr = avpkt->data;
+    const uint8_t *end, *ptr = avpkt->data;
+    const uint8_t *next;
     uint8_t *dst;
 
     avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
-    while (!width || !height) {
-        ptr += strcspn(ptr, "#");
-        if (ptr >= avpkt->data + avpkt->size) {
-            av_log(avctx, AV_LOG_ERROR, "End of file reached.\n");
-            return AVERROR_INVALIDDATA;
-        }
-        if (strncmp(ptr, "#define", 7) != 0) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Unexpected preprocessor directive.\n");
-            return AVERROR_INVALIDDATA;
-        }
-        // skip the name
-        ptr += strcspn(ptr, "_") + 1;
-        // get width or height
-        if (strncmp(ptr, "width", 5) == 0) {
-            ptr += strcspn(ptr, " ");
-            width = strtol(ptr, NULL, 10);
-        } else if (strncmp(ptr, "height", 6) == 0) {
-            ptr += strcspn(ptr, " ");
-            height = strtol(ptr, NULL, 10);
-        } else {
-            // skip offset and unknown variables
-            av_log(avctx, AV_LOG_VERBOSE,
-                   "Ignoring preprocessor directive.\n");
-        }
-    }
+    end = avpkt->data + avpkt->size;
+
+    width  = parse_str_int(avpkt->data, avpkt->size, "_width");
+    height = parse_str_int(avpkt->data, avpkt->size, "_height");
 
     if ((ret = ff_set_dimensions(avctx, width, height)) < 0)
         return ret;
@@ -68,46 +81,48 @@ static int xbm_decode_frame(AVCodecContext *avctx, void *data,
     if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
 
-    // go to start of image data
-    ptr += strcspn(ptr, "{");
+    // goto start of image data
+    next = memchr(ptr, '{', avpkt->size);
+    if (!next)
+        next = memchr(ptr, '(', avpkt->size);
+    if (!next)
+        return AVERROR_INVALIDDATA;
+    ptr = next + 1;
 
     linesize = (avctx->width + 7) / 8;
     for (i = 0; i < avctx->height; i++) {
-        int eol = 0, e = 0;
         dst = p->data[0] + i * p->linesize[0];
-        if (ptr >= avpkt->data + avpkt->size) {
-            av_log(avctx, AV_LOG_ERROR, "End of file reached.\n");
-            return AVERROR_INVALIDDATA;
-        }
-        do {
-            int val;
-            uint8_t *endptr;
+        for (j = 0; j < linesize; j++) {
+            uint8_t val;
 
-            ptr += strcspn(ptr, "x") - 1; // -1 to get 0x
-            val = strtol(ptr, (char **)&endptr, 16);
+            while (ptr < end && *ptr != 'x' && *ptr != '$')
+                ptr++;
 
-            if (endptr - ptr == 4) {
-                // XBM X11 format
+            ptr ++;
+            if (ptr < end && av_isxdigit(*ptr)) {
+                val = convert(*ptr++);
+                if (av_isxdigit(*ptr))
+                    val = (val << 4) + convert(*ptr++);
                 *dst++ = ff_reverse[val];
-                eol = linesize;
-            } else if (endptr - ptr == 6) {
-                // XBM X10 format
-                *dst++ = ff_reverse[val >> 8];
-                *dst++ = ff_reverse[val & 0xFF];
-                eol = linesize / 2; // 2 bytes read
+                if (av_isxdigit(*ptr) && j+1 < linesize) {
+                    j++;
+                    val = convert(*ptr++);
+                    if (av_isxdigit(*ptr))
+                        val = (val << 4) + convert(*ptr++);
+                    *dst++ = ff_reverse[val];
+                }
             } else {
                 av_log(avctx, AV_LOG_ERROR,
                        "Unexpected data at %.8s.\n", ptr);
                 return AVERROR_INVALIDDATA;
             }
-            ptr = endptr;
-        } while (++e < eol);
+        }
     }
 
     p->key_frame = 1;
     p->pict_type = AV_PICTURE_TYPE_I;
 
-    *got_frame = 1;
+    *got_frame       = 1;
 
     return avpkt->size;
 }
diff --git a/libavcodec/xbmenc.c b/libavcodec/xbmenc.c
index 4840050984..b25615f2a4 100644
--- a/libavcodec/xbmenc.c
+++ b/libavcodec/xbmenc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,17 +24,6 @@
 #include "internal.h"
 #include "mathops.h"
 
-static av_cold int xbm_encode_init(AVCodecContext *avctx)
-{
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
-    return 0;
-}
-
 static int xbm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                             const AVFrame *p, int *got_packet)
 {
@@ -43,10 +32,8 @@ static int xbm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     linesize = (avctx->width + 7) / 8;
     size     = avctx->height * (linesize * 7 + 2) + 110;
-    if ((ret = ff_alloc_packet(pkt, size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, size, 0)) < 0)
         return ret;
-    }
 
     buf = pkt->data;
     ptr = p->data[0];
@@ -73,8 +60,7 @@ AVCodec ff_xbm_encoder = {
     .long_name    = NULL_IF_CONFIG_SMALL("XBM (X BitMap) image"),
     .type         = AVMEDIA_TYPE_VIDEO,
     .id           = AV_CODEC_ID_XBM,
-    .init         = xbm_encode_init,
     .encode2      = xbm_encode_frame,
     .pix_fmts     = (const enum AVPixelFormat[]) { AV_PIX_FMT_MONOWHITE,
-                                                 AV_PIX_FMT_NONE },
+                                                   AV_PIX_FMT_NONE },
 };
diff --git a/libavcodec/xface.c b/libavcodec/xface.c
new file mode 100644
index 0000000000..8c0cbfdb84
--- /dev/null
+++ b/libavcodec/xface.c
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 1990 James Ashton - Sydney University
+ * Copyright (c) 2012 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * X-Face common data and utilities definition.
+ */
+
+#include "libavutil/avassert.h"
+
+#include "xface.h"
+
+void ff_big_add(BigInt *b, uint8_t a)
+{
+    int i;
+    uint8_t *w;
+    uint16_t c;
+
+    a &= XFACE_WORDMASK;
+    if (a == 0)
+        return;
+    w = b->words;
+    c = a;
+    for (i = 0; i < b->nb_words && c; i++) {
+        c += *w;
+        *w++ = c & XFACE_WORDMASK;
+        c >>= XFACE_BITSPERWORD;
+    }
+    if (i == b->nb_words && c) {
+        av_assert0(b->nb_words < XFACE_MAX_WORDS);
+        b->nb_words++;
+        *w = c & XFACE_WORDMASK;
+    }
+}
+
+void ff_big_div(BigInt *b, uint8_t a, uint8_t *r)
+{
+    int i;
+    uint8_t *w;
+    uint16_t c, d;
+
+    a &= XFACE_WORDMASK;
+    if (a == 1 || b->nb_words == 0) {
+        *r = 0;
+        return;
+    }
+
+    /* treat this as a == WORDCARRY and just shift everything right a WORD */
+    if (a == 0) {
+        i = --b->nb_words;
+        w = b->words;
+        *r = *w;
+        while (i--) {
+            *w = *(w + 1);
+            w++;
+        }
+        *w = 0;
+        return;
+    }
+    i = b->nb_words;
+    w = b->words + i;
+    c = 0;
+    while (i--) {
+        c <<= XFACE_BITSPERWORD;
+        c += *--w;
+        d = c / (uint16_t)a;
+        c = c % (uint16_t)a;
+        *w = d & XFACE_WORDMASK;
+    }
+    *r = c;
+    if (b->words[b->nb_words - 1] == 0)
+        b->nb_words--;
+}
+
+void ff_big_mul(BigInt *b, uint8_t a)
+{
+    int i;
+    uint8_t *w;
+    uint16_t c;
+
+    a &= XFACE_WORDMASK;
+    if (a == 1 || b->nb_words == 0)
+        return;
+    if (a == 0) {
+        /* treat this as a == WORDCARRY and just shift everything left a WORD */
+        av_assert0(b->nb_words < XFACE_MAX_WORDS);
+        i = b->nb_words++;
+        w = b->words + i;
+        while (i--) {
+            *w = *(w - 1);
+            w--;
+        }
+        *w = 0;
+        return;
+    }
+    i = b->nb_words;
+    w = b->words;
+    c = 0;
+    while (i--) {
+        c += (uint16_t)*w * (uint16_t)a;
+        *(w++) = c & XFACE_WORDMASK;
+        c >>= XFACE_BITSPERWORD;
+    }
+    if (c) {
+        av_assert0(b->nb_words < XFACE_MAX_WORDS);
+        b->nb_words++;
+        *w = c & XFACE_WORDMASK;
+    }
+}
+
+const ProbRange ff_xface_probranges_per_level[4][3] = {
+    //  black      grey       white
+    { {  1, 255}, {251, 0}, {  4, 251} }, /* Top of tree almost always grey */
+    { {  1, 255}, {200, 0}, { 55, 200} },
+    { { 33, 223}, {159, 0}, { 64, 159} },
+    { {131,   0}, {  0, 0}, {125, 131} }, /* Grey disallowed at bottom */
+};
+
+const ProbRange ff_xface_probranges_2x2[16] = {
+    { 0,   0},  {38,   0}, {38,  38},  {13, 152},
+    {38,  76},  {13, 165}, {13, 178},  { 6, 230},
+    {38, 114},  {13, 191}, {13, 204},  { 6, 236},
+    {13, 217},  { 6, 242}, { 5, 248},  { 3, 253},
+};
+
+/*
+ * The "guess the next pixel" tables follow. Normally there are 12
+ * neighbour pixels used to give 1<<12 cases as we get closer to the
+ * upper left corner lesser numbers of neighbours are available.
+ *
+ * Each byte in the tables represents 8 boolean values starting from
+ * the most significant bit.
+ */
+
+static const uint8_t g_00[] = {
+    0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0xe3, 0xdf, 0x05, 0x17,
+    0x05, 0x0f, 0x00, 0x1b, 0x0f, 0xdf, 0x00, 0x04, 0x00, 0x00,
+    0x0d, 0x0f, 0x03, 0x7f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x1d,
+    0x45, 0x2f, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x0a, 0xff, 0xff,
+    0x00, 0x04, 0x00, 0x05, 0x01, 0x3f, 0xcf, 0xff, 0x10, 0x01,
+    0x80, 0xc9, 0x0f, 0x0f, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+    0x1b, 0x1f, 0xff, 0xff, 0x4f, 0x54, 0x07, 0x1f, 0x57, 0x47,
+    0xd7, 0x3d, 0xff, 0xff, 0x5f, 0x1f, 0x7f, 0xff, 0x7f, 0x7f,
+    0x05, 0x0f, 0x01, 0x0f, 0x0f, 0x5f, 0x9b, 0xdf, 0x7f, 0xff,
+    0x5f, 0x1d, 0x5f, 0xff, 0x0f, 0x1f, 0x0f, 0x5f, 0x03, 0x1f,
+    0x4f, 0x5f, 0xf7, 0x7f, 0x7f, 0xff, 0x0d, 0x0f, 0xfb, 0xff,
+    0xf7, 0xbf, 0x0f, 0x4f, 0xd7, 0x3f, 0x4f, 0x7f, 0xff, 0xff,
+    0x67, 0xbf, 0x56, 0x25, 0x1f, 0x7f, 0x9f, 0xff, 0x00, 0x00,
+    0x00, 0x05, 0x5f, 0x7f, 0x01, 0xdf, 0x14, 0x00, 0x05, 0x0f,
+    0x07, 0xa2, 0x09, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x5f,
+    0x18, 0xd7, 0x94, 0x71, 0x00, 0x05, 0x1f, 0xb7, 0x0c, 0x07,
+    0x0f, 0x0f, 0x00, 0x0f, 0x0f, 0x1f, 0x84, 0x8f, 0x05, 0x15,
+    0x05, 0x0f, 0x4f, 0xff, 0x87, 0xdf, 0x05, 0x01, 0x10, 0x00,
+    0x0f, 0x0f, 0x00, 0x08, 0x05, 0x04, 0x04, 0x01, 0x4f, 0xff,
+    0x9f, 0x8f, 0x4a, 0x40, 0x5f, 0x5f, 0xff, 0xfe, 0xdf, 0xff,
+    0x7f, 0xf7, 0xff, 0x7f, 0xff, 0xff, 0x7b, 0xff, 0x0f, 0xfd,
+    0xd7, 0x5f, 0x4f, 0x7f, 0x7f, 0xdf, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0x77, 0xdf, 0x7f, 0x4f, 0xef, 0xff, 0xff, 0x77, 0xff,
+    0xff, 0xff, 0x6f, 0xff, 0x0f, 0x4f, 0xff, 0xff, 0x9d, 0xff,
+    0x0f, 0xef, 0xff, 0xdf, 0x6f, 0xff, 0xff, 0xff, 0x4f, 0xff,
+    0xcd, 0x0f, 0x4f, 0xff, 0xff, 0xdf, 0x00, 0x00, 0x00, 0x0b,
+    0x05, 0x02, 0x02, 0x0f, 0x04, 0x00, 0x00, 0x0c, 0x01, 0x06,
+    0x00, 0x0f, 0x20, 0x03, 0x00, 0x00, 0x05, 0x0f, 0x40, 0x08,
+    0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x0c, 0x0f, 0x01, 0x00,
+    0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x14, 0x01, 0x05,
+    0x01, 0x15, 0xaf, 0x0f, 0x00, 0x01, 0x10, 0x00, 0x08, 0x00,
+    0x46, 0x0c, 0x20, 0x00, 0x88, 0x00, 0x0f, 0x15, 0xff, 0xdf,
+    0x02, 0x00, 0x00, 0x0f, 0x7f, 0x5f, 0xdb, 0xff, 0x4f, 0x3e,
+    0x05, 0x0f, 0x7f, 0xf7, 0x95, 0x4f, 0x0d, 0x0f, 0x01, 0x0f,
+    0x4f, 0x5f, 0x9f, 0xdf, 0x25, 0x0e, 0x0d, 0x0d, 0x4f, 0x7f,
+    0x8f, 0x0f, 0x0f, 0xfa, 0x04, 0x4f, 0x4f, 0xff, 0xf7, 0x77,
+    0x47, 0xed, 0x05, 0x0f, 0xff, 0xff, 0xdf, 0xff, 0x4f, 0x6f,
+    0xd8, 0x5f, 0x0f, 0x7f, 0xdf, 0x5f, 0x07, 0x0f, 0x94, 0x0d,
+    0x1f, 0xff, 0xff, 0xff, 0x00, 0x02, 0x00, 0x03, 0x46, 0x57,
+    0x01, 0x0d, 0x01, 0x08, 0x01, 0x0f, 0x47, 0x6c, 0x0d, 0x0f,
+    0x02, 0x00, 0x00, 0x00, 0x0b, 0x4f, 0x00, 0x08, 0x05, 0x00,
+    0x95, 0x01, 0x0f, 0x7f, 0x0c, 0x0f, 0x01, 0x0e, 0x00, 0x00,
+    0x0f, 0x41, 0x00, 0x00, 0x04, 0x24, 0x0d, 0x0f, 0x0f, 0x7f,
+    0xcf, 0xdf, 0x00, 0x00, 0x00, 0x00, 0x04, 0x40, 0x00, 0x00,
+    0x06, 0x26, 0xcf, 0x05, 0xcf, 0x7f, 0xdf, 0xdf, 0x00, 0x00,
+    0x17, 0x5f, 0xff, 0xfd, 0xff, 0xff, 0x46, 0x09, 0x4f, 0x5f,
+    0x7f, 0xfd, 0xdf, 0xff, 0x0a, 0x88, 0xa7, 0x7f, 0x7f, 0xff,
+    0xff, 0xff, 0x0f, 0x04, 0xdf, 0x7f, 0x4f, 0xff, 0x9f, 0xff,
+    0x0e, 0xe6, 0xdf, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x0f, 0xec,
+    0x8f, 0x4f, 0x7f, 0xff, 0xdf, 0xff, 0x0f, 0xcf, 0xdf, 0xff,
+    0x6f, 0x7f, 0xff, 0xff, 0x03, 0x0c, 0x9d, 0x0f, 0x7f, 0xff,
+    0xff, 0xff,
+};
+
+static const uint8_t g_01[] = {
+    0x37, 0x73, 0x00, 0x19, 0x57, 0x7f, 0xf5, 0xfb, 0x70, 0x33,
+    0xf0, 0xf9, 0x7f, 0xff, 0xff, 0xff,
+};
+
+static const uint8_t g_02[] = {
+    0x50,
+};
+
+static const uint8_t g_10[] = {
+    0x00, 0x00, 0x00, 0x00, 0x50, 0x00, 0xf3, 0x5f, 0x84, 0x04,
+    0x17, 0x9f, 0x04, 0x23, 0x05, 0xff, 0x00, 0x00, 0x00, 0x02,
+    0x03, 0x03, 0x33, 0xd7, 0x05, 0x03, 0x5f, 0x3f, 0x17, 0x33,
+    0xff, 0xff, 0x00, 0x80, 0x02, 0x04, 0x12, 0x00, 0x11, 0x57,
+    0x05, 0x25, 0x05, 0x03, 0x35, 0xbf, 0x9f, 0xff, 0x07, 0x6f,
+    0x20, 0x40, 0x17, 0x06, 0xfa, 0xe8, 0x01, 0x07, 0x1f, 0x9f,
+    0x1f, 0xff, 0xff, 0xff,
+};
+
+static const uint8_t g_20[] = {
+    0x04, 0x00, 0x01, 0x01, 0x43, 0x2e, 0xff, 0x3f,
+};
+
+static const uint8_t g_30[] = {
+    0x11, 0x11, 0x11, 0x11, 0x51, 0x11, 0x13, 0x11, 0x11, 0x11,
+    0x13, 0x11, 0x11, 0x11, 0x33, 0x11, 0x13, 0x11, 0x13, 0x13,
+    0x13, 0x13, 0x31, 0x31, 0x11, 0x01, 0x11, 0x11, 0x71, 0x11,
+    0x11, 0x75,
+};
+
+static const uint8_t g_40[] = {
+    0x00, 0x0f, 0x00, 0x09, 0x00, 0x0d, 0x00, 0x0d, 0x00, 0x0f,
+    0x00, 0x4e, 0xe4, 0x0d, 0x10, 0x0f, 0x00, 0x0f, 0x44, 0x4f,
+    0x00, 0x1e, 0x0f, 0x0f, 0xae, 0xaf, 0x45, 0x7f, 0xef, 0xff,
+    0x0f, 0xff, 0x00, 0x09, 0x01, 0x11, 0x00, 0x01, 0x1c, 0xdd,
+    0x00, 0x15, 0x00, 0xff, 0x00, 0x10, 0x00, 0xfd, 0x00, 0x0f,
+    0x4f, 0x5f, 0x3d, 0xff, 0xff, 0xff, 0x4f, 0xff, 0x1c, 0xff,
+    0xdf, 0xff, 0x8f, 0xff, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x15,
+    0x01, 0x07, 0x00, 0x01, 0x02, 0x1f, 0x01, 0x11, 0x05, 0x7f,
+    0x00, 0x1f, 0x41, 0x57, 0x1f, 0xff, 0x05, 0x77, 0x0d, 0x5f,
+    0x4d, 0xff, 0x4f, 0xff, 0x0f, 0xff, 0x00, 0x00, 0x02, 0x05,
+    0x00, 0x11, 0x05, 0x7d, 0x10, 0x15, 0x2f, 0xff, 0x40, 0x50,
+    0x0d, 0xfd, 0x04, 0x0f, 0x07, 0x1f, 0x07, 0x7f, 0x0f, 0xbf,
+    0x0d, 0x7f, 0x0f, 0xff, 0x4d, 0x7d, 0x0f, 0xff,
+};
+
+static const uint8_t g_11[] = {
+    0x01, 0x13, 0x03, 0x7f,
+};
+
+static const uint8_t g_21[] = {
+    0x17,
+};
+
+static const uint8_t g_31[] = {
+    0x55, 0x57, 0x57, 0x7f,
+};
+
+static const uint8_t g_41[] = {
+    0x01, 0x01, 0x01, 0x1f, 0x03, 0x1f, 0x3f, 0xff,
+};
+
+static const uint8_t g_12[] = {
+    0x40,
+};
+
+static const uint8_t g_22[] = {
+    0x00,
+};
+
+static const uint8_t g_32[] = {
+    0x10,
+};
+
+static const uint8_t g_42[] = {
+    0x10,
+};
+
+void ff_xface_generate_face(uint8_t *dst, uint8_t * const src)
+{
+    int h, i, j, k, l, m;
+
+    for (j = 0; j < XFACE_HEIGHT; j++) {
+        for (i = 0; i < XFACE_WIDTH; i++) {
+            h = i + j * XFACE_WIDTH;
+            k = 0;
+
+            /*
+               Compute k, encoding the bits *before* the current one, contained in the
+               image buffer. That is, given the grid:
+
+                l      i
+                |      |
+                v      v
+               +--+--+--+--+--+
+          m -> | 1| 2| 3| 4| 5|
+               +--+--+--+--+--+
+               | 6| 7| 8| 9|10|
+               +--+--+--+--+--+
+          j -> |11|12| *|  |  |
+               +--+--+--+--+--+
+
+               the value k for the pixel marked as "*" will contain the bit encoding of
+               the values in the matrix marked from "1" to "12". In case the pixel is
+               near the border of the grid, the number of values contained within the
+               grid will be lesser than 12.
+             */
+
+            for (l = i - 2; l <= i + 2; l++) {
+                for (m = j - 2; m <= j; m++) {
+                    if (l >= i && m == j)
+                        continue;
+                    if (l > 0 && l <= XFACE_WIDTH && m > 0)
+                        k = 2*k + src[l + m * XFACE_WIDTH];
+                }
+            }
+
+            /*
+              Use the guess for the given position and the computed value of k.
+
+              The following table shows the number of digits in k, depending on
+              the position of the pixel, and shows the corresponding guess table
+              to use:
+
+                 i=1  i=2  i=3       i=w-1 i=w
+               +----+----+----+ ... +----+----+
+           j=1 |  0 |  1 |  2 |     |  2 |  2 |
+               |g22 |g12 |g02 |     |g42 |g32 |
+               +----+----+----+ ... +----+----+
+           j=2 |  3 |  5 |  7 |     |  6 |  5 |
+               |g21 |g11 |g01 |     |g41 |g31 |
+               +----+----+----+ ... +----+----+
+           j=3 |  5 |  9 | 12 |     | 10 |  8 |
+               |g20 |g10 |g00 |     |g40 |g30 |
+               +----+----+----+ ... +----+----+
+            */
+
+#define GEN(table) dst[h] ^= (table[k>>3]>>(7-(k&7)))&1
+
+            switch (i) {
+            case 1:
+                switch (j) {
+                case 1:  GEN(g_22); break;
+                case 2:  GEN(g_21); break;
+                default: GEN(g_20); break;
+                }
+                break;
+            case 2:
+                switch (j) {
+                case 1:  GEN(g_12); break;
+                case 2:  GEN(g_11); break;
+                default: GEN(g_10); break;
+                }
+                break;
+            case XFACE_WIDTH - 1:
+                switch (j) {
+                case 1:  GEN(g_42); break;
+                case 2:  GEN(g_41); break;
+                default: GEN(g_40); break;
+                }
+                break;
+            case XFACE_WIDTH:
+                switch (j) {
+                case 1:  GEN(g_32); break;
+                case 2:  GEN(g_31); break;
+                default: GEN(g_30); break;
+                }
+                break;
+            default:
+                switch (j) {
+                case 1:  GEN(g_02); break;
+                case 2:  GEN(g_01); break;
+                default: GEN(g_00); break;
+                }
+                break;
+            }
+        }
+    }
+}
diff --git a/libavcodec/xface.h b/libavcodec/xface.h
new file mode 100644
index 0000000000..0236d713ad
--- /dev/null
+++ b/libavcodec/xface.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 1990 James Ashton - Sydney University
+ * Copyright (c) 2012 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * X-Face common definitions.
+ */
+
+#include <stdint.h>
+
+/* define the face size - 48x48x1 */
+#define XFACE_WIDTH  48
+#define XFACE_HEIGHT 48
+#define XFACE_PIXELS (XFACE_WIDTH * XFACE_HEIGHT)
+
+/* compressed output uses the full range of printable characters.
+ * In ASCII these are in a contiguous block so we just need to know
+ * the first and last. The total number of printables is needed too. */
+#define XFACE_FIRST_PRINT '!'
+#define XFACE_LAST_PRINT '~'
+#define XFACE_PRINTS (XFACE_LAST_PRINT - XFACE_FIRST_PRINT + 1)
+
+/*
+ * Image is encoded as a big integer, using characters from '~' to
+ * '!', for a total of 94 symbols. In order to express
+ * 48x48 pixels with the worst case encoding 666 symbols should
+ * be sufficient.
+ */
+#define XFACE_MAX_DIGITS 666
+
+#define XFACE_BITSPERWORD 8
+#define XFACE_WORDCARRY (1 << XFACE_BITSPERWORD)
+#define XFACE_WORDMASK (XFACE_WORDCARRY - 1)
+
+// This must be larger or equal to log256(94^XFACE_MAX_DIGITS)
+#define XFACE_MAX_WORDS 546
+
+/* Portable, very large unsigned integer arithmetic is needed.
+ * Implementation uses arrays of WORDs. */
+typedef struct {
+    int nb_words;
+    uint8_t words[XFACE_MAX_WORDS];
+} BigInt;
+
+/**
+ * Add a to b storing the result in b.
+ */
+void ff_big_add(BigInt *b, uint8_t a);
+
+/**
+ * Divide b by a storing the result in b and the remainder in the word
+ * pointed to by r.
+ */
+void ff_big_div(BigInt *b, uint8_t a, uint8_t *r);
+
+/**
+ * Multiply a by b storing the result in b.
+ */
+void ff_big_mul(BigInt *b, uint8_t a);
+
+/* Each face is encoded using 9 octrees of 16x16 each. Each level of the
+ * trees has varying probabilities of being white, grey or black.
+ * The table below is based on sampling many faces */
+enum XFaceColor { XFACE_COLOR_BLACK = 0, XFACE_COLOR_GREY, XFACE_COLOR_WHITE };
+
+/* Data of varying probabilities are encoded by a value in the range 0 - 255.
+ * The probability of the data determines the range of possible encodings.
+ * Offset gives the first possible encoding of the range. */
+typedef struct {
+    uint8_t range;
+    uint8_t offset;
+} ProbRange;
+
+extern const ProbRange ff_xface_probranges_per_level[4][3];
+
+extern const ProbRange ff_xface_probranges_2x2[16];
+
+void ff_xface_generate_face(uint8_t *dst, uint8_t * const src);
diff --git a/libavcodec/xfacedec.c b/libavcodec/xfacedec.c
new file mode 100644
index 0000000000..d045cb6ef4
--- /dev/null
+++ b/libavcodec/xfacedec.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 1990 James Ashton - Sydney University
+ * Copyright (c) 2012 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * X-Face decoder, based on libcompface, by James Ashton.
+ */
+
+#include "libavutil/pixdesc.h"
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "xface.h"
+
+static int pop_integer(BigInt *b, const ProbRange *pranges)
+{
+    uint8_t r;
+    int i;
+
+    /* extract the last byte into r, and shift right b by 8 bits */
+    ff_big_div(b, 0, &r);
+
+    i = 0;
+    while (r < pranges->offset || r >= pranges->range + pranges->offset) {
+        pranges++;
+        i++;
+    }
+    ff_big_mul(b, pranges->range);
+    ff_big_add(b, r - pranges->offset);
+    return i;
+}
+
+static void pop_greys(BigInt *b, char *bitmap, int w, int h)
+{
+    if (w > 3) {
+        w /= 2;
+        h /= 2;
+        pop_greys(b, bitmap,                       w, h);
+        pop_greys(b, bitmap + w,                   w, h);
+        pop_greys(b, bitmap + XFACE_WIDTH * h,     w, h);
+        pop_greys(b, bitmap + XFACE_WIDTH * h + w, w, h);
+    } else {
+        w = pop_integer(b, ff_xface_probranges_2x2);
+        if (w & 1) bitmap[0]               = 1;
+        if (w & 2) bitmap[1]               = 1;
+        if (w & 4) bitmap[XFACE_WIDTH]     = 1;
+        if (w & 8) bitmap[XFACE_WIDTH + 1] = 1;
+    }
+}
+
+static void decode_block(BigInt *b, char *bitmap, int w, int h, int level)
+{
+    switch (pop_integer(b, &ff_xface_probranges_per_level[level][0])) {
+    case XFACE_COLOR_WHITE:
+        return;
+    case XFACE_COLOR_BLACK:
+        pop_greys(b, bitmap, w, h);
+        return;
+    default:
+        w /= 2;
+        h /= 2;
+        level++;
+        decode_block(b, bitmap,                       w, h, level);
+        decode_block(b, bitmap + w,                   w, h, level);
+        decode_block(b, bitmap + h * XFACE_WIDTH,     w, h, level);
+        decode_block(b, bitmap + w + h * XFACE_WIDTH, w, h, level);
+        return;
+    }
+}
+
+typedef struct XFaceContext {
+    uint8_t bitmap[XFACE_PIXELS]; ///< image used internally for decoding
+} XFaceContext;
+
+static av_cold int xface_decode_init(AVCodecContext *avctx)
+{
+    if (avctx->width || avctx->height) {
+        if (avctx->width != XFACE_WIDTH || avctx->height != XFACE_HEIGHT) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Size value %dx%d not supported, only accepts a size of %dx%d\n",
+                   avctx->width, avctx->height, XFACE_WIDTH, XFACE_HEIGHT);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    avctx->width   = XFACE_WIDTH;
+    avctx->height  = XFACE_HEIGHT;
+    avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
+
+    return 0;
+}
+
+static int xface_decode_frame(AVCodecContext *avctx,
+                              void *data, int *got_frame,
+                              AVPacket *avpkt)
+{
+    XFaceContext *xface = avctx->priv_data;
+    int ret, i, j, k;
+    uint8_t byte;
+    BigInt b = {0};
+    char *buf;
+    int64_t c;
+    AVFrame *frame = data;
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    for (i = 0, k = 0; avpkt->data[i] && i < avpkt->size; i++) {
+        c = avpkt->data[i];
+
+        /* ignore invalid digits */
+        if (c < XFACE_FIRST_PRINT || c > XFACE_LAST_PRINT)
+            continue;
+
+        if (++k > XFACE_MAX_DIGITS) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Buffer is longer than expected, truncating at byte %d\n", i);
+            break;
+        }
+        ff_big_mul(&b, XFACE_PRINTS);
+        ff_big_add(&b, c - XFACE_FIRST_PRINT);
+    }
+
+    /* decode image and put it in bitmap */
+    memset(xface->bitmap, 0, XFACE_PIXELS);
+    buf = xface->bitmap;
+    decode_block(&b, buf,                         16, 16, 0);
+    decode_block(&b, buf + 16,                    16, 16, 0);
+    decode_block(&b, buf + 32,                    16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 16,      16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 16 + 16, 16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 16 + 32, 16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 32     , 16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 32 + 16, 16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 32 + 32, 16, 16, 0);
+
+    ff_xface_generate_face(xface->bitmap, xface->bitmap);
+
+    /* convert image from 1=black 0=white bitmap to MONOWHITE */
+    buf = frame->data[0];
+    for (i = 0, j = 0, k = 0, byte = 0; i < XFACE_PIXELS; i++) {
+        byte += xface->bitmap[i];
+        if (k == 7) {
+            buf[j++] = byte;
+            byte = k = 0;
+        } else {
+            k++;
+            byte <<= 1;
+        }
+        if (j == XFACE_WIDTH/8) {
+            j = 0;
+            buf += frame->linesize[0];
+        }
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_xface_decoder = {
+    .name           = "xface",
+    .long_name      = NULL_IF_CONFIG_SMALL("X-face image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_XFACE,
+    .priv_data_size = sizeof(XFaceContext),
+    .init           = xface_decode_init,
+    .decode         = xface_decode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_MONOWHITE, AV_PIX_FMT_NONE },
+};
diff --git a/libavcodec/xfaceenc.c b/libavcodec/xfaceenc.c
new file mode 100644
index 0000000000..bfb9fb9ece
--- /dev/null
+++ b/libavcodec/xfaceenc.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 1990 James Ashton - Sydney University
+ * Copyright (c) 2012 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * X-Face encoder, based on libcompface, by James Ashton.
+ */
+
+#include "xface.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/avassert.h"
+
+typedef struct XFaceContext {
+    AVClass *class;
+    uint8_t bitmap[XFACE_PIXELS]; ///< image used internally for decoding
+    int max_line_len;             ///< max line length for compressed data
+    int set_header;               ///< set X-Face header in the output
+} XFaceContext;
+
+static int all_same(char *bitmap, int w, int h)
+{
+    char val, *row;
+    int x;
+
+    val = *bitmap;
+    while (h--) {
+        row = bitmap;
+        x = w;
+        while (x--)
+            if (*(row++) != val)
+                return 0;
+        bitmap += XFACE_WIDTH;
+    }
+    return 1;
+}
+
+static int all_black(char *bitmap, int w, int h)
+{
+    if (w > 3) {
+        w /= 2;
+        h /= 2;
+        return (all_black(bitmap, w, h) && all_black(bitmap + w, w, h) &&
+                all_black(bitmap + XFACE_WIDTH * h, w, h) &&
+                all_black(bitmap + XFACE_WIDTH * h + w, w, h));
+    } else {
+        /* at least one pixel in the 2x2 grid is non-zero */
+        return *bitmap || *(bitmap + 1) ||
+               *(bitmap + XFACE_WIDTH) || *(bitmap + XFACE_WIDTH + 1);
+    }
+}
+
+static int all_white(char *bitmap, int w, int h)
+{
+    return *bitmap == 0 && all_same(bitmap, w, h);
+}
+
+typedef struct {
+    ProbRange prob_ranges[XFACE_PIXELS*2];
+    int prob_ranges_idx;
+} ProbRangesQueue;
+
+static inline int pq_push(ProbRangesQueue *pq, const ProbRange *p)
+{
+    if (pq->prob_ranges_idx >= XFACE_PIXELS * 2 - 1)
+        return -1;
+    pq->prob_ranges[pq->prob_ranges_idx++] = *p;
+    return 0;
+}
+
+static void push_greys(ProbRangesQueue *pq, char *bitmap, int w, int h)
+{
+    if (w > 3) {
+        w /= 2;
+        h /= 2;
+        push_greys(pq, bitmap,                       w, h);
+        push_greys(pq, bitmap + w,                   w, h);
+        push_greys(pq, bitmap + XFACE_WIDTH * h,     w, h);
+        push_greys(pq, bitmap + XFACE_WIDTH * h + w, w, h);
+    } else {
+        const ProbRange *p = ff_xface_probranges_2x2 +
+                 *bitmap +
+            2 * *(bitmap + 1) +
+            4 * *(bitmap + XFACE_WIDTH) +
+            8 * *(bitmap + XFACE_WIDTH + 1);
+        pq_push(pq, p);
+    }
+}
+
+static void encode_block(char *bitmap, int w, int h, int level, ProbRangesQueue *pq)
+{
+    if (all_white(bitmap, w, h)) {
+        pq_push(pq, &ff_xface_probranges_per_level[level][XFACE_COLOR_WHITE]);
+    } else if (all_black(bitmap, w, h)) {
+        pq_push(pq, &ff_xface_probranges_per_level[level][XFACE_COLOR_BLACK]);
+        push_greys(pq, bitmap, w, h);
+    } else {
+        pq_push(pq, &ff_xface_probranges_per_level[level][XFACE_COLOR_GREY]);
+        w /= 2;
+        h /= 2;
+        level++;
+        encode_block(bitmap,                       w, h, level, pq);
+        encode_block(bitmap + w,                   w, h, level, pq);
+        encode_block(bitmap + h * XFACE_WIDTH,     w, h, level, pq);
+        encode_block(bitmap + w + h * XFACE_WIDTH, w, h, level, pq);
+    }
+}
+
+static void push_integer(BigInt *b, const ProbRange *prange)
+{
+    uint8_t r;
+
+    ff_big_div(b, prange->range, &r);
+    ff_big_mul(b, 0);
+    ff_big_add(b, r + prange->offset);
+}
+
+static int xface_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                              const AVFrame *frame, int *got_packet)
+{
+    XFaceContext *xface = avctx->priv_data;
+    ProbRangesQueue pq = {{{ 0 }}, 0};
+    uint8_t bitmap_copy[XFACE_PIXELS];
+    BigInt b = {0};
+    int i, j, k, ret = 0;
+    const uint8_t *buf;
+    uint8_t *p;
+    char intbuf[XFACE_MAX_DIGITS];
+
+    if (avctx->width || avctx->height) {
+        if (avctx->width != XFACE_WIDTH || avctx->height != XFACE_HEIGHT) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Size value %dx%d not supported, only accepts a size of %dx%d\n",
+                   avctx->width, avctx->height, XFACE_WIDTH, XFACE_HEIGHT);
+            return AVERROR(EINVAL);
+        }
+    }
+    avctx->width  = XFACE_WIDTH;
+    avctx->height = XFACE_HEIGHT;
+
+    /* convert image from MONOWHITE to 1=black 0=white bitmap */
+    buf = frame->data[0];
+    i = j = 0;
+    do {
+        for (k = 0; k < 8; k++)
+            xface->bitmap[i++] = (buf[j]>>(7-k))&1;
+        if (++j == XFACE_WIDTH/8) {
+            buf += frame->linesize[0];
+            j = 0;
+        }
+    } while (i < XFACE_PIXELS);
+
+    /* create a copy of bitmap */
+    memcpy(bitmap_copy, xface->bitmap, XFACE_PIXELS);
+    ff_xface_generate_face(xface->bitmap, bitmap_copy);
+
+    encode_block(xface->bitmap,                         16, 16, 0, &pq);
+    encode_block(xface->bitmap + 16,                    16, 16, 0, &pq);
+    encode_block(xface->bitmap + 32,                    16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 16,      16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 16 + 16, 16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 16 + 32, 16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 32,      16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 32 + 16, 16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 32 + 32, 16, 16, 0, &pq);
+
+    while (pq.prob_ranges_idx > 0)
+        push_integer(&b, &pq.prob_ranges[--pq.prob_ranges_idx]);
+
+    /* write the inverted big integer in b to intbuf */
+    i = 0;
+    av_assert0(b.nb_words < XFACE_MAX_WORDS);
+    while (b.nb_words) {
+        uint8_t r;
+        ff_big_div(&b, XFACE_PRINTS, &r);
+        av_assert0(i < sizeof(intbuf));
+        intbuf[i++] = r + XFACE_FIRST_PRINT;
+    }
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, i+2, 0)) < 0)
+        return ret;
+
+    /* revert the number, and close the buffer */
+    p = pkt->data;
+    while (--i >= 0)
+        *(p++) = intbuf[i];
+    *(p++) = '\n';
+    *(p++) = 0;
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
+}
+
+AVCodec ff_xface_encoder = {
+    .name           = "xface",
+    .long_name      = NULL_IF_CONFIG_SMALL("X-face image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_XFACE,
+    .priv_data_size = sizeof(XFaceContext),
+    .encode2        = xface_encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_MONOWHITE, AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_INTRA_ONLY,
+};
diff --git a/libavcodec/xiph.c b/libavcodec/xiph.c
index 7c3c7100c6..d072224b4a 100644
--- a/libavcodec/xiph.c
+++ b/libavcodec/xiph.c
@@ -1,28 +1,28 @@
 /*
- * Copyright (C) 2007  Libav Project
+ * Copyright (C) 2007 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/intreadwrite.h"
 #include "xiph.h"
 
-int avpriv_split_xiph_headers(uint8_t *extradata, int extradata_size,
-                          int first_header_size, uint8_t *header_start[3],
+int avpriv_split_xiph_headers(const uint8_t *extradata, int extradata_size,
+                          int first_header_size, const uint8_t *header_start[3],
                           int header_len[3])
 {
     int i;
diff --git a/libavcodec/xiph.h b/libavcodec/xiph.h
index afaece7cee..1741a51b65 100644
--- a/libavcodec/xiph.h
+++ b/libavcodec/xiph.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2007  Libav Project
+ * Copyright (C) 2007 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,8 +36,8 @@
  * @param[out] header_len The sizes of each of the three headers.
  * @return On error a negative value is returned, on success zero.
  */
-int avpriv_split_xiph_headers(uint8_t *extradata, int extradata_size,
-                              int first_header_size, uint8_t *header_start[3],
+int avpriv_split_xiph_headers(const uint8_t *extradata, int extradata_size,
+                              int first_header_size, const uint8_t *header_start[3],
                               int header_len[3]);
 
 #endif /* AVCODEC_XIPH_H */
diff --git a/libavcodec/xl.c b/libavcodec/xl.c
index 7286c14fdc..37ab46e4f7 100644
--- a/libavcodec/xl.c
+++ b/libavcodec/xl.c
@@ -2,20 +2,20 @@
  * Miro VideoXL codec
  * Copyright (c) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,19 +50,16 @@ static int decode_frame(AVCodecContext *avctx,
     int y0, y1, y2, y3 = 0, c0 = 0, c1 = 0;
 
     if (avctx->width % 4) {
-        av_log(avctx, AV_LOG_ERROR, "Width not a multiple of 4.\n");
+        av_log(avctx, AV_LOG_ERROR, "width is not a multiple of 4\n");
         return AVERROR_INVALIDDATA;
     }
-
     if (buf_size < avctx->width * avctx->height) {
         av_log(avctx, AV_LOG_ERROR, "Packet is too small\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
diff --git a/libavcodec/xsubdec.c b/libavcodec/xsubdec.c
index a7dd7ee628..2db263ba29 100644
--- a/libavcodec/xsubdec.c
+++ b/libavcodec/xsubdec.c
@@ -2,20 +2,20 @@
  * XSUB subtitle decoder
  * Copyright (c) 2007 Reimar Döffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,11 +58,9 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     GetBitContext gb;
     int has_alpha = avctx->codec_tag == MKTAG('D','X','S','A');
 
-    memset(sub, 0, sizeof(*sub));
-
     // check that at least header fits
     if (buf_size < 27 + 7 * 2 + 4 * (3 + has_alpha)) {
-        av_log(avctx, AV_LOG_ERROR, "coded frame too small\n");
+        av_log(avctx, AV_LOG_ERROR, "coded frame size %d too small\n", buf_size);
         return -1;
     }
 
@@ -97,12 +95,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     sub->rects =  av_mallocz(sizeof(*sub->rects));
     if (!sub->rects)
         return AVERROR(ENOMEM);
+
     sub->rects[0] = av_mallocz(sizeof(*sub->rects[0]));
     if (!sub->rects[0]) {
         av_freep(&sub->rects);
         return AVERROR(ENOMEM);
     }
-    sub->num_rects = 1;
     sub->rects[0]->x = x; sub->rects[0]->y = y;
     sub->rects[0]->w = w; sub->rects[0]->h = h;
     sub->rects[0]->type = SUBTITLE_BITMAP;
@@ -117,6 +115,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
         av_freep(&sub->rects);
         return AVERROR(ENOMEM);
     }
+    sub->num_rects = 1;
 
     // read palette
     for (i = 0; i < sub->rects[0]->nb_colors; i++)
diff --git a/libavcodec/xsubenc.c b/libavcodec/xsubenc.c
index fc46fb8651..707085488c 100644
--- a/libavcodec/xsubenc.c
+++ b/libavcodec/xsubenc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005 DivX, Inc.
  * Copyright (c) 2009 Bjorn Axelsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -128,7 +128,7 @@ static int xsub_encode(AVCodecContext *avctx, unsigned char *buf,
     }
 
     // TODO: support multiple rects
-    if (h->num_rects > 1)
+    if (h->num_rects != 1)
         av_log(avctx, AV_LOG_WARNING, "Only single rects supported (%d in subtitle.)\n", h->num_rects);
 
     // TODO: render text-based subtitles into bitmaps
@@ -142,7 +142,7 @@ static int xsub_encode(AVCodecContext *avctx, unsigned char *buf,
         av_log(avctx, AV_LOG_WARNING, "No more than 4 subtitle colors supported (%d found.)\n", h->rects[0]->nb_colors);
 
     // TODO: Palette swapping if color zero is not transparent
-    if (((uint32_t *)h->rects[0]->pict.data[1])[0] & 0xff)
+    if (((uint32_t *)h->rects[0]->pict.data[1])[0] & 0xff000000)
         av_log(avctx, AV_LOG_WARNING, "Color index 0 is not transparent. Transparency will be messed up.\n");
 
     if (make_tc(startTime, start_tc) || make_tc(endTime, end_tc)) {
@@ -166,8 +166,8 @@ static int xsub_encode(AVCodecContext *avctx, unsigned char *buf,
     bytestream_put_le16(&hdr, height);
     bytestream_put_le16(&hdr, h->rects[0]->x);
     bytestream_put_le16(&hdr, h->rects[0]->y);
-    bytestream_put_le16(&hdr, h->rects[0]->x + width);
-    bytestream_put_le16(&hdr, h->rects[0]->y + height);
+    bytestream_put_le16(&hdr, h->rects[0]->x + width -1);
+    bytestream_put_le16(&hdr, h->rects[0]->y + height -1);
 
     rlelenptr = hdr; // Will store length of first field here later.
     hdr+=2;
@@ -190,7 +190,7 @@ static int xsub_encode(AVCodecContext *avctx, unsigned char *buf,
                         h->rects[0]->w, h->rects[0]->h >> 1))
         return -1;
 
-    // Enforce total height to be be multiple of 2
+    // Enforce total height to be a multiple of 2
     if (h->rects[0]->h & 1) {
         put_xsub_rle(&pb, h->rects[0]->w, PADDING_COLOR);
         avpriv_align_put_bits(&pb);
@@ -206,6 +206,8 @@ static av_cold int xsub_encoder_init(AVCodecContext *avctx)
     if (!avctx->codec_tag)
         avctx->codec_tag = MKTAG('D','X','S','B');
 
+    avctx->bits_per_coded_sample = 4;
+
     return 0;
 }
 
diff --git a/libavcodec/xvididct.c b/libavcodec/xvididct.c
index ca89703233..1f96ccc35c 100644
--- a/libavcodec/xvididct.c
+++ b/libavcodec/xvididct.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2006-2011 Xvid Solutions GmbH
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -334,9 +334,12 @@ av_cold void ff_xvid_idct_init(IDCTDSPContext *c, AVCodecContext *avctx)
 {
     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
 
-    if (!high_bit_depth &&
-        (avctx->idct_algo == FF_IDCT_AUTO ||
-         avctx->idct_algo == FF_IDCT_XVID)) {
+    if (high_bit_depth || avctx->lowres ||
+        !(avctx->idct_algo == FF_IDCT_AUTO ||
+          avctx->idct_algo == FF_IDCT_XVID))
+        return;
+
+    if (avctx->idct_algo == FF_IDCT_XVID) {
         c->idct_put  = xvid_idct_put;
         c->idct_add  = xvid_idct_add;
         c->idct      = ff_xvid_idct;
@@ -345,6 +348,8 @@ av_cold void ff_xvid_idct_init(IDCTDSPContext *c, AVCodecContext *avctx)
 
     if (ARCH_X86)
         ff_xvid_idct_init_x86(c, avctx, high_bit_depth);
+    if (ARCH_MIPS)
+        ff_xvid_idct_init_mips(c, avctx, high_bit_depth);
 
     ff_init_scantable_permutation(c->idct_permutation, c->perm_type);
 }
diff --git a/libavcodec/xvididct.h b/libavcodec/xvididct.h
index 499f8190fc..e0bc1a2b91 100644
--- a/libavcodec/xvididct.h
+++ b/libavcodec/xvididct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,5 +30,7 @@ void ff_xvid_idct_init(IDCTDSPContext *c, AVCodecContext *avctx);
 
 void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                            unsigned high_bit_depth);
+void ff_xvid_idct_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+                            unsigned high_bit_depth);
 
 #endif /* AVCODEC_XVIDIDCT_H */
diff --git a/libavcodec/xvmc.h b/libavcodec/xvmc.h
index 950ed18276..c2e187cc16 100644
--- a/libavcodec/xvmc.h
+++ b/libavcodec/xvmc.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2003 Ivan Kalvachev
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,8 +33,6 @@
 #include "version.h"
 #include "avcodec.h"
 
-#if FF_API_XVMC
-
 /**
  * @defgroup lavc_codec_hwaccel_xvmc XvMC
  * @ingroup lavc_codec_hwaccel
@@ -169,6 +167,4 @@ attribute_deprecated struct xvmc_pix_fmt {
  * @}
  */
 
-#endif /* FF_API_XVMC */
-
 #endif /* AVCODEC_XVMC_H */
diff --git a/libavcodec/xvmc_internal.h b/libavcodec/xvmc_internal.h
index 9018e4a40a..d365ef0266 100644
--- a/libavcodec/xvmc_internal.h
+++ b/libavcodec/xvmc_internal.h
@@ -1,20 +1,20 @@
 /*
  * XVideo Motion Compensation internal functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,14 +25,7 @@
 #include "mpegvideo.h"
 #include "version.h"
 
-#if FF_API_XVMC
-
 void ff_xvmc_init_block(MpegEncContext *s);
 void ff_xvmc_pack_pblocks(MpegEncContext *s, int cbp);
-int  ff_xvmc_field_start(MpegEncContext*s, AVCodecContext *avctx);
-void ff_xvmc_field_end(MpegEncContext *s);
-void ff_xvmc_decode_mb(MpegEncContext *s);
-
-#endif /* FF_API_XVMC */
 
 #endif /* AVCODEC_XVMC_INTERNAL_H */
diff --git a/libavcodec/xwd.h b/libavcodec/xwd.h
index f41e2cd651..d0460465e0 100644
--- a/libavcodec/xwd.h
+++ b/libavcodec/xwd.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/xwddec.c b/libavcodec/xwddec.c
index f6d3d97de5..2febedc4aa 100644
--- a/libavcodec/xwddec.c
+++ b/libavcodec/xwddec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -147,7 +147,7 @@ static int xwd_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     if (pixformat != XWD_Z_PIXMAP) {
-        av_log(avctx, AV_LOG_ERROR, "pixmap format %"PRIu32" unsupported\n", pixformat);
+        avpriv_report_missing_feature(avctx, "Pixmap format %"PRIu32, pixformat);
         return AVERROR_PATCHWELCOME;
     }
 
@@ -155,10 +155,13 @@ static int xwd_decode_frame(AVCodecContext *avctx, void *data,
     switch (vclass) {
     case XWD_STATIC_GRAY:
     case XWD_GRAY_SCALE:
-        if (bpp != 1)
+        if (bpp != 1 && bpp != 8)
             return AVERROR_INVALIDDATA;
-        if (pixdepth == 1)
+        if (pixdepth == 1) {
             avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
+        } else if (pixdepth == 8) {
+            avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        }
         break;
     case XWD_STATIC_COLOR:
     case XWD_PSEUDO_COLOR:
@@ -204,10 +207,8 @@ static int xwd_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_PATCHWELCOME;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->key_frame = 1;
     p->pict_type = AV_PICTURE_TYPE_I;
diff --git a/libavcodec/xwdenc.c b/libavcodec/xwdenc.c
index e346b5c0c4..43bca89033 100644
--- a/libavcodec/xwdenc.c
+++ b/libavcodec/xwdenc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,7 @@
 #define WINDOW_NAME_SIZE    11
 
 static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                            const AVFrame *p, int *got_packet)
+                            const AVFrame *pict, int *got_packet)
 {
     enum AVPixelFormat pix_fmt = avctx->pix_fmt;
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
@@ -40,6 +40,7 @@ static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     uint32_t header_size;
     int i, out_size, ret;
     uint8_t *ptr, *buf;
+    AVFrame * const p = (AVFrame *)pict;
 
     pixdepth = av_get_bits_per_pixel(desc);
     if (desc->flags & AV_PIX_FMT_FLAG_BE)
@@ -124,6 +125,11 @@ static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         bpad     = 8;
         ncolors  = 256;
         break;
+    case AV_PIX_FMT_GRAY8:
+        bpp      = 8;
+        bpad     = 8;
+        vclass   = XWD_STATIC_GRAY;
+        break;
     case AV_PIX_FMT_MONOWHITE:
         be       = 1;
         bitorder = 1;
@@ -132,7 +138,7 @@ static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         vclass   = XWD_STATIC_GRAY;
         break;
     default:
-        av_log(avctx, AV_LOG_INFO, "unsupported pixel format\n");
+        av_log(avctx, AV_LOG_ERROR, "unsupported pixel format\n");
         return AVERROR(EINVAL);
     }
 
@@ -140,18 +146,12 @@ static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     header_size = XWD_HEADER_SIZE + WINDOW_NAME_SIZE;
     out_size    = header_size + ncolors * XWD_CMAP_SIZE + avctx->height * lsize;
 
-    if ((ret = ff_alloc_packet(pkt, out_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "output buffer too small\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, out_size, 0)) < 0)
         return ret;
-    }
     buf = pkt->data;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->key_frame = 1;
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
+    p->key_frame = 1;
+    p->pict_type = AV_PICTURE_TYPE_I;
 
     bytestream_put_be32(&buf, header_size);
     bytestream_put_be32(&buf, XWD_VERSION);   // file version
@@ -233,6 +233,7 @@ AVCodec ff_xwd_encoder = {
                                                  AV_PIX_FMT_RGB4_BYTE,
                                                  AV_PIX_FMT_BGR4_BYTE,
                                                  AV_PIX_FMT_PAL8,
+                                                 AV_PIX_FMT_GRAY8,
                                                  AV_PIX_FMT_MONOWHITE,
                                                  AV_PIX_FMT_NONE },
 };
diff --git a/libavcodec/xxan.c b/libavcodec/xxan.c
index 6369b7de8a..54852962a1 100644
--- a/libavcodec/xxan.c
+++ b/libavcodec/xxan.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2011 Konstantin Shishkov
  * based on work by Mike Melanson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -73,7 +73,7 @@ static av_cold int xan_decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
     s->scratch_buffer = av_malloc(s->buffer_size + 130);
     if (!s->scratch_buffer) {
-        av_freep(&s->y_buffer);
+        xan_decode_end(avctx);
         return AVERROR(ENOMEM);
     }
 
@@ -224,16 +224,18 @@ static int xan_decode_chroma(AVCodecContext *avctx, unsigned chroma_off)
     if (mode) {
         for (j = 0; j < avctx->height >> 1; j++) {
             for (i = 0; i < avctx->width >> 1; i++) {
+                if (src_end - src < 1)
+                    return 0;
                 val = *src++;
-                if (val && val < table_size) {
+                if (val) {
+                    if (val >= table_size)
+                        return AVERROR_INVALIDDATA;
                     val  = AV_RL16(table + (val << 1));
                     uval = (val >> 3) & 0xF8;
                     vval = (val >> 8) & 0xF8;
                     U[i] = uval | (uval >> 5);
                     V[i] = vval | (vval >> 5);
                 }
-                if (src == src_end)
-                    return 0;
             }
             U += s->pic->linesize[1];
             V += s->pic->linesize[2];
@@ -248,8 +250,12 @@ static int xan_decode_chroma(AVCodecContext *avctx, unsigned chroma_off)
 
         for (j = 0; j < avctx->height >> 2; j++) {
             for (i = 0; i < avctx->width >> 1; i += 2) {
+                if (src_end - src < 1)
+                    return 0;
                 val = *src++;
-                if (val && val < table_size) {
+                if (val) {
+                    if (val >= table_size)
+                        return AVERROR_INVALIDDATA;
                     val  = AV_RL16(table + (val << 1));
                     uval = (val >> 3) & 0xF8;
                     vval = (val >> 8) & 0xF8;
@@ -288,7 +294,7 @@ static int xan_decode_frame_type0(AVCodecContext *avctx)
     if ((ret = xan_decode_chroma(avctx, chroma_off)) != 0)
         return ret;
 
-    if (corr_off >= (s->gb.buffer_end - s->gb.buffer_start)) {
+    if (corr_off >= bytestream2_size(&s->gb)) {
         av_log(avctx, AV_LOG_WARNING, "Ignoring invalid correction block position\n");
         corr_off = 0;
     }
@@ -333,6 +339,9 @@ static int xan_decode_frame_type0(AVCodecContext *avctx)
         dec_size = xan_unpack(s, s->scratch_buffer, s->buffer_size / 2);
         if (dec_size < 0)
             dec_size = 0;
+        else
+            dec_size = FFMIN(dec_size, s->buffer_size/2 - 1);
+
         for (i = 0; i < dec_size; i++)
             s->y_buffer[i*2+1] = (s->y_buffer[i*2+1] + (s->scratch_buffer[i] << 1)) & 0x3F;
     }
@@ -402,10 +411,8 @@ static int xan_decode_frame(AVCodecContext *avctx,
     int ftype;
     int ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->pic))) {
-        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->pic)) < 0)
         return ret;
-    }
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
     ftype = bytestream2_get_le32(&s->gb);
diff --git a/libavcodec/y41pdec.c b/libavcodec/y41pdec.c
new file mode 100644
index 0000000000..1b177d4262
--- /dev/null
+++ b/libavcodec/y41pdec.c
@@ -0,0 +1,92 @@
+/*
+ * y41p decoder
+ *
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int y41p_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt             = AV_PIX_FMT_YUV411P;
+    avctx->bits_per_raw_sample = 12;
+
+    if (avctx->width & 7) {
+        av_log(avctx, AV_LOG_WARNING, "y41p requires width to be divisible by 8.\n");
+    }
+
+    return 0;
+}
+
+static int y41p_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *pic = data;
+    uint8_t *src = avpkt->data;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if (avpkt->size < 3LL * avctx->height * avctx->width / 2) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    for (i = avctx->height - 1; i >= 0 ; i--) {
+        y = &pic->data[0][i * pic->linesize[0]];
+        u = &pic->data[1][i * pic->linesize[1]];
+        v = &pic->data[2][i * pic->linesize[2]];
+        for (j = 0; j < avctx->width; j += 8) {
+            *(u++) = *src++;
+            *(y++) = *src++;
+            *(v++) = *src++;
+            *(y++) = *src++;
+
+            *(u++) = *src++;
+            *(y++) = *src++;
+            *(v++) = *src++;
+            *(y++) = *src++;
+
+            *(y++) = *src++;
+            *(y++) = *src++;
+            *(y++) = *src++;
+            *(y++) = *src++;
+        }
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_y41p_decoder = {
+    .name         = "y41p",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed YUV 4:1:1 12-bit"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_Y41P,
+    .init         = y41p_decode_init,
+    .decode       = y41p_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/y41penc.c b/libavcodec/y41penc.c
new file mode 100644
index 0000000000..94acc343fb
--- /dev/null
+++ b/libavcodec/y41penc.c
@@ -0,0 +1,93 @@
+/*
+ * y41p encoder
+ *
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int y41p_encode_init(AVCodecContext *avctx)
+{
+    if (avctx->width & 7) {
+        av_log(avctx, AV_LOG_ERROR, "y41p requires width to be divisible by 8.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    avctx->bits_per_coded_sample = 12;
+
+    return 0;
+}
+
+static int y41p_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    uint8_t *dst;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width * avctx->height * 1.5, 0)) < 0)
+        return ret;
+
+    dst = pkt->data;
+
+    for (i = avctx->height - 1; i >= 0; i--) {
+        y = &pic->data[0][i * pic->linesize[0]];
+        u = &pic->data[1][i * pic->linesize[1]];
+        v = &pic->data[2][i * pic->linesize[2]];
+        for (j = 0; j < avctx->width; j += 8) {
+            *(dst++) = *(u++);
+            *(dst++) = *(y++);
+            *(dst++) = *(v++);
+            *(dst++) = *(y++);
+
+            *(dst++) = *(u++);
+            *(dst++) = *(y++);
+            *(dst++) = *(v++);
+            *(dst++) = *(y++);
+
+            *(dst++) = *(y++);
+            *(dst++) = *(y++);
+            *(dst++) = *(y++);
+            *(dst++) = *(y++);
+        }
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int y41p_encode_close(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+AVCodec ff_y41p_encoder = {
+    .name         = "y41p",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed YUV 4:1:1 12-bit"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_Y41P,
+    .init         = y41p_encode_init,
+    .encode2      = y41p_encode_frame,
+    .close        = y41p_encode_close,
+    .pix_fmts     = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV411P,
+                                                 AV_PIX_FMT_NONE },
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY,
+};
diff --git a/libavcodec/yop.c b/libavcodec/yop.c
index 98ed7237a3..c6b19ec795 100644
--- a/libavcodec/yop.c
+++ b/libavcodec/yop.c
@@ -5,20 +5,20 @@
  * derived from the code by
  * Copyright (C) 2009 Thomas P. Higdon <thomas.p.higdon@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 
 typedef struct YopDecContext {
     AVCodecContext *avctx;
+    AVFrame *frame;
 
     int num_pal_colors;
     int first_color[2];
@@ -78,6 +79,15 @@ static const int8_t motion_vector[16][2] =
      { 4, -2}, {-2,  0},
     };
 
+static av_cold int yop_decode_close(AVCodecContext *avctx)
+{
+    YopDecContext *s = avctx->priv_data;
+
+    av_frame_free(&s->frame);
+
+    return 0;
+}
+
 static av_cold int yop_decode_init(AVCodecContext *avctx)
 {
     YopDecContext *s = avctx->priv_data;
@@ -103,10 +113,14 @@ static av_cold int yop_decode_init(AVCodecContext *avctx)
     if (s->num_pal_colors + s->first_color[0] > 256 ||
         s->num_pal_colors + s->first_color[1] > 256) {
         av_log(avctx, AV_LOG_ERROR,
-               "YOP: palette parameters invalid, header probably corrupt\n");
+               "Palette parameters invalid, header probably corrupt\n");
         return AVERROR_INVALIDDATA;
     }
 
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
     return 0;
 }
 
@@ -144,8 +158,7 @@ static int yop_copy_previous_block(YopDecContext *s, int linesize, int copy_tag)
     bufptr = s->dstptr + motion_vector[copy_tag][0] +
              linesize * motion_vector[copy_tag][1];
     if (bufptr < s->dstbuf) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "YOP: cannot decode, file probably corrupt\n");
+        av_log(s->avctx, AV_LOG_ERROR, "File probably corrupt\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -179,7 +192,7 @@ static int yop_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                             AVPacket *avpkt)
 {
     YopDecContext *s = avctx->priv_data;
-    AVFrame *frame = data;
+    AVFrame *frame = s->frame;
     int tag, firstcolor, is_odd_frame;
     int ret, i, x, y;
     uint32_t *palette;
@@ -189,11 +202,8 @@ static int yop_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    ret = ff_get_buffer(avctx, frame, 0);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, frame)) < 0)
         return ret;
-    }
 
     if (!avctx->frame_number)
         memset(frame->data[1], 0, AVPALETTE_SIZE);
@@ -205,13 +215,20 @@ static int yop_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     s->low_nibble = NULL;
 
     is_odd_frame = avpkt->data[0];
+    if(is_odd_frame>1){
+        av_log(avctx, AV_LOG_ERROR, "frame is too odd %d\n", is_odd_frame);
+        return AVERROR_INVALIDDATA;
+    }
     firstcolor   = s->first_color[is_odd_frame];
     palette      = (uint32_t *)frame->data[1];
 
-    for (i = 0; i < s->num_pal_colors; i++, s->srcptr += 3)
+    for (i = 0; i < s->num_pal_colors; i++, s->srcptr += 3) {
         palette[i + firstcolor] = (s->srcptr[0] << 18) |
                                   (s->srcptr[1] << 10) |
                                   (s->srcptr[2] << 2);
+        palette[i + firstcolor] |= 0xFFU << 24 |
+                                   (palette[i + firstcolor] >> 6) & 0x30303;
+    }
 
     frame->palette_has_changed = 1;
 
@@ -239,6 +256,9 @@ static int yop_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         s->dstptr += 2*frame->linesize[0] - x;
     }
 
+    if ((ret = av_frame_ref(data, s->frame)) < 0)
+        return ret;
+
     *got_frame = 1;
     return avpkt->size;
 }
@@ -250,6 +270,6 @@ AVCodec ff_yop_decoder = {
     .id             = AV_CODEC_ID_YOP,
     .priv_data_size = sizeof(YopDecContext),
     .init           = yop_decode_init,
+    .close          = yop_decode_close,
     .decode         = yop_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/yuv4dec.c b/libavcodec/yuv4dec.c
new file mode 100644
index 0000000000..f89f62debe
--- /dev/null
+++ b/libavcodec/yuv4dec.c
@@ -0,0 +1,84 @@
+/*
+ * libquicktime yuv4 decoder
+ *
+ * Copyright (c) 2011 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int yuv4_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+
+    return 0;
+}
+
+static int yuv4_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *pic = data;
+    const uint8_t *src = avpkt->data;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if (avpkt->size < 6 * (avctx->width + 1 >> 1) * (avctx->height + 1 >> 1)) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+
+    for (i = 0; i < (avctx->height + 1) >> 1; i++) {
+        for (j = 0; j < (avctx->width + 1) >> 1; j++) {
+            u[j] = *src++ ^ 0x80;
+            v[j] = *src++ ^ 0x80;
+            y[                   2 * j    ] = *src++;
+            y[                   2 * j + 1] = *src++;
+            y[pic->linesize[0] + 2 * j    ] = *src++;
+            y[pic->linesize[0] + 2 * j + 1] = *src++;
+        }
+
+        y += 2 * pic->linesize[0];
+        u +=     pic->linesize[1];
+        v +=     pic->linesize[2];
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_yuv4_decoder = {
+    .name         = "yuv4",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:2:0"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_YUV4,
+    .init         = yuv4_decode_init,
+    .decode       = yuv4_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/yuv4enc.c b/libavcodec/yuv4enc.c
new file mode 100644
index 0000000000..cc8846d7e5
--- /dev/null
+++ b/libavcodec/yuv4enc.c
@@ -0,0 +1,80 @@
+/*
+ * libquicktime yuv4 encoder
+ *
+ * Copyright (c) 2011 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int yuv4_encode_init(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+static int yuv4_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    uint8_t *dst;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, 6 * (avctx->width + 1 >> 1) * (avctx->height + 1 >> 1), 0)) < 0)
+        return ret;
+    dst = pkt->data;
+
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+
+    for (i = 0; i < avctx->height + 1 >> 1; i++) {
+        for (j = 0; j < avctx->width + 1 >> 1; j++) {
+            *dst++ = u[j] ^ 0x80;
+            *dst++ = v[j] ^ 0x80;
+            *dst++ = y[                   2 * j    ];
+            *dst++ = y[                   2 * j + 1];
+            *dst++ = y[pic->linesize[0] + 2 * j    ];
+            *dst++ = y[pic->linesize[0] + 2 * j + 1];
+        }
+        y += 2 * pic->linesize[0];
+        u +=     pic->linesize[1];
+        v +=     pic->linesize[2];
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int yuv4_encode_close(AVCodecContext *avctx)
+{
+    return 0;
+}
+
+AVCodec ff_yuv4_encoder = {
+    .name         = "yuv4",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:2:0"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_YUV4,
+    .init         = yuv4_encode_init,
+    .encode2      = yuv4_encode_frame,
+    .close        = yuv4_encode_close,
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE },
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY,
+};
diff --git a/libavcodec/zerocodec.c b/libavcodec/zerocodec.c
index 1419e84a34..55a9a9173a 100644
--- a/libavcodec/zerocodec.c
+++ b/libavcodec/zerocodec.c
@@ -59,10 +59,8 @@ static int zerocodec_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
-    if (ff_get_buffer(avctx, pic, AV_GET_BUFFER_FLAG_REF) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
-        return AVERROR(ENOMEM);
-    }
+    if ((ret = ff_get_buffer(avctx, pic, AV_GET_BUFFER_FLAG_REF)) < 0)
+        return ret;
 
     zstream->next_in  = avpkt->data;
     zstream->avail_in = avpkt->size;
diff --git a/libavcodec/zmbv.c b/libavcodec/zmbv.c
index 0dbf8a17c5..25a1cd215d 100644
--- a/libavcodec/zmbv.c
+++ b/libavcodec/zmbv.c
@@ -2,20 +2,20 @@
  * Zip Motion Blocks Video (ZMBV) decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 #include <stdlib.h>
 
 #include "libavutil/common.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "internal.h"
@@ -64,6 +65,7 @@ typedef struct ZmbvContext {
     int fmt;
     int comp;
     int flags;
+    int stride;
     int bw, bh, bx, by;
     int decomp_len;
     z_stream zstream;
@@ -143,7 +145,7 @@ static int zmbv_decode_xor_8(ZmbvContext *c)
         prev += c->width * c->bh;
     }
     if (src - c->decomp_buf != c->decomp_len)
-        av_log(c->avctx, AV_LOG_ERROR, "Used %ti of %i bytes\n",
+        av_log(c->avctx, AV_LOG_ERROR, "Used %"PTRDIFF_SPECIFIER" of %i bytes\n",
                src-c->decomp_buf, c->decomp_len);
     return 0;
 }
@@ -217,7 +219,7 @@ static int zmbv_decode_xor_16(ZmbvContext *c)
         prev += c->width * c->bh;
     }
     if (src - c->decomp_buf != c->decomp_len)
-        av_log(c->avctx, AV_LOG_ERROR, "Used %ti of %i bytes\n",
+        av_log(c->avctx, AV_LOG_ERROR, "Used %"PTRDIFF_SPECIFIER" of %i bytes\n",
                src-c->decomp_buf, c->decomp_len);
     return 0;
 }
@@ -375,7 +377,7 @@ static int zmbv_decode_xor_32(ZmbvContext *c)
         prev   += c->width * c->bh;
     }
     if (src - c->decomp_buf != c->decomp_len)
-        av_log(c->avctx, AV_LOG_ERROR, "Used %ti of %i bytes\n",
+        av_log(c->avctx, AV_LOG_ERROR, "Used %"PTRDIFF_SPECIFIER" of %i bytes\n",
                src-c->decomp_buf, c->decomp_len);
     return 0;
 }
@@ -406,17 +408,18 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     int zret = Z_OK; // Zlib return code
     int len = buf_size;
     int hi_ver, lo_ver, ret;
-    uint8_t *tmp;
-
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
-    }
 
     /* parse header */
+    if (len < 1)
+        return AVERROR_INVALIDDATA;
     c->flags = buf[0];
     buf++; len--;
     if (c->flags & ZMBV_KEYFRAME) {
+        void *decode_intra = NULL;
+        c->decode_intra= NULL;
+
+        if (len < 6)
+            return AVERROR_INVALIDDATA;
         hi_ver = buf[0];
         lo_ver = buf[1];
         c->comp = buf[2];
@@ -447,29 +450,39 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
         switch (c->fmt) {
         case ZMBV_FMT_8BPP:
             c->bpp = 8;
-            c->decode_intra = zmbv_decode_intra;
+            decode_intra = zmbv_decode_intra;
             c->decode_xor = zmbv_decode_xor_8;
+            avctx->pix_fmt = AV_PIX_FMT_PAL8;
+            c->stride = c->width;
             break;
         case ZMBV_FMT_15BPP:
         case ZMBV_FMT_16BPP:
             c->bpp = 16;
-            c->decode_intra = zmbv_decode_intra;
+            decode_intra = zmbv_decode_intra;
             c->decode_xor = zmbv_decode_xor_16;
+            if (c->fmt == ZMBV_FMT_15BPP)
+                avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
+            else
+                avctx->pix_fmt = AV_PIX_FMT_RGB565LE;
+            c->stride = c->width * 2;
             break;
 #ifdef ZMBV_ENABLE_24BPP
         case ZMBV_FMT_24BPP:
             c->bpp = 24;
-            c->decode_intra = zmbv_decode_intra;
+            decode_intra = zmbv_decode_intra;
             c->decode_xor = zmbv_decode_xor_24;
+            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+            c->stride = c->width * 3;
             break;
 #endif //ZMBV_ENABLE_24BPP
         case ZMBV_FMT_32BPP:
             c->bpp = 32;
-            c->decode_intra = zmbv_decode_intra;
+            decode_intra = zmbv_decode_intra;
             c->decode_xor = zmbv_decode_xor_32;
+            avctx->pix_fmt = AV_PIX_FMT_BGR0;
+            c->stride = c->width * 4;
             break;
         default:
-            c->decode_intra = NULL;
             c->decode_xor = NULL;
             avpriv_request_sample(avctx, "Format %i", c->fmt);
             return AVERROR_PATCHWELCOME;
@@ -481,16 +494,15 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
             return AVERROR_UNKNOWN;
         }
 
-        tmp = av_realloc(c->cur,  avctx->width * avctx->height * (c->bpp / 8));
-        if (!tmp)
-            return AVERROR(ENOMEM);
-        c->cur = tmp;
-        tmp = av_realloc(c->prev, avctx->width * avctx->height * (c->bpp / 8));
-        if (!tmp)
+        c->cur  = av_realloc_f(c->cur, avctx->width * avctx->height,  (c->bpp / 8));
+        c->prev = av_realloc_f(c->prev, avctx->width * avctx->height,  (c->bpp / 8));
+        c->bx = (c->width + c->bw - 1) / c->bw;
+        c->by = (c->height+ c->bh - 1) / c->bh;
+        if (!c->cur || !c->prev)
             return AVERROR(ENOMEM);
-        c->prev = tmp;
-        c->bx   = (c->width  + c->bw - 1) / c->bw;
-        c->by   = (c->height + c->bh - 1) / c->bh;
+        memset(c->cur, 0, avctx->width * avctx->height * (c->bpp / 8));
+        memset(c->prev, 0, avctx->width * avctx->height * (c->bpp / 8));
+        c->decode_intra= decode_intra;
     }
 
     if (!c->decode_intra) {
@@ -498,6 +510,9 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
         return AVERROR_INVALIDDATA;
     }
 
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
     if (c->comp == 0) { //Uncompressed data
         if (c->decomp_size < len) {
             av_log(avctx, AV_LOG_ERROR, "Buffer too small\n");
@@ -506,7 +521,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
         memcpy(c->decomp_buf, buf, len);
     } else { // ZLIB-compressed data
         c->zstream.total_in = c->zstream.total_out = 0;
-        c->zstream.next_in = buf;
+        c->zstream.next_in = (uint8_t*)buf;
         c->zstream.avail_in = len;
         c->zstream.next_out = c->decomp_buf;
         c->zstream.avail_out = c->decomp_size;
@@ -531,64 +546,22 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     /* update frames */
     {
         uint8_t *out, *src;
-        int i, j;
+        int j;
 
         out = frame->data[0];
         src = c->cur;
         switch (c->fmt) {
         case ZMBV_FMT_8BPP:
-            for (j = 0; j < c->height; j++) {
-                for (i = 0; i < c->width; i++) {
-                    out[i * 3 + 0] = c->pal[(*src) * 3 + 0];
-                    out[i * 3 + 1] = c->pal[(*src) * 3 + 1];
-                    out[i * 3 + 2] = c->pal[(*src) * 3 + 2];
-                    src++;
-                }
-                out += frame->linesize[0];
-            }
-            break;
+            for (j = 0; j < 256; j++)
+                AV_WN32(&frame->data[1][j * 4], 0xFFU << 24 | AV_RB24(&c->pal[j * 3]));
         case ZMBV_FMT_15BPP:
-            for (j = 0; j < c->height; j++) {
-                for (i = 0; i < c->width; i++) {
-                    uint16_t tmp = AV_RL16(src);
-                    src += 2;
-                    out[i * 3 + 0] = (tmp & 0x7C00) >> 7;
-                    out[i * 3 + 1] = (tmp & 0x03E0) >> 2;
-                    out[i * 3 + 2] = (tmp & 0x001F) << 3;
-                }
-                out += frame->linesize[0];
-            }
-            break;
         case ZMBV_FMT_16BPP:
-            for (j = 0; j < c->height; j++) {
-                for (i = 0; i < c->width; i++) {
-                    uint16_t tmp = AV_RL16(src);
-                    src += 2;
-                    out[i * 3 + 0] = (tmp & 0xF800) >> 8;
-                    out[i * 3 + 1] = (tmp & 0x07E0) >> 3;
-                    out[i * 3 + 2] = (tmp & 0x001F) << 3;
-                }
-                out += frame->linesize[0];
-            }
-            break;
 #ifdef ZMBV_ENABLE_24BPP
         case ZMBV_FMT_24BPP:
-            for (j = 0; j < c->height; j++) {
-                memcpy(out, src, c->width * 3);
-                src += c->width * 3;
-                out += frame->linesize[0];
-            }
-            break;
-#endif //ZMBV_ENABLE_24BPP
+#endif
         case ZMBV_FMT_32BPP:
-            for (j = 0; j < c->height; j++) {
-                for (i = 0; i < c->width; i++) {
-                    uint32_t tmp = AV_RL32(src);
-                    src += 4;
-                    AV_WB24(out+(i*3), tmp);
-                }
-                out += frame->linesize[0];
-            }
+            av_image_copy_plane(out, frame->linesize[0], src, c->stride,
+                                c->stride, c->height);
             break;
         default:
             av_log(avctx, AV_LOG_ERROR, "Cannot handle format %i\n", c->fmt);
@@ -616,12 +589,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
     // Needed if zlib unused or init aborted before inflateInit
     memset(&c->zstream, 0, sizeof(z_stream));
 
-    avctx->pix_fmt = AV_PIX_FMT_RGB24;
     c->decomp_size = (avctx->width + 255) * 4 * (avctx->height + 64);
 
     /* Allocate decompression buffer */
     if (c->decomp_size) {
-        if (!(c->decomp_buf = av_malloc(c->decomp_size))) {
+        if (!(c->decomp_buf = av_mallocz(c->decomp_size))) {
             av_log(avctx, AV_LOG_ERROR,
                    "Can't allocate decompression buffer.\n");
             return AVERROR(ENOMEM);
diff --git a/libavcodec/zmbvenc.c b/libavcodec/zmbvenc.c
index 4436bb3c51..df06e37cb2 100644
--- a/libavcodec/zmbvenc.c
+++ b/libavcodec/zmbvenc.c
@@ -2,20 +2,20 @@
  * Zip Motion Blocks Video (ZMBV) encoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -231,10 +231,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     pkt_size = c->zstream.total_out + 1 + 6*keyframe;
-    if ((ret = ff_alloc_packet(pkt, pkt_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting packet of size %d.\n", pkt_size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, pkt_size, 0)) < 0)
         return ret;
-    }
     buf = pkt->data;
 
     fl = (keyframe ? ZMBV_KEYFRAME : 0) | (chpal ? ZMBV_DELTAPAL : 0);